A bunch of small fixes in Dict.

Improve text analysis for markup tags.
Use Libbsc for plain text and PPMd for text mixed with markup.
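Change thresholds: text detection now tolerates up to 10% high-bit bytes (previously none), and the markup tag-count threshold is relative to the number of spaces rather than the buffer length.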
This commit is contained in:
Moinak Ghosh 2015-01-11 17:36:46 +05:30
parent 66a482c968
commit 077da83d5d
4 changed files with 34 additions and 31 deletions

View file

@ -40,10 +40,6 @@
#include <pc_archive.h> #include <pc_archive.h>
#include "filters/analyzer/analyzer.h" #include "filters/analyzer/analyzer.h"
#define FIFTY_PCT(x) (((x)/10) * 5)
#define FORTY_PCT(x) (((x)/10) * 4)
#define ONE_PCT(x) ((x)/100)
static unsigned int lzma_count = 0; static unsigned int lzma_count = 0;
static unsigned int bzip2_count = 0; static unsigned int bzip2_count = 0;
static unsigned int bsc_count = 0; static unsigned int bsc_count = 0;
@ -246,9 +242,12 @@ int
is_bsc_type(int btype) is_bsc_type(int btype)
{ {
int stype = PC_SUBTYPE(btype); int stype = PC_SUBTYPE(btype);
return ((stype == TYPE_MARKUP) | (stype == TYPE_BMP) | (stype == TYPE_DNA_SEQ) | int mtype = PC_TYPE(btype);
return ((stype == TYPE_BMP) | (stype == TYPE_DNA_SEQ) |
(stype == TYPE_MP4) | (stype == TYPE_FLAC) | (stype == TYPE_AVI) | (stype == TYPE_MP4) | (stype == TYPE_FLAC) | (stype == TYPE_AVI) |
(stype == TYPE_DICOM) | (stype == TYPE_MEDIA_BSC)); (stype == TYPE_DICOM) | (stype == TYPE_MEDIA_BSC) |
(mtype == TYPE_TEXT && stype != TYPE_MARKUP));
} }
int int
@ -260,7 +259,8 @@ adapt_compress(void *src, uint64_t srclen, void *dst,
int stype = PC_SUBTYPE(btype); int stype = PC_SUBTYPE(btype);
analyzer_ctx_t actx; analyzer_ctx_t actx;
if (btype == TYPE_UNKNOWN || stype == TYPE_ARCHIVE_TAR || stype == TYPE_PDF) { if (btype == TYPE_UNKNOWN || PC_TYPE(btype) == TYPE_TEXT ||
stype == TYPE_ARCHIVE_TAR || stype == TYPE_PDF) {
if (adat->actx == NULL) { if (adat->actx == NULL) {
analyze_buffer(src, srclen, &actx); analyze_buffer(src, srclen, &actx);
adat->actx = &actx; adat->actx = &actx;
@ -271,8 +271,6 @@ adapt_compress(void *src, uint64_t srclen, void *dst,
} else if (adat->adapt_mode == 1) { } else if (adat->adapt_mode == 1) {
btype = adat->actx->fifty_pct.btype; btype = adat->actx->fifty_pct.btype;
} }
if (stype == TYPE_PDF)
btype |= TYPE_MARKUP;
} }
/* Reset analyzer context for subsequent calls. */ /* Reset analyzer context for subsequent calls. */

View file

@ -27,13 +27,13 @@
#define FIFTY_PCT(x) (((x)/10) * 5) #define FIFTY_PCT(x) (((x)/10) * 5)
#define FORTY_PCT(x) (((x)/10) * 4) #define FORTY_PCT(x) (((x)/10) * 4)
#define ONE_PCT(x) ((x)/100) #define TEN_PCT(x) ((x)/10)
void void
analyze_buffer(void *src, uint64_t srclen, analyzer_ctx_t *actx) analyze_buffer(void *src, uint64_t srclen, analyzer_ctx_t *actx)
{ {
uchar_t *src1 = (uchar_t *)src; uchar_t *src1 = (uchar_t *)src;
uint64_t i, tot8b, tot_8b, lbytes; uint64_t i, tot8b, tot_8b, lbytes, spc;
uchar_t cur_byte, prev_byte; uchar_t cur_byte, prev_byte;
uint64_t tag1, tag2, tag3; uint64_t tag1, tag2, tag3;
double tagcnt, pct_tag; double tagcnt, pct_tag;
@ -47,11 +47,13 @@ analyze_buffer(void *src, uint64_t srclen, analyzer_ctx_t *actx)
tag2 = 0; tag2 = 0;
tag3 = 0; tag3 = 0;
lbytes = 0; lbytes = 0;
spc = 0;
prev_byte = cur_byte = 0; prev_byte = cur_byte = 0;
for (i = 0; i < srclen; i++) { for (i = 0; i < srclen; i++) {
cur_byte = src1[i]; cur_byte = src1[i];
tot8b += (cur_byte & 0x80); // This way for possible auto-vectorization tot8b += (cur_byte & 0x80); // This way for possible auto-vectorization
lbytes += (cur_byte < 32); lbytes += (cur_byte < 32);
spc += (cur_byte == ' ');
tag1 += (cur_byte == '<'); tag1 += (cur_byte == '<');
tag2 += (cur_byte == '>'); tag2 += (cur_byte == '>');
tag3 += ((prev_byte == '<') & (cur_byte == '/')); tag3 += ((prev_byte == '<') & (cur_byte == '/'));
@ -65,7 +67,7 @@ analyze_buffer(void *src, uint64_t srclen, analyzer_ctx_t *actx)
* significance levels. * significance levels.
*/ */
tot_8b = tot8b / 0x80 + lbytes; tot_8b = tot8b / 0x80 + lbytes;
tagcnt = tag1 + tag2 + tag3; tagcnt = tag1 + tag2;
pct_tag = tagcnt / (double)srclen; pct_tag = tagcnt / (double)srclen;
if (tot_8b > FORTY_PCT(srclen)) { if (tot_8b > FORTY_PCT(srclen)) {
actx->forty_pct.btype = TYPE_BINARY; actx->forty_pct.btype = TYPE_BINARY;
@ -80,13 +82,13 @@ analyze_buffer(void *src, uint64_t srclen, analyzer_ctx_t *actx)
} }
tot8b /= 0x80; tot8b /= 0x80;
if (tot8b == 0 && lbytes < ((srclen>>1) + (srclen>>2) + (srclen>>3))) { if (tot8b <= TEN_PCT((double)srclen) && lbytes < ((srclen>>1) + (srclen>>2) + (srclen>>3))) {
actx->one_pct.btype = TYPE_TEXT; actx->one_pct.btype = TYPE_TEXT;
} }
markup = 0; markup = 0;
if (tag1 > tag2 - 4 && tag1 < tag2 + 4 && tag3 > (double)tag1 * 0.40 && if (tag1 > tag2 - 4 && tag1 < tag2 + 4 && tag3 > (double)tag1 * 0.40 &&
tagcnt > (double)srclen * 0.001) tagcnt > (double)spc * 0.1)
markup = 1; markup = 1;
if (markup) { if (markup) {
@ -120,7 +122,7 @@ analyze_buffer_simple(void *src, uint64_t srclen)
* Heuristics for detecting BINARY vs generic TEXT * Heuristics for detecting BINARY vs generic TEXT
*/ */
tot8b /= 0x80; tot8b /= 0x80;
if (tot8b == 0 && lbytes < ((srclen>>1) + (srclen>>2) + (srclen>>3))) { if (tot8b <= TEN_PCT((double)srclen) && lbytes < ((srclen>>1) + (srclen>>2) + (srclen>>3))) {
btype = TYPE_TEXT; btype = TYPE_TEXT;
} }
return (btype); return (btype);

View file

@ -324,7 +324,6 @@ DictFilter::DictFilter()
SEPARATOR['\r'] = 1; SEPARATOR['\r'] = 1;
SEPARATOR['>'] = 1; SEPARATOR['>'] = 1;
SEPARATOR[']'] = 1; SEPARATOR[']'] = 1;
SEPARATOR['\''] = 1;
SEPARATOR[')'] = 1; SEPARATOR[')'] = 1;
SEPARATOR['.'] = 1; SEPARATOR['.'] = 1;
SEPARATOR['?'] = 1; SEPARATOR['?'] = 1;
@ -847,13 +846,12 @@ DictFilter::Forward_Dict(uint8_t *src, uint32_t size, uint8_t *dst, uint32_t *ds
* Encode literal numeric strings. * Encode literal numeric strings.
*/ */
converted = 0; converted = 0;
if (word[0] != '+' && word[0] != '-' && word[0] != '0' && if (isdigit(word[0]) && word[0] != '0' && toklen > 4 && toklen < 10) {
toklen > 4 && toklen < 10) {
copy_bytes(num, word, toklen); copy_bytes(num, word, toklen);
num[toklen] = '\0'; num[toklen] = '\0';
val = strtoul((const char *)num, (char **)&word, 10); val = strtoul((const char *)num, (char **)&word, 10);
if (*word == '\0') { if (*word == '\0' && word - num == toklen && val > 0) {
uint8_t tok_hdr[10], *dnum; uint8_t tok_hdr[10], *dnum;
sz = sizeof (tok_hdr); sz = sizeof (tok_hdr);
dnum = to_base_enc(val, tok_hdr, sz); dnum = to_base_enc(val, tok_hdr, sz);
@ -914,7 +912,7 @@ DictFilter::Inverse_Dict(uint8_t *src, uint32_t srclen, uint8_t *dst, uint32_t *
decode_dict_entry_t *w_dict; decode_dict_entry_t *w_dict;
end = src + srclen; end = src + srclen;
srcpos = (uint8_t *)strchr((const char *)src, ' '); srcpos = (uint8_t *)memchr((const void *)src, ' ', WORD_MAX);
if (srcpos - src > 12) { if (srcpos - src > 12) {
return (0); return (0);
} }
@ -924,7 +922,11 @@ DictFilter::Inverse_Dict(uint8_t *src, uint32_t srclen, uint8_t *dst, uint32_t *
w_dict = new decode_dict_entry_t[numWords]; w_dict = new decode_dict_entry_t[numWords];
for (i = 0; i < numWords && srcpos < end; i++) { for (i = 0; i < numWords && srcpos < end; i++) {
uint8_t *w_src = srcpos; uint8_t *w_src = srcpos;
srcpos = (uint8_t *)strchr((const char *)srcpos, ' '); size_t limit;
limit = end - srcpos;
if (limit > WORD_MAX+1) limit = WORD_MAX+1;
srcpos = (uint8_t *)memchr((const void *)srcpos, ' ', limit);
if (srcpos - w_src > WORD_MAX) if (srcpos - w_src > WORD_MAX)
return (0); return (0);
@ -1089,6 +1091,7 @@ dict_decode(uint8_t *from, uint64_t fromlen, uint8_t *to, uint64_t *dstlen)
log_msg(LOG_ERR, 0, "dict_decode: Failed.\n"); log_msg(LOG_ERR, 0, "dict_decode: Failed.\n");
return (-1); return (-1);
} }
if (dl < *dstlen) { if (dl < *dstlen) {
log_msg(LOG_ERR, 0, "dict_decode: Expected: %" PRIu64 ", Got: %" PRIu64 "\n", log_msg(LOG_ERR, 0, "dict_decode: Expected: %" PRIu64 ", Got: %" PRIu64 "\n",
*dstlen, dl); *dstlen, dl);

View file

@ -211,7 +211,7 @@ preproc_compress(pc_ctx_t *pctx, compress_func_ptr cmp_func, void *src, uint64_t
int result; int result;
uint64_t _dstlen, fromlen; uint64_t _dstlen, fromlen;
uchar_t *from, *to; uchar_t *from, *to;
int stype, dict, analyzed; int stype, analyzed;
analyzer_ctx_t actx; analyzer_ctx_t actx;
DEBUG_STAT_EN(double strt, en); DEBUG_STAT_EN(double strt, en);
@ -221,9 +221,9 @@ preproc_compress(pc_ctx_t *pctx, compress_func_ptr cmp_func, void *src, uint64_t
fromlen = srclen; fromlen = srclen;
result = 0; result = 0;
stype = PC_SUBTYPE(btype); stype = PC_SUBTYPE(btype);
dict = 0;
analyzed = 0; analyzed = 0;
if (btype == TYPE_UNKNOWN || stype == TYPE_ARCHIVE_TAR || stype == TYPE_PDF || interesting) { if (btype == TYPE_UNKNOWN || stype == TYPE_ARCHIVE_TAR || stype == TYPE_PDF ||
PC_TYPE(btype) == TYPE_TEXT || interesting) {
analyze_buffer(src, srclen, &actx); analyze_buffer(src, srclen, &actx);
analyzed = 1; analyzed = 1;
if (pctx->adapt_mode) if (pctx->adapt_mode)
@ -258,12 +258,13 @@ preproc_compress(pc_ctx_t *pctx, compress_func_ptr cmp_func, void *src, uint64_t
if (pctx->lzp_preprocess) { if (pctx->lzp_preprocess) {
int b_type; int b_type;
b_type = btype;
if (analyzed) if (analyzed)
b_type = PC_TYPE(actx.forty_pct.btype); b_type = PC_TYPE(actx.one_pct.btype);
else else
b_type = PC_TYPE(analyze_buffer_simple(from, fromlen)); b_type = analyze_buffer_simple(from, fromlen);
if (b_type == TYPE_TEXT) { if (PC_TYPE(b_type) == TYPE_TEXT) {
_dstlen = fromlen; _dstlen = fromlen;
result = dict_encode(from, fromlen, to, &_dstlen); result = dict_encode(from, fromlen, to, &_dstlen);
if (result != -1) { if (result != -1) {
@ -273,7 +274,6 @@ preproc_compress(pc_ctx_t *pctx, compress_func_ptr cmp_func, void *src, uint64_t
to = tmp; to = tmp;
fromlen = _dstlen; fromlen = _dstlen;
type |= PREPROC_TYPE_DICT; type |= PREPROC_TYPE_DICT;
dict = result;
} }
} }
} }
@ -1354,7 +1354,7 @@ start_decompress(pc_ctx_t *pctx, const char *filename, char *to_filename)
log_msg(LOG_ERR, 1, "Can't seek in metadata fd: "); log_msg(LOG_ERR, 1, "Can't seek in metadata fd: ");
UNCOMP_BAIL; UNCOMP_BAIL;
} }
/* /*
* Finally create the metadata context. * Finally create the metadata context.
*/ */
@ -1364,7 +1364,7 @@ start_decompress(pc_ctx_t *pctx, const char *filename, char *to_filename)
UNCOMP_BAIL; UNCOMP_BAIL;
} }
} }
uncompfd = -1; uncompfd = -1;
if (setup_extractor(pctx) == -1) { if (setup_extractor(pctx) == -1) {
log_msg(LOG_ERR, 0, "Setup of extraction context failed."); log_msg(LOG_ERR, 0, "Setup of extraction context failed.");
@ -1400,7 +1400,7 @@ start_decompress(pc_ctx_t *pctx, const char *filename, char *to_filename)
if (pctx->archive_mode) { if (pctx->archive_mode) {
nprocs = nprocs > 1 ? nprocs-1:nprocs; nprocs = nprocs > 1 ? nprocs-1:nprocs;
} }
if (pctx->nthreads > 0 && pctx->nthreads < nprocs) if (pctx->nthreads > 0 && pctx->nthreads < nprocs)
nprocs = pctx->nthreads; nprocs = pctx->nthreads;
else else