diff --git a/adaptive_compress.c b/adaptive_compress.c index ccbee77..ddb2631 100644 --- a/adaptive_compress.c +++ b/adaptive_compress.c @@ -40,10 +40,6 @@ #include #include "filters/analyzer/analyzer.h" -#define FIFTY_PCT(x) (((x)/10) * 5) -#define FORTY_PCT(x) (((x)/10) * 4) -#define ONE_PCT(x) ((x)/100) - static unsigned int lzma_count = 0; static unsigned int bzip2_count = 0; static unsigned int bsc_count = 0; @@ -246,9 +242,12 @@ int is_bsc_type(int btype) { int stype = PC_SUBTYPE(btype); - return ((stype == TYPE_MARKUP) | (stype == TYPE_BMP) | (stype == TYPE_DNA_SEQ) | + int mtype = PC_TYPE(btype); + + return ((stype == TYPE_BMP) | (stype == TYPE_DNA_SEQ) | (stype == TYPE_MP4) | (stype == TYPE_FLAC) | (stype == TYPE_AVI) | - (stype == TYPE_DICOM) | (stype == TYPE_MEDIA_BSC)); + (stype == TYPE_DICOM) | (stype == TYPE_MEDIA_BSC) | + (mtype == TYPE_TEXT && stype != TYPE_MARKUP)); } int @@ -260,7 +259,8 @@ adapt_compress(void *src, uint64_t srclen, void *dst, int stype = PC_SUBTYPE(btype); analyzer_ctx_t actx; - if (btype == TYPE_UNKNOWN || stype == TYPE_ARCHIVE_TAR || stype == TYPE_PDF) { + if (btype == TYPE_UNKNOWN || PC_TYPE(btype) == TYPE_TEXT || + stype == TYPE_ARCHIVE_TAR || stype == TYPE_PDF) { if (adat->actx == NULL) { analyze_buffer(src, srclen, &actx); adat->actx = &actx; @@ -271,8 +271,6 @@ adapt_compress(void *src, uint64_t srclen, void *dst, } else if (adat->adapt_mode == 1) { btype = adat->actx->fifty_pct.btype; } - if (stype == TYPE_PDF) - btype |= TYPE_MARKUP; } /* Reset analyzer context for subsequent calls. */ diff --git a/filters/analyzer/analyzer.c b/filters/analyzer/analyzer.c index 9e2a53a..a6c2878 100644 --- a/filters/analyzer/analyzer.c +++ b/filters/analyzer/analyzer.c @@ -27,13 +27,13 @@ #define FIFTY_PCT(x) (((x)/10) * 5) #define FORTY_PCT(x) (((x)/10) * 4) -#define ONE_PCT(x) ((x)/100) +#define TEN_PCT(x) ((x)/10) void analyze_buffer(void *src, uint64_t srclen, analyzer_ctx_t *actx) { uchar_t *src1 = (uchar_t *)src; - uint64_t i, tot8b, tot_8b, lbytes; + uint64_t i, tot8b, tot_8b, lbytes, spc; uchar_t cur_byte, prev_byte; uint64_t tag1, tag2, tag3; double tagcnt, pct_tag; @@ -47,11 +47,13 @@ analyze_buffer(void *src, uint64_t srclen, analyzer_ctx_t *actx) tag2 = 0; tag3 = 0; lbytes = 0; + spc = 0; prev_byte = cur_byte = 0; for (i = 0; i < srclen; i++) { cur_byte = src1[i]; tot8b += (cur_byte & 0x80); // This way for possible auto-vectorization lbytes += (cur_byte < 32); + spc += (cur_byte == ' '); tag1 += (cur_byte == '<'); tag2 += (cur_byte == '>'); tag3 += ((prev_byte == '<') & (cur_byte == '/')); @@ -65,7 +67,7 @@ analyze_buffer(void *src, uint64_t srclen, analyzer_ctx_t *actx) * significance levels. */ tot_8b = tot8b / 0x80 + lbytes; - tagcnt = tag1 + tag2 + tag3; + tagcnt = tag1 + tag2; pct_tag = tagcnt / (double)srclen; if (tot_8b > FORTY_PCT(srclen)) { actx->forty_pct.btype = TYPE_BINARY; @@ -80,13 +82,13 @@ analyze_buffer(void *src, uint64_t srclen, analyzer_ctx_t *actx) } tot8b /= 0x80; - if (tot8b == 0 && lbytes < ((srclen>>1) + (srclen>>2) + (srclen>>3))) { + if (tot8b <= TEN_PCT((double)srclen) && lbytes < ((srclen>>1) + (srclen>>2) + (srclen>>3))) { actx->one_pct.btype = TYPE_TEXT; } markup = 0; if (tag1 > tag2 - 4 && tag1 < tag2 + 4 && tag3 > (double)tag1 * 0.40 && - tagcnt > (double)srclen * 0.001) + tagcnt > (double)spc * 0.1) markup = 1; if (markup) { @@ -120,7 +122,7 @@ analyze_buffer_simple(void *src, uint64_t srclen) * Heuristics for detecting BINARY vs generic TEXT */ tot8b /= 0x80; - if (tot8b == 0 && lbytes < ((srclen>>1) + (srclen>>2) + (srclen>>3))) { + if (tot8b <= TEN_PCT((double)srclen) && lbytes < ((srclen>>1) + (srclen>>2) + (srclen>>3))) { btype = TYPE_TEXT; } return (btype); diff --git a/filters/dict/DictFilter.cpp b/filters/dict/DictFilter.cpp index 252b3d4..00119de 100644 --- a/filters/dict/DictFilter.cpp +++ b/filters/dict/DictFilter.cpp @@ -324,7 +324,6 @@ DictFilter::DictFilter() SEPARATOR['\r'] = 1; SEPARATOR['>'] = 1; SEPARATOR[']'] = 1; - SEPARATOR['\''] = 1; SEPARATOR[')'] = 1; SEPARATOR['.'] = 1; SEPARATOR['?'] = 1; @@ -847,13 +846,12 @@ DictFilter::Forward_Dict(uint8_t *src, uint32_t size, uint8_t *dst, uint32_t *ds * Encode literal numeric strings. */ converted = 0; - if (word[0] != '+' && word[0] != '-' && word[0] != '0' && - toklen > 4 && toklen < 10) { + if (isdigit(word[0]) && word[0] != '0' && toklen > 4 && toklen < 10) { copy_bytes(num, word, toklen); num[toklen] = '\0'; val = strtoul((const char *)num, (char **)&word, 10); - if (*word == '\0') { + if (*word == '\0' && word - num == toklen && val > 0) { uint8_t tok_hdr[10], *dnum; sz = sizeof (tok_hdr); dnum = to_base_enc(val, tok_hdr, sz); @@ -914,7 +912,7 @@ DictFilter::Inverse_Dict(uint8_t *src, uint32_t srclen, uint8_t *dst, uint32_t * decode_dict_entry_t *w_dict; end = src + srclen; - srcpos = (uint8_t *)strchr((const char *)src, ' '); + srcpos = (uint8_t *)memchr((const void *)src, ' ', WORD_MAX); if (srcpos - src > 12) { return (0); } @@ -924,7 +922,11 @@ DictFilter::Inverse_Dict(uint8_t *src, uint32_t srclen, uint8_t *dst, uint32_t * w_dict = new decode_dict_entry_t[numWords]; for (i = 0; i < numWords && srcpos < end; i++) { uint8_t *w_src = srcpos; - srcpos = (uint8_t *)strchr((const char *)srcpos, ' '); + size_t limit; + + limit = end - srcpos; + if (limit > WORD_MAX+1) limit = WORD_MAX+1; + srcpos = (uint8_t *)memchr((const void *)srcpos, ' ', limit); if (srcpos - w_src > WORD_MAX) return (0); @@ -1089,6 +1091,7 @@ dict_decode(uint8_t *from, uint64_t fromlen, uint8_t *to, uint64_t *dstlen) log_msg(LOG_ERR, 0, "dict_decode: Failed.\n"); return (-1); } + if (dl < *dstlen) { log_msg(LOG_ERR, 0, "dict_decode: Expected: %" PRIu64 ", Got: %" PRIu64 "\n", *dstlen, dl); diff --git a/pcompress.c b/pcompress.c index a524b93..332581b 100644 --- a/pcompress.c +++ b/pcompress.c @@ -211,7 +211,7 @@ preproc_compress(pc_ctx_t *pctx, compress_func_ptr cmp_func, void *src, uint64_t int result; uint64_t _dstlen, fromlen; uchar_t *from, *to; - int stype, dict, analyzed; + int stype, analyzed; analyzer_ctx_t actx; DEBUG_STAT_EN(double strt, en); @@ -221,9 +221,9 @@ preproc_compress(pc_ctx_t *pctx, compress_func_ptr cmp_func, void *src, uint64_t fromlen = srclen; result = 0; stype = PC_SUBTYPE(btype); - dict = 0; analyzed = 0; - if (btype == TYPE_UNKNOWN || stype == TYPE_ARCHIVE_TAR || stype == TYPE_PDF || interesting) { + if (btype == TYPE_UNKNOWN || stype == TYPE_ARCHIVE_TAR || stype == TYPE_PDF || + PC_TYPE(btype) == TYPE_TEXT || interesting) { analyze_buffer(src, srclen, &actx); analyzed = 1; if (pctx->adapt_mode) @@ -258,12 +258,13 @@ preproc_compress(pc_ctx_t *pctx, compress_func_ptr cmp_func, void *src, uint64_t if (pctx->lzp_preprocess) { int b_type; + b_type = btype; if (analyzed) - b_type = PC_TYPE(actx.forty_pct.btype); + b_type = PC_TYPE(actx.one_pct.btype); else - b_type = PC_TYPE(analyze_buffer_simple(from, fromlen)); + b_type = analyze_buffer_simple(from, fromlen); - if (b_type == TYPE_TEXT) { + if (PC_TYPE(b_type) == TYPE_TEXT) { _dstlen = fromlen; result = dict_encode(from, fromlen, to, &_dstlen); if (result != -1) { @@ -273,7 +274,6 @@ preproc_compress(pc_ctx_t *pctx, compress_func_ptr cmp_func, void *src, uint64_t to = tmp; fromlen = _dstlen; type |= PREPROC_TYPE_DICT; - dict = result; } } } @@ -1354,7 +1354,7 @@ start_decompress(pc_ctx_t *pctx, const char *filename, char *to_filename) log_msg(LOG_ERR, 1, "Can't seek in metadata fd: "); UNCOMP_BAIL; } - + /* * Finally create the metadata context. */ @@ -1364,7 +1364,7 @@ start_decompress(pc_ctx_t *pctx, const char *filename, char *to_filename) UNCOMP_BAIL; } } - + uncompfd = -1; if (setup_extractor(pctx) == -1) { log_msg(LOG_ERR, 0, "Setup of extraction context failed."); @@ -1400,7 +1400,7 @@ start_decompress(pc_ctx_t *pctx, const char *filename, char *to_filename) if (pctx->archive_mode) { nprocs = nprocs > 1 ? nprocs-1:nprocs; } - + if (pctx->nthreads > 0 && pctx->nthreads < nprocs) nprocs = pctx->nthreads; else