diff --git a/adaptive_compress.c b/adaptive_compress.c index 3688de6..3bac489 100644 --- a/adaptive_compress.c +++ b/adaptive_compress.c @@ -244,10 +244,11 @@ is_bsc_type(int btype) int stype = PC_SUBTYPE(btype); int mtype = PC_TYPE(btype); - return ((stype == TYPE_BMP) | (stype == TYPE_DNA_SEQ) | + return ((stype == TYPE_BMP) | (stype == TYPE_DNA_SEQ) | (stype == TYPE_PNM) | (stype == TYPE_MP4) | (stype == TYPE_FLAC) | (stype == TYPE_AVI) | (stype == TYPE_DICOM) | (stype == TYPE_MEDIA_BSC) | - (mtype & TYPE_TEXT && stype != TYPE_MARKUP)); + (mtype & TYPE_TEXT && stype != TYPE_MARKUP) | + (mtype & TYPE_BINARY && stype == TYPE_MARKUP)); } int @@ -266,13 +267,14 @@ adapt_compress(void *src, uint64_t srclen, void *dst, adat->actx = &actx; } if (adat->adapt_mode == 2) { - btype = adat->actx->forty_pct.btype; + btype = adat->actx->thirty_pct.btype; } else if (adat->adapt_mode == 1) { btype = adat->actx->fifty_pct.btype; } } + /* Reset analyzer context for subsequent calls. */ adat->actx = NULL; diff --git a/archive/pc_arc_filter.c b/archive/pc_arc_filter.c index 82889e3..62d9367 100644 --- a/archive/pc_arc_filter.c +++ b/archive/pc_arc_filter.c @@ -93,7 +93,7 @@ add_filters_by_type(struct type_data *typetab, struct filter_flags *ff) typetab[slot].filter_private = sdat; typetab[slot].filter_func = packpnm_filter; typetab[slot].filter_name = "packPNM"; - typetab[slot].result_type = TYPE_BINARY; + typetab[slot].result_type = TYPE_BINARY | TYPE_MEDIA_BSC; } #endif diff --git a/archive/pc_archive.c b/archive/pc_archive.c index d2d698d..32d9662 100644 --- a/archive/pc_archive.c +++ b/archive/pc_archive.c @@ -1471,10 +1471,6 @@ copy_data_out(struct archive *ar, struct archive *aw, struct archive_entry *entr return (ARCHIVE_FATAL); } } - /* - * If the filter above fails we fall through below to consume - * the data for the entry. - */ } for (;;) { diff --git a/filters/analyzer/analyzer.c b/filters/analyzer/analyzer.c index 0a13d31..4630ea4 100644 --- a/filters/analyzer/analyzer.c +++ b/filters/analyzer/analyzer.c @@ -25,9 +25,9 @@ #include "utils.h" #include "analyzer.h" -#define FIFTY_PCT(x) (((x)/10) * 5) -#define FORTY_PCT(x) (((x)/10) * 4) -#define TEN_PCT(x) ((x)/10) +#define FIFTY_PCT(x) ((((double)x)/10) * 5) +#define THIRTY_PCT(x) ((((double)x)/10) * 3) +#define TEN_PCT(x) (((double)x)/10) void analyze_buffer(void *src, uint64_t srclen, analyzer_ctx_t *actx) @@ -37,7 +37,6 @@ analyze_buffer(void *src, uint64_t srclen, analyzer_ctx_t *actx) uchar_t cur_byte, prev_byte; uint64_t tag1, tag2, tag3; double tagcnt, pct_tag; - int markup; /* * Count number of 8-bit binary bytes and XML tags in source. @@ -49,9 +48,10 @@ analyze_buffer(void *src, uint64_t srclen, analyzer_ctx_t *actx) lbytes = 0; spc = 0; prev_byte = cur_byte = 0; + memset(actx, 0, sizeof (analyzer_ctx_t)); for (i = 0; i < srclen; i++) { cur_byte = src1[i]; - tot8b += (cur_byte & 0x80); // This way for possible auto-vectorization + tot8b += (cur_byte > 127); lbytes += (cur_byte < 32); spc += (cur_byte == ' '); tag1 += (cur_byte == '<'); @@ -66,13 +66,13 @@ analyze_buffer(void *src, uint64_t srclen, analyzer_ctx_t *actx) * Heuristics for detecting BINARY vs generic TEXT vs XML data at various * significance levels. */ - tot_8b = tot8b / 0x80 + lbytes; + tot_8b = tot8b + lbytes; tagcnt = tag1 + tag2; pct_tag = tagcnt / (double)srclen; - if (tot_8b > FORTY_PCT(srclen)) { - actx->forty_pct.btype = TYPE_BINARY; + if (tot_8b > THIRTY_PCT(srclen)) { + actx->thirty_pct.btype = TYPE_BINARY; } else { - actx->forty_pct.btype = TYPE_TEXT; + actx->thirty_pct.btype = TYPE_TEXT; } if (tot_8b > FIFTY_PCT(srclen)) { @@ -81,23 +81,18 @@ analyze_buffer(void *src, uint64_t srclen, analyzer_ctx_t *actx) actx->fifty_pct.btype = TYPE_TEXT; } - tot8b /= 0x80; + /* This should be tot8b and not tot_8b. */ if (tot8b <= TEN_PCT((double)srclen) && lbytes < ((srclen>>1) + (srclen>>2) + (srclen>>3))) { - actx->one_pct.btype = TYPE_TEXT; + actx->ten_pct.btype = TYPE_TEXT; + } else { + actx->ten_pct.btype = TYPE_BINARY; } - markup = 0; if (tag1 > tag2 - 4 && tag1 < tag2 + 4 && tag3 > (double)tag1 * 0.40 && - tagcnt > (double)spc * 0.06) - markup = 1; - - if (markup) { - if (actx->forty_pct.btype == TYPE_TEXT) - actx->forty_pct.btype |= TYPE_MARKUP; - if (actx->fifty_pct.btype == TYPE_TEXT) - actx->fifty_pct.btype |= TYPE_MARKUP; - if (actx->one_pct.btype == TYPE_TEXT) - actx->one_pct.btype |= TYPE_MARKUP; + tagcnt > (double)spc * 0.06) { + actx->thirty_pct.btype |= TYPE_MARKUP; + actx->fifty_pct.btype |= TYPE_MARKUP; + actx->ten_pct.btype |= TYPE_MARKUP; } } diff --git a/filters/analyzer/analyzer.h b/filters/analyzer/analyzer.h index 9eefd9c..cb1254e 100644 --- a/filters/analyzer/analyzer.h +++ b/filters/analyzer/analyzer.h @@ -34,8 +34,8 @@ struct significance_value { }; typedef struct _analyzer_ctx { - struct significance_value one_pct; - struct significance_value forty_pct; + struct significance_value ten_pct; + struct significance_value thirty_pct; struct significance_value fifty_pct; } analyzer_ctx_t; diff --git a/filters/dispack/dis.cpp b/filters/dispack/dis.cpp index 802dd8c..23725e3 100644 --- a/filters/dispack/dis.cpp +++ b/filters/dispack/dis.cpp @@ -911,10 +911,6 @@ DisUnFilter(sU8 *source,sU32 sourceSize,sU8 *dest,sU32 destSize,sU32 memStart) return sTRUE; } -/* - * NOTE: function unused. Retained for future need. - */ -#if 0 /* * Try to estimate if the given data block contains 32-bit x86 instructions * especially of the call and jmp variety. @@ -939,7 +935,6 @@ is_x86_code(uchar_t *buf, int len) avgFreq = ln>>8; return (freq[0x8b] > avgFreq && freq[0x00] > avgFreq * 2 && freq[0xE8] > 6); } -#endif #ifdef __cplusplus extern "C" { @@ -1025,10 +1020,6 @@ Inverse_E89(uint8_t *src, uint64_t sz) return (0); } -/* - * NOTE: function unused. Retained for future need. - */ -#if 0 /* * 32-bit x86 executable packer top-level routines. Detected x86 executable data * are passed through these encoding routines. The data chunk is split into 32KB @@ -1058,7 +1049,6 @@ dispack_encode(uchar_t *from, uint64_t fromlen, uchar_t *to, uint64_t *dstlen) pos_to = to; to_last = to + *dstlen; while (len > 0) { - DisFilterCtx ctx(0, DISFILTER_BLOCK); sU32 sz; sU16 origsize; sU32 out; @@ -1082,8 +1072,7 @@ dispack_encode(uchar_t *from, uint64_t fromlen, uchar_t *to, uint64_t *dstlen) out = sz; if (is_x86_code(pos, sz)) { - ctx.ResetCtx(0, sz); - rv = DisFilter(ctx, pos, sz, 0, pos_to, out); + rv = DisFilter(pos, sz, 0, pos_to, out); } else { rv = NULL; } @@ -1128,7 +1117,6 @@ dispack_encode(uchar_t *from, uint64_t fromlen, uchar_t *to, uint64_t *dstlen) #endif return (0); } -#endif /* * This function retained for ability to decode older archives encoded using raw block diff --git a/filters/dispack/dis.hpp b/filters/dispack/dis.hpp index 723c044..84f6883 100644 --- a/filters/dispack/dis.hpp +++ b/filters/dispack/dis.hpp @@ -32,6 +32,7 @@ extern "C" { #endif +int dispack_encode(uchar_t *from, uint64_t fromlen, uchar_t *to, uint64_t *dstlen); int dispack_decode(uchar_t *from, uint64_t fromlen, uchar_t *to, uint64_t *dstlen); int Forward_E89(uint8_t *src, uint64_t sz); diff --git a/pcompress.c b/pcompress.c index c05ae6b..a1539e2 100644 --- a/pcompress.c +++ b/pcompress.c @@ -233,12 +233,34 @@ preproc_compress(pc_ctx_t *pctx, compress_func_ptr cmp_func, void *src, uint64_t /* * Dispack is used for 32-bit EXE files via a libarchive filter routine. - * However if Dispack fails or 64-bit exes are detected we apply an E8E9 - * CALL/JMP transform filter. + * For 64-bit exes or AR archives we apply an E8E9 CALL/JMP transform filter. */ if (pctx->exe_preprocess) { - if (stype == TYPE_EXE32 || stype == TYPE_EXE64 || - stype == TYPE_ARCHIVE_AR || stype == TYPE_EXE32_PE) { + int processed = 0; + + if (stype == TYPE_EXE32 || stype == TYPE_EXE32_PE || + stype == TYPE_EXE64 || stype == TYPE_ARCHIVE_AR) { + /* + * If file-level Dispack did not happen for 32-bit EXEs it was + * most likely that the file was large. So, as a workaround, + * we do raw-block Dispack here. However if even this fails to + * get any worthwhile reduction we do E8E9 as the final + * fallback. + */ + _dstlen = fromlen; + result = dispack_encode((uchar_t *)from, fromlen, to, &_dstlen); + if (result != -1) { + uchar_t *tmp; + tmp = from; + from = to; + to = tmp; + fromlen = _dstlen; + type |= PREPROC_TYPE_DISPACK; + processed = 1; + } + } + + if (!processed) { _dstlen = fromlen; memcpy(to, from, fromlen); if (Forward_E89(to, fromlen) == 0) { @@ -260,10 +282,11 @@ preproc_compress(pc_ctx_t *pctx, compress_func_ptr cmp_func, void *src, uint64_t int b_type; b_type = btype; - if (analyzed) - b_type = PC_TYPE(actx.one_pct.btype); - else + if (analyzed) { + b_type = actx.ten_pct.btype; + } else { b_type = analyze_buffer_simple(from, fromlen); + } if (PC_TYPE(b_type) & TYPE_TEXT) { _dstlen = fromlen; @@ -286,7 +309,7 @@ preproc_compress(pc_ctx_t *pctx, compress_func_ptr cmp_func, void *src, uint64_t b_type = btype; if (analyzed) - b_type = actx.forty_pct.btype; + b_type = actx.thirty_pct.btype; if (!(PC_TYPE(b_type) & TYPE_BINARY)) { hashsize = lzp_hash_size(level); @@ -311,7 +334,7 @@ preproc_compress(pc_ctx_t *pctx, compress_func_ptr cmp_func, void *src, uint64_t b_type = btype; if (analyzed) - b_type = actx.one_pct.btype; + b_type = actx.ten_pct.btype; if (!(PC_TYPE(b_type) & TYPE_TEXT)) { _dstlen = fromlen;