Multitude of tweaks and improvements.

* Use BSC for PNM type and Markup containing binary data.
* Change thresholds in analyzer.
* Properly use double precision in analyzer for accuracy.
* Indicate BSC processing of packPNM output.
* Bring back raw-block Dispack for files not processed by Dispack filter.
This commit is contained in:
Moinak Ghosh 2015-03-22 23:36:04 +05:30
parent 4360c5581f
commit 6a757ddb2c
8 changed files with 59 additions and 54 deletions

View file

@ -244,10 +244,11 @@ is_bsc_type(int btype)
int stype = PC_SUBTYPE(btype); int stype = PC_SUBTYPE(btype);
int mtype = PC_TYPE(btype); int mtype = PC_TYPE(btype);
return ((stype == TYPE_BMP) | (stype == TYPE_DNA_SEQ) | return ((stype == TYPE_BMP) | (stype == TYPE_DNA_SEQ) | (stype == TYPE_PNM) |
(stype == TYPE_MP4) | (stype == TYPE_FLAC) | (stype == TYPE_AVI) | (stype == TYPE_MP4) | (stype == TYPE_FLAC) | (stype == TYPE_AVI) |
(stype == TYPE_DICOM) | (stype == TYPE_MEDIA_BSC) | (stype == TYPE_DICOM) | (stype == TYPE_MEDIA_BSC) |
(mtype & TYPE_TEXT && stype != TYPE_MARKUP)); (mtype & TYPE_TEXT && stype != TYPE_MARKUP) |
(mtype & TYPE_BINARY && stype == TYPE_MARKUP));
} }
int int
@ -266,13 +267,14 @@ adapt_compress(void *src, uint64_t srclen, void *dst,
adat->actx = &actx; adat->actx = &actx;
} }
if (adat->adapt_mode == 2) { if (adat->adapt_mode == 2) {
btype = adat->actx->forty_pct.btype; btype = adat->actx->thirty_pct.btype;
} else if (adat->adapt_mode == 1) { } else if (adat->adapt_mode == 1) {
btype = adat->actx->fifty_pct.btype; btype = adat->actx->fifty_pct.btype;
} }
} }
/* Reset analyzer context for subsequent calls. */ /* Reset analyzer context for subsequent calls. */
adat->actx = NULL; adat->actx = NULL;

View file

@ -93,7 +93,7 @@ add_filters_by_type(struct type_data *typetab, struct filter_flags *ff)
typetab[slot].filter_private = sdat; typetab[slot].filter_private = sdat;
typetab[slot].filter_func = packpnm_filter; typetab[slot].filter_func = packpnm_filter;
typetab[slot].filter_name = "packPNM"; typetab[slot].filter_name = "packPNM";
typetab[slot].result_type = TYPE_BINARY; typetab[slot].result_type = TYPE_BINARY | TYPE_MEDIA_BSC;
} }
#endif #endif

View file

@ -1471,10 +1471,6 @@ copy_data_out(struct archive *ar, struct archive *aw, struct archive_entry *entr
return (ARCHIVE_FATAL); return (ARCHIVE_FATAL);
} }
} }
/*
* If the filter above fails we fall through below to consume
* the data for the entry.
*/
} }
for (;;) { for (;;) {

View file

@ -25,9 +25,9 @@
#include "utils.h" #include "utils.h"
#include "analyzer.h" #include "analyzer.h"
#define FIFTY_PCT(x) (((x)/10) * 5) #define FIFTY_PCT(x) ((((double)x)/10) * 5)
#define FORTY_PCT(x) (((x)/10) * 4) #define THIRTY_PCT(x) ((((double)x)/10) * 3)
#define TEN_PCT(x) ((x)/10) #define TEN_PCT(x) (((double)x)/10)
void void
analyze_buffer(void *src, uint64_t srclen, analyzer_ctx_t *actx) analyze_buffer(void *src, uint64_t srclen, analyzer_ctx_t *actx)
@ -37,7 +37,6 @@ analyze_buffer(void *src, uint64_t srclen, analyzer_ctx_t *actx)
uchar_t cur_byte, prev_byte; uchar_t cur_byte, prev_byte;
uint64_t tag1, tag2, tag3; uint64_t tag1, tag2, tag3;
double tagcnt, pct_tag; double tagcnt, pct_tag;
int markup;
/* /*
* Count number of 8-bit binary bytes and XML tags in source. * Count number of 8-bit binary bytes and XML tags in source.
@ -49,9 +48,10 @@ analyze_buffer(void *src, uint64_t srclen, analyzer_ctx_t *actx)
lbytes = 0; lbytes = 0;
spc = 0; spc = 0;
prev_byte = cur_byte = 0; prev_byte = cur_byte = 0;
memset(actx, 0, sizeof (analyzer_ctx_t));
for (i = 0; i < srclen; i++) { for (i = 0; i < srclen; i++) {
cur_byte = src1[i]; cur_byte = src1[i];
tot8b += (cur_byte & 0x80); // This way for possible auto-vectorization tot8b += (cur_byte > 127);
lbytes += (cur_byte < 32); lbytes += (cur_byte < 32);
spc += (cur_byte == ' '); spc += (cur_byte == ' ');
tag1 += (cur_byte == '<'); tag1 += (cur_byte == '<');
@ -66,13 +66,13 @@ analyze_buffer(void *src, uint64_t srclen, analyzer_ctx_t *actx)
* Heuristics for detecting BINARY vs generic TEXT vs XML data at various * Heuristics for detecting BINARY vs generic TEXT vs XML data at various
* significance levels. * significance levels.
*/ */
tot_8b = tot8b / 0x80 + lbytes; tot_8b = tot8b + lbytes;
tagcnt = tag1 + tag2; tagcnt = tag1 + tag2;
pct_tag = tagcnt / (double)srclen; pct_tag = tagcnt / (double)srclen;
if (tot_8b > FORTY_PCT(srclen)) { if (tot_8b > THIRTY_PCT(srclen)) {
actx->forty_pct.btype = TYPE_BINARY; actx->thirty_pct.btype = TYPE_BINARY;
} else { } else {
actx->forty_pct.btype = TYPE_TEXT; actx->thirty_pct.btype = TYPE_TEXT;
} }
if (tot_8b > FIFTY_PCT(srclen)) { if (tot_8b > FIFTY_PCT(srclen)) {
@ -81,23 +81,18 @@ analyze_buffer(void *src, uint64_t srclen, analyzer_ctx_t *actx)
actx->fifty_pct.btype = TYPE_TEXT; actx->fifty_pct.btype = TYPE_TEXT;
} }
tot8b /= 0x80; /* This should be tot8b and not tot_8b. */
if (tot8b <= TEN_PCT((double)srclen) && lbytes < ((srclen>>1) + (srclen>>2) + (srclen>>3))) { if (tot8b <= TEN_PCT((double)srclen) && lbytes < ((srclen>>1) + (srclen>>2) + (srclen>>3))) {
actx->one_pct.btype = TYPE_TEXT; actx->ten_pct.btype = TYPE_TEXT;
} else {
actx->ten_pct.btype = TYPE_BINARY;
} }
markup = 0;
if (tag1 > tag2 - 4 && tag1 < tag2 + 4 && tag3 > (double)tag1 * 0.40 && if (tag1 > tag2 - 4 && tag1 < tag2 + 4 && tag3 > (double)tag1 * 0.40 &&
tagcnt > (double)spc * 0.06) tagcnt > (double)spc * 0.06) {
markup = 1; actx->thirty_pct.btype |= TYPE_MARKUP;
actx->fifty_pct.btype |= TYPE_MARKUP;
if (markup) { actx->ten_pct.btype |= TYPE_MARKUP;
if (actx->forty_pct.btype == TYPE_TEXT)
actx->forty_pct.btype |= TYPE_MARKUP;
if (actx->fifty_pct.btype == TYPE_TEXT)
actx->fifty_pct.btype |= TYPE_MARKUP;
if (actx->one_pct.btype == TYPE_TEXT)
actx->one_pct.btype |= TYPE_MARKUP;
} }
} }

View file

@ -34,8 +34,8 @@ struct significance_value {
}; };
typedef struct _analyzer_ctx { typedef struct _analyzer_ctx {
struct significance_value one_pct; struct significance_value ten_pct;
struct significance_value forty_pct; struct significance_value thirty_pct;
struct significance_value fifty_pct; struct significance_value fifty_pct;
} analyzer_ctx_t; } analyzer_ctx_t;

View file

@ -911,10 +911,6 @@ DisUnFilter(sU8 *source,sU32 sourceSize,sU8 *dest,sU32 destSize,sU32 memStart)
return sTRUE; return sTRUE;
} }
/*
* NOTE: function unused. Retained for future need.
*/
#if 0
/* /*
* Try to estimate if the given data block contains 32-bit x86 instructions * Try to estimate if the given data block contains 32-bit x86 instructions
* especially of the call and jmp variety. * especially of the call and jmp variety.
@ -939,7 +935,6 @@ is_x86_code(uchar_t *buf, int len)
avgFreq = ln>>8; avgFreq = ln>>8;
return (freq[0x8b] > avgFreq && freq[0x00] > avgFreq * 2 && freq[0xE8] > 6); return (freq[0x8b] > avgFreq && freq[0x00] > avgFreq * 2 && freq[0xE8] > 6);
} }
#endif
#ifdef __cplusplus #ifdef __cplusplus
extern "C" { extern "C" {
@ -1025,10 +1020,6 @@ Inverse_E89(uint8_t *src, uint64_t sz)
return (0); return (0);
} }
/*
* NOTE: function unused. Retained for future need.
*/
#if 0
/* /*
* 32-bit x86 executable packer top-level routines. Detected x86 executable data * 32-bit x86 executable packer top-level routines. Detected x86 executable data
* are passed through these encoding routines. The data chunk is split into 32KB * are passed through these encoding routines. The data chunk is split into 32KB
@ -1058,7 +1049,6 @@ dispack_encode(uchar_t *from, uint64_t fromlen, uchar_t *to, uint64_t *dstlen)
pos_to = to; pos_to = to;
to_last = to + *dstlen; to_last = to + *dstlen;
while (len > 0) { while (len > 0) {
DisFilterCtx ctx(0, DISFILTER_BLOCK);
sU32 sz; sU32 sz;
sU16 origsize; sU16 origsize;
sU32 out; sU32 out;
@ -1082,8 +1072,7 @@ dispack_encode(uchar_t *from, uint64_t fromlen, uchar_t *to, uint64_t *dstlen)
out = sz; out = sz;
if (is_x86_code(pos, sz)) { if (is_x86_code(pos, sz)) {
ctx.ResetCtx(0, sz); rv = DisFilter(pos, sz, 0, pos_to, out);
rv = DisFilter(ctx, pos, sz, 0, pos_to, out);
} else { } else {
rv = NULL; rv = NULL;
} }
@ -1128,7 +1117,6 @@ dispack_encode(uchar_t *from, uint64_t fromlen, uchar_t *to, uint64_t *dstlen)
#endif #endif
return (0); return (0);
} }
#endif
/* /*
* This function retained for ability to decode older archives encoded using raw block * This function retained for ability to decode older archives encoded using raw block

View file

@ -32,6 +32,7 @@
extern "C" { extern "C" {
#endif #endif
int dispack_encode(uchar_t *from, uint64_t fromlen, uchar_t *to, uint64_t *dstlen);
int dispack_decode(uchar_t *from, uint64_t fromlen, uchar_t *to, uint64_t *dstlen); int dispack_decode(uchar_t *from, uint64_t fromlen, uchar_t *to, uint64_t *dstlen);
int Forward_E89(uint8_t *src, uint64_t sz); int Forward_E89(uint8_t *src, uint64_t sz);

View file

@ -233,12 +233,34 @@ preproc_compress(pc_ctx_t *pctx, compress_func_ptr cmp_func, void *src, uint64_t
/* /*
* Dispack is used for 32-bit EXE files via a libarchive filter routine. * Dispack is used for 32-bit EXE files via a libarchive filter routine.
* However if Dispack fails or 64-bit exes are detected we apply an E8E9 * For 64-bit exes or AR archives we apply an E8E9 CALL/JMP transform filter.
* CALL/JMP transform filter.
*/ */
if (pctx->exe_preprocess) { if (pctx->exe_preprocess) {
if (stype == TYPE_EXE32 || stype == TYPE_EXE64 || int processed = 0;
stype == TYPE_ARCHIVE_AR || stype == TYPE_EXE32_PE) {
if (stype == TYPE_EXE32 || stype == TYPE_EXE32_PE ||
stype == TYPE_EXE64 || stype == TYPE_ARCHIVE_AR) {
/*
* If file-level Dispack did not happen for 32-bit EXEs it was
* most likely that the file was large. So, as a workaround,
* we do raw-block Dispack here. However if even this fails to
* get any worthwhile reduction we do E8E9 as the final
* fallback.
*/
_dstlen = fromlen;
result = dispack_encode((uchar_t *)from, fromlen, to, &_dstlen);
if (result != -1) {
uchar_t *tmp;
tmp = from;
from = to;
to = tmp;
fromlen = _dstlen;
type |= PREPROC_TYPE_DISPACK;
processed = 1;
}
}
if (!processed) {
_dstlen = fromlen; _dstlen = fromlen;
memcpy(to, from, fromlen); memcpy(to, from, fromlen);
if (Forward_E89(to, fromlen) == 0) { if (Forward_E89(to, fromlen) == 0) {
@ -260,10 +282,11 @@ preproc_compress(pc_ctx_t *pctx, compress_func_ptr cmp_func, void *src, uint64_t
int b_type; int b_type;
b_type = btype; b_type = btype;
if (analyzed) if (analyzed) {
b_type = PC_TYPE(actx.one_pct.btype); b_type = actx.ten_pct.btype;
else } else {
b_type = analyze_buffer_simple(from, fromlen); b_type = analyze_buffer_simple(from, fromlen);
}
if (PC_TYPE(b_type) & TYPE_TEXT) { if (PC_TYPE(b_type) & TYPE_TEXT) {
_dstlen = fromlen; _dstlen = fromlen;
@ -286,7 +309,7 @@ preproc_compress(pc_ctx_t *pctx, compress_func_ptr cmp_func, void *src, uint64_t
b_type = btype; b_type = btype;
if (analyzed) if (analyzed)
b_type = actx.forty_pct.btype; b_type = actx.thirty_pct.btype;
if (!(PC_TYPE(b_type) & TYPE_BINARY)) { if (!(PC_TYPE(b_type) & TYPE_BINARY)) {
hashsize = lzp_hash_size(level); hashsize = lzp_hash_size(level);
@ -311,7 +334,7 @@ preproc_compress(pc_ctx_t *pctx, compress_func_ptr cmp_func, void *src, uint64_t
b_type = btype; b_type = btype;
if (analyzed) if (analyzed)
b_type = actx.one_pct.btype; b_type = actx.ten_pct.btype;
if (!(PC_TYPE(b_type) & TYPE_TEXT)) { if (!(PC_TYPE(b_type) & TYPE_TEXT)) {
_dstlen = fromlen; _dstlen = fromlen;