Multitude of tweaks and improvements.

* Use BSC for PNM type and Markup containing binary data.
* Change thresholds in analyzer.
* Properly use double precision in analyzer for accuracy.
* Indicate BSC processing of packPNM output.
* Bring back raw-block Dispack for files not processed by the Dispack filter.
This commit is contained in:
Moinak Ghosh 2015-03-22 23:36:04 +05:30
parent 4360c5581f
commit 6a757ddb2c
8 changed files with 59 additions and 54 deletions

View file

@ -244,10 +244,11 @@ is_bsc_type(int btype)
int stype = PC_SUBTYPE(btype);
int mtype = PC_TYPE(btype);
return ((stype == TYPE_BMP) | (stype == TYPE_DNA_SEQ) |
return ((stype == TYPE_BMP) | (stype == TYPE_DNA_SEQ) | (stype == TYPE_PNM) |
(stype == TYPE_MP4) | (stype == TYPE_FLAC) | (stype == TYPE_AVI) |
(stype == TYPE_DICOM) | (stype == TYPE_MEDIA_BSC) |
(mtype & TYPE_TEXT && stype != TYPE_MARKUP));
(mtype & TYPE_TEXT && stype != TYPE_MARKUP) |
(mtype & TYPE_BINARY && stype == TYPE_MARKUP));
}
int
@ -266,13 +267,14 @@ adapt_compress(void *src, uint64_t srclen, void *dst,
adat->actx = &actx;
}
if (adat->adapt_mode == 2) {
btype = adat->actx->forty_pct.btype;
btype = adat->actx->thirty_pct.btype;
} else if (adat->adapt_mode == 1) {
btype = adat->actx->fifty_pct.btype;
}
}
/* Reset analyzer context for subsequent calls. */
adat->actx = NULL;

View file

@ -93,7 +93,7 @@ add_filters_by_type(struct type_data *typetab, struct filter_flags *ff)
typetab[slot].filter_private = sdat;
typetab[slot].filter_func = packpnm_filter;
typetab[slot].filter_name = "packPNM";
typetab[slot].result_type = TYPE_BINARY;
typetab[slot].result_type = TYPE_BINARY | TYPE_MEDIA_BSC;
}
#endif

View file

@ -1471,10 +1471,6 @@ copy_data_out(struct archive *ar, struct archive *aw, struct archive_entry *entr
return (ARCHIVE_FATAL);
}
}
/*
* If the filter above fails we fall through below to consume
* the data for the entry.
*/
}
for (;;) {

View file

@ -25,9 +25,9 @@
#include "utils.h"
#include "analyzer.h"
#define FIFTY_PCT(x) (((x)/10) * 5)
#define FORTY_PCT(x) (((x)/10) * 4)
#define TEN_PCT(x) ((x)/10)
#define FIFTY_PCT(x) ((((double)x)/10) * 5)
#define THIRTY_PCT(x) ((((double)x)/10) * 3)
#define TEN_PCT(x) (((double)x)/10)
void
analyze_buffer(void *src, uint64_t srclen, analyzer_ctx_t *actx)
@ -37,7 +37,6 @@ analyze_buffer(void *src, uint64_t srclen, analyzer_ctx_t *actx)
uchar_t cur_byte, prev_byte;
uint64_t tag1, tag2, tag3;
double tagcnt, pct_tag;
int markup;
/*
* Count number of 8-bit binary bytes and XML tags in source.
@ -49,9 +48,10 @@ analyze_buffer(void *src, uint64_t srclen, analyzer_ctx_t *actx)
lbytes = 0;
spc = 0;
prev_byte = cur_byte = 0;
memset(actx, 0, sizeof (analyzer_ctx_t));
for (i = 0; i < srclen; i++) {
cur_byte = src1[i];
tot8b += (cur_byte & 0x80); // This way for possible auto-vectorization
tot8b += (cur_byte > 127);
lbytes += (cur_byte < 32);
spc += (cur_byte == ' ');
tag1 += (cur_byte == '<');
@ -66,13 +66,13 @@ analyze_buffer(void *src, uint64_t srclen, analyzer_ctx_t *actx)
* Heuristics for detecting BINARY vs generic TEXT vs XML data at various
* significance levels.
*/
tot_8b = tot8b / 0x80 + lbytes;
tot_8b = tot8b + lbytes;
tagcnt = tag1 + tag2;
pct_tag = tagcnt / (double)srclen;
if (tot_8b > FORTY_PCT(srclen)) {
actx->forty_pct.btype = TYPE_BINARY;
if (tot_8b > THIRTY_PCT(srclen)) {
actx->thirty_pct.btype = TYPE_BINARY;
} else {
actx->forty_pct.btype = TYPE_TEXT;
actx->thirty_pct.btype = TYPE_TEXT;
}
if (tot_8b > FIFTY_PCT(srclen)) {
@ -81,23 +81,18 @@ analyze_buffer(void *src, uint64_t srclen, analyzer_ctx_t *actx)
actx->fifty_pct.btype = TYPE_TEXT;
}
tot8b /= 0x80;
/* This should be tot8b and not tot_8b. */
if (tot8b <= TEN_PCT((double)srclen) && lbytes < ((srclen>>1) + (srclen>>2) + (srclen>>3))) {
actx->one_pct.btype = TYPE_TEXT;
actx->ten_pct.btype = TYPE_TEXT;
} else {
actx->ten_pct.btype = TYPE_BINARY;
}
markup = 0;
if (tag1 > tag2 - 4 && tag1 < tag2 + 4 && tag3 > (double)tag1 * 0.40 &&
tagcnt > (double)spc * 0.06)
markup = 1;
if (markup) {
if (actx->forty_pct.btype == TYPE_TEXT)
actx->forty_pct.btype |= TYPE_MARKUP;
if (actx->fifty_pct.btype == TYPE_TEXT)
actx->fifty_pct.btype |= TYPE_MARKUP;
if (actx->one_pct.btype == TYPE_TEXT)
actx->one_pct.btype |= TYPE_MARKUP;
tagcnt > (double)spc * 0.06) {
actx->thirty_pct.btype |= TYPE_MARKUP;
actx->fifty_pct.btype |= TYPE_MARKUP;
actx->ten_pct.btype |= TYPE_MARKUP;
}
}

View file

@ -34,8 +34,8 @@ struct significance_value {
};
typedef struct _analyzer_ctx {
struct significance_value one_pct;
struct significance_value forty_pct;
struct significance_value ten_pct;
struct significance_value thirty_pct;
struct significance_value fifty_pct;
} analyzer_ctx_t;

View file

@ -911,10 +911,6 @@ DisUnFilter(sU8 *source,sU32 sourceSize,sU8 *dest,sU32 destSize,sU32 memStart)
return sTRUE;
}
/*
* NOTE: function unused. Retained for future need.
*/
#if 0
/*
* Try to estimate if the given data block contains 32-bit x86 instructions
* especially of the call and jmp variety.
@ -939,7 +935,6 @@ is_x86_code(uchar_t *buf, int len)
avgFreq = ln>>8;
return (freq[0x8b] > avgFreq && freq[0x00] > avgFreq * 2 && freq[0xE8] > 6);
}
#endif
#ifdef __cplusplus
extern "C" {
@ -1025,10 +1020,6 @@ Inverse_E89(uint8_t *src, uint64_t sz)
return (0);
}
/*
* NOTE: function unused. Retained for future need.
*/
#if 0
/*
* 32-bit x86 executable packer top-level routines. Detected x86 executable data
* are passed through these encoding routines. The data chunk is split into 32KB
@ -1058,7 +1049,6 @@ dispack_encode(uchar_t *from, uint64_t fromlen, uchar_t *to, uint64_t *dstlen)
pos_to = to;
to_last = to + *dstlen;
while (len > 0) {
DisFilterCtx ctx(0, DISFILTER_BLOCK);
sU32 sz;
sU16 origsize;
sU32 out;
@ -1082,8 +1072,7 @@ dispack_encode(uchar_t *from, uint64_t fromlen, uchar_t *to, uint64_t *dstlen)
out = sz;
if (is_x86_code(pos, sz)) {
ctx.ResetCtx(0, sz);
rv = DisFilter(ctx, pos, sz, 0, pos_to, out);
rv = DisFilter(pos, sz, 0, pos_to, out);
} else {
rv = NULL;
}
@ -1128,7 +1117,6 @@ dispack_encode(uchar_t *from, uint64_t fromlen, uchar_t *to, uint64_t *dstlen)
#endif
return (0);
}
#endif
/*
* This function retained for ability to decode older archives encoded using raw block

View file

@ -32,6 +32,7 @@
extern "C" {
#endif
int dispack_encode(uchar_t *from, uint64_t fromlen, uchar_t *to, uint64_t *dstlen);
int dispack_decode(uchar_t *from, uint64_t fromlen, uchar_t *to, uint64_t *dstlen);
int Forward_E89(uint8_t *src, uint64_t sz);

View file

@ -233,12 +233,34 @@ preproc_compress(pc_ctx_t *pctx, compress_func_ptr cmp_func, void *src, uint64_t
/*
* Dispack is used for 32-bit EXE files via a libarchive filter routine.
* However if Dispack fails or 64-bit exes are detected we apply an E8E9
* CALL/JMP transform filter.
* For 64-bit exes or AR archives we apply an E8E9 CALL/JMP transform filter.
*/
if (pctx->exe_preprocess) {
if (stype == TYPE_EXE32 || stype == TYPE_EXE64 ||
stype == TYPE_ARCHIVE_AR || stype == TYPE_EXE32_PE) {
int processed = 0;
if (stype == TYPE_EXE32 || stype == TYPE_EXE32_PE ||
stype == TYPE_EXE64 || stype == TYPE_ARCHIVE_AR) {
/*
* If file-level Dispack did not happen for 32-bit EXEs it was
* most likely that the file was large. So, as a workaround,
* we do raw-block Dispack here. However if even this fails to
* get any worthwhile reduction we do E8E9 as the final
* fallback.
*/
_dstlen = fromlen;
result = dispack_encode((uchar_t *)from, fromlen, to, &_dstlen);
if (result != -1) {
uchar_t *tmp;
tmp = from;
from = to;
to = tmp;
fromlen = _dstlen;
type |= PREPROC_TYPE_DISPACK;
processed = 1;
}
}
if (!processed) {
_dstlen = fromlen;
memcpy(to, from, fromlen);
if (Forward_E89(to, fromlen) == 0) {
@ -260,10 +282,11 @@ preproc_compress(pc_ctx_t *pctx, compress_func_ptr cmp_func, void *src, uint64_t
int b_type;
b_type = btype;
if (analyzed)
b_type = PC_TYPE(actx.one_pct.btype);
else
if (analyzed) {
b_type = actx.ten_pct.btype;
} else {
b_type = analyze_buffer_simple(from, fromlen);
}
if (PC_TYPE(b_type) & TYPE_TEXT) {
_dstlen = fromlen;
@ -286,7 +309,7 @@ preproc_compress(pc_ctx_t *pctx, compress_func_ptr cmp_func, void *src, uint64_t
b_type = btype;
if (analyzed)
b_type = actx.forty_pct.btype;
b_type = actx.thirty_pct.btype;
if (!(PC_TYPE(b_type) & TYPE_BINARY)) {
hashsize = lzp_hash_size(level);
@ -311,7 +334,7 @@ preproc_compress(pc_ctx_t *pctx, compress_func_ptr cmp_func, void *src, uint64_t
b_type = btype;
if (analyzed)
b_type = actx.one_pct.btype;
b_type = actx.ten_pct.btype;
if (!(PC_TYPE(b_type) & TYPE_TEXT)) {
_dstlen = fromlen;