Multitude of tweaks and improvements.
* Use BSC for PNM type and Markup containing binary data. * Change thresholds in analyzer. * Properly use double precision in analyzer for accuracy. * Indicate BSC processing of packPNM output. * Bring back raw-block Dispack for files not processed by the Dispack filter.
This commit is contained in:
parent
4360c5581f
commit
6a757ddb2c
8 changed files with 59 additions and 54 deletions
|
@ -244,10 +244,11 @@ is_bsc_type(int btype)
|
|||
int stype = PC_SUBTYPE(btype);
|
||||
int mtype = PC_TYPE(btype);
|
||||
|
||||
return ((stype == TYPE_BMP) | (stype == TYPE_DNA_SEQ) |
|
||||
return ((stype == TYPE_BMP) | (stype == TYPE_DNA_SEQ) | (stype == TYPE_PNM) |
|
||||
(stype == TYPE_MP4) | (stype == TYPE_FLAC) | (stype == TYPE_AVI) |
|
||||
(stype == TYPE_DICOM) | (stype == TYPE_MEDIA_BSC) |
|
||||
(mtype & TYPE_TEXT && stype != TYPE_MARKUP));
|
||||
(mtype & TYPE_TEXT && stype != TYPE_MARKUP) |
|
||||
(mtype & TYPE_BINARY && stype == TYPE_MARKUP));
|
||||
}
|
||||
|
||||
int
|
||||
|
@ -266,13 +267,14 @@ adapt_compress(void *src, uint64_t srclen, void *dst,
|
|||
adat->actx = &actx;
|
||||
}
|
||||
if (adat->adapt_mode == 2) {
|
||||
btype = adat->actx->forty_pct.btype;
|
||||
btype = adat->actx->thirty_pct.btype;
|
||||
|
||||
} else if (adat->adapt_mode == 1) {
|
||||
btype = adat->actx->fifty_pct.btype;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/* Reset analyzer context for subsequent calls. */
|
||||
adat->actx = NULL;
|
||||
|
||||
|
|
|
@ -93,7 +93,7 @@ add_filters_by_type(struct type_data *typetab, struct filter_flags *ff)
|
|||
typetab[slot].filter_private = sdat;
|
||||
typetab[slot].filter_func = packpnm_filter;
|
||||
typetab[slot].filter_name = "packPNM";
|
||||
typetab[slot].result_type = TYPE_BINARY;
|
||||
typetab[slot].result_type = TYPE_BINARY | TYPE_MEDIA_BSC;
|
||||
}
|
||||
#endif
|
||||
|
||||
|
|
|
@ -1471,10 +1471,6 @@ copy_data_out(struct archive *ar, struct archive *aw, struct archive_entry *entr
|
|||
return (ARCHIVE_FATAL);
|
||||
}
|
||||
}
|
||||
/*
|
||||
* If the filter above fails we fall through below to consume
|
||||
* the data for the entry.
|
||||
*/
|
||||
}
|
||||
|
||||
for (;;) {
|
||||
|
|
|
@ -25,9 +25,9 @@
|
|||
#include "utils.h"
|
||||
#include "analyzer.h"
|
||||
|
||||
#define FIFTY_PCT(x) (((x)/10) * 5)
|
||||
#define FORTY_PCT(x) (((x)/10) * 4)
|
||||
#define TEN_PCT(x) ((x)/10)
|
||||
#define FIFTY_PCT(x) ((((double)x)/10) * 5)
|
||||
#define THIRTY_PCT(x) ((((double)x)/10) * 3)
|
||||
#define TEN_PCT(x) (((double)x)/10)
|
||||
|
||||
void
|
||||
analyze_buffer(void *src, uint64_t srclen, analyzer_ctx_t *actx)
|
||||
|
@ -37,7 +37,6 @@ analyze_buffer(void *src, uint64_t srclen, analyzer_ctx_t *actx)
|
|||
uchar_t cur_byte, prev_byte;
|
||||
uint64_t tag1, tag2, tag3;
|
||||
double tagcnt, pct_tag;
|
||||
int markup;
|
||||
|
||||
/*
|
||||
* Count number of 8-bit binary bytes and XML tags in source.
|
||||
|
@ -49,9 +48,10 @@ analyze_buffer(void *src, uint64_t srclen, analyzer_ctx_t *actx)
|
|||
lbytes = 0;
|
||||
spc = 0;
|
||||
prev_byte = cur_byte = 0;
|
||||
memset(actx, 0, sizeof (analyzer_ctx_t));
|
||||
for (i = 0; i < srclen; i++) {
|
||||
cur_byte = src1[i];
|
||||
tot8b += (cur_byte & 0x80); // This way for possible auto-vectorization
|
||||
tot8b += (cur_byte > 127);
|
||||
lbytes += (cur_byte < 32);
|
||||
spc += (cur_byte == ' ');
|
||||
tag1 += (cur_byte == '<');
|
||||
|
@ -66,13 +66,13 @@ analyze_buffer(void *src, uint64_t srclen, analyzer_ctx_t *actx)
|
|||
* Heuristics for detecting BINARY vs generic TEXT vs XML data at various
|
||||
* significance levels.
|
||||
*/
|
||||
tot_8b = tot8b / 0x80 + lbytes;
|
||||
tot_8b = tot8b + lbytes;
|
||||
tagcnt = tag1 + tag2;
|
||||
pct_tag = tagcnt / (double)srclen;
|
||||
if (tot_8b > FORTY_PCT(srclen)) {
|
||||
actx->forty_pct.btype = TYPE_BINARY;
|
||||
if (tot_8b > THIRTY_PCT(srclen)) {
|
||||
actx->thirty_pct.btype = TYPE_BINARY;
|
||||
} else {
|
||||
actx->forty_pct.btype = TYPE_TEXT;
|
||||
actx->thirty_pct.btype = TYPE_TEXT;
|
||||
}
|
||||
|
||||
if (tot_8b > FIFTY_PCT(srclen)) {
|
||||
|
@ -81,23 +81,18 @@ analyze_buffer(void *src, uint64_t srclen, analyzer_ctx_t *actx)
|
|||
actx->fifty_pct.btype = TYPE_TEXT;
|
||||
}
|
||||
|
||||
tot8b /= 0x80;
|
||||
/* This should be tot8b and not tot_8b. */
|
||||
if (tot8b <= TEN_PCT((double)srclen) && lbytes < ((srclen>>1) + (srclen>>2) + (srclen>>3))) {
|
||||
actx->one_pct.btype = TYPE_TEXT;
|
||||
actx->ten_pct.btype = TYPE_TEXT;
|
||||
} else {
|
||||
actx->ten_pct.btype = TYPE_BINARY;
|
||||
}
|
||||
|
||||
markup = 0;
|
||||
if (tag1 > tag2 - 4 && tag1 < tag2 + 4 && tag3 > (double)tag1 * 0.40 &&
|
||||
tagcnt > (double)spc * 0.06)
|
||||
markup = 1;
|
||||
|
||||
if (markup) {
|
||||
if (actx->forty_pct.btype == TYPE_TEXT)
|
||||
actx->forty_pct.btype |= TYPE_MARKUP;
|
||||
if (actx->fifty_pct.btype == TYPE_TEXT)
|
||||
tagcnt > (double)spc * 0.06) {
|
||||
actx->thirty_pct.btype |= TYPE_MARKUP;
|
||||
actx->fifty_pct.btype |= TYPE_MARKUP;
|
||||
if (actx->one_pct.btype == TYPE_TEXT)
|
||||
actx->one_pct.btype |= TYPE_MARKUP;
|
||||
actx->ten_pct.btype |= TYPE_MARKUP;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -34,8 +34,8 @@ struct significance_value {
|
|||
};
|
||||
|
||||
typedef struct _analyzer_ctx {
|
||||
struct significance_value one_pct;
|
||||
struct significance_value forty_pct;
|
||||
struct significance_value ten_pct;
|
||||
struct significance_value thirty_pct;
|
||||
struct significance_value fifty_pct;
|
||||
} analyzer_ctx_t;
|
||||
|
||||
|
|
|
@ -911,10 +911,6 @@ DisUnFilter(sU8 *source,sU32 sourceSize,sU8 *dest,sU32 destSize,sU32 memStart)
|
|||
return sTRUE;
|
||||
}
|
||||
|
||||
/*
|
||||
* NOTE: function unused. Retained for future need.
|
||||
*/
|
||||
#if 0
|
||||
/*
|
||||
* Try to estimate if the given data block contains 32-bit x86 instructions
|
||||
* especially of the call and jmp variety.
|
||||
|
@ -939,7 +935,6 @@ is_x86_code(uchar_t *buf, int len)
|
|||
avgFreq = ln>>8;
|
||||
return (freq[0x8b] > avgFreq && freq[0x00] > avgFreq * 2 && freq[0xE8] > 6);
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
|
@ -1025,10 +1020,6 @@ Inverse_E89(uint8_t *src, uint64_t sz)
|
|||
return (0);
|
||||
}
|
||||
|
||||
/*
|
||||
* NOTE: function unused. Retained for future need.
|
||||
*/
|
||||
#if 0
|
||||
/*
|
||||
* 32-bit x86 executable packer top-level routines. Detected x86 executable data
|
||||
* are passed through these encoding routines. The data chunk is split into 32KB
|
||||
|
@ -1058,7 +1049,6 @@ dispack_encode(uchar_t *from, uint64_t fromlen, uchar_t *to, uint64_t *dstlen)
|
|||
pos_to = to;
|
||||
to_last = to + *dstlen;
|
||||
while (len > 0) {
|
||||
DisFilterCtx ctx(0, DISFILTER_BLOCK);
|
||||
sU32 sz;
|
||||
sU16 origsize;
|
||||
sU32 out;
|
||||
|
@ -1082,8 +1072,7 @@ dispack_encode(uchar_t *from, uint64_t fromlen, uchar_t *to, uint64_t *dstlen)
|
|||
|
||||
out = sz;
|
||||
if (is_x86_code(pos, sz)) {
|
||||
ctx.ResetCtx(0, sz);
|
||||
rv = DisFilter(ctx, pos, sz, 0, pos_to, out);
|
||||
rv = DisFilter(pos, sz, 0, pos_to, out);
|
||||
} else {
|
||||
rv = NULL;
|
||||
}
|
||||
|
@ -1128,7 +1117,6 @@ dispack_encode(uchar_t *from, uint64_t fromlen, uchar_t *to, uint64_t *dstlen)
|
|||
#endif
|
||||
return (0);
|
||||
}
|
||||
#endif
|
||||
|
||||
/*
|
||||
* This function retained for ability to decode older archives encoded using raw block
|
||||
|
|
|
@ -32,6 +32,7 @@
|
|||
extern "C" {
|
||||
#endif
|
||||
|
||||
int dispack_encode(uchar_t *from, uint64_t fromlen, uchar_t *to, uint64_t *dstlen);
|
||||
int dispack_decode(uchar_t *from, uint64_t fromlen, uchar_t *to, uint64_t *dstlen);
|
||||
|
||||
int Forward_E89(uint8_t *src, uint64_t sz);
|
||||
|
|
41
pcompress.c
41
pcompress.c
|
@ -233,12 +233,34 @@ preproc_compress(pc_ctx_t *pctx, compress_func_ptr cmp_func, void *src, uint64_t
|
|||
|
||||
/*
|
||||
* Dispack is used for 32-bit EXE files via a libarchive filter routine.
|
||||
* However if Dispack fails or 64-bit exes are detected we apply an E8E9
|
||||
* CALL/JMP transform filter.
|
||||
* For 64-bit exes or AR archives we apply an E8E9 CALL/JMP transform filter.
|
||||
*/
|
||||
if (pctx->exe_preprocess) {
|
||||
if (stype == TYPE_EXE32 || stype == TYPE_EXE64 ||
|
||||
stype == TYPE_ARCHIVE_AR || stype == TYPE_EXE32_PE) {
|
||||
int processed = 0;
|
||||
|
||||
if (stype == TYPE_EXE32 || stype == TYPE_EXE32_PE ||
|
||||
stype == TYPE_EXE64 || stype == TYPE_ARCHIVE_AR) {
|
||||
/*
|
||||
* If file-level Dispack did not happen for 32-bit EXEs it was
|
||||
* most likely that the file was large. So, as a workaround,
|
||||
* we do raw-block Dispack here. However if even this fails to
|
||||
* get any worthwhile reduction we do E8E9 as the final
|
||||
* fallback.
|
||||
*/
|
||||
_dstlen = fromlen;
|
||||
result = dispack_encode((uchar_t *)from, fromlen, to, &_dstlen);
|
||||
if (result != -1) {
|
||||
uchar_t *tmp;
|
||||
tmp = from;
|
||||
from = to;
|
||||
to = tmp;
|
||||
fromlen = _dstlen;
|
||||
type |= PREPROC_TYPE_DISPACK;
|
||||
processed = 1;
|
||||
}
|
||||
}
|
||||
|
||||
if (!processed) {
|
||||
_dstlen = fromlen;
|
||||
memcpy(to, from, fromlen);
|
||||
if (Forward_E89(to, fromlen) == 0) {
|
||||
|
@ -260,10 +282,11 @@ preproc_compress(pc_ctx_t *pctx, compress_func_ptr cmp_func, void *src, uint64_t
|
|||
int b_type;
|
||||
|
||||
b_type = btype;
|
||||
if (analyzed)
|
||||
b_type = PC_TYPE(actx.one_pct.btype);
|
||||
else
|
||||
if (analyzed) {
|
||||
b_type = actx.ten_pct.btype;
|
||||
} else {
|
||||
b_type = analyze_buffer_simple(from, fromlen);
|
||||
}
|
||||
|
||||
if (PC_TYPE(b_type) & TYPE_TEXT) {
|
||||
_dstlen = fromlen;
|
||||
|
@ -286,7 +309,7 @@ preproc_compress(pc_ctx_t *pctx, compress_func_ptr cmp_func, void *src, uint64_t
|
|||
|
||||
b_type = btype;
|
||||
if (analyzed)
|
||||
b_type = actx.forty_pct.btype;
|
||||
b_type = actx.thirty_pct.btype;
|
||||
|
||||
if (!(PC_TYPE(b_type) & TYPE_BINARY)) {
|
||||
hashsize = lzp_hash_size(level);
|
||||
|
@ -311,7 +334,7 @@ preproc_compress(pc_ctx_t *pctx, compress_func_ptr cmp_func, void *src, uint64_t
|
|||
|
||||
b_type = btype;
|
||||
if (analyzed)
|
||||
b_type = actx.one_pct.btype;
|
||||
b_type = actx.ten_pct.btype;
|
||||
|
||||
if (!(PC_TYPE(b_type) & TYPE_TEXT)) {
|
||||
_dstlen = fromlen;
|
||||
|
|
Loading…
Reference in a new issue