Multitude of tweaks and improvements.
* Use BSC for PNM type and Markup containing binary data. * Change thresholds in analyzer. * Properly use double precision in analyzer for accuracy. * Indicate BSC processing of packPNM output. * Bring back raw-block Dispack for files not processed by the Dispack filter.
This commit is contained in:
parent
4360c5581f
commit
6a757ddb2c
8 changed files with 59 additions and 54 deletions
|
@ -244,10 +244,11 @@ is_bsc_type(int btype)
|
||||||
int stype = PC_SUBTYPE(btype);
|
int stype = PC_SUBTYPE(btype);
|
||||||
int mtype = PC_TYPE(btype);
|
int mtype = PC_TYPE(btype);
|
||||||
|
|
||||||
return ((stype == TYPE_BMP) | (stype == TYPE_DNA_SEQ) |
|
return ((stype == TYPE_BMP) | (stype == TYPE_DNA_SEQ) | (stype == TYPE_PNM) |
|
||||||
(stype == TYPE_MP4) | (stype == TYPE_FLAC) | (stype == TYPE_AVI) |
|
(stype == TYPE_MP4) | (stype == TYPE_FLAC) | (stype == TYPE_AVI) |
|
||||||
(stype == TYPE_DICOM) | (stype == TYPE_MEDIA_BSC) |
|
(stype == TYPE_DICOM) | (stype == TYPE_MEDIA_BSC) |
|
||||||
(mtype & TYPE_TEXT && stype != TYPE_MARKUP));
|
(mtype & TYPE_TEXT && stype != TYPE_MARKUP) |
|
||||||
|
(mtype & TYPE_BINARY && stype == TYPE_MARKUP));
|
||||||
}
|
}
|
||||||
|
|
||||||
int
|
int
|
||||||
|
@ -266,13 +267,14 @@ adapt_compress(void *src, uint64_t srclen, void *dst,
|
||||||
adat->actx = &actx;
|
adat->actx = &actx;
|
||||||
}
|
}
|
||||||
if (adat->adapt_mode == 2) {
|
if (adat->adapt_mode == 2) {
|
||||||
btype = adat->actx->forty_pct.btype;
|
btype = adat->actx->thirty_pct.btype;
|
||||||
|
|
||||||
} else if (adat->adapt_mode == 1) {
|
} else if (adat->adapt_mode == 1) {
|
||||||
btype = adat->actx->fifty_pct.btype;
|
btype = adat->actx->fifty_pct.btype;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/* Reset analyzer context for subsequent calls. */
|
/* Reset analyzer context for subsequent calls. */
|
||||||
adat->actx = NULL;
|
adat->actx = NULL;
|
||||||
|
|
||||||
|
|
|
@ -93,7 +93,7 @@ add_filters_by_type(struct type_data *typetab, struct filter_flags *ff)
|
||||||
typetab[slot].filter_private = sdat;
|
typetab[slot].filter_private = sdat;
|
||||||
typetab[slot].filter_func = packpnm_filter;
|
typetab[slot].filter_func = packpnm_filter;
|
||||||
typetab[slot].filter_name = "packPNM";
|
typetab[slot].filter_name = "packPNM";
|
||||||
typetab[slot].result_type = TYPE_BINARY;
|
typetab[slot].result_type = TYPE_BINARY | TYPE_MEDIA_BSC;
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
|
@ -1471,10 +1471,6 @@ copy_data_out(struct archive *ar, struct archive *aw, struct archive_entry *entr
|
||||||
return (ARCHIVE_FATAL);
|
return (ARCHIVE_FATAL);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
/*
|
|
||||||
* If the filter above fails we fall through below to consume
|
|
||||||
* the data for the entry.
|
|
||||||
*/
|
|
||||||
}
|
}
|
||||||
|
|
||||||
for (;;) {
|
for (;;) {
|
||||||
|
|
|
@ -25,9 +25,9 @@
|
||||||
#include "utils.h"
|
#include "utils.h"
|
||||||
#include "analyzer.h"
|
#include "analyzer.h"
|
||||||
|
|
||||||
#define FIFTY_PCT(x) (((x)/10) * 5)
|
#define FIFTY_PCT(x) ((((double)x)/10) * 5)
|
||||||
#define FORTY_PCT(x) (((x)/10) * 4)
|
#define THIRTY_PCT(x) ((((double)x)/10) * 3)
|
||||||
#define TEN_PCT(x) ((x)/10)
|
#define TEN_PCT(x) (((double)x)/10)
|
||||||
|
|
||||||
void
|
void
|
||||||
analyze_buffer(void *src, uint64_t srclen, analyzer_ctx_t *actx)
|
analyze_buffer(void *src, uint64_t srclen, analyzer_ctx_t *actx)
|
||||||
|
@ -37,7 +37,6 @@ analyze_buffer(void *src, uint64_t srclen, analyzer_ctx_t *actx)
|
||||||
uchar_t cur_byte, prev_byte;
|
uchar_t cur_byte, prev_byte;
|
||||||
uint64_t tag1, tag2, tag3;
|
uint64_t tag1, tag2, tag3;
|
||||||
double tagcnt, pct_tag;
|
double tagcnt, pct_tag;
|
||||||
int markup;
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Count number of 8-bit binary bytes and XML tags in source.
|
* Count number of 8-bit binary bytes and XML tags in source.
|
||||||
|
@ -49,9 +48,10 @@ analyze_buffer(void *src, uint64_t srclen, analyzer_ctx_t *actx)
|
||||||
lbytes = 0;
|
lbytes = 0;
|
||||||
spc = 0;
|
spc = 0;
|
||||||
prev_byte = cur_byte = 0;
|
prev_byte = cur_byte = 0;
|
||||||
|
memset(actx, 0, sizeof (analyzer_ctx_t));
|
||||||
for (i = 0; i < srclen; i++) {
|
for (i = 0; i < srclen; i++) {
|
||||||
cur_byte = src1[i];
|
cur_byte = src1[i];
|
||||||
tot8b += (cur_byte & 0x80); // This way for possible auto-vectorization
|
tot8b += (cur_byte > 127);
|
||||||
lbytes += (cur_byte < 32);
|
lbytes += (cur_byte < 32);
|
||||||
spc += (cur_byte == ' ');
|
spc += (cur_byte == ' ');
|
||||||
tag1 += (cur_byte == '<');
|
tag1 += (cur_byte == '<');
|
||||||
|
@ -66,13 +66,13 @@ analyze_buffer(void *src, uint64_t srclen, analyzer_ctx_t *actx)
|
||||||
* Heuristics for detecting BINARY vs generic TEXT vs XML data at various
|
* Heuristics for detecting BINARY vs generic TEXT vs XML data at various
|
||||||
* significance levels.
|
* significance levels.
|
||||||
*/
|
*/
|
||||||
tot_8b = tot8b / 0x80 + lbytes;
|
tot_8b = tot8b + lbytes;
|
||||||
tagcnt = tag1 + tag2;
|
tagcnt = tag1 + tag2;
|
||||||
pct_tag = tagcnt / (double)srclen;
|
pct_tag = tagcnt / (double)srclen;
|
||||||
if (tot_8b > FORTY_PCT(srclen)) {
|
if (tot_8b > THIRTY_PCT(srclen)) {
|
||||||
actx->forty_pct.btype = TYPE_BINARY;
|
actx->thirty_pct.btype = TYPE_BINARY;
|
||||||
} else {
|
} else {
|
||||||
actx->forty_pct.btype = TYPE_TEXT;
|
actx->thirty_pct.btype = TYPE_TEXT;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (tot_8b > FIFTY_PCT(srclen)) {
|
if (tot_8b > FIFTY_PCT(srclen)) {
|
||||||
|
@ -81,23 +81,18 @@ analyze_buffer(void *src, uint64_t srclen, analyzer_ctx_t *actx)
|
||||||
actx->fifty_pct.btype = TYPE_TEXT;
|
actx->fifty_pct.btype = TYPE_TEXT;
|
||||||
}
|
}
|
||||||
|
|
||||||
tot8b /= 0x80;
|
/* This should be tot8b and not tot_8b. */
|
||||||
if (tot8b <= TEN_PCT((double)srclen) && lbytes < ((srclen>>1) + (srclen>>2) + (srclen>>3))) {
|
if (tot8b <= TEN_PCT((double)srclen) && lbytes < ((srclen>>1) + (srclen>>2) + (srclen>>3))) {
|
||||||
actx->one_pct.btype = TYPE_TEXT;
|
actx->ten_pct.btype = TYPE_TEXT;
|
||||||
|
} else {
|
||||||
|
actx->ten_pct.btype = TYPE_BINARY;
|
||||||
}
|
}
|
||||||
|
|
||||||
markup = 0;
|
|
||||||
if (tag1 > tag2 - 4 && tag1 < tag2 + 4 && tag3 > (double)tag1 * 0.40 &&
|
if (tag1 > tag2 - 4 && tag1 < tag2 + 4 && tag3 > (double)tag1 * 0.40 &&
|
||||||
tagcnt > (double)spc * 0.06)
|
tagcnt > (double)spc * 0.06) {
|
||||||
markup = 1;
|
actx->thirty_pct.btype |= TYPE_MARKUP;
|
||||||
|
actx->fifty_pct.btype |= TYPE_MARKUP;
|
||||||
if (markup) {
|
actx->ten_pct.btype |= TYPE_MARKUP;
|
||||||
if (actx->forty_pct.btype == TYPE_TEXT)
|
|
||||||
actx->forty_pct.btype |= TYPE_MARKUP;
|
|
||||||
if (actx->fifty_pct.btype == TYPE_TEXT)
|
|
||||||
actx->fifty_pct.btype |= TYPE_MARKUP;
|
|
||||||
if (actx->one_pct.btype == TYPE_TEXT)
|
|
||||||
actx->one_pct.btype |= TYPE_MARKUP;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -34,8 +34,8 @@ struct significance_value {
|
||||||
};
|
};
|
||||||
|
|
||||||
typedef struct _analyzer_ctx {
|
typedef struct _analyzer_ctx {
|
||||||
struct significance_value one_pct;
|
struct significance_value ten_pct;
|
||||||
struct significance_value forty_pct;
|
struct significance_value thirty_pct;
|
||||||
struct significance_value fifty_pct;
|
struct significance_value fifty_pct;
|
||||||
} analyzer_ctx_t;
|
} analyzer_ctx_t;
|
||||||
|
|
||||||
|
|
|
@ -911,10 +911,6 @@ DisUnFilter(sU8 *source,sU32 sourceSize,sU8 *dest,sU32 destSize,sU32 memStart)
|
||||||
return sTRUE;
|
return sTRUE;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
|
||||||
* NOTE: function unused. Retained for future need.
|
|
||||||
*/
|
|
||||||
#if 0
|
|
||||||
/*
|
/*
|
||||||
* Try to estimate if the given data block contains 32-bit x86 instructions
|
* Try to estimate if the given data block contains 32-bit x86 instructions
|
||||||
* especially of the call and jmp variety.
|
* especially of the call and jmp variety.
|
||||||
|
@ -939,7 +935,6 @@ is_x86_code(uchar_t *buf, int len)
|
||||||
avgFreq = ln>>8;
|
avgFreq = ln>>8;
|
||||||
return (freq[0x8b] > avgFreq && freq[0x00] > avgFreq * 2 && freq[0xE8] > 6);
|
return (freq[0x8b] > avgFreq && freq[0x00] > avgFreq * 2 && freq[0xE8] > 6);
|
||||||
}
|
}
|
||||||
#endif
|
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
extern "C" {
|
extern "C" {
|
||||||
|
@ -1025,10 +1020,6 @@ Inverse_E89(uint8_t *src, uint64_t sz)
|
||||||
return (0);
|
return (0);
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
|
||||||
* NOTE: function unused. Retained for future need.
|
|
||||||
*/
|
|
||||||
#if 0
|
|
||||||
/*
|
/*
|
||||||
* 32-bit x86 executable packer top-level routines. Detected x86 executable data
|
* 32-bit x86 executable packer top-level routines. Detected x86 executable data
|
||||||
* are passed through these encoding routines. The data chunk is split into 32KB
|
* are passed through these encoding routines. The data chunk is split into 32KB
|
||||||
|
@ -1058,7 +1049,6 @@ dispack_encode(uchar_t *from, uint64_t fromlen, uchar_t *to, uint64_t *dstlen)
|
||||||
pos_to = to;
|
pos_to = to;
|
||||||
to_last = to + *dstlen;
|
to_last = to + *dstlen;
|
||||||
while (len > 0) {
|
while (len > 0) {
|
||||||
DisFilterCtx ctx(0, DISFILTER_BLOCK);
|
|
||||||
sU32 sz;
|
sU32 sz;
|
||||||
sU16 origsize;
|
sU16 origsize;
|
||||||
sU32 out;
|
sU32 out;
|
||||||
|
@ -1082,8 +1072,7 @@ dispack_encode(uchar_t *from, uint64_t fromlen, uchar_t *to, uint64_t *dstlen)
|
||||||
|
|
||||||
out = sz;
|
out = sz;
|
||||||
if (is_x86_code(pos, sz)) {
|
if (is_x86_code(pos, sz)) {
|
||||||
ctx.ResetCtx(0, sz);
|
rv = DisFilter(pos, sz, 0, pos_to, out);
|
||||||
rv = DisFilter(ctx, pos, sz, 0, pos_to, out);
|
|
||||||
} else {
|
} else {
|
||||||
rv = NULL;
|
rv = NULL;
|
||||||
}
|
}
|
||||||
|
@ -1128,7 +1117,6 @@ dispack_encode(uchar_t *from, uint64_t fromlen, uchar_t *to, uint64_t *dstlen)
|
||||||
#endif
|
#endif
|
||||||
return (0);
|
return (0);
|
||||||
}
|
}
|
||||||
#endif
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* This function retained for ability to decode older archives encoded using raw block
|
* This function retained for ability to decode older archives encoded using raw block
|
||||||
|
|
|
@ -32,6 +32,7 @@
|
||||||
extern "C" {
|
extern "C" {
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
int dispack_encode(uchar_t *from, uint64_t fromlen, uchar_t *to, uint64_t *dstlen);
|
||||||
int dispack_decode(uchar_t *from, uint64_t fromlen, uchar_t *to, uint64_t *dstlen);
|
int dispack_decode(uchar_t *from, uint64_t fromlen, uchar_t *to, uint64_t *dstlen);
|
||||||
|
|
||||||
int Forward_E89(uint8_t *src, uint64_t sz);
|
int Forward_E89(uint8_t *src, uint64_t sz);
|
||||||
|
|
41
pcompress.c
41
pcompress.c
|
@ -233,12 +233,34 @@ preproc_compress(pc_ctx_t *pctx, compress_func_ptr cmp_func, void *src, uint64_t
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Dispack is used for 32-bit EXE files via a libarchive filter routine.
|
* Dispack is used for 32-bit EXE files via a libarchive filter routine.
|
||||||
* However if Dispack fails or 64-bit exes are detected we apply an E8E9
|
* For 64-bit exes or AR archives we apply an E8E9 CALL/JMP transform filter.
|
||||||
* CALL/JMP transform filter.
|
|
||||||
*/
|
*/
|
||||||
if (pctx->exe_preprocess) {
|
if (pctx->exe_preprocess) {
|
||||||
if (stype == TYPE_EXE32 || stype == TYPE_EXE64 ||
|
int processed = 0;
|
||||||
stype == TYPE_ARCHIVE_AR || stype == TYPE_EXE32_PE) {
|
|
||||||
|
if (stype == TYPE_EXE32 || stype == TYPE_EXE32_PE ||
|
||||||
|
stype == TYPE_EXE64 || stype == TYPE_ARCHIVE_AR) {
|
||||||
|
/*
|
||||||
|
* If file-level Dispack did not happen for 32-bit EXEs it was
|
||||||
|
* most likely that the file was large. So, as a workaround,
|
||||||
|
* we do raw-block Dispack here. However if even this fails to
|
||||||
|
* get any worthwhile reduction we do E8E9 as the final
|
||||||
|
* fallback.
|
||||||
|
*/
|
||||||
|
_dstlen = fromlen;
|
||||||
|
result = dispack_encode((uchar_t *)from, fromlen, to, &_dstlen);
|
||||||
|
if (result != -1) {
|
||||||
|
uchar_t *tmp;
|
||||||
|
tmp = from;
|
||||||
|
from = to;
|
||||||
|
to = tmp;
|
||||||
|
fromlen = _dstlen;
|
||||||
|
type |= PREPROC_TYPE_DISPACK;
|
||||||
|
processed = 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!processed) {
|
||||||
_dstlen = fromlen;
|
_dstlen = fromlen;
|
||||||
memcpy(to, from, fromlen);
|
memcpy(to, from, fromlen);
|
||||||
if (Forward_E89(to, fromlen) == 0) {
|
if (Forward_E89(to, fromlen) == 0) {
|
||||||
|
@ -260,10 +282,11 @@ preproc_compress(pc_ctx_t *pctx, compress_func_ptr cmp_func, void *src, uint64_t
|
||||||
int b_type;
|
int b_type;
|
||||||
|
|
||||||
b_type = btype;
|
b_type = btype;
|
||||||
if (analyzed)
|
if (analyzed) {
|
||||||
b_type = PC_TYPE(actx.one_pct.btype);
|
b_type = actx.ten_pct.btype;
|
||||||
else
|
} else {
|
||||||
b_type = analyze_buffer_simple(from, fromlen);
|
b_type = analyze_buffer_simple(from, fromlen);
|
||||||
|
}
|
||||||
|
|
||||||
if (PC_TYPE(b_type) & TYPE_TEXT) {
|
if (PC_TYPE(b_type) & TYPE_TEXT) {
|
||||||
_dstlen = fromlen;
|
_dstlen = fromlen;
|
||||||
|
@ -286,7 +309,7 @@ preproc_compress(pc_ctx_t *pctx, compress_func_ptr cmp_func, void *src, uint64_t
|
||||||
|
|
||||||
b_type = btype;
|
b_type = btype;
|
||||||
if (analyzed)
|
if (analyzed)
|
||||||
b_type = actx.forty_pct.btype;
|
b_type = actx.thirty_pct.btype;
|
||||||
|
|
||||||
if (!(PC_TYPE(b_type) & TYPE_BINARY)) {
|
if (!(PC_TYPE(b_type) & TYPE_BINARY)) {
|
||||||
hashsize = lzp_hash_size(level);
|
hashsize = lzp_hash_size(level);
|
||||||
|
@ -311,7 +334,7 @@ preproc_compress(pc_ctx_t *pctx, compress_func_ptr cmp_func, void *src, uint64_t
|
||||||
|
|
||||||
b_type = btype;
|
b_type = btype;
|
||||||
if (analyzed)
|
if (analyzed)
|
||||||
b_type = actx.one_pct.btype;
|
b_type = actx.ten_pct.btype;
|
||||||
|
|
||||||
if (!(PC_TYPE(b_type) & TYPE_TEXT)) {
|
if (!(PC_TYPE(b_type) & TYPE_TEXT)) {
|
||||||
_dstlen = fromlen;
|
_dstlen = fromlen;
|
||||||
|
|
Loading…
Reference in a new issue