A bunch of small fixes in Dict.
Improve text analysis for markup tags. Use Libbsc for plain text and PPMd for text mixed with markup. Change the text and markup detection thresholds.
This commit is contained in:
parent
66a482c968
commit
077da83d5d
4 changed files with 34 additions and 31 deletions
|
@ -40,10 +40,6 @@
|
||||||
#include <pc_archive.h>
|
#include <pc_archive.h>
|
||||||
#include "filters/analyzer/analyzer.h"
|
#include "filters/analyzer/analyzer.h"
|
||||||
|
|
||||||
#define FIFTY_PCT(x) (((x)/10) * 5)
|
|
||||||
#define FORTY_PCT(x) (((x)/10) * 4)
|
|
||||||
#define ONE_PCT(x) ((x)/100)
|
|
||||||
|
|
||||||
static unsigned int lzma_count = 0;
|
static unsigned int lzma_count = 0;
|
||||||
static unsigned int bzip2_count = 0;
|
static unsigned int bzip2_count = 0;
|
||||||
static unsigned int bsc_count = 0;
|
static unsigned int bsc_count = 0;
|
||||||
|
@ -246,9 +242,12 @@ int
|
||||||
is_bsc_type(int btype)
|
is_bsc_type(int btype)
|
||||||
{
|
{
|
||||||
int stype = PC_SUBTYPE(btype);
|
int stype = PC_SUBTYPE(btype);
|
||||||
return ((stype == TYPE_MARKUP) | (stype == TYPE_BMP) | (stype == TYPE_DNA_SEQ) |
|
int mtype = PC_TYPE(btype);
|
||||||
|
|
||||||
|
return ((stype == TYPE_BMP) | (stype == TYPE_DNA_SEQ) |
|
||||||
(stype == TYPE_MP4) | (stype == TYPE_FLAC) | (stype == TYPE_AVI) |
|
(stype == TYPE_MP4) | (stype == TYPE_FLAC) | (stype == TYPE_AVI) |
|
||||||
(stype == TYPE_DICOM) | (stype == TYPE_MEDIA_BSC));
|
(stype == TYPE_DICOM) | (stype == TYPE_MEDIA_BSC) |
|
||||||
|
(mtype == TYPE_TEXT && stype != TYPE_MARKUP));
|
||||||
}
|
}
|
||||||
|
|
||||||
int
|
int
|
||||||
|
@ -260,7 +259,8 @@ adapt_compress(void *src, uint64_t srclen, void *dst,
|
||||||
int stype = PC_SUBTYPE(btype);
|
int stype = PC_SUBTYPE(btype);
|
||||||
analyzer_ctx_t actx;
|
analyzer_ctx_t actx;
|
||||||
|
|
||||||
if (btype == TYPE_UNKNOWN || stype == TYPE_ARCHIVE_TAR || stype == TYPE_PDF) {
|
if (btype == TYPE_UNKNOWN || PC_TYPE(btype) == TYPE_TEXT ||
|
||||||
|
stype == TYPE_ARCHIVE_TAR || stype == TYPE_PDF) {
|
||||||
if (adat->actx == NULL) {
|
if (adat->actx == NULL) {
|
||||||
analyze_buffer(src, srclen, &actx);
|
analyze_buffer(src, srclen, &actx);
|
||||||
adat->actx = &actx;
|
adat->actx = &actx;
|
||||||
|
@ -271,8 +271,6 @@ adapt_compress(void *src, uint64_t srclen, void *dst,
|
||||||
} else if (adat->adapt_mode == 1) {
|
} else if (adat->adapt_mode == 1) {
|
||||||
btype = adat->actx->fifty_pct.btype;
|
btype = adat->actx->fifty_pct.btype;
|
||||||
}
|
}
|
||||||
if (stype == TYPE_PDF)
|
|
||||||
btype |= TYPE_MARKUP;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Reset analyzer context for subsequent calls. */
|
/* Reset analyzer context for subsequent calls. */
|
||||||
|
|
|
@ -27,13 +27,13 @@
|
||||||
|
|
||||||
#define FIFTY_PCT(x) (((x)/10) * 5)
|
#define FIFTY_PCT(x) (((x)/10) * 5)
|
||||||
#define FORTY_PCT(x) (((x)/10) * 4)
|
#define FORTY_PCT(x) (((x)/10) * 4)
|
||||||
#define ONE_PCT(x) ((x)/100)
|
#define TEN_PCT(x) ((x)/10)
|
||||||
|
|
||||||
void
|
void
|
||||||
analyze_buffer(void *src, uint64_t srclen, analyzer_ctx_t *actx)
|
analyze_buffer(void *src, uint64_t srclen, analyzer_ctx_t *actx)
|
||||||
{
|
{
|
||||||
uchar_t *src1 = (uchar_t *)src;
|
uchar_t *src1 = (uchar_t *)src;
|
||||||
uint64_t i, tot8b, tot_8b, lbytes;
|
uint64_t i, tot8b, tot_8b, lbytes, spc;
|
||||||
uchar_t cur_byte, prev_byte;
|
uchar_t cur_byte, prev_byte;
|
||||||
uint64_t tag1, tag2, tag3;
|
uint64_t tag1, tag2, tag3;
|
||||||
double tagcnt, pct_tag;
|
double tagcnt, pct_tag;
|
||||||
|
@ -47,11 +47,13 @@ analyze_buffer(void *src, uint64_t srclen, analyzer_ctx_t *actx)
|
||||||
tag2 = 0;
|
tag2 = 0;
|
||||||
tag3 = 0;
|
tag3 = 0;
|
||||||
lbytes = 0;
|
lbytes = 0;
|
||||||
|
spc = 0;
|
||||||
prev_byte = cur_byte = 0;
|
prev_byte = cur_byte = 0;
|
||||||
for (i = 0; i < srclen; i++) {
|
for (i = 0; i < srclen; i++) {
|
||||||
cur_byte = src1[i];
|
cur_byte = src1[i];
|
||||||
tot8b += (cur_byte & 0x80); // This way for possible auto-vectorization
|
tot8b += (cur_byte & 0x80); // This way for possible auto-vectorization
|
||||||
lbytes += (cur_byte < 32);
|
lbytes += (cur_byte < 32);
|
||||||
|
spc += (cur_byte == ' ');
|
||||||
tag1 += (cur_byte == '<');
|
tag1 += (cur_byte == '<');
|
||||||
tag2 += (cur_byte == '>');
|
tag2 += (cur_byte == '>');
|
||||||
tag3 += ((prev_byte == '<') & (cur_byte == '/'));
|
tag3 += ((prev_byte == '<') & (cur_byte == '/'));
|
||||||
|
@ -65,7 +67,7 @@ analyze_buffer(void *src, uint64_t srclen, analyzer_ctx_t *actx)
|
||||||
* significance levels.
|
* significance levels.
|
||||||
*/
|
*/
|
||||||
tot_8b = tot8b / 0x80 + lbytes;
|
tot_8b = tot8b / 0x80 + lbytes;
|
||||||
tagcnt = tag1 + tag2 + tag3;
|
tagcnt = tag1 + tag2;
|
||||||
pct_tag = tagcnt / (double)srclen;
|
pct_tag = tagcnt / (double)srclen;
|
||||||
if (tot_8b > FORTY_PCT(srclen)) {
|
if (tot_8b > FORTY_PCT(srclen)) {
|
||||||
actx->forty_pct.btype = TYPE_BINARY;
|
actx->forty_pct.btype = TYPE_BINARY;
|
||||||
|
@ -80,13 +82,13 @@ analyze_buffer(void *src, uint64_t srclen, analyzer_ctx_t *actx)
|
||||||
}
|
}
|
||||||
|
|
||||||
tot8b /= 0x80;
|
tot8b /= 0x80;
|
||||||
if (tot8b == 0 && lbytes < ((srclen>>1) + (srclen>>2) + (srclen>>3))) {
|
if (tot8b <= TEN_PCT((double)srclen) && lbytes < ((srclen>>1) + (srclen>>2) + (srclen>>3))) {
|
||||||
actx->one_pct.btype = TYPE_TEXT;
|
actx->one_pct.btype = TYPE_TEXT;
|
||||||
}
|
}
|
||||||
|
|
||||||
markup = 0;
|
markup = 0;
|
||||||
if (tag1 > tag2 - 4 && tag1 < tag2 + 4 && tag3 > (double)tag1 * 0.40 &&
|
if (tag1 > tag2 - 4 && tag1 < tag2 + 4 && tag3 > (double)tag1 * 0.40 &&
|
||||||
tagcnt > (double)srclen * 0.001)
|
tagcnt > (double)spc * 0.1)
|
||||||
markup = 1;
|
markup = 1;
|
||||||
|
|
||||||
if (markup) {
|
if (markup) {
|
||||||
|
@ -120,7 +122,7 @@ analyze_buffer_simple(void *src, uint64_t srclen)
|
||||||
* Heuristics for detecting BINARY vs generic TEXT
|
* Heuristics for detecting BINARY vs generic TEXT
|
||||||
*/
|
*/
|
||||||
tot8b /= 0x80;
|
tot8b /= 0x80;
|
||||||
if (tot8b == 0 && lbytes < ((srclen>>1) + (srclen>>2) + (srclen>>3))) {
|
if (tot8b <= TEN_PCT((double)srclen) && lbytes < ((srclen>>1) + (srclen>>2) + (srclen>>3))) {
|
||||||
btype = TYPE_TEXT;
|
btype = TYPE_TEXT;
|
||||||
}
|
}
|
||||||
return (btype);
|
return (btype);
|
||||||
|
|
|
@ -324,7 +324,6 @@ DictFilter::DictFilter()
|
||||||
SEPARATOR['\r'] = 1;
|
SEPARATOR['\r'] = 1;
|
||||||
SEPARATOR['>'] = 1;
|
SEPARATOR['>'] = 1;
|
||||||
SEPARATOR[']'] = 1;
|
SEPARATOR[']'] = 1;
|
||||||
SEPARATOR['\''] = 1;
|
|
||||||
SEPARATOR[')'] = 1;
|
SEPARATOR[')'] = 1;
|
||||||
SEPARATOR['.'] = 1;
|
SEPARATOR['.'] = 1;
|
||||||
SEPARATOR['?'] = 1;
|
SEPARATOR['?'] = 1;
|
||||||
|
@ -847,13 +846,12 @@ DictFilter::Forward_Dict(uint8_t *src, uint32_t size, uint8_t *dst, uint32_t *ds
|
||||||
* Encode literal numeric strings.
|
* Encode literal numeric strings.
|
||||||
*/
|
*/
|
||||||
converted = 0;
|
converted = 0;
|
||||||
if (word[0] != '+' && word[0] != '-' && word[0] != '0' &&
|
if (isdigit(word[0]) && word[0] != '0' && toklen > 4 && toklen < 10) {
|
||||||
toklen > 4 && toklen < 10) {
|
|
||||||
copy_bytes(num, word, toklen);
|
copy_bytes(num, word, toklen);
|
||||||
num[toklen] = '\0';
|
num[toklen] = '\0';
|
||||||
val = strtoul((const char *)num, (char **)&word, 10);
|
val = strtoul((const char *)num, (char **)&word, 10);
|
||||||
|
|
||||||
if (*word == '\0') {
|
if (*word == '\0' && word - num == toklen && val > 0) {
|
||||||
uint8_t tok_hdr[10], *dnum;
|
uint8_t tok_hdr[10], *dnum;
|
||||||
sz = sizeof (tok_hdr);
|
sz = sizeof (tok_hdr);
|
||||||
dnum = to_base_enc(val, tok_hdr, sz);
|
dnum = to_base_enc(val, tok_hdr, sz);
|
||||||
|
@ -914,7 +912,7 @@ DictFilter::Inverse_Dict(uint8_t *src, uint32_t srclen, uint8_t *dst, uint32_t *
|
||||||
decode_dict_entry_t *w_dict;
|
decode_dict_entry_t *w_dict;
|
||||||
|
|
||||||
end = src + srclen;
|
end = src + srclen;
|
||||||
srcpos = (uint8_t *)strchr((const char *)src, ' ');
|
srcpos = (uint8_t *)memchr((const void *)src, ' ', WORD_MAX);
|
||||||
if (srcpos - src > 12) {
|
if (srcpos - src > 12) {
|
||||||
return (0);
|
return (0);
|
||||||
}
|
}
|
||||||
|
@ -924,7 +922,11 @@ DictFilter::Inverse_Dict(uint8_t *src, uint32_t srclen, uint8_t *dst, uint32_t *
|
||||||
w_dict = new decode_dict_entry_t[numWords];
|
w_dict = new decode_dict_entry_t[numWords];
|
||||||
for (i = 0; i < numWords && srcpos < end; i++) {
|
for (i = 0; i < numWords && srcpos < end; i++) {
|
||||||
uint8_t *w_src = srcpos;
|
uint8_t *w_src = srcpos;
|
||||||
srcpos = (uint8_t *)strchr((const char *)srcpos, ' ');
|
size_t limit;
|
||||||
|
|
||||||
|
limit = end - srcpos;
|
||||||
|
if (limit > WORD_MAX+1) limit = WORD_MAX+1;
|
||||||
|
srcpos = (uint8_t *)memchr((const void *)srcpos, ' ', limit);
|
||||||
if (srcpos - w_src > WORD_MAX)
|
if (srcpos - w_src > WORD_MAX)
|
||||||
return (0);
|
return (0);
|
||||||
|
|
||||||
|
@ -1089,6 +1091,7 @@ dict_decode(uint8_t *from, uint64_t fromlen, uint8_t *to, uint64_t *dstlen)
|
||||||
log_msg(LOG_ERR, 0, "dict_decode: Failed.\n");
|
log_msg(LOG_ERR, 0, "dict_decode: Failed.\n");
|
||||||
return (-1);
|
return (-1);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (dl < *dstlen) {
|
if (dl < *dstlen) {
|
||||||
log_msg(LOG_ERR, 0, "dict_decode: Expected: %" PRIu64 ", Got: %" PRIu64 "\n",
|
log_msg(LOG_ERR, 0, "dict_decode: Expected: %" PRIu64 ", Got: %" PRIu64 "\n",
|
||||||
*dstlen, dl);
|
*dstlen, dl);
|
||||||
|
|
14
pcompress.c
14
pcompress.c
|
@ -211,7 +211,7 @@ preproc_compress(pc_ctx_t *pctx, compress_func_ptr cmp_func, void *src, uint64_t
|
||||||
int result;
|
int result;
|
||||||
uint64_t _dstlen, fromlen;
|
uint64_t _dstlen, fromlen;
|
||||||
uchar_t *from, *to;
|
uchar_t *from, *to;
|
||||||
int stype, dict, analyzed;
|
int stype, analyzed;
|
||||||
analyzer_ctx_t actx;
|
analyzer_ctx_t actx;
|
||||||
DEBUG_STAT_EN(double strt, en);
|
DEBUG_STAT_EN(double strt, en);
|
||||||
|
|
||||||
|
@ -221,9 +221,9 @@ preproc_compress(pc_ctx_t *pctx, compress_func_ptr cmp_func, void *src, uint64_t
|
||||||
fromlen = srclen;
|
fromlen = srclen;
|
||||||
result = 0;
|
result = 0;
|
||||||
stype = PC_SUBTYPE(btype);
|
stype = PC_SUBTYPE(btype);
|
||||||
dict = 0;
|
|
||||||
analyzed = 0;
|
analyzed = 0;
|
||||||
if (btype == TYPE_UNKNOWN || stype == TYPE_ARCHIVE_TAR || stype == TYPE_PDF || interesting) {
|
if (btype == TYPE_UNKNOWN || stype == TYPE_ARCHIVE_TAR || stype == TYPE_PDF ||
|
||||||
|
PC_TYPE(btype) == TYPE_TEXT || interesting) {
|
||||||
analyze_buffer(src, srclen, &actx);
|
analyze_buffer(src, srclen, &actx);
|
||||||
analyzed = 1;
|
analyzed = 1;
|
||||||
if (pctx->adapt_mode)
|
if (pctx->adapt_mode)
|
||||||
|
@ -258,12 +258,13 @@ preproc_compress(pc_ctx_t *pctx, compress_func_ptr cmp_func, void *src, uint64_t
|
||||||
if (pctx->lzp_preprocess) {
|
if (pctx->lzp_preprocess) {
|
||||||
int b_type;
|
int b_type;
|
||||||
|
|
||||||
|
b_type = btype;
|
||||||
if (analyzed)
|
if (analyzed)
|
||||||
b_type = PC_TYPE(actx.forty_pct.btype);
|
b_type = PC_TYPE(actx.one_pct.btype);
|
||||||
else
|
else
|
||||||
b_type = PC_TYPE(analyze_buffer_simple(from, fromlen));
|
b_type = analyze_buffer_simple(from, fromlen);
|
||||||
|
|
||||||
if (b_type == TYPE_TEXT) {
|
if (PC_TYPE(b_type) == TYPE_TEXT) {
|
||||||
_dstlen = fromlen;
|
_dstlen = fromlen;
|
||||||
result = dict_encode(from, fromlen, to, &_dstlen);
|
result = dict_encode(from, fromlen, to, &_dstlen);
|
||||||
if (result != -1) {
|
if (result != -1) {
|
||||||
|
@ -273,7 +274,6 @@ preproc_compress(pc_ctx_t *pctx, compress_func_ptr cmp_func, void *src, uint64_t
|
||||||
to = tmp;
|
to = tmp;
|
||||||
fromlen = _dstlen;
|
fromlen = _dstlen;
|
||||||
type |= PREPROC_TYPE_DICT;
|
type |= PREPROC_TYPE_DICT;
|
||||||
dict = result;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in a new issue