A bunch of small fixes in Dict.

Improve text analysis for markup tags.
Use Libbsc for plain text and PPMd for text mixed with markup.
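Change thresholds: text detection now tolerates up to 10% high-bit bytes, and the markup tag count is compared against the number of spaces instead of the buffer length.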
This commit is contained in:
Moinak Ghosh 2015-01-11 17:36:46 +05:30
parent 66a482c968
commit 077da83d5d
4 changed files with 34 additions and 31 deletions

View file

@ -40,10 +40,6 @@
#include <pc_archive.h>
#include "filters/analyzer/analyzer.h"
#define FIFTY_PCT(x) (((x)/10) * 5)
#define FORTY_PCT(x) (((x)/10) * 4)
#define ONE_PCT(x) ((x)/100)
static unsigned int lzma_count = 0;
static unsigned int bzip2_count = 0;
static unsigned int bsc_count = 0;
@ -246,9 +242,12 @@ int
is_bsc_type(int btype)
{
int stype = PC_SUBTYPE(btype);
return ((stype == TYPE_MARKUP) | (stype == TYPE_BMP) | (stype == TYPE_DNA_SEQ) |
int mtype = PC_TYPE(btype);
return ((stype == TYPE_BMP) | (stype == TYPE_DNA_SEQ) |
(stype == TYPE_MP4) | (stype == TYPE_FLAC) | (stype == TYPE_AVI) |
(stype == TYPE_DICOM) | (stype == TYPE_MEDIA_BSC));
(stype == TYPE_DICOM) | (stype == TYPE_MEDIA_BSC) |
(mtype == TYPE_TEXT && stype != TYPE_MARKUP));
}
int
@ -260,7 +259,8 @@ adapt_compress(void *src, uint64_t srclen, void *dst,
int stype = PC_SUBTYPE(btype);
analyzer_ctx_t actx;
if (btype == TYPE_UNKNOWN || stype == TYPE_ARCHIVE_TAR || stype == TYPE_PDF) {
if (btype == TYPE_UNKNOWN || PC_TYPE(btype) == TYPE_TEXT ||
stype == TYPE_ARCHIVE_TAR || stype == TYPE_PDF) {
if (adat->actx == NULL) {
analyze_buffer(src, srclen, &actx);
adat->actx = &actx;
@ -271,8 +271,6 @@ adapt_compress(void *src, uint64_t srclen, void *dst,
} else if (adat->adapt_mode == 1) {
btype = adat->actx->fifty_pct.btype;
}
if (stype == TYPE_PDF)
btype |= TYPE_MARKUP;
}
/* Reset analyzer context for subsequent calls. */

View file

@ -27,13 +27,13 @@
#define FIFTY_PCT(x) (((x)/10) * 5)
#define FORTY_PCT(x) (((x)/10) * 4)
#define ONE_PCT(x) ((x)/100)
#define TEN_PCT(x) ((x)/10)
void
analyze_buffer(void *src, uint64_t srclen, analyzer_ctx_t *actx)
{
uchar_t *src1 = (uchar_t *)src;
uint64_t i, tot8b, tot_8b, lbytes;
uint64_t i, tot8b, tot_8b, lbytes, spc;
uchar_t cur_byte, prev_byte;
uint64_t tag1, tag2, tag3;
double tagcnt, pct_tag;
@ -47,11 +47,13 @@ analyze_buffer(void *src, uint64_t srclen, analyzer_ctx_t *actx)
tag2 = 0;
tag3 = 0;
lbytes = 0;
spc = 0;
prev_byte = cur_byte = 0;
for (i = 0; i < srclen; i++) {
cur_byte = src1[i];
tot8b += (cur_byte & 0x80); // This way for possible auto-vectorization
lbytes += (cur_byte < 32);
spc += (cur_byte == ' ');
tag1 += (cur_byte == '<');
tag2 += (cur_byte == '>');
tag3 += ((prev_byte == '<') & (cur_byte == '/'));
@ -65,7 +67,7 @@ analyze_buffer(void *src, uint64_t srclen, analyzer_ctx_t *actx)
* significance levels.
*/
tot_8b = tot8b / 0x80 + lbytes;
tagcnt = tag1 + tag2 + tag3;
tagcnt = tag1 + tag2;
pct_tag = tagcnt / (double)srclen;
if (tot_8b > FORTY_PCT(srclen)) {
actx->forty_pct.btype = TYPE_BINARY;
@ -80,13 +82,13 @@ analyze_buffer(void *src, uint64_t srclen, analyzer_ctx_t *actx)
}
tot8b /= 0x80;
if (tot8b == 0 && lbytes < ((srclen>>1) + (srclen>>2) + (srclen>>3))) {
if (tot8b <= TEN_PCT((double)srclen) && lbytes < ((srclen>>1) + (srclen>>2) + (srclen>>3))) {
actx->one_pct.btype = TYPE_TEXT;
}
markup = 0;
if (tag1 > tag2 - 4 && tag1 < tag2 + 4 && tag3 > (double)tag1 * 0.40 &&
tagcnt > (double)srclen * 0.001)
tagcnt > (double)spc * 0.1)
markup = 1;
if (markup) {
@ -120,7 +122,7 @@ analyze_buffer_simple(void *src, uint64_t srclen)
* Heuristics for detecting BINARY vs generic TEXT
*/
tot8b /= 0x80;
if (tot8b == 0 && lbytes < ((srclen>>1) + (srclen>>2) + (srclen>>3))) {
if (tot8b <= TEN_PCT((double)srclen) && lbytes < ((srclen>>1) + (srclen>>2) + (srclen>>3))) {
btype = TYPE_TEXT;
}
return (btype);

View file

@ -324,7 +324,6 @@ DictFilter::DictFilter()
SEPARATOR['\r'] = 1;
SEPARATOR['>'] = 1;
SEPARATOR[']'] = 1;
SEPARATOR['\''] = 1;
SEPARATOR[')'] = 1;
SEPARATOR['.'] = 1;
SEPARATOR['?'] = 1;
@ -847,13 +846,12 @@ DictFilter::Forward_Dict(uint8_t *src, uint32_t size, uint8_t *dst, uint32_t *ds
* Encode literal numeric strings.
*/
converted = 0;
if (word[0] != '+' && word[0] != '-' && word[0] != '0' &&
toklen > 4 && toklen < 10) {
if (isdigit(word[0]) && word[0] != '0' && toklen > 4 && toklen < 10) {
copy_bytes(num, word, toklen);
num[toklen] = '\0';
val = strtoul((const char *)num, (char **)&word, 10);
if (*word == '\0') {
if (*word == '\0' && word - num == toklen && val > 0) {
uint8_t tok_hdr[10], *dnum;
sz = sizeof (tok_hdr);
dnum = to_base_enc(val, tok_hdr, sz);
@ -914,7 +912,7 @@ DictFilter::Inverse_Dict(uint8_t *src, uint32_t srclen, uint8_t *dst, uint32_t *
decode_dict_entry_t *w_dict;
end = src + srclen;
srcpos = (uint8_t *)strchr((const char *)src, ' ');
srcpos = (uint8_t *)memchr((const void *)src, ' ', WORD_MAX);
if (srcpos - src > 12) {
return (0);
}
@ -924,7 +922,11 @@ DictFilter::Inverse_Dict(uint8_t *src, uint32_t srclen, uint8_t *dst, uint32_t *
w_dict = new decode_dict_entry_t[numWords];
for (i = 0; i < numWords && srcpos < end; i++) {
uint8_t *w_src = srcpos;
srcpos = (uint8_t *)strchr((const char *)srcpos, ' ');
size_t limit;
limit = end - srcpos;
if (limit > WORD_MAX+1) limit = WORD_MAX+1;
srcpos = (uint8_t *)memchr((const void *)srcpos, ' ', limit);
if (srcpos - w_src > WORD_MAX)
return (0);
@ -1089,6 +1091,7 @@ dict_decode(uint8_t *from, uint64_t fromlen, uint8_t *to, uint64_t *dstlen)
log_msg(LOG_ERR, 0, "dict_decode: Failed.\n");
return (-1);
}
if (dl < *dstlen) {
log_msg(LOG_ERR, 0, "dict_decode: Expected: %" PRIu64 ", Got: %" PRIu64 "\n",
*dstlen, dl);

View file

@ -211,7 +211,7 @@ preproc_compress(pc_ctx_t *pctx, compress_func_ptr cmp_func, void *src, uint64_t
int result;
uint64_t _dstlen, fromlen;
uchar_t *from, *to;
int stype, dict, analyzed;
int stype, analyzed;
analyzer_ctx_t actx;
DEBUG_STAT_EN(double strt, en);
@ -221,9 +221,9 @@ preproc_compress(pc_ctx_t *pctx, compress_func_ptr cmp_func, void *src, uint64_t
fromlen = srclen;
result = 0;
stype = PC_SUBTYPE(btype);
dict = 0;
analyzed = 0;
if (btype == TYPE_UNKNOWN || stype == TYPE_ARCHIVE_TAR || stype == TYPE_PDF || interesting) {
if (btype == TYPE_UNKNOWN || stype == TYPE_ARCHIVE_TAR || stype == TYPE_PDF ||
PC_TYPE(btype) == TYPE_TEXT || interesting) {
analyze_buffer(src, srclen, &actx);
analyzed = 1;
if (pctx->adapt_mode)
@ -258,12 +258,13 @@ preproc_compress(pc_ctx_t *pctx, compress_func_ptr cmp_func, void *src, uint64_t
if (pctx->lzp_preprocess) {
int b_type;
b_type = btype;
if (analyzed)
b_type = PC_TYPE(actx.forty_pct.btype);
b_type = PC_TYPE(actx.one_pct.btype);
else
b_type = PC_TYPE(analyze_buffer_simple(from, fromlen));
b_type = analyze_buffer_simple(from, fromlen);
if (b_type == TYPE_TEXT) {
if (PC_TYPE(b_type) == TYPE_TEXT) {
_dstlen = fromlen;
result = dict_encode(from, fromlen, to, &_dstlen);
if (result != -1) {
@ -273,7 +274,6 @@ preproc_compress(pc_ctx_t *pctx, compress_func_ptr cmp_func, void *src, uint64_t
to = tmp;
fromlen = _dstlen;
type |= PREPROC_TYPE_DICT;
dict = result;
}
}
}
@ -1354,7 +1354,7 @@ start_decompress(pc_ctx_t *pctx, const char *filename, char *to_filename)
log_msg(LOG_ERR, 1, "Can't seek in metadata fd: ");
UNCOMP_BAIL;
}
/*
* Finally create the metadata context.
*/
@ -1364,7 +1364,7 @@ start_decompress(pc_ctx_t *pctx, const char *filename, char *to_filename)
UNCOMP_BAIL;
}
}
uncompfd = -1;
if (setup_extractor(pctx) == -1) {
log_msg(LOG_ERR, 0, "Setup of extraction context failed.");
@ -1400,7 +1400,7 @@ start_decompress(pc_ctx_t *pctx, const char *filename, char *to_filename)
if (pctx->archive_mode) {
nprocs = nprocs > 1 ? nprocs-1:nprocs;
}
if (pctx->nthreads > 0 && pctx->nthreads < nprocs)
nprocs = pctx->nthreads;
else