A bunch of small fixes in Dict.
Improve text analysis for markup tags. Use Libbsc for plain text and PPMd for markup-mixed text. Change thresholds.
This commit is contained in:
parent
66a482c968
commit
077da83d5d
4 changed files with 34 additions and 31 deletions
|
@ -40,10 +40,6 @@
|
|||
#include <pc_archive.h>
|
||||
#include "filters/analyzer/analyzer.h"
|
||||
|
||||
#define FIFTY_PCT(x) (((x)/10) * 5)
|
||||
#define FORTY_PCT(x) (((x)/10) * 4)
|
||||
#define ONE_PCT(x) ((x)/100)
|
||||
|
||||
static unsigned int lzma_count = 0;
|
||||
static unsigned int bzip2_count = 0;
|
||||
static unsigned int bsc_count = 0;
|
||||
|
@ -246,9 +242,12 @@ int
|
|||
is_bsc_type(int btype)
|
||||
{
|
||||
int stype = PC_SUBTYPE(btype);
|
||||
return ((stype == TYPE_MARKUP) | (stype == TYPE_BMP) | (stype == TYPE_DNA_SEQ) |
|
||||
int mtype = PC_TYPE(btype);
|
||||
|
||||
return ((stype == TYPE_BMP) | (stype == TYPE_DNA_SEQ) |
|
||||
(stype == TYPE_MP4) | (stype == TYPE_FLAC) | (stype == TYPE_AVI) |
|
||||
(stype == TYPE_DICOM) | (stype == TYPE_MEDIA_BSC));
|
||||
(stype == TYPE_DICOM) | (stype == TYPE_MEDIA_BSC) |
|
||||
(mtype == TYPE_TEXT && stype != TYPE_MARKUP));
|
||||
}
|
||||
|
||||
int
|
||||
|
@ -260,7 +259,8 @@ adapt_compress(void *src, uint64_t srclen, void *dst,
|
|||
int stype = PC_SUBTYPE(btype);
|
||||
analyzer_ctx_t actx;
|
||||
|
||||
if (btype == TYPE_UNKNOWN || stype == TYPE_ARCHIVE_TAR || stype == TYPE_PDF) {
|
||||
if (btype == TYPE_UNKNOWN || PC_TYPE(btype) == TYPE_TEXT ||
|
||||
stype == TYPE_ARCHIVE_TAR || stype == TYPE_PDF) {
|
||||
if (adat->actx == NULL) {
|
||||
analyze_buffer(src, srclen, &actx);
|
||||
adat->actx = &actx;
|
||||
|
@ -271,8 +271,6 @@ adapt_compress(void *src, uint64_t srclen, void *dst,
|
|||
} else if (adat->adapt_mode == 1) {
|
||||
btype = adat->actx->fifty_pct.btype;
|
||||
}
|
||||
if (stype == TYPE_PDF)
|
||||
btype |= TYPE_MARKUP;
|
||||
}
|
||||
|
||||
/* Reset analyzer context for subsequent calls. */
|
||||
|
|
|
@ -27,13 +27,13 @@
|
|||
|
||||
#define FIFTY_PCT(x) (((x)/10) * 5)
|
||||
#define FORTY_PCT(x) (((x)/10) * 4)
|
||||
#define ONE_PCT(x) ((x)/100)
|
||||
#define TEN_PCT(x) ((x)/10)
|
||||
|
||||
void
|
||||
analyze_buffer(void *src, uint64_t srclen, analyzer_ctx_t *actx)
|
||||
{
|
||||
uchar_t *src1 = (uchar_t *)src;
|
||||
uint64_t i, tot8b, tot_8b, lbytes;
|
||||
uint64_t i, tot8b, tot_8b, lbytes, spc;
|
||||
uchar_t cur_byte, prev_byte;
|
||||
uint64_t tag1, tag2, tag3;
|
||||
double tagcnt, pct_tag;
|
||||
|
@ -47,11 +47,13 @@ analyze_buffer(void *src, uint64_t srclen, analyzer_ctx_t *actx)
|
|||
tag2 = 0;
|
||||
tag3 = 0;
|
||||
lbytes = 0;
|
||||
spc = 0;
|
||||
prev_byte = cur_byte = 0;
|
||||
for (i = 0; i < srclen; i++) {
|
||||
cur_byte = src1[i];
|
||||
tot8b += (cur_byte & 0x80); // This way for possible auto-vectorization
|
||||
lbytes += (cur_byte < 32);
|
||||
spc += (cur_byte == ' ');
|
||||
tag1 += (cur_byte == '<');
|
||||
tag2 += (cur_byte == '>');
|
||||
tag3 += ((prev_byte == '<') & (cur_byte == '/'));
|
||||
|
@ -65,7 +67,7 @@ analyze_buffer(void *src, uint64_t srclen, analyzer_ctx_t *actx)
|
|||
* significance levels.
|
||||
*/
|
||||
tot_8b = tot8b / 0x80 + lbytes;
|
||||
tagcnt = tag1 + tag2 + tag3;
|
||||
tagcnt = tag1 + tag2;
|
||||
pct_tag = tagcnt / (double)srclen;
|
||||
if (tot_8b > FORTY_PCT(srclen)) {
|
||||
actx->forty_pct.btype = TYPE_BINARY;
|
||||
|
@ -80,13 +82,13 @@ analyze_buffer(void *src, uint64_t srclen, analyzer_ctx_t *actx)
|
|||
}
|
||||
|
||||
tot8b /= 0x80;
|
||||
if (tot8b == 0 && lbytes < ((srclen>>1) + (srclen>>2) + (srclen>>3))) {
|
||||
if (tot8b <= TEN_PCT((double)srclen) && lbytes < ((srclen>>1) + (srclen>>2) + (srclen>>3))) {
|
||||
actx->one_pct.btype = TYPE_TEXT;
|
||||
}
|
||||
|
||||
markup = 0;
|
||||
if (tag1 > tag2 - 4 && tag1 < tag2 + 4 && tag3 > (double)tag1 * 0.40 &&
|
||||
tagcnt > (double)srclen * 0.001)
|
||||
tagcnt > (double)spc * 0.1)
|
||||
markup = 1;
|
||||
|
||||
if (markup) {
|
||||
|
@ -120,7 +122,7 @@ analyze_buffer_simple(void *src, uint64_t srclen)
|
|||
* Heuristics for detecting BINARY vs generic TEXT
|
||||
*/
|
||||
tot8b /= 0x80;
|
||||
if (tot8b == 0 && lbytes < ((srclen>>1) + (srclen>>2) + (srclen>>3))) {
|
||||
if (tot8b <= TEN_PCT((double)srclen) && lbytes < ((srclen>>1) + (srclen>>2) + (srclen>>3))) {
|
||||
btype = TYPE_TEXT;
|
||||
}
|
||||
return (btype);
|
||||
|
|
|
@ -324,7 +324,6 @@ DictFilter::DictFilter()
|
|||
SEPARATOR['\r'] = 1;
|
||||
SEPARATOR['>'] = 1;
|
||||
SEPARATOR[']'] = 1;
|
||||
SEPARATOR['\''] = 1;
|
||||
SEPARATOR[')'] = 1;
|
||||
SEPARATOR['.'] = 1;
|
||||
SEPARATOR['?'] = 1;
|
||||
|
@ -847,13 +846,12 @@ DictFilter::Forward_Dict(uint8_t *src, uint32_t size, uint8_t *dst, uint32_t *ds
|
|||
* Encode literal numeric strings.
|
||||
*/
|
||||
converted = 0;
|
||||
if (word[0] != '+' && word[0] != '-' && word[0] != '0' &&
|
||||
toklen > 4 && toklen < 10) {
|
||||
if (isdigit(word[0]) && word[0] != '0' && toklen > 4 && toklen < 10) {
|
||||
copy_bytes(num, word, toklen);
|
||||
num[toklen] = '\0';
|
||||
val = strtoul((const char *)num, (char **)&word, 10);
|
||||
|
||||
if (*word == '\0') {
|
||||
if (*word == '\0' && word - num == toklen && val > 0) {
|
||||
uint8_t tok_hdr[10], *dnum;
|
||||
sz = sizeof (tok_hdr);
|
||||
dnum = to_base_enc(val, tok_hdr, sz);
|
||||
|
@ -914,7 +912,7 @@ DictFilter::Inverse_Dict(uint8_t *src, uint32_t srclen, uint8_t *dst, uint32_t *
|
|||
decode_dict_entry_t *w_dict;
|
||||
|
||||
end = src + srclen;
|
||||
srcpos = (uint8_t *)strchr((const char *)src, ' ');
|
||||
srcpos = (uint8_t *)memchr((const void *)src, ' ', WORD_MAX);
|
||||
if (srcpos - src > 12) {
|
||||
return (0);
|
||||
}
|
||||
|
@ -924,7 +922,11 @@ DictFilter::Inverse_Dict(uint8_t *src, uint32_t srclen, uint8_t *dst, uint32_t *
|
|||
w_dict = new decode_dict_entry_t[numWords];
|
||||
for (i = 0; i < numWords && srcpos < end; i++) {
|
||||
uint8_t *w_src = srcpos;
|
||||
srcpos = (uint8_t *)strchr((const char *)srcpos, ' ');
|
||||
size_t limit;
|
||||
|
||||
limit = end - srcpos;
|
||||
if (limit > WORD_MAX+1) limit = WORD_MAX+1;
|
||||
srcpos = (uint8_t *)memchr((const void *)srcpos, ' ', limit);
|
||||
if (srcpos - w_src > WORD_MAX)
|
||||
return (0);
|
||||
|
||||
|
@ -1089,6 +1091,7 @@ dict_decode(uint8_t *from, uint64_t fromlen, uint8_t *to, uint64_t *dstlen)
|
|||
log_msg(LOG_ERR, 0, "dict_decode: Failed.\n");
|
||||
return (-1);
|
||||
}
|
||||
|
||||
if (dl < *dstlen) {
|
||||
log_msg(LOG_ERR, 0, "dict_decode: Expected: %" PRIu64 ", Got: %" PRIu64 "\n",
|
||||
*dstlen, dl);
|
||||
|
|
20
pcompress.c
20
pcompress.c
|
@ -211,7 +211,7 @@ preproc_compress(pc_ctx_t *pctx, compress_func_ptr cmp_func, void *src, uint64_t
|
|||
int result;
|
||||
uint64_t _dstlen, fromlen;
|
||||
uchar_t *from, *to;
|
||||
int stype, dict, analyzed;
|
||||
int stype, analyzed;
|
||||
analyzer_ctx_t actx;
|
||||
DEBUG_STAT_EN(double strt, en);
|
||||
|
||||
|
@ -221,9 +221,9 @@ preproc_compress(pc_ctx_t *pctx, compress_func_ptr cmp_func, void *src, uint64_t
|
|||
fromlen = srclen;
|
||||
result = 0;
|
||||
stype = PC_SUBTYPE(btype);
|
||||
dict = 0;
|
||||
analyzed = 0;
|
||||
if (btype == TYPE_UNKNOWN || stype == TYPE_ARCHIVE_TAR || stype == TYPE_PDF || interesting) {
|
||||
if (btype == TYPE_UNKNOWN || stype == TYPE_ARCHIVE_TAR || stype == TYPE_PDF ||
|
||||
PC_TYPE(btype) == TYPE_TEXT || interesting) {
|
||||
analyze_buffer(src, srclen, &actx);
|
||||
analyzed = 1;
|
||||
if (pctx->adapt_mode)
|
||||
|
@ -258,12 +258,13 @@ preproc_compress(pc_ctx_t *pctx, compress_func_ptr cmp_func, void *src, uint64_t
|
|||
if (pctx->lzp_preprocess) {
|
||||
int b_type;
|
||||
|
||||
b_type = btype;
|
||||
if (analyzed)
|
||||
b_type = PC_TYPE(actx.forty_pct.btype);
|
||||
b_type = PC_TYPE(actx.one_pct.btype);
|
||||
else
|
||||
b_type = PC_TYPE(analyze_buffer_simple(from, fromlen));
|
||||
b_type = analyze_buffer_simple(from, fromlen);
|
||||
|
||||
if (b_type == TYPE_TEXT) {
|
||||
if (PC_TYPE(b_type) == TYPE_TEXT) {
|
||||
_dstlen = fromlen;
|
||||
result = dict_encode(from, fromlen, to, &_dstlen);
|
||||
if (result != -1) {
|
||||
|
@ -273,7 +274,6 @@ preproc_compress(pc_ctx_t *pctx, compress_func_ptr cmp_func, void *src, uint64_t
|
|||
to = tmp;
|
||||
fromlen = _dstlen;
|
||||
type |= PREPROC_TYPE_DICT;
|
||||
dict = result;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -1354,7 +1354,7 @@ start_decompress(pc_ctx_t *pctx, const char *filename, char *to_filename)
|
|||
log_msg(LOG_ERR, 1, "Can't seek in metadata fd: ");
|
||||
UNCOMP_BAIL;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Finally create the metadata context.
|
||||
*/
|
||||
|
@ -1364,7 +1364,7 @@ start_decompress(pc_ctx_t *pctx, const char *filename, char *to_filename)
|
|||
UNCOMP_BAIL;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
uncompfd = -1;
|
||||
if (setup_extractor(pctx) == -1) {
|
||||
log_msg(LOG_ERR, 0, "Setup of extraction context failed.");
|
||||
|
@ -1400,7 +1400,7 @@ start_decompress(pc_ctx_t *pctx, const char *filename, char *to_filename)
|
|||
if (pctx->archive_mode) {
|
||||
nprocs = nprocs > 1 ? nprocs-1:nprocs;
|
||||
}
|
||||
|
||||
|
||||
if (pctx->nthreads > 0 && pctx->nthreads < nprocs)
|
||||
nprocs = pctx->nthreads;
|
||||
else
|
||||
|
|
Loading…
Reference in a new issue