A bunch of small fixes in the Dict filter.

Improve text analysis for markup tags. Use Libbsc for plain text and PPMd for markup-mixed text. Change analyzer thresholds.
2015-01-11 17:36:46 +05:30 · 2015-01-11 17:36:46 +05:30 · 077da83d5d
commit 077da83d5d
parent 66a482c968
4 changed files with 34 additions and 31 deletions
--- a/adaptive_compress.c
+++ b/adaptive_compress.c
@ -40,10 +40,6 @@
 #include <pc_archive.h>
 #include "filters/analyzer/analyzer.h"
 #define	FIFTY_PCT(x)	(((x)/10) * 5)
 #define	FORTY_PCT(x)	(((x)/10) * 4)
 #define	ONE_PCT(x)	((x)/100)
 static unsigned int lzma_count = 0;
 static unsigned int bzip2_count = 0;
 static unsigned int bsc_count = 0;
@ -246,9 +242,12 @@ int
 is_bsc_type(int btype)
 {
 	int stype = PC_SUBTYPE(btype);
-	return ((stype == TYPE_MARKUP) | (stype == TYPE_BMP) | (stype == TYPE_DNA_SEQ) |
+	int mtype = PC_TYPE(btype);
 	return ((stype == TYPE_BMP) | (stype == TYPE_DNA_SEQ) |
 	    (stype == TYPE_MP4) | (stype == TYPE_FLAC) | (stype == TYPE_AVI) |
-	    (stype == TYPE_DICOM) | (stype == TYPE_MEDIA_BSC));
+	    (stype == TYPE_DICOM) | (stype == TYPE_MEDIA_BSC) |
 	    (mtype == TYPE_TEXT && stype != TYPE_MARKUP));
 }
 int
@ -260,7 +259,8 @@ adapt_compress(void *src, uint64_t srclen, void *dst,
 	int stype = PC_SUBTYPE(btype);
 	analyzer_ctx_t actx;
-	if (btype == TYPE_UNKNOWN || stype == TYPE_ARCHIVE_TAR || stype == TYPE_PDF) {
+	if (btype == TYPE_UNKNOWN || PC_TYPE(btype) == TYPE_TEXT ||
 	    stype == TYPE_ARCHIVE_TAR || stype == TYPE_PDF) {
 		if (adat->actx == NULL) {
 			analyze_buffer(src, srclen, &actx);
 			adat->actx = &actx;
@ -271,8 +271,6 @@ adapt_compress(void *src, uint64_t srclen, void *dst,
 		} else if (adat->adapt_mode == 1) {
 			btype = adat->actx->fifty_pct.btype;
 		}
 		if (stype == TYPE_PDF)
 			btype |= TYPE_MARKUP;
 	}
 	/* Reset analyzer context for subsequent calls. */
--- a/filters/analyzer/analyzer.c
+++ b/filters/analyzer/analyzer.c
@ -27,13 +27,13 @@
 #define	FIFTY_PCT(x)	(((x)/10) * 5)
 #define	FORTY_PCT(x)	(((x)/10) * 4)
-#define	ONE_PCT(x)	((x)/100)
+#define	TEN_PCT(x)	((x)/10)
 void
 analyze_buffer(void *src, uint64_t srclen, analyzer_ctx_t *actx)
 {
 	uchar_t *src1 = (uchar_t *)src;
-	uint64_t i, tot8b, tot_8b, lbytes;
+	uint64_t i, tot8b, tot_8b, lbytes, spc;
 	uchar_t cur_byte, prev_byte;
 	uint64_t tag1, tag2, tag3;
 	double tagcnt, pct_tag;
@ -47,11 +47,13 @@ analyze_buffer(void *src, uint64_t srclen, analyzer_ctx_t *actx)
 	tag2 = 0;
 	tag3 = 0;
 	lbytes = 0;
 	spc = 0;
 	prev_byte = cur_byte = 0;
 	for (i = 0; i < srclen; i++) {
 		cur_byte = src1[i];
 		tot8b += (cur_byte & 0x80); // This way for possible auto-vectorization
 		lbytes += (cur_byte < 32);
 		spc += (cur_byte == ' ');
 		tag1 += (cur_byte == '<');
 		tag2 += (cur_byte == '>');
 		tag3 += ((prev_byte == '<') & (cur_byte == '/'));
@ -65,7 +67,7 @@ analyze_buffer(void *src, uint64_t srclen, analyzer_ctx_t *actx)
 	 * significance levels.
 	 */
 	tot_8b = tot8b / 0x80 + lbytes;
-	tagcnt = tag1 + tag2 + tag3;
+	tagcnt = tag1 + tag2;
 	pct_tag = tagcnt / (double)srclen;
 	if (tot_8b > FORTY_PCT(srclen)) {
 		actx->forty_pct.btype = TYPE_BINARY;
@ -80,13 +82,13 @@ analyze_buffer(void *src, uint64_t srclen, analyzer_ctx_t *actx)
 	}
 	tot8b /= 0x80;
-	if (tot8b == 0 && lbytes < ((srclen>>1) + (srclen>>2) + (srclen>>3))) {
+	if (tot8b <= TEN_PCT((double)srclen) && lbytes < ((srclen>>1) + (srclen>>2) + (srclen>>3))) {
 		actx->one_pct.btype = TYPE_TEXT;
 	}
 	markup = 0;
 	if (tag1 > tag2 - 4 && tag1 < tag2 + 4 && tag3 > (double)tag1 * 0.40 &&
-	    tagcnt > (double)srclen * 0.001)
+	    tagcnt > (double)spc * 0.1)
 		markup = 1;
 	if (markup) {
@ -120,7 +122,7 @@ analyze_buffer_simple(void *src, uint64_t srclen)
 	 * Heuristics for detecting BINARY vs generic TEXT
 	 */
 	tot8b /= 0x80;
-	if (tot8b == 0 && lbytes < ((srclen>>1) + (srclen>>2) + (srclen>>3))) {
+	if (tot8b <= TEN_PCT((double)srclen) && lbytes < ((srclen>>1) + (srclen>>2) + (srclen>>3))) {
 		btype = TYPE_TEXT;
 	}
 	return (btype);
--- a/filters/dict/DictFilter.cpp
+++ b/filters/dict/DictFilter.cpp
@ -324,7 +324,6 @@ DictFilter::DictFilter()
 	SEPARATOR['\r'] = 1;
 	SEPARATOR['>'] = 1;
 	SEPARATOR[']'] = 1;
 	SEPARATOR['\''] = 1;
 	SEPARATOR[')'] = 1;
 	SEPARATOR['.'] = 1;
 	SEPARATOR['?'] = 1;
@ -847,13 +846,12 @@ DictFilter::Forward_Dict(uint8_t *src, uint32_t size, uint8_t *dst, uint32_t *ds
 				 * Encode literal numeric strings.
 				 */
 				converted = 0;
-				if (word[0] != '+' && word[0] != '-' && word[0] != '0' &&
+				if (isdigit(word[0]) && word[0] != '0' && toklen > 4 && toklen < 10) {
 				    toklen > 4 && toklen < 10) {
 					copy_bytes(num, word, toklen);
 					num[toklen] = '\0';
 					val = strtoul((const char *)num, (char **)&word, 10);
-					if (*word == '\0') {
+					if (*word == '\0' && word - num == toklen && val > 0) {
 						uint8_t tok_hdr[10], *dnum;
 						sz = sizeof (tok_hdr);
 						dnum = to_base_enc(val, tok_hdr, sz);
@ -914,7 +912,7 @@ DictFilter::Inverse_Dict(uint8_t *src, uint32_t srclen, uint8_t *dst, uint32_t *
 	decode_dict_entry_t *w_dict;
 	end = src + srclen;
-	srcpos = (uint8_t *)strchr((const char *)src, ' ');
+	srcpos = (uint8_t *)memchr((const void *)src, ' ', WORD_MAX);
 	if (srcpos - src > 12) {
 		return (0);
 	}
@ -924,7 +922,11 @@ DictFilter::Inverse_Dict(uint8_t *src, uint32_t srclen, uint8_t *dst, uint32_t *
 	w_dict = new decode_dict_entry_t[numWords];
 	for (i = 0; i < numWords && srcpos < end; i++) {
 		uint8_t *w_src = srcpos;
-		srcpos = (uint8_t *)strchr((const char *)srcpos, ' ');
+		size_t limit;
 		limit = end - srcpos;
 		if (limit > WORD_MAX+1) limit = WORD_MAX+1;
 		srcpos = (uint8_t *)memchr((const void *)srcpos, ' ', limit);
 		if (srcpos - w_src > WORD_MAX)
 			return (0);
@ -1089,6 +1091,7 @@ dict_decode(uint8_t *from, uint64_t fromlen, uint8_t *to, uint64_t *dstlen)
 		log_msg(LOG_ERR, 0, "dict_decode: Failed.\n");
 		return (-1);
 	}
 	if (dl < *dstlen) {
 		log_msg(LOG_ERR, 0, "dict_decode: Expected: %" PRIu64 ", Got: %" PRIu64 "\n",
 		    *dstlen, dl);
--- a/pcompress.c
+++ b/pcompress.c
@ -211,7 +211,7 @@ preproc_compress(pc_ctx_t *pctx, compress_func_ptr cmp_func, void *src, uint64_t
 	int result;
 	uint64_t _dstlen, fromlen;
 	uchar_t *from, *to;
-	int stype, dict, analyzed;
+	int stype, analyzed;
 	analyzer_ctx_t actx;
 	DEBUG_STAT_EN(double strt, en);
@ -221,9 +221,9 @@ preproc_compress(pc_ctx_t *pctx, compress_func_ptr cmp_func, void *src, uint64_t
 	fromlen = srclen;
 	result = 0;
 	stype = PC_SUBTYPE(btype);
 	dict = 0;
 	analyzed = 0;
-	if (btype == TYPE_UNKNOWN || stype == TYPE_ARCHIVE_TAR || stype == TYPE_PDF || interesting) {
+	if (btype == TYPE_UNKNOWN || stype == TYPE_ARCHIVE_TAR || stype == TYPE_PDF ||
 	    PC_TYPE(btype) == TYPE_TEXT || interesting) {
 		analyze_buffer(src, srclen, &actx);
 		analyzed = 1;
 		if (pctx->adapt_mode)
@ -258,12 +258,13 @@ preproc_compress(pc_ctx_t *pctx, compress_func_ptr cmp_func, void *src, uint64_t
 	if (pctx->lzp_preprocess) {
 		int b_type;
 		b_type = btype;
 		if (analyzed)
-			b_type = PC_TYPE(actx.forty_pct.btype);
+			b_type = PC_TYPE(actx.one_pct.btype);
 		else
-			b_type = PC_TYPE(analyze_buffer_simple(from, fromlen));
+			b_type = analyze_buffer_simple(from, fromlen);
-		if (b_type == TYPE_TEXT) {
+		if (PC_TYPE(b_type) == TYPE_TEXT) {
 			_dstlen = fromlen;
 			result = dict_encode(from, fromlen, to, &_dstlen);
 			if (result != -1) {
@ -273,7 +274,6 @@ preproc_compress(pc_ctx_t *pctx, compress_func_ptr cmp_func, void *src, uint64_t
 				to = tmp;
 				fromlen = _dstlen;
 				type |= PREPROC_TYPE_DICT;
 				dict = result;
 			}
 		}
 	}