Centralise the data analysis routine so that one analysis pass per buffer can be reused, improving performance.

Use the buffer analysis results to decide which preprocessing filters to apply.
2014-11-06 22:23:33 +05:30 · 2014-11-06 22:23:33 +05:30 · 507e7c75d3
commit 507e7c75d3
parent 848010fbb5
6 changed files with 195 additions and 125 deletions
--- a/adaptive_compress.c
+++ b/adaptive_compress.c
@ -38,6 +38,7 @@
 #include <pcompress.h>
 #include <allocator.h>
 #include <pc_archive.h>
+#include "filters/analyzer/analyzer.h"

 #define	FIFTY_PCT(x)	(((x)/10) * 5)
 #define	FORTY_PCT(x)	(((x)/10) * 4)
@ -97,8 +98,16 @@ struct adapt_data {
 	void *bsc_data;
 	void *lz4_data;
 	int adapt_mode;
+	analyzer_ctx_t *actx;
 };

+/*
+ * Attach a caller-supplied analyzer context to the adaptive compressor state.
+ *
+ * data - opaque pointer that is treated as a struct adapt_data.
+ * actx - pointer stored in the actx member; the context is not copied.
+ */
+void
+adapt_set_analyzer_ctx(void *data, analyzer_ctx_t *actx)
+{
+	((struct adapt_data *)data)->actx = actx;
+}
+
 void
 adapt_stats(int show)
 {
@ -246,75 +255,27 @@ adapt_compress(void *src, uint64_t srclen, void *dst,
 	uint64_t *dstlen, int level, uchar_t chdr, int btype, void *data)
 {
 	struct adapt_data *adat = (struct adapt_data *)(data);
-	uchar_t *src1 = (uchar_t *)src;
 	int rv = 0, bsc_type = 0;
 	int stype = PC_SUBTYPE(btype);
+	analyzer_ctx_t actx;

-	if (btype == TYPE_UNKNOWN || stype == TYPE_ARCHIVE_TAR) {
-		uint64_t i, tot8b, tag1, tag2, tag3, lbytes;
-		double tagcnt, pct_tag;
-		uchar_t cur_byte, prev_byte;
-		/*
-		 * Count number of 8-bit binary bytes and XML tags in source.
-		 */
-		tot8b = 0;
-		tag1 = 0;
-		tag2 = 0;
-		tag3 = 0;
-		lbytes = 0;
-		prev_byte = cur_byte = 0;
-		for (i = 0; i < srclen; i++) {
-			cur_byte = src1[i];
-			tot8b += (cur_byte & 0x80); // This way for possible auto-vectorization
-			lbytes += (cur_byte < 32);
-			tag1 += (cur_byte == '<');
-			tag2 += (cur_byte == '>');
-			tag3 += ((prev_byte == '<') & (cur_byte == '/'));
-			tag3 += ((prev_byte == '/') & (cur_byte == '>'));
-			if (cur_byte != ' ')
-				prev_byte = cur_byte;
+	if (btype == TYPE_UNKNOWN || stype == TYPE_ARCHIVE_TAR || stype == TYPE_PDF) {
+		if (adat->actx == NULL) {
+			analyze_buffer(src, srclen, &actx);
+			adat->actx = &actx;
 		}
+		if (adat->adapt_mode == 2) {
+			btype = adat->actx->forty_pct.btype;

-		/*
-		 * Heuristics for detecting BINARY vs generic TEXT vs XML data.
-		 */
-		tot8b = tot8b / 0x80 + lbytes;
-		tagcnt = tag1 + tag2 + tag3;
-		pct_tag = tagcnt / (double)srclen;
-		if (adat->adapt_mode == 2 && tot8b > FORTY_PCT(srclen)) {
-			btype = TYPE_BINARY;
-		} else if (adat->adapt_mode == 1 && tot8b > FIFTY_PCT(srclen)) {
-			btype = TYPE_BINARY;
-		} else {
-			btype = TYPE_TEXT;
-			if (tag1 > tag2 - 4 && tag1 < tag2 + 4 && tag3 > (double)tag1 * 0.40 &&
-			    tagcnt > (double)srclen * 0.001)
+		} else if (adat->adapt_mode == 1) {
+			btype = adat->actx->fifty_pct.btype;
+		}
+		if (stype == TYPE_PDF)
 			btype |= TYPE_MARKUP;
 	}

-	} else if (stype == TYPE_PDF) {
-		uint64_t i, tot8b;
-		uchar_t cur_byte;
-
-		/*
-		 * For PDF files we need to check for uncompressed PDFs. Those are compressed
-		 * using Libbsc.
-		 */
-		tot8b = 0;
-		for (i = 0; i < srclen; i++) {
-			cur_byte = src1[i];
-			tot8b += (cur_byte & 0x80);
-		}
-
-		tot8b /= 0x80;
-		if (adat->adapt_mode == 2 && tot8b > FORTY_PCT(srclen)) {
-			btype = TYPE_BINARY;
-		} else if (adat->adapt_mode == 1 && tot8b > FIFTY_PCT(srclen)) {
-			btype = TYPE_BINARY;
-		} else {
-			btype = TYPE_TEXT|TYPE_MARKUP;
-		}
-	}
+	/* Reset analyzer context for subsequent calls. */
+	adat->actx = NULL;

 	/*
 	 * Use PPMd if some percentage of source is 7-bit textual bytes, otherwise
--- a/filters/analyzer/analyzer.c
+++ b/filters/analyzer/analyzer.c
@ -23,15 +23,89 @@
 */

 #include "utils.h"
+#include "analyzer.h"
+
+#define	FIFTY_PCT(x)	(((x)/10) * 5)
+#define	FORTY_PCT(x)	(((x)/10) * 4)
+#define	ONE_PCT(x)	((x)/100)
+
+/*
+ * Scan a buffer once and classify it at several significance levels.
+ *
+ * src    - buffer to analyse; it is read as unsigned bytes.
+ * srclen - number of bytes in src.
+ * actx   - output; the btype of one_pct, forty_pct and fifty_pct is always
+ *          written.
+ *
+ * forty_pct / fifty_pct are TYPE_BINARY when the number of bytes that have
+ * the high bit set or are below 32 exceeds FORTY_PCT / FIFTY_PCT of srclen,
+ * and TYPE_TEXT otherwise. one_pct is TYPE_TEXT only when no byte has the
+ * high bit set and the count of bytes below 32 is less than
+ * (srclen>>1) + (srclen>>2) + (srclen>>3); otherwise it is TYPE_UNKNOWN, the
+ * same default analyze_buffer_simple() uses. TYPE_MARKUP is OR'ed into each
+ * result that is exactly TYPE_TEXT when the tag counts look like markup.
+ */
+void
+analyze_buffer(void *src, uint64_t srclen, analyzer_ctx_t *actx)
+{
+	uchar_t *src1 = (uchar_t *)src;
+	uint64_t i, tot8b, tot_8b, lbytes;
+	uchar_t cur_byte, prev_byte;
+	uint64_t tag1, tag2, tag3;
+	double tagcnt;
+	int markup;
+
+	/*
+	 * Give one_pct a defined value up front. It is only promoted to
+	 * TYPE_TEXT further down, and callers read it unconditionally, so
+	 * leaving it unset would expose an indeterminate value.
+	 */
+	actx->one_pct.btype = TYPE_UNKNOWN;
+
+	/*
+	 * Count number of 8-bit binary bytes and XML tags in source.
+	 */
+	tot8b = 0;
+	tag1 = 0;
+	tag2 = 0;
+	tag3 = 0;
+	lbytes = 0;
+	prev_byte = cur_byte = 0;
+	for (i = 0; i < srclen; i++) {
+		cur_byte = src1[i];
+		tot8b += (cur_byte & 0x80); // This way for possible auto-vectorization
+		lbytes += (cur_byte < 32);
+		tag1 += (cur_byte == '<');
+		tag2 += (cur_byte == '>');
+		tag3 += ((prev_byte == '<') & (cur_byte == '/'));
+		tag3 += ((prev_byte == '/') & (cur_byte == '>'));
+		/* Spaces are skipped so that "< /" and "/ >" still count as tags. */
+		if (cur_byte != ' ')
+			prev_byte = cur_byte;
+	}
+
+	/*
+	 * Heuristics for detecting BINARY vs generic TEXT vs XML data at various
+	 * significance levels.
+	 */
+	tot_8b = tot8b / 0x80 + lbytes;	/* tot8b accumulated 0x80 per high-bit byte */
+	tagcnt = tag1 + tag2 + tag3;
+	if (tot_8b > FORTY_PCT(srclen)) {
+		actx->forty_pct.btype = TYPE_BINARY;
+	} else {
+		actx->forty_pct.btype = TYPE_TEXT;
+	}
+
+	if (tot_8b > FIFTY_PCT(srclen)) {
+		actx->fifty_pct.btype = TYPE_BINARY;
+	} else {
+		actx->fifty_pct.btype = TYPE_TEXT;
+	}
+
+	/* Strict text test: no high-bit bytes at all. */
+	tot8b /= 0x80;
+	if (tot8b == 0 && lbytes < ((srclen>>1) + (srclen>>2) + (srclen>>3))) {
+		actx->one_pct.btype = TYPE_TEXT;
+	}
+
+	/*
+	 * Markup test: '<' and '>' counts roughly balanced, enough closing-tag
+	 * sequences, and tags making up more than 0.1% of the buffer.
+	 * NOTE(review): tag2 - 4 is evaluated in uint64_t and wraps when
+	 * tag2 < 4, so the balance test fails for such buffers. Left unchanged
+	 * to keep the heuristic as is.
+	 */
+	markup = 0;
+	if (tag1 > tag2 - 4 && tag1 < tag2 + 4 && tag3 > (double)tag1 * 0.40 &&
+	    tagcnt > (double)srclen * 0.001)
+		markup = 1;
+
+	if (markup) {
+		if (actx->forty_pct.btype == TYPE_TEXT)
+			actx->forty_pct.btype |= TYPE_MARKUP;
+		if (actx->fifty_pct.btype == TYPE_TEXT)
+			actx->fifty_pct.btype |= TYPE_MARKUP;
+		if (actx->one_pct.btype == TYPE_TEXT)
+			actx->one_pct.btype |= TYPE_MARKUP;
+	}
+}

 int
-analyze_buffer(void *src, uint64_t srclen)
+analyze_buffer_simple(void *src, uint64_t srclen)
 {
 	uchar_t *src1 = (uchar_t *)src;
 	uint64_t i, tot8b, lbytes;
 	uchar_t cur_byte;
 	int btype = TYPE_UNKNOWN;
-
 	/*
 	 * Count number of 8-bit binary bytes in source
 	 */
@ -42,7 +116,6 @@ analyze_buffer(void *src, uint64_t srclen)
 		tot8b += (cur_byte & 0x80); // This way for possible auto-vectorization
 		lbytes += (cur_byte < 32);
 	}
-
 	/*
 	 * Heuristics for detecting BINARY vs generic TEXT
 	 */
@ -50,6 +123,6 @@ analyze_buffer(void *src, uint64_t srclen)
 	if (tot8b == 0 && lbytes < ((srclen>>1) + (srclen>>2) + (srclen>>3))) {
 		btype = TYPE_TEXT;
 	}
-
 	return (btype);
 }
+
--- a/filters/analyzer/analyzer.h
+++ b/filters/analyzer/analyzer.h
@ -29,7 +29,18 @@
 extern "C" {
 #endif

-extern int analyze_buffer(void *src, uint64_t srclen);
+struct significance_value {	/* classification at one significance level */
+	int btype;	/* type flags written by analyze_buffer(), e.g. TYPE_TEXT or TYPE_BINARY */
+};
+
+typedef struct _analyzer_ctx {	/* output of one analyze_buffer() pass */
+	struct significance_value one_pct;	/* strict test: TYPE_TEXT only when no high-bit bytes were seen */
+	struct significance_value forty_pct;	/* TYPE_BINARY when binary-looking bytes exceed FORTY_PCT(srclen) */
+	struct significance_value fifty_pct;	/* TYPE_BINARY when binary-looking bytes exceed FIFTY_PCT(srclen) */
+} analyzer_ctx_t;
+
+void analyze_buffer(void *src, uint64_t srclen, analyzer_ctx_t *actx);
+int analyze_buffer_simple(void *src, uint64_t srclen);

 #ifdef  __cplusplus
 }
--- a/filters/dict/DictFilter.cpp
+++ b/filters/dict/DictFilter.cpp
@ -36,10 +36,6 @@
 #include "Common.h"
 #include "utils.h"

-extern "C" {
-extern int analyze_buffer(void *src, uint64_t srclen);
-}
-
 class DictFilter
 {
 public:
@ -270,7 +266,6 @@ dict_encode(void *dict_ctx, uchar_t *from, uint64_t fromlen, uchar_t *to, uint64
 	DictFilter *df = static_cast<DictFilter *>(dict_ctx);
 	u32 fl;
 	u32 dl;
-	int atype;
 	uchar_t *dst;
 	DEBUG_STAT_EN(double strt, en);

@ -283,8 +278,6 @@ dict_encode(void *dict_ctx, uchar_t *from, uint64_t fromlen, uchar_t *to, uint64
 	fl = (u32)fromlen;
 	dl = (u32)(*dstlen);
 	DEBUG_STAT_EN(strt = get_wtime_millis());
-	atype = analyze_buffer(from, fromlen);
-	if (PC_TYPE(atype) == TYPE_TEXT) {
 	U32_P(to) = LE32(fl);
 	dst = to + 4;
 	dl -= 4;
@ -297,7 +290,6 @@ dict_encode(void *dict_ctx, uchar_t *from, uint64_t fromlen, uchar_t *to, uint64
 				      get_mb_s(fromlen, strt, en)));
 		return (1);
 	}
-	}
 	DEBUG_STAT_EN(fprintf(stderr, "No DICT\n"));
 	return (-1);
 }
--- a/pcompress.c
+++ b/pcompress.c
@ -211,7 +211,8 @@ preproc_compress(pc_ctx_t *pctx, compress_func_ptr cmp_func, void *src, uint64_t
 	int result;
 	uint64_t _dstlen, fromlen;
 	uchar_t *from, *to;
-	int stype, dict;
+	int stype, dict, analyzed;
+	analyzer_ctx_t actx;
 	DEBUG_STAT_EN(double strt, en);

 	_dstlen = *dstlen;
@ -221,6 +222,14 @@ preproc_compress(pc_ctx_t *pctx, compress_func_ptr cmp_func, void *src, uint64_t
 	result = 0;
 	stype = PC_SUBTYPE(btype);
 	dict = 0;
+	analyzed = 0;
+
+	if (btype == TYPE_UNKNOWN || stype == TYPE_ARCHIVE_TAR || stype == TYPE_PDF || interesting) {
+		analyze_buffer(src, srclen, &actx);
+		analyzed = 1;
+		if (pctx->adapt_mode)
+			adapt_set_analyzer_ctx(data, &actx);
+	}

 	/*
 	 * If Dispack is enabled it has to be done first since Dispack analyses the
@ -246,8 +255,15 @@ preproc_compress(pc_ctx_t *pctx, compress_func_ptr cmp_func, void *src, uint64_t
 	 * Enabling LZP also enables the DICT filter since we are dealing with text
 	 * in any case.
 	 */
-	if (pctx->lzp_preprocess && (PC_TYPE(btype) == TYPE_UNKNOWN ||
-	    PC_TYPE(btype) == TYPE_TEXT || interesting)) {
+	if (pctx->lzp_preprocess) {
+		int b_type;
+
+		if (analyzed)
+			b_type = PC_TYPE(actx.one_pct.btype);
+		else
+			b_type = PC_TYPE(analyze_buffer_simple(from, fromlen));
+
+		if (b_type == TYPE_TEXT) {
 			void *dct = new_dict_context();
 			_dstlen = fromlen;
 			result = dict_encode(dct, from, fromlen, to, &_dstlen);
@ -262,13 +278,18 @@ preproc_compress(pc_ctx_t *pctx, compress_func_ptr cmp_func, void *src, uint64_t
 				dict = result;
 			}
 		}
+	}

 #ifndef _MPLV2_LICENSE_
-	if (pctx->lzp_preprocess && stype != TYPE_BMP && stype != TYPE_TIFF &&
-	    PC_TYPE(btype) != TYPE_BINARY) {
-		int hashsize;
+	if (pctx->lzp_preprocess && stype != TYPE_BMP && stype != TYPE_TIFF) {
+		int hashsize, b_type;
 		int64_t result;

+		b_type = btype;
+		if (analyzed)
+			b_type = actx.forty_pct.btype;
+	
+		if (PC_TYPE(b_type) != TYPE_BINARY) {
 			hashsize = lzp_hash_size(level);
 			result = lzp_compress((const uchar_t *)from, to, fromlen,
 					      hashsize, LZP_DEFAULT_LZPMINLEN, 0);
@ -281,14 +302,23 @@ preproc_compress(pc_ctx_t *pctx, compress_func_ptr cmp_func, void *src, uint64_t
 				type |= PREPROC_TYPE_LZP;
 			}
 		}
+	}
 #endif

 	if (pctx->enable_delta2_encode && props->delta2_span > 0 &&
 	    stype != TYPE_DNA_SEQ && stype != TYPE_BMP &&
-	    stype != TYPE_TIFF && stype != TYPE_MP4 && PC_TYPE(btype) != TYPE_TEXT) {
+	    stype != TYPE_TIFF && stype != TYPE_MP4) {
+		int b_type;
+
+		b_type = btype;
+		if (analyzed)
+			b_type = actx.one_pct.btype;
+
+		if (PC_TYPE(b_type) != TYPE_TEXT) {
 			_dstlen = fromlen;
 			result = delta2_encode((uchar_t *)from, fromlen, to,
-				       &_dstlen, props->delta2_span, pctx->delta2_nstrides);
+					       &_dstlen, props->delta2_span,
+					       pctx->delta2_nstrides);
 			if (result != -1) {
 				uchar_t *tmp;
 				tmp = from;
@ -298,6 +328,7 @@ preproc_compress(pc_ctx_t *pctx, compress_func_ptr cmp_func, void *src, uint64_t
 				type |= PREPROC_TYPE_DELTA2;
 			}
 		}
+	}

 	/*
 	 * Check which is the resulting buffer. If Encoded data is already sitting
--- a/pcompress.h
+++ b/pcompress.h
@ -36,7 +36,8 @@ extern "C" {

 #include <rabin_dedup.h>
 #include <crypto_utils.h>
-#include "meta_stream.h"
+#include <filters/analyzer/analyzer.h>
+#include <meta_stream.h>

 #define	CHUNK_FLAG_SZ	1
 #define	ALGO_SZ		8
@ -152,6 +153,7 @@ extern int lz4_init(void **data, int *level, int nthreads, uint64_t chunksize,
 		    int file_version, compress_op_t op);
 extern int none_init(void **data, int *level, int nthreads, uint64_t chunksize,
 		     int file_version, compress_op_t op);
+extern void adapt_set_analyzer_ctx(void *data, analyzer_ctx_t *actx);

 extern void lzma_props(algo_props_t *data, int level, uint64_t chunksize);
 extern void lzma_mt_props(algo_props_t *data, int level, uint64_t chunksize);