From 507e7c75d3be7e9f494130849daebd081e4dca43 Mon Sep 17 00:00:00 2001
From: Moinak Ghosh <moinakg@gmail.com>
Date: Thu, 6 Nov 2014 22:23:33 +0530
Subject: [PATCH] Centralise data analysis routine for optimum performance and
 leverage. Utilise buffer data analysis for preprocessing filters.

---
 adaptive_compress.c         |  85 ++++++++--------------------
 filters/analyzer/analyzer.c |  81 +++++++++++++++++++++++++--
 filters/analyzer/analyzer.h |  13 ++++-
 filters/dict/DictFilter.cpp |  30 ++++------
 pcompress.c                 | 107 +++++++++++++++++++++++-------------
 pcompress.h                 |   4 +-
 6 files changed, 195 insertions(+), 125 deletions(-)

diff --git a/adaptive_compress.c b/adaptive_compress.c
index bbce293..65a0475 100644
--- a/adaptive_compress.c
+++ b/adaptive_compress.c
@@ -38,6 +38,7 @@
 #include <pcompress.h>
 #include <allocator.h>
 #include <pc_archive.h>
+#include "filters/analyzer/analyzer.h"
 
 #define	FIFTY_PCT(x)	(((x)/10) * 5)
 #define	FORTY_PCT(x)	(((x)/10) * 4)
@@ -97,8 +98,16 @@ struct adapt_data {
 	void *bsc_data;
 	void *lz4_data;
 	int adapt_mode;
+	analyzer_ctx_t *actx;
 };
 
+/* Attach a precomputed buffer analysis for the next adapt_compress() call. */
+void
+adapt_set_analyzer_ctx(void *data, analyzer_ctx_t *actx)
+{
+	((struct adapt_data *)data)->actx = actx;
+}
+
 void
 adapt_stats(int show)
 {
@@ -246,76 +255,28 @@ adapt_compress(void *src, uint64_t srclen, void *dst,
 	uint64_t *dstlen, int level, uchar_t chdr, int btype, void *data)
 {
 	struct adapt_data *adat = (struct adapt_data *)(data);
-	uchar_t *src1 = (uchar_t *)src;
 	int rv = 0, bsc_type = 0;
 	int stype = PC_SUBTYPE(btype);
+	analyzer_ctx_t actx;
 
-	if (btype == TYPE_UNKNOWN || stype == TYPE_ARCHIVE_TAR) {
-		uint64_t i, tot8b, tag1, tag2, tag3, lbytes;
-		double tagcnt, pct_tag;
-		uchar_t cur_byte, prev_byte;
-		/*
-		 * Count number of 8-bit binary bytes and XML tags in source.
-		 */
-		tot8b = 0;
-		tag1 = 0;
-		tag2 = 0;
-		tag3 = 0;
-		lbytes = 0;
-		prev_byte = cur_byte = 0;
-		for (i = 0; i < srclen; i++) {
-			cur_byte = src1[i];
-			tot8b += (cur_byte & 0x80); // This way for possible auto-vectorization
-			lbytes += (cur_byte < 32);
-			tag1 += (cur_byte == '<');
-			tag2 += (cur_byte == '>');
-			tag3 += ((prev_byte == '<') & (cur_byte == '/'));
-			tag3 += ((prev_byte == '/') & (cur_byte == '>'));
-			if (cur_byte != ' ')
-				prev_byte = cur_byte;
+	if (btype == TYPE_UNKNOWN || stype == TYPE_ARCHIVE_TAR || stype == TYPE_PDF) {
+		if (adat->actx == NULL) {
+			analyze_buffer(src, srclen, &actx);
+			adat->actx = &actx;
 		}
+		if (adat->adapt_mode == 2) {
+			btype = adat->actx->forty_pct.btype;
 
-		/*
-		 * Heuristics for detecting BINARY vs generic TEXT vs XML data.
-		 */
-		tot8b = tot8b / 0x80 + lbytes;
-		tagcnt = tag1 + tag2 + tag3;
-		pct_tag = tagcnt / (double)srclen;
-		if (adat->adapt_mode == 2 && tot8b > FORTY_PCT(srclen)) {
-			btype = TYPE_BINARY;
-		} else if (adat->adapt_mode == 1 && tot8b > FIFTY_PCT(srclen)) {
-			btype = TYPE_BINARY;
-		} else {
-			btype = TYPE_TEXT;
-			if (tag1 > tag2 - 4 && tag1 < tag2 + 4 && tag3 > (double)tag1 * 0.40 &&
-			    tagcnt > (double)srclen * 0.001)
-				btype |= TYPE_MARKUP;
-		}
-
-	} else if (stype == TYPE_PDF) {
-		uint64_t i, tot8b;
-		uchar_t cur_byte;
-
-		/*
-		 * For PDF files we need to check for uncompressed PDFs. Those are compressed
-		 * using Libbsc.
-		 */
-		tot8b = 0;
-		for (i = 0; i < srclen; i++) {
-			cur_byte = src1[i];
-			tot8b += (cur_byte & 0x80);
-		}
-
-		tot8b /= 0x80;
-		if (adat->adapt_mode == 2 && tot8b > FORTY_PCT(srclen)) {
-			btype = TYPE_BINARY;
-		} else if (adat->adapt_mode == 1 && tot8b > FIFTY_PCT(srclen)) {
-			btype = TYPE_BINARY;
-		} else {
-			btype = TYPE_TEXT|TYPE_MARKUP;
+		} else if (adat->adapt_mode == 1) {
+			btype = adat->actx->fifty_pct.btype;
 		}
+		if (stype == TYPE_PDF)
+			btype |= TYPE_MARKUP;
 	}
 
+	/* Clear the context: it may reference stack storage that ends with this call. */
+	adat->actx = NULL;
+
 	/*
 	 * Use PPMd if some percentage of source is 7-bit textual bytes, otherwise
 	 * use Bzip2 or LZMA. For totally incompressible data we always use LZ4. There
diff --git a/filters/analyzer/analyzer.c b/filters/analyzer/analyzer.c
index a52824a..54033fb 100644
--- a/filters/analyzer/analyzer.c
+++ b/filters/analyzer/analyzer.c
@@ -23,15 +23,89 @@
  */
 
 #include "utils.h"
+#include "analyzer.h"
+
+#define	FIFTY_PCT(x)	(((x)/10) * 5)
+#define	FORTY_PCT(x)	(((x)/10) * 4)
+#define	ONE_PCT(x)	((x)/100)
+
+/*
+ * Scan srclen bytes at src in one pass and store a TYPE_* classification in
+ * actx for each significance level (one_pct, forty_pct and fifty_pct).
+ */
+void
+analyze_buffer(void *src, uint64_t srclen, analyzer_ctx_t *actx)
+{
+	uchar_t *src1 = (uchar_t *)src;
+	uint64_t i, tot8b, tot_8b, lbytes;
+	uchar_t cur_byte, prev_byte;
+	uint64_t tag1, tag2, tag3;
+	double tagcnt;
+	int markup;
+
+	/*
+	 * Count number of 8-bit binary bytes and XML tags in source.
+	 */
+	tot8b = 0;
+	tag1 = 0;
+	tag2 = 0;
+	tag3 = 0;
+	lbytes = 0;
+	prev_byte = cur_byte = 0;
+	for (i = 0; i < srclen; i++) {
+		cur_byte = src1[i];
+		tot8b += (cur_byte & 0x80); // This way for possible auto-vectorization
+		lbytes += (cur_byte < 32);
+		tag1 += (cur_byte == '<');
+		tag2 += (cur_byte == '>');
+		tag3 += ((prev_byte == '<') & (cur_byte == '/'));
+		tag3 += ((prev_byte == '/') & (cur_byte == '>'));
+		if (cur_byte != ' ')
+			prev_byte = cur_byte;
+	}
+
+	/*
+	 * Heuristics for detecting BINARY vs generic TEXT vs XML data at various
+	 * significance levels. tot_8b is the number of bytes with the high bit
+	 * set plus the number of bytes below 32.
+	 */
+	tot_8b = tot8b / 0x80 + lbytes;
+	tagcnt = tag1 + tag2 + tag3;
+	actx->forty_pct.btype = (tot_8b > FORTY_PCT(srclen)) ? TYPE_BINARY : TYPE_TEXT;
+	actx->fifty_pct.btype = (tot_8b > FIFTY_PCT(srclen)) ? TYPE_BINARY : TYPE_TEXT;
+
+	/*
+	 * one_pct must always be assigned: callers hand in an uninitialized
+	 * stack context and read this field whenever analysis ran. Use the
+	 * same default and text test as analyze_buffer_simple().
+	 */
+	tot8b /= 0x80;
+	actx->one_pct.btype = TYPE_UNKNOWN;
+	if (tot8b == 0 && lbytes < ((srclen>>1) + (srclen>>2) + (srclen>>3)))
+		actx->one_pct.btype = TYPE_TEXT;
+
+	markup = 0;
+	if (tag1 > tag2 - 4 && tag1 < tag2 + 4 && tag3 > (double)tag1 * 0.40 &&
+	    tagcnt > (double)srclen * 0.001)
+		markup = 1;
+
+	if (markup) {
+		if (actx->forty_pct.btype == TYPE_TEXT)
+			actx->forty_pct.btype |= TYPE_MARKUP;
+		if (actx->fifty_pct.btype == TYPE_TEXT)
+			actx->fifty_pct.btype |= TYPE_MARKUP;
+		if (actx->one_pct.btype == TYPE_TEXT)
+			actx->one_pct.btype |= TYPE_MARKUP;
+	}
+}
 
 int
-analyze_buffer(void *src, uint64_t srclen)
+analyze_buffer_simple(void *src, uint64_t srclen)
 {
 	uchar_t *src1 = (uchar_t *)src;
 	uint64_t i, tot8b, lbytes;
 	uchar_t cur_byte;
 	int btype = TYPE_UNKNOWN;
-
 	/*
 	 * Count number of 8-bit binary bytes in source
 	 */
@@ -42,7 +116,6 @@ analyze_buffer(void *src, uint64_t srclen)
 		tot8b += (cur_byte & 0x80); // This way for possible auto-vectorization
 		lbytes += (cur_byte < 32);
 	}
-
 	/*
 	 * Heuristics for detecting BINARY vs generic TEXT
 	 */
@@ -50,6 +123,6 @@ analyze_buffer(void *src, uint64_t srclen)
 	if (tot8b == 0 && lbytes < ((srclen>>1) + (srclen>>2) + (srclen>>3))) {
 		btype = TYPE_TEXT;
 	}
-
 	return (btype);
 }
+
diff --git a/filters/analyzer/analyzer.h b/filters/analyzer/analyzer.h
index 682111d..9eefd9c 100644
--- a/filters/analyzer/analyzer.h
+++ b/filters/analyzer/analyzer.h
@@ -29,7 +29,18 @@
 extern "C" {
 #endif
 
-extern int analyze_buffer(void *src, uint64_t srclen);
+struct significance_value {
+	int btype;	/* TYPE_* value assigned by analyze_buffer() */
+};
+
+typedef struct _analyzer_ctx {
+	struct significance_value one_pct;	/* TEXT if no high-bit bytes and control bytes < ~7/8 */
+	struct significance_value forty_pct;	/* BINARY if high-bit + control bytes exceed ~40% */
+	struct significance_value fifty_pct;	/* BINARY if high-bit + control bytes exceed ~50% */
+} analyzer_ctx_t;
+
+void analyze_buffer(void *src, uint64_t srclen, analyzer_ctx_t *actx);	/* classifies src into actx */
+int analyze_buffer_simple(void *src, uint64_t srclen);	/* returns a TYPE_* classification of src */
 
 #ifdef  __cplusplus
 }
diff --git a/filters/dict/DictFilter.cpp b/filters/dict/DictFilter.cpp
index 299b661..b2636b4 100644
--- a/filters/dict/DictFilter.cpp
+++ b/filters/dict/DictFilter.cpp
@@ -36,10 +36,6 @@
 #include "Common.h"
 #include "utils.h"
 
-extern "C" {
-extern int analyze_buffer(void *src, uint64_t srclen);
-}
-
 class DictFilter
 {
 public:
@@ -270,7 +266,6 @@ dict_encode(void *dict_ctx, uchar_t *from, uint64_t fromlen, uchar_t *to, uint64
 	DictFilter *df = static_cast<DictFilter *>(dict_ctx);
 	u32 fl;
 	u32 dl;
-	int atype;
 	uchar_t *dst;
 	DEBUG_STAT_EN(double strt, en);
 
@@ -283,20 +278,17 @@ dict_encode(void *dict_ctx, uchar_t *from, uint64_t fromlen, uchar_t *to, uint64
 	fl = (u32)fromlen;
 	dl = (u32)(*dstlen);
 	DEBUG_STAT_EN(strt = get_wtime_millis());
-	atype = analyze_buffer(from, fromlen);
-	if (PC_TYPE(atype) == TYPE_TEXT) {
-		U32_P(to) = LE32(fl);
-		dst = to + 4;
-		dl -= 4;
-		if (df->Forward_Dict(from, fl, dst, &dl)) {
-			*dstlen = dl + 8;
-			DEBUG_STAT_EN(en = get_wtime_millis());
-			DEBUG_STAT_EN(fprintf(stderr, "DICT: fromlen: %" PRIu64 ", dstlen: %" PRIu64 "\n",
-			    fromlen, *dstlen));
-			DEBUG_STAT_EN(fprintf(stderr, "DICT: Processed at %.3f MB/s\n",
-			    get_mb_s(fromlen, strt, en)));
-			return (1);
-		}
+	U32_P(to) = LE32(fl);
+	dst = to + 4;
+	dl -= 4;
+	if (df->Forward_Dict(from, fl, dst, &dl)) {
+		*dstlen = dl + 8;
+		DEBUG_STAT_EN(en = get_wtime_millis());
+		DEBUG_STAT_EN(fprintf(stderr, "DICT: fromlen: %" PRIu64 ", dstlen: %" PRIu64 "\n",
+				      fromlen, *dstlen));
+		DEBUG_STAT_EN(fprintf(stderr, "DICT: Processed at %.3f MB/s\n",
+				      get_mb_s(fromlen, strt, en)));
+		return (1);
 	}
 	DEBUG_STAT_EN(fprintf(stderr, "No DICT\n"));
 	return (-1);
diff --git a/pcompress.c b/pcompress.c
index 6a687f0..2f6eb03 100644
--- a/pcompress.c
+++ b/pcompress.c
@@ -211,7 +211,8 @@ preproc_compress(pc_ctx_t *pctx, compress_func_ptr cmp_func, void *src, uint64_t
 	int result;
 	uint64_t _dstlen, fromlen;
 	uchar_t *from, *to;
-	int stype, dict;
+	int stype, dict, analyzed;
+	analyzer_ctx_t actx;
 	DEBUG_STAT_EN(double strt, en);
 
 	_dstlen = *dstlen;
@@ -221,6 +222,14 @@ preproc_compress(pc_ctx_t *pctx, compress_func_ptr cmp_func, void *src, uint64_t
 	result = 0;
 	stype = PC_SUBTYPE(btype);
 	dict = 0;
+	analyzed = 0;
+
+	if (btype == TYPE_UNKNOWN || stype == TYPE_ARCHIVE_TAR || stype == TYPE_PDF || interesting) {
+		analyze_buffer(src, srclen, &actx);
+		analyzed = 1;
+		if (pctx->adapt_mode)
+			adapt_set_analyzer_ctx(data, &actx);
+	}
 
 	/*
 	 * If Dispack is enabled it has to be done first since Dispack analyses the
@@ -246,56 +255,78 @@ preproc_compress(pc_ctx_t *pctx, compress_func_ptr cmp_func, void *src, uint64_t
 	 * Enabling LZP also enables the DICT filter since we are dealing with text
 	 * in any case.
 	 */
-	if (pctx->lzp_preprocess && (PC_TYPE(btype) == TYPE_UNKNOWN ||
-	    PC_TYPE(btype) == TYPE_TEXT || interesting)) {
-		void *dct = new_dict_context();
-		_dstlen = fromlen;
-		result = dict_encode(dct, from, fromlen, to, &_dstlen);
-		delete_dict_context(dct);
-		if (result != -1) {
-			uchar_t *tmp;
-			tmp = from;
-			from = to;
-			to = tmp;
-			fromlen = _dstlen;
-			type |= PREPROC_TYPE_DICT;
-			dict = result;
+	if (pctx->lzp_preprocess) {
+		int b_type;
+
+		if (analyzed)
+			b_type = PC_TYPE(actx.one_pct.btype);
+		else
+			b_type = PC_TYPE(analyze_buffer_simple(from, fromlen));
+
+		if (b_type == TYPE_TEXT) {
+			void *dct = new_dict_context();
+			_dstlen = fromlen;
+			result = dict_encode(dct, from, fromlen, to, &_dstlen);
+			delete_dict_context(dct);
+			if (result != -1) {
+				uchar_t *tmp;
+				tmp = from;
+				from = to;
+				to = tmp;
+				fromlen = _dstlen;
+				type |= PREPROC_TYPE_DICT;
+				dict = result;
+			}
 		}
 	}
 
 #ifndef _MPLV2_LICENSE_
-	if (pctx->lzp_preprocess && stype != TYPE_BMP && stype != TYPE_TIFF &&
-	    PC_TYPE(btype) != TYPE_BINARY) {
-		int hashsize;
+	if (pctx->lzp_preprocess && stype != TYPE_BMP && stype != TYPE_TIFF) {
+		int hashsize, b_type;
 		int64_t result;
 
-		hashsize = lzp_hash_size(level);
-		result = lzp_compress((const uchar_t *)from, to, fromlen,
+		b_type = btype;
+		if (analyzed)
+			b_type = actx.forty_pct.btype;
+	
+		if (PC_TYPE(b_type) != TYPE_BINARY) {
+			hashsize = lzp_hash_size(level);
+			result = lzp_compress((const uchar_t *)from, to, fromlen,
 					      hashsize, LZP_DEFAULT_LZPMINLEN, 0);
-		if (result >= 0 && result < srclen) {
-			uchar_t *tmp;
-			tmp = from;
-			from = to;
-			to = tmp;
-			fromlen = result;
-			type |= PREPROC_TYPE_LZP;
+			if (result >= 0 && result < srclen) {
+				uchar_t *tmp;
+				tmp = from;
+				from = to;
+				to = tmp;
+				fromlen = result;
+				type |= PREPROC_TYPE_LZP;
+			}
 		}
 	}
 #endif
 
 	if (pctx->enable_delta2_encode && props->delta2_span > 0 &&
 	    stype != TYPE_DNA_SEQ && stype != TYPE_BMP &&
-	    stype != TYPE_TIFF && stype != TYPE_MP4 && PC_TYPE(btype) != TYPE_TEXT) {
-		_dstlen = fromlen;
-		result = delta2_encode((uchar_t *)from, fromlen, to,
-				       &_dstlen, props->delta2_span, pctx->delta2_nstrides);
-		if (result != -1) {
-			uchar_t *tmp;
-			tmp = from;
-			from = to;
-			to = tmp;
-			fromlen = _dstlen;
-			type |= PREPROC_TYPE_DELTA2;
+	    stype != TYPE_TIFF && stype != TYPE_MP4) {
+		int b_type;
+
+		b_type = btype;
+		if (analyzed)
+			b_type = actx.one_pct.btype;
+
+		if (PC_TYPE(b_type) != TYPE_TEXT) {
+			_dstlen = fromlen;
+			result = delta2_encode((uchar_t *)from, fromlen, to,
+					       &_dstlen, props->delta2_span,
+					       pctx->delta2_nstrides);
+			if (result != -1) {
+				uchar_t *tmp;
+				tmp = from;
+				from = to;
+				to = tmp;
+				fromlen = _dstlen;
+				type |= PREPROC_TYPE_DELTA2;
+			}
 		}
 	}
 
diff --git a/pcompress.h b/pcompress.h
index 6c5eb4f..eeaec90 100644
--- a/pcompress.h
+++ b/pcompress.h
@@ -36,7 +36,8 @@ extern "C" {
 
 #include <rabin_dedup.h>
 #include <crypto_utils.h>
-#include "meta_stream.h"
+#include <filters/analyzer/analyzer.h>
+#include <meta_stream.h>
 
 #define	CHUNK_FLAG_SZ	1
 #define	ALGO_SZ		8
@@ -152,6 +153,7 @@ extern int lz4_init(void **data, int *level, int nthreads, uint64_t chunksize,
 		    int file_version, compress_op_t op);
 extern int none_init(void **data, int *level, int nthreads, uint64_t chunksize,
 		     int file_version, compress_op_t op);
+extern void adapt_set_analyzer_ctx(void *data, analyzer_ctx_t *actx);
 
 extern void lzma_props(algo_props_t *data, int level, uint64_t chunksize);
 extern void lzma_mt_props(algo_props_t *data, int level, uint64_t chunksize);