Centralise data analysis routine for optimum performance and leverage.

Utilise buffer data analysis for preprocessing filters.
2014-11-06 22:23:33 +05:30 · 2014-11-06 22:23:33 +05:30 · 507e7c75d3
commit 507e7c75d3
parent 848010fbb5
6 changed files with 195 additions and 125 deletions
--- a/adaptive_compress.c
+++ b/adaptive_compress.c
@ -38,6 +38,7 @@
 #include <pcompress.h>
 #include <allocator.h>
 #include <pc_archive.h>
 #include "filters/analyzer/analyzer.h"
 #define	FIFTY_PCT(x)	(((x)/10) * 5)
 #define	FORTY_PCT(x)	(((x)/10) * 4)
@ -97,8 +98,16 @@ struct adapt_data {
 	void *bsc_data;
 	void *lz4_data;
 	int adapt_mode;
 	analyzer_ctx_t *actx;
 };
 /*
  * Hand a caller-computed analyzer context to the adaptive compressor.
  *
  *   data - opaque pointer to the adaptive compressor's struct adapt_data.
  *   actx - analysis results to store in it; the pointer is stored as-is,
  *          nothing is copied.
  */
 void
 adapt_set_analyzer_ctx(void *data, analyzer_ctx_t *actx)
 {
 	((struct adapt_data *)data)->actx = actx;
 }
 void
 adapt_stats(int show)
 {
@ -246,76 +255,28 @@ adapt_compress(void *src, uint64_t srclen, void *dst,
 	uint64_t *dstlen, int level, uchar_t chdr, int btype, void *data)
 {
 	struct adapt_data *adat = (struct adapt_data *)(data);
 	uchar_t *src1 = (uchar_t *)src;
 	int rv = 0, bsc_type = 0;
 	int stype = PC_SUBTYPE(btype);
 	analyzer_ctx_t actx;
-	if (btype == TYPE_UNKNOWN || stype == TYPE_ARCHIVE_TAR) {
+	if (btype == TYPE_UNKNOWN || stype == TYPE_ARCHIVE_TAR || stype == TYPE_PDF) {
-		uint64_t i, tot8b, tag1, tag2, tag3, lbytes;
+		if (adat->actx == NULL) {
-		double tagcnt, pct_tag;
+			analyze_buffer(src, srclen, &actx);
-		uchar_t cur_byte, prev_byte;
+			adat->actx = &actx;
 		/*
 		 * Count number of 8-bit binary bytes and XML tags in source.
 		 */
 		tot8b = 0;
 		tag1 = 0;
 		tag2 = 0;
 		tag3 = 0;
 		lbytes = 0;
 		prev_byte = cur_byte = 0;
 		for (i = 0; i < srclen; i++) {
 			cur_byte = src1[i];
 			tot8b += (cur_byte & 0x80); // This way for possible auto-vectorization
 			lbytes += (cur_byte < 32);
 			tag1 += (cur_byte == '<');
 			tag2 += (cur_byte == '>');
 			tag3 += ((prev_byte == '<') & (cur_byte == '/'));
 			tag3 += ((prev_byte == '/') & (cur_byte == '>'));
 			if (cur_byte != ' ')
 				prev_byte = cur_byte;
 		}
 		if (adat->adapt_mode == 2) {
 			btype = adat->actx->forty_pct.btype;
-		/*
+		} else if (adat->adapt_mode == 1) {
-		 * Heuristics for detecting BINARY vs generic TEXT vs XML data.
+			btype = adat->actx->fifty_pct.btype;
 		 */
 		tot8b = tot8b / 0x80 + lbytes;
 		tagcnt = tag1 + tag2 + tag3;
 		pct_tag = tagcnt / (double)srclen;
 		if (adat->adapt_mode == 2 && tot8b > FORTY_PCT(srclen)) {
 			btype = TYPE_BINARY;
 		} else if (adat->adapt_mode == 1 && tot8b > FIFTY_PCT(srclen)) {
 			btype = TYPE_BINARY;
 		} else {
 			btype = TYPE_TEXT;
 			if (tag1 > tag2 - 4 && tag1 < tag2 + 4 && tag3 > (double)tag1 * 0.40 &&
 			    tagcnt > (double)srclen * 0.001)
 				btype |= TYPE_MARKUP;
 		}
 	} else if (stype == TYPE_PDF) {
 		uint64_t i, tot8b;
 		uchar_t cur_byte;
 		/*
 		 * For PDF files we need to check for uncompressed PDFs. Those are compressed
 		 * using Libbsc.
 		 */
 		tot8b = 0;
 		for (i = 0; i < srclen; i++) {
 			cur_byte = src1[i];
 			tot8b += (cur_byte & 0x80);
 		}
 		tot8b /= 0x80;
 		if (adat->adapt_mode == 2 && tot8b > FORTY_PCT(srclen)) {
 			btype = TYPE_BINARY;
 		} else if (adat->adapt_mode == 1 && tot8b > FIFTY_PCT(srclen)) {
 			btype = TYPE_BINARY;
 		} else {
 			btype = TYPE_TEXT|TYPE_MARKUP;
 		}
 		if (stype == TYPE_PDF)
 			btype |= TYPE_MARKUP;
 	}
 	/* Reset analyzer context for subsequent calls. */
 	adat->actx = NULL;
 	/*
 	 * Use PPMd if some percentage of source is 7-bit textual bytes, otherwise
 	 * use Bzip2 or LZMA. For totally incompressible data we always use LZ4. There
--- a/filters/analyzer/analyzer.c
+++ b/filters/analyzer/analyzer.c
@ -23,15 +23,89 @@
 */
 #include "utils.h"
 #include "analyzer.h"
 #define	FIFTY_PCT(x)	(((x)/10) * 5)
 #define	FORTY_PCT(x)	(((x)/10) * 4)
 #define	ONE_PCT(x)	((x)/100)
 /*
  * Classify a buffer as binary or text (optionally markup) at several
  * significance levels and store the results in *actx.
  *
  *   src    - buffer to scan; read as unsigned bytes.
  *   srclen - number of bytes in src.
  *   actx   - output; every btype field is assigned before returning.
  *
  * forty_pct / fifty_pct become TYPE_BINARY when the number of bytes with the
  * high bit set plus the number of bytes below 32 exceeds FORTY_PCT /
  * FIFTY_PCT of srclen, otherwise TYPE_TEXT. one_pct becomes TYPE_TEXT only
  * when no byte has the high bit set and bytes below 32 make up less than 7/8
  * of srclen; otherwise it is TYPE_UNKNOWN, the same default that
  * analyze_buffer_simple() uses. TYPE_MARKUP is OR'ed into every TYPE_TEXT
  * result when the '<', '>', "</" and "/>" counts look like XML-style markup.
  */
 void
 analyze_buffer(void *src, uint64_t srclen, analyzer_ctx_t *actx)
 {
 	uchar_t *src1 = (uchar_t *)src;
 	uint64_t i, tot8b, tot_8b, lbytes;
 	uchar_t cur_byte, prev_byte;
 	uint64_t tag1, tag2, tag3;
 	double tagcnt;
 	int markup;
 	/*
 	 * Count number of 8-bit binary bytes and XML tags in source.
 	 */
 	tot8b = 0;
 	tag1 = 0;
 	tag2 = 0;
 	tag3 = 0;
 	lbytes = 0;
 	prev_byte = cur_byte = 0;
 	for (i = 0; i < srclen; i++) {
 		cur_byte = src1[i];
 		tot8b += (cur_byte & 0x80); // This way for possible auto-vectorization
 		lbytes += (cur_byte < 32);
 		tag1 += (cur_byte == '<');
 		tag2 += (cur_byte == '>');
 		tag3 += ((prev_byte == '<') & (cur_byte == '/'));
 		tag3 += ((prev_byte == '/') & (cur_byte == '>'));
 		/* Spaces are skipped so that "< /" and "/ >" still count as tag ends. */
 		if (cur_byte != ' ')
 			prev_byte = cur_byte;
 	}
 	/*
 	 * Heuristics for detecting BINARY vs generic TEXT vs XML data at various
 	 * significance levels.
 	 */
 	/* tot8b accumulated 0x80 per high-bit byte; divide to get the byte count. */
 	tot8b /= 0x80;
 	tot_8b = tot8b + lbytes;
 	tagcnt = tag1 + tag2 + tag3;
 	if (tot_8b > FORTY_PCT(srclen)) {
 		actx->forty_pct.btype = TYPE_BINARY;
 	} else {
 		actx->forty_pct.btype = TYPE_TEXT;
 	}
 	if (tot_8b > FIFTY_PCT(srclen)) {
 		actx->fifty_pct.btype = TYPE_BINARY;
 	} else {
 		actx->fifty_pct.btype = TYPE_TEXT;
 	}
 	/*
 	 * Always assign one_pct: callers pass an uninitialised stack context and
 	 * read this field unconditionally, so leaving it unset would expose an
 	 * indeterminate value.
 	 */
 	if (tot8b == 0 && lbytes < ((srclen>>1) + (srclen>>2) + (srclen>>3))) {
 		actx->one_pct.btype = TYPE_TEXT;
 	} else {
 		actx->one_pct.btype = TYPE_UNKNOWN;
 	}
 	/*
 	 * Markup: '<' and '>' counts within 4 of each other, enough closing
 	 * sequences and enough tags overall. The lower bound is written as
 	 * tag1 + 4 > tag2 because the counters are unsigned and tag2 - 4 would
 	 * wrap around whenever tag2 < 4.
 	 */
 	markup = 0;
 	if (tag1 + 4 > tag2 && tag1 < tag2 + 4 && tag3 > (double)tag1 * 0.40 &&
 	    tagcnt > (double)srclen * 0.001)
 		markup = 1;
 	if (markup) {
 		if (actx->forty_pct.btype == TYPE_TEXT)
 			actx->forty_pct.btype |= TYPE_MARKUP;
 		if (actx->fifty_pct.btype == TYPE_TEXT)
 			actx->fifty_pct.btype |= TYPE_MARKUP;
 		if (actx->one_pct.btype == TYPE_TEXT)
 			actx->one_pct.btype |= TYPE_MARKUP;
 	}
 }
 int
-analyze_buffer(void *src, uint64_t srclen)
+analyze_buffer_simple(void *src, uint64_t srclen)
 {
 	uchar_t *src1 = (uchar_t *)src;
 	uint64_t i, tot8b, lbytes;
 	uchar_t cur_byte;
 	int btype = TYPE_UNKNOWN;
 	/*
 	 * Count number of 8-bit binary bytes in source
 	 */
@ -42,7 +116,6 @@ analyze_buffer(void *src, uint64_t srclen)
 		tot8b += (cur_byte & 0x80); // This way for possible auto-vectorization
 		lbytes += (cur_byte < 32);
 	}
 	/*
 	 * Heuristics for detecting BINARY vs generic TEXT
 	 */
@ -50,6 +123,6 @@ analyze_buffer(void *src, uint64_t srclen)
 	if (tot8b == 0 && lbytes < ((srclen>>1) + (srclen>>2) + (srclen>>3))) {
 		btype = TYPE_TEXT;
 	}
 	return (btype);
 }
--- a/filters/analyzer/analyzer.h
+++ b/filters/analyzer/analyzer.h
@ -29,7 +29,18 @@
 extern "C" {
 #endif
-extern int analyze_buffer(void *src, uint64_t srclen);
+struct significance_value {
 	/*
 	 * Data type classification for one significance level. analyze_buffer()
 	 * stores TYPE_BINARY or TYPE_TEXT here, and may OR TYPE_MARKUP into a
 	 * TYPE_TEXT value.
 	 */
 	int btype;
 };
 /*
  * Results of one analyze_buffer() pass over a buffer: one classification per
  * significance level, so that different consumers can pick the threshold that
  * suits them without rescanning the data.
  */
 typedef struct _analyzer_ctx {
 	/*
 	 * TYPE_TEXT when the buffer has no bytes with the high bit set and bytes
 	 * below 32 are fewer than 7/8 of its length.
 	 */
 	struct significance_value one_pct;
 	/* TYPE_BINARY when high-bit bytes plus bytes below 32 exceed FORTY_PCT of the length. */
 	struct significance_value forty_pct;
 	/* TYPE_BINARY when high-bit bytes plus bytes below 32 exceed FIFTY_PCT of the length. */
 	struct significance_value fifty_pct;
 } analyzer_ctx_t;
 void analyze_buffer(void *src, uint64_t srclen, analyzer_ctx_t *actx);
 int analyze_buffer_simple(void *src, uint64_t srclen);
 #ifdef  __cplusplus
 }
--- a/filters/dict/DictFilter.cpp
+++ b/filters/dict/DictFilter.cpp
@ -36,10 +36,6 @@
 #include "Common.h"
 #include "utils.h"
 extern "C" {
 extern int analyze_buffer(void *src, uint64_t srclen);
 }
 class DictFilter
 {
 public:
@ -270,7 +266,6 @@ dict_encode(void *dict_ctx, uchar_t *from, uint64_t fromlen, uchar_t *to, uint64
 	DictFilter *df = static_cast<DictFilter *>(dict_ctx);
 	u32 fl;
 	u32 dl;
 	int atype;
 	uchar_t *dst;
 	DEBUG_STAT_EN(double strt, en);
@ -283,20 +278,17 @@ dict_encode(void *dict_ctx, uchar_t *from, uint64_t fromlen, uchar_t *to, uint64
 	fl = (u32)fromlen;
 	dl = (u32)(*dstlen);
 	DEBUG_STAT_EN(strt = get_wtime_millis());
-	atype = analyze_buffer(from, fromlen);
+	U32_P(to) = LE32(fl);
-	if (PC_TYPE(atype) == TYPE_TEXT) {
+	dst = to + 4;
-		U32_P(to) = LE32(fl);
+	dl -= 4;
-		dst = to + 4;
+	if (df->Forward_Dict(from, fl, dst, &dl)) {
-		dl -= 4;
+		*dstlen = dl + 8;
-		if (df->Forward_Dict(from, fl, dst, &dl)) {
+		DEBUG_STAT_EN(en = get_wtime_millis());
-			*dstlen = dl + 8;
+		DEBUG_STAT_EN(fprintf(stderr, "DICT: fromlen: %" PRIu64 ", dstlen: %" PRIu64 "\n",
-			DEBUG_STAT_EN(en = get_wtime_millis());
+				      fromlen, *dstlen));
-			DEBUG_STAT_EN(fprintf(stderr, "DICT: fromlen: %" PRIu64 ", dstlen: %" PRIu64 "\n",
+		DEBUG_STAT_EN(fprintf(stderr, "DICT: Processed at %.3f MB/s\n",
-			    fromlen, *dstlen));
+				      get_mb_s(fromlen, strt, en)));
-			DEBUG_STAT_EN(fprintf(stderr, "DICT: Processed at %.3f MB/s\n",
+		return (1);
 			    get_mb_s(fromlen, strt, en)));
 			return (1);
 		}
 	}
 	DEBUG_STAT_EN(fprintf(stderr, "No DICT\n"));
 	return (-1);
--- a/pcompress.c
+++ b/pcompress.c
@ -211,7 +211,8 @@ preproc_compress(pc_ctx_t *pctx, compress_func_ptr cmp_func, void *src, uint64_t
 	int result;
 	uint64_t _dstlen, fromlen;
 	uchar_t *from, *to;
-	int stype, dict;
+	int stype, dict, analyzed;
 	analyzer_ctx_t actx;
 	DEBUG_STAT_EN(double strt, en);
 	_dstlen = *dstlen;
@ -221,6 +222,14 @@ preproc_compress(pc_ctx_t *pctx, compress_func_ptr cmp_func, void *src, uint64_t
 	result = 0;
 	stype = PC_SUBTYPE(btype);
 	dict = 0;
 	analyzed = 0;
 	if (btype == TYPE_UNKNOWN || stype == TYPE_ARCHIVE_TAR || stype == TYPE_PDF || interesting) {
 		analyze_buffer(src, srclen, &actx);
 		analyzed = 1;
 		if (pctx->adapt_mode)
 			adapt_set_analyzer_ctx(data, &actx);
 	}
 	/*
 	 * If Dispack is enabled it has to be done first since Dispack analyses the
@ -246,56 +255,78 @@ preproc_compress(pc_ctx_t *pctx, compress_func_ptr cmp_func, void *src, uint64_t
 	 * Enabling LZP also enables the DICT filter since we are dealing with text
 	 * in any case.
 	 */
-	if (pctx->lzp_preprocess && (PC_TYPE(btype) == TYPE_UNKNOWN ||
+	if (pctx->lzp_preprocess) {
-	    PC_TYPE(btype) == TYPE_TEXT || interesting)) {
+		int b_type;
-		void *dct = new_dict_context();
+
-		_dstlen = fromlen;
+		if (analyzed)
-		result = dict_encode(dct, from, fromlen, to, &_dstlen);
+			b_type = PC_TYPE(actx.one_pct.btype);
-		delete_dict_context(dct);
+		else
-		if (result != -1) {
+			b_type = PC_TYPE(analyze_buffer_simple(from, fromlen));
-			uchar_t *tmp;
+
-			tmp = from;
+		if (b_type == TYPE_TEXT) {
-			from = to;
+			void *dct = new_dict_context();
-			to = tmp;
+			_dstlen = fromlen;
-			fromlen = _dstlen;
+			result = dict_encode(dct, from, fromlen, to, &_dstlen);
-			type |= PREPROC_TYPE_DICT;
+			delete_dict_context(dct);
-			dict = result;
+			if (result != -1) {
 				uchar_t *tmp;
 				tmp = from;
 				from = to;
 				to = tmp;
 				fromlen = _dstlen;
 				type |= PREPROC_TYPE_DICT;
 				dict = result;
 			}
 		}
 	}
 #ifndef _MPLV2_LICENSE_
-	if (pctx->lzp_preprocess && stype != TYPE_BMP && stype != TYPE_TIFF &&
+	if (pctx->lzp_preprocess && stype != TYPE_BMP && stype != TYPE_TIFF) {
-	    PC_TYPE(btype) != TYPE_BINARY) {
+		int hashsize, b_type;
 		int hashsize;
 		int64_t result;
-		hashsize = lzp_hash_size(level);
+		b_type = btype;
-		result = lzp_compress((const uchar_t *)from, to, fromlen,
+		if (analyzed)
 			b_type = actx.forty_pct.btype;
 		if (PC_TYPE(b_type) != TYPE_BINARY) {
 			hashsize = lzp_hash_size(level);
 			result = lzp_compress((const uchar_t *)from, to, fromlen,
 					      hashsize, LZP_DEFAULT_LZPMINLEN, 0);
-		if (result >= 0 && result < srclen) {
+			if (result >= 0 && result < srclen) {
-			uchar_t *tmp;
+				uchar_t *tmp;
-			tmp = from;
+				tmp = from;
-			from = to;
+				from = to;
-			to = tmp;
+				to = tmp;
-			fromlen = result;
+				fromlen = result;
-			type |= PREPROC_TYPE_LZP;
+				type |= PREPROC_TYPE_LZP;
 			}
 		}
 	}
 #endif
 	if (pctx->enable_delta2_encode && props->delta2_span > 0 &&
 	    stype != TYPE_DNA_SEQ && stype != TYPE_BMP &&
-	    stype != TYPE_TIFF && stype != TYPE_MP4 && PC_TYPE(btype) != TYPE_TEXT) {
+	    stype != TYPE_TIFF && stype != TYPE_MP4) {
-		_dstlen = fromlen;
+		int b_type;
-		result = delta2_encode((uchar_t *)from, fromlen, to,
+
-				       &_dstlen, props->delta2_span, pctx->delta2_nstrides);
+		b_type = btype;
-		if (result != -1) {
+		if (analyzed)
-			uchar_t *tmp;
+			b_type = actx.one_pct.btype;
-			tmp = from;
+
-			from = to;
+		if (PC_TYPE(b_type) != TYPE_TEXT) {
-			to = tmp;
+			_dstlen = fromlen;
-			fromlen = _dstlen;
+			result = delta2_encode((uchar_t *)from, fromlen, to,
-			type |= PREPROC_TYPE_DELTA2;
+					       &_dstlen, props->delta2_span,
 					       pctx->delta2_nstrides);
 			if (result != -1) {
 				uchar_t *tmp;
 				tmp = from;
 				from = to;
 				to = tmp;
 				fromlen = _dstlen;
 				type |= PREPROC_TYPE_DELTA2;
 			}
 		}
 	}
--- a/pcompress.h
+++ b/pcompress.h
@ -36,7 +36,8 @@ extern "C" {
 #include <rabin_dedup.h>
 #include <crypto_utils.h>
-#include "meta_stream.h"
+#include <filters/analyzer/analyzer.h>
 #include <meta_stream.h>
 #define	CHUNK_FLAG_SZ	1
 #define	ALGO_SZ		8
@ -152,6 +153,7 @@ extern int lz4_init(void **data, int *level, int nthreads, uint64_t chunksize,
 		    int file_version, compress_op_t op);
 extern int none_init(void **data, int *level, int nthreads, uint64_t chunksize,
 		     int file_version, compress_op_t op);
 extern void adapt_set_analyzer_ctx(void *data, analyzer_ctx_t *actx);
 extern void lzma_props(algo_props_t *data, int level, uint64_t chunksize);
 extern void lzma_mt_props(algo_props_t *data, int level, uint64_t chunksize);