From 507e7c75d3be7e9f494130849daebd081e4dca43 Mon Sep 17 00:00:00 2001 From: Moinak Ghosh Date: Thu, 6 Nov 2014 22:23:33 +0530 Subject: [PATCH] Centralise data analysis routine for optimum performance and leverage. Utilise buffer data analysis for preprocessing filters. --- adaptive_compress.c | 85 ++++++++-------------------- filters/analyzer/analyzer.c | 81 +++++++++++++++++++++++++-- filters/analyzer/analyzer.h | 13 ++++- filters/dict/DictFilter.cpp | 30 ++++------ pcompress.c | 107 +++++++++++++++++++++++------------- pcompress.h | 4 +- 6 files changed, 195 insertions(+), 125 deletions(-) diff --git a/adaptive_compress.c b/adaptive_compress.c index bbce293..65a0475 100644 --- a/adaptive_compress.c +++ b/adaptive_compress.c @@ -38,6 +38,7 @@ #include #include #include +#include "filters/analyzer/analyzer.h" #define FIFTY_PCT(x) (((x)/10) * 5) #define FORTY_PCT(x) (((x)/10) * 4) @@ -97,8 +98,16 @@ struct adapt_data { void *bsc_data; void *lz4_data; int adapt_mode; + analyzer_ctx_t *actx; }; +void +adapt_set_analyzer_ctx(void *data, analyzer_ctx_t *actx) +{ + struct adapt_data *adat = (struct adapt_data *)data; + adat->actx = actx; +} + void adapt_stats(int show) { @@ -246,76 +255,28 @@ adapt_compress(void *src, uint64_t srclen, void *dst, uint64_t *dstlen, int level, uchar_t chdr, int btype, void *data) { struct adapt_data *adat = (struct adapt_data *)(data); - uchar_t *src1 = (uchar_t *)src; int rv = 0, bsc_type = 0; int stype = PC_SUBTYPE(btype); + analyzer_ctx_t actx; - if (btype == TYPE_UNKNOWN || stype == TYPE_ARCHIVE_TAR) { - uint64_t i, tot8b, tag1, tag2, tag3, lbytes; - double tagcnt, pct_tag; - uchar_t cur_byte, prev_byte; - /* - * Count number of 8-bit binary bytes and XML tags in source. - */ - tot8b = 0; - tag1 = 0; - tag2 = 0; - tag3 = 0; - lbytes = 0; - prev_byte = cur_byte = 0; - for (i = 0; i < srclen; i++) { - cur_byte = src1[i]; - tot8b += (cur_byte & 0x80); // This way for possible auto-vectorization - lbytes += (cur_byte < 32); - tag1 += (cur_byte == '<'); - tag2 += (cur_byte == '>'); - tag3 += ((prev_byte == '<') & (cur_byte == '/')); - tag3 += ((prev_byte == '/') & (cur_byte == '>')); - if (cur_byte != ' ') - prev_byte = cur_byte; + if (btype == TYPE_UNKNOWN || stype == TYPE_ARCHIVE_TAR || stype == TYPE_PDF) { + if (adat->actx == NULL) { + analyze_buffer(src, srclen, &actx); + adat->actx = &actx; } + if (adat->adapt_mode == 2) { + btype = adat->actx->forty_pct.btype; - /* - * Heuristics for detecting BINARY vs generic TEXT vs XML data. - */ - tot8b = tot8b / 0x80 + lbytes; - tagcnt = tag1 + tag2 + tag3; - pct_tag = tagcnt / (double)srclen; - if (adat->adapt_mode == 2 && tot8b > FORTY_PCT(srclen)) { - btype = TYPE_BINARY; - } else if (adat->adapt_mode == 1 && tot8b > FIFTY_PCT(srclen)) { - btype = TYPE_BINARY; - } else { - btype = TYPE_TEXT; - if (tag1 > tag2 - 4 && tag1 < tag2 + 4 && tag3 > (double)tag1 * 0.40 && - tagcnt > (double)srclen * 0.001) - btype |= TYPE_MARKUP; - } - - } else if (stype == TYPE_PDF) { - uint64_t i, tot8b; - uchar_t cur_byte; - - /* - * For PDF files we need to check for uncompressed PDFs. Those are compressed - * using Libbsc. - */ - tot8b = 0; - for (i = 0; i < srclen; i++) { - cur_byte = src1[i]; - tot8b += (cur_byte & 0x80); - } - - tot8b /= 0x80; - if (adat->adapt_mode == 2 && tot8b > FORTY_PCT(srclen)) { - btype = TYPE_BINARY; - } else if (adat->adapt_mode == 1 && tot8b > FIFTY_PCT(srclen)) { - btype = TYPE_BINARY; - } else { - btype = TYPE_TEXT|TYPE_MARKUP; + } else if (adat->adapt_mode == 1) { + btype = adat->actx->fifty_pct.btype; } + if (stype == TYPE_PDF) + btype |= TYPE_MARKUP; } + /* Reset analyzer context for subsequent calls. */ + adat->actx = NULL; + /* * Use PPMd if some percentage of source is 7-bit textual bytes, otherwise * use Bzip2 or LZMA. For totally incompressible data we always use LZ4. There diff --git a/filters/analyzer/analyzer.c b/filters/analyzer/analyzer.c index a52824a..54033fb 100644 --- a/filters/analyzer/analyzer.c +++ b/filters/analyzer/analyzer.c @@ -23,15 +23,89 @@ */ #include "utils.h" +#include "analyzer.h" + +#define FIFTY_PCT(x) (((x)/10) * 5) +#define FORTY_PCT(x) (((x)/10) * 4) +#define ONE_PCT(x) ((x)/100) + +void +analyze_buffer(void *src, uint64_t srclen, analyzer_ctx_t *actx) +{ + uchar_t *src1 = (uchar_t *)src; + uint64_t i, tot8b, tot_8b, lbytes; + uchar_t cur_byte, prev_byte; + uint64_t tag1, tag2, tag3; + double tagcnt, pct_tag; + int markup; + + /* + * Count number of 8-bit binary bytes and XML tags in source. + */ + tot8b = 0; + tag1 = 0; + tag2 = 0; + tag3 = 0; + lbytes = 0; + prev_byte = cur_byte = 0; + for (i = 0; i < srclen; i++) { + cur_byte = src1[i]; + tot8b += (cur_byte & 0x80); // This way for possible auto-vectorization + lbytes += (cur_byte < 32); + tag1 += (cur_byte == '<'); + tag2 += (cur_byte == '>'); + tag3 += ((prev_byte == '<') & (cur_byte == '/')); + tag3 += ((prev_byte == '/') & (cur_byte == '>')); + if (cur_byte != ' ') + prev_byte = cur_byte; + } + + /* + * Heuristics for detecting BINARY vs generic TEXT vs XML data at various + * significance levels. + */ + tot_8b = tot8b / 0x80 + lbytes; + tagcnt = tag1 + tag2 + tag3; + pct_tag = tagcnt / (double)srclen; + if (tot_8b > FORTY_PCT(srclen)) { + actx->forty_pct.btype = TYPE_BINARY; + } else { + actx->forty_pct.btype = TYPE_TEXT; + } + + if (tot_8b > FIFTY_PCT(srclen)) { + actx->fifty_pct.btype = TYPE_BINARY; + } else { + actx->fifty_pct.btype = TYPE_TEXT; + } + + tot8b /= 0x80; + if (tot8b == 0 && lbytes < ((srclen>>1) + (srclen>>2) + (srclen>>3))) { + actx->one_pct.btype = TYPE_TEXT; + } + + markup = 0; + if (tag1 > tag2 - 4 && tag1 < tag2 + 4 && tag3 > (double)tag1 * 0.40 && + tagcnt > (double)srclen * 0.001) + markup = 1; + + if (markup) { + if (actx->forty_pct.btype == TYPE_TEXT) + actx->forty_pct.btype |= TYPE_MARKUP; + if (actx->fifty_pct.btype == TYPE_TEXT) + actx->fifty_pct.btype |= TYPE_MARKUP; + if (actx->one_pct.btype == TYPE_TEXT) + actx->one_pct.btype |= TYPE_MARKUP; + } +} int -analyze_buffer(void *src, uint64_t srclen) +analyze_buffer_simple(void *src, uint64_t srclen) { uchar_t *src1 = (uchar_t *)src; uint64_t i, tot8b, lbytes; uchar_t cur_byte; int btype = TYPE_UNKNOWN; - /* * Count number of 8-bit binary bytes in source */ @@ -42,7 +116,6 @@ analyze_buffer(void *src, uint64_t srclen) tot8b += (cur_byte & 0x80); // This way for possible auto-vectorization lbytes += (cur_byte < 32); } - /* * Heuristics for detecting BINARY vs generic TEXT */ @@ -50,6 +123,6 @@ analyze_buffer(void *src, uint64_t srclen) if (tot8b == 0 && lbytes < ((srclen>>1) + (srclen>>2) + (srclen>>3))) { btype = TYPE_TEXT; } - return (btype); } + diff --git a/filters/analyzer/analyzer.h b/filters/analyzer/analyzer.h index 682111d..9eefd9c 100644 --- a/filters/analyzer/analyzer.h +++ b/filters/analyzer/analyzer.h @@ -29,7 +29,18 @@ extern "C" { #endif -extern int analyze_buffer(void *src, uint64_t srclen); +struct significance_value { + int btype; +}; + +typedef struct _analyzer_ctx { + struct significance_value one_pct; + struct significance_value forty_pct; + struct significance_value fifty_pct; +} analyzer_ctx_t; + +void analyze_buffer(void *src, uint64_t srclen, analyzer_ctx_t *actx); +int analyze_buffer_simple(void *src, uint64_t srclen); #ifdef __cplusplus } diff --git a/filters/dict/DictFilter.cpp b/filters/dict/DictFilter.cpp index 299b661..b2636b4 100644 --- a/filters/dict/DictFilter.cpp +++ b/filters/dict/DictFilter.cpp @@ -36,10 +36,6 @@ #include "Common.h" #include "utils.h" -extern "C" { -extern int analyze_buffer(void *src, uint64_t srclen); -} - class DictFilter { public: @@ -270,7 +266,6 @@ dict_encode(void *dict_ctx, uchar_t *from, uint64_t fromlen, uchar_t *to, uint64 DictFilter *df = static_cast(dict_ctx); u32 fl; u32 dl; - int atype; uchar_t *dst; DEBUG_STAT_EN(double strt, en); @@ -283,20 +278,17 @@ dict_encode(void *dict_ctx, uchar_t *from, uint64_t fromlen, uchar_t *to, uint64 fl = (u32)fromlen; dl = (u32)(*dstlen); DEBUG_STAT_EN(strt = get_wtime_millis()); - atype = analyze_buffer(from, fromlen); - if (PC_TYPE(atype) == TYPE_TEXT) { - U32_P(to) = LE32(fl); - dst = to + 4; - dl -= 4; - if (df->Forward_Dict(from, fl, dst, &dl)) { - *dstlen = dl + 8; - DEBUG_STAT_EN(en = get_wtime_millis()); - DEBUG_STAT_EN(fprintf(stderr, "DICT: fromlen: %" PRIu64 ", dstlen: %" PRIu64 "\n", - fromlen, *dstlen)); - DEBUG_STAT_EN(fprintf(stderr, "DICT: Processed at %.3f MB/s\n", - get_mb_s(fromlen, strt, en))); - return (1); - } + U32_P(to) = LE32(fl); + dst = to + 4; + dl -= 4; + if (df->Forward_Dict(from, fl, dst, &dl)) { + *dstlen = dl + 8; + DEBUG_STAT_EN(en = get_wtime_millis()); + DEBUG_STAT_EN(fprintf(stderr, "DICT: fromlen: %" PRIu64 ", dstlen: %" PRIu64 "\n", + fromlen, *dstlen)); + DEBUG_STAT_EN(fprintf(stderr, "DICT: Processed at %.3f MB/s\n", + get_mb_s(fromlen, strt, en))); + return (1); } DEBUG_STAT_EN(fprintf(stderr, "No DICT\n")); return (-1); diff --git a/pcompress.c b/pcompress.c index 6a687f0..2f6eb03 100644 --- a/pcompress.c +++ b/pcompress.c @@ -211,7 +211,8 @@ preproc_compress(pc_ctx_t *pctx, compress_func_ptr cmp_func, void *src, uint64_t int result; uint64_t _dstlen, fromlen; uchar_t *from, *to; - int stype, dict; + int stype, dict, analyzed; + analyzer_ctx_t actx; DEBUG_STAT_EN(double strt, en); _dstlen = *dstlen; @@ -221,6 +222,14 @@ preproc_compress(pc_ctx_t *pctx, compress_func_ptr cmp_func, void *src, uint64_t result = 0; stype = PC_SUBTYPE(btype); dict = 0; + analyzed = 0; + + if (btype == TYPE_UNKNOWN || stype == TYPE_ARCHIVE_TAR || stype == TYPE_PDF || interesting) { + analyze_buffer(src, srclen, &actx); + analyzed = 1; + if (pctx->adapt_mode) + adapt_set_analyzer_ctx(data, &actx); + } /* * If Dispack is enabled it has to be done first since Dispack analyses the @@ -246,56 +255,78 @@ preproc_compress(pc_ctx_t *pctx, compress_func_ptr cmp_func, void *src, uint64_t * Enabling LZP also enables the DICT filter since we are dealing with text * in any case. */ - if (pctx->lzp_preprocess && (PC_TYPE(btype) == TYPE_UNKNOWN || - PC_TYPE(btype) == TYPE_TEXT || interesting)) { - void *dct = new_dict_context(); - _dstlen = fromlen; - result = dict_encode(dct, from, fromlen, to, &_dstlen); - delete_dict_context(dct); - if (result != -1) { - uchar_t *tmp; - tmp = from; - from = to; - to = tmp; - fromlen = _dstlen; - type |= PREPROC_TYPE_DICT; - dict = result; + if (pctx->lzp_preprocess) { + int b_type; + + if (analyzed) + b_type = PC_TYPE(actx.one_pct.btype); + else + b_type = PC_TYPE(analyze_buffer_simple(from, fromlen)); + + if (b_type == TYPE_TEXT) { + void *dct = new_dict_context(); + _dstlen = fromlen; + result = dict_encode(dct, from, fromlen, to, &_dstlen); + delete_dict_context(dct); + if (result != -1) { + uchar_t *tmp; + tmp = from; + from = to; + to = tmp; + fromlen = _dstlen; + type |= PREPROC_TYPE_DICT; + dict = result; + } } } #ifndef _MPLV2_LICENSE_ - if (pctx->lzp_preprocess && stype != TYPE_BMP && stype != TYPE_TIFF && - PC_TYPE(btype) != TYPE_BINARY) { - int hashsize; + if (pctx->lzp_preprocess && stype != TYPE_BMP && stype != TYPE_TIFF) { + int hashsize, b_type; int64_t result; - hashsize = lzp_hash_size(level); - result = lzp_compress((const uchar_t *)from, to, fromlen, + b_type = btype; + if (analyzed) + b_type = actx.forty_pct.btype; + + if (PC_TYPE(b_type) != TYPE_BINARY) { + hashsize = lzp_hash_size(level); + result = lzp_compress((const uchar_t *)from, to, fromlen, hashsize, LZP_DEFAULT_LZPMINLEN, 0); - if (result >= 0 && result < srclen) { - uchar_t *tmp; - tmp = from; - from = to; - to = tmp; - fromlen = result; - type |= PREPROC_TYPE_LZP; + if (result >= 0 && result < srclen) { + uchar_t *tmp; + tmp = from; + from = to; + to = tmp; + fromlen = result; + type |= PREPROC_TYPE_LZP; + } } } #endif if (pctx->enable_delta2_encode && props->delta2_span > 0 && stype != TYPE_DNA_SEQ && stype != TYPE_BMP && - stype != TYPE_TIFF && stype != TYPE_MP4 && PC_TYPE(btype) != TYPE_TEXT) { - _dstlen = fromlen; - result = delta2_encode((uchar_t *)from, fromlen, to, - &_dstlen, props->delta2_span, pctx->delta2_nstrides); - if (result != -1) { - uchar_t *tmp; - tmp = from; - from = to; - to = tmp; - fromlen = _dstlen; - type |= PREPROC_TYPE_DELTA2; + stype != TYPE_TIFF && stype != TYPE_MP4) { + int b_type; + + b_type = btype; + if (analyzed) + b_type = actx.one_pct.btype; + + if (PC_TYPE(b_type) != TYPE_TEXT) { + _dstlen = fromlen; + result = delta2_encode((uchar_t *)from, fromlen, to, + &_dstlen, props->delta2_span, + pctx->delta2_nstrides); + if (result != -1) { + uchar_t *tmp; + tmp = from; + from = to; + to = tmp; + fromlen = _dstlen; + type |= PREPROC_TYPE_DELTA2; + } } } diff --git a/pcompress.h b/pcompress.h index 6c5eb4f..eeaec90 100644 --- a/pcompress.h +++ b/pcompress.h @@ -36,7 +36,8 @@ extern "C" { #include #include -#include "meta_stream.h" +#include +#include #define CHUNK_FLAG_SZ 1 #define ALGO_SZ 8 @@ -152,6 +153,7 @@ extern int lz4_init(void **data, int *level, int nthreads, uint64_t chunksize, int file_version, compress_op_t op); extern int none_init(void **data, int *level, int nthreads, uint64_t chunksize, int file_version, compress_op_t op); +extern void adapt_set_analyzer_ctx(void *data, analyzer_ctx_t *actx); extern void lzma_props(algo_props_t *data, int level, uint64_t chunksize); extern void lzma_mt_props(algo_props_t *data, int level, uint64_t chunksize);