Centralise the data analysis routine so that one pass over the buffer is reused for better performance.

Utilise buffer data analysis for preprocessing filters.
This commit is contained in:
Moinak Ghosh 2014-11-06 22:23:33 +05:30
parent 848010fbb5
commit 507e7c75d3
6 changed files with 195 additions and 125 deletions

View file

@ -38,6 +38,7 @@
#include <pcompress.h> #include <pcompress.h>
#include <allocator.h> #include <allocator.h>
#include <pc_archive.h> #include <pc_archive.h>
#include "filters/analyzer/analyzer.h"
#define FIFTY_PCT(x) (((x)/10) * 5) #define FIFTY_PCT(x) (((x)/10) * 5)
#define FORTY_PCT(x) (((x)/10) * 4) #define FORTY_PCT(x) (((x)/10) * 4)
@ -97,8 +98,16 @@ struct adapt_data {
void *bsc_data; void *bsc_data;
void *lz4_data; void *lz4_data;
int adapt_mode; int adapt_mode;
analyzer_ctx_t *actx;
}; };
/*
 * Attach a precomputed analyzer context to the adaptive compressor state.
 *
 * data - opaque handle; it is treated as a struct adapt_data pointer.
 * actx - analysis result to store in the state's actx field. Only the
 *        pointer is stored, nothing is copied.
 *        NOTE(review): the pointed-to context presumably has to stay valid
 *        until the compressor consumes it - confirm against the callers.
 */
void
adapt_set_analyzer_ctx(void *data, analyzer_ctx_t *actx)
{
	((struct adapt_data *)data)->actx = actx;
}
void void
adapt_stats(int show) adapt_stats(int show)
{ {
@ -246,76 +255,28 @@ adapt_compress(void *src, uint64_t srclen, void *dst,
uint64_t *dstlen, int level, uchar_t chdr, int btype, void *data) uint64_t *dstlen, int level, uchar_t chdr, int btype, void *data)
{ {
struct adapt_data *adat = (struct adapt_data *)(data); struct adapt_data *adat = (struct adapt_data *)(data);
uchar_t *src1 = (uchar_t *)src;
int rv = 0, bsc_type = 0; int rv = 0, bsc_type = 0;
int stype = PC_SUBTYPE(btype); int stype = PC_SUBTYPE(btype);
analyzer_ctx_t actx;
if (btype == TYPE_UNKNOWN || stype == TYPE_ARCHIVE_TAR) { if (btype == TYPE_UNKNOWN || stype == TYPE_ARCHIVE_TAR || stype == TYPE_PDF) {
uint64_t i, tot8b, tag1, tag2, tag3, lbytes; if (adat->actx == NULL) {
double tagcnt, pct_tag; analyze_buffer(src, srclen, &actx);
uchar_t cur_byte, prev_byte; adat->actx = &actx;
/*
* Count number of 8-bit binary bytes and XML tags in source.
*/
tot8b = 0;
tag1 = 0;
tag2 = 0;
tag3 = 0;
lbytes = 0;
prev_byte = cur_byte = 0;
for (i = 0; i < srclen; i++) {
cur_byte = src1[i];
tot8b += (cur_byte & 0x80); // This way for possible auto-vectorization
lbytes += (cur_byte < 32);
tag1 += (cur_byte == '<');
tag2 += (cur_byte == '>');
tag3 += ((prev_byte == '<') & (cur_byte == '/'));
tag3 += ((prev_byte == '/') & (cur_byte == '>'));
if (cur_byte != ' ')
prev_byte = cur_byte;
} }
if (adat->adapt_mode == 2) {
btype = adat->actx->forty_pct.btype;
/* } else if (adat->adapt_mode == 1) {
* Heuristics for detecting BINARY vs generic TEXT vs XML data. btype = adat->actx->fifty_pct.btype;
*/
tot8b = tot8b / 0x80 + lbytes;
tagcnt = tag1 + tag2 + tag3;
pct_tag = tagcnt / (double)srclen;
if (adat->adapt_mode == 2 && tot8b > FORTY_PCT(srclen)) {
btype = TYPE_BINARY;
} else if (adat->adapt_mode == 1 && tot8b > FIFTY_PCT(srclen)) {
btype = TYPE_BINARY;
} else {
btype = TYPE_TEXT;
if (tag1 > tag2 - 4 && tag1 < tag2 + 4 && tag3 > (double)tag1 * 0.40 &&
tagcnt > (double)srclen * 0.001)
btype |= TYPE_MARKUP;
}
} else if (stype == TYPE_PDF) {
uint64_t i, tot8b;
uchar_t cur_byte;
/*
* For PDF files we need to check for uncompressed PDFs. Those are compressed
* using Libbsc.
*/
tot8b = 0;
for (i = 0; i < srclen; i++) {
cur_byte = src1[i];
tot8b += (cur_byte & 0x80);
}
tot8b /= 0x80;
if (adat->adapt_mode == 2 && tot8b > FORTY_PCT(srclen)) {
btype = TYPE_BINARY;
} else if (adat->adapt_mode == 1 && tot8b > FIFTY_PCT(srclen)) {
btype = TYPE_BINARY;
} else {
btype = TYPE_TEXT|TYPE_MARKUP;
} }
if (stype == TYPE_PDF)
btype |= TYPE_MARKUP;
} }
/* Reset analyzer context for subsequent calls. */
adat->actx = NULL;
/* /*
* Use PPMd if some percentage of source is 7-bit textual bytes, otherwise * Use PPMd if some percentage of source is 7-bit textual bytes, otherwise
* use Bzip2 or LZMA. For totally incompressible data we always use LZ4. There * use Bzip2 or LZMA. For totally incompressible data we always use LZ4. There

View file

@ -23,15 +23,89 @@
*/ */
#include "utils.h" #include "utils.h"
#include "analyzer.h"
#define FIFTY_PCT(x) (((x)/10) * 5)
#define FORTY_PCT(x) (((x)/10) * 4)
#define ONE_PCT(x) ((x)/100)
/*
 * Scan a buffer once and classify it as binary or text at several
 * significance levels, additionally flagging text that looks like markup.
 *
 * src    - buffer to analyse; it is only read.
 * srclen - number of bytes in src.
 * actx   - output; every btype field is written on each call:
 *            forty_pct / fifty_pct: TYPE_BINARY when the count of bytes with
 *              the high bit set plus bytes below 32 exceeds 40% / 50% of
 *              srclen, otherwise TYPE_TEXT.
 *            one_pct: TYPE_TEXT when no byte has the high bit set and the
 *              count of bytes below 32 is under roughly 7/8 of srclen,
 *              otherwise TYPE_UNKNOWN.
 *          TYPE_MARKUP is OR-ed into each field that is TYPE_TEXT when the
 *          tag heuristic matches.
 */
void
analyze_buffer(void *src, uint64_t srclen, analyzer_ctx_t *actx)
{
	uchar_t *src1 = (uchar_t *)src;
	uint64_t i, tot8b, tot_8b, lbytes;
	uchar_t cur_byte, prev_byte;
	uint64_t tag1, tag2, tag3;
	double tagcnt;
	int markup;

	/*
	 * Count number of 8-bit binary bytes and XML tags in source.
	 */
	tot8b = 0;
	tag1 = 0;
	tag2 = 0;
	tag3 = 0;
	lbytes = 0;
	prev_byte = cur_byte = 0;
	for (i = 0; i < srclen; i++) {
		cur_byte = src1[i];
		tot8b += (cur_byte & 0x80); // This way for possible auto-vectorization
		lbytes += (cur_byte < 32);
		tag1 += (cur_byte == '<');
		tag2 += (cur_byte == '>');
		tag3 += ((prev_byte == '<') & (cur_byte == '/'));
		tag3 += ((prev_byte == '/') & (cur_byte == '>'));
		/* Spaces are skipped so that "< /" and "/ >" still count as tags. */
		if (cur_byte != ' ')
			prev_byte = cur_byte;
	}

	/*
	 * Heuristics for detecting BINARY vs generic TEXT vs XML data at various
	 * significance levels.
	 */
	tot8b /= 0x80;			/* each high-bit byte added 0x80 above */
	tot_8b = tot8b + lbytes;
	tagcnt = tag1 + tag2 + tag3;

	if (tot_8b > FORTY_PCT(srclen)) {
		actx->forty_pct.btype = TYPE_BINARY;
	} else {
		actx->forty_pct.btype = TYPE_TEXT;
	}

	if (tot_8b > FIFTY_PCT(srclen)) {
		actx->fifty_pct.btype = TYPE_BINARY;
	} else {
		actx->fifty_pct.btype = TYPE_TEXT;
	}

	/*
	 * Always assign one_pct: callers hand in an uninitialised context and
	 * read this field unconditionally. TYPE_UNKNOWN matches what
	 * analyze_buffer_simple() reports for buffers that are not plain text.
	 */
	if (tot8b == 0 && lbytes < ((srclen>>1) + (srclen>>2) + (srclen>>3))) {
		actx->one_pct.btype = TYPE_TEXT;
	} else {
		actx->one_pct.btype = TYPE_UNKNOWN;
	}

	/*
	 * Markup: '<' and '>' counts roughly balanced, a fair share of closing
	 * tags, and tags making up more than 0.1% of the buffer.
	 * NOTE(review): tag2 - 4 is unsigned and wraps when tag2 < 4, so buffers
	 * with fewer than four '>' never match; kept as-is to preserve the
	 * existing heuristic.
	 */
	markup = 0;
	if (tag1 > tag2 - 4 && tag1 < tag2 + 4 && tag3 > (double)tag1 * 0.40 &&
	    tagcnt > (double)srclen * 0.001)
		markup = 1;

	if (markup) {
		if (actx->forty_pct.btype == TYPE_TEXT)
			actx->forty_pct.btype |= TYPE_MARKUP;
		if (actx->fifty_pct.btype == TYPE_TEXT)
			actx->fifty_pct.btype |= TYPE_MARKUP;
		if (actx->one_pct.btype == TYPE_TEXT)
			actx->one_pct.btype |= TYPE_MARKUP;
	}
}
int int
analyze_buffer(void *src, uint64_t srclen) analyze_buffer_simple(void *src, uint64_t srclen)
{ {
uchar_t *src1 = (uchar_t *)src; uchar_t *src1 = (uchar_t *)src;
uint64_t i, tot8b, lbytes; uint64_t i, tot8b, lbytes;
uchar_t cur_byte; uchar_t cur_byte;
int btype = TYPE_UNKNOWN; int btype = TYPE_UNKNOWN;
/* /*
* Count number of 8-bit binary bytes in source * Count number of 8-bit binary bytes in source
*/ */
@ -42,7 +116,6 @@ analyze_buffer(void *src, uint64_t srclen)
tot8b += (cur_byte & 0x80); // This way for possible auto-vectorization tot8b += (cur_byte & 0x80); // This way for possible auto-vectorization
lbytes += (cur_byte < 32); lbytes += (cur_byte < 32);
} }
/* /*
* Heuristics for detecting BINARY vs generic TEXT * Heuristics for detecting BINARY vs generic TEXT
*/ */
@ -50,6 +123,6 @@ analyze_buffer(void *src, uint64_t srclen)
if (tot8b == 0 && lbytes < ((srclen>>1) + (srclen>>2) + (srclen>>3))) { if (tot8b == 0 && lbytes < ((srclen>>1) + (srclen>>2) + (srclen>>3))) {
btype = TYPE_TEXT; btype = TYPE_TEXT;
} }
return (btype); return (btype);
} }

View file

@ -29,7 +29,18 @@
extern "C" { extern "C" {
#endif #endif
extern int analyze_buffer(void *src, uint64_t srclen); struct significance_value {
int btype;
};
/*
 * Output of analyze_buffer(): one data-type classification per
 * significance level.
 */
typedef struct _analyzer_ctx {
/* Set to TYPE_TEXT when the buffer has no high-bit bytes and few control bytes. */
struct significance_value one_pct;
/* TYPE_BINARY when high-bit plus control bytes exceed 40% of the buffer, else TYPE_TEXT. */
struct significance_value forty_pct;
/* TYPE_BINARY when high-bit plus control bytes exceed 50% of the buffer, else TYPE_TEXT. */
struct significance_value fifty_pct;
} analyzer_ctx_t;
/*
 * Analyse srclen bytes at src in a single pass and store the per-level
 * binary/text/markup classification in *actx.
 */
void analyze_buffer(void *src, uint64_t srclen, analyzer_ctx_t *actx);
/*
 * Lighter check that only distinguishes plain text: the visible code returns
 * TYPE_TEXT for text-like buffers and starts from TYPE_UNKNOWN otherwise.
 */
int analyze_buffer_simple(void *src, uint64_t srclen);
#ifdef __cplusplus #ifdef __cplusplus
} }

View file

@ -36,10 +36,6 @@
#include "Common.h" #include "Common.h"
#include "utils.h" #include "utils.h"
extern "C" {
extern int analyze_buffer(void *src, uint64_t srclen);
}
class DictFilter class DictFilter
{ {
public: public:
@ -270,7 +266,6 @@ dict_encode(void *dict_ctx, uchar_t *from, uint64_t fromlen, uchar_t *to, uint64
DictFilter *df = static_cast<DictFilter *>(dict_ctx); DictFilter *df = static_cast<DictFilter *>(dict_ctx);
u32 fl; u32 fl;
u32 dl; u32 dl;
int atype;
uchar_t *dst; uchar_t *dst;
DEBUG_STAT_EN(double strt, en); DEBUG_STAT_EN(double strt, en);
@ -283,20 +278,17 @@ dict_encode(void *dict_ctx, uchar_t *from, uint64_t fromlen, uchar_t *to, uint64
fl = (u32)fromlen; fl = (u32)fromlen;
dl = (u32)(*dstlen); dl = (u32)(*dstlen);
DEBUG_STAT_EN(strt = get_wtime_millis()); DEBUG_STAT_EN(strt = get_wtime_millis());
atype = analyze_buffer(from, fromlen); U32_P(to) = LE32(fl);
if (PC_TYPE(atype) == TYPE_TEXT) { dst = to + 4;
U32_P(to) = LE32(fl); dl -= 4;
dst = to + 4; if (df->Forward_Dict(from, fl, dst, &dl)) {
dl -= 4; *dstlen = dl + 8;
if (df->Forward_Dict(from, fl, dst, &dl)) { DEBUG_STAT_EN(en = get_wtime_millis());
*dstlen = dl + 8; DEBUG_STAT_EN(fprintf(stderr, "DICT: fromlen: %" PRIu64 ", dstlen: %" PRIu64 "\n",
DEBUG_STAT_EN(en = get_wtime_millis()); fromlen, *dstlen));
DEBUG_STAT_EN(fprintf(stderr, "DICT: fromlen: %" PRIu64 ", dstlen: %" PRIu64 "\n", DEBUG_STAT_EN(fprintf(stderr, "DICT: Processed at %.3f MB/s\n",
fromlen, *dstlen)); get_mb_s(fromlen, strt, en)));
DEBUG_STAT_EN(fprintf(stderr, "DICT: Processed at %.3f MB/s\n", return (1);
get_mb_s(fromlen, strt, en)));
return (1);
}
} }
DEBUG_STAT_EN(fprintf(stderr, "No DICT\n")); DEBUG_STAT_EN(fprintf(stderr, "No DICT\n"));
return (-1); return (-1);

View file

@ -211,7 +211,8 @@ preproc_compress(pc_ctx_t *pctx, compress_func_ptr cmp_func, void *src, uint64_t
int result; int result;
uint64_t _dstlen, fromlen; uint64_t _dstlen, fromlen;
uchar_t *from, *to; uchar_t *from, *to;
int stype, dict; int stype, dict, analyzed;
analyzer_ctx_t actx;
DEBUG_STAT_EN(double strt, en); DEBUG_STAT_EN(double strt, en);
_dstlen = *dstlen; _dstlen = *dstlen;
@ -221,6 +222,14 @@ preproc_compress(pc_ctx_t *pctx, compress_func_ptr cmp_func, void *src, uint64_t
result = 0; result = 0;
stype = PC_SUBTYPE(btype); stype = PC_SUBTYPE(btype);
dict = 0; dict = 0;
analyzed = 0;
if (btype == TYPE_UNKNOWN || stype == TYPE_ARCHIVE_TAR || stype == TYPE_PDF || interesting) {
analyze_buffer(src, srclen, &actx);
analyzed = 1;
if (pctx->adapt_mode)
adapt_set_analyzer_ctx(data, &actx);
}
/* /*
* If Dispack is enabled it has to be done first since Dispack analyses the * If Dispack is enabled it has to be done first since Dispack analyses the
@ -246,56 +255,78 @@ preproc_compress(pc_ctx_t *pctx, compress_func_ptr cmp_func, void *src, uint64_t
* Enabling LZP also enables the DICT filter since we are dealing with text * Enabling LZP also enables the DICT filter since we are dealing with text
* in any case. * in any case.
*/ */
if (pctx->lzp_preprocess && (PC_TYPE(btype) == TYPE_UNKNOWN || if (pctx->lzp_preprocess) {
PC_TYPE(btype) == TYPE_TEXT || interesting)) { int b_type;
void *dct = new_dict_context();
_dstlen = fromlen; if (analyzed)
result = dict_encode(dct, from, fromlen, to, &_dstlen); b_type = PC_TYPE(actx.one_pct.btype);
delete_dict_context(dct); else
if (result != -1) { b_type = PC_TYPE(analyze_buffer_simple(from, fromlen));
uchar_t *tmp;
tmp = from; if (b_type == TYPE_TEXT) {
from = to; void *dct = new_dict_context();
to = tmp; _dstlen = fromlen;
fromlen = _dstlen; result = dict_encode(dct, from, fromlen, to, &_dstlen);
type |= PREPROC_TYPE_DICT; delete_dict_context(dct);
dict = result; if (result != -1) {
uchar_t *tmp;
tmp = from;
from = to;
to = tmp;
fromlen = _dstlen;
type |= PREPROC_TYPE_DICT;
dict = result;
}
} }
} }
#ifndef _MPLV2_LICENSE_ #ifndef _MPLV2_LICENSE_
if (pctx->lzp_preprocess && stype != TYPE_BMP && stype != TYPE_TIFF && if (pctx->lzp_preprocess && stype != TYPE_BMP && stype != TYPE_TIFF) {
PC_TYPE(btype) != TYPE_BINARY) { int hashsize, b_type;
int hashsize;
int64_t result; int64_t result;
hashsize = lzp_hash_size(level); b_type = btype;
result = lzp_compress((const uchar_t *)from, to, fromlen, if (analyzed)
b_type = actx.forty_pct.btype;
if (PC_TYPE(b_type) != TYPE_BINARY) {
hashsize = lzp_hash_size(level);
result = lzp_compress((const uchar_t *)from, to, fromlen,
hashsize, LZP_DEFAULT_LZPMINLEN, 0); hashsize, LZP_DEFAULT_LZPMINLEN, 0);
if (result >= 0 && result < srclen) { if (result >= 0 && result < srclen) {
uchar_t *tmp; uchar_t *tmp;
tmp = from; tmp = from;
from = to; from = to;
to = tmp; to = tmp;
fromlen = result; fromlen = result;
type |= PREPROC_TYPE_LZP; type |= PREPROC_TYPE_LZP;
}
} }
} }
#endif #endif
if (pctx->enable_delta2_encode && props->delta2_span > 0 && if (pctx->enable_delta2_encode && props->delta2_span > 0 &&
stype != TYPE_DNA_SEQ && stype != TYPE_BMP && stype != TYPE_DNA_SEQ && stype != TYPE_BMP &&
stype != TYPE_TIFF && stype != TYPE_MP4 && PC_TYPE(btype) != TYPE_TEXT) { stype != TYPE_TIFF && stype != TYPE_MP4) {
_dstlen = fromlen; int b_type;
result = delta2_encode((uchar_t *)from, fromlen, to,
&_dstlen, props->delta2_span, pctx->delta2_nstrides); b_type = btype;
if (result != -1) { if (analyzed)
uchar_t *tmp; b_type = actx.one_pct.btype;
tmp = from;
from = to; if (PC_TYPE(b_type) != TYPE_TEXT) {
to = tmp; _dstlen = fromlen;
fromlen = _dstlen; result = delta2_encode((uchar_t *)from, fromlen, to,
type |= PREPROC_TYPE_DELTA2; &_dstlen, props->delta2_span,
pctx->delta2_nstrides);
if (result != -1) {
uchar_t *tmp;
tmp = from;
from = to;
to = tmp;
fromlen = _dstlen;
type |= PREPROC_TYPE_DELTA2;
}
} }
} }

View file

@ -36,7 +36,8 @@ extern "C" {
#include <rabin_dedup.h> #include <rabin_dedup.h>
#include <crypto_utils.h> #include <crypto_utils.h>
#include "meta_stream.h" #include <filters/analyzer/analyzer.h>
#include <meta_stream.h>
#define CHUNK_FLAG_SZ 1 #define CHUNK_FLAG_SZ 1
#define ALGO_SZ 8 #define ALGO_SZ 8
@ -152,6 +153,7 @@ extern int lz4_init(void **data, int *level, int nthreads, uint64_t chunksize,
int file_version, compress_op_t op); int file_version, compress_op_t op);
extern int none_init(void **data, int *level, int nthreads, uint64_t chunksize, extern int none_init(void **data, int *level, int nthreads, uint64_t chunksize,
int file_version, compress_op_t op); int file_version, compress_op_t op);
extern void adapt_set_analyzer_ctx(void *data, analyzer_ctx_t *actx);
extern void lzma_props(algo_props_t *data, int level, uint64_t chunksize); extern void lzma_props(algo_props_t *data, int level, uint64_t chunksize);
extern void lzma_mt_props(algo_props_t *data, int level, uint64_t chunksize); extern void lzma_mt_props(algo_props_t *data, int level, uint64_t chunksize);