Centralise data analysis routine for optimum performance and leverage.

Utilise buffer data analysis for preprocessing filters.
This commit is contained in:
Moinak Ghosh 2014-11-06 22:23:33 +05:30
parent 848010fbb5
commit 507e7c75d3
6 changed files with 195 additions and 125 deletions

View file

@ -38,6 +38,7 @@
#include <pcompress.h>
#include <allocator.h>
#include <pc_archive.h>
#include "filters/analyzer/analyzer.h"
#define FIFTY_PCT(x) (((x)/10) * 5)
#define FORTY_PCT(x) (((x)/10) * 4)
@ -97,8 +98,16 @@ struct adapt_data {
void *bsc_data;
void *lz4_data;
int adapt_mode;
analyzer_ctx_t *actx;
};
/*
 * Hand a pre-computed buffer analysis to the adaptive compressor.
 *
 * data - opaque handle; it is treated as a struct adapt_data pointer.
 * actx - analysis result to attach. Only the pointer is stored, no copy is
 *        made, so the object must stay valid for as long as it is attached.
 */
void
adapt_set_analyzer_ctx(void *data, analyzer_ctx_t *actx)
{
	((struct adapt_data *)data)->actx = actx;
}
void
adapt_stats(int show)
{
@ -246,76 +255,28 @@ adapt_compress(void *src, uint64_t srclen, void *dst,
uint64_t *dstlen, int level, uchar_t chdr, int btype, void *data)
{
struct adapt_data *adat = (struct adapt_data *)(data);
uchar_t *src1 = (uchar_t *)src;
int rv = 0, bsc_type = 0;
int stype = PC_SUBTYPE(btype);
analyzer_ctx_t actx;
if (btype == TYPE_UNKNOWN || stype == TYPE_ARCHIVE_TAR) {
uint64_t i, tot8b, tag1, tag2, tag3, lbytes;
double tagcnt, pct_tag;
uchar_t cur_byte, prev_byte;
/*
* Count number of 8-bit binary bytes and XML tags in source.
*/
tot8b = 0;
tag1 = 0;
tag2 = 0;
tag3 = 0;
lbytes = 0;
prev_byte = cur_byte = 0;
for (i = 0; i < srclen; i++) {
cur_byte = src1[i];
tot8b += (cur_byte & 0x80); // This way for possible auto-vectorization
lbytes += (cur_byte < 32);
tag1 += (cur_byte == '<');
tag2 += (cur_byte == '>');
tag3 += ((prev_byte == '<') & (cur_byte == '/'));
tag3 += ((prev_byte == '/') & (cur_byte == '>'));
if (cur_byte != ' ')
prev_byte = cur_byte;
if (btype == TYPE_UNKNOWN || stype == TYPE_ARCHIVE_TAR || stype == TYPE_PDF) {
if (adat->actx == NULL) {
analyze_buffer(src, srclen, &actx);
adat->actx = &actx;
}
if (adat->adapt_mode == 2) {
btype = adat->actx->forty_pct.btype;
/*
* Heuristics for detecting BINARY vs generic TEXT vs XML data.
*/
tot8b = tot8b / 0x80 + lbytes;
tagcnt = tag1 + tag2 + tag3;
pct_tag = tagcnt / (double)srclen;
if (adat->adapt_mode == 2 && tot8b > FORTY_PCT(srclen)) {
btype = TYPE_BINARY;
} else if (adat->adapt_mode == 1 && tot8b > FIFTY_PCT(srclen)) {
btype = TYPE_BINARY;
} else {
btype = TYPE_TEXT;
if (tag1 > tag2 - 4 && tag1 < tag2 + 4 && tag3 > (double)tag1 * 0.40 &&
tagcnt > (double)srclen * 0.001)
btype |= TYPE_MARKUP;
}
} else if (stype == TYPE_PDF) {
uint64_t i, tot8b;
uchar_t cur_byte;
/*
* For PDF files we need to check for uncompressed PDFs. Those are compressed
* using Libbsc.
*/
tot8b = 0;
for (i = 0; i < srclen; i++) {
cur_byte = src1[i];
tot8b += (cur_byte & 0x80);
}
tot8b /= 0x80;
if (adat->adapt_mode == 2 && tot8b > FORTY_PCT(srclen)) {
btype = TYPE_BINARY;
} else if (adat->adapt_mode == 1 && tot8b > FIFTY_PCT(srclen)) {
btype = TYPE_BINARY;
} else {
btype = TYPE_TEXT|TYPE_MARKUP;
} else if (adat->adapt_mode == 1) {
btype = adat->actx->fifty_pct.btype;
}
if (stype == TYPE_PDF)
btype |= TYPE_MARKUP;
}
/* Reset analyzer context for subsequent calls. */
adat->actx = NULL;
/*
* Use PPMd if some percentage of source is 7-bit textual bytes, otherwise
* use Bzip2 or LZMA. For totally incompressible data we always use LZ4. There

View file

@ -23,15 +23,89 @@
*/
#include "utils.h"
#include "analyzer.h"
/*
 * Integer percentage helpers. Each one divides first, so the result is
 * truncated: FIFTY_PCT and FORTY_PCT work in whole tenths of x, and
 * ONE_PCT in whole hundredths of x.
 */
#define FIFTY_PCT(x) (((x)/10) * 5)
#define FORTY_PCT(x) (((x)/10) * 4)
#define ONE_PCT(x) ((x)/100)
/*
 * Scan a buffer once and record, in *actx, a data-type verdict for each
 * significance level:
 *
 *   forty_pct / fifty_pct: TYPE_BINARY when the number of bytes with the
 *       high bit set plus the number of bytes below 32 exceeds
 *       FORTY_PCT(srclen) / FIFTY_PCT(srclen); TYPE_TEXT otherwise.
 *   one_pct: TYPE_TEXT only when no byte has the high bit set and fewer
 *       than roughly 87.5% of the bytes are below 32; TYPE_UNKNOWN
 *       otherwise.
 *
 * TYPE_MARKUP is ORed into every verdict that is exactly TYPE_TEXT when the
 * counts of '<', '>', "</" and "/>" look like XML-style markup.
 *
 * src    - buffer to scan; it is only read, byte by byte.
 * srclen - number of bytes in src.
 * actx   - output. All three btype fields are written on every call, so the
 *          caller may pass an uninitialised context.
 */
void
analyze_buffer(void *src, uint64_t srclen, analyzer_ctx_t *actx)
{
	uchar_t *src1 = (uchar_t *)src;
	uint64_t i, tot8b, tot_8b, lbytes;
	uchar_t cur_byte, prev_byte;
	uint64_t tag1, tag2, tag3;
	double tagcnt;
	int markup;

	/*
	 * Count number of 8-bit binary bytes and XML tags in source.
	 */
	tot8b = 0;
	tag1 = 0;
	tag2 = 0;
	tag3 = 0;
	lbytes = 0;
	prev_byte = cur_byte = 0;
	for (i = 0; i < srclen; i++) {
		cur_byte = src1[i];
		tot8b += (cur_byte & 0x80); // This way for possible auto-vectorization
		lbytes += (cur_byte < 32);
		tag1 += (cur_byte == '<');
		tag2 += (cur_byte == '>');
		tag3 += ((prev_byte == '<') & (cur_byte == '/'));
		tag3 += ((prev_byte == '/') & (cur_byte == '>'));
		/* Spaces are skipped so that "< /" and "/ >" still count as tags. */
		if (cur_byte != ' ')
			prev_byte = cur_byte;
	}

	/*
	 * Heuristics for detecting BINARY vs generic TEXT vs XML data at various
	 * significance levels.
	 */

	/* tot8b accumulated 0x80 per high-bit byte; scale it back to a count. */
	tot_8b = tot8b / 0x80 + lbytes;
	tagcnt = tag1 + tag2 + tag3;

	if (tot_8b > FORTY_PCT(srclen)) {
		actx->forty_pct.btype = TYPE_BINARY;
	} else {
		actx->forty_pct.btype = TYPE_TEXT;
	}

	if (tot_8b > FIFTY_PCT(srclen)) {
		actx->fifty_pct.btype = TYPE_BINARY;
	} else {
		actx->fifty_pct.btype = TYPE_TEXT;
	}

	/*
	 * Always give one_pct a defined value. Callers pass a context that lives
	 * on their stack and read this field unconditionally, so leaving it
	 * untouched in the non-text case would expose an indeterminate value.
	 * TYPE_UNKNOWN matches what analyze_buffer_simple() reports in that case.
	 */
	actx->one_pct.btype = TYPE_UNKNOWN;
	tot8b /= 0x80;
	if (tot8b == 0 && lbytes < ((srclen>>1) + (srclen>>2) + (srclen>>3))) {
		actx->one_pct.btype = TYPE_TEXT;
	}

	/*
	 * Markup test: '<' and '>' counts within 4 of each other, closing-tag
	 * sequences above 40% of the '<' count, and total tag characters above
	 * 0.1% of the buffer.
	 * NOTE(review): tag2 - 4 is unsigned and wraps when tag2 < 4, which makes
	 * the first comparison false for such buffers; kept as-is to preserve the
	 * existing heuristic - confirm whether that is intended.
	 */
	markup = 0;
	if (tag1 > tag2 - 4 && tag1 < tag2 + 4 && tag3 > (double)tag1 * 0.40 &&
	    tagcnt > (double)srclen * 0.001)
		markup = 1;

	if (markup) {
		if (actx->forty_pct.btype == TYPE_TEXT)
			actx->forty_pct.btype |= TYPE_MARKUP;
		if (actx->fifty_pct.btype == TYPE_TEXT)
			actx->fifty_pct.btype |= TYPE_MARKUP;
		if (actx->one_pct.btype == TYPE_TEXT)
			actx->one_pct.btype |= TYPE_MARKUP;
	}
}
int
analyze_buffer(void *src, uint64_t srclen)
analyze_buffer_simple(void *src, uint64_t srclen)
{
uchar_t *src1 = (uchar_t *)src;
uint64_t i, tot8b, lbytes;
uchar_t cur_byte;
int btype = TYPE_UNKNOWN;
/*
* Count number of 8-bit binary bytes in source
*/
@ -42,7 +116,6 @@ analyze_buffer(void *src, uint64_t srclen)
tot8b += (cur_byte & 0x80); // This way for possible auto-vectorization
lbytes += (cur_byte < 32);
}
/*
* Heuristics for detecting BINARY vs generic TEXT
*/
@ -50,6 +123,6 @@ analyze_buffer(void *src, uint64_t srclen)
if (tot8b == 0 && lbytes < ((srclen>>1) + (srclen>>2) + (srclen>>3))) {
btype = TYPE_TEXT;
}
return (btype);
}

View file

@ -29,7 +29,18 @@
extern "C" {
#endif
extern int analyze_buffer(void *src, uint64_t srclen);
/*
 * Verdict for one significance level. btype holds a data-type value built
 * from the TYPE_* constants (e.g. TYPE_BINARY, or TYPE_TEXT optionally ORed
 * with TYPE_MARKUP).
 */
struct significance_value {
int btype;
};
/*
 * Result of a single analyze_buffer() pass, one verdict per threshold:
 *   one_pct   - TYPE_TEXT when the buffer has no bytes with the high bit set
 *               and fewer than roughly 87.5% bytes below 32.
 *   forty_pct - TYPE_BINARY when high-bit bytes plus bytes below 32 exceed
 *               40% of the buffer, TYPE_TEXT otherwise.
 *   fifty_pct - same test as forty_pct with a 50% threshold.
 * A verdict that is exactly TYPE_TEXT may additionally carry TYPE_MARKUP.
 */
typedef struct _analyzer_ctx {
struct significance_value one_pct;
struct significance_value forty_pct;
struct significance_value fifty_pct;
} analyzer_ctx_t;
/*
 * Scan srclen bytes at src and fill *actx with the per-threshold verdicts
 * described above.
 */
void analyze_buffer(void *src, uint64_t srclen, analyzer_ctx_t *actx);
/*
 * Lightweight variant that only distinguishes text: returns TYPE_TEXT when
 * its text heuristic matches, and otherwise the TYPE_UNKNOWN it starts from.
 */
int analyze_buffer_simple(void *src, uint64_t srclen);
#ifdef __cplusplus
}

View file

@ -36,10 +36,6 @@
#include "Common.h"
#include "utils.h"
extern "C" {
extern int analyze_buffer(void *src, uint64_t srclen);
}
class DictFilter
{
public:
@ -270,7 +266,6 @@ dict_encode(void *dict_ctx, uchar_t *from, uint64_t fromlen, uchar_t *to, uint64
DictFilter *df = static_cast<DictFilter *>(dict_ctx);
u32 fl;
u32 dl;
int atype;
uchar_t *dst;
DEBUG_STAT_EN(double strt, en);
@ -283,20 +278,17 @@ dict_encode(void *dict_ctx, uchar_t *from, uint64_t fromlen, uchar_t *to, uint64
fl = (u32)fromlen;
dl = (u32)(*dstlen);
DEBUG_STAT_EN(strt = get_wtime_millis());
atype = analyze_buffer(from, fromlen);
if (PC_TYPE(atype) == TYPE_TEXT) {
U32_P(to) = LE32(fl);
dst = to + 4;
dl -= 4;
if (df->Forward_Dict(from, fl, dst, &dl)) {
*dstlen = dl + 8;
DEBUG_STAT_EN(en = get_wtime_millis());
DEBUG_STAT_EN(fprintf(stderr, "DICT: fromlen: %" PRIu64 ", dstlen: %" PRIu64 "\n",
fromlen, *dstlen));
DEBUG_STAT_EN(fprintf(stderr, "DICT: Processed at %.3f MB/s\n",
get_mb_s(fromlen, strt, en)));
return (1);
}
U32_P(to) = LE32(fl);
dst = to + 4;
dl -= 4;
if (df->Forward_Dict(from, fl, dst, &dl)) {
*dstlen = dl + 8;
DEBUG_STAT_EN(en = get_wtime_millis());
DEBUG_STAT_EN(fprintf(stderr, "DICT: fromlen: %" PRIu64 ", dstlen: %" PRIu64 "\n",
fromlen, *dstlen));
DEBUG_STAT_EN(fprintf(stderr, "DICT: Processed at %.3f MB/s\n",
get_mb_s(fromlen, strt, en)));
return (1);
}
DEBUG_STAT_EN(fprintf(stderr, "No DICT\n"));
return (-1);

View file

@ -211,7 +211,8 @@ preproc_compress(pc_ctx_t *pctx, compress_func_ptr cmp_func, void *src, uint64_t
int result;
uint64_t _dstlen, fromlen;
uchar_t *from, *to;
int stype, dict;
int stype, dict, analyzed;
analyzer_ctx_t actx;
DEBUG_STAT_EN(double strt, en);
_dstlen = *dstlen;
@ -221,6 +222,14 @@ preproc_compress(pc_ctx_t *pctx, compress_func_ptr cmp_func, void *src, uint64_t
result = 0;
stype = PC_SUBTYPE(btype);
dict = 0;
analyzed = 0;
if (btype == TYPE_UNKNOWN || stype == TYPE_ARCHIVE_TAR || stype == TYPE_PDF || interesting) {
analyze_buffer(src, srclen, &actx);
analyzed = 1;
if (pctx->adapt_mode)
adapt_set_analyzer_ctx(data, &actx);
}
/*
* If Dispack is enabled it has to be done first since Dispack analyses the
@ -246,56 +255,78 @@ preproc_compress(pc_ctx_t *pctx, compress_func_ptr cmp_func, void *src, uint64_t
* Enabling LZP also enables the DICT filter since we are dealing with text
* in any case.
*/
if (pctx->lzp_preprocess && (PC_TYPE(btype) == TYPE_UNKNOWN ||
PC_TYPE(btype) == TYPE_TEXT || interesting)) {
void *dct = new_dict_context();
_dstlen = fromlen;
result = dict_encode(dct, from, fromlen, to, &_dstlen);
delete_dict_context(dct);
if (result != -1) {
uchar_t *tmp;
tmp = from;
from = to;
to = tmp;
fromlen = _dstlen;
type |= PREPROC_TYPE_DICT;
dict = result;
if (pctx->lzp_preprocess) {
int b_type;
if (analyzed)
b_type = PC_TYPE(actx.one_pct.btype);
else
b_type = PC_TYPE(analyze_buffer_simple(from, fromlen));
if (b_type == TYPE_TEXT) {
void *dct = new_dict_context();
_dstlen = fromlen;
result = dict_encode(dct, from, fromlen, to, &_dstlen);
delete_dict_context(dct);
if (result != -1) {
uchar_t *tmp;
tmp = from;
from = to;
to = tmp;
fromlen = _dstlen;
type |= PREPROC_TYPE_DICT;
dict = result;
}
}
}
#ifndef _MPLV2_LICENSE_
if (pctx->lzp_preprocess && stype != TYPE_BMP && stype != TYPE_TIFF &&
PC_TYPE(btype) != TYPE_BINARY) {
int hashsize;
if (pctx->lzp_preprocess && stype != TYPE_BMP && stype != TYPE_TIFF) {
int hashsize, b_type;
int64_t result;
hashsize = lzp_hash_size(level);
result = lzp_compress((const uchar_t *)from, to, fromlen,
b_type = btype;
if (analyzed)
b_type = actx.forty_pct.btype;
if (PC_TYPE(b_type) != TYPE_BINARY) {
hashsize = lzp_hash_size(level);
result = lzp_compress((const uchar_t *)from, to, fromlen,
hashsize, LZP_DEFAULT_LZPMINLEN, 0);
if (result >= 0 && result < srclen) {
uchar_t *tmp;
tmp = from;
from = to;
to = tmp;
fromlen = result;
type |= PREPROC_TYPE_LZP;
if (result >= 0 && result < srclen) {
uchar_t *tmp;
tmp = from;
from = to;
to = tmp;
fromlen = result;
type |= PREPROC_TYPE_LZP;
}
}
}
#endif
if (pctx->enable_delta2_encode && props->delta2_span > 0 &&
stype != TYPE_DNA_SEQ && stype != TYPE_BMP &&
stype != TYPE_TIFF && stype != TYPE_MP4 && PC_TYPE(btype) != TYPE_TEXT) {
_dstlen = fromlen;
result = delta2_encode((uchar_t *)from, fromlen, to,
&_dstlen, props->delta2_span, pctx->delta2_nstrides);
if (result != -1) {
uchar_t *tmp;
tmp = from;
from = to;
to = tmp;
fromlen = _dstlen;
type |= PREPROC_TYPE_DELTA2;
stype != TYPE_TIFF && stype != TYPE_MP4) {
int b_type;
b_type = btype;
if (analyzed)
b_type = actx.one_pct.btype;
if (PC_TYPE(b_type) != TYPE_TEXT) {
_dstlen = fromlen;
result = delta2_encode((uchar_t *)from, fromlen, to,
&_dstlen, props->delta2_span,
pctx->delta2_nstrides);
if (result != -1) {
uchar_t *tmp;
tmp = from;
from = to;
to = tmp;
fromlen = _dstlen;
type |= PREPROC_TYPE_DELTA2;
}
}
}

View file

@ -36,7 +36,8 @@ extern "C" {
#include <rabin_dedup.h>
#include <crypto_utils.h>
#include "meta_stream.h"
#include <filters/analyzer/analyzer.h>
#include <meta_stream.h>
#define CHUNK_FLAG_SZ 1
#define ALGO_SZ 8
@ -152,6 +153,7 @@ extern int lz4_init(void **data, int *level, int nthreads, uint64_t chunksize,
int file_version, compress_op_t op);
extern int none_init(void **data, int *level, int nthreads, uint64_t chunksize,
int file_version, compress_op_t op);
extern void adapt_set_analyzer_ctx(void *data, analyzer_ctx_t *actx);
extern void lzma_props(algo_props_t *data, int level, uint64_t chunksize);
extern void lzma_mt_props(algo_props_t *data, int level, uint64_t chunksize);