Centralise the data analysis routine so that a single analysis pass can be shared, improving performance.
Use the buffer analysis results to drive the preprocessing filters.
This commit is contained in:
parent
848010fbb5
commit
507e7c75d3
6 changed files with 195 additions and 125 deletions
|
@ -38,6 +38,7 @@
|
|||
#include <pcompress.h>
|
||||
#include <allocator.h>
|
||||
#include <pc_archive.h>
|
||||
#include "filters/analyzer/analyzer.h"
|
||||
|
||||
#define FIFTY_PCT(x) (((x)/10) * 5)
|
||||
#define FORTY_PCT(x) (((x)/10) * 4)
|
||||
|
@ -97,8 +98,16 @@ struct adapt_data {
|
|||
void *bsc_data;
|
||||
void *lz4_data;
|
||||
int adapt_mode;
|
||||
analyzer_ctx_t *actx;
|
||||
};
|
||||
|
||||
void
|
||||
adapt_set_analyzer_ctx(void *data, analyzer_ctx_t *actx)
|
||||
{
|
||||
struct adapt_data *adat = (struct adapt_data *)data;
|
||||
adat->actx = actx;
|
||||
}
|
||||
|
||||
void
|
||||
adapt_stats(int show)
|
||||
{
|
||||
|
@ -246,75 +255,27 @@ adapt_compress(void *src, uint64_t srclen, void *dst,
|
|||
uint64_t *dstlen, int level, uchar_t chdr, int btype, void *data)
|
||||
{
|
||||
struct adapt_data *adat = (struct adapt_data *)(data);
|
||||
uchar_t *src1 = (uchar_t *)src;
|
||||
int rv = 0, bsc_type = 0;
|
||||
int stype = PC_SUBTYPE(btype);
|
||||
analyzer_ctx_t actx;
|
||||
|
||||
if (btype == TYPE_UNKNOWN || stype == TYPE_ARCHIVE_TAR) {
|
||||
uint64_t i, tot8b, tag1, tag2, tag3, lbytes;
|
||||
double tagcnt, pct_tag;
|
||||
uchar_t cur_byte, prev_byte;
|
||||
/*
|
||||
* Count number of 8-bit binary bytes and XML tags in source.
|
||||
*/
|
||||
tot8b = 0;
|
||||
tag1 = 0;
|
||||
tag2 = 0;
|
||||
tag3 = 0;
|
||||
lbytes = 0;
|
||||
prev_byte = cur_byte = 0;
|
||||
for (i = 0; i < srclen; i++) {
|
||||
cur_byte = src1[i];
|
||||
tot8b += (cur_byte & 0x80); // This way for possible auto-vectorization
|
||||
lbytes += (cur_byte < 32);
|
||||
tag1 += (cur_byte == '<');
|
||||
tag2 += (cur_byte == '>');
|
||||
tag3 += ((prev_byte == '<') & (cur_byte == '/'));
|
||||
tag3 += ((prev_byte == '/') & (cur_byte == '>'));
|
||||
if (cur_byte != ' ')
|
||||
prev_byte = cur_byte;
|
||||
if (btype == TYPE_UNKNOWN || stype == TYPE_ARCHIVE_TAR || stype == TYPE_PDF) {
|
||||
if (adat->actx == NULL) {
|
||||
analyze_buffer(src, srclen, &actx);
|
||||
adat->actx = &actx;
|
||||
}
|
||||
if (adat->adapt_mode == 2) {
|
||||
btype = adat->actx->forty_pct.btype;
|
||||
|
||||
/*
|
||||
* Heuristics for detecting BINARY vs generic TEXT vs XML data.
|
||||
*/
|
||||
tot8b = tot8b / 0x80 + lbytes;
|
||||
tagcnt = tag1 + tag2 + tag3;
|
||||
pct_tag = tagcnt / (double)srclen;
|
||||
if (adat->adapt_mode == 2 && tot8b > FORTY_PCT(srclen)) {
|
||||
btype = TYPE_BINARY;
|
||||
} else if (adat->adapt_mode == 1 && tot8b > FIFTY_PCT(srclen)) {
|
||||
btype = TYPE_BINARY;
|
||||
} else {
|
||||
btype = TYPE_TEXT;
|
||||
if (tag1 > tag2 - 4 && tag1 < tag2 + 4 && tag3 > (double)tag1 * 0.40 &&
|
||||
tagcnt > (double)srclen * 0.001)
|
||||
} else if (adat->adapt_mode == 1) {
|
||||
btype = adat->actx->fifty_pct.btype;
|
||||
}
|
||||
if (stype == TYPE_PDF)
|
||||
btype |= TYPE_MARKUP;
|
||||
}
|
||||
|
||||
} else if (stype == TYPE_PDF) {
|
||||
uint64_t i, tot8b;
|
||||
uchar_t cur_byte;
|
||||
|
||||
/*
|
||||
* For PDF files we need to check for uncompressed PDFs. Those are compressed
|
||||
* using Libbsc.
|
||||
*/
|
||||
tot8b = 0;
|
||||
for (i = 0; i < srclen; i++) {
|
||||
cur_byte = src1[i];
|
||||
tot8b += (cur_byte & 0x80);
|
||||
}
|
||||
|
||||
tot8b /= 0x80;
|
||||
if (adat->adapt_mode == 2 && tot8b > FORTY_PCT(srclen)) {
|
||||
btype = TYPE_BINARY;
|
||||
} else if (adat->adapt_mode == 1 && tot8b > FIFTY_PCT(srclen)) {
|
||||
btype = TYPE_BINARY;
|
||||
} else {
|
||||
btype = TYPE_TEXT|TYPE_MARKUP;
|
||||
}
|
||||
}
|
||||
/* Reset analyzer context for subsequent calls. */
|
||||
adat->actx = NULL;
|
||||
|
||||
/*
|
||||
* Use PPMd if some percentage of source is 7-bit textual bytes, otherwise
|
||||
|
|
|
@ -23,15 +23,89 @@
|
|||
*/
|
||||
|
||||
#include "utils.h"
|
||||
#include "analyzer.h"
|
||||
|
||||
#define FIFTY_PCT(x) (((x)/10) * 5)
|
||||
#define FORTY_PCT(x) (((x)/10) * 4)
|
||||
#define ONE_PCT(x) ((x)/100)
|
||||
|
||||
void
|
||||
analyze_buffer(void *src, uint64_t srclen, analyzer_ctx_t *actx)
|
||||
{
|
||||
uchar_t *src1 = (uchar_t *)src;
|
||||
uint64_t i, tot8b, tot_8b, lbytes;
|
||||
uchar_t cur_byte, prev_byte;
|
||||
uint64_t tag1, tag2, tag3;
|
||||
double tagcnt, pct_tag;
|
||||
int markup;
|
||||
|
||||
/*
|
||||
* Count number of 8-bit binary bytes and XML tags in source.
|
||||
*/
|
||||
tot8b = 0;
|
||||
tag1 = 0;
|
||||
tag2 = 0;
|
||||
tag3 = 0;
|
||||
lbytes = 0;
|
||||
prev_byte = cur_byte = 0;
|
||||
for (i = 0; i < srclen; i++) {
|
||||
cur_byte = src1[i];
|
||||
tot8b += (cur_byte & 0x80); // This way for possible auto-vectorization
|
||||
lbytes += (cur_byte < 32);
|
||||
tag1 += (cur_byte == '<');
|
||||
tag2 += (cur_byte == '>');
|
||||
tag3 += ((prev_byte == '<') & (cur_byte == '/'));
|
||||
tag3 += ((prev_byte == '/') & (cur_byte == '>'));
|
||||
if (cur_byte != ' ')
|
||||
prev_byte = cur_byte;
|
||||
}
|
||||
|
||||
/*
|
||||
* Heuristics for detecting BINARY vs generic TEXT vs XML data at various
|
||||
* significance levels.
|
||||
*/
|
||||
tot_8b = tot8b / 0x80 + lbytes;
|
||||
tagcnt = tag1 + tag2 + tag3;
|
||||
pct_tag = tagcnt / (double)srclen;
|
||||
if (tot_8b > FORTY_PCT(srclen)) {
|
||||
actx->forty_pct.btype = TYPE_BINARY;
|
||||
} else {
|
||||
actx->forty_pct.btype = TYPE_TEXT;
|
||||
}
|
||||
|
||||
if (tot_8b > FIFTY_PCT(srclen)) {
|
||||
actx->fifty_pct.btype = TYPE_BINARY;
|
||||
} else {
|
||||
actx->fifty_pct.btype = TYPE_TEXT;
|
||||
}
|
||||
|
||||
tot8b /= 0x80;
|
||||
if (tot8b == 0 && lbytes < ((srclen>>1) + (srclen>>2) + (srclen>>3))) {
|
||||
actx->one_pct.btype = TYPE_TEXT;
|
||||
}
|
||||
|
||||
markup = 0;
|
||||
if (tag1 > tag2 - 4 && tag1 < tag2 + 4 && tag3 > (double)tag1 * 0.40 &&
|
||||
tagcnt > (double)srclen * 0.001)
|
||||
markup = 1;
|
||||
|
||||
if (markup) {
|
||||
if (actx->forty_pct.btype == TYPE_TEXT)
|
||||
actx->forty_pct.btype |= TYPE_MARKUP;
|
||||
if (actx->fifty_pct.btype == TYPE_TEXT)
|
||||
actx->fifty_pct.btype |= TYPE_MARKUP;
|
||||
if (actx->one_pct.btype == TYPE_TEXT)
|
||||
actx->one_pct.btype |= TYPE_MARKUP;
|
||||
}
|
||||
}
|
||||
|
||||
int
|
||||
analyze_buffer(void *src, uint64_t srclen)
|
||||
analyze_buffer_simple(void *src, uint64_t srclen)
|
||||
{
|
||||
uchar_t *src1 = (uchar_t *)src;
|
||||
uint64_t i, tot8b, lbytes;
|
||||
uchar_t cur_byte;
|
||||
int btype = TYPE_UNKNOWN;
|
||||
|
||||
/*
|
||||
* Count number of 8-bit binary bytes in source
|
||||
*/
|
||||
|
@ -42,7 +116,6 @@ analyze_buffer(void *src, uint64_t srclen)
|
|||
tot8b += (cur_byte & 0x80); // This way for possible auto-vectorization
|
||||
lbytes += (cur_byte < 32);
|
||||
}
|
||||
|
||||
/*
|
||||
* Heuristics for detecting BINARY vs generic TEXT
|
||||
*/
|
||||
|
@ -50,6 +123,6 @@ analyze_buffer(void *src, uint64_t srclen)
|
|||
if (tot8b == 0 && lbytes < ((srclen>>1) + (srclen>>2) + (srclen>>3))) {
|
||||
btype = TYPE_TEXT;
|
||||
}
|
||||
|
||||
return (btype);
|
||||
}
|
||||
|
||||
|
|
|
@ -29,7 +29,18 @@
|
|||
extern "C" {
|
||||
#endif
|
||||
|
||||
extern int analyze_buffer(void *src, uint64_t srclen);
|
||||
/*
 * Classification result for one significance level.
 * btype holds the buffer type value assigned by the analyzer.
 */
struct significance_value {
	int btype;
};

/*
 * Analyzer output: one classification per significance level.
 */
struct _analyzer_ctx {
	struct significance_value one_pct;
	struct significance_value forty_pct;
	struct significance_value fifty_pct;
};
typedef struct _analyzer_ctx analyzer_ctx_t;
|
||||
|
||||
void analyze_buffer(void *src, uint64_t srclen, analyzer_ctx_t *actx);
|
||||
int analyze_buffer_simple(void *src, uint64_t srclen);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
|
|
|
@ -36,10 +36,6 @@
|
|||
#include "Common.h"
|
||||
#include "utils.h"
|
||||
|
||||
extern "C" {
|
||||
extern int analyze_buffer(void *src, uint64_t srclen);
|
||||
}
|
||||
|
||||
class DictFilter
|
||||
{
|
||||
public:
|
||||
|
@ -270,7 +266,6 @@ dict_encode(void *dict_ctx, uchar_t *from, uint64_t fromlen, uchar_t *to, uint64
|
|||
DictFilter *df = static_cast<DictFilter *>(dict_ctx);
|
||||
u32 fl;
|
||||
u32 dl;
|
||||
int atype;
|
||||
uchar_t *dst;
|
||||
DEBUG_STAT_EN(double strt, en);
|
||||
|
||||
|
@ -283,8 +278,6 @@ dict_encode(void *dict_ctx, uchar_t *from, uint64_t fromlen, uchar_t *to, uint64
|
|||
fl = (u32)fromlen;
|
||||
dl = (u32)(*dstlen);
|
||||
DEBUG_STAT_EN(strt = get_wtime_millis());
|
||||
atype = analyze_buffer(from, fromlen);
|
||||
if (PC_TYPE(atype) == TYPE_TEXT) {
|
||||
U32_P(to) = LE32(fl);
|
||||
dst = to + 4;
|
||||
dl -= 4;
|
||||
|
@ -297,7 +290,6 @@ dict_encode(void *dict_ctx, uchar_t *from, uint64_t fromlen, uchar_t *to, uint64
|
|||
get_mb_s(fromlen, strt, en)));
|
||||
return (1);
|
||||
}
|
||||
}
|
||||
DEBUG_STAT_EN(fprintf(stderr, "No DICT\n"));
|
||||
return (-1);
|
||||
}
|
||||
|
|
47
pcompress.c
47
pcompress.c
|
@ -211,7 +211,8 @@ preproc_compress(pc_ctx_t *pctx, compress_func_ptr cmp_func, void *src, uint64_t
|
|||
int result;
|
||||
uint64_t _dstlen, fromlen;
|
||||
uchar_t *from, *to;
|
||||
int stype, dict;
|
||||
int stype, dict, analyzed;
|
||||
analyzer_ctx_t actx;
|
||||
DEBUG_STAT_EN(double strt, en);
|
||||
|
||||
_dstlen = *dstlen;
|
||||
|
@ -221,6 +222,14 @@ preproc_compress(pc_ctx_t *pctx, compress_func_ptr cmp_func, void *src, uint64_t
|
|||
result = 0;
|
||||
stype = PC_SUBTYPE(btype);
|
||||
dict = 0;
|
||||
analyzed = 0;
|
||||
|
||||
if (btype == TYPE_UNKNOWN || stype == TYPE_ARCHIVE_TAR || stype == TYPE_PDF || interesting) {
|
||||
analyze_buffer(src, srclen, &actx);
|
||||
analyzed = 1;
|
||||
if (pctx->adapt_mode)
|
||||
adapt_set_analyzer_ctx(data, &actx);
|
||||
}
|
||||
|
||||
/*
|
||||
* If Dispack is enabled it has to be done first since Dispack analyses the
|
||||
|
@ -246,8 +255,15 @@ preproc_compress(pc_ctx_t *pctx, compress_func_ptr cmp_func, void *src, uint64_t
|
|||
* Enabling LZP also enables the DICT filter since we are dealing with text
|
||||
* in any case.
|
||||
*/
|
||||
if (pctx->lzp_preprocess && (PC_TYPE(btype) == TYPE_UNKNOWN ||
|
||||
PC_TYPE(btype) == TYPE_TEXT || interesting)) {
|
||||
if (pctx->lzp_preprocess) {
|
||||
int b_type;
|
||||
|
||||
if (analyzed)
|
||||
b_type = PC_TYPE(actx.one_pct.btype);
|
||||
else
|
||||
b_type = PC_TYPE(analyze_buffer_simple(from, fromlen));
|
||||
|
||||
if (b_type == TYPE_TEXT) {
|
||||
void *dct = new_dict_context();
|
||||
_dstlen = fromlen;
|
||||
result = dict_encode(dct, from, fromlen, to, &_dstlen);
|
||||
|
@ -262,13 +278,18 @@ preproc_compress(pc_ctx_t *pctx, compress_func_ptr cmp_func, void *src, uint64_t
|
|||
dict = result;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#ifndef _MPLV2_LICENSE_
|
||||
if (pctx->lzp_preprocess && stype != TYPE_BMP && stype != TYPE_TIFF &&
|
||||
PC_TYPE(btype) != TYPE_BINARY) {
|
||||
int hashsize;
|
||||
if (pctx->lzp_preprocess && stype != TYPE_BMP && stype != TYPE_TIFF) {
|
||||
int hashsize, b_type;
|
||||
int64_t result;
|
||||
|
||||
b_type = btype;
|
||||
if (analyzed)
|
||||
b_type = actx.forty_pct.btype;
|
||||
|
||||
if (PC_TYPE(b_type) != TYPE_BINARY) {
|
||||
hashsize = lzp_hash_size(level);
|
||||
result = lzp_compress((const uchar_t *)from, to, fromlen,
|
||||
hashsize, LZP_DEFAULT_LZPMINLEN, 0);
|
||||
|
@ -281,14 +302,23 @@ preproc_compress(pc_ctx_t *pctx, compress_func_ptr cmp_func, void *src, uint64_t
|
|||
type |= PREPROC_TYPE_LZP;
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
if (pctx->enable_delta2_encode && props->delta2_span > 0 &&
|
||||
stype != TYPE_DNA_SEQ && stype != TYPE_BMP &&
|
||||
stype != TYPE_TIFF && stype != TYPE_MP4 && PC_TYPE(btype) != TYPE_TEXT) {
|
||||
stype != TYPE_TIFF && stype != TYPE_MP4) {
|
||||
int b_type;
|
||||
|
||||
b_type = btype;
|
||||
if (analyzed)
|
||||
b_type = actx.one_pct.btype;
|
||||
|
||||
if (PC_TYPE(b_type) != TYPE_TEXT) {
|
||||
_dstlen = fromlen;
|
||||
result = delta2_encode((uchar_t *)from, fromlen, to,
|
||||
&_dstlen, props->delta2_span, pctx->delta2_nstrides);
|
||||
&_dstlen, props->delta2_span,
|
||||
pctx->delta2_nstrides);
|
||||
if (result != -1) {
|
||||
uchar_t *tmp;
|
||||
tmp = from;
|
||||
|
@ -298,6 +328,7 @@ preproc_compress(pc_ctx_t *pctx, compress_func_ptr cmp_func, void *src, uint64_t
|
|||
type |= PREPROC_TYPE_DELTA2;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Check which is the resulting buffer. If Encoded data is already sitting
|
||||
|
|
|
@ -36,7 +36,8 @@ extern "C" {
|
|||
|
||||
#include <rabin_dedup.h>
|
||||
#include <crypto_utils.h>
|
||||
#include "meta_stream.h"
|
||||
#include <filters/analyzer/analyzer.h>
|
||||
#include <meta_stream.h>
|
||||
|
||||
#define CHUNK_FLAG_SZ 1
|
||||
#define ALGO_SZ 8
|
||||
|
@ -152,6 +153,7 @@ extern int lz4_init(void **data, int *level, int nthreads, uint64_t chunksize,
|
|||
int file_version, compress_op_t op);
|
||||
extern int none_init(void **data, int *level, int nthreads, uint64_t chunksize,
|
||||
int file_version, compress_op_t op);
|
||||
extern void adapt_set_analyzer_ctx(void *data, analyzer_ctx_t *actx);
|
||||
|
||||
extern void lzma_props(algo_props_t *data, int level, uint64_t chunksize);
|
||||
extern void lzma_mt_props(algo_props_t *data, int level, uint64_t chunksize);
|
||||
|
|
Loading…
Reference in a new issue