A new Dictionary preprocessor for text files.

This commit is contained in:
Moinak Ghosh 2015-01-09 22:13:24 +05:30
parent 73307c3996
commit 66a482c968
5 changed files with 975 additions and 214 deletions

View file

@ -59,7 +59,7 @@ analyze_buffer(void *src, uint64_t srclen, analyzer_ctx_t *actx)
if (cur_byte != ' ')
prev_byte = cur_byte;
}
/*
* Heuristics for detecting BINARY vs generic TEXT vs XML data at various
* significance levels.
@ -72,7 +72,7 @@ analyze_buffer(void *src, uint64_t srclen, analyzer_ctx_t *actx)
} else {
actx->forty_pct.btype = TYPE_TEXT;
}
if (tot_8b > FIFTY_PCT(srclen)) {
actx->fifty_pct.btype = TYPE_BINARY;
} else {

File diff suppressed because it is too large. Load diff

View file

@ -39,11 +39,8 @@
extern "C" {
#endif
void *new_dict_context();
void delete_dict_context(void *dict_ctx);
int dict_encode(void *dict_ctx, uchar_t *from, uint64_t fromlen, uchar_t *to, uint64_t *dstlen);
int dict_decode(void *dict_ctx, uchar_t *from, uint64_t fromlen, uchar_t *to, uint64_t *dstlen);
int dict_encode(uchar_t *from, uint64_t fromlen, uchar_t *to, uint64_t *dstlen);
int dict_decode(uchar_t *from, uint64_t fromlen, uchar_t *to, uint64_t *dstlen);
#ifdef __cplusplus
}

View file

@ -510,7 +510,9 @@ meta_ctx_create(void *pc, int file_version, int comp_fd)
pc_ctx_t *pctx = (pc_ctx_t *)pc;
meta_ctx_t *mctx;
mctx = (meta_ctx_t *)malloc(sizeof (meta_ctx_t));
slab_cache_add(METADATA_CHUNK_SIZE + METADATA_HDR_SZ);
slab_cache_add(sizeof (meta_ctx_t));
mctx = (meta_ctx_t *)slab_alloc(NULL, sizeof (meta_ctx_t));
if (!mctx) {
log_msg(LOG_ERR, 1, "Failed to allocate metadata context.");
return (NULL);

View file

@ -259,15 +259,13 @@ preproc_compress(pc_ctx_t *pctx, compress_func_ptr cmp_func, void *src, uint64_t
int b_type;
if (analyzed)
b_type = PC_TYPE(actx.one_pct.btype);
b_type = PC_TYPE(actx.forty_pct.btype);
else
b_type = PC_TYPE(analyze_buffer_simple(from, fromlen));
if (b_type == TYPE_TEXT) {
void *dct = new_dict_context();
_dstlen = fromlen;
result = dict_encode(dct, from, fromlen, to, &_dstlen);
delete_dict_context(dct);
result = dict_encode(from, fromlen, to, &_dstlen);
if (result != -1) {
uchar_t *tmp;
tmp = from;
@ -346,7 +344,7 @@ preproc_compress(pc_ctx_t *pctx, compress_func_ptr cmp_func, void *src, uint64_t
_dstlen = srclen;
DEBUG_STAT_EN(strt = get_wtime_millis());
result = cmp_func(src, srclen, dest+9, &_dstlen, level, chdr,
(dict?TYPE_TEXT:btype), data);
btype, data);
DEBUG_STAT_EN(en = get_wtime_millis());
if (result > -1 && _dstlen < srclen) {
@ -407,6 +405,7 @@ preproc_decompress(pc_ctx_t *pctx, compress_func_ptr dec_func, void *src, uint64
memcpy(src, dst, _dstlen);
srclen = _dstlen;
*dstlen = _dstlen;
_dstlen = _dstlen1;
} else {
log_msg(LOG_ERR, 0, "Delta2 decoding failed.");
return (result);
@ -436,13 +435,12 @@ preproc_decompress(pc_ctx_t *pctx, compress_func_ptr dec_func, void *src, uint64
}
if (type & PREPROC_TYPE_DICT) {
void *dct = new_dict_context();
result = dict_decode(dct, src, srclen, dst, &_dstlen);
delete_dict_context(dct);
result = dict_decode(src, srclen, dst, &_dstlen);
if (result != -1) {
memcpy(src, dst, _dstlen);
srclen = _dstlen;
*dstlen = _dstlen;
_dstlen = _dstlen1;
} else {
log_msg(LOG_ERR, 0, "DICT decoding failed.");
return (result);