A new Dictionary preprocessor for text files.

This commit is contained in:
Moinak Ghosh 2015-01-09 22:13:24 +05:30
parent 73307c3996
commit 66a482c968
5 changed files with 975 additions and 214 deletions

View file

@ -59,7 +59,7 @@ analyze_buffer(void *src, uint64_t srclen, analyzer_ctx_t *actx)
if (cur_byte != ' ') if (cur_byte != ' ')
prev_byte = cur_byte; prev_byte = cur_byte;
} }
/* /*
* Heuristics for detecting BINARY vs generic TEXT vs XML data at various * Heuristics for detecting BINARY vs generic TEXT vs XML data at various
* significance levels. * significance levels.
@ -72,7 +72,7 @@ analyze_buffer(void *src, uint64_t srclen, analyzer_ctx_t *actx)
} else { } else {
actx->forty_pct.btype = TYPE_TEXT; actx->forty_pct.btype = TYPE_TEXT;
} }
if (tot_8b > FIFTY_PCT(srclen)) { if (tot_8b > FIFTY_PCT(srclen)) {
actx->fifty_pct.btype = TYPE_BINARY; actx->fifty_pct.btype = TYPE_BINARY;
} else { } else {

File diff suppressed because it is too large. (Load diff)

View file

@ -39,11 +39,8 @@
extern "C" { extern "C" {
#endif #endif
void *new_dict_context(); int dict_encode(uchar_t *from, uint64_t fromlen, uchar_t *to, uint64_t *dstlen);
void delete_dict_context(void *dict_ctx); int dict_decode(uchar_t *from, uint64_t fromlen, uchar_t *to, uint64_t *dstlen);
int dict_encode(void *dict_ctx, uchar_t *from, uint64_t fromlen, uchar_t *to, uint64_t *dstlen);
int dict_decode(void *dict_ctx, uchar_t *from, uint64_t fromlen, uchar_t *to, uint64_t *dstlen);
#ifdef __cplusplus #ifdef __cplusplus
} }

View file

@ -510,7 +510,9 @@ meta_ctx_create(void *pc, int file_version, int comp_fd)
pc_ctx_t *pctx = (pc_ctx_t *)pc; pc_ctx_t *pctx = (pc_ctx_t *)pc;
meta_ctx_t *mctx; meta_ctx_t *mctx;
mctx = (meta_ctx_t *)malloc(sizeof (meta_ctx_t)); slab_cache_add(METADATA_CHUNK_SIZE + METADATA_HDR_SZ);
slab_cache_add(sizeof (meta_ctx_t));
mctx = (meta_ctx_t *)slab_alloc(NULL, sizeof (meta_ctx_t));
if (!mctx) { if (!mctx) {
log_msg(LOG_ERR, 1, "Failed to allocate metadata context."); log_msg(LOG_ERR, 1, "Failed to allocate metadata context.");
return (NULL); return (NULL);

View file

@ -259,15 +259,13 @@ preproc_compress(pc_ctx_t *pctx, compress_func_ptr cmp_func, void *src, uint64_t
int b_type; int b_type;
if (analyzed) if (analyzed)
b_type = PC_TYPE(actx.one_pct.btype); b_type = PC_TYPE(actx.forty_pct.btype);
else else
b_type = PC_TYPE(analyze_buffer_simple(from, fromlen)); b_type = PC_TYPE(analyze_buffer_simple(from, fromlen));
if (b_type == TYPE_TEXT) { if (b_type == TYPE_TEXT) {
void *dct = new_dict_context();
_dstlen = fromlen; _dstlen = fromlen;
result = dict_encode(dct, from, fromlen, to, &_dstlen); result = dict_encode(from, fromlen, to, &_dstlen);
delete_dict_context(dct);
if (result != -1) { if (result != -1) {
uchar_t *tmp; uchar_t *tmp;
tmp = from; tmp = from;
@ -346,7 +344,7 @@ preproc_compress(pc_ctx_t *pctx, compress_func_ptr cmp_func, void *src, uint64_t
_dstlen = srclen; _dstlen = srclen;
DEBUG_STAT_EN(strt = get_wtime_millis()); DEBUG_STAT_EN(strt = get_wtime_millis());
result = cmp_func(src, srclen, dest+9, &_dstlen, level, chdr, result = cmp_func(src, srclen, dest+9, &_dstlen, level, chdr,
(dict?TYPE_TEXT:btype), data); btype, data);
DEBUG_STAT_EN(en = get_wtime_millis()); DEBUG_STAT_EN(en = get_wtime_millis());
if (result > -1 && _dstlen < srclen) { if (result > -1 && _dstlen < srclen) {
@ -407,6 +405,7 @@ preproc_decompress(pc_ctx_t *pctx, compress_func_ptr dec_func, void *src, uint64
memcpy(src, dst, _dstlen); memcpy(src, dst, _dstlen);
srclen = _dstlen; srclen = _dstlen;
*dstlen = _dstlen; *dstlen = _dstlen;
_dstlen = _dstlen1;
} else { } else {
log_msg(LOG_ERR, 0, "Delta2 decoding failed."); log_msg(LOG_ERR, 0, "Delta2 decoding failed.");
return (result); return (result);
@ -436,13 +435,12 @@ preproc_decompress(pc_ctx_t *pctx, compress_func_ptr dec_func, void *src, uint64
} }
if (type & PREPROC_TYPE_DICT) { if (type & PREPROC_TYPE_DICT) {
void *dct = new_dict_context(); result = dict_decode(src, srclen, dst, &_dstlen);
result = dict_decode(dct, src, srclen, dst, &_dstlen);
delete_dict_context(dct);
if (result != -1) { if (result != -1) {
memcpy(src, dst, _dstlen); memcpy(src, dst, _dstlen);
srclen = _dstlen; srclen = _dstlen;
*dstlen = _dstlen; *dstlen = _dstlen;
_dstlen = _dstlen1;
} else { } else {
log_msg(LOG_ERR, 0, "DICT decoding failed."); log_msg(LOG_ERR, 0, "DICT decoding failed.");
return (result); return (result);