A new dictionary preprocessor for text files.
This commit is contained in:
parent
73307c3996
commit
66a482c968
5 changed files with 975 additions and 214 deletions
|
@ -59,7 +59,7 @@ analyze_buffer(void *src, uint64_t srclen, analyzer_ctx_t *actx)
|
|||
if (cur_byte != ' ')
|
||||
prev_byte = cur_byte;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Heuristics for detecting BINARY vs generic TEXT vs XML data at various
|
||||
* significance levels.
|
||||
|
@ -72,7 +72,7 @@ analyze_buffer(void *src, uint64_t srclen, analyzer_ctx_t *actx)
|
|||
} else {
|
||||
actx->forty_pct.btype = TYPE_TEXT;
|
||||
}
|
||||
|
||||
|
||||
if (tot_8b > FIFTY_PCT(srclen)) {
|
||||
actx->fifty_pct.btype = TYPE_BINARY;
|
||||
} else {
|
||||
|
|
File diff suppressed because it is too large
Load diff
|
@ -39,11 +39,8 @@
|
|||
extern "C" {
|
||||
#endif
|
||||
|
||||
void *new_dict_context();
|
||||
void delete_dict_context(void *dict_ctx);
|
||||
|
||||
int dict_encode(void *dict_ctx, uchar_t *from, uint64_t fromlen, uchar_t *to, uint64_t *dstlen);
|
||||
int dict_decode(void *dict_ctx, uchar_t *from, uint64_t fromlen, uchar_t *to, uint64_t *dstlen);
|
||||
int dict_encode(uchar_t *from, uint64_t fromlen, uchar_t *to, uint64_t *dstlen);
|
||||
int dict_decode(uchar_t *from, uint64_t fromlen, uchar_t *to, uint64_t *dstlen);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
|
|
|
@ -510,7 +510,9 @@ meta_ctx_create(void *pc, int file_version, int comp_fd)
|
|||
pc_ctx_t *pctx = (pc_ctx_t *)pc;
|
||||
meta_ctx_t *mctx;
|
||||
|
||||
mctx = (meta_ctx_t *)malloc(sizeof (meta_ctx_t));
|
||||
slab_cache_add(METADATA_CHUNK_SIZE + METADATA_HDR_SZ);
|
||||
slab_cache_add(sizeof (meta_ctx_t));
|
||||
mctx = (meta_ctx_t *)slab_alloc(NULL, sizeof (meta_ctx_t));
|
||||
if (!mctx) {
|
||||
log_msg(LOG_ERR, 1, "Failed to allocate metadata context.");
|
||||
return (NULL);
|
||||
|
|
14
pcompress.c
14
pcompress.c
|
@ -259,15 +259,13 @@ preproc_compress(pc_ctx_t *pctx, compress_func_ptr cmp_func, void *src, uint64_t
|
|||
int b_type;
|
||||
|
||||
if (analyzed)
|
||||
b_type = PC_TYPE(actx.one_pct.btype);
|
||||
b_type = PC_TYPE(actx.forty_pct.btype);
|
||||
else
|
||||
b_type = PC_TYPE(analyze_buffer_simple(from, fromlen));
|
||||
|
||||
if (b_type == TYPE_TEXT) {
|
||||
void *dct = new_dict_context();
|
||||
_dstlen = fromlen;
|
||||
result = dict_encode(dct, from, fromlen, to, &_dstlen);
|
||||
delete_dict_context(dct);
|
||||
result = dict_encode(from, fromlen, to, &_dstlen);
|
||||
if (result != -1) {
|
||||
uchar_t *tmp;
|
||||
tmp = from;
|
||||
|
@ -346,7 +344,7 @@ preproc_compress(pc_ctx_t *pctx, compress_func_ptr cmp_func, void *src, uint64_t
|
|||
_dstlen = srclen;
|
||||
DEBUG_STAT_EN(strt = get_wtime_millis());
|
||||
result = cmp_func(src, srclen, dest+9, &_dstlen, level, chdr,
|
||||
(dict?TYPE_TEXT:btype), data);
|
||||
btype, data);
|
||||
DEBUG_STAT_EN(en = get_wtime_millis());
|
||||
|
||||
if (result > -1 && _dstlen < srclen) {
|
||||
|
@ -407,6 +405,7 @@ preproc_decompress(pc_ctx_t *pctx, compress_func_ptr dec_func, void *src, uint64
|
|||
memcpy(src, dst, _dstlen);
|
||||
srclen = _dstlen;
|
||||
*dstlen = _dstlen;
|
||||
_dstlen = _dstlen1;
|
||||
} else {
|
||||
log_msg(LOG_ERR, 0, "Delta2 decoding failed.");
|
||||
return (result);
|
||||
|
@ -436,13 +435,12 @@ preproc_decompress(pc_ctx_t *pctx, compress_func_ptr dec_func, void *src, uint64
|
|||
}
|
||||
|
||||
if (type & PREPROC_TYPE_DICT) {
|
||||
void *dct = new_dict_context();
|
||||
result = dict_decode(dct, src, srclen, dst, &_dstlen);
|
||||
delete_dict_context(dct);
|
||||
result = dict_decode(src, srclen, dst, &_dstlen);
|
||||
if (result != -1) {
|
||||
memcpy(src, dst, _dstlen);
|
||||
srclen = _dstlen;
|
||||
*dstlen = _dstlen;
|
||||
_dstlen = _dstlen1;
|
||||
} else {
|
||||
log_msg(LOG_ERR, 0, "DICT decoding failed.");
|
||||
return (result);
|
||||
|
|
Loading…
Reference in a new issue