Some fixes in the Dictionary preprocessor.

Fix checking of data type flags: test them with bitwise AND instead of equality.
Allow file-level filters to change output data type.
Lower the analyzer's tag-count threshold for detecting markup (from 0.1 to 0.06 of the space count).
This commit is contained in:
Moinak Ghosh 2015-01-13 19:59:09 +05:30
parent 077da83d5d
commit d5e1d2cdef
11 changed files with 23 additions and 14 deletions

View file

@ -247,7 +247,7 @@ is_bsc_type(int btype)
return ((stype == TYPE_BMP) | (stype == TYPE_DNA_SEQ) | return ((stype == TYPE_BMP) | (stype == TYPE_DNA_SEQ) |
(stype == TYPE_MP4) | (stype == TYPE_FLAC) | (stype == TYPE_AVI) | (stype == TYPE_MP4) | (stype == TYPE_FLAC) | (stype == TYPE_AVI) |
(stype == TYPE_DICOM) | (stype == TYPE_MEDIA_BSC) | (stype == TYPE_DICOM) | (stype == TYPE_MEDIA_BSC) |
(mtype == TYPE_TEXT && stype != TYPE_MARKUP)); (mtype & TYPE_TEXT && stype != TYPE_MARKUP));
} }
int int
@ -259,7 +259,7 @@ adapt_compress(void *src, uint64_t srclen, void *dst,
int stype = PC_SUBTYPE(btype); int stype = PC_SUBTYPE(btype);
analyzer_ctx_t actx; analyzer_ctx_t actx;
if (btype == TYPE_UNKNOWN || PC_TYPE(btype) == TYPE_TEXT || if (btype == TYPE_UNKNOWN || PC_TYPE(btype) & TYPE_TEXT ||
stype == TYPE_ARCHIVE_TAR || stype == TYPE_PDF) { stype == TYPE_ARCHIVE_TAR || stype == TYPE_PDF) {
if (adat->actx == NULL) { if (adat->actx == NULL) {
analyze_buffer(src, srclen, &actx); analyze_buffer(src, srclen, &actx);
@ -292,14 +292,14 @@ adapt_compress(void *src, uint64_t srclen, void *dst,
rv = ADAPT_COMPRESS_LZ4; rv = ADAPT_COMPRESS_LZ4;
lz4_count++; lz4_count++;
} else if (adat->adapt_mode == 2 && PC_TYPE(btype) == TYPE_BINARY && !bsc_type) { } else if (adat->adapt_mode == 2 && PC_TYPE(btype) & TYPE_BINARY && !bsc_type) {
rv = lzma_compress(src, srclen, dst, dstlen, level, chdr, btype, adat->lzma_data); rv = lzma_compress(src, srclen, dst, dstlen, level, chdr, btype, adat->lzma_data);
if (rv < 0) if (rv < 0)
return (rv); return (rv);
rv = ADAPT_COMPRESS_LZMA; rv = ADAPT_COMPRESS_LZMA;
lzma_count++; lzma_count++;
} else if (adat->adapt_mode == 1 && PC_TYPE(btype) == TYPE_BINARY && !bsc_type) { } else if (adat->adapt_mode == 1 && PC_TYPE(btype) & TYPE_BINARY && !bsc_type) {
rv = bzip2_compress(src, srclen, dst, dstlen, level, chdr, btype, NULL); rv = bzip2_compress(src, srclen, dst, dstlen, level, chdr, btype, NULL);
if (rv < 0) if (rv < 0)
return (rv); return (rv);

View file

@ -81,16 +81,19 @@ add_filters_by_type(struct type_data *typetab, struct filter_flags *ff)
typetab[slot].filter_private = sdat; typetab[slot].filter_private = sdat;
typetab[slot].filter_func = packjpg_filter; typetab[slot].filter_func = packjpg_filter;
typetab[slot].filter_name = "packJPG"; typetab[slot].filter_name = "packJPG";
typetab[slot].result_type = TYPE_BINARY;
slot = TYPE_BMP >> 3; slot = TYPE_BMP >> 3;
typetab[slot].filter_private = sdat; typetab[slot].filter_private = sdat;
typetab[slot].filter_func = packpnm_filter; typetab[slot].filter_func = packpnm_filter;
typetab[slot].filter_name = "packPNM"; typetab[slot].filter_name = "packPNM";
typetab[slot].result_type = TYPE_BINARY;
slot = TYPE_PNM >> 3; slot = TYPE_PNM >> 3;
typetab[slot].filter_private = sdat; typetab[slot].filter_private = sdat;
typetab[slot].filter_func = packpnm_filter; typetab[slot].filter_func = packpnm_filter;
typetab[slot].filter_name = "packPNM"; typetab[slot].filter_name = "packPNM";
typetab[slot].result_type = TYPE_BINARY;
} }
#endif #endif
@ -104,6 +107,7 @@ add_filters_by_type(struct type_data *typetab, struct filter_flags *ff)
typetab[slot].filter_private = sdat; typetab[slot].filter_private = sdat;
typetab[slot].filter_func = dispack_filter; typetab[slot].filter_func = dispack_filter;
typetab[slot].filter_name = "Dispack"; typetab[slot].filter_name = "Dispack";
typetab[slot].result_type = 0;
} }
#ifdef _ENABLE_WAVPACK_ #ifdef _ENABLE_WAVPACK_
@ -118,6 +122,7 @@ add_filters_by_type(struct type_data *typetab, struct filter_flags *ff)
typetab[slot].filter_private = sdat; typetab[slot].filter_private = sdat;
typetab[slot].filter_func = wavpack_filter; typetab[slot].filter_func = wavpack_filter;
typetab[slot].filter_name = "WavPack"; typetab[slot].filter_name = "WavPack";
typetab[slot].result_type = 0;
} }
#endif #endif
} }

View file

@ -100,6 +100,7 @@ struct type_data {
void *filter_private; void *filter_private;
filter_func_ptr filter_func; filter_func_ptr filter_func;
char *filter_name; char *filter_name;
int result_type;
}; };
void add_filters_by_type(struct type_data *typetab, struct filter_flags *ff); void add_filters_by_type(struct type_data *typetab, struct filter_flags *ff);

View file

@ -1042,6 +1042,9 @@ process_by_filter(int fd, int *typ, struct archive *target_arc,
if (wrtn == FILTER_RETURN_ERROR) { if (wrtn == FILTER_RETURN_ERROR) {
log_msg(LOG_ERR, 0, "Warning: Error invoking filter: %s (skipping)", log_msg(LOG_ERR, 0, "Warning: Error invoking filter: %s (skipping)",
typetab[(*typ >> 3)].filter_name); typetab[(*typ >> 3)].filter_name);
} else if (wrtn != FILTER_RETURN_SKIP) {
if (typetab[(*typ >> 3)].result_type != 0)
*typ = typetab[(*typ >> 3)].result_type;
} }
return (wrtn); return (wrtn);
} }

View file

@ -110,7 +110,7 @@ bzip2_compress(void *src, uint64_t srclen, void *dst, uint64_t *dstlen,
* can be attempted to be compressed again for a possible gain. For others it is * can be attempted to be compressed again for a possible gain. For others it is
* a waste of time. * a waste of time.
*/ */
if (PC_TYPE(btype) == TYPE_COMPRESSED && level < 7) { if (PC_TYPE(btype) & TYPE_COMPRESSED && level < 7) {
int subtype = PC_SUBTYPE(btype); int subtype = PC_SUBTYPE(btype);
if (subtype != TYPE_COMPRESSED_LZW && subtype != TYPE_COMPRESSED_GZ && if (subtype != TYPE_COMPRESSED_LZW && subtype != TYPE_COMPRESSED_GZ &&

View file

@ -88,7 +88,7 @@ analyze_buffer(void *src, uint64_t srclen, analyzer_ctx_t *actx)
markup = 0; markup = 0;
if (tag1 > tag2 - 4 && tag1 < tag2 + 4 && tag3 > (double)tag1 * 0.40 && if (tag1 > tag2 - 4 && tag1 < tag2 + 4 && tag3 > (double)tag1 * 0.40 &&
tagcnt > (double)spc * 0.1) tagcnt > (double)spc * 0.06)
markup = 1; markup = 1;
if (markup) { if (markup) {

View file

@ -165,7 +165,7 @@ libbsc_compress(void *src, uint64_t srclen, void *dst, uint64_t *dstlen,
int rv; int rv;
struct libbsc_params *bscdat = (struct libbsc_params *)data; struct libbsc_params *bscdat = (struct libbsc_params *)data;
if (PC_TYPE(btype) == TYPE_COMPRESSED) { if (PC_TYPE(btype) & TYPE_COMPRESSED && level < 7) {
int subtype = PC_SUBTYPE(btype); int subtype = PC_SUBTYPE(btype);
if (subtype == TYPE_COMPRESSED_BZ2 || subtype == TYPE_COMPRESSED_LZMA) if (subtype == TYPE_COMPRESSED_BZ2 || subtype == TYPE_COMPRESSED_LZMA)
return (-1); return (-1);

View file

@ -114,7 +114,7 @@ lz_fx_compress(void *src, uint64_t srclen, void *dst, uint64_t *dstlen,
/* /*
* Ignore compressed data in fast modes. * Ignore compressed data in fast modes.
*/ */
if (level < 7 && PC_TYPE(btype) == TYPE_COMPRESSED) if (level < 7 && PC_TYPE(btype) & TYPE_COMPRESSED)
return (-1); return (-1);
rv = lzfx_compress(src, _srclen, dst, &_dstlen, lzdat->htab_bits); rv = lzfx_compress(src, _srclen, dst, &_dstlen, lzdat->htab_bits);

View file

@ -223,7 +223,7 @@ preproc_compress(pc_ctx_t *pctx, compress_func_ptr cmp_func, void *src, uint64_t
stype = PC_SUBTYPE(btype); stype = PC_SUBTYPE(btype);
analyzed = 0; analyzed = 0;
if (btype == TYPE_UNKNOWN || stype == TYPE_ARCHIVE_TAR || stype == TYPE_PDF || if (btype == TYPE_UNKNOWN || stype == TYPE_ARCHIVE_TAR || stype == TYPE_PDF ||
PC_TYPE(btype) == TYPE_TEXT || interesting) { PC_TYPE(btype) & TYPE_TEXT || interesting) {
analyze_buffer(src, srclen, &actx); analyze_buffer(src, srclen, &actx);
analyzed = 1; analyzed = 1;
if (pctx->adapt_mode) if (pctx->adapt_mode)
@ -264,7 +264,7 @@ preproc_compress(pc_ctx_t *pctx, compress_func_ptr cmp_func, void *src, uint64_t
else else
b_type = analyze_buffer_simple(from, fromlen); b_type = analyze_buffer_simple(from, fromlen);
if (PC_TYPE(b_type) == TYPE_TEXT) { if (PC_TYPE(b_type) & TYPE_TEXT) {
_dstlen = fromlen; _dstlen = fromlen;
result = dict_encode(from, fromlen, to, &_dstlen); result = dict_encode(from, fromlen, to, &_dstlen);
if (result != -1) { if (result != -1) {
@ -287,7 +287,7 @@ preproc_compress(pc_ctx_t *pctx, compress_func_ptr cmp_func, void *src, uint64_t
if (analyzed) if (analyzed)
b_type = actx.forty_pct.btype; b_type = actx.forty_pct.btype;
if (PC_TYPE(b_type) != TYPE_BINARY) { if (!(PC_TYPE(b_type) & TYPE_BINARY)) {
hashsize = lzp_hash_size(level); hashsize = lzp_hash_size(level);
result = lzp_compress((const uchar_t *)from, to, fromlen, result = lzp_compress((const uchar_t *)from, to, fromlen,
hashsize, LZP_DEFAULT_LZPMINLEN, 0); hashsize, LZP_DEFAULT_LZPMINLEN, 0);
@ -312,7 +312,7 @@ preproc_compress(pc_ctx_t *pctx, compress_func_ptr cmp_func, void *src, uint64_t
if (analyzed) if (analyzed)
b_type = actx.one_pct.btype; b_type = actx.one_pct.btype;
if (PC_TYPE(b_type) != TYPE_TEXT) { if (!(PC_TYPE(b_type) & TYPE_TEXT)) {
_dstlen = fromlen; _dstlen = fromlen;
result = delta2_encode((uchar_t *)from, fromlen, to, result = delta2_encode((uchar_t *)from, fromlen, to,
&_dstlen, props->delta2_span, &_dstlen, props->delta2_span,

View file

@ -148,7 +148,7 @@ ppmd_compress(void *src, uint64_t srclen, void *dst,
CPpmd8 *_ppmd = (CPpmd8 *)data; CPpmd8 *_ppmd = (CPpmd8 *)data;
uchar_t *_src = (uchar_t *)src; uchar_t *_src = (uchar_t *)src;
if (PC_TYPE(btype) == TYPE_COMPRESSED) if (PC_TYPE(btype) & TYPE_COMPRESSED)
return (-1); return (-1);
Ppmd8_RangeEnc_Init(_ppmd); Ppmd8_RangeEnc_Init(_ppmd);

View file

@ -157,7 +157,7 @@ zlib_compress(void *src, uint64_t srclen, void *dst, uint64_t *dstlen,
* can be attempted to be compressed again for a possible gain. For others it is * can be attempted to be compressed again for a possible gain. For others it is
* a waste of time. * a waste of time.
*/ */
if (PC_TYPE(btype) == TYPE_COMPRESSED && level < 7) { if (PC_TYPE(btype) & TYPE_COMPRESSED && level < 7) {
int subtype = PC_SUBTYPE(btype); int subtype = PC_SUBTYPE(btype);
if (subtype != TYPE_COMPRESSED_LZW && if (subtype != TYPE_COMPRESSED_LZW &&