Some fixes in the Dictionary preprocessor.
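Fix checking of data type flags: test them with bitwise AND instead of equality. Allow file-level filters to change the output data type via a new per-type result_type field. Lower the analyzer's tag-count threshold for detecting markup from 0.1 to 0.06.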
This commit is contained in:
parent
077da83d5d
commit
d5e1d2cdef
11 changed files with 23 additions and 14 deletions
|
@ -247,7 +247,7 @@ is_bsc_type(int btype)
|
||||||
return ((stype == TYPE_BMP) | (stype == TYPE_DNA_SEQ) |
|
return ((stype == TYPE_BMP) | (stype == TYPE_DNA_SEQ) |
|
||||||
(stype == TYPE_MP4) | (stype == TYPE_FLAC) | (stype == TYPE_AVI) |
|
(stype == TYPE_MP4) | (stype == TYPE_FLAC) | (stype == TYPE_AVI) |
|
||||||
(stype == TYPE_DICOM) | (stype == TYPE_MEDIA_BSC) |
|
(stype == TYPE_DICOM) | (stype == TYPE_MEDIA_BSC) |
|
||||||
(mtype == TYPE_TEXT && stype != TYPE_MARKUP));
|
(mtype & TYPE_TEXT && stype != TYPE_MARKUP));
|
||||||
}
|
}
|
||||||
|
|
||||||
int
|
int
|
||||||
|
@ -259,7 +259,7 @@ adapt_compress(void *src, uint64_t srclen, void *dst,
|
||||||
int stype = PC_SUBTYPE(btype);
|
int stype = PC_SUBTYPE(btype);
|
||||||
analyzer_ctx_t actx;
|
analyzer_ctx_t actx;
|
||||||
|
|
||||||
if (btype == TYPE_UNKNOWN || PC_TYPE(btype) == TYPE_TEXT ||
|
if (btype == TYPE_UNKNOWN || PC_TYPE(btype) & TYPE_TEXT ||
|
||||||
stype == TYPE_ARCHIVE_TAR || stype == TYPE_PDF) {
|
stype == TYPE_ARCHIVE_TAR || stype == TYPE_PDF) {
|
||||||
if (adat->actx == NULL) {
|
if (adat->actx == NULL) {
|
||||||
analyze_buffer(src, srclen, &actx);
|
analyze_buffer(src, srclen, &actx);
|
||||||
|
@ -292,14 +292,14 @@ adapt_compress(void *src, uint64_t srclen, void *dst,
|
||||||
rv = ADAPT_COMPRESS_LZ4;
|
rv = ADAPT_COMPRESS_LZ4;
|
||||||
lz4_count++;
|
lz4_count++;
|
||||||
|
|
||||||
} else if (adat->adapt_mode == 2 && PC_TYPE(btype) == TYPE_BINARY && !bsc_type) {
|
} else if (adat->adapt_mode == 2 && PC_TYPE(btype) & TYPE_BINARY && !bsc_type) {
|
||||||
rv = lzma_compress(src, srclen, dst, dstlen, level, chdr, btype, adat->lzma_data);
|
rv = lzma_compress(src, srclen, dst, dstlen, level, chdr, btype, adat->lzma_data);
|
||||||
if (rv < 0)
|
if (rv < 0)
|
||||||
return (rv);
|
return (rv);
|
||||||
rv = ADAPT_COMPRESS_LZMA;
|
rv = ADAPT_COMPRESS_LZMA;
|
||||||
lzma_count++;
|
lzma_count++;
|
||||||
|
|
||||||
} else if (adat->adapt_mode == 1 && PC_TYPE(btype) == TYPE_BINARY && !bsc_type) {
|
} else if (adat->adapt_mode == 1 && PC_TYPE(btype) & TYPE_BINARY && !bsc_type) {
|
||||||
rv = bzip2_compress(src, srclen, dst, dstlen, level, chdr, btype, NULL);
|
rv = bzip2_compress(src, srclen, dst, dstlen, level, chdr, btype, NULL);
|
||||||
if (rv < 0)
|
if (rv < 0)
|
||||||
return (rv);
|
return (rv);
|
||||||
|
|
|
@ -81,16 +81,19 @@ add_filters_by_type(struct type_data *typetab, struct filter_flags *ff)
|
||||||
typetab[slot].filter_private = sdat;
|
typetab[slot].filter_private = sdat;
|
||||||
typetab[slot].filter_func = packjpg_filter;
|
typetab[slot].filter_func = packjpg_filter;
|
||||||
typetab[slot].filter_name = "packJPG";
|
typetab[slot].filter_name = "packJPG";
|
||||||
|
typetab[slot].result_type = TYPE_BINARY;
|
||||||
|
|
||||||
slot = TYPE_BMP >> 3;
|
slot = TYPE_BMP >> 3;
|
||||||
typetab[slot].filter_private = sdat;
|
typetab[slot].filter_private = sdat;
|
||||||
typetab[slot].filter_func = packpnm_filter;
|
typetab[slot].filter_func = packpnm_filter;
|
||||||
typetab[slot].filter_name = "packPNM";
|
typetab[slot].filter_name = "packPNM";
|
||||||
|
typetab[slot].result_type = TYPE_BINARY;
|
||||||
|
|
||||||
slot = TYPE_PNM >> 3;
|
slot = TYPE_PNM >> 3;
|
||||||
typetab[slot].filter_private = sdat;
|
typetab[slot].filter_private = sdat;
|
||||||
typetab[slot].filter_func = packpnm_filter;
|
typetab[slot].filter_func = packpnm_filter;
|
||||||
typetab[slot].filter_name = "packPNM";
|
typetab[slot].filter_name = "packPNM";
|
||||||
|
typetab[slot].result_type = TYPE_BINARY;
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
@ -104,6 +107,7 @@ add_filters_by_type(struct type_data *typetab, struct filter_flags *ff)
|
||||||
typetab[slot].filter_private = sdat;
|
typetab[slot].filter_private = sdat;
|
||||||
typetab[slot].filter_func = dispack_filter;
|
typetab[slot].filter_func = dispack_filter;
|
||||||
typetab[slot].filter_name = "Dispack";
|
typetab[slot].filter_name = "Dispack";
|
||||||
|
typetab[slot].result_type = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef _ENABLE_WAVPACK_
|
#ifdef _ENABLE_WAVPACK_
|
||||||
|
@ -118,6 +122,7 @@ add_filters_by_type(struct type_data *typetab, struct filter_flags *ff)
|
||||||
typetab[slot].filter_private = sdat;
|
typetab[slot].filter_private = sdat;
|
||||||
typetab[slot].filter_func = wavpack_filter;
|
typetab[slot].filter_func = wavpack_filter;
|
||||||
typetab[slot].filter_name = "WavPack";
|
typetab[slot].filter_name = "WavPack";
|
||||||
|
typetab[slot].result_type = 0;
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
|
@ -100,6 +100,7 @@ struct type_data {
|
||||||
void *filter_private;
|
void *filter_private;
|
||||||
filter_func_ptr filter_func;
|
filter_func_ptr filter_func;
|
||||||
char *filter_name;
|
char *filter_name;
|
||||||
|
int result_type;
|
||||||
};
|
};
|
||||||
|
|
||||||
void add_filters_by_type(struct type_data *typetab, struct filter_flags *ff);
|
void add_filters_by_type(struct type_data *typetab, struct filter_flags *ff);
|
||||||
|
|
|
@ -1042,6 +1042,9 @@ process_by_filter(int fd, int *typ, struct archive *target_arc,
|
||||||
if (wrtn == FILTER_RETURN_ERROR) {
|
if (wrtn == FILTER_RETURN_ERROR) {
|
||||||
log_msg(LOG_ERR, 0, "Warning: Error invoking filter: %s (skipping)",
|
log_msg(LOG_ERR, 0, "Warning: Error invoking filter: %s (skipping)",
|
||||||
typetab[(*typ >> 3)].filter_name);
|
typetab[(*typ >> 3)].filter_name);
|
||||||
|
} else if (wrtn != FILTER_RETURN_SKIP) {
|
||||||
|
if (typetab[(*typ >> 3)].result_type != 0)
|
||||||
|
*typ = typetab[(*typ >> 3)].result_type;
|
||||||
}
|
}
|
||||||
return (wrtn);
|
return (wrtn);
|
||||||
}
|
}
|
||||||
|
|
|
@ -110,7 +110,7 @@ bzip2_compress(void *src, uint64_t srclen, void *dst, uint64_t *dstlen,
|
||||||
* can be attempted to be compressed again for a possible gain. For others it is
|
* can be attempted to be compressed again for a possible gain. For others it is
|
||||||
* a waste of time.
|
* a waste of time.
|
||||||
*/
|
*/
|
||||||
if (PC_TYPE(btype) == TYPE_COMPRESSED && level < 7) {
|
if (PC_TYPE(btype) & TYPE_COMPRESSED && level < 7) {
|
||||||
int subtype = PC_SUBTYPE(btype);
|
int subtype = PC_SUBTYPE(btype);
|
||||||
|
|
||||||
if (subtype != TYPE_COMPRESSED_LZW && subtype != TYPE_COMPRESSED_GZ &&
|
if (subtype != TYPE_COMPRESSED_LZW && subtype != TYPE_COMPRESSED_GZ &&
|
||||||
|
|
|
@ -88,7 +88,7 @@ analyze_buffer(void *src, uint64_t srclen, analyzer_ctx_t *actx)
|
||||||
|
|
||||||
markup = 0;
|
markup = 0;
|
||||||
if (tag1 > tag2 - 4 && tag1 < tag2 + 4 && tag3 > (double)tag1 * 0.40 &&
|
if (tag1 > tag2 - 4 && tag1 < tag2 + 4 && tag3 > (double)tag1 * 0.40 &&
|
||||||
tagcnt > (double)spc * 0.1)
|
tagcnt > (double)spc * 0.06)
|
||||||
markup = 1;
|
markup = 1;
|
||||||
|
|
||||||
if (markup) {
|
if (markup) {
|
||||||
|
|
|
@ -165,7 +165,7 @@ libbsc_compress(void *src, uint64_t srclen, void *dst, uint64_t *dstlen,
|
||||||
int rv;
|
int rv;
|
||||||
struct libbsc_params *bscdat = (struct libbsc_params *)data;
|
struct libbsc_params *bscdat = (struct libbsc_params *)data;
|
||||||
|
|
||||||
if (PC_TYPE(btype) == TYPE_COMPRESSED) {
|
if (PC_TYPE(btype) & TYPE_COMPRESSED && level < 7) {
|
||||||
int subtype = PC_SUBTYPE(btype);
|
int subtype = PC_SUBTYPE(btype);
|
||||||
if (subtype == TYPE_COMPRESSED_BZ2 || subtype == TYPE_COMPRESSED_LZMA)
|
if (subtype == TYPE_COMPRESSED_BZ2 || subtype == TYPE_COMPRESSED_LZMA)
|
||||||
return (-1);
|
return (-1);
|
||||||
|
|
|
@ -114,7 +114,7 @@ lz_fx_compress(void *src, uint64_t srclen, void *dst, uint64_t *dstlen,
|
||||||
/*
|
/*
|
||||||
* Ignore compressed data in fast modes.
|
* Ignore compressed data in fast modes.
|
||||||
*/
|
*/
|
||||||
if (level < 7 && PC_TYPE(btype) == TYPE_COMPRESSED)
|
if (level < 7 && PC_TYPE(btype) & TYPE_COMPRESSED)
|
||||||
return (-1);
|
return (-1);
|
||||||
|
|
||||||
rv = lzfx_compress(src, _srclen, dst, &_dstlen, lzdat->htab_bits);
|
rv = lzfx_compress(src, _srclen, dst, &_dstlen, lzdat->htab_bits);
|
||||||
|
|
|
@ -223,7 +223,7 @@ preproc_compress(pc_ctx_t *pctx, compress_func_ptr cmp_func, void *src, uint64_t
|
||||||
stype = PC_SUBTYPE(btype);
|
stype = PC_SUBTYPE(btype);
|
||||||
analyzed = 0;
|
analyzed = 0;
|
||||||
if (btype == TYPE_UNKNOWN || stype == TYPE_ARCHIVE_TAR || stype == TYPE_PDF ||
|
if (btype == TYPE_UNKNOWN || stype == TYPE_ARCHIVE_TAR || stype == TYPE_PDF ||
|
||||||
PC_TYPE(btype) == TYPE_TEXT || interesting) {
|
PC_TYPE(btype) & TYPE_TEXT || interesting) {
|
||||||
analyze_buffer(src, srclen, &actx);
|
analyze_buffer(src, srclen, &actx);
|
||||||
analyzed = 1;
|
analyzed = 1;
|
||||||
if (pctx->adapt_mode)
|
if (pctx->adapt_mode)
|
||||||
|
@ -264,7 +264,7 @@ preproc_compress(pc_ctx_t *pctx, compress_func_ptr cmp_func, void *src, uint64_t
|
||||||
else
|
else
|
||||||
b_type = analyze_buffer_simple(from, fromlen);
|
b_type = analyze_buffer_simple(from, fromlen);
|
||||||
|
|
||||||
if (PC_TYPE(b_type) == TYPE_TEXT) {
|
if (PC_TYPE(b_type) & TYPE_TEXT) {
|
||||||
_dstlen = fromlen;
|
_dstlen = fromlen;
|
||||||
result = dict_encode(from, fromlen, to, &_dstlen);
|
result = dict_encode(from, fromlen, to, &_dstlen);
|
||||||
if (result != -1) {
|
if (result != -1) {
|
||||||
|
@ -287,7 +287,7 @@ preproc_compress(pc_ctx_t *pctx, compress_func_ptr cmp_func, void *src, uint64_t
|
||||||
if (analyzed)
|
if (analyzed)
|
||||||
b_type = actx.forty_pct.btype;
|
b_type = actx.forty_pct.btype;
|
||||||
|
|
||||||
if (PC_TYPE(b_type) != TYPE_BINARY) {
|
if (!(PC_TYPE(b_type) & TYPE_BINARY)) {
|
||||||
hashsize = lzp_hash_size(level);
|
hashsize = lzp_hash_size(level);
|
||||||
result = lzp_compress((const uchar_t *)from, to, fromlen,
|
result = lzp_compress((const uchar_t *)from, to, fromlen,
|
||||||
hashsize, LZP_DEFAULT_LZPMINLEN, 0);
|
hashsize, LZP_DEFAULT_LZPMINLEN, 0);
|
||||||
|
@ -312,7 +312,7 @@ preproc_compress(pc_ctx_t *pctx, compress_func_ptr cmp_func, void *src, uint64_t
|
||||||
if (analyzed)
|
if (analyzed)
|
||||||
b_type = actx.one_pct.btype;
|
b_type = actx.one_pct.btype;
|
||||||
|
|
||||||
if (PC_TYPE(b_type) != TYPE_TEXT) {
|
if (!(PC_TYPE(b_type) & TYPE_TEXT)) {
|
||||||
_dstlen = fromlen;
|
_dstlen = fromlen;
|
||||||
result = delta2_encode((uchar_t *)from, fromlen, to,
|
result = delta2_encode((uchar_t *)from, fromlen, to,
|
||||||
&_dstlen, props->delta2_span,
|
&_dstlen, props->delta2_span,
|
||||||
|
|
|
@ -148,7 +148,7 @@ ppmd_compress(void *src, uint64_t srclen, void *dst,
|
||||||
CPpmd8 *_ppmd = (CPpmd8 *)data;
|
CPpmd8 *_ppmd = (CPpmd8 *)data;
|
||||||
uchar_t *_src = (uchar_t *)src;
|
uchar_t *_src = (uchar_t *)src;
|
||||||
|
|
||||||
if (PC_TYPE(btype) == TYPE_COMPRESSED)
|
if (PC_TYPE(btype) & TYPE_COMPRESSED)
|
||||||
return (-1);
|
return (-1);
|
||||||
|
|
||||||
Ppmd8_RangeEnc_Init(_ppmd);
|
Ppmd8_RangeEnc_Init(_ppmd);
|
||||||
|
|
|
@ -157,7 +157,7 @@ zlib_compress(void *src, uint64_t srclen, void *dst, uint64_t *dstlen,
|
||||||
* can be attempted to be compressed again for a possible gain. For others it is
|
* can be attempted to be compressed again for a possible gain. For others it is
|
||||||
* a waste of time.
|
* a waste of time.
|
||||||
*/
|
*/
|
||||||
if (PC_TYPE(btype) == TYPE_COMPRESSED && level < 7) {
|
if (PC_TYPE(btype) & TYPE_COMPRESSED && level < 7) {
|
||||||
int subtype = PC_SUBTYPE(btype);
|
int subtype = PC_SUBTYPE(btype);
|
||||||
|
|
||||||
if (subtype != TYPE_COMPRESSED_LZW &&
|
if (subtype != TYPE_COMPRESSED_LZW &&
|
||||||
|
|
Loading…
Reference in a new issue