Structured handling of file types.

Handling of already compressed data based on compression algorithm.
Add a few more extension types.
This commit is contained in:
Moinak Ghosh 2013-11-09 16:46:19 +05:30
parent cae9de9b2e
commit 6aacd903ff
15 changed files with 107 additions and 49 deletions

View file

@ -229,14 +229,14 @@ adapt_compress(void *src, uint64_t srclen, void *dst,
* Use PPMd if some percentage of source is 7-bit textual bytes, otherwise * Use PPMd if some percentage of source is 7-bit textual bytes, otherwise
* use Bzip2 or LZMA. * use Bzip2 or LZMA.
*/ */
if (adat->adapt_mode == 2 && (btype & TYPE_BINARY)) { if (adat->adapt_mode == 2 && (PC_TYPE(btype) == TYPE_BINARY)) {
rv = lzma_compress(src, srclen, dst, dstlen, level, chdr, btype, adat->lzma_data); rv = lzma_compress(src, srclen, dst, dstlen, level, chdr, btype, adat->lzma_data);
if (rv < 0) if (rv < 0)
return (rv); return (rv);
rv = ADAPT_COMPRESS_LZMA; rv = ADAPT_COMPRESS_LZMA;
lzma_count++; lzma_count++;
} else if (adat->adapt_mode == 1 && (btype & TYPE_BINARY)) { } else if (adat->adapt_mode == 1 && (PC_TYPE(btype) == TYPE_BINARY)) {
rv = bzip2_compress(src, srclen, dst, dstlen, level, chdr, btype, NULL); rv = bzip2_compress(src, srclen, dst, dstlen, level, chdr, btype, NULL);
if (rv < 0) if (rv < 0)
return (rv); return (rv);
@ -245,7 +245,7 @@ adapt_compress(void *src, uint64_t srclen, void *dst,
} else { } else {
#ifdef ENABLE_PC_LIBBSC #ifdef ENABLE_PC_LIBBSC
if (adat->bsc_data && (btype & TYPE_MARKUP)) { if (adat->bsc_data && PC_SUBTYPE(btype) == TYPE_MARKUP) {
rv = libbsc_compress(src, srclen, dst, dstlen, level, chdr, btype, adat->bsc_data); rv = libbsc_compress(src, srclen, dst, dstlen, level, chdr, btype, adat->bsc_data);
if (rv < 0) if (rv < 0)
return (rv); return (rv);

View file

@ -942,6 +942,7 @@ archiver_thread_func(void *dat) {
continue; continue;
} }
typ = TYPE_UNKNOWN;
if (archive_entry_filetype(entry) == AE_IFREG) { if (archive_entry_filetype(entry) == AE_IFREG) {
if ((typ = detect_type_by_ext(fpath, fpathlen)) != TYPE_UNKNOWN) if ((typ = detect_type_by_ext(fpath, fpathlen)) != TYPE_UNKNOWN)
pctx->ctype = typ; pctx->ctype = typ;
@ -1248,9 +1249,9 @@ detect_type_by_data(uchar_t *buf, size_t len)
if (buf[0] == 0xe9) if (buf[0] == 0xe9)
return (TYPE_BINARY|TYPE_EXE); // MSDOS COM return (TYPE_BINARY|TYPE_EXE); // MSDOS COM
if (U32_P(buf) == TZSHORT) if (U32_P(buf) == TZSHORT)
return (TYPE_BINARY|TYPE_BINARY); // Timezone data return (TYPE_BINARY); // Timezone data
if (U32_P(buf) == PPMSHORT) if (U32_P(buf) == PPMSHORT)
return (TYPE_BINARY|TYPE_COMPRESSED); // PPM Compressed archive return (TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_PPMD); // PPM Compressed archive
return (TYPE_UNKNOWN); return (TYPE_UNKNOWN);
} }

View file

@ -174,12 +174,16 @@ bzip2_decompress(void *src, uint64_t srclen, void *dst, uint64_t *dstlen,
char *dst1 = (char *)dst; char *dst1 = (char *)dst;
char *src1 = (char *)src; char *src1 = (char *)src;
if (btype & TYPE_COMPRESSED) { /*
if ((btype & TYPE_COMPRESSED_LZW) != TYPE_COMPRESSED_LZW && * If the data is known to be compressed then certain types less compressed data
(btype & TYPE_COMPRESSED_GZ) != TYPE_COMPRESSED_GZ && * can be attempted to be compressed again for a possible gain. For others it is
(btype & TYPE_COMPRESSED_LZ) != TYPE_COMPRESSED_LZ && * a waste of time.
(btype & TYPE_COMPRESSED_LZO) != TYPE_COMPRESSED_LZO) */
{ if (PC_TYPE(btype) == TYPE_COMPRESSED && level < 7) {
int subtype = PC_SUBTYPE(btype);
if (subtype != TYPE_COMPRESSED_LZW && subtype != TYPE_COMPRESSED_GZ &&
subtype != TYPE_COMPRESSED_LZ && subtype != TYPE_COMPRESSED_LZO) {
return (-1); return (-1);
} }
} }

View file

@ -153,9 +153,11 @@ libbsc_compress(void *src, uint64_t srclen, void *dst, uint64_t *dstlen,
int rv; int rv;
struct libbsc_params *bscdat = (struct libbsc_params *)data; struct libbsc_params *bscdat = (struct libbsc_params *)data;
if ((btype & TYPE_COMPRESSED_BZ2) == TYPE_COMPRESSED_BZ2 || if (PC_TYPE(btype) == TYPE_COMPRESSED) {
(btype & TYPE_COMPRESSED_LZMA) == TYPE_COMPRESSED_LZMA) int subtype = PC_SUBTYPE(btype);
return (-1); if (subtype == TYPE_COMPRESSED_BZ2 || subtype == TYPE_COMPRESSED_LZMA)
return (-1);
}
rv = bsc_compress(src, dst, srclen, bscdat->lzpHashSize, bscdat->lzpMinLen, rv = bsc_compress(src, dst, srclen, bscdat->lzpHashSize, bscdat->lzpMinLen,
LIBBSC_BLOCKSORTER_BWT, bscdat->bscCoder, bscdat->features); LIBBSC_BLOCKSORTER_BWT, bscdat->bscCoder, bscdat->features);

View file

@ -106,6 +106,12 @@ lz4_compress(void *src, uint64_t srclen, void *dst, uint64_t *dstlen,
int _srclen = srclen; int _srclen = srclen;
uchar_t *dst2; uchar_t *dst2;
/*
* Ignore compressed data in fast modes.
*/
if (lzdat->level < 3 && PC_TYPE(btype) == TYPE_COMPRESSED)
return (-1);
if (lzdat->level == 1) { if (lzdat->level == 1) {
rv = LZ4_compress((const char *)src, (char *)dst, _srclen); rv = LZ4_compress((const char *)src, (char *)dst, _srclen);

View file

@ -111,6 +111,12 @@ lz_fx_compress(void *src, uint64_t srclen, void *dst, uint64_t *dstlen,
unsigned int _srclen = srclen; unsigned int _srclen = srclen;
unsigned int _dstlen = *dstlen; unsigned int _dstlen = *dstlen;
/*
* Ignore compressed data in fast modes.
*/
if (level < 7 && PC_TYPE(btype) == TYPE_COMPRESSED)
return (-1);
rv = lzfx_compress(src, _srclen, dst, &_dstlen, lzdat->htab_bits); rv = lzfx_compress(src, _srclen, dst, &_dstlen, lzdat->htab_bits);
if (rv != 0) { if (rv != 0) {
if (rv != LZFX_ESIZE) if (rv != LZFX_ESIZE)

View file

@ -211,7 +211,7 @@ lzma_compress(void *src, uint64_t srclen, void *dst,
return (-1); return (-1);
} }
if ((btype & TYPE_COMPRESSED_LZMA) == TYPE_COMPRESSED_LZMA) if (PC_SUBTYPE(btype) == TYPE_COMPRESSED_LZMA)
return (-1); return (-1);
props->level = level; props->level = level;

View file

@ -1544,12 +1544,13 @@ plain_index:
o_chunksize = _chunksize; o_chunksize = _chunksize;
/* Compress data chunk. */ /* Compress data chunk. */
if ((pctx->lzp_preprocess || pctx->enable_delta2_encode) && _chunksize > 0) { if ((pctx->lzp_preprocess || pctx->enable_delta2_encode) && _chunksize > 0 &&
PC_SUBTYPE(pctx->btype) == TYPE_CMP_MAX) {
rv = preproc_compress(pctx, tdat->compress, tdat->uncompressed_chunk + dedupe_index_sz, rv = preproc_compress(pctx, tdat->compress, tdat->uncompressed_chunk + dedupe_index_sz,
_chunksize, compressed_chunk + index_size_cmp, &_chunksize, _chunksize, compressed_chunk + index_size_cmp, &_chunksize,
tdat->level, 0, pctx->btype, tdat->data, tdat->props); tdat->level, 0, pctx->btype, tdat->data, tdat->props);
} else if (_chunksize > 0) { } else if (_chunksize > 0 && PC_SUBTYPE(pctx->btype) == TYPE_CMP_MAX) {
DEBUG_STAT_EN(double strt, en); DEBUG_STAT_EN(double strt, en);
DEBUG_STAT_EN(strt = get_wtime_millis()); DEBUG_STAT_EN(strt = get_wtime_millis());

View file

@ -114,8 +114,9 @@ ppmd_compress(void *src, uint64_t srclen, void *dst,
CPpmd8 *_ppmd = (CPpmd8 *)data; CPpmd8 *_ppmd = (CPpmd8 *)data;
uchar_t *_src = (uchar_t *)src; uchar_t *_src = (uchar_t *)src;
if (btype & TYPE_COMPRESSED) if (PC_TYPE(btype) == TYPE_COMPRESSED)
return (-1); return (-1);
Ppmd8_RangeEnc_Init(_ppmd); Ppmd8_RangeEnc_Init(_ppmd);
Ppmd8_Init(_ppmd, _ppmd->Order, PPMD8_RESTORE_METHOD_RESTART); Ppmd8_Init(_ppmd, _ppmd->Order, PPMD8_RESTORE_METHOD_RESTART);
_ppmd->buf = (Byte *)dst; _ppmd->buf = (Byte *)dst;

View file

@ -99,8 +99,8 @@ struct ext_entry {
{"xpi" , TYPE_BINARY|TYPE_EXE, 3}, {"xpi" , TYPE_BINARY|TYPE_EXE, 3},
{"off" , TYPE_BINARY|TYPE_EXE, 3}, {"off" , TYPE_BINARY|TYPE_EXE, 3},
{"pdf" , TYPE_BINARY, 3}, {"pdf" , TYPE_BINARY, 3},
{"jpg" , TYPE_BINARY|TYPE_COMPRESSED|TYPE_JPEG, 3}, {"jpg" , TYPE_BINARY|TYPE_COMPRESSED|TYPE_CMP_MAX, 3},
{"jpeg" , TYPE_BINARY|TYPE_COMPRESSED|TYPE_JPEG, 4}, {"jpeg" , TYPE_BINARY|TYPE_COMPRESSED|TYPE_CMP_MAX, 4},
{"png" , TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_GZ, 3}, {"png" , TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_GZ, 3},
{"mp3" , TYPE_BINARY|TYPE_COMPRESSED, 3}, {"mp3" , TYPE_BINARY|TYPE_COMPRESSED, 3},
{"wma" , TYPE_BINARY|TYPE_COMPRESSED, 3}, {"wma" , TYPE_BINARY|TYPE_COMPRESSED, 3},
@ -114,7 +114,8 @@ struct ext_entry {
{"flac" , TYPE_BINARY|TYPE_COMPRESSED, 4}, {"flac" , TYPE_BINARY|TYPE_COMPRESSED, 4},
{"pac" , TYPE_BINARY|TYPE_COMPRESSED, 3}, {"pac" , TYPE_BINARY|TYPE_COMPRESSED, 3},
{"gif" , TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_LZW, 3}, {"gif" , TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_LZW, 3},
{"jp2" , TYPE_BINARY|TYPE_COMPRESSED|TYPE_JPEG, 3}, {"jp2" , TYPE_BINARY|TYPE_COMPRESSED|TYPE_CMP_MAX, 3},
{"pjg" , TYPE_BINARY|TYPE_COMPRESSED|TYPE_CMP_MAX, 3},
{"gz" , TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_GZ, 2}, {"gz" , TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_GZ, 2},
{"tgz" , TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_GZ, 3}, {"tgz" , TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_GZ, 3},
{"bz2" , TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_BZ2, 3}, {"bz2" , TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_BZ2, 3},

View file

@ -89,8 +89,8 @@ com,TYPE_BINARY|TYPE_EXE
xpi,TYPE_BINARY|TYPE_EXE xpi,TYPE_BINARY|TYPE_EXE
off,TYPE_BINARY|TYPE_EXE off,TYPE_BINARY|TYPE_EXE
pdf,TYPE_BINARY pdf,TYPE_BINARY
jpg,TYPE_BINARY|TYPE_COMPRESSED|TYPE_JPEG jpg,TYPE_BINARY|TYPE_COMPRESSED|TYPE_CMP_MAX
jpeg,TYPE_BINARY|TYPE_COMPRESSED|TYPE_JPEG jpeg,TYPE_BINARY|TYPE_COMPRESSED|TYPE_CMP_MAX
png,TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_GZ png,TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_GZ
mp3,TYPE_BINARY|TYPE_COMPRESSED mp3,TYPE_BINARY|TYPE_COMPRESSED
wma,TYPE_BINARY|TYPE_COMPRESSED wma,TYPE_BINARY|TYPE_COMPRESSED
@ -104,7 +104,8 @@ ofr,TYPE_BINARY|TYPE_COMPRESSED
flac,TYPE_BINARY|TYPE_COMPRESSED flac,TYPE_BINARY|TYPE_COMPRESSED
pac,TYPE_BINARY|TYPE_COMPRESSED pac,TYPE_BINARY|TYPE_COMPRESSED
gif,TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_LZW gif,TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_LZW
jp2,TYPE_BINARY|TYPE_COMPRESSED|TYPE_JPEG jp2,TYPE_BINARY|TYPE_COMPRESSED|TYPE_CMP_MAX
pjg,TYPE_BINARY|TYPE_COMPRESSED|TYPE_CMP_MAX
gz,TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_GZ gz,TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_GZ
tgz,TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_GZ tgz,TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_GZ
bz2,TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_BZ2 bz2,TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_BZ2

View file

@ -13,12 +13,12 @@
/* small adjustments to _a_ to make values distinct */ /* small adjustments to _a_ to make values distinct */
ub1 tab[] = { ub1 tab[] = {
125,0,0,82,113,0,125,85,113,0,0,7,0,0,125,0, 125,0,0,82,113,0,125,85,113,0,0,7,0,0,125,0,
0,0,7,87,0,0,82,0,0,88,0,7,0,85,125,85, 0,0,7,87,0,0,82,0,0,125,0,7,0,85,125,85,
0,113,0,0,85,0,0,113,0,113,124,125,0,125,0,0, 0,113,0,0,85,0,0,113,0,113,124,0,0,125,0,0,
113,0,11,113,125,0,0,0,0,85,113,85,22,0,0,125, 113,0,11,113,125,7,0,0,0,85,113,85,22,0,0,125,
0,113,0,0,113,0,82,0,125,111,87,88,69,125,113,0, 0,113,0,0,113,0,82,0,125,111,87,124,69,183,113,0,
124,0,7,22,113,22,0,235,0,120,120,125,113,0,74,120, 124,0,7,22,120,22,0,127,0,120,51,125,0,0,74,120,
0,124,87,7,0,127,0,0,11,85,85,146,115,11,183,146, 0,124,87,113,0,127,0,0,11,85,85,146,69,11,183,146,
0,0,88,0,0,85,42,0,171,0,0,0,0,83,0,0, 0,0,88,0,0,85,42,0,171,0,0,0,0,83,0,0,
}; };

View file

@ -8,7 +8,7 @@
extern ub1 tab[]; extern ub1 tab[];
#define PHASHLEN 0x80 /* length of hash mapping table */ #define PHASHLEN 0x80 /* length of hash mapping table */
#define PHASHNKEYS 133 /* How many keys were hashed */ #define PHASHNKEYS 134 /* How many keys were hashed */
#define PHASHRANGE 256 /* Range any input might map to */ #define PHASHRANGE 256 /* Range any input might map to */
#define PHASHSALT 0x9e3779b9 /* internal, initialize normal hash */ #define PHASHSALT 0x9e3779b9 /* internal, initialize normal hash */

View file

@ -232,31 +232,53 @@ struct fn_list {
* Enumerated type constants for file type identification in pc_archive. * Enumerated type constants for file type identification in pc_archive.
*/ */
typedef enum { typedef enum {
/*
* Primary Types.
*/
TYPE_UNKNOWN = 0, TYPE_UNKNOWN = 0,
TYPE_TEXT = 1, TYPE_TEXT = 1,
TYPE_BINARY = 2, TYPE_BINARY = 2,
TYPE_COMPRESSED = 4, TYPE_COMPRESSED = 4,
/*
* Sub-types.
*/
TYPE_EXE = 8, TYPE_EXE = 8,
TYPE_JPEG = 12, TYPE_CMP_MAX = 16,
TYPE_MARKUP = 16, TYPE_MARKUP = 24,
TYPE_COMPRESSED_GZ = 20, TYPE_COMPRESSED_GZ = 32,
TYPE_COMPRESSED_LZW = 24, TYPE_COMPRESSED_LZW = 40,
TYPE_COMPRESSED_BZ2 = 28, TYPE_COMPRESSED_BZ2 = 48,
TYPE_COMPRESSED_ZIP = 32, TYPE_COMPRESSED_ZIP = 56,
TYPE_COMPRESSED_ARJ = 36, TYPE_COMPRESSED_ARJ = 64,
TYPE_COMPRESSED_ARC = 40, TYPE_COMPRESSED_ARC = 72,
TYPE_COMPRESSED_LH = 44, TYPE_COMPRESSED_LH = 80,
TYPE_COMPRESSED_LZMA = 48, TYPE_COMPRESSED_LZMA = 88,
TYPE_COMPRESSED_LZO = 52, TYPE_COMPRESSED_LZO = 96,
TYPE_COMPRESSED_UHARC = 56, TYPE_COMPRESSED_UHARC = 104,
TYPE_COMPRESSED_ALZ = 60, TYPE_COMPRESSED_ALZ = 112,
TYPE_COMPRESSED_ACE = 64, TYPE_COMPRESSED_ACE = 120,
TYPE_COMPRESSED_RAR = 68, TYPE_COMPRESSED_RAR = 128,
TYPE_COMPRESSED_LZ = 72, TYPE_COMPRESSED_LZ = 136,
TYPE_COMPRESSED_PPMD = 76, TYPE_COMPRESSED_PPMD = 144,
TYPE_COMPRESSED_ZPAQ = 80 TYPE_COMPRESSED_ZPAQ = 152
} data_type_t; } data_type_t;
/*
* Type identifier is an int with the following format.
*
*           Sub Type            Primary Type
*        (Numeric Value)   (Bit Positions - Flags)
*  _____________|_____________     ___|___
* |                           |   |       |
* .---------------------------------------.
* |   |   |   |   |   |   |   |   |   |   |
* Bit 10                              Bit 0
*/
#define PC_TYPE_MASK 0x7
#define PC_SUBTYPE_MASK 0x7f8
#define PC_SUBTYPE(x) ((x) & PC_SUBTYPE_MASK)
#define PC_TYPE(x) ((x) & PC_TYPE_MASK)
#ifndef _IN_UTILS_ #ifndef _IN_UTILS_
extern processor_info_t proc_info; extern processor_info_t proc_info;
#endif #endif

View file

@ -152,6 +152,19 @@ zlib_compress(void *src, uint64_t srclen, void *dst, uint64_t *dstlen,
uchar_t *src1 = (uchar_t *)src; uchar_t *src1 = (uchar_t *)src;
z_stream *zs = (z_stream *)data; z_stream *zs = (z_stream *)data;
/*
* If the data is known to be compressed, then certain less tightly compressed
* types can still be compressed again for a possible gain. For the others it
* is a waste of time.
*/
if (PC_TYPE(btype) == TYPE_COMPRESSED && level < 7) {
int subtype = PC_SUBTYPE(btype);
if (subtype != TYPE_COMPRESSED_LZW &&
subtype != TYPE_COMPRESSED_LZ && subtype != TYPE_COMPRESSED_LZO) {
return (-1);
}
}
ending = 0; ending = 0;
while (_srclen > 0) { while (_srclen > 0) {
if (_srclen > SINGLE_CALL_MAX) { if (_srclen > SINGLE_CALL_MAX) {