From 6aacd903ffe90607bef360b176caa93aa6457f7c Mon Sep 17 00:00:00 2001 From: Moinak Ghosh Date: Sat, 9 Nov 2013 16:46:19 +0530 Subject: [PATCH] Structured handling of file types. Handling of already compressed data based on compression algorithm. Add a few more extension types. --- adaptive_compress.c | 6 ++-- archive/pc_archive.c | 5 ++-- bzip2_compress.c | 16 +++++++---- libbsc_compress.c | 8 ++++-- lz4_compress.c | 6 ++++ lzfx_compress.c | 6 ++++ lzma_compress.c | 2 +- pcompress.c | 5 ++-- ppmd_compress.c | 3 +- utils/phash/extensions.h | 7 +++-- utils/phash/extensions.txt | 7 +++-- utils/phash/phash.c | 12 ++++---- utils/phash/phash.h | 2 +- utils/utils.h | 58 ++++++++++++++++++++++++++------------ zlib_compress.c | 13 +++++++++ 15 files changed, 107 insertions(+), 49 deletions(-) diff --git a/adaptive_compress.c b/adaptive_compress.c index 60d2463..abf7828 100644 --- a/adaptive_compress.c +++ b/adaptive_compress.c @@ -229,14 +229,14 @@ adapt_compress(void *src, uint64_t srclen, void *dst, * Use PPMd if some percentage of source is 7-bit textual bytes, otherwise * use Bzip2 or LZMA. 
*/ - if (adat->adapt_mode == 2 && (btype & TYPE_BINARY)) { + if (adat->adapt_mode == 2 && (PC_TYPE(btype) == TYPE_BINARY)) { rv = lzma_compress(src, srclen, dst, dstlen, level, chdr, btype, adat->lzma_data); if (rv < 0) return (rv); rv = ADAPT_COMPRESS_LZMA; lzma_count++; - } else if (adat->adapt_mode == 1 && (btype & TYPE_BINARY)) { + } else if (adat->adapt_mode == 1 && (PC_TYPE(btype) == TYPE_BINARY)) { rv = bzip2_compress(src, srclen, dst, dstlen, level, chdr, btype, NULL); if (rv < 0) return (rv); @@ -245,7 +245,7 @@ adapt_compress(void *src, uint64_t srclen, void *dst, } else { #ifdef ENABLE_PC_LIBBSC - if (adat->bsc_data && (btype & TYPE_MARKUP)) { + if (adat->bsc_data && PC_SUBTYPE(btype) == TYPE_MARKUP) { rv = libbsc_compress(src, srclen, dst, dstlen, level, chdr, btype, adat->bsc_data); if (rv < 0) return (rv); diff --git a/archive/pc_archive.c b/archive/pc_archive.c index 15c7d1c..a8f9f1d 100644 --- a/archive/pc_archive.c +++ b/archive/pc_archive.c @@ -942,6 +942,7 @@ archiver_thread_func(void *dat) { continue; } + typ = TYPE_UNKNOWN; if (archive_entry_filetype(entry) == AE_IFREG) { if ((typ = detect_type_by_ext(fpath, fpathlen)) != TYPE_UNKNOWN) pctx->ctype = typ; @@ -1248,9 +1249,9 @@ detect_type_by_data(uchar_t *buf, size_t len) if (buf[0] == 0xe9) return (TYPE_BINARY|TYPE_EXE); // MSDOS COM if (U32_P(buf) == TZSHORT) - return (TYPE_BINARY|TYPE_BINARY); // Timezone data + return (TYPE_BINARY); // Timezone data if (U32_P(buf) == PPMSHORT) - return (TYPE_BINARY|TYPE_COMPRESSED); // PPM Compressed archive + return (TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_PPMD); // PPM Compressed archive return (TYPE_UNKNOWN); } diff --git a/bzip2_compress.c b/bzip2_compress.c index 4306650..86b4537 100644 --- a/bzip2_compress.c +++ b/bzip2_compress.c @@ -174,12 +174,16 @@ bzip2_decompress(void *src, uint64_t srclen, void *dst, uint64_t *dstlen, char *dst1 = (char *)dst; char *src1 = (char *)src; - if (btype & TYPE_COMPRESSED) { - if ((btype & TYPE_COMPRESSED_LZW) != 
TYPE_COMPRESSED_LZW && - (btype & TYPE_COMPRESSED_GZ) != TYPE_COMPRESSED_GZ && - (btype & TYPE_COMPRESSED_LZ) != TYPE_COMPRESSED_LZ && - (btype & TYPE_COMPRESSED_LZO) != TYPE_COMPRESSED_LZO) - { + /* + * If the data is known to be compressed then certain types of weakly compressed data + * can be compressed again for a possible gain. For others it is + * a waste of time. + */ + if (PC_TYPE(btype) == TYPE_COMPRESSED && level < 7) { + int subtype = PC_SUBTYPE(btype); + + if (subtype != TYPE_COMPRESSED_LZW && subtype != TYPE_COMPRESSED_GZ && + subtype != TYPE_COMPRESSED_LZ && subtype != TYPE_COMPRESSED_LZO) { return (-1); } } diff --git a/libbsc_compress.c b/libbsc_compress.c index f259096..0eeb104 100644 --- a/libbsc_compress.c +++ b/libbsc_compress.c @@ -153,9 +153,11 @@ libbsc_compress(void *src, uint64_t srclen, void *dst, uint64_t *dstlen, int rv; struct libbsc_params *bscdat = (struct libbsc_params *)data; - if ((btype & TYPE_COMPRESSED_BZ2) == TYPE_COMPRESSED_BZ2 || - (btype & TYPE_COMPRESSED_LZMA) == TYPE_COMPRESSED_LZMA) - return (-1); + if (PC_TYPE(btype) == TYPE_COMPRESSED) { + int subtype = PC_SUBTYPE(btype); + if (subtype == TYPE_COMPRESSED_BZ2 || subtype == TYPE_COMPRESSED_LZMA) + return (-1); + } rv = bsc_compress(src, dst, srclen, bscdat->lzpHashSize, bscdat->lzpMinLen, LIBBSC_BLOCKSORTER_BWT, bscdat->bscCoder, bscdat->features); diff --git a/lz4_compress.c b/lz4_compress.c index b0e5a36..5ac8574 100644 --- a/lz4_compress.c +++ b/lz4_compress.c @@ -106,6 +106,12 @@ lz4_compress(void *src, uint64_t srclen, void *dst, uint64_t *dstlen, int _srclen = srclen; uchar_t *dst2; + /* + * Ignore compressed data in fast modes. 
+ */ + if (lzdat->level < 3 && PC_TYPE(btype) == TYPE_COMPRESSED) + return (-1); + if (lzdat->level == 1) { rv = LZ4_compress((const char *)src, (char *)dst, _srclen); diff --git a/lzfx_compress.c b/lzfx_compress.c index f687a74..fd6f61e 100644 --- a/lzfx_compress.c +++ b/lzfx_compress.c @@ -111,6 +111,12 @@ lz_fx_compress(void *src, uint64_t srclen, void *dst, uint64_t *dstlen, unsigned int _srclen = srclen; unsigned int _dstlen = *dstlen; + /* + * Ignore compressed data in fast modes. + */ + if (level < 7 && PC_TYPE(btype) == TYPE_COMPRESSED) + return (-1); + rv = lzfx_compress(src, _srclen, dst, &_dstlen, lzdat->htab_bits); if (rv != 0) { if (rv != LZFX_ESIZE) diff --git a/lzma_compress.c b/lzma_compress.c index 8ddf643..c016820 100644 --- a/lzma_compress.c +++ b/lzma_compress.c @@ -211,7 +211,7 @@ lzma_compress(void *src, uint64_t srclen, void *dst, return (-1); } - if ((btype & TYPE_COMPRESSED_LZMA) == TYPE_COMPRESSED_LZMA) + if (PC_SUBTYPE(btype) == TYPE_COMPRESSED_LZMA) return (-1); props->level = level; diff --git a/pcompress.c b/pcompress.c index 29385b2..242b5cd 100644 --- a/pcompress.c +++ b/pcompress.c @@ -1544,12 +1544,13 @@ plain_index: o_chunksize = _chunksize; /* Compress data chunk. 
*/ - if ((pctx->lzp_preprocess || pctx->enable_delta2_encode) && _chunksize > 0) { + if ((pctx->lzp_preprocess || pctx->enable_delta2_encode) && _chunksize > 0 && + PC_SUBTYPE(pctx->btype) == TYPE_CMP_MAX) { rv = preproc_compress(pctx, tdat->compress, tdat->uncompressed_chunk + dedupe_index_sz, _chunksize, compressed_chunk + index_size_cmp, &_chunksize, tdat->level, 0, pctx->btype, tdat->data, tdat->props); - } else if (_chunksize > 0) { + } else if (_chunksize > 0 && PC_SUBTYPE(pctx->btype) == TYPE_CMP_MAX) { DEBUG_STAT_EN(double strt, en); DEBUG_STAT_EN(strt = get_wtime_millis()); diff --git a/ppmd_compress.c b/ppmd_compress.c index 3552a45..fafc362 100644 --- a/ppmd_compress.c +++ b/ppmd_compress.c @@ -114,8 +114,9 @@ ppmd_compress(void *src, uint64_t srclen, void *dst, CPpmd8 *_ppmd = (CPpmd8 *)data; uchar_t *_src = (uchar_t *)src; - if (btype & TYPE_COMPRESSED) + if (PC_TYPE(btype) == TYPE_COMPRESSED) return (-1); + Ppmd8_RangeEnc_Init(_ppmd); Ppmd8_Init(_ppmd, _ppmd->Order, PPMD8_RESTORE_METHOD_RESTART); _ppmd->buf = (Byte *)dst; diff --git a/utils/phash/extensions.h b/utils/phash/extensions.h index b0fc4c0..c4a5b49 100644 --- a/utils/phash/extensions.h +++ b/utils/phash/extensions.h @@ -99,8 +99,8 @@ struct ext_entry { {"xpi" , TYPE_BINARY|TYPE_EXE, 3}, {"off" , TYPE_BINARY|TYPE_EXE, 3}, {"pdf" , TYPE_BINARY, 3}, - {"jpg" , TYPE_BINARY|TYPE_COMPRESSED|TYPE_JPEG, 3}, - {"jpeg" , TYPE_BINARY|TYPE_COMPRESSED|TYPE_JPEG, 4}, + {"jpg" , TYPE_BINARY|TYPE_COMPRESSED|TYPE_CMP_MAX, 3}, + {"jpeg" , TYPE_BINARY|TYPE_COMPRESSED|TYPE_CMP_MAX, 4}, {"png" , TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_GZ, 3}, {"mp3" , TYPE_BINARY|TYPE_COMPRESSED, 3}, {"wma" , TYPE_BINARY|TYPE_COMPRESSED, 3}, @@ -114,7 +114,8 @@ struct ext_entry { {"flac" , TYPE_BINARY|TYPE_COMPRESSED, 4}, {"pac" , TYPE_BINARY|TYPE_COMPRESSED, 3}, {"gif" , TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_LZW, 3}, - {"jp2" , TYPE_BINARY|TYPE_COMPRESSED|TYPE_JPEG, 3}, + {"jp2" , 
TYPE_BINARY|TYPE_COMPRESSED|TYPE_CMP_MAX, 3}, + {"pjg" , TYPE_BINARY|TYPE_COMPRESSED|TYPE_CMP_MAX, 3}, {"gz" , TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_GZ, 2}, {"tgz" , TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_GZ, 3}, {"bz2" , TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_BZ2, 3}, diff --git a/utils/phash/extensions.txt b/utils/phash/extensions.txt index 0f58bd1..a349510 100644 --- a/utils/phash/extensions.txt +++ b/utils/phash/extensions.txt @@ -89,8 +89,8 @@ com,TYPE_BINARY|TYPE_EXE xpi,TYPE_BINARY|TYPE_EXE off,TYPE_BINARY|TYPE_EXE pdf,TYPE_BINARY -jpg,TYPE_BINARY|TYPE_COMPRESSED|TYPE_JPEG -jpeg,TYPE_BINARY|TYPE_COMPRESSED|TYPE_JPEG +jpg,TYPE_BINARY|TYPE_COMPRESSED|TYPE_CMP_MAX +jpeg,TYPE_BINARY|TYPE_COMPRESSED|TYPE_CMP_MAX png,TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_GZ mp3,TYPE_BINARY|TYPE_COMPRESSED wma,TYPE_BINARY|TYPE_COMPRESSED @@ -104,7 +104,8 @@ ofr,TYPE_BINARY|TYPE_COMPRESSED flac,TYPE_BINARY|TYPE_COMPRESSED pac,TYPE_BINARY|TYPE_COMPRESSED gif,TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_LZW -jp2,TYPE_BINARY|TYPE_COMPRESSED|TYPE_JPEG +jp2,TYPE_BINARY|TYPE_COMPRESSED|TYPE_CMP_MAX +pjg,TYPE_BINARY|TYPE_COMPRESSED|TYPE_CMP_MAX gz,TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_GZ tgz,TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_GZ bz2,TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_BZ2 diff --git a/utils/phash/phash.c b/utils/phash/phash.c index 5e2f340..b235b11 100644 --- a/utils/phash/phash.c +++ b/utils/phash/phash.c @@ -13,12 +13,12 @@ /* small adjustments to _a_ to make values distinct */ ub1 tab[] = { 125,0,0,82,113,0,125,85,113,0,0,7,0,0,125,0, -0,0,7,87,0,0,82,0,0,88,0,7,0,85,125,85, -0,113,0,0,85,0,0,113,0,113,124,125,0,125,0,0, -113,0,11,113,125,0,0,0,0,85,113,85,22,0,0,125, -0,113,0,0,113,0,82,0,125,111,87,88,69,125,113,0, -124,0,7,22,113,22,0,235,0,120,120,125,113,0,74,120, -0,124,87,7,0,127,0,0,11,85,85,146,115,11,183,146, +0,0,7,87,0,0,82,0,0,125,0,7,0,85,125,85, +0,113,0,0,85,0,0,113,0,113,124,0,0,125,0,0, 
+113,0,11,113,125,7,0,0,0,85,113,85,22,0,0,125, +0,113,0,0,113,0,82,0,125,111,87,124,69,183,113,0, +124,0,7,22,120,22,0,127,0,120,51,125,0,0,74,120, +0,124,87,113,0,127,0,0,11,85,85,146,69,11,183,146, 0,0,88,0,0,85,42,0,171,0,0,0,0,83,0,0, }; diff --git a/utils/phash/phash.h b/utils/phash/phash.h index 74bd726..e422e47 100644 --- a/utils/phash/phash.h +++ b/utils/phash/phash.h @@ -8,7 +8,7 @@ extern ub1 tab[]; #define PHASHLEN 0x80 /* length of hash mapping table */ -#define PHASHNKEYS 133 /* How many keys were hashed */ +#define PHASHNKEYS 134 /* How many keys were hashed */ #define PHASHRANGE 256 /* Range any input might map to */ #define PHASHSALT 0x9e3779b9 /* internal, initialize normal hash */ diff --git a/utils/utils.h b/utils/utils.h index 3a92355..464581d 100644 --- a/utils/utils.h +++ b/utils/utils.h @@ -232,31 +232,53 @@ struct fn_list { * Enumerated type constants for file type identification in pc_archive. */ typedef enum { + /* + * Primary Types. + */ TYPE_UNKNOWN = 0, TYPE_TEXT = 1, TYPE_BINARY = 2, TYPE_COMPRESSED = 4, + /* + * Sub-types. 
+ */ TYPE_EXE = 8, - TYPE_JPEG = 12, - TYPE_MARKUP = 16, - TYPE_COMPRESSED_GZ = 20, - TYPE_COMPRESSED_LZW = 24, - TYPE_COMPRESSED_BZ2 = 28, - TYPE_COMPRESSED_ZIP = 32, - TYPE_COMPRESSED_ARJ = 36, - TYPE_COMPRESSED_ARC = 40, - TYPE_COMPRESSED_LH = 44, - TYPE_COMPRESSED_LZMA = 48, - TYPE_COMPRESSED_LZO = 52, - TYPE_COMPRESSED_UHARC = 56, - TYPE_COMPRESSED_ALZ = 60, - TYPE_COMPRESSED_ACE = 64, - TYPE_COMPRESSED_RAR = 68, - TYPE_COMPRESSED_LZ = 72, - TYPE_COMPRESSED_PPMD = 76, - TYPE_COMPRESSED_ZPAQ = 80 + TYPE_CMP_MAX = 16, + TYPE_MARKUP = 24, + TYPE_COMPRESSED_GZ = 32, + TYPE_COMPRESSED_LZW = 40, + TYPE_COMPRESSED_BZ2 = 48, + TYPE_COMPRESSED_ZIP = 56, + TYPE_COMPRESSED_ARJ = 64, + TYPE_COMPRESSED_ARC = 72, + TYPE_COMPRESSED_LH = 80, + TYPE_COMPRESSED_LZMA = 88, + TYPE_COMPRESSED_LZO = 96, + TYPE_COMPRESSED_UHARC = 104, + TYPE_COMPRESSED_ALZ = 112, + TYPE_COMPRESSED_ACE = 120, + TYPE_COMPRESSED_RAR = 128, + TYPE_COMPRESSED_LZ = 136, + TYPE_COMPRESSED_PPMD = 144, + TYPE_COMPRESSED_ZPAQ = 152 } data_type_t; +/* + * Type identifier is an int with the following format. + * + * Sub Type Primary Type + * (Numeric Value) (Bit Positions - Flags) + * _____________|_____________ ___|___ + * | | | | + * .---------------------------------------. + * | | | | | | | | | | | + * Bit 10 Bit 0 + */ +#define PC_TYPE_MASK 0x7 +#define PC_SUBTYPE_MASK 0x7f8 +#define PC_SUBTYPE(x) ((x) & PC_SUBTYPE_MASK) +#define PC_TYPE(x) ((x) & PC_TYPE_MASK) + #ifndef _IN_UTILS_ extern processor_info_t proc_info; #endif diff --git a/zlib_compress.c b/zlib_compress.c index dfd7ced..fddbc07 100644 --- a/zlib_compress.c +++ b/zlib_compress.c @@ -152,6 +152,19 @@ zlib_compress(void *src, uint64_t srclen, void *dst, uint64_t *dstlen, uchar_t *src1 = (uchar_t *)src; z_stream *zs = (z_stream *)data; + /* + * If the data is known to be compressed then certain types of weakly compressed data + * can be compressed again for a possible gain. For others it is + * a waste of time. 
+ */ + if (PC_TYPE(btype) == TYPE_COMPRESSED && level < 7) { + int subtype = PC_SUBTYPE(btype); + + if (subtype != TYPE_COMPRESSED_LZW && + subtype != TYPE_COMPRESSED_LZ && subtype != TYPE_COMPRESSED_LZO) { + return (-1); + } + } ending = 0; while (_srclen > 0) { if (_srclen > SINGLE_CALL_MAX) {