From cae9de9b2ec4232caf02053074bb39afe417de24 Mon Sep 17 00:00:00 2001 From: Moinak Ghosh Date: Fri, 8 Nov 2013 23:50:28 +0530 Subject: [PATCH] Leverage file type detection(archiver) to improve compression performance. Use detected file/data type(archiver) for Adaptive compression modes. Update type flags and add more extensions. --- adaptive_compress.c | 104 ++++++++++++++++------------- archive/pc_archive.c | 18 +++-- archive/pc_archive.h | 13 +--- bzip2_compress.c | 13 +++- libbsc_compress.c | 8 ++- lz4_compress.c | 4 +- lzfx_compress.c | 4 +- lzma_compress.c | 7 +- none_compress.c | 4 +- pcompress.c | 44 ++++++++----- pcompress.h | 36 +++++----- ppmd_compress.c | 6 +- utils/phash/extensions.h | 128 ++++++++++++++++++++---------------- utils/phash/extensions.txt | 130 +++++++++++++++++++++---------------- utils/phash/genhash.sh | 5 +- utils/phash/phash.c | 14 ++-- utils/phash/phash.h | 6 +- utils/utils.h | 31 ++++++++- zlib_compress.c | 4 +- 19 files changed, 340 insertions(+), 239 deletions(-) diff --git a/adaptive_compress.c b/adaptive_compress.c index 3532f21..60d2463 100644 --- a/adaptive_compress.c +++ b/adaptive_compress.c @@ -35,6 +35,7 @@ #include #include #include +#include #define FIFTY_PCT(x) (((x)/10) * 5) #define FORTY_PCT(x) (((x)/10) * 4) @@ -46,22 +47,22 @@ static unsigned int bsc_count = 0; static unsigned int ppmd_count = 0; extern int lzma_compress(void *src, uint64_t srclen, void *dst, - uint64_t *destlen, int level, uchar_t chdr, void *data); + uint64_t *destlen, int level, uchar_t chdr, int btype, void *data); extern int bzip2_compress(void *src, uint64_t srclen, void *dst, - uint64_t *destlen, int level, uchar_t chdr, void *data); + uint64_t *destlen, int level, uchar_t chdr, int btype, void *data); extern int ppmd_compress(void *src, uint64_t srclen, void *dst, - uint64_t *dstlen, int level, uchar_t chdr, void *data); + uint64_t *dstlen, int level, uchar_t chdr, int btype, void *data); extern int libbsc_compress(void *src, uint64_t 
srclen, void *dst, - uint64_t *dstlen, int level, uchar_t chdr, void *data); + uint64_t *dstlen, int level, uchar_t chdr, int btype, void *data); extern int lzma_decompress(void *src, uint64_t srclen, void *dst, - uint64_t *dstlen, int level, uchar_t chdr, void *data); + uint64_t *dstlen, int level, uchar_t chdr, int btype, void *data); extern int bzip2_decompress(void *src, uint64_t srclen, void *dst, - uint64_t *dstlen, int level, uchar_t chdr, void *data); + uint64_t *dstlen, int level, uchar_t chdr, int btype, void *data); extern int ppmd_decompress(void *src, uint64_t srclen, void *dst, - uint64_t *dstlen, int level, uchar_t chdr, void *data); + uint64_t *dstlen, int level, uchar_t chdr, int btype, void *data); extern int libbsc_decompress(void *src, uint64_t srclen, void *dst, - uint64_t *dstlen, int level, uchar_t chdr, void *data); + uint64_t *dstlen, int level, uchar_t chdr, int btype, void *data); extern int lzma_init(void **data, int *level, int nthreads, uint64_t chunksize, int file_version, compress_op_t op); @@ -180,51 +181,63 @@ adapt_deinit(void **data) int adapt_compress(void *src, uint64_t srclen, void *dst, - uint64_t *dstlen, int level, uchar_t chdr, void *data) + uint64_t *dstlen, int level, uchar_t chdr, int btype, void *data) { struct adapt_data *adat = (struct adapt_data *)(data); uchar_t *src1 = (uchar_t *)src; - uint64_t i, tot8b, tag1, tag2, tag3; int rv = 0; - double tagcnt, pct_tag; - uchar_t cur_byte, prev_byte; - /* - * Count number of 8-bit binary bytes and XML tags in source. 
- */ - tot8b = 0; - tag1 = 0; - tag2 = 0; - tag3 = 0; - prev_byte = cur_byte = 0; - for (i = 0; i < srclen; i++) { - cur_byte = src1[i]; - tot8b += (cur_byte & 0x80); // This way for possible auto-vectorization - tag1 += (cur_byte == '<'); - tag2 += (cur_byte == '>'); - tag3 += ((prev_byte == '<') & (cur_byte == '/')); - tag3 += ((prev_byte == '/') & (cur_byte == '>')); - if (cur_byte != ' ') - prev_byte = cur_byte; + if (btype == TYPE_UNKNOWN) { + uint64_t i, tot8b, tag1, tag2, tag3; + double tagcnt, pct_tag; + uchar_t cur_byte, prev_byte; + /* + * Count number of 8-bit binary bytes and XML tags in source. + */ + tot8b = 0; + tag1 = 0; + tag2 = 0; + tag3 = 0; + prev_byte = cur_byte = 0; + for (i = 0; i < srclen; i++) { + cur_byte = src1[i]; + tot8b += (cur_byte & 0x80); // This way for possible auto-vectorization + tag1 += (cur_byte == '<'); + tag2 += (cur_byte == '>'); + tag3 += ((prev_byte == '<') & (cur_byte == '/')); + tag3 += ((prev_byte == '/') & (cur_byte == '>')); + if (cur_byte != ' ') + prev_byte = cur_byte; + } + + tot8b /= 0x80; + tagcnt = tag1 + tag2 + tag3; + pct_tag = tagcnt / (double)srclen; + if (adat->adapt_mode == 2 && tot8b > FORTY_PCT(srclen)) { + btype = TYPE_BINARY; + } else if (adat->adapt_mode == 1 && tot8b > FIFTY_PCT(srclen)) { + btype = TYPE_BINARY; + } else { + btype = TYPE_TEXT; + if (tag1 > tag2 - 4 && tag1 < tag2 + 4 && tag3 > (double)tag1 * 0.40 && + tagcnt > (double)srclen * 0.001) + btype |= TYPE_MARKUP; + } } - tot8b /= 0x80; - tagcnt = tag1 + tag2 + tag3; - pct_tag = tagcnt / (double)srclen; - /* * Use PPMd if some percentage of source is 7-bit textual bytes, otherwise * use Bzip2 or LZMA. 
*/ - if (adat->adapt_mode == 2 && tot8b > FORTY_PCT(srclen)) { - rv = lzma_compress(src, srclen, dst, dstlen, level, chdr, adat->lzma_data); + if (adat->adapt_mode == 2 && (btype & TYPE_BINARY)) { + rv = lzma_compress(src, srclen, dst, dstlen, level, chdr, btype, adat->lzma_data); if (rv < 0) return (rv); rv = ADAPT_COMPRESS_LZMA; lzma_count++; - } else if (adat->adapt_mode == 1 && tot8b > FIFTY_PCT(srclen)) { - rv = bzip2_compress(src, srclen, dst, dstlen, level, chdr, NULL); + } else if (adat->adapt_mode == 1 && (btype & TYPE_BINARY)) { + rv = bzip2_compress(src, srclen, dst, dstlen, level, chdr, btype, NULL); if (rv < 0) return (rv); rv = ADAPT_COMPRESS_BZIP2; @@ -232,16 +245,15 @@ adapt_compress(void *src, uint64_t srclen, void *dst, } else { #ifdef ENABLE_PC_LIBBSC - if (adat->bsc_data && tag1 > tag2 - 4 && tag1 < tag2 + 4 && tag3 > (double)tag1 * 0.40 && - tagcnt > (double)srclen * 0.001) { - rv = libbsc_compress(src, srclen, dst, dstlen, level, chdr, adat->bsc_data); + if (adat->bsc_data && (btype & TYPE_MARKUP)) { + rv = libbsc_compress(src, srclen, dst, dstlen, level, chdr, btype, adat->bsc_data); if (rv < 0) return (rv); rv = ADAPT_COMPRESS_BSC; bsc_count++; } else { #endif - rv = ppmd_compress(src, srclen, dst, dstlen, level, chdr, adat->ppmd_data); + rv = ppmd_compress(src, srclen, dst, dstlen, level, chdr, btype, adat->ppmd_data); if (rv < 0) return (rv); rv = ADAPT_COMPRESS_PPMD; @@ -256,7 +268,7 @@ adapt_compress(void *src, uint64_t srclen, void *dst, int adapt_decompress(void *src, uint64_t srclen, void *dst, - uint64_t *dstlen, int level, uchar_t chdr, void *data) + uint64_t *dstlen, int level, uchar_t chdr, int btype, void *data) { struct adapt_data *adat = (struct adapt_data *)(data); uchar_t cmp_flags; @@ -264,17 +276,17 @@ adapt_decompress(void *src, uint64_t srclen, void *dst, cmp_flags = (chdr>>4) & CHDR_ALGO_MASK; if (cmp_flags == ADAPT_COMPRESS_LZMA) { - return (lzma_decompress(src, srclen, dst, dstlen, level, chdr, adat->lzma_data)); + 
return (lzma_decompress(src, srclen, dst, dstlen, level, chdr, btype, adat->lzma_data)); } else if (cmp_flags == ADAPT_COMPRESS_BZIP2) { - return (bzip2_decompress(src, srclen, dst, dstlen, level, chdr, NULL)); + return (bzip2_decompress(src, srclen, dst, dstlen, level, chdr, btype, NULL)); } else if (cmp_flags == ADAPT_COMPRESS_PPMD) { - return (ppmd_decompress(src, srclen, dst, dstlen, level, chdr, adat->ppmd_data)); + return (ppmd_decompress(src, srclen, dst, dstlen, level, chdr, btype, adat->ppmd_data)); } else if (cmp_flags == ADAPT_COMPRESS_BSC) { #ifdef ENABLE_PC_LIBBSC - return (libbsc_decompress(src, srclen, dst, dstlen, level, chdr, adat->bsc_data)); + return (libbsc_decompress(src, srclen, dst, dstlen, level, chdr, btype, adat->bsc_data)); #else log_msg(LOG_ERR, 0, "Cannot decompress chunk. Libbsc support not present.\n"); return (-1); diff --git a/archive/pc_archive.c b/archive/pc_archive.c index ca0d790..15c7d1c 100644 --- a/archive/pc_archive.c +++ b/archive/pc_archive.c @@ -229,8 +229,6 @@ archiver_read(void *ctx, void *buf, uint64_t count) sem_post(&(pctx->write_sem)); sem_wait(&(pctx->read_sem)); pctx->arc_buf = NULL; - if (pctx->btype == TYPE_UNKNOWN) - pctx->btype = TYPE_GENERIC; return (pctx->arc_buf_pos); } @@ -1166,9 +1164,9 @@ init_archive_mod() { if (!inited) { int i, j; - exthtab = malloc(NUM_EXT * sizeof (struct ext_hash_entry)); + exthtab = malloc(PHASHNKEYS * sizeof (struct ext_hash_entry)); if (exthtab != NULL) { - for (i = 0; i < NUM_EXT; i++) { + for (i = 0; i < PHASHNKEYS; i++) { uint64_t extnum; ub4 slot = phash(extlist[i].ext, extlist[i].len); extnum = 0; @@ -1211,7 +1209,7 @@ detect_type_by_ext(char *path, int pathlen) if (len == 0) goto out; // If extension is empty give up ext = &path[i+1]; slot = phash(ext, len); - if (slot > NUM_EXT) goto out; // Extension maps outside hash table range, give up + if (slot >= PHASHNKEYS) goto out; // Extension maps outside hash table range, give up extnum = 0; /* @@ -1244,15 +1242,15 @@
detect_type_by_data(uchar_t *buf, size_t len) if (len < 16) return (TYPE_UNKNOWN); if (U32_P(buf) == ELFSHORT) - return (TYPE_EXE); // Regular ELF + return (TYPE_BINARY|TYPE_EXE); // Regular ELF if ((buf[0] == 'M' || buf[0] == 'L') && buf[1] == 'Z') - return (TYPE_EXE); // MSDOS Exe + return (TYPE_BINARY|TYPE_EXE); // MSDOS Exe if (buf[0] == 0xe9) - return (TYPE_EXE); // MSDOS COM + return (TYPE_BINARY|TYPE_EXE); // MSDOS COM if (U32_P(buf) == TZSHORT) - return (TYPE_BINARY); // Timezone data + return (TYPE_BINARY|TYPE_BINARY); // Timezone data if (U32_P(buf) == PPMSHORT) - return (TYPE_COMPRESSED); // PPM Compressed archive + return (TYPE_BINARY|TYPE_COMPRESSED); // PPM Compressed archive return (TYPE_UNKNOWN); } diff --git a/archive/pc_archive.h b/archive/pc_archive.h index 3d6bb36..1a71ca4 100644 --- a/archive/pc_archive.h +++ b/archive/pc_archive.h @@ -26,6 +26,9 @@ #ifndef _ARCHIVE_H #define _ARCHIVE_H +#include +#include +#include #include #ifdef __cplusplus @@ -38,16 +41,6 @@ typedef struct { size_t size; } archive_list_entry_t; -typedef enum { - TYPE_UNKNOWN = 0, - TYPE_GENERIC, - TYPE_COMPRESSED, - TYPE_EXE, - TYPE_TEXT, - TYPE_BINARY, - TYPE_JPEG -} data_type_t; - /* * Archiving related functions. 
*/ diff --git a/bzip2_compress.c b/bzip2_compress.c index 2bb5f52..4306650 100644 --- a/bzip2_compress.c +++ b/bzip2_compress.c @@ -95,7 +95,7 @@ bzerr(int err) int bzip2_compress(void *src, uint64_t srclen, void *dst, uint64_t *dstlen, - int level, uchar_t chdr, void *data) + int level, uchar_t chdr, int btype, void *data) { bz_stream bzs; int ret, ending; @@ -164,7 +164,7 @@ bzip2_compress(void *src, uint64_t srclen, void *dst, uint64_t *dstlen, int bzip2_decompress(void *src, uint64_t srclen, void *dst, uint64_t *dstlen, - int level, uchar_t chdr, void *data) + int level, uchar_t chdr, int btype, void *data) { bz_stream bzs; int ret; @@ -174,6 +174,15 @@ bzip2_decompress(void *src, uint64_t srclen, void *dst, uint64_t *dstlen, char *dst1 = (char *)dst; char *src1 = (char *)src; + if (btype & TYPE_COMPRESSED) { + if ((btype & TYPE_COMPRESSED_LZW) != TYPE_COMPRESSED_LZW && + (btype & TYPE_COMPRESSED_GZ) != TYPE_COMPRESSED_GZ && + (btype & TYPE_COMPRESSED_LZ) != TYPE_COMPRESSED_LZ && + (btype & TYPE_COMPRESSED_LZO) != TYPE_COMPRESSED_LZO) + { + return (-1); + } + } bzs.bzalloc = slab_alloc_i; bzs.bzfree = slab_free; bzs.opaque = NULL; diff --git a/libbsc_compress.c b/libbsc_compress.c index 869aff0..f259096 100644 --- a/libbsc_compress.c +++ b/libbsc_compress.c @@ -148,11 +148,15 @@ libbsc_deinit(void **data) int libbsc_compress(void *src, uint64_t srclen, void *dst, uint64_t *dstlen, - int level, uchar_t chdr, void *data) + int level, uchar_t chdr, int btype, void *data) { int rv; struct libbsc_params *bscdat = (struct libbsc_params *)data; + if ((btype & TYPE_COMPRESSED_BZ2) == TYPE_COMPRESSED_BZ2 || + (btype & TYPE_COMPRESSED_LZMA) == TYPE_COMPRESSED_LZMA) + return (-1); + rv = bsc_compress(src, dst, srclen, bscdat->lzpHashSize, bscdat->lzpMinLen, LIBBSC_BLOCKSORTER_BWT, bscdat->bscCoder, bscdat->features); if (rv < 0) { @@ -165,7 +169,7 @@ libbsc_compress(void *src, uint64_t srclen, void *dst, uint64_t *dstlen, int libbsc_decompress(void *src, uint64_t srclen, 
void *dst, uint64_t *dstlen, - int level, uchar_t chdr, void *data) + int level, uchar_t chdr, int btype, void *data) { int rv; struct libbsc_params *bscdat = (struct libbsc_params *)data; diff --git a/lz4_compress.c b/lz4_compress.c index 0d1dd1c..b0e5a36 100644 --- a/lz4_compress.c +++ b/lz4_compress.c @@ -99,7 +99,7 @@ lz4_deinit(void **data) int lz4_compress(void *src, uint64_t srclen, void *dst, uint64_t *dstlen, - int level, uchar_t chdr, void *data) + int level, uchar_t chdr, int btype, void *data) { int rv; struct lz4_params *lzdat = (struct lz4_params *)data; @@ -135,7 +135,7 @@ lz4_compress(void *src, uint64_t srclen, void *dst, uint64_t *dstlen, int lz4_decompress(void *src, uint64_t srclen, void *dst, uint64_t *dstlen, - int level, uchar_t chdr, void *data) + int level, uchar_t chdr, int btype, void *data) { int rv; struct lz4_params *lzdat = (struct lz4_params *)data; diff --git a/lzfx_compress.c b/lzfx_compress.c index 7968c4b..f687a74 100644 --- a/lzfx_compress.c +++ b/lzfx_compress.c @@ -104,7 +104,7 @@ lz_fx_err(int err) int lz_fx_compress(void *src, uint64_t srclen, void *dst, uint64_t *dstlen, - int level, uchar_t chdr, void *data) + int level, uchar_t chdr, int btype, void *data) { int rv; struct lzfx_params *lzdat = (struct lzfx_params *)data; @@ -124,7 +124,7 @@ lz_fx_compress(void *src, uint64_t srclen, void *dst, uint64_t *dstlen, int lz_fx_decompress(void *src, uint64_t srclen, void *dst, uint64_t *dstlen, - int level, uchar_t chdr, void *data) + int level, uchar_t chdr, int btype, void *data) { int rv; unsigned int _srclen = srclen; diff --git a/lzma_compress.c b/lzma_compress.c index 4704f1b..8ddf643 100644 --- a/lzma_compress.c +++ b/lzma_compress.c @@ -199,7 +199,7 @@ lzerr(int err, int cmp) */ int lzma_compress(void *src, uint64_t srclen, void *dst, - uint64_t *dstlen, int level, uchar_t chdr, void *data) + uint64_t *dstlen, int level, uchar_t chdr, int btype, void *data) { uint64_t props_len = LZMA_PROPS_SIZE; SRes res; @@ -210,6 
+210,9 @@ lzma_compress(void *src, uint64_t srclen, void *dst, lzerr(SZ_ERROR_DESTLEN, 1); return (-1); } + + if ((btype & TYPE_COMPRESSED_LZMA) == TYPE_COMPRESSED_LZMA) + return (-1); props->level = level; _dst = (Byte *)dst; @@ -228,7 +231,7 @@ lzma_compress(void *src, uint64_t srclen, void *dst, int lzma_decompress(void *src, uint64_t srclen, void *dst, - uint64_t *dstlen, int level, uchar_t chdr, void *data) + uint64_t *dstlen, int level, uchar_t chdr, int btype, void *data) { uint64_t _srclen; const uchar_t *_src; diff --git a/none_compress.c b/none_compress.c index 9cdb446..8948c64 100644 --- a/none_compress.c +++ b/none_compress.c @@ -61,7 +61,7 @@ none_deinit(void **data) int none_compress(void *src, uint64_t srclen, void *dst, uint64_t *dstlen, - int level, uchar_t chdr, void *data) + int level, uchar_t chdr, int btype, void *data) { memcpy(dst, src, srclen); return (0); @@ -69,7 +69,7 @@ none_compress(void *src, uint64_t srclen, void *dst, uint64_t *dstlen, int none_decompress(void *src, uint64_t srclen, void *dst, uint64_t *dstlen, - int level, uchar_t chdr, void *data) + int level, uchar_t chdr, int btype, void *data) { memcpy(dst, src, srclen); return (0); diff --git a/pcompress.c b/pcompress.c index 82d01bb..29385b2 100644 --- a/pcompress.c +++ b/pcompress.c @@ -201,7 +201,7 @@ show_compression_stats(pc_ctx_t *pctx) */ static int preproc_compress(pc_ctx_t *pctx, compress_func_ptr cmp_func, void *src, uint64_t srclen, - void *dst, uint64_t *dstlen, int level, uchar_t chdr, void *data, algo_props_t *props) + void *dst, uint64_t *dstlen, int level, uchar_t chdr, int btype, void *data, algo_props_t *props) { uchar_t *dest = (uchar_t *)dst, type = 0; int64_t result; @@ -247,7 +247,7 @@ preproc_compress(pc_ctx_t *pctx, compress_func_ptr cmp_func, void *src, uint64_t U64_P(dest + 1) = htonll(srclen); _dstlen = srclen; DEBUG_STAT_EN(strt = get_wtime_millis()); - result = cmp_func(src, srclen, dest+9, &_dstlen, level, chdr, data); + result = cmp_func(src, 
srclen, dest+9, &_dstlen, level, chdr, btype, data); DEBUG_STAT_EN(en = get_wtime_millis()); if (result > -1 && _dstlen < srclen) { @@ -273,7 +273,7 @@ preproc_compress(pc_ctx_t *pctx, compress_func_ptr cmp_func, void *src, uint64_t static int preproc_decompress(pc_ctx_t *pctx, compress_func_ptr dec_func, void *src, uint64_t srclen, - void *dst, uint64_t *dstlen, int level, uchar_t chdr, void *data, algo_props_t *props) + void *dst, uint64_t *dstlen, int level, uchar_t chdr, int btype, void *data, algo_props_t *props) { uchar_t *sorc = (uchar_t *)src, type; int64_t result; @@ -288,7 +288,7 @@ preproc_decompress(pc_ctx_t *pctx, compress_func_ptr dec_func, void *src, uint64 sorc += 8; srclen -= 8; DEBUG_STAT_EN(strt = get_wtime_millis()); - result = dec_func(sorc, srclen, dst, dstlen, level, chdr, data); + result = dec_func(sorc, srclen, dst, dstlen, level, chdr, btype, data); DEBUG_STAT_EN(en = get_wtime_millis()); if (result < 0) return (result); @@ -488,13 +488,13 @@ redo: if (HDR & COMPRESSED) { if (HDR & CHUNK_FLAG_PREPROC) { rv = preproc_decompress(pctx, tdat->decompress, cmpbuf, dedupe_data_sz_cmp, - ubuf, &_chunksize, tdat->level, HDR, tdat->data, tdat->props); + ubuf, &_chunksize, tdat->level, HDR, pctx->btype, tdat->data, tdat->props); } else { DEBUG_STAT_EN(double strt, en); DEBUG_STAT_EN(strt = get_wtime_millis()); rv = tdat->decompress(cmpbuf, dedupe_data_sz_cmp, ubuf, &_chunksize, - tdat->level, HDR, tdat->data); + tdat->level, HDR, pctx->btype, tdat->data); DEBUG_STAT_EN(en = get_wtime_millis()); DEBUG_STAT_EN(fprintf(stderr, "Chunk %d decompression speed %.3f MB/s\n", tdat->id, get_mb_s(_chunksize, strt, en))); @@ -516,7 +516,7 @@ redo: if (dedupe_index_sz >= 90 && dedupe_index_sz > dedupe_index_sz_cmp) { /* Index should be at least 90 bytes to have been compressed. 
*/ rv = lzma_decompress(cmpbuf, dedupe_index_sz_cmp, ubuf, - &dedupe_index_sz, tdat->rctx->level, 0, tdat->rctx->lzma_data); + &dedupe_index_sz, tdat->rctx->level, 0, TYPE_BINARY, tdat->rctx->lzma_data); } else { memcpy(ubuf, cmpbuf, dedupe_index_sz); } @@ -531,14 +531,14 @@ redo: if (HDR & COMPRESSED) { if (HDR & CHUNK_FLAG_PREPROC) { rv = preproc_decompress(pctx, tdat->decompress, cseg, tdat->len_cmp, - tdat->uncompressed_chunk, &_chunksize, tdat->level, HDR, tdat->data, - tdat->props); + tdat->uncompressed_chunk, &_chunksize, tdat->level, HDR, pctx->btype, + tdat->data, tdat->props); } else { DEBUG_STAT_EN(double strt, en); DEBUG_STAT_EN(strt = get_wtime_millis()); rv = tdat->decompress(cseg, tdat->len_cmp, tdat->uncompressed_chunk, - &_chunksize, tdat->level, HDR, tdat->data); + &_chunksize, tdat->level, HDR, pctx->btype, tdat->data); DEBUG_STAT_EN(en = get_wtime_millis()); DEBUG_STAT_EN(fprintf(stderr, "Chunk decompression speed %.3f MB/s\n", get_mb_s(_chunksize, strt, en))); @@ -1520,7 +1520,8 @@ redo: /* Compress index if it is at least 90 bytes. 
*/ rv = lzma_compress(tdat->uncompressed_chunk + RABIN_HDR_SIZE, dedupe_index_sz, compressed_chunk + RABIN_HDR_SIZE, - &index_size_cmp, tdat->rctx->level, 255, tdat->rctx->lzma_data); + &index_size_cmp, tdat->rctx->level, 255, TYPE_BINARY, + tdat->rctx->lzma_data); /* * If index compression fails or does not produce a smaller result @@ -1546,14 +1547,15 @@ plain_index: if ((pctx->lzp_preprocess || pctx->enable_delta2_encode) && _chunksize > 0) { rv = preproc_compress(pctx, tdat->compress, tdat->uncompressed_chunk + dedupe_index_sz, _chunksize, compressed_chunk + index_size_cmp, &_chunksize, - tdat->level, 0, tdat->data, tdat->props); + tdat->level, 0, pctx->btype, tdat->data, tdat->props); } else if (_chunksize > 0) { DEBUG_STAT_EN(double strt, en); DEBUG_STAT_EN(strt = get_wtime_millis()); rv = tdat->compress(tdat->uncompressed_chunk + dedupe_index_sz, _chunksize, - compressed_chunk + index_size_cmp, &_chunksize, tdat->level, 0, tdat->data); + compressed_chunk + index_size_cmp, &_chunksize, tdat->level, 0, pctx->btype, + tdat->data); DEBUG_STAT_EN(en = get_wtime_millis()); DEBUG_STAT_EN(fprintf(stderr, "Chunk compression speed %.3f MB/s\n", get_mb_s(_chunksize, strt, en))); @@ -1576,14 +1578,14 @@ plain_index: if (pctx->lzp_preprocess || pctx->enable_delta2_encode) { rv = preproc_compress(pctx, tdat->compress, tdat->uncompressed_chunk, tdat->rbytes, - compressed_chunk, &_chunksize, tdat->level, 0, tdat->data, + compressed_chunk, &_chunksize, tdat->level, 0, pctx->btype, tdat->data, tdat->props); } else { DEBUG_STAT_EN(double strt, en); DEBUG_STAT_EN(strt = get_wtime_millis()); rv = tdat->compress(tdat->uncompressed_chunk, tdat->rbytes, - compressed_chunk, &_chunksize, tdat->level, 0, tdat->data); + compressed_chunk, &_chunksize, tdat->level, 0, pctx->btype, tdat->data); DEBUG_STAT_EN(en = get_wtime_millis()); DEBUG_STAT_EN(fprintf(stderr, "Chunk compression speed %.3f MB/s\n", get_mb_s(_chunksize, strt, en))); @@ -2292,7 +2294,10 @@ start_compress(pc_ctx_t *pctx, 
const char *filename, uint64_t chunksize, int lev rctx = create_dedupe_context(chunksize, 0, pctx->rab_blk_size, pctx->algo, &props, pctx->enable_delta_encode, pctx->enable_fixed_scan, VERSION, COMPRESS, 0, NULL, pctx->pipe_mode, nprocs); - rbytes = Read_Adjusted(uncompfd, cread_buf, chunksize, &rabin_count, rctx, pctx); + if (pctx->archive_mode) + rbytes = Read_Adjusted(uncompfd, cread_buf, chunksize, &rabin_count, rctx, pctx); + else + rbytes = Read_Adjusted(uncompfd, cread_buf, chunksize, &rabin_count, rctx, NULL); } else { if (pctx->archive_mode) rbytes = archiver_read(pctx, cread_buf, chunksize); @@ -2405,7 +2410,12 @@ start_compress(pc_ctx_t *pctx, const char *filename, uint64_t chunksize, int lev * buffer is in progress. */ if (pctx->enable_rabin_split) { - rbytes = Read_Adjusted(uncompfd, cread_buf, chunksize, &rabin_count, rctx, pctx); + if (pctx->archive_mode) + rbytes = Read_Adjusted(uncompfd, cread_buf, chunksize, + &rabin_count, rctx, pctx); + else + rbytes = Read_Adjusted(uncompfd, cread_buf, chunksize, + &rabin_count, rctx, NULL); } else { if (pctx->archive_mode) rbytes = archiver_read(pctx, cread_buf, chunksize); diff --git a/pcompress.h b/pcompress.h index de6b288..6cbfb0c 100644 --- a/pcompress.h +++ b/pcompress.h @@ -84,38 +84,38 @@ extern uint32_t zlib_buf_extra(uint64_t buflen); extern int lz4_buf_extra(uint64_t buflen); extern int zlib_compress(void *src, uint64_t srclen, void *dst, - uint64_t *destlen, int level, uchar_t chdr, void *data); + uint64_t *destlen, int level, uchar_t chdr, int btype, void *data); extern int lzma_compress(void *src, uint64_t srclen, void *dst, - uint64_t *destlen, int level, uchar_t chdr, void *data); + uint64_t *destlen, int level, uchar_t chdr, int btype, void *data); extern int bzip2_compress(void *src, uint64_t srclen, void *dst, - uint64_t *destlen, int level, uchar_t chdr, void *data); + uint64_t *destlen, int level, uchar_t chdr, int btype, void *data); extern int adapt_compress(void *src, uint64_t srclen, 
void *dst, - uint64_t *dstlen, int level, uchar_t chdr, void *data); + uint64_t *dstlen, int level, uchar_t chdr, int btype, void *data); extern int ppmd_compress(void *src, uint64_t srclen, void *dst, - uint64_t *dstlen, int level, uchar_t chdr, void *data); + uint64_t *dstlen, int level, uchar_t chdr, int btype, void *data); extern int lz_fx_compress(void *src, uint64_t srclen, void *dst, - uint64_t *dstlen, int level, uchar_t chdr, void *data); + uint64_t *dstlen, int level, uchar_t chdr, int btype, void *data); extern int lz4_compress(void *src, uint64_t srclen, void *dst, - uint64_t *dstlen, int level, uchar_t chdr, void *data); + uint64_t *dstlen, int level, uchar_t chdr, int btype, void *data); extern int none_compress(void *src, uint64_t srclen, void *dst, - uint64_t *dstlen, int level, uchar_t chdr, void *data); + uint64_t *dstlen, int level, uchar_t chdr, int btype, void *data); extern int zlib_decompress(void *src, uint64_t srclen, void *dst, - uint64_t *dstlen, int level, uchar_t chdr, void *data); + uint64_t *dstlen, int level, uchar_t chdr, int btype, void *data); extern int lzma_decompress(void *src, uint64_t srclen, void *dst, - uint64_t *dstlen, int level, uchar_t chdr, void *data); + uint64_t *dstlen, int level, uchar_t chdr, int btype, void *data); extern int bzip2_decompress(void *src, uint64_t srclen, void *dst, - uint64_t *dstlen, int level, uchar_t chdr, void *data); + uint64_t *dstlen, int level, uchar_t chdr, int btype, void *data); extern int adapt_decompress(void *src, uint64_t srclen, void *dst, - uint64_t *dstlen, int level, uchar_t chdr, void *data); + uint64_t *dstlen, int level, uchar_t chdr, int btype, void *data); extern int ppmd_decompress(void *src, uint64_t srclen, void *dst, - uint64_t *dstlen, int level, uchar_t chdr, void *data); + uint64_t *dstlen, int level, uchar_t chdr, int btype, void *data); extern int lz_fx_decompress(void *src, uint64_t srclen, void *dst, - uint64_t *dstlen, int level, uchar_t chdr, void *data); + 
uint64_t *dstlen, int level, uchar_t chdr, int btype, void *data); extern int lz4_decompress(void *src, uint64_t srclen, void *dst, - uint64_t *dstlen, int level, uchar_t chdr, void *data); + uint64_t *dstlen, int level, uchar_t chdr, int btype, void *data); extern int none_decompress(void *src, uint64_t srclen, void *dst, - uint64_t *dstlen, int level, uchar_t chdr, void *data); + uint64_t *dstlen, int level, uchar_t chdr, int btype, void *data); extern int adapt_init(void **data, int *level, int nthreads, uint64_t chunksize, int file_version, compress_op_t op); @@ -165,9 +165,9 @@ extern void none_stats(int show); #ifdef ENABLE_PC_LIBBSC extern int libbsc_compress(void *src, uint64_t srclen, void *dst, - uint64_t *dstlen, int level, uchar_t chdr, void *data); + uint64_t *dstlen, int level, uchar_t chdr, int btype, void *data); extern int libbsc_decompress(void *src, uint64_t srclen, void *dst, - uint64_t *dstlen, int level, uchar_t chdr, void *data); + uint64_t *dstlen, int level, uchar_t chdr, int btype, void *data); extern int libbsc_init(void **data, int *level, int nthreads, uint64_t chunksize, int file_version, compress_op_t op); extern void libbsc_props(algo_props_t *data, int level, uint64_t chunksize); diff --git a/ppmd_compress.c b/ppmd_compress.c index cf56c30..3552a45 100644 --- a/ppmd_compress.c +++ b/ppmd_compress.c @@ -109,11 +109,13 @@ ppmd_deinit(void **data) int ppmd_compress(void *src, uint64_t srclen, void *dst, - uint64_t *dstlen, int level, uchar_t chdr, void *data) + uint64_t *dstlen, int level, uchar_t chdr, int btype, void *data) { CPpmd8 *_ppmd = (CPpmd8 *)data; uchar_t *_src = (uchar_t *)src; + if (btype & TYPE_COMPRESSED) + return (-1); Ppmd8_RangeEnc_Init(_ppmd); Ppmd8_Init(_ppmd, _ppmd->Order, PPMD8_RESTORE_METHOD_RESTART); _ppmd->buf = (Byte *)dst; @@ -132,7 +134,7 @@ ppmd_compress(void *src, uint64_t srclen, void *dst, int ppmd_decompress(void *src, uint64_t srclen, void *dst, - uint64_t *dstlen, int level, uchar_t chdr, void *data) 
+ uint64_t *dstlen, int level, uchar_t chdr, int btype, void *data) { CPpmd8 *_ppmd = (CPpmd8 *)data; Byte *_src = (Byte *)src; diff --git a/utils/phash/extensions.h b/utils/phash/extensions.h index 8797efb..b0fc4c0 100644 --- a/utils/phash/extensions.h +++ b/utils/phash/extensions.h @@ -18,9 +18,9 @@ struct ext_entry { {"c++" , TYPE_TEXT, 3}, {"hpp" , TYPE_TEXT, 3}, {"txt" , TYPE_TEXT, 3}, - {"html" , TYPE_TEXT, 4}, - {"htm" , TYPE_TEXT, 3}, - {"xml" , TYPE_TEXT, 3}, + {"html" , TYPE_TEXT|TYPE_MARKUP, 4}, + {"htm" , TYPE_TEXT|TYPE_MARKUP, 3}, + {"xml" , TYPE_TEXT|TYPE_MARKUP, 3}, {"info" , TYPE_TEXT, 4}, {"ppm" , TYPE_TEXT, 3}, {"svg" , TYPE_TEXT, 3}, @@ -44,18 +44,18 @@ struct ext_entry { {"java" , TYPE_TEXT, 4}, {"m4" , TYPE_TEXT, 2}, {"vb" , TYPE_TEXT, 2}, - {"xslt" , TYPE_TEXT, 4}, - {"xsl" , TYPE_TEXT, 3}, + {"xslt" , TYPE_TEXT|TYPE_MARKUP, 4}, + {"xsl" , TYPE_TEXT|TYPE_MARKUP, 3}, {"yacc" , TYPE_TEXT, 4}, {"lex" , TYPE_TEXT, 3}, {"csv" , TYPE_TEXT, 3}, - {"shtml" , TYPE_TEXT, 5}, - {"xhtml" , TYPE_TEXT, 5}, - {"xht" , TYPE_TEXT, 3}, + {"shtml" , TYPE_TEXT|TYPE_MARKUP, 5}, + {"xhtml" , TYPE_TEXT|TYPE_MARKUP, 5}, + {"xht" , TYPE_TEXT|TYPE_MARKUP, 3}, {"asp" , TYPE_TEXT, 3}, {"aspx" , TYPE_TEXT, 4}, - {"rss" , TYPE_TEXT, 3}, - {"atom" , TYPE_TEXT, 4}, + {"rss" , TYPE_TEXT|TYPE_MARKUP, 3}, + {"atom" , TYPE_TEXT|TYPE_MARKUP, 4}, {"cgi" , TYPE_TEXT, 3}, {"c#" , TYPE_TEXT, 2}, {"cob" , TYPE_TEXT, 3}, @@ -67,8 +67,18 @@ struct ext_entry { {"ps" , TYPE_TEXT, 2}, {"bib" , TYPE_TEXT, 3}, {"lua" , TYPE_TEXT, 3}, - {"qml" , TYPE_TEXT, 3}, + {"qml" , TYPE_TEXT|TYPE_MARKUP, 3}, {"fa" , TYPE_TEXT, 2}, + {"faa" , TYPE_TEXT, 3}, + {"asn" , TYPE_TEXT|TYPE_MARKUP, 3}, + {"ffn" , TYPE_TEXT, 3}, + {"fna" , TYPE_TEXT, 3}, + {"frn" , TYPE_TEXT, 3}, + {"gbk" , TYPE_TEXT, 3}, + {"gff" , TYPE_TEXT, 3}, + {"ptt" , TYPE_TEXT, 3}, + {"rnt" , TYPE_TEXT, 3}, + {"val" , TYPE_BINARY, 3}, {"tcc" , TYPE_TEXT, 3}, {"css" , TYPE_TEXT, 3}, {"pod" , TYPE_TEXT, 3}, @@ -78,55 +88,61 @@ struct 
ext_entry { {"upp" , TYPE_TEXT, 3}, {"mom" , TYPE_TEXT, 3}, {"tmac" , TYPE_TEXT, 4}, - {"exe" , TYPE_EXE, 3}, - {"dll" , TYPE_EXE, 3}, - {"bin" , TYPE_EXE, 3}, - {"o" , TYPE_EXE, 1}, - {"a" , TYPE_EXE, 1}, - {"obj" , TYPE_EXE, 3}, - {"so" , TYPE_EXE, 2}, - {"com" , TYPE_EXE, 3}, - {"xpi" , TYPE_EXE, 3}, - {"off" , TYPE_EXE, 3}, - {"pdf" , TYPE_COMPRESSED, 3}, - {"jpg" , TYPE_JPEG, 3}, - {"jpeg" , TYPE_JPEG, 4}, - {"png" , TYPE_COMPRESSED, 3}, - {"mp3" , TYPE_COMPRESSED, 3}, - {"wma" , TYPE_COMPRESSED, 3}, - {"divx" , TYPE_COMPRESSED, 4}, - {"mp4" , TYPE_COMPRESSED, 3}, - {"aac" , TYPE_COMPRESSED, 3}, - {"m4a" , TYPE_COMPRESSED, 3}, - {"m4p" , TYPE_COMPRESSED, 3}, - {"ofs" , TYPE_COMPRESSED, 3}, - {"ofr" , TYPE_COMPRESSED, 3}, - {"flac" , TYPE_COMPRESSED, 4}, - {"pac" , TYPE_COMPRESSED, 3}, - {"gif" , TYPE_COMPRESSED, 3}, - {"jp2" , TYPE_JPEG, 3}, - {"gz" , TYPE_COMPRESSED, 2}, - {"bz2" , TYPE_COMPRESSED, 3}, - {"zip" , TYPE_COMPRESSED, 3}, - {"arj" , TYPE_COMPRESSED, 3}, - {"arc" , TYPE_COMPRESSED, 3}, - {"jar" , TYPE_COMPRESSED, 3}, - {"lz" , TYPE_COMPRESSED, 2}, - {"lzh" , TYPE_COMPRESSED, 3}, - {"lzma" , TYPE_COMPRESSED, 4}, - {"lzo" , TYPE_COMPRESSED, 3}, - {"dmg" , TYPE_COMPRESSED, 3}, - {"7z" , TYPE_COMPRESSED, 2}, - {"uha" , TYPE_COMPRESSED, 3}, - {"alz" , TYPE_COMPRESSED, 3}, - {"ace" , TYPE_COMPRESSED, 3}, - {"rar" , TYPE_COMPRESSED, 3}, - {"xz" , TYPE_COMPRESSED, 2}, + {"exe" , TYPE_BINARY|TYPE_EXE, 3}, + {"dll" , TYPE_BINARY|TYPE_EXE, 3}, + {"bin" , TYPE_BINARY|TYPE_EXE, 3}, + {"o" , TYPE_BINARY|TYPE_EXE, 1}, + {"a" , TYPE_BINARY|TYPE_EXE, 1}, + {"obj" , TYPE_BINARY|TYPE_EXE, 3}, + {"so" , TYPE_BINARY|TYPE_EXE, 2}, + {"com" , TYPE_BINARY|TYPE_EXE, 3}, + {"xpi" , TYPE_BINARY|TYPE_EXE, 3}, + {"off" , TYPE_BINARY|TYPE_EXE, 3}, + {"pdf" , TYPE_BINARY, 3}, + {"jpg" , TYPE_BINARY|TYPE_COMPRESSED|TYPE_JPEG, 3}, + {"jpeg" , TYPE_BINARY|TYPE_COMPRESSED|TYPE_JPEG, 4}, + {"png" , TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_GZ, 3}, + {"mp3" , 
TYPE_BINARY|TYPE_COMPRESSED, 3}, + {"wma" , TYPE_BINARY|TYPE_COMPRESSED, 3}, + {"divx" , TYPE_BINARY|TYPE_COMPRESSED, 4}, + {"mp4" , TYPE_BINARY|TYPE_COMPRESSED, 3}, + {"aac" , TYPE_BINARY|TYPE_COMPRESSED, 3}, + {"m4a" , TYPE_BINARY|TYPE_COMPRESSED, 3}, + {"m4p" , TYPE_BINARY|TYPE_COMPRESSED, 3}, + {"ofs" , TYPE_BINARY|TYPE_COMPRESSED, 3}, + {"ofr" , TYPE_BINARY|TYPE_COMPRESSED, 3}, + {"flac" , TYPE_BINARY|TYPE_COMPRESSED, 4}, + {"pac" , TYPE_BINARY|TYPE_COMPRESSED, 3}, + {"gif" , TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_LZW, 3}, + {"jp2" , TYPE_BINARY|TYPE_COMPRESSED|TYPE_JPEG, 3}, + {"gz" , TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_GZ, 2}, + {"tgz" , TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_GZ, 3}, + {"bz2" , TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_BZ2, 3}, + {"tbz2" , TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_BZ2, 4}, + {"zip" , TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_ZIP, 3}, + {"arj" , TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_ARJ, 3}, + {"arc" , TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_ARC, 3}, + {"jar" , TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_GZ, 3}, + {"lz" , TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_LZ, 2}, + {"lzh" , TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_LH, 3}, + {"lha" , TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_LH, 3}, + {"lzma" , TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_LZMA, 4}, + {"lzo" , TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_LZO, 3}, + {"dmg" , TYPE_BINARY, 3}, + {"7z" , TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_LZMA, 2}, + {"uha" , TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_UHARC, 3}, + {"alz" , TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_ALZ, 3}, + {"ace" , TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_ACE, 3}, + {"rar" , TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_RAR, 3}, + {"xz" , TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_LZMA, 2}, + {"txz" , TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_LZMA, 3}, + {"pmd" , TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_PPMD, 3}, + {"zpaq" , 
TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_ZPAQ, 4}, {"xcf" , TYPE_BINARY, 3}, {"mo" , TYPE_BINARY, 2}, {"bmp" , TYPE_BINARY, 3}, {"pyo" , TYPE_BINARY, 3}, {"pyc" , TYPE_BINARY, 3}, + {"wav" , TYPE_BINARY, 3}, }; -#define NUM_EXT (116) #endif diff --git a/utils/phash/extensions.txt b/utils/phash/extensions.txt index 1d410a0..0f58bd1 100644 --- a/utils/phash/extensions.txt +++ b/utils/phash/extensions.txt @@ -5,9 +5,9 @@ cpp,TYPE_TEXT c++,TYPE_TEXT hpp,TYPE_TEXT txt,TYPE_TEXT -html,TYPE_TEXT -htm,TYPE_TEXT -xml,TYPE_TEXT +html,TYPE_TEXT|TYPE_MARKUP +htm,TYPE_TEXT|TYPE_MARKUP +xml,TYPE_TEXT|TYPE_MARKUP info,TYPE_TEXT ppm,TYPE_TEXT svg,TYPE_TEXT @@ -31,18 +31,18 @@ go,TYPE_TEXT java,TYPE_TEXT m4,TYPE_TEXT vb,TYPE_TEXT -xslt,TYPE_TEXT -xsl,TYPE_TEXT +xslt,TYPE_TEXT|TYPE_MARKUP +xsl,TYPE_TEXT|TYPE_MARKUP yacc,TYPE_TEXT lex,TYPE_TEXT csv,TYPE_TEXT -shtml,TYPE_TEXT -xhtml,TYPE_TEXT -xht,TYPE_TEXT +shtml,TYPE_TEXT|TYPE_MARKUP +xhtml,TYPE_TEXT|TYPE_MARKUP +xht,TYPE_TEXT|TYPE_MARKUP asp,TYPE_TEXT aspx,TYPE_TEXT -rss,TYPE_TEXT -atom,TYPE_TEXT +rss,TYPE_TEXT|TYPE_MARKUP +atom,TYPE_TEXT|TYPE_MARKUP cgi,TYPE_TEXT c#,TYPE_TEXT cob,TYPE_TEXT @@ -54,8 +54,21 @@ s,TYPE_TEXT ps,TYPE_TEXT bib,TYPE_TEXT lua,TYPE_TEXT -qml,TYPE_TEXT +qml,TYPE_TEXT|TYPE_MARKUP + +# These are all genomic data file extensions fa,TYPE_TEXT +faa,TYPE_TEXT +asn,TYPE_TEXT|TYPE_MARKUP +ffn,TYPE_TEXT +fna,TYPE_TEXT +frn,TYPE_TEXT +gbk,TYPE_TEXT +gff,TYPE_TEXT +ptt,TYPE_TEXT +rnt,TYPE_TEXT +val,TYPE_BINARY + tcc,TYPE_TEXT css,TYPE_TEXT pod,TYPE_TEXT @@ -65,52 +78,59 @@ am,TYPE_TEXT upp,TYPE_TEXT mom,TYPE_TEXT tmac,TYPE_TEXT -exe,TYPE_EXE -dll,TYPE_EXE -bin,TYPE_EXE -o,TYPE_EXE -a,TYPE_EXE -obj,TYPE_EXE -so,TYPE_EXE -com,TYPE_EXE -xpi,TYPE_EXE -off,TYPE_EXE -pdf,TYPE_COMPRESSED -jpg,TYPE_JPEG -jpeg,TYPE_JPEG -png,TYPE_COMPRESSED -mp3,TYPE_COMPRESSED -wma,TYPE_COMPRESSED -divx,TYPE_COMPRESSED -mp4,TYPE_COMPRESSED -aac,TYPE_COMPRESSED -m4a,TYPE_COMPRESSED -m4p,TYPE_COMPRESSED -ofs,TYPE_COMPRESSED 
-ofr,TYPE_COMPRESSED -flac,TYPE_COMPRESSED -pac,TYPE_COMPRESSED -gif,TYPE_COMPRESSED -jp2,TYPE_JPEG -gz,TYPE_COMPRESSED -bz2,TYPE_COMPRESSED -zip,TYPE_COMPRESSED -arj,TYPE_COMPRESSED -arc,TYPE_COMPRESSED -jar,TYPE_COMPRESSED -lz,TYPE_COMPRESSED -lzh,TYPE_COMPRESSED -lzma,TYPE_COMPRESSED -lzo,TYPE_COMPRESSED -dmg,TYPE_COMPRESSED -7z,TYPE_COMPRESSED -uha,TYPE_COMPRESSED -alz,TYPE_COMPRESSED -ace,TYPE_COMPRESSED -rar,TYPE_COMPRESSED -xz,TYPE_COMPRESSED +exe,TYPE_BINARY|TYPE_EXE +dll,TYPE_BINARY|TYPE_EXE +bin,TYPE_BINARY|TYPE_EXE +o,TYPE_BINARY|TYPE_EXE +a,TYPE_BINARY|TYPE_EXE +obj,TYPE_BINARY|TYPE_EXE +so,TYPE_BINARY|TYPE_EXE +com,TYPE_BINARY|TYPE_EXE +xpi,TYPE_BINARY|TYPE_EXE +off,TYPE_BINARY|TYPE_EXE +pdf,TYPE_BINARY +jpg,TYPE_BINARY|TYPE_COMPRESSED|TYPE_JPEG +jpeg,TYPE_BINARY|TYPE_COMPRESSED|TYPE_JPEG +png,TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_GZ +mp3,TYPE_BINARY|TYPE_COMPRESSED +wma,TYPE_BINARY|TYPE_COMPRESSED +divx,TYPE_BINARY|TYPE_COMPRESSED +mp4,TYPE_BINARY|TYPE_COMPRESSED +aac,TYPE_BINARY|TYPE_COMPRESSED +m4a,TYPE_BINARY|TYPE_COMPRESSED +m4p,TYPE_BINARY|TYPE_COMPRESSED +ofs,TYPE_BINARY|TYPE_COMPRESSED +ofr,TYPE_BINARY|TYPE_COMPRESSED +flac,TYPE_BINARY|TYPE_COMPRESSED +pac,TYPE_BINARY|TYPE_COMPRESSED +gif,TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_LZW +jp2,TYPE_BINARY|TYPE_COMPRESSED|TYPE_JPEG +gz,TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_GZ +tgz,TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_GZ +bz2,TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_BZ2 +tbz2,TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_BZ2 +zip,TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_ZIP +arj,TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_ARJ +arc,TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_ARC +jar,TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_GZ +lz,TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_LZ +lzh,TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_LH +lha,TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_LH +lzma,TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_LZMA 
+lzo,TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_LZO +dmg,TYPE_BINARY +7z,TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_LZMA +uha,TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_UHARC +alz,TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_ALZ +ace,TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_ACE +rar,TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_RAR +xz,TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_LZMA +txz,TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_LZMA +pmd,TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_PPMD +zpaq,TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_ZPAQ xcf,TYPE_BINARY mo,TYPE_BINARY bmp,TYPE_BINARY pyo,TYPE_BINARY pyc,TYPE_BINARY +wav,TYPE_BINARY diff --git a/utils/phash/genhash.sh b/utils/phash/genhash.sh index 4d7b6dc..1006a67 100644 --- a/utils/phash/genhash.sh +++ b/utils/phash/genhash.sh @@ -1,6 +1,5 @@ #!/bin/sh -count=`cat extensions.txt | wc -l` echo ' /* Generated File. DO NOT EDIT. */ /* @@ -18,6 +17,9 @@ struct ext_entry { rm -f extlist cat extensions.txt | while read line do + [ "x$line" = "x" ] && continue + echo "$line" | egrep "^#" > /dev/null + [ $? 
-eq 0 ] && continue _OIFS="$IFS" IFS="," set -- $line @@ -30,7 +32,6 @@ do done echo '};' >> extensions.h -echo "#define NUM_EXT (${count})" >> extensions.h echo "#endif" >> extensions.h ./perfect -nm < extlist rm -f extlist diff --git a/utils/phash/phash.c b/utils/phash/phash.c index 05ff4de..5e2f340 100644 --- a/utils/phash/phash.c +++ b/utils/phash/phash.c @@ -12,17 +12,21 @@ /* small adjustments to _a_ to make values distinct */ ub1 tab[] = { -10,76,0,76,70,42,0,1,0,0,119,1,61,1,70,79, -0,0,0,4,70,1,0,122,0,119,47,76,76,34,110,101, -0,76,70,70,42,28,0,66,0,108,0,109,28,4,28,4, -70,0,1,20,4,123,123,0,79,75,34,76,69,77,0,69, +125,0,0,82,113,0,125,85,113,0,0,7,0,0,125,0, +0,0,7,87,0,0,82,0,0,88,0,7,0,85,125,85, +0,113,0,0,85,0,0,113,0,113,124,125,0,125,0,0, +113,0,11,113,125,0,0,0,0,85,113,85,22,0,0,125, +0,113,0,0,113,0,82,0,125,111,87,88,69,125,113,0, +124,0,7,22,113,22,0,235,0,120,120,125,113,0,74,120, +0,124,87,7,0,127,0,0,11,85,85,146,115,11,183,146, +0,0,88,0,0,85,42,0,171,0,0,0,0,83,0,0, }; /* The hash function */ ub4 phash(char *key, int len) { ub4 rsl, val = lookup(key, len, 0x9e3779b9); - rsl = ((val>>26)^tab[val&0x3f]); + rsl = ((val>>25)^tab[val&0x7f]); return rsl; } diff --git a/utils/phash/phash.h b/utils/phash/phash.h index f833e23..74bd726 100644 --- a/utils/phash/phash.h +++ b/utils/phash/phash.h @@ -7,9 +7,9 @@ #define PHASH extern ub1 tab[]; -#define PHASHLEN 0x40 /* length of hash mapping table */ -#define PHASHNKEYS 116 /* How many keys were hashed */ -#define PHASHRANGE 128 /* Range any input might map to */ +#define PHASHLEN 0x80 /* length of hash mapping table */ +#define PHASHNKEYS 133 /* How many keys were hashed */ +#define PHASHRANGE 256 /* Range any input might map to */ #define PHASHSALT 0x9e3779b9 /* internal, initialize normal hash */ ub4 phash(); diff --git a/utils/utils.h b/utils/utils.h index 416121d..3a92355 100644 --- a/utils/utils.h +++ b/utils/utils.h @@ -228,6 +228,35 @@ struct fn_list { struct fn_list *next; }; +/* + * 
Enumerated type constants for file type identification in pc_archive. Every constant is a distinct bit so that OR-ed combinations stay unambiguous. + */ +typedef enum { + TYPE_UNKNOWN = 0, + TYPE_TEXT = 1, + TYPE_BINARY = 2, + TYPE_COMPRESSED = 4, + TYPE_EXE = 0x8, + TYPE_JPEG = 0x10, + TYPE_MARKUP = 0x20, + TYPE_COMPRESSED_GZ = 0x40, + TYPE_COMPRESSED_LZW = 0x80, + TYPE_COMPRESSED_BZ2 = 0x100, + TYPE_COMPRESSED_ZIP = 0x200, + TYPE_COMPRESSED_ARJ = 0x400, + TYPE_COMPRESSED_ARC = 0x800, + TYPE_COMPRESSED_LH = 0x1000, + TYPE_COMPRESSED_LZMA = 0x2000, + TYPE_COMPRESSED_LZO = 0x4000, + TYPE_COMPRESSED_UHARC = 0x8000, + TYPE_COMPRESSED_ALZ = 0x10000, + TYPE_COMPRESSED_ACE = 0x20000, + TYPE_COMPRESSED_RAR = 0x40000, + TYPE_COMPRESSED_LZ = 0x80000, + TYPE_COMPRESSED_PPMD = 0x100000, + TYPE_COMPRESSED_ZPAQ = 0x200000 +} data_type_t; + #ifndef _IN_UTILS_ extern processor_info_t proc_info; #endif @@ -254,7 +283,7 @@ extern char *get_temp_dir(); /* Pointer type for compress and decompress functions. */ typedef int (*compress_func_ptr)(void *src, uint64_t srclen, void *dst, - uint64_t *destlen, int level, uchar_t chdr, void *data); + uint64_t *destlen, int level, uchar_t chdr, int btype, void *data); typedef enum { COMPRESS, diff --git a/zlib_compress.c b/zlib_compress.c index 8391cee..dfd7ced 100644 --- a/zlib_compress.c +++ b/zlib_compress.c @@ -142,7 +142,7 @@ void zerr(int ret, int cmp) int zlib_compress(void *src, uint64_t srclen, void *dst, uint64_t *dstlen, - int level, uchar_t chdr, void *data) + int level, uchar_t chdr, int btype, void *data) { int ret, ending; unsigned int slen, dlen; @@ -205,7 +205,7 @@ zlib_compress(void *src, uint64_t srclen, void *dst, uint64_t *dstlen, int zlib_decompress(void *src, uint64_t srclen, void *dst, uint64_t *dstlen, - int level, uchar_t chdr, void *data) + int level, uchar_t chdr, int btype, void *data) { int err; unsigned int slen, dlen;