From 1e2c3e479a62474014254bed26c0acce309c737c Mon Sep 17 00:00:00 2001 From: Moinak Ghosh Date: Fri, 22 Nov 2013 20:44:26 +0530 Subject: [PATCH] Optimize preprocessed compression and avoid a bunch of memory copies. Fix a crash. Add a few more file types. More comments. --- adaptive_compress.c | 17 ++++++++-- archive/pc_arc_filter.c | 6 ++-- archive/pc_archive.c | 14 ++++++++ pcompress.c | 66 +++++++++++++++++++++++--------------- pcompress.h | 3 +- utils/phash/extensions.h | 13 ++++++-- utils/phash/extensions.txt | 13 ++++++-- utils/phash/phash.c | 16 ++++----- utils/phash/phash.h | 2 +- utils/utils.c | 2 +- utils/utils.h | 8 ++++- 11 files changed, 110 insertions(+), 50 deletions(-) diff --git a/adaptive_compress.c b/adaptive_compress.c index 608e4e9..568006d 100644 --- a/adaptive_compress.c +++ b/adaptive_compress.c @@ -130,8 +130,14 @@ adapt_init(void **data, int *level, int nthreads, uint64_t chunksize, adat = (struct adapt_data *)slab_alloc(NULL, sizeof (struct adapt_data)); adat->adapt_mode = 1; rv = ppmd_init(&(adat->ppmd_data), level, nthreads, chunksize, file_version, op); + + /* + * LZ4 is used to tackle some embedded archive headers and/or zero paddings in + * otherwise incompressible data. So we always use it at the lowest and fastest + * compression level. + */ if (rv == 0) - rv = lz4_init(&(adat->lz4_data), level, nthreads, chunksize, file_version, op); + rv = lz4_init(&(adat->lz4_data), 1, nthreads, chunksize, file_version, op); adat->lzma_data = NULL; adat->bsc_data = NULL; *data = adat; @@ -167,8 +173,13 @@ adapt2_init(void **data, int *level, int nthreads, uint64_t chunksize, if (rv == 0) rv = libbsc_init(&(adat->bsc_data), &lv, nthreads, chunksize, file_version, op); #endif + /* + * LZ4 is used to tackle some embedded archive headers and/or zero paddings in + * otherwise incompressible data. So we always use it at the lowest and fastest + * compression level. + */ if (rv == 0) - rv = lz4_init(&(adat->lz4_data), level, nthreads, chunksize, file_version, op); + rv = lz4_init(&(adat->lz4_data), 1, nthreads, chunksize, file_version, op); *data = adat; if (*level > 9) *level = 9; } @@ -304,7 +315,7 @@ adapt_decompress(void *src, uint64_t srclen, void *dst, cmp_flags = CHDR_ALGO(chdr); if (cmp_flags == ADAPT_COMPRESS_LZ4) { - return (lz4_decompress(src, srclen, dst, dstlen, level, chdr, btype, adat->lz4_data)); + return (lz4_decompress(src, srclen, dst, dstlen, 1, chdr, btype, adat->lz4_data)); } else if (cmp_flags == ADAPT_COMPRESS_LZMA) { return (lzma_decompress(src, srclen, dst, dstlen, level, chdr, btype, adat->lzma_data)); diff --git a/archive/pc_arc_filter.c b/archive/pc_arc_filter.c index 1664916..9209bbb 100644 --- a/archive/pc_arc_filter.c +++ b/archive/pc_arc_filter.c @@ -43,7 +43,7 @@ #include "pc_archive.h" #define PACKJPG_DEF_BUFSIZ (512 * 1024) -#define JPG_SIZE_LIMIT (50 * 1024 * 1024) +#define JPG_SIZE_LIMIT (25 * 1024 * 1024) struct packjpg_filter_data { uchar_t *buff, *in_buff; @@ -136,7 +136,7 @@ packjpg_filter(struct filter_info *fi, void *filter_private) { struct packjpg_filter_data *pjdat = (struct packjpg_filter_data *)filter_private; uchar_t *mapbuf, *out; - size_t len, in_size = 0, len1; + uint64_t len, in_size = 0, len1; len = archive_entry_size(fi->entry); len1 = len; @@ -157,9 +157,7 @@ packjpg_filter(struct filter_info *fi, void *filter_private) munmap(mapbuf, len); return (FILTER_RETURN_SKIP); } - } else { - /* * Allocate input buffer and read archive data stream for the entry * into this buffer. diff --git a/archive/pc_archive.c b/archive/pc_archive.c index 9a3d013..93ca24a 100644 --- a/archive/pc_archive.c +++ b/archive/pc_archive.c @@ -1422,6 +1422,12 @@ out: /* PPMZ packed into 32-bit integer. */ # define PPMINT (0x50504d5aU) + +/* wvpk packed into 32-bit integer. */ +# define WVPK (0x7776706b) + +/* TTA1 packed into 32-bit integer. */ +# define TTA1 (0x54544131) #else /* 0x7fELF packed into 32-bit integer. */ # define ELFINT (0x464c457fU) @@ -1431,6 +1437,12 @@ out: /* PPMZ packed into 32-bit integer. */ # define PPMINT (0x5a4d5050U) + +/* wvpk packed into 32-bit integer. */ +# define WVPK (0x6b707677) + +/* TTA1 packed into 32-bit integer. */ +# define TTA1 (0x31415454) #endif /* @@ -1452,6 +1464,8 @@ detect_type_by_data(uchar_t *buf, size_t len) return (TYPE_BINARY); // Timezone data if (U32_P(buf) == PPMINT) return (TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_PPMD); // PPM Compressed archive + if (U32_P(buf) == WVPK || U32_P(buf) == TTA1) + return (TYPE_BINARY|TYPE_COMPRESSED|TYPE_AUDIO_COMPRESSED); return (TYPE_UNKNOWN); } diff --git a/pcompress.c b/pcompress.c index d160a06..abf4bfe 100644 --- a/pcompress.c +++ b/pcompress.c @@ -205,44 +205,57 @@ preproc_compress(pc_ctx_t *pctx, compress_func_ptr cmp_func, void *src, uint64_t { uchar_t *dest = (uchar_t *)dst, type = 0; int64_t result; - uint64_t _dstlen; + uint64_t _dstlen, fromlen; + uchar_t *from, *to; DEBUG_STAT_EN(double strt, en); _dstlen = *dstlen; + from = src; + to = dst; + fromlen = srclen; + result = 0; + if (pctx->lzp_preprocess) { int hashsize; hashsize = lzp_hash_size(level); - result = lzp_compress((const uchar_t *)src, (uchar_t *)dst, srclen, + result = lzp_compress((const uchar_t *)from, to, fromlen, hashsize, LZP_DEFAULT_LZPMINLEN, 0); - if (result < 0 || result == srclen) { - if (!pctx->enable_delta2_encode) - return (-1); - } else { + if (result >= 0 && result < srclen) { + uchar_t *tmp; + tmp = from; + from = to; + to = tmp; + fromlen = result; type |= PREPROC_TYPE_LZP; - srclen = result; - memcpy(src, dst, srclen); } - - } else if (!pctx->enable_delta2_encode) { - /* - * Execution won't come here but just in case ... - */ - log_msg(LOG_ERR, 0, "Invalid preprocessing mode"); - return (-1); } if (pctx->enable_delta2_encode && props->delta2_span > 0) { - _dstlen = srclen; - result = delta2_encode((uchar_t *)src, srclen, (uchar_t *)dst, + _dstlen = fromlen; + result = delta2_encode((uchar_t *)from, fromlen, to, &_dstlen, props->delta2_span); if (result != -1) { - memcpy(src, dst, _dstlen); - srclen = _dstlen; + uchar_t *tmp; + tmp = from; + from = to; + to = tmp; + fromlen = _dstlen; type |= PREPROC_TYPE_DELTA2; } } + /* + * Check which is the resulting buffer. If Encoded data is already sitting + * in src buffer then a memcpy() is not needed. + * Note that from,to ptrs are swapped after every encoding stage. So if + * from == dst, it means that encoded data is in dst. + */ + if (from == dst) { + memcpy(src, dst, fromlen); + } + srclen = fromlen; + *dest = type; U64_P(dest + 1) = htonll(srclen); _dstlen = srclen; @@ -305,7 +318,7 @@ preproc_decompress(pc_ctx_t *pctx, compress_func_ptr dec_func, void *src, uint64 memcpy(src, dst, _dstlen); srclen = _dstlen; *dstlen = _dstlen; - } else { + } else { return (result); } } @@ -819,9 +832,9 @@ start_decompress(pc_ctx_t *pctx, const char *filename, char *to_filename) err = 1; goto uncomp_done; } + to_filename = (char *)origf; } - compressed_chunksize = chunksize + CHUNK_HDR_SZ + zlib_buf_extra(chunksize); if (pctx->_props_func) { @@ -1546,7 +1559,7 @@ plain_index: /* Compress data chunk. */ if (_chunksize == 0) { rv = -1; - } else if ((pctx->lzp_preprocess || pctx->enable_delta2_encode)) { + } else if (pctx->preprocess_mode) { rv = preproc_compress(pctx, tdat->compress, tdat->uncompressed_chunk + dedupe_index_sz, _chunksize, compressed_chunk + index_size_cmp, &_chunksize, tdat->level, 0, @@ -1575,7 +1588,7 @@ plain_index: _chunksize += index_size_cmp; } else { _chunksize = tdat->rbytes; - if (pctx->lzp_preprocess || pctx->enable_delta2_encode) { + if (pctx->preprocess_mode) { rv = preproc_compress(pctx, tdat->compress, tdat->uncompressed_chunk, tdat->rbytes, compressed_chunk, &_chunksize, tdat->level, 0, tdat->btype, tdat->data, tdat->props); @@ -1639,7 +1652,7 @@ plain_index: if ((pctx->enable_rabin_scan || pctx->enable_fixed_scan) && tdat->rctx->valid) { type |= CHUNK_FLAG_DEDUP; } - if (pctx->lzp_preprocess || pctx->enable_delta2_encode) { + if (pctx->preprocess_mode) { type |= CHUNK_FLAG_PREPROC; } @@ -2689,6 +2702,7 @@ create_pc_context(void) ctx->rab_blk_size = -1; ctx->archive_temp_fd = -1; ctx->pagesize = sysconf(_SC_PAGE_SIZE); + ctx->btype = TYPE_UNKNOWN; return (ctx); } @@ -3140,7 +3154,9 @@ init_pc_context(pc_ctx_t *pctx, int argc, char *argv[]) init_filters(&ff); pctx->enable_packjpg = ff.enable_packjpg; } - + if (pctx->lzp_preprocess || pctx->enable_delta2_encode) { + pctx->preprocess_mode = 1; + } } else if (pctx->do_uncompress) { struct filter_flags ff; /* diff --git a/pcompress.h b/pcompress.h index d9ea44b..45d02bf 100644 --- a/pcompress.h +++ b/pcompress.h @@ -58,7 +58,7 @@ extern "C" { #define CHUNK_FLAG_PREPROC 4 #define COMP_EXTN ".pz" -#define PREPROC_TYPE_LZP 1 +#define PREPROC_TYPE_LZP 1 #define PREPROC_TYPE_DELTA2 2 #define PREPROC_COMPRESSED 128 @@ -203,6 +203,7 @@ typedef struct pc_ctx { int enable_delta2_encode; int enable_rabin_split; int enable_fixed_scan; + int preprocess_mode; int lzp_preprocess; int encrypt_type; int archive_mode; diff --git a/utils/phash/extensions.h b/utils/phash/extensions.h index 55925e2..c6c765f 100644 --- a/utils/phash/extensions.h +++ b/utils/phash/extensions.h @@ -68,11 +68,11 @@ struct ext_entry { {"bib" , TYPE_TEXT, 3}, {"lua" , TYPE_TEXT, 3}, {"qml" , TYPE_TEXT|TYPE_MARKUP, 3}, - {"fa" , TYPE_TEXT, 2}, + {"fa" , TYPE_TEXT|TYPE_DNA_SEQ, 2}, {"faa" , TYPE_TEXT, 3}, {"asn" , TYPE_TEXT|TYPE_MARKUP, 3}, - {"ffn" , TYPE_TEXT, 3}, - {"fna" , TYPE_TEXT, 3}, + {"ffn" , TYPE_TEXT|TYPE_DNA_SEQ, 3}, + {"fna" , TYPE_TEXT|TYPE_DNA_SEQ, 3}, {"frn" , TYPE_TEXT, 3}, {"gbk" , TYPE_TEXT, 3}, {"gff" , TYPE_TEXT, 3}, @@ -101,6 +101,7 @@ struct ext_entry { {"pdf" , TYPE_BINARY, 3}, {"jpg" , TYPE_BINARY|TYPE_COMPRESSED|TYPE_JPEG, 3}, {"jpeg" , TYPE_BINARY|TYPE_COMPRESSED|TYPE_JPEG, 4}, + {"mjpeg" , TYPE_BINARY|TYPE_COMPRESSED|TYPE_MJPEG, 5}, {"png" , TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_GZ, 3}, {"mp3" , TYPE_BINARY|TYPE_COMPRESSED, 3}, {"wma" , TYPE_BINARY|TYPE_COMPRESSED, 3}, @@ -145,5 +146,11 @@ struct ext_entry { {"pyo" , TYPE_BINARY, 3}, {"pyc" , TYPE_BINARY, 3}, {"wav" , TYPE_BINARY, 3}, + {"tta" , TYPE_BINARY|TYPE_COMPRESSED|TYPE_AUDIO_COMPRESSED, 3}, + {"wv" , TYPE_BINARY|TYPE_COMPRESSED|TYPE_AUDIO_COMPRESSED, 2}, + {"swf" , TYPE_BINARY, 3}, + {"SVGZ" , TYPE_BINARY, 4}, + {"ODT" , TYPE_BINARY, 3}, + {"3DM" , TYPE_BINARY, 3}, }; #endif diff --git a/utils/phash/extensions.txt b/utils/phash/extensions.txt index db2785e..ccde73f 100644 --- a/utils/phash/extensions.txt +++ b/utils/phash/extensions.txt @@ -57,11 +57,11 @@ lua,TYPE_TEXT qml,TYPE_TEXT|TYPE_MARKUP # These are all genomic data file extensions -fa,TYPE_TEXT +fa,TYPE_TEXT|TYPE_DNA_SEQ faa,TYPE_TEXT asn,TYPE_TEXT|TYPE_MARKUP -ffn,TYPE_TEXT -fna,TYPE_TEXT +ffn,TYPE_TEXT|TYPE_DNA_SEQ +fna,TYPE_TEXT|TYPE_DNA_SEQ frn,TYPE_TEXT gbk,TYPE_TEXT gff,TYPE_TEXT @@ -91,6 +91,7 @@ off,TYPE_BINARY|TYPE_EXE pdf,TYPE_BINARY jpg,TYPE_BINARY|TYPE_COMPRESSED|TYPE_JPEG jpeg,TYPE_BINARY|TYPE_COMPRESSED|TYPE_JPEG +mjpeg,TYPE_BINARY|TYPE_COMPRESSED|TYPE_MJPEG png,TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_GZ mp3,TYPE_BINARY|TYPE_COMPRESSED wma,TYPE_BINARY|TYPE_COMPRESSED @@ -135,3 +136,9 @@ bmp,TYPE_BINARY pyo,TYPE_BINARY pyc,TYPE_BINARY wav,TYPE_BINARY +tta,TYPE_BINARY|TYPE_COMPRESSED|TYPE_AUDIO_COMPRESSED +wv,TYPE_BINARY|TYPE_COMPRESSED|TYPE_AUDIO_COMPRESSED +swf,TYPE_BINARY +SVGZ,TYPE_BINARY,TYPE_COMPRESSED|TYPE_COMPRESSED_GZ +ODT,TYPE_BINARY,TYPE_COMPRESSED|TYPE_COMPRESSED_ZIP +3DM,TYPE_BINARY diff --git a/utils/phash/phash.c b/utils/phash/phash.c index b235b11..3196f2f 100644 --- a/utils/phash/phash.c +++ b/utils/phash/phash.c @@ -12,14 +12,14 @@ /* small adjustments to _a_ to make values distinct */ ub1 tab[] = { -125,0,0,82,113,0,125,85,113,0,0,7,0,0,125,0, -0,0,7,87,0,0,82,0,0,125,0,7,0,85,125,85, -0,113,0,0,85,0,0,113,0,113,124,0,0,125,0,0, -113,0,11,113,125,7,0,0,0,85,113,85,22,0,0,125, -0,113,0,0,113,0,82,0,125,111,87,124,69,183,113,0, -124,0,7,22,120,22,0,127,0,120,51,125,0,0,74,120, -0,124,87,113,0,127,0,0,11,85,85,146,69,11,183,146, -0,0,88,0,0,85,42,0,171,0,0,0,0,83,0,0, +125,0,0,220,235,125,82,0,113,0,0,7,0,0,82,0, +0,0,7,124,0,0,82,0,0,125,0,7,0,220,125,120, +0,0,0,0,22,0,0,113,0,113,113,0,0,125,85,0, +113,0,11,113,125,7,0,0,0,40,0,113,85,0,0,125, +0,113,0,0,113,0,125,183,40,27,7,15,58,183,113,0, +124,0,0,22,125,220,0,40,0,87,87,125,113,0,183,125, +0,125,87,7,0,85,0,0,59,229,85,7,135,116,0,146, +0,0,82,0,0,0,200,0,56,125,0,0,61,202,0,0, }; /* The hash function */ diff --git a/utils/phash/phash.h b/utils/phash/phash.h index e422e47..aa7445c 100644 --- a/utils/phash/phash.h +++ b/utils/phash/phash.h @@ -8,7 +8,7 @@ extern ub1 tab[]; #define PHASHLEN 0x80 /* length of hash mapping table */ -#define PHASHNKEYS 134 /* How many keys were hashed */ +#define PHASHNKEYS 141 /* How many keys were hashed */ #define PHASHRANGE 256 /* Range any input might map to */ #define PHASHSALT 0x9e3779b9 /* internal, initialize normal hash */ diff --git a/utils/utils.c b/utils/utils.c index 6f3a591..d256324 100644 --- a/utils/utils.c +++ b/utils/utils.c @@ -572,6 +572,6 @@ is_incompressible(int type) int ic = 0; int st = PC_SUBTYPE(type); - ic = (st == TYPE_JPEG) | (st == TYPE_PACKJPG); + ic = (st == TYPE_JPEG) | (st == TYPE_PACKJPG) | (st == TYPE_AUDIO_COMPRESSED); return (ic); } diff --git a/utils/utils.h b/utils/utils.h index a4a68ff..973b97d 100644 --- a/utils/utils.h +++ b/utils/utils.h @@ -90,9 +90,11 @@ typedef int32_t bsize_t; # if !defined(sun) && !defined (__sun) # define LE64(x) __bswap_64(x) # define LE32(x) __bswap_32(x) +# define LE16(x) __bswap_16(x) # else # define LE64(x) BSWAP_64(x) # define LE32(x) BSWAP_32(x) +# define LE16(x) BSWAP_16(x) # endif #else # if !defined(sun) && !defined (__sun) @@ -105,6 +107,7 @@ typedef int32_t bsize_t; # endif # define LE64(x) (x) # define LE32(x) (x) +# define LE16(x) (x) #endif @@ -262,7 +265,10 @@ typedef enum { TYPE_COMPRESSED_LZ = 136, TYPE_COMPRESSED_PPMD = 144, TYPE_COMPRESSED_ZPAQ = 152, - TYPE_PACKJPG = 160 + TYPE_PACKJPG = 160, + TYPE_DNA_SEQ = 168, + TYPE_MJPEG = 176, + TYPE_AUDIO_COMPRESSED = 184 } data_type_t; /*