Optimize preprocessed compression and avoid a bunch of memory copies.

Fix a crash.
Add a few more file types.
More comments.
This commit is contained in:
Moinak Ghosh 2013-11-22 20:44:26 +05:30
parent 664c8ef75b
commit 1e2c3e479a
11 changed files with 110 additions and 50 deletions

View file

@ -130,8 +130,14 @@ adapt_init(void **data, int *level, int nthreads, uint64_t chunksize,
adat = (struct adapt_data *)slab_alloc(NULL, sizeof (struct adapt_data)); adat = (struct adapt_data *)slab_alloc(NULL, sizeof (struct adapt_data));
adat->adapt_mode = 1; adat->adapt_mode = 1;
rv = ppmd_init(&(adat->ppmd_data), level, nthreads, chunksize, file_version, op); rv = ppmd_init(&(adat->ppmd_data), level, nthreads, chunksize, file_version, op);
/*
 * LZ4 is used to tackle embedded archive headers and/or zero padding in
 * otherwise incompressible data, so it is always used at its lowest and
 * fastest compression level.
 */
if (rv == 0) if (rv == 0)
rv = lz4_init(&(adat->lz4_data), level, nthreads, chunksize, file_version, op); rv = lz4_init(&(adat->lz4_data), 1, nthreads, chunksize, file_version, op);
adat->lzma_data = NULL; adat->lzma_data = NULL;
adat->bsc_data = NULL; adat->bsc_data = NULL;
*data = adat; *data = adat;
@ -167,8 +173,13 @@ adapt2_init(void **data, int *level, int nthreads, uint64_t chunksize,
if (rv == 0) if (rv == 0)
rv = libbsc_init(&(adat->bsc_data), &lv, nthreads, chunksize, file_version, op); rv = libbsc_init(&(adat->bsc_data), &lv, nthreads, chunksize, file_version, op);
#endif #endif
/*
 * LZ4 is used to tackle embedded archive headers and/or zero padding in
 * otherwise incompressible data, so it is always used at its lowest and
 * fastest compression level.
 */
if (rv == 0) if (rv == 0)
rv = lz4_init(&(adat->lz4_data), level, nthreads, chunksize, file_version, op); rv = lz4_init(&(adat->lz4_data), 1, nthreads, chunksize, file_version, op);
*data = adat; *data = adat;
if (*level > 9) *level = 9; if (*level > 9) *level = 9;
} }
@ -304,7 +315,7 @@ adapt_decompress(void *src, uint64_t srclen, void *dst,
cmp_flags = CHDR_ALGO(chdr); cmp_flags = CHDR_ALGO(chdr);
if (cmp_flags == ADAPT_COMPRESS_LZ4) { if (cmp_flags == ADAPT_COMPRESS_LZ4) {
return (lz4_decompress(src, srclen, dst, dstlen, level, chdr, btype, adat->lz4_data)); return (lz4_decompress(src, srclen, dst, dstlen, 1, chdr, btype, adat->lz4_data));
} else if (cmp_flags == ADAPT_COMPRESS_LZMA) { } else if (cmp_flags == ADAPT_COMPRESS_LZMA) {
return (lzma_decompress(src, srclen, dst, dstlen, level, chdr, btype, adat->lzma_data)); return (lzma_decompress(src, srclen, dst, dstlen, level, chdr, btype, adat->lzma_data));

View file

@ -43,7 +43,7 @@
#include "pc_archive.h" #include "pc_archive.h"
#define PACKJPG_DEF_BUFSIZ (512 * 1024) #define PACKJPG_DEF_BUFSIZ (512 * 1024)
#define JPG_SIZE_LIMIT (50 * 1024 * 1024) #define JPG_SIZE_LIMIT (25 * 1024 * 1024)
struct packjpg_filter_data { struct packjpg_filter_data {
uchar_t *buff, *in_buff; uchar_t *buff, *in_buff;
@ -136,7 +136,7 @@ packjpg_filter(struct filter_info *fi, void *filter_private)
{ {
struct packjpg_filter_data *pjdat = (struct packjpg_filter_data *)filter_private; struct packjpg_filter_data *pjdat = (struct packjpg_filter_data *)filter_private;
uchar_t *mapbuf, *out; uchar_t *mapbuf, *out;
size_t len, in_size = 0, len1; uint64_t len, in_size = 0, len1;
len = archive_entry_size(fi->entry); len = archive_entry_size(fi->entry);
len1 = len; len1 = len;
@ -157,9 +157,7 @@ packjpg_filter(struct filter_info *fi, void *filter_private)
munmap(mapbuf, len); munmap(mapbuf, len);
return (FILTER_RETURN_SKIP); return (FILTER_RETURN_SKIP);
} }
} else { } else {
/* /*
* Allocate input buffer and read archive data stream for the entry * Allocate input buffer and read archive data stream for the entry
* into this buffer. * into this buffer.

View file

@ -1422,6 +1422,12 @@ out:
/* PPMZ packed into 32-bit integer. */ /* PPMZ packed into 32-bit integer. */
# define PPMINT (0x50504d5aU) # define PPMINT (0x50504d5aU)
/* wvpk packed into 32-bit integer. */
# define WVPK (0x7776706b)
/* TTA1 packed into 32-bit integer. */
# define TTA1 (0x54544131)
#else #else
/* 0x7fELF packed into 32-bit integer. */ /* 0x7fELF packed into 32-bit integer. */
# define ELFINT (0x464c457fU) # define ELFINT (0x464c457fU)
@ -1431,6 +1437,12 @@ out:
/* PPMZ packed into 32-bit integer. */ /* PPMZ packed into 32-bit integer. */
# define PPMINT (0x5a4d5050U) # define PPMINT (0x5a4d5050U)
/* wvpk packed into 32-bit integer. */
# define WVPK (0x6b707677)
/* TTA1 packed into 32-bit integer. */
# define TTA1 (0x31415454)
#endif #endif
/* /*
@ -1452,6 +1464,8 @@ detect_type_by_data(uchar_t *buf, size_t len)
return (TYPE_BINARY); // Timezone data return (TYPE_BINARY); // Timezone data
if (U32_P(buf) == PPMINT) if (U32_P(buf) == PPMINT)
return (TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_PPMD); // PPM Compressed archive return (TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_PPMD); // PPM Compressed archive
if (U32_P(buf) == WVPK || U32_P(buf) == TTA1)
return (TYPE_BINARY|TYPE_COMPRESSED|TYPE_AUDIO_COMPRESSED);
return (TYPE_UNKNOWN); return (TYPE_UNKNOWN);
} }

View file

@ -205,44 +205,57 @@ preproc_compress(pc_ctx_t *pctx, compress_func_ptr cmp_func, void *src, uint64_t
{ {
uchar_t *dest = (uchar_t *)dst, type = 0; uchar_t *dest = (uchar_t *)dst, type = 0;
int64_t result; int64_t result;
uint64_t _dstlen; uint64_t _dstlen, fromlen;
uchar_t *from, *to;
DEBUG_STAT_EN(double strt, en); DEBUG_STAT_EN(double strt, en);
_dstlen = *dstlen; _dstlen = *dstlen;
from = src;
to = dst;
fromlen = srclen;
result = 0;
if (pctx->lzp_preprocess) { if (pctx->lzp_preprocess) {
int hashsize; int hashsize;
hashsize = lzp_hash_size(level); hashsize = lzp_hash_size(level);
result = lzp_compress((const uchar_t *)src, (uchar_t *)dst, srclen, result = lzp_compress((const uchar_t *)from, to, fromlen,
hashsize, LZP_DEFAULT_LZPMINLEN, 0); hashsize, LZP_DEFAULT_LZPMINLEN, 0);
if (result < 0 || result == srclen) { if (result >= 0 && result < srclen) {
if (!pctx->enable_delta2_encode) uchar_t *tmp;
return (-1); tmp = from;
} else { from = to;
to = tmp;
fromlen = result;
type |= PREPROC_TYPE_LZP; type |= PREPROC_TYPE_LZP;
srclen = result;
memcpy(src, dst, srclen);
} }
} else if (!pctx->enable_delta2_encode) {
/*
* Execution won't come here but just in case ...
*/
log_msg(LOG_ERR, 0, "Invalid preprocessing mode");
return (-1);
} }
if (pctx->enable_delta2_encode && props->delta2_span > 0) { if (pctx->enable_delta2_encode && props->delta2_span > 0) {
_dstlen = srclen; _dstlen = fromlen;
result = delta2_encode((uchar_t *)src, srclen, (uchar_t *)dst, result = delta2_encode((uchar_t *)from, fromlen, to,
&_dstlen, props->delta2_span); &_dstlen, props->delta2_span);
if (result != -1) { if (result != -1) {
memcpy(src, dst, _dstlen); uchar_t *tmp;
srclen = _dstlen; tmp = from;
from = to;
to = tmp;
fromlen = _dstlen;
type |= PREPROC_TYPE_DELTA2; type |= PREPROC_TYPE_DELTA2;
} }
} }
/*
 * Determine which buffer holds the final encoded data. The from and to
 * pointers are swapped after every successful encoding stage, so
 * from == dst means the encoded data is sitting in dst and has to be
 * copied back into src. If it is already in src, no memcpy() is needed.
 */
if (from == dst) {
memcpy(src, dst, fromlen);
}
srclen = fromlen;
*dest = type; *dest = type;
U64_P(dest + 1) = htonll(srclen); U64_P(dest + 1) = htonll(srclen);
_dstlen = srclen; _dstlen = srclen;
@ -819,9 +832,9 @@ start_decompress(pc_ctx_t *pctx, const char *filename, char *to_filename)
err = 1; err = 1;
goto uncomp_done; goto uncomp_done;
} }
to_filename = (char *)origf;
} }
compressed_chunksize = chunksize + CHUNK_HDR_SZ + zlib_buf_extra(chunksize); compressed_chunksize = chunksize + CHUNK_HDR_SZ + zlib_buf_extra(chunksize);
if (pctx->_props_func) { if (pctx->_props_func) {
@ -1546,7 +1559,7 @@ plain_index:
/* Compress data chunk. */ /* Compress data chunk. */
if (_chunksize == 0) { if (_chunksize == 0) {
rv = -1; rv = -1;
} else if ((pctx->lzp_preprocess || pctx->enable_delta2_encode)) { } else if (pctx->preprocess_mode) {
rv = preproc_compress(pctx, tdat->compress, rv = preproc_compress(pctx, tdat->compress,
tdat->uncompressed_chunk + dedupe_index_sz, _chunksize, tdat->uncompressed_chunk + dedupe_index_sz, _chunksize,
compressed_chunk + index_size_cmp, &_chunksize, tdat->level, 0, compressed_chunk + index_size_cmp, &_chunksize, tdat->level, 0,
@ -1575,7 +1588,7 @@ plain_index:
_chunksize += index_size_cmp; _chunksize += index_size_cmp;
} else { } else {
_chunksize = tdat->rbytes; _chunksize = tdat->rbytes;
if (pctx->lzp_preprocess || pctx->enable_delta2_encode) { if (pctx->preprocess_mode) {
rv = preproc_compress(pctx, tdat->compress, tdat->uncompressed_chunk, rv = preproc_compress(pctx, tdat->compress, tdat->uncompressed_chunk,
tdat->rbytes, compressed_chunk, &_chunksize, tdat->level, 0, tdat->rbytes, compressed_chunk, &_chunksize, tdat->level, 0,
tdat->btype, tdat->data, tdat->props); tdat->btype, tdat->data, tdat->props);
@ -1639,7 +1652,7 @@ plain_index:
if ((pctx->enable_rabin_scan || pctx->enable_fixed_scan) && tdat->rctx->valid) { if ((pctx->enable_rabin_scan || pctx->enable_fixed_scan) && tdat->rctx->valid) {
type |= CHUNK_FLAG_DEDUP; type |= CHUNK_FLAG_DEDUP;
} }
if (pctx->lzp_preprocess || pctx->enable_delta2_encode) { if (pctx->preprocess_mode) {
type |= CHUNK_FLAG_PREPROC; type |= CHUNK_FLAG_PREPROC;
} }
@ -2689,6 +2702,7 @@ create_pc_context(void)
ctx->rab_blk_size = -1; ctx->rab_blk_size = -1;
ctx->archive_temp_fd = -1; ctx->archive_temp_fd = -1;
ctx->pagesize = sysconf(_SC_PAGE_SIZE); ctx->pagesize = sysconf(_SC_PAGE_SIZE);
ctx->btype = TYPE_UNKNOWN;
return (ctx); return (ctx);
} }
@ -3140,7 +3154,9 @@ init_pc_context(pc_ctx_t *pctx, int argc, char *argv[])
init_filters(&ff); init_filters(&ff);
pctx->enable_packjpg = ff.enable_packjpg; pctx->enable_packjpg = ff.enable_packjpg;
} }
if (pctx->lzp_preprocess || pctx->enable_delta2_encode) {
pctx->preprocess_mode = 1;
}
} else if (pctx->do_uncompress) { } else if (pctx->do_uncompress) {
struct filter_flags ff; struct filter_flags ff;
/* /*

View file

@ -203,6 +203,7 @@ typedef struct pc_ctx {
int enable_delta2_encode; int enable_delta2_encode;
int enable_rabin_split; int enable_rabin_split;
int enable_fixed_scan; int enable_fixed_scan;
int preprocess_mode;
int lzp_preprocess; int lzp_preprocess;
int encrypt_type; int encrypt_type;
int archive_mode; int archive_mode;

View file

@ -68,11 +68,11 @@ struct ext_entry {
{"bib" , TYPE_TEXT, 3}, {"bib" , TYPE_TEXT, 3},
{"lua" , TYPE_TEXT, 3}, {"lua" , TYPE_TEXT, 3},
{"qml" , TYPE_TEXT|TYPE_MARKUP, 3}, {"qml" , TYPE_TEXT|TYPE_MARKUP, 3},
{"fa" , TYPE_TEXT, 2}, {"fa" , TYPE_TEXT|TYPE_DNA_SEQ, 2},
{"faa" , TYPE_TEXT, 3}, {"faa" , TYPE_TEXT, 3},
{"asn" , TYPE_TEXT|TYPE_MARKUP, 3}, {"asn" , TYPE_TEXT|TYPE_MARKUP, 3},
{"ffn" , TYPE_TEXT, 3}, {"ffn" , TYPE_TEXT|TYPE_DNA_SEQ, 3},
{"fna" , TYPE_TEXT, 3}, {"fna" , TYPE_TEXT|TYPE_DNA_SEQ, 3},
{"frn" , TYPE_TEXT, 3}, {"frn" , TYPE_TEXT, 3},
{"gbk" , TYPE_TEXT, 3}, {"gbk" , TYPE_TEXT, 3},
{"gff" , TYPE_TEXT, 3}, {"gff" , TYPE_TEXT, 3},
@ -101,6 +101,7 @@ struct ext_entry {
{"pdf" , TYPE_BINARY, 3}, {"pdf" , TYPE_BINARY, 3},
{"jpg" , TYPE_BINARY|TYPE_COMPRESSED|TYPE_JPEG, 3}, {"jpg" , TYPE_BINARY|TYPE_COMPRESSED|TYPE_JPEG, 3},
{"jpeg" , TYPE_BINARY|TYPE_COMPRESSED|TYPE_JPEG, 4}, {"jpeg" , TYPE_BINARY|TYPE_COMPRESSED|TYPE_JPEG, 4},
{"mjpeg" , TYPE_BINARY|TYPE_COMPRESSED|TYPE_MJPEG, 5},
{"png" , TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_GZ, 3}, {"png" , TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_GZ, 3},
{"mp3" , TYPE_BINARY|TYPE_COMPRESSED, 3}, {"mp3" , TYPE_BINARY|TYPE_COMPRESSED, 3},
{"wma" , TYPE_BINARY|TYPE_COMPRESSED, 3}, {"wma" , TYPE_BINARY|TYPE_COMPRESSED, 3},
@ -145,5 +146,11 @@ struct ext_entry {
{"pyo" , TYPE_BINARY, 3}, {"pyo" , TYPE_BINARY, 3},
{"pyc" , TYPE_BINARY, 3}, {"pyc" , TYPE_BINARY, 3},
{"wav" , TYPE_BINARY, 3}, {"wav" , TYPE_BINARY, 3},
{"tta" , TYPE_BINARY|TYPE_COMPRESSED|TYPE_AUDIO_COMPRESSED, 3},
{"wv" , TYPE_BINARY|TYPE_COMPRESSED|TYPE_AUDIO_COMPRESSED, 2},
{"swf" , TYPE_BINARY, 3},
{"SVGZ" , TYPE_BINARY, 4},
{"ODT" , TYPE_BINARY, 3},
{"3DM" , TYPE_BINARY, 3},
}; };
#endif #endif

View file

@ -57,11 +57,11 @@ lua,TYPE_TEXT
qml,TYPE_TEXT|TYPE_MARKUP qml,TYPE_TEXT|TYPE_MARKUP
# These are all genomic data file extensions # These are all genomic data file extensions
fa,TYPE_TEXT fa,TYPE_TEXT|TYPE_DNA_SEQ
faa,TYPE_TEXT faa,TYPE_TEXT
asn,TYPE_TEXT|TYPE_MARKUP asn,TYPE_TEXT|TYPE_MARKUP
ffn,TYPE_TEXT ffn,TYPE_TEXT|TYPE_DNA_SEQ
fna,TYPE_TEXT fna,TYPE_TEXT|TYPE_DNA_SEQ
frn,TYPE_TEXT frn,TYPE_TEXT
gbk,TYPE_TEXT gbk,TYPE_TEXT
gff,TYPE_TEXT gff,TYPE_TEXT
@ -91,6 +91,7 @@ off,TYPE_BINARY|TYPE_EXE
pdf,TYPE_BINARY pdf,TYPE_BINARY
jpg,TYPE_BINARY|TYPE_COMPRESSED|TYPE_JPEG jpg,TYPE_BINARY|TYPE_COMPRESSED|TYPE_JPEG
jpeg,TYPE_BINARY|TYPE_COMPRESSED|TYPE_JPEG jpeg,TYPE_BINARY|TYPE_COMPRESSED|TYPE_JPEG
mjpeg,TYPE_BINARY|TYPE_COMPRESSED|TYPE_MJPEG
png,TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_GZ png,TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_GZ
mp3,TYPE_BINARY|TYPE_COMPRESSED mp3,TYPE_BINARY|TYPE_COMPRESSED
wma,TYPE_BINARY|TYPE_COMPRESSED wma,TYPE_BINARY|TYPE_COMPRESSED
@ -135,3 +136,9 @@ bmp,TYPE_BINARY
pyo,TYPE_BINARY pyo,TYPE_BINARY
pyc,TYPE_BINARY pyc,TYPE_BINARY
wav,TYPE_BINARY wav,TYPE_BINARY
tta,TYPE_BINARY|TYPE_COMPRESSED|TYPE_AUDIO_COMPRESSED
wv,TYPE_BINARY|TYPE_COMPRESSED|TYPE_AUDIO_COMPRESSED
swf,TYPE_BINARY
SVGZ,TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_GZ
ODT,TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_ZIP
3DM,TYPE_BINARY

View file

@ -12,14 +12,14 @@
/* small adjustments to _a_ to make values distinct */ /* small adjustments to _a_ to make values distinct */
ub1 tab[] = { ub1 tab[] = {
125,0,0,82,113,0,125,85,113,0,0,7,0,0,125,0, 125,0,0,220,235,125,82,0,113,0,0,7,0,0,82,0,
0,0,7,87,0,0,82,0,0,125,0,7,0,85,125,85, 0,0,7,124,0,0,82,0,0,125,0,7,0,220,125,120,
0,113,0,0,85,0,0,113,0,113,124,0,0,125,0,0, 0,0,0,0,22,0,0,113,0,113,113,0,0,125,85,0,
113,0,11,113,125,7,0,0,0,85,113,85,22,0,0,125, 113,0,11,113,125,7,0,0,0,40,0,113,85,0,0,125,
0,113,0,0,113,0,82,0,125,111,87,124,69,183,113,0, 0,113,0,0,113,0,125,183,40,27,7,15,58,183,113,0,
124,0,7,22,120,22,0,127,0,120,51,125,0,0,74,120, 124,0,0,22,125,220,0,40,0,87,87,125,113,0,183,125,
0,124,87,113,0,127,0,0,11,85,85,146,69,11,183,146, 0,125,87,7,0,85,0,0,59,229,85,7,135,116,0,146,
0,0,88,0,0,85,42,0,171,0,0,0,0,83,0,0, 0,0,82,0,0,0,200,0,56,125,0,0,61,202,0,0,
}; };
/* The hash function */ /* The hash function */

View file

@ -8,7 +8,7 @@
extern ub1 tab[]; extern ub1 tab[];
#define PHASHLEN 0x80 /* length of hash mapping table */ #define PHASHLEN 0x80 /* length of hash mapping table */
#define PHASHNKEYS 134 /* How many keys were hashed */ #define PHASHNKEYS 141 /* How many keys were hashed */
#define PHASHRANGE 256 /* Range any input might map to */ #define PHASHRANGE 256 /* Range any input might map to */
#define PHASHSALT 0x9e3779b9 /* internal, initialize normal hash */ #define PHASHSALT 0x9e3779b9 /* internal, initialize normal hash */

View file

@ -572,6 +572,6 @@ is_incompressible(int type)
int ic = 0; int ic = 0;
int st = PC_SUBTYPE(type); int st = PC_SUBTYPE(type);
ic = (st == TYPE_JPEG) | (st == TYPE_PACKJPG); ic = (st == TYPE_JPEG) | (st == TYPE_PACKJPG) | (st == TYPE_AUDIO_COMPRESSED);
return (ic); return (ic);
} }

View file

@ -90,9 +90,11 @@ typedef int32_t bsize_t;
# if !defined(sun) && !defined (__sun) # if !defined(sun) && !defined (__sun)
# define LE64(x) __bswap_64(x) # define LE64(x) __bswap_64(x)
# define LE32(x) __bswap_32(x) # define LE32(x) __bswap_32(x)
# define LE16(x) __bswap_16(x)
# else # else
# define LE64(x) BSWAP_64(x) # define LE64(x) BSWAP_64(x)
# define LE32(x) BSWAP_32(x) # define LE32(x) BSWAP_32(x)
# define LE16(x) BSWAP_16(x)
# endif # endif
#else #else
# if !defined(sun) && !defined (__sun) # if !defined(sun) && !defined (__sun)
@ -105,6 +107,7 @@ typedef int32_t bsize_t;
# endif # endif
# define LE64(x) (x) # define LE64(x) (x)
# define LE32(x) (x) # define LE32(x) (x)
# define LE16(x) (x)
#endif #endif
@ -262,7 +265,10 @@ typedef enum {
TYPE_COMPRESSED_LZ = 136, TYPE_COMPRESSED_LZ = 136,
TYPE_COMPRESSED_PPMD = 144, TYPE_COMPRESSED_PPMD = 144,
TYPE_COMPRESSED_ZPAQ = 152, TYPE_COMPRESSED_ZPAQ = 152,
TYPE_PACKJPG = 160 TYPE_PACKJPG = 160,
TYPE_DNA_SEQ = 168,
TYPE_MJPEG = 176,
TYPE_AUDIO_COMPRESSED = 184
} data_type_t; } data_type_t;
/* /*