Structured handling of file types.
Handling of already compressed data based on compression algorithm. Add a few more extension types.
This commit is contained in:
parent
cae9de9b2e
commit
6aacd903ff
15 changed files with 107 additions and 49 deletions
|
@ -229,14 +229,14 @@ adapt_compress(void *src, uint64_t srclen, void *dst,
|
|||
* Use PPMd if some percentage of source is 7-bit textual bytes, otherwise
|
||||
* use Bzip2 or LZMA.
|
||||
*/
|
||||
if (adat->adapt_mode == 2 && (btype & TYPE_BINARY)) {
|
||||
if (adat->adapt_mode == 2 && (PC_TYPE(btype) == TYPE_BINARY)) {
|
||||
rv = lzma_compress(src, srclen, dst, dstlen, level, chdr, btype, adat->lzma_data);
|
||||
if (rv < 0)
|
||||
return (rv);
|
||||
rv = ADAPT_COMPRESS_LZMA;
|
||||
lzma_count++;
|
||||
|
||||
} else if (adat->adapt_mode == 1 && (btype & TYPE_BINARY)) {
|
||||
} else if (adat->adapt_mode == 1 && (PC_TYPE(btype) == TYPE_BINARY)) {
|
||||
rv = bzip2_compress(src, srclen, dst, dstlen, level, chdr, btype, NULL);
|
||||
if (rv < 0)
|
||||
return (rv);
|
||||
|
@ -245,7 +245,7 @@ adapt_compress(void *src, uint64_t srclen, void *dst,
|
|||
|
||||
} else {
|
||||
#ifdef ENABLE_PC_LIBBSC
|
||||
if (adat->bsc_data && (btype & TYPE_MARKUP)) {
|
||||
if (adat->bsc_data && PC_SUBTYPE(btype) == TYPE_MARKUP) {
|
||||
rv = libbsc_compress(src, srclen, dst, dstlen, level, chdr, btype, adat->bsc_data);
|
||||
if (rv < 0)
|
||||
return (rv);
|
||||
|
|
|
@ -942,6 +942,7 @@ archiver_thread_func(void *dat) {
|
|||
continue;
|
||||
}
|
||||
|
||||
typ = TYPE_UNKNOWN;
|
||||
if (archive_entry_filetype(entry) == AE_IFREG) {
|
||||
if ((typ = detect_type_by_ext(fpath, fpathlen)) != TYPE_UNKNOWN)
|
||||
pctx->ctype = typ;
|
||||
|
@ -1248,9 +1249,9 @@ detect_type_by_data(uchar_t *buf, size_t len)
|
|||
if (buf[0] == 0xe9)
|
||||
return (TYPE_BINARY|TYPE_EXE); // MSDOS COM
|
||||
if (U32_P(buf) == TZSHORT)
|
||||
return (TYPE_BINARY|TYPE_BINARY); // Timezone data
|
||||
return (TYPE_BINARY); // Timezone data
|
||||
if (U32_P(buf) == PPMSHORT)
|
||||
return (TYPE_BINARY|TYPE_COMPRESSED); // PPM Compressed archive
|
||||
return (TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_PPMD); // PPM Compressed archive
|
||||
|
||||
return (TYPE_UNKNOWN);
|
||||
}
|
||||
|
|
|
@ -174,12 +174,16 @@ bzip2_decompress(void *src, uint64_t srclen, void *dst, uint64_t *dstlen,
|
|||
char *dst1 = (char *)dst;
|
||||
char *src1 = (char *)src;
|
||||
|
||||
if (btype & TYPE_COMPRESSED) {
|
||||
if ((btype & TYPE_COMPRESSED_LZW) != TYPE_COMPRESSED_LZW &&
|
||||
(btype & TYPE_COMPRESSED_GZ) != TYPE_COMPRESSED_GZ &&
|
||||
(btype & TYPE_COMPRESSED_LZ) != TYPE_COMPRESSED_LZ &&
|
||||
(btype & TYPE_COMPRESSED_LZO) != TYPE_COMPRESSED_LZO)
|
||||
{
|
||||
/*
|
||||
* If the data is known to be compressed then certain types of less-compressed data
|
||||
* can be attempted to be compressed again for a possible gain. For others it is
|
||||
* a waste of time.
|
||||
*/
|
||||
if (PC_TYPE(btype) == TYPE_COMPRESSED && level < 7) {
|
||||
int subtype = PC_SUBTYPE(btype);
|
||||
|
||||
if (subtype != TYPE_COMPRESSED_LZW && subtype != TYPE_COMPRESSED_GZ &&
|
||||
subtype != TYPE_COMPRESSED_LZ && subtype != TYPE_COMPRESSED_LZO) {
|
||||
return (-1);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -153,9 +153,11 @@ libbsc_compress(void *src, uint64_t srclen, void *dst, uint64_t *dstlen,
|
|||
int rv;
|
||||
struct libbsc_params *bscdat = (struct libbsc_params *)data;
|
||||
|
||||
if ((btype & TYPE_COMPRESSED_BZ2) == TYPE_COMPRESSED_BZ2 ||
|
||||
(btype & TYPE_COMPRESSED_LZMA) == TYPE_COMPRESSED_LZMA)
|
||||
return (-1);
|
||||
if (PC_TYPE(btype) == TYPE_COMPRESSED) {
|
||||
int subtype = PC_SUBTYPE(btype);
|
||||
if (subtype == TYPE_COMPRESSED_BZ2 || subtype == TYPE_COMPRESSED_LZMA)
|
||||
return (-1);
|
||||
}
|
||||
|
||||
rv = bsc_compress(src, dst, srclen, bscdat->lzpHashSize, bscdat->lzpMinLen,
|
||||
LIBBSC_BLOCKSORTER_BWT, bscdat->bscCoder, bscdat->features);
|
||||
|
|
|
@ -106,6 +106,12 @@ lz4_compress(void *src, uint64_t srclen, void *dst, uint64_t *dstlen,
|
|||
int _srclen = srclen;
|
||||
uchar_t *dst2;
|
||||
|
||||
/*
|
||||
* Ignore compressed data in fast modes.
|
||||
*/
|
||||
if (lzdat->level < 3 && PC_TYPE(btype) == TYPE_COMPRESSED)
|
||||
return (-1);
|
||||
|
||||
if (lzdat->level == 1) {
|
||||
rv = LZ4_compress((const char *)src, (char *)dst, _srclen);
|
||||
|
||||
|
|
|
@ -111,6 +111,12 @@ lz_fx_compress(void *src, uint64_t srclen, void *dst, uint64_t *dstlen,
|
|||
unsigned int _srclen = srclen;
|
||||
unsigned int _dstlen = *dstlen;
|
||||
|
||||
/*
|
||||
* Ignore compressed data in fast modes.
|
||||
*/
|
||||
if (level < 7 && PC_TYPE(btype) == TYPE_COMPRESSED)
|
||||
return (-1);
|
||||
|
||||
rv = lzfx_compress(src, _srclen, dst, &_dstlen, lzdat->htab_bits);
|
||||
if (rv != 0) {
|
||||
if (rv != LZFX_ESIZE)
|
||||
|
|
|
@ -211,7 +211,7 @@ lzma_compress(void *src, uint64_t srclen, void *dst,
|
|||
return (-1);
|
||||
}
|
||||
|
||||
if ((btype & TYPE_COMPRESSED_LZMA) == TYPE_COMPRESSED_LZMA)
|
||||
if (PC_SUBTYPE(btype) == TYPE_COMPRESSED_LZMA)
|
||||
return (-1);
|
||||
props->level = level;
|
||||
|
||||
|
|
|
@ -1544,12 +1544,13 @@ plain_index:
|
|||
o_chunksize = _chunksize;
|
||||
|
||||
/* Compress data chunk. */
|
||||
if ((pctx->lzp_preprocess || pctx->enable_delta2_encode) && _chunksize > 0) {
|
||||
if ((pctx->lzp_preprocess || pctx->enable_delta2_encode) && _chunksize > 0 &&
|
||||
PC_SUBTYPE(pctx->btype) == TYPE_CMP_MAX) {
|
||||
rv = preproc_compress(pctx, tdat->compress, tdat->uncompressed_chunk + dedupe_index_sz,
|
||||
_chunksize, compressed_chunk + index_size_cmp, &_chunksize,
|
||||
tdat->level, 0, pctx->btype, tdat->data, tdat->props);
|
||||
|
||||
} else if (_chunksize > 0) {
|
||||
} else if (_chunksize > 0 && PC_SUBTYPE(pctx->btype) == TYPE_CMP_MAX) {
|
||||
DEBUG_STAT_EN(double strt, en);
|
||||
|
||||
DEBUG_STAT_EN(strt = get_wtime_millis());
|
||||
|
|
|
@ -114,8 +114,9 @@ ppmd_compress(void *src, uint64_t srclen, void *dst,
|
|||
CPpmd8 *_ppmd = (CPpmd8 *)data;
|
||||
uchar_t *_src = (uchar_t *)src;
|
||||
|
||||
if (btype & TYPE_COMPRESSED)
|
||||
if (PC_TYPE(btype) == TYPE_COMPRESSED)
|
||||
return (-1);
|
||||
|
||||
Ppmd8_RangeEnc_Init(_ppmd);
|
||||
Ppmd8_Init(_ppmd, _ppmd->Order, PPMD8_RESTORE_METHOD_RESTART);
|
||||
_ppmd->buf = (Byte *)dst;
|
||||
|
|
|
@ -99,8 +99,8 @@ struct ext_entry {
|
|||
{"xpi" , TYPE_BINARY|TYPE_EXE, 3},
|
||||
{"off" , TYPE_BINARY|TYPE_EXE, 3},
|
||||
{"pdf" , TYPE_BINARY, 3},
|
||||
{"jpg" , TYPE_BINARY|TYPE_COMPRESSED|TYPE_JPEG, 3},
|
||||
{"jpeg" , TYPE_BINARY|TYPE_COMPRESSED|TYPE_JPEG, 4},
|
||||
{"jpg" , TYPE_BINARY|TYPE_COMPRESSED|TYPE_CMP_MAX, 3},
|
||||
{"jpeg" , TYPE_BINARY|TYPE_COMPRESSED|TYPE_CMP_MAX, 4},
|
||||
{"png" , TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_GZ, 3},
|
||||
{"mp3" , TYPE_BINARY|TYPE_COMPRESSED, 3},
|
||||
{"wma" , TYPE_BINARY|TYPE_COMPRESSED, 3},
|
||||
|
@ -114,7 +114,8 @@ struct ext_entry {
|
|||
{"flac" , TYPE_BINARY|TYPE_COMPRESSED, 4},
|
||||
{"pac" , TYPE_BINARY|TYPE_COMPRESSED, 3},
|
||||
{"gif" , TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_LZW, 3},
|
||||
{"jp2" , TYPE_BINARY|TYPE_COMPRESSED|TYPE_JPEG, 3},
|
||||
{"jp2" , TYPE_BINARY|TYPE_COMPRESSED|TYPE_CMP_MAX, 3},
|
||||
{"pjg" , TYPE_BINARY|TYPE_COMPRESSED|TYPE_CMP_MAX, 3},
|
||||
{"gz" , TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_GZ, 2},
|
||||
{"tgz" , TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_GZ, 3},
|
||||
{"bz2" , TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_BZ2, 3},
|
||||
|
|
|
@ -89,8 +89,8 @@ com,TYPE_BINARY|TYPE_EXE
|
|||
xpi,TYPE_BINARY|TYPE_EXE
|
||||
off,TYPE_BINARY|TYPE_EXE
|
||||
pdf,TYPE_BINARY
|
||||
jpg,TYPE_BINARY|TYPE_COMPRESSED|TYPE_JPEG
|
||||
jpeg,TYPE_BINARY|TYPE_COMPRESSED|TYPE_JPEG
|
||||
jpg,TYPE_BINARY|TYPE_COMPRESSED|TYPE_CMP_MAX
|
||||
jpeg,TYPE_BINARY|TYPE_COMPRESSED|TYPE_CMP_MAX
|
||||
png,TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_GZ
|
||||
mp3,TYPE_BINARY|TYPE_COMPRESSED
|
||||
wma,TYPE_BINARY|TYPE_COMPRESSED
|
||||
|
@ -104,7 +104,8 @@ ofr,TYPE_BINARY|TYPE_COMPRESSED
|
|||
flac,TYPE_BINARY|TYPE_COMPRESSED
|
||||
pac,TYPE_BINARY|TYPE_COMPRESSED
|
||||
gif,TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_LZW
|
||||
jp2,TYPE_BINARY|TYPE_COMPRESSED|TYPE_JPEG
|
||||
jp2,TYPE_BINARY|TYPE_COMPRESSED|TYPE_CMP_MAX
|
||||
pjg,TYPE_BINARY|TYPE_COMPRESSED|TYPE_CMP_MAX
|
||||
gz,TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_GZ
|
||||
tgz,TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_GZ
|
||||
bz2,TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_BZ2
|
||||
|
|
|
@ -13,12 +13,12 @@
|
|||
/* small adjustments to _a_ to make values distinct */
|
||||
ub1 tab[] = {
|
||||
125,0,0,82,113,0,125,85,113,0,0,7,0,0,125,0,
|
||||
0,0,7,87,0,0,82,0,0,88,0,7,0,85,125,85,
|
||||
0,113,0,0,85,0,0,113,0,113,124,125,0,125,0,0,
|
||||
113,0,11,113,125,0,0,0,0,85,113,85,22,0,0,125,
|
||||
0,113,0,0,113,0,82,0,125,111,87,88,69,125,113,0,
|
||||
124,0,7,22,113,22,0,235,0,120,120,125,113,0,74,120,
|
||||
0,124,87,7,0,127,0,0,11,85,85,146,115,11,183,146,
|
||||
0,0,7,87,0,0,82,0,0,125,0,7,0,85,125,85,
|
||||
0,113,0,0,85,0,0,113,0,113,124,0,0,125,0,0,
|
||||
113,0,11,113,125,7,0,0,0,85,113,85,22,0,0,125,
|
||||
0,113,0,0,113,0,82,0,125,111,87,124,69,183,113,0,
|
||||
124,0,7,22,120,22,0,127,0,120,51,125,0,0,74,120,
|
||||
0,124,87,113,0,127,0,0,11,85,85,146,69,11,183,146,
|
||||
0,0,88,0,0,85,42,0,171,0,0,0,0,83,0,0,
|
||||
};
|
||||
|
||||
|
|
|
@ -8,7 +8,7 @@
|
|||
|
||||
extern ub1 tab[];
|
||||
#define PHASHLEN 0x80 /* length of hash mapping table */
|
||||
#define PHASHNKEYS 133 /* How many keys were hashed */
|
||||
#define PHASHNKEYS 134 /* How many keys were hashed */
|
||||
#define PHASHRANGE 256 /* Range any input might map to */
|
||||
#define PHASHSALT 0x9e3779b9 /* internal, initialize normal hash */
|
||||
|
||||
|
|
|
@ -232,31 +232,53 @@ struct fn_list {
|
|||
* Enumerated type constants for file type identification in pc_archive.
|
||||
*/
|
||||
typedef enum {
|
||||
/*
|
||||
* Primary Types.
|
||||
*/
|
||||
TYPE_UNKNOWN = 0,
|
||||
TYPE_TEXT = 1,
|
||||
TYPE_BINARY = 2,
|
||||
TYPE_COMPRESSED = 4,
|
||||
/*
|
||||
* Sub-types.
|
||||
*/
|
||||
TYPE_EXE = 8,
|
||||
TYPE_JPEG = 12,
|
||||
TYPE_MARKUP = 16,
|
||||
TYPE_COMPRESSED_GZ = 20,
|
||||
TYPE_COMPRESSED_LZW = 24,
|
||||
TYPE_COMPRESSED_BZ2 = 28,
|
||||
TYPE_COMPRESSED_ZIP = 32,
|
||||
TYPE_COMPRESSED_ARJ = 36,
|
||||
TYPE_COMPRESSED_ARC = 40,
|
||||
TYPE_COMPRESSED_LH = 44,
|
||||
TYPE_COMPRESSED_LZMA = 48,
|
||||
TYPE_COMPRESSED_LZO = 52,
|
||||
TYPE_COMPRESSED_UHARC = 56,
|
||||
TYPE_COMPRESSED_ALZ = 60,
|
||||
TYPE_COMPRESSED_ACE = 64,
|
||||
TYPE_COMPRESSED_RAR = 68,
|
||||
TYPE_COMPRESSED_LZ = 72,
|
||||
TYPE_COMPRESSED_PPMD = 76,
|
||||
TYPE_COMPRESSED_ZPAQ = 80
|
||||
TYPE_CMP_MAX = 16,
|
||||
TYPE_MARKUP = 24,
|
||||
TYPE_COMPRESSED_GZ = 32,
|
||||
TYPE_COMPRESSED_LZW = 40,
|
||||
TYPE_COMPRESSED_BZ2 = 48,
|
||||
TYPE_COMPRESSED_ZIP = 56,
|
||||
TYPE_COMPRESSED_ARJ = 64,
|
||||
TYPE_COMPRESSED_ARC = 72,
|
||||
TYPE_COMPRESSED_LH = 80,
|
||||
TYPE_COMPRESSED_LZMA = 88,
|
||||
TYPE_COMPRESSED_LZO = 96,
|
||||
TYPE_COMPRESSED_UHARC = 104,
|
||||
TYPE_COMPRESSED_ALZ = 112,
|
||||
TYPE_COMPRESSED_ACE = 120,
|
||||
TYPE_COMPRESSED_RAR = 128,
|
||||
TYPE_COMPRESSED_LZ = 136,
|
||||
TYPE_COMPRESSED_PPMD = 144,
|
||||
TYPE_COMPRESSED_ZPAQ = 152
|
||||
} data_type_t;
|
||||
|
||||
/*
|
||||
* Type identifier is an int with the following format.
|
||||
*
|
||||
* Sub Type Primary Type
|
||||
* (Numeric Value) (Bit Positions - Flags)
|
||||
* _____________|_____________ ___|___
|
||||
* | | | |
|
||||
* .---------------------------------------.
|
||||
* | | | | | | | | | | |
|
||||
* Bit 10 Bit 0
|
||||
*/
|
||||
#define PC_TYPE_MASK 0x7
|
||||
#define PC_SUBTYPE_MASK 0x7f8
|
||||
#define PC_SUBTYPE(x) ((x) & PC_SUBTYPE_MASK)
|
||||
#define PC_TYPE(x) ((x) & PC_TYPE_MASK)
|
||||
|
||||
#ifndef _IN_UTILS_
|
||||
extern processor_info_t proc_info;
|
||||
#endif
|
||||
|
|
|
@ -152,6 +152,19 @@ zlib_compress(void *src, uint64_t srclen, void *dst, uint64_t *dstlen,
|
|||
uchar_t *src1 = (uchar_t *)src;
|
||||
z_stream *zs = (z_stream *)data;
|
||||
|
||||
/*
|
||||
* If the data is known to be compressed then certain types of less-compressed data
|
||||
* can be attempted to be compressed again for a possible gain. For others it is
|
||||
* a waste of time.
|
||||
*/
|
||||
if (PC_TYPE(btype) == TYPE_COMPRESSED && level < 7) {
|
||||
int subtype = PC_SUBTYPE(btype);
|
||||
|
||||
if (subtype != TYPE_COMPRESSED_LZW &&
|
||||
subtype != TYPE_COMPRESSED_LZ && subtype != TYPE_COMPRESSED_LZO) {
|
||||
return (-1);
|
||||
}
|
||||
}
|
||||
ending = 0;
|
||||
while (_srclen > 0) {
|
||||
if (_srclen > SINGLE_CALL_MAX) {
|
||||
|
|
Loading…
Reference in a new issue