Leverage file type detection (archiver) to improve compression performance.

Use the file/data type detected by the archiver for Adaptive compression modes.
Update type flags and add more extensions.
This commit is contained in:
Moinak Ghosh 2013-11-08 23:50:28 +05:30
parent b7facc929e
commit cae9de9b2e
19 changed files with 340 additions and 239 deletions

View file

@ -35,6 +35,7 @@
#include <utils.h> #include <utils.h>
#include <pcompress.h> #include <pcompress.h>
#include <allocator.h> #include <allocator.h>
#include <pc_archive.h>
#define FIFTY_PCT(x) (((x)/10) * 5) #define FIFTY_PCT(x) (((x)/10) * 5)
#define FORTY_PCT(x) (((x)/10) * 4) #define FORTY_PCT(x) (((x)/10) * 4)
@ -46,22 +47,22 @@ static unsigned int bsc_count = 0;
static unsigned int ppmd_count = 0; static unsigned int ppmd_count = 0;
extern int lzma_compress(void *src, uint64_t srclen, void *dst, extern int lzma_compress(void *src, uint64_t srclen, void *dst,
uint64_t *destlen, int level, uchar_t chdr, void *data); uint64_t *destlen, int level, uchar_t chdr, int btype, void *data);
extern int bzip2_compress(void *src, uint64_t srclen, void *dst, extern int bzip2_compress(void *src, uint64_t srclen, void *dst,
uint64_t *destlen, int level, uchar_t chdr, void *data); uint64_t *destlen, int level, uchar_t chdr, int btype, void *data);
extern int ppmd_compress(void *src, uint64_t srclen, void *dst, extern int ppmd_compress(void *src, uint64_t srclen, void *dst,
uint64_t *dstlen, int level, uchar_t chdr, void *data); uint64_t *dstlen, int level, uchar_t chdr, int btype, void *data);
extern int libbsc_compress(void *src, uint64_t srclen, void *dst, extern int libbsc_compress(void *src, uint64_t srclen, void *dst,
uint64_t *dstlen, int level, uchar_t chdr, void *data); uint64_t *dstlen, int level, uchar_t chdr, int btype, void *data);
extern int lzma_decompress(void *src, uint64_t srclen, void *dst, extern int lzma_decompress(void *src, uint64_t srclen, void *dst,
uint64_t *dstlen, int level, uchar_t chdr, void *data); uint64_t *dstlen, int level, uchar_t chdr, int btype, void *data);
extern int bzip2_decompress(void *src, uint64_t srclen, void *dst, extern int bzip2_decompress(void *src, uint64_t srclen, void *dst,
uint64_t *dstlen, int level, uchar_t chdr, void *data); uint64_t *dstlen, int level, uchar_t chdr, int btype, void *data);
extern int ppmd_decompress(void *src, uint64_t srclen, void *dst, extern int ppmd_decompress(void *src, uint64_t srclen, void *dst,
uint64_t *dstlen, int level, uchar_t chdr, void *data); uint64_t *dstlen, int level, uchar_t chdr, int btype, void *data);
extern int libbsc_decompress(void *src, uint64_t srclen, void *dst, extern int libbsc_decompress(void *src, uint64_t srclen, void *dst,
uint64_t *dstlen, int level, uchar_t chdr, void *data); uint64_t *dstlen, int level, uchar_t chdr, int btype, void *data);
extern int lzma_init(void **data, int *level, int nthreads, uint64_t chunksize, extern int lzma_init(void **data, int *level, int nthreads, uint64_t chunksize,
int file_version, compress_op_t op); int file_version, compress_op_t op);
@ -180,15 +181,16 @@ adapt_deinit(void **data)
int int
adapt_compress(void *src, uint64_t srclen, void *dst, adapt_compress(void *src, uint64_t srclen, void *dst,
uint64_t *dstlen, int level, uchar_t chdr, void *data) uint64_t *dstlen, int level, uchar_t chdr, int btype, void *data)
{ {
struct adapt_data *adat = (struct adapt_data *)(data); struct adapt_data *adat = (struct adapt_data *)(data);
uchar_t *src1 = (uchar_t *)src; uchar_t *src1 = (uchar_t *)src;
uint64_t i, tot8b, tag1, tag2, tag3;
int rv = 0; int rv = 0;
if (btype == TYPE_UNKNOWN) {
uint64_t i, tot8b, tag1, tag2, tag3;
double tagcnt, pct_tag; double tagcnt, pct_tag;
uchar_t cur_byte, prev_byte; uchar_t cur_byte, prev_byte;
/* /*
* Count number of 8-bit binary bytes and XML tags in source. * Count number of 8-bit binary bytes and XML tags in source.
*/ */
@ -211,20 +213,31 @@ adapt_compress(void *src, uint64_t srclen, void *dst,
tot8b /= 0x80; tot8b /= 0x80;
tagcnt = tag1 + tag2 + tag3; tagcnt = tag1 + tag2 + tag3;
pct_tag = tagcnt / (double)srclen; pct_tag = tagcnt / (double)srclen;
if (adat->adapt_mode == 2 && tot8b > FORTY_PCT(srclen)) {
btype = TYPE_BINARY;
} else if (adat->adapt_mode == 1 && tot8b > FIFTY_PCT(srclen)) {
btype = TYPE_BINARY;
} else {
btype = TYPE_TEXT;
if (tag1 > tag2 - 4 && tag1 < tag2 + 4 && tag3 > (double)tag1 * 0.40 &&
tagcnt > (double)srclen * 0.001)
btype |= TYPE_MARKUP;
}
}
/* /*
* Use PPMd if some percentage of source is 7-bit textual bytes, otherwise * Use PPMd if some percentage of source is 7-bit textual bytes, otherwise
* use Bzip2 or LZMA. * use Bzip2 or LZMA.
*/ */
if (adat->adapt_mode == 2 && tot8b > FORTY_PCT(srclen)) { if (adat->adapt_mode == 2 && (btype & TYPE_BINARY)) {
rv = lzma_compress(src, srclen, dst, dstlen, level, chdr, adat->lzma_data); rv = lzma_compress(src, srclen, dst, dstlen, level, chdr, btype, adat->lzma_data);
if (rv < 0) if (rv < 0)
return (rv); return (rv);
rv = ADAPT_COMPRESS_LZMA; rv = ADAPT_COMPRESS_LZMA;
lzma_count++; lzma_count++;
} else if (adat->adapt_mode == 1 && tot8b > FIFTY_PCT(srclen)) { } else if (adat->adapt_mode == 1 && (btype & TYPE_BINARY)) {
rv = bzip2_compress(src, srclen, dst, dstlen, level, chdr, NULL); rv = bzip2_compress(src, srclen, dst, dstlen, level, chdr, btype, NULL);
if (rv < 0) if (rv < 0)
return (rv); return (rv);
rv = ADAPT_COMPRESS_BZIP2; rv = ADAPT_COMPRESS_BZIP2;
@ -232,16 +245,15 @@ adapt_compress(void *src, uint64_t srclen, void *dst,
} else { } else {
#ifdef ENABLE_PC_LIBBSC #ifdef ENABLE_PC_LIBBSC
if (adat->bsc_data && tag1 > tag2 - 4 && tag1 < tag2 + 4 && tag3 > (double)tag1 * 0.40 && if (adat->bsc_data && (btype & TYPE_MARKUP)) {
tagcnt > (double)srclen * 0.001) { rv = libbsc_compress(src, srclen, dst, dstlen, level, chdr, btype, adat->bsc_data);
rv = libbsc_compress(src, srclen, dst, dstlen, level, chdr, adat->bsc_data);
if (rv < 0) if (rv < 0)
return (rv); return (rv);
rv = ADAPT_COMPRESS_BSC; rv = ADAPT_COMPRESS_BSC;
bsc_count++; bsc_count++;
} else { } else {
#endif #endif
rv = ppmd_compress(src, srclen, dst, dstlen, level, chdr, adat->ppmd_data); rv = ppmd_compress(src, srclen, dst, dstlen, level, chdr, btype, adat->ppmd_data);
if (rv < 0) if (rv < 0)
return (rv); return (rv);
rv = ADAPT_COMPRESS_PPMD; rv = ADAPT_COMPRESS_PPMD;
@ -256,7 +268,7 @@ adapt_compress(void *src, uint64_t srclen, void *dst,
int int
adapt_decompress(void *src, uint64_t srclen, void *dst, adapt_decompress(void *src, uint64_t srclen, void *dst,
uint64_t *dstlen, int level, uchar_t chdr, void *data) uint64_t *dstlen, int level, uchar_t chdr, int btype, void *data)
{ {
struct adapt_data *adat = (struct adapt_data *)(data); struct adapt_data *adat = (struct adapt_data *)(data);
uchar_t cmp_flags; uchar_t cmp_flags;
@ -264,17 +276,17 @@ adapt_decompress(void *src, uint64_t srclen, void *dst,
cmp_flags = (chdr>>4) & CHDR_ALGO_MASK; cmp_flags = (chdr>>4) & CHDR_ALGO_MASK;
if (cmp_flags == ADAPT_COMPRESS_LZMA) { if (cmp_flags == ADAPT_COMPRESS_LZMA) {
return (lzma_decompress(src, srclen, dst, dstlen, level, chdr, adat->lzma_data)); return (lzma_decompress(src, srclen, dst, dstlen, level, chdr, btype, adat->lzma_data));
} else if (cmp_flags == ADAPT_COMPRESS_BZIP2) { } else if (cmp_flags == ADAPT_COMPRESS_BZIP2) {
return (bzip2_decompress(src, srclen, dst, dstlen, level, chdr, NULL)); return (bzip2_decompress(src, srclen, dst, dstlen, level, chdr, btype, NULL));
} else if (cmp_flags == ADAPT_COMPRESS_PPMD) { } else if (cmp_flags == ADAPT_COMPRESS_PPMD) {
return (ppmd_decompress(src, srclen, dst, dstlen, level, chdr, adat->ppmd_data)); return (ppmd_decompress(src, srclen, dst, dstlen, level, chdr, btype, adat->ppmd_data));
} else if (cmp_flags == ADAPT_COMPRESS_BSC) { } else if (cmp_flags == ADAPT_COMPRESS_BSC) {
#ifdef ENABLE_PC_LIBBSC #ifdef ENABLE_PC_LIBBSC
return (libbsc_decompress(src, srclen, dst, dstlen, level, chdr, adat->bsc_data)); return (libbsc_decompress(src, srclen, dst, dstlen, level, chdr, btype, adat->bsc_data));
#else #else
log_msg(LOG_ERR, 0, "Cannot decompress chunk. Libbsc support not present.\n"); log_msg(LOG_ERR, 0, "Cannot decompress chunk. Libbsc support not present.\n");
return (-1); return (-1);

View file

@ -229,8 +229,6 @@ archiver_read(void *ctx, void *buf, uint64_t count)
sem_post(&(pctx->write_sem)); sem_post(&(pctx->write_sem));
sem_wait(&(pctx->read_sem)); sem_wait(&(pctx->read_sem));
pctx->arc_buf = NULL; pctx->arc_buf = NULL;
if (pctx->btype == TYPE_UNKNOWN)
pctx->btype = TYPE_GENERIC;
return (pctx->arc_buf_pos); return (pctx->arc_buf_pos);
} }
@ -1166,9 +1164,9 @@ init_archive_mod() {
if (!inited) { if (!inited) {
int i, j; int i, j;
exthtab = malloc(NUM_EXT * sizeof (struct ext_hash_entry)); exthtab = malloc(PHASHNKEYS * sizeof (struct ext_hash_entry));
if (exthtab != NULL) { if (exthtab != NULL) {
for (i = 0; i < NUM_EXT; i++) { for (i = 0; i < PHASHNKEYS; i++) {
uint64_t extnum; uint64_t extnum;
ub4 slot = phash(extlist[i].ext, extlist[i].len); ub4 slot = phash(extlist[i].ext, extlist[i].len);
extnum = 0; extnum = 0;
@ -1211,7 +1209,7 @@ detect_type_by_ext(char *path, int pathlen)
if (len == 0) goto out; // If extension is empty give up if (len == 0) goto out; // If extension is empty give up
ext = &path[i+1]; ext = &path[i+1];
slot = phash(ext, len); slot = phash(ext, len);
if (slot > NUM_EXT) goto out; // Extension maps outside hash table range, give up if (slot > PHASHNKEYS) goto out; // Extension maps outside hash table range, give up
extnum = 0; extnum = 0;
/* /*
@ -1244,15 +1242,15 @@ detect_type_by_data(uchar_t *buf, size_t len)
if (len < 16) return (TYPE_UNKNOWN); if (len < 16) return (TYPE_UNKNOWN);
if (U32_P(buf) == ELFSHORT) if (U32_P(buf) == ELFSHORT)
return (TYPE_EXE); // Regular ELF return (TYPE_BINARY|TYPE_EXE); // Regular ELF
if ((buf[0] == 'M' || buf[0] == 'L') && buf[1] == 'Z') if ((buf[0] == 'M' || buf[0] == 'L') && buf[1] == 'Z')
return (TYPE_EXE); // MSDOS Exe return (TYPE_BINARY|TYPE_EXE); // MSDOS Exe
if (buf[0] == 0xe9) if (buf[0] == 0xe9)
return (TYPE_EXE); // MSDOS COM return (TYPE_BINARY|TYPE_EXE); // MSDOS COM
if (U32_P(buf) == TZSHORT) if (U32_P(buf) == TZSHORT)
return (TYPE_BINARY); // Timezone data return (TYPE_BINARY|TYPE_BINARY); // Timezone data
if (U32_P(buf) == PPMSHORT) if (U32_P(buf) == PPMSHORT)
return (TYPE_COMPRESSED); // PPM Compressed archive return (TYPE_BINARY|TYPE_COMPRESSED); // PPM Compressed archive
return (TYPE_UNKNOWN); return (TYPE_UNKNOWN);
} }

View file

@ -26,6 +26,9 @@
#ifndef _ARCHIVE_H #ifndef _ARCHIVE_H
#define _ARCHIVE_H #define _ARCHIVE_H
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>
#include <pcompress.h> #include <pcompress.h>
#ifdef __cplusplus #ifdef __cplusplus
@ -38,16 +41,6 @@ typedef struct {
size_t size; size_t size;
} archive_list_entry_t; } archive_list_entry_t;
typedef enum {
TYPE_UNKNOWN = 0,
TYPE_GENERIC,
TYPE_COMPRESSED,
TYPE_EXE,
TYPE_TEXT,
TYPE_BINARY,
TYPE_JPEG
} data_type_t;
/* /*
* Archiving related functions. * Archiving related functions.
*/ */

View file

@ -95,7 +95,7 @@ bzerr(int err)
int int
bzip2_compress(void *src, uint64_t srclen, void *dst, uint64_t *dstlen, bzip2_compress(void *src, uint64_t srclen, void *dst, uint64_t *dstlen,
int level, uchar_t chdr, void *data) int level, uchar_t chdr, int btype, void *data)
{ {
bz_stream bzs; bz_stream bzs;
int ret, ending; int ret, ending;
@ -164,7 +164,7 @@ bzip2_compress(void *src, uint64_t srclen, void *dst, uint64_t *dstlen,
int int
bzip2_decompress(void *src, uint64_t srclen, void *dst, uint64_t *dstlen, bzip2_decompress(void *src, uint64_t srclen, void *dst, uint64_t *dstlen,
int level, uchar_t chdr, void *data) int level, uchar_t chdr, int btype, void *data)
{ {
bz_stream bzs; bz_stream bzs;
int ret; int ret;
@ -174,6 +174,15 @@ bzip2_decompress(void *src, uint64_t srclen, void *dst, uint64_t *dstlen,
char *dst1 = (char *)dst; char *dst1 = (char *)dst;
char *src1 = (char *)src; char *src1 = (char *)src;
if (btype & TYPE_COMPRESSED) {
if ((btype & TYPE_COMPRESSED_LZW) != TYPE_COMPRESSED_LZW &&
(btype & TYPE_COMPRESSED_GZ) != TYPE_COMPRESSED_GZ &&
(btype & TYPE_COMPRESSED_LZ) != TYPE_COMPRESSED_LZ &&
(btype & TYPE_COMPRESSED_LZO) != TYPE_COMPRESSED_LZO)
{
return (-1);
}
}
bzs.bzalloc = slab_alloc_i; bzs.bzalloc = slab_alloc_i;
bzs.bzfree = slab_free; bzs.bzfree = slab_free;
bzs.opaque = NULL; bzs.opaque = NULL;

View file

@ -148,11 +148,15 @@ libbsc_deinit(void **data)
int int
libbsc_compress(void *src, uint64_t srclen, void *dst, uint64_t *dstlen, libbsc_compress(void *src, uint64_t srclen, void *dst, uint64_t *dstlen,
int level, uchar_t chdr, void *data) int level, uchar_t chdr, int btype, void *data)
{ {
int rv; int rv;
struct libbsc_params *bscdat = (struct libbsc_params *)data; struct libbsc_params *bscdat = (struct libbsc_params *)data;
if ((btype & TYPE_COMPRESSED_BZ2) == TYPE_COMPRESSED_BZ2 ||
(btype & TYPE_COMPRESSED_LZMA) == TYPE_COMPRESSED_LZMA)
return (-1);
rv = bsc_compress(src, dst, srclen, bscdat->lzpHashSize, bscdat->lzpMinLen, rv = bsc_compress(src, dst, srclen, bscdat->lzpHashSize, bscdat->lzpMinLen,
LIBBSC_BLOCKSORTER_BWT, bscdat->bscCoder, bscdat->features); LIBBSC_BLOCKSORTER_BWT, bscdat->bscCoder, bscdat->features);
if (rv < 0) { if (rv < 0) {
@ -165,7 +169,7 @@ libbsc_compress(void *src, uint64_t srclen, void *dst, uint64_t *dstlen,
int int
libbsc_decompress(void *src, uint64_t srclen, void *dst, uint64_t *dstlen, libbsc_decompress(void *src, uint64_t srclen, void *dst, uint64_t *dstlen,
int level, uchar_t chdr, void *data) int level, uchar_t chdr, int btype, void *data)
{ {
int rv; int rv;
struct libbsc_params *bscdat = (struct libbsc_params *)data; struct libbsc_params *bscdat = (struct libbsc_params *)data;

View file

@ -99,7 +99,7 @@ lz4_deinit(void **data)
int int
lz4_compress(void *src, uint64_t srclen, void *dst, uint64_t *dstlen, lz4_compress(void *src, uint64_t srclen, void *dst, uint64_t *dstlen,
int level, uchar_t chdr, void *data) int level, uchar_t chdr, int btype, void *data)
{ {
int rv; int rv;
struct lz4_params *lzdat = (struct lz4_params *)data; struct lz4_params *lzdat = (struct lz4_params *)data;
@ -135,7 +135,7 @@ lz4_compress(void *src, uint64_t srclen, void *dst, uint64_t *dstlen,
int int
lz4_decompress(void *src, uint64_t srclen, void *dst, uint64_t *dstlen, lz4_decompress(void *src, uint64_t srclen, void *dst, uint64_t *dstlen,
int level, uchar_t chdr, void *data) int level, uchar_t chdr, int btype, void *data)
{ {
int rv; int rv;
struct lz4_params *lzdat = (struct lz4_params *)data; struct lz4_params *lzdat = (struct lz4_params *)data;

View file

@ -104,7 +104,7 @@ lz_fx_err(int err)
int int
lz_fx_compress(void *src, uint64_t srclen, void *dst, uint64_t *dstlen, lz_fx_compress(void *src, uint64_t srclen, void *dst, uint64_t *dstlen,
int level, uchar_t chdr, void *data) int level, uchar_t chdr, int btype, void *data)
{ {
int rv; int rv;
struct lzfx_params *lzdat = (struct lzfx_params *)data; struct lzfx_params *lzdat = (struct lzfx_params *)data;
@ -124,7 +124,7 @@ lz_fx_compress(void *src, uint64_t srclen, void *dst, uint64_t *dstlen,
int int
lz_fx_decompress(void *src, uint64_t srclen, void *dst, uint64_t *dstlen, lz_fx_decompress(void *src, uint64_t srclen, void *dst, uint64_t *dstlen,
int level, uchar_t chdr, void *data) int level, uchar_t chdr, int btype, void *data)
{ {
int rv; int rv;
unsigned int _srclen = srclen; unsigned int _srclen = srclen;

View file

@ -199,7 +199,7 @@ lzerr(int err, int cmp)
*/ */
int int
lzma_compress(void *src, uint64_t srclen, void *dst, lzma_compress(void *src, uint64_t srclen, void *dst,
uint64_t *dstlen, int level, uchar_t chdr, void *data) uint64_t *dstlen, int level, uchar_t chdr, int btype, void *data)
{ {
uint64_t props_len = LZMA_PROPS_SIZE; uint64_t props_len = LZMA_PROPS_SIZE;
SRes res; SRes res;
@ -210,6 +210,9 @@ lzma_compress(void *src, uint64_t srclen, void *dst,
lzerr(SZ_ERROR_DESTLEN, 1); lzerr(SZ_ERROR_DESTLEN, 1);
return (-1); return (-1);
} }
if ((btype & TYPE_COMPRESSED_LZMA) == TYPE_COMPRESSED_LZMA)
return (-1);
props->level = level; props->level = level;
_dst = (Byte *)dst; _dst = (Byte *)dst;
@ -228,7 +231,7 @@ lzma_compress(void *src, uint64_t srclen, void *dst,
int int
lzma_decompress(void *src, uint64_t srclen, void *dst, lzma_decompress(void *src, uint64_t srclen, void *dst,
uint64_t *dstlen, int level, uchar_t chdr, void *data) uint64_t *dstlen, int level, uchar_t chdr, int btype, void *data)
{ {
uint64_t _srclen; uint64_t _srclen;
const uchar_t *_src; const uchar_t *_src;

View file

@ -61,7 +61,7 @@ none_deinit(void **data)
int int
none_compress(void *src, uint64_t srclen, void *dst, uint64_t *dstlen, none_compress(void *src, uint64_t srclen, void *dst, uint64_t *dstlen,
int level, uchar_t chdr, void *data) int level, uchar_t chdr, int btype, void *data)
{ {
memcpy(dst, src, srclen); memcpy(dst, src, srclen);
return (0); return (0);
@ -69,7 +69,7 @@ none_compress(void *src, uint64_t srclen, void *dst, uint64_t *dstlen,
int int
none_decompress(void *src, uint64_t srclen, void *dst, uint64_t *dstlen, none_decompress(void *src, uint64_t srclen, void *dst, uint64_t *dstlen,
int level, uchar_t chdr, void *data) int level, uchar_t chdr, int btype, void *data)
{ {
memcpy(dst, src, srclen); memcpy(dst, src, srclen);
return (0); return (0);

View file

@ -201,7 +201,7 @@ show_compression_stats(pc_ctx_t *pctx)
*/ */
static int static int
preproc_compress(pc_ctx_t *pctx, compress_func_ptr cmp_func, void *src, uint64_t srclen, preproc_compress(pc_ctx_t *pctx, compress_func_ptr cmp_func, void *src, uint64_t srclen,
void *dst, uint64_t *dstlen, int level, uchar_t chdr, void *data, algo_props_t *props) void *dst, uint64_t *dstlen, int level, uchar_t chdr, int btype, void *data, algo_props_t *props)
{ {
uchar_t *dest = (uchar_t *)dst, type = 0; uchar_t *dest = (uchar_t *)dst, type = 0;
int64_t result; int64_t result;
@ -247,7 +247,7 @@ preproc_compress(pc_ctx_t *pctx, compress_func_ptr cmp_func, void *src, uint64_t
U64_P(dest + 1) = htonll(srclen); U64_P(dest + 1) = htonll(srclen);
_dstlen = srclen; _dstlen = srclen;
DEBUG_STAT_EN(strt = get_wtime_millis()); DEBUG_STAT_EN(strt = get_wtime_millis());
result = cmp_func(src, srclen, dest+9, &_dstlen, level, chdr, data); result = cmp_func(src, srclen, dest+9, &_dstlen, level, chdr, btype, data);
DEBUG_STAT_EN(en = get_wtime_millis()); DEBUG_STAT_EN(en = get_wtime_millis());
if (result > -1 && _dstlen < srclen) { if (result > -1 && _dstlen < srclen) {
@ -273,7 +273,7 @@ preproc_compress(pc_ctx_t *pctx, compress_func_ptr cmp_func, void *src, uint64_t
static int static int
preproc_decompress(pc_ctx_t *pctx, compress_func_ptr dec_func, void *src, uint64_t srclen, preproc_decompress(pc_ctx_t *pctx, compress_func_ptr dec_func, void *src, uint64_t srclen,
void *dst, uint64_t *dstlen, int level, uchar_t chdr, void *data, algo_props_t *props) void *dst, uint64_t *dstlen, int level, uchar_t chdr, int btype, void *data, algo_props_t *props)
{ {
uchar_t *sorc = (uchar_t *)src, type; uchar_t *sorc = (uchar_t *)src, type;
int64_t result; int64_t result;
@ -288,7 +288,7 @@ preproc_decompress(pc_ctx_t *pctx, compress_func_ptr dec_func, void *src, uint64
sorc += 8; sorc += 8;
srclen -= 8; srclen -= 8;
DEBUG_STAT_EN(strt = get_wtime_millis()); DEBUG_STAT_EN(strt = get_wtime_millis());
result = dec_func(sorc, srclen, dst, dstlen, level, chdr, data); result = dec_func(sorc, srclen, dst, dstlen, level, chdr, btype, data);
DEBUG_STAT_EN(en = get_wtime_millis()); DEBUG_STAT_EN(en = get_wtime_millis());
if (result < 0) return (result); if (result < 0) return (result);
@ -488,13 +488,13 @@ redo:
if (HDR & COMPRESSED) { if (HDR & COMPRESSED) {
if (HDR & CHUNK_FLAG_PREPROC) { if (HDR & CHUNK_FLAG_PREPROC) {
rv = preproc_decompress(pctx, tdat->decompress, cmpbuf, dedupe_data_sz_cmp, rv = preproc_decompress(pctx, tdat->decompress, cmpbuf, dedupe_data_sz_cmp,
ubuf, &_chunksize, tdat->level, HDR, tdat->data, tdat->props); ubuf, &_chunksize, tdat->level, HDR, pctx->btype, tdat->data, tdat->props);
} else { } else {
DEBUG_STAT_EN(double strt, en); DEBUG_STAT_EN(double strt, en);
DEBUG_STAT_EN(strt = get_wtime_millis()); DEBUG_STAT_EN(strt = get_wtime_millis());
rv = tdat->decompress(cmpbuf, dedupe_data_sz_cmp, ubuf, &_chunksize, rv = tdat->decompress(cmpbuf, dedupe_data_sz_cmp, ubuf, &_chunksize,
tdat->level, HDR, tdat->data); tdat->level, HDR, pctx->btype, tdat->data);
DEBUG_STAT_EN(en = get_wtime_millis()); DEBUG_STAT_EN(en = get_wtime_millis());
DEBUG_STAT_EN(fprintf(stderr, "Chunk %d decompression speed %.3f MB/s\n", DEBUG_STAT_EN(fprintf(stderr, "Chunk %d decompression speed %.3f MB/s\n",
tdat->id, get_mb_s(_chunksize, strt, en))); tdat->id, get_mb_s(_chunksize, strt, en)));
@ -516,7 +516,7 @@ redo:
if (dedupe_index_sz >= 90 && dedupe_index_sz > dedupe_index_sz_cmp) { if (dedupe_index_sz >= 90 && dedupe_index_sz > dedupe_index_sz_cmp) {
/* Index should be at least 90 bytes to have been compressed. */ /* Index should be at least 90 bytes to have been compressed. */
rv = lzma_decompress(cmpbuf, dedupe_index_sz_cmp, ubuf, rv = lzma_decompress(cmpbuf, dedupe_index_sz_cmp, ubuf,
&dedupe_index_sz, tdat->rctx->level, 0, tdat->rctx->lzma_data); &dedupe_index_sz, tdat->rctx->level, 0, TYPE_BINARY, tdat->rctx->lzma_data);
} else { } else {
memcpy(ubuf, cmpbuf, dedupe_index_sz); memcpy(ubuf, cmpbuf, dedupe_index_sz);
} }
@ -531,14 +531,14 @@ redo:
if (HDR & COMPRESSED) { if (HDR & COMPRESSED) {
if (HDR & CHUNK_FLAG_PREPROC) { if (HDR & CHUNK_FLAG_PREPROC) {
rv = preproc_decompress(pctx, tdat->decompress, cseg, tdat->len_cmp, rv = preproc_decompress(pctx, tdat->decompress, cseg, tdat->len_cmp,
tdat->uncompressed_chunk, &_chunksize, tdat->level, HDR, tdat->data, tdat->uncompressed_chunk, &_chunksize, tdat->level, HDR, pctx->btype,
tdat->props); tdat->data, tdat->props);
} else { } else {
DEBUG_STAT_EN(double strt, en); DEBUG_STAT_EN(double strt, en);
DEBUG_STAT_EN(strt = get_wtime_millis()); DEBUG_STAT_EN(strt = get_wtime_millis());
rv = tdat->decompress(cseg, tdat->len_cmp, tdat->uncompressed_chunk, rv = tdat->decompress(cseg, tdat->len_cmp, tdat->uncompressed_chunk,
&_chunksize, tdat->level, HDR, tdat->data); &_chunksize, tdat->level, HDR, pctx->btype, tdat->data);
DEBUG_STAT_EN(en = get_wtime_millis()); DEBUG_STAT_EN(en = get_wtime_millis());
DEBUG_STAT_EN(fprintf(stderr, "Chunk decompression speed %.3f MB/s\n", DEBUG_STAT_EN(fprintf(stderr, "Chunk decompression speed %.3f MB/s\n",
get_mb_s(_chunksize, strt, en))); get_mb_s(_chunksize, strt, en)));
@ -1520,7 +1520,8 @@ redo:
/* Compress index if it is at least 90 bytes. */ /* Compress index if it is at least 90 bytes. */
rv = lzma_compress(tdat->uncompressed_chunk + RABIN_HDR_SIZE, rv = lzma_compress(tdat->uncompressed_chunk + RABIN_HDR_SIZE,
dedupe_index_sz, compressed_chunk + RABIN_HDR_SIZE, dedupe_index_sz, compressed_chunk + RABIN_HDR_SIZE,
&index_size_cmp, tdat->rctx->level, 255, tdat->rctx->lzma_data); &index_size_cmp, tdat->rctx->level, 255, TYPE_BINARY,
tdat->rctx->lzma_data);
/* /*
* If index compression fails or does not produce a smaller result * If index compression fails or does not produce a smaller result
@ -1546,14 +1547,15 @@ plain_index:
if ((pctx->lzp_preprocess || pctx->enable_delta2_encode) && _chunksize > 0) { if ((pctx->lzp_preprocess || pctx->enable_delta2_encode) && _chunksize > 0) {
rv = preproc_compress(pctx, tdat->compress, tdat->uncompressed_chunk + dedupe_index_sz, rv = preproc_compress(pctx, tdat->compress, tdat->uncompressed_chunk + dedupe_index_sz,
_chunksize, compressed_chunk + index_size_cmp, &_chunksize, _chunksize, compressed_chunk + index_size_cmp, &_chunksize,
tdat->level, 0, tdat->data, tdat->props); tdat->level, 0, pctx->btype, tdat->data, tdat->props);
} else if (_chunksize > 0) { } else if (_chunksize > 0) {
DEBUG_STAT_EN(double strt, en); DEBUG_STAT_EN(double strt, en);
DEBUG_STAT_EN(strt = get_wtime_millis()); DEBUG_STAT_EN(strt = get_wtime_millis());
rv = tdat->compress(tdat->uncompressed_chunk + dedupe_index_sz, _chunksize, rv = tdat->compress(tdat->uncompressed_chunk + dedupe_index_sz, _chunksize,
compressed_chunk + index_size_cmp, &_chunksize, tdat->level, 0, tdat->data); compressed_chunk + index_size_cmp, &_chunksize, tdat->level, 0, pctx->btype,
tdat->data);
DEBUG_STAT_EN(en = get_wtime_millis()); DEBUG_STAT_EN(en = get_wtime_millis());
DEBUG_STAT_EN(fprintf(stderr, "Chunk compression speed %.3f MB/s\n", DEBUG_STAT_EN(fprintf(stderr, "Chunk compression speed %.3f MB/s\n",
get_mb_s(_chunksize, strt, en))); get_mb_s(_chunksize, strt, en)));
@ -1576,14 +1578,14 @@ plain_index:
if (pctx->lzp_preprocess || pctx->enable_delta2_encode) { if (pctx->lzp_preprocess || pctx->enable_delta2_encode) {
rv = preproc_compress(pctx, tdat->compress, rv = preproc_compress(pctx, tdat->compress,
tdat->uncompressed_chunk, tdat->rbytes, tdat->uncompressed_chunk, tdat->rbytes,
compressed_chunk, &_chunksize, tdat->level, 0, tdat->data, compressed_chunk, &_chunksize, tdat->level, 0, pctx->btype, tdat->data,
tdat->props); tdat->props);
} else { } else {
DEBUG_STAT_EN(double strt, en); DEBUG_STAT_EN(double strt, en);
DEBUG_STAT_EN(strt = get_wtime_millis()); DEBUG_STAT_EN(strt = get_wtime_millis());
rv = tdat->compress(tdat->uncompressed_chunk, tdat->rbytes, rv = tdat->compress(tdat->uncompressed_chunk, tdat->rbytes,
compressed_chunk, &_chunksize, tdat->level, 0, tdat->data); compressed_chunk, &_chunksize, tdat->level, 0, pctx->btype, tdat->data);
DEBUG_STAT_EN(en = get_wtime_millis()); DEBUG_STAT_EN(en = get_wtime_millis());
DEBUG_STAT_EN(fprintf(stderr, "Chunk compression speed %.3f MB/s\n", DEBUG_STAT_EN(fprintf(stderr, "Chunk compression speed %.3f MB/s\n",
get_mb_s(_chunksize, strt, en))); get_mb_s(_chunksize, strt, en)));
@ -2292,7 +2294,10 @@ start_compress(pc_ctx_t *pctx, const char *filename, uint64_t chunksize, int lev
rctx = create_dedupe_context(chunksize, 0, pctx->rab_blk_size, pctx->algo, &props, rctx = create_dedupe_context(chunksize, 0, pctx->rab_blk_size, pctx->algo, &props,
pctx->enable_delta_encode, pctx->enable_fixed_scan, VERSION, COMPRESS, 0, NULL, pctx->enable_delta_encode, pctx->enable_fixed_scan, VERSION, COMPRESS, 0, NULL,
pctx->pipe_mode, nprocs); pctx->pipe_mode, nprocs);
if (pctx->archive_mode)
rbytes = Read_Adjusted(uncompfd, cread_buf, chunksize, &rabin_count, rctx, pctx); rbytes = Read_Adjusted(uncompfd, cread_buf, chunksize, &rabin_count, rctx, pctx);
else
rbytes = Read_Adjusted(uncompfd, cread_buf, chunksize, &rabin_count, rctx, NULL);
} else { } else {
if (pctx->archive_mode) if (pctx->archive_mode)
rbytes = archiver_read(pctx, cread_buf, chunksize); rbytes = archiver_read(pctx, cread_buf, chunksize);
@ -2405,7 +2410,12 @@ start_compress(pc_ctx_t *pctx, const char *filename, uint64_t chunksize, int lev
* buffer is in progress. * buffer is in progress.
*/ */
if (pctx->enable_rabin_split) { if (pctx->enable_rabin_split) {
rbytes = Read_Adjusted(uncompfd, cread_buf, chunksize, &rabin_count, rctx, pctx); if (pctx->archive_mode)
rbytes = Read_Adjusted(uncompfd, cread_buf, chunksize,
&rabin_count, rctx, pctx);
else
rbytes = Read_Adjusted(uncompfd, cread_buf, chunksize,
&rabin_count, rctx, NULL);
} else { } else {
if (pctx->archive_mode) if (pctx->archive_mode)
rbytes = archiver_read(pctx, cread_buf, chunksize); rbytes = archiver_read(pctx, cread_buf, chunksize);

View file

@ -84,38 +84,38 @@ extern uint32_t zlib_buf_extra(uint64_t buflen);
extern int lz4_buf_extra(uint64_t buflen); extern int lz4_buf_extra(uint64_t buflen);
extern int zlib_compress(void *src, uint64_t srclen, void *dst, extern int zlib_compress(void *src, uint64_t srclen, void *dst,
uint64_t *destlen, int level, uchar_t chdr, void *data); uint64_t *destlen, int level, uchar_t chdr, int btype, void *data);
extern int lzma_compress(void *src, uint64_t srclen, void *dst, extern int lzma_compress(void *src, uint64_t srclen, void *dst,
uint64_t *destlen, int level, uchar_t chdr, void *data); uint64_t *destlen, int level, uchar_t chdr, int btype, void *data);
extern int bzip2_compress(void *src, uint64_t srclen, void *dst, extern int bzip2_compress(void *src, uint64_t srclen, void *dst,
uint64_t *destlen, int level, uchar_t chdr, void *data); uint64_t *destlen, int level, uchar_t chdr, int btype, void *data);
extern int adapt_compress(void *src, uint64_t srclen, void *dst, extern int adapt_compress(void *src, uint64_t srclen, void *dst,
uint64_t *dstlen, int level, uchar_t chdr, void *data); uint64_t *dstlen, int level, uchar_t chdr, int btype, void *data);
extern int ppmd_compress(void *src, uint64_t srclen, void *dst, extern int ppmd_compress(void *src, uint64_t srclen, void *dst,
uint64_t *dstlen, int level, uchar_t chdr, void *data); uint64_t *dstlen, int level, uchar_t chdr, int btype, void *data);
extern int lz_fx_compress(void *src, uint64_t srclen, void *dst, extern int lz_fx_compress(void *src, uint64_t srclen, void *dst,
uint64_t *dstlen, int level, uchar_t chdr, void *data); uint64_t *dstlen, int level, uchar_t chdr, int btype, void *data);
extern int lz4_compress(void *src, uint64_t srclen, void *dst, extern int lz4_compress(void *src, uint64_t srclen, void *dst,
uint64_t *dstlen, int level, uchar_t chdr, void *data); uint64_t *dstlen, int level, uchar_t chdr, int btype, void *data);
extern int none_compress(void *src, uint64_t srclen, void *dst, extern int none_compress(void *src, uint64_t srclen, void *dst,
uint64_t *dstlen, int level, uchar_t chdr, void *data); uint64_t *dstlen, int level, uchar_t chdr, int btype, void *data);
extern int zlib_decompress(void *src, uint64_t srclen, void *dst, extern int zlib_decompress(void *src, uint64_t srclen, void *dst,
uint64_t *dstlen, int level, uchar_t chdr, void *data); uint64_t *dstlen, int level, uchar_t chdr, int btype, void *data);
extern int lzma_decompress(void *src, uint64_t srclen, void *dst, extern int lzma_decompress(void *src, uint64_t srclen, void *dst,
uint64_t *dstlen, int level, uchar_t chdr, void *data); uint64_t *dstlen, int level, uchar_t chdr, int btype, void *data);
extern int bzip2_decompress(void *src, uint64_t srclen, void *dst, extern int bzip2_decompress(void *src, uint64_t srclen, void *dst,
uint64_t *dstlen, int level, uchar_t chdr, void *data); uint64_t *dstlen, int level, uchar_t chdr, int btype, void *data);
extern int adapt_decompress(void *src, uint64_t srclen, void *dst, extern int adapt_decompress(void *src, uint64_t srclen, void *dst,
uint64_t *dstlen, int level, uchar_t chdr, void *data); uint64_t *dstlen, int level, uchar_t chdr, int btype, void *data);
extern int ppmd_decompress(void *src, uint64_t srclen, void *dst, extern int ppmd_decompress(void *src, uint64_t srclen, void *dst,
uint64_t *dstlen, int level, uchar_t chdr, void *data); uint64_t *dstlen, int level, uchar_t chdr, int btype, void *data);
extern int lz_fx_decompress(void *src, uint64_t srclen, void *dst, extern int lz_fx_decompress(void *src, uint64_t srclen, void *dst,
uint64_t *dstlen, int level, uchar_t chdr, void *data); uint64_t *dstlen, int level, uchar_t chdr, int btype, void *data);
extern int lz4_decompress(void *src, uint64_t srclen, void *dst, extern int lz4_decompress(void *src, uint64_t srclen, void *dst,
uint64_t *dstlen, int level, uchar_t chdr, void *data); uint64_t *dstlen, int level, uchar_t chdr, int btype, void *data);
extern int none_decompress(void *src, uint64_t srclen, void *dst, extern int none_decompress(void *src, uint64_t srclen, void *dst,
uint64_t *dstlen, int level, uchar_t chdr, void *data); uint64_t *dstlen, int level, uchar_t chdr, int btype, void *data);
extern int adapt_init(void **data, int *level, int nthreads, uint64_t chunksize, extern int adapt_init(void **data, int *level, int nthreads, uint64_t chunksize,
int file_version, compress_op_t op); int file_version, compress_op_t op);
@ -165,9 +165,9 @@ extern void none_stats(int show);
#ifdef ENABLE_PC_LIBBSC #ifdef ENABLE_PC_LIBBSC
extern int libbsc_compress(void *src, uint64_t srclen, void *dst, extern int libbsc_compress(void *src, uint64_t srclen, void *dst,
uint64_t *dstlen, int level, uchar_t chdr, void *data); uint64_t *dstlen, int level, uchar_t chdr, int btype, void *data);
extern int libbsc_decompress(void *src, uint64_t srclen, void *dst, extern int libbsc_decompress(void *src, uint64_t srclen, void *dst,
uint64_t *dstlen, int level, uchar_t chdr, void *data); uint64_t *dstlen, int level, uchar_t chdr, int btype, void *data);
extern int libbsc_init(void **data, int *level, int nthreads, uint64_t chunksize, extern int libbsc_init(void **data, int *level, int nthreads, uint64_t chunksize,
int file_version, compress_op_t op); int file_version, compress_op_t op);
extern void libbsc_props(algo_props_t *data, int level, uint64_t chunksize); extern void libbsc_props(algo_props_t *data, int level, uint64_t chunksize);

View file

@ -109,11 +109,13 @@ ppmd_deinit(void **data)
int int
ppmd_compress(void *src, uint64_t srclen, void *dst, ppmd_compress(void *src, uint64_t srclen, void *dst,
uint64_t *dstlen, int level, uchar_t chdr, void *data) uint64_t *dstlen, int level, uchar_t chdr, int btype, void *data)
{ {
CPpmd8 *_ppmd = (CPpmd8 *)data; CPpmd8 *_ppmd = (CPpmd8 *)data;
uchar_t *_src = (uchar_t *)src; uchar_t *_src = (uchar_t *)src;
if (btype & TYPE_COMPRESSED)
return (-1);
Ppmd8_RangeEnc_Init(_ppmd); Ppmd8_RangeEnc_Init(_ppmd);
Ppmd8_Init(_ppmd, _ppmd->Order, PPMD8_RESTORE_METHOD_RESTART); Ppmd8_Init(_ppmd, _ppmd->Order, PPMD8_RESTORE_METHOD_RESTART);
_ppmd->buf = (Byte *)dst; _ppmd->buf = (Byte *)dst;
@ -132,7 +134,7 @@ ppmd_compress(void *src, uint64_t srclen, void *dst,
int int
ppmd_decompress(void *src, uint64_t srclen, void *dst, ppmd_decompress(void *src, uint64_t srclen, void *dst,
uint64_t *dstlen, int level, uchar_t chdr, void *data) uint64_t *dstlen, int level, uchar_t chdr, int btype, void *data)
{ {
CPpmd8 *_ppmd = (CPpmd8 *)data; CPpmd8 *_ppmd = (CPpmd8 *)data;
Byte *_src = (Byte *)src; Byte *_src = (Byte *)src;

View file

@ -18,9 +18,9 @@ struct ext_entry {
{"c++" , TYPE_TEXT, 3}, {"c++" , TYPE_TEXT, 3},
{"hpp" , TYPE_TEXT, 3}, {"hpp" , TYPE_TEXT, 3},
{"txt" , TYPE_TEXT, 3}, {"txt" , TYPE_TEXT, 3},
{"html" , TYPE_TEXT, 4}, {"html" , TYPE_TEXT|TYPE_MARKUP, 4},
{"htm" , TYPE_TEXT, 3}, {"htm" , TYPE_TEXT|TYPE_MARKUP, 3},
{"xml" , TYPE_TEXT, 3}, {"xml" , TYPE_TEXT|TYPE_MARKUP, 3},
{"info" , TYPE_TEXT, 4}, {"info" , TYPE_TEXT, 4},
{"ppm" , TYPE_TEXT, 3}, {"ppm" , TYPE_TEXT, 3},
{"svg" , TYPE_TEXT, 3}, {"svg" , TYPE_TEXT, 3},
@ -44,18 +44,18 @@ struct ext_entry {
{"java" , TYPE_TEXT, 4}, {"java" , TYPE_TEXT, 4},
{"m4" , TYPE_TEXT, 2}, {"m4" , TYPE_TEXT, 2},
{"vb" , TYPE_TEXT, 2}, {"vb" , TYPE_TEXT, 2},
{"xslt" , TYPE_TEXT, 4}, {"xslt" , TYPE_TEXT|TYPE_MARKUP, 4},
{"xsl" , TYPE_TEXT, 3}, {"xsl" , TYPE_TEXT|TYPE_MARKUP, 3},
{"yacc" , TYPE_TEXT, 4}, {"yacc" , TYPE_TEXT, 4},
{"lex" , TYPE_TEXT, 3}, {"lex" , TYPE_TEXT, 3},
{"csv" , TYPE_TEXT, 3}, {"csv" , TYPE_TEXT, 3},
{"shtml" , TYPE_TEXT, 5}, {"shtml" , TYPE_TEXT|TYPE_MARKUP, 5},
{"xhtml" , TYPE_TEXT, 5}, {"xhtml" , TYPE_TEXT|TYPE_MARKUP, 5},
{"xht" , TYPE_TEXT, 3}, {"xht" , TYPE_TEXT|TYPE_MARKUP, 3},
{"asp" , TYPE_TEXT, 3}, {"asp" , TYPE_TEXT, 3},
{"aspx" , TYPE_TEXT, 4}, {"aspx" , TYPE_TEXT, 4},
{"rss" , TYPE_TEXT, 3}, {"rss" , TYPE_TEXT|TYPE_MARKUP, 3},
{"atom" , TYPE_TEXT, 4}, {"atom" , TYPE_TEXT|TYPE_MARKUP, 4},
{"cgi" , TYPE_TEXT, 3}, {"cgi" , TYPE_TEXT, 3},
{"c#" , TYPE_TEXT, 2}, {"c#" , TYPE_TEXT, 2},
{"cob" , TYPE_TEXT, 3}, {"cob" , TYPE_TEXT, 3},
@ -67,8 +67,18 @@ struct ext_entry {
{"ps" , TYPE_TEXT, 2}, {"ps" , TYPE_TEXT, 2},
{"bib" , TYPE_TEXT, 3}, {"bib" , TYPE_TEXT, 3},
{"lua" , TYPE_TEXT, 3}, {"lua" , TYPE_TEXT, 3},
{"qml" , TYPE_TEXT, 3}, {"qml" , TYPE_TEXT|TYPE_MARKUP, 3},
{"fa" , TYPE_TEXT, 2}, {"fa" , TYPE_TEXT, 2},
{"faa" , TYPE_TEXT, 3},
{"asn" , TYPE_TEXT|TYPE_MARKUP, 3},
{"ffn" , TYPE_TEXT, 3},
{"fna" , TYPE_TEXT, 3},
{"frn" , TYPE_TEXT, 3},
{"gbk" , TYPE_TEXT, 3},
{"gff" , TYPE_TEXT, 3},
{"ptt" , TYPE_TEXT, 3},
{"rnt" , TYPE_TEXT, 3},
{"val" , TYPE_BINARY, 3},
{"tcc" , TYPE_TEXT, 3}, {"tcc" , TYPE_TEXT, 3},
{"css" , TYPE_TEXT, 3}, {"css" , TYPE_TEXT, 3},
{"pod" , TYPE_TEXT, 3}, {"pod" , TYPE_TEXT, 3},
@ -78,55 +88,61 @@ struct ext_entry {
{"upp" , TYPE_TEXT, 3}, {"upp" , TYPE_TEXT, 3},
{"mom" , TYPE_TEXT, 3}, {"mom" , TYPE_TEXT, 3},
{"tmac" , TYPE_TEXT, 4}, {"tmac" , TYPE_TEXT, 4},
{"exe" , TYPE_EXE, 3}, {"exe" , TYPE_BINARY|TYPE_EXE, 3},
{"dll" , TYPE_EXE, 3}, {"dll" , TYPE_BINARY|TYPE_EXE, 3},
{"bin" , TYPE_EXE, 3}, {"bin" , TYPE_BINARY|TYPE_EXE, 3},
{"o" , TYPE_EXE, 1}, {"o" , TYPE_BINARY|TYPE_EXE, 1},
{"a" , TYPE_EXE, 1}, {"a" , TYPE_BINARY|TYPE_EXE, 1},
{"obj" , TYPE_EXE, 3}, {"obj" , TYPE_BINARY|TYPE_EXE, 3},
{"so" , TYPE_EXE, 2}, {"so" , TYPE_BINARY|TYPE_EXE, 2},
{"com" , TYPE_EXE, 3}, {"com" , TYPE_BINARY|TYPE_EXE, 3},
{"xpi" , TYPE_EXE, 3}, {"xpi" , TYPE_BINARY|TYPE_EXE, 3},
{"off" , TYPE_EXE, 3}, {"off" , TYPE_BINARY|TYPE_EXE, 3},
{"pdf" , TYPE_COMPRESSED, 3}, {"pdf" , TYPE_BINARY, 3},
{"jpg" , TYPE_JPEG, 3}, {"jpg" , TYPE_BINARY|TYPE_COMPRESSED|TYPE_JPEG, 3},
{"jpeg" , TYPE_JPEG, 4}, {"jpeg" , TYPE_BINARY|TYPE_COMPRESSED|TYPE_JPEG, 4},
{"png" , TYPE_COMPRESSED, 3}, {"png" , TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_GZ, 3},
{"mp3" , TYPE_COMPRESSED, 3}, {"mp3" , TYPE_BINARY|TYPE_COMPRESSED, 3},
{"wma" , TYPE_COMPRESSED, 3}, {"wma" , TYPE_BINARY|TYPE_COMPRESSED, 3},
{"divx" , TYPE_COMPRESSED, 4}, {"divx" , TYPE_BINARY|TYPE_COMPRESSED, 4},
{"mp4" , TYPE_COMPRESSED, 3}, {"mp4" , TYPE_BINARY|TYPE_COMPRESSED, 3},
{"aac" , TYPE_COMPRESSED, 3}, {"aac" , TYPE_BINARY|TYPE_COMPRESSED, 3},
{"m4a" , TYPE_COMPRESSED, 3}, {"m4a" , TYPE_BINARY|TYPE_COMPRESSED, 3},
{"m4p" , TYPE_COMPRESSED, 3}, {"m4p" , TYPE_BINARY|TYPE_COMPRESSED, 3},
{"ofs" , TYPE_COMPRESSED, 3}, {"ofs" , TYPE_BINARY|TYPE_COMPRESSED, 3},
{"ofr" , TYPE_COMPRESSED, 3}, {"ofr" , TYPE_BINARY|TYPE_COMPRESSED, 3},
{"flac" , TYPE_COMPRESSED, 4}, {"flac" , TYPE_BINARY|TYPE_COMPRESSED, 4},
{"pac" , TYPE_COMPRESSED, 3}, {"pac" , TYPE_BINARY|TYPE_COMPRESSED, 3},
{"gif" , TYPE_COMPRESSED, 3}, {"gif" , TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_LZW, 3},
{"jp2" , TYPE_JPEG, 3}, {"jp2" , TYPE_BINARY|TYPE_COMPRESSED|TYPE_JPEG, 3},
{"gz" , TYPE_COMPRESSED, 2}, {"gz" , TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_GZ, 2},
{"bz2" , TYPE_COMPRESSED, 3}, {"tgz" , TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_GZ, 3},
{"zip" , TYPE_COMPRESSED, 3}, {"bz2" , TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_BZ2, 3},
{"arj" , TYPE_COMPRESSED, 3}, {"tbz2" , TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_BZ2, 4},
{"arc" , TYPE_COMPRESSED, 3}, {"zip" , TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_ZIP, 3},
{"jar" , TYPE_COMPRESSED, 3}, {"arj" , TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_ARJ, 3},
{"lz" , TYPE_COMPRESSED, 2}, {"arc" , TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_ARC, 3},
{"lzh" , TYPE_COMPRESSED, 3}, {"jar" , TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_GZ, 3},
{"lzma" , TYPE_COMPRESSED, 4}, {"lz" , TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_LZ, 2},
{"lzo" , TYPE_COMPRESSED, 3}, {"lzh" , TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_LH, 3},
{"dmg" , TYPE_COMPRESSED, 3}, {"lha" , TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_LH, 3},
{"7z" , TYPE_COMPRESSED, 2}, {"lzma" , TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_LZMA, 4},
{"uha" , TYPE_COMPRESSED, 3}, {"lzo" , TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_LZO, 3},
{"alz" , TYPE_COMPRESSED, 3}, {"dmg" , TYPE_BINARY, 3},
{"ace" , TYPE_COMPRESSED, 3}, {"7z" , TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_LZMA, 2},
{"rar" , TYPE_COMPRESSED, 3}, {"uha" , TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_UHARC, 3},
{"xz" , TYPE_COMPRESSED, 2}, {"alz" , TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_ALZ, 3},
{"ace" , TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_ACE, 3},
{"rar" , TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_RAR, 3},
{"xz" , TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_LZMA, 2},
{"txz" , TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_LZMA, 3},
{"pmd" , TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_PPMD, 3},
{"zpaq" , TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_ZPAQ, 4},
{"xcf" , TYPE_BINARY, 3}, {"xcf" , TYPE_BINARY, 3},
{"mo" , TYPE_BINARY, 2}, {"mo" , TYPE_BINARY, 2},
{"bmp" , TYPE_BINARY, 3}, {"bmp" , TYPE_BINARY, 3},
{"pyo" , TYPE_BINARY, 3}, {"pyo" , TYPE_BINARY, 3},
{"pyc" , TYPE_BINARY, 3}, {"pyc" , TYPE_BINARY, 3},
{"wav" , TYPE_BINARY, 3},
}; };
#define NUM_EXT (116)
#endif #endif

View file

@ -5,9 +5,9 @@ cpp,TYPE_TEXT
c++,TYPE_TEXT c++,TYPE_TEXT
hpp,TYPE_TEXT hpp,TYPE_TEXT
txt,TYPE_TEXT txt,TYPE_TEXT
html,TYPE_TEXT html,TYPE_TEXT|TYPE_MARKUP
htm,TYPE_TEXT htm,TYPE_TEXT|TYPE_MARKUP
xml,TYPE_TEXT xml,TYPE_TEXT|TYPE_MARKUP
info,TYPE_TEXT info,TYPE_TEXT
ppm,TYPE_TEXT ppm,TYPE_TEXT
svg,TYPE_TEXT svg,TYPE_TEXT
@ -31,18 +31,18 @@ go,TYPE_TEXT
java,TYPE_TEXT java,TYPE_TEXT
m4,TYPE_TEXT m4,TYPE_TEXT
vb,TYPE_TEXT vb,TYPE_TEXT
xslt,TYPE_TEXT xslt,TYPE_TEXT|TYPE_MARKUP
xsl,TYPE_TEXT xsl,TYPE_TEXT|TYPE_MARKUP
yacc,TYPE_TEXT yacc,TYPE_TEXT
lex,TYPE_TEXT lex,TYPE_TEXT
csv,TYPE_TEXT csv,TYPE_TEXT
shtml,TYPE_TEXT shtml,TYPE_TEXT|TYPE_MARKUP
xhtml,TYPE_TEXT xhtml,TYPE_TEXT|TYPE_MARKUP
xht,TYPE_TEXT xht,TYPE_TEXT|TYPE_MARKUP
asp,TYPE_TEXT asp,TYPE_TEXT
aspx,TYPE_TEXT aspx,TYPE_TEXT
rss,TYPE_TEXT rss,TYPE_TEXT|TYPE_MARKUP
atom,TYPE_TEXT atom,TYPE_TEXT|TYPE_MARKUP
cgi,TYPE_TEXT cgi,TYPE_TEXT
c#,TYPE_TEXT c#,TYPE_TEXT
cob,TYPE_TEXT cob,TYPE_TEXT
@ -54,8 +54,21 @@ s,TYPE_TEXT
ps,TYPE_TEXT ps,TYPE_TEXT
bib,TYPE_TEXT bib,TYPE_TEXT
lua,TYPE_TEXT lua,TYPE_TEXT
qml,TYPE_TEXT qml,TYPE_TEXT|TYPE_MARKUP
# Genomic data file extensions (entries fa through val)
fa,TYPE_TEXT fa,TYPE_TEXT
faa,TYPE_TEXT
asn,TYPE_TEXT|TYPE_MARKUP
ffn,TYPE_TEXT
fna,TYPE_TEXT
frn,TYPE_TEXT
gbk,TYPE_TEXT
gff,TYPE_TEXT
ptt,TYPE_TEXT
rnt,TYPE_TEXT
val,TYPE_BINARY
tcc,TYPE_TEXT tcc,TYPE_TEXT
css,TYPE_TEXT css,TYPE_TEXT
pod,TYPE_TEXT pod,TYPE_TEXT
@ -65,52 +78,59 @@ am,TYPE_TEXT
upp,TYPE_TEXT upp,TYPE_TEXT
mom,TYPE_TEXT mom,TYPE_TEXT
tmac,TYPE_TEXT tmac,TYPE_TEXT
exe,TYPE_EXE exe,TYPE_BINARY|TYPE_EXE
dll,TYPE_EXE dll,TYPE_BINARY|TYPE_EXE
bin,TYPE_EXE bin,TYPE_BINARY|TYPE_EXE
o,TYPE_EXE o,TYPE_BINARY|TYPE_EXE
a,TYPE_EXE a,TYPE_BINARY|TYPE_EXE
obj,TYPE_EXE obj,TYPE_BINARY|TYPE_EXE
so,TYPE_EXE so,TYPE_BINARY|TYPE_EXE
com,TYPE_EXE com,TYPE_BINARY|TYPE_EXE
xpi,TYPE_EXE xpi,TYPE_BINARY|TYPE_EXE
off,TYPE_EXE off,TYPE_BINARY|TYPE_EXE
pdf,TYPE_COMPRESSED pdf,TYPE_BINARY
jpg,TYPE_JPEG jpg,TYPE_BINARY|TYPE_COMPRESSED|TYPE_JPEG
jpeg,TYPE_JPEG jpeg,TYPE_BINARY|TYPE_COMPRESSED|TYPE_JPEG
png,TYPE_COMPRESSED png,TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_GZ
mp3,TYPE_COMPRESSED mp3,TYPE_BINARY|TYPE_COMPRESSED
wma,TYPE_COMPRESSED wma,TYPE_BINARY|TYPE_COMPRESSED
divx,TYPE_COMPRESSED divx,TYPE_BINARY|TYPE_COMPRESSED
mp4,TYPE_COMPRESSED mp4,TYPE_BINARY|TYPE_COMPRESSED
aac,TYPE_COMPRESSED aac,TYPE_BINARY|TYPE_COMPRESSED
m4a,TYPE_COMPRESSED m4a,TYPE_BINARY|TYPE_COMPRESSED
m4p,TYPE_COMPRESSED m4p,TYPE_BINARY|TYPE_COMPRESSED
ofs,TYPE_COMPRESSED ofs,TYPE_BINARY|TYPE_COMPRESSED
ofr,TYPE_COMPRESSED ofr,TYPE_BINARY|TYPE_COMPRESSED
flac,TYPE_COMPRESSED flac,TYPE_BINARY|TYPE_COMPRESSED
pac,TYPE_COMPRESSED pac,TYPE_BINARY|TYPE_COMPRESSED
gif,TYPE_COMPRESSED gif,TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_LZW
jp2,TYPE_JPEG jp2,TYPE_BINARY|TYPE_COMPRESSED|TYPE_JPEG
gz,TYPE_COMPRESSED gz,TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_GZ
bz2,TYPE_COMPRESSED tgz,TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_GZ
zip,TYPE_COMPRESSED bz2,TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_BZ2
arj,TYPE_COMPRESSED tbz2,TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_BZ2
arc,TYPE_COMPRESSED zip,TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_ZIP
jar,TYPE_COMPRESSED arj,TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_ARJ
lz,TYPE_COMPRESSED arc,TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_ARC
lzh,TYPE_COMPRESSED jar,TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_GZ
lzma,TYPE_COMPRESSED lz,TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_LZ
lzo,TYPE_COMPRESSED lzh,TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_LH
dmg,TYPE_COMPRESSED lha,TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_LH
7z,TYPE_COMPRESSED lzma,TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_LZMA
uha,TYPE_COMPRESSED lzo,TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_LZO
alz,TYPE_COMPRESSED dmg,TYPE_BINARY
ace,TYPE_COMPRESSED 7z,TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_LZMA
rar,TYPE_COMPRESSED uha,TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_UHARC
xz,TYPE_COMPRESSED alz,TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_ALZ
ace,TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_ACE
rar,TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_RAR
xz,TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_LZMA
txz,TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_LZMA
pmd,TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_PPMD
zpaq,TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_ZPAQ
xcf,TYPE_BINARY xcf,TYPE_BINARY
mo,TYPE_BINARY mo,TYPE_BINARY
bmp,TYPE_BINARY bmp,TYPE_BINARY
pyo,TYPE_BINARY pyo,TYPE_BINARY
pyc,TYPE_BINARY pyc,TYPE_BINARY
wav,TYPE_BINARY

View file

@ -1,6 +1,5 @@
#!/bin/sh #!/bin/sh
count=`cat extensions.txt | wc -l`
echo ' echo '
/* Generated File. DO NOT EDIT. */ /* Generated File. DO NOT EDIT. */
/* /*
@ -18,6 +17,9 @@ struct ext_entry {
rm -f extlist rm -f extlist
cat extensions.txt | while read line cat extensions.txt | while read line
do do
[ "x$line" = "x" ] && continue
echo "$line" | egrep "^#" > /dev/null
[ $? -eq 0 ] && continue
_OIFS="$IFS" _OIFS="$IFS"
IFS="," IFS=","
set -- $line set -- $line
@ -30,7 +32,6 @@ do
done done
echo '};' >> extensions.h echo '};' >> extensions.h
echo "#define NUM_EXT (${count})" >> extensions.h
echo "#endif" >> extensions.h echo "#endif" >> extensions.h
./perfect -nm < extlist ./perfect -nm < extlist
rm -f extlist rm -f extlist

View file

@ -12,17 +12,21 @@
/* small adjustments to _a_ to make values distinct */ /* small adjustments to _a_ to make values distinct */
ub1 tab[] = { ub1 tab[] = {
10,76,0,76,70,42,0,1,0,0,119,1,61,1,70,79, 125,0,0,82,113,0,125,85,113,0,0,7,0,0,125,0,
0,0,0,4,70,1,0,122,0,119,47,76,76,34,110,101, 0,0,7,87,0,0,82,0,0,88,0,7,0,85,125,85,
0,76,70,70,42,28,0,66,0,108,0,109,28,4,28,4, 0,113,0,0,85,0,0,113,0,113,124,125,0,125,0,0,
70,0,1,20,4,123,123,0,79,75,34,76,69,77,0,69, 113,0,11,113,125,0,0,0,0,85,113,85,22,0,0,125,
0,113,0,0,113,0,82,0,125,111,87,88,69,125,113,0,
124,0,7,22,113,22,0,235,0,120,120,125,113,0,74,120,
0,124,87,7,0,127,0,0,11,85,85,146,115,11,183,146,
0,0,88,0,0,85,42,0,171,0,0,0,0,83,0,0,
}; };
/* The hash function */ /* The hash function */
ub4 phash(char *key, int len) ub4 phash(char *key, int len)
{ {
ub4 rsl, val = lookup(key, len, 0x9e3779b9); ub4 rsl, val = lookup(key, len, 0x9e3779b9);
rsl = ((val>>26)^tab[val&0x3f]); rsl = ((val>>25)^tab[val&0x7f]);
return rsl; return rsl;
} }

View file

@ -7,9 +7,9 @@
#define PHASH #define PHASH
extern ub1 tab[]; extern ub1 tab[];
#define PHASHLEN 0x40 /* length of hash mapping table */ #define PHASHLEN 0x80 /* length of hash mapping table */
#define PHASHNKEYS 116 /* How many keys were hashed */ #define PHASHNKEYS 133 /* How many keys were hashed */
#define PHASHRANGE 128 /* Range any input might map to */ #define PHASHRANGE 256 /* Range any input might map to */
#define PHASHSALT 0x9e3779b9 /* internal, initialize normal hash */ #define PHASHSALT 0x9e3779b9 /* internal, initialize normal hash */
ub4 phash(); ub4 phash();

View file

@ -228,6 +228,35 @@ struct fn_list {
struct fn_list *next; struct fn_list *next;
}; };
/*
 * Enumerated type constants for file type identification in pc_archive.
 *
 * A data type value is the bitwise OR of two independent fields:
 *
 *   bits 0-2 : class flags (TYPE_TEXT, TYPE_BINARY, TYPE_COMPRESSED).
 *              These are true bit flags and may be tested with '&',
 *              e.g. (btype & TYPE_COMPRESSED).
 *   bits 3+  : at most one sub-type number (TYPE_EXE .. TYPE_COMPRESSED_ZPAQ).
 *
 * Sub-types are ordinal values stored above the flag bits, NOT individual
 * bits. Compare them after masking off the flags, e.g.
 *     (btype & ~7) == TYPE_JPEG
 * rather than testing them with '&'.
 *
 * Every sub-type must be a multiple of 8 so it never shares a bit with the
 * class flags. If a sub-type had bit 2 set, OR-ing TYPE_COMPRESSED into it
 * and into its lower neighbour would yield the same value, making the two
 * sub-types indistinguishable.
 *
 * NOTE(review): only the symbolic names are expected to be used by callers;
 * confirm that no numeric sub-type value is persisted in an on-disk format.
 */
typedef enum {
	TYPE_UNKNOWN = 0,

	/* Class flags (bits 0-2). */
	TYPE_TEXT = 1,
	TYPE_BINARY = 2,
	TYPE_COMPRESSED = 4,

	/* Sub-types: ordinal N stored as (N << 3), clear of the flag bits. */
	TYPE_EXE = 1 << 3,
	TYPE_JPEG = 2 << 3,
	TYPE_MARKUP = 3 << 3,
	TYPE_COMPRESSED_GZ = 4 << 3,
	TYPE_COMPRESSED_LZW = 5 << 3,
	TYPE_COMPRESSED_BZ2 = 6 << 3,
	TYPE_COMPRESSED_ZIP = 7 << 3,
	TYPE_COMPRESSED_ARJ = 8 << 3,
	TYPE_COMPRESSED_ARC = 9 << 3,
	TYPE_COMPRESSED_LH = 10 << 3,
	TYPE_COMPRESSED_LZMA = 11 << 3,
	TYPE_COMPRESSED_LZO = 12 << 3,
	TYPE_COMPRESSED_UHARC = 13 << 3,
	TYPE_COMPRESSED_ALZ = 14 << 3,
	TYPE_COMPRESSED_ACE = 15 << 3,
	TYPE_COMPRESSED_RAR = 16 << 3,
	TYPE_COMPRESSED_LZ = 17 << 3,
	TYPE_COMPRESSED_PPMD = 18 << 3,
	TYPE_COMPRESSED_ZPAQ = 19 << 3
} data_type_t;
#ifndef _IN_UTILS_ #ifndef _IN_UTILS_
extern processor_info_t proc_info; extern processor_info_t proc_info;
#endif #endif
@ -254,7 +283,7 @@ extern char *get_temp_dir();
/* Pointer type for compress and decompress functions. */ /* Pointer type for compress and decompress functions. */
typedef int (*compress_func_ptr)(void *src, uint64_t srclen, void *dst, typedef int (*compress_func_ptr)(void *src, uint64_t srclen, void *dst,
uint64_t *destlen, int level, uchar_t chdr, void *data); uint64_t *destlen, int level, uchar_t chdr, int btype, void *data);
typedef enum { typedef enum {
COMPRESS, COMPRESS,

View file

@ -142,7 +142,7 @@ void zerr(int ret, int cmp)
int int
zlib_compress(void *src, uint64_t srclen, void *dst, uint64_t *dstlen, zlib_compress(void *src, uint64_t srclen, void *dst, uint64_t *dstlen,
int level, uchar_t chdr, void *data) int level, uchar_t chdr, int btype, void *data)
{ {
int ret, ending; int ret, ending;
unsigned int slen, dlen; unsigned int slen, dlen;
@ -205,7 +205,7 @@ zlib_compress(void *src, uint64_t srclen, void *dst, uint64_t *dstlen,
int int
zlib_decompress(void *src, uint64_t srclen, void *dst, uint64_t *dstlen, zlib_decompress(void *src, uint64_t srclen, void *dst, uint64_t *dstlen,
int level, uchar_t chdr, void *data) int level, uchar_t chdr, int btype, void *data)
{ {
int err; int err;
unsigned int slen, dlen; unsigned int slen, dlen;