From a741f34f78df72d45e14f9159a164dac2c5bba8f Mon Sep 17 00:00:00 2001 From: Moinak Ghosh Date: Wed, 18 Dec 2013 00:09:32 +0530 Subject: [PATCH] Move MSDOS COM single-byte magic number checks to last in the list. Move advanced options flag into context structure. Include dtd files as text type. --- archive/pc_archive.c | 31 ++++++++++++++++--------------- lzma_compress.c | 2 +- pcompress.c | 32 ++++++++++++++++++++------------ pcompress.h | 1 + utils/phash/extensions.h | 1 + utils/phash/extensions.txt | 1 + utils/phash/phash.c | 16 ++++++++-------- utils/phash/phash.h | 2 +- 8 files changed, 49 insertions(+), 37 deletions(-) diff --git a/archive/pc_archive.c b/archive/pc_archive.c index 7f97841..8708950 100644 --- a/archive/pc_archive.c +++ b/archive/pc_archive.c @@ -1516,21 +1516,6 @@ detect_type_by_data(uchar_t *buf, size_t len) return (TYPE_BINARY|TYPE_BMP); } - // MSDOS COM types - if (buf[0] == 0xe9 || buf[0] == 0xeb) { - if (LE16(U16_P(buf + 0x1fe)) == 0xaa55) - return (TYPE_BINARY|TYPE_EXE32); // MSDOS COM - else - return (TYPE_BINARY); - } - if (U16_P(buf + 2) == COM_MAGIC || U16_P(buf + 4) == COM_MAGIC || - U16_P(buf + 4) == COM_MAGIC || U16_P(buf + 5) == COM_MAGIC || - U16_P(buf + 13) == COM_MAGIC || U16_P(buf + 18) == COM_MAGIC || - U16_P(buf + 23) == COM_MAGIC || U16_P(buf + 30) == COM_MAGIC || - U16_P(buf + 70) == COM_MAGIC) { - return (TYPE_BINARY|TYPE_EXE32); // MSDOS COM - } - if (U32_P(buf) == TZINT) return (TYPE_BINARY); // Timezone data if (U32_P(buf) == PPMINT) @@ -1538,5 +1523,21 @@ detect_type_by_data(uchar_t *buf, size_t len) if (U32_P(buf) == WVPK || U32_P(buf) == TTA1) return (TYPE_BINARY|TYPE_COMPRESSED|TYPE_AUDIO_COMPRESSED); + // MSDOS COM types, two byte and one byte magic numbers are checked + // after all other multi-byte magic number checks. + if (buf[0] == 0xe9 || buf[0] == 0xeb) { + if (LE16(U16_P(buf + 0x1fe)) == 0xaa55) + return (TYPE_BINARY|TYPE_EXE32); // MSDOS COM + else + return (TYPE_BINARY); + } + + if (U16_P(buf + 2) == COM_MAGIC || U16_P(buf + 4) == COM_MAGIC || + U16_P(buf + 4) == COM_MAGIC || U16_P(buf + 5) == COM_MAGIC || + U16_P(buf + 13) == COM_MAGIC || U16_P(buf + 18) == COM_MAGIC || + U16_P(buf + 23) == COM_MAGIC || U16_P(buf + 30) == COM_MAGIC || + U16_P(buf + 70) == COM_MAGIC) { + return (TYPE_BINARY|TYPE_EXE32); // MSDOS COM + } return (TYPE_UNKNOWN); } diff --git a/lzma_compress.c b/lzma_compress.c index c016820..b113eb0 100644 --- a/lzma_compress.c +++ b/lzma_compress.c @@ -211,7 +211,7 @@ lzma_compress(void *src, uint64_t srclen, void *dst, return (-1); } - if (PC_SUBTYPE(btype) == TYPE_COMPRESSED_LZMA) + if (PC_SUBTYPE(btype) == TYPE_COMPRESSED_ZPAQ) return (-1); props->level = level; diff --git a/pcompress.c b/pcompress.c index ff83e79..4cd9625 100644 --- a/pcompress.c +++ b/pcompress.c @@ -1990,6 +1990,14 @@ start_compress(pc_ctx_t *pctx, const char *filename, uint64_t chunksize, int lev props.is_single_chunk = 1; flags |= FLAG_SINGLE_CHUNK; + /* + * Disable deduplication if file is too small. + */ + if (chunksize < RAB_MIN_CHUNK_SIZE) { + pctx->enable_rabin_scan = 0; + pctx->enable_rabin_global = 0; + } + /* * Switch to simple Deduplication if global is enabled. */ @@ -2799,7 +2807,7 @@ init_pc_context_argstr(pc_ctx_t *pctx, char *args) int DLL_EXPORT init_pc_context(pc_ctx_t *pctx, int argc, char *argv[]) { - int opt, num_rem, err, my_optind, advanced_opts; + int opt, num_rem, err, my_optind; char *pos; struct filter_flags ff; @@ -2812,7 +2820,7 @@ init_pc_context(pc_ctx_t *pctx, int argc, char *argv[]) while (*pos != '/' && pos > argv[0]) pos--; if (*pos == '/') pos++; strcpy(pctx->exec_name, pos); - advanced_opts = 0; + pctx->advanced_opts = 0; ff.enable_packjpg = 0; pthread_mutex_lock(&opt_parse); @@ -2865,7 +2873,7 @@ init_pc_context(pc_ctx_t *pctx, int argc, char *argv[]) break; case 'B': - advanced_opts = 1; + pctx->advanced_opts = 1; pctx->rab_blk_size = atoi(optarg); if (pctx->rab_blk_size < 0 || pctx->rab_blk_size > 5) { log_msg(LOG_ERR, 0, "Average Dedupe block size must be in range 0 (2k), 1 (4k) .. 5 (64k)"); @@ -2894,17 +2902,17 @@ init_pc_context(pc_ctx_t *pctx, int argc, char *argv[]) break; case 'D': - advanced_opts = 1; + pctx->advanced_opts = 1; pctx->enable_rabin_scan = 1; break; case 'G': - advanced_opts = 1; + pctx->advanced_opts = 1; pctx->enable_rabin_global = 1; break; case 'E': - advanced_opts = 1; + pctx->advanced_opts = 1; pctx->enable_rabin_scan = 1; if (!pctx->enable_delta_encode) pctx->enable_delta_encode = DELTA_NORMAL; @@ -2926,18 +2934,18 @@ init_pc_context(pc_ctx_t *pctx, int argc, char *argv[]) break; case 'F': - advanced_opts = 1; + pctx->advanced_opts = 1; pctx->enable_fixed_scan = 1; pctx->enable_rabin_split = 0; break; case 'L': - advanced_opts = 1; + pctx->advanced_opts = 1; pctx->lzp_preprocess = 1; break; case 'P': - advanced_opts = 1; + pctx->advanced_opts = 1; pctx->enable_delta2_encode = 1; break; @@ -2979,12 +2987,12 @@ init_pc_context(pc_ctx_t *pctx, int argc, char *argv[]) break; case 'j': - advanced_opts = 1; + pctx->advanced_opts = 1; ff.enable_packjpg = 1; break; case 'x': - advanced_opts = 1; + pctx->advanced_opts = 1; pctx->dispack_preprocess = 1; break; @@ -3252,7 +3260,7 @@ init_pc_context(pc_ctx_t *pctx, int argc, char *argv[]) * Auto-select filters and preprocessing modes based on compresion level. * This is not done if user explicitly specified advanced options. */ - if (!advanced_opts) { + if (!pctx->advanced_opts) { /* * Selectively enable filters while archiving, depending on compression level. */ diff --git a/pcompress.h b/pcompress.h index 065dac7..2561248 100644 --- a/pcompress.h +++ b/pcompress.h @@ -215,6 +215,7 @@ typedef struct pc_ctx { int pagesize; int force_archive_perms; int no_overwrite_newer; + int advanced_opts; /* * Archiving related context data. diff --git a/utils/phash/extensions.h b/utils/phash/extensions.h index f16aab1..e4c239b 100644 --- a/utils/phash/extensions.h +++ b/utils/phash/extensions.h @@ -68,6 +68,7 @@ struct ext_entry { {"ps" , TYPE_TEXT, 2}, {"bib" , TYPE_TEXT, 3}, {"lua" , TYPE_TEXT, 3}, + {"dtd" , TYPE_TEXT, 3}, {"qml" , TYPE_TEXT|TYPE_MARKUP, 3}, {"fa" , TYPE_TEXT|TYPE_DNA_SEQ, 2}, {"faa" , TYPE_TEXT|TYPE_DNA_SEQ, 3}, diff --git a/utils/phash/extensions.txt b/utils/phash/extensions.txt index 0b6c62b..22a8943 100644 --- a/utils/phash/extensions.txt +++ b/utils/phash/extensions.txt @@ -55,6 +55,7 @@ s,TYPE_TEXT ps,TYPE_TEXT bib,TYPE_TEXT lua,TYPE_TEXT +dtd,TYPE_TEXT qml,TYPE_TEXT|TYPE_MARKUP # These are all genomic data file extensions diff --git a/utils/phash/phash.c b/utils/phash/phash.c index 1dd8ad6..3e3b889 100644 --- a/utils/phash/phash.c +++ b/utils/phash/phash.c @@ -12,14 +12,14 @@ /* small adjustments to _a_ to make values distinct */ ub1 tab[] = { -125,0,0,220,125,113,82,82,113,0,0,7,0,0,113,125, -0,0,7,87,0,113,0,0,0,125,0,131,0,7,125,22, -0,0,0,0,85,0,0,0,0,113,87,113,0,7,22,0, -82,0,113,113,125,125,0,0,0,0,113,7,85,0,0,85, -0,82,0,0,113,0,125,183,82,85,124,88,58,183,0,0, -124,0,113,125,0,125,0,116,0,82,125,74,0,125,0,32, -0,113,113,124,0,85,0,0,42,61,0,87,0,40,183,61, -0,0,0,0,0,61,0,0,56,11,0,0,164,200,0,0, +125,0,0,220,125,0,82,82,113,0,0,113,0,0,113,125, +0,0,7,32,0,113,82,0,0,183,0,131,0,7,220,120, +0,0,0,0,85,0,0,0,0,113,125,113,0,7,22,0, +82,0,7,113,125,125,0,0,0,113,113,85,220,0,0,85, +0,82,0,0,113,0,85,183,82,88,11,85,55,113,0,0, +124,0,113,125,0,125,0,235,0,82,125,55,0,22,0,92, +0,125,113,7,0,40,0,0,82,61,0,42,0,11,177,15, +0,0,0,0,0,6,0,0,56,11,0,0,164,47,0,0, }; /* The hash function */ diff --git a/utils/phash/phash.h b/utils/phash/phash.h index 74e2f2f..11a1ffc 100644 --- a/utils/phash/phash.h +++ b/utils/phash/phash.h @@ -8,7 +8,7 @@ extern ub1 tab[]; #define PHASHLEN 0x80 /* length of hash mapping table */ -#define PHASHNKEYS 136 /* How many keys were hashed */ +#define PHASHNKEYS 137 /* How many keys were hashed */ #define PHASHRANGE 256 /* Range any input might map to */ #define PHASHSALT 0x9e3779b9 /* internal, initialize normal hash */