From 306f145f22b6af17011019ee5d318412c92eea1d Mon Sep 17 00:00:00 2001 From: Moinak Ghosh Date: Thu, 28 Nov 2013 22:42:51 +0530 Subject: [PATCH] Use libbsc/ppmd for BMP files. Fix extension based hashing. Do not append .pz extension to filenames already having it. Some code formatting changes. --- Makefile.in | 3 +- adaptive_compress.c | 9 ++++-- archive/pc_archive.c | 11 +++++-- pcompress.c | 61 +++++++++++++++++++++++++------------- utils/phash/extensions.h | 1 - utils/phash/extensions.txt | 1 - utils/phash/phash.c | 2 +- utils/phash/phash.h | 2 +- utils/utils.h | 16 +++++++++- 9 files changed, 74 insertions(+), 32 deletions(-) diff --git a/Makefile.in b/Makefile.in index fa57d52..ac10df2 100644 --- a/Makefile.in +++ b/Makefile.in @@ -133,7 +133,8 @@ DELTA2OBJS = $(DELTA2SRCS:.c=.o) ARCHIVESRCS = archive/pc_archive.c archive/pc_arc_filter.c utils/phash/phash.c \ utils/phash/lookupa.c utils/phash/recycle.c ARCHIVEHDRS = pcompress.h utils/utils.h archive/pc_archive.h utils/phash/standard.h \ - utils/phash/lookupa.h utils/phash/recycle.h utils/phash/phash.h archive/pc_arc_filter.h + utils/phash/lookupa.h utils/phash/recycle.h utils/phash/phash.h archive/pc_arc_filter.h \ + utils/phash/extensions.h ARCHIVEOBJS = $(ARCHIVESRCS:.c=.o) PJPGSRCS = filters/packjpg/aricoder.cpp filters/packjpg/bitops.cpp filters/packjpg/packjpg.cpp \ diff --git a/adaptive_compress.c b/adaptive_compress.c index 74b6f98..09b97db 100644 --- a/adaptive_compress.c +++ b/adaptive_compress.c @@ -269,14 +269,16 @@ adapt_compress(void *src, uint64_t srclen, void *dst, rv = ADAPT_COMPRESS_LZ4; lz4_count++; - } else if (adat->adapt_mode == 2 && (PC_TYPE(btype) == TYPE_BINARY)) { + } else if (adat->adapt_mode == 2 && PC_TYPE(btype) == TYPE_BINARY && + PC_SUBTYPE(btype) != TYPE_BMP) { rv = lzma_compress(src, srclen, dst, dstlen, level, chdr, btype, adat->lzma_data); if (rv < 0) return (rv); rv = ADAPT_COMPRESS_LZMA; lzma_count++; - } else if (adat->adapt_mode == 1 && (PC_TYPE(btype) == TYPE_BINARY)) { + } else if (adat->adapt_mode == 1 && PC_TYPE(btype) == TYPE_BINARY && + PC_SUBTYPE(btype) != TYPE_BMP) { rv = bzip2_compress(src, srclen, dst, dstlen, level, chdr, btype, NULL); if (rv < 0) return (rv); @@ -285,7 +287,8 @@ adapt_compress(void *src, uint64_t srclen, void *dst, } else { #ifdef ENABLE_PC_LIBBSC - if (adat->bsc_data && PC_SUBTYPE(btype) == TYPE_MARKUP) { + if (adat->bsc_data && (PC_SUBTYPE(btype) == TYPE_MARKUP || + PC_SUBTYPE(btype) == TYPE_BMP)) { rv = libbsc_compress(src, srclen, dst, dstlen, level, chdr, btype, adat->bsc_data); if (rv < 0) return (rv); diff --git a/archive/pc_archive.c b/archive/pc_archive.c index 44e981f..b8c0133 100644 --- a/archive/pc_archive.c +++ b/archive/pc_archive.c @@ -1355,7 +1355,7 @@ init_archive_mod() { * comparison. */ for (j = 0; j < extlist[i].len; j++) - extnum = (extnum << 1) | extlist[i].ext[j]; + extnum = (extnum << 8) | extlist[i].ext[j]; exthtab[slot].extnum = extnum; exthtab[slot].type = extlist[i].type; } @@ -1407,7 +1407,7 @@ detect_type_by_ext(const char *path, int pathlen) * Pack given extension into 64-bit integer. */ for (i = 0; i < len; i++) - extnum = (extnum << 1) | tolower(ext[i]); + extnum = (extnum << 8) | tolower(ext[i]); if (exthtab[slot].extnum == extnum) return (exthtab[slot].type); out: @@ -1503,6 +1503,13 @@ detect_type_by_data(uchar_t *buf, size_t len) } } + // BMP Files + if (buf[0] == 'B' && buf[1] == 'M') { + uint16_t typ = LE16(U16_P(buf + 14)); + if (typ == 12 || typ == 64 || typ == 40 || typ == 128) + return (TYPE_BINARY|TYPE_BMP); + } + // MSDOS COM types if (buf[0] == 0xe9 || buf[0] == 0xeb) { if (LE16(U16_P(buf + 0x1fe)) == 0xaa55) diff --git a/pcompress.c b/pcompress.c index bfef905..5730be1 100644 --- a/pcompress.c +++ b/pcompress.c @@ -180,12 +180,15 @@ show_compression_stats(pc_ctx_t *pctx) log_msg(LOG_INFO, 0, "No statistics to display."); } else { log_msg(LOG_INFO, 0, "Best compressed chunk : %s(%.2f%%)", - bytes_to_size(pctx->smallest_chunk), (double)pctx->smallest_chunk/(double)pctx->chunksize*100); + bytes_to_size(pctx->smallest_chunk), + (double)pctx->smallest_chunk/(double)pctx->chunksize*100); log_msg(LOG_INFO, 0, "Worst compressed chunk : %s(%.2f%%)", - bytes_to_size(pctx->largest_chunk), (double)pctx->largest_chunk/(double)pctx->chunksize*100); + bytes_to_size(pctx->largest_chunk), + (double)pctx->largest_chunk/(double)pctx->chunksize*100); pctx->avg_chunk /= pctx->chunk_num; log_msg(LOG_INFO, 0, "Avg compressed chunk : %s(%.2f%%)\n", - bytes_to_size(pctx->avg_chunk), (double)pctx->avg_chunk/(double)pctx->chunksize*100); + bytes_to_size(pctx->avg_chunk), + (double)pctx->avg_chunk/(double)pctx->chunksize*100); } } @@ -202,7 +205,8 @@ show_compression_stats(pc_ctx_t *pctx) */ static int preproc_compress(pc_ctx_t *pctx, compress_func_ptr cmp_func, void *src, uint64_t srclen, - void *dst, uint64_t *dstlen, int level, uchar_t chdr, int btype, void *data, algo_props_t *props) + void *dst, uint64_t *dstlen, int level, uchar_t chdr, int btype, void *data, + algo_props_t *props) { uchar_t *dest = (uchar_t *)dst, type = 0; int64_t result; @@ -233,7 +237,7 @@ preproc_compress(pc_ctx_t *pctx, compress_func_ptr cmp_func, void *src, uint64_t } } - if (pctx->lzp_preprocess) { + if (pctx->lzp_preprocess && PC_SUBTYPE(btype) != TYPE_BMP) { int hashsize; hashsize = lzp_hash_size(level); @@ -249,7 +253,8 @@ preproc_compress(pc_ctx_t *pctx, compress_func_ptr cmp_func, void *src, uint64_t } } - if (pctx->enable_delta2_encode && props->delta2_span > 0) { + if (pctx->enable_delta2_encode && props->delta2_span > 0 && + PC_SUBTYPE(btype) != TYPE_DNA_SEQ && PC_SUBTYPE(btype) != TYPE_BMP) { _dstlen = fromlen; result = delta2_encode((uchar_t *)from, fromlen, to, &_dstlen, props->delta2_span); @@ -284,7 +289,8 @@ preproc_compress(pc_ctx_t *pctx, compress_func_ptr cmp_func, void *src, uint64_t if (result > -1 && _dstlen < srclen) { *dest |= PREPROC_COMPRESSED; *dstlen = _dstlen + 9; - DEBUG_STAT_EN(fprintf(stderr, "Chunk compression speed %.3f MB/s\n", get_mb_s(srclen, strt, en))); + DEBUG_STAT_EN(fprintf(stderr, "Chunk compression speed %.3f MB/s\n", + get_mb_s(srclen, strt, en))); } else { DEBUG_STAT_EN(fprintf(stderr, "Chunk did not compress.\n")); memcpy(dest+1, src, srclen); @@ -304,7 +310,8 @@ preproc_compress(pc_ctx_t *pctx, compress_func_ptr cmp_func, void *src, uint64_t static int preproc_decompress(pc_ctx_t *pctx, compress_func_ptr dec_func, void *src, uint64_t srclen, - void *dst, uint64_t *dstlen, int level, uchar_t chdr, int btype, void *data, algo_props_t *props) + void *dst, uint64_t *dstlen, int level, uchar_t chdr, int btype, void *data, + algo_props_t *props) { uchar_t *sorc = (uchar_t *)src, type; int64_t result; @@ -323,7 +330,8 @@ preproc_decompress(pc_ctx_t *pctx, compress_func_ptr dec_func, void *src, uint64 DEBUG_STAT_EN(en = get_wtime_millis()); if (result < 0) return (result); - DEBUG_STAT_EN(fprintf(stderr, "Chunk decompression speed %.3f MB/s\n", get_mb_s(srclen, strt, en))); + DEBUG_STAT_EN(fprintf(stderr, "Chunk decompression speed %.3f MB/s\n", + get_mb_s(srclen, strt, en))); memcpy(src, dst, *dstlen); srclen = *dstlen; } else { @@ -368,7 +376,7 @@ preproc_decompress(pc_ctx_t *pctx, compress_func_ptr dec_func, void *src, uint64 } } - if (!(type & (PREPROC_COMPRESSED | PREPROC_TYPE_DELTA2 | PREPROC_TYPE_LZP | PREPROC_TYPE_DISPACK)) + if (!(type & (PREPROC_COMPRESSED|PREPROC_TYPE_DELTA2|PREPROC_TYPE_LZP|PREPROC_TYPE_DISPACK)) && type > 0) { log_msg(LOG_ERR, 0, "Invalid preprocessing flags: %d", type); return (-1); @@ -435,7 +443,8 @@ redo: DEBUG_STAT_EN(strt = get_wtime_millis()); len = pctx->mac_bytes; - deserialize_checksum(checksum, tdat->compressed_chunk + pctx->cksum_bytes, pctx->mac_bytes); + deserialize_checksum(checksum, tdat->compressed_chunk + pctx->cksum_bytes, + pctx->mac_bytes); memset(tdat->compressed_chunk + pctx->cksum_bytes, 0, pctx->mac_bytes); hmac_reinit(&tdat->chunk_hmac); hmac_update(&tdat->chunk_hmac, (uchar_t *)&tdat->len_cmp_be, sizeof (tdat->len_cmp_be)); @@ -641,7 +650,8 @@ redo: * If it does not match we set length of chunk to 0 to indicate * exit to the writer thread. */ - compute_checksum(checksum, pctx->cksum, tdat->uncompressed_chunk, _chunksize, tdat->cksum_mt, 1); + compute_checksum(checksum, pctx->cksum, tdat->uncompressed_chunk, + _chunksize, tdat->cksum_mt, 1); if (memcmp(checksum, tdat->checksum, pctx->cksum_bytes) != 0) { tdat->len_cmp = 0; log_msg(LOG_ERR, 0, "ERROR: Chunk %d, checksums do not match.", tdat->id); @@ -887,7 +897,8 @@ start_decompress(pc_ctx_t *pctx, const char *filename, char *to_filename) if (flags & FLAG_DEDUP_FIXED) { if (version > 7) { if (pctx->pipe_mode) { - log_msg(LOG_ERR, 0, "Global Deduplication is not supported with pipe mode."); + log_msg(LOG_ERR, 0, "Global Deduplication is not " + "supported with pipe mode."); err = 1; goto uncomp_done; } @@ -1129,7 +1140,8 @@ start_decompress(pc_ctx_t *pctx, const char *filename, char *to_filename) free(salt1); memset(n1, 0, noncelen); if (memcmp(hdr_hash2, hdr_hash1, pctx->mac_bytes) != 0) { - log_msg(LOG_ERR, 0, "Header verification failed! File tampered or wrong password."); + log_msg(LOG_ERR, 0, "Header verification failed! File " + "tampered or wrong password."); UNCOMP_BAIL; } } else if (version >= 5) { @@ -1158,7 +1170,8 @@ start_decompress(pc_ctx_t *pctx, const char *filename, char *to_filename) d2 = htonl(level); crc2 = lzma_crc32((uchar_t *)&d2, sizeof (level), crc2); if (crc1 != crc2) { - log_msg(LOG_ERR, 0, "Header verification failed! File tampered or wrong password."); + log_msg(LOG_ERR, 0, "Header verification failed! File tampered " + "or wrong password."); UNCOMP_BAIL; } } @@ -1169,7 +1182,8 @@ start_decompress(pc_ctx_t *pctx, const char *filename, char *to_filename) strcat(pctx->archive_temp_file, "/.data"); if ((pctx->archive_temp_fd = open(pctx->archive_temp_file, O_WRONLY|O_CREAT|O_TRUNC, S_IRUSR|S_IWUSR)) == -1) { - log_msg(LOG_ERR, 1, "Cannot open temporary data file in target directory."); + log_msg(LOG_ERR, 1, "Cannot open temporary data file in " + "target directory."); UNCOMP_BAIL; } add_fname(pctx->archive_temp_file); @@ -2883,7 +2897,8 @@ init_pc_context(pc_ctx_t *pctx, int argc, char *argv[]) case 'e': pctx->encrypt_type = get_crypto_alg(optarg); if (pctx->encrypt_type == 0) { - log_msg(LOG_ERR, 0, "Invalid encryption algorithm. Should be AES or SALSA20.", optarg); + log_msg(LOG_ERR, 0, "Invalid encryption algorithm. " + "Should be AES or SALSA20.", optarg); return (1); } break; @@ -3098,12 +3113,14 @@ init_pc_context(pc_ctx_t *pctx, int argc, char *argv[]) pctx->to_filename = NULL; } else { strcpy(apath, argv[my_optind]); - strcat(apath, COMP_EXTN); + if (!endswith(apath, COMP_EXTN)) + strcat(apath, COMP_EXTN); pctx->to_filename = realpath(apath, NULL); /* Check if compressed file exists */ if (pctx->to_filename != NULL) { - log_msg(LOG_ERR, 0, "Compressed file %s exists", pctx->to_filename); + log_msg(LOG_ERR, 0, "Compressed file %s exists", + pctx->to_filename); free((void *)(pctx->to_filename)); return (1); } @@ -3111,12 +3128,14 @@ init_pc_context(pc_ctx_t *pctx, int argc, char *argv[]) } } else { strcpy(apath, pctx->filename); - strcat(apath, COMP_EXTN); + if (!endswith(apath, COMP_EXTN)) + strcat(apath, COMP_EXTN); pctx->to_filename = realpath(apath, NULL); /* Check if compressed file exists */ if (pctx->to_filename != NULL) { - log_msg(LOG_ERR, 0, "Compressed file %s exists", pctx->to_filename); + log_msg(LOG_ERR, 0, "Compressed file %s exists", + pctx->to_filename); free((void *)(pctx->to_filename)); return (1); } diff --git a/utils/phash/extensions.h b/utils/phash/extensions.h index 98679c7..7174e59 100644 --- a/utils/phash/extensions.h +++ b/utils/phash/extensions.h @@ -132,7 +132,6 @@ struct ext_entry { {"zpaq" , TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_ZPAQ, 4}, {"xcf" , TYPE_BINARY, 3}, {"mo" , TYPE_BINARY, 2}, - {"bmp" , TYPE_BINARY, 3}, {"pyo" , TYPE_BINARY, 3}, {"pyc" , TYPE_BINARY, 3}, {"wav" , TYPE_BINARY, 3}, diff --git a/utils/phash/extensions.txt b/utils/phash/extensions.txt index 8c43b42..32a86a2 100644 --- a/utils/phash/extensions.txt +++ b/utils/phash/extensions.txt @@ -122,7 +122,6 @@ pmd,TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_PPMD zpaq,TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_ZPAQ xcf,TYPE_BINARY mo,TYPE_BINARY -bmp,TYPE_BINARY pyo,TYPE_BINARY pyc,TYPE_BINARY wav,TYPE_BINARY diff --git a/utils/phash/phash.c b/utils/phash/phash.c index 2c13753..742254e 100644 --- a/utils/phash/phash.c +++ b/utils/phash/phash.c @@ -19,7 +19,7 @@ ub1 tab[] = { 0,87,0,0,113,0,125,183,82,0,124,88,40,125,0,0, 124,0,168,125,0,125,0,40,0,82,125,113,113,125,116,0, 0,0,113,85,0,88,0,0,42,27,0,0,0,40,183,61, -0,0,0,0,0,111,17,0,87,125,0,0,166,91,0,0, +0,0,0,0,0,111,0,0,87,125,0,0,127,91,0,0, }; /* The hash function */ diff --git a/utils/phash/phash.h b/utils/phash/phash.h index 74bd726..7d4bc16 100644 --- a/utils/phash/phash.h +++ b/utils/phash/phash.h @@ -8,7 +8,7 @@ extern ub1 tab[]; #define PHASHLEN 0x80 /* length of hash mapping table */ -#define PHASHNKEYS 133 /* How many keys were hashed */ +#define PHASHNKEYS 132 /* How many keys were hashed */ #define PHASHRANGE 256 /* Range any input might map to */ #define PHASHSALT 0x9e3779b9 /* internal, initialize normal hash */ diff --git a/utils/utils.h b/utils/utils.h index 7bb0d6f..9726163 100644 --- a/utils/utils.h +++ b/utils/utils.h @@ -38,6 +38,7 @@ #include #include #include +#include #include #ifdef __cplusplus @@ -269,7 +270,8 @@ typedef enum { TYPE_DNA_SEQ = 168, TYPE_MJPEG = 176, TYPE_AUDIO_COMPRESSED = 184, - TYPE_EXE64 = 192 + TYPE_EXE64 = 192, + TYPE_BMP = 200 } data_type_t; /* @@ -384,6 +386,18 @@ roundup_pow_two(unsigned int v) { return (v); } +static inline int +endswith(char *haystack, char *needle) { + size_t len = strlen(haystack); + size_t nlen = strlen(needle); + if (nlen > len) + return (0); + size_t pos = len - nlen; + if (memcmp(&haystack[pos], needle, nlen) != 0) + return (0); + return (1); +} + #ifdef __cplusplus } #endif