From a873f92e4170a14cb5a9d79f47e4ad3c38b4ab77 Mon Sep 17 00:00:00 2001 From: Moinak Ghosh Date: Tue, 10 Jul 2012 20:14:23 +0530 Subject: [PATCH] Fix crash when decompressing deduped archive. Ensure correct level is passed to lzma. Avoid branch when wrapping rabin window position and check for rabin window size to be power of 2. Update rabin parameters check for adaptive modes. Add detection of 7-bit text/8-bit binary data for later use. --- lzma_compress.c | 1 + main.c | 6 +++--- rabin/rabin_polynomial.c | 27 ++++++++++++++++++++++----- rabin/rabin_polynomial.h | 10 +++++----- utils.h | 5 +++++ 5 files changed, 36 insertions(+), 13 deletions(-) diff --git a/lzma_compress.c b/lzma_compress.c index 290b59a..02f80b2 100644 --- a/lzma_compress.c +++ b/lzma_compress.c @@ -85,6 +85,7 @@ lzma_init(void **data, int *level, ssize_t chunksize) p->fb = 128; p->mc = 256; } + if (*level > 9) *level = 9; p->level = *level; LzmaEncProps_Normalize(p); slab_cache_add(p->litprob_sz); diff --git a/main.c b/main.c index 4e828c8..6183bab 100644 --- a/main.c +++ b/main.c @@ -189,7 +189,7 @@ redo: } if (HDR & COMPRESSED) { - if (enable_rabin_scan && (HDR & FLAG_DEDUP)) { + if (enable_rabin_scan && (HDR & CHUNK_FLAG_DEDUP)) { uchar_t *cmpbuf, *ubuf; /* Extract various sizes from rabin header. */ @@ -232,7 +232,7 @@ redo: goto cont; } /* Rebuild chunk from dedup blocks. */ - if (enable_rabin_scan && (HDR & FLAG_DEDUP)) { + if (enable_rabin_scan && (HDR & CHUNK_FLAG_DEDUP)) { rabin_context_t *rctx; uchar_t *tmp; @@ -488,6 +488,7 @@ start_decompress(const char *filename, const char *to_filename) UNCOMP_BAIL; } tdat->len_cmp = htonll(tdat->len_cmp); + /* * Zero compressed len means end of file. */ @@ -507,7 +508,6 @@ start_decompress(const char *filename, const char *to_filename) */ tdat->rbytes = Read(compfd, tdat->compressed_chunk, tdat->len_cmp + sizeof(tdat->crc64) + CHDR_SZ); - if (main_cancel) break; if (tdat->rbytes < tdat->len_cmp + sizeof(tdat->crc64) + CHDR_SZ) { if (tdat->rbytes < 0) { diff --git a/rabin/rabin_polynomial.c b/rabin/rabin_polynomial.c index 6a8f0eb..0d38fda 100755 --- a/rabin/rabin_polynomial.c +++ b/rabin/rabin_polynomial.c @@ -95,13 +95,21 @@ create_rabin_context(uint64_t chunksize, uint64_t real_chunksize, const char *al uint32_t blknum; int level = 14; + /* + * Rabin window size must be power of 2 for optimization. + */ + if (!ISP2(RAB_POLYNOMIAL_WIN_SIZE)) { + fprintf(stderr, "Rabin window size must be a power of 2 in range 4 <= x <= 64\n"); + return (NULL); + } /* * For LZMA with chunksize <= LZMA Window size we use 4K minimum Rabin * block size. For everything else it is 1K based on experimentation. */ ctx = (rabin_context_t *)slab_alloc(NULL, sizeof (rabin_context_t)); ctx->rabin_poly_max_block_size = RAB_POLYNOMIAL_MAX_BLOCK_SIZE; - if (memcmp(algo, "lzma", 4) == 0 && chunksize <= LZMA_WINDOW_MAX) { + if ((memcmp(algo, "lzma", 4) == 0 || memcmp(algo, "adapt", 5) == 0) && + chunksize <= LZMA_WINDOW_MAX) { ctx->rabin_poly_min_block_size = RAB_POLYNOMIAL_MIN_BLOCK_SIZE; ctx->rabin_avg_block_mask = RAB_POLYNOMIAL_AVG_BLOCK_MASK; ctx->rabin_poly_avg_block_size = RAB_POLYNOMIAL_AVG_BLOCK_SIZE; @@ -216,6 +224,7 @@ rabin_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offset, s blknum = 0; ctx->valid = 0; ctx->cur_checksum = 0; + j = 0; /* * If rabin_pos is non-zero then we are being asked to scan for the last rabin boundary @@ -244,11 +253,13 @@ rabin_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offset, s // CRC64 Calculation swiped from LZMA ctx->cur_checksum = lzma_crc64_table[0][cur_byte ^ A1(ctx->cur_checksum)] ^ S8(ctx->cur_checksum); - ctx->window_pos++; - length++; + // Count how many bytes have msb set. Needed to detect 7-bit text data. + j += (cur_byte >> 7); - if (ctx->window_pos == RAB_POLYNOMIAL_WIN_SIZE) // Loop back around - ctx->window_pos=0; + // Window pos has to rotate from 0 .. RAB_POLYNOMIAL_WIN_SIZE-1 + // This requires RAB_POLYNOMIAL_WIN_SIZE to be power of 2 + ctx->window_pos = (ctx->window_pos + 1) & (RAB_POLYNOMIAL_WIN_SIZE-1); + length++; if (length < ctx->rabin_poly_min_block_size) continue; @@ -273,6 +284,11 @@ rabin_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offset, s *rabin_pos = last_offset; return (0); } + if (j < *size * 0.55) + ctx->data_type = DATA_BINARY; + else + ctx->data_type = DATA_TEXT; +printf("Original size: %lld\n", *size); // If we found at least a few chunks, perform dedup. if (blknum > 2) { uint64_t prev_cksum; @@ -448,6 +464,7 @@ cont: entries[2] = htonll(pos1 - rabin_index_sz - RABIN_HDR_SIZE); *size = pos1; ctx->valid = 1; +printf("Deduped size: %lld, blknum: %u\n", *size, blknum); /* * Remaining header entries: size of compressed index and size of diff --git a/rabin/rabin_polynomial.h b/rabin/rabin_polynomial.h index 8c58122..f118b3b 100644 --- a/rabin/rabin_polynomial.h +++ b/rabin/rabin_polynomial.h @@ -80,9 +80,9 @@ #define RAB_POLYNOMIAL_MIN_BLOCK_SIZE2 2048 #define LZMA_WINDOW_MAX (128L * 1024L * 1024L) -#define RAB_POLYNOMIAL_WIN_SIZE 31 -#define RAB_POLYNOMIAL_MIN_WIN_SIZE 17 -#define RAB_POLYNOMIAL_MAX_WIN_SIZE 63 +#define RAB_POLYNOMIAL_WIN_SIZE 32 +#define RAB_POLYNOMIAL_MIN_WIN_SIZE 8 +#define RAB_POLYNOMIAL_MAX_WIN_SIZE 64 typedef struct { ssize_t offset; @@ -134,8 +134,8 @@ typedef struct { uint32_t rabin_avg_block_mask; uint32_t rabin_break_patt; uint64_t real_chunksize; - int dedup; - int valid; + short valid; + short data_type; void *lzma_data; int level; } rabin_context_t; diff --git a/utils.h b/utils.h index 62ea7cd..03b0e0b 100644 --- a/utils.h +++ b/utils.h @@ -31,6 +31,9 @@ extern "C" { #endif +#define DATA_TEXT 1 +#define DATA_BINARY 2 + #if !defined(sun) && !defined(__sun) #define ulong_t u_long #define uchar_t u_char @@ -92,6 +95,8 @@ typedef unsigned long uintptr_t; # endif #endif +#define ISP2(x) ((x != 0) && ((x & (~x + 1)) == x)) + extern void err_exit(int show_errno, const char *format, ...); extern const char *get_execname(const char *); extern int parse_numeric(ssize_t *val, const char *str);