Patch summary
-------------
lzma_compress.c          lzma_init(): clamp *level to at most 9 before storing it.
main.c                   Test CHUNK_FLAG_DEDUP instead of FLAG_DEDUP in the two
                         dedup checks; blank-line shuffling in start_decompress().
rabin/rabin_polynomial.c create_rabin_context(): return NULL unless
                         RAB_POLYNOMIAL_WIN_SIZE is a power of two, and treat the
                         "adapt" algorithm like "lzma" when choosing block sizes.
                         rabin_dedup(): count bytes with the msb set in j, wrap
                         window_pos with a mask instead of a compare-and-reset,
                         and record ctx->data_type from that count.
rabin/rabin_polynomial.h Window sizes 31/17/63 become 32/8/64; rabin_context_t
                         replaces "int dedup; int valid;" with
                         "short valid; short data_type;".
utils.h                  Adds DATA_TEXT, DATA_BINARY and the ISP2() macro.

Amendments made to the patch as received
----------------------------------------
* ISP2(x) now parenthesizes every use of its argument, so an expression
  argument such as ISP2(a + b) expands correctly.
* The two printf() calls added at column 0 in rabin_dedup() are dropped. They
  passed *size (declared ssize_t *size) to a %lld conversion and look like
  leftover debug output. As a result the hunk at old line 448 is gone and the
  hunk at old line 273 is now +284,10.
* The "index" lines of rabin/rabin_polynomial.c and utils.h still carry the
  blob ids of the patch as received.

Reconstruction caveats
----------------------
* The received text had lost its line breaks and indentation. Line structure
  was rebuilt from the hunk line counts; indentation and the position of blank
  context lines are best guesses. Apply with a whitespace-tolerant option or
  regenerate the patch from the repository.
* In the main.c hunk at old line 507 the removed line is ambiguous between a
  blank line and "if (main_cancel) break;". It is reconstructed here as a
  removed blank line (cancel check kept) -- TODO confirm against the tree.

Open review questions (not changed here)
----------------------------------------
* NOTE(review): rabin_dedup() sets DATA_BINARY when fewer than 55% of the bytes
  have the msb set and DATA_TEXT otherwise. That looks inverted relative to the
  "detect 7-bit text data" comment -- confirm the intended test.
* NOTE(review): the window-size error message mentions "4 <= x <= 64", but the
  visible check only tests ISP2(); no range check is visible in this patch.
* NOTE(review): rabin_context_t no longer has a "dedup" member; any user of
  ctx->dedup outside this patch would need updating -- verify.

diff --git a/lzma_compress.c b/lzma_compress.c
index 290b59a..02f80b2 100644
--- a/lzma_compress.c
+++ b/lzma_compress.c
@@ -85,6 +85,7 @@ lzma_init(void **data, int *level, ssize_t chunksize)
 		p->fb = 128;
 		p->mc = 256;
 	}
+	if (*level > 9) *level = 9;
 	p->level = *level;
 	LzmaEncProps_Normalize(p);
 	slab_cache_add(p->litprob_sz);
diff --git a/main.c b/main.c
index 4e828c8..6183bab 100644
--- a/main.c
+++ b/main.c
@@ -189,7 +189,7 @@ redo:
 	}
 
 	if (HDR & COMPRESSED) {
-		if (enable_rabin_scan && (HDR & FLAG_DEDUP)) {
+		if (enable_rabin_scan && (HDR & CHUNK_FLAG_DEDUP)) {
 			uchar_t *cmpbuf, *ubuf;
 
 			/* Extract various sizes from rabin header. */
@@ -232,7 +232,7 @@ redo:
 			goto cont;
 		}
 		/* Rebuild chunk from dedup blocks. */
-		if (enable_rabin_scan && (HDR & FLAG_DEDUP)) {
+		if (enable_rabin_scan && (HDR & CHUNK_FLAG_DEDUP)) {
 			rabin_context_t *rctx;
 			uchar_t *tmp;
 
@@ -488,6 +488,7 @@ start_decompress(const char *filename, const char *to_filename)
 				UNCOMP_BAIL;
 			}
 			tdat->len_cmp = htonll(tdat->len_cmp);
+
 			/*
 			 * Zero compressed len means end of file.
 			 */
@@ -507,7 +508,6 @@ start_decompress(const char *filename, const char *to_filename)
 			 */
 			tdat->rbytes = Read(compfd, tdat->compressed_chunk,
 			    tdat->len_cmp + sizeof(tdat->crc64) + CHDR_SZ);
-
 			if (main_cancel) break;
 			if (tdat->rbytes < tdat->len_cmp + sizeof(tdat->crc64) + CHDR_SZ) {
 				if (tdat->rbytes < 0) {
diff --git a/rabin/rabin_polynomial.c b/rabin/rabin_polynomial.c
index 6a8f0eb..0d38fda 100755
--- a/rabin/rabin_polynomial.c
+++ b/rabin/rabin_polynomial.c
@@ -95,13 +95,21 @@ create_rabin_context(uint64_t chunksize, uint64_t real_chunksize, const char *al
 	uint32_t blknum;
 	int level = 14;
 
+	/*
+	 * Rabin window size must be power of 2 for optimization.
+	 */
+	if (!ISP2(RAB_POLYNOMIAL_WIN_SIZE)) {
+		fprintf(stderr, "Rabin window size must be a power of 2 in range 4 <= x <= 64\n");
+		return (NULL);
+	}
 	/*
 	 * For LZMA with chunksize <= LZMA Window size we use 4K minimum Rabin
 	 * block size. For everything else it is 1K based on experimentation.
 	 */
 	ctx = (rabin_context_t *)slab_alloc(NULL, sizeof (rabin_context_t));
 	ctx->rabin_poly_max_block_size = RAB_POLYNOMIAL_MAX_BLOCK_SIZE;
-	if (memcmp(algo, "lzma", 4) == 0 && chunksize <= LZMA_WINDOW_MAX) {
+	if ((memcmp(algo, "lzma", 4) == 0 || memcmp(algo, "adapt", 5) == 0) &&
+	    chunksize <= LZMA_WINDOW_MAX) {
 		ctx->rabin_poly_min_block_size = RAB_POLYNOMIAL_MIN_BLOCK_SIZE;
 		ctx->rabin_avg_block_mask = RAB_POLYNOMIAL_AVG_BLOCK_MASK;
 		ctx->rabin_poly_avg_block_size = RAB_POLYNOMIAL_AVG_BLOCK_SIZE;
@@ -216,6 +224,7 @@ rabin_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offset, s
 	blknum = 0;
 	ctx->valid = 0;
 	ctx->cur_checksum = 0;
+	j = 0;
 
 	/*
 	 * If rabin_pos is non-zero then we are being asked to scan for the last rabin boundary
@@ -244,11 +253,13 @@ rabin_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offset, s
 		// CRC64 Calculation swiped from LZMA
 		ctx->cur_checksum = lzma_crc64_table[0][cur_byte ^ A1(ctx->cur_checksum)] ^ S8(ctx->cur_checksum);
 
-		ctx->window_pos++;
-		length++;
+		// Count how many bytes have msb set. Needed to detect 7-bit text data.
+		j += (cur_byte >> 7);
 
-		if (ctx->window_pos == RAB_POLYNOMIAL_WIN_SIZE) // Loop back around
-			ctx->window_pos=0;
+		// Window pos has to rotate from 0 .. RAB_POLYNOMIAL_WIN_SIZE-1
+		// This requires RAB_POLYNOMIAL_WIN_SIZE to be power of 2
+		ctx->window_pos = (ctx->window_pos + 1) & (RAB_POLYNOMIAL_WIN_SIZE-1);
+		length++;
 		if (length < ctx->rabin_poly_min_block_size)
 			continue;
 
@@ -273,6 +284,10 @@ rabin_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offset, s
 		*rabin_pos = last_offset;
 		return (0);
 	}
+	if (j < *size * 0.55)
+		ctx->data_type = DATA_BINARY;
+	else
+		ctx->data_type = DATA_TEXT;
 	// If we found at least a few chunks, perform dedup.
 	if (blknum > 2) {
 		uint64_t prev_cksum;
diff --git a/rabin/rabin_polynomial.h b/rabin/rabin_polynomial.h
index 8c58122..f118b3b 100644
--- a/rabin/rabin_polynomial.h
+++ b/rabin/rabin_polynomial.h
@@ -80,9 +80,9 @@
 #define RAB_POLYNOMIAL_MIN_BLOCK_SIZE2 2048
 
 #define LZMA_WINDOW_MAX (128L * 1024L * 1024L)
-#define RAB_POLYNOMIAL_WIN_SIZE 31
-#define RAB_POLYNOMIAL_MIN_WIN_SIZE 17
-#define RAB_POLYNOMIAL_MAX_WIN_SIZE 63
+#define RAB_POLYNOMIAL_WIN_SIZE 32
+#define RAB_POLYNOMIAL_MIN_WIN_SIZE 8
+#define RAB_POLYNOMIAL_MAX_WIN_SIZE 64
 
 typedef struct {
 	ssize_t offset;
@@ -134,8 +134,8 @@ typedef struct {
 	uint32_t rabin_avg_block_mask;
 	uint32_t rabin_break_patt;
 	uint64_t real_chunksize;
-	int dedup;
-	int valid;
+	short valid;
+	short data_type;
 	void *lzma_data;
 	int level;
 } rabin_context_t;
diff --git a/utils.h b/utils.h
index 62ea7cd..03b0e0b 100644
--- a/utils.h
+++ b/utils.h
@@ -31,6 +31,9 @@
 extern "C" {
 #endif
 
+#define DATA_TEXT 1
+#define DATA_BINARY 2
+
 #if !defined(sun) && !defined(__sun)
 #define ulong_t u_long
 #define uchar_t u_char
@@ -92,6 +95,8 @@ typedef unsigned long uintptr_t;
 # endif
 #endif
 
+#define ISP2(x) (((x) != 0) && (((x) & (~(x) + 1)) == (x)))
+
 extern void err_exit(int show_errno, const char *format, ...);
 extern const char *get_execname(const char *);
 extern int parse_numeric(ssize_t *val, const char *str);