Fix crash when decompressing deduped archive.

Ensure the correct level is passed to LZMA.
Avoid a branch when wrapping the Rabin window position, and check that the Rabin window size is a power of 2.
Update the Rabin parameter check to cover adaptive modes.
Add detection of 7-bit text/8-bit binary data for later use.
This commit is contained in:
Moinak Ghosh 2012-07-10 20:14:23 +05:30
parent db0c9ea9ac
commit a873f92e41
5 changed files with 36 additions and 13 deletions

View file

@ -85,6 +85,7 @@ lzma_init(void **data, int *level, ssize_t chunksize)
p->fb = 128; p->fb = 128;
p->mc = 256; p->mc = 256;
} }
if (*level > 9) *level = 9;
p->level = *level; p->level = *level;
LzmaEncProps_Normalize(p); LzmaEncProps_Normalize(p);
slab_cache_add(p->litprob_sz); slab_cache_add(p->litprob_sz);

6
main.c
View file

@ -189,7 +189,7 @@ redo:
} }
if (HDR & COMPRESSED) { if (HDR & COMPRESSED) {
if (enable_rabin_scan && (HDR & FLAG_DEDUP)) { if (enable_rabin_scan && (HDR & CHUNK_FLAG_DEDUP)) {
uchar_t *cmpbuf, *ubuf; uchar_t *cmpbuf, *ubuf;
/* Extract various sizes from rabin header. */ /* Extract various sizes from rabin header. */
@ -232,7 +232,7 @@ redo:
goto cont; goto cont;
} }
/* Rebuild chunk from dedup blocks. */ /* Rebuild chunk from dedup blocks. */
if (enable_rabin_scan && (HDR & FLAG_DEDUP)) { if (enable_rabin_scan && (HDR & CHUNK_FLAG_DEDUP)) {
rabin_context_t *rctx; rabin_context_t *rctx;
uchar_t *tmp; uchar_t *tmp;
@ -488,6 +488,7 @@ start_decompress(const char *filename, const char *to_filename)
UNCOMP_BAIL; UNCOMP_BAIL;
} }
tdat->len_cmp = htonll(tdat->len_cmp); tdat->len_cmp = htonll(tdat->len_cmp);
/* /*
* Zero compressed len means end of file. * Zero compressed len means end of file.
*/ */
@ -507,7 +508,6 @@ start_decompress(const char *filename, const char *to_filename)
*/ */
tdat->rbytes = Read(compfd, tdat->compressed_chunk, tdat->rbytes = Read(compfd, tdat->compressed_chunk,
tdat->len_cmp + sizeof(tdat->crc64) + CHDR_SZ); tdat->len_cmp + sizeof(tdat->crc64) + CHDR_SZ);
if (main_cancel) break; if (main_cancel) break;
if (tdat->rbytes < tdat->len_cmp + sizeof(tdat->crc64) + CHDR_SZ) { if (tdat->rbytes < tdat->len_cmp + sizeof(tdat->crc64) + CHDR_SZ) {
if (tdat->rbytes < 0) { if (tdat->rbytes < 0) {

View file

@ -95,13 +95,21 @@ create_rabin_context(uint64_t chunksize, uint64_t real_chunksize, const char *al
uint32_t blknum; uint32_t blknum;
int level = 14; int level = 14;
/*
* Rabin window size must be power of 2 for optimization.
*/
if (!ISP2(RAB_POLYNOMIAL_WIN_SIZE)) {
fprintf(stderr, "Rabin window size must be a power of 2 in range 4 <= x <= 64\n");
return (NULL);
}
/* /*
* For LZMA with chunksize <= LZMA Window size we use 4K minimum Rabin * For LZMA with chunksize <= LZMA Window size we use 4K minimum Rabin
* block size. For everything else it is 1K based on experimentation. * block size. For everything else it is 1K based on experimentation.
*/ */
ctx = (rabin_context_t *)slab_alloc(NULL, sizeof (rabin_context_t)); ctx = (rabin_context_t *)slab_alloc(NULL, sizeof (rabin_context_t));
ctx->rabin_poly_max_block_size = RAB_POLYNOMIAL_MAX_BLOCK_SIZE; ctx->rabin_poly_max_block_size = RAB_POLYNOMIAL_MAX_BLOCK_SIZE;
if (memcmp(algo, "lzma", 4) == 0 && chunksize <= LZMA_WINDOW_MAX) { if ((memcmp(algo, "lzma", 4) == 0 || memcmp(algo, "adapt", 5) == 0) &&
chunksize <= LZMA_WINDOW_MAX) {
ctx->rabin_poly_min_block_size = RAB_POLYNOMIAL_MIN_BLOCK_SIZE; ctx->rabin_poly_min_block_size = RAB_POLYNOMIAL_MIN_BLOCK_SIZE;
ctx->rabin_avg_block_mask = RAB_POLYNOMIAL_AVG_BLOCK_MASK; ctx->rabin_avg_block_mask = RAB_POLYNOMIAL_AVG_BLOCK_MASK;
ctx->rabin_poly_avg_block_size = RAB_POLYNOMIAL_AVG_BLOCK_SIZE; ctx->rabin_poly_avg_block_size = RAB_POLYNOMIAL_AVG_BLOCK_SIZE;
@ -216,6 +224,7 @@ rabin_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offset, s
blknum = 0; blknum = 0;
ctx->valid = 0; ctx->valid = 0;
ctx->cur_checksum = 0; ctx->cur_checksum = 0;
j = 0;
/* /*
* If rabin_pos is non-zero then we are being asked to scan for the last rabin boundary * If rabin_pos is non-zero then we are being asked to scan for the last rabin boundary
@ -244,11 +253,13 @@ rabin_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offset, s
// CRC64 Calculation swiped from LZMA // CRC64 Calculation swiped from LZMA
ctx->cur_checksum = lzma_crc64_table[0][cur_byte ^ A1(ctx->cur_checksum)] ^ S8(ctx->cur_checksum); ctx->cur_checksum = lzma_crc64_table[0][cur_byte ^ A1(ctx->cur_checksum)] ^ S8(ctx->cur_checksum);
ctx->window_pos++; // Count how many bytes have msb set. Needed to detect 7-bit text data.
length++; j += (cur_byte >> 7);
if (ctx->window_pos == RAB_POLYNOMIAL_WIN_SIZE) // Loop back around // Window pos has to rotate from 0 .. RAB_POLYNOMIAL_WIN_SIZE-1
ctx->window_pos=0; // This requires RAB_POLYNOMIAL_WIN_SIZE to be power of 2
ctx->window_pos = (ctx->window_pos + 1) & (RAB_POLYNOMIAL_WIN_SIZE-1);
length++;
if (length < ctx->rabin_poly_min_block_size) continue; if (length < ctx->rabin_poly_min_block_size) continue;
@ -273,6 +284,11 @@ rabin_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offset, s
*rabin_pos = last_offset; *rabin_pos = last_offset;
return (0); return (0);
} }
if (j < *size * 0.55)
ctx->data_type = DATA_BINARY;
else
ctx->data_type = DATA_TEXT;
printf("Original size: %lld\n", *size);
// If we found at least a few chunks, perform dedup. // If we found at least a few chunks, perform dedup.
if (blknum > 2) { if (blknum > 2) {
uint64_t prev_cksum; uint64_t prev_cksum;
@ -448,6 +464,7 @@ cont:
entries[2] = htonll(pos1 - rabin_index_sz - RABIN_HDR_SIZE); entries[2] = htonll(pos1 - rabin_index_sz - RABIN_HDR_SIZE);
*size = pos1; *size = pos1;
ctx->valid = 1; ctx->valid = 1;
printf("Deduped size: %lld, blknum: %u\n", *size, blknum);
/* /*
* Remaining header entries: size of compressed index and size of * Remaining header entries: size of compressed index and size of

View file

@ -80,9 +80,9 @@
#define RAB_POLYNOMIAL_MIN_BLOCK_SIZE2 2048 #define RAB_POLYNOMIAL_MIN_BLOCK_SIZE2 2048
#define LZMA_WINDOW_MAX (128L * 1024L * 1024L) #define LZMA_WINDOW_MAX (128L * 1024L * 1024L)
#define RAB_POLYNOMIAL_WIN_SIZE 31 #define RAB_POLYNOMIAL_WIN_SIZE 32
#define RAB_POLYNOMIAL_MIN_WIN_SIZE 17 #define RAB_POLYNOMIAL_MIN_WIN_SIZE 8
#define RAB_POLYNOMIAL_MAX_WIN_SIZE 63 #define RAB_POLYNOMIAL_MAX_WIN_SIZE 64
typedef struct { typedef struct {
ssize_t offset; ssize_t offset;
@ -134,8 +134,8 @@ typedef struct {
uint32_t rabin_avg_block_mask; uint32_t rabin_avg_block_mask;
uint32_t rabin_break_patt; uint32_t rabin_break_patt;
uint64_t real_chunksize; uint64_t real_chunksize;
int dedup; short valid;
int valid; short data_type;
void *lzma_data; void *lzma_data;
int level; int level;
} rabin_context_t; } rabin_context_t;

View file

@ -31,6 +31,9 @@
extern "C" { extern "C" {
#endif #endif
/*
 * Coarse data-type classification codes (7-bit text vs. 8-bit binary data).
 */
#define DATA_TEXT 1
#define DATA_BINARY 2
#if !defined(sun) && !defined(__sun) #if !defined(sun) && !defined(__sun)
#define ulong_t u_long #define ulong_t u_long
#define uchar_t u_char #define uchar_t u_char
@ -92,6 +95,8 @@ typedef unsigned long uintptr_t;
# endif # endif
#endif #endif
/*
 * ISP2(x): evaluates to non-zero when x is a power of two (exactly one bit
 * set) and to zero otherwise, including for x == 0.
 *
 * (x & (~x + 1)) isolates the lowest set bit of x; x is a power of two
 * exactly when that isolated bit equals x itself.
 *
 * Every use of the argument is parenthesized so that operator expressions
 * such as ISP2(a + b) or ISP2(a | b) expand with the intended precedence.
 * The argument is evaluated more than once, so it must not have side effects.
 */
#define ISP2(x) (((x) != 0) && (((x) & (~(x) + 1)) == (x)))
extern void err_exit(int show_errno, const char *format, ...); extern void err_exit(int show_errno, const char *format, ...);
extern const char *get_execname(const char *); extern const char *get_execname(const char *);
extern int parse_numeric(ssize_t *val, const char *str); extern int parse_numeric(ssize_t *val, const char *str);