Fix crash when decompressing deduped archive.

Ensure the correct compression level is passed to LZMA.
Avoid a branch when wrapping the Rabin window position, and check that the Rabin window size is a power of 2.
Update the Rabin parameter check to also cover the adaptive modes.
Add detection of 7-bit text versus 8-bit binary data for later use.
This commit is contained in:
Moinak Ghosh 2012-07-10 20:14:23 +05:30
parent db0c9ea9ac
commit a873f92e41
5 changed files with 36 additions and 13 deletions

View file

@ -85,6 +85,7 @@ lzma_init(void **data, int *level, ssize_t chunksize)
p->fb = 128;
p->mc = 256;
}
if (*level > 9) *level = 9;
p->level = *level;
LzmaEncProps_Normalize(p);
slab_cache_add(p->litprob_sz);

6
main.c
View file

@ -189,7 +189,7 @@ redo:
}
if (HDR & COMPRESSED) {
if (enable_rabin_scan && (HDR & FLAG_DEDUP)) {
if (enable_rabin_scan && (HDR & CHUNK_FLAG_DEDUP)) {
uchar_t *cmpbuf, *ubuf;
/* Extract various sizes from rabin header. */
@ -232,7 +232,7 @@ redo:
goto cont;
}
/* Rebuild chunk from dedup blocks. */
if (enable_rabin_scan && (HDR & FLAG_DEDUP)) {
if (enable_rabin_scan && (HDR & CHUNK_FLAG_DEDUP)) {
rabin_context_t *rctx;
uchar_t *tmp;
@ -488,6 +488,7 @@ start_decompress(const char *filename, const char *to_filename)
UNCOMP_BAIL;
}
tdat->len_cmp = htonll(tdat->len_cmp);
/*
* Zero compressed len means end of file.
*/
@ -507,7 +508,6 @@ start_decompress(const char *filename, const char *to_filename)
*/
tdat->rbytes = Read(compfd, tdat->compressed_chunk,
tdat->len_cmp + sizeof(tdat->crc64) + CHDR_SZ);
if (main_cancel) break;
if (tdat->rbytes < tdat->len_cmp + sizeof(tdat->crc64) + CHDR_SZ) {
if (tdat->rbytes < 0) {

View file

@ -95,13 +95,21 @@ create_rabin_context(uint64_t chunksize, uint64_t real_chunksize, const char *al
uint32_t blknum;
int level = 14;
/*
* Rabin window size must be power of 2 for optimization.
*/
if (!ISP2(RAB_POLYNOMIAL_WIN_SIZE)) {
fprintf(stderr, "Rabin window size must be a power of 2 in range 4 <= x <= 64\n");
return (NULL);
}
/*
* For LZMA with chunksize <= LZMA Window size we use 4K minimum Rabin
* block size. For everything else it is 1K based on experimentation.
*/
ctx = (rabin_context_t *)slab_alloc(NULL, sizeof (rabin_context_t));
ctx->rabin_poly_max_block_size = RAB_POLYNOMIAL_MAX_BLOCK_SIZE;
if (memcmp(algo, "lzma", 4) == 0 && chunksize <= LZMA_WINDOW_MAX) {
if ((memcmp(algo, "lzma", 4) == 0 || memcmp(algo, "adapt", 5) == 0) &&
chunksize <= LZMA_WINDOW_MAX) {
ctx->rabin_poly_min_block_size = RAB_POLYNOMIAL_MIN_BLOCK_SIZE;
ctx->rabin_avg_block_mask = RAB_POLYNOMIAL_AVG_BLOCK_MASK;
ctx->rabin_poly_avg_block_size = RAB_POLYNOMIAL_AVG_BLOCK_SIZE;
@ -216,6 +224,7 @@ rabin_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offset, s
blknum = 0;
ctx->valid = 0;
ctx->cur_checksum = 0;
j = 0;
/*
* If rabin_pos is non-zero then we are being asked to scan for the last rabin boundary
@ -244,11 +253,13 @@ rabin_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offset, s
// CRC64 Calculation swiped from LZMA
ctx->cur_checksum = lzma_crc64_table[0][cur_byte ^ A1(ctx->cur_checksum)] ^ S8(ctx->cur_checksum);
ctx->window_pos++;
length++;
// Count how many bytes have msb set. Needed to detect 7-bit text data.
j += (cur_byte >> 7);
if (ctx->window_pos == RAB_POLYNOMIAL_WIN_SIZE) // Loop back around
ctx->window_pos=0;
// Window pos has to rotate from 0 .. RAB_POLYNOMIAL_WIN_SIZE-1
// This requires RAB_POLYNOMIAL_WIN_SIZE to be power of 2
ctx->window_pos = (ctx->window_pos + 1) & (RAB_POLYNOMIAL_WIN_SIZE-1);
length++;
if (length < ctx->rabin_poly_min_block_size) continue;
@ -273,6 +284,11 @@ rabin_dedup(rabin_context_t *ctx, uchar_t *buf, ssize_t *size, ssize_t offset, s
*rabin_pos = last_offset;
return (0);
}
if (j < *size * 0.55)
ctx->data_type = DATA_BINARY;
else
ctx->data_type = DATA_TEXT;
printf("Original size: %lld\n", *size);
// If we found at least a few chunks, perform dedup.
if (blknum > 2) {
uint64_t prev_cksum;
@ -448,6 +464,7 @@ cont:
entries[2] = htonll(pos1 - rabin_index_sz - RABIN_HDR_SIZE);
*size = pos1;
ctx->valid = 1;
printf("Deduped size: %lld, blknum: %u\n", *size, blknum);
/*
* Remaining header entries: size of compressed index and size of

View file

@ -80,9 +80,9 @@
#define RAB_POLYNOMIAL_MIN_BLOCK_SIZE2 2048
#define LZMA_WINDOW_MAX (128L * 1024L * 1024L)
#define RAB_POLYNOMIAL_WIN_SIZE 31
#define RAB_POLYNOMIAL_MIN_WIN_SIZE 17
#define RAB_POLYNOMIAL_MAX_WIN_SIZE 63
#define RAB_POLYNOMIAL_WIN_SIZE 32
#define RAB_POLYNOMIAL_MIN_WIN_SIZE 8
#define RAB_POLYNOMIAL_MAX_WIN_SIZE 64
typedef struct {
ssize_t offset;
@ -134,8 +134,8 @@ typedef struct {
uint32_t rabin_avg_block_mask;
uint32_t rabin_break_patt;
uint64_t real_chunksize;
int dedup;
int valid;
short valid;
short data_type;
void *lzma_data;
int level;
} rabin_context_t;

View file

@ -31,6 +31,9 @@
extern "C" {
#endif
#define DATA_TEXT 1
#define DATA_BINARY 2
#if !defined(sun) && !defined(__sun)
#define ulong_t u_long
#define uchar_t u_char
@ -92,6 +95,8 @@ typedef unsigned long uintptr_t;
# endif
#endif
/*
 * ISP2(x): evaluates to non-zero when x is a non-zero power of two.
 *
 * (x & (~x + 1)) isolates the lowest set bit of x; x is a power of two
 * exactly when that lowest set bit is the whole value.
 *
 * Every use of the parameter is parenthesized so that expression arguments
 * such as ISP2(a + b) are grouped correctly. The argument is still evaluated
 * more than once, so do not pass expressions with side effects.
 */
#define ISP2(x) (((x) != 0) && (((x) & (~(x) + 1)) == (x)))
extern void err_exit(int show_errno, const char *format, ...);
extern const char *get_execname(const char *);
extern int parse_numeric(ssize_t *val, const char *str);