From 39dbc4be431b10861cad735b81e3759a49c7f57b Mon Sep 17 00:00:00 2001 From: Moinak Ghosh Date: Mon, 14 Jan 2013 13:20:07 +0530 Subject: [PATCH] Implement algo-specific minimum distance match for Delta Compression. --- adaptive_compress.c | 1 + bzip2_compress.c | 1 + libbsc_compress.c | 4 ++++ lz4_compress.c | 1 + lzfx_compress.c | 1 + lzma_compress.c | 8 ++++++++ main.c | 6 +++--- ppmd_compress.c | 1 + rabin/rabin_dedup.c | 24 +++++++++++++++++------- rabin/rabin_dedup.h | 6 +++--- utils/utils.h | 3 +++ zlib_compress.c | 1 + 12 files changed, 44 insertions(+), 13 deletions(-) diff --git a/adaptive_compress.c b/adaptive_compress.c index 2678525..4a517e0 100644 --- a/adaptive_compress.c +++ b/adaptive_compress.c @@ -99,6 +99,7 @@ void adapt_props(algo_props_t *data, int level, uint64_t chunksize) { data->delta2_span = 200; + data->deltac_min_distance = EIGHTM; } int diff --git a/bzip2_compress.c b/bzip2_compress.c index fdee11f..f1addcb 100644 --- a/bzip2_compress.c +++ b/bzip2_compress.c @@ -51,6 +51,7 @@ bzip2_stats(int show) void bzip2_props(algo_props_t *data, int level, uint64_t chunksize) { data->delta2_span = 200; + data->deltac_min_distance = FOURM; } int diff --git a/libbsc_compress.c b/libbsc_compress.c index 3923249..3485ae2 100644 --- a/libbsc_compress.c +++ b/libbsc_compress.c @@ -80,6 +80,10 @@ libbsc_props(algo_props_t *data, int level, uint64_t chunksize) { data->c_max_threads = 8; data->d_max_threads = 8; data->delta2_span = 150; + if (chunksize > (EIGHTM * 2)) + data->deltac_min_distance = FOURM; + else + data->deltac_min_distance = EIGHTM; } int diff --git a/lz4_compress.c b/lz4_compress.c index 170cde8..32b9334 100644 --- a/lz4_compress.c +++ b/lz4_compress.c @@ -57,6 +57,7 @@ lz4_props(algo_props_t *data, int level, uint64_t chunksize) { data->decompress_mt_capable = 0; data->buf_extra = lz4_buf_extra(chunksize); data->delta2_span = 100; + data->deltac_min_distance = FOURM; } int diff --git a/lzfx_compress.c b/lzfx_compress.c index 537010f..c26a1f4 100644 --- a/lzfx_compress.c +++ b/lzfx_compress.c @@ -42,6 +42,7 @@ lz_fx_stats(int show) void lz_fx_props(algo_props_t *data, int level, uint64_t chunksize) { data->delta2_span = 50; + data->deltac_min_distance = FOURM; } int diff --git a/lzma_compress.c b/lzma_compress.c index e053d66..ced2e1a 100644 --- a/lzma_compress.c +++ b/lzma_compress.c @@ -53,6 +53,10 @@ lzma_mt_props(algo_props_t *data, int level, uint64_t chunksize) { data->buf_extra = 0; data->c_max_threads = 2; data->delta2_span = 150; + if (level < 12) + data->deltac_min_distance = (EIGHTM * 16); + else + data->deltac_min_distance = (EIGHTM * 32); } void @@ -61,6 +65,10 @@ lzma_props(algo_props_t *data, int level, uint64_t chunksize) { data->decompress_mt_capable = 0; data->buf_extra = 0; data->delta2_span = 150; + if (level < 12) + data->deltac_min_distance = (EIGHTM * 16); + else + data->deltac_min_distance = (EIGHTM * 32); } /* diff --git a/main.c b/main.c index 994e1c9..b90e82d 100644 --- a/main.c +++ b/main.c @@ -951,7 +951,7 @@ start_decompress(const char *filename, const char *to_filename) } if (enable_rabin_scan || enable_fixed_scan) { tdat->rctx = create_dedupe_context(chunksize, compressed_chunksize, rab_blk_size, - algo, enable_delta_encode, enable_fixed_scan, version, DECOMPRESS); + algo, &props, enable_delta_encode, enable_fixed_scan, version, DECOMPRESS); if (tdat->rctx == NULL) { UNCOMP_BAIL; } @@ -1673,7 +1673,7 @@ start_compress(const char *filename, uint64_t chunksize, int level) } if (enable_rabin_scan || enable_fixed_scan) { tdat->rctx = create_dedupe_context(chunksize, compressed_chunksize, rab_blk_size, - algo, enable_delta_encode, enable_fixed_scan, VERSION, COMPRESS); + algo, &props, enable_delta_encode, enable_fixed_scan, VERSION, COMPRESS); if (tdat->rctx == NULL) { COMP_BAIL; } @@ -1789,7 +1789,7 @@ start_compress(const char *filename, uint64_t chunksize, int level) * Read the first chunk into a spare buffer (a simple double-buffering). */ if (enable_rabin_split) { - rctx = create_dedupe_context(chunksize, 0, 0, algo, enable_delta_encode, + rctx = create_dedupe_context(chunksize, 0, 0, algo, &props, enable_delta_encode, enable_fixed_scan, VERSION, COMPRESS); rbytes = Read_Adjusted(uncompfd, cread_buf, chunksize, &rabin_count, rctx); } else { diff --git a/ppmd_compress.c b/ppmd_compress.c index 93ae1e4..88484a0 100644 --- a/ppmd_compress.c +++ b/ppmd_compress.c @@ -64,6 +64,7 @@ ppmd_stats(int show) void ppmd_props(algo_props_t *data, int level, uint64_t chunksize) { data->delta2_span = 100; + data->deltac_min_distance = FOURM; } int diff --git a/rabin/rabin_dedup.c b/rabin/rabin_dedup.c index 14aa3a6..52d363c 100755 --- a/rabin/rabin_dedup.c +++ b/rabin/rabin_dedup.c @@ -116,7 +116,8 @@ dedupe_buf_extra(uint64_t chunksize, int rab_blk_sz, const char *algo, int delta */ dedupe_context_t * create_dedupe_context(uint64_t chunksize, uint64_t real_chunksize, int rab_blk_sz, - const char *algo, int delta_flag, int fixed_flag, int file_version, compress_op_t op) { + const char *algo, const algo_props_t *props, int delta_flag, int fixed_flag, + int file_version, compress_op_t op) { dedupe_context_t *ctx; uint32_t i; @@ -189,6 +190,7 @@ create_dedupe_context(uint64_t chunksize, uint64_t real_chunksize, int rab_blk_s ctx->rabin_avg_block_mask = RAB_BLK_MASK; ctx->rabin_poly_min_block_size = dedupe_min_blksz(rab_blk_sz); ctx->delta_flag = 0; + ctx->deltac_min_distance = props->deltac_min_distance; /* * Scale down similarity percentage based on avg block size unless user specified @@ -582,12 +584,20 @@ process_blocks: while (1) { if (be->similarity_hash == ctx->blocks[i]->similarity_hash && be->length == ctx->blocks[i]->length) { - ctx->blocks[i]->similar = SIMILAR_PARTIAL; - ctx->blocks[i]->other = be; - be->similar = SIMILAR_REF; - matchlen += (be->length>>1); - length = 1; - break; + uint64_t off_diff; + if (be->offset > ctx->blocks[i]->offset) + off_diff = be->offset - ctx->blocks[i]->offset; + else + off_diff = ctx->blocks[i]->offset - be->offset; + + if (off_diff > ctx->deltac_min_distance) { + ctx->blocks[i]->similar = SIMILAR_PARTIAL; + ctx->blocks[i]->other = be; + be->similar = SIMILAR_REF; + matchlen += (be->length>>1); + length = 1; + break; + } } if (be->next) be = be->next; diff --git a/rabin/rabin_dedup.h b/rabin/rabin_dedup.h index e430942..dd5b565 100644 --- a/rabin/rabin_dedup.h +++ b/rabin/rabin_dedup.h @@ -159,12 +159,12 @@ typedef struct { uint64_t real_chunksize; short valid; void *lzma_data; - int level, delta_flag, fixed_flag; + int level, delta_flag, fixed_flag, deltac_min_distance; } dedupe_context_t; extern dedupe_context_t *create_dedupe_context(uint64_t chunksize, uint64_t real_chunksize, - int rab_blk_sz, const char *algo, int delta_flag, int fixed_flag, int file_version, - compress_op_t op); + int rab_blk_sz, const char *algo, const algo_props_t *props, int delta_flag, int fixed_flag, + int file_version, compress_op_t op); extern void destroy_dedupe_context(dedupe_context_t *ctx); extern unsigned int dedupe_compress(dedupe_context_t *ctx, unsigned char *buf, uint64_t *size, uint64_t offset, uint64_t *rabin_pos); diff --git a/utils/utils.h b/utils/utils.h index 10fd6e3..d55bb45 100644 --- a/utils/utils.h +++ b/utils/utils.h @@ -42,6 +42,8 @@ extern "C" { #define DATA_TEXT 1 #define DATA_BINARY 2 +#define EIGHTM (8UL * 1024UL * 1024UL) +#define FOURM (4UL * 1024UL * 1024UL) #if !defined(sun) && !defined(__sun) #define uchar_t u_char @@ -127,6 +129,7 @@ typedef struct { int c_max_threads; int d_max_threads; int delta2_span; + int deltac_min_distance; } algo_props_t; typedef enum { diff --git a/zlib_compress.c b/zlib_compress.c index 76b1d02..c1d24c3 100644 --- a/zlib_compress.c +++ b/zlib_compress.c @@ -92,6 +92,7 @@ zlib_stats(int show) void zlib_props(algo_props_t *data, int level, uint64_t chunksize) { data->delta2_span = 100; + data->deltac_min_distance = EIGHTM; } int