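Implement an algorithm-specific minimum match distance for Delta Compression.
Similar blocks are now paired only when their offsets differ by more than a per-algorithm threshold (deltac_min_distance).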

This commit is contained in:
Moinak Ghosh 2013-01-14 13:20:07 +05:30
parent d49a088eea
commit 39dbc4be43
12 changed files with 44 additions and 13 deletions

View file

@ -99,6 +99,7 @@ void
adapt_props(algo_props_t *data, int level, uint64_t chunksize) adapt_props(algo_props_t *data, int level, uint64_t chunksize)
{ {
data->delta2_span = 200; data->delta2_span = 200;
data->deltac_min_distance = EIGHTM;
} }
int int

View file

@ -51,6 +51,7 @@ bzip2_stats(int show)
void void
bzip2_props(algo_props_t *data, int level, uint64_t chunksize) { bzip2_props(algo_props_t *data, int level, uint64_t chunksize) {
data->delta2_span = 200; data->delta2_span = 200;
data->deltac_min_distance = FOURM;
} }
int int

View file

@ -80,6 +80,10 @@ libbsc_props(algo_props_t *data, int level, uint64_t chunksize) {
data->c_max_threads = 8; data->c_max_threads = 8;
data->d_max_threads = 8; data->d_max_threads = 8;
data->delta2_span = 150; data->delta2_span = 150;
if (chunksize > (EIGHTM * 2))
data->deltac_min_distance = FOURM;
else
data->deltac_min_distance = EIGHTM;
} }
int int

View file

@ -57,6 +57,7 @@ lz4_props(algo_props_t *data, int level, uint64_t chunksize) {
data->decompress_mt_capable = 0; data->decompress_mt_capable = 0;
data->buf_extra = lz4_buf_extra(chunksize); data->buf_extra = lz4_buf_extra(chunksize);
data->delta2_span = 100; data->delta2_span = 100;
data->deltac_min_distance = FOURM;
} }
int int

View file

@ -42,6 +42,7 @@ lz_fx_stats(int show)
void void
lz_fx_props(algo_props_t *data, int level, uint64_t chunksize) { lz_fx_props(algo_props_t *data, int level, uint64_t chunksize) {
data->delta2_span = 50; data->delta2_span = 50;
data->deltac_min_distance = FOURM;
} }
int int

View file

@ -53,6 +53,10 @@ lzma_mt_props(algo_props_t *data, int level, uint64_t chunksize) {
data->buf_extra = 0; data->buf_extra = 0;
data->c_max_threads = 2; data->c_max_threads = 2;
data->delta2_span = 150; data->delta2_span = 150;
if (level < 12)
data->deltac_min_distance = (EIGHTM * 16);
else
data->deltac_min_distance = (EIGHTM * 32);
} }
void void
@ -61,6 +65,10 @@ lzma_props(algo_props_t *data, int level, uint64_t chunksize) {
data->decompress_mt_capable = 0; data->decompress_mt_capable = 0;
data->buf_extra = 0; data->buf_extra = 0;
data->delta2_span = 150; data->delta2_span = 150;
if (level < 12)
data->deltac_min_distance = (EIGHTM * 16);
else
data->deltac_min_distance = (EIGHTM * 32);
} }
/* /*

6
main.c
View file

@ -951,7 +951,7 @@ start_decompress(const char *filename, const char *to_filename)
} }
if (enable_rabin_scan || enable_fixed_scan) { if (enable_rabin_scan || enable_fixed_scan) {
tdat->rctx = create_dedupe_context(chunksize, compressed_chunksize, rab_blk_size, tdat->rctx = create_dedupe_context(chunksize, compressed_chunksize, rab_blk_size,
algo, enable_delta_encode, enable_fixed_scan, version, DECOMPRESS); algo, &props, enable_delta_encode, enable_fixed_scan, version, DECOMPRESS);
if (tdat->rctx == NULL) { if (tdat->rctx == NULL) {
UNCOMP_BAIL; UNCOMP_BAIL;
} }
@ -1673,7 +1673,7 @@ start_compress(const char *filename, uint64_t chunksize, int level)
} }
if (enable_rabin_scan || enable_fixed_scan) { if (enable_rabin_scan || enable_fixed_scan) {
tdat->rctx = create_dedupe_context(chunksize, compressed_chunksize, rab_blk_size, tdat->rctx = create_dedupe_context(chunksize, compressed_chunksize, rab_blk_size,
algo, enable_delta_encode, enable_fixed_scan, VERSION, COMPRESS); algo, &props, enable_delta_encode, enable_fixed_scan, VERSION, COMPRESS);
if (tdat->rctx == NULL) { if (tdat->rctx == NULL) {
COMP_BAIL; COMP_BAIL;
} }
@ -1789,7 +1789,7 @@ start_compress(const char *filename, uint64_t chunksize, int level)
* Read the first chunk into a spare buffer (a simple double-buffering). * Read the first chunk into a spare buffer (a simple double-buffering).
*/ */
if (enable_rabin_split) { if (enable_rabin_split) {
rctx = create_dedupe_context(chunksize, 0, 0, algo, enable_delta_encode, rctx = create_dedupe_context(chunksize, 0, 0, algo, &props, enable_delta_encode,
enable_fixed_scan, VERSION, COMPRESS); enable_fixed_scan, VERSION, COMPRESS);
rbytes = Read_Adjusted(uncompfd, cread_buf, chunksize, &rabin_count, rctx); rbytes = Read_Adjusted(uncompfd, cread_buf, chunksize, &rabin_count, rctx);
} else { } else {

View file

@ -64,6 +64,7 @@ ppmd_stats(int show)
void void
ppmd_props(algo_props_t *data, int level, uint64_t chunksize) { ppmd_props(algo_props_t *data, int level, uint64_t chunksize) {
data->delta2_span = 100; data->delta2_span = 100;
data->deltac_min_distance = FOURM;
} }
int int

View file

@ -116,7 +116,8 @@ dedupe_buf_extra(uint64_t chunksize, int rab_blk_sz, const char *algo, int delta
*/ */
dedupe_context_t * dedupe_context_t *
create_dedupe_context(uint64_t chunksize, uint64_t real_chunksize, int rab_blk_sz, create_dedupe_context(uint64_t chunksize, uint64_t real_chunksize, int rab_blk_sz,
const char *algo, int delta_flag, int fixed_flag, int file_version, compress_op_t op) { const char *algo, const algo_props_t *props, int delta_flag, int fixed_flag,
int file_version, compress_op_t op) {
dedupe_context_t *ctx; dedupe_context_t *ctx;
uint32_t i; uint32_t i;
@ -189,6 +190,7 @@ create_dedupe_context(uint64_t chunksize, uint64_t real_chunksize, int rab_blk_s
ctx->rabin_avg_block_mask = RAB_BLK_MASK; ctx->rabin_avg_block_mask = RAB_BLK_MASK;
ctx->rabin_poly_min_block_size = dedupe_min_blksz(rab_blk_sz); ctx->rabin_poly_min_block_size = dedupe_min_blksz(rab_blk_sz);
ctx->delta_flag = 0; ctx->delta_flag = 0;
ctx->deltac_min_distance = props->deltac_min_distance;
/* /*
* Scale down similarity percentage based on avg block size unless user specified * Scale down similarity percentage based on avg block size unless user specified
@ -582,12 +584,20 @@ process_blocks:
while (1) { while (1) {
if (be->similarity_hash == ctx->blocks[i]->similarity_hash && if (be->similarity_hash == ctx->blocks[i]->similarity_hash &&
be->length == ctx->blocks[i]->length) { be->length == ctx->blocks[i]->length) {
ctx->blocks[i]->similar = SIMILAR_PARTIAL; uint64_t off_diff;
ctx->blocks[i]->other = be; if (be->offset > ctx->blocks[i]->offset)
be->similar = SIMILAR_REF; off_diff = be->offset - ctx->blocks[i]->offset;
matchlen += (be->length>>1); else
length = 1; off_diff = ctx->blocks[i]->offset - be->offset;
break;
if (off_diff > ctx->deltac_min_distance) {
ctx->blocks[i]->similar = SIMILAR_PARTIAL;
ctx->blocks[i]->other = be;
be->similar = SIMILAR_REF;
matchlen += (be->length>>1);
length = 1;
break;
}
} }
if (be->next) if (be->next)
be = be->next; be = be->next;

View file

@ -159,12 +159,12 @@ typedef struct {
uint64_t real_chunksize; uint64_t real_chunksize;
short valid; short valid;
void *lzma_data; void *lzma_data;
int level, delta_flag, fixed_flag; int level, delta_flag, fixed_flag, deltac_min_distance;
} dedupe_context_t; } dedupe_context_t;
extern dedupe_context_t *create_dedupe_context(uint64_t chunksize, uint64_t real_chunksize, extern dedupe_context_t *create_dedupe_context(uint64_t chunksize, uint64_t real_chunksize,
int rab_blk_sz, const char *algo, int delta_flag, int fixed_flag, int file_version, int rab_blk_sz, const char *algo, const algo_props_t *props, int delta_flag, int fixed_flag,
compress_op_t op); int file_version, compress_op_t op);
extern void destroy_dedupe_context(dedupe_context_t *ctx); extern void destroy_dedupe_context(dedupe_context_t *ctx);
extern unsigned int dedupe_compress(dedupe_context_t *ctx, unsigned char *buf, extern unsigned int dedupe_compress(dedupe_context_t *ctx, unsigned char *buf,
uint64_t *size, uint64_t offset, uint64_t *rabin_pos); uint64_t *size, uint64_t offset, uint64_t *rabin_pos);

View file

@ -42,6 +42,8 @@ extern "C" {
#define DATA_TEXT 1 #define DATA_TEXT 1
#define DATA_BINARY 2 #define DATA_BINARY 2
#define EIGHTM (8UL * 1024UL * 1024UL)
#define FOURM (4UL * 1024UL * 1024UL)
#if !defined(sun) && !defined(__sun) #if !defined(sun) && !defined(__sun)
#define uchar_t u_char #define uchar_t u_char
@ -127,6 +129,7 @@ typedef struct {
int c_max_threads; int c_max_threads;
int d_max_threads; int d_max_threads;
int delta2_span; int delta2_span;
int deltac_min_distance;
} algo_props_t; } algo_props_t;
typedef enum { typedef enum {

View file

@ -92,6 +92,7 @@ zlib_stats(int show)
void void
zlib_props(algo_props_t *data, int level, uint64_t chunksize) { zlib_props(algo_props_t *data, int level, uint64_t chunksize) {
data->delta2_span = 100; data->delta2_span = 100;
data->deltac_min_distance = EIGHTM;
} }
int int