From d49a088eead9d090bcbe91ede93f02838ae9b2e1 Mon Sep 17 00:00:00 2001 From: Moinak Ghosh Date: Sun, 13 Jan 2013 22:04:59 +0530 Subject: [PATCH] Fixes and performance improvements for Dedupe Delta Compression Avoid using fingerprints in minhash computation and fix write amplification Modify min-heap to use 64bit values Improve bsdiff performance Fix pointer comparison in bsdiff Use 32bit offsets in bsdiff to reduce memory usage Improve Zero RLE Encoder performance Add more buffer overflow checks in Zero RLE Decoder --- bsdiff/bsdiff.c | 36 ++++++++++++-------- bsdiff/rle_encoder.c | 34 ++++++++++++++++--- rabin/rabin_dedup.c | 81 ++++++++++++++++++++------------------------ utils/heapq.c | 5 +-- utils/heapq.h | 2 +- utils/utils.h | 2 +- 6 files changed, 93 insertions(+), 67 deletions(-) diff --git a/bsdiff/bsdiff.c b/bsdiff/bsdiff.c index 56827e3..af6ff8b 100644 --- a/bsdiff/bsdiff.c +++ b/bsdiff/bsdiff.c @@ -134,11 +134,12 @@ static void split(bsize_t *I,bsize_t *V,bsize_t start,bsize_t len,bsize_t h) static void qsufsort(bsize_t *I,bsize_t *V,u_char *oldbuf,bsize_t oldsize) { - bsize_t buckets[256]; + bsize_t buckets[257]; + bsize_t *bkts; bsize_t i,h,len; #ifdef __USE_SSE_INTRIN__ - if (((bsize_t)buckets & (16 - 1)) == 0) { // 16-byte aligned ? + if (((size_t)buckets & (16 - 1)) == 0) { // 16-byte aligned ? int iters; uchar_t *pos; @@ -159,9 +160,18 @@ static void qsufsort(bsize_t *I,bsize_t *V,u_char *oldbuf,bsize_t oldsize) #ifdef __USE_SSE_INTRIN__ } #endif - for(i=0;i0;i--) buckets[i]=buckets[i-1]; + /* We want to do this: + * for(i=0;i0;i--) buckets[i]=buckets[i-1]; + * buckets[0]=0; + * + * However the code below uses an array larger by 1 element and is able to + * avoid the 3rd loop. + */ + bkts = &buckets[1]; + for(i=0;ioldscore+sz)) break; @@ -326,7 +335,7 @@ bsdiff(u_char *oldbuf, bsize_t oldsize, u_char *newbuf, bsize_t newsize, if((len!=oldscore) || (scan==newsize)) { s=0;Sf=0;lenf=0; for(i=0;(lastscan+iSf*2-lenf) { Sf=s; lenf=i; }; }; @@ -335,7 +344,7 @@ bsdiff(u_char *oldbuf, bsize_t oldsize, u_char *newbuf, bsize_t newsize, if(scan=lastscan+i)&&(pos>=i);i++) { - if(oldbuf[pos-i]==newbuf[scan-i]) s++; + s += (oldbuf[pos-i]==newbuf[scan-i]); if(s*2-i>Sb*2-lenb) { Sb=s; lenb=i; }; }; }; @@ -344,10 +353,9 @@ bsdiff(u_char *oldbuf, bsize_t oldsize, u_char *newbuf, bsize_t newsize, overlap=(lastscan+lenf)-(scan-lenb); s=0;Ss=0;lens=0; for(i=0;iSs) { Ss=s; lens=i+1; }; }; diff --git a/bsdiff/rle_encoder.c b/bsdiff/rle_encoder.c index 34b8505..1665b51 100644 --- a/bsdiff/rle_encoder.c +++ b/bsdiff/rle_encoder.c @@ -24,6 +24,7 @@ #include #include +#include #define ZERO_MASK (32768) #define DATA_MASK (32767) @@ -33,15 +34,25 @@ int zero_rle_encode(const void *ibuf, const unsigned int ilen, void *obuf, unsigned int *olen) { - unsigned int pos1, pos2; + unsigned int pos1, pos2, sz; unsigned short count; const uchar_t *ib = (const uchar_t *)ibuf; uchar_t *ob = (uchar_t *)obuf; + uint64_t val; + sz = sizeof (val) - 1; pos2 = 0; for (pos1=0; pos1 *olen) { + fprintf(stderr, "Output buffer overflow in Zero RLE decode.\n"); + return (-1); + } + memset(ob+pos2, 0, count); + pos2 += count; } else { - for (i=0; i ilen) { + fprintf(stderr, "Input underflow in Zero RLE decode.\n"); + return (-1); + } + if (pos2 + count > *olen) { + fprintf(stderr, "Output buffer overflow in Zero RLE decode.\n"); + return (-1); + } + memcpy(ob+pos2, ib+pos1, count); + pos2 += count; + pos1 += count; } } i = *olen; diff --git a/rabin/rabin_dedup.c b/rabin/rabin_dedup.c index ca99ecf..14aa3a6 100755 --- a/rabin/rabin_dedup.c +++ b/rabin/rabin_dedup.c @@ -72,9 +72,9 @@ #include "rabin_dedup.h" -#define FORTY_PCNT(x) ((x)/5 << 1) -#define FIFTY_PCNT(x) ((x) >> 1) -#define SIXTY_PCNT(x) (((x) >> 1) + ((x) >> 3)) +#define DELTA_EXTRA2_PCT(x) ((x) >> 1) +#define DELTA_EXTRA_PCT(x) (((x) >> 1) + ((x) >> 3)) +#define DELTA_NORMAL_PCT(x) (((x) >> 1) + ((x) >> 2) + ((x) >> 3)) extern int lzma_init(void **data, int *level, int nthreads, int64_t chunksize, int file_version, compress_op_t op); @@ -203,7 +203,7 @@ create_dedupe_context(uint64_t chunksize, uint64_t real_chunksize, int rab_blk_s ctx->delta_flag = 3; } } else if (delta_flag == DELTA_EXTRA) { - ctx->delta_flag = 1; + ctx->delta_flag = 2; } if (!fixed_flag) @@ -292,7 +292,7 @@ dedupe_compress(dedupe_context_t *ctx, uchar_t *buf, uint64_t *size, uint64_t of uchar_t *buf1 = (uchar_t *)buf; uint32_t length; uint64_t cur_roll_checksum, cur_pos_checksum; - uint32_t *fplist; + uint32_t *ctx_heap; rabin_blockentry_t **htab; heap_t heap; DEBUG_STAT_EN(uint32_t max_count); @@ -341,9 +341,8 @@ dedupe_compress(dedupe_context_t *ctx, uchar_t *buf, uint64_t *size, uint64_t of * Initialize arrays for sketch computation. We re-use memory allocated * for the compressed chunk temporarily. */ - ary_sz = 4 * ctx->rabin_poly_max_block_size; - fplist = (uint32_t *)(ctx->cbuf + ctx->real_chunksize - ary_sz); - if (ctx->delta_flag) memset(fplist, 0, ary_sz); + ary_sz = ctx->rabin_poly_max_block_size; + ctx_heap = (uint32_t *)(ctx->cbuf + ctx->real_chunksize - ary_sz); } memset(ctx->current_window_data, 0, RAB_POLYNOMIAL_WIN_SIZE); @@ -397,23 +396,6 @@ dedupe_compress(dedupe_context_t *ctx, uchar_t *buf, uint64_t *size, uint64_t of cur_roll_checksum -= out[pushed_out]; cur_pos_checksum = cur_roll_checksum ^ ir[pushed_out]; - /* - * Retain a list of all fingerprints in the block. We then compute - * the K min values sketch from that list and generate a super sketch - * by hashing over the K min values sketch. We only store the least - * significant 32 bits of the fingerprint. This uses less memory, - * requires smaller memset() calls and generates a sufficiently large - * number of similarity matches without false positives - determined - * by experimentation. - * - * This is called minhashing and is used widely, for example in various - * search engines to detect similar documents. - */ - if (ctx->delta_flag) { - fplist[j] = cur_pos_checksum & 0xFFFFFFFFUL; - j++; - } - /* * Window pos has to rotate from 0 .. RAB_POLYNOMIAL_WIN_SIZE-1 * We avoid a branch here by masking. This requires RAB_POLYNOMIAL_WIN_SIZE @@ -432,25 +414,32 @@ dedupe_compress(dedupe_context_t *ctx, uchar_t *buf, uint64_t *size, uint64_t of ctx->blocks[blknum]->offset = last_offset; ctx->blocks[blknum]->index = blknum; // Need to store for sorting ctx->blocks[blknum]->length = length; - DEBUG_STAT_EN(if (length >= ctx->rabin_poly_max_block_size) max_count++); + /* * Reset the heap structure and find the K min values if Delta Compression * is enabled. We use a min heap mechanism taken from the heap based priority * queue implementation in Python. - * Here K = 60% or 40%. We are aiming to detect either 60% (default) or 40% - * similarity on average. + * Here K = similarity extent = 87% or 62% or 50%. + * + * Once block contents are arranged in a min heap we compute the K min values + * sketch by hashing over the heap till K%. We interpret the raw bytes as a + * sequence of 64-bit integers. + * This is called minhashing and is used widely, for example in various + * search engines to detect similar documents. */ if (ctx->delta_flag) { - pc[1] = SIXTY_PCNT(j); - pc[2] = FIFTY_PCNT(j); - pc[3] = FORTY_PCNT(j); + memcpy(ctx_heap, buf1+last_offset, length); + length /= 8; + pc[1] = DELTA_NORMAL_PCT(length); + pc[2] = DELTA_EXTRA_PCT(length); + pc[3] = DELTA_EXTRA2_PCT(length); reset_heap(&heap, pc[ctx->delta_flag]); - ksmallest((int32_t *)fplist, j, &heap); + ksmallest((int64_t *)ctx_heap, length, &heap); + ctx->blocks[blknum]->similarity_hash = - XXH32((const uchar_t *)fplist, pc[ctx->delta_flag]*4, 0); - memset(fplist, 0, ary_sz); + XXH32((const uchar_t *)ctx_heap, pc[ctx->delta_flag]*8, 0); } blknum++; last_offset = i+1; @@ -466,26 +455,30 @@ dedupe_compress(dedupe_context_t *ctx, uchar_t *buf, uint64_t *size, uint64_t of sizeof (rabin_blockentry_t)); ctx->blocks[blknum]->offset = last_offset; ctx->blocks[blknum]->index = blknum; - ctx->blocks[blknum]->length = *size - last_offset; + length = *size - last_offset; + ctx->blocks[blknum]->length = length; if (ctx->delta_flag) { uint64_t cur_sketch; uint64_t pc[3]; - if (j > 1) { - pc[1] = SIXTY_PCNT(j); - pc[2] = FIFTY_PCNT(j); - pc[3] = FORTY_PCNT(j); + if (length > ctx->rabin_poly_min_block_size) { + memcpy(ctx_heap, buf1+last_offset, length); + length /= 8; + pc[1] = DELTA_NORMAL_PCT(length); + pc[2] = DELTA_EXTRA_PCT(length); + pc[3] = DELTA_EXTRA2_PCT(length); + reset_heap(&heap, pc[ctx->delta_flag]); - ksmallest((int32_t *)fplist, j, &heap); + ksmallest((int64_t *)ctx_heap, length, &heap); cur_sketch = - XXH32((const uchar_t *)fplist, pc[ctx->delta_flag]*4, 0); + XXH32((const uchar_t *)ctx_heap, pc[ctx->delta_flag]*8, 0); + ctx->blocks[blknum]->similarity_hash = cur_sketch; } else { - if (j == 0) j = 1; cur_sketch = - XXH32((const uchar_t *)fplist, (j*4)/2, 0); + XXH32((const uchar_t *)(buf1+last_offset), length, 0); + ctx->blocks[blknum]->similarity_hash = cur_sketch; } - ctx->blocks[blknum]->similarity_hash = cur_sketch; } blknum++; last_offset = *size; diff --git a/utils/heapq.c b/utils/heapq.c index 3676a6b..5ce8958 100644 --- a/utils/heapq.c +++ b/utils/heapq.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #ifndef NDEBUG @@ -71,7 +72,7 @@ _siftupmax(heap_t *h, __TYPE spos, __TYPE epos) heap = h->ary; #ifdef ERROR_CHK if (spos >= endpos) { - fprintf(stderr, "_siftupmax: index out of range: %u, len: %u\n", spos, endpos); + fprintf(stderr, "_siftupmax: index out of range: %" PRId64 ", len: %" PRId64 "\n", spos, endpos); return -1; } #endif @@ -118,7 +119,7 @@ _siftupmax_s(heap_t *h, __TYPE spos) heap = h->ary; #ifdef ERROR_CHK if (spos >= endpos) { - fprintf(stderr, "_siftupmax: index out of range: %u, len: %u\n", spos, endpos); + fprintf(stderr, "_siftupmax: index out of range: %" PRId64 ", len: %" PRId64 "\n", spos, endpos); return -1; } #endif diff --git a/utils/heapq.h b/utils/heapq.h index 155eeca..5b3e2f5 100644 --- a/utils/heapq.h +++ b/utils/heapq.h @@ -1,6 +1,6 @@ #ifndef __HEAPQ_H_ -#define __TYPE int32_t +#define __TYPE int64_t typedef struct { __TYPE *ary; diff --git a/utils/utils.h b/utils/utils.h index 47017a9..10fd6e3 100644 --- a/utils/utils.h +++ b/utils/utils.h @@ -57,7 +57,7 @@ extern "C" { # endif #endif typedef unsigned long uintptr_t; -typedef int64_t bsize_t; +typedef int32_t bsize_t; #undef WORDS_BIGENDIAN #if BYTE_ORDER == BIG_ENDIAN