diff --git a/bsdiff/bsdiff.c b/bsdiff/bsdiff.c index 56827e3..af6ff8b 100644 --- a/bsdiff/bsdiff.c +++ b/bsdiff/bsdiff.c @@ -134,11 +134,12 @@ static void split(bsize_t *I,bsize_t *V,bsize_t start,bsize_t len,bsize_t h) static void qsufsort(bsize_t *I,bsize_t *V,u_char *oldbuf,bsize_t oldsize) { - bsize_t buckets[256]; + bsize_t buckets[257]; + bsize_t *bkts; bsize_t i,h,len; #ifdef __USE_SSE_INTRIN__ - if (((bsize_t)buckets & (16 - 1)) == 0) { // 16-byte aligned ? + if (((size_t)buckets & (16 - 1)) == 0) { // 16-byte aligned ? int iters; uchar_t *pos; @@ -159,9 +160,18 @@ static void qsufsort(bsize_t *I,bsize_t *V,u_char *oldbuf,bsize_t oldsize) #ifdef __USE_SSE_INTRIN__ } #endif - for(i=0;i0;i--) buckets[i]=buckets[i-1]; + /* We want to do this: + * for(i=0;i0;i--) buckets[i]=buckets[i-1]; + * buckets[0]=0; + * + * However the code below uses an array larger by 1 element and is able to + * avoid the 3rd loop. + */ + bkts = &buckets[1]; + for(i=0;ioldscore+sz)) break; @@ -326,7 +335,7 @@ bsdiff(u_char *oldbuf, bsize_t oldsize, u_char *newbuf, bsize_t newsize, if((len!=oldscore) || (scan==newsize)) { s=0;Sf=0;lenf=0; for(i=0;(lastscan+iSf*2-lenf) { Sf=s; lenf=i; }; }; @@ -335,7 +344,7 @@ bsdiff(u_char *oldbuf, bsize_t oldsize, u_char *newbuf, bsize_t newsize, if(scan=lastscan+i)&&(pos>=i);i++) { - if(oldbuf[pos-i]==newbuf[scan-i]) s++; + s += (oldbuf[pos-i]==newbuf[scan-i]); if(s*2-i>Sb*2-lenb) { Sb=s; lenb=i; }; }; }; @@ -344,10 +353,9 @@ bsdiff(u_char *oldbuf, bsize_t oldsize, u_char *newbuf, bsize_t newsize, overlap=(lastscan+lenf)-(scan-lenb); s=0;Ss=0;lens=0; for(i=0;iSs) { Ss=s; lens=i+1; }; }; diff --git a/bsdiff/rle_encoder.c b/bsdiff/rle_encoder.c index 34b8505..1665b51 100644 --- a/bsdiff/rle_encoder.c +++ b/bsdiff/rle_encoder.c @@ -24,6 +24,7 @@ #include #include +#include #define ZERO_MASK (32768) #define DATA_MASK (32767) @@ -33,15 +34,25 @@ int zero_rle_encode(const void *ibuf, const unsigned int ilen, void *obuf, unsigned int *olen) { - unsigned int pos1, pos2; + unsigned int pos1, pos2, sz; unsigned short count; const uchar_t *ib = (const uchar_t *)ibuf; uchar_t *ob = (uchar_t *)obuf; + uint64_t val; + sz = sizeof (val) - 1; pos2 = 0; for (pos1=0; pos1 *olen) { + fprintf(stderr, "Output buffer overflow in Zero RLE decode.\n"); + return (-1); + } + memset(ob+pos2, 0, count); + pos2 += count; } else { - for (i=0; i ilen) { + fprintf(stderr, "Input underflow in Zero RLE decode.\n"); + return (-1); + } + if (pos2 + count > *olen) { + fprintf(stderr, "Output buffer overflow in Zero RLE decode.\n"); + return (-1); + } + memcpy(ob+pos2, ib+pos1, count); + pos2 += count; + pos1 += count; } } i = *olen; diff --git a/rabin/rabin_dedup.c b/rabin/rabin_dedup.c index ca99ecf..14aa3a6 100755 --- a/rabin/rabin_dedup.c +++ b/rabin/rabin_dedup.c @@ -72,9 +72,9 @@ #include "rabin_dedup.h" -#define FORTY_PCNT(x) ((x)/5 << 1) -#define FIFTY_PCNT(x) ((x) >> 1) -#define SIXTY_PCNT(x) (((x) >> 1) + ((x) >> 3)) +#define DELTA_EXTRA2_PCT(x) ((x) >> 1) +#define DELTA_EXTRA_PCT(x) (((x) >> 1) + ((x) >> 3)) +#define DELTA_NORMAL_PCT(x) (((x) >> 1) + ((x) >> 2) + ((x) >> 3)) extern int lzma_init(void **data, int *level, int nthreads, int64_t chunksize, int file_version, compress_op_t op); @@ -203,7 +203,7 @@ create_dedupe_context(uint64_t chunksize, uint64_t real_chunksize, int rab_blk_s ctx->delta_flag = 3; } } else if (delta_flag == DELTA_EXTRA) { - ctx->delta_flag = 1; + ctx->delta_flag = 2; } if (!fixed_flag) @@ -292,7 +292,7 @@ dedupe_compress(dedupe_context_t *ctx, uchar_t *buf, uint64_t *size, uint64_t of uchar_t *buf1 = (uchar_t *)buf; uint32_t length; uint64_t cur_roll_checksum, cur_pos_checksum; - uint32_t *fplist; + uint32_t *ctx_heap; rabin_blockentry_t **htab; heap_t heap; DEBUG_STAT_EN(uint32_t max_count); @@ -341,9 +341,8 @@ dedupe_compress(dedupe_context_t *ctx, uchar_t *buf, uint64_t *size, uint64_t of * Initialize arrays for sketch computation. We re-use memory allocated * for the compressed chunk temporarily. */ - ary_sz = 4 * ctx->rabin_poly_max_block_size; - fplist = (uint32_t *)(ctx->cbuf + ctx->real_chunksize - ary_sz); - if (ctx->delta_flag) memset(fplist, 0, ary_sz); + ary_sz = ctx->rabin_poly_max_block_size; + ctx_heap = (uint32_t *)(ctx->cbuf + ctx->real_chunksize - ary_sz); } memset(ctx->current_window_data, 0, RAB_POLYNOMIAL_WIN_SIZE); @@ -397,23 +396,6 @@ dedupe_compress(dedupe_context_t *ctx, uchar_t *buf, uint64_t *size, uint64_t of cur_roll_checksum -= out[pushed_out]; cur_pos_checksum = cur_roll_checksum ^ ir[pushed_out]; - /* - * Retain a list of all fingerprints in the block. We then compute - * the K min values sketch from that list and generate a super sketch - * by hashing over the K min values sketch. We only store the least - * significant 32 bits of the fingerprint. This uses less memory, - * requires smaller memset() calls and generates a sufficiently large - * number of similarity matches without false positives - determined - * by experimentation. - * - * This is called minhashing and is used widely, for example in various - * search engines to detect similar documents. - */ - if (ctx->delta_flag) { - fplist[j] = cur_pos_checksum & 0xFFFFFFFFUL; - j++; - } - /* * Window pos has to rotate from 0 .. RAB_POLYNOMIAL_WIN_SIZE-1 * We avoid a branch here by masking. This requires RAB_POLYNOMIAL_WIN_SIZE @@ -432,25 +414,32 @@ dedupe_compress(dedupe_context_t *ctx, uchar_t *buf, uint64_t *size, uint64_t of ctx->blocks[blknum]->offset = last_offset; ctx->blocks[blknum]->index = blknum; // Need to store for sorting ctx->blocks[blknum]->length = length; - DEBUG_STAT_EN(if (length >= ctx->rabin_poly_max_block_size) max_count++); + /* * Reset the heap structure and find the K min values if Delta Compression * is enabled. We use a min heap mechanism taken from the heap based priority * queue implementation in Python. - * Here K = 60% or 40%. We are aiming to detect either 60% (default) or 40% - * similarity on average. + * Here K = similarity extent = 87% or 62% or 50%. + * + * Once block contents are arranged in a min heap we compute the K min values + * sketch by hashing over the heap till K%. We interpret the raw bytes as a + * sequence of 64-bit integers. + * This is called minhashing and is used widely, for example in various + * search engines to detect similar documents. */ if (ctx->delta_flag) { - pc[1] = SIXTY_PCNT(j); - pc[2] = FIFTY_PCNT(j); - pc[3] = FORTY_PCNT(j); + memcpy(ctx_heap, buf1+last_offset, length); + length /= 8; + pc[1] = DELTA_NORMAL_PCT(length); + pc[2] = DELTA_EXTRA_PCT(length); + pc[3] = DELTA_EXTRA2_PCT(length); reset_heap(&heap, pc[ctx->delta_flag]); - ksmallest((int32_t *)fplist, j, &heap); + ksmallest((int64_t *)ctx_heap, length, &heap); + ctx->blocks[blknum]->similarity_hash = - XXH32((const uchar_t *)fplist, pc[ctx->delta_flag]*4, 0); - memset(fplist, 0, ary_sz); + XXH32((const uchar_t *)ctx_heap, pc[ctx->delta_flag]*8, 0); } blknum++; last_offset = i+1; @@ -466,26 +455,30 @@ dedupe_compress(dedupe_context_t *ctx, uchar_t *buf, uint64_t *size, uint64_t of sizeof (rabin_blockentry_t)); ctx->blocks[blknum]->offset = last_offset; ctx->blocks[blknum]->index = blknum; - ctx->blocks[blknum]->length = *size - last_offset; + length = *size - last_offset; + ctx->blocks[blknum]->length = length; if (ctx->delta_flag) { uint64_t cur_sketch; uint64_t pc[3]; - if (j > 1) { - pc[1] = SIXTY_PCNT(j); - pc[2] = FIFTY_PCNT(j); - pc[3] = FORTY_PCNT(j); + if (length > ctx->rabin_poly_min_block_size) { + memcpy(ctx_heap, buf1+last_offset, length); + length /= 8; + pc[1] = DELTA_NORMAL_PCT(length); + pc[2] = DELTA_EXTRA_PCT(length); + pc[3] = DELTA_EXTRA2_PCT(length); + reset_heap(&heap, pc[ctx->delta_flag]); - ksmallest((int32_t *)fplist, j, &heap); + ksmallest((int64_t *)ctx_heap, length, &heap); cur_sketch = - XXH32((const uchar_t *)fplist, pc[ctx->delta_flag]*4, 0); + XXH32((const uchar_t *)ctx_heap, pc[ctx->delta_flag]*8, 0); + ctx->blocks[blknum]->similarity_hash = cur_sketch; } else { - if (j == 0) j = 1; cur_sketch = - XXH32((const uchar_t *)fplist, (j*4)/2, 0); + XXH32((const uchar_t *)(buf1+last_offset), length, 0); + ctx->blocks[blknum]->similarity_hash = cur_sketch; } - ctx->blocks[blknum]->similarity_hash = cur_sketch; } blknum++; last_offset = *size; diff --git a/utils/heapq.c b/utils/heapq.c index 3676a6b..5ce8958 100644 --- a/utils/heapq.c +++ b/utils/heapq.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #ifndef NDEBUG @@ -71,7 +72,7 @@ _siftupmax(heap_t *h, __TYPE spos, __TYPE epos) heap = h->ary; #ifdef ERROR_CHK if (spos >= endpos) { - fprintf(stderr, "_siftupmax: index out of range: %u, len: %u\n", spos, endpos); + fprintf(stderr, "_siftupmax: index out of range: %" PRId64 ", len: %" PRId64 "\n", spos, endpos); return -1; } #endif @@ -118,7 +119,7 @@ _siftupmax_s(heap_t *h, __TYPE spos) heap = h->ary; #ifdef ERROR_CHK if (spos >= endpos) { - fprintf(stderr, "_siftupmax: index out of range: %u, len: %u\n", spos, endpos); + fprintf(stderr, "_siftupmax: index out of range: %" PRId64 ", len: %" PRId64 "\n", spos, endpos); return -1; } #endif diff --git a/utils/heapq.h b/utils/heapq.h index 155eeca..5b3e2f5 100644 --- a/utils/heapq.h +++ b/utils/heapq.h @@ -1,6 +1,6 @@ #ifndef __HEAPQ_H_ -#define __TYPE int32_t +#define __TYPE int64_t typedef struct { __TYPE *ary; diff --git a/utils/utils.h b/utils/utils.h index 47017a9..10fd6e3 100644 --- a/utils/utils.h +++ b/utils/utils.h @@ -57,7 +57,7 @@ extern "C" { # endif #endif typedef unsigned long uintptr_t; -typedef int64_t bsize_t; +typedef int32_t bsize_t; #undef WORDS_BIGENDIAN #if BYTE_ORDER == BIG_ENDIAN