Fixes and performance improvements for Dedupe Delta Compression
Avoid using fingerprints in minhash computation and fix write amplification. Modify min-heap to use 64-bit values. Improve bsdiff performance. Fix pointer comparison in bsdiff. Use 32-bit offsets in bsdiff to reduce memory usage. Improve Zero RLE Encoder performance. Add more buffer overflow checks in Zero RLE Decoder.
This commit is contained in:
parent
87aa12206e
commit
d49a088eea
6 changed files with 93 additions and 67 deletions
|
@ -134,11 +134,12 @@ static void split(bsize_t *I,bsize_t *V,bsize_t start,bsize_t len,bsize_t h)
|
|||
|
||||
static void qsufsort(bsize_t *I,bsize_t *V,u_char *oldbuf,bsize_t oldsize)
|
||||
{
|
||||
bsize_t buckets[256];
|
||||
bsize_t buckets[257];
|
||||
bsize_t *bkts;
|
||||
bsize_t i,h,len;
|
||||
|
||||
#ifdef __USE_SSE_INTRIN__
|
||||
if (((bsize_t)buckets & (16 - 1)) == 0) { // 16-byte aligned ?
|
||||
if (((size_t)buckets & (16 - 1)) == 0) { // 16-byte aligned ?
|
||||
int iters;
|
||||
uchar_t *pos;
|
||||
|
||||
|
@ -159,9 +160,18 @@ static void qsufsort(bsize_t *I,bsize_t *V,u_char *oldbuf,bsize_t oldsize)
|
|||
#ifdef __USE_SSE_INTRIN__
|
||||
}
|
||||
#endif
|
||||
for(i=0;i<oldsize;i++) buckets[oldbuf[i]]++;
|
||||
for(i=1;i<256;i++) buckets[i]+=buckets[i-1];
|
||||
for(i=255;i>0;i--) buckets[i]=buckets[i-1];
|
||||
/* We want to do this:
|
||||
* for(i=0;i<oldsize;i++) buckets[oldbuf[i]]++;
|
||||
* for(i=1;i<256;i++) buckets[i]+=buckets[i-1];
|
||||
* for(i=255;i>0;i--) buckets[i]=buckets[i-1];
|
||||
* buckets[0]=0;
|
||||
*
|
||||
* However the code below uses an array larger by 1 element and is able to
|
||||
* avoid the 3rd loop.
|
||||
*/
|
||||
bkts = &buckets[1];
|
||||
for(i=0;i<oldsize;i++) bkts[oldbuf[i]]++;
|
||||
for(i=1;i<256;i++) bkts[i]+=bkts[i-1];
|
||||
buckets[0]=0;
|
||||
|
||||
for(i=0;i<oldsize;i++) I[++buckets[oldbuf[i]]]=i;
|
||||
|
@ -311,9 +321,8 @@ bsdiff(u_char *oldbuf, bsize_t oldsize, u_char *newbuf, bsize_t newsize,
|
|||
0,oldsize,&pos);
|
||||
|
||||
for(;scsc<scan+len;scsc++)
|
||||
if((scsc+lastoffset<oldsize) &&
|
||||
(oldbuf[scsc+lastoffset] == newbuf[scsc]))
|
||||
oldscore++;
|
||||
oldscore += ((scsc+lastoffset<oldsize) &&
|
||||
(oldbuf[scsc+lastoffset] == newbuf[scsc]));
|
||||
|
||||
if(((len==oldscore) && (len!=0)) ||
|
||||
(len>oldscore+sz)) break;
|
||||
|
@ -326,7 +335,7 @@ bsdiff(u_char *oldbuf, bsize_t oldsize, u_char *newbuf, bsize_t newsize,
|
|||
if((len!=oldscore) || (scan==newsize)) {
|
||||
s=0;Sf=0;lenf=0;
|
||||
for(i=0;(lastscan+i<scan)&&(lastpos+i<oldsize);) {
|
||||
if(oldbuf[lastpos+i]==newbuf[lastscan+i]) s++;
|
||||
s += (oldbuf[lastpos+i]==newbuf[lastscan+i]);
|
||||
i++;
|
||||
if(s*2-i>Sf*2-lenf) { Sf=s; lenf=i; };
|
||||
};
|
||||
|
@ -335,7 +344,7 @@ bsdiff(u_char *oldbuf, bsize_t oldsize, u_char *newbuf, bsize_t newsize,
|
|||
if(scan<newsize) {
|
||||
s=0;Sb=0;
|
||||
for(i=1;(scan>=lastscan+i)&&(pos>=i);i++) {
|
||||
if(oldbuf[pos-i]==newbuf[scan-i]) s++;
|
||||
s += (oldbuf[pos-i]==newbuf[scan-i]);
|
||||
if(s*2-i>Sb*2-lenb) { Sb=s; lenb=i; };
|
||||
};
|
||||
};
|
||||
|
@ -344,10 +353,9 @@ bsdiff(u_char *oldbuf, bsize_t oldsize, u_char *newbuf, bsize_t newsize,
|
|||
overlap=(lastscan+lenf)-(scan-lenb);
|
||||
s=0;Ss=0;lens=0;
|
||||
for(i=0;i<overlap;i++) {
|
||||
if(newbuf[lastscan+lenf-overlap+i]==
|
||||
oldbuf[lastpos+lenf-overlap+i]) s++;
|
||||
if(newbuf[scan-lenb+i]==
|
||||
oldbuf[pos-lenb+i]) s--;
|
||||
s += (newbuf[lastscan+lenf-overlap+i]==
|
||||
oldbuf[lastpos+lenf-overlap+i]);
|
||||
s -= (newbuf[scan-lenb+i]==oldbuf[pos-lenb+i]);
|
||||
if(s>Ss) { Ss=s; lens=i+1; };
|
||||
};
|
||||
|
||||
|
|
|
@ -24,6 +24,7 @@
|
|||
|
||||
#include <utils.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
|
||||
#define ZERO_MASK (32768)
|
||||
#define DATA_MASK (32767)
|
||||
|
@ -33,15 +34,25 @@ int
|
|||
zero_rle_encode(const void *ibuf, const unsigned int ilen,
|
||||
void *obuf, unsigned int *olen)
|
||||
{
|
||||
unsigned int pos1, pos2;
|
||||
unsigned int pos1, pos2, sz;
|
||||
unsigned short count;
|
||||
const uchar_t *ib = (const uchar_t *)ibuf;
|
||||
uchar_t *ob = (uchar_t *)obuf;
|
||||
uint64_t val;
|
||||
|
||||
sz = sizeof (val) - 1;
|
||||
pos2 = 0;
|
||||
for (pos1=0; pos1<ilen && pos2<*olen;) {
|
||||
count = 0;
|
||||
if (ib[pos1] == 0) {
|
||||
/*
|
||||
* We have a run of zeroes. Count them and store only the count.
|
||||
*/
|
||||
while (pos1 < (ilen - sz) && count < (COUNT_MAX - sz)) {
|
||||
val = *((uint64_t *)(ib+pos1));
|
||||
if (val) break;
|
||||
pos1 += sizeof (val); count += sizeof (val);
|
||||
}
|
||||
for (;pos1<ilen && ib[pos1]==0 && count<COUNT_MAX; pos1++) count++;
|
||||
count |= ZERO_MASK;
|
||||
*((unsigned short *)(ob + pos2)) = htons(count);
|
||||
|
@ -93,11 +104,24 @@ zero_rle_decode(const void* ibuf, unsigned int ilen,
|
|||
pos1 += 2;
|
||||
if (count & ZERO_MASK) {
|
||||
count &= DATA_MASK;
|
||||
for (i=0; i<count && pos2<*olen; i++)
|
||||
ob[pos2++] = 0;
|
||||
if (pos2 + count > *olen) {
|
||||
fprintf(stderr, "Output buffer overflow in Zero RLE decode.\n");
|
||||
return (-1);
|
||||
}
|
||||
memset(ob+pos2, 0, count);
|
||||
pos2 += count;
|
||||
} else {
|
||||
for (i=0; i<count && pos1<ilen && pos2<*olen; i++)
|
||||
ob[pos2++] = ib[pos1++];
|
||||
if (pos1 + count > ilen) {
|
||||
fprintf(stderr, "Input underflow in Zero RLE decode.\n");
|
||||
return (-1);
|
||||
}
|
||||
if (pos2 + count > *olen) {
|
||||
fprintf(stderr, "Output buffer overflow in Zero RLE decode.\n");
|
||||
return (-1);
|
||||
}
|
||||
memcpy(ob+pos2, ib+pos1, count);
|
||||
pos2 += count;
|
||||
pos1 += count;
|
||||
}
|
||||
}
|
||||
i = *olen;
|
||||
|
|
|
@ -72,9 +72,9 @@
|
|||
|
||||
#include "rabin_dedup.h"
|
||||
|
||||
#define FORTY_PCNT(x) ((x)/5 << 1)
|
||||
#define FIFTY_PCNT(x) ((x) >> 1)
|
||||
#define SIXTY_PCNT(x) (((x) >> 1) + ((x) >> 3))
|
||||
#define DELTA_EXTRA2_PCT(x) ((x) >> 1)
|
||||
#define DELTA_EXTRA_PCT(x) (((x) >> 1) + ((x) >> 3))
|
||||
#define DELTA_NORMAL_PCT(x) (((x) >> 1) + ((x) >> 2) + ((x) >> 3))
|
||||
|
||||
extern int lzma_init(void **data, int *level, int nthreads, int64_t chunksize,
|
||||
int file_version, compress_op_t op);
|
||||
|
@ -203,7 +203,7 @@ create_dedupe_context(uint64_t chunksize, uint64_t real_chunksize, int rab_blk_s
|
|||
ctx->delta_flag = 3;
|
||||
}
|
||||
} else if (delta_flag == DELTA_EXTRA) {
|
||||
ctx->delta_flag = 1;
|
||||
ctx->delta_flag = 2;
|
||||
}
|
||||
|
||||
if (!fixed_flag)
|
||||
|
@ -292,7 +292,7 @@ dedupe_compress(dedupe_context_t *ctx, uchar_t *buf, uint64_t *size, uint64_t of
|
|||
uchar_t *buf1 = (uchar_t *)buf;
|
||||
uint32_t length;
|
||||
uint64_t cur_roll_checksum, cur_pos_checksum;
|
||||
uint32_t *fplist;
|
||||
uint32_t *ctx_heap;
|
||||
rabin_blockentry_t **htab;
|
||||
heap_t heap;
|
||||
DEBUG_STAT_EN(uint32_t max_count);
|
||||
|
@ -341,9 +341,8 @@ dedupe_compress(dedupe_context_t *ctx, uchar_t *buf, uint64_t *size, uint64_t of
|
|||
* Initialize arrays for sketch computation. We re-use memory allocated
|
||||
* for the compressed chunk temporarily.
|
||||
*/
|
||||
ary_sz = 4 * ctx->rabin_poly_max_block_size;
|
||||
fplist = (uint32_t *)(ctx->cbuf + ctx->real_chunksize - ary_sz);
|
||||
if (ctx->delta_flag) memset(fplist, 0, ary_sz);
|
||||
ary_sz = ctx->rabin_poly_max_block_size;
|
||||
ctx_heap = (uint32_t *)(ctx->cbuf + ctx->real_chunksize - ary_sz);
|
||||
}
|
||||
memset(ctx->current_window_data, 0, RAB_POLYNOMIAL_WIN_SIZE);
|
||||
|
||||
|
@ -397,23 +396,6 @@ dedupe_compress(dedupe_context_t *ctx, uchar_t *buf, uint64_t *size, uint64_t of
|
|||
cur_roll_checksum -= out[pushed_out];
|
||||
cur_pos_checksum = cur_roll_checksum ^ ir[pushed_out];
|
||||
|
||||
/*
|
||||
* Retain a list of all fingerprints in the block. We then compute
|
||||
* the K min values sketch from that list and generate a super sketch
|
||||
* by hashing over the K min values sketch. We only store the least
|
||||
* significant 32 bits of the fingerprint. This uses less memory,
|
||||
* requires smaller memset() calls and generates a sufficiently large
|
||||
* number of similarity matches without false positives - determined
|
||||
* by experimentation.
|
||||
*
|
||||
* This is called minhashing and is used widely, for example in various
|
||||
* search engines to detect similar documents.
|
||||
*/
|
||||
if (ctx->delta_flag) {
|
||||
fplist[j] = cur_pos_checksum & 0xFFFFFFFFUL;
|
||||
j++;
|
||||
}
|
||||
|
||||
/*
|
||||
* Window pos has to rotate from 0 .. RAB_POLYNOMIAL_WIN_SIZE-1
|
||||
* We avoid a branch here by masking. This requires RAB_POLYNOMIAL_WIN_SIZE
|
||||
|
@ -432,25 +414,32 @@ dedupe_compress(dedupe_context_t *ctx, uchar_t *buf, uint64_t *size, uint64_t of
|
|||
ctx->blocks[blknum]->offset = last_offset;
|
||||
ctx->blocks[blknum]->index = blknum; // Need to store for sorting
|
||||
ctx->blocks[blknum]->length = length;
|
||||
|
||||
DEBUG_STAT_EN(if (length >= ctx->rabin_poly_max_block_size) max_count++);
|
||||
|
||||
/*
|
||||
* Reset the heap structure and find the K min values if Delta Compression
|
||||
* is enabled. We use a min heap mechanism taken from the heap based priority
|
||||
* queue implementation in Python.
|
||||
* Here K = 60% or 40%. We are aiming to detect either 60% (default) or 40%
|
||||
* similarity on average.
|
||||
* Here K = similarity extent = 87% or 62% or 50%.
|
||||
*
|
||||
* Once block contents are arranged in a min heap we compute the K min values
|
||||
* sketch by hashing over the heap till K%. We interpret the raw bytes as a
|
||||
* sequence of 64-bit integers.
|
||||
* This is called minhashing and is used widely, for example in various
|
||||
* search engines to detect similar documents.
|
||||
*/
|
||||
if (ctx->delta_flag) {
|
||||
pc[1] = SIXTY_PCNT(j);
|
||||
pc[2] = FIFTY_PCNT(j);
|
||||
pc[3] = FORTY_PCNT(j);
|
||||
memcpy(ctx_heap, buf1+last_offset, length);
|
||||
length /= 8;
|
||||
pc[1] = DELTA_NORMAL_PCT(length);
|
||||
pc[2] = DELTA_EXTRA_PCT(length);
|
||||
pc[3] = DELTA_EXTRA2_PCT(length);
|
||||
|
||||
reset_heap(&heap, pc[ctx->delta_flag]);
|
||||
ksmallest((int32_t *)fplist, j, &heap);
|
||||
ksmallest((int64_t *)ctx_heap, length, &heap);
|
||||
|
||||
ctx->blocks[blknum]->similarity_hash =
|
||||
XXH32((const uchar_t *)fplist, pc[ctx->delta_flag]*4, 0);
|
||||
memset(fplist, 0, ary_sz);
|
||||
XXH32((const uchar_t *)ctx_heap, pc[ctx->delta_flag]*8, 0);
|
||||
}
|
||||
blknum++;
|
||||
last_offset = i+1;
|
||||
|
@ -466,26 +455,30 @@ dedupe_compress(dedupe_context_t *ctx, uchar_t *buf, uint64_t *size, uint64_t of
|
|||
sizeof (rabin_blockentry_t));
|
||||
ctx->blocks[blknum]->offset = last_offset;
|
||||
ctx->blocks[blknum]->index = blknum;
|
||||
ctx->blocks[blknum]->length = *size - last_offset;
|
||||
length = *size - last_offset;
|
||||
ctx->blocks[blknum]->length = length;
|
||||
|
||||
if (ctx->delta_flag) {
|
||||
uint64_t cur_sketch;
|
||||
uint64_t pc[3];
|
||||
|
||||
if (j > 1) {
|
||||
pc[1] = SIXTY_PCNT(j);
|
||||
pc[2] = FIFTY_PCNT(j);
|
||||
pc[3] = FORTY_PCNT(j);
|
||||
if (length > ctx->rabin_poly_min_block_size) {
|
||||
memcpy(ctx_heap, buf1+last_offset, length);
|
||||
length /= 8;
|
||||
pc[1] = DELTA_NORMAL_PCT(length);
|
||||
pc[2] = DELTA_EXTRA_PCT(length);
|
||||
pc[3] = DELTA_EXTRA2_PCT(length);
|
||||
|
||||
reset_heap(&heap, pc[ctx->delta_flag]);
|
||||
ksmallest((int32_t *)fplist, j, &heap);
|
||||
ksmallest((int64_t *)ctx_heap, length, &heap);
|
||||
cur_sketch =
|
||||
XXH32((const uchar_t *)fplist, pc[ctx->delta_flag]*4, 0);
|
||||
} else {
|
||||
if (j == 0) j = 1;
|
||||
cur_sketch =
|
||||
XXH32((const uchar_t *)fplist, (j*4)/2, 0);
|
||||
}
|
||||
XXH32((const uchar_t *)ctx_heap, pc[ctx->delta_flag]*8, 0);
|
||||
ctx->blocks[blknum]->similarity_hash = cur_sketch;
|
||||
} else {
|
||||
cur_sketch =
|
||||
XXH32((const uchar_t *)(buf1+last_offset), length, 0);
|
||||
ctx->blocks[blknum]->similarity_hash = cur_sketch;
|
||||
}
|
||||
}
|
||||
blknum++;
|
||||
last_offset = *size;
|
||||
|
|
|
@ -17,6 +17,7 @@
|
|||
#include <string.h>
|
||||
#include <sys/types.h>
|
||||
#include <stdint.h>
|
||||
#include <inttypes.h>
|
||||
#include <heapq.h>
|
||||
|
||||
#ifndef NDEBUG
|
||||
|
@ -71,7 +72,7 @@ _siftupmax(heap_t *h, __TYPE spos, __TYPE epos)
|
|||
heap = h->ary;
|
||||
#ifdef ERROR_CHK
|
||||
if (spos >= endpos) {
|
||||
fprintf(stderr, "_siftupmax: index out of range: %u, len: %u\n", spos, endpos);
|
||||
fprintf(stderr, "_siftupmax: index out of range: %" PRId64 ", len: %" PRId64 "\n", spos, endpos);
|
||||
return -1;
|
||||
}
|
||||
#endif
|
||||
|
@ -118,7 +119,7 @@ _siftupmax_s(heap_t *h, __TYPE spos)
|
|||
heap = h->ary;
|
||||
#ifdef ERROR_CHK
|
||||
if (spos >= endpos) {
|
||||
fprintf(stderr, "_siftupmax: index out of range: %u, len: %u\n", spos, endpos);
|
||||
fprintf(stderr, "_siftupmax: index out of range: %" PRId64 ", len: %" PRId64 "\n", spos, endpos);
|
||||
return -1;
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
#ifndef __HEAPQ_H_
|
||||
|
||||
#define __TYPE int32_t
|
||||
#define __TYPE int64_t
|
||||
|
||||
typedef struct {
|
||||
__TYPE *ary;
|
||||
|
|
|
@ -57,7 +57,7 @@ extern "C" {
|
|||
# endif
|
||||
#endif
|
||||
typedef unsigned long uintptr_t;
|
||||
typedef int64_t bsize_t;
|
||||
typedef int32_t bsize_t;
|
||||
|
||||
#undef WORDS_BIGENDIAN
|
||||
#if BYTE_ORDER == BIG_ENDIAN
|
||||
|
|
Loading…
Reference in a new issue