Fixes and performance improvements for Dedupe Delta Compression

Avoid using fingerprints in minhash computation and fix write amplification
Modify min-heap to use 64-bit values
Improve bsdiff performance
Fix pointer comparison in bsdiff
Use 32-bit offsets in bsdiff to reduce memory usage
Improve Zero RLE Encoder performance
Add more buffer overflow checks in Zero RLE Decoder
This commit is contained in:
Moinak Ghosh 2013-01-13 22:04:59 +05:30
parent 87aa12206e
commit d49a088eea
6 changed files with 93 additions and 67 deletions

View file

@ -134,11 +134,12 @@ static void split(bsize_t *I,bsize_t *V,bsize_t start,bsize_t len,bsize_t h)
static void qsufsort(bsize_t *I,bsize_t *V,u_char *oldbuf,bsize_t oldsize)
{
bsize_t buckets[256];
bsize_t buckets[257];
bsize_t *bkts;
bsize_t i,h,len;
#ifdef __USE_SSE_INTRIN__
if (((bsize_t)buckets & (16 - 1)) == 0) { // 16-byte aligned ?
if (((size_t)buckets & (16 - 1)) == 0) { // 16-byte aligned ?
int iters;
uchar_t *pos;
@ -159,9 +160,18 @@ static void qsufsort(bsize_t *I,bsize_t *V,u_char *oldbuf,bsize_t oldsize)
#ifdef __USE_SSE_INTRIN__
}
#endif
for(i=0;i<oldsize;i++) buckets[oldbuf[i]]++;
for(i=1;i<256;i++) buckets[i]+=buckets[i-1];
for(i=255;i>0;i--) buckets[i]=buckets[i-1];
/* We want to do this:
* for(i=0;i<oldsize;i++) buckets[oldbuf[i]]++;
* for(i=1;i<256;i++) buckets[i]+=buckets[i-1];
* for(i=255;i>0;i--) buckets[i]=buckets[i-1];
* buckets[0]=0;
*
* However the code below uses an array larger by 1 element and is able to
* avoid the 3rd loop.
*/
bkts = &buckets[1];
for(i=0;i<oldsize;i++) bkts[oldbuf[i]]++;
for(i=1;i<256;i++) bkts[i]+=bkts[i-1];
buckets[0]=0;
for(i=0;i<oldsize;i++) I[++buckets[oldbuf[i]]]=i;
@ -311,9 +321,8 @@ bsdiff(u_char *oldbuf, bsize_t oldsize, u_char *newbuf, bsize_t newsize,
0,oldsize,&pos);
for(;scsc<scan+len;scsc++)
if((scsc+lastoffset<oldsize) &&
(oldbuf[scsc+lastoffset] == newbuf[scsc]))
oldscore++;
oldscore += ((scsc+lastoffset<oldsize) &&
(oldbuf[scsc+lastoffset] == newbuf[scsc]));
if(((len==oldscore) && (len!=0)) ||
(len>oldscore+sz)) break;
@ -326,7 +335,7 @@ bsdiff(u_char *oldbuf, bsize_t oldsize, u_char *newbuf, bsize_t newsize,
if((len!=oldscore) || (scan==newsize)) {
s=0;Sf=0;lenf=0;
for(i=0;(lastscan+i<scan)&&(lastpos+i<oldsize);) {
if(oldbuf[lastpos+i]==newbuf[lastscan+i]) s++;
s += (oldbuf[lastpos+i]==newbuf[lastscan+i]);
i++;
if(s*2-i>Sf*2-lenf) { Sf=s; lenf=i; };
};
@ -335,7 +344,7 @@ bsdiff(u_char *oldbuf, bsize_t oldsize, u_char *newbuf, bsize_t newsize,
if(scan<newsize) {
s=0;Sb=0;
for(i=1;(scan>=lastscan+i)&&(pos>=i);i++) {
if(oldbuf[pos-i]==newbuf[scan-i]) s++;
s += (oldbuf[pos-i]==newbuf[scan-i]);
if(s*2-i>Sb*2-lenb) { Sb=s; lenb=i; };
};
};
@ -344,10 +353,9 @@ bsdiff(u_char *oldbuf, bsize_t oldsize, u_char *newbuf, bsize_t newsize,
overlap=(lastscan+lenf)-(scan-lenb);
s=0;Ss=0;lens=0;
for(i=0;i<overlap;i++) {
if(newbuf[lastscan+lenf-overlap+i]==
oldbuf[lastpos+lenf-overlap+i]) s++;
if(newbuf[scan-lenb+i]==
oldbuf[pos-lenb+i]) s--;
s += (newbuf[lastscan+lenf-overlap+i]==
oldbuf[lastpos+lenf-overlap+i]);
s -= (newbuf[scan-lenb+i]==oldbuf[pos-lenb+i]);
if(s>Ss) { Ss=s; lens=i+1; };
};

View file

@ -24,6 +24,7 @@
#include <utils.h>
#include <stdio.h>
#include <string.h>
#define ZERO_MASK (32768)
#define DATA_MASK (32767)
@ -33,15 +34,25 @@ int
zero_rle_encode(const void *ibuf, const unsigned int ilen,
void *obuf, unsigned int *olen)
{
unsigned int pos1, pos2;
unsigned int pos1, pos2, sz;
unsigned short count;
const uchar_t *ib = (const uchar_t *)ibuf;
uchar_t *ob = (uchar_t *)obuf;
uint64_t val;
sz = sizeof (val) - 1;
pos2 = 0;
for (pos1=0; pos1<ilen && pos2<*olen;) {
count = 0;
if (ib[pos1] == 0) {
/*
* We have a run of zeroes. Count them and store only the count.
*/
while (pos1 < (ilen - sz) && count < (COUNT_MAX - sz)) {
val = *((uint64_t *)(ib+pos1));
if (val) break;
pos1 += sizeof (val); count += sizeof (val);
}
for (;pos1<ilen && ib[pos1]==0 && count<COUNT_MAX; pos1++) count++;
count |= ZERO_MASK;
*((unsigned short *)(ob + pos2)) = htons(count);
@ -93,11 +104,24 @@ zero_rle_decode(const void* ibuf, unsigned int ilen,
pos1 += 2;
if (count & ZERO_MASK) {
count &= DATA_MASK;
for (i=0; i<count && pos2<*olen; i++)
ob[pos2++] = 0;
if (pos2 + count > *olen) {
fprintf(stderr, "Output buffer overflow in Zero RLE decode.\n");
return (-1);
}
memset(ob+pos2, 0, count);
pos2 += count;
} else {
for (i=0; i<count && pos1<ilen && pos2<*olen; i++)
ob[pos2++] = ib[pos1++];
if (pos1 + count > ilen) {
fprintf(stderr, "Input underflow in Zero RLE decode.\n");
return (-1);
}
if (pos2 + count > *olen) {
fprintf(stderr, "Output buffer overflow in Zero RLE decode.\n");
return (-1);
}
memcpy(ob+pos2, ib+pos1, count);
pos2 += count;
pos1 += count;
}
}
i = *olen;

View file

@ -72,9 +72,9 @@
#include "rabin_dedup.h"
#define FORTY_PCNT(x) ((x)/5 << 1)
#define FIFTY_PCNT(x) ((x) >> 1)
#define SIXTY_PCNT(x) (((x) >> 1) + ((x) >> 3))
#define DELTA_EXTRA2_PCT(x) ((x) >> 1)
#define DELTA_EXTRA_PCT(x) (((x) >> 1) + ((x) >> 3))
#define DELTA_NORMAL_PCT(x) (((x) >> 1) + ((x) >> 2) + ((x) >> 3))
extern int lzma_init(void **data, int *level, int nthreads, int64_t chunksize,
int file_version, compress_op_t op);
@ -203,7 +203,7 @@ create_dedupe_context(uint64_t chunksize, uint64_t real_chunksize, int rab_blk_s
ctx->delta_flag = 3;
}
} else if (delta_flag == DELTA_EXTRA) {
ctx->delta_flag = 1;
ctx->delta_flag = 2;
}
if (!fixed_flag)
@ -292,7 +292,7 @@ dedupe_compress(dedupe_context_t *ctx, uchar_t *buf, uint64_t *size, uint64_t of
uchar_t *buf1 = (uchar_t *)buf;
uint32_t length;
uint64_t cur_roll_checksum, cur_pos_checksum;
uint32_t *fplist;
uint32_t *ctx_heap;
rabin_blockentry_t **htab;
heap_t heap;
DEBUG_STAT_EN(uint32_t max_count);
@ -341,9 +341,8 @@ dedupe_compress(dedupe_context_t *ctx, uchar_t *buf, uint64_t *size, uint64_t of
* Initialize arrays for sketch computation. We re-use memory allocated
* for the compressed chunk temporarily.
*/
ary_sz = 4 * ctx->rabin_poly_max_block_size;
fplist = (uint32_t *)(ctx->cbuf + ctx->real_chunksize - ary_sz);
if (ctx->delta_flag) memset(fplist, 0, ary_sz);
ary_sz = ctx->rabin_poly_max_block_size;
ctx_heap = (uint32_t *)(ctx->cbuf + ctx->real_chunksize - ary_sz);
}
memset(ctx->current_window_data, 0, RAB_POLYNOMIAL_WIN_SIZE);
@ -397,23 +396,6 @@ dedupe_compress(dedupe_context_t *ctx, uchar_t *buf, uint64_t *size, uint64_t of
cur_roll_checksum -= out[pushed_out];
cur_pos_checksum = cur_roll_checksum ^ ir[pushed_out];
/*
* Retain a list of all fingerprints in the block. We then compute
* the K min values sketch from that list and generate a super sketch
* by hashing over the K min values sketch. We only store the least
* significant 32 bits of the fingerprint. This uses less memory,
* requires smaller memset() calls and generates a sufficiently large
* number of similarity matches without false positives - determined
* by experimentation.
*
* This is called minhashing and is used widely, for example in various
* search engines to detect similar documents.
*/
if (ctx->delta_flag) {
fplist[j] = cur_pos_checksum & 0xFFFFFFFFUL;
j++;
}
/*
* Window pos has to rotate from 0 .. RAB_POLYNOMIAL_WIN_SIZE-1
* We avoid a branch here by masking. This requires RAB_POLYNOMIAL_WIN_SIZE
@ -432,25 +414,32 @@ dedupe_compress(dedupe_context_t *ctx, uchar_t *buf, uint64_t *size, uint64_t of
ctx->blocks[blknum]->offset = last_offset;
ctx->blocks[blknum]->index = blknum; // Need to store for sorting
ctx->blocks[blknum]->length = length;
DEBUG_STAT_EN(if (length >= ctx->rabin_poly_max_block_size) max_count++);
/*
* Reset the heap structure and find the K min values if Delta Compression
* is enabled. We use a min heap mechanism taken from the heap based priority
* queue implementation in Python.
* Here K = 60% or 40%. We are aiming to detect either 60% (default) or 40%
* similarity on average.
* Here K = similarity extent = 87% or 62% or 50%.
*
* Once block contents are arranged in a min heap we compute the K min values
* sketch by hashing over the heap till K%. We interpret the raw bytes as a
* sequence of 64-bit integers.
* This is called minhashing and is used widely, for example in various
* search engines to detect similar documents.
*/
if (ctx->delta_flag) {
pc[1] = SIXTY_PCNT(j);
pc[2] = FIFTY_PCNT(j);
pc[3] = FORTY_PCNT(j);
memcpy(ctx_heap, buf1+last_offset, length);
length /= 8;
pc[1] = DELTA_NORMAL_PCT(length);
pc[2] = DELTA_EXTRA_PCT(length);
pc[3] = DELTA_EXTRA2_PCT(length);
reset_heap(&heap, pc[ctx->delta_flag]);
ksmallest((int32_t *)fplist, j, &heap);
ksmallest((int64_t *)ctx_heap, length, &heap);
ctx->blocks[blknum]->similarity_hash =
XXH32((const uchar_t *)fplist, pc[ctx->delta_flag]*4, 0);
memset(fplist, 0, ary_sz);
XXH32((const uchar_t *)ctx_heap, pc[ctx->delta_flag]*8, 0);
}
blknum++;
last_offset = i+1;
@ -466,26 +455,30 @@ dedupe_compress(dedupe_context_t *ctx, uchar_t *buf, uint64_t *size, uint64_t of
sizeof (rabin_blockentry_t));
ctx->blocks[blknum]->offset = last_offset;
ctx->blocks[blknum]->index = blknum;
ctx->blocks[blknum]->length = *size - last_offset;
length = *size - last_offset;
ctx->blocks[blknum]->length = length;
if (ctx->delta_flag) {
uint64_t cur_sketch;
uint64_t pc[3];
if (j > 1) {
pc[1] = SIXTY_PCNT(j);
pc[2] = FIFTY_PCNT(j);
pc[3] = FORTY_PCNT(j);
if (length > ctx->rabin_poly_min_block_size) {
memcpy(ctx_heap, buf1+last_offset, length);
length /= 8;
pc[1] = DELTA_NORMAL_PCT(length);
pc[2] = DELTA_EXTRA_PCT(length);
pc[3] = DELTA_EXTRA2_PCT(length);
reset_heap(&heap, pc[ctx->delta_flag]);
ksmallest((int32_t *)fplist, j, &heap);
ksmallest((int64_t *)ctx_heap, length, &heap);
cur_sketch =
XXH32((const uchar_t *)fplist, pc[ctx->delta_flag]*4, 0);
} else {
if (j == 0) j = 1;
cur_sketch =
XXH32((const uchar_t *)fplist, (j*4)/2, 0);
}
XXH32((const uchar_t *)ctx_heap, pc[ctx->delta_flag]*8, 0);
ctx->blocks[blknum]->similarity_hash = cur_sketch;
} else {
cur_sketch =
XXH32((const uchar_t *)(buf1+last_offset), length, 0);
ctx->blocks[blknum]->similarity_hash = cur_sketch;
}
}
blknum++;
last_offset = *size;

View file

@ -17,6 +17,7 @@
#include <string.h>
#include <sys/types.h>
#include <stdint.h>
#include <inttypes.h>
#include <heapq.h>
#ifndef NDEBUG
@ -71,7 +72,7 @@ _siftupmax(heap_t *h, __TYPE spos, __TYPE epos)
heap = h->ary;
#ifdef ERROR_CHK
if (spos >= endpos) {
fprintf(stderr, "_siftupmax: index out of range: %u, len: %u\n", spos, endpos);
fprintf(stderr, "_siftupmax: index out of range: %" PRId64 ", len: %" PRId64 "\n", spos, endpos);
return -1;
}
#endif
@ -118,7 +119,7 @@ _siftupmax_s(heap_t *h, __TYPE spos)
heap = h->ary;
#ifdef ERROR_CHK
if (spos >= endpos) {
fprintf(stderr, "_siftupmax: index out of range: %u, len: %u\n", spos, endpos);
fprintf(stderr, "_siftupmax: index out of range: %" PRId64 ", len: %" PRId64 "\n", spos, endpos);
return -1;
}
#endif

View file

@ -1,6 +1,6 @@
#ifndef __HEAPQ_H_
#define __TYPE int32_t
#define __TYPE int64_t
typedef struct {
__TYPE *ary;

View file

@ -57,7 +57,7 @@ extern "C" {
# endif
#endif
typedef unsigned long uintptr_t;
typedef int64_t bsize_t;
typedef int32_t bsize_t;
#undef WORDS_BIGENDIAN
#if BYTE_ORDER == BIG_ENDIAN