Improve accuracy of the KMV sketch computation and speed it up.

This commit is contained in:
Moinak Ghosh 2013-07-03 19:24:06 +05:30
parent 6b67e98747
commit e10a13ad94

View file

@ -76,7 +76,6 @@
#include <heap.h> #include <heap.h>
#include <xxhash.h> #include <xxhash.h>
#include <qsort.h> #include <qsort.h>
#include <lzma_crc.h>
#include "rabin_dedup.h" #include "rabin_dedup.h"
#if defined(__USE_SSE_INTRIN__) && defined(__SSE4_1__) && RAB_POLYNOMIAL_WIN_SIZE == 16 #if defined(__USE_SSE_INTRIN__) && defined(__SSE4_1__) && RAB_POLYNOMIAL_WIN_SIZE == 16
@ -874,7 +873,7 @@ process_blocks:
} else { } else {
uchar_t *seg_heap, *sim_ck, *sim_offsets; uchar_t *seg_heap, *sim_ck, *sim_offsets;
archive_config_t *cfg; archive_config_t *cfg;
uint32_t increment, len, blks, o_blks, k; uint32_t len, blks, o_blks, k;
global_blockentry_t *seg_blocks; global_blockentry_t *seg_blocks;
uint64_t seg_offset, offset; uint64_t seg_offset, offset;
global_blockentry_t **htab, *be; global_blockentry_t **htab, *be;
@ -897,6 +896,7 @@ process_blocks:
htab = (global_blockentry_t **)(src - ary_sz); htab = (global_blockentry_t **)(src - ary_sz);
for (i=0; i<blknum;) { for (i=0; i<blknum;) {
uint64_t crc, off1; uint64_t crc, off1;
uint64_t a, b;
length = 0; length = 0;
/* /*
@ -943,25 +943,29 @@ process_blocks:
blks = j+i; blks = j+i;
/* /*
* Sort concatenated chunk hash buffer by raw 64-bit integer * Treat the concatenated chunk hash buffer as an array of 64-bit
* magnitudes. * integers and sort them in ascending order.
*/ */
do_qsort((uint64_t *)seg_heap, length/8); do_qsort((uint64_t *)seg_heap, length/8);
/* /*
* Compute the min-values range similarity hashes. * Compute the K min values sketch where K == 20 in this case.
*/ */
sim_ck = ctx->similarity_cksums; sim_ck = ctx->similarity_cksums;
sub_i = cfg->sub_intervals;
tgt = seg_heap; tgt = seg_heap;
increment = cfg->chunk_cksum_sz / 2; sub_i = 0;
if (increment * sub_i > length)
sub_i = length / increment; *((uint64_t *)sim_ck) = 0;
for (j = 0; j<sub_i; j++) { a = 0;
crc = lzma_crc64(tgt, increment/2, 0); for (j = 0; j < length && sub_i < cfg->sub_intervals;) {
*((uint64_t *)sim_ck) = crc; b = *((uint64_t *)tgt);
tgt += increment; tgt += sizeof (uint64_t);
sim_ck += cfg->similarity_cksum_sz; if (b != a) {
*((uint64_t *)sim_ck) = b;
sim_ck += sizeof (uint64_t);
a = b;
sub_i++;
}
} }
/* /*
@ -984,14 +988,10 @@ process_blocks:
} }
/* /*
* Now look up all the similarity hashes. We sort the hashes first so that * Now look up all the similarity hashes.
* all duplicate hash values can be easily eliminated.
*
* The matching segment offsets in the segcache are stored in a list. Entries * The matching segment offsets in the segcache are stored in a list. Entries
* that were not found are stored with offset of UINT64_MAX. * that were not found are stored with offset of UINT64_MAX.
*/ */
isort_uint64((uint64_t *)(ctx->similarity_cksums), sub_i);
sim_ck = ctx->similarity_cksums; sim_ck = ctx->similarity_cksums;
tgt = src + 1; // One byte for number of entries tgt = src + 1; // One byte for number of entries
crc = 0; crc = 0;