Improve accuracy of the KMV sketch computation and speed it up.
This commit is contained in:
parent
6b67e98747
commit
e10a13ad94
1 changed file with 19 additions and 19 deletions
|
@ -76,7 +76,6 @@
|
||||||
#include <heap.h>
|
#include <heap.h>
|
||||||
#include <xxhash.h>
|
#include <xxhash.h>
|
||||||
#include <qsort.h>
|
#include <qsort.h>
|
||||||
#include <lzma_crc.h>
|
|
||||||
|
|
||||||
#include "rabin_dedup.h"
|
#include "rabin_dedup.h"
|
||||||
#if defined(__USE_SSE_INTRIN__) && defined(__SSE4_1__) && RAB_POLYNOMIAL_WIN_SIZE == 16
|
#if defined(__USE_SSE_INTRIN__) && defined(__SSE4_1__) && RAB_POLYNOMIAL_WIN_SIZE == 16
|
||||||
|
@ -874,7 +873,7 @@ process_blocks:
|
||||||
} else {
|
} else {
|
||||||
uchar_t *seg_heap, *sim_ck, *sim_offsets;
|
uchar_t *seg_heap, *sim_ck, *sim_offsets;
|
||||||
archive_config_t *cfg;
|
archive_config_t *cfg;
|
||||||
uint32_t increment, len, blks, o_blks, k;
|
uint32_t len, blks, o_blks, k;
|
||||||
global_blockentry_t *seg_blocks;
|
global_blockentry_t *seg_blocks;
|
||||||
uint64_t seg_offset, offset;
|
uint64_t seg_offset, offset;
|
||||||
global_blockentry_t **htab, *be;
|
global_blockentry_t **htab, *be;
|
||||||
|
@ -897,6 +896,7 @@ process_blocks:
|
||||||
htab = (global_blockentry_t **)(src - ary_sz);
|
htab = (global_blockentry_t **)(src - ary_sz);
|
||||||
for (i=0; i<blknum;) {
|
for (i=0; i<blknum;) {
|
||||||
uint64_t crc, off1;
|
uint64_t crc, off1;
|
||||||
|
uint64_t a, b;
|
||||||
length = 0;
|
length = 0;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
@ -943,25 +943,29 @@ process_blocks:
|
||||||
blks = j+i;
|
blks = j+i;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Sort concatenated chunk hash buffer by raw 64-bit integer
|
* Treat the concatenated chunk hash buffer as an array of 64-bit
|
||||||
* magnitudes.
|
* integers and sort them in ascending order.
|
||||||
*/
|
*/
|
||||||
do_qsort((uint64_t *)seg_heap, length/8);
|
do_qsort((uint64_t *)seg_heap, length/8);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Compute the min-values range similarity hashes.
|
* Compute the K-minimum-values (KMV) sketch, where K == 20 in this case.
|
||||||
*/
|
*/
|
||||||
sim_ck = ctx->similarity_cksums;
|
sim_ck = ctx->similarity_cksums;
|
||||||
sub_i = cfg->sub_intervals;
|
|
||||||
tgt = seg_heap;
|
tgt = seg_heap;
|
||||||
increment = cfg->chunk_cksum_sz / 2;
|
sub_i = 0;
|
||||||
if (increment * sub_i > length)
|
|
||||||
sub_i = length / increment;
|
*((uint64_t *)sim_ck) = 0;
|
||||||
for (j = 0; j<sub_i; j++) {
|
a = 0;
|
||||||
crc = lzma_crc64(tgt, increment/2, 0);
|
for (j = 0; j < length && sub_i < cfg->sub_intervals;) {
|
||||||
*((uint64_t *)sim_ck) = crc;
|
b = *((uint64_t *)tgt);
|
||||||
tgt += increment;
|
tgt += sizeof (uint64_t);
|
||||||
sim_ck += cfg->similarity_cksum_sz;
|
if (b != a) {
|
||||||
|
*((uint64_t *)sim_ck) = b;
|
||||||
|
sim_ck += sizeof (uint64_t);
|
||||||
|
a = b;
|
||||||
|
sub_i++;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
@ -984,14 +988,10 @@ process_blocks:
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Now lookup all the similarity hashes. We sort the hashes first so that
|
* Now lookup all the similarity hashes.
|
||||||
* all duplicate hash values can be easily eliminated.
|
|
||||||
*
|
|
||||||
* The matching segment offsets in the segcache are stored in a list. Entries
|
* The matching segment offsets in the segcache are stored in a list. Entries
|
||||||
* that were not found are stored with offset of UINT64_MAX.
|
* that were not found are stored with offset of UINT64_MAX.
|
||||||
*/
|
*/
|
||||||
isort_uint64((uint64_t *)(ctx->similarity_cksums), sub_i);
|
|
||||||
|
|
||||||
sim_ck = ctx->similarity_cksums;
|
sim_ck = ctx->similarity_cksums;
|
||||||
tgt = src + 1; // One byte for number of entries
|
tgt = src + 1; // One byte for number of entries
|
||||||
crc = 0;
|
crc = 0;
|
||||||
|
|
Loading…
Reference in a new issue