Several bugfixes.

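Avoid matching with self during hash lookup: the self-match check (item_offset differs from the entry's offset) now lives inside db_lookup_insert_s instead of in the caller.
Stop reducing sub_i below 1 when shrinking the increment, so the increment computation never divides by zero.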
This commit is contained in:
Moinak Ghosh 2013-04-22 22:07:07 +05:30
parent 6b23f6a73a
commit 2c4024792a
2 changed files with 7 additions and 10 deletions

View file

@ -432,7 +432,7 @@ db_lookup_insert_s(archive_config_t *cfg, uchar_t *sim_cksum, int interval,
if (cfg->pct_interval == 0) { // Global dedupe with simple index
while (ent) {
if (mycmp(sim_cksum, ent->cksum, cfg->similarity_cksum_sz) == 0 &&
ent->item_size == item_size) {
ent->item_size == item_size && ent->item_offset != item_offset) {
return (ent);
}
pent = &(ent->next);
@ -440,7 +440,8 @@ db_lookup_insert_s(archive_config_t *cfg, uchar_t *sim_cksum, int interval,
}
} else {
while (ent) {
if (mycmp(sim_cksum, ent->cksum, cfg->similarity_cksum_sz) == 0) {
if (mycmp(sim_cksum, ent->cksum, cfg->similarity_cksum_sz) == 0 &&
ent->item_offset != item_offset) {
return (ent);
}
pent = &(ent->next);

View file

@ -858,14 +858,14 @@ process_blocks:
qsort(seg_heap, length/8, 8, cmpint);
/*
* Compute the range similarity minhashes.
* Compute the range similarity hashes.
*/
sim_ck = ctx->similarity_cksums;
crc = 0;
sub_i = cfg->sub_intervals;
increment = (length / cfg->intervals) / sub_i;
tgt = seg_heap;
while (increment < cfg->chunk_cksum_sz/4 && sub_i > 0) {
while (increment < cfg->chunk_cksum_sz/4 && sub_i > 1) {
sub_i--;
increment = (length / cfg->intervals) / sub_i;
}
@ -880,7 +880,7 @@ process_blocks:
increment = length / cfg->intervals;
for (j=0; j<cfg->intervals-1; j++) {
crc = lzma_crc64(tgt, increment/2, 0);
crc = lzma_crc64(tgt, increment/8, 0);
*((uint64_t *)sim_ck) = crc;
tgt += increment;
len -= increment;
@ -958,11 +958,7 @@ process_blocks:
hash_entry_t *he;
he = db_lookup_insert_s(cfg, sim_ck, 0, seg_offset, 0, 1);
/*
* If match found also check that match is not with self!
*/
if (he && he->item_offset != seg_offset) {
if (he) {
/*
* Match found. Load segment metadata from disk and perform
* identity deduplication with the segment chunks.