Several bugfixes.
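Avoid matching an entry with itself during hash lookup: the offset check now lives inside db_lookup_insert_s instead of in the caller. Also stop the sub-interval reduction loop at sub_i == 1 so that the following division never divides by zero, and change the lzma_crc64 input length from increment/2 to increment/8.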
This commit is contained in:
parent
6b23f6a73a
commit
2c4024792a
2 changed files with 7 additions and 10 deletions
|
@ -432,7 +432,7 @@ db_lookup_insert_s(archive_config_t *cfg, uchar_t *sim_cksum, int interval,
|
||||||
if (cfg->pct_interval == 0) { // Global dedupe with simple index
|
if (cfg->pct_interval == 0) { // Global dedupe with simple index
|
||||||
while (ent) {
|
while (ent) {
|
||||||
if (mycmp(sim_cksum, ent->cksum, cfg->similarity_cksum_sz) == 0 &&
|
if (mycmp(sim_cksum, ent->cksum, cfg->similarity_cksum_sz) == 0 &&
|
||||||
ent->item_size == item_size) {
|
ent->item_size == item_size && ent->item_offset != item_offset) {
|
||||||
return (ent);
|
return (ent);
|
||||||
}
|
}
|
||||||
pent = &(ent->next);
|
pent = &(ent->next);
|
||||||
|
@ -440,7 +440,8 @@ db_lookup_insert_s(archive_config_t *cfg, uchar_t *sim_cksum, int interval,
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
while (ent) {
|
while (ent) {
|
||||||
if (mycmp(sim_cksum, ent->cksum, cfg->similarity_cksum_sz) == 0) {
|
if (mycmp(sim_cksum, ent->cksum, cfg->similarity_cksum_sz) == 0 &&
|
||||||
|
ent->item_offset != item_offset) {
|
||||||
return (ent);
|
return (ent);
|
||||||
}
|
}
|
||||||
pent = &(ent->next);
|
pent = &(ent->next);
|
||||||
|
|
|
@ -858,14 +858,14 @@ process_blocks:
|
||||||
qsort(seg_heap, length/8, 8, cmpint);
|
qsort(seg_heap, length/8, 8, cmpint);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Compute the range similarity minhashes.
|
* Compute the range similarity hashes.
|
||||||
*/
|
*/
|
||||||
sim_ck = ctx->similarity_cksums;
|
sim_ck = ctx->similarity_cksums;
|
||||||
crc = 0;
|
crc = 0;
|
||||||
sub_i = cfg->sub_intervals;
|
sub_i = cfg->sub_intervals;
|
||||||
increment = (length / cfg->intervals) / sub_i;
|
increment = (length / cfg->intervals) / sub_i;
|
||||||
tgt = seg_heap;
|
tgt = seg_heap;
|
||||||
while (increment < cfg->chunk_cksum_sz/4 && sub_i > 0) {
|
while (increment < cfg->chunk_cksum_sz/4 && sub_i > 1) {
|
||||||
sub_i--;
|
sub_i--;
|
||||||
increment = (length / cfg->intervals) / sub_i;
|
increment = (length / cfg->intervals) / sub_i;
|
||||||
}
|
}
|
||||||
|
@ -880,7 +880,7 @@ process_blocks:
|
||||||
|
|
||||||
increment = length / cfg->intervals;
|
increment = length / cfg->intervals;
|
||||||
for (j=0; j<cfg->intervals-1; j++) {
|
for (j=0; j<cfg->intervals-1; j++) {
|
||||||
crc = lzma_crc64(tgt, increment/2, 0);
|
crc = lzma_crc64(tgt, increment/8, 0);
|
||||||
*((uint64_t *)sim_ck) = crc;
|
*((uint64_t *)sim_ck) = crc;
|
||||||
tgt += increment;
|
tgt += increment;
|
||||||
len -= increment;
|
len -= increment;
|
||||||
|
@ -958,11 +958,7 @@ process_blocks:
|
||||||
hash_entry_t *he;
|
hash_entry_t *he;
|
||||||
|
|
||||||
he = db_lookup_insert_s(cfg, sim_ck, 0, seg_offset, 0, 1);
|
he = db_lookup_insert_s(cfg, sim_ck, 0, seg_offset, 0, 1);
|
||||||
|
if (he) {
|
||||||
/*
|
|
||||||
* If match found also check that match is not with self!
|
|
||||||
*/
|
|
||||||
if (he && he->item_offset != seg_offset) {
|
|
||||||
/*
|
/*
|
||||||
* Match found. Load segment metadata from disk and perform
|
* Match found. Load segment metadata from disk and perform
|
||||||
* identity deduplication with the segment chunks.
|
* identity deduplication with the segment chunks.
|
||||||
|
|
Loading…
Reference in a new issue