diff --git a/.gitignore b/.gitignore index f185bb8..ff8993b 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,6 @@ files.lst *.pc* +.seg* *.o *.so pcompress diff --git a/rabin/global/dedupe_config.h b/rabin/global/dedupe_config.h index 730a05a..f4a5c9a 100644 --- a/rabin/global/dedupe_config.h +++ b/rabin/global/dedupe_config.h @@ -37,7 +37,7 @@ extern "C" { #define DEFAULT_CHUNK_CKSUM CKSUM_SHA256 #define DEFAULT_SIMILARITY_CKSUM CKSUM_BLAKE256 #define DEFAULT_COMPRESS COMPRESS_LZ4 -#define DEFAULT_PCT_INTERVAL 8 +#define DEFAULT_PCT_INTERVAL 10 #define CONTAINER_ITEMS 2048 #define MIN_CK 1 #define MAX_CK 5 @@ -55,6 +55,7 @@ typedef enum { struct seg_map_fd { int fd; void *mapping; + uint64_t cache_offset; uint32_t len; }; diff --git a/rabin/global/index.c b/rabin/global/index.c index 7de4aca..060f82a 100644 --- a/rabin/global/index.c +++ b/rabin/global/index.c @@ -326,6 +326,19 @@ db_segcache_map(archive_config_t *cfg, int tid, uint32_t *blknum, uint64_t *offs uint32_t len, adj; uint64_t pos; + /* + * If same mapping is re-attempted just return the pointer into the + * existing mapping. + */ + adj = *offset % cfg->pagesize; + if (*offset == cfg->seg_fd_r[tid].cache_offset && cfg->seg_fd_r[tid].mapping) { + hdr = (uchar_t *)(cfg->seg_fd_r[tid].mapping) + adj; + *blknum = *((uint32_t *)(hdr)); + *offset = *((uint64_t *)(hdr + 4)); + *blocks = hdr + SEGCACHE_HDR_SZ; + return (0); + } + /* * Ensure previous mapping is removed. */ @@ -344,11 +357,11 @@ db_segcache_map(archive_config_t *cfg, int tid, uint32_t *blknum, uint64_t *offs if (pos - *offset < len) len = pos - *offset; - adj = *offset % cfg->pagesize; mapbuf = mmap(NULL, len + adj, PROT_READ, MAP_SHARED, fd, *offset - adj); if (mapbuf == MAP_FAILED) return (-1); + cfg->seg_fd_r[tid].cache_offset = *offset; hdr = mapbuf + adj; *blknum = *((uint32_t *)(hdr)); *offset = *((uint64_t *)(hdr + 4)); diff --git a/rabin/rabin_dedup.c b/rabin/rabin_dedup.c index 2740a32..401a5b3 100755 --- a/rabin/rabin_dedup.c +++ b/rabin/rabin_dedup.c @@ -309,8 +309,9 @@ create_dedupe_context(uint64_t chunksize, uint64_t real_chunksize, int rab_blk_s } if (arc && dedupe_flag == RABIN_DEDUPE_FILE_GLOBAL) { - ctx->similarity_cksums = (uchar_t *)slab_calloc(NULL, arc->intervals + arc->sub_intervals, - arc->similarity_cksum_sz); + ctx->similarity_cksums = (uchar_t *)slab_calloc(NULL, + arc->intervals + arc->sub_intervals, + arc->similarity_cksum_sz); if (!ctx->similarity_cksums) { fprintf(stderr, "Could not allocate dedupe context, out of memory\n"); @@ -390,6 +391,26 @@ cmpint(const void *a, const void *b) return (1); } +static inline int +ckcmp(uchar_t *a, uchar_t *b, int sz) +{ + size_t *v1 = (size_t *)a; + size_t *v2 = (size_t *)b; + int len; + + len = 0; + do { + if (*v1 != *v2) { + return (1); + } + ++v1; + ++v2; + len += sizeof (size_t); + } while (len < sz); + + return (0); +} + /** * Perform Deduplication. * Both Semi-Rabin fingerprinting based and Fixed Block Deduplication are supported. @@ -859,7 +880,7 @@ process_blocks: increment = length / cfg->intervals; for (j=0; jintervals-1; j++) { - crc = lzma_crc64(tgt, increment, 0); + crc = lzma_crc64(tgt, increment/2, 0); *((uint64_t *)sim_ck) = crc; tgt += increment; len -= increment; @@ -880,8 +901,7 @@ process_blocks: seg_offset = db_segcache_pos(cfg, ctx->id); src = (uchar_t *)&(ctx->g_blocks[i]); len = blks * sizeof (global_blockentry_t); - db_segcache_write(cfg, ctx->id, src, len, blks-i, - ctx->file_offset); + db_segcache_write(cfg, ctx->id, src, len, blks-i, ctx->file_offset); /* * Insert current segment blocks into local hashtable and do partial @@ -903,7 +923,7 @@ process_blocks: } else { be = htab[hent]; do { - if (memcmp(ctx->g_blocks[k].cksum, + if (ckcmp(ctx->g_blocks[k].cksum, be->cksum, cfg->chunk_cksum_sz) == 0 && ctx->g_blocks[k].length == be->length) { global_blockentry_t *en; @@ -938,7 +958,11 @@ process_blocks: hash_entry_t *he; he = db_lookup_insert_s(cfg, sim_ck, 0, seg_offset, 0, 1); - if (he) { + + /* + * If match found also check that match is not with self! + */ + if (he && he->item_offset != seg_offset) { /* * Match found. Load segment metadata from disk and perform * identity deduplication with the segment chunks. @@ -967,7 +991,7 @@ process_blocks: do { if (be->length & RABIN_INDEX_FLAG) goto next_ent; - if (memcmp(seg_blocks[k].cksum, + if (ckcmp(seg_blocks[k].cksum, be->cksum, cfg->chunk_cksum_sz) == 0 && seg_blocks[k].length == be->length) { be->length = (be->length | @@ -985,7 +1009,6 @@ next_ent: } while(1); } } - break; } sim_ck -= cfg->similarity_cksum_sz; } @@ -1400,7 +1423,7 @@ dedupe_decompress(dedupe_context_t *ctx, uchar_t *buf, uint64_t *size) len = LE32(*((uint32_t *)g_dedupe_idx)); g_dedupe_idx += RABIN_ENTRY_SIZE; ++blk; - flag = len & GLOBAL_FLAG; + flag = len & RABIN_INDEX_FLAG; len &= RABIN_INDEX_VALUE; if (sz + len > data_sz) {