Several fixes and optimizations.

Moinak Ghosh 2013-04-22 19:52:18 +05:30
parent c0b4aa0116
commit 6b23f6a73a
4 changed files with 50 additions and 12 deletions

.gitignore

@@ -1,5 +1,6 @@
 files.lst
 *.pc*
+.seg*
 *.o
 *.so
 pcompress

View file

@ -37,7 +37,7 @@ extern "C" {
#define DEFAULT_CHUNK_CKSUM CKSUM_SHA256 #define DEFAULT_CHUNK_CKSUM CKSUM_SHA256
#define DEFAULT_SIMILARITY_CKSUM CKSUM_BLAKE256 #define DEFAULT_SIMILARITY_CKSUM CKSUM_BLAKE256
#define DEFAULT_COMPRESS COMPRESS_LZ4 #define DEFAULT_COMPRESS COMPRESS_LZ4
#define DEFAULT_PCT_INTERVAL 8 #define DEFAULT_PCT_INTERVAL 10
#define CONTAINER_ITEMS 2048 #define CONTAINER_ITEMS 2048
#define MIN_CK 1 #define MIN_CK 1
#define MAX_CK 5 #define MAX_CK 5
@@ -55,6 +55,7 @@ typedef enum {
 struct seg_map_fd {
 	int fd;
 	void *mapping;
+	uint64_t cache_offset;
 	uint32_t len;
 };

View file

@@ -326,6 +326,19 @@ db_segcache_map(archive_config_t *cfg, int tid, uint32_t *blknum, uint64_t *offs
 	uint32_t len, adj;
 	uint64_t pos;
 
+	/*
+	 * If same mapping is re-attempted just return the pointer into the
+	 * existing mapping.
+	 */
+	adj = *offset % cfg->pagesize;
+	if (*offset == cfg->seg_fd_r[tid].cache_offset && cfg->seg_fd_r[tid].mapping) {
+		hdr = (uchar_t *)(cfg->seg_fd_r[tid].mapping) + adj;
+		*blknum = *((uint32_t *)(hdr));
+		*offset = *((uint64_t *)(hdr + 4));
+		*blocks = hdr + SEGCACHE_HDR_SZ;
+		return (0);
+	}
+
 	/*
	 * Ensure previous mapping is removed.
	 */
@@ -344,11 +357,11 @@ db_segcache_map(archive_config_t *cfg, int tid, uint32_t *blknum, uint64_t *offs
 	if (pos - *offset < len)
 		len = pos - *offset;
 
-	adj = *offset % cfg->pagesize;
 	mapbuf = mmap(NULL, len + adj, PROT_READ, MAP_SHARED, fd, *offset - adj);
 	if (mapbuf == MAP_FAILED)
 		return (-1);
 
+	cfg->seg_fd_r[tid].cache_offset = *offset;
 	hdr = mapbuf + adj;
 	*blknum = *((uint32_t *)(hdr));
 	*offset = *((uint64_t *)(hdr + 4));
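The two hunks above make up the mmap caching optimization: db_segcache_map() now records the requested offset in the new cache_offset field, and a repeated request for the same offset returns a pointer into the live mapping instead of doing another munmap()/mmap() round trip. A minimal sketch of the same reuse pattern follows; the names (cached_map, cached_map_get) are hypothetical and not the pcompress API.

/*
 * Sketch only: reuse an existing read-only mapping when the same file
 * offset is requested again, as db_segcache_map() now does.
 */
#include <stdint.h>
#include <stddef.h>
#include <sys/mman.h>

struct cached_map {
	int      fd;            /* file being mapped */
	void    *mapping;       /* current mmap() region, or NULL */
	size_t   len;           /* length of the current mapping */
	uint64_t cache_offset;  /* file offset the mapping was created for */
};

static void *
cached_map_get(struct cached_map *cm, uint64_t offset, size_t want, size_t pagesize)
{
	size_t adj = offset % pagesize;

	/* Fast path: same offset as last time and a mapping is still live. */
	if (cm->mapping && offset == cm->cache_offset)
		return ((char *)cm->mapping + adj);

	/* Slow path: drop the old mapping and create a fresh one. */
	if (cm->mapping)
		munmap(cm->mapping, cm->len);
	cm->len = want + adj;
	cm->mapping = mmap(NULL, cm->len, PROT_READ, MAP_SHARED, cm->fd, offset - adj);
	if (cm->mapping == MAP_FAILED) {
		cm->mapping = NULL;
		return (NULL);
	}
	cm->cache_offset = offset;
	return ((char *)cm->mapping + adj);
}

In the real function the fast path also re-reads the segment header (block count and next offset) through the cached pointer, since *blknum and *offset are output parameters.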

View file

@ -309,7 +309,8 @@ create_dedupe_context(uint64_t chunksize, uint64_t real_chunksize, int rab_blk_s
} }
if (arc && dedupe_flag == RABIN_DEDUPE_FILE_GLOBAL) { if (arc && dedupe_flag == RABIN_DEDUPE_FILE_GLOBAL) {
ctx->similarity_cksums = (uchar_t *)slab_calloc(NULL, arc->intervals + arc->sub_intervals, ctx->similarity_cksums = (uchar_t *)slab_calloc(NULL,
arc->intervals + arc->sub_intervals,
arc->similarity_cksum_sz); arc->similarity_cksum_sz);
if (!ctx->similarity_cksums) { if (!ctx->similarity_cksums) {
fprintf(stderr, fprintf(stderr,
@@ -390,6 +391,26 @@ cmpint(const void *a, const void *b)
 	return (1);
 }
 
+static inline int
+ckcmp(uchar_t *a, uchar_t *b, int sz)
+{
+	size_t *v1 = (size_t *)a;
+	size_t *v2 = (size_t *)b;
+	int len;
+
+	len = 0;
+	do {
+		if (*v1 != *v2) {
+			return (1);
+		}
+		++v1;
+		++v2;
+		len += sizeof (size_t);
+	} while (len < sz);
+	return (0);
+}
+
 /**
  * Perform Deduplication.
  * Both Semi-Rabin fingerprinting based and Fixed Block Deduplication are supported.
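ckcmp() is used further down in place of memcmp() for the block-identity checks: the chunk checksums compared there are fixed-size digests whose length is a multiple of sizeof (size_t), so a word-at-a-time comparison that bails out on the first differing word is enough, and it only needs to answer equal/not-equal rather than ordering. A small self-contained usage sketch follows; the 32-byte digest size and buffer contents are made up for illustration.

#include <stdio.h>
#include <string.h>
#include <stddef.h>

typedef unsigned char uchar_t;

/*
 * Word-at-a-time equality check as added in this commit. Assumes sz is a
 * multiple of sizeof (size_t) and both buffers are word aligned; returns
 * 0 on equal, 1 otherwise (no ordering, unlike memcmp).
 */
static inline int
ckcmp(uchar_t *a, uchar_t *b, int sz)
{
	size_t *v1 = (size_t *)a;
	size_t *v2 = (size_t *)b;
	int len = 0;

	do {
		if (*v1 != *v2)
			return (1);
		++v1;
		++v2;
		len += sizeof (size_t);
	} while (len < sz);
	return (0);
}

int
main(void)
{
	/* Two 32-byte digests (SHA-256 sized); the union keeps them word aligned. */
	union { uchar_t b[32]; size_t w[32 / sizeof (size_t)]; } ck1, ck2;

	memset(ck1.b, 0xab, sizeof (ck1.b));
	memcpy(ck2.b, ck1.b, sizeof (ck2.b));
	printf("equal: %s\n", ckcmp(ck1.b, ck2.b, sizeof (ck1.b)) == 0 ? "yes" : "no");

	ck2.b[31] ^= 1;	/* flip one bit in the last word */
	printf("equal: %s\n", ckcmp(ck1.b, ck2.b, sizeof (ck1.b)) == 0 ? "yes" : "no");
	return (0);
}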
@@ -859,7 +880,7 @@ process_blocks:
 			increment = length / cfg->intervals;
 			for (j=0; j<cfg->intervals-1; j++) {
-				crc = lzma_crc64(tgt, increment, 0);
+				crc = lzma_crc64(tgt, increment/2, 0);
 				*((uint64_t *)sim_ck) = crc;
 				tgt += increment;
 				len -= increment;
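The increment/2 change above means each of the cfg->intervals similarity fingerprints is now a CRC64 over only the first half of its interval, roughly halving the hashing work per segment while still sampling every interval. A standalone sketch of that sampling pattern follows; the segment buffer, interval count and sizes are invented, and it links against liblzma for lzma_crc64() (build with -llzma).

/* Sketch: per-interval similarity fingerprints over half of each interval. */
#include <stdint.h>
#include <inttypes.h>
#include <stdio.h>
#include <string.h>
#include <lzma.h>

int
main(void)
{
	uint8_t seg[8192];          /* stand-in for one segment's data */
	uint64_t sim_ck[4];         /* one fingerprint per interval */
	uint32_t intervals = 4;
	uint32_t increment = sizeof (seg) / intervals;
	uint8_t *tgt = seg;
	uint32_t j;

	memset(seg, 0x5a, sizeof (seg));
	for (j = 0; j < intervals; j++) {
		/* CRC64 over only the first half of the interval. */
		sim_ck[j] = lzma_crc64(tgt, increment / 2, 0);
		tgt += increment;
	}
	for (j = 0; j < intervals; j++)
		printf("interval %" PRIu32 ": %016" PRIx64 "\n", j, sim_ck[j]);
	return (0);
}

In the diff the loop runs to intervals-1 and the final interval is handled after the loop over whatever length remains; the sketch skips that detail.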
@@ -880,8 +901,7 @@ process_blocks:
 			seg_offset = db_segcache_pos(cfg, ctx->id);
 			src = (uchar_t *)&(ctx->g_blocks[i]);
 			len = blks * sizeof (global_blockentry_t);
-			db_segcache_write(cfg, ctx->id, src, len, blks-i,
-			    ctx->file_offset);
+			db_segcache_write(cfg, ctx->id, src, len, blks-i, ctx->file_offset);
 
 			/*
 			 * Insert current segment blocks into local hashtable and do partial
@@ -903,7 +923,7 @@ process_blocks:
 			} else {
 				be = htab[hent];
 				do {
-					if (memcmp(ctx->g_blocks[k].cksum,
+					if (ckcmp(ctx->g_blocks[k].cksum,
 					    be->cksum, cfg->chunk_cksum_sz) == 0 &&
 					    ctx->g_blocks[k].length == be->length) {
 						global_blockentry_t *en;
@@ -938,7 +958,11 @@ process_blocks:
 			hash_entry_t *he;
 
 			he = db_lookup_insert_s(cfg, sim_ck, 0, seg_offset, 0, 1);
-			if (he) {
+			/*
+			 * If match found also check that match is not with self!
+			 */
+			if (he && he->item_offset != seg_offset) {
 				/*
 				 * Match found. Load segment metadata from disk and perform
 				 * identity deduplication with the segment chunks.
@@ -967,7 +991,7 @@ process_blocks:
 					do {
 						if (be->length & RABIN_INDEX_FLAG)
 							goto next_ent;
-						if (memcmp(seg_blocks[k].cksum,
+						if (ckcmp(seg_blocks[k].cksum,
 						    be->cksum, cfg->chunk_cksum_sz) == 0 &&
 						    seg_blocks[k].length == be->length) {
 							be->length = (be->length |
@@ -985,7 +1009,6 @@ next_ent:
 					} while(1);
 				}
 			}
-			break;
 		}
 		sim_ck -= cfg->similarity_cksum_sz;
 	}
@@ -1400,7 +1423,7 @@ dedupe_decompress(dedupe_context_t *ctx, uchar_t *buf, uint64_t *size)
 		len = LE32(*((uint32_t *)g_dedupe_idx));
 		g_dedupe_idx += RABIN_ENTRY_SIZE;
 		++blk;
-		flag = len & GLOBAL_FLAG;
+		flag = len & RABIN_INDEX_FLAG;
 		len &= RABIN_INDEX_VALUE;
 		if (sz + len > data_sz) {
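The last hunk fixes the decode side of the dedupe index: each index entry packs a length (or reference) value and a flag bit into one 32-bit word, and dedupe_decompress() must mask with the same RABIN_INDEX_FLAG bit the compress path sets, not GLOBAL_FLAG. A toy round trip of that kind of packing follows; the mask values are illustrative only, not the actual pcompress macros.

/*
 * Toy example of packing a flag bit plus a length into one uint32_t.
 * The MASK values below are illustrative, not pcompress's real macros.
 */
#include <stdint.h>
#include <stdio.h>

#define ENTRY_FLAG  0x80000000u   /* high bit: "this is an index/ref entry" */
#define ENTRY_VALUE 0x7fffffffu   /* low 31 bits: length or index value */

int
main(void)
{
	uint32_t len = 4096;
	uint32_t entry = len | ENTRY_FLAG;   /* encode: set the flag on a length */

	/*
	 * Decode: split the word back into flag and value, as the
	 * dedupe_decompress() loop does with RABIN_INDEX_FLAG/VALUE.
	 */
	uint32_t flag = entry & ENTRY_FLAG;
	uint32_t value = entry & ENTRY_VALUE;

	printf("flag=%d value=%u\n", flag != 0, (unsigned int)value);
	return (0);
}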