From 24d62bfde975ec61ef81bed9540176794a97d6cf Mon Sep 17 00:00:00 2001 From: Moinak Ghosh Date: Thu, 14 Feb 2013 23:10:53 +0530 Subject: [PATCH] Global dedupe work in progress. --- rabin/global/config.c | 13 +++++-- rabin/global/config.h | 10 +++--- rabin/global/initdb.c | 84 +++++++++++++++++++++++++++++++++++++++++-- rabin/global/initdb.h | 4 +-- rabin/rabin_dedup.h | 5 +++ 5 files changed, 104 insertions(+), 12 deletions(-) diff --git a/rabin/global/config.c b/rabin/global/config.c index bbba2a1..822707a 100644 --- a/rabin/global/config.c +++ b/rabin/global/config.c @@ -176,7 +176,7 @@ read_config(char *configfile, archive_config_t *cfg) cfg->verify_chunks = 0; cfg->algo = COMPRESS_LZ4; cfg->chunk_cksum_type = DEFAULT_CKSUM; - cfg->similarity_interval = DEFAULT_SIMILARITY_INTERVAL; + cfg->pct_interval = DEFAULT_SIMILARITY_INTERVAL; fh = fopen(configfile, "r"); if (fh == NULL) { @@ -286,6 +286,7 @@ read_config(char *configfile, archive_config_t *cfg) cfg->directory_levels = 3; } + cfg->segment_sz_bytes = segment_sz_bytes; cfg->segment_sz = segment_sz_bytes / cfg->chunk_sz_bytes; total_dirs = 1; @@ -329,8 +330,8 @@ write_config(char *configfile, archive_config_t *cfg) } int -set_simple_config(archive_config_t *cfg, compress_algo_t algo, cksum_t ck, uint32_t chunksize, - size_t file_sz, uint32_t chunks_per_seg) +set_config_s(archive_config_t *cfg, compress_algo_t algo, cksum_t ck, uint32_t chunksize, + size_t file_sz, int pct_interval) { cfg->algo = algo; cfg->chunk_cksum_type = ck; @@ -338,6 +339,7 @@ set_simple_config(archive_config_t *cfg, compress_algo_t algo, cksum_t ck, uint3 cfg->chunk_cksum_sz = get_cksum_sz(cfg->chunk_cksum_type); cfg->chunk_sz = chunksize; cfg->chunk_sz_bytes = RAB_BLK_AVG_SZ(cfg->chunk_sz); + cfg->pct_interval = pct_interval; cfg->archive_sz = file_sz; if (cfg->archive_sz < ONE_TB) { @@ -346,5 +348,10 @@ set_simple_config(archive_config_t *cfg, compress_algo_t algo, cksum_t ck, uint3 } else { segment_sz_bytes = EIGHT_MB; } + + cfg->segment_sz_bytes = segment_sz_bytes; + cfg->segment_sz = segment_sz_bytes / cfg->chunk_sz_bytes; + + return (0); } diff --git a/rabin/global/config.h b/rabin/global/config.h index 91199be..58574f8 100644 --- a/rabin/global/config.h +++ b/rabin/global/config.h @@ -46,22 +46,24 @@ typedef struct { compress_algo_t compress_level; // Default preset compression level per algo. cksum_t chunk_cksum_type; // Which digest to use for hash based chunk lookup. int chunk_cksum_sz; // Size of cksum in bytes. - int similarity_interval; // Similarity based match intervals in %age. + int pct_interval; // Similarity based match intervals in %age. // The items below are computed given the above // components. uint32_t chunk_sz_bytes; // Average chunk size - uint32_t segment_sz; // Number of chunks + uint32_t segment_sz_bytes; // Segment size in bytes + uint32_t segment_sz; // Number of chunks in one segment uint32_t container_sz; // Number of segments int directory_fanout; // Number of subdirectories in a directory int directory_levels; // Levels of nested directories int num_containers; // Number of containers in a directory + void *dbdata; } archive_config_t; int read_config(char *configfile, archive_config_t *cfg); int write_config(char *configfile, archive_config_t *cfg); -int set_simple_config(archive_config_t *cfg, compress_algo_t algo, cksum_t ck, - uint32_t chunksize, size_t file_sz, uint32_t chunks_per_seg); +int set_config_s(archive_config_t *cfg, compress_algo_t algo, cksum_t ck, + uint32_t chunksize, size_t file_sz, int pct_interval); #ifdef __cplusplus } diff --git a/rabin/global/initdb.c b/rabin/global/initdb.c index 96fdcee..2fd548b 100644 --- a/rabin/global/initdb.c +++ b/rabin/global/initdb.c @@ -28,7 +28,8 @@ #include #include #include -#include +#include +#include #include "initdb.h" #include "config.h" @@ -38,6 +39,28 @@ #define FOUR_MB (4194304ULL) #define EIGHT_MB (8388608ULL) +/* + * Hashtable structures for in-memory index. + */ +typedef struct _hash_entry { + uchar_t *cksum; + struct _hash_entry *next; + struct _hash_entry *lru_prev; + struct _hash_entry *lru_next; +} hash_entry_t; + +typedef struct { + hash_entry_t **htab; +} htab_t; + +typedef struct { + htab_t *htablst; + pthread_mutex_t *mlist; + hash_entry_t *lru_head; + hash_entry_t *lru_tail; + uint64_t memlimit; + uint64_t memused; +} htablst_t; archive_config_t * init_global_db(char *configfile) @@ -59,12 +82,67 @@ init_global_db(char *configfile) } archive_config_t * -init_global_db_simple(char *path, uint32_t chunksize, uint32_t chunks_per_seg, +init_global_db_s(char *path, uint32_t chunksize, int pct_interval, compress_algo_t algo, cksum_t ck, size_t file_sz, size_t memlimit) { archive_config_t *cfg; int rv; + float diff; cfg = calloc(1, sizeof (archive_config_t)); - rv = set_simple_config(cfg, algo, ck, chunksize, file_sz, chunks_per_seg); + rv = set_config_s(cfg, algo, ck, chunksize, file_sz, chunks_per_seg, pct_interval); + + if (path != NULL) { + printf("Disk based index not yet implemented.\n"); + free(cfg); + return (NULL); + } else { + uint32_t hash_slots, intervals, i; + uint64_t memreqd; + htablst_t *htablst; + + // Compute total hashtable entries first + intervals = 100 / pct_interval - 1; + hash_slots = file_sz / cfg->segment_sz_bytes + 1; + hash_slots *= intervals; + + // Compute memory required to hold all hash entries assuming worst case 50% + // occupancy. + memreqd = hash_slots * (sizeof (hash_entry_t) + cfg->chunk_cksum_sz + + sizeof (hash_entry_t *) + (sizeof (hash_entry_t *)) / 2); + memreqd += hash_slots * sizeof (hash_entry_t **); + diff = (float)pct_interval / 100.0; + + // Reduce hash_slots to remain within memlimit + while (memreqd > memlimit) { + hash_slots -= (hash_slots * diff); + memreqd = hash_slots * (sizeof (hash_entry_t) + + cfg->chunk_cksum_sz + sizeof (hash_entry_t *) + + (sizeof (hash_entry_t *)) / 2); + memreqd += hash_slots * sizeof (hash_entry_t **); + } + + // Now create as many hash tables as there are similarity match intervals + // each having hash_slots / intervals slots. + htablst = calloc(1, sizeof (htablst_t)); + htablst->memlimit = memlimit; + htablst->htablst = (htab_t *)calloc(intervals, sizeof (htab_t)); + htablst->mlist = (pthread_mutex_t *)malloc(intervals * sizeof (pthread_mutex_t)); + + for (i = 0; i < intervals; i++) { + htablst->htablst[i].htab = (hash_entry_t **)calloc(hash_slots / intervals, + sizeof (hash_entry_t *)); + htablst->memused += ((hash_slots / intervals) * (sizeof (hash_entry_t *))); + pthread_mutex_init(&(htablst->mlist[i]), NULL); + } + cfg->dbdata = htablst; + slab_cache_add(sizeof (hash_entry_t)); + slab_cache_add(cfg->chunk_cksum_sz); + } + return (cfg); +} + +int +db_insert_s(archive_config_t *cfg, uchar_t *cksum, int interval_num) +{ } diff --git a/rabin/global/initdb.h b/rabin/global/initdb.h index 803fd06..f51b968 100644 --- a/rabin/global/initdb.h +++ b/rabin/global/initdb.h @@ -26,8 +26,8 @@ extern "C" { #endif archive_config_t *init_global_db(char *configfile); -archive_config_t *init_global_db_simple(char *path, uint32_t chunksize, uint32_t chunks_per_seg, - compress_algo_t algo, cksum_t ck, size_t file_sz, size_t memlimit) +archive_config_t *init_global_db_s(char *path, uint32_t chunksize, int pct_interval, + compress_algo_t algo, cksum_t ck, size_t file_sz, size_t memlimit); #ifdef __cplusplus } diff --git a/rabin/rabin_dedup.h b/rabin/rabin_dedup.h index 1feae91..d4d3424 100644 --- a/rabin/rabin_dedup.h +++ b/rabin/rabin_dedup.h @@ -149,6 +149,11 @@ typedef struct rab_blockentry { struct rab_blockentry *next; } rabin_blockentry_t; +typedef struct global_blockentry { + uint64_t offset; + uint32_t length; +} global_blockentry_t; + typedef struct { unsigned char *current_window_data; rabin_blockentry_t **blocks;