diff --git a/rabin/global/config.c b/rabin/global/config.c index 9f9f058..28543ed 100644 --- a/rabin/global/config.c +++ b/rabin/global/config.c @@ -175,7 +175,8 @@ read_config(char *configfile, archive_config_t *cfg) // Default cfg->verify_chunks = 0; cfg->algo = COMPRESS_LZ4; - cfg->chunk_cksum_type = DEFAULT_CKSUM; + cfg->chunk_cksum_type = DEFAULT_CHUNK_CKSUM; + cfg->similarity_cksum = DEFAULT_SIMILARITY_CKSUM; cfg->pct_interval = DEFAULT_SIMILARITY_INTERVAL; fh = fopen(configfile, "r"); @@ -262,11 +263,19 @@ read_config(char *configfile, archive_config_t *cfg) fclose(fh); return (1); } + } else if (strncmp(line, "SIMILARITY_CKSUM", 16) == 0) { /* 16 == strlen("SIMILARITY_CKSUM") */ + cfg->similarity_cksum = get_cksum_type(pos); + if (cfg->similarity_cksum == CKSUM_INVALID) { + fprintf(stderr, "Invalid SIMILARITY_CKSUM setting.\n"); + fclose(fh); + return (1); + } } } fclose(fh); cfg->compress_level = get_compress_level(cfg->algo); cfg->chunk_cksum_sz = get_cksum_sz(cfg->chunk_cksum_type); + cfg->similarity_cksum_sz = get_cksum_sz(cfg->similarity_cksum); /* * Now compute the remaining parameters. 
@@ -330,13 +339,15 @@ write_config(char *configfile, archive_config_t *cfg) } int -set_config_s(archive_config_t *cfg, compress_algo_t algo, cksum_t ck, uint32_t chunksize, - size_t file_sz, int pct_interval) +set_config_s(archive_config_t *cfg, compress_algo_t algo, cksum_t ck, cksum_t ck_sim, + uint32_t chunksize, size_t file_sz, int pct_interval) { cfg->algo = algo; cfg->chunk_cksum_type = ck; + cfg->similarity_cksum = ck_sim; cfg->compress_level = get_compress_level(cfg->algo); cfg->chunk_cksum_sz = get_cksum_sz(cfg->chunk_cksum_type); + cfg->similarity_cksum_sz = get_cksum_sz(cfg->similarity_cksum); cfg->chunk_sz = chunksize; cfg->chunk_sz_bytes = RAB_BLK_AVG_SZ(cfg->chunk_sz); cfg->pct_interval = pct_interval; diff --git a/rabin/global/config.h b/rabin/global/config.h index 58574f8..bd3025f 100644 --- a/rabin/global/config.h +++ b/rabin/global/config.h @@ -29,7 +29,8 @@ extern "C" { #endif #define DEFAULT_SIMILARITY_INTERVAL 5 -#define DEFAULT_CKSUM CKSUM_BLAKE256 +#define DEFAULT_CHUNK_CKSUM CKSUM_SHA256 +#define DEFAULT_SIMILARITY_CKSUM CKSUM_BLAKE256 #define DEFAULT_COMPRESS COMPRESS_LZ4 #define MIN_CK 1 #define MAX_CK 5 @@ -44,8 +45,10 @@ typedef struct { int verify_chunks; // Whether to use memcmp() to compare chunks byte for byte. int algo; // Which compression algo for segments. compress_algo_t compress_level; // Default preset compression level per algo. - cksum_t chunk_cksum_type; // Which digest to use for hash based chunk lookup. + cksum_t chunk_cksum_type; // Which digest to use for hash based chunk comparison. + cksum_t similarity_cksum; // Which digest to use for similarity based segment lookup. int chunk_cksum_sz; // Size of cksum in bytes. + int similarity_cksum_sz; // Size of similarity cksum in bytes. int pct_interval; // Similarity based match intervals in %age. // The items below are computed given the above // components. 
@@ -60,9 +63,15 @@ typedef struct { void *dbdata; } archive_config_t; +typedef struct _segment_entry { + uint64_t offset; + uint32_t length; + uchar_t *cksum; +} segment_entry_t; + int read_config(char *configfile, archive_config_t *cfg); int write_config(char *configfile, archive_config_t *cfg); -int set_config_s(archive_config_t *cfg, compress_algo_t algo, cksum_t ck, +int set_config_s(archive_config_t *cfg, compress_algo_t algo, cksum_t ck, cksum_t ck_sim, uint32_t chunksize, size_t file_sz, int pct_interval); #ifdef __cplusplus diff --git a/rabin/global/initdb.c b/rabin/global/db.c similarity index 81% rename from rabin/global/initdb.c rename to rabin/global/db.c index 2fd548b..a61a26b 100644 --- a/rabin/global/initdb.c +++ b/rabin/global/db.c @@ -31,7 +31,7 @@ #include #include -#include "initdb.h" +#include "db.h" #include "config.h" #define ONE_PB (1125899906842624ULL) @@ -43,10 +43,11 @@ * Hashtable structures for in-memory index. */ typedef struct _hash_entry { - uchar_t *cksum; + segment_entry_t *seg; struct _hash_entry *next; struct _hash_entry *lru_prev; struct _hash_entry *lru_next; + uchar_t cksum[1]; } hash_entry_t; typedef struct { @@ -54,12 +55,13 @@ typedef struct { } htab_t; typedef struct { - htab_t *htablst; + htab_t *list; pthread_mutex_t *mlist; hash_entry_t *lru_head; hash_entry_t *lru_tail; uint64_t memlimit; uint64_t memused; + int hash_entry_size; } htablst_t; archive_config_t * @@ -83,14 +85,15 @@ init_global_db(char *configfile) archive_config_t * init_global_db_s(char *path, uint32_t chunksize, int pct_interval, - compress_algo_t algo, cksum_t ck, size_t file_sz, size_t memlimit) + compress_algo_t algo, cksum_t ck, cksum_t ck_sim, size_t file_sz, + size_t memlimit) { archive_config_t *cfg; int rv; float diff; cfg = calloc(1, sizeof (archive_config_t)); - rv = set_config_s(cfg, algo, ck, chunksize, file_sz, chunks_per_seg, pct_interval); + rv = set_config_s(cfg, algo, ck, ck_sim, chunksize, file_sz, pct_interval); if 
(path != NULL) { printf("Disk based index not yet implemented.\n"); @@ -100,24 +103,25 @@ init_global_db_s(char *path, uint32_t chunksize, int pct_interval, uint32_t hash_slots, intervals, i; uint64_t memreqd; htablst_t *htablst; + int hash_entry_size; // Compute total hashtable entries first intervals = 100 / pct_interval - 1; hash_slots = file_sz / cfg->segment_sz_bytes + 1; hash_slots *= intervals; + hash_entry_size = sizeof (hash_entry_t) + cfg->similarity_cksum_sz - 1; // Compute memory required to hold all hash entries assuming worst case 50% // occupancy. - memreqd = hash_slots * (sizeof (hash_entry_t) + cfg->chunk_cksum_sz + - sizeof (hash_entry_t *) + (sizeof (hash_entry_t *)) / 2); + memreqd = hash_slots * (hash_entry_size + sizeof (hash_entry_t *) + + (sizeof (hash_entry_t *)) / 2); memreqd += hash_slots * sizeof (hash_entry_t **); diff = (float)pct_interval / 100.0; // Reduce hash_slots to remain within memlimit while (memreqd > memlimit) { hash_slots -= (hash_slots * diff); - memreqd = hash_slots * (sizeof (hash_entry_t) + - cfg->chunk_cksum_sz + sizeof (hash_entry_t *) + + memreqd = hash_slots * (hash_entry_size + sizeof (hash_entry_t *) + (sizeof (hash_entry_t *)) / 2); memreqd += hash_slots * sizeof (hash_entry_t **); } @@ -126,17 +130,18 @@ init_global_db_s(char *path, uint32_t chunksize, int pct_interval, // each having hash_slots / intervals slots. 
htablst = calloc(1, sizeof (htablst_t)); htablst->memlimit = memlimit; - htablst->htablst = (htab_t *)calloc(intervals, sizeof (htab_t)); + htablst->list = (htab_t *)calloc(intervals, sizeof (htab_t)); htablst->mlist = (pthread_mutex_t *)malloc(intervals * sizeof (pthread_mutex_t)); + htablst->hash_entry_size = hash_entry_size; for (i = 0; i < intervals; i++) { - htablst->htablst[i].htab = (hash_entry_t **)calloc(hash_slots / intervals, + htablst->list[i].htab = (hash_entry_t **)calloc(hash_slots / intervals, sizeof (hash_entry_t *)); htablst->memused += ((hash_slots / intervals) * (sizeof (hash_entry_t *))); pthread_mutex_init(&(htablst->mlist[i]), NULL); } cfg->dbdata = htablst; - slab_cache_add(sizeof (hash_entry_t)); + slab_cache_add(hash_entry_size); slab_cache_add(cfg->chunk_cksum_sz); } return (cfg); diff --git a/rabin/global/initdb.h b/rabin/global/db.h similarity index 89% rename from rabin/global/initdb.h rename to rabin/global/db.h index f51b968..2220c18 100644 --- a/rabin/global/initdb.h +++ b/rabin/global/db.h @@ -18,8 +18,8 @@ * moinakg@belenix.org, http://moinakg.wordpress.com/ */ -#ifndef _INITDB_H -#define _INITDB_H +#ifndef _DB_H +#define _DB_H #ifdef __cplusplus extern "C" { @@ -27,7 +27,8 @@ extern "C" { archive_config_t *init_global_db(char *configfile); archive_config_t *init_global_db_s(char *path, uint32_t chunksize, int pct_interval, - compress_algo_t algo, cksum_t ck, size_t file_sz, size_t memlimit); + compress_algo_t algo, cksum_t ck, cksum_t ck_sim, size_t file_sz, + size_t memlimit); #ifdef __cplusplus }