Work in progress global dedupe.

This commit is contained in:
Moinak Ghosh 2013-02-16 23:33:06 +05:30
parent f89473d29c
commit 7386f82a4f
4 changed files with 47 additions and 21 deletions

View file

@ -175,7 +175,8 @@ read_config(char *configfile, archive_config_t *cfg)
// Default // Default
cfg->verify_chunks = 0; cfg->verify_chunks = 0;
cfg->algo = COMPRESS_LZ4; cfg->algo = COMPRESS_LZ4;
cfg->chunk_cksum_type = DEFAULT_CKSUM; cfg->chunk_cksum_type = DEFAULT_CHUNK_CKSUM;
cfg->similarity_cksum = DEFAULT_SIMILARITY_CKSUM;
cfg->pct_interval = DEFAULT_SIMILARITY_INTERVAL; cfg->pct_interval = DEFAULT_SIMILARITY_INTERVAL;
fh = fopen(configfile, "r"); fh = fopen(configfile, "r");
@ -262,11 +263,19 @@ read_config(char *configfile, archive_config_t *cfg)
fclose(fh); fclose(fh);
return (1); return (1);
} }
} else if (strncmp(line, "SIMILARITY_CKSUM", 16) == 0) {
	/*
	 * SIMILARITY_CKSUM selects the digest kept in cfg->similarity_cksum.
	 * It must not touch cfg->chunk_cksum_type: the two digests are
	 * configured independently and similarity_cksum_sz is later derived
	 * from similarity_cksum. 16 is the length of the key being matched.
	 */
	cfg->similarity_cksum = get_cksum_type(pos);
	if (cfg->similarity_cksum == CKSUM_INVALID) {
		fprintf(stderr, "Invalid SIMILARITY_CKSUM setting.\n");
		fclose(fh);
		return (1);
	}
} }
} }
fclose(fh); fclose(fh);
cfg->compress_level = get_compress_level(cfg->algo); cfg->compress_level = get_compress_level(cfg->algo);
cfg->chunk_cksum_sz = get_cksum_sz(cfg->chunk_cksum_type); cfg->chunk_cksum_sz = get_cksum_sz(cfg->chunk_cksum_type);
cfg->similarity_cksum_sz = get_cksum_sz(cfg->similarity_cksum);
/* /*
* Now compute the remaining parameters. * Now compute the remaining parameters.
@ -330,13 +339,15 @@ write_config(char *configfile, archive_config_t *cfg)
} }
int int
set_config_s(archive_config_t *cfg, compress_algo_t algo, cksum_t ck, uint32_t chunksize, set_config_s(archive_config_t *cfg, compress_algo_t algo, cksum_t ck, cksum_t ck_sim,
size_t file_sz, int pct_interval) uint32_t chunksize, size_t file_sz, int pct_interval)
{ {
cfg->algo = algo; cfg->algo = algo;
cfg->chunk_cksum_type = ck; cfg->chunk_cksum_type = ck;
cfg->similarity_cksum = ck_sim;
cfg->compress_level = get_compress_level(cfg->algo); cfg->compress_level = get_compress_level(cfg->algo);
cfg->chunk_cksum_sz = get_cksum_sz(cfg->chunk_cksum_type); cfg->chunk_cksum_sz = get_cksum_sz(cfg->chunk_cksum_type);
cfg->similarity_cksum_sz = get_cksum_sz(cfg->similarity_cksum);
cfg->chunk_sz = chunksize; cfg->chunk_sz = chunksize;
cfg->chunk_sz_bytes = RAB_BLK_AVG_SZ(cfg->chunk_sz); cfg->chunk_sz_bytes = RAB_BLK_AVG_SZ(cfg->chunk_sz);
cfg->pct_interval = pct_interval; cfg->pct_interval = pct_interval;

View file

@ -29,7 +29,8 @@ extern "C" {
#endif #endif
#define DEFAULT_SIMILARITY_INTERVAL 5 #define DEFAULT_SIMILARITY_INTERVAL 5
#define DEFAULT_CKSUM CKSUM_BLAKE256 #define DEFAULT_CHUNK_CKSUM CKSUM_SHA256
#define DEFAULT_SIMILARITY_CKSUM CKSUM_BLAKE256
#define DEFAULT_COMPRESS COMPRESS_LZ4 #define DEFAULT_COMPRESS COMPRESS_LZ4
#define MIN_CK 1 #define MIN_CK 1
#define MAX_CK 5 #define MAX_CK 5
@ -44,8 +45,10 @@ typedef struct {
int verify_chunks; // Whether to use memcmp() to compare chunks byte for byte. int verify_chunks; // Whether to use memcmp() to compare chunks byte for byte.
int algo; // Which compression algo for segments. int algo; // Which compression algo for segments.
compress_algo_t compress_level; // Default preset compression level per algo. compress_algo_t compress_level; // Default preset compression level per algo.
cksum_t chunk_cksum_type; // Which digest to use for hash based chunk lookup. cksum_t chunk_cksum_type; // Which digest to use for hash based chunk comparison.
cksum_t similarity_cksum; // Which digest to use for similarity based segment lookup.
int chunk_cksum_sz; // Size of cksum in bytes. int chunk_cksum_sz; // Size of cksum in bytes.
int similarity_cksum_sz; // Size of similarity cksum in bytes.
int pct_interval; // Similarity based match intervals in %age. int pct_interval; // Similarity based match intervals in %age.
// The items below are computed given the above // The items below are computed given the above
// components. // components.
@ -60,9 +63,15 @@ typedef struct {
void *dbdata; void *dbdata;
} archive_config_t; } archive_config_t;
/*
 * A segment record: an offset/length pair plus a pointer to a checksum.
 * Hash table entries in the in-memory index refer to one of these through
 * a segment_entry_t pointer.
 * NOTE(review): presumably offset/length locate the segment within the
 * deduplicated data and cksum points at its digest -- confirm against the
 * code that populates this structure.
 */
typedef struct _segment_entry {
// NOTE(review): presumably a byte offset of the segment -- confirm.
uint64_t offset;
// NOTE(review): presumably the segment length; units not shown here -- confirm.
uint32_t length;
// Pointer to checksum bytes; storage is not owned inline by this struct.
// NOTE(review): which digest and who allocates/frees it is not visible here.
uchar_t *cksum;
} segment_entry_t;
int read_config(char *configfile, archive_config_t *cfg); int read_config(char *configfile, archive_config_t *cfg);
int write_config(char *configfile, archive_config_t *cfg); int write_config(char *configfile, archive_config_t *cfg);
int set_config_s(archive_config_t *cfg, compress_algo_t algo, cksum_t ck, int set_config_s(archive_config_t *cfg, compress_algo_t algo, cksum_t ck, cksum_t ck_sim,
uint32_t chunksize, size_t file_sz, int pct_interval); uint32_t chunksize, size_t file_sz, int pct_interval);
#ifdef __cplusplus #ifdef __cplusplus

View file

@ -31,7 +31,7 @@
#include <allocator.h> #include <allocator.h>
#include <pthread.h> #include <pthread.h>
#include "initdb.h" #include "db.h"
#include "config.h" #include "config.h"
#define ONE_PB (1125899906842624ULL) #define ONE_PB (1125899906842624ULL)
@ -43,10 +43,11 @@
* Hashtable structures for in-memory index. * Hashtable structures for in-memory index.
*/ */
typedef struct _hash_entry { typedef struct _hash_entry {
uchar_t *cksum; segment_entry_t *seg;
struct _hash_entry *next; struct _hash_entry *next;
struct _hash_entry *lru_prev; struct _hash_entry *lru_prev;
struct _hash_entry *lru_next; struct _hash_entry *lru_next;
uchar_t cksum[1];
} hash_entry_t; } hash_entry_t;
typedef struct { typedef struct {
@ -54,12 +55,13 @@ typedef struct {
} htab_t; } htab_t;
typedef struct { typedef struct {
htab_t *htablst; htab_t *list;
pthread_mutex_t *mlist; pthread_mutex_t *mlist;
hash_entry_t *lru_head; hash_entry_t *lru_head;
hash_entry_t *lru_tail; hash_entry_t *lru_tail;
uint64_t memlimit; uint64_t memlimit;
uint64_t memused; uint64_t memused;
int hash_entry_size;
} htablst_t; } htablst_t;
archive_config_t * archive_config_t *
@ -83,14 +85,15 @@ init_global_db(char *configfile)
archive_config_t * archive_config_t *
init_global_db_s(char *path, uint32_t chunksize, int pct_interval, init_global_db_s(char *path, uint32_t chunksize, int pct_interval,
compress_algo_t algo, cksum_t ck, size_t file_sz, size_t memlimit) compress_algo_t algo, cksum_t ck, cksum_t ck_sim, size_t file_sz,
size_t memlimit)
{ {
archive_config_t *cfg; archive_config_t *cfg;
int rv; int rv;
float diff; float diff;
cfg = calloc(1, sizeof (archive_config_t)); cfg = calloc(1, sizeof (archive_config_t));
rv = set_config_s(cfg, algo, ck, chunksize, file_sz, chunks_per_seg, pct_interval); rv = set_config_s(cfg, algo, ck, ck_sim, chunksize, file_sz, chunks_per_seg, pct_interval);
if (path != NULL) { if (path != NULL) {
printf("Disk based index not yet implemented.\n"); printf("Disk based index not yet implemented.\n");
@ -100,24 +103,25 @@ init_global_db_s(char *path, uint32_t chunksize, int pct_interval,
uint32_t hash_slots, intervals, i; uint32_t hash_slots, intervals, i;
uint64_t memreqd; uint64_t memreqd;
htablst_t *htablst; htablst_t *htablst;
int hash_entry_size;
// Compute total hashtable entries first // Compute total hashtable entries first
intervals = 100 / pct_interval - 1; intervals = 100 / pct_interval - 1;
hash_slots = file_sz / cfg->segment_sz_bytes + 1; hash_slots = file_sz / cfg->segment_sz_bytes + 1;
hash_slots *= intervals; hash_slots *= intervals;
hash_entry_size = sizeof (hash_entry_t) + cfg->similarity_cksum_sz - 1;
// Compute memory required to hold all hash entries assuming worst case 50% // Compute memory required to hold all hash entries assuming worst case 50%
// occupancy. // occupancy.
memreqd = hash_slots * (sizeof (hash_entry_t) + cfg->chunk_cksum_sz + memreqd = hash_slots * (hash_entry_size + sizeof (hash_entry_t *) +
sizeof (hash_entry_t *) + (sizeof (hash_entry_t *)) / 2); (sizeof (hash_entry_t *)) / 2);
memreqd += hash_slots * sizeof (hash_entry_t **); memreqd += hash_slots * sizeof (hash_entry_t **);
diff = (float)pct_interval / 100.0; diff = (float)pct_interval / 100.0;
// Reduce hash_slots to remain within memlimit // Reduce hash_slots to remain within memlimit
while (memreqd > memlimit) { while (memreqd > memlimit) {
hash_slots -= (hash_slots * diff); hash_slots -= (hash_slots * diff);
memreqd = hash_slots * (sizeof (hash_entry_t) + memreqd = hash_slots * (hash_entry_size + sizeof (hash_entry_t *) +
cfg->chunk_cksum_sz + sizeof (hash_entry_t *) +
(sizeof (hash_entry_t *)) / 2); (sizeof (hash_entry_t *)) / 2);
memreqd += hash_slots * sizeof (hash_entry_t **); memreqd += hash_slots * sizeof (hash_entry_t **);
} }
@ -126,17 +130,18 @@ init_global_db_s(char *path, uint32_t chunksize, int pct_interval,
// each having hash_slots / intervals slots. // each having hash_slots / intervals slots.
htablst = calloc(1, sizeof (htablst_t)); htablst = calloc(1, sizeof (htablst_t));
htablst->memlimit = memlimit; htablst->memlimit = memlimit;
htablst->htablst = (htab_t *)calloc(intervals, sizeof (htab_t)); htablst->list = (htab_t *)calloc(intervals, sizeof (htab_t));
htablst->mlist = (pthread_mutex_t *)malloc(intervals * sizeof (pthread_mutex_t)); htablst->mlist = (pthread_mutex_t *)malloc(intervals * sizeof (pthread_mutex_t));
htablst->hash_entry_size = hash_entry_size;
for (i = 0; i < intervals; i++) { for (i = 0; i < intervals; i++) {
htablst->htablst[i].htab = (hash_entry_t **)calloc(hash_slots / intervals, htablst->list[i].htab = (hash_entry_t **)calloc(hash_slots / intervals,
sizeof (hash_entry_t *)); sizeof (hash_entry_t *));
htablst->memused += ((hash_slots / intervals) * (sizeof (hash_entry_t *))); htablst->memused += ((hash_slots / intervals) * (sizeof (hash_entry_t *)));
pthread_mutex_init(&(htablst->mlist[i]), NULL); pthread_mutex_init(&(htablst->mlist[i]), NULL);
} }
cfg->dbdata = htablst; cfg->dbdata = htablst;
slab_cache_add(sizeof (hash_entry_t)); slab_cache_add(hash_entry_size);
slab_cache_add(cfg->chunk_cksum_sz); slab_cache_add(cfg->chunk_cksum_sz);
} }
return (cfg); return (cfg);

View file

@ -18,8 +18,8 @@
* moinakg@belenix.org, http://moinakg.wordpress.com/ * moinakg@belenix.org, http://moinakg.wordpress.com/
*/ */
#ifndef _INITDB_H #ifndef _DB_H
#define _INITDB_H #define _DB_H
#ifdef __cplusplus #ifdef __cplusplus
extern "C" { extern "C" {
@ -27,7 +27,8 @@ extern "C" {
archive_config_t *init_global_db(char *configfile); archive_config_t *init_global_db(char *configfile);
archive_config_t *init_global_db_s(char *path, uint32_t chunksize, int pct_interval, archive_config_t *init_global_db_s(char *path, uint32_t chunksize, int pct_interval,
compress_algo_t algo, cksum_t ck, size_t file_sz, size_t memlimit); compress_algo_t algo, cksum_t ck, cksum_t ck_sim, size_t file_sz,
size_t memlimit);
#ifdef __cplusplus #ifdef __cplusplus
} }