Work in progress global dedupe changes.

This commit is contained in:
Moinak Ghosh 2013-03-19 20:13:44 +05:30
parent f8f23e5200
commit f2806d4ffa
5 changed files with 34 additions and 18 deletions

View file

@ -186,7 +186,7 @@ struct cmp_data {
uint64_t chunksize; uint64_t chunksize;
uint64_t len_cmp, len_cmp_be; uint64_t len_cmp, len_cmp_be;
uchar_t checksum[CKSUM_MAX_BYTES]; uchar_t checksum[CKSUM_MAX_BYTES];
int level, cksum_mt; int level, cksum_mt, out_fd;
unsigned int id; unsigned int id;
compress_func_ptr compress; compress_func_ptr compress;
compress_func_ptr decompress; compress_func_ptr decompress;

View file

@ -38,11 +38,6 @@
#include "db.h" #include "db.h"
#define ONE_PB (1125899906842624ULL)
#define ONE_TB (1099511627776ULL)
#define FOUR_MB (4194304ULL)
#define EIGHT_MB (8388608ULL)
/* /*
* Hashtable structures for in-memory index. * Hashtable structures for in-memory index.
*/ */
@ -103,16 +98,16 @@ static cleanup_indx(index_t *indx)
} }
archive_config_t * archive_config_t *
init_global_db_s(char *path, char *tmppath, uint32_t chunksize, int pct_interval, init_global_db_s(char *path, char *tmppath, uint32_t chunksize, uint64_t user_chunk_sz,
compress_algo_t algo, cksum_t ck, cksum_t ck_sim, size_t file_sz, int pct_interval, compress_algo_t algo, cksum_t ck, cksum_t ck_sim,
size_t memlimit, int nthreads) size_t file_sz, size_t memlimit, int nthreads)
{ {
archive_config_t *cfg; archive_config_t *cfg;
int rv; int rv;
float diff; float diff;
cfg = calloc(1, sizeof (archive_config_t)); cfg = calloc(1, sizeof (archive_config_t));
rv = set_config_s(cfg, algo, ck, ck_sim, chunksize, file_sz, pct_interval); rv = set_config_s(cfg, algo, ck, ck_sim, chunksize, file_sz, user_chunk_sz, pct_interval);
if (path != NULL) { if (path != NULL) {
printf("Disk based index not yet implemented.\n"); printf("Disk based index not yet implemented.\n");
@ -125,7 +120,10 @@ init_global_db_s(char *path, char *tmppath, uint32_t chunksize, int pct_interval
index_t *indx; index_t *indx;
// Compute total hashtable entries first // Compute total hashtable entries first
intervals = 100 / pct_interval - 1; if (pct_interval == 0)
intervals = 1;
else
intervals = 100 / pct_interval - 1;
hash_slots = file_sz / cfg->segment_sz_bytes + 1; hash_slots = file_sz / cfg->segment_sz_bytes + 1;
hash_slots *= intervals; hash_slots *= intervals;
hash_entry_size = sizeof (hash_entry_t) + cfg->similarity_cksum_sz - 1; hash_entry_size = sizeof (hash_entry_t) + cfg->similarity_cksum_sz - 1;
@ -249,6 +247,7 @@ db_lookup_insert_s(archive_config_t *cfg, uchar_t *sim_cksum, int interval,
htab[htab_entry] = htab[htab_entry]->next; htab[htab_entry] = htab[htab_entry]->next;
} else { } else {
ent = (hash_entry_t *)malloc(indx->hash_entry_size); ent = (hash_entry_t *)malloc(indx->hash_entry_size);
indx->memused += indx->hash_entry_size;
} }
ent->seg_offset = seg_offset; ent->seg_offset = seg_offset;
ent->next = 0; ent->next = 0;

View file

@ -33,8 +33,9 @@ extern "C" {
archive_config_t *init_global_db(char *configfile); archive_config_t *init_global_db(char *configfile);
archive_config_t *init_global_db_s(char *path, char *tmppath, uint32_t chunksize, archive_config_t *init_global_db_s(char *path, char *tmppath, uint32_t chunksize,
int pct_interval, compress_algo_t algo, cksum_t ck, uint64_t user_chunk_sz, int pct_interval, compress_algo_t algo,
cksum_t ck_sim, size_t file_sz, size_t memlimit, int nthreads); cksum_t ck, cksum_t ck_sim, size_t file_sz, size_t memlimit,
int nthreads);
uint64_t db_lookup_insert_s(archive_config_t *cfg, uchar_t *sim_cksum, int interval, uint64_t db_lookup_insert_s(archive_config_t *cfg, uchar_t *sim_cksum, int interval,
uint64_t seg_offset, int do_insert); uint64_t seg_offset, int do_insert);

View file

@ -42,6 +42,7 @@
#define ONE_TB (1099511627776ULL) #define ONE_TB (1099511627776ULL)
#define FOUR_MB (4194304ULL) #define FOUR_MB (4194304ULL)
#define EIGHT_MB (8388608ULL) #define EIGHT_MB (8388608ULL)
#define EIGHT_GB (8589934592ULL)
static compress_algo_t static compress_algo_t
get_compress_level(compress_algo_t algo) get_compress_level(compress_algo_t algo)
@ -287,6 +288,7 @@ read_config(char *configfile, archive_config_t *cfg)
*/ */
cfg->chunk_sz_bytes = RAB_BLK_AVG_SZ(cfg->chunk_sz); cfg->chunk_sz_bytes = RAB_BLK_AVG_SZ(cfg->chunk_sz);
cfg->directory_levels = 2; cfg->directory_levels = 2;
if (cfg->archive_sz < ONE_TB) { if (cfg->archive_sz < ONE_TB) {
segment_sz_bytes = FOUR_MB; segment_sz_bytes = FOUR_MB;
cfg->directory_fanout = 128; cfg->directory_fanout = 128;
@ -349,8 +351,9 @@ write_config(char *configfile, archive_config_t *cfg)
int int
set_config_s(archive_config_t *cfg, compress_algo_t algo, cksum_t ck, cksum_t ck_sim, set_config_s(archive_config_t *cfg, compress_algo_t algo, cksum_t ck, cksum_t ck_sim,
uint32_t chunksize, size_t file_sz, int pct_interval) uint32_t chunksize, size_t file_sz, uint64_t user_chunk_sz, int pct_interval)
{ {
cfg->algo = algo; cfg->algo = algo;
cfg->chunk_cksum_type = ck; cfg->chunk_cksum_type = ck;
cfg->similarity_cksum = ck_sim; cfg->similarity_cksum = ck_sim;
@ -360,9 +363,14 @@ set_config_s(archive_config_t *cfg, compress_algo_t algo, cksum_t ck, cksum_t ck
cfg->chunk_sz = chunksize; cfg->chunk_sz = chunksize;
cfg->chunk_sz_bytes = RAB_BLK_AVG_SZ(cfg->chunk_sz); cfg->chunk_sz_bytes = RAB_BLK_AVG_SZ(cfg->chunk_sz);
cfg->pct_interval = pct_interval; cfg->pct_interval = pct_interval;
cfg->archive_sz = file_sz; cfg->archive_sz = file_sz;
if (cfg->archive_sz < ONE_TB) { cfg->dedupe_mode = MODE_SIMILARITY;
if (cfg->archive_sz <= EIGHT_GB) {
cfg->dedupe_mode = MODE_SIMPLE;
cfg->segment_sz_bytes = user_chunk_sz;
} else if (cfg->archive_sz < ONE_TB) {
cfg->segment_sz_bytes = FOUR_MB; cfg->segment_sz_bytes = FOUR_MB;
} else { } else {

View file

@ -44,6 +44,12 @@ extern "C" {
// 8GB // 8GB
#define MIN_ARCHIVE_SZ (8589934592ULL) #define MIN_ARCHIVE_SZ (8589934592ULL)
/*
 * Deduplication mode, stored in the dedupe_mode field of archive_config_t.
 */
typedef enum {
MODE_SIMPLE = 0, // Selected by set_config_s() when archive_sz <= EIGHT_GB
MODE_SIMILARITY, // Value set_config_s() starts with; kept for archives above EIGHT_GB
MODE_ARCHIVE // NOTE(review): not assigned in the code visible here -- confirm intended use
} dedupe_mode_t;
typedef struct { typedef struct {
char rootdir[PATH_MAX+1]; char rootdir[PATH_MAX+1];
uint32_t chunk_sz; // Numeric ID: 1 - 4k ... 5 - 64k uint32_t chunk_sz; // Numeric ID: 1 - 4k ... 5 - 64k
@ -58,9 +64,10 @@ typedef struct {
int pct_interval; // Similarity based match intervals in %age. int pct_interval; // Similarity based match intervals in %age.
// The items below are computed given the above // The items below are computed given the above
// components. // components.
dedupe_mode_t dedupe_mode;
uint32_t chunk_sz_bytes; // Average chunk size uint32_t chunk_sz_bytes; // Average chunk size
uint32_t segment_sz_bytes; // Segment size in bytes uint64_t segment_sz_bytes; // Segment size in bytes
uint32_t segment_sz; // Number of chunks in one segment uint32_t segment_sz; // Number of chunks in one segment
uint32_t container_sz; // Number of segments uint32_t container_sz; // Number of segments
int directory_fanout; // Number of subdirectories in a directory int directory_fanout; // Number of subdirectories in a directory
@ -80,7 +87,8 @@ typedef struct _segment_entry {
int read_config(char *configfile, archive_config_t *cfg); int read_config(char *configfile, archive_config_t *cfg);
int write_config(char *configfile, archive_config_t *cfg); int write_config(char *configfile, archive_config_t *cfg);
int set_config_s(archive_config_t *cfg, compress_algo_t algo, cksum_t ck, cksum_t ck_sim, int set_config_s(archive_config_t *cfg, compress_algo_t algo, cksum_t ck, cksum_t ck_sim,
uint32_t chunksize, size_t file_sz, int pct_interval); uint32_t chunksize, size_t file_sz, uint64_t user_chunk_sz,
int pct_interval);
#ifdef __cplusplus #ifdef __cplusplus
} }