Work in progress global dedupe changes.

This commit is contained in:
Moinak Ghosh 2013-03-19 20:13:44 +05:30
parent f8f23e5200
commit f2806d4ffa
5 changed files with 34 additions and 18 deletions

View file

@ -186,7 +186,7 @@ struct cmp_data {
uint64_t chunksize;
uint64_t len_cmp, len_cmp_be;
uchar_t checksum[CKSUM_MAX_BYTES];
int level, cksum_mt;
int level, cksum_mt, out_fd;
unsigned int id;
compress_func_ptr compress;
compress_func_ptr decompress;

View file

@ -38,11 +38,6 @@
#include "db.h"
#define ONE_PB (1125899906842624ULL)
#define ONE_TB (1099511627776ULL)
#define FOUR_MB (4194304ULL)
#define EIGHT_MB (8388608ULL)
/*
* Hashtable structures for in-memory index.
*/
@ -103,16 +98,16 @@ static cleanup_indx(index_t *indx)
}
archive_config_t *
init_global_db_s(char *path, char *tmppath, uint32_t chunksize, int pct_interval,
compress_algo_t algo, cksum_t ck, cksum_t ck_sim, size_t file_sz,
size_t memlimit, int nthreads)
init_global_db_s(char *path, char *tmppath, uint32_t chunksize, uint64_t user_chunk_sz,
int pct_interval, compress_algo_t algo, cksum_t ck, cksum_t ck_sim,
size_t file_sz, size_t memlimit, int nthreads)
{
archive_config_t *cfg;
int rv;
float diff;
cfg = calloc(1, sizeof (archive_config_t));
rv = set_config_s(cfg, algo, ck, ck_sim, chunksize, file_sz, pct_interval);
rv = set_config_s(cfg, algo, ck, ck_sim, chunksize, file_sz, user_chunk_sz, pct_interval);
if (path != NULL) {
printf("Disk based index not yet implemented.\n");
@ -125,7 +120,10 @@ init_global_db_s(char *path, char *tmppath, uint32_t chunksize, int pct_interval
index_t *indx;
// Compute total hashtable entries first
intervals = 100 / pct_interval - 1;
if (pct_interval == 0)
intervals = 1;
else
intervals = 100 / pct_interval - 1;
hash_slots = file_sz / cfg->segment_sz_bytes + 1;
hash_slots *= intervals;
hash_entry_size = sizeof (hash_entry_t) + cfg->similarity_cksum_sz - 1;
@ -249,6 +247,7 @@ db_lookup_insert_s(archive_config_t *cfg, uchar_t *sim_cksum, int interval,
htab[htab_entry] = htab[htab_entry]->next;
} else {
ent = (hash_entry_t *)malloc(indx->hash_entry_size);
indx->memused += indx->hash_entry_size;
}
ent->seg_offset = seg_offset;
ent->next = 0;

View file

@ -33,8 +33,9 @@ extern "C" {
archive_config_t *init_global_db(char *configfile);
archive_config_t *init_global_db_s(char *path, char *tmppath, uint32_t chunksize,
int pct_interval, compress_algo_t algo, cksum_t ck,
cksum_t ck_sim, size_t file_sz, size_t memlimit, int nthreads);
uint64_t user_chunk_sz, int pct_interval, compress_algo_t algo,
cksum_t ck, cksum_t ck_sim, size_t file_sz, size_t memlimit,
int nthreads);
uint64_t db_lookup_insert_s(archive_config_t *cfg, uchar_t *sim_cksum, int interval,
uint64_t seg_offset, int do_insert);

View file

@ -42,6 +42,7 @@
#define ONE_TB (1099511627776ULL)
#define FOUR_MB (4194304ULL)
#define EIGHT_MB (8388608ULL)
#define EIGHT_GB (8589934592ULL)
static compress_algo_t
get_compress_level(compress_algo_t algo)
@ -287,6 +288,7 @@ read_config(char *configfile, archive_config_t *cfg)
*/
cfg->chunk_sz_bytes = RAB_BLK_AVG_SZ(cfg->chunk_sz);
cfg->directory_levels = 2;
if (cfg->archive_sz < ONE_TB) {
segment_sz_bytes = FOUR_MB;
cfg->directory_fanout = 128;
@ -349,8 +351,9 @@ write_config(char *configfile, archive_config_t *cfg)
int
set_config_s(archive_config_t *cfg, compress_algo_t algo, cksum_t ck, cksum_t ck_sim,
uint32_t chunksize, size_t file_sz, int pct_interval)
uint32_t chunksize, size_t file_sz, uint64_t user_chunk_sz, int pct_interval)
{
cfg->algo = algo;
cfg->chunk_cksum_type = ck;
cfg->similarity_cksum = ck_sim;
@ -360,9 +363,14 @@ set_config_s(archive_config_t *cfg, compress_algo_t algo, cksum_t ck, cksum_t ck
cfg->chunk_sz = chunksize;
cfg->chunk_sz_bytes = RAB_BLK_AVG_SZ(cfg->chunk_sz);
cfg->pct_interval = pct_interval;
cfg->archive_sz = file_sz;
if (cfg->archive_sz < ONE_TB) {
cfg->dedupe_mode = MODE_SIMILARITY;
if (cfg->archive_sz <= EIGHT_GB) {
cfg->dedupe_mode = MODE_SIMPLE;
cfg->segment_sz_bytes = user_chunk_sz;
} else if (cfg->archive_sz < ONE_TB) {
cfg->segment_sz_bytes = FOUR_MB;
} else {

View file

@ -44,6 +44,12 @@ extern "C" {
// 8GB
#define MIN_ARCHIVE_SZ (8589934592ULL)
/*
 * Deduplication strategy recorded in the archive configuration
 * (stored in the dedupe_mode field of the config struct).
 *
 * The numeric values are spelled out explicitly; they are the same
 * values the compiler would assign implicitly starting from 0.
 */
typedef enum {
	/*
	 * Picked by set_config_s() when the archive size is at most
	 * EIGHT_GB; in that case the segment size is set to the
	 * user-supplied chunk size.
	 */
	MODE_SIMPLE = 0,

	/*
	 * NOTE(review): appears to be the mode set_config_s() assigns
	 * before it checks the archive size -- confirm against the
	 * full function.
	 */
	MODE_SIMILARITY = 1,

	/* NOTE(review): no user of this value is visible here -- TODO document. */
	MODE_ARCHIVE = 2
} dedupe_mode_t;
typedef struct {
char rootdir[PATH_MAX+1];
uint32_t chunk_sz; // Numeric ID: 1 - 4k ... 5 - 64k
@ -58,9 +64,10 @@ typedef struct {
int pct_interval; // Similarity based match intervals in %age.
// The items below are computed given the above
// components.
dedupe_mode_t dedupe_mode;
uint32_t chunk_sz_bytes; // Average chunk size
uint32_t segment_sz_bytes; // Segment size in bytes
uint64_t segment_sz_bytes; // Segment size in bytes
uint32_t segment_sz; // Number of chunks in one segment
uint32_t container_sz; // Number of segments
int directory_fanout; // Number of subdirectories in a directory
@ -80,7 +87,8 @@ typedef struct _segment_entry {
int read_config(char *configfile, archive_config_t *cfg);
int write_config(char *configfile, archive_config_t *cfg);
int set_config_s(archive_config_t *cfg, compress_algo_t algo, cksum_t ck, cksum_t ck_sim,
uint32_t chunksize, size_t file_sz, int pct_interval);
uint32_t chunksize, size_t file_sz, uint64_t user_chunk_sz,
int pct_interval);
#ifdef __cplusplus
}