Work in progress global dedupe changes.
This commit is contained in:
parent
f8f23e5200
commit
f2806d4ffa
5 changed files with 34 additions and 18 deletions
|
@ -186,7 +186,7 @@ struct cmp_data {
|
|||
uint64_t chunksize;
|
||||
uint64_t len_cmp, len_cmp_be;
|
||||
uchar_t checksum[CKSUM_MAX_BYTES];
|
||||
int level, cksum_mt;
|
||||
int level, cksum_mt, out_fd;
|
||||
unsigned int id;
|
||||
compress_func_ptr compress;
|
||||
compress_func_ptr decompress;
|
||||
|
|
|
@ -38,11 +38,6 @@
|
|||
|
||||
#include "db.h"
|
||||
|
||||
#define ONE_PB (1125899906842624ULL)
|
||||
#define ONE_TB (1099511627776ULL)
|
||||
#define FOUR_MB (4194304ULL)
|
||||
#define EIGHT_MB (8388608ULL)
|
||||
|
||||
/*
|
||||
* Hashtable structures for in-memory index.
|
||||
*/
|
||||
|
@ -103,16 +98,16 @@ static cleanup_indx(index_t *indx)
|
|||
}
|
||||
|
||||
archive_config_t *
|
||||
init_global_db_s(char *path, char *tmppath, uint32_t chunksize, int pct_interval,
|
||||
compress_algo_t algo, cksum_t ck, cksum_t ck_sim, size_t file_sz,
|
||||
size_t memlimit, int nthreads)
|
||||
init_global_db_s(char *path, char *tmppath, uint32_t chunksize, uint64_t user_chunk_sz,
|
||||
int pct_interval, compress_algo_t algo, cksum_t ck, cksum_t ck_sim,
|
||||
size_t file_sz, size_t memlimit, int nthreads)
|
||||
{
|
||||
archive_config_t *cfg;
|
||||
int rv;
|
||||
float diff;
|
||||
|
||||
cfg = calloc(1, sizeof (archive_config_t));
|
||||
rv = set_config_s(cfg, algo, ck, ck_sim, chunksize, file_sz, pct_interval);
|
||||
rv = set_config_s(cfg, algo, ck, ck_sim, chunksize, file_sz, user_chunk_sz, pct_interval);
|
||||
|
||||
if (path != NULL) {
|
||||
printf("Disk based index not yet implemented.\n");
|
||||
|
@ -125,6 +120,9 @@ init_global_db_s(char *path, char *tmppath, uint32_t chunksize, int pct_interval
|
|||
index_t *indx;
|
||||
|
||||
// Compute total hashtable entries first
|
||||
if (pct_interval == 0)
|
||||
intervals = 1;
|
||||
else
|
||||
intervals = 100 / pct_interval - 1;
|
||||
hash_slots = file_sz / cfg->segment_sz_bytes + 1;
|
||||
hash_slots *= intervals;
|
||||
|
@ -249,6 +247,7 @@ db_lookup_insert_s(archive_config_t *cfg, uchar_t *sim_cksum, int interval,
|
|||
htab[htab_entry] = htab[htab_entry]->next;
|
||||
} else {
|
||||
ent = (hash_entry_t *)malloc(indx->hash_entry_size);
|
||||
indx->memused += indx->hash_entry_size;
|
||||
}
|
||||
ent->seg_offset = seg_offset;
|
||||
ent->next = 0;
|
||||
|
|
|
@ -33,8 +33,9 @@ extern "C" {
|
|||
|
||||
archive_config_t *init_global_db(char *configfile);
|
||||
archive_config_t *init_global_db_s(char *path, char *tmppath, uint32_t chunksize,
|
||||
int pct_interval, compress_algo_t algo, cksum_t ck,
|
||||
cksum_t ck_sim, size_t file_sz, size_t memlimit, int nthreads);
|
||||
uint64_t user_chunk_sz, int pct_interval, compress_algo_t algo,
|
||||
cksum_t ck, cksum_t ck_sim, size_t file_sz, size_t memlimit,
|
||||
int nthreads);
|
||||
uint64_t db_lookup_insert_s(archive_config_t *cfg, uchar_t *sim_cksum, int interval,
|
||||
uint64_t seg_offset, int do_insert);
|
||||
|
||||
|
|
|
@ -42,6 +42,7 @@
|
|||
#define ONE_TB (1099511627776ULL)	// 2^40 bytes
|
||||
#define FOUR_MB (4194304ULL)	// 2^22 bytes
|
||||
#define EIGHT_MB (8388608ULL)	// 2^23 bytes
|
||||
#define EIGHT_GB (8589934592ULL)	// 2^33 bytes; set_config_s() compares cfg->archive_sz against this. NOTE(review): same value as MIN_ARCHIVE_SZ -- confirm whether one constant should be reused.
|
||||
|
||||
static compress_algo_t
|
||||
get_compress_level(compress_algo_t algo)
|
||||
|
@ -287,6 +288,7 @@ read_config(char *configfile, archive_config_t *cfg)
|
|||
*/
|
||||
cfg->chunk_sz_bytes = RAB_BLK_AVG_SZ(cfg->chunk_sz);
|
||||
cfg->directory_levels = 2;
|
||||
|
||||
if (cfg->archive_sz < ONE_TB) {
|
||||
segment_sz_bytes = FOUR_MB;
|
||||
cfg->directory_fanout = 128;
|
||||
|
@ -349,8 +351,9 @@ write_config(char *configfile, archive_config_t *cfg)
|
|||
|
||||
int
|
||||
set_config_s(archive_config_t *cfg, compress_algo_t algo, cksum_t ck, cksum_t ck_sim,
|
||||
uint32_t chunksize, size_t file_sz, int pct_interval)
|
||||
uint32_t chunksize, size_t file_sz, uint64_t user_chunk_sz, int pct_interval)
|
||||
{
|
||||
|
||||
cfg->algo = algo;
|
||||
cfg->chunk_cksum_type = ck;
|
||||
cfg->similarity_cksum = ck_sim;
|
||||
|
@ -360,9 +363,14 @@ set_config_s(archive_config_t *cfg, compress_algo_t algo, cksum_t ck, cksum_t ck
|
|||
cfg->chunk_sz = chunksize;
|
||||
cfg->chunk_sz_bytes = RAB_BLK_AVG_SZ(cfg->chunk_sz);
|
||||
cfg->pct_interval = pct_interval;
|
||||
|
||||
cfg->archive_sz = file_sz;
|
||||
if (cfg->archive_sz < ONE_TB) {
|
||||
cfg->dedupe_mode = MODE_SIMILARITY;
|
||||
|
||||
if (cfg->archive_sz <= EIGHT_GB) {
|
||||
cfg->dedupe_mode = MODE_SIMPLE;
|
||||
cfg->segment_sz_bytes = user_chunk_sz;
|
||||
|
||||
} else if (cfg->archive_sz < ONE_TB) {
|
||||
cfg->segment_sz_bytes = FOUR_MB;
|
||||
|
||||
} else {
|
||||
|
|
|
@ -44,6 +44,12 @@ extern "C" {
|
|||
// 8GB
|
||||
#define MIN_ARCHIVE_SZ (8589934592ULL)
|
||||
|
||||
/*
 * Deduplication mode recorded in the dedupe_mode field of archive_config_t.
 * set_config_s() assigns it based on the archive size.
 */
typedef enum {
|
||||
MODE_SIMPLE = 0,	// Assigned by set_config_s() when archive_sz <= EIGHT_GB.
|
||||
MODE_SIMILARITY,	// Assigned by set_config_s() under its "archive_sz < ONE_TB" check. NOTE(review): confirm how this combines with the EIGHT_GB branch.
|
||||
MODE_ARCHIVE	// Not assigned anywhere in the visible code; presumably for the largest archives -- TODO confirm.
|
||||
} dedupe_mode_t;
|
||||
|
||||
typedef struct {
|
||||
char rootdir[PATH_MAX+1];
|
||||
uint32_t chunk_sz; // Numeric ID: 1 - 4k ... 5 - 64k
|
||||
|
@ -58,9 +64,10 @@ typedef struct {
|
|||
int pct_interval; // Similarity based match intervals in %age.
|
||||
// The items below are computed given the above
|
||||
// components.
|
||||
dedupe_mode_t dedupe_mode;
|
||||
|
||||
uint32_t chunk_sz_bytes; // Average chunk size
|
||||
uint32_t segment_sz_bytes; // Segment size in bytes
|
||||
uint64_t segment_sz_bytes; // Segment size in bytes
|
||||
uint32_t segment_sz; // Number of chunks in one segment
|
||||
uint32_t container_sz; // Number of segments
|
||||
int directory_fanout; // Number of subdirectories in a directory
|
||||
|
@ -80,7 +87,8 @@ typedef struct _segment_entry {
|
|||
int read_config(char *configfile, archive_config_t *cfg);
|
||||
int write_config(char *configfile, archive_config_t *cfg);
|
||||
int set_config_s(archive_config_t *cfg, compress_algo_t algo, cksum_t ck, cksum_t ck_sim,
|
||||
uint32_t chunksize, size_t file_sz, int pct_interval);
|
||||
uint32_t chunksize, size_t file_sz, uint64_t user_chunk_sz,
|
||||
int pct_interval);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue