Work in progress global dedupe changes.
This commit is contained in:
parent
f8f23e5200
commit
f2806d4ffa
5 changed files with 34 additions and 18 deletions
|
@ -186,7 +186,7 @@ struct cmp_data {
|
||||||
uint64_t chunksize;
|
uint64_t chunksize;
|
||||||
uint64_t len_cmp, len_cmp_be;
|
uint64_t len_cmp, len_cmp_be;
|
||||||
uchar_t checksum[CKSUM_MAX_BYTES];
|
uchar_t checksum[CKSUM_MAX_BYTES];
|
||||||
int level, cksum_mt;
|
int level, cksum_mt, out_fd;
|
||||||
unsigned int id;
|
unsigned int id;
|
||||||
compress_func_ptr compress;
|
compress_func_ptr compress;
|
||||||
compress_func_ptr decompress;
|
compress_func_ptr decompress;
|
||||||
|
|
|
@ -38,11 +38,6 @@
|
||||||
|
|
||||||
#include "db.h"
|
#include "db.h"
|
||||||
|
|
||||||
#define ONE_PB (1125899906842624ULL)
|
|
||||||
#define ONE_TB (1099511627776ULL)
|
|
||||||
#define FOUR_MB (4194304ULL)
|
|
||||||
#define EIGHT_MB (8388608ULL)
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Hashtable structures for in-memory index.
|
* Hashtable structures for in-memory index.
|
||||||
*/
|
*/
|
||||||
|
@ -103,16 +98,16 @@ static cleanup_indx(index_t *indx)
|
||||||
}
|
}
|
||||||
|
|
||||||
archive_config_t *
|
archive_config_t *
|
||||||
init_global_db_s(char *path, char *tmppath, uint32_t chunksize, int pct_interval,
|
init_global_db_s(char *path, char *tmppath, uint32_t chunksize, uint64_t user_chunk_sz,
|
||||||
compress_algo_t algo, cksum_t ck, cksum_t ck_sim, size_t file_sz,
|
int pct_interval, compress_algo_t algo, cksum_t ck, cksum_t ck_sim,
|
||||||
size_t memlimit, int nthreads)
|
size_t file_sz, size_t memlimit, int nthreads)
|
||||||
{
|
{
|
||||||
archive_config_t *cfg;
|
archive_config_t *cfg;
|
||||||
int rv;
|
int rv;
|
||||||
float diff;
|
float diff;
|
||||||
|
|
||||||
cfg = calloc(1, sizeof (archive_config_t));
|
cfg = calloc(1, sizeof (archive_config_t));
|
||||||
rv = set_config_s(cfg, algo, ck, ck_sim, chunksize, file_sz, pct_interval);
|
rv = set_config_s(cfg, algo, ck, ck_sim, chunksize, file_sz, user_chunk_sz, pct_interval);
|
||||||
|
|
||||||
if (path != NULL) {
|
if (path != NULL) {
|
||||||
printf("Disk based index not yet implemented.\n");
|
printf("Disk based index not yet implemented.\n");
|
||||||
|
@ -125,7 +120,10 @@ init_global_db_s(char *path, char *tmppath, uint32_t chunksize, int pct_interval
|
||||||
index_t *indx;
|
index_t *indx;
|
||||||
|
|
||||||
// Compute total hashtable entries first
|
// Compute total hashtable entries first
|
||||||
intervals = 100 / pct_interval - 1;
|
if (pct_interval == 0)
|
||||||
|
intervals = 1;
|
||||||
|
else
|
||||||
|
intervals = 100 / pct_interval - 1;
|
||||||
hash_slots = file_sz / cfg->segment_sz_bytes + 1;
|
hash_slots = file_sz / cfg->segment_sz_bytes + 1;
|
||||||
hash_slots *= intervals;
|
hash_slots *= intervals;
|
||||||
hash_entry_size = sizeof (hash_entry_t) + cfg->similarity_cksum_sz - 1;
|
hash_entry_size = sizeof (hash_entry_t) + cfg->similarity_cksum_sz - 1;
|
||||||
|
@ -249,6 +247,7 @@ db_lookup_insert_s(archive_config_t *cfg, uchar_t *sim_cksum, int interval,
|
||||||
htab[htab_entry] = htab[htab_entry]->next;
|
htab[htab_entry] = htab[htab_entry]->next;
|
||||||
} else {
|
} else {
|
||||||
ent = (hash_entry_t *)malloc(indx->hash_entry_size);
|
ent = (hash_entry_t *)malloc(indx->hash_entry_size);
|
||||||
|
indx->memused += indx->hash_entry_size;
|
||||||
}
|
}
|
||||||
ent->seg_offset = seg_offset;
|
ent->seg_offset = seg_offset;
|
||||||
ent->next = 0;
|
ent->next = 0;
|
||||||
|
|
|
@ -33,8 +33,9 @@ extern "C" {
|
||||||
|
|
||||||
archive_config_t *init_global_db(char *configfile);
|
archive_config_t *init_global_db(char *configfile);
|
||||||
archive_config_t *init_global_db_s(char *path, char *tmppath, uint32_t chunksize,
|
archive_config_t *init_global_db_s(char *path, char *tmppath, uint32_t chunksize,
|
||||||
int pct_interval, compress_algo_t algo, cksum_t ck,
|
uint64_t user_chunk_sz, int pct_interval, compress_algo_t algo,
|
||||||
cksum_t ck_sim, size_t file_sz, size_t memlimit, int nthreads);
|
cksum_t ck, cksum_t ck_sim, size_t file_sz, size_t memlimit,
|
||||||
|
int nthreads);
|
||||||
uint64_t db_lookup_insert_s(archive_config_t *cfg, uchar_t *sim_cksum, int interval,
|
uint64_t db_lookup_insert_s(archive_config_t *cfg, uchar_t *sim_cksum, int interval,
|
||||||
uint64_t seg_offset, int do_insert);
|
uint64_t seg_offset, int do_insert);
|
||||||
|
|
||||||
|
|
|
@ -42,6 +42,7 @@
|
||||||
#define ONE_TB (1099511627776ULL)
|
#define ONE_TB (1099511627776ULL)
|
||||||
#define FOUR_MB (4194304ULL)
|
#define FOUR_MB (4194304ULL)
|
||||||
#define EIGHT_MB (8388608ULL)
|
#define EIGHT_MB (8388608ULL)
|
||||||
|
#define EIGHT_GB (8589934592ULL)
|
||||||
|
|
||||||
static compress_algo_t
|
static compress_algo_t
|
||||||
get_compress_level(compress_algo_t algo)
|
get_compress_level(compress_algo_t algo)
|
||||||
|
@ -287,6 +288,7 @@ read_config(char *configfile, archive_config_t *cfg)
|
||||||
*/
|
*/
|
||||||
cfg->chunk_sz_bytes = RAB_BLK_AVG_SZ(cfg->chunk_sz);
|
cfg->chunk_sz_bytes = RAB_BLK_AVG_SZ(cfg->chunk_sz);
|
||||||
cfg->directory_levels = 2;
|
cfg->directory_levels = 2;
|
||||||
|
|
||||||
if (cfg->archive_sz < ONE_TB) {
|
if (cfg->archive_sz < ONE_TB) {
|
||||||
segment_sz_bytes = FOUR_MB;
|
segment_sz_bytes = FOUR_MB;
|
||||||
cfg->directory_fanout = 128;
|
cfg->directory_fanout = 128;
|
||||||
|
@ -349,8 +351,9 @@ write_config(char *configfile, archive_config_t *cfg)
|
||||||
|
|
||||||
int
|
int
|
||||||
set_config_s(archive_config_t *cfg, compress_algo_t algo, cksum_t ck, cksum_t ck_sim,
|
set_config_s(archive_config_t *cfg, compress_algo_t algo, cksum_t ck, cksum_t ck_sim,
|
||||||
uint32_t chunksize, size_t file_sz, int pct_interval)
|
uint32_t chunksize, size_t file_sz, uint64_t user_chunk_sz, int pct_interval)
|
||||||
{
|
{
|
||||||
|
|
||||||
cfg->algo = algo;
|
cfg->algo = algo;
|
||||||
cfg->chunk_cksum_type = ck;
|
cfg->chunk_cksum_type = ck;
|
||||||
cfg->similarity_cksum = ck_sim;
|
cfg->similarity_cksum = ck_sim;
|
||||||
|
@ -360,9 +363,14 @@ set_config_s(archive_config_t *cfg, compress_algo_t algo, cksum_t ck, cksum_t ck
|
||||||
cfg->chunk_sz = chunksize;
|
cfg->chunk_sz = chunksize;
|
||||||
cfg->chunk_sz_bytes = RAB_BLK_AVG_SZ(cfg->chunk_sz);
|
cfg->chunk_sz_bytes = RAB_BLK_AVG_SZ(cfg->chunk_sz);
|
||||||
cfg->pct_interval = pct_interval;
|
cfg->pct_interval = pct_interval;
|
||||||
|
|
||||||
cfg->archive_sz = file_sz;
|
cfg->archive_sz = file_sz;
|
||||||
if (cfg->archive_sz < ONE_TB) {
|
cfg->dedupe_mode = MODE_SIMILARITY;
|
||||||
|
|
||||||
|
if (cfg->archive_sz <= EIGHT_GB) {
|
||||||
|
cfg->dedupe_mode = MODE_SIMPLE;
|
||||||
|
cfg->segment_sz_bytes = user_chunk_sz;
|
||||||
|
|
||||||
|
} else if (cfg->archive_sz < ONE_TB) {
|
||||||
cfg->segment_sz_bytes = FOUR_MB;
|
cfg->segment_sz_bytes = FOUR_MB;
|
||||||
|
|
||||||
} else {
|
} else {
|
||||||
|
|
|
@ -44,6 +44,12 @@ extern "C" {
|
||||||
// 8GB
|
// 8GB
|
||||||
#define MIN_ARCHIVE_SZ (8589934592ULL)
|
#define MIN_ARCHIVE_SZ (8589934592ULL)
|
||||||
|
|
||||||
|
typedef enum {
|
||||||
|
MODE_SIMPLE = 0,
|
||||||
|
MODE_SIMILARITY,
|
||||||
|
MODE_ARCHIVE
|
||||||
|
} dedupe_mode_t;
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
char rootdir[PATH_MAX+1];
|
char rootdir[PATH_MAX+1];
|
||||||
uint32_t chunk_sz; // Numeric ID: 1 - 4k ... 5 - 64k
|
uint32_t chunk_sz; // Numeric ID: 1 - 4k ... 5 - 64k
|
||||||
|
@ -58,9 +64,10 @@ typedef struct {
|
||||||
int pct_interval; // Similarity based match intervals in %age.
|
int pct_interval; // Similarity based match intervals in %age.
|
||||||
// The items below are computed given the above
|
// The items below are computed given the above
|
||||||
// components.
|
// components.
|
||||||
|
dedupe_mode_t dedupe_mode;
|
||||||
|
|
||||||
uint32_t chunk_sz_bytes; // Average chunk size
|
uint32_t chunk_sz_bytes; // Average chunk size
|
||||||
uint32_t segment_sz_bytes; // Segment size in bytes
|
uint64_t segment_sz_bytes; // Segment size in bytes
|
||||||
uint32_t segment_sz; // Number of chunks in one segment
|
uint32_t segment_sz; // Number of chunks in one segment
|
||||||
uint32_t container_sz; // Number of segments
|
uint32_t container_sz; // Number of segments
|
||||||
int directory_fanout; // Number of subdirectories in a directory
|
int directory_fanout; // Number of subdirectories in a directory
|
||||||
|
@ -80,7 +87,8 @@ typedef struct _segment_entry {
|
||||||
int read_config(char *configfile, archive_config_t *cfg);
|
int read_config(char *configfile, archive_config_t *cfg);
|
||||||
int write_config(char *configfile, archive_config_t *cfg);
|
int write_config(char *configfile, archive_config_t *cfg);
|
||||||
int set_config_s(archive_config_t *cfg, compress_algo_t algo, cksum_t ck, cksum_t ck_sim,
|
int set_config_s(archive_config_t *cfg, compress_algo_t algo, cksum_t ck, cksum_t ck_sim,
|
||||||
uint32_t chunksize, size_t file_sz, int pct_interval);
|
uint32_t chunksize, size_t file_sz, uint64_t user_chunk_sz,
|
||||||
|
int pct_interval);
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in a new issue