Work in progress global dedupe.
This commit is contained in:
parent
f89473d29c
commit
7386f82a4f
4 changed files with 47 additions and 21 deletions
|
@ -175,7 +175,8 @@ read_config(char *configfile, archive_config_t *cfg)
|
||||||
// Default
|
// Default
|
||||||
cfg->verify_chunks = 0;
|
cfg->verify_chunks = 0;
|
||||||
cfg->algo = COMPRESS_LZ4;
|
cfg->algo = COMPRESS_LZ4;
|
||||||
cfg->chunk_cksum_type = DEFAULT_CKSUM;
|
cfg->chunk_cksum_type = DEFAULT_CHUNK_CKSUM;
|
||||||
|
cfg->similarity_cksum = DEFAULT_SIMILARITY_CKSUM;
|
||||||
cfg->pct_interval = DEFAULT_SIMILARITY_INTERVAL;
|
cfg->pct_interval = DEFAULT_SIMILARITY_INTERVAL;
|
||||||
|
|
||||||
fh = fopen(configfile, "r");
|
fh = fopen(configfile, "r");
|
||||||
|
@ -262,11 +263,19 @@ read_config(char *configfile, archive_config_t *cfg)
|
||||||
fclose(fh);
|
fclose(fh);
|
||||||
return (1);
|
return (1);
|
||||||
}
|
}
|
||||||
|
} else if (strncmp(line, "SIMILARITY_CKSUM") == 0) {
|
||||||
|
cfg->chunk_cksum_type = get_cksum_type(pos);
|
||||||
|
if (cfg->chunk_cksum_type == CKSUM_INVALID) {
|
||||||
|
fprintf(stderr, "Invalid CHUNK_CKSUM setting.\n");
|
||||||
|
fclose(fh);
|
||||||
|
return (1);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
fclose(fh);
|
fclose(fh);
|
||||||
cfg->compress_level = get_compress_level(cfg->algo);
|
cfg->compress_level = get_compress_level(cfg->algo);
|
||||||
cfg->chunk_cksum_sz = get_cksum_sz(cfg->chunk_cksum_type);
|
cfg->chunk_cksum_sz = get_cksum_sz(cfg->chunk_cksum_type);
|
||||||
|
cfg->similarity_cksum_sz = get_cksum_sz(cfg->similarity_cksum);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Now compute the remaining parameters.
|
* Now compute the remaining parameters.
|
||||||
|
@ -330,13 +339,15 @@ write_config(char *configfile, archive_config_t *cfg)
|
||||||
}
|
}
|
||||||
|
|
||||||
int
|
int
|
||||||
set_config_s(archive_config_t *cfg, compress_algo_t algo, cksum_t ck, uint32_t chunksize,
|
set_config_s(archive_config_t *cfg, compress_algo_t algo, cksum_t ck, cksum_t ck_sim,
|
||||||
size_t file_sz, int pct_interval)
|
uint32_t chunksize, size_t file_sz, int pct_interval)
|
||||||
{
|
{
|
||||||
cfg->algo = algo;
|
cfg->algo = algo;
|
||||||
cfg->chunk_cksum_type = ck;
|
cfg->chunk_cksum_type = ck;
|
||||||
|
cfg->similarity_cksum = ck_sim;
|
||||||
cfg->compress_level = get_compress_level(cfg->algo);
|
cfg->compress_level = get_compress_level(cfg->algo);
|
||||||
cfg->chunk_cksum_sz = get_cksum_sz(cfg->chunk_cksum_type);
|
cfg->chunk_cksum_sz = get_cksum_sz(cfg->chunk_cksum_type);
|
||||||
|
cfg->similarity_cksum_sz = get_cksum_sz(cfg->similarity_cksum);
|
||||||
cfg->chunk_sz = chunksize;
|
cfg->chunk_sz = chunksize;
|
||||||
cfg->chunk_sz_bytes = RAB_BLK_AVG_SZ(cfg->chunk_sz);
|
cfg->chunk_sz_bytes = RAB_BLK_AVG_SZ(cfg->chunk_sz);
|
||||||
cfg->pct_interval = pct_interval;
|
cfg->pct_interval = pct_interval;
|
||||||
|
|
|
@ -29,7 +29,8 @@ extern "C" {
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#define DEFAULT_SIMILARITY_INTERVAL 5
|
#define DEFAULT_SIMILARITY_INTERVAL 5
|
||||||
#define DEFAULT_CKSUM CKSUM_BLAKE256
|
#define DEFAULT_CHUNK_CKSUM CKSUM_SHA256
|
||||||
|
#define DEFAULT_SIMILARITY_CKSUM CKSUM_BLAKE256
|
||||||
#define DEFAULT_COMPRESS COMPRESS_LZ4
|
#define DEFAULT_COMPRESS COMPRESS_LZ4
|
||||||
#define MIN_CK 1
|
#define MIN_CK 1
|
||||||
#define MAX_CK 5
|
#define MAX_CK 5
|
||||||
|
@ -44,8 +45,10 @@ typedef struct {
|
||||||
int verify_chunks; // Whether to use memcmp() to compare chunks byte for byte.
|
int verify_chunks; // Whether to use memcmp() to compare chunks byte for byte.
|
||||||
int algo; // Which compression algo for segments.
|
int algo; // Which compression algo for segments.
|
||||||
compress_algo_t compress_level; // Default preset compression level per algo.
|
compress_algo_t compress_level; // Default preset compression level per algo.
|
||||||
cksum_t chunk_cksum_type; // Which digest to use for hash based chunk lookup.
|
cksum_t chunk_cksum_type; // Which digest to use for hash based chunk comparison.
|
||||||
|
cksum_t similarity_cksum; // Which digest to use for similarity based segment lookup.
|
||||||
int chunk_cksum_sz; // Size of cksum in bytes.
|
int chunk_cksum_sz; // Size of cksum in bytes.
|
||||||
|
int similarity_cksum_sz; // Size of similarity cksum in bytes.
|
||||||
int pct_interval; // Similarity based match intervals in %age.
|
int pct_interval; // Similarity based match intervals in %age.
|
||||||
// The items below are computed given the above
|
// The items below are computed given the above
|
||||||
// components.
|
// components.
|
||||||
|
@ -60,9 +63,15 @@ typedef struct {
|
||||||
void *dbdata;
|
void *dbdata;
|
||||||
} archive_config_t;
|
} archive_config_t;
|
||||||
|
|
||||||
|
typedef struct _segment_entry {
|
||||||
|
uint64_t offset;
|
||||||
|
uint32_t length;
|
||||||
|
uchar_t *cksum;
|
||||||
|
} segment_entry_t;
|
||||||
|
|
||||||
int read_config(char *configfile, archive_config_t *cfg);
|
int read_config(char *configfile, archive_config_t *cfg);
|
||||||
int write_config(char *configfile, archive_config_t *cfg);
|
int write_config(char *configfile, archive_config_t *cfg);
|
||||||
int set_config_s(archive_config_t *cfg, compress_algo_t algo, cksum_t ck,
|
int set_config_s(archive_config_t *cfg, compress_algo_t algo, cksum_t ck, cksum_t ck_sim,
|
||||||
uint32_t chunksize, size_t file_sz, int pct_interval);
|
uint32_t chunksize, size_t file_sz, int pct_interval);
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
|
|
|
@ -31,7 +31,7 @@
|
||||||
#include <allocator.h>
|
#include <allocator.h>
|
||||||
#include <pthread.h>
|
#include <pthread.h>
|
||||||
|
|
||||||
#include "initdb.h"
|
#include "db.h"
|
||||||
#include "config.h"
|
#include "config.h"
|
||||||
|
|
||||||
#define ONE_PB (1125899906842624ULL)
|
#define ONE_PB (1125899906842624ULL)
|
||||||
|
@ -43,10 +43,11 @@
|
||||||
* Hashtable structures for in-memory index.
|
* Hashtable structures for in-memory index.
|
||||||
*/
|
*/
|
||||||
typedef struct _hash_entry {
|
typedef struct _hash_entry {
|
||||||
uchar_t *cksum;
|
segment_entry_t *seg;
|
||||||
struct _hash_entry *next;
|
struct _hash_entry *next;
|
||||||
struct _hash_entry *lru_prev;
|
struct _hash_entry *lru_prev;
|
||||||
struct _hash_entry *lru_next;
|
struct _hash_entry *lru_next;
|
||||||
|
uchar_t cksum[1];
|
||||||
} hash_entry_t;
|
} hash_entry_t;
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
|
@ -54,12 +55,13 @@ typedef struct {
|
||||||
} htab_t;
|
} htab_t;
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
htab_t *htablst;
|
htab_t *list;
|
||||||
pthread_mutex_t *mlist;
|
pthread_mutex_t *mlist;
|
||||||
hash_entry_t *lru_head;
|
hash_entry_t *lru_head;
|
||||||
hash_entry_t *lru_tail;
|
hash_entry_t *lru_tail;
|
||||||
uint64_t memlimit;
|
uint64_t memlimit;
|
||||||
uint64_t memused;
|
uint64_t memused;
|
||||||
|
int hash_entry_size;
|
||||||
} htablst_t;
|
} htablst_t;
|
||||||
|
|
||||||
archive_config_t *
|
archive_config_t *
|
||||||
|
@ -83,14 +85,15 @@ init_global_db(char *configfile)
|
||||||
|
|
||||||
archive_config_t *
|
archive_config_t *
|
||||||
init_global_db_s(char *path, uint32_t chunksize, int pct_interval,
|
init_global_db_s(char *path, uint32_t chunksize, int pct_interval,
|
||||||
compress_algo_t algo, cksum_t ck, size_t file_sz, size_t memlimit)
|
compress_algo_t algo, cksum_t ck, cksum_t ck_sim, size_t file_sz,
|
||||||
|
size_t memlimit)
|
||||||
{
|
{
|
||||||
archive_config_t *cfg;
|
archive_config_t *cfg;
|
||||||
int rv;
|
int rv;
|
||||||
float diff;
|
float diff;
|
||||||
|
|
||||||
cfg = calloc(1, sizeof (archive_config_t));
|
cfg = calloc(1, sizeof (archive_config_t));
|
||||||
rv = set_config_s(cfg, algo, ck, chunksize, file_sz, chunks_per_seg, pct_interval);
|
rv = set_config_s(cfg, algo, ck, ck_sim, chunksize, file_sz, chunks_per_seg, pct_interval);
|
||||||
|
|
||||||
if (path != NULL) {
|
if (path != NULL) {
|
||||||
printf("Disk based index not yet implemented.\n");
|
printf("Disk based index not yet implemented.\n");
|
||||||
|
@ -100,24 +103,25 @@ init_global_db_s(char *path, uint32_t chunksize, int pct_interval,
|
||||||
uint32_t hash_slots, intervals, i;
|
uint32_t hash_slots, intervals, i;
|
||||||
uint64_t memreqd;
|
uint64_t memreqd;
|
||||||
htablst_t *htablst;
|
htablst_t *htablst;
|
||||||
|
int hash_entry_size;
|
||||||
|
|
||||||
// Compute total hashtable entries first
|
// Compute total hashtable entries first
|
||||||
intervals = 100 / pct_interval - 1;
|
intervals = 100 / pct_interval - 1;
|
||||||
hash_slots = file_sz / cfg->segment_sz_bytes + 1;
|
hash_slots = file_sz / cfg->segment_sz_bytes + 1;
|
||||||
hash_slots *= intervals;
|
hash_slots *= intervals;
|
||||||
|
hash_entry_size = sizeof (hash_entry_t) + cfg->similarity_cksum_sz - 1;
|
||||||
|
|
||||||
// Compute memory required to hold all hash entries assuming worst case 50%
|
// Compute memory required to hold all hash entries assuming worst case 50%
|
||||||
// occupancy.
|
// occupancy.
|
||||||
memreqd = hash_slots * (sizeof (hash_entry_t) + cfg->chunk_cksum_sz +
|
memreqd = hash_slots * (hash_entry_size + sizeof (hash_entry_t *) +
|
||||||
sizeof (hash_entry_t *) + (sizeof (hash_entry_t *)) / 2);
|
(sizeof (hash_entry_t *)) / 2);
|
||||||
memreqd += hash_slots * sizeof (hash_entry_t **);
|
memreqd += hash_slots * sizeof (hash_entry_t **);
|
||||||
diff = (float)pct_interval / 100.0;
|
diff = (float)pct_interval / 100.0;
|
||||||
|
|
||||||
// Reduce hash_slots to remain within memlimit
|
// Reduce hash_slots to remain within memlimit
|
||||||
while (memreqd > memlimit) {
|
while (memreqd > memlimit) {
|
||||||
hash_slots -= (hash_slots * diff);
|
hash_slots -= (hash_slots * diff);
|
||||||
memreqd = hash_slots * (sizeof (hash_entry_t) +
|
memreqd = hash_slots * (hash_entry_size + sizeof (hash_entry_t *) +
|
||||||
cfg->chunk_cksum_sz + sizeof (hash_entry_t *) +
|
|
||||||
(sizeof (hash_entry_t *)) / 2);
|
(sizeof (hash_entry_t *)) / 2);
|
||||||
memreqd += hash_slots * sizeof (hash_entry_t **);
|
memreqd += hash_slots * sizeof (hash_entry_t **);
|
||||||
}
|
}
|
||||||
|
@ -126,17 +130,18 @@ init_global_db_s(char *path, uint32_t chunksize, int pct_interval,
|
||||||
// each having hash_slots / intervals slots.
|
// each having hash_slots / intervals slots.
|
||||||
htablst = calloc(1, sizeof (htablst_t));
|
htablst = calloc(1, sizeof (htablst_t));
|
||||||
htablst->memlimit = memlimit;
|
htablst->memlimit = memlimit;
|
||||||
htablst->htablst = (htab_t *)calloc(intervals, sizeof (htab_t));
|
htablst->list = (htab_t *)calloc(intervals, sizeof (htab_t));
|
||||||
htablst->mlist = (pthread_mutex_t *)malloc(intervals * sizeof (pthread_mutex_t));
|
htablst->mlist = (pthread_mutex_t *)malloc(intervals * sizeof (pthread_mutex_t));
|
||||||
|
htablst->hash_entry_size = hash_entry_size;
|
||||||
|
|
||||||
for (i = 0; i < intervals; i++) {
|
for (i = 0; i < intervals; i++) {
|
||||||
htablst->htablst[i].htab = (hash_entry_t **)calloc(hash_slots / intervals,
|
htablst->list[i].htab = (hash_entry_t **)calloc(hash_slots / intervals,
|
||||||
sizeof (hash_entry_t *));
|
sizeof (hash_entry_t *));
|
||||||
htablst->memused += ((hash_slots / intervals) * (sizeof (hash_entry_t *)));
|
htablst->memused += ((hash_slots / intervals) * (sizeof (hash_entry_t *)));
|
||||||
pthread_mutex_init(&(htablst->mlist[i]), NULL);
|
pthread_mutex_init(&(htablst->mlist[i]), NULL);
|
||||||
}
|
}
|
||||||
cfg->dbdata = htablst;
|
cfg->dbdata = htablst;
|
||||||
slab_cache_add(sizeof (hash_entry_t));
|
slab_cache_add(hash_entry_size);
|
||||||
slab_cache_add(cfg->chunk_cksum_sz);
|
slab_cache_add(cfg->chunk_cksum_sz);
|
||||||
}
|
}
|
||||||
return (cfg);
|
return (cfg);
|
|
@ -18,8 +18,8 @@
|
||||||
* moinakg@belenix.org, http://moinakg.wordpress.com/
|
* moinakg@belenix.org, http://moinakg.wordpress.com/
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#ifndef _INITDB_H
|
#ifndef _DB_H
|
||||||
#define _INITDB_H
|
#define _DB_H
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
extern "C" {
|
extern "C" {
|
||||||
|
@ -27,7 +27,8 @@ extern "C" {
|
||||||
|
|
||||||
archive_config_t *init_global_db(char *configfile);
|
archive_config_t *init_global_db(char *configfile);
|
||||||
archive_config_t *init_global_db_s(char *path, uint32_t chunksize, int pct_interval,
|
archive_config_t *init_global_db_s(char *path, uint32_t chunksize, int pct_interval,
|
||||||
compress_algo_t algo, cksum_t ck, size_t file_sz, size_t memlimit);
|
compress_algo_t algo, cksum_t ck, cksum_t ck_sim, size_t file_sz,
|
||||||
|
size_t memlimit);
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
}
|
}
|
Loading…
Reference in a new issue