Global dedupe work in progress.
This commit is contained in:
parent
1eae57c8a2
commit
24d62bfde9
5 changed files with 104 additions and 12 deletions
|
@ -176,7 +176,7 @@ read_config(char *configfile, archive_config_t *cfg)
|
|||
cfg->verify_chunks = 0;
|
||||
cfg->algo = COMPRESS_LZ4;
|
||||
cfg->chunk_cksum_type = DEFAULT_CKSUM;
|
||||
cfg->similarity_interval = DEFAULT_SIMILARITY_INTERVAL;
|
||||
cfg->pct_interval = DEFAULT_SIMILARITY_INTERVAL;
|
||||
|
||||
fh = fopen(configfile, "r");
|
||||
if (fh == NULL) {
|
||||
|
@ -286,6 +286,7 @@ read_config(char *configfile, archive_config_t *cfg)
|
|||
cfg->directory_levels = 3;
|
||||
}
|
||||
|
||||
cfg->segment_sz_bytes = segment_sz_bytes;
|
||||
cfg->segment_sz = segment_sz_bytes / cfg->chunk_sz_bytes;
|
||||
|
||||
total_dirs = 1;
|
||||
|
@ -329,8 +330,8 @@ write_config(char *configfile, archive_config_t *cfg)
|
|||
}
|
||||
|
||||
int
|
||||
set_simple_config(archive_config_t *cfg, compress_algo_t algo, cksum_t ck, uint32_t chunksize,
|
||||
size_t file_sz, uint32_t chunks_per_seg)
|
||||
set_config_s(archive_config_t *cfg, compress_algo_t algo, cksum_t ck, uint32_t chunksize,
|
||||
size_t file_sz, int pct_interval)
|
||||
{
|
||||
cfg->algo = algo;
|
||||
cfg->chunk_cksum_type = ck;
|
||||
|
@ -338,6 +339,7 @@ set_simple_config(archive_config_t *cfg, compress_algo_t algo, cksum_t ck, uint3
|
|||
cfg->chunk_cksum_sz = get_cksum_sz(cfg->chunk_cksum_type);
|
||||
cfg->chunk_sz = chunksize;
|
||||
cfg->chunk_sz_bytes = RAB_BLK_AVG_SZ(cfg->chunk_sz);
|
||||
cfg->pct_interval = pct_interval;
|
||||
|
||||
cfg->archive_sz = file_sz;
|
||||
if (cfg->archive_sz < ONE_TB) {
|
||||
|
@ -346,5 +348,10 @@ set_simple_config(archive_config_t *cfg, compress_algo_t algo, cksum_t ck, uint3
|
|||
} else {
|
||||
segment_sz_bytes = EIGHT_MB;
|
||||
}
|
||||
|
||||
cfg->segment_sz_bytes = segment_sz_bytes;
|
||||
cfg->segment_sz = segment_sz_bytes / cfg->chunk_sz_bytes;
|
||||
|
||||
return (0);
|
||||
}
|
||||
|
||||
|
|
|
@ -46,22 +46,24 @@ typedef struct {
|
|||
compress_algo_t compress_level; // Default preset compression level per algo.
|
||||
cksum_t chunk_cksum_type; // Which digest to use for hash based chunk lookup.
|
||||
int chunk_cksum_sz; // Size of cksum in bytes.
|
||||
int similarity_interval; // Similarity based match intervals in %age.
|
||||
int pct_interval; // Similarity based match intervals in %age.
|
||||
// The items below are computed given the above
|
||||
// components.
|
||||
|
||||
uint32_t chunk_sz_bytes; // Average chunk size
|
||||
uint32_t segment_sz; // Number of chunks
|
||||
uint32_t segment_sz_bytes; // Segment size in bytes
|
||||
uint32_t segment_sz; // Number of chunks in one segment
|
||||
uint32_t container_sz; // Number of segments
|
||||
int directory_fanout; // Number of subdirectories in a directory
|
||||
int directory_levels; // Levels of nested directories
|
||||
int num_containers; // Number of containers in a directory
|
||||
void *dbdata;
|
||||
} archive_config_t;
|
||||
|
||||
int read_config(char *configfile, archive_config_t *cfg);
|
||||
int write_config(char *configfile, archive_config_t *cfg);
|
||||
int set_simple_config(archive_config_t *cfg, compress_algo_t algo, cksum_t ck,
|
||||
uint32_t chunksize, size_t file_sz, uint32_t chunks_per_seg);
|
||||
int set_config_s(archive_config_t *cfg, compress_algo_t algo, cksum_t ck,
|
||||
uint32_t chunksize, size_t file_sz, int pct_interval);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
|
|
|
@ -28,7 +28,8 @@
|
|||
#include <stdio.h>
|
||||
#include <errno.h>
|
||||
#include <utils.h>
|
||||
#include <config.h>
|
||||
#include <allocator.h>
|
||||
#include <pthread.h>
|
||||
|
||||
#include "initdb.h"
|
||||
#include "config.h"
|
||||
|
@ -38,6 +39,28 @@
|
|||
#define FOUR_MB (4194304ULL)
|
||||
#define EIGHT_MB (8388608ULL)
|
||||
|
||||
/*
|
||||
* Hashtable structures for in-memory index.
|
||||
*/
|
||||
// One entry of the in-memory hash index.
typedef struct _hash_entry {
|
||||
uchar_t *cksum; // Pointer to the checksum bytes for this entry (presumably chunk_cksum_sz bytes long -- TODO confirm).
|
||||
struct _hash_entry *next; // Next entry; presumably the collision chain of one hash slot -- TODO confirm.
|
||||
struct _hash_entry *lru_prev; // Previous entry; presumably in the list delimited by lru_head/lru_tail in htablst_t -- TODO confirm.
|
||||
struct _hash_entry *lru_next; // Next entry; presumably in the list delimited by lru_head/lru_tail in htablst_t -- TODO confirm.
|
||||
} hash_entry_t;
|
||||
|
||||
// A single hash table of the in-memory index.
typedef struct {
|
||||
hash_entry_t **htab; // Array of entry pointers, one per slot; init_global_db_s() allocates hash_slots / intervals zeroed slots.
|
||||
} htab_t;
|
||||
|
||||
// Top-level state of the in-memory index; init_global_db_s() stores a pointer to it in cfg->dbdata.
typedef struct {
|
||||
htab_t *htablst; // Array of hash tables, one per similarity match interval.
|
||||
pthread_mutex_t *mlist; // Array of mutexes, one initialized per hash table (presumably guarding that table -- TODO confirm).
|
||||
hash_entry_t *lru_head; // Not set by init_global_db_s() beyond zero-fill; presumably head of an LRU list of entries -- TODO confirm.
|
||||
hash_entry_t *lru_tail; // Not set by init_global_db_s() beyond zero-fill; presumably tail of an LRU list of entries -- TODO confirm.
|
||||
uint64_t memlimit; // Memory limit in bytes as passed by the caller of init_global_db_s().
|
||||
uint64_t memused; // Bytes accounted so far; init_global_db_s() adds the size of each table's slot array.
|
||||
} htablst_t;
|
||||
|
||||
archive_config_t *
|
||||
init_global_db(char *configfile)
|
||||
|
@ -59,12 +82,67 @@ init_global_db(char *configfile)
|
|||
}
|
||||
|
||||
archive_config_t *
|
||||
init_global_db_simple(char *path, uint32_t chunksize, uint32_t chunks_per_seg,
|
||||
init_global_db_s(char *path, uint32_t chunksize, int pct_interval,
|
||||
compress_algo_t algo, cksum_t ck, size_t file_sz, size_t memlimit)
|
||||
{
|
||||
archive_config_t *cfg;
|
||||
int rv;
|
||||
float diff;
|
||||
|
||||
cfg = calloc(1, sizeof (archive_config_t));
|
||||
rv = set_simple_config(cfg, algo, ck, chunksize, file_sz, chunks_per_seg);
|
||||
rv = set_config_s(cfg, algo, ck, chunksize, file_sz, chunks_per_seg, pct_interval);
|
||||
|
||||
if (path != NULL) {
|
||||
printf("Disk based index not yet implemented.\n");
|
||||
free(cfg);
|
||||
return (NULL);
|
||||
} else {
|
||||
uint32_t hash_slots, intervals, i;
|
||||
uint64_t memreqd;
|
||||
htablst_t *htablst;
|
||||
|
||||
// Compute total hashtable entries first
|
||||
intervals = 100 / pct_interval - 1;
|
||||
hash_slots = file_sz / cfg->segment_sz_bytes + 1;
|
||||
hash_slots *= intervals;
|
||||
|
||||
// Compute memory required to hold all hash entries assuming worst case 50%
|
||||
// occupancy.
|
||||
memreqd = hash_slots * (sizeof (hash_entry_t) + cfg->chunk_cksum_sz +
|
||||
sizeof (hash_entry_t *) + (sizeof (hash_entry_t *)) / 2);
|
||||
memreqd += hash_slots * sizeof (hash_entry_t **);
|
||||
diff = (float)pct_interval / 100.0;
|
||||
|
||||
// Reduce hash_slots to remain within memlimit
|
||||
while (memreqd > memlimit) {
|
||||
hash_slots -= (hash_slots * diff);
|
||||
memreqd = hash_slots * (sizeof (hash_entry_t) +
|
||||
cfg->chunk_cksum_sz + sizeof (hash_entry_t *) +
|
||||
(sizeof (hash_entry_t *)) / 2);
|
||||
memreqd += hash_slots * sizeof (hash_entry_t **);
|
||||
}
|
||||
|
||||
// Now create as many hash tables as there are similarity match intervals
|
||||
// each having hash_slots / intervals slots.
|
||||
htablst = calloc(1, sizeof (htablst_t));
|
||||
htablst->memlimit = memlimit;
|
||||
htablst->htablst = (htab_t *)calloc(intervals, sizeof (htab_t));
|
||||
htablst->mlist = (pthread_mutex_t *)malloc(intervals * sizeof (pthread_mutex_t));
|
||||
|
||||
for (i = 0; i < intervals; i++) {
|
||||
htablst->htablst[i].htab = (hash_entry_t **)calloc(hash_slots / intervals,
|
||||
sizeof (hash_entry_t *));
|
||||
htablst->memused += ((hash_slots / intervals) * (sizeof (hash_entry_t *)));
|
||||
pthread_mutex_init(&(htablst->mlist[i]), NULL);
|
||||
}
|
||||
cfg->dbdata = htablst;
|
||||
slab_cache_add(sizeof (hash_entry_t));
|
||||
slab_cache_add(cfg->chunk_cksum_sz);
|
||||
}
|
||||
return (cfg);
|
||||
}
|
||||
|
||||
int
|
||||
db_insert_s(archive_config_t *cfg, uchar_t *cksum, int interval_num)
|
||||
{
|
||||
}
|
||||
|
|
|
@ -26,8 +26,8 @@ extern "C" {
|
|||
#endif
|
||||
|
||||
archive_config_t *init_global_db(char *configfile);
|
||||
archive_config_t *init_global_db_simple(char *path, uint32_t chunksize, uint32_t chunks_per_seg,
|
||||
compress_algo_t algo, cksum_t ck, size_t file_sz, size_t memlimit)
|
||||
archive_config_t *init_global_db_s(char *path, uint32_t chunksize, int pct_interval,
|
||||
compress_algo_t algo, cksum_t ck, size_t file_sz, size_t memlimit);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
|
|
|
@ -149,6 +149,11 @@ typedef struct rab_blockentry {
|
|||
struct rab_blockentry *next;
|
||||
} rabin_blockentry_t;
|
||||
|
||||
// Offset/length pair describing one block; not used elsewhere in the visible code.
typedef struct global_blockentry {
|
||||
uint64_t offset; // Presumably the block's byte offset -- TODO confirm what it is relative to.
|
||||
uint32_t length; // Presumably the block's length in bytes -- TODO confirm.
|
||||
} global_blockentry_t;
|
||||
|
||||
typedef struct {
|
||||
unsigned char *current_window_data;
|
||||
rabin_blockentry_t **blocks;
|
||||
|
|
Loading…
Reference in a new issue