Complete implementation for Segmented Global Deduplication.
This commit is contained in:
parent
a22b52cf08
commit
8ae571124d
8 changed files with 332 additions and 157 deletions
62
main.c
62
main.c
|
@ -1659,37 +1659,9 @@ start_compress(const char *filename, uint64_t chunksize, int level)
|
||||||
props.cksum = cksum;
|
props.cksum = cksum;
|
||||||
props.buf_extra = 0;
|
props.buf_extra = 0;
|
||||||
cread_buf = NULL;
|
cread_buf = NULL;
|
||||||
|
|
||||||
if (_props_func) {
|
|
||||||
_props_func(&props, level, chunksize);
|
|
||||||
if (chunksize + props.buf_extra > compressed_chunksize) {
|
|
||||||
compressed_chunksize += (chunksize + props.buf_extra -
|
|
||||||
compressed_chunksize);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
flags = 0;
|
flags = 0;
|
||||||
sbuf.st_size = 0;
|
sbuf.st_size = 0;
|
||||||
dedupe_flag = RABIN_DEDUPE_SEGMENTED; // Silence the compiler
|
dedupe_flag = RABIN_DEDUPE_SEGMENTED; // Silence the compiler
|
||||||
if (enable_rabin_scan || enable_fixed_scan || enable_rabin_global) {
|
|
||||||
if (enable_rabin_global) {
|
|
||||||
flags |= (FLAG_DEDUP | FLAG_DEDUP_FIXED);
|
|
||||||
dedupe_flag = RABIN_DEDUPE_FILE_GLOBAL;
|
|
||||||
} else if (enable_rabin_scan) {
|
|
||||||
flags |= FLAG_DEDUP;
|
|
||||||
dedupe_flag = RABIN_DEDUPE_SEGMENTED;
|
|
||||||
} else {
|
|
||||||
flags |= FLAG_DEDUP_FIXED;
|
|
||||||
dedupe_flag = RABIN_DEDUPE_FIXED;
|
|
||||||
}
|
|
||||||
/* Additional scratch space for dedup arrays. */
|
|
||||||
if (chunksize + dedupe_buf_extra(chunksize, 0, algo, enable_delta_encode)
|
|
||||||
> compressed_chunksize) {
|
|
||||||
compressed_chunksize += (chunksize +
|
|
||||||
dedupe_buf_extra(chunksize, 0, algo, enable_delta_encode)) -
|
|
||||||
compressed_chunksize;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (encrypt_type) {
|
if (encrypt_type) {
|
||||||
uchar_t pw[MAX_PW_LEN];
|
uchar_t pw[MAX_PW_LEN];
|
||||||
|
@ -1867,6 +1839,14 @@ start_compress(const char *filename, uint64_t chunksize, int level)
|
||||||
strcpy(tmpdir, tmp);
|
strcpy(tmpdir, tmp);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (enable_rabin_global && !pipe_mode) {
|
||||||
|
my_sysinfo msys_info;
|
||||||
|
|
||||||
|
get_sys_limits(&msys_info);
|
||||||
|
global_dedupe_bufadjust(rab_blk_size, &chunksize, 0, algo, cksum,
|
||||||
|
CKSUM_BLAKE256, sbuf.st_size, msys_info.freeram, nthreads);
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Compressed buffer size must include zlib/dedup scratch space and
|
* Compressed buffer size must include zlib/dedup scratch space and
|
||||||
* chunk header space.
|
* chunk header space.
|
||||||
|
@ -1885,7 +1865,25 @@ start_compress(const char *filename, uint64_t chunksize, int level)
|
||||||
compressed_chunksize);
|
compressed_chunksize);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (_props_func) {
|
||||||
|
_props_func(&props, level, chunksize);
|
||||||
|
if (chunksize + props.buf_extra > compressed_chunksize) {
|
||||||
|
compressed_chunksize += (chunksize + props.buf_extra -
|
||||||
|
compressed_chunksize);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if (enable_rabin_scan || enable_fixed_scan || enable_rabin_global) {
|
if (enable_rabin_scan || enable_fixed_scan || enable_rabin_global) {
|
||||||
|
if (enable_rabin_global) {
|
||||||
|
flags |= (FLAG_DEDUP | FLAG_DEDUP_FIXED);
|
||||||
|
dedupe_flag = RABIN_DEDUPE_FILE_GLOBAL;
|
||||||
|
} else if (enable_rabin_scan) {
|
||||||
|
flags |= FLAG_DEDUP;
|
||||||
|
dedupe_flag = RABIN_DEDUPE_SEGMENTED;
|
||||||
|
} else {
|
||||||
|
flags |= FLAG_DEDUP_FIXED;
|
||||||
|
dedupe_flag = RABIN_DEDUPE_FIXED;
|
||||||
|
}
|
||||||
/* Additional scratch space for dedup arrays. */
|
/* Additional scratch space for dedup arrays. */
|
||||||
if (chunksize + dedupe_buf_extra(chunksize, 0, algo, enable_delta_encode)
|
if (chunksize + dedupe_buf_extra(chunksize, 0, algo, enable_delta_encode)
|
||||||
> compressed_chunksize) {
|
> compressed_chunksize) {
|
||||||
|
@ -1908,14 +1906,6 @@ start_compress(const char *filename, uint64_t chunksize, int level)
|
||||||
nprocs = nthreads;
|
nprocs = nthreads;
|
||||||
fprintf(stderr, "\n");
|
fprintf(stderr, "\n");
|
||||||
|
|
||||||
if (enable_rabin_global && !pipe_mode) {
|
|
||||||
my_sysinfo msys_info;
|
|
||||||
|
|
||||||
get_sys_limits(&msys_info);
|
|
||||||
global_dedupe_bufadjust(rab_blk_size, &chunksize, 0, algo, cksum,
|
|
||||||
CKSUM_BLAKE256, sbuf.st_size, msys_info.freeram, nthreads);
|
|
||||||
}
|
|
||||||
|
|
||||||
dary = (struct cmp_data **)slab_calloc(NULL, nprocs, sizeof (struct cmp_data *));
|
dary = (struct cmp_data **)slab_calloc(NULL, nprocs, sizeof (struct cmp_data *));
|
||||||
if ((enable_rabin_scan || enable_fixed_scan))
|
if ((enable_rabin_scan || enable_fixed_scan))
|
||||||
cread_buf = (uchar_t *)slab_alloc(NULL, compressed_chunksize);
|
cread_buf = (uchar_t *)slab_alloc(NULL, compressed_chunksize);
|
||||||
|
|
|
@ -108,7 +108,10 @@ get_compress_str(compress_algo_t algo)
|
||||||
static cksum_t
|
static cksum_t
|
||||||
get_cksum_type(char *cksum_name)
|
get_cksum_type(char *cksum_name)
|
||||||
{
|
{
|
||||||
if (strcmp(cksum_name, "SHA256") == 0) {
|
if (strcmp(cksum_name, "CRC64") == 0) {
|
||||||
|
return (CKSUM_CRC64);
|
||||||
|
|
||||||
|
} else if (strcmp(cksum_name, "SHA256") == 0) {
|
||||||
return (CKSUM_SHA256);
|
return (CKSUM_SHA256);
|
||||||
|
|
||||||
} else if (strcmp(cksum_name, "SHA512") == 0) {
|
} else if (strcmp(cksum_name, "SHA512") == 0) {
|
||||||
|
@ -132,7 +135,10 @@ get_cksum_type(char *cksum_name)
|
||||||
static char *
|
static char *
|
||||||
get_cksum_str(cksum_t ck)
|
get_cksum_str(cksum_t ck)
|
||||||
{
|
{
|
||||||
if (ck == CKSUM_SHA256) {
|
if (ck == CKSUM_CRC64) {
|
||||||
|
return ("CRC64");
|
||||||
|
|
||||||
|
} else if (ck == CKSUM_SHA256) {
|
||||||
return ("SHA256");
|
return ("SHA256");
|
||||||
|
|
||||||
} else if (ck == CKSUM_SHA512) {
|
} else if (ck == CKSUM_SHA512) {
|
||||||
|
@ -156,7 +162,10 @@ get_cksum_str(cksum_t ck)
|
||||||
static int
|
static int
|
||||||
get_cksum_sz(cksum_t ck)
|
get_cksum_sz(cksum_t ck)
|
||||||
{
|
{
|
||||||
if (ck == CKSUM_SHA256 || ck == CKSUM_BLAKE256 || ck == CKSUM_KECCAK256) {
|
if (ck == CKSUM_CRC64) {
|
||||||
|
return (8);
|
||||||
|
|
||||||
|
} else if (ck == CKSUM_SHA256 || ck == CKSUM_BLAKE256 || ck == CKSUM_KECCAK256) {
|
||||||
return (32);
|
return (32);
|
||||||
|
|
||||||
} else if (ck == CKSUM_SHA512 || ck == CKSUM_BLAKE512 || ck == CKSUM_KECCAK512) {
|
} else if (ck == CKSUM_SHA512 || ck == CKSUM_BLAKE512 || ck == CKSUM_KECCAK512) {
|
||||||
|
@ -360,9 +369,10 @@ set_config_s(archive_config_t *cfg, const char *algo, cksum_t ck, cksum_t ck_sim
|
||||||
cfg->archive_sz = file_sz;
|
cfg->archive_sz = file_sz;
|
||||||
cfg->dedupe_mode = MODE_SIMILARITY;
|
cfg->dedupe_mode = MODE_SIMILARITY;
|
||||||
|
|
||||||
if (cfg->archive_sz <= SIXTEEN_GB || pct_interval == 0 || pct_interval == 100) {
|
if (cfg->archive_sz <= SIXTEEN_GB && (pct_interval == 0 || pct_interval == 100)) {
|
||||||
cfg->dedupe_mode = MODE_SIMPLE;
|
cfg->dedupe_mode = MODE_SIMPLE;
|
||||||
cfg->segment_sz_bytes = user_chunk_sz;
|
cfg->segment_sz_bytes = user_chunk_sz;
|
||||||
|
cfg->similarity_cksum_sz = cfg->chunk_cksum_sz;
|
||||||
|
|
||||||
} else if (cfg->archive_sz < ONE_TB) {
|
} else if (cfg->archive_sz < ONE_TB) {
|
||||||
cfg->segment_sz_bytes = FOUR_MB;
|
cfg->segment_sz_bytes = FOUR_MB;
|
||||||
|
|
|
@ -37,7 +37,7 @@ extern "C" {
|
||||||
#define DEFAULT_CHUNK_CKSUM CKSUM_SHA256
|
#define DEFAULT_CHUNK_CKSUM CKSUM_SHA256
|
||||||
#define DEFAULT_SIMILARITY_CKSUM CKSUM_BLAKE256
|
#define DEFAULT_SIMILARITY_CKSUM CKSUM_BLAKE256
|
||||||
#define DEFAULT_COMPRESS COMPRESS_LZ4
|
#define DEFAULT_COMPRESS COMPRESS_LZ4
|
||||||
#define DEFAULT_PCT_INTERVAL 2
|
#define DEFAULT_PCT_INTERVAL 5
|
||||||
#define CONTAINER_ITEMS 2048
|
#define CONTAINER_ITEMS 2048
|
||||||
#define MIN_CK 1
|
#define MIN_CK 1
|
||||||
#define MAX_CK 5
|
#define MAX_CK 5
|
||||||
|
@ -71,7 +71,7 @@ typedef struct {
|
||||||
int pct_interval; // Similarity based match intervals in %age.
|
int pct_interval; // Similarity based match intervals in %age.
|
||||||
// The items below are computed given the above
|
// The items below are computed given the above
|
||||||
// components.
|
// components.
|
||||||
int intervals;
|
int intervals, sub_intervals;
|
||||||
dedupe_mode_t dedupe_mode;
|
dedupe_mode_t dedupe_mode;
|
||||||
|
|
||||||
uint32_t chunk_sz_bytes; // Average chunk size
|
uint32_t chunk_sz_bytes; // Average chunk size
|
||||||
|
@ -83,6 +83,7 @@ typedef struct {
|
||||||
int num_containers; // Number of containers in a directory
|
int num_containers; // Number of containers in a directory
|
||||||
int nthreads; // Number of threads processing data segments in parallel
|
int nthreads; // Number of threads processing data segments in parallel
|
||||||
int seg_fd_w;
|
int seg_fd_w;
|
||||||
|
uint64_t segcache_pos;
|
||||||
uint32_t pagesize;
|
uint32_t pagesize;
|
||||||
struct seg_map_fd *seg_fd_r; // One read-only fd per thread for mapping in portions of the
|
struct seg_map_fd *seg_fd_r; // One read-only fd per thread for mapping in portions of the
|
||||||
// segment metadata cache.
|
// segment metadata cache.
|
||||||
|
@ -90,11 +91,14 @@ typedef struct {
|
||||||
void *dbdata;
|
void *dbdata;
|
||||||
} archive_config_t;
|
} archive_config_t;
|
||||||
|
|
||||||
typedef struct _segment_entry {
|
#pragma pack(1)
|
||||||
uint64_t chunk_offset;
|
typedef struct global_blockentry {
|
||||||
uint32_t chunk_length;
|
uint32_t length;
|
||||||
uchar_t *chunk_cksum;
|
uint64_t offset;
|
||||||
} segment_entry_t;
|
struct global_blockentry *next; // Reqd when part of a hashtable
|
||||||
|
uchar_t cksum[CKSUM_MAX_BYTES];
|
||||||
|
} global_blockentry_t;
|
||||||
|
#pragma pack()
|
||||||
|
|
||||||
int read_config(char *configfile, archive_config_t *cfg);
|
int read_config(char *configfile, archive_config_t *cfg);
|
||||||
int write_config(char *configfile, archive_config_t *cfg);
|
int write_config(char *configfile, archive_config_t *cfg);
|
||||||
|
|
|
@ -108,7 +108,7 @@ int
|
||||||
setup_db_config_s(archive_config_t *cfg, uint32_t chunksize, uint64_t *user_chunk_sz,
|
setup_db_config_s(archive_config_t *cfg, uint32_t chunksize, uint64_t *user_chunk_sz,
|
||||||
int *pct_interval, const char *algo, cksum_t ck, cksum_t ck_sim,
|
int *pct_interval, const char *algo, cksum_t ck, cksum_t ck_sim,
|
||||||
size_t file_sz, uint32_t *hash_slots, int *hash_entry_size,
|
size_t file_sz, uint32_t *hash_slots, int *hash_entry_size,
|
||||||
uint32_t *intervals, uint64_t *memreqd, size_t memlimit)
|
uint64_t *memreqd, size_t memlimit)
|
||||||
{
|
{
|
||||||
int rv, set_user;
|
int rv, set_user;
|
||||||
|
|
||||||
|
@ -158,17 +158,20 @@ set_cfg:
|
||||||
// Compute total hashtable entries first
|
// Compute total hashtable entries first
|
||||||
*hash_entry_size = sizeof (hash_entry_t) + cfg->similarity_cksum_sz - 1;
|
*hash_entry_size = sizeof (hash_entry_t) + cfg->similarity_cksum_sz - 1;
|
||||||
if (*pct_interval == 0) {
|
if (*pct_interval == 0) {
|
||||||
*intervals = 1;
|
cfg->intervals = 1;
|
||||||
|
cfg->sub_intervals = 0;
|
||||||
*hash_slots = file_sz / cfg->chunk_sz_bytes + 1;
|
*hash_slots = file_sz / cfg->chunk_sz_bytes + 1;
|
||||||
|
|
||||||
} else if (*pct_interval == 100) {
|
} else if (*pct_interval == 100) {
|
||||||
*intervals = 1;
|
cfg->intervals = 1;
|
||||||
|
cfg->sub_intervals = 0;
|
||||||
*hash_slots = SLOTS_FOR_MEM(memlimit, *hash_entry_size);
|
*hash_slots = SLOTS_FOR_MEM(memlimit, *hash_entry_size);
|
||||||
*pct_interval = 0;
|
*pct_interval = 0;
|
||||||
} else {
|
} else {
|
||||||
*intervals = 90 / *pct_interval - 1;
|
cfg->intervals = 100 / *pct_interval;
|
||||||
|
cfg->sub_intervals = cfg->segment_sz / cfg->intervals;
|
||||||
*hash_slots = file_sz / cfg->segment_sz_bytes + 1;
|
*hash_slots = file_sz / cfg->segment_sz_bytes + 1;
|
||||||
*hash_slots *= *intervals;
|
*hash_slots *= (cfg->intervals + cfg->sub_intervals);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Compute memory required to hold all hash entries assuming worst case 50%
|
// Compute memory required to hold all hash entries assuming worst case 50%
|
||||||
|
@ -191,7 +194,6 @@ init_global_db_s(char *path, char *tmppath, uint32_t chunksize, uint64_t user_ch
|
||||||
{
|
{
|
||||||
archive_config_t *cfg;
|
archive_config_t *cfg;
|
||||||
int rv, orig_pct;
|
int rv, orig_pct;
|
||||||
float diff;
|
|
||||||
uint32_t hash_slots, intervals, i;
|
uint32_t hash_slots, intervals, i;
|
||||||
uint64_t memreqd;
|
uint64_t memreqd;
|
||||||
int hash_entry_size;
|
int hash_entry_size;
|
||||||
|
@ -204,17 +206,12 @@ init_global_db_s(char *path, char *tmppath, uint32_t chunksize, uint64_t user_ch
|
||||||
orig_pct = pct_interval;
|
orig_pct = pct_interval;
|
||||||
cfg = calloc(1, sizeof (archive_config_t));
|
cfg = calloc(1, sizeof (archive_config_t));
|
||||||
|
|
||||||
diff = (float)pct_interval / 90.0;
|
|
||||||
rv = setup_db_config_s(cfg, chunksize, &user_chunk_sz, &pct_interval, algo, ck, ck_sim,
|
rv = setup_db_config_s(cfg, chunksize, &user_chunk_sz, &pct_interval, algo, ck, ck_sim,
|
||||||
file_sz, &hash_slots, &hash_entry_size, &intervals, &memreqd, memlimit);
|
file_sz, &hash_slots, &hash_entry_size, &memreqd, memlimit);
|
||||||
|
|
||||||
// Reduce hash_slots to remain within memlimit
|
// Reduce hash_slots to remain within memlimit
|
||||||
while (memreqd > memlimit) {
|
while (memreqd > memlimit) {
|
||||||
if (pct_interval == 0) {
|
|
||||||
hash_slots--;
|
hash_slots--;
|
||||||
} else {
|
|
||||||
hash_slots -= (hash_slots * diff);
|
|
||||||
}
|
|
||||||
memreqd = hash_slots * MEM_PER_UNIT(hash_entry_size);
|
memreqd = hash_slots * MEM_PER_UNIT(hash_entry_size);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -228,23 +225,22 @@ init_global_db_s(char *path, char *tmppath, uint32_t chunksize, uint64_t user_ch
|
||||||
return (NULL);
|
return (NULL);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
cfg->nthreads = nthreads;
|
||||||
|
intervals = cfg->intervals + cfg->sub_intervals;
|
||||||
indx->memlimit = memlimit - (hash_entry_size << 2);
|
indx->memlimit = memlimit - (hash_entry_size << 2);
|
||||||
indx->list = (htab_t *)calloc(intervals, sizeof (htab_t));
|
indx->list = (htab_t *)calloc(intervals, sizeof (htab_t));
|
||||||
indx->hash_entry_size = hash_entry_size;
|
indx->hash_entry_size = hash_entry_size;
|
||||||
indx->intervals = intervals;
|
indx->intervals = intervals;
|
||||||
indx->hash_slots = hash_slots / intervals;
|
indx->hash_slots = hash_slots / intervals;
|
||||||
cfg->nthreads = nthreads;
|
|
||||||
cfg->intervals = intervals;
|
|
||||||
|
|
||||||
for (i = 0; i < intervals; i++) {
|
for (i = 0; i < intervals; i++) {
|
||||||
indx->list[i].tab = (hash_entry_t **)calloc(hash_slots / intervals,
|
indx->list[i].tab = (hash_entry_t **)calloc(indx->hash_slots, sizeof (hash_entry_t *));
|
||||||
sizeof (hash_entry_t *));
|
|
||||||
if (!(indx->list[i].tab)) {
|
if (!(indx->list[i].tab)) {
|
||||||
cleanup_indx(indx);
|
cleanup_indx(indx);
|
||||||
free(cfg);
|
free(cfg);
|
||||||
return (NULL);
|
return (NULL);
|
||||||
}
|
}
|
||||||
indx->memused += ((hash_slots / intervals) * (sizeof (hash_entry_t *)));
|
indx->memused += ((indx->hash_slots) * (sizeof (hash_entry_t *)));
|
||||||
}
|
}
|
||||||
|
|
||||||
if (pct_interval > 0) {
|
if (pct_interval > 0) {
|
||||||
|
@ -264,6 +260,7 @@ init_global_db_s(char *path, char *tmppath, uint32_t chunksize, uint64_t user_ch
|
||||||
cfg->seg_fd_r[i].mapping = NULL;
|
cfg->seg_fd_r[i].mapping = NULL;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
cfg->segcache_pos = 0;
|
||||||
cfg->dbdata = indx;
|
cfg->dbdata = indx;
|
||||||
return (cfg);
|
return (cfg);
|
||||||
}
|
}
|
||||||
|
@ -277,23 +274,25 @@ init_global_db_s(char *path, char *tmppath, uint32_t chunksize, uint64_t user_ch
|
||||||
* Add new segment block list array into the metadata cache. Once added the entry is
|
* Add new segment block list array into the metadata cache. Once added the entry is
|
||||||
* not removed till the program exits.
|
* not removed till the program exits.
|
||||||
*/
|
*/
|
||||||
|
#define SEGCACHE_HDR_SZ 12
|
||||||
int
|
int
|
||||||
db_segcache_write(archive_config_t *cfg, int tid, uchar_t *buf, uint32_t len, uint32_t blknum,
|
db_segcache_write(archive_config_t *cfg, int tid, uchar_t *buf, uint32_t len, uint32_t blknum,
|
||||||
uint64_t file_offset)
|
uint64_t file_offset)
|
||||||
{
|
{
|
||||||
int64_t w;
|
int64_t w;
|
||||||
uchar_t *hdr[16];
|
uchar_t hdr[SEGCACHE_HDR_SZ];
|
||||||
|
|
||||||
*((uint32_t *)hdr) = len;
|
*((uint32_t *)(hdr)) = blknum;
|
||||||
*((uint32_t *)(hdr + 4)) = blknum;
|
*((uint64_t *)(hdr + 4)) = file_offset;
|
||||||
*((uint32_t *)(hdr + 8)) = file_offset;
|
|
||||||
|
|
||||||
w = Write(cfg->seg_fd_w, hdr, sizeof (hdr));
|
w = Write(cfg->seg_fd_w, hdr, sizeof (hdr));
|
||||||
if (w < sizeof (hdr))
|
if (w < sizeof (hdr))
|
||||||
return (-1);
|
return (-1);
|
||||||
|
cfg->segcache_pos += w;
|
||||||
w = Write(cfg->seg_fd_w, buf, len);
|
w = Write(cfg->seg_fd_w, buf, len);
|
||||||
if (w < len)
|
if (w < len)
|
||||||
return (-1);
|
return (-1);
|
||||||
|
cfg->segcache_pos += w;
|
||||||
return (0);
|
return (0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -304,20 +303,19 @@ db_segcache_write(archive_config_t *cfg, int tid, uchar_t *buf, uint32_t len, ui
|
||||||
int
|
int
|
||||||
db_segcache_pos(archive_config_t *cfg, int tid)
|
db_segcache_pos(archive_config_t *cfg, int tid)
|
||||||
{
|
{
|
||||||
return (lseek(cfg->seg_fd_w, 0, SEEK_CUR));
|
return (cfg->segcache_pos);
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Mmap the requested segment metadata array.
|
* Mmap the requested segment metadata array.
|
||||||
*/
|
*/
|
||||||
int
|
int
|
||||||
db_segcache_map(archive_config_t *cfg, int tid, uint32_t *blknum, uint64_t *offset, uchar_t **blocks)
|
db_segcache_map(archive_config_t *cfg, int tid, uint32_t *blknum, uint64_t *offset, uchar_t *blocks)
|
||||||
{
|
{
|
||||||
uchar_t *mapbuf;
|
uchar_t *mapbuf, *hdr;
|
||||||
uchar_t *hdr[16];
|
|
||||||
int64_t r;
|
|
||||||
int fd;
|
int fd;
|
||||||
uint32_t len, adj;
|
uint32_t len, adj;
|
||||||
|
uint64_t pos;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Ensure previous mapping is removed.
|
* Ensure previous mapping is removed.
|
||||||
|
@ -328,22 +326,25 @@ db_segcache_map(archive_config_t *cfg, int tid, uint32_t *blknum, uint64_t *offs
|
||||||
return (-1);
|
return (-1);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Read header first so that we know how much to map.
|
* Mmap hdr and blocks. We assume max # of rabin block entries and mmap (unless remaining
|
||||||
|
* file length is less). The header contains actual number of block entries so mmap-ing
|
||||||
|
* extra has no consequence other than address space usage.
|
||||||
*/
|
*/
|
||||||
r = Read(fd, hdr, sizeof (hdr));
|
len = cfg->segment_sz * sizeof (global_blockentry_t) + SEGCACHE_HDR_SZ;
|
||||||
if (r < sizeof (hdr))
|
pos = cfg->segcache_pos;
|
||||||
return (-1);
|
if (pos - *offset < len)
|
||||||
|
len = pos - *offset;
|
||||||
|
|
||||||
*offset += sizeof (hdr);
|
|
||||||
len = *((uint32_t *)hdr);
|
|
||||||
adj = *offset % cfg->pagesize;
|
adj = *offset % cfg->pagesize;
|
||||||
*blknum = *((uint32_t *)(hdr + 4));
|
|
||||||
mapbuf = mmap(NULL, len + adj, PROT_READ, MAP_SHARED, fd, *offset - adj);
|
mapbuf = mmap(NULL, len + adj, PROT_READ, MAP_SHARED, fd, *offset - adj);
|
||||||
|
|
||||||
if (mapbuf == MAP_FAILED)
|
if (mapbuf == MAP_FAILED)
|
||||||
return (-1);
|
return (-1);
|
||||||
*offset = *((uint32_t *)(hdr + 8));
|
|
||||||
*blocks = mapbuf + adj;
|
hdr = mapbuf + adj;
|
||||||
|
*blknum = *((uint32_t *)(hdr));
|
||||||
|
*offset = *((uint64_t *)(hdr + 4));
|
||||||
|
memcpy(blocks, hdr + SEGCACHE_HDR_SZ, *blknum * sizeof (global_blockentry_t));
|
||||||
|
|
||||||
cfg->seg_fd_r[tid].mapping = mapbuf;
|
cfg->seg_fd_r[tid].mapping = mapbuf;
|
||||||
cfg->seg_fd_r[tid].len = len + adj;
|
cfg->seg_fd_r[tid].len = len + adj;
|
||||||
|
|
||||||
|
|
|
@ -45,7 +45,7 @@ archive_config_t *init_global_db(char *configfile);
|
||||||
int setup_db_config_s(archive_config_t *cfg, uint32_t chunksize, uint64_t *user_chunk_sz,
|
int setup_db_config_s(archive_config_t *cfg, uint32_t chunksize, uint64_t *user_chunk_sz,
|
||||||
int *pct_interval, const char *algo, cksum_t ck, cksum_t ck_sim,
|
int *pct_interval, const char *algo, cksum_t ck, cksum_t ck_sim,
|
||||||
size_t file_sz, uint32_t *hash_slots, int *hash_entry_size,
|
size_t file_sz, uint32_t *hash_slots, int *hash_entry_size,
|
||||||
uint32_t *intervals, uint64_t *memreqd, size_t memlimit);
|
uint64_t *memreqd, size_t memlimit);
|
||||||
archive_config_t *init_global_db_s(char *path, char *tmppath, uint32_t chunksize,
|
archive_config_t *init_global_db_s(char *path, char *tmppath, uint32_t chunksize,
|
||||||
uint64_t user_chunk_sz, int pct_interval, const char *algo,
|
uint64_t user_chunk_sz, int pct_interval, const char *algo,
|
||||||
cksum_t ck, cksum_t ck_sim, size_t file_sz, size_t memlimit,
|
cksum_t ck, cksum_t ck_sim, size_t file_sz, size_t memlimit,
|
||||||
|
@ -56,7 +56,7 @@ void destroy_global_db_s(archive_config_t *cfg);
|
||||||
|
|
||||||
int db_segcache_write(archive_config_t *cfg, int tid, uchar_t *buf, uint32_t len, uint32_t blknum, uint64_t file_offset);
|
int db_segcache_write(archive_config_t *cfg, int tid, uchar_t *buf, uint32_t len, uint32_t blknum, uint64_t file_offset);
|
||||||
int db_segcache_pos(archive_config_t *cfg, int tid);
|
int db_segcache_pos(archive_config_t *cfg, int tid);
|
||||||
int db_segcache_map(archive_config_t *cfg, int tid, uint32_t *blknum, uint64_t *offset, uchar_t **blocks);
|
int db_segcache_map(archive_config_t *cfg, int tid, uint32_t *blknum, uint64_t *offset, uchar_t *blocks);
|
||||||
int db_segcache_unmap(archive_config_t *cfg, int tid);
|
int db_segcache_unmap(archive_config_t *cfg, int tid);
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
|
|
|
@ -103,6 +103,7 @@ extern int bsdiff(u_char *oldbuf, bsize_t oldsize, u_char *newbuf, bsize_t newsi
|
||||||
extern bsize_t get_bsdiff_sz(u_char *pbuf);
|
extern bsize_t get_bsdiff_sz(u_char *pbuf);
|
||||||
extern int bspatch(u_char *pbuf, u_char *oldbuf, bsize_t oldsize, u_char *newbuf,
|
extern int bspatch(u_char *pbuf, u_char *oldbuf, bsize_t oldsize, u_char *newbuf,
|
||||||
bsize_t *_newsize);
|
bsize_t *_newsize);
|
||||||
|
extern uint64_t lzma_crc64(const uint8_t *buf, size_t size, uint64_t crc);
|
||||||
|
|
||||||
static pthread_mutex_t init_lock = PTHREAD_MUTEX_INITIALIZER;
|
static pthread_mutex_t init_lock = PTHREAD_MUTEX_INITIALIZER;
|
||||||
uint64_t ir[256], out[256];
|
uint64_t ir[256], out[256];
|
||||||
|
@ -133,19 +134,19 @@ dedupe_buf_extra(uint64_t chunksize, int rab_blk_sz, const char *algo, int delta
|
||||||
* to align with deduplication requirements.
|
* to align with deduplication requirements.
|
||||||
*/
|
*/
|
||||||
int
|
int
|
||||||
global_dedupe_bufadjust(uint32_t chunksize, uint64_t *user_chunk_sz, int pct_interval,
|
global_dedupe_bufadjust(uint32_t rab_blk_sz, uint64_t *user_chunk_sz, int pct_interval,
|
||||||
const char *algo, cksum_t ck, cksum_t ck_sim, size_t file_sz,
|
const char *algo, cksum_t ck, cksum_t ck_sim, size_t file_sz,
|
||||||
size_t memlimit, int nthreads)
|
size_t memlimit, int nthreads)
|
||||||
{
|
{
|
||||||
uint64_t memreqd;
|
uint64_t memreqd;
|
||||||
archive_config_t cfg;
|
archive_config_t cfg;
|
||||||
int rv, pct_i, hash_entry_size;
|
int rv, pct_i, hash_entry_size;
|
||||||
uint32_t intervals, hash_slots;
|
uint32_t hash_slots;
|
||||||
|
|
||||||
rv = 0;
|
rv = 0;
|
||||||
pct_i = pct_interval;
|
pct_i = pct_interval;
|
||||||
rv = setup_db_config_s(&cfg, chunksize, user_chunk_sz, &pct_i, algo, ck, ck_sim,
|
rv = setup_db_config_s(&cfg, rab_blk_sz, user_chunk_sz, &pct_i, algo, ck, ck_sim,
|
||||||
file_sz, &hash_slots, &hash_entry_size, &intervals, &memreqd, memlimit);
|
file_sz, &hash_slots, &hash_entry_size, &memreqd, memlimit);
|
||||||
return (rv);
|
return (rv);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -210,8 +211,8 @@ create_dedupe_context(uint64_t chunksize, uint64_t real_chunksize, int rab_blk_s
|
||||||
*/
|
*/
|
||||||
get_sys_limits(&msys_info);
|
get_sys_limits(&msys_info);
|
||||||
|
|
||||||
arc = init_global_db_s(NULL, NULL, rab_blk_sz, chunksize, 0,
|
arc = init_global_db_s(NULL, "/tmp", rab_blk_sz, chunksize, 0,
|
||||||
algo, props->cksum, props->cksum, file_size,
|
algo, props->cksum, CKSUM_CRC64, file_size,
|
||||||
msys_info.freeram, props->nthreads);
|
msys_info.freeram, props->nthreads);
|
||||||
if (arc == NULL) {
|
if (arc == NULL) {
|
||||||
pthread_mutex_unlock(&init_lock);
|
pthread_mutex_unlock(&init_lock);
|
||||||
|
@ -307,7 +308,8 @@ create_dedupe_context(uint64_t chunksize, uint64_t real_chunksize, int rab_blk_s
|
||||||
}
|
}
|
||||||
|
|
||||||
if (arc && dedupe_flag == RABIN_DEDUPE_FILE_GLOBAL) {
|
if (arc && dedupe_flag == RABIN_DEDUPE_FILE_GLOBAL) {
|
||||||
ctx->similarity_cksums = (uchar_t *)slab_calloc(NULL, arc->intervals, 32);
|
ctx->similarity_cksums = (uchar_t *)slab_calloc(NULL, arc->intervals + arc->sub_intervals,
|
||||||
|
arc->similarity_cksum_sz);
|
||||||
if (!ctx->similarity_cksums) {
|
if (!ctx->similarity_cksums) {
|
||||||
fprintf(stderr,
|
fprintf(stderr,
|
||||||
"Could not allocate dedupe context, out of memory\n");
|
"Could not allocate dedupe context, out of memory\n");
|
||||||
|
@ -436,21 +438,22 @@ dedupe_compress(dedupe_context_t *ctx, uchar_t *buf, uint64_t *size, uint64_t of
|
||||||
}
|
}
|
||||||
|
|
||||||
if (rabin_pos == NULL) {
|
if (rabin_pos == NULL) {
|
||||||
/*
|
|
||||||
* Initialize arrays for sketch computation. We re-use memory allocated
|
|
||||||
* for the compressed chunk temporarily.
|
|
||||||
*/
|
|
||||||
ary_sz = ctx->rabin_poly_max_block_size;
|
|
||||||
ctx_heap = (uint32_t *)(ctx->cbuf + ctx->real_chunksize - ary_sz);
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* If global dedupe is active, the global blocks array uses temp space in
|
* If global dedupe is active, the global blocks array uses temp space in
|
||||||
* the target buffer.
|
* the target buffer.
|
||||||
*/
|
*/
|
||||||
|
ary_sz = 0;
|
||||||
if (ctx->arc != NULL) {
|
if (ctx->arc != NULL) {
|
||||||
ary_sz += (sizeof (global_blockentry_t) * (*size / ctx->rabin_poly_min_block_size + 1));
|
ary_sz = (sizeof (global_blockentry_t) * (*size / ctx->rabin_poly_min_block_size + 1));
|
||||||
ctx->g_blocks = (global_blockentry_t *)(ctx->cbuf + ctx->real_chunksize - ary_sz);
|
ctx->g_blocks = (global_blockentry_t *)(ctx->cbuf + ctx->real_chunksize - ary_sz);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Initialize arrays for sketch computation. We re-use memory allocated
|
||||||
|
* for the compressed chunk temporarily.
|
||||||
|
*/
|
||||||
|
ary_sz += ctx->rabin_poly_max_block_size;
|
||||||
|
ctx_heap = (uint32_t *)(ctx->cbuf + ctx->real_chunksize - ary_sz);
|
||||||
}
|
}
|
||||||
#ifndef SSE_MODE
|
#ifndef SSE_MODE
|
||||||
memset(ctx->current_window_data, 0, RAB_POLYNOMIAL_WIN_SIZE);
|
memset(ctx->current_window_data, 0, RAB_POLYNOMIAL_WIN_SIZE);
|
||||||
|
@ -699,8 +702,9 @@ process_blocks:
|
||||||
matchlen = 0;
|
matchlen = 0;
|
||||||
|
|
||||||
if (ctx->arc->dedupe_mode == MODE_SIMPLE) {
|
if (ctx->arc->dedupe_mode == MODE_SIMPLE) {
|
||||||
/*
|
/*======================================================================
|
||||||
* This code block implements Global Dedupe with simple in-memory index.
|
* This code block implements Global Dedupe with simple in-memory index.
|
||||||
|
*======================================================================
|
||||||
*/
|
*/
|
||||||
/*
|
/*
|
||||||
* Now lookup blocks in index. First wait for our semaphore to be
|
* Now lookup blocks in index. First wait for our semaphore to be
|
||||||
|
@ -780,60 +784,89 @@ process_blocks:
|
||||||
} else {
|
} else {
|
||||||
uchar_t *seg_heap, *sim_ck;
|
uchar_t *seg_heap, *sim_ck;
|
||||||
archive_config_t *cfg;
|
archive_config_t *cfg;
|
||||||
uint32_t increment, len, mapped_blks;
|
uint32_t increment, len, blks, o_blks, k;
|
||||||
global_blockentry_t *seg_blocks;
|
global_blockentry_t *seg_blocks;
|
||||||
uint64_t seg_offset, offset;
|
uint64_t seg_offset, offset;
|
||||||
int written;
|
global_blockentry_t **htab, *be;
|
||||||
global_blockentry_t **htab;
|
|
||||||
|
|
||||||
/*
|
/*======================================================================
|
||||||
* This code block implements Segmented similarity based Dedupe with
|
* This code block implements Segmented similarity based Dedupe with
|
||||||
* in-memory index.
|
* in-memory index.
|
||||||
|
* ======================================================================
|
||||||
*/
|
*/
|
||||||
cfg = ctx->arc;
|
cfg = ctx->arc;
|
||||||
seg_heap = (uchar_t *)ctx->g_blocks - cfg->segment_sz_bytes;
|
seg_heap = (uchar_t *)(ctx->g_blocks) - cfg->segment_sz_bytes;
|
||||||
ary_sz = cfg->segment_sz * sizeof (global_blockentry_t **);
|
ary_sz = cfg->segment_sz * sizeof (global_blockentry_t **);
|
||||||
htab = (global_blockentry_t **)(seg_heap - ary_sz);
|
htab = (global_blockentry_t **)(seg_heap - ary_sz);
|
||||||
for (i=0; i<blknum; i += cfg->segment_sz) {
|
seg_blocks = (global_blockentry_t *)(seg_heap - ary_sz - \
|
||||||
blake2b_state S1, S2;
|
cfg->segment_sz * sizeof (global_blockentry_t));
|
||||||
|
for (i=0; i<blknum;) {
|
||||||
|
uint64_t crc;
|
||||||
length = 0;
|
length = 0;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Compute length of current segment.
|
* Compute length of current segment.
|
||||||
*/
|
*/
|
||||||
for (j=i; j<blknum && j<cfg->segment_sz; j++)
|
blks = cfg->segment_sz;
|
||||||
length += ctx->g_blocks[i].length;
|
if (blks > blknum-i) blks = blknum-i;
|
||||||
|
len = 0;
|
||||||
|
for (j=0; j<blks; j++) {
|
||||||
|
len += ctx->g_blocks[j+i].length;
|
||||||
|
if (len > cfg->segment_sz_bytes) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
length = len;
|
||||||
|
}
|
||||||
|
blks = j+i;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Compute the cumulative similarity minhashes.
|
* Compute the cumulative similarity minhashes.
|
||||||
*/
|
*/
|
||||||
sim_ck = ctx->similarity_cksums;
|
sim_ck = ctx->similarity_cksums;
|
||||||
increment = length / cfg->intervals;
|
|
||||||
len = increment;
|
|
||||||
src = buf1 + ctx->g_blocks[i].offset;
|
src = buf1 + ctx->g_blocks[i].offset;
|
||||||
tgt = seg_heap;
|
tgt = seg_heap;
|
||||||
memcpy(tgt, src, length);
|
memcpy(tgt, src, length);
|
||||||
bdsp.blake2b_init(&S1, 32);
|
crc = 0;
|
||||||
for (j=0; j<cfg->intervals; j++) {
|
increment = (length / cfg->intervals) / cfg->sub_intervals;
|
||||||
|
len = increment;
|
||||||
|
for (j=0; j<cfg->sub_intervals; j++) {
|
||||||
reset_heap(&heap, len/8);
|
reset_heap(&heap, len/8);
|
||||||
ksmallest((int64_t *)seg_heap, length, &heap);
|
ksmallest((int64_t *)seg_heap, length/8, &heap);
|
||||||
bdsp.blake2b_update(&S1, seg_heap + len - increment, increment);
|
crc = lzma_crc64(seg_heap + len - increment, increment, crc);
|
||||||
memcpy(&S2, &S1, sizeof (S1));
|
*((uint64_t *)sim_ck) = crc;
|
||||||
bdsp.blake2b_final(&S2, sim_ck, 32);
|
|
||||||
len += increment;
|
len += increment;
|
||||||
sim_ck += 32;
|
sim_ck += cfg->similarity_cksum_sz;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
len -= increment;
|
||||||
|
increment = length / cfg->intervals;
|
||||||
|
len = increment * 2;
|
||||||
|
for (j=0; j<cfg->intervals-1; j++) {
|
||||||
|
reset_heap(&heap, len/8);
|
||||||
|
ksmallest((int64_t *)seg_heap, length/8, &heap);
|
||||||
|
crc = lzma_crc64(seg_heap + len - increment, increment, crc);
|
||||||
|
*((uint64_t *)sim_ck) = crc;
|
||||||
|
len += increment;
|
||||||
|
sim_ck += cfg->similarity_cksum_sz;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Begin shared index access and write segment metadata to cache
|
||||||
|
* first.
|
||||||
|
*/
|
||||||
|
if (i == 0) sem_wait(ctx->index_sem);
|
||||||
|
sim_ck -= cfg->similarity_cksum_sz;
|
||||||
|
seg_offset = db_segcache_pos(cfg, ctx->id);
|
||||||
|
src = (uchar_t *)&(ctx->g_blocks[i]);
|
||||||
|
len = blks * sizeof (global_blockentry_t);
|
||||||
|
db_segcache_write(cfg, ctx->id, src, len, blks,
|
||||||
|
ctx->file_offset + ctx->g_blocks[i].offset);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Now lookup the similarity minhashes starting at the highest
|
* Now lookup the similarity minhashes starting at the highest
|
||||||
* significance level.
|
* significance level.
|
||||||
*/
|
*/
|
||||||
sim_ck -= 32;
|
for (j=cfg->intervals + cfg->sub_intervals; j > 0; j--) {
|
||||||
written = 0;
|
|
||||||
increment = cfg->intervals * cfg->pct_interval;
|
|
||||||
sem_wait(ctx->index_sem);
|
|
||||||
seg_offset = db_segcache_pos(cfg, ctx->id);
|
|
||||||
for (j=cfg->intervals; j > 0; j--) {
|
|
||||||
hash_entry_t *he;
|
hash_entry_t *he;
|
||||||
|
|
||||||
he = db_lookup_insert_s(cfg, sim_ck, j-1, seg_offset, 0, 1);
|
he = db_lookup_insert_s(cfg, sim_ck, j-1, seg_offset, 0, 1);
|
||||||
|
@ -844,35 +877,181 @@ process_blocks:
|
||||||
*/
|
*/
|
||||||
memset(htab, 0, ary_sz);
|
memset(htab, 0, ary_sz);
|
||||||
offset = he->item_offset;
|
offset = he->item_offset;
|
||||||
if (db_segcache_map(cfg, ctx->id, &mapped_blks, &offset,
|
if (db_segcache_map(cfg, ctx->id, &o_blks, &offset,
|
||||||
(uchar_t **)&seg_blocks) == -1) {
|
(uchar_t *)seg_blocks) == -1) {
|
||||||
fprintf(stderr, "Segment cache mmap failed.\n");
|
fprintf(stderr, "Segment cache mmap failed.\n");
|
||||||
ctx->valid = 0;
|
ctx->valid = 0;
|
||||||
return (0);
|
return (0);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (increment > 70) {
|
/*
|
||||||
j = cfg->intervals - j;
|
* First insert all the unique blocks from the mapped segment
|
||||||
}
|
* blocks(chunks) array into the hashtable.
|
||||||
} else if (!written) {
|
*/
|
||||||
if (blknum - i >= cfg->segment_sz) {
|
for (k=0; k<o_blks; k++) {
|
||||||
db_segcache_write(cfg, ctx->id, src, len, cfg->segment_sz,
|
uint32_t hent;
|
||||||
ctx->file_offset);
|
hent = XXH32(&(seg_blocks[k].cksum[0]), cfg->chunk_cksum_sz, 0);
|
||||||
|
hent ^= (hent / cfg->chunk_cksum_sz);
|
||||||
|
hent = hent % cfg->segment_sz;
|
||||||
|
|
||||||
|
if (htab[hent] == NULL) {
|
||||||
|
htab[hent] = &(seg_blocks[k]);
|
||||||
|
seg_blocks[k].offset += offset;
|
||||||
|
seg_blocks[k].next = NULL;
|
||||||
} else {
|
} else {
|
||||||
db_segcache_write(cfg, ctx->id, src, len, blknum-i,
|
be = htab[hent];
|
||||||
ctx->file_offset);
|
do {
|
||||||
|
if (memcmp(seg_blocks[k].cksum,
|
||||||
|
be->cksum, cfg->chunk_cksum_sz) == 0 &&
|
||||||
|
seg_blocks[k].length == be->length) {
|
||||||
|
be = NULL;
|
||||||
|
break;
|
||||||
}
|
}
|
||||||
written = 1;
|
if (be->next)
|
||||||
|
be = be->next;
|
||||||
|
else
|
||||||
|
break;
|
||||||
|
} while(1);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* be will be non-NULL if no match was found.
|
||||||
|
* It will the last bucket in the hash slot.
|
||||||
|
*/
|
||||||
|
if (be) {
|
||||||
|
be->next = &(seg_blocks[k]);
|
||||||
|
seg_blocks[k].offset += offset;
|
||||||
|
seg_blocks[k].next = NULL;
|
||||||
}
|
}
|
||||||
increment -= cfg->pct_interval;
|
|
||||||
sim_ck -= 32;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Now lookup current segment blocks(chunks) in the hashtable and
|
||||||
|
* perform the actual deduplication.
|
||||||
|
*/
|
||||||
|
be = NULL;
|
||||||
|
for (k=i; k<blks; k++) {
|
||||||
|
uint32_t hent;
|
||||||
|
|
||||||
|
if (ctx->g_blocks[k].length & RABIN_INDEX_FLAG) continue;
|
||||||
|
hent = XXH32(ctx->g_blocks[k].cksum, cfg->chunk_cksum_sz, 0);
|
||||||
|
hent ^= (hent / cfg->chunk_cksum_sz);
|
||||||
|
hent = hent % cfg->segment_sz;
|
||||||
|
|
||||||
|
if (htab[hent] == NULL) {
|
||||||
|
htab[hent] = &(ctx->g_blocks[k]);
|
||||||
|
ctx->g_blocks[k].offset += ctx->file_offset;
|
||||||
|
ctx->g_blocks[k].next = NULL;
|
||||||
|
be = NULL;
|
||||||
|
} else {
|
||||||
|
be = htab[hent];
|
||||||
|
do {
|
||||||
|
if (memcmp(ctx->g_blocks[k].cksum,
|
||||||
|
be->cksum, cfg->chunk_cksum_sz) == 0 &&
|
||||||
|
ctx->g_blocks[k].length == be->length) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
if (be->next) {
|
||||||
|
be = be->next;
|
||||||
|
} else {
|
||||||
|
be->next = &(ctx->g_blocks[k]);
|
||||||
|
be->next->offset +=
|
||||||
|
ctx->file_offset;
|
||||||
|
be->next->next = NULL;
|
||||||
|
be = NULL;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
} while(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* be will be non-NULL if match was found. It will
|
||||||
|
* point to the matching bucket.
|
||||||
|
*/
|
||||||
|
if (be) {
|
||||||
|
global_blockentry_t *en;
|
||||||
|
/*
|
||||||
|
* Block match in index was found. Update g_blocks
|
||||||
|
* array.
|
||||||
|
*/
|
||||||
|
en = &(ctx->g_blocks[k]);
|
||||||
|
en->length = (en->length | RABIN_INDEX_FLAG) &
|
||||||
|
CLEAR_SIMILARITY_FLAG;
|
||||||
|
en->offset = be->offset;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
sim_ck -= cfg->similarity_cksum_sz;
|
||||||
|
}
|
||||||
|
i += blks;
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Signal the next thread in sequence to access the index.
|
* Signal the next thread in sequence to access the index.
|
||||||
*/
|
*/
|
||||||
sem_post(ctx->index_sem_next);
|
sem_post(ctx->index_sem_next);
|
||||||
|
|
||||||
|
/*======================================================================
|
||||||
|
* Finally scan the blocks array and update dedupe index.
|
||||||
|
*======================================================================
|
||||||
|
*/
|
||||||
|
length = 0;
|
||||||
|
for (i=0; i<blknum; i++) {
|
||||||
|
|
||||||
|
if (!(ctx->g_blocks[i].length & RABIN_INDEX_FLAG)) {
|
||||||
|
/*
|
||||||
|
* Block match in index not found.
|
||||||
|
* Block was added to index. Merge this block.
|
||||||
|
*/
|
||||||
|
if (length + ctx->g_blocks[i].length > RABIN_MAX_BLOCK_SIZE) {
|
||||||
|
*((uint32_t *)g_dedupe_idx) = LE32(length);
|
||||||
|
g_dedupe_idx += RABIN_ENTRY_SIZE;
|
||||||
|
length = 0;
|
||||||
|
dedupe_index_sz++;
|
||||||
|
}
|
||||||
|
length += ctx->g_blocks[i].length;
|
||||||
|
} else {
|
||||||
|
/*
|
||||||
|
* Block match in index was found.
|
||||||
|
*/
|
||||||
|
if (length > 0) {
|
||||||
|
/*
|
||||||
|
* Write pending accumulated block length value.
|
||||||
|
*/
|
||||||
|
*((uint32_t *)g_dedupe_idx) = LE32(length);
|
||||||
|
g_dedupe_idx += RABIN_ENTRY_SIZE;
|
||||||
|
length = 0;
|
||||||
|
dedupe_index_sz++;
|
||||||
|
}
|
||||||
|
/*
|
||||||
|
* Add a reference entry to the dedupe array.
|
||||||
|
*/
|
||||||
|
*((uint32_t *)g_dedupe_idx) = LE32(ctx->g_blocks[i].length);
|
||||||
|
g_dedupe_idx += RABIN_ENTRY_SIZE;
|
||||||
|
*((uint64_t *)g_dedupe_idx) = LE64(ctx->g_blocks[i].offset);
|
||||||
|
g_dedupe_idx += (RABIN_ENTRY_SIZE * 2);
|
||||||
|
matchlen += (ctx->g_blocks[i].length & RABIN_INDEX_VALUE);
|
||||||
|
dedupe_index_sz += 3;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Write final pending block length value (if any).
|
||||||
|
*/
|
||||||
|
if (length > 0) {
|
||||||
|
*((uint32_t *)g_dedupe_idx) = LE32(length);
|
||||||
|
g_dedupe_idx += RABIN_ENTRY_SIZE;
|
||||||
|
length = 0;
|
||||||
|
dedupe_index_sz++;
|
||||||
|
}
|
||||||
|
|
||||||
|
blknum = dedupe_index_sz; // Number of entries in block list
|
||||||
|
tgt = g_dedupe_idx;
|
||||||
|
g_dedupe_idx = ctx->cbuf + RABIN_HDR_SIZE;
|
||||||
|
dedupe_index_sz = tgt - g_dedupe_idx;
|
||||||
|
src = buf1;
|
||||||
|
g_dedupe_idx += (RABIN_ENTRY_SIZE * 2);
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
|
|
@ -163,15 +163,6 @@ typedef struct rab_blockentry {
|
||||||
struct rab_blockentry *next;
|
struct rab_blockentry *next;
|
||||||
} rabin_blockentry_t;
|
} rabin_blockentry_t;
|
||||||
|
|
||||||
#pragma pack(1)
|
|
||||||
typedef struct global_blockentry {
|
|
||||||
uint32_t length;
|
|
||||||
uint64_t offset;
|
|
||||||
struct global_blockentry *next; // Reqd when part of a hashtable
|
|
||||||
uchar_t cksum[CKSUM_MAX_BYTES];
|
|
||||||
} global_blockentry_t;
|
|
||||||
#pragma pack()
|
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
unsigned char *current_window_data;
|
unsigned char *current_window_data;
|
||||||
rabin_blockentry_t **blocks;
|
rabin_blockentry_t **blocks;
|
||||||
|
@ -212,7 +203,7 @@ extern void update_dedupe_hdr(uchar_t *buf, uint64_t dedupe_index_sz_cmp,
|
||||||
extern void reset_dedupe_context(dedupe_context_t *ctx);
|
extern void reset_dedupe_context(dedupe_context_t *ctx);
|
||||||
extern uint32_t dedupe_buf_extra(uint64_t chunksize, int rab_blk_sz, const char *algo,
|
extern uint32_t dedupe_buf_extra(uint64_t chunksize, int rab_blk_sz, const char *algo,
|
||||||
int delta_flag);
|
int delta_flag);
|
||||||
extern int global_dedupe_bufadjust(uint32_t chunksize, uint64_t *user_chunk_sz, int pct_interval,
|
extern int global_dedupe_bufadjust(uint32_t rab_blk_sz, uint64_t *user_chunk_sz, int pct_interval,
|
||||||
const char *algo, cksum_t ck, cksum_t ck_sim, size_t file_sz,
|
const char *algo, cksum_t ck, cksum_t ck_sim, size_t file_sz,
|
||||||
size_t memlimit, int nthreads);
|
size_t memlimit, int nthreads);
|
||||||
|
|
||||||
|
|
|
@ -396,13 +396,13 @@ get_sys_limits(my_sysinfo *msys_info)
|
||||||
*/
|
*/
|
||||||
mem = strtoull(val, NULL, 0);
|
mem = strtoull(val, NULL, 0);
|
||||||
mem *= (1024 * 1024);
|
mem *= (1024 * 1024);
|
||||||
if (mem > (1024 * 1024) && mem < msys_info->freeram) {
|
if (mem >= (1024 * 1024) && mem < msys_info->freeram) {
|
||||||
msys_info->freeram = mem;
|
msys_info->freeram = mem;
|
||||||
}
|
}
|
||||||
}
|
} else {
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Use a maximum of approx 75% of free RAM for the index.
|
* Use a maximum of approx 75% of free RAM for the index(if limit was not specified).
|
||||||
*/
|
*/
|
||||||
msys_info->freeram = (msys_info->freeram >> 1) + (msys_info->freeram >> 2);
|
msys_info->freeram = (msys_info->freeram >> 1) + (msys_info->freeram >> 2);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in a new issue