Make global dedupe bits buildable and fix errors.

Rename Adaptive compression type constants to avoid conflict with global constants.
This commit is contained in:
Moinak Ghosh 2013-02-17 21:05:40 +05:30
parent 7386f82a4f
commit 6badbcaea7
8 changed files with 127 additions and 47 deletions

View file

@ -207,14 +207,14 @@ adapt_compress(void *src, uint64_t srclen, void *dst,
rv = lzma_compress(src, srclen, dst, dstlen, level, chdr, adat->lzma_data); rv = lzma_compress(src, srclen, dst, dstlen, level, chdr, adat->lzma_data);
if (rv < 0) if (rv < 0)
return (rv); return (rv);
rv = COMPRESS_LZMA; rv = ADAPT_COMPRESS_LZMA;
lzma_count++; lzma_count++;
} else if (adat->adapt_mode == 1 && tot8b > FIFTY_PCT(srclen)) { } else if (adat->adapt_mode == 1 && tot8b > FIFTY_PCT(srclen)) {
rv = bzip2_compress(src, srclen, dst, dstlen, level, chdr, NULL); rv = bzip2_compress(src, srclen, dst, dstlen, level, chdr, NULL);
if (rv < 0) if (rv < 0)
return (rv); return (rv);
rv = COMPRESS_BZIP2; rv = ADAPT_COMPRESS_BZIP2;
bzip2_count++; bzip2_count++;
} else { } else {
@ -223,14 +223,14 @@ adapt_compress(void *src, uint64_t srclen, void *dst,
rv = libbsc_compress(src, srclen, dst, dstlen, level, chdr, adat->bsc_data); rv = libbsc_compress(src, srclen, dst, dstlen, level, chdr, adat->bsc_data);
if (rv < 0) if (rv < 0)
return (rv); return (rv);
rv = COMPRESS_BSC; rv = ADAPT_COMPRESS_BSC;
bsc_count++; bsc_count++;
#endif #endif
} else { } else {
rv = ppmd_compress(src, srclen, dst, dstlen, level, chdr, adat->ppmd_data); rv = ppmd_compress(src, srclen, dst, dstlen, level, chdr, adat->ppmd_data);
if (rv < 0) if (rv < 0)
return (rv); return (rv);
rv = COMPRESS_PPMD; rv = ADAPT_COMPRESS_PPMD;
ppmd_count++; ppmd_count++;
} }
} }
@ -247,16 +247,16 @@ adapt_decompress(void *src, uint64_t srclen, void *dst,
cmp_flags = (chdr>>4) & CHDR_ALGO_MASK; cmp_flags = (chdr>>4) & CHDR_ALGO_MASK;
if (cmp_flags == COMPRESS_LZMA) { if (cmp_flags == ADAPT_COMPRESS_LZMA) {
return (lzma_decompress(src, srclen, dst, dstlen, level, chdr, adat->lzma_data)); return (lzma_decompress(src, srclen, dst, dstlen, level, chdr, adat->lzma_data));
} else if (cmp_flags == COMPRESS_BZIP2) { } else if (cmp_flags == ADAPT_COMPRESS_BZIP2) {
return (bzip2_decompress(src, srclen, dst, dstlen, level, chdr, NULL)); return (bzip2_decompress(src, srclen, dst, dstlen, level, chdr, NULL));
} else if (cmp_flags == COMPRESS_PPMD) { } else if (cmp_flags == ADAPT_COMPRESS_PPMD) {
return (ppmd_decompress(src, srclen, dst, dstlen, level, chdr, adat->ppmd_data)); return (ppmd_decompress(src, srclen, dst, dstlen, level, chdr, adat->ppmd_data));
} else if (cmp_flags == COMPRESS_BSC) { } else if (cmp_flags == ADAPT_COMPRESS_BSC) {
#ifdef ENABLE_PC_LIBBSC #ifdef ENABLE_PC_LIBBSC
return (libbsc_decompress(src, srclen, dst, dstlen, level, chdr, adat->bsc_data)); return (libbsc_decompress(src, srclen, dst, dstlen, level, chdr, adat->bsc_data));
#else #else

View file

@ -61,7 +61,8 @@ typedef enum {
* to decode archives created with 1.2. New archives do not use SKEIN. * to decode archives created with 1.2. New archives do not use SKEIN.
*/ */
CKSUM_SKEIN256 = 0x800, CKSUM_SKEIN256 = 0x800,
CKSUM_SKEIN512 = 0x900 CKSUM_SKEIN512 = 0x900,
CKSUM_INVALID = 0
} cksum_t; } cksum_t;
typedef struct { typedef struct {

View file

@ -70,11 +70,11 @@ extern "C" {
* lower 3 bits in higher nibble indicate chunk compression algorithm * lower 3 bits in higher nibble indicate chunk compression algorithm
* in adaptive modes. * in adaptive modes.
*/ */
#define COMPRESS_NONE 0 #define ADAPT_COMPRESS_NONE 0
#define COMPRESS_LZMA 1 #define ADAPT_COMPRESS_LZMA 1
#define COMPRESS_BZIP2 2 #define ADAPT_COMPRESS_BZIP2 2
#define COMPRESS_PPMD 3 #define ADAPT_COMPRESS_PPMD 3
#define COMPRESS_BSC 4 #define ADAPT_COMPRESS_BSC 4
#define CHDR_ALGO_MASK 7 #define CHDR_ALGO_MASK 7
extern uint32_t zlib_buf_extra(uint64_t buflen); extern uint32_t zlib_buf_extra(uint64_t buflen);

View file

@ -32,7 +32,6 @@
#include <pthread.h> #include <pthread.h>
#include "db.h" #include "db.h"
#include "config.h"
#define ONE_PB (1125899906842624ULL) #define ONE_PB (1125899906842624ULL)
#define ONE_TB (1099511627776ULL) #define ONE_TB (1099511627776ULL)
@ -43,7 +42,7 @@
* Hashtable structures for in-memory index. * Hashtable structures for in-memory index.
*/ */
typedef struct _hash_entry { typedef struct _hash_entry {
segment_entry_t *seg; uint64_t seg_offset;
struct _hash_entry *next; struct _hash_entry *next;
struct _hash_entry *lru_prev; struct _hash_entry *lru_prev;
struct _hash_entry *lru_next; struct _hash_entry *lru_next;
@ -61,9 +60,15 @@ typedef struct {
hash_entry_t *lru_tail; hash_entry_t *lru_tail;
uint64_t memlimit; uint64_t memlimit;
uint64_t memused; uint64_t memused;
int hash_entry_size; int hash_entry_size, intervals;
} htablst_t; } htablst_t;
/*
 * Segment index handle: ties the in-memory hashtable list to the file
 * descriptors opened on the temporary segment file.
 */
typedef struct {
/* In-memory hashtable list this index is built on. */
htablst_t *hlist;
/* Descriptor returned by mkstemp() for the "/.segXXXXXX" temp file. */
/* NOTE(review): the "_w" suffix suggests this is the writer side - confirm. */
int seg_fd_w;
/* Array of nthreads descriptors, each an O_RDONLY open() of that same file. */
int *tfd;
} seg_index_t;
archive_config_t * archive_config_t *
init_global_db(char *configfile) init_global_db(char *configfile)
{ {
@ -83,17 +88,36 @@ init_global_db(char *configfile)
return (cfg); return (cfg);
} }
/*
 * Release a hashtable list and every allocation it owns.
 *
 * htablst   - list to free; NULL is accepted and ignored.
 * intervals - number of entries in htablst->list to walk.
 *
 * Safe on a partially constructed list: a NULL list, a NULL mlist or a NULL
 * per-interval htab is simply skipped, since free(NULL) is a no-op.
 *
 * NOTE(review): the mutexes stored in mlist are not destroyed here. This
 * function can be called while only some of them have been initialized, so
 * confirm whether a pthread_mutex_destroy() pass is needed by the callers.
 */
static void
cleanup_htablst(htablst_t *htablst, int intervals)
{
	int i;

	if (htablst == NULL)
		return;

	if (htablst->list != NULL) {
		for (i = 0; i < intervals; i++)
			free(htablst->list[i].htab);
		free(htablst->list);
	}
	free(htablst->mlist);
	free(htablst);
}
archive_config_t * archive_config_t *
init_global_db_s(char *path, uint32_t chunksize, int pct_interval, init_global_db_s(char *path, char *tmppath, uint32_t chunksize, int pct_interval,
compress_algo_t algo, cksum_t ck, cksum_t ck_sim, size_t file_sz, compress_algo_t algo, cksum_t ck, cksum_t ck_sim, size_t file_sz,
size_t memlimit) size_t memlimit, int nthreads)
{ {
archive_config_t *cfg; archive_config_t *cfg;
int rv; int rv;
float diff; float diff;
cfg = calloc(1, sizeof (archive_config_t)); cfg = calloc(1, sizeof (archive_config_t));
rv = set_config_s(cfg, algo, ck, ck_sim, chunksize, file_sz, chunks_per_seg, pct_interval); rv = set_config_s(cfg, algo, ck, ck_sim, chunksize, file_sz, pct_interval);
if (path != NULL) { if (path != NULL) {
printf("Disk based index not yet implemented.\n"); printf("Disk based index not yet implemented.\n");
@ -104,6 +128,7 @@ init_global_db_s(char *path, uint32_t chunksize, int pct_interval,
uint64_t memreqd; uint64_t memreqd;
htablst_t *htablst; htablst_t *htablst;
int hash_entry_size; int hash_entry_size;
seg_index_t *indx;
// Compute total hashtable entries first // Compute total hashtable entries first
intervals = 100 / pct_interval - 1; intervals = 100 / pct_interval - 1;
@ -129,18 +154,53 @@ init_global_db_s(char *path, uint32_t chunksize, int pct_interval,
// Now create as many hash tables as there are similarity match intervals // Now create as many hash tables as there are similarity match intervals
// each having hash_slots / intervals slots. // each having hash_slots / intervals slots.
htablst = calloc(1, sizeof (htablst_t)); htablst = calloc(1, sizeof (htablst_t));
if (!htablst) {
free(cfg);
return (NULL);
}
htablst->memlimit = memlimit; htablst->memlimit = memlimit;
htablst->list = (htab_t *)calloc(intervals, sizeof (htab_t)); htablst->list = (htab_t *)calloc(intervals, sizeof (htab_t));
htablst->mlist = (pthread_mutex_t *)malloc(intervals * sizeof (pthread_mutex_t)); htablst->mlist = (pthread_mutex_t *)malloc(intervals * sizeof (pthread_mutex_t));
htablst->hash_entry_size = hash_entry_size; htablst->hash_entry_size = hash_entry_size;
htablst->intervals = intervals;
for (i = 0; i < intervals; i++) { for (i = 0; i < intervals; i++) {
htablst->list[i].htab = (hash_entry_t **)calloc(hash_slots / intervals, htablst->list[i].htab = (hash_entry_t **)calloc(hash_slots / intervals,
sizeof (hash_entry_t *)); sizeof (hash_entry_t *));
if (!(htablst->list[i].htab)) {
cleanup_htablst(htablst, intervals);
free(cfg);
return (NULL);
}
htablst->memused += ((hash_slots / intervals) * (sizeof (hash_entry_t *))); htablst->memused += ((hash_slots / intervals) * (sizeof (hash_entry_t *)));
pthread_mutex_init(&(htablst->mlist[i]), NULL); pthread_mutex_init(&(htablst->mlist[i]), NULL);
} }
cfg->dbdata = htablst;
indx = (seg_index_t *)calloc(1, sizeof (seg_index_t));
if (!indx) {
cleanup_htablst(htablst, intervals);
free(cfg);
return (NULL);
}
indx->hlist = htablst;
strcpy(cfg->rootdir, tmppath);
strcat(cfg->rootdir, "/.segXXXXXX");
indx->seg_fd_w = mkstemp(cfg->rootdir);
indx->tfd = (int *)malloc(sizeof (int) * nthreads);
if (indx->seg_fd_w == -1 || indx->tfd == NULL) {
cleanup_htablst(htablst, intervals);
free(cfg);
if (indx->tfd)
free(indx->tfd);
return (NULL);
}
for (i = 0; i < nthreads; i++) {
indx->tfd[i] = open(cfg->rootdir, O_RDONLY);
}
cfg->dbdata = indx;
slab_cache_add(hash_entry_size); slab_cache_add(hash_entry_size);
slab_cache_add(cfg->chunk_cksum_sz); slab_cache_add(cfg->chunk_cksum_sz);
} }
@ -148,6 +208,13 @@ init_global_db_s(char *path, uint32_t chunksize, int pct_interval,
} }
int int
db_insert_s(archive_config_t *cfg, uchar_t *cksum, int interval_num) db_insert_s(archive_config_t *cfg, uchar_t *sim_cksum, int interval, segment_entry_t *seg, int thr_id)
{ {
return (0);
}
segment_entry_t *
db_query_s(archive_config_t *cfg, uchar_t *sim_cksum, int interval, int thr_id)
{
return (0);
} }

View file

@ -21,14 +21,19 @@
#ifndef _DB_H #ifndef _DB_H
#define _DB_H #define _DB_H
#include <dedupe_config.h>
#ifdef __cplusplus #ifdef __cplusplus
extern "C" { extern "C" {
#endif #endif
archive_config_t *init_global_db(char *configfile); archive_config_t *init_global_db(char *configfile);
archive_config_t *init_global_db_s(char *path, uint32_t chunksize, int pct_interval, archive_config_t *init_global_db_s(char *path, char *tmppath, uint32_t chunksize,
compress_algo_t algo, cksum_t ck, cksum_t ck_sim, size_t file_sz, int pct_interval, compress_algo_t algo, cksum_t ck,
size_t memlimit); cksum_t ck_sim, size_t file_sz, size_t memlimit, int nthreads);
int db_insert_s(archive_config_t *cfg, uchar_t *sim_cksum, int interval,
segment_entry_t *seg, int thr_id);
segment_entry_t *db_query_s(archive_config_t *cfg, uchar_t *sim_cksum, int interval, int thr_id);
#ifdef __cplusplus #ifdef __cplusplus
} }

View file

@ -31,8 +31,8 @@
#include <sys/stat.h> #include <sys/stat.h>
#include <rabin_dedup.h> #include <rabin_dedup.h>
#include "config.h" #include "dedupe_config.h"
#include "initdb.h" #include "db.h"
#define ONE_PB (1125899906842624ULL) #define ONE_PB (1125899906842624ULL)
#define ONE_TB (1099511627776ULL) #define ONE_TB (1099511627776ULL)
@ -44,6 +44,7 @@ get_compress_level(compress_algo_t algo)
{ {
switch (algo) { switch (algo) {
case COMPRESS_NONE: case COMPRESS_NONE:
case COMPRESS_INVALID:
return (0); return (0);
case COMPRESS_LZFX: case COMPRESS_LZFX:
return (5); return (5);
@ -130,7 +131,7 @@ get_cksum_type(char *cksum_name)
} }
static char * static char *
get_cksum_str(chunk_cksum_t ck) get_cksum_str(cksum_t ck)
{ {
if (ck == CKSUM_SHA256) { if (ck == CKSUM_SHA256) {
return ("SHA256"); return ("SHA256");
@ -154,7 +155,7 @@ get_cksum_str(chunk_cksum_t ck)
} }
static int static int
get_cksum_sz(chunk_cksum_t ck) get_cksum_sz(cksum_t ck)
{ {
if (ck == CKSUM_SHA256 || ck == CKSUM_BLAKE256 || ck == CKSUM_KECCAK256) { if (ck == CKSUM_SHA256 || ck == CKSUM_BLAKE256 || ck == CKSUM_KECCAK256) {
return (32); return (32);
@ -185,7 +186,7 @@ read_config(char *configfile, archive_config_t *cfg)
return (1); return (1);
} }
while (fgets(line, 255, fh) != NULL) { while (fgets(line, 255, fh) != NULL) {
int pos; char *pos;
if (strlen(line) < 9 || line[0] == '#') { if (strlen(line) < 9 || line[0] == '#') {
continue; continue;
@ -205,7 +206,7 @@ read_config(char *configfile, archive_config_t *cfg)
} }
cfg->chunk_sz = ck; cfg->chunk_sz = ck;
} else if (strncmp(line, "ROOTDIR") == 0) { } else if (strncmp(line, "ROOTDIR", 7) == 0) {
struct stat sb; struct stat sb;
if (stat(pos, &sb) == -1) { if (stat(pos, &sb) == -1) {
if (errno != ENOENT) { if (errno != ENOENT) {
@ -222,7 +223,7 @@ read_config(char *configfile, archive_config_t *cfg)
fclose(fh); fclose(fh);
return (1); return (1);
} }
} else if (strncmp(line, "ARCHIVESZ") == 0) { } else if (strncmp(line, "ARCHIVESZ", 9) == 0) {
int ovr; int ovr;
ssize_t arch_sz; ssize_t arch_sz;
ovr = parse_numeric(&arch_sz, pos); ovr = parse_numeric(&arch_sz, pos);
@ -238,7 +239,7 @@ read_config(char *configfile, archive_config_t *cfg)
} }
cfg->archive_sz = arch_sz; cfg->archive_sz = arch_sz;
} else if (strncmp(line, "VERIFY") == 0) { } else if (strncmp(line, "VERIFY", 6) == 0) {
if (strcmp(pos, "no") == 0) { if (strcmp(pos, "no") == 0) {
cfg->verify_chunks = 0; cfg->verify_chunks = 0;
@ -249,21 +250,21 @@ read_config(char *configfile, archive_config_t *cfg)
fclose(fh); fclose(fh);
return (1); return (1);
} }
} else if (strncmp(line, "COMPRESS") == 0) { } else if (strncmp(line, "COMPRESS", 8) == 0) {
cfg->algo = get_compress_algo(pos); cfg->algo = get_compress_algo(pos);
if (cfg->algo == COMPRESS_INVALID) { if (cfg->algo == COMPRESS_INVALID) {
fprintf(stderr, "Invalid COMPRESS setting.\n"); fprintf(stderr, "Invalid COMPRESS setting.\n");
fclose(fh); fclose(fh);
return (1); return (1);
} }
} else if (strncmp(line, "CHUNK_CKSUM") == 0) { } else if (strncmp(line, "CHUNK_CKSUM", 11) == 0) {
cfg->chunk_cksum_type = get_cksum_type(pos); cfg->chunk_cksum_type = get_cksum_type(pos);
if (cfg->chunk_cksum_type == CKSUM_INVALID) { if (cfg->chunk_cksum_type == CKSUM_INVALID) {
fprintf(stderr, "Invalid CHUNK_CKSUM setting.\n"); fprintf(stderr, "Invalid CHUNK_CKSUM setting.\n");
fclose(fh); fclose(fh);
return (1); return (1);
} }
} else if (strncmp(line, "SIMILARITY_CKSUM") == 0) { } else if (strncmp(line, "SIMILARITY_CKSUM", 16) == 0) {
cfg->chunk_cksum_type = get_cksum_type(pos); cfg->chunk_cksum_type = get_cksum_type(pos);
if (cfg->chunk_cksum_type == CKSUM_INVALID) { if (cfg->chunk_cksum_type == CKSUM_INVALID) {
fprintf(stderr, "Invalid CHUNK_CKSUM setting.\n"); fprintf(stderr, "Invalid CHUNK_CKSUM setting.\n");
@ -306,10 +307,10 @@ read_config(char *configfile, archive_config_t *cfg)
cfg->container_sz = CONTAINER_ITEMS; cfg->container_sz = CONTAINER_ITEMS;
container_sz_bytes = CONTAINER_ITEMS * segment_sz_bytes; container_sz_bytes = CONTAINER_ITEMS * segment_sz_bytes;
if (cfg->archive_sz / total_dirs < container_sz) if (cfg->archive_sz / total_dirs < cfg->container_sz)
cfg->num_containers = 1; cfg->num_containers = 1;
else else
cfg->num_containers = (cfg->archive_sz / total_dirs) / container_sz + 1; cfg->num_containers = (cfg->archive_sz / total_dirs) / cfg->container_sz + 1;
return (0); return (0);
} }
@ -317,6 +318,8 @@ read_config(char *configfile, archive_config_t *cfg)
int int
write_config(char *configfile, archive_config_t *cfg) write_config(char *configfile, archive_config_t *cfg)
{ {
FILE *fh;
fh = fopen(configfile, "w"); fh = fopen(configfile, "w");
if (fh == NULL) { if (fh == NULL) {
perror(" "); perror(" ");
@ -325,7 +328,7 @@ write_config(char *configfile, archive_config_t *cfg)
fprintf(fh, "#\n# Autogenerated config file\n# !! DO NOT EDIT !!\n#\n\n"); fprintf(fh, "#\n# Autogenerated config file\n# !! DO NOT EDIT !!\n#\n\n");
fprintf(fh, "ROOTDIR = %s\n", cfg->rootdir); fprintf(fh, "ROOTDIR = %s\n", cfg->rootdir);
fprintf(fh, "CHUNKSZ = %u\n", cfg->chunk_sz; fprintf(fh, "CHUNKSZ = %u\n", cfg->chunk_sz);
fprintf(fh, "ARCHIVESZ = %" PRId64 "\n", cfg->archive_sz); fprintf(fh, "ARCHIVESZ = %" PRId64 "\n", cfg->archive_sz);
if (cfg->verify_chunks) if (cfg->verify_chunks)
@ -336,6 +339,8 @@ write_config(char *configfile, archive_config_t *cfg)
fprintf(fh, "CHUNK_CKSUM = %s\n", get_cksum_str(cfg->chunk_cksum_type)); fprintf(fh, "CHUNK_CKSUM = %s\n", get_cksum_str(cfg->chunk_cksum_type));
fprintf(fh, "\n"); fprintf(fh, "\n");
fclose(fh); fclose(fh);
return (0);
} }
int int
@ -354,14 +359,13 @@ set_config_s(archive_config_t *cfg, compress_algo_t algo, cksum_t ck, cksum_t ck
cfg->archive_sz = file_sz; cfg->archive_sz = file_sz;
if (cfg->archive_sz < ONE_TB) { if (cfg->archive_sz < ONE_TB) {
segment_sz_bytes = FOUR_MB; cfg->segment_sz_bytes = FOUR_MB;
} else { } else {
segment_sz_bytes = EIGHT_MB; cfg->segment_sz_bytes = EIGHT_MB;
} }
cfg->segment_sz_bytes = segment_sz_bytes; cfg->segment_sz = cfg->segment_sz_bytes / cfg->chunk_sz_bytes;
cfg->segment_sz = segment_sz_bytes / cfg->chunk_sz_bytes;
return (0); return (0);
} }

View file

@ -23,6 +23,7 @@
#include <limits.h> #include <limits.h>
#include <utils.h> #include <utils.h>
#include <crypto_utils.h>
#ifdef __cplusplus #ifdef __cplusplus
extern "C" { extern "C" {
@ -32,6 +33,7 @@ extern "C" {
#define DEFAULT_CHUNK_CKSUM CKSUM_SHA256 #define DEFAULT_CHUNK_CKSUM CKSUM_SHA256
#define DEFAULT_SIMILARITY_CKSUM CKSUM_BLAKE256 #define DEFAULT_SIMILARITY_CKSUM CKSUM_BLAKE256
#define DEFAULT_COMPRESS COMPRESS_LZ4 #define DEFAULT_COMPRESS COMPRESS_LZ4
#define CONTAINER_ITEMS 2048
#define MIN_CK 1 #define MIN_CK 1
#define MAX_CK 5 #define MAX_CK 5
@ -64,9 +66,9 @@ typedef struct {
} archive_config_t; } archive_config_t;
typedef struct _segment_entry { typedef struct _segment_entry {
uint64_t offset; uint64_t chunk_offset;
uint32_t length; uint32_t chunk_length;
uchar_t *cksum; uchar_t *chunk_cksum;
} segment_entry_t; } segment_entry_t;
int read_config(char *configfile, archive_config_t *cfg); int read_config(char *configfile, archive_config_t *cfg);

View file

@ -133,7 +133,8 @@ typedef enum {
COMPRESS_LZ4, COMPRESS_LZ4,
COMPRESS_ZLIB, COMPRESS_ZLIB,
COMPRESS_BZIP2, COMPRESS_BZIP2,
COMPRESS_LZMA COMPRESS_LZMA,
COMPRESS_INVALID
} compress_algo_t; } compress_algo_t;
typedef struct { typedef struct {