From 6badbcaea740309231f99801c7dd195be9b3af1a Mon Sep 17 00:00:00 2001 From: Moinak Ghosh Date: Sun, 17 Feb 2013 21:05:40 +0530 Subject: [PATCH] Make global dedupe bits buildable and fix errors. Rename Adaptive compression type constants to avoid conflict with global constants. --- adaptive_compress.c | 16 ++--- crypto/crypto_utils.h | 3 +- pcompress.h | 10 +-- rabin/global/db.c | 83 +++++++++++++++++++--- rabin/global/db.h | 11 ++- rabin/global/{config.c => dedupe_config.c} | 40 ++++++----- rabin/global/{config.h => dedupe_config.h} | 8 ++- utils/utils.h | 3 +- 8 files changed, 127 insertions(+), 47 deletions(-) rename rabin/global/{config.c => dedupe_config.c} (91%) rename rabin/global/{config.h => dedupe_config.h} (95%) diff --git a/adaptive_compress.c b/adaptive_compress.c index 450e2cd..ad2c6bb 100644 --- a/adaptive_compress.c +++ b/adaptive_compress.c @@ -207,14 +207,14 @@ adapt_compress(void *src, uint64_t srclen, void *dst, rv = lzma_compress(src, srclen, dst, dstlen, level, chdr, adat->lzma_data); if (rv < 0) return (rv); - rv = COMPRESS_LZMA; + rv = ADAPT_COMPRESS_LZMA; lzma_count++; } else if (adat->adapt_mode == 1 && tot8b > FIFTY_PCT(srclen)) { rv = bzip2_compress(src, srclen, dst, dstlen, level, chdr, NULL); if (rv < 0) return (rv); - rv = COMPRESS_BZIP2; + rv = ADAPT_COMPRESS_BZIP2; bzip2_count++; } else { @@ -223,14 +223,14 @@ adapt_compress(void *src, uint64_t srclen, void *dst, rv = libbsc_compress(src, srclen, dst, dstlen, level, chdr, adat->bsc_data); if (rv < 0) return (rv); - rv = COMPRESS_BSC; + rv = ADAPT_COMPRESS_BSC; bsc_count++; #endif } else { rv = ppmd_compress(src, srclen, dst, dstlen, level, chdr, adat->ppmd_data); if (rv < 0) return (rv); - rv = COMPRESS_PPMD; + rv = ADAPT_COMPRESS_PPMD; ppmd_count++; } } @@ -247,16 +247,16 @@ adapt_decompress(void *src, uint64_t srclen, void *dst, cmp_flags = (chdr>>4) & CHDR_ALGO_MASK; - if (cmp_flags == COMPRESS_LZMA) { + if (cmp_flags == ADAPT_COMPRESS_LZMA) { return (lzma_decompress(src, srclen, dst, dstlen, level, chdr, adat->lzma_data)); - } else if (cmp_flags == COMPRESS_BZIP2) { + } else if (cmp_flags == ADAPT_COMPRESS_BZIP2) { return (bzip2_decompress(src, srclen, dst, dstlen, level, chdr, NULL)); - } else if (cmp_flags == COMPRESS_PPMD) { + } else if (cmp_flags == ADAPT_COMPRESS_PPMD) { return (ppmd_decompress(src, srclen, dst, dstlen, level, chdr, adat->ppmd_data)); - } else if (cmp_flags == COMPRESS_BSC) { + } else if (cmp_flags == ADAPT_COMPRESS_BSC) { #ifdef ENABLE_PC_LIBBSC return (libbsc_decompress(src, srclen, dst, dstlen, level, chdr, adat->bsc_data)); #else diff --git a/crypto/crypto_utils.h b/crypto/crypto_utils.h index 2f73a11..a07fd39 100644 --- a/crypto/crypto_utils.h +++ b/crypto/crypto_utils.h @@ -61,7 +61,8 @@ typedef enum { * to decode archives created with 1.2. New archives do not use SKEIN. */ CKSUM_SKEIN256 = 0x800, - CKSUM_SKEIN512 = 0x900 + CKSUM_SKEIN512 = 0x900, + CKSUM_INVALID = 0 } cksum_t; typedef struct { diff --git a/pcompress.h b/pcompress.h index 9dd2d3e..3296483 100644 --- a/pcompress.h +++ b/pcompress.h @@ -70,11 +70,11 @@ extern "C" { * lower 3 bits in higher nibble indicate chunk compression algorithm * in adaptive modes. */ -#define COMPRESS_NONE 0 -#define COMPRESS_LZMA 1 -#define COMPRESS_BZIP2 2 -#define COMPRESS_PPMD 3 -#define COMPRESS_BSC 4 +#define ADAPT_COMPRESS_NONE 0 +#define ADAPT_COMPRESS_LZMA 1 +#define ADAPT_COMPRESS_BZIP2 2 +#define ADAPT_COMPRESS_PPMD 3 +#define ADAPT_COMPRESS_BSC 4 #define CHDR_ALGO_MASK 7 extern uint32_t zlib_buf_extra(uint64_t buflen); diff --git a/rabin/global/db.c b/rabin/global/db.c index a61a26b..ad0b592 100644 --- a/rabin/global/db.c +++ b/rabin/global/db.c @@ -32,7 +32,6 @@ #include #include "db.h" -#include "config.h" #define ONE_PB (1125899906842624ULL) #define ONE_TB (1099511627776ULL) @@ -43,7 +42,7 @@ * Hashtable structures for in-memory index. */ typedef struct _hash_entry { - segment_entry_t *seg; + uint64_t seg_offset; struct _hash_entry *next; struct _hash_entry *lru_prev; struct _hash_entry *lru_next; @@ -61,9 +60,15 @@ typedef struct { hash_entry_t *lru_tail; uint64_t memlimit; uint64_t memused; - int hash_entry_size; + int hash_entry_size, intervals; } htablst_t; +typedef struct { + htablst_t *hlist; + int seg_fd_w; + int *tfd; +} seg_index_t; + archive_config_t * init_global_db(char *configfile) { @@ -83,17 +88,36 @@ init_global_db(char *configfile) return (cfg); } +void +static cleanup_htablst(htablst_t *htablst, int intervals) +{ + int i; + + if (htablst) { + if (htablst->list) { + for (i = 0; i < intervals; i++) { + if (htablst->list[i].htab) + free(htablst->list[i].htab); + } + free(htablst->list); + } + if (htablst->mlist) + free(htablst->mlist); + free(htablst); + } +} + archive_config_t * -init_global_db_s(char *path, uint32_t chunksize, int pct_interval, +init_global_db_s(char *path, char *tmppath, uint32_t chunksize, int pct_interval, compress_algo_t algo, cksum_t ck, cksum_t ck_sim, size_t file_sz, - size_t memlimit) + size_t memlimit, int nthreads) { archive_config_t *cfg; int rv; float diff; cfg = calloc(1, sizeof (archive_config_t)); - rv = set_config_s(cfg, algo, ck, ck_sim, chunksize, file_sz, chunks_per_seg, pct_interval); + rv = set_config_s(cfg, algo, ck, ck_sim, chunksize, file_sz, pct_interval); if (path != NULL) { printf("Disk based index not yet implemented.\n"); @@ -104,6 +128,7 @@ init_global_db_s(char *path, uint32_t chunksize, int pct_interval, uint64_t memreqd; htablst_t *htablst; int hash_entry_size; + seg_index_t *indx; // Compute total hashtable entries first intervals = 100 / pct_interval - 1; @@ -129,18 +154,53 @@ init_global_db_s(char *path, uint32_t chunksize, int pct_interval, // Now create as many hash tables as there are similarity match intervals // each having hash_slots / intervals slots. htablst = calloc(1, sizeof (htablst_t)); + if (!htablst) { + free(cfg); + return (NULL); + } + htablst->memlimit = memlimit; htablst->list = (htab_t *)calloc(intervals, sizeof (htab_t)); htablst->mlist = (pthread_mutex_t *)malloc(intervals * sizeof (pthread_mutex_t)); htablst->hash_entry_size = hash_entry_size; + htablst->intervals = intervals; for (i = 0; i < intervals; i++) { htablst->list[i].htab = (hash_entry_t **)calloc(hash_slots / intervals, sizeof (hash_entry_t *)); + if (!(htablst->list[i].htab)) { + cleanup_htablst(htablst, intervals); + free(cfg); + return (NULL); + } htablst->memused += ((hash_slots / intervals) * (sizeof (hash_entry_t *))); pthread_mutex_init(&(htablst->mlist[i]), NULL); } - cfg->dbdata = htablst; + + indx = (seg_index_t *)calloc(1, sizeof (seg_index_t)); + if (!indx) { + cleanup_htablst(htablst, intervals); + free(cfg); + return (NULL); + } + indx->hlist = htablst; + + strcpy(cfg->rootdir, tmppath); + strcat(cfg->rootdir, "/.segXXXXXX"); + indx->seg_fd_w = mkstemp(cfg->rootdir); + indx->tfd = (int *)malloc(sizeof (int) * nthreads); + if (indx->seg_fd_w == -1 || indx->tfd == NULL) { + cleanup_htablst(htablst, intervals); + free(cfg); + if (indx->tfd) + free(indx->tfd); + return (NULL); + } + + for (i = 0; i < nthreads; i++) { + indx->tfd[i] = open(cfg->rootdir, O_RDONLY); + } + cfg->dbdata = indx; slab_cache_add(hash_entry_size); slab_cache_add(cfg->chunk_cksum_sz); } @@ -148,6 +208,13 @@ init_global_db_s(char *path, uint32_t chunksize, int pct_interval, } int -db_insert_s(archive_config_t *cfg, uchar_t *cksum, int interval_num) +db_insert_s(archive_config_t *cfg, uchar_t *sim_cksum, int interval, segment_entry_t *seg, int thr_id) { + return (0); +} + +segment_entry_t * +db_query_s(archive_config_t *cfg, uchar_t *sim_cksum, int interval, int thr_id) +{ + return (0); } diff --git a/rabin/global/db.h b/rabin/global/db.h index 2220c18..0d1ccba 100644 --- a/rabin/global/db.h +++ b/rabin/global/db.h @@ -21,14 +21,19 @@ #ifndef _DB_H #define _DB_H +#include + #ifdef __cplusplus extern "C" { #endif archive_config_t *init_global_db(char *configfile); -archive_config_t *init_global_db_s(char *path, uint32_t chunksize, int pct_interval, - compress_algo_t algo, cksum_t ck, cksum_t ck_sim, size_t file_sz, - size_t memlimit); +archive_config_t *init_global_db_s(char *path, char *tmppath, uint32_t chunksize, + int pct_interval, compress_algo_t algo, cksum_t ck, + cksum_t ck_sim, size_t file_sz, size_t memlimit, int nthreads); +int db_insert_s(archive_config_t *cfg, uchar_t *sim_cksum, int interval, + segment_entry_t *seg, int thr_id); +segment_entry_t *db_query_s(archive_config_t *cfg, uchar_t *sim_cksum, int interval, int thr_id); #ifdef __cplusplus } diff --git a/rabin/global/config.c b/rabin/global/dedupe_config.c similarity index 91% rename from rabin/global/config.c rename to rabin/global/dedupe_config.c index 28543ed..d5355df 100644 --- a/rabin/global/config.c +++ b/rabin/global/dedupe_config.c @@ -31,8 +31,8 @@ #include #include -#include "config.h" -#include "initdb.h" +#include "dedupe_config.h" +#include "db.h" #define ONE_PB (1125899906842624ULL) #define ONE_TB (1099511627776ULL) @@ -44,6 +44,7 @@ get_compress_level(compress_algo_t algo) { switch (algo) { case COMPRESS_NONE: + case COMPRESS_INVALID: return (0); case COMPRESS_LZFX: return (5); @@ -130,7 +131,7 @@ get_cksum_type(char *cksum_name) } static char * -get_cksum_str(chunk_cksum_t ck) +get_cksum_str(cksum_t ck) { if (ck == CKSUM_SHA256) { return ("SHA256"); @@ -154,7 +155,7 @@ get_cksum_str(chunk_cksum_t ck) } static int -get_cksum_sz(chunk_cksum_t ck) +get_cksum_sz(cksum_t ck) { if (ck == CKSUM_SHA256 || ck == CKSUM_BLAKE256 || ck == CKSUM_KECCAK256) { return (32); @@ -185,7 +186,7 @@ read_config(char *configfile, archive_config_t *cfg) return (1); } while (fgets(line, 255, fh) != NULL) { - int pos; + char *pos; if (strlen(line) < 9 || line[0] == '#') { continue; @@ -205,7 +206,7 @@ read_config(char *configfile, archive_config_t *cfg) } cfg->chunk_sz = ck; - } else if (strncmp(line, "ROOTDIR") == 0) { + } else if (strncmp(line, "ROOTDIR", 7) == 0) { struct stat sb; if (stat(pos, &sb) == -1) { if (errno != ENOENT) { @@ -222,7 +223,7 @@ read_config(char *configfile, archive_config_t *cfg) fclose(fh); return (1); } - } else if (strncmp(line, "ARCHIVESZ") == 0) { + } else if (strncmp(line, "ARCHIVESZ", 9) == 0) { int ovr; ssize_t arch_sz; ovr = parse_numeric(&arch_sz, pos); @@ -238,7 +239,7 @@ read_config(char *configfile, archive_config_t *cfg) } cfg->archive_sz = arch_sz; - } else if (strncmp(line, "VERIFY") == 0) { + } else if (strncmp(line, "VERIFY", 6) == 0) { if (strcmp(pos, "no") == 0) { cfg->verify_chunks = 0; @@ -249,21 +250,21 @@ read_config(char *configfile, archive_config_t *cfg) fclose(fh); return (1); } - } else if (strncmp(line, "COMPRESS") == 0) { + } else if (strncmp(line, "COMPRESS", 8) == 0) { cfg->algo = get_compress_algo(pos); if (cfg->algo == COMPRESS_INVALID) { fprintf(stderr, "Invalid COMPRESS setting.\n"); fclose(fh); return (1); } - } else if (strncmp(line, "CHUNK_CKSUM") == 0) { + } else if (strncmp(line, "CHUNK_CKSUM", 11) == 0) { cfg->chunk_cksum_type = get_cksum_type(pos); if (cfg->chunk_cksum_type == CKSUM_INVALID) { fprintf(stderr, "Invalid CHUNK_CKSUM setting.\n"); fclose(fh); return (1); } - } else if (strncmp(line, "SIMILARITY_CKSUM") == 0) { + } else if (strncmp(line, "SIMILARITY_CKSUM", 16) == 0) { cfg->chunk_cksum_type = get_cksum_type(pos); if (cfg->chunk_cksum_type == CKSUM_INVALID) { fprintf(stderr, "Invalid CHUNK_CKSUM setting.\n"); @@ -306,10 +307,10 @@ read_config(char *configfile, archive_config_t *cfg) cfg->container_sz = CONTAINER_ITEMS; container_sz_bytes = CONTAINER_ITEMS * segment_sz_bytes; - if (cfg->archive_sz / total_dirs < container_sz) + if (cfg->archive_sz / total_dirs < cfg->container_sz) cfg->num_containers = 1; else - cfg->num_containers = (cfg->archive_sz / total_dirs) / container_sz + 1; + cfg->num_containers = (cfg->archive_sz / total_dirs) / cfg->container_sz + 1; return (0); } @@ -317,6 +318,8 @@ read_config(char *configfile, archive_config_t *cfg) int write_config(char *configfile, archive_config_t *cfg) { + FILE *fh; + fh = fopen(configfile, "w"); if (fh == NULL) { perror(" "); @@ -325,7 +328,7 @@ write_config(char *configfile, archive_config_t *cfg) fprintf(fh, "#\n# Autogenerated config file\n# !! DO NOT EDIT !!\n#\n\n"); fprintf(fh, "ROOTDIR = %s\n", cfg->rootdir); - fprintf(fh, "CHUNKSZ = %u\n", cfg->chunk_sz; + fprintf(fh, "CHUNKSZ = %u\n", cfg->chunk_sz); fprintf(fh, "ARCHIVESZ = %" PRId64 "\n", cfg->archive_sz); if (cfg->verify_chunks) @@ -336,6 +339,8 @@ write_config(char *configfile, archive_config_t *cfg) fprintf(fh, "CHUNK_CKSUM = %s\n", get_cksum_str(cfg->chunk_cksum_type)); fprintf(fh, "\n"); fclose(fh); + + return (0); } int @@ -354,14 +359,13 @@ set_config_s(archive_config_t *cfg, compress_algo_t algo, cksum_t ck, cksum_t ck cfg->archive_sz = file_sz; if (cfg->archive_sz < ONE_TB) { - segment_sz_bytes = FOUR_MB; + cfg->segment_sz_bytes = FOUR_MB; } else { - segment_sz_bytes = EIGHT_MB; + cfg->segment_sz_bytes = EIGHT_MB; } - cfg->segment_sz_bytes = segment_sz_bytes; - cfg->segment_sz = segment_sz_bytes / cfg->chunk_sz_bytes; + cfg->segment_sz = cfg->segment_sz_bytes / cfg->chunk_sz_bytes; return (0); } diff --git a/rabin/global/config.h b/rabin/global/dedupe_config.h similarity index 95% rename from rabin/global/config.h rename to rabin/global/dedupe_config.h index bd3025f..b38b7ce 100644 --- a/rabin/global/config.h +++ b/rabin/global/dedupe_config.h @@ -23,6 +23,7 @@ #include #include +#include #ifdef __cplusplus extern "C" { @@ -32,6 +33,7 @@ extern "C" { #define DEFAULT_CHUNK_CKSUM CKSUM_SHA256 #define DEFAULT_SIMILARITY_CKSUM CKSUM_BLAKE256 #define DEFAULT_COMPRESS COMPRESS_LZ4 +#define CONTAINER_ITEMS 2048 #define MIN_CK 1 #define MAX_CK 5 @@ -64,9 +66,9 @@ typedef struct { } archive_config_t; typedef struct _segment_entry { - uint64_t offset; - uint32_t length; - uchar_t *cksum; + uint64_t chunk_offset; + uint32_t chunk_length; + uchar_t *chunk_cksum; } segment_entry_t; int read_config(char *configfile, archive_config_t *cfg); diff --git a/utils/utils.h b/utils/utils.h index 9365587..fd59175 100644 --- a/utils/utils.h +++ b/utils/utils.h @@ -133,7 +133,8 @@ typedef enum { COMPRESS_LZ4, COMPRESS_ZLIB, COMPRESS_BZIP2, - COMPRESS_LZMA + COMPRESS_LZMA, + COMPRESS_INVALID } compress_algo_t; typedef struct {