From 6c3173f92939918da42e625fb8be1b4c65131201 Mon Sep 17 00:00:00 2001
From: Moinak Ghosh
Date: Thu, 29 Nov 2012 22:28:50 +0530
Subject: [PATCH] Work in progress global dedupe config setup.

---
Notes (not part of the commit message):
  - The commit id above and the post-image blob ids on the "index" lines
    belong to the original commit; the corrected "+" lines below produce
    different blobs.
  - The first rabin/global/config.c hunk carries no leading context
    because the header names of the two preceding #include lines are
    not available.

 Makefile.in           |   2 +-
 rabin/global/config.c | 209 +++++++++++++++++++++++++++++++++++++++++-
 rabin/global/config.h |  31 ++++++-
 rabin/global/initdb.c |  22 ++++-
 4 files changed, 258 insertions(+), 6 deletions(-)

diff --git a/Makefile.in b/Makefile.in
index 1ff7b97..e7cabdf 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -94,7 +94,7 @@ LIBBSCGEN_OPT = -fopenmp
 LIBBSCCPPFLAGS = -I$(LIBBSCDIR)/libbsc -DENABLE_PC_LIBBSC
 
 BAKFILES = *~ lzma/*~ lzfx/*~ lz4/*~ rabin/*~ bsdiff/*~ lzp/*~ utils/*~ crypto/sha2/*~ \
-	crypto/sha2/intel/*~ crypto/aes/*~ crypto/scrypt/*~ crypto/*~
+	crypto/sha2/intel/*~ crypto/aes/*~ crypto/scrypt/*~ crypto/*~ rabin/global/*~
 
 RM = rm -f
 RM_RF = rm -rf
diff --git a/rabin/global/config.c b/rabin/global/config.c
index 4ad9f33..267c78e 100644
--- a/rabin/global/config.c
+++ b/rabin/global/config.c
@@ -34,3 +34,4 @@
+#include "config.h"
 #include "initdb.h"
 
 #define ONE_PB (1125899906842624ULL)
@@ -38,6 +39,144 @@
 #define FOUR_MB (4194304ULL)
 #define EIGHT_MB (8388608ULL)
 
+/*
+ * Default preset compression level for the given algorithm.
+ * Unknown or invalid algorithms map to level 0.
+ */
+static int
+get_compress_level(compress_algo_t algo)
+{
+	switch (algo) {
+		case COMPRESS_NONE:
+			return (0);
+		case COMPRESS_LZFX:
+			return (5);
+		case COMPRESS_LZ4:
+			return (1);
+		case COMPRESS_ZLIB:
+		case COMPRESS_BZIP2:
+		case COMPRESS_LZMA:
+			return (6);
+		default:
+			break;
+	}
+	return (0);
+}
+
+/*
+ * Map a COMPRESS setting value from the config file to its algorithm id.
+ * Returns COMPRESS_INVALID when the name is not recognised.
+ */
+static compress_algo_t
+get_compress_algo(const char *algo_name)
+{
+	if (strcmp(algo_name, "none") == 0) {
+		return (COMPRESS_NONE);
+
+	} else if (strcmp(algo_name, "lzfx") == 0) {
+		return (COMPRESS_LZFX);
+
+	} else if (strcmp(algo_name, "lz4") == 0) {
+		return (COMPRESS_LZ4);
+
+	} else if (strcmp(algo_name, "zlib") == 0) {
+		return (COMPRESS_ZLIB);
+
+	} else if (strcmp(algo_name, "bzip2") == 0) {
+		return (COMPRESS_BZIP2);
+
+	} else if (strcmp(algo_name, "lzma") == 0) {
+		return (COMPRESS_LZMA);
+	}
+	return (COMPRESS_INVALID);
+}
+
+/*
+ * Inverse of get_compress_algo(): the config file spelling of an algorithm id.
+ */
+static const char *
+get_compress_str(compress_algo_t algo)
+{
+	if (algo == COMPRESS_NONE) {
+		return ("none");
+
+	} else if (algo == COMPRESS_LZFX) {
+		return ("lzfx");
+
+	} else if (algo == COMPRESS_LZ4) {
+		return ("lz4");
+
+	} else if (algo == COMPRESS_ZLIB) {
+		return ("zlib");
+
+	} else if (algo == COMPRESS_BZIP2) {
+		return ("bzip2");
+
+	} else if (algo == COMPRESS_LZMA) {
+		return ("lzma");
+	}
+	return ("invalid");
+}
+
+/*
+ * Map a CHUNK_CKSUM setting value from the config file to its digest id.
+ * Returns CKSUM_INVALID when the name is not recognised.
+ */
+static chunk_cksum_t
+get_cksum_type(const char *cksum_name)
+{
+	if (strcmp(cksum_name, "SHA256") == 0) {
+		return (CKSUM_SHA256);
+
+	} else if (strcmp(cksum_name, "SHA512") == 0) {
+		return (CKSUM_SHA512);
+
+	} else if (strcmp(cksum_name, "SKEIN256") == 0) {
+		return (CKSUM_SKEIN256);
+
+	} else if (strcmp(cksum_name, "SKEIN512") == 0) {
+		return (CKSUM_SKEIN512);
+	}
+	return (CKSUM_INVALID);
+}
+
+/*
+ * Inverse of get_cksum_type(): the config file spelling of a digest id.
+ */
+static const char *
+get_cksum_str(chunk_cksum_t ck)
+{
+	if (ck == CKSUM_SHA256) {
+		return ("SHA256");
+
+	} else if (ck == CKSUM_SHA512) {
+		return ("SHA512");
+
+	} else if (ck == CKSUM_SKEIN256) {
+		return ("SKEIN256");
+
+	} else if (ck == CKSUM_SKEIN512) {
+		return ("SKEIN512");
+	}
+	return ("INVALID");
+}
+
+/*
+ * Digest length in bytes for the given checksum type, or 0 if the type
+ * is not one of the known digests.
+ */
+static int
+get_cksum_sz(chunk_cksum_t ck)
+{
+	if (ck == CKSUM_SHA256 || ck == CKSUM_SKEIN256) {
+		return (32);
+
+	} else if (ck == CKSUM_SHA512 || ck == CKSUM_SKEIN512) {
+		return (64);
+	}
+	return (0);
+}
+
 int
 read_config(char *configfile, archive_config_t *cfg)
 {
@@ -45,6 +184,12 @@ read_config(char *configfile, archive_config_t *cfg)
 	char line[255];
 	uint32_t container_sz_bytes, segment_sz_bytes, total_dirs, i;
 
+	// Defaults; VERIFY, COMPRESS and CHUNK_CKSUM lines override them.
+	cfg->verify_chunks = 0;
+	cfg->algo = COMPRESS_LZ4;
+	cfg->chunk_cksum_type = DEFAULT_CKSUM;
+	cfg->similarity_interval = DEFAULT_SIMILARITY_INTERVAL;
+
 	fh = fopen(configfile, "r");
 	if (fh == NULL) {
 		perror(" ");
@@ -76,7 +221,7 @@ read_config(char *configfile, archive_config_t *cfg)
 			if (stat(pos, &sb) == -1) {
 				if (errno != ENOENT) {
 					perror(" ");
-					fprintf(stderr, "Invalid ROOTDIR\n");
+					fprintf(stderr, "Invalid ROOTDIR.\n");
 					fclose(fh);
 					return (1);
 				} else {
@@ -103,9 +248,37 @@ read_config(char *configfile, archive_config_t *cfg)
 				return (1);
 			}
 			cfg->archive_sz = arch_sz;
+
+		} else if (strncmp(line, "VERIFY", 6) == 0) {
+			if (strcmp(pos, "no") == 0) {
+				cfg->verify_chunks = 0;
+
+			} else if (strcmp(pos, "yes") == 0) {
+				cfg->verify_chunks = 1;
+			} else {
+				fprintf(stderr, "Invalid VERIFY setting. Must be either yes or no.\n");
+				fclose(fh);
+				return (1);
+			}
+		} else if (strncmp(line, "COMPRESS", 8) == 0) {
+			cfg->algo = get_compress_algo(pos);
+			if (cfg->algo == COMPRESS_INVALID) {
+				fprintf(stderr, "Invalid COMPRESS setting.\n");
+				fclose(fh);
+				return (1);
+			}
+		} else if (strncmp(line, "CHUNK_CKSUM", 11) == 0) {
+			cfg->chunk_cksum_type = get_cksum_type(pos);
+			if (cfg->chunk_cksum_type == CKSUM_INVALID) {
+				fprintf(stderr, "Invalid CHUNK_CKSUM setting.\n");
+				fclose(fh);
+				return (1);
+			}
 		}
 	}
 	fclose(fh);
+	cfg->compress_level = get_compress_level(cfg->algo);
+	cfg->chunk_cksum_sz = get_cksum_sz(cfg->chunk_cksum_type);
 
 	/*
 	 * Now compute the remaining parameters.
@@ -139,4 +312,38 @@ read_config(char *configfile, archive_config_t *cfg)
 		cfg->num_containers = 1;
 	else
 		cfg->num_containers = (cfg->archive_sz / total_dirs) / container_sz + 1;
+
+	return (0);
+}
+
+/*
+ * Write the settable fields of cfg to configfile as "KEY = value" lines,
+ * replacing any existing file. Returns 0 on success and 1 if the file
+ * cannot be opened for writing.
+ */
+int
+write_config(char *configfile, archive_config_t *cfg)
+{
+	FILE *fh;
+
+	fh = fopen(configfile, "w");
+	if (fh == NULL) {
+		perror(" ");
+		return (1);
+	}
+
+	fprintf(fh, "#\n# Autogenerated config file\n# !! DO NOT EDIT !!\n#\n\n");
+	fprintf(fh, "ROOTDIR = %s\n", cfg->rootdir);
+	fprintf(fh, "CHUNKSZ = %u\n", cfg->chunk_sz);
+	fprintf(fh, "ARCHIVESZ = %" PRId64 "\n", cfg->archive_sz);
+
+	if (cfg->verify_chunks)
+		fprintf(fh, "VERIFY = yes\n");
+	else
+		fprintf(fh, "VERIFY = no\n");
+	fprintf(fh, "COMPRESS = %s\n", get_compress_str(cfg->algo));
+	fprintf(fh, "CHUNK_CKSUM = %s\n", get_cksum_str(cfg->chunk_cksum_type));
+	fprintf(fh, "\n");
+	fclose(fh);
+	return (0);
 }
diff --git a/rabin/global/config.h b/rabin/global/config.h
index 20e968e..05217ce 100644
--- a/rabin/global/config.h
+++ b/rabin/global/config.h
@@ -28,24 +28,48 @@ extern "C" {
 #endif
 
 #define DEFAULT_SIMILARITY_INTERVAL 10
-#define DEFAULT_CKSUM "SHA256"
+#define DEFAULT_CKSUM CKSUM_SHA256
 #define CONTAINER_ITEMS 2048
 #define MIN_CK 1
 #define MAX_CK 5
 
+// Compression algorithms selectable through the COMPRESS setting.
+typedef enum {
+	COMPRESS_NONE=0,
+	COMPRESS_LZFX,
+	COMPRESS_LZ4,
+	COMPRESS_ZLIB,
+	COMPRESS_BZIP2,
+	COMPRESS_LZMA,
+	COMPRESS_INVALID
+} compress_algo_t;
+
+// Chunk digests selectable through the CHUNK_CKSUM setting.
+typedef enum {
+	CKSUM_SHA256,
+	CKSUM_SHA512,
+	CKSUM_SKEIN256,
+	CKSUM_SKEIN512,
+	CKSUM_INVALID
+} chunk_cksum_t;
+
 // 8GB
 #define MIN_ARCHIVE_SZ (8589934592ULL)
 
 typedef struct {
 	char rootdir[PATH_MAX+1];
 	uint32_t chunk_sz; // Numeric ID: 1 - 4k ... 5 - 64k
-	uint64_t archive_sz; // Total size of archive in bytes.
+	int64_t archive_sz; // Total size of archive in bytes.
+	int verify_chunks; // Whether to use memcmp() to compare chunks byte for byte.
+	compress_algo_t algo; // Which compression algo for segments.
+	int compress_level; // Default preset compression level per algo.
 	int chunk_cksum_type; // Which digest to use for hash based chunk lookup.
+	int chunk_cksum_sz; // Size of cksum in bytes.
 	int similarity_interval; // Similarity based match intervals in %age.
 
 	// The items below are computed given the above
 	// components.
-	uint32_t chunk_sz_bytes;
+	uint32_t chunk_sz_bytes; // Average chunk size
 	uint32_t segment_sz; // Number of chunks
 	uint32_t container_sz; // Number of segments
 	int directory_fanout; // Number of subdirectories in a directory
@@ -54,6 +78,7 @@ typedef struct {
 } archive_config_t;
 
 int read_config(char *configfile, archive_config_t *cfg);
+int write_config(char *configfile, archive_config_t *cfg);
 
 #ifdef __cplusplus
 }
diff --git a/rabin/global/initdb.c b/rabin/global/initdb.c
index 04a4561..ea88305 100644
--- a/rabin/global/initdb.c
+++ b/rabin/global/initdb.c
@@ -32,7 +32,27 @@
 
 #include "initdb.h"
 
-int
+/*
+ * Allocate an archive configuration and populate it from configfile.
+ * Returns the new configuration, which the caller owns and must free(),
+ * or NULL if allocation or read_config() fails.
+ */
+archive_config_t *
 init_global_db(char *configfile)
 {
+	archive_config_t *cfg;
+	int rv;
+
+	cfg = calloc(1, sizeof (archive_config_t));
+	if (!cfg) {
+		fprintf(stderr, "Memory allocation failure\n");
+		return (NULL);
+	}
+
+	rv = read_config(configfile, cfg);
+	if (rv != 0) {
+		free(cfg);
+		return (NULL);
+	}
+	return (cfg);
 }