Work in progress global dedupe config setup.

This commit is contained in:
Moinak Ghosh 2012-11-29 22:28:50 +05:30
parent 1f0c237495
commit 6c3173f929
4 changed files with 216 additions and 6 deletions

View file

@ -94,7 +94,7 @@ LIBBSCGEN_OPT = -fopenmp
LIBBSCCPPFLAGS = -I$(LIBBSCDIR)/libbsc -DENABLE_PC_LIBBSC LIBBSCCPPFLAGS = -I$(LIBBSCDIR)/libbsc -DENABLE_PC_LIBBSC
BAKFILES = *~ lzma/*~ lzfx/*~ lz4/*~ rabin/*~ bsdiff/*~ lzp/*~ utils/*~ crypto/sha2/*~ \ BAKFILES = *~ lzma/*~ lzfx/*~ lz4/*~ rabin/*~ bsdiff/*~ lzp/*~ utils/*~ crypto/sha2/*~ \
crypto/sha2/intel/*~ crypto/aes/*~ crypto/scrypt/*~ crypto/*~ crypto/sha2/intel/*~ crypto/aes/*~ crypto/scrypt/*~ crypto/*~ rabin/global/*~
RM = rm -f RM = rm -f
RM_RF = rm -rf RM_RF = rm -rf

View file

@ -31,6 +31,7 @@
#include <sys/stat.h> #include <sys/stat.h>
#include <rabin_dedup.h> #include <rabin_dedup.h>
#include "config.h"
#include "initdb.h" #include "initdb.h"
#define ONE_PB (1125899906842624ULL) #define ONE_PB (1125899906842624ULL)
@ -38,6 +39,120 @@
#define FOUR_MB (4194304ULL) #define FOUR_MB (4194304ULL)
#define EIGHT_MB (8388608ULL) #define EIGHT_MB (8388608ULL)
/*
 * Return the preset compression level associated with a compression
 * algorithm: 5 for LZFX, 1 for LZ4 and 6 for ZLIB, BZIP2 and LZMA.
 * COMPRESS_NONE and any value not listed above yield 0.
 */
static int
get_compress_level(compress_algo_t algo)
{
	if (algo == COMPRESS_LZFX) {
		return (5);
	}
	if (algo == COMPRESS_LZ4) {
		return (1);
	}
	if (algo == COMPRESS_ZLIB || algo == COMPRESS_BZIP2 ||
	    algo == COMPRESS_LZMA) {
		return (6);
	}

	/* COMPRESS_NONE and every unrecognised value. */
	return (0);
}
/*
 * Translate a compression algorithm name into its compress_algo_t value.
 * The comparison is exact (strcmp), so names must be given in lower case.
 * Returns COMPRESS_INVALID when the name matches none of the known ones.
 */
static compress_algo_t
get_compress_algo(char *algo_name)
{
	static const struct {
		const char	*name;
		compress_algo_t	algo;
	} known[] = {
		{ "none",	COMPRESS_NONE },
		{ "lzfx",	COMPRESS_LZFX },
		{ "lz4",	COMPRESS_LZ4 },
		{ "zlib",	COMPRESS_ZLIB },
		{ "bzip2",	COMPRESS_BZIP2 },
		{ "lzma",	COMPRESS_LZMA }
	};
	unsigned int idx;

	for (idx = 0; idx < sizeof (known) / sizeof (known[0]); idx++) {
		if (strcmp(algo_name, known[idx].name) == 0) {
			return (known[idx].algo);
		}
	}
	return (COMPRESS_INVALID);
}
/*
 * Return the textual name of a compression algorithm. The returned pointer
 * refers to a string literal and must not be modified or freed.
 * Values without a known name map to "invalid".
 */
static char *
get_compress_str(compress_algo_t algo)
{
	switch (algo) {
	case COMPRESS_NONE:
		return ("none");
	case COMPRESS_LZFX:
		return ("lzfx");
	case COMPRESS_LZ4:
		return ("lz4");
	case COMPRESS_ZLIB:
		return ("zlib");
	case COMPRESS_BZIP2:
		return ("bzip2");
	case COMPRESS_LZMA:
		return ("lzma");
	default:
		break;
	}
	return ("invalid");
}
/*
 * Translate a chunk checksum name into its chunk_cksum_t value.
 * The comparison is exact (strcmp), so names must be given in upper case.
 * Returns CKSUM_INVALID when the name matches none of the known ones.
 */
static chunk_cksum_t
get_cksum_type(char *cksum_name)
{
	if (strcmp(cksum_name, "SHA256") == 0) {
		return (CKSUM_SHA256);
	} else if (strcmp(cksum_name, "SHA512") == 0) {
		return (CKSUM_SHA512);
	} else if (strcmp(cksum_name, "SKEIN256") == 0) {
		return (CKSUM_SKEIN256);
	} else if (strcmp(cksum_name, "SKEIN512") == 0) {
		return (CKSUM_SKEIN512);
	}
	return (CKSUM_INVALID);
}
/*
 * Return the textual name of a chunk checksum type. The returned pointer
 * refers to a string literal and must not be modified or freed.
 * Values without a known name map to "INVALID".
 */
static char *
get_cksum_str(chunk_cksum_t ck)
{
	switch (ck) {
	case CKSUM_SHA256:
		return ("SHA256");
	case CKSUM_SHA512:
		return ("SHA512");
	case CKSUM_SKEIN256:
		return ("SKEIN256");
	case CKSUM_SKEIN512:
		return ("SKEIN512");
	default:
		break;
	}
	return ("INVALID");
}
/*
 * Return the digest size in bytes for a chunk checksum type: 32 for the
 * 256-bit variants, 64 for the 512-bit variants, and 0 for anything else.
 */
static int
get_cksum_sz(chunk_cksum_t ck)
{
	switch (ck) {
	case CKSUM_SHA256:
	case CKSUM_SKEIN256:
		return (32);
	case CKSUM_SHA512:
	case CKSUM_SKEIN512:
		return (64);
	default:
		break;
	}
	return (0);
}
int int
read_config(char *configfile, archive_config_t *cfg) read_config(char *configfile, archive_config_t *cfg)
{ {
@ -45,6 +160,12 @@ read_config(char *configfile, archive_config_t *cfg)
char line[255]; char line[255];
uint32_t container_sz_bytes, segment_sz_bytes, total_dirs, i; uint32_t container_sz_bytes, segment_sz_bytes, total_dirs, i;
// Default
cfg->verify_chunks = 0;
cfg->algo = COMPRESS_LZ4;
cfg->chunk_cksum_type = DEFAULT_CKSUM;
cfg->similarity_interval = DEFAULT_SIMILARITY_INTERVAL;
fh = fopen(configfile, "r"); fh = fopen(configfile, "r");
if (fh == NULL) { if (fh == NULL) {
perror(" "); perror(" ");
@ -76,7 +197,7 @@ read_config(char *configfile, archive_config_t *cfg)
if (stat(pos, &sb) == -1) { if (stat(pos, &sb) == -1) {
if (errno != ENOENT) { if (errno != ENOENT) {
perror(" "); perror(" ");
fprintf(stderr, "Invalid ROOTDIR\n"); fprintf(stderr, "Invalid ROOTDIR.\n");
fclose(fh); fclose(fh);
return (1); return (1);
} else { } else {
@ -103,9 +224,37 @@ read_config(char *configfile, archive_config_t *cfg)
return (1); return (1);
} }
cfg->archive_sz = arch_sz; cfg->archive_sz = arch_sz;
} else if (strncmp(line, "VERIFY") == 0) {
if (strcmp(pos, "no") == 0) {
cfg->verify_chunks = 0;
} else if (strcmp(pos, "yes") == 0) {
cfg->verify_chunks = 1;
} else {
fprintf(stderr, "Invalid VERIFY setting. Must be either yes or no.\n");
fclose(fh);
return (1);
}
} else if (strncmp(line, "COMPRESS") == 0) {
cfg->algo = get_compress_algo(pos);
if (cfg->algo == COMPRESS_INVALID) {
fprintf(stderr, "Invalid COMPRESS setting.\n");
fclose(fh);
return (1);
}
} else if (strncmp(line, "CHUNK_CKSUM") == 0) {
cfg->chunk_cksum_type = get_cksum_type(pos);
if (cfg->chunk_cksum_type == CKSUM_INVALID) {
fprintf(stderr, "Invalid CHUNK_CKSUM setting.\n");
fclose(fh);
return (1);
}
} }
} }
fclose(fh); fclose(fh);
cfg->compress_level = get_compress_level(cfg->algo);
cfg->chunk_cksum_sz = get_cksum_sz(cfg->chunk_cksum_type);
/* /*
* Now compute the remaining parameters. * Now compute the remaining parameters.
@ -139,4 +288,30 @@ read_config(char *configfile, archive_config_t *cfg)
cfg->num_containers = 1; cfg->num_containers = 1;
else else
cfg->num_containers = (cfg->archive_sz / total_dirs) / container_sz + 1; cfg->num_containers = (cfg->archive_sz / total_dirs) / container_sz + 1;
return (0);
}
/*
 * Write the configurable fields of cfg (ROOTDIR, CHUNKSZ, ARCHIVESZ, VERIFY,
 * COMPRESS and CHUNK_CKSUM) to configfile as "KEY = value" lines, replacing
 * any existing file contents.
 *
 * Returns 0 on success and 1 if the file cannot be opened, written or closed.
 */
int
write_config(char *configfile, archive_config_t *cfg)
{
	FILE *fh;

	fh = fopen(configfile, "w");
	if (fh == NULL) {
		perror(" ");
		return (1);
	}

	fprintf(fh, "#\n# Autogenerated config file\n# !! DO NOT EDIT !!\n#\n\n");
	fprintf(fh, "ROOTDIR = %s\n", cfg->rootdir);
	fprintf(fh, "CHUNKSZ = %" PRIu32 "\n", cfg->chunk_sz);
	fprintf(fh, "ARCHIVESZ = %" PRId64 "\n", cfg->archive_sz);
	if (cfg->verify_chunks)
		fprintf(fh, "VERIFY = yes\n");
	else
		fprintf(fh, "VERIFY = no\n");
	fprintf(fh, "COMPRESS = %s\n", get_compress_str(cfg->algo));
	fprintf(fh, "CHUNK_CKSUM = %s\n", get_cksum_str(cfg->chunk_cksum_type));
	fprintf(fh, "\n");

	/* A failed fprintf leaves the stream error flag set. */
	if (ferror(fh)) {
		fprintf(stderr, "Error writing config file.\n");
		fclose(fh);
		return (1);
	}

	/* fclose flushes buffered data, so it can still report a write error. */
	if (fclose(fh) != 0) {
		perror(" ");
		return (1);
	}
	return (0);
}

View file

@ -28,24 +28,46 @@ extern "C" {
#endif #endif
#define DEFAULT_SIMILARITY_INTERVAL 10 #define DEFAULT_SIMILARITY_INTERVAL 10
#define DEFAULT_CKSUM "SHA256" #define DEFAULT_CKSUM CKSUM_SHA256
#define CONTAINER_ITEMS 2048 #define CONTAINER_ITEMS 2048
#define MIN_CK 1 #define MIN_CK 1
#define MAX_CK 5 #define MAX_CK 5
/*
 * Compression algorithms that can be selected for segments.
 * COMPRESS_INVALID is the sentinel for an unrecognised algorithm; it must
 * remain the last enumerator.
 */
typedef enum {
	COMPRESS_NONE = 0,
	COMPRESS_LZFX,
	COMPRESS_LZ4,
	COMPRESS_ZLIB,
	COMPRESS_BZIP2,
	COMPRESS_LZMA,
	COMPRESS_INVALID
} compress_algo_t;
/*
 * Digest types that can be selected for hash based chunk lookup.
 * CKSUM_INVALID is the sentinel for an unrecognised digest; it must remain
 * the last enumerator.
 */
typedef enum {
	CKSUM_SHA256,
	CKSUM_SHA512,
	CKSUM_SKEIN256,
	CKSUM_SKEIN512,
	CKSUM_INVALID
} chunk_cksum_t;
// 8GB // 8GB
#define MIN_ARCHIVE_SZ (8589934592ULL) #define MIN_ARCHIVE_SZ (8589934592ULL)
typedef struct { typedef struct {
char rootdir[PATH_MAX+1]; char rootdir[PATH_MAX+1];
uint32_t chunk_sz; // Numeric ID: 1 - 4k ... 5 - 64k uint32_t chunk_sz; // Numeric ID: 1 - 4k ... 5 - 64k
uint64_t archive_sz; // Total size of archive in bytes. int64_t archive_sz; // Total size of archive in bytes.
int verify_chunks; // Whether to use memcmp() to compare chunks byte for byte.
compress_algo_t algo; // Which compression algo for segments.
int compress_level; // Default preset compression level per algo.
int chunk_cksum_type; // Which digest to use for hash based chunk lookup. int chunk_cksum_type; // Which digest to use for hash based chunk lookup.
int chunk_cksum_sz; // Size of cksum in bytes.
int similarity_interval; // Similarity based match intervals in %age. int similarity_interval; // Similarity based match intervals in %age.
// The items below are computed given the above // The items below are computed given the above
// components. // components.
uint32_t chunk_sz_bytes; uint32_t chunk_sz_bytes; // Average chunk size
uint32_t segment_sz; // Number of chunks uint32_t segment_sz; // Number of chunks
uint32_t container_sz; // Number of segments uint32_t container_sz; // Number of segments
int directory_fanout; // Number of subdirectories in a directory int directory_fanout; // Number of subdirectories in a directory
@ -54,6 +76,7 @@ typedef struct {
} archive_config_t; } archive_config_t;
int read_config(char *configfile, archive_config_t *cfg); int read_config(char *configfile, archive_config_t *cfg);
int write_config(char *configfile, archive_config_t *cfg);
#ifdef __cplusplus #ifdef __cplusplus
} }

View file

@ -32,7 +32,19 @@
#include "initdb.h" #include "initdb.h"
int archive_config_t *
init_global_db(char *configfile) init_global_db(char *configfile)
{ {
archive_config_t *cfg;
int rv;
cfg = calloc(1, sizeof (archive_config_t));
if (!cfg) {
fprintf(stderr, "Memory allocation failure\n");
return (NULL);
}
rv = read_config(configfile, cfg);
if (rv != 0)
return (NULL);
} }