From 3b1d6b55fe7bdccf1f30be184126d67dc7b2082c Mon Sep 17 00:00:00 2001 From: Moinak Ghosh Date: Mon, 19 Nov 2012 21:41:56 +0530 Subject: [PATCH] Work in progress global dedupe config loader. --- rabin/global/config.c | 142 ++++++++++++++++++++++++++++++++++++++++++ rabin/global/config.h | 62 ++++++++++++++++++ rabin/global/initdb.c | 38 +++++++++++ rabin/global/initdb.h | 34 ++++++++++ 4 files changed, 276 insertions(+) create mode 100644 rabin/global/config.c create mode 100644 rabin/global/config.h create mode 100644 rabin/global/initdb.c create mode 100644 rabin/global/initdb.h diff --git a/rabin/global/config.c b/rabin/global/config.c new file mode 100644 index 0000000..4ad9f33 --- /dev/null +++ b/rabin/global/config.c @@ -0,0 +1,142 @@ +/* + * This file is a part of Pcompress, a chunked parallel multi- + * algorithm lossless compression and decompression program. + * + * Copyright (C) 2012 Moinak Ghosh. All rights reserved. + * Use is subject to license terms. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 3 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * moinakg@belenix.org, http://moinakg.wordpress.com/ + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "initdb.h" + +#define ONE_PB (1125899906842624ULL) +#define ONE_TB (1099511627776ULL) +#define FOUR_MB (4194304ULL) +#define EIGHT_MB (8388608ULL) + +int +read_config(char *configfile, archive_config_t *cfg) +{ + FILE *fh; + char line[255]; + uint32_t container_sz_bytes, segment_sz_bytes, total_dirs, i; + + fh = fopen(configfile, "r"); + if (fh == NULL) { + perror(" "); + return (1); + } + while (fgets(line, 255, fh) != NULL) { + int pos; + + if (strlen(line) < 9 || line[0] == '#') { + continue; + } + pos = strchr(line, '='); + if (pos == NULL) continue; + + pos++; // Skip '=' char + while (isspace(*pos)) pos++; + + if (strncmp(line, "CHUNKSZ", 7) == 0) { + int ck = atoi(pos); + if (ck < MIN_CK || ck > MAX_CK) { + fprintf(stderr, "Invalid Chunk Size: %d\n", ck); + fclose(fh); + return (1); + } + cfg->chunk_sz = ck; + + } else if (strncmp(line, "ROOTDIR") == 0) { + struct stat sb; + if (stat(pos, &sb) == -1) { + if (errno != ENOENT) { + perror(" "); + fprintf(stderr, "Invalid ROOTDIR\n"); + fclose(fh); + return (1); + } else { + memset(cfg->rootdir, 0, PATH_MAX+1); + strncpy(cfg->rootdir, pos, PATH_MAX); + } + } else { + fprintf(stderr, "Invalid ROOTDIR. It already exists.\n"); + fclose(fh); + return (1); + } + } else if (strncmp(line, "ARCHIVESZ") == 0) { + int ovr; + ssize_t arch_sz; + ovr = parse_numeric(&arch_sz, pos); + if (ovr == 1) { + fprintf(stderr, "ARCHIVESZ value too large.\n"); + fclose(fh); + return (1); + } + if (ovr == 2) { + fprintf(stderr, "Invalid ARCHIVESZ value.\n"); + fclose(fh); + return (1); + } + cfg->archive_sz = arch_sz; + } + } + fclose(fh); + + /* + * Now compute the remaining parameters. 
+ */ + cfg->chunk_sz_bytes = RAB_BLK_AVG_SZ(cfg->chunk_sz); + cfg->directory_levels = 2; + if (cfg->archive_sz < ONE_TB) { + segment_sz_bytes = FOUR_MB; + cfg->directory_fanout = 128; + + } else if (cfg->archive_sz < ONE_PB) { + segment_sz_bytes = EIGHT_MB; + cfg->directory_fanout = 256; + } else { + segment_sz_bytes = EIGHT_MB; + cfg->directory_fanout = 256; + cfg->directory_levels = 3; + } + + cfg->segment_sz = segment_sz_bytes / cfg->chunk_sz_bytes; + + total_dirs = 1; + for (i = 0; i < cfg->directory_levels; i++) + total_dirs *= cfg->directory_fanout; + + // Fixed number of segments in a container for now. + cfg->container_sz = CONTAINER_ITEMS; + container_sz_bytes = CONTAINER_ITEMS * segment_sz_bytes; + + if (cfg->archive_sz / total_dirs < container_sz) + cfg->num_containers = 1; + else + cfg->num_containers = (cfg->archive_sz / total_dirs) / container_sz + 1; +} diff --git a/rabin/global/config.h b/rabin/global/config.h new file mode 100644 index 0000000..20e968e --- /dev/null +++ b/rabin/global/config.h @@ -0,0 +1,62 @@ +/* + * This file is a part of Pcompress, a chunked parallel multi- + * algorithm lossless compression and decompression program. + * + * Copyright (C) 2012 Moinak Ghosh. All rights reserved. + * Use is subject to license terms. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 3 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
/*
 * Configuration constants, the archive configuration structure and the
 * loader prototype for the global dedupe archive.
 */
#ifndef	PCOMPRESS_GLOBAL_CONFIG_H
#define	PCOMPRESS_GLOBAL_CONFIG_H

#include <limits.h>	/* PATH_MAX */
#include <stdint.h>	/* uint32_t, uint64_t */

/*
 * <limits.h> is not required to define PATH_MAX; fall back to a common
 * value so that this header always compiles on its own.
 */
#ifndef	PATH_MAX
#define	PATH_MAX	4096
#endif

#ifdef	__cplusplus
extern "C" {
#endif

#define	DEFAULT_SIMILARITY_INTERVAL	10
#define	DEFAULT_CKSUM			"SHA256"
#define	CONTAINER_ITEMS			2048	/* Segments per container. */
#define	MIN_CK				1	/* Smallest valid chunk size ID. */
#define	MAX_CK				5	/* Largest valid chunk size ID. */

/* 8GB */
#define	MIN_ARCHIVE_SZ			(8589934592ULL)

/*
 * Archive configuration. The first group of members is supplied by the
 * user; the second group is computed from the first.
 */
typedef struct {
	char rootdir[PATH_MAX+1];
	uint32_t chunk_sz;		// Numeric ID: 1 - 4k ... 5 - 64k
	uint64_t archive_sz;		// Total size of archive in bytes.
	int chunk_cksum_type;		// Which digest to use for hash based chunk lookup.
	int similarity_interval;	// Similarity based match intervals in %age.

	// The items below are computed given the above
	// components.
	uint32_t chunk_sz_bytes;
	uint32_t segment_sz;		// Number of chunks
	uint32_t container_sz;		// Number of segments
	int directory_fanout;		// Number of subdirectories in a directory
	int directory_levels;		// Levels of nested directories
	int num_containers;		// Number of containers in a directory
} archive_config_t;

/*
 * Read the configuration file named by configfile into cfg.
 */
int read_config(char *configfile, archive_config_t *cfg);

#ifdef	__cplusplus
}
#endif

#endif
/*
 * Entry point for initializing the global dedupe database from a
 * configuration file.
 *
 * This is still a placeholder: no database work is performed yet. It only
 * validates its argument so that callers always receive a defined result
 * instead of the indeterminate value produced by falling off the end of a
 * non-void function.
 *
 * configfile: path of the configuration file; must not be NULL.
 *
 * Returns 0 on success and 1 when configfile is NULL.
 */
int
init_global_db(char *configfile)
{
	if (configfile == NULL)
		return (1);

	/* TODO: load the configuration and set up the database. */
	return (0);
}