Add test cases for Global Deduplication.

Update documentation and code comments.
Unlink the tempfile pathname right after creation so the file is removed automatically when the process exits.
Moinak Ghosh 2013-04-26 18:32:00 +05:30
parent 75f62d6a36
commit aed69b2d53
7 changed files with 128 additions and 8 deletions

View file

@@ -40,6 +40,7 @@ Usage
 To compress a file:
    pcompress -c <algorithm> [-l <compress level>] [-s <chunk size>] <file> [-]
    Where <algorithm> can be the following:
    lzfx - Very fast and small algorithm based on LZF.
    lz4  - Ultra fast, high-throughput algorithm reaching RAM B/W at level1.
@@ -64,6 +65,7 @@ Usage
           LZMA. It has significantly more memory usage than adapt.
    none - No compression. This is only meaningful with -D and -E so Dedupe
           can be done for post-processing with an external utility.
    <chunk_size> - This can be in bytes or can use the following suffixes:
           g - Gigabyte, m - Megabyte, k - Kilobyte.
           Larger chunks produce better compression at the cost of memory.
@@ -206,6 +208,15 @@ allocator. Due to the way it rounds up an allocation request to the nearest
 slab the built-in allocator can allocate extra unused memory. In addition you
 may want to use a different allocator in your environment.
 
+The variable PCOMPRESS_INDEX_MEM can be set to limit the memory used by the
+Global Deduplication Index. The number specified is in multiples of a megabyte.
+
+The variable PCOMPRESS_CACHE_DIR can point to a directory where temporary files
+relating to the Global Deduplication process can be stored. This can, for
+example, be a directory on a Solid State Drive to speed up Global Deduplication.
+The space used in this directory is proportional to the size of the dataset
+being processed and is slightly more than 8KB for every 1MB of data.
+
 Examples
 ========
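
For concreteness, here is a minimal sketch (not the actual pcompress option-parsing code) of how a consumer could honor these two variables; the megabyte scaling follows the documentation above:

#include <stdio.h>
#include <stdlib.h>

int
main(void)
{
	char *val;
	unsigned long long index_mem_bytes = 0;	/* 0 = use built-in default */
	const char *cache_dir = "/tmp";		/* fallback dir is an assumption */

	/* PCOMPRESS_INDEX_MEM is documented as multiples of a megabyte. */
	if ((val = getenv("PCOMPRESS_INDEX_MEM")) != NULL)
		index_mem_bytes = strtoull(val, NULL, 10) * 1024 * 1024;

	/* PCOMPRESS_CACHE_DIR points at a scratch dir, ideally fast storage. */
	if ((val = getenv("PCOMPRESS_CACHE_DIR")) != NULL)
		cache_dir = val;

	printf("index limit: %llu bytes, cache dir: %s\n",
	    index_mem_bytes, cache_dir);
	return (0);
}

Setting PCOMPRESS_INDEX_MEM=2048 under this scheme would cap the index at 2GB.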

View file

@@ -223,15 +223,16 @@ init_global_db_s(char *path, char *tmppath, uint32_t chunksize, uint64_t user_ch
 	rv = setup_db_config_s(cfg, chunksize, &user_chunk_sz, &pct_interval, algo, ck, ck_sim,
 			file_sz, &hash_slots, &hash_entry_size, &memreqd, memlimit, tmppath);
 
-	// Reduce hash_slots to remain within memlimit
+	/*
+	 * Reduce hash_slots to remain within memlimit
+	 */
 	while (memreqd > memlimit) {
 		hash_slots--;
 		memreqd = hash_slots * MEM_PER_UNIT(hash_entry_size);
 	}
 
 	/*
-	 * Now create as many hash tables as there are similarity match intervals
-	 * each having hash_slots / intervals slots.
+	 * Now initialize the hashtable[s] to setup the index.
 	 */
 	indx = calloc(1, sizeof (index_t));
 	if (!indx) {
@@ -260,6 +261,10 @@ init_global_db_s(char *path, char *tmppath, uint32_t chunksize, uint64_t user_ch
 		indx->memused += ((indx->hash_slots) * (sizeof (hash_entry_t *)));
 	}
 
+	/*
+	 * If Segmented Deduplication is required, intervals will be set and a
+	 * temporary file is created to hold rabin block hash lists for each segment.
+	 */
 	if (pct_interval > 0) {
 		strcpy(cfg->rootdir, tmppath);
 		strcat(cfg->rootdir, "/.segXXXXXX");
@@ -276,6 +281,12 @@ init_global_db_s(char *path, char *tmppath, uint32_t chunksize, uint64_t user_ch
 			cfg->seg_fd_r[i].fd = open(cfg->rootdir, O_RDONLY);
 			cfg->seg_fd_r[i].mapping = NULL;
 		}
+
+		/*
+		 * Remove the tempfile entry from the filesystem metadata so that
+		 * the file gets automatically removed once the process exits.
+		 */
+		unlink(cfg->rootdir);
 	}
 	cfg->segcache_pos = 0;
 	cfg->dbdata = indx;
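
The unlink() added above is the classic anonymous-tempfile idiom: once every descriptor that needs the file is open by name (including the per-thread O_RDONLY descriptors in the loop), the name can be dropped and the kernel reclaims the storage when the last descriptor closes. A self-contained sketch of the idiom, with hypothetical names:

#include <stdlib.h>
#include <unistd.h>

int
open_anon_tempfile(char *templ)	/* writable buffer, e.g. "/dir/.segXXXXXX" */
{
	int fd = mkstemp(templ);

	if (fd == -1)
		return (-1);
	/*
	 * Drop the directory entry immediately. The open descriptor keeps the
	 * inode alive, and no cleanup is needed even if the process is killed
	 * before its normal teardown path runs.
	 */
	unlink(templ);
	return (fd);
}

Note that any additional opens by pathname must happen before the unlink, which is why the diff places it after the seg_fd_r loop.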
@@ -401,6 +412,9 @@ db_segcache_unmap(archive_config_t *cfg, int tid)
 	return (0);
 }
 
+/*
+ * Compare hashes. Hash size must be a multiple of 8 bytes.
+ */
 static inline int
 mycmp(uchar_t *a, uchar_t *b, int sz)
 {
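
The new comment pins down mycmp()'s contract; its body is not part of this hunk. A word-wise comparator consistent with that contract could look like the following sketch (hypothetical name, and it assumes the buffers are 8-byte aligned):

#include <stdint.h>

static inline int
hash_cmp64(const uint8_t *a, const uint8_t *b, int sz)
{
	const uint64_t *wa = (const uint64_t *)a;
	const uint64_t *wb = (const uint64_t *)b;
	int words = sz / 8;	/* valid because sz is a multiple of 8 */

	while (words-- > 0) {
		if (*wa != *wb)
			return (1);	/* only equality matters to the caller */
		wa++;
		wb++;
	}
	return (0);
}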
@@ -453,7 +467,7 @@ db_lookup_insert_s(archive_config_t *cfg, uchar_t *sim_cksum, int interval,
 			pent = &(ent->next);
 			ent = ent->next;
 		}
-	} else if (cfg->similarity_cksum_sz == 8) {
+	} else if (cfg->similarity_cksum_sz == 8) {	// Fast path for 64-bit keys
 		while (ent) {
 			if (*((uint64_t *)sim_cksum) == *((uint64_t *)ent->cksum) &&
 			    ent->item_offset != item_offset) {
@@ -474,6 +488,10 @@ db_lookup_insert_s(archive_config_t *cfg, uchar_t *sim_cksum, int interval,
 	}
 	if (do_insert) {
 		if (indx->memused + indx->hash_entry_size >= indx->memlimit && htab[htab_entry] != NULL) {
+			/*
+			 * If the index is close to full capacity, steal the oldest
+			 * hash bucket in this slot to hold the new data.
+			 */
 			ent = htab[htab_entry];
 			htab[htab_entry] = htab[htab_entry]->next;
 		} else {
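
A compact restatement of the eviction logic above, as a sketch with hypothetical names and a simplified entry type: while under the memory budget a fresh entry is allocated; at the limit, the head of the destination slot's chain (its oldest entry) is recycled instead.

#include <stdint.h>
#include <stdlib.h>

/* Simplified stand-in for hash_entry_t; the real entry is variable-sized. */
typedef struct hash_entry {
	uint64_t item_offset;
	struct hash_entry *next;
	/* checksum bytes follow in the real entry */
} hash_entry_t;

static hash_entry_t *
get_entry(hash_entry_t **slot, uint64_t *memused, uint64_t entry_size,
    uint64_t memlimit)
{
	hash_entry_t *ent;

	if (*memused + entry_size >= memlimit && *slot != NULL) {
		ent = *slot;		/* steal the oldest entry in this slot */
		*slot = ent->next;	/* unlink it from the chain */
	} else {
		/* entry_size covering header plus cksum bytes is an assumption */
		ent = malloc(entry_size);
		if (ent != NULL)
			*memused += entry_size;
	}
	return (ent);
}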
@@ -502,7 +520,6 @@ destroy_global_db_s(archive_config_t *cfg)
 		}
 		free(cfg->seg_fd_r);
 		close(cfg->seg_fd_w);
-		unlink(cfg->rootdir);
 	}
 }

View file

@@ -10,7 +10,8 @@ do
 for tf in `cat files.lst`
 do
 	rm -f ${tf}.*
-	for feat in "-D" "-D -B3 -L" "-D -B4 -E" "-D -B2 -EE" "-D -B5 -EE -L" "-D -B2 -r" "-P" "-D -P" "-D -L -P"
+	for feat in "-D" "-D -B3 -L" "-D -B4 -E" "-D -B2 -EE" "-D -B5 -EE -L" "-D -B2 -r" "-P" "-D -P" "-D -L -P" \
+		"-G -D" "-G -F" "-G -L -P" "-G -B2"
 	do
 		for seg in 2m 100m
 		do

View file

@@ -10,7 +10,7 @@ do
 for tf in `cat files.lst`
 do
 	rm -f ${tf}.*
-	for feat in "-e AES" "-e AES -L -S SHA256" "-D -e SALSA20 -S SHA512" "-D -EE -L -e SALSA20 -S BLAKE512" "-e AES -S CRC64" "-e SALSA20 -P" "-e AES -L -P -S KECCAK256" "-D -e SALSA20 -L -S KECCAK512" "-e AES -k16" "-e SALSA20 -k16"
+	for feat in "-e AES" "-e AES -L -S SHA256" "-D -e SALSA20 -S SHA512" "-D -EE -L -e SALSA20 -S BLAKE512" "-e AES -S CRC64" "-e SALSA20 -P" "-e AES -L -P -S KECCAK256" "-D -e SALSA20 -L -S KECCAK512" "-e AES -k16" "-e SALSA20 -k16" "-G -e AES -S SHA256" "-G -e SALSA20 -P"
 	do
 		for seg in 2m 100m
 		do

View file

@@ -46,6 +46,44 @@ do
 	done
 done
 
+for algo in lz4 zlib
+do
+	for dopts in "" "-G -D" "-G -F" "-D"
+	do
+		for tf in `cat files.lst`
+		do
+			rm -f ${tf}.*
+			for seg in 1m 21m
+			do
+				cmd="../../pcompress -c ${algo} -l6 -s ${seg} ${dopts} ${tf} - > ${tf}.pz"
+				echo "Running $cmd"
+				eval $cmd
+				if [ $? -ne 0 ]
+				then
+					echo "FATAL: Compression errored."
+					rm -f ${tf}.pz
+					continue
+				fi
+				cmd="cat ${tf}.pz | ../../pcompress -d - ${tf}.1"
+				echo "Running $cmd"
+				eval $cmd
+				if [ $? -ne 0 ]
+				then
+					echo "FATAL: Decompression errored."
+					rm -f ${tf}.pz ${tf}.1
+					continue
+				fi
+				diff ${tf} ${tf}.1 > /dev/null
+				if [ $? -ne 0 ]
+				then
+					echo "FATAL: Decompression was not correct"
+				fi
+				rm -f ${tf}.pz ${tf}.1
+			done
+		done
+	done
+done
+
 echo "#################################################"
 echo ""

View file

@@ -61,6 +61,59 @@ do
 	done
 done
 
+for algo in lzfx zlib
+do
+	for tf in `cat files.lst`
+	do
+		for feat in "-e SALSA20" "-e AES -L" "-D -e SALSA20" "-D -EE -L -e SALSA20 -S KECCAK256" "-G -e SALSA20" "-G -F -e AES"
+		do
+			for seg in 5m
+			do
+				echo "sillypassword" > /tmp/pwf
+				cmd="../../pcompress -c${algo} -l3 -s${seg} $feat -w /tmp/pwf ${tf} - > ${tf}.pz"
+				echo "Running $cmd"
+				eval $cmd
+				if [ $? -ne 0 ]
+				then
+					echo "FATAL: Compression errored."
+					rm -f ${tf}.pz ${tf}.1
+					continue
+				fi
+				pw=`cat /tmp/pwf`
+				if [ "$pw" = "sillypassword" ]
+				then
+					echo "FATAL: Password file /tmp/pwf not zeroed!"
+				fi
+				echo "sillypassword" > /tmp/pwf
+				cmd="cat ${tf}.pz | ../../pcompress -d -w /tmp/pwf - ${tf}.1"
+				echo "Running $cmd"
+				eval $cmd
+				if [ $? -ne 0 ]
+				then
+					echo "FATAL: Decompression errored."
+					rm -f ${tf}.pz ${tf}.1
+					continue
+				fi
+				diff ${tf} ${tf}.1 > /dev/null
+				if [ $? -ne 0 ]
+				then
+					echo "FATAL: Decompression was not correct"
+				fi
+				pw=`cat /tmp/pwf`
+				if [ "$pw" = "sillypassword" ]
+				then
+					echo "FATAL: Password file /tmp/pwf not zeroed!"
+				fi
+				rm -f ${tf}.pz ${tf}.1
+			done
+		done
+	done
+done
+
 rm -f /tmp/pwf
 echo "#################################################"

View file

@@ -12,7 +12,7 @@ for algo in lzfx lz4 adapt adapt2
 do
 	for tf in `cat files.lst`
 	do
-		for feat in "-F" "-F -B3 -L" "-F -B4" "-F -B5 -L" "-F -P" "-F -L -P"
+		for feat in "-F" "-F -B3 -L" "-F -B4" "-F -B5 -L" "-F -P" "-F -L -P" "-G -F -B3 -L"
 		do
 			for seg in 2m 100m
 			do