Add test cases for Global Deduplication.
Update documentation and code comments. Unlink the tempfile pathname right after creation so the file is removed automatically when the process exits.
parent 75f62d6a36
commit aed69b2d53
7 changed files with 128 additions and 8 deletions

README.md | 11
@@ -40,6 +40,7 @@ Usage

 To compress a file:
    pcompress -c <algorithm> [-l <compress level>] [-s <chunk size>] <file> [-]

 Where <algorithm> can be the following:
    lzfx   - Very fast and small algorithm based on LZF.
    lz4    - Ultra fast, high-throughput algorithm reaching RAM B/W at level 1.
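A quick usage sketch of the command form documented above (the file name and level are illustrative, not part of this commit):

    # Compress file.tar with LZ4 at level 1; add a trailing "-" to stream the
    # compressed output to stdout instead, as the test scripts below do.
    pcompress -c lz4 -l1 file.tar
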
@@ -64,6 +65,7 @@ Usage
          LZMA. It has significantly more memory usage than adapt.
 none   - No compression. This is only meaningful with -D and -E so Dedupe
          can be done for post-processing with an external utility.

 <chunk_size> - This can be in bytes or can use the following suffixes:
          g - Gigabyte, m - Megabyte, k - Kilobyte.
          Larger chunks produce better compression at the cost of memory.
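A sketch of the options described above (file name illustrative, not part of this commit):

    # Deduplicate in 100MB chunks without compressing (-c none with -D -E),
    # leaving the output to be compressed later by an external utility.
    pcompress -c none -D -E -s 100m backup.img
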
@@ -206,6 +208,15 @@ allocator. Due to the way it rounds up an allocation request to the nearest
 slab the built-in allocator can allocate extra unused memory. In addition you
 may want to use a different allocator in your environment.

+The variable PCOMPRESS_INDEX_MEM can be set to limit memory used by the Global
+Deduplication Index. The number specified is in multiples of a megabyte.
+
+The variable PCOMPRESS_CACHE_DIR can point to a directory where some temporary
+files relating to the Global Deduplication process can be stored. This can, for
+example, be a directory on a Solid State Drive to speed up Global Deduplication.
+The space used in this directory is proportional to the size of the dataset being
+processed and is slightly more than 8KB for every 1MB of data.
+
 Examples
 ========

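A minimal sketch of how these variables might be used, assuming a hypothetical SSD-backed cache directory (values and paths are illustrative, not part of this commit):

    # Cap the Global Deduplication Index at about 2GB (value is in megabytes),
    # keep its temporary segment files on a fast drive, then compress with
    # Global Deduplication enabled (-G -D).
    export PCOMPRESS_INDEX_MEM=2048
    export PCOMPRESS_CACHE_DIR=/mnt/ssd/pcompress-cache
    pcompress -c lz4 -l6 -s 21m -G -D dataset.tar
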
@@ -223,15 +223,16 @@ init_global_db_s(char *path, char *tmppath, uint32_t chunksize, uint64_t user_ch
 	rv = setup_db_config_s(cfg, chunksize, &user_chunk_sz, &pct_interval, algo, ck, ck_sim,
 	    file_sz, &hash_slots, &hash_entry_size, &memreqd, memlimit, tmppath);

-	// Reduce hash_slots to remain within memlimit
+	/*
+	 * Reduce hash_slots to remain within memlimit
+	 */
 	while (memreqd > memlimit) {
 		hash_slots--;
 		memreqd = hash_slots * MEM_PER_UNIT(hash_entry_size);
 	}

 	/*
-	 * Now create as many hash tables as there are similarity match intervals
-	 * each having hash_slots / intervals slots.
+	 * Now initialize the hashtable[s] to setup the index.
 	 */
 	indx = calloc(1, sizeof (index_t));
 	if (!indx) {
@@ -260,6 +261,10 @@ init_global_db_s(char *path, char *tmppath, uint32_t chunksize, uint64_t user_ch
 		indx->memused += ((indx->hash_slots) * (sizeof (hash_entry_t *)));
 	}

+	/*
+	 * If Segmented Deduplication is required intervals will be set and a temporary
+	 * file is created to hold rabin block hash lists for each segment.
+	 */
 	if (pct_interval > 0) {
 		strcpy(cfg->rootdir, tmppath);
 		strcat(cfg->rootdir, "/.segXXXXXX");
@@ -276,6 +281,12 @@ init_global_db_s(char *path, char *tmppath, uint32_t chunksize, uint64_t user_ch
 			cfg->seg_fd_r[i].fd = open(cfg->rootdir, O_RDONLY);
 			cfg->seg_fd_r[i].mapping = NULL;
 		}
+
+		/*
+		 * Remove tempfile entry from the filesystem metadata so that file gets
+		 * automatically removed once process exits.
+		 */
+		unlink(cfg->rootdir);
 	}
 	cfg->segcache_pos = 0;
 	cfg->dbdata = indx;
@@ -401,6 +412,9 @@ db_segcache_unmap(archive_config_t *cfg, int tid)
 	return (0);
 }

+/*
+ * Compare hashes. Hash size must be multiple of 8 bytes.
+ */
 static inline int
 mycmp(uchar_t *a, uchar_t *b, int sz)
 {
@@ -453,7 +467,7 @@ db_lookup_insert_s(archive_config_t *cfg, uchar_t *sim_cksum, int interval,
 			pent = &(ent->next);
 			ent = ent->next;
 		}
-	} else if (cfg->similarity_cksum_sz == 8) {
+	} else if (cfg->similarity_cksum_sz == 8) {// Fast path for 64-bit keys
 		while (ent) {
 			if (*((uint64_t *)sim_cksum) == *((uint64_t *)ent->cksum) &&
 			    ent->item_offset != item_offset) {
@@ -474,6 +488,10 @@ db_lookup_insert_s(archive_config_t *cfg, uchar_t *sim_cksum, int interval,
 	}
 	if (do_insert) {
 		if (indx->memused + indx->hash_entry_size >= indx->memlimit && htab[htab_entry] != NULL) {
+			/*
+			 * If the index is close to full capacity, steal the oldest hash bucket
+			 * in this slot to hold the new data.
+			 */
 			ent = htab[htab_entry];
 			htab[htab_entry] = htab[htab_entry]->next;
 		} else {
@@ -502,7 +520,6 @@ destroy_global_db_s(archive_config_t *cfg)
 		}
 		free(cfg->seg_fd_r);
 		close(cfg->seg_fd_w);
-		unlink(cfg->rootdir);
 	}
 }

@@ -10,7 +10,8 @@ do
 for tf in `cat files.lst`
 do
 	rm -f ${tf}.*
-	for feat in "-D" "-D -B3 -L" "-D -B4 -E" "-D -B2 -EE" "-D -B5 -EE -L" "-D -B2 -r" "-P" "-D -P" "-D -L -P"
+	for feat in "-D" "-D -B3 -L" "-D -B4 -E" "-D -B2 -EE" "-D -B5 -EE -L" "-D -B2 -r" "-P" "-D -P" "-D -L -P" \
+		"-G -D" "-G -F" "-G -L -P" "-G -B2"
 	do
 		for seg in 2m 100m
 		do
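Expanded by hand, one of the newly added Global Deduplication combinations from this loop looks roughly like the following (algorithm, level, and file name are illustrative; the surrounding script supplies the real values):

    ../../pcompress -c lz4 -l3 -s 2m -G -D ./somefile
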
@@ -10,7 +10,7 @@ do
 for tf in `cat files.lst`
 do
 	rm -f ${tf}.*
-	for feat in "-e AES" "-e AES -L -S SHA256" "-D -e SALSA20 -S SHA512" "-D -EE -L -e SALSA20 -S BLAKE512" "-e AES -S CRC64" "-e SALSA20 -P" "-e AES -L -P -S KECCAK256" "-D -e SALSA20 -L -S KECCAK512" "-e AES -k16" "-e SALSA20 -k16"
+	for feat in "-e AES" "-e AES -L -S SHA256" "-D -e SALSA20 -S SHA512" "-D -EE -L -e SALSA20 -S BLAKE512" "-e AES -S CRC64" "-e SALSA20 -P" "-e AES -L -P -S KECCAK256" "-D -e SALSA20 -L -S KECCAK512" "-e AES -k16" "-e SALSA20 -k16" "-G -e AES -S SHA256" "-G -e SALSA20 -P"
 	do
 		for seg in 2m 100m
 		do

test/t6.tst | 38
@@ -46,6 +46,44 @@ do
 done
 done

+for algo in lz4 zlib
+do
+	for dopts in "" "-G -D" "-G -F" "-D"
+	do
+		for tf in `cat files.lst`
+		do
+			rm -f ${tf}.*
+			for seg in 1m 21m
+			do
+				cmd="../../pcompress -c ${algo} -l6 -s ${seg} ${dopts} ${tf} - > ${tf}.pz"
+				echo "Running $cmd"
+				eval $cmd
+				if [ $? -ne 0 ]
+				then
+					echo "FATAL: Compression errored."
+					rm -f ${tf}.pz
+					continue
+				fi
+				cmd="cat ${tf}.pz | ../../pcompress -d - ${tf}.1"
+				echo "Running $cmd"
+				eval $cmd
+				if [ $? -ne 0 ]
+				then
+					echo "FATAL: Decompression errored."
+					rm -f ${tf}.pz ${tf}.1
+					continue
+				fi
+				diff ${tf} ${tf}.1 > /dev/null
+				if [ $? -ne 0 ]
+				then
+					echo "FATAL: Decompression was not correct"
+				fi
+				rm -f ${tf}.pz ${tf}.1
+			done
+		done
+	done
+done
+
 echo "#################################################"
 echo ""


test/t7.tst | 53
@@ -61,6 +61,59 @@ do
 done
 done

+for algo in lzfx zlib
+do
+	for tf in `cat files.lst`
+	do
+		for feat in "-e SALSA20" "-e AES -L" "-D -e SALSA20" "-D -EE -L -e SALSA20 -S KECCAK256" "-G -e SALSA20" "-G -F -e AES"
+		do
+			for seg in 5m
+			do
+				echo "sillypassword" > /tmp/pwf
+				cmd="../../pcompress -c${algo} -l3 -s${seg} $feat -w /tmp/pwf ${tf} - > ${tf}.pz"
+				echo "Running $cmd"
+				eval $cmd
+				if [ $? -ne 0 ]
+				then
+					echo "FATAL: Compression errored."
+					rm -f ${tf}.pz ${tf}.1
+					continue
+				fi
+
+				pw=`cat /tmp/pwf`
+				if [ "$pw" = "sillypassword" ]
+				then
+					echo "FATAL: Password file /tmp/pwf not zeroed!"
+				fi
+
+				echo "sillypassword" > /tmp/pwf
+				cmd="cat ${tf}.pz | ../../pcompress -d -w /tmp/pwf - ${tf}.1"
+				echo "Running $cmd"
+				eval $cmd
+				if [ $? -ne 0 ]
+				then
+					echo "FATAL: Decompression errored."
+					rm -f ${tf}.pz ${tf}.1
+					continue
+				fi
+
+				diff ${tf} ${tf}.1 > /dev/null
+				if [ $? -ne 0 ]
+				then
+					echo "FATAL: Decompression was not correct"
+				fi
+
+				pw=`cat /tmp/pwf`
+				if [ "$pw" = "sillypassword" ]
+				then
+					echo "FATAL: Password file /tmp/pwf not zeroed!"
+				fi
+				rm -f ${tf}.pz ${tf}.1
+			done
+		done
+	done
+done
+
 rm -f /tmp/pwf

 echo "#################################################"

@@ -12,7 +12,7 @@ for algo in lzfx lz4 adapt adapt2
 do
 for tf in `cat files.lst`
 do
-	for feat in "-F" "-F -B3 -L" "-F -B4" "-F -B5 -L" "-F -P" "-F -L -P"
+	for feat in "-F" "-F -B3 -L" "-F -B4" "-F -B5 -L" "-F -P" "-F -L -P" "-G -F -B3 -L"
 	do
 		for seg in 2m 100m
 		do