Add test cases for Global Deduplication.
Update documentation and code comments. Unlink the tempfile pathname right after creation so it is cleaned up automatically when the process exits.
parent 75f62d6a36
commit aed69b2d53
7 changed files with 128 additions and 8 deletions
README.md (11 lines changed)

@@ -40,6 +40,7 @@ Usage
To compress a file:
   pcompress -c <algorithm> [-l <compress level>] [-s <chunk size>] <file> [-]

   Where <algorithm> can be the following:
   lzfx - Very fast and small algorithm based on LZF.
   lz4  - Ultra fast, high-throughput algorithm reaching RAM B/W at level1.

@@ -64,6 +65,7 @@ Usage
          LZMA. It has significantly more memory usage than adapt.
   none - No compression. This is only meaningful with -D and -E so Dedupe
          can be done for post-processing with an external utility.

   <chunk_size> - This can be in bytes or can use the following suffixes:
          g - Gigabyte, m - Megabyte, k - Kilobyte.
          Larger chunks produce better compression at the cost of memory.

@@ -206,6 +208,15 @@ allocator. Due to the way it rounds up an allocation request to the nearest
slab the built-in allocator can allocate extra unused memory. In addition you
may want to use a different allocator in your environment.

The variable PCOMPRESS_INDEX_MEM can be set to limit memory used by the Global
Deduplication Index. The number specified is in multiples of a megabyte.

The variable PCOMPRESS_CACHE_DIR can point to a directory where some temporary
files relating to the Global Deduplication process can be stored. This can, for
example, be a directory on a Solid State Drive to speed up Global Deduplication.
The space used in this directory is proportional to the size of the dataset being
processed and is slightly more than 8KB for every 1MB of data.

Examples
========
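As a usage sketch for the two environment variables documented above: the 2048 MB limit, the /mnt/ssd/scratch path and the file name datafile are illustrative placeholders, and the command form mirrors the invocations used in the test scripts later in this commit.

    # Cap the Global Deduplication Index at roughly 2 GB (value is in megabytes).
    export PCOMPRESS_INDEX_MEM=2048
    # Keep temporary Global Dedupe segment files on an SSD-backed scratch area.
    export PCOMPRESS_CACHE_DIR=/mnt/ssd/scratch

    # Compress with Global Deduplication (-G -D), lz4 level 6, 100MB chunks,
    # writing the archive to stdout.
    pcompress -c lz4 -l6 -s 100m -G -D datafile - > datafile.pz

    # Decompress from stdin and verify the round trip.
    cat datafile.pz | pcompress -d - datafile.1
    diff datafile datafile.1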
@@ -223,15 +223,16 @@ init_global_db_s(char *path, char *tmppath, uint32_t chunksize, uint64_t user_ch
    rv = setup_db_config_s(cfg, chunksize, &user_chunk_sz, &pct_interval, algo, ck, ck_sim,
        file_sz, &hash_slots, &hash_entry_size, &memreqd, memlimit, tmppath);

-   // Reduce hash_slots to remain within memlimit
+   /*
+    * Reduce hash_slots to remain within memlimit
+    */
    while (memreqd > memlimit) {
        hash_slots--;
        memreqd = hash_slots * MEM_PER_UNIT(hash_entry_size);
    }

    /*
     * Now create as many hash tables as there are similarity match intervals
     * each having hash_slots / intervals slots.
     * Now initialize the hashtable[s] to setup the index.
     */
    indx = calloc(1, sizeof (index_t));
    if (!indx) {

@@ -260,6 +261,10 @@ init_global_db_s(char *path, char *tmppath, uint32_t chunksize, uint64_t user_ch
        indx->memused += ((indx->hash_slots) * (sizeof (hash_entry_t *)));
    }

    /*
     * If Segmented Deduplication is required intervals will be set and a temporary
     * file is created to hold rabin block hash lists for each segment.
     */
    if (pct_interval > 0) {
        strcpy(cfg->rootdir, tmppath);
        strcat(cfg->rootdir, "/.segXXXXXX");

@@ -276,6 +281,12 @@ init_global_db_s(char *path, char *tmppath, uint32_t chunksize, uint64_t user_ch
            cfg->seg_fd_r[i].fd = open(cfg->rootdir, O_RDONLY);
            cfg->seg_fd_r[i].mapping = NULL;
        }

        /*
         * Remove tempfile entry from the filesystem metadata so that file gets
         * automatically removed once process exits.
         */
        unlink(cfg->rootdir);
    }
    cfg->segcache_pos = 0;
    cfg->dbdata = indx;

@@ -401,6 +412,9 @@ db_segcache_unmap(archive_config_t *cfg, int tid)
    return (0);
}

/*
 * Compare hashes. Hash size must be multiple of 8 bytes.
 */
static inline int
mycmp(uchar_t *a, uchar_t *b, int sz)
{

@@ -453,7 +467,7 @@ db_lookup_insert_s(archive_config_t *cfg, uchar_t *sim_cksum, int interval,
            pent = &(ent->next);
            ent = ent->next;
        }
-   } else if (cfg->similarity_cksum_sz == 8) {
+   } else if (cfg->similarity_cksum_sz == 8) {// Fast path for 64-bit keys
        while (ent) {
            if (*((uint64_t *)sim_cksum) == *((uint64_t *)ent->cksum) &&
                ent->item_offset != item_offset) {

@@ -474,6 +488,10 @@ db_lookup_insert_s(archive_config_t *cfg, uchar_t *sim_cksum, int interval,
    }
    if (do_insert) {
        if (indx->memused + indx->hash_entry_size >= indx->memlimit && htab[htab_entry] != NULL) {
            /*
             * If the index is close to full capacity, steal the oldest hash bucket
             * in this slot to hold the new data.
             */
            ent = htab[htab_entry];
            htab[htab_entry] = htab[htab_entry]->next;
        } else {

@@ -502,7 +520,6 @@ destroy_global_db_s(archive_config_t *cfg)
        }
        free(cfg->seg_fd_r);
        close(cfg->seg_fd_w);
-       unlink(cfg->rootdir);
    }
}
@@ -10,7 +10,8 @@ do
    for tf in `cat files.lst`
    do
        rm -f ${tf}.*
-       for feat in "-D" "-D -B3 -L" "-D -B4 -E" "-D -B2 -EE" "-D -B5 -EE -L" "-D -B2 -r" "-P" "-D -P" "-D -L -P"
+       for feat in "-D" "-D -B3 -L" "-D -B4 -E" "-D -B2 -EE" "-D -B5 -EE -L" "-D -B2 -r" "-P" "-D -P" "-D -L -P" \
+           "-G -D" "-G -F" "-G -L -P" "-G -B2"
        do
            for seg in 2m 100m
            do
@@ -10,7 +10,7 @@ do
    for tf in `cat files.lst`
    do
        rm -f ${tf}.*
-       for feat in "-e AES" "-e AES -L -S SHA256" "-D -e SALSA20 -S SHA512" "-D -EE -L -e SALSA20 -S BLAKE512" "-e AES -S CRC64" "-e SALSA20 -P" "-e AES -L -P -S KECCAK256" "-D -e SALSA20 -L -S KECCAK512" "-e AES -k16" "-e SALSA20 -k16"
+       for feat in "-e AES" "-e AES -L -S SHA256" "-D -e SALSA20 -S SHA512" "-D -EE -L -e SALSA20 -S BLAKE512" "-e AES -S CRC64" "-e SALSA20 -P" "-e AES -L -P -S KECCAK256" "-D -e SALSA20 -L -S KECCAK512" "-e AES -k16" "-e SALSA20 -k16" "-G -e AES -S SHA256" "-G -e SALSA20 -P"
        do
            for seg in 2m 100m
            do
test/t6.tst (38 lines changed)

@@ -46,6 +46,44 @@ do
    done
done

for algo in lz4 zlib
do
    for dopts in "" "-G -D" "-G -F" "-D"
    do
        for tf in `cat files.lst`
        do
            rm -f ${tf}.*
            for seg in 1m 21m
            do
                cmd="../../pcompress -c ${algo} -l6 -s ${seg} ${dopts} ${tf} - > ${tf}.pz"
                echo "Running $cmd"
                eval $cmd
                if [ $? -ne 0 ]
                then
                    echo "FATAL: Compression errored."
                    rm -f ${tf}.pz
                    continue
                fi
                cmd="cat ${tf}.pz | ../../pcompress -d - ${tf}.1"
                echo "Running $cmd"
                eval $cmd
                if [ $? -ne 0 ]
                then
                    echo "FATAL: Decompression errored."
                    rm -f ${tf}.pz ${tf}.1
                    continue
                fi
                diff ${tf} ${tf}.1 > /dev/null
                if [ $? -ne 0 ]
                then
                    echo "FATAL: Decompression was not correct"
                fi
                rm -f ${tf}.pz ${tf}.1
            done
        done
    done
done

echo "#################################################"
echo ""
test/t7.tst (53 lines changed)

@@ -61,6 +61,59 @@ do
    done
done

for algo in lzfx zlib
do
    for tf in `cat files.lst`
    do
        for feat in "-e SALSA20" "-e AES -L" "-D -e SALSA20" "-D -EE -L -e SALSA20 -S KECCAK256" "-G -e SALSA20" "-G -F -e AES"
        do
            for seg in 5m
            do
                echo "sillypassword" > /tmp/pwf
                cmd="../../pcompress -c${algo} -l3 -s${seg} $feat -w /tmp/pwf ${tf} - > ${tf}.pz"
                echo "Running $cmd"
                eval $cmd
                if [ $? -ne 0 ]
                then
                    echo "FATAL: Compression errored."
                    rm -f ${tf}.pz ${tf}.1
                    continue
                fi

                pw=`cat /tmp/pwf`
                if [ "$pw" = "sillypassword" ]
                then
                    echo "FATAL: Password file /tmp/pwf not zeroed!"
                fi

                echo "sillypassword" > /tmp/pwf
                cmd="cat ${tf}.pz | ../../pcompress -d -w /tmp/pwf - ${tf}.1"
                echo "Running $cmd"
                eval $cmd
                if [ $? -ne 0 ]
                then
                    echo "FATAL: Decompression errored."
                    rm -f ${tf}.pz ${tf}.1
                    continue
                fi

                diff ${tf} ${tf}.1 > /dev/null
                if [ $? -ne 0 ]
                then
                    echo "FATAL: Decompression was not correct"
                fi

                pw=`cat /tmp/pwf`
                if [ "$pw" = "sillypassword" ]
                then
                    echo "FATAL: Password file /tmp/pwf not zeroed!"
                fi
                rm -f ${tf}.pz ${tf}.1
            done
        done
    done
done

rm -f /tmp/pwf

echo "#################################################"
@@ -12,7 +12,7 @@ for algo in lzfx lz4 adapt adapt2
do
    for tf in `cat files.lst`
    do
-       for feat in "-F" "-F -B3 -L" "-F -B4" "-F -B5 -L" "-F -P" "-F -L -P"
+       for feat in "-F" "-F -B3 -L" "-F -B4" "-F -B5 -L" "-F -P" "-F -L -P" "-G -F -B3 -L"
        do
            for seg in 2m 100m
            do