Add test cases for Global Deduplication.

Update documentation and code comments.
Unlink the tempfile pathname right after creation so the file is removed automatically when the process exits.
Moinak Ghosh 2013-04-26 18:32:00 +05:30
parent 75f62d6a36
commit aed69b2d53
7 changed files with 128 additions and 8 deletions

View file

@@ -40,6 +40,7 @@ Usage
 To compress a file:
    pcompress -c <algorithm> [-l <compress level>] [-s <chunk size>] <file> [-]
    Where <algorithm> can be the following:
    lzfx - Very fast and small algorithm based on LZF.
    lz4  - Ultra fast, high-throughput algorithm reaching RAM B/W at level1.
@@ -64,6 +65,7 @@ Usage
           LZMA. It has significantly more memory usage than adapt.
    none - No compression. This is only meaningful with -D and -E so Dedupe
           can be done for post-processing with an external utility.
    <chunk_size> - This can be in bytes or can use the following suffixes:
           g - Gigabyte, m - Megabyte, k - Kilobyte.
           Larger chunks produce better compression at the cost of memory.
@@ -206,6 +208,15 @@ allocator. Due to the way it rounds up an allocation request to the nearest
 slab the built-in allocator can allocate extra unused memory. In addition you
 may want to use a different allocator in your environment.
 
+The variable PCOMPRESS_INDEX_MEM can be set to limit the memory used by the
+Global Deduplication Index. The number specified is in multiples of a megabyte.
+
+The variable PCOMPRESS_CACHE_DIR can point to a directory where temporary files
+relating to the Global Deduplication process can be stored. This can, for
+example, be a directory on a Solid State Drive to speed up Global Deduplication.
+The space used in this directory is proportional to the size of the dataset
+being processed and is slightly more than 8KB for every 1MB of data.
+
 Examples
 ========
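
For concreteness, here is a minimal sketch (not the actual pcompress option-parsing code) of how a consumer could honor these two variables; the megabyte scaling follows the documentation above:

#include <stdio.h>
#include <stdlib.h>

int
main(void)
{
	char *val;
	unsigned long long index_mem_bytes = 0;	/* 0 = use built-in default */
	const char *cache_dir = "/tmp";		/* fallback dir is an assumption */

	/* PCOMPRESS_INDEX_MEM is documented as multiples of a megabyte. */
	if ((val = getenv("PCOMPRESS_INDEX_MEM")) != NULL)
		index_mem_bytes = strtoull(val, NULL, 10) * 1024 * 1024;

	/* PCOMPRESS_CACHE_DIR points at a scratch dir, ideally fast storage. */
	if ((val = getenv("PCOMPRESS_CACHE_DIR")) != NULL)
		cache_dir = val;

	printf("index limit: %llu bytes, cache dir: %s\n",
	    index_mem_bytes, cache_dir);
	return (0);
}

Setting PCOMPRESS_INDEX_MEM=2048 under this scheme would cap the index at 2GB.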

View file

@@ -223,15 +223,16 @@ init_global_db_s(char *path, char *tmppath, uint32_t chunksize, uint64_t user_ch
 	rv = setup_db_config_s(cfg, chunksize, &user_chunk_sz, &pct_interval, algo, ck, ck_sim,
 			file_sz, &hash_slots, &hash_entry_size, &memreqd, memlimit, tmppath);
 
-	// Reduce hash_slots to remain within memlimit
+	/*
+	 * Reduce hash_slots to remain within memlimit
+	 */
 	while (memreqd > memlimit) {
 		hash_slots--;
 		memreqd = hash_slots * MEM_PER_UNIT(hash_entry_size);
 	}
 
 	/*
-	 * Now create as many hash tables as there are similarity match intervals
-	 * each having hash_slots / intervals slots.
+	 * Now initialize the hashtable[s] to setup the index.
 	 */
 	indx = calloc(1, sizeof (index_t));
 	if (!indx) {
@@ -260,6 +261,10 @@ init_global_db_s(char *path, char *tmppath, uint32_t chunksize, uint64_t user_ch
 		indx->memused += ((indx->hash_slots) * (sizeof (hash_entry_t *)));
 	}
 
+	/*
+	 * If Segmented Deduplication is required, intervals will be set and a
+	 * temporary file is created to hold rabin block hash lists for each segment.
+	 */
 	if (pct_interval > 0) {
 		strcpy(cfg->rootdir, tmppath);
 		strcat(cfg->rootdir, "/.segXXXXXX");
@@ -276,6 +281,12 @@ init_global_db_s(char *path, char *tmppath, uint32_t chunksize, uint64_t user_ch
 			cfg->seg_fd_r[i].fd = open(cfg->rootdir, O_RDONLY);
 			cfg->seg_fd_r[i].mapping = NULL;
 		}
+
+		/*
+		 * Remove the tempfile entry from the filesystem metadata so that
+		 * the file gets automatically removed once the process exits.
+		 */
+		unlink(cfg->rootdir);
 	}
 	cfg->segcache_pos = 0;
 	cfg->dbdata = indx;
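
The unlink() added above is the classic anonymous-tempfile idiom: once every descriptor that needs the file is open by name (including the per-thread O_RDONLY descriptors in the loop), the name can be dropped and the kernel reclaims the storage when the last descriptor closes. A self-contained sketch of the idiom, with hypothetical names:

#include <stdlib.h>
#include <unistd.h>

int
open_anon_tempfile(char *templ)	/* writable buffer, e.g. "/dir/.segXXXXXX" */
{
	int fd = mkstemp(templ);

	if (fd == -1)
		return (-1);
	/*
	 * Drop the directory entry immediately. The open descriptor keeps the
	 * inode alive, and no cleanup is needed even if the process is killed
	 * before its normal teardown path runs.
	 */
	unlink(templ);
	return (fd);
}

Note that any additional opens by pathname must happen before the unlink, which is why the diff places it after the seg_fd_r loop.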
@@ -401,6 +412,9 @@ db_segcache_unmap(archive_config_t *cfg, int tid)
 	return (0);
 }
 
+/*
+ * Compare hashes. Hash size must be a multiple of 8 bytes.
+ */
 static inline int
 mycmp(uchar_t *a, uchar_t *b, int sz)
 {
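
The new comment pins down mycmp()'s contract; its body is not part of this hunk. A word-wise comparator consistent with that contract could look like the following sketch (hypothetical name, and it assumes the buffers are 8-byte aligned):

#include <stdint.h>

static inline int
hash_cmp64(const uint8_t *a, const uint8_t *b, int sz)
{
	const uint64_t *wa = (const uint64_t *)a;
	const uint64_t *wb = (const uint64_t *)b;
	int words = sz / 8;	/* valid because sz is a multiple of 8 */

	while (words-- > 0) {
		if (*wa != *wb)
			return (1);	/* only equality matters to the caller */
		wa++;
		wb++;
	}
	return (0);
}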
@@ -453,7 +467,7 @@ db_lookup_insert_s(archive_config_t *cfg, uchar_t *sim_cksum, int interval,
 			pent = &(ent->next);
 			ent = ent->next;
 		}
-	} else if (cfg->similarity_cksum_sz == 8) {
+	} else if (cfg->similarity_cksum_sz == 8) {	// Fast path for 64-bit keys
 		while (ent) {
 			if (*((uint64_t *)sim_cksum) == *((uint64_t *)ent->cksum) &&
 			    ent->item_offset != item_offset) {
@@ -474,6 +488,10 @@ db_lookup_insert_s(archive_config_t *cfg, uchar_t *sim_cksum, int interval,
 	}
 	if (do_insert) {
 		if (indx->memused + indx->hash_entry_size >= indx->memlimit && htab[htab_entry] != NULL) {
+			/*
+			 * If the index is close to full capacity, steal the oldest
+			 * hash bucket in this slot to hold the new data.
+			 */
 			ent = htab[htab_entry];
 			htab[htab_entry] = htab[htab_entry]->next;
 		} else {
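
A compact restatement of the eviction logic above, as a sketch with hypothetical names and a simplified entry type: while under the memory budget a fresh entry is allocated; at the limit, the head of the destination slot's chain (its oldest entry) is recycled instead.

#include <stdint.h>
#include <stdlib.h>

/* Simplified stand-in for hash_entry_t; the real entry is variable-sized. */
typedef struct hash_entry {
	uint64_t item_offset;
	struct hash_entry *next;
	/* checksum bytes follow in the real entry */
} hash_entry_t;

static hash_entry_t *
get_entry(hash_entry_t **slot, uint64_t *memused, uint64_t entry_size,
    uint64_t memlimit)
{
	hash_entry_t *ent;

	if (*memused + entry_size >= memlimit && *slot != NULL) {
		ent = *slot;		/* steal the oldest entry in this slot */
		*slot = ent->next;	/* unlink it from the chain */
	} else {
		/* entry_size covering header plus cksum bytes is an assumption */
		ent = malloc(entry_size);
		if (ent != NULL)
			*memused += entry_size;
	}
	return (ent);
}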
@@ -502,7 +520,6 @@ destroy_global_db_s(archive_config_t *cfg)
 		}
 		free(cfg->seg_fd_r);
 		close(cfg->seg_fd_w);
-		unlink(cfg->rootdir);
 	}
 }

View file

@@ -10,7 +10,8 @@ do
 for tf in `cat files.lst`
 do
 	rm -f ${tf}.*
-	for feat in "-D" "-D -B3 -L" "-D -B4 -E" "-D -B2 -EE" "-D -B5 -EE -L" "-D -B2 -r" "-P" "-D -P" "-D -L -P"
+	for feat in "-D" "-D -B3 -L" "-D -B4 -E" "-D -B2 -EE" "-D -B5 -EE -L" "-D -B2 -r" "-P" "-D -P" "-D -L -P" \
+		"-G -D" "-G -F" "-G -L -P" "-G -B2"
 	do
 		for seg in 2m 100m
 		do

View file

@@ -10,7 +10,7 @@ do
 for tf in `cat files.lst`
 do
 	rm -f ${tf}.*
-	for feat in "-e AES" "-e AES -L -S SHA256" "-D -e SALSA20 -S SHA512" "-D -EE -L -e SALSA20 -S BLAKE512" "-e AES -S CRC64" "-e SALSA20 -P" "-e AES -L -P -S KECCAK256" "-D -e SALSA20 -L -S KECCAK512" "-e AES -k16" "-e SALSA20 -k16"
+	for feat in "-e AES" "-e AES -L -S SHA256" "-D -e SALSA20 -S SHA512" "-D -EE -L -e SALSA20 -S BLAKE512" "-e AES -S CRC64" "-e SALSA20 -P" "-e AES -L -P -S KECCAK256" "-D -e SALSA20 -L -S KECCAK512" "-e AES -k16" "-e SALSA20 -k16" "-G -e AES -S SHA256" "-G -e SALSA20 -P"
 	do
 		for seg in 2m 100m
 		do

View file

@@ -46,6 +46,44 @@ do
 	done
 done
 
+for algo in lz4 zlib
+do
+	for dopts in "" "-G -D" "-G -F" "-D"
+	do
+		for tf in `cat files.lst`
+		do
+			rm -f ${tf}.*
+			for seg in 1m 21m
+			do
+				cmd="../../pcompress -c ${algo} -l6 -s ${seg} ${dopts} ${tf} - > ${tf}.pz"
+				echo "Running $cmd"
+				eval $cmd
+				if [ $? -ne 0 ]
+				then
+					echo "FATAL: Compression errored."
+					rm -f ${tf}.pz
+					continue
+				fi
+				cmd="cat ${tf}.pz | ../../pcompress -d - ${tf}.1"
+				echo "Running $cmd"
+				eval $cmd
+				if [ $? -ne 0 ]
+				then
+					echo "FATAL: Decompression errored."
+					rm -f ${tf}.pz ${tf}.1
+					continue
+				fi
+				diff ${tf} ${tf}.1 > /dev/null
+				if [ $? -ne 0 ]
+				then
+					echo "FATAL: Decompression was not correct"
+				fi
+				rm -f ${tf}.pz ${tf}.1
+			done
+		done
+	done
+done
+
 echo "#################################################"
 echo ""

View file

@@ -61,6 +61,59 @@ do
 	done
 done
 
+for algo in lzfx zlib
+do
+	for tf in `cat files.lst`
+	do
+		for feat in "-e SALSA20" "-e AES -L" "-D -e SALSA20" "-D -EE -L -e SALSA20 -S KECCAK256" "-G -e SALSA20" "-G -F -e AES"
+		do
+			for seg in 5m
+			do
+				echo "sillypassword" > /tmp/pwf
+				cmd="../../pcompress -c${algo} -l3 -s${seg} $feat -w /tmp/pwf ${tf} - > ${tf}.pz"
+				echo "Running $cmd"
+				eval $cmd
+				if [ $? -ne 0 ]
+				then
+					echo "FATAL: Compression errored."
+					rm -f ${tf}.pz ${tf}.1
+					continue
+				fi
+				pw=`cat /tmp/pwf`
+				if [ "$pw" = "sillypassword" ]
+				then
+					echo "FATAL: Password file /tmp/pwf not zeroed!"
+				fi
+				echo "sillypassword" > /tmp/pwf
+				cmd="cat ${tf}.pz | ../../pcompress -d -w /tmp/pwf - ${tf}.1"
+				echo "Running $cmd"
+				eval $cmd
+				if [ $? -ne 0 ]
+				then
+					echo "FATAL: Decompression errored."
+					rm -f ${tf}.pz ${tf}.1
+					continue
+				fi
+				diff ${tf} ${tf}.1 > /dev/null
+				if [ $? -ne 0 ]
+				then
+					echo "FATAL: Decompression was not correct"
+				fi
+				pw=`cat /tmp/pwf`
+				if [ "$pw" = "sillypassword" ]
+				then
+					echo "FATAL: Password file /tmp/pwf not zeroed!"
+				fi
+				rm -f ${tf}.pz ${tf}.1
+			done
+		done
+	done
+done
+
 rm -f /tmp/pwf
 echo "#################################################"

View file

@@ -12,7 +12,7 @@ for algo in lzfx lz4 adapt adapt2
 do
 	for tf in `cat files.lst`
 	do
-		for feat in "-F" "-F -B3 -L" "-F -B4" "-F -B5 -L" "-F -P" "-F -L -P"
+		for feat in "-F" "-F -B3 -L" "-F -B4" "-F -B5 -L" "-F -P" "-F -L -P" "-G -F -B3 -L"
 		do
 			for seg in 2m 100m
 			do