Add test cases for Global Deduplication.
Update documentation and code comments. Unlink the tempfile pathname right after creation so it is cleaned up automatically when the process exits.
parent 75f62d6a36
commit aed69b2d53
7 changed files with 128 additions and 8 deletions
README.md (11 lines changed)

@@ -40,6 +40,7 @@ Usage
To compress a file:
   pcompress -c <algorithm> [-l <compress level>] [-s <chunk size>] <file> [-]

   Where <algorithm> can be the following:
   lzfx - Very fast and small algorithm based on LZF.
   lz4  - Ultra fast, high-throughput algorithm reaching RAM B/W at level1.

@@ -64,6 +65,7 @@ Usage
          LZMA. It has significantly more memory usage than adapt.
   none - No compression. This is only meaningful with -D and -E so Dedupe
          can be done for post-processing with an external utility.

   <chunk_size> - This can be in bytes or can use the following suffixes:
          g - Gigabyte, m - Megabyte, k - Kilobyte.
          Larger chunks produce better compression at the cost of memory.

@@ -206,6 +208,15 @@ allocator. Due to the way it rounds up an allocation request to the nearest
slab the built-in allocator can allocate extra unused memory. In addition you
may want to use a different allocator in your environment.

The variable PCOMPRESS_INDEX_MEM can be set to limit memory used by the Global
Deduplication Index. The number specified is in multiples of a megabyte.

The variable PCOMPRESS_CACHE_DIR can point to a directory where some temporary
files relating to the Global Deduplication process can be stored. This can, for
example, be a directory on a Solid State Drive to speed up Global Deduplication.
The space used in this directory is proportional to the size of the dataset being
processed and is slightly more than 8KB for every 1MB of data.

Examples
========
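As a usage sketch for the two environment variables documented above: the 2048 MB limit, the /mnt/ssd/scratch path and the file name datafile are illustrative placeholders, and the command form mirrors the invocations used in the test scripts later in this commit.

    # Cap the Global Deduplication Index at roughly 2 GB (value is in megabytes).
    export PCOMPRESS_INDEX_MEM=2048
    # Keep temporary Global Dedupe segment files on an SSD-backed scratch area.
    export PCOMPRESS_CACHE_DIR=/mnt/ssd/scratch

    # Compress with Global Deduplication (-G -D), lz4 level 6, 100MB chunks,
    # writing the archive to stdout.
    pcompress -c lz4 -l6 -s 100m -G -D datafile - > datafile.pz

    # Decompress from stdin and verify the round trip.
    cat datafile.pz | pcompress -d - datafile.1
    diff datafile datafile.1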
@@ -223,15 +223,16 @@ init_global_db_s(char *path, char *tmppath, uint32_t chunksize, uint64_t user_ch
    rv = setup_db_config_s(cfg, chunksize, &user_chunk_sz, &pct_interval, algo, ck, ck_sim,
        file_sz, &hash_slots, &hash_entry_size, &memreqd, memlimit, tmppath);

-   // Reduce hash_slots to remain within memlimit
+   /*
+    * Reduce hash_slots to remain within memlimit
+    */
    while (memreqd > memlimit) {
        hash_slots--;
        memreqd = hash_slots * MEM_PER_UNIT(hash_entry_size);
    }

    /*
     * Now create as many hash tables as there are similarity match intervals
     * each having hash_slots / intervals slots.
     * Now initialize the hashtable[s] to setup the index.
     */
    indx = calloc(1, sizeof (index_t));
    if (!indx) {

@@ -260,6 +261,10 @@ init_global_db_s(char *path, char *tmppath, uint32_t chunksize, uint64_t user_ch
        indx->memused += ((indx->hash_slots) * (sizeof (hash_entry_t *)));
    }

    /*
     * If Segmented Deduplication is required intervals will be set and a temporary
     * file is created to hold rabin block hash lists for each segment.
     */
    if (pct_interval > 0) {
        strcpy(cfg->rootdir, tmppath);
        strcat(cfg->rootdir, "/.segXXXXXX");

@@ -276,6 +281,12 @@ init_global_db_s(char *path, char *tmppath, uint32_t chunksize, uint64_t user_ch
            cfg->seg_fd_r[i].fd = open(cfg->rootdir, O_RDONLY);
            cfg->seg_fd_r[i].mapping = NULL;
        }

        /*
         * Remove tempfile entry from the filesystem metadata so that file gets
         * automatically removed once process exits.
         */
        unlink(cfg->rootdir);
    }
    cfg->segcache_pos = 0;
    cfg->dbdata = indx;

@@ -401,6 +412,9 @@ db_segcache_unmap(archive_config_t *cfg, int tid)
    return (0);
}

/*
 * Compare hashes. Hash size must be multiple of 8 bytes.
 */
static inline int
mycmp(uchar_t *a, uchar_t *b, int sz)
{

@@ -453,7 +467,7 @@ db_lookup_insert_s(archive_config_t *cfg, uchar_t *sim_cksum, int interval,
            pent = &(ent->next);
            ent = ent->next;
        }
-   } else if (cfg->similarity_cksum_sz == 8) {
+   } else if (cfg->similarity_cksum_sz == 8) {// Fast path for 64-bit keys
        while (ent) {
            if (*((uint64_t *)sim_cksum) == *((uint64_t *)ent->cksum) &&
                ent->item_offset != item_offset) {

@@ -474,6 +488,10 @@ db_lookup_insert_s(archive_config_t *cfg, uchar_t *sim_cksum, int interval,
    }
    if (do_insert) {
        if (indx->memused + indx->hash_entry_size >= indx->memlimit && htab[htab_entry] != NULL) {
            /*
             * If the index is close to full capacity, steal the oldest hash bucket
             * in this slot to hold the new data.
             */
            ent = htab[htab_entry];
            htab[htab_entry] = htab[htab_entry]->next;
        } else {

@@ -502,7 +520,6 @@ destroy_global_db_s(archive_config_t *cfg)
        }
        free(cfg->seg_fd_r);
        close(cfg->seg_fd_w);
-       unlink(cfg->rootdir);
    }
}
@@ -10,7 +10,8 @@ do
    for tf in `cat files.lst`
    do
        rm -f ${tf}.*
-       for feat in "-D" "-D -B3 -L" "-D -B4 -E" "-D -B2 -EE" "-D -B5 -EE -L" "-D -B2 -r" "-P" "-D -P" "-D -L -P"
+       for feat in "-D" "-D -B3 -L" "-D -B4 -E" "-D -B2 -EE" "-D -B5 -EE -L" "-D -B2 -r" "-P" "-D -P" "-D -L -P" \
+           "-G -D" "-G -F" "-G -L -P" "-G -B2"
        do
            for seg in 2m 100m
            do
@@ -10,7 +10,7 @@ do
    for tf in `cat files.lst`
    do
        rm -f ${tf}.*
-       for feat in "-e AES" "-e AES -L -S SHA256" "-D -e SALSA20 -S SHA512" "-D -EE -L -e SALSA20 -S BLAKE512" "-e AES -S CRC64" "-e SALSA20 -P" "-e AES -L -P -S KECCAK256" "-D -e SALSA20 -L -S KECCAK512" "-e AES -k16" "-e SALSA20 -k16"
+       for feat in "-e AES" "-e AES -L -S SHA256" "-D -e SALSA20 -S SHA512" "-D -EE -L -e SALSA20 -S BLAKE512" "-e AES -S CRC64" "-e SALSA20 -P" "-e AES -L -P -S KECCAK256" "-D -e SALSA20 -L -S KECCAK512" "-e AES -k16" "-e SALSA20 -k16" "-G -e AES -S SHA256" "-G -e SALSA20 -P"
        do
            for seg in 2m 100m
            do
test/t6.tst (38 lines changed)

@@ -46,6 +46,44 @@ do
    done
done

for algo in lz4 zlib
do
    for dopts in "" "-G -D" "-G -F" "-D"
    do
        for tf in `cat files.lst`
        do
            rm -f ${tf}.*
            for seg in 1m 21m
            do
                cmd="../../pcompress -c ${algo} -l6 -s ${seg} ${dopts} ${tf} - > ${tf}.pz"
                echo "Running $cmd"
                eval $cmd
                if [ $? -ne 0 ]
                then
                    echo "FATAL: Compression errored."
                    rm -f ${tf}.pz
                    continue
                fi
                cmd="cat ${tf}.pz | ../../pcompress -d - ${tf}.1"
                echo "Running $cmd"
                eval $cmd
                if [ $? -ne 0 ]
                then
                    echo "FATAL: Decompression errored."
                    rm -f ${tf}.pz ${tf}.1
                    continue
                fi
                diff ${tf} ${tf}.1 > /dev/null
                if [ $? -ne 0 ]
                then
                    echo "FATAL: Decompression was not correct"
                fi
                rm -f ${tf}.pz ${tf}.1
            done
        done
    done
done

echo "#################################################"
echo ""
test/t7.tst (53 lines changed)

@@ -61,6 +61,59 @@ do
    done
done

for algo in lzfx zlib
do
    for tf in `cat files.lst`
    do
        for feat in "-e SALSA20" "-e AES -L" "-D -e SALSA20" "-D -EE -L -e SALSA20 -S KECCAK256" "-G -e SALSA20" "-G -F -e AES"
        do
            for seg in 5m
            do
                echo "sillypassword" > /tmp/pwf
                cmd="../../pcompress -c${algo} -l3 -s${seg} $feat -w /tmp/pwf ${tf} - > ${tf}.pz"
                echo "Running $cmd"
                eval $cmd
                if [ $? -ne 0 ]
                then
                    echo "FATAL: Compression errored."
                    rm -f ${tf}.pz ${tf}.1
                    continue
                fi

                pw=`cat /tmp/pwf`
                if [ "$pw" = "sillypassword" ]
                then
                    echo "FATAL: Password file /tmp/pwf not zeroed!"
                fi

                echo "sillypassword" > /tmp/pwf
                cmd="cat ${tf}.pz | ../../pcompress -d -w /tmp/pwf - ${tf}.1"
                echo "Running $cmd"
                eval $cmd
                if [ $? -ne 0 ]
                then
                    echo "FATAL: Decompression errored."
                    rm -f ${tf}.pz ${tf}.1
                    continue
                fi

                diff ${tf} ${tf}.1 > /dev/null
                if [ $? -ne 0 ]
                then
                    echo "FATAL: Decompression was not correct"
                fi

                pw=`cat /tmp/pwf`
                if [ "$pw" = "sillypassword" ]
                then
                    echo "FATAL: Password file /tmp/pwf not zeroed!"
                fi
                rm -f ${tf}.pz ${tf}.1
            done
        done
    done
done

rm -f /tmp/pwf

echo "#################################################"
@@ -12,7 +12,7 @@ for algo in lzfx lz4 adapt adapt2
do
    for tf in `cat files.lst`
    do
-       for feat in "-F" "-F -B3 -L" "-F -B4" "-F -B5 -L" "-F -P" "-F -L -P"
+       for feat in "-F" "-F -B3 -L" "-F -B4" "-F -B5 -L" "-F -P" "-F -L -P" "-G -F -B3 -L"
        do
            for seg in 2m 100m
            do