From 66a482c968275accb98eb3a0490a8cb0ee7de452 Mon Sep 17 00:00:00 2001
From: Moinak Ghosh <moinakg@gmail.com>
Date: Fri, 9 Jan 2015 22:13:24 +0530
Subject: [PATCH] A new Dictionary preprocessor for text files.

---
 filters/analyzer/analyzer.c |    4 +-
 filters/dict/DictFilter.cpp | 1160 +++++++++++++++++++++++++++++------
 filters/dict/DictFilter.h   |    7 +-
 meta_stream.c               |    4 +-
 pcompress.c                 |   14 +-
 5 files changed, 975 insertions(+), 214 deletions(-)

diff --git a/filters/analyzer/analyzer.c b/filters/analyzer/analyzer.c
index 54033fb..9e2a53a 100644
--- a/filters/analyzer/analyzer.c
+++ b/filters/analyzer/analyzer.c
@@ -59,7 +59,7 @@ analyze_buffer(void *src, uint64_t srclen, analyzer_ctx_t *actx)
 		if (cur_byte != ' ')
 			prev_byte = cur_byte;
 	}
-	
+
 	/*
 	 * Heuristics for detecting BINARY vs generic TEXT vs XML data at various
 	 * significance levels.
@@ -72,7 +72,7 @@ analyze_buffer(void *src, uint64_t srclen, analyzer_ctx_t *actx)
 	} else {
 		actx->forty_pct.btype = TYPE_TEXT;
 	}
-	
+
 	if (tot_8b > FIFTY_PCT(srclen)) {
 		actx->fifty_pct.btype = TYPE_BINARY;
 	} else {
diff --git a/filters/dict/DictFilter.cpp b/filters/dict/DictFilter.cpp
index a462bdc..252b3d4 100644
--- a/filters/dict/DictFilter.cpp
+++ b/filters/dict/DictFilter.cpp
@@ -23,8 +23,43 @@
  */
 
 /*
- * Dict filter for text files. Adapted from Public Domain sources
- * of Fu Siyuan's CSC 3.2 archiver.
+ * Dictionary preprocessor for text files. It uses some ideas from
+ * the following paper:
+ * http://pskibinski.pl/papers/05-RevisitingDictCompr.pdf
+ *
+ * However the implementation here is quite different from that
+ * described in the paper. A simple hash table is used for the
+ * word dictionary. A min-LRU based aging mechanism is used to evict
+ * words with low frequency to make way for newer words. The min-LRU
+ * aging kicks in after at least 50% of the data is processed and the
+ * hash table is full. The hash table size is derived from the data
+ * size.
+ * After scanning the data, words with occurrence X word size less
+ * than a threshold are evicted from the final dictionary. The
+ * dictionary is then prefixed to the encoded data. The words in the
+ * final dictionary are sorted based on occurrence X word size value
+ * and then alphabetically.
+ *
+ * Words are extracted by splitting text on a few separator characters.
+ * Proper case capital conversion is done. So the dictionary only
+ * contains lower case words.
+ * Words in the data are replaced by dictionary indexes. These numbers
+ * are encoded into a base-217 string. A bunch of non-separator char
+ * ranges are used. Each encoded word is prefixed with a backtick (`).
+ * Capital converted words are prefixed with an exclamation (!).
+ * Apart from encoding words, literal numbers of 5 to 9 digits are
+ * replaced with their base-217 encoded strings. These encoded
+ * numbers are prefixed with a dollar ($).
+ * Since words are only encoded on a separator boundary, any literal
+ * prefix characters following a separator boundary are escaped using
+ * a back-slash (\).
+ *
+ * The separators and prefix characters have been experimentally
+ * selected to benefit context based compressors like PPM and Libbsc.
+ * Libbsc is especially finicky about the nature of the transform.
+ * For example XWrt (http://xwrt.sourceforge.net/), a preprocessor
+ * that implements all of the ideas described in the paper does not
+ * benefit Libbsc in the enwik9 test(http://mattmahoney.net/dc/text.html).
  */
 
 #include <stdlib.h>
@@ -32,245 +67,969 @@
 #include <strings.h>
 #include <stdint.h>
 #include <stdio.h>
+#include <pthread.h>
+#include <ctype.h>
 #include "DictFilter.h"
-#include "Common.h"
 #include "utils.h"
+#include "allocator.h"
+#include "xxhash.h"
 
+#define	WORD_MIN	3
+#define	WORD_MAX	50
+#define	LIST_LRU_NUM	15
+
+typedef struct dict_entry {
+	unsigned char *word;
+	unsigned char sz;
+	unsigned char lcfirst;
+	uint32_t indx;
+	uint32_t occur;
+	struct dict_entry *next;
+	struct dict_entry *list_next;
+} dict_entry_t;
+
+typedef struct hash_context_s {
+	dict_entry_t    **dict;
+	uint32_t        dictcount;
+	uint32_t        dictsize;
+	uint32_t        cur_indx;
+	uint32_t        collisions;
+	dict_entry_t    *sentinel;
+} hash_context_t;
+
+typedef struct list_context_s {
+	dict_entry_t    *head;
+	dict_entry_t    *tail;
+	uint32_t        listcount;
+	uint32_t        listsize;
+	uint32_t        aged_entries;
+	uint32_t        aging_requests;
+} list_context_t;
+
+typedef struct decode_dict_entry_s {
+	uint32_t	sz;
+	uint8_t		*word;
+} decode_dict_entry_t;
+
+/*
+ * We are always copying small blocks, typically words, ranging
+ * from 3 bytes to 20 bytes. So an inline memory copy is more
+ * efficient than memcpy() library calls.
+ */
+static inline void
+copy_bytes(void *dst, void *src, size_t len)
+{
+	static void *targets[] = { &&zero, &&one, &&two, &&three };	/* label addresses, indexed by the leftover byte count */
+
+	uint8_t *to = (uint8_t *)dst;
+	uint8_t *from = (uint8_t *)src;
+
+	while (len >= sizeof (uint32_t)) {	/* bulk phase: one 32-bit load/store per iteration */
+		*(uint32_t *)to = *(const uint32_t *)from;
+		to += sizeof (uint32_t);
+		from += sizeof (uint32_t);
+		len -= sizeof (uint32_t);
+	}
+
+	/* Unroll final small loop using computed goto. */
+	goto *targets[len];	/* len is 0..3 here; the labels fall through so exactly len bytes are copied */
+three:
+	*to = *from;
+	to++; from++;
+two:
+	*to = *from;
+	to++; from++;
+one:
+	*to = *from;
+zero:
+	return;
+}
+
+/*
+ * Local replacement for bcmp() avoiding a library call for comparing
+ * words.
+ */
+static inline int
+eq_bytes(void *a, void *b, size_t len)
+{
+	static void *targets[] = { &&_zero, &&_one, &&_two, &&_three };	/* label addresses, indexed by the leftover byte count */
+	uint8_t *to = (uint8_t *)a;
+	uint8_t *from = (uint8_t *)b;
+
+	while (len >= sizeof (uint32_t)) {	/* compare 4 bytes at a time */
+		if (*(uint32_t *)to != *(uint32_t *)from)
+			return (1);	/* 1 means "different"; no ordering is reported */
+		to += sizeof (uint32_t);
+		from += sizeof (uint32_t);
+		len -= sizeof (uint32_t);
+	}
+
+	/* Unroll final small loop using computed goto. */
+	goto *targets[len];	/* len is 0..3 here; the labels fall through so exactly len bytes are compared */
+_three:
+	if (*to != *from) return (1);
+	to++; from++;
+_two:
+	if (*to != *from) return (1);
+	to++; from++;
+_one:
+	if (*to != *from) return (1);
+_zero:
+	return (0);	/* 0 means all len bytes matched */
+}
+
+/*
+ * Sort comparison for the dictionary words. Orders descending by
+ * (occurrence - 1) X (word length - 1), then by word length, then by
+ * each of the first three letters. Words are at least 3 chars in length.
+ */
+static int
+cmpoccur(const void *a, const void *b) {
+	dict_entry_t *de1 = *((dict_entry_t **)a);	/* the sorted array holds pointers to entries */
+	dict_entry_t *de2 = *((dict_entry_t **)b);
+	uint64_t a1, b1;
+
+	a1 = ((uint64_t)(de1->occur) - 1) * (de1->sz - 1);	/* primary sort key of the first entry */
+	b1 = ((uint64_t)(de2->occur) - 1) * (de2->sz - 1);	/* primary sort key of the second entry */
+
+	if (a1 < b1) {
+		return (1);	/* smaller key sorts later, i.e. descending order */
+	} else if (a1 == b1) {
+		if (de1->sz < de2->sz) {
+			return (1);	/* tie on key: the longer word sorts first */
+		} else if (de1->sz == de2->sz) {
+			if (de1->word[0] != de2->word[0])	/* tie on length: compare the first three bytes in turn */
+				return ((int)de2->word[0] - (int)de1->word[0]);
+			if (de1->word[1] != de2->word[1])
+				return ((int)de2->word[1] - (int)de1->word[1]);
+			if (de1->word[2] != de2->word[2])
+				return ((int)de2->word[2] - (int)de1->word[2]);
+			return (0);	/* bytes beyond the third are not compared */
+		} else {
+			return (-1);
+		}
+	} else {
+		return (-1);
+	}
+}
+
+/*
+ * Singleton filter class.
+ */
 class DictFilter
 {
 public:
+	int Forward_Dict(uint8_t *src, uint32_t size, uint8_t *dst, uint32_t *dstsize);
+	int Inverse_Dict(uint8_t *src, uint32_t size, uint8_t *dst, uint32_t *dstsize);
+
+	static DictFilter *getInstance() {
+		pthread_mutex_lock(&inst_lock);
+		if (!inst) {
+			inst = new DictFilter();
+		}
+		pthread_mutex_unlock(&inst_lock);
+		return (inst);
+	}
+
+protected:
 	~DictFilter();
 	DictFilter();
 
-	u32 Forward_Dict(u8 *src, u32 size, u8 *dst, u32 *dstsize);
-	void Inverse_Dict(u8 *src, u32 size, u8 *dst, u32 *dstsize);
+	dict_entry_t *find_string(dict_entry_t *de, uint8_t *str, uint32_t sz,
+	    uint8_t lcfirst);
+	void hash_context_init(hash_context_t *hctx, uint32_t dictsize);
+	void hash_context_delete(hash_context_t *hctx);
+	dict_entry_t *hash_lookup(hash_context_t *hctx, uint8_t *word, uint32_t wordsize,
+	    uint8_t lcfirst);
+	dict_entry_t *hash_add(hash_context_t *hctx, uint8_t *word, uint32_t wordsize,
+	    dict_entry_t *_de);
+	dict_entry_t *hash_remove(hash_context_t *hctx, uint8_t *word, uint32_t wordsize,
+	    dict_entry_t *r_de);
 
-private:
-	typedef struct
-	{
-		u32 next[26];
-		u8 symbol;
-	} CTreeNode;
-	CTreeNode wordTree[MAX_WORDTREE_NODE_NUM];
-	u32 nodeMum;
-	u8 maxSymbol;
-	//Used for DICT transformer. Words are stored in trees.
+	void list_context_init(list_context_t *lctx, uint32_t listsize);
+	void list_context_delete(list_context_t *lctx);
+	dict_entry_t *list_push(list_context_t *lctx, dict_entry_t *de);
+	dict_entry_t *list_pop_lru_min(list_context_t *lctx);
 
-	u32 wordIndex[256];
-	//Used for DICT untransformer.choose words by symbols.
-	void MakeWordTree();  //Init the DICT transformer
+	uint8_t *to_base_enc(uint32_t number, uint8_t *str, int sz);
+	uint32_t from_base_enc(uint8_t *dnum, int sz);
 
-	u32 x0,x1;
-	u32 i,k;
+	static pthread_mutex_t inst_lock;
+	static DictFilter *inst;
+	static const char *BASE_DIGITS;
+
+	uint8_t  SEPARATOR[256], flag, flag1, flag2;
+	uint8_t  base_enc_digits[256];
+	uint8_t  base_dec_digits[256];
+	uint32_t NUMERAL_BASE;
 };
 
-const u32 wordNum = 123;
-
-u8 wordList[wordNum][8] =
-{
-	"",
-	"ac","ad","ai","al","am",
-	"an","ar","as","at","ea",
-	"ec","ed","ee","el","en",
-	"er","es","et","id","ie",
-	"ig","il","in","io","is",
-	"it","of","ol","on","oo",
-	"or","os","ou","ow","ul",
-	"un","ur","us","ba","be",
-	"ca","ce","co","ch","de",
-	"di","ge","gh","ha","he",
-	"hi","ho","ra","re","ri",
-	"ro","rs","la","le","li",
-	"lo","ld","ll","ly","se",
-	"si","so","sh","ss","st",
-	"ma","me","mi","ne","nc",
-	"nd","ng","nt","pa","pe",
-	"ta","te","ti","to","th",
-	"tr","wa","ve",
-	"all","and","but","dow",
-	"for","had","hav","her",
-	"him","his","man","mor",
-	"not","now","one","out",
-	"she","the","was","wer",
-	"whi","whe","wit","you",
-	"any","are",
-	"that","said","with","have",
-	"this","from","were","tion",
-};
-
-
-void
-DictFilter::MakeWordTree()
-{
-	u32 i,j;
-	u32 treePos;
-	u8 symbolIndex = 0x82;
-
-	nodeMum = 1;
-
-	memset(wordTree,0,sizeof(wordTree));
-
-	for (i = 1; i < wordNum; i++) {
-		treePos = 0;
-		for(j = 0; wordList[i][j] != 0; j++) {
-			u32 idx = wordList[i][j] - 'a';
-			if (wordTree[treePos].next[idx]) {
-				treePos = wordTree[treePos].next[idx];
-			} else {
-				wordTree[treePos].next[idx] = nodeMum;
-				treePos = nodeMum;
-				nodeMum++;
-			}
-		}
-		wordIndex[symbolIndex] = i;
-		wordTree[treePos].symbol = symbolIndex++;
-	}
-
-	maxSymbol=symbolIndex;
-
-}
-
+pthread_mutex_t DictFilter::inst_lock = PTHREAD_MUTEX_INITIALIZER;
+DictFilter *DictFilter::inst = NULL;
+const char *DictFilter::BASE_DIGITS = "0123456789abcdefghijklmnopqrstuvwxyz@ABCDEFGHIJKLMNOPQRSTUVWXYZ";
 
 DictFilter::DictFilter()
 {
-	MakeWordTree();
+	uint32_t new_size, i;
+
+	memset(SEPARATOR, 0, 256);
+
+	/*
+	 * Initialize the number encoding characters. Total
+	 * 217 chars are used for a base-217 encoding. In
+	 * particular, separator characters are avoided.
+	 */
+	new_size = strlen(BASE_DIGITS);
+	for (i=0; i<new_size; i++) {
+		base_dec_digits[(uint8_t)BASE_DIGITS[i]] = i;
+		base_enc_digits[i] = BASE_DIGITS[i];
+	}
+
+	new_size = 1;
+	while (new_size < 9) {
+		base_dec_digits[new_size] = i;
+		base_enc_digits[i++] = new_size++;
+	}
+
+	new_size = 14;
+	while (new_size < 32) {
+		base_dec_digits[new_size] = i;
+		base_enc_digits[i++] = new_size++;
+	}
+
+	new_size = 128;
+	while (new_size < 256) {
+		base_dec_digits[new_size] = i;
+		base_enc_digits[i++] = new_size++;
+	}
+
+	base_enc_digits[i] = '\0';
+	NUMERAL_BASE = i;
+
+	/*
+	 * The characters that are regarded as word separators. These
+	 * separators are good for general roman alphabet text and
+	 * XML/HTML markup text.
+	 */
+	SEPARATOR['<'] = 1;
+	SEPARATOR['['] = 1;
+	SEPARATOR['"'] = 1;
+	SEPARATOR['('] = 1;
+	SEPARATOR['|'] = 1;
+	SEPARATOR['/'] = 1;
+	SEPARATOR[' '] = 1;
+	SEPARATOR['\t'] = 1;
+	SEPARATOR[':'] = 1;
+	SEPARATOR['\n'] = 1;
+	SEPARATOR['\r'] = 1;
+	SEPARATOR['>'] = 1;
+	SEPARATOR[']'] = 1;
+	SEPARATOR['\''] = 1;
+	SEPARATOR[')'] = 1;
+	SEPARATOR['.'] = 1;
+	SEPARATOR['?'] = 1;
+	SEPARATOR[','] = 1;
+	SEPARATOR[';'] = 1;
+	SEPARATOR['='] = 1;
+	SEPARATOR['{'] = 1;
+	SEPARATOR['}'] = 1;
+	SEPARATOR['-'] = 1;
+	SEPARATOR['+'] = 1;
+	SEPARATOR['*'] = 1;
+
+	/*
+	 * Prefix characters for encoded words and numbers.
+	 */
+	flag = '`';
+	flag1 = '!';
+	flag2 = '$';
+
+	/*slab_cache_add(sizeof (dict_entry_t));
+	slab_cache_add(sizeof (hash_context_t));
+	slab_cache_add(sizeof (list_context_t));*/
 }
 
-
-
 DictFilter::~DictFilter()
 {
+	pthread_mutex_lock(&inst_lock);
+	if (inst == this) {	/* never "delete inst" here: that re-enters this destructor with inst_lock held */
+		inst = NULL;	/* forget the singleton so getInstance() can build a fresh one */
+	}
+	pthread_mutex_unlock(&inst_lock);
 }
 
-
-u32
-DictFilter::Forward_Dict(u8 *src, u32 size, u8 *dst, u32 *dstsize)
+uint8_t *
+DictFilter::to_base_enc(uint32_t number, uint8_t *str, int sz)
 {
-	if (size < 16384)
-		return 0;
-
-	u32 i,j,treePos = 0;
-	u32 lastSymbol = 0;
-	u32 dstSize = 0;
-	int idx;
-
-
-	for(i = 0; i < size-5;) {
-		if (dstSize > *dstsize-4)
-			return (0);
-		if (src[i] >= 'a' && src[i] <= 'z') {
-
-			u32 matchSymbol = 0,longestWord = 0;
-			treePos = 0;
-			for(j = 0;;) {
-				idx = src[i+j] - 'a';
-				if (idx < 0 || idx > 25)
-					break;
-				if (wordTree[treePos].next[idx] == 0)
-					break;
-
-				treePos=wordTree[treePos].next[idx];
-				j++;
-				if (wordTree[treePos].symbol) {
-					matchSymbol = wordTree[treePos].symbol;
-					longestWord = j;
-				}
-			}
-
-			if (matchSymbol) {
-				dst[dstSize++] = matchSymbol;
-				i += longestWord;
-				continue;
-			}
-			lastSymbol = 0;
-			dst[dstSize++] = src[i];
-			i++;
-		} else {
-			if (src[i] >= 0x82) {
-				dst[dstSize++] = 254;
-				dst[dstSize++] = src[i];
-			}
-			else
-				dst[dstSize++] = src[i];
-
-			lastSymbol = 0;
-			treePos = 0;
-			i++;
-		}
-
+	sz--;
+	str[sz] = '\0';
+	sz--;
+	while (number > 0 && sz >= 0) {
+		uint32_t rem = number % NUMERAL_BASE;
+		str[sz--] = base_enc_digits[rem];
+		number /= NUMERAL_BASE;
 	}
+	sz++;
+	return (&str[sz]);
+}
 
-	for (; i<size; i++) {
-		if (dstSize > *dstsize-4)
-			return (0);
-		if (src[i] >= 0x82) {
-			dst[dstSize++] = 254;
-			dst[dstSize++] = src[i];
-		}
-		else
-			dst[dstSize++] = src[i];
+uint32_t
+DictFilter::from_base_enc(uint8_t *dnum, int sz)
+{
+	uint32_t pow = 1;
+	uint32_t num = 0;
+
+	if (sz == 0) return (0);
+	while (sz > 0) {
+		uint32_t c = dnum[sz-1];
+		c = base_dec_digits[c];
+		num += (c * pow);
+		pow *= NUMERAL_BASE;
+		sz--;
 	}
+	return (num);
+}
 
-	if (dstSize > size*0.82)
-		return 0;
-
-	*dstsize = dstSize;
-	return 1;
+/*
+ * Search for a string in the hash table bucket chain. The first letter is
+ * always lower-cased for Proper-case capital-converted comparison.
+ */
+dict_entry_t *
+DictFilter::find_string(dict_entry_t *de, uint8_t *str, unsigned int sz, uint8_t lcfirst)
+{
+	uint8_t c1 = lcfirst;
+	while(de) {
+		if (de->sz == sz) {
+			uint8_t c2 = de->lcfirst;
+			if (c1 == c2) {
+				if (eq_bytes(de->word+1, str+1, sz-1) == 0)
+					return (de);
+			}
+		}
+		de = de->next;
+	}
+	return (NULL);
 }
 
 void
-DictFilter::Inverse_Dict(u8 *src, u32 size, u8 *dst, u32 *dstsize)
+DictFilter::hash_context_init(hash_context_t *hctx, uint32_t dictsize)
 {
+	hctx->dict = new dict_entry_t* [dictsize]();
+	hctx->dictcount = 0;
+	hctx->dictsize = dictsize;
+	hctx->collisions = 0;
+	hctx->sentinel = new dict_entry_t[1]();
+}
 
-	u32 i = 0,j;
-	u32 dstPos = 0,idx;
+void
+DictFilter::hash_context_delete(hash_context_t *hctx) {	/* frees every chained entry, the bucket array and the sentinel */
+	uint32_t i;
 
-	while(dstPos < *dstsize && i < size) {
-		if (src[i] >= 0x82 && src[i] < maxSymbol) {
-			idx = wordIndex[src[i]];
-			for(j=0; wordList[idx][j]; j++)
-				dst[dstPos++] = wordList[idx][j];
-		}
-		else if (src[i] == 254 && (i+1 < size && src[i+1] >= 0x82)) {
-			i++;
-			dst[dstPos++] = src[i];
-		}
-		else {
-			dst[dstPos++] = src[i];
-		}
+	for (i=0; i<hctx->dictsize; i++) {
+		if (hctx->dict[i]) {
+			dict_entry_t *de, *de1;
 
-		i++;
+			de = hctx->dict[i];
+			while (de) {
+				de1 = de->next;	/* save the link before the node is released */
+				delete [] de;	/* entries come from "new dict_entry_t[1]()", so the array form is required */
+				de = de1;
+			}
+		}
 	}
-	*dstsize = dstPos;
+	delete [] hctx->dict;	/* allocated with new dict_entry_t*[dictsize]() */
+	delete [] hctx->sentinel;	/* allocated with new dict_entry_t[1]() */
+	hctx->dictcount = 0;
+	hctx->collisions = 0;
+}
+
+dict_entry_t *
+DictFilter::hash_lookup(hash_context_t *hctx, uint8_t *word, uint32_t wordsize, uint8_t lcfirst)
+{
+	uint32_t indx;
+
+	indx = XXH32(word+1, wordsize-1, lcfirst) % hctx->dictsize;	/* hash covers bytes 1..wordsize-1; lcfirst is passed as the seed */
+	hctx->cur_indx = indx;	/* remembered so hash_add()/hash_remove() can reuse the bucket index */
+	return (find_string(hctx->dict[indx], word, wordsize, lcfirst));	/* NULL when the word is not in that bucket chain */
+}
+
+dict_entry_t *
+DictFilter::hash_add(hash_context_t *hctx, uint8_t *word, uint32_t wordsize, dict_entry_t *_de)
+{
+	dict_entry_t *de;
+	uint8_t lcfirst;
+
+	lcfirst = tolower(word[0]);
+
+	/*
+	 * As of now non-NULL _de means a lookup was already done and match was not found
+	 * and the hash table is full.
+	 * So we are adding a new entry with an aged-out node. No need to do another lookup.
+	 */
+	if (!_de) {
+		de = hash_lookup(hctx, word, wordsize, lcfirst);
+		if (de) {
+			de->occur++;
+			return (hctx->sentinel);	/* sentinel tells the caller the word already existed */
+		}
+
+		if (hctx->dictcount == hctx->dictsize)
+			return (NULL);	/* table full and no recycled node was supplied */
+	} else {
+		hctx->cur_indx = XXH32(word+1, wordsize-1, lcfirst) % hctx->dictsize;	/* same bucket computation as hash_lookup() */
+	}
+
+	if (_de)
+		de = _de;
+	else
+		de = new dict_entry_t[1]();
+	de->word = word;	/* stores the caller's pointer; the word bytes are not copied */
+	de->sz = wordsize;
+	de->lcfirst = lcfirst;
+	de->occur = 1;
+	de->indx = hctx->cur_indx;
+	if (hctx->dict[hctx->cur_indx])
+		hctx->collisions++;
+
+	de->next = hctx->dict[hctx->cur_indx];	/* insert at the head of the bucket chain */
+	hctx->dict[hctx->cur_indx] = de;
+	hctx->dictcount++;
+	return (de);
+}
+
+dict_entry_t *
+DictFilter::hash_remove(hash_context_t *hctx, uint8_t *word, uint32_t wordsize, dict_entry_t *r_de)
+{
+	dict_entry_t *de;
+	uint8_t lcfirst;
+
+	if (!r_de) {
+		lcfirst = tolower(word[0]);
+		de = hash_lookup(hctx, word, wordsize, lcfirst);	/* also sets hctx->cur_indx */
+	} else {
+		de = r_de;
+		hctx->cur_indx = de->indx;	/* entry remembers the bucket it was inserted into */
+	}
+	if (de) {
+		dict_entry_t *c_de, *p_de;
+		de->indx = UINT32_MAX;
+
+		c_de = hctx->dict[hctx->cur_indx];
+		if (c_de == de) {	/* entry is the head of its bucket chain */
+			hctx->dict[hctx->cur_indx] = c_de->next;
+			hctx->dictcount--;
+			return (de);	/* unlinked only; the node is not freed here */
+		}
+
+		p_de = c_de;
+		c_de = c_de->next;
+		while (c_de) {	/* otherwise walk the chain, tracking the predecessor */
+			if (c_de == de) {
+				p_de->next = c_de->next;
+				hctx->dictcount--;
+				return (de);
+			}
+			p_de = c_de;
+			c_de = c_de->next;
+		}
+		assert(0 == 1); // Fail, corrupted hash
+	}
+	return (NULL);	/* nothing to remove: the word was not found */
+}
+
+void
+DictFilter::list_context_init(list_context_t *lctx, uint32_t listsize)
+{
+	lctx->head = new dict_entry_t[1]();	/* dummy head node; real entries hang off head->list_next */
+	lctx->tail = lctx->head;
+	lctx->listcount = 0;
+	lctx->listsize = listsize;
+	lctx->aged_entries = lctx->aging_requests = 0;	/* aging_requests is incremented by list_pop_lru_min(), so it must start defined */
+}
+
+void
+DictFilter::list_context_delete(list_context_t *lctx)
+{
+	delete [] lctx->head;	/* head comes from new dict_entry_t[1](); only this dummy node is released here */
+	lctx->listcount = 0;
+	lctx->aged_entries = 0;
+}
+
+dict_entry_t *
+DictFilter::list_push(list_context_t *lctx, dict_entry_t *de)
+{
+	if (lctx->listcount == lctx->listsize)
+		return (NULL);	/* list is at capacity; de is not linked */
+
+	lctx->tail->list_next = de;	/* append after the current tail */
+	de->list_next = NULL;
+	lctx->tail = de;
+	lctx->listcount++;
+
+	return (de);
+}
+
+/*
+ * Identify a dictionary entry to evict from the N least recently used
+ * entries at the list head. The entry with the lowest occurrence count
+ * which is below a given threshold is evicted.
+ * If no such entry can be found then the current lru aging request is not
+ * fulfilled. Also, all the N entries are rotated to the tail of the list.
+ * This increases the likelihood of finding an entry to evict for the next
+ * request. This allows incremental sequential probing of the list without
+ * incurring the cost of very large sequential scans, but at the cost of
+ * missing some interesting words.
+ * N is kept a small positive number.
+ */
+dict_entry_t *
+DictFilter::list_pop_lru_min(list_context_t *lctx)
+{
+	dict_entry_t *p_de, *c_de;
+	dict_entry_t *min, *min_p;
+	uint32_t list_scan, occur;
+
+	if (lctx->listcount == 0)
+		return (NULL);
+
+	lctx->aging_requests++;
+	p_de = lctx->head;
+	c_de = lctx->head->list_next;
+	min = min_p = NULL;	/* min_p is the predecessor of min, needed to unlink it */
+
+	if (lctx->listcount > LIST_LRU_NUM)
+		list_scan = LIST_LRU_NUM;
+	else
+		list_scan = lctx->listcount;
+
+	occur = UINT32_MAX;
+	while (c_de && c_de != lctx->tail && list_scan > 0) {	/* the tail itself is never a candidate */
+		if (c_de->occur < occur) {
+			min = c_de;
+			min_p = p_de;
+			occur = c_de->occur;
+		}
+		list_scan--;
+		p_de = c_de;
+		c_de = c_de->list_next;
+	}
+
+	if (min && min->occur * min->sz < 2048) {	/* evict only when the candidate is below the threshold */
+		min_p->list_next = min->list_next;
+		lctx->aged_entries++;
+		lctx->listcount--;
+		return (min);	/* unlinked from the list only; the caller still has to remove it from the hash */
+	}
+
+	if (lctx->listcount > LIST_LRU_NUM) {	/* nothing evicted: rotate the scanned prefix to the end */
+		lctx->tail->list_next = lctx->head->list_next;
+		lctx->head->list_next = c_de;
+		p_de->list_next = NULL;
+		lctx->tail = p_de;	/* last rotated node is the new tail; without this the next list_push() orphans the prefix */
+	}
+	return (NULL);
+}
+
+int
+DictFilter::Forward_Dict(uint8_t *src, uint32_t size, uint8_t *dst, uint32_t *dstsize)
+{
+	uint32_t dstSize = 0, dictSize, i, pos, num_entries;
+	hash_context_t hctx;
+	list_context_t lctx;
+	dict_entry_t **sorted_dict;
+	uint8_t num_dict[10], *numd;
+	ssize_t new_size;
+	int rv, sz;
+	/* Encodes src[0..size) into dst. Returns 1 and sets *dstsize on success, 0 if size < 1024 or dst would overflow. */
+	if (size < 1024)
+		return 0;
+
+	if (size > 20000) {
+		dictSize = size / 10000;
+		dictSize += (dictSize >> 1);
+	} else {
+		dictSize = (size >> 1);
+	}
+	dictSize++;
+
+	pos = 0;
+	rv = 0;
+	hash_context_init(&hctx, dictSize);
+	list_context_init(&lctx, dictSize);
+	sorted_dict = new dict_entry_t* [dictSize];
+
+	/*
+	 * Scan words in the data and build the dictionary.
+	 */
+	for (i=0; i<size; i++) {
+		uint8_t c = src[i];
+
+		if (SEPARATOR[c]) {
+			dict_entry_t *de;
+			size_t toklen = i - pos;
+
+			if (toklen < WORD_MIN || toklen > WORD_MAX) {
+				pos = i+1;
+				continue;
+			}
+
+			de = hash_add(&hctx, src+pos, toklen, NULL);
+			if (!de && i > (size>>1)) {	/* table full and past the halfway mark: try to recycle an aged entry */
+				de = list_pop_lru_min(&lctx);
+				if (de) {
+					dict_entry_t *de1;
+					de1 = hash_remove(&hctx, de->word, de->sz, de);
+					assert(de1 == de);
+					de1 = hash_add(&hctx, src+pos, toklen, de);
+					assert(de1 != NULL);
+					assert(de1 != hctx.sentinel);
+					list_push(&lctx, de1);
+				}
+			} else if (de && de != hctx.sentinel) {	/* de is NULL when the table filled up early; never push that */
+				list_push(&lctx, de);
+			}
+			pos = i+1;
+		}
+	}
+
+	/*
+	 * Mark below-threshold entries in the dictionary. Also sorted_dict holds a
+	 * flattened view of the hash.
+	 */
+	pos = 0;
+	for (i=0; i<dictSize; i++) {
+		if (hctx.dict[i]) {
+			dict_entry_t *de;
+
+			de = hctx.dict[i];
+			while (de) {
+				ssize_t val;
+
+				val = (size_t)de->occur * (size_t)de->sz;
+				if (val <= 4500) {
+					de->occur = 0;	/* occur == 0 marks the entry as unusable in the encode pass */
+					de = de->next;
+					continue;
+				}
+
+				sorted_dict[pos++] = de;
+				de = de->next;
+			}
+		}
+	}
+
+	/*
+	 * Sort the flattened view of the hash in descending order of
+	 * occurrence X word size.
+	 */
+	qsort(sorted_dict, pos, sizeof (dict_entry_t *), cmpoccur);
+	num_entries = 0;
+	new_size = size;
+
+	for (i=0; i<pos; i++) {
+		dict_entry_t *de;
+		ssize_t prev_size;
+
+		de = sorted_dict[i];
+		if (de->occur > 1) {
+			ssize_t val;
+
+			/*
+			 * Mark entries for which the encoded representation will be
+			 * larger than the original.
+			 */
+			prev_size = new_size;
+			val = (size_t)de->occur * (size_t)de->sz;
+			new_size -= val;
+			if (num_entries == 0)
+				new_size += ((size_t)de->sz + (size_t)de->occur * 1);
+			else if (num_entries < NUMERAL_BASE)
+				new_size += ((size_t)de->sz + (size_t)de->occur * 2);
+			else if (num_entries < NUMERAL_BASE * NUMERAL_BASE)
+				new_size += ((size_t)de->sz + (size_t)de->occur * 3);
+			else if (num_entries < NUMERAL_BASE * NUMERAL_BASE * NUMERAL_BASE)
+				new_size += ((size_t)de->sz + (size_t)de->occur * 4);
+			else
+				new_size += ((size_t)de->sz + (size_t)de->occur * 5);
+			if (new_size >= prev_size) {
+				new_size = prev_size;
+				de->occur = 0;
+				continue;
+			}
+
+			de->indx = num_entries;	/* indx now means "position in the emitted dictionary" */
+			num_entries++;
+		} else {
+			de->occur = 0;
+		}
+	}
+
+	sz = sizeof (num_dict);
+	numd = to_base_enc(num_entries, num_dict, sz);
+	dstSize = num_dict+sz-numd-1;
+	copy_bytes(dst, numd, dstSize);
+	dst[dstSize++] = ' ';
+
+	/*
+	 * Copy the dictionary to the output buffer.
+	 */
+	for (i=0; i<pos && dstSize<*dstsize; i++) {
+		dict_entry_t *de;
+
+		de = sorted_dict[i];
+		if (de->occur > 1) {
+			dst[dstSize++] = de->lcfirst;
+			if (dstSize + de->sz + 1 >= *dstsize) {
+				goto bail;
+			}
+
+			copy_bytes(&dst[dstSize], de->word+1, de->sz-1);
+			dstSize += (de->sz-1);
+			dst[dstSize++] = ' ';
+		}
+	}
+
+	pos = 0;
+	for (i=0; i<size && dstSize<*dstsize; i++) {
+		uint8_t *tok, c;
+
+		c = src[i];
+		if (SEPARATOR[c]) {
+			dict_entry_t *de;
+			size_t toklen = i - pos;
+
+			if (toklen < WORD_MIN || toklen > WORD_MAX) {
+				if (*(src+pos) == flag || *(src+pos) == flag1 ||
+				    *(src+pos) == flag2 || *(src+pos) == '\\') {
+					dst[dstSize++] = '\\';
+				}
+				if (dstSize + toklen + 1 > *dstsize) {
+					goto bail;
+				}
+				copy_bytes(&dst[dstSize], src+pos, toklen+1);
+				dstSize += (toklen+1);
+				pos = i+1;
+				continue;
+			}
+
+			tok = src+pos;
+			de = hash_lookup(&hctx, tok, toklen, tolower(tok[0]));
+			if (de != NULL && de->occur > 1) {
+				uint32_t val;	/* must hold a full dictionary index; 16 bits would wrap above 65535 entries */
+				unsigned char tok_hdr[10], *dnum;
+
+				/*
+				 * Encode word with dictionary reference.
+				 */
+				sz = sizeof (tok_hdr);
+				val = de->indx;
+				dnum = to_base_enc(val, tok_hdr, sz);
+				dnum--;
+				if (isupper(tok[0])) {
+					*dnum = flag1;
+				} else {
+					*dnum = flag;
+				}
+
+				val = tok_hdr+sz - dnum-1;
+				if (dstSize + val + 1 > *dstsize) {
+					goto bail;
+				}
+				copy_bytes(&dst[dstSize], dnum, val);
+				dstSize += val;
+				dst[dstSize++] = src[i];
+			} else {
+				uint8_t *word = src+pos;
+				uint8_t num[15];
+				uint32_t val;
+				int converted;
+
+				/*
+				 * Encode literal numeric strings (5-9 digits, first digit 1-9).
+				 */
+				converted = 0;
+				if (isdigit(word[0]) && word[0] != '0' &&	/* strtoul() would skip a leading \v or \f and lose it */
+				    toklen > 4 && toklen < 10) {
+					copy_bytes(num, word, toklen);
+					num[toklen] = '\0';
+					val = strtoul((const char *)num, (char **)&word, 10);
+
+					if (*word == '\0') {	/* whole token was consumed, so it is purely numeric */
+						uint8_t tok_hdr[10], *dnum;
+						sz = sizeof (tok_hdr);
+						dnum = to_base_enc(val, tok_hdr, sz);
+						dnum--;
+						*dnum = flag2;
+
+						val = tok_hdr+sz - dnum-1;
+						if (dstSize + val + 1 > *dstsize) {
+							goto bail;
+						}
+						copy_bytes(&dst[dstSize], dnum, val);
+						dstSize += val;
+						dst[dstSize++] = src[i];
+						converted = 1;
+					}
+				}
+				if (!converted) {
+					if (*(src+pos) == flag || *(src+pos) == flag1 ||
+					    *(src+pos) == flag2 || *(src+pos) == '\\') {
+						dst[dstSize++] = '\\';
+					}
+					if (dstSize + toklen + 1 > *dstsize) {
+						goto bail;
+					}
+					copy_bytes(&dst[dstSize], src+pos, toklen+1);
+					dstSize += (toklen+1);
+				}
+			}
+			pos = i+1;
+		}
+	}
+	if (pos < size) {	/* trailing bytes after the last separator are copied verbatim */
+		uint32_t sz = size - pos;
+
+		if (dstSize + sz > *dstsize) {
+			goto bail;
+		}
+		copy_bytes(&dst[dstSize], src+pos, sz);
+		dstSize += sz;
+	}
+
+	*dstsize = dstSize;
+	rv = 1;
+
+bail:
+	hash_context_delete(&hctx);
+	list_context_delete(&lctx);
+	delete [] sorted_dict;	/* allocated with new dict_entry_t*[dictSize] */
+
+	return rv;
+}
+
+int
+DictFilter::Inverse_Dict(uint8_t *src, uint32_t srclen, uint8_t *dst, uint32_t *dstsize)
+{
+	uint32_t numWords, i, enclen, pos;
+	uint8_t *srcpos, *end, *dstpos, *dstend, c;
+	decode_dict_entry_t *w_dict;
+	int rv = 0;	/* becomes 1 only when the whole stream decodes */
+	/* Decodes src[0..srclen) into dst; *dstsize is the capacity on entry and the decoded length on success. */
+	end = src + srclen;
+	srcpos = (uint8_t *)memchr(src, ' ', srclen);	/* bounded search: src is not guaranteed to be NUL-terminated */
+	if (srcpos == NULL || srcpos - src > 12)
+		return (0);
+
+	numWords = from_base_enc(src, srcpos - src);
+	if (numWords > srclen / (WORD_MIN + 1)) return (0);	/* every entry needs >= WORD_MIN bytes plus a space */
+	srcpos++;
+	w_dict = new decode_dict_entry_t[numWords];
+	for (i = 0; i < numWords && srcpos < end; i++) {
+		uint8_t *w_src = srcpos;
+		srcpos = (uint8_t *)memchr(srcpos, ' ', end - srcpos);
+		if (srcpos == NULL || srcpos - w_src < WORD_MIN || srcpos - w_src > WORD_MAX)
+			goto fail;
+
+		w_dict[i].sz = srcpos - w_src;
+		w_dict[i].word = w_src;	/* points into src; the word bytes are not copied */
+		srcpos++;
+	}
+	if (i < numWords) goto fail;	/* dictionary is shorter than its header claims */
+
+	enclen = srclen - (srcpos - src);
+	dstpos = dst;
+	dstend = dst + *dstsize;
+	pos = 0;
+
+	for (i = 0; i < enclen && dstpos < dstend; i++) {
+		c = srcpos[i];
+		if (SEPARATOR[c]) {
+			uint32_t toklen = i - pos;
+			uint32_t dpos;
+
+			c = srcpos[pos];
+			if (toklen == 0) {
+				*dstpos++ = srcpos[i];
+			} else if (c == '\\') {	/* escaped literal: drop the backslash, keep the rest plus the separator */
+				if (dstpos + toklen > dstend) {
+					log_msg(LOG_ERR, 0, "Overflow in DICT decode.\n");
+					goto fail;
+				}
+				copy_bytes(dstpos, srcpos+pos+1, toklen);
+				dstpos += toklen;
+			} else if (c == flag) {	/* dictionary word, emitted as stored */
+				toklen--;
+				dpos = from_base_enc(srcpos+pos+1, toklen);
+
+				if (dpos >= numWords || dstpos + w_dict[dpos].sz + 1 > dstend) {	/* bad index, or no room for word + separator */
+					log_msg(LOG_ERR, 0, "Overflow in DICT decode.\n");
+					goto fail;
+				}
+				copy_bytes(dstpos, w_dict[dpos].word, w_dict[dpos].sz);
+				dstpos += w_dict[dpos].sz;
+				*dstpos++ = srcpos[i];
+			} else if (c == flag1) {	/* dictionary word with its first letter upper-cased */
+				toklen--;
+				dpos = from_base_enc(srcpos+pos+1, toklen);
+
+				if (dpos >= numWords || dstpos + w_dict[dpos].sz + 1 > dstend) {	/* bad index, or no room for word + separator */
+					log_msg(LOG_ERR, 0, "Overflow in DICT decode.\n");
+					goto fail;
+				}
+				*dstpos++ = toupper(*(w_dict[dpos].word));
+				copy_bytes(dstpos, w_dict[dpos].word+1, w_dict[dpos].sz-1);
+				dstpos += (w_dict[dpos].sz-1);
+				*dstpos++ = srcpos[i];
+			} else if (c == flag2) {	/* encoded number: print it back in decimal */
+				uint32_t n;
+
+				toklen--;
+				dpos = from_base_enc(srcpos+pos+1, toklen);
+				n = snprintf((char *)dstpos, dstend - dstpos, "%u", dpos);
+
+				if (n >= dstend - dstpos) {
+					log_msg(LOG_ERR, 0, "Overflow in DICT decode.\n");
+					goto fail;
+				}
+				dstpos += n;
+				*dstpos++ = srcpos[i];
+			} else {
+				if (dstpos + toklen + 1 > dstend) {
+					log_msg(LOG_ERR, 0, "Overflow in DICT decode.\n");
+					goto fail;
+				}
+				copy_bytes(dstpos, srcpos+pos, toklen+1);
+				dstpos += (toklen+1);
+			}
+			pos = i+1;
+		}
+	}
+
+	if (pos < i) {	/* trailing bytes after the last separator are copied verbatim */
+		if (dstpos + i - pos > dstend) {
+			log_msg(LOG_ERR, 0, "Overflow in DICT decode.\n");
+			goto fail;
+		}
+		copy_bytes(dstpos, srcpos+pos, i-pos);
+		dstpos += (i-pos);
+	}
+	*dstsize = dstpos - dst;
+	rv = 1;
+fail:
+	delete [] w_dict;	/* released on success and on every error path after allocation */
+	return (rv);
 }
 
 #ifdef  __cplusplus
 extern "C" {
 #endif
 
-void *
-new_dict_context()
-{
-	DictFilter *df = new DictFilter();
-	return (static_cast<void *>(df));
-}
-
-void
-delete_dict_context(void *dict_ctx)
-{
-	if (dict_ctx) {
-		DictFilter *df = static_cast<DictFilter *>(dict_ctx);
-		delete df;
-	}
-}
-
 int
-dict_encode(void *dict_ctx, uchar_t *from, uint64_t fromlen, uchar_t *to, uint64_t *dstlen)
+dict_encode(uint8_t *from, uint64_t fromlen, uint8_t *to, uint64_t *dstlen)
 {
-	DictFilter *df = static_cast<DictFilter *>(dict_ctx);
+	DictFilter *df = DictFilter::getInstance();
 	u32 fl;
 	u32 dl;
-	uchar_t *dst;
+	uint8_t *dst;
 	DEBUG_STAT_EN(double strt, en);
 
 	/*
@@ -299,12 +1058,13 @@ dict_encode(void *dict_ctx, uchar_t *from, uint64_t fromlen, uchar_t *to, uint64
 }
 
 int
-dict_decode(void *dict_ctx, uchar_t *from, uint64_t fromlen, uchar_t *to, uint64_t *dstlen)
+dict_decode(uint8_t *from, uint64_t fromlen, uint8_t *to, uint64_t *dstlen)
 {
-	DictFilter *df = static_cast<DictFilter *>(dict_ctx);
+	DictFilter *df = DictFilter::getInstance();
 	u32 fl;
 	u32 dl;
 	u8 *src;
+	int rv;
 	DEBUG_STAT_EN(double strt, en);
 
 	if (fromlen > UINT32_MAX) {
@@ -324,7 +1084,11 @@ dict_decode(void *dict_ctx, uchar_t *from, uint64_t fromlen, uchar_t *to, uint64
 	src = from + 4;
 	fl -= 4;
 
-	df->Inverse_Dict(src, fl, to, &dl);
+	rv = df->Inverse_Dict(src, fl, to, &dl);
+	if (!rv) {
+		log_msg(LOG_ERR, 0, "dict_decode: Failed.\n");
+		return (-1);
+	}
 	if (dl < *dstlen) {
 		log_msg(LOG_ERR, 0, "dict_decode: Expected: %" PRIu64 ", Got: %" PRIu64 "\n",
 		    *dstlen, dl);
diff --git a/filters/dict/DictFilter.h b/filters/dict/DictFilter.h
index 08187c6..97a76fb 100644
--- a/filters/dict/DictFilter.h
+++ b/filters/dict/DictFilter.h
@@ -39,11 +39,8 @@
 extern "C" {
 #endif
 
-void *new_dict_context();
-void delete_dict_context(void *dict_ctx);
-
-int dict_encode(void *dict_ctx, uchar_t *from, uint64_t fromlen, uchar_t *to, uint64_t *dstlen);
-int dict_decode(void *dict_ctx, uchar_t *from, uint64_t fromlen, uchar_t *to, uint64_t *dstlen);
+int dict_encode(uchar_t *from, uint64_t fromlen, uchar_t *to, uint64_t *dstlen);
+int dict_decode(uchar_t *from, uint64_t fromlen, uchar_t *to, uint64_t *dstlen);
 
 #ifdef  __cplusplus
 }
diff --git a/meta_stream.c b/meta_stream.c
index 67ab0a0..7901a8d 100644
--- a/meta_stream.c
+++ b/meta_stream.c
@@ -510,7 +510,9 @@ meta_ctx_create(void *pc, int file_version, int comp_fd)
 	pc_ctx_t *pctx = (pc_ctx_t *)pc;
 	meta_ctx_t *mctx;
 
-	mctx = (meta_ctx_t *)malloc(sizeof (meta_ctx_t));
+	slab_cache_add(METADATA_CHUNK_SIZE + METADATA_HDR_SZ);
+	slab_cache_add(sizeof (meta_ctx_t));
+	mctx = (meta_ctx_t *)slab_alloc(NULL, sizeof (meta_ctx_t));
 	if (!mctx) {
 		log_msg(LOG_ERR, 1, "Failed to allocate metadata context.");
 		return (NULL);
diff --git a/pcompress.c b/pcompress.c
index b093a0d..a524b93 100644
--- a/pcompress.c
+++ b/pcompress.c
@@ -259,15 +259,13 @@ preproc_compress(pc_ctx_t *pctx, compress_func_ptr cmp_func, void *src, uint64_t
 		int b_type;
 
 		if (analyzed)
-			b_type = PC_TYPE(actx.one_pct.btype);
+			b_type = PC_TYPE(actx.forty_pct.btype);
 		else
 			b_type = PC_TYPE(analyze_buffer_simple(from, fromlen));
 
 		if (b_type == TYPE_TEXT) {
-			void *dct = new_dict_context();
 			_dstlen = fromlen;
-			result = dict_encode(dct, from, fromlen, to, &_dstlen);
-			delete_dict_context(dct);
+			result = dict_encode(from, fromlen, to, &_dstlen);
 			if (result != -1) {
 				uchar_t *tmp;
 				tmp = from;
@@ -346,7 +344,7 @@ preproc_compress(pc_ctx_t *pctx, compress_func_ptr cmp_func, void *src, uint64_t
 	_dstlen = srclen;
 	DEBUG_STAT_EN(strt = get_wtime_millis());
 	result = cmp_func(src, srclen, dest+9, &_dstlen, level, chdr,
-	    (dict?TYPE_TEXT:btype), data);
+	    btype, data);
 	DEBUG_STAT_EN(en = get_wtime_millis());
 
 	if (result > -1 && _dstlen < srclen) {
@@ -407,6 +405,7 @@ preproc_decompress(pc_ctx_t *pctx, compress_func_ptr dec_func, void *src, uint64
 			memcpy(src, dst, _dstlen);
 			srclen = _dstlen;
 			*dstlen = _dstlen;
+			_dstlen = _dstlen1;
 		} else {
 			log_msg(LOG_ERR, 0, "Delta2 decoding failed.");
 			return (result);
@@ -436,13 +435,12 @@ preproc_decompress(pc_ctx_t *pctx, compress_func_ptr dec_func, void *src, uint64
 	}
 
 	if (type & PREPROC_TYPE_DICT) {
-		void *dct = new_dict_context();
-		result = dict_decode(dct, src, srclen, dst, &_dstlen);
-		delete_dict_context(dct);
+		result = dict_decode(src, srclen, dst, &_dstlen);
 		if (result != -1) {
 			memcpy(src, dst, _dstlen);
 			srclen = _dstlen;
 			*dstlen = _dstlen;
+			_dstlen = _dstlen1;
 		} else {
 			log_msg(LOG_ERR, 0, "DICT decoding failed.");
 			return (result);