diff --git a/Makefile b/Makefile
index bd06db6..ee12b3e 100644
--- a/Makefile
+++ b/Makefile
@@ -40,13 +40,19 @@
 BAKFILES = *~ lzma/*~
 RM = rm -f
 
 CPPFLAGS = -I. -I./lzma -D_7ZIP_ST -DNODEFAULT_PROPS -DFILE_OFFSET_BITS=64 \
-	-D_REENTRANT -D__USE_SSE_INTRIN__ -DNDEBUG -D_LZMA_PROB32
+	-D_REENTRANT -D__USE_SSE_INTRIN__ -D_LZMA_PROB32
 VEC_FLAGS = -ftree-vectorize
 LOOP_OPTFLAGS = $(VEC_FLAGS) -floop-interchange -floop-block
 LDLIBS = -ldl -lbz2 $(ZLIB_DIR) -lz -lm
 
+ifdef DEBUG
+LINK = gcc -m64 -pthread -msse3
+COMPILE = gcc -m64 -g -msse3 -c
+else
 LINK = gcc -m64 -pthread -msse3
 COMPILE = gcc -m64 -O3 -msse3 -c
+CPPFLAGS += -DNDEBUG
+endif
 
 all: $(PROG)
diff --git a/allocator.c b/allocator.c
index 308e326..228578a 100644
--- a/allocator.c
+++ b/allocator.c
@@ -51,16 +51,21 @@
 
 /*
  * Number of slabs:
  * 256 bytes to 1M in power of 2 steps: 13
- * 1M to 256M in linear steps of 1M: 256
+ * 1M to 128M in linear steps of 1M: 128
+ * 200 dynamic slab slots: 200
  *
  * By doing this we try to get reasonable memory usage while not
  * sacrificing performance.
  */
-#define NUM_SLABS 269
 #define NUM_POW2 13
-#define SLAB_START 256
+#define NUM_LINEAR 128
+#define NUM_SLAB_HASH 200 /* Dynamic slabs hashtable size. */
+#define NUM_SLABS (NUM_POW2 + NUM_LINEAR + NUM_SLAB_HASH)
+#define SLAB_POS_HASH (NUM_POW2 + NUM_LINEAR)
+#define SLAB_START_SZ 256 /* Starting slab size in Bytes. */
 #define SLAB_START_POW2 8 /* 2 ^ SLAB_START_POW2 = SLAB_START. */
-#define HTABLE_SZ 16384
+
+#define HTABLE_SZ 8192
 #define TWOM (2UL * 1024UL * 1024UL)
 #define ONEM (1UL * 1024UL * 1024UL)
@@ -72,22 +77,24 @@ static const unsigned int bv[] = {
 	0xFFFF0000
 };
 
-struct bufentry {
-	void *ptr;
-	int slab_index;
-	struct bufentry *next;
-};
 struct slabentry {
 	struct bufentry *avail;
-	struct bufentry *used;
+	struct slabentry *next;
 	size_t sz;
 	uint64_t allocs, hits;
 	pthread_mutex_t slab_lock;
 };
+struct bufentry {
+	void *ptr;
+	struct slabentry *slab;
+	struct bufentry *next;
+};
+
 static struct slabentry slabheads[NUM_SLABS];
 static struct bufentry **htable;
 static pthread_mutex_t *hbucket_locks;
 static pthread_mutex_t htable_lock = PTHREAD_MUTEX_INITIALIZER;
+static pthread_mutex_t slab_table_lock = PTHREAD_MUTEX_INITIALIZER;
 static int inited = 0;
 
 static uint64_t total_allocs, oversize_allocs, hash_collisions, hash_entries;
@@ -117,10 +124,9 @@ slab_init()
 	int nprocs;
 
 	/* Initialize first NUM_POW2 power of 2 slots. */
-	slab_sz = SLAB_START;
+	slab_sz = SLAB_START_SZ;
 	for (i = 0; i < NUM_POW2; i++) {
 		slabheads[i].avail = NULL;
-		slabheads[i].used = NULL;
 		slabheads[i].sz = slab_sz;
 		slabheads[i].allocs = 0;
 		slabheads[i].hits = 0;
@@ -129,10 +135,10 @@
 		slab_sz *= 2;
 	}
 
-	/* At this point slab_sz is 2M. So linear slots start at 2M. */
-	for (i = NUM_POW2; i < NUM_SLABS; i++) {
+	/* At this point slab_sz is 1M. So linear slots start at 1M. */
+	for (i = NUM_POW2; i < SLAB_POS_HASH; i++) {
 		slabheads[i].avail = NULL;
-		slabheads[i].used = NULL;
+		slabheads[i].next = NULL;
 		slabheads[i].sz = slab_sz;
 		slabheads[i].allocs = 0;
 		slabheads[i].hits = 0;
@@ -141,6 +147,14 @@
 		slab_sz += ONEM;
 	}
 
+	for (i = SLAB_POS_HASH; i < NUM_SLABS; i++) {
+		slabheads[i].avail = NULL;
+		slabheads[i].next = NULL;
+		slabheads[i].sz = 0;
+		slabheads[i].allocs = 0;
+		slabheads[i].hits = 0;
+		/* Do not init locks here. They will be inited on demand. */
+	}
 	htable = (struct bufentry **)calloc(HTABLE_SZ, sizeof (struct bufentry *));
 	hbucket_locks = (pthread_mutex_t *)malloc(HTABLE_SZ * sizeof (pthread_mutex_t));
@@ -172,19 +186,25 @@ slab_cleanup(int quiet)
 	for (i=0; i<NUM_SLABS; i++) {
-		if (slabheads[i].avail) {
-			if (!quiet) {
-				fprintf(stderr, "%21llu %21llu %21llu\n", slabheads[i].sz,
-				    slabheads[i].allocs, slabheads[i].hits);
+		slab = &slabheads[i];
+		while (slab) {
+			if (slab->avail) {
+				if (!quiet) {
+					fprintf(stderr, "%21llu %21llu %21llu\n",slab->sz,
+					    slab->allocs, slab->hits);
+				}
+				slab->allocs = 0;
+				buf = slab->avail;
+				do {
+					buf1 = buf->next;
+					free(buf->ptr);
+					free(buf);
+					buf = buf1;
+				} while (buf);
 			}
-			slabheads[i].allocs = 0;
-			buf = slabheads[i].avail;
-			do {
-				buf1 = buf->next;
-				free(buf->ptr);
-				free(buf);
-				buf = buf1;
-			} while (buf);
+			slab = slab->next;
 		}
 	}
@@ -202,10 +222,10 @@ slab_cleanup(int quiet)
 		buf = htable[i];
 		while (buf) {
-			if (buf->slab_index == -1) {
+			if (buf->slab == NULL) {
 				nonfreed_oversize++;
 			} else {
-				slabheads[buf->slab_index].allocs++;
+				buf->slab->allocs++;
 			}
 			buf1 = buf->next;
 			free(buf->ptr);
@@ -222,11 +242,32 @@ slab_cleanup(int quiet)
 			fprintf(stderr, "==================================================================\n");
 			for (i=0; i<NUM_SLABS; i++) {
+				slab = &slabheads[i];
+				do {
+					if (slab->allocs > 0)
+						fprintf(stderr, "%21llu %21llu\n", \
+						    slab->sz, slab->allocs);
+					slab = slab->next;
+				} while (slab);
 			}
 		}
 	}
+	for (i=0; i<NUM_SLABS; i++) {
+		j = 0;
+		slab = &slabheads[i];
+		do {
+			pslab = slab;
+			slab = slab->next;
+			if (j > 0) free(pslab);
+			j++;
+		} while (slab);
+	}
 	if (!quiet) fprintf(stderr, "\n\n");
 }
@@ -271,30 +312,91 @@ find_slot(unsigned int v)
 	return (r);
 }
 
+static void *
+try_dynamic_slab(size_t size)
+{
+	uint32_t sindx;
+	struct slabentry *slab;
+
+	/* Locate the hash slot for the size. */
+	sindx = hash6432shift((unsigned long)size) & (NUM_SLAB_HASH - 1);
+	sindx += SLAB_POS_HASH;
+	if (slabheads[sindx].sz == 0) return (NULL);
+
+	/* Linear search in the chained buckets. */
+	slab = &slabheads[sindx];
+	while (slab && slab->sz != size) {
+		slab = slab->next;
+	}
+
+	return (slab);
+}
+
+int
+slab_cache_add(size_t size)
+{
+	uint32_t sindx;
+	struct slabentry *slab;
+	if (try_dynamic_slab(size)) return (0); /* Already added. */
+
+	/* Locate the hash slot for the size. */
+	sindx = hash6432shift((unsigned long)size) & (NUM_SLAB_HASH - 1);
+	sindx += SLAB_POS_HASH;
+
+	if (slabheads[sindx].sz == 0) {
+		pthread_mutex_init(&(slabheads[sindx].slab_lock), NULL);
+		pthread_mutex_lock(&(slabheads[sindx].slab_lock));
+		slabheads[sindx].sz = size;
+		pthread_mutex_unlock(&(slabheads[sindx].slab_lock));
+	} else {
+		slab = (struct slabentry *)malloc(sizeof (struct slabentry));
+		if (!slab) return (0);
+		slab->avail = NULL;
+		slab->sz = size;
+		slab->allocs = 0;
+		slab->hits = 0;
+		pthread_mutex_init(&(slab->slab_lock), NULL);
+
+		pthread_mutex_lock(&(slabheads[sindx].slab_lock));
+		slab->next = slabheads[sindx].next;
+		slabheads[sindx].next = slab;
+		pthread_mutex_unlock(&(slabheads[sindx].slab_lock));
+	}
+	return (1);
+}
+
 void *
 slab_alloc(void *p, size_t size)
 {
-	size_t slab_sz = SLAB_START;
-	int i, found;
+	size_t slab_sz = SLAB_START_SZ;
+	int i;
 	size_t div;
+	void *ptr;
+	struct slabentry *slab;
 
 	ATOMIC_ADD(total_allocs, 1);
-	found = -1;
-	if (size <= ONEM) {
-		/* First eleven slots are power of 2 sizes upto 1M. */
-		found = find_slot(size);
-	} else {
-		/* Next slots are in intervals of 1M. */
-		div = size / ONEM;
-		if (size % ONEM) div++;
-		if (div < NUM_SLABS) found = div + NUM_POW2;
+	slab = NULL;
+
+	/* First check if we can use a dynamic slab of this size. */
+	slab = try_dynamic_slab(size);
+
+	if (!slab) {
+		if (size <= ONEM) {
+			/* First eleven slots are power of 2 sizes upto 1M. */
+			slab = &slabheads[find_slot(size)];
+		} else {
+			/* Next slots are in intervals of 1M. */
+			div = size / ONEM;
+			if (size % ONEM) div++;
+			if (div < NUM_LINEAR) slab = &slabheads[div + NUM_POW2];
+		}
 	}
 
-	if (found == -1) {
+	if (!slab) {
 		struct bufentry *buf = (struct bufentry *)malloc(sizeof (struct bufentry));
 		uint32_t hindx;
 
 		buf->ptr = malloc(size);
-		buf->slab_index = -1;
+		buf->slab = NULL;
 		hindx = hash6432shift((unsigned long)(buf->ptr)) & (HTABLE_SZ - 1);
 		pthread_mutex_lock(&hbucket_locks[hindx]);
@@ -302,40 +404,33 @@ slab_alloc(void *p, size_t size)
 		htable[hindx] = buf;
 		pthread_mutex_unlock(&hbucket_locks[hindx]);
 		ATOMIC_ADD(oversize_allocs, 1);
+		ATOMIC_ADD(hash_entries, 1);
 		return (buf->ptr);
 	} else {
 		struct bufentry *buf;
 		uint32_t hindx;
 
-		pthread_mutex_lock(&(slabheads[found].slab_lock));
-		if (slabheads[found].avail == NULL) {
-			slabheads[found].allocs++;
-			pthread_mutex_unlock(&(slabheads[found].slab_lock));
+		pthread_mutex_lock(&(slab->slab_lock));
+		if (slab->avail == NULL) {
+			slab->allocs++;
+			pthread_mutex_unlock(&(slab->slab_lock));
 			buf = (struct bufentry *)malloc(sizeof (struct bufentry));
-			buf->ptr = malloc(slabheads[found].sz);
-			buf->slab_index = found;
-			hindx = hash6432shift((unsigned long)(buf->ptr)) & (HTABLE_SZ - 1);
-
-			if (htable[hindx]) ATOMIC_ADD(hash_collisions, 1);
-			pthread_mutex_lock(&hbucket_locks[hindx]);
-			buf->next = htable[hindx];
-			htable[hindx] = buf;
-			pthread_mutex_unlock(&hbucket_locks[hindx]);
-			ATOMIC_ADD(hash_entries, 1);
+			buf->ptr = malloc(slab->sz);
+			buf->slab = slab;
 		} else {
-			buf = slabheads[found].avail;
-			slabheads[found].avail = buf->next;
-			slabheads[found].hits++;
-			pthread_mutex_unlock(&(slabheads[found].slab_lock));
-			hindx = hash6432shift((unsigned long)(buf->ptr)) & (HTABLE_SZ - 1);
-
-			if (htable[hindx]) ATOMIC_ADD(hash_collisions, 1);
-			pthread_mutex_lock(&hbucket_locks[hindx]);
-			buf->next = htable[hindx];
-			htable[hindx] = buf;
-			pthread_mutex_unlock(&hbucket_locks[hindx]);
-			ATOMIC_ADD(hash_entries, 1);
+			buf = slab->avail;
+			slab->avail = buf->next;
+			slab->hits++;
+			pthread_mutex_unlock(&(slab->slab_lock));
 		}
+
+		hindx = hash6432shift((unsigned long)(buf->ptr)) & (HTABLE_SZ - 1);
+		if (htable[hindx]) ATOMIC_ADD(hash_collisions, 1);
+		pthread_mutex_lock(&hbucket_locks[hindx]);
+		buf->next = htable[hindx];
+		htable[hindx] = buf;
+		pthread_mutex_unlock(&hbucket_locks[hindx]);
+		ATOMIC_ADD(hash_entries, 1);
 		return (buf->ptr);
 	}
 }
@@ -355,30 +450,27 @@ slab_free(void *p, void *address)
 	pbuf = NULL;
 	while (buf) {
 		if (buf->ptr == address) {
-			if (buf->slab_index == -1) {
-				if (pbuf)
-					pbuf->next = buf->next;
-				else
-					htable[hindx] = buf->next;
-				pthread_mutex_unlock(&hbucket_locks[hindx]);
-				ATOMIC_SUB(hash_entries, 1);
+			if (hash_entries <=0) {
+				fprintf(stderr, "Inconsistent allocation hash\n");
+				abort();
+			}
+			if (pbuf)
+				pbuf->next = buf->next;
+			else
+				htable[hindx] = buf->next;
+			pthread_mutex_unlock(&hbucket_locks[hindx]);
+			ATOMIC_SUB(hash_entries, 1);
+			if (buf->slab == NULL) {
 				free(buf->ptr);
 				free(buf);
 				found = 1;
 				break;
 			} else {
-				if (pbuf)
-					pbuf->next = buf->next;
-				else
-					htable[hindx] = buf->next;
-				pthread_mutex_unlock(&hbucket_locks[hindx]);
-				ATOMIC_SUB(hash_entries, 1);
-
-				pthread_mutex_lock(&(slabheads[buf->slab_index].slab_lock));
-				buf->next = slabheads[buf->slab_index].avail;
-				slabheads[buf->slab_index].avail = buf;
-				pthread_mutex_unlock(&(slabheads[buf->slab_index].slab_lock));
+				pthread_mutex_lock(&(buf->slab->slab_lock));
+				buf->next = buf->slab->avail;
+				buf->slab->avail = buf;
+				pthread_mutex_unlock(&(buf->slab->slab_lock));
 				found = 1;
 				break;
 			}
diff --git a/allocator.h b/allocator.h
index be8df05..576627b 100644
--- a/allocator.h
+++ b/allocator.h
@@ -29,6 +29,7 @@
 void slab_cleanup(int quiet);
 void *slab_alloc(void *p, size_t size);
 void *slab_calloc(void *p, size_t items, size_t size);
 void slab_free(void *p, void *address);
+int slab_cache_add(size_t size);
 
 #endif
diff --git a/lzma/LzmaEnc.c b/lzma/LzmaEnc.c
index 19feb80..f73f0ac 100644
--- a/lzma/LzmaEnc.c
+++ b/lzma/LzmaEnc.c
@@ -45,6 +45,14 @@ static int ttt = 0;
 #define kNumBitPriceShiftBits 4
 #define kBitPrice (1 << kNumBitPriceShiftBits)
 
+#ifdef _LZMA_PROB32
+#define CLzmaProb UInt32
+#else
+#define CLzmaProb UInt16
+#endif
+
+#define LITPROB_SZ(lclp) ((0x300 << lclp) * sizeof(CLzmaProb))
+
 #ifdef __USE_SSE_INTRIN__
 #define MOV_DBL_QUAD(mem, val) __asm (\
 	"movntiq %[val], (%[ptr1]);"\
@@ -114,6 +122,8 @@ void LzmaEncProps_Init(CLzmaEncProps *p)
 void LzmaEncProps_Normalize(CLzmaEncProps *p)
 {
   int level = p->level;
+  unsigned lclp;
+
   if (!p->normalized) {
   if (level < 0) level = 5;
   p->level = level;
@@ -126,6 +136,8 @@
   if (p->btMode < 0) p->btMode = (p->algo == 0 ? 0 : 1);
   if (p->numHashBytes < 0) p->numHashBytes = 4;
   if (p->mc == 0) p->mc = (16 + (p->fb >> 1)) >> (p->btMode ? 0 : 1);
+  lclp = p->lc + p->lp;
+  p->litprob_sz = LITPROB_SZ(lclp);
   if (p->numThreads < 0)
     p->numThreads =
 #ifndef _7ZIP_ST
@@ -238,12 +250,6 @@ typedef struct
 
 #define kNumFullDistances (1 << (kEndPosModelIndex >> 1))
 
-#ifdef _LZMA_PROB32
-#define CLzmaProb UInt32
-#else
-#define CLzmaProb UInt16
-#endif
-
 #define LZMA_PB_MAX 4
 #define LZMA_LC_MAX 8
 #define LZMA_LP_MAX 4
@@ -423,7 +429,7 @@ void LzmaEnc_SaveState(CLzmaEncHandle pp)
   memcpy(dest->posEncoders, p->posEncoders, sizeof(p->posEncoders));
   memcpy(dest->posAlignEncoder, p->posAlignEncoder, sizeof(p->posAlignEncoder));
   memcpy(dest->reps, p->reps, sizeof(p->reps));
-  memcpy(dest->litProbs, p->litProbs, (0x300 << p->lclp) * sizeof(CLzmaProb));
+  memcpy(dest->litProbs, p->litProbs, LITPROB_SZ(p->lclp));
 }
 
 void LzmaEnc_RestoreState(CLzmaEncHandle pp)
@@ -449,7 +455,7 @@
   memcpy(dest->posEncoders, p->posEncoders, sizeof(p->posEncoders));
   memcpy(dest->posAlignEncoder, p->posAlignEncoder, sizeof(p->posAlignEncoder));
   memcpy(dest->reps, p->reps, sizeof(p->reps));
-  memcpy(dest->litProbs, p->litProbs, (0x300 << dest->lclp) * sizeof(CLzmaProb));
+  memcpy(dest->litProbs, p->litProbs, LITPROB_SZ(dest->lclp));
 }
 
 SRes LzmaEnc_SetProps(CLzmaEncHandle pp, const CLzmaEncProps *props2)
@@ -2063,8 +2069,8 @@ static SRes LzmaEnc_Alloc(CLzmaEnc *p, UInt32 keepWindowSize, ISzAlloc *alloc, I
   if (p->litProbs == 0 || p->saveState.litProbs == 0 || p->lclp != lclp)
   {
     LzmaEnc_FreeLits(p, alloc);
-    p->litProbs = (CLzmaProb *)alloc->Alloc(alloc, (0x300 << lclp) * sizeof(CLzmaProb));
-    p->saveState.litProbs = (CLzmaProb *)alloc->Alloc(alloc, (0x300 << lclp) * sizeof(CLzmaProb));
+    p->litProbs = (CLzmaProb *)alloc->Alloc(alloc, LITPROB_SZ(lclp));
+    p->saveState.litProbs = (CLzmaProb *)alloc->Alloc(alloc, LITPROB_SZ(lclp));
     if (p->litProbs == 0 || p->saveState.litProbs == 0)
     {
       LzmaEnc_FreeLits(p, alloc);
diff --git a/lzma/LzmaEnc.h b/lzma/LzmaEnc.h
index c71d2ff..9ce63af 100644
--- a/lzma/LzmaEnc.h
+++ b/lzma/LzmaEnc.h
@@ -30,6 +30,7 @@ typedef struct _CLzmaEncProps
   unsigned writeEndMark; /* 0 - do not write EOPM, 1 - write EOPM, default = 0 */
   int numThreads; /* 1 or 2, default = 2 */
   int normalized;
+  size_t litprob_sz;
 } CLzmaEncProps;
 
 extern void LzmaEncProps_Init(CLzmaEncProps *p);
diff --git a/lzma_compress.c b/lzma_compress.c
index 076ec63..0474c6b 100644
--- a/lzma_compress.c
+++ b/lzma_compress.c
@@ -77,6 +77,7 @@ lzma_init(void **data, int *level, ssize_t chunksize)
 		if (*level > 9) *level = 9;
 		p->level = *level;
 		LzmaEncProps_Normalize(p);
+		slab_cache_add(p->litprob_sz);
 	}
 	*data = p;
 	return (0);
diff --git a/main.c b/main.c
index eac8238..ae33b7d 100644
--- a/main.c
+++ b/main.c
@@ -659,6 +659,10 @@ start_compress(const char *filename, uint64_t chunksize, int level)
 		nprocs = nthreads;
 	fprintf(stderr, "Scaling to %d threads\n", nprocs);
 
+	slab_cache_add(chunksize);
+	slab_cache_add(compressed_chunksize + CHDR_SZ);
+	slab_cache_add(sizeof (struct cmp_data));
+
 	dary = (struct cmp_data **)slab_alloc(NULL, sizeof (struct cmp_data *) * nprocs);
 	cread_buf = (uchar_t *)slab_alloc(NULL, chunksize);
 	if (!cread_buf) {
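
Usage note: the patch above adds slab_cache_add() so callers can pre-register allocation sizes that fit neither the power-of-2 nor the 1M-linear slots; main.c registers the chunk buffer sizes and lzma_compress.c registers the LZMA literal-probability array size. The sketch below shows how a caller is expected to drive the new API. It is illustrative only: the buffer size and loop count are made up, and it assumes slab_init() is declared in allocator.h alongside the functions shown in the hunk above.

#include <stdlib.h>

#include "allocator.h"

int
main(void)
{
	/* An odd size that maps to a dynamic slab rather than a fixed slot (made-up value). */
	size_t sz = (4 * 1024 * 1024) + 37;
	void *buf;
	int i;

	slab_init();
	slab_cache_add(sz);	/* Register the size once, before the first allocation. */

	for (i = 0; i < 100; i++) {
		buf = slab_alloc(NULL, sz);
		if (buf == NULL)
			abort();
		/* ... use the buffer ... */
		slab_free(NULL, buf);	/* Buffer goes back onto the slab's avail list for reuse. */
	}

	slab_cleanup(0);	/* Print slab statistics and free the cached buffers. */
	return (0);
}

Registering the size up front matters because slab_alloc() otherwise routes any size above the linear range (or any unregistered odd size) through the oversize malloc() path, which defeats the buffer reuse the slab cache provides.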