From 2eaf151ca078c204d9adf250698d4ce2a792fefe Mon Sep 17 00:00:00 2001 From: Moinak Ghosh Date: Thu, 31 May 2012 13:06:40 +0530 Subject: [PATCH] New capability in allocator to add slab caches with user-specified size. Use specific slab caches in main program and Lzma to slightly reduce memory footprint. Fix missing hashtable counter update in allocator. Slight cleanup of repeated computation with macro in Lzma. --- Makefile | 8 +- allocator.c | 268 ++++++++++++++++++++++++++++++++---------------- allocator.h | 1 + lzma/LzmaEnc.c | 26 +++-- lzma/LzmaEnc.h | 1 + lzma_compress.c | 1 + main.c | 4 + 7 files changed, 210 insertions(+), 99 deletions(-) diff --git a/Makefile b/Makefile index bd06db6..ee12b3e 100644 --- a/Makefile +++ b/Makefile @@ -40,13 +40,19 @@ BAKFILES = *~ lzma/*~ RM = rm -f CPPFLAGS = -I. -I./lzma -D_7ZIP_ST -DNODEFAULT_PROPS -DFILE_OFFSET_BITS=64 \ - -D_REENTRANT -D__USE_SSE_INTRIN__ -DNDEBUG -D_LZMA_PROB32 + -D_REENTRANT -D__USE_SSE_INTRIN__ -D_LZMA_PROB32 VEC_FLAGS = -ftree-vectorize LOOP_OPTFLAGS = $(VEC_FLAGS) -floop-interchange -floop-block LDLIBS = -ldl -lbz2 $(ZLIB_DIR) -lz -lm +ifdef DEBUG +LINK = gcc -m64 -pthread -msse3 +COMPILE = gcc -m64 -g -msse3 -c +else LINK = gcc -m64 -pthread -msse3 COMPILE = gcc -m64 -O3 -msse3 -c +CPPFLAGS += -DNDEBUG +endif all: $(PROG) diff --git a/allocator.c b/allocator.c index 308e326..228578a 100644 --- a/allocator.c +++ b/allocator.c @@ -51,16 +51,21 @@ /* * Number of slabs: * 256 bytes to 1M in power of 2 steps: 13 - * 1M to 256M in linear steps of 1M: 256 + * 1M to 128M in linear steps of 1M: 128 + * 200 dynamic slab slots: 200 * * By doing this we try to get reasonable memory usage while not * sacrificing performance. */ -#define NUM_SLABS 269 #define NUM_POW2 13 -#define SLAB_START 256 +#define NUM_LINEAR 128 +#define NUM_SLAB_HASH 200 /* Dynamic slabs hashtable size. */ +#define NUM_SLABS (NUM_POW2 + NUM_LINEAR + NUM_SLAB_HASH) +#define SLAB_POS_HASH (NUM_POW2 + NUM_LINEAR) +#define SLAB_START_SZ 256 /* Starting slab size in Bytes. */ #define SLAB_START_POW2 8 /* 2 ^ SLAB_START_POW2 = SLAB_START. */ -#define HTABLE_SZ 16384 + +#define HTABLE_SZ 8192 #define TWOM (2UL * 1024UL * 1024UL) #define ONEM (1UL * 1024UL * 1024UL) @@ -72,22 +77,24 @@ static const unsigned int bv[] = { 0xFFFF0000 }; -struct bufentry { - void *ptr; - int slab_index; - struct bufentry *next; -}; struct slabentry { struct bufentry *avail; - struct bufentry *used; + struct slabentry *next; size_t sz; uint64_t allocs, hits; pthread_mutex_t slab_lock; }; +struct bufentry { + void *ptr; + struct slabentry *slab; + struct bufentry *next; +}; + static struct slabentry slabheads[NUM_SLABS]; static struct bufentry **htable; static pthread_mutex_t *hbucket_locks; static pthread_mutex_t htable_lock = PTHREAD_MUTEX_INITIALIZER; +static pthread_mutex_t slab_table_lock = PTHREAD_MUTEX_INITIALIZER; static int inited = 0; static uint64_t total_allocs, oversize_allocs, hash_collisions, hash_entries; @@ -117,10 +124,9 @@ slab_init() int nprocs; /* Initialize first NUM_POW2 power of 2 slots. */ - slab_sz = SLAB_START; + slab_sz = SLAB_START_SZ; for (i = 0; i < NUM_POW2; i++) { slabheads[i].avail = NULL; - slabheads[i].used = NULL; slabheads[i].sz = slab_sz; slabheads[i].allocs = 0; slabheads[i].hits = 0; @@ -129,10 +135,10 @@ slab_init() slab_sz *= 2; } - /* At this point slab_sz is 2M. So linear slots start at 2M. */ - for (i = NUM_POW2; i < NUM_SLABS; i++) { + /* At this point slab_sz is 1M. So linear slots start at 1M. 
*/ + for (i = NUM_POW2; i < SLAB_POS_HASH; i++) { slabheads[i].avail = NULL; - slabheads[i].used = NULL; + slabheads[i].next = NULL; slabheads[i].sz = slab_sz; slabheads[i].allocs = 0; slabheads[i].hits = 0; @@ -141,6 +147,14 @@ slab_init() slab_sz += ONEM; } + for (i = SLAB_POS_HASH; i < NUM_SLABS; i++) { + slabheads[i].avail = NULL; + slabheads[i].next = NULL; + slabheads[i].sz = 0; + slabheads[i].allocs = 0; + slabheads[i].hits = 0; + /* Do not init locks here. They will be inited on demand. */ + } htable = (struct bufentry **)calloc(HTABLE_SZ, sizeof (struct bufentry *)); hbucket_locks = (pthread_mutex_t *)malloc(HTABLE_SZ * sizeof (pthread_mutex_t)); @@ -172,19 +186,25 @@ slab_cleanup(int quiet) for (i=0; iavail) { + if (!quiet) { + fprintf(stderr, "%21llu %21llu %21llu\n",slab->sz, + slab->allocs, slab->hits); + } + slab->allocs = 0; + buf = slab->avail; + do { + buf1 = buf->next; + free(buf->ptr); + free(buf); + buf = buf1; + } while (buf); } - slabheads[i].allocs = 0; - buf = slabheads[i].avail; - do { - buf1 = buf->next; - free(buf->ptr); - free(buf); - buf = buf1; - } while (buf); + slab = slab->next; } } @@ -202,10 +222,10 @@ slab_cleanup(int quiet) buf = htable[i]; while (buf) { - if (buf->slab_index == -1) { + if (buf->slab == NULL) { nonfreed_oversize++; } else { - slabheads[buf->slab_index].allocs++; + buf->slab->allocs++; } buf1 = buf->next; free(buf->ptr); @@ -222,11 +242,32 @@ slab_cleanup(int quiet) fprintf(stderr, "==================================================================\n"); for (i=0; iallocs > 0) + fprintf(stderr, "%21llu %21llu\n", \ + slab->sz, slab->allocs); + slab = slab->next; + } while (slab); } } } + for (i=0; inext; + if (j > 0) free(pslab); + j++; + } while (slab); + } if (!quiet) fprintf(stderr, "\n\n"); } @@ -271,30 +312,91 @@ find_slot(unsigned int v) return (r); } +static void * +try_dynamic_slab(size_t size) +{ + uint32_t sindx; + struct slabentry *slab; + + /* Locate the hash slot for the size. */ + sindx = hash6432shift((unsigned long)size) & (NUM_SLAB_HASH - 1); + sindx += SLAB_POS_HASH; + if (slabheads[sindx].sz == 0) return (NULL); + + /* Linear search in the chained buckets. */ + slab = &slabheads[sindx]; + while (slab && slab->sz != size) { + slab = slab->next; + } + + return (slab); +} + +int +slab_cache_add(size_t size) +{ + uint32_t sindx; + struct slabentry *slab; + if (try_dynamic_slab(size)) return (0); /* Already added. */ + + /* Locate the hash slot for the size. */ + sindx = hash6432shift((unsigned long)size) & (NUM_SLAB_HASH - 1); + sindx += SLAB_POS_HASH; + + if (slabheads[sindx].sz == 0) { + pthread_mutex_init(&(slabheads[sindx].slab_lock), NULL); + pthread_mutex_lock(&(slabheads[sindx].slab_lock)); + slabheads[sindx].sz = size; + pthread_mutex_unlock(&(slabheads[sindx].slab_lock)); + } else { + slab = (struct slabentry *)malloc(sizeof (struct slabentry)); + if (!slab) return (0); + slab->avail = NULL; + slab->sz = size; + slab->allocs = 0; + slab->hits = 0; + pthread_mutex_init(&(slab->slab_lock), NULL); + + pthread_mutex_lock(&(slabheads[sindx].slab_lock)); + slabheads[sindx].next = slab; + slab->next = slabheads[sindx].next; + pthread_mutex_unlock(&(slabheads[sindx].slab_lock)); + } + return (1); +} + void * slab_alloc(void *p, size_t size) { - size_t slab_sz = SLAB_START; - int i, found; + size_t slab_sz = SLAB_START_SZ; + int i; size_t div; + void *ptr; + struct slabentry *slab; ATOMIC_ADD(total_allocs, 1); - found = -1; - if (size <= ONEM) { - /* First eleven slots are power of 2 sizes upto 1M. 
*/ - found = find_slot(size); - } else { - /* Next slots are in intervals of 1M. */ - div = size / ONEM; - if (size % ONEM) div++; - if (div < NUM_SLABS) found = div + NUM_POW2; + slab = NULL; + + /* First check if we can use a dynamic slab of this size. */ + slab = try_dynamic_slab(size); + + if (!slab) { + if (size <= ONEM) { + /* First eleven slots are power of 2 sizes upto 1M. */ + slab = &slabheads[find_slot(size)]; + } else { + /* Next slots are in intervals of 1M. */ + div = size / ONEM; + if (size % ONEM) div++; + if (div < NUM_LINEAR) slab = &slabheads[div + NUM_POW2]; + } } - if (found == -1) { + if (!slab) { struct bufentry *buf = (struct bufentry *)malloc(sizeof (struct bufentry)); uint32_t hindx; buf->ptr = malloc(size); - buf->slab_index = -1; + buf->slab = NULL; hindx = hash6432shift((unsigned long)(buf->ptr)) & (HTABLE_SZ - 1); pthread_mutex_lock(&hbucket_locks[hindx]); @@ -302,40 +404,33 @@ slab_alloc(void *p, size_t size) htable[hindx] = buf; pthread_mutex_unlock(&hbucket_locks[hindx]); ATOMIC_ADD(oversize_allocs, 1); + ATOMIC_ADD(hash_entries, 1); return (buf->ptr); } else { struct bufentry *buf; uint32_t hindx; - pthread_mutex_lock(&(slabheads[found].slab_lock)); - if (slabheads[found].avail == NULL) { - slabheads[found].allocs++; - pthread_mutex_unlock(&(slabheads[found].slab_lock)); + pthread_mutex_lock(&(slab->slab_lock)); + if (slab->avail == NULL) { + slab->allocs++; + pthread_mutex_unlock(&(slab->slab_lock)); buf = (struct bufentry *)malloc(sizeof (struct bufentry)); - buf->ptr = malloc(slabheads[found].sz); - buf->slab_index = found; - hindx = hash6432shift((unsigned long)(buf->ptr)) & (HTABLE_SZ - 1); - - if (htable[hindx]) ATOMIC_ADD(hash_collisions, 1); - pthread_mutex_lock(&hbucket_locks[hindx]); - buf->next = htable[hindx]; - htable[hindx] = buf; - pthread_mutex_unlock(&hbucket_locks[hindx]); - ATOMIC_ADD(hash_entries, 1); + buf->ptr = malloc(slab->sz); + buf->slab = slab; } else { - buf = slabheads[found].avail; - slabheads[found].avail = buf->next; - slabheads[found].hits++; - pthread_mutex_unlock(&(slabheads[found].slab_lock)); - hindx = hash6432shift((unsigned long)(buf->ptr)) & (HTABLE_SZ - 1); - - if (htable[hindx]) ATOMIC_ADD(hash_collisions, 1); - pthread_mutex_lock(&hbucket_locks[hindx]); - buf->next = htable[hindx]; - htable[hindx] = buf; - pthread_mutex_unlock(&hbucket_locks[hindx]); - ATOMIC_ADD(hash_entries, 1); + buf = slab->avail; + slab->avail = buf->next; + slab->hits++; + pthread_mutex_unlock(&(slab->slab_lock)); } + + hindx = hash6432shift((unsigned long)(buf->ptr)) & (HTABLE_SZ - 1); + if (htable[hindx]) ATOMIC_ADD(hash_collisions, 1); + pthread_mutex_lock(&hbucket_locks[hindx]); + buf->next = htable[hindx]; + htable[hindx] = buf; + pthread_mutex_unlock(&hbucket_locks[hindx]); + ATOMIC_ADD(hash_entries, 1); return (buf->ptr); } } @@ -355,30 +450,27 @@ slab_free(void *p, void *address) pbuf = NULL; while (buf) { if (buf->ptr == address) { - if (buf->slab_index == -1) { - if (pbuf) - pbuf->next = buf->next; - else - htable[hindx] = buf->next; - pthread_mutex_unlock(&hbucket_locks[hindx]); - ATOMIC_SUB(hash_entries, 1); + if (hash_entries <=0) { + fprintf(stderr, "Inconsistent allocation hash\n"); + abort(); + } + if (pbuf) + pbuf->next = buf->next; + else + htable[hindx] = buf->next; + pthread_mutex_unlock(&hbucket_locks[hindx]); + ATOMIC_SUB(hash_entries, 1); + if (buf->slab == NULL) { free(buf->ptr); free(buf); found = 1; break; } else { - if (pbuf) - pbuf->next = buf->next; - else - htable[hindx] = buf->next; - 
pthread_mutex_unlock(&hbucket_locks[hindx]); - ATOMIC_SUB(hash_entries, 1); - - pthread_mutex_lock(&(slabheads[buf->slab_index].slab_lock)); - buf->next = slabheads[buf->slab_index].avail; - slabheads[buf->slab_index].avail = buf; - pthread_mutex_unlock(&(slabheads[buf->slab_index].slab_lock)); + pthread_mutex_lock(&(buf->slab->slab_lock)); + buf->next = buf->slab->avail; + buf->slab->avail = buf; + pthread_mutex_unlock(&(buf->slab->slab_lock)); found = 1; break; } diff --git a/allocator.h b/allocator.h index be8df05..576627b 100644 --- a/allocator.h +++ b/allocator.h @@ -29,6 +29,7 @@ void slab_cleanup(int quiet); void *slab_alloc(void *p, size_t size); void *slab_calloc(void *p, size_t items, size_t size); void slab_free(void *p, void *address); +int slab_cache_add(size_t size); #endif diff --git a/lzma/LzmaEnc.c b/lzma/LzmaEnc.c index 19feb80..f73f0ac 100644 --- a/lzma/LzmaEnc.c +++ b/lzma/LzmaEnc.c @@ -45,6 +45,14 @@ static int ttt = 0; #define kNumBitPriceShiftBits 4 #define kBitPrice (1 << kNumBitPriceShiftBits) +#ifdef _LZMA_PROB32 +#define CLzmaProb UInt32 +#else +#define CLzmaProb UInt16 +#endif + +#define LITPROB_SZ(lclp) ((0x300 << lclp) * sizeof(CLzmaProb)) + #ifdef __USE_SSE_INTRIN__ #define MOV_DBL_QUAD(mem, val) __asm (\ "movntiq %[val], (%[ptr1]);"\ @@ -114,6 +122,8 @@ void LzmaEncProps_Init(CLzmaEncProps *p) void LzmaEncProps_Normalize(CLzmaEncProps *p) { int level = p->level; + unsigned lclp; + if (!p->normalized) { if (level < 0) level = 5; p->level = level; @@ -126,6 +136,8 @@ void LzmaEncProps_Normalize(CLzmaEncProps *p) if (p->btMode < 0) p->btMode = (p->algo == 0 ? 0 : 1); if (p->numHashBytes < 0) p->numHashBytes = 4; if (p->mc == 0) p->mc = (16 + (p->fb >> 1)) >> (p->btMode ? 0 : 1); + lclp = p->lc + p->lp; + p->litprob_sz = LITPROB_SZ(lclp); if (p->numThreads < 0) p->numThreads = #ifndef _7ZIP_ST @@ -238,12 +250,6 @@ typedef struct #define kNumFullDistances (1 << (kEndPosModelIndex >> 1)) -#ifdef _LZMA_PROB32 -#define CLzmaProb UInt32 -#else -#define CLzmaProb UInt16 -#endif - #define LZMA_PB_MAX 4 #define LZMA_LC_MAX 8 #define LZMA_LP_MAX 4 @@ -423,7 +429,7 @@ void LzmaEnc_SaveState(CLzmaEncHandle pp) memcpy(dest->posEncoders, p->posEncoders, sizeof(p->posEncoders)); memcpy(dest->posAlignEncoder, p->posAlignEncoder, sizeof(p->posAlignEncoder)); memcpy(dest->reps, p->reps, sizeof(p->reps)); - memcpy(dest->litProbs, p->litProbs, (0x300 << p->lclp) * sizeof(CLzmaProb)); + memcpy(dest->litProbs, p->litProbs, LITPROB_SZ(p->lclp)); } void LzmaEnc_RestoreState(CLzmaEncHandle pp) @@ -449,7 +455,7 @@ void LzmaEnc_RestoreState(CLzmaEncHandle pp) memcpy(dest->posEncoders, p->posEncoders, sizeof(p->posEncoders)); memcpy(dest->posAlignEncoder, p->posAlignEncoder, sizeof(p->posAlignEncoder)); memcpy(dest->reps, p->reps, sizeof(p->reps)); - memcpy(dest->litProbs, p->litProbs, (0x300 << dest->lclp) * sizeof(CLzmaProb)); + memcpy(dest->litProbs, p->litProbs, LITPROB_SZ(dest->lclp)); } SRes LzmaEnc_SetProps(CLzmaEncHandle pp, const CLzmaEncProps *props2) @@ -2063,8 +2069,8 @@ static SRes LzmaEnc_Alloc(CLzmaEnc *p, UInt32 keepWindowSize, ISzAlloc *alloc, I if (p->litProbs == 0 || p->saveState.litProbs == 0 || p->lclp != lclp) { LzmaEnc_FreeLits(p, alloc); - p->litProbs = (CLzmaProb *)alloc->Alloc(alloc, (0x300 << lclp) * sizeof(CLzmaProb)); - p->saveState.litProbs = (CLzmaProb *)alloc->Alloc(alloc, (0x300 << lclp) * sizeof(CLzmaProb)); + p->litProbs = (CLzmaProb *)alloc->Alloc(alloc, LITPROB_SZ(lclp)); + p->saveState.litProbs = (CLzmaProb *)alloc->Alloc(alloc, LITPROB_SZ(lclp)); if 
(p->litProbs == 0 || p->saveState.litProbs == 0) { LzmaEnc_FreeLits(p, alloc); diff --git a/lzma/LzmaEnc.h b/lzma/LzmaEnc.h index c71d2ff..9ce63af 100644 --- a/lzma/LzmaEnc.h +++ b/lzma/LzmaEnc.h @@ -30,6 +30,7 @@ typedef struct _CLzmaEncProps unsigned writeEndMark; /* 0 - do not write EOPM, 1 - write EOPM, default = 0 */ int numThreads; /* 1 or 2, default = 2 */ int normalized; + size_t litprob_sz; } CLzmaEncProps; extern void LzmaEncProps_Init(CLzmaEncProps *p); diff --git a/lzma_compress.c b/lzma_compress.c index 076ec63..0474c6b 100644 --- a/lzma_compress.c +++ b/lzma_compress.c @@ -77,6 +77,7 @@ lzma_init(void **data, int *level, ssize_t chunksize) if (*level > 9) *level = 9; p->level = *level; LzmaEncProps_Normalize(p); + slab_cache_add(p->litprob_sz); } *data = p; return (0); diff --git a/main.c b/main.c index eac8238..ae33b7d 100644 --- a/main.c +++ b/main.c @@ -659,6 +659,10 @@ start_compress(const char *filename, uint64_t chunksize, int level) nprocs = nthreads; fprintf(stderr, "Scaling to %d threads\n", nprocs); + slab_cache_add(chunksize); + slab_cache_add(compressed_chunksize + CHDR_SZ); + slab_cache_add(sizeof (struct cmp_data)); + dary = (struct cmp_data **)slab_alloc(NULL, sizeof (struct cmp_data *) * nprocs); cread_buf = (uchar_t *)slab_alloc(NULL, chunksize); if (!cread_buf) {
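
Usage note (not part of the patch): the hunks above add slab_cache_add() to allocator.h and call it from main.c and lzma_compress.c so that frequently used, oddly sized buffers (the chunk buffer, the compressed chunk buffer plus header, the per-thread cmp_data structures, and LZMA's literal-probability array) get exact-fit dynamic slabs instead of being rounded up to the next power-of-2 or 1MB-step slab size. The sketch below shows the intended calling pattern; it is a minimal sketch, assuming the allocator has already been initialized, and the chunksize value and setup_buffers() wrapper are hypothetical stand-ins. Only slab_cache_add(), slab_alloc(), slab_free() and slab_cleanup() are taken from allocator.h as shown in the patch.

#include <stdio.h>
#include "allocator.h"

/*
 * Hypothetical setup path mirroring start_compress() in main.c:
 * register an exact-fit cache for the chunk size, then allocate
 * and release a buffer from it.
 */
static int
setup_buffers(size_t chunksize)
{
	unsigned char *buf;

	/*
	 * slab_cache_add() returns 1 when a new dynamic slab was
	 * registered and 0 if the size is already cached (or the
	 * slot could not be allocated).
	 */
	if (!slab_cache_add(chunksize))
		fprintf(stderr, "Slab cache for %zu bytes already present\n",
		    chunksize);

	/* This request now hits the exact-fit dynamic slab. */
	buf = (unsigned char *)slab_alloc(NULL, chunksize);
	if (buf == NULL)
		return (1);

	/* ... fill and compress the chunk ... */

	slab_free(NULL, buf);
	return (0);
}

The lzma_compress.c hunk follows the same pattern: LzmaEncProps_Normalize() now records litprob_sz = LITPROB_SZ(lc + lp) = (0x300 << (lc + lp)) * sizeof(CLzmaProb), and lzma_init() registers that size, so the two litProbs allocations in LzmaEnc_Alloc() land in a dedicated slab rather than the next larger generic one. This exact-fit behaviour is where the "slightly reduce memory footprint" claim in the commit message comes from.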