Add extension-based file type detection and set the data type of each segment.

Use Bob Jenkins' minimal perfect hash to check for known extensions.
Use semaphore signaling and direct buffer copy for extraction.
Miscellaneous fixes.
This commit is contained in:
Moinak Ghosh 2013-11-07 21:48:54 +05:30
parent 489b97cc79
commit 991482403b
22 changed files with 4177 additions and 14 deletions

View file

@ -28,9 +28,11 @@ LINKLIB=pcompress
LIBVER=1 LIBVER=1
MAINSRCS = utils/utils.c allocator.c lzma_compress.c ppmd_compress.c \ MAINSRCS = utils/utils.c allocator.c lzma_compress.c ppmd_compress.c \
adaptive_compress.c lzfx_compress.c lz4_compress.c none_compress.c \ adaptive_compress.c lzfx_compress.c lz4_compress.c none_compress.c \
utils/xxhash_base.c utils/heap.c utils/cpuid.c archive/pc_archive.c pcompress.c utils/xxhash_base.c utils/heap.c utils/cpuid.c archive/pc_archive.c \
utils/phash/phash.c utils/phash/lookupa.c utils/phash/recycle.c pcompress.c
MAINHDRS = allocator.h pcompress.h utils/utils.h utils/xxhash.h utils/heap.h \ MAINHDRS = allocator.h pcompress.h utils/utils.h utils/xxhash.h utils/heap.h \
utils/cpuid.h utils/xxhash.h archive/pc_archive.h utils/cpuid.h utils/xxhash.h archive/pc_archive.h utils/phash/standard.h \
utils/phash/lookupa.h utils/phash/recycle.h utils/phash/phash.h
MAINOBJS = $(MAINSRCS:.c=.o) MAINOBJS = $(MAINSRCS:.c=.o)
PROGSRCS = main.c PROGSRCS = main.c

View file

@ -45,12 +45,22 @@
#include <archive.h> #include <archive.h>
#include <archive_entry.h> #include <archive_entry.h>
#include "pc_archive.h" #include "pc_archive.h"
#include <phash/phash.h>
#include <phash/extensions.h>
#include <phash/standard.h>
#undef _FEATURES_H #undef _FEATURES_H
#define _XOPEN_SOURCE 700 #define _XOPEN_SOURCE 700
#include <ftw.h> #include <ftw.h>
#include <stdint.h> #include <stdint.h>
/* Set to 1 once init_archive_mod() has built exthtab; read and written under init_mutex. */
static int inited = 0;
/* Serializes the one-time table setup performed by init_archive_mod(). */
pthread_mutex_t init_mutex = PTHREAD_MUTEX_INITIALIZER;
/*
 * Extension lookup table, indexed by the phash() slot of an extension string.
 * extnum is a numeric signature folded from the extension's characters and is
 * compared on lookup to reject strings that merely land in the same slot;
 * type is the data type copied from the matching extlist entry.
 */
static struct ext_hash_entry {
uint64_t extnum;
int type;
} *exthtab = NULL;
/* /*
AE_IFREG Regular file AE_IFREG Regular file
AE_IFLNK Symbolic link AE_IFLNK Symbolic link
@ -91,6 +101,8 @@ static struct arc_list_state {
pthread_mutex_t nftw_mutex = PTHREAD_MUTEX_INITIALIZER; pthread_mutex_t nftw_mutex = PTHREAD_MUTEX_INITIALIZER;
static int detect_type_by_ext(char *path, int pathlen);
/* /*
* Archive writer callback routines for archive creation operation. * Archive writer callback routines for archive creation operation.
*/ */
@ -148,6 +160,28 @@ creat_write_callback(struct archive *arc, void *ctx, const void *buf, size_t len
uchar_t *tbuf; uchar_t *tbuf;
tbuf = pctx->arc_buf + pctx->arc_buf_pos; tbuf = pctx->arc_buf + pctx->arc_buf_pos;
if (pctx->btype != pctx->ctype) {
if (pctx->btype == TYPE_UNKNOWN || pctx->arc_buf_pos == 0) {
pctx->btype = pctx->ctype;
} else {
if (pctx->arc_buf_pos < pctx->min_chunk) {
uint32_t diff = pctx->min_chunk - pctx->arc_buf_pos;
if (len > diff)
pctx->btype = pctx->ctype;
else
pctx->ctype = pctx->btype;
} else {
pctx->arc_writing = 0;
sem_post(&(pctx->read_sem));
sem_wait(&(pctx->write_sem));
tbuf = pctx->arc_buf + pctx->arc_buf_pos;
pctx->arc_writing = 1;
if (remaining > 0)
pctx->btype = pctx->ctype;
}
}
}
if (remaining > pctx->arc_buf_size - pctx->arc_buf_pos) { if (remaining > pctx->arc_buf_size - pctx->arc_buf_pos) {
size_t nlen = pctx->arc_buf_size - pctx->arc_buf_pos; size_t nlen = pctx->arc_buf_size - pctx->arc_buf_pos;
memcpy(tbuf, buff, nlen); memcpy(tbuf, buff, nlen);
@ -189,9 +223,12 @@ archiver_read(void *ctx, void *buf, uint64_t count)
pctx->arc_buf = buf; pctx->arc_buf = buf;
pctx->arc_buf_size = count; pctx->arc_buf_size = count;
pctx->arc_buf_pos = 0; pctx->arc_buf_pos = 0;
pctx->btype = TYPE_UNKNOWN;
sem_post(&(pctx->write_sem)); sem_post(&(pctx->write_sem));
sem_wait(&(pctx->read_sem)); sem_wait(&(pctx->read_sem));
pctx->arc_buf = NULL; pctx->arc_buf = NULL;
if (pctx->btype == TYPE_UNKNOWN)
pctx->btype = TYPE_GENERIC;
return (pctx->arc_buf_pos); return (pctx->arc_buf_pos);
} }
@ -229,8 +266,9 @@ extract_read_callback(struct archive *arc, void *ctx, const void **buf)
if (pctx->arc_closed) { if (pctx->arc_closed) {
pctx->arc_buf_size = 0; pctx->arc_buf_size = 0;
log_msg(LOG_WARN, 0, "End of file.");
archive_set_error(arc, ARCHIVE_EOF, "End of file."); archive_set_error(arc, ARCHIVE_EOF, "End of file.");
return (0); return (-1);
} }
if (!pctx->arc_writing) { if (!pctx->arc_writing) {
@ -242,8 +280,9 @@ extract_read_callback(struct archive *arc, void *ctx, const void **buf)
if (pctx->arc_buf == NULL || pctx->arc_buf_size == 0) { if (pctx->arc_buf == NULL || pctx->arc_buf_size == 0) {
pctx->arc_buf_size = 0; pctx->arc_buf_size = 0;
log_msg(LOG_ERR, 0, "End of file when extracting archive.");
archive_set_error(arc, ARCHIVE_EOF, "End of file when extracting archive."); archive_set_error(arc, ARCHIVE_EOF, "End of file when extracting archive.");
return (0); return (-1);
} }
pctx->arc_writing = 1; pctx->arc_writing = 1;
*buf = pctx->arc_buf; *buf = pctx->arc_buf;
@ -256,8 +295,10 @@ archiver_write(void *ctx, void *buf, uint64_t count)
{ {
pc_ctx_t *pctx = (pc_ctx_t *)ctx; pc_ctx_t *pctx = (pc_ctx_t *)ctx;
if (pctx->arc_closed) if (pctx->arc_closed) {
log_msg(LOG_WARN, 0, "Archive extractor closed unexpectedly");
return (0); return (0);
}
if (pctx->arc_buf != NULL) { if (pctx->arc_buf != NULL) {
log_msg(LOG_ERR, 0, "Incorrect sequencing of archiver_read() call."); log_msg(LOG_ERR, 0, "Incorrect sequencing of archiver_read() call.");
@ -321,7 +362,7 @@ compare_members_lt(member_entry_t *mem1, member_entry_t *mem2) {
* fetches the next entry in ascending order of the predetermined sort keys. * fetches the next entry in ascending order of the predetermined sort keys.
*/ */
static int static int
read_next_path(pc_ctx_t *pctx, char *fpath, char **namechars) read_next_path(pc_ctx_t *pctx, char *fpath, char **namechars, int *fpathlen)
{ {
short namelen; short namelen;
ssize_t rbytes; ssize_t rbytes;
@ -434,6 +475,7 @@ do_mmap:
buf = pctx->temp_mmap_buf + (pctx->temp_file_pos - pctx->temp_mmap_pos); buf = pctx->temp_mmap_buf + (pctx->temp_file_pos - pctx->temp_mmap_pos);
memcpy(fpath, buf, namelen); memcpy(fpath, buf, namelen);
fpath[namelen] = '\0'; fpath[namelen] = '\0';
*fpathlen = namelen;
n = namelen-1; n = namelen-1;
while (fpath[n] == '/' && n > 0) n--; while (fpath[n] == '/' && n > 0) n--;
@ -761,6 +803,7 @@ setup_extractor(pc_ctx_t *pctx)
} }
archive_read_support_format_all(arc); archive_read_support_format_all(arc);
pctx->archive_ctx = arc; pctx->archive_ctx = arc;
pctx->arc_writing = 0;
return (0); return (0);
} }
@ -771,7 +814,7 @@ setup_extractor(pc_ctx_t *pctx)
*/ */
static int static int
copy_file_data(pc_ctx_t *pctx, struct archive *arc, copy_file_data(pc_ctx_t *pctx, struct archive *arc,
struct archive *in_arc, struct archive_entry *entry) struct archive *in_arc, struct archive_entry *entry, int typ)
{ {
size_t sz, offset, len; size_t sz, offset, len;
ssize_t bytes_to_write; ssize_t bytes_to_write;
@ -804,6 +847,9 @@ copy_file_data(pc_ctx_t *pctx, struct archive *arc,
src = mapbuf; src = mapbuf;
wlen = len; wlen = len;
/* if (typ == TYPE_UNKNOWN)
pctx->ctype = detect_type_by_data(src, len);*/
/* /*
* Write the entire mmap-ed buffer. Since we are writing to the compressor * Write the entire mmap-ed buffer. Since we are writing to the compressor
* stage pipe there is no need for blocking. * stage pipe there is no need for blocking.
@ -825,7 +871,7 @@ copy_file_data(pc_ctx_t *pctx, struct archive *arc,
static int static int
write_entry(pc_ctx_t *pctx, struct archive *arc, struct archive *in_arc, write_entry(pc_ctx_t *pctx, struct archive *arc, struct archive *in_arc,
struct archive_entry *entry) struct archive_entry *entry, int typ)
{ {
int rv; int rv;
@ -842,7 +888,7 @@ write_entry(pc_ctx_t *pctx, struct archive *arc, struct archive *in_arc,
} }
if (archive_entry_size(entry) > 0) { if (archive_entry_size(entry) > 0) {
return (copy_file_data(pctx, arc, in_arc, entry)); return (copy_file_data(pctx, arc, in_arc, entry, typ));
} }
return (0); return (0);
@ -856,7 +902,7 @@ static void *
archiver_thread_func(void *dat) { archiver_thread_func(void *dat) {
pc_ctx_t *pctx = (pc_ctx_t *)dat; pc_ctx_t *pctx = (pc_ctx_t *)dat;
char fpath[PATH_MAX], *name, *bnchars = NULL; // Silence compiler char fpath[PATH_MAX], *name, *bnchars = NULL; // Silence compiler
int warn, rbytes; int warn, rbytes, fpathlen = 0; // Silence compiler
uint32_t ctr; uint32_t ctr;
struct archive_entry *entry, *spare_entry, *ent; struct archive_entry *entry, *spare_entry, *ent;
struct archive *arc, *ard; struct archive *arc, *ard;
@ -885,7 +931,9 @@ archiver_thread_func(void *dat) {
/* /*
* Read next path entry from list file. read_next_path() also handles sorted reading. * Read next path entry from list file. read_next_path() also handles sorted reading.
*/ */
while ((rbytes = read_next_path(pctx, fpath, &bnchars)) != 0) { while ((rbytes = read_next_path(pctx, fpath, &bnchars, &fpathlen)) != 0) {
int typ;
if (rbytes == -1) break; if (rbytes == -1) break;
archive_entry_copy_sourcepath(entry, fpath); archive_entry_copy_sourcepath(entry, fpath);
if (archive_read_disk_entry_from_file(ard, entry, -1, NULL) != ARCHIVE_OK) { if (archive_read_disk_entry_from_file(ard, entry, -1, NULL) != ARCHIVE_OK) {
@ -894,6 +942,11 @@ archiver_thread_func(void *dat) {
continue; continue;
} }
if (archive_entry_filetype(entry) == AE_IFREG) {
if ((typ = detect_type_by_ext(fpath, fpathlen)) != TYPE_UNKNOWN)
pctx->ctype = typ;
}
/* /*
* Strip leading '/' or '../' or '/../' from member name. * Strip leading '/' or '../' or '/../' from member name.
*/ */
@ -945,7 +998,7 @@ archiver_thread_func(void *dat) {
archive_entry_linkify(resolver, &entry, &spare_entry); archive_entry_linkify(resolver, &entry, &spare_entry);
ent = entry; ent = entry;
while (ent != NULL) { while (ent != NULL) {
if (write_entry(pctx, arc, ard, ent) != 0) { if (write_entry(pctx, arc, ard, ent, typ) != 0) {
goto done; goto done;
} }
ent = spare_entry; ent = spare_entry;
@ -1094,3 +1147,55 @@ int
start_extractor(pc_ctx_t *pctx) { start_extractor(pc_ctx_t *pctx) {
return (pthread_create(&(pctx->archive_thread), NULL, extractor_thread_func, (void *)pctx)); return (pthread_create(&(pctx->archive_thread), NULL, extractor_thread_func, (void *)pctx));
} }
int
init_archive_mod() {
int rv = 0;
pthread_mutex_lock(&init_mutex);
if (!inited) {
int i, j;
exthtab = malloc(NUM_EXT * sizeof (struct ext_hash_entry));
if (exthtab != NULL) {
for (i = 0; i < NUM_EXT; i++) {
uint64_t extnum;
ub4 slot = phash(extlist[i].ext, extlist[i].len);
extnum = 0;
for (j = 0; j < extlist[i].len; j++)
extnum = (extnum << 1) | extlist[i].ext[j];
exthtab[slot].extnum = extnum;
exthtab[slot].type = extlist[i].type;
}
inited = 1;
} else {
rv = 1;
}
}
pthread_mutex_unlock(&init_mutex);
return (rv);
}
/*
 * Guess the data type of a file from the extension in its path.
 *
 * path    - file path; only the text after the last '.' of the final path
 *           component is examined.
 * pathlen - number of characters in path.
 *
 * Returns the type recorded for the extension in exthtab, or TYPE_UNKNOWN
 * when the path has no extension, the extension is not in the table, or the
 * table has not been set up.
 */
static int
detect_type_by_ext(char *path, int pathlen)
{
	char *ext = NULL;
	ub4 slot;
	int i, len;
	uint64_t extnum;

	/*
	 * exthtab stays NULL if init_archive_mod() failed, and an empty path
	 * would otherwise make the scan below look at path[-1].
	 */
	if (exthtab == NULL || path == NULL || pathlen <= 0) goto out;

	/* Walk back to the last '.', stopping at a path separator or at index 0. */
	for (i = pathlen-1; i > 0 && path[i] != '.' && path[i] != PATHSEP_CHAR; i--);
	if (i == 0 || path[i] != '.') goto out;

	len = pathlen - i - 1;
	if (len == 0) goto out;
	ext = &path[i+1];

	/* Valid table indices are 0 .. NUM_EXT-1. */
	slot = phash(ext, len);
	if (slot >= NUM_EXT) goto out;

	/* Same signature computation as init_archive_mod() uses when filling the table. */
	extnum = 0;
	for (i = 0; i < len; i++)
		extnum = (extnum << 1) | ext[i];
	if (exthtab[slot].extnum == extnum)
		return (exthtab[slot].type);
out:
	return (TYPE_UNKNOWN);
}

View file

@ -38,6 +38,16 @@ typedef struct {
size_t size; size_t size;
} archive_list_entry_t; } archive_list_entry_t;
typedef enum {
TYPE_UNKNOWN = 0,
TYPE_GENERIC,
TYPE_COMPRESSED,
TYPE_EXE,
TYPE_TEXT,
TYPE_BINARY,
TYPE_JPEG
} data_type_t;
/* /*
* Archiving related functions. * Archiving related functions.
*/ */
@ -48,6 +58,7 @@ int start_extractor(pc_ctx_t *pctx);
int64_t archiver_read(void *ctx, void *buf, uint64_t count); int64_t archiver_read(void *ctx, void *buf, uint64_t count);
int64_t archiver_write(void *ctx, void *buf, uint64_t count); int64_t archiver_write(void *ctx, void *buf, uint64_t count);
int archiver_close(void *ctx); int archiver_close(void *ctx);
int init_archive_mod();
#ifdef __cplusplus #ifdef __cplusplus
} }

View file

@ -1795,6 +1795,7 @@ start_compress(pc_ctx_t *pctx, const char *filename, uint64_t chunksize, int lev
props.cksum = pctx->cksum; props.cksum = pctx->cksum;
props.buf_extra = 0; props.buf_extra = 0;
cread_buf = NULL; cread_buf = NULL;
pctx->btype = TYPE_UNKNOWN;
flags = 0; flags = 0;
sbuf.st_size = 0; sbuf.st_size = 0;
err = 0; err = 0;
@ -2355,6 +2356,7 @@ start_compress(pc_ctx_t *pctx, const char *filename, uint64_t chunksize, int lev
*/ */
tdat->id = pctx->chunk_num; tdat->id = pctx->chunk_num;
tdat->rbytes = rbytes; tdat->rbytes = rbytes;
tdat->btype = pctx->btype; // Have to copy btype for this buffer as pctx->btype will change
if ((pctx->enable_rabin_scan || pctx->enable_fixed_scan || pctx->enable_rabin_global)) { if ((pctx->enable_rabin_scan || pctx->enable_fixed_scan || pctx->enable_rabin_global)) {
tmp = tdat->cmp_seg; tmp = tdat->cmp_seg;
tdat->cmp_seg = cread_buf; tdat->cmp_seg = cread_buf;
@ -2665,6 +2667,7 @@ create_pc_context(void)
slab_init(); slab_init();
init_pcompress(); init_pcompress();
init_archive_mod();
memset(ctx, 0, sizeof (pc_ctx_t)); memset(ctx, 0, sizeof (pc_ctx_t));
ctx->exec_name = (char *)malloc(NAME_MAX); ctx->exec_name = (char *)malloc(NAME_MAX);
@ -2686,8 +2689,8 @@ destroy_pc_context(pc_ctx_t *pctx)
if (pctx->pwd_file) if (pctx->pwd_file)
free(pctx->pwd_file); free(pctx->pwd_file);
free((void *)(pctx->exec_name)); free((void *)(pctx->exec_name));
free(pctx);
slab_cleanup(pctx->hide_mem_stats); slab_cleanup(pctx->hide_mem_stats);
free(pctx);
} }
int DLL_EXPORT int DLL_EXPORT
@ -2904,7 +2907,7 @@ init_pc_context(pc_ctx_t *pctx, int argc, char *argv[])
* Sorting of members when archiving is enabled for compression levels >6 (>2 for lz4), * Sorting of members when archiving is enabled for compression levels >6 (>2 for lz4),
* unless it is explicitly disabled via '-n'. * unless it is explicitly disabled via '-n'.
*/ */
if (pctx->enable_archive_sort != -1) { if (pctx->enable_archive_sort != -1 && pctx->do_compress) {
if ((memcmp(pctx->algo, "lz4", 3) == 0 && pctx->level > 2) || pctx->level > 6) if ((memcmp(pctx->algo, "lz4", 3) == 0 && pctx->level > 2) || pctx->level > 6)
pctx->enable_archive_sort = 1; pctx->enable_archive_sort = 1;
} else { } else {
@ -2918,6 +2921,12 @@ init_pc_context(pc_ctx_t *pctx, int argc, char *argv[])
pctx->rab_blk_size = RAB_BLK_DEFAULT; pctx->rab_blk_size = RAB_BLK_DEFAULT;
} }
pctx->min_chunk = MIN_CHUNK;
if (pctx->enable_rabin_scan)
pctx->min_chunk = RAB_MIN_CHUNK_SIZE;
if (pctx->enable_rabin_global)
pctx->min_chunk = RAB_MIN_CHUNK_SIZE_GLOBAL;
/* /*
* Remaining mandatory arguments are the filenames. * Remaining mandatory arguments are the filenames.
*/ */

View file

@ -224,6 +224,8 @@ typedef struct pc_ctx {
uchar_t *arc_buf; uchar_t *arc_buf;
uint64_t arc_buf_size, arc_buf_pos; uint64_t arc_buf_size, arc_buf_pos;
int arc_closed, arc_writing; int arc_closed, arc_writing;
uchar_t btype, ctype;
int min_chunk;
unsigned int chunk_num; unsigned int chunk_num;
uint64_t largest_chunk, smallest_chunk, avg_chunk; uint64_t largest_chunk, smallest_chunk, avg_chunk;
@ -268,6 +270,7 @@ struct cmp_data {
mac_ctx_t chunk_hmac; mac_ctx_t chunk_hmac;
algo_props_t *props; algo_props_t *props;
int decompressing; int decompressing;
uchar_t btype;
pc_ctx_t *pctx; pc_ctx_t *pctx;
}; };

23
utils/phash/Makefile Normal file
View file

@ -0,0 +1,23 @@
# Builds the "perfect" hash generator and then runs genhash.sh, which
# regenerates extensions.h and invokes ./perfect on the extension list.

CFLAGS = -O

# Every source in this directory is a .c file, so the suffix rule has to be
# .c.o (a .cc.o rule would never be selected).
.c.o:
	gcc $(CFLAGS) -c $<

O = lookupa.o recycle.o perfhex.o perfect.o

# Neither target names a file it creates.
.PHONY: all clean

all : $(O)
	gcc -o perfect $(O) -lm
	sh genhash.sh

clean:
	rm -f perfect foo *.o

# DEPENDENCIES

lookupa.o : lookupa.c standard.h lookupa.h
recycle.o : recycle.c standard.h recycle.h
perfhex.o : perfhex.c standard.h lookupa.h recycle.h perfect.h
perfect.o : perfect.c standard.h lookupa.h recycle.h perfect.h

19
utils/phash/Makefile.test Normal file
View file

@ -0,0 +1,19 @@
# Builds "foo", a test program linking the generated phash with testperf.

CFLAGS = -O

# Every source in this directory is a .c file, so the suffix rule has to be
# .c.o (a .cc.o rule would never be selected).
.c.o:
	gcc $(CFLAGS) -c $<

O = lookupa.o recycle.o phash.o testperf.o

foo : $(O)
	gcc -o foo $(O) -lm

# DEPENDENCIES

lookupa.o : lookupa.c standard.h lookupa.h
recycle.o : recycle.c standard.h recycle.h
phash.o : phash.c standard.h phash.h lookupa.h
testperf.o : testperf.c standard.h recycle.h phash.h

115
utils/phash/extensions.h Normal file
View file

@ -0,0 +1,115 @@
/* Generated File. DO NOT EDIT. */
/*
* List of extensions and their types.
*/
#ifndef __EXT_H__
#define __EXT_H__
struct ext_entry {
char *ext;
int type;
int len;
} extlist[] = {
{"c" , TYPE_TEXT, 1},
{"h" , TYPE_TEXT, 1},
{"cc" , TYPE_TEXT, 2},
{"cpp" , TYPE_TEXT, 3},
{"c++" , TYPE_TEXT, 3},
{"hpp" , TYPE_TEXT, 3},
{"txt" , TYPE_TEXT, 3},
{"html" , TYPE_TEXT, 4},
{"htm" , TYPE_TEXT, 3},
{"xml" , TYPE_TEXT, 3},
{"info" , TYPE_TEXT, 4},
{"ppm" , TYPE_TEXT, 3},
{"svg" , TYPE_TEXT, 3},
{"conf" , TYPE_TEXT, 4},
{"py" , TYPE_TEXT, 2},
{"rb" , TYPE_TEXT, 2},
{"xpm" , TYPE_TEXT, 3},
{"js" , TYPE_TEXT, 2},
{"jsp" , TYPE_TEXT, 3},
{"pl" , TYPE_TEXT, 2},
{"tcl" , TYPE_TEXT, 3},
{"sh" , TYPE_TEXT, 2},
{"php" , TYPE_TEXT, 3},
{"bat" , TYPE_TEXT, 3},
{"pm" , TYPE_TEXT, 2},
{"r" , TYPE_TEXT, 1},
{"d" , TYPE_TEXT, 1},
{"bas" , TYPE_TEXT, 3},
{"asm" , TYPE_TEXT, 3},
{"go" , TYPE_TEXT, 2},
{"java" , TYPE_TEXT, 4},
{"m4" , TYPE_TEXT, 2},
{"vb" , TYPE_TEXT, 2},
{"xslt" , TYPE_TEXT, 4},
{"yacc" , TYPE_TEXT, 4},
{"lex" , TYPE_TEXT, 3},
{"csv" , TYPE_TEXT, 3},
{"shtml" , TYPE_TEXT, 5},
{"xhtml" , TYPE_TEXT, 5},
{"xht" , TYPE_TEXT, 3},
{"asp" , TYPE_TEXT, 3},
{"aspx" , TYPE_TEXT, 4},
{"rss" , TYPE_TEXT, 3},
{"atom" , TYPE_TEXT, 4},
{"cgi" , TYPE_TEXT, 3},
{"c#" , TYPE_TEXT, 2},
{"cob" , TYPE_TEXT, 3},
{"ada" , TYPE_TEXT, 3},
{"ini" , TYPE_TEXT, 3},
{"y" , TYPE_TEXT, 1},
{"swg" , TYPE_TEXT, 3},
{"s" , TYPE_TEXT, 1},
{"ps" , TYPE_TEXT, 2},
{"bib" , TYPE_TEXT, 3},
{"lua" , TYPE_TEXT, 3},
{"qml" , TYPE_TEXT, 3},
{"exe" , TYPE_EXE, 3},
{"dll" , TYPE_EXE, 3},
{"bin" , TYPE_EXE, 3},
{"o" , TYPE_EXE, 1},
{"a" , TYPE_EXE, 1},
{"obj" , TYPE_EXE, 3},
{"so" , TYPE_EXE, 2},
{"com" , TYPE_EXE, 3},
{"xpi" , TYPE_EXE, 3},
{"off" , TYPE_EXE, 3},
{"pdf" , TYPE_COMPRESSED, 3},
{"jpg" , TYPE_JPEG, 3},
{"jpeg" , TYPE_JPEG, 4},
{"png" , TYPE_COMPRESSED, 3},
{"mp3" , TYPE_COMPRESSED, 3},
{"wma" , TYPE_COMPRESSED, 3},
{"divx" , TYPE_COMPRESSED, 4},
{"mp4" , TYPE_COMPRESSED, 3},
{"aac" , TYPE_COMPRESSED, 3},
{"m4a" , TYPE_COMPRESSED, 3},
{"m4p" , TYPE_COMPRESSED, 3},
{"ofs" , TYPE_COMPRESSED, 3},
{"ofr" , TYPE_COMPRESSED, 3},
{"flac" , TYPE_COMPRESSED, 4},
{"pac" , TYPE_COMPRESSED, 3},
{"gif" , TYPE_COMPRESSED, 3},
{"jp2" , TYPE_JPEG, 3},
{"gz" , TYPE_COMPRESSED, 2},
{"bz2" , TYPE_COMPRESSED, 3},
{"zip" , TYPE_COMPRESSED, 3},
{"arj" , TYPE_COMPRESSED, 3},
{"arc" , TYPE_COMPRESSED, 3},
{"jar" , TYPE_COMPRESSED, 3},
{"lz" , TYPE_COMPRESSED, 2},
{"lzh" , TYPE_COMPRESSED, 3},
{"lzma" , TYPE_COMPRESSED, 4},
{"lzo" , TYPE_COMPRESSED, 3},
{"dmg" , TYPE_COMPRESSED, 3},
{"7z" , TYPE_COMPRESSED, 2},
{"uha" , TYPE_COMPRESSED, 3},
{"alz" , TYPE_COMPRESSED, 3},
{"ace" , TYPE_COMPRESSED, 3},
{"xcf" , TYPE_BINARY, 3},
};
#define NUM_EXT (99)
#endif

View file

@ -0,0 +1,99 @@
c,TYPE_TEXT
h,TYPE_TEXT
cc,TYPE_TEXT
cpp,TYPE_TEXT
c++,TYPE_TEXT
hpp,TYPE_TEXT
txt,TYPE_TEXT
html,TYPE_TEXT
htm,TYPE_TEXT
xml,TYPE_TEXT
info,TYPE_TEXT
ppm,TYPE_TEXT
svg,TYPE_TEXT
conf,TYPE_TEXT
py,TYPE_TEXT
rb,TYPE_TEXT
xpm,TYPE_TEXT
js,TYPE_TEXT
jsp,TYPE_TEXT
pl,TYPE_TEXT
tcl,TYPE_TEXT
sh,TYPE_TEXT
php,TYPE_TEXT
bat,TYPE_TEXT
pm,TYPE_TEXT
r,TYPE_TEXT
d,TYPE_TEXT
bas,TYPE_TEXT
asm,TYPE_TEXT
go,TYPE_TEXT
java,TYPE_TEXT
m4,TYPE_TEXT
vb,TYPE_TEXT
xslt,TYPE_TEXT
yacc,TYPE_TEXT
lex,TYPE_TEXT
csv,TYPE_TEXT
shtml,TYPE_TEXT
xhtml,TYPE_TEXT
xht,TYPE_TEXT
asp,TYPE_TEXT
aspx,TYPE_TEXT
rss,TYPE_TEXT
atom,TYPE_TEXT
cgi,TYPE_TEXT
c#,TYPE_TEXT
cob,TYPE_TEXT
ada,TYPE_TEXT
ini,TYPE_TEXT
y,TYPE_TEXT
swg,TYPE_TEXT
s,TYPE_TEXT
ps,TYPE_TEXT
bib,TYPE_TEXT
lua,TYPE_TEXT
qml,TYPE_TEXT
exe,TYPE_EXE
dll,TYPE_EXE
bin,TYPE_EXE
o,TYPE_EXE
a,TYPE_EXE
obj,TYPE_EXE
so,TYPE_EXE
com,TYPE_EXE
xpi,TYPE_EXE
off,TYPE_EXE
pdf,TYPE_COMPRESSED
jpg,TYPE_JPEG
jpeg,TYPE_JPEG
png,TYPE_COMPRESSED
mp3,TYPE_COMPRESSED
wma,TYPE_COMPRESSED
divx,TYPE_COMPRESSED
mp4,TYPE_COMPRESSED
aac,TYPE_COMPRESSED
m4a,TYPE_COMPRESSED
m4p,TYPE_COMPRESSED
ofs,TYPE_COMPRESSED
ofr,TYPE_COMPRESSED
flac,TYPE_COMPRESSED
pac,TYPE_COMPRESSED
gif,TYPE_COMPRESSED
jp2,TYPE_JPEG
gz,TYPE_COMPRESSED
bz2,TYPE_COMPRESSED
zip,TYPE_COMPRESSED
arj,TYPE_COMPRESSED
arc,TYPE_COMPRESSED
jar,TYPE_COMPRESSED
lz,TYPE_COMPRESSED
lzh,TYPE_COMPRESSED
lzma,TYPE_COMPRESSED
lzo,TYPE_COMPRESSED
dmg,TYPE_COMPRESSED
7z,TYPE_COMPRESSED
uha,TYPE_COMPRESSED
alz,TYPE_COMPRESSED
ace,TYPE_COMPRESSED
xcf,TYPE_BINARY

36
utils/phash/genhash.sh Normal file
View file

@ -0,0 +1,36 @@
#!/bin/sh
#
# Regenerate extensions.h from extensions.txt and run ./perfect on the bare
# list of extensions.
#
# extensions.txt holds one "<extension>,<TYPE_xxx>" pair per line. For every
# pair an initializer row {"<extension>", <TYPE_xxx>, <length>} is written to
# extensions.h, and NUM_EXT is set to the number of rows actually written.
#
# Exit status is that of ./perfect.

echo '
/* Generated File. DO NOT EDIT. */
/*
 * List of extensions and their types.
 */

#ifndef __EXT_H__
#define __EXT_H__

struct ext_entry {
	char *ext;
	int type;
	int len;
} extlist[] = {' > extensions.h

rm -f extlist
count=0

# Reading via redirection (not a pipeline) keeps the loop in this shell so
# that 'count' survives it. The '|| [ -n "$line" ]' part picks up a final
# line that lacks a trailing newline.
while read -r line || [ -n "$line" ]
do
	# Skip blank lines and anything that is not "ext,type".
	case "$line" in
	?*,?*) ;;
	*) continue ;;
	esac

	ext=${line%%,*}
	rest=${line#*,}
	type=${rest%%,*}

	# The extension is data, never a printf format string.
	len=$(printf '%s' "$ext" | wc -c | tr -d '[:space:]')

	printf '%s\n' "$ext" >> extlist
	printf ' {"%s" , %s, %s},\n' "$ext" "$type" "$len" >> extensions.h
	count=$((count + 1))
done < extensions.txt

echo '};' >> extensions.h
echo "#define NUM_EXT (${count})" >> extensions.h
echo "#endif" >> extensions.h

# Presumably this emits the phash sources for the listed keys; confirm
# against perfect.c, which is not part of this script.
./perfect -nm < extlist
rv=$?
rm -f extlist
exit $rv

264
utils/phash/lookupa.c Normal file
View file

@ -0,0 +1,264 @@
/*
* This file is a part of Pcompress, a chunked parallel multi-
* algorithm lossless compression and decompression program.
*
* Copyright (C) 2012-2013 Moinak Ghosh. All rights reserved.
* Use is subject to license terms.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 3 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this program.
* If not, see <http://www.gnu.org/licenses/>.
*
* moinakg@belenix.org, http://moinakg.wordpress.com/
*
*/
/*
--------------------------------------------------------------------
lookupa.c, by Bob Jenkins, December 1996. Same as lookup2.c
Use this code however you wish. Public Domain. No warranty.
Source is http://burtleburtle.net/bob/c/lookupa.c
--------------------------------------------------------------------
*/
#ifndef STANDARD
#include "standard.h"
#endif
#ifndef LOOKUPA
#include "lookupa.h"
#endif
/*
--------------------------------------------------------------------
mix -- mix 3 32-bit values reversibly.
For every delta with one or two bit set, and the deltas of all three
high bits or all three low bits, whether the original value of a,b,c
is almost all zero or is uniformly distributed,
* If mix() is run forward or backward, at least 32 bits in a,b,c
have at least 1/4 probability of changing.
* If mix() is run forward, every bit of c will change between 1/3 and
2/3 of the time. (Well, 22/100 and 78/100 for some 2-bit deltas.)
mix() was built out of 36 single-cycle latency instructions in a
structure that could supported 2x parallelism, like so:
a -= b;
a -= c; x = (c>>13);
b -= c; a ^= x;
b -= a; x = (a<<8);
c -= a; b ^= x;
c -= b; x = (b>>13);
...
Unfortunately, superscalar Pentiums and Sparcs can't take advantage
of that parallelism. They've also turned some of those single-cycle
latency instructions into multi-cycle latency instructions. Still,
this is the fastest good hash I could find. There were about 2^^68
to choose from. I only looked at a billion or so.
--------------------------------------------------------------------
*/
/* Reversibly mix three 32-bit lvalues in place; a, b and c are all read and written. */
#define mix(a,b,c) \
do { \
  (a) -= (b); (a) -= (c); (a) ^= ((c) >> 13); \
  (b) -= (c); (b) -= (a); (b) ^= ((a) << 8);  \
  (c) -= (a); (c) -= (b); (c) ^= ((b) >> 13); \
  (a) -= (b); (a) -= (c); (a) ^= ((c) >> 12); \
  (b) -= (c); (b) -= (a); (b) ^= ((a) << 16); \
  (c) -= (a); (c) -= (b); (c) ^= ((b) >> 5);  \
  (a) -= (b); (a) -= (c); (a) ^= ((c) >> 3);  \
  (b) -= (c); (b) -= (a); (b) ^= ((a) << 10); \
  (c) -= (a); (c) -= (b); (c) ^= ((b) >> 15); \
} while (0)
/*
--------------------------------------------------------------------
lookup() -- hash a variable-length key into a 32-bit value
k : the key (the unaligned variable-length array of bytes)
len : the length of the key, counting by bytes
level : can be any 4-byte value
Returns a 32-bit value. Every bit of the key affects every bit of
the return value. Every 1-bit and 2-bit delta achieves avalanche.
About 6len+35 instructions.
The best hash table sizes are powers of 2. There is no need to do
mod a prime (mod is sooo slow!). If you need less than 32 bits,
use a bitmask. For example, if you need only 10 bits, do
h = (h & hashmask(10));
In which case, the hash table should have hashsize(10) elements.
If you are hashing n strings (ub1 **)k, do it like this:
for (i=0, h=0; i<n; ++i) h = lookup( k[i], len[i], h);
By Bob Jenkins, 1996. bob_jenkins@burtleburtle.net. You may use this
code any way you wish, private, educational, or commercial.
See http://burtleburtle.net/bob/hash/evahash.html
Use for hash table lookup, or anything where one collision in 2^32 is
acceptable. Do NOT use for cryptographic purposes.
--------------------------------------------------------------------
*/
/* k: the key */
/* length: the length of the key */
/* level: the previous hash, or an arbitrary value */
/*
 * Hash the 'length' bytes at k into a 32-bit value, seeded with 'level'.
 * Bytes are consumed 12 at a time, little-endian, into a, b and c with a
 * mix() after each group; the remaining 0..11 bytes and the total length
 * are folded in before a final mix(). The result is the final value of c.
 */
ub4
lookup(ub1 *k, ub4 length, ub4 level)
{
register ub4 a,b,c,len;
/* Set up the internal state */
len = length;
a = b = 0x9e3779b9; /* the golden ratio; an arbitrary value */
c = level; /* the previous hash value */
/*---------------------------------------- handle most of the key */
while (len >= 12)
{
a += (k[0] +((ub4)k[1]<<8) +((ub4)k[2]<<16) +((ub4)k[3]<<24));
b += (k[4] +((ub4)k[5]<<8) +((ub4)k[6]<<16) +((ub4)k[7]<<24));
c += (k[8] +((ub4)k[9]<<8) +((ub4)k[10]<<16)+((ub4)k[11]<<24));
mix(a,b,c);
k += 12; len -= 12;
}
/*------------------------------------- handle the last 11 bytes */
/* The full key length goes into c, not the remainder left after the loop. */
c += length;
switch(len) /* all the case statements fall through */
{
case 11: c+=((ub4)k[10]<<24);
case 10: c+=((ub4)k[9]<<16);
case 9 : c+=((ub4)k[8]<<8);
/* the first byte of c is reserved for the length */
case 8 : b+=((ub4)k[7]<<24);
case 7 : b+=((ub4)k[6]<<16);
case 6 : b+=((ub4)k[5]<<8);
case 5 : b+=k[4];
case 4 : a+=((ub4)k[3]<<24);
case 3 : a+=((ub4)k[2]<<16);
case 2 : a+=((ub4)k[1]<<8);
case 1 : a+=k[0];
/* case 0: nothing left to add */
}
mix(a,b,c);
/*-------------------------------------------- report the result */
return c;
}
/*
--------------------------------------------------------------------
mixc -- mixc 8 4-bit values as quickly and thoroughly as possible.
Repeating mix() three times achieves avalanche.
Repeating mix() four times eliminates all funnels and all
characteristics stronger than 2^{-11}.
--------------------------------------------------------------------
*/
/* Mix eight 32-bit lvalues in place; every argument is both read and written. */
#define mixc(a,b,c,d,e,f,g,h) \
do { \
  (a) ^= (b) << 11; (d) += (a); (b) += (c); \
  (b) ^= (c) >> 2;  (e) += (b); (c) += (d); \
  (c) ^= (d) << 8;  (f) += (c); (d) += (e); \
  (d) ^= (e) >> 16; (g) += (d); (e) += (f); \
  (e) ^= (f) << 10; (h) += (e); (f) += (g); \
  (f) ^= (g) >> 4;  (a) += (f); (g) += (h); \
  (g) ^= (h) << 8;  (b) += (g); (h) += (a); \
  (h) ^= (a) >> 9;  (c) += (h); (a) += (b); \
} while (0)
/*
--------------------------------------------------------------------
checksum() -- hash a variable-length key into a 256-bit value
k : the key (the unaligned variable-length array of bytes)
len : the length of the key, counting by bytes
state : an array of CHECKSTATE 4-byte values (256 bits)
The state is the checksum. Every bit of the key affects every bit of
the state. There are no funnels. About 112+6.875len instructions.
If you are hashing n strings (ub1 **)k, do it like this:
for (i=0; i<8; ++i) state[i] = 0x9e3779b9;
for (i=0, h=0; i<n; ++i) checksum( k[i], len[i], state);
See http://burtleburtle.net/bob/hash/evahash.html
Use to detect changes between revisions of documents, assuming nobody
is trying to cause collisions. Do NOT use for cryptography.
--------------------------------------------------------------------
*/
/*
 * Fold the 'len' bytes at k into the running checksum held in state[0..7].
 *
 * The state is loaded into eight words, the key is consumed 32 bytes at a
 * time (little-endian, one 4-byte group per word) with four mixc() rounds
 * per block, then the remaining 0..31 bytes and the total length are added
 * and mixed four more times before the words are stored back into state.
 *
 * Each byte is widened to ub4 before it is shifted, exactly as lookup()
 * does. NOTE(review): this assumes ub1 is an unsigned 8-bit type (declared
 * in standard.h) - without the cast such a byte is promoted to int and
 * shifting a value >= 0x80 left by 24 overflows a signed int.
 */
void
checksum(ub1 *k, ub4 len, ub4 *state)
{
  register ub4 a,b,c,d,e,f,g,h,length;

  /* Remember the full length; it is added to h after the block loop. */
  length = len;
  a=state[0]; b=state[1]; c=state[2]; d=state[3];
  e=state[4]; f=state[5]; g=state[6]; h=state[7];

  /*---------------------------------------- handle most of the key */
  while (len >= 32)
  {
    a += (k[0] +((ub4)k[1]<<8) +((ub4)k[2]<<16) +((ub4)k[3]<<24));
    b += (k[4] +((ub4)k[5]<<8) +((ub4)k[6]<<16) +((ub4)k[7]<<24));
    c += (k[8] +((ub4)k[9]<<8) +((ub4)k[10]<<16)+((ub4)k[11]<<24));
    d += (k[12]+((ub4)k[13]<<8)+((ub4)k[14]<<16)+((ub4)k[15]<<24));
    e += (k[16]+((ub4)k[17]<<8)+((ub4)k[18]<<16)+((ub4)k[19]<<24));
    f += (k[20]+((ub4)k[21]<<8)+((ub4)k[22]<<16)+((ub4)k[23]<<24));
    g += (k[24]+((ub4)k[25]<<8)+((ub4)k[26]<<16)+((ub4)k[27]<<24));
    h += (k[28]+((ub4)k[29]<<8)+((ub4)k[30]<<16)+((ub4)k[31]<<24));
    mixc(a,b,c,d,e,f,g,h);
    mixc(a,b,c,d,e,f,g,h);
    mixc(a,b,c,d,e,f,g,h);
    mixc(a,b,c,d,e,f,g,h);
    k += 32; len -= 32;
  }

  /*------------------------------------- handle the last 31 bytes */
  h += length;
  switch(len) /* all the case statements fall through */
  {
  case 31: h+=((ub4)k[30]<<24);
  case 30: h+=((ub4)k[29]<<16);
  case 29: h+=((ub4)k[28]<<8);
  case 28: g+=((ub4)k[27]<<24);
  case 27: g+=((ub4)k[26]<<16);
  case 26: g+=((ub4)k[25]<<8);
  case 25: g+=k[24];
  case 24: f+=((ub4)k[23]<<24);
  case 23: f+=((ub4)k[22]<<16);
  case 22: f+=((ub4)k[21]<<8);
  case 21: f+=k[20];
  case 20: e+=((ub4)k[19]<<24);
  case 19: e+=((ub4)k[18]<<16);
  case 18: e+=((ub4)k[17]<<8);
  case 17: e+=k[16];
  case 16: d+=((ub4)k[15]<<24);
  case 15: d+=((ub4)k[14]<<16);
  case 14: d+=((ub4)k[13]<<8);
  case 13: d+=k[12];
  case 12: c+=((ub4)k[11]<<24);
  case 11: c+=((ub4)k[10]<<16);
  case 10: c+=((ub4)k[9]<<8);
  case 9 : c+=k[8];
  case 8 : b+=((ub4)k[7]<<24);
  case 7 : b+=((ub4)k[6]<<16);
  case 6 : b+=((ub4)k[5]<<8);
  case 5 : b+=k[4];
  case 4 : a+=((ub4)k[3]<<24);
  case 3 : a+=((ub4)k[2]<<16);
  case 2 : a+=((ub4)k[1]<<8);
  case 1 : a+=k[0];
  /* case 0: nothing left to add */
  }
  mixc(a,b,c,d,e,f,g,h);
  mixc(a,b,c,d,e,f,g,h);
  mixc(a,b,c,d,e,f,g,h);
  mixc(a,b,c,d,e,f,g,h);

  /*-------------------------------------------- report the result */
  state[0]=a; state[1]=b; state[2]=c; state[3]=d;
  state[4]=e; state[5]=f; state[6]=g; state[7]=h;
}

49
utils/phash/lookupa.h Normal file
View file

@ -0,0 +1,49 @@
/*
* This file is a part of Pcompress, a chunked parallel multi-
* algorithm lossless compression and decompression program.
*
* Copyright (C) 2012-2013 Moinak Ghosh. All rights reserved.
* Use is subject to license terms.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 3 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this program.
* If not, see <http://www.gnu.org/licenses/>.
*
* moinakg@belenix.org, http://moinakg.wordpress.com/
*
*/
/*
------------------------------------------------------------------------------
By Bob Jenkins, September 1996.
lookupa.h, a hash function for table lookup, same function as lookup.c.
Use this code in any way you wish. Public Domain. It has no warranty.
Source is http://burtleburtle.net/bob/c/lookupa.h
------------------------------------------------------------------------------
*/
#ifndef STANDARD
#include "standard.h"
#endif
#ifndef LOOKUPA
#define LOOKUPA
/* Number of 4-byte words in the state array passed to checksum(). */
#define CHECKSTATE 8
/* Number of buckets in a table addressed by an n-bit hash: 2^n. */
#define hashsize(n) ((ub4)1<<(n))
/* Mask that keeps the low n bits of a hash value. */
#define hashmask(n) (hashsize(n)-1)
/*
 * Declared without parameter lists; the intended argument types are the
 * ones given in the embedded comments.
 */
ub4 lookup(/*_ ub1 *k, ub4 length, ub4 level _*/);
void checksum(/*_ ub1 *k, ub4 length, ub4 *state _*/);
#endif /* LOOKUPA */

1387
utils/phash/perfect.c Normal file

File diff suppressed because it is too large Load diff

157
utils/phash/perfect.h Normal file
View file

@ -0,0 +1,157 @@
/*
* This file is a part of Pcompress, a chunked parallel multi-
* algorithm lossless compression and decompression program.
*
* Copyright (C) 2012-2013 Moinak Ghosh. All rights reserved.
* Use is subject to license terms.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 3 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this program.
* If not, see <http://www.gnu.org/licenses/>.
*
* moinakg@belenix.org, http://moinakg.wordpress.com/
*
*/
/*
------------------------------------------------------------------------------
perfect.h: code to generate code for a hash for perfect hashing.
(c) Bob Jenkins, September 1996
You may use this code in any way you wish, and it is free. No warranty.
I hereby place this in the public domain.
Source is http://burtleburtle.net/bob/c/perfect.h
------------------------------------------------------------------------------
*/
#ifndef STANDARD
#include "standard.h"
#endif
#ifndef PERFECT
#define PERFECT
/* Tuning constants for the perfect-hash generator. */
#define MAXKEYLEN 30 /* maximum length of a key */
#define USE_SCRAMBLE 4096 /* use scramble if blen >= USE_SCRAMBLE */
#define SCRAMBLE_LEN ((ub4)1<<16) /* length of *scramble* */
#define RETRY_INITKEY 2048 /* number of times to try to find distinct (a,b) */
#define RETRY_PERFECT 1 /* number of times to try to make a perfect hash */
#define RETRY_HEX 200 /* RETRY_PERFECT when hex keys given */
/* the generated code for the final hash, assumes initial hash is done */
/*
 * gencode holds the text lines of the generated final-hash routine together
 * with the bookkeeping fields used when the keys are given in HEX form.
 */
struct gencode
{
char **line; /* array of text lines, 80 bytes apiece */
/*
* The code placed here must declare "ub4 rsl"
* and assign it the value of the perfect hash using the function inputs.
* Later code will be tacked on which returns rsl or manipulates it according
* to the user directives.
*
* This code is at the top of the routine; it may and must declare any
* local variables it needs.
*
* Each way of filling in **line should be given a comment that is a unique
* tag. A testcase named with that tag should also be found which tests
* the generated code.
*/
ub4 len; /* number of lines available for final hash */
ub4 used; /* number of lines used by final hash */
ub4 lowbit; /* for HEX, lowest interesting bit */
ub4 highbit; /* for HEX, highest interesting bit */
ub4 diffbits; /* bits which differ for some key */
ub4 i,j,k,l,m,n,o; /* state machine used in hexn() */
};
typedef struct gencode gencode;
/* user directives: perfect hash? minimal perfect hash? input is an int? */
/*
 * hashform groups the four independent user choices that steer the
 * generator: input key format, key type, perfect vs. minimal perfect,
 * and fast vs. slow search.
 */
struct hashform
{
enum {
NORMAL_HM, /* key is a string */
INLINE_HM, /* user will do initial hash, we must choose salt for them */
HEX_HM, /* key to be hashed is a hexadecimal 4-byte integer */
DECIMAL_HM, /* key to be hashed is a decimal 4-byte integer */
AB_HM, /* key to be hashed is "A B", where A and B are (A,B) in hex */
ABDEC_HM /* like AB_HM, but in decimal */
} mode;
enum {
STRING_HT, /* key is a string */
INT_HT, /* key is an integer */
AB_HT /* dunno what key is, but input is distinct (A,B) pair */
} hashtype;
enum {
NORMAL_HP, /* just find a perfect hash */
MINIMAL_HP /* find a minimal perfect hash */
} perfect;
enum {
FAST_HS, /* fast mode */
SLOW_HS /* slow mode */
} speed;
};
typedef struct hashform hashform;
/* representation of a key */
/*
 * Keys are chained twice: next_k links every key into one list, and nextb_k
 * links the keys that share the same b value.
 */
struct key
{
ub1 *name_k; /* the actual key */
ub4 len_k; /* the length of the actual key */
ub4 hash_k; /* the initial hash value for this key */
struct key *next_k; /* next key */
/* beyond this point is mapping-dependent */
ub4 a_k; /* a, of the key maps to (a,b) */
ub4 b_k; /* b, of the key maps to (a,b) */
struct key *nextb_k; /* next key with this b */
};
typedef struct key key;
/* things indexed by b of original (a,b) pair */
/*
 * One bstuff exists per b value; val_b is the per-b adjustment that is
 * XORed with a to produce the final hash.
 */
struct bstuff
{
ub2 val_b; /* hash=a^tabb[b].val_b */
key *list_b; /* tabb[i].list_b is list of keys with b==i */
ub4 listlen_b; /* length of list_b */
ub4 water_b; /* high watermark of who has visited this map node */
};
typedef struct bstuff bstuff;
/* things indexed by final hash value */
/* One hstuff exists per final hash slot; it records which key owns the slot. */
struct hstuff
{
key *key_h; /* tabh[i].key_h is the key with a hash of i */
};
typedef struct hstuff hstuff;
/* things indexed by queue position */
/*
 * One qstuff exists per queue position; it records the b occupying a hash
 * slot, the parent queue entry, and the old/new tab[b] values for that
 * parent.
 */
struct qstuff
{
bstuff *b_q; /* b that currently occupies this hash */
ub4 parent_q; /* queue position of parent that could use this hash */
ub2 newval_q; /* what to change parent tab[b] to to use this hash */
ub2 oldval_q; /* original value of tab[b] */
};
typedef struct qstuff qstuff;
/*
 * The functions below are declared old-style, without prototypes; the
 * intended parameter lists appear only inside the comments, so calls are not
 * type-checked by the compiler.
 */
/* return ceiling(log based 2 of x) */
ub4 mylog2(/*_ ub4 x _*/);
/* Given the keys, scramble[], and hash mode, find the perfect hash */
void findhash(/*_ bstuff **tabb, ub4 *alen, ub4 *blen, ub4 *salt,
gencode *final, ub4 *scramble, ub4 smax, key *keys, ub4 nkeys,
hashform *form _*/);
/* private, but in a different file because it's excessively verbose */
/* NOTE(review): the commented list types the last parameter as
 * "gencode *form"; findhash() uses "hashform *form" for the same name, so
 * this is presumably a typo -- confirm against the definition in perfhex.c. */
int inithex(/*_ key *keys, ub4 *alen, ub4 *blen, ub4 smax, ub4 nkeys,
ub4 salt, gencode *final, gencode *form _*/);
#endif /* PERFECT */

1319
utils/phash/perfhex.c Normal file

File diff suppressed because it is too large Load diff

28
utils/phash/phash.c Normal file
View file

@ -0,0 +1,28 @@
/* Generated File, DO NOT EDIT */
/* table for the mapping for the perfect hash */
#ifndef STANDARD
#include "standard.h"
#endif /* STANDARD */
#ifndef PHASH
#include "phash.h"
#endif /* PHASH */
#ifndef LOOKUPA
#include "lookupa.h"
#endif /* LOOKUPA */
/* small adjustments to _a_ to make values distinct */
/*
 * 64 entries (PHASHLEN in phash.h is 0x40). phash() indexes this table with
 * the low six bits of the initial hash (val & 0x3f) and XORs the entry with
 * the top six bits (val >> 26).
 */
ub1 tab[] = {
20,70,0,4,61,76,0,119,0,0,16,4,10,1,61,76,
61,0,0,16,1,61,0,76,0,123,32,70,28,34,119,51,
0,76,4,122,70,0,0,43,0,106,20,83,0,0,28,66,
79,0,1,47,79,122,0,0,71,75,85,26,0,103,0,76,
};
/*
 * phash: map a key to its perfect-hash slot.
 *
 * key, len - the bytes to hash and how many of them there are.
 *
 * The key is first run through lookup() with the fixed salt 0x9e3779b9.
 * The top six bits of that value are then XORed with the tab[] entry
 * selected by its low six bits, and the result is returned.
 */
ub4 phash(char *key, int len)
{
  ub4 val = lookup(key, len, 0x9e3779b9);
  ub4 high = val >> 26;      /* top 6 bits of the initial hash */
  ub4 low = val & 0x3f;      /* index into the 64-entry tab[] */

  return high ^ tab[low];
}

18
utils/phash/phash.h Normal file
View file

@ -0,0 +1,18 @@
/* Generated File, DO NOT EDIT */
/* Perfect hash definitions */
#ifndef STANDARD
#include "standard.h"
#endif /* STANDARD */
#ifndef PHASH
#define PHASH
/* Adjustment table defined in phash.c and indexed by phash(). */
extern ub1 tab[];
#define PHASHLEN 0x40 /* length of hash mapping table */
#define PHASHNKEYS 99 /* How many keys were hashed */
#define PHASHRANGE 128 /* Range any input might map to */
#define PHASHSALT 0x9e3779b9 /* internal, initialize normal hash */
/*
 * Declared without a prototype, so argument types and counts are not checked
 * at call sites. The definition in phash.c takes (char *key, int len).
 */
ub4 phash();
#endif /* PHASH */

115
utils/phash/recycle.c Normal file
View file

@ -0,0 +1,115 @@
/*
* This file is a part of Pcompress, a chunked parallel multi-
* algorithm lossless compression and decompression program.
*
* Copyright (C) 2012-2013 Moinak Ghosh. All rights reserved.
* Use is subject to license terms.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 3 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this program.
* If not, see <http://www.gnu.org/licenses/>.
*
* moinakg@belenix.org, http://moinakg.wordpress.com/
*
*/
/*
--------------------------------------------------------------------
By Bob Jenkins, September 1996. recycle.c
You may use this code in any way you wish, and it is free. No warranty.
This manages memory for commonly-allocated structures.
It allocates RESTART to REMAX items at a time.
Timings have shown that, if malloc is used for every new structure,
malloc will consume about 90% of the time in a program. This
module cuts down the number of mallocs by an order of magnitude.
This also decreases memory fragmentation, and freeing structures
only requires freeing the root.
--------------------------------------------------------------------
*/
#include <stdlib.h>
#include <string.h>
#ifndef STANDARD
# include "standard.h"
#endif
#ifndef RECYCLE
# include "recycle.h"
#endif
reroot *
remkroot(size_t size)
{
reroot *r = (reroot *)remalloc(sizeof(reroot), "recycle.c, root");
r->list = (recycle *)0;
r->trash = (recycle *)0;
r->size = align(size);
r->logsize = RESTART;
r->numleft = 0;
return r;
}
void
refree(struct reroot *r)
{
recycle *temp;
if ((temp = r->list) != NULL)
while (r->list)
{
temp = r->list->next;
free((char *)r->list);
r->list = temp;
}
free((char *)r);
return;
}
/* to be called from the macro renew only */
/*
 * renewx: slow path of renew(), taken when the current block is exhausted.
 *
 * If the trash list holds a deleted item, that item is unlinked, zeroed and
 * returned. Otherwise a new block of size << logsize bytes (plus one recycle
 * header used to chain blocks) is allocated, pushed on r->list, and its last
 * item is returned; numleft then counts the bytes still free in the block.
 * logsize grows by one while the block size stays below REMAX.
 *
 * The data area of a fresh block is zeroed up front so that every item later
 * carved from it -- here or by the renew() fast path -- is cleared, matching
 * the trash path and the "(cleared) item" contract stated for renew().
 *
 * remalloc() does not return on allocation failure, so the result is never
 * NULL.
 */
char *
renewx(struct reroot *r)
{
  recycle *temp;

  if (r->trash)
  { /* pull a node off the trash heap */
    temp = r->trash;
    r->trash = temp->next;
    (void)memset((void *)temp, 0, r->size);
  }
  else
  { /* allocate a new block of nodes */
    r->numleft = r->size*((ub4)1<<r->logsize);
    if (r->numleft < REMAX) ++r->logsize;
    temp = (recycle *)remalloc(sizeof(recycle) + r->numleft,
                               "recycle.c, data");
    /* malloc'ed memory is indeterminate: clear the item area once */
    (void)memset((void *)(temp + 1), 0, (size_t)r->numleft);
    temp->next = r->list;
    r->list = temp;
    r->numleft -= r->size;
    temp = (recycle *)((char *)(r->list+1)+r->numleft);
  }
  return (char *)temp;
}
/*
 * remalloc: malloc() wrapper that never returns NULL.
 *
 * len     - number of bytes to allocate.
 * purpose - short label included in the diagnostic if allocation fails.
 *
 * On failure a message is written to stderr and the process terminates with
 * a failure status (the previous code exited with status 0, which reported
 * success to the caller's shell). The returned memory is uninitialised and
 * is released with plain free().
 */
char *
remalloc(size_t len, char *purpose)
{
  char *x = (char *)malloc(len);

  if (!x)
  {
    fprintf(stderr, "malloc of %zu failed for %s\n",
            len, purpose);
    exit(EXIT_FAILURE);
  }
  return x;
}

90
utils/phash/recycle.h Normal file
View file

@ -0,0 +1,90 @@
/*
* This file is a part of Pcompress, a chunked parallel multi-
* algorithm lossless compression and decompression program.
*
* Copyright (C) 2012-2013 Moinak Ghosh. All rights reserved.
* Use is subject to license terms.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 3 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this program.
* If not, see <http://www.gnu.org/licenses/>.
*
* moinakg@belenix.org, http://moinakg.wordpress.com/
*
*/
/*
--------------------------------------------------------------------
By Bob Jenkins, September 1996. recycle.h
You may use this code in any way you wish, and it is free. No warranty.
This manages memory for commonly-allocated structures.
It allocates RESTART to REMAX items at a time.
Timings have shown that, if malloc is used for every new structure,
malloc will consume about 90% of the time in a program. This
module cuts down the number of mallocs by an order of magnitude.
This also decreases memory fragmentation, and freeing all structures
only requires freeing the root.
--------------------------------------------------------------------
*/
#ifndef STANDARD
#include "standard.h"
#endif
#ifndef RECYCLE
#define RECYCLE
/* Initial value of reroot.logsize, set by remkroot(). */
#define RESTART 0
/* renewx() stops growing logsize once a block's byte count reaches this. */
#define REMAX 32000
/*
 * Link header. It sits at the front of every malloc'ed block to chain blocks
 * together, and is overlaid on deleted items to chain the trash list.
 */
struct recycle
{
struct recycle *next;
};
typedef struct recycle recycle;
/* Root of one recycling pool: all items handed out from it have one size. */
struct reroot
{
struct recycle *list; /* list of malloced blocks */
struct recycle *trash; /* list of deleted items */
size_t size; /* size of an item */
size_t logsize; /* log_2 of number of items in a block */
word numleft; /* number of bytes left in this block */
};
typedef struct reroot reroot;
/* make a new recycling root */
reroot *remkroot(/*_ size_t mysize _*/);
/* free a recycling root and all the items it has made */
void refree(/*_ struct reroot *r _*/);
/* get a new (cleared) item from the root */
/*
 * Fast path: while the current block has bytes left, carve the next item off
 * its end by pointer arithmetic; otherwise call renewx(). The macro
 * evaluates r several times, so r must not have side effects.
 */
#define renew(r) ((r)->numleft ? \
(((char *)((r)->list+1))+((r)->numleft-=(r)->size)) : renewx(r))
char *renewx(/*_ struct reroot *r _*/);
/* delete an item; let the root recycle it */
/* void redel(/o_ struct reroot *r, struct recycle *item _o/); */
/*
 * Pushes item on the front of root's trash list; a later renewx() hands it
 * out again. Both arguments are evaluated more than once, so they must not
 * have side effects. item is parenthesised in both statements so that an
 * expression argument such as (base + offset) is cast as a whole; without
 * the parentheses the cast bound only to the first operand and the link was
 * written to a different address than the one stored in trash.
 */
#define redel(root,item) { \
((recycle *)(item))->next=(root)->trash; \
(root)->trash=(recycle *)(item); \
}
/* malloc, but complain to stderr and exit program if no joy */
/* use plain free() to free memory allocated by remalloc() */
/* purpose is a short label that is printed in the failure message. */
char *remalloc(/*_ size_t len, char *purpose _*/);
#endif /* RECYCLE */

82
utils/phash/standard.h Normal file
View file

@ -0,0 +1,82 @@
/*
* This file is a part of Pcompress, a chunked parallel multi-
* algorithm lossless compression and decompression program.
*
* Copyright (C) 2012-2013 Moinak Ghosh. All rights reserved.
* Use is subject to license terms.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 3 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this program.
* If not, see <http://www.gnu.org/licenses/>.
*
* moinakg@belenix.org, http://moinakg.wordpress.com/
*
*/
/*
------------------------------------------------------------------------------
Standard definitions and types, Bob Jenkins
------------------------------------------------------------------------------
*/
#ifndef STANDARD
# define STANDARD
# ifndef STDIO
# include <stdio.h>
# define STDIO
# endif
# ifndef STDDEF
# include <stddef.h>
# define STDDEF
# endif
/* Fixed-size integer aliases used throughout the perfect-hash sources. */
typedef unsigned long long ub8;
#define UB8MAXVAL 0xffffffffffffffffLL
#define UB8BITS 64
typedef signed long long sb8;
#define SB8MAXVAL 0x7fffffffffffffffLL
typedef unsigned int ub4; /* unsigned 4-byte quantities */
#define UB4MAXVAL 0xffffffff
typedef signed int sb4;
#define UB4BITS 32
#define SB4MAXVAL 0x7fffffff
typedef unsigned short int ub2;
#define UB2MAXVAL 0xffff
#define UB2BITS 16
typedef signed short int sb2;
#define SB2MAXVAL 0x7fff
typedef unsigned char ub1;
#define UB1MAXVAL 0xff
#define UB1BITS 8
typedef signed char sb1; /* signed 1-byte quantities */
#define SB1MAXVAL 0x7f
typedef int word; /* fastest type available */
/* Bit helpers: set, clear and test the bits of mask in target. */
#define bis(target,mask) ((target) |= (mask))
#define bic(target,mask) ((target) &= ~(mask))
#define bit(target,mask) ((target) & (mask))
/* min/max evaluate their arguments twice; avoid arguments with side effects. */
#ifndef min
# define min(a,b) (((a)<(b)) ? (a) : (b))
#endif /* min */
#ifndef max
# define max(a,b) (((a)<(b)) ? (b) : (a))
#endif /* max */
/*
 * Round a up to the next multiple of sizeof(void *). The value is first
 * narrowed to ub4, as before. The argument is now parenthesised so that the
 * cast applies to the whole expression; previously align(c ? x : y) cast
 * only c and produced a wrong result.
 */
#ifndef align
# define align(a) (((ub4)(a)+(sizeof(void *)-1))&(~(sizeof(void *)-1)))
#endif /* align */
/*
 * Function-like abs macro. It is only defined when no abs macro exists yet,
 * and it evaluates its argument twice.
 */
#ifndef abs
# define abs(a) (((a)>0) ? (a) : -(a))
#endif
#define TRUE 1
#define FALSE 0
#define SUCCESS 0 /* 1 on VAX */
#endif /* STANDARD */

231
utils/phash/testperf.c Normal file
View file

@ -0,0 +1,231 @@
/*
* This file is a part of Pcompress, a chunked parallel multi-
* algorithm lossless compression and decompression program.
*
* Copyright (C) 2012-2013 Moinak Ghosh. All rights reserved.
* Use is subject to license terms.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 3 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this program.
* If not, see <http://www.gnu.org/licenses/>.
*
* moinakg@belenix.org, http://moinakg.wordpress.com/
*
*/
/*
----------------------------------------------------------------------------
Test a perfect hash.
By Bob Jenkins. Public Domain.
----------------------------------------------------------------------------
*/
#include <stdlib.h>
#include <string.h>
#ifndef STANDARD
#include "standard.h"
#endif
#ifndef RECYCLE
#include "recycle.h"
#endif
#ifndef PHASH
#include "phash.h"
#endif
/* user directives: perfect hash? minimal perfect hash? input is an int? */
/*
 * Local, reduced copy of hashform: this test driver only needs the key
 * input mode, which decides how driver() turns each input line into a
 * phash() call.
 */
struct hashform
{
enum {
NORMAL_HM, /* key is a string */
INLINE_HM, /* user will do initial hash, we must choose salt for them */
HEX_HM, /* key to be hashed is a hexadecimal 4-byte integer */
DECIMAL_HM, /* key to be hashed is a decimal 4-byte integer */
AB_HM, /* key to be hashed is "A B", where A and B are (A,B) in hex */
ABDEC_HM /* same as AB_HM, but in decimal */
} mode;
};
typedef struct hashform hashform;
/* Size of the buffer handed to fgets() for each input line in getkeys(). */
#define MAXKEYLEN 30
/* One input key; getkeys() links keys into a singly linked list via knext. */
struct key
{
char *kname; /* text of the key as read from stdin */
ub4 klen; /* length of the key as computed by getkeys() */
struct key *knext; /* next key in the list */
};
typedef struct key key;
/* get the list of keys */
/*
 * getkeys: read keys from stdin, one per line, into a linked list.
 *
 * keys     - out: head of the list; each new key is pushed on the front, so
 *            the list is in reverse input order.
 * nkeys    - out: number of keys read.
 * textroot - recycling root supplying the line buffers.
 * keyroot  - recycling root supplying the key structures.
 *
 * fgets() reads at most MAXKEYLEN-1 characters per call, so a longer line is
 * split across several keys. The key length stops at the newline when there
 * is one. The previous code always subtracted one from strlen(), which
 * dropped a real character when the line had no newline (unterminated last
 * line, over-long line) and wrapped to a huge value on an empty string.
 */
static void getkeys(key **keys, ub4 *nkeys, reroot *textroot, reroot *keyroot)
{
  char *mytext = (char *)renew(textroot);

  *keys = (key *)0;
  *nkeys = (ub4)0;
  while (fgets(mytext, MAXKEYLEN, stdin))
  {
    key *mykey = (key *)renew(keyroot);

    mykey->kname = mytext;
    mykey->klen = (ub4)strcspn(mytext, "\n");  /* exclude the newline, if any */
    mykey->knext = *keys;
    *keys = mykey;
    ++*nkeys;
    mytext = (char *)renew(textroot);          /* fresh buffer for next line */
  }
  /* the last buffer was never filled: hand it back to the root */
  redel(textroot, mytext);
}
/*
------------------------------------------------------------------------------
Read the keys from stdin, hash each one with phash() and print the results
------------------------------------------------------------------------------
*/
/*
 * driver: exercise the generated perfect hash.
 *
 * form - selects how each input line is interpreted (see hashform.mode).
 *
 * Prints the number of keys read, then one line per key with its hash value
 * and text. Keys come out in reverse input order because getkeys() builds
 * the list by pushing on the front.
 *
 * phash() is declared without a prototype in phash.h, which is why it can be
 * called here with different argument lists per mode; the call must match
 * the phash() that was actually generated.
 *
 * hash, a and b start at 0 so that a line sscanf() cannot parse yields a
 * defined value instead of reading an uninitialised variable. The ub4
 * arguments to the final printf are converted to int, which is what "%d" and
 * the ".*" precision require.
 */
void driver(hashform *form)
{
  ub4 nkeys; /* number of keys */
  key *keys; /* head of list of keys */
  key *mykey;
  reroot *textroot; /* MAXKEYLEN-character text lines */
  reroot *keyroot; /* source of keys */

  /* set up memory sources */
  textroot = remkroot((size_t)MAXKEYLEN);
  keyroot = remkroot(sizeof(key));
  /* read in the list of keywords */
  getkeys(&keys, &nkeys, textroot, keyroot);
  printf("Read in %u keys\n",nkeys);
  for (mykey=keys; mykey; mykey=mykey->knext)
  {
    ub4 hash = 0;
    ub4 i;
    ub4 a = 0;
    ub4 b = 0;

    switch(form->mode)
    {
    case NORMAL_HM:
      hash = phash(mykey->kname, mykey->klen);
      break;
    case INLINE_HM:
      /* do the initial hash here, as a user of an inline hash would */
      hash = PHASHSALT;
      for (i=0; i<mykey->klen; ++i)
      {
        hash = (mykey->kname[i] ^ hash) + ((hash<<26)+(hash>>6));
      }
      hash = phash(hash);
      break;
    case HEX_HM:
      sscanf(mykey->kname, "%x ", &hash);
      hash = phash(hash);
      break;
    case DECIMAL_HM:
      sscanf(mykey->kname, "%u ", &hash);
      hash = phash(hash);
      break;
    case AB_HM:
      sscanf(mykey->kname, "%x %x ", &a, &b);
      hash = phash(a,b);
      break;
    case ABDEC_HM:
      sscanf(mykey->kname, "%u %u ", &a, &b);
      hash = phash(a,b);
      break;
    default:
      break;
    }
    printf("%8d %.*s\n", (int)hash, (int)mykey->klen, mykey->kname);
  }
  /* clean up memory sources */
  refree(textroot);
  refree(keyroot);
}
/*
 * usage_error: report a bad command line and terminate.
 *
 * The message goes to stderr and the process exits with a failure status, so
 * scripts can detect the error. The previous code printed to stdout and
 * exited with status 0.
 */
void usage_error(void)
{
  fprintf(stderr, "usage is the same as perfect (which see)\n");
  exit(EXIT_FAILURE);
}
/*
 * main: parse the optional single "-flags" argument and run the driver.
 *
 * Mode letters (n, i, h, d, a, b, either case) select form.mode; giving more
 * than one is a usage error. The letters m, p, f and s are accepted and
 * ignored here. Any other letter, an argument that does not start with '-',
 * or more than one argument is a usage error.
 *
 * Returns EXIT_SUCCESS after the driver has run; the previous code returned
 * 1, which reported failure to the shell on every successful run.
 */
int main(int argc, char **argv)
{
  hashform form;
  char *c;
  int mode_given = 0;

  form.mode = NORMAL_HM;
  /* let the user override the default behavior */
  switch (argc)
  {
  case 1:
    break;
  case 2:
    if (argv[1][0] != '-')
    {
      usage_error();
      break;
    }
    for (c = &argv[1][1]; *c != '\0'; ++c) switch(*c)
    {
    case 'n': case 'N':
    case 'i': case 'I':
    case 'h': case 'H':
    case 'd': case 'D':
    case 'a': case 'A':
    case 'b': case 'B':
      if (mode_given == TRUE)
        usage_error();
      switch(*c)
      {
      case 'n': case 'N':
        form.mode = NORMAL_HM; break;
      case 'i': case 'I':
        form.mode = INLINE_HM; break;
      case 'h': case 'H':
        form.mode = HEX_HM; break;
      case 'd': case 'D':
        form.mode = DECIMAL_HM; break;
      case 'a': case 'A':
        form.mode = AB_HM; break;
      case 'b': case 'B':
        form.mode = ABDEC_HM; break;
      }
      mode_given = TRUE;
      break;
    case 'm': case 'M':
    case 'p': case 'P':
    case 'f': case 'F':
    case 's': case 'S':
      /* accepted but ignored by this driver */
      break;
    default:
      usage_error();
    }
    break;
  default:
    usage_error();
  }
  driver(&form);
  return EXIT_SUCCESS;
}

View file

@ -151,6 +151,7 @@ typedef int32_t bsize_t;
#define DEBUG_STAT_EN(...) #define DEBUG_STAT_EN(...)
#endif #endif
#define PATHSEP_CHAR '/'
#define BYTES_TO_MB(x) ((x) / (1024 * 1024)) #define BYTES_TO_MB(x) ((x) / (1024 * 1024))
#define U64_P(x) *((uint64_t *)(x)) #define U64_P(x) *((uint64_t *)(x))
#define U32_P(x) *((uint32_t *)(x)) #define U32_P(x) *((uint32_t *)(x))