pcompress/filters/dict/DictFilter.cpp

1618 lines
37 KiB
C++
Raw Normal View History

/*
* This file is a part of Pcompress, a chunked parallel multi-
* algorithm lossless compression and decompression program.
*
* Copyright (C) 2012-2013 Moinak Ghosh. All rights reserved.
* Use is subject to license terms.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 3 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this program.
* If not, see <http://www.gnu.org/licenses/>.
*
* moinakg@gmail.com, http://moinakg.wordpress.com/
*/
/*
* Dictionary preprocessor for text files. It uses some ideas from
* the following paper:
* http://pskibinski.pl/papers/05-RevisitingDictCompr.pdf
*
* However the implementation here is quite different from that
* described in the paper. A simple hash table is used for the
* word dictionary. A min-LRU based aging mechanism is used to evict
* words with low frequency to make way for newer words. The min-LRU
* aging kicks in after at least 50% of the data is processed and the
* hash table is full. The hash table size is derived from the data
* size.
* After scanning the data, words with occurrence X word size less
* than a threshold are evicted from the final dictionary. The
* dictionary is then prefixed to the encoded data. The words in the
* final dictionary are sorted based on occurrence X word size value
* and then alphabetically.
*
* Words are extracted by splitting text on a few separator characters.
* Proper case capital conversion is done. So the dictionary only
* contains lower case words.
* Words in the data are replaced by dictionary indexes. These numbers
* are encoded into a base-217 string. A bunch of non-separator char
* ranges are used. Each encoded word is prefixed with a backtick (`).
* Capital converted words are prefixed with an exclamation (!).
* Apart from encoding words, literal numbers of 5 to 9 digits are
* replaced with their base-217 encoded strings. These encoded
* numbers are prefixed with a dollar ($).
* Since words are only encoded on a separator boundary, any literal
* prefix characters following a separator boundary are escaped using
* a back-slash (\).
*
* The separators and prefix characters have been experimentally
* selected to benefit context based compressors like PPM and Libbsc.
* Libbsc is especially finicky about the nature of the transform.
* For example XWrt (http://xwrt.sourceforge.net/), a preprocessor
* that implements all of the ideas described in the paper does not
* benefit Libbsc in the enwik9 test(http://mattmahoney.net/dc/text.html).
*/
#include <stdlib.h>
#include <string.h>
#include <strings.h>
#include <stdint.h>
#include <stdio.h>
#include <pthread.h>
#include <ctype.h>
#include "DictFilter.h"
#include "utils.h"
#include "allocator.h"
#include "xxhash.h"
#define WORD_MIN 3
#define WORD_MAX 50
#define LIST_LRU_NUM 15
/*
 * One word tracked while building the dictionary. An entry sits on a hash
 * bucket chain (via 'next') and on the aging list (via 'list_next').
 */
typedef struct dict_entry {
/* Start of the word bytes. hash_add() stores the pointer it is given; the bytes are not copied. */
unsigned char *word;
/* Word length in bytes. */
unsigned char sz;
/* Lower-cased first character of the word, used for matching and when emitting the dictionary. */
unsigned char lcfirst;
/*
 * Hash bucket index while the entry is in the table (hash_remove() resets it
 * to UINT32_MAX). The encoders later overwrite it with the word's position
 * in the final dictionary.
 */
uint32_t indx;
/* Occurrence count. The encoders set it to 0 to mark a word that will not be encoded. */
uint32_t occur;
/* Next entry in the same hash bucket. */
struct dict_entry *next;
/* Next entry on the aging list. */
struct dict_entry *list_next;
} dict_entry_t;
/*
 * State of the chained hash table holding the word dictionary.
 */
typedef struct hash_context_s {
/* Array of 'dictsize' bucket heads. */
dict_entry_t **dict;
/* Number of entries currently stored in the table. */
uint32_t dictcount;
/* Number of buckets; hash_add() also refuses new words once dictcount reaches it. */
uint32_t dictsize;
/* Bucket index computed by the most recent lookup/add/remove. */
uint32_t cur_indx;
/* Count of insertions that landed in a non-empty bucket. */
uint32_t collisions;
/* Dummy entry returned by hash_add() when the word was already present. */
dict_entry_t *sentinel;
} hash_context_t;
/*
 * State of the singly linked aging list used to pick eviction candidates.
 */
typedef struct list_context_s {
/* Dummy head node allocated by list_context_init(); real entries start at head->list_next. */
dict_entry_t *head;
/* Last node of the list (the dummy head while the list is empty). */
dict_entry_t *tail;
/* Number of entries on the list. */
uint32_t listcount;
/* Capacity; list_push() rejects entries once listcount reaches it. */
uint32_t listsize;
/* Number of entries evicted by list_pop_lru_min(). */
uint32_t aged_entries;
/* Number of list_pop_lru_min() calls made on a non-empty list. */
uint32_t aging_requests;
} list_context_t;
/*
 * One dictionary word as seen by the decoders.
 */
typedef struct decode_dict_entry_s {
/* Word length in bytes. */
uint32_t sz;
/* Start of the word; the decoders point this into the encoded source buffer. */
uint8_t *word;
} decode_dict_entry_t;
/*
 * Copy 'len' bytes from 'src' to 'dst'.
 *
 * We are always copying small blocks, typically words, ranging from 3 bytes
 * to 20 bytes, so the copy is done inline: 4 bytes at a time followed by an
 * unrolled tail of at most 3 bytes.
 *
 * The 4-byte steps go through fixed-size memcpy() calls into a local word
 * instead of dereferencing casted uint32_t pointers. Callers pass arbitrary
 * byte positions inside the data, so a direct uint32_t access may be
 * misaligned and breaks the aliasing rules. Each word is fully loaded before
 * it is stored, exactly as before.
 */
static inline void
copy_bytes(void *dst, void *src, size_t len)
{
	uint8_t *to = (uint8_t *)dst;
	const uint8_t *from = (const uint8_t *)src;

	while (len >= sizeof (uint32_t)) {
		uint32_t chunk;

		memcpy(&chunk, from, sizeof (chunk));
		memcpy(to, &chunk, sizeof (chunk));
		to += sizeof (uint32_t);
		from += sizeof (uint32_t);
		len -= sizeof (uint32_t);
	}

	/* At most 3 bytes remain; the cases deliberately fall through. */
	switch (len) {
	case 3:
		*to++ = *from++;
		/* FALLTHROUGH */
	case 2:
		*to++ = *from++;
		/* FALLTHROUGH */
	case 1:
		*to = *from;
		/* FALLTHROUGH */
	default:
		break;
	}
}
/*
 * Local replacement for bcmp() avoiding a library call for comparing
 * words.
 *
 * Returns 0 when the first 'len' bytes of 'a' and 'b' are identical and 1
 * otherwise. Bytes are compared 4 at a time, then an unrolled tail of at
 * most 3 bytes. The 4-byte words are loaded with fixed-size memcpy() calls
 * because the operands are arbitrary byte positions: reading them through a
 * casted uint32_t pointer may be misaligned and breaks the aliasing rules.
 */
static inline int
eq_bytes(void *a, void *b, size_t len)
{
	const uint8_t *lhs = (const uint8_t *)a;
	const uint8_t *rhs = (const uint8_t *)b;

	while (len >= sizeof (uint32_t)) {
		uint32_t lw, rw;

		memcpy(&lw, lhs, sizeof (lw));
		memcpy(&rw, rhs, sizeof (rw));
		if (lw != rw)
			return (1);
		lhs += sizeof (uint32_t);
		rhs += sizeof (uint32_t);
		len -= sizeof (uint32_t);
	}

	/* At most 3 bytes remain; the cases deliberately fall through. */
	switch (len) {
	case 3:
		if (*lhs++ != *rhs++)
			return (1);
		/* FALLTHROUGH */
	case 2:
		if (*lhs++ != *rhs++)
			return (1);
		/* FALLTHROUGH */
	case 1:
		if (*lhs != *rhs)
			return (1);
		/* FALLTHROUGH */
	default:
		break;
	}
	return (0);
}
/*
* Sort comparison for the dictionary words.
* Compare first by occurrence X word length and then alphabetically
* by the first three letters. Words are at least 3 chars in length.
*/
static int
cmpoccur(const void *a, const void *b) {
dict_entry_t *de1 = *((dict_entry_t **)a);
dict_entry_t *de2 = *((dict_entry_t **)b);
uint64_t a1, b1;
a1 = ((uint64_t)(de1->occur) - 1) * (de1->sz - 1);
b1 = ((uint64_t)(de2->occur) - 1) * (de2->sz - 1);
if (a1 < b1) {
return (1);
} else if (a1 == b1) {
if (de1->sz < de2->sz) {
return (1);
} else if (de1->sz == de2->sz) {
if (de1->word[0] != de2->word[0])
return ((int)de2->word[0] - (int)de1->word[0]);
if (de1->word[1] != de2->word[1])
return ((int)de2->word[1] - (int)de1->word[1]);
if (de1->word[2] != de2->word[2])
return ((int)de2->word[2] - (int)de1->word[2]);
return (0);
} else {
return (-1);
}
} else {
return (-1);
}
}
/*
* Singleton filter class. The single instance is created on first use by
* getInstance(); the constructor and destructor are protected so that no
* other instance can be created or destroyed from outside the class.
*/
class DictFilter
{
public:
/*
* Encoders and decoders for the generic text variant and the Fasta variant.
* Each takes a source buffer of 'size' bytes and a destination buffer whose
* capacity is passed in *dstsize; on success *dstsize is updated to the
* number of bytes produced. They return 1 on success and 0 otherwise.
*/
int Forward_Dict(uint8_t *src, uint32_t size, uint8_t *dst, uint32_t *dstsize);
int Inverse_Dict(uint8_t *src, uint32_t size, uint8_t *dst, uint32_t *dstsize);
int Forward_Dict_Fasta(uint8_t *src, uint32_t size, uint8_t *dst, uint32_t *dstsize);
int Inverse_Dict_Fasta(uint8_t *src, uint32_t size, uint8_t *dst, uint32_t *dstsize);
/*
* Return the shared instance, creating it under inst_lock on the first call.
*/
static DictFilter *getInstance() {
pthread_mutex_lock(&inst_lock);
if (!inst) {
inst = new DictFilter();
}
pthread_mutex_unlock(&inst_lock);
return (inst);
}
protected:
~DictFilter();
DictFilter();
/* Search a hash bucket chain for a word of the given size and lower-cased first letter. */
dict_entry_t *find_string(dict_entry_t *de, uint8_t *str, uint32_t sz,
uint8_t lcfirst);
/* Hash table management for the word dictionary. */
void hash_context_init(hash_context_t *hctx, uint32_t dictsize);
void hash_context_delete(hash_context_t *hctx);
dict_entry_t *hash_lookup(hash_context_t *hctx, uint8_t *word, uint32_t wordsize,
uint8_t lcfirst);
dict_entry_t *hash_add(hash_context_t *hctx, uint8_t *word, uint32_t wordsize,
dict_entry_t *_de);
dict_entry_t *hash_remove(hash_context_t *hctx, uint8_t *word, uint32_t wordsize,
dict_entry_t *r_de);
/* Aging list management used to evict low-frequency words. */
void list_context_init(list_context_t *lctx, uint32_t listsize);
void list_context_delete(list_context_t *lctx);
dict_entry_t *list_push(list_context_t *lctx, dict_entry_t *de);
dict_entry_t *list_pop_lru_min(list_context_t *lctx);
/* Conversion between numbers and their base-NUMERAL_BASE digit strings. */
uint8_t *to_base_enc(uint32_t number, uint8_t *str, int sz);
uint32_t from_base_enc(uint8_t *dnum, int sz);
/* Serializes creation of the singleton in getInstance(). */
static pthread_mutex_t inst_lock;
/* The singleton instance, NULL until getInstance() creates it. */
static DictFilter *inst;
/* Leading digit characters of the number encoding; the constructor appends more byte ranges. */
static const char *BASE_DIGITS;
/*
* SEPARATOR is indexed by byte value: the constructor stores 1 for word
* separator characters and 128 for the letters a/t/g/c in either case.
* flag, flag1 and flag2 are the prefix characters for encoded words,
* capital-converted words and encoded numbers.
*/
uint8_t SEPARATOR[256], flag, flag1, flag2;
/* Digit value -> digit character. */
uint8_t base_enc_digits[256];
/* Digit character -> digit value. */
uint8_t base_dec_digits[256];
/* Number of digits in the encoding, computed by the constructor. */
uint32_t NUMERAL_BASE;
};
/* Lock taken by getInstance() while it checks for and creates the singleton. */
pthread_mutex_t DictFilter::inst_lock = PTHREAD_MUTEX_INITIALIZER;
/* The singleton instance; NULL until the first getInstance() call. */
DictFilter *DictFilter::inst = NULL;
/*
 * The first 63 digit characters of the number encoding (digit values 0..62).
 * The constructor extends the digit set with further byte ranges.
 */
const char *DictFilter::BASE_DIGITS = "0123456789abcdefghijklmnopqrstuvwxyz@ABCDEFGHIJKLMNOPQRSTUVWXYZ";
/*
 * Build the lookup tables used by the encoders and decoders: the digit
 * tables for the number encoding, the separator table and the prefix
 * characters.
 */
DictFilter::DictFilter()
{
	/* Word separators, good for general roman alphabet text and XML/HTML markup. */
	static const char word_seps[] = "<[\"(|/ \t:\n\r>]).?,;={}-+*";
	/* Letters tested by the Fasta variant when it looks at 4-character groups. */
	static const char genome_chars[] = "atgcATGC";
	uint32_t new_size, i;
	const char *p;

	memset(SEPARATOR, 0, sizeof (SEPARATOR));
	/*
	 * Zero both digit tables. Only the entries for real digits are filled
	 * in below; without this the remaining entries are indeterminate and a
	 * malformed stream would make from_base_enc() read them.
	 */
	memset(base_enc_digits, 0, sizeof (base_enc_digits));
	memset(base_dec_digits, 0, sizeof (base_dec_digits));

	/*
	 * Initialize the number encoding characters. Total
	 * 217 chars are used for a base-217 encoding. In
	 * particular, separator characters are avoided.
	 */
	new_size = strlen(BASE_DIGITS);
	for (i=0; i<new_size; i++) {
		base_dec_digits[(uint8_t)BASE_DIGITS[i]] = i;
		base_enc_digits[i] = BASE_DIGITS[i];
	}

	/* Byte values 1..8 */
	new_size = 1;
	while (new_size < 9) {
		base_dec_digits[new_size] = i;
		base_enc_digits[i++] = new_size++;
	}

	/* Byte values 14..31 */
	new_size = 14;
	while (new_size < 32) {
		base_dec_digits[new_size] = i;
		base_enc_digits[i++] = new_size++;
	}

	/* Byte values 128..255 */
	new_size = 128;
	while (new_size < 256) {
		base_dec_digits[new_size] = i;
		base_enc_digits[i++] = new_size++;
	}
	base_enc_digits[i] = '\0';
	NUMERAL_BASE = i;

	/* Bit 0 marks a word separator. */
	for (p = word_seps; *p != '\0'; p++)
		SEPARATOR[(uint8_t)*p] = 1;

	/* Value 128 marks a genome letter. */
	for (p = genome_chars; *p != '\0'; p++)
		SEPARATOR[(uint8_t)*p] = 128;

	/*
	 * Prefix characters for encoded words and numbers.
	 */
	flag = '`';
	flag1 = '!';
	flag2 = '$';
}
/*
 * Destructor. The object owns no heap memory, so the only work is to drop
 * the singleton pointer when the instance being destroyed is the registered
 * one. It must not delete 'inst' itself: that would re-enter this destructor
 * on the same object while inst_lock is already held.
 */
DictFilter::~DictFilter()
{
	pthread_mutex_lock(&inst_lock);
	if (inst == this) {
		inst = NULL;
	}
	pthread_mutex_unlock(&inst_lock);
}
/*
 * Write 'number' as a NUL-terminated digit string in base NUMERAL_BASE,
 * right-aligned in the 'sz' byte buffer 'str'. Returns a pointer to the
 * first digit inside 'str'. A value of 0 yields an empty string (the
 * returned pointer addresses the terminator). Digits that do not fit in the
 * buffer are dropped.
 */
uint8_t *
DictFilter::to_base_enc(uint32_t number, uint8_t *str, int sz)
{
	int at = sz - 1;

	str[at] = '\0';
	/* Emit digits from least to most significant, moving towards the front. */
	for (; number > 0 && at > 0; number /= NUMERAL_BASE) {
		str[--at] = base_enc_digits[number % NUMERAL_BASE];
	}
	return (str + at);
}
/*
 * Convert the 'sz' digit characters at 'dnum' (most significant first) from
 * base NUMERAL_BASE back to a number. A length of zero or less yields 0.
 */
uint32_t
DictFilter::from_base_enc(uint8_t *dnum, int sz)
{
	uint32_t place = 1;
	uint32_t value = 0;
	int k;

	/* Walk from the least significant (last) digit towards the first. */
	for (k = sz - 1; k >= 0; k--) {
		uint32_t digit = base_dec_digits[dnum[k]];

		value += digit * place;
		place *= NUMERAL_BASE;
	}
	return (value);
}
/*
 * Walk a hash bucket chain starting at 'de' looking for the word 'str' of
 * length 'sz'. The first letter is matched through its lower-cased form
 * ('lcfirst') so that Proper-case words match their lower-case dictionary
 * entry; the remaining sz-1 bytes must match exactly.
 * Returns the matching entry or NULL.
 */
dict_entry_t *
DictFilter::find_string(dict_entry_t *de, uint8_t *str, unsigned int sz, uint8_t lcfirst)
{
	dict_entry_t *cur;

	for (cur = de; cur != NULL; cur = cur->next) {
		if (cur->sz != sz)
			continue;
		if (cur->lcfirst != lcfirst)
			continue;
		if (eq_bytes(cur->word+1, str+1, sz-1) == 0)
			return (cur);
	}
	return (NULL);
}
/*
 * Prepare an empty hash table with 'dictsize' buckets. The bucket array is
 * allocated zero-filled, and a zero-filled sentinel entry is allocated for
 * hash_add() to return when a word already exists.
 */
void
DictFilter::hash_context_init(hash_context_t *hctx, uint32_t dictsize)
{
	dict_entry_t **buckets = new dict_entry_t* [dictsize]();

	hctx->dict = buckets;
	hctx->dictsize = dictsize;
	hctx->dictcount = 0;
	hctx->collisions = 0;
	hctx->sentinel = new dict_entry_t[1]();
}
/*
 * Release everything owned by the hash table: every entry on every bucket
 * chain, the bucket array and the sentinel.
 *
 * All of these are allocated with array new (new T[n]() in
 * hash_context_init() and hash_add()), so they must be released with
 * delete[]; using scalar delete on them is undefined behavior.
 */
void
DictFilter::hash_context_delete(hash_context_t *hctx) {
	uint32_t i;

	for (i=0; i<hctx->dictsize; i++) {
		dict_entry_t *de = hctx->dict[i];

		while (de) {
			dict_entry_t *next = de->next;

			delete[] de;
			de = next;
		}
	}
	delete[] hctx->dict;
	delete[] hctx->sentinel;
	hctx->dictcount = 0;
	hctx->collisions = 0;
}
/*
 * Look up a word in the hash table. The hash covers every byte except the
 * first one and is seeded with the lower-cased first letter. The bucket
 * index that was computed is remembered in hctx->cur_indx.
 * Returns the matching entry or NULL.
 */
dict_entry_t *
DictFilter::hash_lookup(hash_context_t *hctx, uint8_t *word, uint32_t wordsize, uint8_t lcfirst)
{
	const uint32_t bucket = XXH32(word+1, wordsize-1, lcfirst) % hctx->dictsize;

	hctx->cur_indx = bucket;
	return (find_string(hctx->dict[bucket], word, wordsize, lcfirst));
}
/*
 * Record one occurrence of a word.
 *
 * With _de == NULL the word is looked up first:
 *   - if it exists its occurrence count is bumped and hctx->sentinel is
 *     returned;
 *   - if it does not exist and the table is full, NULL is returned;
 *   - otherwise a new entry is allocated and inserted.
 * With a non-NULL _de the caller has already done the lookup and supplies an
 * aged-out node to reuse, so no lookup and no capacity check are done.
 *
 * The entry keeps the 'word' pointer as given. Returns the inserted entry in
 * the insertion cases.
 */
dict_entry_t *
DictFilter::hash_add(hash_context_t *hctx, uint8_t *word, uint32_t wordsize, dict_entry_t *_de)
{
	const uint8_t first_lc = tolower(word[0]);
	dict_entry_t *entry = _de;
	dict_entry_t **bucket;

	if (entry == NULL) {
		dict_entry_t *found = hash_lookup(hctx, word, wordsize, first_lc);

		if (found != NULL) {
			found->occur++;
			return (hctx->sentinel);
		}
		if (hctx->dictcount == hctx->dictsize)
			return (NULL);
		entry = new dict_entry_t[1]();
	} else {
		/* Recycled node: only the bucket index has to be computed. */
		hctx->cur_indx = XXH32(word+1, wordsize-1, first_lc) % hctx->dictsize;
	}

	entry->word = word;
	entry->sz = wordsize;
	entry->lcfirst = first_lc;
	entry->occur = 1;
	entry->indx = hctx->cur_indx;

	/* Push the entry on the front of its bucket chain. */
	bucket = &hctx->dict[hctx->cur_indx];
	if (*bucket != NULL)
		hctx->collisions++;
	entry->next = *bucket;
	*bucket = entry;
	hctx->dictcount++;
	return (entry);
}
/*
 * Unlink an entry from the hash table without freeing it.
 *
 * With r_de == NULL the entry is found by looking up 'word'; NULL is
 * returned when the word is not present. With a non-NULL r_de that entry is
 * removed from the bucket recorded in its 'indx' field.
 *
 * The entry's 'indx' is reset to UINT32_MAX. Returns the removed entry. If
 * the entry cannot be found on its bucket chain the table is corrupt and the
 * assertion fires (NULL is returned when assertions are compiled out).
 *
 * The chain is walked through a pointer to the link being examined, so an
 * empty bucket is handled like any other "not found" case instead of being
 * dereferenced.
 */
dict_entry_t *
DictFilter::hash_remove(hash_context_t *hctx, uint8_t *word, uint32_t wordsize, dict_entry_t *r_de)
{
	dict_entry_t *de;
	dict_entry_t **link;

	if (!r_de) {
		uint8_t lcfirst = tolower(word[0]);

		de = hash_lookup(hctx, word, wordsize, lcfirst);
		if (de == NULL)
			return (NULL);
	} else {
		de = r_de;
		hctx->cur_indx = de->indx;
	}

	de->indx = UINT32_MAX;
	for (link = &hctx->dict[hctx->cur_indx]; *link != NULL; link = &(*link)->next) {
		if (*link == de) {
			*link = de->next;
			hctx->dictcount--;
			return (de);
		}
	}
	assert(0 == 1); // Fail, corrupted hash
	return (NULL);
}
/*
 * Prepare an empty aging list that can hold up to 'listsize' entries. A
 * zero-filled dummy head node is allocated; the tail starts at that node.
 * Both statistics counters start at zero: aging_requests is incremented by
 * list_pop_lru_min(), so it must not be left uninitialized.
 */
void
DictFilter::list_context_init(list_context_t *lctx, uint32_t listsize)
{
	lctx->head = new dict_entry_t[1]();
	lctx->tail = lctx->head;
	lctx->listcount = 0;
	lctx->listsize = listsize;
	lctx->aged_entries = 0;
	lctx->aging_requests = 0;
}
/*
 * Release the aging list's dummy head node and reset the counters. The
 * entries on the list are not freed here. The head is allocated with array
 * new in list_context_init(), so it is released with delete[].
 */
void
DictFilter::list_context_delete(list_context_t *lctx)
{
	delete[] lctx->head;
	lctx->head = NULL;
	lctx->tail = NULL;
	lctx->listcount = 0;
	lctx->aged_entries = 0;
}
/*
 * Append an entry at the tail of the aging list.
 * Returns the entry, or NULL (list untouched) when the list is already at
 * capacity.
 */
dict_entry_t *
DictFilter::list_push(list_context_t *lctx, dict_entry_t *de)
{
	dict_entry_t *old_tail;

	if (lctx->listcount == lctx->listsize)
		return (NULL);

	old_tail = lctx->tail;
	old_tail->list_next = de;
	de->list_next = NULL;
	lctx->tail = de;
	lctx->listcount += 1;
	return (de);
}
/*
 * Identify a dictionary entry to evict from the N least recently used
 * entries at the list head (N = LIST_LRU_NUM). The scanned entry with the
 * lowest occurrence count is evicted, provided its occurrence X size value
 * is below 2048. The list tail is never a candidate.
 * If no such entry can be found then the current lru aging request is not
 * fulfilled and NULL is returned. In that case, when the list holds more
 * than N entries, the N scanned entries are rotated to the tail of the list.
 * This increases the likelihood of finding an entry to evict for the next
 * request. This allows incremental sequential probing of the list without
 * incurring the cost of very large sequential scans, but at the cost of
 * missing some interesting words.
 * N is kept a small positive number.
 *
 * The rotation moves the tail pointer to the last rotated entry. Leaving it
 * on the old tail would make the next list_push() overwrite the old tail's
 * link and cut the rotated entries off the list.
 */
dict_entry_t *
DictFilter::list_pop_lru_min(list_context_t *lctx)
{
	dict_entry_t *p_de, *c_de;
	dict_entry_t *min, *min_p;
	uint32_t list_scan, occur;

	if (lctx->listcount == 0)
		return (NULL);
	lctx->aging_requests++;

	p_de = lctx->head;
	c_de = lctx->head->list_next;
	min = NULL;
	min_p = NULL;
	if (lctx->listcount > LIST_LRU_NUM)
		list_scan = LIST_LRU_NUM;
	else
		list_scan = lctx->listcount;

	/* Find the lowest occurrence count among the scanned entries. */
	occur = UINT32_MAX;
	while (c_de && c_de != lctx->tail && list_scan > 0) {
		if (c_de->occur < occur) {
			min = c_de;
			min_p = p_de;
			occur = c_de->occur;
		}
		list_scan--;
		p_de = c_de;
		c_de = c_de->list_next;
	}

	if (min && min->occur * min->sz < 2048) {
		min_p->list_next = min->list_next;
		lctx->aged_entries++;
		lctx->listcount--;
		return (min);
	}

	/*
	 * Nothing evictable: rotate the scanned run [first .. p_de] behind the
	 * current tail. c_de is the first entry that was not scanned and
	 * becomes the new list front.
	 */
	if (lctx->listcount > LIST_LRU_NUM && p_de != lctx->head && c_de != NULL) {
		lctx->tail->list_next = lctx->head->list_next;
		lctx->head->list_next = c_de;
		p_de->list_next = NULL;
		lctx->tail = p_de;
	}
	return (NULL);
}
/*
 * Dictionary-encode a text buffer.
 *
 * src/size  - input data. Inputs smaller than 1024 bytes are not processed.
 * dst       - output buffer.
 * dstsize   - in: capacity of dst; out (on success): bytes written.
 *
 * Output layout: the base-encoded number of dictionary words, a space, each
 * dictionary word followed by a space, then the encoded data.
 *
 * Returns 1 on success and 0 when the input is too small or the output does
 * not fit in dst.
 *
 * Fixes relative to the earlier version: dictionary indexes are kept in a
 * 32-bit variable (a 16-bit one truncated indexes above 65535), the flattened
 * dictionary array is released with delete[] to match its new[], and the
 * header is only written when dst has room for it.
 */
int
DictFilter::Forward_Dict(uint8_t *src, uint32_t size, uint8_t *dst, uint32_t *dstsize)
{
	uint32_t dstSize = 0, dictSize, i, pos, num_entries;
	hash_context_t hctx;
	list_context_t lctx;
	dict_entry_t **sorted_dict;
	uint8_t num_dict[10], *numd;
	ssize_t new_size;
	int rv, sz;

	if (size < 1024)
		return 0;

	/* The hash table size is derived from the data size. */
	if (size > 20000) {
		dictSize = size / 10000;
		dictSize += (dictSize >> 1);
	} else {
		dictSize = (size >> 1);
	}
	dictSize++;
	pos = 0;
	rv = 0;
	hash_context_init(&hctx, dictSize);
	list_context_init(&lctx, dictSize);
	sorted_dict = new dict_entry_t* [dictSize];

	/*
	 * Scan words in the data and build the dictionary.
	 */
	for (i=0; i<size; i++) {
		uint8_t c = src[i];

		if (SEPARATOR[c] & 1) {
			dict_entry_t *de;
			size_t toklen = i - pos;

			if (toklen < WORD_MIN || toklen > WORD_MAX) {
				pos = i+1;
				continue;
			}
			de = hash_add(&hctx, src+pos, toklen, NULL);
			if (!de && i > (size>>1)) {
				/*
				 * Table full and past the first half of the data:
				 * try to age out an entry and reuse its node.
				 */
				de = list_pop_lru_min(&lctx);
				if (de) {
					dict_entry_t *de1;

					de1 = hash_remove(&hctx, de->word, de->sz, de);
					assert(de1 == de);
					de1 = hash_add(&hctx, src+pos, toklen, de);
					assert(de1 != NULL);
					assert(de1 != hctx.sentinel);
					list_push(&lctx, de1);
				}
			} else if (de != hctx.sentinel) {
				list_push(&lctx, de);
			}
			pos = i+1;
		}
	}

	/*
	 * Mark below-threshold entries in the dictionary. Also sorted_dict holds a
	 * flattened view of the hash.
	 */
	pos = 0;
	for (i=0; i<dictSize; i++) {
		dict_entry_t *de;

		for (de = hctx.dict[i]; de != NULL; de = de->next) {
			ssize_t val;

			val = (size_t)de->occur * (size_t)de->sz;
			if (val <= 4500) {
				de->occur = 0;
				continue;
			}
			sorted_dict[pos++] = de;
		}
	}

	/*
	 * Sort the flattened view of the hash in descending order of
	 * occurrence X word size.
	 */
	qsort(sorted_dict, pos, sizeof (dict_entry_t *), cmpoccur);
	num_entries = 0;
	new_size = size;
	for (i=0; i<pos; i++) {
		dict_entry_t *de;
		ssize_t prev_size;

		de = sorted_dict[i];
		if (de->occur > 1) {
			ssize_t val;

			/*
			 * Mark entries for which the encoded representation will be
			 * larger than the original.
			 */
			prev_size = new_size;
			val = (size_t)de->occur * (size_t)de->sz;
			new_size -= val;
			if (num_entries == 0)
				new_size += ((size_t)de->sz + (size_t)de->occur * 1);
			else if (num_entries < NUMERAL_BASE)
				new_size += ((size_t)de->sz + (size_t)de->occur * 2);
			else if (num_entries < NUMERAL_BASE * NUMERAL_BASE)
				new_size += ((size_t)de->sz + (size_t)de->occur * 3);
			else if (num_entries < NUMERAL_BASE * NUMERAL_BASE * NUMERAL_BASE)
				new_size += ((size_t)de->sz + (size_t)de->occur * 4);
			else
				new_size += ((size_t)de->sz + (size_t)de->occur * 5);
			if (new_size >= prev_size) {
				new_size = prev_size;
				de->occur = 0;
				continue;
			}
			/* From here on indx is the word's position in the final dictionary. */
			de->indx = num_entries;
			num_entries++;
		} else {
			de->occur = 0;
		}
	}

	/* Header: number of dictionary words followed by a space. */
	sz = sizeof (num_dict);
	numd = to_base_enc(num_entries, num_dict, sz);
	dstSize = num_dict+sz-numd-1;
	if ((uint64_t)dstSize + 1 > *dstsize) {
		goto bail;
	}
	copy_bytes(dst, numd, dstSize);
	dst[dstSize++] = ' ';

	/*
	 * Copy the dictionary to the output buffer.
	 */
	for (i=0; i<pos && dstSize<*dstsize; i++) {
		dict_entry_t *de;

		de = sorted_dict[i];
		if (de->occur > 1) {
			dst[dstSize++] = de->lcfirst;
			if (dstSize + de->sz + 1 >= *dstsize) {
				goto bail;
			}
			copy_bytes(&dst[dstSize], de->word+1, de->sz-1);
			dstSize += (de->sz-1);
			dst[dstSize++] = ' ';
		}
	}

	/*
	 * Encode the data, token by token.
	 */
	pos = 0;
	for (i=0; i<size && dstSize<*dstsize; i++) {
		uint8_t *tok, c;

		c = src[i];
		if (SEPARATOR[c] & 1) {
			dict_entry_t *de;
			size_t toklen = i - pos;

			if (toklen < WORD_MIN || toklen > WORD_MAX) {
				/*
				 * Token is never encoded: copy it with its separator,
				 * escaping a leading prefix character.
				 */
				if (*(src+pos) == flag || *(src+pos) == flag1 ||
				    *(src+pos) == flag2 || *(src+pos) == '\\') {
					dst[dstSize++] = '\\';
				}
				if (dstSize + toklen + 1 > *dstsize) {
					goto bail;
				}
				copy_bytes(&dst[dstSize], src+pos, toklen+1);
				dstSize += (toklen+1);
				pos = i+1;
				continue;
			}

			tok = src+pos;
			de = hash_lookup(&hctx, tok, toklen, tolower(tok[0]));
			if (de != NULL && de->occur > 1) {
				uint32_t val;
				unsigned char tok_hdr[10], *dnum;

				/*
				 * Encode word with dictionary reference. The index must
				 * be held in 32 bits; the decoder reads it as uint32_t.
				 */
				sz = sizeof (tok_hdr);
				val = de->indx;
				dnum = to_base_enc(val, tok_hdr, sz);
				dnum--;
				if (isupper(tok[0])) {
					*dnum = flag1;
				} else {
					*dnum = flag;
				}
				val = tok_hdr+sz - dnum-1;
				if (dstSize + val + 1 > *dstsize) {
					goto bail;
				}
				copy_bytes(&dst[dstSize], dnum, val);
				dstSize += val;
				dst[dstSize++] = src[i];
			} else {
				uint8_t *word = src+pos;
				uint8_t num[15];
				uint32_t val;
				int converted;

				/*
				 * Encode literal numeric strings.
				 */
				converted = 0;
				if (isdigit(word[0]) && word[0] != '0' && toklen > 4 && toklen < 10) {
					copy_bytes(num, word, toklen);
					num[toklen] = '\0';
					val = strtoul((const char *)num, (char **)&word, 10);
					if (*word == '\0' && word - num == toklen && val > 0) {
						uint8_t tok_hdr[10], *dnum;

						sz = sizeof (tok_hdr);
						dnum = to_base_enc(val, tok_hdr, sz);
						dnum--;
						*dnum = flag2;
						val = tok_hdr+sz - dnum-1;
						if (dstSize + val + 1 > *dstsize) {
							goto bail;
						}
						copy_bytes(&dst[dstSize], dnum, val);
						dstSize += val;
						dst[dstSize++] = src[i];
						converted = 1;
					}
				}
				if (!converted) {
					if (*(src+pos) == flag || *(src+pos) == flag1 ||
					    *(src+pos) == flag2 || *(src+pos) == '\\') {
						dst[dstSize++] = '\\';
					}
					if (dstSize + toklen + 1 > *dstsize) {
						goto bail;
					}
					copy_bytes(&dst[dstSize], src+pos, toklen+1);
					dstSize += (toklen+1);
				}
			}
			pos = i+1;
		}
	}

	/* Copy whatever follows the last separator unchanged. */
	if (pos < size) {
		uint32_t rem = size - pos;

		if (dstSize + rem > *dstsize) {
			goto bail;
		}
		copy_bytes(&dst[dstSize], src+pos, rem);
		dstSize += rem;
	}
	*dstsize = dstSize;
	rv = 1;
bail:
	hash_context_delete(&hctx);
	list_context_delete(&lctx);
	delete[] sorted_dict;
	return rv;
}
/*
 * Dictionary-encode a buffer using the Fasta variant of the tokenizer: in
 * addition to separator boundaries, tokens are also cut after runs of 4
 * characters when those 4 characters are all genome letters (a/t/g/c in
 * either case).
 *
 * src/size  - input data. Inputs smaller than 1024 bytes are not processed.
 * dst       - output buffer.
 * dstsize   - in: capacity of dst; out (on success): bytes written.
 *
 * Output layout: the base-encoded number of dictionary words, a space, the
 * three prefix characters, a space, each dictionary word followed by a
 * space, then the encoded data. No escaping is done in this variant, so the
 * input is rejected if it contains any of the prefix characters.
 *
 * Returns 1 on success and 0 otherwise.
 *
 * Fixes relative to the earlier version: dictionary indexes are kept in a
 * 32-bit variable (a 16-bit one truncated indexes above 65535), the flattened
 * dictionary array is released with delete[] to match its new[], and the
 * header is only written when dst has room for it.
 */
int
DictFilter::Forward_Dict_Fasta(uint8_t *src, uint32_t size, uint8_t *dst, uint32_t *dstsize)
{
	uint32_t dstSize = 0, dictSize, i, pos, num_entries;
	hash_context_t hctx;
	list_context_t lctx;
	dict_entry_t **sorted_dict;
	uint8_t num_dict[10], *numd;
	ssize_t new_size;
	int rv, sz, j;

	if (size < 1024)
		return 0;

	/* The hash table size is derived from the data size. */
	if (size > 20000) {
		dictSize = size / 10000;
		dictSize += (dictSize >> 1);
	} else {
		dictSize = (size >> 1);
	}
	dictSize++;
	pos = 0;
	rv = 0;
	j = 0;
	hash_context_init(&hctx, dictSize);
	list_context_init(&lctx, dictSize);
	sorted_dict = new dict_entry_t* [dictSize];

	/*
	 * Scan words in the data and build the dictionary.
	 */
	for (i=0; i<size; i++) {
		uint8_t c = src[i];
		int is_sep, genome;

		/* Prefix characters in the input cannot be represented. */
		if (c == flag || c == flag1 || c == flag2)
			goto bail;
		is_sep = SEPARATOR[c] & 1;
		if (is_sep || j == 4) {
			dict_entry_t *de;
			size_t toklen = i - pos;

			if (j == 4 && !is_sep) {
				unsigned char *bf = src+pos;

				genome = ((SEPARATOR[bf[0]]&128) & (SEPARATOR[bf[1]]&128) &
				    (SEPARATOR[bf[2]]&128) & (SEPARATOR[bf[3]]&128));
				if (!genome) {
					j = 0;
					continue;
				}
			}
			j = 0;
			if (toklen < WORD_MIN || toklen > WORD_MAX) {
				if (is_sep) {
					pos = i+1;
					j--;
				} else {
					pos = i;
				}
				j++;
				continue;
			}
			de = hash_add(&hctx, src+pos, toklen, NULL);
			if (!de && i > (size>>1)) {
				/*
				 * Table full and past the first half of the data:
				 * try to age out an entry and reuse its node.
				 */
				de = list_pop_lru_min(&lctx);
				if (de) {
					dict_entry_t *de1;

					de1 = hash_remove(&hctx, de->word, de->sz, de);
					assert(de1 == de);
					de1 = hash_add(&hctx, src+pos, toklen, de);
					assert(de1 != NULL);
					assert(de1 != hctx.sentinel);
					list_push(&lctx, de1);
				}
			} else if (de != hctx.sentinel) {
				list_push(&lctx, de);
			}
			if (is_sep) {
				pos = i+1;
				j--;
			} else {
				pos = i;
			}
		}
		j++;
	}

	/*
	 * Mark below-threshold entries in the dictionary. Also sorted_dict holds a
	 * flattened view of the hash.
	 */
	pos = 0;
	for (i=0; i<dictSize; i++) {
		dict_entry_t *de;

		for (de = hctx.dict[i]; de != NULL; de = de->next) {
			ssize_t val;

			val = (size_t)de->occur * (size_t)de->sz;
			if (val <= 4500) {
				de->occur = 0;
				continue;
			}
			sorted_dict[pos++] = de;
		}
	}

	/*
	 * Sort the flattened view of the hash in descending order of
	 * occurrence X word size.
	 */
	qsort(sorted_dict, pos, sizeof (dict_entry_t *), cmpoccur);
	num_entries = 0;
	new_size = size;
	for (i=0; i<pos; i++) {
		dict_entry_t *de;
		ssize_t prev_size;

		de = sorted_dict[i];
		if (de->occur > 1) {
			ssize_t val;

			/*
			 * Mark entries for which the encoded representation will be
			 * larger than the original.
			 */
			prev_size = new_size;
			val = (size_t)de->occur * (size_t)de->sz;
			new_size -= val;
			if (num_entries == 0)
				new_size += ((size_t)de->sz + (size_t)de->occur * 1);
			else if (num_entries < NUMERAL_BASE)
				new_size += ((size_t)de->sz + (size_t)de->occur * 2);
			else if (num_entries < NUMERAL_BASE * NUMERAL_BASE)
				new_size += ((size_t)de->sz + (size_t)de->occur * 3);
			else if (num_entries < NUMERAL_BASE * NUMERAL_BASE * NUMERAL_BASE)
				new_size += ((size_t)de->sz + (size_t)de->occur * 4);
			else
				new_size += ((size_t)de->sz + (size_t)de->occur * 5);
			if (new_size >= prev_size) {
				new_size = prev_size;
				de->occur = 0;
				continue;
			}
			/* From here on indx is the word's position in the final dictionary. */
			de->indx = num_entries;
			num_entries++;
		} else {
			de->occur = 0;
		}
	}

	/*
	 * Header: number of dictionary words, a space, the three prefix
	 * characters and another space.
	 */
	sz = sizeof (num_dict);
	numd = to_base_enc(num_entries, num_dict, sz);
	dstSize = num_dict+sz-numd-1;
	if ((uint64_t)dstSize + 5 > *dstsize) {
		goto bail;
	}
	copy_bytes(dst, numd, dstSize);
	dst[dstSize++] = ' ';
	// Copy the flags
	dst[dstSize++] = flag;
	dst[dstSize++] = flag1;
	dst[dstSize++] = flag2;
	dst[dstSize++] = ' ';

	/*
	 * Copy the dictionary to the output buffer.
	 */
	for (i=0; i<pos && dstSize<*dstsize; i++) {
		dict_entry_t *de;

		de = sorted_dict[i];
		if (de->occur > 1) {
			dst[dstSize++] = de->lcfirst;
			if (dstSize + de->sz + 1 >= *dstsize) {
				goto bail;
			}
			copy_bytes(&dst[dstSize], de->word+1, de->sz-1);
			dstSize += (de->sz-1);
			dst[dstSize++] = ' ';
		}
	}

	/*
	 * Encode the data using the same tokenization as the scan above.
	 */
	pos = 0;
	j = 0;
	for (i=0; i<size && dstSize<*dstsize; i++) {
		uint8_t *tok, c;
		int is_sep, genome;

		c = src[i];
		is_sep = SEPARATOR[c] & 1;
		if (is_sep || j == 4) {
			dict_entry_t *de;
			size_t toklen = i - pos;

			if (j == 4 && !is_sep) {
				unsigned char *bf = src+pos;

				genome = ((SEPARATOR[bf[0]]&128) & (SEPARATOR[bf[1]]&128) &
				    (SEPARATOR[bf[2]]&128) & (SEPARATOR[bf[3]]&128));
				if (!genome) {
					j = 0;
					continue;
				}
			}
			j = 0;
			if (toklen < WORD_MIN || toklen > WORD_MAX) {
				/* Token is never encoded: copy it through. */
				if (is_sep) {
					if (dstSize + toklen + 1 > *dstsize) {
						goto bail;
					}
					copy_bytes(&dst[dstSize], src+pos, toklen+1);
					dstSize += (toklen+1);
					pos = i+1;
					j--;
				} else {
					if (dstSize + toklen > *dstsize) {
						goto bail;
					}
					copy_bytes(&dst[dstSize], src+pos, toklen);
					dstSize += toklen;
					pos = i;
				}
				j++;
				continue;
			}

			tok = src+pos;
			de = hash_lookup(&hctx, tok, toklen, tolower(tok[0]));
			if (de != NULL && de->occur > 1) {
				uint32_t val;
				unsigned char tok_hdr[10], *dnum;

				/*
				 * Encode word with dictionary reference. The index must
				 * be held in 32 bits; the decoder reads it as uint32_t.
				 */
				sz = sizeof (tok_hdr);
				val = de->indx;
				dnum = to_base_enc(val, tok_hdr, sz);
				dnum--;
				if (isupper(tok[0])) {
					*dnum = flag1;
				} else {
					*dnum = flag;
				}
				val = tok_hdr+sz - dnum-1;
				if (dstSize + val + 1 > *dstsize) {
					goto bail;
				}
				copy_bytes(&dst[dstSize], dnum, val);
				dstSize += val;
				if (is_sep) {
					dst[dstSize++] = src[i];
				} else {
					/*
					 * Token was cut without a separator: terminate
					 * the reference with flag2 instead.
					 */
					dst[dstSize++] = flag2;
				}
			} else {
				if (is_sep) {
					if (dstSize + toklen + 1 > *dstsize) {
						goto bail;
					}
					copy_bytes(&dst[dstSize], src+pos, toklen+1);
					dstSize += (toklen+1);
				} else {
					if (dstSize + toklen > *dstsize) {
						goto bail;
					}
					copy_bytes(&dst[dstSize], src+pos, toklen);
					dstSize += toklen;
				}
			}
			if (is_sep) {
				pos = i+1;
				j--;
			} else {
				pos = i;
			}
		}
		j++;
	}

	/* Copy whatever follows the last token boundary unchanged. */
	if (pos < size) {
		uint32_t rem = size - pos;

		if (dstSize + rem > *dstsize) {
			goto bail;
		}
		copy_bytes(&dst[dstSize], src+pos, rem);
		dstSize += rem;
	}
	*dstsize = dstSize;
	rv = 1;
bail:
	hash_context_delete(&hctx);
	list_context_delete(&lctx);
	delete[] sorted_dict;
	return rv;
}
int
DictFilter::Inverse_Dict(uint8_t *src, uint32_t srclen, uint8_t *dst, uint32_t *dstsize)
{
uint32_t numWords, i, enclen, pos;
uint8_t *srcpos, *end, *dstpos, *dstend, c;
decode_dict_entry_t *w_dict;
end = src + srclen;
srcpos = (uint8_t *)memchr((const void *)src, ' ', WORD_MAX);
if (srcpos - src > 12) {
return (0);
}
numWords = from_base_enc(src, srcpos - src);
srcpos++;
w_dict = new decode_dict_entry_t[numWords];
for (i = 0; i < numWords && srcpos < end; i++) {
uint8_t *w_src = srcpos;
size_t limit;
limit = end - srcpos;
if (limit > WORD_MAX+1) limit = WORD_MAX+1;
srcpos = (uint8_t *)memchr((const void *)srcpos, ' ', limit);
if (srcpos - w_src > WORD_MAX)
return (0);
w_dict[i].sz = srcpos - w_src;
w_dict[i].word = w_src;
srcpos++;
}
enclen = srclen - (srcpos - src);
dstpos = dst;
dstend = dst + *dstsize;
pos = 0;
for (i = 0; i < enclen && dstpos < dstend; i++) {
c = srcpos[i];
if (SEPARATOR[c] & 1) {
uint32_t toklen = i - pos;
uint32_t dpos;
c = srcpos[pos];
if (toklen == 0) {
*dstpos++ = srcpos[i];
} else if (c == '\\') {
if (dstpos + toklen > dstend) {
log_msg(LOG_ERR, 0, "Overflow in DICT decode.\n");
return (0);
}
copy_bytes(dstpos, srcpos+pos+1, toklen);
dstpos += toklen;
} else if (c == flag) {
toklen--;
dpos = from_base_enc(srcpos+pos+1, toklen);
if (dstpos + w_dict[dpos].sz > dstend) {
log_msg(LOG_ERR, 0, "Overflow in DICT decode.\n");
return (0);
}
copy_bytes(dstpos, w_dict[dpos].word, w_dict[dpos].sz);
dstpos += w_dict[dpos].sz;
*dstpos++ = srcpos[i];
} else if (c == flag1) {
toklen--;
dpos = from_base_enc(srcpos+pos+1, toklen);
if (dstpos + w_dict[dpos].sz > dstend) {
log_msg(LOG_ERR, 0, "Overflow in DICT decode.\n");
return (0);
}
*dstpos++ = toupper(*(w_dict[dpos].word));
copy_bytes(dstpos, w_dict[dpos].word+1, w_dict[dpos].sz-1);
dstpos += (w_dict[dpos].sz-1);
*dstpos++ = srcpos[i];
} else if (c == flag2) {
uint32_t n;
toklen--;
dpos = from_base_enc(srcpos+pos+1, toklen);
n = snprintf((char *)dstpos, dstend - dstpos, "%u", dpos);
if (n >= dstend - dstpos) {
log_msg(LOG_ERR, 0, "Overflow in DICT decode.\n");
return (0);
}
dstpos += n;
*dstpos++ = srcpos[i];
} else {
if (dstpos + toklen + 1 > dstend) {
log_msg(LOG_ERR, 0, "Overflow in DICT decode.\n");
return (0);
}
copy_bytes(dstpos, srcpos+pos, toklen+1);
dstpos += (toklen+1);
}
pos = i+1;
}
}
if (pos < i) {
if (dstpos + i - pos > dstend) {
log_msg(LOG_ERR, 0, "Overflow in DICT decode.\n");
return (0);
}
copy_bytes(dstpos, srcpos+pos, i-pos);
dstpos += (i-pos);
}
*dstsize = dstpos - dst;
return (1);
}
int
DictFilter::Inverse_Dict_Fasta(uint8_t *src, uint32_t srclen, uint8_t *dst, uint32_t *dstsize)
{
uint32_t numWords, i, enclen, pos;
uint8_t *srcpos, *end, *dstpos, *dstend, c;
decode_dict_entry_t *w_dict;
int flag, flag1, flag2;
uint8_t separator[256];
end = src + srclen;
srcpos = (uint8_t *)memchr((const void *)src, ' ', WORD_MAX);
if (srcpos - src > 12) {
return (0);
}
numWords = from_base_enc(src, srcpos - src);
srcpos++;
if ((srclen - (srcpos - src)) < 10) {
return (0);
}
flag = *srcpos++;
flag1 = *srcpos++;
flag2 = *srcpos++;
if (*srcpos != ' ')
return (0);
srcpos++;
memcpy(separator, SEPARATOR, sizeof (SEPARATOR));
separator[flag] = 1;
separator[flag1] = 1;
separator[flag2] = 1;
w_dict = new decode_dict_entry_t[numWords];
for (i = 0; i < numWords && srcpos < end; i++) {
size_t limit;
uint8_t *w_src;
w_src = srcpos;
limit = end - srcpos;
if (limit > WORD_MAX+1) limit = WORD_MAX+1;
srcpos = (uint8_t *)memchr((const void *)srcpos, ' ', limit);
if (srcpos - w_src > WORD_MAX)
return (0);
w_dict[i].sz = srcpos - w_src;
w_dict[i].word = w_src;
srcpos++;
}
enclen = srclen - (srcpos - src);
dstpos = dst;
dstend = dst + *dstsize;
pos = 0;
for (i = 1; i < enclen && dstpos < dstend; i++) {
c = srcpos[i];
if (separator[c]&1) {
uint32_t toklen;
uint32_t dpos;
uint8_t pc;
toklen = i - pos;
pc = srcpos[pos];
if (pc == flag) {
toklen --;
dpos = from_base_enc(srcpos+pos+1, toklen);
if (dstpos + w_dict[dpos].sz > dstend) {
log_msg(LOG_ERR, 0, "1: Overflow in DICT decode.");
return (0);
}
copy_bytes(dstpos, w_dict[dpos].word, w_dict[dpos].sz);
dstpos += w_dict[dpos].sz;
} else if (pc == flag1) {
toklen --;
dpos = from_base_enc(srcpos+pos+1, toklen);
if (dstpos + w_dict[dpos].sz > dstend) {
log_msg(LOG_ERR, 0, "2: Overflow in DICT decode.");
return (0);
}
*dstpos++ = toupper(*(w_dict[dpos].word));
copy_bytes(dstpos, w_dict[dpos].word+1, w_dict[dpos].sz-1);
dstpos += (w_dict[dpos].sz-1);
} else if (pc == flag2) {
toklen --;
if (toklen > 0) {
if (dstpos + toklen > dstend) {
log_msg(LOG_ERR, 0, "3: Overflow in DICT decode.");
return (0);
}
copy_bytes(dstpos, srcpos+pos+1, toklen);
dstpos += toklen;
}
} else {
if (dstpos + toklen > dstend) {
log_msg(LOG_ERR, 0, "4: Overflow in DICT decode.");
return (0);
}
copy_bytes(dstpos, srcpos+pos, toklen);
dstpos += toklen;
}
pos = i;
}
}
if (pos < i) {
uint32_t toklen;
uint32_t dpos;
uint8_t pc;
toklen = i - pos;
pc = srcpos[pos];
if (pc == flag) {
toklen --;
dpos = from_base_enc(srcpos+pos+1, toklen);
if (dstpos + w_dict[dpos].sz > dstend) {
log_msg(LOG_ERR, 0, "5: Overflow in DICT decode.\n");
return (0);
}
copy_bytes(dstpos, w_dict[dpos].word, w_dict[dpos].sz);
dstpos += w_dict[dpos].sz;
} else if (pc == flag1) {
toklen --;
dpos = from_base_enc(srcpos+pos+1, toklen);
if (dstpos + w_dict[dpos].sz > dstend) {
log_msg(LOG_ERR, 0, "6: Overflow in DICT decode.\n");
return (0);
}
*dstpos++ = toupper(*(w_dict[dpos].word));
copy_bytes(dstpos, w_dict[dpos].word+1, w_dict[dpos].sz-1);
dstpos += (w_dict[dpos].sz-1);
} else if (pc == flag2) {
toklen --;
if (toklen > 0) {
if (dstpos + toklen > dstend) {
log_msg(LOG_ERR, 0, "7: Overflow in DICT decode.\n");
return (0);
}
copy_bytes(dstpos, srcpos+pos+1, toklen);
dstpos += toklen;
}
} else {
if (dstpos + toklen > dstend) {
log_msg(LOG_ERR, 0, "8: Overflow in DICT decode.\n");
return (0);
}
copy_bytes(dstpos, srcpos+pos, toklen);
dstpos += toklen;
}
}
*dstsize = dstpos - dst;
return (1);
}
#ifdef __cplusplus
extern "C" {
#endif
/*
 * C entry point for dictionary encoding.
 *
 * from/fromlen - input data; must not exceed UINT32_MAX bytes.
 * to           - output buffer.
 * dstlen       - in: capacity of 'to'; out (on success): bytes written.
 * is_fasta     - non-zero selects the Fasta variant of the encoder.
 *
 * The output starts with a 5 byte header: the original length as a 32-bit
 * value stored through LE32(), then one byte that is 0 for the generic
 * encoder and 1 for the Fasta one. The encoded data follows.
 *
 * Returns 1 on success and -1 when the data was not encoded.
 */
int
dict_encode(uint8_t *from, uint64_t fromlen, uint8_t *to, uint64_t *dstlen, int is_fasta)
{
	DictFilter *df = DictFilter::getInstance();
	u32 fl;
	u32 dl;
	uint8_t *dst;
	int rv;
	DEBUG_STAT_EN(double strt, en);

	/*
	 * Dict can't handle > 4GB buffers :-O
	 */
	if (fromlen > UINT32_MAX)
		return (-1);

	/* The 5 byte header must fit; otherwise the capacity below would wrap. */
	if (*dstlen < 5)
		return (-1);

	fl = (u32)fromlen;
	/* Clamp rather than truncate an output capacity above 4GB. */
	if (*dstlen > UINT32_MAX)
		dl = UINT32_MAX;
	else
		dl = (u32)(*dstlen);
	DEBUG_STAT_EN(strt = get_wtime_millis());
	U32_P(to) = LE32(fl);
	dst = to + 4;
	dl -= 4;
	if (!is_fasta) {
		*dst++ = 0;
		dl--;
		rv = df->Forward_Dict(from, fl, dst, &dl);
	} else {
		*dst++ = 1;
		dl--;
		rv = df->Forward_Dict_Fasta(from, fl, dst, &dl);
	}
	if (rv) {
		*dstlen = (uint64_t)dl + 5;
		DEBUG_STAT_EN(en = get_wtime_millis());
		DEBUG_STAT_EN(fprintf(stderr, "DICT: fromlen: %" PRIu64 ", dstlen: %" PRIu64 "\n",
		    fromlen, *dstlen));
		DEBUG_STAT_EN(fprintf(stderr, "DICT: Processed at %.3f MB/s\n",
		    get_mb_s(fromlen, strt, en)));
		return (1);
	}
	DEBUG_STAT_EN(fprintf(stderr, "No DICT\n"));
	return (-1);
}
int
dict_decode(uint8_t *from, uint64_t fromlen, uint8_t *to, uint64_t *dstlen)
{
DictFilter *df = DictFilter::getInstance();
u32 fl;
u32 dl;
u8 *src;
int rv, is_fasta;
DEBUG_STAT_EN(double strt, en);
if (fromlen > UINT32_MAX) {
log_msg(LOG_ERR, 0, "Dict decode buffer too big!");
return (-1);
}
fl = (u32)fromlen;
DEBUG_STAT_EN(strt = get_wtime_millis());
dl = U32_P(from);
if (dl > *dstlen) {
log_msg(LOG_ERR, 0, "Destination overflow in dict_decode. Need: %" PRIu64 ", Got: %" PRIu64 "\n",
dl, *dstlen);
return (-1);
}
*dstlen = dl;
src = from + 4;
fl -= 4;
is_fasta = *src++;
fl--;
if (!is_fasta)
rv = df->Inverse_Dict(src, fl, to, &dl);
else
rv = df->Inverse_Dict_Fasta(src, fl, to, &dl);
if (!rv) {
log_msg(LOG_ERR, 0, "dict_decode: Failed.\n");
return (-1);
}
if (dl < *dstlen) {
log_msg(LOG_ERR, 0, "dict_decode: Expected: %" PRIu64 ", Got: %" PRIu64 "\n",
*dstlen, dl);
return (-1);
}
DEBUG_STAT_EN(en = get_wtime_millis());
DEBUG_STAT_EN(fprintf(stderr, "DICT: fromlen: %" PRIu64 ", dstlen: %" PRIu64 "\n",
fromlen, *dstlen));
DEBUG_STAT_EN(fprintf(stderr, "DICT: Processed at %.3f MB/s\n",
get_mb_s(fromlen, strt, en)));
return (0);
}
#ifdef __cplusplus
}
#endif