Improve XML detection in adaptive mode.

This commit is contained in:
Moinak Ghosh 2013-02-17 21:36:20 +05:30
parent 6badbcaea7
commit f41ea40bb9

View file

@@ -179,26 +179,35 @@ adapt_compress(void *src, uint64_t srclen, void *dst,
{ {
struct adapt_data *adat = (struct adapt_data *)(data); struct adapt_data *adat = (struct adapt_data *)(data);
uchar_t *src1 = (uchar_t *)src; uchar_t *src1 = (uchar_t *)src;
uint64_t i, tot8b, tagcnt; uint64_t i, tot8b, tag1, tag2, tag3;
int rv = 0, tag; int rv = 0;
double tagcnt, pct_tag;
uchar_t cur_byte, prev_byte;
/* /*
* Count number of 8-bit binary bytes and XML tags in source. * Count number of 8-bit binary bytes and XML tags in source.
*/ */
tot8b = 0; tot8b = 0;
tagcnt = 0; tag1 = 0;
tag2 = 0;
tag3 = 0;
prev_byte = cur_byte = 0;
for (i = 0; i < srclen; i++) { for (i = 0; i < srclen; i++) {
/*
* This could have been: tot8b += (src1[i] >> 7); cur_byte = src1[i];
* However the approach below allows the compiler to auto-vectorize this tot8b += (cur_byte & 0x80); // This way for possible auto-vectorization
* loop. tag1 += (cur_byte == '<');
*/ tag2 += (cur_byte == '>');
tot8b += (src1[i] & 0x80); tag3 += ((prev_byte == '<') & (cur_byte == '/'));
tag = ((src1[i] == '<') | (src1[i] == '>')); tag3 += ((prev_byte == '/') & (cur_byte == '>'));
tagcnt += tag; if (cur_byte != ' ')
prev_byte = cur_byte;
} }
tot8b /= 0x80; tot8b /= 0x80;
tagcnt = tag1 + tag2 + tag3;
pct_tag = tagcnt / (double)srclen;
/* /*
* Use PPMd if some percentage of source is 7-bit textual bytes, otherwise * Use PPMd if some percentage of source is 7-bit textual bytes, otherwise
* use Bzip2 or LZMA. * use Bzip2 or LZMA.
@@ -218,7 +227,8 @@ adapt_compress(void *src, uint64_t srclen, void *dst,
bzip2_count++; bzip2_count++;
} else { } else {
if (adat->bsc_data && tagcnt > ONE_PCT(srclen)) { if (adat->bsc_data && tag1 > tag2 - 4 && tag1 < tag2 + 4 && tag3 > (double)tag1 * 0.40 &&
tagcnt > (double)srclen * 0.001) {
#ifdef ENABLE_PC_LIBBSC #ifdef ENABLE_PC_LIBBSC
rv = libbsc_compress(src, srclen, dst, dstlen, level, chdr, adat->bsc_data); rv = libbsc_compress(src, srclen, dst, dstlen, level, chdr, adat->bsc_data);
if (rv < 0) if (rv < 0)