2014-09-20 07:19:00 +00:00
|
|
|
/*
|
|
|
|
* This file is a part of Pcompress, a chunked parallel multi-
|
|
|
|
* algorithm lossless compression and decompression program.
|
|
|
|
*
|
|
|
|
* Copyright (C) 2012-2014 Moinak Ghosh. All rights reserved.
|
|
|
|
* Use is subject to license terms.
|
|
|
|
*
|
|
|
|
* This program is free software; you can redistribute it and/or
|
|
|
|
* modify it under the terms of the GNU Lesser General Public
|
|
|
|
* License as published by the Free Software Foundation; either
|
|
|
|
* version 3 of the License, or (at your option) any later version.
|
|
|
|
*
|
|
|
|
* This program is distributed in the hope that it will be useful,
|
|
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
|
|
* Lesser General Public License for more details.
|
|
|
|
*
|
|
|
|
* You should have received a copy of the GNU Lesser General Public
|
|
|
|
* License along with this program.
|
|
|
|
* If not, see <http://www.gnu.org/licenses/>.
|
|
|
|
*
|
|
|
|
* moinakg@belenix.org, http://moinakg.wordpress.com/
|
|
|
|
*/
|
|
|
|
|
|
|
|
#include "utils.h"
|
2014-11-06 16:53:33 +00:00
|
|
|
#include "analyzer.h"
|
|
|
|
|
2015-03-22 18:06:04 +00:00
|
|
|
/*
 * Fractions of a count, computed in double precision.
 * The argument is parenthesized so the cast applies to the whole
 * expression passed in, not just to its first operand.
 */
#define	FIFTY_PCT(x)	((((double)(x)) / 10) * 5)
#define	THIRTY_PCT(x)	((((double)(x)) / 10) * 3)
#define	TEN_PCT(x)	(((double)(x)) / 10)
|
2014-11-06 16:53:33 +00:00
|
|
|
|
|
|
|
void
|
|
|
|
analyze_buffer(void *src, uint64_t srclen, analyzer_ctx_t *actx)
|
|
|
|
{
|
|
|
|
uchar_t *src1 = (uchar_t *)src;
|
2015-01-11 12:06:46 +00:00
|
|
|
uint64_t i, tot8b, tot_8b, lbytes, spc;
|
2014-11-06 16:53:33 +00:00
|
|
|
uchar_t cur_byte, prev_byte;
|
|
|
|
uint64_t tag1, tag2, tag3;
|
|
|
|
double tagcnt, pct_tag;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Count number of 8-bit binary bytes and XML tags in source.
|
|
|
|
*/
|
|
|
|
tot8b = 0;
|
|
|
|
tag1 = 0;
|
|
|
|
tag2 = 0;
|
|
|
|
tag3 = 0;
|
|
|
|
lbytes = 0;
|
2015-01-11 12:06:46 +00:00
|
|
|
spc = 0;
|
2014-11-06 16:53:33 +00:00
|
|
|
prev_byte = cur_byte = 0;
|
2015-03-22 18:06:04 +00:00
|
|
|
memset(actx, 0, sizeof (analyzer_ctx_t));
|
2014-11-06 16:53:33 +00:00
|
|
|
for (i = 0; i < srclen; i++) {
|
|
|
|
cur_byte = src1[i];
|
2015-03-22 18:06:04 +00:00
|
|
|
tot8b += (cur_byte > 127);
|
2014-11-06 16:53:33 +00:00
|
|
|
lbytes += (cur_byte < 32);
|
2015-01-11 12:06:46 +00:00
|
|
|
spc += (cur_byte == ' ');
|
2014-11-06 16:53:33 +00:00
|
|
|
tag1 += (cur_byte == '<');
|
|
|
|
tag2 += (cur_byte == '>');
|
|
|
|
tag3 += ((prev_byte == '<') & (cur_byte == '/'));
|
|
|
|
tag3 += ((prev_byte == '/') & (cur_byte == '>'));
|
|
|
|
if (cur_byte != ' ')
|
|
|
|
prev_byte = cur_byte;
|
|
|
|
}
|
2015-01-09 16:43:24 +00:00
|
|
|
|
2014-11-06 16:53:33 +00:00
|
|
|
/*
|
|
|
|
* Heuristics for detecting BINARY vs generic TEXT vs XML data at various
|
|
|
|
* significance levels.
|
|
|
|
*/
|
2015-03-22 18:06:04 +00:00
|
|
|
tot_8b = tot8b + lbytes;
|
2015-01-11 12:06:46 +00:00
|
|
|
tagcnt = tag1 + tag2;
|
2014-11-06 16:53:33 +00:00
|
|
|
pct_tag = tagcnt / (double)srclen;
|
2015-03-22 18:06:04 +00:00
|
|
|
if (tot_8b > THIRTY_PCT(srclen)) {
|
|
|
|
actx->thirty_pct.btype = TYPE_BINARY;
|
2014-11-06 16:53:33 +00:00
|
|
|
} else {
|
2015-03-22 18:06:04 +00:00
|
|
|
actx->thirty_pct.btype = TYPE_TEXT;
|
2014-11-06 16:53:33 +00:00
|
|
|
}
|
2015-01-09 16:43:24 +00:00
|
|
|
|
2014-11-06 16:53:33 +00:00
|
|
|
if (tot_8b > FIFTY_PCT(srclen)) {
|
|
|
|
actx->fifty_pct.btype = TYPE_BINARY;
|
|
|
|
} else {
|
|
|
|
actx->fifty_pct.btype = TYPE_TEXT;
|
|
|
|
}
|
|
|
|
|
2015-03-22 18:06:04 +00:00
|
|
|
/* This should be tot8b and not tot_8b. */
|
2015-01-11 12:06:46 +00:00
|
|
|
if (tot8b <= TEN_PCT((double)srclen) && lbytes < ((srclen>>1) + (srclen>>2) + (srclen>>3))) {
|
2015-03-22 18:06:04 +00:00
|
|
|
actx->ten_pct.btype = TYPE_TEXT;
|
|
|
|
} else {
|
|
|
|
actx->ten_pct.btype = TYPE_BINARY;
|
2014-11-06 16:53:33 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
if (tag1 > tag2 - 4 && tag1 < tag2 + 4 && tag3 > (double)tag1 * 0.40 &&
|
2015-03-22 18:06:04 +00:00
|
|
|
tagcnt > (double)spc * 0.06) {
|
|
|
|
actx->thirty_pct.btype |= TYPE_MARKUP;
|
|
|
|
actx->fifty_pct.btype |= TYPE_MARKUP;
|
|
|
|
actx->ten_pct.btype |= TYPE_MARKUP;
|
2014-11-06 16:53:33 +00:00
|
|
|
}
|
|
|
|
}
|
2014-09-20 07:19:00 +00:00
|
|
|
|
|
|
|
int
|
2014-11-06 16:53:33 +00:00
|
|
|
analyze_buffer_simple(void *src, uint64_t srclen)
|
2014-09-20 07:19:00 +00:00
|
|
|
{
|
|
|
|
uchar_t *src1 = (uchar_t *)src;
|
2014-09-20 16:19:06 +00:00
|
|
|
uint64_t i, tot8b, lbytes;
|
|
|
|
uchar_t cur_byte;
|
|
|
|
int btype = TYPE_UNKNOWN;
|
|
|
|
/*
|
|
|
|
* Count number of 8-bit binary bytes in source
|
|
|
|
*/
|
|
|
|
tot8b = 0;
|
|
|
|
lbytes = 0;
|
|
|
|
for (i = 0; i < srclen; i++) {
|
|
|
|
cur_byte = src1[i];
|
|
|
|
tot8b += (cur_byte & 0x80); // This way for possible auto-vectorization
|
|
|
|
lbytes += (cur_byte < 32);
|
|
|
|
}
|
|
|
|
/*
|
|
|
|
* Heuristics for detecting BINARY vs generic TEXT
|
|
|
|
*/
|
|
|
|
tot8b /= 0x80;
|
2015-01-11 12:06:46 +00:00
|
|
|
if (tot8b <= TEN_PCT((double)srclen) && lbytes < ((srclen>>1) + (srclen>>2) + (srclen>>3))) {
|
2014-09-20 16:19:06 +00:00
|
|
|
btype = TYPE_TEXT;
|
2014-09-20 07:19:00 +00:00
|
|
|
}
|
|
|
|
return (btype);
|
|
|
|
}
|
2014-11-06 16:53:33 +00:00
|
|
|
|