70 lines
2.1 KiB
C
70 lines
2.1 KiB
C
|
/*
 * This file is a part of Pcompress, a chunked parallel multi-
 * algorithm lossless compression and decompression program.
 *
 * Copyright (C) 2012-2014 Moinak Ghosh. All rights reserved.
 * Use is subject to license terms.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 3 of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this program.
 * If not, see <http://www.gnu.org/licenses/>.
 *
 * moinakg@belenix.org, http://moinakg.wordpress.com/
 */
|
||
|
|
||
|
#include "utils.h"
|
||
|
|
||
|
int
|
||
|
analyze_buffer(void *src, uint64_t srclen, int btype, int adapt_mode)
|
||
|
{
|
||
|
uchar_t *src1 = (uchar_t *)src;
|
||
|
int stype = PC_SUBTYPE(btype);
|
||
|
|
||
|
if (btype == TYPE_UNKNOWN || stype == TYPE_ARCHIVE_TAR) {
|
||
|
uint32_t freq[256], freq0x80[2] = {0};
|
||
|
uint64_t i, alphabetNum = 0, tot8b = 0;
|
||
|
uchar_t cur_byte;
|
||
|
|
||
|
/*
|
||
|
* Count number of 8-bit binary bytes and XML tags in source.
|
||
|
*/
|
||
|
tot8b = 0;
|
||
|
for (i = 0; i < srclen; i++) {
|
||
|
cur_byte = src1[i];
|
||
|
tot8b += (cur_byte & 0x80); // This way for possible auto-vectorization
|
||
|
freq[cur_byte]++;
|
||
|
}
|
||
|
|
||
|
for (i = 0; i < 256; i++)
|
||
|
freq0x80[i>>7]+=freq[i];
|
||
|
|
||
|
for(i = 'a'; i <= 'z'; i++)
|
||
|
alphabetNum+=freq[i];
|
||
|
|
||
|
/*
|
||
|
* Heuristics for detecting BINARY vs generic TEXT
|
||
|
*/
|
||
|
tot8b /= 0x80;
|
||
|
if (tot8b < (srclen>>2 + srclen>>3)) {
|
||
|
btype = TYPE_TEXT;
|
||
|
if (freq0x80[1]<(srclen>>3) && (freq[' ']>(srclen>>7))
|
||
|
&& (freq['a']+freq['e']+freq['t']>(srclen>>4))
|
||
|
&& alphabetNum>(srclen>>2)) {
|
||
|
btype |= TYPE_ENGLISH;
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
return (btype);
|
||
|
}
|