Improve file sorting algorithm.

Add more file extension names.
Fix data type mask size.
This commit is contained in:
Moinak Ghosh 2014-10-27 19:23:03 +05:30
parent cc68550670
commit b7804a0caa
6 changed files with 242 additions and 35 deletions

View file

@ -106,6 +106,7 @@ static struct arc_list_state {
pthread_mutex_t nftw_mutex = PTHREAD_MUTEX_INITIALIZER;
static int detect_type_by_ext(const char *path, int pathlen);
static int detect_type_from_ext(const char *ext, int len);
static int detect_type_by_data(uchar_t *buf, size_t len);
/*
@ -208,7 +209,6 @@ creat_write_callback(struct archive *arc, void *ctx, const void *buf, size_t len
Sem_Wait(&(pctx->write_sem));
tbuf = pctx->arc_buf + pctx->arc_buf_pos;
pctx->arc_writing = 1;
if (remaining > 0)
pctx->btype = pctx->ctype;
}
}
@ -373,6 +373,18 @@ compare_members(const void *a, const void *b) {
int rv, i;
member_entry_t *mem1 = (member_entry_t *)a;
member_entry_t *mem2 = (member_entry_t *)b;
uint64_t sz1, sz2;
/*
* First compare MSB of size. That separates extension and non-extension
* files.
*/
sz1 = mem1->size & 0x8000000000000000;
sz2 = mem2->size & 0x8000000000000000;
if (sz1 > sz2)
return (1);
else if (sz1 < sz2)
return (-1);
rv = 0;
for (i = 0; i < NAMELEN; i++) {
@ -380,9 +392,15 @@ compare_members(const void *a, const void *b) {
if (rv != 0)
return (rv);
}
if (mem1->size > mem2->size)
/*
* Clear high bits of size. They are just flags.
*/
sz1 = mem1->size & 0x7FFFFFFFFFFFFFFF;
sz2 = mem2->size & 0x7FFFFFFFFFFFFFFF;
if (sz1 > sz2)
return (1);
else if (mem1->size < mem2->size)
else if (sz1 < sz2)
return (-1);
return (0);
}
@ -394,6 +412,16 @@ compare_members(const void *a, const void *b) {
static int
compare_members_lt(member_entry_t *mem1, member_entry_t *mem2) {
int rv, i;
uint64_t sz1, sz2;
/*
* First compare MSB of size. That separates extension and non-extension
* files.
*/
sz1 = mem1->size & 0x8000000000000000;
sz2 = mem2->size & 0x8000000000000000;
if (sz1 < sz2)
return (1);
rv = 0;
for (i = 0; i < NAMELEN; i++) {
@ -403,7 +431,13 @@ compare_members_lt(member_entry_t *mem1, member_entry_t *mem2) {
else if (rv > 0)
return (0);
}
if (mem1->size < mem2->size)
/*
* Clear high bits of size. They are just flags.
*/
sz1 = mem1->size & 0x7FFFFFFFFFFFFFFF;
sz2 = mem2->size & 0x7FFFFFFFFFFFFFFF;
if (sz1 < sz2)
return (1);
return (0);
}
@ -662,6 +696,11 @@ add_pathname(const char *fpath, const struct stat *sb,
* If not a directory then we store up to the first 4 chars of
* the extension, if present, or the first 4 chars of the
* filename.
*
* NOTE: To separate files with extensions from those without,
* we set the MSB of the size field to 1 when an extension is
* present and to 0 otherwise. This limits the recorded file
* size to INT64_MAX, which should be more than enough.
*/
for (i = 0; i < NAMELEN; i++) member->name[i] = 0;
@ -670,11 +709,14 @@ add_pathname(const char *fpath, const struct stat *sb,
while (basename[i] != '\0' && i < NAMELEN) {
member->name[i] = basename[i]; i++;
}
// Clear 64-bit MSB
member->size &= 0x7FFFFFFFFFFFFFFF;
} else {
dot++;
while (dot[i] != '\0' && i < NAMELEN) {
member->name[i] = dot[i]; i++;
}
member->size |= 0x8000000000000000;
}
} else {
/*
@ -690,6 +732,11 @@ add_pathname(const char *fpath, const struct stat *sb,
*/
for (i = 0; i < NAMELEN; i++) member->name[i] = 255;
member->size = INT64_MAX - ftwbuf->level;
/*
* Set 64-bit MSB to force directories to be bunched at the end.
*/
member->size |= 0x8000000000000000;
}
}
cont:
@ -1629,22 +1676,17 @@ disable_all_filters()
* outside the hash table range then the function returns unknown type.
*/
static int
detect_type_by_ext(const char *path, int pathlen)
detect_type_from_ext(const char *ext, int len)
{
const char *ext = NULL;
int i;
ub4 slot;
int i, len;
uint64_t extnum;
char extl[8];
uint64_t extnum;
for (i = pathlen-1; i > 0 && path[i] != '.' && path[i] != PATHSEP_CHAR; i--);
if (i == 0 || path[i] != '.') goto out; // If extension not found give up
len = pathlen - i - 1;
if (len == 0 || len > 8) goto out; // If extension is empty give up
ext = &path[i+1];
if (len == 0 || len > 8) goto ret; // If extension is empty give up
for (i = 0; i < len; i++) extl[i] = tolower(ext[i]);
slot = phash(extl, len);
if (slot >= PHASHNKEYS) goto out; // Extension maps outside hash table range, give up
if (slot >= PHASHNKEYS) goto ret; // Extension maps outside hash table range, give up
extnum = 0;
/*
@ -1654,6 +1696,21 @@ detect_type_by_ext(const char *path, int pathlen)
extnum = (extnum << 8) | tolower(ext[i]);
if (exthtab[slot].extnum == extnum)
return (exthtab[slot].type);
ret:
return (TYPE_UNKNOWN);
}
static int
detect_type_by_ext(const char *path, int pathlen)
{
const char *ext = NULL;
int i, len;
for (i = pathlen-1; i > 0 && path[i] != '.' && path[i] != PATHSEP_CHAR; i--);
if (i == 0 || path[i] != '.') goto out; // If extension not found give up
len = pathlen - i - 1;
ext = &path[i+1];
return (detect_type_from_ext(ext, len));
out:
return (TYPE_UNKNOWN);
}
@ -1703,7 +1760,7 @@ static int
detect_type_by_data(uchar_t *buf, size_t len)
{
// At least a few bytes.
if (len < 512) return (TYPE_UNKNOWN);
if (len < 10) return (TYPE_UNKNOWN);
// WAV files.
if (identify_wav_type(buf, len))
@ -1718,10 +1775,10 @@ detect_type_by_data(uchar_t *buf, size_t len)
// Try to detect DICOM medical image file. BSC compresses these better.
if (len > 127) {
size_t i;
int i;
// DICOM files should have either DICM or ISO_IR within the first 128 bytes
for (i = 0; i < 128; i++) {
for (i = 0; i < 128-4; i++) {
if (buf[i] == 'D')
if (memcmp(&buf[i], "DICM", 4) == 0)
return (TYPE_BINARY|TYPE_DICOM);

View file

@ -16,24 +16,37 @@ struct ext_entry {
{"cc" , TYPE_TEXT, 2},
{"cpp" , TYPE_TEXT, 3},
{"c++" , TYPE_TEXT, 3},
{"h++" , TYPE_TEXT, 3},
{"hpp" , TYPE_TEXT, 3},
{"hxx" , TYPE_TEXT, 3},
{"hh" , TYPE_TEXT, 2},
{"txt" , TYPE_TEXT, 3},
{"text" , TYPE_TEXT, 4},
{"html" , TYPE_TEXT|TYPE_MARKUP, 4},
{"htm" , TYPE_TEXT|TYPE_MARKUP, 3},
{"xml" , TYPE_TEXT|TYPE_MARKUP, 3},
{"sgml" , TYPE_TEXT|TYPE_MARKUP, 4},
{"info" , TYPE_TEXT, 4},
{"svg" , TYPE_TEXT, 3},
{"conf" , TYPE_TEXT, 4},
{"cfg" , TYPE_TEXT, 3},
{"py" , TYPE_TEXT, 2},
{"rb" , TYPE_TEXT, 2},
{"ru" , TYPE_TEXT, 2},
{"rbw" , TYPE_TEXT, 3},
{"xpm" , TYPE_TEXT, 3},
{"js" , TYPE_TEXT, 2},
{"jsp" , TYPE_TEXT, 3},
{"pl" , TYPE_TEXT, 2},
{"t" , TYPE_TEXT, 1},
{"tcl" , TYPE_TEXT, 3},
{"sh" , TYPE_TEXT, 2},
{"ksh" , TYPE_TEXT, 3},
{"csh" , TYPE_TEXT, 3},
{"php" , TYPE_TEXT, 3},
{"php3" , TYPE_TEXT, 4},
{"php4" , TYPE_TEXT, 4},
{"php5" , TYPE_TEXT, 4},
{"bat" , TYPE_TEXT, 3},
{"pm" , TYPE_TEXT, 2},
{"r" , TYPE_TEXT, 1},
@ -44,14 +57,19 @@ struct ext_entry {
{"java" , TYPE_TEXT, 4},
{"m4" , TYPE_TEXT, 2},
{"vb" , TYPE_TEXT, 2},
{"vba" , TYPE_TEXT, 3},
{"vbs" , TYPE_TEXT, 3},
{"xslt" , TYPE_TEXT|TYPE_MARKUP, 4},
{"xsl" , TYPE_TEXT|TYPE_MARKUP, 3},
{"xsd" , TYPE_TEXT|TYPE_MARKUP, 3},
{"xs" , TYPE_TEXT, 2},
{"yacc" , TYPE_TEXT, 4},
{"lex" , TYPE_TEXT, 3},
{"csv" , TYPE_TEXT, 3},
{"shtml" , TYPE_TEXT|TYPE_MARKUP, 5},
{"xhtml" , TYPE_TEXT|TYPE_MARKUP, 5},
{"xht" , TYPE_TEXT|TYPE_MARKUP, 3},
{"tpl" , TYPE_TEXT|TYPE_MARKUP, 3},
{"asp" , TYPE_TEXT, 3},
{"aspx" , TYPE_TEXT, 4},
{"rss" , TYPE_TEXT|TYPE_MARKUP, 3},
@ -67,6 +85,7 @@ struct ext_entry {
{"ps" , TYPE_TEXT, 2},
{"bib" , TYPE_TEXT, 3},
{"lua" , TYPE_TEXT, 3},
{"nse" , TYPE_TEXT, 3},
{"dtd" , TYPE_TEXT, 3},
{"qml" , TYPE_TEXT|TYPE_MARKUP, 3},
{"fa" , TYPE_TEXT|TYPE_DNA_SEQ, 2},
@ -105,6 +124,7 @@ struct ext_entry {
{"m4p" , TYPE_BINARY|TYPE_COMPRESSED, 3},
{"ofs" , TYPE_BINARY|TYPE_COMPRESSED, 3},
{"ofr" , TYPE_BINARY|TYPE_COMPRESSED, 3},
{"ogg" , TYPE_BINARY|TYPE_COMPRESSED, 3},
{"flac" , TYPE_BINARY|TYPE_FLAC, 4},
{"avi" , TYPE_BINARY|TYPE_AVI, 3},
{"pac" , TYPE_BINARY|TYPE_COMPRESSED, 3},
@ -136,21 +156,76 @@ struct ext_entry {
{"zpaq" , TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_ZPAQ, 4},
{"xcf" , TYPE_BINARY, 3},
{"mo" , TYPE_BINARY, 2},
{"gmo" , TYPE_BINARY, 3},
{"pyo" , TYPE_BINARY, 3},
{"pyc" , TYPE_BINARY, 3},
{"wav" , TYPE_BINARY|TYPE_WAV, 3},
{"tta" , TYPE_BINARY|TYPE_COMPRESSED|TYPE_AUDIO_COMPRESSED, 3},
{"wv" , TYPE_BINARY|TYPE_COMPRESSED, 2},
{"swf" , TYPE_BINARY, 3},
{"SVGZ" , TYPE_BINARY, 4},
{"ODT" , TYPE_BINARY, 3},
{"3DM" , TYPE_BINARY, 3},
{"svgz" , TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_GZ, 4},
{"odt" , TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_ZIP, 3},
{"3dm" , TYPE_BINARY, 3},
{"chm" , TYPE_BINARY, 3},
{"CHM" , TYPE_BINARY, 3},
{"svn" , TYPE_BINARY, 3},
{"ppm" , TYPE_BINARY|TYPE_PNM, 3},
{"pbm" , TYPE_BINARY|TYPE_PNM, 3},
{"pgm" , TYPE_BINARY|TYPE_PNM, 3},
{"pnm" , TYPE_BINARY|TYPE_PNM, 3},
{"ppn" , TYPE_BINARY|TYPE_COMPRESSED|TYPE_PACKPNM, 3},
{"mk" , TYPE_TEXT, 2},
{"diff" , TYPE_TEXT, 4},
{"po" , TYPE_TEXT, 2},
{"pot" , TYPE_TEXT, 3},
{"in" , TYPE_TEXT, 2},
{"ac" , TYPE_TEXT, 2},
{"guess" , TYPE_TEXT, 5},
{"sub" , TYPE_TEXT, 3},
{"rpath" , TYPE_TEXT, 5},
{"texi" , TYPE_TEXT, 4},
{"valgrind" , TYPE_TEXT, 8},
{"gperf" , TYPE_TEXT, 5},
{"latex" , TYPE_TEXT, 5},
{"f77" , TYPE_TEXT, 3},
{"f90" , TYPE_TEXT, 3},
{"f95" , TYPE_TEXT, 3},
{"groovy" , TYPE_TEXT, 6},
{"ebuild" , TYPE_TEXT, 6},
{"rex" , TYPE_TEXT, 3},
{"rexx" , TYPE_TEXT, 4},
{"scala" , TYPE_TEXT, 5},
{"xaml" , TYPE_TEXT|TYPE_MARKUP, 4},
{"yaml" , TYPE_TEXT|TYPE_MARKUP, 4},
{"tex" , TYPE_TEXT, 3},
{"rebol" , TYPE_TEXT, 5},
{"reb" , TYPE_TEXT, 3},
{"perl" , TYPE_TEXT, 4},
{"pas" , TYPE_TEXT, 3},
{"p6" , TYPE_TEXT, 2},
{"z80" , TYPE_TEXT, 3},
{"scm" , TYPE_TEXT, 3},
{"ss" , TYPE_TEXT, 2},
{"ml" , TYPE_TEXT, 2},
{"ml4" , TYPE_TEXT, 3},
{"mli" , TYPE_TEXT, 3},
{"mm" , TYPE_TEXT, 2},
{"m3" , TYPE_TEXT, 2},
{"lisp" , TYPE_TEXT, 4},
{"kdebuild-1" , TYPE_TEXT, 10},
{"hs" , TYPE_TEXT, 2},
{"gemspec" , TYPE_TEXT, 7},
{"fs" , TYPE_TEXT, 2},
{"coffee" , TYPE_TEXT, 6},
{"e" , TYPE_TEXT, 1},
{"cu" , TYPE_TEXT, 2},
{"awk" , TYPE_TEXT, 3},
{"xls" , TYPE_BINARY, 3},
{"xlw" , TYPE_BINARY, 3},
{"qt" , TYPE_BINARY, 2},
{"charset" , TYPE_TEXT, 7},
{"sed" , TYPE_TEXT, 3},
{"mailmap" , TYPE_TEXT, 7},
{"sin" , TYPE_BINARY, 3},
};
#endif

View file

@ -3,24 +3,37 @@ h,TYPE_TEXT
cc,TYPE_TEXT
cpp,TYPE_TEXT
c++,TYPE_TEXT
h++,TYPE_TEXT
hpp,TYPE_TEXT
hxx,TYPE_TEXT
hh,TYPE_TEXT
txt,TYPE_TEXT
text,TYPE_TEXT
html,TYPE_TEXT|TYPE_MARKUP
htm,TYPE_TEXT|TYPE_MARKUP
xml,TYPE_TEXT|TYPE_MARKUP
sgml,TYPE_TEXT|TYPE_MARKUP
info,TYPE_TEXT
svg,TYPE_TEXT
conf,TYPE_TEXT
cfg,TYPE_TEXT
py,TYPE_TEXT
rb,TYPE_TEXT
ru,TYPE_TEXT
rbw,TYPE_TEXT
xpm,TYPE_TEXT
js,TYPE_TEXT
jsp,TYPE_TEXT
pl,TYPE_TEXT
t,TYPE_TEXT
tcl,TYPE_TEXT
sh,TYPE_TEXT
ksh,TYPE_TEXT
csh,TYPE_TEXT
php,TYPE_TEXT
php3,TYPE_TEXT
php4,TYPE_TEXT
php5,TYPE_TEXT
bat,TYPE_TEXT
pm,TYPE_TEXT
r,TYPE_TEXT
@ -31,14 +44,19 @@ go,TYPE_TEXT
java,TYPE_TEXT
m4,TYPE_TEXT
vb,TYPE_TEXT
vba,TYPE_TEXT
vbs,TYPE_TEXT
xslt,TYPE_TEXT|TYPE_MARKUP
xsl,TYPE_TEXT|TYPE_MARKUP
xsd,TYPE_TEXT|TYPE_MARKUP
xs,TYPE_TEXT
yacc,TYPE_TEXT
lex,TYPE_TEXT
csv,TYPE_TEXT
shtml,TYPE_TEXT|TYPE_MARKUP
xhtml,TYPE_TEXT|TYPE_MARKUP
xht,TYPE_TEXT|TYPE_MARKUP
tpl,TYPE_TEXT|TYPE_MARKUP
asp,TYPE_TEXT
aspx,TYPE_TEXT
rss,TYPE_TEXT|TYPE_MARKUP
@ -54,6 +72,7 @@ s,TYPE_TEXT
ps,TYPE_TEXT
bib,TYPE_TEXT
lua,TYPE_TEXT
nse,TYPE_TEXT
dtd,TYPE_TEXT
qml,TYPE_TEXT|TYPE_MARKUP
@ -95,6 +114,7 @@ m4a,TYPE_BINARY|TYPE_COMPRESSED
m4p,TYPE_BINARY|TYPE_COMPRESSED
ofs,TYPE_BINARY|TYPE_COMPRESSED
ofr,TYPE_BINARY|TYPE_COMPRESSED
ogg,TYPE_BINARY|TYPE_COMPRESSED
flac,TYPE_BINARY|TYPE_FLAC
avi,TYPE_BINARY|TYPE_AVI
pac,TYPE_BINARY|TYPE_COMPRESSED
@ -126,19 +146,74 @@ pmd,TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_PPMD
zpaq,TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_ZPAQ
xcf,TYPE_BINARY
mo,TYPE_BINARY
gmo,TYPE_BINARY
pyo,TYPE_BINARY
pyc,TYPE_BINARY
wav,TYPE_BINARY|TYPE_WAV
tta,TYPE_BINARY|TYPE_COMPRESSED|TYPE_AUDIO_COMPRESSED
wv,TYPE_BINARY|TYPE_COMPRESSED
swf,TYPE_BINARY
SVGZ,TYPE_BINARY,TYPE_COMPRESSED|TYPE_COMPRESSED_GZ
ODT,TYPE_BINARY,TYPE_COMPRESSED|TYPE_COMPRESSED_ZIP
3DM,TYPE_BINARY
svgz,TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_GZ
odt,TYPE_BINARY|TYPE_COMPRESSED|TYPE_COMPRESSED_ZIP
3dm,TYPE_BINARY
chm,TYPE_BINARY
CHM,TYPE_BINARY
svn,TYPE_BINARY
ppm,TYPE_BINARY|TYPE_PNM
pbm,TYPE_BINARY|TYPE_PNM
pgm,TYPE_BINARY|TYPE_PNM
pnm,TYPE_BINARY|TYPE_PNM
ppn,TYPE_BINARY|TYPE_COMPRESSED|TYPE_PACKPNM
mk,TYPE_TEXT
diff,TYPE_TEXT
po,TYPE_TEXT
pot,TYPE_TEXT
in,TYPE_TEXT
ac,TYPE_TEXT
guess,TYPE_TEXT
sub,TYPE_TEXT
rpath,TYPE_TEXT
texi,TYPE_TEXT
valgrind,TYPE_TEXT
gperf,TYPE_TEXT
latex,TYPE_TEXT
f77,TYPE_TEXT
f90,TYPE_TEXT
f95,TYPE_TEXT
groovy,TYPE_TEXT
ebuild,TYPE_TEXT
rex,TYPE_TEXT
rexx,TYPE_TEXT
scala,TYPE_TEXT
xaml,TYPE_TEXT|TYPE_MARKUP
yaml,TYPE_TEXT|TYPE_MARKUP
tex,TYPE_TEXT
rebol,TYPE_TEXT
reb,TYPE_TEXT
perl,TYPE_TEXT
pas,TYPE_TEXT
p6,TYPE_TEXT
z80,TYPE_TEXT
scm,TYPE_TEXT
ss,TYPE_TEXT
ml,TYPE_TEXT
ml4,TYPE_TEXT
mli,TYPE_TEXT
mm,TYPE_TEXT
m3,TYPE_TEXT
lisp,TYPE_TEXT
kdebuild-1,TYPE_TEXT
hs,TYPE_TEXT
gemspec,TYPE_TEXT
fs,TYPE_TEXT
coffee,TYPE_TEXT
e,TYPE_TEXT
cu,TYPE_TEXT
awk,TYPE_TEXT
xls,TYPE_BINARY
xlw,TYPE_BINARY
qt,TYPE_BINARY
charset,TYPE_TEXT
sed,TYPE_TEXT
mailmap,TYPE_TEXT
sin,TYPE_BINARY

View file

@ -12,14 +12,14 @@
/* small adjustments to _a_ to make values distinct */
ub1 tab[] = {
125,0,0,220,85,0,82,87,113,0,0,113,0,0,82,125,
0,0,7,87,0,113,82,0,0,183,0,131,0,7,0,253,
0,0,0,0,85,0,113,0,0,113,125,113,0,7,22,0,
82,0,7,113,125,125,0,0,0,113,113,131,220,0,0,85,
0,87,0,0,113,0,85,183,82,88,7,88,58,113,0,0,
124,0,168,125,0,125,0,116,0,82,125,55,0,22,116,12,
0,125,113,113,0,40,0,0,42,232,0,124,0,92,183,61,
0,0,221,0,0,234,0,0,97,11,0,0,164,91,0,0,
0,0,87,120,113,125,22,125,0,0,0,220,125,0,131,7,
0,0,183,125,82,113,0,131,146,87,125,183,0,7,146,183,
0,0,0,253,183,0,131,113,253,168,0,220,0,7,0,113,
82,0,7,131,145,7,0,0,120,113,0,183,220,183,220,22,
0,183,0,183,113,0,183,120,22,27,125,125,233,124,125,235,
253,131,146,235,15,220,0,235,0,235,212,220,220,220,183,132,
87,125,113,82,220,32,229,235,131,27,0,220,237,113,4,132,
0,0,145,0,148,195,0,253,142,88,66,232,137,135,167,0,
};
/* The hash function */

View file

@ -8,7 +8,7 @@
extern ub1 tab[];
#define PHASHLEN 0x80 /* length of hash mapping table */
#define PHASHNKEYS 141 /* How many keys were hashed */
#define PHASHNKEYS 216 /* How many keys were hashed */
#define PHASHRANGE 256 /* Range any input might map to */
#define PHASHSALT 0x9e3779b9 /* internal, initialize normal hash */

View file

@ -322,10 +322,10 @@ typedef enum {
* | | | |
* .---------------------------------------.
* | | | | | | | | | | |
* Bit 10 Bit 0
* Bit 15 Bit 0
*/
#define PC_TYPE_MASK 0x7
#define PC_SUBTYPE_MASK 0x7f8
#define PC_SUBTYPE_MASK 0xfff8
#define PC_SUBTYPE(x) ((x) & PC_SUBTYPE_MASK)
#define PC_TYPE(x) ((x) & PC_TYPE_MASK)