Add file type detection based on magic values.

Add more comments.
Add more extensions.
This commit is contained in:
Moinak Ghosh 2013-11-07 23:57:15 +05:30
parent 991482403b
commit b7facc929e
5 changed files with 103 additions and 12 deletions

View file

@ -42,6 +42,7 @@
#include <utils.h> #include <utils.h>
#include <pthread.h> #include <pthread.h>
#include <sys/mman.h> #include <sys/mman.h>
#include <ctype.h>
#include <archive.h> #include <archive.h>
#include <archive_entry.h> #include <archive_entry.h>
#include "pc_archive.h" #include "pc_archive.h"
@ -102,6 +103,7 @@ static struct arc_list_state {
pthread_mutex_t nftw_mutex = PTHREAD_MUTEX_INITIALIZER; pthread_mutex_t nftw_mutex = PTHREAD_MUTEX_INITIALIZER;
static int detect_type_by_ext(char *path, int pathlen); static int detect_type_by_ext(char *path, int pathlen);
static int detect_type_by_data(uchar_t *buf, size_t len);
/* /*
* Archive writer callback routines for archive creation operation. * Archive writer callback routines for archive creation operation.
@ -847,8 +849,8 @@ copy_file_data(pc_ctx_t *pctx, struct archive *arc,
src = mapbuf; src = mapbuf;
wlen = len; wlen = len;
/* if (typ == TYPE_UNKNOWN) if (typ == TYPE_UNKNOWN)
pctx->ctype = detect_type_by_data(src, len);*/ pctx->ctype = detect_type_by_data(src, len);
/* /*
* Write the entire mmap-ed buffer. Since we are writing to the compressor * Write the entire mmap-ed buffer. Since we are writing to the compressor
@ -1043,6 +1045,9 @@ extractor_thread_func(void *dat) {
flags |= ARCHIVE_EXTRACT_SECURE_NODOTDOT; flags |= ARCHIVE_EXTRACT_SECURE_NODOTDOT;
flags |= ARCHIVE_EXTRACT_SPARSE; flags |= ARCHIVE_EXTRACT_SPARSE;
/*
* Extract all security attributes if we are root.
*/
if (pctx->force_archive_perms || geteuid() == 0) { if (pctx->force_archive_perms || geteuid() == 0) {
flags |= ARCHIVE_EXTRACT_OWNER; flags |= ARCHIVE_EXTRACT_OWNER;
flags |= ARCHIVE_EXTRACT_PERM; flags |= ARCHIVE_EXTRACT_PERM;
@ -1148,6 +1153,11 @@ start_extractor(pc_ctx_t *pctx) {
return (pthread_create(&(pctx->archive_thread), NULL, extractor_thread_func, (void *)pctx)); return (pthread_create(&(pctx->archive_thread), NULL, extractor_thread_func, (void *)pctx));
} }
/*
 * Initialize the hash table of known extensions and their types. Bob Jenkins'
 * minimal perfect hash generator is used to obtain a perfect hash function for
 * the set of known extensions. See: http://burtleburtle.net/bob/hash/perfect.html
 */
int int
init_archive_mod() { init_archive_mod() {
int rv = 0; int rv = 0;
@ -1162,6 +1172,12 @@ init_archive_mod() {
uint64_t extnum; uint64_t extnum;
ub4 slot = phash(extlist[i].ext, extlist[i].len); ub4 slot = phash(extlist[i].ext, extlist[i].len);
extnum = 0; extnum = 0;
/*
* Since extensions are less than 8 bytes (or truncated otherwise),
* each extension string is packed into a 64-bit integer for quick
* comparison.
*/
for (j = 0; j < extlist[i].len; j++) for (j = 0; j < extlist[i].len; j++)
extnum = (extnum << 1) | extlist[i].ext[j]; extnum = (extnum << 1) | extlist[i].ext[j];
exthtab[slot].extnum = extnum; exthtab[slot].extnum = extnum;
@ -1176,6 +1192,11 @@ init_archive_mod() {
return (rv); return (rv);
} }
/*
* Identify file type based on extension. Lookup is fast as we have a perfect hash function.
* If the given extension maps to a slot which has a different extension or maps to a slot
* outside the hash table range then the function returns unknown type.
*/
static int static int
detect_type_by_ext(char *path, int pathlen) detect_type_by_ext(char *path, int pathlen)
{ {
@ -1185,17 +1206,53 @@ detect_type_by_ext(char *path, int pathlen)
uint64_t extnum; uint64_t extnum;
for (i = pathlen-1; i > 0 && path[i] != '.' && path[i] != PATHSEP_CHAR; i--); for (i = pathlen-1; i > 0 && path[i] != '.' && path[i] != PATHSEP_CHAR; i--);
if (i == 0 || path[i] != '.') goto out; if (i == 0 || path[i] != '.') goto out; // If extension not found give up
len = pathlen - i - 1; len = pathlen - i - 1;
if (len == 0) goto out; if (len == 0) goto out; // If extension is empty give up
ext = &path[i+1]; ext = &path[i+1];
slot = phash(ext, len); slot = phash(ext, len);
if (slot > NUM_EXT) goto out; if (slot > NUM_EXT) goto out; // Extension maps outside hash table range, give up
extnum = 0; extnum = 0;
/*
* Pack given extension into 64-bit integer.
*/
for (i = 0; i < len; i++) for (i = 0; i < len; i++)
extnum = (extnum << 1) | ext[i]; extnum = (extnum << 1) | tolower(ext[i]);
if (exthtab[slot].extnum == extnum) if (exthtab[slot].extnum == extnum)
return (exthtab[slot].type); return (exthtab[slot].type);
out: out:
return (TYPE_UNKNOWN); return (TYPE_UNKNOWN);
} }
/* 0x7fELF packed into 32-bit integer. */
#define ELFSHORT (0x7f454c46U)
/* TZif packed into 32-bit integer. */
#define TZSHORT (0x545a6966U)
/* PPMZ packed into 32-bit integer. */
#define PPMSHORT (0x50504d5aU)
/*
 * Detect a few file types by looking at magic signatures at the start of the data.
 *
 * buf - start of the file data to inspect.
 * len - number of bytes available in buf.
 *
 * Returns TYPE_EXE, TYPE_BINARY or TYPE_COMPRESSED when a known signature is
 * recognized and TYPE_UNKNOWN otherwise (including when fewer than 16 bytes
 * are available).
 */
static int
detect_type_by_data(uchar_t *buf, size_t len)
{
	uint32_t magic;

	// At least a few bytes.
	if (len < 16) return (TYPE_UNKNOWN);

	/*
	 * ELFSHORT, TZSHORT and PPMSHORT hold the first four bytes of the file
	 * with the first byte most significant. Assemble the value byte by byte
	 * so that the comparison gives the same answer regardless of host byte
	 * order and does not require buf to be suitably aligned for a 32-bit load.
	 */
	magic = ((uint32_t)buf[0] << 24) | ((uint32_t)buf[1] << 16) |
	    ((uint32_t)buf[2] << 8) | (uint32_t)buf[3];

	if (magic == ELFSHORT)
		return (TYPE_EXE); // Regular ELF

	if ((buf[0] == 'M' || buf[0] == 'L') && buf[1] == 'Z')
		return (TYPE_EXE); // MSDOS Exe

	// Single-byte check (0xe9 is a near-jump opcode); a weak heuristic.
	if (buf[0] == 0xe9)
		return (TYPE_EXE); // MSDOS COM

	if (magic == TZSHORT)
		return (TYPE_BINARY); // Timezone data

	if (magic == PPMSHORT)
		return (TYPE_COMPRESSED); // PPM Compressed archive

	return (TYPE_UNKNOWN);
}

View file

@ -45,6 +45,7 @@ struct ext_entry {
{"m4" , TYPE_TEXT, 2}, {"m4" , TYPE_TEXT, 2},
{"vb" , TYPE_TEXT, 2}, {"vb" , TYPE_TEXT, 2},
{"xslt" , TYPE_TEXT, 4}, {"xslt" , TYPE_TEXT, 4},
{"xsl" , TYPE_TEXT, 3},
{"yacc" , TYPE_TEXT, 4}, {"yacc" , TYPE_TEXT, 4},
{"lex" , TYPE_TEXT, 3}, {"lex" , TYPE_TEXT, 3},
{"csv" , TYPE_TEXT, 3}, {"csv" , TYPE_TEXT, 3},
@ -67,6 +68,16 @@ struct ext_entry {
{"bib" , TYPE_TEXT, 3}, {"bib" , TYPE_TEXT, 3},
{"lua" , TYPE_TEXT, 3}, {"lua" , TYPE_TEXT, 3},
{"qml" , TYPE_TEXT, 3}, {"qml" , TYPE_TEXT, 3},
{"fa" , TYPE_TEXT, 2},
{"tcc" , TYPE_TEXT, 3},
{"css" , TYPE_TEXT, 3},
{"pod" , TYPE_TEXT, 3},
{"al" , TYPE_TEXT, 2},
{"vim" , TYPE_TEXT, 3},
{"am" , TYPE_TEXT, 2},
{"upp" , TYPE_TEXT, 3},
{"mom" , TYPE_TEXT, 3},
{"tmac" , TYPE_TEXT, 4},
{"exe" , TYPE_EXE, 3}, {"exe" , TYPE_EXE, 3},
{"dll" , TYPE_EXE, 3}, {"dll" , TYPE_EXE, 3},
{"bin" , TYPE_EXE, 3}, {"bin" , TYPE_EXE, 3},
@ -109,7 +120,13 @@ struct ext_entry {
{"uha" , TYPE_COMPRESSED, 3}, {"uha" , TYPE_COMPRESSED, 3},
{"alz" , TYPE_COMPRESSED, 3}, {"alz" , TYPE_COMPRESSED, 3},
{"ace" , TYPE_COMPRESSED, 3}, {"ace" , TYPE_COMPRESSED, 3},
{"rar" , TYPE_COMPRESSED, 3},
{"xz" , TYPE_COMPRESSED, 2},
{"xcf" , TYPE_BINARY, 3}, {"xcf" , TYPE_BINARY, 3},
{"mo" , TYPE_BINARY, 2},
{"bmp" , TYPE_BINARY, 3},
{"pyo" , TYPE_BINARY, 3},
{"pyc" , TYPE_BINARY, 3},
}; };
#define NUM_EXT (99) #define NUM_EXT (116)
#endif #endif

View file

@ -32,6 +32,7 @@ java,TYPE_TEXT
m4,TYPE_TEXT m4,TYPE_TEXT
vb,TYPE_TEXT vb,TYPE_TEXT
xslt,TYPE_TEXT xslt,TYPE_TEXT
xsl,TYPE_TEXT
yacc,TYPE_TEXT yacc,TYPE_TEXT
lex,TYPE_TEXT lex,TYPE_TEXT
csv,TYPE_TEXT csv,TYPE_TEXT
@ -54,6 +55,16 @@ ps,TYPE_TEXT
bib,TYPE_TEXT bib,TYPE_TEXT
lua,TYPE_TEXT lua,TYPE_TEXT
qml,TYPE_TEXT qml,TYPE_TEXT
fa,TYPE_TEXT
tcc,TYPE_TEXT
css,TYPE_TEXT
pod,TYPE_TEXT
al,TYPE_TEXT
vim,TYPE_TEXT
am,TYPE_TEXT
upp,TYPE_TEXT
mom,TYPE_TEXT
tmac,TYPE_TEXT
exe,TYPE_EXE exe,TYPE_EXE
dll,TYPE_EXE dll,TYPE_EXE
bin,TYPE_EXE bin,TYPE_EXE
@ -96,4 +107,10 @@ dmg,TYPE_COMPRESSED
uha,TYPE_COMPRESSED uha,TYPE_COMPRESSED
alz,TYPE_COMPRESSED alz,TYPE_COMPRESSED
ace,TYPE_COMPRESSED ace,TYPE_COMPRESSED
rar,TYPE_COMPRESSED
xz,TYPE_COMPRESSED
xcf,TYPE_BINARY xcf,TYPE_BINARY
mo,TYPE_BINARY
bmp,TYPE_BINARY
pyo,TYPE_BINARY
pyc,TYPE_BINARY

View file

@ -12,10 +12,10 @@
/* small adjustments to _a_ to make values distinct */ /* small adjustments to _a_ to make values distinct */
ub1 tab[] = { ub1 tab[] = {
20,70,0,4,61,76,0,119,0,0,16,4,10,1,61,76, 10,76,0,76,70,42,0,1,0,0,119,1,61,1,70,79,
61,0,0,16,1,61,0,76,0,123,32,70,28,34,119,51, 0,0,0,4,70,1,0,122,0,119,47,76,76,34,110,101,
0,76,4,122,70,0,0,43,0,106,20,83,0,0,28,66, 0,76,70,70,42,28,0,66,0,108,0,109,28,4,28,4,
79,0,1,47,79,122,0,0,71,75,85,26,0,103,0,76, 70,0,1,20,4,123,123,0,79,75,34,76,69,77,0,69,
}; };
/* The hash function */ /* The hash function */

View file

@ -8,7 +8,7 @@
extern ub1 tab[]; extern ub1 tab[];
#define PHASHLEN 0x40 /* length of hash mapping table */ #define PHASHLEN 0x40 /* length of hash mapping table */
#define PHASHNKEYS 99 /* How many keys were hashed */ #define PHASHNKEYS 116 /* How many keys were hashed */
#define PHASHRANGE 128 /* Range any input might map to */ #define PHASHRANGE 128 /* Range any input might map to */
#define PHASHSALT 0x9e3779b9 /* internal, initialize normal hash */ #define PHASHSALT 0x9e3779b9 /* internal, initialize normal hash */