From b7facc929e6cfc58c57d17611246d7e861bf373c Mon Sep 17 00:00:00 2001 From: Moinak Ghosh Date: Thu, 7 Nov 2013 23:57:15 +0530 Subject: [PATCH] Add file type detection based on magic values. Add more comments. Add more extensions. --- archive/pc_archive.c | 69 ++++++++++++++++++++++++++++++++++---- utils/phash/extensions.h | 19 ++++++++++- utils/phash/extensions.txt | 17 ++++++++++ utils/phash/phash.c | 8 ++--- utils/phash/phash.h | 2 +- 5 files changed, 103 insertions(+), 12 deletions(-) diff --git a/archive/pc_archive.c b/archive/pc_archive.c index 493fbce..ca0d790 100644 --- a/archive/pc_archive.c +++ b/archive/pc_archive.c @@ -42,6 +42,7 @@ #include #include #include +#include #include #include #include "pc_archive.h" @@ -102,6 +103,7 @@ static struct arc_list_state { pthread_mutex_t nftw_mutex = PTHREAD_MUTEX_INITIALIZER; static int detect_type_by_ext(char *path, int pathlen); +static int detect_type_by_data(uchar_t *buf, size_t len); /* * Archive writer callback routines for archive creation operation. @@ -847,8 +849,8 @@ copy_file_data(pc_ctx_t *pctx, struct archive *arc, src = mapbuf; wlen = len; -/* if (typ == TYPE_UNKNOWN) - pctx->ctype = detect_type_by_data(src, len);*/ + if (typ == TYPE_UNKNOWN) + pctx->ctype = detect_type_by_data(src, len); /* * Write the entire mmap-ed buffer. Since we are writing to the compressor @@ -1043,6 +1045,9 @@ extractor_thread_func(void *dat) { flags |= ARCHIVE_EXTRACT_SECURE_NODOTDOT; flags |= ARCHIVE_EXTRACT_SPARSE; + /* + * Extract all security attributes if we are root. + */ if (pctx->force_archive_perms || geteuid() == 0) { flags |= ARCHIVE_EXTRACT_OWNER; flags |= ARCHIVE_EXTRACT_PERM; @@ -1148,6 +1153,11 @@ start_extractor(pc_ctx_t *pctx) { return (pthread_create(&(pctx->archive_thread), NULL, extractor_thread_func, (void *)pctx)); } +/* + * Initialize the hash table of known extensions and types. Bob Jenkins Minimal Perfect Hash + * is used to get a perfect hash function for the set of known extensions. See: + * http://burtleburtle.net/bob/hash/perfect.html + */ int init_archive_mod() { int rv = 0; @@ -1162,6 +1172,12 @@ init_archive_mod() { uint64_t extnum; ub4 slot = phash(extlist[i].ext, extlist[i].len); extnum = 0; + + /* + * Since extensions are less than 8 bytes (or truncated otherwise), + * each extension string is packed into a 64-bit integer for quick + * comparison. + */ for (j = 0; j < extlist[i].len; j++) extnum = (extnum << 1) | extlist[i].ext[j]; exthtab[slot].extnum = extnum; @@ -1176,6 +1192,11 @@ init_archive_mod() { return (rv); } +/* + * Identify file type based on extension. Lookup is fast as we have a perfect hash function. + * If the given extension maps to a slot which has a different extension or maps to a slot + * outside the hash table range then the function returns unknown type. + */ static int detect_type_by_ext(char *path, int pathlen) { @@ -1185,17 +1206,53 @@ detect_type_by_ext(char *path, int pathlen) uint64_t extnum; for (i = pathlen-1; i > 0 && path[i] != '.' && path[i] != PATHSEP_CHAR; i--); - if (i == 0 || path[i] != '.') goto out; + if (i == 0 || path[i] != '.') goto out; // If extension not found give up len = pathlen - i - 1; - if (len == 0) goto out; + if (len == 0) goto out; // If extension is empty give up ext = &path[i+1]; slot = phash(ext, len); - if (slot > NUM_EXT) goto out; + if (slot > NUM_EXT) goto out; // Extension maps outside hash table range, give up extnum = 0; + + /* + * Pack given extension into 64-bit integer. + */ for (i = 0; i < len; i++) - extnum = (extnum << 1) | ext[i]; + extnum = (extnum << 1) | tolower(ext[i]); if (exthtab[slot].extnum == extnum) return (exthtab[slot].type); out: return (TYPE_UNKNOWN); } + +/* 0x7fELF packed into 32-bit integer. */ +#define ELFSHORT (0x7f454c46U) + +/* TZif packed into 32-bit integer. */ +#define TZSHORT (0x545a6966U) + +/* PPMZ packed into 32-bit integer. */ +#define PPMSHORT (0x50504d5aU) + +/* + * Detect a few file types from looking at magic signatures. + */ +static int +detect_type_by_data(uchar_t *buf, size_t len) +{ + // At least a few bytes. + if (len < 16) return (TYPE_UNKNOWN); + + if (U32_P(buf) == ELFSHORT) + return (TYPE_EXE); // Regular ELF + if ((buf[0] == 'M' || buf[0] == 'L') && buf[1] == 'Z') + return (TYPE_EXE); // MSDOS Exe + if (buf[0] == 0xe9) + return (TYPE_EXE); // MSDOS COM + if (U32_P(buf) == TZSHORT) + return (TYPE_BINARY); // Timezone data + if (U32_P(buf) == PPMSHORT) + return (TYPE_COMPRESSED); // PPM Compressed archive + + return (TYPE_UNKNOWN); +} diff --git a/utils/phash/extensions.h b/utils/phash/extensions.h index c3498b3..8797efb 100644 --- a/utils/phash/extensions.h +++ b/utils/phash/extensions.h @@ -45,6 +45,7 @@ struct ext_entry { {"m4" , TYPE_TEXT, 2}, {"vb" , TYPE_TEXT, 2}, {"xslt" , TYPE_TEXT, 4}, + {"xsl" , TYPE_TEXT, 3}, {"yacc" , TYPE_TEXT, 4}, {"lex" , TYPE_TEXT, 3}, {"csv" , TYPE_TEXT, 3}, @@ -67,6 +68,16 @@ struct ext_entry { {"bib" , TYPE_TEXT, 3}, {"lua" , TYPE_TEXT, 3}, {"qml" , TYPE_TEXT, 3}, + {"fa" , TYPE_TEXT, 2}, + {"tcc" , TYPE_TEXT, 3}, + {"css" , TYPE_TEXT, 3}, + {"pod" , TYPE_TEXT, 3}, + {"al" , TYPE_TEXT, 2}, + {"vim" , TYPE_TEXT, 3}, + {"am" , TYPE_TEXT, 2}, + {"upp" , TYPE_TEXT, 3}, + {"mom" , TYPE_TEXT, 3}, + {"tmac" , TYPE_TEXT, 4}, {"exe" , TYPE_EXE, 3}, {"dll" , TYPE_EXE, 3}, {"bin" , TYPE_EXE, 3}, @@ -109,7 +120,13 @@ struct ext_entry { {"uha" , TYPE_COMPRESSED, 3}, {"alz" , TYPE_COMPRESSED, 3}, {"ace" , TYPE_COMPRESSED, 3}, + {"rar" , TYPE_COMPRESSED, 3}, + {"xz" , TYPE_COMPRESSED, 2}, {"xcf" , TYPE_BINARY, 3}, + {"mo" , TYPE_BINARY, 2}, + {"bmp" , TYPE_BINARY, 3}, + {"pyo" , TYPE_BINARY, 3}, + {"pyc" , TYPE_BINARY, 3}, }; -#define NUM_EXT (99) +#define NUM_EXT (116) #endif diff --git a/utils/phash/extensions.txt b/utils/phash/extensions.txt index 838997d..1d410a0 100644 --- a/utils/phash/extensions.txt +++ b/utils/phash/extensions.txt @@ -32,6 +32,7 @@ java,TYPE_TEXT m4,TYPE_TEXT vb,TYPE_TEXT xslt,TYPE_TEXT +xsl,TYPE_TEXT yacc,TYPE_TEXT lex,TYPE_TEXT csv,TYPE_TEXT @@ -54,6 +55,16 @@ ps,TYPE_TEXT bib,TYPE_TEXT lua,TYPE_TEXT qml,TYPE_TEXT +fa,TYPE_TEXT +tcc,TYPE_TEXT +css,TYPE_TEXT +pod,TYPE_TEXT +al,TYPE_TEXT +vim,TYPE_TEXT +am,TYPE_TEXT +upp,TYPE_TEXT +mom,TYPE_TEXT +tmac,TYPE_TEXT exe,TYPE_EXE dll,TYPE_EXE bin,TYPE_EXE @@ -96,4 +107,10 @@ dmg,TYPE_COMPRESSED uha,TYPE_COMPRESSED alz,TYPE_COMPRESSED ace,TYPE_COMPRESSED +rar,TYPE_COMPRESSED +xz,TYPE_COMPRESSED xcf,TYPE_BINARY +mo,TYPE_BINARY +bmp,TYPE_BINARY +pyo,TYPE_BINARY +pyc,TYPE_BINARY diff --git a/utils/phash/phash.c b/utils/phash/phash.c index a4183c3..05ff4de 100644 --- a/utils/phash/phash.c +++ b/utils/phash/phash.c @@ -12,10 +12,10 @@ /* small adjustments to _a_ to make values distinct */ ub1 tab[] = { -20,70,0,4,61,76,0,119,0,0,16,4,10,1,61,76, -61,0,0,16,1,61,0,76,0,123,32,70,28,34,119,51, -0,76,4,122,70,0,0,43,0,106,20,83,0,0,28,66, -79,0,1,47,79,122,0,0,71,75,85,26,0,103,0,76, +10,76,0,76,70,42,0,1,0,0,119,1,61,1,70,79, +0,0,0,4,70,1,0,122,0,119,47,76,76,34,110,101, +0,76,70,70,42,28,0,66,0,108,0,109,28,4,28,4, +70,0,1,20,4,123,123,0,79,75,34,76,69,77,0,69, }; /* The hash function */ diff --git a/utils/phash/phash.h b/utils/phash/phash.h index a9396a6..f833e23 100644 --- a/utils/phash/phash.h +++ b/utils/phash/phash.h @@ -8,7 +8,7 @@ extern ub1 tab[]; #define PHASHLEN 0x40 /* length of hash mapping table */ -#define PHASHNKEYS 99 /* How many keys were hashed */ +#define PHASHNKEYS 116 /* How many keys were hashed */ #define PHASHRANGE 128 /* Range any input might map to */ #define PHASHSALT 0x9e3779b9 /* internal, initialize normal hash */