From 683c3e48b5704f1e3c293825950588f4a63c62fc Mon Sep 17 00:00:00 2001 From: Moinak Ghosh Date: Wed, 1 Jan 2014 19:44:58 +0530 Subject: [PATCH] Detect some DICOM formats and use BSC for DICOM data. --- adaptive_compress.c | 12 +++++++++--- archive/pc_archive.c | 16 ++++++++++++++++ utils/utils.h | 3 ++- 3 files changed, 27 insertions(+), 4 deletions(-) diff --git a/adaptive_compress.c b/adaptive_compress.c index edbb6d2..b2f4b71 100644 --- a/adaptive_compress.c +++ b/adaptive_compress.c @@ -215,12 +215,15 @@ adapt_deinit(void **data) return (rv); } +/* + * Identify the types that BSC can compress better than others. + */ int is_bsc_type(int btype) { int stype = PC_SUBTYPE(btype); return ((stype == TYPE_MARKUP) | (stype == TYPE_BMP) | (stype == TYPE_DNA_SEQ) | - (stype == TYPE_MP4) | (stype == TYPE_FLAC) | (stype == TYPE_AVI)); + (stype == TYPE_MP4) | (stype == TYPE_FLAC) | (stype == TYPE_AVI) | (stype == TYPE_DICOM)); } int @@ -237,8 +240,8 @@ adapt_compress(void *src, uint64_t srclen, void *dst, double tagcnt, pct_tag; uchar_t cur_byte, prev_byte; /* - * Count number of 8-bit binary bytes and XML tags in source. - */ + * Count number of 8-bit binary bytes and XML tags in source. + */ tot8b = 0; tag1 = 0; tag2 = 0; @@ -255,6 +258,9 @@ adapt_compress(void *src, uint64_t srclen, void *dst, prev_byte = cur_byte; } + /* + * Heuristics for detecting BINARY vs generic TEXT vs XML data. + */ tot8b /= 0x80; tagcnt = tag1 + tag2 + tag3; pct_tag = tagcnt / (double)srclen; diff --git a/archive/pc_archive.c b/archive/pc_archive.c index 01264b1..211a2fc 100644 --- a/archive/pc_archive.c +++ b/archive/pc_archive.c @@ -1493,6 +1493,22 @@ detect_type_by_data(uchar_t *buf, size_t len) return (TYPE_BINARY|TYPE_ARCHIVE_TAR); if (memcmp(buf, "%PDF-", 5) == 0) return (TYPE_BINARY|TYPE_PDF); + + // Try to detect DICOM medical image file. BSC compresses these better. + if (len > 127) { + size_t i; + + // DICOM files should have either DICM or ISO_IR within the first 128 bytes + for (i = 0; i < 128; i++) { + if (buf[i] == 'D') + if (memcmp(&buf[i], "DICM", 4) == 0) + return (TYPE_BINARY|TYPE_DICOM); + if (buf[i] == 'I') + if (memcmp(&buf[i], "ISO_IR ", 7) == 0) + return (TYPE_BINARY|TYPE_DICOM); + } + } + if (U32_P(buf) == ELFINT) { // Regular ELF, check for 32/64-bit, core dump if (*(buf + 16) != 4) { if (*(buf + 4) == 2) { diff --git a/utils/utils.h b/utils/utils.h index 358d07d..17c31d5 100644 --- a/utils/utils.h +++ b/utils/utils.h @@ -274,7 +274,8 @@ typedef enum { TYPE_BMP = 200, TYPE_TIFF = 208, TYPE_PDF = 216, - TYPE_ARCHIVE_TAR = 224 + TYPE_ARCHIVE_TAR = 224, + TYPE_DICOM = 232 } data_type_t; /*