diff --git a/oletools/ftguess.py b/oletools/ftguess.py index f22f416f..647d2560 100644 --- a/oletools/ftguess.py +++ b/oletools/ftguess.py @@ -15,9 +15,9 @@ # http://fileformats.archiveteam.org # https://www.nationalarchives.gov.uk/PRONOM/Default.aspx -#=== LICENSE ================================================================= +# === LICENSE ================================================================= -# ftguess is copyright (c) 2018-2024, Philippe Lagadec (http://www.decalage.info) +# ftguess is copyright (c) 2018-2026, Philippe Lagadec (http://www.decalage.info) # All rights reserved. # # Redistribution and use in source and binary forms, with or without modification, @@ -42,12 +42,13 @@ from __future__ import print_function -#------------------------------------------------------------------------------ +# ------------------------------------------------------------------------------ # CHANGELOG: # 2018-07-04 v0.54 PL: - first version # 2021-05-09 v0.60 PL: - +# 2025-01-18 v0.61 PL: - added strict mode -__version__ = '0.60.2' +__version__ = '0.61.dev2' # ------------------------------------------------------------------------------ # TODO: @@ -62,6 +63,12 @@ import olefile import logging import optparse +import inspect + +try: + import magika +except ImportError: + magika = None # import lxml or ElementTree for XML parsing: try: @@ -77,8 +84,8 @@ import elementtree.cElementTree as ET except ImportError: raise ImportError("lxml or ElementTree are not installed, " \ - + "see http://codespeak.net/lxml " \ - + "or http://effbot.org/zone/element-index.htm") + + "see http://codespeak.net/lxml " \ + + "or http://effbot.org/zone/element-index.htm") # IMPORTANT: it should be possible to run oletools directly as scripts # in any directory without installing them with pip or setup.py. 
@@ -95,6 +102,7 @@ from oletools.common import clsid from oletools.thirdparty.xglob import xglob + # === LOGGING ================================================================= class NullHandler(logging.Handler): @@ -104,10 +112,12 @@ class NullHandler(logging.Handler): Python 2.7 has logging.NullHandler, but this is necessary for 2.6: see https://docs.python.org/2.6/library/logging.html#configuring-logging-for-a-library """ + def emit(self, record): pass -def get_logger(name, level=logging.CRITICAL+1): + +def get_logger(name, level=logging.CRITICAL + 1): """ Create a suitable logger object for this module. The goal is not to change settings of the root logger, to avoid getting @@ -119,7 +129,7 @@ def get_logger(name, level=logging.CRITICAL+1): # First, test if there is already a logger with the same name, else it # will generate duplicate messages (due to duplicate handlers): if name in logging.Logger.manager.loggerDict: - #NOTE: another less intrusive but more "hackish" solution would be to + # NOTE: another less intrusive but more "hackish" solution would be to # use getLogger then test if its effective level is not default. logger = logging.getLogger(name) # make sure level is OK: @@ -133,9 +143,11 @@ def get_logger(name, level=logging.CRITICAL+1): logger.setLevel(level) return logger + # a global logger object used for debugging: log = get_logger('ftguess') + def enable_logging(): """ Enable logging for this module (disabled by default). 
@@ -144,6 +156,7 @@ def enable_logging(): """ log.setLevel(logging.NOTSET) + # === CONSTANTS =============================================================== # file types for FileTypeGuesser: @@ -184,13 +197,15 @@ class FTYPE(object): MHTML = 'MHTML' TEXT = 'TEXT' EXE_PE = 'EXE_PE' - GENERIC_OLE = 'OLE' # Generic OLE file - GENERIC_XML = 'XML' # Generic XML file - GENERIC_OPENXML = 'OpenXML' # Generic OpenXML file + GENERIC_OLE = 'OLE' # Generic OLE file + GENERIC_XML = 'XML' # Generic XML file + GENERIC_OPENXML = 'OpenXML' # Generic OpenXML file UNKNOWN = 'Unknown File Type' MSI = "MSI" ONENOTE = "OneNote" PNG = 'PNG' + JAR = 'JAR' + BAT = 'BAT' class CONTAINER(object): """ @@ -207,6 +222,9 @@ class CONTAINER(object): UNKNOWN = 'Unknown Container' ONENOTE = 'OneNote' PNG = 'PNG' + PDF = 'PDF' + JAR = 'JAR' + class APP(object): """ @@ -221,9 +239,13 @@ class APP(object): MSOFFICE = 'MS Office' # when the exact app is unknown MSONENOTE = 'MS OneNote' ZIP_ARCHIVER = 'Any Zip Archiver' + PDF_VIEWER = 'Any PDF viewer (Adobe Reader, Acrobat, Firefox, Chrome, etc)' + JVM = 'Java Virtual Machine' WINDOWS = 'Windows' # for Windows executables and XPS + CMD = "Windows cmd.exe" UNKNOWN = 'Unknown Application' + # FTYPE_NAME = { # FTYPE_ZIP: 'Zip archive', # FTYPE_WORD97: 'MS Word 97-2000 Document', @@ -247,7 +269,6 @@ class APP(object): TAG_CTYPES_DEFAULT = NS_CONTENT_TYPES + 'Default' TAG_CTYPES_OVERRIDE = NS_CONTENT_TYPES + 'Override' - # Namespaces and tags for Word/PowerPoint 2007+ XML parsing: # root: NS_XMLPACKAGE = '{http://schemas.microsoft.com/office/2006/xmlPackage}' @@ -261,11 +282,9 @@ class APP(object): TAG_PKGBINDATA = NS_XMLPACKAGE + 'binaryData' - - # === CLASSES ================================================================ -class FType_Base (object): +class FType_Base(object): container = CONTAINER.UNKNOWN application = APP.UNKNOWN filetype = FTYPE.UNKNOWN @@ -287,9 +306,11 @@ def recognize(cls, ftg): """ return False + class FType_Unknown(FType_Base): 
pass + class FType_RTF(FType_Base): container = CONTAINER.RTF application = APP.MSWORD @@ -320,8 +341,8 @@ def recognize(cls, ftg): # see https://github.com/decalage2/olefile/issues/142 # Workaround: pad data when it's smaller than 1536 bytes # TODO: use the new data parameter of isOleFile when it's implemented - if len(ftg.data)<1536: - data = ftg.data + (b'\x00'*1536) + if len(ftg.data) < 1536: + data = ftg.data + (b'\x00' * 1536) else: data = ftg.data if olefile.isOleFile(data): @@ -349,7 +370,10 @@ class FType_OLE_CLSID_Base(FType_Generic_OLE): @classmethod def recognize(cls, ftg): - # TODO: refactor, this is not used anymore + # First make sure the olefile is parsed: + if ftg.olefile is None and not FType_Generic_OLE.recognize(ftg): + return False + # The following checks are used by children ftype classes: if ftg.root_clsid is not None: # First, attempt to identify the root storage CLSID: if ftg.root_clsid in cls.CLSIDS: @@ -361,6 +385,7 @@ def recognize(cls, ftg): # TODO: check if a Word doc is OK without a clsid return False + class FType_Generic_Zip(FType_Base): container = CONTAINER.ZIP application = APP.ZIP_ARCHIVER @@ -395,6 +420,10 @@ class FType_Generic_OpenXML(FType_Base): @classmethod def recognize(cls, ftg): log.debug('Open XML - recognize') + # First make sure this is a zip file, and set ftg.zipfile: + if ftg.zipfile is None: + if not FType_Generic_Zip.recognize(ftg): + return False # TODO: move most of this code to ooxml.py # TODO: here it can be either forward or backward slash... 
try: @@ -452,13 +481,13 @@ def recognize(cls, ftg): elem_ctypes = ET.fromstring(content_types) ctypes_ext = {} ctypes_part = {} - for elem_ext in elem_ctypes.iter(tag = TAG_CTYPES_DEFAULT): + for elem_ext in elem_ctypes.iter(tag=TAG_CTYPES_DEFAULT): extension = elem_ext.get('Extension') content_type = elem_ext.get('ContentType') # print('Ext: %s => Content-type: %s' % (extension, content_type)) if extension is not None and content_type is not None: ctypes_ext[extension] = content_type - for elem_part in elem_ctypes.iter(tag = TAG_CTYPES_OVERRIDE): + for elem_part in elem_ctypes.iter(tag=TAG_CTYPES_OVERRIDE): partname = elem_part.get('PartName') # remove leading slash if present partname = partname.lstrip('/') @@ -488,6 +517,7 @@ class FType_Word(FType_Base): name = 'MS Word (generic)' longname = 'MS Word Document or Template (generic)' + class FType_Word97(FType_OLE_CLSID_Base, FType_Word): application = APP.MSWORD filetype = FTYPE.WORD97 @@ -501,6 +531,7 @@ class FType_Word97(FType_OLE_CLSID_Base, FType_Word): may_contain_ole = True # TODO: if no CLSID, check stream 'WordDocument' + class FType_Word6(FType_OLE_CLSID_Base, FType_Word): application = APP.MSWORD filetype = FTYPE.WORD6 @@ -512,6 +543,7 @@ class FType_Word6(FType_OLE_CLSID_Base, FType_Word): PUID = 'fmt/39' may_contain_ole = True + class FType_Word2007_Base(FType_Generic_OpenXML, FType_Word): application = APP.MSWORD name = 'MS Word 2007+ File' @@ -525,6 +557,7 @@ class FType_Word2007(FType_Word2007_Base): longname = 'MS Word 2007+ Document (.docx)' extensions = ['docx'] + class FType_Word2007_Macro(FType_Word2007_Base): application = APP.MSWORD filetype = FTYPE.WORD2007_DOCM @@ -532,6 +565,7 @@ class FType_Word2007_Macro(FType_Word2007_Base): longname = 'MS Word 2007+ Macro-Enabled Document (.docm)' extensions = ['docm'] + class FType_Word2007_Template(FType_Word2007_Base): application = APP.MSWORD filetype = FTYPE.WORD2007_DOTX @@ -539,6 +573,7 @@ class FType_Word2007_Template(FType_Word2007_Base): 
longname = 'MS Word 2007+ Template (.dotx)' extensions = ['dotx'] + class FType_Word2007_Template_Macro(FType_Word2007_Base): application = APP.MSWORD filetype = FTYPE.WORD2007_DOTM @@ -546,6 +581,7 @@ class FType_Word2007_Template_Macro(FType_Word2007_Base): longname = 'MS Word 2007+ Macro-Enabled Template (.dotm)' extensions = ['dotm'] + # --- EXCEL Formats --- class FType_Excel(FType_Base): @@ -554,6 +590,7 @@ class FType_Excel(FType_Base): name = 'MS Excel (generic)' longname = 'MS Excel Workbook/Template/Add-in (generic)' + class FType_Excel97(FType_Excel, FType_Generic_OLE): filetype = FTYPE.EXCEL97 name = 'MS Excel 97 Workbook' @@ -562,6 +599,7 @@ class FType_Excel97(FType_Excel, FType_Generic_OLE): extensions = ['xls', 'xlt', 'xla'] # TODO: if no CLSID, check stream 'Workbook' or 'Book' (maybe Excel 5) + class FType_Excel5(FType_Excel, FType_Generic_OLE): filetype = FTYPE.EXCEL5 name = 'MS Excel 5.0/95 Workbook' @@ -570,21 +608,24 @@ class FType_Excel5(FType_Excel, FType_Generic_OLE): extensions = ['xls', 'xlt', 'xla'] # TODO: this CLSID is also used in Excel addins (.xla) saved by MS Excel 365 + class FType_Excel2007(FType_Excel, FType_Generic_OpenXML): '''Base class for all MS Excel 2007 file types''' name = 'MS Excel 2007+ (generic)' longname = 'MS Excel 2007+ Workbook or Template (generic)' content_types = ('application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',) - # note: content type differs only for xlsm + # note: content type differs only for xlsm + -class FType_Excel2007_XLSX (FType_Excel2007): +class FType_Excel2007_XLSX(FType_Excel2007): filetype = FTYPE.EXCEL2007_XLSX name = 'MS Excel 2007+ Workbook' longname = 'MS Excel 2007+ Workbook (.xlsx)' extensions = ['xlsx'] PUID = 'fmt/214' -class FType_Excel2007_XLSM (FType_Excel2007): + +class FType_Excel2007_XLSM(FType_Excel2007): filetype = FTYPE.EXCEL2007_XLSM name = 'MS Excel 2007+ Macro-Enabled Workbook' longname = 'MS Excel 2007+ Macro-Enabled Workbook (.xlsm)' @@ -592,7 +633,8 @@ 
class FType_Excel2007_XLSM (FType_Excel2007): content_types = ('application/vnd.ms-excel.sheet.macroEnabled.12',) PUID = 'fmt/445' -class FType_Excel2007_XLSB (FType_Excel2007): + +class FType_Excel2007_XLSB(FType_Excel2007): filetype = FTYPE.EXCEL2007_XLSB name = 'MS Excel 2007+ Binary Workbook' longname = 'MS Excel 2007+ Binary Workbook (.xlsb)' @@ -600,24 +642,28 @@ class FType_Excel2007_XLSB (FType_Excel2007): content_types = ('application/vnd.ms-excel.sheet.binary.macroEnabled.12',) PUID = 'fmt/595' + class FType_Excel2007_Template(FType_Excel2007): filetype = FTYPE.EXCEL2007_XLTX name = 'MS Excel 2007+ Template' longname = 'MS Excel 2007+ Template (.xltx)' extensions = ['xltx'] + class FType_Excel2007_Template_Macro(FType_Excel2007): filetype = FTYPE.EXCEL2007_XLTM name = 'MS Excel 2007+ Macro-Enabled Template' longname = 'MS Excel 2007+ Macro-Enabled Template (.xltm)' extensions = ['xltm'] + class FType_Excel2007_Addin_Macro(FType_Excel2007): filetype = FTYPE.EXCEL2007_XLAM name = 'MS Excel 2007+ Macro-Enabled Add-in' longname = 'MS Excel 2007+ Macro-Enabled Add-in (.xlam)' extensions = ['xlam'] + # --- POWERPOINT Formats --- class FType_Powerpoint(FType_Base): @@ -626,6 +672,7 @@ class FType_Powerpoint(FType_Base): name = 'MS Powerpoint (generic)' longname = 'MS Powerpoint Presentation/Slideshow/Template/Addin/... 
(generic)' + class FType_Powerpoint97(FType_Powerpoint, FType_Generic_OLE): # see also: ppt_record_parser.is_ppt filetype = FTYPE.POWERPOINT97 @@ -634,6 +681,7 @@ class FType_Powerpoint97(FType_Powerpoint, FType_Generic_OLE): CLSIDS = ('64818D10-4F9B-11CF-86EA-00AA00B929E8',) extensions = ['ppt', 'pps', 'pot'] + class FType_Powerpoint2007(FType_Powerpoint, FType_Generic_OpenXML): '''Base class for all MS Powerpoint 2007 file types''' filetype = FTYPE.POWERPOINT2007 @@ -641,6 +689,7 @@ class FType_Powerpoint2007(FType_Powerpoint, FType_Generic_OpenXML): longname = 'MS Powerpoint 2007+ Presentation/Slideshow/Template (generic)' content_types = ('application/vnd.openxmlformats-officedocument.presentationml.presentation',) + class FType_Powerpoint2007_Presentation(FType_Powerpoint2007): filetype = FTYPE.POWERPOINT2007_PPTX name = 'MSPowerpoint 2007+ Presentation' @@ -648,6 +697,7 @@ class FType_Powerpoint2007_Presentation(FType_Powerpoint2007): content_types = ('application/vnd.openxmlformats-officedocument.presentationml.presentation',) extensions = ['pptx'] + class FType_Powerpoint2007_Slideshow(FType_Powerpoint2007): filetype = FTYPE.POWERPOINT2007_PPSX name = 'MSPowerpoint 2007+ Slideshow' @@ -655,6 +705,7 @@ class FType_Powerpoint2007_Slideshow(FType_Powerpoint2007): content_types = ('application/vnd.openxmlformats-officedocument.presentationml.slideshow',) extensions = ['ppsx'] + class FType_Powerpoint2007_Macro(FType_Powerpoint2007): filetype = FTYPE.POWERPOINT2007_PPTM name = 'MSPowerpoint 2007+ Macro-Enabled Presentation' @@ -662,6 +713,7 @@ class FType_Powerpoint2007_Macro(FType_Powerpoint2007): content_types = ('application/vnd.ms-powerpoint.presentation.macroEnabled.12',) extensions = ['pptm'] + class FType_Powerpoint2007_Slideshow_Macro(FType_Powerpoint2007): filetype = FTYPE.POWERPOINT2007_PPSM name = 'MSPowerpoint 2007+ Macro-Enabled Slideshow' @@ -696,6 +748,7 @@ class FType_OneNote(FType_Base): extensions = ['one'] content_types = 
('application/msonenote',) PUID = 'fmt/637' + # ref: https://learn.microsoft.com/en-us/openspecs/office_file_formats/ms-onestore/ae670cd2-4b38-4b24-82d1-87cfb2cc3725 # PRONOM: https://www.nationalarchives.gov.uk/PRONOM/Format/proFormatSearch.aspx?status=detailReport&id=1437 @@ -703,7 +756,8 @@ class FType_OneNote(FType_Base): def recognize(cls, ftg): # ref about Header with OneNote GUID: # https://learn.microsoft.com/en-us/openspecs/office_file_formats/ms-onestore/2b394c6b-8788-441f-b631-da1583d772fd - return True if ftg.data.startswith(b'\xE4\x52\x5C\x7B\x8C\xD8\xA7\x4D\xAE\xB1\x53\x78\xD0\x29\x96\xD3') else False + return True if ftg.data.startswith( + b'\xE4\x52\x5C\x7B\x8C\xD8\xA7\x4D\xAE\xB1\x53\x78\xD0\x29\x96\xD3') else False class FType_PNG(FType_Base): @@ -714,7 +768,8 @@ class FType_PNG(FType_Base): longname = 'Portable Network Graphics picture (.png)' extensions = ['png'] content_types = ('image/png',) - PUID = 'fmt/13' # This is for PNG 1.2. PNG 1.1 is fmt/12, 1.0 is fmt/11 + PUID = 'fmt/13' # This is for PNG 1.2. 
PNG 1.1 is fmt/12, 1.0 is fmt/11 + # ref: http://fileformats.archiveteam.org/wiki/PNG # PRONOM: https://www.nationalarchives.gov.uk/PRONOM/Format/proFormatSearch.aspx?status=detailReport&id=666 @@ -723,6 +778,133 @@ def recognize(cls, ftg): return True if ftg.data.startswith(b'\x89\x50\x4E\x47\x0D\x0A\x1A\x0A') else False +class FType_MHT(FType_Base): + """ + MHTML file format, which can be produced by a web browser, MS Word, Excel or OneNote + """ + # TODO: this class is generic, it would be better to distinguish MHT produced by different apps (check MIME headers) + container = CONTAINER.MIME + application = APP.UNKNOWN + filetype = FTYPE.MHTML + name = 'MHTML' + longname = 'MHTML - MIME Encapsulation of Aggregate Documents, such as HTML (.mht)' + # TODO: MHT can work with .doc extension, need to check other cases (xls, docx, etc) + extensions = ['mht', 'mhtml', 'doc', 'xls'] + content_types = ('multipart/related', 'message/rfc822') + PUID = 'x-fmt/429' + + # ref: http://justsolve.archiveteam.org/wiki/MHTML + # PRONOM: https://www.nationalarchives.gov.uk/PRONOM/x-fmt/429 + + @classmethod + def recognize(cls, ftg): + log.debug('FType_MHT.recognize') + data_lowercase = ftg.data.lower() + # code borrowed from olevba, not sure it's fully accurate + if (b'mime' in data_lowercase and + b'version' in data_lowercase and + b'multipart' in data_lowercase and + abs(data_lowercase.index(b'version') - data_lowercase.index(b'mime')) < 20): + return True + else: + return False + + +class FType_BAT(FType_Base): + """ + BAT/CMD file format, aka Windows batch files + """ + container = CONTAINER.UNKNOWN + application = APP.CMD + filetype = FTYPE.BAT + name = 'BAT/CMD' + longname = 'BAT/CMD - Windows batch files (.bat, .cmd)' + extensions = ['bat', 'cmd'] + content_types = ('text/x-msdos-batch', 'text/plain') # TODO + PUID = 'x-fmt/TODO' # TODO + + # ref: http://justsolve.archiveteam.org/wiki/MHTML (TODO: copied from FType_MHT, replace with a batch file reference) + # PRONOM: https://www.nationalarchives.gov.uk/PRONOM/x-fmt/429 (TODO: this is the MHTML entry, replace with the batch file PUID) + + @classmethod + def
recognize(cls, ftg): + log.debug('FType_BAT.recognize') + res = ftg.get_magika_results() + if res is not None: # if magika is installed + log.debug(repr(res)) + if res.status == 'ok': + if res.output.label == 'batch': + return True + return False + +def get_ftype_class(ftype, _name, _extensions, _content_types, label): + class FType_new(FType_Base): + container = CONTAINER.UNKNOWN + application = APP.UNKNOWN + filetype = ftype + name = _name + longname = name + extensions = _extensions + content_types = _content_types + PUID = 'x-fmt/TODO' # TODO + + @classmethod + def recognize(cls, ftg): + res = ftg.get_magika_results() + if res is not None: # if magika is installed + log.debug(repr(res)) + if res.status == 'ok': + if res.output.label == label: + return True + return False + + return FType_new + +# FType_HTML = get_ftype_class("HTML", "HTML", ("html", "htm"), ('text/html',), "html") + +class FType_PDF(FType_Base): + """ + PDF file format + """ + container = CONTAINER.PDF + application = APP.PDF_VIEWER + filetype = FTYPE.PDF + name = 'PDF' + longname = 'PDF - Portable Document Format (.pdf)' + # TODO: check if other extensions are supported + extensions = ['pdf'] + content_types = ('application/pdf',) + # TODO: in fact PDF has lots of PUID codes, one for each PDF version + # see http://fileformats.archiveteam.org/wiki/PDF#Identifiers + PUID = 'fmt/276' # This is only for PDF 1.7 + + # ref: http://fileformats.archiveteam.org/wiki/PDF + # PRONOM: https://www.nationalarchives.gov.uk/PRONOM/fmt/276 + + @classmethod + def recognize(cls, ftg): + log.debug('FType_PDF.recognize') + # Look for the "%PDF-" header, which can be in the first 1030 bytes: + if (b'%PDF-' in ftg.data[:1030]): + return True + else: + return False + # TODO: we could also look for other keywords present in all PDF files to be more accurate + # - %%EOF, startxref, trailer are NOT mandatory (see minimal PDF files) + # - obj/endobj should always be present, also "<<" and ">>" + + +class 
FType_JAR(FType_Generic_Zip): + container = CONTAINER.JAR + application = APP.JVM + filetype = FTYPE.JAR + name = 'Java Archive' + longname = 'Java Archive (.jar)' + extensions = ['jar'] + # TODO: for now we just rely on the generic zip recognize(), but we need to check specific files in JAR + # TODO: see https://github.com/CybercentreCanada/assemblyline-base/blob/3c2509e2618fb5d55827f6c80ca7f3419cda4de7/assemblyline/common/identify.py#L466 + + # TODO: for PPT, check for stream 'PowerPoint Document' # TODO: for Visio, check for stream 'VisioDocument' @@ -756,10 +938,12 @@ def recognize(cls, ftg): 'application/vnd.ms-excel.template.macroEnabled.main+xml': FType_Excel2007_Template_Macro, 'application/vnd.ms-excel.addin.macroEnabled.main+xml': FType_Excel2007_Addin_Macro, # POWERPOINT - 'application/vnd.openxmlformats-officedocument.presentationml.presentation.main+xml': FType_Powerpoint2007_Presentation, #PPTX - 'application/vnd.openxmlformats-officedocument.presentationml.slideshow.main+xml': FType_Powerpoint2007_Slideshow, #PPSX - 'application/vnd.ms-powerpoint.presentation.macroEnabled.main+xml': FType_Powerpoint2007_Macro, #PPTM - 'application/vnd.ms-powerpoint.slideshow.macroEnabled.main+xml': FType_Powerpoint2007_Slideshow_Macro, #PPSM + 'application/vnd.openxmlformats-officedocument.presentationml.presentation.main+xml': FType_Powerpoint2007_Presentation, + # PPTX + 'application/vnd.openxmlformats-officedocument.presentationml.slideshow.main+xml': FType_Powerpoint2007_Slideshow, + # PPSX + 'application/vnd.ms-powerpoint.presentation.macroEnabled.main+xml': FType_Powerpoint2007_Macro, # PPTM + 'application/vnd.ms-powerpoint.slideshow.macroEnabled.main+xml': FType_Powerpoint2007_Slideshow_Macro, # PPSM # TODO: add missing PowerPoint formats: # PPAM – PowerPoint Add-in Open Office XML File Format. Mime type is application/vnd.ms-powerpoint.addin.macroEnabled.12. 
@@ -768,17 +952,18 @@ def recognize(cls, ftg): # XPS 'application/vnd.ms-package.xps-fixeddocumentsequence+xml': FType_XPS, - #TODO: Add MSIX + # TODO: Add MSIX } -class FType_EXE_PE (FType_Base): +class FType_EXE_PE(FType_Base): filetype = FTYPE.EXE_PE container = CONTAINER.BINARY application = APP.WINDOWS name = "Windows PE Executable or DLL" longname = "Windows Portable Executable or DLL (EXE,DLL)" - extensions = ('exe', 'dll', 'sys', 'scr') # TODO: add more from https://en.wikipedia.org/wiki/Portable_Executable + extensions = ('exe', 'dll', 'sys', 'scr', 'com') # TODO: add more from https://en.wikipedia.org/wiki/Portable_Executable + # Note: on Windows 64 bits COM files cannot run, but EXE files renamed as COM are allowed to run (cf. Wikipedia) content_types = ('application/vnd.microsoft.portable-executable',) PUID = 'fmt/899' @@ -787,19 +972,55 @@ def recognize(cls, ftg): return True if ftg.data.startswith(b'MZ') else False # TODO: make this more accurate by checking the PE header, e.g. using pefile or directly + +# List of FType_* classes in this module +ftype_classes = [obj for name, obj in inspect.getmembers(sys.modules[__name__], inspect.isclass) if + issubclass(obj, FType_Base)] + +# Map each file extension to matching FType classes +# TODO: need to add a priority to sort the order of ftype classes +# For example for the .doc extension, MHT should be checked last, after Word, RTF, etc +EXT_FTYPE = {} +for ftype in ftype_classes: + for extension in ftype.extensions: + if extension in EXT_FTYPE: + EXT_FTYPE[extension].append(ftype) + else: + EXT_FTYPE[extension] = [ftype] + # print(f"ext {extension} => {ftype.name}") + + +# for extension in EXT_FTYPE: +# ftypes = EXT_FTYPE[extension] +# ftype_names = [ftype.name for ftype in ftypes] +# print(f".{extension}: {','.join(ftype_names)}") + + class FileTypeGuesser(object): """ - A class to guess the type of a file, focused on MS Office, RTF and ZIP. + A class to guess the type of a file. 
""" - def __init__(self, filepath=None, data=None): + def __init__(self, filepath=None, data=None, strict_mode=False): + """ + FileTypeGuesser constructor + :param filepath: (optional) path of the file to be opened from disk, or original filename if data is provided + :param data: (optional) bytes string containing the full file content + :param strict_mode: if True, the file identification algorithm is based on extension+content. Otherwise, content only. + """ + # TODO: add the possibility to get data from a file-like object + # TODO: add the possibility to supply the original filename, if filepath is a different name self.filepath = filepath + self.extension = '' self.data = data self.container = None self.application = None self.filetype = None self.ftype = FType_Unknown # FType class self.data_bytesio = None + self.magika_results = None # Cached Magika results + self.magika = None # Magika object (reused several times) + self.method = None # detection methodology # For OLE: self.olefile = None self.root_clsid = None @@ -817,10 +1038,53 @@ def __init__(self, filepath=None, data=None): if filepath is None and data is None: raise ValueError('FileTypeGuesser requires either a file path or file data, or both') if data is None: + # TODO: in general we should not read the whole file in RAM, to support large files with open(filepath, 'rb') as f: self.data = f.read() self.data_bytesio = io.BytesIO(self.data) - + if self.filepath is not None: + _, extension = os.path.splitext(self.filepath) + # remove dot if present (.exe => exe) + if extension.startswith('.'): + extension = extension[1:] + self.extension = extension.lower() + if strict_mode: + self.guess_ftype_strict() + else: + self.guess_ftype_by_content() + + def guess_ftype_strict(self): + log.debug('guess_ftype_strict') + log.info("STEP 1: Checking the extension if it matches known file formats") + # if the file has no extension, attempt to guess file type: + if self.extension == '': + log.info("The file has no 
extension, guessing file type by content only") + self.guess_ftype_by_content() + return + # if the extension is unknown or not supported, also attempt to guess file type: + if self.extension not in EXT_FTYPE: + log.info("The file extension is unknown or not supported, guessing file type by content only") + self.guess_ftype_by_content() + return + # check all ftypes matching the extension: + log.info(f"STEP 1: candidate file types matching the file extension: {[ftype.name for ftype in EXT_FTYPE[self.extension]]}") + for ftype in EXT_FTYPE[self.extension]: + log.info(f'STEP 2: Checking content for file type "{ftype.name}" which matches the extension ".{self.extension}"') + if ftype.recognize(self): + self.ftype = ftype + self.container = self.ftype.container + self.filetype = self.ftype.filetype + self.application = self.ftype.application + self.method = "strict mode (file extension and content)" + return + # TODO: handle case when several ftypes are recognized! + # if none of the ftypes could recognize the file structure, attempt to guess: + log.debug("None of the file types matching the extension were recognized, guessing file type by content only") + self.guess_ftype_by_content() + + def guess_ftype_by_content(self): + log.debug('guess_ftype_by_content') + log.info("STEP 3: Guessing file type by checking the content only") # Identify the main container type: for ftype in (FType_RTF, FType_Generic_OLE, FType_Generic_Zip, FType_OneNote, FType_PNG): if ftype.recognize(self): @@ -832,6 +1096,7 @@ def __init__(self, filepath=None, data=None): # OLE file types: if self.container == CONTAINER.OLE: + log.debug('OLE container') # for ftype in (FType_Word97, FType_Word6, FType_Excel97, FType_Excel5): # if ftype.recognize(self): # self.ftype = ftype @@ -842,16 +1107,52 @@ def __init__(self, filepath=None, data=None): # OpenXML file types: if self.container == CONTAINER.ZIP: + log.debug('ZIP container') if FType_Generic_OpenXML.recognize(self): self.ftype = 
FType_Generic_OpenXML ft = openxml_ftypes.get(self.main_part_content_type, None) if ft is not None: self.ftype = ft - # TODO: use a mapping from magic to file types + # TODO: use a mapping from magic to file types? if self.container == CONTAINER.UNKNOWN: - if FType_EXE_PE.recognize(self): - self.ftype = FType_EXE_PE + log.debug('Unknown container, checking other formats') + # NOTE: the order of file types matters! + # Check PDF, MHT towards the end because false positives are likely + for ftype in (FType_EXE_PE, FType_PDF, FType_MHT, FType_BAT): + if ftype.recognize(self): + self.ftype = ftype + break + + # Fallback to magika: create FType class on the fly + if self.ftype == None or self.ftype == FType_Unknown: + if magika is not None: + log.info("STEP 4: Fallback to magika") + res = self.get_magika_results() + # TODO: did it find a match? + log.debug(res) + ftype = res.output.label + # Check if Magika has recognized the format: + if ftype != "unknown": + name = res.output.label + longname = res.output.description + ext = res.output.extensions + mimetypes = (res.output.mime_type,) + self.ftype = get_ftype_class(ftype, name, ext, mimetypes, ftype) + if self.extension in ext: + # TODO: this result has higher confidence than content only + self.method = "Magika on file content only + matching extension" + else: + self.method = "Magika on file content only" + else: + self.method = "File format not recognized" + else: + self.method = "File format not recognized" + else: + self.method = "File content only" + + if self.ftype.extensions and self.extension not in self.ftype.extensions: + log.warning("The file extension does not match the file content") self.container = self.ftype.container self.filetype = self.ftype.filetype @@ -872,6 +1173,17 @@ def close(self): if self.zipfile is not None: self.zipfile.close() + def get_magika_results(self): + """ + Get results from Magika. Magika is only launched at first call. 
+ If Magika is not available, return None + """ + # only use magika if installed + if self.magika_results == None and magika is not None: + self.magika = magika.Magika() + self.magika_results = self.magika.identify_stream(self.data_bytesio) + return self.magika_results + def is_ole(self): """ Shortcut to check if the container is OLE @@ -910,12 +1222,48 @@ def is_powerpoint(self): # === FUNCTIONS ============================================================== -def ftype_guess(filepath=None, data=None): - return FileTypeGuesser(filepath, data) - -def process_file(container, filename, data): +def ftype_guess(filepath=None, data=None, strict_mode=False): + return FileTypeGuesser(filepath, data, strict_mode) + + +# def ftype_strict(filepath=None, data=None, extension=''): +# """ +# Identify the file type using the strict mode: +# first by checking the file extension, and second by matching it to the +# internal file structure. +# :param filepath: str, file path +# :param data: bytes, file content +# :param extension: str, filename extension without dot +# :return: FType class +# """ +# # remove dot at beginning +# # if the file has no extension, attempt to guess file type: +# if extension == '': +# log.debug("The file has no extension, guessing file type by content only") +# return ftype_guess(filepath, data) +# # if the extension is unknown or not supported, also attempt to guess file type: +# if extension not in EXT_FTYPE: +# log.debug("The file extension is unknown or not supported, guessing file type by content only") +# return ftype_guess(filepath, data) +# # check all ftypes matching the extension: +# ftg = FileTypeGuesser(filepath, data) +# for ftype in EXT_FTYPE[extension]: +# if ftype.recognize(ftg): +# return ftype +# # TODO: handle case when several ftypes are recognized! 
+# # if none of the ftypes could recognize the file structure, attempt to guess: +# log.debug("None of the file types matching the extension were recognized, guessing file type by content only") +# return ftype_guess(filepath, data) + + +def process_file(container, filename, data, strict_mode=False): print('File : %s' % filename) - ftg = ftype_guess(filepath=filename, data=data) + _, extension = os.path.splitext(filename) + # remove dot if present (.exe => exe) + if extension.startswith('.'): + extension = extension[1:] + print('File extension: %s' % extension) + ftg = ftype_guess(filepath=filename, data=data, strict_mode=strict_mode) print('File Type : %s' % ftg.ftype.name) print('Description: %s' % ftg.ftype.longname) print('Application: %s' % ftg.ftype.application) @@ -924,47 +1272,50 @@ def process_file(container, filename, data): print('Root CLSID : %s - %s' % (ftg.root_clsid, ftg.root_clsid_name)) print('Content-type(s) : %s' % ','.join(ftg.ftype.content_types)) print('PUID : %s' % ftg.ftype.PUID) + print('File type identification method: %s' % ftg.method) print() -#=== MAIN ================================================================= +# === MAIN ================================================================= def main(): # print banner with version python_version = '%d.%d.%d' % sys.version_info[0:3] - print ('ftguess %s on Python %s - http://decalage.info/python/oletools' % - (__version__, python_version)) - print ('THIS IS WORK IN PROGRESS - Check updates regularly!') - print ('Please report any issue at https://github.com/decalage2/oletools/issues') - print ('') + print('ftguess %s on Python %s - http://decalage.info/python/oletools' % + (__version__, python_version)) + print('THIS IS WORK IN PROGRESS - Check updates regularly!') + print('Please report any issue at https://github.com/decalage2/oletools/issues') + print('') - DEFAULT_LOG_LEVEL = "warning" # Default log level + DEFAULT_LOG_LEVEL = "warning" # Default log level LOG_LEVELS = { - 'debug': 
logging.DEBUG, - 'info': logging.INFO, - 'warning': logging.WARNING, - 'error': logging.ERROR, + 'debug': logging.DEBUG, + 'info': logging.INFO, + 'warning': logging.WARNING, + 'error': logging.ERROR, 'critical': logging.CRITICAL - } + } usage = 'usage: %prog [options] <filename> [filename2 ...]' parser = optparse.OptionParser(usage=usage) # parser.add_option('-c', '--csv', dest='csv', # help='export results to a CSV file') parser.add_option("-r", action="store_true", dest="recursive", - help='find files recursively in subdirectories.') + help='find files recursively in subdirectories.') + parser.add_option("-s", "--strict", action="store_true", dest="strict_mode", + help='Strict mode: file extension is checked first, then matched to file structure') parser.add_option("-z", "--zip", dest='zip_password', type='str', default=None, - help='if the file is a zip archive, open first file from it, using the provided password') + help='if the file is a zip archive, open first file from it, using the provided password') parser.add_option("-f", "--zipfname", dest='zip_fname', type='str', default='*', - help='if the file is a zip archive, file(s) to be opened within the zip. Wildcards * and ? are supported. 
(default:*)') parser.add_option('-l', '--loglevel', dest="loglevel", action="store", default=DEFAULT_LOG_LEVEL, - help="logging level debug/info/warning/error/critical (default=%default)") + help="logging level debug/info/warning/error/critical (default=%default)") (options, args) = parser.parse_args() # Print help if no arguments are passed if len(args) == 0: - print (__doc__) + print(__doc__) parser.print_help() sys.exit() @@ -976,12 +1327,16 @@ def main(): # enable logging in the modules: enable_logging() + # warn if Magika is not installed + if magika is None: + log.warning("Magika is not installed, some file formats will not be recognized") + for container, filename, data in xglob.iter_files(args, recursive=options.recursive, - zip_password=options.zip_password, zip_fname=options.zip_fname): + zip_password=options.zip_password, zip_fname=options.zip_fname): # ignore directory names stored in zip files: if container and filename.endswith('/'): continue - process_file(container, filename, data) + process_file(container, filename, data, options.strict_mode) if __name__ == '__main__': diff --git a/requirements.txt b/requirements.txt index 6ba1b652..b2853d78 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,12 @@ -pyparsing>=2.1.0,<4 +# TODO: for now we avoid pyparsing 3.3+ which generates warnings (see PR #877) +pyparsing>=2.1.0,<3.3 olefile>=0.46 easygui colorclass msoffcrypto-tool; platform_python_implementation!="PyPy" or (python_version>="3" and platform_system!="Windows" and platform_system!="Darwin") -pcodedmp>=1.2.5 \ No newline at end of file +pcodedmp>=1.2.5 + +# Magika is only used by ftguess, to improve accuracy and file formats coverage. 
+# For now it cannot be installed with Python 3.14 due to onnxruntime +# See https://github.com/microsoft/onnxruntime/issues/26309 +magika==1.0.1; python_version < "3.14" and platform_python_implementation!="PyPy" \ No newline at end of file diff --git a/setup.py b/setup.py index e1352c99..0adc30f1 100644 --- a/setup.py +++ b/setup.py @@ -55,7 +55,7 @@ #--- METADATA ----------------------------------------------------------------- name = "oletools" -version = '0.60.3' +version = '0.61.dev1' desc = "Python tools to analyze security characteristics of MS Office and OLE files (also called Structured Storage, Compound File Binary Format or Compound Document File Format), for Malware Analysis and Incident Response #DFIR" long_desc = open('oletools/README.rst').read() author = "Philippe Lagadec" @@ -323,7 +323,8 @@ def main(): test_suite="tests", # scripts=scripts, install_requires=[ - "pyparsing>=2.1.0,<4", # changed from 2.2.0 to 2.1.0 for issue #481 + # TODO: for now we avoid pyparsing 3.3+ which generates warnings (see PR #877) + "pyparsing>=2.1.0,<3.3", # changed from 2.2.0 to 2.1.0 for issue #481 "olefile>=0.46", "easygui", 'colorclass', @@ -331,6 +332,10 @@ def main(): # so we only require it if the platform is not Windows or not PyPy: 'msoffcrypto-tool; platform_python_implementation!="PyPy" or (python_version>="3" and platform_system!="Windows" and platform_system!="Darwin")', 'pcodedmp>=1.2.5', + # magika requires onnxruntime, which is not yet available for Python 3.14: + # TODO: update this when onnxruntime is available for Python 3.14 + # See https://github.com/microsoft/onnxruntime/issues/26309 + 'magika==1.0.1; python_version < "3.14" and platform_python_implementation!="PyPy"', ], extras_require={ # Optional packages - to be installed with pip install -U oletools[full]