From 6069d4b835f92e5dea365a46c36a9e8daa261730 Mon Sep 17 00:00:00 2001 From: Christian Herdtweck Date: Thu, 29 Sep 2022 15:47:09 +0200 Subject: [PATCH 01/14] Ensure closing olefile in test --- oletools/record_base.py | 35 ++++++++++++++++++++--------------- 1 file changed, 20 insertions(+), 15 deletions(-) diff --git a/oletools/record_base.py b/oletools/record_base.py index 9cf1015b..8688facb 100644 --- a/oletools/record_base.py +++ b/oletools/record_base.py @@ -377,21 +377,26 @@ def do_per_record(record): # pylint: disable=function-redefined if not olefile.isOleFile(filename): logger.info('not an ole file - skip') continue - ole = ole_file_class(filename) - - for stream in ole.iter_streams(): - logger.info(' parse ' + str(stream)) - try: - for record in stream.iter_records(): - logger.info(' ' + str(record)) - do_per_record(record) - except Exception: - if not must_parse: - raise - elif isinstance(stream, must_parse): - raise - else: - logger.info(' failed to parse', exc_info=True) + ole = None + try: + ole = ole_file_class(filename) + + for stream in ole.iter_streams(): + logger.info(' parse ' + str(stream)) + try: + for record in stream.iter_records(): + logger.info(' ' + str(record)) + do_per_record(record) + except Exception: + if not must_parse: + raise + elif isinstance(stream, must_parse): + raise + else: + logger.info(' failed to parse', exc_info=True) + finally: + if ole is not None: + ole.close() log_helper.end_logging() return 0 From 59bd5cff7730f1e5c4bc199f7887c13c702fbec2 Mon Sep 17 00:00:00 2001 From: Christian Herdtweck Date: Thu, 29 Sep 2022 17:13:26 +0200 Subject: [PATCH 02/14] Minor uncorrelated fixes (new IDE complained about these) --- oletools/oleobj.py | 2 +- oletools/record_base.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/oletools/oleobj.py b/oletools/oleobj.py index 9f67752e..eef96b66 100644 --- a/oletools/oleobj.py +++ b/oletools/oleobj.py @@ -578,7 +578,7 @@ def get_sane_embedded_filenames(filename, src_path, 
tmp_path, max_len, # identify suffix. Dangerous suffixes are all short idx = candidate.rfind('.') - if idx is -1: + if idx == -1: candidates_without_suffix.append(candidate) continue elif idx < len(candidate)-5: diff --git a/oletools/record_base.py b/oletools/record_base.py index 8688facb..8cbc4f7e 100644 --- a/oletools/record_base.py +++ b/oletools/record_base.py @@ -257,7 +257,7 @@ def close(self): self.stream.close() def __str__(self): - return '[{0} {1} (type {2}, size {3})' \ + return '[{0} {1} (type {2}, size {3})]' \ .format(self.__class__.__name__, self.name or '[orphan]', ENTRY_TYPE2STR[self.stream_type], From 952bc7cc6593f90242a9007e49e0545e6d01f221 Mon Sep 17 00:00:00 2001 From: Christian Herdtweck Date: Thu, 6 Oct 2022 11:09:08 +0200 Subject: [PATCH 03/14] ppt_record_parser: Add many types, parse CString Parse more record types from [MS-PPT] and some more from [MS-ODRAW], show more info in __str__ for debugging and extending --- oletools/ppt_record_parser.py | 158 ++++++++++++++++++++++++++++++---- 1 file changed, 143 insertions(+), 15 deletions(-) diff --git a/oletools/ppt_record_parser.py b/oletools/ppt_record_parser.py index f8d54eae..faa19eae 100644 --- a/oletools/ppt_record_parser.py +++ b/oletools/ppt_record_parser.py @@ -47,6 +47,7 @@ import logging import io import zlib +import string # IMPORTANT: it should be possible to run oletools directly as scripts # in any directory without installing them with pip or setup.py. @@ -65,30 +66,63 @@ from oletools import record_base -# types of relevant records (there are much more than listed here) +# types of relevant records (there are much more than listed here, c.f. [MS-PPT] 2.13.24) +# and https://learn.microsoft.com/en-us/openspecs/office_file_formats/ms-ppt +# these names are parsed in `record_class_for_type`: only if a record ends in "Container" will we find the +# sub-records contained in it RECORD_TYPES = dict([ - # file structure types + # file structure types, c.f. 
[MS-PPT] 2.3 (0x0ff5, 'UserEditAtom'), (0x0ff6, 'CurrentUserAtom'), # --> use PptRecordCurrentUser instead (0x1772, 'PersistDirectoryAtom'), (0x2f14, 'CryptSession10Container'), - # document types + # document types, c.f. [MS-PPT] 2.4 (0x03e8, 'DocumentContainer'), (0x0fc9, 'HandoutContainer'), (0x03f0, 'NotesContainer'), (0x03ff, 'VbaInfoContainer'), (0x03e9, 'DocumentAtom'), (0x03ea, 'EndDocumentAtom'), - # slide types + (0x0ff0, 'Master/Slide/NotesListWithTextContainer'), + (0x03f3, 'Master/Slide/NotesPersistAtom'), + (0x0fd9, 'Slide/NotesHeadersFootersContainer'), + (0x0fda, 'HeadersFootersAtom'), + (0x07d0, 'DocInfoListContainer'), + # slide types, c.f. [MS-PPT] 2.5 (0x03ee, 'SlideContainer'), (0x03f8, 'MainMasterContainer'), - # external object ty + (0x07f0, 'Slide/SchemeListeElementColorSchemeAtom'), + (0x1388, 'Slide/Doc/ShapeProgTagsContainer'), + (0x138a, 'Slide/Doc/ShapeProgBinaryTagContainer'), + # text types, c.f. [MS-PPT] 2.9 + (0x0fc8, 'Kinsoku[9]Container'), # user preferences for East Asian text line breaking + (0x07d9, 'FontCollectionContainer'), + (0x0fa0, 'TextCharsAtom'), + (0x0fa1, 'StyleTextPropAtom'), + (0x0fa2, 'MasterTextPropAtom'), + (0x0fa3, 'TextMasterStyleAtom'), + (0x0fa4, 'TextCharFormatExceptionAtom'), + (0x0fa5, 'TextParagraphFormatExceptionAtom'), + (0x0fa6, 'TextRulerAtom'), + (0x0fa7, 'TextBookmarkAtom'), + (0x0fa8, 'TextBytesAtom'), + (0x0fa9, 'TextSpecialInfoDefaultAtom'), + (0x0faa, 'TextSpecialInfoAtom'), + (0x0fab, 'DefaultRulerAtom'), + (0x0fac, 'StyleTextProp9Atom'), + (0x0fad, 'TextMasterStyle9Atom'), + (0x07d5, 'FontCollectionContainer'), + (0x0fb7, 'FontEntityAtom'), + # external object types, c.f. 
[MS-PPT] 2.10 (0x0409, 'ExObjListContainer'), (0x1011, 'ExOleVbaActiveXAtom'), # --> use PptRecordExOleVbaActiveXAtom (0x1006, 'ExAviMovieContainer'), (0x100e, 'ExCDAudioContainer'), - (0x0fee, 'ExControlContainer'), + (0x0fee, 'ExControl(ActiveX)Container'), + (0x0ffb, 'ExControl(ActiveX)Atom'), (0x0fd7, 'ExHyperlinkContainer'), + (0x0fd3, 'ExHyperlinkAtom'), + (0x0fe4, 'ExHyperlink9'), (0x1007, 'ExMCIMovieContainer'), (0x100d, 'ExMIDIAudioContainer'), (0x0fcc, 'ExOleEmbedContainer'), @@ -99,20 +133,58 @@ (0x040a, 'ExObjListAtom'), (0x0fcd, 'ExOleEmbedAtom'), (0x0fc3, 'ExOleObjAtom'), # --> use PptRecordExOleObjAtom instead - # other types + # other types from [MS-PPT] (0x0fc1, 'MetafileBlob'), (0x0fb8, 'FontEmbedDataBlob'), (0x07e7, 'SoundDataBlob'), (0x138b, 'BinaryTagDataBlob'), - (0x0fba, 'CString'), + (0x0fba, 'CString'), # --> use PptRecordCString instead + (0x03f2, 'DocumentTextInfoContainer'), + (0x040b, 'DrawingGroupContainer'), + (0x040c, 'DrawingContainer'), + (0x0423, 'RoundTripOArtTextStyles12Atom'), # to extract data from these, could create class ... + (0x0428, 'RoundTripCustomTableStyles12Atom'), # ... 
like PptRecordExOleVbaActiveXAtom + (0x040e, 'RoundTripThemeAtom'), + (0x040f, 'RoundTripColorMappingAtom'), + (0x041c, 'RoundTripOriginalMainMasterId12Atom'), + (0x041e, 'RoundTripContentMasterInfo12Atom'), + (0x0422, 'RoundTripContentMasterId12Atom'), + (0x03ef, 'SlideAtom'), + (0x03ff, 'VBAInfoContainer'), + (0x0400, 'VBAInfoAtom'), + (0x0ff2, 'MouseClick/OverInteractiveInfoContainer'), + (0x0ff3, 'InteractiveInfoAtom'), + # from [MS-ODRAW], https://learn.microsoft.com/en-us/openspecs/office_file_formats/ms-odraw + (0xf000, 'OfficeArtDggContainer'), + (0xf001, 'OfficeArtBStoreContainer'), + (0xf002, 'OfficeArtDgContainer'), + (0xf003, 'OfficeArtSpgrContainer'), + (0xf004, 'OfficeArtSpContainer'), + (0xf005, 'OfficeArtSolverContainer'), + (0xf008, 'OfficeArtFDG'), + (0xf00a, 'OfficeArtFSP'), + (0xf00b, 'OfficeArtFOPT'), + (0xf00d, 'OfficeArtClientTextbox'), + (0xf010, 'OfficeArtClientAnchorChart'), + (0xf011, 'OfficeArtClientData'), + (0xf018, 'OfficeArtFRITContainer'), + (0xf11a, 'OfficeArtColorMRUContainer'), + (0xf11e, 'OfficeArtSplitMenuColorContainer'), + (0xf122, 'OfficeArtTertiaryFOPT'), + (0xf007, 'OfficeArtRecordHeader'), ]) # record types where version is not 0x0 or 0x1 or 0xf VERSION_EXCEPTIONS = dict([ (0x0400, 2), # rt_vbainfoatom + (0x0800, 4), # main item in "Powerpoint Document" stream (0x03ef, 2), # rt_slideatom (0xe9c7, 7), # tests/test-data/encrypted/encrypted.ppt, not investigated + (0xf00a, 2), # MS-ODRAW + (0xf00b, 3), # MS-ODRAW + (0xf122, 3), # MS-ODRAW + (0xf007, 2), # MS-ODRAW ]) @@ -139,6 +211,12 @@ (0x0fb8, (0, 3)), # rt_fontembeddatablob, ]) +#: maximum length of record data to add to __str__ +STR_MAX_CONTENT_LEN = 100 + +#: chars in unknown record data to show in __str__ +STR_PRINTABLE_CHARS = list(ord(ch) for ch in string.digits + string.ascii_letters + string.punctuation + ' ') + def is_ppt(filename): """ determine whether given file is a PowerPoint 2003 (ppt) OLE file @@ -230,28 +308,39 @@ def record_class_for_type(cls, rec_type): 
return PptRecordExOleObjAtom, True elif rec_type == PptRecordExOleVbaActiveXAtom.TYPE: return PptRecordExOleVbaActiveXAtom, True + elif rec_type == PptRecordCString.TYPE: + return PptRecordCString, True + read_all_data = False try: record_name = RECORD_TYPES[rec_type] if record_name.endswith('Container'): is_container = True + read_all_data = True elif record_name.endswith('Atom'): is_container = False + read_all_data = False elif record_name.endswith('Blob'): is_container = False - elif record_name == 'CString': + read_all_data = True + elif record_name == 'OfficeArtClientData': + is_container = True + read_all_data = True + elif record_name.startswith('OfficeArt'): # no "atoms" here is_container = False + read_all_data = False else: logging.warning('Unexpected name for record type "{0}". typo?' .format(record_name)) is_container = False + read_all_data = False if is_container: - return PptContainerRecord, True + return PptContainerRecord, read_all_data else: - return PptRecord, False + return PptRecord, read_all_data except KeyError: - return PptRecord, False + return PptRecord, read_all_data class PptRecord(record_base.OleRecordBase): @@ -296,8 +385,22 @@ def _type_str(self): record_name = RECORD_TYPES[self.type] return '{0} record'.format(record_name) except KeyError: - return '{0} type 0x{1:04x}'.format(self.__class__.__name__, - self.type) + return 'Unknown {0} (type 0x{1:04x})'.format(self.__class__.__name__, + self.type) + + def __str__(self): + """Create string representation. Use super class except for Blobs.""" + try: + if RECORD_TYPES[self.type].endswith('Blob'): + contents = ''.join(chr(ch) if ch in STR_PRINTABLE_CHARS else '.' 
for ch in self.data) + if len(contents) > STR_MAX_CONTENT_LEN: + data_text = contents[:STR_MAX_CONTENT_LEN - 5] + '[...]' + else: + data_text = contents + return '[{0} record of size {1}: "{2}"]'.format(RECORD_TYPES[self.type], self.size, data_text) + except KeyError: # unknown record type --> fall back to super class + pass + return super(PptRecord, self).__str__() class PptContainerRecord(PptRecord): @@ -586,7 +689,7 @@ def close(self): class PptRecordExOleVbaActiveXAtom(PptRecord): - """ record that contains and ole object / vba storage / active x control + """ record that contains an ole object / vba storage / active x control Contains the actual data of the ole object / VBA storage / ActiveX control in compressed or uncompressed form. @@ -682,6 +785,30 @@ def __str__(self): return '{0}, {1}{2}'.format(text[:-2], compr_text, text[-2:]) +class PptRecordCString(PptRecord): + """ + Text used for many atom types + + Examples include: FriendlyNameAtom, TargetAtom, LocationAtom (from ExHyperlinkContainer). + Instance could help determine what actual type this atom is since some use unique instance numbers + """ + TYPE = 0x0fba + VERSION = 0x0 + # instance varies + + def get_string(self): + return self.data.decode('utf16') # apparently, resources are always utf16-encoded. 
But might be wrong + + def __str__(self): + """Print data""" + contents = self.get_string() + if len(contents) > STR_MAX_CONTENT_LEN: + data_text = contents[:STR_MAX_CONTENT_LEN - 5] + '[...]' + else: + data_text = contents + return '[CString record (size {0}): "{1}"]'.format(self.size, data_text) + + ############################################################################### # TESTING ############################################################################### @@ -704,6 +831,7 @@ def print_records(record, print_fn, indent, do_print_record): repr(record.ansi_user_name), repr(record.unicode_user_name), ' ' * indent)) + elif isinstance(record, PptRecordExOleObjAtom): logging.info('{2}--> obj id {0}, persist id ref {1}' .format(record.ex_obj_id, record.persist_id_ref, From 5f5f9778135e4cfe7078eaecf024a22deba1d03d Mon Sep 17 00:00:00 2001 From: Christian Herdtweck Date: Thu, 6 Oct 2022 11:21:17 +0200 Subject: [PATCH 04/14] olevba: Detect interactive record types in ppt Old powerpoint files (.ppt) can contain links to webpages or programs that are neither ActiveX nor VBA nor other tested types. They are saved in regular ppt-specific records and allow powerpoint to start arbitrary commands upon click or hovering over some item. The timeout for "hovering" is pretty fast here, it is very likely that users trigger this without realizing it.
Add detection for these items to olevba --- oletools/olevba.py | 107 +++++++++++++++++++++++++++++++++- oletools/ppt_record_parser.py | 2 +- 2 files changed, 107 insertions(+), 2 deletions(-) diff --git a/oletools/olevba.py b/oletools/olevba.py index 52ffd512..032acc5f 100644 --- a/oletools/olevba.py +++ b/oletools/olevba.py @@ -345,6 +345,8 @@ from oletools.common import codepages from oletools import ftguess from oletools.common.log_helper import log_helper +from oletools.ppt_record_parser import PptFile, PptContainerRecord, RECORD_TYPES, \ + PptRecordCString, PptRecordExOleObjAtom # === PYTHON 2+3 SUPPORT ====================================================== @@ -647,6 +649,8 @@ def __init__(self, stream_path, variable, expected, value): r'\w+_ProgressChange', r'\w+_PropertyChange', r'\w+_SetSecureLockIcon', r'\w+_StatusTextChange', r'\w+_TitleChange', r'\w+_MouseMove', r'\w+_MouseEnter', r'\w+_MouseLeave', r'\w+_Layout', r'\w+_OnConnecting', r'\w+_FollowHyperlink', r'\w+_ContentControlOnEnter'), + 'Runs when the file is opened and Mouse Clicks or Hovers over element': + (r'MouseClick/OverInteractiveInfoContainer',), } # Suspicious Keywords that may be used by malware @@ -903,6 +907,10 @@ def __init__(self, stream_path, variable, expected, value): # (must be bytes for Python 3) re_printable_string = re.compile(b'[\\t\\r\\n\\x20-\\xFF]{5,}') +# ppt record types that contain interactive content (like ActiveX) +# see ppt_record_parser.RECORD_TYPES for meaning of these type constants +PPT_INTERACTIVE_RECORD_TYPES = 0x0fc3, 0x0ff2, 0x0fd7 + # === PARTIAL VBA GRAMMAR ==================================================== @@ -1682,6 +1690,7 @@ def __init__(self, ole, vba_root, project_path, dir_path, relaxed=True): self.dir_stream = dir_stream # reference: MS-VBAL 2.3.4.2 dir Stream: Version Independent Project Information + # This could be integrated with record parsing code in record_base.py # PROJECTSYSKIND Record # Specifies the platform for which the VBA project is 
created. @@ -2703,6 +2712,7 @@ def __init__(self, filename, data=None, container=None, relaxed=True, encoding=D self.vba_forms = None self.contains_vba_macros = None # will be set to True or False by detect_vba_macros self.contains_xlm_macros = None # will be set to True or False by detect_xlm_macros + self.contains_ppt_interactive = None # will be set to True or False by detect_ppt_interactive self.vba_code_all_modules = None # to store the source code of all modules # list of tuples for each module: (subfilename, stream_path, vba_filename, vba_code) self.modules = None @@ -2720,7 +2730,9 @@ def __init__(self, filename, data=None, container=None, relaxed=True, encoding=D #: Encoding for VBA source code and strings returned by all methods self.encoding = encoding self.xlm_macros = [] + self.ppt_interactive = [] self.no_xlm = False + self.no_ppt_interactive = False # nowhere set yet, but include switch for future #: Output from pcodedmp, disassembly of the VBA P-code self.disable_pcode = disable_pcode self.pcodedmp_output = None @@ -3259,13 +3271,20 @@ def detect_macros(self): by calling detect_vba_macros and detect_xlm_macros. (if the no_xlm option is set, XLM macros are not checked) + Also checks ppt files for ActiveX-like record types using self.detect_ppt_interactive + (if self.no_ppt_interactive is not set). 
+ :return: bool, True if at least one VBA project has been found, False otherwise """ vba = self.detect_vba_macros() xlm = False + found_ppt_interactive = False if not self.no_xlm: xlm = self.detect_xlm_macros() - return (vba or xlm) + if not self.no_ppt_interactive: + found_ppt_interactive = self.detect_ppt_interactive() + + return (vba or xlm or found_ppt_interactive) def detect_vba_macros(self): """ @@ -3293,6 +3312,7 @@ def detect_vba_macros(self): for ole_subfile in self.ole_subfiles: log.debug("ole subfile {}".format(ole_subfile)) ole_subfile.no_xlm = self.no_xlm + ole_subfile.no_ppt_interactive = self.no_ppt_interactive if ole_subfile.detect_vba_macros(): self.contains_vba_macros = True return True @@ -3451,6 +3471,59 @@ def _extract_xlm_plugin_biff(self): self.contains_xlm_macros = False return False + def detect_ppt_interactive(self): + """ + Search through record structure of file and find problematic record types. + + Remembers problematic records in `self.ppt_interactive + + :return: True if record types from PPT_INTERACTIVE_RECORD_TYPES were found + """ + # do not search again + if self.contains_ppt_interactive is not None: + return self.contains_ppt_interactive + + if self.type != TYPE_PPT: + self.contains_ppt_interactive = False + return False + + with PptFile(self.filename) as ppt: # this is from ppt_record_parser + for stream in ppt.iter_streams(): + log.debug('Parse records in ' + str(stream)) + for record in stream.iter_records(): + self._detect_ppt_interactive(record, 1, stream.name) + if self.ppt_interactive: + self.contains_ppt_interactive = True + return self.contains_ppt_interactive + + def _detect_ppt_interactive(self, record, indent, stream_name): + """Recursive helper for detect_ppt_interactive.""" + log.debug('{0}{1}'.format(' ' * indent, record)) + if record.type in PPT_INTERACTIVE_RECORD_TYPES: + # add record, avoiding duplicates (which ppt likes to contain) + if isinstance(record, PptRecordExOleObjAtom): + if record.obj_type != 2: # 
not ActiveX + return + texts = set() + if isinstance(record, PptContainerRecord): + for subrec in record.records: + if not isinstance(subrec, PptRecordCString): + continue + texts.add(subrec.get_string().strip().rstrip('/')) + if texts: + text = '{0}: {1}'.format(RECORD_TYPES[record.type], ', '.join(texts)) + else: + text = RECORD_TYPES[record.type] + try: + previous_idx = self.ppt_interactive.index([text, stream_name, False]) + self.ppt_interactive[previous_idx][2] = True # mark as duplicated instead of adding again + except ValueError: # no such index + self.ppt_interactive.append([text, stream_name, False]) + if isinstance(record, PptContainerRecord): + for subrec in record.records: + self._detect_ppt_interactive(subrec, indent+1, stream_name) + # todo: is record contains ole streams, then parse those or add to substreams + def detect_is_encrypted(self): if self.ole_file: self.is_encrypted = crypto.is_encrypted(self.ole_file) @@ -3512,6 +3585,19 @@ def extract_macros(self): for line in self.xlm_macros: vba_code += "' " + line + '\n' yield ('xlm_macro', 'xlm_macro', 'xlm_macro.txt', vba_code) + # ...and interactive components found in PPT files (copy of this bit later in function) + if self.ppt_interactive: + # group by stream + curr_stream = self.ppt_interactive[0][1] + texts = [] + for text, stream_name, _ in self.ppt_interactive: + if stream_name == curr_stream: + texts.append(text) + else: + yield (self.filename, curr_stream, '', '\n'.join(texts)) + texts = [text,] + curr_stream = stream_name + yield (self.filename, curr_stream, '', '\n'.join(texts)) else: # This is an OLE file: self.find_vba_projects() @@ -3574,6 +3660,19 @@ def extract_macros(self): for line in self.xlm_macros: vba_code += "' " + line + '\n' yield ('xlm_macro', 'xlm_macro', 'xlm_macro.txt', vba_code) + # probably never happens here, but just in case (code copied from above): + if self.ppt_interactive: + # group by stream + curr_stream = self.ppt_interactive[0][1] + texts = [] + for text, 
stream_name, _ in self.ppt_interactive: + if stream_name == curr_stream: + texts.append(text) + else: + yield (self.filename, curr_stream, '', '\n'.join(texts)) + texts = [text,] + curr_stream = stream_name + yield (self.filename, curr_stream, '', '\n'.join(texts)) # Analyse the VBA P-code to detect VBA stomping: # If stomping is detected, add a fake VBA module with the P-code as source comments # so that VBA_Scanner can find keywords and IOCs in it @@ -3654,6 +3753,12 @@ def analyze_macros(self, show_decoded_strings=False, deobfuscate=False): description = 'XLM macro found. It may contain malicious code' scanner.suspicious_keywords.append((keyword, description)) scanner.results.append(('Suspicious', keyword, description)) + if self.contains_ppt_interactive: + log.debug('adding PPT interactive found to suspicious keywords') + keyword = 'Interactive Controls' + description = 'Found interactive controls. May execute malicious code' + scanner.suspicious_keywords.append((keyword, description)) + scanner.results.append(('Suspicious', keyword, description)) # TODO: this has been temporarily disabled if self.template_injection_found: log.debug('adding Template Injection to suspicious keywords') diff --git a/oletools/ppt_record_parser.py b/oletools/ppt_record_parser.py index faa19eae..8fe09d21 100644 --- a/oletools/ppt_record_parser.py +++ b/oletools/ppt_record_parser.py @@ -152,7 +152,7 @@ (0x03ef, 'SlideAtom'), (0x03ff, 'VBAInfoContainer'), (0x0400, 'VBAInfoAtom'), - (0x0ff2, 'MouseClick/OverInteractiveInfoContainer'), + (0x0ff2, 'MouseClick/OverInteractiveInfoContainer'), # this is a suspicious keyword in olevba (0x0ff3, 'InteractiveInfoAtom'), # from [MS-ODRAW], https://learn.microsoft.com/en-us/openspecs/office_file_formats/ms-odraw (0xf000, 'OfficeArtDggContainer'), From ab9b34887832976a02495304ad404109593cb4ea Mon Sep 17 00:00:00 2001 From: Christian Herdtweck Date: Thu, 6 Oct 2022 11:26:26 +0200 Subject: [PATCH 05/14] record_base: Provide complete record data for 
testing Expose existing param in record_base.test which is used to help extend and debug record-based streams (currently only ppt_record_parser). --- oletools/record_base.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/oletools/record_base.py b/oletools/record_base.py index 8cbc4f7e..67574cc6 100644 --- a/oletools/record_base.py +++ b/oletools/record_base.py @@ -52,6 +52,7 @@ # (maybe content type or so; identify streams that are never record-based) # Or use oleid to avoid same functionality in several files # - think about integrating this with olefile itself +# - there is quite some record parsing being done in __init__ of olevba.VBA_Project # ----------------------------------------------------------------------------- # REFERENCES: @@ -223,6 +224,8 @@ def iter_records(self, fill_data=False): """ yield all records in this stream Stream must be positioned at start of records (e.g. start of stream). + + :param bool fill_data: Always read (and save in `data`) all of this record's data. """ while True: # unpacking as in olevba._extract_vba @@ -292,7 +295,14 @@ class OleRecordBase(object): SIZE = None def __init__(self, type, size, more_data, pos, data): - """ create a record; more_data is discarded """ + """ + Create a record; more_data is discarded + + Usually called from a stream's `iter_records` and `read_record_head`. The latter defines `more_data`. + `data` contains all of this record and all its sub-records. + + Calls remembers `data` and calls `finish-constructing` with `more_data`. 
+ """ if self.TYPE is not None and type != self.TYPE: raise ValueError('Wrong subclass {0} for type {1}' .format(self.__class__.__name__, type)) @@ -359,7 +369,7 @@ def __str__(self): def test(filenames, ole_file_class=OleRecordFile, - must_parse=None, do_per_record=None, verbose=False): + must_parse=None, do_per_record=None, verbose=False, fill_data=False): """ parse all given file names and print rough structure if an error occurs while parsing a stream of type in must_parse, the error @@ -384,7 +394,7 @@ def do_per_record(record): # pylint: disable=function-redefined for stream in ole.iter_streams(): logger.info(' parse ' + str(stream)) try: - for record in stream.iter_records(): + for record in stream.iter_records(fill_data=fill_data): logger.info(' ' + str(record)) do_per_record(record) except Exception: From 9919410aaf26f613642c30d4bca78592f3502848 Mon Sep 17 00:00:00 2001 From: Christian Herdtweck Date: Thu, 6 Oct 2022 14:24:42 +0200 Subject: [PATCH 06/14] record_base: Improve stability for real-world samples (1) Do not parse all sub-records in a container when constructing it (2) Allow for stray bytes at end of container data --- oletools/olevba.py | 4 +-- oletools/ppt_record_parser.py | 61 ++++++++++++++++++++++++----------- oletools/record_base.py | 12 +++++-- oletools/xls_parser.py | 16 +++++++-- 4 files changed, 68 insertions(+), 25 deletions(-) diff --git a/oletools/olevba.py b/oletools/olevba.py index 032acc5f..907718e0 100644 --- a/oletools/olevba.py +++ b/oletools/olevba.py @@ -3506,7 +3506,7 @@ def _detect_ppt_interactive(self, record, indent, stream_name): return texts = set() if isinstance(record, PptContainerRecord): - for subrec in record.records: + for subrec in record.get_records(): if not isinstance(subrec, PptRecordCString): continue texts.add(subrec.get_string().strip().rstrip('/')) @@ -3520,7 +3520,7 @@ def _detect_ppt_interactive(self, record, indent, stream_name): except ValueError: # no such index self.ppt_interactive.append([text,
stream_name, False]) if isinstance(record, PptContainerRecord): - for subrec in record.records: + for subrec in record.get_records(): self._detect_ppt_interactive(subrec, indent+1, stream_name) # todo: is record contains ole streams, then parse those or add to substreams diff --git a/oletools/ppt_record_parser.py b/oletools/ppt_record_parser.py index 8fe09d21..e86192cb 100644 --- a/oletools/ppt_record_parser.py +++ b/oletools/ppt_record_parser.py @@ -287,6 +287,8 @@ def stream_class_for_name(cls, stream_name): class PptStream(record_base.OleRecordStream): """ a stream of records in a ppt file """ + RECORD_HEADER_SIZE = 8 + def read_record_head(self): """ read first few bytes of record to determine size and type @@ -311,6 +313,8 @@ def record_class_for_type(cls, rec_type): elif rec_type == PptRecordCString.TYPE: return PptRecordCString, True + # flag to tell caller that complete record data must be read and given to constructor. + # important for Containers to they can parse their sub-records read_all_data = False try: record_name = RECORD_TYPES[rec_type] @@ -406,35 +410,56 @@ def __str__(self): class PptContainerRecord(PptRecord): """ A record that contains other records """ + def __init__(self, *args, **kwargs): + super(PptContainerRecord, self).__init__(*args, **kwargs) + self._records = None + def finish_constructing(self, more_data): """ parse records from self.data """ # set self.version and self.instance super(PptContainerRecord, self).finish_constructing(more_data) - self.records = None - if not self.data: - return - # logging.debug('parsing contents of container record {0}' - # .format(self)) + def get_records(self): + """ + Return list of records contained in this container. + + If this is the first call to this function, then read records structure + from self.data, remember those, and forget self.data . All future calls will + just return saved records. + + :return: list of sub-records contained in this container. 
+ """ + # has been called before --> return saved records + if self._records is not None: + return self._records - # create a stream from self.data and parse it like any other - data_stream = io.BytesIO(self.data) - record_stream = PptStream(data_stream, self.size, - 'PptContainerRecordSubstream', - record_base.STGTY_SUBSTREAM) - self.records = list(record_stream.iter_records()) - # logging.debug('done parsing contents of container record {0}' - # .format(self)) + if not self.data: + logging.warning("Constructor of {0} was not given its data. Cannot parse sub-structure" + .format(self)) + self._records = [] + else: + # create a stream from self.data and parse it like any other + #logging.debug('parsing contents of container record {0}'.format(self)) + data_stream = io.BytesIO(self.data) + record_stream = PptStream(data_stream, self.size, + 'PptContainerRecordSubstream', + record_base.STGTY_SUBSTREAM) + self._records = list(record_stream.iter_records()) + # logging.debug('done parsing contents of container record {0}' + # .format(self)) + self.data = None # not needed any more + + return self._records def __str__(self): text = super(PptContainerRecord, self).__str__() - if self.records is None: + if self._records is None: return '{0}, unparsed{1}'.format(text[:-2], text[-2:]) - elif self.records: + elif self._records: return '{0}, contains {1} recs{2}' \ - .format(text[:-2], len(self.records), text[-2:]) + .format(text[:-2], len(self._records), text[-2:]) else: - return text + return '{0} (empty){1}'.format(text[:-2], text[-2:]) class PptRecordCurrentUser(PptRecord): @@ -822,7 +847,7 @@ def print_records(record, print_fn, indent, do_print_record): if do_print_record: print_fn('{0}{1}'.format(' ' * indent, record)) if isinstance(record, PptContainerRecord): - for subrec in record.records: + for subrec in record.get_records(): print_records(subrec, print_fn, indent+1, True) elif isinstance(record, PptRecordCurrentUser): logging.info('{4}--> crypt: {0}, offset {1}, 
user {2}/{3}' diff --git a/oletools/record_base.py b/oletools/record_base.py index 67574cc6..d6adf681 100644 --- a/oletools/record_base.py +++ b/oletools/record_base.py @@ -190,6 +190,8 @@ class OleRecordStream(object): abstract base class """ + RECORD_HEADER_SIZE = None # to be overwritten in subclass; specifies minimum size required for a record header + def __init__(self, stream, size, name, stream_type): self.stream = stream self.size = size @@ -232,6 +234,9 @@ def iter_records(self, fill_data=False): pos = self.stream.tell() if pos >= self.size: break + if self.size - pos < self.RECORD_HEADER_SIZE: + logger.debug("Skip {0} byte".format(self.size - pos)) + break # read first few bytes, determine record type and size rec_type, rec_size, other = self.read_record_head() @@ -298,10 +303,11 @@ def __init__(self, type, size, more_data, pos, data): """ Create a record; more_data is discarded - Usually called from a stream's `iter_records` and `read_record_head`. The latter defines `more_data`. - `data` contains all of this record and all its sub-records. + Usually called from a stream's `iter_records` and `read_record_head`. The latter defines + `more_data`. `data` contains raw data for this record and all its sub-records (if this is + a container). It might be None, caller must ensure that this has the proper contents. - Calls remembers `data` and calls `finish-constructing` with `more_data`. + Remember `data` and calls `finish_constructing` with `more_data`. """ if self.TYPE is not None and type != self.TYPE: raise ValueError('Wrong subclass {0} for type {1}' diff --git a/oletools/xls_parser.py b/oletools/xls_parser.py index 2f0bdad4..6177c0b0 100644 --- a/oletools/xls_parser.py +++ b/oletools/xls_parser.py @@ -143,7 +143,13 @@ def stream_class_for_name(cls, stream_name): class XlsStream(record_base.OleRecordStream): - """ most streams in xls file consist of records """ + """ + Stream of Records that make up an xls file. 
+ + More info in [MS-XLS], https://learn.microsoft.com/en-us/openspecs/office_file_formats/ms-xls + """ + + RECORD_HEADER_SIZE = 4 def read_record_head(self): """ read first few bytes of record to determine size and type @@ -182,11 +188,17 @@ def record_class_for_type(cls, rec_type): class XlsbStream(record_base.OleRecordStream): - """ binary stream of an xlsb file, usually have a record structure """ + """ + Binary stream of an xlsb file, usually have a record structure. + + For further info, see [MS-XLSB], https://learn.microsoft.com/en-us/openspecs/office_file_formats/ms-xlsb + """ HIGH_BIT_MASK = 0b10000000 LOW7_BIT_MASK = 0b01111111 + RECORD_HEADER_SIZE = 2 # can be 2 to 6 bytes, minimum value requested in OleRecordStream comment + def read_record_head(self): """ read first few bytes of record to determine size and type From 325caf52f5c4b1ec0de8d7ba3ab683292afa2f3a Mon Sep 17 00:00:00 2001 From: Christian Herdtweck Date: Fri, 7 Oct 2022 14:52:34 +0200 Subject: [PATCH 07/14] ppt_parser: Relax requirements Had too harsh a requirement for ppt files, that it only contains root streams and no sub-streams. Not sure whether this theoretically should be true, but in any case it is not the case in real-world samples. 
--- oletools/ppt_parser.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/oletools/ppt_parser.py b/oletools/ppt_parser.py index 93b75a4b..8ed9f386 100644 --- a/oletools/ppt_parser.py +++ b/oletools/ppt_parser.py @@ -1195,13 +1195,11 @@ def __init__(self, ole, fast_fail=False): # ['\x05SummaryInformation'], # ['Current User'], # ['PowerPoint Document']] - root_streams = self.ole.listdir() - #for stream in root_streams: + all_streams = self.ole.listdir() # this includes non-root streams + #for stream in all_streams: # log.debug('found root stream {0!r}'.format(stream)) - if any(len(stream) != 1 for stream in root_streams): - self._fail('root', 'listdir', root_streams, 'len = 1') - root_streams = [stream[0].lower() for stream in root_streams] - if not 'current user' in root_streams: + root_streams = [stream[0].lower() for stream in all_streams if len(stream) == 1] + if 'current user' not in root_streams: self._fail('root', 'listdir', root_streams, 'Current User') if not MAIN_STREAM_NAME.lower() in root_streams: self._fail('root', 'listdir', root_streams, MAIN_STREAM_NAME) From d3ba646bbec0369a38604da9d560cc612ecea9aa Mon Sep 17 00:00:00 2001 From: Christian Herdtweck Date: Fri, 7 Oct 2022 15:34:22 +0200 Subject: [PATCH 08/14] ppt_parser: Remove dead code This is left over from my initial attempt at parsing ppt documents. It never worked properly. --- oletools/ppt_parser.py | 174 +---------------------------------------- 1 file changed, 2 insertions(+), 172 deletions(-) diff --git a/oletools/ppt_parser.py b/oletools/ppt_parser.py index 8ed9f386..97155380 100644 --- a/oletools/ppt_parser.py +++ b/oletools/ppt_parser.py @@ -8,7 +8,7 @@ Currently quite narrowly focused on extracting VBA from ppt files, no slides or stuff, but built to be extended to parsing more/all of the file. For better "understanding" of ppt files, see module ppt_record_parser, which will probably -replace this module some time soon. +replace this module some time.
References: * https://msdn.microsoft.com/en-us/library/dd921564%28v=office.12%29.aspx @@ -17,7 +17,7 @@ WARNING! Before thinking about understanding or even extending this module, please keep in mind that module ppt_record_parser has a better "understanding" of the ppt -file structure and will replace this module some time soon! +file structure and will replace this module some time! """ @@ -1238,176 +1238,6 @@ def _fail(self, *args): else: self._log_exception(PptUnexpectedData(*args).msg) - def parse_current_user(self): - """ parse the CurrentUserAtom record from stream 'Current User' - - Structure described in - https://msdn.microsoft.com/en-us/library/dd948895%28v=office.12%29.aspx - """ - - if self.current_user_atom is not None: - log.warning('re-reading and overwriting ' - 'previously read current_user_atom') - - log.debug('parsing "Current User"') - - stream = None - try: - log.debug('opening stream "Current User"') - stream = self.ole.openstream('Current User') - self.current_user_atom = CurrentUserAtom.extract_from(stream) - except Exception: - if self.fast_fail: - raise - else: - self._log_exception() - finally: - if stream is not None: - log.debug('closing stream "Current User"') - stream.close() - - @with_opened_main_stream - def parse_persist_object_directory(self, stream): - """ Part 1: Construct the persist object directory """ - - if self.persist_object_directory is not None: - log.warning('re-reading and overwriting ' - 'previously read persist_object_directory') - - # Step 1: Read the CurrentUserAtom record (section 2.3.2) from the - # Current User Stream (section 2.1.1). All seek operations in the steps - # that follow this step are in the PowerPoint Document Stream. 
- if self.current_user_atom is None: - self.parse_current_user() - - offset = self.current_user_atom.offset_to_current_edit - is_encrypted = self.current_user_atom.is_encrypted() - self.persist_object_directory = {} - self.newest_user_edit = None - - # Repeat steps 3 through 6 until offsetLastEdit is 0x00000000. - while offset != 0: - - # Step 2: Seek, in the PowerPoint Document Stream, to the - # offset specified by the offsetToCurrentEdit field of the - # CurrentUserAtom record identified in step 1. - stream.seek(offset, os.SEEK_SET) - - # Step 3: Read the UserEditAtom record at the current offset. - # Let this record be a live record. - user_edit = UserEditAtom.extract_from(stream, is_encrypted) - if self.newest_user_edit is None: - self.newest_user_edit = user_edit - - log.debug('checking validity') - errs = user_edit.check_validity() - if errs: - log.warning('check_validity found {0} issues' - .format(len(errs))) - for err in errs: - log.warning('UserEditAtom.check_validity: {0}'.format(err)) - if errs and self.fast_fail: - raise errs[0] - - # Step 4: Seek to the offset specified by the - # offsetPersistDirectory field of the UserEditAtom record - # identified in step 3. - log.debug('seeking to pos {0}' - .format(user_edit.offset_persist_directory)) - stream.seek(user_edit.offset_persist_directory, os.SEEK_SET) - - # Step 5: Read the PersistDirectoryAtom record at the current - # offset. Let this record be a live record. 
- persist_dir_atom = PersistDirectoryAtom.extract_from(stream) - - log.debug('checking validity') - errs = persist_dir_atom.check_validity(offset) - if errs: - log.warning('check_validity found {0} issues' - .format(len(errs))) - for err in errs: - log.warning('PersistDirectoryAtom.check_validity: {0}' - .format(err)) - if errs and self.fast_fail: - raise errs[0] - - - # Construct the complete persist object directory for this file - # as follows: - # - For each PersistDirectoryAtom record previously identified - # in step 5, add the persist object identifier and persist - # object stream offset pairs to the persist object directory - # starting with the PersistDirectoryAtom record last - # identified, that is, the one closest to the beginning of the - # stream. - # - Continue adding these pairs to the persist object directory - # for each PersistDirectoryAtom record in the reverse order - # that they were identified in step 5; that is, the pairs from - # the PersistDirectoryAtom record closest to the end of the - # stream are added last. - # - When adding a new pair to the persist object directory, if - # the persist object identifier already exists in the persist - # object directory, the persist object stream offset from the - # new pair replaces the existing persist object stream offset - # for that persist object identifier. - for entry in persist_dir_atom.rg_persist_dir_entry: - last_id = entry.persist_id+len(entry.rg_persist_offset)-1 - log.debug('for persist IDs {0}-{1}, save offsets {2}' - .format(entry.persist_id, last_id, - entry.rg_persist_offset)) - for count, offset in enumerate(entry.rg_persist_offset): - self.persist_object_directory[entry.persist_id+count] \ - = offset - - # check for more - # Step 6: Seek to the offset specified by the offsetLastEdit - # field in the UserEditAtom record identified in step 3. 
- offset = user_edit.offset_last_edit - - @with_opened_main_stream - def parse_document_persist_object(self, stream): - """ Part 2: Identify the document persist object """ - if self.document_persist_obj is not None: - log.warning('re-reading and overwriting ' - 'previously read document_persist_object') - - # Step 1: Read the docPersistIdRef field of the UserEditAtom record - # first identified in step 3 of Part 1, that is, the UserEditAtom - # record closest to the end of the stream. - if self.persist_object_directory is None: - self.parse_persist_object_directory() - - # Step 2: Lookup the value of the docPersistIdRef field in the persist - # object directory constructed in step 8 of Part 1 to find the stream - # offset of a persist object. - newest_ref = self.newest_user_edit.doc_persist_id_ref - offset = self.persist_object_directory[newest_ref] - log.debug('newest user edit ID is {0}, offset is {1}' - .format(newest_ref, offset)) - - # Step 3: Seek to the stream offset specified in step 2. - log.debug('seek to {0}'.format(offset)) - stream.seek(offset, os.SEEK_SET) - - # Step 4: Read the DocumentContainer record at the current offset. - # Let this record be a live record. - self.document_persist_obj = DocumentContainer.extract_from(stream) - - log.debug('checking validity') - errs = self.document_persist_obj.check_validity() - if errs: - log.warning('check_validity found {0} issues'.format(len(errs))) - for err in errs: - log.warning('check_validity(document_persist_obj): {0}' - .format(err)) - if errs and self.fast_fail: - raise errs[0] - - #-------------------------------------------------------------------------- - # 2nd attempt: do not parse whole structure but search through stream and - # yield results as they become available - # Keep in mind that after every yield the stream position may be anything! 
- @generator_with_opened_main_stream def search_pattern(self, stream, pattern): """ search for pattern in stream, return indices """ From 54b5693fa71da5a2c373e058777fe7a065ea9dc8 Mon Sep 17 00:00:00 2001 From: Christian Herdtweck Date: Fri, 7 Oct 2022 16:27:05 +0200 Subject: [PATCH 09/14] oleobj: Do not trust is_zipfile This can easily be fooled as shown by some malware sample. So, do it the pythonic way: try treating it like a zip file and deal with the exceptions if it is not. --- oletools/oleobj.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/oletools/oleobj.py b/oletools/oleobj.py index eef96b66..641ae55c 100644 --- a/oletools/oleobj.py +++ b/oletools/oleobj.py @@ -50,7 +50,7 @@ import re import sys import io -from zipfile import is_zipfile +from zipfile import BadZipFile, is_zipfile import random import olefile @@ -70,7 +70,7 @@ from oletools.thirdparty import xglob from oletools.ppt_record_parser import (is_ppt, PptFile, PptRecordExOleVbaActiveXAtom) -from oletools.ooxml import XmlParser +from oletools.ooxml import XmlParser, BadOOXML from oletools.common.io_encoding import ensure_stdout_handles_unicode # ----------------------------------------------------------------------------- @@ -866,7 +866,7 @@ def process_file(filename, data, output_dir=None): did_dump = False xml_parser = None - if is_zipfile(filename): + try: # do not trust is_zipfile, can easily be fooled log.info('file could be an OOXML file, looking for relationships with ' 'external links') xml_parser = XmlParser(filename) @@ -878,6 +878,10 @@ def process_file(filename, data, output_dir=None): for target in find_customUI(xml_parser): did_dump = True print("Found customUI tag with external link or VBA macro %s (possibly exploiting CVE-2021-42292)" % target) + except (BadZipFile, BadOOXML, UnicodeDecodeError): + log.debug("", exc_info=True) + log.info("Not an OOXML file after all") + # look for ole files inside file (e.g. 
unzip docx) # have to finish work on every ole stream inside iteration, since handles From 76da83a7fea13c37fe95066ba26fa223e6b253f5 Mon Sep 17 00:00:00 2001 From: Christian Herdtweck Date: Fri, 7 Oct 2022 17:00:14 +0200 Subject: [PATCH 10/14] ppt_record_parser: Optimize data loading Do not remember potentially huge blobs in memory, need that just for debugging. --- oletools/ppt_record_parser.py | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/oletools/ppt_record_parser.py b/oletools/ppt_record_parser.py index e86192cb..485c7551 100644 --- a/oletools/ppt_record_parser.py +++ b/oletools/ppt_record_parser.py @@ -66,6 +66,9 @@ from oletools import record_base +# flag to remember some more data for debug-printing +debug_print = False + # types of relevant records (there are much more than listed here, c.f. [MS-PPT] 2.13.24) # and https://learn.microsoft.com/en-us/openspecs/office_file_formats/ms-ppt # these names are parsed in `record_class_for_type`: only if a record ends in "Container" will we find the @@ -143,8 +146,8 @@ (0x040b, 'DrawingGroupContainer'), (0x040c, 'DrawingContainer'), (0x0423, 'RoundTripOArtTextStyles12Atom'), # to extract data from these, could create class ... - (0x0428, 'RoundTripCustomTableStyles12Atom'), # ... like PptRecordExOleVbaActiveXAtom - (0x040e, 'RoundTripThemeAtom'), + (0x0428, 'RoundTripCustomTableStyles12Atom'), # ... like PptRecordExOleVbaActiveXAtom ... + (0x040e, 'RoundTripThemeAtom'), # ... 
to parse zip/ooxml data (0x040f, 'RoundTripColorMappingAtom'), (0x041c, 'RoundTripOriginalMainMasterId12Atom'), (0x041e, 'RoundTripContentMasterInfo12Atom'), @@ -318,7 +321,10 @@ def record_class_for_type(cls, rec_type): read_all_data = False try: record_name = RECORD_TYPES[rec_type] - if record_name.endswith('Container'): + if record_name.startswith('RoundTrip'): + is_container = False + read_all_data = debug_print + elif record_name.endswith('Container'): is_container = True read_all_data = True elif record_name.endswith('Atom'): @@ -326,7 +332,7 @@ def record_class_for_type(cls, rec_type): read_all_data = False elif record_name.endswith('Blob'): is_container = False - read_all_data = True + read_all_data = debug_print elif record_name == 'OfficeArtClientData': is_container = True read_all_data = True @@ -395,7 +401,9 @@ def _type_str(self): def __str__(self): """Create string representation. Use super class except for Blobs.""" try: - if RECORD_TYPES[self.type].endswith('Blob'): + if debug_print and \ + (RECORD_TYPES[self.type].endswith('Blob') or + RECORD_TYPES[self.type].startswith('RoundTrip')): contents = ''.join(chr(ch) if ch in STR_PRINTABLE_CHARS else '.' for ch in self.data) if len(contents) > STR_MAX_CONTENT_LEN: data_text = contents[:STR_MAX_CONTENT_LEN - 5] + '[...]' @@ -868,6 +876,7 @@ def print_records(record, print_fn, indent, do_print_record): if __name__ == '__main__': + debug_print = True def do_per_record(record): print_records(record, logging.info, 2, False) sys.exit(record_base.test(sys.argv[1:], PptFile, From feb23090347d14eee77be6d3d26db37f98b97f30 Mon Sep 17 00:00:00 2001 From: Christian Herdtweck Date: Mon, 10 Oct 2022 11:33:22 +0200 Subject: [PATCH 11/14] tests: Add sample and test for interactive ppt Self-made sample that triggers default browser with an URL upon clicking a shape, and that calls calc.exe upon hovering over another shape. 
--- tests/olevba/test_basic.py | 20 ++++++++++++++++++++ tests/test-data/olevba/mouse-over.ppt | Bin 0 -> 93184 bytes 2 files changed, 20 insertions(+) create mode 100644 tests/test-data/olevba/mouse-over.ppt diff --git a/tests/olevba/test_basic.py b/tests/olevba/test_basic.py index 5be1269a..09412544 100644 --- a/tests/olevba/test_basic.py +++ b/tests/olevba/test_basic.py @@ -147,6 +147,26 @@ def test_xlm(self): self.assertIn('AutoExec', types) self.assertIn('Suspicious', types) + def test_interactive_ppt(self): + """Test detection of interactive ppt feature in special sample.""" + SAMPLE = join(DATA_BASE_DIR, 'olevba', 'mouse-over.ppt') + out_str, _ = call_and_capture('olevba', args = [SAMPLE, '-j']) + output = json.loads(out_str) + self.assertEqual(len(output), 2) + self.assertEqual(output[0]['type'], 'MetaInformation') + self.assertEqual(output[0]['script_name'], 'olevba') + result = output[1] + self.assertTrue(result['json_conversion_successful']) + self.assertEqual(len(result['macros']), 1) + self.assertEqual(result['macros'][0]['ole_stream'], 'PowerPoint Document') + self.assertTrue(result['macros'][0]['code'].startswith('ExHyperlinkContainer')) + self.assertTrue(result['macros'][0]['code'].endswith('calc.exe')) + self.assertTrue(any(entry['type'] == 'AutoExec' for entry in result['analysis'])) + self.assertTrue(any(entry['type'] == 'Suspicious' for entry in result['analysis'])) + self.assertTrue(any(entry['keyword'] == 'calc.exe' for entry in result['analysis'])) + self.assertTrue(any(entry['keyword'] == 'InteractiveControls' for entry in result['analysis'])) + self.assertTrue(any(entry['keyword'] == 'MouseClick/OverInteractiveInfoContainer' for entry in result['analysis'])) + # just in case somebody calls this file as a script if __name__ == '__main__': diff --git a/tests/test-data/olevba/mouse-over.ppt b/tests/test-data/olevba/mouse-over.ppt new file mode 100644 index 0000000000000000000000000000000000000000..3bbb9c3b5faa5ec9c501cb956b0b3133d3651771 GIT binary patch literal 93184
zcmeFa1z1+i+BSUCEhtEcq;xAOA&t@4L;~G zpjamc4#dHU@HI@4A3T5m1FRTUPrv+YGDE^>-#?)OXUvjQKP1p{(0eRFOF(6p z1n)BjZ?`*faR5281}^{HK<`Hc-lsM2*WRxU!kZSKrit{Yvi_gthX=g>$GiV(`Jp$< ze6N3~&3@D$R1RqT|B&HOJ^gO{L;V=S-{yaQ4Jx!N(O{18i{{Q?Bq+W9eCfG` z$DLRyNp7q=nU3h}5eVW==2Ml(u0gU*rqmq^S0qJPI5DWqYjsKPisr7SR|Mg+>yTKc zMNCb9Dt;}=S31SI#{04O8Pn9dh;-STr8%9(F1B?7Zl2rzA^xfs$p+~SM_X|yl~VC~ zY+fn&;!3FnW~^y>3p#e!ncWgiUOG3;SJHL$S~v*E<A=Mmhue7mV|75P)8 zSxSTA-496@3^9X^ZSK?W~@5rwzJ>5w6yXbNonRCt%f>nP15!7~PnRKItPbUezqZXMzN32TjeKO%03G;dpEt2{}xnY>XEVTjaQof4GgT~V+5Fj zGy)a5m+dcp^y&Brtvl?1ETiF>ovA9@Th(#$%cd+3F&IBF?*&xkm z@seCyt)ysIKxTNP@7c)ZEc@hLKGp52m z8b`k&s5Z`yLqU1Zf+Wrl7fFLfg24WH4d3?t1IjTh<|>!{H2%ba<8*{`RJ>q z%{>_92+rVwK+2cNkKX%1Zo}yLC~LskO6%Q2?;rZ!S+sKo%$7HgVLUa1qq*+d~8f` z-vk(i8-m|Nz(K->A#TQi3|xSrpX3h-dLf7w2gBGj1XgdbDA<7Wf#H~D)dig_5L$pM zGOPs-4$#A6a7BX&f{I0j`E)?ktp(WO0m*WN%>Zg*1g2PvlWE=t-1Wfp?*@pE9h=4_YwO(%(u8IbB^!|C4e+rW9pU|EcE-mC%T(anq6 zWV*NVEho6p9%?wxOgjj&%%hW15HO>nDPFH$%K+snkJR#{yP-)?n3z18cx$`ag=n`lmNTIf8+7 z{%^c_8=(Xgloi%7bW29Xj6cAA#( zNgB{J`QJ5&F!Jwd{V`wUK+jVFu?AE=mXvf^-~PQfRj6H6VizQva-nru85$| zgJ8h_lvF_pp62!R8A=FB;0Fu<0ipuWzK{k0)gKCu+5(3IdT=WsJuuW`I8;D-;NU^_ zQH5ZnTOfHTE-K^)x$OeRM+V!(z6{)9uulkh;CfQjljoC*4|K(b4WZD1NMtZP#M7`B z!Lk7cTbhxIVLw86KLI%S{nw$Wz83O#cpM?6&1?rY!ASvk8*sx$+sJmmsLUHL&!Zu(-RdM&^PXQ=?RDc@q z8gL22MLY4QMm)(f6ai`%Xe9qR0^}2h3%LWWBqG%JY-&y+2z&j#@4Y!0i&&sm<$vRb z4eDOyB>rjL|7hP6CnyIe=F|_g+ey1aPEg~Xno~ch&p`f=f&Yge9b!A^Jb$FBYGo)L zpw(eTAQUd>1izuGfJgLzTn8IYz{agc|0Tvy5+3q^|`^)LpvSl_G zt=_TJ*TvM@TBDeJb!Df*o+(nPiriv;WZC4p}cUF zX`~6hFC7OdTCCzv?t-TM^J3-WBBu0H;6nq(3kDc3zbxPX9w+~c2E%_*)PSe~^mb^R zfTjJ8m9jwat(Q7to5f$?#vH!-n_Y-iU7JiX!0_vSt=IgTzIf2IGezWbs%pi69gbnm zJ>*IHy7Bh|aAJnlSeHyB+r-Wn+chzi8AdD9?f6GgDq5%vv472pSKQ^$f1XO4k~$W6 z{(A3_WYj?jNw0SLhEWoGrWV_?JVhLq0emrm{n2k3AKy3)IoKU=#x9XvjNQ^(?uwIS z6pP5ZqQ=H&ymBYnh=O3XkM*~Gec~`83R>aT0zeGJK~LYpNZG;F!pI&{IAK1QJm<$o zC^Xj;;%s{4$t!c$)z$~R51LC&$a7YVk!nN=r{bbI;Z z7h}aK%X-Y*JA3XgA5i(eCkdI4LmmxDZ{sfZBhRjGE$CkgHoJwO<;Kxvt1|YgzUYvF 
z$xEmbqHw=c>iu!r1A+cvelrQ4)y@4;4*VLM4NTgA!M*{maKGWc(2n7$5jr7+Ke;bFNEZUMLV~h@ z15$AU)>8(J@COF139A5Ehz7oAQ-N{^;e<^>1)+w)CXOrM$blUm+P4LiW(XdN1rP1& zf~U-&y2XO!KOb(R>^7-z6T5^PcJfE&bz{2^v|7k7if|Q}UIEf4; z0E`WAgL((3n(qc`mmeXZRypYyzz=GZlMZ@f$N9-bLErHCO)7Z5q;lbxRB(Pt<=ii+ zoCE1zK{Nnk;g4K4W@BLcEAV>?G6iYse>03dqI)3@bN@SsNwX6|rC#w}>awH5X0xLS z3FLO2D)w}zF^geP;fXLlJlaqBGR^n6Lx$_VTA2zn5;CqkahIBxl}@oceWB2D+cTe1 ze)3*NXU|s_=g=fKoif$r@^Vp1du5h=Rb&Y3*gMykn8*(d2hE_4ctBl2n-afBtbnLCX#Nu0@H+ik;VvD%wh-`1`NPNVo4mNA!4?CRl^O7;8&gjM2h{AI225&QGodhg*$a%8Zd0cOz zPj?P+t*Uxvp(&r+GDQfJu`SV+p}(AKC@>PPwI{^F*A&H`&$K{}@PY3f)f1!oxUCep z0Y7@)9CD|8cw5h)DKLM1oi^unJ51`$1M?T{z2)#!I%B3T-&lCu{Yj1B;-0Un%=caceX$LUdPK0{n23{edqSh@Q$Ptmz?O+Yn&rAfgy_&h?9w* z4%g4k7&#p&D6OX3D5@|CuRru>Y9C=>oS+fTuqMbsy)B}jpr?5?Hk0K1vd7WRKuP3M zN5qsOB@<4YSU7IH^zH6i=8@ERjZ{@mJ+|61mk?@?C1SpXJ;q>`qL0^Kbh6mCMc*F0 zTh**t9u}{qx>8Fe!zIF#rQvr#$!h*MQLRNnb07qE)$I%JgLmfcfw^KEHa9x1HWgSk zE#l`BSKx}PNsK*las0$AtUSI;QbC-^A>8M+m#n?XS3y%5Z~WXNFza#^mW6k{QIX1m z=C!9q%8nEt93D!olKLkVA+Q<_ns9w{xYv$!TQH*`1G#@-+5GEyjW+$u{s|dE_u$OA zq^8f8@%VrF5)~GT|2jKUMOt;Lhk7aE+;$u1CH{wbPYiptWyg4sqFmIyh!l&A=aXhl z5D>W$id?v&(XTumu6S)~U{L2}e#on%`g)xkkNVv>+IFL9cFEleOa&U#>K0qx++0)4 z52Q)8ddv(ocg9=?F))u*4b;%s$``H^yy)DTvp?R^tBy8nZZ#5iM!VDz!O^XSGtuIN za=E-V(dOoi+|b~L;3Z6E=S@nZ8t=JtCH*85g&ial%Teo@IfKf>cHPNn=lm=+dn2) z3at_*C5evml)qTrQT46oQYnFA84vN|OZ;uRWX$d-3;o+VnPgiS_HB~(Rz?EEKUUTCc`KTi2 z#QP%*4W5y^~jD_SVt~?*3FCzmED~T!o!Lxi9DeB+$|b8Mluy? 
z4lBu-%vQursH0C}4Wzo(?Ac?*s0S+nr~nFc@{enBU0cvkd91 zX!;qO2?(oS1l8GyX9xSwh`1oH|8nU6O164I+<=pr4jYDv*gy#pVED~SaL0k6p6oe7 zObi)N4s000CN&9kN8E(e)nIS&`yT1f*F!e=?|YTdJO(8Uh5rohKlU);A^HOJ+Rz@R z7O?)WApGebrV3D$Lc5d}V7mVO-sM#g7L?+o4DVn#@SDW+h-de8p%9>| zK_0O-6`+Uj`?@DqSmB9!@Kk_0m^m0(7+c#}>N$|>8|m5U8vzHPKh4I5>IYQIi8qprY7&{iwc7*?--2pe}n_iDF+1aBlXAiTisZK_|WW({iIZwval09;#GSO6d6d6CdSXDd`JBjxN*XU#B zNqWok$1f)2p7h@!ukJ?>xutmRMYw>;3oVZ?#^b1!OI>49?;Q5rgwPUY9@I4Uue_gJ zokpS_^JNmI>$PuqpV0iU(Ze>6{Slf~0@usy+I$67UNe=P)~(HI$+U3 zPllH<2{pq9N9O3T2(RE+7ezR3)Vp!f&13f!%{|wXqVWGX#XC6PE1t=5A&dPLs;@U)ra`k?Xs47MyjjPF&G{&ULk>@B5idgM8b58p z^M|(zDy&v^`~(SdvDGb};e`46)OAGS`j*71XKW_27G%jMKBEsSh1Mf~dp3e$Wd#SJ>?x#WPm;-l-Xhc>T^_6?*IC>^ zd10Z%xjX$AOoab84TIamW!z+7d7=d*DGnGYXNN(F>ZBa;RUFUqQSXXxn!=UQu!srr z%s{nJ1+irmnZ{{D;*E)#CbQU#mF~~oytUi}E`u0 zHR;xVL4~fZ!qdP1q~K`|#*0)GCLy!<>vyHQ%kOW@Xeax!ho8$KPD&kY;`W#84yi9` zBe{1a#v-ql)8);Z+ua%~RBoG#g=%SWR=Xw2qJbW_)aUHNvX+5UW z=R)q-n7?~oj-|-hMD5CT{dm}iS}irH+|KI_sjN$b0a-?vZM&}On}brBphUJIquw5a za(6e|vAo4ndoQOq1$Vh^e0;TBY2eBG-3Z{wQElDE2IUT74D~NWckz0&eo49g*hcY1 zDPFoA27*RpDz?(PwwQL@1>;$D?gkB%#rPxo{h~sh48daKZNYJ;j)w=KEbqyxbfQFd zZC|ctWDIqYYUU3z>AW?6U=I(6-az}_Z~O9+$_x2#n!QhAG3Sc&`U)=*782Fy*qoon zuk^@G%qqNQ9=rF#O!cDk10^^dcbRF1{sbYWLxwQIyIJzqFTQH?CgRQb<{Y8yDsaUX zn3@*lIuh$|)_mMlr@f*hGB>JwQ&CZqzYD28{P-e12Y&1uJ_ zy#Q{f^9OMcE*(1ZBSwtJTPCwiB(L;bwvl)%V5Jc4PZif5a$Zs5N>{cj)0H--SQ1Jr z!}@1#iKB1n?{!YQX&EI_`d9-N1EQ{u_bu+FECs`uh=KcJPw}VFeAy3eG!uyP;d1JN%rb!T;03{h2b|pu-D<1XO8oKAJ8&Hf--z)yU8ie11~PWB zFE)Qbt<*T6G+B0Yy3D|`^Rc!cc^{cMzfv}EST1yO*z9!{BMNz>6|GfaWqobZGf zn2;Lt&vVKhJ4TULz~`TyRuz`+5W)!rY7IT2wv@LjDjZIlVsxS8BPy4o44jh1s;cyy zdkW%n(66G zblmIY#H3*wj}T*`aYb+qGVYNS-)9bdxO!XZiSx9Q%<%Xd7hbBmkDrYa-kM66<_6t& z^RlQrU_lO3NU*-UrgLA(aMz8q41@i6c^zkPFh9s$A*NtYaN4%EG^k~p;EM%LnefMq z$c$!AV)PgRk(0x!pSOX)IIV#{F22qZFi;)8n7^?2I&&EBm&pI(u%oIodvM|VCFEdf6dIf zK=@N;E&)y?^CS)DNpAFaUq@#b(5lwwZD1#|Cd{u++5p|- z$Ra<;Xh35RpKY%Aj>N)G4LKUA2FbxD7ltKiCIu;a}Vdwhu^QBzH%sJ1LnF~V@Y(O?8k 
zw5xhXJLCxwc`3)tkfCV|3x_|xT43hPr{kM)FM-RjztX_$!?<7COr=aBwQ%QaDk2$Vx;eDZ5nX#& z{KyF*Iae+tcbM_Qe~P!!2&Y*vB;;AUNi|)0b*zct;8kI;bG-7EI?=$}UFR5A`BTPm zeBbnl#nmsQvB*u^g?U#CS(KKuQn+2(a(($SF1A=gO;-8v0q0%iPd|$&x?9Swd;tW`3||=gJpGPnYl@ zes1=?3{x%z3Uc^*5+boWJ;97jH^NkyM^M)1F5g1}FNH8ddOBWm#q|rHAG0VQy~;zC ziKdyY z3=xOfyqslgNp9^n<#(RV-H4@nl|Sd(dB5DXTOGb39nIa$mbNdfv?^xOM2FbxRWsw; zmI=oR@m2E3V2P4zHqC$9LDg| zV@|8;AumJXM|kO$au;C-<$gsw8a`6ieqz0(U+Y&j+KF~dOeQvMl=sVhOJ2-Bv6;HY zXt>luPZvLOFUOI{@XpgfJj^ov_tHy`89t^6Uc;GbTs92v=6;B3(O22o*Dy3)rlOuO zc5`EK$k2B%)^_$dPfcK@LAJkH0Tut^lvmblZHl}~8|`vpYt;Gu{44YCI=yY!9=ho3 z@DGSwyl0#=Q-1t(YpCo?L@e(&6j$Mqz&&x;-gw}p8ng>Vwp zxV7l_CQ8>yh4y?(9@ogAvn=Y>6TW?Z_2tgiz^y-h`bH~7S?rO)x;)eQzm zKGO6IA;e4V$1cuOxnCFVx{8<9_z?wntQu`xY7!^wp%1ni=Fs%h`Ua--|12hSR%U{x z_-_6R6FS>^KjOk*Z~rwDnmfaU*e-4o|CtFvqy#$d2{9q_lk|UOLce~t192iqDGChW zMDXA3KRMCFf8)a)OHhm-{OIIERj|?g^V}-*CVw~(mTn)SLZZ?N;nU?*Y9{u zc$BOa{8dbe-CIqfi=LAC7y%M-dcK`-#XGR{5VR{7XrtYQk`I3|RKb$b%JACa~CcNlpF@!{ebsVqv(wnJa;v5gDB|0rvwg}G?#g44_ z38{zO2(HijaYF9ZV)q&p-K)Gt)7Ob4ebJ4!C0b?kIy zmr7$}G#gw+J2np_@a4&Q$u@y(IMW-nc0(ZLGXW^kpXUQvo{^I_>~2l2E=xrSV`@)fMdSQfKx3BHtN z{+6$aT*^ty*M*)1SUkp%LH43+ujChGWP74FXw6X;r+lIP>P-Y$DbF&;65?$_lPp?% zYhvUm$+0#wzFD1i&jRa8i!y;)CbCOwE6rI4?oFSNbx7YWw9SxeecAugFrxe=M&K!} z0SjyXD|nS``n1ToVAf>?^p`Yi6B4h8qA6G8pT)V_t7zjN2s-bB!wmmf)4|!U23p{| z`76`G*~a@37yPaTpyg+p^6bKx5R&wufB&`PU(l&|9uWSoA97!1fdt7U_yL94!UeF& zb`B8!MV)~&&IpVvwzWFaPOQlvqhjRS7BfGObT6-Q+ z;rC8d7N~yRl^IbHDI~|YV0z8Io6fgj*F=U=(~*=~AXUCC!br7Bd{aAbhFku1salBd zB|?6^JnW>;4b6>o+(AB)A<0U4?(b|&WGV$BqwUC|b4Lz%;QKk9XnYQR&J8#!P7$t# z#PBTiijFTK)K-va2&^ydCsEBuS2qZ9$(2ZnqWV|0#*VHP?o0IIi?H<7(|%#eWemw0 zxZSR@PgJq2W5hYkSvXC;jS+sKqXIMJ?w|=d5%aqizVwi_$+6oOjc+3>BR#lup3Aiz z`eN=ZCPygDi0jg0;jH6V9mw@BcqENp)+{yjN70{5i??5I zU~U05>KzxG`#J8DHFp~ohI9A_t`g_ns%3j{0Xz7$e4|pCh~aQA&WM!wB!@ngcR`Hp z-Ep#{bfQ@Sq?T;OTqWoDb-E&RA_Xh%RDI#f7l9Is$h_vd3Q0^UAB}z|xwxxm+#NI_eqo6I4@F1KjFXJO)@1M*|^w2Xc5~V zbjxwjbdx}TqmS{nN{^yEoH@J430C0YerH{7|)UPZV#5v6z!9~Ft(7Zta~q* 
z63Iw1gL}cqUhMXL_oGV~$5rkvhXdLURJf`7*o*A80d8KDOWTE-7ug9V=Fi=k$85gC zmQIOQ&R_Q8dIMq;eJPiDjT)yo)u$%Ryv&^~ePQE?mM4mQ*_nKCB3RGqGU)PL$oK@y z83Q!Jg6%fohtsjypTAzhYl&~Z856xJK3KjY*y6Tfb&=$vhze;m=VI4Ec{C&9yZ2ZE zv{8e7lK$kFnukg)$O`pQ#5Ffd6(5Zjz?pLSKX|8Qk@isn4^IOjUXa%Td$Pm$Z9DNn zihJtUtOE@ftiCTO?t5E2@Qj+kzBNaB?`4zX3sE^f}1Fwb>p(jB~-o_gq1eK zxqj_uwbnWQ_U_BfjK`N(DdU28_V-j1rmptsE3c2Y9Igm@?>FskWE>r=j?XMAZ_EdV zc<&yPO&=~~Y$#(hPAAWQd?MKVb-(+G_t8R2^Vc_7O@qS^(K2H3FgDINe@?H&!(X0_ z8r=Irm5)Z5FD4j<)EAxnEX$h;7vs=0g3fLhd7^f~{#;QPQe%so-uZMY59$!ng$}A3 zpMvmVxCJ>zeieHn<1wsYN9qFeXkoh(nk ztKx~Kk&k!R>{0yT?r3t&#Ck;wsa@|XYjyKk(@@6KFyNKWPtlJ48ZGiQ+>L_O zw9YDt+o5|D15vH3`2#lHPD}m3S4&>7r zpW2>L-634-C%}(jT$7G&8&B}qi zw!ClpK%h-QcIVTWrHTFI>%HUMw1KQckIBKyA9$_45|JH*PkQC^=a?$pr=QP6((WchO4`M#gu@DXjiH1R!#botqKTY3Lyr}s+`mOT)(Zn{vpD-hFvhf0GsG#D9p5q`+UV02X$M4I{uml z{?W2ov#@~kR$OS7L~mz95WO%7hKnT1MHU*G*R8b8hn4e(J_(yznpN{{7}zS`^`Ek` z%6#~c-t?XJ&Nnsofar%=MuT@-hp+PIO9zu=dAU+t>KN#k+4ks|p{io4j~1nhotouc zS$s`SeZf8|w)*{}r9_ax3yS1BO>k&)an$31)!Rie2nCaJU13P+k}p4Y1y?#8p%dL0 zy!gVpW^bTPDz>!sZJeRkO4{&6?@;p2b|-Bj8#%-&@=^PHFVM6?9)wsBd-DYr-Fo^i zEj}{KH1+>w*IYb+2jgt9|IgR>&N$n z4eE5L$Tc4g@j0Kl2x2`-)ee7CV^GzKt?*PThON@;UXD|@l~II^9Ev}8@i+1(#5HWy z6Q3Wv895l6YMeGUVI_CP%<9)xq_1Y2+fKe>8cfqb+xrsPA||nN;?m|~DbLFO-LZBr z9M1MarPY8OsUb}B2XYhEY6p4~B~k5emOQ47e(jWT)uivZcMR&fGFRuPb)q)jzAZ>e zN#gvt`6*Zk1zX<}M0uEDkGtER=5PNa zgV=Ycp`ml;A4UJ;F5y{D4ubt+exo!wd(!eON&Oj{4bAI=>#TfrXu{o((xjz{;LiPk_?Y!NG=?mDSz=Fl9Y^7Hb|`G}q`jyWF&DI+4& zee8-0Pa=AxZF!TA1k6mLVOS+hf@Om>DIeQz*LI81>?K%>4G_6qtlkpspK+Fd#cUGa zcCVt%px=b5`6CN!^4rTX8j9}b`$_hVaj~Pp=WYcAH#;0eUtWyO7nV>UW6NwTTO}^? z8(g-3kVw*akF+AF*LcT;ZYd`&Ac5zp;5n9(1kvy@x}Il6Q|^ahN@=LsbmK&qWOEMAlnm&y;%8jpGU9n+Rn91FI{=NI|+P5Ekr|1N%IzXomE9bQ}3cvj;lP! 
zZ_C~+&%mw!oFlSriluF{GZ_Dv&|L~$X3^S!qL^|6<@KoD))Qz?xZ#gvjVw?8F3 zJe0Eysc|j1W4u+g5i3=7Q}+p_B#M7du2DqpBabF&FYXL(YQc|vK?%~tLL3Cov%QqL z`@}eEFp@{6U|lAHti3E38<4I#z0qhB>@#o3?Cf48WfbLwLt2U`3^rt@u{!@|ioK0f z7@1&ESk17HU<~CF=d0AGc(1*OKS*fb%kX}E_uh!gIqN{txL55A&%LDr=BQm`ya-}6 z9w)6jUDHR}qX@ei7%t6C8FG=%FJ76v=JX0 zvyJMk*bJ{`YB?-5QB;cdM}`cUxxT)sHQqca`HE^;KaD2dp~2TpIA2u_^^4dui-q1u zk`ns9sF(iO{z58F4m>j08C4HR^Xl(9r;@zu&`V$PiVF^qg|B8@OF?OH(_VlN9Jx6 z`>bholmAtOCxSk{M?*xwB=XVQrY`@ME9u7v^zZMG-z9_h#&3TXrtW;(T^{4`LD8dW z^{5%VwWHVa^xu-yoClSIJAwrpdaeJ_xBoa>pA~r^*e~WU%+_b6z|YwK;!MxkarHCe zMKId_J2|Ph!St}@KJW`YWNMNvKP$?`Xq5kxKi8UPE#ZOz1`RbX_|^! zRdSY>;qF({_nWSy%EfiQqb`3apq?z9S{h&Uu%qBMZJ^#@Ynk}9OPDukIUiBI9ZOOE z@XB0u$)k8%b|5c|{65CJp%!LFOLq=W^kl9ncp8-!t6s-T78K!ks^bIrUy@(T#Z?%5 zpG_Y73YCeW8|igV`=jMzTADfvjOm7G(=Y8d>(Yjw-Y(ZSb7vjZ)^q1F@K56h(MY#B|ZeEYkcvB`LKvad-5Y3fCc=x?J-eWx3Xny2RiKaT!Bs~NDuYK2YTVNz> zEYB7l4Xfo%Mqh?hVmg=TGl;I#^kp!`pz{kH-ePUBbYmNQ-LV?mNm18_);tgvBph{P zM&u=V!o8N_>`$GEdsxB==&SF_aQRXFYHy?4Fh$>G+t~8UkToGq%d@iUDEXw^c`L`P zH5w+)Pr%jT$P%Apy=i$4dmMTjdZ?rzY;cw;&z3=XwATw)xZ9A>?Zqwys>ZAhs zkZTu9NCaf&I0>`VzX~uQ-!X}2pK%;DyQ>`T6D+Zo>g?CW_MZL%O4{Sk1c>O}L3<0Y z2g9BxXl)~ix`z=`2}bg8K5$2w7mr^_LXEmO%HPy2e=N~>a420CDYWj+sljzae&T?Z zSwlXFIDaE~NPyI~Zu<_Z8UG|(fxM^W7UAJc>*CKZ82(e_^4W3mZ_4GzCvrI&$YnFI zpYzAAfM=)d{)_Z{`KGwm-c;op8${Xdj%3$(i7ae*mZ7;K&YCAnOpzG^ke zMR+KvyCJl^G4v+ZBkX*^tvAMu(fPt31z_UJkMTn<+`%<<{$#CKE?*TdRvgy+*#b>j6`WuK99#9Umfe&_ya;$uEs71y0T zH}S?}Sv^E-JFnsWoS5+!1rgbN@-ib;TZpb09Vi`Fw=HI9S8gX$t?*Eev2PX5U|-M< z6R;EDoP1=yZz_;HN9LOoTgqT){_biZm;EO?9An;##OF~06ZntO^$t10M({mpec~F3 z=ahH@=UjB-GHX94d>aUgI>NNydT8EpF||#UIVN2BfvrrBy${i1h-fRW?xtc6_x&6C z$7D~cLsDdyEwkTs9Y&*X=Be8`-!Xr6_A@Hp-k=QgU2isEwBXoPkVzXC}M10scrBx!yuL z+7gao@%nW{)`xqcJ_$XyFy=?6f~%j|H~X-kYbSQtV!G8zTI?T(P`f^icFRxJ}M*;V?lmJih zb4D|kxA>vFq62=_MjBm`j1IC>jO$~uS*c92SDr348arj)bLz%?t7)39+6C0oR-?%KtBS>O-sh+_M#z?cI%8G(vl_&9C7^d&7Ax1|K%O(K7k=qCR9C;@EKO zA8UtZ9Y#7@iWd%6A644YE_9GQ@#avMd6lV>Qh;MA&Lr_^#e3`LvCjV#!f{sM|C@v( 
z6zFMSFdslTkbv!P!tu9)?_VhR#7f2bN5FX}YH+;hFAj_CVv0NoftRSq?;5~CvgvpT1iyo?cTZ;Nx1!!aaLr7rkYF=ih&qjw!y99?Tz zFn}(vm+^+<)VOF6Te^zb3$pwd@AsdE^-x6Q^5=;^dp-KPqN_f+>0qHUGKI1!FXe;c z1)EKpd}Z3W%19J{nqCdblyTJoo@r)AMWv9IdA(@#1ClADtDE;bwlZ_CJHPpoJ$PVV z{~Xszn>vbmVI&$=O(EnH_iY{Sgwpj6HH?-J%}56w4s)P*sJ_(tdhrn zM;zm&_jO%D%$)>54Vp{kiP@>zBZP7kt4013mZP{~rq9P>sf@3=Cd*bvp;SmZ*~0`X zh-VQeWzX-t(fB-dc`%uMkeV_0X-%IHeZsU_@00tR=uQluB3ygl_QletT=Yykj|=y( z->-(}lhW6YdE$@C?n9-tl-?I4db9{h!$l0=l*xY!*>Lq*##thA=D%DUT=wjKedN-C z>;tR#n*@8}vvl&6==BY3sp3~Qmhy#?%(P0-p9m=QC3wS4eM{m!!u^Wxlh%MSr*x^q zhQFdPvv^O*$=1_gC%Ou|I!30IsHBvpQdfnEkYXcN3M1v)pZ6PT}Qw7r=TwAp4le|7G}w*YSM+ zm3UU=l-A;b%eEh+B5FT+6kRht{MaGn6xy?q+1OohWer-teD7d0MbWajJ#=Sguzr3qbQM&{X3ck!yIKR2R!-3YUh zS7X*AT>jchL_76zIzi#_7wo=B>|Mk+7b0(F^_8;A?RgjO*>4Ru%}h03RdIIo^r!Dc z(e&0bTj}gZW8t>`q7pH9UbWumhA>{xVdKL?Fzin?{{JbH;aOqyZ%&3ZCujC;z+~tI z*5Q9ohJR0n|6($9>x}Rgg#-U^4QNLH(qtH}baKA7@yZr9mDdgbx0_hVEI?i8{KFYGzlWdqx*c0A34%TU(*)T=c;O|UozsVJ< zBx~)Fu~11s4B1-q7L=-;^!z*&#-tn-FeTV^(pf(m%;3@yR~BqvW}!;zkU+|_2ySEuEhJte*Wrsw7?fji5c)>^ zLY!olz}`8(?sjQ29!a&@i~YGP`|A$ooYU7mSB6H!E)2nJj~|#%N>S0lwwe-+rs)(-jb)ySCPZ6PG}#-$M>sgRojZblGhg3FZ%rOu z53ZHlw@+t!=TDU2&D`Ol{4LH)CrJpaU`Ul%!6J}Qsu!2|WuIiG4S7~UrdQYpYSH5BP#uW}%zk zEkc(eo*89ZG1QxjK5r|CI;(mx8?O=VKE_+Jw4?UUkNL!<9mcPUo8G!{pp`rWKfaeP znr?k;(Mn8Z%T!);02W>USq$uKAN)6EAj!%8hXRm+Jb$Yr{+5CNS_YP7KZQa6bB+#t z(DavN;F<=Ifjl@(tV74-%hPY-OQKL8V^m8LVB|^;ySPsfGY`i&QAJSK1M%KL#$0g3n4iK;710p?+vk=i@P%e*4_j%qzmAAkM&xnp(0 zQFFkkt6wwp@zV>XU!Ar;Bz~eX>t72k|KR=+cHaW`hLe41iXofnOD&Z~Pi1bA_NT6; z3}tf#4aA0_f-oo5bB%LsmsJ|0t+;i*5ZxVoFV4X70sOS65-|?7RGHqPr4C1__`QQn zPN`~`XpVDYt~!?CBeD-Gb_WVG&yO5P(A6+po=>4IW=tnMyyA+4-Hhs}9!fc|B0eee z0Edv0pr!a-?S3xZk=vH!b@BTPPi@yT-G>8*<WSlqy?E1QO*Bmvc`ec&pe?a| zE|xnsC%G+}p42SK+N7hw_MB)gWojjf)q%c&=Tq!n#TK)zIYEOl<<4L>Aus-g-3}ku zHWpe7!V2=s@ON)veQF{{^-S)hY@%@omLt>Li8;!b~D%FOR3HSzd~dY7x@1B z7KVV(%5m_SMz+Bd%e*ZlMK(HjgF(cx@Cf-!c!VjEr^4;Vs89ux52pTo-+~o$ zcKz~tw9#=M`D4}U8nEVbrlN;wkXY3DerY637>d)38(%j+|Hwb0_Wx<`OyIFv{>Oi; 
zkv$?5k5ZIfi!CDADzcX)`@XMHmXJMJqL4?*(qgA9kzFVeDTP8*wq!|(l;6zrL|wOg zulv8R@ArRyuiqK3IcLtyoH;Y^+0JLqV?IV{F@5DGAx7{*BtNvP%4|)kTsHM5JBmG& zsf(Jz^_yk7dg;^EM9J+Uixk@4HSVU^^=2_XJLW=mAE%90d5&&+-SsR+2lHyI$0hR_ z;!Sa)HA#A@jicr37i2qMk<=M&kerU()G}S?$>7(1B}ylKNUw30hoXyI-|nSSZIstj zb{eZgt#1x1I#(yLZCO{-NGG6qHN{_%%4y@&Ap(c))91FB(lwTSt*csWnxktP(s42R z;x*rN-g@hGkrJ_W%!eA}7r$I4qRAX{cMUkHk{7}|t+-)|!R`4K)09^-Yab~IkiIk= zH>6~duvv_oHfigx+E*zPqFqhBVb|QN5tiaSJgG@Fl| z=}0fu3>Y_d+;)efYhz>KOXRD)R}Xtn(HZMTZW3$9p0h%qDjH|8UIS)MMf_qnWkyoh z$+8#}a}&g9T60{Koj>X8L#LKXMZbOUf*;3AziUU+N*qarJNH%?Cymz7*3*hx$X;UI zGKgtYqTbpOe}UNFODE%?#bll?=}i-!dj6Ab4n^}a_t(ml%#Fq^>}U=rZqS-m4G^RX zC2Xba%=@t4CTiOenb@_vYD?es9TH{@vunw`#9yg4Fraykhhr1RN3#Zv&Bwwh43DZt zAK7gA!mp>pB^R-ayRmPFSnTYBj2BO5!x?S{J-_zYsJwv6^)yNFE6(SK3{P<~=Izvq zB;}Rg-gHW_z?4DEu*o?$(8|M++fuO)?mYvI!h!HvbvE!R5g8f~9E2{Z4U$Z|6c z6PSF)SD0YCb0;gc3N3HaJ+erm2aghq+}AP z`-Gvw-EVWUW(t@`A3SD5t2R|sy>;ZODksh!eDQkD-l@9p>*Vn-RW(OM9__Pr@`w_d zu)}sAEiaG1Q)cd>Ti!^&&th`xR1Kfff_K3)Ot4hYG0H`;UGreX`*#UPD?8Ue9Na(Q zgd;&PxW|EE9$lVBm#6(_p_VK03PcY_B#cNXW~jw=m#bqgl$R68U!TcZC(fayre!%LcH7Xs|ZBnCNx zpKMS15+tN7zF|1Ke|^Vw_qwf>IW}5CXO6l{G_C&<{`|s*z`9pvss-xvIaQ;BhI!iN z4Ufj&_!%;pra%OU?N>G7_j zCdvCdc(HtHxxDAvxgMq1w;Fga7857yTzzn%pt$Ug8a6gwze=OU-4(1{HGjM?{Q$ea z`09epm#e#P<*l>u+_2ZBYJ78n+b#8{CJpLNgZ6KwIA1&#z#e-b($M_Eq}Z?}JgO?c zC0wK`pUsWj&nTW@R3tYwrJ20k#o=IG5YxL%mur-{OzxlLyb9%$H7j`p%Jn^JZdk1C zW4KIT8z(>6+3IuK+wD`gOEp-n_c6NN!EAkC*}CH@vgU}yB+OPIZtfrvu`%$ekP&3 z2kDG=Kbg8t_~R)et{N+*w+pWq&NUk|vM21(_owc4;Bma4sO!Sp#2mov7JkmpkeKKj zbqQaP;@1k7v}6~nB+{hON+ZINyU~x+#w3EiDaod`DAc8~O59&mdf*&)xG^@yG<|F$ zM4hzVX#;0P#%+l=s=?Z9c_yQ^4mZ@_CeY7VvG%`TSJ!rGQxj_>hk#FK5%-6Z4BAgY z796Mf;z}$`-X~E}(GF`K9Mm>38t<;9RuL}V!B+2oi9hyh{uo=2B8Pa6oOq;3gjV#6 zHT)TpjQX{Tcl^9>he%L0)awY8%D5aT4J-(f=&hrYcQ(2%$9^j3O~LTiXujTI#BtrW zdfk1#Hw#T)L67@vjvKa}IPmSXC%M^K5vgj{D20lW9i#IVka{@ z>v77@W8J;QA>S?04(s+wm6J;bzj*)6ym|hEPUMz{FQXSfahGtFmFo?}JJE8sF}_^>JYzP!uJZ7tw=_9-F3;6H^z$cA zx_^pXHO}=?d)ZO)BKxn0M9qy*{g|XK`n- 
zFNGE2p*wicjAvFwTxRDky?ICWc;3;d1*DhWx}1Wz&u9Xz?w*&^sP!GK(0qYzys+3l7Q$;vsAfv0&XHa^7DcV<&bI$Y1B zyfP2Jc>YsxV)>pEQ8Yqj3C)9|A(V*JWev81*7k7 zR5C!9eg1mc=cIoj2O*FQ)~U5;D6Gx3Xs4Y}a&eig4wxXcOJ7=tlaZK<@eUoKEX@6kx_}P#n0ZQ+SuVy_X(N}?rHqhscfu7~k)GJ} zIZh(mhygX)dj;mkorYyLXY7;f_T5mB)A0~Yb8URxeK&z>O^$Vq7n|(C!0D}ao154z z{2ABlE%?cO^>h4W1MW!SuOj#a3Dr++O^hFqX2Tj_@M~~E$ z9X6Nbl;YmzZt?A=IdJaQl;`=bhf#Fv*Ajp1RV&OG?R}psujwTHyjj+LC`&<0EjPeMM1J_19V2o@LBb8IV)BCj z`ZfFH;!QIB)oc?5A2=HsT1~e&8Cgo4IRE;@Y;vP~fA!^e1zc$}Th!S;5&IKK^cB!W z)VUe-l5Ko`)axDPp8H*P#AQl-J<u^ z|7nCu`Ydqat5nhl?tvMEO8RI)=9cy=CItQ%kj@97s1EL&jGFaj2xE+*3aqcQ`Bt_u<#$Z=tBB=h)uMU~NoHfcieDKUdS- z)eWQU=eSexxiF^%LVTBOh5!x1Nzm$eW>wVw1CzNsNm2A9G6rbVi2(RC@^^aA1}$O0ij9C zUlGE$A3P!0HXtDt+`af6u@u0WgP#xIa}y710EBF9>|j(p$q|vJQCAAd*wycRh$S!Tu>&YN-DicoOn}6EK^JVWV0qChUeOAogmy z;We=A2fN{Aa2EWz8{+(lCWCS3Rdz!Vcc~jLfcz|XL*T#E+Hj15xk|VQH9ISw3l8A~Lw+euBUYl87upqVZnj00H!p ztAT1GJ9grp`wW-nBEbfrNZhZE6QqytZ4%qY^ zT+7ljA>%#Qi9ZMo*Z;78UlQ`a-*H@7IxMo`* z!WM4YH|Etk^;J|{XE)SGSkzAPNQzo4RyU!rvxH>P*|os6I`KpLA$9rYfV_-BuEvSJ zO+|NXnXyI9Jpqp@96!`lhuyJj^4XR1L^LbHzG<+dms*!6`N`$Igu89>B{ur9?9#OL zjmVmD=~l5j7MAsr6o^>Q`^J19U$dHlI zcjXfzJ>i%#=$M(wIDL!mIFWEz(1jg^E+5L=v0Zzwl=L-6&_qU&N@iTvyAt-YT9n`E z?wj|iq;YvWdhTA-j3zVcTDzZmmL{0_wLX)#N8EUn0v+=kmp6%_q-j&HzVTZUGN#o{ z`4)>rKBtd!(y~on+|iO{4D=D!#HgLvm=bd_zI9DfTsjkj!dEtjl37Bw|Hb+k+733@_{=_O*}p?7DWkQZ$)p^9*HYbVY<`L1`pK z(P3hnRrDe*wZ8HyVEIB{yhK3#N$TmoU~=eJBAM=i(rHJ zuYWNueW=uzdO<~;j~4SL=W&NNv1aai`d1p{=2rvv?4BvPmzce$xOZ#MIp&3u>(?)w zuFc(-?9TlrNsHk`r}D)6c&01Pi_SqDjLLgk)CaF_w9&tAvGFP8)pUL?Gx8q0_TBEL zzE733o%lVsdq|zGITq;KC8O%dNE54c)ae|?*rxlG|5QfHu3S+)qsSVPG@6`?g1f!y zv=5w4KP{VC8QV876oIXesG|~mW?^#fX03R?hWLVwtA8hvxYcFrMrN&eyH(uiKo3-@NSUM(rO zEMguJHrze%l|-BvkjpTL+~s~9|v~7i26=Gxd+2) z-TNH}Qgv&_on5w(tDNUha`*mXp*B1{R5jHSe`c&c%(Sp(%M0ZlwC|FxWIrP-pLo`z zK)XHm=o6AhWK1gg`VqNrHM5*Pu}azBVl=P6eu?&Qs}G;K4W-1v*tS~&2CS)R`eskm z<;3HipWXkGdFh}+XZ#vj$*K(wkv;4olOGu`x7gSmu{K3-O{c-wy-W_{W7Q+S-_xkNmWn|< 
zoZY8tmtC&V4T=k+^sL9+CAm$KutQzd&z!$ZlV+|htcy;dd~u`J!rPaHnnAT&{kV5- z=5~E5-pzBz$84CwkCn%T=?Q17Q|mrvk=%aSXr?4?&F91E6R)Z{{GUDHpQ2%o> zB0pH>S&|Vq12-xoR%tLk45R3C*VgEg_ccUi#8t_NgS$@Z4)S6Dnysuak5R=IeDk&ICuYjBEHBQEFCX7ckc;|udr=L?g0!|$c4@L%1# z6FbW`H28L~Si^2hvg5g2&ne@FiBD>uWPHtL|x? z+F7!b>BzvC#oJ?gM;8KiDsNJ6J>ynprtyg>tmuhd0PT*WnNP8}5_EIvN#~wabYg8x^ zN%MbSEpgD#FpXsd`Jl#>GIQ0Xr}VvMt(p|Av0muj%DXH~(Wj11eNb{1OJ<;BzTm=| zphz7%^6`zBB_Uhv*oe8akn;1Lal!<>%LW zW7hC;67R@vy0-qQuq?JxenUB*JZIRtE1}u`yC#C|k^Dz-GVcoBlrT@kO{>AX@q28kI(nZ26lRJn$UfO~w^r_$J#UXjo5yf%<(Z_G)#G`Xp&rfj%9;ZYCgyVR7bjgd50 zoIigK&$y&d$Tyvzw$5hn8xlXRFg-e-j4@`h+DCRTJ?t&NIo!(b*Pqh8?DLJvmSukZ zQMC#=VT<_60T!hJMvrlpG0mit;?Bhl26ANDLJbneXK5m}(>>!(D`fh_7(N~fxLe

*St36>hgmZ*%r^N}@W_wxD!br$#4C{zAa zm~1&nB*YLj8!$ne+m_tIUG(7jZN;n4`48y&`1RZ$=M=p|uGF14(vcL@KkfAGGh%l+m-Y|o5?+Efiqo_SKW9U2in3tUULQFMK2|y) zW{4x=fw+Qe4df&^!w;GEVEtYg_?UtuNxVn{Ls9d8 ziNLo-D6_(D^jiac2%ok*#qzV##>SH&JiK$c2Y?^R3O`X0aRxLV_+ARzP$;nQOL#P} zaGf;x?&4wYU}o-Sj?pl+a zAJTz^_0a?@tg|P;!UFdLO9$*Qus_!w$o=nrF$0SN(*{4GaRDb>(N_9JY!?pnHRRLJVn{c9s zQ$2ik7_BlW6ag$55mpm;A^x}{96c#u_HnCnl!P0>Z-$`X8eCd1jtm6`c?douUxAT; zYTjy?Ai@po6<~FNCBRh^=oL4>#R>y+fEttlN&qE*5wB%gZY;FtDg7 zXJ}#qxPhstC;~x&5pjWDUdyO*iYg|CdckEJHlw$*72dn#6BrSJzj0z{2>55@6oJdQ zhzKkeJG;1u56a2F;&1VCbp?;BD9Qnsy}d)OxJwYHw-e?E7?7n5Y#iPl5r+jqAbw|S z>%{CLJ`sEndi)ataoC)pURX!Arl!A-W@~E;gK)>?(B*2?)zu9Qg!PWi$N?2y(b|cx zj@j8oPPAuZoHWfVZ`w0=PlX`Z!5<$xZwI6Hx_ zahOzV13sp$Eo?2z4FjJ-YiFmet7}nCPVdm{!Tn;(j~jyR0X8JO0jYJiHuMfnLI*a+FlYxxV8O%zM|D@%*3RCX3^-5^ zfu0)?20M2+H z5EIlmevk*<8QcY`9&|box_lSj0p@WKnv;_Of--V)fEzpx3=u1uQ$!r>zaSK!{`Y$W z;mi(F1~WV8?_dlEm-yKq9}PYJlE8BR{beZL6W{xmBY@gGc<|ut1bl-6rlO%$U0wYj zqPBJQ!oRbDNf=D)_>|G(KOq2SL(nzw{S-LxZ@j-uaCYg<2OkP56dU*bwHEIK`ucKp z|1yQY^aM|ch=>5$0KIak7rvaM1|@(JKnb7(Py#3clmJQqC4dq@37`Z}0w@8L07?KQ zfD%9npaf6?C;^lJN&qE*5frZ!5 z4evKpRXSj(cvwm0kOt20H8Fz70RM;I!(lq`@$gLg{R~dHYv|7oEEBL+h#UBGLR=AX zL;(C*BM#u&1u;g1fy*4Y`4LmZ5wQnh(7^j(KA`z;JOlrvzkB>dvjN<~0IXqk%;0bq zynluW=9dgQ|HDQWaR&rl06oYOgJ=P|=HPm@h6U8<&*RZ*?xO~gFsq5U$~b@b`KwHJ L)zSVpKmUIKQNw~b literal 0 HcmV?d00001 From 547cd30d70e00c5546cf2f75d06c6f86320e06d3 Mon Sep 17 00:00:00 2001 From: Christian Herdtweck Date: Wed, 15 Jun 2022 11:22:17 +0200 Subject: [PATCH 12/14] tests: Add helper to temporarily extract malware samples When testing json-output we need to run samples through the "main" functions of modules, not just their "process_file" functions that would accept the extracted and decrypted data from the existing helper function "loop_over_files". They need a filename as input, so add helper to create a temp dir and extract&decrypt samples to that temporarily. 
--- tests/test_utils/testdata_reader.py | 46 +++++++++++++++++++++++++---- tests/test_utils/utils.py | 2 +- 2 files changed, 41 insertions(+), 7 deletions(-) diff --git a/tests/test_utils/testdata_reader.py b/tests/test_utils/testdata_reader.py index 5f1a6baa..81f96f03 100644 --- a/tests/test_utils/testdata_reader.py +++ b/tests/test_utils/testdata_reader.py @@ -7,9 +7,9 @@ """ import os, sys, zipfile -from os.path import relpath, join, isfile +from os.path import relpath, join, isfile, splitext from contextlib import contextmanager -from tempfile import mkstemp +from tempfile import mkstemp, TemporaryDirectory, NamedTemporaryFile from . import DATA_BASE_DIR @@ -73,11 +73,10 @@ def loop_over_files(subdir=''): and the contents of the file, with the file being unzipped first if it ends with .zip. - :arg str subdir: Optional subdir of test data dir that caller is interested - in - """ - # create temp dir to extract files into + See also: :py:meth:`loop_and_extract` + :param str subdir: Optional subdir of test data dir that caller is interested in + """ for base_dir, _, files in os.walk(join(DATA_BASE_DIR, subdir)): for filename in files: relative_path = relpath(join(base_dir, filename), DATA_BASE_DIR) @@ -87,6 +86,41 @@ def loop_over_files(subdir=''): yield relative_path, read(relative_path) +def loop_and_extract(subdir=''): + """ + Find all files, decrypting them to tempdir if necessary. + + Does a `os.walk` through all test data or the given subdir and yields + the absolute path for each sample, which is either its original location + in `DATA_BASE_DIR` or in a temporary directory if it had to be decrypted. + + The temp dir and files inside it are always deleted right after usage. 
+ + See also: :py:meth:`loop_over_files` + + :param str subdir: Optional subdir of test data dir that caller is interested in + """ + with TemporaryDirectory(prefix='oletools-test-') as temp_dir: + for base_dir, _, files in os.walk(join(DATA_BASE_DIR, subdir)): + for filename in files: + full_path = join(base_dir, filename) + if filename.endswith('.zip'): + # remove the ".zip" and split the rest into actual name and extension + actual_name, actual_extn = splitext(splitext(filename)[0]) + + with zipfile.ZipFile(full_path, 'r') as zip_file: + # create a temp file that has a proper file name and is deleted on closing + with NamedTemporaryFile(dir=temp_dir, prefix=actual_name, suffix=actual_extn) \ + as temp_file: + # our test samples are not big, so we can read the whole thing at once + temp_file.write(zip_file.read(zip_file.namelist()[0], + pwd=ENCRYPTED_FILES_PASSWORD)) + temp_file.flush() + yield temp_file.name + else: + yield full_path + + @contextmanager def decrypt_sample(relpath): """ diff --git a/tests/test_utils/utils.py b/tests/test_utils/utils.py index 45cedc8d..cd7b6e82 100644 --- a/tests/test_utils/utils.py +++ b/tests/test_utils/utils.py @@ -36,7 +36,7 @@ def call_and_capture(module, args=None, accept_nonzero_exit=False, :param bool fail_nonzero: Raise error if command returns non-0 return code :param bool exclude_stderr: Exclude output to `sys.stderr` from output (e.g. if parsing output through json) - :returns: ret_code, output + :returns: output, ret_codt :rtype: int, str """ # create a PYTHONPATH environment var to prefer our current code From d67ae41c607ddf9390f78d29f6fcd8c17db4ae0f Mon Sep 17 00:00:00 2001 From: Christian Herdtweck Date: Tue, 22 Nov 2022 10:49:31 +0100 Subject: [PATCH 13/14] tests: Return path-part from loop_and_extract When unzipping into temp dir, we often need to know the original sample name. 
--- tests/test_utils/__init__.py | 2 +- tests/test_utils/testdata_reader.py | 9 ++++++--- tests/test_utils/utils.py | 4 ++-- 3 files changed, 9 insertions(+), 6 deletions(-) diff --git a/tests/test_utils/__init__.py b/tests/test_utils/__init__.py index ad823416..23146e93 100644 --- a/tests/test_utils/__init__.py +++ b/tests/test_utils/__init__.py @@ -1,2 +1,2 @@ from .utils import * -from .testdata_reader import * \ No newline at end of file +from .testdata_reader import * diff --git a/tests/test_utils/testdata_reader.py b/tests/test_utils/testdata_reader.py index 81f96f03..5343305e 100644 --- a/tests/test_utils/testdata_reader.py +++ b/tests/test_utils/testdata_reader.py @@ -99,11 +99,14 @@ def loop_and_extract(subdir=''): See also: :py:meth:`loop_over_files` :param str subdir: Optional subdir of test data dir that caller is interested in + :returns: nothing but yields full path to sample file and relative path of original + sample inside test dir """ with TemporaryDirectory(prefix='oletools-test-') as temp_dir: for base_dir, _, files in os.walk(join(DATA_BASE_DIR, subdir)): for filename in files: full_path = join(base_dir, filename) + path_part = relpath(full_path, DATA_BASE_DIR) if filename.endswith('.zip'): # remove the ".zip" and split the rest into actual name and extension actual_name, actual_extn = splitext(splitext(filename)[0]) @@ -116,9 +119,9 @@ def loop_and_extract(subdir=''): temp_file.write(zip_file.read(zip_file.namelist()[0], pwd=ENCRYPTED_FILES_PASSWORD)) temp_file.flush() - yield temp_file.name + yield temp_file.name, path_part else: - yield full_path + yield full_path, path_part @contextmanager @@ -157,4 +160,4 @@ def decrypt_sample(relpath): if tmp_descriptor is not None: os.close(tmp_descriptor) if tmp_name is not None and isfile(tmp_name): - os.unlink(tmp_name) \ No newline at end of file + os.unlink(tmp_name) diff --git a/tests/test_utils/utils.py b/tests/test_utils/utils.py index cd7b6e82..33815811 100644 --- a/tests/test_utils/utils.py +++ 
b/tests/test_utils/utils.py @@ -36,8 +36,8 @@ def call_and_capture(module, args=None, accept_nonzero_exit=False, :param bool fail_nonzero: Raise error if command returns non-0 return code :param bool exclude_stderr: Exclude output to `sys.stderr` from output (e.g. if parsing output through json) - :returns: output, ret_codt - :rtype: int, str + :returns: output, ret_code + :rtype: str, int """ # create a PYTHONPATH environment var to prefer our current code env = os.environ.copy() From 9668cf67c951cb3f9794e16933fb4ea48b22af72 Mon Sep 17 00:00:00 2001 From: Christian Herdtweck Date: Tue, 22 Nov 2022 10:51:33 +0100 Subject: [PATCH 14/14] tests: Start test to run all tools on all data In another branch I missed a bug that occurred in one of our test samples. Avoid this by running all tools over all data --- tests/test_on_all.py | 62 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 62 insertions(+) create mode 100644 tests/test_on_all.py diff --git a/tests/test_on_all.py b/tests/test_on_all.py new file mode 100644 index 00000000..537e9eda --- /dev/null +++ b/tests/test_on_all.py @@ -0,0 +1,62 @@ +""" +Run all tools on all test data to check for regressions. 
+""" + +import unittest + +# Directory with test data, independent of current working directory +from tests.test_utils import loop_and_extract, call_and_capture, DATA_BASE_DIR + + +class TestOnAll(unittest.TestCase): + """Run all tools on all test data.""" + + def do_test(self, module, skip_list_arg=None): + """Helper for the tests that does the actual work.""" + if skip_list_arg is None: + skip_list = [] + else: + skip_list = skip_list_arg + + for full_path, rel_path in loop_and_extract(): + if rel_path in skip_list: + print('Run {0} on all test data: skip {1}' + .format(module, rel_path)) + continue + + output, return_code = call_and_capture(module, [full_path,], + accept_nonzero_exit=True) + if return_code == 0: + continue + + error = '{0} returned {1} for sample {2}' \ + .format(module, return_code, rel_path) + print(error) + for line in output.splitlines(): + print(line.rstrip()) + self.fail(error) + + def test_olevba(self): + """Run olevba on all test data""" + skip_list = ('rtfobj/issue_185.rtf.zip', + 'rtfobj/issue_251.rtf', + 'msodde/RTF-Spec-1.7.rtf', + 'basic/encrypted.docx', + 'encrypted/encrypted.doc', + 'encrypted/encrypted.docm', + 'encrypted/encrypted.docx', + 'encrypted/encrypted.ppt', + 'encrypted/encrypted.pptm', + 'encrypted/encrypted.pptx', + 'encrypted/encrypted.xls', + 'encrypted/encrypted.xlsm', + 'encrypted/encrypted.xlsx', + 'encrypted/encrypted.xlsb', + ) + self.do_test('olevba', skip_list) + + # todo: add all the others as well + +# just in case somebody calls this file as a script +if __name__ == '__main__': + unittest.main()