From fb6074a5d7aa3b57d5610c6626fb144fcf4da3ec Mon Sep 17 00:00:00 2001 From: Christian Herdtweck Date: Fri, 24 Jun 2022 14:51:56 +0200 Subject: [PATCH 1/5] tests: Identify 6 slowest tests and add option to disable them These 6 tests made up more than two thirds of the overall test time. Add an option to disable them to motivate incorporation of regular testing in development cycle: set environment variable OLETOOLS_TEST_SKIP_SLOW=1 . However, these tests are there for a reason, therefore do not disable them by default (like the json dump test). Should do a standard run without "optimizations" like this one before committing to master, for example. --- tests/msodde/test_basic.py | 4 ++++ tests/oleobj/test_basic.py | 18 ++++++++++++++++++ tests/olevba/test_basic.py | 4 +++- 3 files changed, 25 insertions(+), 1 deletion(-) diff --git a/tests/msodde/test_basic.py b/tests/msodde/test_basic.py index 7eed57998..89807ac02 100644 --- a/tests/msodde/test_basic.py +++ b/tests/msodde/test_basic.py @@ -73,6 +73,8 @@ def test_invalid_text(self): """ check that text file argument leads to non-zero exit status """ self.do_test_validity(join(BASE_DIR, 'basic/text'), Exception) + @unittest.skipIf('OLETOOLS_TEST_SKIP_SLOW' in os.environ and os.environ['OLETOOLS_TEST_SKIP_SLOW'] == '1', + "Skip slower tests") def test_encrypted(self): """ check that encrypted files lead to non-zero exit status @@ -119,6 +121,8 @@ def do_test_validity(self, filename, expect_error=None): class TestErrorOutput(unittest.TestCase): """msodde does not specify error by return code but text output.""" + @unittest.skipIf('OLETOOLS_TEST_SKIP_SLOW' in os.environ and os.environ['OLETOOLS_TEST_SKIP_SLOW'] == '1', + "Skip slower tests") def test_crypt_output(self): """Check for helpful error message when failing to decrypt.""" for suffix in 'doc', 'docm', 'docx', 'ppt', 'pptm', 'pptx', 'xls', \ diff --git a/tests/oleobj/test_basic.py b/tests/oleobj/test_basic.py index 3fdcab037..2f750abf2 100644 --- 
@unittest.skipIf(environ.get('OLETOOLS_TEST_SKIP_SLOW') == '1',
                 "Skip slower tests")
def test_nodump(self):
    """
    Run oleobj with --nodump on every sample; nothing may be written.

    After each call the temp dir given via ``-d`` is listed; the test
    fails as soon as it contains anything. Non-zero exit codes of oleobj
    are accepted, only the (absence of) dumped files is checked.
    """
    sample_dir = join(DATA_BASE_DIR, 'oleobj')
    for sample_file, _, _ in SAMPLES:
        sample_path = join(sample_dir, sample_file)
        call_and_capture('oleobj',
                         ['-d', self.temp_dir, '--nodump', sample_path],
                         accept_nonzero_exit=True)
        leftovers = listdir(self.temp_dir)
        if not leftovers:
            continue
        self.fail('Found file in temp dir despite "--nodump": {}'.format(leftovers))
== '1', + "Skip slower tests") def test_crypt_return(self): """ Test that encrypted files give a certain return code. @@ -105,7 +107,7 @@ def test_crypt_return(self): .format(ret_code, args + [filename, ])) # test only first file with all arg combinations, others just - # without arg (test takes too long otherwise + # without arg (test takes too long otherwise) ADD_ARGS = ([], ) def test_xlm(self): From 44985620f38983080e23c1cdd2d53d66cb79f75e Mon Sep 17 00:00:00 2001 From: Christian Herdtweck Date: Tue, 19 Jul 2022 09:43:23 +0200 Subject: [PATCH 2/5] olevba: Check whether input is pure MSO / ActiveMime Olevba might get (and does get in our case) parts of files that were taken apart by an attachment interpreter. Olevba detects mso / ActiveMime components when embedded in an mhtml or excel2003 file, but fails to detect them when the input is pure mso. Fix that. --- oletools/olevba.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/oletools/olevba.py b/oletools/olevba.py index 52ffd5126..c12437411 100644 --- a/oletools/olevba.py +++ b/oletools/olevba.py @@ -2792,6 +2792,11 @@ def __init__(self, filename, data=None, container=None, relaxed=True, encoding=D # It must start with "ID" in uppercase, no whitespace or newline allowed before by Excel: if data.startswith(b'ID'): self.open_slk(data) + # check whether this is mso data + if is_mso_file(data): + log.debug('Found ActiveMime header, decompressing MSO container') + ole_data = mso_file_extract(data) + self.open_ole(ole_data) # Check if this is a plain text VBA or VBScript file: # To avoid scanning binary files, we simply check for some control chars: if self.type is None and b'\x00' not in data: From 2cd4dab4fce9e72959abb0d9c13971b33e2d8e72 Mon Sep 17 00:00:00 2001 From: Christian Herdtweck Date: Mon, 10 Oct 2022 13:40:19 +0200 Subject: [PATCH 3/5] Remove "work in progress" message from all tools Oletools is now used in professional settings, so these warnings appear to naive users more and more often. 
These, however, do not appreciate such modesty, will more likely feel bewildered and insecure. --- oletools/ftguess.py | 2 -- oletools/mraptor.py | 3 --- oletools/msodde.py | 2 -- oletools/oleid.py | 3 --- oletools/olemeta.py | 2 -- oletools/oleobj.py | 3 --- oletools/oletimes.py | 2 -- oletools/rtfobj.py | 2 -- 8 files changed, 19 deletions(-) diff --git a/oletools/ftguess.py b/oletools/ftguess.py index 6db2c8644..75657e4d6 100644 --- a/oletools/ftguess.py +++ b/oletools/ftguess.py @@ -866,8 +866,6 @@ def main(): python_version = '%d.%d.%d' % sys.version_info[0:3] print ('ftguess %s on Python %s - http://decalage.info/python/oletools' % (__version__, python_version)) - print ('THIS IS WORK IN PROGRESS - Check updates regularly!') - print ('Please report any issue at https://github.com/decalage2/oletools/issues') print ('') DEFAULT_LOG_LEVEL = "warning" # Default log level diff --git a/oletools/mraptor.py b/oletools/mraptor.py index 35bf6ed6d..069318eb0 100644 --- a/oletools/mraptor.py +++ b/oletools/mraptor.py @@ -253,8 +253,6 @@ def main(): # Print help if no arguments are passed if len(args) == 0: print('MacroRaptor %s - http://decalage.info/python/oletools' % __version__) - print('This is work in progress, please report issues at %s' % URL_ISSUES) - print(__doc__) parser.print_help() print('\nAn exit code is returned based on the analysis result:') for result in (Result_NoMacro, Result_NotMSOffice, Result_MacroOK, Result_Error, Result_Suspicious): @@ -263,7 +261,6 @@ def main(): # print banner with version print('MacroRaptor %s - http://decalage.info/python/oletools' % __version__) - print('This is work in progress, please report issues at %s' % URL_ISSUES) log_helper.enable_logging(level=options.loglevel) # enable logging in the modules: diff --git a/oletools/msodde.py b/oletools/msodde.py index 303d97476..ee1932148 100644 --- a/oletools/msodde.py +++ b/oletools/msodde.py @@ -225,8 +225,6 @@ # banner to be printed at program start BANNER = """msodde %s - 
http://decalage.info/python/oletools -THIS IS WORK IN PROGRESS - Check updates regularly! -Please report any issue at https://github.com/decalage2/oletools/issues """ % __version__ # === LOGGING ================================================================= diff --git a/oletools/oleid.py b/oletools/oleid.py index 294f073be..bd3b9929c 100644 --- a/oletools/oleid.py +++ b/oletools/oleid.py @@ -513,9 +513,6 @@ def main(): """Called when running this file as script. Shows all info on input file.""" # print banner with version print('oleid %s - http://decalage.info/oletools' % __version__) - print('THIS IS WORK IN PROGRESS - Check updates regularly!') - print('Please report any issue at ' - 'https://github.com/decalage2/oletools/issues') print('') parser = argparse.ArgumentParser(description=__doc__) diff --git a/oletools/olemeta.py b/oletools/olemeta.py index 61317460b..ee539ace7 100644 --- a/oletools/olemeta.py +++ b/oletools/olemeta.py @@ -132,8 +132,6 @@ def process_ole(ole): def main(): # print banner with version print('olemeta %s - http://decalage.info/python/oletools' % __version__) - print ('THIS IS WORK IN PROGRESS - Check updates regularly!') - print ('Please report any issue at https://github.com/decalage2/oletools/issues') usage = 'usage: olemeta [options] [filename2 ...]' parser = optparse.OptionParser(usage=usage) diff --git a/oletools/oleobj.py b/oletools/oleobj.py index 9f67752ea..f75af9fc3 100644 --- a/oletools/oleobj.py +++ b/oletools/oleobj.py @@ -967,9 +967,6 @@ def main(cmd_line_args=None): # print banner with version ensure_stdout_handles_unicode() print('oleobj %s - http://decalage.info/oletools' % __version__) - print('THIS IS WORK IN PROGRESS - Check updates regularly!') - print('Please report any issue at ' - 'https://github.com/decalage2/oletools/issues') print('') usage = 'usage: %(prog)s [options] [filename2 ...]' diff --git a/oletools/oletimes.py b/oletools/oletimes.py index 5d7809a26..9783b85c0 100644 --- a/oletools/oletimes.py +++ 
b/oletools/oletimes.py @@ -111,8 +111,6 @@ def process_ole(ole): def main(): # print banner with version print('oletimes %s - http://decalage.info/python/oletools' % __version__) - print ('THIS IS WORK IN PROGRESS - Check updates regularly!') - print ('Please report any issue at https://github.com/decalage2/oletools/issues') usage = 'usage: oletimes [options] [filename2 ...]' parser = optparse.OptionParser(usage=usage) diff --git a/oletools/rtfobj.py b/oletools/rtfobj.py index f0b4e654e..3685c6ff5 100644 --- a/oletools/rtfobj.py +++ b/oletools/rtfobj.py @@ -1011,8 +1011,6 @@ def main(): python_version = '%d.%d.%d' % sys.version_info[0:3] print ('rtfobj %s on Python %s - http://decalage.info/python/oletools' % (__version__, python_version)) - print ('THIS IS WORK IN PROGRESS - Check updates regularly!') - print ('Please report any issue at https://github.com/decalage2/oletools/issues') print ('') DEFAULT_LOG_LEVEL = "warning" # Default log level From de7a09af014445c86768e51969bc0808a4dc7a38 Mon Sep 17 00:00:00 2001 From: Christian Herdtweck Date: Fri, 22 Dec 2017 15:47:17 +0100 Subject: [PATCH 4/5] olevba: Split large vba lines When deobfuscating text or rtf, sometimes have large (500k chars) lines. Do not try to fit that into regular memory, but split data into overlapping pieces of manageable size. --- oletools/olevba.py | 47 +++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 46 insertions(+), 1 deletion(-) diff --git a/oletools/olevba.py b/oletools/olevba.py index c12437411..137a0db89 100644 --- a/oletools/olevba.py +++ b/oletools/olevba.py @@ -284,6 +284,7 @@ import email.feedparser import string # for printable import json # for json output mode (argument --json) +from random import random # import lxml or ElementTree for XML parsing: try: @@ -2372,7 +2373,7 @@ def detect_vba_strings(vba_code): # Otherwise, start and end offsets are incorrect. 
#: max length of vba code lines that is analyzed in one go. Bigger code chunks
#: are split. Reduce this if you run into memory trouble
MAX_CODE_LINE_LEN = 32000
#: upper bound (exclusive) for the overlap of two consecutive chunks of a
#: split line; must be smaller than MAX_CODE_LINE_LEN
MAX_CODE_LINE_OVERLAP = 500


def split_vba_code(vba_code):
    """ Split vba code (or what is suspected to be one) into manageable parts

    Tries a regular :py:meth:`str.splitlines`. Lines that are not longer than
    :py:data:`MAX_CODE_LINE_LEN` are yielded unchanged. Longer lines (e.g. in
    case of non-vba-code in text files or mis-interpreted rtf) are cut at
    random positions into large overlapping chunks, each shorter than
    :py:data:`MAX_CODE_LINE_LEN`; consecutive chunks overlap so that text
    around a cut position appears in both neighbouring chunks.

    This prevents MemoryErrors in the following parsing of that line, most of
    all if deobfuscating.

    Since chunk sizes are drawn using :py:func:`random.random`, the chunks
    produced for an over-long line differ from call to call.

    :param str vba_code: text to split
    :returns: generator over lines / chunks of lines of `vba_code`
    :raises ValueError: if the module-level limits are inconsistent. Since
                        this is a generator, the error is raised when the
                        first item is requested, not when this is called.
    """
    if MAX_CODE_LINE_LEN < 10:
        raise ValueError('unreasonably small value for max code line length')
    if MAX_CODE_LINE_OVERLAP < 0:
        raise ValueError('unreasonably small value for max code line overlap')
    # ">=": equal values used to pass this check and then caused a
    # ZeroDivisionError in the chunk count estimate below
    if MAX_CODE_LINE_OVERLAP >= MAX_CODE_LINE_LEN:
        raise ValueError('overlap must be smaller than chunks')
    half_len = int(MAX_CODE_LINE_LEN // 2)
    half_overlap = int(MAX_CODE_LINE_OVERLAP // 2)

    # average advance per chunk: mean chunk size minus mean overlap. Integer
    # halving can still make both halves equal (e.g. 11 and 10), so clamp to
    # keep the estimate below from dividing by zero
    mean_idx_add = max(1.0, 1.5 * half_len - 1.5 * half_overlap)

    for line in vba_code.splitlines():
        line_len = len(line)
        if line_len <= MAX_CODE_LINE_LEN:
            # common case: nothing to split, no bookkeeping required
            yield line
            continue

        n_chunks = int(line_len / mean_idx_add)   # only an approximation
        start_idx = 0
        chunk_idx = 0
        while (line_len - start_idx) > MAX_CODE_LINE_LEN:
            chunk_idx += 1
            # chunk size is in [half_len, 2 * half_len), so it never exceeds
            # MAX_CODE_LINE_LEN
            chunk_size = half_len + int(random() * half_len)
            log.debug('splitting line of size %d, yielding chunk of size %d,'
                      ' starting at %d (number %d of approx. %d)',
                      line_len, chunk_size, start_idx, chunk_idx, n_chunks)
            yield line[start_idx:start_idx+chunk_size]
            overlap = half_overlap + int(random() * half_overlap)
            # always advance by at least one char to guarantee termination
            start_idx += max(1, chunk_size - overlap)
        yield line[start_idx:]   # yield the rest