From fb6074a5d7aa3b57d5610c6626fb144fcf4da3ec Mon Sep 17 00:00:00 2001
From: Christian Herdtweck <christian.herdtweck@intra2net.com>
Date: Fri, 24 Jun 2022 14:51:56 +0200
Subject: [PATCH 1/5] tests: Identify 6 slowest tests and add option to disable
 them

These 6 tests make up more than two thirds of the overall test time. Add
an option to skip them, to encourage running the tests regularly during the
development cycle: set the environment variable OLETOOLS_TEST_SKIP_SLOW=1.

However, these tests are there for a reason, therefore do not disable them
by default (like the json dump test). A standard run without "optimizations"
like this one should still be done before committing to master, for example.
---
 tests/msodde/test_basic.py |  4 ++++
 tests/oleobj/test_basic.py | 18 ++++++++++++++++++
 tests/olevba/test_basic.py |  4 +++-
 3 files changed, 25 insertions(+), 1 deletion(-)

diff --git a/tests/msodde/test_basic.py b/tests/msodde/test_basic.py
index 7eed57998..89807ac02 100644
--- a/tests/msodde/test_basic.py
+++ b/tests/msodde/test_basic.py
@@ -73,6 +73,8 @@ def test_invalid_text(self):
         """ check that text file argument leads to non-zero exit status """
         self.do_test_validity(join(BASE_DIR, 'basic/text'), Exception)
 
+    @unittest.skipIf('OLETOOLS_TEST_SKIP_SLOW' in os.environ and os.environ['OLETOOLS_TEST_SKIP_SLOW'] == '1',
+                     "Skip slower tests")
     def test_encrypted(self):
         """
         check that encrypted files lead to non-zero exit status
@@ -119,6 +121,8 @@ def do_test_validity(self, filename, expect_error=None):
 class TestErrorOutput(unittest.TestCase):
     """msodde does not specify error by return code but text output."""
 
+    @unittest.skipIf('OLETOOLS_TEST_SKIP_SLOW' in os.environ and os.environ['OLETOOLS_TEST_SKIP_SLOW'] == '1',
+                     "Skip slower tests")
     def test_crypt_output(self):
         """Check for helpful error message when failing to decrypt."""
         for suffix in 'doc', 'docm', 'docx', 'ppt', 'pptm', 'pptx', 'xls', \
diff --git a/tests/oleobj/test_basic.py b/tests/oleobj/test_basic.py
index 3fdcab037..2f750abf2 100644
--- a/tests/oleobj/test_basic.py
+++ b/tests/oleobj/test_basic.py
@@ -3,6 +3,7 @@
 import unittest
 from tempfile import mkdtemp
 from shutil import rmtree
+from os import listdir, environ
 from os.path import join, isfile
 from hashlib import md5
 from glob import glob
@@ -91,10 +92,14 @@ def tearDown(self):
         elif self.temp_dir:
             rmtree(self.temp_dir)
 
+    @unittest.skipIf('OLETOOLS_TEST_SKIP_SLOW' in environ and environ['OLETOOLS_TEST_SKIP_SLOW'] == '1',
+                     "Skip slower tests")
     def test_md5(self):
         """ test all files in oleobj test dir """
         self.do_test_md5(['-d', self.temp_dir])
 
+    @unittest.skipIf('OLETOOLS_TEST_SKIP_SLOW' in environ and environ['OLETOOLS_TEST_SKIP_SLOW'] == '1',
+                     "Skip slower tests")
     def test_md5_args(self):
         """
         test that oleobj can be called with -i and -v
@@ -158,6 +163,19 @@ def test_non_streamed(self):
         return self.do_test_md5(['-d', self.temp_dir], test_fun=preread_file,
                                 only_run_every=4)
 
+    @unittest.skipIf('OLETOOLS_TEST_SKIP_SLOW' in environ and environ['OLETOOLS_TEST_SKIP_SLOW'] == '1',
+                     "Skip slower tests")
+    def test_nodump(self):
+        """Run oleobj with --nodump on every sample; temp dir must stay empty."""
+        sample_dir = join(DATA_BASE_DIR, 'oleobj')
+        for sample_file, _, _ in SAMPLES:
+            sample_path = join(sample_dir, sample_file)
+            call_and_capture('oleobj', ['-d', self.temp_dir, '--nodump', sample_path],
+                             accept_nonzero_exit=True)
+        leftovers = listdir(self.temp_dir)
+        if leftovers:
+            self.fail('Found file in temp dir despite "--nodump": {}'.format(leftovers))
+
 
 class TestSaneFilenameCreation(unittest.TestCase):
     """ Test sanitization / creation of sane filenames """
diff --git a/tests/olevba/test_basic.py b/tests/olevba/test_basic.py
index 5be1269a8..988c131e5 100644
--- a/tests/olevba/test_basic.py
+++ b/tests/olevba/test_basic.py
@@ -75,6 +75,8 @@ def test_rtf_behaviour(self):
                 raise self.fail('Found "warn" in output line: "{}"'
                                 .format(line.rstrip()))
 
+    @unittest.skipIf('OLETOOLS_TEST_SKIP_SLOW' in os.environ and os.environ['OLETOOLS_TEST_SKIP_SLOW'] == '1',
+                     "Skip slower tests")
     def test_crypt_return(self):
         """
         Test that encrypted files give a certain return code.
@@ -105,7 +107,7 @@ def test_crypt_return(self):
                                      .format(ret_code, args + [filename, ]))
 
                 # test only first file with all arg combinations, others just
-                # without arg (test takes too long otherwise
+                # without arg (test takes too long otherwise)
                 ADD_ARGS = ([], )
 
     def test_xlm(self):

From 44985620f38983080e23c1cdd2d53d66cb79f75e Mon Sep 17 00:00:00 2001
From: Christian Herdtweck <christian.herdtweck@intra2net.com>
Date: Tue, 19 Jul 2022 09:43:23 +0200
Subject: [PATCH 2/5] olevba: Check whether input is pure MSO / ActiveMime

Olevba might get (and does get in our case) parts of files that were
taken apart by an attachment interpreter. Olevba detects mso /
ActiveMime components when embedded in an mhtml or excel2003 file, but
fails to detect them when the input is pure mso. Fix that.
---
 oletools/olevba.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/oletools/olevba.py b/oletools/olevba.py
index 52ffd5126..c12437411 100644
--- a/oletools/olevba.py
+++ b/oletools/olevba.py
@@ -2792,6 +2792,11 @@ def __init__(self, filename, data=None, container=None, relaxed=True, encoding=D
             # It must start with "ID" in uppercase, no whitespace or newline allowed before by Excel:
             if data.startswith(b'ID'):
                 self.open_slk(data)
+            # check whether this is mso data
+            if is_mso_file(data):
+                log.debug('Found ActiveMime header, decompressing MSO container')
+                ole_data = mso_file_extract(data)
+                self.open_ole(ole_data)
             # Check if this is a plain text VBA or VBScript file:
             # To avoid scanning binary files, we simply check for some control chars:
             if self.type is None and b'\x00' not in data:

From 2cd4dab4fce9e72959abb0d9c13971b33e2d8e72 Mon Sep 17 00:00:00 2001
From: Christian Herdtweck <christian.herdtweck@intra2net.com>
Date: Mon, 10 Oct 2022 13:40:19 +0200
Subject: [PATCH 3/5] Remove "work in progress" message from all tools

Oletools is now used in professional settings, so these warnings appear to
naive users more and more often. Those users, however, do not appreciate
such modesty and are more likely to feel bewildered and insecure.
---
 oletools/ftguess.py  | 2 --
 oletools/mraptor.py  | 3 ---
 oletools/msodde.py   | 2 --
 oletools/oleid.py    | 3 ---
 oletools/olemeta.py  | 2 --
 oletools/oleobj.py   | 3 ---
 oletools/oletimes.py | 2 --
 oletools/rtfobj.py   | 2 --
 8 files changed, 19 deletions(-)

diff --git a/oletools/ftguess.py b/oletools/ftguess.py
index 6db2c8644..75657e4d6 100644
--- a/oletools/ftguess.py
+++ b/oletools/ftguess.py
@@ -866,8 +866,6 @@ def main():
     python_version = '%d.%d.%d' % sys.version_info[0:3]
     print ('ftguess %s on Python %s - http://decalage.info/python/oletools' %
            (__version__, python_version))
-    print ('THIS IS WORK IN PROGRESS - Check updates regularly!')
-    print ('Please report any issue at https://github.com/decalage2/oletools/issues')
     print ('')
 
     DEFAULT_LOG_LEVEL = "warning" # Default log level
diff --git a/oletools/mraptor.py b/oletools/mraptor.py
index 35bf6ed6d..069318eb0 100644
--- a/oletools/mraptor.py
+++ b/oletools/mraptor.py
@@ -253,8 +253,6 @@ def main():
     # Print help if no arguments are passed
     if len(args) == 0:
         print('MacroRaptor %s - http://decalage.info/python/oletools' % __version__)
-        print('This is work in progress, please report issues at %s' % URL_ISSUES)
-        print(__doc__)
         parser.print_help()
         print('\nAn exit code is returned based on the analysis result:')
         for result in (Result_NoMacro, Result_NotMSOffice, Result_MacroOK, Result_Error, Result_Suspicious):
@@ -263,7 +261,6 @@ def main():
 
     # print banner with version
     print('MacroRaptor %s - http://decalage.info/python/oletools' % __version__)
-    print('This is work in progress, please report issues at %s' % URL_ISSUES)
 
     log_helper.enable_logging(level=options.loglevel)
     # enable logging in the modules:
diff --git a/oletools/msodde.py b/oletools/msodde.py
index 303d97476..ee1932148 100644
--- a/oletools/msodde.py
+++ b/oletools/msodde.py
@@ -225,8 +225,6 @@
 
 # banner to be printed at program start
 BANNER = """msodde %s - http://decalage.info/python/oletools
-THIS IS WORK IN PROGRESS - Check updates regularly!
-Please report any issue at https://github.com/decalage2/oletools/issues
 """ % __version__
 
 # === LOGGING =================================================================
diff --git a/oletools/oleid.py b/oletools/oleid.py
index 294f073be..bd3b9929c 100644
--- a/oletools/oleid.py
+++ b/oletools/oleid.py
@@ -513,9 +513,6 @@ def main():
     """Called when running this file as script. Shows all info on input file."""
     # print banner with version
     print('oleid %s - http://decalage.info/oletools' % __version__)
-    print('THIS IS WORK IN PROGRESS - Check updates regularly!')
-    print('Please report any issue at '
-          'https://github.com/decalage2/oletools/issues')
     print('')
 
     parser = argparse.ArgumentParser(description=__doc__)
diff --git a/oletools/olemeta.py b/oletools/olemeta.py
index 61317460b..ee539ace7 100644
--- a/oletools/olemeta.py
+++ b/oletools/olemeta.py
@@ -132,8 +132,6 @@ def process_ole(ole):
 def main():
     # print banner with version
     print('olemeta %s - http://decalage.info/python/oletools' % __version__)
-    print ('THIS IS WORK IN PROGRESS - Check updates regularly!')
-    print ('Please report any issue at https://github.com/decalage2/oletools/issues')
 
     usage = 'usage: olemeta [options] <filename> [filename2 ...]'
     parser = optparse.OptionParser(usage=usage)
diff --git a/oletools/oleobj.py b/oletools/oleobj.py
index 9f67752ea..f75af9fc3 100644
--- a/oletools/oleobj.py
+++ b/oletools/oleobj.py
@@ -967,9 +967,6 @@ def main(cmd_line_args=None):
     # print banner with version
     ensure_stdout_handles_unicode()
     print('oleobj %s - http://decalage.info/oletools' % __version__)
-    print('THIS IS WORK IN PROGRESS - Check updates regularly!')
-    print('Please report any issue at '
-          'https://github.com/decalage2/oletools/issues')
     print('')
 
     usage = 'usage: %(prog)s [options] <filename> [filename2 ...]'
diff --git a/oletools/oletimes.py b/oletools/oletimes.py
index 5d7809a26..9783b85c0 100644
--- a/oletools/oletimes.py
+++ b/oletools/oletimes.py
@@ -111,8 +111,6 @@ def process_ole(ole):
 def main():
     # print banner with version
     print('oletimes %s - http://decalage.info/python/oletools' % __version__)
-    print ('THIS IS WORK IN PROGRESS - Check updates regularly!')
-    print ('Please report any issue at https://github.com/decalage2/oletools/issues')
 
     usage = 'usage: oletimes [options] <filename> [filename2 ...]'
     parser = optparse.OptionParser(usage=usage)
diff --git a/oletools/rtfobj.py b/oletools/rtfobj.py
index f0b4e654e..3685c6ff5 100644
--- a/oletools/rtfobj.py
+++ b/oletools/rtfobj.py
@@ -1011,8 +1011,6 @@ def main():
     python_version = '%d.%d.%d' % sys.version_info[0:3]
     print ('rtfobj %s on Python %s - http://decalage.info/python/oletools' %
            (__version__, python_version))
-    print ('THIS IS WORK IN PROGRESS - Check updates regularly!')
-    print ('Please report any issue at https://github.com/decalage2/oletools/issues')
     print ('')
 
     DEFAULT_LOG_LEVEL = "warning" # Default log level

From de7a09af014445c86768e51969bc0808a4dc7a38 Mon Sep 17 00:00:00 2001
From: Christian Herdtweck <christian.herdtweck@intra2net.com>
Date: Fri, 22 Dec 2017 15:47:17 +0100
Subject: [PATCH 4/5] olevba: Split large vba lines

When deobfuscating text or rtf, olevba sometimes encounters very long lines
(500k chars). Do not try to fit such a line into regular memory as a whole,
but split the data into overlapping pieces of manageable size.
---
 oletools/olevba.py | 47 +++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 46 insertions(+), 1 deletion(-)

diff --git a/oletools/olevba.py b/oletools/olevba.py
index c12437411..137a0db89 100644
--- a/oletools/olevba.py
+++ b/oletools/olevba.py
@@ -284,6 +284,7 @@
 import email.feedparser
 import string  # for printable
 import json   # for json output mode (argument --json)
+from random import random
 
 # import lxml or ElementTree for XML parsing:
 try:
@@ -2372,7 +2373,7 @@ def detect_vba_strings(vba_code):
     #            Otherwise, start and end offsets are incorrect.
     vba_code = vba_code.expandtabs()
     # Split the VBA code line by line to avoid MemoryError on large scripts:
-    for vba_line in vba_code.splitlines():
+    for vba_line in split_vba_code(vba_code):
         for tokens, start, end in vba_expr_str.scanString(vba_line):
             encoded = vba_line[start:end]
             decoded = tokens[0]
@@ -2393,6 +2394,50 @@ def detect_vba_strings(vba_code):
     return results
 
 
+#: max length of a vba code line that is analyzed in one go. Longer lines
+#: are split into chunks. Reduce this if you run into memory trouble
+MAX_CODE_LINE_LEN = 32000
+MAX_CODE_LINE_OVERLAP = 500
+
+
+def split_vba_code(vba_code):
+    """ Split vba code (or what is suspected to be one) into manageable parts
+
+    Splits the text with :py:meth:`str.splitlines`; lines longer than
+    `MAX_CODE_LINE_LEN` (e.g. non-vba-code in text files or mis-interpreted
+    rtf) are further cut into randomly sized, overlapping chunks.
+
+    This prevents MemoryErrors in the following parsing of that line, most of
+    all if deobfuscating. Raises ValueError for unusable size constants.
+    """
+    if MAX_CODE_LINE_LEN < 10:
+        raise ValueError('unreasonably small value for max code line length')
+    if MAX_CODE_LINE_OVERLAP < 0:
+        raise ValueError('unreasonably small value for max code line overlap')
+    HALF_LEN = int(MAX_CODE_LINE_LEN//2)
+    HALF_OVERLAP = int(MAX_CODE_LINE_OVERLAP//2)
+    if HALF_OVERLAP >= HALF_LEN:    # equal halves would divide by zero below
+        raise ValueError('overlap must be smaller than chunks')
+    mean_idx_add = 1.5 * HALF_LEN - 1.5 * HALF_OVERLAP
+
+    for line in vba_code.splitlines():
+        line_len = len(line)
+        n_chunks = int(line_len / mean_idx_add)    # only an approximation
+        start_idx = 0
+        chunk_idx = 0
+        while (line_len - start_idx) > MAX_CODE_LINE_LEN:
+            chunk_idx += 1
+            chunk_size = HALF_LEN + int(random() * HALF_LEN)
+            log.debug('splitting line of size {0}, yielding chunk of size {1},'
+                      ' starting at {2} (number {3} of approx. {4})'
+                      .format(line_len, chunk_size, start_idx, chunk_idx,
+                              n_chunks))
+            yield line[start_idx:start_idx+chunk_size]
+            overlap = HALF_OVERLAP + int(random() * HALF_OVERLAP)
+            start_idx += max(1, chunk_size - overlap)
+        yield line[start_idx:]   # yield the rest
+
+
 def json2ascii(json_obj, encoding='utf8', errors='replace'):
     """
     ensure there is no unicode in json and all strings are safe to decode

From d933fb6ff6a075dfde4ae49d2b5ac1f54544fd08 Mon Sep 17 00:00:00 2001
From: Christian Herdtweck <christian.herdtweck@intra2net.com>
Date: Tue, 29 Nov 2022 12:12:10 +0100
Subject: [PATCH 5/5] olevba: When decompressing code fails, try analyzing
 as-is

Office is likely to do it this way.

This is a re-creation of an old commit from 2016, probably motivated by a
sample obtained then.
---
 oletools/olevba.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/oletools/olevba.py b/oletools/olevba.py
index 137a0db89..92be40deb 100644
--- a/oletools/olevba.py
+++ b/oletools/olevba.py
@@ -3619,6 +3619,9 @@ def extract_macros(self):
                             log.debug('Error processing stream %r in file %r (%s)' % (d.name, self.filename, exc))
                             log.debug('Traceback:', exc_info=True)
                             # do not raise the error, as it is unlikely to be a compressed macro stream
+                            # instead, yield the code as-is, maybe it just was not compressed
+                            log.debug('Try analyzing uncompressed code')
+                            yield (self.filename, d.name, d.name, compressed_code)
             if self.xlm_macros:
                 vba_code = ''
                 for line in self.xlm_macros: