Skip to content
Open
Changes from all commits
Commits
Show all changes
83 commits
Select commit Hold shift + click to select a range
89a24b9
adds test to search through all unzipped docx files
omehes Apr 9, 2026
cf48c2a
Merge branch 'main' into docxsearch
staxly[bot] Apr 10, 2026
cf5af9e
Merge branch 'main' into docxsearch
staxly[bot] Apr 13, 2026
6b65652
Merge branch 'main' into docxsearch
staxly[bot] Apr 13, 2026
a70991a
Merge branch 'main' into docxsearch
staxly[bot] Apr 13, 2026
884ff51
Merge branch 'main' into docxsearch
staxly[bot] Apr 13, 2026
7c95c13
Merge branch 'main' into docxsearch
staxly[bot] Apr 13, 2026
a10939d
Merge branch 'main' into docxsearch
staxly[bot] Apr 13, 2026
a33a0f6
Merge branch 'main' into docxsearch
staxly[bot] Apr 13, 2026
10e8e3b
Merge branch 'main' into docxsearch
staxly[bot] Apr 14, 2026
983a967
Merge branch 'main' into docxsearch
staxly[bot] Apr 14, 2026
3258aa6
Merge branch 'main' into docxsearch
staxly[bot] Apr 14, 2026
48c9b1b
Merge branch 'main' into docxsearch
staxly[bot] Apr 14, 2026
0eafd65
Merge branch 'main' into docxsearch
staxly[bot] Apr 14, 2026
70cc7c4
Merge branch 'main' into docxsearch
staxly[bot] Apr 15, 2026
cf3156d
Merge branch 'main' into docxsearch
staxly[bot] Apr 15, 2026
dd02b1a
Merge branch 'main' into docxsearch
staxly[bot] Apr 17, 2026
b6d18d6
Merge branch 'main' into docxsearch
staxly[bot] Apr 20, 2026
c3cc5af
Merge branch 'main' into docxsearch
staxly[bot] Apr 20, 2026
c82d938
Merge branch 'main' into docxsearch
staxly[bot] Apr 20, 2026
309f4a0
Merge branch 'main' into docxsearch
staxly[bot] Apr 20, 2026
842b884
Merge branch 'main' into docxsearch
staxly[bot] Apr 23, 2026
9f9b595
Merge branch 'main' into docxsearch
staxly[bot] Apr 27, 2026
f1600e8
Merge branch 'main' into docxsearch
staxly[bot] Apr 27, 2026
cae4921
Merge branch 'main' into docxsearch
staxly[bot] Apr 27, 2026
8995c28
Merge branch 'main' into docxsearch
staxly[bot] Apr 27, 2026
cc1c634
Merge branch 'main' into docxsearch
staxly[bot] Apr 27, 2026
c90eee7
Merge branch 'main' into docxsearch
staxly[bot] Apr 27, 2026
6353d89
Merge branch 'main' into docxsearch
staxly[bot] Apr 27, 2026
a6f2f1a
Merge branch 'main' into docxsearch
staxly[bot] Apr 27, 2026
698fdd5
Merge branch 'main' into docxsearch
staxly[bot] Apr 27, 2026
bae3654
Merge branch 'main' into docxsearch
staxly[bot] Apr 27, 2026
583cbf7
Merge branch 'main' into docxsearch
staxly[bot] Apr 27, 2026
12163ad
Merge branch 'main' into docxsearch
staxly[bot] Apr 27, 2026
9c1f8b5
Merge branch 'main' into docxsearch
staxly[bot] Apr 27, 2026
0ecff6d
Merge branch 'main' into docxsearch
staxly[bot] Apr 27, 2026
a4814c5
Merge branch 'main' into docxsearch
staxly[bot] Apr 27, 2026
c53e183
Merge branch 'main' into docxsearch
staxly[bot] Apr 27, 2026
19127fb
Merge branch 'main' into docxsearch
staxly[bot] Apr 27, 2026
995e468
Merge branch 'main' into docxsearch
staxly[bot] Apr 27, 2026
c6f6756
Merge branch 'main' into docxsearch
staxly[bot] Apr 27, 2026
00ead11
Merge branch 'main' into docxsearch
staxly[bot] Apr 27, 2026
54bdc67
Merge branch 'main' into docxsearch
staxly[bot] Apr 27, 2026
63948c6
Merge branch 'main' into docxsearch
staxly[bot] Apr 27, 2026
6ea5860
Merge branch 'main' into docxsearch
staxly[bot] Apr 27, 2026
0dfc5b6
Merge branch 'main' into docxsearch
staxly[bot] Apr 27, 2026
0136954
Merge branch 'main' into docxsearch
staxly[bot] Apr 27, 2026
1adaa4a
Merge branch 'main' into docxsearch
staxly[bot] Apr 27, 2026
3b1f600
Merge branch 'main' into docxsearch
staxly[bot] Apr 27, 2026
6a5325f
Merge branch 'main' into docxsearch
staxly[bot] Apr 27, 2026
8d4b311
Merge branch 'main' into docxsearch
staxly[bot] Apr 27, 2026
e73570c
Merge branch 'main' into docxsearch
staxly[bot] Apr 27, 2026
815291d
Merge branch 'main' into docxsearch
staxly[bot] Apr 27, 2026
4e34428
Merge branch 'main' into docxsearch
staxly[bot] Apr 28, 2026
b958f5a
Merge branch 'main' into docxsearch
staxly[bot] Apr 28, 2026
166cd97
Merge branch 'main' into docxsearch
staxly[bot] Apr 28, 2026
a69b523
Merge branch 'main' into docxsearch
staxly[bot] Apr 28, 2026
6287e64
Merge branch 'main' into docxsearch
staxly[bot] Apr 28, 2026
cd4bddc
Merge branch 'main' into docxsearch
staxly[bot] Apr 28, 2026
dd9bb1d
Merge branch 'main' into docxsearch
staxly[bot] Apr 28, 2026
295b068
Merge branch 'main' into docxsearch
staxly[bot] Apr 28, 2026
7b19a28
Merge branch 'main' into docxsearch
staxly[bot] Apr 28, 2026
47ef964
Merge branch 'main' into docxsearch
staxly[bot] Apr 28, 2026
f71be0d
Merge branch 'main' into docxsearch
staxly[bot] Apr 28, 2026
f3d6592
Merge branch 'main' into docxsearch
staxly[bot] Apr 28, 2026
d0fda47
Merge branch 'main' into docxsearch
staxly[bot] Apr 28, 2026
e1adbc1
Merge branch 'main' into docxsearch
staxly[bot] Apr 28, 2026
53c78e3
Merge branch 'main' into docxsearch
staxly[bot] Apr 28, 2026
6eed865
Merge branch 'main' into docxsearch
staxly[bot] Apr 28, 2026
a16d0b1
Merge branch 'main' into docxsearch
staxly[bot] Apr 28, 2026
082518f
Merge branch 'main' into docxsearch
staxly[bot] Apr 29, 2026
15c6e7e
Merge branch 'main' into docxsearch
staxly[bot] Apr 29, 2026
71fb876
Merge branch 'main' into docxsearch
staxly[bot] May 4, 2026
ebc349b
Merge branch 'main' into docxsearch
staxly[bot] May 4, 2026
d654f7e
Merge branch 'main' into docxsearch
staxly[bot] May 5, 2026
4bde434
Merge branch 'main' into docxsearch
staxly[bot] May 6, 2026
765928a
Merge branch 'main' into docxsearch
staxly[bot] May 8, 2026
44c29f5
Merge branch 'main' into docxsearch
staxly[bot] May 11, 2026
f8d2062
Merge branch 'main' into docxsearch
staxly[bot] May 13, 2026
4b3c11c
Merge branch 'main' into docxsearch
staxly[bot] May 14, 2026
74da084
Merge branch 'main' into docxsearch
staxly[bot] May 14, 2026
4026895
Merge branch 'main' into docxsearch
staxly[bot] May 15, 2026
537fdaa
Merge branch 'main' into docxsearch
staxly[bot] May 15, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
85 changes: 85 additions & 0 deletions e2e_tests/docx-tools/test_search_docx_files_content.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
import os
import re
import shutil
import zipfile
import pytest
from docx import Document

# Prompt once, at import time, for the search terms used by the test below.
# The terms are later joined with "|" and used as a regular expression, which
# is why the example shows escaped backslashes.
# NOTE(review): input() needs an attached stdin, so pytest presumably has to
# be run with -s (capture disabled) for this prompt to work - confirm.
user_input = input(
    "\nEnter search terms separated by commas (e.g. \\\\sqrt, \\\\pi): "
)
# Split on commas, trim whitespace, and drop empty entries: an empty term
# would become an empty regex alternative that matches every paragraph.
PATT_LIST = [
    term for term in (item.strip() for item in user_input.split(",")) if term
]


def test_search_docx_files_content():
    """Search the .docx files inside downloaded book zips for PATT_LIST terms.

    Every ``openstax-osbooks*.zip`` file in ``~/Downloads`` is copied into a
    freshly created ``docx_search`` directory under the current working
    directory and extracted into a folder named after the archive. Each
    paragraph of every extracted ``.docx`` file is then matched against the
    entries of ``PATT_LIST`` (treated as regular expressions and combined
    with ``|``); each matching paragraph is printed with its file path.

    The test fails when no search term was supplied, when the terms do not
    form a valid regular expression, or when no matching zip file exists.
    Finding no match is only reported, not treated as a failure.
    """
    # Validate the search terms first so a bad pattern fails fast, before any
    # files are copied. Empty terms are dropped because an empty alternative
    # would match every paragraph.
    terms = [term for term in PATT_LIST if term]
    if not terms:
        pytest.fail("No search terms were provided")
    try:
        # One combined regex ("term1|term2|term3"), compiled once.
        combined_pattern = re.compile("|".join(terms))
    except re.error as exc:
        pytest.fail(f"Invalid search pattern {terms}: {exc}")

    base_to_dir = os.path.join(os.getcwd(), "docx_search")

    # Start from an empty destination directory on every run.
    if os.path.exists(base_to_dir):
        shutil.rmtree(base_to_dir)
    os.makedirs(base_to_dir)

    # The trailing "" keeps the trailing separator shown in the failure message.
    base_from_dir = os.path.join(os.path.expanduser("~"), "Downloads", "")

    files = [
        name
        for name in os.listdir(base_from_dir)
        if name.startswith("openstax-osbooks")
        and name.lower().endswith(".zip")
        and not os.path.isdir(os.path.join(base_from_dir, name))
    ]

    if not files:
        pytest.fail(f"No zip files found in {base_from_dir}")

    for f_name in files:
        file_dest = os.path.join(base_to_dir, f_name)
        shutil.copy(os.path.join(base_from_dir, f_name), file_dest)

        # Files that merely carry a .zip name but are not archives are left
        # in place untouched.
        if not zipfile.is_zipfile(file_dest):
            continue

        # Extract into a folder named after the zip. splitext removes only
        # the final extension regardless of its case, so "X.ZIP" does not
        # end up with an extract path equal to the archive's own path.
        extract_path = os.path.splitext(file_dest)[0]
        with zipfile.ZipFile(file_dest) as archive:
            archive.extractall(extract_path)
        os.remove(file_dest)

    found_anything = False

    for root, _, filenames in os.walk(base_to_dir):
        for filename in filenames:
            # Skip non-docx files and "~$" files (Word's temporary lock files).
            if not filename.endswith(".docx") or filename.startswith("~$"):
                continue

            f_path = os.path.join(root, filename)
            doc = Document(f_path)

            # Check every paragraph against the combined regex.
            for para in doc.paragraphs:
                match = combined_pattern.search(para.text)
                if match:
                    found_anything = True
                    # match.group() tells us exactly which text was matched.
                    print(
                        f"\nFOUND '{match.group()}': {para.text} \nLOCATION: {f_path}"
                    )

    if not found_anything:
        print(
            f"\n{'=' * 60}\nSEARCH COMPLETE: No matches found for {PATT_LIST}\n{'=' * 60}"
        )
Loading