diff --git a/man_spider/lib/parser/parser.py b/man_spider/lib/parser/parser.py index b4205d6..74910c7 100644 --- a/man_spider/lib/parser/parser.py +++ b/man_spider/lib/parser/parser.py @@ -18,7 +18,14 @@ def is_text_file(filepath): best = result.best() # Only consider it a text file if we have high confidence # and the encoding is detected (not binary) - return best is not None and best.encoding is not None + if best is None or best.encoding is None: + return False + # Reject if decoded content has too many replacement characters — + # this means charset-normalizer forced a binary file through as text + text = str(best) + if text and text.count("\ufffd") / len(text) > 0.01: + return False + return True def extract_text_file(filepath): @@ -136,13 +143,11 @@ def grep(self, content, pattern): Interpret PATTERN as an extended regular expression -i, --ignore-case Ignore case distinctions - -a, --text - Process a binary file as if it were text -m NUM, --max-count=NUM - Stop reading a file after NUM matching lines + Stop reading a file after NUM matching lines """ grep_process = sp.Popen( - ["grep", "-Eiam", "5", "--color=always", pattern], stdin=sp.PIPE, stdout=sp.PIPE + ["grep", "-Eim", "5", "--color=always", pattern], stdin=sp.PIPE, stdout=sp.PIPE ) grep_output = grep_process.communicate(content)[0] for line in grep_output.splitlines(): @@ -204,6 +209,15 @@ def extract_text(self, file, pretty_filename): if text_content is None: return matches + # Guard against binary garbage: if more than 1% of characters are + # Unicode replacement chars (U+FFFD), the file was decoded incorrectly. + # Fall back to raw ASCII string extraction to avoid dumping huge binary chunks. + if text_content and text_content.count("\ufffd") / len(text_content) > 0.01: + log.debug(f"High replacement char ratio in {pretty_filename}, falling back to string extraction") + text_content = extract_strings_from_binary(str(file)) + if text_content is None: + return matches + # try to convert to UTF-8 for grep-friendliness try: binary_content = text_content.encode("utf-8", errors="ignore") diff --git a/man_spider/lib/spiderling.py b/man_spider/lib/spiderling.py index 658d8bc..5254929 100644 --- a/man_spider/lib/spiderling.py +++ b/man_spider/lib/spiderling.py @@ -16,6 +16,34 @@ log = logging.getLogger("manspider.spiderling") +# Directories to skip in --noise-filter moderate mode +NOISE_DIRS_MODERATE = [ + "policydefinitions", # Group Policy ADMX/ADML templates (all language variants) + "winsxs", # Windows component store (huge, system-only) + "servicing", # Windows Update staging area +] + +# Additional directories skipped in --noise-filter aggressive mode +NOISE_DIRS_AGGRESSIVE = NOISE_DIRS_MODERATE + [ + "\\windows\\system32", + "\\windows\\syswow64", + "\\windows\\assembly", + "\\windows\\fonts", + "\\windows\\spool", + "windows defender", +] + +# File extensions suppressed by --noise-filter (both modes) +NOISE_EXTENSIONS = [ + ".adml", # Group Policy Administrative Template Language files + ".admx", # Group Policy Administrative Template XML files + ".mui", # Multilingual User Interface resource files + ".mof", # Managed Object Format (WMI definitions) + ".cat", # Windows security catalog files + ".manifest", # Windows assembly manifest files +] + + class SpiderlingMessage: """ Message which gets sent back to the parent through parent_queue diff --git a/man_spider/manspider.py b/man_spider/manspider.py index c8823ed..70d9d32 100755 --- a/man_spider/manspider.py +++ b/man_spider/manspider.py @@ -190,6 +190,17 @@ def main(): metavar="SIZE", ) parser.add_argument("-v", "--verbose", action="store_true", help="show debugging messages") + parser.add_argument( + "--noise-filter", + choices=["moderate", "aggressive"], + default=None, + metavar="MODE", + help=( + "filter out common Windows system noise to reduce clutter. " + "moderate: skips PolicyDefinitions, WinSxS, Servicing + .adml/.admx/.mui/.mof/.cat/.manifest files. " + "aggressive: also skips System32, SysWOW64, Assembly, Fonts, Spool, Windows Defender." + ), + ) parser.add_argument( "--modified-after", type=str, @@ -257,6 +268,15 @@ def main(): options.dirnames = [s.lower() for s in options.dirnames] options.exclude_dirnames = [s.lower() for s in options.exclude_dirnames] + # apply built-in noise filter presets + if options.noise_filter: + from man_spider.lib.spiderling import NOISE_DIRS_MODERATE, NOISE_DIRS_AGGRESSIVE, NOISE_EXTENSIONS + noise_dirs = NOISE_DIRS_MODERATE if options.noise_filter == "moderate" else NOISE_DIRS_AGGRESSIVE + options.exclude_dirnames = list(set(options.exclude_dirnames + [d.lower() for d in noise_dirs])) + # ensure extension format is correct (dot prefix) + noise_exts = [e if e.startswith(".") else f".{e}" for e in NOISE_EXTENSIONS] + options.exclude_extensions = list(set(options.exclude_extensions + noise_exts)) + # deduplicate targets targets = set() [[targets.add(t) for t in g] for g in options.targets]