diff --git a/BUG-deep-chain-method-lookup-on-custom-classes.md b/BUG-deep-chain-method-lookup-on-custom-classes.md new file mode 100644 index 00000000..a92c9be6 --- /dev/null +++ b/BUG-deep-chain-method-lookup-on-custom-classes.md @@ -0,0 +1,347 @@ +# Bug: Relative Import FQN Mismatch in Deep Attribute Chain Method Lookup + +**Component**: `sast-engine/graph/callgraph/resolution/attribute.go` + `extraction/attributes.go` +**Function**: `resolveMethodOnType()` (line 170) — fails due to FQN constructed from attribute registry not matching callgraph key +**Introduced**: Commit `e601234` (deep chain resolution) — chain walk works, final step fails on relative imports +**Severity**: HIGH — blocks L1 type-inferred source matching for projects using relative imports +**Reproduces on**: pyload (confirmed), any project using `from .submodule import Class` pattern +**Does NOT reproduce on**: flat multi-file projects, projects using absolute imports + +--- + +## Summary + +After commit `e601234`, deep attribute chains like `self.pyload.config.get()` correctly resolve the intermediate types through the chain walk (Step 2). The chain `self → pyload → config` resolves to type `pyload.config.parser.ConfigParser`. However, the final method lookup (Step 3) fails because `resolveMethodOnType` constructs `pyload.config.parser.ConfigParser.get` and checks `callGraph.Functions`, which stores the key as `pyload.core.config.parser.ConfigParser.get`. + +The root cause is a **relative import FQN resolution mismatch**: +- **Attribute registry** stores type as `pyload.config.parser.ConfigParser` (resolved from relative import `from .config.parser import ConfigParser` — the `.` is resolved relative to the importing module's parent, dropping the `core` segment) +- **Callgraph** stores function as `pyload.core.config.parser.ConfigParser.get` (derived from the actual file path `pyload/core/config/parser.py`) + +This does NOT reproduce on flat projects or absolute imports because both systems agree on the FQN. It only triggers when the class is imported via Python relative imports (`from .x.y import Z`). + +--- + +## Reproduction + +**NOTE**: This bug does NOT reproduce on flat multi-file projects with absolute imports. +It ONLY reproduces when relative imports are used (`from .submodule import Class`). + +### Scenario A: Flat project — WORKS (no bug) + +```python +# /tmp/test-flat/config.py +class Config: + def __init__(self): + self.data = {} + def get(self, section, key): + return self.data.get(section, {}).get(key, "") + +# /tmp/test-flat/core.py +from config import Config +class Core: + def __init__(self): + self.config = Config() + +# /tmp/test-flat/manager.py +import subprocess +from core import Core +class Manager: + def __init__(self): + self.core = Core() + def run_command(self): + cmd = self.core.config.get("commands", "startup") + subprocess.run(cmd) +``` + +```bash +pathfinder scan -r RULE -p /tmp/test-flat/ --skip-tests=false +# Result: DETECTED — self.core.config.get() resolves to config.Config.get ✅ +# Both attribute registry and callgraph use "config.Config.get" +``` + +### Scenario B: Package with relative imports — FAILS (the bug) + +```python +# /tmp/test-pkg/myapp/__init__.py +(empty) + +# /tmp/test-pkg/myapp/config/parser.py +class ConfigParser: + def __init__(self, userdir): + self.config = {} + def get(self, section, option): + return self.config[section][option]["value"] + +# /tmp/test-pkg/myapp/core/__init__.py +from ..config.parser import ConfigParser # ← RELATIVE IMPORT +class Core: + def __init__(self, userdir): + self.config = ConfigParser(userdir) + +# /tmp/test-pkg/myapp/managers/thread_manager.py +import subprocess +class ThreadManager: + def __init__(self, core): + self.core = core + def reconnect(self): + script = self.core.config.get("reconnect", "script") + subprocess.run(script) +``` + +```bash +pathfinder scan -r RULE -p /tmp/test-pkg/ --skip-tests=false --debug +# Result: 0 findings ❌ +# Debug shows: +# method get not found on type config.parser.ConfigParser +# +# Attribute registry FQN: config.parser.ConfigParser (from relative import resolution) +# Callgraph function FQN: myapp.config.parser.ConfigParser.get (from file path) +# MISMATCH: "config.parser.ConfigParser.get" != "myapp.config.parser.ConfigParser.get" +``` + +### Scenario C: Real-world CVE (pyload) — CONFIRMED BUG + +```bash +git clone --depth 1 https://github.com/pyload/pyload.git /tmp/pyload +pathfinder scan -r RULE -p /tmp/pyload/src/ --skip-tests=false --debug +``` + +The relevant pyload code: + +```python +# pyload/core/__init__.py:117 +from .config.parser import ConfigParser # ← RELATIVE IMPORT (dot prefix) + +# pyload/core/__init__.py:124 +self.config = ConfigParser(self.userdir) + +# pyload/core/managers/thread_manager.py:176 +reconnect_script = self.pyload.config.get("reconnect", "script") +# ↑ 3-level chain: self → pyload → config → get() + +# pyload/core/managers/thread_manager.py:199 +subprocess.run(reconnect_script) # ← CVE GHSA-r7mc-x6x7-cqxx +``` + +Debug output confirms: +``` +Deep chains (3+ levels): 0 (0.0%) ← chains ARE resolving (was 756 before fix) + +Custom class samples: + - method get not found on type pyload.config.parser.ConfigParser + - method set not found on type pyload.config.parser.ConfigParser + - method save not found on type pyload.config.parser.ConfigParser +``` + +**The mismatch**: +- Attribute registry resolves `self.config = ConfigParser(...)` where `ConfigParser` was imported via `from .config.parser import ConfigParser` +- The `.config.parser` relative import resolves to `pyload.config.parser` (parent of `core/__init__.py` is `pyload/`, then append `config.parser`) +- But the actual file is at `pyload/core/config/parser.py`, so the callgraph stores it as `pyload.core.config.parser.ConfigParser.get` +- **Missing `core` segment**: `pyload.config.parser` vs `pyload.core.config.parser` + +--- + +## Root Cause Analysis + +### The failing code path + +**File**: `sast-engine/graph/callgraph/resolution/attribute.go`, lines 195-209 + +```go +func resolveMethodOnType( + typeFQN string, // "pyload.config.parser.ConfigParser" + methodName string, // "get" + ... +) (string, bool, *core.TypeInfo) { + // Lines 177-193: Builtin check — skipped (not builtins.*) + + // Line 196: Construct method FQN + methodFQN := typeFQN + "." + methodName + // → "pyload.config.parser.ConfigParser.get" + + // Lines 198-209: Check callgraph + if callGraph != nil { + if node := callGraph.Functions[methodFQN]; node != nil { + // → node is NIL — method not found + return methodFQN, true, &core.TypeInfo{...} + } + } + + // Line 213: Falls through to failure + attributeFailureStats.CustomClassUnsupported++ + // → "method get not found on type pyload.config.parser.ConfigParser" + return "", false, nil +} +``` + +### The FQN mismatch (CONFIRMED) + +Verified on pyload's codebase: + +| System | FQN for ConfigParser | Source | +|--------|---------------------|--------| +| **Attribute registry** | `pyload.config.parser.ConfigParser` | Resolved from relative import `from .config.parser import ConfigParser` in `pyload/core/__init__.py` | +| **Callgraph** | `pyload.core.config.parser.ConfigParser` | Derived from file path `pyload/core/config/parser.py` | + +The relative import `from .config.parser import ConfigParser`: +- Is in file `pyload/core/__init__.py` +- The `.` resolves relative to the **parent package** of `core/`, which is `pyload/` +- So `.config.parser` → `pyload.config.parser` (going up from `core/` to `pyload/`, then down to `config/parser`) +- But the actual file is `pyload/core/config/parser.py` → FQN should be `pyload.core.config.parser` +- **The `core` segment is lost during relative import resolution** + +This is a bug in how the attribute extractor resolves relative import paths to FQNs. It resolves `.config.parser` as `{parent_package}.config.parser` but the parent package is computed incorrectly — it goes one level too high. + +### How to diagnose + +Add temporary debug logging to `resolveMethodOnType` to print: +1. The constructed `methodFQN` +2. All keys in `callGraph.Functions` that contain the class name + +```go +// Temporary debug — add before line 198 +if strings.Contains(typeFQN, "Config") { + log.Printf("[DEBUG] Looking for method %q on type %q", methodName, typeFQN) + log.Printf("[DEBUG] Constructed FQN: %q", methodFQN) + for key := range callGraph.Functions { + if strings.Contains(key, "ConfigParser") && strings.Contains(key, "get") { + log.Printf("[DEBUG] Callgraph has: %q", key) + } + } +} +``` + +This will immediately reveal the FQN format stored in the callgraph vs what's constructed. + +--- + +## Likely Fixes + +### Fix A (RECOMMENDED): Fix relative import FQN resolution in attribute extractor + +The root cause is in how `resolveClassNameForChain()` or the attribute extraction resolves `from .config.parser import ConfigParser`. The `.` prefix should resolve relative to the **current package** (`pyload.core`), producing `pyload.core.config.parser.ConfigParser`. Instead, it resolves relative to the **parent package** (`pyload`), producing `pyload.config.parser.ConfigParser`. + +The fix should align relative import resolution with Python's actual semantics: +- `from .x import Y` in `pyload/core/__init__.py` → `pyload.core.x.Y` (same package) +- `from ..x import Y` in `pyload/core/__init__.py` → `pyload.x.Y` (parent package) + +**Where to look**: +- `extraction/attributes.go` — where `class:ConfigParser` placeholder is created +- `resolution/attribute.go:resolveClassNameForChain()` — where placeholder is resolved to FQN +- The ImportMap construction — how `from .config.parser import ConfigParser` is recorded + +**Pros**: Correct fix at the source, all downstream consumers get correct FQNs +**Cons**: Need to understand how ImportMap handles relative imports + +### Fix B: Suffix-based fallback in resolveMethodOnType + +If the exact `callGraph.Functions[methodFQN]` lookup fails, try suffix matching: + +```go +// After exact lookup fails (line 209), try suffix matching +if callGraph != nil { + // Extract "ConfigParser.get" from "pyload.config.parser.ConfigParser.get" + suffix := extractClassAndMethod(typeFQN, methodName) + for fqn, node := range callGraph.Functions { + if strings.HasSuffix(fqn, "."+suffix) && + (node.Type == "method" || node.Type == "function_definition") { + return fqn, true, &core.TypeInfo{ + TypeFQN: typeFQN, + Confidence: float32(attrConfidence * 0.85), // slight confidence penalty + Source: "self_attribute_custom_class_fuzzy", + } + } + } +} +``` + +**Pros**: Quick fix, handles any FQN mismatch without fixing root cause +**Cons**: O(n) scan of callgraph on every miss, could match wrong class if names collide (e.g., `app.Config.get` vs `lib.Config.get`). Better as a temporary workaround. + +### Fix C: Build a class→method index keyed by short class name + +During callgraph construction, build a reverse index: +```go +type ClassMethodIndex map[string]map[string]string +// shortClassName → methodName → full function FQN +// "ConfigParser" → "get" → "pyload.core.config.parser.ConfigParser.get" + +// During callgraph build: +for fqn, node := range callGraph.Functions { + if node.Type == "method" { + className := extractShortClassName(fqn) // "ConfigParser" + methodName := extractMethodName(fqn) // "get" + index[className][methodName] = fqn + } +} +``` + +Then `resolveMethodOnType` extracts the short class name from the type FQN and uses this index. + +**Pros**: O(1) lookup, tolerates any FQN format difference +**Cons**: Ambiguous if two classes share the same short name (need disambiguation by module proximity) + +--- + +## Verification + +### Test 1: Simple project (above) +```bash +pathfinder scan -r /tmp/test-rule -p /tmp/test-project/ --skip-tests=false +# Expected after fix: 1 finding at manager.py:12 +``` + +### Test 2: Pyload CVE (GHSA-r7mc-x6x7-cqxx) +```bash +# L1 rule with ConfigParserType.method("get") as source +pathfinder scan -r /tmp/test-rule-pyload -p /tmp/pyload/src/ --skip-tests=false +# Expected after fix: 1 finding at thread_manager.py:199 +# Currently: 0 findings (method lookup fails) +``` + +### Test 3: Existing tests must pass +```bash +cd sast-engine && go test ./... +# All existing tests must still pass +``` + +### Test 4: Verify no regressions on builtin types +```bash +# self.data.get() where data is builtins.dict should still resolve +# self.session.execute() where session is sqlite3.Connection should still resolve +``` + +--- + +## Impact + +This is the **last blocker for L1 type-inferred source matching** on project-internal classes. The chain walk (Step 2) is fixed. The method lookup (Step 3) is the remaining issue. + +| What | Status | +|------|--------| +| Deep chain walk (self.a.b.c) | Fixed in `e601234` | +| Intermediate type resolution | Working (resolves to correct class FQN) | +| Method lookup on builtins | Working (dict.get, list.append etc.) | +| Method lookup on stdlib | Working (via CDN registry) | +| **Method lookup on project classes** | **BROKEN** (FQN mismatch) | + +Once fixed, the following CVE rules would achieve **L1** (both source and sink type-inferred): + +| Rule | Source | Sink | Current | After Fix | +|------|--------|------|---------|-----------| +| SEC-111 | `ConfigParser.method("get")` | `subprocess.method("run")` | L2 | **L1** | +| SEC-138 | Graph query sources | `Neo4jDriver.method("run")` | L4 | **L2+** | +| Any rule with project-internal class sources | — | — | Blocked | **Unblocked** | + +--- + +## Files to Inspect + +| File | What to check | +|------|--------------| +| `resolution/attribute.go:196-209` | `resolveMethodOnType` — the failing lookup | +| `extraction/attributes.go` | How `ClassFQN` is set in the attribute registry | +| `builder/builder.go` | How function FQNs are set in `callGraph.Functions` | +| `core/callgraph.go` | `Functions` map key format | + +The fix is likely small — it's a string format mismatch between two subsystems that were built independently and now need to agree on FQN conventions. diff --git a/rules/python/lang/PYTHON-LANG-SEC-110/meta.yaml b/rules/python/lang/PYTHON-LANG-SEC-110/meta.yaml new file mode 100644 index 00000000..478ba5b6 --- /dev/null +++ b/rules/python/lang/PYTHON-LANG-SEC-110/meta.yaml @@ -0,0 +1,195 @@ +# Rule Identity +id: PYTHON-LANG-SEC-110 +name: Unsafe Tarfile Extraction Detected +short_description: tarfile.extractall() and tarfile.extract() are vulnerable to path traversal attacks (Zip Slip). Malicious tar archives can write files outside the intended directory using entries with ../ in their names. + +# Classification +severity: HIGH +category: lang +language: python +ruleset: python/lang/PYTHON-LANG-SEC-110 + +# Vulnerability Details +cwe: + - id: CWE-22 + name: Improper Limitation of a Pathname to a Restricted Directory ('Path Traversal') + url: https://cwe.mitre.org/data/definitions/22.html + +owasp: + - id: A01:2021 + name: Broken Access Control + url: https://owasp.org/Top10/A01_2021-Broken_Access_Control/ + +tags: + - python + - tarfile + - path-traversal + - zip-slip + - file-extraction + - CWE-22 + - OWASP-A01 + +# Author +author: + name: Shivasurya + url: https://x.com/sshivasurya + +# Content +description: | + Python's tarfile module does not validate member paths during extraction by default. A crafted + tar archive can contain entries with names like "../../etc/crontab" or absolute paths like + "/etc/passwd", causing extractall() or extract() to write files outside the intended destination + directory. This is known as the "Zip Slip" vulnerability (despite the name, it affects tar + archives too). + + CVE-2007-4559 is the canonical vulnerability in Python's tarfile module — it has remained + unfixed for over 15 years because fixing it would break backward compatibility. Python 3.12 + introduced the filter parameter (PEP 706) to provide safe extraction modes. + + This vulnerability has been exploited in real-world supply chain attacks. GHSA-fhff-qmm8-h2fp + (mlflow) demonstrates how tar path traversal in ML pipeline tools can lead to arbitrary file + write on the server, enabling remote code execution. + +message: "tarfile.extractall() or tarfile.extract() detected. Tar archives can contain path traversal entries. Use filter='data' or validate members before extraction." + +security_implications: + - title: Arbitrary File Write via Path Traversal + description: | + A malicious tar archive can contain members with relative paths containing "../" + sequences. When extracted with extractall(), these members are written outside the + intended destination directory. An attacker can overwrite any file writable by the + application process, including configuration files, crontabs, SSH authorized_keys, + or application code. + - title: Remote Code Execution via File Overwrite + description: | + By overwriting executable files, Python modules, or cron jobs, an attacker who can + supply a malicious tar archive to an application can achieve remote code execution. + In CI/CD and ML pipeline contexts, this is especially dangerous as archives are + often processed automatically. + - title: Symlink Following in Archives + description: | + Tar archives can contain symbolic links. A crafted archive can include a symlink + pointing outside the destination directory, followed by a regular file entry that + writes through the symlink. This two-step attack bypasses simple path validation + that only checks regular file paths. + - title: Supply Chain Attacks via Package Archives + description: | + Libraries that download and extract tar archives (ML model registries, package + managers, data pipelines) are common targets. The mlflow vulnerability + (GHSA-fhff-qmm8-h2fp) allowed arbitrary file write through crafted model artifacts, + demonstrating the supply chain risk. + +secure_example: | + import tarfile + import os + + # INSECURE: extractall without validation + # tar.extractall(path=dest) + + # SECURE (Python 3.12+): Use filter='data' for safe extraction + def safe_extract_filtered(archive_path: str, dest: str) -> None: + with tarfile.open(archive_path) as tar: + tar.extractall(path=dest, filter="data") + + # SECURE (all versions): Validate each member before extraction + def safe_extract_validated(archive_path: str, dest: str) -> None: + dest = os.path.realpath(dest) + with tarfile.open(archive_path) as tar: + for member in tar.getmembers(): + member_path = os.path.realpath(os.path.join(dest, member.name)) + if not member_path.startswith(dest + os.sep): + raise ValueError(f"Path traversal detected: {member.name}") + if member.issym() or member.islnk(): + raise ValueError(f"Symlink in archive: {member.name}") + tar.extractall(path=dest, members=tar.getmembers()) + + # SECURE: Read file contents without extracting to disk + def read_archive_member(archive_path: str, member_name: str) -> bytes: + with tarfile.open(archive_path) as tar: + f = tar.extractfile(member_name) + if f is None: + raise ValueError("Not a regular file") + return f.read() + +recommendations: + - Use filter='data' parameter on extractall() (Python 3.12+) to automatically reject path traversal entries, absolute paths, and symlinks. + - For Python versions before 3.12, validate each member path with os.path.realpath() and ensure it stays within the destination directory. + - Reject tar members that are symbolic links or hard links pointing outside the extraction directory. + - Use extractfile() instead of extract() when you only need to read file contents without writing to disk. + - Consider using the 'tarfile.data_filter' or a custom filter function to enforce extraction policies. + +detection_scope: | + This rule detects calls to tarfile.extractall() and tarfile.extract() on tarfile objects. + All call sites are flagged because the safety depends on whether the archive source is + trusted and whether proper member validation is performed. The filter='data' parameter + (Python 3.12+) is the recommended mitigation but cannot be statically verified in all cases. + +# Compliance +compliance: + - standard: CWE Top 25 + requirement: "CWE-22 - Improper Limitation of a Pathname to a Restricted Directory in the MITRE CWE Top 25" + - standard: OWASP Top 10 + requirement: "A01:2021 - Broken Access Control" + - standard: NIST SP 800-53 + requirement: "SI-10: Information Input Validation" + - standard: PCI DSS v4.0 + requirement: "Requirement 6.2.4 - Protect against injection attacks including path traversal" + +# References +references: + - title: "CVE-2007-4559: Python tarfile directory traversal" + url: https://nvd.nist.gov/vuln/detail/CVE-2007-4559 + - title: "GHSA-fhff-qmm8-h2fp: mlflow tar path traversal" + url: https://github.com/advisories/GHSA-fhff-qmm8-h2fp + - title: "PEP 706 - Filter for tarfile.extractall" + url: https://peps.python.org/pep-0706/ + - title: "CWE-22: Path Traversal" + url: https://cwe.mitre.org/data/definitions/22.html + - title: "OWASP Path Traversal" + url: https://owasp.org/www-community/attacks/Path_Traversal + +# FAQ +faq: + - question: Why wasn't CVE-2007-4559 fixed in Python's tarfile module? + answer: | + The Python core developers considered fixing this a backward-compatibility break. + Many legitimate use cases depend on extracting archives with relative paths or + symlinks. Instead, Python 3.12 introduced PEP 706 which adds the filter parameter + to extractall(), allowing users to opt into safe extraction behavior. Future Python + versions will make the safe filter the default. + + - question: Is extractall(members=validated_list) sufficient? + answer: | + Passing a validated members list helps, but you must also check for symlink-based + attacks where a symlink is followed by a file that writes through it. Both the + member paths and the link targets need validation. The filter='data' approach in + Python 3.12+ handles these cases automatically. + + - question: How does filter='data' protect against path traversal? + answer: | + The 'data' filter rejects archive members with absolute paths, paths containing + '..' components, symlinks, hard links, device nodes, and other special entries. + It only allows regular files and directories with safe, relative paths. This is + the recommended approach for Python 3.12+. + + - question: Is tarfile.extractfile() safe? + answer: | + Yes, extractfile() returns a file-like object for reading the member contents + without writing anything to disk. It is safe to use with untrusted archives when + you need to read file contents. However, it only works with regular file members, + not directories or special file types. + + - question: What about shutil.unpack_archive() with tar files? + answer: | + shutil.unpack_archive() calls tarfile.extractall() internally and is equally + vulnerable to path traversal. The same mitigations apply. In Python 3.12+, + shutil.unpack_archive() also accepts the filter parameter. + +# Similar Rules +similar_rules: + - PYTHON-LANG-SEC-060 + +# Test Files +tests: + positive: tests/positive/ + negative: tests/negative/ diff --git a/rules/python/lang/PYTHON-LANG-SEC-110/rule.py b/rules/python/lang/PYTHON-LANG-SEC-110/rule.py new file mode 100644 index 00000000..1af4d3fe --- /dev/null +++ b/rules/python/lang/PYTHON-LANG-SEC-110/rule.py @@ -0,0 +1,20 @@ +from rules.python_decorators import python_rule +from codepathfinder import calls, QueryType + +class TarFileModule(QueryType): + fqns = ["tarfile"] + + +@python_rule( + id="PYTHON-LANG-SEC-110", + name="Unsafe Tarfile Extraction Detected", + severity="HIGH", + category="lang", + cwe="CWE-22", + tags="python,tarfile,path-traversal,zip-slip,CWE-22,OWASP-A01", + message="tarfile.extractall() or tarfile.extract() detected. Tar archives can contain path traversal entries. Use filter='data' or validate members before extraction.", + owasp="A01:2021", +) +def detect_tarfile_extract(): + """Detects tarfile.extractall() and tarfile.extract() usage.""" + return TarFileModule.method("extractall", "extract") diff --git a/rules/python/lang/PYTHON-LANG-SEC-110/tests/negative/safe.py b/rules/python/lang/PYTHON-LANG-SEC-110/tests/negative/safe.py new file mode 100644 index 00000000..0ca09ab3 --- /dev/null +++ b/rules/python/lang/PYTHON-LANG-SEC-110/tests/negative/safe.py @@ -0,0 +1,37 @@ +import tarfile +import os + +# Safe: extractall with filter='data' (Python 3.12+ safe extraction) +def safe_extract_with_filter(archive_path, dest): + with tarfile.open(archive_path) as tar: + tar.extractall(path=dest, filter="data") + +# Safe: Only reading member names without extracting +def list_archive_contents(archive_path): + with tarfile.open(archive_path) as tar: + names = tar.getnames() + return names + +# Safe: Using extractfile to read a file object without writing to disk +def read_member_contents(archive_path, member_name): + with tarfile.open(archive_path) as tar: + f = tar.extractfile(member_name) + if f is not None: + return f.read() + return None + +# Safe: Validating members before extraction +def safe_extract_validated(archive_path, dest): + with tarfile.open(archive_path) as tar: + validated_members = [] + for member in tar.getmembers(): + member_path = os.path.realpath(os.path.join(dest, member.name)) + if member_path.startswith(os.path.realpath(dest)): + validated_members.append(member) + tar.extractall(path=dest, members=validated_members, filter="data") + +# Safe: Using getmembers() to inspect archive metadata +def inspect_archive(archive_path): + with tarfile.open(archive_path) as tar: + for member in tar.getmembers(): + print(f"{member.name}: {member.size} bytes") diff --git a/rules/python/lang/PYTHON-LANG-SEC-110/tests/positive/vulnerable.py b/rules/python/lang/PYTHON-LANG-SEC-110/tests/positive/vulnerable.py new file mode 100644 index 00000000..f1468651 --- /dev/null +++ b/rules/python/lang/PYTHON-LANG-SEC-110/tests/positive/vulnerable.py @@ -0,0 +1,38 @@ +import tarfile +import os + +# SEC-110: Unsafe tarfile.extractall() — path traversal via crafted archive members + +# 1. Basic extractall from untrusted archive +def extract_uploaded_archive(upload_path): + tf = tarfile.open(upload_path, "r:gz") + tf.extractall(path="/tmp/uploads") + tf.close() + +# 2. Context manager with extractall +def extract_with_context(archive_path, dest_dir): + with tarfile.open(archive_path) as tar: + tar.extractall(dest_dir) + +# 3. TarFile constructor with extractall +def extract_via_constructor(path): + tar = tarfile.TarFile(path) + tar.extractall("/opt/data") + tar.close() + +# 4. Single member extract without validation +def extract_single_member(archive_path, member_name): + with tarfile.open(archive_path) as tar: + tar.extract(member_name, path="/var/lib/app") + +# 5. Extractall in a loop processing multiple archives +def batch_extract(archive_list): + for archive in archive_list: + with tarfile.open(archive, "r:*") as tar: + tar.extractall(path=os.path.join("/data", os.path.basename(archive))) + +# 6. Extract with no path argument (extracts to cwd) +def extract_to_cwd(archive_path): + tar = tarfile.open(archive_path) + tar.extractall() + tar.close() diff --git a/rules/python/lang/PYTHON-LANG-SEC-113/meta.yaml b/rules/python/lang/PYTHON-LANG-SEC-113/meta.yaml new file mode 100644 index 00000000..e1ec0857 --- /dev/null +++ b/rules/python/lang/PYTHON-LANG-SEC-113/meta.yaml @@ -0,0 +1,189 @@ +# Rule Identity +id: PYTHON-LANG-SEC-113 +name: Host Header Used for Access Control +short_description: HTTP_HOST header is attacker-controlled and must not be used for authentication, authorization, or security-sensitive routing decisions. + +# Classification +severity: MEDIUM +category: lang +language: python +ruleset: python/lang/PYTHON-LANG-SEC-113 + +# Vulnerability Details +cwe: + - id: CWE-287 + name: Improper Authentication + url: https://cwe.mitre.org/data/definitions/287.html + - id: CWE-346 + name: Origin Validation Error + url: https://cwe.mitre.org/data/definitions/346.html + +owasp: + - id: A07:2021 + name: Identification and Authentication Failures + url: https://owasp.org/Top10/A07_2021-Identification_and_Authentication_Failures/ + +tags: + - python + - host-header + - authentication + - origin-validation + - flask + - wsgi + - CWE-287 + - CWE-346 + - OWASP-A07 + +# Author +author: + name: Shivasurya + url: https://x.com/sshivasurya + +# Content +description: | + The HTTP Host header is sent by the client and can be freely set to any value by an + attacker. Using the Host header for access control decisions, tenant isolation, URL + construction, or authentication checks is a common vulnerability pattern. + + When an application trusts the Host header to determine whether a request should be + allowed (e.g., checking if the host is "admin.internal.company.com"), an attacker can + simply set the Host header to the expected value and bypass the access control entirely. + + GHSA-q485-cg9q-xq2r (pyload) demonstrates this pattern where the application used + request.environ.get("HTTP_HOST") to validate the origin of API requests, allowing + attackers to bypass authentication by spoofing the Host header. + + This pattern is especially dangerous in environments with reverse proxies, where the + Host header may be forwarded from the client without validation. + +message: "HTTP_HOST header used for access control or routing. Host headers are attacker-controlled and must not be trusted for security decisions." + +security_implications: + - title: Authentication Bypass via Host Header Spoofing + description: | + When access control checks compare the Host header against expected values + (e.g., "admin.internal.company.com"), an attacker can bypass authentication by + setting the Host header to the expected value in their HTTP request. This is + trivial to exploit with curl, Burp Suite, or any HTTP client. + - title: Tenant Isolation Bypass + description: | + Multi-tenant applications that use the Host header to determine which tenant's + data to serve can be attacked by spoofing the Host header to access another + tenant's data, leading to unauthorized data access. + - title: Password Reset Poisoning + description: | + Applications that use the Host header to construct password reset URLs can be + exploited by attackers who send password reset requests with a spoofed Host header, + causing the reset link to point to an attacker-controlled domain. When the victim + clicks the link, the reset token is sent to the attacker. + - title: Cache Poisoning + description: | + If the Host header is used in URL construction and the response is cached (by a + CDN or reverse proxy), an attacker can poison the cache with responses containing + attacker-controlled URLs, affecting all subsequent users. + +secure_example: | + from flask import request, abort + import os + + # INSECURE: Using Host header for access control + # host = request.environ.get("HTTP_HOST") + # if host == "admin.internal.company.com": + # return True + + # SECURE: Use server-side configuration for access control + ALLOWED_ADMIN_IPS = os.environ.get("ADMIN_IPS", "").split(",") + + def check_admin_access(): + # Use network-level controls (IP allowlisting) + client_ip = request.remote_addr + if client_ip not in ALLOWED_ADMIN_IPS: + abort(403) + return True + + # SECURE: Validate Host header against server configuration + ALLOWED_HOSTS = os.environ.get("ALLOWED_HOSTS", "").split(",") + + def validate_host(): + host = request.host # Use request.host with validation + if host not in ALLOWED_HOSTS: + abort(400) # 400, not 403 — this is input validation, not authz + + # SECURE: Use Django's ALLOWED_HOSTS or Flask-Talisman + # Django validates Host header against settings.ALLOWED_HOSTS automatically + +recommendations: + - Never use the HTTP_HOST header for authentication or authorization decisions. + - Use server-side configuration (environment variables, config files) for security-sensitive values like allowed hosts. + - Validate the Host header against a server-configured allowlist (like Django's ALLOWED_HOSTS) for input validation, but not for access control. + - Use network-level controls (IP allowlisting, VPN, mutual TLS) for admin access instead of Host header checks. + - For URL construction, use server-configured base URLs from environment variables, not the Host header. + +detection_scope: | + This rule detects calls to request.environ.get() or environ.get() with "HTTP_HOST" + as the first argument. This covers Flask/WSGI patterns where the Host header is read + from the WSGI environ dictionary. All usages are flagged since the Host header is + attacker-controlled and any security-sensitive use requires human review. + +# Compliance +compliance: + - standard: OWASP Top 10 + requirement: "A07:2021 - Identification and Authentication Failures" + - standard: CWE Top 25 + requirement: "CWE-287 - Improper Authentication" + - standard: NIST SP 800-53 + requirement: "AC-3: Access Enforcement" + - standard: PCI DSS v4.0 + requirement: "Requirement 6.2.4 - Protect against injection and access control attacks" + +# References +references: + - title: "GHSA-q485-cg9q-xq2r: pyload Host header authentication bypass" + url: https://github.com/advisories/GHSA-q485-cg9q-xq2r + - title: "CWE-287: Improper Authentication" + url: https://cwe.mitre.org/data/definitions/287.html + - title: "CWE-346: Origin Validation Error" + url: https://cwe.mitre.org/data/definitions/346.html + - title: OWASP Host Header Attacks + url: https://owasp.org/www-project-web-security-testing-guide/latest/4-Web_Application_Security_Testing/07-Input_Validation_Testing/17-Testing_for_Host_Header_Injection + - title: "PortSwigger: HTTP Host header attacks" + url: https://portswigger.net/web-security/host-header + +# FAQ +faq: + - question: Can the Host header be trusted behind a reverse proxy? + answer: | + No. Even behind a reverse proxy, the Host header may be forwarded from the client + unless the proxy is explicitly configured to override it. Many reverse proxy + configurations pass through the original Host header. Use X-Forwarded-Host only + if the proxy is configured to set it and you trust the proxy, but even then, prefer + server-side configuration over request headers for security decisions. + + - question: Is request.host safer than request.environ.get("HTTP_HOST")? + answer: | + Flask's request.host property reads from the same source (the Host header) and + is equally attacker-controlled. The only difference is that request.host may + include port normalization. Neither should be used for access control. + + - question: How does Django's ALLOWED_HOSTS protect against this? + answer: | + Django's ALLOWED_HOSTS setting validates the Host header against a configured + allowlist and returns a 400 Bad Request if the Host header doesn't match. This + prevents Host header injection but is input validation, not access control. + You should still not use the Host header for authorization decisions. + + - question: What about HTTP_X_FORWARDED_HOST? + answer: | + X-Forwarded-Host is equally attacker-controlled unless your reverse proxy is + configured to strip or override it. The same rules apply — do not use it for + security decisions. If you must use it, ensure your proxy is configured to set + it authoritatively and that direct access to the application is blocked. + +# Similar Rules +similar_rules: + - PYTHON-LANG-SEC-070 + +# Test Files +tests: + positive: tests/positive/ + negative: tests/negative/ diff --git a/rules/python/lang/PYTHON-LANG-SEC-113/rule.py b/rules/python/lang/PYTHON-LANG-SEC-113/rule.py new file mode 100644 index 00000000..8afbb5e9 --- /dev/null +++ b/rules/python/lang/PYTHON-LANG-SEC-113/rule.py @@ -0,0 +1,17 @@ +from rules.python_decorators import python_rule +from codepathfinder import calls, QueryType + + +@python_rule( + id="PYTHON-LANG-SEC-113", + name="Host Header Used for Access Control", + severity="MEDIUM", + category="lang", + cwe="CWE-287", + tags="python,host-header,authentication,origin-validation,CWE-287,OWASP-A07", + message="HTTP_HOST header used for access control or routing. Host headers are attacker-controlled and must not be trusted for security decisions.", + owasp="A07:2021", +) +def detect_host_header_auth(): + """Detects use of HTTP_HOST header via request.environ.get for access control.""" + return calls("*.environ.get", "environ.get", match_position={"0": "HTTP_HOST"}) diff --git a/rules/python/lang/PYTHON-LANG-SEC-113/tests/negative/safe.py b/rules/python/lang/PYTHON-LANG-SEC-113/tests/negative/safe.py new file mode 100644 index 00000000..7880a870 --- /dev/null +++ b/rules/python/lang/PYTHON-LANG-SEC-113/tests/negative/safe.py @@ -0,0 +1,27 @@ +import os +from flask import request + +# Safe: Using os.environ.get for server-side environment variables +def get_config(): + db_host = os.environ.get("DATABASE_HOST") + return db_host + +# Safe: Using REMOTE_ADDR (server-determined, not attacker-controlled) +def get_client_ip(): + addr = request.environ.get("REMOTE_ADDR") + return addr + +# Safe: Using request.remote_addr property +def log_access(): + ip = request.remote_addr + print(f"Access from {ip}") + +# Safe: Using SERVER_NAME (server config, not from request) +def get_server_name(): + name = request.environ.get("SERVER_NAME") + return name + +# Safe: Using os.environ for deployment config +def is_production(): + env = os.environ.get("FLASK_ENV", "development") + return env == "production" diff --git a/rules/python/lang/PYTHON-LANG-SEC-113/tests/positive/vulnerable.py b/rules/python/lang/PYTHON-LANG-SEC-113/tests/positive/vulnerable.py new file mode 100644 index 00000000..605e17a8 --- /dev/null +++ b/rules/python/lang/PYTHON-LANG-SEC-113/tests/positive/vulnerable.py @@ -0,0 +1,44 @@ +from flask import request, abort + +# SEC-113: Host header used for access control decisions + +# 1. Host header check in authentication middleware +def check_admin_access(): + host = request.environ.get("HTTP_HOST") + if host == "admin.internal.company.com": + return True + abort(403) + +# 2. Host header used to determine allowed origins +def validate_origin(): + host = request.environ.get("HTTP_HOST") + allowed_hosts = ["api.example.com", "app.example.com"] + if host not in allowed_hosts: + abort(403) + return True + +# 3. Host header in URL construction for redirect +def build_redirect_url(path): + host = request.environ.get("HTTP_HOST") + return f"https://{host}/{path}" + +# 4. Host header used for tenant isolation +def get_tenant_config(): + host = request.environ.get("HTTP_HOST") + tenant = host.split(".")[0] + return load_tenant_config(tenant) + +# 5. Host header passed to downstream service +def proxy_request(): + host = request.environ.get("HTTP_HOST") + headers = {"X-Forwarded-Host": host} + return forward_request(headers) + +# 6. Host header in WSGI app +def wsgi_app(environ, start_response): + host = environ.get("HTTP_HOST") + if host != "trusted.example.com": + start_response("403 Forbidden", []) + return [b"Forbidden"] + start_response("200 OK", []) + return [b"OK"] diff --git a/rules/python/lang/PYTHON-LANG-SEC-123/meta.yaml b/rules/python/lang/PYTHON-LANG-SEC-123/meta.yaml new file mode 100644 index 00000000..d163ad35 --- /dev/null +++ b/rules/python/lang/PYTHON-LANG-SEC-123/meta.yaml @@ -0,0 +1,189 @@ +# Rule Identity +id: PYTHON-LANG-SEC-123 +name: Jinja2 Server-Side Template Injection +short_description: Jinja2 from_string(), Template(), and render_template_string() construct templates from strings, enabling SSTI when the template source contains user input. + +# Classification +severity: HIGH +category: lang +language: python +ruleset: python/lang/PYTHON-LANG-SEC-123 + +# Vulnerability Details +cwe: + - id: CWE-1336 + name: Improper Neutralization of Special Elements Used in a Template Engine + url: https://cwe.mitre.org/data/definitions/1336.html + +owasp: + - id: A03:2021 + name: Injection + url: https://owasp.org/Top10/A03_2021-Injection/ + +tags: + - python + - jinja2 + - ssti + - template-injection + - remote-code-execution + - flask + - CWE-1336 + - OWASP-A03 + +# Author +author: + name: Shivasurya + url: https://x.com/sshivasurya + +# Content +description: | + Server-Side Template Injection (SSTI) occurs when user input is embedded into a template + string that is then compiled and rendered by the Jinja2 template engine. Unlike cross-site + scripting (XSS) which executes in the browser, SSTI executes on the server and can lead + to remote code execution (RCE). + + Jinja2's from_string() method, the Template() constructor, and Flask's render_template_string() + all compile a string into a template. If the string contains user input, an attacker can + inject Jinja2 template expressions like {{ config.items() }} or {{ ''.__class__.__mro__[1].__subclasses__() }} + to read configuration, access Python internals, and ultimately execute arbitrary code. + + CVE-2025-27516 and GHSA-pxrr-hq57-q35p (dynaconf) demonstrate how template injection in + configuration processing can lead to remote code execution. SandboxedEnvironment provides + some protection but has been bypassed in multiple CVEs. + + The safe pattern is to use file-based templates with render_template() or get_template(), + passing user data as template context variables rather than embedding it in the template string. + +message: "Jinja2 template constructed from dynamic input detected. from_string(), Template(), and render_template_string() can lead to server-side template injection (SSTI)." + +security_implications: + - title: Remote Code Execution via Template Expressions + description: | + Jinja2 template expressions have access to Python objects. An attacker can traverse + the Python object hierarchy from any string to access builtins and execute arbitrary + code: {{ ''.__class__.__mro__[1].__subclasses__() }}. This is a well-documented + technique with readily available payloads. + - title: Configuration and Secret Disclosure + description: | + In Flask applications, template expressions can access the app config: + {{ config.items() }} or {{ config['SECRET_KEY'] }}. This can expose database + credentials, API keys, and session signing secrets. + - title: Sandbox Escape + description: | + Jinja2's SandboxedEnvironment restricts access to dangerous attributes and methods, + but sandbox escapes have been discovered repeatedly (CVE-2024-22195, CVE-2025-27516). + The sandbox should not be relied upon as the sole protection against SSTI when + template sources are attacker-controlled. + - title: Indirect Template Injection + description: | + Templates loaded from databases, configuration files, or external services may + contain attacker-controlled content even if the immediate caller does not take + direct user input. Any non-hardcoded template string should be treated as potentially + tainted. + +secure_example: | + from flask import render_template + from jinja2 import Environment, FileSystemLoader + + # INSECURE: User input in template string + # return render_template_string(f"Hello {user_input}") + + # SECURE: Use file-based templates with data as context + def render_profile(user): + return render_template("profile.html", user=user) + + # SECURE: Load templates from filesystem + def render_report(data): + env = Environment(loader=FileSystemLoader("templates")) + tmpl = env.get_template("report.html") + return tmpl.render(data=data) + + # SECURE: If dynamic template selection is needed, use select_template + def render_themed(theme, data): + env = Environment(loader=FileSystemLoader("templates")) + tmpl = env.select_template([f"{theme}.html", "default.html"]) + return tmpl.render(data=data) + +recommendations: + - Use render_template() with file-based templates instead of render_template_string() with string templates. + - Pass user data as template context variables (render_template("page.html", name=user_input)) rather than embedding it in template strings. + - Use Environment with FileSystemLoader and get_template() to load templates from the filesystem. + - If dynamic template content is absolutely required, use SandboxedEnvironment and keep Jinja2 updated, but understand that sandbox escapes are periodically discovered. + - Never construct template strings by concatenating or formatting user input into them. + +detection_scope: | + This rule detects calls to Environment.from_string(), jinja2.Template(), Template(), + and Flask's render_template_string(). These methods compile a string into a Jinja2 + template and are the primary vectors for SSTI. File-based template loading via + get_template() and render_template() are not flagged. + +# Compliance +compliance: + - standard: CWE Top 25 + requirement: "CWE-94 - Improper Control of Generation of Code ('Code Injection')" + - standard: OWASP Top 10 + requirement: "A03:2021 - Injection" + - standard: NIST SP 800-53 + requirement: "SI-10: Information Input Validation" + - standard: PCI DSS v4.0 + requirement: "Requirement 6.2.4 - Protect against injection attacks" + +# References +references: + - title: "GHSA-pxrr-hq57-q35p: dynaconf Jinja2 SSTI" + url: https://github.com/advisories/GHSA-pxrr-hq57-q35p + - title: "CVE-2025-27516: Jinja2 sandbox escape" + url: https://nvd.nist.gov/vuln/detail/CVE-2025-27516 + - title: "CWE-1336: Template Engine Injection" + url: https://cwe.mitre.org/data/definitions/1336.html + - title: "PortSwigger: Server-Side Template Injection" + url: https://portswigger.net/web-security/server-side-template-injection + - title: "HackTricks: Jinja2 SSTI" + url: https://book.hacktricks.xyz/pentesting-web/ssti-server-side-template-injection/jinja2-ssti + +# FAQ +faq: + - question: Is SandboxedEnvironment safe against SSTI? + answer: | + SandboxedEnvironment provides defense-in-depth by restricting access to dangerous + attributes and methods, but it has been bypassed multiple times (CVE-2024-22195, + CVE-2025-27516). It should be used as an additional layer of protection but not as + the sole defense. The primary defense should be avoiding user input in template strings. + + - question: What is the difference between render_template and render_template_string? + answer: | + render_template() loads a template from a file on the filesystem — the template + source is controlled by the developer. render_template_string() compiles a string + directly into a template. If the string contains user input, render_template_string() + enables SSTI. Always prefer render_template(). + + - question: Can autoescaping prevent SSTI? + answer: | + No. Autoescaping (enabled by default in Jinja2 for HTML templates) prevents XSS by + escaping HTML special characters in template output. It does not prevent SSTI because + the injection happens at the template compilation stage, before any rendering or + escaping occurs. {{ and }} are template syntax, not HTML. + + - question: Is this related to XSS? + answer: | + SSTI and XSS are different vulnerabilities. XSS executes in the victim's browser + and requires the attacker's payload to be reflected in the HTML output. SSTI executes + on the server and gives the attacker access to the server-side Python environment. + SSTI is generally more severe because it leads to RCE, while XSS is limited to + client-side actions. + + - question: How do I detect SSTI in my application? + answer: | + Test by injecting {{ 7*7 }} into user inputs. If the output contains 49, the input + is being processed as a Jinja2 template expression and SSTI is confirmed. For + blind SSTI, use time-based payloads. Static analysis (this rule) catches the + code patterns that enable SSTI. + +# Similar Rules +similar_rules: + - PYTHON-LANG-SEC-093 + +# Test Files +tests: + positive: tests/positive/ + negative: tests/negative/ diff --git a/rules/python/lang/PYTHON-LANG-SEC-123/rule.py b/rules/python/lang/PYTHON-LANG-SEC-123/rule.py new file mode 100644 index 00000000..9f600581 --- /dev/null +++ b/rules/python/lang/PYTHON-LANG-SEC-123/rule.py @@ -0,0 +1,17 @@ +from rules.python_decorators import python_rule +from codepathfinder import calls, QueryType + + +@python_rule( + id="PYTHON-LANG-SEC-123", + name="Jinja2 Server-Side Template Injection", + severity="HIGH", + category="lang", + cwe="CWE-1336", + tags="python,jinja2,ssti,template-injection,rce,CWE-1336,OWASP-A03", + message="Jinja2 template constructed from dynamic input detected. from_string(), Template(), and render_template_string() can lead to server-side template injection (SSTI).", + owasp="A03:2021", +) +def detect_jinja2_ssti(): + """Detects Jinja2 SSTI-prone template construction from dynamic input.""" + return calls("*.from_string", "jinja2.Template", "Template", "render_template_string") diff --git a/rules/python/lang/PYTHON-LANG-SEC-123/tests/negative/safe.py b/rules/python/lang/PYTHON-LANG-SEC-123/tests/negative/safe.py new file mode 100644 index 00000000..3ff8c1b0 --- /dev/null +++ b/rules/python/lang/PYTHON-LANG-SEC-123/tests/negative/safe.py @@ -0,0 +1,30 @@ +from flask import render_template +from jinja2 import Environment, FileSystemLoader + +# Safe: render_template with file-based template and data passed as context +def render_profile(user): + return render_template("profile.html", user=user) + +# Safe: Loading templates from filesystem via get_template +def render_invoice(invoice_data): + env = Environment(loader=FileSystemLoader("templates")) + tmpl = env.get_template("invoice.html") + return tmpl.render(invoice=invoice_data) + +# Safe: Using get_template from a Jinja2 Environment (file-based) +def render_static_header(): + from jinja2 import Environment, FileSystemLoader + env = Environment(loader=FileSystemLoader("templates")) + tmpl = env.get_template("header.html") + return tmpl.render() + +# Safe: select_template from predefined list +def render_themed_page(theme, data): + env = Environment(loader=FileSystemLoader("templates")) + tmpl = env.select_template([f"{theme}.html", "default.html"]) + return tmpl.render(data=data) + +# Safe: Using Markup for escaping (not template construction) +from markupsafe import Markup +def safe_output(text): + return Markup.escape(text) diff --git a/rules/python/lang/PYTHON-LANG-SEC-123/tests/positive/vulnerable.py b/rules/python/lang/PYTHON-LANG-SEC-123/tests/positive/vulnerable.py new file mode 100644 index 00000000..5bbabbf5 --- /dev/null +++ b/rules/python/lang/PYTHON-LANG-SEC-123/tests/positive/vulnerable.py @@ -0,0 +1,39 @@ +from flask import request, render_template_string +from jinja2 import Environment, Template, SandboxedEnvironment + +# SEC-123: Jinja2 SSTI via template construction from dynamic input + +# 1. render_template_string with user input +def render_greeting(): + name = request.args.get("name") + template = f"

Hello {name}

" + return render_template_string(template) + +# 2. Environment.from_string with user-controlled template +def render_custom_template(): + env = Environment() + user_template = request.form.get("template") + tmpl = env.from_string(user_template) + return tmpl.render() + +# 3. Direct Template constructor with dynamic content +def render_email_body(template_body): + tmpl = Template(template_body) + return tmpl.render(user="admin") + +# 4. SandboxedEnvironment.from_string (sandbox can be escaped) +def render_sandboxed(user_input): + env = SandboxedEnvironment() + tmpl = env.from_string(user_input) + return tmpl.render() + +# 5. Template from database or external source +def render_stored_template(db): + template_str = db.query("SELECT template FROM email_templates WHERE id = 1").scalar() + tmpl = Template(template_str) + return tmpl.render(site_name="Example") + +# 6. Jinja2 Template in error handler +def custom_error_page(error_message): + tmpl = Template("
{{ error }}
") + return render_template_string(f"{error_message}") diff --git a/rules/python/lang/PYTHON-LANG-SEC-133/meta.yaml b/rules/python/lang/PYTHON-LANG-SEC-133/meta.yaml new file mode 100644 index 00000000..ed72a90d --- /dev/null +++ b/rules/python/lang/PYTHON-LANG-SEC-133/meta.yaml @@ -0,0 +1,206 @@ +# Rule Identity +id: PYTHON-LANG-SEC-133 +name: RSA PKCS1v15 Deprecated Padding Detected +short_description: PKCS#1 v1.5 padding for RSA is deprecated and vulnerable to Bleichenbacher's adaptive chosen-ciphertext attack. Use OAEP for encryption and PSS for signatures. + +# Classification +severity: HIGH +category: lang +language: python +ruleset: python/lang/PYTHON-LANG-SEC-133 + +# Vulnerability Details +cwe: + - id: CWE-327 + name: Use of a Broken or Risky Cryptographic Algorithm + url: https://cwe.mitre.org/data/definitions/327.html + +owasp: + - id: A02:2021 + name: Cryptographic Failures + url: https://owasp.org/Top10/A02_2021-Cryptographic_Failures/ + +tags: + - python + - cryptography + - rsa + - pkcs1v15 + - padding + - bleichenbacher + - deprecated-crypto + - CWE-327 + - OWASP-A02 + +# Author +author: + name: Shivasurya + url: https://x.com/sshivasurya + +# Content +description: | + PKCS#1 v1.5 is a legacy RSA padding scheme that has been known to be vulnerable since + Daniel Bleichenbacher's 1998 adaptive chosen-ciphertext attack. The attack exploits + padding validation oracles to decrypt RSA ciphertexts without the private key. Despite + being over 25 years old, this attack continues to affect real-world systems. + + GHSA-7432-952r-cw78 (authlib) demonstrates how PKCS#1 v1.5 usage in OAuth/OIDC libraries + enables Bleichenbacher-style attacks against JWT token validation, allowing attackers to + forge signatures and bypass authentication. + + The Python cryptography library provides PKCS1v15() in the padding module. While still + available for backward compatibility, it should be replaced with OAEP + (Optimal Asymmetric Encryption Padding) for encryption and PSS (Probabilistic Signature + Scheme) for digital signatures. + + Modern cryptographic standards (NIST SP 800-131A, RFC 8017) recommend OAEP and PSS as + the standard RSA padding schemes. PKCS#1 v1.5 for encryption is explicitly disallowed + in many compliance frameworks. + +message: "PKCS1v15 padding detected for RSA encryption or signing. PKCS#1 v1.5 is vulnerable to Bleichenbacher's attack. Use OAEP for encryption or PSS for signatures." + +security_implications: + - title: Bleichenbacher's Adaptive Chosen-Ciphertext Attack + description: | + The PKCS#1 v1.5 padding scheme leaks information about the plaintext through padding + validation errors. An attacker who can submit ciphertexts and observe whether + decryption succeeds or fails can recover the plaintext through approximately 2^20 + oracle queries. This is practical against many network services. + - title: ROBOT Attack (Return of Bleichenbacher's Oracle Threat) + description: | + The ROBOT attack (2018) showed that many modern TLS implementations are still + vulnerable to Bleichenbacher's attack despite decades of awareness. Timing differences + in error handling, cache behavior, and exception paths create side channels that + serve as padding oracles. + - title: Signature Forgery + description: | + PKCS#1 v1.5 signature schemes with low public exponents (e=3) are vulnerable to + Bleichenbacher's signature forgery attack, where an attacker can construct a valid + signature without the private key by exploiting the padding structure. This has been + used to forge RSA signatures in certificate validation and JWT verification. + - title: JWT and OAuth Token Forgery + description: | + Libraries that use PKCS#1 v1.5 for JWT RS256 signature verification may be + vulnerable to signature forgery attacks. The authlib vulnerability (GHSA-7432-952r-cw78) + demonstrates this in the context of OAuth/OIDC token validation. + +secure_example: | + from cryptography.hazmat.primitives.asymmetric import padding, rsa + from cryptography.hazmat.primitives import hashes + + # INSECURE: PKCS1v15 padding + # ciphertext = key.encrypt(data, padding.PKCS1v15()) + + # SECURE: OAEP padding for RSA encryption + def encrypt_oaep(public_key, plaintext: bytes) -> bytes: + return public_key.encrypt( + plaintext, + padding.OAEP( + mgf=padding.MGF1(algorithm=hashes.SHA256()), + algorithm=hashes.SHA256(), + label=None + ) + ) + + # SECURE: PSS padding for RSA signatures + def sign_pss(private_key, data: bytes) -> bytes: + return private_key.sign( + data, + padding.PSS( + mgf=padding.MGF1(hashes.SHA256()), + salt_length=padding.PSS.MAX_LENGTH + ), + hashes.SHA256() + ) + + # SECURE: Use Ed25519 for modern signature needs (no padding complexity) + from cryptography.hazmat.primitives.asymmetric.ed25519 import Ed25519PrivateKey + + def sign_ed25519(data: bytes) -> tuple: + key = Ed25519PrivateKey.generate() + return key.sign(data), key.public_key() + +recommendations: + - Replace PKCS1v15() with OAEP() for all RSA encryption operations. + - Replace PKCS1v15() with PSS() for all RSA signature operations. + - For new systems, prefer Ed25519 or ECDSA over RSA for digital signatures. + - For new systems, prefer X25519 key exchange with symmetric encryption over RSA encryption. + - Audit all JWT/OAuth libraries to ensure they do not use PKCS#1 v1.5 internally. + +detection_scope: | + This rule detects calls to padding.PKCS1v15() and PKCS1v15() from the + cryptography.hazmat.primitives.asymmetric.padding module. All usages are flagged + because PKCS#1 v1.5 is deprecated for both encryption and signatures. Modern + alternatives (OAEP, PSS) should be used in all cases. + +# Compliance +compliance: + - standard: OWASP Top 10 + requirement: "A02:2021 - Cryptographic Failures" + - standard: NIST SP 800-131A + requirement: "PKCS#1 v1.5 encryption padding is disallowed after 2023" + - standard: PCI DSS v4.0 + requirement: "Requirement 4.2.1 - Strong cryptography with industry-accepted algorithms" + - standard: FIPS 140-3 + requirement: "RSA operations must use OAEP or PSS padding schemes" + +# References +references: + - title: "GHSA-7432-952r-cw78: authlib Bleichenbacher attack" + url: https://github.com/advisories/GHSA-7432-952r-cw78 + - title: "CWE-327: Use of a Broken or Risky Cryptographic Algorithm" + url: https://cwe.mitre.org/data/definitions/327.html + - title: "RFC 8017: PKCS #1 RSA Cryptography Specifications" + url: https://tools.ietf.org/html/rfc8017 + - title: "ROBOT Attack: Return of Bleichenbacher's Oracle Threat" + url: https://robotattack.org/ + - title: "Bleichenbacher's original 1998 paper" + url: https://link.springer.com/chapter/10.1007/BFb0055716 + +# FAQ +faq: + - question: Is PKCS1v15 safe for signatures if I use SHA-256? + answer: | + PKCS#1 v1.5 signatures with SHA-256 are more resistant to forgery than with SHA-1, + but PSS is still the recommended scheme. PKCS#1 v1.5 signatures are deterministic + (same input always produces same signature), which leaks information. PSS is + probabilistic and has a security proof in the random oracle model. For new code, + always use PSS. + + - question: My existing system uses PKCS1v15 — can I migrate incrementally? + answer: | + Yes. For encryption, you can decrypt with PKCS1v15 for existing data while + encrypting new data with OAEP. For signatures, you can verify old signatures with + PKCS1v15 while producing new signatures with PSS. Plan a migration deadline after + which PKCS1v15 verification is disabled. + + - question: Does TLS still use PKCS#1 v1.5? + answer: | + TLS 1.2 supports PKCS#1 v1.5 for RSA key exchange and signatures but it is + not recommended. TLS 1.3 completely removes PKCS#1 v1.5 encryption and only + allows PSS for RSA signatures. Upgrading to TLS 1.3 eliminates this risk at the + protocol level. + + - question: Why does the cryptography library still provide PKCS1v15? + answer: | + The cryptography library is a low-level toolkit that provides both modern and + legacy algorithms. PKCS1v15 remains available for interoperability with legacy + systems that cannot be upgraded. The library documentation recommends OAEP and + PSS for new code. The hazmat (hazardous materials) package name is itself a warning. + + - question: Is RSA-OAEP with SHA-1 acceptable? + answer: | + RSA-OAEP with SHA-1 as the hash function is still considered secure for the OAEP + construction (the collision resistance weakness of SHA-1 does not affect OAEP + security). However, using SHA-256 is recommended for consistency and future-proofing. + The MGF1 mask generation function commonly uses the same hash algorithm as OAEP. + +# Similar Rules +similar_rules: + - PYTHON-LANG-SEC-030 + - PYTHON-LANG-SEC-031 + - PYTHON-LANG-SEC-032 + +# Test Files +tests: + positive: tests/positive/ + negative: tests/negative/ diff --git a/rules/python/lang/PYTHON-LANG-SEC-133/rule.py b/rules/python/lang/PYTHON-LANG-SEC-133/rule.py new file mode 100644 index 00000000..9878d228 --- /dev/null +++ b/rules/python/lang/PYTHON-LANG-SEC-133/rule.py @@ -0,0 +1,17 @@ +from rules.python_decorators import python_rule +from codepathfinder import calls, QueryType + + +@python_rule( + id="PYTHON-LANG-SEC-133", + name="RSA PKCS1v15 Deprecated Padding Detected", + severity="HIGH", + category="lang", + cwe="CWE-327", + tags="python,cryptography,rsa,pkcs1v15,padding,bleichenbacher,CWE-327,OWASP-A02", + message="PKCS1v15 padding detected for RSA encryption or signing. PKCS#1 v1.5 is vulnerable to Bleichenbacher's attack. Use OAEP for encryption or PSS for signatures.", + owasp="A02:2021", +) +def detect_pkcs1v15(): + """Detects use of deprecated PKCS1v15 padding for RSA operations.""" + return calls("*.PKCS1v15", "padding.PKCS1v15", "PKCS1v15") diff --git a/rules/python/lang/PYTHON-LANG-SEC-133/tests/negative/safe.py b/rules/python/lang/PYTHON-LANG-SEC-133/tests/negative/safe.py new file mode 100644 index 00000000..89ca0b52 --- /dev/null +++ b/rules/python/lang/PYTHON-LANG-SEC-133/tests/negative/safe.py @@ -0,0 +1,46 @@ +from cryptography.hazmat.primitives.asymmetric import padding, rsa +from cryptography.hazmat.primitives import hashes + +# Safe: OAEP padding for RSA encryption (recommended) +def encrypt_message_oaep(public_key, message): + ciphertext = public_key.encrypt( + message, + padding.OAEP( + mgf=padding.MGF1(algorithm=hashes.SHA256()), + algorithm=hashes.SHA256(), + label=None + ) + ) + return ciphertext + +# Safe: PSS padding for RSA signatures (recommended) +def sign_data_pss(private_key, data): + signature = private_key.sign( + data, + padding.PSS( + mgf=padding.MGF1(hashes.SHA256()), + salt_length=padding.PSS.MAX_LENGTH + ), + hashes.SHA256() + ) + return signature + +# Safe: PSS signature verification +def verify_pss(public_key, signature, data): + public_key.verify( + signature, + data, + padding.PSS( + mgf=padding.MGF1(hashes.SHA256()), + salt_length=padding.PSS.MAX_LENGTH + ), + hashes.SHA256() + ) + +# Safe: Using Ed25519 (no padding needed) +from cryptography.hazmat.primitives.asymmetric.ed25519 import Ed25519PrivateKey + +def sign_ed25519(data): + private_key = Ed25519PrivateKey.generate() + signature = private_key.sign(data) + return signature, private_key.public_key() diff --git a/rules/python/lang/PYTHON-LANG-SEC-133/tests/positive/vulnerable.py b/rules/python/lang/PYTHON-LANG-SEC-133/tests/positive/vulnerable.py new file mode 100644 index 00000000..9ed78bbf --- /dev/null +++ b/rules/python/lang/PYTHON-LANG-SEC-133/tests/positive/vulnerable.py @@ -0,0 +1,52 @@ +from cryptography.hazmat.primitives.asymmetric import padding, rsa +from cryptography.hazmat.primitives import hashes +from cryptography.hazmat.backends import default_backend + +# SEC-133: RSA PKCS#1 v1.5 padding — vulnerable to Bleichenbacher's attack + +# 1. RSA encryption with PKCS1v15 padding +def encrypt_message(public_key, message): + ciphertext = public_key.encrypt( + message, + padding.PKCS1v15() + ) + return ciphertext + +# 2. RSA decryption with PKCS1v15 padding +def decrypt_message(private_key, ciphertext): + plaintext = private_key.decrypt( + ciphertext, + padding.PKCS1v15() + ) + return plaintext + +# 3. RSA signing with PKCS1v15 padding +def sign_data(private_key, data): + signature = private_key.sign( + data, + padding.PKCS1v15(), + hashes.SHA256() + ) + return signature + +# 4. RSA signature verification with PKCS1v15 +def verify_signature(public_key, signature, data): + public_key.verify( + signature, + data, + padding.PKCS1v15(), + hashes.SHA256() + ) + +# 5. PKCS1v15 stored in a variable +def get_padding(): + pad = padding.PKCS1v15() + return pad + +# 6. PKCS1v15 in JWT/token signing context +def create_jwt_signature(private_key, header_payload): + return private_key.sign( + header_payload.encode("utf-8"), + padding.PKCS1v15(), + hashes.SHA256() + ) diff --git a/rules/python/lang/PYTHON-LANG-SEC-136/meta.yaml b/rules/python/lang/PYTHON-LANG-SEC-136/meta.yaml new file mode 100644 index 00000000..c695577e --- /dev/null +++ b/rules/python/lang/PYTHON-LANG-SEC-136/meta.yaml @@ -0,0 +1,210 @@ +# Rule Identity +id: PYTHON-LANG-SEC-136 +name: Dynamic Module Import from User Input +short_description: Dynamic module imports via importlib.import_module(), load_object(), or __import__() with user-controlled input can lead to arbitrary code execution through module initialization side effects. + +# Classification +severity: HIGH +category: lang +language: python +ruleset: python/lang/PYTHON-LANG-SEC-136 + +# Vulnerability Details +cwe: + - id: CWE-470 + name: Use of Externally-Controlled Input to Select Classes or Code + url: https://cwe.mitre.org/data/definitions/470.html + +owasp: + - id: A03:2021 + name: Injection + url: https://owasp.org/Top10/A03_2021-Injection/ + +tags: + - python + - import + - dynamic-import + - code-injection + - arbitrary-code-execution + - CWE-470 + - OWASP-A03 + +# Author +author: + name: Shivasurya + url: https://x.com/sshivasurya + +# Content +description: | + Python's import system executes module-level code when a module is imported. If an + attacker can control the module path passed to importlib.import_module(), __import__(), + or framework-specific loaders like Scrapy's load_object(), they can cause arbitrary + Python modules to be loaded and their initialization code executed. + + This is particularly dangerous because: + 1. Python modules execute arbitrary code at import time (__init__.py, module body) + 2. An attacker can import stdlib modules with dangerous side effects + 3. In applications with writable PYTHONPATH entries, an attacker can place malicious + modules that will be imported + + GHSA-cwxj-rr6w-m6w7 (Scrapy) demonstrates this vulnerability: the Referrer-Policy + header value was passed to load_object() to dynamically load a policy class. An attacker + could set the Referrer-Policy header to an arbitrary Python dotted path, causing arbitrary + module loading and code execution. + + Framework configuration patterns that load backends, middleware, or plugins by dotted + path are common sources of this vulnerability when the path comes from user input, + HTTP headers, or external configuration. + +message: "Dynamic module import detected (importlib.import_module, load_object, __import__). Importing modules based on user input can lead to arbitrary code execution." + +security_implications: + - title: Arbitrary Code Execution via Module Import + description: | + Python executes module-level code when a module is imported. An attacker who can + control the module path can cause any importable module to be loaded, executing + its initialization code. This can include modules that spawn processes, open network + connections, or modify the filesystem. + - title: Module Injection via Writable Paths + description: | + If any directory in sys.path is writable by the attacker (e.g., /tmp, upload + directories, shared filesystems), the attacker can place a malicious Python module + at that path. When importlib.import_module() is called with the attacker-controlled + name, the malicious module is loaded and executed. + - title: Supply Chain Attack via Configuration + description: | + Applications that load plugins, backends, or middleware by dotted path from + configuration files, environment variables, or HTTP headers are vulnerable if + those sources can be influenced by an attacker. The Scrapy Referrer-Policy + vulnerability demonstrates how HTTP headers can be exploited. + - title: Denial of Service via Import Side Effects + description: | + Even without code execution, an attacker can cause denial of service by importing + modules that consume excessive memory, open many file descriptors, or trigger + expensive initialization. Importing recursive or circular module structures can + also cause stack overflows. + +secure_example: | + import importlib + + # INSECURE: Importing user-controlled module path + # mod = importlib.import_module(user_input) + + # SECURE: Use an allowlist of permitted module paths + ALLOWED_BACKENDS = { + "redis": "myapp.cache.redis_backend", + "memcached": "myapp.cache.memcached_backend", + "local": "myapp.cache.local_backend", + } + + def load_backend(backend_name: str): + if backend_name not in ALLOWED_BACKENDS: + raise ValueError(f"Unknown backend: {backend_name}") + module_path = ALLOWED_BACKENDS[backend_name] + return importlib.import_module(module_path) + + # SECURE: Use a factory pattern with registered handlers + class HandlerRegistry: + _handlers = {} + + @classmethod + def register(cls, name, handler_class): + cls._handlers[name] = handler_class + + @classmethod + def get(cls, name): + if name not in cls._handlers: + raise ValueError(f"Unknown handler: {name}") + return cls._handlers[name] + + # SECURE: For plugin systems, use entry_points (validated by pip) + from importlib.metadata import entry_points + + def load_plugins(): + eps = entry_points(group="myapp.plugins") + return {ep.name: ep.load() for ep in eps} + +recommendations: + - Never pass user input, HTTP headers, or untrusted configuration values directly to importlib.import_module() or __import__(). + - Use an allowlist mapping user-facing names to hardcoded module paths for plugin/backend selection. + - For plugin systems, use setuptools entry_points which are registered at install time and cannot be injected at runtime. + - Validate that module paths match an expected pattern (e.g., must start with "myapp.") before importing. + - Audit all uses of load_object(), import_module(), and __import__() in framework configuration code. + +detection_scope: | + This rule detects calls to importlib.import_module(), __import__(), load_object(), + and similar dynamic import functions. All call sites are flagged because the safety + depends on whether the module path argument is attacker-controlled. Hardcoded module + paths are safe but require human review to confirm. + +# Compliance +compliance: + - standard: CWE Top 25 + requirement: "CWE-470 - Use of Externally-Controlled Input to Select Classes or Code" + - standard: OWASP Top 10 + requirement: "A03:2021 - Injection" + - standard: NIST SP 800-53 + requirement: "SI-10: Information Input Validation" + - standard: PCI DSS v4.0 + requirement: "Requirement 6.2.4 - Protect against injection attacks" + +# References +references: + - title: "GHSA-cwxj-rr6w-m6w7: Scrapy Referrer-Policy code execution" + url: https://github.com/advisories/GHSA-cwxj-rr6w-m6w7 + - title: "CWE-470: Use of Externally-Controlled Input to Select Classes or Code" + url: https://cwe.mitre.org/data/definitions/470.html + - title: "Python importlib documentation" + url: https://docs.python.org/3/library/importlib.html + - title: "OWASP A03:2021 Injection" + url: https://owasp.org/Top10/A03_2021-Injection/ + - title: "Python entry_points for safe plugin loading" + url: https://packaging.python.org/en/latest/specifications/entry-points/ + +# FAQ +faq: + - question: Is importlib.import_module() with a hardcoded string safe? + answer: | + Yes. importlib.import_module("json") or importlib.import_module("myapp.utils") + with hardcoded string literals are safe because the attacker cannot control the + module path. This rule flags all call sites for review, but hardcoded paths can + be safely ignored. + + - question: How does this differ from PYTHON-LANG-SEC-005 (Non-literal import)? + answer: | + PYTHON-LANG-SEC-005 detects the same functions but focuses on the non-literal + argument pattern. PYTHON-LANG-SEC-136 is specifically about the code execution + risk from user-controlled module paths and includes framework-specific loaders + like load_object(). The rules complement each other. + + - question: Is load_object() specific to Scrapy? + answer: | + load_object() is a Scrapy utility function (scrapy.utils.misc.load_object) that + imports a module and returns an object by dotted path. Similar patterns exist in + Django (import_string), Celery (symbol_by_name), and other frameworks. This rule + catches the generic pattern; framework-specific rules may provide more context. + + - question: Can I use a regex to validate module paths? + answer: | + Regex validation (e.g., requiring paths to start with "myapp.") provides some + defense but is fragile. An attacker might find importable modules within your + namespace that have dangerous side effects. An explicit allowlist is safer than + pattern matching. + + - question: What about dynamic imports in Django settings? + answer: | + Django's settings like MIDDLEWARE, AUTHENTICATION_BACKENDS, etc. are loaded via + import_string() from the settings file. These are safe because settings are + server-side configuration, not user input. The risk arises when the import path + comes from request data, HTTP headers, or user-facing configuration interfaces. + +# Similar Rules +similar_rules: + - PYTHON-LANG-SEC-005 + - PYTHON-LANG-SEC-001 + - PYTHON-LANG-SEC-002 + +# Test Files +tests: + positive: tests/positive/ + negative: tests/negative/ diff --git a/rules/python/lang/PYTHON-LANG-SEC-136/rule.py b/rules/python/lang/PYTHON-LANG-SEC-136/rule.py new file mode 100644 index 00000000..38c446d5 --- /dev/null +++ b/rules/python/lang/PYTHON-LANG-SEC-136/rule.py @@ -0,0 +1,39 @@ +from rules.python_decorators import python_rule +from codepathfinder import calls, flows, QueryType +from codepathfinder.presets import PropagationPresets + +class ImportlibModule(QueryType): + fqns = ["importlib"] + + +@python_rule( + id="PYTHON-LANG-SEC-136", + name="Dynamic Module Import from User Input", + severity="HIGH", + category="lang", + cwe="CWE-470", + tags="python,import,dynamic-import,code-injection,CWE-470,OWASP-A03", + message="User input flows to dynamic module import. Importing modules based on user input can lead to arbitrary code execution.", + owasp="A03:2021", +) +def detect_dynamic_import(): + """Detects user input flowing to dynamic module import functions.""" + return flows( + from_sources=[ + calls("request.headers.get"), + calls("request.form.get"), + calls("request.args.get"), + calls("request.GET.get"), + calls("request.POST.get"), + calls("*.get"), + ], + to_sinks=[ + ImportlibModule.method("import_module"), + calls("load_object"), + calls("*.load_object"), + calls("__import__"), + ], + sanitized_by=[], + propagates_through=PropagationPresets.standard(), + scope="global", + ) diff --git a/rules/python/lang/PYTHON-LANG-SEC-136/tests/negative/safe.py b/rules/python/lang/PYTHON-LANG-SEC-136/tests/negative/safe.py new file mode 100644 index 00000000..fe7a89fa --- /dev/null +++ b/rules/python/lang/PYTHON-LANG-SEC-136/tests/negative/safe.py @@ -0,0 +1,29 @@ +# Safe: Standard import statements (not dynamic imports) +import json +import os.path +import collections + +# Safe: Using importlib.resources (not import_module) +def read_package_data(): + import importlib.resources + return importlib.resources.read_text("mypackage", "data.txt") + +# Safe: Using importlib.metadata (not import_module) +def get_version(): + from importlib.metadata import version + return version("mypackage") + +# Safe: Using importlib.metadata entry_points (not import_module) +def load_plugins_via_entrypoints(): + from importlib.metadata import entry_points + eps = entry_points(group="myapp.plugins") + return {ep.name: ep.load() for ep in eps} + +# Safe: Regular function calls that happen to parse strings +def parse_data(raw): + return json.loads(raw) + +# Safe: Using pkgutil (not import_module or __import__) +def list_packages(): + import pkgutil + return [mod.name for mod in pkgutil.iter_modules()] diff --git a/rules/python/lang/PYTHON-LANG-SEC-136/tests/positive/cross_file_sink.py b/rules/python/lang/PYTHON-LANG-SEC-136/tests/positive/cross_file_sink.py new file mode 100644 index 00000000..11c71230 --- /dev/null +++ b/rules/python/lang/PYTHON-LANG-SEC-136/tests/positive/cross_file_sink.py @@ -0,0 +1,17 @@ +import importlib +from cross_file_source import get_module_from_request, get_backend_from_config + +# SEC-136: Cross-file sink — tainted value from cross_file_source.py reaches import here + +def load_user_module(request): + """Cross-file flow: request.args.get("module") → importlib.import_module()""" + module_path = get_module_from_request(request) + mod = importlib.import_module(module_path) + return mod + + +def load_auth_backend(config): + """Cross-file flow: config.get("AUTH_BACKEND") → importlib.import_module()""" + backend_path = get_backend_from_config(config) + mod = importlib.import_module(backend_path) + return getattr(mod, "Backend") diff --git a/rules/python/lang/PYTHON-LANG-SEC-136/tests/positive/cross_file_source.py b/rules/python/lang/PYTHON-LANG-SEC-136/tests/positive/cross_file_source.py new file mode 100644 index 00000000..3f64757a --- /dev/null +++ b/rules/python/lang/PYTHON-LANG-SEC-136/tests/positive/cross_file_source.py @@ -0,0 +1,12 @@ +# SEC-136: Cross-file source — user input extracted here, flows to sink in cross_file_sink.py + +def get_module_from_request(request): + """Extract module path from user request — this is the taint source.""" + module_path = request.args.get("module") + return module_path + + +def get_backend_from_config(config): + """Extract backend path from config — config.get() as source.""" + backend = config.get("AUTH_BACKEND") + return backend diff --git a/rules/python/lang/PYTHON-LANG-SEC-136/tests/positive/vulnerable.py b/rules/python/lang/PYTHON-LANG-SEC-136/tests/positive/vulnerable.py new file mode 100644 index 00000000..37f8006c --- /dev/null +++ b/rules/python/lang/PYTHON-LANG-SEC-136/tests/positive/vulnerable.py @@ -0,0 +1,22 @@ +import importlib +from scrapy.utils.misc import load_object + +# SEC-136: User input flows to dynamic module import (single-file patterns) + +# 1. Flask request header → importlib.import_module (Scrapy CVE pattern) +def load_handler_from_header(request): + handler_module = request.headers.get("X-Handler-Module") + mod = importlib.import_module(handler_module) + return mod.handle(request) + +# 2. Flask request form → load_object +def load_serializer(request): + serializer_path = request.form.get("serializer") + serializer_cls = load_object(serializer_path) + return serializer_cls() + +# 3. Flask request args → __import__ +def load_plugin_from_query(request): + plugin_name = request.args.get("plugin") + mod = __import__(plugin_name) + return mod diff --git a/rules/python/lang/PYTHON-LANG-SEC-138/meta.yaml b/rules/python/lang/PYTHON-LANG-SEC-138/meta.yaml new file mode 100644 index 00000000..125961b1 --- /dev/null +++ b/rules/python/lang/PYTHON-LANG-SEC-138/meta.yaml @@ -0,0 +1,186 @@ +# Rule Identity +id: PYTHON-LANG-SEC-138 +name: Cypher/Graph Query Injection +short_description: Neo4j Cypher queries built with string concatenation or formatting allow injection of arbitrary graph query logic. + +# Classification +severity: HIGH +category: lang +language: python +ruleset: python/lang/PYTHON-LANG-SEC-138 + +# Vulnerability Details +cwe: + - id: CWE-943 + name: Improper Neutralization of Special Elements in Data Query Logic + url: https://cwe.mitre.org/data/definitions/943.html + +owasp: + - id: A03:2021 + name: Injection + url: https://owasp.org/Top10/A03_2021-Injection/ + +tags: + - python + - neo4j + - cypher + - injection + - graph-database + - CWE-943 + - OWASP-A03 + +# Author +author: + name: Shivasurya + url: https://x.com/sshivasurya + +# Content +description: | + Cypher is the query language for Neo4j and other graph databases. Like SQL injection, + Cypher injection occurs when user-controlled input is concatenated directly into a query + string rather than passed as a parameter. However, Cypher injection is a distinct + vulnerability class (CWE-943) because graph query languages have different syntax, + capabilities, and exploitation techniques compared to SQL. + + An attacker who can inject into a Cypher query can: + - Read arbitrary nodes and relationships from the graph database + - Modify or delete graph data (MERGE, DELETE, DETACH DELETE) + - Bypass access control logic encoded in query WHERE clauses + - Exfiltrate data through LOAD CSV or APOC procedures + - In some configurations, execute server-side procedures (CALL dbms.*) + + Unlike SQL, Cypher uses $param syntax for parameterized queries. The Neo4j Python driver + supports parameterized queries natively via keyword arguments to session.run() and + driver.execute_query(). Libraries like py2neo and neomodel also support parameterization. + + This rule flags calls to session.run(), driver.execute_query(), graph.evaluate(), and + similar graph query execution methods for manual review to ensure parameterized queries + are used. + +message: "Cypher query execution detected. Ensure parameterized queries with $param syntax are used instead of string concatenation." + +security_implications: + - title: Graph Data Exfiltration + description: | + An attacker can inject UNION clauses or modify RETURN statements to extract + nodes and relationships they should not have access to. Graph databases often + store interconnected sensitive data (user relationships, access graphs, knowledge + graphs) that can be traversed with injected MATCH patterns. + - title: Data Modification via Injected Mutations + description: | + Cypher supports CREATE, MERGE, SET, DELETE, and DETACH DELETE operations. An + attacker who can inject into a read query can append mutation operations to + modify or destroy graph data, including deleting entire node labels. + - title: Procedure Execution + description: | + Neo4j supports server-side procedures via CALL syntax. If the database user + has elevated privileges, an attacker can call procedures like dbms.listConfig(), + apoc.load.json(), or custom procedures that may expose system internals or + enable further exploitation. + - title: Access Control Bypass + description: | + Applications that encode access control in Cypher WHERE clauses (e.g., + WHERE u.tenant = 'tenant1') are vulnerable to injection that modifies or + removes the WHERE clause, granting access to data across tenants. + +secure_example: | + from neo4j import GraphDatabase + + driver = GraphDatabase.driver("bolt://localhost:7687", auth=("neo4j", "password")) + + # INSECURE: string concatenation in Cypher query + # session.run(f"MATCH (u:User) WHERE u.name = '{name}' RETURN u") + + # SECURE: parameterized query with $param syntax + with driver.session() as session: + result = session.run( + "MATCH (u:User) WHERE u.name = $name RETURN u", + name=user_input + ) + + # SECURE: execute_query with parameters dict + records, summary, keys = driver.execute_query( + "MATCH (n) WHERE n.id = $id RETURN n", + parameters_={"id": user_id} + ) + + # SECURE: transaction function with parameters + def find_user(tx, name): + return tx.run( + "MATCH (u:User {name: $name}) RETURN u", + name=name + ).single() + +recommendations: + - Always use parameterized Cypher queries with $param syntax instead of string concatenation or formatting. + - Pass parameters as keyword arguments to session.run() or as a parameters dict to driver.execute_query(). + - Never interpolate user input into Cypher query strings using f-strings, format(), or % formatting. + - Apply the principle of least privilege to Neo4j database users to limit the impact of injection. + - Use Neo4j's role-based access control to restrict which procedures and graph operations are available. + +detection_scope: | + This rule detects calls to session.run(), driver.execute_query(), graph.evaluate(), + cypher.execute(), and generic .query() methods. Because these methods are used + legitimately with parameterized queries, all findings require manual review to + determine if string concatenation or formatting is used to build the query string. + This is an audit-level rule. + +# Compliance +compliance: + - standard: CWE Top 25 + requirement: "CWE-943 - Improper Neutralization of Special Elements in Data Query Logic" + - standard: OWASP Top 10 + requirement: "A03:2021 - Injection" + - standard: NIST SP 800-53 + requirement: "SI-10: Information Input Validation" + +# References +references: + - title: "CWE-943: Improper Neutralization of Special Elements in Data Query Logic" + url: https://cwe.mitre.org/data/definitions/943.html + - title: "GHSA-gg5m-55jj-8m5g: Cypher injection in graphiti-core" + url: https://github.com/advisories/GHSA-gg5m-55jj-8m5g + - title: "Neo4j Cypher Injection Prevention" + url: https://neo4j.com/developer/kb/protecting-against-cypher-injection/ + - title: "OWASP Top 10 A03:2021 Injection" + url: https://owasp.org/Top10/A03_2021-Injection/ + - title: "Neo4j Python Driver Documentation" + url: https://neo4j.com/docs/python-manual/current/ + +# FAQ +faq: + - question: How does Cypher injection differ from SQL injection? + answer: | + While both are query injection vulnerabilities, Cypher injection targets graph databases + and uses different syntax. Cypher uses MATCH/RETURN instead of SELECT, graph traversal + patterns like (a)-[:KNOWS]->(b), and $param syntax for parameters instead of ? or %s. + The exploitation techniques differ: Cypher injection can traverse graph relationships, + modify labels, and call graph-specific procedures. CWE-943 is the dedicated CWE for + data query logic injection, separate from CWE-89 (SQL injection). + + - question: Is session.run() with a static query string safe? + answer: | + Yes. If the Cypher query string is a static literal with no user input interpolated, + it is safe. This rule flags all session.run() calls for audit because static analysis + cannot always determine whether the query argument contains interpolated user input. + + - question: Does this rule detect py2neo and neomodel injection? + answer: | + This rule detects graph.evaluate() calls (py2neo) and generic .query() and .run() + methods. For neomodel, which uses an ORM pattern, direct Cypher injection is less + common but can occur through raw_cypher() or db.cypher_query() calls. + + - question: Are Neo4j APOC procedures exploitable through injection? + answer: | + Yes. If an attacker can inject into a Cypher query, they may be able to call APOC + procedures like apoc.load.json(), apoc.load.csv(), or apoc.cypher.run() to read + files, make network requests, or execute nested queries. Restricting APOC procedures + via Neo4j configuration is recommended as defense in depth. + +similar_rules: + - PYTHON-LANG-SEC-084 + +# Test Files +tests: + positive: tests/positive/ + negative: tests/negative/ diff --git a/rules/python/lang/PYTHON-LANG-SEC-138/rule.py b/rules/python/lang/PYTHON-LANG-SEC-138/rule.py new file mode 100644 index 00000000..f8ba4c5a --- /dev/null +++ b/rules/python/lang/PYTHON-LANG-SEC-138/rule.py @@ -0,0 +1,31 @@ +from rules.python_decorators import python_rule +from codepathfinder import calls, QueryType, Or + + +class Neo4jModule(QueryType): + fqns = ["neo4j"] + + +@python_rule( + id="PYTHON-LANG-SEC-138", + name="Cypher/Graph Query Injection via String Concatenation", + severity="HIGH", + category="lang", + cwe="CWE-943", + tags="python,neo4j,cypher,injection,graph-database,OWASP-A03,CWE-943", + message="Cypher query built with string formatting detected. Use parameterized queries with $param syntax instead. See GHSA-gg5m-55jj-8m5g.", + owasp="A03:2021", +) +def detect_cypher_injection(): + """Detects graph database query execution methods that may receive string-formatted queries. + CVE: GHSA-gg5m-55jj-8m5g (graphiti-core Cypher injection via node_labels). + + Uses type-inferred Neo4jModule for neo4j driver calls, plus targeted calls() + for py2neo and general graph DB patterns. Avoids overly broad *.run which + would match subprocess.run, asyncio.run, etc. + """ + return Or( + Neo4jModule.method("execute_query"), + calls("*.cypher.execute"), + calls("*.evaluate"), + ) diff --git a/rules/python/lang/PYTHON-LANG-SEC-138/tests/negative/safe.py b/rules/python/lang/PYTHON-LANG-SEC-138/tests/negative/safe.py new file mode 100644 index 00000000..c919d6de --- /dev/null +++ b/rules/python/lang/PYTHON-LANG-SEC-138/tests/negative/safe.py @@ -0,0 +1,21 @@ +from neo4j import GraphDatabase +import json + +driver = GraphDatabase.driver("bolt://localhost:7687", auth=("neo4j", "password")) + +# Safe: no graph query execution methods called here +# The rule detects *.run(), *.execute_query(), *.evaluate(), *.query() +# These safe patterns avoid calling those methods entirely. + +# Safe: using JSON for data interchange instead of graph queries +data = json.loads('{"name": "Alice"}') + +# Safe: using driver info (no query execution) +info = driver.get_server_info() + +# Safe: closing driver (not a query method) +driver.close() + +# Safe: storing query string without executing +query_template = "MATCH (u:User) WHERE u.name = $name RETURN u" +params = {"name": "Alice"} diff --git a/rules/python/lang/PYTHON-LANG-SEC-138/tests/positive/vulnerable.py b/rules/python/lang/PYTHON-LANG-SEC-138/tests/positive/vulnerable.py new file mode 100644 index 00000000..0316deca --- /dev/null +++ b/rules/python/lang/PYTHON-LANG-SEC-138/tests/positive/vulnerable.py @@ -0,0 +1,25 @@ +from neo4j import GraphDatabase + +driver = GraphDatabase.driver("bolt://localhost:7687", auth=("neo4j", "password")) + +# SEC-138: Cypher injection via f-string in session.run() +label = user_input +with driver.session() as session: + result = session.run(f"MATCH (n:{label}) RETURN n") + +# SEC-138: Cypher injection via string concatenation in tx.run() +def find_user(tx, name): + query = "MATCH (u:User) WHERE u.name = '" + name + "' RETURN u" + return tx.run(query) + +# SEC-138: Cypher injection via format() in execute_query() +user_query = input("Enter search term: ") +driver.execute_query("MATCH (n) WHERE n.name = '%s' RETURN n" % user_query) + +# SEC-138: Cypher injection via graph.evaluate() +from py2neo import Graph +graph = Graph("bolt://localhost:7687") +graph.evaluate("MATCH (n) WHERE n.id = " + user_id + " RETURN n") + +# SEC-138: Cypher injection via cypher.execute() +db.cypher.execute("CREATE (n:User {name: '" + username + "'})") diff --git a/rules/python/lang/PYTHON-LANG-SEC-139/meta.yaml b/rules/python/lang/PYTHON-LANG-SEC-139/meta.yaml new file mode 100644 index 00000000..af22d9ab --- /dev/null +++ b/rules/python/lang/PYTHON-LANG-SEC-139/meta.yaml @@ -0,0 +1,191 @@ +# Rule Identity +id: PYTHON-LANG-SEC-139 +name: Unsafe msgpack Deserialization +short_description: msgpack.unpackb() and msgpack.unpack() can execute arbitrary code via ext_hook when deserializing untrusted data. + +# Classification +severity: HIGH +category: lang +language: python +ruleset: python/lang/PYTHON-LANG-SEC-139 + +# Vulnerability Details +cwe: + - id: CWE-502 + name: Deserialization of Untrusted Data + url: https://cwe.mitre.org/data/definitions/502.html + +owasp: + - id: A08:2021 + name: Software and Data Integrity Failures + url: https://owasp.org/Top10/A08_2021-Software_and_Data_Integrity_Failures/ + +tags: + - python + - msgpack + - deserialization + - untrusted-data + - CWE-502 + - OWASP-A08 + +# Author +author: + name: Shivasurya + url: https://x.com/sshivasurya + +# Content +description: | + MessagePack (msgpack) is a binary serialization format commonly used for inter-process + communication, caching, and data exchange. While msgpack is generally safer than pickle + because it does not natively support arbitrary code execution, the ext_hook parameter + in msgpack.unpackb() and msgpack.unpack() allows custom deserialization logic that can + be exploited if an attacker controls the serialized data. + + When ext_hook is provided, msgpack calls this function for each Extension Type encountered + in the data stream. If the ext_hook function performs unsafe operations (e.g., calling + pickle.loads, eval, or importing modules), an attacker who controls the msgpack payload + can trigger arbitrary code execution. + + The ormsgpack library (Rust-based msgpack implementation) has the same ext_hook mechanism + and the same risks. Real-world vulnerabilities have been found in projects like LangGraph + (GHSA-g48c-2wqr-h844) where msgpack deserialization of checkpoint data from untrusted + sources led to code execution. + + This rule flags all msgpack deserialization calls (unpackb, unpack, Unpacker) for review, + as the safety depends on both the ext_hook implementation and the trust level of the data + source. + +message: "msgpack deserialization detected. Verify that the data source is trusted and ext_hook (if used) does not enable code execution." + +security_implications: + - title: Code Execution via ext_hook + description: | + The ext_hook parameter receives raw bytes for each Extension Type in the msgpack + stream. If the hook function uses pickle.loads(), eval(), exec(), or any other + unsafe deserialization to process these bytes, an attacker can craft a msgpack + payload with malicious Extension Type data to achieve remote code execution. + - title: Checkpoint and State Poisoning + description: | + Applications that serialize state, checkpoints, or workflow data using msgpack + (common in ML pipelines and workflow engines like LangGraph) are vulnerable if + an attacker can modify the stored msgpack data. Deserialization of poisoned + checkpoints can execute arbitrary code when the checkpoint is restored. + - title: Type Confusion Attacks + description: | + Even without ext_hook, msgpack deserialization of untrusted data can lead to + type confusion if the application assumes specific types in the deserialized + output without validation. An attacker can substitute unexpected types that + cause logic errors or bypass security checks downstream. + +secure_example: | + import json + import msgpack + + # INSECURE: msgpack.unpackb with unsafe ext_hook + # obj = msgpack.unpackb(data, ext_hook=lambda code, data: pickle.loads(data)) + + # SECURE: Use JSON for untrusted data interchange + def deserialize_message(raw_bytes: bytes) -> dict: + data = json.loads(raw_bytes.decode("utf-8")) + if not isinstance(data, dict): + raise ValueError("Expected dict") + return data + + # SECURE: msgpack with strict type checking and no ext_hook + def safe_unpack(data: bytes) -> dict: + result = msgpack.unpackb(data, raw=False, strict_map_key=True) + if not isinstance(result, dict): + raise ValueError("Unexpected type") + return result + + # SECURE: Use pydantic for schema validation after deserialization + from pydantic import BaseModel + + class Message(BaseModel): + type: str + payload: dict + + def parse_message(data: bytes) -> Message: + raw = msgpack.unpackb(data, raw=False) + return Message.model_validate(raw) + +recommendations: + - Avoid using ext_hook with msgpack when deserializing data from untrusted sources. + - If ext_hook is required, ensure it only handles known Extension Type codes and does not use pickle, eval, or exec. + - Validate the structure and types of deserialized msgpack data before use, preferably with a schema validation library like pydantic. + - For data interchange with untrusted parties, prefer JSON which has no code execution surface. + - Audit all msgpack deserialization points to verify the data source is trusted or the ext_hook is safe. + +detection_scope: | + This rule detects calls to msgpack.unpackb(), msgpack.unpack(), msgpack.Unpacker(), + ormsgpack.unpackb(), and ormsgpack.unpack(). All deserialization calls are flagged + because the risk depends on the data source trust level and ext_hook implementation, + which requires human review. + +# Compliance +compliance: + - standard: CWE Top 25 + requirement: "CWE-502 - Deserialization of Untrusted Data in the MITRE CWE Top 25" + - standard: OWASP Top 10 + requirement: "A08:2021 - Software and Data Integrity Failures" + - standard: NIST SP 800-53 + requirement: "SI-10: Information Input Validation" + - standard: PCI DSS v4.0 + requirement: "Requirement 6.2.4 - Protect against injection attacks including deserialization" + +# References +references: + - title: "CWE-502: Deserialization of Untrusted Data" + url: https://cwe.mitre.org/data/definitions/502.html + - title: "GHSA-g48c-2wqr-h844: Code execution via msgpack deserialization in langgraph-checkpoint" + url: https://github.com/advisories/GHSA-g48c-2wqr-h844 + - title: "msgpack-python documentation" + url: https://msgpack-python.readthedocs.io/en/latest/ + - title: "OWASP Top 10 A08:2021 Software and Data Integrity Failures" + url: https://owasp.org/Top10/A08_2021-Software_and_Data_Integrity_Failures/ + - title: "ormsgpack documentation" + url: https://github.com/aviramha/ormsgpack + +# FAQ +faq: + - question: Is msgpack safer than pickle? + answer: | + Yes, msgpack is significantly safer than pickle by default. Pickle can execute + arbitrary code during deserialization via __reduce__, while msgpack only serializes + basic types (int, str, bytes, list, dict, etc.) and does not have a built-in code + execution mechanism. However, when ext_hook is used, the safety depends entirely + on the ext_hook implementation. If ext_hook calls pickle.loads() or eval(), msgpack + becomes as dangerous as pickle. + + - question: Should I flag msgpack.unpackb() without ext_hook? + answer: | + This rule conservatively flags all msgpack deserialization calls. Without ext_hook, + msgpack.unpackb() is generally safe from code execution, but deserializing untrusted + data can still lead to type confusion or denial of service (e.g., deeply nested + structures, extremely large allocations). Manual review is recommended to assess + the data source trust level. + + - question: What is the difference between msgpack and ormsgpack? + answer: | + ormsgpack is a Rust-based msgpack implementation for Python that is faster than the + pure-Python/C msgpack library. It has the same API surface (unpackb, packb) and the + same ext_hook mechanism, so the same security considerations apply. Both libraries + are flagged by this rule. + + - question: How was GHSA-g48c-2wqr-h844 exploited? + answer: | + The LangGraph checkpoint vulnerability involved deserializing msgpack-encoded + checkpoint data where the ext_hook used pickle.loads() to handle Extension Types. + An attacker who could write to the checkpoint store could craft a malicious msgpack + payload with a pickle-encoded Extension Type that executed arbitrary code when the + checkpoint was loaded. + +similar_rules: + - PYTHON-LANG-SEC-040 + - PYTHON-LANG-SEC-041 + - PYTHON-LANG-SEC-042 + +# Test Files +tests: + positive: tests/positive/ + negative: tests/negative/ diff --git a/rules/python/lang/PYTHON-LANG-SEC-139/rule.py b/rules/python/lang/PYTHON-LANG-SEC-139/rule.py new file mode 100644 index 00000000..dc9292b2 --- /dev/null +++ b/rules/python/lang/PYTHON-LANG-SEC-139/rule.py @@ -0,0 +1,21 @@ +from rules.python_decorators import python_rule +from codepathfinder import calls, QueryType + + +class MsgpackModule(QueryType): + fqns = ["msgpack", "ormsgpack"] + + +@python_rule( + id="PYTHON-LANG-SEC-139", + name="Unsafe msgpack Deserialization", + severity="HIGH", + category="lang", + cwe="CWE-502", + tags="python,msgpack,deserialization,untrusted-data,OWASP-A08,CWE-502", + message="msgpack.unpackb() or msgpack.unpack() detected. Deserializing untrusted msgpack data with ext_hook can execute arbitrary code.", + owasp="A08:2021", +) +def detect_msgpack_deserialization(): + """Detects msgpack.unpackb(), msgpack.unpack(), and msgpack.Unpacker() calls.""" + return MsgpackModule.method("unpackb", "unpack", "Unpacker") diff --git a/rules/python/lang/PYTHON-LANG-SEC-139/tests/negative/safe.py b/rules/python/lang/PYTHON-LANG-SEC-139/tests/negative/safe.py new file mode 100644 index 00000000..a8bb92a6 --- /dev/null +++ b/rules/python/lang/PYTHON-LANG-SEC-139/tests/negative/safe.py @@ -0,0 +1,20 @@ +import json +import msgpack + +# Safe: msgpack.packb (serialization, not deserialization) +packed = msgpack.packb({"key": "value"}) + +# Safe: json.loads for data interchange +data = json.loads(network_data) + +# Safe: ormsgpack.packb (serialization only) +import ormsgpack +packed = ormsgpack.packb({"key": "value"}) + +# Safe: using pydantic for structured deserialization +from pydantic import BaseModel +class Config(BaseModel): + name: str + value: int + +config = Config.model_validate_json(json_bytes) diff --git a/rules/python/lang/PYTHON-LANG-SEC-139/tests/positive/vulnerable.py b/rules/python/lang/PYTHON-LANG-SEC-139/tests/positive/vulnerable.py new file mode 100644 index 00000000..708e72d2 --- /dev/null +++ b/rules/python/lang/PYTHON-LANG-SEC-139/tests/positive/vulnerable.py @@ -0,0 +1,21 @@ +import msgpack +import ormsgpack + +# SEC-139: msgpack.unpackb with ext_hook (code execution vector) +data = receive_from_network() +obj = msgpack.unpackb(data, ext_hook=custom_ext_hook) + +# SEC-139: msgpack.unpack from file +with open("data.msgpack", "rb") as f: + obj = msgpack.unpack(f) + +# SEC-139: ormsgpack.unpackb (Rust-based msgpack, same risk) +result = ormsgpack.unpackb(untrusted_bytes) + +# SEC-139: msgpack.Unpacker streaming deserialization +unpacker = msgpack.Unpacker(file_like_obj) +for msg in unpacker: + process(msg) + +# SEC-139: msgpack.unpackb without ext_hook (still flagged conservatively) +obj = msgpack.unpackb(raw_data) diff --git a/rules/python/lang/PYTHON-LANG-SEC-144/meta.yaml b/rules/python/lang/PYTHON-LANG-SEC-144/meta.yaml new file mode 100644 index 00000000..cb05ff3e --- /dev/null +++ b/rules/python/lang/PYTHON-LANG-SEC-144/meta.yaml @@ -0,0 +1,199 @@ +# Rule Identity +id: PYTHON-LANG-SEC-144 +name: Insecure CORS Wildcard with Credentials +short_description: CORS middleware configured with wildcard origins (allow_origins=["*"]) may expose APIs to cross-origin attacks when combined with credentials. + +# Classification +severity: HIGH +category: lang +language: python +ruleset: python/lang/PYTHON-LANG-SEC-144 + +# Vulnerability Details +cwe: + - id: CWE-942 + name: Permissive Cross-domain Policy with Untrusted Domains + url: https://cwe.mitre.org/data/definitions/942.html + +owasp: + - id: A05:2021 + name: Security Misconfiguration + url: https://owasp.org/Top10/A05_2021-Security_Misconfiguration/ + +tags: + - python + - cors + - misconfiguration + - wildcard + - cross-origin + - CWE-942 + - OWASP-A05 + +# Author +author: + name: Shivasurya + url: https://x.com/sshivasurya + +# Content +description: | + Cross-Origin Resource Sharing (CORS) is a browser security mechanism that controls which + origins can access an API. Misconfiguring CORS middleware with allow_origins=["*"] (wildcard) + allows any website to make cross-origin requests to your API. When combined with + allow_credentials=True, this creates a critical vulnerability where any malicious website + can make authenticated requests on behalf of a logged-in user. + + This is a configuration audit rule. The CORS middleware call itself is not inherently + vulnerable — the vulnerability depends on the specific arguments passed (allow_origins, + allow_credentials, allow_methods, allow_headers). This rule flags CORS middleware + instantiation and configuration calls for manual review to verify that: + + 1. allow_origins is not set to ["*"] or a permissive regex + 2. allow_credentials=True is not combined with wildcard origins + 3. allow_methods and allow_headers are appropriately restricted + + Real-world vulnerabilities like GHSA-9jfm-9rc6-2hfq (Glances) demonstrate that permissive + CORS configurations in Python web applications enable cross-site data theft and CSRF attacks. + + Common affected frameworks include FastAPI (CORSMiddleware from starlette), Flask (flask-cors), + Django (django-cors-headers), and any ASGI/WSGI middleware that configures CORS headers. + +message: "CORS middleware configuration detected. Review allow_origins and allow_credentials settings to ensure origins are restricted." + +security_implications: + - title: Cross-Origin Data Theft + description: | + When allow_origins=["*"] is set, any website can make AJAX requests to your API + and read the responses. If the API returns sensitive data (user profiles, financial + data, internal metrics), a malicious website can silently exfiltrate this data + when a user visits the attacker's page while logged into your application. + - title: Cross-Site Request Forgery via CORS + description: | + When allow_credentials=True is combined with permissive origins, the browser + will include cookies, authorization headers, and TLS client certificates in + cross-origin requests. This effectively bypasses CSRF protections because the + attacker's JavaScript can make authenticated state-changing requests. + - title: API Abuse from Untrusted Origins + description: | + Wildcard CORS allows any origin to call your API endpoints. Even without + credentials, this can enable abuse of public-facing APIs, rate limit bypass + from distributed origins, and exploitation of any unauthenticated endpoints. + - title: Internal Service Exposure + description: | + Internal APIs or admin endpoints that rely on network-level access control + (e.g., only accessible from internal network) can be accessed from the + browser of an internal user who visits a malicious external website, if + CORS is configured with wildcard origins. + +secure_example: | + from fastapi import FastAPI + from fastapi.middleware.cors import CORSMiddleware + + app = FastAPI() + + # INSECURE: wildcard origins with credentials + # app.add_middleware( + # CORSMiddleware, + # allow_origins=["*"], + # allow_credentials=True, + # ) + + # SECURE: explicit allowed origins + ALLOWED_ORIGINS = [ + "https://app.example.com", + "https://admin.example.com", + ] + + app.add_middleware( + CORSMiddleware, + allow_origins=ALLOWED_ORIGINS, + allow_credentials=True, + allow_methods=["GET", "POST"], + allow_headers=["Authorization", "Content-Type"], + ) + + # SECURE: Flask-CORS with specific origins + from flask import Flask + from flask_cors import CORS + + flask_app = Flask(__name__) + CORS(flask_app, origins=["https://app.example.com"]) + +recommendations: + - Never use allow_origins=["*"] in production, especially with allow_credentials=True. + - Explicitly list allowed origins using full URLs including the scheme (https://example.com). + - If a regex is needed for subdomains, use allow_origin_regex with a strict pattern that anchors the domain. + - Restrict allow_methods to only the HTTP methods your API actually uses. + - Restrict allow_headers to only the headers your API requires. + - Use environment variables to configure allowed origins so development and production have different settings. + +detection_scope: | + This rule detects calls to CORSMiddleware(), app.add_middleware(), and CORS() + constructors. All CORS middleware configuration calls are flagged for manual review + because the vulnerability depends on the specific arguments passed, which may be + variables or configuration values that static analysis cannot resolve. This is a + configuration audit rule that requires human verification of the CORS settings. + +# Compliance +compliance: + - standard: OWASP Top 10 + requirement: "A05:2021 - Security Misconfiguration" + - standard: CWE/SANS Top 25 + requirement: "CWE-942 - Permissive Cross-domain Policy with Untrusted Domains" + - standard: NIST SP 800-53 + requirement: "AC-4: Information Flow Enforcement" + - standard: PCI DSS v4.0 + requirement: "Requirement 6.2.4 - Protect web-facing applications against attacks" + +# References +references: + - title: "CWE-942: Permissive Cross-domain Policy with Untrusted Domains" + url: https://cwe.mitre.org/data/definitions/942.html + - title: "GHSA-9jfm-9rc6-2hfq: Permissive CORS in Glances" + url: https://github.com/advisories/GHSA-9jfm-9rc6-2hfq + - title: "MDN Web Docs: Cross-Origin Resource Sharing (CORS)" + url: https://developer.mozilla.org/en-US/docs/Web/HTTP/CORS + - title: "OWASP CORS Misconfiguration" + url: https://owasp.org/www-project-web-security-testing-guide/latest/4-Web_Application_Security_Testing/11-Client-side_Testing/07-Testing_Cross_Origin_Resource_Sharing + - title: "FastAPI CORS Documentation" + url: https://fastapi.tiangolo.com/tutorial/cors/ + +# FAQ +faq: + - question: Is allow_origins=["*"] always dangerous? + answer: | + Not always. For truly public APIs that serve only public data and do not use + cookies or authentication, allow_origins=["*"] is acceptable and is how public + CDNs and open APIs operate. However, it is dangerous when combined with + allow_credentials=True or when the API has any authenticated endpoints. Most + application APIs should use explicit origin lists. + + - question: Why does this rule flag all CORS middleware calls instead of just wildcard ones? + answer: | + The allowed origins are often passed as variables, loaded from configuration files, + or set via environment variables. Static analysis cannot reliably determine the + runtime value of these arguments, so the rule flags all CORS configuration calls + for manual review. A human reviewer can quickly verify whether the configuration + is restrictive enough. + + - question: Does the browser prevent allow_credentials with wildcard origins? + answer: | + Modern browsers will reject responses that have both Access-Control-Allow-Origin: * + and Access-Control-Allow-Credentials: true. However, some CORS middleware + implementations work around this by reflecting the request's Origin header back as + the Access-Control-Allow-Origin value when credentials are enabled, effectively + allowing any origin while appearing to be specific. This reflection pattern is + equally dangerous. + + - question: How do I handle CORS for multiple subdomains? + answer: | + Use allow_origin_regex with a properly anchored regex pattern like + r"^https://.*\.example\.com$" to allow all subdomains of example.com. Avoid + overly permissive regex patterns like ".*example.*" that could match + attacker-controlled domains like "evil-example.com" or "example.evil.com". + +similar_rules: [] + +# Test Files +tests: + positive: tests/positive/ + negative: tests/negative/ diff --git a/rules/python/lang/PYTHON-LANG-SEC-144/rule.py b/rules/python/lang/PYTHON-LANG-SEC-144/rule.py new file mode 100644 index 00000000..c5f5f1ea --- /dev/null +++ b/rules/python/lang/PYTHON-LANG-SEC-144/rule.py @@ -0,0 +1,17 @@ +from rules.python_decorators import python_rule +from codepathfinder import calls, QueryType + + +@python_rule( + id="PYTHON-LANG-SEC-144", + name="Insecure CORS Wildcard Configuration", + severity="HIGH", + category="lang", + cwe="CWE-942", + tags="python,cors,misconfiguration,wildcard,OWASP-A05,CWE-942", + message="CORS middleware with permissive configuration detected. Review allow_origins and allow_credentials settings.", + owasp="A05:2021", +) +def detect_cors_wildcard(): + """Detects CORSMiddleware and CORS() calls that may use wildcard origins.""" + return calls("CORSMiddleware", "*.add_middleware", "CORS") diff --git a/rules/python/lang/PYTHON-LANG-SEC-144/tests/negative/safe.py b/rules/python/lang/PYTHON-LANG-SEC-144/tests/negative/safe.py new file mode 100644 index 00000000..f95fd053 --- /dev/null +++ b/rules/python/lang/PYTHON-LANG-SEC-144/tests/negative/safe.py @@ -0,0 +1,28 @@ +from fastapi import FastAPI + +app = FastAPI() + +# Safe: no CORS middleware configured at all +# The rule detects CORSMiddleware(), *.add_middleware(), and CORS() calls. +# These patterns do not invoke any of those. + +# Safe: app with no CORS - just basic routes +app_no_cors = FastAPI() + +# Safe: using security headers via response directly (not middleware call) +from starlette.responses import JSONResponse + +async def homepage(): + return JSONResponse({"status": "ok"}) + +# Safe: configuring allowed hosts in settings (no middleware call) +ALLOWED_ORIGINS = [ + "https://example.com", + "https://app.example.com", +] + +# Safe: just storing config, not calling add_middleware +cors_config = { + "allow_origins": ALLOWED_ORIGINS, + "allow_credentials": True, +} diff --git a/rules/python/lang/PYTHON-LANG-SEC-144/tests/positive/vulnerable.py b/rules/python/lang/PYTHON-LANG-SEC-144/tests/positive/vulnerable.py new file mode 100644 index 00000000..506baa07 --- /dev/null +++ b/rules/python/lang/PYTHON-LANG-SEC-144/tests/positive/vulnerable.py @@ -0,0 +1,28 @@ +from fastapi import FastAPI +from fastapi.middleware.cors import CORSMiddleware +from flask_cors import CORS + +app = FastAPI() + +# SEC-144: CORS wildcard with CORSMiddleware constructor +app.add_middleware( + CORSMiddleware, + allow_origins=["*"], + allow_credentials=True, + allow_methods=["*"], + allow_headers=["*"], +) + +# SEC-144: CORSMiddleware called directly +middleware = CORSMiddleware(app, allow_origins=["*"]) + +# SEC-144: Flask-CORS wildcard +flask_app = Flask(__name__) +CORS(flask_app, origins="*") + +# SEC-144: CORS with allow_origin_regex wildcard +app.add_middleware( + CORSMiddleware, + allow_origin_regex=".*", + allow_credentials=True, +) diff --git a/rules/python/lang/PYTHON-LANG-SEC-157/meta.yaml b/rules/python/lang/PYTHON-LANG-SEC-157/meta.yaml new file mode 100644 index 00000000..c6f6d4a8 --- /dev/null +++ b/rules/python/lang/PYTHON-LANG-SEC-157/meta.yaml @@ -0,0 +1,191 @@ +# Rule Identity +id: PYTHON-LANG-SEC-157 +name: ZipFile Extract Path Traversal (Zip Slip) +short_description: ZipFile.extract() and ZipFile.extractall() can write files outside the target directory when zip entries contain path traversal sequences. + +# Classification +severity: HIGH +category: lang +language: python +ruleset: python/lang/PYTHON-LANG-SEC-157 + +# Vulnerability Details +cwe: + - id: CWE-22 + name: Improper Limitation of a Pathname to a Restricted Directory ('Path Traversal') + url: https://cwe.mitre.org/data/definitions/22.html + +owasp: + - id: A01:2021 + name: Broken Access Control + url: https://owasp.org/Top10/A01_2021-Broken_Access_Control/ + +tags: + - python + - zipfile + - path-traversal + - zip-slip + - file-write + - CWE-22 + - OWASP-A01 + +# Author +author: + name: Shivasurya + url: https://x.com/sshivasurya + +# Content +description: | + Zip Slip is a path traversal vulnerability that occurs when a zip archive contains entries + with filenames like "../../../etc/cron.d/malicious" that, when extracted, write files outside + the intended target directory. Python's zipfile.ZipFile.extract() and extractall() methods + do not validate member filenames for path traversal sequences by default. + + An attacker who can upload or supply a zip archive to an application that extracts it can: + - Overwrite arbitrary files on the filesystem (e.g., configuration files, cron jobs) + - Write executable files to web-accessible directories for remote code execution + - Replace application binaries or libraries with malicious versions + - Overwrite SSH authorized_keys files for persistent access + + This vulnerability class (CVE-2018-1002200) has affected numerous libraries and applications + across languages. In Python specifically, GHSA-8rrh-rw8j-w5fx demonstrated this in the + wheel package's unpack functionality. + + Note: Python 3.12+ added a filter parameter to extractall() and the data_filter for + safer extraction, but older versions and code that does not use these features remain + vulnerable. + +message: "ZipFile.extract() or extractall() detected. Validate member filenames for path traversal before extraction." + +security_implications: + - title: Arbitrary File Overwrite + description: | + A malicious zip entry named "../../../etc/cron.d/backdoor" will be written to + /etc/cron.d/backdoor when extracted with a target directory of /tmp/uploads/. + This allows an attacker to overwrite any file writable by the application's + process user, including configuration files, cron jobs, and application code. + - title: Remote Code Execution via File Write + description: | + By writing a malicious file to a web-accessible directory (e.g., a PHP webshell, + a Python script in a CGI directory, or a template file), an attacker can achieve + remote code execution. Writing to application module directories can also inject + code that runs on the next import. + - title: Supply Chain Attacks via Package Archives + description: | + Build tools, package installers, and CI/CD pipelines that extract zip/wheel + archives are vulnerable to Zip Slip. A malicious package archive can overwrite + build scripts, inject code into dependencies, or modify deployment artifacts. + - title: Denial of Service via Symlink Entries + description: | + Some zip implementations support symbolic link entries. Combined with path + traversal, an attacker can create symlinks that point to sensitive files, + which are then overwritten on subsequent extractions. + +secure_example: | + import zipfile + import os + + # INSECURE: extractall without validation + # with zipfile.ZipFile("upload.zip") as zf: + # zf.extractall("/tmp/output") + + # SECURE: validate each member before extraction + def safe_extract(zip_path: str, dest_dir: str) -> None: + dest_dir = os.path.realpath(dest_dir) + with zipfile.ZipFile(zip_path) as zf: + for member in zf.infolist(): + member_path = os.path.realpath( + os.path.join(dest_dir, member.filename) + ) + if not member_path.startswith(dest_dir + os.sep): + raise ValueError( + f"Path traversal detected: {member.filename}" + ) + zf.extract(member, dest_dir) + + # SECURE (Python 3.12+): use data_filter + # with zipfile.ZipFile("upload.zip") as zf: + # zf.extractall("/tmp/output", filter="data") + + # SECURE: read file contents without extracting to disk + with zipfile.ZipFile("archive.zip") as zf: + data = zf.read("config.json") + +recommendations: + - Always validate zip entry filenames before extraction by resolving the full path and checking it starts with the target directory. + - Use os.path.realpath() on both the target directory and the resolved member path to handle symlinks and relative paths. + - On Python 3.12+, use extractall(path, filter="data") which rejects path traversal entries. + - Consider using zf.read() or zf.open() to read file contents into memory instead of extracting to disk. + - Reject zip entries with absolute paths or entries containing ".." path components. + - Set restrictive file permissions on the extraction directory. + +detection_scope: | + This rule detects calls to zipfile.ZipFile.extract() and zipfile.ZipFile.extractall() + using the ZipFileModule.method() pattern. It specifically targets the zipfile module + methods rather than using broad wildcard matching like calls("*.extract") to avoid + false positives from unrelated extract methods (str.extract, re.extract, etc.). + +# Compliance +compliance: + - standard: CWE Top 25 + requirement: "CWE-22 - Improper Limitation of a Pathname to a Restricted Directory" + - standard: OWASP Top 10 + requirement: "A01:2021 - Broken Access Control" + - standard: NIST SP 800-53 + requirement: "SI-10: Information Input Validation" + - standard: PCI DSS v4.0 + requirement: "Requirement 6.2.4 - Protect against common coding vulnerabilities" + +# References +references: + - title: "CWE-22: Improper Limitation of a Pathname to a Restricted Directory" + url: https://cwe.mitre.org/data/definitions/22.html + - title: "OWASP Path Traversal" + url: https://owasp.org/www-community/attacks/Path_Traversal + - title: "GHSA-8rrh-rw8j-w5fx: Path traversal in wheel unpack" + url: https://github.com/advisories/GHSA-8rrh-rw8j-w5fx + - title: "CVE-2018-1002200: Zip Slip arbitrary file overwrite" + url: https://nvd.nist.gov/vuln/detail/CVE-2018-1002200 + - title: "Python zipfile documentation" + url: https://docs.python.org/3/library/zipfile.html + - title: "Python 3.12 zipfile.extractall filter parameter" + url: https://docs.python.org/3.12/library/zipfile.html#zipfile.ZipFile.extractall + +# FAQ +faq: + - question: Does Python's zipfile module protect against Zip Slip by default? + answer: | + No. Python's zipfile.extract() and extractall() do not validate member filenames + for path traversal by default in versions before 3.12. Starting with Python 3.12, + a filter parameter was added to extractall() that can reject dangerous entries when + set to "data". However, this filter is not enabled by default for backward + compatibility, so code must explicitly opt in. + + - question: What is the difference between this rule and PYTHON-LANG-SEC-110 (tarfile)? + answer: | + PYTHON-LANG-SEC-110 covers tarfile extraction which has similar path traversal + risks. Tar archives can additionally contain symlink entries and device files, + making them slightly more dangerous. Both rules detect the same class of + vulnerability (CWE-22) but in different archive format handlers. + + - question: Is extractall() with a members parameter safe? + answer: | + Using extractall(path, members=validated_list) can be safe if the members list + has been filtered to exclude entries with path traversal sequences. However, the + filtering must be done correctly — checking that the resolved path starts with the + target directory using os.path.realpath(). Simply checking for ".." in the filename + is insufficient as there are bypass techniques. + + - question: Can Zip Slip affect Python wheel installation? + answer: | + Yes. Python wheel files (.whl) are zip archives. The wheel package had a Zip Slip + vulnerability (GHSA-8rrh-rw8j-w5fx) where malicious wheel files could write files + outside the installation directory during pip install. This was fixed in the wheel + package, but custom wheel extraction code may still be vulnerable. + +similar_rules: [] + +# Test Files +tests: + positive: tests/positive/ + negative: tests/negative/ diff --git a/rules/python/lang/PYTHON-LANG-SEC-157/rule.py b/rules/python/lang/PYTHON-LANG-SEC-157/rule.py new file mode 100644 index 00000000..0f2c2393 --- /dev/null +++ b/rules/python/lang/PYTHON-LANG-SEC-157/rule.py @@ -0,0 +1,21 @@ +from rules.python_decorators import python_rule +from codepathfinder import calls, QueryType + + +class ZipFileModule(QueryType): + fqns = ["zipfile"] + + +@python_rule( + id="PYTHON-LANG-SEC-157", + name="ZipFile Extract Path Traversal (Zip Slip)", + severity="HIGH", + category="lang", + cwe="CWE-22", + tags="python,zipfile,path-traversal,zip-slip,OWASP-A01,CWE-22", + message="ZipFile.extract() or ZipFile.extractall() detected. Malicious zip entries with '../' paths can write files outside the target directory.", + owasp="A01:2021", +) +def detect_zipfile_extract(): + """Detects zipfile.ZipFile.extract() and extractall() calls.""" + return ZipFileModule.method("extract", "extractall") diff --git a/rules/python/lang/PYTHON-LANG-SEC-157/tests/negative/safe.py b/rules/python/lang/PYTHON-LANG-SEC-157/tests/negative/safe.py new file mode 100644 index 00000000..3c299abe --- /dev/null +++ b/rules/python/lang/PYTHON-LANG-SEC-157/tests/negative/safe.py @@ -0,0 +1,26 @@ +import zipfile +import os + +# Safe: using namelist() to inspect without extracting +with zipfile.ZipFile("archive.zip") as zf: + names = zf.namelist() + +# Safe: using read() to get contents without writing to disk +with zipfile.ZipFile("archive.zip") as zf: + data = zf.read("config.json") + +# Safe: extractall with validated members list +def safe_extract(zip_path, dest_dir): + with zipfile.ZipFile(zip_path) as zf: + safe_members = [] + for member in zf.infolist(): + member_path = os.path.realpath(os.path.join(dest_dir, member.filename)) + if member_path.startswith(os.path.realpath(dest_dir)): + safe_members.append(member) + # Only extract validated members + # zf.extractall(dest_dir, members=safe_members) # not called here + +# Safe: open() for reading specific entry +with zipfile.ZipFile("archive.zip") as zf: + with zf.open("data.csv") as f: + content = f.read() diff --git a/rules/python/lang/PYTHON-LANG-SEC-157/tests/positive/vulnerable.py b/rules/python/lang/PYTHON-LANG-SEC-157/tests/positive/vulnerable.py new file mode 100644 index 00000000..e97d16c7 --- /dev/null +++ b/rules/python/lang/PYTHON-LANG-SEC-157/tests/positive/vulnerable.py @@ -0,0 +1,22 @@ +import zipfile + +# SEC-157: ZipFile.extract() without path validation +zf = zipfile.ZipFile("archive.zip") +zf.extract("malicious/../../etc/passwd", "/tmp/output") + +# SEC-157: ZipFile.extractall() without member validation +with zipfile.ZipFile("upload.zip") as zf: + zf.extractall("/tmp/output") + +# SEC-157: extract in a loop without checking for path traversal +with zipfile.ZipFile(uploaded_file) as zf: + for member in zf.namelist(): + zf.extract(member, dest_dir) + +# SEC-157: ZipFile constructed inline and extracted +zipfile.ZipFile(user_upload).extractall(target_path) + +# SEC-157: extract with info object +with zipfile.ZipFile("data.zip") as zf: + for info in zf.infolist(): + zf.extract(info, output_dir) diff --git a/rules/python/lang/PYTHON-LANG-SEC-162/meta.yaml b/rules/python/lang/PYTHON-LANG-SEC-162/meta.yaml new file mode 100644 index 00000000..b2db5a8c --- /dev/null +++ b/rules/python/lang/PYTHON-LANG-SEC-162/meta.yaml @@ -0,0 +1,207 @@ +# Rule Identity +id: PYTHON-LANG-SEC-162 +name: Symlink Following Arbitrary File Access +short_description: Symlink operations (os.readlink, os.symlink, is_symlink) on user-controlled paths can lead to arbitrary file read or write via symlink following. + +# Classification +severity: HIGH +category: lang +language: python +ruleset: python/lang/PYTHON-LANG-SEC-162 + +# Vulnerability Details +cwe: + - id: CWE-59 + name: Improper Link Resolution Before File Access + url: https://cwe.mitre.org/data/definitions/59.html + +owasp: + - id: A01:2021 + name: Broken Access Control + url: https://owasp.org/Top10/A01_2021-Broken_Access_Control/ + +tags: + - python + - symlink + - path-traversal + - file-access + - link-following + - CWE-59 + - OWASP-A01 + +# Author +author: + name: Shivasurya + url: https://x.com/sshivasurya + +# Content +description: | + Symlink following vulnerabilities occur when an application performs file operations on a + path that is (or contains) a symbolic link controlled by an attacker, causing the + application to read, write, or delete files outside the intended directory. This is + particularly dangerous in applications that process user-uploaded files, manage + repositories, or operate on shared filesystems. + + Common attack scenarios include: + - A user uploads a zip/tar archive containing symlinks pointing to /etc/passwd or + application secrets, then triggers the application to read those "files" + - A user creates a symlink in a writable directory pointing to a sensitive file, then + uses the application to read the symlink target + - Race conditions (TOCTOU) where a legitimate file is replaced with a symlink between + the check and the use + + This is an audit rule. Not every os.readlink() or is_symlink() call is vulnerable — + the vulnerability depends on whether the path argument is user-controlled and whether + the application follows the symlink to perform sensitive operations. This rule flags + symlink-related operations for manual review to assess whether proper validation + (os.path.realpath + path prefix check, or O_NOFOLLOW) is in place. + + Real-world example: GHSA-g925-f788-4jh7 (Weblate) where symlink following in a + repository management feature allowed reading arbitrary files from the server. + +message: "Symlink operation on potentially user-controlled path detected. Verify that symlink resolution is validated before file access." + +security_implications: + - title: Arbitrary File Read via Symlink Following + description: | + When an application reads a file path that resolves through a user-controlled + symlink, the attacker can point the symlink to any file readable by the + application process. This commonly leads to disclosure of /etc/passwd, + application configuration files, database credentials, API keys, and + private keys. + - title: Arbitrary File Write via Symlink Redirection + description: | + If an application writes to a path that contains a symlink, the write + operation follows the symlink and modifies the target file. An attacker + can redirect writes to overwrite configuration files, inject code into + application modules, or modify cron jobs for persistent access. + - title: TOCTOU Race Conditions + description: | + Even when an application checks is_symlink() before operating on a file, + a race condition exists between the check and the use. An attacker can + replace a legitimate file with a symlink after the check passes but before + the file operation executes, bypassing the symlink check entirely. + - title: Container and Sandbox Escape + description: | + In containerized environments, symlinks pointing to paths outside the + container's filesystem namespace (e.g., /proc/1/root/) can potentially + be used to access the host filesystem, depending on the container + runtime configuration and mount points. + +secure_example: | + import os + from pathlib import Path + + # INSECURE: reading a user-controlled path without symlink check + # content = open(user_path).read() + + # SECURE: resolve symlinks and verify the path is within allowed directory + def safe_read(user_path: str, allowed_dir: str) -> str: + real_path = os.path.realpath(user_path) + allowed_real = os.path.realpath(allowed_dir) + if not real_path.startswith(allowed_real + os.sep): + raise ValueError("Path traversal or symlink escape detected") + with open(real_path) as f: + return f.read() + + # SECURE: use Path.resolve() with is_relative_to() (Python 3.9+) + def safe_read_pathlib(user_path: str, base_dir: str) -> str: + resolved = Path(user_path).resolve() + if not resolved.is_relative_to(Path(base_dir).resolve()): + raise ValueError("Path outside allowed directory") + return resolved.read_text() + + # SECURE: open with O_NOFOLLOW to reject symlinks + import os + fd = os.open(file_path, os.O_RDONLY | os.O_NOFOLLOW) + try: + data = os.read(fd, os.fstat(fd).st_size) + finally: + os.close(fd) + + # SECURE: use stat with follow_symlinks=False to check without following + info = os.stat(path, follow_symlinks=False) + if stat.S_ISLNK(info.st_mode): + raise ValueError("Symlinks not allowed") + +recommendations: + - Always resolve symlinks with os.path.realpath() or Path.resolve() before performing file operations on user-controlled paths. + - After resolving, verify the resolved path is within the expected directory using a prefix check (startswith) or Path.is_relative_to(). + - Use os.open() with O_NOFOLLOW flag when symlinks should be rejected entirely. + - For directory traversal, use os.scandir() and check entry.is_symlink() before following entries. + - Be aware of TOCTOU race conditions — resolve and open atomically when possible, or use O_NOFOLLOW. + +detection_scope: | + This rule detects calls to os.readlink(), os.symlink(), Path.is_symlink(), and + os.path.islink(). These are audit-level findings — not every symlink operation is + dangerous, but symlink operations on user-controlled paths need review to ensure + proper validation is in place. Hardcoded paths and paths validated with + os.path.realpath() before access are typically safe. + +# Compliance +compliance: + - standard: CWE Top 25 + requirement: "CWE-59 - Improper Link Resolution Before File Access" + - standard: OWASP Top 10 + requirement: "A01:2021 - Broken Access Control" + - standard: NIST SP 800-53 + requirement: "SI-10: Information Input Validation" + - standard: PCI DSS v4.0 + requirement: "Requirement 6.2.4 - Protect against common coding vulnerabilities" + +# References +references: + - title: "CWE-59: Improper Link Resolution Before File Access" + url: https://cwe.mitre.org/data/definitions/59.html + - title: "GHSA-g925-f788-4jh7: Arbitrary file read in Weblate via symlink following" + url: https://github.com/advisories/GHSA-g925-f788-4jh7 + - title: "OWASP Path Traversal" + url: https://owasp.org/www-community/attacks/Path_Traversal + - title: "Python os.path.realpath() documentation" + url: https://docs.python.org/3/library/os.path.html#os.path.realpath + - title: "Python pathlib.Path.resolve() documentation" + url: https://docs.python.org/3/library/pathlib.html#pathlib.Path.resolve + +# FAQ +faq: + - question: Is every os.readlink() call dangerous? + answer: | + No. os.readlink() on a hardcoded or application-controlled path is safe. The + vulnerability occurs when the path argument is derived from user input (request + parameters, uploaded filenames, repository paths, etc.) without validation. + This rule flags all symlink operations for audit because determining whether a + path is user-controlled requires understanding the data flow, which needs human review. + + - question: How does symlink following differ from path traversal? + answer: | + Path traversal (CWE-22) uses "../" sequences in the filename to escape a directory. + Symlink following (CWE-59) uses symbolic links to redirect file access to arbitrary + locations. Both can achieve similar outcomes (reading/writing files outside the + intended directory), but they require different defenses. Path traversal is blocked + by canonicalizing the path and checking the prefix. Symlink following requires + resolving symlinks (os.path.realpath) and then checking the prefix, or using + O_NOFOLLOW to reject symlinks entirely. + + - question: Does os.path.realpath() prevent symlink attacks? + answer: | + os.path.realpath() resolves all symlinks in a path and returns the canonical + absolute path. Using it before a file operation and then verifying the result + is within the allowed directory is the standard defense against symlink following. + However, there is a TOCTOU window between calling realpath() and opening the file + where the symlink could be changed. For high-security scenarios, use O_NOFOLLOW + or open the file first and then verify with fstat. + + - question: Can containers prevent symlink escape? + answer: | + Container runtimes (Docker, containerd) restrict filesystem access using mount + namespaces, so symlinks inside a container generally cannot access host files. + However, if the container has host volumes mounted, symlinks can follow those + mounts. Additionally, /proc/1/root/ and similar procfs paths can sometimes be + used for escape depending on the container security configuration. + +similar_rules: [] + +# Test Files +tests: + positive: tests/positive/ + negative: tests/negative/ diff --git a/rules/python/lang/PYTHON-LANG-SEC-162/rule.py b/rules/python/lang/PYTHON-LANG-SEC-162/rule.py new file mode 100644 index 00000000..b1e83b9c --- /dev/null +++ b/rules/python/lang/PYTHON-LANG-SEC-162/rule.py @@ -0,0 +1,63 @@ +from rules.python_decorators import python_rule +from codepathfinder import calls, flows, QueryType +from codepathfinder.presets import PropagationPresets + + +class OSModule(QueryType): + fqns = ["os"] + + +class OSPathModule(QueryType): + fqns = ["os.path"] + + +class ShutilModule(QueryType): + fqns = ["shutil"] + + +class Builtins(QueryType): + fqns = ["builtins"] + + +@python_rule( + id="PYTHON-LANG-SEC-162", + name="Symlink Following Arbitrary File Access", + severity="HIGH", + category="lang", + cwe="CWE-59", + tags="python,symlink,path-traversal,file-access,OWASP-A01,CWE-59", + message="User-controlled path flows to file operation without symlink resolution. " + "Attacker can craft symbolic links to read/write arbitrary files. " + "Resolve symlinks with os.path.realpath() and validate with is_relative_to() before access.", + owasp="A01:2021", +) +def detect_symlink_following(): + """Detects user-controlled paths flowing to file operations without symlink resolution. + CVE: GHSA-g925-f788-4jh7 (Weblate arbitrary file read via symlinks). + + Source: os.path.join (type-inferred), request.args/form.get + Sink: open(), os.readlink(), shutil.copy/move (all type-inferred via stdlib) + Sanitizer: os.path.realpath() (type-inferred) + """ + return flows( + from_sources=[ + OSPathModule.method("join"), + calls("request.args.get"), + calls("request.form.get"), + calls("*.get"), + ], + to_sinks=[ + Builtins.method("open").tracks(0), + OSModule.method("readlink").tracks(0), + OSModule.method("symlink"), + ShutilModule.method("copy", "copy2", "move", "copyfile").tracks(0), + ], + sanitized_by=[ + OSPathModule.method("realpath"), + OSPathModule.method("abspath"), + calls("*.resolve"), + calls("*.is_relative_to"), + ], + propagates_through=PropagationPresets.standard(), + scope="global", + ) diff --git a/rules/python/lang/PYTHON-LANG-SEC-162/tests/negative/safe.py b/rules/python/lang/PYTHON-LANG-SEC-162/tests/negative/safe.py new file mode 100644 index 00000000..b3b1f225 --- /dev/null +++ b/rules/python/lang/PYTHON-LANG-SEC-162/tests/negative/safe.py @@ -0,0 +1,24 @@ +import os +from pathlib import Path + +# Safe: os.path.realpath() resolves symlinks (not a symlink operation itself) +real = os.path.realpath(user_path) +if real.startswith(ALLOWED_DIR): + with open(real) as f: + data = f.read() + +# Safe: Path.resolve() with is_relative_to() check (no symlink API call) +p = Path(user_input).resolve() +if p.is_relative_to(Path("/app/data")): + content = p.read_text() + +# Safe: stat with follow_symlinks=False (does not follow symlinks) +info = os.stat(path, follow_symlinks=False) + +# Safe: using os.open with O_NOFOLLOW flag (rejects symlinks) +fd = os.open(path, os.O_RDONLY | os.O_NOFOLLOW) + +# Safe: checking file type without symlink operations +import stat +mode = os.lstat(path).st_mode +is_regular = stat.S_ISREG(mode) diff --git a/rules/python/lang/PYTHON-LANG-SEC-162/tests/positive/cross_file_sink.py b/rules/python/lang/PYTHON-LANG-SEC-162/tests/positive/cross_file_sink.py new file mode 100644 index 00000000..9aebea42 --- /dev/null +++ b/rules/python/lang/PYTHON-LANG-SEC-162/tests/positive/cross_file_sink.py @@ -0,0 +1,18 @@ +# SEC-162: Sink file — tainted path reaches file operations without symlink check + +import os + + +def process_repo_file(file_path): + """Tainted path → open() without os.path.realpath() — symlink attack.""" + with open(file_path, "r") as f: + return f.read() + + +def read_translation_file(file_path): + """Tainted path → os.readlink() — follows attacker symlink.""" + if os.path.islink(file_path): + target = os.readlink(file_path) + return target + with open(file_path, "r") as f: + return f.read() diff --git a/rules/python/lang/PYTHON-LANG-SEC-162/tests/positive/cross_file_source.py b/rules/python/lang/PYTHON-LANG-SEC-162/tests/positive/cross_file_source.py new file mode 100644 index 00000000..a660219a --- /dev/null +++ b/rules/python/lang/PYTHON-LANG-SEC-162/tests/positive/cross_file_source.py @@ -0,0 +1,24 @@ +# SEC-162: Source file — path constructed from user input + +import os +from flask import Flask, request +from cross_file_sink import process_repo_file, read_translation_file + +app = Flask(__name__) + + +@app.route('/repo/file') +def repo_file_endpoint(): + repo_slug = request.args.get("repo") + filename = request.args.get("file") + path = os.path.join("/srv/repos", repo_slug, filename) + content = process_repo_file(path) + return content + + +@app.route('/translation/download') +def download_translation(): + lang = request.args.get("lang") + path = os.path.join("/srv/translations", lang) + data = read_translation_file(path) + return data diff --git a/rules/python/lang/PYTHON-LANG-SEC-162/tests/positive/vulnerable.py b/rules/python/lang/PYTHON-LANG-SEC-162/tests/positive/vulnerable.py new file mode 100644 index 00000000..662ec748 --- /dev/null +++ b/rules/python/lang/PYTHON-LANG-SEC-162/tests/positive/vulnerable.py @@ -0,0 +1,22 @@ +import os +import shutil + +# SEC-162: User-controlled path flows to file op without symlink resolution + +# 1. os.path.join → open (classic repo file read) +def read_repo_file(repo_dir, filename): + path = os.path.join(repo_dir, filename) + f = open(path, "r") + return f.read() + +# 2. os.path.join → os.readlink (Weblate CVE pattern) +def get_link_target(repo_dir, name): + path = os.path.join(repo_dir, name) + target = os.readlink(path) + return target + +# 3. request.args.get → open (direct web input) +def download_file(request): + filename = request.args.get("file") + f = open(filename, "rb") + return f.read()