diff --git a/src/crawlee/_utils/file.py b/src/crawlee/_utils/file.py index b3ee5662f4..7a9e13be28 100644 --- a/src/crawlee/_utils/file.py +++ b/src/crawlee/_utils/file.py @@ -60,6 +60,37 @@ def _write_file(path: Path, data: str | bytes) -> None: raise +def validate_subdirectory(base_dir: Path, subdirectory: str) -> Path: + """Resolve a storage subdirectory inside a base directory. + + Joins `subdirectory` onto `base_dir` and verifies that the result stays inside `base_dir`, so a storage + name or alias always maps to a directory under the storage directory rather than somewhere else (e.g. a + value containing `..` or an absolute path). + + Args: + base_dir: The base storage directory (e.g. the `key_value_stores` directory). + subdirectory: The storage name or alias to use as the subdirectory. + + Returns: + The validated full path to the storage subdirectory. + + Raises: + ValueError: If the resolved path would fall outside `base_dir`. + """ + # Normalize lexically (no filesystem access), so symlinks are not followed and the check is deterministic. + base_resolved = Path(os.path.normpath(base_dir)) + target_resolved = Path(os.path.normpath(base_dir / subdirectory)) + + # The target must be strictly inside the base directory - reject parent references and absolute paths. + if target_resolved == base_resolved or base_resolved not in target_resolved.parents: + raise ValueError( + f'Invalid storage name or alias "{subdirectory}". It must not contain path separators, parent ' + f'directory references ("..") or absolute paths that resolve outside the storage directory.' + ) + + return target_resolved + + def infer_mime_type(value: Any) -> str: """Infer the MIME content type from the value. diff --git a/src/crawlee/storage_clients/_file_system/_dataset_client.py b/src/crawlee/storage_clients/_file_system/_dataset_client.py index 8fce7b4733..89d0435092 100644 --- a/src/crawlee/storage_clients/_file_system/_dataset_client.py +++ b/src/crawlee/storage_clients/_file_system/_dataset_client.py @@ -15,7 +15,7 @@ from crawlee._consts import METADATA_FILENAME from crawlee._types import JsonSerializable from crawlee._utils.crypto import crypto_random_object_id -from crawlee._utils.file import atomic_write, json_dumps +from crawlee._utils.file import atomic_write, json_dumps, validate_subdirectory from crawlee._utils.raise_if_too_many_kwargs import raise_if_too_many_kwargs from crawlee.storage_clients._base import DatasetClient from crawlee.storage_clients.models import DatasetItemsListPage, DatasetMetadata @@ -159,8 +159,7 @@ async def open( # Get a new instance by name or alias. else: - dataset_dir = Path(name) if name else Path(alias) if alias else Path('default') - path_to_dataset = dataset_base_path / dataset_dir + path_to_dataset = validate_subdirectory(dataset_base_path, name or alias or 'default') path_to_metadata = path_to_dataset / METADATA_FILENAME # If the dataset directory exists, reconstruct the client from the metadata file. diff --git a/src/crawlee/storage_clients/_file_system/_key_value_store_client.py b/src/crawlee/storage_clients/_file_system/_key_value_store_client.py index 3a36a77074..984a88651f 100644 --- a/src/crawlee/storage_clients/_file_system/_key_value_store_client.py +++ b/src/crawlee/storage_clients/_file_system/_key_value_store_client.py @@ -15,7 +15,7 @@ from crawlee._consts import METADATA_FILENAME from crawlee._utils.crypto import crypto_random_object_id -from crawlee._utils.file import atomic_write, infer_mime_type, json_dumps +from crawlee._utils.file import atomic_write, infer_mime_type, json_dumps, validate_subdirectory from crawlee._utils.raise_if_too_many_kwargs import raise_if_too_many_kwargs from crawlee.storage_clients._base import KeyValueStoreClient from crawlee.storage_clients.models import KeyValueStoreMetadata, KeyValueStoreRecord, KeyValueStoreRecordMetadata @@ -157,8 +157,7 @@ async def open( # Get a new instance by name or alias. else: - kvs_dir = Path(name) if name else Path(alias) if alias else Path('default') - path_to_kvs = kvs_base_path / kvs_dir + path_to_kvs = validate_subdirectory(kvs_base_path, name or alias or 'default') path_to_metadata = path_to_kvs / METADATA_FILENAME # If the key-value store directory exists, reconstruct the client from the metadata file. diff --git a/src/crawlee/storage_clients/_file_system/_request_queue_client.py b/src/crawlee/storage_clients/_file_system/_request_queue_client.py index 4954d8a4d2..024a50b8a9 100644 --- a/src/crawlee/storage_clients/_file_system/_request_queue_client.py +++ b/src/crawlee/storage_clients/_file_system/_request_queue_client.py @@ -17,7 +17,7 @@ from crawlee import Request from crawlee._consts import METADATA_FILENAME from crawlee._utils.crypto import crypto_random_object_id -from crawlee._utils.file import atomic_write, json_dumps +from crawlee._utils.file import atomic_write, json_dumps, validate_subdirectory from crawlee._utils.raise_if_too_many_kwargs import raise_if_too_many_kwargs from crawlee._utils.recoverable_state import RecoverableState from crawlee.storage_clients._base import RequestQueueClient @@ -227,8 +227,7 @@ async def open( # Open an existing RQ by its name or alias, or create a new one if not found. else: - rq_dir = Path(name) if name else Path(alias) if alias else Path('default') - path_to_rq = rq_base_path / rq_dir + path_to_rq = validate_subdirectory(rq_base_path, name or alias or 'default') path_to_metadata = path_to_rq / METADATA_FILENAME # If the RQ directory exists, reconstruct the client from the metadata file. diff --git a/src/crawlee/storages/_storage_instance_manager.py b/src/crawlee/storages/_storage_instance_manager.py index 6e6c8c2359..05b6eeb081 100644 --- a/src/crawlee/storages/_storage_instance_manager.py +++ b/src/crawlee/storages/_storage_instance_manager.py @@ -10,7 +10,7 @@ from crawlee._utils.raise_if_too_many_kwargs import raise_if_too_many_kwargs from crawlee.storage_clients._base import DatasetClient, KeyValueStoreClient, RequestQueueClient -from ._utils import validate_storage_name +from ._utils import validate_storage_alias, validate_storage_name if TYPE_CHECKING: from ._base import Storage @@ -135,6 +135,10 @@ async def open_storage_instance( if name is not None: validate_storage_name(name) + # Validate storage alias + if alias is not None: + validate_storage_alias(alias) + # Acquire lock for this opener opener_lock_key = (cls, str(id or name or alias), storage_client_cache_key) if not (lock := self._opener_locks.get(opener_lock_key)): diff --git a/src/crawlee/storages/_utils.py b/src/crawlee/storages/_utils.py index 17e1fcc55c..2bfcab143b 100644 --- a/src/crawlee/storages/_utils.py +++ b/src/crawlee/storages/_utils.py @@ -9,3 +9,20 @@ def validate_storage_name(name: str | None) -> None: f'Invalid storage name "{name}". Name can only contain letters "a" through "z", the digits "0" through' '"9", and the hyphen ("-") but only in the middle of the string (e.g. "my-value-1")' ) + + +def validate_storage_alias(alias: str | None) -> None: + """Validate a storage alias that is used as an on-disk subdirectory name. + + Unlike storage names, aliases may contain underscores and dots (e.g. the reserved `__default__` alias), + but must not contain path separators or parent-directory references, so an alias always maps to a single + directory under the storage directory. + """ + if alias is None: + return + + if not alias or '/' in alias or '\\' in alias or '\x00' in alias or alias in {'.', '..'}: + raise ValueError( + f'Invalid storage alias "{alias}". Alias must not be empty, contain path separators ("/", "\\") or ' + f'null bytes, or be a directory reference (".", "..").' + ) diff --git a/tests/unit/_utils/test_file.py b/tests/unit/_utils/test_file.py index c00618b600..91764d6df8 100644 --- a/tests/unit/_utils/test_file.py +++ b/tests/unit/_utils/test_file.py @@ -1,8 +1,14 @@ from __future__ import annotations from datetime import datetime, timezone +from typing import TYPE_CHECKING -from crawlee._utils.file import json_dumps +import pytest + +from crawlee._utils.file import json_dumps, validate_subdirectory + +if TYPE_CHECKING: + from pathlib import Path async def test_json_dumps() -> None: @@ -11,3 +17,41 @@ async def test_json_dumps() -> None: assert await json_dumps('string') == '"string"' assert await json_dumps(123) == '123' assert await json_dumps(datetime(2022, 1, 1, tzinfo=timezone.utc)) == '"2022-01-01 00:00:00+00:00"' + + +# Tests for validate_subdirectory (storage name/alias directory validation). + + +@pytest.mark.parametrize( + 'subdirectory', + [ + pytest.param('my-store', id='simple'), + pytest.param('store_with_underscores', id='underscores'), + pytest.param('store.with.dots', id='dots'), + pytest.param('__default__', id='reserved-default'), + pytest.param('nested/inside', id='nested-but-contained'), + ], +) +def test_validate_subdirectory_accepts_safe_segments(tmp_path: Path, subdirectory: str) -> None: + base_dir = tmp_path / 'key_value_stores' + result = validate_subdirectory(base_dir, subdirectory) + # The resolved path must stay within the base directory. + assert result.parent == base_dir or base_dir in result.parents + + +@pytest.mark.parametrize( + 'subdirectory', + [ + pytest.param('../outside', id='parent-ref'), + pytest.param('../../outside', id='deep-parent-ref'), + pytest.param('..', id='bare-parent'), + pytest.param('.', id='bare-current'), + pytest.param('a/../../outside', id='mixed-parent-ref'), + pytest.param('/etc/passwd', id='absolute-path'), + pytest.param('', id='empty'), + ], +) +def test_validate_subdirectory_rejects_invalid_segments(tmp_path: Path, subdirectory: str) -> None: + base_dir = tmp_path / 'key_value_stores' + with pytest.raises(ValueError, match='Invalid storage name or alias'): + validate_subdirectory(base_dir, subdirectory) diff --git a/tests/unit/storage_clients/_file_system/test_fs_dataset_client.py b/tests/unit/storage_clients/_file_system/test_fs_dataset_client.py index 3276ba2f0b..d7d2b4d64d 100644 --- a/tests/unit/storage_clients/_file_system/test_fs_dataset_client.py +++ b/tests/unit/storage_clients/_file_system/test_fs_dataset_client.py @@ -51,6 +51,33 @@ async def test_file_and_directory_creation(configuration: Configuration) -> None await client.drop() +@pytest.mark.parametrize( + 'invalid_value', + [ + pytest.param('../outside', id='parent-ref'), + pytest.param('..', id='bare-parent'), + pytest.param('/abs/outside', id='absolute-path'), + ], +) +async def test_open_rejects_invalid_name_or_alias( + configuration: Configuration, tmp_path: Path, invalid_value: str +) -> None: + """The low-level client must reject names/aliases that resolve outside the storage directory. + + This covers direct usage of the storage client, which bypasses the high-level validation. + """ + storage_client = FileSystemStorageClient() + + with pytest.raises(ValueError, match='Invalid storage name or alias'): + await storage_client.create_dataset_client(alias=invalid_value, configuration=configuration) + + with pytest.raises(ValueError, match='Invalid storage name or alias'): + await storage_client.create_dataset_client(name=invalid_value, configuration=configuration) + + # Nothing should have been written outside the configured storage directory. + assert not (tmp_path / 'outside').exists() + + async def test_file_persistence_and_content_verification(dataset_client: FileSystemDatasetClient) -> None: """Test that data is properly persisted to files with correct content.""" item = {'key': 'value', 'number': 42} diff --git a/tests/unit/storage_clients/_file_system/test_fs_kvs_client.py b/tests/unit/storage_clients/_file_system/test_fs_kvs_client.py index 5f2ae15da0..6866f59d05 100644 --- a/tests/unit/storage_clients/_file_system/test_fs_kvs_client.py +++ b/tests/unit/storage_clients/_file_system/test_fs_kvs_client.py @@ -49,6 +49,33 @@ async def test_file_and_directory_creation(configuration: Configuration) -> None await client.drop() +@pytest.mark.parametrize( + 'invalid_value', + [ + pytest.param('../outside', id='parent-ref'), + pytest.param('..', id='bare-parent'), + pytest.param('/abs/outside', id='absolute-path'), + ], +) +async def test_open_rejects_invalid_name_or_alias( + configuration: Configuration, tmp_path: Path, invalid_value: str +) -> None: + """The low-level client must reject names/aliases that resolve outside the storage directory. + + This covers direct usage of the storage client, which bypasses the high-level validation. + """ + storage_client = FileSystemStorageClient() + + with pytest.raises(ValueError, match='Invalid storage name or alias'): + await storage_client.create_kvs_client(alias=invalid_value, configuration=configuration) + + with pytest.raises(ValueError, match='Invalid storage name or alias'): + await storage_client.create_kvs_client(name=invalid_value, configuration=configuration) + + # Nothing should have been written outside the configured storage directory. + assert not (tmp_path / 'outside').exists() + + async def test_value_file_creation_and_content(kvs_client: FileSystemKeyValueStoreClient) -> None: """Test that values are properly persisted to files with correct content and metadata.""" test_key = 'test-key' diff --git a/tests/unit/storage_clients/_file_system/test_fs_rq_client.py b/tests/unit/storage_clients/_file_system/test_fs_rq_client.py index 275665d9d5..a3641f60bf 100644 --- a/tests/unit/storage_clients/_file_system/test_fs_rq_client.py +++ b/tests/unit/storage_clients/_file_system/test_fs_rq_client.py @@ -51,6 +51,33 @@ async def test_file_and_directory_creation() -> None: await client.drop() +@pytest.mark.parametrize( + 'invalid_value', + [ + pytest.param('../outside', id='parent-ref'), + pytest.param('..', id='bare-parent'), + pytest.param('/abs/outside', id='absolute-path'), + ], +) +async def test_open_rejects_invalid_name_or_alias( + configuration: Configuration, tmp_path: Path, invalid_value: str +) -> None: + """The low-level client must reject names/aliases that resolve outside the storage directory. + + This covers direct usage of the storage client, which bypasses the high-level validation. + """ + storage_client = FileSystemStorageClient() + + with pytest.raises(ValueError, match='Invalid storage name or alias'): + await storage_client.create_rq_client(alias=invalid_value, configuration=configuration) + + with pytest.raises(ValueError, match='Invalid storage name or alias'): + await storage_client.create_rq_client(name=invalid_value, configuration=configuration) + + # Nothing should have been written outside the configured storage directory. + assert not (tmp_path / 'outside').exists() + + async def test_request_file_persistence(rq_client: FileSystemRequestQueueClient) -> None: """Test that requests are properly persisted to files.""" requests = [ diff --git a/tests/unit/storages/test_dataset.py b/tests/unit/storages/test_dataset.py index 3b0a8e1d6e..1d55e250dc 100644 --- a/tests/unit/storages/test_dataset.py +++ b/tests/unit/storages/test_dataset.py @@ -1078,6 +1078,8 @@ async def test_name_default_not_allowed(storage_client: StorageClient) -> None: pytest.param('name with spaces', False, id='spaces'), pytest.param('-test', False, id='dashes start'), pytest.param('test-', False, id='dashes end'), + pytest.param('../outside', False, id='parent-ref'), + pytest.param('with/slash', False, id='slash'), ], ) async def test_validate_name(storage_client: StorageClient, name: str, *, is_valid: bool) -> None: @@ -1092,6 +1094,31 @@ async def test_validate_name(storage_client: StorageClient, name: str, *, is_val await Dataset.open(name=name, storage_client=storage_client) +@pytest.mark.parametrize( + ('alias', 'is_valid'), + [ + pytest.param('valid-alias', True, id='dashes'), + pytest.param('alias_with_underscores', True, id='underscores'), + pytest.param('alias.with.dots', True, id='dots'), + pytest.param('CamelCaseAlias', True, id='mixed-case'), + pytest.param('../outside', False, id='parent-ref'), + pytest.param('..', False, id='bare-parent'), + pytest.param('.', False, id='bare-current'), + pytest.param('nested/alias', False, id='slash'), + pytest.param('back\\slash', False, id='backslash'), + ], +) +async def test_validate_alias(storage_client: StorageClient, alias: str, *, is_valid: bool) -> None: + """Test alias validation logic, including rejection of values that would resolve outside the storage dir.""" + if is_valid: + # Should not raise. + dataset = await Dataset.open(alias=alias, storage_client=storage_client) + await dataset.drop() + else: + with pytest.raises(ValueError, match=r'Invalid storage alias'): + await Dataset.open(alias=alias, storage_client=storage_client) + + async def test_record_with_noascii_chars(dataset: Dataset) -> None: """Test handling record with non-ASCII characters.""" init_value = { diff --git a/tests/unit/storages/test_key_value_store.py b/tests/unit/storages/test_key_value_store.py index 5b789d2843..42467f3c5f 100644 --- a/tests/unit/storages/test_key_value_store.py +++ b/tests/unit/storages/test_key_value_store.py @@ -1080,6 +1080,8 @@ async def test_name_default_not_allowed(storage_client: StorageClient) -> None: pytest.param('name with spaces', False, id='spaces'), pytest.param('-test', False, id='dashes start'), pytest.param('test-', False, id='dashes end'), + pytest.param('../outside', False, id='parent-ref'), + pytest.param('with/slash', False, id='slash'), ], ) async def test_validate_name(storage_client: StorageClient, name: str, *, is_valid: bool) -> None: @@ -1094,6 +1096,31 @@ async def test_validate_name(storage_client: StorageClient, name: str, *, is_val await KeyValueStore.open(name=name, storage_client=storage_client) +@pytest.mark.parametrize( + ('alias', 'is_valid'), + [ + pytest.param('valid-alias', True, id='dashes'), + pytest.param('alias_with_underscores', True, id='underscores'), + pytest.param('alias.with.dots', True, id='dots'), + pytest.param('CamelCaseAlias', True, id='mixed-case'), + pytest.param('../outside', False, id='parent-ref'), + pytest.param('..', False, id='bare-parent'), + pytest.param('.', False, id='bare-current'), + pytest.param('nested/alias', False, id='slash'), + pytest.param('back\\slash', False, id='backslash'), + ], +) +async def test_validate_alias(storage_client: StorageClient, alias: str, *, is_valid: bool) -> None: + """Test alias validation logic, including rejection of values that would resolve outside the storage dir.""" + if is_valid: + # Should not raise. + kvs = await KeyValueStore.open(alias=alias, storage_client=storage_client) + await kvs.drop() + else: + with pytest.raises(ValueError, match=r'Invalid storage alias'): + await KeyValueStore.open(alias=alias, storage_client=storage_client) + + @pytest.mark.parametrize( 'tested_storage_client_class', [ diff --git a/tests/unit/storages/test_request_queue.py b/tests/unit/storages/test_request_queue.py index 56704ebbed..9df49ec79f 100644 --- a/tests/unit/storages/test_request_queue.py +++ b/tests/unit/storages/test_request_queue.py @@ -1471,6 +1471,8 @@ async def test_name_default_not_allowed(storage_client: StorageClient) -> None: pytest.param('name with spaces', False, id='spaces'), pytest.param('-test', False, id='dashes start'), pytest.param('test-', False, id='dashes end'), + pytest.param('../outside', False, id='parent-ref'), + pytest.param('with/slash', False, id='slash'), ], ) async def test_validate_name(storage_client: StorageClient, name: str, *, is_valid: bool) -> None: @@ -1485,6 +1487,31 @@ async def test_validate_name(storage_client: StorageClient, name: str, *, is_val await RequestQueue.open(name=name, storage_client=storage_client) +@pytest.mark.parametrize( + ('alias', 'is_valid'), + [ + pytest.param('valid-alias', True, id='dashes'), + pytest.param('alias_with_underscores', True, id='underscores'), + pytest.param('alias.with.dots', True, id='dots'), + pytest.param('CamelCaseAlias', True, id='mixed-case'), + pytest.param('../outside', False, id='parent-ref'), + pytest.param('..', False, id='bare-parent'), + pytest.param('.', False, id='bare-current'), + pytest.param('nested/alias', False, id='slash'), + pytest.param('back\\slash', False, id='backslash'), + ], +) +async def test_validate_alias(storage_client: StorageClient, alias: str, *, is_valid: bool) -> None: + """Test alias validation logic, including rejection of values that would resolve outside the storage dir.""" + if is_valid: + # Should not raise. + rq = await RequestQueue.open(alias=alias, storage_client=storage_client) + await rq.drop() + else: + with pytest.raises(ValueError, match=r'Invalid storage alias'): + await RequestQueue.open(alias=alias, storage_client=storage_client) + + async def test_reclaim_request_with_change_state(rq: RequestQueue) -> None: """Test reclaiming a request and changing its state.""" # Add a request