Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 31 additions & 0 deletions src/crawlee/_utils/file.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,37 @@ def _write_file(path: Path, data: str | bytes) -> None:
raise


def validate_subdirectory(base_dir: Path, subdirectory: str) -> Path:
"""Resolve a storage subdirectory inside a base directory.

Joins `subdirectory` onto `base_dir` and verifies that the result stays inside `base_dir`, so a storage
name or alias always maps to a directory under the storage directory rather than somewhere else (e.g. a
value containing `..` or an absolute path).

Args:
base_dir: The base storage directory (e.g. the `key_value_stores` directory).
subdirectory: The storage name or alias to use as the subdirectory.

Returns:
The validated full path to the storage subdirectory.

Raises:
ValueError: If the resolved path would fall outside `base_dir`.
"""
# Normalize lexically (no filesystem access), so symlinks are not followed and the check is deterministic.
base_resolved = Path(os.path.normpath(base_dir))
target_resolved = Path(os.path.normpath(base_dir / subdirectory))

# The target must be strictly inside the base directory - reject parent references and absolute paths.
if target_resolved == base_resolved or base_resolved not in target_resolved.parents:
raise ValueError(
f'Invalid storage name or alias "{subdirectory}". It must not contain path separators, parent '
f'directory references ("..") or absolute paths that resolve outside the storage directory.'
)

return target_resolved


def infer_mime_type(value: Any) -> str:
"""Infer the MIME content type from the value.

Expand Down
5 changes: 2 additions & 3 deletions src/crawlee/storage_clients/_file_system/_dataset_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
from crawlee._consts import METADATA_FILENAME
from crawlee._types import JsonSerializable
from crawlee._utils.crypto import crypto_random_object_id
from crawlee._utils.file import atomic_write, json_dumps
from crawlee._utils.file import atomic_write, json_dumps, validate_subdirectory
from crawlee._utils.raise_if_too_many_kwargs import raise_if_too_many_kwargs
from crawlee.storage_clients._base import DatasetClient
from crawlee.storage_clients.models import DatasetItemsListPage, DatasetMetadata
Expand Down Expand Up @@ -159,8 +159,7 @@ async def open(

# Get a new instance by name or alias.
else:
dataset_dir = Path(name) if name else Path(alias) if alias else Path('default')
path_to_dataset = dataset_base_path / dataset_dir
path_to_dataset = validate_subdirectory(dataset_base_path, name or alias or 'default')
path_to_metadata = path_to_dataset / METADATA_FILENAME

# If the dataset directory exists, reconstruct the client from the metadata file.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@

from crawlee._consts import METADATA_FILENAME
from crawlee._utils.crypto import crypto_random_object_id
from crawlee._utils.file import atomic_write, infer_mime_type, json_dumps
from crawlee._utils.file import atomic_write, infer_mime_type, json_dumps, validate_subdirectory
from crawlee._utils.raise_if_too_many_kwargs import raise_if_too_many_kwargs
from crawlee.storage_clients._base import KeyValueStoreClient
from crawlee.storage_clients.models import KeyValueStoreMetadata, KeyValueStoreRecord, KeyValueStoreRecordMetadata
Expand Down Expand Up @@ -157,8 +157,7 @@ async def open(

# Get a new instance by name or alias.
else:
kvs_dir = Path(name) if name else Path(alias) if alias else Path('default')
path_to_kvs = kvs_base_path / kvs_dir
path_to_kvs = validate_subdirectory(kvs_base_path, name or alias or 'default')
path_to_metadata = path_to_kvs / METADATA_FILENAME

# If the key-value store directory exists, reconstruct the client from the metadata file.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
from crawlee import Request
from crawlee._consts import METADATA_FILENAME
from crawlee._utils.crypto import crypto_random_object_id
from crawlee._utils.file import atomic_write, json_dumps
from crawlee._utils.file import atomic_write, json_dumps, validate_subdirectory
from crawlee._utils.raise_if_too_many_kwargs import raise_if_too_many_kwargs
from crawlee._utils.recoverable_state import RecoverableState
from crawlee.storage_clients._base import RequestQueueClient
Expand Down Expand Up @@ -227,8 +227,7 @@ async def open(

# Open an existing RQ by its name or alias, or create a new one if not found.
else:
rq_dir = Path(name) if name else Path(alias) if alias else Path('default')
path_to_rq = rq_base_path / rq_dir
path_to_rq = validate_subdirectory(rq_base_path, name or alias or 'default')
path_to_metadata = path_to_rq / METADATA_FILENAME

# If the RQ directory exists, reconstruct the client from the metadata file.
Expand Down
6 changes: 5 additions & 1 deletion src/crawlee/storages/_storage_instance_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from crawlee._utils.raise_if_too_many_kwargs import raise_if_too_many_kwargs
from crawlee.storage_clients._base import DatasetClient, KeyValueStoreClient, RequestQueueClient

from ._utils import validate_storage_name
from ._utils import validate_storage_alias, validate_storage_name

if TYPE_CHECKING:
from ._base import Storage
Expand Down Expand Up @@ -135,6 +135,10 @@ async def open_storage_instance(
if name is not None:
validate_storage_name(name)

# Validate storage alias
if alias is not None:
validate_storage_alias(alias)

# Acquire lock for this opener
opener_lock_key = (cls, str(id or name or alias), storage_client_cache_key)
if not (lock := self._opener_locks.get(opener_lock_key)):
Expand Down
17 changes: 17 additions & 0 deletions src/crawlee/storages/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,20 @@ def validate_storage_name(name: str | None) -> None:
f'Invalid storage name "{name}". Name can only contain letters "a" through "z", the digits "0" through'
'"9", and the hyphen ("-") but only in the middle of the string (e.g. "my-value-1")'
)


def validate_storage_alias(alias: str | None) -> None:
"""Validate a storage alias that is used as an on-disk subdirectory name.

Unlike storage names, aliases may contain underscores and dots (e.g. the reserved `__default__` alias),
but must not contain path separators or parent-directory references, so an alias always maps to a single
directory under the storage directory.
"""
if alias is None:
return

if not alias or '/' in alias or '\\' in alias or '\x00' in alias or alias in {'.', '..'}:
raise ValueError(
f'Invalid storage alias "{alias}". Alias must not be empty, contain path separators ("/", "\\") or '
f'null bytes, or be a directory reference (".", "..").'
)
46 changes: 45 additions & 1 deletion tests/unit/_utils/test_file.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,14 @@
from __future__ import annotations

from datetime import datetime, timezone
from typing import TYPE_CHECKING

from crawlee._utils.file import json_dumps
import pytest

from crawlee._utils.file import json_dumps, validate_subdirectory

if TYPE_CHECKING:
from pathlib import Path


async def test_json_dumps() -> None:
Expand All @@ -11,3 +17,41 @@ async def test_json_dumps() -> None:
assert await json_dumps('string') == '"string"'
assert await json_dumps(123) == '123'
assert await json_dumps(datetime(2022, 1, 1, tzinfo=timezone.utc)) == '"2022-01-01 00:00:00+00:00"'


# Tests for validate_subdirectory (storage name/alias directory validation).


@pytest.mark.parametrize(
'subdirectory',
[
pytest.param('my-store', id='simple'),
pytest.param('store_with_underscores', id='underscores'),
pytest.param('store.with.dots', id='dots'),
pytest.param('__default__', id='reserved-default'),
pytest.param('nested/inside', id='nested-but-contained'),
],
)
def test_validate_subdirectory_accepts_safe_segments(tmp_path: Path, subdirectory: str) -> None:
base_dir = tmp_path / 'key_value_stores'
result = validate_subdirectory(base_dir, subdirectory)
# The resolved path must stay within the base directory.
assert result.parent == base_dir or base_dir in result.parents


@pytest.mark.parametrize(
'subdirectory',
[
pytest.param('../outside', id='parent-ref'),
pytest.param('../../outside', id='deep-parent-ref'),
pytest.param('..', id='bare-parent'),
pytest.param('.', id='bare-current'),
pytest.param('a/../../outside', id='mixed-parent-ref'),
pytest.param('/etc/passwd', id='absolute-path'),
pytest.param('', id='empty'),
],
)
def test_validate_subdirectory_rejects_invalid_segments(tmp_path: Path, subdirectory: str) -> None:
base_dir = tmp_path / 'key_value_stores'
with pytest.raises(ValueError, match='Invalid storage name or alias'):
validate_subdirectory(base_dir, subdirectory)
27 changes: 27 additions & 0 deletions tests/unit/storage_clients/_file_system/test_fs_dataset_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,33 @@ async def test_file_and_directory_creation(configuration: Configuration) -> None
await client.drop()


@pytest.mark.parametrize(
'invalid_value',
[
pytest.param('../outside', id='parent-ref'),
pytest.param('..', id='bare-parent'),
pytest.param('/abs/outside', id='absolute-path'),
],
)
async def test_open_rejects_invalid_name_or_alias(
configuration: Configuration, tmp_path: Path, invalid_value: str
) -> None:
"""The low-level client must reject names/aliases that resolve outside the storage directory.

This covers direct usage of the storage client, which bypasses the high-level validation.
"""
storage_client = FileSystemStorageClient()

with pytest.raises(ValueError, match='Invalid storage name or alias'):
await storage_client.create_dataset_client(alias=invalid_value, configuration=configuration)

with pytest.raises(ValueError, match='Invalid storage name or alias'):
await storage_client.create_dataset_client(name=invalid_value, configuration=configuration)

# Nothing should have been written outside the configured storage directory.
assert not (tmp_path / 'outside').exists()


async def test_file_persistence_and_content_verification(dataset_client: FileSystemDatasetClient) -> None:
"""Test that data is properly persisted to files with correct content."""
item = {'key': 'value', 'number': 42}
Expand Down
27 changes: 27 additions & 0 deletions tests/unit/storage_clients/_file_system/test_fs_kvs_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,33 @@ async def test_file_and_directory_creation(configuration: Configuration) -> None
await client.drop()


@pytest.mark.parametrize(
'invalid_value',
[
pytest.param('../outside', id='parent-ref'),
pytest.param('..', id='bare-parent'),
pytest.param('/abs/outside', id='absolute-path'),
],
)
async def test_open_rejects_invalid_name_or_alias(
configuration: Configuration, tmp_path: Path, invalid_value: str
) -> None:
"""The low-level client must reject names/aliases that resolve outside the storage directory.

This covers direct usage of the storage client, which bypasses the high-level validation.
"""
storage_client = FileSystemStorageClient()

with pytest.raises(ValueError, match='Invalid storage name or alias'):
await storage_client.create_kvs_client(alias=invalid_value, configuration=configuration)

with pytest.raises(ValueError, match='Invalid storage name or alias'):
await storage_client.create_kvs_client(name=invalid_value, configuration=configuration)

# Nothing should have been written outside the configured storage directory.
assert not (tmp_path / 'outside').exists()


async def test_value_file_creation_and_content(kvs_client: FileSystemKeyValueStoreClient) -> None:
"""Test that values are properly persisted to files with correct content and metadata."""
test_key = 'test-key'
Expand Down
27 changes: 27 additions & 0 deletions tests/unit/storage_clients/_file_system/test_fs_rq_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,33 @@ async def test_file_and_directory_creation() -> None:
await client.drop()


@pytest.mark.parametrize(
'invalid_value',
[
pytest.param('../outside', id='parent-ref'),
pytest.param('..', id='bare-parent'),
pytest.param('/abs/outside', id='absolute-path'),
],
)
async def test_open_rejects_invalid_name_or_alias(
configuration: Configuration, tmp_path: Path, invalid_value: str
) -> None:
"""The low-level client must reject names/aliases that resolve outside the storage directory.

This covers direct usage of the storage client, which bypasses the high-level validation.
"""
storage_client = FileSystemStorageClient()

with pytest.raises(ValueError, match='Invalid storage name or alias'):
await storage_client.create_rq_client(alias=invalid_value, configuration=configuration)

with pytest.raises(ValueError, match='Invalid storage name or alias'):
await storage_client.create_rq_client(name=invalid_value, configuration=configuration)

# Nothing should have been written outside the configured storage directory.
assert not (tmp_path / 'outside').exists()


async def test_request_file_persistence(rq_client: FileSystemRequestQueueClient) -> None:
"""Test that requests are properly persisted to files."""
requests = [
Expand Down
27 changes: 27 additions & 0 deletions tests/unit/storages/test_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -1078,6 +1078,8 @@ async def test_name_default_not_allowed(storage_client: StorageClient) -> None:
pytest.param('name with spaces', False, id='spaces'),
pytest.param('-test', False, id='dashes start'),
pytest.param('test-', False, id='dashes end'),
pytest.param('../outside', False, id='parent-ref'),
pytest.param('with/slash', False, id='slash'),
],
)
async def test_validate_name(storage_client: StorageClient, name: str, *, is_valid: bool) -> None:
Expand All @@ -1092,6 +1094,31 @@ async def test_validate_name(storage_client: StorageClient, name: str, *, is_val
await Dataset.open(name=name, storage_client=storage_client)


@pytest.mark.parametrize(
('alias', 'is_valid'),
[
pytest.param('valid-alias', True, id='dashes'),
pytest.param('alias_with_underscores', True, id='underscores'),
pytest.param('alias.with.dots', True, id='dots'),
pytest.param('CamelCaseAlias', True, id='mixed-case'),
pytest.param('../outside', False, id='parent-ref'),
pytest.param('..', False, id='bare-parent'),
pytest.param('.', False, id='bare-current'),
pytest.param('nested/alias', False, id='slash'),
pytest.param('back\\slash', False, id='backslash'),
],
)
async def test_validate_alias(storage_client: StorageClient, alias: str, *, is_valid: bool) -> None:
"""Test alias validation logic, including rejection of values that would resolve outside the storage dir."""
if is_valid:
# Should not raise.
dataset = await Dataset.open(alias=alias, storage_client=storage_client)
await dataset.drop()
else:
with pytest.raises(ValueError, match=r'Invalid storage alias'):
await Dataset.open(alias=alias, storage_client=storage_client)


async def test_record_with_noascii_chars(dataset: Dataset) -> None:
"""Test handling record with non-ASCII characters."""
init_value = {
Expand Down
27 changes: 27 additions & 0 deletions tests/unit/storages/test_key_value_store.py
Original file line number Diff line number Diff line change
Expand Up @@ -1080,6 +1080,8 @@ async def test_name_default_not_allowed(storage_client: StorageClient) -> None:
pytest.param('name with spaces', False, id='spaces'),
pytest.param('-test', False, id='dashes start'),
pytest.param('test-', False, id='dashes end'),
pytest.param('../outside', False, id='parent-ref'),
pytest.param('with/slash', False, id='slash'),
],
)
async def test_validate_name(storage_client: StorageClient, name: str, *, is_valid: bool) -> None:
Expand All @@ -1094,6 +1096,31 @@ async def test_validate_name(storage_client: StorageClient, name: str, *, is_val
await KeyValueStore.open(name=name, storage_client=storage_client)


@pytest.mark.parametrize(
('alias', 'is_valid'),
[
pytest.param('valid-alias', True, id='dashes'),
pytest.param('alias_with_underscores', True, id='underscores'),
pytest.param('alias.with.dots', True, id='dots'),
pytest.param('CamelCaseAlias', True, id='mixed-case'),
pytest.param('../outside', False, id='parent-ref'),
pytest.param('..', False, id='bare-parent'),
pytest.param('.', False, id='bare-current'),
pytest.param('nested/alias', False, id='slash'),
pytest.param('back\\slash', False, id='backslash'),
],
)
async def test_validate_alias(storage_client: StorageClient, alias: str, *, is_valid: bool) -> None:
"""Test alias validation logic, including rejection of values that would resolve outside the storage dir."""
if is_valid:
# Should not raise.
kvs = await KeyValueStore.open(alias=alias, storage_client=storage_client)
await kvs.drop()
else:
with pytest.raises(ValueError, match=r'Invalid storage alias'):
await KeyValueStore.open(alias=alias, storage_client=storage_client)


@pytest.mark.parametrize(
'tested_storage_client_class',
[
Expand Down
Loading
Loading