Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -40,3 +40,6 @@ jobs:
run: npm ci

- run: npm run build

- name: Validate datapackage
run: uv run pytest tests/ -v --runslow --limit-rows 250000
14 changes: 9 additions & 5 deletions CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -195,14 +195,18 @@ uv run --group dev pytest tests/

# Slow tier — frictionless schema and row validation per resource.
# Default is full read; flights-3m.parquet (~3M rows) takes minutes.
uv run --group dev pytest tests/ --run-slow
uv run --group dev pytest tests/ --runslow

# Slow tier with a row cap — useful for quick iteration.
uv run --group dev pytest tests/ --run-slow --limit-rows 100000
# Slow tier with a row cap — matches what CI runs; lower the cap for faster iteration.
uv run --group dev pytest tests/ --runslow --limit-rows 250000
```

Not run in CI. The slow tier is the comprehensive validation step; the
fast tier alone does not exercise frictionless schemas.
CI runs the slow tier with `--limit-rows 250000`: row reads for
`flights_3m` (~3M rows) stop at the cap, while every other resource is
below the cap and validates in full. The fast tier is implicitly covered too — `npm run build`
regenerates `datapackage.json` from on-disk data before the slow tier
runs, so any byte/hash drift would surface either there or in the slow
tier's schema validation.

Resources whose schema/row failures are known and non-actionable (for
example, `movies` whose schema is intentionally aspirational, or
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ geo-species = [

[tool.pytest.ini_options]
markers = [
"slow: full schema/row validation via frictionless; opt in with --run-slow",
"slow: full schema/row validation via frictionless; opt in with --runslow",
]
testpaths = ["tests"]

Expand Down
12 changes: 6 additions & 6 deletions tests/conftest.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
"""
Pytest config: ``--run-slow`` and ``--limit-rows`` CLI options.
Pytest config: ``--runslow`` and ``--limit-rows`` CLI options.

The ``slow`` marker is registered in ``pyproject.toml``
(``[tool.pytest.ini_options].markers``), matching the convention used in
Expand All @@ -16,7 +16,7 @@

def pytest_addoption(parser: pytest.Parser) -> None:
parser.addoption(
"--run-slow",
"--runslow",
action="store_true",
default=False,
help="Run @pytest.mark.slow tests (frictionless schema/row validation).",
Expand All @@ -26,7 +26,7 @@ def pytest_addoption(parser: pytest.Parser) -> None:
type=int,
default=None,
help=(
"Cap row reads in --run-slow tests at N rows per resource. "
"Cap row reads in --runslow tests at N rows per resource. "
"Default is unlimited (full read). Use a small N for quick "
"iteration; flights-3m takes minutes at full read."
),
Expand All @@ -41,10 +41,10 @@ def schema_limit_rows(request: pytest.FixtureRequest) -> int | None:
def pytest_collection_modifyitems(
config: pytest.Config, items: list[pytest.Item]
) -> None:
"""Skip ``slow`` items unless ``--run-slow`` was passed."""
if config.getoption("--run-slow"):
"""Skip ``slow`` items unless ``--runslow`` was passed."""
if config.getoption("--runslow"):
return
skip_slow = pytest.mark.skip(reason="opt in with --run-slow")
skip_slow = pytest.mark.skip(reason="opt in with --runslow")
for item in items:
if "slow" in item.keywords:
item.add_marker(skip_slow)
12 changes: 10 additions & 2 deletions tests/test_datapackage.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,9 @@
tabular JSON / arrow / parquet; hash-count supports only md5 and
sha256, descriptor uses sha1).

* Slow (``pytest --run-slow``) — frictionless schema and row validation
* Slow (``pytest --runslow``) — frictionless schema and row validation
per resource. Multi-minute on flights-3m at full read; opt in via the
``--run-slow`` flag and pass ``--limit-rows N`` to cap row reads
``--runslow`` flag and pass ``--limit-rows N`` to cap row reads
during iteration. Default is full read.

Resources whose schema/row check is known-broken upstream (``movies``
Expand Down Expand Up @@ -80,6 +80,7 @@ def git_blob_sha1(path: Path) -> str:

@pytest.mark.parametrize("resource", _RESOURCES, ids=_RESOURCE_IDS)
def test_file_exists(resource: dict) -> None:
"""Catch descriptors that point at a missing or relocated data file."""
assert "path" in resource, (
f"descriptor regression: resource {resource.get('name')!r} has no 'path'"
)
Expand All @@ -89,6 +90,7 @@ def test_file_exists(resource: dict) -> None:

@pytest.mark.parametrize("resource", _RESOURCES, ids=_RESOURCE_IDS)
def test_bytes_match(resource: dict) -> None:
"""Catch on-disk edits where `bytes` in the descriptor wasn't regenerated."""
assert "bytes" in resource, (
f"descriptor regression: 'bytes' missing for {resource['name']!r}"
)
Expand All @@ -102,6 +104,12 @@ def test_bytes_match(resource: dict) -> None:

@pytest.mark.parametrize("resource", _RESOURCES, ids=_RESOURCE_IDS)
def test_sha1_matches_git_blob(resource: dict) -> None:
"""
Catch on-disk edits where `hash` in the descriptor wasn't regenerated.

Uses git's blob SHA-1 so the recorded hash matches `git ls-tree` —
catches edits that change content without changing file size.
"""
declared = resource.get("hash", "")
assert declared, f"descriptor regression: 'hash' missing for {resource['name']!r}"
assert declared.startswith("sha1:"), (
Expand Down