diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index db3679d4..cb888cc7 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -40,3 +40,6 @@ jobs: run: npm ci - run: npm run build + + - name: Validate datapackage + run: uv run pytest tests/ -v --runslow --limit-rows 250000 diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index eea99a50..84dbf0e6 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -195,14 +195,18 @@ uv run --group dev pytest tests/ # Slow tier — frictionless schema and row validation per resource. # Default is full read; flights-3m.parquet (~3M rows) takes minutes. -uv run --group dev pytest tests/ --run-slow +uv run --group dev pytest tests/ --runslow -# Slow tier with a row cap — useful for quick iteration. -uv run --group dev pytest tests/ --run-slow --limit-rows 100000 +# Slow tier with a row cap — matches what CI runs; lower for tighter iteration. +uv run --group dev pytest tests/ --runslow --limit-rows 250000 ``` -Not run in CI. The slow tier is the comprehensive validation step; the -fast tier alone does not exercise frictionless schemas. +CI runs the slow tier with `--limit-rows 250000`: `flights_3m`'s ~3M +rows are read only up to the cap; every other resource is below the cap +and validates in full. The fast tier runs in the same pytest invocation +(`--runslow` only un-skips slow tests), and `npm run build` regenerates +`datapackage.json` from on-disk data before pytest runs, so any byte/hash +drift would surface either there or in the slow tier's schema validation. 
Resources whose schema/row failures are known and non-actionable (for example, `movies` whose schema is intentionally aspirational, or diff --git a/pyproject.toml b/pyproject.toml index bcd657c2..aa5ca922 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -29,7 +29,7 @@ geo-species = [ [tool.pytest.ini_options] markers = [ - "slow: full schema/row validation via frictionless; opt in with --run-slow", + "slow: full schema/row validation via frictionless; opt in with --runslow", ] testpaths = ["tests"] diff --git a/tests/conftest.py b/tests/conftest.py index 8d2c5f8f..3b036b36 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,5 +1,5 @@ """ -Pytest config: ``--run-slow`` and ``--limit-rows`` CLI options. +Pytest config: ``--runslow`` and ``--limit-rows`` CLI options. The ``slow`` marker is registered in ``pyproject.toml`` (``[tool.pytest.ini_options].markers``), matching the convention used in @@ -16,7 +16,7 @@ def pytest_addoption(parser: pytest.Parser) -> None: parser.addoption( - "--run-slow", + "--runslow", action="store_true", default=False, help="Run @pytest.mark.slow tests (frictionless schema/row validation).", @@ -26,7 +26,7 @@ def pytest_addoption(parser: pytest.Parser) -> None: type=int, default=None, help=( - "Cap row reads in --run-slow tests at N rows per resource. " + "Cap row reads in --runslow tests at N rows per resource. " "Default is unlimited (full read). Use a small N for quick " "iteration; flights-3m takes minutes at full read." 
), @@ -41,10 +41,10 @@ def schema_limit_rows(request: pytest.FixtureRequest) -> int | None: def pytest_collection_modifyitems( config: pytest.Config, items: list[pytest.Item] ) -> None: - """Skip ``slow`` items unless ``--run-slow`` was passed.""" - if config.getoption("--run-slow"): + """Skip ``slow`` items unless ``--runslow`` was passed.""" + if config.getoption("--runslow"): return - skip_slow = pytest.mark.skip(reason="opt in with --run-slow") + skip_slow = pytest.mark.skip(reason="opt in with --runslow") for item in items: if "slow" in item.keywords: item.add_marker(skip_slow) diff --git a/tests/test_datapackage.py b/tests/test_datapackage.py index 214f21cf..5f5a1b10 100644 --- a/tests/test_datapackage.py +++ b/tests/test_datapackage.py @@ -9,9 +9,9 @@ tabular JSON / arrow / parquet; hash-count supports only md5 and sha256, descriptor uses sha1). -* Slow (``pytest --run-slow``) — frictionless schema and row validation +* Slow (``pytest --runslow``) — frictionless schema and row validation per resource. Multi-minute on flights-3m at full read; opt in via the - ``--run-slow`` flag and pass ``--limit-rows N`` to cap row reads + ``--runslow`` flag and pass ``--limit-rows N`` to cap row reads during iteration. Default is full read. 
Resources whose schema/row check is known-broken upstream (``movies`` @@ -80,6 +80,7 @@ def git_blob_sha1(path: Path) -> str: @pytest.mark.parametrize("resource", _RESOURCES, ids=_RESOURCE_IDS) def test_file_exists(resource: dict) -> None: + """Catch descriptors that point at a missing or relocated data file.""" assert "path" in resource, ( f"descriptor regression: resource {resource.get('name')!r} has no 'path'" ) @@ -89,6 +90,7 @@ def test_file_exists(resource: dict) -> None: @pytest.mark.parametrize("resource", _RESOURCES, ids=_RESOURCE_IDS) def test_bytes_match(resource: dict) -> None: + """Catch on-disk edits where `bytes` in the descriptor wasn't regenerated.""" assert "bytes" in resource, ( f"descriptor regression: 'bytes' missing for {resource['name']!r}" ) @@ -102,6 +104,12 @@ def test_bytes_match(resource: dict) -> None: @pytest.mark.parametrize("resource", _RESOURCES, ids=_RESOURCE_IDS) def test_sha1_matches_git_blob(resource: dict) -> None: + """ + Catch on-disk edits where `hash` in the descriptor wasn't regenerated. + + Uses git's blob SHA-1 so the recorded hash matches `git ls-tree` — + catches edits that change content without changing file size. + """ declared = resource.get("hash", "") assert declared, f"descriptor regression: 'hash' missing for {resource['name']!r}" assert declared.startswith("sha1:"), (