From a0d5143cb1f442faaa49ee294fb4b111cda9e0e6 Mon Sep 17 00:00:00 2001 From: dsmedia <63077097+dsmedia@users.noreply.github.com> Date: Fri, 8 May 2026 21:47:45 -0400 Subject: [PATCH 1/2] ci: run datapackage validation tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Wire the pytest suite from #782 into the existing Test workflow so descriptor/data drift fails CI rather than slipping through. Includes the slow tier (--run-slow) — the fast tier alone catches bytes/sha1 drift that npm run build would re-trip anyway, while the slow tier is the unique value-add catching schema-vs-data drift. Step is placed after npm run build so tests validate the freshly rebuilt datapackage.json (catches build_datapackage.py regressions, not just committed-state drift). Local timing: 290 passed, 2 xfailed in ~3m32s on WSL2 ARM (flights_3m is the long pole). The two xfails are the pre-existing allowlisted movies + flights_200k_arrow entries from _data/validate_datapackage.toml. Closes the follow-up commitment from #782 review (#782 (comment)). 
--- .github/workflows/test.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index db3679d4..43a2aaeb 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -40,3 +40,6 @@ jobs: run: npm ci - run: npm run build + + - name: Validate datapackage + run: uv run pytest tests/ -v --run-slow From 193298cba8495f3bbe425112450662137771fcb9 Mon Sep 17 00:00:00 2001 From: dsmedia <63077097+dsmedia@users.noreply.github.com> Date: Sun, 10 May 2026 19:33:56 -0400 Subject: [PATCH 2/2] =?UTF-8?q?ci:=20cap=20pytest=20at=20--limit-rows=2025?= =?UTF-8?q?0000=20and=20rename=20--run-slow=20=E2=86=92=20--runslow?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Per @domoritz on #784: flights_3m is the only resource above ~200K rows, so capping at 250K keeps every other dataset fully validated while cutting the Test job from ~4m43s to ~40s. Renamed --run-slow → --runslow to match the dominant Python convention (pytest docs example, ~1.3K GitHub hits vs ~500 for --run-slow). Updated CONTRIBUTING.md to reflect that the slow tier now runs in CI and added per-test docstrings naming each fast-tier failure mode. 
--- .github/workflows/test.yml | 2 +- CONTRIBUTING.md | 14 +++++++++----- pyproject.toml | 2 +- tests/conftest.py | 12 ++++++------ tests/test_datapackage.py | 12 ++++++++++-- 5 files changed, 27 insertions(+), 15 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 43a2aaeb..cb888cc7 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -42,4 +42,4 @@ jobs: - run: npm run build - name: Validate datapackage - run: uv run pytest tests/ -v --run-slow + run: uv run pytest tests/ -v --runslow --limit-rows 250000 diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index eea99a50..84dbf0e6 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -195,14 +195,18 @@ uv run --group dev pytest tests/ # Slow tier — frictionless schema and row validation per resource. # Default is full read; flights-3m.parquet (~3M rows) takes minutes. -uv run --group dev pytest tests/ --run-slow +uv run --group dev pytest tests/ --runslow -# Slow tier with a row cap — useful for quick iteration. -uv run --group dev pytest tests/ --run-slow --limit-rows 100000 +# Slow tier with a row cap — matches what CI runs; lower for tighter iteration. +uv run --group dev pytest tests/ --runslow --limit-rows 250000 ``` -Not run in CI. The slow tier is the comprehensive validation step; the -fast tier alone does not exercise frictionless schemas. +CI runs the slow tier with `--limit-rows 250000`: only `flights_3m` +(~3M rows) is capped; every other resource is below the cap and +validates in full. The fast tier runs in the same invocation +(`--runslow` only un-skips slow tests), after `npm run build` has +regenerated `datapackage.json` from on-disk data, so byte/hash drift +surfaces either in the build or in the tests. 
Resources whose schema/row failures are known and non-actionable (for example, `movies` whose schema is intentionally aspirational, or diff --git a/pyproject.toml b/pyproject.toml index bcd657c2..aa5ca922 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -29,7 +29,7 @@ geo-species = [ [tool.pytest.ini_options] markers = [ - "slow: full schema/row validation via frictionless; opt in with --run-slow", + "slow: full schema/row validation via frictionless; opt in with --runslow", ] testpaths = ["tests"] diff --git a/tests/conftest.py b/tests/conftest.py index 8d2c5f8f..3b036b36 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,5 +1,5 @@ """ -Pytest config: ``--run-slow`` and ``--limit-rows`` CLI options. +Pytest config: ``--runslow`` and ``--limit-rows`` CLI options. The ``slow`` marker is registered in ``pyproject.toml`` (``[tool.pytest.ini_options].markers``), matching the convention used in @@ -16,7 +16,7 @@ def pytest_addoption(parser: pytest.Parser) -> None: parser.addoption( - "--run-slow", + "--runslow", action="store_true", default=False, help="Run @pytest.mark.slow tests (frictionless schema/row validation).", @@ -26,7 +26,7 @@ def pytest_addoption(parser: pytest.Parser) -> None: type=int, default=None, help=( - "Cap row reads in --run-slow tests at N rows per resource. " + "Cap row reads in --runslow tests at N rows per resource. " "Default is unlimited (full read). Use a small N for quick " "iteration; flights-3m takes minutes at full read." 
), @@ -41,10 +41,10 @@ def schema_limit_rows(request: pytest.FixtureRequest) -> int | None: def pytest_collection_modifyitems( config: pytest.Config, items: list[pytest.Item] ) -> None: - """Skip ``slow`` items unless ``--run-slow`` was passed.""" - if config.getoption("--run-slow"): + """Skip ``slow`` items unless ``--runslow`` was passed.""" + if config.getoption("--runslow"): return - skip_slow = pytest.mark.skip(reason="opt in with --run-slow") + skip_slow = pytest.mark.skip(reason="opt in with --runslow") for item in items: if "slow" in item.keywords: item.add_marker(skip_slow) diff --git a/tests/test_datapackage.py b/tests/test_datapackage.py index 214f21cf..5f5a1b10 100644 --- a/tests/test_datapackage.py +++ b/tests/test_datapackage.py @@ -9,9 +9,9 @@ tabular JSON / arrow / parquet; hash-count supports only md5 and sha256, descriptor uses sha1). -* Slow (``pytest --run-slow``) — frictionless schema and row validation +* Slow (``pytest --runslow``) — frictionless schema and row validation per resource. Multi-minute on flights-3m at full read; opt in via the - ``--run-slow`` flag and pass ``--limit-rows N`` to cap row reads + ``--runslow`` flag and pass ``--limit-rows N`` to cap row reads during iteration. Default is full read. 
Resources whose schema/row check is known-broken upstream (``movies`` @@ -80,6 +80,7 @@ def git_blob_sha1(path: Path) -> str: @pytest.mark.parametrize("resource", _RESOURCES, ids=_RESOURCE_IDS) def test_file_exists(resource: dict) -> None: + """Catch descriptors that point at a missing or relocated data file.""" assert "path" in resource, ( f"descriptor regression: resource {resource.get('name')!r} has no 'path'" ) @@ -89,6 +90,7 @@ def test_file_exists(resource: dict) -> None: @pytest.mark.parametrize("resource", _RESOURCES, ids=_RESOURCE_IDS) def test_bytes_match(resource: dict) -> None: + """Catch on-disk edits where `bytes` in the descriptor wasn't regenerated.""" assert "bytes" in resource, ( f"descriptor regression: 'bytes' missing for {resource['name']!r}" ) @@ -102,6 +104,12 @@ def test_bytes_match(resource: dict) -> None: @pytest.mark.parametrize("resource", _RESOURCES, ids=_RESOURCE_IDS) def test_sha1_matches_git_blob(resource: dict) -> None: + """ + Catch on-disk edits where `hash` in the descriptor wasn't regenerated. + + Uses git's blob SHA-1 so the recorded hash matches `git ls-tree` — + catches edits that change content without changing file size. + """ declared = resource.get("hash", "") assert declared, f"descriptor regression: 'hash' missing for {resource['name']!r}" assert declared.startswith("sha1:"), (