From a0d5143cb1f442faaa49ee294fb4b111cda9e0e6 Mon Sep 17 00:00:00 2001
From: dsmedia <63077097+dsmedia@users.noreply.github.com>
Date: Fri, 8 May 2026 21:47:45 -0400
Subject: [PATCH 1/2] ci: run datapackage validation tests
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Wire the pytest suite from #782 into the existing Test workflow so
descriptor/data drift fails CI rather than slipping through. Includes
the slow tier (--run-slow): the fast tier alone only catches bytes/sha1
drift, which npm run build would expose anyway, while the slow tier
adds the unique value of catching schema-vs-data drift.

Step is placed after npm run build so tests validate the freshly
rebuilt datapackage.json (catches build_datapackage.py regressions,
not just committed-state drift).

Local timing: 290 passed, 2 xfailed in ~3m32s on WSL2 ARM (flights_3m
is the long pole). The two xfails are the pre-existing allowlisted
movies + flights_200k_arrow entries from _data/validate_datapackage.toml.

Closes the follow-up commitment made during the #782 review.
---
 .github/workflows/test.yml | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index db3679d4..43a2aaeb 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -40,3 +40,6 @@ jobs:
         run: npm ci
 
       - run: npm run build
+
+      - name: Validate datapackage
+        run: uv run pytest tests/ -v --run-slow

From 193298cba8495f3bbe425112450662137771fcb9 Mon Sep 17 00:00:00 2001
From: dsmedia <63077097+dsmedia@users.noreply.github.com>
Date: Sun, 10 May 2026 19:33:56 -0400
Subject: [PATCH 2/2] =?UTF-8?q?ci:=20cap=20pytest=20at=20--limit-rows=2025?=
 =?UTF-8?q?0000=20and=20rename=20--run-slow=20=E2=86=92=20--runslow?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Per @domoritz on #784: flights_3m is the only resource above ~200K
rows, so capping at 250K keeps every other dataset fully validated
while cutting the Test job from ~4m43s to ~40s.

Renamed --run-slow → --runslow to match the dominant Python convention
(pytest docs example, ~1.3K GitHub hits vs ~500 for --run-slow).
Updated CONTRIBUTING.md to reflect that the slow tier now runs in CI
and added per-test docstrings naming each fast-tier failure mode.
---
 .github/workflows/test.yml |  2 +-
 CONTRIBUTING.md            | 14 +++++++++-----
 pyproject.toml             |  2 +-
 tests/conftest.py          | 12 ++++++------
 tests/test_datapackage.py  | 12 ++++++++++--
 5 files changed, 27 insertions(+), 15 deletions(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 43a2aaeb..cb888cc7 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -42,4 +42,4 @@ jobs:
       - run: npm run build
 
       - name: Validate datapackage
-        run: uv run pytest tests/ -v --run-slow
+        run: uv run pytest tests/ -v --runslow --limit-rows 250000
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index eea99a50..84dbf0e6 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -195,14 +195,18 @@ uv run --group dev pytest tests/
 
 # Slow tier — frictionless schema and row validation per resource.
 # Default is full read; flights-3m.parquet (~3M rows) takes minutes.
-uv run --group dev pytest tests/ --run-slow
+uv run --group dev pytest tests/ --runslow
 
-# Slow tier with a row cap — useful for quick iteration.
-uv run --group dev pytest tests/ --run-slow --limit-rows 100000
+# Slow tier with a row cap — matches what CI runs; lower for tighter iteration.
+uv run --group dev pytest tests/ --runslow --limit-rows 250000
 ```
 
-Not run in CI. The slow tier is the comprehensive validation step; the
-fast tier alone does not exercise frictionless schemas.
+CI runs both tiers via `--runslow --limit-rows 250000`: `flights_3m`
+(~3M rows) is validated only up to the row cap, while every other
+resource is below the cap and validates in full. The fast tier runs in
+the same pytest invocation; because `npm run build` regenerates
+`datapackage.json` from on-disk data first, its bytes/hash checks
+validate the freshly rebuilt descriptor rather than the committed one.
 
 Resources whose schema/row failures are known and non-actionable (for
 example, `movies` whose schema is intentionally aspirational, or
diff --git a/pyproject.toml b/pyproject.toml
index bcd657c2..aa5ca922 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -29,7 +29,7 @@ geo-species = [
 
 [tool.pytest.ini_options]
 markers = [
-  "slow: full schema/row validation via frictionless; opt in with --run-slow",
+  "slow: full schema/row validation via frictionless; opt in with --runslow",
 ]
 testpaths = ["tests"]
 
diff --git a/tests/conftest.py b/tests/conftest.py
index 8d2c5f8f..3b036b36 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -1,5 +1,5 @@
 """
-Pytest config: ``--run-slow`` and ``--limit-rows`` CLI options.
+Pytest config: ``--runslow`` and ``--limit-rows`` CLI options.
 
 The ``slow`` marker is registered in ``pyproject.toml``
 (``[tool.pytest.ini_options].markers``), matching the convention used in
@@ -16,7 +16,7 @@
 
 def pytest_addoption(parser: pytest.Parser) -> None:
     parser.addoption(
-        "--run-slow",
+        "--runslow",
         action="store_true",
         default=False,
         help="Run @pytest.mark.slow tests (frictionless schema/row validation).",
@@ -26,7 +26,7 @@ def pytest_addoption(parser: pytest.Parser) -> None:
         type=int,
         default=None,
         help=(
-            "Cap row reads in --run-slow tests at N rows per resource. "
+            "Cap row reads in --runslow tests at N rows per resource. "
             "Default is unlimited (full read). Use a small N for quick "
             "iteration; flights-3m takes minutes at full read."
         ),
@@ -41,10 +41,10 @@ def schema_limit_rows(request: pytest.FixtureRequest) -> int | None:
 def pytest_collection_modifyitems(
     config: pytest.Config, items: list[pytest.Item]
 ) -> None:
-    """Skip ``slow`` items unless ``--run-slow`` was passed."""
-    if config.getoption("--run-slow"):
+    """Skip ``slow`` items unless ``--runslow`` was passed."""
+    if config.getoption("--runslow"):
         return
-    skip_slow = pytest.mark.skip(reason="opt in with --run-slow")
+    skip_slow = pytest.mark.skip(reason="opt in with --runslow")
     for item in items:
         if "slow" in item.keywords:
             item.add_marker(skip_slow)
diff --git a/tests/test_datapackage.py b/tests/test_datapackage.py
index 214f21cf..5f5a1b10 100644
--- a/tests/test_datapackage.py
+++ b/tests/test_datapackage.py
@@ -9,9 +9,9 @@
   tabular JSON / arrow / parquet; hash-count supports only md5 and
   sha256, descriptor uses sha1).
 
-* Slow (``pytest --run-slow``) — frictionless schema and row validation
+* Slow (``pytest --runslow``) — frictionless schema and row validation
   per resource. Multi-minute on flights-3m at full read; opt in via the
-  ``--run-slow`` flag and pass ``--limit-rows N`` to cap row reads
+  ``--runslow`` flag and pass ``--limit-rows N`` to cap row reads
   during iteration. Default is full read.
 
 Resources whose schema/row check is known-broken upstream (``movies``
@@ -80,6 +80,7 @@ def git_blob_sha1(path: Path) -> str:
 
 @pytest.mark.parametrize("resource", _RESOURCES, ids=_RESOURCE_IDS)
 def test_file_exists(resource: dict) -> None:
+    """Catch descriptors that point at a missing or relocated data file."""
     assert "path" in resource, (
         f"descriptor regression: resource {resource.get('name')!r} has no 'path'"
     )
@@ -89,6 +90,7 @@ def test_file_exists(resource: dict) -> None:
 
 @pytest.mark.parametrize("resource", _RESOURCES, ids=_RESOURCE_IDS)
 def test_bytes_match(resource: dict) -> None:
+    """Catch on-disk edits where `bytes` in the descriptor wasn't regenerated."""
     assert "bytes" in resource, (
         f"descriptor regression: 'bytes' missing for {resource['name']!r}"
     )
@@ -102,6 +104,12 @@ def test_bytes_match(resource: dict) -> None:
 
 @pytest.mark.parametrize("resource", _RESOURCES, ids=_RESOURCE_IDS)
 def test_sha1_matches_git_blob(resource: dict) -> None:
+    """
+    Catch on-disk edits where `hash` in the descriptor wasn't regenerated.
+
+    Uses git's blob SHA-1 so the recorded hash matches `git ls-tree` —
+    catches edits that change content without changing file size.
+    """
     declared = resource.get("hash", "")
     assert declared, f"descriptor regression: 'hash' missing for {resource['name']!r}"
     assert declared.startswith("sha1:"), (