29 changes: 29 additions & 0 deletions CONTRIBUTING.md
@@ -183,6 +183,35 @@ uv run taplo fmt
uv run ruff format
```

### Validating `datapackage.json` (optional, local)

After editing data files or the descriptor, you can validate the data
package end-to-end with pytest. The tests come in two tiers:

```bash
# Fast tier — file existence, declared bytes, git-blob SHA-1.
# Stdlib only, sub-second across all resources.
uv run --group dev pytest tests/

# Slow tier — frictionless schema and row validation per resource.
# Default is full read; flights-3m.parquet (~3M rows) takes minutes.
uv run --group dev pytest tests/ --run-slow

# Slow tier with a row cap — useful for quick iteration.
uv run --group dev pytest tests/ --run-slow --limit-rows 100000
```

These tests are not run in CI. The slow tier is the comprehensive
validation step; the fast tier alone does not exercise the frictionless
schemas.

Resources whose schema/row failures are known and non-actionable (for
example, `movies` whose schema is intentionally aspirational, or
`flights_200k_arrow` which frictionless can't parse) are listed in
[`_data/validate_datapackage.toml`](_data/validate_datapackage.toml).
The slow-tier test for each is marked `xfail(strict=True)`, so it does
not fail the run today — but if the upstream issue ever resolves, the
test flips XFAIL → XPASS and the run fails, prompting allowlist removal.
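
To re-check a single allowlisted resource after a suspected upstream
fix, you can target its slow-tier test node directly (a sketch;
substitute whichever resource you are checking for `movies`):

```bash
# Quote the node ID so the shell does not glob the brackets.
uv run --group dev pytest "tests/test_datapackage.py::test_schema_and_rows[movies]" --run-slow
```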

## Contributing Process

1. Create a branch:
37 changes: 37 additions & 0 deletions _data/validate_datapackage.toml
@@ -0,0 +1,37 @@
# Allowlist consumed by tests/test_datapackage.py.
#
# Resources whose frictionless schema/row failure is known and non-actionable
# from this repository's perspective. Each matching test_schema_and_rows[X]
# is decorated with pytest.mark.xfail(reason=..., strict=True), so the test
# does not fail the run today — but if the upstream issue ever resolves, the
# test flips XFAIL → XPASS and the run fails, prompting allowlist removal.
#
# Each entry must include a `resource` name (matching Resource.name in
# datapackage.json) and a `reason`. The first line of `reason` is the
# summary used as the xfail reason; keep it self-contained (full explanation
# goes underneath).

[[expected_failures]]
resource = "movies"
reason = """
intentional pedagogy — schema is aspirational, data has documented quirks.

The descriptor's movies.json description explicitly frames the data as a
teaching resource for data cleaning: numeric-looking titles (1776, 2012, 300)
are stored as JSON numbers rather than strings, and release dates use
MMM DD YYYY instead of ISO 8601. Widening the schema would defeat the
teaching purpose and alter what downstream consumers (altair's datasets
library, vega-lite galleries) see.
"""

[[expected_failures]]
resource = "flights_200k_arrow"
reason = """
no frictionless arrow parser (infrastructure gap, not a data issue).

The shipped frictionless package has no `formats/arrow/` directory, so it
emits `format-error: format ".arrow" is not supported`. The file itself is
well-formed: pyarrow reads the schema and record batches, and altair
consumes it without issue. Remove this entry once frictionless ships arrow
support or a plugin is wired up.
"""
11 changes: 9 additions & 2 deletions pyproject.toml
@@ -1,8 +1,9 @@
#:schema https://json.schemastore.org/pyproject.json
[project]
dependencies = [
"frictionless[json,parquet]>=5.18.0",
"frictionless[json,parquet]>=5.18.1",
"niquests>=3.11.2",
"pandas>=2.2.3",
"polars>=1.17.1",
"tomli-w>=1.1.0",
]
@@ -13,7 +14,7 @@ requires-python = ">=3.12"
version = "2.11.0"

[dependency-groups]
dev = ["ipython[kernel]>=8.30.0", "ruff>=0.14.2", "taplo>=0.9.3"]
dev = ["ipython[kernel]>=8.30.0", "pytest>=9", "ruff>=0.14.2", "taplo>=0.9.3"]
geo-species = [
"exactextract>=0.2.1",
"geopandas",
@@ -26,6 +27,12 @@ geo-species = [
"tqdm",
]

[tool.pytest.ini_options]
markers = [
"slow: full schema/row validation via frictionless; opt in with --run-slow",
]
testpaths = ["tests"]

[tool.ruff]
extend-exclude = [
".venv",
50 changes: 50 additions & 0 deletions tests/conftest.py
@@ -0,0 +1,50 @@
"""
Pytest config: ``--run-slow`` and ``--limit-rows`` CLI options.

The ``slow`` marker is registered in ``pyproject.toml``
(``[tool.pytest.ini_options].markers``), matching the convention used in
vega/altair. The expected-failures allowlist is read at parametrize time
inside ``test_datapackage.py`` (so xfail marks attach by resource-dict
lookup, not by ID-string parsing of pytest's test names) — that logic
does not live here either.
"""

from __future__ import annotations

import pytest


def pytest_addoption(parser: pytest.Parser) -> None:
parser.addoption(
"--run-slow",
action="store_true",
default=False,
help="Run @pytest.mark.slow tests (frictionless schema/row validation).",
)
parser.addoption(
"--limit-rows",
type=int,
default=None,
help=(
"Cap row reads in --run-slow tests at N rows per resource. "
"Default is unlimited (full read). Use a small N for quick "
"iteration; flights-3m takes minutes at full read."
),
)


@pytest.fixture(scope="session")
def schema_limit_rows(request: pytest.FixtureRequest) -> int | None:
return request.config.getoption("--limit-rows")


def pytest_collection_modifyitems(
config: pytest.Config, items: list[pytest.Item]
) -> None:
"""Skip ``slow`` items unless ``--run-slow`` was passed."""
if config.getoption("--run-slow"):
return
skip_slow = pytest.mark.skip(reason="opt in with --run-slow")
for item in items:
if "slow" in item.keywords:
item.add_marker(skip_slow)
157 changes: 157 additions & 0 deletions tests/test_datapackage.py
@@ -0,0 +1,157 @@
"""
Validate every resource in datapackage.json against its on-disk file.

Two tiers:

* Default — stdlib-only file existence, byte size, and git-blob SHA-1
against the descriptor. Sub-second across all 70+ resources. Covers
what frictionless-py doesn't today (byte-count returns ``None`` for
tabular JSON / arrow / parquet; hash-count supports only md5 and
sha256, while the descriptor uses sha1).

* Slow (``pytest --run-slow``) — frictionless schema and row validation
per resource. Multi-minute on flights-3m at full read; opt in via the
``--run-slow`` flag and pass ``--limit-rows N`` to cap row reads
during iteration. Default is full read.

Resources whose schema/row check is known-broken upstream (``movies``:
documented pedagogy; ``flights_200k_arrow``: no upstream parser) are
listed in ``_data/validate_datapackage.toml`` and marked
``xfail(strict=True)`` at parametrize time. Removing an entry
re-enables strict checking; if the upstream issue resolves, the run
flips XFAIL → XPASS and fails, prompting allowlist removal.
"""

from __future__ import annotations

import hashlib
import json
import tomllib
from copy import deepcopy
from pathlib import Path
from typing import Any

import pytest
from frictionless import Checklist, Package

REPO = Path(__file__).resolve().parent.parent
DATA = REPO / "data"
DESCRIPTOR_PATH = REPO / "datapackage.json"
ALLOWLIST_PATH = REPO / "_data" / "validate_datapackage.toml"


def _load_resources() -> list[dict]:
return json.loads(DESCRIPTOR_PATH.read_text(encoding="utf-8"))["resources"]


def _load_xfail_reasons() -> dict[str, str]:
"""Read the allowlist; xfail reason is the first non-empty line of `reason`."""
if not ALLOWLIST_PATH.exists():
return {}
cfg = tomllib.loads(ALLOWLIST_PATH.read_text(encoding="utf-8"))
return {
entry["resource"]: entry["reason"].strip().splitlines()[0]
for entry in cfg.get("expected_failures", [])
}


_RESOURCES = _load_resources()
_RESOURCE_IDS = [r["name"] for r in _RESOURCES]
_XFAIL = _load_xfail_reasons()

# Sanity-check the allowlist against the live descriptor at import time.
# A stale entry in validate_datapackage.toml is a silent maintenance hazard
# otherwise — the xfail mark would never attach and a real regression
# could slip past.
_unknown_xfail = set(_XFAIL) - set(_RESOURCE_IDS)
if _unknown_xfail:
msg = (
f"_data/validate_datapackage.toml lists resources not in datapackage.json: "
f"{sorted(_unknown_xfail)}"
)
raise RuntimeError(msg)


def git_blob_sha1(path: Path) -> str:
r"""Compute git's blob SHA-1: ``sha1(b"blob {len}\0" + content)``."""
content = path.read_bytes()
return hashlib.sha1(b"blob %d\0%b" % (len(content), content)).hexdigest()


@pytest.mark.parametrize("resource", _RESOURCES, ids=_RESOURCE_IDS)
def test_file_exists(resource: dict) -> None:
assert "path" in resource, (
f"descriptor regression: resource {resource.get('name')!r} has no 'path'"
)
path = DATA / resource["path"]
assert path.exists(), f"missing data file: {resource['path']}"


@pytest.mark.parametrize("resource", _RESOURCES, ids=_RESOURCE_IDS)
def test_bytes_match(resource: dict) -> None:
assert "bytes" in resource, (
f"descriptor regression: 'bytes' missing for {resource['name']!r}"
)
path = DATA / resource["path"]
if not path.exists():
pytest.skip(f"file missing — see test_file_exists[{resource['name']}]")
declared = resource["bytes"]
actual = path.stat().st_size
assert declared == actual, f"declared={declared} disk={actual}"


@pytest.mark.parametrize("resource", _RESOURCES, ids=_RESOURCE_IDS)
def test_sha1_matches_git_blob(resource: dict) -> None:
declared = resource.get("hash", "")
assert declared, f"descriptor regression: 'hash' missing for {resource['name']!r}"
assert declared.startswith("sha1:"), (
f"descriptor regression: hash format not sha1 for {resource['name']!r}: "
f"{declared!r}"
)
path = DATA / resource["path"]
if not path.exists():
pytest.skip(f"file missing — see test_file_exists[{resource['name']}]")
expected = declared.removeprefix("sha1:")
actual = git_blob_sha1(path)
assert expected == actual, f"declared={expected[:10]}... disk={actual[:10]}..."


def _slow_param(resource: dict) -> Any: # pytest.ParameterSet; not in public API
"""Build the parametrize entry for the slow tier; attach xfail strict if allowlisted."""
name = resource["name"]
marks = []
if name in _XFAIL:
marks = [pytest.mark.xfail(reason=_XFAIL[name], strict=True)]
return pytest.param(resource, id=name, marks=marks)


@pytest.mark.slow
@pytest.mark.parametrize("resource", [_slow_param(r) for r in _RESOURCES])
def test_schema_and_rows(resource: dict, schema_limit_rows: int | None) -> None:
"""Validate column types and row content via frictionless."""
# parallel=False is load-bearing: frictionless's parallel code path silently
# ignores Checklist.skip_errors, which would re-surface byte-count and
# hash-count errors that phase 1 already covers more completely. Don't
# flip without verifying upstream.
checklist = Checklist(skip_errors=["byte-count", "hash-count"])
# basepath workaround: descriptor paths are bare filenames under data/ (see #758).
package = Package({"resources": [deepcopy(resource)]}, basepath=str(DATA))
report = package.validate(
checklist=checklist, limit_rows=schema_limit_rows, parallel=False
)
if report.valid:
return

# Failure rendering — guarded against empty tasks (frictionless can return
# package-level errors without per-task entries).
lines: list[str] = []
if report.tasks:
task_errors = report.tasks[0].errors
for err in task_errors[:5]:
field = getattr(err, "field_name", None)
lines.append(f"{err.type} field={field!r}: {err.note}")
if len(task_errors) > 5:
lines.append(f" (+{len(task_errors) - 5} more)")
for err in list(getattr(report, "errors", []) or [])[:5]:
lines.append(f"package-level {err.type}: {err.note}")
pytest.fail("\n".join(lines) or f"validation failed (no error details): {report!r}")