vega · dsmedia · Apr 30, 2026 · Apr 19, 2026 · Apr 19, 2026 · Apr 19, 2026
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
@@ -183,6 +183,29 @@ uv run taplo fmt
 uv run ruff format
 ```
 
+### Validating `datapackage.json` (optional, local)
+
+After editing data files or the descriptor, you can validate the data
+package end-to-end (file integrity, schemas, and row content) with:
+
+```bash
+# Fast: first 100k rows per resource (good for quick sanity checks)
+uv run scripts/validate_datapackage.py --limit-rows 100000
+
+# Comprehensive: full read, ~5 min (flights-3m.parquet is ~3M rows)
+uv run scripts/validate_datapackage.py
+```
+
+Exits 0 when no unexpected failures occur, 1 otherwise. Not run in CI.
+
+Resources whose schema/row failures are known and non-actionable (for
+example, `movies`, whose schema is intentionally aspirational, or
+`flights-200k.arrow`, which frictionless can't parse) are listed in
+[`_data/validate_datapackage.toml`](_data/validate_datapackage.toml). The
+validator reports them with a warning marker (⚠) but does not fail. Remove
+an entry from that file once the underlying situation is resolved; removing
+it re-enables strict checking, which surfaces the fix in a PR.
+
 ## Contributing Process
 
 1. Create a branch:

diff --git a/_data/validate_datapackage.toml b/_data/validate_datapackage.toml
@@ -0,0 +1,38 @@
+# Configuration for scripts/validate_datapackage.py
+#
+# Lists resources whose Phase 2 (frictionless schema/row) failure is known
+# and non-actionable from this repository's perspective. Matching failures
+# are reported but do not cause a non-zero exit code, so the validator can
+# still be used to detect *new* failures.
+#
+# Each entry must include a `resource` name (matching Resource.name in
+# datapackage.json) and a `reason`. The first line of `reason` is used as
+# the inline summary in the validator's output, so keep it self-contained
+# (full explanation goes underneath). Remove an entry once the underlying
+# situation changes — that removal re-enables strict checking and surfaces
+# the fix in a PR.
+
+[[expected_failures]]
+resource = "movies"
+reason   = """
+intentional pedagogy — schema is aspirational, data has documented quirks.
+
+The descriptor's movies.json description explicitly frames the data as a
+teaching resource for data cleaning: numeric-looking titles (1776, 2012, 300)
+are stored as JSON numbers rather than strings, and release dates use
+MMM DD YYYY instead of ISO 8601. Widening the schema would defeat the
+teaching purpose and alter what downstream consumers (altair's datasets
+library, vega-lite galleries) see.
+"""
+
+[[expected_failures]]
+resource = "flights_200k_arrow"
+reason   = """
+no frictionless arrow parser (infrastructure gap, not a data issue).
+
+The shipped frictionless package has no `formats/arrow/` directory, so it
+emits `format-error: format ".arrow" is not supported`. The file itself is
+well-formed: pyarrow reads the schema and record batches, and altair
+consumes it without issue. Remove this entry once frictionless ships arrow
+support or a plugin is wired up.
+"""
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,8 +1,9 @@
 #:schema https://json.schemastore.org/pyproject.json
 [project]
 dependencies = [
-  "frictionless[json,parquet]>=5.18.0",
+  "frictionless[json,parquet]>=5.18.1",
   "niquests>=3.11.2",
+  "pandas>=2.2.3",
   "polars>=1.17.1",
   "tomli-w>=1.1.0",
 ]