29 changes: 29 additions & 0 deletions CONTRIBUTING.md
@@ -183,6 +183,35 @@ uv run taplo fmt
uv run ruff format
```

### Validating `datapackage.json` (optional, local)

After editing data files or the descriptor, you can validate the data
package end-to-end with pytest. The tests come in two tiers:

```bash
# Fast tier — file existence, declared bytes, git-blob SHA-1.
# Stdlib only, sub-second across all resources.
uv run --group dev pytest tests/

# Slow tier — frictionless schema and row validation per resource.
# Default is full read; flights-3m.parquet (~3M rows) takes minutes.
uv run --group dev pytest tests/ --run-slow

# Slow tier with a row cap — useful for quick iteration.
uv run --group dev pytest tests/ --run-slow --limit-rows 100000
```

These tests are not run in CI. The slow tier is the comprehensive
validation step; the fast tier alone does not exercise the frictionless
schemas.

Resources whose schema/row failures are known and non-actionable (for
example, `movies` whose schema is intentionally aspirational, or
`flights_200k_arrow` which frictionless can't parse) are listed in
[`_data/validate_datapackage.toml`](_data/validate_datapackage.toml).
The slow-tier test for each is marked `xfail(strict=True)`, so it does
not fail the run today — but if the upstream issue ever resolves, the
test flips XFAIL → XPASS and the run fails, prompting allowlist removal.
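
To re-check a single allowlisted resource after a suspected upstream
fix, you can target its slow-tier test node directly (a sketch;
substitute whichever resource you are checking for `movies`):

```bash
# Quote the node ID so the shell does not glob the brackets.
uv run --group dev pytest "tests/test_datapackage.py::test_schema_and_rows[movies]" --run-slow
```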

## Contributing Process

1. Create a branch:
37 changes: 37 additions & 0 deletions _data/validate_datapackage.toml
@@ -0,0 +1,37 @@
# Allowlist consumed by tests/test_datapackage.py.
#
# Resources whose frictionless schema/row failure is known and non-actionable
# from this repository's perspective. Each matching test_schema_and_rows[X]
# is decorated with pytest.mark.xfail(reason=..., strict=True), so the test
# does not fail the run today — but if the upstream issue ever resolves, the
# test flips XFAIL → XPASS and the run fails, prompting allowlist removal.
#
# Each entry must include a `resource` name (matching Resource.name in
# datapackage.json) and a `reason`. The first line of `reason` is the
# summary used as the xfail reason; keep it self-contained (full explanation
# goes underneath).

[[expected_failures]]
resource = "movies"
reason = """
intentional pedagogy — schema is aspirational, data has documented quirks.

The descriptor's movies.json description explicitly frames the data as a
teaching resource for data cleaning: numeric-looking titles (1776, 2012, 300)
are stored as JSON numbers rather than strings, and release dates use
MMM DD YYYY instead of ISO 8601. Widening the schema would defeat the
teaching purpose and alter what downstream consumers (altair's datasets
library, vega-lite galleries) see.
"""

[[expected_failures]]
resource = "flights_200k_arrow"
reason = """
no frictionless arrow parser (infrastructure gap, not a data issue).

The shipped frictionless package has no `formats/arrow/` directory, so it
emits `format-error: format ".arrow" is not supported`. The file itself is
well-formed: pyarrow reads the schema and record batches, and altair
consumes it without issue. Remove this entry once frictionless ships arrow
support or a plugin is wired up.
"""
11 changes: 9 additions & 2 deletions pyproject.toml
@@ -1,8 +1,9 @@
#:schema https://json.schemastore.org/pyproject.json
[project]
dependencies = [
"frictionless[json,parquet]>=5.18.0",
"frictionless[json,parquet]>=5.18.1",
"niquests>=3.11.2",
"pandas>=2.2.3",
"polars>=1.17.1",
"tomli-w>=1.1.0",
]
@@ -13,7 +14,7 @@ requires-python = ">=3.12"
version = "2.11.0"

[dependency-groups]
dev = ["ipython[kernel]>=8.30.0", "ruff>=0.14.2", "taplo>=0.9.3"]
dev = ["ipython[kernel]>=8.30.0", "pytest>=9", "ruff>=0.14.2", "taplo>=0.9.3"]
geo-species = [
"exactextract>=0.2.1",
"geopandas",
@@ -26,6 +27,12 @@ geo-species = [
"tqdm",
]

[tool.pytest.ini_options]
markers = [
"slow: full schema/row validation via frictionless; opt in with --run-slow",
]
testpaths = ["tests"]

[tool.ruff]
extend-exclude = [
".venv",
50 changes: 50 additions & 0 deletions tests/conftest.py
@@ -0,0 +1,50 @@
"""
Pytest config: ``--run-slow`` and ``--limit-rows`` CLI options.

The ``slow`` marker is registered in ``pyproject.toml``
(``[tool.pytest.ini_options].markers``), matching the convention used in
vega/altair. The expected-failures allowlist is read at parametrize time
inside ``test_datapackage.py`` (so xfail marks attach by resource-dict
lookup, not by ID-string parsing of pytest's test names) — that logic
does not live here either.
"""

from __future__ import annotations

import pytest


def pytest_addoption(parser: pytest.Parser) -> None:
parser.addoption(
"--run-slow",
action="store_true",
default=False,
help="Run @pytest.mark.slow tests (frictionless schema/row validation).",
)
parser.addoption(
"--limit-rows",
type=int,
default=None,
help=(
"Cap row reads in --run-slow tests at N rows per resource. "
"Default is unlimited (full read). Use a small N for quick "
"iteration; flights-3m takes minutes at full read."
),
)


@pytest.fixture(scope="session")
def schema_limit_rows(request: pytest.FixtureRequest) -> int | None:
return request.config.getoption("--limit-rows")


def pytest_collection_modifyitems(
config: pytest.Config, items: list[pytest.Item]
) -> None:
"""Skip ``slow`` items unless ``--run-slow`` was passed."""
if config.getoption("--run-slow"):
return
skip_slow = pytest.mark.skip(reason="opt in with --run-slow")
for item in items:
if "slow" in item.keywords:
item.add_marker(skip_slow)
157 changes: 157 additions & 0 deletions tests/test_datapackage.py
@@ -0,0 +1,157 @@
"""
Validate every resource in datapackage.json against its on-disk file.

Two tiers:

* Default — stdlib-only file existence, byte size, and git-blob SHA-1
against the descriptor. Sub-second across all 70+ resources. Covers
what frictionless-py doesn't today (byte-count returns ``None`` for
tabular JSON / arrow / parquet; hash-count supports only md5 and
sha256, while the descriptor uses sha1).

* Slow (``pytest --run-slow``) — frictionless schema and row validation
per resource. Multi-minute on flights-3m at full read; opt in via the
``--run-slow`` flag and pass ``--limit-rows N`` to cap row reads
during iteration. Default is full read.

Resources whose schema/row check is known-broken upstream (``movies``:
documented pedagogy; ``flights_200k_arrow``: no upstream parser) are
listed in ``_data/validate_datapackage.toml`` and marked
``xfail(strict=True)`` at parametrize time. Removing an entry
re-enables strict checking; if the upstream issue resolves, the run
flips XFAIL → XPASS and fails, prompting allowlist removal.
"""

from __future__ import annotations

import hashlib
import json
import tomllib
from copy import deepcopy
from pathlib import Path
from typing import Any

import pytest
from frictionless import Checklist, Package

REPO = Path(__file__).resolve().parent.parent
DATA = REPO / "data"
DESCRIPTOR_PATH = REPO / "datapackage.json"
ALLOWLIST_PATH = REPO / "_data" / "validate_datapackage.toml"


def _load_resources() -> list[dict]:
return json.loads(DESCRIPTOR_PATH.read_text(encoding="utf-8"))["resources"]


def _load_xfail_reasons() -> dict[str, str]:
"""Read the allowlist; xfail reason is the first non-empty line of `reason`."""
if not ALLOWLIST_PATH.exists():
return {}
cfg = tomllib.loads(ALLOWLIST_PATH.read_text(encoding="utf-8"))
return {
entry["resource"]: entry["reason"].strip().splitlines()[0]
for entry in cfg.get("expected_failures", [])
}


_RESOURCES = _load_resources()
_RESOURCE_IDS = [r["name"] for r in _RESOURCES]
_XFAIL = _load_xfail_reasons()

# Sanity-check the allowlist against the live descriptor at import time.
# A stale entry in validate_datapackage.toml is a silent maintenance hazard
# otherwise — the xfail mark would never attach and a real regression
# could slip past.
_unknown_xfail = set(_XFAIL) - set(_RESOURCE_IDS)
if _unknown_xfail:
msg = (
f"_data/validate_datapackage.toml lists resources not in datapackage.json: "
f"{sorted(_unknown_xfail)}"
)
raise RuntimeError(msg)


def git_blob_sha1(path: Path) -> str:
r"""Compute git's blob SHA-1: ``sha1(b"blob {len}\0" + content)``."""
content = path.read_bytes()
return hashlib.sha1(b"blob %d\0%b" % (len(content), content)).hexdigest()


@pytest.mark.parametrize("resource", _RESOURCES, ids=_RESOURCE_IDS)
def test_file_exists(resource: dict) -> None:
assert "path" in resource, (
f"descriptor regression: resource {resource.get('name')!r} has no 'path'"
)
path = DATA / resource["path"]
assert path.exists(), f"missing data file: {resource['path']}"


@pytest.mark.parametrize("resource", _RESOURCES, ids=_RESOURCE_IDS)
def test_bytes_match(resource: dict) -> None:
assert "bytes" in resource, (
f"descriptor regression: 'bytes' missing for {resource['name']!r}"
)
path = DATA / resource["path"]
if not path.exists():
pytest.skip(f"file missing — see test_file_exists[{resource['name']}]")
declared = resource["bytes"]
actual = path.stat().st_size
assert declared == actual, f"declared={declared} disk={actual}"


@pytest.mark.parametrize("resource", _RESOURCES, ids=_RESOURCE_IDS)
def test_sha1_matches_git_blob(resource: dict) -> None:
declared = resource.get("hash", "")
assert declared, f"descriptor regression: 'hash' missing for {resource['name']!r}"
assert declared.startswith("sha1:"), (
f"descriptor regression: hash format not sha1 for {resource['name']!r}: "
f"{declared!r}"
)
path = DATA / resource["path"]
if not path.exists():
pytest.skip(f"file missing — see test_file_exists[{resource['name']}]")
expected = declared.removeprefix("sha1:")
actual = git_blob_sha1(path)
assert expected == actual, f"declared={expected[:10]}... disk={actual[:10]}..."


def _slow_param(resource: dict) -> Any: # pytest.ParameterSet; not in public API
"""Build the parametrize entry for the slow tier; attach xfail strict if allowlisted."""
name = resource["name"]
marks = []
if name in _XFAIL:
marks = [pytest.mark.xfail(reason=_XFAIL[name], strict=True)]
return pytest.param(resource, id=name, marks=marks)


@pytest.mark.slow
@pytest.mark.parametrize("resource", [_slow_param(r) for r in _RESOURCES])
def test_schema_and_rows(resource: dict, schema_limit_rows: int | None) -> None:
"""Validate column types and row content via frictionless."""
# parallel=False is load-bearing: frictionless's parallel code path silently
# ignores Checklist.skip_errors, which would re-surface byte-count and
# hash-count errors that phase 1 already covers more completely. Don't
# flip without verifying upstream.
checklist = Checklist(skip_errors=["byte-count", "hash-count"])
# basepath workaround: descriptor paths are bare filenames under data/ (see #758).
package = Package({"resources": [deepcopy(resource)]}, basepath=str(DATA))
report = package.validate(
checklist=checklist, limit_rows=schema_limit_rows, parallel=False
)
if report.valid:
return

# Failure rendering — guarded against empty tasks (frictionless can return
# package-level errors without per-task entries).
lines: list[str] = []
if report.tasks:
task_errors = report.tasks[0].errors
for err in task_errors[:5]:
field = getattr(err, "field_name", None)
lines.append(f"{err.type} field={field!r}: {err.note}")
if len(task_errors) > 5:
lines.append(f" (+{len(task_errors) - 5} more)")
for err in list(getattr(report, "errors", []) or [])[:5]:
lines.append(f"package-level {err.type}: {err.note}")
pytest.fail("\n".join(lines) or f"validation failed (no error details): {report!r}")