diff --git a/.github/doc-tags-allowed.txt b/.github/doc-tags-allowed.txt new file mode 100644 index 000000000..2f47fbeb3 --- /dev/null +++ b/.github/doc-tags-allowed.txt @@ -0,0 +1,316 @@ +# Allowed documentation frontmatter tags (canonical spellings). +# One tag per line. Lines starting with # are comments. +# Enforced by the check-doc-tags pre-commit/prek hook. +# Canonical form = the spelling most used in the repo today. +# 310 tags. Add new canonical tags here intentionally. + +actions +add-dataset-version +admin panel +administration +advanced +agent +ai +alerts +allowlist +alphafold +altair +amazon +analysis +antigravity +api +approval +artifact +audit logs +authentication +automation +aws +aws eks +aws help +aws troubleshooting +aws-batch +azure +azure batch +azure blob +azure help +azure troubleshooting +batch +billing +bioinformatics +blob storage +cache +ce +certificate +changelog +checklist +checkpoint +claude-code +cli +client +cloud +cloud costs +cloud-enterprise +cloud-pro +cloudformation +clusters +co-scientist +code-intelligence +codex +coding-agents +colabfold +commands +compatibility +compose +compute +compute environment +conda +configuration +configure +connect +connect-changelog +container +container-images +container-registry +content +cost +create-datasets +create-organization +create-workspace +credentials +credits +custom +dashboard +data +data explorer +data lineage +data studios +database +datasets +deployment +details +diagnostics +dns +docker +dragen +ec2 +ecr +ecs +eks +email +ena +enterprise +entra +entra-id +environment +environments +error-codes +errors +examples +execution +exit-codes +explorer +faq +file system +firewall +fuse +fusion +fusion-doctor +fusion-snapshots +gcp +gcp batch +gcs +gemini +geo +get started +getting-started +git +git configuration +git-repository +gitea +github +github-copilot +gitlab +gke +google +google cloud +governance +grid +grid engine +groundswell +guides +helm +help +https +iam +ibm +idp-delegation +igv +image +input +install +installation +integration +interactive +jupyter +k8s +keycloak +keys +kubernetes +labels +launch +launchpad +license +lid +life sciences +limitations +limits +lineage +lineage id +local +logging +login +logs +lsf +lsp +mail +manage-datasets +managed identities +mcp +mcp-server +meta-seqera.io +metrics +migration notes +minio +moab +modes +modules +molstar +monitoring +multiqc +networking +nextflow +nextflow help +nextflow troubleshooting +nf-canary +nf-core +nf-launcher +notifications +oauth +object +oidc +okta +on-prem +optimization +oracle object storage +organization-settings +organizations +output +overview +parameters +passwordless +pbs +pbs pro +permissions +pipeline +pipeline optimization +pipeline-runs +platform +policy +posix +prerequisites +privacy +processes +production +projects +proteinfold +provenance +quay +quotas +r-ide +rate limits +rbac +rds +redis +reference +registry +relaunch +reports +reproducibility +resource +resource labels +restore +resume +retry +reverse-proxy +ride +rnaseq +roles +rstudio +runs +s3 +s3-compatible +schema +scim +scripts +secrets +securestring +security +seqera +seqera cloud +seqera compute +seqera containers +seqera enterprise +seqera pipelines +seqera-ai +seqera-template +seqera-template-studio +seqerakit +sequencing +ses +session +settings +showcase tutorial +singularity +skills +slurm +snapshot +spec +spot +sra +ssh +ssl +sso +stats +status +storage +studio-custom +studio-git +studios +tasks +teams +template +testing +tls +troubleshooting +tutorial +unpublished +update +usage +usage-limits +use cases +use configuration +user-workspace +users +version-control +versioning +vm +vscode +wave +wave cli +webhooks +workflow +workspaces +workspaces help +workspaces troubleshooting +xpra +yaml diff --git a/.github/scripts/check-doc-tags.py b/.github/scripts/check-doc-tags.py new file mode 100755 index 000000000..9f3c0404d --- /dev/null +++ b/.github/scripts/check-doc-tags.py @@ -0,0 +1,136 @@ +#!/usr/bin/env python3 +"""Validate documentation frontmatter `tags:` against an allowlist. + +Used by the `check-doc-tags` pre-commit / prek hook. Reads the canonical +allowlist from `.github/doc-tags-allowed.txt` and fails if any changed +Markdown file declares a tag that is not on the list. + +When a disallowed tag matches a known variant (see +`.github/scripts/doc-tags-aliases.json`) or differs only by case/spacing +from an allowed tag, the error suggests the canonical form. + +Exit codes: + 0 all tags allowed + 1 one or more disallowed tags found + 2 configuration/usage error +""" +from __future__ import annotations + +import json +import re +import sys +from pathlib import Path + +REPO_ROOT = Path(__file__).resolve().parents[2] +ALLOWLIST = REPO_ROOT / ".github" / "doc-tags-allowed.txt" +ALIASES = REPO_ROOT / ".github" / "scripts" / "doc-tags-aliases.json" + + +def load_allowlist() -> set[str]: + if not ALLOWLIST.exists(): + sys.stderr.write(f"error: allowlist not found: {ALLOWLIST}\n") + sys.exit(2) + tags = set() + for line in ALLOWLIST.read_text(encoding="utf-8").splitlines(): + line = line.strip() + if line and not line.startswith("#"): + tags.add(line) + return tags + + +def load_aliases() -> dict[str, str]: + if ALIASES.exists(): + try: + return json.loads(ALIASES.read_text(encoding="utf-8")) + except json.JSONDecodeError: + return {} + return {} + + +def parse_frontmatter(text: str) -> str | None: + if not text.startswith("---"): + return None + end = text.find("\n---", 3) + return text[3:end] if end != -1 else None + + +def extract_tags(fm: str) -> list[str]: + m = re.search(r"^tags:\s*(.*)$", fm, re.M) + if not m: + return [] + inline = m.group(1).strip() + vals: list[str] = [] + if inline.startswith("["): + flow = inline + if not inline.endswith("]"): + idx = fm.find("]", m.end()) + flow = fm[m.start(1):idx + 1] if idx != -1 else inline + flow = flow.strip().lstrip("[").rstrip("]") + vals = [v.strip().strip("\"'") for v in flow.split(",") if v.strip()] + elif inline == "": + for ln in fm[m.end():].split("\n"): + sm = re.match(r"^\s*-\s+(.*)$", ln) + if sm: + vals.append(sm.group(1).strip().strip("\"'")) + elif ln.strip() == "": + continue + else: + break + else: + vals = [inline.strip("\"'")] + return [v for v in vals if v] + + +def normkey(t: str) -> str: + return re.sub(r"[-\s]+", " ", t.strip().lower()) + + +def suggest(tag: str, allowed: set[str], aliases: dict[str, str]) -> str | None: + if tag in aliases: + return aliases[tag] + key = normkey(tag) + norm_map = {normkey(a): a for a in allowed} + return norm_map.get(key) + + +def main(argv: list[str]) -> int: + files = [Path(a) for a in argv] + if not files: + return 0 + + allowed = load_allowlist() + aliases = load_aliases() + + failed = False + for path in files: + if path.suffix not in (".md", ".mdx"): + continue + try: + text = path.read_text(encoding="utf-8") + except OSError: + continue + fm = parse_frontmatter(text) + if not fm: + continue + for tag in extract_tags(fm): + if tag in allowed: + continue + hint = suggest(tag, allowed, aliases) + failed = True + msg = f"{path}: disallowed tag {tag!r}" + if hint: + msg += f" -> did you mean {hint!r}?" + else: + msg += " (not in .github/doc-tags-allowed.txt)" + print(msg) + + if failed: + print() + print("Fix the tags above, or add a new canonical tag to " + ".github/doc-tags-allowed.txt if it is genuinely new.") + return 1 + return 0 + + +if __name__ == "__main__": + raise SystemExit(main(sys.argv[1:])) diff --git a/.github/scripts/doc-tags-aliases.json b/.github/scripts/doc-tags-aliases.json new file mode 100644 index 000000000..aee63e066 --- /dev/null +++ b/.github/scripts/doc-tags-aliases.json @@ -0,0 +1,26 @@ +{ + "cloud-costs": "cloud costs", + "compute-environment": "compute environment", + "compute-environments": "compute environment", + "containers": "container", + "custom roles": "roles", + "data-explorer": "data explorer", + "dataset": "datasets", + "label": "labels", + "organization": "organizations", + "parameter": "parameters", + "pipeline runs": "pipeline-runs", + "pipeline-optimization": "pipeline optimization", + "pipelines": "pipeline", + "resources": "resource", + "role": "roles", + "run": "runs", + "seqera ai": "seqera-ai", + "seqera-pipelines": "seqera pipelines", + "studio": "studios", + "tower": "platform", + "user": "users", + "user-roles": "roles", + "workflows": "workflow", + "workspace": "workspaces" +} diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 7a95f0754..4747700ce 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -30,6 +30,22 @@ repos: verbose: true always_run: true +# Validate doc frontmatter tags against the canonical allowlist +- repo: local + hooks: + - id: check-doc-tags + name: Doc frontmatter tags allowlist + language: python + entry: python3 .github/scripts/check-doc-tags.py + files: \.(md|mdx)$ + exclude: | + (?x)^( + node_modules/| + build/| + dist/| + \.git/ + ) + # Standard pre-commit hooks - repo: https://github.com/pre-commit/pre-commit-hooks rev: v6.0.0