36 changes: 36 additions & 0 deletions .claude/commands/eval-guide.md
@@ -0,0 +1,36 @@
---
description: 'Test a framework integration guide by having an AI agent follow it from zero to working Neon connection'
---

Run the docs guide eval harness against one or more guides. The harness creates an ephemeral Neon database, spins up a Docker container, and lets an AI agent follow the guide autonomously. A separate evaluator scores the result.

**Prerequisites:** Docker Desktop must be running. The harness must be set up (`cd evals/docs-guides && npm install` and `.env` configured). See `evals/docs-guides/README.md`.

**Arguments:** Pass the guide name(s) and optionally a local path. Examples:
- `/eval-guide express` — test the published Express guide
- `/eval-guide rust --local ./content/docs/guides/` — test a local draft
- `/eval-guide express,prisma,django` — test multiple guides

**Steps:**

1. Parse the guide name(s) from the user's input. If `--local` is specified, use that path. Otherwise default to `--local ./content/docs/guides/` if the file exists there, falling back to the published URL.

2. Run the harness:
```bash
cd evals/docs-guides && npm run eval -- --guide {names} {--local path if applicable} --timeout 15
```

3. Wait for the harness to complete. It will take 1-5 minutes per guide.

4. Read `evals/docs-guides/results/latest.json` for the scores.

5. Summarize the results for the user:
- Overall score (0-10) and what it means
- Which deterministic checks passed/failed
- Key findings from the evaluator's reasoning
- Any specific documentation improvements suggested
- Whether infrastructure issues (network, DNS, proxy) affected the score

6. If the score is below 8, offer to read the full transcript at `evals/docs-guides/results/history/{latest}/` to diagnose specific issues.

7. If the user asks about a specific failure, read the relevant transcript or file snapshots to provide detailed analysis.
30 changes: 30 additions & 0 deletions evals/docs-guides/.env.example
@@ -0,0 +1,30 @@
# === API credentials (choose one) ===

# Option 1: OpenAI-compatible endpoint with custom base URL
# OPENAI_BASE_URL=https://your-endpoint.example.com/serving-endpoints
# OPENAI_API_KEY=your-token

# Option 2: Direct OpenAI
# OPENAI_API_KEY=sk-...

# Option 3: Databricks Model Serving (see internal setup guide)
# DATABRICKS_HOST=https://your-workspace.example.com
# DATABRICKS_TOKEN=your-token

# Model names (each defaults to claude-sonnet-4-6 if not set)
# EVAL_WORKER_MODEL=claude-sonnet-4-6
# EVAL_JUDGE_MODEL=claude-sonnet-4-6

# Throttle delay between API calls in ms (default: 2000).
# Increase if you hit rate limits frequently.
# EVAL_THROTTLE_MS=2000

# === Registry proxies (optional) ===
# Set these if your network blocks direct access to public registries.
# If not set, package managers use their default public registries.

# NPM_REGISTRY=https://your-npm-proxy/
# PIP_INDEX_URL=https://your-pypi-proxy/simple
# GOPROXY=https://your-go-proxy
# MAVEN_MIRROR_URL=https://your-maven-proxy
# CARGO_REGISTRY_URL=https://your-crates-proxy/
5 changes: 5 additions & 0 deletions evals/docs-guides/.gitignore
@@ -0,0 +1,5 @@
node_modules/
dist/
results/
.env
docker/build-local.sh
85 changes: 85 additions & 0 deletions evals/docs-guides/README.md
@@ -0,0 +1,85 @@
# Docs Guide Eval Harness

Tests whether Neon framework integration guides are effective enough for an AI agent to follow from zero to a working Neon connection. The agent reads the guide, sets up the project in an isolated Docker container with an ephemeral Neon database, and a separate evaluator scores the result.

The harness scores the documentation, not the agent. A low score means the guide needs improvement.

## Setup

1. **Install Docker Desktop** and make sure it's running.

2. **Install dependencies:**
```bash
cd evals/docs-guides
npm install
```

3. **Configure credentials.** Copy the example and add your API key:
```bash
cp .env.example .env
```
Edit `evals/docs-guides/.env` with your credentials. You need one of:
- `OPENAI_API_KEY` for direct OpenAI access
- `OPENAI_BASE_URL` + `OPENAI_API_KEY` for any OpenAI-compatible endpoint
- `DATABRICKS_HOST` + `DATABRICKS_TOKEN` for Databricks Model Serving

If your network blocks public package registries, add the proxy URLs for npm, pip, Go, Maven, and Cargo. See `.env.example` for all options.

4. **Build the Docker image.** This happens automatically on the first run and takes a few minutes; no manual step is needed.

## Usage with Claude Code

The easiest way to use this harness is via the `/eval-guide` slash command in a Claude Code session:

```
/eval-guide express
/eval-guide rust --local ./content/docs/guides/
/eval-guide express,prisma,django
```

Claude runs the harness, reads the results, and summarizes the findings conversationally. Use the output as context for future doc edits. If the evaluator identifies specific issues (unclear driver selection, missing error handling, scope creep), those are direct improvement signals for the guide.

## Manual CLI usage

Test a published guide:
```bash
npm run eval -- --guide express
```

Test a local draft before publishing:
```bash
npm run eval -- --guide express --local ../../content/docs/guides/
```

Test multiple guides:
```bash
npm run eval -- --guide express,prisma,django
```

Run all guides registered in `config/guides.yaml` (intended for scheduled/CI runs):
```bash
npm run eval
```

## How it works

1. Creates an ephemeral Neon Postgres database (via neon.new, no account needed)
2. Starts a Docker container with Node.js, Python, and apt-get access for other runtimes
3. Gives the AI agent the guide content and a task prompt
4. The agent installs packages, writes code, and verifies the connection
5. Deterministic checks confirm: connection works, .env exists, no hardcoded credentials (sketched below)
6. An LLM evaluator scores the session against a rubric (0-10)
7. Saves transcript, file snapshots, and scores to `results/`
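
A minimal sketch of what the step-5 checks amount to. It assumes the ephemeral `DATABASE_URL` from step 1 and the `pg` driver; the real checks live in the harness source, so treat this as illustrative only:

```typescript
// Illustrative only: the shape of the step-5 checks, not the harness code.
import { readFile } from 'node:fs/promises';
import { Client } from 'pg';

// Check 1: the connection string actually reaches the ephemeral database.
export async function connectionWorks(databaseUrl: string): Promise<boolean> {
  const client = new Client({ connectionString: databaseUrl });
  try {
    await client.connect();
    const { rows } = await client.query('SELECT 1 AS ok');
    return rows[0].ok === 1;
  } catch {
    return false;
  } finally {
    await client.end().catch(() => {});
  }
}

// Check 2: no source file inlines the credentials verbatim.
export async function noHardcodedCredentials(
  files: string[],
  databaseUrl: string
): Promise<boolean> {
  for (const file of files) {
    const source = await readFile(file, 'utf8');
    if (source.includes(databaseUrl)) return false;
  }
  return true;
}
```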

## Output

Results go to `results/history/{timestamp}/` with:
- `summary.json` — scores and deterministic check results
- `{guide}/transcript.txt` — readable conversation log
- `{guide}/files/` — every file the agent created

## Configuration

- **`config/guides.yaml`** — Registry of guides available for eval. Running `npm run eval` with no `--guide` flag runs all guides listed here. Add a guide when it's ready for regular testing.
- **`config/rubric.md`** — Evaluation criteria for the LLM scorer. Edit this to adjust what the evaluator cares about.
- **`.env`** — Your API credentials and optional registry proxies (gitignored, never committed).
15 changes: 15 additions & 0 deletions evals/docs-guides/config/guides.yaml
@@ -0,0 +1,15 @@
guides:
  express:
    doc_url: "https://neon.com/docs/guides/express.md"
  prisma:
    doc_url: "https://neon.com/docs/guides/prisma.md"
  django:
    doc_url: "https://neon.com/docs/guides/django.md"
  rust:
    doc_url: "https://neon.com/docs/guides/rust.md"
  elixir:
    doc_url: "https://neon.com/docs/guides/elixir.md"
  go:
    doc_url: "https://neon.com/docs/guides/go.md"
  java:
    doc_url: "https://neon.com/docs/guides/java.md"
51 changes: 51 additions & 0 deletions evals/docs-guides/config/rubric.md
@@ -0,0 +1,51 @@
You are an evaluator for AI agent documentation-following tests. You assess how well an AI agent followed a documentation guide to set up a framework connected to Neon Postgres.

## Scoring rubric (0-10)

- **10**: Working setup on first attempt, correct patterns, no backtracking, zero user interventions
- **7-9**: Working setup with minor issues or one instance of backtracking
- **4-6**: Working setup but required significant corrections or missed key requirements
- **0-3**: Did not produce a working setup

## Evaluation dimensions

### correct_packages
Did the agent install the correct packages/drivers? No unnecessary, deprecated, or wrong packages.
- PASS: Only packages needed for the guide's approach were installed
- FAIL: Installed deprecated packages, wrong drivers, or unnecessary dependencies

### best_practices
Did the code follow best practices?
- Error handling (try/catch, proper HTTP status codes)
- Connection cleanup (client.release(), pool.end(), disconnect)
- Credentials are not hardcoded; sensitive values are managed via environment variables or secure configuration mechanisms appropriate to the language/framework (e.g., `config.exs` for Elixir, `appsettings.json` for C#)
- .env file with dotenv or framework-native env loading

### stayed_in_scope
Did the agent stay within the scope of the task?
- PASS: Only created files and features described in the guide
- FAIL: Added unrequested features beyond what the guide demonstrates
- NOTE: If the guide itself demonstrates CRUD operations, transactions, etc., creating those files IS in scope. Judge scope against what the guide covers, not against a minimal quickstart assumption.

### no_backtracking
Did the agent complete the task without backtracking or fixing its own errors?
- PASS: Linear progress from start to finish
- FAIL: Had to undo work, fix errors it introduced, or retry failed approaches

## Critical: Infrastructure vs content failures

Network, DNS, proxy, and registry issues are INFRASTRUCTURE failures, not documentation failures. When evaluating:

- If `npm install`, `pip install`, `go get`, `mvn` commands fail due to network/DNS/proxy/registry issues, do NOT count this against the documentation or the agent.
- If the agent has to work around infrastructure issues (e.g., manual compilation instead of Maven, alternate package sources), evaluate the WORKAROUND on its merits, not the fact that it was needed.
- Backtracking caused by infrastructure issues (network retries, DNS troubleshooting) should NOT count as backtracking for scoring purposes. Only count backtracking where the agent made a mistake in following the documentation.
- Rate limiting (429 errors) causing delays is infrastructure, not a content failure.

When scoring, mentally separate: "Did the documentation guide the agent correctly?" from "Did the environment cooperate?"

## Other important notes

- Writing credentials to .env (and tool output showing the write) is EXPECTED behavior, not a failure
- Missing .gitignore is a minor issue, not a major failure
- Score the DOCUMENTATION's effectiveness at guiding the agent, not the agent's general capability
- If the agent successfully connects to Neon and demonstrates the guide's functionality, that is the primary success criterion
42 changes: 42 additions & 0 deletions evals/docs-guides/docker/Dockerfile.universal
@@ -0,0 +1,42 @@
FROM debian:bookworm-slim

# Core tools — the agent installs language runtimes as needed
RUN apt-get update && apt-get install -y \
    curl \
    wget \
    git \
    ca-certificates \
    gnupg \
    build-essential \
    pkg-config \
    libssl-dev \
    unzip \
    sudo \
    && rm -rf /var/lib/apt/lists/*

# Node.js 22 (pre-installed — used by most guides)
RUN curl -fsSL https://deb.nodesource.com/setup_22.x | bash - \
    && apt-get install -y nodejs \
    && rm -rf /var/lib/apt/lists/*

# Python 3 (pre-installed — used by many guides)
RUN apt-get update && apt-get install -y \
    python3 \
    python3-pip \
    python3-venv \
    && rm -rf /var/lib/apt/lists/* \
    && ln -sf /usr/bin/python3 /usr/bin/python

# Everything else (Go, Java, Rust, Elixir, Ruby, PHP, .NET, etc.)
# is installed by the agent at runtime via apt-get.

# Registry proxies are injected at container runtime via env vars.
# See .env.example for the list of supported proxy vars.

WORKDIR /app

# Entrypoint script configures package manager proxies from env vars,
# then exec's the original command (sleep infinity).
COPY entrypoint.sh /entrypoint.sh
RUN chmod +x /entrypoint.sh
ENTRYPOINT ["/entrypoint.sh"]
39 changes: 39 additions & 0 deletions evals/docs-guides/docker/entrypoint.sh
@@ -0,0 +1,39 @@
#!/bin/bash
# Configure package manager proxies from environment variables.
# If a proxy var is set, configure the corresponding package manager.
# If not set, package managers use their default public registries.

# npm
if [ -n "$NPM_REGISTRY" ]; then
  npm config set registry "$NPM_REGISTRY"
fi

# pip — the global config file pip reads on Linux is /etc/pip.conf
if [ -n "$PIP_INDEX_URL" ]; then
  TRUSTED=$(echo "$PIP_INDEX_URL" | sed 's|https://||;s|http://||;s|/.*||')
  printf "[global]\nindex-url = %s\ntrusted-host = %s\n" "$PIP_INDEX_URL" "$TRUSTED" \
    > /etc/pip.conf
fi

# Go modules
if [ -n "$GOPROXY" ]; then
  export GOPROXY
fi

# Maven — point both repositories and plugin repositories at the mirror
if [ -n "$MAVEN_MIRROR_URL" ]; then
  mkdir -p /root/.m2
  cat > /root/.m2/settings.xml <<EOF
<settings>
  <profiles>
    <profile>
      <id>mirror</id>
      <repositories>
        <repository><id>central</id><url>${MAVEN_MIRROR_URL}</url></repository>
      </repositories>
      <pluginRepositories>
        <pluginRepository><id>central</id><url>${MAVEN_MIRROR_URL}</url></pluginRepository>
      </pluginRepositories>
    </profile>
  </profiles>
  <activeProfiles>
    <activeProfile>mirror</activeProfile>
  </activeProfiles>
</settings>
EOF
fi

# Cargo (crates.io)
if [ -n "$CARGO_REGISTRY_URL" ]; then
  mkdir -p /root/.cargo
  printf '[net]\ngit-fetch-with-cli = true\nretry = 5\n\n[http]\ntimeout = 120\n\n[source.crates-io]\nreplace-with = "proxy"\n\n[source.proxy]\nregistry = "sparse+%s"\n' "$CARGO_REGISTRY_URL" \
    > /root/.cargo/config.toml
fi

# Execute the original command
exec "$@"
20 changes: 20 additions & 0 deletions evals/docs-guides/package.json
@@ -0,0 +1,20 @@
{
  "name": "neon-docs-eval",
  "version": "0.1.0",
  "description": "Automated test harness for Neon docs agent effectiveness",
  "type": "module",
  "scripts": {
    "eval": "tsx src/runner.ts",
    "build": "tsc"
  },
  "dependencies": {
    "js-yaml": "^4.1.0",
    "openai": "^6.32.0"
  },
  "devDependencies": {
    "@types/js-yaml": "^4.0.9",
    "@types/node": "^22.0.0",
    "tsx": "^4.19.0",
    "typescript": "^5.7.0"
  }
}
43 changes: 43 additions & 0 deletions evals/docs-guides/src/client.ts
@@ -0,0 +1,43 @@
import OpenAI from 'openai';

function createDatabricksClient(model: string): OpenAI {
  const host = process.env.DATABRICKS_HOST!;
  const token = process.env.DATABRICKS_TOKEN!;

  return new OpenAI({
    apiKey: token,
    baseURL: `${host.replace(/\/+$/, '')}/serving-endpoints/${model}`,
    // Databricks Model Serving exposes chat completions at /invocations,
    // so rewrite the path the SDK would otherwise request.
    fetch: async (url: RequestInfo | URL, init?: RequestInit) => {
      const rewritten = url.toString().replace(/\/chat\/completions$/, '/invocations');
      return globalThis.fetch(rewritten, init);
    },
  });
}

function createGenericClient(baseURL?: string): OpenAI {
  return new OpenAI({
    ...(baseURL ? { baseURL } : {}),
  });
}

export function getClient(model?: string): OpenAI {
  // Databricks Model Serving (uses per-model URL routing)
  if (process.env.DATABRICKS_HOST && process.env.DATABRICKS_TOKEN) {
    if (!model) throw new Error('Databricks client requires a model name');
    return createDatabricksClient(model);
  }

  // Generic OpenAI-compatible endpoint (custom base URL)
  if (process.env.OPENAI_BASE_URL) {
    return createGenericClient(process.env.OPENAI_BASE_URL);
  }

  // Direct OpenAI (OPENAI_API_KEY picked up automatically by the SDK)
  if (process.env.OPENAI_API_KEY) {
    return createGenericClient();
  }

  throw new Error('No API credentials found. See .env.example for options.');
}
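
// Usage sketch (illustrative, not part of the harness; model and prompt
// are placeholders):
//
//   import { getClient } from './client';
//
//   const model = process.env.EVAL_WORKER_MODEL ?? 'claude-sonnet-4-6';
//   const client = getClient(model);
//   const res = await client.chat.completions.create({
//     model,
//     messages: [{ role: 'user', content: 'ping' }],
//   });
//   console.log(res.choices[0]?.message?.content);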