diff --git a/.claude/commands/eval-guide.md b/.claude/commands/eval-guide.md new file mode 100644 index 0000000000..b1cb9863c3 --- /dev/null +++ b/.claude/commands/eval-guide.md @@ -0,0 +1,36 @@ +--- +description: 'Test a framework integration guide by having an AI agent follow it from zero to working Neon connection' +--- + +Run the docs guide eval harness against one or more guides. The harness creates an ephemeral Neon database, spins up a Docker container, and lets an AI agent follow the guide autonomously. A separate evaluator scores the result. + +**Prerequisites:** Docker Desktop must be running. The harness must be set up (`cd evals/docs-guides && npm install` and `.env` configured). See `evals/docs-guides/README.md`. + +**Arguments:** Pass the guide name(s) and optionally a local path. Examples: +- `/eval-guide express` — test the published Express guide +- `/eval-guide rust --local ./content/docs/guides/` — test a local draft +- `/eval-guide express,prisma,django` — test multiple guides + +**Steps:** + +1. Parse the guide name(s) from the user's input. If `--local` is specified, use that path. Otherwise default to `--local ./content/docs/guides/` if the file exists there, falling back to the published URL. + +2. Run the harness: +```bash +cd evals/docs-guides && npm run eval -- --guide {names} {--local path if applicable} --timeout 15 +``` + +3. Wait for the harness to complete. It will take 1-5 minutes per guide. + +4. Read `evals/docs-guides/results/latest.json` for the scores. + +5. Summarize the results for the user: + - Overall score (0-10) and what it means + - Which deterministic checks passed/failed + - Key findings from the evaluator's reasoning + - Any specific documentation improvements suggested + - Whether infrastructure issues (network, DNS, proxy) affected the score + +6. If the score is below 8, offer to read the full transcript at `evals/docs-guides/results/history/{latest}/` to diagnose specific issues. + +7. If the user asks about a specific failure, read the relevant transcript or file snapshots to provide detailed analysis. diff --git a/evals/docs-guides/.env.example b/evals/docs-guides/.env.example new file mode 100644 index 0000000000..34332e458d --- /dev/null +++ b/evals/docs-guides/.env.example @@ -0,0 +1,30 @@ +# === API credentials (choose one) === + +# Option 1: OpenAI-compatible endpoint with custom base URL +# OPENAI_BASE_URL=https://your-endpoint.example.com/serving-endpoints +# OPENAI_API_KEY=your-token + +# Option 2: Direct OpenAI +# OPENAI_API_KEY=sk-... + +# Option 3: Databricks Model Serving (see internal setup guide) +# DATABRICKS_HOST=https://your-workspace.example.com +# DATABRICKS_TOKEN=your-token + +# Model names (defaults to claude-sonnet-4-6 if not set) +# EVAL_WORKER_MODEL=claude-sonnet-4-6 +# EVAL_JUDGE_MODEL=claude-sonnet-4-6 + +# Throttle delay between API calls in ms (default: 2000). +# Increase if you hit rate limits frequently. +# EVAL_THROTTLE_MS=2000 + +# === Registry proxies (optional) === +# Set these if your network blocks direct access to public registries. +# If not set, package managers use their default public registries. 
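+#
+# Example (hypothetical hostnames): an Artifactory-style setup might look like:
+#   NPM_REGISTRY=https://artifactory.internal.example.com/api/npm/npm-remote/
+#   PIP_INDEX_URL=https://artifactory.internal.example.com/api/pypi/pypi-remote/simple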
+ +# NPM_REGISTRY=https://your-npm-proxy/ +# PIP_INDEX_URL=https://your-pypi-proxy/simple +# GOPROXY=https://your-go-proxy +# MAVEN_MIRROR_URL=https://your-maven-proxy +# CARGO_REGISTRY_URL=https://your-crates-proxy/ diff --git a/evals/docs-guides/.gitignore b/evals/docs-guides/.gitignore new file mode 100644 index 0000000000..81f724d06b --- /dev/null +++ b/evals/docs-guides/.gitignore @@ -0,0 +1,5 @@ +node_modules/ +dist/ +results/ +.env +docker/build-local.sh diff --git a/evals/docs-guides/README.md b/evals/docs-guides/README.md new file mode 100644 index 0000000000..ca55ee7275 --- /dev/null +++ b/evals/docs-guides/README.md @@ -0,0 +1,85 @@ +# Docs Guide Eval Harness + +Tests whether Neon framework integration guides are effective enough for an AI agent to follow from zero to a working Neon connection. The agent reads the guide, sets up the project in an isolated Docker container with an ephemeral Neon database, and a separate evaluator scores the result. + +The harness scores the documentation, not the agent. A low score means the guide needs improvement. + +## Setup + +1. **Install Docker Desktop** and make sure it's running. + +2. **Install dependencies:** + ```bash + cd evals/docs-guides + npm install + ``` + +3. **Configure credentials.** Copy the example and add your API key: + ```bash + cp .env.example .env + ``` + Edit `evals/docs-guides/.env` with your credentials. You need one of: + - `OPENAI_API_KEY` for direct OpenAI access + - `OPENAI_BASE_URL` + `OPENAI_API_KEY` for any OpenAI-compatible endpoint + - `DATABRICKS_HOST` + `DATABRICKS_TOKEN` for Databricks Model Serving + + If your network blocks public package registries, add the proxy URLs for npm, pip, Go, Maven, and Cargo. See `.env.example` for all options. + +4. **Build the Docker image** (happens automatically on first run, takes a few minutes). + +## Usage with Claude Code + +The easiest way to use this harness is via the `/eval-guide` slash command in a Claude Code session: + +``` +/eval-guide express +/eval-guide rust --local ./content/docs/guides/ +/eval-guide express,prisma,django +``` + +Claude runs the harness, reads the results, and summarizes the findings conversationally. Use the output as context for future doc edits. If the evaluator identifies specific issues (unclear driver selection, missing error handling, scope creep), those are direct improvement signals for the guide. + +## Manual CLI usage + +Test a published guide: +```bash +npm run eval -- --guide express +``` + +Test a local draft before publishing: +```bash +npm run eval -- --guide express --local ../../content/docs/guides/ +``` + +Test multiple guides: +```bash +npm run eval -- --guide express,prisma,django +``` + +Run all guides registered in `config/guides.yaml` (intended for scheduled/CI runs): +```bash +npm run eval +``` + +## How it works + +1. Creates an ephemeral Neon Postgres database (via neon.new, no account needed) +2. Starts a Docker container with Node.js, Python, and apt-get access for other runtimes +3. Gives the AI agent the guide content and a task prompt +4. The agent installs packages, writes code, and verifies the connection +5. Deterministic checks confirm: connection works, .env exists, no hardcoded credentials +6. An LLM evaluator scores the session against a rubric (0-10) +7. 
Saves transcript, file snapshots, and scores to `results/` + +## Output + +Results go to `results/history/{timestamp}/` with: +- `summary.json` — scores and deterministic check results +- `{guide}/transcript.txt` — readable conversation log +- `{guide}/files/` — every file the agent created + +## Configuration + +- **`config/guides.yaml`** — Registry of guides available for eval. Running `npm run eval` with no `--guide` flag runs all guides listed here. Add a guide when it's ready for regular testing. +- **`config/rubric.md`** — Evaluation criteria for the LLM scorer. Edit this to adjust what the evaluator cares about. +- **`.env`** — Your API credentials and optional registry proxies (gitignored, never committed). diff --git a/evals/docs-guides/config/guides.yaml b/evals/docs-guides/config/guides.yaml new file mode 100644 index 0000000000..4a73233f98 --- /dev/null +++ b/evals/docs-guides/config/guides.yaml @@ -0,0 +1,15 @@ +guides: + express: + doc_url: "https://neon.com/docs/guides/express.md" + prisma: + doc_url: "https://neon.com/docs/guides/prisma.md" + django: + doc_url: "https://neon.com/docs/guides/django.md" + rust: + doc_url: "https://neon.com/docs/guides/rust.md" + elixir: + doc_url: "https://neon.com/docs/guides/elixir.md" + go: + doc_url: "https://neon.com/docs/guides/go.md" + java: + doc_url: "https://neon.com/docs/guides/java.md" diff --git a/evals/docs-guides/config/rubric.md b/evals/docs-guides/config/rubric.md new file mode 100644 index 0000000000..ea7d015a7d --- /dev/null +++ b/evals/docs-guides/config/rubric.md @@ -0,0 +1,51 @@ +You are an evaluator for AI agent documentation-following tests. You assess how well an AI agent followed a documentation guide to set up a framework connected to Neon Postgres. + +## Scoring rubric (0-10) + +- **10**: Working setup on first attempt, correct patterns, no backtracking, zero user interventions +- **7-9**: Working setup with minor issues or one instance of backtracking +- **4-6**: Working setup but required significant corrections or missed key requirements +- **0-3**: Did not produce a working setup + +## Evaluation dimensions + +### correct_packages +Did the agent install the correct packages/drivers? No unnecessary, deprecated, or wrong packages. +- PASS: Only packages needed for the guide's approach were installed +- FAIL: Installed deprecated packages, wrong drivers, or unnecessary dependencies + +### best_practices +Did the code follow best practices? +- Error handling (try/catch, proper HTTP status codes) +- Connection cleanup (client.release(), pool.end(), disconnect) +- Credentials are not hardcoded; sensitive values are managed via environment variables or secure configuration mechanisms (e.g., `config.exs` for Elixir, `appsettings.json` for C# etc.), as appropriate for the language/framework +- .env file with dotenv or framework-native env loading + +### stayed_in_scope +Did the agent stay within the scope of the task? +- PASS: Only created files and features described in the guide +- FAIL: Added unrequested features beyond what the guide demonstrates +- NOTE: If the guide itself demonstrates CRUD operations, transactions, etc., creating those files IS in scope. Judge scope against what the guide covers, not against a minimal quickstart assumption. + +### no_backtracking +Did the agent complete the task without backtracking or fixing its own errors? 
+- PASS: Linear progress from start to finish +- FAIL: Had to undo work, fix errors it introduced, or retry failed approaches + +## Critical: Infrastructure vs content failures + +Network, DNS, proxy, and registry issues are INFRASTRUCTURE failures, not documentation failures. When evaluating: + +- If `npm install`, `pip install`, `go get`, `mvn` commands fail due to network/DNS/proxy/registry issues, do NOT count this against the documentation or the agent. +- If the agent has to work around infrastructure issues (e.g., manual compilation instead of Maven, alternate package sources), evaluate the WORKAROUND on its merits, not the fact that it was needed. +- Backtracking caused by infrastructure issues (network retries, DNS troubleshooting) should NOT count as backtracking for scoring purposes. Only count backtracking where the agent made a mistake in following the documentation. +- Rate limiting (429 errors) causing delays is infrastructure, not a content failure. + +When scoring, mentally separate: "Did the documentation guide the agent correctly?" from "Did the environment cooperate?" + +## Other important notes + +- Writing credentials to .env (and tool output showing the write) is EXPECTED behavior, not a failure +- Missing .gitignore is a minor issue, not a major failure +- Score the DOCUMENTATION's effectiveness at guiding the agent, not the agent's general capability +- If the agent successfully connects to Neon and demonstrates the guide's functionality, that is the primary success criterion diff --git a/evals/docs-guides/docker/Dockerfile.universal b/evals/docs-guides/docker/Dockerfile.universal new file mode 100644 index 0000000000..0b3b676002 --- /dev/null +++ b/evals/docs-guides/docker/Dockerfile.universal @@ -0,0 +1,42 @@ +FROM debian:bookworm-slim + +# Core tools — the agent installs language runtimes as needed +RUN apt-get update && apt-get install -y \ + curl \ + wget \ + git \ + ca-certificates \ + gnupg \ + build-essential \ + pkg-config \ + libssl-dev \ + unzip \ + sudo \ + && rm -rf /var/lib/apt/lists/* + +# Node.js 22 (pre-installed — used by most guides) +RUN curl -fsSL https://deb.nodesource.com/setup_22.x | bash - \ + && apt-get install -y nodejs \ + && rm -rf /var/lib/apt/lists/* + +# Python 3 (pre-installed — used by many guides) +RUN apt-get update && apt-get install -y \ + python3 \ + python3-pip \ + python3-venv \ + && rm -rf /var/lib/apt/lists/* \ + && ln -sf /usr/bin/python3 /usr/bin/python + +# Everything else (Go, Java, Rust, Elixir, Ruby, PHP, .NET, etc.) +# is installed by the agent at runtime via apt-get. + +# Registry proxies are injected at container runtime via env vars. +# See .env.example for the list of supported proxy vars. + +WORKDIR /app + +# Entrypoint script configures package manager proxies from env vars, +# then exec's the original command (sleep infinity). +COPY entrypoint.sh /entrypoint.sh +RUN chmod +x /entrypoint.sh +ENTRYPOINT ["/entrypoint.sh"] diff --git a/evals/docs-guides/docker/entrypoint.sh b/evals/docs-guides/docker/entrypoint.sh new file mode 100644 index 0000000000..cd39488136 --- /dev/null +++ b/evals/docs-guides/docker/entrypoint.sh @@ -0,0 +1,39 @@ +#!/bin/bash +# Configure package manager proxies from environment variables. +# If a proxy var is set, configure the corresponding package manager. +# If not set, package managers use their default public registries. 
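+#
+# Invocation sketch (see src/docker.ts): the harness starts the container with
+#   docker run -d -e NPM_REGISTRY=... -e PIP_INDEX_URL=... <image> sleep infinity
+# so this script runs once at startup, applies any proxy configuration, and
+# then exec's the original command.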
+
+# npm
+if [ -n "$NPM_REGISTRY" ]; then
+  npm config set registry "$NPM_REGISTRY"
+fi
+
+# pip (global config lives at /etc/pip.conf on Linux)
+if [ -n "$PIP_INDEX_URL" ]; then
+  TRUSTED=$(echo "$PIP_INDEX_URL" | sed 's|https://||;s|http://||;s|/.*||')
+  printf "[global]\nindex-url = %s\ntrusted-host = %s\n" "$PIP_INDEX_URL" "$TRUSTED" \
+    > /etc/pip.conf
+fi
+
+# Go modules
+if [ -n "$GOPROXY" ]; then
+  export GOPROXY
+fi
+
+# Maven
+if [ -n "$MAVEN_MIRROR_URL" ]; then
+  mkdir -p /root/.m2
+  printf '<settings><mirrors><mirror><id>central</id><name>central</name><url>%s</url><mirrorOf>central</mirrorOf></mirror></mirrors></settings>\n' "$MAVEN_MIRROR_URL" \
+    > /root/.m2/settings.xml
+fi
+
+# Cargo (crates.io)
+if [ -n "$CARGO_REGISTRY_URL" ]; then
+  mkdir -p /root/.cargo
+  printf '[net]\ngit-fetch-with-cli = true\nretry = 5\n\n[http]\ntimeout = 120\n\n[source.crates-io]\nreplace-with = "proxy"\n\n[source.proxy]\nregistry = "sparse+%s"\n' "$CARGO_REGISTRY_URL" \
+    > /root/.cargo/config.toml
+fi
+
+# Execute the original command
+exec "$@"
diff --git a/evals/docs-guides/package.json b/evals/docs-guides/package.json
new file mode 100644
index 0000000000..984b174dd3
--- /dev/null
+++ b/evals/docs-guides/package.json
@@ -0,0 +1,20 @@
+{
+  "name": "neon-docs-eval",
+  "version": "0.1.0",
+  "description": "Automated test harness for Neon docs agent effectiveness",
+  "type": "module",
+  "scripts": {
+    "eval": "tsx src/runner.ts",
+    "build": "tsc"
+  },
+  "dependencies": {
+    "js-yaml": "^4.1.0",
+    "openai": "^6.32.0"
+  },
+  "devDependencies": {
+    "@types/js-yaml": "^4.0.9",
+    "@types/node": "^22.0.0",
+    "tsx": "^4.19.0",
+    "typescript": "^5.7.0"
+  }
+}
diff --git a/evals/docs-guides/src/client.ts b/evals/docs-guides/src/client.ts
new file mode 100644
index 0000000000..cf2b5dde9d
--- /dev/null
+++ b/evals/docs-guides/src/client.ts
@@ -0,0 +1,43 @@
+import OpenAI from 'openai';
+
+function createDatabricksClient(model: string): OpenAI {
+  const host = process.env.DATABRICKS_HOST!;
+  const token = process.env.DATABRICKS_TOKEN!;
+
+  return new OpenAI({
+    apiKey: token,
+    baseURL: `${host.replace(/\/+$/, '')}/serving-endpoints/${model}`,
+    fetch: async (url: RequestInfo | URL, init?: RequestInit) => {
+      const rewritten = url.toString().replace(/\/chat\/completions$/, '/invocations');
+      return globalThis.fetch(rewritten, init);
+    },
+  });
+}
+
+function createGenericClient(baseURL?: string): OpenAI {
+  return new OpenAI({
+    ...(baseURL ? { baseURL } : {}),
+  });
+}
+
+export function getClient(model?: string): OpenAI {
+  // Databricks Model Serving (uses per-model URL routing)
+  if (process.env.DATABRICKS_HOST && process.env.DATABRICKS_TOKEN) {
+    if (!model) throw new Error('Databricks client requires a model name');
+    return createDatabricksClient(model);
+  }
+
+  // Generic OpenAI-compatible endpoint (custom base URL)
+  if (process.env.OPENAI_BASE_URL) {
+    return createGenericClient(process.env.OPENAI_BASE_URL);
+  }
+
+  // Direct OpenAI (OPENAI_API_KEY picked up automatically by the SDK)
+  if (process.env.OPENAI_API_KEY) {
+    return createGenericClient();
+  }
+
+  throw new Error(
+    'No API credentials found. See .env.example for options.',
+  );
+}
diff --git a/evals/docs-guides/src/config.ts b/evals/docs-guides/src/config.ts
new file mode 100644
index 0000000000..bfb51e1ebf
--- /dev/null
+++ b/evals/docs-guides/src/config.ts
@@ -0,0 +1,56 @@
+import { readFileSync } from 'fs';
+import yaml from 'js-yaml';
+
+export interface GuideConfig {
+  doc_url: string;
+}
+
+export interface GuidesConfig {
+  guides: Record<string, GuideConfig>;
+}
+
+export interface RunConfig {
+  model: string;
+  evalModel: string;
+  guides: string[];
+  configPath: string;
+  localDir: string | null;
+  timeout: number; // minutes per guide
+}
+
+export function loadGuides(configPath: string): GuidesConfig {
+  const raw = readFileSync(configPath, 'utf-8');
+  return yaml.load(raw) as GuidesConfig;
+}
+
+export function parseArgs(): RunConfig {
+  const args = process.argv.slice(2);
+  let model = process.env.EVAL_WORKER_MODEL || 'claude-sonnet-4-6';
+  let evalModel = process.env.EVAL_JUDGE_MODEL || 'claude-sonnet-4-6';
+  let guides: string[] = [];
+  let localDir: string | null = null;
+  let timeout = 10; // default 10 min per guide
+
+  const configPath = new URL('../config/guides.yaml', import.meta.url).pathname;
+
+  for (let i = 0; i < args.length; i++) {
+    if (args[i] === '--guide' && args[i + 1]) {
+      guides = args[i + 1].split(',');
+      i++;
+    } else if (args[i] === '--model' && args[i + 1]) {
+      model = args[i + 1];
+      i++;
+    } else if (args[i] === '--eval-model' && args[i + 1]) {
+      evalModel = args[i + 1];
+      i++;
+    } else if (args[i] === '--local' && args[i + 1]) {
+      localDir = args[i + 1];
+      i++;
+    } else if (args[i] === '--timeout' && args[i + 1]) {
+      timeout = parseInt(args[i + 1], 10);
+      i++;
+    }
+  }
+
+  return { model, evalModel, guides, configPath, localDir, timeout };
}
diff --git a/evals/docs-guides/src/docker.ts b/evals/docs-guides/src/docker.ts
new file mode 100644
index 0000000000..d21654bb29
--- /dev/null
+++ b/evals/docs-guides/src/docker.ts
@@ -0,0 +1,180 @@
+import { execFile } from 'child_process';
+import { promisify } from 'util';
+
+const execFileAsync = promisify(execFile);
+
+const UNIVERSAL_IMAGE = 'neon-eval-universal';
+
+export interface Container {
+  id: string;
+  name: string;
+}
+
+export interface FileSnapshot {
+  path: string;
+  content: string;
+}
+
+// Proxy env vars to pass through from host to container (if set)
+const PROXY_ENV_VARS = [
+  'NPM_REGISTRY',
+  'PIP_INDEX_URL',
+  'GOPROXY',
+  'MAVEN_MIRROR_URL',
+  'CARGO_REGISTRY_URL',
+];
+
+export async function startContainer(
+  name: string,
+  env: Record<string, string>,
+): Promise<Container> {
+  // Pass through any proxy env vars that are set on the host
+  const allEnv = { ...env };
+  for (const key of PROXY_ENV_VARS) {
+    if (process.env[key] && !allEnv[key]) {
+      allEnv[key] = process.env[key]!;
+    }
+  }
+
+  const envArgs = Object.entries(allEnv).flatMap(([k, v]) => ['-e', `${k}=${v}`]);
+
+  const { stdout } = await execFileAsync('docker', [
+    'run', '-d',
+    '--name', name,
+    '-w', '/app',
+    ...envArgs,
+    UNIVERSAL_IMAGE,
+    'sleep', 'infinity',
+  ]);
+
+  return { id: stdout.trim(), name };
+}
+
+export async function dockerExec(
+  container: Container,
+  command: string,
+  timeoutMs = 120_000,
+): Promise<{ stdout: string; stderr: string; exitCode: number }> {
+  try {
+    const { stdout, stderr } = await execFileAsync(
+      'docker',
+      ['exec', container.name, 'bash', '-c', command],
+      { timeout: timeoutMs, maxBuffer: 10 * 1024 * 1024 },
+    );
+    return { stdout, stderr, exitCode: 0 };
+  } catch (err: any) {
+    return {
+      stdout: err.stdout || '',
+      stderr: err.stderr || '',
+      exitCode: err.code ?? 1,
+    };
+  }
+}
+
+export async function dockerWriteFile(
+  container: Container,
+  filePath: string,
+  content: string,
+): Promise<string> {
+  const safePath = filePath.replace(/'/g, "'\\''");
+  // First ensure the directory exists
+  await dockerExec(container, `mkdir -p "$(dirname '${safePath}')"`);
+  // Write via stdin to avoid shell injection from file content. spawn comes in
+  // via a dynamic import because this file is an ES module ("type": "module"),
+  // where CommonJS require() is not available.
+  try {
+    const { spawn } = await import('node:child_process');
+    const child = spawn(
+      'docker', ['exec', '-i', container.name, 'bash', '-c', `cat > '${safePath}'`],
+      { stdio: ['pipe', 'pipe', 'pipe'] },
+    );
+    child.stdin.write(content);
+    child.stdin.end();
+    await new Promise<void>((resolve, reject) => {
+      child.on('close', (code) => code === 0 ? resolve() : reject(new Error(`exit ${code}`)));
+      child.on('error', reject);
+    });
+    return `File written to ${filePath}`;
+  } catch (err) {
+    return `Error writing file: ${err}`;
+  }
+}
+
+export async function dockerReadFile(
+  container: Container,
+  filePath: string,
+): Promise<string> {
+  const safePath = filePath.replace(/'/g, "'\\''");
+  const { stdout, stderr, exitCode } = await dockerExec(container, `cat '${safePath}'`);
+  if (exitCode !== 0) return `Error reading file: ${stderr}`;
+  return stdout;
+}
+
+export async function captureFileSnapshots(container: Container): Promise<FileSnapshot[]> {
+  // Grab all user-created files in one shot using a find + cat loop that
+  // prints a `===FILE:<path>` marker line followed by that file's content
+  const result = await dockerExec(
+    container,
+    `find /app -type f \
+      -not -path '*/node_modules/*' \
+      -not -path '*/__pycache__/*' \
+      -not -path '*/.git/*' \
+      -not -path '*/venv/*' \
+      -not -path '*/.venv/*' \
+      -not -path '*/target/*' \
+      -not -path '*/_build/*' \
+      -not -path '*/deps/*' \
+      -not -name 'package-lock.json' \
+      -not -name '*.class' \
+      -not -name 'go.sum' \
+      -size -50k \
+      2>/dev/null | sort | while IFS= read -r f; do
+        printf '%s\\n' "===FILE:$f"
+        cat "$f" 2>/dev/null
+      done`,
+    300_000,
+  );
+
+  const snapshots: FileSnapshot[] = [];
+  const chunks = result.stdout.split(/^===FILE:/m).filter(Boolean);
+  for (const chunk of chunks) {
+    const newlineIdx = chunk.indexOf('\n');
+    if (newlineIdx === -1) continue;
+    const filePath = chunk.slice(0, newlineIdx).trim();
+    const content = chunk.slice(newlineIdx + 1);
+    if (filePath && content.length < 50_000) {
+      snapshots.push({
+        path: filePath.replace('/app/', ''),
+        content,
+      });
+    }
+  }
+
+  return snapshots;
+}
+
+export async function removeContainer(container: Container): Promise<void> {
+  try {
+    await execFileAsync('docker', ['rm', '-f', container.name]);
+  } catch {
+    // ignore — container may already be gone
+  }
+}
+
+export async function buildUniversalImage(): Promise<void> {
+  const dockerDir = new URL('../docker', import.meta.url).pathname;
+
+  // Check if image already exists
+  try {
+    await execFileAsync('docker', ['image', 'inspect', UNIVERSAL_IMAGE]);
+    console.log(`Image ${UNIVERSAL_IMAGE} already exists. Delete it manually to rebuild.`);
+    return;
+  } catch {
+    // Image doesn't exist, build it
+  }
+
+  console.log(`Building ${UNIVERSAL_IMAGE} (this takes a few minutes the first time)...`);
+  await execFileAsync('docker', [
+    'build', '-t', UNIVERSAL_IMAGE,
+    '-f', `${dockerDir}/Dockerfile.universal`,
+    dockerDir,
+  ], { timeout: 600_000 });
+  console.log(`Image ${UNIVERSAL_IMAGE} built.`);
+}
diff --git a/evals/docs-guides/src/evaluator.ts b/evals/docs-guides/src/evaluator.ts
new file mode 100644
index 0000000000..e04d526a6c
--- /dev/null
+++ b/evals/docs-guides/src/evaluator.ts
@@ -0,0 +1,295 @@
+import OpenAI from 'openai';
+import { readFileSync } from 'fs';
+import { Container, dockerExec } from './docker.js';
+import { getClient } from './client.js';
+
+export interface DeterministicChecks {
+  has_source_files: boolean;
+  env_file_exists: boolean;
+  no_hardcoded_creds: boolean;
+  agent_verified_connection: boolean;
+}
+
+export interface EvalDimension {
+  pass: boolean;
+  note: string;
+}
+
+export interface EvalResult {
+  overall_score: number;
+  dimensions: {
+    correct_packages: EvalDimension;
+    best_practices: EvalDimension;
+    stayed_in_scope: EvalDimension;
+    no_backtracking: EvalDimension;
+  };
+  user_interventions: number;
+  failures: string[];
+  reasoning: string;
+}
+
+export interface FullEvaluation {
+  deterministicChecks: DeterministicChecks;
+  llmEval: EvalResult;
+}
+
+// --- Layer 1: Deterministic checks ---
+
+export async function runDeterministicChecks(
+  container: Container,
+  connectionString: string,
+  transcript: OpenAI.Chat.Completions.ChatCompletionMessageParam[],
+): Promise<DeterministicChecks> {
+  // Check if source files were created
+  const filesResult = await dockerExec(
+    container,
+    `find /app -type f \
+      -not -path '*/node_modules/*' \
+      -not -path '*/__pycache__/*' \
+      -not -path '*/venv/*' \
+      -not -path '*/.venv/*' \
+      -not -path '*/target/*' \
+      -not -name '*.class' \
+      -not -name 'package-lock.json' \
+      -not -name 'go.sum' \
+      2>/dev/null | wc -l`,
+  );
+  const hasSourceFiles = parseInt(filesResult.stdout.trim(), 10) > 0;
+
+  // Check .env file exists (anywhere under /app)
+  const envCheck = await dockerExec(container, 'find /app -name ".env" -type f 2>/dev/null | head -1');
+  let envFileExists = envCheck.stdout.trim().length > 0;
+
+  // Some guides store credentials in framework-native config instead of .env
+  // (config.exs/dev.exs for Elixir, appsettings.json for C#, etc.), so accept
+  // those files as the credential store when no .env is found
+  if (!envFileExists) {
+    envFileExists = await dockerExec(container, 'find /app -type f \\( -name "config.exs" -o -name "dev.exs" -o -name "appsettings.json" \\) 2>/dev/null | head -1')
+      .then(res => res.stdout.trim().length > 0);
+  }
+
+  // Check no hardcoded credentials in source files
+  const connMatch = connectionString.match(/npg_[a-zA-Z0-9]+/);
+  const searchFragment = connMatch ?
connMatch[0] : ''; + let noHardcodedCreds = true; + if (searchFragment) { + const grepResult = await dockerExec( + container, + `grep -r "${searchFragment}" \ + --include="*.js" --include="*.ts" --include="*.py" \ + --include="*.go" --include="*.java" --include="*.rb" \ + /app 2>/dev/null | grep -v node_modules | grep -v '.env' | grep -v __pycache__ || true`, + ); + noHardcodedCreds = grepResult.stdout.trim() === ''; + } + + // Check if the agent's own verification showed a successful connection + // Look for PostgreSQL version strings or success indicators in tool results + const transcriptText = JSON.stringify(transcript); + const agentVerifiedConnection = + /PostgreSQL \d+\.\d+/.test(transcriptText) || + /successfully connected/i.test(transcriptText) || + /connection.*successful/i.test(transcriptText) || + /rows? (affected|inserted|created|returned)/i.test(transcriptText); + + return { + has_source_files: hasSourceFiles, + env_file_exists: envFileExists, + no_hardcoded_creds: noHardcodedCreds, + agent_verified_connection: agentVerifiedConnection, + }; +} + +// --- Layer 2: LLM evaluation --- + +const scoringTool: OpenAI.Chat.Completions.ChatCompletionTool = { + type: 'function', + function: { + name: 'submit_evaluation', + description: 'Submit the evaluation scores for this guide test run.', + parameters: { + type: 'object', + properties: { + overall_score: { + type: 'number', + description: 'Overall score from 0-10.', + }, + dimensions: { + type: 'object', + properties: { + correct_packages: { + type: 'object', + properties: { pass: { type: 'boolean' }, note: { type: 'string' } }, + required: ['pass', 'note'], + }, + best_practices: { + type: 'object', + properties: { pass: { type: 'boolean' }, note: { type: 'string' } }, + required: ['pass', 'note'], + }, + stayed_in_scope: { + type: 'object', + properties: { pass: { type: 'boolean' }, note: { type: 'string' } }, + required: ['pass', 'note'], + }, + no_backtracking: { + type: 'object', + properties: { pass: { type: 'boolean' }, note: { type: 'string' } }, + required: ['pass', 'note'], + }, + }, + required: ['correct_packages', 'best_practices', 'stayed_in_scope', 'no_backtracking'], + }, + user_interventions: { + type: 'integer', + description: 'Number of user interventions needed beyond providing credentials.', + }, + failures: { + type: 'array', + items: { type: 'string' }, + description: 'List of specific failures or issues.', + }, + reasoning: { + type: 'string', + description: 'Detailed reasoning for the scores given.', + }, + }, + required: ['overall_score', 'dimensions', 'user_interventions', 'failures', 'reasoning'], + }, + }, +}; + +function buildTranscriptSummary(transcript: OpenAI.Chat.Completions.ChatCompletionMessageParam[]): string { + const parts: string[] = []; + + for (const msg of transcript) { + if (msg.role === 'user' && typeof msg.content === 'string') { + if (msg.content.length > 500) { + parts.push('[User provided documentation guide and task]'); + } else { + parts.push(`User: ${msg.content}`); + } + } else if (msg.role === 'assistant') { + const am = msg as OpenAI.Chat.Completions.ChatCompletionAssistantMessageParam; + if (am.content && typeof am.content === 'string') { + parts.push(`Assistant: ${am.content}`); + } + if (am.tool_calls) { + for (const tc of am.tool_calls) { + if (tc.type !== 'function') continue; + const args = JSON.parse(tc.function.arguments); + if (tc.function.name === 'bash') { + parts.push(`Tool [bash]: ${args.command}`); + } else if (tc.function.name === 'write_file') { + parts.push(`Tool 
[write_file]: ${args.path} (${args.content?.length || 0} chars)`);
+          } else if (tc.function.name === 'read_file') {
+            parts.push(`Tool [read_file]: ${args.path}`);
+          }
+        }
+      }
+    } else if (msg.role === 'tool') {
+      const tm = msg as OpenAI.Chat.Completions.ChatCompletionToolMessageParam;
+      const content = typeof tm.content === 'string' ? tm.content : '';
+      const preview = content.length > 200 ? content.slice(0, 200) + '...' : content;
+      parts.push(`Tool result: ${preview}`);
+    }
+  }
+
+  return parts.join('\n');
+}
+
+async function callWithRetry(
+  client: OpenAI,
+  params: OpenAI.Chat.Completions.ChatCompletionCreateParamsNonStreaming,
+  maxRetries = 3,
+): Promise<OpenAI.Chat.Completions.ChatCompletion> {
+  for (let attempt = 0; attempt <= maxRetries; attempt++) {
+    try {
+      return await client.chat.completions.create(params);
+    } catch (err: any) {
+      if (err?.status === 429 && attempt < maxRetries) {
+        const waitSec = Math.pow(2, attempt + 1) * 5;
+        console.log(`  Rate limited (429). Waiting ${waitSec}s before retry ${attempt + 1}/${maxRetries}...`);
+        await new Promise(r => setTimeout(r, waitSec * 1000));
+        continue;
+      }
+      throw err;
+    }
+  }
+  throw new Error('Unreachable');
+}
+
+export async function runLLMEvaluation(
+  transcript: OpenAI.Chat.Completions.ChatCompletionMessageParam[],
+  docsMarkdown: string,
+  deterministicChecks: DeterministicChecks,
+  evalModel: string,
+): Promise<EvalResult> {
+  const client = getClient(evalModel);
+  const rubricPath = new URL('../config/rubric.md', import.meta.url).pathname;
+  const rubric = readFileSync(rubricPath, 'utf-8');
+
+  const transcriptSummary = buildTranscriptSummary(transcript);
+
+  // Truncate docs to avoid overwhelming the evaluator context
+  const docsPreview = docsMarkdown.length > 3000
+    ? docsMarkdown.slice(0, 3000) + '\n...(truncated)'
+    : docsMarkdown;
+
+  const evalPrompt = [
+    '## Documentation guide (reference)',
+    '',
+    docsPreview,
+    '',
+    '## Deterministic check results',
+    '',
+    `- Source files created: ${deterministicChecks.has_source_files ? 'PASS' : 'FAIL'}`,
+    `- .env file exists: ${deterministicChecks.env_file_exists ? 'PASS' : 'FAIL'}`,
+    `- No hardcoded credentials: ${deterministicChecks.no_hardcoded_creds ? 'PASS' : 'FAIL'}`,
+    `- Agent verified connection in transcript: ${deterministicChecks.agent_verified_connection ? 'PASS' : 'FAIL'}`,
+    '',
+    '## Agent session transcript',
+    '',
+    transcriptSummary,
+    '',
+    `- Total turns: ${transcript.filter(m => m.role === 'assistant').length}`,
+    '',
+    'Based on the rubric, the documentation guide, the deterministic checks, and the full transcript,',
+    'evaluate this agent session by calling the submit_evaluation tool.',
+  ].join('\n');
+
+  const response = await callWithRetry(client, {
+    model: evalModel,
+    max_tokens: 2048,
+    messages: [
+      { role: 'system', content: rubric },
+      { role: 'user', content: evalPrompt },
+    ],
+    tools: [scoringTool],
+    tool_choice: { type: 'function', function: { name: 'submit_evaluation' } },
+  });
+
+  const choice = response.choices[0];
+  const toolCalls = choice?.message?.tool_calls || [];
+  for (const tc of toolCalls) {
+    if (tc.type === 'function' && tc.function.name === 'submit_evaluation') {
+      return JSON.parse(tc.function.arguments) as EvalResult;
+    }
+  }
+
+  throw new Error('Evaluator did not return a scoring result');
+}
+
+export async function evaluate(
+  container: Container,
+  connectionString: string,
+  transcript: OpenAI.Chat.Completions.ChatCompletionMessageParam[],
+  docsMarkdown: string,
+  evalModel: string,
+): Promise<FullEvaluation> {
+  console.log('  Running deterministic checks...');
+  const deterministicChecks = await runDeterministicChecks(container, connectionString, transcript);
+
+  console.log('  Running LLM evaluation...');
+  const llmEval = await runLLMEvaluation(transcript, docsMarkdown, deterministicChecks, evalModel);
+
+  return { deterministicChecks, llmEval };
+}
diff --git a/evals/docs-guides/src/neon.ts b/evals/docs-guides/src/neon.ts
new file mode 100644
index 0000000000..e209601a8a
--- /dev/null
+++ b/evals/docs-guides/src/neon.ts
@@ -0,0 +1,31 @@
+export interface NeonDatabase {
+  id: string;
+  connectionString: string;
+  directConnectionString: string;
+  expiresAt: string;
+}
+
+export async function createDatabase(): Promise<NeonDatabase> {
+  const response = await fetch('https://neon.new/api/v1/database', {
+    method: 'POST',
+    headers: { 'Content-Type': 'application/json' },
+    body: JSON.stringify({ ref: 'neon-docs-eval' }),
+  });
+
+  if (!response.ok) {
+    throw new Error(`Failed to create Neon database: ${response.status} ${await response.text()}`);
+  }
+
+  const data = await response.json();
+  const pooledUrl: string = data.connection_string;
+
+  // Derive direct URL by removing -pooler from hostname
+  const directUrl = pooledUrl.replace('-pooler.', '.');
+
+  return {
+    id: data.id,
+    connectionString: pooledUrl,
+    directConnectionString: directUrl,
+    expiresAt: data.expires_at,
+  };
+}
diff --git a/evals/docs-guides/src/reporter.ts b/evals/docs-guides/src/reporter.ts
new file mode 100644
index 0000000000..0cdc49bbd2
--- /dev/null
+++ b/evals/docs-guides/src/reporter.ts
@@ -0,0 +1,161 @@
+import { writeFileSync, mkdirSync } from 'fs';
+import type { FullEvaluation } from './evaluator.js';
+import type { WorkerResult } from './worker.js';
+import type { FileSnapshot } from './docker.js';
+import type OpenAI from 'openai';
+
+export interface GuideResult {
+  guide: string;
+  model: string;
+  evaluation: FullEvaluation;
+  worker: {
+    turnCount: number;
+    toolCallCount: number;
+    durationMs: number;
+  };
+  transcript: OpenAI.Chat.Completions.ChatCompletionMessageParam[];
+  fileSnapshots: FileSnapshot[];
+  timestamp: string;
+  source: 'remote' | 'local';
+}
+
+export function printResults(results: GuideResult[]): void {
+  console.log('\n' + '='.repeat(80));
+  console.log('EVALUATION RESULTS');
+  console.log('='.repeat(80));
+ // Summary table + console.log('\n| Guide | Score | Connection | .env | No Hardcoded Creds | Scope | Backtrack | Source |'); + console.log('|-------|-------|------------|------|-------------------|-------|-----------|--------|'); + + for (const r of results) { + const e = r.evaluation.llmEval; + const d = r.evaluation.deterministicChecks; + const dims = e.dimensions || {}; + const p = (v: boolean | undefined) => v ? 'PASS' : 'FAIL'; + console.log( + `| ${r.guide} ` + + `| **${e.overall_score}/10** ` + + `| ${p(d.agent_verified_connection)} ` + + `| ${p(d.env_file_exists)} ` + + `| ${p(d.no_hardcoded_creds)} ` + + `| ${p(dims.stayed_in_scope?.pass)} ` + + `| ${p(dims.no_backtracking?.pass)} ` + + `| ${r.source} |`, + ); + } + + // Detailed results + for (const r of results) { + const e = r.evaluation.llmEval; + const d = r.evaluation.deterministicChecks; + const dims = e.dimensions || {}; + console.log(`\n--- ${r.guide} (${r.model}) ---`); + console.log(`Score: ${e.overall_score}/10`); + console.log(`Duration: ${Math.round(r.worker.durationMs / 1000)}s | Turns: ${r.worker.turnCount} | Tool calls: ${r.worker.toolCallCount}`); + console.log(`\nDeterministic checks:`); + console.log(` Agent verified connection: ${d.agent_verified_connection ? 'PASS' : 'FAIL'}`); + console.log(` Source files created: ${d.has_source_files ? 'PASS' : 'FAIL'}`); + console.log(` .env file exists: ${d.env_file_exists ? 'PASS' : 'FAIL'}`); + console.log(` No hardcoded creds: ${d.no_hardcoded_creds ? 'PASS' : 'FAIL'}`); + if (dims.correct_packages && typeof dims.correct_packages === 'object') { + console.log(`\nDimensions:`); + for (const [k, v] of Object.entries(dims)) { + if (typeof v === 'object' && v !== null && 'pass' in v) { + const dim = v as { pass?: boolean; note?: string }; + console.log(` ${k}: ${dim?.pass ? 'PASS' : 'FAIL'}${dim?.note ? 
` — ${dim.note}` : ''}`); + } + } + } + if (e.failures?.length > 0) { + console.log(`\nFailures:`); + for (const f of e.failures) console.log(` - ${f}`); + } + console.log(`\nReasoning: ${e.reasoning}`); + } + + console.log('\n' + '='.repeat(80)); +} + +export function saveResults(results: GuideResult[]): void { + const resultsDir = new URL('../results', import.meta.url).pathname; + const historyDir = `${resultsDir}/history`; + const timestamp = new Date().toISOString().replace(/[:.]/g, '-'); + const runDir = `${historyDir}/${timestamp}`; + mkdirSync(runDir, { recursive: true }); + + // Save summary (without transcript/snapshots for readability) + const summary = results.map(r => ({ + guide: r.guide, + model: r.model, + evaluation: r.evaluation, + worker: r.worker, + timestamp: r.timestamp, + source: r.source, + })); + writeFileSync(`${resultsDir}/latest.json`, JSON.stringify(summary, null, 2)); + writeFileSync(`${runDir}/summary.json`, JSON.stringify(summary, null, 2)); + + // Save per-guide artifacts + for (const r of results) { + const guideDir = `${runDir}/${r.guide}`; + mkdirSync(guideDir, { recursive: true }); + + // Transcript as readable text + const transcriptLines: string[] = []; + for (const msg of r.transcript) { + if (msg.role === 'user' && typeof msg.content === 'string') { + if (msg.content.length > 500) { + transcriptLines.push('[USER] (provided documentation guide and task)\n'); + } else { + transcriptLines.push(`[USER] ${msg.content}\n`); + } + } else if (msg.role === 'assistant') { + const am = msg as any; + if (am.content) transcriptLines.push(`[ASSISTANT] ${am.content}\n`); + if (am.tool_calls) { + for (const tc of am.tool_calls) { + if (tc.type !== 'function') continue; + const args = JSON.parse(tc.function.arguments); + if (tc.function.name === 'bash') { + transcriptLines.push(`[TOOL bash] ${args.command}\n`); + } else if (tc.function.name === 'write_file') { + transcriptLines.push(`[TOOL write_file] ${args.path}\n${args.content}\n`); + } else if (tc.function.name === 'read_file') { + transcriptLines.push(`[TOOL read_file] ${args.path}\n`); + } + } + } + } else if (msg.role === 'tool') { + const tm = msg as any; + const content = typeof tm.content === 'string' ? tm.content : ''; + transcriptLines.push(`[TOOL RESULT] ${content.slice(0, 500)}${content.length > 500 ? '...' 
: ''}\n`);
+      }
+    }
+    writeFileSync(`${guideDir}/transcript.txt`, transcriptLines.join('\n'));
+
+    // File snapshots — write each file the agent created
+    if (r.fileSnapshots.length > 0) {
+      const snapshotDir = `${guideDir}/files`;
+      mkdirSync(snapshotDir, { recursive: true });
+      for (const snap of r.fileSnapshots) {
+        const safePath = snap.path.replace(/\//g, '__');
+        writeFileSync(`${snapshotDir}/${safePath}`, snap.content);
+      }
+      // Also write an index
+      const index = r.fileSnapshots.map(s => s.path).join('\n');
+      writeFileSync(`${guideDir}/files-index.txt`, index);
+    }
+
+    // Raw transcript JSON (for programmatic access)
+    writeFileSync(`${guideDir}/transcript.json`, JSON.stringify(r.transcript, null, 2));
+  }
+
+  console.log(`\nResults saved to:`);
+  console.log(`  results/latest.json (summary)`);
+  console.log(`  results/history/${timestamp}/ (full artifacts)`);
+  for (const r of results) {
+    console.log(`    ${r.guide}/transcript.txt`);
+    console.log(`    ${r.guide}/files/ (${r.fileSnapshots.length} files)`);
+  }
+}
diff --git a/evals/docs-guides/src/runner.ts b/evals/docs-guides/src/runner.ts
new file mode 100644
index 0000000000..2d1532e481
--- /dev/null
+++ b/evals/docs-guides/src/runner.ts
@@ -0,0 +1,191 @@
+import { readFileSync, existsSync } from 'fs';
+import { resolve } from 'path';
+
+// Load .env file if present. Note: the static imports below are hoisted and
+// evaluated before this block runs, so imported modules must read env vars at
+// call time, not at module scope.
+try {
+  const envPath = resolve(new URL('..', import.meta.url).pathname, '.env');
+  const envContent = readFileSync(envPath, 'utf-8');
+  for (const line of envContent.split('\n')) {
+    const trimmed = line.trim();
+    if (!trimmed || trimmed.startsWith('#')) continue;
+    const eqIdx = trimmed.indexOf('=');
+    if (eqIdx === -1) continue;
+    const key = trimmed.slice(0, eqIdx).trim();
+    const value = trimmed.slice(eqIdx + 1).trim();
+    if (!process.env[key]) {
+      process.env[key] = value;
+    }
+  }
+} catch {
+  // No .env file
+}
+
+import { loadGuides, parseArgs } from './config.js';
+import { startContainer, removeContainer, buildUniversalImage, captureFileSnapshots } from './docker.js';
+import { createDatabase } from './neon.js';
+import { runWorker } from './worker.js';
+import { evaluate } from './evaluator.js';
+import { printResults, saveResults, type GuideResult } from './reporter.js';
+
+async function fetchDocs(url: string): Promise<string> {
+  const response = await fetch(url);
+  if (!response.ok) {
+    throw new Error(`Failed to fetch docs from ${url}: ${response.status}`);
+  }
+  return response.text();
+}
+
+function loadLocalDocs(localDir: string, guideName: string): string {
+  const candidates = [
+    resolve(localDir, `${guideName}.md`),
+    resolve(localDir, guideName, 'index.md'),
+  ];
+
+  for (const path of candidates) {
+    if (existsSync(path)) {
+      console.log(`  Loading local file: ${path}`);
+      return readFileSync(path, 'utf-8');
+    }
+  }
+
+  throw new Error(
+    `Local docs not found for "${guideName}". Tried:\n` +
+    candidates.map(c => `  ${c}`).join('\n'),
+  );
+}
+
+function errorResult(guide: string, model: string, localDir: string | null, err: unknown): GuideResult {
+  const errMsg = err instanceof Error ?
err.message : String(err); + const failDim = { pass: false, note: 'Error during evaluation' }; + return { + guide, model, + evaluation: { + deterministicChecks: { has_source_files: false, env_file_exists: false, no_hardcoded_creds: false, agent_verified_connection: false }, + llmEval: { overall_score: 0, dimensions: { correct_packages: failDim, best_practices: failDim, stayed_in_scope: failDim, no_backtracking: failDim }, user_interventions: 0, failures: [`Harness error: ${errMsg}`], reasoning: `Evaluation failed: ${errMsg}` }, + }, + worker: { turnCount: 0, toolCallCount: 0, durationMs: 0 }, + transcript: [], fileSnapshots: [], + timestamp: new Date().toISOString(), + source: localDir ? 'local' : 'remote', + }; +} + +async function main() { + const args = parseArgs(); + const config = loadGuides(args.configPath); + + const guideNames = args.guides.length > 0 + ? args.guides + : Object.keys(config.guides); + + // For --local mode, guides don't need to be in the config. + // Any name that matches a .md file in the local dir is valid. + if (!args.localDir) { + for (const name of guideNames) { + if (!config.guides[name]) { + console.error(`Unknown guide: ${name}. Available: ${Object.keys(config.guides).join(', ')}`); + console.error('Tip: use --local /path/to/guides/ to test local files without adding them to config.'); + process.exit(1); + } + } + } + + console.log(`Running eval for: ${guideNames.join(', ')}`); + console.log(`Worker model: ${args.model}`); + console.log(`Eval model: ${args.evalModel}`); + console.log(`Timeout: ${args.timeout}min per guide`); + if (args.localDir) { + console.log(`Source: LOCAL (${args.localDir})`); + } else { + console.log(`Source: REMOTE (neon.com)`); + } + console.log(''); + + // Build universal Docker image if needed + await buildUniversalImage(); + console.log(''); + + const results: GuideResult[] = []; + + for (const guideName of guideNames) { + console.log(`\n${'='.repeat(60)}`); + console.log(`EVALUATING: ${guideName}`); + console.log(`${'='.repeat(60)}`); + + let container; + try { + // 1. Create ephemeral Neon database + console.log(' Creating ephemeral Neon database...'); + const db = await createDatabase(); + console.log(` Database created (expires: ${db.expiresAt})`); + + // 2. Get the docs content + let docsMarkdown: string; + if (args.localDir) { + docsMarkdown = loadLocalDocs(args.localDir, guideName); + } else { + const guide = config.guides[guideName]; + const docUrl = guide?.doc_url || `https://neon.com/docs/guides/${guideName}.md`; + console.log(` Fetching docs from ${docUrl}...`); + docsMarkdown = await fetchDocs(docUrl); + } + console.log(` Docs: ${docsMarkdown.length} chars`); + + // 3. Start Docker container (universal image) + const containerName = `neon-eval-${guideName}-${Date.now()}`; + console.log(` Starting container ${containerName}...`); + container = await startContainer(containerName, { + DATABASE_URL: db.connectionString, + DIRECT_URL: db.directConnectionString, + }); + console.log(` Container started: ${container.id.slice(0, 12)}`); + + // 4. Run the worker agent + console.log(` Running worker agent (model: ${args.model}, timeout: ${args.timeout}min)...`); + const workerResult = await runWorker(guideName, docsMarkdown, container, args.model, args.timeout); + console.log(` Worker complete: ${workerResult.turnCount} turns, ${workerResult.toolCallCount} tool calls, ${Math.round(workerResult.durationMs / 1000)}s`); + + // 5. 
+      // Capture file snapshots
+      console.log('  Capturing file snapshots...');
+      const fileSnapshots = await captureFileSnapshots(container);
+      console.log(`  Captured ${fileSnapshots.length} files`);
+
+      // 6. Evaluate
+      console.log('  Evaluating...');
+      const evaluation = await evaluate(container, db.connectionString, workerResult.transcript, docsMarkdown, args.evalModel);
+
+      results.push({
+        guide: guideName,
+        model: args.model,
+        evaluation,
+        worker: {
+          turnCount: workerResult.turnCount,
+          toolCallCount: workerResult.toolCallCount,
+          durationMs: workerResult.durationMs,
+        },
+        transcript: workerResult.transcript,
+        fileSnapshots,
+        timestamp: new Date().toISOString(),
+        source: args.localDir ? 'local' : 'remote',
+      });
+
+      console.log(`  Score: ${evaluation.llmEval.overall_score}/10`);
+    } catch (err) {
+      console.error(`  ERROR evaluating ${guideName}:`, err);
+      results.push(errorResult(guideName, args.model, args.localDir, err));
+    } finally {
+      if (container) {
+        console.log('  Removing container...');
+        await removeContainer(container);
+      }
+    }
+  }
+
+  printResults(results);
+  saveResults(results);
+}
+
+main().catch((err) => {
+  console.error('Fatal error:', err);
+  process.exit(1);
+});
diff --git a/evals/docs-guides/src/worker.ts b/evals/docs-guides/src/worker.ts
new file mode 100644
index 0000000000..efa2847232
--- /dev/null
+++ b/evals/docs-guides/src/worker.ts
@@ -0,0 +1,212 @@
+import OpenAI from 'openai';
+import { Container, dockerExec, dockerWriteFile, dockerReadFile } from './docker.js';
+import { getClient } from './client.js';
+
+const tools: OpenAI.Chat.Completions.ChatCompletionTool[] = [
+  {
+    type: 'function',
+    function: {
+      name: 'bash',
+      description: 'Run a bash command in the working directory. Returns stdout and stderr.',
+      parameters: {
+        type: 'object',
+        properties: {
+          command: { type: 'string', description: 'The bash command to execute' },
+        },
+        required: ['command'],
+      },
+    },
+  },
+  {
+    type: 'function',
+    function: {
+      name: 'write_file',
+      description: 'Write content to a file. Creates parent directories if needed.',
+      parameters: {
+        type: 'object',
+        properties: {
+          path: { type: 'string', description: 'The file path to write to' },
+          content: { type: 'string', description: 'The content to write' },
+        },
+        required: ['path', 'content'],
+      },
+    },
+  },
+  {
+    type: 'function',
+    function: {
+      name: 'read_file',
+      description: 'Read the contents of a file.',
+      parameters: {
+        type: 'object',
+        properties: {
+          path: { type: 'string', description: 'The file path to read' },
+        },
+        required: ['path'],
+      },
+    },
+  },
+];
+
+interface ToolInput {
+  command?: string;
+  path?: string;
+  content?: string;
+}
+
+async function executeTool(
+  name: string,
+  input: ToolInput,
+  container: Container,
+): Promise<string> {
+  switch (name) {
+    case 'bash': {
+      const result = await dockerExec(container, input.command!);
+      let output = '';
+      if (result.stdout) output += result.stdout;
+      if (result.stderr) output += (output ? '\n' : '') + result.stderr;
+      if (result.exitCode !== 0) output += `\nExit code: ${result.exitCode}`;
+      return output || '(no output)';
+    }
+    case 'write_file':
+      return dockerWriteFile(container, input.path!, input.content!);
+    case 'read_file':
+      return dockerReadFile(container, input.path!);
+    default:
+      return `Unknown tool: ${name}`;
+  }
+}
+
+// Throttle: wait between API calls to stay under rate limits.
+// EVAL_THROTTLE_MS is read inside throttle() rather than at module scope,
+// because runner.ts loads .env at runtime, after hoisted imports (including
+// this module) have already been evaluated.
+let lastCallTime = 0;
+
+async function throttle(): Promise<void> {
+  const throttleMs = parseInt(process.env.EVAL_THROTTLE_MS || '2000', 10);
+  const elapsed = Date.now() - lastCallTime;
+  if (elapsed < throttleMs) {
+    await new Promise(r => setTimeout(r, throttleMs - elapsed));
+  }
+  lastCallTime = Date.now();
+}
+
+async function callWithRetry(
+  client: OpenAI,
+  params: OpenAI.Chat.Completions.ChatCompletionCreateParamsNonStreaming,
+  maxRetries = 5,
+): Promise<OpenAI.Chat.Completions.ChatCompletion> {
+  for (let attempt = 0; attempt <= maxRetries; attempt++) {
+    try {
+      await throttle();
+      return await client.chat.completions.create(params);
+    } catch (err: any) {
+      if (err?.status === 429 && attempt < maxRetries) {
+        const waitSec = Math.min(30 + attempt * 30, 120); // 30s, 60s, 90s, 120s, 120s
+        console.log(`  Rate limited (429). Waiting ${waitSec}s before retry ${attempt + 1}/${maxRetries}...`);
+        await new Promise(r => setTimeout(r, waitSec * 1000));
+        continue;
+      }
+      throw err;
+    }
+  }
+  throw new Error('Unreachable');
+}
+
+export interface WorkerResult {
+  transcript: OpenAI.Chat.Completions.ChatCompletionMessageParam[];
+  turnCount: number;
+  toolCallCount: number;
+  durationMs: number;
+}
+
+export async function runWorker(
+  guideName: string,
+  docsMarkdown: string,
+  container: Container,
+  model: string,
+  timeoutMinutes: number,
+): Promise<WorkerResult> {
+  const client = getClient(model);
+  const startTime = Date.now();
+  const timeoutMs = timeoutMinutes * 60 * 1000;
+
+  const prompt = [
+    `Here is a documentation guide:`,
+    '',
+    docsMarkdown,
+    '',
+    `Your task: Follow this documentation to create a working application connected to Neon Postgres.`,
+    '',
+    'Environment:',
+    '- You are in an empty /app directory on a Debian Linux system',
+    '- Node.js 22 and Python 3 are pre-installed',
+    '- You have root access and can install any other runtimes or tools with apt-get (e.g., golang, rustc, elixir, ruby, php, dotnet, openjdk, maven, etc.)',
+    '- The DATABASE_URL environment variable is set with a valid Neon Postgres connection string',
+    '- If you need a direct (non-pooled) connection string, derive it from DATABASE_URL by removing "-pooler" from the hostname',
+    '',
+    'Requirements:',
+    '- Follow the guide closely. Do not add features beyond what the guide demonstrates.',
+    '- Prioritize storing credentials in a .env file, but if the guide uses a different method such as config.exs for Elixir, appsettings.json for C#, etc., use that method as shown in the guide.',
+    '- After setup is complete, verify the connection works by running the application.',
+    '- If the app is a web server, start it and confirm it responds. If it is a script, run it and confirm the output shows a successful Neon connection.',
+  ].join('\n');
+
+  const messages: OpenAI.Chat.Completions.ChatCompletionMessageParam[] = [
+    { role: 'user', content: prompt },
+  ];
+
+  let turnCount = 0;
+  let toolCallCount = 0;
+  const maxTurns = 60;
+
+  while (turnCount < maxTurns) {
+    if (Date.now() - startTime > timeoutMs) {
+      console.log(`  Timeout reached (${timeoutMinutes}min)`);
+      break;
+    }
+
+    turnCount++;
+    const response = await callWithRetry(client, {
+      model,
+      max_tokens: 4096,
+      tools,
+      messages,
+    });
+
+    const choice = response.choices?.[0];
+    if (!choice) {
+      console.log('  Unexpected response (no choices)');
+      break;
+    }
+
+    const assistantMsg = choice.message;
+    messages.push(assistantMsg);
+
+    if (assistantMsg.content?.trim()) {
+      console.log(`  Agent: ${assistantMsg.content.slice(0, 120)}${assistantMsg.content.length > 120 ? '...' : ''}`);
+    }
+
+    if (choice.finish_reason !== 'tool_calls') break;
+
+    const toolCalls = assistantMsg.tool_calls || [];
+    for (const toolCall of toolCalls) {
+      if (toolCall.type !== 'function') continue;
+      toolCallCount++;
+      const name = toolCall.function.name;
+      const input: ToolInput = JSON.parse(toolCall.function.arguments);
+      console.log(`  Tool [${name}]: ${name === 'bash' ? input.command?.slice(0, 80) : input.path}`);
+      const result = await executeTool(name, input, container);
+      messages.push({
+        role: 'tool',
+        tool_call_id: toolCall.id,
+        content: result,
+      });
+    }
+  }
+
+  return {
+    transcript: messages,
+    turnCount,
+    toolCallCount,
+    durationMs: Date.now() - startTime,
+  };
+}
diff --git a/evals/docs-guides/tsconfig.json b/evals/docs-guides/tsconfig.json
new file mode 100644
index 0000000000..1d6524b726
--- /dev/null
+++ b/evals/docs-guides/tsconfig.json
@@ -0,0 +1,13 @@
+{
+  "compilerOptions": {
+    "target": "ES2022",
+    "module": "ESNext",
+    "moduleResolution": "bundler",
+    "esModuleInterop": true,
+    "strict": true,
+    "outDir": "dist",
+    "rootDir": "src",
+    "skipLibCheck": true
+  },
+  "include": ["src"]
+}