From 8626af81ddef62a69caeab6a5225011385d8a683 Mon Sep 17 00:00:00 2001
From: Alex Qiu <alexander@browserbase.com>
Date: Thu, 14 May 2026 14:22:10 -0700
Subject: [PATCH 1/2] =?UTF-8?q?feat(autobrowse):=20iterative=20Playwright?=
 =?UTF-8?q?=20loop=20=E2=80=94=20explorer=20+=20deterministic=20verify=20c?=
 =?UTF-8?q?onverge=20together?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Until now the explorer (evaluate.mjs) and the Playwright emitter (export.mjs)
were two disconnected stages: explorer converged on "the LLM can finish the
task," then export was a one-shot translation. The two objective functions
diverge — what unblocks the LLM agent doesn't always unblock a deterministic
replay. Demoing this against bizfile.sos.ca.gov surfaced 7+ classes of
mismatch (styled-label overlays, autocomplete keystroke interception,
transiently-disabled selects) that each cost a hand-fix in the emitted
script.

This PR unifies the loop:

  Each iteration of `scripts/loop.mjs`:
    1. evaluate.mjs  → produces trace.json + summary.md
    2. If trace passed, export.mjs --no-verify → emits Playwright script
    3. npx tsx <task>.ts → actual deterministic replay
    4. On Playwright fail, distill-failure.mjs summarizes the error via
       Claude Haiku into strategy.md's "Recent Playwright Failures" section
    5. Next iteration's evaluate reads the updated strategy.md and adapts
  Convergence: Playwright passes 2 of last 3 iterations → graduate.

`strategy.md` is the shared intelligence layer between the LLM explorer and
the codegen. Three sections (documented in SKILL.md):
  - Navigation Heuristics  (LLM-facing)
  - Codegen Hints         (emitter-facing, per-task overrides)
  - Recent Playwright Failures  (auto-appended by distill-failure)

Also lifts the lessons from the bizfile demo into codegen defaults so future
tasks don't repeat the same hand-fixes:

  - forceCheck       : .check({ force: true }) for checkbox fill_sel ops
  - forceClickRadio  : .first().click({ force: true }) for radio click ops
                       (detected by selector pattern OR resolved node role)
  - selectWithFallback: .selectOption() with a JS-enable + native-setter
                       fallback when the <select> is transiently disabled
  - reactFill        : helper for inputs where simulated keystrokes get
                       intercepted by autosuggest/autocomplete handlers
  - clickButtonByText: eval-find-by-text in page context, avoids the
                       cross-step getByRole race on SPA wizards

Plus: select_dropdown ops with ref-shaped selectors (e.g. `[0-2005]`) now
route through the snapshot resolver instead of leaking as invalid CSS.

Files in this PR:
  scripts/loop.mjs                  NEW — top-level orchestrator
  scripts/export.mjs                NEW — trace → Playwright codegen
  scripts/lib/pick-run.mjs          NEW — newest-passing-run selector
  scripts/lib/parse-task.mjs        NEW — task.md → Zod schema
  scripts/lib/command-mapping.mjs   NEW — browse trace → target-agnostic ops
  scripts/lib/selector-resolver.mjs NEW — snapshot+ref → Playwright locators
  scripts/lib/codegen-playwright.mjs NEW — ops → TS with helpers baked in
  scripts/lib/verify.mjs            NEW — npm install + tsx run + JSON parse
  scripts/lib/distill-failure.mjs   NEW — Playwright stderr → strategy.md addendum
  scripts/evaluate.mjs              MODIFIED — BROWSERBASE_CONTEXT_ID
                                    passthrough + --max-turns flag
  SKILL.md                          MODIFIED — documents export, loop,
                                    sectioned strategy.md, and the
                                    helper defaults baked into codegen

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 skills/autobrowse/SKILL.md                    |  93 +++
 skills/autobrowse/scripts/evaluate.mjs        |  88 ++-
 skills/autobrowse/scripts/export.mjs          | 203 ++++++
 .../scripts/lib/codegen-playwright.mjs        | 619 ++++++++++++++++++
 .../scripts/lib/command-mapping.mjs           | 214 ++++++
 .../scripts/lib/distill-failure.mjs           | 124 ++++
 skills/autobrowse/scripts/lib/parse-task.mjs  |  96 +++
 skills/autobrowse/scripts/lib/pick-run.mjs    |  47 ++
 .../scripts/lib/selector-resolver.mjs         | 274 ++++++++
 skills/autobrowse/scripts/lib/verify.mjs      |  54 ++
 skills/autobrowse/scripts/loop.mjs            | 302 +++++++++
 11 files changed, 2108 insertions(+), 6 deletions(-)
 create mode 100755 skills/autobrowse/scripts/export.mjs
 create mode 100644 skills/autobrowse/scripts/lib/codegen-playwright.mjs
 create mode 100644 skills/autobrowse/scripts/lib/command-mapping.mjs
 create mode 100644 skills/autobrowse/scripts/lib/distill-failure.mjs
 create mode 100644 skills/autobrowse/scripts/lib/parse-task.mjs
 create mode 100644 skills/autobrowse/scripts/lib/pick-run.mjs
 create mode 100644 skills/autobrowse/scripts/lib/selector-resolver.mjs
 create mode 100644 skills/autobrowse/scripts/lib/verify.mjs
 create mode 100755 skills/autobrowse/scripts/loop.mjs
diff --git a/skills/autobrowse/SKILL.md b/skills/autobrowse/SKILL.md
index 39d4f449..d9b0bc7c 100644
--- a/skills/autobrowse/SKILL.md
+++ b/skills/autobrowse/SKILL.md
@@ -272,6 +272,99 @@ Write the file `./autobrowse/reports/YYYY-MM-DD-HH-MM-<tasks>.md` with:
 
 ---
 
+## Export to deterministic Playwright
+
+Once a task has graduated, you can collapse the LLM-driven replay loop into a single deterministic TypeScript script with the `export.mjs` script. The export mines the most recent passing run's `trace.json`, resolves session-scoped ARIA refs against the snapshots they came from, and emits a Playwright script that connects to a fresh Browserbase session (optionally bound to a persistent context).
+
+```bash
+# Default — generate and verify against the latest passing run
+node ${CLAUDE_SKILL_DIR}/scripts/export.mjs --task <task-name>
+
+# Custom workspace / specific run / skip verification
+node ${CLAUDE_SKILL_DIR}/scripts/export.mjs --task <task-name> --workspace ./autobrowse
+node ${CLAUDE_SKILL_DIR}/scripts/export.mjs --task <task-name> --run run-022
+node ${CLAUDE_SKILL_DIR}/scripts/export.mjs --task <task-name> --no-verify
+```
+
+The export writes to `<workspace>/tasks/<task>/playwright/`:
+
+- `<task>.ts` — runnable Playwright script. Connects to Browserbase via `chromium.connectOverCDP` when `BROWSERBASE_CONTEXT_ID` is set; falls back to local Chromium otherwise.
+- `selectors.cache.json` — resolved locators + ranked fallbacks per action. Used by future self-healing tooling.
+- `package.json`, `tsconfig.json` — minimal scaffold with `playwright`, `zod`, `tsx`, `dotenv`.
+
+How refs are resolved: every `[X-Y]` ref in the trace is looked up against the most recent prior `browse snapshot` containing it. The matched node's role + accessible name are turned into a ranked list of Playwright locator candidates — `getByRole({ name })` first, then `getByLabel` / `getByPlaceholder` for form inputs, then `getByText`, then bare `getByRole`. The best candidate is emitted inline; lower-ranked candidates are saved to `selectors.cache.json` for self-healing.
+
+The final extract step is generated with one Claude Haiku call at export time (uses `ANTHROPIC_API_KEY`). The LLM is given the final snapshot, the Zod schema parsed from `task.md`'s `## Output` block, and the agent's final reasoning. If the API key is missing, the export still produces a script, but the extract block is emitted as a TODO placeholder.
+
+For a Stagehand-targeted export (LLM-driven replay via `stagehand.act`/`observe`), use the standalone `/stagehand-export` skill.
+
+## Iterative Playwright loop (recommended for tasks that need a deterministic artifact)
+
+When the end goal is a runnable Playwright script (cron, Browserbase Functions, etc.), prefer `loop.mjs` over manually orchestrating evaluate + export. The loop converges on a workflow that **both** the LLM explorer **and** the deterministic Playwright replay can complete — which is a strictly stronger guarantee than "the LLM agent's trace ends with success: true."
+
+```bash
+node ${CLAUDE_SKILL_DIR}/scripts/loop.mjs --task <task-name> --env remote \
+  --max-iterations 8 --max-turns-per-iter 60
+```
+
+What it does per iteration:
+
+1. Runs `evaluate.mjs` (one LLM-driven exploration round).
+2. If the trace passed (`success: true` in the final JSON), runs `export.mjs --target playwright --no-verify` to emit a fresh script.
+3. Runs the emitted script (`npx tsx <task>.ts`) against a new BB session — the actual deterministic replay.
+4. If the Playwright replay passed → records a pass. If it failed → distills the failure (Claude Haiku, ~$0.01) into a new entry under `strategy.md`'s "Recent Playwright Failures" section.
+5. Next iteration's evaluate reads the updated strategy.md and adapts.
+
+**Convergence**: graduates when the emitted script passes in 2 of the last 3 iterations.
+
+### Strategy.md sections
+
+The loop expects (and the distiller maintains) this structure:
+
+```markdown
+# <task> Navigation Strategy
+
+## Navigation Heuristics
+(prose for the LLM explorer — fast-path URLs, timing notes, step sequences)
+
+## Codegen Hints
+(per-task overrides for the Playwright emitter — e.g., "use force:true for all radios on this site")
+
+## Recent Playwright Failures
+### Iteration 3 — <one-line>
+- **What failed**: ...
+- **Likely cause**: ...
+- **Fix to try next iteration**: ...
+```
+
+The emitter (`codegen-playwright.mjs`) bakes in baseline defaults for the most common state-portal patterns: `forceCheck` for checkbox `fill_sel` ops, `forceClickRadio` for radio click ops, `selectWithFallback` (JS-enable + native setter) for every `select_dropdown`, and a `reactFill` helper for inputs that need to bypass keystroke-by-keystroke event handling.
+
+### When to use `loop.mjs` vs `evaluate.mjs` directly
+
+- **Use `loop.mjs`** when you want a Playwright script as the deliverable. Costs more per iteration (each adds a script export + replay) but converges on something that actually replays in prod.
+- **Use `evaluate.mjs`** when you want a `/<task>` skill that future Claude sessions invoke (the original autobrowse flow). Cheaper, doesn't generate a Playwright script.
+
+---
+
+### Pre-authed sessions via persistent context
+
+For tasks that need authentication, create a Browserbase context once, log in interactively, and point autobrowse at it via the env var:
+
+```bash
+# One-time: create a context, log into the target site via live-view
+bb contexts create --project-id $BROWSERBASE_PROJECT_ID --json
+
+# Then, every autobrowse run for this task reuses the cached cookies/storage
+export BROWSERBASE_CONTEXT_ID=<id-from-above>
+node ${CLAUDE_SKILL_DIR}/scripts/evaluate.mjs --task <name> --env remote
+```
+
+When `BROWSERBASE_CONTEXT_ID` is set with `--env remote`, evaluate.mjs creates one BB session bound to that context before the agent loop, transparently injects `--connect <session-id>` into every browse command the agent issues that does not already carry `--connect`, and releases the session at exit. The agent's `browse env` / `browse stop` / `browse status` / `browse start` calls become no-ops in this mode (each returns a `[managed] no-op` notice instead of running), so iterations skip the per-run login dance.
+
+The same env var, when set at runtime for the exported script, makes it attach to the same persisted context.
+
+---
+
 ## Rules
 
 - **Only edit `strategy.md`** — never touch `task.md` (unless creating it from the template) or `evaluate.mjs`
diff --git a/skills/autobrowse/scripts/evaluate.mjs b/skills/autobrowse/scripts/evaluate.mjs
index bf427580..f3ba6665 100644
--- a/skills/autobrowse/scripts/evaluate.mjs
+++ b/skills/autobrowse/scripts/evaluate.mjs
@@ -23,7 +23,7 @@ const SKILL_DIR = path.resolve(__dirname, "..");
 // ── Config ─────────────────────────────────────────────────────────
 
 const DEFAULT_MODEL = "claude-sonnet-4-6";
-const MAX_TURNS = 30;
+const DEFAULT_MAX_TURNS = 30;
 const MAX_TOKENS = 4096;
 const EXEC_TIMEOUT_MS = 30_000;
 
@@ -161,6 +161,50 @@ function getNextRunNumber(tracesDir) {
 
 const ALLOWED_COMMAND = "browse";
 
+// ── Managed Browserbase session (BROWSERBASE_CONTEXT_ID passthrough) ──
+//
+// When BROWSERBASE_CONTEXT_ID is set and --env remote, autobrowse creates
+// one persistent BB session before the agent loop and rewrites every
+// browse command from the agent to attach to it. This lets the agent
+// inherit a pre-authed state (cookies/storage) without having to log in
+// during every training iteration. Set via env var (not a CLI flag) so
+// callers can run autobrowse for the same task with or without the
+// context attached.
+let MANAGED_SESSION_ID = null;
+
+function preCreateBrowserbaseSession(ctxId) { // Runs `bb sessions create` bound to ctxId and returns the new session id; exits the process with code 1 on any failure.
+  console.error(`[autobrowse] BROWSERBASE_CONTEXT_ID detected — creating BB session bound to context ${ctxId.slice(0, 8)}…`);
+  try {
+    const stdout = execFileSync(
+      "bb",
+      ["sessions", "create", "--context-id", ctxId, "--persist", "--advanced-stealth", "--solve-captchas"],
+      { encoding: "utf-8" },
+    );
+    const session = JSON.parse(stdout); // the CLI's stdout is parsed as a single JSON document
+    if (!session.id) throw new Error(`bb output missing id: ${stdout.slice(0, 200)}`); // thrown inside the try, so the catch below reports it
+    console.error(`[autobrowse] Created managed session ${session.id} (context ${ctxId.slice(0, 8)}…)`);
+    return session.id;
+  } catch (err) { // any error from the bb call, the JSON parse, or the missing-id check lands here
+    console.error(`[autobrowse] FATAL: could not create BB session — ${err.message || err}`);
+    process.exit(1);
+  }
+}
+
+function releaseManagedSession() { // Asks Browserbase to release the managed session; does nothing when no session is active.
+  if (!MANAGED_SESSION_ID) return;
+  const sid = MANAGED_SESSION_ID;
+  MANAGED_SESSION_ID = null; // guard against re-entry from signal handlers
+  try {
+    execFileSync("bb", ["sessions", "update", sid, "--status", "REQUEST_RELEASE"], { stdio: "ignore" });
+    console.error(`[autobrowse] Released managed session ${sid}`);
+  } catch (err) { // best-effort: a failed release is logged as a warning and never rethrown
+    console.error(`[autobrowse] Warning: failed to release session ${sid}: ${err.message || err}`);
+  }
+}
+process.on("exit", releaseManagedSession); // also fires after the process.exit() calls below; the null guard makes that second call a no-op
+process.on("SIGINT", () => { releaseManagedSession(); process.exit(130); }); // release first, then exit with status 130
+process.on("SIGTERM", () => { releaseManagedSession(); process.exit(143); }); // release first, then exit with status 143
+
 function parseCommand(command) {
   const args = [];
   let current = "";
@@ -243,6 +287,23 @@ function parseCommand(command) {
 }
 
 function executeCommand(command) {
+  // If a managed BB session is active, rewrite browse commands to attach to
+  // it. Session-lifecycle commands become no-ops so the agent's prompt-baked
+  // `browse env remote` / `browse stop` muscle memory doesn't fight us.
+  if (MANAGED_SESSION_ID) {
+    const trimmed = command.trim();
+    if (/^browse\s+(env|stop|status|start)(\s|$)/.test(trimmed)) {
+      return {
+        output: `[managed] no-op — session ${MANAGED_SESSION_ID.slice(0, 8)}… is pre-attached`,
+        error: false,
+        duration_ms: 0,
+      };
+    }
+    if (/^browse\s+/.test(trimmed) && !trimmed.includes("--connect")) {
+      command = trimmed.replace(/^browse\s+/, `browse --connect ${MANAGED_SESSION_ID} `);
+    }
+  }
+
   // Security: only allow the browse CLI and execute it without a shell so
   // metacharacters are treated as literal arguments instead of extra commands.
   const parsed = parseCommand(command);
@@ -272,15 +333,17 @@ function executeCommand(command) {
   }
 }
 
-function buildSystemPrompt(strategy, traceDir, browseEnv) {
-  const envDesc = browseEnv === "remote"
-    ? `Use **remote mode** (Browserbase) — anti-bot stealth, CAPTCHA solving, residential proxies:
+function buildSystemPrompt(strategy, traceDir, browseEnv, managedSessionId) {
+  const envDesc = managedSessionId
+    ? `The browser is **pre-attached** to a managed Browserbase session (id starting ${managedSessionId.slice(0, 8)}…) with a persistent context — cookies/storage from prior sessions are loaded. **Skip session lifecycle commands** (\`browse env\`, \`browse stop\`, \`browse status\`) — they are silently no-ops. Begin with \`browse open <url>\`.`
+    : browseEnv === "remote"
+      ? `Use **remote mode** (Browserbase) — anti-bot stealth, CAPTCHA solving, residential proxies:
 \`\`\`
 browse stop
 browse env remote
 \`\`\`
 Always run \`browse stop\` first to kill any existing local session before switching to remote.`
-    : `Use **local mode** — runs on local Chrome:
+      : `Use **local mode** — runs on local Chrome:
 \`\`\`
 browse env local
 \`\`\``;
@@ -391,6 +454,19 @@ async function main() {
   }
 
   const browseEnv = getArg("env", "local");
+  const maxTurnsArg = getArg("max-turns");
+  const MAX_TURNS = maxTurnsArg ? parseInt(maxTurnsArg, 10) : DEFAULT_MAX_TURNS;
+  if (!Number.isFinite(MAX_TURNS) || MAX_TURNS < 1) {
+    console.error(`ERROR: --max-turns must be a positive integer; got "${maxTurnsArg}".`);
+    process.exit(1);
+  }
+
+  // Pre-create a managed Browserbase session if BROWSERBASE_CONTEXT_ID is set.
+  // Falls back to the agent driving session setup itself when unset.
+  if (process.env.BROWSERBASE_CONTEXT_ID && browseEnv === "remote") {
+    MANAGED_SESSION_ID = preCreateBrowserbaseSession(process.env.BROWSERBASE_CONTEXT_ID);
+  }
+
   const client = new Anthropic();
   const runNumber = getNextRunNumber(tracesDir);
   const runId = `run-${String(runNumber).padStart(3, "0")}`;
@@ -400,7 +476,7 @@ async function main() {
 
   const strategy = fs.readFileSync(strategyFile, "utf-8");
   const task = fs.readFileSync(taskFile, "utf-8");
-  const systemPrompt = buildSystemPrompt(strategy, traceDir, browseEnv);
+  const systemPrompt = buildSystemPrompt(strategy, traceDir, browseEnv, MANAGED_SESSION_ID);
 
   console.error(`\n${"=".repeat(60)}`);
   console.error(`  AUTOBROWSE — ${taskName} — Run ${runNumber}`);
diff --git a/skills/autobrowse/scripts/export.mjs b/skills/autobrowse/scripts/export.mjs
new file mode 100755
index 00000000..e6c2fb0e
--- /dev/null
+++ b/skills/autobrowse/scripts/export.mjs
@@ -0,0 +1,203 @@
+#!/usr/bin/env node
+
+/**
+ * export.mjs — Translate a graduated autobrowse task into a deterministic
+ * runnable script.
+ *
+ * Currently supports --target playwright. The Stagehand variant lives in
+ * the standalone /stagehand-export skill; once Playwright is shipped and
+ * proven we can fold both targets behind this CLI.
+ *
+ * Usage:
+ *   node scripts/export.mjs --task <name> --target playwright \
+ *        [--workspace ./autobrowse] [--run run-NNN] \
+ *        [--output <dir>] [--no-verify]
+ */
+
+import "dotenv/config";
+import * as fs from "node:fs";
+import * as path from "node:path";
+
+import { pickRun, listRuns } from "./lib/pick-run.mjs";
+import { taskToSchema, parseStrategySections } from "./lib/parse-task.mjs";
+import { walkTrace } from "./lib/command-mapping.mjs";
+import {
+  generatePlaywrightScript,
+  playwrightPackageJson,
+  playwrightTsconfig,
+} from "./lib/codegen-playwright.mjs";
+import { verifyGenerated } from "./lib/verify.mjs";
+
+// ── CLI args ───────────────────────────────────────────────────────
+function getArg(name, fallback) { // value following `--name`, or `fallback` when there is none
+  const i = process.argv.indexOf(`--${name}`);
+  const value = i === -1 ? undefined : process.argv[i + 1];
+  return value && !value.startsWith("--") ? value : fallback; // a following `--flag` is the next option, not a value
+}
+const hasFlag = (n) => process.argv.includes(`--${n}`); // true when `--<n>` appears anywhere in argv
+
+if (hasFlag("help") || hasFlag("h") || process.argv.includes("-h")) { // accepts --help, the legacy --h, and the conventional -h
+  console.log(`autobrowse export — generate deterministic replay scripts from autobrowse traces
+
+Usage: node scripts/export.mjs --task <name> [options]
+
+Options:
+  --task <name>          Task name — matches tasks/<name>/ (required)
+  --target <kind>        playwright (default; stagehand lives in /stagehand-export)
+  --workspace <dir>      Workspace root holding tasks/ and traces/ (default: ./autobrowse)
+  --run <id>             Force a specific run (default: newest passing)
+  --output <dir>         Output directory for generated files (default: <workspace>/tasks/<name>/<target>)
+  --no-verify            Skip the npm install + tsx run verification step
+
+Env:
+  ANTHROPIC_API_KEY      Used for LLM-generated extract block. If unset, a TODO placeholder is emitted.
+  BROWSERBASE_*          Pass through to the generated script at runtime.
+
+Exit codes: 0 generated (and verified, unless --no-verify), 2 generated but verify failed, 1 generator error.`);
+  process.exit(0); // help was requested explicitly, so this is a successful exit
+}
+
+const TASK = getArg("task");
+const TARGET = getArg("target", "playwright");
+const WORKSPACE = path.resolve(getArg("workspace", "autobrowse"));
+const FORCED_RUN = getArg("run");
+const VERIFY = !hasFlag("no-verify");
+const OUTPUT = getArg("output");
+
+if (!TASK) {
+  console.error("ERROR: --task <name> is required");
+  console.error("Run with --help for usage.");
+  process.exit(1);
+}
+if (TARGET !== "playwright") {
+  console.error(`ERROR: --target=${TARGET} not yet supported here. Use the /stagehand-export skill for Stagehand output.`);
+  process.exit(1);
+}
+
+// ── Locate sources ────────────────────────────────────────────────
+
+const taskDir = path.join(WORKSPACE, "tasks", TASK);
+const tracesDir = path.join(WORKSPACE, "traces", TASK);
+const outDir = OUTPUT ? path.resolve(OUTPUT) : path.join(taskDir, TARGET);
+
+const taskFile = path.join(taskDir, "task.md");
+const strategyFile = path.join(taskDir, "strategy.md");
+
+for (const [label, file] of [["task.md", taskFile], ["strategy.md", strategyFile]]) {
+  if (!fs.existsSync(file)) {
+    console.error(`ERROR: ${label} not found at ${file} — run autobrowse first.`);
+    process.exit(1);
+  }
+}
+if (!fs.existsSync(tracesDir)) {
+  console.error(`ERROR: no traces at ${tracesDir} — run autobrowse first.`);
+  process.exit(1);
+}
+
+const runId = pickRun(tracesDir, FORCED_RUN);
+if (!runId) {
+  console.error(`ERROR: no passing runs found in ${tracesDir}.`);
+  console.error("Graduate the task with autobrowse first, or pass --run <id> to force.");
+  console.error("Available runs:", listRuns(tracesDir).join(", ") || "(none)");
+  process.exit(1);
+}
+
+const runDir = path.join(tracesDir, runId);
+const tracePath = path.join(runDir, "trace.json");
+if (!fs.existsSync(tracePath)) {
+  console.error(`ERROR: trace.json missing at ${tracePath}`);
+  process.exit(1);
+}
+
+console.error(`[export] task=${TASK} target=${TARGET} run=${runId} workspace=${WORKSPACE}`);
+
+const trace = JSON.parse(fs.readFileSync(tracePath, "utf-8"));
+const taskMd = fs.readFileSync(taskFile, "utf-8");
+const strategyMd = fs.readFileSync(strategyFile, "utf-8");
+
+// ── Schema + sections ──────────────────────────────────────────────
+
+const { outputShape, zodSchema, schemaFieldCount } = taskToSchema(taskMd);
+const sections = parseStrategySections(strategyMd);
+const ops = walkTrace(trace, sections);
+
+// Find the agent's final natural-language summary (for LLM extract grounding).
+let finalReasoning = "";
+for (let i = trace.length - 1; i >= 0; i--) {
+  if (trace[i].role === "assistant" && trace[i].reasoning) {
+    finalReasoning = trace[i].reasoning;
+    break;
+  }
+}
+
+// ── Generate Playwright script ─────────────────────────────────────
+
+const { scriptCode, cachedActions, stats, extract } = await generatePlaywrightScript({
+  task: TASK,
+  runId,
+  workspace: WORKSPACE,
+  trace,
+  ops,
+  zodSchema,
+  outputShape,
+  taskMd,
+  finalReasoning,
+});
+
+// ── Write outputs ──────────────────────────────────────────────────
+
+fs.mkdirSync(outDir, { recursive: true });
+const scriptPath = path.join(outDir, `${TASK}.ts`);
+const cachePath = path.join(outDir, "selectors.cache.json");
+const pkgPath = path.join(outDir, "package.json");
+const tsconfigPath = path.join(outDir, "tsconfig.json");
+
+fs.writeFileSync(scriptPath, scriptCode);
+fs.writeFileSync(
+  cachePath,
+  JSON.stringify(
+    {
+      task: TASK,
+      target: TARGET,
+      generated_from: { workspace: WORKSPACE, run: runId },
+      stats,
+      extract,
+      actions: cachedActions,
+    },
+    null,
+    2,
+  ),
+);
+if (!fs.existsSync(pkgPath)) {
+  fs.writeFileSync(pkgPath, JSON.stringify(playwrightPackageJson(TASK), null, 2));
+}
+if (!fs.existsSync(tsconfigPath)) {
+  fs.writeFileSync(tsconfigPath, JSON.stringify(playwrightTsconfig(), null, 2));
+}
+
+console.error(`[export] wrote ${path.relative(process.cwd(), scriptPath)}`);
+console.error(`[export] ops: ${ops.length} | cached: ${stats.cached} | ref_resolved: ${stats.ref_resolved} | ref_failed: ${stats.ref_failed} | dropped: ${stats.dropped}`);
+console.error(`[export] schema fields: ${schemaFieldCount} | extract: ${extract.generated ? "LLM-generated" : `fallback (${extract.reason})`}`);
+
+// ── Verify ─────────────────────────────────────────────────────────
+
+const baseReport = { // fields shared by the verified and unverified JSON reports printed on stdout
+  task: TASK,
+  target: TARGET,
+  run: runId,
+  script: scriptPath,
+  cache: cachePath,
+  stats,
+  schema_fields: schemaFieldCount,
+  extract: { generated: extract.generated, reason: extract.reason },
+};
+
+if (!VERIFY) { // --no-verify: report generation only and stop before running the script
+  console.log(JSON.stringify({ ...baseReport, verified: false }, null, 2));
+  process.exit(0); // a generated-but-unverified export exits 0
+}
+
+const v = verifyGenerated(outDir, `${TASK}.ts`); // runs the emitted script from outDir; result carries passed / exit_code / run_log / output
+const report = { ...baseReport, verified: true, passed: v.passed, exit_code: v.exit_code, run_log: v.run_log, output: v.output };
+console.log(JSON.stringify(report, null, 2));
+process.exit(v.passed ? 0 : 2); // 0 when verification passed, 2 when the script was generated but verification failed
diff --git a/skills/autobrowse/scripts/lib/codegen-playwright.mjs b/skills/autobrowse/scripts/lib/codegen-playwright.mjs
new file mode 100644
index 00000000..85dd47f3
--- /dev/null
+++ b/skills/autobrowse/scripts/lib/codegen-playwright.mjs
@@ -0,0 +1,619 @@
+// codegen-playwright.mjs — ops[] + snapshots[] → runnable Playwright TS.
+//
+// Connects to a Browserbase session bound to BROWSERBASE_CONTEXT_ID at
+// runtime (or falls back to chromium.launch for local dev). Replays the
+// mined trace using resolved Playwright locators and ends with an
+// LLM-generated extract block that pulls the final result JSON.
+
+import Anthropic from "@anthropic-ai/sdk";
+import {
+  resolveOpRef,
+  renderLocator,
+  collectSnapshots,
+} from "./selector-resolver.mjs";
+
+// ── Op → Playwright code ──────────────────────────────────────────
+
+function jsStr(s) {
+  return JSON.stringify(String(s ?? ""));
+}
+
+// Cheap classifier: does a CSS selector target an <input type="checkbox">?
+function isCheckboxSelector(s) {
+  if (!s) return false;
+  // Tag prefix, whitespace and quotes are all optional: `[type="checkbox"]` counts too.
+  return /\[\s*type\s*=\s*['"]?checkbox['"]?\s*\]/i.test(s);
+}
+
+// Cheap classifier: does a CSS selector target an <input type="radio">?
+function isRadioSelector(s) {
+  if (!s) return false;
+  // Tag prefix, whitespace and quotes are all optional: `[type="radio"]` counts too.
+  return /\[\s*type\s*=\s*['"]?radio['"]?\s*\]/i.test(s);
+}
+
+// Emit Playwright source for one op → { lines, cached, stats }; `cached` is the selectors.cache.json entry or null.
+function emitOp(op, snapshots) {
+  const lines = [];
+  const stats = { cached: 0, ref_resolved: 0, ref_failed: 0, dropped: 0 };
+  const sec = op.section ? `// [${op.section}] ` : "";
+  const intent = (op.intent || "").replace(/[\r\n]+/g, " ").slice(0, 140);
+  // Skip the intent header when intent is just the fallback "turn N" string (no agent reasoning).
+  const hasUsefulIntent = intent && intent !== `turn ${op.turn}`;
+  const header = hasUsefulIntent ? `  ${sec}// turn ${op.turn}: ${intent}` : (op.section ? `  ${sec}` : null);
+  // Raw commands may span several lines; flatten so they cannot escape an emitted // comment.
+  const command = String(op.command).replace(/[\r\n]+/g, " ");
+  switch (op.kind) {
+    case "goto":
+      if (header) lines.push(header);
+      lines.push(`  await page.goto(${jsStr(op.url)});`);
+      stats.cached++;
+      break;
+
+    case "wait_load":
+      lines.push(`  await page.waitForLoadState("load");`);
+      break;
+    case "wait_timeout":
+      lines.push(`  await page.waitForTimeout(${op.ms || 1000});`);
+      break;
+    case "wait_selector":
+      lines.push(`  await page.waitForSelector(${jsStr(op.selector)});`);
+      break;
+
+    case "click_sel": {
+      if (header) lines.push(header);
+      // Detect radio inputs by selector pattern → use forceClickRadio
+      // (styled labels commonly intercept actionability checks).
+      if (isRadioSelector(op.selector)) {
+        lines.push(`  await forceClickRadio(page.locator(${jsStr(op.selector)}));`);
+      } else {
+        lines.push(`  await page.locator(${jsStr(op.selector)}).click();`);
+      }
+      stats.cached++;
+      return { lines, cached: { kind: "click", code: `page.locator(${jsStr(op.selector)}).click()`, selector: op.selector, op }, stats };
+    }
+    case "fill_sel": {
+      if (header) lines.push(header);
+      // Detect checkbox inputs by selector pattern → use forceCheck
+      // (Playwright's .fill() rejects checkboxes; styled labels often
+      // intercept .check() actionability).
+      if (isCheckboxSelector(op.selector)) {
+        lines.push(`  await forceCheck(page.locator(${jsStr(op.selector)}));`);
+      } else {
+        lines.push(`  await page.locator(${jsStr(op.selector)}).fill(${jsStr(op.value)});`);
+      }
+      stats.cached++;
+      return { lines, cached: { kind: "fill", code: `page.locator(${jsStr(op.selector)}).fill(${jsStr(op.value)})`, selector: op.selector, value: op.value, op }, stats };
+    }
+    case "select_dropdown": {
+      if (header) lines.push(header);
+      // Always use selectWithFallback — handles transiently-disabled selects
+      // via JS-enable + native value setter when .selectOption() times out.
+      lines.push(`  await selectWithFallback(page.locator(${jsStr(op.selector)}), ${jsStr(op.value)});`);
+      stats.cached++;
+      return { lines, cached: { kind: "select", code: `selectWithFallback(page.locator(${jsStr(op.selector)}), ${jsStr(op.value)})`, selector: op.selector, value: op.value, op }, stats };
+    }
+
+    case "select_ref": {
+      const r = resolveOpRef(op, snapshots);
+      if (!r.resolved) {
+        if (header) lines.push(header);
+        lines.push(`  // TODO: could not resolve select ref ${op.ref} (${r.reason})`);
+        lines.push(`  // Original: ${command}`);
+        stats.ref_failed++;
+        return { lines, cached: null, stats };
+      }
+      const best = r.candidates[0];
+      if (header) lines.push(header);
+      lines.push(`  await selectWithFallback(${best.code}, ${jsStr(op.value)});`);
+      stats.ref_resolved++;
+      return {
+        lines,
+        cached: {
+          kind: "select",
+          ref: op.ref,
+          source_turn: r.sourceTurn,
+          node: { role: r.node.role, name: r.node.name, depth: r.node.depth },
+          primary: { method: best.method, args: best.args, confidence: best.confidence, code: best.code },
+          fallbacks: r.candidates.slice(1).map((c) => ({ method: c.method, args: c.args, confidence: c.confidence, code: c.code })),
+          op,
+        },
+        stats,
+      };
+    }
+
+    case "click_ref":
+    case "fill_ref": {
+      const r = resolveOpRef(op, snapshots);
+      if (!r.resolved) {
+        if (header) lines.push(header);
+        lines.push(`  // TODO: could not resolve ref ${op.ref} (${r.reason})`);
+        lines.push(`  // Original: ${command}`);
+        stats.ref_failed++;
+        return { lines, cached: null, stats };
+      }
+      const best = r.candidates[0];
+      const method = op.kind === "click_ref" ? "click" : "fill";
+      const args = method === "fill" ? `(${jsStr(op.value)})` : `()`;
+      if (header) lines.push(header);
+      // Bake in force-helpers when the resolved node role tells us what we're dealing with.
+      const role = (r.node.role || "").toLowerCase();
+      if (op.kind === "click_ref" && role === "radio") {
+        lines.push(`  await forceClickRadio(${best.code});`);
+      } else if (op.kind === "click_ref" && role === "checkbox") {
+        lines.push(`  await forceCheck(${best.code});`);
+      } else if (op.kind === "fill_ref" && role === "checkbox") {
+        lines.push(`  await forceCheck(${best.code});`);
+      } else {
+        lines.push(`  await ${best.code}.${method}${args};`);
+      }
+      // Emit alternative candidates as comments — the self-healer (P1) reads
+      // these and selectors.cache.json to swap when the primary breaks.
+      if (r.candidates.length > 1) {
+        const alts = r.candidates.slice(1, 3).map((c) => c.code).join("  |  ");
+        lines.push(`  //   fallbacks: ${alts}`);
+      }
+      stats.ref_resolved++;
+      return {
+        lines,
+        cached: {
+          kind: method,
+          ref: op.ref,
+          source_turn: r.sourceTurn,
+          node: { role: r.node.role, name: r.node.name, depth: r.node.depth },
+          primary: { method: best.method, args: best.args, confidence: best.confidence, code: best.code },
+          fallbacks: r.candidates.slice(1).map((c) => ({ method: c.method, args: c.args, confidence: c.confidence, code: c.code })),
+          op,
+        },
+        stats,
+      };
+    }
+
+    case "type_focused":
+      lines.push(`  await page.keyboard.type(${jsStr(op.text)});`);
+      break;
+
+    case "eval": {
+      if (header) lines.push(header);
+      // Escape backticks, escape sequences, and ${} for safe embedding in a
+      // TS template literal. The expression runs in page context, same as
+      // the original `browse eval` did via CDP.
+      const escaped = op.expression
+        .replace(/\\/g, "\\\\")
+        .replace(/`/g, "\\`")
+        .replace(/\$\{/g, "\\${");
+      lines.push(`  await page.evaluate(\`${escaped}\`);`);
+      stats.cached++;
+      return {
+        lines,
+        cached: { kind: "eval", expression: op.expression, op },
+        stats,
+      };
+    }
+    case "press":
+      lines.push(`  await page.keyboard.press(${jsStr(op.key)});`);
+      break;
+
+    case "scroll": {
+      const [x, y, dx, dy] = op.coords;
+      if ([x, y, dx, dy].some((n) => !Number.isFinite(n))) {
+        lines.push(`  // skip: malformed scroll ${JSON.stringify(op.coords)}`);
+      } else {
+        lines.push(`  await page.mouse.move(${x}, ${y});`);
+        lines.push(`  await page.mouse.wheel(${dx}, ${dy});`);
+      }
+      break;
+    }
+
+    case "page_nav":
+      if (op.verb === "back") lines.push(`  await page.goBack();`);
+      else if (op.verb === "forward") lines.push(`  await page.goForward();`);
+      else if (op.verb === "reload") lines.push(`  await page.reload();`);
+      break;
+
+    case "session":
+    case "perception":
+      lines.push(`  // skip (${op.kind}): ${command}`);
+      stats.dropped++;
+      break;
+
+    case "unhandled":
+      lines.push(`  // TODO: unhandled browse verb '${op.verb}' (turn ${op.turn}): ${command}`);
+      stats.dropped++;
+      break;
+  }
+  return { lines, cached: null, stats };
+}
+
+// ── LLM-generated extract block ───────────────────────────────────
+
+async function generateExtractBlock({ snapshots, zodSchema, outputShape, taskMd, finalReasoning }) {
+  const FALLBACK = `    // TODO: extract step could not be auto-generated. Hand-write or re-run export with ANTHROPIC_API_KEY set.
+    const result: Output = { success: false, error: "extract step not generated" } as unknown as Output;`;
+
+  if (!process.env.ANTHROPIC_API_KEY) {
+    return { code: FALLBACK, generated: false, reason: "no ANTHROPIC_API_KEY" };
+  }
+  if (!snapshots.length) {
+    return { code: FALLBACK, generated: false, reason: "no snapshots in trace" };
+  }
+
+  // Send the final snapshot + schema + agent's final reasoning to Claude.
+  const finalSnap = snapshots[snapshots.length - 1];
+  const treeText = finalSnap.tree.nodes
+    .map((n) => `${"  ".repeat(n.depth)}[${n.ref}] ${n.role}${n.name ? ": " + n.name : ""}`)
+    .join("\n")
+    .slice(0, 10_000); // safety cap
+
+  const prompt = `You are generating the final extract step for a deterministic Playwright replay script.
+
+The replay script will navigate to a page that the agent previously walked through. Your job is to write TypeScript code that **queries the live page at replay time** to populate a \`result\` variable matching this Zod schema:
+
+\`\`\`ts
+const OutputSchema = ${zodSchema};
+type Output = z.infer<typeof OutputSchema>;
+\`\`\`
+
+The expected output shape (from task.md):
+\`\`\`json
+${JSON.stringify(outputShape, null, 2)}
+\`\`\`
+
+The accessibility tree of the final page (after all actions ran) is below. Use it ONLY as a guide to pick selectors — do not hardcode field values from it:
+\`\`\`
+${treeText}
+\`\`\`
+
+The agent's prior reasoning (for context — do not copy data from it into the result):
+${finalReasoning ? finalReasoning.slice(0, 1500) : "(none)"}
+
+**Critical rules**:
+- Generate code that calls Playwright locators (\`page.getByRole(...)\`, \`page.getByText(...)\`, \`page.getByLabel(...)\`, \`page.locator(...)\`) to fetch text content from the live page. Do **NOT** bake the agent's findings in as static literals.
+- For each field, pick the most stable locator (prefer \`getByRole\` with name → \`getByLabel\` → \`getByText\`) and call \`.textContent()\` / \`.innerText()\` / \`.inputValue()\`.
+- For repeated items (arrays in the schema), use \`.all()\` or \`.allTextContents()\` plus a small loop or \`.map()\`. Pick a parent locator and walk its children.
+- Coerce types correctly: \`Number(...)\` for numbers, parse dates with \`new Date(...)\`, etc.
+- For fields you cannot locate, use an empty sentinel: \`""\` for strings, \`0\` for numbers, \`null\` for nullable, \`[]\` for arrays.
+- Set \`success: true\` at the end if extraction completed without throwing.
+- The variable MUST be named \`result\` and typed \`Output\` (already defined above).
+- Output ONLY the code block. No prose, no markdown fences, no \`async function\` wrapper. The code will be inserted inside a try-block where \`page\` is in scope.
+- Keep it concise. Aim for under 80 lines.
+
+Begin the code now:`;
+
+  try {
+    const client = new Anthropic();
+    const resp = await client.messages.create({
+      model: "claude-haiku-4-5-20251001",
+      max_tokens: 4096,
+      messages: [{ role: "user", content: prompt }],
+    });
+    const text = resp.content.find((b) => b.type === "text")?.text ?? "";
+    const stopReason = resp.stop_reason;
+    // Strip leading/trailing markdown fences if Claude added them.
+    let code = text.trim().replace(/^```(?:typescript|ts)?\s*\n?/, "").replace(/\n?```\s*$/, "");
+    if (!code) return { code: FALLBACK, generated: false, reason: "empty LLM response" };
+
+    // Structural validation. Truncated output (stop_reason === "max_tokens")
+    // produces unparseable code — refuse it. Also require that braces /
+    // brackets / parens balance, since the LLM occasionally drops a closer.
+    if (stopReason === "max_tokens") {
+      return { code: FALLBACK, generated: false, reason: "LLM output truncated at max_tokens" };
+    }
+    const balance = checkBalance(code);
+    if (!balance.ok) {
+      return { code: FALLBACK, generated: false, reason: `LLM output unbalanced: ${balance.reason}` };
+    }
+    if (!/\bresult\b/.test(code)) {
+      return { code: FALLBACK, generated: false, reason: "LLM output did not declare a `result` variable" };
+    }
+
+    // Indent four spaces so the block lines up inside the generated try-block.
+    code = code.split("\n").map((l) => (l.length ? "    " + l : l)).join("\n");
+    return { code, generated: true, reason: null };
+  } catch (err) {
+    return { code: FALLBACK, generated: false, reason: String(err?.message || err) };
+  }
+}
+
+// Crude balance check — counts (), [] and {} outside strings and comments and returns
+// { ok: true } or { ok: false, reason }. Good enough to catch LLM truncation, not a parser.
+function checkBalance(code) {
+  const depth = { "{": 0, "[": 0, "(": 0 };
+  const closers = { "}": "{", "]": "[", ")": "(" };
+  let inStr = null;
+  let inLineComment = false;
+  let inBlockComment = false;
+  for (let i = 0; i < code.length; i++) {
+    const c = code[i];
+    // Comment and string states are checked first so their contents never touch `depth`.
+    if (inLineComment) {
+      if (c === "\n") inLineComment = false;
+      continue;
+    }
+    if (inBlockComment) {
+      if (c === "*" && code[i + 1] === "/") { inBlockComment = false; i++; } // look ahead so "/*/" stays open
+      continue;
+    }
+    if (inStr) {
+      if (c === "\\") {
+        i++;
+        continue;
+      }
+      if (c === inStr) inStr = null;
+      continue;
+    }
+    if (c === "/" && code[i + 1] === "/") {
+      inLineComment = true;
+      i++;
+      continue;
+    }
+    if (c === "/" && code[i + 1] === "*") {
+      inBlockComment = true;
+      i++;
+      continue;
+    }
+    if (c === '"' || c === "'" || c === "`") {
+      inStr = c;
+      continue;
+    }
+    if (c in depth) depth[c]++;
+    else if (c in closers && --depth[closers[c]] < 0) {
+      return { ok: false, reason: `unexpected '${c}' with no matching opener` };
+    }
+  }
+  for (const k of Object.keys(depth)) {
+    if (depth[k] !== 0) {
+      return { ok: false, reason: `unbalanced '${k}' (${depth[k]} open at end)` };
+    }
+  }
+  return { ok: true };
+}
+
+// ── Final script wrapper ──────────────────────────────────────────
+
+function wrapScript({ task, runId, workspace, zodSchema, body, extractCode }) {
+  return `// Generated by autobrowse export --target playwright from ${runId}.
+// Source: ${workspace}/tasks/${task}/{task.md, strategy.md} + traces/${task}/${runId}/trace.json
+// Hand-edit freely. selectors.cache.json mirrors resolved locators + fallbacks.
+import { chromium } from "playwright";
+import { z } from "zod";
+import "dotenv/config";
+import { execFileSync } from "node:child_process";
+
+const OutputSchema = ${zodSchema};
+type Output = z.infer<typeof OutputSchema>;
+
+interface BbSession {
+  wssUrl: string;
+  sessionId: string;
+}
+
+function createBrowserbaseSession(): BbSession | null {
+  const ctx = process.env.BROWSERBASE_CONTEXT_ID;
+  if (!ctx) return null;
+
+  const apiKey = process.env.BROWSERBASE_API_KEY;
+  const projectId = process.env.BROWSERBASE_PROJECT_ID;
+  if (!apiKey || !projectId) {
+    throw new Error("BROWSERBASE_CONTEXT_ID is set but BROWSERBASE_API_KEY or BROWSERBASE_PROJECT_ID are missing.");
+  }
+
+  const stdout = execFileSync(
+    "bb",
+    ["sessions", "create", "--context-id", ctx, "--persist", "--advanced-stealth", "--solve-captchas"],
+    { encoding: "utf-8" },
+  );
+  const session = JSON.parse(stdout);
+  const wssUrl = \`wss://connect.browserbase.com?apiKey=\${apiKey}&sessionId=\${session.id}\`;
+  return { wssUrl, sessionId: session.id };
+}
+
+function releaseBrowserbaseSession(bb: BbSession): void {
+  try {
+    execFileSync("bb", ["sessions", "update", bb.sessionId, "--status", "REQUEST_RELEASE"], { stdio: "ignore" });
+  } catch {
+    /* best-effort */
+  }
+}
+
+// ── Helpers ────────────────────────────────────────────────────────
+//
+// Baked-in workarounds for patterns that broke during the bizfile demo:
+// styled-label overlays intercepting clicks on radios/checkboxes, selects
+// that render briefly disabled while other fields are committing, and
+// React-controlled inputs that strip simulated keystrokes mid-typing.
+
+import type { Locator, Page } from "playwright";
+
+/** Check a styled checkbox, bypassing actionability (the visible label often intercepts). */
+async function forceCheck(loc: Locator): Promise<void> {
+  await loc.first().check({ force: true });
+}
+
+/** Click a styled radio, bypassing actionability (the visible label often intercepts). */
+async function forceClickRadio(loc: Locator): Promise<void> {
+  await loc.first().click({ force: true });
+}
+
+/**
+ * Select an option; if the <select> is rendered disabled (common right after
+ * a prior field commits in React-controlled forms), force-enable it and set
+ * the value via React's tracked-value setter so the form picks it up.
+ */
+async function selectWithFallback(loc: Locator, value: string): Promise<void> {
+  try {
+    await loc.first().selectOption(value, { timeout: 5000 });
+    return;
+  } catch {
+    await loc.first().evaluate((el, v) => {
+      const sel = el as HTMLSelectElement;
+      sel.disabled = false;
+      const setter = Object.getOwnPropertyDescriptor(HTMLSelectElement.prototype, "value")?.set;
+      if (!setter) throw new Error("No value setter on HTMLSelectElement");
+      setter.call(sel, v);
+      sel.dispatchEvent(new Event("change", { bubbles: true }));
+    }, value);
+  }
+}
+
+/**
+ * Fill a text input via React's tracked-value setter. Bypasses keystroke-by-
+ * keystroke event handling (autosuggests that intercept space, autocompletes
+ * that drop characters, etc.). Always prefer this over .fill()/.type() on
+ * React-controlled forms.
+ */
+async function reactFill(page: Page, labelPattern: RegExp | string, value: string): Promise<void> {
+  await page.getByLabel(labelPattern).first().click();
+  await page.evaluate((v) => {
+    const el = document.activeElement as HTMLInputElement | null;
+    if (!el) throw new Error("No active element to fill");
+    const setter = Object.getOwnPropertyDescriptor(HTMLInputElement.prototype, "value")?.set;
+    if (!setter) throw new Error("No value setter on HTMLInputElement");
+    setter.call(el, v);
+    el.dispatchEvent(new Event("input", { bubbles: true }));
+    el.dispatchEvent(new Event("change", { bubbles: true }));
+  }, value);
+}
+
+/** Click "Next Step" (or other named button) via find-by-text in page context;
+ *  avoids the race where getByRole resolves to a stale element between SPA wizard steps. */
+async function clickButtonByText(page: Page, text: string, waitAfterMs = 1500): Promise<void> {
+  await page.evaluate((t) => {
+    const btn = Array.from(document.querySelectorAll("button")).find(
+      (b) => (b.textContent || "").trim() === t,
+    );
+    if (!btn) throw new Error(\`Button "\${t}" not found in DOM\`);
+    (btn as HTMLElement).click();
+  }, text);
+  await page.waitForLoadState("load");
+  await page.waitForTimeout(waitAfterMs);
+}
+
+// ───────────────────────────────────────────────────────────────────
+
+async function main(): Promise<Output> {
+  const bb = createBrowserbaseSession();
+  const browser = bb
+    ? await chromium.connectOverCDP(bb.wssUrl)
+    : await chromium.launch({ headless: false });
+
+  const context = bb ? browser.contexts()[0] : await browser.newContext();
+  const page = context.pages()[0] ?? (await context.newPage());
+
+  try {
+${body}
+
+${extractCode}
+
+    return OutputSchema.parse(result);
+  } finally {
+    if (bb) {
+      releaseBrowserbaseSession(bb);
+    } else {
+      await browser.close();
+    }
+  }
+}
+
+main()
+  .then((result) => {
+    console.log(JSON.stringify(result, null, 2));
+    process.exit((result as { success?: boolean })?.success === true ? 0 : 2);
+  })
+  .catch((err) => {
+    console.error("FATAL:", err);
+    console.log(JSON.stringify({ success: false, error: String(err) }));
+    process.exit(1);
+  });
+`;
+}
+
+// ── Top-level entry ───────────────────────────────────────────────
+
+export async function generatePlaywrightScript({
+  task,
+  runId,
+  workspace,
+  trace,
+  ops,
+  zodSchema,
+  outputShape,
+  taskMd,
+  finalReasoning,
+}) {
+  const snapshots = collectSnapshots(trace);
+  const bodyLines = [];
+  const cachedActions = [];
+  const stats = { cached: 0, ref_resolved: 0, ref_failed: 0, dropped: 0 };
+
+  for (const op of ops) {
+    const r = emitOp(op, snapshots);
+    bodyLines.push(...r.lines);
+    if (r.cached) cachedActions.push({ turn: op.turn, intent: op.intent, section: op.section, ...r.cached });
+    stats.cached += r.stats.cached;
+    stats.ref_resolved += r.stats.ref_resolved;
+    stats.ref_failed += r.stats.ref_failed;
+    stats.dropped += r.stats.dropped;
+  }
+
+  const extract = await generateExtractBlock({
+    snapshots,
+    zodSchema,
+    outputShape,
+    taskMd,
+    finalReasoning,
+  });
+
+  const scriptCode = wrapScript({
+    task,
+    runId,
+    workspace,
+    zodSchema,
+    body: bodyLines.join("\n"),
+    extractCode: extract.code,
+  });
+
+  return {
+    scriptCode,
+    cachedActions,
+    stats,
+    extract: { generated: extract.generated, reason: extract.reason },
+  };
+}
+
+// ── Scaffold files (package.json, tsconfig.json) ──────────────────
+
+export function playwrightPackageJson(task) {
+  return {
+    name: `${task}-playwright`,
+    version: "0.0.1",
+    private: true,
+    type: "module",
+    scripts: { start: `tsx ${task}.ts` },
+    dependencies: {
+      playwright: "^1.47.0",
+      zod: "^3.23.0",
+      dotenv: "^16.4.0",
+    },
+    devDependencies: {
+      tsx: "^4.7.0",
+      typescript: "^5.4.0",
+      "@types/node": "^20.0.0",
+    },
+  };
+}
+
+export function playwrightTsconfig() {
+  return {
+    compilerOptions: {
+      target: "ES2022",
+      module: "ESNext",
+      moduleResolution: "Bundler",
+      lib: ["ES2022", "DOM"],
+      types: ["node"],
+      strict: true,
+      esModuleInterop: true,
+      skipLibCheck: true,
+    },
+  };
+}
diff --git a/skills/autobrowse/scripts/lib/command-mapping.mjs b/skills/autobrowse/scripts/lib/command-mapping.mjs
new file mode 100644
index 00000000..82b2b39d
--- /dev/null
+++ b/skills/autobrowse/scripts/lib/command-mapping.mjs
@@ -0,0 +1,214 @@
+// command-mapping.mjs — walk an autobrowse trace.json into target-agnostic ops.
+//
+// Each `browse <verb> ...` command in the trace becomes one op. The downstream
+// codegen (Stagehand or Playwright) decides how to emit each op kind.
+//
+// Op kinds:
+//   goto, wait_load, wait_timeout, wait_selector,
+//   click_sel, click_ref, fill_sel, fill_ref,
+//   select_dropdown, select_ref, eval, type_focused, press, scroll,
+//   page_nav, session, perception, unhandled
+
+import { sectionForTurn } from "./parse-task.mjs";
+
+// Shell-aware tokenizer. Single-quoted strings are literal; double-quoted
+// strings honor backslash escapes.
+export function tokenize(cmd) {
+  const out = [];
+  let cur = "",
+    q = null,
+    esc = false,
+    started = false;
+  for (const ch of cmd.trim()) {
+    if (esc) {
+      cur += ch;
+      esc = false;
+      started = true;
+      continue;
+    }
+    if (q) {
+      if (ch === q) q = null;
+      else if (q === '"' && ch === "\\") esc = true;
+      else cur += ch;
+      started = true;
+      continue;
+    }
+    if (ch === "'" || ch === '"') {
+      q = ch;
+      started = true;
+      continue;
+    }
+    if (ch === "\\") {
+      esc = true;
+      started = true;
+      continue;
+    }
+    if (/\s/.test(ch)) {
+      if (started) {
+        out.push(cur);
+        cur = "";
+        started = false;
+      }
+      continue;
+    }
+    cur += ch;
+    started = true;
+  }
+  if (started) out.push(cur);
+  return out;
+}
+
+export const REF_RE = /^\[?\d+-\d+\]?$/;
+const XPATH_RE = /^(\.?\/\/|\/)/;
+const CSS_RE = /^[#.\[]|^[a-zA-Z][\w-]*[#.\[:]|^\*/;
+
+export function classifySelector(s) {
+  if (!s) return "none";
+  if (REF_RE.test(s)) return "ref";
+  if (XPATH_RE.test(s)) return "xpath";
+  if (CSS_RE.test(s) || /^[a-zA-Z][\w-]*$/.test(s)) return "css";
+  return "unknown";
+}
+
+// Strip brackets if present and normalize to "X-Y".
+export function normalizeRef(s) {
+  return s.replace(/^\[/, "").replace(/\]$/, "");
+}
+
+// Skip flags between `browse` and the verb. Flags that consume a value
+// (--connect, --session, --ws, --region, --session-timeout) take two tokens.
+const FLAGS_WITH_VALUE = new Set(["--connect", "--session", "--ws", "--region", "--session-timeout"]);
+function findVerbIndex(tokens) {
+  let i = 1;
+  while (i < tokens.length && tokens[i].startsWith("--")) {
+    i += FLAGS_WITH_VALUE.has(tokens[i]) ? 2 : 1;
+  }
+  return i;
+}
+
+// Walk trace.json into ops[]: pair each tool_use with the tool_result that follows it and
+// emit one op per successful `browse` command, tagged with turn/intent/section/command/result.
+// Commands that cannot be mapped to a known op kind become `unhandled` so codegen can flag them.
+export function walkTrace(trace, sections = []) {
+  const ops = [];
+  const traceByTurn = {};
+  for (const e of trace) {
+    if (!traceByTurn[e.turn]) traceByTurn[e.turn] = [];
+    traceByTurn[e.turn].push(e);
+  }
+  const turns = Object.keys(traceByTurn).map(Number).sort((a, b) => a - b);
+
+  for (const turn of turns) {
+    const entries = traceByTurn[turn];
+    const reasoningEntry = entries.find((e) => e.role === "assistant" && e.reasoning);
+    const turnReasoning = reasoningEntry?.reasoning?.split("\n")[0]?.trim() ?? "";
+    const section = sectionForTurn(sections, turn);
+    const intent = (turnReasoning || section?.heading || `turn ${turn}`).slice(0, 160);
+
+    for (let i = 0; i < entries.length; i++) {
+      const e = entries[i];
+      if (e.role !== "assistant" || !e.tool_input) continue;
+      const next = entries[i + 1];
+      const success = next && next.role === "tool_result" && next.error === false;
+      if (!success) continue;
+
+      const tokens = tokenize(e.tool_input.command);
+      if (tokens.length < 2 || tokens[0] !== "browse") continue;
+      const vi = findVerbIndex(tokens);
+      if (vi >= tokens.length) continue;
+      const verb = tokens[vi];
+      const args = tokens.slice(vi + 1);
+
+      const base = {
+        turn,
+        intent,
+        section: section?.heading ?? null,
+        command: e.tool_input.command,
+        result: next.output ?? "",
+      };
+
+      switch (verb) {
+        case "stop":
+        case "status":
+        case "pages":
+        case "env":
+        case "start":
+          ops.push({ kind: "session", verb, args, ...base });
+          break;
+        case "open":
+        case "newpage":
+        case "goto":
+          ops.push({ kind: "goto", url: args[0], ...base });
+          break;
+        case "wait": {
+          const sub = args[0];
+          if (sub === "load") ops.push({ kind: "wait_load", ...base });
+          else if (sub === "timeout")
+            ops.push({ kind: "wait_timeout", ms: parseInt(args[1] || "1000", 10), ...base });
+          else if (sub === "selector") ops.push({ kind: "wait_selector", selector: args[1], ...base });
+          else ops.push({ kind: "unhandled", verb, args, ...base });
+          break;
+        }
+        case "snapshot":
+        case "screenshot":
+        case "get":
+          ops.push({ kind: "perception", verb, args, ...base });
+          break;
+        case "click": {
+          const target = args[0];
+          const klass = classifySelector(target);
+          if (klass === "xpath" || klass === "css") {
+            ops.push({ kind: "click_sel", selector: target, ...base });
+          } else if (klass === "ref") {
+            ops.push({ kind: "click_ref", ref: normalizeRef(target), ...base });
+          } else {
+            // Unclassifiable target: surface a TODO instead of silently dropping the click.
+            ops.push({ kind: "unhandled", verb, args, ...base });
+          }
+          break;
+        }
+        case "fill": {
+          const selector = args[0];
+          const value = args.slice(1).filter((a) => !a.startsWith("--")).join(" ");
+          if (classifySelector(selector) === "ref") {
+            ops.push({ kind: "fill_ref", ref: normalizeRef(selector), value, ...base });
+          } else {
+            ops.push({ kind: "fill_sel", selector, value, ...base });
+          }
+          break;
+        }
+        case "select": {
+          const target = args[0];
+          const value = args.slice(1).join(" ");
+          const klass = classifySelector(target);
+          if (klass === "ref") {
+            ops.push({ kind: "select_ref", ref: normalizeRef(target), value, ...base });
+          } else {
+            ops.push({ kind: "select_dropdown", selector: target, value, ...base });
+          }
+          break;
+        }
+        case "eval":
+          ops.push({ kind: "eval", expression: args.join(" "), ...base });
+          break;
+        case "type":
+          ops.push({ kind: "type_focused", text: args.join(" "), ...base });
+          break;
+        case "press":
+          ops.push({ kind: "press", key: args[0], ...base });
+          break;
+        case "scroll":
+          ops.push({ kind: "scroll", coords: args.map(Number), ...base });
+          break;
+        case "back":
+        case "forward":
+        case "reload":
+          ops.push({ kind: "page_nav", verb, ...base });
+          break;
+        default:
+          ops.push({ kind: "unhandled", verb, args, ...base });
+      }
+    }
+  }
+  return ops;
+}
diff --git a/skills/autobrowse/scripts/lib/distill-failure.mjs b/skills/autobrowse/scripts/lib/distill-failure.mjs
new file mode 100644
index 00000000..2bdcdbb2
--- /dev/null
+++ b/skills/autobrowse/scripts/lib/distill-failure.mjs
@@ -0,0 +1,124 @@
+// distill-failure.mjs — Playwright failure → strategy.md addendum.
+//
+// When the in-loop Playwright replay fails, this module asks Claude Haiku to
+// distill the error into a concise, actionable strategy.md entry: what
+// failed, the likely cause, and what to try next iteration. The addendum is
+// appended to strategy.md's "Recent Playwright Failures" section so both the
+// explorer agent (next evaluate run) and the codegen (next export) can react.
+
+import Anthropic from "@anthropic-ai/sdk";
+import * as fs from "node:fs";
+
+const FALLBACK = (iter, exitCode, stderrSnip) => // raw-error entry used when no LLM summary is produced; embeds at most 800 chars of stderrSnip
+  `### Iteration ${iter} — Playwright replay failed (exit ${exitCode})
+
+\`\`\`
+${stderrSnip.slice(0, 800)}
+\`\`\`
+
+(Auto-summary unavailable — ANTHROPIC_API_KEY missing or LLM call errored. Read the raw error above and decide the next move.)
+`;
+
+/**
+ * Summarize a failed Playwright replay as one strategy.md entry via Claude Haiku.
+ *
+ * Degrades instead of throwing: when ANTHROPIC_API_KEY is unset, the API call
+ * errors, or the reply does not start with "###", the FALLBACK entry (a raw
+ * stderr excerpt) is returned with generated=false and the reason.
+ *
+ * @param {object} opts iteration, taskName, scriptPath, exitCode, stdout, stderr (runLogPath is accepted but unused)
+ * @returns {Promise<{addendum: string, generated: boolean, reason: string|null}>}
+ */
+export async function distillFailure({
+  iteration,
+  taskName,
+  scriptPath,
+  exitCode,
+  stdout = "",
+  stderr = "",
+  runLogPath = null,
+}) {
+  // Keep the TAIL of each stream: the prompt labels them "last 4KB" / "last 1KB".
+  const stderrSnip = stderr.slice(-4000);
+  const stdoutSnip = stdout.slice(-1000);
+
+  const fallback = (reason) => ({
+    addendum: FALLBACK(iteration, exitCode, stderrSnip),
+    generated: false,
+    reason,
+  });
+  if (!process.env.ANTHROPIC_API_KEY) return fallback("no ANTHROPIC_API_KEY");
+
+  // Pull a small slice of the script around the failing line, if we can
+  // infer it from the stderr (Playwright prints "at main (<path>:<line>:<col>)").
+  let scriptSnippet = "";
+  try {
+    const m = stderrSnip.match(/at\s+\w+\s+\(([^:]+):(\d+):\d+\)/);
+    if (m && fs.existsSync(m[1])) {
+      const lines = fs.readFileSync(m[1], "utf-8").split("\n");
+      const failingLine = parseInt(m[2], 10) - 1;
+      const lo = Math.max(0, failingLine - 6);
+      const hi = Math.min(lines.length, failingLine + 4);
+      scriptSnippet = lines
+        .slice(lo, hi)
+        .map((l, i) => `${(lo + i + 1).toString().padStart(4)}${lo + i === failingLine ? " →" : "  "} ${l}`)
+        .join("\n");
+    }
+  } catch {
+    /* best-effort */
+  }
+
+  const prompt = `A deterministic Playwright replay script for task "${taskName}" just failed mid-replay. You are writing one short Markdown entry that will be appended to that task's \`strategy.md\` so the next iteration of the explorer agent can learn from this failure.
+
+Exit code: ${exitCode}
+Script path: ${scriptPath}
+
+Stderr (last 4KB):
+\`\`\`
+${stderrSnip}
+\`\`\`
+
+${stdoutSnip ? `Stdout (last 1KB):\n\`\`\`\n${stdoutSnip}\n\`\`\`\n` : ""}${scriptSnippet ? `Script context around the failing line:\n\`\`\`ts\n${scriptSnippet}\n\`\`\`\n` : ""}
+
+Write a tight Markdown entry with this exact structure (no surrounding prose, no fences around the entry itself):
+
+### Iteration ${iteration} — <one-line failure summary>
+
+- **What failed**: <locator / action / step / line number>
+- **Likely cause**: <one sentence; e.g., "element is rendered as <select disabled> for ~3s after the prior fill", "styled label intercepts pointer events on the underlying input", "selector resolved to a stale ref after a re-render">
+- **Fix to try next iteration**: <one actionable suggestion the explorer or codegen can adopt; e.g., "force-click via .click({force:true})", "use eval-find-by-text instead of getByRole", "add 1500ms wait before the click">
+
+Keep it under 80 words total. Be specific. Reference the actual locator or line number when you can.`;
+
+  try {
+    const client = new Anthropic();
+    const resp = await client.messages.create({
+      model: "claude-haiku-4-5-20251001",
+      max_tokens: 600,
+      messages: [{ role: "user", content: prompt }],
+    });
+    const text = resp.content.find((b) => b.type === "text")?.text?.trim() ?? "";
+    if (!text || !text.startsWith("###")) return fallback("LLM output did not match expected heading");
+    return { addendum: text + "\n", generated: true, reason: null };
+  } catch (err) {
+    return fallback(String(err?.message || err));
+  }
+}
+
+// Append an addendum to strategy.md under the "Recent Playwright Failures"
+// section. Creates the file skeleton or the section if it doesn't exist yet.
+export function appendToStrategy(strategyPath, addendum) {
+  const SECTION_HEADER = "## Recent Playwright Failures";
+  let md = fs.existsSync(strategyPath) ? fs.readFileSync(strategyPath, "utf-8") : "";
+  if (!md.trim()) {
+    md = `# Navigation Strategy\n\n## Navigation Heuristics\n\n(grows as the explorer learns)\n\n## Codegen Hints\n\n(per-task overrides the Playwright codegen should apply)\n\n${SECTION_HEADER}\n\n${addendum}`;
+  } else if (md.includes(SECTION_HEADER)) {
+    // Newest first, right after the header. A replacer FUNCTION is required: with a
+    // string replacement, "$&" / "$'" / "$1" inside the addendum would be expanded.
+    md = md.replace(SECTION_HEADER, () => `${SECTION_HEADER}\n\n${addendum.trim()}\n`);
+  } else {
+    md += `\n\n${SECTION_HEADER}\n\n${addendum}`;
+  }
+
+  fs.writeFileSync(strategyPath, md);
+}
diff --git a/skills/autobrowse/scripts/lib/parse-task.mjs b/skills/autobrowse/scripts/lib/parse-task.mjs
new file mode 100644
index 00000000..70a846b0
--- /dev/null
+++ b/skills/autobrowse/scripts/lib/parse-task.mjs
@@ -0,0 +1,96 @@
+// parse-task.mjs — task.md → Zod schema, strategy.md → section ranges.
+//
+// task.md's `## Output` block holds a fenced JSON example. We normalize
+// placeholder tokens (<integer>, <string|null>, etc.) to JSON sentinels,
+// then walk the resulting object to infer a Zod schema.
+
+export function extractOutputJson(taskMd) {
+  // First fenced block after "## Output", placeholders coerced to JSON literals; null if absent or unparseable.
+  const [, tail] = taskMd.split(/^##\s+Output\s*$/m);
+  const body = tail?.match(/```(?:json)?\s*\n([\s\S]*?)\n```/)?.[1];
+  if (body === undefined) return null;
+  const rewrites = [
+    [/"<[^>]*>"/g, '""'], // quoted placeholder → empty string
+    [/<integer>|<number>|<int>|<count>/gi, "0"],
+    [/<bool>|<boolean>/gi, "false"],
+    [/<null>/gi, "null"],
+    [/<[^>]+>/g, '""'], // any other bare placeholder → empty string
+    [/,\s*([}\]])/g, "$1"], // drop trailing commas
+  ];
+  try {
+    return JSON.parse(rewrites.reduce((text, [pattern, to]) => text.replace(pattern, to), body));
+  } catch {
+    return null;
+  }
+}
+
+export function jsonToZod(value, indent = 2) {
+  // Render, as source text, the Zod schema inferred from one example JSON value.
+  // Object keys are indented by `indent` columns and the closing brace by
+  // `indent - 2`. Arrays are typed from their first element only.
+  const keyPad = " ".repeat(indent);
+
+  if (value === null) return "z.unknown().nullable()";
+  if (Array.isArray(value)) {
+    return value.length === 0 ? "z.array(z.unknown())" : `z.array(${jsonToZod(value[0], indent)})`;
+  }
+
+  const kind = typeof value;
+  if (kind === "string") return "z.string()";
+  if (kind === "boolean") return "z.boolean()";
+  if (kind === "number") return Number.isInteger(value) ? "z.number().int()" : "z.number()";
+  if (kind !== "object") return "z.unknown()";
+
+  const fields = [];
+  for (const [key, child] of Object.entries(value)) {
+    // Bare identifiers stay unquoted; any other key is emitted as a JSON string.
+    const keyOut = /^[A-Za-z_$][\w$]*$/.test(key) ? key : JSON.stringify(key);
+    fields.push(`${keyPad}${keyOut}: ${jsonToZod(child, indent + 2)},`);
+  }
+  return `z.object({\n${fields.join("\n")}\n${" ".repeat(indent - 2)}})`;
+}
+
+export function taskToSchema(taskMd) {
+  // Derive the output schema from task.md's example block. Anything other than
+  // a plain-object example falls back to a generic `{ result }` wrapper schema.
+  const outputShape = extractOutputJson(taskMd);
+  const usable = outputShape && typeof outputShape === "object" && !Array.isArray(outputShape);
+  if (!usable) {
+    return {
+      outputShape: null,
+      zodSchema: "z.object({ result: z.unknown() })",
+      schemaFieldCount: 0,
+    };
+  }
+  const schemaFieldCount = Object.keys(outputShape).length;
+  return { outputShape, zodSchema: jsonToZod(outputShape), schemaFieldCount };
+}
+
+// Split strategy.md into sections at every level-2..4 heading. A heading that
+// carries a "(turns N–M)" / "(turns N-M)" marker yields numeric start/end; other
+// headings get nulls. Returns [{ heading, start, end, prose }] in document order.
+export function parseStrategySections(strategyMd) {
+  const HEADING = /^#{2,4}\s+(.+)$/;
+  const TURN_RANGE = /turns?\s+(\d+)\s*[–—\-]\s*(\d+)/i;
+  const sections = [];
+  for (const line of strategyMd.split("\n")) {
+    const title = HEADING.exec(line)?.[1];
+    if (title === undefined) {
+      // Body line: belongs to the latest section; text before the first heading is dropped.
+      sections[sections.length - 1]?.prose.push(line);
+      continue;
+    }
+    const span = TURN_RANGE.exec(title);
+    sections.push({
+      heading: title.trim(),
+      start: span ? parseInt(span[1], 10) : null,
+      end: span ? parseInt(span[2], 10) : null,
+      prose: [],
+    });
+  }
+  return sections;
+}
+
+export function sectionForTurn(sections, turn) { // first section whose inclusive turn range covers `turn`, else undefined
+  return sections.find(({ start, end }) => start !== null && start <= turn && turn <= end);
+}
diff --git a/skills/autobrowse/scripts/lib/pick-run.mjs b/skills/autobrowse/scripts/lib/pick-run.mjs
new file mode 100644
index 00000000..ae08c957
--- /dev/null
+++ b/skills/autobrowse/scripts/lib/pick-run.mjs
@@ -0,0 +1,47 @@
+// pick-run.mjs — choose which autobrowse run to mine for export.
+//
+// A run is "passing" when its summary.md's final JSON has `success: true`.
+// Lifted from stagehand-export/scripts/export.mjs.
+
+import * as fs from "node:fs";
+import * as path from "node:path";
+
+export function listRuns(tracesDir) {
+  // Names of the "run-*" entries under tracesDir in descending lexicographic
+  // order; an empty list when the directory is missing.
+  if (!fs.existsSync(tracesDir)) return [];
+  const runs = fs.readdirSync(tracesDir).filter((name) => name.startsWith("run-"));
+  runs.sort();
+  return runs.reverse();
+}
+
+export function readSummary(tracesDir, runId) { // text of <tracesDir>/<runId>/summary.md, or null when that file is missing
+  const summaryFile = path.join(tracesDir, runId, "summary.md");
+  return fs.existsSync(summaryFile) ? fs.readFileSync(summaryFile, { encoding: "utf-8" }) : null;
+}
+
+export function extractFinalJson(summary) {
+  // Parse the first fenced block that follows "## Agent Final Output".
+  // Every failure mode (no summary, no heading, no fence, bad JSON) yields null.
+  const tail = summary ? summary.split("## Agent Final Output")[1] : null;
+  const fenced = tail ? tail.match(/```(?:json)?\s*\n([\s\S]*?)\n```/) : null;
+  if (!fenced) return null;
+  try {
+    return JSON.parse(fenced[1]);
+  } catch {
+    return null;
+  }
+}
+
+export function isPassing(tracesDir, runId) {
+  // True only when the run's summary.md exists and its final JSON has
+  // `success: true`. Always a strict boolean (never null), so the result can
+  // be compared or serialized directly.
+  return extractFinalJson(readSummary(tracesDir, runId))?.success === true;
+}
+
+// Run-id to export from: the forced id when given (returned as-is, not checked),
+// otherwise the first passing run in listRuns() order; null when none passes.
+export function pickRun(tracesDir, forcedRunId) {
+  return forcedRunId || (listRuns(tracesDir).find((id) => isPassing(tracesDir, id)) ?? null);
+}
diff --git a/skills/autobrowse/scripts/lib/selector-resolver.mjs b/skills/autobrowse/scripts/lib/selector-resolver.mjs
new file mode 100644
index 00000000..bb9d929a
--- /dev/null
+++ b/skills/autobrowse/scripts/lib/selector-resolver.mjs
@@ -0,0 +1,274 @@
+// selector-resolver.mjs — turn an ARIA ref (e.g. "23-2205") from a browse
+// snapshot into a ranked list of Playwright locator candidates.
+//
+// Why this exists: autobrowse traces reference DOM nodes by session-scoped
+// `[X-Y]` refs that don't replay outside the original CDP session. To emit
+// deterministic Playwright, we have to resolve each ref against the snapshot
+// the agent saw at the time, then translate the node's role/name/parent
+// context into stable Playwright locators.
+//
+// Snapshot line format:
+//   <indent>[X-Y] <role>(, <role2>)*( : <name|text>)?
+// Indent is 2 spaces per nesting level.
+
+// ARIA roles used for the high-confidence getByRole locator candidates.
+// Accessibility-tree types outside this set (LayoutTable, LayoutTableRow,
+// RootWebArea, IframePresentational, StaticText, scrollable, html, body, …)
+// do not get those candidates but can still be reached via getByText / parent.
+const ARIA_ROLES = new Set([
+  "alert", "alertdialog", "application", "article", "banner", "blockquote",
+  "button", "caption", "cell", "checkbox", "code", "columnheader", "combobox",
+  "complementary", "contentinfo", "definition", "deletion", "dialog",
+  "directory", "document", "emphasis", "feed", "figure", "form", "generic",
+  "grid", "gridcell", "group", "heading", "img", "image", "insertion", "link",
+  "list", "listbox", "listitem", "log", "main", "marquee", "math", "menu",
+  "menubar", "menuitem", "menuitemcheckbox", "menuitemradio", "meter",
+  "navigation", "none", "note", "option", "paragraph", "presentation",
+  "progressbar", "radio", "radiogroup", "region", "row", "rowgroup",
+  "rowheader", "scrollbar", "search", "searchbox", "separator", "slider",
+  "spinbutton", "status", "strong", "subscript", "superscript", "switch",
+  "tab", "table", "tablist", "tabpanel", "term", "textbox", "time", "timer",
+  "toolbar", "tooltip", "tree", "treegrid", "treeitem",
+]);
+
+// Parse one snapshot tree (the string stored under the snapshot JSON's `tree`
+// field) into { nodes, byRef }.
+//
+//   nodes — flat array in line order; each node is
+//           { ref, roles, role, name, depth, parentIdx, lineIdx, childrenIdx }
+//           where role is roles[0] (or null), name is null when absent/empty,
+//           parentIdx is -1 for roots, and childrenIdx lists direct children.
+//   byRef — Map from ref string ("X-Y") to its node; on a duplicate ref the
+//           later line wins.
+//
+// Blank lines and lines that don't match "<indent>[X-Y] <rest>" are skipped
+// but still count toward lineIdx. A null/undefined treeText parses as empty.
+export function parseSnapshotTree(treeText) {
+  const LINE = /^(\s*)\[(\d+-\d+)\]\s+(.+)$/; // <indent>[ref] <roles[: name]>
+  const nodes = [];
+  const byRef = new Map();
+  const open = []; // ancestors still open at the current line: [{ depth, idx }]
+
+  (treeText ?? "").split("\n").forEach((line, lineIdx) => {
+    const m = line.trim() ? LINE.exec(line) : null;
+    if (!m) return;
+
+    const [, indentStr, ref, rest] = m;
+    const depth = Math.floor(indentStr.length / 2); // 2 spaces per nesting level
+
+    // Everything before the FIRST ":" is the comma-separated role list; the
+    // remainder (which may itself contain colons) is the name.
+    const cut = rest.indexOf(":");
+    const rolesStr = cut === -1 ? rest : rest.slice(0, cut).trim();
+    const rawName = cut === -1 ? "" : rest.slice(cut + 1).trim();
+    const roles = rolesStr.split(",").map((r) => r.trim()).filter(Boolean);
+
+    // Close every open ancestor at this depth or deeper; whatever is left on
+    // top (if anything) is this node's parent.
+    while (open.length > 0 && open[open.length - 1].depth >= depth) open.pop();
+    const parentIdx = open.length > 0 ? open[open.length - 1].idx : -1;
+
+    // Links are stored as indices into `nodes`, not as object references.
+    const idx = nodes.length;
+    nodes.push({
+      ref,
+      roles,
+      role: roles[0] ?? null,
+      name: rawName === "" ? null : rawName,
+      depth,
+      parentIdx,
+      lineIdx,
+      childrenIdx: [],
+    });
+    if (parentIdx !== -1) nodes[parentIdx].childrenIdx.push(idx);
+    open.push({ depth, idx });
+    byRef.set(ref, nodes[idx]);
+  });
+
+  return { nodes, byRef };
+}
+
+// Collect every successful `browse snapshot` tool result in the trace and
+// parse its tree. Returns [{ turn, tree }] in trace order.
+export function collectSnapshots(trace) {
+  const rawTreeOf = (entry) => {
+    if (entry.role !== "tool_result" || entry.error || !entry.command || !/\bsnapshot\b/.test(entry.command)) return null;
+    try {
+      const payload = JSON.parse(entry.output);
+      return payload && typeof payload.tree === "string" ? payload.tree : null;
+    } catch {
+      return null; // output is not JSON, so there is no tree to read
+    }
+  };
+  const snaps = [];
+  for (const entry of trace) {
+    const raw = rawTreeOf(entry);
+    if (raw !== null) snaps.push({ turn: entry.turn, tree: parseSnapshotTree(raw) });
+  }
+  return snaps;
+}
+
+// Find the node for `ref` in the latest snapshot taken at or before `fromTurn`
+// that still contains it. Refs are session-scoped and persist until the DOM
+// changes, so an older snapshot is acceptable. Returns null when none matches.
+export function resolveRef(ref, snapshots, fromTurn) {
+  for (const snap of [...snapshots].reverse()) {
+    if (snap.turn > fromTurn) continue;
+    const node = snap.tree.byRef.get(ref);
+    if (node) return { node, sourceTurn: snap.turn, tree: snap.tree };
+  }
+  return null;
+}
+
+// Best-effort label text for a node, or null when none is found.
+//   1. A named StaticText/label sibling under the same parent — common in form
+//      rows where a label and its input share a wrapper.
+//   2. Otherwise the first named StaticText/heading met depth-first among the
+//      node's descendants, looking at most 5 levels down.
+function findNearbyLabel(node, tree) {
+  const isNamed = (n, roles) => roles.includes(n.role) && Boolean(n.name);
+
+  if (node.parentIdx >= 0) {
+    const siblings = tree.nodes[node.parentIdx].childrenIdx.map((i) => tree.nodes[i]);
+    const labelled = siblings.find((sib) => sib !== node && isNamed(sib, ["StaticText", "label"]));
+    if (labelled) return labelled.name;
+  }
+
+  // Depth-limited DFS; `levelsLeft` shrinks by one per level so a deep
+  // subtree cannot turn this into a full-tree walk.
+  const search = (idx, levelsLeft) => {
+    if (levelsLeft <= 0) return null;
+    const current = tree.nodes[idx];
+    if (isNamed(current, ["StaticText", "heading"])) return current.name;
+    for (const childIdx of current.childrenIdx) {
+      const hit = search(childIdx, levelsLeft - 1);
+      if (hit) return hit;
+    }
+    return null;
+  };
+
+  for (const childIdx of node.childrenIdx) {
+    const hit = search(childIdx, 5);
+    if (hit) return hit;
+  }
+  return null;
+}
+
+// Build ranked Playwright locator candidates for a node. Each candidate is
+// { method, args, confidence (0..1), code }, sorted by descending confidence,
+// so the first entry is the preferred locator and later ones are fallbacks
+// the self-healer (P1) can try when selectors drift. The ranking heuristic
+// mirrors Swivel's stability ordering. `code` is the source text produced
+// by renderLocator() for that candidate.
+//
+// Candidate ladder (confidence):
+//   0.92  getByRole(role, { name })  role is in ARIA_ROLES and the node is named
+//   0.85  getByLabel(label)          form inputs; label = own name or nearby label
+//   0.75  getByPlaceholder(name)     named form inputs
+//   0.60  getByRole(role, { name })  named node whose role is NOT in ARIA_ROLES
+//   0.45  getByText(text)            own name or nearby label, under 100 chars
+//   0.25  getByRole(role)            unnamed node with an ARIA role
+//
+// @param node  a node from parseSnapshotTree().nodes, or null/undefined
+// @param tree  the { nodes, byRef } tree the node belongs to
+// @returns     candidates, best first; [] when node is missing
+export function nodeToLocators(node, tree) {
+  if (!node) return [];
+
+  // Roles treated as form inputs (matched case-insensitively).
+  const FORM_INPUT = /^(textbox|combobox|checkbox|searchbox|spinbutton|slider|switch|radio)$/i;
+  const { role, name } = node;
+
+  // Role strings are lower-cased before they are emitted or looked up.
+  const roleLc = role ? role.toLowerCase() : null;
+  const hasAriaRole = Boolean(role) && ARIA_ROLES.has(roleLc);
+
+  // Text used for label/text locators: the node's own name wins, and the
+  // tree is only searched for a nearby label when the node is unnamed.
+  const nameOrLabel = () => name || findNearbyLabel(node, tree);
+
+  // Record one candidate; `code` is attached during de-duplication below.
+  const ranked = [];
+  const propose = (confidence, method, ...args) => ranked.push({ method, args, confidence });
+
+  // 1. getByRole({ name }) — best when the role is ARIA and the node is named.
+  if (hasAriaRole && name) {
+    propose(0.92, "getByRole", roleLc, { name });
+  }
+
+  // 2. getByLabel — for form inputs sitting next to (or carrying) a label.
+  if (role && FORM_INPUT.test(role)) {
+    const label = nameOrLabel();
+    if (label) propose(0.85, "getByLabel", label);
+    // Placeholder text shows up as `name` for textboxes without an explicit
+    // label, so a named input also gets a getByPlaceholder candidate.
+    if (name) propose(0.75, "getByPlaceholder", name);
+  }
+
+  // 3. getByRole({ name }) with a role outside ARIA_ROLES.
+  // NOTE(review): confirm Playwright can actually match such role strings at
+  // replay time — this candidate outranks getByText (0.6 vs 0.45).
+  if (role && !hasAriaRole && name) {
+    propose(0.6, "getByRole", roleLc, { name });
+  }
+
+  // 4. getByText — fallback when the role isn't useful but text is, e.g.
+  // clickable wrapper divs whose visible text lives in a static-text child.
+  const text = nameOrLabel();
+  if (text && text.length < 100) {
+    propose(0.45, "getByText", text);
+  }
+
+  // 5. getByRole without a name — last resort; only unambiguous when the
+  // role occurs once in the relevant scope, hence the low confidence.
+  if (hasAriaRole && !name) {
+    propose(0.25, "getByRole", roleLc);
+  }
+
+  // Drop candidates that render to identical code (the first one proposed
+  // wins), then order by confidence.
+  const byCode = new Map();
+  for (const candidate of ranked) {
+    const code = renderLocator(candidate);
+    if (!byCode.has(code)) byCode.set(code, { ...candidate, code });
+  }
+
+  // Array.prototype.sort is stable, so ties keep the ladder order above.
+  const unique = [...byCode.values()];
+  unique.sort((a, b) => b.confidence - a.confidence);
+  return unique;
+}
+
+// Render a candidate as Playwright source text, e.g.
+// `page.getByRole("button", { name: "Submit" })`.
+export function renderLocator(candidate) {
+  const { method, args } = candidate;
+  return `page.${method}(${args.map((arg) => stringifyArg(arg)).join(", ")})`;
+}
+
+function stringifyArg(a) { // render one locator argument as JavaScript source text
+  switch (typeof a) {
+    case "string": return JSON.stringify(a);
+    case "number": case "boolean": return String(a);
+  }
+  if (!a || typeof a !== "object") return JSON.stringify(a);
+  const pairs = Object.entries(a).map(([key, val]) => `${key}: ${stringifyArg(val)}`);
+  return `{ ${pairs.join(", ")} }`; // object keys are emitted bare (unquoted)
+}
+
+// Convenience wrapper: resolve op.ref against the snapshots at or before
+// op.turn and build its locator candidates. Returns
+// { resolved: true, candidates, node, sourceTurn } or { resolved: false, reason, … }.
+export function resolveOpRef(op, snapshots) {
+  const hit = resolveRef(op.ref, snapshots, op.turn);
+  if (!hit) {
+    return { resolved: false, reason: `no snapshot ≤ turn ${op.turn} contains ref ${op.ref}` };
+  }
+  const { node, sourceTurn, tree } = hit;
+  const candidates = nodeToLocators(node, tree);
+  if (candidates.length > 0) return { resolved: true, candidates, node, sourceTurn };
+
+  // The ref exists but yields no locator: report the node so callers can see why.
+  const reason = `ref ${op.ref} resolved to node with no usable locator (role=${node.role}, name=${node.name})`;
+  return { resolved: false, reason, node, sourceTurn };
+}
diff --git a/skills/autobrowse/scripts/lib/verify.mjs b/skills/autobrowse/scripts/lib/verify.mjs
new file mode 100644
index 00000000..32d6bbc9
--- /dev/null
+++ b/skills/autobrowse/scripts/lib/verify.mjs
@@ -0,0 +1,54 @@
+// verify.mjs — install deps + run the generated script + parse its final JSON.
+//
+// Pass = exit 0 AND last JSON block on stdout has `success: true`.
+
+import * as fs from "node:fs";
+import * as path from "node:path";
+import { spawnSync } from "node:child_process";
+
+export function verifyGenerated(outDir, scriptFilename) {
+  // Install deps in outDir, run the generated script via `npx tsx`, save both
+  // streams to run.log, and return { passed, exit_code, run_log, output, stage }.
+  const log = (msg) => console.error(`[verify] ${msg}`);
+  log(`running npm install (silent) in ${outDir}…`);
+  const install = spawnSync("npm", ["install", "--silent"], {
+    cwd: outDir,
+    stdio: ["ignore", "inherit", "inherit"],
+  });
+  if (install.status !== 0) {
+    log(`npm install failed (exit ${install.status})`);
+    return { passed: false, exit_code: install.status ?? 1, run_log: null, output: null, stage: "install" };
+  }
+
+  log(`running: npx tsx ${scriptFilename}`);
+  const runLogPath = path.join(outDir, "run.log");
+  const run = spawnSync("npx", ["tsx", scriptFilename], {
+    cwd: outDir,
+    encoding: "utf-8",
+    stdio: ["ignore", "pipe", "pipe"],
+  });
+  fs.writeFileSync(runLogPath, `STDOUT:\n${run.stdout ?? ""}\n\nSTDERR:\n${run.stderr ?? ""}\n`);
+
+  // Parse the last JSON object on stdout. The final "{" alone may sit inside a
+  // nested object, so walk "{" positions right-to-left until one parses.
+  const stdout = run.stdout ?? "";
+  let parsed = null;
+  let at = stdout.lastIndexOf("{");
+  while (at >= 0 && parsed === null) {
+    try {
+      parsed = JSON.parse(stdout.slice(at));
+    } catch {
+      at = at > 0 ? stdout.lastIndexOf("{", at - 1) : -1;
+    }
+  }
+
+  const passed = run.status === 0 && parsed?.success === true;
+  log(passed ? `✅ verification passed` : `❌ verification failed (exit=${run.status}) — see ${runLogPath}`);
+  return {
+    passed,
+    exit_code: run.status,
+    run_log: runLogPath,
+    output: parsed,
+    stage: "run",
+  };
+}
diff --git a/skills/autobrowse/scripts/loop.mjs b/skills/autobrowse/scripts/loop.mjs
new file mode 100755
index 00000000..9abf533b
--- /dev/null
+++ b/skills/autobrowse/scripts/loop.mjs
@@ -0,0 +1,302 @@
+#!/usr/bin/env node
+
+/**
+ * loop.mjs — Iterative autobrowse + Playwright verification.
+ *
+ * Wraps the existing evaluate.mjs and export.mjs into a single loop that
+ * converges on a workflow which BOTH the LLM explorer and the deterministic
+ * Playwright replay can complete. Each iteration:
+ *
+ *   1. Run evaluate.mjs (the inner LLM agent)
+ *   2. If the trace passed (success: true in final JSON), run export.mjs to
+ *      emit a Playwright script and replay it against a fresh BB session.
+ *   3. If the Playwright replay also passed → record a pass.
+ *      Else → distill the failure into strategy.md and continue.
+ *   4. Graduate when Playwright has passed in 2 of the last 3 iterations.
+ *
+ * The shared `strategy.md` is the convergence point. The explorer reads it
+ * each iteration. The codegen (eventually) reads its "Codegen Hints" section.
+ * Playwright failures land in "Recent Playwright Failures".
+ *
+ * Usage:
+ *   node scripts/loop.mjs --task <name> [--max-iterations N] [--max-turns-per-iter N]
+ *                         [--workspace ./autobrowse] [--env local|remote] [--skip-verify]
+ */
+
+import "dotenv/config";
+import * as fs from "node:fs";
+import * as path from "node:path";
+import { spawnSync } from "node:child_process";
+import { fileURLToPath } from "node:url";
+import { distillFailure, appendToStrategy } from "./lib/distill-failure.mjs";
+import { pickRun } from "./lib/pick-run.mjs";
+import { extractFinalJson, readSummary } from "./lib/pick-run.mjs";
+
+const __dirname = path.dirname(fileURLToPath(import.meta.url)); // directory containing this script
+const SKILL_DIR = path.resolve(__dirname, ".."); // parent of that directory
+
+// ── CLI ────────────────────────────────────────────────────────────
+
+function getArg(name, fallback) { // value following `--name` on the command line, else `fallback`
+  const at = process.argv.indexOf(`--${name}`);
+  return (at === -1 ? undefined : process.argv[at + 1]) || fallback;
+}
+const hasFlag = (flag) => process.argv.indexOf(`--${flag}`) !== -1; // true when `--flag` appears anywhere in argv
+
+if (hasFlag("help") || hasFlag("h")) { // matches --help and --h (hasFlag prepends "--", so a bare -h is not recognized)
+  console.log(`autobrowse loop — iterate evaluate + Playwright verification until convergence
+
+Usage: node scripts/loop.mjs --task <name> [options]
+
+Options:
+  --task <name>              Task name — matches tasks/<name>/ (required)
+  --max-iterations N         Cap on outer iterations (default: 8)
+  --max-turns-per-iter N     Per-evaluate turn budget (default: 60)
+  --workspace <dir>          Default: ./autobrowse
+  --env local|remote         Default: local (use remote for bot-protected sites)
+  --skip-verify              Skip the Playwright verify step (still emit script)
+
+Convergence: graduates when the emitted Playwright script passes in 2 of the
+last 3 iterations. Until then, each Playwright failure is distilled into
+strategy.md so the next evaluate run can adapt.
+
+Env vars:
+  ANTHROPIC_API_KEY          Required for evaluate + distillation + LLM extract
+  BROWSERBASE_API_KEY        Required for --env remote
+  BROWSERBASE_PROJECT_ID     Required for --env remote
+  BROWSERBASE_CONTEXT_ID     Optional — pre-authed context for both evaluate and Playwright`);
+  process.exit(0);
+}
+
+const TASK = getArg("task");
+const MAX_ITER = parseInt(getArg("max-iterations", "8"), 10);
+const MAX_TURNS_PER_ITER = parseInt(getArg("max-turns-per-iter", "60"), 10);
+const WORKSPACE = path.resolve(getArg("workspace", "autobrowse"));
+const ENV = getArg("env", "local");
+const SKIP_VERIFY = hasFlag("skip-verify");
+// Fail fast on bad input: a NaN or zero iteration cap would make the main loop run zero times.
+if (!TASK || !(MAX_ITER >= 1) || !(MAX_TURNS_PER_ITER >= 1)) {
+  console.error(!TASK ? "ERROR: --task <name> is required. Run with --help." : "ERROR: --max-iterations and --max-turns-per-iter must be positive integers.");
+  process.exit(1);
+}
+
+// ── Paths ──────────────────────────────────────────────────────────
+
+const evaluateScript = path.join(SKILL_DIR, "scripts", "evaluate.mjs");
+const exportScript = path.join(SKILL_DIR, "scripts", "export.mjs");
+const taskDir = path.join(WORKSPACE, "tasks", TASK); // <workspace>/tasks/<task>
+const tracesDir = path.join(WORKSPACE, "traces", TASK); // <workspace>/traces/<task>
+const strategyPath = path.join(taskDir, "strategy.md");
+const playwrightDir = path.join(taskDir, "playwright");
+const playwrightScript = path.join(playwrightDir, `${TASK}.ts`);
+
+if (!fs.existsSync(taskDir)) { // abort early when the task directory is missing
+  console.error(`ERROR: ${taskDir} does not exist. Create task.md first (see SKILL.md).`);
+  process.exit(1);
+}
+
+fs.mkdirSync(path.join(WORKSPACE, "reports"), { recursive: true });
+const reportPath = path.join(
+  WORKSPACE,
+  "reports",
+  `loop-${TASK}-${new Date().toISOString().replace(/[:.]/g, "-").slice(0, 19)}.md`, // UTC ISO timestamp cut to seconds, ":" and "." replaced by "-"
+);
+
+// ── Helpers ────────────────────────────────────────────────────────
+
+function log(msg) { // write one "[loop] "-prefixed progress line to stderr
+  console.error(`[loop] ${msg}`);
+}
+
+function runEvaluate(iter) {
+  // Run one evaluate.mjs pass. Returns { status, evalReport }, where evalReport
+  // is the last JSON object on its stdout (null when none parses).
+  log(`iter ${iter}: running evaluate.mjs (max-turns ${MAX_TURNS_PER_ITER})…`);
+  const args = [evaluateScript, "--task", TASK, "--workspace", WORKSPACE, "--env", ENV, "--max-turns", String(MAX_TURNS_PER_ITER)];
+  const result = spawnSync("node", args, {
+    stdio: ["ignore", "pipe", "inherit"],
+    encoding: "utf-8",
+    env: process.env,
+  });
+  if (result.status !== 0) {
+    log(`iter ${iter}: evaluate.mjs exited ${result.status}`);
+  }
+  // evaluate.mjs prints a JSON report (run id + cost) last. The final "{" alone may sit
+  // inside a nested object, so walk "{" positions right-to-left until one parses.
+  const stdout = result.stdout || "";
+  let evalReport = null;
+  let at = stdout.lastIndexOf("{");
+  while (at >= 0 && evalReport === null) {
+    try {
+      evalReport = JSON.parse(stdout.slice(at));
+    } catch {
+      at = at > 0 ? stdout.lastIndexOf("{", at - 1) : -1;
+    }
+  }
+  return { status: result.status, evalReport };
+}
+
+function tracePassed(runId) {
+  // True only when the run's summary.md exists and its final JSON has
+  // `success: true`. Always a strict boolean (never null/undefined), because
+  // the result is stored in `history` and serialized into the JSON report.
+  return extractFinalJson(readSummary(tracesDir, runId))?.success === true;
+}
+
+function runExport(runId) {
+  // Emit the Playwright script for `runId` via export.mjs; true on exit code 0.
+  // `--no-verify` is passed because this loop runs the replay itself, so that
+  // it can capture the output and distill failures.
+  log(`exporting Playwright script from ${runId}…`);
+  const flags = {
+    "--task": TASK,
+    "--workspace": WORKSPACE,
+    "--target": "playwright",
+    "--run": runId,
+  };
+  const argv = [exportScript, ...Object.entries(flags).flat(), "--no-verify"];
+
+  const { status } = spawnSync("node", argv, { stdio: ["ignore", "inherit", "inherit"], env: process.env });
+  return status === 0;
+}
+
+function runPlaywright() {
+  // Replay the emitted script with `npx tsx` inside playwrightDir. A pass needs
+  // exit code 0 AND a trailing JSON object on stdout with `success: true`.
+  log(`replaying Playwright script…`);
+  if (!fs.existsSync(path.join(playwrightDir, "node_modules"))) {
+    const install = spawnSync("npm", ["install", "--silent"], { cwd: playwrightDir, stdio: ["ignore", "inherit", "inherit"] });
+    if (install.status !== 0) {
+      return { passed: false, exitCode: install.status, stdout: "", stderr: "npm install failed" };
+    }
+  }
+  const run = spawnSync("npx", ["tsx", `${TASK}.ts`], {
+    cwd: playwrightDir,
+    encoding: "utf-8",
+    stdio: ["ignore", "pipe", "pipe"],
+    env: process.env,
+  });
+  const stdout = run.stdout ?? "";
+  const stderr = run.stderr ?? "";
+  // The final "{" alone may sit inside a nested object, so walk "{" positions right-to-left until one parses.
+  let parsed = null;
+  let at = stdout.lastIndexOf("{");
+  while (at >= 0 && parsed === null) {
+    try {
+      parsed = JSON.parse(stdout.slice(at));
+    } catch {
+      at = at > 0 ? stdout.lastIndexOf("{", at - 1) : -1;
+    }
+  }
+  const passed = run.status === 0 && parsed?.success === true;
+  return { passed, exitCode: run.status, stdout, stderr, parsed };
+}
+
+// ── Main loop ──────────────────────────────────────────────────────
+
+const history = []; // [{ iter, runId, evalPassed, pwPassed, distillReason }]
+
+// Runs evaluate → export → Playwright replay up to MAX_ITER times, stopping early once the
+// replay has passed in 2 of the last 3 iterations; then writes the report and exits 0 or 2.
+async function main() {
+  log(`task=${TASK} workspace=${WORKSPACE} env=${ENV} max-iter=${MAX_ITER}`);
+  let graduated = false; // set only by the convergence check; report + exit code follow it
+
+  for (let iter = 1; iter <= MAX_ITER; iter++) {
+    log(`──────── iteration ${iter}/${MAX_ITER} ────────`);
+
+    // 1. Run evaluate
+    const { evalReport } = runEvaluate(iter);
+    const runId = evalReport?.run ?? null;
+    const evalPassed = runId ? tracePassed(runId) : false;
+    log(`iter ${iter}: evaluate ${evalPassed ? "✅ passed" : "❌ no success: true"} (run=${runId ?? "?"})`);
+
+    const hist = { iter, runId, evalPassed, pwPassed: false, distillReason: null };
+    history.push(hist);
+
+    if (!evalPassed) {
+      log(`iter ${iter}: skipping Playwright (trace not passing) — agent will iterate next round`);
+      continue;
+    }
+
+    // 2. Emit Playwright (overwrites previous if any)
+    const exportOk = runExport(runId);
+    if (!exportOk) {
+      log(`iter ${iter}: export failed; treating as Playwright fail`);
+      hist.distillReason = "export script returned non-zero";
+      continue;
+    }
+
+    if (SKIP_VERIFY) {
+      log(`iter ${iter}: --skip-verify set; not running Playwright`);
+      continue;
+    }
+
+    // 3. Run Playwright
+    const pw = runPlaywright();
+    hist.pwPassed = pw.passed;
+    log(`iter ${iter}: Playwright ${pw.passed ? "✅ passed" : `❌ failed (exit=${pw.exitCode})`}`);
+
+    if (!pw.passed) {
+      // 4. Distill the failure into strategy.md
+      log(`iter ${iter}: distilling Playwright failure into strategy.md…`);
+      const { addendum, generated, reason } = await distillFailure({
+        iteration: iter, taskName: TASK, scriptPath: playwrightScript,
+        exitCode: pw.exitCode, stdout: pw.stdout, stderr: pw.stderr,
+      });
+      appendToStrategy(strategyPath, addendum);
+      hist.distillReason = generated ? "LLM-summarized" : `fallback: ${reason}`;
+      log(`iter ${iter}: strategy.md updated (${hist.distillReason})`);
+    }
+
+    // 5. Convergence check — Playwright passed in 2 of last 3 iterations?
+    const last3 = history.slice(-3);
+    const passes = last3.filter((h) => h.pwPassed).length;
+    if (passes >= 2) {
+      graduated = true;
+      log(`🎓 GRADUATED: Playwright passed in ${passes} of last ${last3.length} iterations`);
+      break;
+    }
+  }
+
+  // ── Write report ─────────────────────────────────────────────────
+  const passedCount = history.filter((h) => h.pwPassed).length;
+  const lines = [
+    `# autobrowse loop report — ${TASK}`,
+    ``,
+    `**Total iterations:** ${history.length}`,
+    `**Playwright passes:** ${passedCount}`,
+    `**Final status:** ${graduated ? "✅ graduated" : "❌ did not converge"}`,
+    ``,
+    `## Per-iteration`,
+    ``,
+    `| Iter | Run | Trace passed | Playwright passed | Distill |`,
+    `|------|-----|--------------|-------------------|---------|`,
+    ...history.map((h) =>
+      `| ${h.iter} | ${h.runId ?? "?"} | ${h.evalPassed ? "✅" : "❌"} | ${h.pwPassed ? "✅" : "❌"} | ${h.distillReason ?? "—"} |`,
+    ),
+    ``,
+    `Strategy file: \`${strategyPath}\``,
+    passedCount >= 1 ? `Latest emitted script: \`${playwrightScript}\`` : null,
+  ];
+  fs.writeFileSync(reportPath, lines.filter((l) => l !== null).join("\n") + "\n");
+  log(`wrote report → ${reportPath}`);
+
+  // Final structured stdout
+  console.log(JSON.stringify({
+    task: TASK,
+    iterations: history.length,
+    pw_passes: passedCount,
+    graduated,
+    history,
+    report: reportPath,
+    script: passedCount >= 1 ? playwrightScript : null,
+  }, null, 2));
+
+  process.exit(graduated ? 0 : 2);
+}
+
+main().catch(function onFatal(error) {
+  console.error("FATAL:", error);
+  process.exit(1); // crash exit code, distinct from the codes main() itself exits with
+});

From c918d2d6daed8aab57055bb13c4f7f94c30b9a6e Mon Sep 17 00:00:00 2001
From: Alex Qiu <alexander@browserbase.com>
Date: Thu, 14 May 2026 16:32:52 -0700
Subject: [PATCH 2/2] feat(autobrowse): codegen defaults for the 4 hand-edit
 classes surfaced by loop validation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Loop validation today on bizfile (run-008 mined as the passing trace) reduced
the post-codegen hand-edits from yesterday's 15 down to 4 + 1 LLM-extract patch.
Each of the 4 navigation-level issues is now baked in as a codegen default, so
the next task we point loop.mjs at should start from a much smaller residual.

Fixes landed:

  1. clickLinkWithFallback helper (codegen-playwright.mjs)
     - For click_ref ops where the resolved node role is "link", emit
       clickLinkWithFallback(page, <locator>) instead of plain .click().
     - Helper reads the resolved .href property (not getAttribute("href"), which
       returns the raw, often relative, value). If the link exposes an absolute http(s) href,
       prefer page.goto over .click — bypasses SPA tour overlays and
       onClick preventDefault gates that block deterministic replay.
     - Waits for networkidle after navigation (load fires too early on SPAs).

  2. .first() default for ambiguous click_sel selectors
     - Added isUniqueSelector() classifier: #id, [id=...], [data-testid=...],
       [data-test=...], [data-cy=...], [name=...].
     - For unique selectors, emit .click() as before. For ambiguous ones
       (e.g. `button[type=button]`), emit .first().click() to avoid
       Playwright strict-mode violations.

  3. exact: true for form-input getByRole emissions (selector-resolver.mjs)
     - Added EXACT_NAME_ROLES set: textbox, searchbox, combobox, spinbutton,
       listbox. nodeToLocators emits { name, exact: true } for these.
     - Prevents "Limited Liability Company Name" from matching
       "Confirm Limited Liability Company Name" (real bug from yesterday).

  4. snapshot role "select" → ARIA role "combobox" (selector-resolver.mjs)
     - Added SNAPSHOT_TO_ARIA_ROLE map and normalize at top of nodeToLocators.
     - Browse-snapshot reports <select> with role "select" but Playwright's
       ARIA role is "combobox". Without this mapping, the emitter produced
       getByRole("select", ...) which is invalid.
     - Also boost getByLabel above getByRole for select-likes (combobox/listbox)
       since label-based locators tend to be more reliable for form selects.

Validation:

  Re-exported bizfile-ca-llc from run-008 with these defaults. The emitted
  script navigates ALL 9 wizard steps without hand-edits (vs. yesterday's
  hand-fixed playwright-baseline/ which required 7 categories of patches).
  Only failure is in the LLM-generated extract block at the end (brittle
  structural locators in result-shaping) — separate concern, tracked as a
  follow-up. The architectural goal (loop + codegen produces a navigating
  Playwright script) is met.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../scripts/lib/codegen-playwright.mjs        | 39 ++++++++++++-
 .../scripts/lib/selector-resolver.mjs         | 56 +++++++++++++++----
 2 files changed, 84 insertions(+), 11 deletions(-)

diff --git a/skills/autobrowse/scripts/lib/codegen-playwright.mjs b/skills/autobrowse/scripts/lib/codegen-playwright.mjs
index 85dd47f3..a9bb4450 100644
--- a/skills/autobrowse/scripts/lib/codegen-playwright.mjs
+++ b/skills/autobrowse/scripts/lib/codegen-playwright.mjs
@@ -32,6 +32,15 @@ function isRadioSelector(s) {
     /\[type=radio\]/i.test(s);
 }
 
+// "Clearly unique" = the WHOLE selector is a bare #id or one id-like attribute selector
+// (optionally tag-qualified), e.g. `#go`, `input[name=q]`. Compound/descendant selectors such
+// as `[id=nav] a` can match many elements, so they get .first() like everything else.
+function isUniqueSelector(s) {
+  if (typeof s !== "string") return false;
+  const sel = s.trim();
+  return /^#[\w-]+$/.test(sel) || /^[\w-]*\[(id|data-testid|data-test|data-cy|name)\s*=[^\]]*\]$/i.test(sel);
+}
+
 function emitOp(op, snapshots) {
   const lines = [];
   const cached = null;
@@ -66,8 +75,12 @@ function emitOp(op, snapshots) {
       // (styled labels commonly intercept actionability checks).
       if (isRadioSelector(op.selector)) {
         lines.push(`  await forceClickRadio(page.locator(${jsStr(op.selector)}));`);
-      } else {
+      } else if (isUniqueSelector(op.selector)) {
         lines.push(`  await page.locator(${jsStr(op.selector)}).click();`);
+      } else {
+        // .first() guards against strict-mode violations when the agent emitted
+        // an ambiguous CSS selector like `button[type=button]` (matches Help / Save Draft / Next Step).
+        lines.push(`  await page.locator(${jsStr(op.selector)}).first().click();`);
       }
       stats.cached++;
       return { lines, cached: { kind: "click", code: `page.locator(${jsStr(op.selector)}).click()`, selector: op.selector, op }, stats };
@@ -144,6 +157,10 @@ function emitOp(op, snapshots) {
         lines.push(`  await forceCheck(${best.code});`);
       } else if (op.kind === "fill_ref" && role === "checkbox") {
         lines.push(`  await forceCheck(${best.code});`);
+      } else if (op.kind === "click_ref" && role === "link") {
+        // SPA links with onClick handlers (tour overlays, route-only handlers)
+        // often don't navigate via .click(). Falls back to page.goto(href).
+        lines.push(`  await clickLinkWithFallback(page, ${best.code});`);
       } else {
         lines.push(`  await ${best.code}.${method}${args};`);
       }
@@ -475,6 +492,26 @@ async function reactFill(page: Page, labelPattern: RegExp | string, value: strin
   }, value);
 }
 
+/** Click a link with auto-fallback to direct navigation. SPA links on state-agency
+ *  portals frequently have onClick handlers that preventDefault and route via
+ *  client-state (often gated behind tour/onboarding overlays). When the link's
+ *  resolved href is an http(s) URL for a different document, prefer page.goto over
+ *  .click — same destination, no overlay-intercept risk. Falls back to a plain click
+ *  for same-page fragment links (href="#…") and non-http hrefs (javascript:, none). */
+async function clickLinkWithFallback(page: Page, loc: Locator): Promise<void> {
+  // Use the resolved .href property (always absolute — even "#" resolves against the
+  // page URL), not getAttribute("href"), which returns the raw, often relative, value.
+  const href = await loc.first().evaluate((el) => (el as HTMLAnchorElement).href).catch(() => null);
+  const stripHash = (u: string) => u.split("#")[0];
+  if (href && /^https?:\\/\\//i.test(href) && stripHash(href) !== stripHash(page.url())) {
+    await page.goto(href);
+  } else {
+    await loc.first().click({ timeout: 10000 });
+  }
+  // SPAs often finish loading client content well after the load event fires; wait for network to settle.
+  await page.waitForLoadState("networkidle").catch(() => {});
+}
+
 /** Click "Next Step" (or other named button) via find-by-text in page context;
  *  avoids the race where getByRole resolves to a stale element between SPA wizard steps. */
 async function clickButtonByText(page: Page, text: string, waitAfterMs = 1500): Promise<void> {
diff --git a/skills/autobrowse/scripts/lib/selector-resolver.mjs b/skills/autobrowse/scripts/lib/selector-resolver.mjs
index bb9d929a..37b9aa09 100644
--- a/skills/autobrowse/scripts/lib/selector-resolver.mjs
+++ b/skills/autobrowse/scripts/lib/selector-resolver.mjs
@@ -154,6 +154,21 @@ function findNearbyLabel(node, tree) {
   return null;
 }
 
+// Maps non-ARIA role names from the browse snapshot tree to their ARIA equivalents
+// (<select> is reported as "select" but Playwright's role is "combobox"). Null
+// prototype: lookups by arbitrary role strings never hit Object.prototype members.
+const SNAPSHOT_TO_ARIA_ROLE = Object.freeze({
+  __proto__: null,
+  select: "combobox", // add further mappings here as production traces surface them
+});
+
+// Roles where the accessible name is an exact label/placeholder and getByRole
+// with substring matching causes collisions (e.g. "Company Name" matching
+// "Confirm Company Name"). We emit `exact: true` for these.
+const EXACT_NAME_ROLES = new Set([
+  "textbox", "searchbox", "combobox", "spinbutton", "listbox",
+]);
+
 // Build ranked Playwright locator candidates for a node. Each candidate has
 // { method, args, confidence (0..1), code }. The first is the "best" — the
 // self-healer (P1) can fall back to lower-ranked candidates when selectors
@@ -164,18 +179,39 @@ export function nodeToLocators(node, tree) {
 
   const pwName = (n) => n; // Playwright accepts strings for `name`; quotes happen at emit.
 
+  // Normalize the role: map browse-snapshot quirks (e.g., "select") to ARIA roles.
+  const rawRole = (node.role || "").toLowerCase();
+  const role = SNAPSHOT_TO_ARIA_ROLE[rawRole] || rawRole;
+  const buildRoleArgs = (r, name) => {
+    const opts = { name: pwName(name) };
+    if (EXACT_NAME_ROLES.has(r)) opts.exact = true;
+    return [r, opts];
+  };
+
+  const isFormInput = role && /^(textbox|combobox|checkbox|searchbox|spinbutton|slider|switch|radio|listbox)$/i.test(role);
+  const isSelectLike = role === "combobox" || role === "listbox";
+
+  // For form selects, prefer getByLabel over getByRole — label-based locators
+  // tend to be more reliable when forms have repeated/similar select widgets.
+  if (isSelectLike && node.name) {
+    candidates.push({
+      method: "getByLabel",
+      args: [node.name],
+      confidence: 0.93,
+    });
+  }
+
   // 1. getByRole({ name }) — best when both are present and role is ARIA.
-  if (node.role && ARIA_ROLES.has(node.role.toLowerCase()) && node.name) {
+  if (role && ARIA_ROLES.has(role) && node.name) {
     candidates.push({
       method: "getByRole",
-      args: [node.role.toLowerCase(), { name: pwName(node.name) }],
-      confidence: 0.92,
+      args: buildRoleArgs(role, node.name),
+      confidence: isSelectLike ? 0.88 : 0.92,
     });
   }
 
-  // 2. getByLabel — for form inputs sitting next to a label.
-  const isFormInput = node.role && /^(textbox|combobox|checkbox|searchbox|spinbutton|slider|switch|radio)$/i.test(node.role);
-  if (isFormInput) {
+  // 2. getByLabel — for form inputs sitting next to a label (non-select).
+  if (isFormInput && !isSelectLike) {
     const label = node.name || findNearbyLabel(node, tree);
     if (label) {
       candidates.push({
@@ -197,10 +233,10 @@ export function nodeToLocators(node, tree) {
 
   // 3. getByRole({ name }) — also worth trying with role even if not in
   // ARIA_ROLES, since Playwright tolerates many role strings.
-  if (node.role && !ARIA_ROLES.has(node.role.toLowerCase()) && node.name) {
+  if (role && !ARIA_ROLES.has(role) && node.name) {
     candidates.push({
       method: "getByRole",
-      args: [node.role.toLowerCase(), { name: pwName(node.name) }],
+      args: buildRoleArgs(role, node.name),
       confidence: 0.6,
     });
   }
@@ -218,10 +254,10 @@ export function nodeToLocators(node, tree) {
 
   // 5. getByRole without a name — last resort, only safe when there's
   // really only one of this role in the relevant scope. Low confidence.
-  if (node.role && ARIA_ROLES.has(node.role.toLowerCase()) && !node.name) {
+  if (role && ARIA_ROLES.has(role) && !node.name) {
     candidates.push({
       method: "getByRole",
-      args: [node.role.toLowerCase()],
+      args: [role],
       confidence: 0.25,
     });
   }