From 8626af81ddef62a69caeab6a5225011385d8a683 Mon Sep 17 00:00:00 2001
From: Alex Qiu
Date: Thu, 14 May 2026 14:22:10 -0700
Subject: [PATCH 1/2] =?UTF-8?q?feat(autobrowse):=20iterative=20Playwright?=
 =?UTF-8?q?=20loop=20=E2=80=94=20explorer=20+=20deterministic=20verify=20c?=
 =?UTF-8?q?onverge=20together?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Until now the explorer (evaluate.mjs) and the Playwright emitter (export.mjs)
were two disconnected stages: explorer converged on "the LLM can finish the
task," then export was a one-shot translation. The two objective functions
diverge — what unblocks the LLM agent doesn't always unblock a deterministic
replay. Demoing this against bizfile.sos.ca.gov surfaced 7+ classes of mismatch
(styled-label overlays, autocomplete keystroke interception,
transiently-disabled selects) that each cost a hand-fix in the emitted script.

This PR unifies the loop:

Each iteration of `scripts/loop.mjs`:
1. evaluate.mjs → produces trace.json + summary.md
2. If trace passed, export.mjs --no-verify → emits Playwright script
3. npx tsx <task>.ts → actual deterministic replay
4. On Playwright fail, distill-failure.mjs summarizes the error via
   Claude Haiku into strategy.md's "Recent Playwright Failures" section
5. Next iteration's evaluate reads the updated strategy.md and adapts

Convergence: Playwright passes 2 of last 3 iterations → graduate.

`strategy.md` is the shared intelligence layer between the LLM explorer and the codegen.
Three sections (documented in SKILL.md): - Navigation Heuristics (LLM-facing) - Codegen Hints (emitter-facing, per-task overrides) - Recent Playwright Failures (auto-appended by distill-failure) Also lifts the lessons from the bizfile demo into codegen defaults so future tasks don't repeat the same hand-fixes: - forceCheck : .check({ force: true }) for checkbox fill_sel ops - forceClickRadio : .first().click({ force: true }) for radio click ops (detected by selector pattern OR resolved node role) - selectWithFallback: .selectOption() with a JS-enable + native-setter fallback when the ? +function isCheckboxSelector(s) { + if (!s) return false; + return /input\s*\[\s*type\s*=\s*['"]?checkbox['"]?\s*\]/i.test(s) || + /\[type=checkbox\]/i.test(s); +} + +// Cheap classifier: does a CSS selector target an ? +function isRadioSelector(s) { + if (!s) return false; + return /input\s*\[\s*type\s*=\s*['"]?radio['"]?\s*\]/i.test(s) || + /\[type=radio\]/i.test(s); +} + +function emitOp(op, snapshots) { + const lines = []; + const cached = null; + const stats = { cached: 0, ref_resolved: 0, ref_failed: 0, dropped: 0 }; + const sec = op.section ? `// [${op.section}] ` : ""; + const intent = (op.intent || "").replace(/[\r\n]+/g, " ").slice(0, 140); + // Skip the intent header when intent is just the fallback "turn N" string + // (i.e., the agent had no reasoning for this turn). + const hasUsefulIntent = intent && intent !== `turn ${op.turn}`; + const header = hasUsefulIntent ? ` ${sec}// turn ${op.turn}: ${intent}` : (op.section ? 
` ${sec}` : null); + + switch (op.kind) { + case "goto": + if (header) lines.push(header); + lines.push(` await page.goto(${jsStr(op.url)});`); + stats.cached++; + break; + + case "wait_load": + lines.push(` await page.waitForLoadState("load");`); + break; + case "wait_timeout": + lines.push(` await page.waitForTimeout(${op.ms || 1000});`); + break; + case "wait_selector": + lines.push(` await page.waitForSelector(${jsStr(op.selector)});`); + break; + + case "click_sel": { + if (header) lines.push(header); + // Detect radio inputs by selector pattern → use forceClickRadio + // (styled labels commonly intercept actionability checks). + if (isRadioSelector(op.selector)) { + lines.push(` await forceClickRadio(page.locator(${jsStr(op.selector)}));`); + } else { + lines.push(` await page.locator(${jsStr(op.selector)}).click();`); + } + stats.cached++; + return { lines, cached: { kind: "click", code: `page.locator(${jsStr(op.selector)}).click()`, selector: op.selector, op }, stats }; + } + case "fill_sel": { + if (header) lines.push(header); + // Detect checkbox inputs by selector pattern → use forceCheck + // (Playwright's .fill() rejects checkboxes; styled labels often + // intercept .check() actionability). + if (isCheckboxSelector(op.selector)) { + lines.push(` await forceCheck(page.locator(${jsStr(op.selector)}));`); + } else { + lines.push(` await page.locator(${jsStr(op.selector)}).fill(${jsStr(op.value)});`); + } + stats.cached++; + return { lines, cached: { kind: "fill", code: `page.locator(${jsStr(op.selector)}).fill(${jsStr(op.value)})`, selector: op.selector, value: op.value, op }, stats }; + } + case "select_dropdown": { + if (header) lines.push(header); + // Always use selectWithFallback — handles transiently-disabled selects + // via JS-enable + native value setter when .selectOption() times out. 
+ lines.push(` await selectWithFallback(page.locator(${jsStr(op.selector)}), ${jsStr(op.value)});`); + stats.cached++; + return { lines, cached: { kind: "select", code: `selectWithFallback(page.locator(${jsStr(op.selector)}), ${jsStr(op.value)})`, selector: op.selector, value: op.value, op }, stats }; + } + + case "select_ref": { + const r = resolveOpRef(op, snapshots); + if (!r.resolved) { + if (header) lines.push(header); + lines.push(` // TODO: could not resolve select ref ${op.ref} (${r.reason})`); + lines.push(` // Original: ${op.command}`); + stats.ref_failed++; + return { lines, cached: null, stats }; + } + const best = r.candidates[0]; + if (header) lines.push(header); + lines.push(` await selectWithFallback(${best.code}, ${jsStr(op.value)});`); + stats.ref_resolved++; + return { + lines, + cached: { + kind: "select", + ref: op.ref, + source_turn: r.sourceTurn, + node: { role: r.node.role, name: r.node.name, depth: r.node.depth }, + primary: { method: best.method, args: best.args, confidence: best.confidence, code: best.code }, + fallbacks: r.candidates.slice(1).map((c) => ({ method: c.method, args: c.args, confidence: c.confidence, code: c.code })), + op, + }, + stats, + }; + } + + case "click_ref": + case "fill_ref": { + const r = resolveOpRef(op, snapshots); + if (!r.resolved) { + if (header) lines.push(header); + lines.push(` // TODO: could not resolve ref ${op.ref} (${r.reason})`); + lines.push(` // Original: ${op.command}`); + stats.ref_failed++; + return { lines, cached: null, stats }; + } + const best = r.candidates[0]; + const method = op.kind === "click_ref" ? "click" : "fill"; + const args = method === "fill" ? `(${jsStr(op.value)})` : `()`; + if (header) lines.push(header); + // Bake in force-helpers when the resolved node role tells us what we're dealing with. 
+ const role = (r.node.role || "").toLowerCase(); + if (op.kind === "click_ref" && role === "radio") { + lines.push(` await forceClickRadio(${best.code});`); + } else if (op.kind === "click_ref" && role === "checkbox") { + lines.push(` await forceCheck(${best.code});`); + } else if (op.kind === "fill_ref" && role === "checkbox") { + lines.push(` await forceCheck(${best.code});`); + } else { + lines.push(` await ${best.code}.${method}${args};`); + } + // Emit alternative candidates as comments — the self-healer (P1) reads + // these and selectors.cache.json to swap when the primary breaks. + if (r.candidates.length > 1) { + const alts = r.candidates.slice(1, 3).map((c) => c.code).join(" | "); + lines.push(` // fallbacks: ${alts}`); + } + stats.ref_resolved++; + return { + lines, + cached: { + kind: method, + ref: op.ref, + source_turn: r.sourceTurn, + node: { role: r.node.role, name: r.node.name, depth: r.node.depth }, + primary: { method: best.method, args: best.args, confidence: best.confidence, code: best.code }, + fallbacks: r.candidates.slice(1).map((c) => ({ method: c.method, args: c.args, confidence: c.confidence, code: c.code })), + op, + }, + stats, + }; + } + + case "type_focused": + lines.push(` await page.keyboard.type(${jsStr(op.text)});`); + break; + + case "eval": { + if (header) lines.push(header); + // Escape backticks, escape sequences, and ${} for safe embedding in a + // TS template literal. The expression runs in page context, same as + // the original `browse eval` did via CDP. 
+ const escaped = op.expression + .replace(/\\/g, "\\\\") + .replace(/`/g, "\\`") + .replace(/\$\{/g, "\\${"); + lines.push(` await page.evaluate(\`${escaped}\`);`); + stats.cached++; + return { + lines, + cached: { kind: "eval", expression: op.expression, op }, + stats, + }; + } + case "press": + lines.push(` await page.keyboard.press(${jsStr(op.key)});`); + break; + + case "scroll": { + const [x, y, dx, dy] = op.coords; + if ([x, y, dx, dy].some((n) => Number.isNaN(n))) { + lines.push(` // skip: malformed scroll ${JSON.stringify(op.coords)}`); + } else { + lines.push(` await page.mouse.move(${x}, ${y});`); + lines.push(` await page.mouse.wheel(${dx}, ${dy});`); + } + break; + } + + case "page_nav": + if (op.verb === "back") lines.push(` await page.goBack();`); + else if (op.verb === "forward") lines.push(` await page.goForward();`); + else if (op.verb === "reload") lines.push(` await page.reload();`); + break; + + case "session": + case "perception": + lines.push(` // skip (${op.kind}): ${op.command}`); + stats.dropped++; + break; + + case "unhandled": + lines.push(` // TODO: unhandled browse verb '${op.verb}' (turn ${op.turn}): ${op.command}`); + stats.dropped++; + break; + } + return { lines, cached, stats }; +} + +// ── LLM-generated extract block ─────────────────────────────────── + +async function generateExtractBlock({ snapshots, zodSchema, outputShape, taskMd, finalReasoning }) { + const FALLBACK = ` // TODO: extract step could not be auto-generated. Hand-write or re-run export with ANTHROPIC_API_KEY set. + const result: Output = { success: false, error: "extract step not generated" } as unknown as Output;`; + + if (!process.env.ANTHROPIC_API_KEY) { + return { code: FALLBACK, generated: false, reason: "no ANTHROPIC_API_KEY" }; + } + if (!snapshots.length) { + return { code: FALLBACK, generated: false, reason: "no snapshots in trace" }; + } + + // Send the final snapshot + schema + agent's final reasoning to Claude. 
+ const finalSnap = snapshots[snapshots.length - 1]; + const treeText = finalSnap.tree.nodes + .map((n) => `${" ".repeat(n.depth)}[${n.ref}] ${n.role}${n.name ? ": " + n.name : ""}`) + .join("\n") + .slice(0, 10_000); // safety cap + + const prompt = `You are generating the final extract step for a deterministic Playwright replay script. + +The replay script will navigate to a page that the agent previously walked through. Your job is to write TypeScript code that **queries the live page at replay time** to populate a \`result\` variable matching this Zod schema: + +\`\`\`ts +const OutputSchema = ${zodSchema}; +type Output = z.infer; +\`\`\` + +The expected output shape (from task.md): +\`\`\`json +${JSON.stringify(outputShape, null, 2)} +\`\`\` + +The accessibility tree of the final page (after all actions ran) is below. Use it ONLY as a guide to pick selectors — do not hardcode field values from it: +\`\`\` +${treeText} +\`\`\` + +The agent's prior reasoning (for context — do not copy data from it into the result): +${finalReasoning ? finalReasoning.slice(0, 1500) : "(none)"} + +**Critical rules**: +- Generate code that calls Playwright locators (\`page.getByRole(...)\`, \`page.getByText(...)\`, \`page.getByLabel(...)\`, \`page.locator(...)\`) to fetch text content from the live page. Do **NOT** bake the agent's findings in as static literals. +- For each field, pick the most stable locator (prefer \`getByRole\` with name → \`getByLabel\` → \`getByText\`) and call \`.textContent()\` / \`.innerText()\` / \`.inputValue()\`. +- For repeated items (arrays in the schema), use \`.all()\` or \`.allTextContents()\` plus a small loop or \`.map()\`. Pick a parent locator and walk its children. +- Coerce types correctly: \`Number(...)\` for numbers, parse dates with \`new Date(...)\`, etc. +- For fields you cannot locate, use an empty sentinel: \`""\` for strings, \`0\` for numbers, \`null\` for nullable, \`[]\` for arrays. 
+- Set \`success: true\` at the end if extraction completed without throwing. +- The variable MUST be named \`result\` and typed \`Output\` (already defined above). +- Output ONLY the code block. No prose, no markdown fences, no \`async function\` wrapper. The code will be inserted inside a try-block where \`page\` is in scope. +- Keep it concise. Aim for under 80 lines. + +Begin the code now:`; + + try { + const client = new Anthropic(); + const resp = await client.messages.create({ + model: "claude-haiku-4-5-20251001", + max_tokens: 4096, + messages: [{ role: "user", content: prompt }], + }); + const text = resp.content.find((b) => b.type === "text")?.text ?? ""; + const stopReason = resp.stop_reason; + // Strip leading/trailing markdown fences if Claude added them. + let code = text.trim().replace(/^```(?:typescript|ts)?\s*\n?/, "").replace(/\n?```\s*$/, ""); + if (!code) return { code: FALLBACK, generated: false, reason: "empty LLM response" }; + + // Structural validation. Truncated output (stop_reason === "max_tokens") + // produces unparseable code — refuse it. Also require that braces / + // brackets / parens balance, since the LLM occasionally drops a closer. + if (stopReason === "max_tokens") { + return { code: FALLBACK, generated: false, reason: "LLM output truncated at max_tokens" }; + } + const balance = checkBalance(code); + if (!balance.ok) { + return { code: FALLBACK, generated: false, reason: `LLM output unbalanced: ${balance.reason}` }; + } + if (!/\bresult\b/.test(code)) { + return { code: FALLBACK, generated: false, reason: "LLM output did not declare a `result` variable" }; + } + + // Indent two extra spaces for the try-block context. + code = code.split("\n").map((l) => (l.length ? 
" " + l : l)).join("\n"); + return { code, generated: true, reason: null }; + } catch (err) { + return { code: FALLBACK, generated: false, reason: String(err?.message || err) }; + } +} + +// Crude balance check — counts brackets ignoring those inside strings or +// comments. Good enough to catch LLM truncation, not a parser. +function checkBalance(code) { + let depth = { "{": 0, "[": 0, "(": 0 }; + const open = { "{": "}", "[": "]", "(": ")" }; + let inStr = null; + let inLineComment = false; + let inBlockComment = false; + for (let i = 0; i < code.length; i++) { + const c = code[i]; + const prev = code[i - 1]; + if (inLineComment) { + if (c === "\n") inLineComment = false; + continue; + } + if (inBlockComment) { + if (prev === "*" && c === "/") inBlockComment = false; + continue; + } + if (inStr) { + if (c === "\\") { + i++; + continue; + } + if (c === inStr) inStr = null; + continue; + } + if (c === "/" && code[i + 1] === "/") { + inLineComment = true; + i++; + continue; + } + if (c === "/" && code[i + 1] === "*") { + inBlockComment = true; + i++; + continue; + } + if (c === '"' || c === "'" || c === "`") { + inStr = c; + continue; + } + if (c in depth) depth[c]++; + else if (c === "}") depth["{"]--; + else if (c === "]") depth["["]--; + else if (c === ")") depth["("]--; + } + for (const k of Object.keys(depth)) { + if (depth[k] !== 0) { + return { ok: false, reason: `unbalanced '${k}' (${depth[k]} open at end)` }; + } + } + return { ok: true }; +} + +// ── Final script wrapper ────────────────────────────────────────── + +function wrapScript({ task, runId, workspace, zodSchema, body, extractCode }) { + return `// Generated by autobrowse export --target playwright from ${runId}. +// Source: ${workspace}/tasks/${task}/{task.md, strategy.md} + traces/${task}/${runId}/trace.json +// Hand-edit freely. selectors.cache.json mirrors resolved locators + fallbacks. 
+import { chromium } from "playwright"; +import { z } from "zod"; +import "dotenv/config"; +import { execFileSync } from "node:child_process"; + +const OutputSchema = ${zodSchema}; +type Output = z.infer; + +interface BbSession { + wssUrl: string; + sessionId: string; +} + +function createBrowserbaseSession(): BbSession | null { + const ctx = process.env.BROWSERBASE_CONTEXT_ID; + if (!ctx) return null; + + const apiKey = process.env.BROWSERBASE_API_KEY; + const projectId = process.env.BROWSERBASE_PROJECT_ID; + if (!apiKey || !projectId) { + throw new Error("BROWSERBASE_CONTEXT_ID is set but BROWSERBASE_API_KEY or BROWSERBASE_PROJECT_ID are missing."); + } + + const stdout = execFileSync( + "bb", + ["sessions", "create", "--context-id", ctx, "--persist", "--advanced-stealth", "--solve-captchas"], + { encoding: "utf-8" }, + ); + const session = JSON.parse(stdout); + const wssUrl = \`wss://connect.browserbase.com?apiKey=\${apiKey}&sessionId=\${session.id}\`; + return { wssUrl, sessionId: session.id }; +} + +function releaseBrowserbaseSession(bb: BbSession): void { + try { + execFileSync("bb", ["sessions", "update", bb.sessionId, "--status", "REQUEST_RELEASE"], { stdio: "ignore" }); + } catch { + /* best-effort */ + } +} + +// ── Helpers ──────────────────────────────────────────────────────── +// +// Baked-in workarounds for patterns that broke during the bizfile demo: +// styled-label overlays intercepting clicks on radios/checkboxes, selects +// that render briefly disabled while other fields are committing, and +// React-controlled inputs that strip simulated keystrokes mid-typing. + +import type { Locator, Page } from "playwright"; + +/** Check a styled checkbox, bypassing actionability (the visible label often intercepts). */ +async function forceCheck(loc: Locator): Promise { + await loc.first().check({ force: true }); +} + +/** Click a styled radio, bypassing actionability (the visible label often intercepts). 
*/ +async function forceClickRadio(loc: Locator): Promise { + await loc.first().click({ force: true }); +} + +/** + * Select an option; if the with role "select" but Playwright's ARIA role is "combobox". Without this mapping, the emitter produced getByRole("select", ...) which is invalid. - Also boost getByLabel above getByRole for select-likes (combobox/listbox) since label-based locators tend to be more reliable for form selects. Validation: Re-exported bizfile-ca-llc from run-008 with these defaults. The emitted script navigates ALL 9 wizard steps without hand-edits (vs. yesterday's hand-fixed playwright-baseline/ which required 7 categories of patches). Only failure is in the LLM-generated extract block at the end (brittle structural locators in result-shaping) — separate concern, tracked as a follow-up. The architectural goal (loop + codegen produces a navigating Playwright script) is met. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../scripts/lib/codegen-playwright.mjs | 39 ++++++++++++- .../scripts/lib/selector-resolver.mjs | 56 +++++++++++++++---- 2 files changed, 84 insertions(+), 11 deletions(-) diff --git a/skills/autobrowse/scripts/lib/codegen-playwright.mjs b/skills/autobrowse/scripts/lib/codegen-playwright.mjs index 85dd47f3..a9bb4450 100644 --- a/skills/autobrowse/scripts/lib/codegen-playwright.mjs +++ b/skills/autobrowse/scripts/lib/codegen-playwright.mjs @@ -32,6 +32,15 @@ function isRadioSelector(s) { /\[type=radio\]/i.test(s); } +// A selector is "clearly unique" if it's an id-style or has a unique-attribute hint. +// For everything else, codegen emits .first() to avoid Playwright strict-mode violations. +function isUniqueSelector(s) { + if (!s) return false; + // #id, [id=...], [data-testid=...], [name=...] 
+ return /^#[\w-]+$/.test(s.trim()) || + /\[(id|data-testid|data-test|data-cy|name)\s*=/i.test(s); +} + function emitOp(op, snapshots) { const lines = []; const cached = null; @@ -66,8 +75,12 @@ function emitOp(op, snapshots) { // (styled labels commonly intercept actionability checks). if (isRadioSelector(op.selector)) { lines.push(` await forceClickRadio(page.locator(${jsStr(op.selector)}));`); - } else { + } else if (isUniqueSelector(op.selector)) { lines.push(` await page.locator(${jsStr(op.selector)}).click();`); + } else { + // .first() guards against strict-mode violations when the agent emitted + // an ambiguous CSS selector like `button[type=button]` (matches Help / Save Draft / Next Step). + lines.push(` await page.locator(${jsStr(op.selector)}).first().click();`); } stats.cached++; return { lines, cached: { kind: "click", code: `page.locator(${jsStr(op.selector)}).click()`, selector: op.selector, op }, stats }; @@ -144,6 +157,10 @@ function emitOp(op, snapshots) { lines.push(` await forceCheck(${best.code});`); } else if (op.kind === "fill_ref" && role === "checkbox") { lines.push(` await forceCheck(${best.code});`); + } else if (op.kind === "click_ref" && role === "link") { + // SPA links with onClick handlers (tour overlays, route-only handlers) + // often don't navigate via .click(). Falls back to page.goto(href). + lines.push(` await clickLinkWithFallback(page, ${best.code});`); } else { lines.push(` await ${best.code}.${method}${args};`); } @@ -475,6 +492,26 @@ async function reactFill(page: Page, labelPattern: RegExp | string, value: strin }, value); } +/** Click a link with auto-fallback to direct navigation. SPA links on state-agency + * portals frequently have onClick handlers that preventDefault and route via + * client-state (often gated behind tour/onboarding overlays). When the link + * exposes an absolute http(s) href, prefer page.goto over .click — same destination, + * no overlay-intercept risk. 
Falls back to a plain click only for non-routable + * hrefs (e.g. fragment anchors or JS-only handlers). */ +async function clickLinkWithFallback(page: Page, loc: Locator): Promise { + // Use the resolved .href property (absolute URL), not getAttribute("href") + // which returns the raw attribute value (often relative like "/forms/business"). + const href = await loc.first().evaluate((el) => (el as HTMLAnchorElement).href).catch(() => null); + if (href && /^https?:\\/\\//i.test(href)) { + await page.goto(href); + } else { + await loc.first().click({ timeout: 10000 }); + } + // SPAs often finish loading client content well after the load event fires; wait + // for the network to actually settle before returning. + await page.waitForLoadState("networkidle").catch(() => {}); +} + /** Click "Next Step" (or other named button) via find-by-text in page context; * avoids the race where getByRole resolves to a stale element between SPA wizard steps. */ async function clickButtonByText(page: Page, text: string, waitAfterMs = 1500): Promise { diff --git a/skills/autobrowse/scripts/lib/selector-resolver.mjs b/skills/autobrowse/scripts/lib/selector-resolver.mjs index bb9d929a..37b9aa09 100644 --- a/skills/autobrowse/scripts/lib/selector-resolver.mjs +++ b/skills/autobrowse/scripts/lib/selector-resolver.mjs @@ -154,6 +154,21 @@ function findNearbyLabel(node, tree) { return null; } +// Mapping for non-ARIA role names that show up in the browse snapshot tree +// to their ARIA equivalents. Most importantly: