diff --git a/skills/autobrowse/SKILL.md b/skills/autobrowse/SKILL.md index d9b0bc7c..0bda47f6 100644 --- a/skills/autobrowse/SKILL.md +++ b/skills/autobrowse/SKILL.md @@ -296,23 +296,35 @@ How refs are resolved: every `[X-Y]` ref in the trace is looked up against the m The final extract step is generated with one Claude Haiku call at export time (requires `ANTHROPIC_API_KEY`). The LLM is given the final snapshot, the Zod schema parsed from `task.md`'s `## Output` block, and the agent's final reasoning. If the API key is missing the export still produces a script — the extract block is a TODO placeholder. -For a Stagehand-targeted export (LLM-driven replay via `stagehand.act`/`observe`), use the standalone `/stagehand-export` skill. +For a Stagehand-targeted export (self-healing replay via `stagehand.page.act` / `stagehand.page.extract`), pass `--target stagehand`: -## Iterative Playwright loop (recommended for tasks that need a deterministic artifact) +```bash +node ${CLAUDE_SKILL_DIR}/scripts/export.mjs --task <name> --target stagehand +``` + +Stagehand-native: every interaction op (clicks, fills, selects) collapses into a `page.act("…")` call. Deterministic ops (goto, waits, keyboard, scroll, eval, page nav) stay as raw `page.*` calls — there's no element to find, so no LLM call is needed. The final extract step uses `page.extract({ instruction, schema })` with a one-sentence instruction generated at export time (Haiku, ~$0.001) or a generic fallback if `ANTHROPIC_API_KEY` is missing. -When the end goal is a runnable Playwright script (cron, Browserbase Functions, etc.), prefer `loop.mjs` over manually orchestrating evaluate + export. The loop converges on a workflow that **both** the LLM explorer **and** the deterministic Playwright replay can complete — which is a strictly stronger guarantee than "the LLM agent's trace ends with success: true." 
+The Stagehand script reads `BROWSERBASE_API_KEY` / `BROWSERBASE_PROJECT_ID` to run against Browserbase (and `BROWSERBASE_CONTEXT_ID` for pre-authed sessions); when those are absent it falls back to `env: "LOCAL"`. Model selection is controlled by the `STAGEHAND_MODEL` env var (defaults to a current Claude Sonnet). + +## Iterative loop (recommended for tasks that need a deterministic artifact) + +When the end goal is a runnable script (cron, Browserbase Functions, etc.), prefer `loop.mjs` over manually orchestrating evaluate + export. The loop converges on a workflow that **both** the LLM explorer **and** the deterministic replay can complete — which is a strictly stronger guarantee than "the LLM agent's trace ends with success: true." ```bash +# Playwright (default) node ${CLAUDE_SKILL_DIR}/scripts/loop.mjs --task <name> --env remote \ --max-iterations 8 --max-turns-per-iter 60 + +# Stagehand +node ${CLAUDE_SKILL_DIR}/scripts/loop.mjs --task <name> --target stagehand --env remote ``` What it does per iteration: 1. Runs `evaluate.mjs` (one LLM-driven exploration round). -2. If the trace passed (`success: true` in the final JSON), runs `export.mjs --target playwright --no-verify` to emit a fresh script. +2. If the trace passed (`success: true` in the final JSON), runs `export.mjs --target <target> --no-verify` to emit a fresh script. 3. Runs the emitted script (`npx tsx <task>.ts`) against a new BB session — the actual deterministic replay. -4. If the Playwright replay passed → records a pass. If it failed → distills the failure (Claude Haiku, ~$0.01) into a new entry under `strategy.md`'s "Recent Playwright Failures" section. +4. If the replay passed → records a pass. If it failed → distills the failure (Claude Haiku, ~$0.01) into a new entry under `strategy.md`'s "Recent Playwright Failures" or "Recent Stagehand Failures" section (target-scoped). 5. Next iteration's evaluate reads the updated strategy.md and adapts. 
**Convergence**: graduates when the emitted script passes in 2 of the last 3 iterations. diff --git a/skills/autobrowse/scripts/export.mjs b/skills/autobrowse/scripts/export.mjs index e6c2fb0e..ce7a3cfd 100755 --- a/skills/autobrowse/scripts/export.mjs +++ b/skills/autobrowse/scripts/export.mjs @@ -4,9 +4,10 @@ * export.mjs — Translate a graduated autobrowse task into a deterministic * runnable script. * - * Currently supports --target playwright. The Stagehand variant lives in - * the standalone /stagehand-export skill; once Playwright is shipped and - * proven we can fold both targets behind this CLI. + * Supports --target playwright (default) and --target stagehand. Playwright + * resolves every ARIA ref to a locator at export time; Stagehand-native + * collapses every interaction op to `page.act(...)` and lets Stagehand + * self-heal at replay time. * * Usage: * node scripts/export.mjs --task --target playwright \\ @@ -26,8 +27,15 @@ import { playwrightPackageJson, playwrightTsconfig, } from "./lib/codegen-playwright.mjs"; +import { + generateStagehandScript, + stagehandPackageJson, + stagehandTsconfig, +} from "./lib/codegen-stagehand.mjs"; import { verifyGenerated } from "./lib/verify.mjs"; +const SUPPORTED_TARGETS = new Set(["playwright", "stagehand"]); + // ── CLI args ─────────────────────────────────────────────────────── function getArg(name, fallback) { @@ -43,7 +51,7 @@ Usage: node scripts/export.mjs --task [options] Options: --task Task name — matches tasks// (required) - --target playwright (default; stagehand lives in /stagehand-export) + --target playwright (default) | stagehand --workspace Workspace root holding tasks/ and traces/ (default: ./autobrowse) --run Force a specific run (default: newest passing) --output Output directory for generated files (default: /tasks//) @@ -69,8 +77,8 @@ if (!TASK) { console.error("Run with --help for usage."); process.exit(1); } -if (TARGET !== "playwright") { - console.error(`ERROR: --target=${TARGET} not yet 
supported here. Use the /stagehand-export skill for Stagehand output.`); +if (!SUPPORTED_TARGETS.has(TARGET)) { + console.error(`ERROR: --target=${TARGET} not supported. Use one of: ${[...SUPPORTED_TARGETS].join(", ")}.`); process.exit(1); } @@ -130,9 +138,10 @@ for (let i = trace.length - 1; i >= 0; i--) { } } -// ── Generate Playwright script ───────────────────────────────────── +// ── Generate script ──────────────────────────────────────────────── -const { scriptCode, cachedActions, stats, extract } = await generatePlaywrightScript({ +const generate = TARGET === "stagehand" ? generateStagehandScript : generatePlaywrightScript; +const { scriptCode, cachedActions, stats, extract } = await generate({ task: TASK, runId, workspace: WORKSPACE, @@ -168,15 +177,21 @@ fs.writeFileSync( 2, ), ); +const pkgGen = TARGET === "stagehand" ? stagehandPackageJson : playwrightPackageJson; +const tsconfigGen = TARGET === "stagehand" ? stagehandTsconfig : playwrightTsconfig; if (!fs.existsSync(pkgPath)) { - fs.writeFileSync(pkgPath, JSON.stringify(playwrightPackageJson(TASK), null, 2)); + fs.writeFileSync(pkgPath, JSON.stringify(pkgGen(TASK), null, 2)); } if (!fs.existsSync(tsconfigPath)) { - fs.writeFileSync(tsconfigPath, JSON.stringify(playwrightTsconfig(), null, 2)); + fs.writeFileSync(tsconfigPath, JSON.stringify(tsconfigGen(), null, 2)); } console.error(`[export] wrote ${path.relative(process.cwd(), scriptPath)}`); -console.error(`[export] ops: ${ops.length} | cached: ${stats.cached} | ref_resolved: ${stats.ref_resolved} | ref_failed: ${stats.ref_failed} | dropped: ${stats.dropped}`); +if (TARGET === "stagehand") { + console.error(`[export] ops: ${ops.length} | deterministic: ${stats.deterministic} | act: ${stats.act} | ref_resolved: ${stats.ref_resolved} | ref_failed: ${stats.ref_failed} | dropped: ${stats.dropped}`); +} else { + console.error(`[export] ops: ${ops.length} | cached: ${stats.cached} | ref_resolved: ${stats.ref_resolved} | ref_failed: ${stats.ref_failed} | 
// ── Helpers ────────────────────────────────────────────────────────

/**
 * Render a value as a double-quoted JavaScript string literal.
 *
 * @param {unknown} s - Value to quote; `null`/`undefined` become the empty string.
 * @returns {string} JSON-escaped literal, safe to paste into generated code.
 */
function jsStr(s) {
  return JSON.stringify(String(s ?? ""));
}

/**
 * Flatten an intent to one trimmed line and cap it at 140 characters so it
 * doesn't bloat act() instructions.
 *
 * @param {unknown} s - Raw intent text (may be missing or non-string).
 * @returns {string} At most 140 characters; longer input ends in "...".
 */
function shortIntent(s) {
  const flat = String(s ?? "").replace(/[\r\n]+/g, " ").trim();
  return flat.length > 140 ? `${flat.slice(0, 137)}...` : flat;
}

/**
 * Describe a selector for an `act` instruction when there is no resolved
 * ARIA node. Stagehand parses natural language, not CSS, so the selector is
 * handed over as a hint, not a directive.
 *
 * @param {string} [sel] - Original CSS selector, if any.
 * @returns {string} Parenthesised hint with a leading space, or "" when absent.
 */
function describeSelector(sel) {
  if (!sel) return "";
  return ` (originally targeted CSS selector \`${sel}\`)`;
}

/**
 * Build the natural-language instruction for one `page.act(...)` call.
 *
 * @param {string} verb - "click", "fill" or "select"; anything else falls back to the intent.
 * @param {{ intent?: string, turn?: number, value?: string, selector?: string }} op - Source op.
 * @param {{ role?: string, name?: string } | null} [resolvedNode] - ARIA node the ref resolved to, if any.
 * @returns {string} Instruction text for Stagehand.
 */
function actInstruction(verb, op, resolvedNode) {
  const intent = shortIntent(op.intent);
  // `turn N` is the placeholder used when no real intent was recorded; it
  // carries no meaning for Stagehand, so it must never become the instruction.
  const hasUsefulIntent = intent !== "" && intent !== `turn ${op.turn}`;
  const intentSuffix = hasUsefulIntent ? ` — ${intent}` : "";
  const value = op.value ?? "";

  if (resolvedNode) {
    const role = resolvedNode.role || "element";
    const name = resolvedNode.name ? ` "${resolvedNode.name}"` : "";
    switch (verb) {
      case "click":
        return `click the ${role}${name}${intentSuffix}`;
      case "fill":
        return `fill the ${role}${name} with "${value}"${intentSuffix}`;
      case "select":
        return `select "${value}" in the ${role}${name}${intentSuffix}`;
      default:
        break; // unknown verb: use the selector/intent fallback below
    }
  }

  // Selector-based fallback — describe by intent + selector hint.
  const hint = describeSelector(op.selector);
  switch (verb) {
    case "click":
      return hasUsefulIntent ? `${intent}${hint}` : `click the element${hint}`;
    case "fill":
      return `fill the field with "${value}"${hint}${intentSuffix}`;
    case "select":
      return `select "${value}" in the dropdown${hint}${intentSuffix}`;
    default:
      return hasUsefulIntent ? intent : verb;
  }
}
// ── Op → Stagehand code ───────────────────────────────────────────

// page_nav verb → the page method that performs it.
const PAGE_NAV_METHODS = new Map([
  ["back", "goBack"],
  ["forward", "goForward"],
  ["reload", "reload"],
]);

/** Flatten a value to one line so it cannot escape a generated `//` comment. */
function oneLine(value) {
  return String(value ?? "").replace(/[\r\n]+/g, " ");
}

/**
 * Translate one op into lines of Stagehand script body.
 *
 * Element-finding ops (click/fill/select, by selector or by ARIA ref) become
 * `page.act(...)` calls; everything else stays a plain `page.*` call or is
 * skipped with a comment.
 *
 * @param {object} op - Op to emit. Reads `kind`, `turn`, `intent`, `section`
 *   and the kind-specific fields (`url`, `ms`, `selector`, `ref`, `text`,
 *   `expression`, `key`, `coords`, `verb`, `command`).
 * @param {unknown} snapshots - Passed through to `resolveOpRef` for `*_ref` ops.
 * @returns {{ lines: string[], cached: object | null, stats: { deterministic: number, act: number, ref_resolved: number, ref_failed: number, dropped: number } }}
 *   Generated source lines, the cache entry for this op (or null), and
 *   per-op counters for the caller to sum.
 */
function emitOp(op, snapshots) {
  const lines = [];
  const stats = { deterministic: 0, act: 0, ref_resolved: 0, ref_failed: 0, dropped: 0 };
  const sec = op.section ? `// [${oneLine(op.section)}] ` : "";
  const intent = oneLine(op.intent).slice(0, 140);
  const hasUsefulIntent = intent !== "" && intent !== `turn ${op.turn}`;
  const header = hasUsefulIntent
    ? `    ${sec}// turn ${op.turn}: ${intent}`
    : (op.section ? `    ${sec}` : null);

  // The header is emitted at most once per op, whichever path reaches it first.
  let headerEmitted = false;
  const pushHeader = () => {
    if (header && !headerEmitted) lines.push(header);
    headerEmitted = true;
  };

  const pushAct = (verb, resolvedNode = null) => {
    pushHeader();
    const instruction = actInstruction(verb, op, resolvedNode);
    lines.push(`    await page.act(${jsStr(instruction)});`);
    stats.act++;
    return { kind: "act", verb, instruction, op };
  };

  switch (op.kind) {
    case "goto":
      pushHeader();
      lines.push(`    await page.goto(${jsStr(op.url)});`);
      stats.deterministic++;
      return { lines, cached: { kind: "goto", url: op.url, op }, stats };

    case "wait_load":
      lines.push(`    await page.waitForLoadState("load");`);
      stats.deterministic++;
      break;

    case "wait_timeout": {
      // Coerce to a number so a malformed trace value cannot inject code;
      // missing, zero or invalid values use the 1000 ms default.
      const ms = Number(op.ms);
      lines.push(`    await page.waitForTimeout(${Number.isFinite(ms) && ms > 0 ? ms : 1000});`);
      stats.deterministic++;
      break;
    }

    case "wait_selector":
      lines.push(`    await page.waitForSelector(${jsStr(op.selector)});`);
      stats.deterministic++;
      break;

    case "click_sel":
      return { lines, cached: pushAct("click"), stats };
    case "fill_sel":
      return { lines, cached: pushAct("fill"), stats };
    case "select_dropdown":
      return { lines, cached: pushAct("select"), stats };

    case "click_ref":
    case "fill_ref":
    case "select_ref": {
      const r = resolveOpRef(op, snapshots);
      const verb = op.kind === "click_ref" ? "click" : op.kind === "fill_ref" ? "fill" : "select";
      if (!r.resolved) {
        // Even without a resolved node we can still try act() with the
        // agent's intent — Stagehand will look at the live page.
        pushHeader();
        lines.push(
          `    // ref ${oneLine(op.ref)} did not resolve in snapshots (${oneLine(r.reason)}); falling back to intent-only act`,
        );
        const cached = pushAct(verb, null);
        stats.ref_failed++;
        return { lines, cached, stats };
      }
      const cached = pushAct(verb, r.node);
      cached.ref = op.ref;
      cached.source_turn = r.sourceTurn;
      cached.node = { role: r.node.role, name: r.node.name, depth: r.node.depth };
      stats.ref_resolved++;
      return { lines, cached, stats };
    }

    case "type_focused":
      lines.push(`    await page.keyboard.type(${jsStr(op.text)});`);
      stats.deterministic++;
      break;

    case "eval": {
      pushHeader();
      // Escape for embedding inside a template literal in the generated file.
      const escaped = (op.expression || "")
        .replace(/\\/g, "\\\\")
        .replace(/`/g, "\\`")
        .replace(/\$\{/g, "\\${");
      lines.push(`    await page.evaluate(\`${escaped}\`);`);
      stats.deterministic++;
      return { lines, cached: { kind: "eval", expression: op.expression, op }, stats };
    }

    case "press":
      lines.push(`    await page.keyboard.press(${jsStr(op.key)});`);
      stats.deterministic++;
      break;

    case "scroll": {
      // Require exactly four finite numbers (numeric strings are accepted);
      // missing coords are reported as malformed instead of throwing.
      const raw = Array.isArray(op.coords) ? op.coords.slice(0, 4) : [];
      const nums = raw.map((v) => (typeof v === "string" && v.trim() !== "" ? Number(v) : v));
      const valid = nums.length === 4 && nums.every((v) => Number.isFinite(v));
      if (!valid) {
        lines.push(`    // skip: malformed scroll ${JSON.stringify(op.coords)}`);
        stats.dropped++;
      } else {
        const [x, y, dx, dy] = nums;
        lines.push(`    await page.mouse.move(${x}, ${y});`);
        lines.push(`    await page.mouse.wheel(${dx}, ${dy});`);
        stats.deterministic++;
      }
      break;
    }

    case "page_nav": {
      const method = PAGE_NAV_METHODS.get(op.verb);
      if (method) {
        lines.push(`    await page.${method}();`);
        stats.deterministic++;
      } else {
        // Nothing is emitted for an unknown verb, so count it as dropped.
        lines.push(`    // skip (page_nav): unknown verb '${oneLine(op.verb)}'`);
        stats.dropped++;
      }
      break;
    }

    case "session":
    case "perception":
      lines.push(`    // skip (${op.kind}): ${oneLine(op.command)}`);
      stats.dropped++;
      break;

    case "unhandled": {
      // Best-effort act() with the intent string; the original verb was
      // something our op walker didn't classify.
      pushHeader();
      lines.push(`    // unhandled browse verb '${oneLine(op.verb)}' — attempting act() with intent`);
      const cached = pushAct("click");
      // NOTE(review): counted under both `act` and `dropped`, as before —
      // confirm whether the export summary should report it only once.
      stats.dropped++;
      return { lines, cached, stats };
    }

    default:
      break; // unknown kinds emit nothing
  }
  return { lines, cached: null, stats };
}
// ── Extract instruction (one tiny LLM call, optional) ─────────────

// Used whenever no LLM-written instruction is available.
const EXTRACT_FALLBACK_INSTRUCTION = `Extract the final result from the page that matches the provided schema. Pull every field directly from visible page content.`;

/**
 * Strip Markdown wrapping the model may add around its one-sentence answer.
 * The opening code fence (with optional language tag) is removed first;
 * stripping backticks first would leave the tag ("text", "json") glued to
 * the front of the instruction.
 */
function cleanExtractInstruction(text) {
  return text
    .trim()
    .replace(/^```[a-z0-9_-]*[ \t]*\r?\n/i, "") // opening fence line only
    .replace(/^["'`]+|["'`]+$/g, "") // stray quotes and the closing fence
    .trim();
}

/**
 * Produce the `instruction` string for the generated `page.extract(...)` call.
 *
 * Makes one Claude Haiku request when `ANTHROPIC_API_KEY` is set; otherwise,
 * or on any failure or empty reply, returns a generic fallback. Never throws
 * for API errors.
 *
 * @param {object} args
 * @param {unknown} args.outputShape - Expected output shape, embedded as JSON in the prompt.
 * @param {string} [args.taskMd] - task.md contents (first 1500 chars are used).
 * @param {string} [args.finalReasoning] - Agent's final reasoning (first 800 chars are used).
 * @returns {Promise<{ instruction: string, generated: boolean, reason: string | null }>}
 *   `generated` is true only when the instruction came from the LLM;
 *   `reason` explains why the fallback was used.
 */
async function generateExtractInstruction({ outputShape, taskMd, finalReasoning }) {
  const fallback = (reason) => ({ instruction: EXTRACT_FALLBACK_INSTRUCTION, generated: false, reason });

  if (!process.env.ANTHROPIC_API_KEY) {
    return fallback("no ANTHROPIC_API_KEY");
  }

  const prompt = `You are writing ONE natural-language instruction for \`stagehand.page.extract({ instruction, schema })\`. Stagehand will read the live page and populate a Zod schema. Your instruction should tell it which data to pull.

Task description (excerpt from task.md):
\`\`\`
${String(taskMd || "").slice(0, 1500)}
\`\`\`

Expected output shape:
\`\`\`json
${JSON.stringify(outputShape, null, 2)}
\`\`\`

Agent's final reasoning (context — do NOT copy values from it):
${finalReasoning ? String(finalReasoning).slice(0, 800) : "(none)"}

Write ONE instruction sentence (max 50 words) telling Stagehand what to extract from the page. Reference the schema fields by name. Do not include the schema itself, code, or markdown — just the sentence.`;

  try {
    const client = new Anthropic();
    const resp = await client.messages.create({
      model: "claude-haiku-4-5-20251001",
      max_tokens: 400,
      messages: [{ role: "user", content: prompt }],
    });
    const text = resp.content.find((b) => b.type === "text")?.text?.trim() ?? "";
    const cleaned = text ? cleanExtractInstruction(text) : "";
    // A reply that is empty after cleaning is not an LLM-generated
    // instruction; report it as a fallback.
    if (!cleaned) return fallback("empty LLM response");
    return { instruction: cleaned, generated: true, reason: null };
  } catch (err) {
    return fallback(String(err?.message || err));
  }
}
// ── Script wrapper ────────────────────────────────────────────────

/**
 * Wrap the emitted body lines in a complete, runnable Stagehand TypeScript file.
 *
 * @param {object} args
 * @param {string} args.task - Task name (used in the header comment).
 * @param {string} args.runId - Run the script was generated from.
 * @param {string} args.workspace - Workspace root (used in the header comment).
 * @param {string} args.zodSchema - Zod schema source, inserted verbatim as `OutputSchema`.
 * @param {string} args.body - Pre-indented statements placed inside `main()`'s try block.
 * @param {string} args.extractInstruction - Instruction for the final `page.extract` call.
 * @returns {string} Full TypeScript source.
 */
function wrapScript({ task, runId, workspace, zodSchema, body, extractInstruction }) {
  // These values land inside `//` comments; flatten them so an embedded line
  // break cannot turn the rest of the value into code.
  const flat = (v) => String(v ?? "").replace(/[\r\n]+/g, " ");
  const taskC = flat(task);
  const runC = flat(runId);
  const wsC = flat(workspace);

  // NOTE(review): the generated script exits 0 only if `success` survives
  // `OutputSchema.parse` — confirm the schema always declares that field.
  return `// Generated by autobrowse export --target stagehand from ${runC}.
// Source: ${wsC}/tasks/${taskC}/{task.md, strategy.md} + traces/${taskC}/${runC}/trace.json
// Hand-edit freely. selectors.cache.json mirrors the act() instructions per turn.
//
// Stagehand-native: every interactive step is a page.act() call so the script
// self-heals across DOM drift. Deterministic steps (goto, waits, keyboard,
// scroll) stay as raw Playwright page.* calls.
import { Stagehand } from "@browserbasehq/stagehand";
import { z } from "zod";
import "dotenv/config";

const OutputSchema = ${zodSchema};
type Output = z.infer<typeof OutputSchema>;

const MODEL_NAME = process.env.STAGEHAND_MODEL ?? "claude-sonnet-4-5-20250929";

function createStagehand(): Stagehand {
  const ctxId = process.env.BROWSERBASE_CONTEXT_ID;
  const useBrowserbase = Boolean(process.env.BROWSERBASE_API_KEY && process.env.BROWSERBASE_PROJECT_ID);

  if (!useBrowserbase) {
    return new Stagehand({ env: "LOCAL", modelName: MODEL_NAME });
  }

  return new Stagehand({
    env: "BROWSERBASE",
    apiKey: process.env.BROWSERBASE_API_KEY!,
    projectId: process.env.BROWSERBASE_PROJECT_ID!,
    modelName: MODEL_NAME,
    browserbaseSessionCreateParams: {
      projectId: process.env.BROWSERBASE_PROJECT_ID!,
      browserSettings: {
        ...(ctxId ? { context: { id: ctxId, persist: true } } : {}),
        advancedStealth: true,
        solveCaptchas: true,
      },
    },
  });
}

async function main(): Promise<Output> {
  const stagehand = createStagehand();
  await stagehand.init();
  const page = stagehand.page;

  try {
${body}

    const result = await page.extract({
      instruction: ${jsStr(extractInstruction)},
      schema: OutputSchema,
    });
    return OutputSchema.parse({ ...result, success: true });
  } finally {
    await stagehand.close();
  }
}

main()
  .then((result) => {
    console.log(JSON.stringify(result, null, 2));
    process.exit((result as { success?: boolean })?.success === true ? 0 : 2);
  })
  .catch((err) => {
    console.error("FATAL:", err);
    console.log(JSON.stringify({ success: false, error: String(err) }));
    process.exit(1);
  });
`;
}

// ── Top-level entry ───────────────────────────────────────────────

/**
 * Generate a Stagehand replay script from a run's ops.
 *
 * Emits code for each op, sums the per-op counters, obtains the extract
 * instruction (LLM or fallback) and wraps everything into one source file.
 *
 * @param {object} args
 * @param {string} args.task
 * @param {string} args.runId
 * @param {string} args.workspace
 * @param {unknown} args.trace - Passed to `collectSnapshots`.
 * @param {object[]} args.ops - Ops to translate, in order.
 * @param {string} args.zodSchema - Zod schema source for the output.
 * @param {unknown} args.outputShape
 * @param {string} [args.taskMd]
 * @param {string} [args.finalReasoning]
 * @returns {Promise<{ scriptCode: string, cachedActions: object[], stats: object, extract: { generated: boolean, reason: string | null, instruction: string } }>}
 */
export async function generateStagehandScript({
  task,
  runId,
  workspace,
  trace,
  ops,
  zodSchema,
  outputShape,
  taskMd,
  finalReasoning,
}) {
  const snapshots = collectSnapshots(trace);
  const bodyLines = [];
  const cachedActions = [];
  const stats = { deterministic: 0, act: 0, ref_resolved: 0, ref_failed: 0, dropped: 0 };

  for (const op of ops) {
    const emitted = emitOp(op, snapshots);
    bodyLines.push(...emitted.lines);
    if (emitted.cached) {
      cachedActions.push({ turn: op.turn, intent: op.intent, section: op.section, ...emitted.cached });
    }
    for (const key of Object.keys(stats)) {
      stats[key] += emitted.stats[key];
    }
  }

  const extract = await generateExtractInstruction({ outputShape, taskMd, finalReasoning });

  const scriptCode = wrapScript({
    task,
    runId,
    workspace,
    zodSchema,
    body: bodyLines.join("\n"),
    extractInstruction: extract.instruction,
  });

  return {
    scriptCode,
    cachedActions,
    stats,
    extract: { generated: extract.generated, reason: extract.reason, instruction: extract.instruction },
  };
}

// ── Scaffold files ────────────────────────────────────────────────

/**
 * package.json contents for the generated Stagehand project.
 *
 * @param {string} task - Task name; used for the package name and the `start` script.
 * @returns {object} Plain object ready for `JSON.stringify`.
 */
export function stagehandPackageJson(task) {
  return {
    name: `${task}-stagehand`,
    version: "0.0.1",
    private: true,
    type: "module",
    scripts: { start: `tsx ${task}.ts` },
    dependencies: {
      "@browserbasehq/stagehand": "^2.0.0",
      zod: "^3.23.0",
      dotenv: "^16.4.0",
    },
    devDependencies: {
      tsx: "^4.7.0",
      typescript: "^5.4.0",
      "@types/node": "^20.0.0",
    },
  };
}

/**
 * tsconfig.json contents for the generated Stagehand project.
 *
 * @returns {object} Plain object ready for `JSON.stringify`.
 */
export function stagehandTsconfig() {
  return {
    compilerOptions: {
      target: "ES2022",
      module: "ESNext",
      moduleResolution: "Bundler",
      lib: ["ES2022", "DOM"],
      types: ["node"],
      strict: true,
      esModuleInterop: true,
      skipLibCheck: true,
    },
  };
}
import Anthropic from "@anthropic-ai/sdk"; import * as fs from "node:fs"; -const FALLBACK = (iter, exitCode, stderrSnip) => - `### Iteration ${iter} — Playwright replay failed (exit ${exitCode}) +function targetLabel(target) { + return target === "stagehand" ? "Stagehand" : "Playwright"; +} + +const FALLBACK = (iter, exitCode, stderrSnip, target) => + `### Iteration ${iter} — ${targetLabel(target)} replay failed (exit ${exitCode}) \`\`\` ${stderrSnip.slice(0, 800)} @@ -22,6 +27,7 @@ ${stderrSnip.slice(0, 800)} export async function distillFailure({ iteration, taskName, + target = "playwright", scriptPath, exitCode, stdout = "", @@ -33,7 +39,7 @@ export async function distillFailure({ if (!process.env.ANTHROPIC_API_KEY) { return { - addendum: FALLBACK(iteration, exitCode, stderrSnip), + addendum: FALLBACK(iteration, exitCode, stderrSnip, target), generated: false, reason: "no ANTHROPIC_API_KEY", }; @@ -58,7 +64,14 @@ export async function distillFailure({ /* best-effort */ } - const prompt = `A deterministic Playwright replay script for task "${taskName}" just failed mid-replay. You are writing one short Markdown entry that will be appended to that task's \`strategy.md\` so the next iteration of the explorer agent can learn from this failure. + const tLabel = targetLabel(target); + const targetGuidance = target === "stagehand" + ? `This is a Stagehand-native script: every interactive step is a \`page.act("...")\` call that lets the model self-heal across DOM drift. 
Failures usually mean either (a) the act() instruction was too vague for the model to pick the right element, (b) the page state wasn't ready when act() ran (need a wait), or (c) the \`page.extract({ instruction, schema })\` step couldn't find the fields.\n\nFixes to suggest: rewrite the act() instruction to reference a more specific element (role, name, surrounding text); insert a \`page.waitForLoadState\` / \`page.waitForTimeout\` before the failing act(); restructure the extract instruction to name fields explicitly.` + : `This is a deterministic Playwright script with resolved locators. Failures usually mean (a) the locator broke (DOM drift, role/name changed), (b) actionability check failed (disabled, intercepted by overlay, off-screen), or (c) a timing issue (element rendered too late, or stale after re-render).\n\nFixes to suggest: force-click via .click({force:true}); use eval-find-by-text instead of getByRole; add a waitForTimeout before the action; swap to a fallback locator from selectors.cache.json.`; + + const prompt = `A deterministic ${tLabel} replay script for task "${taskName}" just failed mid-replay. You are writing one short Markdown entry that will be appended to that task's \`strategy.md\` so the next iteration of the explorer agent can learn from this failure. + +${targetGuidance} Exit code: ${exitCode} Script path: ${scriptPath} @@ -74,11 +87,11 @@ Write a tight Markdown entry with this exact structure (no surrounding prose, no ### Iteration ${iteration} — -- **What failed**: -- **Likely cause**: for ~3s after the prior fill", "styled label intercepts pointer events on the underlying input", "selector resolved to a stale ref after a re-render"> -- **Fix to try next iteration**: +- **What failed**: +- **Likely cause**: +- **Fix to try next iteration**: -Keep it under 80 words total. Be specific. Reference the actual locator or line number when you can.`; +Keep it under 80 words total. Be specific. 
Reference the actual locator, act() instruction, or line number when you can.`; try { const client = new Anthropic(); @@ -90,7 +103,7 @@ Keep it under 80 words total. Be specific. Reference the actual locator or line const text = resp.content.find((b) => b.type === "text")?.text?.trim() ?? ""; if (!text || !text.startsWith("###")) { return { - addendum: FALLBACK(iteration, exitCode, stderrSnip), + addendum: FALLBACK(iteration, exitCode, stderrSnip, target), generated: false, reason: "LLM output did not match expected heading", }; @@ -98,21 +111,21 @@ Keep it under 80 words total. Be specific. Reference the actual locator or line return { addendum: text + "\n", generated: true, reason: null }; } catch (err) { return { - addendum: FALLBACK(iteration, exitCode, stderrSnip), + addendum: FALLBACK(iteration, exitCode, stderrSnip, target), generated: false, reason: String(err?.message || err), }; } } -// Append an addendum to strategy.md under the "Recent Playwright Failures" +// Append an addendum to strategy.md under the "Recent Failures" // section. Creates the section if it doesn't exist. -export function appendToStrategy(strategyPath, addendum) { - const SECTION_HEADER = "## Recent Playwright Failures"; +export function appendToStrategy(strategyPath, addendum, target = "playwright") { + const SECTION_HEADER = `## Recent ${targetLabel(target)} Failures`; let md = fs.existsSync(strategyPath) ? fs.readFileSync(strategyPath, "utf-8") : ""; if (!md.trim()) { - md = `# Navigation Strategy\n\n## Navigation Heuristics\n\n(grows as the explorer learns)\n\n## Codegen Hints\n\n(per-task overrides the Playwright codegen should apply)\n\n${SECTION_HEADER}\n\n${addendum}`; + md = `# Navigation Strategy\n\n## Navigation Heuristics\n\n(grows as the explorer learns)\n\n## Codegen Hints\n\n(per-task overrides the codegen should apply)\n\n${SECTION_HEADER}\n\n${addendum}`; } else if (md.includes(SECTION_HEADER)) { // Insert addendum right after the section header (newest first). 
md = md.replace(SECTION_HEADER, `${SECTION_HEADER}\n\n${addendum.trim()}\n`); diff --git a/skills/autobrowse/scripts/lib/pick-run.mjs b/skills/autobrowse/scripts/lib/pick-run.mjs index ae08c957..d641fa61 100644 --- a/skills/autobrowse/scripts/lib/pick-run.mjs +++ b/skills/autobrowse/scripts/lib/pick-run.mjs @@ -1,7 +1,7 @@ // pick-run.mjs — choose which autobrowse run to mine for export. // // A run is "passing" when its summary.md's final JSON has `success: true`. -// Lifted from stagehand-export/scripts/export.mjs. +// Shared by all target codegens (playwright, stagehand). import * as fs from "node:fs"; import * as path from "node:path"; diff --git a/skills/autobrowse/scripts/loop.mjs b/skills/autobrowse/scripts/loop.mjs index 9abf533b..ba4f7f52 100755 --- a/skills/autobrowse/scripts/loop.mjs +++ b/skills/autobrowse/scripts/loop.mjs @@ -1,25 +1,27 @@ #!/usr/bin/env node /** - * loop.mjs — Iterative autobrowse + Playwright verification. + * loop.mjs — Iterative autobrowse + deterministic verification. * * Wraps the existing evaluate.mjs and export.mjs into a single loop that * converges on a workflow which BOTH the LLM explorer and the deterministic - * Playwright replay can complete. Each iteration: + * replay can complete. Each iteration: * * 1. Run evaluate.mjs (the inner LLM agent) * 2. If the trace passed (success: true in final JSON), run export.mjs to - * emit a Playwright script and replay it against a fresh BB session. - * 3. If the Playwright replay also passed → record a pass. + * emit a script for the chosen target (--target playwright|stagehand) + * and replay it against a fresh BB session. + * 3. If the replay also passed → record a pass. * Else → distill the failure into strategy.md and continue. - * 4. Graduate when Playwright has passed in 2 of the last 3 iterations. + * 4. Graduate when the replay has passed in 2 of the last 3 iterations. * * The shared `strategy.md` is the convergence point. The explorer reads it - * each iteration. 
The codegen (eventually) reads its "Codegen Hints" section. - * Playwright failures land in "Recent Playwright Failures". + * each iteration. The codegen reads its "Codegen Hints" section. Replay + * failures land in "Recent Failures". * * Usage: - * node scripts/loop.mjs --task [--max-iterations N] [--max-turns-per-iter N] + * node scripts/loop.mjs --task [--target playwright|stagehand] + * [--max-iterations N] [--max-turns-per-iter N] * [--workspace ./autobrowse] [--env local|remote] */ @@ -44,21 +46,22 @@ function getArg(name, fallback) { const hasFlag = (n) => process.argv.includes(`--${n}`); if (hasFlag("help") || hasFlag("h")) { - console.log(`autobrowse loop — iterate evaluate + Playwright verification until convergence + console.log(`autobrowse loop — iterate evaluate + deterministic replay until convergence Usage: node scripts/loop.mjs --task [options] Options: --task Task name — matches tasks// (required) + --target playwright (default) | stagehand --max-iterations N Cap on outer iterations (default: 8) --max-turns-per-iter N Per-evaluate turn budget (default: 60) --workspace Default: ./autobrowse --env local|remote Default: local (use remote for bot-protected sites) - --skip-verify Skip the Playwright verify step (still emit script) + --skip-verify Skip the replay verify step (still emit script) -Convergence: graduates when the emitted Playwright script passes in 2 of the -last 3 iterations. Until then, each Playwright failure is distilled into -strategy.md so the next evaluate run can adapt. +Convergence: graduates when the emitted script passes in 2 of the last 3 +iterations. Until then, each replay failure is distilled into strategy.md so +the next evaluate run can adapt. 
Env vars: ANTHROPIC_API_KEY Required for evaluate + distillation + LLM extract @@ -69,6 +72,7 @@ Env vars: } const TASK = getArg("task"); +const TARGET = getArg("target", "playwright"); const MAX_ITER = parseInt(getArg("max-iterations", "8"), 10); const MAX_TURNS_PER_ITER = parseInt(getArg("max-turns-per-iter", "60"), 10); const WORKSPACE = path.resolve(getArg("workspace", "autobrowse")); @@ -79,6 +83,11 @@ if (!TASK) { console.error("ERROR: --task is required. Run with --help."); process.exit(1); } +if (TARGET !== "playwright" && TARGET !== "stagehand") { + console.error(`ERROR: --target=${TARGET} not supported. Use playwright or stagehand.`); + process.exit(1); +} +const TARGET_LABEL = TARGET === "stagehand" ? "Stagehand" : "Playwright"; // ── Paths ────────────────────────────────────────────────────────── @@ -87,8 +96,8 @@ const exportScript = path.join(SKILL_DIR, "scripts", "export.mjs"); const taskDir = path.join(WORKSPACE, "tasks", TASK); const tracesDir = path.join(WORKSPACE, "traces", TASK); const strategyPath = path.join(taskDir, "strategy.md"); -const playwrightDir = path.join(taskDir, "playwright"); -const playwrightScript = path.join(playwrightDir, `${TASK}.ts`); +const targetDir = path.join(taskDir, TARGET); +const targetScript = path.join(targetDir, `${TASK}.ts`); if (!fs.existsSync(taskDir)) { console.error(`ERROR: ${taskDir} does not exist. 
Create task.md first (see SKILL.md).`); @@ -99,7 +108,7 @@ fs.mkdirSync(path.join(WORKSPACE, "reports"), { recursive: true }); const reportPath = path.join( WORKSPACE, "reports", - `loop-${TASK}-${new Date().toISOString().replace(/[:.]/g, "-").slice(0, 19)}.md`, + `loop-${TASK}-${TARGET}-${new Date().toISOString().replace(/[:.]/g, "-").slice(0, 19)}.md`, ); // ── Helpers ──────────────────────────────────────────────────────── @@ -144,12 +153,12 @@ function tracePassed(runId) { } function runExport(runId) { - log(`exporting Playwright script from ${runId}…`); + log(`exporting ${TARGET_LABEL} script from ${runId}…`); const args = [ exportScript, "--task", TASK, "--workspace", WORKSPACE, - "--target", "playwright", + "--target", TARGET, "--run", runId, "--no-verify", // we run the verification ourselves below so we can capture/distill output ]; @@ -160,12 +169,12 @@ function runExport(runId) { return result.status === 0; } -function runPlaywright() { - log(`replaying Playwright script…`); +function runReplay() { + log(`replaying ${TARGET_LABEL} script…`); // Ensure deps are installed (first iter only is slow; npm caches after). 
- if (!fs.existsSync(path.join(playwrightDir, "node_modules"))) { + if (!fs.existsSync(path.join(targetDir, "node_modules"))) { const install = spawnSync("npm", ["install", "--silent"], { - cwd: playwrightDir, + cwd: targetDir, stdio: ["ignore", "inherit", "inherit"], }); if (install.status !== 0) { @@ -173,7 +182,7 @@ function runPlaywright() { } } const run = spawnSync("npx", ["tsx", `${TASK}.ts`], { - cwd: playwrightDir, + cwd: targetDir, encoding: "utf-8", stdio: ["ignore", "pipe", "pipe"], env: process.env, @@ -194,10 +203,10 @@ function runPlaywright() { // ── Main loop ────────────────────────────────────────────────────── -const history = []; // [{ iter, runId, evalPassed, pwPassed, distillReason }] +const history = []; // [{ iter, runId, evalPassed, replayPassed, distillReason }] async function main() { - log(`task=${TASK} workspace=${WORKSPACE} env=${ENV} max-iter=${MAX_ITER}`); + log(`task=${TASK} target=${TARGET} workspace=${WORKSPACE} env=${ENV} max-iter=${MAX_ITER}`); for (let iter = 1; iter <= MAX_ITER; iter++) { log(`──────── iteration ${iter}/${MAX_ITER} ────────`); @@ -208,76 +217,77 @@ async function main() { const evalPassed = runId ? tracePassed(runId) : false; log(`iter ${iter}: evaluate ${evalPassed ? "✅ passed" : "❌ no success: true"} (run=${runId ?? "?"})`); - const hist = { iter, runId, evalPassed, pwPassed: false, distillReason: null }; + const hist = { iter, runId, evalPassed, replayPassed: false, distillReason: null }; history.push(hist); if (!evalPassed) { - log(`iter ${iter}: skipping Playwright (trace not passing) — agent will iterate next round`); + log(`iter ${iter}: skipping ${TARGET_LABEL} (trace not passing) — agent will iterate next round`); continue; } - // 2. Emit Playwright (overwrites previous if any) + // 2. 
Emit script (overwrites previous if any) const exportOk = runExport(runId); if (!exportOk) { - log(`iter ${iter}: export failed; treating as Playwright fail`); + log(`iter ${iter}: export failed; treating as ${TARGET_LABEL} fail`); hist.distillReason = "export script returned non-zero"; continue; } if (SKIP_VERIFY) { - log(`iter ${iter}: --skip-verify set; not running Playwright`); + log(`iter ${iter}: --skip-verify set; not replaying ${TARGET_LABEL}`); continue; } - // 3. Run Playwright - const pw = runPlaywright(); - hist.pwPassed = pw.passed; - log(`iter ${iter}: Playwright ${pw.passed ? "✅ passed" : `❌ failed (exit=${pw.exitCode})`}`); + // 3. Run replay + const replay = runReplay(); + hist.replayPassed = replay.passed; + log(`iter ${iter}: ${TARGET_LABEL} ${replay.passed ? "✅ passed" : `❌ failed (exit=${replay.exitCode})`}`); - if (!pw.passed) { + if (!replay.passed) { // 4. Distill the failure into strategy.md - log(`iter ${iter}: distilling Playwright failure into strategy.md…`); + log(`iter ${iter}: distilling ${TARGET_LABEL} failure into strategy.md…`); const { addendum, generated, reason } = await distillFailure({ iteration: iter, taskName: TASK, - scriptPath: playwrightScript, - exitCode: pw.exitCode, - stdout: pw.stdout, - stderr: pw.stderr, + target: TARGET, + scriptPath: targetScript, + exitCode: replay.exitCode, + stdout: replay.stdout, + stderr: replay.stderr, }); - appendToStrategy(strategyPath, addendum); + appendToStrategy(strategyPath, addendum, TARGET); hist.distillReason = generated ? "LLM-summarized" : `fallback: ${reason}`; log(`iter ${iter}: strategy.md updated (${hist.distillReason})`); } - // 5. Convergence check — Playwright passed in 2 of last 3 iterations? + // 5. Convergence check — replay passed in 2 of last 3 iterations? 
const last3 = history.slice(-3); - const passes = last3.filter((h) => h.pwPassed).length; + const passes = last3.filter((h) => h.replayPassed).length; if (passes >= 2 && history.length >= 2) { - log(`🎓 GRADUATED: Playwright passed in ${passes} of last ${last3.length} iterations`); + log(`🎓 GRADUATED: ${TARGET_LABEL} passed in ${passes} of last ${last3.length} iterations`); break; } } // ── Write report ───────────────────────────────────────────────── - const passedCount = history.filter((h) => h.pwPassed).length; + const passedCount = history.filter((h) => h.replayPassed).length; const lines = [ - `# autobrowse loop report — ${TASK}`, + `# autobrowse loop report — ${TASK} (${TARGET})`, ``, `**Total iterations:** ${history.length}`, - `**Playwright passes:** ${passedCount}`, + `**${TARGET_LABEL} passes:** ${passedCount}`, `**Final status:** ${passedCount >= 2 ? "✅ graduated" : "❌ did not converge"}`, ``, `## Per-iteration`, ``, - `| Iter | Run | Trace passed | Playwright passed | Distill |`, - `|------|-----|--------------|-------------------|---------|`, + `| Iter | Run | Trace passed | ${TARGET_LABEL} passed | Distill |`, + `|------|-----|--------------|----------------------|---------|`, ...history.map((h) => - `| ${h.iter} | ${h.runId ?? "?"} | ${h.evalPassed ? "✅" : "❌"} | ${h.pwPassed ? "✅" : "❌"} | ${h.distillReason ?? "—"} |`, + `| ${h.iter} | ${h.runId ?? "?"} | ${h.evalPassed ? "✅" : "❌"} | ${h.replayPassed ? "✅" : "❌"} | ${h.distillReason ?? "—"} |`, ), ``, `Strategy file: \`${strategyPath}\``, - passedCount >= 1 ? `Latest emitted script: \`${playwrightScript}\`` : "", + passedCount >= 1 ? 
`Latest emitted script: \`${targetScript}\`` : "", ]; fs.writeFileSync(reportPath, lines.filter(Boolean).join("\n") + "\n"); log(`wrote report → ${reportPath}`); @@ -285,12 +295,13 @@ async function main() { // Final structured stdout console.log(JSON.stringify({ task: TASK, + target: TARGET, iterations: history.length, - pw_passes: passedCount, + replay_passes: passedCount, graduated: passedCount >= 2, history, report: reportPath, - script: passedCount >= 1 ? playwrightScript : null, + script: passedCount >= 1 ? targetScript : null, }, null, 2)); process.exit(passedCount >= 2 ? 0 : 2);