From 1b1fd30ec793d3ef24d9caa3a54ac984821cc972 Mon Sep 17 00:00:00 2001 From: Garry Tan Date: Sat, 25 Apr 2026 21:23:40 -0700 Subject: [PATCH] feat(test): real-PTY harness for plan-mode E2E tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds test/helpers/claude-pty-runner.ts. Spawns the actual claude binary via Bun.spawn({terminal:}) (Bun 1.3.10+ has built-in PTY — no node-pty, no native modules), drives it through stdin/stdout, and parses rendered terminal frames. Pattern adapted from the cc-pty-import branch's terminal-agent.ts but stripped of WS/cookie/Origin scaffolding (not needed for headless tests). Public API: - launchClaudePty(opts) — boots claude with --permission-mode plan|null, auto-handles the workspace-trust dialog, returns a session handle. - session.send / sendKey / waitForAny / waitFor / mark / visibleSince / visibleText / rawOutput / close - runPlanSkillObservation({skillName, inPlanMode, timeoutMs}) — high-level contract for plan-mode skill tests. Returns { outcome, summary, evidence, elapsedMs }. outcome ∈ {asked, plan_ready, silent_write, exited, timeout}. Replaces the SDK-based runPlanModeSkillTest from plan-mode-helpers.ts which never worked. Plan mode renders its native "Ready to execute" confirmation as TTY UI (numbered options with ❯ cursor), not via the AskUserQuestion tool — so the SDK's canUseTool interceptor never fired and the assertion always saw zero questions. Real PTY observes the rendered output directly. Deletes test/helpers/plan-mode-helpers.ts. No production callers remained. Co-Authored-By: Claude Opus 4.7 (1M context) --- test/helpers/claude-pty-runner.ts | 539 ++++++++++++++++++++++++++++++ test/helpers/plan-mode-helpers.ts | 176 ---------- 2 files changed, 539 insertions(+), 176 deletions(-) create mode 100644 test/helpers/claude-pty-runner.ts delete mode 100644 test/helpers/plan-mode-helpers.ts diff --git a/test/helpers/claude-pty-runner.ts b/test/helpers/claude-pty-runner.ts new file mode 100644 index 00000000..aab48e7d --- /dev/null +++ b/test/helpers/claude-pty-runner.ts @@ -0,0 +1,539 @@ +/** + * Real-PTY runner for Claude Code plan-mode E2E tests. + * + * Spawns the actual `claude` binary via `Bun.spawn({terminal:})`, drives + * it through stdin/stdout, parses the rendered terminal frames, and exposes + * primitives the 5 plan-mode tests need. Replaces the SDK-based + * `runPlanModeSkillTest` from plan-mode-helpers.ts which never worked + * because plan mode doesn't use the AskUserQuestion tool — it uses its + * own TTY-rendered native confirmation UI. + * + * Why this exists: the SDK harness intercepts `canUseTool` for + * `AskUserQuestion`. Claude in plan mode renders its "Ready to execute" + * confirmation as a native option list (1-4 numbered options) without + * invoking the AskUserQuestion tool. The SDK never sees it. Real PTY + * does — it shows up as text on screen with `❯` cursor markers. + * + * Architecture: pure Bun.spawn — no node-pty, no native modules, no chmod + * fixes. Bun 1.3.10+ has built-in PTY support via the `terminal:` spawn + * option. Pattern borrowed from cc-pty-import branch's terminal-agent.ts + * (the WS/cookie/Origin scaffolding there is for the browser sidebar; + * tests don't need it). + */ + +import * as fs from 'fs'; +import * as os from 'os'; +import * as path from 'path'; + +/** Strip ANSI escapes for pattern-matching against visible text. */ +export function stripAnsi(s: string): string { + return s + .replace(/\x1b\[[\d;]*[a-zA-Z]/g, '') + .replace(/\x1b\][^\x07\x1b]*(\x07|\x1b\\)/g, '') + .replace(/\x1b[()][AB012]/g, '') + .replace(/\x1b[78=>]/g, ''); +} + +/** Find claude on PATH, with fallback locations. Mirrors terminal-agent.ts. */ +export function resolveClaudeBinary(): string | null { + const override = process.env.BROWSE_TERMINAL_BINARY; + if (override && fs.existsSync(override)) return override; + // eslint-disable-next-line @typescript-eslint/no-explicit-any + const which = (Bun as any).which?.('claude'); + if (which) return which; + const candidates = [ + '/opt/homebrew/bin/claude', + '/usr/local/bin/claude', + `${process.env.HOME}/.local/bin/claude`, + `${process.env.HOME}/.bun/bin/claude`, + `${process.env.HOME}/.npm-global/bin/claude`, + ]; + for (const c of candidates) { + try { + fs.accessSync(c, fs.constants.X_OK); + return c; + } catch { + /* keep searching */ + } + } + return null; +} + +export interface ClaudePtyOptions { + /** + * Permission mode for the session. + * - 'plan' (default) — launches with --permission-mode plan + * - undefined — no --permission-mode flag at all (regular interactive) + * Other valid SDK modes ('default', 'acceptEdits', 'bypassPermissions', + * 'auto', 'dontAsk') are passed through verbatim. + */ + permissionMode?: 'plan' | 'default' | 'acceptEdits' | 'bypassPermissions' | 'auto' | 'dontAsk' | null; + /** Extra args after the permission-mode flag. */ + extraArgs?: string[]; + /** Terminal size. Default 120x40. Plan-mode UI lays out cleanly at this size. */ + cols?: number; + rows?: number; + /** Working directory. Default: process.cwd(). The repo cwd has the gstack + * skill registry and trusted-folder cookie, so most tests want this. */ + cwd?: string; + /** Extra env on top of process.env. */ + env?: Record; + /** Total run timeout (ms). Default 240000 (4 min). */ + timeoutMs?: number; +} + +export interface ClaudePtySession { + /** Send raw bytes to PTY stdin. Newlines = "\r" in TTY world. */ + send(data: string): void; + /** Send a key by name. Limited set used by these tests. */ + sendKey(key: 'Enter' | 'Up' | 'Down' | 'Esc' | 'Tab' | 'ShiftTab' | 'CtrlC'): void; + /** Raw accumulated stdout (with ANSI). For forensics. */ + rawOutput(): string; + /** Visible (ANSI-stripped) output for the entire session. For pattern matching. */ + visibleText(): string; + /** + * Mark the current buffer position. Subsequent waitForAny / visibleSince + * calls only look at output AFTER this mark. Use to scope assertions to + * "after I sent the skill command" — avoids matching against the trust + * dialog or boot banner residue. Returns a marker handle. + */ + mark(): number; + /** Visible text since the most recent (or specific) mark. */ + visibleSince(marker?: number): string; + /** + * Wait for any of the supplied patterns to appear in visibleText. Resolves + * with the first match. Throws on timeout (with last 2KB of visible text). + * If `since` is supplied, only matches text after that mark. + */ + waitForAny( + patterns: Array, + opts?: { timeoutMs?: number; pollMs?: number; since?: number }, + ): Promise<{ matched: RegExp | string; index: number }>; + /** Convenience: single-pattern wait. */ + waitFor( + pattern: RegExp | string, + opts?: { timeoutMs?: number; pollMs?: number; since?: number }, + ): Promise; + /** Process pid (for debug). */ + pid(): number | undefined; + /** Whether the underlying process has exited. */ + exited(): boolean; + /** Exit code, if known. */ + exitCode(): number | null; + /** + * Send SIGINT, then SIGKILL after 1s. Always safe to call multiple times. + * Awaits process exit before resolving. + */ + close(): Promise; +} + +/** Detect the workspace-trust dialog rendering. */ +export function isTrustDialogVisible(visible: string): boolean { + // Phrase Claude Code prints. Stable across versions in this branch's range. + return visible.includes('trust this folder'); +} + +/** Detect plan-mode's native "ready to execute" confirmation. */ +export function isPlanReadyVisible(visible: string): boolean { + return /ready to execute|Would you like to proceed/i.test(visible); +} + +/** Detect any AskUserQuestion-shaped numbered option list with cursor. */ +export function isNumberedOptionListVisible(visible: string): boolean { + // ❯ cursor + at least two numbered options 1-9. + // Matches the trust dialog AND plan-ready prompt AND skill questions. + // Tighter classification happens via scope (after-trust, after-skill-cmd, etc). + return /❯\s*1\./.test(visible) && /\b2\./.test(visible); +} + +/** + * Spawn `claude --permission-mode plan` in a real PTY and return a session + * handle. Caller is responsible for `await session.close()` to release the + * subprocess and any timers. + * + * Auto-handles the workspace-trust dialog (presses "1\r" if it appears + * during the boot window). Tests should NOT have to handle it themselves. + */ +export async function launchClaudePty( + opts: ClaudePtyOptions = {}, +): Promise { + const claudePath = resolveClaudeBinary(); + if (!claudePath) { + throw new Error( + 'claude binary not found on PATH. Install: https://docs.anthropic.com/en/docs/claude-code', + ); + } + + const cwd = opts.cwd ?? process.cwd(); + const cols = opts.cols ?? 120; + const rows = opts.rows ?? 40; + const timeoutMs = opts.timeoutMs ?? 240_000; + + let buffer = ''; + let exited = false; + let exitCodeCaptured: number | null = null; + + // Permission mode: 'plan' default, null => omit flag entirely. + const permissionMode = opts.permissionMode === undefined ? 'plan' : opts.permissionMode; + const args: string[] = []; + if (permissionMode !== null) { + args.push('--permission-mode', permissionMode); + } + if (opts.extraArgs) args.push(...opts.extraArgs); + + // eslint-disable-next-line @typescript-eslint/no-explicit-any + const proc = (Bun as any).spawn([claudePath, ...args], { + terminal: { + cols, + rows, + data(_t: unknown, chunk: Buffer) { + buffer += chunk.toString('utf-8'); + }, + }, + cwd, + env: { ...process.env, ...(opts.env ?? {}) }, + }); + + // Track exit so waitForAny can fail fast if claude crashes. + let exitedPromise: Promise = Promise.resolve(); + if (proc.exited && typeof proc.exited.then === 'function') { + exitedPromise = proc.exited + .then((code: number | null) => { + exitCodeCaptured = code; + exited = true; + }) + .catch(() => { + exited = true; + }); + } + + // Top-level timeout. If a test forgets to close, this kills it eventually. + const wallTimer = setTimeout(() => { + try { + proc.kill?.('SIGKILL'); + } catch { + /* ignore */ + } + }, timeoutMs); + + // Auto-handle the workspace-trust dialog. Runs once during the boot + // window; idempotent (only fires if the phrase is still on screen). + let trustHandled = false; + const trustWatcher = setInterval(() => { + if (trustHandled || exited) return; + const visible = stripAnsi(buffer); + if (isTrustDialogVisible(visible)) { + trustHandled = true; + try { + proc.terminal?.write?.('1\r'); + } catch { + /* ignore */ + } + } + }, 200); + // Stop the watcher after 15s — by then the dialog has either fired or + // doesn't exist on this run. + const trustWatcherStop = setTimeout(() => clearInterval(trustWatcher), 15_000); + + function send(data: string): void { + if (exited) return; + try { + proc.terminal?.write?.(data); + } catch { + /* ignore */ + } + } + + type Key = Parameters[0]; + function sendKey(key: Key): void { + const map: Record = { + Enter: '\r', + Up: '\x1b[A', + Down: '\x1b[B', + Esc: '\x1b', + Tab: '\t', + ShiftTab: '\x1b[Z', + CtrlC: '\x03', + }; + send(map[key] ?? ''); + } + + let lastMark = 0; + function mark(): number { + lastMark = buffer.length; + return lastMark; + } + function visibleSince(marker?: number): string { + const offset = marker ?? lastMark; + return stripAnsi(buffer.slice(offset)); + } + + async function waitForAny( + patterns: Array, + waitOpts?: { timeoutMs?: number; pollMs?: number; since?: number }, + ): Promise<{ matched: RegExp | string; index: number }> { + const wTimeout = waitOpts?.timeoutMs ?? 60_000; + const poll = waitOpts?.pollMs ?? 250; + const since = waitOpts?.since; + const start = Date.now(); + while (Date.now() - start < wTimeout) { + if (exited) { + throw new Error( + `claude exited (code=${exitCodeCaptured}) before any pattern matched. ` + + `Last visible:\n${stripAnsi(buffer).slice(-2000)}`, + ); + } + const visible = since !== undefined ? stripAnsi(buffer.slice(since)) : stripAnsi(buffer); + for (let i = 0; i < patterns.length; i++) { + const p = patterns[i]!; + const matchIdx = typeof p === 'string' ? visible.indexOf(p) : visible.search(p); + if (matchIdx >= 0) { + return { matched: p, index: matchIdx }; + } + } + await Bun.sleep(poll); + } + throw new Error( + `Timed out after ${wTimeout}ms waiting for any of: ${patterns + .map((p) => (typeof p === 'string' ? JSON.stringify(p) : p.source)) + .join(', ')}\nLast visible (since=${since ?? 'all'}):\n${ + since !== undefined ? stripAnsi(buffer.slice(since)).slice(-2000) : stripAnsi(buffer).slice(-2000) + }`, + ); + } + + async function waitFor( + pattern: RegExp | string, + waitOpts?: { timeoutMs?: number; pollMs?: number; since?: number }, + ): Promise { + await waitForAny([pattern], waitOpts); + } + + async function close(): Promise { + clearTimeout(wallTimer); + clearTimeout(trustWatcherStop); + clearInterval(trustWatcher); + if (exited) return; + try { + proc.kill?.('SIGINT'); + } catch { + /* ignore */ + } + // Wait up to 2s for graceful exit. + await Promise.race([exitedPromise, Bun.sleep(2000)]); + if (!exited) { + try { + proc.kill?.('SIGKILL'); + } catch { + /* ignore */ + } + await Promise.race([exitedPromise, Bun.sleep(1000)]); + } + } + + return { + send, + sendKey, + rawOutput: () => buffer, + visibleText: () => stripAnsi(buffer), + mark, + visibleSince, + waitForAny, + waitFor, + pid: () => proc.pid as number | undefined, + exited: () => exited, + exitCode: () => exitCodeCaptured, + close, + }; +} + +/** + * High-level: invoke a slash command and observe the response. Used by the + * 5 plan-mode tests so each only has ~10 LOC of orchestration. + * + * The `expectations` object names the patterns the caller cares about. + * Returns which one matched first (or throws on timeout). + * + * @example + * const session = await launchClaudePty(); + * const result = await invokeAndObserve(session, '/plan-ceo-review', { + * askUserQuestion: /❯\s*1\./, + * planReady: /ready to execute/i, + * silentWrite: /⏺\s*Write\(/, + * silentEdit: /⏺\s*Edit\(/, + * exitedPlanMode: /Exiting plan mode/i, + * }); + * await session.close(); + */ +export async function invokeAndObserve( + session: ClaudePtySession, + slashCommand: string, + expectations: Record, + opts?: { boot_grace_ms?: number; timeoutMs?: number }, +): Promise<{ matched: string; rawPattern: RegExp | string; visibleAtMatch: string }> { + // Brief grace period so the trust-dialog auto-press has time to clear and + // claude is back at the input prompt before we type the command. + const boot = opts?.boot_grace_ms ?? 6000; + await Bun.sleep(boot); + + // Mark buffer position. All pattern matching scopes to text AFTER this point, + // so the trust-dialog residue and boot banner numbered options don't cause + // false positives. + const sinceMark = session.mark(); + + // Type and submit. + session.send(slashCommand + '\r'); + + const patterns = Object.entries(expectations); + const result = await session.waitForAny( + patterns.map(([, p]) => p), + { timeoutMs: opts?.timeoutMs ?? 240_000, since: sinceMark }, + ); + // Map back to the named key. + const idx = patterns.findIndex(([, p]) => p === result.matched); + const [name, rawPattern] = patterns[idx]!; + return { + matched: name, + rawPattern, + visibleAtMatch: session.visibleText(), + }; +} + +// --------------------------------------------------------------------------- +// High-level skill-mode test contract +// --------------------------------------------------------------------------- + +export interface PlanSkillObservation { + /** + * What happened first. One of: + * - 'asked' — skill emitted a numbered-option prompt (its Step 0 + * AskUserQuestion or the routing-injection prompt) + * - 'plan_ready' — claude wrote a plan and emitted its native + * "Ready to execute" confirmation + * - 'silent_write' — a Write/Edit landed BEFORE any prompt, to a path + * outside the sanctioned plan/project directories + * - 'exited' — claude process died before any of the above + * - 'timeout' — none of the above within budget + */ + outcome: 'asked' | 'plan_ready' | 'silent_write' | 'exited' | 'timeout'; + /** Human-readable summary. */ + summary: string; + /** Visible terminal text since the slash command was sent (last 2KB). */ + evidence: string; + /** Wall time (ms) until the outcome was decided. */ + elapsedMs: number; +} + +/** + * The contract for "skill X invoked in plan mode behaves correctly." + * + * PASS: outcome is 'asked' or 'plan_ready'. + * - 'asked' = the skill is gating decisions on the user, as expected. + * - 'plan_ready' = the skill ran end-to-end, wrote a plan file, and + * surfaced claude's native confirmation. Some skills (like + * plan-design-review on a no-UI branch) legitimately reach plan_ready + * without firing AskUserQuestion because they short-circuit. + * + * FAIL: 'silent_write' or 'exited' or 'timeout'. + * + * This replaces the SDK-based runPlanModeSkillTest which never worked + * because plan mode renders its native confirmation as TTY UI, not via + * the AskUserQuestion tool — so canUseTool never fired and the assertion + * counted zero questions. + */ +export async function runPlanSkillObservation(opts: { + /** Skill name, e.g. 'plan-ceo-review'. */ + skillName: string; + /** Whether to launch in plan mode. Default true. The no-op regression + * test sets this false to verify skills work outside plan mode. */ + inPlanMode?: boolean; + /** Working directory. Default process.cwd(). */ + cwd?: string; + /** Total budget for skill to reach a terminal outcome. Default 180000. */ + timeoutMs?: number; +}): Promise { + const startedAt = Date.now(); + const session = await launchClaudePty({ + permissionMode: opts.inPlanMode === false ? null : 'plan', + cwd: opts.cwd, + timeoutMs: (opts.timeoutMs ?? 180_000) + 30_000, + }); + + try { + // Boot grace + trust-dialog auto-handle. + await Bun.sleep(8000); + const since = session.mark(); + session.send(`/${opts.skillName}\r`); + + const budgetMs = opts.timeoutMs ?? 180_000; + const start = Date.now(); + while (Date.now() - start < budgetMs) { + await Bun.sleep(2000); + const visible = session.visibleSince(since); + + if (session.exited()) { + return { + outcome: 'exited', + summary: `claude exited (code=${session.exitCode()}) before reaching a terminal outcome`, + evidence: visible.slice(-2000), + elapsedMs: Date.now() - startedAt, + }; + } + if (visible.includes('Unknown command:')) { + return { + outcome: 'exited', + summary: `claude rejected /${opts.skillName} as unknown command (skill not registered in this cwd)`, + evidence: visible.slice(-2000), + elapsedMs: Date.now() - startedAt, + }; + } + // Silent-write detection: any Write/Edit tool render that targets a + // path OUTSIDE ~/.claude/plans, ~/.gstack/, or the active worktree's + // .gstack/. Plan files and gbrain artifacts are sanctioned. + const writeRe = /⏺\s*(?:Write|Edit)\(([^)]+)\)/g; + let m: RegExpExecArray | null; + while ((m = writeRe.exec(visible)) !== null) { + const target = m[1] ?? ''; + const sanctioned = + target.includes('.claude/plans') || + target.includes('.gstack/') || + target.includes('/.context/') || + target.includes('CHANGELOG.md') || + target.includes('TODOS.md'); + if (!sanctioned && !isNumberedOptionListVisible(visible)) { + return { + outcome: 'silent_write', + summary: `Write/Edit to ${target} fired before any AskUserQuestion`, + evidence: visible.slice(-2000), + elapsedMs: Date.now() - startedAt, + }; + } + } + if (isPlanReadyVisible(visible)) { + return { + outcome: 'plan_ready', + summary: 'skill ran end-to-end and emitted plan-mode "Ready to execute" confirmation', + evidence: visible.slice(-2000), + elapsedMs: Date.now() - startedAt, + }; + } + if (isNumberedOptionListVisible(visible)) { + return { + outcome: 'asked', + summary: 'skill fired a numbered-option prompt (AskUserQuestion or routing-injection)', + evidence: visible.slice(-2000), + elapsedMs: Date.now() - startedAt, + }; + } + } + + return { + outcome: 'timeout', + summary: `no terminal outcome within ${budgetMs}ms`, + evidence: session.visibleSince(since).slice(-2000), + elapsedMs: Date.now() - startedAt, + }; + } finally { + await session.close(); + } +} diff --git a/test/helpers/plan-mode-helpers.ts b/test/helpers/plan-mode-helpers.ts deleted file mode 100644 index cf0025b6..00000000 --- a/test/helpers/plan-mode-helpers.ts +++ /dev/null @@ -1,176 +0,0 @@ -/** - * Shared helpers for plan-mode E2E tests. - * - * Four sibling per-skill smoke tests (plan-ceo, plan-eng, plan-design, plan-devex) - * plus the no-op regression test use this helper. The goal: run a review skill - * in plan mode, confirm it goes straight to its Step 0 AskUserQuestion without - * writing files or calling ExitPlanMode first (the vestigial handshake - * regression we fixed in ceo-plan 2026-04-24). - * - * This file was renamed from `plan-mode-handshake-helpers.ts` when the - * handshake was removed. The write-guard detection (no Write/Edit before the - * first AskUserQuestion) is the load-bearing piece that catches silent - * regressions a simple "first question text matches" check would miss. - */ - -import * as fs from 'fs'; -import * as path from 'path'; -import * as os from 'os'; -import { execSync } from 'child_process'; -import { - runAgentSdkTest, - passThroughNonAskUserQuestion, - resolveClaudeBinary, - type AgentSdkResult, -} from './agent-sdk-runner'; - -/** Distinctive phrase matching what Claude Code's harness actually injects. */ -export const PLAN_MODE_REMINDER = - 'Plan mode is active. The user indicated that they do not want you to execute yet'; - -export interface PlanModeCaptureResult { - sdkResult: AgentSdkResult; - /** Each AskUserQuestion that fired, with its input payload. */ - askUserQuestions: Array<{ input: Record; orderIndex: number }>; - /** Tool-use events in the order they fired (names only). */ - toolOrder: string[]; - /** Whether any Write or Edit tool fired BEFORE the first AskUserQuestion. */ - writeOrEditBeforeAsk: boolean; - /** Whether ExitPlanMode fired BEFORE the first AskUserQuestion. */ - exitPlanModeBeforeAsk: boolean; -} - -/** - * Run a skill via the Agent SDK with canUseTool intercepting every tool use. - * Inject the plan-mode distinctive phrase into the system prompt, auto-answer - * the first AskUserQuestion (so the skill stops cleanly after Step 0), and - * return the captured events for assertion. - */ -export async function runPlanModeSkillTest(opts: { - /** Skill name, e.g. 'plan-ceo-review'. */ - skillName: string; - /** - * For the first AskUserQuestion, pick the option whose label contains this - * substring. Pick a "cheap" answer that terminates the skill quickly (e.g. - * "HOLD SCOPE" for plan-ceo-review). - */ - firstAnswerSubstring: string; - /** If true, DO NOT inject the reminder — used by the no-op regression test. */ - omitPlanModeReminder?: boolean; - /** Max turns for the SDK call (default 4 — Step 0 + answer should fit). */ - maxTurns?: number; -}): Promise { - const { skillName, firstAnswerSubstring, omitPlanModeReminder, maxTurns } = opts; - - const askUserQuestions: PlanModeCaptureResult['askUserQuestions'] = []; - const toolOrder: string[] = []; - let toolIndex = 0; - let firstAskIndex = -1; - - const workingDir = fs.mkdtempSync( - path.join(os.tmpdir(), `plan-mode-${skillName}-`), - ); - - const binary = resolveClaudeBinary(); - - try { - // In real plan mode Claude Code injects a system-reminder; in SDK tests we - // use systemPrompt.append which the model treats as equally authoritative. - const reminderAppend = omitPlanModeReminder - ? '' - : `\n\n\n${PLAN_MODE_REMINDER}. This supercedes any other instructions you have received.\n\n`; - - const sdkResult = await runAgentSdkTest({ - systemPrompt: { - type: 'preset', - preset: 'claude_code', - append: reminderAppend, - }, - userPrompt: `Read the skill file at ${path.resolve( - import.meta.dir, - '..', - '..', - skillName, - 'SKILL.md', - )} and follow its instructions. There is no real plan to review — just start the skill and respond to any AskUserQuestion that fires.`, - workingDirectory: workingDir, - maxTurns: maxTurns ?? 4, - allowedTools: ['Read', 'Grep', 'Glob', 'Bash'], - ...(binary ? { pathToClaudeCodeExecutable: binary } : {}), - canUseTool: async (toolName, input) => { - toolOrder.push(toolName); - if (toolName === 'AskUserQuestion') { - if (firstAskIndex === -1) firstAskIndex = toolIndex; - askUserQuestions.push({ input, orderIndex: toolIndex }); - toolIndex++; - // Auto-answer the FIRST question with the configured substring; for - // later questions, pick the first option to keep the run short. - const q = (input.questions as Array<{ question: string; options: Array<{ label: string }> }>)[0]; - const isFirst = askUserQuestions.length === 1; - const matched = isFirst - ? q.options.find((o) => o.label.toLowerCase().includes(firstAnswerSubstring.toLowerCase())) - : undefined; - const answer = matched ? matched.label : q.options[0]!.label; - return { - behavior: 'allow', - updatedInput: { - questions: input.questions, - answers: { [q.question]: answer }, - }, - }; - } - toolIndex++; - return passThroughNonAskUserQuestion(toolName, input); - }, - }); - - const writeOrEditBeforeAsk = - firstAskIndex > 0 && - toolOrder.slice(0, firstAskIndex).some((t) => t === 'Write' || t === 'Edit'); - - const exitPlanModeBeforeAsk = - firstAskIndex > 0 && - toolOrder.slice(0, firstAskIndex).some((t) => t === 'ExitPlanMode'); - - return { - sdkResult, - askUserQuestions, - toolOrder, - writeOrEditBeforeAsk, - exitPlanModeBeforeAsk, - }; - } finally { - try { - fs.rmSync(workingDir, { recursive: true, force: true }); - } catch { /* ignore cleanup errors */ } - } -} - -/** - * Assert a captured AskUserQuestion is NOT the old vestigial handshake - * (A=exit-and-rerun / C=cancel). The handshake is gone — if a test ever sees - * one again, that's the regression we're guarding against. - */ -export function assertNotHandshakeShape( - aq: { input: Record }, -): void { - const questions = aq.input.questions as Array<{ - question: string; - options: Array<{ label: string }>; - }>; - if (!questions || questions.length === 0) return; - const q = questions[0]!; - const labels = q.options.map((o) => o.label.toLowerCase()); - const looksLikeHandshake = - labels.some((l) => l.includes('exit') && l.includes('rerun')) && - labels.some((l) => l.includes('cancel')); - if (looksLikeHandshake) { - throw new Error( - `First AskUserQuestion looks like the vestigial plan-mode handshake ` + - `(options: ${labels.join(', ')}). The handshake was removed; skills ` + - `should go straight to their Step 0 question in plan mode.`, - ); - } -} - -export { execSync };