mirror of
https://github.com/garrytan/gstack.git
synced 2026-05-02 03:35:09 +02:00
feat(test): real-PTY harness for plan-mode E2E tests
Adds test/helpers/claude-pty-runner.ts. Spawns the actual claude binary
via Bun.spawn({terminal:}) (Bun 1.3.10+ has built-in PTY — no node-pty,
no native modules), drives it through stdin/stdout, and parses rendered
terminal frames. Pattern adapted from the cc-pty-import branch's
terminal-agent.ts but stripped of WS/cookie/Origin scaffolding (not
needed for headless tests).
Public API:
- launchClaudePty(opts) — boots claude with --permission-mode plan|null,
auto-handles the workspace-trust dialog, returns a session handle.
- session.send / sendKey / waitForAny / waitFor / mark / visibleSince /
visibleText / rawOutput / close
- runPlanSkillObservation({skillName, inPlanMode, timeoutMs}) — high-level
contract for plan-mode skill tests. Returns { outcome, summary, evidence,
elapsedMs }. outcome ∈ {asked, plan_ready, silent_write, exited, timeout}.
Replaces the SDK-based runPlanModeSkillTest from plan-mode-helpers.ts
which never worked. Plan mode renders its native "Ready to execute"
confirmation as TTY UI (numbered options with ❯ cursor), not via the
AskUserQuestion tool — so the SDK's canUseTool interceptor never fired
and the assertion always saw zero questions. Real PTY observes the
rendered output directly.
Deletes test/helpers/plan-mode-helpers.ts. No production callers remained.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,539 @@
|
||||
/**
|
||||
* Real-PTY runner for Claude Code plan-mode E2E tests.
|
||||
*
|
||||
* Spawns the actual `claude` binary via `Bun.spawn({terminal:})`, drives
|
||||
* it through stdin/stdout, parses the rendered terminal frames, and exposes
|
||||
* primitives the 5 plan-mode tests need. Replaces the SDK-based
|
||||
* `runPlanModeSkillTest` from plan-mode-helpers.ts which never worked
|
||||
* because plan mode doesn't use the AskUserQuestion tool — it uses its
|
||||
* own TTY-rendered native confirmation UI.
|
||||
*
|
||||
* Why this exists: the SDK harness intercepts `canUseTool` for
|
||||
* `AskUserQuestion`. Claude in plan mode renders its "Ready to execute"
|
||||
* confirmation as a native option list (1-4 numbered options) without
|
||||
* invoking the AskUserQuestion tool. The SDK never sees it. Real PTY
|
||||
* does — it shows up as text on screen with `❯` cursor markers.
|
||||
*
|
||||
* Architecture: pure Bun.spawn — no node-pty, no native modules, no chmod
|
||||
* fixes. Bun 1.3.10+ has built-in PTY support via the `terminal:` spawn
|
||||
* option. Pattern borrowed from cc-pty-import branch's terminal-agent.ts
|
||||
* (the WS/cookie/Origin scaffolding there is for the browser sidebar;
|
||||
* tests don't need it).
|
||||
*/
|
||||
|
||||
import * as fs from 'fs';
|
||||
import * as os from 'os';
|
||||
import * as path from 'path';
|
||||
|
||||
/**
 * Strip terminal escape sequences so patterns can be matched against the
 * text a user would actually see.
 *
 * Removed, in order:
 *  - CSI sequences (`ESC [` + parameter bytes + intermediate bytes + final
 *    byte). Parameter bytes cover `0-9 : ; < = > ?`, so private-mode
 *    sequences such as `ESC[?25l` (hide cursor) and `ESC[?2004h` (bracketed
 *    paste) are stripped too, not just SGR colour codes.
 *  - OSC sequences terminated by BEL or by ST (`ESC \`).
 *  - Charset designations (`ESC (` / `ESC )` followed by `A B 0 1 2`).
 *  - The single-character controls `ESC 7`, `ESC 8`, `ESC =`, `ESC >`.
 *
 * @param s Raw terminal output, possibly containing escape sequences.
 * @returns `s` with the sequences above removed; all other characters
 *          (including newlines and carriage returns) are left untouched.
 */
export function stripAnsi(s: string): string {
  return s
    .replace(/\x1b\[[0-?]*[ -\/]*[@-~]/g, '')
    .replace(/\x1b\][^\x07\x1b]*(\x07|\x1b\\)/g, '')
    .replace(/\x1b[()][AB012]/g, '')
    .replace(/\x1b[78=>]/g, '');
}
|
||||
|
||||
/**
 * Locate the `claude` executable.
 *
 * Resolution order:
 *  1. `BROWSE_TERMINAL_BINARY` env override, if that path exists.
 *  2. `Bun.which('claude')`, when a `Bun` global with `which` is available.
 *  3. A fixed list of common install locations; the first one that is
 *     executable by the current user wins.
 *
 * The per-user candidates are built from `$HOME`, falling back to
 * `os.homedir()` when `HOME` is unset or empty, so the literal string
 * "undefined" is never probed as a directory.
 *
 * @returns Path to the binary, or `null` when nothing was found.
 */
export function resolveClaudeBinary(): string | null {
  const override = process.env.BROWSE_TERMINAL_BINARY;
  if (override && fs.existsSync(override)) return override;

  // Read Bun via globalThis so a missing global yields `undefined` rather
  // than a ReferenceError.
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  const onPath = (globalThis as any).Bun?.which?.('claude');
  if (onPath) return onPath;

  // `||` (not `??`) on purpose: an empty HOME is as useless as an unset one.
  const home = process.env.HOME || os.homedir();
  const perUser = home
    ? ['.local', '.bun', '.npm-global'].map((dir) => path.join(home, dir, 'bin', 'claude'))
    : [];
  const candidates = ['/opt/homebrew/bin/claude', '/usr/local/bin/claude', ...perUser];

  for (const candidate of candidates) {
    try {
      fs.accessSync(candidate, fs.constants.X_OK);
      return candidate;
    } catch {
      /* not present or not executable — keep searching */
    }
  }
  return null;
}
|
||||
|
||||
/** Options accepted by `launchClaudePty`. Every field is optional. */
export interface ClaudePtyOptions {
  /**
   * Permission mode for the session.
   *  - omitted / `undefined` — defaults to 'plan' (`--permission-mode plan`)
   *  - `null` — no `--permission-mode` flag at all (regular interactive)
   *  - any other listed mode ('default', 'acceptEdits', 'bypassPermissions',
   *    'auto', 'dontAsk') is passed through verbatim as the flag's value.
   */
  permissionMode?: 'plan' | 'default' | 'acceptEdits' | 'bypassPermissions' | 'auto' | 'dontAsk' | null;
  /** Extra CLI args, appended after the permission-mode flag (if any). */
  extraArgs?: string[];
  /** Terminal width in columns. Default 120. Plan-mode UI lays out cleanly at 120x40. */
  cols?: number;
  /** Terminal height in rows. Default 40. */
  rows?: number;
  /**
   * Working directory. Default: process.cwd(). The repo cwd has the gstack
   * skill registry and trusted-folder cookie, so most tests want this.
   */
  cwd?: string;
  /** Extra env vars, merged on top of process.env (these win on conflict). */
  env?: Record<string, string>;
  /**
   * Total run timeout (ms). Default 240000 (4 min). When it elapses the
   * subprocess is sent SIGKILL, whether or not the caller closed the session.
   */
  timeoutMs?: number;
}
|
||||
|
||||
/** Handle to a running `claude` PTY session, as returned by `launchClaudePty`. */
export interface ClaudePtySession {
  /**
   * Send raw bytes to PTY stdin. Newlines = "\r" in TTY world.
   * Silently does nothing once the process has exited.
   */
  send(data: string): void;
  /** Send a key by name. Limited set used by these tests. */
  sendKey(key: 'Enter' | 'Up' | 'Down' | 'Esc' | 'Tab' | 'ShiftTab' | 'CtrlC'): void;
  /** Raw accumulated output (with ANSI). For forensics. */
  rawOutput(): string;
  /** Visible (ANSI-stripped) output for the entire session. For pattern matching. */
  visibleText(): string;
  /**
   * Mark the current buffer position. Pass the returned marker as `since`
   * to waitForAny / waitFor, or to visibleSince, to look only at output
   * AFTER this point. Use to scope assertions to "after I sent the skill
   * command" — avoids matching against the trust dialog or boot banner
   * residue.
   *
   * @returns The marker: an offset into the raw (unstripped) output buffer.
   */
  mark(): number;
  /** Visible text since the given marker, or since the most recent mark() if omitted. */
  visibleSince(marker?: number): string;
  /**
   * Wait for any of the supplied patterns to appear in the visible text.
   * Patterns are tried in array order on each poll; resolves with the first
   * one that matches. If `since` is supplied, only text after that mark is
   * searched; otherwise the whole session is searched.
   *
   * Defaults: `timeoutMs` 60000, `pollMs` 250.
   *
   * @returns `matched` — the pattern that hit; `index` — the character
   *          offset of the match within the searched visible text (NOT the
   *          pattern's position in `patterns`).
   * @throws Error on timeout (message includes the last 2KB of visible
   *         text), or if the claude process exits before a match is observed.
   */
  waitForAny(
    patterns: Array<RegExp | string>,
    opts?: { timeoutMs?: number; pollMs?: number; since?: number },
  ): Promise<{ matched: RegExp | string; index: number }>;
  /** Convenience: single-pattern wait. Same options and failure modes as waitForAny. */
  waitFor(
    pattern: RegExp | string,
    opts?: { timeoutMs?: number; pollMs?: number; since?: number },
  ): Promise<void>;
  /** Process pid (for debug). */
  pid(): number | undefined;
  /** Whether the underlying process has exited. */
  exited(): boolean;
  /** Exit code, if known; `null` while running or when it was not captured. */
  exitCode(): number | null;
  /**
   * Stop the session: clears the session's timers, sends SIGINT, waits up
   * to 2s for exit, then sends SIGKILL and waits up to 1s more. Both waits
   * are bounded, so this resolves even if the process never reports exit.
   * Always safe to call multiple times.
   */
  close(): Promise<void>;
}
|
||||
|
||||
/**
 * Report whether the workspace-trust dialog appears in the given visible
 * (ANSI-stripped) text, by looking for its case-sensitive marker phrase.
 */
export function isTrustDialogVisible(visible: string): boolean {
  // Marker phrase printed by Claude Code's trust prompt.
  const trustPhrase = 'trust this folder';
  return visible.indexOf(trustPhrase) !== -1;
}
|
||||
|
||||
/**
 * Report whether plan mode's native confirmation prompt appears in the
 * given visible text. Either of two phrasings counts, matched
 * case-insensitively.
 */
export function isPlanReadyVisible(visible: string): boolean {
  const planReadyPattern = /ready to execute|Would you like to proceed/i;
  return visible.search(planReadyPattern) !== -1;
}
|
||||
|
||||
/**
 * Report whether the visible text contains a numbered option list with the
 * selection cursor on option 1.
 *
 * Two conditions must both hold: a `❯` cursor followed by `1.`, and a `2.`
 * that starts at a word boundary somewhere in the text. This deliberately
 * matches the trust dialog, the plan-ready prompt AND skill questions alike;
 * callers narrow it down by scoping the text they pass in (after-trust,
 * after-skill-command, etc).
 */
export function isNumberedOptionListVisible(visible: string): boolean {
  // No cursor on option 1 means no list, regardless of what else is on screen.
  if (!/❯\s*1\./.test(visible)) {
    return false;
  }
  // Require a second option so a lone "❯ 1." line is not enough.
  return /\b2\./.test(visible);
}
|
||||
|
||||
/**
 * Spawn `claude` in a real PTY (by default with `--permission-mode plan`)
 * and return a session handle. Caller is responsible for
 * `await session.close()` to release the subprocess and any timers.
 *
 * Auto-handles the workspace-trust dialog: for the first 15s the output is
 * polled every 200ms and, the first time the trust phrase is seen, "1\r" is
 * written to the PTY. Tests should NOT have to handle it themselves.
 *
 * A wall-clock timer (`opts.timeoutMs`, default 240000) sends SIGKILL to the
 * subprocess if the session is still alive by then.
 *
 * @param opts See `ClaudePtyOptions`.
 * @returns A session handle; output accumulates in memory for its lifetime.
 * @throws Error if no `claude` binary can be located.
 */
export async function launchClaudePty(
  opts: ClaudePtyOptions = {},
): Promise<ClaudePtySession> {
  const claudePath = resolveClaudeBinary();
  if (!claudePath) {
    throw new Error(
      'claude binary not found on PATH. Install: https://docs.anthropic.com/en/docs/claude-code',
    );
  }

  const cwd = opts.cwd ?? process.cwd();
  const cols = opts.cols ?? 120;
  const rows = opts.rows ?? 40;
  const timeoutMs = opts.timeoutMs ?? 240_000;

  let buffer = '';
  let exited = false;
  let exitCodeCaptured: number | null = null;

  // Streaming decoder: PTY chunks can end in the middle of a multi-byte
  // UTF-8 character (the UI glyphs ❯ and ⏺ are 3 bytes each). Decoding each
  // chunk independently would turn the split halves into U+FFFD and break
  // pattern matching; `stream: true` carries partial bytes to the next chunk.
  const decoder = new TextDecoder('utf-8');

  // Permission mode: 'plan' default, null => omit flag entirely.
  const permissionMode = opts.permissionMode === undefined ? 'plan' : opts.permissionMode;
  const args: string[] = [];
  if (permissionMode !== null) {
    args.push('--permission-mode', permissionMode);
  }
  if (opts.extraArgs) args.push(...opts.extraArgs);

  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  const proc = (Bun as any).spawn([claudePath, ...args], {
    terminal: {
      cols,
      rows,
      data(_t: unknown, chunk: Uint8Array) {
        buffer += decoder.decode(chunk, { stream: true });
      },
    },
    cwd,
    env: { ...process.env, ...(opts.env ?? {}) },
  });

  // Top-level timeout. If a test forgets to close, this kills it eventually.
  const wallTimer = setTimeout(() => {
    try {
      proc.kill?.('SIGKILL');
    } catch {
      /* ignore */
    }
  }, timeoutMs);

  // Auto-handle the workspace-trust dialog. Fires at most once: the flag is
  // set before writing so a slow redraw cannot trigger a second keypress.
  let trustHandled = false;
  const trustWatcher = setInterval(() => {
    if (trustHandled || exited) return;
    const visible = stripAnsi(buffer);
    if (isTrustDialogVisible(visible)) {
      trustHandled = true;
      try {
        proc.terminal?.write?.('1\r');
      } catch {
        /* ignore */
      }
    }
  }, 200);
  // Stop the watcher after 15s — by then the dialog has either fired or
  // doesn't exist on this run.
  const trustWatcherStop = setTimeout(() => clearInterval(trustWatcher), 15_000);

  /** Cancel every timer owned by this session. Safe to call repeatedly. */
  function stopTimers(): void {
    clearTimeout(wallTimer);
    clearTimeout(trustWatcherStop);
    clearInterval(trustWatcher);
  }

  // Track exit so waitForAny can fail fast if claude crashes. Once the
  // process is gone the timers have nothing left to do, so release them
  // even if the caller never reaches close().
  let exitedPromise: Promise<void> = Promise.resolve();
  if (proc.exited && typeof proc.exited.then === 'function') {
    exitedPromise = proc.exited
      .then((code: number | null) => {
        exitCodeCaptured = code;
        exited = true;
      })
      .catch(() => {
        exited = true;
      })
      .then(stopTimers);
  }

  /** Write raw data to the PTY; a no-op after exit, write errors are swallowed. */
  function send(data: string): void {
    if (exited) return;
    try {
      proc.terminal?.write?.(data);
    } catch {
      /* ignore */
    }
  }

  type Key = Parameters<ClaudePtySession['sendKey']>[0];
  /** Translate a key name to the byte sequence sent for it. */
  function sendKey(key: Key): void {
    const map: Record<string, string> = {
      Enter: '\r',
      Up: '\x1b[A',
      Down: '\x1b[B',
      Esc: '\x1b',
      Tab: '\t',
      ShiftTab: '\x1b[Z',
      CtrlC: '\x03',
    };
    send(map[key] ?? '');
  }

  // Markers are offsets into the RAW buffer (ANSI included), not into the
  // stripped text.
  let lastMark = 0;
  function mark(): number {
    lastMark = buffer.length;
    return lastMark;
  }
  function visibleSince(marker?: number): string {
    const offset = marker ?? lastMark;
    return stripAnsi(buffer.slice(offset));
  }

  /**
   * Poll the visible text until one of `patterns` matches, the process
   * exits, or the timeout elapses. See ClaudePtySession.waitForAny.
   */
  async function waitForAny(
    patterns: Array<RegExp | string>,
    waitOpts?: { timeoutMs?: number; pollMs?: number; since?: number },
  ): Promise<{ matched: RegExp | string; index: number }> {
    const wTimeout = waitOpts?.timeoutMs ?? 60_000;
    const poll = waitOpts?.pollMs ?? 250;
    const since = waitOpts?.since;
    const scopedVisible = (): string =>
      stripAnsi(since !== undefined ? buffer.slice(since) : buffer);

    const start = Date.now();
    while (Date.now() - start < wTimeout) {
      const visible = scopedVisible();
      for (let i = 0; i < patterns.length; i++) {
        const p = patterns[i]!;
        const matchIdx = typeof p === 'string' ? visible.indexOf(p) : visible.search(p);
        if (matchIdx >= 0) {
          return { matched: p, index: matchIdx };
        }
      }
      // Scan BEFORE the exit check: output printed just before the process
      // exited is still a legitimate match, and only a dead process with no
      // match should be reported as "exited before any pattern matched".
      if (exited) {
        throw new Error(
          `claude exited (code=${exitCodeCaptured}) before any pattern matched. ` +
            `Last visible:\n${stripAnsi(buffer).slice(-2000)}`,
        );
      }
      await Bun.sleep(poll);
    }
    throw new Error(
      `Timed out after ${wTimeout}ms waiting for any of: ${patterns
        .map((p) => (typeof p === 'string' ? JSON.stringify(p) : p.source))
        .join(', ')}\nLast visible (since=${since ?? 'all'}):\n${scopedVisible().slice(-2000)}`,
    );
  }

  async function waitFor(
    pattern: RegExp | string,
    waitOpts?: { timeoutMs?: number; pollMs?: number; since?: number },
  ): Promise<void> {
    await waitForAny([pattern], waitOpts);
  }

  /**
   * Release timers, then stop the process: SIGINT, up to 2s grace, then
   * SIGKILL and up to 1s more. Returns immediately if already exited.
   */
  async function close(): Promise<void> {
    stopTimers();
    if (exited) return;
    try {
      proc.kill?.('SIGINT');
    } catch {
      /* ignore */
    }
    // Wait up to 2s for graceful exit.
    await Promise.race([exitedPromise, Bun.sleep(2000)]);
    if (!exited) {
      try {
        proc.kill?.('SIGKILL');
      } catch {
        /* ignore */
      }
      await Promise.race([exitedPromise, Bun.sleep(1000)]);
    }
  }

  return {
    send,
    sendKey,
    rawOutput: () => buffer,
    visibleText: () => stripAnsi(buffer),
    mark,
    visibleSince,
    waitForAny,
    waitFor,
    pid: () => proc.pid as number | undefined,
    exited: () => exited,
    exitCode: () => exitCodeCaptured,
    close,
  };
}
|
||||
|
||||
/**
 * Send a slash command to a running session and report which of the named
 * expectations shows up first. Lets each plan-mode test stay at ~10 LOC of
 * orchestration.
 *
 * Sequence: sleep for the boot grace period (default 6000ms) so the
 * trust-dialog auto-press can clear, mark the buffer, type the command
 * followed by "\r", then wait (default 240000ms) for any expectation
 * pattern to appear in text AFTER the mark — so trust-dialog residue and
 * boot-banner option lists cannot cause false positives.
 *
 * @param session       Live session from `launchClaudePty`.
 * @param slashCommand  Command text, e.g. '/plan-ceo-review' (no trailing "\r").
 * @param expectations  Map of caller-chosen name → pattern to watch for.
 * @param opts          `boot_grace_ms` and `timeoutMs` overrides.
 * @returns `matched` is the expectation's name, `rawPattern` its pattern,
 *          and `visibleAtMatch` the visible text of the WHOLE session (not
 *          just the part after the mark) at the time of the match.
 * @throws Whatever `session.waitForAny` throws (timeout, process exit).
 *
 * @example
 * const session = await launchClaudePty();
 * const result = await invokeAndObserve(session, '/plan-ceo-review', {
 *   askUserQuestion: /❯\s*1\./,
 *   planReady: /ready to execute/i,
 *   silentWrite: /⏺\s*Write\(/,
 *   silentEdit: /⏺\s*Edit\(/,
 *   exitedPlanMode: /Exiting plan mode/i,
 * });
 * await session.close();
 */
export async function invokeAndObserve(
  session: ClaudePtySession,
  slashCommand: string,
  expectations: Record<string, RegExp | string>,
  opts?: { boot_grace_ms?: number; timeoutMs?: number },
): Promise<{ matched: string; rawPattern: RegExp | string; visibleAtMatch: string }> {
  const graceMs = opts?.boot_grace_ms ?? 6000;
  const waitBudgetMs = opts?.timeoutMs ?? 240_000;

  // Let claude settle at its input prompt before typing.
  await Bun.sleep(graceMs);

  // Everything matched below is scoped to output produced after this point.
  const sinceMark = session.mark();
  session.send(slashCommand + '\r');

  const named = Object.entries(expectations);
  const candidates = named.map((entry) => entry[1]);
  const hit = await session.waitForAny(candidates, {
    timeoutMs: waitBudgetMs,
    since: sinceMark,
  });

  // Translate the matched pattern back to the caller's name for it.
  const winner = named.find((entry) => entry[1] === hit.matched);
  const [name, rawPattern] = winner!;
  return {
    matched: name,
    rawPattern,
    visibleAtMatch: session.visibleText(),
  };
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// High-level skill-mode test contract
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/** Result of observing one skill invocation; see `runPlanSkillObservation`. */
export interface PlanSkillObservation {
  /**
   * The first terminal condition that was detected:
   *  - 'asked'        — a numbered-option prompt appeared (the skill's
   *                     Step 0 AskUserQuestion or the routing-injection
   *                     prompt)
   *  - 'plan_ready'   — claude wrote a plan and showed its native
   *                     "Ready to execute" confirmation
   *  - 'silent_write' — a Write/Edit landed BEFORE any prompt, to a path
   *                     outside the sanctioned plan/project locations
   *  - 'exited'       — the claude process died before any of the above
   *                     (also reported when claude rejects the slash
   *                     command as unknown)
   *  - 'timeout'      — none of the above within budget
   */
  outcome:
    | 'asked'
    | 'plan_ready'
    | 'silent_write'
    | 'exited'
    | 'timeout';
  /** One-line, human-readable description of what was observed. */
  summary: string;
  /** Tail (last 2KB) of the visible terminal text since the slash command was sent. */
  evidence: string;
  /** Wall time (ms) until the outcome was decided. */
  elapsedMs: number;
}
|
||||
|
||||
/**
 * The contract for "skill X invoked in plan mode behaves correctly."
 *
 * Launches claude in a PTY, waits 8s for boot (and the trust-dialog
 * auto-handler), sends `/<skillName>`, then polls the screen every 2s until
 * a terminal outcome is seen or the budget runs out. The session is always
 * closed before returning.
 *
 * PASS: outcome is 'asked' or 'plan_ready'.
 *  - 'asked' = the skill is gating decisions on the user, as expected.
 *  - 'plan_ready' = the skill ran end-to-end, wrote a plan file, and
 *    surfaced claude's native confirmation. Some skills (like
 *    plan-design-review on a no-UI branch) legitimately reach plan_ready
 *    without firing AskUserQuestion because they short-circuit.
 *
 * FAIL: 'silent_write' or 'exited' or 'timeout'.
 *
 * Checks run in a fixed order on every poll: process exit, "Unknown
 * command:", silent write, plan-ready, numbered prompt. The first that
 * applies decides the outcome.
 *
 * This replaces the SDK-based runPlanModeSkillTest which never worked
 * because plan mode renders its native confirmation as TTY UI, not via
 * the AskUserQuestion tool — so canUseTool never fired and the assertion
 * counted zero questions.
 */
export async function runPlanSkillObservation(opts: {
  /** Skill name, e.g. 'plan-ceo-review'. */
  skillName: string;
  /** Whether to launch in plan mode. Default true. The no-op regression
   * test sets this false to verify skills work outside plan mode. */
  inPlanMode?: boolean;
  /** Working directory. Default process.cwd(). */
  cwd?: string;
  /** Total budget for skill to reach a terminal outcome. Default 180000. */
  timeoutMs?: number;
}): Promise<PlanSkillObservation> {
  const startedAt = Date.now();
  const budgetMs = opts.timeoutMs ?? 180_000;

  // The session's own kill timer gets 30s of slack beyond the observation
  // budget so this function decides the outcome before the process is killed.
  const session = await launchClaudePty({
    permissionMode: opts.inPlanMode === false ? null : 'plan',
    cwd: opts.cwd,
    timeoutMs: budgetMs + 30_000,
  });

  // Shared result builder: evidence is always the 2KB tail of what was on
  // screen, elapsed time is measured from before the launch.
  const conclude = (
    outcome: PlanSkillObservation['outcome'],
    summary: string,
    screen: string,
  ): PlanSkillObservation => ({
    outcome,
    summary,
    evidence: screen.slice(-2000),
    elapsedMs: Date.now() - startedAt,
  });

  // Path fragments that mark a Write/Edit target as sanctioned.
  const sanctionedFragments = ['.claude/plans', '.gstack/', '/.context/', 'CHANGELOG.md', 'TODOS.md'];

  try {
    // Boot grace + trust-dialog auto-handle.
    await Bun.sleep(8000);
    const since = session.mark();
    session.send(`/${opts.skillName}\r`);

    const pollingStarted = Date.now();
    while (Date.now() - pollingStarted < budgetMs) {
      await Bun.sleep(2000);
      const visible = session.visibleSince(since);

      if (session.exited()) {
        return conclude(
          'exited',
          `claude exited (code=${session.exitCode()}) before reaching a terminal outcome`,
          visible,
        );
      }

      if (visible.includes('Unknown command:')) {
        return conclude(
          'exited',
          `claude rejected /${opts.skillName} as unknown command (skill not registered in this cwd)`,
          visible,
        );
      }

      // Silent-write detection: a rendered Write/Edit tool call whose target
      // is not sanctioned, seen while no numbered prompt is on screen.
      const promptOnScreen = isNumberedOptionListVisible(visible);
      for (const toolCall of visible.matchAll(/⏺\s*(?:Write|Edit)\(([^)]+)\)/g)) {
        const target = toolCall[1] ?? '';
        const sanctioned = sanctionedFragments.some((fragment) => target.includes(fragment));
        if (sanctioned || promptOnScreen) continue;
        return conclude(
          'silent_write',
          `Write/Edit to ${target} fired before any AskUserQuestion`,
          visible,
        );
      }

      if (isPlanReadyVisible(visible)) {
        return conclude(
          'plan_ready',
          'skill ran end-to-end and emitted plan-mode "Ready to execute" confirmation',
          visible,
        );
      }

      if (promptOnScreen) {
        return conclude(
          'asked',
          'skill fired a numbered-option prompt (AskUserQuestion or routing-injection)',
          visible,
        );
      }
    }

    return conclude(
      'timeout',
      `no terminal outcome within ${budgetMs}ms`,
      session.visibleSince(since),
    );
  } finally {
    await session.close();
  }
}
|
||||
@@ -1,176 +0,0 @@
|
||||
/**
|
||||
* Shared helpers for plan-mode E2E tests.
|
||||
*
|
||||
* Four sibling per-skill smoke tests (plan-ceo, plan-eng, plan-design, plan-devex)
|
||||
* plus the no-op regression test use this helper. The goal: run a review skill
|
||||
* in plan mode, confirm it goes straight to its Step 0 AskUserQuestion without
|
||||
* writing files or calling ExitPlanMode first (the vestigial handshake
|
||||
* regression we fixed in ceo-plan 2026-04-24).
|
||||
*
|
||||
* This file was renamed from `plan-mode-handshake-helpers.ts` when the
|
||||
* handshake was removed. The write-guard detection (no Write/Edit before the
|
||||
* first AskUserQuestion) is the load-bearing piece that catches silent
|
||||
* regressions a simple "first question text matches" check would miss.
|
||||
*/
|
||||
|
||||
import * as fs from 'fs';
|
||||
import * as path from 'path';
|
||||
import * as os from 'os';
|
||||
import { execSync } from 'child_process';
|
||||
import {
|
||||
runAgentSdkTest,
|
||||
passThroughNonAskUserQuestion,
|
||||
resolveClaudeBinary,
|
||||
type AgentSdkResult,
|
||||
} from './agent-sdk-runner';
|
||||
|
||||
/**
 * Distinctive phrase intended to match the reminder Claude Code's harness
 * injects in plan mode. `runPlanModeSkillTest` appends it to the system
 * prompt inside a `<system-reminder>` block.
 */
export const PLAN_MODE_REMINDER = 'Plan mode is active. The user indicated that they do not want you to execute yet';
|
||||
|
||||
/** Everything `runPlanModeSkillTest` captured during one SDK run. */
export interface PlanModeCaptureResult {
  /** The value returned by `runAgentSdkTest` for this run. */
  sdkResult: AgentSdkResult;
  /**
   * Every AskUserQuestion seen by the tool interceptor, with its input
   * payload and its position (`orderIndex`) in the overall tool sequence.
   */
  askUserQuestions: Array<{
    input: Record<string, unknown>;
    orderIndex: number;
  }>;
  /** Names of all intercepted tool uses, in the order they fired. */
  toolOrder: string[];
  /** True when a Write or Edit tool fired BEFORE the first AskUserQuestion. */
  writeOrEditBeforeAsk: boolean;
  /** True when ExitPlanMode fired BEFORE the first AskUserQuestion. */
  exitPlanModeBeforeAsk: boolean;
}
|
||||
|
||||
/**
 * Run a skill through the Agent SDK while intercepting every tool use via
 * `canUseTool`.
 *
 * Unless `omitPlanModeReminder` is set, the plan-mode reminder phrase is
 * appended to the system prompt. The first AskUserQuestion is answered with
 * the option whose label contains `firstAnswerSubstring` (case-insensitive;
 * falls back to the first option when nothing matches); later questions get
 * their first option, to keep the run short. All other tools are delegated
 * to `passThroughNonAskUserQuestion`.
 *
 * The run happens in a fresh temp directory, which is removed afterwards
 * whether or not the run succeeded.
 *
 * @returns The SDK result plus the captured tool order and questions.
 */
export async function runPlanModeSkillTest(opts: {
  /** Skill name, e.g. 'plan-ceo-review'. */
  skillName: string;
  /**
   * For the first AskUserQuestion, pick the option whose label contains this
   * substring. Pick a "cheap" answer that terminates the skill quickly (e.g.
   * "HOLD SCOPE" for plan-ceo-review).
   */
  firstAnswerSubstring: string;
  /** If true, DO NOT inject the reminder — used by the no-op regression test. */
  omitPlanModeReminder?: boolean;
  /** Max turns for the SDK call (default 4 — Step 0 + answer should fit). */
  maxTurns?: number;
}): Promise<PlanModeCaptureResult> {
  const { skillName, firstAnswerSubstring, omitPlanModeReminder, maxTurns } = opts;

  const askUserQuestions: PlanModeCaptureResult['askUserQuestions'] = [];
  const toolOrder: string[] = [];
  let firstAskIndex = -1;

  const workingDir = fs.mkdtempSync(path.join(os.tmpdir(), `plan-mode-${skillName}-`));
  const binary = resolveClaudeBinary();

  try {
    // In real plan mode Claude Code injects a system-reminder; in SDK tests we
    // use systemPrompt.append which the model treats as equally authoritative.
    let reminderAppend = '';
    if (!omitPlanModeReminder) {
      reminderAppend = `\n\n<system-reminder>\n${PLAN_MODE_REMINDER}. This supercedes any other instructions you have received.\n</system-reminder>\n`;
    }

    const skillFile = path.resolve(import.meta.dir, '..', '..', skillName, 'SKILL.md');

    const sdkResult = await runAgentSdkTest({
      systemPrompt: {
        type: 'preset',
        preset: 'claude_code',
        append: reminderAppend,
      },
      userPrompt: `Read the skill file at ${skillFile} and follow its instructions. There is no real plan to review — just start the skill and respond to any AskUserQuestion that fires.`,
      workingDirectory: workingDir,
      maxTurns: maxTurns ?? 4,
      allowedTools: ['Read', 'Grep', 'Glob', 'Bash'],
      ...(binary ? { pathToClaudeCodeExecutable: binary } : {}),
      canUseTool: async (toolName, input) => {
        // push() returns the new length, so this is the tool's position in
        // the overall sequence.
        const orderIndex = toolOrder.push(toolName) - 1;

        if (toolName !== 'AskUserQuestion') {
          return passThroughNonAskUserQuestion(toolName, input);
        }

        if (firstAskIndex === -1) firstAskIndex = orderIndex;
        askUserQuestions.push({ input, orderIndex });

        // Auto-answer the FIRST question with the configured substring; for
        // later questions, pick the first option to keep the run short.
        const q = (input.questions as Array<{ question: string; options: Array<{ label: string }> }>)[0];
        const wanted = firstAnswerSubstring.toLowerCase();
        const preferred =
          askUserQuestions.length === 1
            ? q.options.find((o) => o.label.toLowerCase().includes(wanted))
            : undefined;
        const answer = preferred ? preferred.label : q.options[0]!.label;
        return {
          behavior: 'allow',
          updatedInput: {
            questions: input.questions,
            answers: { [q.question]: answer },
          },
        };
      },
    });

    // Tools that fired strictly before the first AskUserQuestion. Empty when
    // no question fired or when the question was the very first tool.
    const beforeFirstAsk = firstAskIndex > 0 ? toolOrder.slice(0, firstAskIndex) : [];
    const writeOrEditBeforeAsk = beforeFirstAsk.some((t) => t === 'Write' || t === 'Edit');
    const exitPlanModeBeforeAsk = beforeFirstAsk.some((t) => t === 'ExitPlanMode');

    return {
      sdkResult,
      askUserQuestions,
      toolOrder,
      writeOrEditBeforeAsk,
      exitPlanModeBeforeAsk,
    };
  } finally {
    try {
      fs.rmSync(workingDir, { recursive: true, force: true });
    } catch { /* ignore cleanup errors */ }
  }
}
|
||||
|
||||
/**
 * Guard against the return of the old vestigial plan-mode handshake
 * (A=exit-and-rerun / C=cancel).
 *
 * Only the FIRST question of the payload is inspected. It counts as a
 * handshake when one option label contains both "exit" and "rerun" and some
 * option label contains "cancel" (all compared case-insensitively). A
 * payload with no questions passes silently.
 *
 * @param aq A captured AskUserQuestion (only `input.questions` is read).
 * @throws Error when the first question has the handshake shape.
 */
export function assertNotHandshakeShape(
  aq: { input: Record<string, unknown> },
): void {
  const questions = aq.input.questions as Array<{
    question: string;
    options: Array<{ label: string }>;
  }>;
  if (!questions || questions.length === 0) return;

  const first = questions[0]!;
  const labels: string[] = [];
  let sawExitAndRerun = false;
  let sawCancel = false;
  for (const option of first.options) {
    const label = option.label.toLowerCase();
    labels.push(label);
    if (label.includes('exit') && label.includes('rerun')) sawExitAndRerun = true;
    if (label.includes('cancel')) sawCancel = true;
  }

  if (!(sawExitAndRerun && sawCancel)) return;

  throw new Error(
    `First AskUserQuestion looks like the vestigial plan-mode handshake ` +
      `(options: ${labels.join(', ')}). The handshake was removed; skills ` +
      `should go straight to their Step 0 question in plan mode.`,
  );
}
|
||||
|
||||
export { execSync };
|
||||
Reference in New Issue
Block a user