From 1b1fd30ec793d3ef24d9caa3a54ac984821cc972 Mon Sep 17 00:00:00 2001
From: Garry Tan <garrytan@gmail.com>
Date: Sat, 25 Apr 2026 21:23:40 -0700
Subject: [PATCH] feat(test): real-PTY harness for plan-mode E2E tests
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds test/helpers/claude-pty-runner.ts. Spawns the actual claude binary
via Bun.spawn({terminal:}) (Bun 1.3.10+ has built-in PTY — no node-pty,
no native modules), drives it through stdin/stdout, and parses rendered
terminal frames. Pattern adapted from the cc-pty-import branch's
terminal-agent.ts but stripped of WS/cookie/Origin scaffolding (not
needed for headless tests).

Public API:
- launchClaudePty(opts) — boots claude with --permission-mode plan|null,
  auto-handles the workspace-trust dialog, returns a session handle.
- session.send / sendKey / waitForAny / waitFor / mark / visibleSince /
  visibleText / rawOutput / close
- runPlanSkillObservation({skillName, inPlanMode, timeoutMs}) — high-level
  contract for plan-mode skill tests. Returns { outcome, summary, evidence,
  elapsedMs }. outcome ∈ {asked, plan_ready, silent_write, exited, timeout}.

Replaces the SDK-based runPlanModeSkillTest from plan-mode-helpers.ts
which never worked. Plan mode renders its native "Ready to execute"
confirmation as TTY UI (numbered options with ❯ cursor), not via the
AskUserQuestion tool — so the SDK's canUseTool interceptor never fired
and the assertion always saw zero questions. Real PTY observes the
rendered output directly.

Deletes test/helpers/plan-mode-helpers.ts. No production callers remained.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 test/helpers/claude-pty-runner.ts | 539 ++++++++++++++++++++++++++++++
 test/helpers/plan-mode-helpers.ts | 176 ----------
 2 files changed, 539 insertions(+), 176 deletions(-)
 create mode 100644 test/helpers/claude-pty-runner.ts
 delete mode 100644 test/helpers/plan-mode-helpers.ts

diff --git a/test/helpers/claude-pty-runner.ts b/test/helpers/claude-pty-runner.ts
new file mode 100644
index 00000000..aab48e7d
--- /dev/null
+++ b/test/helpers/claude-pty-runner.ts
@@ -0,0 +1,539 @@
+/**
+ * Real-PTY runner for Claude Code plan-mode E2E tests.
+ *
+ * Spawns the actual `claude` binary via `Bun.spawn({terminal:})`, drives
+ * it through stdin/stdout, parses the rendered terminal frames, and exposes
+ * primitives the 5 plan-mode tests need. Replaces the SDK-based
+ * `runPlanModeSkillTest` from plan-mode-helpers.ts which never worked
+ * because plan mode doesn't use the AskUserQuestion tool — it uses its
+ * own TTY-rendered native confirmation UI.
+ *
+ * Why this exists: the SDK harness intercepts `canUseTool` for
+ * `AskUserQuestion`. Claude in plan mode renders its "Ready to execute"
+ * confirmation as a native option list (1-4 numbered options) without
+ * invoking the AskUserQuestion tool. The SDK never sees it. Real PTY
+ * does — it shows up as text on screen with `❯` cursor markers.
+ *
+ * Architecture: pure Bun.spawn — no node-pty, no native modules, no chmod
+ * fixes. Bun 1.3.10+ has built-in PTY support via the `terminal:` spawn
+ * option. Pattern borrowed from cc-pty-import branch's terminal-agent.ts
+ * (the WS/cookie/Origin scaffolding there is for the browser sidebar;
+ * tests don't need it).
+ */
+
+import * as fs from 'fs';
+import * as os from 'os';
+import * as path from 'path';
+
+/** Strip ANSI escapes for pattern-matching against visible text. */
+export function stripAnsi(s: string): string {
+  return s
+    .replace(/\x1b\[[\d;]*[a-zA-Z]/g, '')
+    .replace(/\x1b\][^\x07\x1b]*(\x07|\x1b\\)/g, '')
+    .replace(/\x1b[()][AB012]/g, '')
+    .replace(/\x1b[78=>]/g, '');
+}
+
+/** Find claude on PATH, with fallback locations. Mirrors terminal-agent.ts. */
+export function resolveClaudeBinary(): string | null {
+  const override = process.env.BROWSE_TERMINAL_BINARY;
+  if (override && fs.existsSync(override)) return override;
+  // eslint-disable-next-line @typescript-eslint/no-explicit-any
+  const which = (Bun as any).which?.('claude');
+  if (which) return which;
+  const candidates = [
+    '/opt/homebrew/bin/claude',
+    '/usr/local/bin/claude',
+    `${process.env.HOME}/.local/bin/claude`,
+    `${process.env.HOME}/.bun/bin/claude`,
+    `${process.env.HOME}/.npm-global/bin/claude`,
+  ];
+  for (const c of candidates) {
+    try {
+      fs.accessSync(c, fs.constants.X_OK);
+      return c;
+    } catch {
+      /* keep searching */
+    }
+  }
+  return null;
+}
+
+export interface ClaudePtyOptions {
+  /**
+   * Permission mode for the session.
+   *  - 'plan' (default) — launches with --permission-mode plan
+   *  - undefined — no --permission-mode flag at all (regular interactive)
+   *  Other valid SDK modes ('default', 'acceptEdits', 'bypassPermissions',
+   *  'auto', 'dontAsk') are passed through verbatim.
+   */
+  permissionMode?: 'plan' | 'default' | 'acceptEdits' | 'bypassPermissions' | 'auto' | 'dontAsk' | null;
+  /** Extra args after the permission-mode flag. */
+  extraArgs?: string[];
+  /** Terminal size. Default 120x40. Plan-mode UI lays out cleanly at this size. */
+  cols?: number;
+  rows?: number;
+  /** Working directory. Default: process.cwd(). The repo cwd has the gstack
+   *  skill registry and trusted-folder cookie, so most tests want this. */
+  cwd?: string;
+  /** Extra env on top of process.env. */
+  env?: Record<string, string>;
+  /** Total run timeout (ms). Default 240000 (4 min). */
+  timeoutMs?: number;
+}
+
+export interface ClaudePtySession {
+  /** Send raw bytes to PTY stdin. Newlines = "\r" in TTY world. */
+  send(data: string): void;
+  /** Send a key by name. Limited set used by these tests. */
+  sendKey(key: 'Enter' | 'Up' | 'Down' | 'Esc' | 'Tab' | 'ShiftTab' | 'CtrlC'): void;
+  /** Raw accumulated stdout (with ANSI). For forensics. */
+  rawOutput(): string;
+  /** Visible (ANSI-stripped) output for the entire session. For pattern matching. */
+  visibleText(): string;
+  /**
+   * Mark the current buffer position. Subsequent waitForAny / visibleSince
+   * calls only look at output AFTER this mark. Use to scope assertions to
+   * "after I sent the skill command" — avoids matching against the trust
+   * dialog or boot banner residue. Returns a marker handle.
+   */
+  mark(): number;
+  /** Visible text since the most recent (or specific) mark. */
+  visibleSince(marker?: number): string;
+  /**
+   * Wait for any of the supplied patterns to appear in visibleText. Resolves
+   * with the first match. Throws on timeout (with last 2KB of visible text).
+   * If `since` is supplied, only matches text after that mark.
+   */
+  waitForAny(
+    patterns: Array<RegExp | string>,
+    opts?: { timeoutMs?: number; pollMs?: number; since?: number },
+  ): Promise<{ matched: RegExp | string; index: number }>;
+  /** Convenience: single-pattern wait. */
+  waitFor(
+    pattern: RegExp | string,
+    opts?: { timeoutMs?: number; pollMs?: number; since?: number },
+  ): Promise<void>;
+  /** Process pid (for debug). */
+  pid(): number | undefined;
+  /** Whether the underlying process has exited. */
+  exited(): boolean;
+  /** Exit code, if known. */
+  exitCode(): number | null;
+  /**
+   * Send SIGINT, then SIGKILL after 1s. Always safe to call multiple times.
+   * Awaits process exit before resolving.
+   */
+  close(): Promise<void>;
+}
+
+/** Detect the workspace-trust dialog rendering. */
+export function isTrustDialogVisible(visible: string): boolean {
+  // Phrase Claude Code prints. Stable across versions in this branch's range.
+  return visible.includes('trust this folder');
+}
+
+/** Detect plan-mode's native "ready to execute" confirmation. */
+export function isPlanReadyVisible(visible: string): boolean {
+  return /ready to execute|Would you like to proceed/i.test(visible);
+}
+
+/** Detect any AskUserQuestion-shaped numbered option list with cursor. */
+export function isNumberedOptionListVisible(visible: string): boolean {
+  // ❯ cursor + at least two numbered options 1-9.
+  // Matches the trust dialog AND plan-ready prompt AND skill questions.
+  // Tighter classification happens via scope (after-trust, after-skill-cmd, etc).
+  return /❯\s*1\./.test(visible) && /\b2\./.test(visible);
+}
+
+/**
+ * Spawn `claude --permission-mode plan` in a real PTY and return a session
+ * handle. Caller is responsible for `await session.close()` to release the
+ * subprocess and any timers.
+ *
+ * Auto-handles the workspace-trust dialog (presses "1\r" if it appears
+ * during the boot window). Tests should NOT have to handle it themselves.
+ */
+export async function launchClaudePty(
+  opts: ClaudePtyOptions = {},
+): Promise<ClaudePtySession> {
+  const claudePath = resolveClaudeBinary();
+  if (!claudePath) {
+    throw new Error(
+      'claude binary not found on PATH. Install: https://docs.anthropic.com/en/docs/claude-code',
+    );
+  }
+
+  const cwd = opts.cwd ?? process.cwd();
+  const cols = opts.cols ?? 120;
+  const rows = opts.rows ?? 40;
+  const timeoutMs = opts.timeoutMs ?? 240_000;
+
+  let buffer = '';
+  let exited = false;
+  let exitCodeCaptured: number | null = null;
+
+  // Permission mode: 'plan' default, null => omit flag entirely.
+  const permissionMode = opts.permissionMode === undefined ? 'plan' : opts.permissionMode;
+  const args: string[] = [];
+  if (permissionMode !== null) {
+    args.push('--permission-mode', permissionMode);
+  }
+  if (opts.extraArgs) args.push(...opts.extraArgs);
+
+  // eslint-disable-next-line @typescript-eslint/no-explicit-any
+  const proc = (Bun as any).spawn([claudePath, ...args], {
+    terminal: {
+      cols,
+      rows,
+      data(_t: unknown, chunk: Buffer) {
+        buffer += chunk.toString('utf-8');
+      },
+    },
+    cwd,
+    env: { ...process.env, ...(opts.env ?? {}) },
+  });
+
+  // Track exit so waitForAny can fail fast if claude crashes.
+  let exitedPromise: Promise<void> = Promise.resolve();
+  if (proc.exited && typeof proc.exited.then === 'function') {
+    exitedPromise = proc.exited
+      .then((code: number | null) => {
+        exitCodeCaptured = code;
+        exited = true;
+      })
+      .catch(() => {
+        exited = true;
+      });
+  }
+
+  // Top-level timeout. If a test forgets to close, this kills it eventually.
+  const wallTimer = setTimeout(() => {
+    try {
+      proc.kill?.('SIGKILL');
+    } catch {
+      /* ignore */
+    }
+  }, timeoutMs);
+
+  // Auto-handle the workspace-trust dialog. Runs once during the boot
+  // window; idempotent (only fires if the phrase is still on screen).
+  let trustHandled = false;
+  const trustWatcher = setInterval(() => {
+    if (trustHandled || exited) return;
+    const visible = stripAnsi(buffer);
+    if (isTrustDialogVisible(visible)) {
+      trustHandled = true;
+      try {
+        proc.terminal?.write?.('1\r');
+      } catch {
+        /* ignore */
+      }
+    }
+  }, 200);
+  // Stop the watcher after 15s — by then the dialog has either fired or
+  // doesn't exist on this run.
+  const trustWatcherStop = setTimeout(() => clearInterval(trustWatcher), 15_000);
+
+  function send(data: string): void {
+    if (exited) return;
+    try {
+      proc.terminal?.write?.(data);
+    } catch {
+      /* ignore */
+    }
+  }
+
+  type Key = Parameters<ClaudePtySession['sendKey']>[0];
+  function sendKey(key: Key): void {
+    const map: Record<string, string> = {
+      Enter: '\r',
+      Up: '\x1b[A',
+      Down: '\x1b[B',
+      Esc: '\x1b',
+      Tab: '\t',
+      ShiftTab: '\x1b[Z',
+      CtrlC: '\x03',
+    };
+    send(map[key] ?? '');
+  }
+
+  let lastMark = 0;
+  function mark(): number {
+    lastMark = buffer.length;
+    return lastMark;
+  }
+  function visibleSince(marker?: number): string {
+    const offset = marker ?? lastMark;
+    return stripAnsi(buffer.slice(offset));
+  }
+
+  async function waitForAny(
+    patterns: Array<RegExp | string>,
+    waitOpts?: { timeoutMs?: number; pollMs?: number; since?: number },
+  ): Promise<{ matched: RegExp | string; index: number }> {
+    const wTimeout = waitOpts?.timeoutMs ?? 60_000;
+    const poll = waitOpts?.pollMs ?? 250;
+    const since = waitOpts?.since;
+    const start = Date.now();
+    while (Date.now() - start < wTimeout) {
+      if (exited) {
+        throw new Error(
+          `claude exited (code=${exitCodeCaptured}) before any pattern matched. ` +
+            `Last visible:\n${stripAnsi(buffer).slice(-2000)}`,
+        );
+      }
+      const visible = since !== undefined ? stripAnsi(buffer.slice(since)) : stripAnsi(buffer);
+      for (let i = 0; i < patterns.length; i++) {
+        const p = patterns[i]!;
+        const matchIdx = typeof p === 'string' ? visible.indexOf(p) : visible.search(p);
+        if (matchIdx >= 0) {
+          return { matched: p, index: matchIdx };
+        }
+      }
+      await Bun.sleep(poll);
+    }
+    throw new Error(
+      `Timed out after ${wTimeout}ms waiting for any of: ${patterns
+        .map((p) => (typeof p === 'string' ? JSON.stringify(p) : p.source))
+        .join(', ')}\nLast visible (since=${since ?? 'all'}):\n${
+        since !== undefined ? stripAnsi(buffer.slice(since)).slice(-2000) : stripAnsi(buffer).slice(-2000)
+      }`,
+    );
+  }
+
+  async function waitFor(
+    pattern: RegExp | string,
+    waitOpts?: { timeoutMs?: number; pollMs?: number; since?: number },
+  ): Promise<void> {
+    await waitForAny([pattern], waitOpts);
+  }
+
+  async function close(): Promise<void> {
+    clearTimeout(wallTimer);
+    clearTimeout(trustWatcherStop);
+    clearInterval(trustWatcher);
+    if (exited) return;
+    try {
+      proc.kill?.('SIGINT');
+    } catch {
+      /* ignore */
+    }
+    // Wait up to 2s for graceful exit.
+    await Promise.race([exitedPromise, Bun.sleep(2000)]);
+    if (!exited) {
+      try {
+        proc.kill?.('SIGKILL');
+      } catch {
+        /* ignore */
+      }
+      await Promise.race([exitedPromise, Bun.sleep(1000)]);
+    }
+  }
+
+  return {
+    send,
+    sendKey,
+    rawOutput: () => buffer,
+    visibleText: () => stripAnsi(buffer),
+    mark,
+    visibleSince,
+    waitForAny,
+    waitFor,
+    pid: () => proc.pid as number | undefined,
+    exited: () => exited,
+    exitCode: () => exitCodeCaptured,
+    close,
+  };
+}
+
+/**
+ * High-level: invoke a slash command and observe the response. Used by the
+ * 5 plan-mode tests so each only has ~10 LOC of orchestration.
+ *
+ * The `expectations` object names the patterns the caller cares about.
+ * Returns which one matched first (or throws on timeout).
+ *
+ * @example
+ * const session = await launchClaudePty();
+ * const result = await invokeAndObserve(session, '/plan-ceo-review', {
+ *   askUserQuestion: /❯\s*1\./,
+ *   planReady: /ready to execute/i,
+ *   silentWrite: /⏺\s*Write\(/,
+ *   silentEdit: /⏺\s*Edit\(/,
+ *   exitedPlanMode: /Exiting plan mode/i,
+ * });
+ * await session.close();
+ */
+export async function invokeAndObserve(
+  session: ClaudePtySession,
+  slashCommand: string,
+  expectations: Record<string, RegExp | string>,
+  opts?: { boot_grace_ms?: number; timeoutMs?: number },
+): Promise<{ matched: string; rawPattern: RegExp | string; visibleAtMatch: string }> {
+  // Brief grace period so the trust-dialog auto-press has time to clear and
+  // claude is back at the input prompt before we type the command.
+  const boot = opts?.boot_grace_ms ?? 6000;
+  await Bun.sleep(boot);
+
+  // Mark buffer position. All pattern matching scopes to text AFTER this point,
+  // so the trust-dialog residue and boot banner numbered options don't cause
+  // false positives.
+  const sinceMark = session.mark();
+
+  // Type and submit.
+  session.send(slashCommand + '\r');
+
+  const patterns = Object.entries(expectations);
+  const result = await session.waitForAny(
+    patterns.map(([, p]) => p),
+    { timeoutMs: opts?.timeoutMs ?? 240_000, since: sinceMark },
+  );
+  // Map back to the named key.
+  const idx = patterns.findIndex(([, p]) => p === result.matched);
+  const [name, rawPattern] = patterns[idx]!;
+  return {
+    matched: name,
+    rawPattern,
+    visibleAtMatch: session.visibleText(),
+  };
+}
+
+// ---------------------------------------------------------------------------
+// High-level skill-mode test contract
+// ---------------------------------------------------------------------------
+
+export interface PlanSkillObservation {
+  /**
+   * What happened first. One of:
+   *  - 'asked'      — skill emitted a numbered-option prompt (its Step 0
+   *                   AskUserQuestion or the routing-injection prompt)
+   *  - 'plan_ready' — claude wrote a plan and emitted its native
+   *                   "Ready to execute" confirmation
+   *  - 'silent_write' — a Write/Edit landed BEFORE any prompt, to a path
+   *                   outside the sanctioned plan/project directories
+   *  - 'exited'     — claude process died before any of the above
+   *  - 'timeout'    — none of the above within budget
+   */
+  outcome: 'asked' | 'plan_ready' | 'silent_write' | 'exited' | 'timeout';
+  /** Human-readable summary. */
+  summary: string;
+  /** Visible terminal text since the slash command was sent (last 2KB). */
+  evidence: string;
+  /** Wall time (ms) until the outcome was decided. */
+  elapsedMs: number;
+}
+
+/**
+ * The contract for "skill X invoked in plan mode behaves correctly."
+ *
+ * PASS: outcome is 'asked' or 'plan_ready'.
+ *   - 'asked' = the skill is gating decisions on the user, as expected.
+ *   - 'plan_ready' = the skill ran end-to-end, wrote a plan file, and
+ *     surfaced claude's native confirmation. Some skills (like
+ *     plan-design-review on a no-UI branch) legitimately reach plan_ready
+ *     without firing AskUserQuestion because they short-circuit.
+ *
+ * FAIL: 'silent_write' or 'exited' or 'timeout'.
+ *
+ * This replaces the SDK-based runPlanModeSkillTest which never worked
+ * because plan mode renders its native confirmation as TTY UI, not via
+ * the AskUserQuestion tool — so canUseTool never fired and the assertion
+ * counted zero questions.
+ */
+export async function runPlanSkillObservation(opts: {
+  /** Skill name, e.g. 'plan-ceo-review'. */
+  skillName: string;
+  /** Whether to launch in plan mode. Default true. The no-op regression
+   *  test sets this false to verify skills work outside plan mode. */
+  inPlanMode?: boolean;
+  /** Working directory. Default process.cwd(). */
+  cwd?: string;
+  /** Total budget for skill to reach a terminal outcome. Default 180000. */
+  timeoutMs?: number;
+}): Promise<PlanSkillObservation> {
+  const startedAt = Date.now();
+  const session = await launchClaudePty({
+    permissionMode: opts.inPlanMode === false ? null : 'plan',
+    cwd: opts.cwd,
+    timeoutMs: (opts.timeoutMs ?? 180_000) + 30_000,
+  });
+
+  try {
+    // Boot grace + trust-dialog auto-handle.
+    await Bun.sleep(8000);
+    const since = session.mark();
+    session.send(`/${opts.skillName}\r`);
+
+    const budgetMs = opts.timeoutMs ?? 180_000;
+    const start = Date.now();
+    while (Date.now() - start < budgetMs) {
+      await Bun.sleep(2000);
+      const visible = session.visibleSince(since);
+
+      if (session.exited()) {
+        return {
+          outcome: 'exited',
+          summary: `claude exited (code=${session.exitCode()}) before reaching a terminal outcome`,
+          evidence: visible.slice(-2000),
+          elapsedMs: Date.now() - startedAt,
+        };
+      }
+      if (visible.includes('Unknown command:')) {
+        return {
+          outcome: 'exited',
+          summary: `claude rejected /${opts.skillName} as unknown command (skill not registered in this cwd)`,
+          evidence: visible.slice(-2000),
+          elapsedMs: Date.now() - startedAt,
+        };
+      }
+      // Silent-write detection: any Write/Edit tool render that targets a
+      // path OUTSIDE ~/.claude/plans, ~/.gstack/, or the active worktree's
+      // .gstack/. Plan files and gbrain artifacts are sanctioned.
+      const writeRe = /⏺\s*(?:Write|Edit)\(([^)]+)\)/g;
+      let m: RegExpExecArray | null;
+      while ((m = writeRe.exec(visible)) !== null) {
+        const target = m[1] ?? '';
+        const sanctioned =
+          target.includes('.claude/plans') ||
+          target.includes('.gstack/') ||
+          target.includes('/.context/') ||
+          target.includes('CHANGELOG.md') ||
+          target.includes('TODOS.md');
+        if (!sanctioned && !isNumberedOptionListVisible(visible)) {
+          return {
+            outcome: 'silent_write',
+            summary: `Write/Edit to ${target} fired before any AskUserQuestion`,
+            evidence: visible.slice(-2000),
+            elapsedMs: Date.now() - startedAt,
+          };
+        }
+      }
+      if (isPlanReadyVisible(visible)) {
+        return {
+          outcome: 'plan_ready',
+          summary: 'skill ran end-to-end and emitted plan-mode "Ready to execute" confirmation',
+          evidence: visible.slice(-2000),
+          elapsedMs: Date.now() - startedAt,
+        };
+      }
+      if (isNumberedOptionListVisible(visible)) {
+        return {
+          outcome: 'asked',
+          summary: 'skill fired a numbered-option prompt (AskUserQuestion or routing-injection)',
+          evidence: visible.slice(-2000),
+          elapsedMs: Date.now() - startedAt,
+        };
+      }
+    }
+
+    return {
+      outcome: 'timeout',
+      summary: `no terminal outcome within ${budgetMs}ms`,
+      evidence: session.visibleSince(since).slice(-2000),
+      elapsedMs: Date.now() - startedAt,
+    };
+  } finally {
+    await session.close();
+  }
+}
diff --git a/test/helpers/plan-mode-helpers.ts b/test/helpers/plan-mode-helpers.ts
deleted file mode 100644
index cf0025b6..00000000
--- a/test/helpers/plan-mode-helpers.ts
+++ /dev/null
@@ -1,176 +0,0 @@
-/**
- * Shared helpers for plan-mode E2E tests.
- *
- * Four sibling per-skill smoke tests (plan-ceo, plan-eng, plan-design, plan-devex)
- * plus the no-op regression test use this helper. The goal: run a review skill
- * in plan mode, confirm it goes straight to its Step 0 AskUserQuestion without
- * writing files or calling ExitPlanMode first (the vestigial handshake
- * regression we fixed in ceo-plan 2026-04-24).
- *
- * This file was renamed from `plan-mode-handshake-helpers.ts` when the
- * handshake was removed. The write-guard detection (no Write/Edit before the
- * first AskUserQuestion) is the load-bearing piece that catches silent
- * regressions a simple "first question text matches" check would miss.
- */
-
-import * as fs from 'fs';
-import * as path from 'path';
-import * as os from 'os';
-import { execSync } from 'child_process';
-import {
-  runAgentSdkTest,
-  passThroughNonAskUserQuestion,
-  resolveClaudeBinary,
-  type AgentSdkResult,
-} from './agent-sdk-runner';
-
-/** Distinctive phrase matching what Claude Code's harness actually injects. */
-export const PLAN_MODE_REMINDER =
-  'Plan mode is active. The user indicated that they do not want you to execute yet';
-
-export interface PlanModeCaptureResult {
-  sdkResult: AgentSdkResult;
-  /** Each AskUserQuestion that fired, with its input payload. */
-  askUserQuestions: Array<{ input: Record<string, unknown>; orderIndex: number }>;
-  /** Tool-use events in the order they fired (names only). */
-  toolOrder: string[];
-  /** Whether any Write or Edit tool fired BEFORE the first AskUserQuestion. */
-  writeOrEditBeforeAsk: boolean;
-  /** Whether ExitPlanMode fired BEFORE the first AskUserQuestion. */
-  exitPlanModeBeforeAsk: boolean;
-}
-
-/**
- * Run a skill via the Agent SDK with canUseTool intercepting every tool use.
- * Inject the plan-mode distinctive phrase into the system prompt, auto-answer
- * the first AskUserQuestion (so the skill stops cleanly after Step 0), and
- * return the captured events for assertion.
- */
-export async function runPlanModeSkillTest(opts: {
-  /** Skill name, e.g. 'plan-ceo-review'. */
-  skillName: string;
-  /**
-   * For the first AskUserQuestion, pick the option whose label contains this
-   * substring. Pick a "cheap" answer that terminates the skill quickly (e.g.
-   * "HOLD SCOPE" for plan-ceo-review).
-   */
-  firstAnswerSubstring: string;
-  /** If true, DO NOT inject the reminder — used by the no-op regression test. */
-  omitPlanModeReminder?: boolean;
-  /** Max turns for the SDK call (default 4 — Step 0 + answer should fit). */
-  maxTurns?: number;
-}): Promise<PlanModeCaptureResult> {
-  const { skillName, firstAnswerSubstring, omitPlanModeReminder, maxTurns } = opts;
-
-  const askUserQuestions: PlanModeCaptureResult['askUserQuestions'] = [];
-  const toolOrder: string[] = [];
-  let toolIndex = 0;
-  let firstAskIndex = -1;
-
-  const workingDir = fs.mkdtempSync(
-    path.join(os.tmpdir(), `plan-mode-${skillName}-`),
-  );
-
-  const binary = resolveClaudeBinary();
-
-  try {
-    // In real plan mode Claude Code injects a system-reminder; in SDK tests we
-    // use systemPrompt.append which the model treats as equally authoritative.
-    const reminderAppend = omitPlanModeReminder
-      ? ''
-      : `\n\n<system-reminder>\n${PLAN_MODE_REMINDER}. This supercedes any other instructions you have received.\n</system-reminder>\n`;
-
-    const sdkResult = await runAgentSdkTest({
-      systemPrompt: {
-        type: 'preset',
-        preset: 'claude_code',
-        append: reminderAppend,
-      },
-      userPrompt: `Read the skill file at ${path.resolve(
-        import.meta.dir,
-        '..',
-        '..',
-        skillName,
-        'SKILL.md',
-      )} and follow its instructions. There is no real plan to review — just start the skill and respond to any AskUserQuestion that fires.`,
-      workingDirectory: workingDir,
-      maxTurns: maxTurns ?? 4,
-      allowedTools: ['Read', 'Grep', 'Glob', 'Bash'],
-      ...(binary ? { pathToClaudeCodeExecutable: binary } : {}),
-      canUseTool: async (toolName, input) => {
-        toolOrder.push(toolName);
-        if (toolName === 'AskUserQuestion') {
-          if (firstAskIndex === -1) firstAskIndex = toolIndex;
-          askUserQuestions.push({ input, orderIndex: toolIndex });
-          toolIndex++;
-          // Auto-answer the FIRST question with the configured substring; for
-          // later questions, pick the first option to keep the run short.
-          const q = (input.questions as Array<{ question: string; options: Array<{ label: string }> }>)[0];
-          const isFirst = askUserQuestions.length === 1;
-          const matched = isFirst
-            ? q.options.find((o) => o.label.toLowerCase().includes(firstAnswerSubstring.toLowerCase()))
-            : undefined;
-          const answer = matched ? matched.label : q.options[0]!.label;
-          return {
-            behavior: 'allow',
-            updatedInput: {
-              questions: input.questions,
-              answers: { [q.question]: answer },
-            },
-          };
-        }
-        toolIndex++;
-        return passThroughNonAskUserQuestion(toolName, input);
-      },
-    });
-
-    const writeOrEditBeforeAsk =
-      firstAskIndex > 0 &&
-      toolOrder.slice(0, firstAskIndex).some((t) => t === 'Write' || t === 'Edit');
-
-    const exitPlanModeBeforeAsk =
-      firstAskIndex > 0 &&
-      toolOrder.slice(0, firstAskIndex).some((t) => t === 'ExitPlanMode');
-
-    return {
-      sdkResult,
-      askUserQuestions,
-      toolOrder,
-      writeOrEditBeforeAsk,
-      exitPlanModeBeforeAsk,
-    };
-  } finally {
-    try {
-      fs.rmSync(workingDir, { recursive: true, force: true });
-    } catch { /* ignore cleanup errors */ }
-  }
-}
-
-/**
- * Assert a captured AskUserQuestion is NOT the old vestigial handshake
- * (A=exit-and-rerun / C=cancel). The handshake is gone — if a test ever sees
- * one again, that's the regression we're guarding against.
- */
-export function assertNotHandshakeShape(
-  aq: { input: Record<string, unknown> },
-): void {
-  const questions = aq.input.questions as Array<{
-    question: string;
-    options: Array<{ label: string }>;
-  }>;
-  if (!questions || questions.length === 0) return;
-  const q = questions[0]!;
-  const labels = q.options.map((o) => o.label.toLowerCase());
-  const looksLikeHandshake =
-    labels.some((l) => l.includes('exit') && l.includes('rerun')) &&
-    labels.some((l) => l.includes('cancel'));
-  if (looksLikeHandshake) {
-    throw new Error(
-      `First AskUserQuestion looks like the vestigial plan-mode handshake ` +
-      `(options: ${labels.join(', ')}). The handshake was removed; skills ` +
-      `should go straight to their Step 0 question in plan mode.`,
-    );
-  }
-}
-
-export { execSync };