/**
 * Shared helpers for plan-mode handshake E2E tests.
 *
 * Four sibling test files (plan-ceo, plan-eng, plan-design, plan-devex) exercise
 * the identical handshake contract against different skills. This helper
 * centralizes the canUseTool interceptor and the assertion shape so the four
 * test files are thin wiring (~40 LOC each) and can't drift out of sync.
 *
 * See scripts/resolvers/preamble/generate-plan-mode-handshake.ts for the
 * handshake prose that the tests below assert against.
 */

import { expect } from 'bun:test';
import * as fs from 'fs';
import * as path from 'path';
import * as os from 'os';
import { execSync } from 'child_process';
import {
  runAgentSdkTest,
  passThroughNonAskUserQuestion,
  resolveClaudeBinary,
  type AgentSdkResult,
} from './agent-sdk-runner';

/** Distinctive phrase matching what Claude Code's harness actually injects. */
export const PLAN_MODE_REMINDER =
  'Plan mode is active. The user indicated that they do not want you to execute yet';

/** Shape of one question inside an AskUserQuestion tool input, as read by these helpers. */
type HandshakeQuestion = {
  question: string;
  options: Array<{ label: string }>;
};

export interface HandshakeCaptureResult {
  sdkResult: AgentSdkResult;
  /**
   * Each AskUserQuestion that fired, with its input payload. `orderIndex` is
   * the zero-based position of that call within `toolOrder`.
   */
  askUserQuestions: Array<{ input: Record<string, unknown>; orderIndex: number }>;
  /** Tool-use events in the order they fired (names only). */
  toolOrder: string[];
  /**
   * Whether any Write or Edit tool fired BEFORE the first AskUserQuestion.
   * Always false when no AskUserQuestion fired at all.
   */
  writeOrEditBeforeAsk: boolean;
}

/**
 * Pick the answer for the first question of an AskUserQuestion payload.
 *
 * Chooses the first option whose label contains `answerLabel`, falling back to
 * the first option when none matches.
 *
 * @throws Error when the payload has no usable first question or no options,
 *   so a malformed tool input surfaces with a readable message instead of an
 *   opaque TypeError from indexing `undefined`.
 */
function selectHandshakeAnswer(
  input: Record<string, unknown>,
  answerLabel: string,
): { question: string; answer: string } {
  const first: unknown = Array.isArray(input.questions) ? input.questions[0] : undefined;
  if (typeof first !== 'object' || first === null) {
    throw new Error(
      'AskUserQuestion fired without a non-empty `questions` array; cannot auto-answer the handshake.',
    );
  }

  const { question, options } = first as Partial<HandshakeQuestion>;
  if (typeof question !== 'string' || !Array.isArray(options)) {
    throw new Error(
      'AskUserQuestion payload is missing `question` or `options` on its first question.',
    );
  }

  const fallback = options[0];
  if (fallback === undefined) {
    throw new Error(`AskUserQuestion "${question}" has no options to answer with.`);
  }

  const matched = options.find((o) => o.label.includes(answerLabel));
  return { question, answer: (matched ?? fallback).label };
}

/**
 * Run a skill via the Agent SDK with canUseTool intercepting every tool use.
 * Inject the plan-mode distinctive phrase into the system prompt and auto-
 * answer the handshake with the given answerLabel ("Exit" or "Cancel"). Return
 * the captured events for assertion.
 *
 * A fresh temporary working directory is created for the run and removed
 * afterwards (best effort), whether the run succeeds or throws.
 *
 * @returns The SDK result plus every intercepted tool name, every
 *   AskUserQuestion payload, and the write-before-ask flag.
 */
export async function runPlanModeHandshakeTest(opts: {
  /** Skill name, e.g. 'plan-ceo-review'. */
  skillName: string;
  /** "Exit" to pick option A (exit-and-rerun) or "Cancel" for option C. */
  answerLabel: 'Exit' | 'Cancel';
  /** If true, DO NOT inject the reminder — used by the no-op regression test. */
  omitPlanModeReminder?: boolean;
  /** Max turns for the SDK call (default 4 — handshake + exit should fit easily). */
  maxTurns?: number;
}): Promise<HandshakeCaptureResult> {
  const { skillName, answerLabel, omitPlanModeReminder, maxTurns } = opts;

  const askUserQuestions: HandshakeCaptureResult['askUserQuestions'] = [];
  const toolOrder: string[] = [];
  // Position in toolOrder of the first AskUserQuestion; -1 until one fires.
  let firstAskIndex = -1;

  const workingDir = fs.mkdtempSync(
    path.join(os.tmpdir(), `plan-mode-handshake-${skillName}-`),
  );
  const binary = resolveClaudeBinary();

  try {
    // Inject the distinctive phrase into the system prompt by appending it to
    // the default Claude Code preset. Claude Code's real plan mode uses an
    // injected system-reminder; in SDK tests we use systemPrompt.append which
    // the model treats as equally authoritative.
    const reminderAppend = omitPlanModeReminder
      ? ''
      : `\n\n\n${PLAN_MODE_REMINDER}. This supercedes any other instructions you have received.\n\n`;

    const sdkResult = await runAgentSdkTest({
      systemPrompt: {
        type: 'preset',
        preset: 'claude_code',
        append: reminderAppend,
      },
      userPrompt: `Read the skill file at ${path.resolve(
        import.meta.dir,
        '..',
        '..',
        skillName,
        'SKILL.md',
      )} and follow its instructions. There is no real plan to review — just start the skill and respond to any AskUserQuestion that fires.`,
      workingDirectory: workingDir,
      maxTurns: maxTurns ?? 4,
      // The SDK requires AskUserQuestion to be in the allowed tools list. The
      // harness auto-adds it when canUseTool is supplied, but we also want the
      // read-only tools (and Bash) so the skill can load its own file if it
      // tries to.
      allowedTools: ['Read', 'Grep', 'Glob', 'Bash'],
      ...(binary ? { pathToClaudeCodeExecutable: binary } : {}),
      canUseTool: async (toolName, input) => {
        toolOrder.push(toolName);
        // Every call pushes exactly one name, so this is the call's position.
        const orderIndex = toolOrder.length - 1;

        if (toolName !== 'AskUserQuestion') {
          return passThroughNonAskUserQuestion(toolName, input);
        }

        if (firstAskIndex === -1) firstAskIndex = orderIndex;
        // Record the payload before validating it so a malformed question is
        // still visible in the captured state.
        askUserQuestions.push({ input, orderIndex });

        // Auto-answer with the label the test specified.
        const { question, answer } = selectHandshakeAnswer(input, answerLabel);
        return {
          behavior: 'allow',
          updatedInput: {
            questions: input.questions,
            answers: { [question]: answer },
          },
        };
      },
    });

    // firstAskIndex <= 0 means either no ask fired or nothing preceded it.
    const writeOrEditBeforeAsk =
      firstAskIndex > 0 &&
      toolOrder.slice(0, firstAskIndex).some((t) => t === 'Write' || t === 'Edit');

    return { sdkResult, askUserQuestions, toolOrder, writeOrEditBeforeAsk };
  } finally {
    try {
      fs.rmSync(workingDir, { recursive: true, force: true });
    } catch {
      /* ignore cleanup errors */
    }
  }
}

/**
 * Assert the shape of a fired handshake AskUserQuestion: exactly one question
 * with exactly two options, one labelled with "Exit" and one with "Cancel".
 */
export function assertHandshakeShape(aq: { input: Record<string, unknown> }): void {
  const questions = aq.input.questions as HandshakeQuestion[];
  expect(questions).toBeDefined();
  expect(questions.length).toBe(1);

  const q = questions[0]!;
  // D8 dropped Option B; handshake has exactly 2 options.
  expect(q.options.length).toBe(2);

  const labels = q.options.map((o) => o.label);
  expect(labels.some((l) => l.includes('Exit'))).toBe(true);
  expect(labels.some((l) => l.includes('Cancel'))).toBe(true);
}

/**
 * Read the skill-usage.jsonl log and return handshake entries.
 *
 * Reads `~/.gstack/analytics/skill-usage.jsonl`, skips blank and unparseable
 * lines, and keeps only JSON objects whose `event` is `'plan_mode_handshake'`.
 * Returns an empty array when the log file does not exist.
 */
export function readHandshakeLog(): Array<Record<string, unknown>> {
  const logPath = path.join(os.homedir(), '.gstack', 'analytics', 'skill-usage.jsonl');
  if (!fs.existsSync(logPath)) return [];

  const entries: Array<Record<string, unknown>> = [];
  for (const line of fs.readFileSync(logPath, 'utf-8').split('\n')) {
    if (!line) continue;

    let parsed: unknown;
    try {
      parsed = JSON.parse(line);
    } catch {
      // A corrupt line must not hide the valid entries around it.
      continue;
    }

    if (typeof parsed !== 'object' || parsed === null) continue;
    const record = parsed as Record<string, unknown>;
    if (record.event === 'plan_mode_handshake') entries.push(record);
  }
  return entries;
}

export { execSync };