diff --git a/test/e2e-harness-audit.test.ts b/test/e2e-harness-audit.test.ts index b517ef84..ce709cae 100644 --- a/test/e2e-harness-audit.test.ts +++ b/test/e2e-harness-audit.test.ts @@ -78,12 +78,12 @@ function findInteractiveSkills(): string[] { /** * Scan a test file's contents for the canUseTool-via-harness pattern. * Either: direct canUseTool usage in runAgentSdkTest, or usage of the - * shared plan-mode-handshake-helpers that wrap it. + * shared plan-mode-helpers that wrap it. */ function hasCanUseToolCoverage(testFile: string): boolean { const content = fs.readFileSync(testFile, 'utf-8'); if (content.includes('canUseTool')) return true; - if (content.includes('runPlanModeHandshakeTest')) return true; + if (content.includes('runPlanModeSkillTest')) return true; return false; } diff --git a/test/gen-skill-docs.test.ts b/test/gen-skill-docs.test.ts index 60dc8ad9..8afc7b8e 100644 --- a/test/gen-skill-docs.test.ts +++ b/test/gen-skill-docs.test.ts @@ -2775,72 +2775,90 @@ describe('voice-triggers processing', () => { }); }); -describe('plan-mode handshake (interactive: true) resolver', () => { - const INTERACTIVE_SKILLS = [ +describe('plan-mode-info resolver (handshake-replacement)', () => { + const REVIEW_SKILLS = [ 'plan-ceo-review', 'plan-eng-review', 'plan-design-review', 'plan-devex-review', ]; + // Header for the vestigial handshake that was removed. If it ever reappears, + // someone accidentally re-introduced the resolver. const HANDSHAKE_MARKER = '## Plan Mode Handshake'; + // Header for the new plan-mode-info section (previously lived at the tail + // of completion-status.ts; now hoisted to position 1 of the preamble). + const PLAN_MODE_INFO_MARKER = '## Skill Invocation During Plan Mode'; - test.each(INTERACTIVE_SKILLS)( - '%s (Claude host) SKILL.md contains the handshake section', - (skill) => { - const content = fs.readFileSync(path.join(ROOT, skill, 'SKILL.md'), 'utf-8'); - expect(content).toContain(HANDSHAKE_MARKER); - expect(content).toContain( - 'Plan mode is active. The user indicated that they do not want you to execute yet', - ); - }, - ); - - test('handshake is absent from non-interactive Claude skills', () => { - const nonInteractive = ['ship', 'review', 'qa', 'office-hours', 'codex', 'retro', 'cso']; - for (const skill of nonInteractive) { - const content = fs.readFileSync(path.join(ROOT, skill, 'SKILL.md'), 'utf-8'); - expect(content).not.toContain(HANDSHAKE_MARKER); + test('vestigial handshake is absent from all generated Claude SKILL.md files', () => { + // Scan every generated SKILL.md under ROOT (top-level directory per skill). + // Using fs.readdirSync + filter instead of a glob so we catch any skill + // that gets added later without updating this list. + const entries = fs.readdirSync(ROOT, { withFileTypes: true }); + let checked = 0; + for (const entry of entries) { + if (!entry.isDirectory()) continue; + const skillMd = path.join(ROOT, entry.name, 'SKILL.md'); + if (!fs.existsSync(skillMd)) continue; + const content = fs.readFileSync(skillMd, 'utf-8'); + expect(content, `handshake marker in ${entry.name}/SKILL.md`).not.toContain(HANDSHAKE_MARKER); + checked++; } + expect(checked).toBeGreaterThan(0); }); - test('handshake is absent from non-Claude host outputs when present on disk', () => { + test('vestigial handshake is absent from non-Claude host outputs when present on disk', () => { // Non-Claude hosts render to hostSubdirs (.agents/, .openclaw/, etc). The - // handshake resolver returns '' when ctx.host !== 'claude', so those - // outputs must not contain the marker. The current gen-skill-docs layout - // prefixes skill names as `gstack-` under the hostSubdir; older - // layouts used `gstack/` (no prefix). Only stable-present paths - // are asserted — older ones may or may not exist per install history. - const candidateOutputs = [ - // Current prefixed layout - path.join(ROOT, '.agents', 'skills', 'gstack-plan-ceo-review', 'SKILL.md'), - path.join(ROOT, '.openclaw', 'skills', 'gstack-plan-ceo-review', 'SKILL.md'), - path.join(ROOT, '.opencode', 'skills', 'gstack-plan-ceo-review', 'SKILL.md'), - path.join(ROOT, '.factory', 'skills', 'gstack-plan-ceo-review', 'SKILL.md'), - path.join(ROOT, '.hermes', 'skills', 'gstack-plan-ceo-review', 'SKILL.md'), - ]; + // plan-mode-info resolver has no host-scoping — all hosts get the new + // section, none get the old handshake. Scan all candidate host dirs. + const hostDirs = ['.agents', '.openclaw', '.opencode', '.factory', '.hermes', '.kiro', '.cursor', '.slate']; let checked = 0; - for (const out of candidateOutputs) { - if (fs.existsSync(out)) { - const content = fs.readFileSync(out, 'utf-8'); - expect(content).not.toContain(HANDSHAKE_MARKER); + for (const host of hostDirs) { + const skillsRoot = path.join(ROOT, host, 'skills'); + if (!fs.existsSync(skillsRoot)) continue; + const entries = fs.readdirSync(skillsRoot, { withFileTypes: true }); + for (const entry of entries) { + if (!entry.isDirectory()) continue; + const skillMd = path.join(skillsRoot, entry.name, 'SKILL.md'); + if (!fs.existsSync(skillMd)) continue; + const content = fs.readFileSync(skillMd, 'utf-8'); + expect(content, `handshake marker in ${host}/skills/${entry.name}/SKILL.md`).not.toContain(HANDSHAKE_MARKER); checked++; } } - // At least one non-Claude host's output should exist after a full gen - // run; this test is meaningful only if we checked something. If no - // non-Claude outputs exist locally, the cross-host guarantee is still - // enforced by the resolver's ctx.host check; this test is belt-and- - // suspenders and becomes a no-op rather than a false positive. if (checked === 0) { // eslint-disable-next-line no-console console.warn( - 'plan-mode handshake: no non-Claude host outputs found for cross-host absence check — ' + + 'plan-mode-info: no non-Claude host outputs found for cross-host absence check — ' + 'run `bun run gen:skill-docs --host all` to populate', ); } }); + test.each(REVIEW_SKILLS)( + '%s/SKILL.md contains the new plan-mode-info section near the top', + (skill) => { + const content = fs.readFileSync(path.join(ROOT, skill, 'SKILL.md'), 'utf-8'); + const idx = content.indexOf(PLAN_MODE_INFO_MARKER); + expect(idx).toBeGreaterThan(0); + // Position 1 in preamble composition = within the first ~300 lines. + // Roughly translates to first ~15KB of text. + expect(idx).toBeLessThan(15_000); + }, + ); + + test('plan-mode-info is wired BEFORE generateUpgradeCheck in preamble', () => { + const content = fs.readFileSync( + path.join(ROOT, 'plan-ceo-review', 'SKILL.md'), + 'utf-8', + ); + const planModeIdx = content.indexOf(PLAN_MODE_INFO_MARKER); + const upgradeIdx = content.indexOf('UPGRADE_AVAILABLE'); + expect(planModeIdx).toBeGreaterThan(0); + expect(upgradeIdx).toBeGreaterThan(0); + expect(planModeIdx).toBeLessThan(upgradeIdx); + }); + test('0C-bis STOP block present in plan-ceo-review/SKILL.md', () => { const content = fs.readFileSync(path.join(ROOT, 'plan-ceo-review', 'SKILL.md'), 'utf-8'); const presentIdx = content.indexOf('Present these approach options via AskUserQuestion'); @@ -2851,16 +2869,4 @@ describe('plan-mode handshake (interactive: true) resolver', () => { expect(between).toContain('**STOP.**'); expect(between).toContain('Do NOT proceed to Step 0D or 0F until the user responds to 0C-bis'); }); - - test('handshake resolver is wired BEFORE generateUpgradeCheck in preamble', () => { - const content = fs.readFileSync( - path.join(ROOT, 'plan-ceo-review', 'SKILL.md'), - 'utf-8', - ); - const handshakeIdx = content.indexOf(HANDSHAKE_MARKER); - const upgradeIdx = content.indexOf('UPGRADE_AVAILABLE'); - expect(handshakeIdx).toBeGreaterThan(0); - expect(upgradeIdx).toBeGreaterThan(0); - expect(handshakeIdx).toBeLessThan(upgradeIdx); - }); }); diff --git a/test/helpers/plan-mode-handshake-helpers.ts b/test/helpers/plan-mode-helpers.ts similarity index 52% rename from test/helpers/plan-mode-handshake-helpers.ts rename to test/helpers/plan-mode-helpers.ts index 581932be..cf0025b6 100644 --- a/test/helpers/plan-mode-handshake-helpers.ts +++ b/test/helpers/plan-mode-helpers.ts @@ -1,16 +1,18 @@ /** - * Shared helpers for plan-mode handshake E2E tests. + * Shared helpers for plan-mode E2E tests. * - * Four sibling test files (plan-ceo, plan-eng, plan-design, plan-devex) exercise - * the identical handshake contract against different skills. This helper - * centralizes the canUseTool interceptor and the assertion shape so the four - * test files are thin wiring (~40 LOC each) and can't drift out of sync. + * Four sibling per-skill smoke tests (plan-ceo, plan-eng, plan-design, plan-devex) + * plus the no-op regression test use this helper. The goal: run a review skill + * in plan mode, confirm it goes straight to its Step 0 AskUserQuestion without + * writing files or calling ExitPlanMode first (the vestigial handshake + * regression we fixed in ceo-plan 2026-04-24). * - * See scripts/resolvers/preamble/generate-plan-mode-handshake.ts for the - * handshake prose that the tests below assert against. + * This file was renamed from `plan-mode-handshake-helpers.ts` when the + * handshake was removed. The write-guard detection (no Write/Edit before the + * first AskUserQuestion) is the load-bearing piece that catches silent + * regressions a simple "first question text matches" check would miss. */ -import { expect } from 'bun:test'; import * as fs from 'fs'; import * as path from 'path'; import * as os from 'os'; @@ -26,7 +28,7 @@ import { export const PLAN_MODE_REMINDER = 'Plan mode is active. The user indicated that they do not want you to execute yet'; -export interface HandshakeCaptureResult { +export interface PlanModeCaptureResult { sdkResult: AgentSdkResult; /** Each AskUserQuestion that fired, with its input payload. */ askUserQuestions: Array<{ input: Record; orderIndex: number }>; @@ -34,45 +36,46 @@ export interface HandshakeCaptureResult { toolOrder: string[]; /** Whether any Write or Edit tool fired BEFORE the first AskUserQuestion. */ writeOrEditBeforeAsk: boolean; + /** Whether ExitPlanMode fired BEFORE the first AskUserQuestion. */ + exitPlanModeBeforeAsk: boolean; } /** * Run a skill via the Agent SDK with canUseTool intercepting every tool use. - * Inject the plan-mode distinctive phrase into the system prompt and auto- - * answer the handshake with the given answerLabel ("Exit" or "Cancel"). Return - * the captured events for assertion. + * Inject the plan-mode distinctive phrase into the system prompt, auto-answer + * the first AskUserQuestion (so the skill stops cleanly after Step 0), and + * return the captured events for assertion. */ -export async function runPlanModeHandshakeTest(opts: { +export async function runPlanModeSkillTest(opts: { /** Skill name, e.g. 'plan-ceo-review'. */ skillName: string; - /** "Exit" to pick option A (exit-and-rerun) or "Cancel" for option C. */ - answerLabel: 'Exit' | 'Cancel'; + /** + * For the first AskUserQuestion, pick the option whose label contains this + * substring. Pick a "cheap" answer that terminates the skill quickly (e.g. + * "HOLD SCOPE" for plan-ceo-review). + */ + firstAnswerSubstring: string; /** If true, DO NOT inject the reminder — used by the no-op regression test. */ omitPlanModeReminder?: boolean; - /** Max turns for the SDK call (default 4 — handshake + exit should fit easily). */ + /** Max turns for the SDK call (default 4 — Step 0 + answer should fit). */ maxTurns?: number; -}): Promise { - const { skillName, answerLabel, omitPlanModeReminder, maxTurns } = opts; +}): Promise { + const { skillName, firstAnswerSubstring, omitPlanModeReminder, maxTurns } = opts; - const askUserQuestions: HandshakeCaptureResult['askUserQuestions'] = []; + const askUserQuestions: PlanModeCaptureResult['askUserQuestions'] = []; const toolOrder: string[] = []; let toolIndex = 0; let firstAskIndex = -1; const workingDir = fs.mkdtempSync( - path.join(os.tmpdir(), `plan-mode-handshake-${skillName}-`), + path.join(os.tmpdir(), `plan-mode-${skillName}-`), ); - // The SDK requires AskUserQuestion to be in the allowed tools list. The - // harness auto-adds it when canUseTool is supplied, but we also want Read - // so the skill can load its own file if it tries to. const binary = resolveClaudeBinary(); try { - // Inject the distinctive phrase into the system prompt by appending it to - // the default Claude Code preset. Claude Code's real plan mode uses an - // injected system-reminder; in SDK tests we use systemPrompt.append which - // the model treats as equally authoritative. + // In real plan mode Claude Code injects a system-reminder; in SDK tests we + // use systemPrompt.append which the model treats as equally authoritative. const reminderAppend = omitPlanModeReminder ? '' : `\n\n\n${PLAN_MODE_REMINDER}. This supercedes any other instructions you have received.\n\n`; @@ -100,9 +103,13 @@ export async function runPlanModeHandshakeTest(opts: { if (firstAskIndex === -1) firstAskIndex = toolIndex; askUserQuestions.push({ input, orderIndex: toolIndex }); toolIndex++; - // Auto-answer with the label the test specified. + // Auto-answer the FIRST question with the configured substring; for + // later questions, pick the first option to keep the run short. const q = (input.questions as Array<{ question: string; options: Array<{ label: string }> }>)[0]; - const matched = q.options.find((o) => o.label.includes(answerLabel)); + const isFirst = askUserQuestions.length === 1; + const matched = isFirst + ? q.options.find((o) => o.label.toLowerCase().includes(firstAnswerSubstring.toLowerCase())) + : undefined; const answer = matched ? matched.label : q.options[0]!.label; return { behavior: 'allow', @@ -121,7 +128,17 @@ export async function runPlanModeHandshakeTest(opts: { firstAskIndex > 0 && toolOrder.slice(0, firstAskIndex).some((t) => t === 'Write' || t === 'Edit'); - return { sdkResult, askUserQuestions, toolOrder, writeOrEditBeforeAsk }; + const exitPlanModeBeforeAsk = + firstAskIndex > 0 && + toolOrder.slice(0, firstAskIndex).some((t) => t === 'ExitPlanMode'); + + return { + sdkResult, + askUserQuestions, + toolOrder, + writeOrEditBeforeAsk, + exitPlanModeBeforeAsk, + }; } finally { try { fs.rmSync(workingDir, { recursive: true, force: true }); @@ -129,38 +146,31 @@ export async function runPlanModeHandshakeTest(opts: { } } -/** Assert the shape of a fired handshake AskUserQuestion. */ -export function assertHandshakeShape( +/** + * Assert a captured AskUserQuestion is NOT the old vestigial handshake + * (A=exit-and-rerun / C=cancel). The handshake is gone — if a test ever sees + * one again, that's the regression we're guarding against. + */ +export function assertNotHandshakeShape( aq: { input: Record }, ): void { const questions = aq.input.questions as Array<{ question: string; options: Array<{ label: string }>; }>; - expect(questions).toBeDefined(); - expect(questions.length).toBe(1); + if (!questions || questions.length === 0) return; const q = questions[0]!; - // D8 dropped Option B; handshake has exactly 2 options. - expect(q.options.length).toBe(2); - const labels = q.options.map((o) => o.label); - expect(labels.some((l) => l.includes('Exit'))).toBe(true); - expect(labels.some((l) => l.includes('Cancel'))).toBe(true); -} - -/** Read the skill-usage.jsonl log and return handshake entries. */ -export function readHandshakeLog(): Array> { - const logPath = path.join(os.homedir(), '.gstack', 'analytics', 'skill-usage.jsonl'); - if (!fs.existsSync(logPath)) return []; - const lines = fs.readFileSync(logPath, 'utf-8').split('\n').filter(Boolean); - return lines - .map((line) => { - try { - return JSON.parse(line); - } catch { - return null; - } - }) - .filter((x): x is Record => x !== null && x.event === 'plan_mode_handshake'); + const labels = q.options.map((o) => o.label.toLowerCase()); + const looksLikeHandshake = + labels.some((l) => l.includes('exit') && l.includes('rerun')) && + labels.some((l) => l.includes('cancel')); + if (looksLikeHandshake) { + throw new Error( + `First AskUserQuestion looks like the vestigial plan-mode handshake ` + + `(options: ${labels.join(', ')}). The handshake was removed; skills ` + + `should go straight to their Step 0 question in plan mode.`, + ); + } } export { execSync }; diff --git a/test/helpers/touchfiles.ts b/test/helpers/touchfiles.ts index d039e771..abc76023 100644 --- a/test/helpers/touchfiles.ts +++ b/test/helpers/touchfiles.ts @@ -82,16 +82,16 @@ export const E2E_TOUCHFILES: Record = { 'plan-eng-review-artifact': ['plan-eng-review/**'], 'plan-review-report': ['plan-eng-review/**', 'scripts/gen-skill-docs.ts'], - // Plan-mode handshake (v1.10.2.0) — gate-tier safety regression tests. - // Each fires when any of: the interactive skill's template, the resolver, - // preamble composition, the Agent SDK harness, the question registry, or - // the one-way-door classifier changes. - 'plan-ceo-review-plan-mode': ['plan-ceo-review/**', 'scripts/resolvers/preamble/generate-plan-mode-handshake.ts', 'scripts/resolvers/preamble.ts', 'scripts/question-registry.ts', 'scripts/one-way-doors.ts', 'test/helpers/agent-sdk-runner.ts'], - 'plan-eng-review-plan-mode': ['plan-eng-review/**', 'scripts/resolvers/preamble/generate-plan-mode-handshake.ts', 'scripts/resolvers/preamble.ts', 'scripts/question-registry.ts', 'scripts/one-way-doors.ts', 'test/helpers/agent-sdk-runner.ts'], - 'plan-design-review-plan-mode-handshake': ['plan-design-review/**', 'scripts/resolvers/preamble/generate-plan-mode-handshake.ts', 'scripts/resolvers/preamble.ts', 'scripts/question-registry.ts', 'scripts/one-way-doors.ts', 'test/helpers/agent-sdk-runner.ts'], - 'plan-devex-review-plan-mode': ['plan-devex-review/**', 'scripts/resolvers/preamble/generate-plan-mode-handshake.ts', 'scripts/resolvers/preamble.ts', 'scripts/question-registry.ts', 'scripts/one-way-doors.ts', 'test/helpers/agent-sdk-runner.ts'], - 'plan-mode-no-op': ['plan-ceo-review/**', 'scripts/resolvers/preamble/generate-plan-mode-handshake.ts', 'scripts/resolvers/preamble.ts', 'test/helpers/agent-sdk-runner.ts'], - 'e2e-harness-audit': ['plan-ceo-review/**', 'plan-eng-review/**', 'plan-design-review/**', 'plan-devex-review/**', 'scripts/resolvers/preamble/generate-plan-mode-handshake.ts', 'test/helpers/agent-sdk-runner.ts'], + // Plan-mode smoke tests — gate-tier safety regression tests. Each fires when + // any of: the interactive skill's template, the plan-mode resolver + // (completion-status now owns generatePlanModeInfo), preamble composition, + // the Agent SDK harness, or the shared plan-mode-helpers change. + 'plan-ceo-review-plan-mode': ['plan-ceo-review/**', 'scripts/resolvers/preamble/generate-completion-status.ts', 'scripts/resolvers/preamble.ts', 'test/helpers/agent-sdk-runner.ts', 'test/helpers/plan-mode-helpers.ts'], + 'plan-eng-review-plan-mode': ['plan-eng-review/**', 'scripts/resolvers/preamble/generate-completion-status.ts', 'scripts/resolvers/preamble.ts', 'test/helpers/agent-sdk-runner.ts', 'test/helpers/plan-mode-helpers.ts'], + 'plan-design-review-plan-mode': ['plan-design-review/**', 'scripts/resolvers/preamble/generate-completion-status.ts', 'scripts/resolvers/preamble.ts', 'test/helpers/agent-sdk-runner.ts', 'test/helpers/plan-mode-helpers.ts'], + 'plan-devex-review-plan-mode': ['plan-devex-review/**', 'scripts/resolvers/preamble/generate-completion-status.ts', 'scripts/resolvers/preamble.ts', 'test/helpers/agent-sdk-runner.ts', 'test/helpers/plan-mode-helpers.ts'], + 'plan-mode-no-op': ['plan-ceo-review/**', 'scripts/resolvers/preamble/generate-completion-status.ts', 'scripts/resolvers/preamble.ts', 'test/helpers/agent-sdk-runner.ts', 'test/helpers/plan-mode-helpers.ts'], + 'e2e-harness-audit': ['plan-ceo-review/**', 'plan-eng-review/**', 'plan-design-review/**', 'plan-devex-review/**', 'scripts/resolvers/preamble/generate-completion-status.ts', 'test/helpers/agent-sdk-runner.ts', 'test/helpers/plan-mode-helpers.ts'], 'brain-privacy-gate': ['scripts/resolvers/preamble/generate-brain-sync-block.ts', 'scripts/resolvers/preamble.ts', 'bin/gstack-brain-sync', 'bin/gstack-brain-init', 'bin/gstack-config', 'test/helpers/agent-sdk-runner.ts'], // AskUserQuestion format regression (RECOMMENDATION + Completeness: N/10) @@ -332,7 +332,7 @@ export const E2E_TIERS: Record = { // Plan-mode handshake — deterministic safety regression, gate-tier 'plan-ceo-review-plan-mode': 'gate', 'plan-eng-review-plan-mode': 'gate', - 'plan-design-review-plan-mode-handshake': 'gate', + 'plan-design-review-plan-mode': 'gate', 'plan-devex-review-plan-mode': 'gate', 'plan-mode-no-op': 'gate', 'e2e-harness-audit': 'gate', diff --git a/test/skill-e2e-plan-ceo-plan-mode.test.ts b/test/skill-e2e-plan-ceo-plan-mode.test.ts index 858e07eb..8914967e 100644 --- a/test/skill-e2e-plan-ceo-plan-mode.test.ts +++ b/test/skill-e2e-plan-ceo-plan-mode.test.ts @@ -1,40 +1,38 @@ /** - * plan-ceo-review plan-mode handshake E2E (gate tier, paid). + * plan-ceo-review plan-mode smoke test (gate tier, paid). * * Asserts: when /plan-ceo-review is invoked with the plan-mode distinctive - * phrase in the system reminder, the skill fires AskUserQuestion FIRST - * (before any Write or Edit), the question has exactly 2 options (A exit, - * C cancel), picking "Exit" leads to an orderly exit with no plan file - * written. + * phrase in the system reminder, the skill goes STRAIGHT to its Step 0 + * scope-mode AskUserQuestion. Specifically: + * 1. First AskUserQuestion is NOT the old vestigial handshake + * (A=exit-and-rerun / C=cancel). + * 2. No Write or Edit tool fires before the first AskUserQuestion + * (catches silent plan-file-write bypass). + * 3. ExitPlanMode does not fire before the first AskUserQuestion. * * Cost: ~$0.50–$1.00 per run. Gated: EVALS=1 EVALS_TIER=gate. - * Depends on: scripts/resolvers/preamble/generate-plan-mode-handshake.ts, - * test/helpers/agent-sdk-runner.ts (canUseTool extension). */ import { describe, test, expect } from 'bun:test'; import { - runPlanModeHandshakeTest, - assertHandshakeShape, -} from './helpers/plan-mode-handshake-helpers'; + runPlanModeSkillTest, + assertNotHandshakeShape, +} from './helpers/plan-mode-helpers'; const shouldRun = !!process.env.EVALS && process.env.EVALS_TIER === 'gate'; const describeE2E = shouldRun ? describe : describe.skip; -describeE2E('plan-ceo-review plan-mode handshake (gate)', () => { - test('handshake fires before any Write/Edit when plan mode is detected', async () => { - const result = await runPlanModeHandshakeTest({ +describeE2E('plan-ceo-review plan-mode smoke (gate)', () => { + test('goes straight to scope-mode question, no handshake, no silent writes', async () => { + const result = await runPlanModeSkillTest({ skillName: 'plan-ceo-review', - answerLabel: 'Exit', + // Step 0 asks for review mode; HOLD is the cheapest, most-neutral answer. + firstAnswerSubstring: 'HOLD', }); - // Handshake must have fired at least once. expect(result.askUserQuestions.length).toBeGreaterThanOrEqual(1); - // Critically: no Write or Edit fired before the first AskUserQuestion. - // This is the bug v1.10.2.0 fixes — plan mode used to allow silent - // plan-file writes without any interactive gate. + assertNotHandshakeShape(result.askUserQuestions[0]!); expect(result.writeOrEditBeforeAsk).toBe(false); - // Handshake shape: 2 options (Exit/Cancel), Option B dropped per D8. - assertHandshakeShape(result.askUserQuestions[0]!); + expect(result.exitPlanModeBeforeAsk).toBe(false); }, 120_000); }); diff --git a/test/skill-e2e-plan-design-plan-mode.test.ts b/test/skill-e2e-plan-design-plan-mode.test.ts index 1fb7aaf5..98d050e1 100644 --- a/test/skill-e2e-plan-design-plan-mode.test.ts +++ b/test/skill-e2e-plan-design-plan-mode.test.ts @@ -1,28 +1,31 @@ /** - * plan-design-review plan-mode handshake E2E (gate tier, paid). + * plan-design-review plan-mode smoke test (gate tier, paid). * * See test/skill-e2e-plan-ceo-plan-mode.test.ts for the shared assertion - * contract. This file exercises the same handshake against /plan-design-review. + * contract. Exercises the same assertions against /plan-design-review. */ import { describe, test, expect } from 'bun:test'; import { - runPlanModeHandshakeTest, - assertHandshakeShape, -} from './helpers/plan-mode-handshake-helpers'; + runPlanModeSkillTest, + assertNotHandshakeShape, +} from './helpers/plan-mode-helpers'; const shouldRun = !!process.env.EVALS && process.env.EVALS_TIER === 'gate'; const describeE2E = shouldRun ? describe : describe.skip; -describeE2E('plan-design-review plan-mode handshake (gate)', () => { - test('handshake fires before any Write/Edit when plan mode is detected', async () => { - const result = await runPlanModeHandshakeTest({ +describeE2E('plan-design-review plan-mode smoke (gate)', () => { + test('goes straight to first design question, no handshake, no silent writes', async () => { + const result = await runPlanModeSkillTest({ skillName: 'plan-design-review', - answerLabel: 'Cancel', // exercise the C-cancel branch instead of A-exit + // First question for design review varies; pick any reasonable match. + // The substring match falls back to the first option if no match. + firstAnswerSubstring: '7', }); expect(result.askUserQuestions.length).toBeGreaterThanOrEqual(1); + assertNotHandshakeShape(result.askUserQuestions[0]!); expect(result.writeOrEditBeforeAsk).toBe(false); - assertHandshakeShape(result.askUserQuestions[0]!); + expect(result.exitPlanModeBeforeAsk).toBe(false); }, 120_000); }); diff --git a/test/skill-e2e-plan-devex-plan-mode.test.ts b/test/skill-e2e-plan-devex-plan-mode.test.ts index 2ede50e2..64bc447e 100644 --- a/test/skill-e2e-plan-devex-plan-mode.test.ts +++ b/test/skill-e2e-plan-devex-plan-mode.test.ts @@ -1,28 +1,30 @@ /** - * plan-devex-review plan-mode handshake E2E (gate tier, paid). + * plan-devex-review plan-mode smoke test (gate tier, paid). * * See test/skill-e2e-plan-ceo-plan-mode.test.ts for the shared assertion - * contract. This file exercises the same handshake against /plan-devex-review. + * contract. Exercises the same assertions against /plan-devex-review. */ import { describe, test, expect } from 'bun:test'; import { - runPlanModeHandshakeTest, - assertHandshakeShape, -} from './helpers/plan-mode-handshake-helpers'; + runPlanModeSkillTest, + assertNotHandshakeShape, +} from './helpers/plan-mode-helpers'; const shouldRun = !!process.env.EVALS && process.env.EVALS_TIER === 'gate'; const describeE2E = shouldRun ? describe : describe.skip; -describeE2E('plan-devex-review plan-mode handshake (gate)', () => { - test('handshake fires before any Write/Edit when plan mode is detected', async () => { - const result = await runPlanModeHandshakeTest({ +describeE2E('plan-devex-review plan-mode smoke (gate)', () => { + test('goes straight to DX-mode question, no handshake, no silent writes', async () => { + const result = await runPlanModeSkillTest({ skillName: 'plan-devex-review', - answerLabel: 'Exit', + // Step 0 asks for DX review mode; TRIAGE is the lightest-weight mode. + firstAnswerSubstring: 'TRIAGE', }); expect(result.askUserQuestions.length).toBeGreaterThanOrEqual(1); + assertNotHandshakeShape(result.askUserQuestions[0]!); expect(result.writeOrEditBeforeAsk).toBe(false); - assertHandshakeShape(result.askUserQuestions[0]!); + expect(result.exitPlanModeBeforeAsk).toBe(false); }, 120_000); }); diff --git a/test/skill-e2e-plan-eng-plan-mode.test.ts b/test/skill-e2e-plan-eng-plan-mode.test.ts index 16da9d7a..c57bbee4 100644 --- a/test/skill-e2e-plan-eng-plan-mode.test.ts +++ b/test/skill-e2e-plan-eng-plan-mode.test.ts @@ -1,28 +1,29 @@ /** - * plan-eng-review plan-mode handshake E2E (gate tier, paid). + * plan-eng-review plan-mode smoke test (gate tier, paid). * * See test/skill-e2e-plan-ceo-plan-mode.test.ts for the shared assertion - * contract. This file exercises the same handshake against /plan-eng-review. + * contract. This file exercises the same assertions against /plan-eng-review. */ import { describe, test, expect } from 'bun:test'; import { - runPlanModeHandshakeTest, - assertHandshakeShape, -} from './helpers/plan-mode-handshake-helpers'; + runPlanModeSkillTest, + assertNotHandshakeShape, +} from './helpers/plan-mode-helpers'; const shouldRun = !!process.env.EVALS && process.env.EVALS_TIER === 'gate'; const describeE2E = shouldRun ? describe : describe.skip; -describeE2E('plan-eng-review plan-mode handshake (gate)', () => { - test('handshake fires before any Write/Edit when plan mode is detected', async () => { - const result = await runPlanModeHandshakeTest({ +describeE2E('plan-eng-review plan-mode smoke (gate)', () => { + test('goes straight to scope-mode question, no handshake, no silent writes', async () => { + const result = await runPlanModeSkillTest({ skillName: 'plan-eng-review', - answerLabel: 'Exit', + firstAnswerSubstring: 'HOLD', }); expect(result.askUserQuestions.length).toBeGreaterThanOrEqual(1); + assertNotHandshakeShape(result.askUserQuestions[0]!); expect(result.writeOrEditBeforeAsk).toBe(false); - assertHandshakeShape(result.askUserQuestions[0]!); + expect(result.exitPlanModeBeforeAsk).toBe(false); }, 120_000); }); diff --git a/test/skill-e2e-plan-mode-no-op.test.ts b/test/skill-e2e-plan-mode-no-op.test.ts index e222fbff..061e9cd8 100644 --- a/test/skill-e2e-plan-mode-no-op.test.ts +++ b/test/skill-e2e-plan-mode-no-op.test.ts @@ -1,41 +1,45 @@ /** - * Plan-mode handshake negative regression (gate tier, paid). + * Plan-mode-info no-op regression (gate tier, paid). * * Asserts: when /plan-ceo-review is invoked WITHOUT the plan-mode distinctive - * phrase in the system reminder, the handshake does NOT fire. The skill - * should proceed to its normal Step 0 flow. This is the REGRESSION RULE - * guardrail — the handshake must be a no-op outside plan mode or it breaks - * every existing interactive-review session. + * phrase in the system reminder, the plan-mode-info preamble section is a + * no-op. The skill should proceed to its normal Step 0 flow with no + * AskUserQuestion echoing or referencing the plan-mode reminder text. + * + * This guardrails the "outside plan mode, this block doesn't interfere" + * case — a different coverage case from the per-skill in-plan-mode smokes. + * If the plan-mode-info section ever starts misfiring for non-plan-mode + * sessions, this test catches it. * * Cost: ~$0.50 per run. Gated: EVALS=1 EVALS_TIER=gate. */ import { describe, test, expect } from 'bun:test'; import { - runPlanModeHandshakeTest, + runPlanModeSkillTest, PLAN_MODE_REMINDER, -} from './helpers/plan-mode-handshake-helpers'; +} from './helpers/plan-mode-helpers'; const shouldRun = !!process.env.EVALS && process.env.EVALS_TIER === 'gate'; const describeE2E = shouldRun ? describe : describe.skip; -describeE2E('plan-mode handshake no-op outside plan mode (gate regression)', () => { - test('handshake does NOT fire when distinctive phrase is absent', async () => { - const result = await runPlanModeHandshakeTest({ +describeE2E('plan-mode-info no-op outside plan mode (gate regression)', () => { + test('no AskUserQuestion echoes the plan-mode reminder when absent', async () => { + const result = await runPlanModeSkillTest({ skillName: 'plan-ceo-review', - answerLabel: 'Exit', // ignored — handshake should never fire + firstAnswerSubstring: 'HOLD', omitPlanModeReminder: true, - maxTurns: 3, // enough to see Step 0 start, but bounded + maxTurns: 3, }); - // The handshake AskUserQuestion should NOT have fired during Step 0 entry. - // Other AskUserQuestions may fire later in the skill (e.g., Step 0C-bis), - // but they will NOT have the handshake's question text. + // Skill should still hit Step 0 normally outside plan mode. + expect(result.askUserQuestions.length).toBeGreaterThanOrEqual(1); + + // No AskUserQuestion should echo the plan-mode distinctive phrase. + // If one does, the plan-mode-info section is leaking outside plan mode. for (const aq of result.askUserQuestions) { const questions = aq.input.questions as Array<{ question: string }>; for (const q of questions) { - // The handshake's question mentions the distinctive phrase in its - // prose; a non-handshake AskUserQuestion won't. expect(q.question).not.toContain(PLAN_MODE_REMINDER); } }