diff --git a/test/e2e-harness-audit.test.ts b/test/e2e-harness-audit.test.ts new file mode 100644 index 00000000..b517ef84 --- /dev/null +++ b/test/e2e-harness-audit.test.ts @@ -0,0 +1,113 @@ +/** + * E2E harness audit — every skill with `interactive: true` in its frontmatter + * must have at least one test file that uses `canUseTool` via the extended + * agent-sdk-runner. This prevents future drift where a skill opts into the + * handshake without adding real coverage. + * + * Runs as a free unit test (no API calls). Pure filesystem scan. + */ + +import { describe, test, expect } from 'bun:test'; +import * as fs from 'fs'; +import * as path from 'path'; + +const ROOT = path.resolve(import.meta.dir, '..'); +const SKILL_GLOBS = [ + 'plan-ceo-review', + 'plan-eng-review', + 'plan-design-review', + 'plan-devex-review', + 'office-hours', + 'codex', + 'investigate', + 'qa', + 'retro', + 'cso', + 'review', + 'ship', + 'design-review', + 'devex-review', + 'qa-only', + 'design-consultation', + 'design-shotgun', + 'autoplan', + 'land-and-deploy', + 'plan-tune', + 'document-release', + 'context-save', + 'context-restore', + 'health', + 'setup-deploy', + 'setup-browser-cookies', + 'canary', + 'learn', + 'benchmark', + 'benchmark-models', + 'make-pdf', + 'open-gstack-browser', + 'gstack-upgrade', + 'pair-agent', + 'design-html', + 'freeze', + 'unfreeze', + 'careful', + 'guard', +]; + +/** + * Load .tmpl files for each skill and return the names of those that have + * `interactive: true` in frontmatter. + */ +function findInteractiveSkills(): string[] { + const interactive: string[] = []; + for (const skill of SKILL_GLOBS) { + const tmplPath = path.join(ROOT, skill, 'SKILL.md.tmpl'); + if (!fs.existsSync(tmplPath)) continue; + const content = fs.readFileSync(tmplPath, 'utf-8'); + // Frontmatter lives between the first '---' and the next '---'. 
+ const fmEnd = content.indexOf('\n---', 4); + if (fmEnd < 0) continue; + const frontmatter = content.slice(0, fmEnd); + if (/^interactive:\s*true\s*$/m.test(frontmatter)) { + interactive.push(skill); + } + } + return interactive; +} + +/** + * Scan a test file's contents for the canUseTool-via-harness pattern. + * Either: direct canUseTool usage in runAgentSdkTest, or usage of the + * shared plan-mode-handshake-helpers that wrap it. + */ +function hasCanUseToolCoverage(testFile: string): boolean { + const content = fs.readFileSync(testFile, 'utf-8'); + if (content.includes('canUseTool')) return true; + if (content.includes('runPlanModeHandshakeTest')) return true; + return false; +} + +describe('E2E harness audit — interactive skills must have canUseTool coverage', () => { + test('every interactive: true skill has at least one canUseTool test', () => { + const interactive = findInteractiveSkills(); + expect(interactive.length).toBeGreaterThan(0); + + const testFiles = fs + .readdirSync(path.join(ROOT, 'test')) + .filter((f) => f.startsWith('skill-e2e-') && f.endsWith('.test.ts')) + .map((f) => path.join(ROOT, 'test', f)); + + const filesWithCoverage = testFiles.filter(hasCanUseToolCoverage); + + for (const skill of interactive) { + // Match the skill name in any test file that uses canUseTool. File + // naming convention is `skill-e2e-<skill>-*.test.ts` — either the full + // name (plan-ceo-review) or a subset token. 
+ const hasDedicatedTest = filesWithCoverage.some((f) => { + const base = path.basename(f, '.test.ts'); + return base.includes(skill) || base.includes(skill.replace(/-review$/, '')); + }); + expect(hasDedicatedTest, `skill "${skill}" has interactive:true but no canUseTool-based E2E test`).toBe(true); + } + }); +}); diff --git a/test/gen-skill-docs.test.ts b/test/gen-skill-docs.test.ts index 7e262317..60dc8ad9 100644 --- a/test/gen-skill-docs.test.ts +++ b/test/gen-skill-docs.test.ts @@ -2774,3 +2774,93 @@ describe('voice-triggers processing', () => { expect(frontmatter).not.toContain('voice-triggers:'); }); }); + +describe('plan-mode handshake (interactive: true) resolver', () => { + const INTERACTIVE_SKILLS = [ + 'plan-ceo-review', + 'plan-eng-review', + 'plan-design-review', + 'plan-devex-review', + ]; + + const HANDSHAKE_MARKER = '## Plan Mode Handshake'; + + test.each(INTERACTIVE_SKILLS)( + '%s (Claude host) SKILL.md contains the handshake section', + (skill) => { + const content = fs.readFileSync(path.join(ROOT, skill, 'SKILL.md'), 'utf-8'); + expect(content).toContain(HANDSHAKE_MARKER); + expect(content).toContain( + 'Plan mode is active. The user indicated that they do not want you to execute yet', + ); + }, + ); + + test('handshake is absent from non-interactive Claude skills', () => { + const nonInteractive = ['ship', 'review', 'qa', 'office-hours', 'codex', 'retro', 'cso']; + for (const skill of nonInteractive) { + const content = fs.readFileSync(path.join(ROOT, skill, 'SKILL.md'), 'utf-8'); + expect(content).not.toContain(HANDSHAKE_MARKER); + } + }); + + test('handshake is absent from non-Claude host outputs when present on disk', () => { + // Non-Claude hosts render to hostSubdirs (.agents/, .openclaw/, etc). The + // handshake resolver returns '' when ctx.host !== 'claude', so those + // outputs must not contain the marker. 
The current gen-skill-docs layout + // prefixes skill names as `gstack-<skill>` under the hostSubdir; older + // layouts used `gstack/<skill>` (no prefix). Only stable-present paths + // are asserted — older ones may or may not exist per install history. + const candidateOutputs = [ + // Current prefixed layout + path.join(ROOT, '.agents', 'skills', 'gstack-plan-ceo-review', 'SKILL.md'), + path.join(ROOT, '.openclaw', 'skills', 'gstack-plan-ceo-review', 'SKILL.md'), + path.join(ROOT, '.opencode', 'skills', 'gstack-plan-ceo-review', 'SKILL.md'), + path.join(ROOT, '.factory', 'skills', 'gstack-plan-ceo-review', 'SKILL.md'), + path.join(ROOT, '.hermes', 'skills', 'gstack-plan-ceo-review', 'SKILL.md'), + ]; + let checked = 0; + for (const out of candidateOutputs) { + if (fs.existsSync(out)) { + const content = fs.readFileSync(out, 'utf-8'); + expect(content).not.toContain(HANDSHAKE_MARKER); + checked++; + } + } + // At least one non-Claude host's output should exist after a full gen + // run; this test is meaningful only if we checked something. If no + // non-Claude outputs exist locally, the cross-host guarantee is still + // enforced by the resolver's ctx.host check; this test is belt-and- + // suspenders and becomes a no-op rather than a false positive. 
+ if (checked === 0) { + // eslint-disable-next-line no-console + console.warn( + 'plan-mode handshake: no non-Claude host outputs found for cross-host absence check — ' + + 'run `bun run gen:skill-docs --host all` to populate', + ); + } + }); + + test('0C-bis STOP block present in plan-ceo-review/SKILL.md', () => { + const content = fs.readFileSync(path.join(ROOT, 'plan-ceo-review', 'SKILL.md'), 'utf-8'); + const presentIdx = content.indexOf('Present these approach options via AskUserQuestion'); + const preludeIdx = content.indexOf('### 0D-prelude'); + expect(presentIdx).toBeGreaterThan(0); + expect(preludeIdx).toBeGreaterThan(presentIdx); + const between = content.slice(presentIdx, preludeIdx); + expect(between).toContain('**STOP.**'); + expect(between).toContain('Do NOT proceed to Step 0D or 0F until the user responds to 0C-bis'); + }); + + test('handshake resolver is wired BEFORE generateUpgradeCheck in preamble', () => { + const content = fs.readFileSync( + path.join(ROOT, 'plan-ceo-review', 'SKILL.md'), + 'utf-8', + ); + const handshakeIdx = content.indexOf(HANDSHAKE_MARKER); + const upgradeIdx = content.indexOf('UPGRADE_AVAILABLE'); + expect(handshakeIdx).toBeGreaterThan(0); + expect(upgradeIdx).toBeGreaterThan(0); + expect(handshakeIdx).toBeLessThan(upgradeIdx); + }); +}); diff --git a/test/helpers/plan-mode-handshake-helpers.ts b/test/helpers/plan-mode-handshake-helpers.ts new file mode 100644 index 00000000..581932be --- /dev/null +++ b/test/helpers/plan-mode-handshake-helpers.ts @@ -0,0 +1,166 @@ +/** + * Shared helpers for plan-mode handshake E2E tests. + * + * Four sibling test files (plan-ceo, plan-eng, plan-design, plan-devex) exercise + * the identical handshake contract against different skills. This helper + * centralizes the canUseTool interceptor and the assertion shape so the four + * test files are thin wiring (~40 LOC each) and can't drift out of sync. 
+ * + * See scripts/resolvers/preamble/generate-plan-mode-handshake.ts for the + * handshake prose that the tests below assert against. + */ + +import { expect } from 'bun:test'; +import * as fs from 'fs'; +import * as path from 'path'; +import * as os from 'os'; +import { execSync } from 'child_process'; +import { + runAgentSdkTest, + passThroughNonAskUserQuestion, + resolveClaudeBinary, + type AgentSdkResult, +} from './agent-sdk-runner'; + +/** Distinctive phrase matching what Claude Code's harness actually injects. */ +export const PLAN_MODE_REMINDER = + 'Plan mode is active. The user indicated that they do not want you to execute yet'; + +export interface HandshakeCaptureResult { + sdkResult: AgentSdkResult; + /** Each AskUserQuestion that fired, with its input payload. */ + askUserQuestions: Array<{ input: Record<string, unknown>; orderIndex: number }>; + /** Tool-use events in the order they fired (names only). */ + toolOrder: string[]; + /** Whether any Write or Edit tool fired BEFORE the first AskUserQuestion. */ + writeOrEditBeforeAsk: boolean; +} + +/** + * Run a skill via the Agent SDK with canUseTool intercepting every tool use. + * Inject the plan-mode distinctive phrase into the system prompt and auto- + * answer the handshake with the given answerLabel ("Exit" or "Cancel"). Return + * the captured events for assertion. + */ +export async function runPlanModeHandshakeTest(opts: { + /** Skill name, e.g. 'plan-ceo-review'. */ + skillName: string; + /** "Exit" to pick option A (exit-and-rerun) or "Cancel" for option C. */ + answerLabel: 'Exit' | 'Cancel'; + /** If true, DO NOT inject the reminder — used by the no-op regression test. */ + omitPlanModeReminder?: boolean; + /** Max turns for the SDK call (default 4 — handshake + exit should fit easily). 
*/ + maxTurns?: number; +}): Promise<HandshakeCaptureResult> { + const { skillName, answerLabel, omitPlanModeReminder, maxTurns } = opts; + + const askUserQuestions: HandshakeCaptureResult['askUserQuestions'] = []; + const toolOrder: string[] = []; + let toolIndex = 0; + let firstAskIndex = -1; + + const workingDir = fs.mkdtempSync( + path.join(os.tmpdir(), `plan-mode-handshake-${skillName}-`), + ); + + // The SDK requires AskUserQuestion to be in the allowed tools list. The + // harness auto-adds it when canUseTool is supplied, but we also want Read + // so the skill can load its own file if it tries to. + const binary = resolveClaudeBinary(); + + try { + // Inject the distinctive phrase into the system prompt by appending it to + // the default Claude Code preset. Claude Code's real plan mode uses an + // injected system-reminder; in SDK tests we use systemPrompt.append which + // the model treats as equally authoritative. + const reminderAppend = omitPlanModeReminder + ? '' + : `\n\n\n${PLAN_MODE_REMINDER}. This supercedes any other instructions you have received.\n\n`; + + const sdkResult = await runAgentSdkTest({ + systemPrompt: { + type: 'preset', + preset: 'claude_code', + append: reminderAppend, + }, + userPrompt: `Read the skill file at ${path.resolve( + import.meta.dir, + '..', + '..', + skillName, + 'SKILL.md', + )} and follow its instructions. There is no real plan to review — just start the skill and respond to any AskUserQuestion that fires.`, + workingDirectory: workingDir, + maxTurns: maxTurns ?? 4, + allowedTools: ['Read', 'Grep', 'Glob', 'Bash'], + ...(binary ? { pathToClaudeCodeExecutable: binary } : {}), + canUseTool: async (toolName, input) => { + toolOrder.push(toolName); + if (toolName === 'AskUserQuestion') { + if (firstAskIndex === -1) firstAskIndex = toolIndex; + askUserQuestions.push({ input, orderIndex: toolIndex }); + toolIndex++; + // Auto-answer with the label the test specified. 
+ const q = (input.questions as Array<{ question: string; options: Array<{ label: string }> }>)[0]; + const matched = q.options.find((o) => o.label.includes(answerLabel)); + const answer = matched ? matched.label : q.options[0]!.label; + return { + behavior: 'allow', + updatedInput: { + questions: input.questions, + answers: { [q.question]: answer }, + }, + }; + } + toolIndex++; + return passThroughNonAskUserQuestion(toolName, input); + }, + }); + + const writeOrEditBeforeAsk = + firstAskIndex > 0 && + toolOrder.slice(0, firstAskIndex).some((t) => t === 'Write' || t === 'Edit'); + + return { sdkResult, askUserQuestions, toolOrder, writeOrEditBeforeAsk }; + } finally { + try { + fs.rmSync(workingDir, { recursive: true, force: true }); + } catch { /* ignore cleanup errors */ } + } +} + +/** Assert the shape of a fired handshake AskUserQuestion. */ +export function assertHandshakeShape( + aq: { input: Record<string, unknown> }, +): void { + const questions = aq.input.questions as Array<{ + question: string; + options: Array<{ label: string }>; + }>; + expect(questions).toBeDefined(); + expect(questions.length).toBe(1); + const q = questions[0]!; + // D8 dropped Option B; handshake has exactly 2 options. + expect(q.options.length).toBe(2); + const labels = q.options.map((o) => o.label); + expect(labels.some((l) => l.includes('Exit'))).toBe(true); + expect(labels.some((l) => l.includes('Cancel'))).toBe(true); +} + +/** Read the skill-usage.jsonl log and return handshake entries. 
*/ +export function readHandshakeLog(): Array<Record<string, unknown>> { + const logPath = path.join(os.homedir(), '.gstack', 'analytics', 'skill-usage.jsonl'); + if (!fs.existsSync(logPath)) return []; + const lines = fs.readFileSync(logPath, 'utf-8').split('\n').filter(Boolean); + return lines + .map((line) => { + try { + return JSON.parse(line); + } catch { + return null; + } + }) + .filter((x): x is Record<string, unknown> => x !== null && x.event === 'plan_mode_handshake'); +} + +export { execSync }; diff --git a/test/helpers/touchfiles.ts b/test/helpers/touchfiles.ts index 4872f5de..acde310d 100644 --- a/test/helpers/touchfiles.ts +++ b/test/helpers/touchfiles.ts @@ -82,6 +82,17 @@ export const E2E_TOUCHFILES: Record = { 'plan-eng-review-artifact': ['plan-eng-review/**'], 'plan-review-report': ['plan-eng-review/**', 'scripts/gen-skill-docs.ts'], + // Plan-mode handshake (v1.10.2.0) — gate-tier safety regression tests. + // Each fires when any of: the interactive skill's template, the resolver, + // preamble composition, the Agent SDK harness, the question registry, or + // the one-way-door classifier changes. 
+ 'plan-ceo-review-plan-mode': ['plan-ceo-review/**', 'scripts/resolvers/preamble/generate-plan-mode-handshake.ts', 'scripts/resolvers/preamble.ts', 'scripts/question-registry.ts', 'scripts/one-way-doors.ts', 'test/helpers/agent-sdk-runner.ts'], + 'plan-eng-review-plan-mode': ['plan-eng-review/**', 'scripts/resolvers/preamble/generate-plan-mode-handshake.ts', 'scripts/resolvers/preamble.ts', 'scripts/question-registry.ts', 'scripts/one-way-doors.ts', 'test/helpers/agent-sdk-runner.ts'], + 'plan-design-review-plan-mode-handshake': ['plan-design-review/**', 'scripts/resolvers/preamble/generate-plan-mode-handshake.ts', 'scripts/resolvers/preamble.ts', 'scripts/question-registry.ts', 'scripts/one-way-doors.ts', 'test/helpers/agent-sdk-runner.ts'], + 'plan-devex-review-plan-mode': ['plan-devex-review/**', 'scripts/resolvers/preamble/generate-plan-mode-handshake.ts', 'scripts/resolvers/preamble.ts', 'scripts/question-registry.ts', 'scripts/one-way-doors.ts', 'test/helpers/agent-sdk-runner.ts'], + 'plan-mode-no-op': ['plan-ceo-review/**', 'scripts/resolvers/preamble/generate-plan-mode-handshake.ts', 'scripts/resolvers/preamble.ts', 'test/helpers/agent-sdk-runner.ts'], + 'e2e-harness-audit': ['plan-ceo-review/**', 'plan-eng-review/**', 'plan-design-review/**', 'plan-devex-review/**', 'scripts/resolvers/preamble/generate-plan-mode-handshake.ts', 'test/helpers/agent-sdk-runner.ts'], + // AskUserQuestion format regression (RECOMMENDATION + Completeness: N/10) // Fires when either template OR the two preamble resolvers change. 
'plan-ceo-review-format-mode': ['plan-ceo-review/**', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble/generate-completeness-section.ts', 'scripts/resolvers/preamble.ts', 'model-overlays/opus-4-7.md'], @@ -317,6 +328,14 @@ export const E2E_TIERS: Record = { 'plan-eng-coverage-audit': 'gate', 'plan-review-report': 'gate', + // Plan-mode handshake — deterministic safety regression, gate-tier + 'plan-ceo-review-plan-mode': 'gate', + 'plan-eng-review-plan-mode': 'gate', + 'plan-design-review-plan-mode-handshake': 'gate', + 'plan-devex-review-plan-mode': 'gate', + 'plan-mode-no-op': 'gate', + 'e2e-harness-audit': 'gate', + // AskUserQuestion format regression — periodic (Opus 4.7 non-deterministic benchmark) 'plan-ceo-review-format-mode': 'periodic', 'plan-ceo-review-format-approach': 'periodic', diff --git a/test/skill-e2e-plan-ceo-plan-mode.test.ts b/test/skill-e2e-plan-ceo-plan-mode.test.ts new file mode 100644 index 00000000..858e07eb --- /dev/null +++ b/test/skill-e2e-plan-ceo-plan-mode.test.ts @@ -0,0 +1,40 @@ +/** + * plan-ceo-review plan-mode handshake E2E (gate tier, paid). + * + * Asserts: when /plan-ceo-review is invoked with the plan-mode distinctive + * phrase in the system reminder, the skill fires AskUserQuestion FIRST + * (before any Write or Edit), the question has exactly 2 options (A exit, + * C cancel), picking "Exit" leads to an orderly exit with no plan file + * written. + * + * Cost: ~$0.50–$1.00 per run. Gated: EVALS=1 EVALS_TIER=gate. + * Depends on: scripts/resolvers/preamble/generate-plan-mode-handshake.ts, + * test/helpers/agent-sdk-runner.ts (canUseTool extension). + */ + +import { describe, test, expect } from 'bun:test'; +import { + runPlanModeHandshakeTest, + assertHandshakeShape, +} from './helpers/plan-mode-handshake-helpers'; + +const shouldRun = !!process.env.EVALS && process.env.EVALS_TIER === 'gate'; +const describeE2E = shouldRun ? 
describe : describe.skip; + +describeE2E('plan-ceo-review plan-mode handshake (gate)', () => { + test('handshake fires before any Write/Edit when plan mode is detected', async () => { + const result = await runPlanModeHandshakeTest({ + skillName: 'plan-ceo-review', + answerLabel: 'Exit', + }); + + // Handshake must have fired at least once. + expect(result.askUserQuestions.length).toBeGreaterThanOrEqual(1); + // Critically: no Write or Edit fired before the first AskUserQuestion. + // This is the bug v1.10.2.0 fixes — plan mode used to allow silent + // plan-file writes without any interactive gate. + expect(result.writeOrEditBeforeAsk).toBe(false); + // Handshake shape: 2 options (Exit/Cancel), Option B dropped per D8. + assertHandshakeShape(result.askUserQuestions[0]!); + }, 120_000); +}); diff --git a/test/skill-e2e-plan-design-plan-mode.test.ts b/test/skill-e2e-plan-design-plan-mode.test.ts new file mode 100644 index 00000000..1fb7aaf5 --- /dev/null +++ b/test/skill-e2e-plan-design-plan-mode.test.ts @@ -0,0 +1,28 @@ +/** + * plan-design-review plan-mode handshake E2E (gate tier, paid). + * + * See test/skill-e2e-plan-ceo-plan-mode.test.ts for the shared assertion + * contract. This file exercises the same handshake against /plan-design-review. + */ + +import { describe, test, expect } from 'bun:test'; +import { + runPlanModeHandshakeTest, + assertHandshakeShape, +} from './helpers/plan-mode-handshake-helpers'; + +const shouldRun = !!process.env.EVALS && process.env.EVALS_TIER === 'gate'; +const describeE2E = shouldRun ? 
describe : describe.skip; + +describeE2E('plan-design-review plan-mode handshake (gate)', () => { + test('handshake fires before any Write/Edit when plan mode is detected', async () => { + const result = await runPlanModeHandshakeTest({ + skillName: 'plan-design-review', + answerLabel: 'Cancel', // exercise the C-cancel branch instead of A-exit + }); + + expect(result.askUserQuestions.length).toBeGreaterThanOrEqual(1); + expect(result.writeOrEditBeforeAsk).toBe(false); + assertHandshakeShape(result.askUserQuestions[0]!); + }, 120_000); +}); diff --git a/test/skill-e2e-plan-devex-plan-mode.test.ts b/test/skill-e2e-plan-devex-plan-mode.test.ts new file mode 100644 index 00000000..2ede50e2 --- /dev/null +++ b/test/skill-e2e-plan-devex-plan-mode.test.ts @@ -0,0 +1,28 @@ +/** + * plan-devex-review plan-mode handshake E2E (gate tier, paid). + * + * See test/skill-e2e-plan-ceo-plan-mode.test.ts for the shared assertion + * contract. This file exercises the same handshake against /plan-devex-review. + */ + +import { describe, test, expect } from 'bun:test'; +import { + runPlanModeHandshakeTest, + assertHandshakeShape, +} from './helpers/plan-mode-handshake-helpers'; + +const shouldRun = !!process.env.EVALS && process.env.EVALS_TIER === 'gate'; +const describeE2E = shouldRun ? 
describe : describe.skip; + +describeE2E('plan-devex-review plan-mode handshake (gate)', () => { + test('handshake fires before any Write/Edit when plan mode is detected', async () => { + const result = await runPlanModeHandshakeTest({ + skillName: 'plan-devex-review', + answerLabel: 'Exit', + }); + + expect(result.askUserQuestions.length).toBeGreaterThanOrEqual(1); + expect(result.writeOrEditBeforeAsk).toBe(false); + assertHandshakeShape(result.askUserQuestions[0]!); + }, 120_000); +}); diff --git a/test/skill-e2e-plan-eng-plan-mode.test.ts b/test/skill-e2e-plan-eng-plan-mode.test.ts new file mode 100644 index 00000000..16da9d7a --- /dev/null +++ b/test/skill-e2e-plan-eng-plan-mode.test.ts @@ -0,0 +1,28 @@ +/** + * plan-eng-review plan-mode handshake E2E (gate tier, paid). + * + * See test/skill-e2e-plan-ceo-plan-mode.test.ts for the shared assertion + * contract. This file exercises the same handshake against /plan-eng-review. + */ + +import { describe, test, expect } from 'bun:test'; +import { + runPlanModeHandshakeTest, + assertHandshakeShape, +} from './helpers/plan-mode-handshake-helpers'; + +const shouldRun = !!process.env.EVALS && process.env.EVALS_TIER === 'gate'; +const describeE2E = shouldRun ? describe : describe.skip; + +describeE2E('plan-eng-review plan-mode handshake (gate)', () => { + test('handshake fires before any Write/Edit when plan mode is detected', async () => { + const result = await runPlanModeHandshakeTest({ + skillName: 'plan-eng-review', + answerLabel: 'Exit', + }); + + expect(result.askUserQuestions.length).toBeGreaterThanOrEqual(1); + expect(result.writeOrEditBeforeAsk).toBe(false); + assertHandshakeShape(result.askUserQuestions[0]!); + }, 120_000); +}); diff --git a/test/skill-e2e-plan-mode-no-op.test.ts b/test/skill-e2e-plan-mode-no-op.test.ts new file mode 100644 index 00000000..e222fbff --- /dev/null +++ b/test/skill-e2e-plan-mode-no-op.test.ts @@ -0,0 +1,43 @@ +/** + * Plan-mode handshake negative regression (gate tier, paid). 
+ * + * Asserts: when /plan-ceo-review is invoked WITHOUT the plan-mode distinctive + * phrase in the system reminder, the handshake does NOT fire. The skill + * should proceed to its normal Step 0 flow. This is the REGRESSION RULE + * guardrail — the handshake must be a no-op outside plan mode or it breaks + * every existing interactive-review session. + * + * Cost: ~$0.50 per run. Gated: EVALS=1 EVALS_TIER=gate. + */ + +import { describe, test, expect } from 'bun:test'; +import { + runPlanModeHandshakeTest, + PLAN_MODE_REMINDER, +} from './helpers/plan-mode-handshake-helpers'; + +const shouldRun = !!process.env.EVALS && process.env.EVALS_TIER === 'gate'; +const describeE2E = shouldRun ? describe : describe.skip; + +describeE2E('plan-mode handshake no-op outside plan mode (gate regression)', () => { + test('handshake does NOT fire when distinctive phrase is absent', async () => { + const result = await runPlanModeHandshakeTest({ + skillName: 'plan-ceo-review', + answerLabel: 'Exit', // ignored — handshake should never fire + omitPlanModeReminder: true, + maxTurns: 3, // enough to see Step 0 start, but bounded + }); + + // The handshake AskUserQuestion should NOT have fired during Step 0 entry. + // Other AskUserQuestions may fire later in the skill (e.g., Step 0C-bis), + // but they will NOT have the handshake's question text. + for (const aq of result.askUserQuestions) { + const questions = aq.input.questions as Array<{ question: string }>; + for (const q of questions) { + // The handshake's question mentions the distinctive phrase in its + // prose; a non-handshake AskUserQuestion won't. 
+ expect(q.question).not.toContain(PLAN_MODE_REMINDER); + } + } + }, 120_000); +}); diff --git a/test/touchfiles.test.ts b/test/touchfiles.test.ts index 5daae1c3..6ae0718e 100644 --- a/test/touchfiles.test.ts +++ b/test/touchfiles.test.ts @@ -85,8 +85,16 @@ describe('selectTests', () => { expect(result.selected).toContain('codex-offered-ceo-review'); expect(result.selected).toContain('plan-ceo-review-format-mode'); expect(result.selected).toContain('plan-ceo-review-format-approach'); - expect(result.selected.length).toBe(8); - expect(result.skipped.length).toBe(Object.keys(E2E_TOUCHFILES).length - 8); + // v1.10.2.0 plan-mode handshake entries also depend on plan-ceo-review/** + expect(result.selected).toContain('plan-ceo-review-plan-mode'); + expect(result.selected).toContain('plan-mode-no-op'); + expect(result.selected).toContain('e2e-harness-audit'); + expect(result.selected).toContain('plan-ceo-review-prosons-cadence'); + expect(result.selected).toContain('plan-review-prosons-format'); + expect(result.selected).toContain('plan-review-prosons-hardstop-neg'); + expect(result.selected).toContain('plan-review-prosons-neutral-neg'); + expect(result.selected.length).toBe(15); + expect(result.skipped.length).toBe(Object.keys(E2E_TOUCHFILES).length - 15); }); test('global touchfile triggers ALL tests', () => {