diff --git a/test/e2e-harness-audit.test.ts b/test/e2e-harness-audit.test.ts index ce709cae..bd3ecf46 100644 --- a/test/e2e-harness-audit.test.ts +++ b/test/e2e-harness-audit.test.ts @@ -1,8 +1,11 @@ /** * E2E harness audit — every skill with `interactive: true` in its frontmatter - * must have at least one test file that uses `canUseTool` via the extended - * agent-sdk-runner. This prevents future drift where a skill opts into the - * handshake without adding real coverage. + * must have at least one test file that drives a real interactive session. + * Two valid coverage paths: + * 1. `canUseTool` via the agent-sdk-runner (legacy SDK-based path) + * 2. `runPlanSkillObservation` via the claude-pty-runner (real-PTY path + * added when the SDK harness was found unable to observe plan mode's + * native confirmation UI — see test/helpers/claude-pty-runner.ts) * * Runs as a free unit test (no API calls). Pure filesystem scan. */ @@ -76,14 +79,16 @@ function findInteractiveSkills(): string[] { } /** - * Scan a test file's contents for the canUseTool-via-harness pattern. - * Either: direct canUseTool usage in runAgentSdkTest, or usage of the - * shared plan-mode-helpers that wrap it. + * Scan a test file's contents for any of the supported real-interactive + * coverage patterns. Either: direct canUseTool usage in runAgentSdkTest, + * the legacy plan-mode-helpers wrapper, or the new real-PTY observation + * helper. */ function hasCanUseToolCoverage(testFile: string): boolean { const content = fs.readFileSync(testFile, 'utf-8'); if (content.includes('canUseTool')) return true; if (content.includes('runPlanModeSkillTest')) return true; + if (content.includes('runPlanSkillObservation')) return true; return false; } diff --git a/test/helpers/touchfiles.ts b/test/helpers/touchfiles.ts index abc76023..fb10f58a 100644 --- a/test/helpers/touchfiles.ts +++ b/test/helpers/touchfiles.ts @@ -84,14 +84,15 @@ export const E2E_TOUCHFILES: Record = { // Plan-mode smoke tests — gate-tier safety regression tests. Each fires when // any of: the interactive skill's template, the plan-mode resolver - // (completion-status now owns generatePlanModeInfo), preamble composition, - // the Agent SDK harness, or the shared plan-mode-helpers change. - 'plan-ceo-review-plan-mode': ['plan-ceo-review/**', 'scripts/resolvers/preamble/generate-completion-status.ts', 'scripts/resolvers/preamble.ts', 'test/helpers/agent-sdk-runner.ts', 'test/helpers/plan-mode-helpers.ts'], - 'plan-eng-review-plan-mode': ['plan-eng-review/**', 'scripts/resolvers/preamble/generate-completion-status.ts', 'scripts/resolvers/preamble.ts', 'test/helpers/agent-sdk-runner.ts', 'test/helpers/plan-mode-helpers.ts'], - 'plan-design-review-plan-mode': ['plan-design-review/**', 'scripts/resolvers/preamble/generate-completion-status.ts', 'scripts/resolvers/preamble.ts', 'test/helpers/agent-sdk-runner.ts', 'test/helpers/plan-mode-helpers.ts'], - 'plan-devex-review-plan-mode': ['plan-devex-review/**', 'scripts/resolvers/preamble/generate-completion-status.ts', 'scripts/resolvers/preamble.ts', 'test/helpers/agent-sdk-runner.ts', 'test/helpers/plan-mode-helpers.ts'], - 'plan-mode-no-op': ['plan-ceo-review/**', 'scripts/resolvers/preamble/generate-completion-status.ts', 'scripts/resolvers/preamble.ts', 'test/helpers/agent-sdk-runner.ts', 'test/helpers/plan-mode-helpers.ts'], - 'e2e-harness-audit': ['plan-ceo-review/**', 'plan-eng-review/**', 'plan-design-review/**', 'plan-devex-review/**', 'scripts/resolvers/preamble/generate-completion-status.ts', 'test/helpers/agent-sdk-runner.ts', 'test/helpers/plan-mode-helpers.ts'], + // (completion-status owns generatePlanModeInfo), preamble composition, or + // the real-PTY runner (which the tests now use instead of the SDK harness) + // change. + 'plan-ceo-review-plan-mode': ['plan-ceo-review/**', 'scripts/resolvers/preamble/generate-completion-status.ts', 'scripts/resolvers/preamble.ts', 'test/helpers/claude-pty-runner.ts'], + 'plan-eng-review-plan-mode': ['plan-eng-review/**', 'scripts/resolvers/preamble/generate-completion-status.ts', 'scripts/resolvers/preamble.ts', 'test/helpers/claude-pty-runner.ts'], + 'plan-design-review-plan-mode': ['plan-design-review/**', 'scripts/resolvers/preamble/generate-completion-status.ts', 'scripts/resolvers/preamble.ts', 'test/helpers/claude-pty-runner.ts'], + 'plan-devex-review-plan-mode': ['plan-devex-review/**', 'scripts/resolvers/preamble/generate-completion-status.ts', 'scripts/resolvers/preamble.ts', 'test/helpers/claude-pty-runner.ts'], + 'plan-mode-no-op': ['plan-ceo-review/**', 'scripts/resolvers/preamble/generate-completion-status.ts', 'scripts/resolvers/preamble.ts', 'test/helpers/claude-pty-runner.ts'], + 'e2e-harness-audit': ['plan-ceo-review/**', 'plan-eng-review/**', 'plan-design-review/**', 'plan-devex-review/**', 'scripts/resolvers/preamble/generate-completion-status.ts', 'test/helpers/agent-sdk-runner.ts', 'test/helpers/claude-pty-runner.ts'], 'brain-privacy-gate': ['scripts/resolvers/preamble/generate-brain-sync-block.ts', 'scripts/resolvers/preamble.ts', 'bin/gstack-brain-sync', 'bin/gstack-brain-init', 'bin/gstack-config', 'test/helpers/agent-sdk-runner.ts'], // AskUserQuestion format regression (RECOMMENDATION + Completeness: N/10) diff --git a/test/skill-e2e-plan-ceo-plan-mode.test.ts b/test/skill-e2e-plan-ceo-plan-mode.test.ts index 8914967e..8bb6a95b 100644 --- a/test/skill-e2e-plan-ceo-plan-mode.test.ts +++ b/test/skill-e2e-plan-ceo-plan-mode.test.ts @@ -1,38 +1,48 @@ /** - * plan-ceo-review plan-mode smoke test (gate tier, paid). + * plan-ceo-review plan-mode smoke (gate, paid, real-PTY). * - * Asserts: when /plan-ceo-review is invoked with the plan-mode distinctive - * phrase in the system reminder, the skill goes STRAIGHT to its Step 0 - * scope-mode AskUserQuestion. Specifically: - * 1. First AskUserQuestion is NOT the old vestigial handshake - * (A=exit-and-rerun / C=cancel). - * 2. No Write or Edit tool fires before the first AskUserQuestion - * (catches silent plan-file-write bypass). - * 3. ExitPlanMode does not fire before the first AskUserQuestion. + * Asserts: when /plan-ceo-review is invoked in plan mode, the skill reaches + * a terminal outcome that is either: + * - 'asked' — skill emitted its Step 0 numbered prompt (scope mode + * selection, or the routing-injection prompt that runs + * before Step 0) + * - 'plan_ready' — skill ran end-to-end and surfaced claude's native + * "Ready to execute" confirmation * - * Cost: ~$0.50–$1.00 per run. Gated: EVALS=1 EVALS_TIER=gate. + * FAIL conditions: silent Write/Edit before any prompt, claude crash, + * timeout. + * + * Replaces the SDK-based test that never worked: the SDK's canUseTool + * interceptor on AskUserQuestion never fires in plan mode because plan + * mode renders its native confirmation as TTY UI, not via the + * AskUserQuestion tool. The real PTY harness observes the rendered + * terminal output directly. + * + * See test/helpers/claude-pty-runner.ts for runner internals. */ import { describe, test, expect } from 'bun:test'; -import { - runPlanModeSkillTest, - assertNotHandshakeShape, -} from './helpers/plan-mode-helpers'; +import { runPlanSkillObservation } from './helpers/claude-pty-runner'; const shouldRun = !!process.env.EVALS && process.env.EVALS_TIER === 'gate'; const describeE2E = shouldRun ? describe : describe.skip; describeE2E('plan-ceo-review plan-mode smoke (gate)', () => { - test('goes straight to scope-mode question, no handshake, no silent writes', async () => { - const result = await runPlanModeSkillTest({ + test('reaches a terminal outcome (asked or plan_ready) without silent writes', async () => { + const obs = await runPlanSkillObservation({ skillName: 'plan-ceo-review', - // Step 0 asks for review mode; HOLD is the cheapest, most-neutral answer. - firstAnswerSubstring: 'HOLD', + inPlanMode: true, + timeoutMs: 300_000, }); - expect(result.askUserQuestions.length).toBeGreaterThanOrEqual(1); - assertNotHandshakeShape(result.askUserQuestions[0]!); - expect(result.writeOrEditBeforeAsk).toBe(false); - expect(result.exitPlanModeBeforeAsk).toBe(false); - }, 120_000); + if (obs.outcome === 'silent_write' || obs.outcome === 'exited' || obs.outcome === 'timeout') { + throw new Error( + `plan-ceo-review plan-mode smoke FAILED: outcome=${obs.outcome}\n` + + `summary: ${obs.summary}\n` + + `elapsed: ${obs.elapsedMs}ms\n` + + `--- evidence (last 2KB visible) ---\n${obs.evidence}`, + ); + } + expect(['asked', 'plan_ready']).toContain(obs.outcome); + }, 360_000); }); diff --git a/test/skill-e2e-plan-design-plan-mode.test.ts b/test/skill-e2e-plan-design-plan-mode.test.ts index 98d050e1..6fd7881a 100644 --- a/test/skill-e2e-plan-design-plan-mode.test.ts +++ b/test/skill-e2e-plan-design-plan-mode.test.ts @@ -1,31 +1,36 @@ /** - * plan-design-review plan-mode smoke test (gate tier, paid). + * plan-design-review plan-mode smoke (gate, paid, real-PTY). * * See test/skill-e2e-plan-ceo-plan-mode.test.ts for the shared assertion - * contract. Exercises the same assertions against /plan-design-review. + * contract. Exercises the same contract against /plan-design-review. + * + * Note: on no-UI-scope branches plan-design-review legitimately short- + * circuits to plan_ready without firing AskUserQuestion. Both 'asked' and + * 'plan_ready' are valid pass outcomes. */ import { describe, test, expect } from 'bun:test'; -import { - runPlanModeSkillTest, - assertNotHandshakeShape, -} from './helpers/plan-mode-helpers'; +import { runPlanSkillObservation } from './helpers/claude-pty-runner'; const shouldRun = !!process.env.EVALS && process.env.EVALS_TIER === 'gate'; const describeE2E = shouldRun ? describe : describe.skip; describeE2E('plan-design-review plan-mode smoke (gate)', () => { - test('goes straight to first design question, no handshake, no silent writes', async () => { - const result = await runPlanModeSkillTest({ + test('reaches a terminal outcome (asked or plan_ready) without silent writes', async () => { + const obs = await runPlanSkillObservation({ skillName: 'plan-design-review', - // First question for design review varies; pick any reasonable match. - // The substring match falls back to the first option if no match. - firstAnswerSubstring: '7', + inPlanMode: true, + timeoutMs: 300_000, }); - expect(result.askUserQuestions.length).toBeGreaterThanOrEqual(1); - assertNotHandshakeShape(result.askUserQuestions[0]!); - expect(result.writeOrEditBeforeAsk).toBe(false); - expect(result.exitPlanModeBeforeAsk).toBe(false); - }, 120_000); + if (obs.outcome === 'silent_write' || obs.outcome === 'exited' || obs.outcome === 'timeout') { + throw new Error( + `plan-design-review plan-mode smoke FAILED: outcome=${obs.outcome}\n` + + `summary: ${obs.summary}\n` + + `elapsed: ${obs.elapsedMs}ms\n` + + `--- evidence (last 2KB visible) ---\n${obs.evidence}`, + ); + } + expect(['asked', 'plan_ready']).toContain(obs.outcome); + }, 360_000); }); diff --git a/test/skill-e2e-plan-devex-plan-mode.test.ts b/test/skill-e2e-plan-devex-plan-mode.test.ts index 64bc447e..05f1abb3 100644 --- a/test/skill-e2e-plan-devex-plan-mode.test.ts +++ b/test/skill-e2e-plan-devex-plan-mode.test.ts @@ -1,30 +1,32 @@ /** - * plan-devex-review plan-mode smoke test (gate tier, paid). + * plan-devex-review plan-mode smoke (gate, paid, real-PTY). * * See test/skill-e2e-plan-ceo-plan-mode.test.ts for the shared assertion - * contract. Exercises the same assertions against /plan-devex-review. + * contract. Exercises the same contract against /plan-devex-review. */ import { describe, test, expect } from 'bun:test'; -import { - runPlanModeSkillTest, - assertNotHandshakeShape, -} from './helpers/plan-mode-helpers'; +import { runPlanSkillObservation } from './helpers/claude-pty-runner'; const shouldRun = !!process.env.EVALS && process.env.EVALS_TIER === 'gate'; const describeE2E = shouldRun ? describe : describe.skip; describeE2E('plan-devex-review plan-mode smoke (gate)', () => { - test('goes straight to DX-mode question, no handshake, no silent writes', async () => { - const result = await runPlanModeSkillTest({ + test('reaches a terminal outcome (asked or plan_ready) without silent writes', async () => { + const obs = await runPlanSkillObservation({ skillName: 'plan-devex-review', - // Step 0 asks for DX review mode; TRIAGE is the lightest-weight mode. - firstAnswerSubstring: 'TRIAGE', + inPlanMode: true, + timeoutMs: 300_000, }); - expect(result.askUserQuestions.length).toBeGreaterThanOrEqual(1); - assertNotHandshakeShape(result.askUserQuestions[0]!); - expect(result.writeOrEditBeforeAsk).toBe(false); - expect(result.exitPlanModeBeforeAsk).toBe(false); - }, 120_000); + if (obs.outcome === 'silent_write' || obs.outcome === 'exited' || obs.outcome === 'timeout') { + throw new Error( + `plan-devex-review plan-mode smoke FAILED: outcome=${obs.outcome}\n` + + `summary: ${obs.summary}\n` + + `elapsed: ${obs.elapsedMs}ms\n` + + `--- evidence (last 2KB visible) ---\n${obs.evidence}`, + ); + } + expect(['asked', 'plan_ready']).toContain(obs.outcome); + }, 360_000); }); diff --git a/test/skill-e2e-plan-eng-plan-mode.test.ts b/test/skill-e2e-plan-eng-plan-mode.test.ts index c57bbee4..93d55ece 100644 --- a/test/skill-e2e-plan-eng-plan-mode.test.ts +++ b/test/skill-e2e-plan-eng-plan-mode.test.ts @@ -1,29 +1,32 @@ /** - * plan-eng-review plan-mode smoke test (gate tier, paid). + * plan-eng-review plan-mode smoke (gate, paid, real-PTY). * * See test/skill-e2e-plan-ceo-plan-mode.test.ts for the shared assertion - * contract. This file exercises the same assertions against /plan-eng-review. + * contract. This file exercises the same contract against /plan-eng-review. */ import { describe, test, expect } from 'bun:test'; -import { - runPlanModeSkillTest, - assertNotHandshakeShape, -} from './helpers/plan-mode-helpers'; +import { runPlanSkillObservation } from './helpers/claude-pty-runner'; const shouldRun = !!process.env.EVALS && process.env.EVALS_TIER === 'gate'; const describeE2E = shouldRun ? describe : describe.skip; describeE2E('plan-eng-review plan-mode smoke (gate)', () => { - test('goes straight to scope-mode question, no handshake, no silent writes', async () => { - const result = await runPlanModeSkillTest({ + test('reaches a terminal outcome (asked or plan_ready) without silent writes', async () => { + const obs = await runPlanSkillObservation({ skillName: 'plan-eng-review', - firstAnswerSubstring: 'HOLD', + inPlanMode: true, + timeoutMs: 300_000, }); - expect(result.askUserQuestions.length).toBeGreaterThanOrEqual(1); - assertNotHandshakeShape(result.askUserQuestions[0]!); - expect(result.writeOrEditBeforeAsk).toBe(false); - expect(result.exitPlanModeBeforeAsk).toBe(false); - }, 120_000); + if (obs.outcome === 'silent_write' || obs.outcome === 'exited' || obs.outcome === 'timeout') { + throw new Error( + `plan-eng-review plan-mode smoke FAILED: outcome=${obs.outcome}\n` + + `summary: ${obs.summary}\n` + + `elapsed: ${obs.elapsedMs}ms\n` + + `--- evidence (last 2KB visible) ---\n${obs.evidence}`, + ); + } + expect(['asked', 'plan_ready']).toContain(obs.outcome); + }, 360_000); }); diff --git a/test/skill-e2e-plan-mode-no-op.test.ts b/test/skill-e2e-plan-mode-no-op.test.ts index 061e9cd8..73999522 100644 --- a/test/skill-e2e-plan-mode-no-op.test.ts +++ b/test/skill-e2e-plan-mode-no-op.test.ts @@ -1,47 +1,48 @@ /** - * Plan-mode-info no-op regression (gate tier, paid). + * Plan-mode-info no-op regression (gate tier, paid, real-PTY). * - * Asserts: when /plan-ceo-review is invoked WITHOUT the plan-mode distinctive - * phrase in the system reminder, the plan-mode-info preamble section is a - * no-op. The skill should proceed to its normal Step 0 flow with no - * AskUserQuestion echoing or referencing the plan-mode reminder text. + * Asserts: when /plan-ceo-review is invoked OUTSIDE plan mode (no + * --permission-mode plan flag, no plan-mode reminder injected), the skill + * still reaches a terminal outcome ('asked' or 'plan_ready'). This is the + * negative coverage to the per-skill plan-mode smokes — if the + * plan-mode-info preamble section ever starts misfiring for non-plan-mode + * sessions (e.g., gating questions on a phrase that isn't there), this + * test catches it. * - * This guardrails the "outside plan mode, this block doesn't interfere" - * case — a different coverage case from the per-skill in-plan-mode smokes. - * If the plan-mode-info section ever starts misfiring for non-plan-mode - * sessions, this test catches it. - * - * Cost: ~$0.50 per run. Gated: EVALS=1 EVALS_TIER=gate. + * Why this matters: outside plan mode, claude doesn't render a native + * confirmation UI. The skill must drive its own AskUserQuestion. Same + * runner, same outcome contract — just `inPlanMode: false`. */ import { describe, test, expect } from 'bun:test'; -import { - runPlanModeSkillTest, - PLAN_MODE_REMINDER, -} from './helpers/plan-mode-helpers'; +import { runPlanSkillObservation } from './helpers/claude-pty-runner'; const shouldRun = !!process.env.EVALS && process.env.EVALS_TIER === 'gate'; const describeE2E = shouldRun ? describe : describe.skip; describeE2E('plan-mode-info no-op outside plan mode (gate regression)', () => { - test('no AskUserQuestion echoes the plan-mode reminder when absent', async () => { - const result = await runPlanModeSkillTest({ + test('skill reaches a terminal outcome outside plan mode', async () => { + const obs = await runPlanSkillObservation({ skillName: 'plan-ceo-review', - firstAnswerSubstring: 'HOLD', - omitPlanModeReminder: true, - maxTurns: 3, + inPlanMode: false, + timeoutMs: 300_000, }); - // Skill should still hit Step 0 normally outside plan mode. - expect(result.askUserQuestions.length).toBeGreaterThanOrEqual(1); - - // No AskUserQuestion should echo the plan-mode distinctive phrase. - // If one does, the plan-mode-info section is leaking outside plan mode. - for (const aq of result.askUserQuestions) { - const questions = aq.input.questions as Array<{ question: string }>; - for (const q of questions) { - expect(q.question).not.toContain(PLAN_MODE_REMINDER); - } + if (obs.outcome === 'silent_write' || obs.outcome === 'exited' || obs.outcome === 'timeout') { + throw new Error( + `plan-mode no-op regression FAILED: outcome=${obs.outcome}\n` + + `summary: ${obs.summary}\n` + + `elapsed: ${obs.elapsedMs}ms\n` + + `--- evidence (last 2KB visible) ---\n${obs.evidence}`, + ); } - }, 120_000); + expect(['asked', 'plan_ready']).toContain(obs.outcome); + + // Negative regression: the rendered output must NOT echo the plan-mode + // distinctive reminder phrase. If it does, the plan-mode preamble + // section is leaking outside plan mode. + const PLAN_MODE_REMINDER = + 'Plan mode is active. The user indicated that they do not want you to execute yet'; + expect(obs.evidence).not.toContain(PLAN_MODE_REMINDER); + }, 360_000); });