diff --git a/test/helpers/touchfiles.ts b/test/helpers/touchfiles.ts index 4552b8e1..79eed956 100644 --- a/test/helpers/touchfiles.ts +++ b/test/helpers/touchfiles.ts @@ -82,17 +82,30 @@ export const E2E_TOUCHFILES: Record = { 'plan-eng-review-artifact': ['plan-eng-review/**'], 'plan-review-report': ['plan-eng-review/**', 'scripts/gen-skill-docs.ts'], - // Plan-mode smoke tests — gate-tier safety regression tests. Each fires when - // any of: the interactive skill's template, the plan-mode resolver - // (completion-status owns generatePlanModeInfo), preamble composition, or - // the real-PTY runner (which the tests now use instead of the SDK harness) - // change. - 'plan-ceo-review-plan-mode': ['plan-ceo-review/**', 'scripts/resolvers/preamble/generate-completion-status.ts', 'scripts/resolvers/preamble.ts', 'test/helpers/claude-pty-runner.ts'], - 'plan-eng-review-plan-mode': ['plan-eng-review/**', 'scripts/resolvers/preamble/generate-completion-status.ts', 'scripts/resolvers/preamble.ts', 'test/helpers/claude-pty-runner.ts'], - 'plan-design-review-plan-mode': ['plan-design-review/**', 'scripts/resolvers/preamble/generate-completion-status.ts', 'scripts/resolvers/preamble.ts', 'test/helpers/claude-pty-runner.ts'], - 'plan-devex-review-plan-mode': ['plan-devex-review/**', 'scripts/resolvers/preamble/generate-completion-status.ts', 'scripts/resolvers/preamble.ts', 'test/helpers/claude-pty-runner.ts'], + // Plan-mode smoke tests — gate-tier safety regression tests. Each test file + // contains TWO test cases as of v1.21: the baseline plan-mode case and the + // AskUserQuestion-blocked regression case (--disallowedTools AskUserQuestion + // parameterized — the flag set Conductor uses by default). Touchfiles + // include question-tuning.ts and generate-ask-user-format.ts because the + // AUTO_DECIDE preamble injection lives there and changes can flip the + // regression test outcome between 'asked' and 'auto_decided'. + 'plan-ceo-review-plan-mode': ['plan-ceo-review/**', 'scripts/resolvers/preamble/generate-completion-status.ts', 'scripts/resolvers/question-tuning.ts', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble.ts', 'test/helpers/claude-pty-runner.ts'], + 'plan-eng-review-plan-mode': ['plan-eng-review/**', 'scripts/resolvers/preamble/generate-completion-status.ts', 'scripts/resolvers/question-tuning.ts', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble.ts', 'test/helpers/claude-pty-runner.ts'], + 'plan-design-review-plan-mode': ['plan-design-review/**', 'scripts/resolvers/preamble/generate-completion-status.ts', 'scripts/resolvers/question-tuning.ts', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble.ts', 'test/helpers/claude-pty-runner.ts'], + 'plan-devex-review-plan-mode': ['plan-devex-review/**', 'scripts/resolvers/preamble/generate-completion-status.ts', 'scripts/resolvers/question-tuning.ts', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble.ts', 'test/helpers/claude-pty-runner.ts'], 'plan-mode-no-op': ['plan-ceo-review/**', 'scripts/resolvers/preamble/generate-completion-status.ts', 'scripts/resolvers/preamble.ts', 'test/helpers/claude-pty-runner.ts'], + // v1.21+ AskUserQuestion-blocked regression tests — Conductor launches + // claude with `--disallowedTools AskUserQuestion --permission-mode default` + // (verified via `ps`); skills must still surface user-decisions through a + // fallback path (mcp__conductor__AskUserQuestion or plan-file flow) rather + // than silently auto-deciding. Parameterized regression test cases live + // INSIDE the existing 4 plan-X-review-plan-mode test files (covered + // transitively by the entries above). Two new standalone files exist for + // skills with no prior plan-mode test: + 'autoplan-auto-mode': ['autoplan/**', 'plan-ceo-review/**', 'plan-design-review/**', 'plan-eng-review/**', 'plan-devex-review/**', 'scripts/resolvers/preamble/generate-completion-status.ts', 'scripts/resolvers/question-tuning.ts', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble.ts', 'test/helpers/claude-pty-runner.ts'], + 'office-hours-auto-mode': ['office-hours/**', 'scripts/resolvers/preamble/generate-completion-status.ts', 'scripts/resolvers/question-tuning.ts', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble.ts', 'test/helpers/claude-pty-runner.ts'], + // Real-PTY E2E batch (#6 new tests on the harness). // Each one tests behavior the SDK harness can't observe (rendered TTY, // numbered-option lists, multi-phase ordering, idempotency state echo). @@ -369,6 +382,9 @@ export const E2E_TIERS: Record = { 'plan-design-review-plan-mode': 'gate', 'plan-devex-review-plan-mode': 'gate', 'plan-mode-no-op': 'gate', + // v1.21+ auto-mode regression tests + 'autoplan-auto-mode': 'gate', + 'office-hours-auto-mode': 'gate', 'e2e-harness-audit': 'gate', // Real-PTY E2E batch — tier classification: diff --git a/test/skill-e2e-autoplan-auto-mode.test.ts b/test/skill-e2e-autoplan-auto-mode.test.ts new file mode 100644 index 00000000..ecf13852 --- /dev/null +++ b/test/skill-e2e-autoplan-auto-mode.test.ts @@ -0,0 +1,48 @@ +/** + * autoplan AskUserQuestion-blocked regression (gate, paid, real-PTY). + * + * v1.21+ regression: Conductor launches Claude Code with + * `--disallowedTools AskUserQuestion --permission-mode default` (verified + * by inspecting the parent claude process via `ps`). The native + * AskUserQuestion tool is removed from the model's tool registry; without + * fallback guidance the model can't ask the user and silently proceeds. + * + * Autoplan auto-decides INTERMEDIATE questions BY DESIGN + * (autoplan/SKILL.md.tmpl:45), but Phase 1's premise confirmation gate is + * one of the few non-auto-decided AskUserQuestions and MUST surface to the + * user. This test asserts that gate still surfaces when AskUserQuestion is + * disallowed at the tool-registry level — the fix must route the question + * through a Conductor-side variant (mcp__conductor__AskUserQuestion) or + * through the plan-file + ExitPlanMode flow. + * + * Filename keeps `auto-mode` for branch-history continuity. Auto-mode (the + * AUTO_DECIDE preamble path when QUESTION_TUNING=true) is a related but + * distinct silencing mechanism; both share the same fix surface. + */ + +import { describe, test, expect } from 'bun:test'; +import { runPlanSkillObservation } from './helpers/claude-pty-runner'; + +const shouldRun = !!process.env.EVALS && process.env.EVALS_TIER === 'gate'; +const describeE2E = shouldRun ? describe : describe.skip; + +describeE2E('autoplan AskUserQuestion-blocked smoke (gate)', () => { + test('a non-auto-decided gate surfaces when AskUserQuestion is --disallowedTools', async () => { + const obs = await runPlanSkillObservation({ + skillName: 'autoplan', + inPlanMode: true, + extraArgs: ['--disallowedTools', 'AskUserQuestion'], + timeoutMs: 300_000, + }); + + if (obs.outcome !== 'asked') { + throw new Error( + `autoplan AskUserQuestion-blocked regression: outcome=${obs.outcome}\n` + + `summary: ${obs.summary}\n` + + `elapsed: ${obs.elapsedMs}ms\n` + + `--- evidence (last 2KB visible) ---\n${obs.evidence}`, + ); + } + expect(obs.outcome).toEqual('asked'); + }, 360_000); +}); diff --git a/test/skill-e2e-office-hours-auto-mode.test.ts b/test/skill-e2e-office-hours-auto-mode.test.ts new file mode 100644 index 00000000..27ab4b03 --- /dev/null +++ b/test/skill-e2e-office-hours-auto-mode.test.ts @@ -0,0 +1,44 @@ +/** + * office-hours AskUserQuestion-blocked regression (gate, paid, real-PTY). + * + * v1.21+ regression: Conductor launches Claude Code with + * `--disallowedTools AskUserQuestion --permission-mode default` (verified + * by inspecting the parent claude process via `ps`). office-hours' first + * step issues a startup-vs-builder mode AskUserQuestion + * (office-hours/SKILL.md.tmpl:69); when AskUserQuestion is disallowed at + * the tool-registry level the model cannot ask and silently picks one mode, + * breaking the whole interactive premise. This test asserts that question + * still surfaces — fix must route through mcp__conductor__AskUserQuestion + * (when present) or plan-file + ExitPlanMode flow. + * + * Filename keeps `auto-mode` for branch-history continuity. Auto-mode (the + * AUTO_DECIDE preamble path when QUESTION_TUNING=true) is a related but + * distinct silencing mechanism; both share the same fix surface. + */ + +import { describe, test, expect } from 'bun:test'; +import { runPlanSkillObservation } from './helpers/claude-pty-runner'; + +const shouldRun = !!process.env.EVALS && process.env.EVALS_TIER === 'gate'; +const describeE2E = shouldRun ? describe : describe.skip; + +describeE2E('office-hours AskUserQuestion-blocked smoke (gate)', () => { + test('AskUserQuestion surfaces when --disallowedTools AskUserQuestion is set', async () => { + const obs = await runPlanSkillObservation({ + skillName: 'office-hours', + inPlanMode: true, + extraArgs: ['--disallowedTools', 'AskUserQuestion'], + timeoutMs: 300_000, + }); + + if (obs.outcome !== 'asked') { + throw new Error( + `office-hours AskUserQuestion-blocked regression: outcome=${obs.outcome}\n` + + `summary: ${obs.summary}\n` + + `elapsed: ${obs.elapsedMs}ms\n` + + `--- evidence (last 2KB visible) ---\n${obs.evidence}`, + ); + } + expect(obs.outcome).toEqual('asked'); + }, 360_000); +}); diff --git a/test/skill-e2e-plan-ceo-plan-mode.test.ts b/test/skill-e2e-plan-ceo-plan-mode.test.ts index 8bb6a95b..279d25ca 100644 --- a/test/skill-e2e-plan-ceo-plan-mode.test.ts +++ b/test/skill-e2e-plan-ceo-plan-mode.test.ts @@ -45,4 +45,31 @@ describeE2E('plan-ceo-review plan-mode smoke (gate)', () => { } expect(['asked', 'plan_ready']).toContain(obs.outcome); }, 360_000); + + // v1.21+ regression: Conductor launches Claude Code with + // `--disallowedTools AskUserQuestion --permission-mode default` (verified + // via `ps` on the live Conductor claude process). Native AskUserQuestion + // is removed from the model's tool registry; without fallback guidance + // the model can't ask and silently proceeds. plan-ceo-review's Step 0 + // always asks a scope-mode question, so 'asked' is the only pass — the + // fix must route through mcp__conductor__AskUserQuestion (when present) + // or the plan-file + ExitPlanMode flow. + test('AskUserQuestion surfaces when --disallowedTools AskUserQuestion is set', async () => { + const obs = await runPlanSkillObservation({ + skillName: 'plan-ceo-review', + inPlanMode: true, + extraArgs: ['--disallowedTools', 'AskUserQuestion'], + timeoutMs: 300_000, + }); + + if (obs.outcome !== 'asked') { + throw new Error( + `plan-ceo-review AskUserQuestion-blocked regression: outcome=${obs.outcome}\n` + + `summary: ${obs.summary}\n` + + `elapsed: ${obs.elapsedMs}ms\n` + + `--- evidence (last 2KB visible) ---\n${obs.evidence}`, + ); + } + expect(obs.outcome).toEqual('asked'); + }, 360_000); }); diff --git a/test/skill-e2e-plan-design-plan-mode.test.ts b/test/skill-e2e-plan-design-plan-mode.test.ts index 6fd7881a..5d9824b4 100644 --- a/test/skill-e2e-plan-design-plan-mode.test.ts +++ b/test/skill-e2e-plan-design-plan-mode.test.ts @@ -33,4 +33,34 @@ describeE2E('plan-design-review plan-mode smoke (gate)', () => { } expect(['asked', 'plan_ready']).toContain(obs.outcome); }, 360_000); + + // v1.21+ regression: see skill-e2e-plan-ceo-plan-mode.test.ts for the + // contract. plan-design-review legitimately short-circuits on no-UI-scope + // branches, so this case keeps the same ['asked', 'plan_ready'] envelope + // as the baseline. The discriminating regression signals are + // 'auto_decided' (AUTO_DECIDE preamble fired upstream) or any failure + // outcome — both mean the user never saw a question they should have. + test('does not silently auto-decide when --disallowedTools AskUserQuestion is set', async () => { + const obs = await runPlanSkillObservation({ + skillName: 'plan-design-review', + inPlanMode: true, + extraArgs: ['--disallowedTools', 'AskUserQuestion'], + timeoutMs: 300_000, + }); + + if ( + obs.outcome === 'auto_decided' || + obs.outcome === 'silent_write' || + obs.outcome === 'exited' || + obs.outcome === 'timeout' + ) { + throw new Error( + `plan-design-review AskUserQuestion-blocked regression: outcome=${obs.outcome}\n` + + `summary: ${obs.summary}\n` + + `elapsed: ${obs.elapsedMs}ms\n` + + `--- evidence (last 2KB visible) ---\n${obs.evidence}`, + ); + } + expect(['asked', 'plan_ready']).toContain(obs.outcome); + }, 360_000); }); diff --git a/test/skill-e2e-plan-devex-plan-mode.test.ts b/test/skill-e2e-plan-devex-plan-mode.test.ts index 05f1abb3..5b51f843 100644 --- a/test/skill-e2e-plan-devex-plan-mode.test.ts +++ b/test/skill-e2e-plan-devex-plan-mode.test.ts @@ -29,4 +29,27 @@ describeE2E('plan-devex-review plan-mode smoke (gate)', () => { } expect(['asked', 'plan_ready']).toContain(obs.outcome); }, 360_000); + + // v1.21+ regression: see skill-e2e-plan-ceo-plan-mode.test.ts for the + // contract. plan-devex-review's Step 0 always issues a persona-mode + // AskUserQuestion before scoring, so 'asked' is the only pass when + // AskUserQuestion is --disallowedTools. + test('AskUserQuestion surfaces when --disallowedTools AskUserQuestion is set', async () => { + const obs = await runPlanSkillObservation({ + skillName: 'plan-devex-review', + inPlanMode: true, + extraArgs: ['--disallowedTools', 'AskUserQuestion'], + timeoutMs: 300_000, + }); + + if (obs.outcome !== 'asked') { + throw new Error( + `plan-devex-review AskUserQuestion-blocked regression: outcome=${obs.outcome}\n` + + `summary: ${obs.summary}\n` + + `elapsed: ${obs.elapsedMs}ms\n` + + `--- evidence (last 2KB visible) ---\n${obs.evidence}`, + ); + } + expect(obs.outcome).toEqual('asked'); + }, 360_000); }); diff --git a/test/skill-e2e-plan-eng-plan-mode.test.ts b/test/skill-e2e-plan-eng-plan-mode.test.ts index 93d55ece..903aa824 100644 --- a/test/skill-e2e-plan-eng-plan-mode.test.ts +++ b/test/skill-e2e-plan-eng-plan-mode.test.ts @@ -29,4 +29,27 @@ describeE2E('plan-eng-review plan-mode smoke (gate)', () => { } expect(['asked', 'plan_ready']).toContain(obs.outcome); }, 360_000); + + // v1.21+ regression: see skill-e2e-plan-ceo-plan-mode.test.ts for the + // contract. plan-eng-review's Step 0 always issues a scope-challenge + // AskUserQuestion (and per-section STOPs after that), so 'asked' is the + // only pass when AskUserQuestion is --disallowedTools. + test('AskUserQuestion surfaces when --disallowedTools AskUserQuestion is set', async () => { + const obs = await runPlanSkillObservation({ + skillName: 'plan-eng-review', + inPlanMode: true, + extraArgs: ['--disallowedTools', 'AskUserQuestion'], + timeoutMs: 300_000, + }); + + if (obs.outcome !== 'asked') { + throw new Error( + `plan-eng-review AskUserQuestion-blocked regression: outcome=${obs.outcome}\n` + + `summary: ${obs.summary}\n` + + `elapsed: ${obs.elapsedMs}ms\n` + + `--- evidence (last 2KB visible) ---\n${obs.evidence}`, + ); + } + expect(obs.outcome).toEqual('asked'); + }, 360_000); }); diff --git a/test/touchfiles.test.ts b/test/touchfiles.test.ts index 0d9ada4b..1135fe61 100644 --- a/test/touchfiles.test.ts +++ b/test/touchfiles.test.ts @@ -97,8 +97,10 @@ describe('selectTests', () => { expect(result.selected).toContain('ask-user-question-format-pty'); expect(result.selected).toContain('plan-ceo-mode-routing'); expect(result.selected).toContain('autoplan-chain-pty'); - expect(result.selected.length).toBe(18); - expect(result.skipped.length).toBe(Object.keys(E2E_TOUCHFILES).length - 18); + // v1.21+ auto-mode regression: autoplan-auto-mode also depends on plan-ceo-review/** + expect(result.selected).toContain('autoplan-auto-mode'); + expect(result.selected.length).toBe(19); + expect(result.skipped.length).toBe(Object.keys(E2E_TOUCHFILES).length - 19); }); test('global touchfile triggers ALL tests', () => {