From aaf26681033d1456daa241aacd9fe070b0664ad5 Mon Sep 17 00:00:00 2001 From: Garry Tan Date: Thu, 30 Apr 2026 21:13:48 -0700 Subject: [PATCH] test(e2e): add AskUserQuestion-blocked regression cases for 6 plan-mode skills MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Conductor launches Claude Code with --disallowedTools AskUserQuestion --permission-mode default --permission-prompt-tool stdio (verified by inspecting the live conductor claude process via ps -p ... -o args=). Native AskUserQuestion is removed from the model's tool registry; without fallback guidance the plan-mode skills (plan-ceo-review, plan-eng-review, plan-design-review, plan-devex-review, autoplan, office-hours) silently proceed and never surface decisions to the user. Adds 6 gate-tier real-PTY regression cases: - 4 inline test cases inside the existing plan-X-review-plan-mode.test files, each exercising the same skill with extraArgs ['--disallowedTools', 'AskUserQuestion'] and asserting outcome === 'asked'. plan-design-review keeps the ['asked', 'plan_ready'] envelope (legitimate short-circuit on no-UI-scope) but explicitly fails on 'auto_decided'. - 2 standalone test files for autoplan + office-hours (which had no prior plan-mode test). autoplan asserts the FIRST non-auto-decided gate fires (Phase 1 premise confirmation) — autoplan auto-decides intermediate questions BY DESIGN. Touchfile entries: - autoplan-auto-mode + office-hours-auto-mode added to E2E_TOUCHFILES + E2E_TIERS (gate) - existing plan-X-review-plan-mode entries gain question-tuning.ts and generate-ask-user-format.ts touchfile deps so AUTO_DECIDE-related resolver changes correctly invalidate the regression tests - touchfiles.test.ts count updated 18 -> 19 to cover the autoplan touchfile dependency on plan-ceo-review/** Filenames retain `auto-mode` for branch-history continuity. Auto-mode (the AUTO_DECIDE preamble path when QUESTION_TUNING=true) is a related but distinct silencing mechanism; both share the same fix surface in the preamble. These tests are expected to FAIL on this branch until the fix lands. The failure is the receipt for the regression. Co-Authored-By: Claude Opus 4.7 (1M context) --- test/helpers/touchfiles.ts | 34 +++++++++---- test/skill-e2e-autoplan-auto-mode.test.ts | 48 +++++++++++++++++++ test/skill-e2e-office-hours-auto-mode.test.ts | 44 +++++++++++++++++ test/skill-e2e-plan-ceo-plan-mode.test.ts | 27 +++++++++++ test/skill-e2e-plan-design-plan-mode.test.ts | 30 ++++++++++++ test/skill-e2e-plan-devex-plan-mode.test.ts | 23 +++++++++ test/skill-e2e-plan-eng-plan-mode.test.ts | 23 +++++++++ test/touchfiles.test.ts | 6 ++- 8 files changed, 224 insertions(+), 11 deletions(-) create mode 100644 test/skill-e2e-autoplan-auto-mode.test.ts create mode 100644 test/skill-e2e-office-hours-auto-mode.test.ts diff --git a/test/helpers/touchfiles.ts b/test/helpers/touchfiles.ts index 4552b8e1..79eed956 100644 --- a/test/helpers/touchfiles.ts +++ b/test/helpers/touchfiles.ts @@ -82,17 +82,30 @@ export const E2E_TOUCHFILES: Record = { 'plan-eng-review-artifact': ['plan-eng-review/**'], 'plan-review-report': ['plan-eng-review/**', 'scripts/gen-skill-docs.ts'], - // Plan-mode smoke tests — gate-tier safety regression tests. Each fires when - // any of: the interactive skill's template, the plan-mode resolver - // (completion-status owns generatePlanModeInfo), preamble composition, or - // the real-PTY runner (which the tests now use instead of the SDK harness) - // change. - 'plan-ceo-review-plan-mode': ['plan-ceo-review/**', 'scripts/resolvers/preamble/generate-completion-status.ts', 'scripts/resolvers/preamble.ts', 'test/helpers/claude-pty-runner.ts'], - 'plan-eng-review-plan-mode': ['plan-eng-review/**', 'scripts/resolvers/preamble/generate-completion-status.ts', 'scripts/resolvers/preamble.ts', 'test/helpers/claude-pty-runner.ts'], - 'plan-design-review-plan-mode': ['plan-design-review/**', 'scripts/resolvers/preamble/generate-completion-status.ts', 'scripts/resolvers/preamble.ts', 'test/helpers/claude-pty-runner.ts'], - 'plan-devex-review-plan-mode': ['plan-devex-review/**', 'scripts/resolvers/preamble/generate-completion-status.ts', 'scripts/resolvers/preamble.ts', 'test/helpers/claude-pty-runner.ts'], + // Plan-mode smoke tests — gate-tier safety regression tests. Each test file + // contains TWO test cases as of v1.21: the baseline plan-mode case and the + // AskUserQuestion-blocked regression case (--disallowedTools AskUserQuestion + // parameterized — the flag set Conductor uses by default). Touchfiles + // include question-tuning.ts and generate-ask-user-format.ts because the + // AUTO_DECIDE preamble injection lives there and changes can flip the + // regression test outcome between 'asked' and 'auto_decided'. + 'plan-ceo-review-plan-mode': ['plan-ceo-review/**', 'scripts/resolvers/preamble/generate-completion-status.ts', 'scripts/resolvers/question-tuning.ts', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble.ts', 'test/helpers/claude-pty-runner.ts'], + 'plan-eng-review-plan-mode': ['plan-eng-review/**', 'scripts/resolvers/preamble/generate-completion-status.ts', 'scripts/resolvers/question-tuning.ts', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble.ts', 'test/helpers/claude-pty-runner.ts'], + 'plan-design-review-plan-mode': ['plan-design-review/**', 'scripts/resolvers/preamble/generate-completion-status.ts', 'scripts/resolvers/question-tuning.ts', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble.ts', 'test/helpers/claude-pty-runner.ts'], + 'plan-devex-review-plan-mode': ['plan-devex-review/**', 'scripts/resolvers/preamble/generate-completion-status.ts', 'scripts/resolvers/question-tuning.ts', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble.ts', 'test/helpers/claude-pty-runner.ts'], 'plan-mode-no-op': ['plan-ceo-review/**', 'scripts/resolvers/preamble/generate-completion-status.ts', 'scripts/resolvers/preamble.ts', 'test/helpers/claude-pty-runner.ts'], + // v1.21+ AskUserQuestion-blocked regression tests — Conductor launches + // claude with `--disallowedTools AskUserQuestion --permission-mode default` + // (verified via `ps`); skills must still surface user-decisions through a + // fallback path (mcp__conductor__AskUserQuestion or plan-file flow) rather + // than silently auto-deciding. Parameterized regression test cases live + // INSIDE the existing 4 plan-X-review-plan-mode test files (covered + // transitively by the entries above). Two new standalone files exist for + // skills with no prior plan-mode test: + 'autoplan-auto-mode': ['autoplan/**', 'plan-ceo-review/**', 'plan-design-review/**', 'plan-eng-review/**', 'plan-devex-review/**', 'scripts/resolvers/preamble/generate-completion-status.ts', 'scripts/resolvers/question-tuning.ts', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble.ts', 'test/helpers/claude-pty-runner.ts'], + 'office-hours-auto-mode': ['office-hours/**', 'scripts/resolvers/preamble/generate-completion-status.ts', 'scripts/resolvers/question-tuning.ts', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble.ts', 'test/helpers/claude-pty-runner.ts'], + // Real-PTY E2E batch (#6 new tests on the harness). // Each one tests behavior the SDK harness can't observe (rendered TTY, // numbered-option lists, multi-phase ordering, idempotency state echo). @@ -369,6 +382,9 @@ export const E2E_TIERS: Record = { 'plan-design-review-plan-mode': 'gate', 'plan-devex-review-plan-mode': 'gate', 'plan-mode-no-op': 'gate', + // v1.21+ auto-mode regression tests + 'autoplan-auto-mode': 'gate', + 'office-hours-auto-mode': 'gate', 'e2e-harness-audit': 'gate', // Real-PTY E2E batch — tier classification: diff --git a/test/skill-e2e-autoplan-auto-mode.test.ts b/test/skill-e2e-autoplan-auto-mode.test.ts new file mode 100644 index 00000000..ecf13852 --- /dev/null +++ b/test/skill-e2e-autoplan-auto-mode.test.ts @@ -0,0 +1,48 @@ +/** + * autoplan AskUserQuestion-blocked regression (gate, paid, real-PTY). + * + * v1.21+ regression: Conductor launches Claude Code with + * `--disallowedTools AskUserQuestion --permission-mode default` (verified + * by inspecting the parent claude process via `ps`). The native + * AskUserQuestion tool is removed from the model's tool registry; without + * fallback guidance the model can't ask the user and silently proceeds. + * + * Autoplan auto-decides INTERMEDIATE questions BY DESIGN + * (autoplan/SKILL.md.tmpl:45), but Phase 1's premise confirmation gate is + * one of the few non-auto-decided AskUserQuestions and MUST surface to the + * user. This test asserts that gate still surfaces when AskUserQuestion is + * disallowed at the tool-registry level — the fix must route the question + * through a Conductor-side variant (mcp__conductor__AskUserQuestion) or + * through the plan-file + ExitPlanMode flow. + * + * Filename keeps `auto-mode` for branch-history continuity. Auto-mode (the + * AUTO_DECIDE preamble path when QUESTION_TUNING=true) is a related but + * distinct silencing mechanism; both share the same fix surface. + */ + +import { describe, test, expect } from 'bun:test'; +import { runPlanSkillObservation } from './helpers/claude-pty-runner'; + +const shouldRun = !!process.env.EVALS && process.env.EVALS_TIER === 'gate'; +const describeE2E = shouldRun ? describe : describe.skip; + +describeE2E('autoplan AskUserQuestion-blocked smoke (gate)', () => { + test('a non-auto-decided gate surfaces when AskUserQuestion is --disallowedTools', async () => { + const obs = await runPlanSkillObservation({ + skillName: 'autoplan', + inPlanMode: true, + extraArgs: ['--disallowedTools', 'AskUserQuestion'], + timeoutMs: 300_000, + }); + + if (obs.outcome !== 'asked') { + throw new Error( + `autoplan AskUserQuestion-blocked regression: outcome=${obs.outcome}\n` + + `summary: ${obs.summary}\n` + + `elapsed: ${obs.elapsedMs}ms\n` + + `--- evidence (last 2KB visible) ---\n${obs.evidence}`, + ); + } + expect(obs.outcome).toEqual('asked'); + }, 360_000); +}); diff --git a/test/skill-e2e-office-hours-auto-mode.test.ts b/test/skill-e2e-office-hours-auto-mode.test.ts new file mode 100644 index 00000000..27ab4b03 --- /dev/null +++ b/test/skill-e2e-office-hours-auto-mode.test.ts @@ -0,0 +1,44 @@ +/** + * office-hours AskUserQuestion-blocked regression (gate, paid, real-PTY). + * + * v1.21+ regression: Conductor launches Claude Code with + * `--disallowedTools AskUserQuestion --permission-mode default` (verified + * by inspecting the parent claude process via `ps`). office-hours' first + * step issues a startup-vs-builder mode AskUserQuestion + * (office-hours/SKILL.md.tmpl:69); when AskUserQuestion is disallowed at + * the tool-registry level the model cannot ask and silently picks one mode, + * breaking the whole interactive premise. This test asserts that question + * still surfaces — fix must route through mcp__conductor__AskUserQuestion + * (when present) or plan-file + ExitPlanMode flow. + * + * Filename keeps `auto-mode` for branch-history continuity. Auto-mode (the + * AUTO_DECIDE preamble path when QUESTION_TUNING=true) is a related but + * distinct silencing mechanism; both share the same fix surface. + */ + +import { describe, test, expect } from 'bun:test'; +import { runPlanSkillObservation } from './helpers/claude-pty-runner'; + +const shouldRun = !!process.env.EVALS && process.env.EVALS_TIER === 'gate'; +const describeE2E = shouldRun ? describe : describe.skip; + +describeE2E('office-hours AskUserQuestion-blocked smoke (gate)', () => { + test('AskUserQuestion surfaces when --disallowedTools AskUserQuestion is set', async () => { + const obs = await runPlanSkillObservation({ + skillName: 'office-hours', + inPlanMode: true, + extraArgs: ['--disallowedTools', 'AskUserQuestion'], + timeoutMs: 300_000, + }); + + if (obs.outcome !== 'asked') { + throw new Error( + `office-hours AskUserQuestion-blocked regression: outcome=${obs.outcome}\n` + + `summary: ${obs.summary}\n` + + `elapsed: ${obs.elapsedMs}ms\n` + + `--- evidence (last 2KB visible) ---\n${obs.evidence}`, + ); + } + expect(obs.outcome).toEqual('asked'); + }, 360_000); +}); diff --git a/test/skill-e2e-plan-ceo-plan-mode.test.ts b/test/skill-e2e-plan-ceo-plan-mode.test.ts index 8bb6a95b..279d25ca 100644 --- a/test/skill-e2e-plan-ceo-plan-mode.test.ts +++ b/test/skill-e2e-plan-ceo-plan-mode.test.ts @@ -45,4 +45,31 @@ describeE2E('plan-ceo-review plan-mode smoke (gate)', () => { } expect(['asked', 'plan_ready']).toContain(obs.outcome); }, 360_000); + + // v1.21+ regression: Conductor launches Claude Code with + // `--disallowedTools AskUserQuestion --permission-mode default` (verified + // via `ps` on the live Conductor claude process). Native AskUserQuestion + // is removed from the model's tool registry; without fallback guidance + // the model can't ask and silently proceeds. plan-ceo-review's Step 0 + // always asks a scope-mode question, so 'asked' is the only pass — the + // fix must route through mcp__conductor__AskUserQuestion (when present) + // or the plan-file + ExitPlanMode flow. + test('AskUserQuestion surfaces when --disallowedTools AskUserQuestion is set', async () => { + const obs = await runPlanSkillObservation({ + skillName: 'plan-ceo-review', + inPlanMode: true, + extraArgs: ['--disallowedTools', 'AskUserQuestion'], + timeoutMs: 300_000, + }); + + if (obs.outcome !== 'asked') { + throw new Error( + `plan-ceo-review AskUserQuestion-blocked regression: outcome=${obs.outcome}\n` + + `summary: ${obs.summary}\n` + + `elapsed: ${obs.elapsedMs}ms\n` + + `--- evidence (last 2KB visible) ---\n${obs.evidence}`, + ); + } + expect(obs.outcome).toEqual('asked'); + }, 360_000); }); diff --git a/test/skill-e2e-plan-design-plan-mode.test.ts b/test/skill-e2e-plan-design-plan-mode.test.ts index 6fd7881a..5d9824b4 100644 --- a/test/skill-e2e-plan-design-plan-mode.test.ts +++ b/test/skill-e2e-plan-design-plan-mode.test.ts @@ -33,4 +33,34 @@ describeE2E('plan-design-review plan-mode smoke (gate)', () => { } expect(['asked', 'plan_ready']).toContain(obs.outcome); }, 360_000); + + // v1.21+ regression: see skill-e2e-plan-ceo-plan-mode.test.ts for the + // contract. plan-design-review legitimately short-circuits on no-UI-scope + // branches, so this case keeps the same ['asked', 'plan_ready'] envelope + // as the baseline. The discriminating regression signals are + // 'auto_decided' (AUTO_DECIDE preamble fired upstream) or any failure + // outcome — both mean the user never saw a question they should have. + test('does not silently auto-decide when --disallowedTools AskUserQuestion is set', async () => { + const obs = await runPlanSkillObservation({ + skillName: 'plan-design-review', + inPlanMode: true, + extraArgs: ['--disallowedTools', 'AskUserQuestion'], + timeoutMs: 300_000, + }); + + if ( + obs.outcome === 'auto_decided' || + obs.outcome === 'silent_write' || + obs.outcome === 'exited' || + obs.outcome === 'timeout' + ) { + throw new Error( + `plan-design-review AskUserQuestion-blocked regression: outcome=${obs.outcome}\n` + + `summary: ${obs.summary}\n` + + `elapsed: ${obs.elapsedMs}ms\n` + + `--- evidence (last 2KB visible) ---\n${obs.evidence}`, + ); + } + expect(['asked', 'plan_ready']).toContain(obs.outcome); + }, 360_000); }); diff --git a/test/skill-e2e-plan-devex-plan-mode.test.ts b/test/skill-e2e-plan-devex-plan-mode.test.ts index 05f1abb3..5b51f843 100644 --- a/test/skill-e2e-plan-devex-plan-mode.test.ts +++ b/test/skill-e2e-plan-devex-plan-mode.test.ts @@ -29,4 +29,27 @@ describeE2E('plan-devex-review plan-mode smoke (gate)', () => { } expect(['asked', 'plan_ready']).toContain(obs.outcome); }, 360_000); + + // v1.21+ regression: see skill-e2e-plan-ceo-plan-mode.test.ts for the + // contract. plan-devex-review's Step 0 always issues a persona-mode + // AskUserQuestion before scoring, so 'asked' is the only pass when + // AskUserQuestion is --disallowedTools. + test('AskUserQuestion surfaces when --disallowedTools AskUserQuestion is set', async () => { + const obs = await runPlanSkillObservation({ + skillName: 'plan-devex-review', + inPlanMode: true, + extraArgs: ['--disallowedTools', 'AskUserQuestion'], + timeoutMs: 300_000, + }); + + if (obs.outcome !== 'asked') { + throw new Error( + `plan-devex-review AskUserQuestion-blocked regression: outcome=${obs.outcome}\n` + + `summary: ${obs.summary}\n` + + `elapsed: ${obs.elapsedMs}ms\n` + + `--- evidence (last 2KB visible) ---\n${obs.evidence}`, + ); + } + expect(obs.outcome).toEqual('asked'); + }, 360_000); }); diff --git a/test/skill-e2e-plan-eng-plan-mode.test.ts b/test/skill-e2e-plan-eng-plan-mode.test.ts index 93d55ece..903aa824 100644 --- a/test/skill-e2e-plan-eng-plan-mode.test.ts +++ b/test/skill-e2e-plan-eng-plan-mode.test.ts @@ -29,4 +29,27 @@ describeE2E('plan-eng-review plan-mode smoke (gate)', () => { } expect(['asked', 'plan_ready']).toContain(obs.outcome); }, 360_000); + + // v1.21+ regression: see skill-e2e-plan-ceo-plan-mode.test.ts for the + // contract. plan-eng-review's Step 0 always issues a scope-challenge + // AskUserQuestion (and per-section STOPs after that), so 'asked' is the + // only pass when AskUserQuestion is --disallowedTools. + test('AskUserQuestion surfaces when --disallowedTools AskUserQuestion is set', async () => { + const obs = await runPlanSkillObservation({ + skillName: 'plan-eng-review', + inPlanMode: true, + extraArgs: ['--disallowedTools', 'AskUserQuestion'], + timeoutMs: 300_000, + }); + + if (obs.outcome !== 'asked') { + throw new Error( + `plan-eng-review AskUserQuestion-blocked regression: outcome=${obs.outcome}\n` + + `summary: ${obs.summary}\n` + + `elapsed: ${obs.elapsedMs}ms\n` + + `--- evidence (last 2KB visible) ---\n${obs.evidence}`, + ); + } + expect(obs.outcome).toEqual('asked'); + }, 360_000); }); diff --git a/test/touchfiles.test.ts b/test/touchfiles.test.ts index 0d9ada4b..1135fe61 100644 --- a/test/touchfiles.test.ts +++ b/test/touchfiles.test.ts @@ -97,8 +97,10 @@ describe('selectTests', () => { expect(result.selected).toContain('ask-user-question-format-pty'); expect(result.selected).toContain('plan-ceo-mode-routing'); expect(result.selected).toContain('autoplan-chain-pty'); - expect(result.selected.length).toBe(18); - expect(result.skipped.length).toBe(Object.keys(E2E_TOUCHFILES).length - 18); + // v1.21+ auto-mode regression: autoplan-auto-mode also depends on plan-ceo-review/** + expect(result.selected).toContain('autoplan-auto-mode'); + expect(result.selected.length).toBe(19); + expect(result.skipped.length).toBe(Object.keys(E2E_TOUCHFILES).length - 19); }); test('global touchfile triggers ALL tests', () => {