test(e2e): add AskUserQuestion-blocked regression cases for 6 plan-mode skills

Conductor launches Claude Code with --disallowedTools AskUserQuestion --permission-mode default --permission-prompt-tool stdio (verified by inspecting the live conductor claude process via ps -p ... -o args=). Native AskUserQuestion is removed from the model's tool registry; without fallback guidance the plan-mode skills (plan-ceo-review, plan-eng-review, plan-design-review, plan-devex-review, autoplan, office-hours) silently proceed and never surface decisions to the user. Adds 6 gate-tier real-PTY regression cases: - 4 inline test cases inside the existing plan-X-review-plan-mode.test files, each exercising the same skill with extraArgs ['--disallowedTools', 'AskUserQuestion'] and asserting outcome === 'asked'. plan-design-review keeps the ['asked', 'plan_ready'] envelope (legitimate short-circuit on no-UI-scope) but explicitly fails on 'auto_decided'. - 2 standalone test files for autoplan + office-hours (which had no prior plan-mode test). autoplan asserts the FIRST non-auto-decided gate fires (Phase 1 premise confirmation) — autoplan auto-decides intermediate questions BY DESIGN. Touchfile entries: - autoplan-auto-mode + office-hours-auto-mode added to E2E_TOUCHFILES + E2E_TIERS (gate) - existing plan-X-review-plan-mode entries gain question-tuning.ts and generate-ask-user-format.ts touchfile deps so AUTO_DECIDE-related resolver changes correctly invalidate the regression tests - touchfiles.test.ts count updated 18 -> 19 to cover the autoplan touchfile dependency on plan-ceo-review/** Filenames retain `auto-mode` for branch-history continuity. Auto-mode (the AUTO_DECIDE preamble path when QUESTION_TUNING=true) is a related but distinct silencing mechanism; both share the same fix surface in the preamble. These tests are expected to FAIL on this branch until the fix lands. The failure is the receipt for the regression. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-07-06 16:17:56 +02:00 · 2026-04-30 21:13:48 -07:00
parent 6c2db0bec6
commit aaf2668103
8 changed files with 224 additions and 11 deletions
@@ -82,17 +82,30 @@ export const E2E_TOUCHFILES: Record<string, string[]> = {
  'plan-eng-review-artifact':  ['plan-eng-review/**'],
  'plan-review-report':        ['plan-eng-review/**', 'scripts/gen-skill-docs.ts'],

-  // Plan-mode smoke tests — gate-tier safety regression tests. Each fires when
-  // any of: the interactive skill's template, the plan-mode resolver
-  // (completion-status owns generatePlanModeInfo), preamble composition, or
-  // the real-PTY runner (which the tests now use instead of the SDK harness)
-  // change.
-  'plan-ceo-review-plan-mode':    ['plan-ceo-review/**', 'scripts/resolvers/preamble/generate-completion-status.ts', 'scripts/resolvers/preamble.ts', 'test/helpers/claude-pty-runner.ts'],
-  'plan-eng-review-plan-mode':    ['plan-eng-review/**', 'scripts/resolvers/preamble/generate-completion-status.ts', 'scripts/resolvers/preamble.ts', 'test/helpers/claude-pty-runner.ts'],
-  'plan-design-review-plan-mode': ['plan-design-review/**', 'scripts/resolvers/preamble/generate-completion-status.ts', 'scripts/resolvers/preamble.ts', 'test/helpers/claude-pty-runner.ts'],
-  'plan-devex-review-plan-mode':  ['plan-devex-review/**', 'scripts/resolvers/preamble/generate-completion-status.ts', 'scripts/resolvers/preamble.ts', 'test/helpers/claude-pty-runner.ts'],
+  // Plan-mode smoke tests — gate-tier safety regression tests. Each test file
+  // contains TWO test cases as of v1.21: the baseline plan-mode case and the
+  // AskUserQuestion-blocked regression case (--disallowedTools AskUserQuestion
+  // parameterized — the flag set Conductor uses by default). Touchfiles
+  // include question-tuning.ts and generate-ask-user-format.ts because the
+  // AUTO_DECIDE preamble injection lives there and changes can flip the
+  // regression test outcome between 'asked' and 'auto_decided'.
+  'plan-ceo-review-plan-mode':    ['plan-ceo-review/**', 'scripts/resolvers/preamble/generate-completion-status.ts', 'scripts/resolvers/question-tuning.ts', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble.ts', 'test/helpers/claude-pty-runner.ts'],
+  'plan-eng-review-plan-mode':    ['plan-eng-review/**', 'scripts/resolvers/preamble/generate-completion-status.ts', 'scripts/resolvers/question-tuning.ts', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble.ts', 'test/helpers/claude-pty-runner.ts'],
+  'plan-design-review-plan-mode': ['plan-design-review/**', 'scripts/resolvers/preamble/generate-completion-status.ts', 'scripts/resolvers/question-tuning.ts', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble.ts', 'test/helpers/claude-pty-runner.ts'],
+  'plan-devex-review-plan-mode':  ['plan-devex-review/**', 'scripts/resolvers/preamble/generate-completion-status.ts', 'scripts/resolvers/question-tuning.ts', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble.ts', 'test/helpers/claude-pty-runner.ts'],
  'plan-mode-no-op':              ['plan-ceo-review/**', 'scripts/resolvers/preamble/generate-completion-status.ts', 'scripts/resolvers/preamble.ts', 'test/helpers/claude-pty-runner.ts'],

+  // v1.21+ AskUserQuestion-blocked regression tests — Conductor launches
+  // claude with `--disallowedTools AskUserQuestion --permission-mode default`
+  // (verified via `ps`); skills must still surface user-decisions through a
+  // fallback path (mcp__conductor__AskUserQuestion or plan-file flow) rather
+  // than silently auto-deciding. Parameterized regression test cases live
+  // INSIDE the existing 4 plan-X-review-plan-mode test files (covered
+  // transitively by the entries above). Two new standalone files exist for
+  // skills with no prior plan-mode test:
+  'autoplan-auto-mode':           ['autoplan/**', 'plan-ceo-review/**', 'plan-design-review/**', 'plan-eng-review/**', 'plan-devex-review/**', 'scripts/resolvers/preamble/generate-completion-status.ts', 'scripts/resolvers/question-tuning.ts', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble.ts', 'test/helpers/claude-pty-runner.ts'],
+  'office-hours-auto-mode':       ['office-hours/**', 'scripts/resolvers/preamble/generate-completion-status.ts', 'scripts/resolvers/question-tuning.ts', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble.ts', 'test/helpers/claude-pty-runner.ts'],
+
  // Real-PTY E2E batch (#6 new tests on the harness).
  // Each one tests behavior the SDK harness can't observe (rendered TTY,
  // numbered-option lists, multi-phase ordering, idempotency state echo).
@@ -369,6 +382,9 @@ export const E2E_TIERS: Record<string, 'gate' | 'periodic'> = {
  'plan-design-review-plan-mode': 'gate',
  'plan-devex-review-plan-mode': 'gate',
  'plan-mode-no-op': 'gate',
+  // v1.21+ auto-mode regression tests
+  'autoplan-auto-mode': 'gate',
+  'office-hours-auto-mode': 'gate',
  'e2e-harness-audit': 'gate',

  // Real-PTY E2E batch — tier classification: