test(e2e): add AskUserQuestion-blocked regression cases for 6 plan-mode skills

Conductor launches Claude Code with --disallowedTools AskUserQuestion
--permission-mode default --permission-prompt-tool stdio (verified by
inspecting the live conductor claude process via ps -p ... -o args=).
Native AskUserQuestion is removed from the model's tool registry; without
fallback guidance the plan-mode skills (plan-ceo-review, plan-eng-review,
plan-design-review, plan-devex-review, autoplan, office-hours) silently
proceed and never surface decisions to the user.

Adds 6 gate-tier real-PTY regression cases:

  - 4 inline test cases inside the existing plan-X-review-plan-mode.test
    files, each exercising the same skill with extraArgs ['--disallowedTools',
    'AskUserQuestion'] and asserting outcome === 'asked'. plan-design-review
    keeps the ['asked', 'plan_ready'] envelope (legitimate short-circuit on
    no-UI-scope) but explicitly fails on 'auto_decided'.
  - 2 standalone test files for autoplan + office-hours (which had no prior
    plan-mode test). autoplan asserts the FIRST non-auto-decided gate fires
    (Phase 1 premise confirmation) — autoplan auto-decides intermediate
    questions BY DESIGN.

Touchfile entries:
  - autoplan-auto-mode + office-hours-auto-mode added to E2E_TOUCHFILES +
    E2E_TIERS (gate)
  - existing plan-X-review-plan-mode entries gain question-tuning.ts and
    generate-ask-user-format.ts touchfile deps so AUTO_DECIDE-related
    resolver changes correctly invalidate the regression tests
  - touchfiles.test.ts count updated 18 -> 19 to cover the autoplan
    touchfile dependency on plan-ceo-review/**

Filenames retain `auto-mode` for branch-history continuity. Auto-mode (the
AUTO_DECIDE preamble path when QUESTION_TUNING=true) is a related but
distinct silencing mechanism; both share the same fix surface in the
preamble.

These tests are expected to FAIL on this branch until the fix lands. The
failure is the receipt for the regression.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Garry Tan
2026-04-30 21:13:48 -07:00
parent 6c2db0bec6
commit aaf2668103
8 changed files with 224 additions and 11 deletions
+25 -9
View File
@@ -82,17 +82,30 @@ export const E2E_TOUCHFILES: Record<string, string[]> = {
'plan-eng-review-artifact': ['plan-eng-review/**'],
'plan-review-report': ['plan-eng-review/**', 'scripts/gen-skill-docs.ts'],
// Plan-mode smoke tests — gate-tier safety regression tests. Each fires when
// any of: the interactive skill's template, the plan-mode resolver
// (completion-status owns generatePlanModeInfo), preamble composition, or
// the real-PTY runner (which the tests now use instead of the SDK harness)
// change.
'plan-ceo-review-plan-mode': ['plan-ceo-review/**', 'scripts/resolvers/preamble/generate-completion-status.ts', 'scripts/resolvers/preamble.ts', 'test/helpers/claude-pty-runner.ts'],
'plan-eng-review-plan-mode': ['plan-eng-review/**', 'scripts/resolvers/preamble/generate-completion-status.ts', 'scripts/resolvers/preamble.ts', 'test/helpers/claude-pty-runner.ts'],
'plan-design-review-plan-mode': ['plan-design-review/**', 'scripts/resolvers/preamble/generate-completion-status.ts', 'scripts/resolvers/preamble.ts', 'test/helpers/claude-pty-runner.ts'],
'plan-devex-review-plan-mode': ['plan-devex-review/**', 'scripts/resolvers/preamble/generate-completion-status.ts', 'scripts/resolvers/preamble.ts', 'test/helpers/claude-pty-runner.ts'],
// Plan-mode smoke tests — gate-tier safety regression tests. Each test file
// contains TWO test cases as of v1.21: the baseline plan-mode case and the
// AskUserQuestion-blocked regression case (--disallowedTools AskUserQuestion
// parameterized — the flag set Conductor uses by default). Touchfiles
// include question-tuning.ts and generate-ask-user-format.ts because the
// AUTO_DECIDE preamble injection lives there and changes can flip the
// regression test outcome between 'asked' and 'auto_decided'.
'plan-ceo-review-plan-mode': ['plan-ceo-review/**', 'scripts/resolvers/preamble/generate-completion-status.ts', 'scripts/resolvers/question-tuning.ts', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble.ts', 'test/helpers/claude-pty-runner.ts'],
'plan-eng-review-plan-mode': ['plan-eng-review/**', 'scripts/resolvers/preamble/generate-completion-status.ts', 'scripts/resolvers/question-tuning.ts', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble.ts', 'test/helpers/claude-pty-runner.ts'],
'plan-design-review-plan-mode': ['plan-design-review/**', 'scripts/resolvers/preamble/generate-completion-status.ts', 'scripts/resolvers/question-tuning.ts', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble.ts', 'test/helpers/claude-pty-runner.ts'],
'plan-devex-review-plan-mode': ['plan-devex-review/**', 'scripts/resolvers/preamble/generate-completion-status.ts', 'scripts/resolvers/question-tuning.ts', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble.ts', 'test/helpers/claude-pty-runner.ts'],
'plan-mode-no-op': ['plan-ceo-review/**', 'scripts/resolvers/preamble/generate-completion-status.ts', 'scripts/resolvers/preamble.ts', 'test/helpers/claude-pty-runner.ts'],
// v1.21+ AskUserQuestion-blocked regression tests — Conductor launches
// claude with `--disallowedTools AskUserQuestion --permission-mode default`
// (verified via `ps`); skills must still surface user-decisions through a
// fallback path (mcp__conductor__AskUserQuestion or plan-file flow) rather
// than silently auto-deciding. Parameterized regression test cases live
// INSIDE the existing 4 plan-X-review-plan-mode test files (covered
// transitively by the entries above). Two new standalone files exist for
// skills with no prior plan-mode test:
'autoplan-auto-mode': ['autoplan/**', 'plan-ceo-review/**', 'plan-design-review/**', 'plan-eng-review/**', 'plan-devex-review/**', 'scripts/resolvers/preamble/generate-completion-status.ts', 'scripts/resolvers/question-tuning.ts', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble.ts', 'test/helpers/claude-pty-runner.ts'],
'office-hours-auto-mode': ['office-hours/**', 'scripts/resolvers/preamble/generate-completion-status.ts', 'scripts/resolvers/question-tuning.ts', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble.ts', 'test/helpers/claude-pty-runner.ts'],
// Real-PTY E2E batch (#6 new tests on the harness).
// Each one tests behavior the SDK harness can't observe (rendered TTY,
// numbered-option lists, multi-phase ordering, idempotency state echo).
@@ -369,6 +382,9 @@ export const E2E_TIERS: Record<string, 'gate' | 'periodic'> = {
'plan-design-review-plan-mode': 'gate',
'plan-devex-review-plan-mode': 'gate',
'plan-mode-no-op': 'gate',
// v1.21+ auto-mode regression tests
'autoplan-auto-mode': 'gate',
'office-hours-auto-mode': 'gate',
'e2e-harness-audit': 'gate',
// Real-PTY E2E batch — tier classification:
+48
View File
@@ -0,0 +1,48 @@
/**
* autoplan AskUserQuestion-blocked regression (gate, paid, real-PTY).
*
* v1.21+ regression: Conductor launches Claude Code with
* `--disallowedTools AskUserQuestion --permission-mode default` (verified
* by inspecting the parent claude process via `ps`). The native
* AskUserQuestion tool is removed from the model's tool registry; without
* fallback guidance the model can't ask the user and silently proceeds.
*
* Autoplan auto-decides INTERMEDIATE questions BY DESIGN
* (autoplan/SKILL.md.tmpl:45), but Phase 1's premise confirmation gate is
* one of the few non-auto-decided AskUserQuestions and MUST surface to the
* user. This test asserts that gate still surfaces when AskUserQuestion is
* disallowed at the tool-registry level — the fix must route the question
* through a Conductor-side variant (mcp__conductor__AskUserQuestion) or
* through the plan-file + ExitPlanMode flow.
*
* Filename keeps `auto-mode` for branch-history continuity. Auto-mode (the
* AUTO_DECIDE preamble path when QUESTION_TUNING=true) is a related but
* distinct silencing mechanism; both share the same fix surface.
*/
import { describe, test, expect } from 'bun:test';
import { runPlanSkillObservation } from './helpers/claude-pty-runner';
const shouldRun = !!process.env.EVALS && process.env.EVALS_TIER === 'gate';
const describeE2E = shouldRun ? describe : describe.skip;
describeE2E('autoplan AskUserQuestion-blocked smoke (gate)', () => {
test('a non-auto-decided gate surfaces when AskUserQuestion is --disallowedTools', async () => {
const obs = await runPlanSkillObservation({
skillName: 'autoplan',
inPlanMode: true,
extraArgs: ['--disallowedTools', 'AskUserQuestion'],
timeoutMs: 300_000,
});
if (obs.outcome !== 'asked') {
throw new Error(
`autoplan AskUserQuestion-blocked regression: outcome=${obs.outcome}\n` +
`summary: ${obs.summary}\n` +
`elapsed: ${obs.elapsedMs}ms\n` +
`--- evidence (last 2KB visible) ---\n${obs.evidence}`,
);
}
expect(obs.outcome).toEqual('asked');
}, 360_000);
});
@@ -0,0 +1,44 @@
/**
* office-hours AskUserQuestion-blocked regression (gate, paid, real-PTY).
*
* v1.21+ regression: Conductor launches Claude Code with
* `--disallowedTools AskUserQuestion --permission-mode default` (verified
* by inspecting the parent claude process via `ps`). office-hours' first
* step issues a startup-vs-builder mode AskUserQuestion
* (office-hours/SKILL.md.tmpl:69); when AskUserQuestion is disallowed at
* the tool-registry level the model cannot ask and silently picks one mode,
* breaking the whole interactive premise. This test asserts that question
* still surfaces — fix must route through mcp__conductor__AskUserQuestion
* (when present) or plan-file + ExitPlanMode flow.
*
* Filename keeps `auto-mode` for branch-history continuity. Auto-mode (the
* AUTO_DECIDE preamble path when QUESTION_TUNING=true) is a related but
* distinct silencing mechanism; both share the same fix surface.
*/
import { describe, test, expect } from 'bun:test';
import { runPlanSkillObservation } from './helpers/claude-pty-runner';
const shouldRun = !!process.env.EVALS && process.env.EVALS_TIER === 'gate';
const describeE2E = shouldRun ? describe : describe.skip;
describeE2E('office-hours AskUserQuestion-blocked smoke (gate)', () => {
test('AskUserQuestion surfaces when --disallowedTools AskUserQuestion is set', async () => {
const obs = await runPlanSkillObservation({
skillName: 'office-hours',
inPlanMode: true,
extraArgs: ['--disallowedTools', 'AskUserQuestion'],
timeoutMs: 300_000,
});
if (obs.outcome !== 'asked') {
throw new Error(
`office-hours AskUserQuestion-blocked regression: outcome=${obs.outcome}\n` +
`summary: ${obs.summary}\n` +
`elapsed: ${obs.elapsedMs}ms\n` +
`--- evidence (last 2KB visible) ---\n${obs.evidence}`,
);
}
expect(obs.outcome).toEqual('asked');
}, 360_000);
});
+27
View File
@@ -45,4 +45,31 @@ describeE2E('plan-ceo-review plan-mode smoke (gate)', () => {
}
expect(['asked', 'plan_ready']).toContain(obs.outcome);
}, 360_000);
// v1.21+ regression: Conductor launches Claude Code with
// `--disallowedTools AskUserQuestion --permission-mode default` (verified
// via `ps` on the live Conductor claude process). Native AskUserQuestion
// is removed from the model's tool registry; without fallback guidance
// the model can't ask and silently proceeds. plan-ceo-review's Step 0
// always asks a scope-mode question, so 'asked' is the only pass — the
// fix must route through mcp__conductor__AskUserQuestion (when present)
// or the plan-file + ExitPlanMode flow.
test('AskUserQuestion surfaces when --disallowedTools AskUserQuestion is set', async () => {
const obs = await runPlanSkillObservation({
skillName: 'plan-ceo-review',
inPlanMode: true,
extraArgs: ['--disallowedTools', 'AskUserQuestion'],
timeoutMs: 300_000,
});
if (obs.outcome !== 'asked') {
throw new Error(
`plan-ceo-review AskUserQuestion-blocked regression: outcome=${obs.outcome}\n` +
`summary: ${obs.summary}\n` +
`elapsed: ${obs.elapsedMs}ms\n` +
`--- evidence (last 2KB visible) ---\n${obs.evidence}`,
);
}
expect(obs.outcome).toEqual('asked');
}, 360_000);
});
@@ -33,4 +33,34 @@ describeE2E('plan-design-review plan-mode smoke (gate)', () => {
}
expect(['asked', 'plan_ready']).toContain(obs.outcome);
}, 360_000);
// v1.21+ regression: see skill-e2e-plan-ceo-plan-mode.test.ts for the
// contract. plan-design-review legitimately short-circuits on no-UI-scope
// branches, so this case keeps the same ['asked', 'plan_ready'] envelope
// as the baseline. The discriminating regression signals are
// 'auto_decided' (AUTO_DECIDE preamble fired upstream) or any failure
// outcome — both mean the user never saw a question they should have.
test('does not silently auto-decide when --disallowedTools AskUserQuestion is set', async () => {
const obs = await runPlanSkillObservation({
skillName: 'plan-design-review',
inPlanMode: true,
extraArgs: ['--disallowedTools', 'AskUserQuestion'],
timeoutMs: 300_000,
});
if (
obs.outcome === 'auto_decided' ||
obs.outcome === 'silent_write' ||
obs.outcome === 'exited' ||
obs.outcome === 'timeout'
) {
throw new Error(
`plan-design-review AskUserQuestion-blocked regression: outcome=${obs.outcome}\n` +
`summary: ${obs.summary}\n` +
`elapsed: ${obs.elapsedMs}ms\n` +
`--- evidence (last 2KB visible) ---\n${obs.evidence}`,
);
}
expect(['asked', 'plan_ready']).toContain(obs.outcome);
}, 360_000);
});
@@ -29,4 +29,27 @@ describeE2E('plan-devex-review plan-mode smoke (gate)', () => {
}
expect(['asked', 'plan_ready']).toContain(obs.outcome);
}, 360_000);
// v1.21+ regression: see skill-e2e-plan-ceo-plan-mode.test.ts for the
// contract. plan-devex-review's Step 0 always issues a persona-mode
// AskUserQuestion before scoring, so 'asked' is the only pass when
// AskUserQuestion is --disallowedTools.
test('AskUserQuestion surfaces when --disallowedTools AskUserQuestion is set', async () => {
const obs = await runPlanSkillObservation({
skillName: 'plan-devex-review',
inPlanMode: true,
extraArgs: ['--disallowedTools', 'AskUserQuestion'],
timeoutMs: 300_000,
});
if (obs.outcome !== 'asked') {
throw new Error(
`plan-devex-review AskUserQuestion-blocked regression: outcome=${obs.outcome}\n` +
`summary: ${obs.summary}\n` +
`elapsed: ${obs.elapsedMs}ms\n` +
`--- evidence (last 2KB visible) ---\n${obs.evidence}`,
);
}
expect(obs.outcome).toEqual('asked');
}, 360_000);
});
+23
View File
@@ -29,4 +29,27 @@ describeE2E('plan-eng-review plan-mode smoke (gate)', () => {
}
expect(['asked', 'plan_ready']).toContain(obs.outcome);
}, 360_000);
// v1.21+ regression: see skill-e2e-plan-ceo-plan-mode.test.ts for the
// contract. plan-eng-review's Step 0 always issues a scope-challenge
// AskUserQuestion (and per-section STOPs after that), so 'asked' is the
// only pass when AskUserQuestion is --disallowedTools.
test('AskUserQuestion surfaces when --disallowedTools AskUserQuestion is set', async () => {
const obs = await runPlanSkillObservation({
skillName: 'plan-eng-review',
inPlanMode: true,
extraArgs: ['--disallowedTools', 'AskUserQuestion'],
timeoutMs: 300_000,
});
if (obs.outcome !== 'asked') {
throw new Error(
`plan-eng-review AskUserQuestion-blocked regression: outcome=${obs.outcome}\n` +
`summary: ${obs.summary}\n` +
`elapsed: ${obs.elapsedMs}ms\n` +
`--- evidence (last 2KB visible) ---\n${obs.evidence}`,
);
}
expect(obs.outcome).toEqual('asked');
}, 360_000);
});
+4 -2
View File
@@ -97,8 +97,10 @@ describe('selectTests', () => {
expect(result.selected).toContain('ask-user-question-format-pty');
expect(result.selected).toContain('plan-ceo-mode-routing');
expect(result.selected).toContain('autoplan-chain-pty');
expect(result.selected.length).toBe(18);
expect(result.skipped.length).toBe(Object.keys(E2E_TOUCHFILES).length - 18);
// v1.21+ auto-mode regression: autoplan-auto-mode also depends on plan-ceo-review/**
expect(result.selected).toContain('autoplan-auto-mode');
expect(result.selected.length).toBe(19);
expect(result.skipped.length).toBe(Object.keys(E2E_TOUCHFILES).length - 19);
});
test('global touchfile triggers ALL tests', () => {