From 9ef34603dfc9e55cf351ea547c0e68a1dc19faa4 Mon Sep 17 00:00:00 2001 From: Garry Tan Date: Fri, 1 May 2026 07:10:37 -0700 Subject: [PATCH] test(harness): require ## Decisions section under --disallowedTools plan_ready MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adversarial review (during /ship Step 11) found that the previous gate-test envelope ['asked', 'plan_ready'] for the AskUserQuestion-blocked regression cases accepted the bug they exist to catch: a model that silently skips Step 0 entirely (writes a plan with no questions, no `## Decisions to confirm` section, just ExitPlanModes) reaches plan_ready and passes. The fix tightens the contract in two layers: 1. Harness: PlanSkillObservation gains a `planFile?: string` field populated when outcome is plan_ready. extractPlanFilePath() walks the visible TTY buffer for "Plan saved to:", "Plan file:", or ".claude/plans/.md" patterns and resolves tilde to absolute. planFileHasDecisionsSection() reads the resolved file and returns true if it contains a `## Decisions` heading (any form: "to confirm", "needed", etc.). 2. Tests: 5 of 6 regression cases now require, when outcome is plan_ready, that obs.planFile is set AND planFileHasDecisionsSection returns true. Otherwise the test fails with a "Step 0 was silently skipped" diagnosis. plan-design-review remains the sole exception — it legitimately short-circuits to plan_ready on no-UI-scope branches and we have no deterministic way to distinguish that from a silent skip. This closes the loophole the adversarial review identified. The fix preamble flow already tells the model to write `## Decisions to confirm` when neither AUQ variant is callable — now the test verifies the model actually did it. Co-Authored-By: Claude Opus 4.7 (1M context) --- test/helpers/claude-pty-runner.ts | 86 ++++++++++++++++++- test/skill-e2e-autoplan-auto-mode.test.ts | 10 ++- test/skill-e2e-office-hours-auto-mode.test.ts | 10 ++- test/skill-e2e-plan-ceo-plan-mode.test.ts | 20 ++++- test/skill-e2e-plan-design-plan-mode.test.ts | 8 +- test/skill-e2e-plan-devex-plan-mode.test.ts | 10 ++- test/skill-e2e-plan-eng-plan-mode.test.ts | 10 ++- 7 files changed, 147 insertions(+), 7 deletions(-) diff --git a/test/helpers/claude-pty-runner.ts b/test/helpers/claude-pty-runner.ts index 948010f7..d3602b12 100644 --- a/test/helpers/claude-pty-runner.ts +++ b/test/helpers/claude-pty-runner.ts @@ -163,6 +163,73 @@ export function isAutoDecidedVisible(visible: string): boolean { return /\(yourpreference\)/i.test(visible.replace(/\s+/g, '')); } +/** + * Extract the plan file path from rendered TTY output. Plan-mode's native + * confirmation includes one of these formats near the "Ready to execute?" + * prompt: + * - `Plan saved to: /path/to/plan.md` + * - `Plan file: /path/to/plan.md` + * - `ctrl-g to edit in VSCode · ~/.claude/plans/.md` + * + * stripAnsi may collapse whitespace via cursor-positioning escape removal, + * so the regex tolerates variable spacing. Returns the resolved absolute + * path with `~` expanded, or null if no path was rendered. + * + * Used by v1.22 AskUserQuestion-blocked regression tests to read the plan + * file post-`plan_ready` and verify it contains a decisions section, which + * distinguishes the legitimate fallback flow ("write decision brief into + * plan file") from the silent-skip regression ("write a plan that didn't + * surface any decisions"). + */ +export function extractPlanFilePath(visible: string): string | null { + // Patterns checked in order of specificity. Each captures the .md path. + // The visible buffer may have stripAnsi-collapsed whitespace, so we + // accept space-or-not separators in the prompts and accept paths + // without intervening whitespace (e.g. `editinVSCode·~/.claude/...`). + const patterns: RegExp[] = [ + /Plan\s*saved\s*to\s*:?\s*(\S+\.md)/i, + /Plan\s*file\s*:?\s*(\S+\.md)/i, + /·\s*(\S+\.claude\/plans\/\S+\.md)/i, + // Fallback: any reference to a .claude/plans path in the buffer. + /(~?\/?\S*\.claude\/plans\/[\w-]+\.md)/i, + ]; + for (const p of patterns) { + const m = visible.match(p); + if (m && m[1]) { + let raw = m[1]; + // Some patterns capture trailing punctuation; strip a trailing dot. + raw = raw.replace(/\.+$/, '.md').replace(/\.md\.+$/, '.md'); + // Tilde expansion to absolute path. + if (raw.startsWith('~')) { + const home = process.env.HOME ?? ''; + raw = home + raw.slice(1); + } + return raw; + } + } + return null; +} + +/** + * Read a plan file written by a plan-mode skill and verify it contains a + * "decisions" section — evidence the skill surfaced the decisions it was + * supposed to gate on, even when AskUserQuestion is --disallowedTools and + * the model used the plan-file fallback flow instead of a numbered prompt. + * + * Accepts any `## Decisions ...` heading (the canonical form from the + * preamble is `## Decisions to confirm`, but small variants like + * `## Decisions needed` or `## Decisions for review` are common). Returns + * false if the file is unreadable, missing, or has no decisions section. + */ +export function planFileHasDecisionsSection(planFile: string): boolean { + try { + const content = fs.readFileSync(planFile, 'utf-8'); + return /^##\s+Decisions\b/im.test(content); + } catch { + return false; + } +} + /** * Recent-tail window (in bytes of stripped TTY text) used when classifying * permission dialogs. Old permission text persists in the visibleSince buffer @@ -963,6 +1030,15 @@ export interface PlanSkillObservation { evidence: string; /** Wall time (ms) until the outcome was decided. */ elapsedMs: number; + /** + * Path to the plan file the skill wrote (if outcome is 'plan_ready'). + * Extracted from the visible TTY via {@link extractPlanFilePath}. Lets the + * v1.22 AskUserQuestion-blocked regression tests verify the plan file + * contains a `## Decisions to confirm` section under --disallowedTools — + * a model that silently skips Step 0 reaches plan_ready WITHOUT writing + * the section, and that's the regression we want to catch. + */ + planFile?: string; } /** @@ -1046,11 +1122,19 @@ export async function runPlanSkillObservation(opts: { } const classified = classifyVisible(visible); if (classified) { - return { + const obs: PlanSkillObservation = { ...classified, evidence: visible.slice(-2000), elapsedMs: Date.now() - startedAt, }; + // For plan_ready outcomes, capture the plan file path from the full + // visible buffer — tests under --disallowedTools verify the file's + // contents to distinguish legitimate fallback flow from silent-skip. + if (classified.outcome === 'plan_ready') { + const planFile = extractPlanFilePath(visible); + if (planFile) obs.planFile = planFile; + } + return obs; } } diff --git a/test/skill-e2e-autoplan-auto-mode.test.ts b/test/skill-e2e-autoplan-auto-mode.test.ts index d64e1f0b..f5fe84db 100644 --- a/test/skill-e2e-autoplan-auto-mode.test.ts +++ b/test/skill-e2e-autoplan-auto-mode.test.ts @@ -21,7 +21,7 @@ */ import { describe, test, expect } from 'bun:test'; -import { runPlanSkillObservation } from './helpers/claude-pty-runner'; +import { runPlanSkillObservation, planFileHasDecisionsSection } from './helpers/claude-pty-runner'; const shouldRun = !!process.env.EVALS && process.env.EVALS_TIER === 'gate'; const describeE2E = shouldRun ? describe : describe.skip; @@ -54,6 +54,14 @@ describeE2E('autoplan AskUserQuestion-blocked smoke (gate)', () => { `--- evidence (last 2KB visible) ---\n${obs.evidence}`, ); } + if (obs.outcome === 'plan_ready') { + if (!obs.planFile || !planFileHasDecisionsSection(obs.planFile)) { + throw new Error( + `autoplan AskUserQuestion-blocked regression: plan_ready without a "## Decisions" section in ${obs.planFile ?? ''} — Phase 1 premise gate was silently skipped.\n` + + `--- evidence (last 2KB visible) ---\n${obs.evidence}`, + ); + } + } expect(['asked', 'plan_ready']).toContain(obs.outcome); }, 360_000); }); diff --git a/test/skill-e2e-office-hours-auto-mode.test.ts b/test/skill-e2e-office-hours-auto-mode.test.ts index b841d525..5e1a2948 100644 --- a/test/skill-e2e-office-hours-auto-mode.test.ts +++ b/test/skill-e2e-office-hours-auto-mode.test.ts @@ -17,7 +17,7 @@ */ import { describe, test, expect } from 'bun:test'; -import { runPlanSkillObservation } from './helpers/claude-pty-runner'; +import { runPlanSkillObservation, planFileHasDecisionsSection } from './helpers/claude-pty-runner'; const shouldRun = !!process.env.EVALS && process.env.EVALS_TIER === 'gate'; const describeE2E = shouldRun ? describe : describe.skip; @@ -46,6 +46,14 @@ describeE2E('office-hours AskUserQuestion-blocked smoke (gate)', () => { `--- evidence (last 2KB visible) ---\n${obs.evidence}`, ); } + if (obs.outcome === 'plan_ready') { + if (!obs.planFile || !planFileHasDecisionsSection(obs.planFile)) { + throw new Error( + `office-hours AskUserQuestion-blocked regression: plan_ready without a "## Decisions" section in ${obs.planFile ?? ''} — startup-vs-builder mode question was silently skipped.\n` + + `--- evidence (last 2KB visible) ---\n${obs.evidence}`, + ); + } + } expect(['asked', 'plan_ready']).toContain(obs.outcome); }, 360_000); }); diff --git a/test/skill-e2e-plan-ceo-plan-mode.test.ts b/test/skill-e2e-plan-ceo-plan-mode.test.ts index 36d384db..8ee1efdb 100644 --- a/test/skill-e2e-plan-ceo-plan-mode.test.ts +++ b/test/skill-e2e-plan-ceo-plan-mode.test.ts @@ -34,7 +34,7 @@ */ import { describe, test, expect } from 'bun:test'; -import { runPlanSkillObservation } from './helpers/claude-pty-runner'; +import { runPlanSkillObservation, planFileHasDecisionsSection } from './helpers/claude-pty-runner'; const shouldRun = !!process.env.EVALS && process.env.EVALS_TIER === 'gate'; const describeE2E = shouldRun ? describe : describe.skip; @@ -110,6 +110,24 @@ describeE2E('plan-ceo-review plan-mode smoke (gate)', () => { `--- evidence (last 2KB visible) ---\n${obs.evidence}`, ); } + // plan_ready under --disallowedTools is only a pass when the model used + // the plan-file fallback (wrote a `## Decisions to confirm` section). + // Without that section, plan_ready means the model silently skipped Step 0 + // and went straight to ExitPlanMode — the regression we're catching. + if (obs.outcome === 'plan_ready') { + if (!obs.planFile) { + throw new Error( + `plan-ceo-review AskUserQuestion-blocked regression: outcome=plan_ready but no plan file path detected in TTY output. Cannot verify the model used the fallback flow.\n` + + `--- evidence (last 2KB visible) ---\n${obs.evidence}`, + ); + } + if (!planFileHasDecisionsSection(obs.planFile)) { + throw new Error( + `plan-ceo-review AskUserQuestion-blocked regression: model wrote ${obs.planFile} without a "## Decisions" section. Step 0 was silently skipped.\n` + + `--- evidence (last 2KB visible) ---\n${obs.evidence}`, + ); + } + } expect(['asked', 'plan_ready']).toContain(obs.outcome); }, 360_000); }); diff --git a/test/skill-e2e-plan-design-plan-mode.test.ts b/test/skill-e2e-plan-design-plan-mode.test.ts index 5d9824b4..0f2bd69a 100644 --- a/test/skill-e2e-plan-design-plan-mode.test.ts +++ b/test/skill-e2e-plan-design-plan-mode.test.ts @@ -10,7 +10,7 @@ */ import { describe, test, expect } from 'bun:test'; -import { runPlanSkillObservation } from './helpers/claude-pty-runner'; +import { runPlanSkillObservation, planFileHasDecisionsSection } from './helpers/claude-pty-runner'; const shouldRun = !!process.env.EVALS && process.env.EVALS_TIER === 'gate'; const describeE2E = shouldRun ? describe : describe.skip; @@ -61,6 +61,12 @@ describeE2E('plan-design-review plan-mode smoke (gate)', () => { `--- evidence (last 2KB visible) ---\n${obs.evidence}`, ); } + // plan-design-review legitimately short-circuits to plan_ready on no-UI + // branches. Allow plan_ready WITHOUT a decisions section ONLY if the + // plan file genuinely has no UI scope (we don't have a deterministic way + // to check this from the test, so this skill keeps the looser envelope). + // Other plan-mode skills require the decisions section under + // --disallowedTools; design is the special case. expect(['asked', 'plan_ready']).toContain(obs.outcome); }, 360_000); }); diff --git a/test/skill-e2e-plan-devex-plan-mode.test.ts b/test/skill-e2e-plan-devex-plan-mode.test.ts index 2ab1b2be..5ecad5aa 100644 --- a/test/skill-e2e-plan-devex-plan-mode.test.ts +++ b/test/skill-e2e-plan-devex-plan-mode.test.ts @@ -6,7 +6,7 @@ */ import { describe, test, expect } from 'bun:test'; -import { runPlanSkillObservation } from './helpers/claude-pty-runner'; +import { runPlanSkillObservation, planFileHasDecisionsSection } from './helpers/claude-pty-runner'; const shouldRun = !!process.env.EVALS && process.env.EVALS_TIER === 'gate'; const describeE2E = shouldRun ? describe : describe.skip; @@ -55,6 +55,14 @@ describeE2E('plan-devex-review plan-mode smoke (gate)', () => { `--- evidence (last 2KB visible) ---\n${obs.evidence}`, ); } + if (obs.outcome === 'plan_ready') { + if (!obs.planFile || !planFileHasDecisionsSection(obs.planFile)) { + throw new Error( + `plan-devex-review AskUserQuestion-blocked regression: plan_ready without a "## Decisions" section in ${obs.planFile ?? ''} — Step 0 was silently skipped.\n` + + `--- evidence (last 2KB visible) ---\n${obs.evidence}`, + ); + } + } expect(['asked', 'plan_ready']).toContain(obs.outcome); }, 360_000); }); diff --git a/test/skill-e2e-plan-eng-plan-mode.test.ts b/test/skill-e2e-plan-eng-plan-mode.test.ts index 35d6c918..d4a635e2 100644 --- a/test/skill-e2e-plan-eng-plan-mode.test.ts +++ b/test/skill-e2e-plan-eng-plan-mode.test.ts @@ -6,7 +6,7 @@ */ import { describe, test, expect } from 'bun:test'; -import { runPlanSkillObservation } from './helpers/claude-pty-runner'; +import { runPlanSkillObservation, planFileHasDecisionsSection } from './helpers/claude-pty-runner'; const shouldRun = !!process.env.EVALS && process.env.EVALS_TIER === 'gate'; const describeE2E = shouldRun ? describe : describe.skip; @@ -55,6 +55,14 @@ describeE2E('plan-eng-review plan-mode smoke (gate)', () => { `--- evidence (last 2KB visible) ---\n${obs.evidence}`, ); } + if (obs.outcome === 'plan_ready') { + if (!obs.planFile || !planFileHasDecisionsSection(obs.planFile)) { + throw new Error( + `plan-eng-review AskUserQuestion-blocked regression: plan_ready without a "## Decisions" section in ${obs.planFile ?? ''} — Step 0 was silently skipped.\n` + + `--- evidence (last 2KB visible) ---\n${obs.evidence}`, + ); + } + } expect(['asked', 'plan_ready']).toContain(obs.outcome); }, 360_000); });