From 9ef34603dfc9e55cf351ea547c0e68a1dc19faa4 Mon Sep 17 00:00:00 2001
From: Garry Tan <garrytan@gmail.com>
Date: Fri, 1 May 2026 07:10:37 -0700
Subject: [PATCH] test(harness): require ## Decisions section under
 --disallowedTools plan_ready
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adversarial review (during /ship Step 11) found that the previous gate-test
envelope ['asked', 'plan_ready'] for the AskUserQuestion-blocked regression
cases accepted the bug they exist to catch: a model that silently skips
Step 0 entirely (writes a plan with no questions, no `## Decisions to
confirm` section, just ExitPlanModes) reaches plan_ready and passes.

The fix tightens the contract in two layers:

1. Harness: PlanSkillObservation gains a `planFile?: string` field
   populated when outcome is plan_ready. extractPlanFilePath() walks the
   visible TTY buffer for "Plan saved to:", "Plan file:", or
   ".claude/plans/<name>.md" patterns and resolves tilde to absolute.
   planFileHasDecisionsSection() reads the resolved file and returns true
   if it contains a `## Decisions` heading (any form: "to confirm",
   "needed", etc.).

2. Tests: 5 of 6 regression cases now require, when outcome is plan_ready,
   that obs.planFile is set AND planFileHasDecisionsSection returns true.
   Otherwise the test fails with a "Step 0 was silently skipped" diagnosis.
   plan-design-review remains the sole exception — it legitimately
   short-circuits to plan_ready on no-UI-scope branches and we have no
   deterministic way to distinguish that from a silent skip.

This closes the loophole the adversarial review identified. The fix
preamble flow already tells the model to write `## Decisions to confirm`
when neither AUQ variant is callable — now the test verifies the model
actually did it.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 test/helpers/claude-pty-runner.ts             | 86 ++++++++++++++++++-
 test/skill-e2e-autoplan-auto-mode.test.ts     | 10 ++-
 test/skill-e2e-office-hours-auto-mode.test.ts | 10 ++-
 test/skill-e2e-plan-ceo-plan-mode.test.ts     | 20 ++++-
 test/skill-e2e-plan-design-plan-mode.test.ts  |  8 +-
 test/skill-e2e-plan-devex-plan-mode.test.ts   | 10 ++-
 test/skill-e2e-plan-eng-plan-mode.test.ts     | 10 ++-
 7 files changed, 147 insertions(+), 7 deletions(-)
diff --git a/test/helpers/claude-pty-runner.ts b/test/helpers/claude-pty-runner.ts
index 948010f7..d3602b12 100644
--- a/test/helpers/claude-pty-runner.ts
+++ b/test/helpers/claude-pty-runner.ts
@@ -163,6 +163,73 @@ export function isAutoDecidedVisible(visible: string): boolean {
   return /\(yourpreference\)/i.test(visible.replace(/\s+/g, ''));
 }
 
+/**
+ * Extract the plan file path from rendered TTY output. Plan-mode's native
+ * confirmation includes one of these formats near the "Ready to execute?"
+ * prompt:
+ *   - `Plan saved to: /path/to/plan.md`
+ *   - `Plan file: /path/to/plan.md`
+ *   - `ctrl-g to edit in VSCode · ~/.claude/plans/<name>.md`
+ *
+ * stripAnsi may collapse whitespace via cursor-positioning escape removal,
+ * so the regex tolerates variable spacing. Returns the resolved absolute
+ * path with `~` expanded, or null if no path was rendered.
+ *
+ * Used by v1.22 AskUserQuestion-blocked regression tests to read the plan
+ * file post-`plan_ready` and verify it contains a decisions section, which
+ * distinguishes the legitimate fallback flow ("write decision brief into
+ * plan file") from the silent-skip regression ("write a plan that didn't
+ * surface any decisions").
+ */
+export function extractPlanFilePath(visible: string): string | null {
+  // Patterns checked in order of specificity. Each captures the .md path.
+  // The visible buffer may have stripAnsi-collapsed whitespace, so we
+  // accept space-or-not separators in the prompts and accept paths
+  // without intervening whitespace (e.g. `editinVSCode·~/.claude/...`).
+  const patterns: RegExp[] = [
+    /Plan\s*saved\s*to\s*:?\s*(\S+\.md)/i,
+    /Plan\s*file\s*:?\s*(\S+\.md)/i,
+    /·\s*(\S+\.claude\/plans\/\S+\.md)/i,
+    // Fallback: any reference to a .claude/plans path in the buffer.
+    /(~?\/?\S*\.claude\/plans\/[\w-]+\.md)/i,
+  ];
+  for (const p of patterns) {
+    const m = visible.match(p);
+    if (m && m[1]) {
+      let raw = m[1];
+      // Some patterns capture trailing punctuation; strip a trailing dot.
+      raw = raw.replace(/\.+$/, '.md').replace(/\.md\.+$/, '.md');
+      // Tilde expansion to absolute path.
+      if (raw.startsWith('~')) {
+        const home = process.env.HOME ?? '';
+        raw = home + raw.slice(1);
+      }
+      return raw;
+    }
+  }
+  return null;
+}
+
+/**
+ * Read a plan file written by a plan-mode skill and verify it contains a
+ * "decisions" section — evidence the skill surfaced the decisions it was
+ * supposed to gate on, even when AskUserQuestion is --disallowedTools and
+ * the model used the plan-file fallback flow instead of a numbered prompt.
+ *
+ * Accepts any `## Decisions ...` heading (the canonical form from the
+ * preamble is `## Decisions to confirm`, but small variants like
+ * `## Decisions needed` or `## Decisions for review` are common). Returns
+ * false if the file is unreadable, missing, or has no decisions section.
+ */
+export function planFileHasDecisionsSection(planFile: string): boolean {
+  try {
+    const content = fs.readFileSync(planFile, 'utf-8');
+    return /^##\s+Decisions\b/im.test(content);
+  } catch {
+    return false;
+  }
+}
+
 /**
  * Recent-tail window (in bytes of stripped TTY text) used when classifying
  * permission dialogs. Old permission text persists in the visibleSince buffer
@@ -963,6 +1030,15 @@ export interface PlanSkillObservation {
   evidence: string;
   /** Wall time (ms) until the outcome was decided. */
   elapsedMs: number;
+  /**
+   * Path to the plan file the skill wrote (if outcome is 'plan_ready').
+   * Extracted from the visible TTY via {@link extractPlanFilePath}. Lets the
+   * v1.22 AskUserQuestion-blocked regression tests verify the plan file
+   * contains a `## Decisions to confirm` section under --disallowedTools —
+   * a model that silently skips Step 0 reaches plan_ready WITHOUT writing
+   * the section, and that's the regression we want to catch.
+   */
+  planFile?: string;
 }
 
 /**
@@ -1046,11 +1122,19 @@ export async function runPlanSkillObservation(opts: {
       }
       const classified = classifyVisible(visible);
       if (classified) {
-        return {
+        const obs: PlanSkillObservation = {
           ...classified,
           evidence: visible.slice(-2000),
           elapsedMs: Date.now() - startedAt,
         };
+        // For plan_ready outcomes, capture the plan file path from the full
+        // visible buffer — tests under --disallowedTools verify the file's
+        // contents to distinguish legitimate fallback flow from silent-skip.
+        if (classified.outcome === 'plan_ready') {
+          const planFile = extractPlanFilePath(visible);
+          if (planFile) obs.planFile = planFile;
+        }
+        return obs;
       }
     }
 
diff --git a/test/skill-e2e-autoplan-auto-mode.test.ts b/test/skill-e2e-autoplan-auto-mode.test.ts
index d64e1f0b..f5fe84db 100644
--- a/test/skill-e2e-autoplan-auto-mode.test.ts
+++ b/test/skill-e2e-autoplan-auto-mode.test.ts
@@ -21,7 +21,7 @@
  */
 
 import { describe, test, expect } from 'bun:test';
-import { runPlanSkillObservation } from './helpers/claude-pty-runner';
+import { runPlanSkillObservation, planFileHasDecisionsSection } from './helpers/claude-pty-runner';
 
 const shouldRun = !!process.env.EVALS && process.env.EVALS_TIER === 'gate';
 const describeE2E = shouldRun ? describe : describe.skip;
@@ -54,6 +54,14 @@ describeE2E('autoplan AskUserQuestion-blocked smoke (gate)', () => {
           `--- evidence (last 2KB visible) ---\n${obs.evidence}`,
       );
     }
+    if (obs.outcome === 'plan_ready') {
+      if (!obs.planFile || !planFileHasDecisionsSection(obs.planFile)) {
+        throw new Error(
+          `autoplan AskUserQuestion-blocked regression: plan_ready without a "## Decisions" section in ${obs.planFile ?? '<no plan file detected>'} — Phase 1 premise gate was silently skipped.\n` +
+            `--- evidence (last 2KB visible) ---\n${obs.evidence}`,
+        );
+      }
+    }
     expect(['asked', 'plan_ready']).toContain(obs.outcome);
   }, 360_000);
 });
diff --git a/test/skill-e2e-office-hours-auto-mode.test.ts b/test/skill-e2e-office-hours-auto-mode.test.ts
index b841d525..5e1a2948 100644
--- a/test/skill-e2e-office-hours-auto-mode.test.ts
+++ b/test/skill-e2e-office-hours-auto-mode.test.ts
@@ -17,7 +17,7 @@
  */
 
 import { describe, test, expect } from 'bun:test';
-import { runPlanSkillObservation } from './helpers/claude-pty-runner';
+import { runPlanSkillObservation, planFileHasDecisionsSection } from './helpers/claude-pty-runner';
 
 const shouldRun = !!process.env.EVALS && process.env.EVALS_TIER === 'gate';
 const describeE2E = shouldRun ? describe : describe.skip;
@@ -46,6 +46,14 @@ describeE2E('office-hours AskUserQuestion-blocked smoke (gate)', () => {
           `--- evidence (last 2KB visible) ---\n${obs.evidence}`,
       );
     }
+    if (obs.outcome === 'plan_ready') {
+      if (!obs.planFile || !planFileHasDecisionsSection(obs.planFile)) {
+        throw new Error(
+          `office-hours AskUserQuestion-blocked regression: plan_ready without a "## Decisions" section in ${obs.planFile ?? '<no plan file detected>'} — startup-vs-builder mode question was silently skipped.\n` +
+            `--- evidence (last 2KB visible) ---\n${obs.evidence}`,
+        );
+      }
+    }
     expect(['asked', 'plan_ready']).toContain(obs.outcome);
   }, 360_000);
 });
diff --git a/test/skill-e2e-plan-ceo-plan-mode.test.ts b/test/skill-e2e-plan-ceo-plan-mode.test.ts
index 36d384db..8ee1efdb 100644
--- a/test/skill-e2e-plan-ceo-plan-mode.test.ts
+++ b/test/skill-e2e-plan-ceo-plan-mode.test.ts
@@ -34,7 +34,7 @@
  */
 
 import { describe, test, expect } from 'bun:test';
-import { runPlanSkillObservation } from './helpers/claude-pty-runner';
+import { runPlanSkillObservation, planFileHasDecisionsSection } from './helpers/claude-pty-runner';
 
 const shouldRun = !!process.env.EVALS && process.env.EVALS_TIER === 'gate';
 const describeE2E = shouldRun ? describe : describe.skip;
@@ -110,6 +110,24 @@ describeE2E('plan-ceo-review plan-mode smoke (gate)', () => {
           `--- evidence (last 2KB visible) ---\n${obs.evidence}`,
       );
     }
+    // plan_ready under --disallowedTools is only a pass when the model used
+    // the plan-file fallback (wrote a `## Decisions to confirm` section).
+    // Without that section, plan_ready means the model silently skipped Step 0
+    // and went straight to ExitPlanMode — the regression we're catching.
+    if (obs.outcome === 'plan_ready') {
+      if (!obs.planFile) {
+        throw new Error(
+          `plan-ceo-review AskUserQuestion-blocked regression: outcome=plan_ready but no plan file path detected in TTY output. Cannot verify the model used the fallback flow.\n` +
+            `--- evidence (last 2KB visible) ---\n${obs.evidence}`,
+        );
+      }
+      if (!planFileHasDecisionsSection(obs.planFile)) {
+        throw new Error(
+          `plan-ceo-review AskUserQuestion-blocked regression: model wrote ${obs.planFile} without a "## Decisions" section. Step 0 was silently skipped.\n` +
+            `--- evidence (last 2KB visible) ---\n${obs.evidence}`,
+        );
+      }
+    }
     expect(['asked', 'plan_ready']).toContain(obs.outcome);
   }, 360_000);
 });
diff --git a/test/skill-e2e-plan-design-plan-mode.test.ts b/test/skill-e2e-plan-design-plan-mode.test.ts
index 5d9824b4..0f2bd69a 100644
--- a/test/skill-e2e-plan-design-plan-mode.test.ts
+++ b/test/skill-e2e-plan-design-plan-mode.test.ts
@@ -10,7 +10,7 @@
  */
 
 import { describe, test, expect } from 'bun:test';
-import { runPlanSkillObservation } from './helpers/claude-pty-runner';
+import { runPlanSkillObservation, planFileHasDecisionsSection } from './helpers/claude-pty-runner';
 
 const shouldRun = !!process.env.EVALS && process.env.EVALS_TIER === 'gate';
 const describeE2E = shouldRun ? describe : describe.skip;
@@ -61,6 +61,12 @@ describeE2E('plan-design-review plan-mode smoke (gate)', () => {
           `--- evidence (last 2KB visible) ---\n${obs.evidence}`,
       );
     }
+    // plan-design-review legitimately short-circuits to plan_ready on no-UI
+    // branches. Allow plan_ready WITHOUT a decisions section ONLY if the
+    // plan file genuinely has no UI scope (we don't have a deterministic way
+    // to check this from the test, so this skill keeps the looser envelope).
+    // Other plan-mode skills require the decisions section under
+    // --disallowedTools; design is the special case.
     expect(['asked', 'plan_ready']).toContain(obs.outcome);
   }, 360_000);
 });
diff --git a/test/skill-e2e-plan-devex-plan-mode.test.ts b/test/skill-e2e-plan-devex-plan-mode.test.ts
index 2ab1b2be..5ecad5aa 100644
--- a/test/skill-e2e-plan-devex-plan-mode.test.ts
+++ b/test/skill-e2e-plan-devex-plan-mode.test.ts
@@ -6,7 +6,7 @@
  */
 
 import { describe, test, expect } from 'bun:test';
-import { runPlanSkillObservation } from './helpers/claude-pty-runner';
+import { runPlanSkillObservation, planFileHasDecisionsSection } from './helpers/claude-pty-runner';
 
 const shouldRun = !!process.env.EVALS && process.env.EVALS_TIER === 'gate';
 const describeE2E = shouldRun ? describe : describe.skip;
@@ -55,6 +55,14 @@ describeE2E('plan-devex-review plan-mode smoke (gate)', () => {
           `--- evidence (last 2KB visible) ---\n${obs.evidence}`,
       );
     }
+    if (obs.outcome === 'plan_ready') {
+      if (!obs.planFile || !planFileHasDecisionsSection(obs.planFile)) {
+        throw new Error(
+          `plan-devex-review AskUserQuestion-blocked regression: plan_ready without a "## Decisions" section in ${obs.planFile ?? '<no plan file detected>'} — Step 0 was silently skipped.\n` +
+            `--- evidence (last 2KB visible) ---\n${obs.evidence}`,
+        );
+      }
+    }
     expect(['asked', 'plan_ready']).toContain(obs.outcome);
   }, 360_000);
 });
diff --git a/test/skill-e2e-plan-eng-plan-mode.test.ts b/test/skill-e2e-plan-eng-plan-mode.test.ts
index 35d6c918..d4a635e2 100644
--- a/test/skill-e2e-plan-eng-plan-mode.test.ts
+++ b/test/skill-e2e-plan-eng-plan-mode.test.ts
@@ -6,7 +6,7 @@
  */
 
 import { describe, test, expect } from 'bun:test';
-import { runPlanSkillObservation } from './helpers/claude-pty-runner';
+import { runPlanSkillObservation, planFileHasDecisionsSection } from './helpers/claude-pty-runner';
 
 const shouldRun = !!process.env.EVALS && process.env.EVALS_TIER === 'gate';
 const describeE2E = shouldRun ? describe : describe.skip;
@@ -55,6 +55,14 @@ describeE2E('plan-eng-review plan-mode smoke (gate)', () => {
           `--- evidence (last 2KB visible) ---\n${obs.evidence}`,
       );
     }
+    if (obs.outcome === 'plan_ready') {
+      if (!obs.planFile || !planFileHasDecisionsSection(obs.planFile)) {
+        throw new Error(
+          `plan-eng-review AskUserQuestion-blocked regression: plan_ready without a "## Decisions" section in ${obs.planFile ?? '<no plan file detected>'} — Step 0 was silently skipped.\n` +
+            `--- evidence (last 2KB visible) ---\n${obs.evidence}`,
+        );
+      }
+    }
     expect(['asked', 'plan_ready']).toContain(obs.outcome);
   }, 360_000);
 });