mirror of
https://github.com/garrytan/gstack.git
synced 2026-05-02 03:35:09 +02:00
test(harness): require ## Decisions section under --disallowedTools plan_ready
Adversarial review (during /ship Step 11) found that the previous gate-test envelope ['asked', 'plan_ready'] for the AskUserQuestion-blocked regression cases accepted the bug they exist to catch: a model that silently skips Step 0 entirely (writes a plan with no questions, no `## Decisions to confirm` section, just ExitPlanModes) reaches plan_ready and passes. The fix tightens the contract in two layers: 1. Harness: PlanSkillObservation gains a `planFile?: string` field populated when outcome is plan_ready. extractPlanFilePath() walks the visible TTY buffer for "Plan saved to:", "Plan file:", or ".claude/plans/<name>.md" patterns and resolves tilde to absolute. planFileHasDecisionsSection() reads the resolved file and returns true if it contains a `## Decisions` heading (any form: "to confirm", "needed", etc.). 2. Tests: 5 of 6 regression cases now require, when outcome is plan_ready, that obs.planFile is set AND planFileHasDecisionsSection returns true. Otherwise the test fails with a "Step 0 was silently skipped" diagnosis. plan-design-review remains the sole exception — it legitimately short-circuits to plan_ready on no-UI-scope branches and we have no deterministic way to distinguish that from a silent skip. This closes the loophole the adversarial review identified. The fix preamble flow already tells the model to write `## Decisions to confirm` when neither AUQ variant is callable — now the test verifies the model actually did it. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -163,6 +163,73 @@ export function isAutoDecidedVisible(visible: string): boolean {
|
||||
return /\(yourpreference\)/i.test(visible.replace(/\s+/g, ''));
|
||||
}
|
||||
|
||||
/**
|
||||
* Extract the plan file path from rendered TTY output. Plan-mode's native
|
||||
* confirmation includes one of these formats near the "Ready to execute?"
|
||||
* prompt:
|
||||
* - `Plan saved to: /path/to/plan.md`
|
||||
* - `Plan file: /path/to/plan.md`
|
||||
* - `ctrl-g to edit in VSCode · ~/.claude/plans/<name>.md`
|
||||
*
|
||||
* stripAnsi may collapse whitespace via cursor-positioning escape removal,
|
||||
* so the regex tolerates variable spacing. Returns the resolved absolute
|
||||
* path with `~` expanded, or null if no path was rendered.
|
||||
*
|
||||
* Used by v1.22 AskUserQuestion-blocked regression tests to read the plan
|
||||
* file post-`plan_ready` and verify it contains a decisions section, which
|
||||
* distinguishes the legitimate fallback flow ("write decision brief into
|
||||
* plan file") from the silent-skip regression ("write a plan that didn't
|
||||
* surface any decisions").
|
||||
*/
|
||||
export function extractPlanFilePath(visible: string): string | null {
|
||||
// Patterns checked in order of specificity. Each captures the .md path.
|
||||
// The visible buffer may have stripAnsi-collapsed whitespace, so we
|
||||
// accept space-or-not separators in the prompts and accept paths
|
||||
// without intervening whitespace (e.g. `editinVSCode·~/.claude/...`).
|
||||
const patterns: RegExp[] = [
|
||||
/Plan\s*saved\s*to\s*:?\s*(\S+\.md)/i,
|
||||
/Plan\s*file\s*:?\s*(\S+\.md)/i,
|
||||
/·\s*(\S+\.claude\/plans\/\S+\.md)/i,
|
||||
// Fallback: any reference to a .claude/plans path in the buffer.
|
||||
/(~?\/?\S*\.claude\/plans\/[\w-]+\.md)/i,
|
||||
];
|
||||
for (const p of patterns) {
|
||||
const m = visible.match(p);
|
||||
if (m && m[1]) {
|
||||
let raw = m[1];
|
||||
// Some patterns capture trailing punctuation; strip a trailing dot.
|
||||
raw = raw.replace(/\.+$/, '.md').replace(/\.md\.+$/, '.md');
|
||||
// Tilde expansion to absolute path.
|
||||
if (raw.startsWith('~')) {
|
||||
const home = process.env.HOME ?? '';
|
||||
raw = home + raw.slice(1);
|
||||
}
|
||||
return raw;
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Read a plan file written by a plan-mode skill and verify it contains a
|
||||
* "decisions" section — evidence the skill surfaced the decisions it was
|
||||
* supposed to gate on, even when AskUserQuestion is --disallowedTools and
|
||||
* the model used the plan-file fallback flow instead of a numbered prompt.
|
||||
*
|
||||
* Accepts any `## Decisions ...` heading (the canonical form from the
|
||||
* preamble is `## Decisions to confirm`, but small variants like
|
||||
* `## Decisions needed` or `## Decisions for review` are common). Returns
|
||||
* false if the file is unreadable, missing, or has no decisions section.
|
||||
*/
|
||||
export function planFileHasDecisionsSection(planFile: string): boolean {
|
||||
try {
|
||||
const content = fs.readFileSync(planFile, 'utf-8');
|
||||
return /^##\s+Decisions\b/im.test(content);
|
||||
} catch {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Recent-tail window (in bytes of stripped TTY text) used when classifying
|
||||
* permission dialogs. Old permission text persists in the visibleSince buffer
|
||||
@@ -963,6 +1030,15 @@ export interface PlanSkillObservation {
|
||||
evidence: string;
|
||||
/** Wall time (ms) until the outcome was decided. */
|
||||
elapsedMs: number;
|
||||
/**
|
||||
* Path to the plan file the skill wrote (if outcome is 'plan_ready').
|
||||
* Extracted from the visible TTY via {@link extractPlanFilePath}. Lets the
|
||||
* v1.22 AskUserQuestion-blocked regression tests verify the plan file
|
||||
* contains a `## Decisions to confirm` section under --disallowedTools —
|
||||
* a model that silently skips Step 0 reaches plan_ready WITHOUT writing
|
||||
* the section, and that's the regression we want to catch.
|
||||
*/
|
||||
planFile?: string;
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -1046,11 +1122,19 @@ export async function runPlanSkillObservation(opts: {
|
||||
}
|
||||
const classified = classifyVisible(visible);
|
||||
if (classified) {
|
||||
return {
|
||||
const obs: PlanSkillObservation = {
|
||||
...classified,
|
||||
evidence: visible.slice(-2000),
|
||||
elapsedMs: Date.now() - startedAt,
|
||||
};
|
||||
// For plan_ready outcomes, capture the plan file path from the full
|
||||
// visible buffer — tests under --disallowedTools verify the file's
|
||||
// contents to distinguish legitimate fallback flow from silent-skip.
|
||||
if (classified.outcome === 'plan_ready') {
|
||||
const planFile = extractPlanFilePath(visible);
|
||||
if (planFile) obs.planFile = planFile;
|
||||
}
|
||||
return obs;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -21,7 +21,7 @@
|
||||
*/
|
||||
|
||||
import { describe, test, expect } from 'bun:test';
|
||||
import { runPlanSkillObservation } from './helpers/claude-pty-runner';
|
||||
import { runPlanSkillObservation, planFileHasDecisionsSection } from './helpers/claude-pty-runner';
|
||||
|
||||
const shouldRun = !!process.env.EVALS && process.env.EVALS_TIER === 'gate';
|
||||
const describeE2E = shouldRun ? describe : describe.skip;
|
||||
@@ -54,6 +54,14 @@ describeE2E('autoplan AskUserQuestion-blocked smoke (gate)', () => {
|
||||
`--- evidence (last 2KB visible) ---\n${obs.evidence}`,
|
||||
);
|
||||
}
|
||||
if (obs.outcome === 'plan_ready') {
|
||||
if (!obs.planFile || !planFileHasDecisionsSection(obs.planFile)) {
|
||||
throw new Error(
|
||||
`autoplan AskUserQuestion-blocked regression: plan_ready without a "## Decisions" section in ${obs.planFile ?? '<no plan file detected>'} — Phase 1 premise gate was silently skipped.\n` +
|
||||
`--- evidence (last 2KB visible) ---\n${obs.evidence}`,
|
||||
);
|
||||
}
|
||||
}
|
||||
expect(['asked', 'plan_ready']).toContain(obs.outcome);
|
||||
}, 360_000);
|
||||
});
|
||||
|
||||
@@ -17,7 +17,7 @@
|
||||
*/
|
||||
|
||||
import { describe, test, expect } from 'bun:test';
|
||||
import { runPlanSkillObservation } from './helpers/claude-pty-runner';
|
||||
import { runPlanSkillObservation, planFileHasDecisionsSection } from './helpers/claude-pty-runner';
|
||||
|
||||
const shouldRun = !!process.env.EVALS && process.env.EVALS_TIER === 'gate';
|
||||
const describeE2E = shouldRun ? describe : describe.skip;
|
||||
@@ -46,6 +46,14 @@ describeE2E('office-hours AskUserQuestion-blocked smoke (gate)', () => {
|
||||
`--- evidence (last 2KB visible) ---\n${obs.evidence}`,
|
||||
);
|
||||
}
|
||||
if (obs.outcome === 'plan_ready') {
|
||||
if (!obs.planFile || !planFileHasDecisionsSection(obs.planFile)) {
|
||||
throw new Error(
|
||||
`office-hours AskUserQuestion-blocked regression: plan_ready without a "## Decisions" section in ${obs.planFile ?? '<no plan file detected>'} — startup-vs-builder mode question was silently skipped.\n` +
|
||||
`--- evidence (last 2KB visible) ---\n${obs.evidence}`,
|
||||
);
|
||||
}
|
||||
}
|
||||
expect(['asked', 'plan_ready']).toContain(obs.outcome);
|
||||
}, 360_000);
|
||||
});
|
||||
|
||||
@@ -34,7 +34,7 @@
|
||||
*/
|
||||
|
||||
import { describe, test, expect } from 'bun:test';
|
||||
import { runPlanSkillObservation } from './helpers/claude-pty-runner';
|
||||
import { runPlanSkillObservation, planFileHasDecisionsSection } from './helpers/claude-pty-runner';
|
||||
|
||||
const shouldRun = !!process.env.EVALS && process.env.EVALS_TIER === 'gate';
|
||||
const describeE2E = shouldRun ? describe : describe.skip;
|
||||
@@ -110,6 +110,24 @@ describeE2E('plan-ceo-review plan-mode smoke (gate)', () => {
|
||||
`--- evidence (last 2KB visible) ---\n${obs.evidence}`,
|
||||
);
|
||||
}
|
||||
// plan_ready under --disallowedTools is only a pass when the model used
|
||||
// the plan-file fallback (wrote a `## Decisions to confirm` section).
|
||||
// Without that section, plan_ready means the model silently skipped Step 0
|
||||
// and went straight to ExitPlanMode — the regression we're catching.
|
||||
if (obs.outcome === 'plan_ready') {
|
||||
if (!obs.planFile) {
|
||||
throw new Error(
|
||||
`plan-ceo-review AskUserQuestion-blocked regression: outcome=plan_ready but no plan file path detected in TTY output. Cannot verify the model used the fallback flow.\n` +
|
||||
`--- evidence (last 2KB visible) ---\n${obs.evidence}`,
|
||||
);
|
||||
}
|
||||
if (!planFileHasDecisionsSection(obs.planFile)) {
|
||||
throw new Error(
|
||||
`plan-ceo-review AskUserQuestion-blocked regression: model wrote ${obs.planFile} without a "## Decisions" section. Step 0 was silently skipped.\n` +
|
||||
`--- evidence (last 2KB visible) ---\n${obs.evidence}`,
|
||||
);
|
||||
}
|
||||
}
|
||||
expect(['asked', 'plan_ready']).toContain(obs.outcome);
|
||||
}, 360_000);
|
||||
});
|
||||
|
||||
@@ -10,7 +10,7 @@
|
||||
*/
|
||||
|
||||
import { describe, test, expect } from 'bun:test';
|
||||
import { runPlanSkillObservation } from './helpers/claude-pty-runner';
|
||||
import { runPlanSkillObservation, planFileHasDecisionsSection } from './helpers/claude-pty-runner';
|
||||
|
||||
const shouldRun = !!process.env.EVALS && process.env.EVALS_TIER === 'gate';
|
||||
const describeE2E = shouldRun ? describe : describe.skip;
|
||||
@@ -61,6 +61,12 @@ describeE2E('plan-design-review plan-mode smoke (gate)', () => {
|
||||
`--- evidence (last 2KB visible) ---\n${obs.evidence}`,
|
||||
);
|
||||
}
|
||||
// plan-design-review legitimately short-circuits to plan_ready on no-UI
|
||||
// branches. Allow plan_ready WITHOUT a decisions section ONLY if the
|
||||
// plan file genuinely has no UI scope (we don't have a deterministic way
|
||||
// to check this from the test, so this skill keeps the looser envelope).
|
||||
// Other plan-mode skills require the decisions section under
|
||||
// --disallowedTools; design is the special case.
|
||||
expect(['asked', 'plan_ready']).toContain(obs.outcome);
|
||||
}, 360_000);
|
||||
});
|
||||
|
||||
@@ -6,7 +6,7 @@
|
||||
*/
|
||||
|
||||
import { describe, test, expect } from 'bun:test';
|
||||
import { runPlanSkillObservation } from './helpers/claude-pty-runner';
|
||||
import { runPlanSkillObservation, planFileHasDecisionsSection } from './helpers/claude-pty-runner';
|
||||
|
||||
const shouldRun = !!process.env.EVALS && process.env.EVALS_TIER === 'gate';
|
||||
const describeE2E = shouldRun ? describe : describe.skip;
|
||||
@@ -55,6 +55,14 @@ describeE2E('plan-devex-review plan-mode smoke (gate)', () => {
|
||||
`--- evidence (last 2KB visible) ---\n${obs.evidence}`,
|
||||
);
|
||||
}
|
||||
if (obs.outcome === 'plan_ready') {
|
||||
if (!obs.planFile || !planFileHasDecisionsSection(obs.planFile)) {
|
||||
throw new Error(
|
||||
`plan-devex-review AskUserQuestion-blocked regression: plan_ready without a "## Decisions" section in ${obs.planFile ?? '<no plan file detected>'} — Step 0 was silently skipped.\n` +
|
||||
`--- evidence (last 2KB visible) ---\n${obs.evidence}`,
|
||||
);
|
||||
}
|
||||
}
|
||||
expect(['asked', 'plan_ready']).toContain(obs.outcome);
|
||||
}, 360_000);
|
||||
});
|
||||
|
||||
@@ -6,7 +6,7 @@
|
||||
*/
|
||||
|
||||
import { describe, test, expect } from 'bun:test';
|
||||
import { runPlanSkillObservation } from './helpers/claude-pty-runner';
|
||||
import { runPlanSkillObservation, planFileHasDecisionsSection } from './helpers/claude-pty-runner';
|
||||
|
||||
const shouldRun = !!process.env.EVALS && process.env.EVALS_TIER === 'gate';
|
||||
const describeE2E = shouldRun ? describe : describe.skip;
|
||||
@@ -55,6 +55,14 @@ describeE2E('plan-eng-review plan-mode smoke (gate)', () => {
|
||||
`--- evidence (last 2KB visible) ---\n${obs.evidence}`,
|
||||
);
|
||||
}
|
||||
if (obs.outcome === 'plan_ready') {
|
||||
if (!obs.planFile || !planFileHasDecisionsSection(obs.planFile)) {
|
||||
throw new Error(
|
||||
`plan-eng-review AskUserQuestion-blocked regression: plan_ready without a "## Decisions" section in ${obs.planFile ?? '<no plan file detected>'} — Step 0 was silently skipped.\n` +
|
||||
`--- evidence (last 2KB visible) ---\n${obs.evidence}`,
|
||||
);
|
||||
}
|
||||
}
|
||||
expect(['asked', 'plan_ready']).toContain(obs.outcome);
|
||||
}, 360_000);
|
||||
});
|
||||
|
||||
Reference in New Issue
Block a user