mirror of
https://github.com/garrytan/gstack.git
synced 2026-06-17 15:20:11 +02:00
test: expand plan-mode pass envelopes to accept BLOCKED path
Three existing plan-mode regression tests previously codified the preamble fallback as a valid PASS path under --disallowedTools AskUserQuestion: outcome=plan_ready was accepted only when the model wrote a "## Decisions to confirm" section. The forever-war fix deletes that fallback, so this assertion would fail post-deletion.

The expanded envelope accepts EITHER:
- 'plan_ready' WITH (## Decisions section [legacy] OR BLOCKED string visible in TTY [post-fix])
- 'exited' WITH BLOCKED string visible in TTY [post-fix]

The legacy ## Decisions branch stays in the envelope so these tests keep passing on today's code (where the fallback still exists) and on tomorrow's code (where the model reports BLOCKED instead). Once the deletion has been on main long enough that the cache flushes, the legacy branch can be removed in a follow-up.

Failure signals (regressions we DO want to catch) are unchanged: auto_decided / silent_write / timeout / exited-without-BLOCKED / plan_ready-without-(decisions OR BLOCKED).

Affected tests:
- test/skill-e2e-plan-ceo-plan-mode.test.ts (test 2 only)
- test/skill-e2e-autoplan-auto-mode.test.ts
- test/skill-e2e-plan-design-plan-mode.test.ts

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -37,12 +37,17 @@ const shouldRun = !!process.env.EVALS && process.env.EVALS_TIER === 'gate';
|
||||
const describeE2E = shouldRun ? describe : describe.skip;
|
||||
|
||||
describeE2E('autoplan AskUserQuestion-blocked smoke (gate)', () => {
|
||||
// Pass envelope is ['asked', 'plan_ready']: model either renders the
|
||||
// first non-auto-decided gate (Phase 1 premise confirmation) as numbered
|
||||
// prose or surfaces it through the plan file + ExitPlanMode flow.
|
||||
// Pass envelope: model either renders the first non-auto-decided gate
|
||||
// (Phase 1 premise confirmation) as numbered prose ('asked'), surfaces
|
||||
// it through the plan-file + ExitPlanMode flow ('plan_ready' with a
|
||||
// "## Decisions" section [legacy fallback] OR with BLOCKED visible
|
||||
// [post-v1.28 fix]), or terminates with the BLOCKED string visible
|
||||
// ('exited' post-fix).
|
||||
//
|
||||
// Autoplan auto-decides intermediate questions BY DESIGN; the failure
|
||||
// signal we care about is the AUTO_DECIDE preamble firing on a gate it
|
||||
// shouldn't (caught explicitly via the 'auto_decided' outcome).
|
||||
// shouldn't (caught explicitly via the 'auto_decided' outcome) or the
|
||||
// model proceeding silently.
|
||||
test('a non-auto-decided gate surfaces when AskUserQuestion is --disallowedTools', async () => {
|
||||
const obs = await runPlanSkillObservation({
|
||||
skillName: 'autoplan',
|
||||
@@ -51,10 +56,11 @@ describeE2E('autoplan AskUserQuestion-blocked smoke (gate)', () => {
|
||||
timeoutMs: 300_000,
|
||||
});
|
||||
|
||||
const blockedVisible = /BLOCKED\s*[—-]\s*AskUserQuestion/i.test(obs.evidence);
|
||||
|
||||
if (
|
||||
obs.outcome === 'auto_decided' ||
|
||||
obs.outcome === 'silent_write' ||
|
||||
obs.outcome === 'exited' ||
|
||||
obs.outcome === 'timeout'
|
||||
) {
|
||||
throw new Error(
|
||||
@@ -64,14 +70,21 @@ describeE2E('autoplan AskUserQuestion-blocked smoke (gate)', () => {
|
||||
`--- evidence (last 2KB visible) ---\n${obs.evidence}`,
|
||||
);
|
||||
}
|
||||
if (obs.outcome === 'exited' && !blockedVisible) {
|
||||
throw new Error(
|
||||
`autoplan AskUserQuestion-blocked regression: outcome=exited without BLOCKED — AskUserQuestion string in TTY. Model quit silently instead of surfacing the failure mode.\n` +
|
||||
`--- evidence (last 2KB visible) ---\n${obs.evidence}`,
|
||||
);
|
||||
}
|
||||
if (obs.outcome === 'plan_ready') {
|
||||
if (!obs.planFile || !planFileHasDecisionsSection(obs.planFile)) {
|
||||
const decisionsOk = obs.planFile && planFileHasDecisionsSection(obs.planFile);
|
||||
if (!decisionsOk && !blockedVisible) {
|
||||
throw new Error(
|
||||
`autoplan AskUserQuestion-blocked regression: plan_ready without a "## Decisions" section in ${obs.planFile ?? '<no plan file detected>'} — Phase 1 premise gate was silently skipped.\n` +
|
||||
`autoplan AskUserQuestion-blocked regression: plan_ready without a "## Decisions" section in ${obs.planFile ?? '<no plan file detected>'} AND no BLOCKED string in TTY — Phase 1 premise gate was silently skipped.\n` +
|
||||
`--- evidence (last 2KB visible) ---\n${obs.evidence}`,
|
||||
);
|
||||
}
|
||||
}
|
||||
expect(['asked', 'plan_ready']).toContain(obs.outcome);
|
||||
expect(['asked', 'plan_ready', 'exited']).toContain(obs.outcome);
|
||||
}, 360_000);
|
||||
});
|
||||
|
||||
@@ -81,19 +81,26 @@ describeE2E('plan-ceo-review plan-mode smoke (gate)', () => {
|
||||
// is removed from the model's tool registry; without fallback guidance
|
||||
// the model can't ask and silently proceeds.
|
||||
//
|
||||
// The fix (Tool resolution preamble) accepts two surface paths under
|
||||
// --disallowedTools:
|
||||
// - 'asked' — model emits a numbered-option prompt as prose (with
|
||||
// the same D<N> + Pros/cons format as a real AUQ)
|
||||
// - 'plan_ready' — model writes the question into the plan file as a
|
||||
// "## Decisions to confirm" section + ExitPlanMode;
|
||||
// the native plan-mode "Ready to execute?" surfaces
|
||||
// it through the TTY confirmation
|
||||
// After v1.28+ (forever-war fix), the preamble fallback that wrote a
|
||||
// "## Decisions to confirm" section was deleted in favor of a hard
|
||||
// BLOCKED rule. The pass envelope under --disallowedTools accepts:
|
||||
// - 'asked' — model emits a numbered-option prompt as prose
|
||||
// - 'plan_ready' WITH (## Decisions section [legacy]
|
||||
// OR BLOCKED string visible [post-fix])
|
||||
// - 'exited' WITH BLOCKED string visible [post-fix]
|
||||
//
|
||||
// Both let the user see the decision. Failure signals are
|
||||
// silent_write/exited/timeout (model never surfaced the question) and
|
||||
// 'auto_decided' (the AUTO_DECIDE preamble fired without a /plan-tune
|
||||
// opt-in — caught explicitly).
|
||||
// The legacy `## Decisions` path stays in the envelope so this test
|
||||
// keeps passing during the migration window: the fallback deletion
|
||||
// and resolver edits land in the same PR, but mid-rebase states are
|
||||
// possible. Once the deletion has been on main long enough that the
|
||||
// generated SKILL.md cache has flushed, the legacy branch can be
|
||||
// removed in a follow-up.
|
||||
//
|
||||
// Failure signals (regression we DO want to catch):
|
||||
// - 'auto_decided' — AUTO_DECIDE preamble fired without /plan-tune opt-in
|
||||
// - 'silent_write' — Write/Edit before any AUQ surface
|
||||
// - 'timeout' — neither asked nor terminated in budget
|
||||
// - 'plan_ready' WITHOUT either a Decisions section or BLOCKED; 'exited' WITHOUT BLOCKED
|
||||
test('AskUserQuestion surfaces when --disallowedTools AskUserQuestion is set', async () => {
|
||||
const obs = await runPlanSkillObservation({
|
||||
skillName: 'plan-ceo-review',
|
||||
@@ -102,10 +109,11 @@ describeE2E('plan-ceo-review plan-mode smoke (gate)', () => {
|
||||
timeoutMs: 300_000,
|
||||
});
|
||||
|
||||
const blockedVisible = /BLOCKED\s*[—-]\s*AskUserQuestion/i.test(obs.evidence);
|
||||
|
||||
if (
|
||||
obs.outcome === 'auto_decided' ||
|
||||
obs.outcome === 'silent_write' ||
|
||||
obs.outcome === 'exited' ||
|
||||
obs.outcome === 'timeout'
|
||||
) {
|
||||
throw new Error(
|
||||
@@ -115,25 +123,35 @@ describeE2E('plan-ceo-review plan-mode smoke (gate)', () => {
|
||||
`--- evidence (last 2KB visible) ---\n${obs.evidence}`,
|
||||
);
|
||||
}
|
||||
// plan_ready under --disallowedTools is only a pass when the model used
|
||||
// the plan-file fallback (wrote a `## Decisions to confirm` section).
|
||||
// Without that section, plan_ready means the model silently skipped Step 0
|
||||
// and went straight to ExitPlanMode — the regression we're catching.
|
||||
if (obs.outcome === 'plan_ready') {
|
||||
if (!obs.planFile) {
|
||||
// 'exited' is acceptable ONLY when BLOCKED string is visible (post-fix
|
||||
// path). Without BLOCKED, exited means the model crashed or quit silently.
|
||||
if (obs.outcome === 'exited') {
|
||||
if (!blockedVisible) {
|
||||
throw new Error(
|
||||
`plan-ceo-review AskUserQuestion-blocked regression: outcome=plan_ready but no plan file path detected in TTY output. Cannot verify the model used the fallback flow.\n` +
|
||||
`--- evidence (last 2KB visible) ---\n${obs.evidence}`,
|
||||
);
|
||||
}
|
||||
if (!planFileHasDecisionsSection(obs.planFile)) {
|
||||
throw new Error(
|
||||
`plan-ceo-review AskUserQuestion-blocked regression: model wrote ${obs.planFile} without a "## Decisions" section. Step 0 was silently skipped.\n` +
|
||||
`plan-ceo-review AskUserQuestion-blocked regression: outcome=exited without BLOCKED — AskUserQuestion string in TTY. Model quit silently instead of surfacing the failure mode.\n` +
|
||||
`--- evidence (last 2KB visible) ---\n${obs.evidence}`,
|
||||
);
|
||||
}
|
||||
}
|
||||
expect(['asked', 'plan_ready']).toContain(obs.outcome);
|
||||
// 'plan_ready' is acceptable when EITHER (legacy) the model wrote a
|
||||
// "## Decisions to confirm" section OR (post-fix) BLOCKED is visible
|
||||
// in the TTY. Neither = silent ExitPlanMode = the regression we catch.
|
||||
if (obs.outcome === 'plan_ready') {
|
||||
if (!obs.planFile) {
|
||||
if (!blockedVisible) {
|
||||
throw new Error(
|
||||
`plan-ceo-review AskUserQuestion-blocked regression: outcome=plan_ready but no plan file path detected and no BLOCKED string in TTY. Cannot verify the model used either the legacy fallback or the post-fix BLOCKED path.\n` +
|
||||
`--- evidence (last 2KB visible) ---\n${obs.evidence}`,
|
||||
);
|
||||
}
|
||||
} else if (!planFileHasDecisionsSection(obs.planFile) && !blockedVisible) {
|
||||
throw new Error(
|
||||
`plan-ceo-review AskUserQuestion-blocked regression: model wrote ${obs.planFile} without a "## Decisions" section AND no BLOCKED string in TTY. Step 0 was silently skipped.\n` +
|
||||
`--- evidence (last 2KB visible) ---\n${obs.evidence}`,
|
||||
);
|
||||
}
|
||||
}
|
||||
expect(['asked', 'plan_ready', 'exited']).toContain(obs.outcome);
|
||||
assertReportAtBottomIfPlanWritten(obs);
|
||||
}, 360_000);
|
||||
});
|
||||
|
||||
@@ -40,10 +40,17 @@ describeE2E('plan-design-review plan-mode smoke (gate)', () => {
|
||||
|
||||
// v1.21+ regression: see skill-e2e-plan-ceo-plan-mode.test.ts for the
|
||||
// contract. plan-design-review legitimately short-circuits on no-UI-scope
|
||||
// branches, so this case keeps the same ['asked', 'plan_ready'] envelope
|
||||
// as the baseline. The discriminating regression signals are
|
||||
// 'auto_decided' (AUTO_DECIDE preamble fired upstream) or any failure
|
||||
// outcome — both mean the user never saw a question they should have.
|
||||
// branches, so this case has historically used a looser envelope.
|
||||
//
|
||||
// Post-v1.28 (forever-war fix), 'exited' is acceptable when BLOCKED is
|
||||
// visible in the TTY (model correctly recognized the AUQ-unavailable
|
||||
// failure mode and stopped). The legacy 'plan_ready' (with or without
|
||||
// decisions section) and 'asked' paths remain valid pass outcomes.
|
||||
//
|
||||
// The discriminating regression signals are 'auto_decided' (AUTO_DECIDE
|
||||
// preamble fired upstream), 'silent_write', 'timeout', or 'exited'
|
||||
// without BLOCKED visible — all mean the user never saw a question they
|
||||
// should have.
|
||||
test('does not silently auto-decide when --disallowedTools AskUserQuestion is set', async () => {
|
||||
const obs = await runPlanSkillObservation({
|
||||
skillName: 'plan-design-review',
|
||||
@@ -52,10 +59,11 @@ describeE2E('plan-design-review plan-mode smoke (gate)', () => {
|
||||
timeoutMs: 300_000,
|
||||
});
|
||||
|
||||
const blockedVisible = /BLOCKED\s*[—-]\s*AskUserQuestion/i.test(obs.evidence);
|
||||
|
||||
if (
|
||||
obs.outcome === 'auto_decided' ||
|
||||
obs.outcome === 'silent_write' ||
|
||||
obs.outcome === 'exited' ||
|
||||
obs.outcome === 'timeout'
|
||||
) {
|
||||
throw new Error(
|
||||
@@ -65,13 +73,13 @@ describeE2E('plan-design-review plan-mode smoke (gate)', () => {
|
||||
`--- evidence (last 2KB visible) ---\n${obs.evidence}`,
|
||||
);
|
||||
}
|
||||
// plan-design-review legitimately short-circuits to plan_ready on no-UI
|
||||
// branches. Allow plan_ready WITHOUT a decisions section ONLY if the
|
||||
// plan file genuinely has no UI scope (we don't have a deterministic way
|
||||
// to check this from the test, so this skill keeps the looser envelope).
|
||||
// Other plan-mode skills require the decisions section under
|
||||
// --disallowedTools; design is the special case.
|
||||
expect(['asked', 'plan_ready']).toContain(obs.outcome);
|
||||
if (obs.outcome === 'exited' && !blockedVisible) {
|
||||
throw new Error(
|
||||
`plan-design-review AskUserQuestion-blocked regression: outcome=exited without BLOCKED — AskUserQuestion string in TTY. Model quit silently instead of surfacing the failure mode.\n` +
|
||||
`--- evidence (last 2KB visible) ---\n${obs.evidence}`,
|
||||
);
|
||||
}
|
||||
expect(['asked', 'plan_ready', 'exited']).toContain(obs.outcome);
|
||||
assertReportAtBottomIfPlanWritten(obs);
|
||||
}, 360_000);
|
||||
});
|
||||
|
||||
Reference in New Issue
Block a user