diff --git a/test/skill-e2e-autoplan-auto-mode.test.ts b/test/skill-e2e-autoplan-auto-mode.test.ts index 0677917bc..4a68bb319 100644 --- a/test/skill-e2e-autoplan-auto-mode.test.ts +++ b/test/skill-e2e-autoplan-auto-mode.test.ts @@ -37,12 +37,17 @@ const shouldRun = !!process.env.EVALS && process.env.EVALS_TIER === 'gate'; const describeE2E = shouldRun ? describe : describe.skip; describeE2E('autoplan AskUserQuestion-blocked smoke (gate)', () => { - // Pass envelope is ['asked', 'plan_ready']: model either renders the - // first non-auto-decided gate (Phase 1 premise confirmation) as numbered - // prose or surfaces it through the plan file + ExitPlanMode flow. + // Pass envelope: model either renders the first non-auto-decided gate + // (Phase 1 premise confirmation) as numbered prose ('asked'), surfaces + // it through the plan-file + ExitPlanMode flow ('plan_ready' with a + // "## Decisions" section [legacy fallback] OR with BLOCKED visible + // [post-v1.28 fix]), or terminates with the BLOCKED string visible + // ('exited' post-fix). + // // Autoplan auto-decides intermediate questions BY DESIGN; the failure // signal we care about is the AUTO_DECIDE preamble firing on a gate it - // shouldn't (caught explicitly via the 'auto_decided' outcome). + // shouldn't (caught explicitly via the 'auto_decided' outcome) or the + // model proceeding silently. 
test('a non-auto-decided gate surfaces when AskUserQuestion is --disallowedTools', async () => { const obs = await runPlanSkillObservation({ skillName: 'autoplan', @@ -51,10 +56,11 @@ describeE2E('autoplan AskUserQuestion-blocked smoke (gate)', () => { timeoutMs: 300_000, }); + const blockedVisible = /BLOCKED\s*[—-]\s*AskUserQuestion/i.test(obs.evidence); + if ( obs.outcome === 'auto_decided' || obs.outcome === 'silent_write' || - obs.outcome === 'exited' || obs.outcome === 'timeout' ) { throw new Error( @@ -64,14 +70,21 @@ describeE2E('autoplan AskUserQuestion-blocked smoke (gate)', () => { `--- evidence (last 2KB visible) ---\n${obs.evidence}`, ); } + if (obs.outcome === 'exited' && !blockedVisible) { + throw new Error( + `autoplan AskUserQuestion-blocked regression: outcome=exited without BLOCKED — AskUserQuestion string in TTY. Model quit silently instead of surfacing the failure mode.\n` + + `--- evidence (last 2KB visible) ---\n${obs.evidence}`, + ); + } if (obs.outcome === 'plan_ready') { - if (!obs.planFile || !planFileHasDecisionsSection(obs.planFile)) { + const decisionsOk = obs.planFile && planFileHasDecisionsSection(obs.planFile); + if (!decisionsOk && !blockedVisible) { throw new Error( - `autoplan AskUserQuestion-blocked regression: plan_ready without a "## Decisions" section in ${obs.planFile ?? ''} — Phase 1 premise gate was silently skipped.\n` + + `autoplan AskUserQuestion-blocked regression: plan_ready without a "## Decisions" section in ${obs.planFile ?? 
''} AND no BLOCKED string in TTY — Phase 1 premise gate was silently skipped.\n` + `--- evidence (last 2KB visible) ---\n${obs.evidence}`, ); } } - expect(['asked', 'plan_ready']).toContain(obs.outcome); + expect(['asked', 'plan_ready', 'exited']).toContain(obs.outcome); }, 360_000); }); diff --git a/test/skill-e2e-plan-ceo-plan-mode.test.ts b/test/skill-e2e-plan-ceo-plan-mode.test.ts index a99084e19..8695aabe8 100644 --- a/test/skill-e2e-plan-ceo-plan-mode.test.ts +++ b/test/skill-e2e-plan-ceo-plan-mode.test.ts @@ -81,19 +81,26 @@ describeE2E('plan-ceo-review plan-mode smoke (gate)', () => { // is removed from the model's tool registry; without fallback guidance // the model can't ask and silently proceeds. // - // The fix (Tool resolution preamble) accepts two surface paths under - // --disallowedTools: - // - 'asked' — model emits a numbered-option prompt as prose (with - // the same D + Pros/cons format as a real AUQ) - // - 'plan_ready' — model writes the question into the plan file as a - // "## Decisions to confirm" section + ExitPlanMode; - // the native plan-mode "Ready to execute?" surfaces - // it through the TTY confirmation + // After v1.28+ (forever-war fix), the preamble fallback that wrote a + // "## Decisions to confirm" section was deleted in favor of a hard + // BLOCKED rule. The pass envelope under --disallowedTools accepts: + // - 'asked' — model emits a numbered-option prompt as prose + // - 'plan_ready' WITH (## Decisions section [legacy] + // OR BLOCKED string visible [post-fix]) + // - 'exited' WITH BLOCKED string visible [post-fix] // - // Both let the user see the decision. Failure signals are - // silent_write/exited/timeout (model never surfaced the question) and - // 'auto_decided' (the AUTO_DECIDE preamble fired without a /plan-tune - // opt-in — caught explicitly). 
+ // The legacy `## Decisions` path stays in the envelope so this test + // keeps passing during the migration window when the fallback delete + // and resolver edits land in the same PR but mid-rebase states are + // possible. Once the deletion has been on main long enough that the + // generated SKILL.md cache has flushed, the legacy branch can be + // removed in a follow-up. + // + // Failure signals (regressions we DO want to catch): + // - 'auto_decided' — AUTO_DECIDE preamble fired without /plan-tune opt-in + // - 'silent_write' — Write/Edit before any AUQ surface + // - 'timeout' — neither asked nor terminated in budget + // - 'plan_ready' with NEITHER a Decisions section NOR BLOCKED; 'exited' WITHOUT BLOCKED test('AskUserQuestion surfaces when --disallowedTools AskUserQuestion is set', async () => { const obs = await runPlanSkillObservation({ skillName: 'plan-ceo-review', @@ -102,10 +109,11 @@ describeE2E('plan-ceo-review plan-mode smoke (gate)', () => { timeoutMs: 300_000, }); + const blockedVisible = /BLOCKED\s*[—-]\s*AskUserQuestion/i.test(obs.evidence); + if ( obs.outcome === 'auto_decided' || obs.outcome === 'silent_write' || - obs.outcome === 'exited' || obs.outcome === 'timeout' ) { throw new Error( @@ -115,25 +123,35 @@ describeE2E('plan-ceo-review plan-mode smoke (gate)', () => { `--- evidence (last 2KB visible) ---\n${obs.evidence}`, ); } - // plan_ready under --disallowedTools is only a pass when the model used - // the plan-file fallback (wrote a `## Decisions to confirm` section). - // Without that section, plan_ready means the model silently skipped Step 0 - // and went straight to ExitPlanMode — the regression we're catching. - if (obs.outcome === 'plan_ready') { - if (!obs.planFile) { + // 'exited' is acceptable ONLY when BLOCKED string is visible (post-fix + // path). Without BLOCKED, exited means the model crashed or quit silently. 
+ if (obs.outcome === 'exited') { + if (!blockedVisible) { throw new Error( - `plan-ceo-review AskUserQuestion-blocked regression: outcome=plan_ready but no plan file path detected in TTY output. Cannot verify the model used the fallback flow.\n` + - `--- evidence (last 2KB visible) ---\n${obs.evidence}`, - ); - } - if (!planFileHasDecisionsSection(obs.planFile)) { - throw new Error( - `plan-ceo-review AskUserQuestion-blocked regression: model wrote ${obs.planFile} without a "## Decisions" section. Step 0 was silently skipped.\n` + + `plan-ceo-review AskUserQuestion-blocked regression: outcome=exited without BLOCKED — AskUserQuestion string in TTY. Model quit silently instead of surfacing the failure mode.\n` + `--- evidence (last 2KB visible) ---\n${obs.evidence}`, ); } } - expect(['asked', 'plan_ready']).toContain(obs.outcome); + // 'plan_ready' is acceptable when EITHER (legacy) the model wrote a + // "## Decisions to confirm" section OR (post-fix) BLOCKED is visible + // in the TTY. Neither = silent ExitPlanMode = the regression we catch. + if (obs.outcome === 'plan_ready') { + if (!obs.planFile) { + if (!blockedVisible) { + throw new Error( + `plan-ceo-review AskUserQuestion-blocked regression: outcome=plan_ready but no plan file path detected and no BLOCKED string in TTY. Cannot verify the model used either the legacy fallback or the post-fix BLOCKED path.\n` + + `--- evidence (last 2KB visible) ---\n${obs.evidence}`, + ); + } + } else if (!planFileHasDecisionsSection(obs.planFile) && !blockedVisible) { + throw new Error( + `plan-ceo-review AskUserQuestion-blocked regression: model wrote ${obs.planFile} without a "## Decisions" section AND no BLOCKED string in TTY. 
Step 0 was silently skipped.\n` + + `--- evidence (last 2KB visible) ---\n${obs.evidence}`, + ); + } + } + expect(['asked', 'plan_ready', 'exited']).toContain(obs.outcome); assertReportAtBottomIfPlanWritten(obs); }, 360_000); }); diff --git a/test/skill-e2e-plan-design-plan-mode.test.ts b/test/skill-e2e-plan-design-plan-mode.test.ts index da3d591ad..ddf9217ce 100644 --- a/test/skill-e2e-plan-design-plan-mode.test.ts +++ b/test/skill-e2e-plan-design-plan-mode.test.ts @@ -40,10 +40,17 @@ describeE2E('plan-design-review plan-mode smoke (gate)', () => { // v1.21+ regression: see skill-e2e-plan-ceo-plan-mode.test.ts for the // contract. plan-design-review legitimately short-circuits on no-UI-scope - // branches, so this case keeps the same ['asked', 'plan_ready'] envelope - // as the baseline. The discriminating regression signals are - // 'auto_decided' (AUTO_DECIDE preamble fired upstream) or any failure - // outcome — both mean the user never saw a question they should have. + // branches, so this case has historically used a looser envelope. + // + // Post-v1.28 (forever-war fix), 'exited' is acceptable when BLOCKED is + // visible in the TTY (model correctly recognized the AUQ-unavailable + // failure mode and stopped). The legacy 'plan_ready' (with or without + // decisions section) and 'asked' paths remain valid pass outcomes. + // + // The discriminating regression signals are 'auto_decided' (AUTO_DECIDE + // preamble fired upstream), 'silent_write', 'timeout', or 'exited' + // without BLOCKED visible — all mean the user never saw a question they + // should have. 
test('does not silently auto-decide when --disallowedTools AskUserQuestion is set', async () => { const obs = await runPlanSkillObservation({ skillName: 'plan-design-review', @@ -52,10 +59,11 @@ describeE2E('plan-design-review plan-mode smoke (gate)', () => { timeoutMs: 300_000, }); + const blockedVisible = /BLOCKED\s*[—-]\s*AskUserQuestion/i.test(obs.evidence); + if ( obs.outcome === 'auto_decided' || obs.outcome === 'silent_write' || - obs.outcome === 'exited' || obs.outcome === 'timeout' ) { throw new Error( @@ -65,13 +73,13 @@ describeE2E('plan-design-review plan-mode smoke (gate)', () => { `--- evidence (last 2KB visible) ---\n${obs.evidence}`, ); } - // plan-design-review legitimately short-circuits to plan_ready on no-UI - // branches. Allow plan_ready WITHOUT a decisions section ONLY if the - // plan file genuinely has no UI scope (we don't have a deterministic way - // to check this from the test, so this skill keeps the looser envelope). - // Other plan-mode skills require the decisions section under - // --disallowedTools; design is the special case. - expect(['asked', 'plan_ready']).toContain(obs.outcome); + if (obs.outcome === 'exited' && !blockedVisible) { + throw new Error( + `plan-design-review AskUserQuestion-blocked regression: outcome=exited without BLOCKED — AskUserQuestion string in TTY. Model quit silently instead of surfacing the failure mode.\n` + + `--- evidence (last 2KB visible) ---\n${obs.evidence}`, + ); + } + expect(['asked', 'plan_ready', 'exited']).toContain(obs.outcome); assertReportAtBottomIfPlanWritten(obs); }, 360_000); });