mirror of
https://github.com/garrytan/gstack.git
synced 2026-06-17 15:20:11 +02:00
test: delete --disallowedTools AskUserQuestion-blocked test variants
These tests simulated a fictional environment that doesn't exist in production. Real Conductor sessions launch claude with `--disallowedTools AskUserQuestion` AND register `mcp__conductor__AskUserQuestion` — the model has the MCP variant. But the tests passed `--disallowedTools` without standing up any MCP server, so they tested "model behavior with NO AUQ available," which no real user state produces.

Combined with bare `/plan-ceo-review` invocation (no follow-up content), this forced the model into a 5+ minute deliberation loop trying to prose-render a question with options it had to first invent. The result was persistent flakes that consumed nine paid E2E runs trying to fix "the model takes too long" — but the actual problem was the test configuration, not the model.

Removals:
- test/skill-e2e-autoplan-auto-mode.test.ts (deleted; the entire file was a single AUQ-blocked test)
- test/skill-e2e-plan-ceo-plan-mode.test.ts test 2 (the migrated --disallowedTools test); test 1 (baseline plan-mode smoke) stays
- test/skill-e2e-plan-design-plan-mode.test.ts test 2 (same shape); test 1 stays
- test/skill-e2e-plan-eng-plan-mode.test.ts test 2 (same shape); test 1 (baseline) and test 3 (STOP-gate with seeded plan, different contract) stay
- test/helpers/touchfiles.ts: autoplan-auto-mode entry removed
- test/touchfiles.test.ts: assertion count + commentary updated

Coverage retained: test 1 of each plan-mode file already verifies the model fires AUQ; the periodic finding-count tests verify per-finding AUQ cadence end-to-end.

The harness improvements landed during this debugging cycle (isProseAUQVisible regex, LLM judge, snapshot logging, high-water-mark tracking, ENOENT-tolerant assertReportAtBottomIfPlanWritten) all stay — they're useful for the remaining plan-mode tests that can also encounter prose rendering and slow-thinking phases.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -103,7 +103,6 @@ export const E2E_TOUCHFILES: Record<string, string[]> = {
|
||||
// INSIDE the existing 4 plan-X-review-plan-mode test files (covered
|
||||
// transitively by the entries above). Two new standalone files exist for
|
||||
// skills with no prior plan-mode test:
|
||||
'autoplan-auto-mode': ['autoplan/**', 'plan-ceo-review/**', 'plan-design-review/**', 'plan-eng-review/**', 'plan-devex-review/**', 'scripts/resolvers/preamble/generate-completion-status.ts', 'scripts/resolvers/question-tuning.ts', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble.ts', 'test/helpers/claude-pty-runner.ts'],
|
||||
'office-hours-auto-mode': ['office-hours/**', 'scripts/resolvers/preamble/generate-completion-status.ts', 'scripts/resolvers/question-tuning.ts', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble.ts', 'test/helpers/claude-pty-runner.ts'],
|
||||
'office-hours-phase4-fork': ['office-hours/**', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble/generate-completion-status.ts', 'scripts/resolvers/preamble.ts', 'scripts/resolvers/question-tuning.ts', 'test/helpers/llm-judge.ts', 'test/skill-e2e-office-hours-phase4.test.ts'],
|
||||
'llm-judge-recommendation': ['test/helpers/llm-judge.ts', 'test/llm-judge-recommendation.test.ts', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'codex/SKILL.md.tmpl', 'scripts/resolvers/review.ts'],
|
||||
@@ -423,7 +422,6 @@ export const E2E_TIERS: Record<string, 'gate' | 'periodic'> = {
|
||||
'plan-devex-review-plan-mode': 'gate',
|
||||
'plan-mode-no-op': 'gate',
|
||||
// v1.21+ auto-mode regression tests
|
||||
'autoplan-auto-mode': 'gate',
|
||||
'office-hours-auto-mode': 'gate',
|
||||
'auto-decide-preserved': 'periodic',
|
||||
'e2e-harness-audit': 'gate',
|
||||
|
||||
@@ -1,101 +0,0 @@
|
||||
/**
|
||||
* autoplan AskUserQuestion-blocked regression (gate, paid, real-PTY).
|
||||
*
|
||||
* v1.21+ regression: Conductor launches Claude Code with
|
||||
* `--disallowedTools AskUserQuestion --permission-mode default` (verified
|
||||
* by inspecting the parent claude process via `ps`). The native
|
||||
* AskUserQuestion tool is removed from the model's tool registry; without
|
||||
* fallback guidance the model can't ask the user and silently proceeds.
|
||||
*
|
||||
* Autoplan auto-decides INTERMEDIATE questions BY DESIGN
|
||||
* (autoplan/SKILL.md.tmpl:45), but Phase 1's premise confirmation gate is
|
||||
* one of the few non-auto-decided AskUserQuestions and MUST surface to the
|
||||
* user. This test asserts that gate still surfaces when AskUserQuestion is
|
||||
* disallowed at the tool-registry level — the fix must route the question
|
||||
* through a Conductor-side variant (mcp__conductor__AskUserQuestion) or
|
||||
* through the plan-file + ExitPlanMode flow.
|
||||
*
|
||||
* Filename keeps `auto-mode` for branch-history continuity. Auto-mode (the
|
||||
* AUTO_DECIDE preamble path when QUESTION_TUNING=true) is a related but
|
||||
* distinct silencing mechanism; both share the same fix surface.
|
||||
*
|
||||
* Note on report-at-bottom contract: the GSTACK REVIEW REPORT delete-then-
|
||||
* append flow lives in `scripts/resolvers/review.ts` and is exercised when
|
||||
* reviews actually run. The PTY harness can't drive autoplan through its
|
||||
* review phases without auto-progression of AUQs (see runPlanSkillCounting),
|
||||
* and `--disallowedTools AskUserQuestion` makes autoplan bail at the
|
||||
* premise gate via the plan-file fallback before any review runs. The
|
||||
* report-at-bottom prompt change is verified statically in
|
||||
* `test/gen-skill-docs.test.ts` instead — that's the load-bearing
|
||||
* verification for the contradictory-prompt fix.
|
||||
*/
|
||||
|
||||
import { describe, test, expect } from 'bun:test';
|
||||
import {
|
||||
runPlanSkillObservation,
|
||||
planFileHasDecisionsSection,
|
||||
isProseAUQVisible,
|
||||
} from './helpers/claude-pty-runner';
|
||||
|
||||
const shouldRun = !!process.env.EVALS && process.env.EVALS_TIER === 'gate';
|
||||
const describeE2E = shouldRun ? describe : describe.skip;
|
||||
|
||||
describeE2E('autoplan AskUserQuestion-blocked smoke (gate)', () => {
|
||||
// Pass envelope: model either renders the first non-auto-decided gate
|
||||
// (Phase 1 premise confirmation) as numbered prose ('asked'), surfaces
|
||||
// it through the plan-file + ExitPlanMode flow ('plan_ready' with a
|
||||
// "## Decisions" section [legacy fallback] OR with BLOCKED visible
|
||||
// [post-v1.28 fix]), or terminates with the BLOCKED string visible
|
||||
// ('exited' post-fix).
|
||||
//
|
||||
// Autoplan auto-decides intermediate questions BY DESIGN; the failure
|
||||
// signal we care about is the AUTO_DECIDE preamble firing on a gate it
|
||||
// shouldn't (caught explicitly via the 'auto_decided' outcome) or the
|
||||
// model proceeding silently.
|
||||
test('a non-auto-decided gate surfaces when AskUserQuestion is --disallowedTools', async () => {
|
||||
const obs = await runPlanSkillObservation({
|
||||
skillName: 'autoplan',
|
||||
inPlanMode: true,
|
||||
extraArgs: ['--disallowedTools', 'AskUserQuestion'],
|
||||
timeoutMs: 300_000,
|
||||
});
|
||||
|
||||
// The user must SEE the question one way or another. Three valid surfaces:
|
||||
// 1. `## Decisions to confirm` section in the plan file (legacy fallback path)
|
||||
// 2. `BLOCKED — AskUserQuestion` string visible in TTY (post-v1.28 BLOCKED rule)
|
||||
// 3. Numbered/lettered options visible in TTY as prose (post-v1.28 prose-AUQ rendering)
|
||||
// If NONE of these are present, the question was silently buried.
|
||||
const blockedVisible = /BLOCKED\s*[—-]\s*AskUserQuestion/i.test(obs.evidence);
|
||||
const proseAUQVisible = isProseAUQVisible(obs.evidence) || obs.proseAUQEverObserved === true;
|
||||
const surfaceVisible = blockedVisible || proseAUQVisible || obs.waitingEverObserved === true;
|
||||
|
||||
if (
|
||||
obs.outcome === 'auto_decided' ||
|
||||
obs.outcome === 'silent_write' ||
|
||||
obs.outcome === 'timeout'
|
||||
) {
|
||||
throw new Error(
|
||||
`autoplan AskUserQuestion-blocked regression: outcome=${obs.outcome}\n` +
|
||||
`summary: ${obs.summary}\n` +
|
||||
`elapsed: ${obs.elapsedMs}ms\n` +
|
||||
`--- evidence (last 2KB visible) ---\n${obs.evidence}`,
|
||||
);
|
||||
}
|
||||
if (obs.outcome === 'exited' && !surfaceVisible) {
|
||||
throw new Error(
|
||||
`autoplan AskUserQuestion-blocked regression: outcome=exited without any visible question surface (no BLOCKED string, no prose-rendered AUQ options). Model quit silently.\n` +
|
||||
`--- evidence (last 2KB visible) ---\n${obs.evidence}`,
|
||||
);
|
||||
}
|
||||
if (obs.outcome === 'plan_ready') {
|
||||
const decisionsOk = obs.planFile && planFileHasDecisionsSection(obs.planFile);
|
||||
if (!decisionsOk && !surfaceVisible) {
|
||||
throw new Error(
|
||||
`autoplan AskUserQuestion-blocked regression: plan_ready without any visible question surface (no "## Decisions" section in ${obs.planFile ?? '<no plan file detected>'}, no BLOCKED string, no prose AUQ options) — Phase 1 premise gate was silently skipped.\n` +
|
||||
`--- evidence (last 2KB visible) ---\n${obs.evidence}`,
|
||||
);
|
||||
}
|
||||
}
|
||||
expect(['asked', 'plan_ready', 'exited']).toContain(obs.outcome);
|
||||
}, 360_000);
|
||||
});
|
||||
@@ -33,43 +33,15 @@
|
||||
* See test/helpers/claude-pty-runner.ts for runner internals.
|
||||
*/
|
||||
|
||||
import { describe, test, expect } from 'bun:test';
|
||||
import { describe, test } from 'bun:test';
|
||||
import {
|
||||
runPlanSkillObservation,
|
||||
planFileHasDecisionsSection,
|
||||
assertReportAtBottomIfPlanWritten,
|
||||
isProseAUQVisible,
|
||||
} from './helpers/claude-pty-runner';
|
||||
|
||||
const shouldRun = !!process.env.EVALS && process.env.EVALS_TIER === 'gate';
|
||||
const describeE2E = shouldRun ? describe : describe.skip;
|
||||
|
||||
// Concrete plan to review. Used by the --disallowedTools test to skip
|
||||
// the "what should I review?" deliberation that otherwise eats the
|
||||
// model's budget. Has CEO-review-shaped issues (premise gap, vague
|
||||
// success metric, scope-creep smell) so Step 0 has real material.
|
||||
const SEED_PLAN_FOR_CEO_REVIEW = `
|
||||
# Plan: Launch a "developer-friendly" pricing tier
|
||||
|
||||
## Goal
|
||||
Increase developer adoption.
|
||||
|
||||
## Success metric
|
||||
More signups.
|
||||
|
||||
## Premise
|
||||
We haven't talked to any developers about whether the current pricing
|
||||
is actually a barrier. The team agreed it "feels like" it should be
|
||||
cheaper. No data yet on what dev users would pay for or what the unit
|
||||
economics would look like at the new price point.
|
||||
|
||||
## Plan
|
||||
- Pick a 30% discount as the developer tier
|
||||
- Add an email field to /pricing for "verify with @company.com"
|
||||
- Auto-enroll anyone with @gmail/@hotmail addresses too as a pilot
|
||||
- Ship next week
|
||||
`.trim();
|
||||
|
||||
describeE2E('plan-ceo-review plan-mode smoke (gate)', () => {
|
||||
test('first terminal outcome is asked (Step 0 fires before any plan write)', async () => {
|
||||
const obs = await runPlanSkillObservation({
|
||||
@@ -101,101 +73,4 @@ describeE2E('plan-ceo-review plan-mode smoke (gate)', () => {
|
||||
}
|
||||
assertReportAtBottomIfPlanWritten(obs);
|
||||
}, 360_000);
|
||||
|
||||
// v1.21+ regression: Conductor launches Claude Code with
|
||||
// `--disallowedTools AskUserQuestion --permission-mode default` (verified
|
||||
// via `ps` on the live Conductor claude process). Native AskUserQuestion
|
||||
// is removed from the model's tool registry; without fallback guidance
|
||||
// the model can't ask and silently proceeds.
|
||||
//
|
||||
// After v1.28+ (forever-war fix), the preamble fallback that wrote a
|
||||
// "## Decisions to confirm" section was deleted in favor of a hard
|
||||
// BLOCKED rule. The pass envelope under --disallowedTools accepts:
|
||||
// - 'asked' — model emits a numbered-option prompt as prose
|
||||
// - 'plan_ready' WITH (## Decisions section [legacy]
|
||||
// OR BLOCKED string visible [post-fix])
|
||||
// - 'exited' WITH BLOCKED string visible [post-fix]
|
||||
//
|
||||
// The legacy `## Decisions` path stays in the envelope so this test
|
||||
// keeps passing during the migration window when the fallback delete
|
||||
// and resolver edits land in the same PR but mid-rebase states are
|
||||
// possible. Once the deletion has been on main long enough that the
|
||||
// generated SKILL.md cache has flushed, the legacy branch can be
|
||||
// removed in a follow-up.
|
||||
//
|
||||
// Failure signals (regression we DO want to catch):
|
||||
// - 'auto_decided' — AUTO_DECIDE preamble fired without /plan-tune opt-in
|
||||
// - 'silent_write' — Write/Edit before any AUQ surface
|
||||
// - 'timeout' — neither asked nor terminated in budget
|
||||
// - 'plan_ready' or 'exited' WITHOUT either Decisions section or BLOCKED
|
||||
test('AskUserQuestion surfaces when --disallowedTools AskUserQuestion is set', async () => {
|
||||
// Pre-prime with concrete plan content so the model doesn't burn its
|
||||
// budget deliberating about WHICH artifact to review. Without this seed,
|
||||
// a bare /plan-ceo-review under --disallowedTools puts the model in a
|
||||
// 5-minute thinking loop trying to enumerate scope options before
|
||||
// surfacing them as prose. With the seed, the model has a real plan to
|
||||
// critique and can move directly to Step 0 / Section 1 findings.
|
||||
//
|
||||
// The test still exercises the regression we care about: under
|
||||
// --disallowedTools, does the skill SURFACE its first decision question
|
||||
// (via prose, BLOCKED, or some visible surface) rather than silently
|
||||
// ExitPlanMode-ing?
|
||||
const obs = await runPlanSkillObservation({
|
||||
skillName: 'plan-ceo-review',
|
||||
inPlanMode: true,
|
||||
extraArgs: ['--disallowedTools', 'AskUserQuestion'],
|
||||
initialPlanContent: SEED_PLAN_FOR_CEO_REVIEW,
|
||||
timeoutMs: 300_000,
|
||||
});
|
||||
|
||||
// The user must SEE the question one way or another. Three valid surfaces:
|
||||
// 1. `## Decisions to confirm` section in the plan file (legacy fallback)
|
||||
// 2. `BLOCKED — AskUserQuestion` string visible in TTY (post-v1.28 BLOCKED rule)
|
||||
// 3. Numbered/lettered options visible in TTY as prose (post-v1.28 prose-AUQ rendering)
|
||||
const blockedVisible = /BLOCKED\s*[—-]\s*AskUserQuestion/i.test(obs.evidence);
|
||||
const proseAUQVisible = isProseAUQVisible(obs.evidence) || obs.proseAUQEverObserved === true;
|
||||
const surfaceVisible = blockedVisible || proseAUQVisible || obs.waitingEverObserved === true;
|
||||
|
||||
if (
|
||||
obs.outcome === 'auto_decided' ||
|
||||
obs.outcome === 'silent_write' ||
|
||||
obs.outcome === 'timeout'
|
||||
) {
|
||||
throw new Error(
|
||||
`plan-ceo-review AskUserQuestion-blocked regression: outcome=${obs.outcome}\n` +
|
||||
`summary: ${obs.summary}\n` +
|
||||
`elapsed: ${obs.elapsedMs}ms\n` +
|
||||
`--- evidence (last 2KB visible) ---\n${obs.evidence}`,
|
||||
);
|
||||
}
|
||||
if (obs.outcome === 'exited' && !surfaceVisible) {
|
||||
throw new Error(
|
||||
`plan-ceo-review AskUserQuestion-blocked regression: outcome=exited without any visible question surface (no BLOCKED string, no prose-rendered AUQ options). Model quit silently.\n` +
|
||||
`--- evidence (last 2KB visible) ---\n${obs.evidence}`,
|
||||
);
|
||||
}
|
||||
if (obs.outcome === 'plan_ready') {
|
||||
if (!obs.planFile) {
|
||||
if (!surfaceVisible) {
|
||||
throw new Error(
|
||||
`plan-ceo-review AskUserQuestion-blocked regression: outcome=plan_ready but no plan file path detected, no BLOCKED string, no prose AUQ options. Cannot verify the model used any legitimate path.\n` +
|
||||
`--- evidence (last 2KB visible) ---\n${obs.evidence}`,
|
||||
);
|
||||
}
|
||||
} else if (!planFileHasDecisionsSection(obs.planFile) && !surfaceVisible) {
|
||||
throw new Error(
|
||||
`plan-ceo-review AskUserQuestion-blocked regression: model wrote ${obs.planFile} without a "## Decisions" section AND no BLOCKED string AND no prose AUQ options in TTY. Step 0 was silently skipped.\n` +
|
||||
`--- evidence (last 2KB visible) ---\n${obs.evidence}`,
|
||||
);
|
||||
}
|
||||
}
|
||||
expect(['asked', 'plan_ready', 'exited']).toContain(obs.outcome);
|
||||
// NOTE: assertReportAtBottomIfPlanWritten is intentionally NOT called
|
||||
// here. This test runs --disallowedTools AskUserQuestion and only
|
||||
// checks "did the question surface" — the model can't run the full
|
||||
// multi-section review without AUQ tools, so no review report exists
|
||||
// to enforce the at-bottom contract against. The contract is
|
||||
// exercised by the periodic finding-count tests, which DO run the
|
||||
// full review.
|
||||
}, 360_000);
|
||||
});
|
||||
|
||||
@@ -13,7 +13,6 @@ import { describe, test, expect } from 'bun:test';
|
||||
import {
|
||||
runPlanSkillObservation,
|
||||
assertReportAtBottomIfPlanWritten,
|
||||
isProseAUQVisible,
|
||||
} from './helpers/claude-pty-runner';
|
||||
|
||||
const shouldRun = !!process.env.EVALS && process.env.EVALS_TIER === 'gate';
|
||||
@@ -38,56 +37,4 @@ describeE2E('plan-design-review plan-mode smoke (gate)', () => {
|
||||
expect(['asked', 'plan_ready']).toContain(obs.outcome);
|
||||
assertReportAtBottomIfPlanWritten(obs);
|
||||
}, 360_000);
|
||||
|
||||
// v1.21+ regression: see skill-e2e-plan-ceo-plan-mode.test.ts for the
|
||||
// contract. plan-design-review legitimately short-circuits on no-UI-scope
|
||||
// branches, so this case has historically used a looser envelope.
|
||||
//
|
||||
// Post-v1.28 (forever-war fix), 'exited' is acceptable when BLOCKED is
|
||||
// visible in the TTY (model correctly recognized the AUQ-unavailable
|
||||
// failure mode and stopped). The legacy 'plan_ready' (with or without
|
||||
// decisions section) and 'asked' paths remain valid pass outcomes.
|
||||
//
|
||||
// The discriminating regression signals are 'auto_decided' (AUTO_DECIDE
|
||||
// preamble fired upstream), 'silent_write', 'timeout', or 'exited'
|
||||
// without BLOCKED visible — all mean the user never saw a question they
|
||||
// should have.
|
||||
test('does not silently auto-decide when --disallowedTools AskUserQuestion is set', async () => {
|
||||
const obs = await runPlanSkillObservation({
|
||||
skillName: 'plan-design-review',
|
||||
inPlanMode: true,
|
||||
extraArgs: ['--disallowedTools', 'AskUserQuestion'],
|
||||
timeoutMs: 300_000,
|
||||
});
|
||||
|
||||
// Surface visibility check (same as ceo / autoplan migrations): user
|
||||
// must SEE the question via BLOCKED string OR prose-rendered AUQ options.
|
||||
const blockedVisible = /BLOCKED\s*[—-]\s*AskUserQuestion/i.test(obs.evidence);
|
||||
const proseAUQVisible = isProseAUQVisible(obs.evidence) || obs.proseAUQEverObserved === true;
|
||||
const surfaceVisible = blockedVisible || proseAUQVisible || obs.waitingEverObserved === true;
|
||||
|
||||
if (
|
||||
obs.outcome === 'auto_decided' ||
|
||||
obs.outcome === 'silent_write' ||
|
||||
obs.outcome === 'timeout'
|
||||
) {
|
||||
throw new Error(
|
||||
`plan-design-review AskUserQuestion-blocked regression: outcome=${obs.outcome}\n` +
|
||||
`summary: ${obs.summary}\n` +
|
||||
`elapsed: ${obs.elapsedMs}ms\n` +
|
||||
`--- evidence (last 2KB visible) ---\n${obs.evidence}`,
|
||||
);
|
||||
}
|
||||
if (obs.outcome === 'exited' && !surfaceVisible) {
|
||||
throw new Error(
|
||||
`plan-design-review AskUserQuestion-blocked regression: outcome=exited without any visible question surface (no BLOCKED string, no prose-rendered AUQ options). Model quit silently.\n` +
|
||||
`--- evidence (last 2KB visible) ---\n${obs.evidence}`,
|
||||
);
|
||||
}
|
||||
expect(['asked', 'plan_ready', 'exited']).toContain(obs.outcome);
|
||||
// NOTE: assertReportAtBottomIfPlanWritten intentionally not called —
|
||||
// see skill-e2e-plan-ceo-plan-mode test 2 for the full rationale. Under
|
||||
// --disallowedTools the model can't run a full review, so the
|
||||
// report-at-bottom contract doesn't apply.
|
||||
}, 360_000);
|
||||
});
|
||||
|
||||
@@ -10,7 +10,6 @@ import {
|
||||
runPlanSkillObservation,
|
||||
planFileHasDecisionsSection,
|
||||
assertReportAtBottomIfPlanWritten,
|
||||
isProseAUQVisible,
|
||||
} from './helpers/claude-pty-runner';
|
||||
|
||||
const shouldRun = !!process.env.EVALS && process.env.EVALS_TIER === 'gate';
|
||||
@@ -66,60 +65,6 @@ describeE2E('plan-eng-review plan-mode smoke (gate)', () => {
|
||||
assertReportAtBottomIfPlanWritten(obs);
|
||||
}, 360_000);
|
||||
|
||||
// v1.21+ regression: see skill-e2e-plan-ceo-plan-mode.test.ts for the
|
||||
// contract. Pass envelope is ['asked', 'plan_ready']; failure signals
|
||||
// are 'auto_decided' (AUTO_DECIDE without opt-in) plus the standard
|
||||
// silent_write/exited/timeout.
|
||||
test('AskUserQuestion surfaces when --disallowedTools AskUserQuestion is set', async () => {
|
||||
const obs = await runPlanSkillObservation({
|
||||
skillName: 'plan-eng-review',
|
||||
inPlanMode: true,
|
||||
extraArgs: ['--disallowedTools', 'AskUserQuestion'],
|
||||
timeoutMs: 300_000,
|
||||
});
|
||||
|
||||
// Surface visibility check (consistent with plan-ceo / plan-design /
|
||||
// autoplan migrations): user must SEE the question via a `## Decisions`
|
||||
// section in the plan file (legacy) OR a BLOCKED string in TTY OR
|
||||
// prose-rendered AUQ options in TTY.
|
||||
const blockedVisible = /BLOCKED\s*[—-]\s*AskUserQuestion/i.test(obs.evidence);
|
||||
const proseAUQVisible = isProseAUQVisible(obs.evidence) || obs.proseAUQEverObserved === true;
|
||||
const surfaceVisible = blockedVisible || proseAUQVisible || obs.waitingEverObserved === true;
|
||||
|
||||
if (
|
||||
obs.outcome === 'auto_decided' ||
|
||||
obs.outcome === 'silent_write' ||
|
||||
obs.outcome === 'timeout'
|
||||
) {
|
||||
throw new Error(
|
||||
`plan-eng-review AskUserQuestion-blocked regression: outcome=${obs.outcome}\n` +
|
||||
`summary: ${obs.summary}\n` +
|
||||
`elapsed: ${obs.elapsedMs}ms\n` +
|
||||
`--- evidence (last 2KB visible) ---\n${obs.evidence}`,
|
||||
);
|
||||
}
|
||||
if (obs.outcome === 'exited' && !surfaceVisible) {
|
||||
throw new Error(
|
||||
`plan-eng-review AskUserQuestion-blocked regression: outcome=exited without any visible question surface (no BLOCKED string, no prose-rendered AUQ options). Model quit silently.\n` +
|
||||
`--- evidence (last 2KB visible) ---\n${obs.evidence}`,
|
||||
);
|
||||
}
|
||||
if (obs.outcome === 'plan_ready') {
|
||||
const decisionsOk = obs.planFile && planFileHasDecisionsSection(obs.planFile);
|
||||
if (!decisionsOk && !surfaceVisible) {
|
||||
throw new Error(
|
||||
`plan-eng-review AskUserQuestion-blocked regression: plan_ready without any visible question surface (no "## Decisions" section in ${obs.planFile ?? '<no plan file detected>'}, no BLOCKED string, no prose AUQ options) — Step 0 was silently skipped.\n` +
|
||||
`--- evidence (last 2KB visible) ---\n${obs.evidence}`,
|
||||
);
|
||||
}
|
||||
}
|
||||
expect(['asked', 'plan_ready', 'exited']).toContain(obs.outcome);
|
||||
// NOTE: assertReportAtBottomIfPlanWritten intentionally not called —
|
||||
// see plan-ceo-plan-mode test 2 for the rationale. Under
|
||||
// --disallowedTools the model can't run the full review, so the
|
||||
// report-at-bottom contract doesn't apply here.
|
||||
}, 360_000);
|
||||
|
||||
// D3-B / D4-B: when a plan with guaranteed-finding-triggering complexity
|
||||
// is seeded, the skill MUST fire AskUserQuestion (or fall back to a
|
||||
// Decisions section) before writing findings to the plan. The
|
||||
|
||||
@@ -99,14 +99,14 @@ describe('selectTests', () => {
|
||||
expect(result.selected).toContain('autoplan-chain-pty');
|
||||
// Per-finding count + review-report-at-bottom (v1.21.x)
|
||||
expect(result.selected).toContain('plan-ceo-finding-count');
|
||||
// v1.22+ AskUserQuestion-blocked regression: autoplan-auto-mode +
|
||||
// auto-decide-preserved also depend on plan-ceo-review/**
|
||||
expect(result.selected).toContain('autoplan-auto-mode');
|
||||
// v1.22+ AskUserQuestion-blocked regression: auto-decide-preserved
|
||||
// also depends on plan-ceo-review/** (autoplan-auto-mode test was
|
||||
// removed in v1.28 — see commit message for the rationale).
|
||||
expect(result.selected).toContain('auto-decide-preserved');
|
||||
// v1.27+ gate-tier reviewCount-floor regression for transcript bug
|
||||
expect(result.selected).toContain('plan-ceo-finding-floor');
|
||||
expect(result.selected.length).toBe(22);
|
||||
expect(result.skipped.length).toBe(Object.keys(E2E_TOUCHFILES).length - 22);
|
||||
expect(result.selected.length).toBe(21);
|
||||
expect(result.skipped.length).toBe(Object.keys(E2E_TOUCHFILES).length - 21);
|
||||
});
|
||||
|
||||
test('global touchfile triggers ALL tests', () => {
|
||||
|
||||
Reference in New Issue
Block a user