Files
gstack/test/skill-e2e-plan-ceo-mode-routing.test.ts
T
Garry Tan e6ce1dca70 test: extract MODE_RE + optionsSignature into PTY runner exports
Refactor prep for the upcoming per-finding AskUserQuestion count test
across plan-{ceo,eng,design,devex}-review. Both new tests and the existing
mode-routing test need the same mode regex and the same option-list
fingerprint dedupe — pulling them into one source of truth in
test/helpers/claude-pty-runner.ts so a fifth mode (or a tweak to the
fingerprint shape) updates everywhere instead of drifting per-test.

Mechanical: no behavior change in the mode-routing test.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-28 09:05:11 -07:00

213 lines
8.4 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
/**
* /plan-ceo-review mode-routing E2E (periodic, paid, real-PTY).
*
* Asserts: when /plan-ceo-review reaches its Step 0F mode-selection
* AskUserQuestion and the user picks HOLD SCOPE or SCOPE EXPANSION,
* the downstream rendered output reflects that mode's distinctive
* posture language.
*
* Why this exists: existing tests verify that the question fires. Nothing
* verifies the answer actually routes. A regression where Step 0F shows
* the question but the agent ignores the choice (e.g. always defaults
* to EXPANSION) would not be caught by any prior test.
*
* Tier: periodic (not gate). Each run navigates 8-12 prior AskUserQuestions (telemetry,
* proactive, routing, vendoring, brain, office-hours, premise×3, approach)
* before reaching Step 0F. At ~30s per AskUserQuestion that's a 4-6 min navigation
* phase per case. The full 2-case suite runs ~12-15 min, $3-4. Too slow
* for gate-tier; weekly is fine.
*
* Mode coverage: HOLD SCOPE + SCOPE EXPANSION cover the two posture poles
* (rigor vs ambition). SELECTIVE EXPANSION and SCOPE REDUCTION are V2 once
* the navigation phase is shorter or has a deterministic fast-path through
* Step 0A/0C-bis.
*
* Posture assertions: each mode has distinct downstream language. The
* checks below are deliberately permissive — they catch the binary
* "did the mode posture even apply" question, not Opus-specific phrasing.
*
* HOLD SCOPE — "rigor" or "bulletproof" or "hold scope"
* SCOPE EXPANSION — "expansion" or "10x" or "delight" or "dream"
*/
import { describe, test } from 'bun:test';
import {
launchClaudePty,
isNumberedOptionListVisible,
isPermissionDialogVisible,
parseNumberedOptions,
isPlanReadyVisible,
MODE_RE,
optionsSignature,
TAIL_SCAN_BYTES,
type ClaudePtySession,
} from './helpers/claude-pty-runner';
const shouldRun = !!process.env.EVALS && process.env.EVALS_TIER === 'periodic';
const describeE2E = shouldRun ? describe : describe.skip;
interface ModeCase {
mode: 'HOLD SCOPE' | 'SCOPE EXPANSION';
/** Regex applied to visible-since-mode-pick text. At least one must match. */
postureRe: RegExp;
}
const CASES: ModeCase[] = [
{ mode: 'HOLD SCOPE', postureRe: /\b(rigor|bulletproof|hold\s*scope|maximum\s+rigor)\b/i },
{ mode: 'SCOPE EXPANSION', postureRe: /\b(expansion|10x|delight|dream|cathedral|opt[\s-]?in)\b/i },
];
/**
* Navigate prior AskUserQuestions by picking option 1 until we hit an AskUserQuestion whose
* options match one of the 4 mode names. Returns the option index
* matching `targetMode`, with the buffer marker pointing AT that AskUserQuestion.
*
* Throws if we don't reach the mode AskUserQuestion within `maxNav` prior AskUserQuestions or
* the overall budget.
*/
async function navigateToModeAskUserQuestion(
session: ClaudePtySession,
since: number,
targetMode: ModeCase['mode'],
opts: { maxNav?: number; budgetMs?: number } = {},
): Promise<{ modeIndex: number; visibleAtMode: string }> {
// /plan-ceo-review's mode AskUserQuestion (Step 0F) sits behind several preamble
// and Step 0A-0C-bis gates: telemetry, proactive, routing, vendoring,
// brain privacy, office-hours offer, premise challenge (3 questions),
// approach selection. 12 hops is the conservative ceiling.
const maxNav = opts.maxNav ?? 12;
const budgetMs = opts.budgetMs ?? 420_000;
const start = Date.now();
let priorAnswered = 0;
let lastSeenList: Array<{ index: number; label: string }> = [];
while (Date.now() - start < budgetMs) {
if (session.exited()) {
throw new Error(
`claude exited (code=${session.exitCode()}) during nav.\n` +
`Last visible:\n${session.visibleSince(since).slice(-2000)}`,
);
}
await Bun.sleep(2000);
const visible = session.visibleSince(since);
if (!isNumberedOptionListVisible(visible)) continue;
const opts = parseNumberedOptions(visible);
if (opts.length < 2) continue;
// Has the rendered list changed since last poll? If not, we're seeing
// the same prompt and shouldn't double-press.
const sig = optionsSignature(opts);
const lastSig = optionsSignature(lastSeenList);
if (sig === lastSig) continue;
lastSeenList = opts;
// Is THIS the mode AskUserQuestion?
if (opts.some(o => MODE_RE.test(o.label))) {
const target = opts.find(o => o.label.toUpperCase().includes(targetMode));
if (!target) {
throw new Error(
`Mode AskUserQuestion rendered but target "${targetMode}" not in option labels:\n` +
opts.map(o => ` ${o.index}. ${o.label}`).join('\n'),
);
}
return { modeIndex: target.index, visibleAtMode: visible };
}
// Permission dialog? Grant with "1" but don't count it against nav budget.
// Classify on the recent tail only — old permission text persists in
// visibleSince and would re-trigger forever.
//
// Note: runPlanSkillObservation has its own permission-dialog filter that
// simply skips classification (since it observes, doesn't drive). This nav
// loop drives the PTY directly via launchClaudePty and so owns its own
// dialog handling — granting with "1" so the workflow advances. Both
// paths share TAIL_SCAN_BYTES as the recent-tail window so tuning stays
// in sync.
if (isPermissionDialogVisible(visible.slice(-TAIL_SCAN_BYTES))) {
session.send('1\r');
await Bun.sleep(1500);
continue;
}
// Not the mode AskUserQuestion — answer with option 1 (recommended) and continue.
if (priorAnswered >= maxNav) {
throw new Error(
`Navigated ${maxNav} prior AskUserQuestions without reaching the mode AskUserQuestion. ` +
`Last list:\n${opts.map(o => ` ${o.index}. ${o.label}`).join('\n')}`,
);
}
priorAnswered++;
session.send('1\r');
// Give the agent a beat to advance before re-polling.
await Bun.sleep(2000);
}
throw new Error(`Mode AskUserQuestion not reached within ${budgetMs}ms`);
}
describeE2E('/plan-ceo-review mode routing (gate)', () => {
for (const c of CASES) {
test(
`mode "${c.mode}" routes to its distinctive posture`,
async () => {
const session = await launchClaudePty({
permissionMode: 'plan',
timeoutMs: 540_000,
});
try {
await Bun.sleep(8000);
const since = session.mark();
session.send('/plan-ceo-review\r');
const { modeIndex } = await navigateToModeAskUserQuestion(session, since, c.mode);
// Snapshot the visible buffer at mode-pick time, then send the index.
const sincePick = session.rawOutput().length;
session.send(`${modeIndex}\r`);
// Wait for downstream evidence: either next AskUserQuestion or plan_ready or
// a posture-distinctive substring shows up.
const budgetMs = 240_000;
const start = Date.now();
let postureMatched = false;
let downstreamSnapshot = '';
while (Date.now() - start < budgetMs) {
await Bun.sleep(2500);
if (session.exited()) {
throw new Error(
`claude exited (code=${session.exitCode()}) after mode pick.\n` +
`Downstream:\n${session.visibleSince(sincePick).slice(-2000)}`,
);
}
downstreamSnapshot = session.visibleSince(sincePick);
if (c.postureRe.test(downstreamSnapshot)) {
postureMatched = true;
break;
}
// Don't bail early on plan_ready alone — the posture text may
// arrive as the agent finishes writing the plan. Only break
// once we either match posture or run the clock.
if (
isPlanReadyVisible(downstreamSnapshot) &&
isNumberedOptionListVisible(downstreamSnapshot) &&
!c.postureRe.test(downstreamSnapshot)
) {
// Plan-ready AND a follow-up AskUserQuestion are both visible but
// posture text has not appeared yet. Keep polling for a bit.
}
}
if (!postureMatched) {
throw new Error(
`Mode "${c.mode}" routing FAILED: no posture match for ${c.postureRe.source}.\n` +
`--- downstream visible since mode pick (last 3KB) ---\n` +
downstreamSnapshot.slice(-3000),
);
}
} finally {
await session.close();
}
},
600_000,
);
}
});