mirror of
https://github.com/garrytan/gstack.git
synced 2026-05-02 03:35:09 +02:00
e6ce1dca70
Refactor prep for the upcoming per-finding AskUserQuestion count test
across plan-{ceo,eng,design,devex}-review. Both new tests and the existing
mode-routing test need the same mode regex and the same option-list
fingerprint dedupe — pulling them into one source of truth in
test/helpers/claude-pty-runner.ts so a fifth mode (or a tweak to the
fingerprint shape) updates everywhere instead of drifting per-test.
Mechanical: no behavior change in the mode-routing test.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
213 lines
8.4 KiB
TypeScript
213 lines
8.4 KiB
TypeScript
/**
 * /plan-ceo-review mode-routing E2E (periodic, paid, real-PTY).
 *
 * Asserts: when /plan-ceo-review reaches its Step 0F mode-selection
 * AskUserQuestion and the user picks HOLD SCOPE or SCOPE EXPANSION,
 * the downstream rendered output reflects that mode's distinctive
 * posture language.
 *
 * Why this exists: existing tests verify that the question fires. Nothing
 * verifies that the answer actually routes. A regression where Step 0F shows
 * the question but the agent ignores the choice (e.g. always defaults
 * to EXPANSION) would not be caught by any prior test.
 *
 * Tier: periodic (not gate). Each run navigates 8-12 prior AskUserQuestions
 * (telemetry, proactive, routing, vendoring, brain, office-hours, premise×3,
 * approach) before reaching Step 0F. At ~30s per AskUserQuestion that's a
 * 4-6 min navigation phase per case. The full 2-case suite runs ~12-15 min
 * and costs $3-4. Too slow for gate-tier; weekly is fine.
 *
 * Mode coverage: HOLD SCOPE + SCOPE EXPANSION cover the two posture poles
 * (rigor vs ambition). SELECTIVE EXPANSION and SCOPE REDUCTION are V2 once
 * the navigation phase is shorter or has a deterministic fast-path through
 * Step 0A/0C-bis.
 *
 * Posture assertions: each mode has distinct downstream language. The
 * checks below are deliberately permissive — they catch the binary
 * "did the mode posture even apply" question, not Opus-specific phrasing.
 * Matching is case-insensitive and whole-word.
 *
 *   HOLD SCOPE      — "rigor", "bulletproof", "hold scope", or "maximum rigor"
 *   SCOPE EXPANSION — "expansion", "10x", "delight", "dream", "cathedral",
 *                     or "opt-in" / "opt in" / "optin"
 */
|
||
|
||
import { describe, test } from 'bun:test';
|
||
import {
|
||
launchClaudePty,
|
||
isNumberedOptionListVisible,
|
||
isPermissionDialogVisible,
|
||
parseNumberedOptions,
|
||
isPlanReadyVisible,
|
||
MODE_RE,
|
||
optionsSignature,
|
||
TAIL_SCAN_BYTES,
|
||
type ClaudePtySession,
|
||
} from './helpers/claude-pty-runner';
|
||
|
||
// Paid, real-PTY suite: run only when evals are enabled (EVALS is set to a
// non-empty value) AND the periodic tier was explicitly requested.
const shouldRun = !!process.env.EVALS && process.env.EVALS_TIER === 'periodic';
// When the gate is closed the whole suite is registered as skipped instead of
// being silently absent from the report.
const describeE2E = shouldRun ? describe : describe.skip;
|
||
|
||
/** One mode-routing scenario: which Step 0F mode to pick and how to recognise its effect. */
interface ModeCase {
  /**
   * Mode to select. Compared as an upper-cased substring against the rendered
   * option labels when locating the option to press.
   */
  mode: 'HOLD SCOPE' | 'SCOPE EXPANSION';
  /**
   * Regex applied to the text visible since the mode pick. The case passes as
   * soon as any one of the regex's alternatives matches.
   */
  postureRe: RegExp;
}
|
||
|
||
/**
 * The two modes exercised by this suite, each paired with a permissive,
 * case-insensitive, whole-word regex for that mode's posture vocabulary.
 */
const CASES: ModeCase[] = [
  { mode: 'HOLD SCOPE', postureRe: /\b(rigor|bulletproof|hold\s*scope|maximum\s+rigor)\b/i },
  { mode: 'SCOPE EXPANSION', postureRe: /\b(expansion|10x|delight|dream|cathedral|opt[\s-]?in)\b/i },
];
|
||
|
||
/**
 * Drive the PTY through the AskUserQuestions that precede the mode question,
 * answering each with option 1, until a numbered option list appears in which
 * at least one label matches `MODE_RE`.
 *
 * @param session    Live PTY session to poll and type into.
 * @param since      Buffer marker; all polling reads `session.visibleSince(since)`.
 * @param targetMode Mode whose option index should be returned. Matched as a
 *                   substring of the upper-cased option label.
 * @param opts       `maxNav`: how many non-mode questions may be answered
 *                   before giving up (default 12). `budgetMs`: wall-clock
 *                   budget for the whole navigation (default 420_000).
 * @returns `modeIndex`, the option number to press for `targetMode`, and
 *          `visibleAtMode`, the visible text at the moment the mode question
 *          was detected. The mode question itself is NOT answered here.
 * @throws Error if the session exits, if the mode question renders without
 *         `targetMode` among its labels, if `maxNav` prior questions have
 *         already been answered when another non-mode question appears, or if
 *         `budgetMs` elapses first.
 */
async function navigateToModeAskUserQuestion(
  session: ClaudePtySession,
  since: number,
  targetMode: ModeCase['mode'],
  opts: { maxNav?: number; budgetMs?: number } = {},
): Promise<{ modeIndex: number; visibleAtMode: string }> {
  // /plan-ceo-review's mode AskUserQuestion (Step 0F) sits behind several preamble
  // and Step 0A-0C-bis gates: telemetry, proactive, routing, vendoring,
  // brain privacy, office-hours offer, premise challenge (3 questions),
  // approach selection. 12 hops is the conservative ceiling.
  const maxNav = opts.maxNav ?? 12;
  const budgetMs = opts.budgetMs ?? 420_000;
  const start = Date.now();

  // Shared renderer for the option list embedded in error messages.
  const formatOptions = (list: Array<{ index: number; label: string }>): string =>
    list.map(o => ` ${o.index}. ${o.label}`).join('\n');

  let priorAnswered = 0;
  let lastSeenList: Array<{ index: number; label: string }> = [];

  while (Date.now() - start < budgetMs) {
    if (session.exited()) {
      throw new Error(
        `claude exited (code=${session.exitCode()}) during nav.\n` +
          `Last visible:\n${session.visibleSince(since).slice(-2000)}`,
      );
    }
    await Bun.sleep(2000);

    const visible = session.visibleSince(since);
    if (!isNumberedOptionListVisible(visible)) continue;

    // Named `options` (not `opts`) so the function's `opts` parameter is not shadowed.
    const options = parseNumberedOptions(visible);
    if (options.length < 2) continue;

    // Has the rendered list changed since last poll? If not, we're seeing
    // the same prompt and shouldn't double-press.
    if (optionsSignature(options) === optionsSignature(lastSeenList)) continue;
    lastSeenList = options;

    // Is THIS the mode AskUserQuestion?
    // NOTE(review): MODE_RE is a shared regex that is .test()-ed repeatedly;
    // confirm it is not declared with the `g`/`y` flag, which would make
    // successive calls stateful.
    if (options.some(o => MODE_RE.test(o.label))) {
      const target = options.find(o => o.label.toUpperCase().includes(targetMode));
      if (!target) {
        throw new Error(
          `Mode AskUserQuestion rendered but target "${targetMode}" not in option labels:\n` +
            formatOptions(options),
        );
      }
      return { modeIndex: target.index, visibleAtMode: visible };
    }

    // Permission dialog? Grant with "1" but don't count it against nav budget.
    // Classify on the recent tail only — old permission text persists in
    // visibleSince and would re-trigger forever.
    //
    // Note: runPlanSkillObservation has its own permission-dialog filter that
    // simply skips classification (since it observes, doesn't drive). This nav
    // loop drives the PTY directly via launchClaudePty and so owns its own
    // dialog handling — granting with "1" so the workflow advances. Both
    // paths share TAIL_SCAN_BYTES as the recent-tail window so tuning stays
    // in sync.
    if (isPermissionDialogVisible(visible.slice(-TAIL_SCAN_BYTES))) {
      session.send('1\r');
      await Bun.sleep(1500);
      continue;
    }

    // Not the mode AskUserQuestion — answer with option 1 (recommended) and continue.
    if (priorAnswered >= maxNav) {
      throw new Error(
        `Navigated ${maxNav} prior AskUserQuestions without reaching the mode AskUserQuestion. ` +
          `Last list:\n${formatOptions(options)}`,
      );
    }
    priorAnswered++;
    session.send('1\r');
    // Give the agent a beat to advance before re-polling.
    await Bun.sleep(2000);
  }

  throw new Error(`Mode AskUserQuestion not reached within ${budgetMs}ms`);
}
|
||
|
||
// One paid, real-PTY test per entry in CASES. The suite is periodic-tier (it is
// gated on EVALS_TIER === 'periodic'), so the title says "periodic", not "gate".
describeE2E('/plan-ceo-review mode routing (periodic)', () => {
  for (const c of CASES) {
    test(
      `mode "${c.mode}" routes to its distinctive posture`,
      async () => {
        const session = await launchClaudePty({
          permissionMode: 'plan',
          timeoutMs: 540_000,
        });
        try {
          // Fixed pause before typing the slash command — presumably to let
          // the TUI finish starting up.
          await Bun.sleep(8000);
          const since = session.mark();
          session.send('/plan-ceo-review\r');

          const { modeIndex } = await navigateToModeAskUserQuestion(session, since, c.mode);

          // Record where the output buffer ends at mode-pick time, then send
          // the index, so only text produced after the pick is inspected.
          const sincePick = session.rawOutput().length;
          session.send(`${modeIndex}\r`);

          // Poll the downstream output until a posture-distinctive substring
          // shows up or the budget runs out.
          // NOTE(review): 8s startup + navigation (default budget 420s) + this
          // 240s window can exceed the 600s test timeout below; in that worst
          // case the run ends on the timeout rather than on the posture error.
          const budgetMs = 240_000;
          const start = Date.now();
          let postureMatched = false;
          let downstreamSnapshot = '';
          while (Date.now() - start < budgetMs) {
            await Bun.sleep(2500);
            if (session.exited()) {
              throw new Error(
                `claude exited (code=${session.exitCode()}) after mode pick.\n` +
                  `Downstream:\n${session.visibleSince(sincePick).slice(-2000)}`,
              );
            }
            downstreamSnapshot = session.visibleSince(sincePick);
            if (c.postureRe.test(downstreamSnapshot)) {
              postureMatched = true;
              break;
            }
            // Deliberately no early exit on plan_ready: the posture text may
            // arrive as the agent finishes writing the plan, so keep polling
            // until we either match posture or run the clock.
          }

          if (!postureMatched) {
            throw new Error(
              `Mode "${c.mode}" routing FAILED: no posture match for ${c.postureRe.source}.\n` +
                // Tells the reader whether the agent finished its plan without
                // posture language or was still working when time ran out.
                `plan_ready visible: ${isPlanReadyVisible(downstreamSnapshot)}\n` +
                `--- downstream visible since mode pick (last 3KB) ---\n` +
                downstreamSnapshot.slice(-3000),
            );
          }
        } finally {
          // Always tear the PTY down, including on assertion failure.
          await session.close();
        }
      },
      600_000,
    );
  }
});
|