Files
gstack/test/skill-e2e-plan-ceo-mode-routing.test.ts
T
Garry Tan fa78a20188 test: extract classifyVisible() + permission-dialog filter in PTY runner
Pure classifier extracted from runPlanSkillObservation's polling loop so
unit tests can exercise the actual branch order with synthetic input
strings. Runner gains:

- env? passthrough on runPlanSkillObservation (forwarded to launchClaudePty).
  gstack-config does not yet honor env overrides; plumbing is in place for a
  future change to make tests hermetic.
- TAIL_SCAN_BYTES = 1500 exported constant. Replaces a duplicated magic
  number in test/skill-e2e-plan-ceo-mode-routing.test.ts so tuning stays
  in sync.
- isPermissionDialogVisible: the bare phrase "Do you want to proceed?" now
  requires a file-edit context co-trigger. Other clauses unchanged. Skill
  questions that contain the bare phrase are no longer mis-classified.
- classifyVisible(visible): pure function. Branch order silent_write →
  plan_ready → asked → null. Permission dialogs filtered out of the
  'asked' classification so a permission prompt cannot pose as a Step 0
  skill question.

Adds 24 unit tests covering all classifier branches, edge cases, and the
co-trigger contract.
2026-04-28 00:00:10 -07:00

213 lines
8.5 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
/**
* /plan-ceo-review mode-routing E2E (periodic, paid, real-PTY).
*
* Asserts: when /plan-ceo-review reaches its Step 0F mode-selection
* AskUserQuestion and the user picks HOLD SCOPE or SCOPE EXPANSION,
* the downstream rendered output reflects that mode's distinctive
* posture language.
*
* Why this exists: existing tests verify that the question fires. Nothing
* verifies the answer actually routes. A regression where Step 0F shows
* the question but the agent ignores the choice (e.g. always defaults
* to EXPANSION) would not be caught by any prior test.
*
* Tier: periodic (not gate). Each run navigates 8-12 prior AskUserQuestions (telemetry,
* proactive, routing, vendoring, brain, office-hours, premise×3, approach)
* before reaching Step 0F. At ~30s per AskUserQuestion that's a 4-6 min navigation
* phase per case. The full 2-case suite runs ~12-15 min, $3-4. Too slow
* for gate-tier; weekly is fine.
*
* Mode coverage: HOLD SCOPE + SCOPE EXPANSION cover the two posture poles
* (rigor vs ambition). SELECTIVE EXPANSION and SCOPE REDUCTION are V2 once
* the navigation phase is shorter or has a deterministic fast-path through
* Step 0A/0C-bis.
*
* Posture assertions: each mode has distinct downstream language. The
* checks below are deliberately permissive — they catch the binary
* "did the mode posture even apply" question, not Opus-specific phrasing.
*
* HOLD SCOPE — "rigor" (incl. "maximum rigor") or "bulletproof" or "hold scope"
* SCOPE EXPANSION — "expansion" or "10x" or "delight" or "dream" or "cathedral" or "opt-in"
*/
import { describe, test } from 'bun:test';
import {
launchClaudePty,
isNumberedOptionListVisible,
isPermissionDialogVisible,
parseNumberedOptions,
isPlanReadyVisible,
TAIL_SCAN_BYTES,
type ClaudePtySession,
} from './helpers/claude-pty-runner';
// Opt-in guard: the suite only runs when EVALS is set to a non-empty value
// AND EVALS_TIER is exactly 'periodic'; otherwise every test is skipped.
const shouldRun = Boolean(process.env.EVALS) && process.env.EVALS_TIER === 'periodic';
const describeE2E = !shouldRun ? describe.skip : describe;
/** Case-insensitive match for any of the four Step 0F mode names inside an option label. */
const MODE_RE: RegExp = /HOLD SCOPE|SCOPE EXPANSION|SELECTIVE EXPANSION|SCOPE REDUCTION/i;
/** One mode-routing scenario: the mode to pick and how to recognise its posture. */
interface ModeCase {
  /** Mode name to select at the Step 0F AskUserQuestion. */
  mode:
    | 'HOLD SCOPE'
    | 'SCOPE EXPANSION';
  /**
   * Regex tested against the text visible since the mode pick. The case
   * passes as soon as any one of its alternatives matches.
   */
  postureRe: RegExp;
}

// The two posture poles covered by this suite (see the file header for why
// the remaining modes are deferred).
const CASES: Array<ModeCase> = [
  {
    mode: 'HOLD SCOPE',
    postureRe: /\b(rigor|bulletproof|hold\s*scope|maximum\s+rigor)\b/i,
  },
  {
    mode: 'SCOPE EXPANSION',
    postureRe: /\b(expansion|10x|delight|dream|cathedral|opt[\s-]?in)\b/i,
  },
];
/**
 * Drive the PTY through the AskUserQuestions that precede Step 0F by answering
 * each one with option 1, until a numbered option list containing one of the
 * 4 mode names is rendered. The mode AskUserQuestion itself is NOT answered;
 * the caller sends the returned index.
 *
 * @param session    PTY session to poll and drive.
 * @param since      Buffer marker; only text visible since this marker is inspected.
 * @param targetMode Mode whose option index should be returned.
 * @param opts       `maxNav` — max prior AskUserQuestions to answer (default 12);
 *                   `budgetMs` — overall wall-clock budget (default 420_000).
 * @returns The index of the option whose label contains `targetMode`, plus the
 *          visible text at the moment the mode AskUserQuestion was detected.
 * @throws If claude exits, if the mode AskUserQuestion renders without
 *         `targetMode`, if `maxNav` prior AskUserQuestions were already
 *         answered and yet another non-mode list appears, or if `budgetMs`
 *         elapses.
 */
async function navigateToModeAskUserQuestion(
  session: ClaudePtySession,
  since: number,
  targetMode: ModeCase['mode'],
  opts: { maxNav?: number; budgetMs?: number } = {},
): Promise<{ modeIndex: number; visibleAtMode: string }> {
  // /plan-ceo-review's mode AskUserQuestion (Step 0F) sits behind several preamble
  // and Step 0A-0C-bis gates: telemetry, proactive, routing, vendoring,
  // brain privacy, office-hours offer, premise challenge (3 questions),
  // approach selection. 12 hops is the conservative ceiling.
  const maxNav = opts.maxNav ?? 12;
  const budgetMs = opts.budgetMs ?? 420_000;
  const start = Date.now();
  let priorAnswered = 0;
  // Signature of the last option list we acted on. Lists with >= 2 options
  // never produce an empty signature, so '' means "nothing seen yet".
  let lastSig = '';

  // Renders an option list for error messages.
  const formatOptions = (list: ReadonlyArray<{ index: number; label: string }>): string =>
    list.map(o => ` ${o.index}. ${o.label}`).join('\n');

  while (Date.now() - start < budgetMs) {
    if (session.exited()) {
      throw new Error(
        `claude exited (code=${session.exitCode()}) during nav.\n` +
          `Last visible:\n${session.visibleSince(since).slice(-2000)}`,
      );
    }
    await Bun.sleep(2000);
    const visible = session.visibleSince(since);
    if (!isNumberedOptionListVisible(visible)) continue;
    const options = parseNumberedOptions(visible);
    if (options.length < 2) continue;

    // Has the rendered list changed since last poll? If not, we're seeing
    // the same prompt and shouldn't double-press.
    // NOTE(review): this dedupe runs before the permission-dialog check below,
    // so a second dialog whose option labels are identical to the previous
    // list is skipped until the budget runs out — confirm that cannot occur.
    const sig = options.map(o => `${o.index}:${o.label}`).join('|');
    if (sig === lastSig) continue;
    lastSig = sig;

    // Is THIS the mode AskUserQuestion?
    if (options.some(o => MODE_RE.test(o.label))) {
      const target = options.find(o => o.label.toUpperCase().includes(targetMode));
      if (!target) {
        throw new Error(
          `Mode AskUserQuestion rendered but target "${targetMode}" not in option labels:\n` +
            formatOptions(options),
        );
      }
      return { modeIndex: target.index, visibleAtMode: visible };
    }

    // Permission dialog? Grant with "1" but don't count it against nav budget.
    // Classify on the recent tail only — old permission text persists in
    // visibleSince and would re-trigger forever.
    //
    // Note: runPlanSkillObservation has its own permission-dialog filter that
    // simply skips classification (since it observes, doesn't drive). This nav
    // loop drives the PTY directly via launchClaudePty and so owns its own
    // dialog handling — granting with "1" so the workflow advances. Both
    // paths share TAIL_SCAN_BYTES as the recent-tail window so tuning stays
    // in sync.
    if (isPermissionDialogVisible(visible.slice(-TAIL_SCAN_BYTES))) {
      session.send('1\r');
      await Bun.sleep(1500);
      continue;
    }

    // Not the mode AskUserQuestion — answer with option 1 (recommended) and continue.
    if (priorAnswered >= maxNav) {
      throw new Error(
        `Navigated ${maxNav} prior AskUserQuestions without reaching the mode AskUserQuestion. ` +
          `Last list:\n${formatOptions(options)}`,
      );
    }
    priorAnswered++;
    session.send('1\r');
    // Give the agent a beat to advance before re-polling.
    await Bun.sleep(2000);
  }

  // Budget exhausted: include the same tail the exit error reports so a
  // timeout is as debuggable as a crash.
  throw new Error(
    `Mode AskUserQuestion not reached within ${budgetMs}ms ` +
      `(answered ${priorAnswered} prior AskUserQuestions).\n` +
      `Last visible:\n${session.visibleSince(since).slice(-2000)}`,
  );
}
// Periodic-tier suite: one real-PTY test per entry in CASES. Each test
// navigates to the Step 0F mode AskUserQuestion, picks the case's mode, then
// polls the text rendered after the pick until the case's posture regex
// matches or the downstream budget runs out.
describeE2E('/plan-ceo-review mode routing (periodic)', () => {
  for (const c of CASES) {
    test(
      `mode "${c.mode}" routes to its distinctive posture`,
      async () => {
        // NOTE(review): worst case is 8s startup + 420s default nav budget +
        // 240s downstream budget = 668s, which exceeds both the 600s test
        // timeout below and this 540s PTY timeout — confirm the intended
        // ceilings (exact semantics of timeoutMs are defined by launchClaudePty).
        const session = await launchClaudePty({
          permissionMode: 'plan',
          timeoutMs: 540_000,
        });
        try {
          // Fixed pause before the first command — presumably lets the TUI
          // finish booting; TODO confirm.
          await Bun.sleep(8000);
          const since = session.mark();
          session.send('/plan-ceo-review\r');
          const { modeIndex } = await navigateToModeAskUserQuestion(session, since, c.mode);

          // Mark the buffer at mode-pick time, then send the index. Uses
          // mark() (like `since` above) so visibleSince() always receives a
          // marker produced by the session itself.
          const sincePick = session.mark();
          session.send(`${modeIndex}\r`);

          // Wait for downstream evidence: a posture-distinctive substring in
          // the text rendered after the pick.
          const budgetMs = 240_000;
          const start = Date.now();
          let postureMatched = false;
          let downstreamSnapshot = '';
          while (Date.now() - start < budgetMs) {
            await Bun.sleep(2500);
            if (session.exited()) {
              throw new Error(
                `claude exited (code=${session.exitCode()}) after mode pick.\n` +
                  `Downstream:\n${session.visibleSince(sincePick).slice(-2000)}`,
              );
            }
            downstreamSnapshot = session.visibleSince(sincePick);
            if (c.postureRe.test(downstreamSnapshot)) {
              postureMatched = true;
              break;
            }
            // Don't bail early on plan_ready (or a follow-up AskUserQuestion)
            // alone — the posture text may arrive as the agent finishes
            // writing the plan. Only stop once we either match posture or
            // run the clock.
          }

          if (!postureMatched) {
            // Report whether plan_ready was visible at the end so a failure
            // distinguishes "finished without posture language" from "stalled".
            const planReady = isPlanReadyVisible(downstreamSnapshot);
            throw new Error(
              `Mode "${c.mode}" routing FAILED: no posture match for ${c.postureRe.source}.\n` +
                `plan_ready visible at timeout: ${planReady}\n` +
                `--- downstream visible since mode pick (last 3KB) ---\n` +
                downstreamSnapshot.slice(-3000),
            );
          }
        } finally {
          await session.close();
        }
      },
      600_000,
    );
  }
});