Files
gstack/test/skill-e2e-plan-ceo-mode-routing.test.ts
T
Garry Tan e6fd776a37 feat(test): 3 periodic-tier real-PTY E2E tests
skill-e2e-plan-ceo-mode-routing.test.ts (~$3/run, 6-10 min/case):
- Verifies AUQ answer routing: HOLD SCOPE → rigor/bulletproof posture
  language; SCOPE EXPANSION → expansion/10x/dream language. Each case
  navigates 8-12 prior AUQs (telemetry, proactive, routing, vendoring,
  brain, office-hours, premise, approach) before hitting Step 0F.
- Periodic, not gate: navigation phase too slow for PR-blocking.
  V2 expansion to 4 modes (SELECTIVE + REDUCTION) when nav is faster.

skill-e2e-ship-idempotency.test.ts (~$3/run, 5-10 min):
- Builds a real git fixture with VERSION 0.0.2 already bumped, matching
  package.json, CHANGELOG entry, pushed to a local bare remote. Runs
  /ship in plan mode and asserts STATE: ALREADY_BUMPED echoes from the
  Step 12 idempotency check, OR plan_ready terminates without mutation.
- Snapshots VERSION + package.json + CHANGELOG entry count + commit
  count + branch HEAD before/after; fails if any changed.

skill-e2e-autoplan-chain.test.ts (~$8/run, 12-18 min):
- Asserts /autoplan phases run sequentially: tees timestamps as each
  "**Phase N complete.**" marker first appears. Phase 1 (CEO) must
  precede Phase 3 (Eng); Phase 2 (Design) is optional but if it
  appears, must sit between 1 and 3.
- Auto-grants permission dialogs that fire during phase transitions.

All three auto-handle permission dialogs (preamble side-effects on
fresh user envs without .feature-prompted-* markers).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-26 04:36:51 -07:00

205 lines
7.8 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
/**
* /plan-ceo-review mode-routing E2E (periodic, paid, real-PTY).
*
* Asserts: when /plan-ceo-review reaches its Step 0F mode-selection
* AskUserQuestion and the user picks HOLD SCOPE or SCOPE EXPANSION,
* the downstream rendered output reflects that mode's distinctive
* posture language.
*
* Why this exists: existing tests verify that the question fires. Nothing
* verifies the answer actually routes. A regression where Step 0F shows
* the question but the agent ignores the choice (e.g. always defaults
* to EXPANSION) would not be caught by any prior test.
*
* Tier: periodic (not gate). Each run navigates 8-12 prior AUQs (telemetry,
* proactive, routing, vendoring, brain, office-hours, premise×3, approach)
* before reaching Step 0F. At ~30s per AUQ that's a 4-6 min navigation
* phase per case. The full 2-case suite runs ~12-15 min, $3-4. Too slow
* for gate-tier; weekly is fine.
*
* Mode coverage: HOLD SCOPE + SCOPE EXPANSION cover the two posture poles
* (rigor vs ambition). SELECTIVE EXPANSION and SCOPE REDUCTION are V2 once
* the navigation phase is shorter or has a deterministic fast-path through
* Step 0A/0C-bis.
*
* Posture assertions: each mode has distinct downstream language. The
* checks below are deliberately permissive — they catch the binary
* "did the mode posture even apply" question, not Opus-specific phrasing.
*
* HOLD SCOPE — "rigor", "bulletproof", "hold scope", or "maximum rigor"
* SCOPE EXPANSION — "expansion", "10x", "delight", "dream", "cathedral", or "opt-in"
*/
import { describe, test } from 'bun:test';
import {
launchClaudePty,
isNumberedOptionListVisible,
isPermissionDialogVisible,
parseNumberedOptions,
isPlanReadyVisible,
type ClaudePtySession,
} from './helpers/claude-pty-runner';
// Opt-in gate: the suite is registered for real only when EVALS is set to a
// non-empty value AND EVALS_TIER is exactly 'periodic'. In every other
// environment it is registered through describe.skip.
const shouldRun = Boolean(process.env.EVALS) && process.env.EVALS_TIER === 'periodic';
const describeE2E = !shouldRun ? describe.skip : describe;
/**
 * Case-insensitive match for any of the four mode names. Used to recognise
 * the mode-selection AUQ from its option labels.
 */
const MODE_RE = /HOLD SCOPE|SCOPE EXPANSION|SELECTIVE EXPANSION|SCOPE REDUCTION/i;

/** One routing scenario: the mode to pick and how to recognise its posture. */
interface ModeCase {
  /** Mode name looked up (upper-cased substring match) in the AUQ option labels. */
  mode: 'HOLD SCOPE' | 'SCOPE EXPANSION';
  /**
   * Applied to the text visible since the mode pick. The case passes as soon
   * as any one of the regex's alternatives matches.
   */
  postureRe: RegExp;
}

// NOTE(review): each posture regex also matches its own mode label
// ("hold scope" / "expansion"). If the TUI echoes or redraws the selected
// option after the pick, a match may not prove the posture was applied —
// confirm against real output.
const HOLD_SCOPE_POSTURE_RE = /\b(rigor|bulletproof|hold\s*scope|maximum\s+rigor)\b/i;
const SCOPE_EXPANSION_POSTURE_RE = /\b(expansion|10x|delight|dream|cathedral|opt[\s-]?in)\b/i;

/** The two posture poles covered by this suite, in execution order. */
const CASES: ModeCase[] = [
  { mode: 'HOLD SCOPE', postureRe: HOLD_SCOPE_POSTURE_RE },
  { mode: 'SCOPE EXPANSION', postureRe: SCOPE_EXPANSION_POSTURE_RE },
];
/**
 * Answer the AskUserQuestion prompts that precede the mode selection with
 * option 1 until a numbered list whose labels match one of the four mode
 * names (MODE_RE) is on screen, then report where `targetMode` sits in it.
 *
 * The mode AUQ itself is NOT answered here; the caller sends the pick.
 *
 * Each newly rendered numbered list is classified in this order:
 *   1. mode AUQ          -> return;
 *   2. permission dialog -> grant with "1", not counted against `maxNav`;
 *   3. any other list    -> answer with "1" and count it.
 *
 * @param session    PTY session to poll and send keystrokes to.
 * @param since      Marker passed to `session.visibleSince` on every poll.
 * @param targetMode Mode whose option index should be returned.
 * @param opts       `maxNav`: how many non-mode AUQs may be answered
 *                   (default 12). `budgetMs`: wall-clock cap for the whole
 *                   navigation (default 420000).
 * @returns `modeIndex`, the index of the option whose upper-cased label
 *          contains `targetMode`, and `visibleAtMode`, the visible text at
 *          the moment the mode AUQ was detected.
 * @throws Error if the process exits, if the mode AUQ renders without the
 *         target label, if `maxNav` prior AUQs are exhausted, or if
 *         `budgetMs` elapses first.
 */
async function navigateToModeAuq(
  session: ClaudePtySession,
  since: number,
  targetMode: ModeCase['mode'],
  opts: { maxNav?: number; budgetMs?: number } = {},
): Promise<{ modeIndex: number; visibleAtMode: string }> {
  // /plan-ceo-review's mode AUQ (Step 0F) sits behind several preamble
  // and Step 0A-0C-bis gates: telemetry, proactive, routing, vendoring,
  // brain privacy, office-hours offer, premise challenge (3 questions),
  // approach selection. 12 hops is the conservative ceiling.
  const maxNav = opts.maxNav ?? 12;
  const budgetMs = opts.budgetMs ?? 420_000;
  const start = Date.now();
  let priorAnswered = 0;
  // Signature of the last list we acted on. Parsed lists always have at
  // least two entries, so the empty string can never equal a real signature.
  let lastSignature = '';

  // Shared renderer for the option dumps embedded in error messages.
  const formatOptions = (list: ReadonlyArray<{ index: number; label: string }>): string =>
    list.map(o => ` ${o.index}. ${o.label}`).join('\n');

  while (Date.now() - start < budgetMs) {
    if (session.exited()) {
      throw new Error(
        `claude exited (code=${session.exitCode()}) during nav.\n` +
        `Last visible:\n${session.visibleSince(since).slice(-2000)}`,
      );
    }
    await Bun.sleep(2000);

    const visible = session.visibleSince(since);
    if (!isNumberedOptionListVisible(visible)) continue;
    const options = parseNumberedOptions(visible);
    if (options.length < 2) continue;

    // Has the rendered list changed since last poll? If not, we're seeing
    // the same prompt and shouldn't double-press.
    const signature = options.map(o => `${o.index}:${o.label}`).join('|');
    if (signature === lastSignature) continue;
    lastSignature = signature;

    // Is THIS the mode AUQ?
    if (options.some(o => MODE_RE.test(o.label))) {
      const target = options.find(o => o.label.toUpperCase().includes(targetMode));
      if (!target) {
        throw new Error(
          `Mode AUQ rendered but target "${targetMode}" not in option labels:\n` +
          formatOptions(options),
        );
      }
      return { modeIndex: target.index, visibleAtMode: visible };
    }

    // Permission dialog? Grant with "1" but don't count it against nav budget.
    // Classify on the recent tail only — old permission text persists in
    // visibleSince and would re-trigger forever.
    if (isPermissionDialogVisible(visible.slice(-1500))) {
      session.send('1\r');
      await Bun.sleep(1500);
      continue;
    }

    // Not the mode AUQ — answer with option 1 (recommended) and continue.
    if (priorAnswered >= maxNav) {
      throw new Error(
        `Navigated ${maxNav} prior AUQs without reaching the mode AUQ. ` +
        `Last list:\n${formatOptions(options)}`,
      );
    }
    priorAnswered++;
    session.send('1\r');
    // Give the agent a beat to advance before re-polling.
    await Bun.sleep(2000);
  }

  // Include the same screen tail as the exit error so a stalled navigation
  // can be diagnosed from the failure message alone.
  throw new Error(
    `Mode AUQ not reached within ${budgetMs}ms ` +
    `(answered ${priorAnswered} prior AUQs).\n` +
    `Last visible:\n${session.visibleSince(since).slice(-2000)}`,
  );
}
/**
 * Periodic-tier suite: for each entry in CASES, launch a claude PTY session,
 * run /plan-ceo-review, navigate to the mode AUQ, pick the case's mode, and
 * require the text rendered after the pick to match the case's posture regex.
 */
describeE2E('/plan-ceo-review mode routing (periodic)', () => {
  // Per-test timeout handed to the runner as the third argument of test().
  const testTimeoutMs = 600_000;
  // Ceiling on the post-pick wait for posture language.
  const maxDownstreamMs = 240_000;
  // Held back so the descriptive failure and session.close() can run before
  // the runner's own timeout fires.
  const teardownMarginMs = 15_000;

  for (const c of CASES) {
    test(
      `mode "${c.mode}" routes to its distinctive posture`,
      async () => {
        const testStart = Date.now();
        const session = await launchClaudePty({
          permissionMode: 'plan',
          timeoutMs: 540_000,
        });
        try {
          // Fixed settle delay before the slash command is sent.
          await Bun.sleep(8000);
          const since = session.mark();
          session.send('/plan-ceo-review\r');
          const { modeIndex } = await navigateToModeAuq(session, since, c.mode);

          // Mark the buffer BEFORE answering so only output produced after
          // the pick is scanned for posture language.
          const sincePick = session.mark();
          session.send(`${modeIndex}\r`);

          // Boot delay + navigation (420s default) + a fixed 240s wait would
          // exceed testTimeoutMs, so clamp the wait to the time actually left.
          const elapsedMs = Date.now() - testStart;
          const budgetMs = Math.min(
            maxDownstreamMs,
            testTimeoutMs - teardownMarginMs - elapsedMs,
          );
          const start = Date.now();
          let postureMatched = false;
          let planReadySeen = false;
          let downstreamSnapshot = '';
          while (Date.now() - start < budgetMs) {
            await Bun.sleep(2500);
            if (session.exited()) {
              throw new Error(
                `claude exited (code=${session.exitCode()}) after mode pick.\n` +
                `Downstream:\n${session.visibleSince(sincePick).slice(-2000)}`,
              );
            }
            downstreamSnapshot = session.visibleSince(sincePick);
            if (c.postureRe.test(downstreamSnapshot)) {
              postureMatched = true;
              break;
            }
            // Don't bail early on plan_ready alone — the posture text may
            // arrive as the agent finishes writing the plan. Just remember
            // that it was seen so the failure message can report it.
            if (isPlanReadyVisible(downstreamSnapshot)) {
              planReadySeen = true;
            }
          }
          if (!postureMatched) {
            throw new Error(
              `Mode "${c.mode}" routing FAILED: no posture match for ${c.postureRe.source}.\n` +
              `(waited up to ${Math.max(0, budgetMs)}ms after mode pick; ` +
              `plan_ready seen: ${planReadySeen})\n` +
              `--- downstream visible since mode pick (last 3KB) ---\n` +
              downstreamSnapshot.slice(-3000),
            );
          }
        } finally {
          await session.close();
        }
      },
      testTimeoutMs,
    );
  }
});