gstack/test/skill-e2e-plan-ceo-mode-routing.test.ts

/**
 * /plan-ceo-review mode-routing E2E (periodic, paid, real-PTY).
 *
 * Asserts: when /plan-ceo-review reaches its Step 0F mode-selection
 * AskUserQuestion and the user picks HOLD SCOPE or SCOPE EXPANSION,
 * the downstream rendered output reflects that mode's distinctive
 * posture language.
 *
 * Why this exists: existing tests verify that the question fires. Nothing
 * verifies the answer actually routes. A regression where Step 0F shows
 * the question but the agent ignores the choice (e.g. always defaults
 * to EXPANSION) would not be caught by any prior test.
 *
 * Tier: periodic (not gate). Each run navigates 8-12 prior AUQs (telemetry,
 * proactive, routing, vendoring, brain, office-hours, premise×3, approach)
 * before reaching Step 0F. At ~30s per AUQ that's a 4-6 min navigation
 * phase per case. The full 2-case suite runs ~12-15 min, $3-4. Too slow
 * for gate-tier; weekly is fine.
 *
 * Mode coverage: HOLD SCOPE + SCOPE EXPANSION cover the two posture poles
 * (rigor vs ambition). SELECTIVE EXPANSION and SCOPE REDUCTION are V2 once
 * the navigation phase is shorter or has a deterministic fast-path through
 * Step 0A/0C-bis.
 *
 * Posture assertions: each mode has distinct downstream language. The
 * checks below are deliberately permissive — they catch the binary
 * "did the mode posture even apply" question, not Opus-specific phrasing.
 *
 *   HOLD SCOPE        — "rigor", "bulletproof", "hold scope", or "maximum rigor"
 *   SCOPE EXPANSION   — "expansion", "10x", "delight", "dream", "cathedral", or "opt-in"
 */

import { describe, test } from 'bun:test';
import {
  launchClaudePty,
  isNumberedOptionListVisible,
  isPermissionDialogVisible,
  parseNumberedOptions,
  isPlanReadyVisible,
  type ClaudePtySession,
} from './helpers/claude-pty-runner';

// Paid, real-PTY suite: it only runs when EVALS is set to a non-empty value
// AND the tier is exactly 'periodic'. In every other case the whole suite is
// registered through describe.skip.
const shouldRun = process.env.EVALS_TIER === 'periodic' && Boolean(process.env.EVALS);
const describeE2E = !shouldRun ? describe.skip : describe;

// Matches any of the four mode names (case-insensitive). Used to recognise
// the mode-selection AUQ by its option labels. No `g` flag, so repeated
// `.test()` calls carry no lastIndex state between them.
const MODE_RE = /HOLD SCOPE|SCOPE EXPANSION|SELECTIVE EXPANSION|SCOPE REDUCTION/i;

/** One routing scenario: which mode to pick and how to recognise its posture. */
interface ModeCase {
  /** Mode label to look for among the mode AUQ's options. */
  mode: 'HOLD SCOPE' | 'SCOPE EXPANSION';
  /**
   * Pattern run against the text visible since the mode pick. The case
   * passes as soon as any one alternative matches.
   */
  postureRe: RegExp;
}

// Deliberately permissive keyword sets, one per covered mode.
// NOTE(review): each pattern also accepts the mode's own label ("hold scope",
// "expansion"). If the TUI echoes the chosen label after the pick, the match
// could succeed without any real posture text — confirm against real output.
const CASES: ModeCase[] = [
  {
    mode: 'HOLD SCOPE',
    postureRe: /\b(rigor|bulletproof|hold\s*scope|maximum\s+rigor)\b/i,
  },
  {
    mode: 'SCOPE EXPANSION',
    postureRe: /\b(expansion|10x|delight|dream|cathedral|opt[\s-]?in)\b/i,
  },
];

/**
 * Walk through the AUQs that precede the mode question, answering each with
 * option 1, until an AUQ appears whose option labels contain one of the four
 * mode names.
 *
 * Nothing is sent for the mode AUQ itself; the caller makes the pick using
 * the returned index.
 *
 * @param session    Live PTY session to poll and drive.
 * @param since      Buffer marker; only output visible since it is inspected.
 * @param targetMode Mode whose option index should be returned.
 * @param opts       `maxNav` caps how many prior AUQs get answered
 *                   (default 12); `budgetMs` caps the polling loop's
 *                   wall-clock time (default 420_000).
 * @returns The index of the option whose label contains `targetMode`, plus
 *          the visible text captured when the mode AUQ was detected.
 * @throws If the session exits, the mode AUQ renders without `targetMode`,
 *         `maxNav` prior AUQs were already answered, or `budgetMs` elapses.
 */
async function navigateToModeAuq(
  session: ClaudePtySession,
  since: number,
  targetMode: ModeCase['mode'],
  opts: { maxNav?: number; budgetMs?: number } = {},
): Promise<{ modeIndex: number; visibleAtMode: string }> {
  // The mode AUQ sits behind the preamble and earlier-step gates (telemetry,
  // proactive, routing, vendoring, brain privacy, office-hours offer, three
  // premise questions, approach selection), so 12 hops is a conservative cap.
  const maxNav = opts.maxNav ?? 12;
  const budgetMs = opts.budgetMs ?? 420_000;

  // Renders an option list for error messages, one indented line per option.
  const formatChoices = (list: ReadonlyArray<{ index: number; label: string }>): string =>
    list.map(choice => `  ${choice.index}. ${choice.label}`).join('\n');

  const startedAt = Date.now();
  let answeredCount = 0;
  // Fingerprint of the last option list we acted on; '' means "none yet".
  let previousFingerprint = '';

  while (Date.now() - startedAt < budgetMs) {
    if (session.exited()) {
      throw new Error(
        `claude exited (code=${session.exitCode()}) during nav.\n` +
        `Last visible:\n${session.visibleSince(since).slice(-2000)}`,
      );
    }
    await Bun.sleep(2000);

    const screen = session.visibleSince(since);
    if (!isNumberedOptionListVisible(screen)) continue;
    const choices = parseNumberedOptions(screen);
    if (choices.length < 2) continue;

    // An unchanged list means we are still looking at the prompt we already
    // handled; skip it so the same prompt is never answered twice.
    const fingerprint = choices.map(choice => `${choice.index}:${choice.label}`).join('|');
    if (fingerprint === previousFingerprint) continue;
    previousFingerprint = fingerprint;

    // Any mode name among the labels identifies the mode AUQ.
    const isModeAuq = choices.some(choice => MODE_RE.test(choice.label));
    if (isModeAuq) {
      const match = choices.find(choice => choice.label.toUpperCase().includes(targetMode));
      if (match) {
        return { modeIndex: match.index, visibleAtMode: screen };
      }
      throw new Error(
        `Mode AUQ rendered but target "${targetMode}" not in option labels:\n` +
        formatChoices(choices),
      );
    }

    // Permission dialogs are granted with "1" but do not count toward maxNav.
    // Only the recent tail is classified: older permission text stays in the
    // visible-since buffer and would otherwise keep re-triggering this branch.
    if (isPermissionDialogVisible(screen.slice(-1500))) {
      session.send('1\r');
      await Bun.sleep(1500);
      continue;
    }

    // Some other AUQ: take option 1 (recommended), unless the hop cap is hit.
    if (answeredCount >= maxNav) {
      throw new Error(
        `Navigated ${maxNav} prior AUQs without reaching the mode AUQ. ` +
        `Last list:\n${formatChoices(choices)}`,
      );
    }
    answeredCount += 1;
    session.send('1\r');
    // Let the agent advance before the next poll.
    await Bun.sleep(2000);
  }

  throw new Error(`Mode AUQ not reached within ${budgetMs}ms`);
}

// Periodic-tier suite: one test per entry in CASES. Each test launches a
// plan-mode PTY session, navigates to the mode AUQ, picks the case's mode,
// and then polls the output rendered after the pick for the mode's posture
// keywords.
describeE2E('/plan-ceo-review mode routing (periodic)', () => {
  // Per-test timeout handed to bun:test below.
  const testTimeoutMs = 600_000;
  // Upper bound on how long we wait for posture text after the mode pick.
  const maxPostureBudgetMs = 240_000;
  // Slice of the test timeout held back for building the failure message and
  // closing the session.
  const teardownReserveMs = 10_000;

  for (const c of CASES) {
    test(
      `mode "${c.mode}" routes to its distinctive posture`,
      async () => {
        const testStart = Date.now();
        // NOTE(review): timeoutMs (540s) is below the 600s test timeout. If it
        // ends the session, the exit check below reports it — confirm semantics.
        const session = await launchClaudePty({
          permissionMode: 'plan',
          timeoutMs: 540_000,
        });
        try {
          // Let the TUI settle before issuing the slash command.
          await Bun.sleep(8000);
          const since = session.mark();
          session.send('/plan-ceo-review\r');

          const { modeIndex } = await navigateToModeAuq(session, since, c.mode);

          // Mark the buffer at mode-pick time using the same marker call as
          // `since`, so visibleSince() gets an offset in its own terms, then
          // send the chosen index.
          const sincePick = session.mark();
          session.send(`${modeIndex}\r`);

          // Startup (8s) + navigation (default budget 420s) + the full posture
          // wait (240s) can exceed the test timeout. Clamp the posture wait to
          // the time actually left, so a miss fails with the descriptive error
          // below rather than a bare runner timeout.
          const remainingMs = testTimeoutMs - teardownReserveMs - (Date.now() - testStart);
          const budgetMs = Math.max(0, Math.min(maxPostureBudgetMs, remainingMs));
          const start = Date.now();
          let postureMatched = false;
          let planReadySeen = false;
          let downstreamSnapshot = '';
          while (Date.now() - start < budgetMs) {
            await Bun.sleep(2500);

            // Inspect the latest output BEFORE the exit check, so posture text
            // printed just before the process ended still counts.
            downstreamSnapshot = session.visibleSince(sincePick);
            if (c.postureRe.test(downstreamSnapshot)) {
              postureMatched = true;
              break;
            }

            // Plan-ready alone is not a reason to stop: posture text may still
            // arrive while the plan is being written. Record it for diagnostics.
            if (!planReadySeen && isPlanReadyVisible(downstreamSnapshot)) {
              planReadySeen = true;
            }

            if (session.exited()) {
              throw new Error(
                `claude exited (code=${session.exitCode()}) after mode pick.\n` +
                `Downstream:\n${downstreamSnapshot.slice(-2000)}`,
              );
            }
          }
          if (!postureMatched) {
            throw new Error(
              `Mode "${c.mode}" routing FAILED: no posture match for ${c.postureRe.source}.\n` +
              `plan-ready seen: ${planReadySeen}; waited up to ${budgetMs}ms after mode pick.\n` +
              `--- downstream visible since mode pick (last 3KB) ---\n` +
              downstreamSnapshot.slice(-3000),
            );
          }
        } finally {
          await session.close();
        }
      },
      testTimeoutMs,
    );
  }
});