From 864fba73a7be1fc939ac81cababb5be68b85e5cb Mon Sep 17 00:00:00 2001
From: Garry Tan <garrytan@gmail.com>
Date: Wed, 29 Apr 2026 18:27:53 -0700
Subject: [PATCH] test: add firstAUQPick + plan-ceo skip-interview routing
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Calibration run 1 surfaced a second issue beyond the parser bug: the
default pick of 1 on /plan-ceo-review's scope-selection AUQ routes
the agent to "branch diff vs main" — so it reviews the gstack PR
itself (recursive!) instead of the seeded fixture plan we sent.

Added firstAUQPick callback to runPlanSkillCounting. Override applies
only to the FIRST AUQ; subsequent presses keep using defaultPick.

ceoStep0Boundary now fires on either the mode-pick AUQ (existing path)
or any AUQ containing "Skip interview and plan immediately" — which
is the scope-selection AUQ. Picking that option bypasses Step 0 and
routes straight to review-phase using the chat-paste plan as context.

Plan-ceo test wires firstAUQPick = pickSkipInterview, which finds the
"Skip interview" option by label. If the option labels change, it falls
back to the "describe inline" option, and finally to option 1.

One new unit test: ceoStep0Boundary fires on the scope-selection
fixture. The existing mode-pick test is unchanged and still fires.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 test/helpers/claude-pty-runner.ts             | 27 +++++++++++++++--
 test/helpers/claude-pty-runner.unit.test.ts   | 20 +++++++++++++
 test/skill-e2e-plan-ceo-finding-count.test.ts | 29 +++++++++++++++++++
 3 files changed, 73 insertions(+), 3 deletions(-)

diff --git a/test/helpers/claude-pty-runner.ts b/test/helpers/claude-pty-runner.ts
index a85ba863..790e8587 100644
--- a/test/helpers/claude-pty-runner.ts
+++ b/test/helpers/claude-pty-runner.ts
@@ -621,7 +621,12 @@ export function assertReviewReportAtBottom(
  * files import them directly.
  */
 export const ceoStep0Boundary: Step0BoundaryPredicate = (fp) =>
-  fp.options.some((o) => MODE_RE.test(o.label));
+  // Mode-pick path (Step 0F): one of HOLD SCOPE / SCOPE EXPANSION / etc.
+  fp.options.some((o) => MODE_RE.test(o.label)) ||
+  // Skip-interview path: the scope-selection AUQ offers "Skip interview and
+  // plan immediately"; picking it bypasses the rest of Step 0, so the boundary
+  // fires on any AUQ with an option matching either half of that label.
+  fp.options.some((o) => /skip\s+interview|plan\s+immediately/i.test(o.label));
 
 export const engStep0Boundary: Step0BoundaryPredicate = (fp) =>
   /scope reduction recommendation|cross[\s-]?project learnings/i.test(
@@ -1097,6 +1102,18 @@ export async function runPlanSkillCounting(opts: {
   reviewCountCeiling: number;
   /** Numbered option to press by default. Defaults to 1 (recommended). */
   defaultPick?: number;
+  /**
+   * Optional override for the FIRST AUQ observed. Receives the fingerprint;
+   * returns the option index to press. Subsequent AUQs always use defaultPick.
+   *
+   * Skill-specific routing helper: /plan-ceo-review's first AUQ asks "what
+   * scope?" with options like "branch diff" / "describe inline" / "skip
+   * interview". Pressing the default 1 routes to "branch diff" (the wrong
+   * review target for a seeded fixture). firstAUQPick lets the test pick
+   * "Skip interview" or "describe inline" so the agent reviews the
+   * follow-up plan content the test sent, not the git diff.
+   */
+  firstAUQPick?: (fp: AskUserQuestionFingerprint) => number;
   /** Working directory. Default process.cwd() (repo cwd holds skill registry). */
   cwd?: string;
   /** Total budget for skill to reach a terminal outcome. Default 1_500_000 (25 min). */
@@ -1120,6 +1137,7 @@ export async function runPlanSkillCounting(opts: {
   let boundaryFired = false;
   let step0Count = 0;
   let reviewCount = 0;
+  let isFirstAUQ = true;
   let lastSig = '';
 
   function snapshot(
@@ -1239,8 +1257,11 @@ export async function runPlanSkillCounting(opts: {
       if (boundaryFired) reviewCount += 1;
       else step0Count += 1;
 
-      // Press to advance.
-      session.send(`${defaultPick}\r`);
+      // Press to advance — first AUQ may use the override pick.
+      const pickIdx =
+        isFirstAUQ && opts.firstAUQPick ? opts.firstAUQPick(fp) : defaultPick;
+      isFirstAUQ = false;
+      session.send(`${pickIdx}\r`);
 
       // Evaluate boundary AFTER pressing — if THIS AUQ was the last Step 0
       // question, all subsequent AUQs go to reviewCount.
diff --git a/test/helpers/claude-pty-runner.unit.test.ts b/test/helpers/claude-pty-runner.unit.test.ts
index 3960f322..e830d730 100644
--- a/test/helpers/claude-pty-runner.unit.test.ts
+++ b/test/helpers/claude-pty-runner.unit.test.ts
@@ -665,6 +665,26 @@ describe('Step0BoundaryPredicate per-skill', () => {
       expect(ceoStep0Boundary(f)).toBe(true);
     });
 
+    test('FIRES on scope-selection AUQ with "Skip interview" option (skip-interview path)', () => {
+      // After calibration run 1: plan-ceo's first AUQ is scope-selection,
+      // and we route via "Skip interview and plan immediately" to bypass
+      // Step 0 entirely. Boundary must fire on this AUQ so subsequent
+      // AUQs go to reviewCount.
+      const f = fp(
+        'What scope do you want me to CEO-review?',
+        [
+          "The branch's diff vs main",
+          'A specific plan file',
+          "An idea you'll describe inline",
+          'Cancel — wrong skill',
+          'Type something.',
+          'Chat about this',
+          'Skip interview and plan immediately',
+        ],
+      );
+      expect(ceoStep0Boundary(f)).toBe(true);
+    });
+
     test('does NOT fire on premise challenge AUQs', () => {
       const f = fp('D1 — Premise check: is this the right problem?', ['Yes', 'No', 'Other']);
       expect(ceoStep0Boundary(f)).toBe(false);
diff --git a/test/skill-e2e-plan-ceo-finding-count.test.ts b/test/skill-e2e-plan-ceo-finding-count.test.ts
index 0bc94d9c..850c1a03 100644
--- a/test/skill-e2e-plan-ceo-finding-count.test.ts
+++ b/test/skill-e2e-plan-ceo-finding-count.test.ts
@@ -21,8 +21,36 @@ import {
   runPlanSkillCounting,
   ceoStep0Boundary,
   assertReviewReportAtBottom,
+  type AskUserQuestionFingerprint,
 } from './helpers/claude-pty-runner';
 
+/**
+ * Picks the option to press on /plan-ceo-review's first AUQ ("what scope?"):
+ *   1. Branch diff vs main
+ *   2. A specific plan file or design doc
+ *   3. An idea you'll describe inline
+ *   ...
+ *   7. Skip interview and plan immediately
+ *
+ * The default pick (1) reviews "branch diff vs main" — the gstack PR itself,
+ * recursively — instead of our seeded fixture. "Skip interview" bypasses
+ * Step 0 and reviews the chat context, where the follow-up plan was pasted.
+ *
+ * @returns index of "Skip interview", else "describe inline", else 1.
+ */
+function pickSkipInterview(fp: AskUserQuestionFingerprint): number {
+  const skipOpt = fp.options.find((o) =>
+    /skip\s+interview|plan\s+immediately/i.test(o.label),
+  );
+  if (skipOpt) return skipOpt.index;
+  // Fallback: "describe inline" also routes to using our pasted plan.
+  const inlineOpt = fp.options.find((o) =>
+    /describe.*inline|inline.*idea/i.test(o.label),
+  );
+  if (inlineOpt) return inlineOpt.index;
+  return 1; // neither label matched: fall back to option 1
+}
+
 const shouldRun = !!process.env.EVALS && process.env.EVALS_TIER === 'periodic';
 const describeE2E = shouldRun ? describe : describe.skip;
 
@@ -93,6 +121,7 @@ describeE2E('/plan-ceo-review per-finding AskUserQuestion count (periodic)', ()
         followUpPrompt: PLAN_CEO_5_FINDINGS,
         isLastStep0AUQ: ceoStep0Boundary,
         reviewCountCeiling: CEILING_DISTINCT + 1, // hard cap above assertion ceiling
+        firstAUQPick: pickSkipInterview, // bypass scope-selection, route to review
         cwd: process.cwd(),
         timeoutMs: 1_500_000, // 25 min
         env: { QUESTION_TUNING: 'false', EXPLAIN_LEVEL: 'default' },