From 864fba73a7be1fc939ac81cababb5be68b85e5cb Mon Sep 17 00:00:00 2001 From: Garry Tan Date: Wed, 29 Apr 2026 18:27:53 -0700 Subject: [PATCH] test: add firstAUQPick + plan-ceo skip-interview routing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Calibration run 1 surfaced a second issue beyond the parser bug: the default pick of 1 on /plan-ceo-review's scope-selection AUQ routes the agent to "branch diff vs main" — so it reviews the gstack PR itself (recursive!) instead of the seeded fixture plan we sent. Added firstAUQPick callback to runPlanSkillCounting. Override applies only to the FIRST AUQ; subsequent presses keep using defaultPick. ceoStep0Boundary now fires on either the mode-pick AUQ (existing path) or any AUQ with an option label matching "skip interview" or "plan immediately" — which is the scope-selection AUQ. Picking that option bypasses Step 0 and routes straight to review-phase using the chat-paste plan as context. Plan-ceo test wires firstAUQPick = pickSkipInterview which finds the "Skip interview" option by label. Falls back to "describe inline" if the option labels change. One new unit test: ceoStep0Boundary fires on the scope-selection fixture; the existing mode-pick test is unchanged and still passes. Co-Authored-By: Claude Opus 4.7 (1M context) --- test/helpers/claude-pty-runner.ts | 27 +++++++++++++++-- test/helpers/claude-pty-runner.unit.test.ts | 20 +++++++++++++ test/skill-e2e-plan-ceo-finding-count.test.ts | 29 +++++++++++++++++++ 3 files changed, 73 insertions(+), 3 deletions(-) diff --git a/test/helpers/claude-pty-runner.ts b/test/helpers/claude-pty-runner.ts index a85ba863..790e8587 100644 --- a/test/helpers/claude-pty-runner.ts +++ b/test/helpers/claude-pty-runner.ts @@ -621,7 +621,12 @@ export function assertReviewReportAtBottom( * files import them directly. 
*/ export const ceoStep0Boundary: Step0BoundaryPredicate = (fp) => - fp.options.some((o) => MODE_RE.test(o.label)); + // Mode-pick path (Step 0F): one of HOLD SCOPE / SCOPE EXPANSION / etc. + fp.options.some((o) => MODE_RE.test(o.label)) || + // Skip-interview path: scope-selection AUQ has "Skip interview and plan + // immediately" — picking it bypasses the rest of Step 0 and routes + // directly to review-phase. Boundary fires on the scope AUQ itself. + fp.options.some((o) => /skip\s+interview|plan\s+immediately/i.test(o.label)); export const engStep0Boundary: Step0BoundaryPredicate = (fp) => /scope reduction recommendation|cross[\s-]?project learnings/i.test( @@ -1097,6 +1102,18 @@ export async function runPlanSkillCounting(opts: { reviewCountCeiling: number; /** Numbered option to press by default. Defaults to 1 (recommended). */ defaultPick?: number; + /** + * Optional override for the FIRST AUQ observed. Receives the fingerprint; + * returns the option index to press. Subsequent AUQs always use defaultPick. + * + * Skill-specific routing helper: /plan-ceo-review's first AUQ asks "what + * scope?" with options like "branch diff" / "describe inline" / "skip + * interview". Pressing the default 1 routes to "branch diff" (the wrong + * review target for a seeded fixture). firstAUQPick lets the test pick + * "Skip interview" or "describe inline" so the agent reviews the + * follow-up plan content the test sent, not the git diff. + */ + firstAUQPick?: (fp: AskUserQuestionFingerprint) => number; /** Working directory. Default process.cwd() (repo cwd holds skill registry). */ cwd?: string; /** Total budget for skill to reach a terminal outcome. Default 1_500_000 (25 min). 
*/ @@ -1120,6 +1137,7 @@ export async function runPlanSkillCounting(opts: { let boundaryFired = false; let step0Count = 0; let reviewCount = 0; + let isFirstAUQ = true; let lastSig = ''; function snapshot( @@ -1239,8 +1257,11 @@ export async function runPlanSkillCounting(opts: { if (boundaryFired) reviewCount += 1; else step0Count += 1; - // Press to advance. - session.send(`${defaultPick}\r`); + // Press to advance — first AUQ may use the override pick. + const pickIdx = + isFirstAUQ && opts.firstAUQPick ? opts.firstAUQPick(fp) : defaultPick; + isFirstAUQ = false; + session.send(`${pickIdx}\r`); // Evaluate boundary AFTER pressing — if THIS AUQ was the last Step 0 // question, all subsequent AUQs go to reviewCount. diff --git a/test/helpers/claude-pty-runner.unit.test.ts b/test/helpers/claude-pty-runner.unit.test.ts index 3960f322..e830d730 100644 --- a/test/helpers/claude-pty-runner.unit.test.ts +++ b/test/helpers/claude-pty-runner.unit.test.ts @@ -665,6 +665,26 @@ describe('Step0BoundaryPredicate per-skill', () => { expect(ceoStep0Boundary(f)).toBe(true); }); + test('FIRES on scope-selection AUQ with "Skip interview" option (skip-interview path)', () => { + // After calibration run 1: plan-ceo's first AUQ is scope-selection, + // and we route via "Skip interview and plan immediately" to bypass + // Step 0 entirely. Boundary must fire on this AUQ so subsequent + // AUQs go to reviewCount. 
+ const f = fp( + 'What scope do you want me to CEO-review?', + [ + "The branch's diff vs main", + 'A specific plan file', + "An idea you'll describe inline", + 'Cancel — wrong skill', + 'Type something.', + 'Chat about this', + 'Skip interview and plan immediately', + ], + ); + expect(ceoStep0Boundary(f)).toBe(true); + }); + test('does NOT fire on premise challenge AUQs', () => { const f = fp('D1 — Premise check: is this the right problem?', ['Yes', 'No', 'Other']); expect(ceoStep0Boundary(f)).toBe(false); diff --git a/test/skill-e2e-plan-ceo-finding-count.test.ts b/test/skill-e2e-plan-ceo-finding-count.test.ts index 0bc94d9c..850c1a03 100644 --- a/test/skill-e2e-plan-ceo-finding-count.test.ts +++ b/test/skill-e2e-plan-ceo-finding-count.test.ts @@ -21,8 +21,36 @@ import { runPlanSkillCounting, ceoStep0Boundary, assertReviewReportAtBottom, + type AskUserQuestionFingerprint, } from './helpers/claude-pty-runner'; +/** + * /plan-ceo-review's first AUQ asks "what scope?" with options like + * 1. Branch diff vs main + * 2. A specific plan file or design doc + * 3. An idea you'll describe inline + * ... + * 7. Skip interview and plan immediately + * + * The default pick (1) routes to "branch diff vs main" — the wrong target + * for our seeded fixture (the agent would review the gstack PR itself, + * recursively). Picking "Skip interview and plan immediately" bypasses + * Step 0 and routes the agent to review the chat context (where our + * follow-up plan was pasted). + */ +function pickSkipInterview(fp: AskUserQuestionFingerprint): number { + const skipOpt = fp.options.find((o) => + /skip\s+interview|plan\s+immediately/i.test(o.label), + ); + if (skipOpt) return skipOpt.index; + // Fallback: "describe inline" also routes to using our pasted plan. 
+ const inlineOpt = fp.options.find((o) => + /describe.*inline|inline.*idea/i.test(o.label), + ); + if (inlineOpt) return inlineOpt.index; + return 1; +} + const shouldRun = !!process.env.EVALS && process.env.EVALS_TIER === 'periodic'; const describeE2E = shouldRun ? describe : describe.skip; @@ -93,6 +121,7 @@ describeE2E('/plan-ceo-review per-finding AskUserQuestion count (periodic)', () followUpPrompt: PLAN_CEO_5_FINDINGS, isLastStep0AUQ: ceoStep0Boundary, reviewCountCeiling: CEILING_DISTINCT + 1, // hard cap above assertion ceiling + firstAUQPick: pickSkipInterview, // bypass scope-selection, route to review cwd: process.cwd(), timeoutMs: 1_500_000, // 25 min env: { QUESTION_TUNING: 'false', EXPLAIN_LEVEL: 'default' },