diff --git a/test/helpers/claude-pty-runner.ts b/test/helpers/claude-pty-runner.ts index 2fb7b484e..86f7581f4 100644 --- a/test/helpers/claude-pty-runner.ts +++ b/test/helpers/claude-pty-runner.ts @@ -297,6 +297,58 @@ export function isNumberedOptionListVisible(visible: string): boolean { return /❯\s*1\./.test(visible) && /(^|[^0-9])2\./.test(visible); } +/** + * Detect a prose-rendered AskUserQuestion in plan mode. + * + * Plan-mode AUQs sometimes render as visible model output rather than via + * the native numbered-prompt UI — e.g., when --disallowedTools AskUserQuestion + * is set and no MCP variant is callable, the model surfaces the question as + * lettered or numbered options in plain text. isNumberedOptionListVisible + * doesn't catch these because the `❯` cursor sits on the empty input prompt, + * not on option 1. + * + * Detection patterns: + * - 2+ distinct lettered options (A) B) C) D)) at line starts — typical + * for plan-eng / plan-design / plan-devex prose AUQ + * - 3+ distinct numbered options (1. 2. 3.) at line starts WITHOUT a + * `❯1.` cursor — typical for autoplan / office-hours prose AUQ + * + * Used by classifyVisible and runPlanSkillFloorCheck to return outcome='asked' + * (or auq_observed) instead of letting the harness time out when the model + * is correctly surfacing the question and waiting for user input via prose. + * + * The 4KB tail window avoids matching stale options from earlier prompts in + * scrollback. Permission dialogs are filtered out by the caller (see + * isPermissionDialogVisible callers in classifyVisible). + */ +export function isProseAUQVisible(visible: string): boolean { + const tail = visible.length > 4096 ? visible.slice(-4096) : visible; + + // Pattern 1: 2+ distinct lettered options at line starts. Allow leading + // whitespace or `❯` cursor before the marker. PTY may collapse multiple + // option lines onto one logical line via stripped cursor-positioning + // escapes, but the NEWLINE before each option survives. + const letteredRe = /(?:^|\n)[ \t❯]*([A-D])\)/g; + const letteredHits = new Set(); + let lm: RegExpExecArray | null; + while ((lm = letteredRe.exec(tail)) !== null) { + if (lm[1]) letteredHits.add(lm[1]); + } + if (letteredHits.size >= 2) return true; + + // Pattern 2: 3+ distinct numbered options at line starts, AND no `❯1.` + // cursor anywhere in the FULL visible buffer (which would mean + // isNumberedOptionListVisible already covers the case via the native UI). + if (/❯\s*1\./.test(visible)) return false; + const numberedRe = /(?:^|\n)[ \t❯]*([1-9])\./g; + const numberedHits = new Set(); + let nm: RegExpExecArray | null; + while ((nm = numberedRe.exec(tail)) !== null) { + if (nm[1]) numberedHits.add(nm[1]); + } + return numberedHits.size >= 3; +} + /** * Parse a rendered numbered-option list out of the visible TTY text. * @@ -570,6 +622,21 @@ export function classifyVisible( summary: 'skill fired a numbered-option prompt (AskUserQuestion or routing-injection)', }; } + // Prose-rendered AUQ: model surfaced the question as lettered or numbered + // options in plain text (typical under --disallowedTools AskUserQuestion + // when no MCP variant is callable). The model is waiting for user input + // via the plan-mode input prompt rather than via the AUQ tool UI; this + // is still a legitimate "asked" surface — semantically equivalent to a + // tool-call AUQ from the test's perspective. + if (isProseAUQVisible(visible)) { + if (isPermissionDialogVisible(visible.slice(-TAIL_SCAN_BYTES))) { + return null; + } + return { + outcome: 'asked', + summary: 'skill rendered a prose-style AskUserQuestion (model waiting for user input)', + }; + } return null; } @@ -1652,12 +1719,15 @@ export async function runPlanSkillFloorCheck(opts: { }; } - // Success: ANY non-permission numbered-option list is an AUQ render. - // The bug we're catching is "fired zero AUQs," so observing one is - // sufficient — we don't need to fingerprint or navigate past it. + // Success: ANY non-permission numbered-option list is an AUQ render — + // either via the native numbered-prompt UI (isNumberedOptionListVisible) + // OR via prose-rendered options under --disallowedTools when no MCP + // variant is callable (isProseAUQVisible). Both surface the question + // to the user; the bug we're catching is "fired zero AUQs." + const tail = visible.slice(-TAIL_SCAN_BYTES); if ( - isNumberedOptionListVisible(visible) && - !isPermissionDialogVisible(visible.slice(-TAIL_SCAN_BYTES)) + (isNumberedOptionListVisible(visible) || isProseAUQVisible(visible)) && + !isPermissionDialogVisible(tail) ) { return { auqObserved: true, diff --git a/test/helpers/claude-pty-runner.unit.test.ts b/test/helpers/claude-pty-runner.unit.test.ts index dcb4f5efe..1e79ca137 100644 --- a/test/helpers/claude-pty-runner.unit.test.ts +++ b/test/helpers/claude-pty-runner.unit.test.ts @@ -26,6 +26,7 @@ import { describe, test, expect } from 'bun:test'; import { isPermissionDialogVisible, isNumberedOptionListVisible, + isProseAUQVisible, isPlanReadyVisible, parseNumberedOptions, classifyVisible, @@ -192,6 +193,86 @@ describe('isNumberedOptionListVisible', () => { }); }); +describe('isProseAUQVisible', () => { + test('matches 4 lettered options A) B) C) D) at line starts (plan-eng prose AUQ shape)', () => { + const sample = ` +What would you like me to review? Options: +A) Point me at an existing design doc or plan file (path). +B) Describe new work you're planning — I'll explore the codebase. +C) You meant /review for the diff already on this branch. +D) Something else (tell me). +Recommendation: A if you have a doc in mind, otherwise B. +❯ +`; + expect(isProseAUQVisible(sample)).toBe(true); + }); + + test('matches 2 lettered options (minimum threshold)', () => { + const sample = ` +A) First option +B) Second option +`; + expect(isProseAUQVisible(sample)).toBe(true); + }); + + test('matches 3 numbered options 1. 2. 3. without ❯ 1. cursor (autoplan prose AUQ shape)', () => { + const sample = ` +What's the task? A few options: + 1. You have a plan idea in mind — describe it. + 2. You want to review an existing plan elsewhere. + 3. You meant a different command — /plan-ceo-review etc. +❯ +`; + expect(isProseAUQVisible(sample)).toBe(true); + }); + + test('returns false when ❯ 1. cursor is present (native UI handled by isNumberedOptionListVisible)', () => { + const sample = ` +❯ 1. First option + 2. Second option + 3. Third option +`; + expect(isProseAUQVisible(sample)).toBe(false); + }); + + test('returns false on single lettered option', () => { + const sample = ` +A) Only one option mentioned in passing. +`; + expect(isProseAUQVisible(sample)).toBe(false); + }); + + test('returns false on 2 numbered options (need 3+ for prose numbered)', () => { + const sample = ` +1. First note. +2. Second note. +`; + expect(isProseAUQVisible(sample)).toBe(false); + }); + + test('does not match mid-prose lettered text like "(see option B) above"', () => { + const sample = ` +This refers to (see option B) above and also to point A) earlier. +`; + // The B) and A) markers are mid-line, not at line starts, so they don't count. + expect(isProseAUQVisible(sample)).toBe(false); + }); + + test('matches with leading whitespace and ❯ prefix on options', () => { + const sample = ` + A) Option with whitespace prefix +❯ B) Option with cursor prefix + C) Another option +`; + expect(isProseAUQVisible(sample)).toBe(true); + }); + + test('returns false on plain text with no option markers', () => { + expect(isProseAUQVisible('Just some plain text output from the model.')).toBe(false); + expect(isProseAUQVisible('')).toBe(false); + }); +}); + describe('classifyVisible (runtime path through the runner classifier)', () => { // These tests call the actual classifier so a future contributor who // reorders branches (e.g. moves the permission short-circuit before