mirror of
https://github.com/garrytan/gstack.git
synced 2026-06-17 15:20:11 +02:00
test(harness): detect prose-rendered AskUserQuestion in plan mode
When --disallowedTools AskUserQuestion is set and no MCP variant is
callable, the model surfaces decisions as visible prose options
("A) ... B) ... C) ..." or "1. ... 2. ... 3. ...") rather than via the
native numbered-prompt UI. isNumberedOptionListVisible doesn't catch
these because the ❯ cursor sits on the empty input prompt rather than
on option 1, so runPlanSkillObservation and runPlanSkillFloorCheck
would time out at 5-10 minutes per test even though the model was
correctly waiting for user input.
This was exposed by the v1.28 fallback deletion: pre-deletion the
model used the preamble fallback to silently auto-resolve to
plan_ready in this scenario. Post-deletion the model correctly
surfaces the question and waits, but the harness couldn't tell.
isProseAUQVisible matches:
- 2+ distinct lettered options at line starts (A/B/C/D form)
- 3+ distinct numbered options at line starts WITHOUT a `❯ 1.`
cursor (so it doesn't double-fire on native numbered prompts)
Wired into:
- classifyVisible (used by runPlanSkillObservation) → returns
outcome='asked' instead of timeout
- runPlanSkillFloorCheck → counts as auq_observed (floor met)
8 new unit tests in claude-pty-runner.unit.test.ts cover the lettered
shape, numbered shape, threshold edges, native-cursor exclusion, and
mid-prose false-positive guard.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -297,6 +297,58 @@ export function isNumberedOptionListVisible(visible: string): boolean {
|
||||
return /❯\s*1\./.test(visible) && /(^|[^0-9])2\./.test(visible);
|
||||
}
|
||||
|
||||
/**
|
||||
* Detect a prose-rendered AskUserQuestion in plan mode.
|
||||
*
|
||||
* Plan-mode AUQs sometimes render as visible model output rather than via
|
||||
* the native numbered-prompt UI — e.g., when --disallowedTools AskUserQuestion
|
||||
* is set and no MCP variant is callable, the model surfaces the question as
|
||||
* lettered or numbered options in plain text. isNumberedOptionListVisible
|
||||
* doesn't catch these because the `❯` cursor sits on the empty input prompt,
|
||||
* not on option 1.
|
||||
*
|
||||
* Detection patterns:
|
||||
* - 2+ distinct lettered options (A) B) C) D)) at line starts — typical
|
||||
* for plan-eng / plan-design / plan-devex prose AUQ
|
||||
* - 3+ distinct numbered options (1. 2. 3.) at line starts WITHOUT a
|
||||
* `❯<spaces>1.` cursor — typical for autoplan / office-hours prose AUQ
|
||||
*
|
||||
* Used by classifyVisible and runPlanSkillFloorCheck to return outcome='asked'
|
||||
* (or auq_observed) instead of letting the harness time out when the model
|
||||
* is correctly surfacing the question and waiting for user input via prose.
|
||||
*
|
||||
* The 4KB tail window avoids matching stale options from earlier prompts in
|
||||
* scrollback. Permission dialogs are filtered out by the caller (see
|
||||
* isPermissionDialogVisible callers in classifyVisible).
|
||||
*/
|
||||
export function isProseAUQVisible(visible: string): boolean {
|
||||
const tail = visible.length > 4096 ? visible.slice(-4096) : visible;
|
||||
|
||||
// Pattern 1: 2+ distinct lettered options at line starts. Allow leading
|
||||
// whitespace or `❯` cursor before the marker. PTY may collapse multiple
|
||||
// option lines onto one logical line via stripped cursor-positioning
|
||||
// escapes, but the NEWLINE before each option survives.
|
||||
const letteredRe = /(?:^|\n)[ \t❯]*([A-D])\)/g;
|
||||
const letteredHits = new Set<string>();
|
||||
let lm: RegExpExecArray | null;
|
||||
while ((lm = letteredRe.exec(tail)) !== null) {
|
||||
if (lm[1]) letteredHits.add(lm[1]);
|
||||
}
|
||||
if (letteredHits.size >= 2) return true;
|
||||
|
||||
// Pattern 2: 3+ distinct numbered options at line starts, AND no `❯<spaces>1.`
|
||||
// cursor anywhere in the FULL visible buffer (which would mean
|
||||
// isNumberedOptionListVisible already covers the case via the native UI).
|
||||
if (/❯\s*1\./.test(visible)) return false;
|
||||
const numberedRe = /(?:^|\n)[ \t❯]*([1-9])\./g;
|
||||
const numberedHits = new Set<string>();
|
||||
let nm: RegExpExecArray | null;
|
||||
while ((nm = numberedRe.exec(tail)) !== null) {
|
||||
if (nm[1]) numberedHits.add(nm[1]);
|
||||
}
|
||||
return numberedHits.size >= 3;
|
||||
}
|
||||
|
||||
/**
|
||||
* Parse a rendered numbered-option list out of the visible TTY text.
|
||||
*
|
||||
@@ -570,6 +622,21 @@ export function classifyVisible(
|
||||
summary: 'skill fired a numbered-option prompt (AskUserQuestion or routing-injection)',
|
||||
};
|
||||
}
|
||||
// Prose-rendered AUQ: model surfaced the question as lettered or numbered
|
||||
// options in plain text (typical under --disallowedTools AskUserQuestion
|
||||
// when no MCP variant is callable). The model is waiting for user input
|
||||
// via the plan-mode input prompt rather than via the AUQ tool UI; this
|
||||
// is still a legitimate "asked" surface — semantically equivalent to a
|
||||
// tool-call AUQ from the test's perspective.
|
||||
if (isProseAUQVisible(visible)) {
|
||||
if (isPermissionDialogVisible(visible.slice(-TAIL_SCAN_BYTES))) {
|
||||
return null;
|
||||
}
|
||||
return {
|
||||
outcome: 'asked',
|
||||
summary: 'skill rendered a prose-style AskUserQuestion (model waiting for user input)',
|
||||
};
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
@@ -1652,12 +1719,15 @@ export async function runPlanSkillFloorCheck(opts: {
|
||||
};
|
||||
}
|
||||
|
||||
// Success: ANY non-permission numbered-option list is an AUQ render.
|
||||
// The bug we're catching is "fired zero AUQs," so observing one is
|
||||
// sufficient — we don't need to fingerprint or navigate past it.
|
||||
// Success: ANY non-permission numbered-option list is an AUQ render —
|
||||
// either via the native numbered-prompt UI (isNumberedOptionListVisible)
|
||||
// OR via prose-rendered options under --disallowedTools when no MCP
|
||||
// variant is callable (isProseAUQVisible). Both surface the question
|
||||
// to the user; the bug we're catching is "fired zero AUQs."
|
||||
const tail = visible.slice(-TAIL_SCAN_BYTES);
|
||||
if (
|
||||
isNumberedOptionListVisible(visible) &&
|
||||
!isPermissionDialogVisible(visible.slice(-TAIL_SCAN_BYTES))
|
||||
(isNumberedOptionListVisible(visible) || isProseAUQVisible(visible)) &&
|
||||
!isPermissionDialogVisible(tail)
|
||||
) {
|
||||
return {
|
||||
auqObserved: true,
|
||||
|
||||
@@ -26,6 +26,7 @@ import { describe, test, expect } from 'bun:test';
|
||||
import {
|
||||
isPermissionDialogVisible,
|
||||
isNumberedOptionListVisible,
|
||||
isProseAUQVisible,
|
||||
isPlanReadyVisible,
|
||||
parseNumberedOptions,
|
||||
classifyVisible,
|
||||
@@ -192,6 +193,86 @@ describe('isNumberedOptionListVisible', () => {
|
||||
});
|
||||
});
|
||||
|
||||
describe('isProseAUQVisible', () => {
|
||||
test('matches 4 lettered options A) B) C) D) at line starts (plan-eng prose AUQ shape)', () => {
|
||||
const sample = `
|
||||
What would you like me to review? Options:
|
||||
A) Point me at an existing design doc or plan file (path).
|
||||
B) Describe new work you're planning — I'll explore the codebase.
|
||||
C) You meant /review for the diff already on this branch.
|
||||
D) Something else (tell me).
|
||||
Recommendation: A if you have a doc in mind, otherwise B.
|
||||
❯
|
||||
`;
|
||||
expect(isProseAUQVisible(sample)).toBe(true);
|
||||
});
|
||||
|
||||
test('matches 2 lettered options (minimum threshold)', () => {
|
||||
const sample = `
|
||||
A) First option
|
||||
B) Second option
|
||||
`;
|
||||
expect(isProseAUQVisible(sample)).toBe(true);
|
||||
});
|
||||
|
||||
test('matches 3 numbered options 1. 2. 3. without ❯ 1. cursor (autoplan prose AUQ shape)', () => {
|
||||
const sample = `
|
||||
What's the task? A few options:
|
||||
1. You have a plan idea in mind — describe it.
|
||||
2. You want to review an existing plan elsewhere.
|
||||
3. You meant a different command — /plan-ceo-review etc.
|
||||
❯
|
||||
`;
|
||||
expect(isProseAUQVisible(sample)).toBe(true);
|
||||
});
|
||||
|
||||
test('returns false when ❯ 1. cursor is present (native UI handled by isNumberedOptionListVisible)', () => {
|
||||
const sample = `
|
||||
❯ 1. First option
|
||||
2. Second option
|
||||
3. Third option
|
||||
`;
|
||||
expect(isProseAUQVisible(sample)).toBe(false);
|
||||
});
|
||||
|
||||
test('returns false on single lettered option', () => {
|
||||
const sample = `
|
||||
A) Only one option mentioned in passing.
|
||||
`;
|
||||
expect(isProseAUQVisible(sample)).toBe(false);
|
||||
});
|
||||
|
||||
test('returns false on 2 numbered options (need 3+ for prose numbered)', () => {
|
||||
const sample = `
|
||||
1. First note.
|
||||
2. Second note.
|
||||
`;
|
||||
expect(isProseAUQVisible(sample)).toBe(false);
|
||||
});
|
||||
|
||||
test('does not match mid-prose lettered text like "(see option B) above"', () => {
|
||||
const sample = `
|
||||
This refers to (see option B) above and also to point A) earlier.
|
||||
`;
|
||||
// The B) and A) markers are mid-line, not at line starts, so they don't count.
|
||||
expect(isProseAUQVisible(sample)).toBe(false);
|
||||
});
|
||||
|
||||
test('matches with leading whitespace and ❯ prefix on options', () => {
|
||||
const sample = `
|
||||
A) Option with whitespace prefix
|
||||
❯ B) Option with cursor prefix
|
||||
C) Another option
|
||||
`;
|
||||
expect(isProseAUQVisible(sample)).toBe(true);
|
||||
});
|
||||
|
||||
test('returns false on plain text with no option markers', () => {
|
||||
expect(isProseAUQVisible('Just some plain text output from the model.')).toBe(false);
|
||||
expect(isProseAUQVisible('')).toBe(false);
|
||||
});
|
||||
});
|
||||
|
||||
describe('classifyVisible (runtime path through the runner classifier)', () => {
|
||||
// These tests call the actual classifier so a future contributor who
|
||||
// reorders branches (e.g. moves the permission short-circuit before
|
||||
|
||||
Reference in New Issue
Block a user