test: expose high-water-mark flags through PlanSkillObservation

The 2KB obs.evidence window often misses the prose-AUQ moment because
the ExitPlanMode UI ("Ready to execute" + numbered approve/reject prompt)
pushes the model's earlier option list out of the tail by the time
outcome=plan_ready fires. Tests checking "did the user see a question"
need to consult historical state, not just the truncated final tail.

Adds two optional fields to PlanSkillObservation:
  - proseAUQEverObserved: true if isProseAUQVisible was true at any tick
  - waitingEverObserved: true if the LLM judge ever returned 'waiting'

The 4 plan-mode --disallowedTools tests now check these flags as part
of the surfaceVisible computation:
    isProseAUQVisible(obs.evidence) || obs.proseAUQEverObserved === true
    blockedVisible || proseAUQVisible || obs.waitingEverObserved === true

This catches the autoplan / plan-ceo / plan-eng case where the model
surfaces options briefly, fails to get a response, then keeps thinking
— eventually emitting ExitPlanMode and pushing options out of evidence.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Garry Tan
2026-05-08 23:19:34 -07:00
parent 6c13b5e657
commit 7757233d53
5 changed files with 35 additions and 8 deletions
+27
View File
@@ -1402,6 +1402,27 @@ export interface PlanSkillObservation {
* the section, and that's the regression we want to catch.
*/
planFile?: string;
/**
* High-water-mark flag: did the polling loop ever observe a
* prose-rendered AskUserQuestion (lettered or numbered options visible)
* during the run? Set to true on the first poll iteration in which
* isProseAUQVisible returns true on the recent buffer; remains true
* for the rest of the observation.
*
* The 2KB `evidence` window often misses the prose-AUQ moment because
* by the time outcome=plan_ready fires, the ExitPlanMode "Ready to
* execute" UI has pushed the options out of the tail. Tests that need
* to assert "the user saw the question at SOME point" should check
* this flag rather than re-running isProseAUQVisible on the truncated
* evidence.
*/
proseAUQEverObserved?: boolean;
/**
* High-water-mark flag: did the LLM judge ever return state='waiting'
* during the run? Same shape as proseAUQEverObserved but driven by the
* Haiku judge fallback rather than the regex detector.
*/
waitingEverObserved?: boolean;
}
/**
@@ -1543,6 +1564,8 @@ export async function runPlanSkillObservation(opts: {
...classified,
evidence: visible.slice(-2000),
elapsedMs: Date.now() - startedAt,
proseAUQEverObserved,
waitingEverObserved,
};
// Capture the plan file path on any outcome where one may have been
// written. Gating only on 'plan_ready' missed two cases: (1) the
@@ -1594,6 +1617,8 @@ export async function runPlanSkillObservation(opts: {
: ''),
evidence: finalVisible.slice(-2000),
elapsedMs: Date.now() - startedAt,
proseAUQEverObserved,
waitingEverObserved,
};
}
return {
@@ -1605,6 +1630,8 @@ export async function runPlanSkillObservation(opts: {
: ''),
evidence: finalVisible.slice(-2000),
elapsedMs: Date.now() - startedAt,
proseAUQEverObserved,
waitingEverObserved,
};
} finally {
await session.close();
+2 -2
View File
@@ -66,8 +66,8 @@ describeE2E('autoplan AskUserQuestion-blocked smoke (gate)', () => {
// 3. Numbered/lettered options visible in TTY as prose (post-v1.28 prose-AUQ rendering)
// If NONE of these are present, the question was silently buried.
const blockedVisible = /BLOCKED\s*[—-]\s*AskUserQuestion/i.test(obs.evidence);
const proseAUQVisible = isProseAUQVisible(obs.evidence);
const surfaceVisible = blockedVisible || proseAUQVisible;
const proseAUQVisible = isProseAUQVisible(obs.evidence) || obs.proseAUQEverObserved === true;
const surfaceVisible = blockedVisible || proseAUQVisible || obs.waitingEverObserved === true;
if (
obs.outcome === 'auto_decided' ||
+2 -2
View File
@@ -115,8 +115,8 @@ describeE2E('plan-ceo-review plan-mode smoke (gate)', () => {
// 2. `BLOCKED — AskUserQuestion` string visible in TTY (post-v1.28 BLOCKED rule)
// 3. Numbered/lettered options visible in TTY as prose (post-v1.28 prose-AUQ rendering)
const blockedVisible = /BLOCKED\s*[—-]\s*AskUserQuestion/i.test(obs.evidence);
const proseAUQVisible = isProseAUQVisible(obs.evidence);
const surfaceVisible = blockedVisible || proseAUQVisible;
const proseAUQVisible = isProseAUQVisible(obs.evidence) || obs.proseAUQEverObserved === true;
const surfaceVisible = blockedVisible || proseAUQVisible || obs.waitingEverObserved === true;
if (
obs.outcome === 'auto_decided' ||
+2 -2
View File
@@ -63,8 +63,8 @@ describeE2E('plan-design-review plan-mode smoke (gate)', () => {
// Surface visibility check (same as ceo / autoplan migrations): user
// must SEE the question via BLOCKED string OR prose-rendered AUQ options.
const blockedVisible = /BLOCKED\s*[—-]\s*AskUserQuestion/i.test(obs.evidence);
const proseAUQVisible = isProseAUQVisible(obs.evidence);
const surfaceVisible = blockedVisible || proseAUQVisible;
const proseAUQVisible = isProseAUQVisible(obs.evidence) || obs.proseAUQEverObserved === true;
const surfaceVisible = blockedVisible || proseAUQVisible || obs.waitingEverObserved === true;
if (
obs.outcome === 'auto_decided' ||
+2 -2
View File
@@ -83,8 +83,8 @@ describeE2E('plan-eng-review plan-mode smoke (gate)', () => {
// section in the plan file (legacy) OR a BLOCKED string in TTY OR
// prose-rendered AUQ options in TTY.
const blockedVisible = /BLOCKED\s*[—-]\s*AskUserQuestion/i.test(obs.evidence);
const proseAUQVisible = isProseAUQVisible(obs.evidence);
const surfaceVisible = blockedVisible || proseAUQVisible;
const proseAUQVisible = isProseAUQVisible(obs.evidence) || obs.proseAUQEverObserved === true;
const surfaceVisible = blockedVisible || proseAUQVisible || obs.waitingEverObserved === true;
if (
obs.outcome === 'auto_decided' ||