mirror of
https://github.com/garrytan/gstack.git
synced 2026-06-17 15:20:11 +02:00
test: expose high-water-mark flags through PlanSkillObservation
The 2KB obs.evidence window often misses the prose-AUQ moment because
the ExitPlanMode UI ("Ready to execute" + numbered approve/reject prompt)
pushes the model's earlier option list out of the tail by the time
outcome=plan_ready fires. Tests checking "did the user see a question"
need to consult historical state, not just the truncated final tail.
Adds two optional fields to PlanSkillObservation:
- proseAUQEverObserved: true if isProseAUQVisible was true at any tick
- waitingEverObserved: true if the LLM judge ever returned 'waiting'
The 4 plan-mode --disallowedTools tests now check these flags as part
of the surfaceVisible computation:
isProseAUQVisible(obs.evidence) || obs.proseAUQEverObserved === true
blockedVisible || proseAUQVisible || obs.waitingEverObserved === true
This catches the autoplan / plan-ceo / plan-eng case where the model
surfaces options briefly, fails to get a response, then keeps thinking
— eventually emitting ExitPlanMode and pushing options out of evidence.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -1402,6 +1402,27 @@ export interface PlanSkillObservation {
|
||||
* the section, and that's the regression we want to catch.
|
||||
*/
|
||||
planFile?: string;
|
||||
/**
|
||||
* High-water-mark flag: did the polling loop ever observe a
|
||||
* prose-rendered AskUserQuestion (lettered or numbered options visible)
|
||||
* during the run? Set to true on the first poll iteration in which
|
||||
* isProseAUQVisible returns true on the recent buffer; remains true
|
||||
* for the rest of the observation.
|
||||
*
|
||||
* The 2KB `evidence` window often misses the prose-AUQ moment because
|
||||
* by the time outcome=plan_ready fires, the ExitPlanMode "Ready to
|
||||
* execute" UI has pushed the options out of the tail. Tests that need
|
||||
* to assert "the user saw the question at SOME point" should check
|
||||
* this flag rather than re-running isProseAUQVisible on the truncated
|
||||
* evidence.
|
||||
*/
|
||||
proseAUQEverObserved?: boolean;
|
||||
/**
|
||||
* High-water-mark flag: did the LLM judge ever return state='waiting'
|
||||
* during the run? Same shape as proseAUQEverObserved but driven by the
|
||||
* Haiku judge fallback rather than the regex detector.
|
||||
*/
|
||||
waitingEverObserved?: boolean;
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -1543,6 +1564,8 @@ export async function runPlanSkillObservation(opts: {
|
||||
...classified,
|
||||
evidence: visible.slice(-2000),
|
||||
elapsedMs: Date.now() - startedAt,
|
||||
proseAUQEverObserved,
|
||||
waitingEverObserved,
|
||||
};
|
||||
// Capture the plan file path on any outcome where one may have been
|
||||
// written. Gating only on 'plan_ready' missed two cases: (1) the
|
||||
@@ -1594,6 +1617,8 @@ export async function runPlanSkillObservation(opts: {
|
||||
: ''),
|
||||
evidence: finalVisible.slice(-2000),
|
||||
elapsedMs: Date.now() - startedAt,
|
||||
proseAUQEverObserved,
|
||||
waitingEverObserved,
|
||||
};
|
||||
}
|
||||
return {
|
||||
@@ -1605,6 +1630,8 @@ export async function runPlanSkillObservation(opts: {
|
||||
: ''),
|
||||
evidence: finalVisible.slice(-2000),
|
||||
elapsedMs: Date.now() - startedAt,
|
||||
proseAUQEverObserved,
|
||||
waitingEverObserved,
|
||||
};
|
||||
} finally {
|
||||
await session.close();
|
||||
|
||||
@@ -66,8 +66,8 @@ describeE2E('autoplan AskUserQuestion-blocked smoke (gate)', () => {
|
||||
// 3. Numbered/lettered options visible in TTY as prose (post-v1.28 prose-AUQ rendering)
|
||||
// If NONE of these are present, the question was silently buried.
|
||||
const blockedVisible = /BLOCKED\s*[—-]\s*AskUserQuestion/i.test(obs.evidence);
|
||||
const proseAUQVisible = isProseAUQVisible(obs.evidence);
|
||||
const surfaceVisible = blockedVisible || proseAUQVisible;
|
||||
const proseAUQVisible = isProseAUQVisible(obs.evidence) || obs.proseAUQEverObserved === true;
|
||||
const surfaceVisible = blockedVisible || proseAUQVisible || obs.waitingEverObserved === true;
|
||||
|
||||
if (
|
||||
obs.outcome === 'auto_decided' ||
|
||||
|
||||
@@ -115,8 +115,8 @@ describeE2E('plan-ceo-review plan-mode smoke (gate)', () => {
|
||||
// 2. `BLOCKED — AskUserQuestion` string visible in TTY (post-v1.28 BLOCKED rule)
|
||||
// 3. Numbered/lettered options visible in TTY as prose (post-v1.28 prose-AUQ rendering)
|
||||
const blockedVisible = /BLOCKED\s*[—-]\s*AskUserQuestion/i.test(obs.evidence);
|
||||
const proseAUQVisible = isProseAUQVisible(obs.evidence);
|
||||
const surfaceVisible = blockedVisible || proseAUQVisible;
|
||||
const proseAUQVisible = isProseAUQVisible(obs.evidence) || obs.proseAUQEverObserved === true;
|
||||
const surfaceVisible = blockedVisible || proseAUQVisible || obs.waitingEverObserved === true;
|
||||
|
||||
if (
|
||||
obs.outcome === 'auto_decided' ||
|
||||
|
||||
@@ -63,8 +63,8 @@ describeE2E('plan-design-review plan-mode smoke (gate)', () => {
|
||||
// Surface visibility check (same as ceo / autoplan migrations): user
|
||||
// must SEE the question via BLOCKED string OR prose-rendered AUQ options.
|
||||
const blockedVisible = /BLOCKED\s*[—-]\s*AskUserQuestion/i.test(obs.evidence);
|
||||
const proseAUQVisible = isProseAUQVisible(obs.evidence);
|
||||
const surfaceVisible = blockedVisible || proseAUQVisible;
|
||||
const proseAUQVisible = isProseAUQVisible(obs.evidence) || obs.proseAUQEverObserved === true;
|
||||
const surfaceVisible = blockedVisible || proseAUQVisible || obs.waitingEverObserved === true;
|
||||
|
||||
if (
|
||||
obs.outcome === 'auto_decided' ||
|
||||
|
||||
@@ -83,8 +83,8 @@ describeE2E('plan-eng-review plan-mode smoke (gate)', () => {
|
||||
// section in the plan file (legacy) OR a BLOCKED string in TTY OR
|
||||
// prose-rendered AUQ options in TTY.
|
||||
const blockedVisible = /BLOCKED\s*[—-]\s*AskUserQuestion/i.test(obs.evidence);
|
||||
const proseAUQVisible = isProseAUQVisible(obs.evidence);
|
||||
const surfaceVisible = blockedVisible || proseAUQVisible;
|
||||
const proseAUQVisible = isProseAUQVisible(obs.evidence) || obs.proseAUQEverObserved === true;
|
||||
const surfaceVisible = blockedVisible || proseAUQVisible || obs.waitingEverObserved === true;
|
||||
|
||||
if (
|
||||
obs.outcome === 'auto_decided' ||
|
||||
|
||||
Reference in New Issue
Block a user