From 836f86ab5cffa001fe49de11205fe24627459305 Mon Sep 17 00:00:00 2001
From: Garry Tan <garrytan@gmail.com>
Date: Fri, 8 May 2026 22:36:50 -0700
Subject: [PATCH] test(harness): high-water-mark prose-AUQ tracking across
 polling iterations
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The autoplan E2E surfaces a brief prose-AUQ window (model emits options,
waits ~30s for non-existent test responder, then resumes thinking) that
the existing polling loop misses: by judge-tick time the buffer has
moved into spinner state, so the LLM judge correctly reports 'working'
and the loop times out at 5min.

Adds two flags tracked across polling iterations:
  - proseAUQEverObserved: set true the first tick isProseAUQVisible
    returns true on the recent buffer
  - waitingEverObserved: set true on the first LLM judge 'waiting' verdict

At timeout, if either flag is set, return outcome='asked' with a
summary explaining the historical signal. The model DID surface the
question — we just missed the live-state window.

Snapshot logged with tag='prose-auq-surfaced' when GSTACK_PTY_LOG=1
for postmortem trace.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 test/helpers/claude-pty-runner.ts | 40 ++++++++++++++++++++++++++++++-
 1 file changed, 39 insertions(+), 1 deletion(-)

diff --git a/test/helpers/claude-pty-runner.ts b/test/helpers/claude-pty-runner.ts
index 6499a17a6..faa0b6d38 100644
--- a/test/helpers/claude-pty-runner.ts
+++ b/test/helpers/claude-pty-runner.ts
@@ -1485,6 +1485,13 @@ export async function runPlanSkillObservation(opts: {
     const start = Date.now();
     let lastJudgeAt = 0;
     let lastJudgeVerdict: PtyStateVerdict | null = null;
+    // High-water marks: did we EVER see a prose-AUQ surface or a judge
+    // 'waiting' verdict during the run? Models may surface options
+    // briefly, then resume thinking when no user response comes (test
+    // env has no responder). At timeout we trust historical signals
+    // even if the current state is 'working'.
+    let proseAUQEverObserved = false;
+    let waitingEverObserved = false;
     const JUDGE_AFTER_MS = 60_000;
     const JUDGE_INTERVAL_MS = 30_000;
     while (Date.now() - start < budgetMs) {
@@ -1507,6 +1514,18 @@ export async function runPlanSkillObservation(opts: {
           elapsedMs: Date.now() - startedAt,
         };
       }
+
+      // Cheap surface-tracking: did the model ever surface a prose AUQ in
+      // this tick's recent buffer? Track once-true (high water).
+      if (!proseAUQEverObserved && isProseAUQVisible(visible)) {
+        proseAUQEverObserved = true;
+        logPtySnapshot(visible, {
+          testName: opts.skillName,
+          elapsedMs: Date.now() - start,
+          tag: 'prose-auq-surfaced',
+        });
+      }
+
       const classified = classifyVisible(visible, {
         strictPlanWrites: !!opts.initialPlanContent,
       });
@@ -1539,6 +1558,7 @@ export async function runPlanSkillObservation(opts: {
         logPtySnapshot(visible, { testName: opts.skillName, elapsedMs: elapsed, tag: 'judge-tick' });
         lastJudgeVerdict = judgePtyState(visible, { testName: opts.skillName });
         if (lastJudgeVerdict.state === 'waiting') {
+          waitingEverObserved = true;
           return {
             outcome: 'asked',
             summary: `LLM judge: ${lastJudgeVerdict.reasoning} (state=waiting after ${Math.round(elapsed / 1000)}s)`,
@@ -1549,6 +1569,24 @@ export async function runPlanSkillObservation(opts: {
       }
     }
 
+    // Timeout fallback: if we observed a prose-AUQ surface OR a judge
+    // 'waiting' verdict at any point during the run, treat as 'asked'.
+    // This catches the model-surfaced-then-resumed-thinking case where
+    // by the time the timeout fires, the buffer has moved past the
+    // options into spinner state but the question DID surface earlier.
+    const finalVisible = session.visibleSince(since);
+    if (proseAUQEverObserved || waitingEverObserved) {
+      return {
+        outcome: 'asked',
+        summary:
+          `prose-AUQ surface observed during run (proseAUQEverObserved=${proseAUQEverObserved}, waitingEverObserved=${waitingEverObserved}); model surfaced the question and the test budget elapsed without a follow-up classification` +
+          (lastJudgeVerdict
+            ? ` (last LLM judge: ${lastJudgeVerdict.state} — ${lastJudgeVerdict.reasoning})`
+            : ''),
+        evidence: finalVisible.slice(-2000),
+        elapsedMs: Date.now() - startedAt,
+      };
+    }
     return {
       outcome: 'timeout',
       summary:
@@ -1556,7 +1594,7 @@ export async function runPlanSkillObservation(opts: {
         (lastJudgeVerdict
           ? ` (last LLM judge: state=${lastJudgeVerdict.state} — ${lastJudgeVerdict.reasoning})`
           : ''),
-      evidence: session.visibleSince(since).slice(-2000),
+      evidence: finalVisible.slice(-2000),
       elapsedMs: Date.now() - startedAt,
     };
   } finally {