From 55dfb9e26c971af40a1abf63e86420f8f6a82c3d Mon Sep 17 00:00:00 2001
From: Garry Tan <garrytan@gmail.com>
Date: Fri, 8 May 2026 21:43:11 -0700
Subject: [PATCH] test(harness): LLM judge for waiting-vs-working PTY state +
 snapshot logs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Regex detectors (isNumberedOptionListVisible, isProseAUQVisible) are
fast and free, but PTY rendering quirks fragment prose AUQ option
lists across logical lines that no regex can reliably reassemble.
When detection misses, polling loops time out at the full budget
even though the model is correctly waiting for user input.

Adds judgePtyState — a Haiku-graded trichotomy classifier:
  - waiting: agent surfaced a question/options, sitting at input prompt
  - working: spinner / tool calls / generation in progress
  - hung:    stopped without surfacing anything (rare crash signal)

Wired as a fallback into the polling loops of runPlanSkillObservation
and runPlanSkillFloorCheck: after 60s with no regex hit, snapshot the
TTY every 30s and call the judge. On 'waiting' verdict, return
outcome=asked / auq_observed early. On 'working' or 'hung', enrich the
eventual timeout summary with the verdict so failures are diagnosable.

Implementation:
  - Spawns `claude -p --model claude-haiku-4-5 --max-turns 1` synchronously
    with prompt piped via stdin (subscription auth, no API key env required)
  - In-process cache keyed by SHA-1 of normalized last-4KB so identical
    spinner-frame snapshots don't re-charge
  - Best-effort JSONL log to ~/.gstack/analytics/pty-judge.jsonl with
    timestamp, testName, state, reasoning, hash, judge wall time
  - 30s timeout per call; returns state='unknown' with diagnostic on any
    failure mode (timeout, malformed JSON, missing claude binary)

Snapshot logging: when GSTACK_PTY_LOG=1 is set, dump last 4KB of visible
TTY at every judge tick to ~/.gstack/analytics/pty-snapshots/<test>-
<elapsed>ms.txt — postmortem trail for debugging flakes.

Cost: ~$0.0005 per call; ~10 calls per 5-min test budget; ~$0.005 per
test added in worst case (only when regex detectors miss).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 test/helpers/claude-pty-runner.ts | 239 +++++++++++++++++++++++++++++-
 1 file changed, 238 insertions(+), 1 deletion(-)
diff --git a/test/helpers/claude-pty-runner.ts b/test/helpers/claude-pty-runner.ts
index 86f7581f4..08dbb6032 100644
--- a/test/helpers/claude-pty-runner.ts
+++ b/test/helpers/claude-pty-runner.ts
@@ -297,6 +297,189 @@ export function isNumberedOptionListVisible(visible: string): boolean {
   return /❯\s*1\./.test(visible) && /(^|[^0-9])2\./.test(visible);
 }
 
+// ────────────────────────────────────────────────────────────────────────────
+// LLM judge — "is the model waiting for user input, working, or hung?"
+//
+// Regex detectors (isNumberedOptionListVisible, isProseAUQVisible) are fast
+// and deterministic but brittle to PTY rendering quirks (cursor-positioning
+// escapes that collapse multi-line option lists onto a single logical line).
+// When they miss, the polling loop times out at the full budget — even
+// though the model is correctly surfacing a question via a format the regex
+// can't reassemble.
+//
+// This LLM judge takes a TTY snapshot and answers a trichotomy:
+//   - 'waiting'  — agent surfaced a question/options, sitting at input prompt
+//   - 'working'  — agent is still generating (spinner, tool calls, "Musing")
+//   - 'hung'     — agent stopped without surfacing anything (rare)
+//
+// Used by polling loops as a fallback after N seconds with no terminal
+// classification. On 'waiting' verdict, return outcome='asked' early.
+//
+// Cost: ~$0.0005 per call using claude haiku 4.5. Cached by snapshot hash so
+// identical TTY frames don't re-charge. All verdicts logged to
+// ~/.gstack/analytics/pty-judge.jsonl for offline analysis.
+// ────────────────────────────────────────────────────────────────────────────
+
+import { spawnSync as nodeSpawnSync } from 'node:child_process';
+import { createHash } from 'node:crypto';
+
+export interface PtyStateVerdict {
+  state: 'waiting' | 'working' | 'hung' | 'unknown';
+  reasoning: string;
+  /** SHA-1 of the normalized snapshot input (for caching/dedup). */
+  hash: string;
+  /** Wall time (ms) the judge call took. */
+  elapsedMs: number;
+}
+
+const PTY_VERDICT_CACHE = new Map<string, PtyStateVerdict>();
+
+/**
+ * Persist a verdict (or snapshot dump) to the analytics JSONL log.
+ * Best-effort — failures (disk full, permission denied, etc.) are swallowed
+ * so the harness never fails on logging.
+ */
+function logPtyJudge(record: Record<string, unknown>): void {
+  try {
+    const dir = `${process.env.HOME}/.gstack/analytics`;
+    fs.mkdirSync(dir, { recursive: true });
+    fs.appendFileSync(`${dir}/pty-judge.jsonl`, JSON.stringify(record) + '\n');
+  } catch {
+    /* best-effort */
+  }
+}
+
+/**
+ * Snapshot dump for postmortem debugging when GSTACK_PTY_LOG=1.
+ * Writes the last 4KB of visible TTY plus context to
+ * ~/.gstack/analytics/pty-snapshots/<testName>-<elapsed>ms.txt.
+ */
+export function logPtySnapshot(visible: string, ctx: { testName: string; elapsedMs: number; tag?: string }): void {
+  if (process.env.GSTACK_PTY_LOG !== '1') return;
+  try {
+    const dir = `${process.env.HOME}/.gstack/analytics/pty-snapshots`;
+    fs.mkdirSync(dir, { recursive: true });
+    const tag = ctx.tag ? `-${ctx.tag}` : '';
+    const file = `${dir}/${ctx.testName}-${ctx.elapsedMs}ms${tag}.txt`;
+    fs.writeFileSync(
+      file,
+      `# testName: ${ctx.testName}\n# elapsedMs: ${ctx.elapsedMs}\n# tag: ${ctx.tag ?? ''}\n# visible.length: ${visible.length}\n\n${visible.slice(-4096)}`,
+    );
+  } catch {
+    /* best-effort */
+  }
+}
+
+/**
+ * Ask Claude Haiku 4.5 to classify a TTY snapshot as waiting/working/hung.
+ *
+ * Implementation: spawns `claude -p --model claude-haiku-4-5` synchronously
+ * with the prompt piped via stdin. Uses subscription auth (no API key env
+ * required). 30-second timeout; returns 'unknown' on any failure mode
+ * (timeout, malformed JSON, missing claude binary).
+ *
+ * Cache: identical snapshot hashes return the cached verdict without
+ * re-calling. Cache lives in-process; resets between test runs.
+ */
+export function judgePtyState(
+  visible: string,
+  ctx?: { testName?: string },
+): PtyStateVerdict {
+  // Normalize: strip trailing whitespace lines + take last 4KB. Hash the
+  // normalized form so spinner-frame-only diffs (which all look "working")
+  // don't bust the cache and rack up cost.
+  const tail = visible.slice(-4096).replace(/[ \t]+$/gm, '');
+  const hash = createHash('sha1').update(tail).digest('hex').slice(0, 16);
+
+  const cached = PTY_VERDICT_CACHE.get(hash);
+  if (cached) return cached;
+
+  const judgeStart = Date.now();
+  const prompt = `You are reading a snapshot of a terminal where Claude Code is running in plan mode for an automated test. Your job: classify the agent's current state.
+
+Pick exactly ONE:
+- WAITING — agent surfaced a question or option list and is sitting at the input prompt waiting for user reply. Signs: numbered/lettered options visible (1./2./3. or A)/B)/C)), "Recommendation:" line, cursor at empty input prompt with no recent generation activity.
+- WORKING — agent is actively generating or running tools. Signs: spinner glyphs (✻ ✶ ✳ ✢ ✽), "Musing..." or "Churned for ..." text, recent tool-call blocks (Read/Edit/Bash/Grep), in-flight token output.
+- HUNG — agent has stopped without surfacing a question and without any spinner/work activity. Rare; usually means a crash.
+
+Respond with strict JSON ONLY (no markdown fences, no prose):
+{"state":"waiting","reasoning":"one short sentence"}
+
+Terminal snapshot (last 4KB):
+\`\`\`
+${tail}
+\`\`\``;
+
+  let verdict: PtyStateVerdict = {
+    state: 'unknown',
+    reasoning: 'judge call did not complete',
+    hash,
+    elapsedMs: 0,
+  };
+
+  try {
+    const result = nodeSpawnSync(
+      'claude',
+      ['-p', '--model', 'claude-haiku-4-5', '--max-turns', '1'],
+      {
+        input: prompt,
+        stdio: ['pipe', 'pipe', 'pipe'],
+        timeout: 30_000,
+        encoding: 'utf-8',
+      },
+    );
+    const elapsedMs = Date.now() - judgeStart;
+    if (result.status === 0 && result.stdout) {
+      // Pull the first {...} JSON object out of stdout. Haiku occasionally
+      // wraps in ```json ...``` despite the prompt; tolerate that.
+      const match = result.stdout.match(/\{[\s\S]*?"state"[\s\S]*?\}/);
+      if (match) {
+        try {
+          const parsed = JSON.parse(match[0]);
+          const state = ['waiting', 'working', 'hung'].includes(parsed.state)
+            ? (parsed.state as 'waiting' | 'working' | 'hung')
+            : 'unknown';
+          verdict = {
+            state,
+            reasoning: typeof parsed.reasoning === 'string' ? parsed.reasoning.slice(0, 200) : '',
+            hash,
+            elapsedMs,
+          };
+        } catch {
+          verdict = { state: 'unknown', reasoning: 'malformed JSON', hash, elapsedMs };
+        }
+      } else {
+        verdict = { state: 'unknown', reasoning: 'no JSON in response', hash, elapsedMs };
+      }
+    } else {
+      verdict = {
+        state: 'unknown',
+        reasoning: `claude exited ${result.status} (${(result.stderr ?? '').slice(0, 80)})`,
+        hash,
+        elapsedMs,
+      };
+    }
+  } catch (err) {
+    verdict = {
+      state: 'unknown',
+      reasoning: `judge spawn failed: ${(err as Error).message}`.slice(0, 200),
+      hash,
+      elapsedMs: Date.now() - judgeStart,
+    };
+  }
+
+  PTY_VERDICT_CACHE.set(hash, verdict);
+  logPtyJudge({
+    ts: new Date().toISOString(),
+    testName: ctx?.testName ?? 'unknown',
+    state: verdict.state,
+    reasoning: verdict.reasoning,
+    hash: verdict.hash,
+    judgeMs: verdict.elapsedMs,
+  });
+  return verdict;
+}
+
 /**
  * Detect a prose-rendered AskUserQuestion in plan mode.
  *
@@ -1287,6 +1470,10 @@ export async function runPlanSkillObservation(opts: {
 
     const budgetMs = opts.timeoutMs ?? 180_000;
     const start = Date.now();
+    let lastJudgeAt = 0;
+    let lastJudgeVerdict: PtyStateVerdict | null = null;
+    const JUDGE_AFTER_MS = 60_000;
+    const JUDGE_INTERVAL_MS = 30_000;
     while (Date.now() - start < budgetMs) {
       await Bun.sleep(2000);
       const visible = session.visibleSince(since);
@@ -1327,11 +1514,35 @@ export async function runPlanSkillObservation(opts: {
         if (planFile) obs.planFile = planFile;
         return obs;
       }
+
+      // LLM judge fallback: if regex detectors didn't classify and we've
+      // burned >60s with periodic ticks, ask Haiku "is the model waiting,
+      // working, or hung?" Treat 'waiting' as 'asked' (model surfaced a
+      // question via prose the regex couldn't reassemble). Snapshot the
+      // visible buffer at each judge call when GSTACK_PTY_LOG=1.
+      const elapsed = Date.now() - start;
+      if (elapsed > JUDGE_AFTER_MS && Date.now() - lastJudgeAt > JUDGE_INTERVAL_MS) {
+        lastJudgeAt = Date.now();
+        logPtySnapshot(visible, { testName: opts.skillName, elapsedMs: elapsed, tag: 'judge-tick' });
+        lastJudgeVerdict = judgePtyState(visible, { testName: opts.skillName });
+        if (lastJudgeVerdict.state === 'waiting') {
+          return {
+            outcome: 'asked',
+            summary: `LLM judge: ${lastJudgeVerdict.reasoning} (state=waiting after ${Math.round(elapsed / 1000)}s)`,
+            evidence: visible.slice(-2000),
+            elapsedMs: Date.now() - startedAt,
+          };
+        }
+      }
     }
 
     return {
       outcome: 'timeout',
-      summary: `no terminal outcome within ${budgetMs}ms`,
+      summary:
+        `no terminal outcome within ${budgetMs}ms` +
+        (lastJudgeVerdict
+          ? ` (last LLM judge: state=${lastJudgeVerdict.state} — ${lastJudgeVerdict.reasoning})`
+          : ''),
       evidence: session.visibleSince(since).slice(-2000),
       elapsedMs: Date.now() - startedAt,
     };
@@ -1696,6 +1907,10 @@ export async function runPlanSkillFloorCheck(opts: {
     session.send(`${opts.followUpPrompt}\r`);
 
     const start = Date.now();
+    let lastJudgeAt = 0;
+    let lastJudgeVerdict: PtyStateVerdict | null = null;
+    const JUDGE_AFTER_MS = 60_000;
+    const JUDGE_INTERVAL_MS = 30_000;
     while (Date.now() - start < timeoutMs) {
       await Bun.sleep(2000);
       const visible = session.visibleSince(since);
@@ -1738,6 +1953,28 @@ export async function runPlanSkillFloorCheck(opts: {
         };
       }
 
+      // LLM judge fallback: same shape as runPlanSkillObservation. After 60s
+      // of polling without a regex hit, ask Haiku to classify the snapshot.
+      // 'waiting' verdict counts as floor met (model surfaced a question via
+      // prose the regex couldn't catch). 'working' / 'hung' / 'unknown' don't
+      // change the outcome — they enrich the eventual timeout summary so the
+      // failure diagnostic is more actionable than "no AUQ render."
+      const elapsed = Date.now() - start;
+      if (elapsed > JUDGE_AFTER_MS && Date.now() - lastJudgeAt > JUDGE_INTERVAL_MS) {
+        lastJudgeAt = Date.now();
+        logPtySnapshot(visible, { testName: opts.skillName, elapsedMs: elapsed, tag: 'floor-judge-tick' });
+        lastJudgeVerdict = judgePtyState(visible, { testName: opts.skillName });
+        if (lastJudgeVerdict.state === 'waiting') {
+          return {
+            auqObserved: true,
+            outcome: 'auq_observed',
+            summary: `LLM judge: ${lastJudgeVerdict.reasoning} (state=waiting after ${Math.round(elapsed / 1000)}s; floor met)`,
+            evidence: visible.slice(-3000),
+            elapsedMs: Date.now() - startedAt,
+          };
+        }
+      }
+
       // Silent write outside sanctioned dirs is the transcript-bug shape.
       const writeRe = /⏺\s*(?:Write|Edit)\(([^)]+)\)/g;
       let m: RegExpExecArray | null;