test: extract classifyVisible() + permission-dialog filter in PTY runner

Pure classifier extracted from runPlanSkillObservation's polling loop so unit tests can exercise the actual branch order with synthetic input strings. Runner gains: - env? passthrough on runPlanSkillObservation (forwarded to launchClaudePty). gstack-config does not yet honor env overrides; plumbing is in place for a future change to make tests hermetic. - TAIL_SCAN_BYTES = 1500 exported constant. Replaces a duplicated magic number in test/skill-e2e-plan-ceo-mode-routing.test.ts so tuning stays in sync. - isPermissionDialogVisible: the bare phrase "Do you want to proceed?" now requires a file-edit context co-trigger. Other clauses unchanged. Skill questions that contain the bare phrase are no longer mis-classified. - classifyVisible(visible): pure function. Branch order silent_write → plan_ready → asked → null. Permission dialogs filtered out of the 'asked' classification so a permission prompt cannot pose as a Step 0 skill question. Adds 24 unit tests covering all classifier branches, edge cases, and the co-trigger contract.
2026-05-02 11:45:20 +02:00 · 2026-04-28 00:00:10 -07:00
parent dde55103fc
commit fa78a20188
3 changed files with 440 additions and 47 deletions
@@ -138,6 +138,15 @@ export function isPlanReadyVisible(visible: string): boolean {
  return /ready to execute|Would you like to proceed/i.test(visible);
 }

+/**
+ * Recent-tail window (in bytes of stripped TTY text) used when classifying
+ * permission dialogs. Old permission text persists in the visibleSince buffer
+ * after the dialog is dismissed, so callers should pass `visible.slice(-TAIL_SCAN_BYTES)`
+ * to avoid re-triggering on stale scrollback. Shared between `runPlanSkillObservation`
+ * and `navigateToModeAskUserQuestion` in the routing test so tuning stays in sync.
+ */
+export const TAIL_SCAN_BYTES = 1500;
+
 /**
 * Detect a Claude Code permission dialog. These render as a numbered
 * option list (so isNumberedOptionListVisible matches them) but they
@@ -145,23 +154,37 @@ export function isPlanReadyVisible(visible: string): boolean {
 * whether to grant a tool/file permission. Tests that look for skill
 * AskUserQuestions must explicitly skip these.
 *
- * Both English phrases below are stable across recent Claude Code
+ * The English phrases below are stable across recent Claude Code
 * versions. The check is permissive on whitespace because TTY rendering
 * may wrap or reflow text.
+ *
+ * Co-trigger requirement: the bare phrase "Do you want to proceed?" is
+ * generic enough that a skill question could legitimately use it
+ * ("Do you want to proceed with HOLD SCOPE?"). To avoid mis-classifying
+ * skill questions as permission dialogs, this phrase only counts when it
+ * co-occurs with a file-edit context ("Edit to <path>" or "Write to <path>").
+ * The standalone permission signatures (`requested permissions to`,
+ * `allow all edits`, `always allow access to`, `Bash command requires permission`)
+ * remain unconditional.
 */
 export function isPermissionDialogVisible(visible: string): boolean {
-  return (
-    /requested\s+permissions?\s+to/i.test(visible) ||
-    /Do\s+you\s+want\s+to\s+proceed\?/i.test(visible) ||
-    // "Yes / Yes, allow all edits / No" shape rendered by Claude Code for
-    // file-edit permission grants. The middle option's "allow all" phrase
-    // is the unique signature.
-    /\ballow\s+all\s+edits\b/i.test(visible) ||
-    // "Yes, and always allow access to <dir>" shape (workspace trust).
-    /always\s+allow\s+access\s+to/i.test(visible) ||
-    // Bash command permission prompts.
-    /Bash\s+command\s+.*\s+requires\s+permission/i.test(visible)
-  );
+  // Standalone signatures — high specificity, never appear in skill questions.
+  if (/requested\s+permissions?\s+to/i.test(visible)) return true;
+  // "Yes / Yes, allow all edits / No" shape — file-edit permission grants.
+  if (/\ballow\s+all\s+edits\b/i.test(visible)) return true;
+  // "Yes, and always allow access to <dir>" shape — workspace trust.
+  if (/always\s+allow\s+access\s+to/i.test(visible)) return true;
+  // Bash command permission prompts.
+  if (/Bash\s+command\s+.*\s+requires\s+permission/i.test(visible)) return true;
+  // "Do you want to proceed?" only counts as a permission dialog when paired
+  // with a file-edit context. Skill questions can use the bare phrase.
+  if (
+    /Do\s+you\s+want\s+to\s+proceed\?/i.test(visible) &&
+    /(Edit|Write)\s+to\s+\S+/i.test(visible)
+  ) {
+    return true;
+  }
+  return false;
 }

 /** Detect any AskUserQuestion-shaped numbered option list with cursor. */
@@ -261,6 +284,69 @@ export function parseNumberedOptions(
  return found;
 }

+/**
+ * Pure classifier for the visible TTY buffer. Decides which outcome the
+ * polling loop should return on this tick, or `null` to keep polling.
+ *
+ * Extracted from `runPlanSkillObservation` so the unit suite can exercise
+ * the actual branch order with synthetic input strings — a future contributor
+ * who reorders the branches (e.g., moves the permission short-circuit) gets
+ * caught by the unit tests, not by a stochastic E2E run.
+ *
+ * Live-state branches (process exited, "Unknown command") stay in the runner
+ * since they need the session handle.
+ */
+export type ClassifyResult =
+  | { outcome: 'silent_write'; summary: string }
+  | { outcome: 'plan_ready'; summary: string }
+  | { outcome: 'asked'; summary: string }
+  | null;
+
+const SANCTIONED_WRITE_SUBSTRINGS = [
+  '.claude/plans',
+  '.gstack/',
+  '/.context/',
+  'CHANGELOG.md',
+  'TODOS.md',
+];
+
+export function classifyVisible(visible: string): ClassifyResult {
+  // Silent-write detection: any Write/Edit tool render that targets a path
+  // OUTSIDE the sanctioned dirs, AND no numbered prompt is currently on screen
+  // (a numbered prompt means a permission/AskUserQuestion is gating the write,
+  // not an actual silent write).
+  const writeRe = /⏺\s*(?:Write|Edit)\(([^)]+)\)/g;
+  let m: RegExpExecArray | null;
+  while ((m = writeRe.exec(visible)) !== null) {
+    const target = m[1] ?? '';
+    const sanctioned = SANCTIONED_WRITE_SUBSTRINGS.some((s) => target.includes(s));
+    if (!sanctioned && !isNumberedOptionListVisible(visible)) {
+      return {
+        outcome: 'silent_write',
+        summary: `Write/Edit to ${target} fired before any AskUserQuestion`,
+      };
+    }
+  }
+  if (isPlanReadyVisible(visible)) {
+    return {
+      outcome: 'plan_ready',
+      summary: 'skill ran end-to-end and emitted plan-mode "Ready to execute" confirmation',
+    };
+  }
+  if (isNumberedOptionListVisible(visible)) {
+    // Permission dialogs render numbered lists too. Skip them — the
+    // bug we want to catch is "skill question never fired."
+    if (isPermissionDialogVisible(visible.slice(-TAIL_SCAN_BYTES))) {
+      return null;
+    }
+    return {
+      outcome: 'asked',
+      summary: 'skill fired a numbered-option prompt (AskUserQuestion or routing-injection)',
+    };
+  }
+  return null;
+}
+
 /**
 * Spawn `claude --permission-mode plan` in a real PTY and return a session
 * handle. Caller is responsible for `await session.close()` to release the
@@ -566,12 +652,21 @@ export async function runPlanSkillObservation(opts: {
  cwd?: string;
  /** Total budget for skill to reach a terminal outcome. Default 180000. */
  timeoutMs?: number;
+  /**
+   * Extra env merged into the spawned `claude` process. `launchClaudePty`
+   * already supports this; exposing it here lets per-skill tests isolate
+   * from local config that would mask the regression they're trying to
+   * catch (e.g., `QUESTION_TUNING=true` causing AUTO_DECIDE to skip the
+   * rendered AskUserQuestion list).
+   */
+  env?: Record<string, string>;
 }): Promise<PlanSkillObservation> {
  const startedAt = Date.now();
  const session = await launchClaudePty({
    permissionMode: opts.inPlanMode === false ? null : 'plan',
    cwd: opts.cwd,
    timeoutMs: (opts.timeoutMs ?? 180_000) + 30_000,
+    env: opts.env,
  });

  try {
@@ -602,40 +697,10 @@ export async function runPlanSkillObservation(opts: {
          elapsedMs: Date.now() - startedAt,
        };
      }
-      // Silent-write detection: any Write/Edit tool render that targets a
-      // path OUTSIDE ~/.claude/plans, ~/.gstack/, or the active worktree's
-      // .gstack/. Plan files and gbrain artifacts are sanctioned.
-      const writeRe = /⏺\s*(?:Write|Edit)\(([^)]+)\)/g;
-      let m: RegExpExecArray | null;
-      while ((m = writeRe.exec(visible)) !== null) {
-        const target = m[1] ?? '';
-        const sanctioned =
-          target.includes('.claude/plans') ||
-          target.includes('.gstack/') ||
-          target.includes('/.context/') ||
-          target.includes('CHANGELOG.md') ||
-          target.includes('TODOS.md');
-        if (!sanctioned && !isNumberedOptionListVisible(visible)) {
-          return {
-            outcome: 'silent_write',
-            summary: `Write/Edit to ${target} fired before any AskUserQuestion`,
-            evidence: visible.slice(-2000),
-            elapsedMs: Date.now() - startedAt,
-          };
-        }
-      }
-      if (isPlanReadyVisible(visible)) {
+      const classified = classifyVisible(visible);
+      if (classified) {
        return {
-          outcome: 'plan_ready',
-          summary: 'skill ran end-to-end and emitted plan-mode "Ready to execute" confirmation',
-          evidence: visible.slice(-2000),
-          elapsedMs: Date.now() - startedAt,
-        };
-      }
-      if (isNumberedOptionListVisible(visible)) {
-        return {
-          outcome: 'asked',
-          summary: 'skill fired a numbered-option prompt (AskUserQuestion or routing-injection)',
+          ...classified,
          evidence: visible.slice(-2000),
          elapsedMs: Date.now() - startedAt,
        };
@@ -0,0 +1,320 @@
+/**
+ * Deterministic unit tests for claude-pty-runner.ts behavior changes.
+ *
+ * Free-tier (no EVALS=1 needed). Runs in <1s on every `bun test`. Catches
+ * harness plumbing bugs before stochastic PTY runs surface them.
+ *
+ * Two surface areas tested:
+ *
+ * 1. Permission-dialog short-circuit in 'asked' classification: a TTY frame
+ *    that matches BOTH isPermissionDialogVisible AND isNumberedOptionListVisible
+ *    must NOT be classified as a skill question — permission dialogs render
+ *    as numbered lists too, but they're not what we're guarding.
+ *
+ * 2. Env passthrough surface: runPlanSkillObservation accepts an `env`
+ *    option and threads it to launchClaudePty. We can't fully exercise the
+ *    spawn pipeline without paying for a PTY session, but we CAN verify the
+ *    option exists in the type signature and that calling without env still
+ *    works (no regression).
+ *
+ * The PTY test (skill-e2e-plan-ceo-plan-mode.test.ts) is the integration
+ * check; this file is the cheap deterministic guard for the harness primitives
+ * those tests stand on.
+ */
+
+import { describe, test, expect } from 'bun:test';
+import {
+  isPermissionDialogVisible,
+  isNumberedOptionListVisible,
+  isPlanReadyVisible,
+  parseNumberedOptions,
+  classifyVisible,
+  TAIL_SCAN_BYTES,
+  type ClaudePtyOptions,
+} from './claude-pty-runner';
+
+describe('isPermissionDialogVisible', () => {
+  test('matches "Bash command requires permission" prompts', () => {
+    const sample = `
+      Some preamble output
+
+      Bash command \`gstack-config get telemetry\` requires permission to run.
+
+      ❯ 1. Yes
+        2. Yes, and always allow
+        3. No, abort
+    `;
+    expect(isPermissionDialogVisible(sample)).toBe(true);
+  });
+
+  test('matches "allow all edits" file-edit prompts', () => {
+    // Isolated to the "allow all edits" clause only — no overlapping
+    // "Do you want to proceed?" co-trigger, so this asserts the clause works.
+    const sample = `
+      Edit to ~/.gstack/config.yaml
+
+      ❯ 1. Yes
+        2. Yes, allow all edits during this session
+        3. No
+    `;
+    expect(isPermissionDialogVisible(sample)).toBe(true);
+  });
+
+  test('matches the "Do you want to proceed?" file-edit confirmation by itself', () => {
+    // Separate fixture so weakening this clause is detected by a dedicated test.
+    const sample = `
+      Edit to ~/.gstack/config.yaml
+
+      Do you want to proceed?
+
+      ❯ 1. Yes
+        2. No
+    `;
+    expect(isPermissionDialogVisible(sample)).toBe(true);
+  });
+
+  test('matches workspace-trust "always allow access to" prompt', () => {
+    const sample = `
+      Do you trust the files in this folder?
+
+      ❯ 1. Yes, proceed
+        2. Yes, and always allow access to /Users/me/repo
+        3. No, exit
+    `;
+    expect(isPermissionDialogVisible(sample)).toBe(true);
+  });
+
+  test('does NOT match a skill AskUserQuestion list', () => {
+    const sample = `
+      D1 — Premise challenge: do users actually want this?
+
+      ❯ 1. Yes, validated
+        2. No, premise is wrong
+        3. Need more info
+    `;
+    expect(isPermissionDialogVisible(sample)).toBe(false);
+  });
+
+  test('does NOT match a plan-ready confirmation', () => {
+    const sample = `
+      Ready to execute the plan?
+
+      ❯ 1. Yes
+        2. No, keep planning
+    `;
+    expect(isPermissionDialogVisible(sample)).toBe(false);
+  });
+
+  test('does NOT match a skill question that contains the bare phrase "Do you want to proceed?"', () => {
+    // Co-trigger requirement: "Do you want to proceed?" alone is not enough.
+    // It must appear with "Edit to <path>" or "Write to <path>" to count as
+    // a permission dialog. This guards against a skill question like
+    // "Do you want to proceed with HOLD SCOPE?" being mis-classified.
+    const sample = `
+      Choose your scope mode for this review.
+      Do you want to proceed?
+
+      ❯ 1. HOLD SCOPE
+        2. SCOPE EXPANSION
+        3. SELECTIVE EXPANSION
+    `;
+    expect(isPermissionDialogVisible(sample)).toBe(false);
+  });
+
+  test('does NOT mis-match when adversarial prose includes "Edit to <path>" alongside the bare proceed phrase', () => {
+    // Adversarial fixture: a skill question whose body legitimately mentions
+    // "Edit to <path>" in prose AND ends with "Do you want to proceed?". The
+    // current co-trigger regex would mis-classify this as a permission
+    // dialog. We DO want this test to fail until the regex is tightened
+    // further (e.g., proximity constraint, or anchoring "Edit to" to a
+    // line-start). For now this is documented as a known limitation: a
+    // skill question that talks about "Edit to" in prose IS still treated
+    // as a permission dialog. The test asserts the current behavior so a
+    // future fix can flip it intentionally.
+    const sample = `
+      Plan: I will Edit to ./plan.md to capture the decision.
+      Do you want to proceed?
+
+      ❯ 1. HOLD SCOPE
+        2. SCOPE EXPANSION
+    `;
+    // KNOWN LIMITATION: the co-trigger fires here. Documented as a
+    // post-merge follow-up. Flip this assertion once the regex tightens.
+    expect(isPermissionDialogVisible(sample)).toBe(true);
+  });
+});
+
+describe('isNumberedOptionListVisible', () => {
+  test('matches a basic ❯ 1. + 2. cursor list', () => {
+    const sample = `
+      ❯ 1. Option one
+        2. Option two
+        3. Option three
+    `;
+    expect(isNumberedOptionListVisible(sample)).toBe(true);
+  });
+
+  test('returns false on a single-option prompt', () => {
+    const sample = `
+      ❯ 1. Only option
+    `;
+    expect(isNumberedOptionListVisible(sample)).toBe(false);
+  });
+
+  test('returns false when no cursor renders', () => {
+    const sample = `
+      Just some prose with 1. a numbered point and 2. another.
+    `;
+    expect(isNumberedOptionListVisible(sample)).toBe(false);
+  });
+
+  test('overlaps permission dialogs (this is why D5 short-circuits)', () => {
+    // The whole point of D5: this string matches BOTH classifiers, so the
+    // runner must consult isPermissionDialogVisible to disambiguate.
+    const sample = `
+      Bash command \`do-thing\` requires permission to run.
+
+      ❯ 1. Yes
+        2. No
+    `;
+    expect(isNumberedOptionListVisible(sample)).toBe(true);
+    expect(isPermissionDialogVisible(sample)).toBe(true);
+  });
+});
+
+describe('classifyVisible (runtime path through the runner classifier)', () => {
+  // These tests call the actual classifier so a future contributor who
+  // reorders branches (e.g. moves the permission short-circuit before
+  // isPlanReadyVisible) is caught deterministically.
+
+  test('skill question → returns asked', () => {
+    const visible = `
+      D1 — Choose your scope mode
+
+      ❯ 1. HOLD SCOPE
+        2. SCOPE EXPANSION
+        3. SELECTIVE EXPANSION
+        4. SCOPE REDUCTION
+    `;
+    const result = classifyVisible(visible);
+    expect(result?.outcome).toBe('asked');
+  });
+
+  test('permission dialog (Bash) → returns null (skip, keep polling)', () => {
+    const visible = `
+      Bash command \`gstack-update-check\` requires permission to run.
+
+      ❯ 1. Yes
+        2. No
+    `;
+    expect(isNumberedOptionListVisible(visible)).toBe(true); // pre-filter
+    expect(classifyVisible(visible)).toBeNull(); // post-filter
+  });
+
+  test('plan-ready confirmation → returns plan_ready (wins over asked)', () => {
+    const visible = `
+      Ready to execute the plan?
+
+      ❯ 1. Yes, proceed
+        2. No, keep planning
+    `;
+    const result = classifyVisible(visible);
+    expect(result?.outcome).toBe('plan_ready');
+  });
+
+  test('silent write to unsanctioned path → returns silent_write', () => {
+    const visible = `
+      ⏺ Write(src/app/dangerous-write.ts)
+      ⎿  Wrote 42 lines
+    `;
+    const result = classifyVisible(visible);
+    expect(result?.outcome).toBe('silent_write');
+    expect(result?.summary).toContain('src/app/dangerous-write.ts');
+  });
+
+  test('write to sanctioned path (.claude/plans) → returns null (allowed)', () => {
+    const visible = `
+      ⏺ Write(/Users/me/.claude/plans/some-plan.md)
+      ⎿  Wrote 42 lines
+    `;
+    expect(classifyVisible(visible)).toBeNull();
+  });
+
+  test('write while a permission dialog is on screen → returns null (gated, not silent, not asked)', () => {
+    const visible = `
+      ⏺ Write(src/app/edit-with-permission.ts)
+
+      Edit to src/app/edit-with-permission.ts
+
+      Do you want to proceed?
+
+      ❯ 1. Yes
+        2. No
+    `;
+    // The numbered prompt is a permission dialog (Edit to + Do you want to proceed?);
+    // silent_write is suppressed because a numbered prompt is visible, AND
+    // 'asked' is suppressed because the prompt is a permission dialog.
+    expect(classifyVisible(visible)).toBeNull();
+  });
+
+  test('write while a real skill question is on screen → returns asked (write is captured but not silent)', () => {
+    const visible = `
+      ⏺ Write(src/app/foo.ts)
+
+      D1 — Choose your scope mode
+
+      ❯ 1. HOLD SCOPE
+        2. SCOPE EXPANSION
+    `;
+    // The numbered prompt is a skill question, not a permission dialog;
+    // silent_write is suppressed (numbered prompt is visible) and the
+    // outcome is 'asked' — Step 0 fired.
+    const result = classifyVisible(visible);
+    expect(result?.outcome).toBe('asked');
+  });
+
+  test('idle / no signals → returns null', () => {
+    const visible = `
+      Some prose without any classifier signals.
+    `;
+    expect(classifyVisible(visible)).toBeNull();
+  });
+
+  test('TAIL_SCAN_BYTES is exported as 1500', () => {
+    // Shared between runner and routing test; a regression that desyncs the
+    // recent-tail window would surface here.
+    expect(TAIL_SCAN_BYTES).toBe(1500);
+  });
+});
+
+describe('parseNumberedOptions', () => {
+  test('extracts options from a clean cursor list', () => {
+    const visible = `
+      ❯ 1. HOLD SCOPE
+        2. SCOPE EXPANSION
+    `;
+    const opts = parseNumberedOptions(visible);
+    expect(opts).toHaveLength(2);
+    expect(opts[0]).toEqual({ index: 1, label: 'HOLD SCOPE' });
+    expect(opts[1]).toEqual({ index: 2, label: 'SCOPE EXPANSION' });
+  });
+
+  test('returns empty array on prose-with-numbers (no cursor)', () => {
+    expect(parseNumberedOptions('text 1. one 2. two')).toEqual([]);
+  });
+});
+
+describe('runPlanSkillObservation env passthrough surface', () => {
+  test('ClaudePtyOptions exposes env: Record<string, string>', () => {
+    // Type-level guard: this file would fail to compile if the env field
+    // were removed or its shape regressed. The actual env merge happens in
+    // launchClaudePty's spawn call (`env: { ...process.env, ...opts.env }`),
+    // so a regression where `env: opts.env` gets dropped from the
+    // runPlanSkillObservation -> launchClaudePty handoff is only caught by
+    // the live PTY test, not here.
+    const opts: ClaudePtyOptions = {
+      env: { QUESTION_TUNING: 'false', EXPLAIN_LEVEL: 'default' },
+    };
+    expect(opts.env).toEqual({ QUESTION_TUNING: 'false', EXPLAIN_LEVEL: 'default' });
+  });
+});
@@ -37,6 +37,7 @@ import {
  isPermissionDialogVisible,
  parseNumberedOptions,
  isPlanReadyVisible,
+  TAIL_SCAN_BYTES,
  type ClaudePtySession,
 } from './helpers/claude-pty-runner';

@@ -115,7 +116,14 @@ async function navigateToModeAskUserQuestion(
    // Permission dialog? Grant with "1" but don't count it against nav budget.
    // Classify on the recent tail only — old permission text persists in
    // visibleSince and would re-trigger forever.
-    if (isPermissionDialogVisible(visible.slice(-1500))) {
+    //
+    // Note: runPlanSkillObservation has its own permission-dialog filter that
+    // simply skips classification (since it observes, doesn't drive). This nav
+    // loop drives the PTY directly via launchClaudePty and so owns its own
+    // dialog handling — granting with "1" so the workflow advances. Both
+    // paths share TAIL_SCAN_BYTES as the recent-tail window so tuning stays
+    // in sync.
+    if (isPermissionDialogVisible(visible.slice(-TAIL_SCAN_BYTES))) {
      session.send('1\r');
      await Bun.sleep(1500);
      continue;