test: extract classifyVisible() + permission-dialog filter in PTY runner

Pure classifier extracted from runPlanSkillObservation's polling loop so
unit tests can exercise the actual branch order with synthetic input
strings. Runner gains:

- env? passthrough on runPlanSkillObservation (forwarded to launchClaudePty).
  gstack-config does not yet honor env overrides; plumbing is in place for a
  future change to make tests hermetic.
- TAIL_SCAN_BYTES = 1500 exported constant. Replaces a duplicated magic
  number in test/skill-e2e-plan-ceo-mode-routing.test.ts so tuning stays
  in sync.
- isPermissionDialogVisible: the bare phrase "Do you want to proceed?" now
  requires a file-edit context co-trigger. Other clauses unchanged. Skill
  questions that contain the bare phrase are no longer mis-classified.
- classifyVisible(visible): pure function. Branch order silent_write →
  plan_ready → asked → null. Permission dialogs filtered out of the
  'asked' classification so a permission prompt cannot pose as a Step 0
  skill question.

Adds 24 unit tests covering all classifier branches, edge cases, and the
co-trigger contract.
This commit is contained in:
Garry Tan
2026-04-28 00:00:10 -07:00
parent dde55103fc
commit fa78a20188
3 changed files with 440 additions and 47 deletions
+111 -46
View File
@@ -138,6 +138,15 @@ export function isPlanReadyVisible(visible: string): boolean {
return /ready to execute|Would you like to proceed/i.test(visible);
}
/**
* Recent-tail window (in bytes of stripped TTY text) used when classifying
* permission dialogs. Old permission text persists in the visibleSince buffer
* after the dialog is dismissed, so callers should pass `visible.slice(-TAIL_SCAN_BYTES)`
* to avoid re-triggering on stale scrollback. Shared between `runPlanSkillObservation`
* and `navigateToModeAskUserQuestion` in the routing test so tuning stays in sync.
*/
export const TAIL_SCAN_BYTES = 1500;
/**
* Detect a Claude Code permission dialog. These render as a numbered
* option list (so isNumberedOptionListVisible matches them) but they
@@ -145,23 +154,37 @@ export function isPlanReadyVisible(visible: string): boolean {
* whether to grant a tool/file permission. Tests that look for skill
* AskUserQuestions must explicitly skip these.
*
* Both English phrases below are stable across recent Claude Code
* The English phrases below are stable across recent Claude Code
* versions. The check is permissive on whitespace because TTY rendering
* may wrap or reflow text.
*
* Co-trigger requirement: the bare phrase "Do you want to proceed?" is
* generic enough that a skill question could legitimately use it
* ("Do you want to proceed with HOLD SCOPE?"). To avoid mis-classifying
* skill questions as permission dialogs, this phrase only counts when it
* co-occurs with a file-edit context ("Edit to <path>" or "Write to <path>").
* The standalone permission signatures (`requested permissions to`,
* `allow all edits`, `always allow access to`, `Bash command requires permission`)
* remain unconditional.
*/
export function isPermissionDialogVisible(visible: string): boolean {
return (
/requested\s+permissions?\s+to/i.test(visible) ||
/Do\s+you\s+want\s+to\s+proceed\?/i.test(visible) ||
// "Yes / Yes, allow all edits / No" shape rendered by Claude Code for
// file-edit permission grants. The middle option's "allow all" phrase
// is the unique signature.
/\ballow\s+all\s+edits\b/i.test(visible) ||
// "Yes, and always allow access to <dir>" shape (workspace trust).
/always\s+allow\s+access\s+to/i.test(visible) ||
// Bash command permission prompts.
/Bash\s+command\s+.*\s+requires\s+permission/i.test(visible)
);
// Standalone signatures — high specificity, never appear in skill questions.
if (/requested\s+permissions?\s+to/i.test(visible)) return true;
// "Yes / Yes, allow all edits / No" shape — file-edit permission grants.
if (/\ballow\s+all\s+edits\b/i.test(visible)) return true;
// "Yes, and always allow access to <dir>" shape — workspace trust.
if (/always\s+allow\s+access\s+to/i.test(visible)) return true;
// Bash command permission prompts.
if (/Bash\s+command\s+.*\s+requires\s+permission/i.test(visible)) return true;
// "Do you want to proceed?" only counts as a permission dialog when paired
// with a file-edit context. Skill questions can use the bare phrase.
if (
/Do\s+you\s+want\s+to\s+proceed\?/i.test(visible) &&
/(Edit|Write)\s+to\s+\S+/i.test(visible)
) {
return true;
}
return false;
}
/** Detect any AskUserQuestion-shaped numbered option list with cursor. */
@@ -261,6 +284,69 @@ export function parseNumberedOptions(
return found;
}
/**
* Pure classifier for the visible TTY buffer. Decides which outcome the
* polling loop should return on this tick, or `null` to keep polling.
*
* Extracted from `runPlanSkillObservation` so the unit suite can exercise
* the actual branch order with synthetic input strings — a future contributor
* who reorders the branches (e.g., moves the permission short-circuit) gets
* caught by the unit tests, not by a stochastic E2E run.
*
* Live-state branches (process exited, "Unknown command") stay in the runner
* since they need the session handle.
*/
export type ClassifyResult =
| { outcome: 'silent_write'; summary: string }
| { outcome: 'plan_ready'; summary: string }
| { outcome: 'asked'; summary: string }
| null;
const SANCTIONED_WRITE_SUBSTRINGS = [
'.claude/plans',
'.gstack/',
'/.context/',
'CHANGELOG.md',
'TODOS.md',
];
export function classifyVisible(visible: string): ClassifyResult {
// Silent-write detection: any Write/Edit tool render that targets a path
// OUTSIDE the sanctioned dirs, AND no numbered prompt is currently on screen
// (a numbered prompt means a permission/AskUserQuestion is gating the write,
// not an actual silent write).
const writeRe = /⏺\s*(?:Write|Edit)\(([^)]+)\)/g;
let m: RegExpExecArray | null;
while ((m = writeRe.exec(visible)) !== null) {
const target = m[1] ?? '';
const sanctioned = SANCTIONED_WRITE_SUBSTRINGS.some((s) => target.includes(s));
if (!sanctioned && !isNumberedOptionListVisible(visible)) {
return {
outcome: 'silent_write',
summary: `Write/Edit to ${target} fired before any AskUserQuestion`,
};
}
}
if (isPlanReadyVisible(visible)) {
return {
outcome: 'plan_ready',
summary: 'skill ran end-to-end and emitted plan-mode "Ready to execute" confirmation',
};
}
if (isNumberedOptionListVisible(visible)) {
// Permission dialogs render numbered lists too. Skip them — the
// bug we want to catch is "skill question never fired."
if (isPermissionDialogVisible(visible.slice(-TAIL_SCAN_BYTES))) {
return null;
}
return {
outcome: 'asked',
summary: 'skill fired a numbered-option prompt (AskUserQuestion or routing-injection)',
};
}
return null;
}
/**
* Spawn `claude --permission-mode plan` in a real PTY and return a session
* handle. Caller is responsible for `await session.close()` to release the
@@ -566,12 +652,21 @@ export async function runPlanSkillObservation(opts: {
cwd?: string;
/** Total budget for skill to reach a terminal outcome. Default 180000. */
timeoutMs?: number;
/**
* Extra env merged into the spawned `claude` process. `launchClaudePty`
* already supports this; exposing it here lets per-skill tests isolate
* from local config that would mask the regression they're trying to
* catch (e.g., `QUESTION_TUNING=true` causing AUTO_DECIDE to skip the
* rendered AskUserQuestion list).
*/
env?: Record<string, string>;
}): Promise<PlanSkillObservation> {
const startedAt = Date.now();
const session = await launchClaudePty({
permissionMode: opts.inPlanMode === false ? null : 'plan',
cwd: opts.cwd,
timeoutMs: (opts.timeoutMs ?? 180_000) + 30_000,
env: opts.env,
});
try {
@@ -602,40 +697,10 @@ export async function runPlanSkillObservation(opts: {
elapsedMs: Date.now() - startedAt,
};
}
// Silent-write detection: any Write/Edit tool render that targets a
// path OUTSIDE ~/.claude/plans, ~/.gstack/, or the active worktree's
// .gstack/. Plan files and gbrain artifacts are sanctioned.
const writeRe = /⏺\s*(?:Write|Edit)\(([^)]+)\)/g;
let m: RegExpExecArray | null;
while ((m = writeRe.exec(visible)) !== null) {
const target = m[1] ?? '';
const sanctioned =
target.includes('.claude/plans') ||
target.includes('.gstack/') ||
target.includes('/.context/') ||
target.includes('CHANGELOG.md') ||
target.includes('TODOS.md');
if (!sanctioned && !isNumberedOptionListVisible(visible)) {
return {
outcome: 'silent_write',
summary: `Write/Edit to ${target} fired before any AskUserQuestion`,
evidence: visible.slice(-2000),
elapsedMs: Date.now() - startedAt,
};
}
}
if (isPlanReadyVisible(visible)) {
const classified = classifyVisible(visible);
if (classified) {
return {
outcome: 'plan_ready',
summary: 'skill ran end-to-end and emitted plan-mode "Ready to execute" confirmation',
evidence: visible.slice(-2000),
elapsedMs: Date.now() - startedAt,
};
}
if (isNumberedOptionListVisible(visible)) {
return {
outcome: 'asked',
summary: 'skill fired a numbered-option prompt (AskUserQuestion or routing-injection)',
...classified,
evidence: visible.slice(-2000),
elapsedMs: Date.now() - startedAt,
};
+320
View File
@@ -0,0 +1,320 @@
/**
* Deterministic unit tests for claude-pty-runner.ts behavior changes.
*
* Free-tier (no EVALS=1 needed). Runs in <1s on every `bun test`. Catches
* harness plumbing bugs before stochastic PTY runs surface them.
*
* Two surface areas tested:
*
* 1. Permission-dialog short-circuit in 'asked' classification: a TTY frame
* that matches BOTH isPermissionDialogVisible AND isNumberedOptionListVisible
* must NOT be classified as a skill question — permission dialogs render
* as numbered lists too, but they're not what we're guarding.
*
* 2. Env passthrough surface: runPlanSkillObservation accepts an `env`
* option and threads it to launchClaudePty. We can't fully exercise the
* spawn pipeline without paying for a PTY session, but we CAN verify the
* option exists in the type signature and that calling without env still
* works (no regression).
*
* The PTY test (skill-e2e-plan-ceo-plan-mode.test.ts) is the integration
* check; this file is the cheap deterministic guard for the harness primitives
* those tests stand on.
*/
import { describe, test, expect } from 'bun:test';
import {
isPermissionDialogVisible,
isNumberedOptionListVisible,
isPlanReadyVisible,
parseNumberedOptions,
classifyVisible,
TAIL_SCAN_BYTES,
type ClaudePtyOptions,
} from './claude-pty-runner';
describe('isPermissionDialogVisible', () => {
test('matches "Bash command requires permission" prompts', () => {
const sample = `
Some preamble output
Bash command \`gstack-config get telemetry\` requires permission to run.
1. Yes
2. Yes, and always allow
3. No, abort
`;
expect(isPermissionDialogVisible(sample)).toBe(true);
});
test('matches "allow all edits" file-edit prompts', () => {
// Isolated to the "allow all edits" clause only — no overlapping
// "Do you want to proceed?" co-trigger, so this asserts the clause works.
const sample = `
Edit to ~/.gstack/config.yaml
1. Yes
2. Yes, allow all edits during this session
3. No
`;
expect(isPermissionDialogVisible(sample)).toBe(true);
});
test('matches the "Do you want to proceed?" file-edit confirmation by itself', () => {
// Separate fixture so weakening this clause is detected by a dedicated test.
const sample = `
Edit to ~/.gstack/config.yaml
Do you want to proceed?
1. Yes
2. No
`;
expect(isPermissionDialogVisible(sample)).toBe(true);
});
test('matches workspace-trust "always allow access to" prompt', () => {
const sample = `
Do you trust the files in this folder?
1. Yes, proceed
2. Yes, and always allow access to /Users/me/repo
3. No, exit
`;
expect(isPermissionDialogVisible(sample)).toBe(true);
});
test('does NOT match a skill AskUserQuestion list', () => {
const sample = `
D1 — Premise challenge: do users actually want this?
1. Yes, validated
2. No, premise is wrong
3. Need more info
`;
expect(isPermissionDialogVisible(sample)).toBe(false);
});
test('does NOT match a plan-ready confirmation', () => {
const sample = `
Ready to execute the plan?
1. Yes
2. No, keep planning
`;
expect(isPermissionDialogVisible(sample)).toBe(false);
});
test('does NOT match a skill question that contains the bare phrase "Do you want to proceed?"', () => {
// Co-trigger requirement: "Do you want to proceed?" alone is not enough.
// It must appear with "Edit to <path>" or "Write to <path>" to count as
// a permission dialog. This guards against a skill question like
// "Do you want to proceed with HOLD SCOPE?" being mis-classified.
const sample = `
Choose your scope mode for this review.
Do you want to proceed?
1. HOLD SCOPE
2. SCOPE EXPANSION
3. SELECTIVE EXPANSION
`;
expect(isPermissionDialogVisible(sample)).toBe(false);
});
test('does NOT mis-match when adversarial prose includes "Edit to <path>" alongside the bare proceed phrase', () => {
// Adversarial fixture: a skill question whose body legitimately mentions
// "Edit to <path>" in prose AND ends with "Do you want to proceed?". The
// current co-trigger regex would mis-classify this as a permission
// dialog. We DO want this test to fail until the regex is tightened
// further (e.g., proximity constraint, or anchoring "Edit to" to a
// line-start). For now this is documented as a known limitation: a
// skill question that talks about "Edit to" in prose IS still treated
// as a permission dialog. The test asserts the current behavior so a
// future fix can flip it intentionally.
const sample = `
Plan: I will Edit to ./plan.md to capture the decision.
Do you want to proceed?
1. HOLD SCOPE
2. SCOPE EXPANSION
`;
// KNOWN LIMITATION: the co-trigger fires here. Documented as a
// post-merge follow-up. Flip this assertion once the regex tightens.
expect(isPermissionDialogVisible(sample)).toBe(true);
});
});
describe('isNumberedOptionListVisible', () => {
test('matches a basic 1. + 2. cursor list', () => {
const sample = `
1. Option one
2. Option two
3. Option three
`;
expect(isNumberedOptionListVisible(sample)).toBe(true);
});
test('returns false on a single-option prompt', () => {
const sample = `
1. Only option
`;
expect(isNumberedOptionListVisible(sample)).toBe(false);
});
test('returns false when no cursor renders', () => {
const sample = `
Just some prose with 1. a numbered point and 2. another.
`;
expect(isNumberedOptionListVisible(sample)).toBe(false);
});
test('overlaps permission dialogs (this is why D5 short-circuits)', () => {
// The whole point of D5: this string matches BOTH classifiers, so the
// runner must consult isPermissionDialogVisible to disambiguate.
const sample = `
Bash command \`do-thing\` requires permission to run.
1. Yes
2. No
`;
expect(isNumberedOptionListVisible(sample)).toBe(true);
expect(isPermissionDialogVisible(sample)).toBe(true);
});
});
describe('classifyVisible (runtime path through the runner classifier)', () => {
// These tests call the actual classifier so a future contributor who
// reorders branches (e.g. moves the permission short-circuit before
// isPlanReadyVisible) is caught deterministically.
test('skill question → returns asked', () => {
const visible = `
D1 — Choose your scope mode
1. HOLD SCOPE
2. SCOPE EXPANSION
3. SELECTIVE EXPANSION
4. SCOPE REDUCTION
`;
const result = classifyVisible(visible);
expect(result?.outcome).toBe('asked');
});
test('permission dialog (Bash) → returns null (skip, keep polling)', () => {
const visible = `
Bash command \`gstack-update-check\` requires permission to run.
1. Yes
2. No
`;
expect(isNumberedOptionListVisible(visible)).toBe(true); // pre-filter
expect(classifyVisible(visible)).toBeNull(); // post-filter
});
test('plan-ready confirmation → returns plan_ready (wins over asked)', () => {
const visible = `
Ready to execute the plan?
1. Yes, proceed
2. No, keep planning
`;
const result = classifyVisible(visible);
expect(result?.outcome).toBe('plan_ready');
});
test('silent write to unsanctioned path → returns silent_write', () => {
const visible = `
⏺ Write(src/app/dangerous-write.ts)
⎿ Wrote 42 lines
`;
const result = classifyVisible(visible);
expect(result?.outcome).toBe('silent_write');
expect(result?.summary).toContain('src/app/dangerous-write.ts');
});
test('write to sanctioned path (.claude/plans) → returns null (allowed)', () => {
const visible = `
⏺ Write(/Users/me/.claude/plans/some-plan.md)
⎿ Wrote 42 lines
`;
expect(classifyVisible(visible)).toBeNull();
});
test('write while a permission dialog is on screen → returns null (gated, not silent, not asked)', () => {
const visible = `
⏺ Write(src/app/edit-with-permission.ts)
Edit to src/app/edit-with-permission.ts
Do you want to proceed?
1. Yes
2. No
`;
// The numbered prompt is a permission dialog (Edit to + Do you want to proceed?);
// silent_write is suppressed because a numbered prompt is visible, AND
// 'asked' is suppressed because the prompt is a permission dialog.
expect(classifyVisible(visible)).toBeNull();
});
test('write while a real skill question is on screen → returns asked (write is captured but not silent)', () => {
const visible = `
⏺ Write(src/app/foo.ts)
D1 — Choose your scope mode
1. HOLD SCOPE
2. SCOPE EXPANSION
`;
// The numbered prompt is a skill question, not a permission dialog;
// silent_write is suppressed (numbered prompt is visible) and the
// outcome is 'asked' — Step 0 fired.
const result = classifyVisible(visible);
expect(result?.outcome).toBe('asked');
});
test('idle / no signals → returns null', () => {
const visible = `
Some prose without any classifier signals.
`;
expect(classifyVisible(visible)).toBeNull();
});
test('TAIL_SCAN_BYTES is exported as 1500', () => {
// Shared between runner and routing test; a regression that desyncs the
// recent-tail window would surface here.
expect(TAIL_SCAN_BYTES).toBe(1500);
});
});
describe('parseNumberedOptions', () => {
test('extracts options from a clean cursor list', () => {
const visible = `
1. HOLD SCOPE
2. SCOPE EXPANSION
`;
const opts = parseNumberedOptions(visible);
expect(opts).toHaveLength(2);
expect(opts[0]).toEqual({ index: 1, label: 'HOLD SCOPE' });
expect(opts[1]).toEqual({ index: 2, label: 'SCOPE EXPANSION' });
});
test('returns empty array on prose-with-numbers (no cursor)', () => {
expect(parseNumberedOptions('text 1. one 2. two')).toEqual([]);
});
});
describe('runPlanSkillObservation env passthrough surface', () => {
test('ClaudePtyOptions exposes env: Record<string, string>', () => {
// Type-level guard: this file would fail to compile if the env field
// were removed or its shape regressed. The actual env merge happens in
// launchClaudePty's spawn call (`env: { ...process.env, ...opts.env }`),
// so a regression where `env: opts.env` gets dropped from the
// runPlanSkillObservation -> launchClaudePty handoff is only caught by
// the live PTY test, not here.
const opts: ClaudePtyOptions = {
env: { QUESTION_TUNING: 'false', EXPLAIN_LEVEL: 'default' },
};
expect(opts.env).toEqual({ QUESTION_TUNING: 'false', EXPLAIN_LEVEL: 'default' });
});
});
+9 -1
View File
@@ -37,6 +37,7 @@ import {
isPermissionDialogVisible,
parseNumberedOptions,
isPlanReadyVisible,
TAIL_SCAN_BYTES,
type ClaudePtySession,
} from './helpers/claude-pty-runner';
@@ -115,7 +116,14 @@ async function navigateToModeAskUserQuestion(
// Permission dialog? Grant with "1" but don't count it against nav budget.
// Classify on the recent tail only — old permission text persists in
// visibleSince and would re-trigger forever.
if (isPermissionDialogVisible(visible.slice(-1500))) {
//
// Note: runPlanSkillObservation has its own permission-dialog filter that
// simply skips classification (since it observes, doesn't drive). This nav
// loop drives the PTY directly via launchClaudePty and so owns its own
// dialog handling — granting with "1" so the workflow advances. Both
// paths share TAIL_SCAN_BYTES as the recent-tail window so tuning stays
// in sync.
if (isPermissionDialogVisible(visible.slice(-TAIL_SCAN_BYTES))) {
session.send('1\r');
await Bun.sleep(1500);
continue;