From 474f034109bb4da03d4675f9b11bb019923711c4 Mon Sep 17 00:00:00 2001 From: Garry Tan Date: Sun, 26 Apr 2026 04:36:08 -0700 Subject: [PATCH] =?UTF-8?q?feat(test):=20harness=20primitives=20=E2=80=94?= =?UTF-8?q?=20parseNumberedOptions=20+=20budget=20regression=20utils?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit claude-pty-runner.ts: - parseNumberedOptions(visible) anchors on the latest "❯ 1." cursor and returns {index, label}[]; tests that route on option labels can find indices without hard-coding positions - isPermissionDialogVisible(visible) detects file-grant + workspace-trust + bash-permission shapes (multiple regex variants) - isNumberedOptionListVisible: replaced \b2\. word-boundary regex with [^0-9]2\. — stripAnsi removes TTY cursor-positioning escapes that collapse "Option 2." to "Option2.", and \b fails on word-to-word eval-store.ts: - findBudgetRegressions(comparison, opts?) — pure function returning tests where tools or turns grew >cap× vs prior run; floors at 5 prior tools / 3 prior turns to avoid noise on tiny numbers - assertNoBudgetRegression() — wrapper that throws with full violation list. Env override GSTACK_BUDGET_RATIO helpers-unit.test.ts: 23 unit tests covering empty/sparse/wrap-around buffers for parseNumberedOptions, plus regression-floor + env-override cases for findBudgetRegressions/assertNoBudgetRegression. Co-Authored-By: Claude Opus 4.7 (1M context) --- test/helpers-unit.test.ts | 290 ++++++++++++++++++++++++++++++ test/helpers/claude-pty-runner.ts | 117 +++++++++++- test/helpers/eval-store.ts | 65 +++++++ 3 files changed, 471 insertions(+), 1 deletion(-) create mode 100644 test/helpers-unit.test.ts diff --git a/test/helpers-unit.test.ts b/test/helpers-unit.test.ts new file mode 100644 index 00000000..c57ff0d2 --- /dev/null +++ b/test/helpers-unit.test.ts @@ -0,0 +1,290 @@ +/** + * Unit tests for two helpers added alongside the new real-PTY E2E tests: + * + * - parseNumberedOptions(visible) + * Parses `❯ 1.` / ` 2.` numbered-option lines out of TTY text. + * Used by the AUQ format-compliance and mode-routing tests to look + * up an option index by its label without hard-coding positions. + * + * - findBudgetRegressions / assertNoBudgetRegression(comparison) + * Computes which tests grew >2× in tool calls or turns vs the prior + * eval run. Used by the budget-regression test. + * + * Free, deterministic, runs under `bun test`. + */ + +import { describe, test, expect } from 'bun:test'; +import { parseNumberedOptions } from './helpers/claude-pty-runner'; +import { + assertNoBudgetRegression, + findBudgetRegressions, + type ComparisonResult, + type TestDelta, +} from './helpers/eval-store'; + +// --- parseNumberedOptions --- + +describe('parseNumberedOptions', () => { + test('returns [] for empty input', () => { + expect(parseNumberedOptions('')).toEqual([]); + }); + + test('returns [] when no numbered list is rendered', () => { + expect(parseNumberedOptions('just some prose with no list')).toEqual([]); + }); + + test('parses a basic 3-option list with cursor on first', () => { + const visible = [ + 'Some prompt prose above.', + '', + '❯ 1. HOLD SCOPE', + ' 2. SCOPE EXPANSION', + ' 3. SELECTIVE EXPANSION', + '', + ].join('\n'); + expect(parseNumberedOptions(visible)).toEqual([ + { index: 1, label: 'HOLD SCOPE' }, + { index: 2, label: 'SCOPE EXPANSION' }, + { index: 3, label: 'SELECTIVE EXPANSION' }, + ]); + }); + + test('parses cursor on a non-first option', () => { + const visible = [ + ' 1. Option A', + '❯ 2. Option B', + ' 3. Option C', + ].join('\n'); + const opts = parseNumberedOptions(visible); + expect(opts.map(o => o.index)).toEqual([1, 2, 3]); + expect(opts.map(o => o.label)).toEqual(['Option A', 'Option B', 'Option C']); + }); + + test('handles 9 options (max single-digit)', () => { + const lines = ['❯ 1. one']; + for (let i = 2; i <= 9; i++) lines.push(` ${i}. opt${i}`); + const opts = parseNumberedOptions(lines.join('\n')); + expect(opts.length).toBe(9); + expect(opts[8]).toEqual({ index: 9, label: 'opt9' }); + }); + + test('truncates at first sequence gap', () => { + // Real bug shape: prose contains "1. blah" and "2. blah" then a real + // option list shows up later. We only return the consecutive run that + // starts at 1. + const visible = [ + '❯ 1. Real option', + ' 2. Other real option', + 'some prose', + ' 4. Stray number', + ].join('\n'); + expect(parseNumberedOptions(visible)).toEqual([ + { index: 1, label: 'Real option' }, + { index: 2, label: 'Other real option' }, + ]); + }); + + test('returns [] when sequence does not start at 1', () => { + const visible = [' 3. orphan', ' 4. orphan'].join('\n'); + expect(parseNumberedOptions(visible)).toEqual([]); + }); + + test('returns [] for a single option (need at least 2 to be a real list)', () => { + expect(parseNumberedOptions('❯ 1. lonely')).toEqual([]); + }); + + test('preserves trailing markers on labels (e.g. recommended)', () => { + const visible = [ + '❯ 1. Cover all 4 modes (recommended)', + ' 2. Just HOLD + EXPANSION', + ].join('\n'); + const opts = parseNumberedOptions(visible); + expect(opts[0]!.label).toContain('(recommended)'); + }); + + test('only matches the most recent list when buffer is large', () => { + // First (stale) list, then >4KB of intervening text, then the real list. + // parseNumberedOptions reads only the last 4KB, so the stale list is + // dropped — this is the desired behavior for tests that re-open the + // session and want the current prompt only. + const stale = ['❯ 1. STALE_A', ' 2. STALE_B'].join('\n'); + const filler = 'x'.repeat(5000); + const fresh = ['❯ 1. FRESH_A', ' 2. FRESH_B'].join('\n'); + const visible = stale + '\n' + filler + '\n' + fresh; + const opts = parseNumberedOptions(visible); + expect(opts.map(o => o.label)).toEqual(['FRESH_A', 'FRESH_B']); + }); + + test('anchors on LAST cursor when both stale and fresh fit in the tail', () => { + // Both lists fit in the same 4KB tail (small buffer). The granted + // permission dialog options come first, the real AUQ comes second. + // We must return the FRESH options, not the STALE ones. + const visible = [ + '❯ 1. STALE_grant', + ' 2. STALE_deny', + 'some narration the agent printed after we granted', + 'and a few more lines of bash output', + '❯ 1. FRESH_keep', + ' 2. FRESH_drop', + ].join('\n'); + const opts = parseNumberedOptions(visible); + expect(opts.map(o => o.label)).toEqual(['FRESH_keep', 'FRESH_drop']); + }); + + test('falls back to last `1.` if cursor is not currently rendered on option 1', () => { + // The user pressed Down, so cursor is on option 2; but the parser + // should still return options 1+2 by anchoring on the last `1.` line. + const visible = [ + ' 1. Option A', + '❯ 2. Option B', + ' 3. Option C', + ].join('\n'); + const opts = parseNumberedOptions(visible); + expect(opts.map(o => o.label)).toEqual(['Option A', 'Option B', 'Option C']); + }); +}); + +// --- findBudgetRegressions / assertNoBudgetRegression --- + +function makeDelta( + name: string, + beforeTools: Record, + afterTools: Record, + beforeTurns?: number, + afterTurns?: number, +): TestDelta { + return { + name, + before: { passed: true, cost_usd: 0, tool_summary: beforeTools, turns_used: beforeTurns }, + after: { passed: true, cost_usd: 0, tool_summary: afterTools, turns_used: afterTurns }, + status_change: 'unchanged', + }; +} + +function makeComparison(deltas: TestDelta[]): ComparisonResult { + return { + before_file: '/tmp/before.json', + after_file: '/tmp/after.json', + before_branch: 'main', + after_branch: 'feat/x', + before_timestamp: '2025-01-01T00:00:00Z', + after_timestamp: '2025-01-02T00:00:00Z', + deltas, + total_cost_delta: 0, + total_duration_delta: 0, + improved: 0, + regressed: 0, + unchanged: deltas.length, + tool_count_before: 0, + tool_count_after: 0, + }; +} + +describe('findBudgetRegressions', () => { + test('empty comparison → no regressions', () => { + expect(findBudgetRegressions(makeComparison([]))).toEqual([]); + }); + + test('no regression when after ≤ 2× before for tools', () => { + const c = makeComparison([ + makeDelta('a', { Bash: 10 }, { Bash: 19 }), // 1.9× — under cap + ]); + expect(findBudgetRegressions(c)).toEqual([]); + }); + + test('flags >2× tool growth', () => { + const c = makeComparison([ + makeDelta('a', { Bash: 10, Read: 5 }, { Bash: 25, Read: 12 }), // 15→37 = 2.47× + ]); + const regs = findBudgetRegressions(c); + expect(regs.length).toBe(1); + expect(regs[0]!.metric).toBe('tools'); + expect(regs[0]!.before).toBe(15); + expect(regs[0]!.after).toBe(37); + }); + + test('flags >2× turn growth independently of tools', () => { + const c = makeComparison([ + makeDelta('a', { Bash: 10 }, { Bash: 12 }, 5, 15), // turns 5→15 = 3× + ]); + const regs = findBudgetRegressions(c); + expect(regs.length).toBe(1); + expect(regs[0]!.metric).toBe('turns'); + }); + + test('skips tests with no prior tool data (new test)', () => { + const c = makeComparison([ + makeDelta('new-test', {}, { Bash: 100 }), // no prior — should not flag + ]); + expect(findBudgetRegressions(c)).toEqual([]); + }); + + test('skips when prior tool count is below the floor (noise floor)', () => { + // 1 → 4 tools is 4× ratio but meaningless on tiny numbers. + const c = makeComparison([ + makeDelta('tiny', { Bash: 1 }, { Bash: 4 }), + ]); + expect(findBudgetRegressions(c)).toEqual([]); + }); + + test('respects ratioCap override', () => { + const c = makeComparison([ + makeDelta('a', { Bash: 10 }, { Bash: 16 }), // 1.6× + ]); + expect(findBudgetRegressions(c, { ratioCap: 1.5 }).length).toBe(1); + expect(findBudgetRegressions(c, { ratioCap: 2.0 }).length).toBe(0); + }); + + test('respects GSTACK_BUDGET_RATIO env override', () => { + const c = makeComparison([ + makeDelta('a', { Bash: 10 }, { Bash: 16 }), // 1.6× + ]); + const prev = process.env.GSTACK_BUDGET_RATIO; + try { + process.env.GSTACK_BUDGET_RATIO = '1.5'; + expect(findBudgetRegressions(c).length).toBe(1); + process.env.GSTACK_BUDGET_RATIO = '2.0'; + expect(findBudgetRegressions(c).length).toBe(0); + } finally { + if (prev === undefined) delete process.env.GSTACK_BUDGET_RATIO; + else process.env.GSTACK_BUDGET_RATIO = prev; + } + }); + + test('handles missing tool_summary gracefully', () => { + const delta: TestDelta = { + name: 'sparse', + before: { passed: true, cost_usd: 0 }, + after: { passed: true, cost_usd: 0 }, + status_change: 'unchanged', + }; + expect(findBudgetRegressions(makeComparison([delta]))).toEqual([]); + }); +}); + +describe('assertNoBudgetRegression', () => { + test('does not throw on a clean comparison', () => { + const c = makeComparison([ + makeDelta('a', { Bash: 10 }, { Bash: 11 }), + ]); + expect(() => assertNoBudgetRegression(c)).not.toThrow(); + }); + + test('throws with all violations and the cap value in the message', () => { + const c = makeComparison([ + makeDelta('regressed-tools', { Bash: 10 }, { Bash: 30 }), + makeDelta('regressed-turns', { Bash: 5 }, { Bash: 6 }, 4, 13), + ]); + let err: Error | null = null; + try { + assertNoBudgetRegression(c); + } catch (e) { + err = e as Error; + } + expect(err).not.toBeNull(); + expect(err!.message).toContain('regressed-tools'); + expect(err!.message).toContain('regressed-turns'); + expect(err!.message).toContain('2.00×'); // default cap + expect(err!.message).toContain('GSTACK_BUDGET_RATIO'); + }); +}); diff --git a/test/helpers/claude-pty-runner.ts b/test/helpers/claude-pty-runner.ts index aab48e7d..3a5cbda2 100644 --- a/test/helpers/claude-pty-runner.ts +++ b/test/helpers/claude-pty-runner.ts @@ -138,12 +138,127 @@ export function isPlanReadyVisible(visible: string): boolean { return /ready to execute|Would you like to proceed/i.test(visible); } +/** + * Detect a Claude Code permission dialog. These render as a numbered + * option list (so isNumberedOptionListVisible matches them) but they + * are NOT a skill's AskUserQuestion — they're claude asking the user + * whether to grant a tool/file permission. Tests that look for skill + * AUQs must explicitly skip these. + * + * Both English phrases below are stable across recent Claude Code + * versions. The check is permissive on whitespace because TTY rendering + * may wrap or reflow text. + */ +export function isPermissionDialogVisible(visible: string): boolean { + return ( + /requested\s+permissions?\s+to/i.test(visible) || + /Do\s+you\s+want\s+to\s+proceed\?/i.test(visible) || + // "Yes / Yes, allow all edits / No" shape rendered by Claude Code for + // file-edit permission grants. The middle option's "allow all" phrase + // is the unique signature. + /\ballow\s+all\s+edits\b/i.test(visible) || + // "Yes, and always allow access to " shape (workspace trust). + /always\s+allow\s+access\s+to/i.test(visible) || + // Bash command permission prompts. + /Bash\s+command\s+.*\s+requires\s+permission/i.test(visible) + ); +} + /** Detect any AskUserQuestion-shaped numbered option list with cursor. */ export function isNumberedOptionListVisible(visible: string): boolean { // ❯ cursor + at least two numbered options 1-9. // Matches the trust dialog AND plan-ready prompt AND skill questions. // Tighter classification happens via scope (after-trust, after-skill-cmd, etc). - return /❯\s*1\./.test(visible) && /\b2\./.test(visible); + // + // Note on the `2\.` regex: the TTY uses cursor-positioning escape codes + // (`\x1b[40C`) for whitespace which stripAnsi removes — collapsing + // `text 2.` to `text2.`. A `\b2\.` word-boundary regex therefore fails + // because `t-2` is a word-to-word transition. We use the weaker + // `[^0-9]2\.` to require a non-digit before `2` (so we don't match + // `12.0`) without requiring whitespace. + return /❯\s*1\./.test(visible) && /(^|[^0-9])2\./.test(visible); +} + +/** + * Parse a rendered numbered-option list out of the visible TTY text. + * + * Looks for lines like `❯ 1. label` (cursor) or ` 2. label` (no cursor) + * and returns them in order. Used by tests that need to ROUTE on a specific + * option label (e.g. answer "HOLD SCOPE" by sending its index + Enter) + * without hard-coding positional indexes that drift when option order + * changes between skill versions. + * + * Reads only the LAST 4KB of visible to avoid matching stale option lists + * from earlier prompts in the session. + * + * Returns [] when no list is rendered. Otherwise returns indices in the + * order they appear (1-based, matching what the user types). Labels are + * trimmed but otherwise verbatim from the TTY (may include trailing + * `(recommended)` markers, etc). + */ +export function parseNumberedOptions( + visible: string, +): Array<{ index: number; label: string }> { + const tail = visible.length > 4096 ? visible.slice(-4096) : visible; + // Split on lines, look for `❯ N.` or ` N.` patterns. Up to N=9. + // The `\s*` after `.` (not `\s+`) is required because stripAnsi removes + // TTY cursor-positioning escapes that render as spaces, so a label that + // visually reads "1. Option" can come through as "1.Option". + const optionRe = /^[\s❯]*([1-9])\.\s*(\S.*?)\s*$/; + // We anchor on the LATEST `❯ 1.` line in the buffer — the cursor marker + // for the active AUQ. Older numbered lists (e.g., a granted permission + // dialog still in scrollback) sit above it and must be ignored. Without + // this, parseNumberedOptions returns stale options after the dialog is + // dismissed. + const lines = tail.split('\n'); + // Anchor on the LAST `❯ 1.` line (cursor is on option 1 of the active + // AUQ). Greedy character classes don't help here — we need a literal + // `❯` after optional leading whitespace. + let cursorLineIdx = -1; + for (let i = lines.length - 1; i >= 0; i--) { + if (/^\s*❯\s*1\./.test(lines[i] ?? '')) { + cursorLineIdx = i; + break; + } + } + // Fallback: if cursor isn't on option 1 (user pressed Down), find the + // last `1.` line. Allow leading ` ` or `❯ ` prefixes; do NOT include `❯` + // in the leading character class because greedy matching would eat the + // sigil and prevent the literal-cursor anchor above from finding it. + if (cursorLineIdx < 0) { + for (let i = lines.length - 1; i >= 0; i--) { + if (/^(?:\s*|\s*❯\s+)1\./.test(lines[i] ?? '')) { + cursorLineIdx = i; + break; + } + } + } + if (cursorLineIdx < 0) return []; + const found: Array<{ index: number; label: string }> = []; + const seenIndices = new Set(); + for (let i = cursorLineIdx; i < lines.length; i++) { + const m = optionRe.exec(lines[i] ?? ''); + if (!m) continue; + const idx = Number(m[1]); + const label = (m[2] ?? '').trim(); + if (seenIndices.has(idx)) continue; + if (label.length === 0) continue; + seenIndices.add(idx); + found.push({ index: idx, label }); + } + // Only return if we found a sequential 1.., 2.., ... block (at least 2 + // consecutive options starting at 1). Otherwise it's noise (e.g. a + // numbered list inside prose, like "1. Read the file"). + found.sort((a, b) => a.index - b.index); + if (found.length < 2) return []; + if (found[0]!.index !== 1) return []; + for (let i = 1; i < found.length; i++) { + if (found[i]!.index !== found[i - 1]!.index + 1) { + // Truncate at the first gap. + return found.slice(0, i); + } + } + return found; } /** diff --git a/test/helpers/eval-store.ts b/test/helpers/eval-store.ts index a7d63178..9942f1e3 100644 --- a/test/helpers/eval-store.ts +++ b/test/helpers/eval-store.ts @@ -554,6 +554,71 @@ export function generateCommentary(c: ComparisonResult): string[] { return notes; } +// --- Budget regression assertion --- + +export interface BudgetRegression { + testName: string; + metric: 'tools' | 'turns'; + before: number; + after: number; + ratio: number; +} + +/** + * Compute budget regressions: tests where tool calls or turns grew by more + * than `ratioCap` between two runs. Pure function — caller decides how to + * surface the result. Used by test/skill-budget-regression.test.ts and any + * future ship gate. + * + * `ratioCap` defaults to 2.0 (>2× growth is a regression). Override via + * `GSTACK_BUDGET_RATIO` env var. New tests with no prior data are skipped. + */ +export function findBudgetRegressions( + comparison: ComparisonResult, + opts?: { ratioCap?: number; minPriorTools?: number; minPriorTurns?: number }, +): BudgetRegression[] { + const envRatio = Number(process.env.GSTACK_BUDGET_RATIO); + const cap = opts?.ratioCap ?? (Number.isFinite(envRatio) && envRatio > 0 ? envRatio : 2.0); + // Floors avoid noise on tiny numbers (1 → 3 tools is 3× but meaningless). + const minPriorTools = opts?.minPriorTools ?? 5; + const minPriorTurns = opts?.minPriorTurns ?? 3; + const out: BudgetRegression[] = []; + for (const d of comparison.deltas) { + const beforeTools = Object.values(d.before.tool_summary ?? {}).reduce((a, b) => a + b, 0); + const afterTools = Object.values(d.after.tool_summary ?? {}).reduce((a, b) => a + b, 0); + const beforeTurns = d.before.turns_used ?? 0; + const afterTurns = d.after.turns_used ?? 0; + if (beforeTools >= minPriorTools && afterTools / beforeTools > cap) { + out.push({ testName: d.name, metric: 'tools', before: beforeTools, after: afterTools, ratio: afterTools / beforeTools }); + } + if (beforeTurns >= minPriorTurns && afterTurns / beforeTurns > cap) { + out.push({ testName: d.name, metric: 'turns', before: beforeTurns, after: afterTurns, ratio: afterTurns / beforeTurns }); + } + } + return out; +} + +/** + * Throw if any test in the comparison exceeds the budget cap. Convenience + * wrapper around findBudgetRegressions for use in test assertions. + */ +export function assertNoBudgetRegression( + comparison: ComparisonResult, + opts?: { ratioCap?: number; minPriorTools?: number; minPriorTurns?: number }, +): void { + const regressions = findBudgetRegressions(comparison, opts); + if (regressions.length === 0) return; + const cap = opts?.ratioCap ?? (Number(process.env.GSTACK_BUDGET_RATIO) || 2.0); + const lines = regressions.map( + r => ` "${r.testName}" ${r.metric}: ${r.before} → ${r.after} (${r.ratio.toFixed(2)}× > ${cap.toFixed(2)}× cap)`, + ); + throw new Error( + `Budget regression: ${regressions.length} test(s) exceeded ${cap.toFixed(2)}× prior usage:\n` + + lines.join('\n') + + `\n(Override per run: GSTACK_BUDGET_RATIO=. ${comparison.before_file} vs ${comparison.after_file})`, + ); +} + // --- EvalCollector --- function getGitInfo(): { branch: string; sha: string } {