mirror of
https://github.com/garrytan/gstack.git
synced 2026-05-02 03:35:09 +02:00
feat(test): harness primitives — parseNumberedOptions + budget regression utils
claude-pty-runner.ts:
- parseNumberedOptions(visible) anchors on the latest "❯ 1." cursor and
returns {index, label}[]; tests that route on option labels can find
indices without hard-coding positions
- isPermissionDialogVisible(visible) detects file-grant + workspace-trust
+ bash-permission shapes (multiple regex variants)
- isNumberedOptionListVisible: replaced \b2\. word-boundary regex with
[^0-9]2\. — stripAnsi removes TTY cursor-positioning escapes that
collapse "Option 2." to "Option2.", and \b fails on word-to-word
eval-store.ts:
- findBudgetRegressions(comparison, opts?) — pure function returning
tests where tools or turns grew >cap× vs prior run; floors at 5 prior
tools / 3 prior turns to avoid noise on tiny numbers
- assertNoBudgetRegression() — wrapper that throws with full violation
list. Env override GSTACK_BUDGET_RATIO
helpers-unit.test.ts: 23 unit tests covering empty/sparse/wrap-around
buffers for parseNumberedOptions, plus regression-floor + env-override
cases for findBudgetRegressions/assertNoBudgetRegression.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,290 @@
|
||||
/**
|
||||
* Unit tests for two helpers added alongside the new real-PTY E2E tests:
|
||||
*
|
||||
* - parseNumberedOptions(visible)
|
||||
* Parses `❯ 1.` / ` 2.` numbered-option lines out of TTY text.
|
||||
* Used by the AUQ format-compliance and mode-routing tests to look
|
||||
* up an option index by its label without hard-coding positions.
|
||||
*
|
||||
* - findBudgetRegressions / assertNoBudgetRegression(comparison)
|
||||
* Computes which tests grew >2× in tool calls or turns vs the prior
|
||||
* eval run. Used by the budget-regression test.
|
||||
*
|
||||
* Free, deterministic, runs under `bun test`.
|
||||
*/
|
||||
|
||||
import { describe, test, expect } from 'bun:test';
|
||||
import { parseNumberedOptions } from './helpers/claude-pty-runner';
|
||||
import {
|
||||
assertNoBudgetRegression,
|
||||
findBudgetRegressions,
|
||||
type ComparisonResult,
|
||||
type TestDelta,
|
||||
} from './helpers/eval-store';
|
||||
|
||||
// --- parseNumberedOptions ---
|
||||
|
||||
describe('parseNumberedOptions', () => {
|
||||
test('returns [] for empty input', () => {
|
||||
expect(parseNumberedOptions('')).toEqual([]);
|
||||
});
|
||||
|
||||
test('returns [] when no numbered list is rendered', () => {
|
||||
expect(parseNumberedOptions('just some prose with no list')).toEqual([]);
|
||||
});
|
||||
|
||||
test('parses a basic 3-option list with cursor on first', () => {
|
||||
const visible = [
|
||||
'Some prompt prose above.',
|
||||
'',
|
||||
'❯ 1. HOLD SCOPE',
|
||||
' 2. SCOPE EXPANSION',
|
||||
' 3. SELECTIVE EXPANSION',
|
||||
'',
|
||||
].join('\n');
|
||||
expect(parseNumberedOptions(visible)).toEqual([
|
||||
{ index: 1, label: 'HOLD SCOPE' },
|
||||
{ index: 2, label: 'SCOPE EXPANSION' },
|
||||
{ index: 3, label: 'SELECTIVE EXPANSION' },
|
||||
]);
|
||||
});
|
||||
|
||||
test('parses cursor on a non-first option', () => {
|
||||
const visible = [
|
||||
' 1. Option A',
|
||||
'❯ 2. Option B',
|
||||
' 3. Option C',
|
||||
].join('\n');
|
||||
const opts = parseNumberedOptions(visible);
|
||||
expect(opts.map(o => o.index)).toEqual([1, 2, 3]);
|
||||
expect(opts.map(o => o.label)).toEqual(['Option A', 'Option B', 'Option C']);
|
||||
});
|
||||
|
||||
test('handles 9 options (max single-digit)', () => {
|
||||
const lines = ['❯ 1. one'];
|
||||
for (let i = 2; i <= 9; i++) lines.push(` ${i}. opt${i}`);
|
||||
const opts = parseNumberedOptions(lines.join('\n'));
|
||||
expect(opts.length).toBe(9);
|
||||
expect(opts[8]).toEqual({ index: 9, label: 'opt9' });
|
||||
});
|
||||
|
||||
test('truncates at first sequence gap', () => {
|
||||
// Real bug shape: prose contains "1. blah" and "2. blah" then a real
|
||||
// option list shows up later. We only return the consecutive run that
|
||||
// starts at 1.
|
||||
const visible = [
|
||||
'❯ 1. Real option',
|
||||
' 2. Other real option',
|
||||
'some prose',
|
||||
' 4. Stray number',
|
||||
].join('\n');
|
||||
expect(parseNumberedOptions(visible)).toEqual([
|
||||
{ index: 1, label: 'Real option' },
|
||||
{ index: 2, label: 'Other real option' },
|
||||
]);
|
||||
});
|
||||
|
||||
test('returns [] when sequence does not start at 1', () => {
|
||||
const visible = [' 3. orphan', ' 4. orphan'].join('\n');
|
||||
expect(parseNumberedOptions(visible)).toEqual([]);
|
||||
});
|
||||
|
||||
test('returns [] for a single option (need at least 2 to be a real list)', () => {
|
||||
expect(parseNumberedOptions('❯ 1. lonely')).toEqual([]);
|
||||
});
|
||||
|
||||
test('preserves trailing markers on labels (e.g. recommended)', () => {
|
||||
const visible = [
|
||||
'❯ 1. Cover all 4 modes (recommended)',
|
||||
' 2. Just HOLD + EXPANSION',
|
||||
].join('\n');
|
||||
const opts = parseNumberedOptions(visible);
|
||||
expect(opts[0]!.label).toContain('(recommended)');
|
||||
});
|
||||
|
||||
test('only matches the most recent list when buffer is large', () => {
|
||||
// First (stale) list, then >4KB of intervening text, then the real list.
|
||||
// parseNumberedOptions reads only the last 4KB, so the stale list is
|
||||
// dropped — this is the desired behavior for tests that re-open the
|
||||
// session and want the current prompt only.
|
||||
const stale = ['❯ 1. STALE_A', ' 2. STALE_B'].join('\n');
|
||||
const filler = 'x'.repeat(5000);
|
||||
const fresh = ['❯ 1. FRESH_A', ' 2. FRESH_B'].join('\n');
|
||||
const visible = stale + '\n' + filler + '\n' + fresh;
|
||||
const opts = parseNumberedOptions(visible);
|
||||
expect(opts.map(o => o.label)).toEqual(['FRESH_A', 'FRESH_B']);
|
||||
});
|
||||
|
||||
test('anchors on LAST cursor when both stale and fresh fit in the tail', () => {
|
||||
// Both lists fit in the same 4KB tail (small buffer). The granted
|
||||
// permission dialog options come first, the real AUQ comes second.
|
||||
// We must return the FRESH options, not the STALE ones.
|
||||
const visible = [
|
||||
'❯ 1. STALE_grant',
|
||||
' 2. STALE_deny',
|
||||
'some narration the agent printed after we granted',
|
||||
'and a few more lines of bash output',
|
||||
'❯ 1. FRESH_keep',
|
||||
' 2. FRESH_drop',
|
||||
].join('\n');
|
||||
const opts = parseNumberedOptions(visible);
|
||||
expect(opts.map(o => o.label)).toEqual(['FRESH_keep', 'FRESH_drop']);
|
||||
});
|
||||
|
||||
test('falls back to last `1.` if cursor is not currently rendered on option 1', () => {
|
||||
// The user pressed Down, so cursor is on option 2; but the parser
|
||||
// should still return options 1+2 by anchoring on the last `1.` line.
|
||||
const visible = [
|
||||
' 1. Option A',
|
||||
'❯ 2. Option B',
|
||||
' 3. Option C',
|
||||
].join('\n');
|
||||
const opts = parseNumberedOptions(visible);
|
||||
expect(opts.map(o => o.label)).toEqual(['Option A', 'Option B', 'Option C']);
|
||||
});
|
||||
});
|
||||
|
||||
// --- findBudgetRegressions / assertNoBudgetRegression ---
|
||||
|
||||
function makeDelta(
|
||||
name: string,
|
||||
beforeTools: Record<string, number>,
|
||||
afterTools: Record<string, number>,
|
||||
beforeTurns?: number,
|
||||
afterTurns?: number,
|
||||
): TestDelta {
|
||||
return {
|
||||
name,
|
||||
before: { passed: true, cost_usd: 0, tool_summary: beforeTools, turns_used: beforeTurns },
|
||||
after: { passed: true, cost_usd: 0, tool_summary: afterTools, turns_used: afterTurns },
|
||||
status_change: 'unchanged',
|
||||
};
|
||||
}
|
||||
|
||||
function makeComparison(deltas: TestDelta[]): ComparisonResult {
|
||||
return {
|
||||
before_file: '/tmp/before.json',
|
||||
after_file: '/tmp/after.json',
|
||||
before_branch: 'main',
|
||||
after_branch: 'feat/x',
|
||||
before_timestamp: '2025-01-01T00:00:00Z',
|
||||
after_timestamp: '2025-01-02T00:00:00Z',
|
||||
deltas,
|
||||
total_cost_delta: 0,
|
||||
total_duration_delta: 0,
|
||||
improved: 0,
|
||||
regressed: 0,
|
||||
unchanged: deltas.length,
|
||||
tool_count_before: 0,
|
||||
tool_count_after: 0,
|
||||
};
|
||||
}
|
||||
|
||||
describe('findBudgetRegressions', () => {
|
||||
test('empty comparison → no regressions', () => {
|
||||
expect(findBudgetRegressions(makeComparison([]))).toEqual([]);
|
||||
});
|
||||
|
||||
test('no regression when after ≤ 2× before for tools', () => {
|
||||
const c = makeComparison([
|
||||
makeDelta('a', { Bash: 10 }, { Bash: 19 }), // 1.9× — under cap
|
||||
]);
|
||||
expect(findBudgetRegressions(c)).toEqual([]);
|
||||
});
|
||||
|
||||
test('flags >2× tool growth', () => {
|
||||
const c = makeComparison([
|
||||
makeDelta('a', { Bash: 10, Read: 5 }, { Bash: 25, Read: 12 }), // 15→37 = 2.47×
|
||||
]);
|
||||
const regs = findBudgetRegressions(c);
|
||||
expect(regs.length).toBe(1);
|
||||
expect(regs[0]!.metric).toBe('tools');
|
||||
expect(regs[0]!.before).toBe(15);
|
||||
expect(regs[0]!.after).toBe(37);
|
||||
});
|
||||
|
||||
test('flags >2× turn growth independently of tools', () => {
|
||||
const c = makeComparison([
|
||||
makeDelta('a', { Bash: 10 }, { Bash: 12 }, 5, 15), // turns 5→15 = 3×
|
||||
]);
|
||||
const regs = findBudgetRegressions(c);
|
||||
expect(regs.length).toBe(1);
|
||||
expect(regs[0]!.metric).toBe('turns');
|
||||
});
|
||||
|
||||
test('skips tests with no prior tool data (new test)', () => {
|
||||
const c = makeComparison([
|
||||
makeDelta('new-test', {}, { Bash: 100 }), // no prior — should not flag
|
||||
]);
|
||||
expect(findBudgetRegressions(c)).toEqual([]);
|
||||
});
|
||||
|
||||
test('skips when prior tool count is below the floor (noise floor)', () => {
|
||||
// 1 → 4 tools is 4× ratio but meaningless on tiny numbers.
|
||||
const c = makeComparison([
|
||||
makeDelta('tiny', { Bash: 1 }, { Bash: 4 }),
|
||||
]);
|
||||
expect(findBudgetRegressions(c)).toEqual([]);
|
||||
});
|
||||
|
||||
test('respects ratioCap override', () => {
|
||||
const c = makeComparison([
|
||||
makeDelta('a', { Bash: 10 }, { Bash: 16 }), // 1.6×
|
||||
]);
|
||||
expect(findBudgetRegressions(c, { ratioCap: 1.5 }).length).toBe(1);
|
||||
expect(findBudgetRegressions(c, { ratioCap: 2.0 }).length).toBe(0);
|
||||
});
|
||||
|
||||
test('respects GSTACK_BUDGET_RATIO env override', () => {
|
||||
const c = makeComparison([
|
||||
makeDelta('a', { Bash: 10 }, { Bash: 16 }), // 1.6×
|
||||
]);
|
||||
const prev = process.env.GSTACK_BUDGET_RATIO;
|
||||
try {
|
||||
process.env.GSTACK_BUDGET_RATIO = '1.5';
|
||||
expect(findBudgetRegressions(c).length).toBe(1);
|
||||
process.env.GSTACK_BUDGET_RATIO = '2.0';
|
||||
expect(findBudgetRegressions(c).length).toBe(0);
|
||||
} finally {
|
||||
if (prev === undefined) delete process.env.GSTACK_BUDGET_RATIO;
|
||||
else process.env.GSTACK_BUDGET_RATIO = prev;
|
||||
}
|
||||
});
|
||||
|
||||
test('handles missing tool_summary gracefully', () => {
|
||||
const delta: TestDelta = {
|
||||
name: 'sparse',
|
||||
before: { passed: true, cost_usd: 0 },
|
||||
after: { passed: true, cost_usd: 0 },
|
||||
status_change: 'unchanged',
|
||||
};
|
||||
expect(findBudgetRegressions(makeComparison([delta]))).toEqual([]);
|
||||
});
|
||||
});
|
||||
|
||||
describe('assertNoBudgetRegression', () => {
|
||||
test('does not throw on a clean comparison', () => {
|
||||
const c = makeComparison([
|
||||
makeDelta('a', { Bash: 10 }, { Bash: 11 }),
|
||||
]);
|
||||
expect(() => assertNoBudgetRegression(c)).not.toThrow();
|
||||
});
|
||||
|
||||
test('throws with all violations and the cap value in the message', () => {
|
||||
const c = makeComparison([
|
||||
makeDelta('regressed-tools', { Bash: 10 }, { Bash: 30 }),
|
||||
makeDelta('regressed-turns', { Bash: 5 }, { Bash: 6 }, 4, 13),
|
||||
]);
|
||||
let err: Error | null = null;
|
||||
try {
|
||||
assertNoBudgetRegression(c);
|
||||
} catch (e) {
|
||||
err = e as Error;
|
||||
}
|
||||
expect(err).not.toBeNull();
|
||||
expect(err!.message).toContain('regressed-tools');
|
||||
expect(err!.message).toContain('regressed-turns');
|
||||
expect(err!.message).toContain('2.00×'); // default cap
|
||||
expect(err!.message).toContain('GSTACK_BUDGET_RATIO');
|
||||
});
|
||||
});
|
||||
@@ -138,12 +138,127 @@ export function isPlanReadyVisible(visible: string): boolean {
|
||||
return /ready to execute|Would you like to proceed/i.test(visible);
|
||||
}
|
||||
|
||||
/**
|
||||
* Detect a Claude Code permission dialog. These render as a numbered
|
||||
* option list (so isNumberedOptionListVisible matches them) but they
|
||||
* are NOT a skill's AskUserQuestion — they're claude asking the user
|
||||
* whether to grant a tool/file permission. Tests that look for skill
|
||||
* AUQs must explicitly skip these.
|
||||
*
|
||||
* Both English phrases below are stable across recent Claude Code
|
||||
* versions. The check is permissive on whitespace because TTY rendering
|
||||
* may wrap or reflow text.
|
||||
*/
|
||||
export function isPermissionDialogVisible(visible: string): boolean {
|
||||
return (
|
||||
/requested\s+permissions?\s+to/i.test(visible) ||
|
||||
/Do\s+you\s+want\s+to\s+proceed\?/i.test(visible) ||
|
||||
// "Yes / Yes, allow all edits / No" shape rendered by Claude Code for
|
||||
// file-edit permission grants. The middle option's "allow all" phrase
|
||||
// is the unique signature.
|
||||
/\ballow\s+all\s+edits\b/i.test(visible) ||
|
||||
// "Yes, and always allow access to <dir>" shape (workspace trust).
|
||||
/always\s+allow\s+access\s+to/i.test(visible) ||
|
||||
// Bash command permission prompts.
|
||||
/Bash\s+command\s+.*\s+requires\s+permission/i.test(visible)
|
||||
);
|
||||
}
|
||||
|
||||
/** Detect any AskUserQuestion-shaped numbered option list with cursor. */
|
||||
export function isNumberedOptionListVisible(visible: string): boolean {
|
||||
// ❯ cursor + at least two numbered options 1-9.
|
||||
// Matches the trust dialog AND plan-ready prompt AND skill questions.
|
||||
// Tighter classification happens via scope (after-trust, after-skill-cmd, etc).
|
||||
return /❯\s*1\./.test(visible) && /\b2\./.test(visible);
|
||||
//
|
||||
// Note on the `2\.` regex: the TTY uses cursor-positioning escape codes
|
||||
// (`\x1b[40C`) for whitespace which stripAnsi removes — collapsing
|
||||
// `text 2.` to `text2.`. A `\b2\.` word-boundary regex therefore fails
|
||||
// because `t-2` is a word-to-word transition. We use the weaker
|
||||
// `[^0-9]2\.` to require a non-digit before `2` (so we don't match
|
||||
// `12.0`) without requiring whitespace.
|
||||
return /❯\s*1\./.test(visible) && /(^|[^0-9])2\./.test(visible);
|
||||
}
|
||||
|
||||
/**
|
||||
* Parse a rendered numbered-option list out of the visible TTY text.
|
||||
*
|
||||
* Looks for lines like `❯ 1. label` (cursor) or ` 2. label` (no cursor)
|
||||
* and returns them in order. Used by tests that need to ROUTE on a specific
|
||||
* option label (e.g. answer "HOLD SCOPE" by sending its index + Enter)
|
||||
* without hard-coding positional indexes that drift when option order
|
||||
* changes between skill versions.
|
||||
*
|
||||
* Reads only the LAST 4KB of visible to avoid matching stale option lists
|
||||
* from earlier prompts in the session.
|
||||
*
|
||||
* Returns [] when no list is rendered. Otherwise returns indices in the
|
||||
* order they appear (1-based, matching what the user types). Labels are
|
||||
* trimmed but otherwise verbatim from the TTY (may include trailing
|
||||
* `(recommended)` markers, etc).
|
||||
*/
|
||||
export function parseNumberedOptions(
|
||||
visible: string,
|
||||
): Array<{ index: number; label: string }> {
|
||||
const tail = visible.length > 4096 ? visible.slice(-4096) : visible;
|
||||
// Split on lines, look for `❯ N.` or ` N.` patterns. Up to N=9.
|
||||
// The `\s*` after `.` (not `\s+`) is required because stripAnsi removes
|
||||
// TTY cursor-positioning escapes that render as spaces, so a label that
|
||||
// visually reads "1. Option" can come through as "1.Option".
|
||||
const optionRe = /^[\s❯]*([1-9])\.\s*(\S.*?)\s*$/;
|
||||
// We anchor on the LATEST `❯ 1.` line in the buffer — the cursor marker
|
||||
// for the active AUQ. Older numbered lists (e.g., a granted permission
|
||||
// dialog still in scrollback) sit above it and must be ignored. Without
|
||||
// this, parseNumberedOptions returns stale options after the dialog is
|
||||
// dismissed.
|
||||
const lines = tail.split('\n');
|
||||
// Anchor on the LAST `❯ 1.` line (cursor is on option 1 of the active
|
||||
// AUQ). Greedy character classes don't help here — we need a literal
|
||||
// `❯` after optional leading whitespace.
|
||||
let cursorLineIdx = -1;
|
||||
for (let i = lines.length - 1; i >= 0; i--) {
|
||||
if (/^\s*❯\s*1\./.test(lines[i] ?? '')) {
|
||||
cursorLineIdx = i;
|
||||
break;
|
||||
}
|
||||
}
|
||||
// Fallback: if cursor isn't on option 1 (user pressed Down), find the
|
||||
// last `1.` line. Allow leading ` ` or `❯ ` prefixes; do NOT include `❯`
|
||||
// in the leading character class because greedy matching would eat the
|
||||
// sigil and prevent the literal-cursor anchor above from finding it.
|
||||
if (cursorLineIdx < 0) {
|
||||
for (let i = lines.length - 1; i >= 0; i--) {
|
||||
if (/^(?:\s*|\s*❯\s+)1\./.test(lines[i] ?? '')) {
|
||||
cursorLineIdx = i;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (cursorLineIdx < 0) return [];
|
||||
const found: Array<{ index: number; label: string }> = [];
|
||||
const seenIndices = new Set<number>();
|
||||
for (let i = cursorLineIdx; i < lines.length; i++) {
|
||||
const m = optionRe.exec(lines[i] ?? '');
|
||||
if (!m) continue;
|
||||
const idx = Number(m[1]);
|
||||
const label = (m[2] ?? '').trim();
|
||||
if (seenIndices.has(idx)) continue;
|
||||
if (label.length === 0) continue;
|
||||
seenIndices.add(idx);
|
||||
found.push({ index: idx, label });
|
||||
}
|
||||
// Only return if we found a sequential 1.., 2.., ... block (at least 2
|
||||
// consecutive options starting at 1). Otherwise it's noise (e.g. a
|
||||
// numbered list inside prose, like "1. Read the file").
|
||||
found.sort((a, b) => a.index - b.index);
|
||||
if (found.length < 2) return [];
|
||||
if (found[0]!.index !== 1) return [];
|
||||
for (let i = 1; i < found.length; i++) {
|
||||
if (found[i]!.index !== found[i - 1]!.index + 1) {
|
||||
// Truncate at the first gap.
|
||||
return found.slice(0, i);
|
||||
}
|
||||
}
|
||||
return found;
|
||||
}
|
||||
|
||||
/**
|
||||
|
||||
@@ -554,6 +554,71 @@ export function generateCommentary(c: ComparisonResult): string[] {
|
||||
return notes;
|
||||
}
|
||||
|
||||
// --- Budget regression assertion ---
|
||||
|
||||
export interface BudgetRegression {
|
||||
testName: string;
|
||||
metric: 'tools' | 'turns';
|
||||
before: number;
|
||||
after: number;
|
||||
ratio: number;
|
||||
}
|
||||
|
||||
/**
|
||||
* Compute budget regressions: tests where tool calls or turns grew by more
|
||||
* than `ratioCap` between two runs. Pure function — caller decides how to
|
||||
* surface the result. Used by test/skill-budget-regression.test.ts and any
|
||||
* future ship gate.
|
||||
*
|
||||
* `ratioCap` defaults to 2.0 (>2× growth is a regression). Override via
|
||||
* `GSTACK_BUDGET_RATIO` env var. New tests with no prior data are skipped.
|
||||
*/
|
||||
export function findBudgetRegressions(
|
||||
comparison: ComparisonResult,
|
||||
opts?: { ratioCap?: number; minPriorTools?: number; minPriorTurns?: number },
|
||||
): BudgetRegression[] {
|
||||
const envRatio = Number(process.env.GSTACK_BUDGET_RATIO);
|
||||
const cap = opts?.ratioCap ?? (Number.isFinite(envRatio) && envRatio > 0 ? envRatio : 2.0);
|
||||
// Floors avoid noise on tiny numbers (1 → 3 tools is 3× but meaningless).
|
||||
const minPriorTools = opts?.minPriorTools ?? 5;
|
||||
const minPriorTurns = opts?.minPriorTurns ?? 3;
|
||||
const out: BudgetRegression[] = [];
|
||||
for (const d of comparison.deltas) {
|
||||
const beforeTools = Object.values(d.before.tool_summary ?? {}).reduce((a, b) => a + b, 0);
|
||||
const afterTools = Object.values(d.after.tool_summary ?? {}).reduce((a, b) => a + b, 0);
|
||||
const beforeTurns = d.before.turns_used ?? 0;
|
||||
const afterTurns = d.after.turns_used ?? 0;
|
||||
if (beforeTools >= minPriorTools && afterTools / beforeTools > cap) {
|
||||
out.push({ testName: d.name, metric: 'tools', before: beforeTools, after: afterTools, ratio: afterTools / beforeTools });
|
||||
}
|
||||
if (beforeTurns >= minPriorTurns && afterTurns / beforeTurns > cap) {
|
||||
out.push({ testName: d.name, metric: 'turns', before: beforeTurns, after: afterTurns, ratio: afterTurns / beforeTurns });
|
||||
}
|
||||
}
|
||||
return out;
|
||||
}
|
||||
|
||||
/**
|
||||
* Throw if any test in the comparison exceeds the budget cap. Convenience
|
||||
* wrapper around findBudgetRegressions for use in test assertions.
|
||||
*/
|
||||
export function assertNoBudgetRegression(
|
||||
comparison: ComparisonResult,
|
||||
opts?: { ratioCap?: number; minPriorTools?: number; minPriorTurns?: number },
|
||||
): void {
|
||||
const regressions = findBudgetRegressions(comparison, opts);
|
||||
if (regressions.length === 0) return;
|
||||
const cap = opts?.ratioCap ?? (Number(process.env.GSTACK_BUDGET_RATIO) || 2.0);
|
||||
const lines = regressions.map(
|
||||
r => ` "${r.testName}" ${r.metric}: ${r.before} → ${r.after} (${r.ratio.toFixed(2)}× > ${cap.toFixed(2)}× cap)`,
|
||||
);
|
||||
throw new Error(
|
||||
`Budget regression: ${regressions.length} test(s) exceeded ${cap.toFixed(2)}× prior usage:\n` +
|
||||
lines.join('\n') +
|
||||
`\n(Override per run: GSTACK_BUDGET_RATIO=<n>. ${comparison.before_file} vs ${comparison.after_file})`,
|
||||
);
|
||||
}
|
||||
|
||||
// --- EvalCollector ---
|
||||
|
||||
function getGitInfo(): { branch: string; sha: string } {
|
||||
|
||||
Reference in New Issue
Block a user