/** * Tool-budget regression test (gate, free). * * Asserts: no test in the most recent eval run grew its tool calls or * turns by more than 2× vs the prior recorded run. Pure library — does * not spawn `claude` or pay any API cost. Reads the project eval dir * (~/.gstack/projects//evals/) and compares the latest run against * its predecessor. * * First-run grace: if there's no prior run, the test passes vacuously. * The purpose is to catch a SECOND-run regression — a real-world scenario * is "preamble change shipped, /qa eval went from 30 tool calls to 90". * * Why two metrics (tools and turns): a regression that adds tool calls * usually reflects an inefficient skill prompt; a regression that adds * turns reflects a skill that is hesitating or losing track. Either is * worth catching. We use a noise floor (5 tool calls / 3 turns) to * avoid flagging tests that started tiny and got slightly bigger. * * Override: GSTACK_BUDGET_RATIO= (default 2.0). * * Skipping: only the gate-level CI-blocking variant runs in EVALS_TIER=gate. * The same logic runs anywhere `bun test` is invoked because comparison * is free — no LLM cost. */ import { describe, test } from 'bun:test'; import { spawnSync } from 'child_process'; import * as fs from 'fs'; import * as path from 'path'; import { getProjectEvalDir, findPreviousRun, compareEvalResults, assertNoBudgetRegression, type EvalResult, } from './helpers/eval-store'; import { logBudgetOverride } from './helpers/budget-override'; /** * v1.45.0.0 T5 — hard eval cost cap. * * Per-tier defaults (override via env): * EVALS_BUDGET_HARD_CAP_GATE default $25/run * EVALS_BUDGET_HARD_CAP_PERIODIC default $70/run * EVALS_BUDGET_HARD_CAP umbrella cap if a tier-specific isn't set; default $30 * EVALS_BUDGET_OVERRIDE_REASON if set, override fires AND audit-logs to * ~/.gstack/analytics/spend-overrides.jsonl * * Caps are dollars-per-run, not dollars-per-test. A test that legitimately * gets more expensive should bake into the baseline; a runaway eval (infinite * retry, model price change) gets stopped here. */ const DEFAULT_HARD_CAP_USD = Number(process.env.EVALS_BUDGET_HARD_CAP) || 30; const TIER_CAPS: Record<'e2e' | 'llm-judge', number> = { e2e: Number(process.env.EVALS_BUDGET_HARD_CAP_GATE) || DEFAULT_HARD_CAP_USD, 'llm-judge': Number(process.env.EVALS_BUDGET_HARD_CAP_PERIODIC) || Math.max(70, DEFAULT_HARD_CAP_USD), }; function currentGitBranch(): string { try { const result = spawnSync('git', ['rev-parse', '--abbrev-ref', 'HEAD'], { stdio: 'pipe', timeout: 3000, }); return result.stdout?.toString().trim() || 'unknown'; } catch { return 'unknown'; } } interface LatestRun { filepath: string; result: EvalResult; } /** Find the most recent finalized (non-_partial) eval file for a tier. */ function findLatestRun(evalDir: string, tier: 'e2e' | 'llm-judge'): LatestRun | null { let entries: string[]; try { entries = fs.readdirSync(evalDir); } catch { return null; } const candidates: Array<{ filepath: string; timestamp: string }> = []; for (const f of entries) { if (!f.endsWith('.json')) continue; if (f.startsWith('_partial')) continue; const fullPath = path.join(evalDir, f); try { const data = JSON.parse(fs.readFileSync(fullPath, 'utf-8')) as EvalResult; if (data.tier !== tier) continue; candidates.push({ filepath: fullPath, timestamp: data.timestamp ?? '' }); } catch { /* ignore corrupt */ } } if (candidates.length === 0) return null; candidates.sort((a, b) => b.timestamp.localeCompare(a.timestamp)); const top = candidates[0]!; return { filepath: top.filepath, result: JSON.parse(fs.readFileSync(top.filepath, 'utf-8')) as EvalResult, }; } function checkTier(tier: 'e2e' | 'llm-judge'): void { const evalDir = getProjectEvalDir(); const latest = findLatestRun(evalDir, tier); if (!latest) { // eslint-disable-next-line no-console console.log(`[budget-regression:${tier}] no current run in ${evalDir} — skipping`); return; } // Branch alignment: only assert when the latest eval was actually // produced by THIS checkout's branch. Cross-branch comparison would // measure noise from unrelated work. Pre-existing eval history from // other branches is not our regression to fix. const myBranch = currentGitBranch(); if (latest.result.branch !== myBranch) { // eslint-disable-next-line no-console console.log( `[budget-regression:${tier}] latest eval is from "${latest.result.branch}" ` + `but current branch is "${myBranch}" — skipping (run evals on this branch first)`, ); return; } const branch = latest.result.branch; const priorPath = findPreviousRun(evalDir, tier, branch, latest.filepath); if (!priorPath) { // eslint-disable-next-line no-console console.log(`[budget-regression:${tier}] no prior run found — first-run grace`); return; } let prior: EvalResult; try { prior = JSON.parse(fs.readFileSync(priorPath, 'utf-8')) as EvalResult; } catch (err) { // eslint-disable-next-line no-console console.warn(`[budget-regression:${tier}] could not read prior ${priorPath}: ${(err as Error).message}`); return; } // Branch-scoped: only compare same-branch history. Cross-branch // comparison is noisy (different branches do different work). If // findPreviousRun fell back to another branch, treat as no prior. if (prior.branch !== branch) { // eslint-disable-next-line no-console console.log( `[budget-regression:${tier}] no same-branch prior (latest on "${branch}", prior on "${prior.branch}") — skipping`, ); return; } const comparison = compareEvalResults(prior, latest.result, priorPath, latest.filepath); // Throws on regression. assertNoBudgetRegression(comparison); // eslint-disable-next-line no-console console.log( `[budget-regression:${tier}] OK — ${comparison.deltas.length} test(s) compared, ` + `${comparison.tool_count_before}→${comparison.tool_count_after} tools, ` + `cost Δ $${comparison.total_cost_delta.toFixed(2)}`, ); } /** Enforce a hard dollar cap on per-run eval cost. */ function checkHardCap(tier: 'e2e' | 'llm-judge'): void { const evalDir = getProjectEvalDir(); const latest = findLatestRun(evalDir, tier); if (!latest) return; const cap = TIER_CAPS[tier]; const cost = latest.result.total_cost_usd; if (cost <= cap) { // eslint-disable-next-line no-console console.log(`[budget-hard-cap:${tier}] OK — $${cost.toFixed(2)} ≤ $${cap.toFixed(2)} cap`); return; } const overrideReason = process.env.EVALS_BUDGET_OVERRIDE_REASON?.trim(); if (overrideReason) { logBudgetOverride({ scope: `evals-cost-cap-${tier}`, reason: overrideReason, details: { tier, cap, observed_cost_usd: cost, run_file: latest.filepath }, }); // eslint-disable-next-line no-console console.warn( `[budget-hard-cap:${tier}] OVERRIDE APPLIED ("${overrideReason}") — $${cost.toFixed(2)} > $${cap.toFixed(2)} cap`, ); return; } throw new Error( `Eval cost exceeded hard cap for tier ${tier}: ` + `$${cost.toFixed(2)} > $${cap.toFixed(2)}. ` + `Set EVALS_BUDGET_OVERRIDE_REASON="why this is OK" to allow + audit. ` + `Per-tier override: EVALS_BUDGET_HARD_CAP_${tier === 'e2e' ? 'GATE' : 'PERIODIC'}=. ` + `Run: ${latest.filepath}`, ); } describe('tool budget regression (gate, free)', () => { test('no e2e test exceeds 2× prior tool calls or turns', () => { checkTier('e2e'); }); test('no llm-judge test exceeds 2× prior tool calls or turns', () => { checkTier('llm-judge'); }); // T5: hard dollar cap on per-run cost (different from regression ratio above) test('e2e run cost ≤ EVALS_BUDGET_HARD_CAP_GATE', () => { checkHardCap('e2e'); }); test('llm-judge run cost ≤ EVALS_BUDGET_HARD_CAP_PERIODIC', () => { checkHardCap('llm-judge'); }); });