mirror of
https://github.com/garrytan/gstack.git
synced 2026-06-19 00:00:13 +02:00
81fdf9cc61
Two new gate-tier guardrails for the v1.45.0.0 compression baseline: 1. test/skill-size-budget.test.ts (NEW) — per-skill SKILL.md size budget. Compares current state to test/fixtures/parity-baseline-v1.44.1.json. Three checks: per-skill (×1.05 default ratio), total corpus, and catalog token estimate (≤7000 for v1.45). The per-skill ratio is 1.05 not 1.0 because the T4 catalog trim moves text from frontmatter to a body section; small skills see a tiny body growth that's fine when offset by the much larger catalog-token win. 2. test/skill-budget-regression.test.ts EXTENDED — hard dollar cap on per-run eval cost. Per-tier defaults: gate $25, periodic $70. Umbrella EVALS_BUDGET_HARD_CAP=$30. Catches runaway eval costs (infinite retry, model price changes) before they amortize across PRs. Both checks support an override path with audit trail: GSTACK_SIZE_BUDGET_OVERRIDE_REASON="why this is OK" — size EVALS_BUDGET_OVERRIDE_REASON="why this is OK" — cost Overrides log to ~/.gstack/analytics/spend-overrides.jsonl with timestamp + scope + reason + CI provenance (runner, branch, commit) via test/helpers/budget-override.ts. Why the override audit: a hard cap with no escape valve becomes operationally hostile (legit price changes, longer transcripts, new required evals can all blow the cap). An override with no audit becomes "everyone overrides everything and the gate is theater." This module ships the audit half so reviewers can see what was waived and why. Codex 2nd-pass critique #3 absorbed: per-suite caps + override path with auditability + budget baselines checked into repo (parity-baseline-v1.44.1.json already in test/fixtures/). 
Test plan: - bun test test/skill-size-budget.test.ts: 4 pass (per-skill, corpus, catalog, baseline-exists) - bun test test/skill-budget-regression.test.ts: 4 pass (2 existing ratio checks + 2 new hard-cap checks) - Existing eval runs ($14.11 e2e, $0.02 llm-judge) sit well under the new caps Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
213 lines
7.9 KiB
TypeScript
213 lines
7.9 KiB
TypeScript
/**
 * Tool-budget regression test (gate, free).
 *
 * Asserts: no test in the most recent eval run grew its tool calls or
 * turns by more than 2× vs the prior recorded run. Pure library — does
 * not spawn `claude` or pay any API cost. Reads the project eval dir
 * (~/.gstack/projects/<slug>/evals/) and compares the latest run against
 * its predecessor.
 *
 * First-run grace: if there's no prior run, the test passes vacuously.
 * The purpose is to catch a SECOND-run regression — a real-world scenario
 * is "preamble change shipped, /qa eval went from 30 tool calls to 90".
 *
 * Why two metrics (tools and turns): a regression that adds tool calls
 * usually reflects an inefficient skill prompt; a regression that adds
 * turns reflects a skill that is hesitating or losing track. Either is
 * worth catching. We use a noise floor (5 tool calls / 3 turns) to
 * avoid flagging tests that started tiny and got slightly bigger.
 *
 * Override: GSTACK_BUDGET_RATIO=<n> (default 2.0).
 *
 * Skipping: only the gate-level CI-blocking variant runs in EVALS_TIER=gate.
 * The same logic runs anywhere `bun test` is invoked because comparison
 * is free — no LLM cost.
 */
|
||
|
||
import { describe, test } from 'bun:test';
|
||
import { spawnSync } from 'child_process';
|
||
import * as fs from 'fs';
|
||
import * as path from 'path';
|
||
import {
|
||
getProjectEvalDir,
|
||
findPreviousRun,
|
||
compareEvalResults,
|
||
assertNoBudgetRegression,
|
||
type EvalResult,
|
||
} from './helpers/eval-store';
|
||
import { logBudgetOverride } from './helpers/budget-override';
|
||
|
||
/**
 * v1.45.0.0 T5 — hard eval cost cap.
 *
 * Cap resolution (US dollars per run, read once when this module loads):
 *   EVALS_BUDGET_HARD_CAP           umbrella cap; default $30
 *   EVALS_BUDGET_HARD_CAP_GATE      cap for the `e2e` tier; falls back to the
 *                                   umbrella cap when unset or invalid
 *   EVALS_BUDGET_HARD_CAP_PERIODIC  cap for the `llm-judge` tier; falls back to
 *                                   max($70, umbrella cap) when unset or invalid
 *   EVALS_BUDGET_OVERRIDE_REASON    if set, override fires AND audit-logs to
 *                                   ~/.gstack/analytics/spend-overrides.jsonl
 *
 * NOTE(review): a $25/run default has been advertised for the gate tier, but
 * an unset EVALS_BUDGET_HARD_CAP_GATE resolves to the umbrella cap ($30 by
 * default). Confirm which value is intended before tightening the fallback.
 *
 * Caps are dollars-per-run, not dollars-per-test. A test that legitimately
 * gets more expensive should bake into the baseline; a runaway eval (infinite
 * retry, model price change) gets stopped here.
 */

/**
 * Parse a dollar cap taken from an environment variable.
 *
 * Only finite, strictly positive numbers are accepted. Unset, empty,
 * non-numeric, zero, negative, and infinite values all yield `fallback`, so a
 * typo (or `Infinity`) cannot silently disable the cap without going through
 * the audited override path.
 *
 * @param raw       the raw environment value, or undefined when unset
 * @param fallback  the cap to use when `raw` is not a usable amount
 * @returns the parsed cap, or `fallback`
 */
function parseCapUsd(raw: string | undefined, fallback: number): number {
  const parsed = Number(raw);
  return Number.isFinite(parsed) && parsed > 0 ? parsed : fallback;
}

const DEFAULT_HARD_CAP_USD = parseCapUsd(process.env.EVALS_BUDGET_HARD_CAP, 30);

const TIER_CAPS: Record<'e2e' | 'llm-judge', number> = {
  e2e: parseCapUsd(process.env.EVALS_BUDGET_HARD_CAP_GATE, DEFAULT_HARD_CAP_USD),
  // The periodic tier never drops below $70 via the umbrella cap; only its
  // tier-specific variable can lower it.
  'llm-judge': parseCapUsd(
    process.env.EVALS_BUDGET_HARD_CAP_PERIODIC,
    Math.max(70, DEFAULT_HARD_CAP_USD),
  ),
};
|
||
|
||
/**
 * Report the branch name of the checkout in the current working directory.
 *
 * Runs `git rev-parse --abbrev-ref HEAD` with a 3-second timeout and returns
 * its trimmed stdout. Returns the literal 'unknown' when the command produces
 * no output or spawning throws.
 */
function currentGitBranch(): string {
  const FALLBACK = 'unknown';
  try {
    const { stdout } = spawnSync('git', ['rev-parse', '--abbrev-ref', 'HEAD'], {
      stdio: 'pipe',
      timeout: 3000,
    });
    // stdout is absent when the process could not be spawned at all.
    const name = stdout ? stdout.toString().trim() : '';
    return name.length > 0 ? name : FALLBACK;
  } catch {
    return FALLBACK;
  }
}
|
||
|
||
/** An eval run file on disk together with its parsed contents. */
interface LatestRun {
  /** Path of the eval JSON file (the eval directory joined with the file name). */
  filepath: string;
  /** The parsed JSON contents of that file. */
  result: EvalResult;
}

/**
 * Find the most recent finalized (non-_partial) eval file for a tier.
 *
 * Scans `evalDir` for `*.json` files whose name does not start with
 * `_partial`, parses each one exactly once, keeps those whose `tier` field
 * matches, and returns the one with the greatest `timestamp` (compared as
 * strings via `localeCompare`). On equal timestamps the file listed first by
 * `readdirSync` wins.
 *
 * Unreadable or corrupt files and non-object JSON are skipped. A missing or
 * non-string `timestamp` is treated as the empty string, so such a run sorts
 * as oldest instead of crashing the comparison.
 *
 * @param evalDir  directory to scan
 * @param tier     tier whose runs should be considered
 * @returns the newest matching run, or null when the directory cannot be read
 *          or contains no matching run
 */
function findLatestRun(evalDir: string, tier: 'e2e' | 'llm-judge'): LatestRun | null {
  let entries: string[];
  try {
    entries = fs.readdirSync(evalDir);
  } catch {
    return null;
  }

  let newest: LatestRun | null = null;
  let newestTimestamp = '';

  for (const name of entries) {
    if (!name.endsWith('.json') || name.startsWith('_partial')) continue;

    const filepath = path.join(evalDir, name);
    let parsed: unknown;
    try {
      parsed = JSON.parse(fs.readFileSync(filepath, 'utf-8'));
    } catch {
      continue; // unreadable or corrupt file — ignore
    }
    if (typeof parsed !== 'object' || parsed === null) continue;

    const record = parsed as { tier?: unknown; timestamp?: unknown };
    if (record.tier !== tier) continue;

    const timestamp = typeof record.timestamp === 'string' ? record.timestamp : '';
    // Strictly-greater keeps the first-listed file on ties.
    if (newest === null || timestamp.localeCompare(newestTimestamp) > 0) {
      // Keep the parsed object so the winning file is never read a second time.
      newest = { filepath, result: parsed as EvalResult };
      newestTimestamp = timestamp;
    }
  }

  return newest;
}
|
||
|
||
/**
 * Compare the newest eval run for `tier` against its same-branch predecessor
 * and fail on a budget regression.
 *
 * Returns without asserting (after logging why) when any of these holds:
 *   - there is no finalized run for the tier in the project eval dir;
 *   - the newest run was recorded on a branch other than the current checkout's;
 *   - no prior run exists, or the prior file cannot be read or parsed;
 *   - the prior run was recorded on a different branch.
 * Otherwise builds the comparison and hands it to `assertNoBudgetRegression`,
 * which throws on regression.
 *
 * @param tier  eval tier to check
 */
function checkTier(tier: 'e2e' | 'llm-judge'): void {
  const tag = `[budget-regression:${tier}]`;
  const evalDir = getProjectEvalDir();

  const current = findLatestRun(evalDir, tier);
  if (!current) {
    // eslint-disable-next-line no-console
    console.log(`${tag} no current run in ${evalDir} — skipping`);
    return;
  }

  // Only assert when the newest eval came from the branch checked out right
  // now; eval history left behind by other branches is unrelated noise.
  const checkoutBranch = currentGitBranch();
  const branch = current.result.branch;
  if (branch !== checkoutBranch) {
    // eslint-disable-next-line no-console
    console.log(
      `${tag} latest eval is from "${branch}" but current branch is "${checkoutBranch}" — skipping (run evals on this branch first)`,
    );
    return;
  }

  const priorPath = findPreviousRun(evalDir, tier, branch, current.filepath);
  if (!priorPath) {
    // eslint-disable-next-line no-console
    console.log(`${tag} no prior run found — first-run grace`);
    return;
  }

  let baseline: EvalResult;
  try {
    const raw = fs.readFileSync(priorPath, 'utf-8');
    baseline = JSON.parse(raw) as EvalResult;
  } catch (err) {
    const reason = (err as Error).message;
    // eslint-disable-next-line no-console
    console.warn(`${tag} could not read prior ${priorPath}: ${reason}`);
    return;
  }

  // The previous-run lookup may hand back a run from another branch; a
  // cross-branch comparison is noisy, so treat that as "no prior".
  if (baseline.branch !== branch) {
    // eslint-disable-next-line no-console
    console.log(
      `${tag} no same-branch prior (latest on "${branch}", prior on "${baseline.branch}") — skipping`,
    );
    return;
  }

  const comparison = compareEvalResults(baseline, current.result, priorPath, current.filepath);
  // Throws on regression.
  assertNoBudgetRegression(comparison);

  const summary = [
    `${tag} OK — ${comparison.deltas.length} test(s) compared, `,
    `${comparison.tool_count_before}→${comparison.tool_count_after} tools, `,
    `cost Δ $${comparison.total_cost_delta.toFixed(2)}`,
  ].join('');
  // eslint-disable-next-line no-console
  console.log(summary);
}
|
||
|
||
/**
 * Enforce a hard dollar cap on per-run eval cost.
 *
 * Looks up the newest finalized run for `tier` and compares its
 * `total_cost_usd` with `TIER_CAPS[tier]`:
 *   - no run on disk: returns silently;
 *   - cost within the cap: logs OK and returns;
 *   - cost above the cap with EVALS_BUDGET_OVERRIDE_REASON set (non-blank):
 *     records the override via `logBudgetOverride`, warns, and returns;
 *   - cost above the cap with no override: throws.
 *
 * The run file is unvalidated JSON, so the cost is checked first. Without that
 * check a missing or non-numeric cost would surface as an opaque TypeError
 * from `toFixed`, or be audit-logged as an override with no observed cost.
 *
 * @param tier  eval tier to check
 * @throws Error when the run's cost is not a finite number, or when it
 *         exceeds the cap and no override reason is set
 */
function checkHardCap(tier: 'e2e' | 'llm-judge'): void {
  const evalDir = getProjectEvalDir();
  const latest = findLatestRun(evalDir, tier);
  if (!latest) return;

  const cap = TIER_CAPS[tier];
  const cost: unknown = latest.result.total_cost_usd;
  if (typeof cost !== 'number' || !Number.isFinite(cost)) {
    throw new Error(
      `Eval run for tier ${tier} has no numeric total_cost_usd (got ${String(cost)}); ` +
        `cannot enforce the hard cap. Run: ${latest.filepath}`,
    );
  }

  if (cost <= cap) {
    // eslint-disable-next-line no-console
    console.log(`[budget-hard-cap:${tier}] OK — $${cost.toFixed(2)} ≤ $${cap.toFixed(2)} cap`);
    return;
  }

  // A blank reason does not count as an override: every waiver needs a "why".
  const overrideReason = process.env.EVALS_BUDGET_OVERRIDE_REASON?.trim();
  if (overrideReason) {
    logBudgetOverride({
      scope: `evals-cost-cap-${tier}`,
      reason: overrideReason,
      details: { tier, cap, observed_cost_usd: cost, run_file: latest.filepath },
    });
    // eslint-disable-next-line no-console
    console.warn(
      `[budget-hard-cap:${tier}] OVERRIDE APPLIED ("${overrideReason}") — $${cost.toFixed(2)} > $${cap.toFixed(2)} cap`,
    );
    return;
  }

  throw new Error(
    `Eval cost exceeded hard cap for tier ${tier}: ` +
      `$${cost.toFixed(2)} > $${cap.toFixed(2)}. ` +
      `Set EVALS_BUDGET_OVERRIDE_REASON="why this is OK" to allow + audit. ` +
      `Per-tier override: EVALS_BUDGET_HARD_CAP_${tier === 'e2e' ? 'GATE' : 'PERIODIC'}=<dollars>. ` +
      `Run: ${latest.filepath}`,
  );
}
|
||
|
||
// Registers the four gate checks: the per-tier regression-ratio comparison
// first, then the T5 hard dollar cap on per-run cost (a separate check from
// the ratio comparison).
describe('tool budget regression (gate, free)', () => {
  const checks: ReadonlyArray<readonly [string, () => void]> = [
    ['no e2e test exceeds 2× prior tool calls or turns', () => checkTier('e2e')],
    ['no llm-judge test exceeds 2× prior tool calls or turns', () => checkTier('llm-judge')],
    // T5: hard dollar cap on per-run cost (different from regression ratio above)
    ['e2e run cost ≤ EVALS_BUDGET_HARD_CAP_GATE', () => checkHardCap('e2e')],
    ['llm-judge run cost ≤ EVALS_BUDGET_HARD_CAP_PERIODIC', () => checkHardCap('llm-judge')],
  ];

  for (const [title, run] of checks) {
    test(title, () => {
      run();
    });
  }
});
|