feat: add eval format validation, tier selection, cost tracking

- lib/eval-format.ts: StandardEvalResult interfaces, validateEvalResult(),
  normalizeFromLegacy/normalizeToLegacy round-trip converters
- lib/eval-tier.ts: EvalTier type, resolveTier/resolveJudgeTier from env,
  tierToModel mapping, TIER_ALIASES (haiku→fast, sonnet→standard, opus→full)
- lib/eval-cost.ts: MODEL_PRICING (last verified 2025-05-01), computeCosts(),
  formatCostDashboard(), aggregateCosts(), fallback for unknown models
- 42 tests across 3 test files

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Garry Tan
2026-03-15 09:39:18 -05:00
parent 7f7035f55a
commit 9bc6c9416f
6 changed files with 846 additions and 0 deletions
+158
View File
@@ -0,0 +1,158 @@
/**
* Per-model cost tracking for eval runs.
*
* Computes cost breakdowns from CostEntry arrays and formats
* them as terminal tables. Supports aggregation across multiple runs.
*/
import type { CostEntry, StandardEvalResult } from './eval-format';
// --- Interfaces ---
/** Accumulated usage and estimated cost for a single model, as built by computeCosts(). */
export interface CostSummary {
/** Model ID that the figures below belong to. */
model: string;
/** Sum of `calls` over every CostEntry recorded for this model. */
calls: number;
/** Sum of input tokens over every CostEntry recorded for this model. */
input_tokens: number;
/** Sum of output tokens over every CostEntry recorded for this model. */
output_tokens: number;
/** Estimated cost in USD derived from per-million-token pricing; not rounded. */
estimated_cost_usd: number;
}
/** Output of computeCosts(): per-model summaries plus overall totals in USD. */
export interface CostDashboard {
/** One summary per distinct model, ordered by estimated cost, highest first. */
entries: CostSummary[];
/** Sum of every entry's estimated cost, rounded to 6 decimal places. */
total: number;
/** What-if total had all tokens been billed at the 'claude-haiku-4-5' rate (6 decimals). */
at_fast_tier: number;
/** What-if total had all tokens been billed at the 'claude-opus-4-6' rate (6 decimals). */
at_full_tier: number;
}
// --- Pricing ---
/**
 * Per-million-token pricing (USD) for Claude models, keyed by model ID.
 * Last verified: 2025-05-01
 *
 * NOTE(review): re-check these figures against the current published price
 * list whenever a model ID is added; update the date above when you do.
 */
export const MODEL_PRICING: Record<string, { input: number; output: number }> = {
  'claude-opus-4-6': { input: 15.00, output: 75.00 },
  'claude-sonnet-4-6': { input: 3.00, output: 15.00 },
  'claude-haiku-4-5': { input: 0.80, output: 4.00 },
  // Legacy model IDs
  'claude-3-5-sonnet-20241022': { input: 3.00, output: 15.00 },
  'claude-3-5-haiku-20241022': { input: 0.80, output: 4.00 },
  'claude-3-opus-20240229': { input: 15.00, output: 75.00 },
};

/** Fallback pricing for unknown models (use sonnet pricing as a safe middle ground). */
const FALLBACK_PRICING = { input: 3.00, output: 15.00 };

// --- Computation ---

/**
 * Look up the per-million-token pricing for a model ID.
 *
 * Only keys defined directly on MODEL_PRICING count as known models. A plain
 * property read would also resolve names inherited from Object.prototype
 * ('constructor', 'toString', '__proto__', ...) and hand back a non-pricing
 * value, which turns every downstream cost into NaN.
 *
 * @param model Model ID as reported in a cost entry.
 * @returns The model's pricing, or FALLBACK_PRICING when the ID is unknown.
 */
function getPricing(model: string): { input: number; output: number } {
  const known = Object.prototype.hasOwnProperty.call(MODEL_PRICING, model)
    ? MODEL_PRICING[model]
    : undefined;
  return known ?? FALLBACK_PRICING;
}
/**
 * Roll an array of CostEntry records up into a per-model cost dashboard.
 *
 * Entries that share a model ID are merged (calls and token counts summed).
 * Each merged summary is then priced with that model's per-million-token rate.
 * Alongside the real total, two what-if totals are produced by re-pricing the
 * same token volumes at the haiku ("fast") and opus ("full") rates.
 *
 * @param costs Raw cost records; may contain several records per model.
 * @returns Summaries sorted by cost (highest first) and totals rounded to 6 decimals.
 */
export function computeCosts(costs: CostEntry[]): CostDashboard {
  const TOKENS_PER_PRICE_UNIT = 1_000_000;
  type Rate = { input: number; output: number };

  const priceFor = (inTokens: number, outTokens: number, rate: Rate): number =>
    (inTokens / TOKENS_PER_PRICE_UNIT) * rate.input +
    (outTokens / TOKENS_PER_PRICE_UNIT) * rate.output;
  const roundToMicros = (usd: number): number =>
    Math.round(usd * TOKENS_PER_PRICE_UNIT) / TOKENS_PER_PRICE_UNIT;

  // Pass 1: merge records per model, keeping first-seen order.
  const summaries = new Map<string, CostSummary>();
  for (const { model, calls, input_tokens, output_tokens } of costs) {
    const summary = summaries.get(model);
    if (summary === undefined) {
      summaries.set(model, { model, calls, input_tokens, output_tokens, estimated_cost_usd: 0 });
      continue;
    }
    summary.calls += calls;
    summary.input_tokens += input_tokens;
    summary.output_tokens += output_tokens;
  }

  // Pass 2: price every summary and accumulate the real and what-if totals.
  const fastRate = MODEL_PRICING['claude-haiku-4-5'] || FALLBACK_PRICING;
  const fullRate = MODEL_PRICING['claude-opus-4-6'] || FALLBACK_PRICING;
  const entries = [...summaries.values()];
  let actualTotal = 0;
  let fastTotal = 0;
  let fullTotal = 0;
  for (const summary of entries) {
    const { input_tokens: inTokens, output_tokens: outTokens } = summary;
    summary.estimated_cost_usd = priceFor(inTokens, outTokens, getPricing(summary.model));
    actualTotal += summary.estimated_cost_usd;
    fastTotal += priceFor(inTokens, outTokens, fastRate);
    fullTotal += priceFor(inTokens, outTokens, fullRate);
  }

  // Most expensive model first; ties keep first-seen order (stable sort).
  entries.sort((left, right) => right.estimated_cost_usd - left.estimated_cost_usd);

  return {
    entries,
    total: roundToMicros(actualTotal),
    at_fast_tier: roundToMicros(fastTotal),
    at_full_tier: roundToMicros(fullTotal),
  };
}
/**
 * Format a CostDashboard as a fixed-width terminal table.
 *
 * Layout: a title, a header row, one row per dashboard entry, then three
 * summary lines (total, what-if at fast tier, what-if at full tier).
 * Model names longer than 30 characters are shortened to 27 characters plus
 * '...'. Every model cell, shortened or not, is padded to the full column
 * width so the Calls / Tokens / Cost columns stay aligned with the header.
 *
 * @param dashboard Dashboard to render (typically from computeCosts()).
 * @returns Multi-line string with a leading and a trailing blank line.
 */
export function formatCostDashboard(dashboard: CostDashboard): string {
  const RULE_WIDTH = 75;
  const MODEL_COL_WIDTH = 32;
  const MAX_MODEL_LEN = 30;
  const ELLIPSIS = '...';

  const lines: string[] = [];
  lines.push('');
  lines.push('Cost Breakdown');
  lines.push('═'.repeat(RULE_WIDTH));
  lines.push(
    ' ' +
    'Model'.padEnd(MODEL_COL_WIDTH) +
    'Calls'.padEnd(8) +
    'In Tokens'.padEnd(12) +
    'Out Tokens'.padEnd(12) +
    'Cost'
  );
  lines.push('─'.repeat(RULE_WIDTH));

  for (const entry of dashboard.entries) {
    const shown =
      entry.model.length > MAX_MODEL_LEN
        ? entry.model.slice(0, MAX_MODEL_LEN - ELLIPSIS.length) + ELLIPSIS
        : entry.model;
    // Pad after truncating; the original left truncated names 2 columns short.
    const modelCell = shown.padEnd(MODEL_COL_WIDTH);
    lines.push(
      ` ${modelCell}` +
      `${entry.calls}`.padEnd(8) +
      `${entry.input_tokens.toLocaleString()}`.padEnd(12) +
      `${entry.output_tokens.toLocaleString()}`.padEnd(12) +
      `$${entry.estimated_cost_usd.toFixed(4)}`
    );
  }

  lines.push('─'.repeat(RULE_WIDTH));
  lines.push(` Total: $${dashboard.total.toFixed(4)}`);
  lines.push(` At fast tier (Haiku): $${dashboard.at_fast_tier.toFixed(4)}`);
  lines.push(` At full tier (Opus): $${dashboard.at_full_tier.toFixed(4)}`);
  lines.push('');
  return lines.join('\n');
}
/**
 * Build one cost dashboard covering several StandardEvalResult runs.
 *
 * The costs[] arrays of all runs are concatenated in run order (runs without
 * a costs array contribute nothing) and handed to computeCosts() as one list.
 *
 * @param results Eval runs to combine.
 * @returns The dashboard computed over the combined cost records.
 */
export function aggregateCosts(results: StandardEvalResult[]): CostDashboard {
  const combined: CostEntry[] = results.flatMap((run) => (run.costs ? run.costs : []));
  return computeCosts(combined);
}
+229
View File
@@ -0,0 +1,229 @@
/**
* Standard eval result format — validation and normalization.
*
* Superset of the legacy EvalResult from test/helpers/eval-store.ts.
* Any language can produce a JSON file matching StandardEvalResult and
* push it through `gstack eval push`.
*/
import type { EvalResult, EvalTestEntry } from '../test/helpers/eval-store';
// --- Interfaces ---
/** Token usage reported for one model; listed under StandardEvalResult.costs. */
export interface CostEntry {
/** Model ID the usage is attributed to. */
model: string;
/** Number of calls covered by this record (presumably API calls — confirm with the producer). */
calls: number;
/** Input tokens consumed across those calls. */
input_tokens: number;
/** Output tokens produced across those calls. */
output_tokens: number;
}
/** One failure record; listed under StandardEvalResult.failures. */
export interface FailureEntry {
/** Name of the failing test (presumably matches a StandardTestEntry.name — TODO confirm). */
test_name: string;
/** Error text describing the failure. */
error: string;
/** Optional grouping label; the set of allowed values is not defined in this file. */
category?: string;
}
/** One row of a comparison; listed under StandardEvalResult.comparison. */
export interface ComparisonEntry {
/** Display label for this row. */
label: string;
/** Model ID the row refers to. */
model: string;
/** Score for this row; the scale is not defined in this file. */
score: number;
/** Cost in USD attributed to this row. */
cost_usd: number;
}
/**
 * Result of a single test inside a StandardEvalResult.
 *
 * validateEvalResult() only checks `name` (string) and `passed` (boolean) on
 * each entry; the remaining fields are not verified at runtime.
 */
export interface StandardTestEntry {
/** Test name. */
name: string;
/** Suite the test belongs to. */
suite: string;
/** Tier label; narrowed to 'e2e' | 'llm-judge' by assertion when converting to the legacy format. */
tier: string;
/** Whether the test passed. */
passed: boolean;
/** Test duration in milliseconds. */
duration_ms: number;
/** Cost of the test in USD. */
cost_usd: number;
/** Free-form extra output; not carried over when converting to the legacy format. */
output?: Record<string, unknown>;
// Optional fields from legacy format
turns_used?: number;
exit_reason?: string;
detection_rate?: number;
false_positives?: number;
evidence_quality?: number;
detected_bugs?: string[];
missed_bugs?: string[];
judge_scores?: Record<string, number>;
judge_reasoning?: string;
error?: string;
}
/**
 * Standard eval run result: run metadata, aggregate counters and per-test entries.
 *
 * NOTE(review): `hostname` is declared required here, but validateEvalResult()
 * does not check for it — confirm which of the two is intended.
 */
export interface StandardEvalResult {
/** Version number of this result schema. */
schema_version: number;
/** Version string carried with the result (copied from the legacy `version` on conversion). */
version: string;
/** Optional label for the run. */
label?: string;
/** Git branch of the run (legacy field name: `branch`). */
git_branch: string;
/** Git commit SHA of the run. */
git_sha: string;
/** Timestamp string for the run; the format is not enforced in this file. */
timestamp: string;
/** Name of the host that produced the run. */
hostname: string;
/** Tier label for the run. */
tier: string;
/** Total number of tests (legacy field name: `total_tests`). */
total: number;
/** Number of passing tests; not cross-checked against all_results by the validator. */
passed: number;
/** Number of failing tests; not cross-checked against all_results by the validator. */
failed: number;
/** Total cost of the run in USD. */
total_cost_usd: number;
/** Run duration in seconds (whole seconds when converted from the legacy format). */
duration_seconds: number;
/** Per-test results. */
all_results: StandardTestEntry[];
/** Optional prompt hash (presumably identifies the prompt version — confirm). */
prompt_sha?: string;
/** Optional pass/fail counters keyed by category name. */
by_category?: Record<string, { passed: number; failed: number }>;
/** Optional per-model token usage; consumed by the cost dashboard. */
costs?: CostEntry[];
/** Optional comparison rows. */
comparison?: ComparisonEntry[];
/** Optional failure records. */
failures?: FailureEntry[];
/** Passed through unchanged to and from the legacy format (presumably marks an incomplete run). */
_partial?: boolean;
}
// --- Validation ---
/** Top-level fields that must be present, paired with the `typeof` result each must have. */
const REQUIRED_FIELDS: Array<[string, string]> = [
  ['schema_version', 'number'],
  ['version', 'string'],
  ['git_branch', 'string'],
  ['git_sha', 'string'],
  ['timestamp', 'string'],
  ['tier', 'string'],
  ['total', 'number'],
  ['passed', 'number'],
  ['failed', 'number'],
  ['total_cost_usd', 'number'],
  ['duration_seconds', 'number'],
  ['all_results', 'object'], // typeof only; the array check happens separately
];

/**
 * Check whether an arbitrary value has the shape of a StandardEvalResult.
 *
 * Checks, in this order: the input is a non-null object; every REQUIRED_FIELDS
 * entry is present with the expected `typeof`; `all_results` is an array; each
 * element of `all_results` is a non-null object with a string `name` and a
 * boolean `passed`. All problems found are collected rather than stopping at
 * the first one (except for a non-object input, which returns immediately).
 *
 * @param data Value to validate, e.g. parsed JSON.
 * @returns `{ valid: true, errors: [] }` on success, otherwise `valid: false`
 *          with one message per problem.
 */
export function validateEvalResult(data: unknown): { valid: boolean; errors: string[] } {
  if (typeof data !== 'object' || data === null) {
    return { valid: false, errors: ['Input must be a non-null object'] };
  }

  const record = data as Record<string, unknown>;
  const problems: string[] = [];
  const verdict = (): { valid: boolean; errors: string[] } => ({
    valid: problems.length === 0,
    errors: problems,
  });

  // Presence and primitive type of each required top-level field.
  for (const [field, expectedType] of REQUIRED_FIELDS) {
    if (!(field in record)) {
      problems.push(`Missing required field: ${field}`);
      continue;
    }
    const actualType = typeof record[field];
    if (actualType !== expectedType) {
      problems.push(`Field "${field}" must be ${expectedType}, got ${actualType}`);
    }
  }

  // typeof reports 'object' for arrays, plain objects and null alike, so the
  // array requirement needs its own check.
  const results = record.all_results;
  if (!Array.isArray(results)) {
    if ('all_results' in record) {
      problems.push('Field "all_results" must be an array');
    }
    return verdict();
  }

  // Minimal per-entry validation: object-ness, name and passed.
  for (const [index, item] of results.entries()) {
    if (item === null || typeof item !== 'object') {
      problems.push(`all_results[${index}] must be an object`);
      continue;
    }
    const fields = item as Record<string, unknown>;
    if (typeof fields.name !== 'string') {
      problems.push(`all_results[${index}].name must be a string`);
    }
    if (typeof fields.passed !== 'boolean') {
      problems.push(`all_results[${index}].passed must be a boolean`);
    }
  }

  return verdict();
}
// --- Normalization ---
/**
 * Convert a legacy EvalResult into the StandardEvalResult shape.
 *
 * Renames `branch` → `git_branch`, `total_tests` → `total` and
 * `tests` → `all_results`. The millisecond duration becomes whole seconds via
 * Math.round, so sub-second precision is not preserved. Every test entry is
 * converted with legacyTestToStandard().
 *
 * @param legacy Result in the legacy eval-store format.
 * @returns The equivalent StandardEvalResult.
 */
export function normalizeFromLegacy(legacy: EvalResult): StandardEvalResult {
  const {
    schema_version,
    version,
    branch,
    git_sha,
    timestamp,
    hostname,
    tier,
    total_tests,
    passed,
    failed,
    total_cost_usd,
    total_duration_ms,
    tests,
    _partial,
  } = legacy;

  return {
    schema_version,
    version,
    git_branch: branch,
    git_sha,
    timestamp,
    hostname,
    tier,
    total: total_tests,
    passed,
    failed,
    total_cost_usd,
    duration_seconds: Math.round(total_duration_ms / 1000),
    all_results: tests.map((test) => legacyTestToStandard(test)),
    _partial,
  };
}
/**
 * Convert one legacy test entry into a StandardTestEntry.
 *
 * The six core fields are always copied. Optional scalar fields are copied
 * whenever they are not `undefined` (so 0 and '' survive). Optional
 * array/object fields are copied whenever they are truthy. Fields that are
 * absent on the input stay absent on the output instead of appearing as
 * explicit `undefined` keys.
 */
function legacyTestToStandard(t: EvalTestEntry): StandardTestEntry {
  return {
    name: t.name,
    suite: t.suite,
    tier: t.tier,
    passed: t.passed,
    duration_ms: t.duration_ms,
    cost_usd: t.cost_usd,
    ...(t.turns_used !== undefined ? { turns_used: t.turns_used } : {}),
    ...(t.exit_reason !== undefined ? { exit_reason: t.exit_reason } : {}),
    ...(t.detection_rate !== undefined ? { detection_rate: t.detection_rate } : {}),
    ...(t.false_positives !== undefined ? { false_positives: t.false_positives } : {}),
    ...(t.evidence_quality !== undefined ? { evidence_quality: t.evidence_quality } : {}),
    ...(t.detected_bugs ? { detected_bugs: t.detected_bugs } : {}),
    ...(t.missed_bugs ? { missed_bugs: t.missed_bugs } : {}),
    ...(t.judge_scores ? { judge_scores: t.judge_scores } : {}),
    ...(t.judge_reasoning !== undefined ? { judge_reasoning: t.judge_reasoning } : {}),
    ...(t.error !== undefined ? { error: t.error } : {}),
  };
}
/**
 * Convert a StandardEvalResult back into the legacy EvalResult shape so the
 * existing compare/list tooling can read it.
 *
 * Renames `git_branch` → `branch`, `total` → `total_tests` and
 * `all_results` → `tests`, and multiplies seconds by 1000 for the duration.
 * `tier` is narrowed to the legacy union by assertion only; no runtime check
 * is made. Fields with no legacy counterpart in this mapping (label,
 * prompt_sha, by_category, costs, comparison, failures) are not carried over.
 *
 * @param standard Result in the standard format.
 * @returns The equivalent legacy EvalResult.
 */
export function normalizeToLegacy(standard: StandardEvalResult): EvalResult {
  const {
    schema_version,
    version,
    git_branch,
    git_sha,
    timestamp,
    hostname,
    tier,
    total,
    passed,
    failed,
    total_cost_usd,
    duration_seconds,
    all_results,
    _partial,
  } = standard;

  return {
    schema_version,
    version,
    branch: git_branch,
    git_sha,
    timestamp,
    hostname,
    tier: tier as 'e2e' | 'llm-judge',
    total_tests: total,
    passed,
    failed,
    total_cost_usd,
    total_duration_ms: duration_seconds * 1000,
    tests: all_results.map((test) => standardTestToLegacy(test)),
    _partial,
  };
}
/**
 * Convert one StandardTestEntry into a legacy test entry.
 *
 * The six core fields are always copied, with `tier` narrowed to the legacy
 * union by assertion only. Optional scalar fields are copied whenever they
 * are not `undefined`; optional array/object fields whenever they are truthy.
 * `output` has no legacy counterpart here and is dropped.
 */
function standardTestToLegacy(t: StandardTestEntry): EvalTestEntry {
  return {
    name: t.name,
    suite: t.suite,
    tier: t.tier as 'e2e' | 'llm-judge',
    passed: t.passed,
    duration_ms: t.duration_ms,
    cost_usd: t.cost_usd,
    ...(t.turns_used !== undefined ? { turns_used: t.turns_used } : {}),
    ...(t.exit_reason !== undefined ? { exit_reason: t.exit_reason } : {}),
    ...(t.detection_rate !== undefined ? { detection_rate: t.detection_rate } : {}),
    ...(t.false_positives !== undefined ? { false_positives: t.false_positives } : {}),
    ...(t.evidence_quality !== undefined ? { evidence_quality: t.evidence_quality } : {}),
    ...(t.detected_bugs ? { detected_bugs: t.detected_bugs } : {}),
    ...(t.missed_bugs ? { missed_bugs: t.missed_bugs } : {}),
    ...(t.judge_scores ? { judge_scores: t.judge_scores } : {}),
    ...(t.judge_reasoning !== undefined ? { judge_reasoning: t.judge_reasoning } : {}),
    ...(t.error !== undefined ? { error: t.error } : {}),
  };
}
+51
View File
@@ -0,0 +1,51 @@
/**
* Model tier selection for evals.
*
* Maps tier names to Claude models. Supports env var overrides
* for EVAL_TIER and EVAL_JUDGE_TIER.
*/
/** Eval tier names, from cheapest to most capable model. */
export type EvalTier = 'fast' | 'standard' | 'full';

/** Model-family shorthands accepted wherever a tier name is accepted. */
export const TIER_ALIASES: Record<string, EvalTier> = {
  haiku: 'fast',
  sonnet: 'standard',
  opus: 'full',
};

/** Claude model ID used for each tier. */
const TIER_TO_MODEL: Record<EvalTier, string> = {
  fast: 'claude-haiku-4-5',
  standard: 'claude-sonnet-4-6',
  full: 'claude-opus-4-6',
};

/**
 * Parse a raw tier setting into an EvalTier.
 *
 * Matching is case-insensitive and ignores surrounding whitespace. Aliases
 * are looked up as OWN properties of TIER_ALIASES only: the `in` operator
 * also matches names inherited from Object.prototype (e.g. 'constructor',
 * '__proto__'), which would hand back a value that is not an EvalTier.
 *
 * @param raw Raw setting, typically an environment variable value.
 * @returns The tier, or `undefined` when the value is empty or unrecognized.
 */
function parseTier(raw: string | undefined): EvalTier | undefined {
  const key = raw?.toLowerCase().trim();
  if (!key) return undefined;
  if (Object.prototype.hasOwnProperty.call(TIER_ALIASES, key)) return TIER_ALIASES[key];
  if (key === 'fast' || key === 'standard' || key === 'full') return key;
  return undefined;
}

/**
 * Resolve the eval tier from EVAL_TIER env var.
 * Supports both tier names ('fast', 'standard', 'full') and
 * model aliases ('haiku', 'sonnet', 'opus').
 * Defaults to 'standard' when the variable is unset, empty or unrecognized.
 */
export function resolveTier(): EvalTier {
  return parseTier(process.env.EVAL_TIER) ?? 'standard';
}

/**
 * Resolve the judge tier from EVAL_JUDGE_TIER env var.
 * Accepts the same values as EVAL_TIER. Falls back to resolveTier() when the
 * variable is unset, empty or unrecognized.
 */
export function resolveJudgeTier(): EvalTier {
  return parseTier(process.env.EVAL_JUDGE_TIER) ?? resolveTier();
}

/** Map a tier to its Claude model ID. */
export function tierToModel(tier: EvalTier): string {
  return TIER_TO_MODEL[tier];
}