mirror of
https://github.com/garrytan/gstack.git
synced 2026-05-05 05:05:08 +02:00
feat: add eval format validation, tier selection, cost tracking
- lib/eval-format.ts: StandardEvalResult interfaces, validateEvalResult(), normalizeFromLegacy/normalizeToLegacy round-trip converters - lib/eval-tier.ts: EvalTier type, resolveTier/resolveJudgeTier from env, tierToModel mapping, TIER_ALIASES (haiku→fast, sonnet→standard, opus→full) - lib/eval-cost.ts: MODEL_PRICING (last verified 2025-05-01), computeCosts(), formatCostDashboard(), aggregateCosts(), fallback for unknown models - 42 tests across 3 test files Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,158 @@
|
||||
/**
|
||||
* Per-model cost tracking for eval runs.
|
||||
*
|
||||
* Computes cost breakdowns from CostEntry arrays and formats
|
||||
* them as terminal tables. Supports aggregation across multiple runs.
|
||||
*/
|
||||
|
||||
import type { CostEntry, StandardEvalResult } from './eval-format';
|
||||
|
||||
// --- Interfaces ---
|
||||
|
||||
/** Usage and estimated spend rolled up for one model. */
export interface CostSummary {
  /** Model identifier the usage was recorded under. */
  model: string;
  /** Number of calls, summed over every entry for this model. */
  calls: number;
  /** Input tokens, summed over every entry for this model. */
  input_tokens: number;
  /** Output tokens, summed over every entry for this model. */
  output_tokens: number;
  /** Estimated cost in USD for the summed token counts. */
  estimated_cost_usd: number;
}
|
||||
|
||||
/** Outcome of a cost computation: per-model rows plus overall totals. */
export interface CostDashboard {
  /** One summary per distinct model. */
  entries: CostSummary[];
  /** Sum of every per-model estimated cost, in USD. */
  total: number;
  /** What the same token volume would cost at fast-tier pricing, in USD. */
  at_fast_tier: number;
  /** What the same token volume would cost at full-tier pricing, in USD. */
  at_full_tier: number;
}
|
||||
|
||||
// --- Pricing ---
|
||||
|
||||
/**
 * Per-million-token pricing (USD) for Claude models.
 * Last verified: 2025-05-01
 */
export const MODEL_PRICING: Record<string, { input: number; output: number }> = {
  'claude-opus-4-6': { input: 15.00, output: 75.00 },
  'claude-sonnet-4-6': { input: 3.00, output: 15.00 },
  'claude-haiku-4-5': { input: 0.80, output: 4.00 },
  // Legacy model IDs
  'claude-3-5-sonnet-20241022': { input: 3.00, output: 15.00 },
  'claude-3-5-haiku-20241022': { input: 0.80, output: 4.00 },
  'claude-3-opus-20240229': { input: 15.00, output: 75.00 },
};

/** Fallback pricing for unknown models (use sonnet pricing as a safe middle ground). */
const FALLBACK_PRICING = { input: 3.00, output: 15.00 };

// --- Computation ---

/**
 * Look up the pricing for a model id.
 *
 * Only keys defined directly on MODEL_PRICING count as known models. A plain
 * `MODEL_PRICING[model]` lookup also resolves inherited names such as
 * 'constructor' or '__proto__' to non-pricing values, which would turn every
 * computed cost into NaN.
 *
 * @param model Model identifier as recorded in a cost entry.
 * @returns The model's pricing, or FALLBACK_PRICING when the model is unknown.
 */
function getPricing(model: string): { input: number; output: number } {
  const known = Object.prototype.hasOwnProperty.call(MODEL_PRICING, model)
    ? MODEL_PRICING[model]
    : undefined;
  return known ?? FALLBACK_PRICING;
}
|
||||
|
||||
/**
 * Roll CostEntry records up into one summary per model and price them.
 *
 * Entries sharing a model id have their calls and token counts added
 * together. Each summary is priced with that model's rates, and the same
 * token volume is also priced at the haiku ("fast") and opus ("full") rates
 * as a what-if comparison.
 *
 * @param costs Raw per-call or per-run usage records; may be empty.
 * @returns Summaries sorted by estimated cost (highest first) plus totals
 *          rounded to six decimal places. Per-entry costs are not rounded.
 */
export function computeCosts(costs: CostEntry[]): CostDashboard {
  const TOKENS_PER_MILLION = 1_000_000;
  const MICRO = 1_000_000;

  // Price a summary's token counts at the given per-million rates.
  const priceAt = (usage: CostSummary, rate: { input: number; output: number }): number =>
    (usage.input_tokens / TOKENS_PER_MILLION) * rate.input +
    (usage.output_tokens / TOKENS_PER_MILLION) * rate.output;

  // Round a dollar amount to six decimal places.
  const toMicroPrecision = (usd: number): number => Math.round(usd * MICRO) / MICRO;

  // Pass 1: merge usage by model id.
  const perModel = new Map<string, CostSummary>();
  costs.forEach(({ model, calls, input_tokens, output_tokens }) => {
    const bucket = perModel.get(model);
    if (bucket) {
      bucket.calls += calls;
      bucket.input_tokens += input_tokens;
      bucket.output_tokens += output_tokens;
    } else {
      perModel.set(model, { model, calls, input_tokens, output_tokens, estimated_cost_usd: 0 });
    }
  });

  // Pass 2: price each bucket at its own rates and at the two reference tiers.
  const haikuRate = MODEL_PRICING['claude-haiku-4-5'] || FALLBACK_PRICING;
  const opusRate = MODEL_PRICING['claude-opus-4-6'] || FALLBACK_PRICING;
  const running = { actual: 0, fast: 0, full: 0 };

  perModel.forEach((summary) => {
    summary.estimated_cost_usd = priceAt(summary, getPricing(summary.model));
    running.actual += summary.estimated_cost_usd;
    running.fast += priceAt(summary, haikuRate);
    running.full += priceAt(summary, opusRate);
  });

  const entries = Array.from(perModel.values()).sort(
    (a, b) => b.estimated_cost_usd - a.estimated_cost_usd,
  );

  return {
    entries,
    total: toMicroPrecision(running.actual),
    at_fast_tier: toMicroPrecision(running.fast),
    at_full_tier: toMicroPrecision(running.full),
  };
}
|
||||
|
||||
/**
 * Format a CostDashboard as a terminal table.
 *
 * Layout: a title, a header row, one row per dashboard entry, then the total
 * and the two what-if tier totals. Model names longer than 30 characters are
 * cut to 27 characters plus '...'.
 *
 * The model cell is always padded to the full column width *after*
 * truncation. Padding only the untruncated names left truncated rows two
 * characters short, which shifted every following column out of alignment.
 *
 * Token counts use `toLocaleString()`, so digit grouping depends on the
 * runtime locale.
 *
 * @param dashboard Dashboard to render.
 * @returns The table as a newline-joined string with a leading and trailing blank line.
 */
export function formatCostDashboard(dashboard: CostDashboard): string {
  const MODEL_COL_WIDTH = 32;
  const MODEL_MAX_CHARS = 30;
  const RULE_WIDTH = 75;

  const lines: string[] = [
    '',
    'Cost Breakdown',
    '═'.repeat(RULE_WIDTH),
    ' ' +
      'Model'.padEnd(MODEL_COL_WIDTH) +
      'Calls'.padEnd(8) +
      'In Tokens'.padEnd(12) +
      'Out Tokens'.padEnd(12) +
      'Cost',
    '─'.repeat(RULE_WIDTH),
  ];

  for (const entry of dashboard.entries) {
    const shown =
      entry.model.length > MODEL_MAX_CHARS
        ? entry.model.slice(0, MODEL_MAX_CHARS - 3) + '...'
        : entry.model;
    lines.push(
      ` ${shown.padEnd(MODEL_COL_WIDTH)}` +
        `${entry.calls}`.padEnd(8) +
        `${entry.input_tokens.toLocaleString()}`.padEnd(12) +
        `${entry.output_tokens.toLocaleString()}`.padEnd(12) +
        `$${entry.estimated_cost_usd.toFixed(4)}`,
    );
  }

  lines.push('─'.repeat(RULE_WIDTH));
  lines.push(` Total: $${dashboard.total.toFixed(4)}`);
  lines.push(` At fast tier (Haiku): $${dashboard.at_fast_tier.toFixed(4)}`);
  lines.push(` At full tier (Opus): $${dashboard.at_full_tier.toFixed(4)}`);
  lines.push('');

  return lines.join('\n');
}
|
||||
|
||||
/**
 * Build one cost dashboard covering several eval runs.
 *
 * The costs[] arrays of all runs are concatenated in order (runs without a
 * costs array contribute nothing) and handed to computeCosts.
 *
 * @param results Eval runs to combine.
 * @returns A single dashboard for the combined usage.
 */
export function aggregateCosts(results: StandardEvalResult[]): CostDashboard {
  const combined: CostEntry[] = results.flatMap((run) => (run.costs ? run.costs : []));
  return computeCosts(combined);
}
|
||||
@@ -0,0 +1,229 @@
|
||||
/**
|
||||
* Standard eval result format — validation and normalization.
|
||||
*
|
||||
* Superset of the legacy EvalResult from test/helpers/eval-store.ts.
|
||||
* Any language can produce a JSON file matching StandardEvalResult and
|
||||
* push it through `gstack eval push`.
|
||||
*/
|
||||
|
||||
import type { EvalResult, EvalTestEntry } from '../test/helpers/eval-store';
|
||||
|
||||
// --- Interfaces ---
|
||||
|
||||
/** Raw usage record for one model, as carried in StandardEvalResult.costs. */
export interface CostEntry {
  /** Model identifier the usage belongs to. */
  model: string;
  /** Number of calls covered by this record. */
  calls: number;
  /** Input tokens consumed. */
  input_tokens: number;
  /** Output tokens produced. */
  output_tokens: number;
}
|
||||
|
||||
/** One failed test, as listed in StandardEvalResult.failures. */
export interface FailureEntry {
  /** Name of the failing test. */
  test_name: string;
  /** Error text associated with the failure. */
  error: string;
  /** Optional grouping label for the failure. */
  category?: string;
}
|
||||
|
||||
/** One row of a comparison, as listed in StandardEvalResult.comparison. */
export interface ComparisonEntry {
  /** Display label for the row. */
  label: string;
  /** Model identifier the row refers to. */
  model: string;
  /** Score recorded for the row. */
  score: number;
  /** Cost recorded for the row, in USD. */
  cost_usd: number;
}
|
||||
|
||||
/** Result of a single test inside a StandardEvalResult. */
export interface StandardTestEntry {
  /** Test name. */
  name: string;
  /** Suite the test belongs to. */
  suite: string;
  /** Tier label; the legacy format narrows this to 'e2e' | 'llm-judge'. */
  tier: string;
  /** Whether the test passed. */
  passed: boolean;
  /** Wall-clock duration in milliseconds. */
  duration_ms: number;
  /** Cost of the test in USD. */
  cost_usd: number;
  /** Free-form structured output; not carried over when converting to the legacy format. */
  output?: Record<string, unknown>;

  // Optional fields from legacy format
  turns_used?: number;
  exit_reason?: string;
  detection_rate?: number;
  false_positives?: number;
  evidence_quality?: number;
  detected_bugs?: string[];
  missed_bugs?: string[];
  judge_scores?: Record<string, number>;
  judge_reasoning?: string;
  error?: string;
}
|
||||
|
||||
/**
 * A complete eval run in the standard (language-neutral) format.
 *
 * Field names differ from the legacy EvalResult in a few places; the legacy
 * name is noted where the converters in this file rename a field.
 */
export interface StandardEvalResult {
  schema_version: number;
  version: string;
  /** Optional human-readable label for the run. */
  label?: string;
  /** Branch name (legacy: `branch`). */
  git_branch: string;
  git_sha: string;
  timestamp: string;
  hostname: string;
  tier: string;
  /** Number of tests in the run (legacy: `total_tests`). */
  total: number;
  passed: number;
  failed: number;
  total_cost_usd: number;
  /** Run duration in seconds (legacy: `total_duration_ms`, in milliseconds). */
  duration_seconds: number;
  /** Per-test results (legacy: `tests`). */
  all_results: StandardTestEntry[];
  prompt_sha?: string;
  /** Pass/fail counts keyed by category name. */
  by_category?: Record<string, { passed: number; failed: number }>;
  /** Per-model usage records. */
  costs?: CostEntry[];
  comparison?: ComparisonEntry[];
  failures?: FailureEntry[];
  /** Carried through unchanged by the legacy converters. */
  _partial?: boolean;
}
|
||||
|
||||
// --- Validation ---
|
||||
|
||||
/**
 * Top-level fields that must be present, paired with the kind of value
 * expected. Scalar kinds are compared against `typeof`; the special kind
 * 'array' is checked with Array.isArray.
 *
 * NOTE(review): `hostname` is declared as required on StandardEvalResult but
 * is not listed here — confirm whether that omission is intentional.
 */
const REQUIRED_FIELDS: Array<[string, string]> = [
  ['schema_version', 'number'],
  ['version', 'string'],
  ['git_branch', 'string'],
  ['git_sha', 'string'],
  ['timestamp', 'string'],
  ['tier', 'string'],
  ['total', 'number'],
  ['passed', 'number'],
  ['failed', 'number'],
  ['total_cost_usd', 'number'],
  ['duration_seconds', 'number'],
  ['all_results', 'array'],
];

/**
 * Validate that an unknown value conforms to StandardEvalResult.
 *
 * Checks that every required top-level field exists with the expected kind
 * of value, and that each element of `all_results` is an object with a
 * string `name` and a boolean `passed`. Optional fields are not inspected.
 *
 * A wrongly typed `all_results` is reported once, as "must be an array",
 * instead of once as a `typeof` mismatch and again as a non-array.
 *
 * @param data Value to check, typically the result of JSON.parse.
 * @returns `{ valid: true, errors: [] }` or `{ valid: false, errors: [...] }`.
 */
export function validateEvalResult(data: unknown): { valid: boolean; errors: string[] } {
  if (data === null || typeof data !== 'object') {
    return { valid: false, errors: ['Input must be a non-null object'] };
  }

  const obj = data as Record<string, unknown>;
  const errors: string[] = [];

  for (const [field, expectedType] of REQUIRED_FIELDS) {
    if (!(field in obj)) {
      errors.push(`Missing required field: ${field}`);
      continue;
    }
    const value = obj[field];
    if (expectedType === 'array') {
      // typeof cannot tell arrays from other objects (or null), so check directly.
      if (!Array.isArray(value)) {
        errors.push(`Field "${field}" must be an array`);
      }
    } else if (typeof value !== expectedType) {
      errors.push(`Field "${field}" must be ${expectedType}, got ${typeof value}`);
    }
  }

  // Validate each test entry minimally: only `name` and `passed` are checked.
  if (Array.isArray(obj.all_results)) {
    obj.all_results.forEach((entry: unknown, i: number) => {
      if (typeof entry !== 'object' || entry === null) {
        errors.push(`all_results[${i}] must be an object`);
        return;
      }
      const record = entry as Record<string, unknown>;
      if (typeof record.name !== 'string') {
        errors.push(`all_results[${i}].name must be a string`);
      }
      if (typeof record.passed !== 'boolean') {
        errors.push(`all_results[${i}].passed must be a boolean`);
      }
    });
  }

  return { valid: errors.length === 0, errors };
}
|
||||
|
||||
// --- Normalization ---
|
||||
|
||||
/**
 * Convert legacy EvalResult → StandardEvalResult.
 *
 * Renames `branch` → `git_branch`, `total_tests` → `total` and
 * `tests` → `all_results`. The duration is converted from milliseconds to
 * whole seconds with Math.round, so sub-second precision is lost.
 *
 * @param legacy Run in the legacy format.
 * @returns The same run in the standard format.
 */
export function normalizeFromLegacy(legacy: EvalResult): StandardEvalResult {
  const {
    schema_version,
    version,
    branch,
    git_sha,
    timestamp,
    hostname,
    tier,
    total_tests,
    passed,
    failed,
    total_cost_usd,
    total_duration_ms,
    tests,
    _partial,
  } = legacy;

  return {
    schema_version,
    version,
    git_branch: branch,
    git_sha,
    timestamp,
    hostname,
    tier,
    total: total_tests,
    passed,
    failed,
    total_cost_usd,
    duration_seconds: Math.round(total_duration_ms / 1000),
    all_results: tests.map((t) => legacyTestToStandard(t)),
    _partial,
  };
}
|
||||
|
||||
/**
 * Convert one legacy test entry to the standard shape.
 *
 * The six core fields are always copied. Optional fields appear in the
 * result only when set on the input: scalar fields when they are not
 * `undefined`, and the bug lists and judge scores when they are truthy.
 */
function legacyTestToStandard(t: EvalTestEntry): StandardTestEntry {
  return {
    name: t.name,
    suite: t.suite,
    tier: t.tier,
    passed: t.passed,
    duration_ms: t.duration_ms,
    cost_usd: t.cost_usd,
    // Spreading `false` or `undefined` adds no keys, so absent fields stay absent.
    ...(t.turns_used !== undefined && { turns_used: t.turns_used }),
    ...(t.exit_reason !== undefined && { exit_reason: t.exit_reason }),
    ...(t.detection_rate !== undefined && { detection_rate: t.detection_rate }),
    ...(t.false_positives !== undefined && { false_positives: t.false_positives }),
    ...(t.evidence_quality !== undefined && { evidence_quality: t.evidence_quality }),
    ...(t.detected_bugs && { detected_bugs: t.detected_bugs }),
    ...(t.missed_bugs && { missed_bugs: t.missed_bugs }),
    ...(t.judge_scores && { judge_scores: t.judge_scores }),
    ...(t.judge_reasoning !== undefined && { judge_reasoning: t.judge_reasoning }),
    ...(t.error !== undefined && { error: t.error }),
  };
}
|
||||
|
||||
/**
 * Convert StandardEvalResult → legacy EvalResult for compat with existing compare/list.
 *
 * Renames `git_branch` → `branch`, `total` → `total_tests` and
 * `all_results` → `tests`, and converts the duration from seconds to
 * milliseconds. Fields that exist only in the standard format (label,
 * prompt_sha, by_category, costs, comparison, failures) are dropped.
 *
 * `tier` is cast to the legacy union without a runtime check.
 *
 * @param standard Run in the standard format.
 * @returns The same run in the legacy format.
 */
export function normalizeToLegacy(standard: StandardEvalResult): EvalResult {
  const {
    schema_version,
    version,
    git_branch,
    git_sha,
    timestamp,
    hostname,
    tier,
    total,
    passed,
    failed,
    total_cost_usd,
    duration_seconds,
    all_results,
    _partial,
  } = standard;

  return {
    schema_version,
    version,
    branch: git_branch,
    git_sha,
    timestamp,
    hostname,
    tier: tier as 'e2e' | 'llm-judge',
    total_tests: total,
    passed,
    failed,
    total_cost_usd,
    total_duration_ms: duration_seconds * 1000,
    tests: all_results.map((t) => standardTestToLegacy(t)),
    _partial,
  };
}
|
||||
|
||||
/**
 * Convert one standard test entry to the legacy shape.
 *
 * The six core fields are always copied (`tier` is cast to the legacy union
 * without a runtime check). Optional fields appear in the result only when
 * set on the input: scalar fields when they are not `undefined`, and the bug
 * lists and judge scores when they are truthy. `output` is not copied.
 */
function standardTestToLegacy(t: StandardTestEntry): EvalTestEntry {
  return {
    name: t.name,
    suite: t.suite,
    tier: t.tier as 'e2e' | 'llm-judge',
    passed: t.passed,
    duration_ms: t.duration_ms,
    cost_usd: t.cost_usd,
    // Spreading `false` or `undefined` adds no keys, so absent fields stay absent.
    ...(t.turns_used !== undefined && { turns_used: t.turns_used }),
    ...(t.exit_reason !== undefined && { exit_reason: t.exit_reason }),
    ...(t.detection_rate !== undefined && { detection_rate: t.detection_rate }),
    ...(t.false_positives !== undefined && { false_positives: t.false_positives }),
    ...(t.evidence_quality !== undefined && { evidence_quality: t.evidence_quality }),
    ...(t.detected_bugs && { detected_bugs: t.detected_bugs }),
    ...(t.missed_bugs && { missed_bugs: t.missed_bugs }),
    ...(t.judge_scores && { judge_scores: t.judge_scores }),
    ...(t.judge_reasoning !== undefined && { judge_reasoning: t.judge_reasoning }),
    ...(t.error !== undefined && { error: t.error }),
  };
}
|
||||
@@ -0,0 +1,51 @@
|
||||
/**
|
||||
* Model tier selection for evals.
|
||||
*
|
||||
* Maps tier names to Claude models. Supports env var overrides
|
||||
* for EVAL_TIER and EVAL_JUDGE_TIER.
|
||||
*/
|
||||
|
||||
/** Named cost/quality tiers an eval can run at. */
export type EvalTier = 'fast' | 'standard' | 'full';

/** Model-family nicknames accepted in place of a tier name. */
export const TIER_ALIASES: Record<string, EvalTier> = {
  haiku: 'fast',
  sonnet: 'standard',
  opus: 'full',
};

/** Claude model ID used for each tier. */
const TIER_TO_MODEL: Record<EvalTier, string> = {
  fast: 'claude-haiku-4-5',
  standard: 'claude-sonnet-4-6',
  full: 'claude-opus-4-6',
};

/**
 * Parse a raw env value into a tier.
 *
 * The value is lower-cased and trimmed, then matched against the aliases and
 * the tier names. Aliases are matched with an own-property check: the `in`
 * operator also matches inherited names such as 'constructor', which would
 * hand callers a non-tier value.
 *
 * @returns The tier, or `undefined` when the value is unset, empty or unrecognized.
 */
function parseTier(value: string | undefined): EvalTier | undefined {
  const raw = value?.toLowerCase().trim();
  if (!raw) return undefined;
  if (Object.prototype.hasOwnProperty.call(TIER_ALIASES, raw)) return TIER_ALIASES[raw];
  if (raw === 'fast' || raw === 'standard' || raw === 'full') return raw;
  return undefined;
}

/**
 * Resolve the eval tier from EVAL_TIER env var.
 * Supports both tier names ('fast', 'standard', 'full') and
 * model aliases ('haiku', 'sonnet', 'opus').
 * Defaults to 'standard'.
 */
export function resolveTier(): EvalTier {
  return parseTier(process.env.EVAL_TIER) ?? 'standard';
}

/**
 * Resolve the judge tier from EVAL_JUDGE_TIER env var.
 * Falls back to resolveTier() if not set or not recognized.
 */
export function resolveJudgeTier(): EvalTier {
  return parseTier(process.env.EVAL_JUDGE_TIER) ?? resolveTier();
}

/** Map a tier to its Claude model ID. */
export function tierToModel(tier: EvalTier): string {
  return TIER_TO_MODEL[tier];
}
|
||||
@@ -0,0 +1,155 @@
|
||||
/**
|
||||
* Tests for lib/eval-cost.ts — per-model cost tracking.
|
||||
*/
|
||||
|
||||
import { describe, test, expect } from 'bun:test';
|
||||
import {
|
||||
MODEL_PRICING,
|
||||
computeCosts,
|
||||
formatCostDashboard,
|
||||
aggregateCosts,
|
||||
} from '../lib/eval-cost';
|
||||
import type { CostEntry, StandardEvalResult } from '../lib/eval-format';
|
||||
|
||||
describe('lib/eval-cost', () => {
  /** Build a CostEntry without repeating the field names in every test. */
  const usage = (
    model: string,
    calls: number,
    input_tokens: number,
    output_tokens: number,
  ): CostEntry => ({ model, calls, input_tokens, output_tokens });

  /** Build a minimal eval result; the `costs` key is left off entirely when not supplied. */
  const evalRun = (
    git_sha: string,
    total_cost_usd: number,
    duration_seconds: number,
    costs?: CostEntry[],
  ): StandardEvalResult => ({
    schema_version: 1,
    version: '1.0',
    git_branch: 'main',
    git_sha,
    timestamp: '',
    hostname: '',
    tier: 'e2e',
    total: 1,
    passed: 1,
    failed: 0,
    total_cost_usd,
    duration_seconds,
    all_results: [],
    ...(costs ? { costs } : {}),
  });

  describe('MODEL_PRICING', () => {
    test('includes current Claude models', () => {
      for (const id of ['claude-opus-4-6', 'claude-sonnet-4-6', 'claude-haiku-4-5']) {
        expect(MODEL_PRICING[id]).toBeDefined();
      }
    });

    test('has input and output pricing for each model', () => {
      Object.values(MODEL_PRICING).forEach(({ input, output }) => {
        expect(input).toBeGreaterThan(0);
        expect(output).toBeGreaterThan(0);
        expect(output).toBeGreaterThanOrEqual(input);
      });
    });
  });

  describe('computeCosts', () => {
    test('computes cost for a single model', () => {
      const { entries, total } = computeCosts([usage('claude-sonnet-4-6', 10, 1_000_000, 500_000)]);
      expect(entries.length).toBe(1);
      expect(entries[0].model).toBe('claude-sonnet-4-6');
      expect(entries[0].calls).toBe(10);
      // 1M input at $3/M plus 0.5M output at $15/M: 3 + 7.5 = 10.5
      expect(total).toBeCloseTo(10.5, 2);
    });

    test('aggregates multiple entries for same model', () => {
      const { entries } = computeCosts([
        usage('claude-haiku-4-5', 5, 100_000, 50_000),
        usage('claude-haiku-4-5', 3, 200_000, 100_000),
      ]);
      expect(entries.length).toBe(1);
      expect(entries[0].calls).toBe(8);
      expect(entries[0].input_tokens).toBe(300_000);
      expect(entries[0].output_tokens).toBe(150_000);
    });

    test('handles multiple models', () => {
      const { entries } = computeCosts([
        usage('claude-haiku-4-5', 5, 100_000, 50_000),
        usage('claude-opus-4-6', 1, 100_000, 50_000),
      ]);
      expect(entries.length).toBe(2);
      // Rows come back most expensive first, so opus leads.
      expect(entries[0].model).toBe('claude-opus-4-6');
    });

    test('uses fallback pricing for unknown models', () => {
      const { entries, total } = computeCosts([usage('unknown-model-xyz', 1, 1_000_000, 1_000_000)]);
      expect(entries.length).toBe(1);
      // Fallback equals sonnet pricing: 3 + 15 = 18
      expect(total).toBeCloseTo(18, 2);
    });

    test('computes what-if at fast and full tiers', () => {
      const { total, at_fast_tier, at_full_tier } = computeCosts([
        usage('claude-sonnet-4-6', 1, 1_000_000, 1_000_000),
      ]);
      expect(at_fast_tier).toBeLessThan(total);
      expect(at_full_tier).toBeGreaterThan(total);
    });

    test('handles empty input', () => {
      const { entries, total } = computeCosts([]);
      expect(entries.length).toBe(0);
      expect(total).toBe(0);
    });
  });

  describe('formatCostDashboard', () => {
    test('produces readable output', () => {
      const output = formatCostDashboard(
        computeCosts([usage('claude-sonnet-4-6', 10, 500_000, 250_000)]),
      );
      for (const fragment of [
        'Cost Breakdown',
        'claude-sonnet-4-6',
        '10',
        'Total:',
        'fast tier',
        'full tier',
      ]) {
        expect(output).toContain(fragment);
      }
    });
  });

  describe('aggregateCosts', () => {
    test('merges costs from multiple results', () => {
      const { entries } = aggregateCosts([
        evalRun('abc', 1, 10, [usage('claude-haiku-4-5', 5, 100_000, 50_000)]),
        evalRun('def', 2, 20, [usage('claude-haiku-4-5', 3, 200_000, 100_000)]),
      ]);
      expect(entries.length).toBe(1);
      expect(entries[0].calls).toBe(8);
    });

    test('handles results without costs field', () => {
      const { entries, total } = aggregateCosts([evalRun('abc', 1, 10)]);
      expect(entries.length).toBe(0);
      expect(total).toBe(0);
    });
  });
});
|
||||
@@ -0,0 +1,159 @@
|
||||
/**
|
||||
* Tests for lib/eval-format.ts — standard eval result validation and normalization.
|
||||
*/
|
||||
|
||||
import { describe, test, expect } from 'bun:test';
|
||||
import {
|
||||
validateEvalResult,
|
||||
normalizeFromLegacy,
|
||||
normalizeToLegacy,
|
||||
} from '../lib/eval-format';
|
||||
import type { StandardEvalResult } from '../lib/eval-format';
|
||||
import type { EvalResult } from '../test/helpers/eval-store';
|
||||
|
||||
/** Fixture: a well-formed standard result with one passing and one failing test. */
function makeValidStandard(): StandardEvalResult {
  const outcomes: Array<[string, boolean]> = [
    ['test-a', true],
    ['test-b', false],
  ];
  const all_results = outcomes.map(([name, passed]) => ({
    name,
    suite: 'core',
    tier: 'e2e',
    passed,
    duration_ms: 60000,
    cost_usd: 0.75,
  }));

  return {
    schema_version: 1,
    version: '0.3.3',
    git_branch: 'main',
    git_sha: 'abc1234',
    timestamp: '2025-05-01T12:00:00Z',
    hostname: 'test-host',
    tier: 'e2e',
    total: 2,
    passed: 1,
    failed: 1,
    total_cost_usd: 1.50,
    duration_seconds: 120,
    all_results,
  };
}
|
||||
|
||||
/** Fixture: a legacy result whose two tests each carry a different optional field. */
function makeLegacy(): EvalResult {
  type LegacyTest = EvalResult['tests'][number];

  // Both tests share everything except name, outcome and one optional extra.
  const legacyTest = (name: string, passed: boolean, extra: Partial<LegacyTest>): LegacyTest => ({
    name,
    suite: 'core',
    tier: 'e2e',
    passed,
    duration_ms: 60000,
    cost_usd: 0.75,
    ...extra,
  });

  return {
    schema_version: 1,
    version: '0.3.3',
    branch: 'main',
    git_sha: 'abc1234',
    timestamp: '2025-05-01T12:00:00Z',
    hostname: 'test-host',
    tier: 'e2e',
    total_tests: 2,
    passed: 1,
    failed: 1,
    total_cost_usd: 1.50,
    total_duration_ms: 120000,
    tests: [
      legacyTest('test-a', true, { turns_used: 5 }),
      legacyTest('test-b', false, { detection_rate: 3 }),
    ],
  };
}
|
||||
|
||||
describe('lib/eval-format', () => {
  /** True when at least one error message contains every given fragment. */
  const mentions = (errors: string[], ...fragments: string[]): boolean =>
    errors.some((message) => fragments.every((fragment) => message.includes(fragment)));

  describe('validateEvalResult', () => {
    test('accepts valid standard result', () => {
      const { valid, errors } = validateEvalResult(makeValidStandard());
      expect(valid).toBe(true);
      expect(errors).toEqual([]);
    });

    test('rejects null', () => {
      const { valid, errors } = validateEvalResult(null);
      expect(valid).toBe(false);
      expect(errors[0]).toContain('non-null object');
    });

    test('rejects non-object', () => {
      expect(validateEvalResult('not an object').valid).toBe(false);
    });

    test('reports missing required fields', () => {
      const { valid, errors } = validateEvalResult({});
      expect(valid).toBe(false);
      expect(errors.length).toBeGreaterThan(5);
      expect(mentions(errors, 'schema_version')).toBe(true);
      expect(mentions(errors, 'git_branch')).toBe(true);
    });

    test('reports wrong types', () => {
      const { valid, errors } = validateEvalResult({
        ...makeValidStandard(),
        schema_version: 'not a number',
      });
      expect(valid).toBe(false);
      expect(mentions(errors, 'schema_version', 'number')).toBe(true);
    });

    test('rejects non-array all_results', () => {
      const { valid, errors } = validateEvalResult({
        ...makeValidStandard(),
        all_results: 'not an array',
      });
      expect(valid).toBe(false);
      expect(mentions(errors, 'all_results', 'array')).toBe(true);
    });

    test('validates test entry names', () => {
      const { valid, errors } = validateEvalResult({
        ...makeValidStandard(),
        all_results: [{ passed: true }],
      });
      expect(valid).toBe(false);
      expect(mentions(errors, 'name')).toBe(true);
    });

    test('validates test entry passed field', () => {
      const { valid, errors } = validateEvalResult({
        ...makeValidStandard(),
        all_results: [{ name: 'test', passed: 'yes' }],
      });
      expect(valid).toBe(false);
      expect(mentions(errors, 'passed', 'boolean')).toBe(true);
    });
  });

  describe('normalizeFromLegacy', () => {
    test('maps all fields correctly', () => {
      const { git_branch, total, duration_seconds, all_results } = normalizeFromLegacy(makeLegacy());
      expect(git_branch).toBe('main');
      expect(total).toBe(2);
      expect(duration_seconds).toBe(120);
      expect(all_results.length).toBe(2);
      expect(all_results[0].turns_used).toBe(5);
      expect(all_results[1].detection_rate).toBe(3);
    });

    test('preserves optional fields when present', () => {
      const partialRun = makeLegacy();
      partialRun._partial = true;
      expect(normalizeFromLegacy(partialRun)._partial).toBe(true);
    });

    test('omits optional fields when absent', () => {
      const [first, second] = normalizeFromLegacy(makeLegacy()).all_results;
      expect(first.detection_rate).toBeUndefined();
      expect(second.turns_used).toBeUndefined();
    });
  });

  describe('normalizeToLegacy', () => {
    test('maps all fields correctly', () => {
      const { branch, total_tests, total_duration_ms, tests } = normalizeToLegacy(makeValidStandard());
      expect(branch).toBe('main');
      expect(total_tests).toBe(2);
      expect(total_duration_ms).toBe(120000);
      expect(tests.length).toBe(2);
    });

    test('round-trip preserves data', () => {
      const before = makeLegacy();
      const after = normalizeToLegacy(normalizeFromLegacy(before));
      expect(after.branch).toBe(before.branch);
      expect(after.total_tests).toBe(before.total_tests);
      expect(after.passed).toBe(before.passed);
      expect(after.failed).toBe(before.failed);
      expect(after.total_cost_usd).toBe(before.total_cost_usd);
      expect(after.tests.length).toBe(before.tests.length);
      expect(after.tests[0].name).toBe(before.tests[0].name);
      expect(after.tests[0].turns_used).toBe(before.tests[0].turns_used);
    });
  });
});
|
||||
@@ -0,0 +1,94 @@
|
||||
/**
|
||||
* Tests for lib/eval-tier.ts — model tier selection.
|
||||
*/
|
||||
|
||||
import { describe, test, expect, beforeEach, afterEach } from 'bun:test';
|
||||
import { resolveTier, resolveJudgeTier, tierToModel, TIER_ALIASES } from '../lib/eval-tier';
|
||||
|
||||
describe('lib/eval-tier', () => {
  // Snapshot of both env vars so each test can change them freely.
  const saved = {
    EVAL_TIER: process.env.EVAL_TIER,
    EVAL_JUDGE_TIER: process.env.EVAL_JUDGE_TIER,
  };

  afterEach(() => {
    for (const [key, value] of Object.entries(saved)) {
      if (value === undefined) delete process.env[key];
      else process.env[key] = value;
    }
  });

  /** Set EVAL_TIER and resolve in one step. */
  const tierFor = (value: string) => {
    process.env.EVAL_TIER = value;
    return resolveTier();
  };

  describe('resolveTier', () => {
    test('defaults to standard when unset', () => {
      delete process.env.EVAL_TIER;
      expect(resolveTier()).toBe('standard');
    });

    test('resolves tier names directly', () => {
      expect(tierFor('fast')).toBe('fast');
      expect(tierFor('full')).toBe('full');
    });

    test('resolves model aliases', () => {
      expect(tierFor('haiku')).toBe('fast');
      expect(tierFor('sonnet')).toBe('standard');
      expect(tierFor('opus')).toBe('full');
    });

    test('is case-insensitive', () => {
      expect(tierFor('HAIKU')).toBe('fast');
      expect(tierFor('Full')).toBe('full');
    });

    test('defaults to standard for unknown value', () => {
      expect(tierFor('gpt-4')).toBe('standard');
    });
  });

  describe('resolveJudgeTier', () => {
    test('falls back to EVAL_TIER when EVAL_JUDGE_TIER unset', () => {
      delete process.env.EVAL_JUDGE_TIER;
      process.env.EVAL_TIER = 'fast';
      expect(resolveJudgeTier()).toBe('fast');
    });

    test('uses EVAL_JUDGE_TIER when set', () => {
      process.env.EVAL_TIER = 'fast';
      process.env.EVAL_JUDGE_TIER = 'full';
      expect(resolveJudgeTier()).toBe('full');
    });

    test('resolves aliases for judge tier', () => {
      process.env.EVAL_JUDGE_TIER = 'opus';
      expect(resolveJudgeTier()).toBe('full');
    });
  });

  describe('tierToModel', () => {
    test('maps fast to haiku', () => {
      expect(tierToModel('fast')).toBe('claude-haiku-4-5');
    });

    test('maps standard to sonnet', () => {
      expect(tierToModel('standard')).toBe('claude-sonnet-4-6');
    });

    test('maps full to opus', () => {
      expect(tierToModel('full')).toBe('claude-opus-4-6');
    });
  });

  describe('TIER_ALIASES', () => {
    test('contains expected aliases', () => {
      const { haiku, sonnet, opus } = TIER_ALIASES;
      expect(haiku).toBe('fast');
      expect(sonnet).toBe('standard');
      expect(opus).toBe('full');
    });
  });
});
|
||||
Reference in New Issue
Block a user