// diff --git a/lib/eval-cost.ts b/lib/eval-cost.ts
// new file mode 100644
// index 00000000..1dbe31c8
// --- /dev/null
// +++ b/lib/eval-cost.ts
// @@ -0,0 +1,158 @@
/**
 * Per-model cost tracking for eval runs.
 *
 * Computes cost breakdowns from CostEntry arrays and formats
 * them as terminal tables. Supports aggregation across multiple runs.
 */

import type { CostEntry, StandardEvalResult } from './eval-format';

// --- Interfaces ---

/** Usage totals and estimated spend for one model. */
export interface CostSummary {
  model: string;
  calls: number;
  input_tokens: number;
  output_tokens: number;
  /** Tokens priced at this model's rate (fallback rate for unknown models), in USD. */
  estimated_cost_usd: number;
}

/** Output of computeCosts: per-model rows plus overall and what-if totals. */
export interface CostDashboard {
  /** One row per distinct model, most expensive first. */
  entries: CostSummary[];
  /** Sum of every row's estimated cost, rounded to 6 decimal places. */
  total: number;
  /** The same token volume priced at the 'claude-haiku-4-5' rate. */
  at_fast_tier: number;
  /** The same token volume priced at the 'claude-opus-4-6' rate. */
  at_full_tier: number;
}

// --- Pricing ---

/**
 * Per-million-token pricing for Claude models.
 * Last verified: 2025-05-01
 */
export const MODEL_PRICING: Record<string, { input: number; output: number }> = {
  'claude-opus-4-6': { input: 15.00, output: 75.00 },
  'claude-sonnet-4-6': { input: 3.00, output: 15.00 },
  'claude-haiku-4-5': { input: 0.80, output: 4.00 },
  // Legacy model IDs
  'claude-3-5-sonnet-20241022': { input: 3.00, output: 15.00 },
  'claude-3-5-haiku-20241022': { input: 0.80, output: 4.00 },
  'claude-3-opus-20240229': { input: 15.00, output: 75.00 },
};

/** Fallback pricing for unknown models (use sonnet pricing as a safe middle ground). */
const FALLBACK_PRICING = { input: 3.00, output: 15.00 };

// --- Computation ---

/**
 * Look up the per-million-token price for `model`.
 *
 * Only the table's own keys count: a plain `MODEL_PRICING[model]` read would
 * also find inherited members such as `constructor` or `__proto__`, which are
 * truthy but carry no `input`/`output` and would turn every cost into NaN.
 *
 * @returns The model's pricing, or FALLBACK_PRICING when the model is not listed.
 */
function getPricing(model: string): { input: number; output: number } {
  return Object.prototype.hasOwnProperty.call(MODEL_PRICING, model)
    ? MODEL_PRICING[model]
    : FALLBACK_PRICING;
}

/** USD cost of the given token counts at a per-million-token price. */
function costAt(
  usage: { input_tokens: number; output_tokens: number },
  pricing: { input: number; output: number },
): number {
  return (
    (usage.input_tokens / 1_000_000) * pricing.input +
    (usage.output_tokens / 1_000_000) * pricing.output
  );
}

/** Round a USD amount to 6 decimal places. */
function roundMicros(usd: number): number {
  return Math.round(usd * 1_000_000) / 1_000_000;
}

/**
 * Compute per-model cost summaries from an array of CostEntry records.
 *
 * Entries that share a `model` are merged (calls and token counts are summed)
 * into fresh summary objects; the input entries are not modified.
 *
 * @param costs - Usage records; may be empty and may repeat a model.
 * @returns Rows sorted by estimated cost (highest first), the rounded total,
 *   and what the same token volume would cost at the haiku ("fast") and
 *   opus ("full") rates.
 */
export function computeCosts(costs: CostEntry[]): CostDashboard {
  const byModel = new Map<string, CostSummary>();

  for (const entry of costs) {
    const existing = byModel.get(entry.model);
    if (existing) {
      existing.calls += entry.calls;
      existing.input_tokens += entry.input_tokens;
      existing.output_tokens += entry.output_tokens;
    } else {
      byModel.set(entry.model, {
        model: entry.model,
        calls: entry.calls,
        input_tokens: entry.input_tokens,
        output_tokens: entry.output_tokens,
        estimated_cost_usd: 0,
      });
    }
  }

  const fastPricing = getPricing('claude-haiku-4-5');
  const fullPricing = getPricing('claude-opus-4-6');

  let total = 0;
  let atFast = 0;
  let atFull = 0;

  for (const summary of byModel.values()) {
    summary.estimated_cost_usd = costAt(summary, getPricing(summary.model));
    total += summary.estimated_cost_usd;

    // What-if: the same tokens at the fast and full tier rates.
    atFast += costAt(summary, fastPricing);
    atFull += costAt(summary, fullPricing);
  }

  const entries = [...byModel.values()].sort(
    (a, b) => b.estimated_cost_usd - a.estimated_cost_usd,
  );

  return {
    entries,
    total: roundMicros(total),
    at_fast_tier: roundMicros(atFast),
    at_full_tier: roundMicros(atFull),
  };
}
/**
 * Format a CostDashboard as a terminal table.
 *
 * Layout: a title, a header row, one row per entry, then the total and the
 * fast/full tier what-if lines. Model names longer than 30 characters are cut
 * to 27 characters plus "..."; every name, truncated or not, is padded to the
 * 32-character model column so the remaining columns stay aligned.
 * Token counts use en-US digit grouping regardless of the host locale.
 *
 * @param dashboard - Dashboard to render.
 * @returns The table as a single newline-joined string with a blank first and last line.
 */
export function formatCostDashboard(dashboard: CostDashboard): string {
  const ruleWidth = 75;
  const modelColumn = 32;
  const maxModelChars = 30;

  const lines: string[] = [];
  lines.push('');
  lines.push('Cost Breakdown');
  lines.push('═'.repeat(ruleWidth));
  lines.push(
    ' ' +
      'Model'.padEnd(modelColumn) +
      'Calls'.padEnd(8) +
      'In Tokens'.padEnd(12) +
      'Out Tokens'.padEnd(12) +
      'Cost',
  );
  lines.push('─'.repeat(ruleWidth));

  for (const entry of dashboard.entries) {
    const label =
      entry.model.length > maxModelChars
        ? entry.model.slice(0, maxModelChars - 3) + '...'
        : entry.model;
    lines.push(
      // Pad after truncating: a truncated name is only 30 characters wide.
      ` ${label.padEnd(modelColumn)}` +
        `${entry.calls}`.padEnd(8) +
        entry.input_tokens.toLocaleString('en-US').padEnd(12) +
        entry.output_tokens.toLocaleString('en-US').padEnd(12) +
        `$${entry.estimated_cost_usd.toFixed(4)}`,
    );
  }

  lines.push('─'.repeat(ruleWidth));
  lines.push(` Total: $${dashboard.total.toFixed(4)}`);
  lines.push(` At fast tier (Haiku): $${dashboard.at_fast_tier.toFixed(4)}`);
  lines.push(` At full tier (Opus): $${dashboard.at_full_tier.toFixed(4)}`);
  lines.push('');

  return lines.join('\n');
}

/**
 * Aggregate costs across multiple StandardEvalResult runs.
 * Merges all costs[] arrays and computes a single dashboard.
 *
 * @param results - Runs to combine; runs without a `costs` array contribute nothing.
 * @returns The dashboard computeCosts produces for the concatenated entries.
 */
export function aggregateCosts(results: StandardEvalResult[]): CostDashboard {
  // flatMap rather than push(...costs): spreading a very large array into a
  // call can exceed the engine's argument limit.
  const allCosts: CostEntry[] = results.flatMap((r) => r.costs ?? []);
  return computeCosts(allCosts);
}

// diff --git a/lib/eval-format.ts b/lib/eval-format.ts
// new file mode 100644
// index 00000000..0dcc347d
// --- /dev/null
// +++ b/lib/eval-format.ts
// @@ -0,0 +1,229 @@
/**
 * Standard eval result format — validation and normalization.
 *
 * Superset of the legacy EvalResult from test/helpers/eval-store.ts.
 * Any language can produce a JSON file matching StandardEvalResult and
 * push it through `gstack eval push`.
 */
/* Legacy result shapes: the standard format extends them and is converted to/from them. */
import type { EvalResult, EvalTestEntry } from '../test/helpers/eval-store';

// --- Interfaces ---

/** Token usage for one model within a run. */
export interface CostEntry {
  model: string;
  calls: number;
  input_tokens: number;
  output_tokens: number;
}

/** One failed test with its error text and an optional category label. */
export interface FailureEntry {
  test_name: string;
  error: string;
  category?: string;
}

/** One labelled score/cost data point for a model. */
export interface ComparisonEntry {
  label: string;
  model: string;
  score: number;
  cost_usd: number;
}

/** A single test outcome in the standard format. */
export interface StandardTestEntry {
  name: string;
  suite: string;
  tier: string;
  passed: boolean;
  duration_ms: number;
  cost_usd: number;
  // NOTE(review): value type not recoverable from this file; `unknown` is the safe choice — confirm.
  output?: Record<string, unknown>;

  // Optional fields from legacy format
  turns_used?: number;
  exit_reason?: string;
  detection_rate?: number;
  false_positives?: number;
  evidence_quality?: number;
  detected_bugs?: string[];
  missed_bugs?: string[];
  // Derived from the legacy entry because the value is copied in both directions unchanged.
  judge_scores?: EvalTestEntry['judge_scores'];
  judge_reasoning?: string;
  error?: string;
}

/** A complete eval run in the standard format. */
export interface StandardEvalResult {
  schema_version: number;
  version: string;
  label?: string;
  git_branch: string;
  git_sha: string;
  timestamp: string;
  // NOTE(review): declared required here but not checked by validateEvalResult — confirm which is intended.
  hostname: string;
  tier: string;
  total: number;
  passed: number;
  failed: number;
  total_cost_usd: number;
  duration_seconds: number;
  all_results: StandardTestEntry[];
  prompt_sha?: string;
  // NOTE(review): value type not recoverable from this file; `unknown` is the safe choice — confirm.
  by_category?: Record<string, unknown>;
  costs?: CostEntry[];
  comparison?: ComparisonEntry[];
  failures?: FailureEntry[];
  _partial?: boolean;
}

// --- Validation ---

/** Top-level fields that must be present, paired with their expected `typeof` result. */
const REQUIRED_FIELDS: Array<[string, string]> = [
  ['schema_version', 'number'],
  ['version', 'string'],
  ['git_branch', 'string'],
  ['git_sha', 'string'],
  ['timestamp', 'string'],
  ['tier', 'string'],
  ['total', 'number'],
  ['passed', 'number'],
  ['failed', 'number'],
  ['total_cost_usd', 'number'],
  ['duration_seconds', 'number'],
  ['all_results', 'object'], // array check below
];

/** True when `value` is a non-null object, so its properties can be read by name. */
function isRecord(value: unknown): value is Record<string, unknown> {
  return typeof value === 'object' && value !== null;
}

/**
 * Validate that an unknown value conforms to StandardEvalResult.
 * Returns { valid: true, errors: [] } or { valid: false, errors: [...] }.
 *
 * Checks, in order: the input is a non-null object; every REQUIRED_FIELDS entry
 * is present with the expected `typeof`; numeric fields are finite (parsing
 * JSON such as `1e999` yields Infinity, which passes a bare `typeof` check);
 * `all_results` is an array; each element is an object with a string `name`
 * and a boolean `passed`. All problems found are reported, not just the first.
 *
 * @param data - Value to check, typically the result of JSON.parse.
 */
export function validateEvalResult(data: unknown): { valid: boolean; errors: string[] } {
  if (!isRecord(data)) {
    return { valid: false, errors: ['Input must be a non-null object'] };
  }

  const errors: string[] = [];
  const obj = data;

  for (const [field, expectedType] of REQUIRED_FIELDS) {
    if (!(field in obj)) {
      errors.push(`Missing required field: ${field}`);
      continue;
    }
    const value = obj[field];
    if (typeof value !== expectedType) {
      errors.push(`Field "${field}" must be ${expectedType}, got ${typeof value}`);
    } else if (typeof value === 'number' && !Number.isFinite(value)) {
      errors.push(`Field "${field}" must be a finite number, got ${value}`);
    }
  }

  // all_results must be an array
  const results = obj.all_results;
  if ('all_results' in obj && !Array.isArray(results)) {
    errors.push('Field "all_results" must be an array');
  }

  // Validate each test entry minimally
  if (Array.isArray(results)) {
    results.forEach((entry: unknown, i: number) => {
      if (!isRecord(entry)) {
        errors.push(`all_results[${i}] must be an object`);
        return;
      }
      if (typeof entry.name !== 'string') {
        errors.push(`all_results[${i}].name must be a string`);
      }
      if (typeof entry.passed !== 'boolean') {
        errors.push(`all_results[${i}].passed must be a boolean`);
      }
    });
  }

  return { valid: errors.length === 0, errors };
}

// --- Normalization ---
/**
 * Convert legacy EvalResult → StandardEvalResult.
 *
 * Renames `branch` → `git_branch`, `total_tests` → `total` and `tests` →
 * `all_results`, and converts `total_duration_ms` to whole seconds (rounded).
 * `_partial` is copied only when the legacy result defines it, matching how
 * optional per-test fields are handled.
 *
 * @param legacy - Result in the legacy format.
 * @returns A new standard-format result; `legacy` is not modified.
 */
export function normalizeFromLegacy(legacy: EvalResult): StandardEvalResult {
  const standard: StandardEvalResult = {
    schema_version: legacy.schema_version,
    version: legacy.version,
    git_branch: legacy.branch,
    git_sha: legacy.git_sha,
    timestamp: legacy.timestamp,
    hostname: legacy.hostname,
    tier: legacy.tier,
    total: legacy.total_tests,
    passed: legacy.passed,
    failed: legacy.failed,
    total_cost_usd: legacy.total_cost_usd,
    // NOTE(review): rounding drops sub-second precision, so a round trip through
    // the standard format does not restore the original millisecond value.
    duration_seconds: Math.round(legacy.total_duration_ms / 1000),
    all_results: legacy.tests.map((t) => legacyTestToStandard(t)),
  };
  if (legacy._partial !== undefined) standard._partial = legacy._partial;
  return standard;
}

/**
 * Convert one legacy test entry to the standard shape.
 *
 * The six core fields are always copied. Optional fields are copied only when
 * present, so absent fields stay absent instead of becoming `undefined` keys.
 */
function legacyTestToStandard(t: EvalTestEntry): StandardTestEntry {
  const entry: StandardTestEntry = {
    name: t.name,
    suite: t.suite,
    tier: t.tier,
    passed: t.passed,
    duration_ms: t.duration_ms,
    cost_usd: t.cost_usd,
  };

  // Scalars: explicit undefined check so 0 and '' are preserved.
  if (t.turns_used !== undefined) entry.turns_used = t.turns_used;
  if (t.exit_reason !== undefined) entry.exit_reason = t.exit_reason;
  if (t.detection_rate !== undefined) entry.detection_rate = t.detection_rate;
  if (t.false_positives !== undefined) entry.false_positives = t.false_positives;
  if (t.evidence_quality !== undefined) entry.evidence_quality = t.evidence_quality;

  // Collections: any falsy value is treated as absent.
  if (t.detected_bugs) entry.detected_bugs = t.detected_bugs;
  if (t.missed_bugs) entry.missed_bugs = t.missed_bugs;
  if (t.judge_scores) entry.judge_scores = t.judge_scores;

  if (t.judge_reasoning !== undefined) entry.judge_reasoning = t.judge_reasoning;
  if (t.error !== undefined) entry.error = t.error;
  return entry;
}
/**
 * Convert StandardEvalResult → legacy EvalResult for compat with existing compare/list.
 *
 * Renames `git_branch` → `branch`, `total` → `total_tests` and `all_results` →
 * `tests`. Fields that exist only in the standard format are dropped.
 * `duration_seconds` is converted to a whole number of milliseconds; rounding
 * absorbs floating-point noise from fractional seconds (1.005 * 1000 is
 * 1004.9999999999999). `_partial` is copied only when defined.
 *
 * @param standard - Result in the standard format.
 * @returns A new legacy-format result; `standard` is not modified.
 */
export function normalizeToLegacy(standard: StandardEvalResult): EvalResult {
  const legacy: EvalResult = {
    schema_version: standard.schema_version,
    version: standard.version,
    branch: standard.git_branch,
    git_sha: standard.git_sha,
    timestamp: standard.timestamp,
    hostname: standard.hostname,
    // NOTE(review): the standard tier is an arbitrary string; this cast does not verify it.
    tier: standard.tier as 'e2e' | 'llm-judge',
    total_tests: standard.total,
    passed: standard.passed,
    failed: standard.failed,
    total_cost_usd: standard.total_cost_usd,
    total_duration_ms: Math.round(standard.duration_seconds * 1000),
    tests: standard.all_results.map((t) => standardTestToLegacy(t)),
  };
  if (standard._partial !== undefined) legacy._partial = standard._partial;
  return legacy;
}

/**
 * Convert one standard test entry to the legacy shape.
 *
 * The six core fields are always copied. Optional fields are copied only when
 * present; `output`, which has no legacy counterpart, is not copied.
 */
function standardTestToLegacy(t: StandardTestEntry): EvalTestEntry {
  const entry: EvalTestEntry = {
    name: t.name,
    suite: t.suite,
    // NOTE(review): unchecked cast, as for the run-level tier.
    tier: t.tier as 'e2e' | 'llm-judge',
    passed: t.passed,
    duration_ms: t.duration_ms,
    cost_usd: t.cost_usd,
  };

  // Scalars: explicit undefined check so 0 and '' are preserved.
  if (t.turns_used !== undefined) entry.turns_used = t.turns_used;
  if (t.exit_reason !== undefined) entry.exit_reason = t.exit_reason;
  if (t.detection_rate !== undefined) entry.detection_rate = t.detection_rate;
  if (t.false_positives !== undefined) entry.false_positives = t.false_positives;
  if (t.evidence_quality !== undefined) entry.evidence_quality = t.evidence_quality;

  // Collections: any falsy value is treated as absent.
  if (t.detected_bugs) entry.detected_bugs = t.detected_bugs;
  if (t.missed_bugs) entry.missed_bugs = t.missed_bugs;
  if (t.judge_scores) entry.judge_scores = t.judge_scores;

  if (t.judge_reasoning !== undefined) entry.judge_reasoning = t.judge_reasoning;
  if (t.error !== undefined) entry.error = t.error;
  return entry;
}

// diff --git a/lib/eval-tier.ts b/lib/eval-tier.ts
// new file mode 100644
// index 00000000..77cd440a
// --- /dev/null
// +++ b/lib/eval-tier.ts
// @@ -0,0 +1,51 @@
/**
 * Model tier selection for evals.
 *
 * Maps tier names to Claude models. Supports env var overrides
 * for EVAL_TIER and EVAL_JUDGE_TIER.
 */
/** The three selectable eval tiers. */
export type EvalTier = 'fast' | 'standard' | 'full';

/** Model-family names accepted in place of a tier name. */
export const TIER_ALIASES: Record<string, EvalTier> = {
  haiku: 'fast',
  sonnet: 'standard',
  opus: 'full',
};

/** Claude model ID used for each tier. */
const TIER_TO_MODEL: Record<EvalTier, string> = {
  fast: 'claude-haiku-4-5',
  standard: 'claude-sonnet-4-6',
  full: 'claude-opus-4-6',
};

/**
 * Parse a raw env value into a tier.
 *
 * Matching ignores case and surrounding whitespace. Aliases are consulted
 * first, then the tier names themselves. Only TIER_ALIASES' own keys count:
 * an `in` check would also accept inherited names such as `constructor` and
 * return a non-tier value.
 *
 * @returns The tier, or `undefined` when the value is unset, blank or unrecognized.
 */
function parseTier(raw: string | undefined): EvalTier | undefined {
  const key = raw?.toLowerCase().trim();
  if (!key) return undefined;
  if (Object.prototype.hasOwnProperty.call(TIER_ALIASES, key)) return TIER_ALIASES[key];
  if (key === 'fast' || key === 'standard' || key === 'full') return key;
  return undefined;
}

/**
 * Resolve the eval tier from EVAL_TIER env var.
 * Supports both tier names ('fast', 'standard', 'full') and
 * model aliases ('haiku', 'sonnet', 'opus').
 * Defaults to 'standard'.
 */
export function resolveTier(): EvalTier {
  return parseTier(process.env.EVAL_TIER) ?? 'standard';
}

/**
 * Resolve the judge tier from EVAL_JUDGE_TIER env var.
 * Falls back to resolveTier() if not set or not recognized.
 */
export function resolveJudgeTier(): EvalTier {
  return parseTier(process.env.EVAL_JUDGE_TIER) ?? resolveTier();
}

/** Map a tier to its Claude model ID. */
export function tierToModel(tier: EvalTier): string {
  return TIER_TO_MODEL[tier];
}

// diff --git a/test/lib-eval-cost.test.ts b/test/lib-eval-cost.test.ts
// new file mode 100644
// index 00000000..4d47cb15
// --- /dev/null
// +++ b/test/lib-eval-cost.test.ts
// @@ -0,0 +1,155 @@
/**
 * Tests for lib/eval-cost.ts — per-model cost tracking.
 */
/* Test harness, the module under test, and the shared result types. */
import { describe, test, expect } from 'bun:test';
import {
  MODEL_PRICING,
  computeCosts,
  formatCostDashboard,
  aggregateCosts,
} from '../lib/eval-cost';
import type { CostEntry, StandardEvalResult } from '../lib/eval-format';

/** Build a CostEntry from positional values. */
function usage(
  model: string,
  calls: number,
  input_tokens: number,
  output_tokens: number,
): CostEntry {
  return { model, calls, input_tokens, output_tokens };
}

/** Build a minimal one-test passing run; `costs` is left off entirely when not given. */
function run(
  git_sha: string,
  total_cost_usd: number,
  duration_seconds: number,
  costs?: CostEntry[],
): StandardEvalResult {
  const result: StandardEvalResult = {
    schema_version: 1,
    version: '1.0',
    git_branch: 'main',
    git_sha,
    timestamp: '',
    hostname: '',
    tier: 'e2e',
    total: 1,
    passed: 1,
    failed: 0,
    total_cost_usd,
    duration_seconds,
    all_results: [],
  };
  if (costs !== undefined) result.costs = costs;
  return result;
}

describe('lib/eval-cost', () => {
  describe('MODEL_PRICING', () => {
    test('includes current Claude models', () => {
      expect(MODEL_PRICING['claude-opus-4-6']).toBeDefined();
      expect(MODEL_PRICING['claude-sonnet-4-6']).toBeDefined();
      expect(MODEL_PRICING['claude-haiku-4-5']).toBeDefined();
    });

    test('has input and output pricing for each model', () => {
      Object.values(MODEL_PRICING).forEach((pricing) => {
        expect(pricing.input).toBeGreaterThan(0);
        expect(pricing.output).toBeGreaterThan(0);
        expect(pricing.output).toBeGreaterThanOrEqual(pricing.input);
      });
    });
  });

  describe('computeCosts', () => {
    test('computes cost for a single model', () => {
      const dashboard = computeCosts([usage('claude-sonnet-4-6', 10, 1_000_000, 500_000)]);
      const [row] = dashboard.entries;
      expect(dashboard.entries.length).toBe(1);
      expect(row.model).toBe('claude-sonnet-4-6');
      expect(row.calls).toBe(10);
      // 1M input at $3/M plus 0.5M output at $15/M: $3 + $7.5 = $10.5
      expect(dashboard.total).toBeCloseTo(10.5, 2);
    });

    test('aggregates multiple entries for same model', () => {
      const dashboard = computeCosts([
        usage('claude-haiku-4-5', 5, 100_000, 50_000),
        usage('claude-haiku-4-5', 3, 200_000, 100_000),
      ]);
      const [row] = dashboard.entries;
      expect(dashboard.entries.length).toBe(1);
      expect(row.calls).toBe(8);
      expect(row.input_tokens).toBe(300_000);
      expect(row.output_tokens).toBe(150_000);
    });

    test('handles multiple models', () => {
      const dashboard = computeCosts([
        usage('claude-haiku-4-5', 5, 100_000, 50_000),
        usage('claude-opus-4-6', 1, 100_000, 50_000),
      ]);
      expect(dashboard.entries.length).toBe(2);
      // Rows come back most expensive first, so opus leads.
      expect(dashboard.entries[0].model).toBe('claude-opus-4-6');
    });

    test('uses fallback pricing for unknown models', () => {
      const dashboard = computeCosts([usage('unknown-model-xyz', 1, 1_000_000, 1_000_000)]);
      expect(dashboard.entries.length).toBe(1);
      // Fallback equals sonnet pricing: $3 + $15 = $18
      expect(dashboard.total).toBeCloseTo(18, 2);
    });

    test('computes what-if at fast and full tiers', () => {
      const dashboard = computeCosts([usage('claude-sonnet-4-6', 1, 1_000_000, 1_000_000)]);
      expect(dashboard.at_fast_tier).toBeLessThan(dashboard.total);
      expect(dashboard.at_full_tier).toBeGreaterThan(dashboard.total);
    });

    test('handles empty input', () => {
      const dashboard = computeCosts([]);
      expect(dashboard.entries.length).toBe(0);
      expect(dashboard.total).toBe(0);
    });
  });

  describe('formatCostDashboard', () => {
    test('produces readable output', () => {
      const output = formatCostDashboard(
        computeCosts([usage('claude-sonnet-4-6', 10, 500_000, 250_000)]),
      );
      const expectedFragments = [
        'Cost Breakdown',
        'claude-sonnet-4-6',
        '10',
        'Total:',
        'fast tier',
        'full tier',
      ];
      for (const fragment of expectedFragments) {
        expect(output).toContain(fragment);
      }
    });
  });

  describe('aggregateCosts', () => {
    test('merges costs from multiple results', () => {
      const dashboard = aggregateCosts([
        run('abc', 1, 10, [usage('claude-haiku-4-5', 5, 100_000, 50_000)]),
        run('def', 2, 20, [usage('claude-haiku-4-5', 3, 200_000, 100_000)]),
      ]);
      expect(dashboard.entries.length).toBe(1);
      expect(dashboard.entries[0].calls).toBe(8);
    });

    test('handles results without costs field', () => {
      const dashboard = aggregateCosts([run('abc', 1, 10)]);
      expect(dashboard.entries.length).toBe(0);
      expect(dashboard.total).toBe(0);
    });
  });
});

// diff --git a/test/lib-eval-format.test.ts b/test/lib-eval-format.test.ts
// new file mode 100644
// index 00000000..75f9e2d3
// --- /dev/null
// +++ b/test/lib-eval-format.test.ts
// @@ -0,0 +1,159 @@
/**
 * Tests for lib/eval-format.ts — standard eval result validation and normalization.
 */
/* Test harness, the module under test, and the result shapes used by the fixtures. */
import { describe, test, expect } from 'bun:test';
import {
  validateEvalResult,
  normalizeFromLegacy,
  normalizeToLegacy,
} from '../lib/eval-format';
import type { StandardEvalResult } from '../lib/eval-format';
import type { EvalResult } from '../test/helpers/eval-store';

/** Fresh, fully valid standard result: two tests, one passing and one failing. */
function makeValidStandard(): StandardEvalResult {
  const shared = { suite: 'core', tier: 'e2e', duration_ms: 60000, cost_usd: 0.75 };
  return {
    schema_version: 1,
    version: '0.3.3',
    git_branch: 'main',
    git_sha: 'abc1234',
    timestamp: '2025-05-01T12:00:00Z',
    hostname: 'test-host',
    tier: 'e2e',
    total: 2,
    passed: 1,
    failed: 1,
    total_cost_usd: 1.50,
    duration_seconds: 120,
    all_results: [
      { name: 'test-a', ...shared, passed: true },
      { name: 'test-b', ...shared, passed: false },
    ],
  };
}

/** Fresh legacy result mirroring makeValidStandard, with one optional field per test. */
function makeLegacy(): EvalResult {
  return {
    schema_version: 1,
    version: '0.3.3',
    branch: 'main',
    git_sha: 'abc1234',
    timestamp: '2025-05-01T12:00:00Z',
    hostname: 'test-host',
    tier: 'e2e',
    total_tests: 2,
    passed: 1,
    failed: 1,
    total_cost_usd: 1.50,
    total_duration_ms: 120000,
    tests: [
      {
        name: 'test-a',
        suite: 'core',
        tier: 'e2e',
        passed: true,
        duration_ms: 60000,
        cost_usd: 0.75,
        turns_used: 5,
      },
      {
        name: 'test-b',
        suite: 'core',
        tier: 'e2e',
        passed: false,
        duration_ms: 60000,
        cost_usd: 0.75,
        detection_rate: 3,
      },
    ],
  };
}

/** True when some reported error contains every one of `needles`. */
function mentions(result: { errors: string[] }, ...needles: string[]): boolean {
  return result.errors.some((message) => needles.every((needle) => message.includes(needle)));
}

describe('lib/eval-format', () => {
  describe('validateEvalResult', () => {
    test('accepts valid standard result', () => {
      const { valid, errors } = validateEvalResult(makeValidStandard());
      expect(valid).toBe(true);
      expect(errors).toEqual([]);
    });

    test('rejects null', () => {
      const outcome = validateEvalResult(null);
      expect(outcome.valid).toBe(false);
      expect(outcome.errors[0]).toContain('non-null object');
    });

    test('rejects non-object', () => {
      expect(validateEvalResult('not an object').valid).toBe(false);
    });

    test('reports missing required fields', () => {
      const outcome = validateEvalResult({});
      expect(outcome.valid).toBe(false);
      expect(outcome.errors.length).toBeGreaterThan(5);
      expect(mentions(outcome, 'schema_version')).toBe(true);
      expect(mentions(outcome, 'git_branch')).toBe(true);
    });

    test('reports wrong types', () => {
      const outcome = validateEvalResult({
        ...makeValidStandard(),
        schema_version: 'not a number',
      });
      expect(outcome.valid).toBe(false);
      expect(mentions(outcome, 'schema_version', 'number')).toBe(true);
    });

    test('rejects non-array all_results', () => {
      const outcome = validateEvalResult({
        ...makeValidStandard(),
        all_results: 'not an array',
      });
      expect(outcome.valid).toBe(false);
      expect(mentions(outcome, 'all_results', 'array')).toBe(true);
    });

    test('validates test entry names', () => {
      const outcome = validateEvalResult({
        ...makeValidStandard(),
        all_results: [{ passed: true }],
      });
      expect(outcome.valid).toBe(false);
      expect(mentions(outcome, 'name')).toBe(true);
    });

    test('validates test entry passed field', () => {
      const outcome = validateEvalResult({
        ...makeValidStandard(),
        all_results: [{ name: 'test', passed: 'yes' }],
      });
      expect(outcome.valid).toBe(false);
      expect(mentions(outcome, 'passed', 'boolean')).toBe(true);
    });
  });

  describe('normalizeFromLegacy', () => {
    test('maps all fields correctly', () => {
      const converted = normalizeFromLegacy(makeLegacy());
      const [first, second] = converted.all_results;
      expect(converted.git_branch).toBe('main');
      expect(converted.total).toBe(2);
      expect(converted.duration_seconds).toBe(120);
      expect(converted.all_results.length).toBe(2);
      expect(first.turns_used).toBe(5);
      expect(second.detection_rate).toBe(3);
    });

    test('preserves optional fields when present', () => {
      const partialRun = makeLegacy();
      partialRun._partial = true;
      expect(normalizeFromLegacy(partialRun)._partial).toBe(true);
    });

    test('omits optional fields when absent', () => {
      const [first, second] = normalizeFromLegacy(makeLegacy()).all_results;
      expect(first.detection_rate).toBeUndefined();
      expect(second.turns_used).toBeUndefined();
    });
  });

  describe('normalizeToLegacy', () => {
    test('maps all fields correctly', () => {
      const converted = normalizeToLegacy(makeValidStandard());
      expect(converted.branch).toBe('main');
      expect(converted.total_tests).toBe(2);
      expect(converted.total_duration_ms).toBe(120000);
      expect(converted.tests.length).toBe(2);
    });

    test('round-trip preserves data', () => {
      const before = makeLegacy();
      const after = normalizeToLegacy(normalizeFromLegacy(before));
      expect(after.branch).toBe(before.branch);
      expect(after.total_tests).toBe(before.total_tests);
      expect(after.passed).toBe(before.passed);
      expect(after.failed).toBe(before.failed);
      expect(after.total_cost_usd).toBe(before.total_cost_usd);
      expect(after.tests.length).toBe(before.tests.length);
      expect(after.tests[0].name).toBe(before.tests[0].name);
      expect(after.tests[0].turns_used).toBe(before.tests[0].turns_used);
    });
  });
});

// diff --git a/test/lib-eval-tier.test.ts b/test/lib-eval-tier.test.ts
// new file mode 100644
// index 00000000..7a50e2b5
// --- /dev/null
// +++ b/test/lib-eval-tier.test.ts
// @@ -0,0 +1,94 @@
/**
 * Tests for lib/eval-tier.ts — model tier selection.
 */
/* Test harness and the module under test. */
import { describe, test, expect, beforeEach, afterEach } from 'bun:test';
import { resolveTier, resolveJudgeTier, tierToModel, TIER_ALIASES } from '../lib/eval-tier';

describe('lib/eval-tier', () => {
  // Values captured once so every test leaves the environment as it found it.
  const origEvalTier = process.env.EVAL_TIER;
  const origJudgeTier = process.env.EVAL_JUDGE_TIER;

  /** Put an env var back to a captured value, deleting it if it was originally unset. */
  const restore = (name: 'EVAL_TIER' | 'EVAL_JUDGE_TIER', original: string | undefined): void => {
    if (original === undefined) {
      delete process.env[name];
    } else {
      process.env[name] = original;
    }
  };

  /** Set EVAL_TIER and return what resolveTier() makes of it. */
  const tierFor = (value: string) => {
    process.env.EVAL_TIER = value;
    return resolveTier();
  };

  afterEach(() => {
    restore('EVAL_TIER', origEvalTier);
    restore('EVAL_JUDGE_TIER', origJudgeTier);
  });

  describe('resolveTier', () => {
    test('defaults to standard when unset', () => {
      delete process.env.EVAL_TIER;
      expect(resolveTier()).toBe('standard');
    });

    test('resolves tier names directly', () => {
      expect(tierFor('fast')).toBe('fast');
      expect(tierFor('full')).toBe('full');
    });

    test('resolves model aliases', () => {
      expect(tierFor('haiku')).toBe('fast');
      expect(tierFor('sonnet')).toBe('standard');
      expect(tierFor('opus')).toBe('full');
    });

    test('is case-insensitive', () => {
      expect(tierFor('HAIKU')).toBe('fast');
      expect(tierFor('Full')).toBe('full');
    });

    test('defaults to standard for unknown value', () => {
      expect(tierFor('gpt-4')).toBe('standard');
    });
  });

  describe('resolveJudgeTier', () => {
    test('falls back to EVAL_TIER when EVAL_JUDGE_TIER unset', () => {
      delete process.env.EVAL_JUDGE_TIER;
      process.env.EVAL_TIER = 'fast';
      expect(resolveJudgeTier()).toBe('fast');
    });

    test('uses EVAL_JUDGE_TIER when set', () => {
      process.env.EVAL_TIER = 'fast';
      process.env.EVAL_JUDGE_TIER = 'full';
      expect(resolveJudgeTier()).toBe('full');
    });

    test('resolves aliases for judge tier', () => {
      process.env.EVAL_JUDGE_TIER = 'opus';
      expect(resolveJudgeTier()).toBe('full');
    });
  });

  describe('tierToModel', () => {
    test('maps fast to haiku', () => {
      expect(tierToModel('fast')).toBe('claude-haiku-4-5');
    });

    test('maps standard to sonnet', () => {
      expect(tierToModel('standard')).toBe('claude-sonnet-4-6');
    });

    test('maps full to opus', () => {
      expect(tierToModel('full')).toBe('claude-opus-4-6');
    });
  });

  describe('TIER_ALIASES', () => {
    test('contains expected aliases', () => {
      const { haiku, sonnet, opus } = TIER_ALIASES;
      expect(haiku).toBe('fast');
      expect(sonnet).toBe('standard');
      expect(opus).toBe('full');
    });
  });
});