feat: add eval format validation, tier selection, cost tracking

- lib/eval-format.ts: StandardEvalResult interfaces, validateEvalResult(), normalizeFromLegacy/normalizeToLegacy round-trip converters
- lib/eval-tier.ts: EvalTier type, resolveTier/resolveJudgeTier from env, tierToModel mapping, TIER_ALIASES (haiku→fast, sonnet→standard, opus→full)
- lib/eval-cost.ts: MODEL_PRICING (last verified 2025-05-01), computeCosts(), formatCostDashboard(), aggregateCosts(), fallback for unknown models
- 42 tests across 3 test files

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-05-07 14:06:42 +02:00 · 2026-03-15 09:39:18 -05:00
parent 7f7035f55a
commit 9bc6c9416f
6 changed files with 846 additions and 0 deletions
@@ -0,0 +1,155 @@
+/**
+ * Tests for lib/eval-cost.ts — per-model cost tracking.
+ */
+
+import { describe, test, expect } from 'bun:test';
+import {
+  MODEL_PRICING,
+  computeCosts,
+  formatCostDashboard,
+  aggregateCosts,
+} from '../lib/eval-cost';
+import type { CostEntry, StandardEvalResult } from '../lib/eval-format';
+
+describe('lib/eval-cost', () => {
+  // Sanity checks on the MODEL_PRICING table itself.
+  describe('MODEL_PRICING', () => {
+    test('includes current Claude models', () => {
+      expect(MODEL_PRICING['claude-opus-4-6']).toBeDefined();
+      expect(MODEL_PRICING['claude-sonnet-4-6']).toBeDefined();
+      expect(MODEL_PRICING['claude-haiku-4-5']).toBeDefined();
+    });
+
+    test('has input and output pricing for each model', () => {
+      // Only the pricing values are inspected, so iterate over values instead of
+      // destructuring a model key that is never read.
+      for (const pricing of Object.values(MODEL_PRICING)) {
+        expect(pricing.input).toBeGreaterThan(0);
+        expect(pricing.output).toBeGreaterThan(0);
+        // The output rate must be at least the input rate for every entry.
+        expect(pricing.output).toBeGreaterThanOrEqual(pricing.input);
+      }
+    });
+  });
+
+  // computeCosts(): builds a dashboard (entries, total, what-if tier totals) from CostEntry rows.
+  describe('computeCosts', () => {
+    test('computes cost for a single model', () => {
+      const usage: CostEntry[] = [
+        { model: 'claude-sonnet-4-6', calls: 10, input_tokens: 1_000_000, output_tokens: 500_000 },
+      ];
+      const { entries, total } = computeCosts(usage);
+      expect(entries.length).toBe(1);
+      expect(entries[0].model).toBe('claude-sonnet-4-6');
+      expect(entries[0].calls).toBe(10);
+      // 1M input at $3/M plus 0.5M output at $15/M: $3 + $7.5 = $10.5
+      expect(total).toBeCloseTo(10.5, 2);
+    });
+
+    test('aggregates multiple entries for same model', () => {
+      const model = 'claude-haiku-4-5';
+      const usage: CostEntry[] = [
+        { model, calls: 5, input_tokens: 100_000, output_tokens: 50_000 },
+        { model, calls: 3, input_tokens: 200_000, output_tokens: 100_000 },
+      ];
+      const { entries } = computeCosts(usage);
+      // Both rows collapse into one entry whose counters are the sums.
+      expect(entries.length).toBe(1);
+      expect(entries[0].calls).toBe(8);
+      expect(entries[0].input_tokens).toBe(300_000);
+      expect(entries[0].output_tokens).toBe(150_000);
+    });
+
+    test('handles multiple models', () => {
+      const usage: CostEntry[] = [
+        { model: 'claude-haiku-4-5', calls: 5, input_tokens: 100_000, output_tokens: 50_000 },
+        { model: 'claude-opus-4-6', calls: 1, input_tokens: 100_000, output_tokens: 50_000 },
+      ];
+      const { entries } = computeCosts(usage);
+      expect(entries.length).toBe(2);
+      // Entries come back ordered by cost, highest first — opus is the pricier one.
+      expect(entries[0].model).toBe('claude-opus-4-6');
+    });
+
+    test('uses fallback pricing for unknown models', () => {
+      const usage: CostEntry[] = [
+        { model: 'unknown-model-xyz', calls: 1, input_tokens: 1_000_000, output_tokens: 1_000_000 },
+      ];
+      const { entries, total } = computeCosts(usage);
+      expect(entries.length).toBe(1);
+      // The fallback uses sonnet pricing: $3 + $15 = $18
+      expect(total).toBeCloseTo(18, 2);
+    });
+
+    test('computes what-if at fast and full tiers', () => {
+      const usage: CostEntry[] = [
+        { model: 'claude-sonnet-4-6', calls: 1, input_tokens: 1_000_000, output_tokens: 1_000_000 },
+      ];
+      const summary = computeCosts(usage);
+      // For sonnet usage the fast-tier estimate is below the total, the full-tier one above.
+      expect(summary.at_fast_tier).toBeLessThan(summary.total);
+      expect(summary.at_full_tier).toBeGreaterThan(summary.total);
+    });
+
+    test('handles empty input', () => {
+      const { entries, total } = computeCosts([]);
+      expect(entries.length).toBe(0);
+      expect(total).toBe(0);
+    });
+  });
+
+  // formatCostDashboard(): renders a computed dashboard as text.
+  describe('formatCostDashboard', () => {
+    test('produces readable output', () => {
+      const usage: CostEntry[] = [
+        { model: 'claude-sonnet-4-6', calls: 10, input_tokens: 500_000, output_tokens: 250_000 },
+      ];
+      const rendered = formatCostDashboard(computeCosts(usage));
+      // Each of these fragments must appear somewhere in the rendered text.
+      const expectedFragments = [
+        'Cost Breakdown',
+        'claude-sonnet-4-6',
+        '10',
+        'Total:',
+        'fast tier',
+        'full tier',
+      ];
+      for (const fragment of expectedFragments) {
+        expect(rendered).toContain(fragment);
+      }
+    });
+  });
+
+  // aggregateCosts(): combines the optional `costs` arrays of several eval results.
+  describe('aggregateCosts', () => {
+    // Returns a fresh eval-result fixture; `overrides` replaces or appends fields.
+    const makeRun = (overrides: Partial<StandardEvalResult> = {}): StandardEvalResult => ({
+      schema_version: 1, version: '1.0', git_branch: 'main', git_sha: 'abc',
+      timestamp: '', hostname: '', tier: 'e2e', total: 1, passed: 1, failed: 0,
+      total_cost_usd: 1, duration_seconds: 10, all_results: [],
+      ...overrides,
+    });
+
+    test('merges costs from multiple results', () => {
+      const runs: StandardEvalResult[] = [
+        makeRun({
+          costs: [{ model: 'claude-haiku-4-5', calls: 5, input_tokens: 100_000, output_tokens: 50_000 }],
+        }),
+        makeRun({
+          git_sha: 'def',
+          total_cost_usd: 2,
+          duration_seconds: 20,
+          costs: [{ model: 'claude-haiku-4-5', calls: 3, input_tokens: 200_000, output_tokens: 100_000 }],
+        }),
+      ];
+      const { entries } = aggregateCosts(runs);
+      // Same model in both runs: one entry, call counts summed (5 + 3).
+      expect(entries.length).toBe(1);
+      expect(entries[0].calls).toBe(8);
+    });
+
+    test('handles results without costs field', () => {
+      const runs: StandardEvalResult[] = [makeRun()];
+      const { entries, total } = aggregateCosts(runs);
+      expect(entries.length).toBe(0);
+      expect(total).toBe(0);
+    });
+  });
+});
@@ -0,0 +1,159 @@
+/**
+ * Tests for lib/eval-format.ts — standard eval result validation and normalization.
+ */
+
+import { describe, test, expect } from 'bun:test';
+import {
+  validateEvalResult,
+  normalizeFromLegacy,
+  normalizeToLegacy,
+} from '../lib/eval-format';
+import type { StandardEvalResult } from '../lib/eval-format';
+import type { EvalResult } from '../test/helpers/eval-store';
+
+/** Builds a StandardEvalResult fixture with two 'core' tests: one passing, one failing. */
+function makeValidStandard(): StandardEvalResult {
+  const fixture: StandardEvalResult = {
+    schema_version: 1,
+    version: '0.3.3',
+    git_branch: 'main',
+    git_sha: 'abc1234',
+    timestamp: '2025-05-01T12:00:00Z',
+    hostname: 'test-host',
+    tier: 'e2e',
+    total: 2,
+    passed: 1,
+    failed: 1,
+    total_cost_usd: 1.5,
+    duration_seconds: 120,
+    all_results: [
+      {
+        name: 'test-a',
+        suite: 'core',
+        tier: 'e2e',
+        passed: true,
+        duration_ms: 60_000,
+        cost_usd: 0.75,
+      },
+      {
+        name: 'test-b',
+        suite: 'core',
+        tier: 'e2e',
+        passed: false,
+        duration_ms: 60_000,
+        cost_usd: 0.75,
+      },
+    ],
+  };
+  return fixture;
+}
+
+/**
+ * Builds a legacy EvalResult fixture with two tests: the first carries
+ * `turns_used` only, the second carries `detection_rate` only.
+ */
+function makeLegacy(): EvalResult {
+  const fixture: EvalResult = {
+    schema_version: 1,
+    version: '0.3.3',
+    branch: 'main',
+    git_sha: 'abc1234',
+    timestamp: '2025-05-01T12:00:00Z',
+    hostname: 'test-host',
+    tier: 'e2e',
+    total_tests: 2,
+    passed: 1,
+    failed: 1,
+    total_cost_usd: 1.5,
+    total_duration_ms: 120_000,
+    tests: [
+      {
+        name: 'test-a',
+        suite: 'core',
+        tier: 'e2e',
+        passed: true,
+        duration_ms: 60_000,
+        cost_usd: 0.75,
+        turns_used: 5,
+      },
+      {
+        name: 'test-b',
+        suite: 'core',
+        tier: 'e2e',
+        passed: false,
+        duration_ms: 60_000,
+        cost_usd: 0.75,
+        detection_rate: 3,
+      },
+    ],
+  };
+  return fixture;
+}
+
+describe('lib/eval-format', () => {
+  // validateEvalResult(): returns { valid, errors } for an arbitrary input value.
+  describe('validateEvalResult', () => {
+    // True when at least one error message contains every given fragment.
+    const mentions = (errors: readonly string[], ...fragments: string[]): boolean =>
+      errors.some(message => fragments.every(fragment => message.includes(fragment)));
+
+    test('accepts valid standard result', () => {
+      const { valid, errors } = validateEvalResult(makeValidStandard());
+      expect(valid).toBe(true);
+      expect(errors).toEqual([]);
+    });
+
+    test('rejects null', () => {
+      const { valid, errors } = validateEvalResult(null);
+      expect(valid).toBe(false);
+      expect(errors[0]).toContain('non-null object');
+    });
+
+    test('rejects non-object', () => {
+      const { valid } = validateEvalResult('not an object');
+      expect(valid).toBe(false);
+    });
+
+    test('reports missing required fields', () => {
+      const { valid, errors } = validateEvalResult({});
+      expect(valid).toBe(false);
+      expect(errors.length).toBeGreaterThan(5);
+      expect(mentions(errors, 'schema_version')).toBe(true);
+      expect(mentions(errors, 'git_branch')).toBe(true);
+    });
+
+    test('reports wrong types', () => {
+      const malformed = { ...makeValidStandard(), schema_version: 'not a number' };
+      const { valid, errors } = validateEvalResult(malformed);
+      expect(valid).toBe(false);
+      expect(mentions(errors, 'schema_version', 'number')).toBe(true);
+    });
+
+    test('rejects non-array all_results', () => {
+      const malformed = { ...makeValidStandard(), all_results: 'not an array' };
+      const { valid, errors } = validateEvalResult(malformed);
+      expect(valid).toBe(false);
+      expect(mentions(errors, 'all_results', 'array')).toBe(true);
+    });
+
+    test('validates test entry names', () => {
+      // The single entry has no `name` at all.
+      const malformed = { ...makeValidStandard(), all_results: [{ passed: true }] };
+      const { valid, errors } = validateEvalResult(malformed);
+      expect(valid).toBe(false);
+      expect(mentions(errors, 'name')).toBe(true);
+    });
+
+    test('validates test entry passed field', () => {
+      // `passed` is a string here rather than a boolean.
+      const malformed = { ...makeValidStandard(), all_results: [{ name: 'test', passed: 'yes' }] };
+      const { valid, errors } = validateEvalResult(malformed);
+      expect(valid).toBe(false);
+      expect(mentions(errors, 'passed', 'boolean')).toBe(true);
+    });
+  });
+
+  // normalizeFromLegacy(): converts a legacy EvalResult into the standard shape.
+  describe('normalizeFromLegacy', () => {
+    test('maps all fields correctly', () => {
+      const converted = normalizeFromLegacy(makeLegacy());
+      const entries = converted.all_results;
+      expect(converted.git_branch).toBe('main');
+      expect(converted.total).toBe(2);
+      // The fixture's 120000 ms total is expected back as 120 seconds.
+      expect(converted.duration_seconds).toBe(120);
+      expect(entries.length).toBe(2);
+      expect(entries[0].turns_used).toBe(5);
+      expect(entries[1].detection_rate).toBe(3);
+    });
+
+    test('preserves optional fields when present', () => {
+      const source = makeLegacy();
+      source._partial = true;
+      const converted = normalizeFromLegacy(source);
+      expect(converted._partial).toBe(true);
+    });
+
+    test('omits optional fields when absent', () => {
+      const entries = normalizeFromLegacy(makeLegacy()).all_results;
+      // Each fixture test sets only one of the two optional metrics.
+      expect(entries[0].detection_rate).toBeUndefined();
+      expect(entries[1].turns_used).toBeUndefined();
+    });
+  });
+
+  // normalizeToLegacy(): converts a standard result back into the legacy shape.
+  describe('normalizeToLegacy', () => {
+    test('maps all fields correctly', () => {
+      const converted = normalizeToLegacy(makeValidStandard());
+      expect(converted.branch).toBe('main');
+      expect(converted.total_tests).toBe(2);
+      // The fixture's 120 seconds is expected back as 120000 ms.
+      expect(converted.total_duration_ms).toBe(120000);
+      expect(converted.tests.length).toBe(2);
+    });
+
+    test('round-trip preserves data', () => {
+      const before = makeLegacy();
+      const after = normalizeToLegacy(normalizeFromLegacy(before));
+      // Top-level scalars that must survive legacy -> standard -> legacy.
+      const scalarFields = ['branch', 'total_tests', 'passed', 'failed', 'total_cost_usd'] as const;
+      for (const field of scalarFields) {
+        expect(after[field]).toBe(before[field]);
+      }
+      expect(after.tests.length).toBe(before.tests.length);
+      expect(after.tests[0].name).toBe(before.tests[0].name);
+      expect(after.tests[0].turns_used).toBe(before.tests[0].turns_used);
+    });
+  });
+});
@@ -0,0 +1,94 @@
+/**
+ * Tests for lib/eval-tier.ts — model tier selection.
+ */
+
+import { describe, test, expect, beforeEach, afterEach } from 'bun:test';
+import { resolveTier, resolveJudgeTier, tierToModel, TIER_ALIASES } from '../lib/eval-tier';
+
+describe('lib/eval-tier', () => {
+  // Snapshot of the tier env vars taken when the suite is defined; restored after every test.
+  const envSnapshot = {
+    EVAL_TIER: process.env.EVAL_TIER,
+    EVAL_JUDGE_TIER: process.env.EVAL_JUDGE_TIER,
+  };
+
+  afterEach(() => {
+    for (const [name, saved] of Object.entries(envSnapshot)) {
+      // A variable that was originally unset is removed again rather than set to a string.
+      if (saved !== undefined) process.env[name] = saved;
+      else delete process.env[name];
+    }
+  });
+
+  // resolveTier(): reads EVAL_TIER and maps it to a tier name.
+  describe('resolveTier', () => {
+    // Sets EVAL_TIER to `raw`, then checks what resolveTier() returns.
+    const expectTier = (raw: string, expected: ReturnType<typeof resolveTier>): void => {
+      process.env.EVAL_TIER = raw;
+      expect(resolveTier()).toBe(expected);
+    };
+
+    test('defaults to standard when unset', () => {
+      delete process.env.EVAL_TIER;
+      const tier = resolveTier();
+      expect(tier).toBe('standard');
+    });
+
+    test('resolves tier names directly', () => {
+      expectTier('fast', 'fast');
+      expectTier('full', 'full');
+    });
+
+    test('resolves model aliases', () => {
+      expectTier('haiku', 'fast');
+      expectTier('sonnet', 'standard');
+      expectTier('opus', 'full');
+    });
+
+    test('is case-insensitive', () => {
+      expectTier('HAIKU', 'fast');
+      expectTier('Full', 'full');
+    });
+
+    test('defaults to standard for unknown value', () => {
+      expectTier('gpt-4', 'standard');
+    });
+  });
+
+  // resolveJudgeTier(): reads EVAL_JUDGE_TIER, with EVAL_TIER as the fallback.
+  describe('resolveJudgeTier', () => {
+    test('falls back to EVAL_TIER when EVAL_JUDGE_TIER unset', () => {
+      process.env.EVAL_TIER = 'fast';
+      delete process.env.EVAL_JUDGE_TIER;
+      const judgeTier = resolveJudgeTier();
+      expect(judgeTier).toBe('fast');
+    });
+
+    test('uses EVAL_JUDGE_TIER when set', () => {
+      // The judge-specific variable takes precedence over EVAL_TIER.
+      process.env.EVAL_JUDGE_TIER = 'full';
+      process.env.EVAL_TIER = 'fast';
+      const judgeTier = resolveJudgeTier();
+      expect(judgeTier).toBe('full');
+    });
+
+    test('resolves aliases for judge tier', () => {
+      process.env.EVAL_JUDGE_TIER = 'opus';
+      const judgeTier = resolveJudgeTier();
+      expect(judgeTier).toBe('full');
+    });
+  });
+
+  // tierToModel(): maps each tier name to a concrete model id.
+  describe('tierToModel', () => {
+    // [tier, label used in the test name, expected model id]
+    const cases: Array<[Parameters<typeof tierToModel>[0], string, ReturnType<typeof tierToModel>]> = [
+      ['fast', 'haiku', 'claude-haiku-4-5'],
+      ['standard', 'sonnet', 'claude-sonnet-4-6'],
+      ['full', 'opus', 'claude-opus-4-6'],
+    ];
+
+    for (const [tier, label, modelId] of cases) {
+      test(`maps ${tier} to ${label}`, () => {
+        expect(tierToModel(tier)).toBe(modelId);
+      });
+    }
+  });
+
+  // TIER_ALIASES: model-family nicknames mapped to tier names.
+  describe('TIER_ALIASES', () => {
+    test('contains expected aliases', () => {
+      const { haiku, sonnet, opus } = TIER_ALIASES;
+      expect(haiku).toBe('fast');
+      expect(sonnet).toBe('standard');
+      expect(opus).toBe('full');
+    });
+  });
+});