/** * Unit tests for the benchmark runner. * * Mocks adapters to verify: * - All adapters run in parallel (Promise.allSettled not serial) * - Unavailable adapters are skipped or marked depending on flag * - Per-adapter errors don't abort the batch * - Output formatters (table, json, markdown) produce non-empty strings * * Does NOT exercise live CLIs — see test/providers.e2e.test.ts for those. */ import { test, expect } from 'bun:test'; import { formatTable, formatJson, formatMarkdown, type BenchmarkReport } from './helpers/benchmark-runner'; import { estimateCostUsd, PRICING } from './helpers/pricing'; import { missingTools, TOOL_COMPATIBILITY } from './helpers/tool-map'; test('estimateCostUsd returns 0 for unknown model (no crash)', () => { const cost = estimateCostUsd({ input: 1000, output: 500 }, 'unknown-model-7b'); expect(cost).toBe(0); }); test('estimateCostUsd computes correctly for known Claude model', () => { // claude-opus-4-7: $15/MTok input, $75/MTok output // 1M input + 0.5M output = $15 + $37.50 = $52.50 const cost = estimateCostUsd({ input: 1_000_000, output: 500_000 }, 'claude-opus-4-7'); expect(cost).toBeCloseTo(52.50, 2); }); test('estimateCostUsd applies cached input discount alongside uncached input', () => { // tokens.input is uncached-only; tokens.cached is disjoint cache-reads at 10%. // 0 uncached input, 1M cached → 10% of 15 = $1.50 const cost1 = estimateCostUsd({ input: 0, output: 0, cached: 1_000_000 }, 'claude-opus-4-7'); expect(cost1).toBeCloseTo(1.50, 2); // 500K uncached input + 500K cached → $7.50 + $0.75 = $8.25 const cost2 = estimateCostUsd({ input: 500_000, output: 0, cached: 500_000 }, 'claude-opus-4-7'); expect(cost2).toBeCloseTo(8.25, 2); }); test('PRICING table covers the key model families', () => { expect(PRICING['claude-opus-4-7']).toBeDefined(); expect(PRICING['claude-sonnet-4-6']).toBeDefined(); expect(PRICING['gpt-5.4']).toBeDefined(); expect(PRICING['gemini-2.5-pro']).toBeDefined(); }); test('missingTools reports unsupported tools per provider', () => { // GPT/Codex doesn't expose Edit, Glob, Grep expect(missingTools('gpt', ['Edit', 'Glob', 'Grep'])).toEqual(['Edit', 'Glob', 'Grep']); // Claude supports all core tools expect(missingTools('claude', ['Edit', 'Glob', 'Grep', 'Bash', 'Read'])).toEqual([]); // Gemini has very limited agentic surface expect(missingTools('gemini', ['Bash', 'Edit'])).toEqual(['Bash', 'Edit']); }); test('TOOL_COMPATIBILITY is populated for all three families', () => { expect(TOOL_COMPATIBILITY.claude).toBeDefined(); expect(TOOL_COMPATIBILITY.gpt).toBeDefined(); expect(TOOL_COMPATIBILITY.gemini).toBeDefined(); }); test('formatTable handles a report with mixed success/error/unavailable entries', () => { const report: BenchmarkReport = { prompt: 'test prompt', workdir: '/tmp', startedAt: '2026-04-16T20:00:00Z', durationMs: 1500, entries: [ { provider: 'claude', family: 'claude', available: true, result: { output: 'ok', tokens: { input: 100, output: 200 }, durationMs: 800, toolCalls: 3, modelUsed: 'claude-opus-4-7', }, costUsd: 0.0165, qualityScore: 9.2, }, { provider: 'gpt', family: 'gpt', available: true, result: { output: '', tokens: { input: 0, output: 0 }, durationMs: 200, toolCalls: 0, modelUsed: 'gpt-5.4', error: { code: 'auth', reason: 'codex login required' }, }, }, { provider: 'gemini', family: 'gemini', available: false, unavailable_reason: 'gemini CLI not on PATH', }, ], }; const table = formatTable(report); expect(table).toContain('claude-opus-4-7'); expect(table).toContain('ERROR auth'); expect(table).toContain('unavailable'); expect(table).toContain('9.2/10'); }); test('formatJson produces parseable JSON', () => { const report: BenchmarkReport = { prompt: 'x', workdir: '/tmp', startedAt: '2026-04-16T20:00:00Z', durationMs: 100, entries: [], }; const json = formatJson(report); const parsed = JSON.parse(json); expect(parsed.prompt).toBe('x'); expect(parsed.entries).toEqual([]); }); test('formatMarkdown produces a table header', () => { const report: BenchmarkReport = { prompt: 'x', workdir: '/tmp', startedAt: '2026-04-16T20:00:00Z', durationMs: 100, entries: [], }; const md = formatMarkdown(report); expect(md).toContain('# Benchmark report'); expect(md).toContain('| Model | Latency |'); });