Files
gstack/test/benchmark-runner.test.ts
Garry Tan 614354fc41 feat: multi-provider model benchmark (boil the ocean)
Adds the full spec Codex asked for: real provider adapters with auth
detection, normalized RunResult, pricing tables, tool compatibility
maps, parallel execution with error isolation, and table/JSON/markdown
output. Judge stays on Anthropic SDK as the single stable source of
quality scoring, gated behind --judge.

Codex flagged the original plan as massively under-scoped — the
existing runner is Claude-only and the judge is Anthropic-only. You
can't benchmark GPT or Gemini without real provider infrastructure.
This commit ships it.

New architecture:

  test/helpers/providers/types.ts       ProviderAdapter interface
  test/helpers/providers/claude.ts      wraps `claude -p --output-format json`
  test/helpers/providers/gpt.ts         wraps `codex exec --json`
  test/helpers/providers/gemini.ts      wraps `gemini -p --output-format stream-json --yolo`
  test/helpers/pricing.ts               per-model USD cost tables (quarterly)
  test/helpers/tool-map.ts              which tools each CLI exposes
  test/helpers/benchmark-runner.ts      orchestrator (Promise.allSettled)
  test/helpers/benchmark-judge.ts       Anthropic SDK quality scorer
  bin/gstack-model-benchmark            CLI entry
  test/benchmark-runner.test.ts         9 unit tests (cost math, formatters, tool-map)
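
Every adapter implements one small surface. A rough sketch of the
shape (field names are mirrored from the unit tests, so the exact
types.ts declarations may differ, and the run() options object is
illustrative):

  interface RunResult {
    output: string;
    tokens: { input: number; output: number; cached?: number };
    durationMs: number;
    toolCalls: number;
    modelUsed: string;
    error?: { code: 'auth' | 'timeout' | 'rate_limit'; reason: string };
  }

  interface ProviderAdapter {
    family: 'claude' | 'gpt' | 'gemini';
    available(): Promise<boolean>;  // binary on PATH + auth detected
    run(
      prompt: string,
      opts: { workdir: string; timeoutMs: number },
    ): Promise<RunResult>;
  }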

Per-provider error isolation:
  - auth → record reason, don't abort batch
  - timeout → record reason, don't abort batch
  - rate_limit → record reason, don't abort batch
  - binary_missing → record in available() check, skip if --skip-unavailable
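
In rough terms, the orchestrator maps each settled promise to an
entry so one failure never aborts the batch (a sketch; classify() is
an illustrative name for the helper that turns a rejection into a
RunResult carrying an error field):

  const settled = await Promise.allSettled(
    adapters.map((a) => a.run(prompt, opts)),
  );
  const entries = settled.map((s, i) => ({
    provider: adapters[i].family,
    family: adapters[i].family,
    available: true,
    // fulfilled → real result; rejected → error result, batch continues
    result: s.status === 'fulfilled' ? s.value : classify(s.reason),
  }));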

Pricing correction: cached input tokens are disjoint from uncached
input tokens (Anthropic/OpenAI report them separately). The original
math subtracted them, producing negative costs. Cached reads are now
billed at 10% of the input rate and added on top of the full uncached
input cost.
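
The corrected math, roughly (assuming PRICING stores per-MTok USD
input/output rates; the shipped estimateCostUsd may read the table
slightly differently):

  function estimateCostSketch(
    t: { input: number; output: number; cached?: number },
    model: string,
  ): number {
    const rate = PRICING[model];  // e.g. { input: 15, output: 75 } for claude-opus-4-7
    if (!rate) return 0;          // unknown model → $0, never throw
    return (t.input / 1e6) * rate.input
      + (t.output / 1e6) * rate.output
      // cached reads are billed at 10% of the input rate and ADDED
      + ((t.cached ?? 0) / 1e6) * rate.input * 0.10;
  }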

CLI:
  gstack-model-benchmark --prompt "..." --models claude,gpt,gemini
  gstack-model-benchmark ./prompt.txt --output json --judge
  gstack-model-benchmark ./prompt.txt --models claude --timeout-ms 60000

Output formats: table (default), json, markdown. Each shows model,
latency, in→out tokens, cost, quality (when --judge used), tool calls,
and any errors.
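
For a sense of the default table (illustrative layout; values taken
from the unit-test fixture, exact columns may differ in the shipped
formatter):

  Model            Latency  Tokens    Cost     Quality  Tools  Status
  claude-opus-4-7  0.8s     100→200   $0.0165  9.2/10   3      ok
  gpt-5.4          0.2s     0→0       -        -        0      ERROR auth
  gemini           -        -         -        -        -      unavailable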

Known limitations for v1:
- Claude adapter approximates toolCalls as num_turns (stream-json
  would give exact counts; v2 can upgrade).
- Live E2E tests (test/providers.e2e.test.ts) not included — they
  require CI secrets for all three providers. Unit tests cover the
  shape and math.
- Provider CLIs sometimes return non-JSON error text to stdout; the
  parsers fall back to treating raw output as plain text in that case.
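
The stdout fallback in the parsers is roughly this shape (a sketch;
helper and field names are illustrative, not the adapters' exact
code):

  function parseCliStdout(stdout: string): { json?: unknown; text: string } {
    try {
      return { json: JSON.parse(stdout), text: stdout };  // normal path: CLI emitted JSON
    } catch {
      return { text: stdout.trim() };  // non-JSON error text → keep as plain text
    }
  }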

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-17 06:16:42 +08:00

138 lines · 4.6 KiB · TypeScript

/**
 * Unit tests for the benchmark runner helpers.
 *
 * Covers the pure pieces (no adapters spawned):
 * - Cost math: estimateCostUsd and the PRICING table
 * - Tool compatibility: missingTools and TOOL_COMPATIBILITY
 * - Output formatters (table, json, markdown), including reports with
 *   error and unavailable entries
 *
 * Does NOT exercise live CLIs — see test/providers.e2e.test.ts for those.
 */
import { test, expect } from 'bun:test';
import { formatTable, formatJson, formatMarkdown, type BenchmarkReport } from './helpers/benchmark-runner';
import { estimateCostUsd, PRICING } from './helpers/pricing';
import { missingTools, TOOL_COMPATIBILITY } from './helpers/tool-map';

test('estimateCostUsd returns 0 for unknown model (no crash)', () => {
  const cost = estimateCostUsd({ input: 1000, output: 500 }, 'unknown-model-7b');
  expect(cost).toBe(0);
});

test('estimateCostUsd computes correctly for known Claude model', () => {
  // claude-opus-4-7: $15/MTok input, $75/MTok output
  // 1M input + 0.5M output = $15 + $37.50 = $52.50
  const cost = estimateCostUsd({ input: 1_000_000, output: 500_000 }, 'claude-opus-4-7');
  expect(cost).toBeCloseTo(52.50, 2);
});

test('estimateCostUsd applies cached input discount alongside uncached input', () => {
  // tokens.input is uncached-only; tokens.cached is disjoint cache-reads at 10%.
  // 0 uncached input, 1M cached → 10% of $15 = $1.50
  const cost1 = estimateCostUsd({ input: 0, output: 0, cached: 1_000_000 }, 'claude-opus-4-7');
  expect(cost1).toBeCloseTo(1.50, 2);
  // 500K uncached input + 500K cached → $7.50 + $0.75 = $8.25
  const cost2 = estimateCostUsd({ input: 500_000, output: 0, cached: 500_000 }, 'claude-opus-4-7');
  expect(cost2).toBeCloseTo(8.25, 2);
});

test('PRICING table covers the key model families', () => {
  expect(PRICING['claude-opus-4-7']).toBeDefined();
  expect(PRICING['claude-sonnet-4-6']).toBeDefined();
  expect(PRICING['gpt-5.4']).toBeDefined();
  expect(PRICING['gemini-2.5-pro']).toBeDefined();
});

test('missingTools reports unsupported tools per provider', () => {
  // GPT/Codex doesn't expose Edit, Glob, Grep
  expect(missingTools('gpt', ['Edit', 'Glob', 'Grep'])).toEqual(['Edit', 'Glob', 'Grep']);
  // Claude supports all core tools
  expect(missingTools('claude', ['Edit', 'Glob', 'Grep', 'Bash', 'Read'])).toEqual([]);
  // Gemini has very limited agentic surface
  expect(missingTools('gemini', ['Bash', 'Edit'])).toEqual(['Bash', 'Edit']);
});

test('TOOL_COMPATIBILITY is populated for all three families', () => {
  expect(TOOL_COMPATIBILITY.claude).toBeDefined();
  expect(TOOL_COMPATIBILITY.gpt).toBeDefined();
  expect(TOOL_COMPATIBILITY.gemini).toBeDefined();
});

test('formatTable handles a report with mixed success/error/unavailable entries', () => {
  const report: BenchmarkReport = {
    prompt: 'test prompt',
    workdir: '/tmp',
    startedAt: '2026-04-16T20:00:00Z',
    durationMs: 1500,
    entries: [
      {
        provider: 'claude',
        family: 'claude',
        available: true,
        result: {
          output: 'ok',
          tokens: { input: 100, output: 200 },
          durationMs: 800,
          toolCalls: 3,
          modelUsed: 'claude-opus-4-7',
        },
        costUsd: 0.0165,
        qualityScore: 9.2,
      },
      {
        provider: 'gpt',
        family: 'gpt',
        available: true,
        result: {
          output: '',
          tokens: { input: 0, output: 0 },
          durationMs: 200,
          toolCalls: 0,
          modelUsed: 'gpt-5.4',
          error: { code: 'auth', reason: 'codex login required' },
        },
      },
      {
        provider: 'gemini',
        family: 'gemini',
        available: false,
        unavailable_reason: 'gemini CLI not on PATH',
      },
    ],
  };
  const table = formatTable(report);
  expect(table).toContain('claude-opus-4-7');
  expect(table).toContain('ERROR auth');
  expect(table).toContain('unavailable');
  expect(table).toContain('9.2/10');
});

test('formatJson produces parseable JSON', () => {
  const report: BenchmarkReport = {
    prompt: 'x',
    workdir: '/tmp',
    startedAt: '2026-04-16T20:00:00Z',
    durationMs: 100,
    entries: [],
  };
  const json = formatJson(report);
  const parsed = JSON.parse(json);
  expect(parsed.prompt).toBe('x');
  expect(parsed.entries).toEqual([]);
});

test('formatMarkdown produces a table header', () => {
  const report: BenchmarkReport = {
    prompt: 'x',
    workdir: '/tmp',
    startedAt: '2026-04-16T20:00:00Z',
    durationMs: 100,
    entries: [],
  };
  const md = formatMarkdown(report);
  expect(md).toContain('# Benchmark report');
  expect(md).toContain('| Model | Latency |');
});