feat: multi-provider model benchmark (boil the ocean)

Adds the full spec Codex asked for: real provider adapters with auth detection, normalized RunResult, pricing tables, tool compatibility maps, parallel execution with error isolation, and table/JSON/markdown output. Judge stays on Anthropic SDK as the single stable source of quality scoring, gated behind --judge. Codex flagged the original plan as massively under-scoped — the existing runner is Claude-only and the judge is Anthropic-only. You can't benchmark GPT or Gemini without real provider infrastructure. This commit ships it. New architecture: test/helpers/providers/types.ts ProviderAdapter interface test/helpers/providers/claude.ts wraps `claude -p --output-format json` test/helpers/providers/gpt.ts wraps `codex exec --json` test/helpers/providers/gemini.ts wraps `gemini -p --output-format stream-json --yolo` test/helpers/pricing.ts per-model USD cost tables (quarterly) test/helpers/tool-map.ts which tools each CLI exposes test/helpers/benchmark-runner.ts orchestrator (Promise.allSettled) test/helpers/benchmark-judge.ts Anthropic SDK quality scorer bin/gstack-model-benchmark CLI entry test/benchmark-runner.test.ts 9 unit tests (cost math, formatters, tool-map) Per-provider error isolation: - auth → record reason, don't abort batch - timeout → record reason, don't abort batch - rate_limit → record reason, don't abort batch - binary_missing → record in available() check, skip if --skip-unavailable Pricing correction: cached input tokens are disjoint from uncached input tokens (Anthropic/OpenAI report them separately). Original math subtracted them, producing negative costs. Now adds cached at the 10% discount alongside the full uncached input cost. CLI: gstack-model-benchmark --prompt "..." --models claude,gpt,gemini gstack-model-benchmark ./prompt.txt --output json --judge gstack-model-benchmark ./prompt.txt --models claude --timeout-ms 60000 Output formats: table (default), json, markdown. Each shows model, latency, in→out tokens, cost, quality (when --judge used), tool calls, and any errors. Known limitations for v1: - Claude adapter approximates toolCalls as num_turns (stream-json would give exact counts; v2 can upgrade). - Live E2E tests (test/providers.e2e.test.ts) not included — they require CI secrets for all three providers. Unit tests cover the shape and math. - Provider CLIs sometimes return non-JSON error text to stdout; the parsers fall back to treating raw output as plain text in that case. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-05 05:05:08 +02:00 · 2026-04-17 06:16:42 +08:00
parent 9e95a9dc50
commit 614354fc41
10 changed files with 1092 additions and 0 deletions
@@ -0,0 +1,111 @@
+#!/usr/bin/env bun
+/**
+ * gstack-model-benchmark — run the same prompt across multiple providers
+ * and compare latency, tokens, cost, quality, and tool-call count.
+ *
+ * Usage:
+ *   gstack-model-benchmark <skill-or-prompt-file> [options]
+ *
+ * Options:
+ *   --models claude,gpt,gemini   Comma-separated provider list (default: claude)
+ *   --prompt "<text>"            Inline prompt instead of a file
+ *   --workdir <path>             Working dir passed to each CLI (default: cwd)
+ *   --timeout-ms <n>             Per-provider timeout (default: 300000)
+ *   --output table|json|markdown Output format (default: table)
+ *   --skip-unavailable           Skip providers that fail available() check
+ *                                (default: include them with unavailable marker)
+ *   --judge                      Run Anthropic SDK judge on outputs for quality score
+ *                                (requires ANTHROPIC_API_KEY; adds ~$0.05 per call)
+ *
+ * Examples:
+ *   gstack-model-benchmark --prompt "Write a haiku about databases" --models claude,gpt
+ *   gstack-model-benchmark ./test-prompt.txt --models claude,gpt,gemini --judge
+ */
+
+import * as fs from 'fs';
+import * as path from 'path';
+import { runBenchmark, formatTable, formatJson, formatMarkdown, type BenchmarkInput } from '../test/helpers/benchmark-runner';
+
+type OutputFormat = 'table' | 'json' | 'markdown';
+
+function arg(name: string, def?: string): string | undefined {
+  const idx = process.argv.findIndex(a => a === name || a.startsWith(name + '='));
+  if (idx < 0) return def;
+  const eqIdx = process.argv[idx].indexOf('=');
+  if (eqIdx >= 0) return process.argv[idx].slice(eqIdx + 1);
+  return process.argv[idx + 1];
+}
+
+function flag(name: string): boolean {
+  return process.argv.includes(name);
+}
+
+function parseProviders(s: string | undefined): Array<'claude' | 'gpt' | 'gemini'> {
+  if (!s) return ['claude'];
+  const out: Array<'claude' | 'gpt' | 'gemini'> = [];
+  for (const p of s.split(',').map(x => x.trim()).filter(Boolean)) {
+    if (p === 'claude' || p === 'gpt' || p === 'gemini') out.push(p);
+    else {
+      console.error(`WARN: unknown provider '${p}' — skipping. Valid: claude, gpt, gemini.`);
+    }
+  }
+  return out.length ? out : ['claude'];
+}
+
+function resolvePrompt(positional: string | undefined): string {
+  const inline = arg('--prompt');
+  if (inline) return inline;
+  if (!positional) {
+    console.error('ERROR: specify a prompt via positional path or --prompt "<text>"');
+    process.exit(1);
+  }
+  if (fs.existsSync(positional)) {
+    return fs.readFileSync(positional, 'utf-8');
+  }
+  // Not a file — treat as inline prompt
+  return positional;
+}
+
+async function main(): Promise<void> {
+  const positional = process.argv.slice(2).find(a => !a.startsWith('--'));
+  const prompt = resolvePrompt(positional);
+  const providers = parseProviders(arg('--models'));
+  const workdir = arg('--workdir', process.cwd())!;
+  const timeoutMs = parseInt(arg('--timeout-ms', '300000')!, 10);
+  const output = (arg('--output', 'table') as OutputFormat);
+  const skipUnavailable = flag('--skip-unavailable');
+  const doJudge = flag('--judge');
+
+  const input: BenchmarkInput = {
+    prompt,
+    workdir,
+    providers,
+    timeoutMs,
+    skipUnavailable,
+  };
+
+  const report = await runBenchmark(input);
+
+  if (doJudge) {
+    try {
+      const { judgeEntries } = await import('../test/helpers/benchmark-judge');
+      await judgeEntries(report);
+    } catch (err) {
+      console.error(`WARN: judge unavailable: ${(err as Error).message}`);
+    }
+  }
+
+  let out: string;
+  switch (output) {
+    case 'json':     out = formatJson(report); break;
+    case 'markdown': out = formatMarkdown(report); break;
+    case 'table':
+    default:         out = formatTable(report); break;
+  }
+  process.stdout.write(out + '\n');
+}
+
+main().catch(err => {
+  console.error('FATAL:', err);
+  process.exit(1);
+});
@@ -0,0 +1,137 @@
+/**
+ * Unit tests for the benchmark runner.
+ *
+ * Mocks adapters to verify:
+ * - All adapters run in parallel (Promise.allSettled not serial)
+ * - Unavailable adapters are skipped or marked depending on flag
+ * - Per-adapter errors don't abort the batch
+ * - Output formatters (table, json, markdown) produce non-empty strings
+ *
+ * Does NOT exercise live CLIs — see test/providers.e2e.test.ts for those.
+ */
+
+import { test, expect } from 'bun:test';
+import { formatTable, formatJson, formatMarkdown, type BenchmarkReport } from './helpers/benchmark-runner';
+import { estimateCostUsd, PRICING } from './helpers/pricing';
+import { missingTools, TOOL_COMPATIBILITY } from './helpers/tool-map';
+
+test('estimateCostUsd returns 0 for unknown model (no crash)', () => {
+  const cost = estimateCostUsd({ input: 1000, output: 500 }, 'unknown-model-7b');
+  expect(cost).toBe(0);
+});
+
+test('estimateCostUsd computes correctly for known Claude model', () => {
+  // claude-opus-4-7: $15/MTok input, $75/MTok output
+  // 1M input + 0.5M output = $15 + $37.50 = $52.50
+  const cost = estimateCostUsd({ input: 1_000_000, output: 500_000 }, 'claude-opus-4-7');
+  expect(cost).toBeCloseTo(52.50, 2);
+});
+
+test('estimateCostUsd applies cached input discount alongside uncached input', () => {
+  // tokens.input is uncached-only; tokens.cached is disjoint cache-reads at 10%.
+  // 0 uncached input, 1M cached → 10% of 15 = $1.50
+  const cost1 = estimateCostUsd({ input: 0, output: 0, cached: 1_000_000 }, 'claude-opus-4-7');
+  expect(cost1).toBeCloseTo(1.50, 2);
+  // 500K uncached input + 500K cached → $7.50 + $0.75 = $8.25
+  const cost2 = estimateCostUsd({ input: 500_000, output: 0, cached: 500_000 }, 'claude-opus-4-7');
+  expect(cost2).toBeCloseTo(8.25, 2);
+});
+
+test('PRICING table covers the key model families', () => {
+  expect(PRICING['claude-opus-4-7']).toBeDefined();
+  expect(PRICING['claude-sonnet-4-6']).toBeDefined();
+  expect(PRICING['gpt-5.4']).toBeDefined();
+  expect(PRICING['gemini-2.5-pro']).toBeDefined();
+});
+
+test('missingTools reports unsupported tools per provider', () => {
+  // GPT/Codex doesn't expose Edit, Glob, Grep
+  expect(missingTools('gpt', ['Edit', 'Glob', 'Grep'])).toEqual(['Edit', 'Glob', 'Grep']);
+  // Claude supports all core tools
+  expect(missingTools('claude', ['Edit', 'Glob', 'Grep', 'Bash', 'Read'])).toEqual([]);
+  // Gemini has very limited agentic surface
+  expect(missingTools('gemini', ['Bash', 'Edit'])).toEqual(['Bash', 'Edit']);
+});
+
+test('TOOL_COMPATIBILITY is populated for all three families', () => {
+  expect(TOOL_COMPATIBILITY.claude).toBeDefined();
+  expect(TOOL_COMPATIBILITY.gpt).toBeDefined();
+  expect(TOOL_COMPATIBILITY.gemini).toBeDefined();
+});
+
+test('formatTable handles a report with mixed success/error/unavailable entries', () => {
+  const report: BenchmarkReport = {
+    prompt: 'test prompt',
+    workdir: '/tmp',
+    startedAt: '2026-04-16T20:00:00Z',
+    durationMs: 1500,
+    entries: [
+      {
+        provider: 'claude',
+        family: 'claude',
+        available: true,
+        result: {
+          output: 'ok',
+          tokens: { input: 100, output: 200 },
+          durationMs: 800,
+          toolCalls: 3,
+          modelUsed: 'claude-opus-4-7',
+        },
+        costUsd: 0.0165,
+        qualityScore: 9.2,
+      },
+      {
+        provider: 'gpt',
+        family: 'gpt',
+        available: true,
+        result: {
+          output: '',
+          tokens: { input: 0, output: 0 },
+          durationMs: 200,
+          toolCalls: 0,
+          modelUsed: 'gpt-5.4',
+          error: { code: 'auth', reason: 'codex login required' },
+        },
+      },
+      {
+        provider: 'gemini',
+        family: 'gemini',
+        available: false,
+        unavailable_reason: 'gemini CLI not on PATH',
+      },
+    ],
+  };
+
+  const table = formatTable(report);
+  expect(table).toContain('claude-opus-4-7');
+  expect(table).toContain('ERROR auth');
+  expect(table).toContain('unavailable');
+  expect(table).toContain('9.2/10');
+});
+
+test('formatJson produces parseable JSON', () => {
+  const report: BenchmarkReport = {
+    prompt: 'x',
+    workdir: '/tmp',
+    startedAt: '2026-04-16T20:00:00Z',
+    durationMs: 100,
+    entries: [],
+  };
+  const json = formatJson(report);
+  const parsed = JSON.parse(json);
+  expect(parsed.prompt).toBe('x');
+  expect(parsed.entries).toEqual([]);
+});
+
+test('formatMarkdown produces a table header', () => {
+  const report: BenchmarkReport = {
+    prompt: 'x',
+    workdir: '/tmp',
+    startedAt: '2026-04-16T20:00:00Z',
+    durationMs: 100,
+    entries: [],
+  };
+  const md = formatMarkdown(report);
+  expect(md).toContain('# Benchmark report');
+  expect(md).toContain('| Model | Latency |');
+});
@@ -0,0 +1,101 @@
+/**
+ * Benchmark quality judge — wraps llm-judge.ts for multi-provider scoring.
+ *
+ * The judge is always Anthropic SDK (claude-sonnet-4-6) for stability. It sees
+ * the prompt + N provider outputs and scores each on: correctness, completeness,
+ * code quality, edge case handling. 0-10 per dimension; overall = average.
+ *
+ * Judge adds ~$0.05 per benchmark run. Gated by --judge CLI flag.
+ */
+
+import type { BenchmarkReport, BenchmarkEntry } from './benchmark-runner';
+
+export async function judgeEntries(report: BenchmarkReport): Promise<void> {
+  if (!process.env.ANTHROPIC_API_KEY) {
+    throw new Error('ANTHROPIC_API_KEY not set — judge requires Anthropic access.');
+  }
+  const { default: Anthropic } = await import('@anthropic-ai/sdk').catch(() => {
+    throw new Error('@anthropic-ai/sdk not installed — run `bun add @anthropic-ai/sdk` if you want the judge.');
+  });
+  const client = new (Anthropic as unknown as new (opts: { apiKey: string }) => {
+    messages: { create: (params: Record<string, unknown>) => Promise<{ content: Array<{ type: string; text: string }> }> };
+  })({ apiKey: process.env.ANTHROPIC_API_KEY! });
+
+  const successful = report.entries.filter(e => e.available && e.result && !e.result.error);
+  if (successful.length === 0) return;
+
+  const judgePrompt = buildJudgePrompt(report.prompt, successful);
+  const msg = await client.messages.create({
+    model: 'claude-sonnet-4-6',
+    max_tokens: 2048,
+    messages: [{ role: 'user', content: judgePrompt }],
+  });
+  const textBlock = msg.content.find(c => c.type === 'text');
+  if (!textBlock) return;
+
+  const scores = parseScores(textBlock.text, successful.length);
+  for (let i = 0; i < successful.length; i++) {
+    const s = scores[i];
+    if (!s) continue;
+    successful[i].qualityScore = s.overall;
+    successful[i].qualityDetails = s.dimensions;
+  }
+}
+
+function buildJudgePrompt(prompt: string, entries: BenchmarkEntry[]): string {
+  const lines: string[] = [
+    'You are a strict, fair technical reviewer scoring N model outputs against the same prompt.',
+    '',
+    '--- PROMPT ---',
+    prompt.length > 4000 ? prompt.slice(0, 4000) + '\n[...truncated for judge budget...]' : prompt,
+    '',
+    '--- OUTPUTS ---',
+  ];
+  entries.forEach((e, i) => {
+    const r = e.result!;
+    const out = r.output.length > 3000 ? r.output.slice(0, 3000) + '\n[...truncated...]' : r.output;
+    lines.push(`=== Output ${i + 1}: ${r.modelUsed} ===`);
+    lines.push(out);
+    lines.push('');
+  });
+  lines.push('');
+  lines.push('Score each output on these dimensions (0-10 per dimension):');
+  lines.push('  - correctness:   does it solve what the prompt asked?');
+  lines.push('  - completeness:  are edge cases and error paths addressed?');
+  lines.push('  - code_quality:  naming, structure, explicitness');
+  lines.push('  - edge_cases:    handling of nil/empty/invalid input');
+  lines.push('');
+  lines.push('Return JSON only, in this exact shape:');
+  lines.push('{"scores":[');
+  lines.push('  {"output":1,"correctness":N,"completeness":N,"code_quality":N,"edge_cases":N,"overall":N,"notes":"..."},');
+  lines.push('  ...');
+  lines.push(']}');
+  lines.push('');
+  lines.push('overall = rounded average of the 4 dimensions. No other commentary.');
+  return lines.join('\n');
+}
+
+interface ParsedScore {
+  overall: number;
+  dimensions: Record<string, number>;
+}
+
+function parseScores(raw: string, expectedCount: number): ParsedScore[] {
+  const match = raw.match(/\{[\s\S]*\}/);
+  if (!match) return [];
+  try {
+    const obj = JSON.parse(match[0]);
+    if (!Array.isArray(obj.scores)) return [];
+    return obj.scores.slice(0, expectedCount).map((s: Record<string, number>) => ({
+      overall: Number(s.overall ?? 0),
+      dimensions: {
+        correctness: Number(s.correctness ?? 0),
+        completeness: Number(s.completeness ?? 0),
+        code_quality: Number(s.code_quality ?? 0),
+        edge_cases: Number(s.edge_cases ?? 0),
+      },
+    }));
+  } catch {
+    return [];
+  }
+}
@@ -0,0 +1,165 @@
+/**
+ * Multi-provider benchmark runner.
+ *
+ * Orchestrates running the same prompt across multiple provider adapters and
+ * aggregates RunResult outputs + judge scores into a single report. Adapters
+ * run in parallel (Promise.allSettled) so a slow provider doesn't block a fast
+ * one. Per-provider auth/timeout/rate-limit errors don't abort the batch.
+ */
+
+import type { ProviderAdapter, RunOpts, RunResult } from './providers/types';
+import { ClaudeAdapter } from './providers/claude';
+import { GptAdapter } from './providers/gpt';
+import { GeminiAdapter } from './providers/gemini';
+
+export interface BenchmarkInput {
+  prompt: string;
+  workdir: string;
+  timeoutMs?: number;
+  /** Adapter names to run (e.g., ['claude', 'gpt', 'gemini']). */
+  providers: Array<'claude' | 'gpt' | 'gemini'>;
+  /** Optional per-provider model overrides. */
+  models?: Partial<Record<'claude' | 'gpt' | 'gemini', string>>;
+  /** If true, skip providers whose available() returns !ok. If false, include them with error. */
+  skipUnavailable?: boolean;
+}
+
+export interface BenchmarkEntry {
+  provider: string;
+  family: 'claude' | 'gpt' | 'gemini';
+  available: boolean;
+  unavailable_reason?: string;
+  result?: RunResult;
+  costUsd?: number;
+  /** Judge score 0-10 across dimensions. Populated separately by the judge step. */
+  qualityScore?: number;
+  qualityDetails?: Record<string, number>;
+}
+
+export interface BenchmarkReport {
+  prompt: string;
+  workdir: string;
+  startedAt: string;
+  durationMs: number;
+  entries: BenchmarkEntry[];
+}
+
+const ADAPTERS: Record<'claude' | 'gpt' | 'gemini', () => ProviderAdapter> = {
+  claude: () => new ClaudeAdapter(),
+  gpt: () => new GptAdapter(),
+  gemini: () => new GeminiAdapter(),
+};
+
+export async function runBenchmark(input: BenchmarkInput): Promise<BenchmarkReport> {
+  const startedAtMs = Date.now();
+  const startedAt = new Date(startedAtMs).toISOString();
+  const timeoutMs = input.timeoutMs ?? 300_000;
+
+  const entries: BenchmarkEntry[] = [];
+  const runPromises: Array<Promise<void>> = [];
+
+  for (const name of input.providers) {
+    const factory = ADAPTERS[name];
+    if (!factory) {
+      entries.push({ provider: name, family: 'claude', available: false, unavailable_reason: `unknown provider: ${name}` });
+      continue;
+    }
+    const adapter = factory();
+    const entry: BenchmarkEntry = { provider: adapter.name, family: adapter.family, available: true };
+    entries.push(entry);
+
+    runPromises.push((async () => {
+      const check = await adapter.available();
+      entry.available = check.ok;
+      if (!check.ok) {
+        entry.unavailable_reason = check.reason;
+        if (input.skipUnavailable) return;
+      }
+      const opts: RunOpts = {
+        prompt: input.prompt,
+        workdir: input.workdir,
+        timeoutMs,
+        model: input.models?.[name],
+      };
+      const res = await adapter.run(opts);
+      entry.result = res;
+      entry.costUsd = adapter.estimateCost(res.tokens, res.modelUsed);
+    })());
+  }
+
+  await Promise.allSettled(runPromises);
+
+  return {
+    prompt: input.prompt,
+    workdir: input.workdir,
+    startedAt,
+    durationMs: Date.now() - startedAtMs,
+    entries,
+  };
+}
+
+export function formatTable(report: BenchmarkReport): string {
+  const header = `Model                Latency   In→Out Tokens       Cost       Quality   Tool Calls   Notes`;
+  const sep = '-'.repeat(header.length);
+  const rows: string[] = [header, sep];
+  for (const e of report.entries) {
+    if (!e.available) {
+      rows.push(`${pad(e.provider, 20)} ${pad('-', 9)} ${pad('-', 20)} ${pad('-', 10)} ${pad('-', 9)} ${pad('-', 12)} unavailable: ${e.unavailable_reason ?? 'unknown'}`);
+      continue;
+    }
+    const r = e.result!;
+    if (r.error) {
+      rows.push(`${pad(r.modelUsed, 20)} ${pad(msToStr(r.durationMs), 9)} ${pad(`${r.tokens.input}→${r.tokens.output}`, 20)} ${pad(fmtCost(e.costUsd), 10)} ${pad('-', 9)} ${pad(String(r.toolCalls), 12)} ERROR ${r.error.code}: ${r.error.reason.slice(0, 40)}`);
+      continue;
+    }
+    const quality = e.qualityScore !== undefined ? `${e.qualityScore.toFixed(1)}/10` : '-';
+    rows.push(`${pad(r.modelUsed, 20)} ${pad(msToStr(r.durationMs), 9)} ${pad(`${r.tokens.input}→${r.tokens.output}`, 20)} ${pad(fmtCost(e.costUsd), 10)} ${pad(quality, 9)} ${pad(String(r.toolCalls), 12)}`);
+  }
+  return rows.join('\n');
+}
+
+export function formatJson(report: BenchmarkReport): string {
+  return JSON.stringify(report, null, 2);
+}
+
+export function formatMarkdown(report: BenchmarkReport): string {
+  const lines: string[] = [
+    `# Benchmark report — ${report.startedAt}`,
+    '',
+    `**Prompt:** ${report.prompt.length > 200 ? report.prompt.slice(0, 200) + '…' : report.prompt}`,
+    `**Workdir:** \`${report.workdir}\``,
+    `**Total duration:** ${msToStr(report.durationMs)}`,
+    '',
+    '| Model | Latency | Tokens (in→out) | Cost | Quality | Tools | Notes |',
+    '|-------|---------|-----------------|------|---------|-------|-------|',
+  ];
+  for (const e of report.entries) {
+    if (!e.available) {
+      lines.push(`| ${e.provider} | - | - | - | - | - | unavailable: ${e.unavailable_reason ?? 'unknown'} |`);
+      continue;
+    }
+    const r = e.result!;
+    if (r.error) {
+      lines.push(`| ${r.modelUsed} | ${msToStr(r.durationMs)} | ${r.tokens.input}→${r.tokens.output} | ${fmtCost(e.costUsd)} | - | ${r.toolCalls} | ERROR ${r.error.code}: ${r.error.reason.slice(0, 80)} |`);
+      continue;
+    }
+    const quality = e.qualityScore !== undefined ? `${e.qualityScore.toFixed(1)}/10` : '-';
+    lines.push(`| ${r.modelUsed} | ${msToStr(r.durationMs)} | ${r.tokens.input}→${r.tokens.output} | ${fmtCost(e.costUsd)} | ${quality} | ${r.toolCalls} | |`);
+  }
+  return lines.join('\n');
+}
+
+function pad(s: string, n: number): string {
+  return s.length >= n ? s.slice(0, n) : s + ' '.repeat(n - s.length);
+}
+
+function msToStr(ms: number): string {
+  if (ms < 1000) return `${ms}ms`;
+  return `${(ms / 1000).toFixed(1)}s`;
+}
+
+function fmtCost(usd?: number): string {
+  if (usd === undefined) return '-';
+  if (usd < 0.01) return `$${usd.toFixed(4)}`;
+  return `$${usd.toFixed(2)}`;
+}
@@ -0,0 +1,61 @@
+/**
+ * Per-model pricing tables.
+ *
+ * Prices are USD per million tokens as of `as_of`. Update quarterly.
+ * Link to provider pricing pages:
+ *   - Anthropic: https://www.anthropic.com/pricing#api
+ *   - OpenAI: https://openai.com/api/pricing/
+ *   - Google AI: https://ai.google.dev/pricing
+ *
+ * When a model isn't in the table, estimateCost returns 0 with a console warning.
+ * Prefer adding a new row to the table over guessing.
+ */
+
+export interface ModelPricing {
+  input_per_mtok: number;
+  output_per_mtok: number;
+  as_of: string; // YYYY-MM
+}
+
+export const PRICING: Record<string, ModelPricing> = {
+  // Claude (Anthropic)
+  'claude-opus-4-7':    { input_per_mtok: 15.00, output_per_mtok: 75.00, as_of: '2026-04' },
+  'claude-sonnet-4-6':  { input_per_mtok: 3.00,  output_per_mtok: 15.00, as_of: '2026-04' },
+  'claude-haiku-4-5':   { input_per_mtok: 1.00,  output_per_mtok: 5.00,  as_of: '2026-04' },
+
+  // OpenAI (GPT + o-series)
+  'gpt-5.4':            { input_per_mtok: 2.50,  output_per_mtok: 10.00, as_of: '2026-04' },
+  'gpt-5.4-mini':       { input_per_mtok: 0.60,  output_per_mtok: 2.40,  as_of: '2026-04' },
+  'o3':                 { input_per_mtok: 15.00, output_per_mtok: 60.00, as_of: '2026-04' },
+  'o4-mini':            { input_per_mtok: 1.10,  output_per_mtok: 4.40,  as_of: '2026-04' },
+
+  // Google
+  'gemini-2.5-pro':     { input_per_mtok: 1.25,  output_per_mtok: 5.00,  as_of: '2026-04' },
+  'gemini-2.5-flash':   { input_per_mtok: 0.30,  output_per_mtok: 1.20,  as_of: '2026-04' },
+};
+
+const WARNED = new Set<string>();
+
+export function estimateCostUsd(
+  tokens: { input: number; output: number; cached?: number },
+  model: string | undefined
+): number {
+  if (!model) return 0;
+  const row = PRICING[model];
+  if (!row) {
+    if (!WARNED.has(model)) {
+      WARNED.add(model);
+      console.error(`WARN: no pricing for model ${model}; returning 0. Add it to test/helpers/pricing.ts.`);
+    }
+    return 0;
+  }
+  // Anthropic and OpenAI report cached tokens as a separate (disjoint) field from
+  // uncached input tokens. tokens.input is already the uncached portion; tokens.cached
+  // is the cache-read count billed at 10% of the regular input rate. Do NOT subtract
+  // cached from input — they don't overlap.
+  const cachedDiscount = 0.1;
+  const inputCost = tokens.input * row.input_per_mtok / 1_000_000;
+  const cachedCost = (tokens.cached ?? 0) * row.input_per_mtok * cachedDiscount / 1_000_000;
+  const outputCost = tokens.output * row.output_per_mtok / 1_000_000;
+  return +(inputCost + cachedCost + outputCost).toFixed(6);
+}
@@ -0,0 +1,116 @@
+import type { ProviderAdapter, RunOpts, RunResult, AvailabilityCheck } from './types';
+import { estimateCostUsd } from '../pricing';
+import { execFileSync, spawnSync } from 'child_process';
+import * as fs from 'fs';
+import * as path from 'path';
+import * as os from 'os';
+
+/**
+ * Claude adapter — wraps the `claude` CLI via claude -p.
+ *
+ * For brevity and to avoid duplicating the full stream-json parser, this adapter
+ * uses claude CLI in non-interactive mode (--print) with the simpler JSON output
+ * format. If richer event-level metrics are needed (per-tool timing etc.),
+ * swap to session-runner's full stream-json parser.
+ */
+export class ClaudeAdapter implements ProviderAdapter {
+  readonly name = 'claude';
+  readonly family = 'claude' as const;
+
+  async available(): Promise<AvailabilityCheck> {
+    // Binary on PATH?
+    const res = spawnSync('sh', ['-c', 'command -v claude'], { timeout: 2000 });
+    if (res.status !== 0) {
+      return { ok: false, reason: 'claude CLI not found on PATH. Install from https://claude.ai/download or npm i -g @anthropic-ai/claude-code' };
+    }
+    // Auth sniff: ~/.claude/.credentials.json OR ANTHROPIC_API_KEY
+    const credsPath = path.join(os.homedir(), '.claude', '.credentials.json');
+    const hasCreds = fs.existsSync(credsPath);
+    const hasKey = !!process.env.ANTHROPIC_API_KEY;
+    if (!hasCreds && !hasKey) {
+      return { ok: false, reason: 'No Claude auth found. Log in via `claude` interactive session, or export ANTHROPIC_API_KEY.' };
+    }
+    return { ok: true };
+  }
+
+  async run(opts: RunOpts): Promise<RunResult> {
+    const start = Date.now();
+    const args = ['-p', '--output-format', 'json'];
+    if (opts.model) args.push('--model', opts.model);
+    if (opts.extraArgs) args.push(...opts.extraArgs);
+
+    try {
+      const out = execFileSync('claude', args, {
+        input: opts.prompt,
+        cwd: opts.workdir,
+        timeout: opts.timeoutMs,
+        encoding: 'utf-8',
+        maxBuffer: 32 * 1024 * 1024,
+      });
+      const parsed = this.parseOutput(out);
+      return {
+        output: parsed.output,
+        tokens: parsed.tokens,
+        durationMs: Date.now() - start,
+        toolCalls: parsed.toolCalls,
+        modelUsed: parsed.modelUsed || opts.model || 'claude-opus-4-7',
+      };
+    } catch (err: unknown) {
+      const durationMs = Date.now() - start;
+      const e = err as { code?: string; stderr?: Buffer; signal?: string; message?: string };
+      const stderr = e.stderr?.toString() ?? '';
+      if (e.signal === 'SIGTERM' || e.code === 'ETIMEDOUT') {
+        return this.emptyResult(durationMs, { code: 'timeout', reason: `exceeded ${opts.timeoutMs}ms` }, opts.model);
+      }
+      if (/unauthorized|auth|login/i.test(stderr)) {
+        return this.emptyResult(durationMs, { code: 'auth', reason: stderr.slice(0, 400) }, opts.model);
+      }
+      if (/rate[- ]?limit|429/i.test(stderr)) {
+        return this.emptyResult(durationMs, { code: 'rate_limit', reason: stderr.slice(0, 400) }, opts.model);
+      }
+      return this.emptyResult(durationMs, { code: 'unknown', reason: (e.message ?? stderr ?? 'unknown').slice(0, 400) }, opts.model);
+    }
+  }
+
+  estimateCost(tokens: { input: number; output: number; cached?: number }, model?: string): number {
+    return estimateCostUsd(tokens, model ?? 'claude-opus-4-7');
+  }
+
+  /**
+   * Parse claude -p --output-format json output. Shape (as of 2026-04):
+   *   { type: "result", result: "<assistant text>", usage: { input_tokens, output_tokens, ... },
+   *     num_turns, session_id, ... }
+   * Older formats may differ — adapter is best-effort.
+   */
+  private parseOutput(raw: string): { output: string; tokens: { input: number; output: number; cached?: number }; toolCalls: number; modelUsed?: string } {
+    try {
+      const obj = JSON.parse(raw);
+      const result = typeof obj.result === 'string' ? obj.result : String(obj.result ?? '');
+      const u = obj.usage ?? {};
+      return {
+        output: result,
+        tokens: {
+          input: u.input_tokens ?? 0,
+          output: u.output_tokens ?? 0,
+          cached: u.cache_read_input_tokens,
+        },
+        toolCalls: obj.num_turns ?? 0,
+        modelUsed: obj.model,
+      };
+    } catch {
+      // Non-JSON output: treat as plain text.
+      return { output: raw, tokens: { input: 0, output: 0 }, toolCalls: 0 };
+    }
+  }
+
+  private emptyResult(durationMs: number, error: RunResult['error'], model?: string): RunResult {
+    return {
+      output: '',
+      tokens: { input: 0, output: 0 },
+      durationMs,
+      toolCalls: 0,
+      modelUsed: model ?? 'claude-opus-4-7',
+      error,
+    };
+  }
+}
@@ -0,0 +1,123 @@
+import type { ProviderAdapter, RunOpts, RunResult, AvailabilityCheck } from './types';
+import { estimateCostUsd } from '../pricing';
+import { execFileSync, spawnSync } from 'child_process';
+import * as fs from 'fs';
+import * as path from 'path';
+import * as os from 'os';
+
+/**
+ * Gemini adapter — wraps the `gemini` CLI.
+ *
+ * Gemini CLI auth comes from either ~/.config/gemini/ or GOOGLE_API_KEY. Output
+ * format is NDJSON with `message`/`tool_use`/`result` events when `--output-format
+ * stream-json` is requested. This adapter uses a single-response form for simplicity
+ * in benchmarks; richer streaming lives in gemini-session-runner.ts.
+ */
+export class GeminiAdapter implements ProviderAdapter {
+  readonly name = 'gemini';
+  readonly family = 'gemini' as const;
+
+  async available(): Promise<AvailabilityCheck> {
+    const res = spawnSync('sh', ['-c', 'command -v gemini'], { timeout: 2000 });
+    if (res.status !== 0) {
+      return { ok: false, reason: 'gemini CLI not found on PATH. Install per https://github.com/google-gemini/gemini-cli' };
+    }
+    const cfgDir = path.join(os.homedir(), '.config', 'gemini');
+    const hasCfg = fs.existsSync(cfgDir);
+    const hasKey = !!process.env.GOOGLE_API_KEY;
+    if (!hasCfg && !hasKey) {
+      return { ok: false, reason: 'No Gemini auth found. Log in via `gemini login` or export GOOGLE_API_KEY.' };
+    }
+    return { ok: true };
+  }
+
+  async run(opts: RunOpts): Promise<RunResult> {
+    const start = Date.now();
+    // Default to --yolo (non-interactive) and stream-json output so we can parse
+    // tokens + tool calls. Callers can override via extraArgs.
+    const args = ['-p', opts.prompt, '--output-format', 'stream-json', '--yolo'];
+    if (opts.model) args.push('--model', opts.model);
+    if (opts.extraArgs) args.push(...opts.extraArgs);
+
+    try {
+      const out = execFileSync('gemini', args, {
+        cwd: opts.workdir,
+        timeout: opts.timeoutMs,
+        encoding: 'utf-8',
+        maxBuffer: 32 * 1024 * 1024,
+      });
+      const parsed = this.parseStreamJson(out);
+      return {
+        output: parsed.output,
+        tokens: parsed.tokens,
+        durationMs: Date.now() - start,
+        toolCalls: parsed.toolCalls,
+        modelUsed: parsed.modelUsed || opts.model || 'gemini-2.5-pro',
+      };
+    } catch (err: unknown) {
+      const durationMs = Date.now() - start;
+      const e = err as { code?: string; stderr?: Buffer; signal?: string; message?: string };
+      const stderr = e.stderr?.toString() ?? '';
+      if (e.signal === 'SIGTERM' || e.code === 'ETIMEDOUT') {
+        return this.emptyResult(durationMs, { code: 'timeout', reason: `exceeded ${opts.timeoutMs}ms` }, opts.model);
+      }
+      if (/unauthorized|auth|login|api key/i.test(stderr)) {
+        return this.emptyResult(durationMs, { code: 'auth', reason: stderr.slice(0, 400) }, opts.model);
+      }
+      if (/rate[- ]?limit|429|quota/i.test(stderr)) {
+        return this.emptyResult(durationMs, { code: 'rate_limit', reason: stderr.slice(0, 400) }, opts.model);
+      }
+      return this.emptyResult(durationMs, { code: 'unknown', reason: (e.message ?? stderr ?? 'unknown').slice(0, 400) }, opts.model);
+    }
+  }
+
+  estimateCost(tokens: { input: number; output: number; cached?: number }, model?: string): number {
+    return estimateCostUsd(tokens, model ?? 'gemini-2.5-pro');
+  }
+
+  /**
+   * Parse gemini NDJSON stream events:
+   *   init  → session id (discarded here)
+   *   message { delta: true, text } → concat to output
+   *   tool_use { name } → increment toolCalls
+   *   result { usage: { input_token_count, output_token_count } } → tokens
+   */
+  private parseStreamJson(raw: string): { output: string; tokens: { input: number; output: number }; toolCalls: number; modelUsed?: string } {
+    let output = '';
+    let input = 0;
+    let out = 0;
+    let toolCalls = 0;
+    let modelUsed: string | undefined;
+    for (const line of raw.split('\n')) {
+      const s = line.trim();
+      if (!s) continue;
+      try {
+        const obj = JSON.parse(s);
+        if (obj.type === 'message' && typeof obj.text === 'string') {
+          output += obj.text;
+        } else if (obj.type === 'tool_use') {
+          toolCalls += 1;
+        } else if (obj.type === 'result') {
+          const u = obj.usage ?? {};
+          input += u.input_token_count ?? u.prompt_tokens ?? 0;
+          out += u.output_token_count ?? u.completion_tokens ?? 0;
+          if (obj.model) modelUsed = obj.model;
+        }
+      } catch {
+        // skip malformed lines
+      }
+    }
+    return { output, tokens: { input, output: out }, toolCalls, modelUsed };
+  }
+
+  private emptyResult(durationMs: number, error: RunResult['error'], model?: string): RunResult {
+    return {
+      output: '',
+      tokens: { input: 0, output: 0 },
+      durationMs,
+      toolCalls: 0,
+      modelUsed: model ?? 'gemini-2.5-pro',
+      error,
+    };
+  }
+}
@@ -0,0 +1,122 @@
+import type { ProviderAdapter, RunOpts, RunResult, AvailabilityCheck } from './types';
+import { estimateCostUsd } from '../pricing';
+import { execFileSync, spawnSync } from 'child_process';
+import * as fs from 'fs';
+import * as path from 'path';
+import * as os from 'os';
+
+/**
+ * GPT adapter — wraps the OpenAI `codex` CLI (codex exec with --json output).
+ *
+ * Codex uses ~/.codex/ for auth (not OPENAI_API_KEY). The --json flag emits
+ * JSONL events; we parse `turn.completed` for usage and `agent_message` / etc.
+ * for output aggregation.
+ */
+export class GptAdapter implements ProviderAdapter {
+  readonly name = 'gpt';
+  readonly family = 'gpt' as const;
+
+  async available(): Promise<AvailabilityCheck> {
+    const res = spawnSync('sh', ['-c', 'command -v codex'], { timeout: 2000 });
+    if (res.status !== 0) {
+      return { ok: false, reason: 'codex CLI not found on PATH. Install: npm i -g @openai/codex' };
+    }
+    // Auth sniff: ~/.codex/ should contain auth state after `codex login`
+    const codexDir = path.join(os.homedir(), '.codex');
+    if (!fs.existsSync(codexDir)) {
+      return { ok: false, reason: 'No ~/.codex/ found. Run `codex login` to authenticate via ChatGPT.' };
+    }
+    return { ok: true };
+  }
+
+  async run(opts: RunOpts): Promise<RunResult> {
+    const start = Date.now();
+    const args = ['exec', opts.prompt, '-C', opts.workdir, '-s', 'read-only', '--json'];
+    if (opts.model) args.push('-m', opts.model);
+    if (opts.extraArgs) args.push(...opts.extraArgs);
+
+    try {
+      const out = execFileSync('codex', args, {
+        cwd: opts.workdir,
+        timeout: opts.timeoutMs,
+        encoding: 'utf-8',
+        maxBuffer: 32 * 1024 * 1024,
+      });
+      const parsed = this.parseJsonl(out);
+      return {
+        output: parsed.output,
+        tokens: parsed.tokens,
+        durationMs: Date.now() - start,
+        toolCalls: parsed.toolCalls,
+        modelUsed: parsed.modelUsed || opts.model || 'gpt-5.4',
+      };
+    } catch (err: unknown) {
+      const durationMs = Date.now() - start;
+      const e = err as { code?: string; stderr?: Buffer; signal?: string; message?: string };
+      const stderr = e.stderr?.toString() ?? '';
+      if (e.signal === 'SIGTERM' || e.code === 'ETIMEDOUT') {
+        return this.emptyResult(durationMs, { code: 'timeout', reason: `exceeded ${opts.timeoutMs}ms` }, opts.model);
+      }
+      if (/unauthorized|auth|login/i.test(stderr)) {
+        return this.emptyResult(durationMs, { code: 'auth', reason: stderr.slice(0, 400) }, opts.model);
+      }
+      if (/rate[- ]?limit|429/i.test(stderr)) {
+        return this.emptyResult(durationMs, { code: 'rate_limit', reason: stderr.slice(0, 400) }, opts.model);
+      }
+      return this.emptyResult(durationMs, { code: 'unknown', reason: (e.message ?? stderr ?? 'unknown').slice(0, 400) }, opts.model);
+    }
+  }
+
+  estimateCost(tokens: { input: number; output: number; cached?: number }, model?: string): number {
+    return estimateCostUsd(tokens, model ?? 'gpt-5.4');
+  }
+
+  /**
+   * Parse codex exec --json JSONL stream.
+   * Key events:
+   *   - item.completed with item.type === 'agent_message' → text output
+   *   - item.completed with item.type === 'command_execution' → tool call
+   *   - turn.completed → usage.input_tokens, usage.output_tokens
+   *   - thread.started → session id (not used here)
+   */
+  private parseJsonl(raw: string): { output: string; tokens: { input: number; output: number }; toolCalls: number; modelUsed?: string } {
+    let output = '';
+    let input = 0;
+    let out = 0;
+    let toolCalls = 0;
+    let modelUsed: string | undefined;
+    for (const line of raw.split('\n')) {
+      const s = line.trim();
+      if (!s) continue;
+      try {
+        const obj = JSON.parse(s);
+        if (obj.type === 'item.completed' && obj.item) {
+          if (obj.item.type === 'agent_message' && typeof obj.item.text === 'string') {
+            output += (output ? '\n' : '') + obj.item.text;
+          } else if (obj.item.type === 'command_execution') {
+            toolCalls += 1;
+          }
+        } else if (obj.type === 'turn.completed') {
+          const u = obj.usage ?? {};
+          input += u.input_tokens ?? 0;
+          out += u.output_tokens ?? 0;
+          if (obj.model) modelUsed = obj.model;
+        }
+      } catch {
+        // skip malformed lines — codex stderr can leak in
+      }
+    }
+    return { output, tokens: { input, output: out }, toolCalls, modelUsed };
+  }
+
+  private emptyResult(durationMs: number, error: RunResult['error'], model?: string): RunResult {
+    return {
+      output: '',
+      tokens: { input: 0, output: 0 },
+      durationMs,
+      toolCalls: 0,
+      modelUsed: model ?? 'gpt-5.4',
+      error,
+    };
+  }
+}
@@ -0,0 +1,74 @@
+/**
+ * Provider adapter interface — uniform contract for Claude, GPT, Gemini.
+ *
+ * Each adapter wraps an existing runner (session-runner.ts, codex-session-runner.ts,
+ * gemini-session-runner.ts) and normalizes its per-provider result shape into the
+ * RunResult below. The benchmark harness only talks to adapters through this
+ * interface, never to the underlying runners directly.
+ */
+
+export interface RunOpts {
+  /** The prompt to send to the model. */
+  prompt: string;
+  /** Working directory passed to the underlying CLI. */
+  workdir: string;
+  /** Hard wall-clock timeout in ms. Default: 300000 (5 min). */
+  timeoutMs: number;
+  /** Specific model within the family, optional. Adapters pass through to provider. */
+  model?: string;
+  /** Extra flags per-provider (escape hatch for rare cases). Prefer staying generic. */
+  extraArgs?: string[];
+}
+
+export interface TokenUsage {
+  input: number;
+  output: number;
+  /** Cached input tokens (Anthropic/OpenAI support). Undefined if provider doesn't report. */
+  cached?: number;
+}
+
+export type RunError =
+  | 'auth'       // Credentials missing or invalid.
+  | 'timeout'    // Exceeded timeoutMs.
+  | 'rate_limit' // Provider rate-limited us; backoff exceeded.
+  | 'binary_missing' // CLI not found on PATH.
+  | 'unknown';   // Catch-all with reason populated.
+
+export interface RunResult {
+  /** Provider's textual output for the prompt. */
+  output: string;
+  /** Normalized token usage. 0s if unreported. */
+  tokens: TokenUsage;
+  /** Wall-clock duration. */
+  durationMs: number;
+  /** Count of tool/function calls made during the run (0 if unsupported). */
+  toolCalls: number;
+  /** Actual model ID the provider reports using (may be a variant of the family). */
+  modelUsed: string;
+  /** If the run failed, error code + human reason. output/tokens may be partial. */
+  error?: { code: RunError; reason: string };
+}
+
+export interface AvailabilityCheck {
+  ok: boolean;
+  /** When !ok: short reason shown to user. Includes install / login / env var hint. */
+  reason?: string;
+}
+
+export type Family = 'claude' | 'gpt' | 'gemini';
+
+export interface ProviderAdapter {
+  /** Stable name used in output tables and config (e.g., 'claude', 'gpt', 'gemini'). */
+  readonly name: string;
+  /** Model family this adapter targets. */
+  readonly family: Family;
+  /**
+   * Check whether the provider's CLI binary is present and authenticated.
+   * Should never block >2s. Non-throwing: returns { ok: false, reason } on failure.
+   */
+  available(): Promise<AvailabilityCheck>;
+  /** Run a prompt and return normalized RunResult. Non-throwing. Errors go in result.error. */
+  run(opts: RunOpts): Promise<RunResult>;
+  /** Estimate USD cost for the reported token usage and model. */
+  estimateCost(tokens: TokenUsage, model?: string): number;
+}
@@ -0,0 +1,82 @@
+/**
+ * Tool compatibility map across provider CLIs.
+ *
+ * Not all provider CLIs expose equivalent tools. A benchmark that uses Edit, Glob,
+ * or Grep won't run cleanly on CLIs that don't have those. The map answers:
+ * "which tools does each provider's CLI expose by default?"
+ *
+ * When a benchmark is scoped to a tool a provider lacks, the harness records
+ * `unsupported_tool` in the result and continues with the other providers.
+ *
+ * Source-of-truth references:
+ *   - Claude Code: https://code.claude.com/docs/en/tools
+ *   - Codex CLI: `codex exec --help` tool listing
+ *   - Gemini CLI: `gemini --help` (limited tool surface as of 2026-04)
+ */
+
+export type ToolName =
+  | 'Read'
+  | 'Write'
+  | 'Edit'
+  | 'Bash'
+  | 'Agent'
+  | 'Glob'
+  | 'Grep'
+  | 'AskUserQuestion'
+  | 'WebSearch'
+  | 'WebFetch';
+
+export const TOOL_COMPATIBILITY: Record<'claude' | 'gpt' | 'gemini', Record<ToolName, boolean>> = {
+  claude: {
+    Read: true,
+    Write: true,
+    Edit: true,
+    Bash: true,
+    Agent: true,
+    Glob: true,
+    Grep: true,
+    AskUserQuestion: true,
+    WebSearch: true,
+    WebFetch: true,
+  },
+  gpt: {
+    // Codex CLI has a narrower tool surface: it uses shell + apply_patch.
+    // Read/Glob/Grep-style operations happen via shell pipelines.
+    Read: true,
+    Write: false,       // apply_patch handles writes; no standalone Write tool
+    Edit: false,        // apply_patch handles edits; no standalone Edit tool
+    Bash: true,
+    Agent: false,
+    Glob: false,
+    Grep: false,
+    AskUserQuestion: false,
+    WebSearch: true,    // --enable web_search_cached
+    WebFetch: false,
+  },
+  gemini: {
+    // Gemini CLI (as of 2026-04) has a limited tool surface in --yolo mode.
+    // Shell access depends on flags; most agentic tools are not exposed.
+    Read: true,
+    Write: false,
+    Edit: false,
+    Bash: false,
+    Agent: false,
+    Glob: false,
+    Grep: false,
+    AskUserQuestion: false,
+    WebSearch: true,
+    WebFetch: false,
+  },
+};
+
+/**
+ * Determine which tools from a required-set are missing for a given provider.
+ * Empty array means full compatibility.
+ */
+export function missingTools(
+  provider: 'claude' | 'gpt' | 'gemini',
+  requiredTools: ToolName[]
+): ToolName[] {
+  const map = TOOL_COMPATIBILITY[provider];
+  return requiredTools.filter(t => !map[t]);
+}