Files
gstack/test/helpers/benchmark-runner.ts
Garry Tan 614354fc41 feat: multi-provider model benchmark (boil the ocean)
Adds the full spec Codex asked for: real provider adapters with auth
detection, normalized RunResult, pricing tables, tool compatibility
maps, parallel execution with error isolation, and table/JSON/markdown
output. Judge stays on Anthropic SDK as the single stable source of
quality scoring, gated behind --judge.

Codex flagged the original plan as massively under-scoped — the
existing runner is Claude-only and the judge is Anthropic-only. You
can't benchmark GPT or Gemini without real provider infrastructure.
This commit ships it.

New architecture:

  test/helpers/providers/types.ts       ProviderAdapter interface (sketched below)
  test/helpers/providers/claude.ts      wraps `claude -p --output-format json`
  test/helpers/providers/gpt.ts         wraps `codex exec --json`
  test/helpers/providers/gemini.ts      wraps `gemini -p --output-format stream-json --yolo`
  test/helpers/pricing.ts               per-model USD cost tables (quarterly)
  test/helpers/tool-map.ts              which tools each CLI exposes
  test/helpers/benchmark-runner.ts      orchestrator (Promise.allSettled)
  test/helpers/benchmark-judge.ts       Anthropic SDK quality scorer
  bin/gstack-model-benchmark            CLI entry
  test/benchmark-runner.test.ts         9 unit tests (cost math, formatters, tool-map)
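
The ProviderAdapter surface itself isn't reproduced in this message; a rough
sketch, reconstructed from how benchmark-runner.ts consumes it (anything the
runner doesn't touch is an assumption, not the real types.ts):

  export interface RunOpts {
    prompt: string;
    workdir: string;
    timeoutMs: number;
    model?: string;
  }

  export interface RunResult {
    modelUsed: string;
    durationMs: number;
    tokens: { input: number; output: number };
    toolCalls: number;
    error?: { code: string; reason: string };  // e.g. auth, timeout, rate_limit
  }

  export interface ProviderAdapter {
    name: string;
    family: 'claude' | 'gpt' | 'gemini';
    available(): Promise<{ ok: boolean; reason?: string }>;
    run(opts: RunOpts): Promise<RunResult>;
    estimateCost(tokens: RunResult['tokens'], modelUsed: string): number;
  }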

Per-provider error isolation:
  - auth → record reason, don't abort batch
  - timeout → record reason, don't abort batch
  - rate_limit → record reason, don't abort batch
  - binary_missing → record in available() check, skip if --skip-unavailable
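
A minimal sketch of that isolation, assuming a guard around a single adapter
run (the real adapters build RunResult.error themselves rather than throwing;
names here are illustrative):

  async function runIsolated(adapter: ProviderAdapter, opts: RunOpts, entry: BenchmarkEntry): Promise<void> {
    try {
      entry.result = await adapter.run(opts);
    } catch (err) {
      // Record the failure on this entry; never rethrow, so the other
      // providers in the batch keep running.
      entry.result = {
        modelUsed: opts.model ?? adapter.name,
        durationMs: 0,
        tokens: { input: 0, output: 0 },
        toolCalls: 0,
        error: { code: 'unknown', reason: err instanceof Error ? err.message : String(err) },
      };
    }
  }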

Pricing correction: cached input tokens are disjoint from uncached
input tokens (Anthropic/OpenAI report them separately). Original
math subtracted them, producing negative costs. Now adds cached tokens
at the 10% discount alongside the full uncached input cost.
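
For concreteness, the corrected math (rates below are placeholders, not the
real pricing table; `cachedInput` is a hypothetical field name, and "the 10%
discount" is read here as cached reads billed at 10% of the uncached input rate):

  const INPUT_PER_MTOK = 3.0;    // assumed uncached input price, USD per 1M tokens
  const OUTPUT_PER_MTOK = 15.0;  // assumed output price

  function estimateCostUsd(t: { input: number; cachedInput: number; output: number }): number {
    const uncached = (t.input / 1e6) * INPUT_PER_MTOK;            // full price on uncached input
    const cached = (t.cachedInput / 1e6) * INPUT_PER_MTOK * 0.1;  // cached reads added at the 10% rate
    const output = (t.output / 1e6) * OUTPUT_PER_MTOK;
    return uncached + cached + output;
    // The old math did roughly (t.input - t.cachedInput) * rate, which goes
    // negative whenever cached tokens exceed the already-disjoint uncached count.
  }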

CLI:
  gstack-model-benchmark --prompt "..." --models claude,gpt,gemini
  gstack-model-benchmark ./prompt.txt --output json --judge
  gstack-model-benchmark ./prompt.txt --models claude --timeout-ms 60000

Output formats: table (default), json, markdown. Each shows model,
latency, in→out tokens, cost, quality (when --judge used), tool calls,
and any errors.
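
Programmatic use of the runner (the bin/ entry presumably wraps something
like this; import path and argument values are illustrative):

  import { runBenchmark, formatTable } from './test/helpers/benchmark-runner';

  const report = await runBenchmark({
    prompt: 'Refactor the config loader to support TOML',
    workdir: process.cwd(),
    providers: ['claude', 'gpt', 'gemini'],
    timeoutMs: 60_000,
    skipUnavailable: true,
  });
  console.log(formatTable(report));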

Known limitations for v1:
- Claude adapter approximates toolCalls as num_turns (stream-json
  would give exact counts; v2 can upgrade).
- Live E2E tests (test/providers.e2e.test.ts) not included — they
  require CI secrets for all three providers. Unit tests cover the
  shape and math.
- Provider CLIs sometimes return non-JSON error text to stdout; the
  parsers fall back to treating raw output as plain text in that case.
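
A sketch of that fallback (function name illustrative):

  function parseCliStdout(stdout: string): { json?: unknown; text: string } {
    try {
      return { json: JSON.parse(stdout), text: stdout };
    } catch {
      // Non-JSON error text from the provider CLI: keep it as plain text.
      return { text: stdout };
    }
  }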

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-17 06:16:42 +08:00

166 lines
6.0 KiB
TypeScript

/**
 * Multi-provider benchmark runner.
 *
 * Orchestrates running the same prompt across multiple provider adapters and
 * aggregates RunResult outputs + judge scores into a single report. Adapters
 * run in parallel (Promise.allSettled) so a slow provider doesn't block a fast
 * one. Per-provider auth/timeout/rate-limit errors don't abort the batch.
 */
import type { ProviderAdapter, RunOpts, RunResult } from './providers/types';
import { ClaudeAdapter } from './providers/claude';
import { GptAdapter } from './providers/gpt';
import { GeminiAdapter } from './providers/gemini';

export interface BenchmarkInput {
  prompt: string;
  workdir: string;
  timeoutMs?: number;
  /** Adapter names to run (e.g., ['claude', 'gpt', 'gemini']). */
  providers: Array<'claude' | 'gpt' | 'gemini'>;
  /** Optional per-provider model overrides. */
  models?: Partial<Record<'claude' | 'gpt' | 'gemini', string>>;
  /** If true, skip providers whose available() returns !ok. If false, include them with error. */
  skipUnavailable?: boolean;
}

export interface BenchmarkEntry {
  provider: string;
  family: 'claude' | 'gpt' | 'gemini';
  available: boolean;
  unavailable_reason?: string;
  result?: RunResult;
  costUsd?: number;
  /** Judge score 0-10 across dimensions. Populated separately by the judge step. */
  qualityScore?: number;
  qualityDetails?: Record<string, number>;
}

export interface BenchmarkReport {
  prompt: string;
  workdir: string;
  startedAt: string;
  durationMs: number;
  entries: BenchmarkEntry[];
}

const ADAPTERS: Record<'claude' | 'gpt' | 'gemini', () => ProviderAdapter> = {
  claude: () => new ClaudeAdapter(),
  gpt: () => new GptAdapter(),
  gemini: () => new GeminiAdapter(),
};

export async function runBenchmark(input: BenchmarkInput): Promise<BenchmarkReport> {
  const startedAtMs = Date.now();
  const startedAt = new Date(startedAtMs).toISOString();
  const timeoutMs = input.timeoutMs ?? 300_000;
  const entries: BenchmarkEntry[] = [];
  const runPromises: Array<Promise<void>> = [];
  for (const name of input.providers) {
    const factory = ADAPTERS[name];
    if (!factory) {
      entries.push({ provider: name, family: 'claude', available: false, unavailable_reason: `unknown provider: ${name}` });
      continue;
    }
    const adapter = factory();
    const entry: BenchmarkEntry = { provider: adapter.name, family: adapter.family, available: true };
    entries.push(entry);
    runPromises.push((async () => {
      const check = await adapter.available();
      entry.available = check.ok;
      if (!check.ok) {
        entry.unavailable_reason = check.reason;
        if (input.skipUnavailable) return;
      }
      const opts: RunOpts = {
        prompt: input.prompt,
        workdir: input.workdir,
        timeoutMs,
        model: input.models?.[name],
      };
      const res = await adapter.run(opts);
      entry.result = res;
      entry.costUsd = adapter.estimateCost(res.tokens, res.modelUsed);
    })());
  }
  await Promise.allSettled(runPromises);
  return {
    prompt: input.prompt,
    workdir: input.workdir,
    startedAt,
    durationMs: Date.now() - startedAtMs,
    entries,
  };
}

export function formatTable(report: BenchmarkReport): string {
  // Column widths mirror the pad() calls in the row templates below.
  const header = `${pad('Model', 20)} ${pad('Latency', 9)} ${pad('In→Out Tokens', 20)} ${pad('Cost', 10)} ${pad('Quality', 9)} ${pad('Tool Calls', 12)} Notes`;
  const sep = '-'.repeat(header.length);
  const rows: string[] = [header, sep];
  for (const e of report.entries) {
    if (!e.available) {
      rows.push(`${pad(e.provider, 20)} ${pad('-', 9)} ${pad('-', 20)} ${pad('-', 10)} ${pad('-', 9)} ${pad('-', 12)} unavailable: ${e.unavailable_reason ?? 'unknown'}`);
      continue;
    }
    const r = e.result!;
    if (r.error) {
      rows.push(`${pad(r.modelUsed, 20)} ${pad(msToStr(r.durationMs), 9)} ${pad(`${r.tokens.input}→${r.tokens.output}`, 20)} ${pad(fmtCost(e.costUsd), 10)} ${pad('-', 9)} ${pad(String(r.toolCalls), 12)} ERROR ${r.error.code}: ${r.error.reason.slice(0, 40)}`);
      continue;
    }
    const quality = e.qualityScore !== undefined ? `${e.qualityScore.toFixed(1)}/10` : '-';
    rows.push(`${pad(r.modelUsed, 20)} ${pad(msToStr(r.durationMs), 9)} ${pad(`${r.tokens.input}→${r.tokens.output}`, 20)} ${pad(fmtCost(e.costUsd), 10)} ${pad(quality, 9)} ${pad(String(r.toolCalls), 12)}`);
  }
  return rows.join('\n');
}

export function formatJson(report: BenchmarkReport): string {
  return JSON.stringify(report, null, 2);
}

export function formatMarkdown(report: BenchmarkReport): string {
  const lines: string[] = [
    `# Benchmark report — ${report.startedAt}`,
    '',
    `**Prompt:** ${report.prompt.length > 200 ? report.prompt.slice(0, 200) + '…' : report.prompt}`,
    `**Workdir:** \`${report.workdir}\``,
    `**Total duration:** ${msToStr(report.durationMs)}`,
    '',
    '| Model | Latency | Tokens (in→out) | Cost | Quality | Tools | Notes |',
    '|-------|---------|-----------------|------|---------|-------|-------|',
  ];
  for (const e of report.entries) {
    if (!e.available) {
      lines.push(`| ${e.provider} | - | - | - | - | - | unavailable: ${e.unavailable_reason ?? 'unknown'} |`);
      continue;
    }
    const r = e.result!;
    if (r.error) {
      lines.push(`| ${r.modelUsed} | ${msToStr(r.durationMs)} | ${r.tokens.input}→${r.tokens.output} | ${fmtCost(e.costUsd)} | - | ${r.toolCalls} | ERROR ${r.error.code}: ${r.error.reason.slice(0, 80)} |`);
      continue;
    }
    const quality = e.qualityScore !== undefined ? `${e.qualityScore.toFixed(1)}/10` : '-';
    lines.push(`| ${r.modelUsed} | ${msToStr(r.durationMs)} | ${r.tokens.input}→${r.tokens.output} | ${fmtCost(e.costUsd)} | ${quality} | ${r.toolCalls} | |`);
  }
  return lines.join('\n');
}

function pad(s: string, n: number): string {
  return s.length >= n ? s.slice(0, n) : s + ' '.repeat(n - s.length);
}

function msToStr(ms: number): string {
  if (ms < 1000) return `${ms}ms`;
  return `${(ms / 1000).toFixed(1)}s`;
}

function fmtCost(usd?: number): string {
  if (usd === undefined) return '-';
  if (usd < 0.01) return `$${usd.toFixed(4)}`;
  return `$${usd.toFixed(2)}`;
}