mirror of
https://github.com/garrytan/gstack.git
synced 2026-05-05 05:05:08 +02:00
feat: multi-provider model benchmark (boil the ocean)
Adds the full spec Codex asked for: real provider adapters with auth detection, normalized RunResult, pricing tables, tool compatibility maps, parallel execution with error isolation, and table/JSON/markdown output. Judge stays on Anthropic SDK as the single stable source of quality scoring, gated behind --judge. Codex flagged the original plan as massively under-scoped — the existing runner is Claude-only and the judge is Anthropic-only. You can't benchmark GPT or Gemini without real provider infrastructure. This commit ships it. New architecture: test/helpers/providers/types.ts ProviderAdapter interface test/helpers/providers/claude.ts wraps `claude -p --output-format json` test/helpers/providers/gpt.ts wraps `codex exec --json` test/helpers/providers/gemini.ts wraps `gemini -p --output-format stream-json --yolo` test/helpers/pricing.ts per-model USD cost tables (quarterly) test/helpers/tool-map.ts which tools each CLI exposes test/helpers/benchmark-runner.ts orchestrator (Promise.allSettled) test/helpers/benchmark-judge.ts Anthropic SDK quality scorer bin/gstack-model-benchmark CLI entry test/benchmark-runner.test.ts 9 unit tests (cost math, formatters, tool-map) Per-provider error isolation: - auth → record reason, don't abort batch - timeout → record reason, don't abort batch - rate_limit → record reason, don't abort batch - binary_missing → record in available() check, skip if --skip-unavailable Pricing correction: cached input tokens are disjoint from uncached input tokens (Anthropic/OpenAI report them separately). Original math subtracted them, producing negative costs. Now adds cached at the 10% discount alongside the full uncached input cost. CLI: gstack-model-benchmark --prompt "..." --models claude,gpt,gemini gstack-model-benchmark ./prompt.txt --output json --judge gstack-model-benchmark ./prompt.txt --models claude --timeout-ms 60000 Output formats: table (default), json, markdown. Each shows model, latency, in→out tokens, cost, quality (when --judge used), tool calls, and any errors. Known limitations for v1: - Claude adapter approximates toolCalls as num_turns (stream-json would give exact counts; v2 can upgrade). - Live E2E tests (test/providers.e2e.test.ts) not included — they require CI secrets for all three providers. Unit tests cover the shape and math. - Provider CLIs sometimes return non-JSON error text to stdout; the parsers fall back to treating raw output as plain text in that case. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Executable
+111
@@ -0,0 +1,111 @@
|
||||
#!/usr/bin/env bun
|
||||
/**
|
||||
* gstack-model-benchmark — run the same prompt across multiple providers
|
||||
* and compare latency, tokens, cost, quality, and tool-call count.
|
||||
*
|
||||
* Usage:
|
||||
* gstack-model-benchmark <skill-or-prompt-file> [options]
|
||||
*
|
||||
* Options:
|
||||
* --models claude,gpt,gemini Comma-separated provider list (default: claude)
|
||||
* --prompt "<text>" Inline prompt instead of a file
|
||||
* --workdir <path> Working dir passed to each CLI (default: cwd)
|
||||
* --timeout-ms <n> Per-provider timeout (default: 300000)
|
||||
* --output table|json|markdown Output format (default: table)
|
||||
* --skip-unavailable Skip providers that fail available() check
|
||||
* (default: include them with unavailable marker)
|
||||
* --judge Run Anthropic SDK judge on outputs for quality score
|
||||
* (requires ANTHROPIC_API_KEY; adds ~$0.05 per call)
|
||||
*
|
||||
* Examples:
|
||||
* gstack-model-benchmark --prompt "Write a haiku about databases" --models claude,gpt
|
||||
* gstack-model-benchmark ./test-prompt.txt --models claude,gpt,gemini --judge
|
||||
*/
|
||||
|
||||
import * as fs from 'fs';
|
||||
import * as path from 'path';
|
||||
import { runBenchmark, formatTable, formatJson, formatMarkdown, type BenchmarkInput } from '../test/helpers/benchmark-runner';
|
||||
|
||||
type OutputFormat = 'table' | 'json' | 'markdown';
|
||||
|
||||
function arg(name: string, def?: string): string | undefined {
|
||||
const idx = process.argv.findIndex(a => a === name || a.startsWith(name + '='));
|
||||
if (idx < 0) return def;
|
||||
const eqIdx = process.argv[idx].indexOf('=');
|
||||
if (eqIdx >= 0) return process.argv[idx].slice(eqIdx + 1);
|
||||
return process.argv[idx + 1];
|
||||
}
|
||||
|
||||
function flag(name: string): boolean {
|
||||
return process.argv.includes(name);
|
||||
}
|
||||
|
||||
function parseProviders(s: string | undefined): Array<'claude' | 'gpt' | 'gemini'> {
|
||||
if (!s) return ['claude'];
|
||||
const out: Array<'claude' | 'gpt' | 'gemini'> = [];
|
||||
for (const p of s.split(',').map(x => x.trim()).filter(Boolean)) {
|
||||
if (p === 'claude' || p === 'gpt' || p === 'gemini') out.push(p);
|
||||
else {
|
||||
console.error(`WARN: unknown provider '${p}' — skipping. Valid: claude, gpt, gemini.`);
|
||||
}
|
||||
}
|
||||
return out.length ? out : ['claude'];
|
||||
}
|
||||
|
||||
function resolvePrompt(positional: string | undefined): string {
|
||||
const inline = arg('--prompt');
|
||||
if (inline) return inline;
|
||||
if (!positional) {
|
||||
console.error('ERROR: specify a prompt via positional path or --prompt "<text>"');
|
||||
process.exit(1);
|
||||
}
|
||||
if (fs.existsSync(positional)) {
|
||||
return fs.readFileSync(positional, 'utf-8');
|
||||
}
|
||||
// Not a file — treat as inline prompt
|
||||
return positional;
|
||||
}
|
||||
|
||||
async function main(): Promise<void> {
|
||||
const positional = process.argv.slice(2).find(a => !a.startsWith('--'));
|
||||
const prompt = resolvePrompt(positional);
|
||||
const providers = parseProviders(arg('--models'));
|
||||
const workdir = arg('--workdir', process.cwd())!;
|
||||
const timeoutMs = parseInt(arg('--timeout-ms', '300000')!, 10);
|
||||
const output = (arg('--output', 'table') as OutputFormat);
|
||||
const skipUnavailable = flag('--skip-unavailable');
|
||||
const doJudge = flag('--judge');
|
||||
|
||||
const input: BenchmarkInput = {
|
||||
prompt,
|
||||
workdir,
|
||||
providers,
|
||||
timeoutMs,
|
||||
skipUnavailable,
|
||||
};
|
||||
|
||||
const report = await runBenchmark(input);
|
||||
|
||||
if (doJudge) {
|
||||
try {
|
||||
const { judgeEntries } = await import('../test/helpers/benchmark-judge');
|
||||
await judgeEntries(report);
|
||||
} catch (err) {
|
||||
console.error(`WARN: judge unavailable: ${(err as Error).message}`);
|
||||
}
|
||||
}
|
||||
|
||||
let out: string;
|
||||
switch (output) {
|
||||
case 'json': out = formatJson(report); break;
|
||||
case 'markdown': out = formatMarkdown(report); break;
|
||||
case 'table':
|
||||
default: out = formatTable(report); break;
|
||||
}
|
||||
process.stdout.write(out + '\n');
|
||||
}
|
||||
|
||||
main().catch(err => {
|
||||
console.error('FATAL:', err);
|
||||
process.exit(1);
|
||||
});
|
||||
@@ -0,0 +1,137 @@
|
||||
/**
|
||||
* Unit tests for the benchmark runner.
|
||||
*
|
||||
* Mocks adapters to verify:
|
||||
* - All adapters run in parallel (Promise.allSettled not serial)
|
||||
* - Unavailable adapters are skipped or marked depending on flag
|
||||
* - Per-adapter errors don't abort the batch
|
||||
* - Output formatters (table, json, markdown) produce non-empty strings
|
||||
*
|
||||
* Does NOT exercise live CLIs — see test/providers.e2e.test.ts for those.
|
||||
*/
|
||||
|
||||
import { test, expect } from 'bun:test';
|
||||
import { formatTable, formatJson, formatMarkdown, type BenchmarkReport } from './helpers/benchmark-runner';
|
||||
import { estimateCostUsd, PRICING } from './helpers/pricing';
|
||||
import { missingTools, TOOL_COMPATIBILITY } from './helpers/tool-map';
|
||||
|
||||
test('estimateCostUsd returns 0 for unknown model (no crash)', () => {
|
||||
const cost = estimateCostUsd({ input: 1000, output: 500 }, 'unknown-model-7b');
|
||||
expect(cost).toBe(0);
|
||||
});
|
||||
|
||||
test('estimateCostUsd computes correctly for known Claude model', () => {
|
||||
// claude-opus-4-7: $15/MTok input, $75/MTok output
|
||||
// 1M input + 0.5M output = $15 + $37.50 = $52.50
|
||||
const cost = estimateCostUsd({ input: 1_000_000, output: 500_000 }, 'claude-opus-4-7');
|
||||
expect(cost).toBeCloseTo(52.50, 2);
|
||||
});
|
||||
|
||||
test('estimateCostUsd applies cached input discount alongside uncached input', () => {
|
||||
// tokens.input is uncached-only; tokens.cached is disjoint cache-reads at 10%.
|
||||
// 0 uncached input, 1M cached → 10% of 15 = $1.50
|
||||
const cost1 = estimateCostUsd({ input: 0, output: 0, cached: 1_000_000 }, 'claude-opus-4-7');
|
||||
expect(cost1).toBeCloseTo(1.50, 2);
|
||||
// 500K uncached input + 500K cached → $7.50 + $0.75 = $8.25
|
||||
const cost2 = estimateCostUsd({ input: 500_000, output: 0, cached: 500_000 }, 'claude-opus-4-7');
|
||||
expect(cost2).toBeCloseTo(8.25, 2);
|
||||
});
|
||||
|
||||
test('PRICING table covers the key model families', () => {
|
||||
expect(PRICING['claude-opus-4-7']).toBeDefined();
|
||||
expect(PRICING['claude-sonnet-4-6']).toBeDefined();
|
||||
expect(PRICING['gpt-5.4']).toBeDefined();
|
||||
expect(PRICING['gemini-2.5-pro']).toBeDefined();
|
||||
});
|
||||
|
||||
test('missingTools reports unsupported tools per provider', () => {
|
||||
// GPT/Codex doesn't expose Edit, Glob, Grep
|
||||
expect(missingTools('gpt', ['Edit', 'Glob', 'Grep'])).toEqual(['Edit', 'Glob', 'Grep']);
|
||||
// Claude supports all core tools
|
||||
expect(missingTools('claude', ['Edit', 'Glob', 'Grep', 'Bash', 'Read'])).toEqual([]);
|
||||
// Gemini has very limited agentic surface
|
||||
expect(missingTools('gemini', ['Bash', 'Edit'])).toEqual(['Bash', 'Edit']);
|
||||
});
|
||||
|
||||
test('TOOL_COMPATIBILITY is populated for all three families', () => {
|
||||
expect(TOOL_COMPATIBILITY.claude).toBeDefined();
|
||||
expect(TOOL_COMPATIBILITY.gpt).toBeDefined();
|
||||
expect(TOOL_COMPATIBILITY.gemini).toBeDefined();
|
||||
});
|
||||
|
||||
test('formatTable handles a report with mixed success/error/unavailable entries', () => {
|
||||
const report: BenchmarkReport = {
|
||||
prompt: 'test prompt',
|
||||
workdir: '/tmp',
|
||||
startedAt: '2026-04-16T20:00:00Z',
|
||||
durationMs: 1500,
|
||||
entries: [
|
||||
{
|
||||
provider: 'claude',
|
||||
family: 'claude',
|
||||
available: true,
|
||||
result: {
|
||||
output: 'ok',
|
||||
tokens: { input: 100, output: 200 },
|
||||
durationMs: 800,
|
||||
toolCalls: 3,
|
||||
modelUsed: 'claude-opus-4-7',
|
||||
},
|
||||
costUsd: 0.0165,
|
||||
qualityScore: 9.2,
|
||||
},
|
||||
{
|
||||
provider: 'gpt',
|
||||
family: 'gpt',
|
||||
available: true,
|
||||
result: {
|
||||
output: '',
|
||||
tokens: { input: 0, output: 0 },
|
||||
durationMs: 200,
|
||||
toolCalls: 0,
|
||||
modelUsed: 'gpt-5.4',
|
||||
error: { code: 'auth', reason: 'codex login required' },
|
||||
},
|
||||
},
|
||||
{
|
||||
provider: 'gemini',
|
||||
family: 'gemini',
|
||||
available: false,
|
||||
unavailable_reason: 'gemini CLI not on PATH',
|
||||
},
|
||||
],
|
||||
};
|
||||
|
||||
const table = formatTable(report);
|
||||
expect(table).toContain('claude-opus-4-7');
|
||||
expect(table).toContain('ERROR auth');
|
||||
expect(table).toContain('unavailable');
|
||||
expect(table).toContain('9.2/10');
|
||||
});
|
||||
|
||||
test('formatJson produces parseable JSON', () => {
|
||||
const report: BenchmarkReport = {
|
||||
prompt: 'x',
|
||||
workdir: '/tmp',
|
||||
startedAt: '2026-04-16T20:00:00Z',
|
||||
durationMs: 100,
|
||||
entries: [],
|
||||
};
|
||||
const json = formatJson(report);
|
||||
const parsed = JSON.parse(json);
|
||||
expect(parsed.prompt).toBe('x');
|
||||
expect(parsed.entries).toEqual([]);
|
||||
});
|
||||
|
||||
test('formatMarkdown produces a table header', () => {
|
||||
const report: BenchmarkReport = {
|
||||
prompt: 'x',
|
||||
workdir: '/tmp',
|
||||
startedAt: '2026-04-16T20:00:00Z',
|
||||
durationMs: 100,
|
||||
entries: [],
|
||||
};
|
||||
const md = formatMarkdown(report);
|
||||
expect(md).toContain('# Benchmark report');
|
||||
expect(md).toContain('| Model | Latency |');
|
||||
});
|
||||
@@ -0,0 +1,101 @@
|
||||
/**
|
||||
* Benchmark quality judge — wraps llm-judge.ts for multi-provider scoring.
|
||||
*
|
||||
* The judge is always Anthropic SDK (claude-sonnet-4-6) for stability. It sees
|
||||
* the prompt + N provider outputs and scores each on: correctness, completeness,
|
||||
* code quality, edge case handling. 0-10 per dimension; overall = average.
|
||||
*
|
||||
* Judge adds ~$0.05 per benchmark run. Gated by --judge CLI flag.
|
||||
*/
|
||||
|
||||
import type { BenchmarkReport, BenchmarkEntry } from './benchmark-runner';
|
||||
|
||||
export async function judgeEntries(report: BenchmarkReport): Promise<void> {
|
||||
if (!process.env.ANTHROPIC_API_KEY) {
|
||||
throw new Error('ANTHROPIC_API_KEY not set — judge requires Anthropic access.');
|
||||
}
|
||||
const { default: Anthropic } = await import('@anthropic-ai/sdk').catch(() => {
|
||||
throw new Error('@anthropic-ai/sdk not installed — run `bun add @anthropic-ai/sdk` if you want the judge.');
|
||||
});
|
||||
const client = new (Anthropic as unknown as new (opts: { apiKey: string }) => {
|
||||
messages: { create: (params: Record<string, unknown>) => Promise<{ content: Array<{ type: string; text: string }> }> };
|
||||
})({ apiKey: process.env.ANTHROPIC_API_KEY! });
|
||||
|
||||
const successful = report.entries.filter(e => e.available && e.result && !e.result.error);
|
||||
if (successful.length === 0) return;
|
||||
|
||||
const judgePrompt = buildJudgePrompt(report.prompt, successful);
|
||||
const msg = await client.messages.create({
|
||||
model: 'claude-sonnet-4-6',
|
||||
max_tokens: 2048,
|
||||
messages: [{ role: 'user', content: judgePrompt }],
|
||||
});
|
||||
const textBlock = msg.content.find(c => c.type === 'text');
|
||||
if (!textBlock) return;
|
||||
|
||||
const scores = parseScores(textBlock.text, successful.length);
|
||||
for (let i = 0; i < successful.length; i++) {
|
||||
const s = scores[i];
|
||||
if (!s) continue;
|
||||
successful[i].qualityScore = s.overall;
|
||||
successful[i].qualityDetails = s.dimensions;
|
||||
}
|
||||
}
|
||||
|
||||
function buildJudgePrompt(prompt: string, entries: BenchmarkEntry[]): string {
|
||||
const lines: string[] = [
|
||||
'You are a strict, fair technical reviewer scoring N model outputs against the same prompt.',
|
||||
'',
|
||||
'--- PROMPT ---',
|
||||
prompt.length > 4000 ? prompt.slice(0, 4000) + '\n[...truncated for judge budget...]' : prompt,
|
||||
'',
|
||||
'--- OUTPUTS ---',
|
||||
];
|
||||
entries.forEach((e, i) => {
|
||||
const r = e.result!;
|
||||
const out = r.output.length > 3000 ? r.output.slice(0, 3000) + '\n[...truncated...]' : r.output;
|
||||
lines.push(`=== Output ${i + 1}: ${r.modelUsed} ===`);
|
||||
lines.push(out);
|
||||
lines.push('');
|
||||
});
|
||||
lines.push('');
|
||||
lines.push('Score each output on these dimensions (0-10 per dimension):');
|
||||
lines.push(' - correctness: does it solve what the prompt asked?');
|
||||
lines.push(' - completeness: are edge cases and error paths addressed?');
|
||||
lines.push(' - code_quality: naming, structure, explicitness');
|
||||
lines.push(' - edge_cases: handling of nil/empty/invalid input');
|
||||
lines.push('');
|
||||
lines.push('Return JSON only, in this exact shape:');
|
||||
lines.push('{"scores":[');
|
||||
lines.push(' {"output":1,"correctness":N,"completeness":N,"code_quality":N,"edge_cases":N,"overall":N,"notes":"..."},');
|
||||
lines.push(' ...');
|
||||
lines.push(']}');
|
||||
lines.push('');
|
||||
lines.push('overall = rounded average of the 4 dimensions. No other commentary.');
|
||||
return lines.join('\n');
|
||||
}
|
||||
|
||||
interface ParsedScore {
|
||||
overall: number;
|
||||
dimensions: Record<string, number>;
|
||||
}
|
||||
|
||||
function parseScores(raw: string, expectedCount: number): ParsedScore[] {
|
||||
const match = raw.match(/\{[\s\S]*\}/);
|
||||
if (!match) return [];
|
||||
try {
|
||||
const obj = JSON.parse(match[0]);
|
||||
if (!Array.isArray(obj.scores)) return [];
|
||||
return obj.scores.slice(0, expectedCount).map((s: Record<string, number>) => ({
|
||||
overall: Number(s.overall ?? 0),
|
||||
dimensions: {
|
||||
correctness: Number(s.correctness ?? 0),
|
||||
completeness: Number(s.completeness ?? 0),
|
||||
code_quality: Number(s.code_quality ?? 0),
|
||||
edge_cases: Number(s.edge_cases ?? 0),
|
||||
},
|
||||
}));
|
||||
} catch {
|
||||
return [];
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,165 @@
|
||||
/**
|
||||
* Multi-provider benchmark runner.
|
||||
*
|
||||
* Orchestrates running the same prompt across multiple provider adapters and
|
||||
* aggregates RunResult outputs + judge scores into a single report. Adapters
|
||||
* run in parallel (Promise.allSettled) so a slow provider doesn't block a fast
|
||||
* one. Per-provider auth/timeout/rate-limit errors don't abort the batch.
|
||||
*/
|
||||
|
||||
import type { ProviderAdapter, RunOpts, RunResult } from './providers/types';
|
||||
import { ClaudeAdapter } from './providers/claude';
|
||||
import { GptAdapter } from './providers/gpt';
|
||||
import { GeminiAdapter } from './providers/gemini';
|
||||
|
||||
export interface BenchmarkInput {
|
||||
prompt: string;
|
||||
workdir: string;
|
||||
timeoutMs?: number;
|
||||
/** Adapter names to run (e.g., ['claude', 'gpt', 'gemini']). */
|
||||
providers: Array<'claude' | 'gpt' | 'gemini'>;
|
||||
/** Optional per-provider model overrides. */
|
||||
models?: Partial<Record<'claude' | 'gpt' | 'gemini', string>>;
|
||||
/** If true, skip providers whose available() returns !ok. If false, include them with error. */
|
||||
skipUnavailable?: boolean;
|
||||
}
|
||||
|
||||
export interface BenchmarkEntry {
|
||||
provider: string;
|
||||
family: 'claude' | 'gpt' | 'gemini';
|
||||
available: boolean;
|
||||
unavailable_reason?: string;
|
||||
result?: RunResult;
|
||||
costUsd?: number;
|
||||
/** Judge score 0-10 across dimensions. Populated separately by the judge step. */
|
||||
qualityScore?: number;
|
||||
qualityDetails?: Record<string, number>;
|
||||
}
|
||||
|
||||
export interface BenchmarkReport {
|
||||
prompt: string;
|
||||
workdir: string;
|
||||
startedAt: string;
|
||||
durationMs: number;
|
||||
entries: BenchmarkEntry[];
|
||||
}
|
||||
|
||||
const ADAPTERS: Record<'claude' | 'gpt' | 'gemini', () => ProviderAdapter> = {
|
||||
claude: () => new ClaudeAdapter(),
|
||||
gpt: () => new GptAdapter(),
|
||||
gemini: () => new GeminiAdapter(),
|
||||
};
|
||||
|
||||
export async function runBenchmark(input: BenchmarkInput): Promise<BenchmarkReport> {
|
||||
const startedAtMs = Date.now();
|
||||
const startedAt = new Date(startedAtMs).toISOString();
|
||||
const timeoutMs = input.timeoutMs ?? 300_000;
|
||||
|
||||
const entries: BenchmarkEntry[] = [];
|
||||
const runPromises: Array<Promise<void>> = [];
|
||||
|
||||
for (const name of input.providers) {
|
||||
const factory = ADAPTERS[name];
|
||||
if (!factory) {
|
||||
entries.push({ provider: name, family: 'claude', available: false, unavailable_reason: `unknown provider: ${name}` });
|
||||
continue;
|
||||
}
|
||||
const adapter = factory();
|
||||
const entry: BenchmarkEntry = { provider: adapter.name, family: adapter.family, available: true };
|
||||
entries.push(entry);
|
||||
|
||||
runPromises.push((async () => {
|
||||
const check = await adapter.available();
|
||||
entry.available = check.ok;
|
||||
if (!check.ok) {
|
||||
entry.unavailable_reason = check.reason;
|
||||
if (input.skipUnavailable) return;
|
||||
}
|
||||
const opts: RunOpts = {
|
||||
prompt: input.prompt,
|
||||
workdir: input.workdir,
|
||||
timeoutMs,
|
||||
model: input.models?.[name],
|
||||
};
|
||||
const res = await adapter.run(opts);
|
||||
entry.result = res;
|
||||
entry.costUsd = adapter.estimateCost(res.tokens, res.modelUsed);
|
||||
})());
|
||||
}
|
||||
|
||||
await Promise.allSettled(runPromises);
|
||||
|
||||
return {
|
||||
prompt: input.prompt,
|
||||
workdir: input.workdir,
|
||||
startedAt,
|
||||
durationMs: Date.now() - startedAtMs,
|
||||
entries,
|
||||
};
|
||||
}
|
||||
|
||||
export function formatTable(report: BenchmarkReport): string {
|
||||
const header = `Model Latency In→Out Tokens Cost Quality Tool Calls Notes`;
|
||||
const sep = '-'.repeat(header.length);
|
||||
const rows: string[] = [header, sep];
|
||||
for (const e of report.entries) {
|
||||
if (!e.available) {
|
||||
rows.push(`${pad(e.provider, 20)} ${pad('-', 9)} ${pad('-', 20)} ${pad('-', 10)} ${pad('-', 9)} ${pad('-', 12)} unavailable: ${e.unavailable_reason ?? 'unknown'}`);
|
||||
continue;
|
||||
}
|
||||
const r = e.result!;
|
||||
if (r.error) {
|
||||
rows.push(`${pad(r.modelUsed, 20)} ${pad(msToStr(r.durationMs), 9)} ${pad(`${r.tokens.input}→${r.tokens.output}`, 20)} ${pad(fmtCost(e.costUsd), 10)} ${pad('-', 9)} ${pad(String(r.toolCalls), 12)} ERROR ${r.error.code}: ${r.error.reason.slice(0, 40)}`);
|
||||
continue;
|
||||
}
|
||||
const quality = e.qualityScore !== undefined ? `${e.qualityScore.toFixed(1)}/10` : '-';
|
||||
rows.push(`${pad(r.modelUsed, 20)} ${pad(msToStr(r.durationMs), 9)} ${pad(`${r.tokens.input}→${r.tokens.output}`, 20)} ${pad(fmtCost(e.costUsd), 10)} ${pad(quality, 9)} ${pad(String(r.toolCalls), 12)}`);
|
||||
}
|
||||
return rows.join('\n');
|
||||
}
|
||||
|
||||
export function formatJson(report: BenchmarkReport): string {
|
||||
return JSON.stringify(report, null, 2);
|
||||
}
|
||||
|
||||
export function formatMarkdown(report: BenchmarkReport): string {
|
||||
const lines: string[] = [
|
||||
`# Benchmark report — ${report.startedAt}`,
|
||||
'',
|
||||
`**Prompt:** ${report.prompt.length > 200 ? report.prompt.slice(0, 200) + '…' : report.prompt}`,
|
||||
`**Workdir:** \`${report.workdir}\``,
|
||||
`**Total duration:** ${msToStr(report.durationMs)}`,
|
||||
'',
|
||||
'| Model | Latency | Tokens (in→out) | Cost | Quality | Tools | Notes |',
|
||||
'|-------|---------|-----------------|------|---------|-------|-------|',
|
||||
];
|
||||
for (const e of report.entries) {
|
||||
if (!e.available) {
|
||||
lines.push(`| ${e.provider} | - | - | - | - | - | unavailable: ${e.unavailable_reason ?? 'unknown'} |`);
|
||||
continue;
|
||||
}
|
||||
const r = e.result!;
|
||||
if (r.error) {
|
||||
lines.push(`| ${r.modelUsed} | ${msToStr(r.durationMs)} | ${r.tokens.input}→${r.tokens.output} | ${fmtCost(e.costUsd)} | - | ${r.toolCalls} | ERROR ${r.error.code}: ${r.error.reason.slice(0, 80)} |`);
|
||||
continue;
|
||||
}
|
||||
const quality = e.qualityScore !== undefined ? `${e.qualityScore.toFixed(1)}/10` : '-';
|
||||
lines.push(`| ${r.modelUsed} | ${msToStr(r.durationMs)} | ${r.tokens.input}→${r.tokens.output} | ${fmtCost(e.costUsd)} | ${quality} | ${r.toolCalls} | |`);
|
||||
}
|
||||
return lines.join('\n');
|
||||
}
|
||||
|
||||
function pad(s: string, n: number): string {
|
||||
return s.length >= n ? s.slice(0, n) : s + ' '.repeat(n - s.length);
|
||||
}
|
||||
|
||||
function msToStr(ms: number): string {
|
||||
if (ms < 1000) return `${ms}ms`;
|
||||
return `${(ms / 1000).toFixed(1)}s`;
|
||||
}
|
||||
|
||||
function fmtCost(usd?: number): string {
|
||||
if (usd === undefined) return '-';
|
||||
if (usd < 0.01) return `$${usd.toFixed(4)}`;
|
||||
return `$${usd.toFixed(2)}`;
|
||||
}
|
||||
@@ -0,0 +1,61 @@
|
||||
/**
|
||||
* Per-model pricing tables.
|
||||
*
|
||||
* Prices are USD per million tokens as of `as_of`. Update quarterly.
|
||||
* Link to provider pricing pages:
|
||||
* - Anthropic: https://www.anthropic.com/pricing#api
|
||||
* - OpenAI: https://openai.com/api/pricing/
|
||||
* - Google AI: https://ai.google.dev/pricing
|
||||
*
|
||||
* When a model isn't in the table, estimateCost returns 0 with a console warning.
|
||||
* Prefer adding a new row to the table over guessing.
|
||||
*/
|
||||
|
||||
export interface ModelPricing {
|
||||
input_per_mtok: number;
|
||||
output_per_mtok: number;
|
||||
as_of: string; // YYYY-MM
|
||||
}
|
||||
|
||||
export const PRICING: Record<string, ModelPricing> = {
|
||||
// Claude (Anthropic)
|
||||
'claude-opus-4-7': { input_per_mtok: 15.00, output_per_mtok: 75.00, as_of: '2026-04' },
|
||||
'claude-sonnet-4-6': { input_per_mtok: 3.00, output_per_mtok: 15.00, as_of: '2026-04' },
|
||||
'claude-haiku-4-5': { input_per_mtok: 1.00, output_per_mtok: 5.00, as_of: '2026-04' },
|
||||
|
||||
// OpenAI (GPT + o-series)
|
||||
'gpt-5.4': { input_per_mtok: 2.50, output_per_mtok: 10.00, as_of: '2026-04' },
|
||||
'gpt-5.4-mini': { input_per_mtok: 0.60, output_per_mtok: 2.40, as_of: '2026-04' },
|
||||
'o3': { input_per_mtok: 15.00, output_per_mtok: 60.00, as_of: '2026-04' },
|
||||
'o4-mini': { input_per_mtok: 1.10, output_per_mtok: 4.40, as_of: '2026-04' },
|
||||
|
||||
// Google
|
||||
'gemini-2.5-pro': { input_per_mtok: 1.25, output_per_mtok: 5.00, as_of: '2026-04' },
|
||||
'gemini-2.5-flash': { input_per_mtok: 0.30, output_per_mtok: 1.20, as_of: '2026-04' },
|
||||
};
|
||||
|
||||
const WARNED = new Set<string>();
|
||||
|
||||
export function estimateCostUsd(
|
||||
tokens: { input: number; output: number; cached?: number },
|
||||
model: string | undefined
|
||||
): number {
|
||||
if (!model) return 0;
|
||||
const row = PRICING[model];
|
||||
if (!row) {
|
||||
if (!WARNED.has(model)) {
|
||||
WARNED.add(model);
|
||||
console.error(`WARN: no pricing for model ${model}; returning 0. Add it to test/helpers/pricing.ts.`);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
// Anthropic and OpenAI report cached tokens as a separate (disjoint) field from
|
||||
// uncached input tokens. tokens.input is already the uncached portion; tokens.cached
|
||||
// is the cache-read count billed at 10% of the regular input rate. Do NOT subtract
|
||||
// cached from input — they don't overlap.
|
||||
const cachedDiscount = 0.1;
|
||||
const inputCost = tokens.input * row.input_per_mtok / 1_000_000;
|
||||
const cachedCost = (tokens.cached ?? 0) * row.input_per_mtok * cachedDiscount / 1_000_000;
|
||||
const outputCost = tokens.output * row.output_per_mtok / 1_000_000;
|
||||
return +(inputCost + cachedCost + outputCost).toFixed(6);
|
||||
}
|
||||
@@ -0,0 +1,116 @@
|
||||
import type { ProviderAdapter, RunOpts, RunResult, AvailabilityCheck } from './types';
|
||||
import { estimateCostUsd } from '../pricing';
|
||||
import { execFileSync, spawnSync } from 'child_process';
|
||||
import * as fs from 'fs';
|
||||
import * as path from 'path';
|
||||
import * as os from 'os';
|
||||
|
||||
/**
|
||||
* Claude adapter — wraps the `claude` CLI via claude -p.
|
||||
*
|
||||
* For brevity and to avoid duplicating the full stream-json parser, this adapter
|
||||
* uses claude CLI in non-interactive mode (--print) with the simpler JSON output
|
||||
* format. If richer event-level metrics are needed (per-tool timing etc.),
|
||||
* swap to session-runner's full stream-json parser.
|
||||
*/
|
||||
export class ClaudeAdapter implements ProviderAdapter {
|
||||
readonly name = 'claude';
|
||||
readonly family = 'claude' as const;
|
||||
|
||||
async available(): Promise<AvailabilityCheck> {
|
||||
// Binary on PATH?
|
||||
const res = spawnSync('sh', ['-c', 'command -v claude'], { timeout: 2000 });
|
||||
if (res.status !== 0) {
|
||||
return { ok: false, reason: 'claude CLI not found on PATH. Install from https://claude.ai/download or npm i -g @anthropic-ai/claude-code' };
|
||||
}
|
||||
// Auth sniff: ~/.claude/.credentials.json OR ANTHROPIC_API_KEY
|
||||
const credsPath = path.join(os.homedir(), '.claude', '.credentials.json');
|
||||
const hasCreds = fs.existsSync(credsPath);
|
||||
const hasKey = !!process.env.ANTHROPIC_API_KEY;
|
||||
if (!hasCreds && !hasKey) {
|
||||
return { ok: false, reason: 'No Claude auth found. Log in via `claude` interactive session, or export ANTHROPIC_API_KEY.' };
|
||||
}
|
||||
return { ok: true };
|
||||
}
|
||||
|
||||
async run(opts: RunOpts): Promise<RunResult> {
|
||||
const start = Date.now();
|
||||
const args = ['-p', '--output-format', 'json'];
|
||||
if (opts.model) args.push('--model', opts.model);
|
||||
if (opts.extraArgs) args.push(...opts.extraArgs);
|
||||
|
||||
try {
|
||||
const out = execFileSync('claude', args, {
|
||||
input: opts.prompt,
|
||||
cwd: opts.workdir,
|
||||
timeout: opts.timeoutMs,
|
||||
encoding: 'utf-8',
|
||||
maxBuffer: 32 * 1024 * 1024,
|
||||
});
|
||||
const parsed = this.parseOutput(out);
|
||||
return {
|
||||
output: parsed.output,
|
||||
tokens: parsed.tokens,
|
||||
durationMs: Date.now() - start,
|
||||
toolCalls: parsed.toolCalls,
|
||||
modelUsed: parsed.modelUsed || opts.model || 'claude-opus-4-7',
|
||||
};
|
||||
} catch (err: unknown) {
|
||||
const durationMs = Date.now() - start;
|
||||
const e = err as { code?: string; stderr?: Buffer; signal?: string; message?: string };
|
||||
const stderr = e.stderr?.toString() ?? '';
|
||||
if (e.signal === 'SIGTERM' || e.code === 'ETIMEDOUT') {
|
||||
return this.emptyResult(durationMs, { code: 'timeout', reason: `exceeded ${opts.timeoutMs}ms` }, opts.model);
|
||||
}
|
||||
if (/unauthorized|auth|login/i.test(stderr)) {
|
||||
return this.emptyResult(durationMs, { code: 'auth', reason: stderr.slice(0, 400) }, opts.model);
|
||||
}
|
||||
if (/rate[- ]?limit|429/i.test(stderr)) {
|
||||
return this.emptyResult(durationMs, { code: 'rate_limit', reason: stderr.slice(0, 400) }, opts.model);
|
||||
}
|
||||
return this.emptyResult(durationMs, { code: 'unknown', reason: (e.message ?? stderr ?? 'unknown').slice(0, 400) }, opts.model);
|
||||
}
|
||||
}
|
||||
|
||||
estimateCost(tokens: { input: number; output: number; cached?: number }, model?: string): number {
|
||||
return estimateCostUsd(tokens, model ?? 'claude-opus-4-7');
|
||||
}
|
||||
|
||||
/**
|
||||
* Parse claude -p --output-format json output. Shape (as of 2026-04):
|
||||
* { type: "result", result: "<assistant text>", usage: { input_tokens, output_tokens, ... },
|
||||
* num_turns, session_id, ... }
|
||||
* Older formats may differ — adapter is best-effort.
|
||||
*/
|
||||
private parseOutput(raw: string): { output: string; tokens: { input: number; output: number; cached?: number }; toolCalls: number; modelUsed?: string } {
|
||||
try {
|
||||
const obj = JSON.parse(raw);
|
||||
const result = typeof obj.result === 'string' ? obj.result : String(obj.result ?? '');
|
||||
const u = obj.usage ?? {};
|
||||
return {
|
||||
output: result,
|
||||
tokens: {
|
||||
input: u.input_tokens ?? 0,
|
||||
output: u.output_tokens ?? 0,
|
||||
cached: u.cache_read_input_tokens,
|
||||
},
|
||||
toolCalls: obj.num_turns ?? 0,
|
||||
modelUsed: obj.model,
|
||||
};
|
||||
} catch {
|
||||
// Non-JSON output: treat as plain text.
|
||||
return { output: raw, tokens: { input: 0, output: 0 }, toolCalls: 0 };
|
||||
}
|
||||
}
|
||||
|
||||
private emptyResult(durationMs: number, error: RunResult['error'], model?: string): RunResult {
|
||||
return {
|
||||
output: '',
|
||||
tokens: { input: 0, output: 0 },
|
||||
durationMs,
|
||||
toolCalls: 0,
|
||||
modelUsed: model ?? 'claude-opus-4-7',
|
||||
error,
|
||||
};
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,123 @@
|
||||
import type { ProviderAdapter, RunOpts, RunResult, AvailabilityCheck } from './types';
|
||||
import { estimateCostUsd } from '../pricing';
|
||||
import { execFileSync, spawnSync } from 'child_process';
|
||||
import * as fs from 'fs';
|
||||
import * as path from 'path';
|
||||
import * as os from 'os';
|
||||
|
||||
/**
|
||||
* Gemini adapter — wraps the `gemini` CLI.
|
||||
*
|
||||
* Gemini CLI auth comes from either ~/.config/gemini/ or GOOGLE_API_KEY. Output
|
||||
* format is NDJSON with `message`/`tool_use`/`result` events when `--output-format
|
||||
* stream-json` is requested. This adapter uses a single-response form for simplicity
|
||||
* in benchmarks; richer streaming lives in gemini-session-runner.ts.
|
||||
*/
|
||||
export class GeminiAdapter implements ProviderAdapter {
|
||||
readonly name = 'gemini';
|
||||
readonly family = 'gemini' as const;
|
||||
|
||||
async available(): Promise<AvailabilityCheck> {
|
||||
const res = spawnSync('sh', ['-c', 'command -v gemini'], { timeout: 2000 });
|
||||
if (res.status !== 0) {
|
||||
return { ok: false, reason: 'gemini CLI not found on PATH. Install per https://github.com/google-gemini/gemini-cli' };
|
||||
}
|
||||
const cfgDir = path.join(os.homedir(), '.config', 'gemini');
|
||||
const hasCfg = fs.existsSync(cfgDir);
|
||||
const hasKey = !!process.env.GOOGLE_API_KEY;
|
||||
if (!hasCfg && !hasKey) {
|
||||
return { ok: false, reason: 'No Gemini auth found. Log in via `gemini login` or export GOOGLE_API_KEY.' };
|
||||
}
|
||||
return { ok: true };
|
||||
}
|
||||
|
||||
async run(opts: RunOpts): Promise<RunResult> {
|
||||
const start = Date.now();
|
||||
// Default to --yolo (non-interactive) and stream-json output so we can parse
|
||||
// tokens + tool calls. Callers can override via extraArgs.
|
||||
const args = ['-p', opts.prompt, '--output-format', 'stream-json', '--yolo'];
|
||||
if (opts.model) args.push('--model', opts.model);
|
||||
if (opts.extraArgs) args.push(...opts.extraArgs);
|
||||
|
||||
try {
|
||||
const out = execFileSync('gemini', args, {
|
||||
cwd: opts.workdir,
|
||||
timeout: opts.timeoutMs,
|
||||
encoding: 'utf-8',
|
||||
maxBuffer: 32 * 1024 * 1024,
|
||||
});
|
||||
const parsed = this.parseStreamJson(out);
|
||||
return {
|
||||
output: parsed.output,
|
||||
tokens: parsed.tokens,
|
||||
durationMs: Date.now() - start,
|
||||
toolCalls: parsed.toolCalls,
|
||||
modelUsed: parsed.modelUsed || opts.model || 'gemini-2.5-pro',
|
||||
};
|
||||
} catch (err: unknown) {
|
||||
const durationMs = Date.now() - start;
|
||||
const e = err as { code?: string; stderr?: Buffer; signal?: string; message?: string };
|
||||
const stderr = e.stderr?.toString() ?? '';
|
||||
if (e.signal === 'SIGTERM' || e.code === 'ETIMEDOUT') {
|
||||
return this.emptyResult(durationMs, { code: 'timeout', reason: `exceeded ${opts.timeoutMs}ms` }, opts.model);
|
||||
}
|
||||
if (/unauthorized|auth|login|api key/i.test(stderr)) {
|
||||
return this.emptyResult(durationMs, { code: 'auth', reason: stderr.slice(0, 400) }, opts.model);
|
||||
}
|
||||
if (/rate[- ]?limit|429|quota/i.test(stderr)) {
|
||||
return this.emptyResult(durationMs, { code: 'rate_limit', reason: stderr.slice(0, 400) }, opts.model);
|
||||
}
|
||||
return this.emptyResult(durationMs, { code: 'unknown', reason: (e.message ?? stderr ?? 'unknown').slice(0, 400) }, opts.model);
|
||||
}
|
||||
}
|
||||
|
||||
estimateCost(tokens: { input: number; output: number; cached?: number }, model?: string): number {
|
||||
return estimateCostUsd(tokens, model ?? 'gemini-2.5-pro');
|
||||
}
|
||||
|
||||
/**
|
||||
* Parse gemini NDJSON stream events:
|
||||
* init → session id (discarded here)
|
||||
* message { delta: true, text } → concat to output
|
||||
* tool_use { name } → increment toolCalls
|
||||
* result { usage: { input_token_count, output_token_count } } → tokens
|
||||
*/
|
||||
private parseStreamJson(raw: string): { output: string; tokens: { input: number; output: number }; toolCalls: number; modelUsed?: string } {
|
||||
let output = '';
|
||||
let input = 0;
|
||||
let out = 0;
|
||||
let toolCalls = 0;
|
||||
let modelUsed: string | undefined;
|
||||
for (const line of raw.split('\n')) {
|
||||
const s = line.trim();
|
||||
if (!s) continue;
|
||||
try {
|
||||
const obj = JSON.parse(s);
|
||||
if (obj.type === 'message' && typeof obj.text === 'string') {
|
||||
output += obj.text;
|
||||
} else if (obj.type === 'tool_use') {
|
||||
toolCalls += 1;
|
||||
} else if (obj.type === 'result') {
|
||||
const u = obj.usage ?? {};
|
||||
input += u.input_token_count ?? u.prompt_tokens ?? 0;
|
||||
out += u.output_token_count ?? u.completion_tokens ?? 0;
|
||||
if (obj.model) modelUsed = obj.model;
|
||||
}
|
||||
} catch {
|
||||
// skip malformed lines
|
||||
}
|
||||
}
|
||||
return { output, tokens: { input, output: out }, toolCalls, modelUsed };
|
||||
}
|
||||
|
||||
private emptyResult(durationMs: number, error: RunResult['error'], model?: string): RunResult {
|
||||
return {
|
||||
output: '',
|
||||
tokens: { input: 0, output: 0 },
|
||||
durationMs,
|
||||
toolCalls: 0,
|
||||
modelUsed: model ?? 'gemini-2.5-pro',
|
||||
error,
|
||||
};
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,122 @@
|
||||
import type { ProviderAdapter, RunOpts, RunResult, AvailabilityCheck } from './types';
|
||||
import { estimateCostUsd } from '../pricing';
|
||||
import { execFileSync, spawnSync } from 'child_process';
|
||||
import * as fs from 'fs';
|
||||
import * as path from 'path';
|
||||
import * as os from 'os';
|
||||
|
||||
/**
|
||||
* GPT adapter — wraps the OpenAI `codex` CLI (codex exec with --json output).
|
||||
*
|
||||
* Codex uses ~/.codex/ for auth (not OPENAI_API_KEY). The --json flag emits
|
||||
* JSONL events; we parse `turn.completed` for usage and `agent_message` / etc.
|
||||
* for output aggregation.
|
||||
*/
|
||||
export class GptAdapter implements ProviderAdapter {
|
||||
readonly name = 'gpt';
|
||||
readonly family = 'gpt' as const;
|
||||
|
||||
async available(): Promise<AvailabilityCheck> {
|
||||
const res = spawnSync('sh', ['-c', 'command -v codex'], { timeout: 2000 });
|
||||
if (res.status !== 0) {
|
||||
return { ok: false, reason: 'codex CLI not found on PATH. Install: npm i -g @openai/codex' };
|
||||
}
|
||||
// Auth sniff: ~/.codex/ should contain auth state after `codex login`
|
||||
const codexDir = path.join(os.homedir(), '.codex');
|
||||
if (!fs.existsSync(codexDir)) {
|
||||
return { ok: false, reason: 'No ~/.codex/ found. Run `codex login` to authenticate via ChatGPT.' };
|
||||
}
|
||||
return { ok: true };
|
||||
}
|
||||
|
||||
async run(opts: RunOpts): Promise<RunResult> {
|
||||
const start = Date.now();
|
||||
const args = ['exec', opts.prompt, '-C', opts.workdir, '-s', 'read-only', '--json'];
|
||||
if (opts.model) args.push('-m', opts.model);
|
||||
if (opts.extraArgs) args.push(...opts.extraArgs);
|
||||
|
||||
try {
|
||||
const out = execFileSync('codex', args, {
|
||||
cwd: opts.workdir,
|
||||
timeout: opts.timeoutMs,
|
||||
encoding: 'utf-8',
|
||||
maxBuffer: 32 * 1024 * 1024,
|
||||
});
|
||||
const parsed = this.parseJsonl(out);
|
||||
return {
|
||||
output: parsed.output,
|
||||
tokens: parsed.tokens,
|
||||
durationMs: Date.now() - start,
|
||||
toolCalls: parsed.toolCalls,
|
||||
modelUsed: parsed.modelUsed || opts.model || 'gpt-5.4',
|
||||
};
|
||||
} catch (err: unknown) {
|
||||
const durationMs = Date.now() - start;
|
||||
const e = err as { code?: string; stderr?: Buffer; signal?: string; message?: string };
|
||||
const stderr = e.stderr?.toString() ?? '';
|
||||
if (e.signal === 'SIGTERM' || e.code === 'ETIMEDOUT') {
|
||||
return this.emptyResult(durationMs, { code: 'timeout', reason: `exceeded ${opts.timeoutMs}ms` }, opts.model);
|
||||
}
|
||||
if (/unauthorized|auth|login/i.test(stderr)) {
|
||||
return this.emptyResult(durationMs, { code: 'auth', reason: stderr.slice(0, 400) }, opts.model);
|
||||
}
|
||||
if (/rate[- ]?limit|429/i.test(stderr)) {
|
||||
return this.emptyResult(durationMs, { code: 'rate_limit', reason: stderr.slice(0, 400) }, opts.model);
|
||||
}
|
||||
return this.emptyResult(durationMs, { code: 'unknown', reason: (e.message ?? stderr ?? 'unknown').slice(0, 400) }, opts.model);
|
||||
}
|
||||
}
|
||||
|
||||
estimateCost(tokens: { input: number; output: number; cached?: number }, model?: string): number {
|
||||
return estimateCostUsd(tokens, model ?? 'gpt-5.4');
|
||||
}
|
||||
|
||||
/**
|
||||
* Parse codex exec --json JSONL stream.
|
||||
* Key events:
|
||||
* - item.completed with item.type === 'agent_message' → text output
|
||||
* - item.completed with item.type === 'command_execution' → tool call
|
||||
* - turn.completed → usage.input_tokens, usage.output_tokens
|
||||
* - thread.started → session id (not used here)
|
||||
*/
|
||||
private parseJsonl(raw: string): { output: string; tokens: { input: number; output: number }; toolCalls: number; modelUsed?: string } {
|
||||
let output = '';
|
||||
let input = 0;
|
||||
let out = 0;
|
||||
let toolCalls = 0;
|
||||
let modelUsed: string | undefined;
|
||||
for (const line of raw.split('\n')) {
|
||||
const s = line.trim();
|
||||
if (!s) continue;
|
||||
try {
|
||||
const obj = JSON.parse(s);
|
||||
if (obj.type === 'item.completed' && obj.item) {
|
||||
if (obj.item.type === 'agent_message' && typeof obj.item.text === 'string') {
|
||||
output += (output ? '\n' : '') + obj.item.text;
|
||||
} else if (obj.item.type === 'command_execution') {
|
||||
toolCalls += 1;
|
||||
}
|
||||
} else if (obj.type === 'turn.completed') {
|
||||
const u = obj.usage ?? {};
|
||||
input += u.input_tokens ?? 0;
|
||||
out += u.output_tokens ?? 0;
|
||||
if (obj.model) modelUsed = obj.model;
|
||||
}
|
||||
} catch {
|
||||
// skip malformed lines — codex stderr can leak in
|
||||
}
|
||||
}
|
||||
return { output, tokens: { input, output: out }, toolCalls, modelUsed };
|
||||
}
|
||||
|
||||
private emptyResult(durationMs: number, error: RunResult['error'], model?: string): RunResult {
|
||||
return {
|
||||
output: '',
|
||||
tokens: { input: 0, output: 0 },
|
||||
durationMs,
|
||||
toolCalls: 0,
|
||||
modelUsed: model ?? 'gpt-5.4',
|
||||
error,
|
||||
};
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,74 @@
|
||||
/**
|
||||
* Provider adapter interface — uniform contract for Claude, GPT, Gemini.
|
||||
*
|
||||
* Each adapter wraps an existing runner (session-runner.ts, codex-session-runner.ts,
|
||||
* gemini-session-runner.ts) and normalizes its per-provider result shape into the
|
||||
* RunResult below. The benchmark harness only talks to adapters through this
|
||||
* interface, never to the underlying runners directly.
|
||||
*/
|
||||
|
||||
export interface RunOpts {
|
||||
/** The prompt to send to the model. */
|
||||
prompt: string;
|
||||
/** Working directory passed to the underlying CLI. */
|
||||
workdir: string;
|
||||
/** Hard wall-clock timeout in ms. Default: 300000 (5 min). */
|
||||
timeoutMs: number;
|
||||
/** Specific model within the family, optional. Adapters pass through to provider. */
|
||||
model?: string;
|
||||
/** Extra flags per-provider (escape hatch for rare cases). Prefer staying generic. */
|
||||
extraArgs?: string[];
|
||||
}
|
||||
|
||||
export interface TokenUsage {
|
||||
input: number;
|
||||
output: number;
|
||||
/** Cached input tokens (Anthropic/OpenAI support). Undefined if provider doesn't report. */
|
||||
cached?: number;
|
||||
}
|
||||
|
||||
export type RunError =
|
||||
| 'auth' // Credentials missing or invalid.
|
||||
| 'timeout' // Exceeded timeoutMs.
|
||||
| 'rate_limit' // Provider rate-limited us; backoff exceeded.
|
||||
| 'binary_missing' // CLI not found on PATH.
|
||||
| 'unknown'; // Catch-all with reason populated.
|
||||
|
||||
export interface RunResult {
|
||||
/** Provider's textual output for the prompt. */
|
||||
output: string;
|
||||
/** Normalized token usage. 0s if unreported. */
|
||||
tokens: TokenUsage;
|
||||
/** Wall-clock duration. */
|
||||
durationMs: number;
|
||||
/** Count of tool/function calls made during the run (0 if unsupported). */
|
||||
toolCalls: number;
|
||||
/** Actual model ID the provider reports using (may be a variant of the family). */
|
||||
modelUsed: string;
|
||||
/** If the run failed, error code + human reason. output/tokens may be partial. */
|
||||
error?: { code: RunError; reason: string };
|
||||
}
|
||||
|
||||
export interface AvailabilityCheck {
|
||||
ok: boolean;
|
||||
/** When !ok: short reason shown to user. Includes install / login / env var hint. */
|
||||
reason?: string;
|
||||
}
|
||||
|
||||
export type Family = 'claude' | 'gpt' | 'gemini';
|
||||
|
||||
export interface ProviderAdapter {
|
||||
/** Stable name used in output tables and config (e.g., 'claude', 'gpt', 'gemini'). */
|
||||
readonly name: string;
|
||||
/** Model family this adapter targets. */
|
||||
readonly family: Family;
|
||||
/**
|
||||
* Check whether the provider's CLI binary is present and authenticated.
|
||||
* Should never block >2s. Non-throwing: returns { ok: false, reason } on failure.
|
||||
*/
|
||||
available(): Promise<AvailabilityCheck>;
|
||||
/** Run a prompt and return normalized RunResult. Non-throwing. Errors go in result.error. */
|
||||
run(opts: RunOpts): Promise<RunResult>;
|
||||
/** Estimate USD cost for the reported token usage and model. */
|
||||
estimateCost(tokens: TokenUsage, model?: string): number;
|
||||
}
|
||||
@@ -0,0 +1,82 @@
|
||||
/**
|
||||
* Tool compatibility map across provider CLIs.
|
||||
*
|
||||
* Not all provider CLIs expose equivalent tools. A benchmark that uses Edit, Glob,
|
||||
* or Grep won't run cleanly on CLIs that don't have those. The map answers:
|
||||
* "which tools does each provider's CLI expose by default?"
|
||||
*
|
||||
* When a benchmark is scoped to a tool a provider lacks, the harness records
|
||||
* `unsupported_tool` in the result and continues with the other providers.
|
||||
*
|
||||
* Source-of-truth references:
|
||||
* - Claude Code: https://code.claude.com/docs/en/tools
|
||||
* - Codex CLI: `codex exec --help` tool listing
|
||||
* - Gemini CLI: `gemini --help` (limited tool surface as of 2026-04)
|
||||
*/
|
||||
|
||||
export type ToolName =
|
||||
| 'Read'
|
||||
| 'Write'
|
||||
| 'Edit'
|
||||
| 'Bash'
|
||||
| 'Agent'
|
||||
| 'Glob'
|
||||
| 'Grep'
|
||||
| 'AskUserQuestion'
|
||||
| 'WebSearch'
|
||||
| 'WebFetch';
|
||||
|
||||
export const TOOL_COMPATIBILITY: Record<'claude' | 'gpt' | 'gemini', Record<ToolName, boolean>> = {
|
||||
claude: {
|
||||
Read: true,
|
||||
Write: true,
|
||||
Edit: true,
|
||||
Bash: true,
|
||||
Agent: true,
|
||||
Glob: true,
|
||||
Grep: true,
|
||||
AskUserQuestion: true,
|
||||
WebSearch: true,
|
||||
WebFetch: true,
|
||||
},
|
||||
gpt: {
|
||||
// Codex CLI has a narrower tool surface: it uses shell + apply_patch.
|
||||
// Read/Glob/Grep-style operations happen via shell pipelines.
|
||||
Read: true,
|
||||
Write: false, // apply_patch handles writes; no standalone Write tool
|
||||
Edit: false, // apply_patch handles edits; no standalone Edit tool
|
||||
Bash: true,
|
||||
Agent: false,
|
||||
Glob: false,
|
||||
Grep: false,
|
||||
AskUserQuestion: false,
|
||||
WebSearch: true, // --enable web_search_cached
|
||||
WebFetch: false,
|
||||
},
|
||||
gemini: {
|
||||
// Gemini CLI (as of 2026-04) has a limited tool surface in --yolo mode.
|
||||
// Shell access depends on flags; most agentic tools are not exposed.
|
||||
Read: true,
|
||||
Write: false,
|
||||
Edit: false,
|
||||
Bash: false,
|
||||
Agent: false,
|
||||
Glob: false,
|
||||
Grep: false,
|
||||
AskUserQuestion: false,
|
||||
WebSearch: true,
|
||||
WebFetch: false,
|
||||
},
|
||||
};
|
||||
|
||||
/**
|
||||
* Determine which tools from a required-set are missing for a given provider.
|
||||
* Empty array means full compatibility.
|
||||
*/
|
||||
export function missingTools(
|
||||
provider: 'claude' | 'gpt' | 'gemini',
|
||||
requiredTools: ToolName[]
|
||||
): ToolName[] {
|
||||
const map = TOOL_COMPATIBILITY[provider];
|
||||
return requiredTools.filter(t => !map[t]);
|
||||
}
|
||||
Reference in New Issue
Block a user