#!/usr/bin/env bun
/**
 * gstack-model-benchmark — run the same prompt across multiple providers
 * and compare latency, tokens, cost, quality, and tool-call count.
 *
 * Usage:
 *   gstack-model-benchmark [prompt-file-or-text] [options]
 *
 * Options:
 *   --models claude,gpt,gemini     Comma-separated provider list (default: claude)
 *   --prompt "<text>"              Inline prompt instead of a file
 *   --workdir <path>               Working dir passed to each CLI (default: cwd)
 *   --timeout-ms <ms>              Per-provider timeout (default: 300000)
 *   --output table|json|markdown   Output format (default: table)
 *   --skip-unavailable             Skip providers that fail available() check
 *                                  (default: include them with unavailable marker)
 *   --judge                        Run Anthropic SDK judge on outputs for quality score
 *                                  (requires ANTHROPIC_API_KEY; adds ~$0.05 per call)
 *
 * Every value-taking option accepts both `--name value` and `--name=value`.
 *
 * Examples:
 *   gstack-model-benchmark --prompt "Write a haiku about databases" --models claude,gpt
 *   gstack-model-benchmark ./test-prompt.txt --models claude,gpt,gemini --judge
 */

import * as fs from 'fs';
import * as path from 'path';
import { runBenchmark, formatTable, formatJson, formatMarkdown, type BenchmarkInput } from '../test/helpers/benchmark-runner';

type OutputFormat = 'table' | 'json' | 'markdown';

type Provider = 'claude' | 'gpt' | 'gemini';

/** Per-provider timeout used when --timeout-ms is not given. */
const DEFAULT_TIMEOUT_MS = 300000;

/**
 * Options that consume the following argv token as their value when written
 * as `--name value`. Used to keep those values from being mistaken for the
 * positional prompt argument.
 */
const VALUE_OPTIONS: readonly string[] = ['--models', '--prompt', '--workdir', '--timeout-ms', '--output'];

/**
 * Look up the value of a command-line option in process.argv.
 *
 * @param name Option name including the leading dashes, e.g. `--models`.
 * @param def  Value returned when the option is absent or has no value token.
 * @returns The text after `=` for `--name=value`, otherwise the token that
 *          follows `--name`; `def` when neither is available.
 */
function arg(name: string, def?: string): string | undefined {
  const prefix = name + '=';
  const idx = process.argv.findIndex(a => a === name || a.startsWith(prefix));
  if (idx < 0) return def;
  const token = process.argv[idx];
  if (token !== undefined && token.startsWith(prefix)) return token.slice(prefix.length);
  // `--name` was the last token: fall back to the default instead of undefined.
  return process.argv[idx + 1] ?? def;
}

/** Report whether the boolean switch `name` appears anywhere in process.argv. */
function flag(name: string): boolean {
  return process.argv.includes(name);
}

/**
 * Parse the comma-separated --models value into a provider list.
 *
 * Unknown names are reported on stderr and dropped. An empty or missing list,
 * or one with no valid names, yields `['claude']`.
 */
function parseProviders(s: string | undefined): Array<'claude' | 'gpt' | 'gemini'> {
  if (!s) return ['claude'];
  const out: Provider[] = [];
  for (const p of s.split(',').map(x => x.trim()).filter(Boolean)) {
    if (p === 'claude' || p === 'gpt' || p === 'gemini') {
      out.push(p);
    } else {
      console.error(`WARN: unknown provider '${p}' — skipping. Valid: claude, gpt, gemini.`);
    }
  }
  return out.length ? out : ['claude'];
}

/**
 * Find the first positional argument (one that is not an option and not the
 * value of a value-taking option).
 *
 * @param argv Arguments after the runtime and script path.
 */
function findPositional(argv: readonly string[]): string | undefined {
  for (let i = 0; i < argv.length; i++) {
    const token = argv[i];
    if (token === undefined) continue;
    if (token.startsWith('--')) {
      // `--name value` consumes the next token; `--name=value` does not.
      if (VALUE_OPTIONS.includes(token)) i++;
      continue;
    }
    return token;
  }
  return undefined;
}

/**
 * Determine the prompt text.
 *
 * Precedence: a non-empty --prompt value, then the positional argument. A
 * positional that names an existing path is read as a UTF-8 file; otherwise
 * the positional itself is used as the prompt. Exits with status 1 when no
 * prompt is supplied.
 */
function resolvePrompt(positional: string | undefined): string {
  const inline = arg('--prompt');
  if (inline) return inline;
  if (!positional) {
    console.error('ERROR: specify a prompt via positional path or --prompt "<text>"');
    process.exit(1);
  }
  if (fs.existsSync(positional)) {
    return fs.readFileSync(positional, 'utf-8');
  }
  // Not a file — treat as inline prompt
  return positional;
}

/**
 * Validate the --timeout-ms value. Exits with status 1 unless it is a
 * positive base-10 integer, so typos such as `30s` are not silently read as
 * 30 ms by parseInt's lenient parsing.
 */
function parseTimeoutMs(raw: string | undefined): number {
  const text = (raw ?? String(DEFAULT_TIMEOUT_MS)).trim();
  const ms = /^\d+$/.test(text) ? Number.parseInt(text, 10) : Number.NaN;
  if (!Number.isSafeInteger(ms) || ms <= 0) {
    console.error(`ERROR: --timeout-ms must be a positive integer (got '${text}')`);
    process.exit(1);
  }
  return ms;
}

/**
 * Validate the --output value. Unknown formats are reported on stderr and
 * fall back to `table`.
 */
function parseOutputFormat(raw: string | undefined): OutputFormat {
  if (raw === undefined || raw === 'table') return 'table';
  if (raw === 'json' || raw === 'markdown') return raw;
  console.error(`WARN: unknown output format '${raw}' — using table. Valid: table, json, markdown.`);
  return 'table';
}

/**
 * CLI entry point: parse options, run the benchmark, optionally run the
 * judge, and write the formatted report to stdout.
 */
async function main(): Promise<void> {
  const positional = findPositional(process.argv.slice(2));
  const prompt = resolvePrompt(positional);
  const providers = parseProviders(arg('--models'));
  const workdir = arg('--workdir') ?? process.cwd();
  const timeoutMs = parseTimeoutMs(arg('--timeout-ms'));
  const output = parseOutputFormat(arg('--output'));
  const skipUnavailable = flag('--skip-unavailable');
  const doJudge = flag('--judge');

  const input: BenchmarkInput = {
    prompt,
    workdir,
    providers,
    timeoutMs,
    skipUnavailable,
  };

  const report = await runBenchmark(input);

  if (doJudge) {
    try {
      // Imported lazily so the judge module is only loaded when --judge is set.
      const { judgeEntries } = await import('../test/helpers/benchmark-judge');
      // The return value is unused here; presumably judgeEntries annotates
      // `report` in place — confirm against benchmark-judge.
      await judgeEntries(report);
    } catch (err: unknown) {
      // Judge failure is non-fatal: the report is still printed without scores.
      const message = err instanceof Error ? err.message : String(err);
      console.error(`WARN: judge unavailable: ${message}`);
    }
  }

  let out: string;
  switch (output) {
    case 'json':
      out = formatJson(report);
      break;
    case 'markdown':
      out = formatMarkdown(report);
      break;
    case 'table':
    default:
      out = formatTable(report);
      break;
  }
  process.stdout.write(out + '\n');
}

main().catch(err => {
  console.error('FATAL:', err);
  process.exit(1);
});