mirror of
https://github.com/garrytan/gstack.git
synced 2026-05-07 05:56:41 +02:00
614354fc41
Adds the full spec Codex asked for: real provider adapters with auth detection, normalized RunResult, pricing tables, tool compatibility maps, parallel execution with error isolation, and table/JSON/markdown output. Judge stays on Anthropic SDK as the single stable source of quality scoring, gated behind --judge.

Codex flagged the original plan as massively under-scoped — the existing runner is Claude-only and the judge is Anthropic-only. You can't benchmark GPT or Gemini without real provider infrastructure. This commit ships it.

New architecture:
  test/helpers/providers/types.ts    ProviderAdapter interface
  test/helpers/providers/claude.ts   wraps `claude -p --output-format json`
  test/helpers/providers/gpt.ts      wraps `codex exec --json`
  test/helpers/providers/gemini.ts   wraps `gemini -p --output-format stream-json --yolo`
  test/helpers/pricing.ts            per-model USD cost tables (quarterly)
  test/helpers/tool-map.ts           which tools each CLI exposes
  test/helpers/benchmark-runner.ts   orchestrator (Promise.allSettled)
  test/helpers/benchmark-judge.ts    Anthropic SDK quality scorer
  bin/gstack-model-benchmark         CLI entry
  test/benchmark-runner.test.ts      9 unit tests (cost math, formatters, tool-map)

Per-provider error isolation:
- auth → record reason, don't abort batch
- timeout → record reason, don't abort batch
- rate_limit → record reason, don't abort batch
- binary_missing → record in available() check, skip if --skip-unavailable

Pricing correction: cached input tokens are disjoint from uncached input tokens (Anthropic/OpenAI report them separately). The original math subtracted them, producing negative costs. Now cached input is added at the 10% discount alongside the full uncached input cost.

CLI:
  gstack-model-benchmark --prompt "..." --models claude,gpt,gemini
  gstack-model-benchmark ./prompt.txt --output json --judge
  gstack-model-benchmark ./prompt.txt --models claude --timeout-ms 60000

Output formats: table (default), json, markdown. Each shows model, latency, in→out tokens, cost, quality (when --judge is used), tool calls, and any errors.

Known limitations for v1:
- Claude adapter approximates toolCalls as num_turns (stream-json would give exact counts; v2 can upgrade).
- Live E2E tests (test/providers.e2e.test.ts) not included — they require CI secrets for all three providers. Unit tests cover the shape and math.
- Provider CLIs sometimes return non-JSON error text to stdout; the parsers fall back to treating raw output as plain text in that case.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
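The pricing correction reads easiest as arithmetic. Below is a minimal sketch of the corrected cost formula under the assumptions stated in the message: cached input tokens are a separate bucket from uncached input tokens and are billed at 10% of the input rate. The names (ModelPricing, Usage, computeCostUSD) are illustrative, not necessarily what test/helpers/pricing.ts exports.

```typescript
// Hypothetical sketch of the corrected cost math; names are illustrative.
interface ModelPricing {
  inputPerMTok: number;  // USD per 1M uncached input tokens
  outputPerMTok: number; // USD per 1M output tokens
}

interface Usage {
  inputTokens: number;       // uncached input tokens
  cachedInputTokens: number; // cache-read input tokens, reported separately
  outputTokens: number;
}

function computeCostUSD(p: ModelPricing, u: Usage): number {
  const uncachedIn = (u.inputTokens / 1_000_000) * p.inputPerMTok;
  // Add cached input at the 10% discount; never subtract it from the
  // uncached count, which is what produced negative costs before.
  const cachedIn = (u.cachedInputTokens / 1_000_000) * p.inputPerMTok * 0.10;
  const out = (u.outputTokens / 1_000_000) * p.outputPerMTok;
  return uncachedIn + cachedIn + out;
}
```

The per-provider error isolation follows the usual Promise.allSettled pattern: each provider's failure is recorded on its own entry and the rest of the batch keeps going. A sketch under assumed types; ProviderAdapter here, FailureReason, and the classify helper are illustrative, not the exact shapes in test/helpers/providers/types.ts.

```typescript
// Hypothetical sketch of per-provider error isolation via Promise.allSettled.
type FailureReason = 'auth' | 'timeout' | 'rate_limit' | 'unknown';

interface ProviderAdapter {
  name: string;
  run(prompt: string, timeoutMs: number): Promise<{ output: string }>;
}

interface Entry {
  provider: string;
  result?: { output: string };
  errorReason?: FailureReason;
}

async function runAll(adapters: ProviderAdapter[], prompt: string, timeoutMs: number): Promise<Entry[]> {
  // allSettled never rejects: one provider's auth/timeout/rate-limit failure
  // becomes a recorded reason on its entry instead of aborting the batch.
  const settled = await Promise.allSettled(adapters.map(a => a.run(prompt, timeoutMs)));
  return settled.map((s, i) =>
    s.status === 'fulfilled'
      ? { provider: adapters[i].name, result: s.value }
      : { provider: adapters[i].name, errorReason: classify(s.reason) },
  );
}

function classify(err: unknown): FailureReason {
  const msg = String(err instanceof Error ? err.message : err).toLowerCase();
  if (msg.includes('auth')) return 'auth';
  if (msg.includes('timeout')) return 'timeout';
  if (msg.includes('rate')) return 'rate_limit';
  return 'unknown';
}
```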
102 lines
3.9 KiB
TypeScript
/**
 * Benchmark quality judge — wraps llm-judge.ts for multi-provider scoring.
 *
 * The judge is always Anthropic SDK (claude-sonnet-4-6) for stability. It sees
 * the prompt + N provider outputs and scores each on: correctness, completeness,
 * code quality, edge case handling. 0-10 per dimension; overall = average.
 *
 * Judge adds ~$0.05 per benchmark run. Gated by --judge CLI flag.
 */

import type { BenchmarkReport, BenchmarkEntry } from './benchmark-runner';

export async function judgeEntries(report: BenchmarkReport): Promise<void> {
  if (!process.env.ANTHROPIC_API_KEY) {
    throw new Error('ANTHROPIC_API_KEY not set — judge requires Anthropic access.');
  }
  const { default: Anthropic } = await import('@anthropic-ai/sdk').catch(() => {
    throw new Error('@anthropic-ai/sdk not installed — run `bun add @anthropic-ai/sdk` if you want the judge.');
  });
  const client = new (Anthropic as unknown as new (opts: { apiKey: string }) => {
    messages: { create: (params: Record<string, unknown>) => Promise<{ content: Array<{ type: string; text: string }> }> };
  })({ apiKey: process.env.ANTHROPIC_API_KEY! });

  const successful = report.entries.filter(e => e.available && e.result && !e.result.error);
  if (successful.length === 0) return;

  const judgePrompt = buildJudgePrompt(report.prompt, successful);
  const msg = await client.messages.create({
    model: 'claude-sonnet-4-6',
    max_tokens: 2048,
    messages: [{ role: 'user', content: judgePrompt }],
  });
  const textBlock = msg.content.find(c => c.type === 'text');
  if (!textBlock) return;

  const scores = parseScores(textBlock.text, successful.length);
  for (let i = 0; i < successful.length; i++) {
    const s = scores[i];
    if (!s) continue;
    successful[i].qualityScore = s.overall;
    successful[i].qualityDetails = s.dimensions;
  }
}

function buildJudgePrompt(prompt: string, entries: BenchmarkEntry[]): string {
  const lines: string[] = [
    'You are a strict, fair technical reviewer scoring N model outputs against the same prompt.',
    '',
    '--- PROMPT ---',
    prompt.length > 4000 ? prompt.slice(0, 4000) + '\n[...truncated for judge budget...]' : prompt,
    '',
    '--- OUTPUTS ---',
  ];
  entries.forEach((e, i) => {
    const r = e.result!;
    const out = r.output.length > 3000 ? r.output.slice(0, 3000) + '\n[...truncated...]' : r.output;
    lines.push(`=== Output ${i + 1}: ${r.modelUsed} ===`);
    lines.push(out);
    lines.push('');
  });
  lines.push('');
  lines.push('Score each output on these dimensions (0-10 per dimension):');
  lines.push(' - correctness: does it solve what the prompt asked?');
  lines.push(' - completeness: are edge cases and error paths addressed?');
  lines.push(' - code_quality: naming, structure, explicitness');
  lines.push(' - edge_cases: handling of nil/empty/invalid input');
  lines.push('');
  lines.push('Return JSON only, in this exact shape:');
  lines.push('{"scores":[');
  lines.push(' {"output":1,"correctness":N,"completeness":N,"code_quality":N,"edge_cases":N,"overall":N,"notes":"..."},');
  lines.push(' ...');
  lines.push(']}');
  lines.push('');
  lines.push('overall = rounded average of the 4 dimensions. No other commentary.');
  return lines.join('\n');
}

interface ParsedScore {
  overall: number;
  dimensions: Record<string, number>;
}

function parseScores(raw: string, expectedCount: number): ParsedScore[] {
  const match = raw.match(/\{[\s\S]*\}/);
  if (!match) return [];
  try {
    const obj = JSON.parse(match[0]);
    if (!Array.isArray(obj.scores)) return [];
    return obj.scores.slice(0, expectedCount).map((s: Record<string, number>) => ({
      overall: Number(s.overall ?? 0),
      dimensions: {
        correctness: Number(s.correctness ?? 0),
        completeness: Number(s.completeness ?? 0),
        code_quality: Number(s.code_quality ?? 0),
        edge_cases: Number(s.edge_cases ?? 0),
      },
    }));
  } catch {
    return [];
  }
}