mirror of
https://github.com/garrytan/gstack.git
synced 2026-05-07 05:56:41 +02:00
614354fc41
Adds the full spec Codex asked for: real provider adapters with auth detection, normalized RunResult, pricing tables, tool compatibility maps, parallel execution with error isolation, and table/JSON/markdown output. Judge stays on Anthropic SDK as the single stable source of quality scoring, gated behind --judge. Codex flagged the original plan as massively under-scoped — the existing runner is Claude-only and the judge is Anthropic-only. You can't benchmark GPT or Gemini without real provider infrastructure. This commit ships it. New architecture: test/helpers/providers/types.ts ProviderAdapter interface test/helpers/providers/claude.ts wraps `claude -p --output-format json` test/helpers/providers/gpt.ts wraps `codex exec --json` test/helpers/providers/gemini.ts wraps `gemini -p --output-format stream-json --yolo` test/helpers/pricing.ts per-model USD cost tables (quarterly) test/helpers/tool-map.ts which tools each CLI exposes test/helpers/benchmark-runner.ts orchestrator (Promise.allSettled) test/helpers/benchmark-judge.ts Anthropic SDK quality scorer bin/gstack-model-benchmark CLI entry test/benchmark-runner.test.ts 9 unit tests (cost math, formatters, tool-map) Per-provider error isolation: - auth → record reason, don't abort batch - timeout → record reason, don't abort batch - rate_limit → record reason, don't abort batch - binary_missing → record in available() check, skip if --skip-unavailable Pricing correction: cached input tokens are disjoint from uncached input tokens (Anthropic/OpenAI report them separately). Original math subtracted them, producing negative costs. Now adds cached at the 10% discount alongside the full uncached input cost. CLI: gstack-model-benchmark --prompt "..." --models claude,gpt,gemini gstack-model-benchmark ./prompt.txt --output json --judge gstack-model-benchmark ./prompt.txt --models claude --timeout-ms 60000 Output formats: table (default), json, markdown. 
Each shows model, latency, in→out tokens, cost, quality (when --judge used), tool calls, and any errors. Known limitations for v1: - Claude adapter approximates toolCalls as num_turns (stream-json would give exact counts; v2 can upgrade). - Live E2E tests (test/providers.e2e.test.ts) not included — they require CI secrets for all three providers. Unit tests cover the shape and math. - Provider CLIs sometimes return non-JSON error text to stdout; the parsers fall back to treating raw output as plain text in that case. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
124 lines
4.8 KiB
TypeScript
import type { ProviderAdapter, RunOpts, RunResult, AvailabilityCheck } from './types';
|
|
import { estimateCostUsd } from '../pricing';
|
|
import { execFileSync, spawnSync } from 'child_process';
|
|
import * as fs from 'fs';
|
|
import * as path from 'path';
|
|
import * as os from 'os';
|
|
|
|
/**
|
|
* Gemini adapter — wraps the `gemini` CLI.
|
|
*
|
|
* Gemini CLI auth comes from either ~/.config/gemini/ or GOOGLE_API_KEY. Output
|
|
* format is NDJSON with `message`/`tool_use`/`result` events when `--output-format
|
|
* stream-json` is requested. This adapter uses a single-response form for simplicity
|
|
* in benchmarks; richer streaming lives in gemini-session-runner.ts.
|
|
*/
|
|
export class GeminiAdapter implements ProviderAdapter {
|
|
readonly name = 'gemini';
|
|
readonly family = 'gemini' as const;
|
|
|
|
async available(): Promise<AvailabilityCheck> {
|
|
const res = spawnSync('sh', ['-c', 'command -v gemini'], { timeout: 2000 });
|
|
if (res.status !== 0) {
|
|
return { ok: false, reason: 'gemini CLI not found on PATH. Install per https://github.com/google-gemini/gemini-cli' };
|
|
}
|
|
const cfgDir = path.join(os.homedir(), '.config', 'gemini');
|
|
const hasCfg = fs.existsSync(cfgDir);
|
|
const hasKey = !!process.env.GOOGLE_API_KEY;
|
|
if (!hasCfg && !hasKey) {
|
|
return { ok: false, reason: 'No Gemini auth found. Log in via `gemini login` or export GOOGLE_API_KEY.' };
|
|
}
|
|
return { ok: true };
|
|
}
|
|
|
|
async run(opts: RunOpts): Promise<RunResult> {
|
|
const start = Date.now();
|
|
// Default to --yolo (non-interactive) and stream-json output so we can parse
|
|
// tokens + tool calls. Callers can override via extraArgs.
|
|
const args = ['-p', opts.prompt, '--output-format', 'stream-json', '--yolo'];
|
|
if (opts.model) args.push('--model', opts.model);
|
|
if (opts.extraArgs) args.push(...opts.extraArgs);
|
|
|
|
try {
|
|
const out = execFileSync('gemini', args, {
|
|
cwd: opts.workdir,
|
|
timeout: opts.timeoutMs,
|
|
encoding: 'utf-8',
|
|
maxBuffer: 32 * 1024 * 1024,
|
|
});
|
|
const parsed = this.parseStreamJson(out);
|
|
return {
|
|
output: parsed.output,
|
|
tokens: parsed.tokens,
|
|
durationMs: Date.now() - start,
|
|
toolCalls: parsed.toolCalls,
|
|
modelUsed: parsed.modelUsed || opts.model || 'gemini-2.5-pro',
|
|
};
|
|
} catch (err: unknown) {
|
|
const durationMs = Date.now() - start;
|
|
const e = err as { code?: string; stderr?: Buffer; signal?: string; message?: string };
|
|
const stderr = e.stderr?.toString() ?? '';
|
|
if (e.signal === 'SIGTERM' || e.code === 'ETIMEDOUT') {
|
|
return this.emptyResult(durationMs, { code: 'timeout', reason: `exceeded ${opts.timeoutMs}ms` }, opts.model);
|
|
}
|
|
if (/unauthorized|auth|login|api key/i.test(stderr)) {
|
|
return this.emptyResult(durationMs, { code: 'auth', reason: stderr.slice(0, 400) }, opts.model);
|
|
}
|
|
if (/rate[- ]?limit|429|quota/i.test(stderr)) {
|
|
return this.emptyResult(durationMs, { code: 'rate_limit', reason: stderr.slice(0, 400) }, opts.model);
|
|
}
|
|
return this.emptyResult(durationMs, { code: 'unknown', reason: (e.message ?? stderr ?? 'unknown').slice(0, 400) }, opts.model);
|
|
}
|
|
}
|
|
|
|
estimateCost(tokens: { input: number; output: number; cached?: number }, model?: string): number {
|
|
return estimateCostUsd(tokens, model ?? 'gemini-2.5-pro');
|
|
}
|
|
|
|
/**
|
|
* Parse gemini NDJSON stream events:
|
|
* init → session id (discarded here)
|
|
* message { delta: true, text } → concat to output
|
|
* tool_use { name } → increment toolCalls
|
|
* result { usage: { input_token_count, output_token_count } } → tokens
|
|
*/
|
|
private parseStreamJson(raw: string): { output: string; tokens: { input: number; output: number }; toolCalls: number; modelUsed?: string } {
|
|
let output = '';
|
|
let input = 0;
|
|
let out = 0;
|
|
let toolCalls = 0;
|
|
let modelUsed: string | undefined;
|
|
for (const line of raw.split('\n')) {
|
|
const s = line.trim();
|
|
if (!s) continue;
|
|
try {
|
|
const obj = JSON.parse(s);
|
|
if (obj.type === 'message' && typeof obj.text === 'string') {
|
|
output += obj.text;
|
|
} else if (obj.type === 'tool_use') {
|
|
toolCalls += 1;
|
|
} else if (obj.type === 'result') {
|
|
const u = obj.usage ?? {};
|
|
input += u.input_token_count ?? u.prompt_tokens ?? 0;
|
|
out += u.output_token_count ?? u.completion_tokens ?? 0;
|
|
if (obj.model) modelUsed = obj.model;
|
|
}
|
|
} catch {
|
|
// skip malformed lines
|
|
}
|
|
}
|
|
return { output, tokens: { input, output: out }, toolCalls, modelUsed };
|
|
}
|
|
|
|
private emptyResult(durationMs: number, error: RunResult['error'], model?: string): RunResult {
|
|
return {
|
|
output: '',
|
|
tokens: { input: 0, output: 0 },
|
|
durationMs,
|
|
toolCalls: 0,
|
|
modelUsed: model ?? 'gemini-2.5-pro',
|
|
error,
|
|
};
|
|
}
|
|
}
|