mirror of https://github.com/garrytan/gstack.git (synced 2026-05-07 05:56:41 +02:00)
620f5dbaea
Applied from the adversarial subagent pass during /review on this branch:

- test/benchmark-cli.test.ts — new "NOT READY path fires when auth env
  vars are stripped" test. The default dry-run test always showed OK on
  dev machines with auth, hiding regressions in the remediation-hint
  branch. Stripped env (no auth vars, HOME→empty tmpdir) now
  force-exercises gpt + gemini NOT READY paths and asserts every NOT
  READY line includes a concrete remediation hint (install/login/export).
  (claude adapter's os.homedir() call is Bun-cached; the 2-of-3 adapter
  coverage is sufficient to exercise the branch.) The stripped-env
  pattern is sketched below.

- test/taste-engine.test.ts — session-cap test rewritten to seed the
  profile with 50 entries + one real CLI call, instead of 55 sequential
  subprocess spawns. Same coverage (FIFO eviction at the boundary), ~5s
  faster CI time. Also pins first-casing-wins on the Geist/GEIST merge
  assertion — bumpPref() keeps the first-arrival casing, so the test
  documents that policy. The seed-then-trigger shape is sketched below.

- test/skill-e2e-benchmark-providers.test.ts — workdir creation moved
  from module-load into beforeAll, cleanup added in afterAll. Previous
  shape leaked a /tmp/bench-e2e-* dir every CI run.

- test/publish-dry-run.test.ts — removed unused empty test/helpers
  mkdirSync from the sandbox setup. The bin doesn't import from there,
  so the empty dir was a footgun for future maintainers.

- test/helpers/providers/gpt.ts — expanded the inline comment on
  `--skip-git-repo-check` to explicitly note that `-s read-only` is now
  load-bearing safety (the trust prompt was the secondary boundary;
  removing read-only while keeping skip-git-repo-check would be unsafe).
  See the flag sketch after this list.

Net: 45 passing tests (was 44), session-cap test 5s faster, one real
regression surface covered that didn't exist before.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
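For readers without the diff, a minimal sketch of the stripped-env pattern from the first bullet. The `bin/benchmark-cli.ts` entry point, the `--dry-run` flag, and the exact NOT READY output lines are illustrative assumptions, not the repo's verified CLI surface:

import { test, expect } from 'bun:test';
import * as fs from 'fs';
import * as os from 'os';
import * as path from 'path';

test('NOT READY path fires when auth env vars are stripped', () => {
  // Point HOME at an empty tmpdir so provider CLIs can't find cached
  // credentials that happen to exist on a dev machine.
  const emptyHome = fs.mkdtempSync(path.join(os.tmpdir(), 'no-auth-'));
  const proc = Bun.spawnSync(
    ['bun', 'run', 'bin/benchmark-cli.ts', '--dry-run'], // hypothetical entry point
    {
      // Near-empty env: keep PATH so binaries resolve; every auth var
      // (OPENAI_API_KEY, GEMINI_API_KEY, ...) is absent by construction.
      env: { PATH: process.env.PATH ?? '', HOME: emptyHome },
    },
  );
  const notReady = proc.stdout
    .toString()
    .split('\n')
    .filter((line) => line.includes('NOT READY'));
  expect(notReady.length).toBeGreaterThan(0);
  for (const line of notReady) {
    // Every NOT READY line must carry a concrete remediation hint.
    expect(line).toMatch(/install|login|export/i);
  }
  fs.rmSync(emptyHome, { recursive: true, force: true });
});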
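A sketch of the seed-then-trigger shape from the taste-engine bullet. The profile path, the `entries` array, and `runTasteCliOnce` are all placeholders — the real on-disk shape belongs to taste-engine and may differ:

import { test, expect } from 'bun:test';
import * as fs from 'fs';
import * as os from 'os';
import * as path from 'path';

// Stand-in for the one real subprocess spawn the rewritten test makes;
// the actual CLI invocation is elided here.
const runTasteCliOnce = (_home: string): void => { /* spawn real CLI once */ };

test('session cap holds at 50 with FIFO eviction (sketch)', () => {
  const home = fs.mkdtempSync(path.join(os.tmpdir(), 'taste-'));
  const profilePath = path.join(home, 'profile.json'); // hypothetical location

  // Seed the profile right at the cap (50 entries) so a single real
  // interaction crosses the boundary.
  const seeded = Array.from({ length: 50 }, (_, i) => ({ id: `seed-${i}` }));
  fs.writeFileSync(profilePath, JSON.stringify({ entries: seeded }));

  // The one real CLI call appends entry #51; a FIFO cap evicts the
  // oldest entry instead of letting the array grow past 50.
  runTasteCliOnce(home);

  const after = JSON.parse(fs.readFileSync(profilePath, 'utf8'));
  expect(after.entries).toHaveLength(50);
  expect(after.entries[0].id).toBe('seed-1'); // seed-0 (oldest) evicted
});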
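And the flag interaction the gpt.ts comment now documents, as an illustrative arg list (the surrounding spawn code is elided; `exec` as the codex subcommand is an assumption):

const prompt = 'Reply with exactly this text and nothing else: ok';

// Illustrative arg order only — the real helper lives in
// test/helpers/providers/gpt.ts.
const codexArgs = [
  'exec',                  // assumed non-interactive subcommand
  '--skip-git-repo-check', // suppresses the untrusted-directory trust prompt,
                           // which was the secondary safety boundary...
  '-s', 'read-only',       // ...so the read-only sandbox is now the load-bearing
                           // one: dropping it while keeping the skip flag would
                           // let the CLI write outside the benchmark workdir.
  prompt,
];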
187 lines
7.6 KiB
TypeScript
/**
 * Multi-provider benchmark adapter E2E — hit real claude, codex, gemini CLIs.
 *
 * Periodic tier: runs under `bun run test:e2e` with EVALS=1. Each provider gated
 * on its own `available()` check so missing auth skips that provider (doesn't
 * abort the batch). Uses the simplest possible prompt ("Reply with exactly: ok")
 * to keep cost near $0.001/provider/run.
 *
 * What this catches that unit tests don't:
 * - CLI output-format drift (the #1 silent breakage path)
 * - Token parsing from real provider responses
 * - Auth-failure vs timeout vs rate-limit error code routing
 * - Cost estimation on real token counts
 * - Parallel execution via Promise.allSettled — a slow provider doesn't block a fast one
 *
 * NOT covered here (would need dedicated test files):
 * - Quality judge integration (benchmark-judge.ts, adds ~$0.05/run)
 * - Multi-turn tool-using prompts — our single-turn smoke never drives `toolCalls > 0`
 */

import { describe, test, expect, beforeAll, afterAll } from 'bun:test';
import { ClaudeAdapter } from './helpers/providers/claude';
import { GptAdapter } from './helpers/providers/gpt';
import { GeminiAdapter } from './helpers/providers/gemini';
import { runBenchmark } from './helpers/benchmark-runner';
import * as fs from 'fs';
import * as path from 'path';
import * as os from 'os';

// --- Prerequisites / gating ---

const evalsEnabled = !!process.env.EVALS;
const describeIfEvals = evalsEnabled ? describe : describe.skip;

const PROMPT = 'Reply with exactly this text and nothing else: ok';

// Per-provider gate — each test checks its own availability and skips cleanly.
// We construct adapters outside `test` so Bun's test reporter shows the skip reason.
const claude = new ClaudeAdapter();
const gpt = new GptAdapter();
const gemini = new GeminiAdapter();

// Use a temp working directory so provider CLIs can't accidentally touch the repo.
// Created in beforeAll / cleaned in afterAll so concurrent CI runs don't leak.
let workdir: string;

describeIfEvals('multi-provider benchmark adapters (live)', () => {
  beforeAll(() => {
    workdir = fs.mkdtempSync(path.join(os.tmpdir(), 'bench-e2e-'));
  });

  afterAll(() => {
    if (workdir && fs.existsSync(workdir)) {
      fs.rmSync(workdir, { recursive: true, force: true });
    }
  });

  test('claude: available() returns structured ok/reason', async () => {
    const check = await claude.available();
    expect(check).toHaveProperty('ok');
    if (!check.ok) {
      expect(typeof check.reason).toBe('string');
      expect(check.reason!.length).toBeGreaterThan(0);
    }
  });

  test('gpt: available() returns structured ok/reason', async () => {
    const check = await gpt.available();
    expect(check).toHaveProperty('ok');
    if (!check.ok) {
      expect(typeof check.reason).toBe('string');
    }
  });

  test('gemini: available() returns structured ok/reason', async () => {
    const check = await gemini.available();
    expect(check).toHaveProperty('ok');
    if (!check.ok) {
      expect(typeof check.reason).toBe('string');
    }
  });

  test('claude: trivial prompt produces parseable output', async () => {
    const check = await claude.available();
    if (!check.ok) {
      process.stderr.write(`\nclaude live smoke: SKIPPED — ${check.reason}\n`);
      return;
    }
    const result = await claude.run({ prompt: PROMPT, workdir, timeoutMs: 120_000 });
    if (result.error) {
      throw new Error(`claude errored: ${result.error.code} — ${result.error.reason}`);
    }
    expect(result.output.toLowerCase()).toContain('ok');
    expect(result.tokens.input).toBeGreaterThan(0);
    expect(result.tokens.output).toBeGreaterThan(0);
    expect(result.durationMs).toBeGreaterThan(0);
    expect(typeof result.modelUsed).toBe('string');
    expect(result.modelUsed.length).toBeGreaterThan(0);
    const cost = claude.estimateCost(result.tokens, result.modelUsed);
    expect(cost).toBeGreaterThan(0);
  }, 150_000);

  test('gpt: trivial prompt produces parseable output', async () => {
    const check = await gpt.available();
    if (!check.ok) {
      process.stderr.write(`\ngpt live smoke: SKIPPED — ${check.reason}\n`);
      return;
    }
    const result = await gpt.run({ prompt: PROMPT, workdir, timeoutMs: 120_000 });
    if (result.error) {
      throw new Error(`gpt errored: ${result.error.code} — ${result.error.reason}`);
    }
    expect(result.output.toLowerCase()).toContain('ok');
    expect(result.tokens.input).toBeGreaterThan(0);
    expect(result.tokens.output).toBeGreaterThan(0);
    expect(result.durationMs).toBeGreaterThan(0);
    expect(typeof result.modelUsed).toBe('string');
    const cost = gpt.estimateCost(result.tokens, result.modelUsed);
    expect(cost).toBeGreaterThan(0);
  }, 150_000);

  test('gemini: trivial prompt produces parseable output', async () => {
    const check = await gemini.available();
    if (!check.ok) {
      process.stderr.write(`\ngemini live smoke: SKIPPED — ${check.reason}\n`);
      return;
    }
    const result = await gemini.run({ prompt: PROMPT, workdir, timeoutMs: 120_000 });
    if (result.error) {
      throw new Error(`gemini errored: ${result.error.code} — ${result.error.reason}`);
    }
    expect(result.output.toLowerCase()).toContain('ok');
    // Gemini CLI sometimes returns 0 tokens in the result event (older responses);
    // assert non-negative instead of strictly positive.
    expect(result.tokens.input).toBeGreaterThanOrEqual(0);
    expect(result.tokens.output).toBeGreaterThanOrEqual(0);
    expect(result.durationMs).toBeGreaterThan(0);
    expect(typeof result.modelUsed).toBe('string');
  }, 150_000);

  test('timeout error surfaces as error.code=timeout (no exception)', async () => {
    // Use whatever adapter is available first — all three should share timeout semantics.
    const adapter = (await claude.available()).ok ? claude
      : (await gpt.available()).ok ? gpt
      : (await gemini.available()).ok ? gemini
      : null;
    if (!adapter) {
      process.stderr.write('\ntimeout smoke: SKIPPED — no provider available\n');
      return;
    }
    // A 100ms timeout is far too short for any real CLI startup → must time out.
    const result = await adapter.run({ prompt: PROMPT, workdir, timeoutMs: 100 });
    expect(result.error).toBeDefined();
    // Timeout, binary_missing, or unknown (if the CLI dies differently) — all acceptable
    // non-crash outcomes. The point is the adapter returns a RunResult, not throws.
    expect(['timeout', 'unknown', 'binary_missing']).toContain(result.error!.code);
    expect(result.durationMs).toBeGreaterThan(0);
  }, 30_000);

  test('runBenchmark: Promise.allSettled means one unavailable provider does not block others', async () => {
    // Use the full runner with all three providers — whichever are unauthed should
    // return entries with available=false and not crash the batch.
    const report = await runBenchmark({
      prompt: PROMPT,
      workdir,
      providers: ['claude', 'gpt', 'gemini'],
      timeoutMs: 120_000,
      skipUnavailable: false,
    });
    expect(report.entries).toHaveLength(3);
    for (const e of report.entries) {
      expect(['claude', 'gpt', 'gemini']).toContain(e.family);
      if (e.available) {
        expect(e.result).toBeDefined();
      } else {
        expect(typeof e.unavailable_reason).toBe('string');
      }
    }
    // At least one available provider should have produced a non-error result in a healthy CI env.
    const hadSuccess = report.entries.some(e => e.available && e.result && !e.result.error);
    // We don't hard-assert this: if NO providers are authed, skip silently.
    if (!hadSuccess) {
      process.stderr.write('\nrunBenchmark live: no provider produced a clean result (no auth?)\n');
    }
  }, 300_000);
});