Files
gstack/test/skill-e2e-benchmark-providers.test.ts
T
Garry Tan c875e0c3fc test: lite E2E coverage for benchmark, taste engine, publish
Fills real coverage gaps in v0.19.0.0 primitives. 44 new deterministic
tests (gate tier, ~3s) + 8 live-API tests (periodic tier).

New gate-tier test files (free, <3s total):
- test/taste-engine.test.ts — 24 tests against gstack-taste-update:
  schema shape, Laplace-smoothed confidence, 5%/week decay clamped at 0,
  multi-dimension extraction, case-insensitive matching, session cap,
  legacy profile migration with session truncation, taste-drift conflict
  warning, malformed-JSON recovery, missing-variant exit code.
- test/publish-dry-run.test.ts — 13 tests against gstack-publish --dry-run:
  manifest parsing, missing/malformed JSON, per-skill validation errors
  (missing source file / slug / version / marketplaces), slug filter,
  unknown-skill exit, per-marketplace auth isolation (fake marketplaces
  with always-pass / always-fail / missing-binary CLIs), and a sanity
  check against the real repo manifest.
- test/benchmark-cli.test.ts — 11 tests against gstack-model-benchmark
  --dry-run: provider default, unknown-provider WARN, empty list
  fallback, flag passthrough (timeout/workdir/judge/output), long-prompt
  truncation, prompt resolution (inline vs file vs positional), missing
  prompt exit.

New periodic-tier test file (paid, gated EVALS=1):
- test/skill-e2e-benchmark-providers.test.ts — 8 tests hitting real
  claude, codex, gemini CLIs with a trivial prompt (~$0.001/provider).
  Verifies output parsing, token accounting, cost estimation, timeout
  error.code semantics, Promise.allSettled parallel isolation.
  Per-provider availability gate — unauthed providers skip cleanly.

This suite already caught one real bug (codex adapter missing
--skip-git-repo-check, fixed in 5260987d).

Registered `benchmark-providers-live` in touchfiles.ts (periodic tier,
triggered by changes to bin/gstack-model-benchmark, providers/**,
benchmark-runner.ts, pricing.ts).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-18 06:45:06 +08:00

176 lines
7.4 KiB
TypeScript

/**
* Multi-provider benchmark adapter E2E — hit real claude, codex, gemini CLIs.
*
* Periodic tier: runs under `bun run test:e2e` with EVALS=1. Each provider gated
* on its own `available()` check so missing auth skips that provider (doesn't
* abort the batch). Uses the simplest possible prompt ("Reply with exactly: ok")
* to keep cost near $0.001/provider/run.
*
* What this catches that unit tests don't:
* - CLI output-format drift (the #1 silent breakage path)
* - Token parsing from real provider responses
* - Auth-failure vs timeout vs rate-limit error code routing
* - Cost estimation on real token counts
* - Parallel execution via Promise.allSettled — a slow provider doesn't block a fast one
*
* NOT covered here (would need dedicated test files):
* - Quality judge integration (benchmark-judge.ts, adds ~$0.05/run)
* - Multi-turn tool-using prompts — our single-turn smoke skips `toolCalls > 0`
*/
import { describe, test, expect } from 'bun:test';
import { ClaudeAdapter } from './helpers/providers/claude';
import { GptAdapter } from './helpers/providers/gpt';
import { GeminiAdapter } from './helpers/providers/gemini';
import { runBenchmark } from './helpers/benchmark-runner';
import * as fs from 'fs';
import * as path from 'path';
import * as os from 'os';
// --- Prerequisites / gating ---
// The suite only runs when the EVALS environment variable is set to a non-empty
// value; otherwise the whole describe block is registered via describe.skip.
const evalsEnabled = !!process.env.EVALS;
const describeIfEvals = evalsEnabled ? describe : describe.skip;
// Fixed one-word reply: keeps each live call tiny and makes the output easy to assert on.
const PROMPT = 'Reply with exactly this text and nothing else: ok';
// Per-provider gate — each test checks its own availability and skips cleanly.
// Adapters are constructed once at module scope and shared by every test below.
const claude = new ClaudeAdapter();
const gpt = new GptAdapter();
const gemini = new GeminiAdapter();
// Use a temp working directory so provider CLIs can't accidentally touch the repo.
const workdir = fs.mkdtempSync(path.join(os.tmpdir(), 'bench-e2e-'));
// The directory is created at import time (even when the suite is skipped), so
// remove it when the process exits instead of leaking one temp dir per run.
process.on('exit', () => {
  try {
    fs.rmSync(workdir, { recursive: true, force: true });
  } catch {
    // Best-effort cleanup: a leftover temp dir must never change the test exit status.
  }
});
describeIfEvals('multi-provider benchmark adapters (live)', () => {
// claude availability probe: the result must expose `ok`, and when the provider
// is unavailable it must also carry a non-empty string `reason`.
test('claude: available() returns structured ok/reason', async () => {
  const availability = await claude.available();
  expect(availability).toHaveProperty('ok');
  if (availability.ok) {
    // Nothing further to verify for an available provider.
    return;
  }
  expect(typeof availability.reason).toBe('string');
  expect(availability.reason!.length).toBeGreaterThan(0);
});
// gpt availability probe: the result must expose `ok`, and when the provider is
// unavailable it must also carry a non-empty string `reason`.
test('gpt: available() returns structured ok/reason', async () => {
  const check = await gpt.available();
  expect(check).toHaveProperty('ok');
  if (!check.ok) {
    expect(typeof check.reason).toBe('string');
    // The reason is interpolated into the "SKIPPED — …" stderr message of the
    // gpt live smoke test, so an empty string would make that diagnostic useless.
    expect(check.reason!.length).toBeGreaterThan(0);
  }
});
// gemini availability probe: the result must expose `ok`, and when the provider
// is unavailable it must also carry a non-empty string `reason`.
test('gemini: available() returns structured ok/reason', async () => {
  const check = await gemini.available();
  expect(check).toHaveProperty('ok');
  if (!check.ok) {
    expect(typeof check.reason).toBe('string');
    // The reason is interpolated into the "SKIPPED — …" stderr message of the
    // gemini live smoke test, so an empty string would make that diagnostic useless.
    expect(check.reason!.length).toBeGreaterThan(0);
  }
});
// Live smoke test for the claude adapter: run the trivial prompt and check the
// parsed output, token counts, duration, model name, and cost estimate.
// The 150s test timeout (third argument) is longer than the 120s adapter timeout.
test('claude: trivial prompt produces parseable output', async () => {
  const check = await claude.available();
  if (!check.ok) {
    // Provider unavailable: log the adapter's reason and return without asserting.
    process.stderr.write(`\nclaude live smoke: SKIPPED — ${check.reason}\n`);
    return;
  }
  const result = await claude.run({ prompt: PROMPT, workdir, timeoutMs: 120_000 });
  if (result.error) {
    // Keep code and reason visually separate; without a delimiter they run together.
    throw new Error(`claude errored: ${result.error.code} — ${result.error.reason}`);
  }
  // Case-insensitive substring match tolerates replies such as "OK" or "ok.".
  expect(result.output.toLowerCase()).toContain('ok');
  expect(result.tokens.input).toBeGreaterThan(0);
  expect(result.tokens.output).toBeGreaterThan(0);
  expect(result.durationMs).toBeGreaterThan(0);
  expect(typeof result.modelUsed).toBe('string');
  expect(result.modelUsed.length).toBeGreaterThan(0);
  // Cost is derived from the real token counts and the model the run reported.
  const cost = claude.estimateCost(result.tokens, result.modelUsed);
  expect(cost).toBeGreaterThan(0);
}, 150_000);
// Live smoke test for the gpt adapter: run the trivial prompt and check the
// parsed output, token counts, duration, model name type, and cost estimate.
// The 150s test timeout (third argument) is longer than the 120s adapter timeout.
test('gpt: trivial prompt produces parseable output', async () => {
  const check = await gpt.available();
  if (!check.ok) {
    // Provider unavailable: log the adapter's reason and return without asserting.
    process.stderr.write(`\ngpt live smoke: SKIPPED — ${check.reason}\n`);
    return;
  }
  const result = await gpt.run({ prompt: PROMPT, workdir, timeoutMs: 120_000 });
  if (result.error) {
    // Keep code and reason visually separate; without a delimiter they run together.
    throw new Error(`gpt errored: ${result.error.code} — ${result.error.reason}`);
  }
  // Case-insensitive substring match tolerates replies such as "OK" or "ok.".
  expect(result.output.toLowerCase()).toContain('ok');
  expect(result.tokens.input).toBeGreaterThan(0);
  expect(result.tokens.output).toBeGreaterThan(0);
  expect(result.durationMs).toBeGreaterThan(0);
  expect(typeof result.modelUsed).toBe('string');
  // Cost is derived from the real token counts and the model the run reported.
  const cost = gpt.estimateCost(result.tokens, result.modelUsed);
  expect(cost).toBeGreaterThan(0);
}, 150_000);
// Live smoke test for the gemini adapter: run the trivial prompt and check the
// parsed output, token counts, duration, and model name type.
// The 150s test timeout (third argument) is longer than the 120s adapter timeout.
test('gemini: trivial prompt produces parseable output', async () => {
  const check = await gemini.available();
  if (!check.ok) {
    // Provider unavailable: log the adapter's reason and return without asserting.
    process.stderr.write(`\ngemini live smoke: SKIPPED — ${check.reason}\n`);
    return;
  }
  const result = await gemini.run({ prompt: PROMPT, workdir, timeoutMs: 120_000 });
  if (result.error) {
    // Keep code and reason visually separate; without a delimiter they run together.
    throw new Error(`gemini errored: ${result.error.code} — ${result.error.reason}`);
  }
  // Case-insensitive substring match tolerates replies such as "OK" or "ok.".
  expect(result.output.toLowerCase()).toContain('ok');
  // Gemini CLI sometimes returns 0 tokens in the result event (older responses);
  // assert non-negative instead of strictly positive.
  expect(result.tokens.input).toBeGreaterThanOrEqual(0);
  expect(result.tokens.output).toBeGreaterThanOrEqual(0);
  expect(result.durationMs).toBeGreaterThan(0);
  expect(typeof result.modelUsed).toBe('string');
}, 150_000);
// Timeout handling: with an absurdly small timeoutMs the adapter must hand back a
// result carrying an error code rather than throwing.
test('timeout error surfaces as error.code=timeout (no exception)', async () => {
  // Probe in claude → gpt → gemini order and stop at the first available adapter —
  // all three should share timeout semantics, so any one of them will do.
  const candidates = [claude, gpt, gemini];
  let adapter: (typeof candidates)[number] | null = null;
  for (const candidate of candidates) {
    const probe = await candidate.available();
    if (probe.ok) {
      adapter = candidate;
      break;
    }
  }
  if (adapter === null) {
    process.stderr.write('\ntimeout smoke: SKIPPED — no provider available\n');
    return;
  }
  // 100ms is far too short for any real CLI startup, so the run must fail.
  const result = await adapter.run({ prompt: PROMPT, workdir, timeoutMs: 100 });
  expect(result.error).toBeDefined();
  // Any of these codes is an acceptable non-crash outcome (the CLI may die in
  // different ways); what matters is that a result came back instead of a throw.
  const acceptableCodes = ['timeout', 'unknown', 'binary_missing'];
  expect(acceptableCodes).toContain(result.error!.code);
  expect(result.durationMs).toBeGreaterThan(0);
}, 30_000);
// Full-runner check: request all three providers in one batch and verify that each
// one yields a report entry, whether or not that provider is available.
test('runBenchmark: Promise.allSettled means one unavailable provider does not block others', async () => {
  // Unauthed providers should come back as entries with available=false
  // rather than crashing the batch.
  const { entries } = await runBenchmark({
    prompt: PROMPT,
    workdir,
    providers: ['claude', 'gpt', 'gemini'],
    timeoutMs: 120_000,
    skipUnavailable: false,
  });
  expect(entries).toHaveLength(3);
  for (const entry of entries) {
    expect(['claude', 'gpt', 'gemini']).toContain(entry.family);
    if (!entry.available) {
      // Unavailable entries must explain why.
      expect(typeof entry.unavailable_reason).toBe('string');
      continue;
    }
    expect(entry.result).toBeDefined();
  }
  // In a healthy CI env at least one available provider should finish without an error.
  // This is deliberately not a hard assertion: with no authed providers we only log.
  const cleanRunSeen = entries.some(entry => entry.available && entry.result && !entry.result.error);
  if (cleanRunSeen) {
    return;
  }
  process.stderr.write('\nrunBenchmark live: no provider produced a clean result (no auth?)\n');
}, 300_000);
});