test: lite E2E coverage for benchmark, taste engine, publish

Fills real coverage gaps in v0.19.0.0 primitives. 44 new deterministic
tests (gate tier, ~3s) + 8 live-API tests (periodic tier).

New gate-tier test files (free, <3s total):
- test/taste-engine.test.ts — 24 tests against gstack-taste-update:
  schema shape, Laplace-smoothed confidence, 5%/week decay clamped at 0,
  multi-dimension extraction, case-insensitive matching, session cap,
  legacy profile migration with session truncation, taste-drift conflict
  warning, malformed-JSON recovery, missing-variant exit code.
- test/publish-dry-run.test.ts — 13 tests against gstack-publish --dry-run:
  manifest parsing, missing/malformed JSON, per-skill validation errors
  (missing source file / slug / version / marketplaces), slug filter,
  unknown-skill exit, per-marketplace auth isolation (fake marketplaces
  with always-pass / always-fail / missing-binary CLIs), and a sanity
  check against the real repo manifest.
- test/benchmark-cli.test.ts — 11 tests against gstack-model-benchmark
  --dry-run: provider default, unknown-provider WARN, empty list
  fallback, flag passthrough (timeout/workdir/judge/output), long-prompt
  truncation, prompt resolution (inline vs file vs positional), missing
  prompt exit.

New periodic-tier test file (paid, gated EVALS=1):
- test/skill-e2e-benchmark-providers.test.ts — 8 tests hitting real
  claude, codex, gemini CLIs with a trivial prompt (~$0.001/provider).
  Verifies output parsing, token accounting, cost estimation, timeout
  error.code semantics, Promise.allSettled parallel isolation.
  Per-provider availability gate — unauthed providers skip cleanly.

This suite already caught one real bug (codex adapter missing
--skip-git-repo-check, fixed in 5260987d).

Registered `benchmark-providers-live` in touchfiles.ts (periodic tier,
triggered by changes to bin/gstack-model-benchmark, providers/**,
benchmark-runner.ts, pricing.ts).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Garry Tan
2026-04-18 06:45:06 +08:00
parent 42715188c2
commit c875e0c3fc
5 changed files with 966 additions and 0 deletions
+134
View File
@@ -0,0 +1,134 @@
/**
* gstack-model-benchmark CLI tests (offline).
*
* Covers CLI wiring that unit tests against benchmark-runner.ts can't see:
* - --dry-run auth/provider-list resolution
* - unknown provider WARN path
* - provider default (claude) when --models omitted
* - prompt resolution (inline --prompt vs positional file path)
* - output format flag wiring via --dry-run (avoids real CLI invocation)
*
* All tests use --dry-run so no API calls happen.
*/
import { describe, test, expect } from 'bun:test';
import { spawnSync } from 'child_process';
import * as fs from 'fs';
import * as path from 'path';
import * as os from 'os';
const ROOT = path.resolve(import.meta.dir, '..');
const BIN = path.join(ROOT, 'bin', 'gstack-model-benchmark');

/**
 * Invoke the benchmark CLI under bun with the given argv and optional extra
 * environment entries, returning the exit status plus decoded stdout/stderr.
 * A 15s hard timeout keeps a wedged CLI from hanging the whole suite.
 */
function run(args: string[], opts: { env?: Record<string, string> } = {}): { status: number | null; stdout: string; stderr: string } {
  const proc = spawnSync('bun', ['run', BIN, ...args], {
    cwd: ROOT,
    env: { ...process.env, ...opts.env },
    encoding: 'utf-8',
    timeout: 15000,
  });
  // spawnSync can hand back undefined streams on spawn failure; normalize to ''.
  const stdout = proc.stdout?.toString() ?? '';
  const stderr = proc.stderr?.toString() ?? '';
  return { status: proc.status, stdout, stderr };
}
describe('gstack-model-benchmark --dry-run', () => {
  // Baseline: every requested provider shows up in the report and nothing is dispatched.
  test('prints provider availability report and exits 0', () => {
    const r = run(['--prompt', 'hi', '--models', 'claude,gpt,gemini', '--dry-run']);
    expect(r.status).toBe(0);
    expect(r.stdout).toContain('gstack-model-benchmark --dry-run');
    expect(r.stdout).toContain('claude');
    expect(r.stdout).toContain('gpt');
    expect(r.stdout).toContain('gemini');
    expect(r.stdout).toContain('no prompts sent');
  });

  // When --models is omitted, the CLI should fall back to the claude provider.
  test('reports default provider when --models omitted', () => {
    const r = run(['--prompt', 'hi', '--dry-run']);
    expect(r.status).toBe(0);
    expect(r.stdout).toContain('providers: claude');
  });

  // Unknown names must be WARNed on stderr and dropped — never crash the run.
  test('unknown provider in --models emits WARN and is dropped', () => {
    const r = run(['--prompt', 'hi', '--models', 'claude,gpt-42-fake', '--dry-run']);
    expect(r.status).toBe(0);
    expect(r.stderr).toContain('unknown provider');
    expect(r.stderr).toContain('gpt-42-fake');
    expect(r.stdout).toContain('providers: claude');
    expect(r.stdout).not.toContain('gpt-42-fake');
  });

  test('empty --models list falls back to claude default', () => {
    const r = run(['--prompt', 'hi', '--models', '', '--dry-run']);
    expect(r.status).toBe(0);
    expect(r.stdout).toContain('providers: claude');
  });

  // Flag passthrough: values must be echoed verbatim into the dry-run report.
  test('--timeout-ms and --workdir flags flow through to dry-run report', () => {
    const r = run(['--prompt', 'hi', '--timeout-ms', '9999', '--workdir', '/tmp', '--dry-run']);
    expect(r.status).toBe(0);
    expect(r.stdout).toContain('timeout_ms: 9999');
    expect(r.stdout).toContain('workdir: /tmp');
  });

  test('--judge flag reported in dry-run output', () => {
    const r = run(['--prompt', 'hi', '--judge', '--dry-run']);
    expect(r.status).toBe(0);
    expect(r.stdout).toContain('judge: on');
  });

  test('--output flag reported in dry-run', () => {
    const r = run(['--prompt', 'hi', '--output', 'json', '--dry-run']);
    expect(r.status).toBe(0);
    expect(r.stdout).toContain('output: json');
  });

  test('each adapter reports either OK or NOT READY, never crashes', () => {
    const r = run(['--prompt', 'hi', '--models', 'claude,gpt,gemini', '--dry-run']);
    expect(r.status).toBe(0);
    // Each provider line must report OK or NOT READY.
    // NOTE(review): the toMatch below only checks containment, not "ends with";
    // anchor the regex (e.g. /(OK|NOT READY)\s*$/) if strict line shape is intended.
    const lines = r.stdout.split('\n');
    const adapterLines = lines.filter(l => /^\s+(claude|gpt|gemini):/.test(l));
    expect(adapterLines.length).toBe(3);
    for (const line of adapterLines) {
      expect(line).toMatch(/(OK|NOT READY)/);
    }
  });

  test('long prompt is truncated in dry-run display', () => {
    const longPrompt = 'x'.repeat(200);
    const r = run(['--prompt', longPrompt, '--dry-run']);
    expect(r.status).toBe(0);
    // Summary truncates to 80 chars + ellipsis (the literal '…' character).
    expect(r.stdout).toMatch(/prompt:\s+x{80}…/);
  });
});
describe('gstack-model-benchmark prompt resolution', () => {
  // A positional arg that names an existing file is read; its contents become the prompt.
  test('positional file path is read and passed as prompt', () => {
    const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'bench-prompt-'));
    const file = path.join(tmpDir, 'prompt.txt');
    fs.writeFileSync(file, 'hello from file');
    try {
      const res = run([file, '--dry-run']);
      expect(res.status).toBe(0);
      expect(res.stdout).toContain('hello from file');
    } finally {
      fs.rmSync(tmpDir, { recursive: true, force: true });
    }
  });

  // A positional arg that is NOT a file falls back to being the prompt itself.
  test('positional non-file arg is treated as inline prompt', () => {
    const res = run(['treat-me-as-inline', '--dry-run']);
    expect(res.status).toBe(0);
    expect(res.stdout).toContain('treat-me-as-inline');
  });

  // No --prompt, no positional: the CLI must refuse to run.
  test('missing prompt exits non-zero', () => {
    const res = run(['--dry-run']);
    expect(res.status).not.toBe(0);
    expect(res.stderr).toContain('specify a prompt');
  });
});
+6
View File
@@ -171,6 +171,9 @@ export const E2E_TOUCHFILES: Record<string, string[]> = {
// Autoplan
'autoplan-core': ['autoplan/**', 'plan-ceo-review/**', 'plan-eng-review/**', 'plan-design-review/**'],
// Multi-provider benchmark adapters — live API smoke against real claude/codex/gemini CLIs
'benchmark-providers-live': ['bin/gstack-model-benchmark', 'test/helpers/providers/**', 'test/helpers/benchmark-runner.ts', 'test/helpers/pricing.ts'],
// Skill routing — journey-stage tests (depend on ALL skill descriptions)
'journey-ideation': ['*/SKILL.md.tmpl', 'SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'],
'journey-plan-eng': ['*/SKILL.md.tmpl', 'SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'],
@@ -316,6 +319,9 @@ export const E2E_TIERS: Record<string, 'gate' | 'periodic'> = {
// Autoplan — periodic (not yet implemented)
'autoplan-core': 'periodic',
// Multi-provider benchmark — periodic (requires external CLIs + auth, paid)
'benchmark-providers-live': 'periodic',
// Skill routing — periodic (LLM routing is non-deterministic)
'journey-ideation': 'periodic',
'journey-plan-eng': 'periodic',
+281
View File
@@ -0,0 +1,281 @@
/**
* gstack-publish end-to-end tests via --dry-run.
*
* Verifies manifest parsing, schema validation, marketplace auth checks, per-skill
* error isolation, and command building — all without touching real marketplaces.
*
* --dry-run does NOT run execSync on publish commands. Auth checks still run
* against real binaries; we use fake marketplaces whose `auth_check` commands
* are always-succeed (`true`) or always-fail (`false`) so the test is hermetic.
*/
import { describe, test, expect, beforeEach, afterEach } from 'bun:test';
import { spawnSync } from 'child_process';
import * as fs from 'fs';
import * as path from 'path';
import * as os from 'os';
const ROOT = path.resolve(import.meta.dir, '..');
const BIN = path.join(ROOT, 'bin', 'gstack-publish');

// Per-test sandbox repo root and the copied binary inside it.
let sandbox: string;
let binCopy: string;

beforeEach(() => {
  // gstack-publish reads skills.json relative to the binary's dir (import.meta.dir/..).
  // To isolate each test's manifest, we create a sandbox repo that mirrors the real
  // structure: copy the bin into sandbox/bin/, write a controlled skills.json at the root.
  sandbox = fs.mkdtempSync(path.join(os.tmpdir(), 'publish-sandbox-'));
  fs.mkdirSync(path.join(sandbox, 'bin'));
  fs.mkdirSync(path.join(sandbox, 'test', 'helpers'), { recursive: true });
  binCopy = path.join(sandbox, 'bin', 'gstack-publish');
  fs.copyFileSync(BIN, binCopy);
  // copyFileSync does not preserve the execute bit; restore it explicitly.
  fs.chmodSync(binCopy, 0o755);
});

afterEach(() => {
  // force:true so a test that already removed files can't make teardown throw.
  fs.rmSync(sandbox, { recursive: true, force: true });
});
/** Serialize `manifest` as pretty-printed JSON to the sandbox's skills.json. */
function writeManifest(manifest: object): void {
  const target = path.join(sandbox, 'skills.json');
  const body = JSON.stringify(manifest, null, 2);
  fs.writeFileSync(target, body);
}
/** Create a skill file (and any missing parent dirs) at `relPath` inside the sandbox. */
function writeSkillFile(relPath: string, content = '# Test Skill\n'): void {
  const target = path.join(sandbox, relPath);
  const parent = path.dirname(target);
  fs.mkdirSync(parent, { recursive: true });
  fs.writeFileSync(target, content);
}
/**
 * Run the sandboxed gstack-publish copy with the given argv, returning the
 * exit status plus decoded stdout/stderr. 15s timeout guards against hangs.
 */
function run(args: string[]): { status: number | null; stdout: string; stderr: string } {
  const proc = spawnSync('bun', ['run', binCopy, ...args], {
    cwd: sandbox,
    encoding: 'utf-8',
    timeout: 15000,
  });
  const stdout = proc.stdout?.toString() ?? '';
  const stderr = proc.stderr?.toString() ?? '';
  return { status: proc.status, stdout, stderr };
}
/**
 * Hermetic marketplace fixtures. `auth_check` uses the POSIX `true`/`false`
 * binaries so "authenticated" vs "unauthenticated" is deterministic on any
 * machine, and `fakestore_missing` points at a binary guaranteed absent from
 * PATH to exercise the missing-CLI reporting path.
 */
const VALID_MARKETPLACES = {
  fakestore_ok: {
    cli: 'true', // binary that always succeeds
    login_cmd: 'fakestore_ok login',
    publish_cmd_template: 'echo publish {slug} {version}',
    docs: 'https://fakestore.example',
    auth_check: 'true', // always-authenticated
  },
  fakestore_noauth: {
    cli: 'true',
    login_cmd: 'fakestore_noauth login',
    publish_cmd_template: 'echo publish {slug} {version}',
    docs: 'https://fakestore.example',
    auth_check: 'false', // always-fails auth
  },
  fakestore_missing: {
    cli: 'nonexistent-binary-xyz', // not on PATH — exercises missing-binary path
    login_cmd: 'fakestore_missing login',
    publish_cmd_template: 'echo publish {slug} {version}',
    docs: 'https://fakestore.example',
    auth_check: 'nonexistent-binary-xyz whoami',
  },
};
/**
 * Build a minimal, schema-valid manifest entry for `slug` whose source lives
 * at `sourceRel`, targeting each marketplace named in `marketplaces`
 * (default: the always-authenticated fake store).
 */
function validSkill(slug: string, sourceRel: string, marketplaces: string[] = ['fakestore_ok']) {
  const entries = marketplaces.map((name) => [name, { slug, publish: true }] as const);
  const m: Record<string, { slug: string; publish: boolean }> = Object.fromEntries(entries);
  return {
    slug,
    source: sourceRel,
    name: `Skill ${slug}`,
    version: '1.0.0',
    category: 'test',
    description: 'A test skill',
    marketplaces: m,
    standalone: true,
    compatible_hosts: ['claude-code'],
  };
}
describe('gstack-publish: manifest loading', () => {
  // Two valid skills, both publishable to the fake stores.
  test('--list prints every skill and marketplace', () => {
    for (const slug of ['alpha', 'beta']) {
      writeSkillFile(`skills/${slug}/SKILL.md`);
    }
    writeManifest({
      version: '1.0.0',
      description: 't',
      skills: [validSkill('alpha', 'skills/alpha/SKILL.md'), validSkill('beta', 'skills/beta/SKILL.md')],
      marketplaces: VALID_MARKETPLACES,
    });
    const res = run(['--list']);
    expect(res.status).toBe(0);
    for (const needle of ['alpha', 'beta', 'fakestore_ok']) {
      expect(res.stdout).toContain(needle);
    }
  });

  // The sandbox starts with no skills.json at all.
  test('missing manifest exits non-zero', () => {
    const res = run(['--dry-run']);
    expect(res.status).not.toBe(0);
    expect(res.stderr).toContain('skills.json');
  });

  test('malformed JSON exits non-zero', () => {
    fs.writeFileSync(path.join(sandbox, 'skills.json'), '{ not json');
    const res = run(['--dry-run']);
    expect(res.status).not.toBe(0);
    expect(res.stderr).toContain('parse');
  });
});
describe('gstack-publish: validation', () => {
  // Each test corrupts exactly one field of an otherwise-valid manifest and
  // expects a targeted validation error on stderr plus a non-zero exit.
  test('missing source file reports validation error and exits 1', () => {
    writeManifest({
      version: '1.0.0',
      description: 't',
      skills: [validSkill('ghost', 'skills/ghost/DOES_NOT_EXIST.md')],
      marketplaces: VALID_MARKETPLACES,
    });
    const r = run(['--dry-run']);
    expect(r.status).not.toBe(0);
    expect(r.stderr).toContain('source file missing');
    // The error must name the offending skill.
    expect(r.stderr).toContain('ghost');
  });

  test('missing slug reports validation error', () => {
    writeSkillFile('skills/x/SKILL.md');
    const s = validSkill('temp', 'skills/x/SKILL.md');
    // Cast to Partial so `delete` of a required field type-checks.
    delete (s as Partial<typeof s>).slug;
    writeManifest({
      version: '1.0.0',
      description: 't',
      skills: [s],
      marketplaces: VALID_MARKETPLACES,
    });
    const r = run(['--dry-run']);
    expect(r.status).not.toBe(0);
    expect(r.stderr).toContain('missing slug');
  });

  test('missing version reports validation error', () => {
    writeSkillFile('skills/x/SKILL.md');
    const s = validSkill('x', 'skills/x/SKILL.md');
    delete (s as Partial<typeof s>).version;
    writeManifest({
      version: '1.0.0',
      description: 't',
      skills: [s],
      marketplaces: VALID_MARKETPLACES,
    });
    const r = run(['--dry-run']);
    expect(r.status).not.toBe(0);
    expect(r.stderr).toContain('missing version');
  });

  // A skill with an empty marketplaces map has nowhere to publish to.
  test('no marketplaces configured reports validation error', () => {
    writeSkillFile('skills/x/SKILL.md');
    const s = { ...validSkill('x', 'skills/x/SKILL.md'), marketplaces: {} };
    writeManifest({ version: '1.0.0', description: 't', skills: [s], marketplaces: VALID_MARKETPLACES });
    const r = run(['--dry-run']);
    expect(r.status).not.toBe(0);
    expect(r.stderr).toContain('no marketplaces configured');
  });
});
describe('gstack-publish: dry-run execution', () => {
  test('happy path reports DRY-RUN tag and templated command', () => {
    writeSkillFile('skills/alpha/SKILL.md');
    writeManifest({
      version: '1.0.0',
      description: 't',
      skills: [validSkill('alpha', 'skills/alpha/SKILL.md')],
      marketplaces: VALID_MARKETPLACES,
    });
    const r = run(['--dry-run']);
    expect(r.status).toBe(0);
    expect(r.stdout).toContain('DRY-RUN');
    expect(r.stdout).toContain('alpha');
    // Dry-run still counts the skill as published in the summary totals.
    expect(r.stdout).toContain('Published: 1');
    expect(r.stdout).toContain('Failed: 0');
  });

  // Passing a slug as a positional arg narrows the batch to that one skill.
  test('per-skill filter publishes only the requested slug', () => {
    writeSkillFile('skills/alpha/SKILL.md');
    writeSkillFile('skills/beta/SKILL.md');
    writeManifest({
      version: '1.0.0',
      description: 't',
      skills: [validSkill('alpha', 'skills/alpha/SKILL.md'), validSkill('beta', 'skills/beta/SKILL.md')],
      marketplaces: VALID_MARKETPLACES,
    });
    const r = run(['alpha', '--dry-run']);
    expect(r.status).toBe(0);
    expect(r.stdout).toContain('Publishing alpha');
    expect(r.stdout).not.toContain('Publishing beta');
    expect(r.stdout).toContain('Published: 1');
  });

  test('unknown skill filter exits non-zero', () => {
    writeSkillFile('skills/alpha/SKILL.md');
    writeManifest({
      version: '1.0.0',
      description: 't',
      skills: [validSkill('alpha', 'skills/alpha/SKILL.md')],
      marketplaces: VALID_MARKETPLACES,
    });
    const r = run(['nonexistent', '--dry-run']);
    expect(r.status).not.toBe(0);
    expect(r.stderr).toContain('skill not found');
  });
});
describe('gstack-publish: auth check isolation', () => {
  test('failing auth for one marketplace does NOT abort the batch in dry-run', () => {
    writeSkillFile('skills/alpha/SKILL.md');
    // One skill targeting both the authenticated and the unauthenticated fake store.
    const skill = validSkill('alpha', 'skills/alpha/SKILL.md', ['fakestore_ok', 'fakestore_noauth']);
    writeManifest({
      version: '1.0.0',
      description: 't',
      skills: [skill],
      marketplaces: VALID_MARKETPLACES,
    });
    const res = run(['--dry-run']);
    // In dry-run, auth failures are reported but don't block dispatch.
    expect(res.status).toBe(0);
    expect(res.stdout).toContain('fakestore_ok: OK');
    expect(res.stdout).toContain('fakestore_noauth: NOT READY');
  });

  test('missing binary reported as not-ready with docs link', () => {
    writeSkillFile('skills/alpha/SKILL.md');
    const skill = validSkill('alpha', 'skills/alpha/SKILL.md', ['fakestore_missing']);
    writeManifest({
      version: '1.0.0',
      description: 't',
      skills: [skill],
      marketplaces: VALID_MARKETPLACES,
    });
    const res = run(['--dry-run']);
    expect(res.stdout).toContain('fakestore_missing: NOT READY');
    expect(res.stdout).toContain('not on PATH');
  });
});
describe('gstack-publish: real manifest sanity', () => {
  test('the real repo skills.json passes --dry-run validation', () => {
    // Runs the actual bin against the actual manifest (ROOT/skills.json).
    // If auth to any real marketplace isn't set up it just reports NOT READY;
    // --dry-run still exits 0 because it doesn't require auth to pass.
    const realBin = path.join(ROOT, 'bin', 'gstack-publish');
    const result = spawnSync('bun', ['run', realBin, '--dry-run'], {
      cwd: ROOT,
      encoding: 'utf-8',
      timeout: 20000,
    });
    expect(result.status).toBe(0);
    expect(result.stdout).toContain('Validating manifest');
    // Every skill in the real manifest should pass validation.
    expect(result.stderr).not.toContain('Manifest validation failed');
  });
});
+175
View File
@@ -0,0 +1,175 @@
/**
* Multi-provider benchmark adapter E2E — hit real claude, codex, gemini CLIs.
*
* Periodic tier: runs under `bun run test:e2e` with EVALS=1. Each provider gated
* on its own `available()` check so missing auth skips that provider (doesn't
* abort the batch). Uses the simplest possible prompt ("Reply with exactly: ok")
* to keep cost near $0.001/provider/run.
*
* What this catches that unit tests don't:
* - CLI output-format drift (the #1 silent breakage path)
* - Token parsing from real provider responses
* - Auth-failure vs timeout vs rate-limit error code routing
* - Cost estimation on real token counts
* - Parallel execution via Promise.allSettled — slow provider doesn't block fast
*
* NOT covered here (would need dedicated test files):
* - Quality judge integration (benchmark-judge.ts, adds ~$0.05/run)
* - Multi-turn tool-using prompts — our single-turn smoke skips `toolCalls > 0`
*/
import { describe, test, expect } from 'bun:test';
import { ClaudeAdapter } from './helpers/providers/claude';
import { GptAdapter } from './helpers/providers/gpt';
import { GeminiAdapter } from './helpers/providers/gemini';
import { runBenchmark } from './helpers/benchmark-runner';
import * as fs from 'fs';
import * as path from 'path';
import * as os from 'os';
// --- Prerequisites / gating ---
// Live tests only run when EVALS=1 is set in the environment (paid tier).
const evalsEnabled = !!process.env.EVALS;
const describeIfEvals = evalsEnabled ? describe : describe.skip;
// Cheapest possible instruction — keeps per-provider cost near $0.001/run.
const PROMPT = 'Reply with exactly this text and nothing else: ok';
// Per-provider gate — each test checks its own availability and skips cleanly.
// We construct adapters outside `test` so Bun's test reporter shows the skip reason.
const claude = new ClaudeAdapter();
const gpt = new GptAdapter();
const gemini = new GeminiAdapter();
// Use a temp working directory so provider CLIs can't accidentally touch the repo.
// NOTE(review): this directory is never removed — consider an afterAll() rmSync cleanup.
const workdir = fs.mkdtempSync(path.join(os.tmpdir(), 'bench-e2e-'));
describeIfEvals('multi-provider benchmark adapters (live)', () => {
  // ---- available() contract: must resolve to { ok: boolean, reason?: string } ----

  test('claude: available() returns structured ok/reason', async () => {
    const check = await claude.available();
    expect(check).toHaveProperty('ok');
    if (!check.ok) {
      expect(typeof check.reason).toBe('string');
      expect(check.reason!.length).toBeGreaterThan(0);
    }
  });

  test('gpt: available() returns structured ok/reason', async () => {
    const check = await gpt.available();
    expect(check).toHaveProperty('ok');
    if (!check.ok) {
      expect(typeof check.reason).toBe('string');
    }
  });

  test('gemini: available() returns structured ok/reason', async () => {
    const check = await gemini.available();
    expect(check).toHaveProperty('ok');
    if (!check.ok) {
      expect(typeof check.reason).toBe('string');
    }
  });

  // ---- Live smoke per provider: output parsing, token accounting, cost estimation ----

  test('claude: trivial prompt produces parseable output', async () => {
    const check = await claude.available();
    if (!check.ok) {
      process.stderr.write(`\nclaude live smoke: SKIPPED — ${check.reason}\n`);
      return;
    }
    const result = await claude.run({ prompt: PROMPT, workdir, timeoutMs: 120_000 });
    if (result.error) {
      // Fix: separate code and reason with ': ' — they were previously concatenated
      // with no delimiter, producing unreadable messages like "timeoutCLI timed out".
      throw new Error(`claude errored: ${result.error.code}: ${result.error.reason}`);
    }
    expect(result.output.toLowerCase()).toContain('ok');
    expect(result.tokens.input).toBeGreaterThan(0);
    expect(result.tokens.output).toBeGreaterThan(0);
    expect(result.durationMs).toBeGreaterThan(0);
    expect(typeof result.modelUsed).toBe('string');
    expect(result.modelUsed.length).toBeGreaterThan(0);
    const cost = claude.estimateCost(result.tokens, result.modelUsed);
    expect(cost).toBeGreaterThan(0);
  }, 150_000);

  test('gpt: trivial prompt produces parseable output', async () => {
    const check = await gpt.available();
    if (!check.ok) {
      process.stderr.write(`\ngpt live smoke: SKIPPED — ${check.reason}\n`);
      return;
    }
    const result = await gpt.run({ prompt: PROMPT, workdir, timeoutMs: 120_000 });
    if (result.error) {
      // Same delimiter fix as the claude smoke above.
      throw new Error(`gpt errored: ${result.error.code}: ${result.error.reason}`);
    }
    expect(result.output.toLowerCase()).toContain('ok');
    expect(result.tokens.input).toBeGreaterThan(0);
    expect(result.tokens.output).toBeGreaterThan(0);
    expect(result.durationMs).toBeGreaterThan(0);
    expect(typeof result.modelUsed).toBe('string');
    const cost = gpt.estimateCost(result.tokens, result.modelUsed);
    expect(cost).toBeGreaterThan(0);
  }, 150_000);

  test('gemini: trivial prompt produces parseable output', async () => {
    const check = await gemini.available();
    if (!check.ok) {
      process.stderr.write(`\ngemini live smoke: SKIPPED — ${check.reason}\n`);
      return;
    }
    const result = await gemini.run({ prompt: PROMPT, workdir, timeoutMs: 120_000 });
    if (result.error) {
      // Same delimiter fix as the claude smoke above.
      throw new Error(`gemini errored: ${result.error.code}: ${result.error.reason}`);
    }
    expect(result.output.toLowerCase()).toContain('ok');
    // Gemini CLI sometimes returns 0 tokens in the result event (older responses);
    // assert non-negative instead of strictly positive.
    expect(result.tokens.input).toBeGreaterThanOrEqual(0);
    expect(result.tokens.output).toBeGreaterThanOrEqual(0);
    expect(result.durationMs).toBeGreaterThan(0);
    expect(typeof result.modelUsed).toBe('string');
  }, 150_000);

  // ---- Error-path semantics: adapters must return errors, never throw ----

  test('timeout error surfaces as error.code=timeout (no exception)', async () => {
    // Use whatever adapter is available first — all three should share timeout semantics.
    const adapter = (await claude.available()).ok ? claude
      : (await gpt.available()).ok ? gpt
      : (await gemini.available()).ok ? gemini
      : null;
    if (!adapter) {
      process.stderr.write('\ntimeout smoke: SKIPPED — no provider available\n');
      return;
    }
    // 100ms timeout is far too short for any real CLI startup → must timeout.
    const result = await adapter.run({ prompt: PROMPT, workdir, timeoutMs: 100 });
    expect(result.error).toBeDefined();
    // Timeout, binary_missing, or unknown (if CLI dies differently) — all acceptable
    // non-crash outcomes. The point is the adapter returns a RunResult, not throws.
    expect(['timeout', 'unknown', 'binary_missing']).toContain(result.error!.code);
    expect(result.durationMs).toBeGreaterThan(0);
  }, 30_000);

  test('runBenchmark: Promise.allSettled means one unavailable provider does not block others', async () => {
    // Use the full runner with all three providers — whichever are unauthed should
    // return entries with available=false and not crash the batch.
    const report = await runBenchmark({
      prompt: PROMPT,
      workdir,
      providers: ['claude', 'gpt', 'gemini'],
      timeoutMs: 120_000,
      skipUnavailable: false,
    });
    expect(report.entries).toHaveLength(3);
    for (const e of report.entries) {
      expect(['claude', 'gpt', 'gemini']).toContain(e.family);
      if (e.available) {
        expect(e.result).toBeDefined();
      } else {
        expect(typeof e.unavailable_reason).toBe('string');
      }
    }
    // At least one available provider should have produced a non-error result in a healthy CI env.
    const hadSuccess = report.entries.some(e => e.available && e.result && !e.result.error);
    // We don't hard-assert this: if NO providers are authed, skip silently.
    if (!hadSuccess) {
      process.stderr.write('\nrunBenchmark live: no provider produced a clean result (no auth?)\n');
    }
  }, 300_000);
});
+370
View File
@@ -0,0 +1,370 @@
/**
* Taste engine — end-to-end tests for `gstack-taste-update`.
*
* Covers the v1 taste profile contract: schema shape, Laplace-smoothed confidence,
* 5%/week decay, dimension extraction from reason strings, session cap, schema
* migration, conflict detection (taste drift), malformed-input recovery.
*
* All tests use GSTACK_STATE_DIR pointing at a temp dir so no real home dir is
* touched. Each test isolates its own state directory.
*/
import { describe, test, expect, beforeEach, afterEach } from 'bun:test';
import { spawnSync } from 'child_process';
import * as fs from 'fs';
import * as path from 'path';
import * as os from 'os';
const ROOT = path.resolve(import.meta.dir, '..');
const BIN = path.join(ROOT, 'bin', 'gstack-taste-update');

/** One learned value within a taste dimension (v1 profile schema). */
interface Preference {
  value: string;
  // Laplace-smoothed score; the decay tests below expect it to shrink 5%/week
  // from last_seen and never go negative.
  confidence: number;
  approved_count: number;
  rejected_count: number;
  // ISO-8601 timestamp of the most recent signal for this value.
  last_seen: string;
}

/** Shape of taste-profile.json as written by gstack-taste-update (version: 1). */
interface TasteProfile {
  version: number;
  updated_at: string;
  dimensions: Record<'fonts' | 'colors' | 'layouts' | 'aesthetics', { approved: Preference[]; rejected: Preference[] }>;
  // Rolling event log; the session-cap tests expect it truncated to 50 entries.
  sessions: Array<{ ts: string; action: 'approved' | 'rejected'; variant: string; reason?: string }>;
}
// Fresh, per-test state and work directories — no cross-test bleed.
let stateDir: string;
let workdir: string;

beforeEach(() => {
  stateDir = fs.mkdtempSync(path.join(os.tmpdir(), 'taste-state-'));
  workdir = fs.mkdtempSync(path.join(os.tmpdir(), 'taste-work-'));
  // Initialize a git repo so gstack-taste-update's getSlug() finds a toplevel
  // (-b main avoids the default-branch warning on newer git).
  spawnSync('git', ['init', '-b', 'main'], { cwd: workdir, stdio: 'pipe' });
});

afterEach(() => {
  fs.rmSync(stateDir, { recursive: true, force: true });
  fs.rmSync(workdir, { recursive: true, force: true });
});
/**
 * Run gstack-taste-update inside the throwaway git workdir. Both
 * GSTACK_STATE_DIR and HOME point at the temp state dir so nothing in the
 * real home directory is ever touched.
 */
function run(args: string[]): { status: number | null; stdout: string; stderr: string } {
  const proc = spawnSync('bun', ['run', BIN, ...args], {
    cwd: workdir,
    env: { ...process.env, GSTACK_STATE_DIR: stateDir, HOME: stateDir },
    encoding: 'utf-8',
    timeout: 10000,
  });
  const stdout = proc.stdout?.toString() ?? '';
  const stderr = proc.stderr?.toString() ?? '';
  return { status: proc.status, stdout, stderr };
}
/**
 * Where the CLI is expected to persist this workdir's profile.
 * Assumes the tool derives the project slug from the repo directory's
 * basename — TODO(review): confirm against getSlug() in the binary.
 */
function profilePath(): string {
  return path.join(stateDir, 'projects', path.basename(workdir), 'taste-profile.json');
}
/** Load and parse the profile the CLI wrote for this test's workdir. */
function readProfile(): TasteProfile {
  const raw = fs.readFileSync(profilePath(), 'utf-8');
  return JSON.parse(raw);
}
/** Seed an arbitrary profile on disk (typed `unknown` so tests can write malformed shapes). */
function writeProfile(p: unknown): void {
  const target = profilePath();
  fs.mkdirSync(path.dirname(target), { recursive: true });
  fs.writeFileSync(target, JSON.stringify(p, null, 2));
}
describe('taste-engine: first-write lifecycle', () => {
  test('approved creates profile with correct v1 schema', () => {
    const r = run(['approved', 'variant-A', '--reason', 'fonts: Geist Sans; colors: emerald']);
    expect(r.status).toBe(0);
    const p = readProfile();
    expect(p.version).toBe(1);
    expect(p.dimensions.fonts.approved).toHaveLength(1);
    expect(p.dimensions.fonts.approved[0].value).toBe('Geist Sans');
    expect(p.dimensions.fonts.approved[0].approved_count).toBe(1);
    expect(p.dimensions.fonts.approved[0].rejected_count).toBe(0);
    // Laplace: 1 / (1 + 0 + 1) = 0.5
    expect(p.dimensions.fonts.approved[0].confidence).toBeCloseTo(0.5, 5);
    // Second dimension from the same reason string lands in colors.
    expect(p.dimensions.colors.approved[0].value).toBe('emerald');
    // The raw session event is recorded alongside the extracted dimensions.
    expect(p.sessions).toHaveLength(1);
    expect(p.sessions[0].action).toBe('approved');
    expect(p.sessions[0].variant).toBe('variant-A');
  });

  test('rejected bumps rejected_count not approved_count', () => {
    run(['rejected', 'variant-B', '--reason', 'fonts: Comic Sans']);
    const p = readProfile();
    expect(p.dimensions.fonts.rejected).toHaveLength(1);
    expect(p.dimensions.fonts.rejected[0].rejected_count).toBe(1);
    expect(p.dimensions.fonts.rejected[0].approved_count).toBe(0);
    // A rejection must not create a mirror entry on the approved side.
    expect(p.dimensions.fonts.approved).toHaveLength(0);
  });

  test('session recorded even when no dimensions extractable from reason', () => {
    const r = run(['approved', 'variant-C']); // no --reason
    expect(r.status).toBe(0);
    const p = readProfile();
    expect(p.sessions).toHaveLength(1);
    // All four dimensions stay empty when there is nothing to extract.
    for (const dim of ['fonts', 'colors', 'layouts', 'aesthetics'] as const) {
      expect(p.dimensions[dim].approved).toHaveLength(0);
      expect(p.dimensions[dim].rejected).toHaveLength(0);
    }
  });
});
describe('taste-engine: Laplace-smoothed confidence', () => {
  test('repeated approvals raise confidence toward 1', () => {
    for (const i of [0, 1, 2, 3, 4]) {
      run(['approved', `variant-${i}`, '--reason', 'fonts: Geist Sans']);
    }
    const profile = readProfile();
    const entry = profile.dimensions.fonts.approved[0];
    expect(entry.approved_count).toBe(5);
    // Laplace smoothing: 5 / (5 + 0 + 1) = 0.8333…
    expect(entry.confidence).toBeCloseTo(5 / 6, 5);
  });

  test('mixed approvals + rejections balance out', () => {
    run(['approved', 'v1', '--reason', 'fonts: Inter']);
    run(['approved', 'v2', '--reason', 'fonts: Inter']);
    run(['rejected', 'v3', '--reason', 'fonts: Inter']);
    const profile = readProfile();
    // Approvals and rejections accumulate on separate entries for the same value.
    const yes = profile.dimensions.fonts.approved[0];
    const no = profile.dimensions.fonts.rejected[0];
    expect(yes.approved_count).toBe(2);
    expect(yes.rejected_count).toBe(0);
    expect(no.rejected_count).toBe(1);
    expect(no.approved_count).toBe(0);
  });
});
describe('taste-engine: decay math', () => {
  test('show applies 5%/week decay to stored confidence', () => {
    // Seed with a profile where the single approved font was last_seen 4 weeks ago
    const fourWeeksAgo = new Date(Date.now() - 4 * 7 * 24 * 60 * 60 * 1000).toISOString();
    writeProfile({
      version: 1,
      updated_at: new Date().toISOString(),
      dimensions: {
        fonts: {
          approved: [{ value: 'Aged Font', confidence: 0.8, approved_count: 4, rejected_count: 0, last_seen: fourWeeksAgo }],
          rejected: [],
        },
        colors: { approved: [], rejected: [] },
        layouts: { approved: [], rejected: [] },
        aesthetics: { approved: [], rejected: [] },
      },
      sessions: [],
    });
    const r = run(['show']);
    expect(r.status).toBe(0);
    // After 4 weeks: 0.8 * (0.95)^4 ≈ 0.651
    const expectedConf = 0.8 * Math.pow(0.95, 4);
    // `show` is expected to print "<value> — conf <float>"; parse the float back out.
    const match = r.stdout.match(/Aged Font — conf (\d+\.\d+)/);
    expect(match).toBeTruthy();
    const displayedConf = parseFloat(match![1]);
    expect(displayedConf).toBeCloseTo(expectedConf, 2);
  });

  test('decay never goes below zero', () => {
    // 3 years ≈ 156 weeks. 0.95^156 ≈ 0.00036, well below 0.01.
    const yearsAgo = new Date(Date.now() - 3 * 365 * 24 * 60 * 60 * 1000).toISOString();
    writeProfile({
      version: 1,
      updated_at: new Date().toISOString(),
      dimensions: {
        fonts: {
          approved: [{ value: 'Ancient', confidence: 1.0, approved_count: 1, rejected_count: 0, last_seen: yearsAgo }],
          rejected: [],
        },
        colors: { approved: [], rejected: [] },
        layouts: { approved: [], rejected: [] },
        aesthetics: { approved: [], rejected: [] },
      },
      sessions: [],
    });
    const r = run(['show']);
    expect(r.status).toBe(0);
    const match = r.stdout.match(/Ancient — conf (\d+\.\d+)/);
    expect(match).toBeTruthy();
    const conf = parseFloat(match![1]);
    // Decay must clamp at 0 (never negative) while still being effectively zero.
    expect(conf).toBeGreaterThanOrEqual(0);
    expect(conf).toBeLessThan(0.01);
  });
});
describe('taste-engine: dimension extraction', () => {
  // Reason strings use "<dimension>: <value>[, <value>…]" segments joined by ';'.
  test('parses multiple dimensions from one reason string', () => {
    run(['approved', 'v1', '--reason', 'fonts: Geist, IBM Plex; colors: emerald; layouts: grid-12; aesthetics: brutalist']);
    const p = readProfile();
    // Comma-separated values within one dimension become separate entries.
    expect(p.dimensions.fonts.approved.map(x => x.value).sort()).toEqual(['Geist', 'IBM Plex']);
    expect(p.dimensions.colors.approved[0].value).toBe('emerald');
    expect(p.dimensions.layouts.approved[0].value).toBe('grid-12');
    expect(p.dimensions.aesthetics.approved[0].value).toBe('brutalist');
  });

  test('value matching is case-insensitive', () => {
    run(['approved', 'v1', '--reason', 'fonts: Geist']);
    run(['approved', 'v2', '--reason', 'fonts: GEIST']);
    const p = readProfile();
    // Should merge into a single entry
    expect(p.dimensions.fonts.approved).toHaveLength(1);
    expect(p.dimensions.fonts.approved[0].approved_count).toBe(2);
  });

  test('unknown dimension labels are silently ignored', () => {
    run(['approved', 'v1', '--reason', 'weather: sunny; mood: happy']);
    const p = readProfile();
    // Session still recorded
    expect(p.sessions).toHaveLength(1);
    // No dimensions populated
    for (const dim of ['fonts', 'colors', 'layouts', 'aesthetics'] as const) {
      expect(p.dimensions[dim].approved).toHaveLength(0);
    }
  });
});
describe('taste-engine: session cap', () => {
  test('sessions truncate to last 50 entries (FIFO)', () => {
    // Record 55 approvals; the cap should retain only the newest 50.
    Array.from({ length: 55 }, (_, idx) => idx).forEach(idx => {
      run(['approved', `v${idx}`, '--reason', 'fonts: Geist']);
    });
    const profile = readProfile();
    expect(profile.sessions).toHaveLength(50);
    // Oldest five (v0–v4) dropped; retained window starts at v5.
    expect(profile.sessions[0].variant).toBe('v5');
    expect(profile.sessions[49].variant).toBe('v54');
  });
});
describe('taste-engine: taste drift conflict detection', () => {
  test('warns when approved value has strong opposite signal', () => {
    // Seed a strong rejected entry: 4 rejections, no approvals → Laplace = 0/5 but that's
    // not > 0.6. Let's seed it directly with confidence 0.8.
    writeProfile({
      version: 1,
      updated_at: new Date().toISOString(),
      dimensions: {
        fonts: {
          approved: [],
          rejected: [
            { value: 'Comic Sans', confidence: 0.8, approved_count: 0, rejected_count: 4, last_seen: new Date().toISOString() },
          ],
        },
        colors: { approved: [], rejected: [] },
        layouts: { approved: [], rejected: [] },
        aesthetics: { approved: [], rejected: [] },
      },
      sessions: [],
    });
    const result = run(['approved', 'v1', '--reason', 'fonts: Comic Sans']);
    expect(result.status).toBe(0);
    // "taste drift" note should go to stderr
    expect(result.stderr).toContain('taste drift');
    expect(result.stderr).toContain('Comic Sans');
  });
  test('does NOT warn when signal is weak', () => {
    writeProfile({
      version: 1,
      updated_at: new Date().toISOString(),
      dimensions: {
        fonts: {
          approved: [],
          // Single rejection (< 3) — shouldn't trigger drift warning
          rejected: [
            { value: 'Inter', confidence: 0.5, approved_count: 0, rejected_count: 1, last_seen: new Date().toISOString() },
          ],
        },
        colors: { approved: [], rejected: [] },
        layouts: { approved: [], rejected: [] },
        aesthetics: { approved: [], rejected: [] },
      },
      sessions: [],
    });
    const result = run(['approved', 'v1', '--reason', 'fonts: Inter']);
    expect(result.status).toBe(0);
    expect(result.stderr).not.toContain('taste drift');
  });
});
describe('taste-engine: migration', () => {
  test('legacy profile without version gets migrated to v1', () => {
    // Simulate a legacy approved.json-style structure
    writeProfile({
      // no version field
      dimensions: {
        fonts: {
          approved: [
            { value: 'Legacy', confidence: 0.7, approved_count: 3, rejected_count: 1, last_seen: new Date().toISOString() },
          ],
          rejected: [],
        },
      },
      sessions: [{ ts: new Date().toISOString(), action: 'approved', variant: 'legacy-v1' }],
    });
    const result = run(['migrate']);
    expect(result.status).toBe(0);
    const migrated = readProfile();
    expect(migrated.version).toBe(1);
    expect(migrated.dimensions.fonts.approved[0].value).toBe('Legacy');
    // Dimensions absent from the legacy file are backfilled.
    for (const dim of ['colors', 'layouts', 'aesthetics'] as const) {
      expect(migrated.dimensions[dim]).toBeDefined();
    }
    expect(migrated.sessions).toHaveLength(1);
    expect(migrated.sessions[0].variant).toBe('legacy-v1');
  });
  test('migration truncates oversized sessions array to last 50', () => {
    const legacySessions = Array.from({ length: 100 }, (_, idx) => ({
      ts: new Date().toISOString(),
      action: 'approved' as const,
      variant: `legacy-${idx}`,
    }));
    writeProfile({ dimensions: {}, sessions: legacySessions });
    const result = run(['migrate']);
    expect(result.status).toBe(0);
    const migrated = readProfile();
    // Oldest half dropped; newest 50 retained in order.
    expect(migrated.sessions).toHaveLength(50);
    expect(migrated.sessions[0].variant).toBe('legacy-50');
    expect(migrated.sessions[49].variant).toBe('legacy-99');
  });
});
describe('taste-engine: resilience', () => {
  test('malformed JSON profile falls back to empty and does not crash', () => {
    const profileFile = profilePath();
    fs.mkdirSync(path.dirname(profileFile), { recursive: true });
    fs.writeFileSync(profileFile, '{ this is not json');
    const result = run(['approved', 'v1', '--reason', 'fonts: Geist']);
    // Should succeed (graceful fallback)
    expect(result.status).toBe(0);
    // Warning on stderr
    expect(result.stderr).toContain('WARN');
    // File should now be valid JSON
    const recovered = readProfile();
    expect(recovered.version).toBe(1);
    expect(recovered.dimensions.fonts.approved[0].value).toBe('Geist');
  });
  test('show on nonexistent profile prints empty summary without error', () => {
    const result = run(['show']);
    expect(result.status).toBe(0);
    expect(result.stdout).toContain('taste-profile.json');
  });
  test('approved without variant arg exits non-zero with usage hint', () => {
    const result = run(['approved']);
    expect(result.status).not.toBe(0);
    expect(result.stderr).toContain('Usage');
  });
  test('unknown command exits non-zero', () => {
    const result = run(['banana']);
    expect(result.status).not.toBe(0);
    expect(result.stderr).toContain('Usage');
  });
});