mirror of
https://github.com/garrytan/gstack.git
synced 2026-05-05 13:15:24 +02:00
test: lite E2E coverage for benchmark, taste engine, publish
Fills real coverage gaps in v0.19.0.0 primitives. 48 new deterministic
tests (gate tier, ~3s) + 8 live-API tests (periodic tier).
New gate-tier test files (free, <3s total):
- test/taste-engine.test.ts — 24 tests against gstack-taste-update:
schema shape, Laplace-smoothed confidence, 5%/week decay clamped at 0,
multi-dimension extraction, case-insensitive matching, session cap,
legacy profile migration with session truncation, taste-drift conflict
warning, malformed-JSON recovery, missing-variant exit code.
- test/publish-dry-run.test.ts — 13 tests against gstack-publish --dry-run:
manifest parsing, missing/malformed JSON, per-skill validation errors
(missing source file / slug / version / marketplaces), slug filter,
unknown-skill exit, per-marketplace auth isolation (fake marketplaces
with always-pass / always-fail / missing-binary CLIs), and a sanity
check against the real repo manifest.
- test/benchmark-cli.test.ts — 11 tests against gstack-model-benchmark
--dry-run: provider default, unknown-provider WARN, empty list
fallback, flag passthrough (timeout/workdir/judge/output), long-prompt
truncation, prompt resolution (inline vs file vs positional), missing
prompt exit.
New periodic-tier test file (paid, gated EVALS=1):
- test/skill-e2e-benchmark-providers.test.ts — 8 tests hitting real
claude, codex, gemini CLIs with a trivial prompt (~$0.001/provider).
Verifies output parsing, token accounting, cost estimation, timeout
error.code semantics, Promise.allSettled parallel isolation.
Per-provider availability gate — unauthed providers skip cleanly.
This suite already caught one real bug (codex adapter missing
--skip-git-repo-check, fixed in 5260987d).
Registered `benchmark-providers-live` in touchfiles.ts (periodic tier,
triggered by changes to bin/gstack-model-benchmark, providers/**,
benchmark-runner.ts, pricing.ts).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,134 @@
|
||||
/**
|
||||
* gstack-model-benchmark CLI tests (offline).
|
||||
*
|
||||
* Covers CLI wiring that unit tests against benchmark-runner.ts can't see:
|
||||
* - --dry-run auth/provider-list resolution
|
||||
* - unknown provider WARN path
|
||||
* - provider default (claude) when --models omitted
|
||||
* - prompt resolution (inline --prompt vs positional file path)
|
||||
* - output format flag wiring via --dry-run (avoids real CLI invocation)
|
||||
*
|
||||
* All tests use --dry-run so no API calls happen.
|
||||
*/
|
||||
|
||||
import { describe, test, expect } from 'bun:test';
|
||||
import { spawnSync } from 'child_process';
|
||||
import * as fs from 'fs';
|
||||
import * as path from 'path';
|
||||
import * as os from 'os';
|
||||
|
||||
const ROOT = path.resolve(import.meta.dir, '..');
|
||||
const BIN = path.join(ROOT, 'bin', 'gstack-model-benchmark');
|
||||
|
||||
function run(args: string[], opts: { env?: Record<string, string> } = {}): { status: number | null; stdout: string; stderr: string } {
|
||||
const result = spawnSync('bun', ['run', BIN, ...args], {
|
||||
cwd: ROOT,
|
||||
env: { ...process.env, ...opts.env },
|
||||
encoding: 'utf-8',
|
||||
timeout: 15000,
|
||||
});
|
||||
return {
|
||||
status: result.status,
|
||||
stdout: result.stdout?.toString() ?? '',
|
||||
stderr: result.stderr?.toString() ?? '',
|
||||
};
|
||||
}
|
||||
|
||||
// Offline CLI-wiring suite: every test passes --dry-run so no prompts are sent.
describe('gstack-model-benchmark --dry-run', () => {
  test('prints provider availability report and exits 0', () => {
    const r = run(['--prompt', 'hi', '--models', 'claude,gpt,gemini', '--dry-run']);
    expect(r.status).toBe(0);
    expect(r.stdout).toContain('gstack-model-benchmark --dry-run');
    expect(r.stdout).toContain('claude');
    expect(r.stdout).toContain('gpt');
    expect(r.stdout).toContain('gemini');
    // Dry-run banner must explicitly state that no prompts were dispatched.
    expect(r.stdout).toContain('no prompts sent');
  });

  test('reports default provider when --models omitted', () => {
    const r = run(['--prompt', 'hi', '--dry-run']);
    expect(r.status).toBe(0);
    // Omitting --models should fall back to the claude default.
    expect(r.stdout).toContain('providers: claude');
  });

  test('unknown provider in --models emits WARN and is dropped', () => {
    const r = run(['--prompt', 'hi', '--models', 'claude,gpt-42-fake', '--dry-run']);
    // Unknown providers warn (stderr) but do not fail the run.
    expect(r.status).toBe(0);
    expect(r.stderr).toContain('unknown provider');
    expect(r.stderr).toContain('gpt-42-fake');
    // The bogus name must be dropped from the resolved provider list.
    expect(r.stdout).toContain('providers: claude');
    expect(r.stdout).not.toContain('gpt-42-fake');
  });

  test('empty --models list falls back to claude default', () => {
    const r = run(['--prompt', 'hi', '--models', '', '--dry-run']);
    expect(r.status).toBe(0);
    expect(r.stdout).toContain('providers: claude');
  });

  test('--timeout-ms and --workdir flags flow through to dry-run report', () => {
    const r = run(['--prompt', 'hi', '--timeout-ms', '9999', '--workdir', '/tmp', '--dry-run']);
    expect(r.status).toBe(0);
    expect(r.stdout).toContain('timeout_ms: 9999');
    expect(r.stdout).toContain('workdir: /tmp');
  });

  test('--judge flag reported in dry-run output', () => {
    const r = run(['--prompt', 'hi', '--judge', '--dry-run']);
    expect(r.status).toBe(0);
    expect(r.stdout).toContain('judge: on');
  });

  test('--output flag reported in dry-run', () => {
    const r = run(['--prompt', 'hi', '--output', 'json', '--dry-run']);
    expect(r.status).toBe(0);
    expect(r.stdout).toContain('output: json');
  });

  test('each adapter reports either OK or NOT READY, never crashes', () => {
    const r = run(['--prompt', 'hi', '--models', 'claude,gpt,gemini', '--dry-run']);
    expect(r.status).toBe(0);
    // Each provider line must end in OK or NOT READY
    const lines = r.stdout.split('\n');
    // Provider lines are indented and start with "<name>:" — match exactly those.
    const adapterLines = lines.filter(l => /^\s+(claude|gpt|gemini):/.test(l));
    expect(adapterLines.length).toBe(3);
    for (const line of adapterLines) {
      expect(line).toMatch(/(OK|NOT READY)/);
    }
  });

  test('long prompt is truncated in dry-run display', () => {
    const longPrompt = 'x'.repeat(200);
    const r = run(['--prompt', longPrompt, '--dry-run']);
    expect(r.status).toBe(0);
    // Summary truncates to 80 chars + ellipsis
    expect(r.stdout).toMatch(/prompt:\s+x{80}…/);
  });
});
|
||||
|
||||
// Prompt-source precedence: positional arg may be a readable file (contents
// become the prompt) or a literal inline string; no prompt at all is an error.
describe('gstack-model-benchmark prompt resolution', () => {
  test('positional file path is read and passed as prompt', () => {
    const tmp = fs.mkdtempSync(path.join(os.tmpdir(), 'bench-prompt-'));
    const promptFile = path.join(tmp, 'prompt.txt');
    fs.writeFileSync(promptFile, 'hello from file');
    try {
      const r = run([promptFile, '--dry-run']);
      expect(r.status).toBe(0);
      // The file's *contents*, not its path, must appear in the dry-run summary.
      expect(r.stdout).toContain('hello from file');
    } finally {
      fs.rmSync(tmp, { recursive: true, force: true });
    }
  });

  test('positional non-file arg is treated as inline prompt', () => {
    const r = run(['treat-me-as-inline', '--dry-run']);
    expect(r.status).toBe(0);
    expect(r.stdout).toContain('treat-me-as-inline');
  });

  test('missing prompt exits non-zero', () => {
    const r = run(['--dry-run']);
    expect(r.status).not.toBe(0);
    expect(r.stderr).toContain('specify a prompt');
  });
});
|
||||
@@ -171,6 +171,9 @@ export const E2E_TOUCHFILES: Record<string, string[]> = {
|
||||
// Autoplan
|
||||
'autoplan-core': ['autoplan/**', 'plan-ceo-review/**', 'plan-eng-review/**', 'plan-design-review/**'],
|
||||
|
||||
// Multi-provider benchmark adapters — live API smoke against real claude/codex/gemini CLIs
|
||||
'benchmark-providers-live': ['bin/gstack-model-benchmark', 'test/helpers/providers/**', 'test/helpers/benchmark-runner.ts', 'test/helpers/pricing.ts'],
|
||||
|
||||
// Skill routing — journey-stage tests (depend on ALL skill descriptions)
|
||||
'journey-ideation': ['*/SKILL.md.tmpl', 'SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'],
|
||||
'journey-plan-eng': ['*/SKILL.md.tmpl', 'SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'],
|
||||
@@ -316,6 +319,9 @@ export const E2E_TIERS: Record<string, 'gate' | 'periodic'> = {
|
||||
// Autoplan — periodic (not yet implemented)
|
||||
'autoplan-core': 'periodic',
|
||||
|
||||
// Multi-provider benchmark — periodic (requires external CLIs + auth, paid)
|
||||
'benchmark-providers-live': 'periodic',
|
||||
|
||||
// Skill routing — periodic (LLM routing is non-deterministic)
|
||||
'journey-ideation': 'periodic',
|
||||
'journey-plan-eng': 'periodic',
|
||||
|
||||
@@ -0,0 +1,281 @@
|
||||
/**
|
||||
* gstack-publish end-to-end tests via --dry-run.
|
||||
*
|
||||
* Verifies manifest parsing, schema validation, marketplace auth checks, per-skill
|
||||
* error isolation, and command building — all without touching real marketplaces.
|
||||
*
|
||||
* --dry-run does NOT run execSync on publish commands. Auth checks still run
|
||||
* against real binaries; we use fake marketplaces whose `auth_check` commands
|
||||
* are always-succeed (`true`) or always-fail (`false`) so the test is hermetic.
|
||||
*/
|
||||
|
||||
import { describe, test, expect, beforeEach, afterEach } from 'bun:test';
|
||||
import { spawnSync } from 'child_process';
|
||||
import * as fs from 'fs';
|
||||
import * as path from 'path';
|
||||
import * as os from 'os';
|
||||
|
||||
const ROOT = path.resolve(import.meta.dir, '..');
|
||||
const BIN = path.join(ROOT, 'bin', 'gstack-publish');
|
||||
|
||||
let sandbox: string;
|
||||
let binCopy: string;
|
||||
|
||||
// Build a fresh sandbox per test: a throwaway dir mirroring the repo layout,
// with its own copy of the binary so each test controls its own skills.json.
beforeEach(() => {
  // gstack-publish reads skills.json relative to the binary's dir (import.meta.dir/..).
  // To isolate each test's manifest, we create a sandbox repo that mirrors the real
  // structure: copy the bin into sandbox/bin/, write a controlled skills.json at the root.
  sandbox = fs.mkdtempSync(path.join(os.tmpdir(), 'publish-sandbox-'));
  fs.mkdirSync(path.join(sandbox, 'bin'));
  fs.mkdirSync(path.join(sandbox, 'test', 'helpers'), { recursive: true });
  binCopy = path.join(sandbox, 'bin', 'gstack-publish');
  fs.copyFileSync(BIN, binCopy);
  // Restore the executable bit lost by copyFileSync.
  fs.chmodSync(binCopy, 0o755);
});

// Tear the sandbox down unconditionally; force:true tolerates partial setup.
afterEach(() => {
  fs.rmSync(sandbox, { recursive: true, force: true });
});
|
||||
|
||||
function writeManifest(manifest: object): void {
|
||||
fs.writeFileSync(path.join(sandbox, 'skills.json'), JSON.stringify(manifest, null, 2));
|
||||
}
|
||||
|
||||
function writeSkillFile(relPath: string, content = '# Test Skill\n'): void {
|
||||
const full = path.join(sandbox, relPath);
|
||||
fs.mkdirSync(path.dirname(full), { recursive: true });
|
||||
fs.writeFileSync(full, content);
|
||||
}
|
||||
|
||||
function run(args: string[]): { status: number | null; stdout: string; stderr: string } {
|
||||
const result = spawnSync('bun', ['run', binCopy, ...args], {
|
||||
cwd: sandbox,
|
||||
encoding: 'utf-8',
|
||||
timeout: 15000,
|
||||
});
|
||||
return {
|
||||
status: result.status,
|
||||
stdout: result.stdout?.toString() ?? '',
|
||||
stderr: result.stderr?.toString() ?? '',
|
||||
};
|
||||
}
|
||||
|
||||
// Three hermetic fake marketplaces covering every auth-check outcome.
// `true` / `false` are the standard coreutils binaries (exit 0 / exit 1),
// so the tests touch no real marketplace CLI and no network.
const VALID_MARKETPLACES = {
  fakestore_ok: {
    cli: 'true', // binary that always succeeds
    login_cmd: 'fakestore_ok login',
    publish_cmd_template: 'echo publish {slug} {version}',
    docs: 'https://fakestore.example',
    auth_check: 'true', // always-authenticated
  },
  fakestore_noauth: {
    cli: 'true',
    login_cmd: 'fakestore_noauth login',
    publish_cmd_template: 'echo publish {slug} {version}',
    docs: 'https://fakestore.example',
    auth_check: 'false', // always-fails auth
  },
  fakestore_missing: {
    // CLI binary not on PATH — exercises the "not installed" path.
    cli: 'nonexistent-binary-xyz',
    login_cmd: 'fakestore_missing login',
    publish_cmd_template: 'echo publish {slug} {version}',
    docs: 'https://fakestore.example',
    auth_check: 'nonexistent-binary-xyz whoami',
  },
};
|
||||
|
||||
function validSkill(slug: string, sourceRel: string, marketplaces: string[] = ['fakestore_ok']) {
|
||||
const m: Record<string, { slug: string; publish: boolean }> = {};
|
||||
for (const name of marketplaces) m[name] = { slug, publish: true };
|
||||
return {
|
||||
slug,
|
||||
source: sourceRel,
|
||||
name: `Skill ${slug}`,
|
||||
version: '1.0.0',
|
||||
category: 'test',
|
||||
description: 'A test skill',
|
||||
marketplaces: m,
|
||||
standalone: true,
|
||||
compatible_hosts: ['claude-code'],
|
||||
};
|
||||
}
|
||||
|
||||
// Manifest discovery and parsing: --list enumeration, missing file, bad JSON.
describe('gstack-publish: manifest loading', () => {
  test('--list prints every skill and marketplace', () => {
    writeSkillFile('skills/alpha/SKILL.md');
    writeSkillFile('skills/beta/SKILL.md');
    writeManifest({
      version: '1.0.0',
      description: 't',
      skills: [validSkill('alpha', 'skills/alpha/SKILL.md'), validSkill('beta', 'skills/beta/SKILL.md')],
      marketplaces: VALID_MARKETPLACES,
    });
    const r = run(['--list']);
    expect(r.status).toBe(0);
    expect(r.stdout).toContain('alpha');
    expect(r.stdout).toContain('beta');
    expect(r.stdout).toContain('fakestore_ok');
  });

  test('missing manifest exits non-zero', () => {
    // beforeEach never writes skills.json, so the sandbox has no manifest.
    const r = run(['--dry-run']);
    expect(r.status).not.toBe(0);
    expect(r.stderr).toContain('skills.json');
  });

  test('malformed JSON exits non-zero', () => {
    fs.writeFileSync(path.join(sandbox, 'skills.json'), '{ not json');
    const r = run(['--dry-run']);
    expect(r.status).not.toBe(0);
    // Error message must indicate a parse failure, not a generic crash.
    expect(r.stderr).toContain('parse');
  });
});
|
||||
|
||||
// Per-skill schema validation: each missing required field produces a named
// error on stderr and a non-zero exit, before any publish is attempted.
describe('gstack-publish: validation', () => {
  test('missing source file reports validation error and exits 1', () => {
    writeManifest({
      version: '1.0.0',
      description: 't',
      skills: [validSkill('ghost', 'skills/ghost/DOES_NOT_EXIST.md')],
      marketplaces: VALID_MARKETPLACES,
    });
    const r = run(['--dry-run']);
    expect(r.status).not.toBe(0);
    expect(r.stderr).toContain('source file missing');
    // The offending skill's slug must be named in the error.
    expect(r.stderr).toContain('ghost');
  });

  test('missing slug reports validation error', () => {
    writeSkillFile('skills/x/SKILL.md');
    const s = validSkill('temp', 'skills/x/SKILL.md');
    // Cast to Partial so `delete` of a required field type-checks.
    delete (s as Partial<typeof s>).slug;
    writeManifest({
      version: '1.0.0',
      description: 't',
      skills: [s],
      marketplaces: VALID_MARKETPLACES,
    });
    const r = run(['--dry-run']);
    expect(r.status).not.toBe(0);
    expect(r.stderr).toContain('missing slug');
  });

  test('missing version reports validation error', () => {
    writeSkillFile('skills/x/SKILL.md');
    const s = validSkill('x', 'skills/x/SKILL.md');
    delete (s as Partial<typeof s>).version;
    writeManifest({
      version: '1.0.0',
      description: 't',
      skills: [s],
      marketplaces: VALID_MARKETPLACES,
    });
    const r = run(['--dry-run']);
    expect(r.status).not.toBe(0);
    expect(r.stderr).toContain('missing version');
  });

  test('no marketplaces configured reports validation error', () => {
    writeSkillFile('skills/x/SKILL.md');
    // Valid skill, but with an empty marketplaces map — nowhere to publish.
    const s = { ...validSkill('x', 'skills/x/SKILL.md'), marketplaces: {} };
    writeManifest({ version: '1.0.0', description: 't', skills: [s], marketplaces: VALID_MARKETPLACES });
    const r = run(['--dry-run']);
    expect(r.status).not.toBe(0);
    expect(r.stderr).toContain('no marketplaces configured');
  });
});
|
||||
|
||||
// Dispatch behavior under --dry-run: happy path, slug filtering, unknown slug.
describe('gstack-publish: dry-run execution', () => {
  test('happy path reports DRY-RUN tag and templated command', () => {
    writeSkillFile('skills/alpha/SKILL.md');
    writeManifest({
      version: '1.0.0',
      description: 't',
      skills: [validSkill('alpha', 'skills/alpha/SKILL.md')],
      marketplaces: VALID_MARKETPLACES,
    });
    const r = run(['--dry-run']);
    expect(r.status).toBe(0);
    expect(r.stdout).toContain('DRY-RUN');
    expect(r.stdout).toContain('alpha');
    // Summary counters: one skill dispatched, zero failures.
    expect(r.stdout).toContain('Published: 1');
    expect(r.stdout).toContain('Failed: 0');
  });

  test('per-skill filter publishes only the requested slug', () => {
    writeSkillFile('skills/alpha/SKILL.md');
    writeSkillFile('skills/beta/SKILL.md');
    writeManifest({
      version: '1.0.0',
      description: 't',
      skills: [validSkill('alpha', 'skills/alpha/SKILL.md'), validSkill('beta', 'skills/beta/SKILL.md')],
      marketplaces: VALID_MARKETPLACES,
    });
    // Positional slug narrows the publish set to that one skill.
    const r = run(['alpha', '--dry-run']);
    expect(r.status).toBe(0);
    expect(r.stdout).toContain('Publishing alpha');
    expect(r.stdout).not.toContain('Publishing beta');
    expect(r.stdout).toContain('Published: 1');
  });

  test('unknown skill filter exits non-zero', () => {
    writeSkillFile('skills/alpha/SKILL.md');
    writeManifest({
      version: '1.0.0',
      description: 't',
      skills: [validSkill('alpha', 'skills/alpha/SKILL.md')],
      marketplaces: VALID_MARKETPLACES,
    });
    const r = run(['nonexistent', '--dry-run']);
    expect(r.status).not.toBe(0);
    expect(r.stderr).toContain('skill not found');
  });
});
|
||||
|
||||
// Auth checks run per-marketplace; one failing marketplace must not take
// down the rest of the batch. Fakes make each outcome deterministic.
describe('gstack-publish: auth check isolation', () => {
  test('failing auth for one marketplace does NOT abort the batch in dry-run', () => {
    writeSkillFile('skills/alpha/SKILL.md');
    writeManifest({
      version: '1.0.0',
      description: 't',
      skills: [validSkill('alpha', 'skills/alpha/SKILL.md', ['fakestore_ok', 'fakestore_noauth'])],
      marketplaces: VALID_MARKETPLACES,
    });
    const r = run(['--dry-run']);
    // In dry-run, auth failures are reported but don't block dispatch
    expect(r.status).toBe(0);
    expect(r.stdout).toContain('fakestore_ok: OK');
    expect(r.stdout).toContain('fakestore_noauth: NOT READY');
  });

  test('missing binary reported as not-ready with docs link', () => {
    writeSkillFile('skills/alpha/SKILL.md');
    writeManifest({
      version: '1.0.0',
      description: 't',
      skills: [validSkill('alpha', 'skills/alpha/SKILL.md', ['fakestore_missing'])],
      marketplaces: VALID_MARKETPLACES,
    });
    const r = run(['--dry-run']);
    expect(r.stdout).toContain('fakestore_missing: NOT READY');
    // Message should explain *why*: the marketplace CLI is not installed.
    expect(r.stdout).toContain('not on PATH');
  });
});
|
||||
|
||||
// One non-sandboxed check: the repo's actual skills.json must stay valid.
// Deliberately bypasses run()/binCopy — it targets the real ROOT, not the sandbox.
describe('gstack-publish: real manifest sanity', () => {
  test('the real repo skills.json passes --dry-run validation', () => {
    // This uses the actual bin against the actual manifest (ROOT/skills.json).
    // If auth to any real marketplace isn't set up it just reports NOT READY;
    // --dry-run still exits 0 because it doesn't require auth to pass.
    const real = spawnSync('bun', ['run', path.join(ROOT, 'bin', 'gstack-publish'), '--dry-run'], {
      cwd: ROOT,
      encoding: 'utf-8',
      timeout: 20000,
    });
    expect(real.status).toBe(0);
    expect(real.stdout).toContain('Validating manifest');
    // Every skill in the real manifest should pass validation
    expect(real.stderr).not.toContain('Manifest validation failed');
  });
});
|
||||
@@ -0,0 +1,175 @@
|
||||
/**
|
||||
* Multi-provider benchmark adapter E2E — hit real claude, codex, gemini CLIs.
|
||||
*
|
||||
* Periodic tier: runs under `bun run test:e2e` with EVALS=1. Each provider gated
|
||||
* on its own `available()` check so missing auth skips that provider (doesn't
|
||||
* abort the batch). Uses the simplest possible prompt ("Reply with exactly: ok")
|
||||
* to keep cost near $0.001/provider/run.
|
||||
*
|
||||
* What this catches that unit tests don't:
|
||||
* - CLI output-format drift (the #1 silent breakage path)
|
||||
* - Token parsing from real provider responses
|
||||
* - Auth-failure vs timeout vs rate-limit error code routing
|
||||
* - Cost estimation on real token counts
|
||||
* - Parallel execution via Promise.allSettled — slow provider doesn't block fast
|
||||
*
|
||||
* NOT covered here (would need dedicated test files):
|
||||
* - Quality judge integration (benchmark-judge.ts, adds ~$0.05/run)
|
||||
* - Multi-turn tool-using prompts — our single-turn smoke skips `toolCalls > 0`
|
||||
*/
|
||||
|
||||
import { describe, test, expect } from 'bun:test';
|
||||
import { ClaudeAdapter } from './helpers/providers/claude';
|
||||
import { GptAdapter } from './helpers/providers/gpt';
|
||||
import { GeminiAdapter } from './helpers/providers/gemini';
|
||||
import { runBenchmark } from './helpers/benchmark-runner';
|
||||
import * as fs from 'fs';
|
||||
import * as path from 'path';
|
||||
import * as os from 'os';
|
||||
|
||||
// --- Prerequisites / gating ---

// Whole suite is gated behind EVALS=1 (paid, periodic tier).
const evalsEnabled = !!process.env.EVALS;
const describeIfEvals = evalsEnabled ? describe : describe.skip;

const PROMPT = 'Reply with exactly this text and nothing else: ok';

// Per-provider gate — each test checks its own availability and skips cleanly.
// We construct adapters outside `test` so Bun's test reporter shows the skip reason.
const claude = new ClaudeAdapter();
const gpt = new GptAdapter();
const gemini = new GeminiAdapter();

// Use a temp working directory so provider CLIs can't accidentally touch the repo.
// NOTE(review): this dir is never removed — an afterAll cleanup would avoid
// accumulating tmpdirs across runs; confirm live CLIs don't hold the cwd first.
const workdir = fs.mkdtempSync(path.join(os.tmpdir(), 'bench-e2e-'));
|
||||
|
||||
// Live smoke against real provider CLIs. Each test self-gates on available()
// and returns early (reported as pass) when the provider is unauthed, so a
// partially-configured machine still runs the rest of the batch.
describeIfEvals('multi-provider benchmark adapters (live)', () => {
  test('claude: available() returns structured ok/reason', async () => {
    const check = await claude.available();
    expect(check).toHaveProperty('ok');
    if (!check.ok) {
      // When unavailable, the adapter must say why (non-empty reason string).
      expect(typeof check.reason).toBe('string');
      expect(check.reason!.length).toBeGreaterThan(0);
    }
  });

  test('gpt: available() returns structured ok/reason', async () => {
    const check = await gpt.available();
    expect(check).toHaveProperty('ok');
    if (!check.ok) {
      expect(typeof check.reason).toBe('string');
    }
  });

  test('gemini: available() returns structured ok/reason', async () => {
    const check = await gemini.available();
    expect(check).toHaveProperty('ok');
    if (!check.ok) {
      expect(typeof check.reason).toBe('string');
    }
  });

  test('claude: trivial prompt produces parseable output', async () => {
    const check = await claude.available();
    if (!check.ok) {
      process.stderr.write(`\nclaude live smoke: SKIPPED — ${check.reason}\n`);
      return;
    }
    const result = await claude.run({ prompt: PROMPT, workdir, timeoutMs: 120_000 });
    if (result.error) {
      throw new Error(`claude errored: ${result.error.code} — ${result.error.reason}`);
    }
    // Output parsing, token accounting, timing, model id, and cost estimation
    // must all come back populated from a real response.
    expect(result.output.toLowerCase()).toContain('ok');
    expect(result.tokens.input).toBeGreaterThan(0);
    expect(result.tokens.output).toBeGreaterThan(0);
    expect(result.durationMs).toBeGreaterThan(0);
    expect(typeof result.modelUsed).toBe('string');
    expect(result.modelUsed.length).toBeGreaterThan(0);
    const cost = claude.estimateCost(result.tokens, result.modelUsed);
    expect(cost).toBeGreaterThan(0);
  }, 150_000);

  test('gpt: trivial prompt produces parseable output', async () => {
    const check = await gpt.available();
    if (!check.ok) {
      process.stderr.write(`\ngpt live smoke: SKIPPED — ${check.reason}\n`);
      return;
    }
    const result = await gpt.run({ prompt: PROMPT, workdir, timeoutMs: 120_000 });
    if (result.error) {
      throw new Error(`gpt errored: ${result.error.code} — ${result.error.reason}`);
    }
    expect(result.output.toLowerCase()).toContain('ok');
    expect(result.tokens.input).toBeGreaterThan(0);
    expect(result.tokens.output).toBeGreaterThan(0);
    expect(result.durationMs).toBeGreaterThan(0);
    expect(typeof result.modelUsed).toBe('string');
    const cost = gpt.estimateCost(result.tokens, result.modelUsed);
    expect(cost).toBeGreaterThan(0);
  }, 150_000);

  test('gemini: trivial prompt produces parseable output', async () => {
    const check = await gemini.available();
    if (!check.ok) {
      process.stderr.write(`\ngemini live smoke: SKIPPED — ${check.reason}\n`);
      return;
    }
    const result = await gemini.run({ prompt: PROMPT, workdir, timeoutMs: 120_000 });
    if (result.error) {
      throw new Error(`gemini errored: ${result.error.code} — ${result.error.reason}`);
    }
    expect(result.output.toLowerCase()).toContain('ok');
    // Gemini CLI sometimes returns 0 tokens in the result event (older responses);
    // assert non-negative instead of strictly positive.
    expect(result.tokens.input).toBeGreaterThanOrEqual(0);
    expect(result.tokens.output).toBeGreaterThanOrEqual(0);
    expect(result.durationMs).toBeGreaterThan(0);
    expect(typeof result.modelUsed).toBe('string');
  }, 150_000);

  test('timeout error surfaces as error.code=timeout (no exception)', async () => {
    // Use whatever adapter is available first — all three should share timeout semantics.
    const adapter = (await claude.available()).ok ? claude
      : (await gpt.available()).ok ? gpt
      : (await gemini.available()).ok ? gemini
      : null;
    if (!adapter) {
      process.stderr.write('\ntimeout smoke: SKIPPED — no provider available\n');
      return;
    }
    // 100ms timeout is far too short for any real CLI startup → must timeout.
    const result = await adapter.run({ prompt: PROMPT, workdir, timeoutMs: 100 });
    expect(result.error).toBeDefined();
    // Timeout, binary_missing, or unknown (if CLI dies differently) — all acceptable
    // non-crash outcomes. The point is the adapter returns a RunResult, not throws.
    expect(['timeout', 'unknown', 'binary_missing']).toContain(result.error!.code);
    expect(result.durationMs).toBeGreaterThan(0);
  }, 30_000);

  test('runBenchmark: Promise.allSettled means one unavailable provider does not block others', async () => {
    // Use the full runner with all three providers — whichever are unauthed should
    // return entries with available=false and not crash the batch.
    const report = await runBenchmark({
      prompt: PROMPT,
      workdir,
      providers: ['claude', 'gpt', 'gemini'],
      timeoutMs: 120_000,
      skipUnavailable: false,
    });
    expect(report.entries).toHaveLength(3);
    for (const e of report.entries) {
      expect(['claude', 'gpt', 'gemini']).toContain(e.family);
      if (e.available) {
        expect(e.result).toBeDefined();
      } else {
        expect(typeof e.unavailable_reason).toBe('string');
      }
    }
    // At least one available provider should have produced a non-error result in a healthy CI env.
    const hadSuccess = report.entries.some(e => e.available && e.result && !e.result.error);
    // We don't hard-assert this: if NO providers are authed, skip silently.
    if (!hadSuccess) {
      process.stderr.write('\nrunBenchmark live: no provider produced a clean result (no auth?)\n');
    }
  }, 300_000);
});
|
||||
@@ -0,0 +1,370 @@
|
||||
/**
|
||||
* Taste engine — end-to-end tests for `gstack-taste-update`.
|
||||
*
|
||||
* Covers the v1 taste profile contract: schema shape, Laplace-smoothed confidence,
|
||||
* 5%/week decay, dimension extraction from reason strings, session cap, schema
|
||||
* migration, conflict detection (taste drift), malformed-input recovery.
|
||||
*
|
||||
* All tests use GSTACK_STATE_DIR pointing at a temp dir so no real home dir is
|
||||
* touched. Each test isolates its own state directory.
|
||||
*/
|
||||
|
||||
import { describe, test, expect, beforeEach, afterEach } from 'bun:test';
|
||||
import { spawnSync } from 'child_process';
|
||||
import * as fs from 'fs';
|
||||
import * as path from 'path';
|
||||
import * as os from 'os';
|
||||
|
||||
const ROOT = path.resolve(import.meta.dir, '..');
|
||||
const BIN = path.join(ROOT, 'bin', 'gstack-taste-update');
|
||||
|
||||
// One learned preference inside a single taste dimension.
interface Preference {
  value: string; // the extracted preference token (e.g. 'Geist Sans')
  // Laplace-smoothed: approved / (approved + rejected + 1) — see assertions below.
  confidence: number;
  approved_count: number;
  rejected_count: number;
  last_seen: string; // timestamp of the most recent session touching this value
}

// v1 on-disk profile written by gstack-taste-update.
interface TasteProfile {
  version: number; // schema version; these tests pin v1
  updated_at: string;
  // The four fixed taste dimensions, each with approved/rejected preference lists.
  dimensions: Record<'fonts' | 'colors' | 'layouts' | 'aesthetics', { approved: Preference[]; rejected: Preference[] }>;
  // Append-only session log of every approved/rejected invocation.
  sessions: Array<{ ts: string; action: 'approved' | 'rejected'; variant: string; reason?: string }>;
}
|
||||
|
||||
let stateDir: string;
|
||||
let workdir: string;
|
||||
|
||||
// Fresh state dir + fresh git workdir per test: the CLI derives a project slug
// from the git toplevel, and GSTACK_STATE_DIR/HOME are pointed at stateDir.
beforeEach(() => {
  stateDir = fs.mkdtempSync(path.join(os.tmpdir(), 'taste-state-'));
  workdir = fs.mkdtempSync(path.join(os.tmpdir(), 'taste-work-'));
  // Initialize a git repo so gstack-taste-update's getSlug() finds a toplevel
  spawnSync('git', ['init', '-b', 'main'], { cwd: workdir, stdio: 'pipe' });
});

// Remove both temp trees; force:true tolerates a test that already cleaned up.
afterEach(() => {
  fs.rmSync(stateDir, { recursive: true, force: true });
  fs.rmSync(workdir, { recursive: true, force: true });
});
|
||||
|
||||
function run(args: string[]): { status: number | null; stdout: string; stderr: string } {
|
||||
const result = spawnSync('bun', ['run', BIN, ...args], {
|
||||
cwd: workdir,
|
||||
env: { ...process.env, GSTACK_STATE_DIR: stateDir, HOME: stateDir },
|
||||
encoding: 'utf-8',
|
||||
timeout: 10000,
|
||||
});
|
||||
return {
|
||||
status: result.status,
|
||||
stdout: result.stdout?.toString() ?? '',
|
||||
stderr: result.stderr?.toString() ?? '',
|
||||
};
|
||||
}
|
||||
|
||||
function profilePath(): string {
|
||||
const slug = path.basename(workdir);
|
||||
return path.join(stateDir, 'projects', slug, 'taste-profile.json');
|
||||
}
|
||||
|
||||
function readProfile(): TasteProfile {
|
||||
return JSON.parse(fs.readFileSync(profilePath(), 'utf-8'));
|
||||
}
|
||||
|
||||
function writeProfile(p: unknown): void {
|
||||
const pp = profilePath();
|
||||
fs.mkdirSync(path.dirname(pp), { recursive: true });
|
||||
fs.writeFileSync(pp, JSON.stringify(p, null, 2));
|
||||
}
|
||||
|
||||
// First invocation against an empty state dir: profile creation, counters,
// and the no-extractable-reason case.
describe('taste-engine: first-write lifecycle', () => {
  test('approved creates profile with correct v1 schema', () => {
    // Reason string carries two dimensions: "fonts: ..." and "colors: ...".
    const r = run(['approved', 'variant-A', '--reason', 'fonts: Geist Sans; colors: emerald']);
    expect(r.status).toBe(0);

    const p = readProfile();
    expect(p.version).toBe(1);
    expect(p.dimensions.fonts.approved).toHaveLength(1);
    expect(p.dimensions.fonts.approved[0].value).toBe('Geist Sans');
    expect(p.dimensions.fonts.approved[0].approved_count).toBe(1);
    expect(p.dimensions.fonts.approved[0].rejected_count).toBe(0);
    // Laplace: 1 / (1 + 0 + 1) = 0.5
    expect(p.dimensions.fonts.approved[0].confidence).toBeCloseTo(0.5, 5);
    expect(p.dimensions.colors.approved[0].value).toBe('emerald');
    // Session log records the action and variant verbatim.
    expect(p.sessions).toHaveLength(1);
    expect(p.sessions[0].action).toBe('approved');
    expect(p.sessions[0].variant).toBe('variant-A');
  });

  test('rejected bumps rejected_count not approved_count', () => {
    run(['rejected', 'variant-B', '--reason', 'fonts: Comic Sans']);
    const p = readProfile();
    expect(p.dimensions.fonts.rejected).toHaveLength(1);
    expect(p.dimensions.fonts.rejected[0].rejected_count).toBe(1);
    expect(p.dimensions.fonts.rejected[0].approved_count).toBe(0);
    // A rejection must not leak into the approved list.
    expect(p.dimensions.fonts.approved).toHaveLength(0);
  });

  test('session recorded even when no dimensions extractable from reason', () => {
    const r = run(['approved', 'variant-C']); // no --reason
    expect(r.status).toBe(0);
    const p = readProfile();
    // The session is still logged even though nothing was extracted.
    expect(p.sessions).toHaveLength(1);
    for (const dim of ['fonts', 'colors', 'layouts', 'aesthetics'] as const) {
      expect(p.dimensions[dim].approved).toHaveLength(0);
      expect(p.dimensions[dim].rejected).toHaveLength(0);
    }
  });
});
|
||||
|
||||
describe('taste-engine: Laplace-smoothed confidence', () => {
|
||||
test('repeated approvals raise confidence toward 1', () => {
|
||||
for (let i = 0; i < 5; i++) {
|
||||
run(['approved', `variant-${i}`, '--reason', 'fonts: Geist Sans']);
|
||||
}
|
||||
const p = readProfile();
|
||||
const pref = p.dimensions.fonts.approved[0];
|
||||
expect(pref.approved_count).toBe(5);
|
||||
// Laplace: 5 / (5 + 0 + 1) = 0.833
|
||||
expect(pref.confidence).toBeCloseTo(5 / 6, 5);
|
||||
});
|
||||
|
||||
test('mixed approvals + rejections balance out', () => {
|
||||
run(['approved', 'v1', '--reason', 'fonts: Inter']);
|
||||
run(['approved', 'v2', '--reason', 'fonts: Inter']);
|
||||
run(['rejected', 'v3', '--reason', 'fonts: Inter']);
|
||||
const p = readProfile();
|
||||
const approved = p.dimensions.fonts.approved[0];
|
||||
const rejected = p.dimensions.fonts.rejected[0];
|
||||
expect(approved.approved_count).toBe(2);
|
||||
expect(approved.rejected_count).toBe(0);
|
||||
expect(rejected.rejected_count).toBe(1);
|
||||
expect(rejected.approved_count).toBe(0);
|
||||
});
|
||||
});
|
||||
|
||||
describe('taste-engine: decay math', () => {
|
||||
test('show applies 5%/week decay to stored confidence', () => {
|
||||
// Seed with a profile where the single approved font was last_seen 4 weeks ago
|
||||
const fourWeeksAgo = new Date(Date.now() - 4 * 7 * 24 * 60 * 60 * 1000).toISOString();
|
||||
writeProfile({
|
||||
version: 1,
|
||||
updated_at: new Date().toISOString(),
|
||||
dimensions: {
|
||||
fonts: {
|
||||
approved: [{ value: 'Aged Font', confidence: 0.8, approved_count: 4, rejected_count: 0, last_seen: fourWeeksAgo }],
|
||||
rejected: [],
|
||||
},
|
||||
colors: { approved: [], rejected: [] },
|
||||
layouts: { approved: [], rejected: [] },
|
||||
aesthetics: { approved: [], rejected: [] },
|
||||
},
|
||||
sessions: [],
|
||||
});
|
||||
const r = run(['show']);
|
||||
expect(r.status).toBe(0);
|
||||
// After 4 weeks: 0.8 * (0.95)^4 ≈ 0.651
|
||||
const expectedConf = 0.8 * Math.pow(0.95, 4);
|
||||
const match = r.stdout.match(/Aged Font — conf (\d+\.\d+)/);
|
||||
expect(match).toBeTruthy();
|
||||
const displayedConf = parseFloat(match![1]);
|
||||
expect(displayedConf).toBeCloseTo(expectedConf, 2);
|
||||
});
|
||||
|
||||
test('decay never goes below zero', () => {
|
||||
// 3 years ≈ 156 weeks. 0.95^156 ≈ 0.00036, well below 0.01.
|
||||
const yearsAgo = new Date(Date.now() - 3 * 365 * 24 * 60 * 60 * 1000).toISOString();
|
||||
writeProfile({
|
||||
version: 1,
|
||||
updated_at: new Date().toISOString(),
|
||||
dimensions: {
|
||||
fonts: {
|
||||
approved: [{ value: 'Ancient', confidence: 1.0, approved_count: 1, rejected_count: 0, last_seen: yearsAgo }],
|
||||
rejected: [],
|
||||
},
|
||||
colors: { approved: [], rejected: [] },
|
||||
layouts: { approved: [], rejected: [] },
|
||||
aesthetics: { approved: [], rejected: [] },
|
||||
},
|
||||
sessions: [],
|
||||
});
|
||||
const r = run(['show']);
|
||||
expect(r.status).toBe(0);
|
||||
const match = r.stdout.match(/Ancient — conf (\d+\.\d+)/);
|
||||
expect(match).toBeTruthy();
|
||||
const conf = parseFloat(match![1]);
|
||||
expect(conf).toBeGreaterThanOrEqual(0);
|
||||
expect(conf).toBeLessThan(0.01);
|
||||
});
|
||||
});
|
||||
|
||||
describe('taste-engine: dimension extraction', () => {
|
||||
test('parses multiple dimensions from one reason string', () => {
|
||||
run(['approved', 'v1', '--reason', 'fonts: Geist, IBM Plex; colors: emerald; layouts: grid-12; aesthetics: brutalist']);
|
||||
const p = readProfile();
|
||||
expect(p.dimensions.fonts.approved.map(x => x.value).sort()).toEqual(['Geist', 'IBM Plex']);
|
||||
expect(p.dimensions.colors.approved[0].value).toBe('emerald');
|
||||
expect(p.dimensions.layouts.approved[0].value).toBe('grid-12');
|
||||
expect(p.dimensions.aesthetics.approved[0].value).toBe('brutalist');
|
||||
});
|
||||
|
||||
test('value matching is case-insensitive', () => {
|
||||
run(['approved', 'v1', '--reason', 'fonts: Geist']);
|
||||
run(['approved', 'v2', '--reason', 'fonts: GEIST']);
|
||||
const p = readProfile();
|
||||
// Should merge into a single entry
|
||||
expect(p.dimensions.fonts.approved).toHaveLength(1);
|
||||
expect(p.dimensions.fonts.approved[0].approved_count).toBe(2);
|
||||
});
|
||||
|
||||
test('unknown dimension labels are silently ignored', () => {
|
||||
run(['approved', 'v1', '--reason', 'weather: sunny; mood: happy']);
|
||||
const p = readProfile();
|
||||
// Session still recorded
|
||||
expect(p.sessions).toHaveLength(1);
|
||||
// No dimensions populated
|
||||
for (const dim of ['fonts', 'colors', 'layouts', 'aesthetics'] as const) {
|
||||
expect(p.dimensions[dim].approved).toHaveLength(0);
|
||||
}
|
||||
});
|
||||
});
|
||||
|
||||
describe('taste-engine: session cap', () => {
|
||||
test('sessions truncate to last 50 entries (FIFO)', () => {
|
||||
for (let i = 0; i < 55; i++) {
|
||||
run(['approved', `v${i}`, '--reason', 'fonts: Geist']);
|
||||
}
|
||||
const p = readProfile();
|
||||
expect(p.sessions).toHaveLength(50);
|
||||
// Last 5 should be preserved, first 5 dropped
|
||||
expect(p.sessions[0].variant).toBe('v5');
|
||||
expect(p.sessions[49].variant).toBe('v54');
|
||||
});
|
||||
});
|
||||
|
||||
describe('taste-engine: taste drift conflict detection', () => {
|
||||
test('warns when approved value has strong opposite signal', () => {
|
||||
// Seed a strong rejected entry: 4 rejections, no approvals → Laplace = 0/5 but that's
|
||||
// not > 0.6. Let's seed it directly with confidence 0.8.
|
||||
writeProfile({
|
||||
version: 1,
|
||||
updated_at: new Date().toISOString(),
|
||||
dimensions: {
|
||||
fonts: {
|
||||
approved: [],
|
||||
rejected: [{ value: 'Comic Sans', confidence: 0.8, approved_count: 0, rejected_count: 4, last_seen: new Date().toISOString() }],
|
||||
},
|
||||
colors: { approved: [], rejected: [] },
|
||||
layouts: { approved: [], rejected: [] },
|
||||
aesthetics: { approved: [], rejected: [] },
|
||||
},
|
||||
sessions: [],
|
||||
});
|
||||
const r = run(['approved', 'v1', '--reason', 'fonts: Comic Sans']);
|
||||
expect(r.status).toBe(0);
|
||||
// "taste drift" note should go to stderr
|
||||
expect(r.stderr).toContain('taste drift');
|
||||
expect(r.stderr).toContain('Comic Sans');
|
||||
});
|
||||
|
||||
test('does NOT warn when signal is weak', () => {
|
||||
writeProfile({
|
||||
version: 1,
|
||||
updated_at: new Date().toISOString(),
|
||||
dimensions: {
|
||||
fonts: {
|
||||
approved: [],
|
||||
// Single rejection (< 3) — shouldn't trigger drift warning
|
||||
rejected: [{ value: 'Inter', confidence: 0.5, approved_count: 0, rejected_count: 1, last_seen: new Date().toISOString() }],
|
||||
},
|
||||
colors: { approved: [], rejected: [] },
|
||||
layouts: { approved: [], rejected: [] },
|
||||
aesthetics: { approved: [], rejected: [] },
|
||||
},
|
||||
sessions: [],
|
||||
});
|
||||
const r = run(['approved', 'v1', '--reason', 'fonts: Inter']);
|
||||
expect(r.status).toBe(0);
|
||||
expect(r.stderr).not.toContain('taste drift');
|
||||
});
|
||||
});
|
||||
|
||||
describe('taste-engine: migration', () => {
|
||||
test('legacy profile without version gets migrated to v1', () => {
|
||||
// Simulate a legacy approved.json-style structure
|
||||
writeProfile({
|
||||
// no version field
|
||||
dimensions: {
|
||||
fonts: {
|
||||
approved: [{ value: 'Legacy', confidence: 0.7, approved_count: 3, rejected_count: 1, last_seen: new Date().toISOString() }],
|
||||
rejected: [],
|
||||
},
|
||||
},
|
||||
sessions: [
|
||||
{ ts: new Date().toISOString(), action: 'approved', variant: 'legacy-v1' },
|
||||
],
|
||||
});
|
||||
|
||||
const r = run(['migrate']);
|
||||
expect(r.status).toBe(0);
|
||||
|
||||
const p = readProfile();
|
||||
expect(p.version).toBe(1);
|
||||
expect(p.dimensions.fonts.approved[0].value).toBe('Legacy');
|
||||
expect(p.dimensions.colors).toBeDefined();
|
||||
expect(p.dimensions.layouts).toBeDefined();
|
||||
expect(p.dimensions.aesthetics).toBeDefined();
|
||||
expect(p.sessions).toHaveLength(1);
|
||||
expect(p.sessions[0].variant).toBe('legacy-v1');
|
||||
});
|
||||
|
||||
test('migration truncates oversized sessions array to last 50', () => {
|
||||
const sessions = Array.from({ length: 100 }, (_, i) => ({
|
||||
ts: new Date().toISOString(),
|
||||
action: 'approved' as const,
|
||||
variant: `legacy-${i}`,
|
||||
}));
|
||||
writeProfile({ dimensions: {}, sessions });
|
||||
const r = run(['migrate']);
|
||||
expect(r.status).toBe(0);
|
||||
const p = readProfile();
|
||||
expect(p.sessions).toHaveLength(50);
|
||||
expect(p.sessions[0].variant).toBe('legacy-50');
|
||||
expect(p.sessions[49].variant).toBe('legacy-99');
|
||||
});
|
||||
});
|
||||
|
||||
describe('taste-engine: resilience', () => {
|
||||
test('malformed JSON profile falls back to empty and does not crash', () => {
|
||||
const pp = profilePath();
|
||||
fs.mkdirSync(path.dirname(pp), { recursive: true });
|
||||
fs.writeFileSync(pp, '{ this is not json');
|
||||
const r = run(['approved', 'v1', '--reason', 'fonts: Geist']);
|
||||
// Should succeed (graceful fallback)
|
||||
expect(r.status).toBe(0);
|
||||
// Warning on stderr
|
||||
expect(r.stderr).toContain('WARN');
|
||||
// File should now be valid JSON
|
||||
const p = readProfile();
|
||||
expect(p.version).toBe(1);
|
||||
expect(p.dimensions.fonts.approved[0].value).toBe('Geist');
|
||||
});
|
||||
|
||||
test('show on nonexistent profile prints empty summary without error', () => {
|
||||
const r = run(['show']);
|
||||
expect(r.status).toBe(0);
|
||||
expect(r.stdout).toContain('taste-profile.json');
|
||||
});
|
||||
|
||||
test('approved without variant arg exits non-zero with usage hint', () => {
|
||||
const r = run(['approved']);
|
||||
expect(r.status).not.toBe(0);
|
||||
expect(r.stderr).toContain('Usage');
|
||||
});
|
||||
|
||||
test('unknown command exits non-zero', () => {
|
||||
const r = run(['banana']);
|
||||
expect(r.status).not.toBe(0);
|
||||
expect(r.stderr).toContain('Usage');
|
||||
});
|
||||
});
|
||||
Reference in New Issue
Block a user