diff --git a/test/benchmark-cli.test.ts b/test/benchmark-cli.test.ts
new file mode 100644
index 00000000..3b798abf
--- /dev/null
+++ b/test/benchmark-cli.test.ts
@@ -0,0 +1,134 @@
+/**
+ * gstack-model-benchmark CLI tests (offline).
+ *
+ * Covers CLI wiring that unit tests against benchmark-runner.ts can't see:
+ * - --dry-run auth/provider-list resolution
+ * - unknown provider WARN path
+ * - provider default (claude) when --models omitted
+ * - prompt resolution (inline --prompt vs positional file path)
+ * - output format flag wiring via --dry-run (avoids real CLI invocation)
+ *
+ * All tests use --dry-run so no API calls happen.
+ */
+
+import { describe, test, expect } from 'bun:test';
+import { spawnSync } from 'child_process';
+import * as fs from 'fs';
+import * as path from 'path';
+import * as os from 'os';
+
+const ROOT = path.resolve(import.meta.dir, '..');
+const BIN = path.join(ROOT, 'bin', 'gstack-model-benchmark');
+
+function run(args: string[], opts: { env?: Record<string, string> } = {}): { status: number | null; stdout: string; stderr: string } {
+  const result = spawnSync('bun', ['run', BIN, ...args], {
+    cwd: ROOT,
+    env: { ...process.env, ...opts.env },
+    encoding: 'utf-8',
+    timeout: 15000,
+  });
+  return {
+    status: result.status,
+    stdout: result.stdout?.toString() ?? '',
+    stderr: result.stderr?.toString() ?? '',
+  };
+}
+
+describe('gstack-model-benchmark --dry-run', () => {
+  test('prints provider availability report and exits 0', () => {
+    const r = run(['--prompt', 'hi', '--models', 'claude,gpt,gemini', '--dry-run']);
+    expect(r.status).toBe(0);
+    expect(r.stdout).toContain('gstack-model-benchmark --dry-run');
+    expect(r.stdout).toContain('claude');
+    expect(r.stdout).toContain('gpt');
+    expect(r.stdout).toContain('gemini');
+    expect(r.stdout).toContain('no prompts sent');
+  });
+
+  test('reports default provider when --models omitted', () => {
+    const r = run(['--prompt', 'hi', '--dry-run']);
+    expect(r.status).toBe(0);
+    expect(r.stdout).toContain('providers: claude');
+  });
+
+  test('unknown provider in --models emits WARN and is dropped', () => {
+    const r = run(['--prompt', 'hi', '--models', 'claude,gpt-42-fake', '--dry-run']);
+    expect(r.status).toBe(0);
+    expect(r.stderr).toContain('unknown provider');
+    expect(r.stderr).toContain('gpt-42-fake');
+    expect(r.stdout).toContain('providers: claude');
+    expect(r.stdout).not.toContain('gpt-42-fake');
+  });
+
+  test('empty --models list falls back to claude default', () => {
+    const r = run(['--prompt', 'hi', '--models', '', '--dry-run']);
+    expect(r.status).toBe(0);
+    expect(r.stdout).toContain('providers: claude');
+  });
+
+  test('--timeout-ms and --workdir flags flow through to dry-run report', () => {
+    const r = run(['--prompt', 'hi', '--timeout-ms', '9999', '--workdir', '/tmp', '--dry-run']);
+    expect(r.status).toBe(0);
+    expect(r.stdout).toContain('timeout_ms: 9999');
+    expect(r.stdout).toContain('workdir: /tmp');
+  });
+
+  test('--judge flag reported in dry-run output', () => {
+    const r = run(['--prompt', 'hi', '--judge', '--dry-run']);
+    expect(r.status).toBe(0);
+    expect(r.stdout).toContain('judge: on');
+  });
+
+  test('--output flag reported in dry-run', () => {
+    const r = run(['--prompt', 'hi', '--output', 'json', '--dry-run']);
+    expect(r.status).toBe(0);
+    expect(r.stdout).toContain('output: json');
+  });
+
+  test('each adapter reports either OK or NOT READY, never crashes', () => {
+    const r = run(['--prompt', 'hi', '--models', 'claude,gpt,gemini', '--dry-run']);
+    expect(r.status).toBe(0);
+    // Each
provider line must end in OK or NOT READY + const lines = r.stdout.split('\n'); + const adapterLines = lines.filter(l => /^\s+(claude|gpt|gemini):/.test(l)); + expect(adapterLines.length).toBe(3); + for (const line of adapterLines) { + expect(line).toMatch(/(OK|NOT READY)/); + } + }); + + test('long prompt is truncated in dry-run display', () => { + const longPrompt = 'x'.repeat(200); + const r = run(['--prompt', longPrompt, '--dry-run']); + expect(r.status).toBe(0); + // Summary truncates to 80 chars + ellipsis + expect(r.stdout).toMatch(/prompt:\s+x{80}…/); + }); +}); + +describe('gstack-model-benchmark prompt resolution', () => { + test('positional file path is read and passed as prompt', () => { + const tmp = fs.mkdtempSync(path.join(os.tmpdir(), 'bench-prompt-')); + const promptFile = path.join(tmp, 'prompt.txt'); + fs.writeFileSync(promptFile, 'hello from file'); + try { + const r = run([promptFile, '--dry-run']); + expect(r.status).toBe(0); + expect(r.stdout).toContain('hello from file'); + } finally { + fs.rmSync(tmp, { recursive: true, force: true }); + } + }); + + test('positional non-file arg is treated as inline prompt', () => { + const r = run(['treat-me-as-inline', '--dry-run']); + expect(r.status).toBe(0); + expect(r.stdout).toContain('treat-me-as-inline'); + }); + + test('missing prompt exits non-zero', () => { + const r = run(['--dry-run']); + expect(r.status).not.toBe(0); + expect(r.stderr).toContain('specify a prompt'); + }); +}); diff --git a/test/helpers/touchfiles.ts b/test/helpers/touchfiles.ts index 34ead7d0..a8f37c23 100644 --- a/test/helpers/touchfiles.ts +++ b/test/helpers/touchfiles.ts @@ -171,6 +171,9 @@ export const E2E_TOUCHFILES: Record = { // Autoplan 'autoplan-core': ['autoplan/**', 'plan-ceo-review/**', 'plan-eng-review/**', 'plan-design-review/**'], + // Multi-provider benchmark adapters — live API smoke against real claude/codex/gemini CLIs + 'benchmark-providers-live': ['bin/gstack-model-benchmark', 'test/helpers/providers/**', 'test/helpers/benchmark-runner.ts', 'test/helpers/pricing.ts'], + // Skill routing — journey-stage tests (depend on ALL skill descriptions) 'journey-ideation': ['*/SKILL.md.tmpl', 'SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'], 'journey-plan-eng': ['*/SKILL.md.tmpl', 'SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'], @@ -316,6 +319,9 @@ export const E2E_TIERS: Record = { // Autoplan — periodic (not yet implemented) 'autoplan-core': 'periodic', + // Multi-provider benchmark — periodic (requires external CLIs + auth, paid) + 'benchmark-providers-live': 'periodic', + // Skill routing — periodic (LLM routing is non-deterministic) 'journey-ideation': 'periodic', 'journey-plan-eng': 'periodic', diff --git a/test/publish-dry-run.test.ts b/test/publish-dry-run.test.ts new file mode 100644 index 00000000..fe040f0b --- /dev/null +++ b/test/publish-dry-run.test.ts @@ -0,0 +1,281 @@ +/** + * gstack-publish end-to-end tests via --dry-run. + * + * Verifies manifest parsing, schema validation, marketplace auth checks, per-skill + * error isolation, and command building — all without touching real marketplaces. + * + * --dry-run does NOT run execSync on publish commands. Auth checks still run + * against real binaries; we use fake marketplaces whose `auth_check` commands + * are always-succeed (`true`) or always-fail (`false`) so the test is hermetic. 
+ */
+
+import { describe, test, expect, beforeEach, afterEach } from 'bun:test';
+import { spawnSync } from 'child_process';
+import * as fs from 'fs';
+import * as path from 'path';
+import * as os from 'os';
+
+const ROOT = path.resolve(import.meta.dir, '..');
+const BIN = path.join(ROOT, 'bin', 'gstack-publish');
+
+let sandbox: string;
+let binCopy: string;
+
+beforeEach(() => {
+  // gstack-publish reads skills.json relative to the binary's dir (import.meta.dir/..).
+  // To isolate each test's manifest, we create a sandbox repo that mirrors the real
+  // structure: copy the bin into sandbox/bin/, write a controlled skills.json at the root.
+  sandbox = fs.mkdtempSync(path.join(os.tmpdir(), 'publish-sandbox-'));
+  fs.mkdirSync(path.join(sandbox, 'bin'));
+  fs.mkdirSync(path.join(sandbox, 'test', 'helpers'), { recursive: true });
+  binCopy = path.join(sandbox, 'bin', 'gstack-publish');
+  fs.copyFileSync(BIN, binCopy);
+  fs.chmodSync(binCopy, 0o755);
+});
+
+afterEach(() => {
+  fs.rmSync(sandbox, { recursive: true, force: true });
+});
+
+function writeManifest(manifest: object): void {
+  fs.writeFileSync(path.join(sandbox, 'skills.json'), JSON.stringify(manifest, null, 2));
+}
+
+function writeSkillFile(relPath: string, content = '# Test Skill\n'): void {
+  const full = path.join(sandbox, relPath);
+  fs.mkdirSync(path.dirname(full), { recursive: true });
+  fs.writeFileSync(full, content);
+}
+
+function run(args: string[]): { status: number | null; stdout: string; stderr: string } {
+  const result = spawnSync('bun', ['run', binCopy, ...args], {
+    cwd: sandbox,
+    encoding: 'utf-8',
+    timeout: 15000,
+  });
+  return {
+    status: result.status,
+    stdout: result.stdout?.toString() ?? '',
+    stderr: result.stderr?.toString() ?? '',
+  };
+}
+
+const VALID_MARKETPLACES = {
+  fakestore_ok: {
+    cli: 'true', // binary that always succeeds
+    login_cmd: 'fakestore_ok login',
+    publish_cmd_template: 'echo publish {slug} {version}',
+    docs: 'https://fakestore.example',
+    auth_check: 'true', // always-authenticated
+  },
+  fakestore_noauth: {
+    cli: 'true',
+    login_cmd: 'fakestore_noauth login',
+    publish_cmd_template: 'echo publish {slug} {version}',
+    docs: 'https://fakestore.example',
+    auth_check: 'false', // always-fails auth
+  },
+  fakestore_missing: {
+    cli: 'nonexistent-binary-xyz',
+    login_cmd: 'fakestore_missing login',
+    publish_cmd_template: 'echo publish {slug} {version}',
+    docs: 'https://fakestore.example',
+    auth_check: 'nonexistent-binary-xyz whoami',
+  },
+};
+
+function validSkill(slug: string, sourceRel: string, marketplaces: string[] = ['fakestore_ok']) {
+  const m: Record<string, { slug: string; publish: boolean }> = {};
+  for (const name of marketplaces) m[name] = { slug, publish: true };
+  return {
+    slug,
+    source: sourceRel,
+    name: `Skill ${slug}`,
+    version: '1.0.0',
+    category: 'test',
+    description: 'A test skill',
+    marketplaces: m,
+    standalone: true,
+    compatible_hosts: ['claude-code'],
+  };
+}
+
+describe('gstack-publish: manifest loading', () => {
+  test('--list prints every skill and marketplace', () => {
+    writeSkillFile('skills/alpha/SKILL.md');
+    writeSkillFile('skills/beta/SKILL.md');
+    writeManifest({
+      version: '1.0.0',
+      description: 't',
+      skills: [validSkill('alpha', 'skills/alpha/SKILL.md'), validSkill('beta', 'skills/beta/SKILL.md')],
+      marketplaces: VALID_MARKETPLACES,
+    });
+    const r = run(['--list']);
+    expect(r.status).toBe(0);
+    expect(r.stdout).toContain('alpha');
+    expect(r.stdout).toContain('beta');
+    expect(r.stdout).toContain('fakestore_ok');
+  });
+
+  test('missing manifest exits non-zero', () => {
+    // No manifest is ever written in this sandbox
+    const r = run(['--dry-run']);
+    expect(r.status).not.toBe(0);
+    expect(r.stderr).toContain('skills.json');
+  });
+
+  test('malformed JSON exits non-zero', () => {
+    fs.writeFileSync(path.join(sandbox, 'skills.json'), '{ not json');
+    const r = run(['--dry-run']);
+    expect(r.status).not.toBe(0);
+    expect(r.stderr).toContain('parse');
+  });
+});
+
+describe('gstack-publish: validation', () => {
+  test('missing source file reports validation error and exits 1', () => {
+    writeManifest({
+      version: '1.0.0',
+      description: 't',
+      skills: [validSkill('ghost', 'skills/ghost/DOES_NOT_EXIST.md')],
+      marketplaces: VALID_MARKETPLACES,
+    });
+    const r = run(['--dry-run']);
+    expect(r.status).not.toBe(0);
+    expect(r.stderr).toContain('source file missing');
+    expect(r.stderr).toContain('ghost');
+  });
+
+  test('missing slug reports validation error', () => {
+    writeSkillFile('skills/x/SKILL.md');
+    const s = validSkill('temp', 'skills/x/SKILL.md');
+    delete (s as Partial<typeof s>).slug;
+    writeManifest({
+      version: '1.0.0',
+      description: 't',
+      skills: [s],
+      marketplaces: VALID_MARKETPLACES,
+    });
+    const r = run(['--dry-run']);
+    expect(r.status).not.toBe(0);
+    expect(r.stderr).toContain('missing slug');
+  });
+
+  test('missing version reports validation error', () => {
+    writeSkillFile('skills/x/SKILL.md');
+    const s = validSkill('x', 'skills/x/SKILL.md');
+    delete (s as Partial<typeof s>).version;
+    writeManifest({
+      version: '1.0.0',
+      description: 't',
+      skills: [s],
+      marketplaces: VALID_MARKETPLACES,
+    });
+    const r = run(['--dry-run']);
+    expect(r.status).not.toBe(0);
+    expect(r.stderr).toContain('missing version');
+  });
+
+  test('no marketplaces configured reports validation error', () => {
+    writeSkillFile('skills/x/SKILL.md');
+    const s = { ...validSkill('x', 'skills/x/SKILL.md'), marketplaces: {} };
+    writeManifest({ version: '1.0.0', description: 't', skills: [s], marketplaces: VALID_MARKETPLACES });
+    const r = run(['--dry-run']);
+    expect(r.status).not.toBe(0);
+    expect(r.stderr).toContain('no marketplaces configured');
+  });
+});
+
+describe('gstack-publish: dry-run execution', () => {
+  test('happy path reports DRY-RUN tag and templated command', () => {
+    writeSkillFile('skills/alpha/SKILL.md');
+    writeManifest({
+      version: '1.0.0',
+      description: 't',
+      skills: [validSkill('alpha', 'skills/alpha/SKILL.md')],
+      marketplaces: VALID_MARKETPLACES,
+    });
+    const r = run(['--dry-run']);
+    expect(r.status).toBe(0);
+    expect(r.stdout).toContain('DRY-RUN');
+    expect(r.stdout).toContain('alpha');
+    expect(r.stdout).toContain('Published: 1');
+    expect(r.stdout).toContain('Failed: 0');
+  });
+
+  test('per-skill filter publishes only the requested slug', () => {
+    writeSkillFile('skills/alpha/SKILL.md');
+    writeSkillFile('skills/beta/SKILL.md');
+    writeManifest({
+      version: '1.0.0',
+      description: 't',
+      skills: [validSkill('alpha', 'skills/alpha/SKILL.md'), validSkill('beta', 'skills/beta/SKILL.md')],
+      marketplaces: VALID_MARKETPLACES,
+    });
+    const r = run(['alpha', '--dry-run']);
+    expect(r.status).toBe(0);
+    expect(r.stdout).toContain('Publishing alpha');
+    expect(r.stdout).not.toContain('Publishing beta');
+    expect(r.stdout).toContain('Published: 1');
+  });
+
+  test('unknown skill filter exits non-zero', () => {
+    writeSkillFile('skills/alpha/SKILL.md');
+    writeManifest({
+      version: '1.0.0',
+      description: 't',
+      skills: [validSkill('alpha', 'skills/alpha/SKILL.md')],
+      marketplaces: VALID_MARKETPLACES,
+    });
+    const r = run(['nonexistent', '--dry-run']);
expect(r.status).not.toBe(0); + expect(r.stderr).toContain('skill not found'); + }); +}); + +describe('gstack-publish: auth check isolation', () => { + test('failing auth for one marketplace does NOT abort the batch in dry-run', () => { + writeSkillFile('skills/alpha/SKILL.md'); + writeManifest({ + version: '1.0.0', + description: 't', + skills: [validSkill('alpha', 'skills/alpha/SKILL.md', ['fakestore_ok', 'fakestore_noauth'])], + marketplaces: VALID_MARKETPLACES, + }); + const r = run(['--dry-run']); + // In dry-run, auth failures are reported but don't block dispatch + expect(r.status).toBe(0); + expect(r.stdout).toContain('fakestore_ok: OK'); + expect(r.stdout).toContain('fakestore_noauth: NOT READY'); + }); + + test('missing binary reported as not-ready with docs link', () => { + writeSkillFile('skills/alpha/SKILL.md'); + writeManifest({ + version: '1.0.0', + description: 't', + skills: [validSkill('alpha', 'skills/alpha/SKILL.md', ['fakestore_missing'])], + marketplaces: VALID_MARKETPLACES, + }); + const r = run(['--dry-run']); + expect(r.stdout).toContain('fakestore_missing: NOT READY'); + expect(r.stdout).toContain('not on PATH'); + }); +}); + +describe('gstack-publish: real manifest sanity', () => { + test('the real repo skills.json passes --dry-run validation', () => { + // This uses the actual bin against the actual manifest (ROOT/skills.json). + // If auth to any real marketplace isn't set up it just reports NOT READY; + // --dry-run still exits 0 because it doesn't require auth to pass. + const real = spawnSync('bun', ['run', path.join(ROOT, 'bin', 'gstack-publish'), '--dry-run'], { + cwd: ROOT, + encoding: 'utf-8', + timeout: 20000, + }); + expect(real.status).toBe(0); + expect(real.stdout).toContain('Validating manifest'); + // Every skill in the real manifest should pass validation + expect(real.stderr).not.toContain('Manifest validation failed'); + }); +}); diff --git a/test/skill-e2e-benchmark-providers.test.ts b/test/skill-e2e-benchmark-providers.test.ts new file mode 100644 index 00000000..a9368a6b --- /dev/null +++ b/test/skill-e2e-benchmark-providers.test.ts @@ -0,0 +1,175 @@ +/** + * Multi-provider benchmark adapter E2E — hit real claude, codex, gemini CLIs. + * + * Periodic tier: runs under `bun run test:e2e` with EVALS=1. Each provider gated + * on its own `available()` check so missing auth skips that provider (doesn't + * abort the batch). Uses the simplest possible prompt ("Reply with exactly: ok") + * to keep cost near $0.001/provider/run. 
+ * + * What this catches that unit tests don't: + * - CLI output-format drift (the #1 silent breakage path) + * - Token parsing from real provider responses + * - Auth-failure vs timeout vs rate-limit error code routing + * - Cost estimation on real token counts + * - Parallel execution via Promise.allSettled — slow provider doesn't block fast + * + * NOT covered here (would need dedicated test files): + * - Quality judge integration (benchmark-judge.ts, adds ~$0.05/run) + * - Multi-turn tool-using prompts — our single-turn smoke skips `toolCalls > 0` + */ + +import { describe, test, expect } from 'bun:test'; +import { ClaudeAdapter } from './helpers/providers/claude'; +import { GptAdapter } from './helpers/providers/gpt'; +import { GeminiAdapter } from './helpers/providers/gemini'; +import { runBenchmark } from './helpers/benchmark-runner'; +import * as fs from 'fs'; +import * as path from 'path'; +import * as os from 'os'; + +// --- Prerequisites / gating --- + +const evalsEnabled = !!process.env.EVALS; +const describeIfEvals = evalsEnabled ? describe : describe.skip; + +const PROMPT = 'Reply with exactly this text and nothing else: ok'; + +// Per-provider gate — each test checks its own availability and skips cleanly. +// We construct adapters outside `test` so Bun's test reporter shows the skip reason. +const claude = new ClaudeAdapter(); +const gpt = new GptAdapter(); +const gemini = new GeminiAdapter(); + +// Use a temp working directory so provider CLIs can't accidentally touch the repo. +const workdir = fs.mkdtempSync(path.join(os.tmpdir(), 'bench-e2e-')); + +describeIfEvals('multi-provider benchmark adapters (live)', () => { + test('claude: available() returns structured ok/reason', async () => { + const check = await claude.available(); + expect(check).toHaveProperty('ok'); + if (!check.ok) { + expect(typeof check.reason).toBe('string'); + expect(check.reason!.length).toBeGreaterThan(0); + } + }); + + test('gpt: available() returns structured ok/reason', async () => { + const check = await gpt.available(); + expect(check).toHaveProperty('ok'); + if (!check.ok) { + expect(typeof check.reason).toBe('string'); + } + }); + + test('gemini: available() returns structured ok/reason', async () => { + const check = await gemini.available(); + expect(check).toHaveProperty('ok'); + if (!check.ok) { + expect(typeof check.reason).toBe('string'); + } + }); + + test('claude: trivial prompt produces parseable output', async () => { + const check = await claude.available(); + if (!check.ok) { + process.stderr.write(`\nclaude live smoke: SKIPPED — ${check.reason}\n`); + return; + } + const result = await claude.run({ prompt: PROMPT, workdir, timeoutMs: 120_000 }); + if (result.error) { + throw new Error(`claude errored: ${result.error.code} — ${result.error.reason}`); + } + expect(result.output.toLowerCase()).toContain('ok'); + expect(result.tokens.input).toBeGreaterThan(0); + expect(result.tokens.output).toBeGreaterThan(0); + expect(result.durationMs).toBeGreaterThan(0); + expect(typeof result.modelUsed).toBe('string'); + expect(result.modelUsed.length).toBeGreaterThan(0); + const cost = claude.estimateCost(result.tokens, result.modelUsed); + expect(cost).toBeGreaterThan(0); + }, 150_000); + + test('gpt: trivial prompt produces parseable output', async () => { + const check = await gpt.available(); + if (!check.ok) { + process.stderr.write(`\ngpt live smoke: SKIPPED — ${check.reason}\n`); + return; + } + const result = await gpt.run({ prompt: PROMPT, workdir, timeoutMs: 120_000 }); + if 
(result.error) { + throw new Error(`gpt errored: ${result.error.code} — ${result.error.reason}`); + } + expect(result.output.toLowerCase()).toContain('ok'); + expect(result.tokens.input).toBeGreaterThan(0); + expect(result.tokens.output).toBeGreaterThan(0); + expect(result.durationMs).toBeGreaterThan(0); + expect(typeof result.modelUsed).toBe('string'); + const cost = gpt.estimateCost(result.tokens, result.modelUsed); + expect(cost).toBeGreaterThan(0); + }, 150_000); + + test('gemini: trivial prompt produces parseable output', async () => { + const check = await gemini.available(); + if (!check.ok) { + process.stderr.write(`\ngemini live smoke: SKIPPED — ${check.reason}\n`); + return; + } + const result = await gemini.run({ prompt: PROMPT, workdir, timeoutMs: 120_000 }); + if (result.error) { + throw new Error(`gemini errored: ${result.error.code} — ${result.error.reason}`); + } + expect(result.output.toLowerCase()).toContain('ok'); + // Gemini CLI sometimes returns 0 tokens in the result event (older responses); + // assert non-negative instead of strictly positive. + expect(result.tokens.input).toBeGreaterThanOrEqual(0); + expect(result.tokens.output).toBeGreaterThanOrEqual(0); + expect(result.durationMs).toBeGreaterThan(0); + expect(typeof result.modelUsed).toBe('string'); + }, 150_000); + + test('timeout error surfaces as error.code=timeout (no exception)', async () => { + // Use whatever adapter is available first — all three should share timeout semantics. + const adapter = (await claude.available()).ok ? claude + : (await gpt.available()).ok ? gpt + : (await gemini.available()).ok ? gemini + : null; + if (!adapter) { + process.stderr.write('\ntimeout smoke: SKIPPED — no provider available\n'); + return; + } + // 100ms timeout is far too short for any real CLI startup → must timeout. + const result = await adapter.run({ prompt: PROMPT, workdir, timeoutMs: 100 }); + expect(result.error).toBeDefined(); + // Timeout, binary_missing, or unknown (if CLI dies differently) — all acceptable + // non-crash outcomes. The point is the adapter returns a RunResult, not throws. + expect(['timeout', 'unknown', 'binary_missing']).toContain(result.error!.code); + expect(result.durationMs).toBeGreaterThan(0); + }, 30_000); + + test('runBenchmark: Promise.allSettled means one unavailable provider does not block others', async () => { + // Use the full runner with all three providers — whichever are unauthed should + // return entries with available=false and not crash the batch. + const report = await runBenchmark({ + prompt: PROMPT, + workdir, + providers: ['claude', 'gpt', 'gemini'], + timeoutMs: 120_000, + skipUnavailable: false, + }); + expect(report.entries).toHaveLength(3); + for (const e of report.entries) { + expect(['claude', 'gpt', 'gemini']).toContain(e.family); + if (e.available) { + expect(e.result).toBeDefined(); + } else { + expect(typeof e.unavailable_reason).toBe('string'); + } + } + // At least one available provider should have produced a non-error result in a healthy CI env. + const hadSuccess = report.entries.some(e => e.available && e.result && !e.result.error); + // We don't hard-assert this: if NO providers are authed, skip silently. 
+ if (!hadSuccess) { + process.stderr.write('\nrunBenchmark live: no provider produced a clean result (no auth?)\n'); + } + }, 300_000); +}); diff --git a/test/taste-engine.test.ts b/test/taste-engine.test.ts new file mode 100644 index 00000000..52a39f89 --- /dev/null +++ b/test/taste-engine.test.ts @@ -0,0 +1,370 @@ +/** + * Taste engine — end-to-end tests for `gstack-taste-update`. + * + * Covers the v1 taste profile contract: schema shape, Laplace-smoothed confidence, + * 5%/week decay, dimension extraction from reason strings, session cap, schema + * migration, conflict detection (taste drift), malformed-input recovery. + * + * All tests use GSTACK_STATE_DIR pointing at a temp dir so no real home dir is + * touched. Each test isolates its own state directory. + */ + +import { describe, test, expect, beforeEach, afterEach } from 'bun:test'; +import { spawnSync } from 'child_process'; +import * as fs from 'fs'; +import * as path from 'path'; +import * as os from 'os'; + +const ROOT = path.resolve(import.meta.dir, '..'); +const BIN = path.join(ROOT, 'bin', 'gstack-taste-update'); + +interface Preference { + value: string; + confidence: number; + approved_count: number; + rejected_count: number; + last_seen: string; +} + +interface TasteProfile { + version: number; + updated_at: string; + dimensions: Record<'fonts' | 'colors' | 'layouts' | 'aesthetics', { approved: Preference[]; rejected: Preference[] }>; + sessions: Array<{ ts: string; action: 'approved' | 'rejected'; variant: string; reason?: string }>; +} + +let stateDir: string; +let workdir: string; + +beforeEach(() => { + stateDir = fs.mkdtempSync(path.join(os.tmpdir(), 'taste-state-')); + workdir = fs.mkdtempSync(path.join(os.tmpdir(), 'taste-work-')); + // Initialize a git repo so gstack-taste-update's getSlug() finds a toplevel + spawnSync('git', ['init', '-b', 'main'], { cwd: workdir, stdio: 'pipe' }); +}); + +afterEach(() => { + fs.rmSync(stateDir, { recursive: true, force: true }); + fs.rmSync(workdir, { recursive: true, force: true }); +}); + +function run(args: string[]): { status: number | null; stdout: string; stderr: string } { + const result = spawnSync('bun', ['run', BIN, ...args], { + cwd: workdir, + env: { ...process.env, GSTACK_STATE_DIR: stateDir, HOME: stateDir }, + encoding: 'utf-8', + timeout: 10000, + }); + return { + status: result.status, + stdout: result.stdout?.toString() ?? '', + stderr: result.stderr?.toString() ?? 
'', + }; +} + +function profilePath(): string { + const slug = path.basename(workdir); + return path.join(stateDir, 'projects', slug, 'taste-profile.json'); +} + +function readProfile(): TasteProfile { + return JSON.parse(fs.readFileSync(profilePath(), 'utf-8')); +} + +function writeProfile(p: unknown): void { + const pp = profilePath(); + fs.mkdirSync(path.dirname(pp), { recursive: true }); + fs.writeFileSync(pp, JSON.stringify(p, null, 2)); +} + +describe('taste-engine: first-write lifecycle', () => { + test('approved creates profile with correct v1 schema', () => { + const r = run(['approved', 'variant-A', '--reason', 'fonts: Geist Sans; colors: emerald']); + expect(r.status).toBe(0); + + const p = readProfile(); + expect(p.version).toBe(1); + expect(p.dimensions.fonts.approved).toHaveLength(1); + expect(p.dimensions.fonts.approved[0].value).toBe('Geist Sans'); + expect(p.dimensions.fonts.approved[0].approved_count).toBe(1); + expect(p.dimensions.fonts.approved[0].rejected_count).toBe(0); + // Laplace: 1 / (1 + 0 + 1) = 0.5 + expect(p.dimensions.fonts.approved[0].confidence).toBeCloseTo(0.5, 5); + expect(p.dimensions.colors.approved[0].value).toBe('emerald'); + expect(p.sessions).toHaveLength(1); + expect(p.sessions[0].action).toBe('approved'); + expect(p.sessions[0].variant).toBe('variant-A'); + }); + + test('rejected bumps rejected_count not approved_count', () => { + run(['rejected', 'variant-B', '--reason', 'fonts: Comic Sans']); + const p = readProfile(); + expect(p.dimensions.fonts.rejected).toHaveLength(1); + expect(p.dimensions.fonts.rejected[0].rejected_count).toBe(1); + expect(p.dimensions.fonts.rejected[0].approved_count).toBe(0); + expect(p.dimensions.fonts.approved).toHaveLength(0); + }); + + test('session recorded even when no dimensions extractable from reason', () => { + const r = run(['approved', 'variant-C']); // no --reason + expect(r.status).toBe(0); + const p = readProfile(); + expect(p.sessions).toHaveLength(1); + for (const dim of ['fonts', 'colors', 'layouts', 'aesthetics'] as const) { + expect(p.dimensions[dim].approved).toHaveLength(0); + expect(p.dimensions[dim].rejected).toHaveLength(0); + } + }); +}); + +describe('taste-engine: Laplace-smoothed confidence', () => { + test('repeated approvals raise confidence toward 1', () => { + for (let i = 0; i < 5; i++) { + run(['approved', `variant-${i}`, '--reason', 'fonts: Geist Sans']); + } + const p = readProfile(); + const pref = p.dimensions.fonts.approved[0]; + expect(pref.approved_count).toBe(5); + // Laplace: 5 / (5 + 0 + 1) = 0.833 + expect(pref.confidence).toBeCloseTo(5 / 6, 5); + }); + + test('mixed approvals + rejections balance out', () => { + run(['approved', 'v1', '--reason', 'fonts: Inter']); + run(['approved', 'v2', '--reason', 'fonts: Inter']); + run(['rejected', 'v3', '--reason', 'fonts: Inter']); + const p = readProfile(); + const approved = p.dimensions.fonts.approved[0]; + const rejected = p.dimensions.fonts.rejected[0]; + expect(approved.approved_count).toBe(2); + expect(approved.rejected_count).toBe(0); + expect(rejected.rejected_count).toBe(1); + expect(rejected.approved_count).toBe(0); + }); +}); + +describe('taste-engine: decay math', () => { + test('show applies 5%/week decay to stored confidence', () => { + // Seed with a profile where the single approved font was last_seen 4 weeks ago + const fourWeeksAgo = new Date(Date.now() - 4 * 7 * 24 * 60 * 60 * 1000).toISOString(); + writeProfile({ + version: 1, + updated_at: new Date().toISOString(), + dimensions: { + fonts: { + approved: [{ value: 
'Aged Font', confidence: 0.8, approved_count: 4, rejected_count: 0, last_seen: fourWeeksAgo }],
+          rejected: [],
+        },
+        colors: { approved: [], rejected: [] },
+        layouts: { approved: [], rejected: [] },
+        aesthetics: { approved: [], rejected: [] },
+      },
+      sessions: [],
+    });
+    const r = run(['show']);
+    expect(r.status).toBe(0);
+    // After 4 weeks: 0.8 * (0.95)^4 ≈ 0.651
+    const expectedConf = 0.8 * Math.pow(0.95, 4);
+    const match = r.stdout.match(/Aged Font — conf (\d+\.\d+)/);
+    expect(match).toBeTruthy();
+    const displayedConf = parseFloat(match![1]);
+    expect(displayedConf).toBeCloseTo(expectedConf, 2);
+  });
+
+  test('decay never goes below zero', () => {
+    // 3 years ≈ 156 weeks. 0.95^156 ≈ 0.00036, well below 0.01.
+    const yearsAgo = new Date(Date.now() - 3 * 365 * 24 * 60 * 60 * 1000).toISOString();
+    writeProfile({
+      version: 1,
+      updated_at: new Date().toISOString(),
+      dimensions: {
+        fonts: {
+          approved: [{ value: 'Ancient', confidence: 1.0, approved_count: 1, rejected_count: 0, last_seen: yearsAgo }],
+          rejected: [],
+        },
+        colors: { approved: [], rejected: [] },
+        layouts: { approved: [], rejected: [] },
+        aesthetics: { approved: [], rejected: [] },
+      },
+      sessions: [],
+    });
+    const r = run(['show']);
+    expect(r.status).toBe(0);
+    const match = r.stdout.match(/Ancient — conf (\d+\.\d+)/);
+    expect(match).toBeTruthy();
+    const conf = parseFloat(match![1]);
+    expect(conf).toBeGreaterThanOrEqual(0);
+    expect(conf).toBeLessThan(0.01);
+  });
+});
+
+describe('taste-engine: dimension extraction', () => {
+  test('parses multiple dimensions from one reason string', () => {
+    run(['approved', 'v1', '--reason', 'fonts: Geist, IBM Plex; colors: emerald; layouts: grid-12; aesthetics: brutalist']);
+    const p = readProfile();
+    expect(p.dimensions.fonts.approved.map(x => x.value).sort()).toEqual(['Geist', 'IBM Plex']);
+    expect(p.dimensions.colors.approved[0].value).toBe('emerald');
+    expect(p.dimensions.layouts.approved[0].value).toBe('grid-12');
+    expect(p.dimensions.aesthetics.approved[0].value).toBe('brutalist');
+  });
+
+  test('value matching is case-insensitive', () => {
+    run(['approved', 'v1', '--reason', 'fonts: Geist']);
+    run(['approved', 'v2', '--reason', 'fonts: GEIST']);
+    const p = readProfile();
+    // Should merge into a single entry
+    expect(p.dimensions.fonts.approved).toHaveLength(1);
+    expect(p.dimensions.fonts.approved[0].approved_count).toBe(2);
+  });
+
+  test('unknown dimension labels are silently ignored', () => {
+    run(['approved', 'v1', '--reason', 'weather: sunny; mood: happy']);
+    const p = readProfile();
+    // Session still recorded
+    expect(p.sessions).toHaveLength(1);
+    // No dimensions populated
+    for (const dim of ['fonts', 'colors', 'layouts', 'aesthetics'] as const) {
+      expect(p.dimensions[dim].approved).toHaveLength(0);
+    }
+  });
+});
+
+describe('taste-engine: session cap', () => {
+  test('sessions truncate to last 50 entries (FIFO)', () => {
+    for (let i = 0; i < 55; i++) {
+      run(['approved', `v${i}`, '--reason', 'fonts: Geist']);
+    }
+    const p = readProfile();
+    expect(p.sessions).toHaveLength(50);
+    // First 5 dropped, last 50 preserved
+    expect(p.sessions[0].variant).toBe('v5');
+    expect(p.sessions[49].variant).toBe('v54');
+  });
+});
+
+describe('taste-engine: taste drift conflict detection', () => {
+  test('warns when approved value has strong opposite signal', () => {
+    // Seed a strong rejected signal directly (confidence 0.8: 4 rejections, 0 approvals,
+    // Laplace 4 / (4 + 0 + 1) = 0.8) so the drift check sees a clear conflict.
+ writeProfile({ + version: 1, + updated_at: new Date().toISOString(), + dimensions: { + fonts: { + approved: [], + rejected: [{ value: 'Comic Sans', confidence: 0.8, approved_count: 0, rejected_count: 4, last_seen: new Date().toISOString() }], + }, + colors: { approved: [], rejected: [] }, + layouts: { approved: [], rejected: [] }, + aesthetics: { approved: [], rejected: [] }, + }, + sessions: [], + }); + const r = run(['approved', 'v1', '--reason', 'fonts: Comic Sans']); + expect(r.status).toBe(0); + // "taste drift" note should go to stderr + expect(r.stderr).toContain('taste drift'); + expect(r.stderr).toContain('Comic Sans'); + }); + + test('does NOT warn when signal is weak', () => { + writeProfile({ + version: 1, + updated_at: new Date().toISOString(), + dimensions: { + fonts: { + approved: [], + // Single rejection (< 3) — shouldn't trigger drift warning + rejected: [{ value: 'Inter', confidence: 0.5, approved_count: 0, rejected_count: 1, last_seen: new Date().toISOString() }], + }, + colors: { approved: [], rejected: [] }, + layouts: { approved: [], rejected: [] }, + aesthetics: { approved: [], rejected: [] }, + }, + sessions: [], + }); + const r = run(['approved', 'v1', '--reason', 'fonts: Inter']); + expect(r.status).toBe(0); + expect(r.stderr).not.toContain('taste drift'); + }); +}); + +describe('taste-engine: migration', () => { + test('legacy profile without version gets migrated to v1', () => { + // Simulate a legacy approved.json-style structure + writeProfile({ + // no version field + dimensions: { + fonts: { + approved: [{ value: 'Legacy', confidence: 0.7, approved_count: 3, rejected_count: 1, last_seen: new Date().toISOString() }], + rejected: [], + }, + }, + sessions: [ + { ts: new Date().toISOString(), action: 'approved', variant: 'legacy-v1' }, + ], + }); + + const r = run(['migrate']); + expect(r.status).toBe(0); + + const p = readProfile(); + expect(p.version).toBe(1); + expect(p.dimensions.fonts.approved[0].value).toBe('Legacy'); + expect(p.dimensions.colors).toBeDefined(); + expect(p.dimensions.layouts).toBeDefined(); + expect(p.dimensions.aesthetics).toBeDefined(); + expect(p.sessions).toHaveLength(1); + expect(p.sessions[0].variant).toBe('legacy-v1'); + }); + + test('migration truncates oversized sessions array to last 50', () => { + const sessions = Array.from({ length: 100 }, (_, i) => ({ + ts: new Date().toISOString(), + action: 'approved' as const, + variant: `legacy-${i}`, + })); + writeProfile({ dimensions: {}, sessions }); + const r = run(['migrate']); + expect(r.status).toBe(0); + const p = readProfile(); + expect(p.sessions).toHaveLength(50); + expect(p.sessions[0].variant).toBe('legacy-50'); + expect(p.sessions[49].variant).toBe('legacy-99'); + }); +}); + +describe('taste-engine: resilience', () => { + test('malformed JSON profile falls back to empty and does not crash', () => { + const pp = profilePath(); + fs.mkdirSync(path.dirname(pp), { recursive: true }); + fs.writeFileSync(pp, '{ this is not json'); + const r = run(['approved', 'v1', '--reason', 'fonts: Geist']); + // Should succeed (graceful fallback) + expect(r.status).toBe(0); + // Warning on stderr + expect(r.stderr).toContain('WARN'); + // File should now be valid JSON + const p = readProfile(); + expect(p.version).toBe(1); + expect(p.dimensions.fonts.approved[0].value).toBe('Geist'); + }); + + test('show on nonexistent profile prints empty summary without error', () => { + const r = run(['show']); + expect(r.status).toBe(0); + expect(r.stdout).toContain('taste-profile.json'); + }); + + 
test('approved without variant arg exits non-zero with usage hint', () => { + const r = run(['approved']); + expect(r.status).not.toBe(0); + expect(r.stderr).toContain('Usage'); + }); + + test('unknown command exits non-zero', () => { + const r = run(['banana']); + expect(r.status).not.toBe(0); + expect(r.stderr).toContain('Usage'); + }); +});
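Note on the arithmetic the taste-engine tests above assert: a minimal TypeScript sketch of Laplace-smoothed confidence and the 5%/week decay, derived only from the test expectations. Function names and the whole-week rounding are illustrative assumptions, not the actual gstack-taste-update implementation.

// Illustrative only — mirrors the numbers asserted in taste-engine.test.ts.
function laplaceConfidence(approved: number, rejected: number): number {
  // 1 approval, 0 rejections → 1 / (1 + 0 + 1) = 0.5; 5 approvals → 5 / 6 ≈ 0.833
  return approved / (approved + rejected + 1);
}

function decayedConfidence(stored: number, lastSeenIso: string, now: number = Date.now()): number {
  // 5%/week multiplicative decay: 0.8 stored, last seen 4 weeks ago → 0.8 * 0.95^4 ≈ 0.651
  const weeks = Math.floor((now - new Date(lastSeenIso).getTime()) / (7 * 24 * 60 * 60 * 1000));
  return stored * Math.pow(0.95, Math.max(0, weeks));
}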