test: lite E2E coverage for benchmark, taste engine, publish

Fills real coverage gaps in v0.19.0.0 primitives. 44 new deterministic
tests (gate tier, ~3s) + 8 live-API tests (periodic tier).

New gate-tier test files (free, <3s total):
- test/taste-engine.test.ts — 24 tests against gstack-taste-update:
  schema shape, Laplace-smoothed confidence, 5%/week decay clamped at 0,
  multi-dimension extraction, case-insensitive matching, session cap,
  legacy profile migration with session truncation, taste-drift conflict
  warning, malformed-JSON recovery, missing-variant exit code.
- test/publish-dry-run.test.ts — 13 tests against gstack-publish --dry-run:
  manifest parsing, missing/malformed JSON, per-skill validation errors
  (missing source file / slug / version / marketplaces), slug filter,
  unknown-skill exit, per-marketplace auth isolation (fake marketplaces
  with always-pass / always-fail / missing-binary CLIs), and a sanity
  check against the real repo manifest.
- test/benchmark-cli.test.ts — 11 tests against gstack-model-benchmark
  --dry-run: provider default, unknown-provider WARN, empty list
  fallback, flag passthrough (timeout/workdir/judge/output), long-prompt
  truncation, prompt resolution (inline vs file vs positional), missing
  prompt exit.

New periodic-tier test file (paid, gated EVALS=1):
- test/skill-e2e-benchmark-providers.test.ts — 8 tests hitting real
  claude, codex, gemini CLIs with a trivial prompt (~$0.001/provider).
  Verifies output parsing, token accounting, cost estimation, timeout
  error.code semantics, Promise.allSettled parallel isolation.
  Per-provider availability gate — unauthed providers skip cleanly.

This suite already caught one real bug (codex adapter missing
--skip-git-repo-check, fixed in 5260987d).

Registered `benchmark-providers-live` in touchfiles.ts (periodic tier,
triggered by changes to bin/gstack-model-benchmark, providers/**,
benchmark-runner.ts, pricing.ts).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Garry Tan
2026-04-18 06:45:06 +08:00
parent 42715188c2
commit c875e0c3fc
5 changed files with 966 additions and 0 deletions
+134
View File
@@ -0,0 +1,134 @@
/**
* gstack-model-benchmark CLI tests (offline).
*
* Covers CLI wiring that unit tests against benchmark-runner.ts can't see:
* - --dry-run auth/provider-list resolution
* - unknown provider WARN path
* - provider default (claude) when --models omitted
* - prompt resolution (inline --prompt vs positional file path)
* - output format flag wiring via --dry-run (avoids real CLI invocation)
*
* All tests use --dry-run so no API calls happen.
*/
import { describe, test, expect } from 'bun:test';
import { spawnSync } from 'child_process';
import * as fs from 'fs';
import * as path from 'path';
import * as os from 'os';
const ROOT = path.resolve(import.meta.dir, '..');
const BIN = path.join(ROOT, 'bin', 'gstack-model-benchmark');

/**
 * Invoke the benchmark CLI under bun with the given argv and optional extra
 * environment entries, returning the exit status plus decoded stdout/stderr.
 * A 15s hard timeout keeps a wedged CLI from hanging the whole suite.
 */
function run(args: string[], opts: { env?: Record<string, string> } = {}): { status: number | null; stdout: string; stderr: string } {
  const proc = spawnSync('bun', ['run', BIN, ...args], {
    cwd: ROOT,
    env: { ...process.env, ...opts.env },
    encoding: 'utf-8',
    timeout: 15000,
  });
  // spawnSync can hand back undefined streams on spawn failure; normalize to ''.
  const stdout = proc.stdout?.toString() ?? '';
  const stderr = proc.stderr?.toString() ?? '';
  return { status: proc.status, stdout, stderr };
}
describe('gstack-model-benchmark --dry-run', () => {
  // Baseline: every requested provider shows up in the report and nothing is dispatched.
  test('prints provider availability report and exits 0', () => {
    const r = run(['--prompt', 'hi', '--models', 'claude,gpt,gemini', '--dry-run']);
    expect(r.status).toBe(0);
    expect(r.stdout).toContain('gstack-model-benchmark --dry-run');
    expect(r.stdout).toContain('claude');
    expect(r.stdout).toContain('gpt');
    expect(r.stdout).toContain('gemini');
    expect(r.stdout).toContain('no prompts sent');
  });

  // When --models is omitted, the CLI should fall back to the claude provider.
  test('reports default provider when --models omitted', () => {
    const r = run(['--prompt', 'hi', '--dry-run']);
    expect(r.status).toBe(0);
    expect(r.stdout).toContain('providers: claude');
  });

  // Unknown names must be WARNed on stderr and dropped — never crash the run.
  test('unknown provider in --models emits WARN and is dropped', () => {
    const r = run(['--prompt', 'hi', '--models', 'claude,gpt-42-fake', '--dry-run']);
    expect(r.status).toBe(0);
    expect(r.stderr).toContain('unknown provider');
    expect(r.stderr).toContain('gpt-42-fake');
    expect(r.stdout).toContain('providers: claude');
    expect(r.stdout).not.toContain('gpt-42-fake');
  });

  test('empty --models list falls back to claude default', () => {
    const r = run(['--prompt', 'hi', '--models', '', '--dry-run']);
    expect(r.status).toBe(0);
    expect(r.stdout).toContain('providers: claude');
  });

  // Flag passthrough: values must be echoed verbatim into the dry-run report.
  test('--timeout-ms and --workdir flags flow through to dry-run report', () => {
    const r = run(['--prompt', 'hi', '--timeout-ms', '9999', '--workdir', '/tmp', '--dry-run']);
    expect(r.status).toBe(0);
    expect(r.stdout).toContain('timeout_ms: 9999');
    expect(r.stdout).toContain('workdir: /tmp');
  });

  test('--judge flag reported in dry-run output', () => {
    const r = run(['--prompt', 'hi', '--judge', '--dry-run']);
    expect(r.status).toBe(0);
    expect(r.stdout).toContain('judge: on');
  });

  test('--output flag reported in dry-run', () => {
    const r = run(['--prompt', 'hi', '--output', 'json', '--dry-run']);
    expect(r.status).toBe(0);
    expect(r.stdout).toContain('output: json');
  });

  test('each adapter reports either OK or NOT READY, never crashes', () => {
    const r = run(['--prompt', 'hi', '--models', 'claude,gpt,gemini', '--dry-run']);
    expect(r.status).toBe(0);
    // Each provider line must report OK or NOT READY.
    // NOTE(review): the toMatch below only checks containment, not "ends with";
    // anchor the regex (e.g. /(OK|NOT READY)\s*$/) if strict line shape is intended.
    const lines = r.stdout.split('\n');
    const adapterLines = lines.filter(l => /^\s+(claude|gpt|gemini):/.test(l));
    expect(adapterLines.length).toBe(3);
    for (const line of adapterLines) {
      expect(line).toMatch(/(OK|NOT READY)/);
    }
  });

  test('long prompt is truncated in dry-run display', () => {
    const longPrompt = 'x'.repeat(200);
    const r = run(['--prompt', longPrompt, '--dry-run']);
    expect(r.status).toBe(0);
    // Summary truncates to 80 chars + ellipsis (the literal '…' character).
    expect(r.stdout).toMatch(/prompt:\s+x{80}…/);
  });
});
describe('gstack-model-benchmark prompt resolution', () => {
  // A positional arg that names an existing file is read; its contents become the prompt.
  test('positional file path is read and passed as prompt', () => {
    const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'bench-prompt-'));
    const file = path.join(tmpDir, 'prompt.txt');
    fs.writeFileSync(file, 'hello from file');
    try {
      const res = run([file, '--dry-run']);
      expect(res.status).toBe(0);
      expect(res.stdout).toContain('hello from file');
    } finally {
      fs.rmSync(tmpDir, { recursive: true, force: true });
    }
  });

  // A positional arg that is NOT a file falls back to being the prompt itself.
  test('positional non-file arg is treated as inline prompt', () => {
    const res = run(['treat-me-as-inline', '--dry-run']);
    expect(res.status).toBe(0);
    expect(res.stdout).toContain('treat-me-as-inline');
  });

  // No --prompt, no positional: the CLI must refuse to run.
  test('missing prompt exits non-zero', () => {
    const res = run(['--dry-run']);
    expect(res.status).not.toBe(0);
    expect(res.stderr).toContain('specify a prompt');
  });
});
+6
View File
@@ -171,6 +171,9 @@ export const E2E_TOUCHFILES: Record<string, string[]> = {
// Autoplan
'autoplan-core': ['autoplan/**', 'plan-ceo-review/**', 'plan-eng-review/**', 'plan-design-review/**'],
// Multi-provider benchmark adapters — live API smoke against real claude/codex/gemini CLIs
'benchmark-providers-live': ['bin/gstack-model-benchmark', 'test/helpers/providers/**', 'test/helpers/benchmark-runner.ts', 'test/helpers/pricing.ts'],
// Skill routing — journey-stage tests (depend on ALL skill descriptions)
'journey-ideation': ['*/SKILL.md.tmpl', 'SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'],
'journey-plan-eng': ['*/SKILL.md.tmpl', 'SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'],
@@ -316,6 +319,9 @@ export const E2E_TIERS: Record<string, 'gate' | 'periodic'> = {
// Autoplan — periodic (not yet implemented)
'autoplan-core': 'periodic',
// Multi-provider benchmark — periodic (requires external CLIs + auth, paid)
'benchmark-providers-live': 'periodic',
// Skill routing — periodic (LLM routing is non-deterministic)
'journey-ideation': 'periodic',
'journey-plan-eng': 'periodic',
+281
View File
@@ -0,0 +1,281 @@
/**
* gstack-publish end-to-end tests via --dry-run.
*
* Verifies manifest parsing, schema validation, marketplace auth checks, per-skill
* error isolation, and command building — all without touching real marketplaces.
*
* --dry-run does NOT run execSync on publish commands. Auth checks still run
* against real binaries; we use fake marketplaces whose `auth_check` commands
* are always-succeed (`true`) or always-fail (`false`) so the test is hermetic.
*/
import { describe, test, expect, beforeEach, afterEach } from 'bun:test';
import { spawnSync } from 'child_process';
import * as fs from 'fs';
import * as path from 'path';
import * as os from 'os';
const ROOT = path.resolve(import.meta.dir, '..');
const BIN = path.join(ROOT, 'bin', 'gstack-publish');

// Per-test sandbox repo root and the copied binary inside it.
let sandbox: string;
let binCopy: string;

beforeEach(() => {
  // gstack-publish reads skills.json relative to the binary's dir (import.meta.dir/..).
  // To isolate each test's manifest, we create a sandbox repo that mirrors the real
  // structure: copy the bin into sandbox/bin/, write a controlled skills.json at the root.
  sandbox = fs.mkdtempSync(path.join(os.tmpdir(), 'publish-sandbox-'));
  fs.mkdirSync(path.join(sandbox, 'bin'));
  fs.mkdirSync(path.join(sandbox, 'test', 'helpers'), { recursive: true });
  binCopy = path.join(sandbox, 'bin', 'gstack-publish');
  fs.copyFileSync(BIN, binCopy);
  // copyFileSync does not preserve the execute bit; restore it explicitly.
  fs.chmodSync(binCopy, 0o755);
});

afterEach(() => {
  // force:true so a test that already removed files can't make teardown throw.
  fs.rmSync(sandbox, { recursive: true, force: true });
});
/** Serialize `manifest` as pretty-printed JSON to the sandbox's skills.json. */
function writeManifest(manifest: object): void {
  const target = path.join(sandbox, 'skills.json');
  const body = JSON.stringify(manifest, null, 2);
  fs.writeFileSync(target, body);
}
/** Create a skill file (and any missing parent dirs) at `relPath` inside the sandbox. */
function writeSkillFile(relPath: string, content = '# Test Skill\n'): void {
  const target = path.join(sandbox, relPath);
  const parent = path.dirname(target);
  fs.mkdirSync(parent, { recursive: true });
  fs.writeFileSync(target, content);
}
/**
 * Run the sandboxed gstack-publish copy with the given argv, returning the
 * exit status plus decoded stdout/stderr. 15s timeout guards against hangs.
 */
function run(args: string[]): { status: number | null; stdout: string; stderr: string } {
  const proc = spawnSync('bun', ['run', binCopy, ...args], {
    cwd: sandbox,
    encoding: 'utf-8',
    timeout: 15000,
  });
  const stdout = proc.stdout?.toString() ?? '';
  const stderr = proc.stderr?.toString() ?? '';
  return { status: proc.status, stdout, stderr };
}
/**
 * Hermetic marketplace fixtures. `auth_check` uses the POSIX `true`/`false`
 * binaries so "authenticated" vs "unauthenticated" is deterministic on any
 * machine, and `fakestore_missing` points at a binary guaranteed absent from
 * PATH to exercise the missing-CLI reporting path.
 */
const VALID_MARKETPLACES = {
  fakestore_ok: {
    cli: 'true', // binary that always succeeds
    login_cmd: 'fakestore_ok login',
    publish_cmd_template: 'echo publish {slug} {version}',
    docs: 'https://fakestore.example',
    auth_check: 'true', // always-authenticated
  },
  fakestore_noauth: {
    cli: 'true',
    login_cmd: 'fakestore_noauth login',
    publish_cmd_template: 'echo publish {slug} {version}',
    docs: 'https://fakestore.example',
    auth_check: 'false', // always-fails auth
  },
  fakestore_missing: {
    cli: 'nonexistent-binary-xyz', // not on PATH — exercises missing-binary path
    login_cmd: 'fakestore_missing login',
    publish_cmd_template: 'echo publish {slug} {version}',
    docs: 'https://fakestore.example',
    auth_check: 'nonexistent-binary-xyz whoami',
  },
};
/**
 * Build a minimal, schema-valid manifest entry for `slug` whose source lives
 * at `sourceRel`, targeting each marketplace named in `marketplaces`
 * (default: the always-authenticated fake store).
 */
function validSkill(slug: string, sourceRel: string, marketplaces: string[] = ['fakestore_ok']) {
  const entries = marketplaces.map((name) => [name, { slug, publish: true }] as const);
  const m: Record<string, { slug: string; publish: boolean }> = Object.fromEntries(entries);
  return {
    slug,
    source: sourceRel,
    name: `Skill ${slug}`,
    version: '1.0.0',
    category: 'test',
    description: 'A test skill',
    marketplaces: m,
    standalone: true,
    compatible_hosts: ['claude-code'],
  };
}
describe('gstack-publish: manifest loading', () => {
  // Two valid skills, both publishable to the fake stores.
  test('--list prints every skill and marketplace', () => {
    for (const slug of ['alpha', 'beta']) {
      writeSkillFile(`skills/${slug}/SKILL.md`);
    }
    writeManifest({
      version: '1.0.0',
      description: 't',
      skills: [validSkill('alpha', 'skills/alpha/SKILL.md'), validSkill('beta', 'skills/beta/SKILL.md')],
      marketplaces: VALID_MARKETPLACES,
    });
    const res = run(['--list']);
    expect(res.status).toBe(0);
    for (const needle of ['alpha', 'beta', 'fakestore_ok']) {
      expect(res.stdout).toContain(needle);
    }
  });

  // The sandbox starts with no skills.json at all.
  test('missing manifest exits non-zero', () => {
    const res = run(['--dry-run']);
    expect(res.status).not.toBe(0);
    expect(res.stderr).toContain('skills.json');
  });

  test('malformed JSON exits non-zero', () => {
    fs.writeFileSync(path.join(sandbox, 'skills.json'), '{ not json');
    const res = run(['--dry-run']);
    expect(res.status).not.toBe(0);
    expect(res.stderr).toContain('parse');
  });
});
describe('gstack-publish: validation', () => {
  // Each test corrupts exactly one field of an otherwise-valid manifest and
  // expects a targeted validation error on stderr plus a non-zero exit.
  test('missing source file reports validation error and exits 1', () => {
    writeManifest({
      version: '1.0.0',
      description: 't',
      skills: [validSkill('ghost', 'skills/ghost/DOES_NOT_EXIST.md')],
      marketplaces: VALID_MARKETPLACES,
    });
    const r = run(['--dry-run']);
    expect(r.status).not.toBe(0);
    expect(r.stderr).toContain('source file missing');
    // The error must name the offending skill.
    expect(r.stderr).toContain('ghost');
  });

  test('missing slug reports validation error', () => {
    writeSkillFile('skills/x/SKILL.md');
    const s = validSkill('temp', 'skills/x/SKILL.md');
    // Cast to Partial so `delete` of a required field type-checks.
    delete (s as Partial<typeof s>).slug;
    writeManifest({
      version: '1.0.0',
      description: 't',
      skills: [s],
      marketplaces: VALID_MARKETPLACES,
    });
    const r = run(['--dry-run']);
    expect(r.status).not.toBe(0);
    expect(r.stderr).toContain('missing slug');
  });

  test('missing version reports validation error', () => {
    writeSkillFile('skills/x/SKILL.md');
    const s = validSkill('x', 'skills/x/SKILL.md');
    delete (s as Partial<typeof s>).version;
    writeManifest({
      version: '1.0.0',
      description: 't',
      skills: [s],
      marketplaces: VALID_MARKETPLACES,
    });
    const r = run(['--dry-run']);
    expect(r.status).not.toBe(0);
    expect(r.stderr).toContain('missing version');
  });

  // A skill with an empty marketplaces map has nowhere to publish to.
  test('no marketplaces configured reports validation error', () => {
    writeSkillFile('skills/x/SKILL.md');
    const s = { ...validSkill('x', 'skills/x/SKILL.md'), marketplaces: {} };
    writeManifest({ version: '1.0.0', description: 't', skills: [s], marketplaces: VALID_MARKETPLACES });
    const r = run(['--dry-run']);
    expect(r.status).not.toBe(0);
    expect(r.stderr).toContain('no marketplaces configured');
  });
});
describe('gstack-publish: dry-run execution', () => {
  test('happy path reports DRY-RUN tag and templated command', () => {
    writeSkillFile('skills/alpha/SKILL.md');
    writeManifest({
      version: '1.0.0',
      description: 't',
      skills: [validSkill('alpha', 'skills/alpha/SKILL.md')],
      marketplaces: VALID_MARKETPLACES,
    });
    const r = run(['--dry-run']);
    expect(r.status).toBe(0);
    expect(r.stdout).toContain('DRY-RUN');
    expect(r.stdout).toContain('alpha');
    // Dry-run still counts the skill as published in the summary totals.
    expect(r.stdout).toContain('Published: 1');
    expect(r.stdout).toContain('Failed: 0');
  });

  // Passing a slug as a positional arg narrows the batch to that one skill.
  test('per-skill filter publishes only the requested slug', () => {
    writeSkillFile('skills/alpha/SKILL.md');
    writeSkillFile('skills/beta/SKILL.md');
    writeManifest({
      version: '1.0.0',
      description: 't',
      skills: [validSkill('alpha', 'skills/alpha/SKILL.md'), validSkill('beta', 'skills/beta/SKILL.md')],
      marketplaces: VALID_MARKETPLACES,
    });
    const r = run(['alpha', '--dry-run']);
    expect(r.status).toBe(0);
    expect(r.stdout).toContain('Publishing alpha');
    expect(r.stdout).not.toContain('Publishing beta');
    expect(r.stdout).toContain('Published: 1');
  });

  test('unknown skill filter exits non-zero', () => {
    writeSkillFile('skills/alpha/SKILL.md');
    writeManifest({
      version: '1.0.0',
      description: 't',
      skills: [validSkill('alpha', 'skills/alpha/SKILL.md')],
      marketplaces: VALID_MARKETPLACES,
    });
    const r = run(['nonexistent', '--dry-run']);
    expect(r.status).not.toBe(0);
    expect(r.stderr).toContain('skill not found');
  });
});
describe('gstack-publish: auth check isolation', () => {
  test('failing auth for one marketplace does NOT abort the batch in dry-run', () => {
    writeSkillFile('skills/alpha/SKILL.md');
    // One skill targeting both the authenticated and the unauthenticated fake store.
    const skill = validSkill('alpha', 'skills/alpha/SKILL.md', ['fakestore_ok', 'fakestore_noauth']);
    writeManifest({
      version: '1.0.0',
      description: 't',
      skills: [skill],
      marketplaces: VALID_MARKETPLACES,
    });
    const res = run(['--dry-run']);
    // In dry-run, auth failures are reported but don't block dispatch.
    expect(res.status).toBe(0);
    expect(res.stdout).toContain('fakestore_ok: OK');
    expect(res.stdout).toContain('fakestore_noauth: NOT READY');
  });

  test('missing binary reported as not-ready with docs link', () => {
    writeSkillFile('skills/alpha/SKILL.md');
    const skill = validSkill('alpha', 'skills/alpha/SKILL.md', ['fakestore_missing']);
    writeManifest({
      version: '1.0.0',
      description: 't',
      skills: [skill],
      marketplaces: VALID_MARKETPLACES,
    });
    const res = run(['--dry-run']);
    expect(res.stdout).toContain('fakestore_missing: NOT READY');
    expect(res.stdout).toContain('not on PATH');
  });
});
describe('gstack-publish: real manifest sanity', () => {
  test('the real repo skills.json passes --dry-run validation', () => {
    // Runs the actual bin against the actual manifest (ROOT/skills.json).
    // If auth to any real marketplace isn't set up it just reports NOT READY;
    // --dry-run still exits 0 because it doesn't require auth to pass.
    const realBin = path.join(ROOT, 'bin', 'gstack-publish');
    const result = spawnSync('bun', ['run', realBin, '--dry-run'], {
      cwd: ROOT,
      encoding: 'utf-8',
      timeout: 20000,
    });
    expect(result.status).toBe(0);
    expect(result.stdout).toContain('Validating manifest');
    // Every skill in the real manifest should pass validation.
    expect(result.stderr).not.toContain('Manifest validation failed');
  });
});
+175
View File
@@ -0,0 +1,175 @@
/**
* Multi-provider benchmark adapter E2E — hit real claude, codex, gemini CLIs.
*
* Periodic tier: runs under `bun run test:e2e` with EVALS=1. Each provider gated
* on its own `available()` check so missing auth skips that provider (doesn't
* abort the batch). Uses the simplest possible prompt ("Reply with exactly: ok")
* to keep cost near $0.001/provider/run.
*
* What this catches that unit tests don't:
* - CLI output-format drift (the #1 silent breakage path)
* - Token parsing from real provider responses
* - Auth-failure vs timeout vs rate-limit error code routing
* - Cost estimation on real token counts
* - Parallel execution via Promise.allSettled — slow provider doesn't block fast
*
* NOT covered here (would need dedicated test files):
* - Quality judge integration (benchmark-judge.ts, adds ~$0.05/run)
* - Multi-turn tool-using prompts — our single-turn smoke skips `toolCalls > 0`
*/
import { describe, test, expect } from 'bun:test';
import { ClaudeAdapter } from './helpers/providers/claude';
import { GptAdapter } from './helpers/providers/gpt';
import { GeminiAdapter } from './helpers/providers/gemini';
import { runBenchmark } from './helpers/benchmark-runner';
import * as fs from 'fs';
import * as path from 'path';
import * as os from 'os';
// --- Prerequisites / gating ---
// Live tests only run when EVALS=1 is set in the environment (paid tier).
const evalsEnabled = !!process.env.EVALS;
const describeIfEvals = evalsEnabled ? describe : describe.skip;
// Cheapest possible instruction — keeps per-provider cost near $0.001/run.
const PROMPT = 'Reply with exactly this text and nothing else: ok';
// Per-provider gate — each test checks its own availability and skips cleanly.
// We construct adapters outside `test` so Bun's test reporter shows the skip reason.
const claude = new ClaudeAdapter();
const gpt = new GptAdapter();
const gemini = new GeminiAdapter();
// Use a temp working directory so provider CLIs can't accidentally touch the repo.
// NOTE(review): this directory is never removed — consider an afterAll() rmSync cleanup.
const workdir = fs.mkdtempSync(path.join(os.tmpdir(), 'bench-e2e-'));
describeIfEvals('multi-provider benchmark adapters (live)', () => {
  // ---- available() contract: must resolve to { ok: boolean, reason?: string } ----

  test('claude: available() returns structured ok/reason', async () => {
    const check = await claude.available();
    expect(check).toHaveProperty('ok');
    if (!check.ok) {
      expect(typeof check.reason).toBe('string');
      expect(check.reason!.length).toBeGreaterThan(0);
    }
  });

  test('gpt: available() returns structured ok/reason', async () => {
    const check = await gpt.available();
    expect(check).toHaveProperty('ok');
    if (!check.ok) {
      expect(typeof check.reason).toBe('string');
    }
  });

  test('gemini: available() returns structured ok/reason', async () => {
    const check = await gemini.available();
    expect(check).toHaveProperty('ok');
    if (!check.ok) {
      expect(typeof check.reason).toBe('string');
    }
  });

  // ---- Live smoke per provider: output parsing, token accounting, cost estimation ----

  test('claude: trivial prompt produces parseable output', async () => {
    const check = await claude.available();
    if (!check.ok) {
      process.stderr.write(`\nclaude live smoke: SKIPPED — ${check.reason}\n`);
      return;
    }
    const result = await claude.run({ prompt: PROMPT, workdir, timeoutMs: 120_000 });
    if (result.error) {
      // Fix: separate code and reason with ': ' — they were previously concatenated
      // with no delimiter, producing unreadable messages like "timeoutCLI timed out".
      throw new Error(`claude errored: ${result.error.code}: ${result.error.reason}`);
    }
    expect(result.output.toLowerCase()).toContain('ok');
    expect(result.tokens.input).toBeGreaterThan(0);
    expect(result.tokens.output).toBeGreaterThan(0);
    expect(result.durationMs).toBeGreaterThan(0);
    expect(typeof result.modelUsed).toBe('string');
    expect(result.modelUsed.length).toBeGreaterThan(0);
    const cost = claude.estimateCost(result.tokens, result.modelUsed);
    expect(cost).toBeGreaterThan(0);
  }, 150_000);

  test('gpt: trivial prompt produces parseable output', async () => {
    const check = await gpt.available();
    if (!check.ok) {
      process.stderr.write(`\ngpt live smoke: SKIPPED — ${check.reason}\n`);
      return;
    }
    const result = await gpt.run({ prompt: PROMPT, workdir, timeoutMs: 120_000 });
    if (result.error) {
      // Same delimiter fix as the claude smoke above.
      throw new Error(`gpt errored: ${result.error.code}: ${result.error.reason}`);
    }
    expect(result.output.toLowerCase()).toContain('ok');
    expect(result.tokens.input).toBeGreaterThan(0);
    expect(result.tokens.output).toBeGreaterThan(0);
    expect(result.durationMs).toBeGreaterThan(0);
    expect(typeof result.modelUsed).toBe('string');
    const cost = gpt.estimateCost(result.tokens, result.modelUsed);
    expect(cost).toBeGreaterThan(0);
  }, 150_000);

  test('gemini: trivial prompt produces parseable output', async () => {
    const check = await gemini.available();
    if (!check.ok) {
      process.stderr.write(`\ngemini live smoke: SKIPPED — ${check.reason}\n`);
      return;
    }
    const result = await gemini.run({ prompt: PROMPT, workdir, timeoutMs: 120_000 });
    if (result.error) {
      // Same delimiter fix as the claude smoke above.
      throw new Error(`gemini errored: ${result.error.code}: ${result.error.reason}`);
    }
    expect(result.output.toLowerCase()).toContain('ok');
    // Gemini CLI sometimes returns 0 tokens in the result event (older responses);
    // assert non-negative instead of strictly positive.
    expect(result.tokens.input).toBeGreaterThanOrEqual(0);
    expect(result.tokens.output).toBeGreaterThanOrEqual(0);
    expect(result.durationMs).toBeGreaterThan(0);
    expect(typeof result.modelUsed).toBe('string');
  }, 150_000);

  // ---- Error-path semantics: adapters must return errors, never throw ----

  test('timeout error surfaces as error.code=timeout (no exception)', async () => {
    // Use whatever adapter is available first — all three should share timeout semantics.
    const adapter = (await claude.available()).ok ? claude
      : (await gpt.available()).ok ? gpt
      : (await gemini.available()).ok ? gemini
      : null;
    if (!adapter) {
      process.stderr.write('\ntimeout smoke: SKIPPED — no provider available\n');
      return;
    }
    // 100ms timeout is far too short for any real CLI startup → must timeout.
    const result = await adapter.run({ prompt: PROMPT, workdir, timeoutMs: 100 });
    expect(result.error).toBeDefined();
    // Timeout, binary_missing, or unknown (if CLI dies differently) — all acceptable
    // non-crash outcomes. The point is the adapter returns a RunResult, not throws.
    expect(['timeout', 'unknown', 'binary_missing']).toContain(result.error!.code);
    expect(result.durationMs).toBeGreaterThan(0);
  }, 30_000);

  test('runBenchmark: Promise.allSettled means one unavailable provider does not block others', async () => {
    // Use the full runner with all three providers — whichever are unauthed should
    // return entries with available=false and not crash the batch.
    const report = await runBenchmark({
      prompt: PROMPT,
      workdir,
      providers: ['claude', 'gpt', 'gemini'],
      timeoutMs: 120_000,
      skipUnavailable: false,
    });
    expect(report.entries).toHaveLength(3);
    for (const e of report.entries) {
      expect(['claude', 'gpt', 'gemini']).toContain(e.family);
      if (e.available) {
        expect(e.result).toBeDefined();
      } else {
        expect(typeof e.unavailable_reason).toBe('string');
      }
    }
    // At least one available provider should have produced a non-error result in a healthy CI env.
    const hadSuccess = report.entries.some(e => e.available && e.result && !e.result.error);
    // We don't hard-assert this: if NO providers are authed, skip silently.
    if (!hadSuccess) {
      process.stderr.write('\nrunBenchmark live: no provider produced a clean result (no auth?)\n');
    }
  }, 300_000);
});
+370
View File
@@ -0,0 +1,370 @@
/**
* Taste engine — end-to-end tests for `gstack-taste-update`.
*
* Covers the v1 taste profile contract: schema shape, Laplace-smoothed confidence,
* 5%/week decay, dimension extraction from reason strings, session cap, schema
* migration, conflict detection (taste drift), malformed-input recovery.
*
* All tests use GSTACK_STATE_DIR pointing at a temp dir so no real home dir is
* touched. Each test isolates its own state directory.
*/
import { describe, test, expect, beforeEach, afterEach } from 'bun:test';
import { spawnSync } from 'child_process';
import * as fs from 'fs';
import * as path from 'path';
import * as os from 'os';
const ROOT = path.resolve(import.meta.dir, '..');
const BIN = path.join(ROOT, 'bin', 'gstack-taste-update');

/** One learned value within a taste dimension (v1 profile schema). */
interface Preference {
  value: string;
  // Laplace-smoothed score; the decay tests below expect it to shrink 5%/week
  // from last_seen and never go negative.
  confidence: number;
  approved_count: number;
  rejected_count: number;
  // ISO-8601 timestamp of the most recent signal for this value.
  last_seen: string;
}

/** Shape of taste-profile.json as written by gstack-taste-update (version: 1). */
interface TasteProfile {
  version: number;
  updated_at: string;
  dimensions: Record<'fonts' | 'colors' | 'layouts' | 'aesthetics', { approved: Preference[]; rejected: Preference[] }>;
  // Rolling event log; the session-cap tests expect it truncated to 50 entries.
  sessions: Array<{ ts: string; action: 'approved' | 'rejected'; variant: string; reason?: string }>;
}
// Fresh, per-test state and work directories — no cross-test bleed.
let stateDir: string;
let workdir: string;

beforeEach(() => {
  stateDir = fs.mkdtempSync(path.join(os.tmpdir(), 'taste-state-'));
  workdir = fs.mkdtempSync(path.join(os.tmpdir(), 'taste-work-'));
  // Initialize a git repo so gstack-taste-update's getSlug() finds a toplevel
  // (-b main avoids the default-branch warning on newer git).
  spawnSync('git', ['init', '-b', 'main'], { cwd: workdir, stdio: 'pipe' });
});

afterEach(() => {
  fs.rmSync(stateDir, { recursive: true, force: true });
  fs.rmSync(workdir, { recursive: true, force: true });
});
/**
 * Run gstack-taste-update inside the throwaway git workdir. Both
 * GSTACK_STATE_DIR and HOME point at the temp state dir so nothing in the
 * real home directory is ever touched.
 */
function run(args: string[]): { status: number | null; stdout: string; stderr: string } {
  const proc = spawnSync('bun', ['run', BIN, ...args], {
    cwd: workdir,
    env: { ...process.env, GSTACK_STATE_DIR: stateDir, HOME: stateDir },
    encoding: 'utf-8',
    timeout: 10000,
  });
  const stdout = proc.stdout?.toString() ?? '';
  const stderr = proc.stderr?.toString() ?? '';
  return { status: proc.status, stdout, stderr };
}
/**
 * Where the CLI is expected to persist this workdir's profile.
 * Assumes the tool derives the project slug from the repo directory's
 * basename — TODO(review): confirm against getSlug() in the binary.
 */
function profilePath(): string {
  return path.join(stateDir, 'projects', path.basename(workdir), 'taste-profile.json');
}
/** Load and parse the profile the CLI wrote for this test's workdir. */
function readProfile(): TasteProfile {
  const raw = fs.readFileSync(profilePath(), 'utf-8');
  return JSON.parse(raw);
}
/** Seed an arbitrary profile on disk (typed `unknown` so tests can write malformed shapes). */
function writeProfile(p: unknown): void {
  const target = profilePath();
  fs.mkdirSync(path.dirname(target), { recursive: true });
  fs.writeFileSync(target, JSON.stringify(p, null, 2));
}
describe('taste-engine: first-write lifecycle', () => {
  test('approved creates profile with correct v1 schema', () => {
    const r = run(['approved', 'variant-A', '--reason', 'fonts: Geist Sans; colors: emerald']);
    expect(r.status).toBe(0);
    const p = readProfile();
    expect(p.version).toBe(1);
    expect(p.dimensions.fonts.approved).toHaveLength(1);
    expect(p.dimensions.fonts.approved[0].value).toBe('Geist Sans');
    expect(p.dimensions.fonts.approved[0].approved_count).toBe(1);
    expect(p.dimensions.fonts.approved[0].rejected_count).toBe(0);
    // Laplace: 1 / (1 + 0 + 1) = 0.5
    expect(p.dimensions.fonts.approved[0].confidence).toBeCloseTo(0.5, 5);
    // Second dimension from the same reason string lands in colors.
    expect(p.dimensions.colors.approved[0].value).toBe('emerald');
    // The raw session event is recorded alongside the extracted dimensions.
    expect(p.sessions).toHaveLength(1);
    expect(p.sessions[0].action).toBe('approved');
    expect(p.sessions[0].variant).toBe('variant-A');
  });

  test('rejected bumps rejected_count not approved_count', () => {
    run(['rejected', 'variant-B', '--reason', 'fonts: Comic Sans']);
    const p = readProfile();
    expect(p.dimensions.fonts.rejected).toHaveLength(1);
    expect(p.dimensions.fonts.rejected[0].rejected_count).toBe(1);
    expect(p.dimensions.fonts.rejected[0].approved_count).toBe(0);
    // A rejection must not create a mirror entry on the approved side.
    expect(p.dimensions.fonts.approved).toHaveLength(0);
  });

  test('session recorded even when no dimensions extractable from reason', () => {
    const r = run(['approved', 'variant-C']); // no --reason
    expect(r.status).toBe(0);
    const p = readProfile();
    expect(p.sessions).toHaveLength(1);
    // All four dimensions stay empty when there is nothing to extract.
    for (const dim of ['fonts', 'colors', 'layouts', 'aesthetics'] as const) {
      expect(p.dimensions[dim].approved).toHaveLength(0);
      expect(p.dimensions[dim].rejected).toHaveLength(0);
    }
  });
});
describe('taste-engine: Laplace-smoothed confidence', () => {
  test('repeated approvals raise confidence toward 1', () => {
    for (const i of [0, 1, 2, 3, 4]) {
      run(['approved', `variant-${i}`, '--reason', 'fonts: Geist Sans']);
    }
    const profile = readProfile();
    const entry = profile.dimensions.fonts.approved[0];
    expect(entry.approved_count).toBe(5);
    // Laplace smoothing: 5 / (5 + 0 + 1) = 0.8333…
    expect(entry.confidence).toBeCloseTo(5 / 6, 5);
  });

  test('mixed approvals + rejections balance out', () => {
    run(['approved', 'v1', '--reason', 'fonts: Inter']);
    run(['approved', 'v2', '--reason', 'fonts: Inter']);
    run(['rejected', 'v3', '--reason', 'fonts: Inter']);
    const profile = readProfile();
    // Approvals and rejections accumulate on separate entries for the same value.
    const yes = profile.dimensions.fonts.approved[0];
    const no = profile.dimensions.fonts.rejected[0];
    expect(yes.approved_count).toBe(2);
    expect(yes.rejected_count).toBe(0);
    expect(no.rejected_count).toBe(1);
    expect(no.approved_count).toBe(0);
  });
});
describe('taste-engine: decay math', () => {
  test('show applies 5%/week decay to stored confidence', () => {
    // Seed with a profile where the single approved font was last_seen 4 weeks ago
    const fourWeeksAgo = new Date(Date.now() - 4 * 7 * 24 * 60 * 60 * 1000).toISOString();
    writeProfile({
      version: 1,
      updated_at: new Date().toISOString(),
      dimensions: {
        fonts: {
          approved: [{ value: 'Aged Font', confidence: 0.8, approved_count: 4, rejected_count: 0, last_seen: fourWeeksAgo }],
          rejected: [],
        },
        colors: { approved: [], rejected: [] },
        layouts: { approved: [], rejected: [] },
        aesthetics: { approved: [], rejected: [] },
      },
      sessions: [],
    });
    const r = run(['show']);
    expect(r.status).toBe(0);
    // After 4 weeks: 0.8 * (0.95)^4 ≈ 0.651
    const expectedConf = 0.8 * Math.pow(0.95, 4);
    // `show` is expected to print "<value> — conf <float>"; parse the float back out.
    const match = r.stdout.match(/Aged Font — conf (\d+\.\d+)/);
    expect(match).toBeTruthy();
    const displayedConf = parseFloat(match![1]);
    expect(displayedConf).toBeCloseTo(expectedConf, 2);
  });

  test('decay never goes below zero', () => {
    // 3 years ≈ 156 weeks. 0.95^156 ≈ 0.00036, well below 0.01.
    const yearsAgo = new Date(Date.now() - 3 * 365 * 24 * 60 * 60 * 1000).toISOString();
    writeProfile({
      version: 1,
      updated_at: new Date().toISOString(),
      dimensions: {
        fonts: {
          approved: [{ value: 'Ancient', confidence: 1.0, approved_count: 1, rejected_count: 0, last_seen: yearsAgo }],
          rejected: [],
        },
        colors: { approved: [], rejected: [] },
        layouts: { approved: [], rejected: [] },
        aesthetics: { approved: [], rejected: [] },
      },
      sessions: [],
    });
    const r = run(['show']);
    expect(r.status).toBe(0);
    const match = r.stdout.match(/Ancient — conf (\d+\.\d+)/);
    expect(match).toBeTruthy();
    const conf = parseFloat(match![1]);
    // Decay must clamp at 0 (never negative) while still being effectively zero.
    expect(conf).toBeGreaterThanOrEqual(0);
    expect(conf).toBeLessThan(0.01);
  });
});
describe('taste-engine: dimension extraction', () => {
  // Reason strings use "<dimension>: <value>[, <value>…]" segments joined by ';'.
  test('parses multiple dimensions from one reason string', () => {
    run(['approved', 'v1', '--reason', 'fonts: Geist, IBM Plex; colors: emerald; layouts: grid-12; aesthetics: brutalist']);
    const p = readProfile();
    // Comma-separated values within one dimension become separate entries.
    expect(p.dimensions.fonts.approved.map(x => x.value).sort()).toEqual(['Geist', 'IBM Plex']);
    expect(p.dimensions.colors.approved[0].value).toBe('emerald');
    expect(p.dimensions.layouts.approved[0].value).toBe('grid-12');
    expect(p.dimensions.aesthetics.approved[0].value).toBe('brutalist');
  });

  test('value matching is case-insensitive', () => {
    run(['approved', 'v1', '--reason', 'fonts: Geist']);
    run(['approved', 'v2', '--reason', 'fonts: GEIST']);
    const p = readProfile();
    // Should merge into a single entry
    expect(p.dimensions.fonts.approved).toHaveLength(1);
    expect(p.dimensions.fonts.approved[0].approved_count).toBe(2);
  });

  test('unknown dimension labels are silently ignored', () => {
    run(['approved', 'v1', '--reason', 'weather: sunny; mood: happy']);
    const p = readProfile();
    // Session still recorded
    expect(p.sessions).toHaveLength(1);
    // No dimensions populated
    for (const dim of ['fonts', 'colors', 'layouts', 'aesthetics'] as const) {
      expect(p.dimensions[dim].approved).toHaveLength(0);
    }
  });
});
describe('taste-engine: session cap', () => {
  test('sessions truncate to last 50 entries (FIFO)', () => {
    // Record 55 approvals; the cap should retain only the newest 50.
    Array.from({ length: 55 }, (_, idx) => idx).forEach(idx => {
      run(['approved', `v${idx}`, '--reason', 'fonts: Geist']);
    });
    const profile = readProfile();
    expect(profile.sessions).toHaveLength(50);
    // Oldest five (v0–v4) dropped; retained window starts at v5.
    expect(profile.sessions[0].variant).toBe('v5');
    expect(profile.sessions[49].variant).toBe('v54');
  });
});
describe('taste-engine: taste drift conflict detection', () => {
  test('warns when approved value has strong opposite signal', () => {
    // Seed a strong rejected entry: 4 rejections, no approvals → Laplace = 0/5 but that's
    // not > 0.6. Let's seed it directly with confidence 0.8.
    writeProfile({
      version: 1,
      updated_at: new Date().toISOString(),
      dimensions: {
        fonts: {
          approved: [],
          rejected: [
            { value: 'Comic Sans', confidence: 0.8, approved_count: 0, rejected_count: 4, last_seen: new Date().toISOString() },
          ],
        },
        colors: { approved: [], rejected: [] },
        layouts: { approved: [], rejected: [] },
        aesthetics: { approved: [], rejected: [] },
      },
      sessions: [],
    });
    const result = run(['approved', 'v1', '--reason', 'fonts: Comic Sans']);
    expect(result.status).toBe(0);
    // "taste drift" note should go to stderr
    expect(result.stderr).toContain('taste drift');
    expect(result.stderr).toContain('Comic Sans');
  });
  test('does NOT warn when signal is weak', () => {
    writeProfile({
      version: 1,
      updated_at: new Date().toISOString(),
      dimensions: {
        fonts: {
          approved: [],
          // Single rejection (< 3) — shouldn't trigger drift warning
          rejected: [
            { value: 'Inter', confidence: 0.5, approved_count: 0, rejected_count: 1, last_seen: new Date().toISOString() },
          ],
        },
        colors: { approved: [], rejected: [] },
        layouts: { approved: [], rejected: [] },
        aesthetics: { approved: [], rejected: [] },
      },
      sessions: [],
    });
    const result = run(['approved', 'v1', '--reason', 'fonts: Inter']);
    expect(result.status).toBe(0);
    expect(result.stderr).not.toContain('taste drift');
  });
});
describe('taste-engine: migration', () => {
  test('legacy profile without version gets migrated to v1', () => {
    // Simulate a legacy approved.json-style structure
    writeProfile({
      // no version field
      dimensions: {
        fonts: {
          approved: [
            { value: 'Legacy', confidence: 0.7, approved_count: 3, rejected_count: 1, last_seen: new Date().toISOString() },
          ],
          rejected: [],
        },
      },
      sessions: [{ ts: new Date().toISOString(), action: 'approved', variant: 'legacy-v1' }],
    });
    const result = run(['migrate']);
    expect(result.status).toBe(0);
    const migrated = readProfile();
    expect(migrated.version).toBe(1);
    expect(migrated.dimensions.fonts.approved[0].value).toBe('Legacy');
    // Dimensions absent from the legacy file are backfilled.
    for (const dim of ['colors', 'layouts', 'aesthetics'] as const) {
      expect(migrated.dimensions[dim]).toBeDefined();
    }
    expect(migrated.sessions).toHaveLength(1);
    expect(migrated.sessions[0].variant).toBe('legacy-v1');
  });
  test('migration truncates oversized sessions array to last 50', () => {
    const legacySessions = Array.from({ length: 100 }, (_, idx) => ({
      ts: new Date().toISOString(),
      action: 'approved' as const,
      variant: `legacy-${idx}`,
    }));
    writeProfile({ dimensions: {}, sessions: legacySessions });
    const result = run(['migrate']);
    expect(result.status).toBe(0);
    const migrated = readProfile();
    // Oldest half dropped; newest 50 retained in order.
    expect(migrated.sessions).toHaveLength(50);
    expect(migrated.sessions[0].variant).toBe('legacy-50');
    expect(migrated.sessions[49].variant).toBe('legacy-99');
  });
});
describe('taste-engine: resilience', () => {
  test('malformed JSON profile falls back to empty and does not crash', () => {
    const profileFile = profilePath();
    fs.mkdirSync(path.dirname(profileFile), { recursive: true });
    fs.writeFileSync(profileFile, '{ this is not json');
    const result = run(['approved', 'v1', '--reason', 'fonts: Geist']);
    // Should succeed (graceful fallback)
    expect(result.status).toBe(0);
    // Warning on stderr
    expect(result.stderr).toContain('WARN');
    // File should now be valid JSON
    const recovered = readProfile();
    expect(recovered.version).toBe(1);
    expect(recovered.dimensions.fonts.approved[0].value).toBe('Geist');
  });
  test('show on nonexistent profile prints empty summary without error', () => {
    const result = run(['show']);
    expect(result.status).toBe(0);
    expect(result.stdout).toContain('taste-profile.json');
  });
  test('approved without variant arg exits non-zero with usage hint', () => {
    const result = run(['approved']);
    expect(result.status).not.toBe(0);
    expect(result.stderr).toContain('Usage');
  });
  test('unknown command exits non-zero', () => {
    const result = run(['banana']);
    expect(result.status).not.toBe(0);
    expect(result.stderr).toContain('Usage');
  });
});