From 4ad73f7362d678c1a6f5adf9f4c65115fd2b6e1a Mon Sep 17 00:00:00 2001
From: Garry Tan <garrytan@gmail.com>
Date: Sun, 15 Mar 2026 09:39:36 -0500
Subject: [PATCH] feat: unified gstack eval CLI with list, compare, push,
 cache, cost

- lib/cli-eval.ts: routes to list/compare/summary/push/cost/cache/watch
  subcommands. Ports logic from 4 separate scripts into unified entry.
  Adds ANSI color for TTY (respects NO_COLOR), --limit flag for list.
- bin/gstack-eval: bash wrapper matching bin/gstack-sync pattern
- package.json: eval:* scripts now point to lib/cli-eval.ts
- supabase/migrations/004_eval_costs.sql: per-model cost tracking + RLS
- docs/eval-result-format.md: public format spec for any language
- test/lib-eval-cli.test.ts: integration tests (spawn CLI subprocess)
  including 3 push failure modes (file-not-found, invalid schema,
  sync unavailable)

215 tests passing across 13 files.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 bin/gstack-eval                        |   8 +
 docs/eval-result-format.md             | 136 +++++++
 lib/cli-eval.ts                        | 469 +++++++++++++++++++++++++
 package.json                           |   8 +-
 supabase/migrations/004_eval_costs.sql |  39 ++
 test/lib-eval-cli.test.ts              | 178 ++++++++++
 6 files changed, 834 insertions(+), 4 deletions(-)
 create mode 100755 bin/gstack-eval
 create mode 100644 docs/eval-result-format.md
 create mode 100644 lib/cli-eval.ts
 create mode 100644 supabase/migrations/004_eval_costs.sql
 create mode 100644 test/lib-eval-cli.test.ts
diff --git a/bin/gstack-eval b/bin/gstack-eval
new file mode 100755
index 00000000..91ce03ae
--- /dev/null
+++ b/bin/gstack-eval
@@ -0,0 +1,8 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# gstack eval — unified eval CLI
+# Delegates to lib/cli-eval.ts via bun
+
+GSTACK_DIR="${GSTACK_DIR:-$(cd "$(dirname "$0")/.." && pwd)}"
+exec bun run "$GSTACK_DIR/lib/cli-eval.ts" "$@"
diff --git a/docs/eval-result-format.md b/docs/eval-result-format.md
new file mode 100644
index 00000000..f58195b3
--- /dev/null
+++ b/docs/eval-result-format.md
@@ -0,0 +1,136 @@
+# Standard Eval Result Format
+
+This document defines the JSON format that any language can produce and push into gstack's eval infrastructure via `gstack eval push <file>`.
+
+## Required Fields
+
+| Field | Type | Description |
+|-------|------|-------------|
+| `schema_version` | `number` | Format version (currently `1`) |
+| `version` | `string` | Version of the tool/system being evaluated |
+| `git_branch` | `string` | Git branch name |
+| `git_sha` | `string` | Git commit SHA (short or full) |
+| `timestamp` | `string` | ISO 8601 timestamp |
+| `tier` | `string` | Eval tier: `"e2e"`, `"llm-judge"`, or custom |
+| `total` | `number` | Total number of test cases |
+| `passed` | `number` | Number of passing test cases |
+| `failed` | `number` | Number of failing test cases |
+| `total_cost_usd` | `number` | Total estimated cost in USD |
+| `duration_seconds` | `number` | Total wall-clock duration in seconds |
+| `all_results` | `array` | Array of test result objects (see below) |
+
+## Optional Fields
+
+| Field | Type | Description |
+|-------|------|-------------|
+| `hostname` | `string` | Machine hostname |
+| `label` | `string` | Human-readable label for this run |
+| `prompt_sha` | `string` | SHA of the prompt(s) used |
+| `by_category` | `object` | `{ category: { passed, failed } }` breakdown |
+| `costs` | `array` | Per-model cost entries (see below) |
+| `comparison` | `array` | A/B comparison entries |
+| `failures` | `array` | Structured failure details |
+| `_partial` | `boolean` | `true` for incremental saves, absent in final |
+
+## Test Result Entry (`all_results[]`)
+
+Each entry in `all_results` must have:
+
+| Field | Type | Required | Description |
+|-------|------|----------|-------------|
+| `name` | `string` | Yes | Unique test name |
+| `passed` | `boolean` | Yes | Whether this test passed |
+| `suite` | `string` | No | Suite/group name |
+| `tier` | `string` | No | Test tier |
+| `duration_ms` | `number` | No | Duration in milliseconds |
+| `cost_usd` | `number` | No | Cost for this test |
+| `output` | `object` | No | Open-ended output data |
+| `turns_used` | `number` | No | LLM conversation turns |
+| `exit_reason` | `string` | No | `"success"`, `"timeout"`, `"error_max_turns"`, etc. |
+| `detection_rate` | `number` | No | Number of bugs detected (for QA evals); a count, not a ratio |
+| `judge_scores` | `object` | No | `{ dimension: score }` from LLM judge |
+| `judge_reasoning` | `string` | No | LLM judge's reasoning |
+| `error` | `string` | No | Error message if test failed |
+
+## Cost Entry (`costs[]`)
+
+| Field | Type | Description |
+|-------|------|-------------|
+| `model` | `string` | Model ID (e.g., `"claude-sonnet-4-6"`) |
+| `calls` | `number` | Number of API calls |
+| `input_tokens` | `number` | Total input tokens |
+| `output_tokens` | `number` | Total output tokens |
+
+## Example
+
+```json
+{
+  "schema_version": 1,
+  "version": "0.3.3",
+  "git_branch": "main",
+  "git_sha": "abc1234",
+  "timestamp": "2025-05-01T12:00:00Z",
+  "hostname": "ci-runner-01",
+  "tier": "e2e",
+  "total": 2,
+  "passed": 1,
+  "failed": 1,
+  "total_cost_usd": 1.50,
+  "duration_seconds": 120,
+  "all_results": [
+    {
+      "name": "login-flow",
+      "suite": "auth",
+      "passed": true,
+      "duration_ms": 60000,
+      "cost_usd": 0.75,
+      "turns_used": 5
+    },
+    {
+      "name": "checkout-flow",
+      "suite": "commerce",
+      "passed": false,
+      "duration_ms": 60000,
+      "cost_usd": 0.75,
+      "error": "Timed out waiting for payment confirmation"
+    }
+  ],
+  "costs": [
+    {
+      "model": "claude-sonnet-4-6",
+      "calls": 10,
+      "input_tokens": 500000,
+      "output_tokens": 250000
+    }
+  ]
+}
+```
+
+## Legacy Format
+
+gstack's internal eval system uses a slightly different format (from `test/helpers/eval-store.ts`). The `normalizeFromLegacy()` and `normalizeToLegacy()` functions in `lib/eval-format.ts` handle conversion:
+
+| Legacy field | Standard field |
+|-------------|---------------|
+| `branch` | `git_branch` |
+| `total_tests` | `total` |
+| `total_duration_ms` | `duration_seconds` (÷ 1000) |
+| `tests` | `all_results` |
+
+## Validation
+
+Use `gstack eval push <file>` to validate and push a result file. Validation checks:
+- All required fields present with correct types
+- `all_results` is an array of objects
+- Each entry has `name` (string) and `passed` (boolean)
+
+## Pushing Results
+
+```bash
+# Validate + save locally + push to team Supabase (if configured)
+gstack eval push my-eval-results.json
+
+# From any language — just write JSON and push:
+python run_evals.py --output results.json
+gstack eval push results.json
+```
diff --git a/lib/cli-eval.ts b/lib/cli-eval.ts
new file mode 100644
index 00000000..df16d033
--- /dev/null
+++ b/lib/cli-eval.ts
@@ -0,0 +1,469 @@
+#!/usr/bin/env bun
+/**
+ * Unified eval CLI: gstack eval <subcommand>
+ *
+ * Subcommands:
+ *   list [--branch <name>] [--tier <tier>] [--limit N]
+ *   compare [file-a] [file-b]
+ *   summary [--limit N]
+ *   push <file>
+ *   cost <file>
+ *   cache read|write|stats|clear|verify [args...]
+ *   watch
+ */
+
+import * as fs from 'fs';
+import * as path from 'path';
+import {
+  EVAL_DIR,
+  GSTACK_DEV_DIR,
+  readJSON,
+  listEvalFiles,
+  loadEvalResults,
+  formatTimestamp,
+} from './util';
+import {
+  findPreviousRun,
+  compareEvalResults,
+  formatComparison,
+} from '../test/helpers/eval-store';
+import type { EvalResult } from '../test/helpers/eval-store';
+import type { ComparisonResult } from '../test/helpers/eval-store';
+
+// --- ANSI color helpers ---
+
+const isTTY = process.stdout.isTTY && !process.env.NO_COLOR;
+// Each helper wraps `s` in an SGR escape pair when isTTY is truthy, else returns `s` unchanged.
+function green(s: string): string { if (!isTTY) return s; return `\x1b[32m${s}\x1b[0m`; }
+function red(s: string): string { if (!isTTY) return s; return `\x1b[31m${s}\x1b[0m`; }
+function dim(s: string): string { if (!isTTY) return s; return `\x1b[2m${s}\x1b[0m`; }
+
+/**
+ * Render a comparison via formatComparison() and, when color output is
+ * enabled, tint its markers: "↑" green, "↓" red, " = " dim.
+ */
+export function formatComparisonColor(c: ComparisonResult): string {
+  let text = formatComparison(c);
+  if (isTTY) {
+    text = text.replace(/↑/g, green('↑')).replace(/↓/g, red('↓')).replace(/ = /g, dim(' = '));
+  }
+  return text;
+}
+
+// --- Subcommands ---
+
+/**
+ * `list` subcommand: print a table of eval runs, newest first.
+ * Flags: --branch / --tier filter runs; --limit N caps the rows printed
+ * (default 50; a non-numeric or negative N is ignored). Legacy (`branch`,
+ * `total_tests`) and standard (`git_branch`, `total`) field names are both
+ * read, so files saved by `push` are listed with their real values.
+ */
+async function cmdList(args: string[]): Promise<void> {
+  let filterBranch: string | null = null;
+  let filterTier: string | null = null;
+  let limit = 50;
+
+  for (let i = 0; i < args.length; i++) {
+    if (args[i] === '--branch' && args[i + 1]) { filterBranch = args[++i]; }
+    else if (args[i] === '--tier' && args[i + 1]) { filterTier = args[++i]; }
+    else if (args[i] === '--limit' && args[i + 1]) {
+      const parsed = parseInt(args[++i], 10);
+      // slice(0, NaN) would silently print no rows, so keep the default instead.
+      if (Number.isInteger(parsed) && parsed >= 0) limit = parsed;
+    }
+  }
+
+  const files = listEvalFiles();
+  if (files.length === 0) {
+    console.log('No eval runs yet. Run: EVALS=1 bun run test:evals');
+    return;
+  }
+
+  type RunSummary = { file: string; timestamp: string; branch: string; tier: string;
+    version: string; passed: number; total: number; cost: number };
+
+  const runs: RunSummary[] = [];
+  for (const file of files) {
+    const data = readJSON<Record<string, any>>(file);
+    if (!data) continue;
+    const branch = data.branch ?? data.git_branch;
+    if (filterBranch && branch !== filterBranch) continue;
+    if (filterTier && data.tier !== filterTier) continue;
+    runs.push({
+      file: path.basename(file),
+      timestamp: data.timestamp || '',
+      branch: branch || 'unknown',
+      tier: data.tier || 'unknown',
+      version: data.version || '?',
+      passed: data.passed || 0,
+      total: data.total_tests ?? data.total ?? 0,
+      cost: data.total_cost_usd || 0,
+    });
+  }
+
+  runs.sort((a, b) => b.timestamp.localeCompare(a.timestamp));
+  const displayed = runs.slice(0, limit);
+
+  console.log('');
+  console.log(`Eval History (${runs.length} total runs)`);
+  console.log('═'.repeat(90));
+  console.log(
+    '  ' + 'Date'.padEnd(17) + 'Branch'.padEnd(28) + 'Tier'.padEnd(12) +
+    'Pass'.padEnd(8) + 'Cost'.padEnd(8) + 'Version'
+  );
+  console.log('─'.repeat(90));
+
+  for (const run of displayed) {
+    const date = formatTimestamp(run.timestamp);
+    // Pad after truncating so long branch names keep the 28-column width.
+    const branch = (run.branch.length > 26 ? run.branch.slice(0, 23) + '...' : run.branch).padEnd(28);
+    const pass = `${run.passed}/${run.total}`.padEnd(8);
+    const cost = `$${run.cost.toFixed(2)}`.padEnd(8);
+    console.log(`  ${date.padEnd(17)}${branch}${run.tier.padEnd(12)}${pass}${cost}v${run.version}`);
+  }
+
+  console.log('─'.repeat(90));
+  const totalCost = runs.reduce((s, r) => s + r.cost, 0);
+  console.log(`  ${runs.length} runs | Total spend: $${totalCost.toFixed(2)} | Showing: ${displayed.length}`);
+  console.log(`  Dir: ${EVAL_DIR}`);
+  console.log('');
+}
+
+/**
+ * `compare` subcommand: print a comparison of two eval runs.
+ * Two args: file-a is "before", file-b is "after". One arg: it is "after" and
+ * findPreviousRun() picks "before". Any other count: the first file from
+ * listEvalFiles() is "after". Relative paths resolve against EVAL_DIR; a
+ * missing file or invalid JSON prints an error and exits 1.
+ */
+async function cmdCompare(args: string[]): Promise<void> {
+  function loadResult(filepath: string): EvalResult {
+    const resolved = path.isAbsolute(filepath) ? filepath : path.join(EVAL_DIR, filepath);
+    if (!fs.existsSync(resolved)) {
+      console.error(`File not found: ${resolved}`);
+      process.exit(1);
+    }
+    try {
+      return JSON.parse(fs.readFileSync(resolved, 'utf-8'));
+    } catch (err: any) {
+      console.error(`Invalid JSON in ${resolved}: ${err.message}`);
+      process.exit(1);
+    }
+  }
+
+  let beforeFile: string;
+  let afterFile: string;
+  if (args.length === 2) {
+    [beforeFile, afterFile] = args;
+  } else if (args.length === 1) {
+    afterFile = args[0];
+    const resolved = path.isAbsolute(afterFile) ? afterFile : path.join(EVAL_DIR, afterFile);
+    const afterResult = loadResult(resolved);
+    const prev = findPreviousRun(EVAL_DIR, afterResult.tier, afterResult.branch, resolved);
+    if (!prev) { console.log('No previous run found to compare against.'); return; }
+    beforeFile = prev;
+  } else {
+    const files = listEvalFiles();
+    if (files.length < 2) { console.log('Need at least 2 eval runs to compare. Run evals again.'); return; }
+    afterFile = files[0];
+    const afterResult = loadResult(afterFile);
+    const prev = findPreviousRun(EVAL_DIR, afterResult.tier, afterResult.branch, afterFile);
+    if (!prev) { console.log('No previous run of the same tier found to compare against.'); return; }
+    beforeFile = prev;
+  }
+
+  const beforeResult = loadResult(beforeFile);
+  const afterResult = loadResult(afterFile);
+  if (beforeResult.tier !== afterResult.tier) {
+    console.warn(`Warning: comparing different tiers (${beforeResult.tier} vs ${afterResult.tier})`);
+  }
+  if (beforeResult.schema_version !== afterResult.schema_version) {
+    console.warn(`Warning: schema version mismatch (${beforeResult.schema_version} vs ${afterResult.schema_version})`);
+  }
+
+  const comparison = compareEvalResults(beforeResult, afterResult, beforeFile, afterFile);
+  console.log(formatComparisonColor(comparison));
+}
+
+/**
+ * `summary` subcommand: print aggregate stats across eval runs — run counts,
+ * spend, average detection_rate over e2e tests, tests that both passed and
+ * failed across runs ("flaky"), and per-branch e2e stats.
+ *
+ * Reads the legacy shape (`tests`, `branch`) and the standard shape written
+ * by `push` (`all_results`, `git_branch`); a run with no test list
+ * contributes no tests instead of throwing. --limit N goes to loadEvalResults().
+ */
+async function cmdSummary(args: string[]): Promise<void> {
+  let limit: number | undefined;
+  for (let i = 0; i < args.length; i++) {
+    if (args[i] === '--limit' && args[i + 1]) { limit = parseInt(args[++i], 10); }
+  }
+
+  const results = loadEvalResults<EvalResult>(undefined, limit);
+  if (results.length === 0) {
+    console.log('No eval runs yet. Run: EVALS=1 bun run test:evals');
+    return;
+  }
+
+  // Local helpers. The ?? fallbacks accept standard-format runs (all_results / git_branch).
+  // mean() returns null for an empty list so "no data" stays distinguishable from 0.
+  const mean = (xs: number[]): number | null =>
+    xs.length > 0 ? xs.reduce((a, b) => a + b, 0) / xs.length : null;
+  const costOf = (r: EvalResult): number => r.total_cost_usd || 0;
+  const branchOf = (r: EvalResult): string => r.branch ?? (r as any).git_branch ?? 'unknown';
+  const testsOf = (r: EvalResult): EvalResult['tests'] => r.tests ?? (r as any).all_results ?? [];
+  const detectionsOf = (r: EvalResult): number[] =>
+    testsOf(r).map(t => t.detection_rate).filter((d): d is number => d !== undefined);
+
+  const e2eRuns = results.filter(r => r.tier === 'e2e');
+  const judgeRuns = results.filter(r => r.tier === 'llm-judge');
+  const totalCost = results.reduce((s, r) => s + costOf(r), 0);
+  const avgE2ECost = mean(e2eRuns.map(costOf)) ?? 0;
+  const avgJudgeCost = mean(judgeRuns.map(costOf)) ?? 0;
+  const avgDetection = mean(e2eRuns.flatMap(detectionsOf));
+
+  // Flaky tests: the same tier + name seen both passing and failing
+  const testResults = new Map<string, boolean[]>();
+  for (const r of results) {
+    for (const t of testsOf(r)) {
+      const key = `${r.tier}:${t.name}`;
+      if (!testResults.has(key)) testResults.set(key, []);
+      testResults.get(key)!.push(t.passed);
+    }
+  }
+  const flakyTests = [...testResults]
+    .filter(([, outcomes]) => outcomes.some(o => o) && outcomes.some(o => !o))
+    .map(([name]) => name);
+
+  // Branch stats (e2e runs only)
+  const branchStats = new Map<string, { runs: number; detections: number[] }>();
+  for (const r of e2eRuns) {
+    const branch = branchOf(r);
+    if (!branchStats.has(branch)) branchStats.set(branch, { runs: 0, detections: [] });
+    const stats = branchStats.get(branch)!;
+    stats.runs++;
+    stats.detections.push(...detectionsOf(r));
+  }
+
+  // Print
+  console.log('');
+  console.log('Eval Summary');
+  console.log('═'.repeat(60));
+  console.log(`  Total runs:        ${results.length} (${e2eRuns.length} e2e, ${judgeRuns.length} llm-judge)`);
+  console.log(`  Total spend:       $${totalCost.toFixed(2)}`);
+  console.log(`  Avg cost/e2e:      $${avgE2ECost.toFixed(2)}`);
+  console.log(`  Avg cost/judge:    $${avgJudgeCost.toFixed(2)}`);
+  if (avgDetection !== null) {
+    console.log(`  Avg detection:     ${avgDetection.toFixed(1)} bugs`);
+  }
+  console.log('─'.repeat(60));
+
+  if (flakyTests.length > 0) {
+    console.log(`  Flaky tests (${flakyTests.length}):`);
+    for (const name of flakyTests) console.log(`    - ${name}`);
+    console.log('─'.repeat(60));
+  }
+
+  if (branchStats.size > 0) {
+    console.log('  Branches:');
+    const sorted = [...branchStats.entries()]
+      .sort((a, b) => (mean(b[1].detections) ?? 0) - (mean(a[1].detections) ?? 0));
+    for (const [branch, stats] of sorted) {
+      const avgDet = mean(stats.detections);
+      const det = avgDet !== null ? ` avg det: ${avgDet.toFixed(1)}` : '';
+      console.log(`    ${branch.padEnd(30)} ${stats.runs} runs${det}`);
+    }
+    console.log('─'.repeat(60));
+  }
+
+  const timestamps = results.map(r => r.timestamp).filter(Boolean).sort();
+  if (timestamps.length > 0) {
+    console.log(`  Date range: ${formatTimestamp(timestamps[0])} → ${formatTimestamp(timestamps[timestamps.length - 1])}`);
+  }
+  console.log(`  Dir: ${EVAL_DIR}`);
+  console.log('');
+}
+
+/**
+ * `push` subcommand: validate <file>, copy it into EVAL_DIR, then attempt a team sync.
+ * A missing argument/file, invalid JSON or failed validation exits 1; a sync that throws does not.
+ */
+async function cmdPush(args: string[]): Promise<void> {
+  const [filePath] = args;
+  if (!filePath) {
+    console.error('Usage: gstack eval push <file>');
+    process.exit(1);
+  }
+
+  const resolved = path.isAbsolute(filePath) ? filePath : path.resolve(filePath);
+  if (!fs.existsSync(resolved)) {
+    console.error(`File not found: ${resolved}`);
+    process.exit(1);
+  }
+
+  let data: unknown;
+  try {
+    data = JSON.parse(fs.readFileSync(resolved, 'utf-8'));
+  } catch (err: any) {
+    console.error(`Invalid JSON: ${err.message}`);
+    process.exit(1);
+  }
+
+  const { validateEvalResult } = await import('./eval-format');
+  const validation = validateEvalResult(data);
+  if (!validation.valid) {
+    console.error('Validation errors:');
+    for (const problem of validation.errors) console.error(`  - ${problem}`);
+    process.exit(1);
+  }
+
+  const localPath = path.join(EVAL_DIR, path.basename(resolved));
+  fs.mkdirSync(EVAL_DIR, { recursive: true });
+  fs.copyFileSync(resolved, localPath);
+  console.log(`Saved to ${localPath}`);
+
+  // Push to team store (non-fatal)
+  try {
+    const { pushEvalRun } = await import('./sync');
+    const synced = await pushEvalRun(data as Record<string, unknown>);
+    console.log(synced ? 'Synced to team store ✓' : 'Sync queued (will retry later)');
+  } catch {
+    console.log('Team sync not configured — local only');
+  }
+}
+
+/** `cost` subcommand: print the per-model cost dashboard built from the file's
+ *  `costs` array. Exits 1 when the argument is missing or the file cannot be read. */
+async function cmdCost(args: string[]): Promise<void> {
+  const [filePath] = args;
+  if (!filePath) {
+    console.error('Usage: gstack eval cost <file>');
+    process.exit(1);
+  }
+
+  const resolved = path.isAbsolute(filePath) ? filePath : path.resolve(filePath);
+  const data = readJSON<{ costs?: any[] }>(resolved);
+  if (!data) {
+    console.error(`Cannot read file: ${resolved}`);
+    process.exit(1);
+  }
+  const costs = data.costs;
+  if (!costs || costs.length === 0) {
+    console.log('No cost data in this eval file.');
+    return;
+  }
+  const { computeCosts, formatCostDashboard } = await import('./eval-cost');
+  console.log(formatCostDashboard(computeCosts(costs)));
+}
+
+/**
+ * `cache` subcommand: dispatch read / write / stats / clear / verify to ./eval-cache.
+ * `write` takes JSON from its third argument or, when stdin is not a TTY, from stdin;
+ * malformed JSON is reported and exits 1. A `read` miss prints MISS and exits 1.
+ */
+async function cmdCache(args: string[]): Promise<void> {
+  const sub = args[0];
+  const {
+    cacheRead, cacheWrite, cacheStats, cacheClear, cacheVerify,
+  } = await import('./eval-cache');
+
+  switch (sub) {
+    case 'read': {
+      const [suite, key] = [args[1], args[2]];
+      if (!suite || !key) { console.error('Usage: gstack eval cache read <suite> <key>'); process.exit(1); }
+      const data = cacheRead(suite, key);
+      if (data === null) { console.log('MISS'); process.exit(1); }
+      console.log(JSON.stringify(data, null, 2));
+      break;
+    }
+    case 'write': {
+      const [suite, key] = [args[1], args[2]];
+      if (!suite || !key) { console.error('Usage: gstack eval cache write <suite> <key> [json]'); process.exit(1); }
+      if (!args[3] && process.stdin.isTTY) { console.error('Provide JSON as argument or pipe to stdin'); process.exit(1); }
+      const jsonData = args[3] || await Bun.stdin.text();
+      let parsed: any;
+      try { parsed = JSON.parse(jsonData); }
+      catch (err: any) { console.error(`Invalid JSON: ${err.message}`); process.exit(1); }
+      cacheWrite(suite, key, parsed);
+      console.log('OK');
+      break;
+    }
+    case 'stats': {
+      const stats = cacheStats(args[1]);
+      if (stats.suites.length === 0) { console.log('Cache is empty'); return; }
+      for (const s of stats.suites) {
+        const size = s.size_bytes >= 1024 ? `${(s.size_bytes / 1024).toFixed(1)}KB` : `${s.size_bytes}B`;
+        console.log(`  ${s.name.padEnd(20)} ${s.entries} entries  ${size}`);
+      }
+      break;
+    }
+    case 'clear': {
+      const result = cacheClear(args[1]);
+      console.log(`Cleared ${result.deleted} cache entries`);
+      break;
+    }
+    case 'verify': {
+      const result = cacheVerify(args[1]);
+      console.log(`Valid: ${result.valid}  Invalid: ${result.invalid}`);
+      for (const err of result.errors) console.log(`  ERROR: ${err}`);
+      if (result.invalid > 0) process.exit(1);
+      break;
+    }
+    default:
+      console.error('Usage: gstack eval cache <read|write|stats|clear|verify> [args...]');
+      process.exit(1);
+  }
+}
+
+/**
+ * `watch` subcommand: run scripts/eval-watch.ts under bun with this process's
+ * stdio, forwarding the remaining CLI args, then exit with the child's code.
+ */
+async function cmdWatch(): Promise<void> {
+  const script = path.resolve(__dirname, '..', 'scripts', 'eval-watch.ts');
+  const forwarded = process.argv.slice(3);
+  const stdio = { stdin: 'inherit', stdout: 'inherit', stderr: 'inherit' } as const;
+  const child = Bun.spawn(['bun', 'run', script, ...forwarded], stdio);
+  process.exit(await child.exited);
+}
+
+function printUsage(): void { // Print the top-level command reference to stdout.
+  console.log(`
+gstack eval — eval management CLI
+
+Usage: gstack eval <command> [args]
+
+Commands:
+  list [--branch X] [--tier X] [--limit N]   List eval runs (default limit: 50)
+  compare [file-a] [file-b]                   Compare two eval runs
+  summary [--limit N]                         Aggregate stats across all runs
+  push <file>                                 Validate + save + sync an eval result
+  cost <file>                                 Show per-model cost breakdown
+  cache read|write|stats|clear|verify         Manage eval cache
+  watch                                       Live E2E test dashboard
+`);
+}
+
+// --- Main ---
+
+/** Report a failed subcommand and exit non-zero rather than leaving its promise floating. */
+function run(p: Promise<void>): void { p.catch((err) => { console.error(err); process.exit(1); }); }
+// Dispatch only when executed directly: this module also exports
+// formatComparisonColor, and importing it must not run (or exit) the CLI.
+if (import.meta.main) {
+  const command = process.argv[2];
+  const cmdArgs = process.argv.slice(3);
+  switch (command) {
+    case 'list':    run(cmdList(cmdArgs)); break;
+    case 'compare': run(cmdCompare(cmdArgs)); break;
+    case 'summary': run(cmdSummary(cmdArgs)); break;
+    case 'push':    run(cmdPush(cmdArgs)); break;
+    case 'cost':    run(cmdCost(cmdArgs)); break;
+    case 'cache':   run(cmdCache(cmdArgs)); break;
+    case 'watch':   run(cmdWatch()); break;
+    case '--help': case '-h': case 'help': case undefined: printUsage(); break;
+    default: console.error(`Unknown command: ${command}`); printUsage(); process.exit(1);
+  }
+}
diff --git a/package.json b/package.json
index a5044b7d..18090e7d 100644
--- a/package.json
+++ b/package.json
@@ -18,10 +18,10 @@
     "skill:check": "bun run scripts/skill-check.ts",
     "dev:skill": "bun run scripts/dev-skill.ts",
     "start": "bun run browse/src/server.ts",
-    "eval:list": "bun run scripts/eval-list.ts",
-    "eval:compare": "bun run scripts/eval-compare.ts",
-    "eval:summary": "bun run scripts/eval-summary.ts",
-    "eval:watch": "bun run scripts/eval-watch.ts"
+    "eval:list": "bun run lib/cli-eval.ts list",
+    "eval:compare": "bun run lib/cli-eval.ts compare",
+    "eval:summary": "bun run lib/cli-eval.ts summary",
+    "eval:watch": "bun run lib/cli-eval.ts watch"
   },
   "dependencies": {
     "playwright": "^1.58.2",
diff --git a/supabase/migrations/004_eval_costs.sql b/supabase/migrations/004_eval_costs.sql
new file mode 100644
index 00000000..614d2013
--- /dev/null
+++ b/supabase/migrations/004_eval_costs.sql
@@ -0,0 +1,39 @@
+-- Per-model cost tracking for eval runs.
+-- Stores cost breakdown by model so teams can analyze spend patterns.
+
+create table eval_costs (
+  id uuid primary key default gen_random_uuid(),
+  team_id uuid references teams(id) not null,  -- owning team; the RLS policies below key on it
+  eval_run_id uuid references eval_runs(id) on delete cascade,  -- nullable; row is removed with its eval run
+  model text not null,
+  calls int not null,
+  input_tokens int not null,
+  output_tokens int not null,
+  estimated_cost_usd numeric(10,6) not null,  -- 6 decimal places, max 9999.999999
+  created_at timestamptz default now()
+);
+
+-- Index for querying costs by team and eval run
+create index idx_eval_costs_team_run on eval_costs(team_id, eval_run_id);
+
+-- RLS: members read/insert their own team's rows, admins delete; no UPDATE policy is defined
+alter table eval_costs enable row level security;
+
+create policy "Team members can read costs"
+  on eval_costs for select
+  using (team_id in (
+    select team_id from team_members where user_id = auth.uid()
+  ));
+
+create policy "Team members can insert costs"
+  on eval_costs for insert
+  with check (team_id in (
+    select team_id from team_members where user_id = auth.uid()
+  ));
+
+create policy "Admins can delete costs"
+  on eval_costs for delete
+  using (team_id in (
+    select team_id from team_members
+    where user_id = auth.uid() and role = 'admin'
+  ));
diff --git a/test/lib-eval-cli.test.ts b/test/lib-eval-cli.test.ts
new file mode 100644
index 00000000..38814f76
--- /dev/null
+++ b/test/lib-eval-cli.test.ts
@@ -0,0 +1,178 @@
+/**
+ * Tests for lib/cli-eval.ts — eval CLI integration tests.
+ *
+ * Spawns the CLI as a subprocess and verifies exit codes + output.
+ */
+
+import { describe, test, expect, beforeAll, afterAll } from 'bun:test';
+import * as fs from 'fs';
+import * as path from 'path';
+import * as os from 'os';
+
+const CLI_PATH = path.resolve(__dirname, '..', 'lib', 'cli-eval.ts');
+const TEST_DIR = path.join(os.tmpdir(), `gstack-cli-eval-test-${Date.now()}`);
+const EVAL_DIR = path.join(TEST_DIR, 'evals');
+
+/** Run the CLI via `bun run` in a subprocess with HOME and GSTACK_STATE_DIR redirected
+ *  under TEST_DIR (entries in `env` win) and return its stdout, stderr and exit code. */
+function runCli(args: string[], env?: Record<string, string>): { stdout: string; stderr: string; exitCode: number } {
+  const childEnv = {
+    ...process.env,
+    HOME: TEST_DIR,
+    GSTACK_STATE_DIR: path.join(TEST_DIR, '.gstack'),
+    ...env,
+  };
+  const { stdout, stderr, exitCode } = Bun.spawnSync(['bun', 'run', CLI_PATH, ...args], {
+    env: childEnv,
+    cwd: TEST_DIR,
+  });
+  const text = (buf: { toString(): string } | null | undefined): string => buf?.toString() || '';
+  return { stdout: text(stdout), stderr: text(stderr), exitCode };
+}
+
+/**
+ * Write a minimal legacy-format eval result (`branch`, `total_tests`, `tests`)
+ * to EVAL_DIR/<name> and return its path. `overrides` entries replace defaults.
+ */
+function writeEvalFile(name: string, overrides?: Partial<Record<string, any>>): string {
+  const defaults = {
+    schema_version: 1,
+    version: '0.3.3',
+    branch: 'main',
+    git_sha: 'abc1234',
+    timestamp: '2025-05-01T12:00:00Z',
+    hostname: 'test',
+    tier: 'e2e',
+    total_tests: 1, passed: 1, failed: 0,
+    total_cost_usd: 0.50, total_duration_ms: 30000,
+    tests: [{ name: 'test-a', suite: 'core', tier: 'e2e', passed: true, duration_ms: 30000, cost_usd: 0.50 }],
+  };
+  const filePath = path.join(EVAL_DIR, name);
+  const payload = JSON.stringify({ ...defaults, ...overrides }, null, 2);
+  fs.writeFileSync(filePath, payload);
+  return filePath;
+}
+
+describe('lib/cli-eval', () => {
+  beforeAll(() => {
+    fs.mkdirSync(EVAL_DIR, { recursive: true });
+    fs.mkdirSync(path.join(TEST_DIR, '.gstack-dev', 'evals'), { recursive: true });
+  });
+
+  afterAll(() => {
+    fs.rmSync(TEST_DIR, { recursive: true, force: true });
+  });
+  // Usage output: --help and no args exit 0; an unknown command exits 1.
+  describe('help', () => {
+    test('shows usage with --help', () => {
+      const { stdout, exitCode } = runCli(['--help']);
+      expect(exitCode).toBe(0);
+      expect(stdout).toContain('gstack eval');
+      expect(stdout).toContain('list');
+      expect(stdout).toContain('compare');
+      expect(stdout).toContain('push');
+    });
+
+    test('shows usage with no args', () => {
+      const { stdout, exitCode } = runCli([]);
+      expect(exitCode).toBe(0);
+      expect(stdout).toContain('gstack eval');
+    });
+
+    test('unknown command shows error and usage', () => {
+      const { stderr, exitCode } = runCli(['nonsense']);
+      expect(exitCode).toBe(1);
+      expect(stderr).toContain('Unknown command');
+    });
+  });
+  // list: only the "no runs" message is checked; the exit code is not asserted.
+  describe('list', () => {
+    test('shows "no eval runs" when empty', () => {
+      const { stdout } = runCli(['list']);
+      expect(stdout).toContain('No eval runs');
+    });
+  });
+  // push: usage, missing file, invalid JSON, invalid schema, then a valid standard-format file.
+  describe('push', () => {
+    test('push: missing file argument shows usage', () => {
+      const { stderr, exitCode } = runCli(['push']);
+      expect(exitCode).toBe(1);
+      expect(stderr).toContain('Usage');
+    });
+
+    test('push: file not found exits with error', () => {
+      const { stderr, exitCode } = runCli(['push', '/nonexistent/eval.json']);
+      expect(exitCode).toBe(1);
+      expect(stderr).toContain('File not found');
+    });
+
+    test('push: invalid JSON exits with error', () => {
+      const badFile = path.join(TEST_DIR, 'bad.json');
+      fs.writeFileSync(badFile, 'not json at all');
+      const { stderr, exitCode } = runCli(['push', badFile]);
+      expect(exitCode).toBe(1);
+      expect(stderr).toContain('Invalid JSON');
+    });
+
+    test('push: invalid schema exits with validation errors', () => {
+      const invalidFile = path.join(TEST_DIR, 'invalid-schema.json');
+      fs.writeFileSync(invalidFile, JSON.stringify({ not: 'a valid eval' }));
+      const { stderr, exitCode } = runCli(['push', invalidFile]);
+      expect(exitCode).toBe(1);
+      expect(stderr).toContain('Validation errors');
+    });
+
+    test('push: valid file succeeds with local-only message', () => {
+      // Write a valid standard format eval
+      const validFile = path.join(TEST_DIR, 'valid-eval.json');
+      fs.writeFileSync(validFile, JSON.stringify({
+        schema_version: 1,
+        version: '0.3.3',
+        git_branch: 'main',
+        git_sha: 'abc1234',
+        timestamp: '2025-05-01T12:00:00Z',
+        hostname: 'test',
+        tier: 'e2e',
+        total: 1,
+        passed: 1,
+        failed: 0,
+        total_cost_usd: 0.50,
+        duration_seconds: 30,
+        all_results: [{ name: 'test-a', passed: true }],
+      }));
+      const { stdout, exitCode } = runCli(['push', validFile]);
+      expect(exitCode).toBe(0);
+      expect(stdout).toContain('Saved to');
+      // sync not configured, so we get local-only or "not configured"
+      expect(stdout).toMatch(/local|not configured|Synced|queued/i);
+    });
+  });
+  // cost: usage error, and a file that has no `costs` array.
+  describe('cost', () => {
+    test('cost: missing file shows usage', () => {
+      const { stderr, exitCode } = runCli(['cost']);
+      expect(exitCode).toBe(1);
+      expect(stderr).toContain('Usage');
+    });
+
+    test('cost: file without costs shows message', () => {
+      const file = path.join(TEST_DIR, 'no-costs.json');
+      fs.writeFileSync(file, JSON.stringify({ version: '1.0' }));
+      const { stdout } = runCli(['cost', file]);
+      expect(stdout).toContain('No cost data');
+    });
+  });
+  // cache: usage error, and `stats` when nothing has been cached.
+  describe('cache', () => {
+    test('cache: no subcommand shows usage', () => {
+      const { stderr, exitCode } = runCli(['cache']);
+      expect(exitCode).toBe(1);
+      expect(stderr).toContain('Usage');
+    });
+
+    test('cache stats: empty cache', () => {
+      const { stdout } = runCli(['cache', 'stats']);
+      expect(stdout).toContain('empty');
+    });
+  });
+});