From 4ad73f7362d678c1a6f5adf9f4c65115fd2b6e1a Mon Sep 17 00:00:00 2001 From: Garry Tan Date: Sun, 15 Mar 2026 09:39:36 -0500 Subject: [PATCH] feat: unified gstack eval CLI with list, compare, push, cache, cost - lib/cli-eval.ts: routes to list/compare/summary/push/cost/cache/watch subcommands. Ports logic from 4 separate scripts into unified entry. Adds ANSI color for TTY (respects NO_COLOR), --limit flag for list. - bin/gstack-eval: bash wrapper matching bin/gstack-sync pattern - package.json: eval:* scripts now point to lib/cli-eval.ts - supabase/migrations/004_eval_costs.sql: per-model cost tracking + RLS - docs/eval-result-format.md: public format spec for any language - test/lib-eval-cli.test.ts: integration tests (spawn CLI subprocess) including 3 push failure modes (file-not-found, invalid schema, sync unavailable) 215 tests passing across 13 files. Co-Authored-By: Claude Opus 4.6 (1M context) --- bin/gstack-eval | 8 + docs/eval-result-format.md | 136 +++++++ lib/cli-eval.ts | 469 +++++++++++++++++++++++++ package.json | 8 +- supabase/migrations/004_eval_costs.sql | 39 ++ test/lib-eval-cli.test.ts | 178 ++++++++++ 6 files changed, 834 insertions(+), 4 deletions(-) create mode 100755 bin/gstack-eval create mode 100644 docs/eval-result-format.md create mode 100644 lib/cli-eval.ts create mode 100644 supabase/migrations/004_eval_costs.sql create mode 100644 test/lib-eval-cli.test.ts diff --git a/bin/gstack-eval b/bin/gstack-eval new file mode 100755 index 00000000..91ce03ae --- /dev/null +++ b/bin/gstack-eval @@ -0,0 +1,8 @@ +#!/usr/bin/env bash +set -euo pipefail + +# gstack eval — unified eval CLI +# Delegates to lib/cli-eval.ts via bun + +GSTACK_DIR="${GSTACK_DIR:-$(cd "$(dirname "$0")/.." 
&& pwd)}" +exec bun run "$GSTACK_DIR/lib/cli-eval.ts" "$@" diff --git a/docs/eval-result-format.md b/docs/eval-result-format.md new file mode 100644 index 00000000..f58195b3 --- /dev/null +++ b/docs/eval-result-format.md @@ -0,0 +1,136 @@ +# Standard Eval Result Format + +This document defines the JSON format that any language can produce and push into gstack's eval infrastructure via `gstack eval push <file>`. + +## Required Fields + +| Field | Type | Description | +|-------|------|-------------| +| `schema_version` | `number` | Format version (currently `1`) | +| `version` | `string` | Version of the tool/system being evaluated | +| `git_branch` | `string` | Git branch name | +| `git_sha` | `string` | Git commit SHA (short or full) | +| `timestamp` | `string` | ISO 8601 timestamp | +| `tier` | `string` | Eval tier: `"e2e"`, `"llm-judge"`, or custom | +| `total` | `number` | Total number of test cases | +| `passed` | `number` | Number of passing test cases | +| `failed` | `number` | Number of failing test cases | +| `total_cost_usd` | `number` | Total estimated cost in USD | +| `duration_seconds` | `number` | Total wall-clock duration in seconds | +| `all_results` | `array` | Array of test result objects (see below) | + +## Optional Fields + +| Field | Type | Description | +|-------|------|-------------| +| `hostname` | `string` | Machine hostname | +| `label` | `string` | Human-readable label for this run | +| `prompt_sha` | `string` | SHA of the prompt(s) used | +| `by_category` | `object` | `{ category: { passed, failed } }` breakdown | +| `costs` | `array` | Per-model cost entries (see below) | +| `comparison` | `array` | A/B comparison entries | +| `failures` | `array` | Structured failure details | +| `_partial` | `boolean` | `true` for incremental saves, absent in final | + +## Test Result Entry (`all_results[]`) + +Each entry in `all_results` must have: + +| Field | Type | Required | Description | +|-------|------|----------|-------------| +| `name` | `string` | 
Yes | Unique test name | +| `passed` | `boolean` | Yes | Whether this test passed | +| `suite` | `string` | No | Suite/group name | +| `tier` | `string` | No | Test tier | +| `duration_ms` | `number` | No | Duration in milliseconds | +| `cost_usd` | `number` | No | Cost for this test | +| `output` | `object` | No | Open-ended output data | +| `turns_used` | `number` | No | LLM conversation turns | +| `exit_reason` | `string` | No | `"success"`, `"timeout"`, `"error_max_turns"`, etc. | +| `detection_rate` | `number` | No | Bugs detected (for QA evals) | +| `judge_scores` | `object` | No | `{ dimension: score }` from LLM judge | +| `judge_reasoning` | `string` | No | LLM judge's reasoning | +| `error` | `string` | No | Error message if test failed | + +## Cost Entry (`costs[]`) + +| Field | Type | Description | +|-------|------|-------------| +| `model` | `string` | Model ID (e.g., `"claude-sonnet-4-6"`) | +| `calls` | `number` | Number of API calls | +| `input_tokens` | `number` | Total input tokens | +| `output_tokens` | `number` | Total output tokens | + +## Example + +```json +{ + "schema_version": 1, + "version": "0.3.3", + "git_branch": "main", + "git_sha": "abc1234", + "timestamp": "2025-05-01T12:00:00Z", + "hostname": "ci-runner-01", + "tier": "e2e", + "total": 2, + "passed": 1, + "failed": 1, + "total_cost_usd": 1.50, + "duration_seconds": 120, + "all_results": [ + { + "name": "login-flow", + "suite": "auth", + "passed": true, + "duration_ms": 60000, + "cost_usd": 0.75, + "turns_used": 5 + }, + { + "name": "checkout-flow", + "suite": "commerce", + "passed": false, + "duration_ms": 60000, + "cost_usd": 0.75, + "error": "Timed out waiting for payment confirmation" + } + ], + "costs": [ + { + "model": "claude-sonnet-4-6", + "calls": 10, + "input_tokens": 500000, + "output_tokens": 250000 + } + ] +} +``` + +## Legacy Format + +gstack's internal eval system uses a slightly different format (from `test/helpers/eval-store.ts`). 
The `normalizeFromLegacy()` and `normalizeToLegacy()` functions in `lib/eval-format.ts` handle conversion: + +| Legacy field | Standard field | +|-------------|---------------| +| `branch` | `git_branch` | +| `total_tests` | `total` | +| `total_duration_ms` | `duration_seconds` (÷ 1000) | +| `tests` | `all_results` | + +## Validation + +Use `gstack eval push <file>` to validate and push a result file. Validation checks: +- All required fields present with correct types +- `all_results` is an array of objects +- Each entry has `name` (string) and `passed` (boolean) + +## Pushing Results + +```bash +# Validate + save locally + push to team Supabase (if configured) +gstack eval push my-eval-results.json + +# From any language — just write JSON and push: +python run_evals.py --output results.json +gstack eval push results.json +``` diff --git a/lib/cli-eval.ts b/lib/cli-eval.ts new file mode 100644 index 00000000..df16d033 --- /dev/null +++ b/lib/cli-eval.ts @@ -0,0 +1,469 @@ +#!/usr/bin/env bun +/** + * Unified eval CLI: gstack eval <command> + * + * Subcommands: + * list [--branch <name>] [--tier <name>] [--limit N] + * compare [file-a] [file-b] + * summary [--limit N] + * push <file> + * cost <file> + * cache read|write|stats|clear|verify [args...] + * watch + */ + +import * as fs from 'fs'; +import * as path from 'path'; +import { + EVAL_DIR, + GSTACK_DEV_DIR, + readJSON, + listEvalFiles, + loadEvalResults, + formatTimestamp, +} from './util'; +import { + findPreviousRun, + compareEvalResults, + formatComparison, +} from '../test/helpers/eval-store'; +import type { EvalResult } from '../test/helpers/eval-store'; +import type { ComparisonResult } from '../test/helpers/eval-store'; + +// --- ANSI color helpers --- + +const isTTY = process.stdout.isTTY && !process.env.NO_COLOR; + +function green(s: string): string { return isTTY ? `\x1b[32m${s}\x1b[0m` : s; } +function red(s: string): string { return isTTY ? `\x1b[31m${s}\x1b[0m` : s; } +function dim(s: string): string { return isTTY ? 
`\x1b[2m${s}\x1b[0m` : s; } + +/** + * Wrap ANSI colors around comparison arrows: ↑ green, ↓ red, = dim. + */ +export function formatComparisonColor(c: ComparisonResult): string { + const plain = formatComparison(c); + if (!isTTY) return plain; + return plain + .replace(/↑/g, green('↑')) + .replace(/↓/g, red('↓')) + .replace(/ = /g, dim(' = ')); +} + +// --- Subcommands --- +/** List saved eval runs newest-first. --branch/--tier filter, --limit caps the rows shown (default 50; a non-numeric value is ignored). Reads both legacy (`branch`, `total_tests`) and standard (`git_branch`, `total`) field names, because `push` stores standard-format files in the same directory. */ +async function cmdList(args: string[]): Promise<void> { + let filterBranch: string | null = null; + let filterTier: string | null = null; + let limit = 50; + + for (let i = 0; i < args.length; i++) { + if (args[i] === '--branch' && args[i + 1]) { filterBranch = args[++i]; } + else if (args[i] === '--tier' && args[i + 1]) { filterTier = args[++i]; } + else if (args[i] === '--limit' && args[i + 1]) { const n = parseInt(args[++i], 10); if (!Number.isNaN(n)) limit = n; } + } + + const files = listEvalFiles(); + if (files.length === 0) { + console.log('No eval runs yet. Run: EVALS=1 bun run test:evals'); + return; + } + + interface RunSummary { + file: string; + timestamp: string; + branch: string; + tier: string; + version: string; + passed: number; + total: number; + cost: number; + } + + const runs: RunSummary[] = []; + for (const file of files) { + const data = readJSON<Record<string, any>>(file); + if (!data) continue; + if (filterBranch && (data.branch ?? data.git_branch) !== filterBranch) continue; + if (filterTier && data.tier !== filterTier) continue; + runs.push({ + file: path.basename(file), + timestamp: data.timestamp || '', + branch: data.branch || data.git_branch || 'unknown', + tier: data.tier || 'unknown', + version: data.version || '?', + passed: data.passed || 0, + total: data.total_tests || data.total || 0, + cost: data.total_cost_usd || 0, + }); + } + + runs.sort((a, b) => b.timestamp.localeCompare(a.timestamp)); + const displayed = runs.slice(0, limit); + + console.log(''); + console.log(`Eval History (${runs.length} total runs)`); + console.log('═'.repeat(90)); + console.log( + ' ' + + 'Date'.padEnd(17) + + 'Branch'.padEnd(28) + + 'Tier'.padEnd(12) + + 'Pass'.padEnd(8) + + 
'Cost'.padEnd(8) + + 'Version' + ); + console.log('─'.repeat(90)); + + for (const run of displayed) { + const date = formatTimestamp(run.timestamp); + const branch = (run.branch.length > 26 ? run.branch.slice(0, 23) + '...' : run.branch).padEnd(28); + const pass = `${run.passed}/${run.total}`.padEnd(8); + const cost = `$${run.cost.toFixed(2)}`.padEnd(8); + console.log(` ${date.padEnd(17)}${branch}${run.tier.padEnd(12)}${pass}${cost}v${run.version}`); + } + + console.log('─'.repeat(90)); + const totalCost = runs.reduce((s, r) => s + r.cost, 0); + console.log(` ${runs.length} runs | Total spend: $${totalCost.toFixed(2)} | Showing: ${displayed.length}`); + console.log(` Dir: ${EVAL_DIR}`); + console.log(''); +} + +async function cmdCompare(args: string[]): Promise<void> { + function loadResult(filepath: string): EvalResult { + const resolved = path.isAbsolute(filepath) ? filepath : path.join(EVAL_DIR, filepath); + if (!fs.existsSync(resolved)) { + console.error(`File not found: ${resolved}`); + process.exit(1); + } + return JSON.parse(fs.readFileSync(resolved, 'utf-8')); + } + + let beforeFile: string; + let afterFile: string; + + if (args.length === 2) { + beforeFile = args[0]; + afterFile = args[1]; + } else if (args.length === 1) { + afterFile = args[0]; + const resolved = path.isAbsolute(afterFile) ? afterFile : path.join(EVAL_DIR, afterFile); + const afterResult = loadResult(resolved); + const prev = findPreviousRun(EVAL_DIR, afterResult.tier, afterResult.branch, resolved); + if (!prev) { + console.log('No previous run found to compare against.'); + return; + } + beforeFile = prev; + } else { + const files = listEvalFiles(); + if (files.length < 2) { + console.log('Need at least 2 eval runs to compare. 
Run evals again.'); + return; + } + afterFile = files[0]; + const afterResult = loadResult(afterFile); + const prev = findPreviousRun(EVAL_DIR, afterResult.tier, afterResult.branch, afterFile); + if (!prev) { + console.log('No previous run of the same tier found to compare against.'); + return; + } + beforeFile = prev; + } + + const beforeResult = loadResult(beforeFile); + const afterResult = loadResult(afterFile); + + if (beforeResult.tier !== afterResult.tier) { + console.warn(`Warning: comparing different tiers (${beforeResult.tier} vs ${afterResult.tier})`); + } + if (beforeResult.schema_version !== afterResult.schema_version) { + console.warn(`Warning: schema version mismatch (${beforeResult.schema_version} vs ${afterResult.schema_version})`); + } + + const comparison = compareEvalResults(beforeResult, afterResult, beforeFile, afterFile); + console.log(formatComparisonColor(comparison)); +} +/** Print aggregate stats over stored eval runs: total and per-tier average spend, detection rates, tests with mixed pass/fail outcomes, and a per-branch breakdown. --limit is forwarded to loadEvalResults; a non-numeric value is ignored. */ +async function cmdSummary(args: string[]): Promise<void> { + let limit: number | undefined; + for (let i = 0; i < args.length; i++) { + if (args[i] === '--limit' && args[i + 1]) { const n = parseInt(args[++i], 10); if (!Number.isNaN(n)) limit = n; } + } + + const results = loadEvalResults(undefined, limit); + if (results.length === 0) { + console.log('No eval runs yet. Run: EVALS=1 bun run test:evals'); + return; + } + + const e2eRuns = results.filter(r => r.tier === 'e2e'); + const judgeRuns = results.filter(r => r.tier === 'llm-judge'); + const totalCost = results.reduce((s, r) => s + (r.total_cost_usd || 0), 0); + const avgE2ECost = e2eRuns.length > 0 ? e2eRuns.reduce((s, r) => s + (r.total_cost_usd || 0), 0) / e2eRuns.length : 0; + const avgJudgeCost = judgeRuns.length > 0 ? judgeRuns.reduce((s, r) => s + (r.total_cost_usd || 0), 0) / judgeRuns.length : 0; + + // Detection rates + const detectionRates: number[] = []; + for (const r of e2eRuns) { + for (const t of r.tests) { + if (t.detection_rate !== undefined) detectionRates.push(t.detection_rate); + } + } + const avgDetection = detectionRates.length > 0 + ? 
detectionRates.reduce((a, b) => a + b, 0) / detectionRates.length + : null; + + // Flaky tests + const testResults = new Map<string, boolean[]>(); + for (const r of results) { + for (const t of r.tests) { + const key = `${r.tier}:${t.name}`; + if (!testResults.has(key)) testResults.set(key, []); + testResults.get(key)!.push(t.passed); + } + } + const flakyTests: string[] = []; + for (const [name, outcomes] of testResults) { + if (outcomes.length >= 2 && outcomes.some(o => o) && outcomes.some(o => !o)) { + flakyTests.push(name); + } + } + + // Branch stats + const branchStats = new Map<string, { runs: number; detections: number[] }>(); + for (const r of e2eRuns) { + if (!branchStats.has(r.branch)) branchStats.set(r.branch, { runs: 0, detections: [] }); + const stats = branchStats.get(r.branch)!; + stats.runs++; + for (const t of r.tests) { + if (t.detection_rate !== undefined) stats.detections.push(t.detection_rate); + } + } + + // Print + console.log(''); + console.log('Eval Summary'); + console.log('═'.repeat(60)); + console.log(` Total runs: ${results.length} (${e2eRuns.length} e2e, ${judgeRuns.length} llm-judge)`); + console.log(` Total spend: $${totalCost.toFixed(2)}`); + console.log(` Avg cost/e2e: $${avgE2ECost.toFixed(2)}`); + console.log(` Avg cost/judge: $${avgJudgeCost.toFixed(2)}`); + if (avgDetection !== null) { + console.log(` Avg detection: ${avgDetection.toFixed(1)} bugs`); + } + console.log('─'.repeat(60)); + + if (flakyTests.length > 0) { + console.log(` Flaky tests (${flakyTests.length}):`); + for (const name of flakyTests) console.log(` - ${name}`); + console.log('─'.repeat(60)); + } + + if (branchStats.size > 0) { + console.log(' Branches:'); + const sorted = [...branchStats.entries()].sort((a, b) => { + const avgA = a[1].detections.length > 0 ? a[1].detections.reduce((x, y) => x + y, 0) / a[1].detections.length : 0; + const avgB = b[1].detections.length > 0 ? 
b[1].detections.reduce((x, y) => x + y, 0) / b[1].detections.length : 0; + return avgB - avgA; + }); + for (const [branch, stats] of sorted) { + const avgDet = stats.detections.length > 0 + ? stats.detections.reduce((a, b) => a + b, 0) / stats.detections.length + : null; + const det = avgDet !== null ? ` avg det: ${avgDet.toFixed(1)}` : ''; + console.log(` ${branch.padEnd(30)} ${stats.runs} runs${det}`); + } + console.log('─'.repeat(60)); + } + + const timestamps = results.map(r => r.timestamp).filter(Boolean).sort(); + if (timestamps.length > 0) { + console.log(` Date range: ${formatTimestamp(timestamps[0])} → ${formatTimestamp(timestamps[timestamps.length - 1])}`); + } + console.log(` Dir: ${EVAL_DIR}`); + console.log(''); +} + +async function cmdPush(args: string[]): Promise<void> { + const filePath = args[0]; + if (!filePath) { + console.error('Usage: gstack eval push <file>'); + process.exit(1); + } + + const resolved = path.isAbsolute(filePath) ? filePath : path.resolve(filePath); + if (!fs.existsSync(resolved)) { + console.error(`File not found: ${resolved}`); + process.exit(1); + } + + // Load and validate + let data: unknown; + try { + data = JSON.parse(fs.readFileSync(resolved, 'utf-8')); + } catch (err: any) { + console.error(`Invalid JSON: ${err.message}`); + process.exit(1); + } + + const { validateEvalResult } = await import('./eval-format'); + const validation = validateEvalResult(data); + if (!validation.valid) { + console.error('Validation errors:'); + for (const err of validation.errors) console.error(` - ${err}`); + process.exit(1); + } + + // Copy to local eval dir (skipped when the file already lives there) + const basename = path.basename(resolved); + const localPath = path.join(EVAL_DIR, basename); + fs.mkdirSync(EVAL_DIR, { recursive: true }); + if (path.resolve(localPath) !== path.resolve(resolved)) fs.copyFileSync(resolved, localPath); + console.log(`Saved to ${localPath}`); + + // Push to team store (non-fatal) + try { + const { pushEvalRun } = await import('./sync'); + const ok = await pushEvalRun(data as Record<string, unknown>); + if (ok) console.log('Synced to 
team store ✓'); + else console.log('Sync queued (will retry later)'); + } catch { + console.log('Team sync not configured — local only'); + } +} + +async function cmdCost(args: string[]): Promise<void> { + const filePath = args[0]; + if (!filePath) { + console.error('Usage: gstack eval cost <file>'); + process.exit(1); + } + + const resolved = path.isAbsolute(filePath) ? filePath : path.resolve(filePath); + const data = readJSON<{ costs?: any[] }>(resolved); + if (!data) { + console.error(`Cannot read file: ${resolved}`); + process.exit(1); + } + + if (!data.costs || data.costs.length === 0) { + console.log('No cost data in this eval file.'); + return; + } + + const { computeCosts, formatCostDashboard } = await import('./eval-cost'); + const dashboard = computeCosts(data.costs); + console.log(formatCostDashboard(dashboard)); +} +/** Dispatch `cache read|write|stats|clear|verify` to ./eval-cache. Exits 1 on a cache miss, on malformed JSON passed to `write`, on invalid entries found by `verify`, and on an unknown subcommand. */ +async function cmdCache(args: string[]): Promise<void> { + const sub = args[0]; + const { + cacheRead, cacheWrite, cacheStats, cacheClear, cacheVerify, + } = await import('./eval-cache'); + + switch (sub) { + case 'read': { + const [suite, key] = [args[1], args[2]]; + if (!suite || !key) { console.error('Usage: gstack eval cache read <suite> <key>'); process.exit(1); } + const data = cacheRead(suite, key); + if (data === null) { console.log('MISS'); process.exit(1); } + console.log(JSON.stringify(data, null, 2)); + break; + } + case 'write': { + const [suite, key] = [args[1], args[2]]; + if (!suite || !key) { console.error('Usage: gstack eval cache write <suite> <key> [json]'); process.exit(1); } + let jsonData: string; + if (args[3]) { + jsonData = args[3]; + } else if (!process.stdin.isTTY) { + jsonData = await Bun.stdin.text(); + } else { + console.error('Provide JSON as argument or pipe to stdin'); + process.exit(1); + } + let parsed: unknown; try { parsed = JSON.parse(jsonData); } catch (err: any) { console.error(`Invalid JSON: ${err.message}`); process.exit(1); } + cacheWrite(suite, key, parsed); + console.log('OK'); + break; + } + case 'stats': { + const stats = cacheStats(args[1]); + if (stats.suites.length === 0) { console.log('Cache is empty'); return; } + for (const s of 
stats.suites) { + const size = s.size_bytes > 1024 ? `${(s.size_bytes / 1024).toFixed(1)}KB` : `${s.size_bytes}B`; + console.log(` ${s.name.padEnd(20)} ${s.entries} entries ${size}`); + } + break; + } + case 'clear': { + const result = cacheClear(args[1]); + console.log(`Cleared ${result.deleted} cache entries`); + break; + } + case 'verify': { + const result = cacheVerify(args[1]); + console.log(`Valid: ${result.valid} Invalid: ${result.invalid}`); + for (const err of result.errors) console.log(` ERROR: ${err}`); + if (result.invalid > 0) process.exit(1); + break; + } + default: + console.error('Usage: gstack eval cache <read|write|stats|clear|verify> [args...]'); + process.exit(1); + } +} + +async function cmdWatch(): Promise<void> { + // Delegate to existing watch script + const watchScript = path.resolve(__dirname, '..', 'scripts', 'eval-watch.ts'); + const proc = Bun.spawn(['bun', 'run', watchScript, ...process.argv.slice(3)], { + stdin: 'inherit', + stdout: 'inherit', + stderr: 'inherit', + }); + const exitCode = await proc.exited; + process.exit(exitCode); +} + +function printUsage(): void { + console.log(` +gstack eval — eval management CLI + +Usage: gstack eval <command> [args] + +Commands: + list [--branch X] [--tier X] [--limit N] List eval runs (default limit: 50) + compare [file-a] [file-b] Compare two eval runs + summary [--limit N] Aggregate stats across all runs + push <file> Validate + save + sync an eval result + cost <file> Show per-model cost breakdown + cache read|write|stats|clear|verify Manage eval cache + watch Live E2E test dashboard +`); +} + +// --- Main --- + +const command = process.argv[2]; +const cmdArgs = process.argv.slice(3); +const fatal = (err: unknown): never => { console.error(err instanceof Error ? err.message : String(err)); process.exit(1); }; // report a rejected subcommand as a one-line error instead of leaving its promise floating +switch (command) { + case 'list': cmdList(cmdArgs).catch(fatal); break; + case 'compare': cmdCompare(cmdArgs).catch(fatal); break; + case 'summary': cmdSummary(cmdArgs).catch(fatal); break; + case 'push': cmdPush(cmdArgs).catch(fatal); break; + case 'cost': cmdCost(cmdArgs).catch(fatal); break; + case 'cache': cmdCache(cmdArgs).catch(fatal); break; + case 'watch': cmdWatch().catch(fatal); break; + case '--help': case '-h': case 'help': case undefined: + 
printUsage(); + break; + default: + console.error(`Unknown command: ${command}`); + printUsage(); + process.exit(1); +} diff --git a/package.json b/package.json index a5044b7d..18090e7d 100644 --- a/package.json +++ b/package.json @@ -18,10 +18,10 @@ "skill:check": "bun run scripts/skill-check.ts", "dev:skill": "bun run scripts/dev-skill.ts", "start": "bun run browse/src/server.ts", - "eval:list": "bun run scripts/eval-list.ts", - "eval:compare": "bun run scripts/eval-compare.ts", - "eval:summary": "bun run scripts/eval-summary.ts", - "eval:watch": "bun run scripts/eval-watch.ts" + "eval:list": "bun run lib/cli-eval.ts list", + "eval:compare": "bun run lib/cli-eval.ts compare", + "eval:summary": "bun run lib/cli-eval.ts summary", + "eval:watch": "bun run lib/cli-eval.ts watch" }, "dependencies": { "playwright": "^1.58.2", diff --git a/supabase/migrations/004_eval_costs.sql b/supabase/migrations/004_eval_costs.sql new file mode 100644 index 00000000..614d2013 --- /dev/null +++ b/supabase/migrations/004_eval_costs.sql @@ -0,0 +1,39 @@ +-- Per-model cost tracking for eval runs. +-- Stores cost breakdown by model so teams can analyze spend patterns. 
+ +create table eval_costs ( + id uuid primary key default gen_random_uuid(), + team_id uuid references teams(id) not null, + eval_run_id uuid references eval_runs(id) on delete cascade, + model text not null, + calls int not null, + input_tokens int not null, + output_tokens int not null, + estimated_cost_usd numeric(10,6) not null, + created_at timestamptz default now() +); + +-- Index for querying costs by team and eval run +create index idx_eval_costs_team_run on eval_costs(team_id, eval_run_id); + +-- RLS: team members can read/insert their team's costs +alter table eval_costs enable row level security; + +create policy "Team members can read costs" + on eval_costs for select + using (team_id in ( + select team_id from team_members where user_id = auth.uid() + )); + +create policy "Team members can insert costs" + on eval_costs for insert + with check (team_id in ( + select team_id from team_members where user_id = auth.uid() + )); + +create policy "Admins can delete costs" + on eval_costs for delete + using (team_id in ( + select team_id from team_members + where user_id = auth.uid() and role = 'admin' + )); diff --git a/test/lib-eval-cli.test.ts b/test/lib-eval-cli.test.ts new file mode 100644 index 00000000..38814f76 --- /dev/null +++ b/test/lib-eval-cli.test.ts @@ -0,0 +1,178 @@ +/** + * Tests for lib/cli-eval.ts — eval CLI integration tests. + * + * Spawns the CLI as a subprocess and verifies exit codes + output. 
+ */ + +import { describe, test, expect, beforeAll, afterAll } from 'bun:test'; +import * as fs from 'fs'; +import * as path from 'path'; +import * as os from 'os'; + +const CLI_PATH = path.resolve(__dirname, '..', 'lib', 'cli-eval.ts'); +const TEST_DIR = path.join(os.tmpdir(), `gstack-cli-eval-test-${Date.now()}`); +const EVAL_DIR = path.join(TEST_DIR, 'evals'); + +// Run the CLI in a subprocess with HOME and GSTACK_STATE_DIR redirected into TEST_DIR; `env` entries override those defaults. +function runCli(args: string[], env?: Record<string, string>): { stdout: string; stderr: string; exitCode: number } { + const proc = Bun.spawnSync(['bun', 'run', CLI_PATH, ...args], { + env: { + ...process.env, + HOME: TEST_DIR, + GSTACK_STATE_DIR: path.join(TEST_DIR, '.gstack'), + ...env, + }, + cwd: TEST_DIR, + }); + return { + stdout: proc.stdout?.toString() || '', + stderr: proc.stderr?.toString() || '', + exitCode: proc.exitCode, + }; +} + +// Write a minimal valid eval result file +function writeEvalFile(name: string, overrides?: Partial<Record<string, unknown>>): string { + const filePath = path.join(EVAL_DIR, name); + const data = { + schema_version: 1, + version: '0.3.3', + branch: 'main', + git_sha: 'abc1234', + timestamp: '2025-05-01T12:00:00Z', + hostname: 'test', + tier: 'e2e', + total_tests: 1, + passed: 1, + failed: 0, + total_cost_usd: 0.50, + total_duration_ms: 30000, + tests: [{ name: 'test-a', suite: 'core', tier: 'e2e', passed: true, duration_ms: 30000, cost_usd: 0.50 }], + ...overrides, + }; + fs.writeFileSync(filePath, JSON.stringify(data, null, 2)); + return filePath; +} + +describe('lib/cli-eval', () => { + beforeAll(() => { + fs.mkdirSync(EVAL_DIR, { recursive: true }); + fs.mkdirSync(path.join(TEST_DIR, '.gstack-dev', 'evals'), { recursive: true }); + }); + + afterAll(() => { + fs.rmSync(TEST_DIR, { recursive: true, force: true }); + }); + + describe('help', () => { + test('shows usage with --help', () => { + const { stdout, exitCode } = runCli(['--help']); + expect(exitCode).toBe(0); + expect(stdout).toContain('gstack eval'); + expect(stdout).toContain('list'); + expect(stdout).toContain('compare'); + 
expect(stdout).toContain('push'); + }); + + test('shows usage with no args', () => { + const { stdout, exitCode } = runCli([]); + expect(exitCode).toBe(0); + expect(stdout).toContain('gstack eval'); + }); + + test('unknown command shows error and usage', () => { + const { stderr, exitCode } = runCli(['nonsense']); + expect(exitCode).toBe(1); + expect(stderr).toContain('Unknown command'); + }); + }); + + describe('list', () => { + test('shows "no eval runs" when empty', () => { + const { stdout } = runCli(['list']); + expect(stdout).toContain('No eval runs'); + }); + }); + + describe('push', () => { + test('push: missing file argument shows usage', () => { + const { stderr, exitCode } = runCli(['push']); + expect(exitCode).toBe(1); + expect(stderr).toContain('Usage'); + }); + + test('push: file not found exits with error', () => { + const { stderr, exitCode } = runCli(['push', '/nonexistent/eval.json']); + expect(exitCode).toBe(1); + expect(stderr).toContain('File not found'); + }); + + test('push: invalid JSON exits with error', () => { + const badFile = path.join(TEST_DIR, 'bad.json'); + fs.writeFileSync(badFile, 'not json at all'); + const { stderr, exitCode } = runCli(['push', badFile]); + expect(exitCode).toBe(1); + expect(stderr).toContain('Invalid JSON'); + }); + + test('push: invalid schema exits with validation errors', () => { + const invalidFile = path.join(TEST_DIR, 'invalid-schema.json'); + fs.writeFileSync(invalidFile, JSON.stringify({ not: 'a valid eval' })); + const { stderr, exitCode } = runCli(['push', invalidFile]); + expect(exitCode).toBe(1); + expect(stderr).toContain('Validation errors'); + }); + + test('push: valid file succeeds with local-only message', () => { + // Write a valid standard format eval + const validFile = path.join(TEST_DIR, 'valid-eval.json'); + fs.writeFileSync(validFile, JSON.stringify({ + schema_version: 1, + version: '0.3.3', + git_branch: 'main', + git_sha: 'abc1234', + timestamp: '2025-05-01T12:00:00Z', + hostname: 
'test', + tier: 'e2e', + total: 1, + passed: 1, + failed: 0, + total_cost_usd: 0.50, + duration_seconds: 30, + all_results: [{ name: 'test-a', passed: true }], + })); + const { stdout, exitCode } = runCli(['push', validFile]); + expect(exitCode).toBe(0); + expect(stdout).toContain('Saved to'); + // sync not configured, so we get local-only or "not configured" + expect(stdout).toMatch(/local|not configured|Synced|queued/i); + }); + }); + + describe('cost', () => { + test('cost: missing file shows usage', () => { + const { stderr, exitCode } = runCli(['cost']); + expect(exitCode).toBe(1); + expect(stderr).toContain('Usage'); + }); + + test('cost: file without costs shows message', () => { + const file = path.join(TEST_DIR, 'no-costs.json'); + fs.writeFileSync(file, JSON.stringify({ version: '1.0' })); + const { stdout } = runCli(['cost', file]); + expect(stdout).toContain('No cost data'); + }); + }); + + describe('cache', () => { + test('cache: no subcommand shows usage', () => { + const { stderr, exitCode } = runCli(['cache']); + expect(exitCode).toBe(1); + expect(stderr).toContain('Usage'); + }); + + test('cache stats: empty cache', () => { + const { stdout } = runCli(['cache', 'stats']); + expect(stdout).toContain('empty'); + }); + }); +});