#!/usr/bin/env bun
/**
 * Unified eval CLI: gstack eval <subcommand>
 *
 * Subcommands:
 *   list    [--branch <name>] [--tier <tier>] [--limit N]
 *   compare [file-a] [file-b]
 *   summary [--limit N]
 *   push    <file>
 *   cost    <file>
 *   trend   [--limit N] [--tier <tier>] [--test <name>] [--team]
 *   cache   read|write|stats|clear|verify [args...]
 *   watch
 */

import * as fs from 'fs';
import * as path from 'path';
import {
  EVAL_DIR,
  GSTACK_DEV_DIR,
  readJSON,
  listEvalFiles,
  loadEvalResults,
  formatTimestamp,
} from './util';
import {
  findPreviousRun,
  compareEvalResults,
  formatComparison,
} from '../test/helpers/eval-store';
import type { EvalResult } from '../test/helpers/eval-store';
import type { ComparisonResult } from '../test/helpers/eval-store';

// --- ANSI color helpers ---

/** True when stdout is a terminal and the NO_COLOR env var is not set. */
const isTTY = Boolean(process.stdout.isTTY) && !process.env.NO_COLOR;

/** Wrap `s` in green ANSI codes (no-op when color is disabled). */
function green(s: string): string {
  return isTTY ? `\x1b[32m${s}\x1b[0m` : s;
}

/** Wrap `s` in red ANSI codes (no-op when color is disabled). */
function red(s: string): string {
  return isTTY ? `\x1b[31m${s}\x1b[0m` : s;
}

/** Wrap `s` in dim ANSI codes (no-op when color is disabled). */
function dim(s: string): string {
  return isTTY ? `\x1b[2m${s}\x1b[0m` : s;
}

/**
 * Wrap ANSI colors around comparison arrows: ↑ green, ↓ red, = dim.
 *
 * @param c comparison produced by `compareEvalResults`
 * @returns the text from `formatComparison`, colorized when stdout is a TTY
 */
export function formatComparisonColor(c: ComparisonResult): string {
  const plain = formatComparison(c);
  if (!isTTY) return plain;
  return plain
    .replace(/↑/g, green('↑'))
    .replace(/↓/g, red('↓'))
    .replace(/ = /g, dim(' = '));
}

// --- Shared helpers ---

/** Best-effort human-readable message for a caught value of unknown type. */
function errorMessage(err: unknown): string {
  return err instanceof Error ? err.message : String(err);
}

interface ParsedOptions {
  /** Flags that take a value, e.g. `--tier e2e`. The last occurrence wins. */
  values: Map<string, string>;
  /** Boolean flags that were present, e.g. `--team`. */
  switches: Set<string>;
}

/**
 * Minimal flag parser shared by the subcommands.
 *
 * A value flag is only recognized when a non-empty argument follows it, and
 * that argument is consumed. Unrecognized arguments are ignored.
 */
function parseOptions(
  args: readonly string[],
  valueFlags: readonly string[],
  switchFlags: readonly string[] = [],
): ParsedOptions {
  const values = new Map<string, string>();
  const switches = new Set<string>();
  for (let i = 0; i < args.length; i++) {
    const arg = args[i];
    if (arg === undefined) continue;
    const next = args[i + 1];
    if (valueFlags.includes(arg) && next) {
      values.set(arg, next);
      i++; // skip the consumed value
    } else if (switchFlags.includes(arg)) {
      switches.add(arg);
    }
  }
  return { values, switches };
}

/**
 * Parse a `--limit` value.
 *
 * @returns the parsed non-negative integer, or `undefined` when the flag was
 *   absent or invalid (NaN / negative), so callers can apply their default.
 *   An invalid value is reported on stderr instead of silently producing an
 *   empty or oddly sliced listing.
 */
function parseLimit(raw: string | undefined): number | undefined {
  if (raw === undefined) return undefined;
  const parsed = Number.parseInt(raw, 10);
  if (Number.isNaN(parsed) || parsed < 0) {
    console.error(`Ignoring invalid --limit value: ${raw}`);
    return undefined;
  }
  return parsed;
}

/**
 * Fit `text` into a fixed-width table column: text longer than `width - 2`
 * is truncated with a trailing '...', and the result is always padded to
 * `width` so the following columns stay aligned.
 */
function fitColumn(text: string, width: number): string {
  const maxText = width - 2;
  const shown = text.length > maxText ? text.slice(0, maxText - 3) + '...' : text;
  return shown.padEnd(width);
}

/** Arithmetic mean of `values`, or `null` for an empty list. */
function mean(values: readonly number[]): number | null {
  if (values.length === 0) return null;
  return values.reduce((sum, v) => sum + v, 0) / values.length;
}

// --- Subcommands ---

/** Top-level fields of a stored eval file that `list` reads; all optional. */
interface StoredRunHeader {
  timestamp?: string;
  branch?: string;
  tier?: string;
  version?: string;
  passed?: number;
  total_tests?: number;
  total_cost_usd?: number;
}

/**
 * `list`: print a table of stored eval runs, newest timestamp first.
 *
 * Options: `--branch X` and `--tier X` filter runs, `--limit N` caps the
 * number of rows shown (default 50). The footer totals cover every run that
 * matched the filters, not just the rows displayed.
 */
async function cmdList(args: string[]): Promise<void> {
  const { values } = parseOptions(args, ['--branch', '--tier', '--limit']);
  const filterBranch = values.get('--branch') ?? null;
  const filterTier = values.get('--tier') ?? null;
  const limit = parseLimit(values.get('--limit')) ?? 50;

  const files = listEvalFiles();
  if (files.length === 0) {
    console.log('No eval runs yet. Run: EVALS=1 bun run test:evals');
    return;
  }

  interface RunSummary {
    file: string;
    timestamp: string;
    branch: string;
    tier: string;
    version: string;
    passed: number;
    total: number;
    cost: number;
  }

  const runs: RunSummary[] = [];
  for (const file of files) {
    const data = readJSON<StoredRunHeader>(file);
    if (!data) continue; // unreadable file: skip
    if (filterBranch && data.branch !== filterBranch) continue;
    if (filterTier && data.tier !== filterTier) continue;
    runs.push({
      file: path.basename(file),
      timestamp: data.timestamp || '',
      branch: data.branch || 'unknown',
      tier: data.tier || 'unknown',
      version: data.version || '?',
      passed: data.passed || 0,
      total: data.total_tests || 0,
      cost: data.total_cost_usd || 0,
    });
  }

  runs.sort((a, b) => b.timestamp.localeCompare(a.timestamp));
  const displayed = runs.slice(0, limit);

  console.log('');
  console.log(`Eval History (${runs.length} total runs)`);
  console.log('═'.repeat(90));
  console.log(
    ' ' +
      'Date'.padEnd(17) +
      'Branch'.padEnd(28) +
      'Tier'.padEnd(12) +
      'Pass'.padEnd(8) +
      'Cost'.padEnd(8) +
      'Version'
  );
  console.log('─'.repeat(90));

  for (const run of displayed) {
    const date = formatTimestamp(run.timestamp);
    // fitColumn pads truncated names too, so later columns stay aligned.
    const branch = fitColumn(run.branch, 28);
    const pass = `${run.passed}/${run.total}`.padEnd(8);
    const cost = `$${run.cost.toFixed(2)}`.padEnd(8);
    console.log(` ${date.padEnd(17)}${branch}${run.tier.padEnd(12)}${pass}${cost}v${run.version}`);
  }

  console.log('─'.repeat(90));
  const totalCost = runs.reduce((sum, r) => sum + r.cost, 0);
  console.log(` ${runs.length} runs | Total spend: $${totalCost.toFixed(2)} | Showing: ${displayed.length}`);
  console.log(` Dir: ${EVAL_DIR}`);
  console.log('');
}

/**
 * `compare`: print a comparison between two eval runs.
 *
 * - two args: compare `file-a` (before) against `file-b` (after)
 * - one arg: compare the given run against the run `findPreviousRun` returns
 *   for the same tier and branch
 * - otherwise: use the first entry of `listEvalFiles()` as the "after" run
 *   (assumes that list is newest-first — TODO confirm in ./util)
 *
 * Relative file names are resolved against EVAL_DIR. Exits with status 1 if
 * a file is missing or is not valid JSON.
 */
async function cmdCompare(args: string[]): Promise<void> {
  function loadResult(filepath: string): EvalResult {
    const resolved = path.isAbsolute(filepath) ? filepath : path.join(EVAL_DIR, filepath);
    if (!fs.existsSync(resolved)) {
      console.error(`File not found: ${resolved}`);
      process.exit(1);
    }
    try {
      return JSON.parse(fs.readFileSync(resolved, 'utf-8'));
    } catch (err: unknown) {
      // Report a malformed file instead of failing with an unhandled rejection.
      console.error(`Invalid JSON in ${resolved}: ${errorMessage(err)}`);
      process.exit(1);
    }
  }

  let beforeFile: string;
  let afterFile: string;
  const [first, second] = args;

  if (args.length === 2 && first !== undefined && second !== undefined) {
    beforeFile = first;
    afterFile = second;
  } else if (args.length === 1 && first !== undefined) {
    afterFile = first;
    const resolved = path.isAbsolute(afterFile) ? afterFile : path.join(EVAL_DIR, afterFile);
    const afterResult = loadResult(resolved);
    const prev = findPreviousRun(EVAL_DIR, afterResult.tier, afterResult.branch, resolved);
    if (!prev) {
      console.log('No previous run found to compare against.');
      return;
    }
    beforeFile = prev;
  } else {
    const files = listEvalFiles();
    const [newest] = files;
    if (files.length < 2 || newest === undefined) {
      console.log('Need at least 2 eval runs to compare. Run evals again.');
      return;
    }
    afterFile = newest;
    const afterResult = loadResult(afterFile);
    const prev = findPreviousRun(EVAL_DIR, afterResult.tier, afterResult.branch, afterFile);
    if (!prev) {
      console.log('No previous run of the same tier found to compare against.');
      return;
    }
    beforeFile = prev;
  }

  const beforeResult = loadResult(beforeFile);
  const afterResult = loadResult(afterFile);

  // Mismatches are reported but do not abort the comparison.
  if (beforeResult.tier !== afterResult.tier) {
    console.warn(`Warning: comparing different tiers (${beforeResult.tier} vs ${afterResult.tier})`);
  }
  if (beforeResult.schema_version !== afterResult.schema_version) {
    console.warn(`Warning: schema version mismatch (${beforeResult.schema_version} vs ${afterResult.schema_version})`);
  }

  const comparison = compareEvalResults(beforeResult, afterResult, beforeFile, afterFile);
  console.log(formatComparisonColor(comparison));
}

/**
 * `summary`: print aggregate statistics across stored runs — run counts and
 * spend per tier, average detection rate for e2e runs, tests that both
 * passed and failed across the loaded runs, and per-branch e2e stats.
 *
 * Option: `--limit N` is passed through to `loadEvalResults`.
 */
async function cmdSummary(args: string[]): Promise<void> {
  const { values } = parseOptions(args, ['--limit']);
  const limit = parseLimit(values.get('--limit'));

  const results = loadEvalResults(undefined, limit);
  if (results.length === 0) {
    console.log('No eval runs yet. Run: EVALS=1 bun run test:evals');
    return;
  }

  const e2eRuns = results.filter(r => r.tier === 'e2e');
  const judgeRuns = results.filter(r => r.tier === 'llm-judge');

  // A missing cost counts as 0 everywhere, so the averages cannot become NaN.
  const costOf = (r: EvalResult): number => r.total_cost_usd || 0;
  const totalCost = results.reduce((sum, r) => sum + costOf(r), 0);
  const avgE2ECost = mean(e2eRuns.map(costOf)) ?? 0;
  const avgJudgeCost = mean(judgeRuns.map(costOf)) ?? 0;

  // Detection rates (e2e only) and per-branch stats, gathered in one pass
  const detectionRates: number[] = [];
  const branchStats = new Map<string, { runs: number; detections: number[] }>();
  for (const r of e2eRuns) {
    let stats = branchStats.get(r.branch);
    if (!stats) {
      stats = { runs: 0, detections: [] };
      branchStats.set(r.branch, stats);
    }
    stats.runs++;
    for (const t of r.tests) {
      if (t.detection_rate !== undefined) {
        detectionRates.push(t.detection_rate);
        stats.detections.push(t.detection_rate);
      }
    }
  }
  const avgDetection = mean(detectionRates);

  // Flaky tests: at least one pass AND one fail across the loaded runs
  const outcomesByTest = new Map<string, boolean[]>();
  for (const r of results) {
    for (const t of r.tests) {
      const key = `${r.tier}:${t.name}`;
      const outcomes = outcomesByTest.get(key);
      if (outcomes) outcomes.push(t.passed);
      else outcomesByTest.set(key, [t.passed]);
    }
  }
  const flakyTests: string[] = [];
  for (const [name, outcomes] of outcomesByTest) {
    if (outcomes.length >= 2 && outcomes.some(o => o) && outcomes.some(o => !o)) {
      flakyTests.push(name);
    }
  }

  // Print
  console.log('');
  console.log('Eval Summary');
  console.log('═'.repeat(60));
  console.log(` Total runs: ${results.length} (${e2eRuns.length} e2e, ${judgeRuns.length} llm-judge)`);
  console.log(` Total spend: $${totalCost.toFixed(2)}`);
  console.log(` Avg cost/e2e: $${avgE2ECost.toFixed(2)}`);
  console.log(` Avg cost/judge: $${avgJudgeCost.toFixed(2)}`);
  if (avgDetection !== null) {
    console.log(` Avg detection: ${avgDetection.toFixed(1)} bugs`);
  }
  console.log('─'.repeat(60));

  if (flakyTests.length > 0) {
    console.log(` Flaky tests (${flakyTests.length}):`);
    for (const name of flakyTests) console.log(` - ${name}`);
    console.log(` Run 'bun run eval:trend' for detailed time series.`);
    console.log('─'.repeat(60));
  }

  if (branchStats.size > 0) {
    console.log(' Branches:');
    // Highest average detection first; branches without data sort as 0.
    const branchRows = [...branchStats.entries()]
      .map(([branch, stats]) => ({ branch, runs: stats.runs, avgDet: mean(stats.detections) }))
      .sort((a, b) => (b.avgDet ?? 0) - (a.avgDet ?? 0));
    for (const row of branchRows) {
      const det = row.avgDet !== null ? ` avg det: ${row.avgDet.toFixed(1)}` : '';
      console.log(` ${row.branch.padEnd(30)} ${row.runs} runs${det}`);
    }
    console.log('─'.repeat(60));
  }

  const timestamps = results.map(r => r.timestamp).filter(Boolean).sort();
  const oldest = timestamps[0];
  const newest = timestamps[timestamps.length - 1];
  if (oldest !== undefined && newest !== undefined) {
    console.log(` Date range: ${formatTimestamp(oldest)} → ${formatTimestamp(newest)}`);
  }
  console.log(` Dir: ${EVAL_DIR}`);
  console.log('');
}

/**
 * `push <file>`: validate an eval result file, copy it into EVAL_DIR, then
 * try to sync it to the team store.
 *
 * Exits with status 1 when the file is missing, is not valid JSON, or fails
 * `validateEvalResult`. A sync failure is non-fatal.
 */
async function cmdPush(args: string[]): Promise<void> {
  const filePath = args[0];
  if (!filePath) {
    console.error('Usage: gstack eval push <file>');
    process.exit(1);
  }

  const resolved = path.isAbsolute(filePath) ? filePath : path.resolve(filePath);
  if (!fs.existsSync(resolved)) {
    console.error(`File not found: ${resolved}`);
    process.exit(1);
  }

  // Load and validate
  let data: unknown;
  try {
    data = JSON.parse(fs.readFileSync(resolved, 'utf-8'));
  } catch (err: unknown) {
    console.error(`Invalid JSON: ${errorMessage(err)}`);
    process.exit(1);
  }

  const { validateEvalResult } = await import('./eval-format');
  const validation = validateEvalResult(data);
  if (!validation.valid) {
    console.error('Validation errors:');
    for (const err of validation.errors) console.error(` - ${err}`);
    process.exit(1);
  }

  // Copy to local eval dir
  const basename = path.basename(resolved);
  const localPath = path.join(EVAL_DIR, basename);
  fs.mkdirSync(EVAL_DIR, { recursive: true });
  fs.copyFileSync(resolved, localPath);
  console.log(`Saved to ${localPath}`);

  // Push to team store (non-fatal): any failure here, including the sync
  // module failing to load, leaves the local copy as the only result.
  try {
    const { pushEvalRun } = await import('./sync');
    const ok = await pushEvalRun(data as Record<string, unknown>);
    if (ok) console.log('Synced to team store ✓');
    else console.log('Sync queued (will retry later)');
  } catch {
    console.log('Team sync not configured — local only');
  }
}

/**
 * `cost <file>`: print the cost dashboard for one eval file.
 *
 * Reads the file's `costs` array and renders it via `./eval-cost`. Exits
 * with status 1 when the file cannot be read.
 */
async function cmdCost(args: string[]): Promise<void> {
  const filePath = args[0];
  if (!filePath) {
    console.error('Usage: gstack eval cost <file>');
    process.exit(1);
  }

  const resolved = path.isAbsolute(filePath) ? filePath : path.resolve(filePath);
  // eslint-disable-next-line @typescript-eslint/no-explicit-any -- entry shape is owned by ./eval-cost
  const data = readJSON<{ costs?: any[] }>(resolved);
  if (!data) {
    console.error(`Cannot read file: ${resolved}`);
    process.exit(1);
  }

  if (!data.costs || data.costs.length === 0) {
    console.log('No cost data in this eval file.');
    return;
  }

  const { computeCosts, formatCostDashboard } = await import('./eval-cost');
  const dashboard = computeCosts(data.costs);
  console.log(formatCostDashboard(dashboard));
}

/**
 * `cache <read|write|stats|clear|verify>`: front-end for `./eval-cache`.
 *
 * - `read <suite> <key>`: print the cached JSON, or `MISS` and exit 1
 * - `write <suite> <key> [json]`: store JSON from the argument or from stdin
 * - `stats [suite]`: entry count and size per suite
 * - `clear [suite]`: delete cache entries
 * - `verify [suite]`: report valid/invalid entries; exit 1 if any are invalid
 */
async function cmdCache(args: string[]): Promise<void> {
  const sub = args[0];
  const {
    cacheRead,
    cacheWrite,
    cacheStats,
    cacheClear,
    cacheVerify,
  } = await import('./eval-cache');

  switch (sub) {
    case 'read': {
      const [suite, key] = [args[1], args[2]];
      if (!suite || !key) {
        console.error('Usage: gstack eval cache read <suite> <key>');
        process.exit(1);
      }
      const data = cacheRead(suite, key);
      if (data === null) {
        console.log('MISS');
        process.exit(1);
      }
      console.log(JSON.stringify(data, null, 2));
      break;
    }
    case 'write': {
      const [suite, key] = [args[1], args[2]];
      if (!suite || !key) {
        console.error('Usage: gstack eval cache write <suite> <key> [json]');
        process.exit(1);
      }
      let jsonData: string;
      if (args[3]) {
        jsonData = args[3];
      } else if (!process.stdin.isTTY) {
        jsonData = await Bun.stdin.text();
      } else {
        console.error('Provide JSON as argument or pipe to stdin');
        process.exit(1);
      }
      let parsed: unknown;
      try {
        parsed = JSON.parse(jsonData);
      } catch (err: unknown) {
        // Report malformed input instead of failing with an unhandled rejection.
        console.error(`Invalid JSON: ${errorMessage(err)}`);
        process.exit(1);
      }
      cacheWrite(suite, key, parsed);
      console.log('OK');
      break;
    }
    case 'stats': {
      const stats = cacheStats(args[1]);
      if (stats.suites.length === 0) {
        console.log('Cache is empty');
        return;
      }
      for (const s of stats.suites) {
        const size = s.size_bytes > 1024
          ? `${(s.size_bytes / 1024).toFixed(1)}KB`
          : `${s.size_bytes}B`;
        console.log(` ${s.name.padEnd(20)} ${s.entries} entries ${size}`);
      }
      break;
    }
    case 'clear': {
      const result = cacheClear(args[1]);
      console.log(`Cleared ${result.deleted} cache entries`);
      break;
    }
    case 'verify': {
      const result = cacheVerify(args[1]);
      console.log(`Valid: ${result.valid} Invalid: ${result.invalid}`);
      for (const err of result.errors) console.log(` ERROR: ${err}`);
      if (result.invalid > 0) process.exit(1);
      break;
    }
    default:
      console.error('Usage: gstack eval cache <read|write|stats|clear|verify> [args...]');
      process.exit(1);
  }
}

/**
 * `watch`: run `scripts/eval-watch.ts` as a child `bun` process, forwarding
 * the remaining CLI arguments and stdio, then exit with the child's code.
 */
async function cmdWatch(): Promise<void> {
  // Delegate to existing watch script
  const watchScript = path.resolve(__dirname, '..', 'scripts', 'eval-watch.ts');
  const proc = Bun.spawn(['bun', 'run', watchScript, ...process.argv.slice(3)], {
    stdin: 'inherit',
    stdout: 'inherit',
    stderr: 'inherit',
  });
  const exitCode = await proc.exited;
  process.exit(exitCode);
}

// --- Trend tracking ---

export interface TestTrend {
  name: string;
  tier: string;
  /** Outcomes in chronological order (oldest first). */
  results: Array<{ timestamp: string; passed: boolean }>;
  /** Fraction of runs that passed, 0..1. */
  passRate: number;
  /** Length and kind of the run of identical outcomes ending at the newest result. */
  streak: { type: 'pass' | 'fail'; count: number };
  /** Number of pass<->fail transitions between consecutive results. */
  flipCount: number;
  status: 'stable-pass' | 'stable-fail' | 'flaky' | 'improving' | 'degrading';
}

/** Display priority of each status: lower sorts first. */
const STATUS_ORDER: Record<TestTrend['status'], number> = {
  flaky: 0,
  degrading: 1,
  improving: 2,
  'stable-fail': 3,
  'stable-pass': 4,
};

/**
 * Compute per-test pass rate trends from eval results.
 * Pure function — no I/O.
 *
 * @param results eval runs, newest first (the order this file's callers get
 *   from `loadEvalResults`); the list is reversed internally, not mutated
 * @param filterTier when set, only runs of this tier are considered
 * @param filterTest when set, only the test with this exact name is considered
 * @returns one trend per `tier:name` pair, each with its results oldest
 *   first, sorted by status priority, then flip count (desc), then name
 */
export function computeTrends(
  results: EvalResult[],
  filterTier?: string,
  filterTest?: string,
): TestTrend[] {
  // Build time series per test (chronological — oldest first). Tier and name
  // are stored with the series rather than parsed back out of the map key.
  const byTest = new Map<string, { tier: string; name: string; series: TestTrend['results'] }>();

  // Results from loadEvalResults are newest-first, so reverse for chronological
  const chronological = [...results].reverse();

  for (const run of chronological) {
    if (filterTier && run.tier !== filterTier) continue;
    // `tests` may be absent on rows that do not come from local eval files.
    for (const t of run.tests ?? []) {
      if (filterTest && t.name !== filterTest) continue;
      const key = `${run.tier}:${t.name}`;
      let entry = byTest.get(key);
      if (!entry) {
        entry = { tier: run.tier, name: t.name, series: [] };
        byTest.set(key, entry);
      }
      entry.series.push({ timestamp: run.timestamp, passed: t.passed });
    }
  }

  const trends: TestTrend[] = [];

  for (const { tier, name, series } of byTest.values()) {
    const latest = series[series.length - 1];
    if (latest === undefined) continue; // entries are only created on push

    const total = series.length;
    const passCount = series.filter(r => r.passed).length;
    const passRate = total > 0 ? passCount / total : 0;

    // Streak: walk from newest (end of array) backward
    const streakType: 'pass' | 'fail' = latest.passed ? 'pass' : 'fail';
    let streakCount = 0;
    for (let i = series.length - 1; i >= 0 && series[i]?.passed === latest.passed; i--) {
      streakCount++;
    }

    // Flip count: transitions between pass and fail
    let flipCount = 0;
    for (let i = 1; i < series.length; i++) {
      if (series[i]?.passed !== series[i - 1]?.passed) flipCount++;
    }

    // Classify status
    let status: TestTrend['status'];
    const last3 = series.slice(-3);
    const earlier = series.slice(0, -3);
    const last3AllPass = last3.length >= 3 && last3.every(r => r.passed);
    const last3HasFail = last3.some(r => !r.passed);
    const earlierHadFailures = earlier.some(r => !r.passed);
    const earlierWasPassing = earlier.length > 0 && earlier.every(r => r.passed);

    // Check improving/degrading first — a clear recent trend outranks raw pass rate
    if (last3AllPass && earlierHadFailures) {
      status = 'improving';
    } else if (last3HasFail && earlierWasPassing) {
      status = 'degrading';
    } else if (flipCount >= 3 || (passRate > 0.3 && passRate < 0.7)) {
      status = 'flaky';
    } else if (passRate >= 0.9 && flipCount <= 1) {
      status = 'stable-pass';
    } else if (passRate <= 0.1 && flipCount <= 1) {
      status = 'stable-fail';
    } else if (passRate >= 0.5) {
      status = 'stable-pass';
    } else {
      status = 'stable-fail';
    }

    trends.push({
      name,
      tier,
      results: series,
      passRate,
      streak: { type: streakType, count: streakCount },
      flipCount,
      status,
    });
  }

  // Sort: flaky first, then flipCount desc, then name
  trends.sort((a, b) => {
    const byStatus = STATUS_ORDER[a.status] - STATUS_ORDER[b.status];
    if (byStatus !== 0) return byStatus;
    if (a.flipCount !== b.flipCount) return b.flipCount - a.flipCount;
    return a.name.localeCompare(b.name);
  });

  return trends;
}

/** Colorize a trend status: problems red, healthy green, everything else dim. */
function colorStatus(status: TestTrend['status']): string {
  if (status === 'flaky' || status === 'degrading') return red(status);
  if (status === 'stable-pass' || status === 'improving') return green(status);
  return dim(status);
}

/**
 * `trend`: print a per-test pass/fail table with pass rate, a sparkline of
 * recent outcomes, the current streak and a status label.
 *
 * Options: `--limit N` (number of runs to load, default 10), `--tier X`,
 * `--test X`, and `--team` (pull runs from the team store, falling back to
 * local data when sync is not configured or not available).
 */
async function cmdTrend(args: string[]): Promise<void> {
  const { values, switches } = parseOptions(args, ['--limit', '--tier', '--test'], ['--team']);
  const limit = parseLimit(values.get('--limit')) ?? 10;
  const filterTier = values.get('--tier');
  const filterTest = values.get('--test');
  const useTeam = switches.has('--team');

  const loadLocal = (): EvalResult[] => loadEvalResults(undefined, limit);

  let results: EvalResult[];
  if (useTeam) {
    try {
      const { isSyncConfigured } = await import('./sync-config');
      const { pullEvalRuns } = await import('./sync');
      if (!isSyncConfigured()) {
        console.log('Team sync not configured — showing local data only. See docs/TEAM_SYNC_SETUP.md');
        results = loadLocal();
      } else {
        const teamRows = await pullEvalRuns({ limit });
        // NOTE(review): unchecked cast — assumes team rows have the
        // EvalResult shape; confirm against ./sync.
        results = teamRows as unknown as EvalResult[];
      }
    } catch {
      console.log('Team sync not available — showing local data only.');
      results = loadLocal();
    }
  } else {
    results = loadLocal();
  }

  if (results.length === 0) {
    console.log('No eval runs yet. Run: EVALS=1 bun run test:evals');
    return;
  }

  const trends = computeTrends(results, filterTier, filterTest);

  if (trends.length === 0) {
    console.log('No test data matching filters.');
    return;
  }

  // Determine how many result columns to show
  const maxResults = Math.min(limit, Math.max(...trends.map(t => t.results.length)));
  const sparkHeader = `Last ${maxResults}`;
  // padEnd never truncates, so the column must be at least as wide as its
  // header plus a space; otherwise "Last 3" runs straight into "Streak".
  const sparkWidth = Math.max(maxResults + 3, sparkHeader.length + 1);

  console.log('');
  console.log(`Test Trends (last ${results.length} runs)`);
  console.log('═'.repeat(80));
  console.log(
    ' ' +
      'Test Name'.padEnd(36) +
      'Rate'.padEnd(7) +
      sparkHeader.padEnd(sparkWidth) +
      'Streak'.padEnd(8) +
      'Status'
  );
  console.log('─'.repeat(80));

  let flakyCount = 0;
  let degradingCount = 0;

  for (const t of trends) {
    if (t.status === 'flaky') flakyCount++;
    if (t.status === 'degrading') degradingCount++;

    // fitColumn pads truncated names too, so later columns stay aligned.
    const displayName = fitColumn(`${t.tier}:${t.name}`, 36);
    const rate = `${Math.round(t.passRate * 100)}%`.padEnd(7);

    // Build sparkline of last N results
    const sparkline = t.results
      .slice(-maxResults)
      .map(r => (r.passed ? '\u2713' : '\u2717'))
      .join('');

    const streak = `${t.streak.count}${t.streak.type === 'pass' ? '\u2713' : '\u2717'}`.padEnd(8);

    console.log(` ${displayName}${rate}${sparkline.padEnd(sparkWidth)}${streak}${colorStatus(t.status)}`);
  }

  console.log('─'.repeat(80));

  const parts: string[] = [`${trends.length} tests tracked`];
  if (flakyCount > 0) parts.push(`${flakyCount} flaky`);
  if (degradingCount > 0) parts.push(`${degradingCount} degrading`);
  console.log(` ${parts.join(' | ')}`);
  console.log('');
}

/** Print the top-level help text to stdout. */
function printUsage(): void {
  console.log(`
gstack eval — eval management CLI

Usage: gstack eval <command> [args]

Commands:
  list [--branch X] [--tier X] [--limit N]            List eval runs (default limit: 50)
  compare [file-a] [file-b]                           Compare two eval runs
  summary [--limit N]                                 Aggregate stats across all runs
  push <file>                                         Validate + save + sync an eval result
  cost <file>                                         Show per-model cost breakdown
  trend [--limit N] [--tier X] [--test X] [--team]    Per-test pass rate trends
  cache read|write|stats|clear|verify                 Manage eval cache
  watch                                               Live E2E test dashboard
`);
}

// --- Main (only when run directly, not imported) ---

/** Dispatch `process.argv[2]` to the matching subcommand. */
async function main(): Promise<void> {
  const command = process.argv[2];
  const cmdArgs = process.argv.slice(3);

  switch (command) {
    case 'list': await cmdList(cmdArgs); break;
    case 'compare': await cmdCompare(cmdArgs); break;
    case 'summary': await cmdSummary(cmdArgs); break;
    case 'push': await cmdPush(cmdArgs); break;
    case 'cost': await cmdCost(cmdArgs); break;
    case 'trend': await cmdTrend(cmdArgs); break;
    case 'cache': await cmdCache(cmdArgs); break;
    case 'watch': await cmdWatch(); break;
    case '--help':
    case '-h':
    case 'help':
    case undefined:
      printUsage();
      break;
    default:
      console.error(`Unknown command: ${command}`);
      printUsage();
      process.exit(1);
  }
}

if (import.meta.main) {
  // Subcommands are async: await them and turn any rejection into a clean
  // error message plus a non-zero exit instead of a floating promise.
  main().catch((err: unknown) => {
    console.error(`Error: ${errorMessage(err)}`);
    process.exit(1);
  });
}