feat: eval CLI tools + docs cleanup

Add eval:list, eval:compare, eval:summary CLI scripts for exploring eval history from ~/.gstack-dev/evals/. eval:compare reuses the shared comparison functions from eval-store.ts.

- eval:list: sorted table with branch/tier/cost filters
- eval:compare: thin wrapper around compareEvalResults + formatComparison
- eval:summary: aggregate stats, flaky test detection, branch rankings
- Remove unused @anthropic-ai/claude-agent-sdk from devDependencies
- Update CLAUDE.md: streaming docs, eval CLI commands, remove Agent SDK refs
- Add GH Actions eval upload (P2) and web dashboard (P3) to TODOS.md

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-08-01 20:08:37 +02:00 · 2026-03-14 03:49:57 -05:00
parent 84f52f3bad
commit ed802d0c7f
6 changed files with 373 additions and 11 deletions
@@ -0,0 +1,96 @@
+#!/usr/bin/env bun
+/**
+ * Compare two eval runs from ~/.gstack-dev/evals/
+ *
+ * Usage:
+ *   bun run eval:compare                    # compare two most recent of same tier
+ *   bun run eval:compare <file>             # compare file against its predecessor
+ *   bun run eval:compare <file-a> <file-b>  # compare two specific files
+ */
+
+import * as fs from 'fs';
+import * as path from 'path';
+import * as os from 'os';
+import {
+  findPreviousRun,
+  compareEvalResults,
+  formatComparison,
+} from '../test/helpers/eval-store';
+import type { EvalResult } from '../test/helpers/eval-store';
+
+const EVAL_DIR = path.join(os.homedir(), '.gstack-dev', 'evals');
+
+/**
+ * Load an eval result JSON file and return its parsed contents.
+ *
+ * Relative paths are resolved against EVAL_DIR; absolute paths are used as-is.
+ * The parsed JSON is returned unchanged — it is not validated against the
+ * EvalResult shape.
+ *
+ * Exits the process with status 1 (after logging to stderr) when the file
+ * does not exist, cannot be read, or does not contain valid JSON.
+ *
+ * @param filepath - Absolute path, or a path relative to EVAL_DIR.
+ * @returns The parsed file contents.
+ */
+function loadResult(filepath: string): EvalResult {
+  const resolved = path.isAbsolute(filepath) ? filepath : path.join(EVAL_DIR, filepath);
+  if (!fs.existsSync(resolved)) {
+    console.error(`File not found: ${resolved}`);
+    process.exit(1);
+  }
+  try {
+    return JSON.parse(fs.readFileSync(resolved, 'utf-8'));
+  } catch (err: unknown) {
+    // A truncated or hand-edited file should yield a one-line error, not a stack trace.
+    const reason = err instanceof Error ? err.message : String(err);
+    console.error(`Could not load eval result ${resolved}: ${reason}`);
+    process.exit(1);
+  }
+}
+
+// Positional arguments: zero, one, or two eval result files.
+const args = process.argv.slice(2);
+
+// Older ("before") and newer ("after") run to compare. Every branch below
+// either assigns both or exits the process.
+let beforeFile: string;
+let afterFile: string;
+
+if (args.length > 2) {
+  // Extra arguments used to fall through to the no-argument branch, which
+  // ignored them and compared an unrelated pair of runs. Reject them instead.
+  console.error('Usage: bun run eval:compare [<file> | <file-a> <file-b>]');
+  process.exit(1);
+} else if (args.length === 2) {
+  // Two explicit files: the first is "before", the second is "after".
+  beforeFile = args[0];
+  afterFile = args[1];
+} else if (args.length === 1) {
+  // One file — compare it against whatever findPreviousRun returns for this
+  // run's tier and branch.
+  afterFile = args[0];
+  const resolved = path.isAbsolute(afterFile) ? afterFile : path.join(EVAL_DIR, afterFile);
+  const target = loadResult(resolved);
+  const prev = findPreviousRun(EVAL_DIR, target.tier, target.branch, resolved);
+  if (!prev) {
+    console.log('No previous run found to compare against.');
+    process.exit(0);
+  }
+  beforeFile = prev;
+} else {
+  // No args — use the last .json filename in sort order as "after".
+  // NOTE(review): that is the most recent run only if filenames sort
+  // chronologically — confirm against how eval files are named.
+  let files: string[];
+  try {
+    files = fs.readdirSync(EVAL_DIR)
+      .filter(f => f.endsWith('.json'))
+      .sort()
+      .reverse();
+  } catch {
+    // EVAL_DIR missing or unreadable is reported as "no runs yet", not as an error.
+    console.log('No eval runs yet. Run: EVALS=1 bun run test:evals');
+    process.exit(0);
+  }
+
+  if (files.length < 2) {
+    console.log('Need at least 2 eval runs to compare. Run evals again.');
+    process.exit(0);
+  }
+
+  afterFile = path.join(EVAL_DIR, files[0]);
+  const latest = loadResult(afterFile);
+  const prev = findPreviousRun(EVAL_DIR, latest.tier, latest.branch, afterFile);
+  if (!prev) {
+    console.log('No previous run of the same tier found to compare against.');
+    process.exit(0);
+  }
+  beforeFile = prev;
+}
+
+// Load both sides of the comparison.
+const beforeResult = loadResult(beforeFile);
+const afterResult = loadResult(afterFile);
+
+// Mismatched tiers or schema versions do not stop the comparison; each one
+// only produces a warning.
+const { tier: tierBefore, schema_version: schemaBefore } = beforeResult;
+const { tier: tierAfter, schema_version: schemaAfter } = afterResult;
+if (tierBefore !== tierAfter) {
+  console.warn(`Warning: comparing different tiers (${tierBefore} vs ${tierAfter})`);
+}
+if (schemaBefore !== schemaAfter) {
+  console.warn(`Warning: schema version mismatch (${schemaBefore} vs ${schemaAfter})`);
+}
+
+// Diff the two runs and print the formatted report.
+const comparison = compareEvalResults(beforeResult, afterResult, beforeFile, afterFile);
+const report = formatComparison(comparison);
+console.log(report);
@@ -0,0 +1,105 @@
+#!/usr/bin/env bun
+/**
+ * List eval runs from ~/.gstack-dev/evals/
+ *
+ * Usage: bun run eval:list [--branch <name>] [--tier e2e|llm-judge] [--limit N]
+ */
+
+import * as fs from 'fs';
+import * as path from 'path';
+import * as os from 'os';
+
+const EVAL_DIR = path.join(os.homedir(), '.gstack-dev', 'evals');
+
+// Parse args: --branch <name>, --tier <name>, --limit <N>.
+// Unrecognised arguments, and flags with no value after them, are ignored.
+const args = process.argv.slice(2);
+let filterBranch: string | null = null;
+let filterTier: string | null = null;
+let limit = 20;
+
+for (let i = 0; i < args.length; i++) {
+  const flag = args[i];
+  const value = args[i + 1];
+  if (!value) continue;
+
+  switch (flag) {
+    case '--branch':
+      filterBranch = value;
+      i++;
+      break;
+    case '--tier':
+      filterTier = value;
+      i++;
+      break;
+    case '--limit': {
+      // parseInt yields NaN for non-numeric input, and slice(0, NaN) would
+      // print an empty table; a negative limit would drop rows from the end.
+      // Keep the current limit in both cases and say so.
+      const parsed = Number.parseInt(value, 10);
+      if (Number.isNaN(parsed) || parsed < 0) {
+        console.error(`Ignoring invalid --limit "${value}" (expected a non-negative integer); using ${limit}.`);
+      } else {
+        limit = parsed;
+      }
+      i++;
+      break;
+    }
+  }
+}
+
+/** Print the "no runs" hint and exit successfully. */
+function exitNoRuns(): never {
+  console.log('No eval runs yet. Run: EVALS=1 bun run test:evals');
+  process.exit(0);
+}
+
+// Collect the names of all .json files in EVAL_DIR. A directory that cannot
+// be read is handled the same way as an empty one.
+let files: string[];
+try {
+  const entries = fs.readdirSync(EVAL_DIR);
+  files = entries.filter(name => name.endsWith('.json'));
+} catch {
+  exitNoRuns();
+}
+
+if (files.length === 0) exitNoRuns();
+
+/** Top-level fields of one eval file, as shown in one table row. */
+interface RunSummary {
+  file: string;
+  timestamp: string;
+  branch: string;
+  tier: string;
+  version: string;
+  passed: number;
+  total: number;
+  cost: number;
+}
+
+/**
+ * Read one eval file from EVAL_DIR and reduce it to a RunSummary.
+ *
+ * Missing or falsy fields fall back to placeholder values.
+ *
+ * @returns The summary, or null when the run is excluded by --branch / --tier.
+ * @throws If the file cannot be read or parsed.
+ */
+function summarizeRun(file: string): RunSummary | null {
+  const data = JSON.parse(fs.readFileSync(path.join(EVAL_DIR, file), 'utf-8'));
+  // Filters compare against the raw field, before the 'unknown' fallback is applied.
+  if (filterBranch && data.branch !== filterBranch) return null;
+  if (filterTier && data.tier !== filterTier) return null;
+  return {
+    file,
+    timestamp: data.timestamp || '',
+    branch: data.branch || 'unknown',
+    tier: data.tier || 'unknown',
+    version: data.version || '?',
+    passed: data.passed || 0,
+    total: data.total_tests || 0,
+    cost: data.total_cost_usd || 0,
+  };
+}
+
+const runs: RunSummary[] = [];
+for (const file of files) {
+  let summary: RunSummary | null;
+  try {
+    summary = summarizeRun(file);
+  } catch {
+    // Unreadable or malformed file — leave it out of the listing.
+    continue;
+  }
+  if (summary !== null) runs.push(summary);
+}
+
+// Sort newest first; timestamps are compared as strings.
+runs.sort((a, b) => b.timestamp.localeCompare(a.timestamp));
+
+// Apply limit
+const displayed = runs.slice(0, limit);
+
+// Column widths shared by the header row and the data rows.
+const DATE_WIDTH = 17;
+const BRANCH_WIDTH = 28;
+const TIER_WIDTH = 12;
+const PASS_WIDTH = 8;
+const COST_WIDTH = 8;
+// Longest branch name shown in full; longer names are cut and end in '...'.
+const BRANCH_MAX_CHARS = 26;
+
+/**
+ * Render a branch name as a fixed-width table cell.
+ *
+ * Names longer than BRANCH_MAX_CHARS are truncated with a trailing '...'.
+ * The result is always padded to BRANCH_WIDTH — previously truncated names
+ * were left unpadded, which shifted every later column two characters left.
+ */
+function formatBranchCell(branch: string): string {
+  const shown = branch.length > BRANCH_MAX_CHARS
+    ? branch.slice(0, BRANCH_MAX_CHARS - 3) + '...'
+    : branch;
+  return shown.padEnd(BRANCH_WIDTH);
+}
+
+// Print table
+console.log('');
+console.log(`Eval History (${runs.length} total runs)`);
+console.log('═'.repeat(90));
+console.log(
+  '  ' +
+  'Date'.padEnd(DATE_WIDTH) +
+  'Branch'.padEnd(BRANCH_WIDTH) +
+  'Tier'.padEnd(TIER_WIDTH) +
+  'Pass'.padEnd(PASS_WIDTH) +
+  'Cost'.padEnd(COST_WIDTH) +
+  'Version'
+);
+console.log('─'.repeat(90));
+
+for (const run of displayed) {
+  // "2026-03-14T03:49:57..." -> "2026-03-14 03:49"
+  const date = run.timestamp.replace('T', ' ').slice(0, 16).padEnd(DATE_WIDTH);
+  const branch = formatBranchCell(run.branch);
+  const tier = run.tier.padEnd(TIER_WIDTH);
+  const pass = `${run.passed}/${run.total}`.padEnd(PASS_WIDTH);
+  const cost = `$${run.cost.toFixed(2)}`.padEnd(COST_WIDTH);
+  console.log(`  ${date}${branch}${tier}${pass}${cost}v${run.version}`);
+}
+
+console.log('─'.repeat(90));
+
+// Totals cover every run that matched the filters, not just the displayed rows.
+const totalCost = runs.reduce((s, r) => s + r.cost, 0);
+console.log(`  ${runs.length} runs | Total spend: $${totalCost.toFixed(2)} | Showing: ${displayed.length}`);
+console.log(`  Dir: ${EVAL_DIR}`);
+console.log('');
@@ -0,0 +1,134 @@
+#!/usr/bin/env bun
+/**
+ * Aggregate summary of all eval runs from ~/.gstack-dev/evals/
+ *
+ * Usage: bun run eval:summary
+ */
+
+import * as fs from 'fs';
+import * as path from 'path';
+import * as os from 'os';
+import type { EvalResult } from '../test/helpers/eval-store';
+
+const EVAL_DIR = path.join(os.homedir(), '.gstack-dev', 'evals');
+
+// Shown whenever there is nothing to summarise.
+const NO_RUNS_HINT = 'No eval runs yet. Run: EVALS=1 bun run test:evals';
+
+// Names of all .json files in EVAL_DIR.
+let files: string[];
+try {
+  const entries = fs.readdirSync(EVAL_DIR);
+  files = entries.filter(entry => entry.endsWith('.json'));
+} catch {
+  // A directory that cannot be read is reported as "no runs yet", not as an error.
+  console.log(NO_RUNS_HINT);
+  process.exit(0);
+}
+
+if (files.length === 0) {
+  console.log(NO_RUNS_HINT);
+  process.exit(0);
+}
+
+// Load all results. Files that cannot be read or parsed are skipped, and so
+// is anything that parses but is not an object with a `tests` array: the
+// aggregation code iterates `r.tests` for every result and would otherwise
+// throw on such a file (e.g. one containing `null`).
+const results: EvalResult[] = [];
+for (const file of files) {
+  try {
+    const parsed: unknown = JSON.parse(fs.readFileSync(path.join(EVAL_DIR, file), 'utf-8'));
+    const looksLikeResult =
+      typeof parsed === 'object' &&
+      parsed !== null &&
+      Array.isArray((parsed as { tests?: unknown }).tests);
+    if (looksLikeResult) {
+      results.push(parsed as EvalResult);
+    }
+  } catch { continue; }
+}
+
+// Aggregate stats
+const e2eRuns = results.filter(r => r.tier === 'e2e');
+const judgeRuns = results.filter(r => r.tier === 'llm-judge');
+
+/** Sum of total_cost_usd over the given runs; a missing cost counts as 0. */
+function sumCost(runs: EvalResult[]): number {
+  return runs.reduce((sum, r) => sum + (r.total_cost_usd || 0), 0);
+}
+
+/** Mean cost per run, or 0 when there are no runs. */
+function meanCost(runs: EvalResult[]): number {
+  return runs.length > 0 ? sumCost(runs) / runs.length : 0;
+}
+
+// All three figures share the same missing-cost fallback. The averages used
+// to add total_cost_usd unguarded, so one run without a cost printed "$NaN".
+const totalCost = sumCost(results);
+const avgE2ECost = meanCost(e2eRuns);
+const avgJudgeCost = meanCost(judgeRuns);
+
+// Gather every detection_rate reported by a test in an e2e run; tests that
+// do not define one are skipped.
+const detectionRates: number[] = [];
+for (const run of e2eRuns) {
+  for (const { detection_rate: rate } of run.tests) {
+    if (rate === undefined) continue;
+    detectionRates.push(rate);
+  }
+}
+
+// Mean over all collected rates, or null when no test reported one.
+const detectionTotal = detectionRates.reduce((sum, rate) => sum + rate, 0);
+const avgDetection = detectionRates.length === 0
+  ? null
+  : detectionTotal / detectionRates.length;
+
+// Flaky tests: same tier + test name, seen at least twice, with both a
+// passing and a failing outcome across the loaded runs.
+const testResults = new Map<string, boolean[]>();
+for (const run of results) {
+  for (const test of run.tests) {
+    const key = `${run.tier}:${test.name}`;
+    const seen = testResults.get(key);
+    if (seen) {
+      seen.push(test.passed);
+    } else {
+      testResults.set(key, [test.passed]);
+    }
+  }
+}
+
+const flakyTests: string[] = [];
+testResults.forEach((outcomes, name) => {
+  if (outcomes.length < 2) return;
+  const allPassed = outcomes.every(outcome => outcome);
+  const allFailed = outcomes.every(outcome => !outcome);
+  if (!allPassed && !allFailed) flakyTests.push(name);
+});
+
+// Per-branch stats over e2e runs: run count, every detection_rate seen, and
+// their mean (0 when the branch has no detection rates).
+type BranchStats = { runs: number; avgDetection: number; detections: number[] };
+
+const branchStats = new Map<string, BranchStats>();
+for (const run of e2eRuns) {
+  let entry = branchStats.get(run.branch);
+  if (entry === undefined) {
+    entry = { runs: 0, avgDetection: 0, detections: [] };
+    branchStats.set(run.branch, entry);
+  }
+  entry.runs += 1;
+  for (const { detection_rate: rate } of run.tests) {
+    if (rate !== undefined) entry.detections.push(rate);
+  }
+}
+
+// Second pass: fill in each branch's mean once all its rates are collected.
+branchStats.forEach(entry => {
+  if (entry.detections.length === 0) {
+    entry.avgDetection = 0;
+    return;
+  }
+  const sum = entry.detections.reduce((acc, rate) => acc + rate, 0);
+  entry.avgDetection = sum / entry.detections.length;
+});
+
+// Print summary: headline numbers, then flaky tests and per-branch stats
+// (each section only when it has content).
+const heavyRule = '═'.repeat(60);
+const lightRule = '─'.repeat(60);
+
+console.log('');
+console.log('Eval Summary');
+console.log(heavyRule);
+
+const headline = [
+  `  Total runs:        ${results.length} (${e2eRuns.length} e2e, ${judgeRuns.length} llm-judge)`,
+  `  Total spend:       $${totalCost.toFixed(2)}`,
+  `  Avg cost/e2e:      $${avgE2ECost.toFixed(2)}`,
+  `  Avg cost/judge:    $${avgJudgeCost.toFixed(2)}`,
+];
+if (avgDetection !== null) {
+  headline.push(`  Avg detection:     ${avgDetection.toFixed(1)} bugs`);
+}
+headline.forEach(line => console.log(line));
+console.log(lightRule);
+
+if (flakyTests.length > 0) {
+  console.log(`  Flaky tests (${flakyTests.length}):`);
+  flakyTests.forEach(name => console.log(`    - ${name}`));
+  console.log(lightRule);
+}
+
+if (branchStats.size > 0) {
+  console.log('  Branches:');
+  // Highest average detection first.
+  const ranked = Array.from(branchStats).sort(
+    ([, left], [, right]) => right.avgDetection - left.avgDetection,
+  );
+  for (const [branch, stats] of ranked) {
+    let line = `    ${branch.padEnd(30)} ${stats.runs} runs`;
+    // The average is only shown for branches that reported a detection rate.
+    if (stats.detections.length > 0) {
+      line += ` avg det: ${stats.avgDetection.toFixed(1)}`;
+    }
+    console.log(line);
+  }
+  console.log(lightRule);
+}
+
+// Date range: earliest and latest non-empty timestamp, compared as strings.
+const toMinutePrecision = (stamp: string): string => stamp.replace('T', ' ').slice(0, 16);
+
+const timestamps = results
+  .map(run => run.timestamp)
+  .filter(stamp => Boolean(stamp))
+  .sort();
+if (timestamps.length > 0) {
+  const earliest = toMinutePrecision(timestamps[0]);
+  const latest = toMinutePrecision(timestamps[timestamps.length - 1]);
+  console.log(`  Date range: ${earliest} → ${latest}`);
+}
+
+console.log(`  Dir: ${EVAL_DIR}`);
+console.log('');