From daea165333311848afcfd58aebaf711a71aff0b5 Mon Sep 17 00:00:00 2001
From: Garry Tan <garrytan@gmail.com>
Date: Sun, 15 Mar 2026 16:47:41 -0500
Subject: [PATCH] feat: add eval:trend CLI for per-test pass rate tracking

computeTrends() classifies tests as stable-pass/stable-fail/flaky/
improving/degrading based on pass rate, flip count, and recent streak.
gstack eval trend shows sparkline table with --limit, --tier, --test
filters. Guard CLI main block with import.meta.main to prevent
execution on import.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 lib/cli-eval.ts             | 192 ++++++++++++++++++++++++++++++++++-
 package.json                |   1 +
 test/lib-eval-trend.test.ts | 193 ++++++++++++++++++++++++++++++++++++
 3 files changed, 385 insertions(+), 1 deletion(-)
 create mode 100644 test/lib-eval-trend.test.ts
diff --git a/lib/cli-eval.ts b/lib/cli-eval.ts
index df16d033..bee75ae0 100644
--- a/lib/cli-eval.ts
+++ b/lib/cli-eval.ts
@@ -258,6 +258,7 @@ async function cmdSummary(args: string[]): Promise<void> {
   if (flakyTests.length > 0) {
     console.log(`  Flaky tests (${flakyTests.length}):`);
     for (const name of flakyTests) console.log(`    - ${name}`);
+    console.log(`  Run 'bun run eval:trend' for detailed time series.`);
     console.log('─'.repeat(60));
   }
 
@@ -429,6 +430,191 @@ async function cmdWatch(): Promise<void> {
   process.exit(exitCode);
 }
 
+// --- Trend tracking ---
+
+export interface TestTrend {
+  name: string;
+  tier: string;
+  results: Array<{ timestamp: string; passed: boolean }>;
+  passRate: number;
+  streak: { type: 'pass' | 'fail'; count: number };
+  flipCount: number;
+  status: 'stable-pass' | 'stable-fail' | 'flaky' | 'improving' | 'degrading';
+}
+
+/**
+ * Compute per-test pass rate trends from eval results.
+ * Pure function — no I/O. Results are ordered chronologically (oldest first).
+ */
+export function computeTrends(
+  results: EvalResult[],
+  filterTier?: string,
+  filterTest?: string,
+): TestTrend[] {
+  // Build time series per test (chronological — oldest first)
+  const byTest = new Map<string, Array<{ timestamp: string; passed: boolean }>>();
+
+  // Results from loadEvalResults are newest-first, so reverse for chronological
+  const chronological = [...results].reverse();
+
+  for (const r of chronological) {
+    if (filterTier && r.tier !== filterTier) continue;
+    for (const t of r.tests) {
+      if (filterTest && t.name !== filterTest) continue;
+      const key = `${r.tier}:${t.name}`;
+      if (!byTest.has(key)) byTest.set(key, []);
+      byTest.get(key)!.push({ timestamp: r.timestamp, passed: t.passed });
+    }
+  }
+
+  const trends: TestTrend[] = [];
+
+  for (const [key, results] of byTest) {
+    const [tier, ...nameParts] = key.split(':');
+    const name = nameParts.join(':');
+    const total = results.length;
+    const passCount = results.filter(r => r.passed).length;
+    const passRate = total > 0 ? passCount / total : 0;
+
+    // Streak: walk from newest (end of array) backward
+    let streakType: 'pass' | 'fail' = results[results.length - 1].passed ? 'pass' : 'fail';
+    let streakCount = 0;
+    for (let i = results.length - 1; i >= 0; i--) {
+      const r = results[i].passed ? 'pass' : 'fail';
+      if (r === streakType) streakCount++;
+      else break;
+    }
+
+    // Flip count: transitions between pass and fail
+    let flipCount = 0;
+    for (let i = 1; i < results.length; i++) {
+      if (results[i].passed !== results[i - 1].passed) flipCount++;
+    }
+
+    // Classify status
+    let status: TestTrend['status'];
+    const last3 = results.slice(-3);
+    const earlier = results.slice(0, -3);
+    const last3AllPass = last3.length >= 3 && last3.every(r => r.passed);
+    const last3HasFail = last3.some(r => !r.passed);
+    const earlierHadFailures = earlier.some(r => !r.passed);
+    const earlierWasPassing = earlier.length > 0 && earlier.every(r => r.passed);
+
+    // Check improving/degrading first — a clear recent trend outranks raw pass rate
+    if (last3AllPass && earlierHadFailures) {
+      status = 'improving';
+    } else if (last3HasFail && earlierWasPassing) {
+      status = 'degrading';
+    } else if (flipCount >= 3 || (passRate > 0.3 && passRate < 0.7)) {
+      status = 'flaky';
+    } else if (passRate >= 0.9 && flipCount <= 1) {
+      status = 'stable-pass';
+    } else if (passRate <= 0.1 && flipCount <= 1) {
+      status = 'stable-fail';
+    } else if (passRate >= 0.5) {
+      status = 'stable-pass';
+    } else {
+      status = 'stable-fail';
+    }
+
+    trends.push({
+      name, tier, results, passRate,
+      streak: { type: streakType, count: streakCount },
+      flipCount, status,
+    });
+  }
+
+  // Sort: flaky first, then flipCount desc, then name
+  trends.sort((a, b) => {
+    const statusOrder = { flaky: 0, degrading: 1, improving: 2, 'stable-fail': 3, 'stable-pass': 4 };
+    const sa = statusOrder[a.status] ?? 5;
+    const sb = statusOrder[b.status] ?? 5;
+    if (sa !== sb) return sa - sb;
+    if (a.flipCount !== b.flipCount) return b.flipCount - a.flipCount;
+    return a.name.localeCompare(b.name);
+  });
+
+  return trends;
+}
+
+async function cmdTrend(args: string[]): Promise<void> {
+  let limit = 10;
+  let filterTier: string | undefined;
+  let filterTest: string | undefined;
+
+  for (let i = 0; i < args.length; i++) {
+    if (args[i] === '--limit' && args[i + 1]) { limit = parseInt(args[++i], 10); }
+    else if (args[i] === '--tier' && args[i + 1]) { filterTier = args[++i]; }
+    else if (args[i] === '--test' && args[i + 1]) { filterTest = args[++i]; }
+  }
+
+  const results = loadEvalResults<EvalResult>(undefined, limit);
+  if (results.length === 0) {
+    console.log('No eval runs yet. Run: EVALS=1 bun run test:evals');
+    return;
+  }
+
+  const trends = computeTrends(results, filterTier, filterTest);
+
+  if (trends.length === 0) {
+    console.log('No test data matching filters.');
+    return;
+  }
+
+  // Determine how many result columns to show
+  const maxResults = Math.min(limit, Math.max(...trends.map(t => t.results.length)));
+
+  console.log('');
+  console.log(`Test Trends (last ${results.length} runs)`);
+  console.log('═'.repeat(80));
+  console.log(
+    '  ' +
+    'Test Name'.padEnd(36) +
+    'Rate'.padEnd(7) +
+    `Last ${maxResults}`.padEnd(maxResults + 3) +
+    'Streak'.padEnd(8) +
+    'Status'
+  );
+  console.log('─'.repeat(80));
+
+  let flakyCount = 0;
+  let degradingCount = 0;
+
+  for (const t of trends) {
+    if (t.status === 'flaky') flakyCount++;
+    if (t.status === 'degrading') degradingCount++;
+
+    const fullName = `${t.tier}:${t.name}`;
+    const displayName = fullName.length > 34 ? fullName.slice(0, 31) + '...' : fullName.padEnd(36);
+    const rate = `${Math.round(t.passRate * 100)}%`.padEnd(7);
+
+    // Build sparkline of last N results
+    const sparkline = t.results
+      .slice(-maxResults)
+      .map(r => r.passed ? '\u2713' : '\u2717')
+      .join('');
+
+    const streak = `${t.streak.count}${t.streak.type === 'pass' ? '\u2713' : '\u2717'}`.padEnd(8);
+
+    // Color status
+    let statusStr = t.status;
+    if (isTTY) {
+      if (t.status === 'flaky' || t.status === 'degrading') statusStr = red(t.status);
+      else if (t.status === 'stable-pass' || t.status === 'improving') statusStr = green(t.status);
+      else statusStr = dim(t.status);
+    }
+
+    console.log(`  ${displayName}${rate}${sparkline.padEnd(maxResults + 3)}${streak}${statusStr}`);
+  }
+
+  console.log('─'.repeat(80));
+  const parts: string[] = [`${trends.length} tests tracked`];
+  if (flakyCount > 0) parts.push(`${flakyCount} flaky`);
+  if (degradingCount > 0) parts.push(`${degradingCount} degrading`);
+  console.log(`  ${parts.join(' | ')}`);
+  console.log('');
+}
+
 function printUsage(): void {
   console.log(`
 gstack eval — eval management CLI
@@ -441,13 +627,15 @@ Commands:
   summary [--limit N]                         Aggregate stats across all runs
   push <file>                                 Validate + save + sync an eval result
   cost <file>                                 Show per-model cost breakdown
+  trend [--limit N] [--tier X] [--test X]     Per-test pass rate trends
   cache read|write|stats|clear|verify         Manage eval cache
   watch                                       Live E2E test dashboard
 `);
 }
 
-// --- Main ---
+// --- Main (only when run directly, not imported) ---
 
+if (import.meta.main) {
 const command = process.argv[2];
 const cmdArgs = process.argv.slice(3);
 
@@ -457,6 +645,7 @@ switch (command) {
   case 'summary': cmdSummary(cmdArgs); break;
   case 'push':    cmdPush(cmdArgs); break;
   case 'cost':    cmdCost(cmdArgs); break;
+  case 'trend':   cmdTrend(cmdArgs); break;
   case 'cache':   cmdCache(cmdArgs); break;
   case 'watch':   cmdWatch(); break;
   case '--help': case '-h': case 'help': case undefined:
@@ -467,3 +656,4 @@ switch (command) {
     printUsage();
     process.exit(1);
 }
+}
diff --git a/package.json b/package.json
index 18090e7d..da816815 100644
--- a/package.json
+++ b/package.json
@@ -21,6 +21,7 @@
     "eval:list": "bun run lib/cli-eval.ts list",
     "eval:compare": "bun run lib/cli-eval.ts compare",
     "eval:summary": "bun run lib/cli-eval.ts summary",
+    "eval:trend": "bun run lib/cli-eval.ts trend",
     "eval:watch": "bun run lib/cli-eval.ts watch"
   },
   "dependencies": {
diff --git a/test/lib-eval-trend.test.ts b/test/lib-eval-trend.test.ts
new file mode 100644
index 00000000..c15aa149
--- /dev/null
+++ b/test/lib-eval-trend.test.ts
@@ -0,0 +1,193 @@
+/**
+ * Tests for computeTrends() — per-test pass rate trend tracking.
+ */
+
+import { describe, test, expect } from 'bun:test';
+import { computeTrends } from '../lib/cli-eval';
+import type { EvalResult } from './helpers/eval-store';
+
+/** Build a minimal EvalResult with given tests. */
+function makeRun(opts: {
+  timestamp: string;
+  tier?: 'e2e' | 'llm-judge';
+  tests: Array<{ name: string; passed: boolean }>;
+}): EvalResult {
+  return {
+    schema_version: 1,
+    version: '0.3.3',
+    branch: 'main',
+    git_sha: 'abc',
+    timestamp: opts.timestamp,
+    hostname: 'test',
+    tier: opts.tier || 'e2e',
+    total_tests: opts.tests.length,
+    passed: opts.tests.filter(t => t.passed).length,
+    failed: opts.tests.filter(t => !t.passed).length,
+    total_cost_usd: 0,
+    total_duration_ms: 0,
+    tests: opts.tests.map(t => ({
+      name: t.name, suite: 'test', tier: opts.tier || 'e2e' as const,
+      passed: t.passed, duration_ms: 0, cost_usd: 0,
+    })),
+  };
+}
+
+describe('computeTrends', () => {
+  test('classifies stable-pass test correctly', () => {
+    // 10 runs all passing — results are newest-first (loadEvalResults order)
+    const results = Array.from({ length: 10 }, (_, i) => makeRun({
+      timestamp: `2026-03-${String(10 + i).padStart(2, '0')}T00:00:00Z`,
+      tests: [{ name: 'always-pass', passed: true }],
+    })).reverse(); // newest first
+
+    const trends = computeTrends(results);
+    expect(trends).toHaveLength(1);
+    expect(trends[0].status).toBe('stable-pass');
+    expect(trends[0].passRate).toBe(1);
+    expect(trends[0].streak).toEqual({ type: 'pass', count: 10 });
+    expect(trends[0].flipCount).toBe(0);
+  });
+
+  test('classifies stable-fail test correctly', () => {
+    const results = Array.from({ length: 10 }, (_, i) => makeRun({
+      timestamp: `2026-03-${String(10 + i).padStart(2, '0')}T00:00:00Z`,
+      tests: [{ name: 'always-fail', passed: false }],
+    })).reverse();
+
+    const trends = computeTrends(results);
+    expect(trends[0].status).toBe('stable-fail');
+    expect(trends[0].passRate).toBe(0);
+    expect(trends[0].streak).toEqual({ type: 'fail', count: 10 });
+  });
+
+  test('classifies flaky test correctly — alternating pass/fail', () => {
+    const results = Array.from({ length: 10 }, (_, i) => makeRun({
+      timestamp: `2026-03-${String(10 + i).padStart(2, '0')}T00:00:00Z`,
+      tests: [{ name: 'flaky', passed: i % 2 === 0 }],
+    })).reverse();
+
+    const trends = computeTrends(results);
+    expect(trends[0].status).toBe('flaky');
+    expect(trends[0].flipCount).toBe(9);
+    expect(trends[0].passRate).toBe(0.5);
+  });
+
+  test('classifies improving test correctly', () => {
+    // First 5 fail, last 5 pass
+    const results = Array.from({ length: 10 }, (_, i) => makeRun({
+      timestamp: `2026-03-${String(10 + i).padStart(2, '0')}T00:00:00Z`,
+      tests: [{ name: 'improving', passed: i >= 5 }],
+    })).reverse();
+
+    const trends = computeTrends(results);
+    expect(trends[0].status).toBe('improving');
+    expect(trends[0].streak).toEqual({ type: 'pass', count: 5 });
+  });
+
+  test('classifies degrading test correctly', () => {
+    // First 7 pass, last 3 fail
+    const results = Array.from({ length: 10 }, (_, i) => makeRun({
+      timestamp: `2026-03-${String(10 + i).padStart(2, '0')}T00:00:00Z`,
+      tests: [{ name: 'degrading', passed: i < 7 }],
+    })).reverse();
+
+    const trends = computeTrends(results);
+    expect(trends[0].status).toBe('degrading');
+    expect(trends[0].streak).toEqual({ type: 'fail', count: 3 });
+  });
+
+  test('computes streak correctly with mixed ending', () => {
+    // pass, pass, fail, pass, pass, pass (newest)
+    const passed = [true, true, false, true, true, true];
+    const results = passed.map((p, i) => makeRun({
+      timestamp: `2026-03-${String(10 + i).padStart(2, '0')}T00:00:00Z`,
+      tests: [{ name: 'test', passed: p }],
+    })).reverse();
+
+    const trends = computeTrends(results);
+    expect(trends[0].streak).toEqual({ type: 'pass', count: 3 });
+  });
+
+  test('computes flipCount correctly', () => {
+    // pass, fail, pass, pass, fail, pass → 4 flips
+    const passed = [true, false, true, true, false, true];
+    const results = passed.map((p, i) => makeRun({
+      timestamp: `2026-03-${String(10 + i).padStart(2, '0')}T00:00:00Z`,
+      tests: [{ name: 'test', passed: p }],
+    })).reverse();
+
+    const trends = computeTrends(results);
+    expect(trends[0].flipCount).toBe(4);
+  });
+
+  test('handles single run', () => {
+    const results = [makeRun({
+      timestamp: '2026-03-15T00:00:00Z',
+      tests: [{ name: 'single', passed: true }],
+    })];
+
+    const trends = computeTrends(results);
+    expect(trends).toHaveLength(1);
+    expect(trends[0].passRate).toBe(1);
+    expect(trends[0].streak).toEqual({ type: 'pass', count: 1 });
+    expect(trends[0].flipCount).toBe(0);
+    expect(trends[0].status).toBe('stable-pass');
+  });
+
+  test('handles single failing run', () => {
+    const results = [makeRun({
+      timestamp: '2026-03-15T00:00:00Z',
+      tests: [{ name: 'single-fail', passed: false }],
+    })];
+
+    const trends = computeTrends(results);
+    expect(trends[0].status).toBe('stable-fail');
+  });
+
+  test('filters by tier', () => {
+    const results = [
+      makeRun({ timestamp: '2026-03-15T00:00:00Z', tier: 'e2e', tests: [{ name: 'e2e-test', passed: true }] }),
+      makeRun({ timestamp: '2026-03-15T00:00:00Z', tier: 'llm-judge', tests: [{ name: 'judge-test', passed: true }] }),
+    ];
+
+    const e2eOnly = computeTrends(results, 'e2e');
+    expect(e2eOnly).toHaveLength(1);
+    expect(e2eOnly[0].name).toBe('e2e-test');
+
+    const judgeOnly = computeTrends(results, 'llm-judge');
+    expect(judgeOnly).toHaveLength(1);
+    expect(judgeOnly[0].name).toBe('judge-test');
+  });
+
+  test('filters by test name', () => {
+    const results = Array.from({ length: 3 }, (_, i) => makeRun({
+      timestamp: `2026-03-${String(10 + i).padStart(2, '0')}T00:00:00Z`,
+      tests: [
+        { name: 'test-a', passed: true },
+        { name: 'test-b', passed: false },
+      ],
+    })).reverse();
+
+    const filtered = computeTrends(results, undefined, 'test-a');
+    expect(filtered).toHaveLength(1);
+    expect(filtered[0].name).toBe('test-a');
+    expect(filtered[0].passRate).toBe(1);
+  });
+
+  test('sorts flaky tests first', () => {
+    // Create runs where test-a is flaky and test-b is stable
+    const results = Array.from({ length: 6 }, (_, i) => makeRun({
+      timestamp: `2026-03-${String(10 + i).padStart(2, '0')}T00:00:00Z`,
+      tests: [
+        { name: 'test-a', passed: i % 2 === 0 }, // flaky: alternating
+        { name: 'test-b', passed: true },          // stable-pass
+      ],
+    })).reverse();
+
+    const trends = computeTrends(results);
+    expect(trends[0].name).toBe('test-a');
+    expect(trends[0].status).toBe('flaky');
+    expect(trends[1].name).toBe('test-b');
+    expect(trends[1].status).toBe('stable-pass');
+  });
+});