From daea165333311848afcfd58aebaf711a71aff0b5 Mon Sep 17 00:00:00 2001
From: Garry Tan
Date: Sun, 15 Mar 2026 16:47:41 -0500
Subject: [PATCH] feat: add eval:trend CLI for per-test pass rate tracking

computeTrends() classifies tests as stable-pass/stable-fail/flaky/
improving/degrading based on pass rate, flip count, and recent streak.
gstack eval trend shows sparkline table with --limit, --tier, --test filters.

Guard CLI main block with import.meta.main to prevent execution on import.

Co-Authored-By: Claude Opus 4.6 (1M context)
---
 lib/cli-eval.ts             | 205 +++++++++++++++++++++++++++++++++++-
 package.json                |   1 +
 test/lib-eval-trend.test.ts | 193 +++++++++++++++++++++++++++++++++
 3 files changed, 398 insertions(+), 1 deletion(-)
 create mode 100644 test/lib-eval-trend.test.ts

diff --git a/lib/cli-eval.ts b/lib/cli-eval.ts
index df16d033..bee75ae0 100644
--- a/lib/cli-eval.ts
+++ b/lib/cli-eval.ts
@@ -258,6 +258,7 @@ async function cmdSummary(args: string[]): Promise<void> {
   if (flakyTests.length > 0) {
     console.log(` Flaky tests (${flakyTests.length}):`);
     for (const name of flakyTests) console.log(` - ${name}`);
+    console.log(` Run 'bun run eval:trend' for detailed time series.`);
     console.log('─'.repeat(60));
   }
 
@@ -429,6 +430,204 @@ async function cmdWatch(): Promise<void> {
   process.exit(exitCode);
 }
 
+// --- Trend tracking ---
+
+export interface TestTrend {
+  name: string;
+  tier: string;
+  results: Array<{ timestamp: string; passed: boolean }>;
+  passRate: number;
+  streak: { type: 'pass' | 'fail'; count: number };
+  flipCount: number;
+  status: 'stable-pass' | 'stable-fail' | 'flaky' | 'improving' | 'degrading';
+}
+
+/**
+ * Compute per-test pass rate trends from eval results.
+ * Pure function — no I/O. Input runs are expected newest-first (loadEvalResults order);
+ * each returned trend's `results` series is chronological (oldest first).
+ */
+export function computeTrends(
+  results: EvalResult[],
+  filterTier?: string,
+  filterTest?: string,
+): TestTrend[] {
+  // Build time series per test (chronological — oldest first)
+  const byTest = new Map<string, TestTrend['results']>();
+
+  // Results from loadEvalResults are newest-first, so reverse for chronological
+  const chronological = [...results].reverse();
+
+  for (const r of chronological) {
+    if (filterTier && r.tier !== filterTier) continue;
+    for (const t of r.tests) {
+      if (filterTest && t.name !== filterTest) continue;
+      const key = `${r.tier}:${t.name}`;
+      if (!byTest.has(key)) byTest.set(key, []);
+      byTest.get(key)!.push({ timestamp: r.timestamp, passed: t.passed });
+    }
+  }
+
+  const trends: TestTrend[] = [];
+
+  for (const [key, results] of byTest) {
+    const [tier, ...nameParts] = key.split(':');
+    const name = nameParts.join(':');
+    const total = results.length;
+    const passCount = results.filter(r => r.passed).length;
+    const passRate = total > 0 ? passCount / total : 0;
+
+    // Streak: walk from newest (end of array) backward
+    let streakType: 'pass' | 'fail' = results[results.length - 1].passed ? 'pass' : 'fail';
+    let streakCount = 0;
+    for (let i = results.length - 1; i >= 0; i--) {
+      const r = results[i].passed ? 'pass' : 'fail';
+      if (r === streakType) streakCount++;
+      else break;
+    }
+
+    // Flip count: transitions between pass and fail
+    let flipCount = 0;
+    for (let i = 1; i < results.length; i++) {
+      if (results[i].passed !== results[i - 1].passed) flipCount++;
+    }
+
+    // Classify status
+    let status: TestTrend['status'];
+    const last3 = results.slice(-3);
+    const earlier = results.slice(0, -3);
+    const last3AllPass = last3.length >= 3 && last3.every(r => r.passed);
+    const last3HasFail = last3.some(r => !r.passed);
+    const earlierHadFailures = earlier.some(r => !r.passed);
+    const earlierWasPassing = earlier.length > 0 && earlier.every(r => r.passed);
+
+    // Check improving/degrading first — a clear recent trend outranks raw pass rate
+    if (last3AllPass && earlierHadFailures) {
+      status = 'improving';
+    } else if (last3HasFail && earlierWasPassing) {
+      status = 'degrading';
+    } else if (flipCount >= 3 || (passRate > 0.3 && passRate < 0.7)) {
+      status = 'flaky';
+    } else if (passRate >= 0.9 && flipCount <= 1) {
+      status = 'stable-pass';
+    } else if (passRate <= 0.1 && flipCount <= 1) {
+      status = 'stable-fail';
+    } else if (passRate >= 0.5) {
+      status = 'stable-pass';
+    } else {
+      status = 'stable-fail';
+    }
+
+    trends.push({
+      name, tier, results, passRate,
+      streak: { type: streakType, count: streakCount },
+      flipCount, status,
+    });
+  }
+
+  // Sort: flaky first, then flipCount desc, then name
+  const statusOrder = { flaky: 0, degrading: 1, improving: 2, 'stable-fail': 3, 'stable-pass': 4 };
+  trends.sort((a, b) => {
+    const sa = statusOrder[a.status] ?? 5;
+    const sb = statusOrder[b.status] ?? 5;
+    if (sa !== sb) return sa - sb;
+    if (a.flipCount !== b.flipCount) return b.flipCount - a.flipCount;
+    return a.name.localeCompare(b.name);
+  });
+
+  return trends;
+}
+
+/**
+ * `gstack eval trend` — print a per-test pass/fail trend table for recent eval runs.
+ * Flags: --limit N (runs to load, default 10), --tier X and --test X (exact-match filters).
+ * An invalid --limit is reported on stderr and the default is kept.
+ */
+async function cmdTrend(args: string[]): Promise<void> {
+  let limit = 10;
+  let filterTier: string | undefined;
+  let filterTest: string | undefined;
+
+  for (let i = 0; i < args.length; i++) {
+    if (args[i] === '--limit' && args[i + 1]) {
+      // Reject NaN / zero / negative so a typo cannot poison the slice and padding math below
+      const parsed = parseInt(args[++i], 10);
+      if (Number.isFinite(parsed) && parsed > 0) limit = parsed;
+      else console.error(`Ignoring invalid --limit '${args[i]}'; using ${limit}.`);
+    } else if (args[i] === '--tier' && args[i + 1]) { filterTier = args[++i]; }
+    else if (args[i] === '--test' && args[i + 1]) { filterTest = args[++i]; }
+  }
+
+  const results = loadEvalResults(undefined, limit);
+  if (results.length === 0) {
+    console.log('No eval runs yet. Run: EVALS=1 bun run test:evals');
+    return;
+  }
+
+  const trends = computeTrends(results, filterTier, filterTest);
+
+  if (trends.length === 0) {
+    console.log('No test data matching filters.');
+    return;
+  }
+
+  // Determine how many result columns to show
+  const maxResults = Math.min(limit, Math.max(...trends.map(t => t.results.length)));
+  // The column must fit both the sparkline and its "Last N" header, which is wider when N <= 3
+  const sparkWidth = Math.max(maxResults + 3, `Last ${maxResults}`.length + 1);
+
+  console.log('');
+  console.log(`Test Trends (last ${results.length} runs)`);
+  console.log('═'.repeat(80));
+  console.log(
+    ' ' +
+    'Test Name'.padEnd(36) +
+    'Rate'.padEnd(7) +
+    `Last ${maxResults}`.padEnd(sparkWidth) +
+    'Streak'.padEnd(8) +
+    'Status'
+  );
+  console.log('─'.repeat(80));
+
+  let flakyCount = 0;
+  let degradingCount = 0;
+
+  for (const t of trends) {
+    if (t.status === 'flaky') flakyCount++;
+    if (t.status === 'degrading') degradingCount++;
+
+    const fullName = `${t.tier}:${t.name}`;
+    // Truncate long names first, then pad every name to the same 36-column width
+    const displayName = (fullName.length > 34 ? fullName.slice(0, 31) + '...' : fullName).padEnd(36);
+    const rate = `${Math.round(t.passRate * 100)}%`.padEnd(7);
+
+    // Build sparkline of last N results
+    const sparkline = t.results
+      .slice(-maxResults)
+      .map(r => r.passed ? '\u2713' : '\u2717')
+      .join('');
+
+    const streak = `${t.streak.count}${t.streak.type === 'pass' ? '\u2713' : '\u2717'}`.padEnd(8);
+
+    // Color status; typed as string because the colorized text is no longer a status literal
+    let statusStr: string = t.status;
+    if (isTTY) {
+      if (t.status === 'flaky' || t.status === 'degrading') statusStr = red(t.status);
+      else if (t.status === 'stable-pass' || t.status === 'improving') statusStr = green(t.status);
+      else statusStr = dim(t.status);
+    }
+
+    console.log(` ${displayName}${rate}${sparkline.padEnd(sparkWidth)}${streak}${statusStr}`);
+  }
+
+  console.log('─'.repeat(80));
+  const parts: string[] = [`${trends.length} tests tracked`];
+  if (flakyCount > 0) parts.push(`${flakyCount} flaky`);
+  if (degradingCount > 0) parts.push(`${degradingCount} degrading`);
+  console.log(` ${parts.join(' | ')}`);
+  console.log('');
+}
+
 function printUsage(): void {
   console.log(`
 gstack eval — eval management CLI
@@ -441,13 +640,15 @@ Commands:
   summary [--limit N] Aggregate stats across all runs
   push Validate + save + sync an eval result
   cost Show per-model cost breakdown
+  trend [--limit N] [--tier X] [--test X] Per-test pass rate trends
   cache read|write|stats|clear|verify Manage eval cache
   watch Live E2E test dashboard
 `);
 }
 
-// --- Main ---
+// --- Main (only when run directly, not imported) ---
 
+if (import.meta.main) {
 const command = process.argv[2];
 const cmdArgs = process.argv.slice(3);
 
@@ -457,6 +658,7 @@ switch (command) {
   case 'summary': cmdSummary(cmdArgs); break;
   case 'push': cmdPush(cmdArgs); break;
   case 'cost': cmdCost(cmdArgs); break;
+  case 'trend': cmdTrend(cmdArgs); break;
   case 'cache': cmdCache(cmdArgs); break;
   case 'watch': cmdWatch(); break;
   case '--help': case '-h': case 'help': case undefined:
@@ -467,3 +669,4 @@ switch (command) {
     printUsage();
     process.exit(1);
 }
+}
diff --git a/package.json b/package.json
index 18090e7d..da816815 100644
--- a/package.json
+++ b/package.json
@@ -21,6 +21,7 @@
     "eval:list": "bun run lib/cli-eval.ts list",
     "eval:compare": "bun run lib/cli-eval.ts compare",
     "eval:summary": "bun run lib/cli-eval.ts summary",
+    "eval:trend": "bun run lib/cli-eval.ts trend",
     "eval:watch": "bun run lib/cli-eval.ts watch"
   },
   "dependencies": {
diff --git a/test/lib-eval-trend.test.ts b/test/lib-eval-trend.test.ts
new file mode 100644
index 00000000..c15aa149
--- /dev/null
+++ b/test/lib-eval-trend.test.ts
@@ -0,0 +1,193 @@
+/**
+ * Tests for computeTrends() — per-test pass rate trend tracking.
+ */
+
+import { describe, test, expect } from 'bun:test';
+import { computeTrends } from '../lib/cli-eval';
+import type { EvalResult } from './helpers/eval-store';
+
+/** Build a minimal EvalResult with given tests. */
+function makeRun(opts: {
+  timestamp: string;
+  tier?: 'e2e' | 'llm-judge';
+  tests: Array<{ name: string; passed: boolean }>;
+}): EvalResult {
+  return {
+    schema_version: 1,
+    version: '0.3.3',
+    branch: 'main',
+    git_sha: 'abc',
+    timestamp: opts.timestamp,
+    hostname: 'test',
+    tier: opts.tier || 'e2e',
+    total_tests: opts.tests.length,
+    passed: opts.tests.filter(t => t.passed).length,
+    failed: opts.tests.filter(t => !t.passed).length,
+    total_cost_usd: 0,
+    total_duration_ms: 0,
+    tests: opts.tests.map(t => ({
+      name: t.name, suite: 'test', tier: opts.tier || 'e2e' as const,
+      passed: t.passed, duration_ms: 0, cost_usd: 0,
+    })),
+  };
+}
+
+describe('computeTrends', () => {
+  test('classifies stable-pass test correctly', () => {
+    // 10 runs all passing — results are newest-first (loadEvalResults order)
+    const results = Array.from({ length: 10 }, (_, i) => makeRun({
+      timestamp: `2026-03-${String(10 + i).padStart(2, '0')}T00:00:00Z`,
+      tests: [{ name: 'always-pass', passed: true }],
+    })).reverse(); // newest first
+
+    const trends = computeTrends(results);
+    expect(trends).toHaveLength(1);
+    expect(trends[0].status).toBe('stable-pass');
+    expect(trends[0].passRate).toBe(1);
+    expect(trends[0].streak).toEqual({ type: 'pass', count: 10 });
+    expect(trends[0].flipCount).toBe(0);
+  });
+
+  test('classifies stable-fail test correctly', () => {
+    const results = Array.from({ length: 10 }, (_, i) => makeRun({
+      timestamp: `2026-03-${String(10 + i).padStart(2, '0')}T00:00:00Z`,
+      tests: [{ name: 'always-fail', passed: false }],
+    })).reverse();
+
+    const trends = computeTrends(results);
+    expect(trends[0].status).toBe('stable-fail');
+    expect(trends[0].passRate).toBe(0);
+    expect(trends[0].streak).toEqual({ type: 'fail', count: 10 });
+  });
+
+  test('classifies flaky test correctly — alternating pass/fail', () => {
+    const results = Array.from({ length: 10 }, (_, i) => makeRun({
+      timestamp: `2026-03-${String(10 + i).padStart(2, '0')}T00:00:00Z`,
+      tests: [{ name: 'flaky', passed: i % 2 === 0 }],
+    })).reverse();
+
+    const trends = computeTrends(results);
+    expect(trends[0].status).toBe('flaky');
+    expect(trends[0].flipCount).toBe(9);
+    expect(trends[0].passRate).toBe(0.5);
+  });
+
+  test('classifies improving test correctly', () => {
+    // First 5 fail, last 5 pass
+    const results = Array.from({ length: 10 }, (_, i) => makeRun({
+      timestamp: `2026-03-${String(10 + i).padStart(2, '0')}T00:00:00Z`,
+      tests: [{ name: 'improving', passed: i >= 5 }],
+    })).reverse();
+
+    const trends = computeTrends(results);
+    expect(trends[0].status).toBe('improving');
+    expect(trends[0].streak).toEqual({ type: 'pass', count: 5 });
+  });
+
+  test('classifies degrading test correctly', () => {
+    // First 7 pass, last 3 fail
+    const results = Array.from({ length: 10 }, (_, i) => makeRun({
+      timestamp: `2026-03-${String(10 + i).padStart(2, '0')}T00:00:00Z`,
+      tests: [{ name: 'degrading', passed: i < 7 }],
+    })).reverse();
+
+    const trends = computeTrends(results);
+    expect(trends[0].status).toBe('degrading');
+    expect(trends[0].streak).toEqual({ type: 'fail', count: 3 });
+  });
+
+  test('computes streak correctly with mixed ending', () => {
+    // pass, pass, fail, pass, pass, pass (newest)
+    const passed = [true, true, false, true, true, true];
+    const results = passed.map((p, i) => makeRun({
+      timestamp: `2026-03-${String(10 + i).padStart(2, '0')}T00:00:00Z`,
+      tests: [{ name: 'test', passed: p }],
+    })).reverse();
+
+    const trends = computeTrends(results);
+    expect(trends[0].streak).toEqual({ type: 'pass', count: 3 });
+  });
+
+  test('computes flipCount correctly', () => {
+    // pass, fail, pass, pass, fail, pass → 4 flips
+    const passed = [true, false, true, true, false, true];
+    const results = passed.map((p, i) => makeRun({
+      timestamp: `2026-03-${String(10 + i).padStart(2, '0')}T00:00:00Z`,
+      tests: [{ name: 'test', passed: p }],
+    })).reverse();
+
+    const trends = computeTrends(results);
+    expect(trends[0].flipCount).toBe(4);
+  });
+
+  test('handles single run', () => {
+    const results = [makeRun({
+      timestamp: '2026-03-15T00:00:00Z',
+      tests: [{ name: 'single', passed: true }],
+    })];
+
+    const trends = computeTrends(results);
+    expect(trends).toHaveLength(1);
+    expect(trends[0].passRate).toBe(1);
+    expect(trends[0].streak).toEqual({ type: 'pass', count: 1 });
+    expect(trends[0].flipCount).toBe(0);
+    expect(trends[0].status).toBe('stable-pass');
+  });
+
+  test('handles single failing run', () => {
+    const results = [makeRun({
+      timestamp: '2026-03-15T00:00:00Z',
+      tests: [{ name: 'single-fail', passed: false }],
+    })];
+
+    const trends = computeTrends(results);
+    expect(trends[0].status).toBe('stable-fail');
+  });
+
+  test('filters by tier', () => {
+    const results = [
+      makeRun({ timestamp: '2026-03-15T00:00:00Z', tier: 'e2e', tests: [{ name: 'e2e-test', passed: true }] }),
+      makeRun({ timestamp: '2026-03-15T00:00:00Z', tier: 'llm-judge', tests: [{ name: 'judge-test', passed: true }] }),
+    ];
+
+    const e2eOnly = computeTrends(results, 'e2e');
+    expect(e2eOnly).toHaveLength(1);
+    expect(e2eOnly[0].name).toBe('e2e-test');
+
+    const judgeOnly = computeTrends(results, 'llm-judge');
+    expect(judgeOnly).toHaveLength(1);
+    expect(judgeOnly[0].name).toBe('judge-test');
+  });
+
+  test('filters by test name', () => {
+    const results = Array.from({ length: 3 }, (_, i) => makeRun({
+      timestamp: `2026-03-${String(10 + i).padStart(2, '0')}T00:00:00Z`,
+      tests: [
+        { name: 'test-a', passed: true },
+        { name: 'test-b', passed: false },
+      ],
+    })).reverse();
+
+    const filtered = computeTrends(results, undefined, 'test-a');
+    expect(filtered).toHaveLength(1);
+    expect(filtered[0].name).toBe('test-a');
+    expect(filtered[0].passRate).toBe(1);
+  });
+
+  test('sorts flaky tests first', () => {
+    // Create runs where test-a is flaky and test-b is stable
+    const results = Array.from({ length: 6 }, (_, i) => makeRun({
+      timestamp: `2026-03-${String(10 + i).padStart(2, '0')}T00:00:00Z`,
+      tests: [
+        { name: 'test-a', passed: i % 2 === 0 }, // flaky: alternating
+        { name: 'test-b', passed: true }, // stable-pass
+      ],
+    })).reverse();
+
+    const trends = computeTrends(results);
+    expect(trends[0].name).toBe('test-a');
+    expect(trends[0].status).toBe('flaky');
+    expect(trends[0].name).toBe('test-a');
+    expect(trends[1].name).toBe('test-b');
+    expect(trends[1].status).toBe('stable-pass');
+  });
+});