diff --git a/lib/dashboard-queries.ts b/lib/dashboard-queries.ts
new file mode 100644
index 00000000..4d554c0e
--- /dev/null
+++ b/lib/dashboard-queries.ts
@@ -0,0 +1,464 @@
+/**
+ * Dashboard query/transform functions — pure, no I/O.
+ *
+ * All functions take arrays of Supabase rows (Record<string, unknown>[])
+ * and return structured results. Used by both the CLI leaderboard
+ * and the shared HTML dashboard.
+ *
+ * The only ambient input is the wall clock: computeVelocity and
+ * computeLeaderboard default "now" to Date.now() and accept an explicit
+ * value so callers and tests can pin it.
+ */
+
+// --- Types ---
+
+/** A test that failed in the latest run but passed in most previous runs. */
+export interface RegressionEntry {
+  testName: string;
+  /** Pass rate (0-100) across the previous runs that included the test. */
+  previousRate: number;
+  /** Always 0: only tests that failed in the latest run are reported. */
+  currentRate: number;
+  /** currentRate - previousRate, so always negative. */
+  delta: number;
+}
+
+export interface RegressionResult {
+  regressions: RegressionEntry[];
+  /** Mean pass rate of the previous runs; null when unavailable. */
+  overallPreviousRate: number | null;
+  /** Pass rate of the latest run; null when unavailable. */
+  overallCurrentRate: number | null;
+  /** Current minus previous; 0 when either side is null. */
+  overallDelta: number;
+}
+
+export interface VelocityByUser {
+  userId: string;
+  email: string;
+  shipsThisWeek: number;
+  shipsThisMonth: number;
+}
+
+export interface VelocityResult {
+  byUser: VelocityByUser[];
+  teamTotal: { week: number; month: number };
+}
+
+export interface CostWeek {
+  /** UTC date (YYYY-MM-DD) of the Sunday that starts the week. */
+  weekStart: string;
+  totalCost: number;
+  runs: number;
+}
+
+export interface CostTrendResult {
+  weekly: CostWeek[];
+  totalAllTime: number;
+}
+
+export interface LeaderboardEntry {
+  userId: string;
+  email: string;
+  ships: number;
+  evalRuns: number;
+  sessions: number;
+  avgPassRate: number | null;
+  totalCost: number;
+}
+
+export interface QARepoTrend {
+  repoSlug: string;
+  scores: Array<{ date: string; score: number }>;
+}
+
+export interface QATrendResult {
+  byRepo: QARepoTrend[];
+}
+
+export interface EvalTestTrend {
+  testName: string;
+  history: Array<{ timestamp: string; passed: boolean }>;
+  passRate: number;
+  isFlaky: boolean;
+}
+
+export interface EvalTrendResult {
+  byTest: EvalTestTrend[];
+}
+
+// --- Helpers ---
+
+const MS_PER_DAY = 86_400_000;
+
+/** One entry of a run's `tests` array, reduced to the fields read here. */
+interface TestOutcome {
+  name: string;
+  passed: boolean;
+}
+
+/** Pass rate as a percentage, or null when `total` is not a positive number. */
+function safePassRate(passed: unknown, total: unknown): number | null {
+  const p = Number(passed) || 0;
+  const t = Number(total) || 0;
+  return t > 0 ? (p / t) * 100 : null;
+}
+
+/**
+ * Parse a row timestamp into epoch milliseconds.
+ *
+ * Rows are compared as instants rather than as strings because the same
+ * moment can be spelled several ways (`Z`, `+00:00`, other offsets, with or
+ * without fractional seconds) and those spellings do not sort
+ * lexicographically. Missing or unparseable values yield NaN, which fails
+ * every ordered comparison.
+ */
+function toEpochMs(value: unknown): number {
+  if (value instanceof Date) return value.getTime();
+  return typeof value === 'string' ? Date.parse(value) : NaN;
+}
+
+/** UTC date (YYYY-MM-DD) of the Sunday on or before the given instant. */
+function weekStart(epochMs: number): string {
+  const d = new Date(epochMs);
+  d.setUTCDate(d.getUTCDate() - d.getUTCDay());
+  d.setUTCHours(0, 0, 0, 0);
+  return d.toISOString().slice(0, 10);
+}
+
+/** Epoch milliseconds of the instant `days` days before `now`. */
+function daysAgo(days: number, now: number): number {
+  return now - days * MS_PER_DAY;
+}
+
+/**
+ * Named test outcomes from a run's `tests` field. A non-array field and
+ * entries that are not objects or have no string name are dropped.
+ */
+function namedTests(run: Record<string, unknown>): TestOutcome[] {
+  const tests = run.tests;
+  if (!Array.isArray(tests)) return [];
+  const outcomes: TestOutcome[] = [];
+  for (const raw of tests as unknown[]) {
+    if (raw === null || typeof raw !== 'object') continue;
+    const { name, passed } = raw as { name?: unknown; passed?: unknown };
+    if (typeof name !== 'string' || name === '') continue;
+    outcomes.push({ name, passed: !!passed });
+  }
+  return outcomes;
+}
+
+// --- Query functions ---
+
+/**
+ * Detect eval regressions by comparing the most recent run's pass rate
+ * against the average of the previous runs.
+ *
+ * @param evalRuns Eval run rows, newest first; this function does not sort
+ *   them. Fields read: `passed`, `total_tests`, `tests`.
+ * @returns Overall rates plus every test that failed in the latest run after
+ *   passing in more than 50% of the previous runs that included it. With
+ *   fewer than two runs there is nothing to compare and the result is empty.
+ */
+export function detectRegressions(evalRuns: Record<string, unknown>[]): RegressionResult {
+  if (evalRuns.length < 2) {
+    return { regressions: [], overallPreviousRate: null, overallCurrentRate: null, overallDelta: 0 };
+  }
+
+  const [latest, ...previous] = evalRuns;
+
+  const currentRate = safePassRate(latest.passed, latest.total_tests);
+  const previousRates = previous
+    .map(r => safePassRate(r.passed, r.total_tests))
+    .filter((r): r is number => r !== null);
+
+  const previousAvg = previousRates.length > 0
+    ? previousRates.reduce((a, b) => a + b, 0) / previousRates.length
+    : null;
+
+  const overallDelta = (currentRate !== null && previousAvg !== null)
+    ? currentRate - previousAvg
+    : 0;
+
+  // Group previous test results by name
+  const previousByName = new Map<string, boolean[]>();
+  for (const t of previous.flatMap(namedTests)) {
+    const results = previousByName.get(t.name);
+    if (results) results.push(t.passed);
+    else previousByName.set(t.name, [t.passed]);
+  }
+
+  // Per-test regression detection
+  const regressions: RegressionEntry[] = [];
+  for (const t of namedTests(latest)) {
+    if (t.passed) continue; // only look at failures
+    const prevResults = previousByName.get(t.name);
+    if (!prevResults) continue; // new test: no baseline to compare against
+
+    const prevPassRate = (prevResults.filter(Boolean).length / prevResults.length) * 100;
+    if (prevPassRate > 50) {
+      // Was passing >50% of the time, now failed
+      regressions.push({
+        testName: t.name,
+        previousRate: prevPassRate,
+        currentRate: 0,
+        delta: -prevPassRate,
+      });
+    }
+  }
+
+  return {
+    regressions,
+    overallPreviousRate: previousAvg,
+    overallCurrentRate: currentRate,
+    overallDelta,
+  };
+}
+
+/**
+ * Compute shipping velocity grouped by user.
+ *
+ * @param shipLogs Ship log rows. Fields read: `created_at` (falling back to
+ *   `timestamp`), `user_id`, `email`.
+ * @param windowDays Length in days of the "month" window.
+ * @param now Epoch milliseconds that both windows are measured back from.
+ * @returns Per-user counts sorted by weekly then monthly ships (descending),
+ *   plus team totals. A user whose rows all fall outside the windows, or
+ *   have no usable timestamp, is still listed with zero counts.
+ */
+export function computeVelocity(
+  shipLogs: Record<string, unknown>[],
+  windowDays = 30,
+  now: number = Date.now(),
+): VelocityResult {
+  const weekAgo = daysAgo(7, now);
+  const monthAgo = daysAgo(windowDays, now);
+
+  const byUser = new Map<string, { email: string; week: number; month: number }>();
+
+  for (const log of shipLogs) {
+    const ts = toEpochMs(log.created_at || log.timestamp);
+    const userId = String(log.user_id || 'unknown');
+
+    let entry = byUser.get(userId);
+    if (!entry) {
+      // The first row seen for a user decides the display email.
+      entry = { email: String(log.email || log.user_id || 'unknown'), week: 0, month: 0 };
+      byUser.set(userId, entry);
+    }
+
+    if (ts >= monthAgo) entry.month++;
+    if (ts >= weekAgo) entry.week++;
+  }
+
+  const sorted = [...byUser.entries()]
+    .map(([userId, data]) => ({
+      userId,
+      email: data.email,
+      shipsThisWeek: data.week,
+      shipsThisMonth: data.month,
+    }))
+    .sort((a, b) => b.shipsThisWeek - a.shipsThisWeek || b.shipsThisMonth - a.shipsThisMonth);
+
+  const teamWeek = sorted.reduce((s, u) => s + u.shipsThisWeek, 0);
+  const teamMonth = sorted.reduce((s, u) => s + u.shipsThisMonth, 0);
+
+  return {
+    byUser: sorted,
+    teamTotal: { week: teamWeek, month: teamMonth },
+  };
+}
+
+/**
+ * Compute weekly cost trend from eval runs.
+ *
+ * @param evalRuns Eval run rows. Fields read: `timestamp` (falling back to
+ *   `created_at`), `total_cost_usd`.
+ * @returns Weekly buckets, most recent week first, and the all-time total.
+ *   Runs with a missing or unparseable timestamp are left out of the weekly
+ *   buckets but still count toward `totalAllTime`.
+ */
+export function computeCostTrend(evalRuns: Record<string, unknown>[]): CostTrendResult {
+  const byWeek = new Map<string, { cost: number; runs: number }>();
+  let totalAllTime = 0;
+
+  for (const run of evalRuns) {
+    const cost = Number(run.total_cost_usd) || 0;
+    totalAllTime += cost;
+
+    const ts = toEpochMs(run.timestamp || run.created_at);
+    // An invalid date cannot be bucketed (toISOString would throw on it).
+    if (Number.isNaN(ts)) continue;
+
+    const ws = weekStart(ts);
+    const entry = byWeek.get(ws) || { cost: 0, runs: 0 };
+    entry.cost += cost;
+    entry.runs++;
+    byWeek.set(ws, entry);
+  }
+
+  const weekly = [...byWeek.entries()]
+    .map(([ws, data]) => ({ weekStart: ws, totalCost: data.cost, runs: data.runs }))
+    .sort((a, b) => b.weekStart.localeCompare(a.weekStart));
+
+  return { weekly, totalAllTime };
+}
+
+/**
+ * Compute team leaderboard for the current week.
+ *
+ * @param opts.evalRuns Eval run rows, dated by `timestamp` then `created_at`.
+ * @param opts.shipLogs Ship log rows, dated by `created_at` then `timestamp`.
+ * @param opts.sessions Session rows, dated by `started_at` then `created_at`.
+ * @param opts.now Epoch milliseconds the 7-day window is measured back from;
+ *   defaults to Date.now().
+ * @returns One entry per user with at least one row inside the window,
+ *   sorted by ships, then eval runs, then sessions (all descending). Rows
+ *   with a missing or unparseable timestamp are ignored.
+ */
+export function computeLeaderboard(opts: {
+  evalRuns: Record<string, unknown>[];
+  shipLogs: Record<string, unknown>[];
+  sessions: Record<string, unknown>[];
+  now?: number;
+}): LeaderboardEntry[] {
+  const { evalRuns, shipLogs, sessions, now = Date.now() } = opts;
+  const weekAgo = daysAgo(7, now);
+
+  const users = new Map<string, LeaderboardEntry>();
+
+  // Look up (or create) the entry for the user a row belongs to.
+  function getUser(row: Record<string, unknown>): LeaderboardEntry {
+    const userId = String(row.user_id || 'unknown');
+    let user = users.get(userId);
+    if (!user) {
+      const email = String(row.email || row.user_id || 'unknown');
+      user = { userId, email, ships: 0, evalRuns: 0, sessions: 0, avgPassRate: null, totalCost: 0 };
+      users.set(userId, user);
+    }
+    return user;
+  }
+
+  // Written as a positive test so that a NaN timestamp is rejected.
+  const inWindow = (value: unknown): boolean => toEpochMs(value) >= weekAgo;
+
+  // Count eval runs this week
+  const passRates = new Map<string, number[]>();
+  for (const r of evalRuns) {
+    if (!inWindow(r.timestamp || r.created_at)) continue;
+    const user = getUser(r);
+    user.evalRuns++;
+    user.totalCost += Number(r.total_cost_usd) || 0;
+
+    const rate = safePassRate(r.passed, r.total_tests);
+    if (rate !== null) {
+      const arr = passRates.get(user.userId) || [];
+      arr.push(rate);
+      passRates.set(user.userId, arr);
+    }
+  }
+
+  // Count ships this week
+  for (const log of shipLogs) {
+    if (!inWindow(log.created_at || log.timestamp)) continue;
+    getUser(log).ships++;
+  }
+
+  // Count sessions this week
+  for (const s of sessions) {
+    if (!inWindow(s.started_at || s.created_at)) continue;
+    getUser(s).sessions++;
+  }
+
+  // Compute avg pass rates
+  for (const [userId, rates] of passRates) {
+    const user = users.get(userId);
+    if (user && rates.length > 0) {
+      user.avgPassRate = rates.reduce((a, b) => a + b, 0) / rates.length;
+    }
+  }
+
+  // Sort by ships (primary), then eval runs, then sessions
+  return [...users.values()].sort((a, b) =>
+    b.ships - a.ships || b.evalRuns - a.evalRuns || b.sessions - a.sessions
+  );
+}
+
+/**
+ * Compute QA health score trends grouped by repo.
+ *
+ * @param qaReports QA report rows. Fields read: `repo_slug`, `created_at`,
+ *   `health_score`.
+ * @returns Repos sorted by slug; each repo's scores are newest first by the
+ *   date part (first 10 characters) of `created_at`. A missing or
+ *   non-numeric `health_score` is recorded as 0.
+ */
+export function computeQATrend(qaReports: Record<string, unknown>[]): QATrendResult {
+  const byRepo = new Map<string, Array<{ date: string; score: number }>>();
+
+  for (const r of qaReports) {
+    const repoSlug = String(r.repo_slug || 'unknown');
+    const date = String(r.created_at || '').slice(0, 10);
+    const score = Number(r.health_score) || 0;
+
+    const scores = byRepo.get(repoSlug);
+    if (scores) scores.push({ date, score });
+    else byRepo.set(repoSlug, [{ date, score }]);
+  }
+
+  // Sort each repo's scores by date descending
+  const result: QARepoTrend[] = [];
+  for (const [repoSlug, scores] of byRepo) {
+    scores.sort((a, b) => b.date.localeCompare(a.date));
+    result.push({ repoSlug, scores });
+  }
+
+  return { byRepo: result.sort((a, b) => a.repoSlug.localeCompare(b.repoSlug)) };
+}
+
+/**
+ * Compute per-test pass rate trends and flaky test detection.
+ *
+ * @param evalRuns Eval run rows in any order. Fields read: `timestamp`
+ *   (falling back to `created_at`), `tests`.
+ * @returns One trend per named test, flaky tests first and then by pass
+ *   rate ascending (worst first). Each history is sorted oldest first.
+ */
+export function computeEvalTrend(evalRuns: Record<string, unknown>[]): EvalTrendResult {
+  const byTest = new Map<string, Array<{ timestamp: string; passed: boolean }>>();
+
+  for (const run of evalRuns) {
+    const ts = String(run.timestamp || run.created_at || '');
+
+    for (const t of namedTests(run)) {
+      const point = { timestamp: ts, passed: t.passed };
+      const history = byTest.get(t.name);
+      if (history) history.push(point);
+      else byTest.set(t.name, [point]);
+    }
+  }
+
+  const result: EvalTestTrend[] = [];
+  for (const [testName, history] of byTest) {
+    // Sort by timestamp ascending for trend display
+    history.sort((a, b) => a.timestamp.localeCompare(b.timestamp));
+
+    const passCount = history.filter(h => h.passed).length;
+    // history is never empty: an entry only exists once a result was pushed.
+    const passRate = (passCount / history.length) * 100;
+
+    // Flaky = at least 3 results and a pass rate strictly between 20% and
+    // 80%, which also implies both passes and failures were seen.
+    const isFlaky = history.length >= 3 && passRate > 20 && passRate < 80;
+
+    result.push({ testName, history, passRate, isFlaky });
+  }
+
+  // Sort: flaky first, then by pass rate ascending (worst first)
+  return {
+    byTest: result.sort((a, b) => {
+      if (a.isFlaky !== b.isFlaky) return a.isFlaky ? -1 : 1;
+      return a.passRate - b.passRate;
+    }),
+  };
+}
diff --git a/test/lib-dashboard-queries.test.ts b/test/lib-dashboard-queries.test.ts
new file mode 100644
index 00000000..5e7baa2a
--- /dev/null
+++ b/test/lib-dashboard-queries.test.ts
@@ -0,0 +1,506 @@
+/**
+ * Tests for dashboard query/transform functions (pure, no network).
+ */
+
+import { describe, test, expect } from 'bun:test';
+import {
+  detectRegressions,
+  computeVelocity,
+  computeCostTrend,
+  computeLeaderboard,
+  computeQATrend,
+  computeEvalTrend,
+} from '../lib/dashboard-queries';
+
+// --- Helpers ---
+
+const now = new Date().toISOString();
+const daysAgo = (d: number) => new Date(Date.now() - d * 86_400_000).toISOString();
+const hoursAgo = (h: number) => new Date(Date.now() - h * 3_600_000).toISOString();
+
+// Pinned clock for the tests that inject `now` instead of using Date.now().
+const FIXED_NOW = Date.parse('2026-03-20T12:00:00Z');
+
+function makeEvalRun(overrides: Record<string, unknown> = {}) {
+  return {
+    timestamp: now,
+    user_id: 'u1',
+    email: 'alice@test.com',
+    branch: 'main',
+    passed: 8,
+    total_tests: 10,
+    total_cost_usd: 1.50,
+    tier: 'e2e',
+    tests: [],
+    ...overrides,
+  };
+}
+
+function makeShipLog(overrides: Record<string, unknown> = {}) {
+  return {
+    created_at: now,
+    user_id: 'u1',
+    email: 'alice@test.com',
+    version: '0.3.10',
+    branch: 'main',
+    pr_url: 'https://github.com/org/repo/pull/1',
+    ...overrides,
+  };
+}
+
+function makeSession(overrides: Record<string, unknown> = {}) {
+  return {
+    started_at: now,
+    ended_at: now,
+    user_id: 'u1',
+    email: 'alice@test.com',
+    repo_slug: 'org/repo',
+    total_turns: 10,
+    tools_used: ['Edit', 'Bash'],
+    summary: 'Did stuff',
+    ...overrides,
+  };
+}
+
+// --- detectRegressions ---
+
+describe('detectRegressions', () => {
+  test('returns empty for < 2 runs', () => {
+    const result = detectRegressions([makeEvalRun()]);
+    expect(result.regressions).toEqual([]);
+    expect(result.overallDelta).toBe(0);
+    expect(result.overallCurrentRate).toBeNull();
+  });
+
+  test('returns empty for empty array', () => {
+    const result = detectRegressions([]);
+    expect(result.regressions).toEqual([]);
+  });
+
+  test('detects overall regression', () => {
+    const runs = [
+      makeEvalRun({ passed: 5, total_tests: 10 }), // latest: 50%
+      makeEvalRun({ passed: 9, total_tests: 10, timestamp: daysAgo(1) }), // prev: 90%
+      makeEvalRun({ passed: 8, total_tests: 10, timestamp: daysAgo(2) }), // prev: 80%
+    ];
+    const result = detectRegressions(runs);
+    expect(result.overallCurrentRate).toBe(50);
+    expect(result.overallPreviousRate).toBe(85); // avg of 90 and 80
+    expect(result.overallDelta).toBe(-35);
+  });
+
+  test('detects per-test regressions', () => {
+    const runs = [
+      makeEvalRun({ passed: 1, total_tests: 2, tests: [
+        { name: 'test_a', passed: false },
+        { name: 'test_b', passed: true },
+      ]}),
+      makeEvalRun({ passed: 2, total_tests: 2, timestamp: daysAgo(1), tests: [
+        { name: 'test_a', passed: true },
+        { name: 'test_b', passed: true },
+      ]}),
+      makeEvalRun({ passed: 2, total_tests: 2, timestamp: daysAgo(2), tests: [
+        { name: 'test_a', passed: true },
+        { name: 'test_b', passed: true },
+      ]}),
+    ];
+    const result = detectRegressions(runs);
+    expect(result.regressions.length).toBe(1);
+    expect(result.regressions[0].testName).toBe('test_a');
+    expect(result.regressions[0].previousRate).toBe(100);
+    expect(result.regressions[0].currentRate).toBe(0);
+  });
+
+  test('handles total_tests = 0 gracefully', () => {
+    const runs = [
+      makeEvalRun({ passed: 0, total_tests: 0 }),
+      makeEvalRun({ passed: 5, total_tests: 10, timestamp: daysAgo(1) }),
+    ];
+    const result = detectRegressions(runs);
+    expect(result.overallCurrentRate).toBeNull();
+    expect(result.overallDelta).toBe(0);
+  });
+
+  test('no regression when pass rate improves', () => {
+    const runs = [
+      makeEvalRun({ passed: 10, total_tests: 10 }), // 100%
+      makeEvalRun({ passed: 5, total_tests: 10, timestamp: daysAgo(1) }), // 50%
+    ];
+    const result = detectRegressions(runs);
+    expect(result.overallDelta).toBe(50);
+    expect(result.regressions).toEqual([]);
+  });
+
+  test('ignores malformed tests payloads', () => {
+    const runs = [
+      makeEvalRun({ tests: [null, { name: 'test_a', passed: false }] }),
+      makeEvalRun({ timestamp: daysAgo(1), tests: { name: 'test_a' } }),
+      makeEvalRun({ timestamp: daysAgo(2), tests: [{ name: 'test_a', passed: true }] }),
+    ];
+    const result = detectRegressions(runs);
+    expect(result.regressions.map(r => r.testName)).toEqual(['test_a']);
+  });
+});
+
+// --- computeVelocity ---
+
+describe('computeVelocity', () => {
+  test('groups ships by user', () => {
+    const logs = [
+      makeShipLog({ user_id: 'u1', email: 'alice@test.com', created_at: hoursAgo(1) }),
+      makeShipLog({ user_id: 'u1', email: 'alice@test.com', created_at: hoursAgo(2) }),
+      makeShipLog({ user_id: 'u2', email: 'bob@test.com', created_at: hoursAgo(3) }),
+    ];
+    const result = computeVelocity(logs);
+
+    expect(result.teamTotal.week).toBe(3);
+    expect(result.byUser.length).toBe(2);
+    expect(result.byUser[0].email).toBe('alice@test.com');
+    expect(result.byUser[0].shipsThisWeek).toBe(2);
+    expect(result.byUser[1].email).toBe('bob@test.com');
+    expect(result.byUser[1].shipsThisWeek).toBe(1);
+  });
+
+  test('separates week from month', () => {
+    const logs = [
+      makeShipLog({ created_at: hoursAgo(1) }), // this week
+      makeShipLog({ created_at: daysAgo(10) }), // this month
+      makeShipLog({ created_at: daysAgo(20) }), // this month
+    ];
+    const result = computeVelocity(logs);
+
+    expect(result.teamTotal.week).toBe(1);
+    expect(result.teamTotal.month).toBe(3);
+  });
+
+  test('handles empty array', () => {
+    const result = computeVelocity([]);
+    expect(result.byUser).toEqual([]);
+    expect(result.teamTotal).toEqual({ week: 0, month: 0 });
+  });
+
+  test('sorts by weekly ships descending', () => {
+    const logs = [
+      makeShipLog({ user_id: 'u1', created_at: hoursAgo(1) }),
+      makeShipLog({ user_id: 'u2', created_at: hoursAgo(1) }),
+      makeShipLog({ user_id: 'u2', created_at: hoursAgo(2) }),
+      makeShipLog({ user_id: 'u2', created_at: hoursAgo(3) }),
+    ];
+    const result = computeVelocity(logs);
+    expect(result.byUser[0].userId).toBe('u2');
+    expect(result.byUser[0].shipsThisWeek).toBe(3);
+  });
+
+  test('compares timestamps as instants, not as strings', () => {
+    const logs = [
+      // Exactly 7 days before FIXED_NOW, written with an explicit offset.
+      makeShipLog({ created_at: '2026-03-13T12:00:00+00:00' }),
+      // 11:00 UTC on the 13th: an hour outside the 7-day window.
+      makeShipLog({ created_at: '2026-03-13T13:00:00+02:00' }),
+    ];
+    const result = computeVelocity(logs, 30, FIXED_NOW);
+
+    expect(result.teamTotal.week).toBe(1);
+    expect(result.teamTotal.month).toBe(2);
+  });
+
+  test('does not count rows without a usable timestamp', () => {
+    const logs = [
+      makeShipLog({ created_at: undefined }),
+      makeShipLog({ created_at: 'not-a-date' }),
+    ];
+    const result = computeVelocity(logs, 30, FIXED_NOW);
+
+    expect(result.byUser.length).toBe(1);
+    expect(result.teamTotal).toEqual({ week: 0, month: 0 });
+  });
+});
+
+// --- computeCostTrend ---
+
+describe('computeCostTrend', () => {
+  test('groups costs by week', () => {
+    const runs = [
+      makeEvalRun({ total_cost_usd: 2.00, timestamp: '2026-03-16T12:00:00Z' }), // Mon
+      makeEvalRun({ total_cost_usd: 3.00, timestamp: '2026-03-17T12:00:00Z' }), // Tue (same week)
+      makeEvalRun({ total_cost_usd: 1.50, timestamp: '2026-03-08T12:00:00Z' }), // prev week
+    ];
+    const result = computeCostTrend(runs);
+
+    expect(result.totalAllTime).toBe(6.50);
+    expect(result.weekly.length).toBe(2);
+    // Most recent week first
+    const firstWeek = result.weekly[0];
+    expect(firstWeek.runs).toBe(2);
+    expect(firstWeek.totalCost).toBe(5.00);
+  });
+
+  test('handles empty array', () => {
+    const result = computeCostTrend([]);
+    expect(result.weekly).toEqual([]);
+    expect(result.totalAllTime).toBe(0);
+  });
+
+  test('handles missing cost values', () => {
+    const runs = [
+      makeEvalRun({ total_cost_usd: undefined }),
+      makeEvalRun({ total_cost_usd: null }),
+    ];
+    const result = computeCostTrend(runs);
+    expect(result.totalAllTime).toBe(0);
+  });
+
+  test('skips runs with unparseable timestamps instead of throwing', () => {
+    const runs = [
+      makeEvalRun({ total_cost_usd: 2.00, timestamp: 'not-a-date' }),
+      makeEvalRun({ total_cost_usd: 1.00, timestamp: '2026-03-16T12:00:00Z' }),
+    ];
+    const result = computeCostTrend(runs);
+
+    expect(result.weekly).toEqual([{ weekStart: '2026-03-15', totalCost: 1.00, runs: 1 }]);
+    expect(result.totalAllTime).toBe(3.00);
+  });
+});
+
+// --- computeLeaderboard ---
+
+describe('computeLeaderboard', () => {
+  test('aggregates across data sources', () => {
+    const result = computeLeaderboard({
+      evalRuns: [
+        makeEvalRun({ user_id: 'u1', email: 'alice@test.com', passed: 8, total_tests: 10 }),
+        makeEvalRun({ user_id: 'u1', email: 'alice@test.com', passed: 10, total_tests: 10 }),
+      ],
+      shipLogs: [
+        makeShipLog({ user_id: 'u1', email: 'alice@test.com' }),
+      ],
+      sessions: [
+        makeSession({ user_id: 'u1', email: 'alice@test.com' }),
+        makeSession({ user_id: 'u1', email: 'alice@test.com' }),
+      ],
+    });
+
+    expect(result.length).toBe(1);
+    expect(result[0].email).toBe('alice@test.com');
+    expect(result[0].ships).toBe(1);
+    expect(result[0].evalRuns).toBe(2);
+    expect(result[0].sessions).toBe(2);
+    expect(result[0].avgPassRate).toBe(90); // avg of 80% and 100%
+    expect(result[0].totalCost).toBe(3.00);
+  });
+
+  test('sorts by ships, then eval runs, then sessions', () => {
+    const result = computeLeaderboard({
+      evalRuns: [
+        makeEvalRun({ user_id: 'u1', email: 'alice@test.com' }),
+      ],
+      shipLogs: [
+        makeShipLog({ user_id: 'u2', email: 'bob@test.com' }),
+        makeShipLog({ user_id: 'u2', email: 'bob@test.com' }),
+      ],
+      sessions: [],
+    });
+
+    expect(result[0].email).toBe('bob@test.com');
+    expect(result[0].ships).toBe(2);
+    expect(result[1].email).toBe('alice@test.com');
+  });
+
+  test('excludes data older than 7 days', () => {
+    const result = computeLeaderboard({
+      evalRuns: [
+        makeEvalRun({ user_id: 'u1', timestamp: daysAgo(10) }),
+      ],
+      shipLogs: [
+        makeShipLog({ user_id: 'u1', created_at: daysAgo(10) }),
+      ],
+      sessions: [
+        makeSession({ user_id: 'u1', started_at: daysAgo(10) }),
+      ],
+    });
+
+    expect(result.length).toBe(0);
+  });
+
+  test('handles all empty inputs', () => {
+    const result = computeLeaderboard({
+      evalRuns: [],
+      shipLogs: [],
+      sessions: [],
+    });
+    expect(result).toEqual([]);
+  });
+
+  test('handles eval runs with total_tests = 0', () => {
+    const result = computeLeaderboard({
+      evalRuns: [makeEvalRun({ passed: 0, total_tests: 0 })],
+      shipLogs: [],
+      sessions: [],
+    });
+    expect(result.length).toBe(1);
+    expect(result[0].avgPassRate).toBeNull();
+  });
+
+  test('multiple users sorted correctly with ties', () => {
+    const result = computeLeaderboard({
+      evalRuns: [
+        makeEvalRun({ user_id: 'u1', email: 'alice@test.com' }),
+        makeEvalRun({ user_id: 'u2', email: 'bob@test.com' }),
+      ],
+      shipLogs: [
+        makeShipLog({ user_id: 'u1', email: 'alice@test.com' }),
+        makeShipLog({ user_id: 'u2', email: 'bob@test.com' }),
+      ],
+      sessions: [
+        makeSession({ user_id: 'u1', email: 'alice@test.com' }),
+        makeSession({ user_id: 'u1', email: 'alice@test.com' }),
+        makeSession({ user_id: 'u2', email: 'bob@test.com' }),
+      ],
+    });
+
+    // Same ships (1), same eval runs (1), u1 has more sessions
+    expect(result[0].email).toBe('alice@test.com');
+    expect(result[1].email).toBe('bob@test.com');
+  });
+
+  test('measures the window from an injected clock', () => {
+    const result = computeLeaderboard({
+      evalRuns: [],
+      shipLogs: [
+        makeShipLog({ user_id: 'u1', created_at: '2026-03-19T12:00:00+00:00' }),
+        makeShipLog({ user_id: 'u2', created_at: '2026-03-10T12:00:00+00:00' }),
+        makeShipLog({ user_id: 'u3', created_at: 'not-a-date' }),
+      ],
+      sessions: [],
+      now: FIXED_NOW,
+    });
+
+    expect(result.map(u => u.userId)).toEqual(['u1']);
+  });
+});
+
+// --- computeQATrend ---
+
+describe('computeQATrend', () => {
+  test('groups scores by repo', () => {
+    const reports = [
+      { repo_slug: 'org/app', health_score: 85, created_at: '2026-03-15T12:00:00Z' },
+      { repo_slug: 'org/app', health_score: 90, created_at: '2026-03-14T12:00:00Z' },
+      { repo_slug: 'org/api', health_score: 70, created_at: '2026-03-15T12:00:00Z' },
+    ];
+    const result = computeQATrend(reports);
+
+    expect(result.byRepo.length).toBe(2);
+    const app = result.byRepo.find(r => r.repoSlug === 'org/app')!;
+    expect(app.scores.length).toBe(2);
+    // Most recent first
+    expect(app.scores[0].score).toBe(85);
+    expect(app.scores[1].score).toBe(90);
+  });
+
+  test('handles empty array', () => {
+    const result = computeQATrend([]);
+    expect(result.byRepo).toEqual([]);
+  });
+
+  test('handles missing health_score', () => {
+    const reports = [
+      { repo_slug: 'org/app', health_score: null, created_at: '2026-03-15T12:00:00Z' },
+    ];
+    const result = computeQATrend(reports);
+    expect(result.byRepo[0].scores[0].score).toBe(0);
+  });
+});
+
+// --- computeEvalTrend ---
+
+describe('computeEvalTrend', () => {
+  test('computes per-test pass rates', () => {
+    const runs = [
+      makeEvalRun({ timestamp: '2026-03-15T12:00:00Z', tests: [
+        { name: 'test_a', passed: true },
+        { name: 'test_b', passed: false },
+      ]}),
+      makeEvalRun({ timestamp: '2026-03-14T12:00:00Z', tests: [
+        { name: 'test_a', passed: true },
+        { name: 'test_b', passed: true },
+      ]}),
+    ];
+    const result = computeEvalTrend(runs);
+
+    const testA = result.byTest.find(t => t.testName === 'test_a')!;
+    expect(testA.passRate).toBe(100);
+    expect(testA.isFlaky).toBe(false);
+
+    const testB = result.byTest.find(t => t.testName === 'test_b')!;
+    expect(testB.passRate).toBe(50);
+  });
+
+  test('detects flaky tests', () => {
+    const runs = [
+      makeEvalRun({ timestamp: '2026-03-15T12:00:00Z', tests: [{ name: 'flaky', passed: true }] }),
+      makeEvalRun({ timestamp: '2026-03-14T12:00:00Z', tests: [{ name: 'flaky', passed: false }] }),
+      makeEvalRun({ timestamp: '2026-03-13T12:00:00Z', tests: [{ name: 'flaky', passed: true }] }),
+      makeEvalRun({ timestamp: '2026-03-12T12:00:00Z', tests: [{ name: 'flaky', passed: false }] }),
+    ];
+    const result = computeEvalTrend(runs);
+    const flaky = result.byTest.find(t => t.testName === 'flaky')!;
+    expect(flaky.isFlaky).toBe(true);
+    expect(flaky.passRate).toBe(50);
+  });
+
+  test('sorts flaky first, then by worst pass rate', () => {
+    const runs = [
+      makeEvalRun({ tests: [
+        { name: 'good', passed: true },
+        { name: 'flaky', passed: true },
+        { name: 'bad', passed: false },
+      ]}),
+      makeEvalRun({ timestamp: daysAgo(1), tests: [
+        { name: 'good', passed: true },
+        { name: 'flaky', passed: false },
+        { name: 'bad', passed: false },
+      ]}),
+      makeEvalRun({ timestamp: daysAgo(2), tests: [
+        { name: 'good', passed: true },
+        { name: 'flaky', passed: true },
+        { name: 'bad', passed: false },
+      ]}),
+    ];
+    const result = computeEvalTrend(runs);
+
+    // Flaky (2 of 3 results passed, ~67%, with 3+ results) should be first
+    expect(result.byTest[0].testName).toBe('flaky');
+    // Then bad (0%), then good (100%)
+    expect(result.byTest[1].testName).toBe('bad');
+    expect(result.byTest[2].testName).toBe('good');
+  });
+
+  test('handles empty array', () => {
+    const result = computeEvalTrend([]);
+    expect(result.byTest).toEqual([]);
+  });
+
+  test('handles tests without names', () => {
+    const runs = [
+      makeEvalRun({ tests: [{ passed: true }, { name: 'named', passed: true }] }),
+    ];
+    const result = computeEvalTrend(runs);
+    expect(result.byTest.length).toBe(1);
+    expect(result.byTest[0].testName).toBe('named');
+  });
+
+  test('history sorted ascending by timestamp', () => {
+    const runs = [
+      makeEvalRun({ timestamp: '2026-03-15T12:00:00Z', tests: [{ name: 'a', passed: true }] }),
+      makeEvalRun({ timestamp: '2026-03-13T12:00:00Z', tests: [{ name: 'a', passed: false }] }),
+      makeEvalRun({ timestamp: '2026-03-14T12:00:00Z', tests: [{ name: 'a', passed: true }] }),
+    ];
+    const result = computeEvalTrend(runs);
+    const a = result.byTest.find(t => t.testName === 'a')!;
+    // Should be sorted ascending: 13, 14, 15
+    expect(a.history[0].timestamp).toContain('2026-03-13');
+    expect(a.history[1].timestamp).toContain('2026-03-14');
+    expect(a.history[2].timestamp).toContain('2026-03-15');
+  });
+});