Files
gstack/test/lib-eval-trend.test.ts
Garry Tan daea165333 feat: add eval:trend CLI for per-test pass rate tracking
computeTrends() classifies tests as stable-pass/stable-fail/flaky/
improving/degrading based on pass rate, flip count, and recent streak.
gstack eval trend shows sparkline table with --limit, --tier, --test
filters. Guard CLI main block with import.meta.main to prevent
execution on import.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-15 16:47:41 -05:00

194 lines
6.8 KiB
TypeScript

/**
* Tests for computeTrends() — per-test pass rate trend tracking.
*/
import { describe, test, expect } from 'bun:test';
import { computeTrends } from '../lib/cli-eval';
import type { EvalResult } from './helpers/eval-store';
/**
 * Fixture factory: produce an EvalResult for one eval run.
 *
 * Only the timestamp, the tier and the per-test outcomes vary; every other
 * field is a fixed placeholder. The aggregate counters (total_tests, passed,
 * failed) are derived from `opts.tests`.
 *
 * @param opts.timestamp Timestamp string copied verbatim onto the run.
 * @param opts.tier Tier for the run and for each of its tests; 'e2e' when omitted.
 * @param opts.tests Name and pass/fail outcome of each test in the run.
 * @returns The assembled run record.
 */
function makeRun(opts: {
  timestamp: string;
  tier?: 'e2e' | 'llm-judge';
  tests: Array<{ name: string; passed: boolean }>;
}): EvalResult {
  const { timestamp, tests: outcomes } = opts;
  // Resolve the default once so the run and its test entries always agree.
  const tier = opts.tier || 'e2e';
  const passCount = outcomes.reduce((count, outcome) => (outcome.passed ? count + 1 : count), 0);

  const entries = outcomes.map(({ name, passed }) => ({
    name,
    suite: 'test',
    tier,
    passed,
    duration_ms: 0,
    cost_usd: 0,
  }));

  return {
    schema_version: 1,
    version: '0.3.3',
    branch: 'main',
    git_sha: 'abc',
    timestamp,
    hostname: 'test',
    tier,
    total_tests: outcomes.length,
    passed: passCount,
    // Outcomes are booleans, so everything that did not pass failed.
    failed: outcomes.length - passCount,
    total_cost_usd: 0,
    total_duration_ms: 0,
    tests: entries,
  };
}
/**
 * Suite for computeTrends(): status classification, streak and flip counting,
 * tier / test-name filtering, and result ordering.
 *
 * Every history is handed to computeTrends newest-first, which the original
 * fixture comments describe as the loadEvalResults order.
 */
describe('computeTrends', () => {
  /**
   * Turn a chronological (oldest-first) list of per-run test outcomes into
   * the newest-first EvalResult[] that the tests pass to computeTrends.
   *
   * Run `i` is stamped 2026-03-(10 + i), so callers must supply at most 22
   * runs to keep the dates valid.
   */
  const newestFirst = (
    chronological: Array<Array<{ name: string; passed: boolean }>>,
  ): EvalResult[] =>
    chronological
      .map((tests, i) => makeRun({
        timestamp: `2026-03-${String(10 + i).padStart(2, '0')}T00:00:00Z`,
        tests,
      }))
      .reverse();

  test('classifies stable-pass test correctly', () => {
    // 10 runs, all passing.
    const results = newestFirst(
      Array.from({ length: 10 }, () => [{ name: 'always-pass', passed: true }]),
    );
    const trends = computeTrends(results);
    expect(trends).toHaveLength(1);
    expect(trends[0].status).toBe('stable-pass');
    expect(trends[0].passRate).toBe(1);
    expect(trends[0].streak).toEqual({ type: 'pass', count: 10 });
    expect(trends[0].flipCount).toBe(0);
  });

  test('classifies stable-fail test correctly', () => {
    // 10 runs, all failing.
    const results = newestFirst(
      Array.from({ length: 10 }, () => [{ name: 'always-fail', passed: false }]),
    );
    const trends = computeTrends(results);
    expect(trends[0].status).toBe('stable-fail');
    expect(trends[0].passRate).toBe(0);
    expect(trends[0].streak).toEqual({ type: 'fail', count: 10 });
  });

  test('classifies flaky test correctly — alternating pass/fail', () => {
    // Even-indexed runs pass, odd-indexed runs fail: 5 passes, 9 transitions.
    const results = newestFirst(
      Array.from({ length: 10 }, (_, i) => [{ name: 'flaky', passed: i % 2 === 0 }]),
    );
    const trends = computeTrends(results);
    expect(trends[0].status).toBe('flaky');
    expect(trends[0].flipCount).toBe(9);
    expect(trends[0].passRate).toBe(0.5);
  });

  test('classifies improving test correctly', () => {
    // First 5 fail, last 5 pass
    const results = newestFirst(
      Array.from({ length: 10 }, (_, i) => [{ name: 'improving', passed: i >= 5 }]),
    );
    const trends = computeTrends(results);
    expect(trends[0].status).toBe('improving');
    expect(trends[0].streak).toEqual({ type: 'pass', count: 5 });
  });

  test('classifies degrading test correctly', () => {
    // First 7 pass, last 3 fail
    const results = newestFirst(
      Array.from({ length: 10 }, (_, i) => [{ name: 'degrading', passed: i < 7 }]),
    );
    const trends = computeTrends(results);
    expect(trends[0].status).toBe('degrading');
    expect(trends[0].streak).toEqual({ type: 'fail', count: 3 });
  });

  test('computes streak correctly with mixed ending', () => {
    // pass, pass, fail, pass, pass, pass (newest)
    const passed = [true, true, false, true, true, true];
    const results = newestFirst(passed.map(p => [{ name: 'test', passed: p }]));
    const trends = computeTrends(results);
    expect(trends[0].streak).toEqual({ type: 'pass', count: 3 });
  });

  test('computes flipCount correctly', () => {
    // pass, fail, pass, pass, fail, pass → 4 flips
    const passed = [true, false, true, true, false, true];
    const results = newestFirst(passed.map(p => [{ name: 'test', passed: p }]));
    const trends = computeTrends(results);
    expect(trends[0].flipCount).toBe(4);
  });

  test('handles single run', () => {
    const results = [makeRun({
      timestamp: '2026-03-15T00:00:00Z',
      tests: [{ name: 'single', passed: true }],
    })];
    const trends = computeTrends(results);
    expect(trends).toHaveLength(1);
    expect(trends[0].passRate).toBe(1);
    expect(trends[0].streak).toEqual({ type: 'pass', count: 1 });
    expect(trends[0].flipCount).toBe(0);
    expect(trends[0].status).toBe('stable-pass');
  });

  test('handles single failing run', () => {
    const results = [makeRun({
      timestamp: '2026-03-15T00:00:00Z',
      tests: [{ name: 'single-fail', passed: false }],
    })];
    const trends = computeTrends(results);
    expect(trends[0].status).toBe('stable-fail');
  });

  test('filters by tier', () => {
    // One run per tier, each containing a distinctly named test.
    const results = [
      makeRun({
        timestamp: '2026-03-15T00:00:00Z',
        tier: 'e2e',
        tests: [{ name: 'e2e-test', passed: true }],
      }),
      makeRun({
        timestamp: '2026-03-15T00:00:00Z',
        tier: 'llm-judge',
        tests: [{ name: 'judge-test', passed: true }],
      }),
    ];

    const e2eOnly = computeTrends(results, 'e2e');
    expect(e2eOnly).toHaveLength(1);
    expect(e2eOnly[0].name).toBe('e2e-test');

    const judgeOnly = computeTrends(results, 'llm-judge');
    expect(judgeOnly).toHaveLength(1);
    expect(judgeOnly[0].name).toBe('judge-test');
  });

  test('filters by test name', () => {
    const results = newestFirst(
      Array.from({ length: 3 }, () => [
        { name: 'test-a', passed: true },
        { name: 'test-b', passed: false },
      ]),
    );
    const filtered = computeTrends(results, undefined, 'test-a');
    expect(filtered).toHaveLength(1);
    expect(filtered[0].name).toBe('test-a');
    expect(filtered[0].passRate).toBe(1);
  });

  test('sorts flaky tests first', () => {
    // The flaky test is deliberately named and listed so that it would come
    // LAST under alphabetical or insertion order. It can only land at index 0
    // if the result is actually ordered by status.
    const results = newestFirst(
      Array.from({ length: 6 }, (_, i) => [
        { name: 'a-stable', passed: true },       // stable-pass
        { name: 'z-flaky', passed: i % 2 === 0 }, // flaky: alternating
      ]),
    );
    const trends = computeTrends(results);
    expect(trends).toHaveLength(2);
    expect(trends[0].name).toBe('z-flaky');
    expect(trends[0].status).toBe('flaky');
    expect(trends[1].name).toBe('a-stable');
    expect(trends[1].status).toBe('stable-pass');
  });
});