feat: eval persistence with auto-compare against previous run

EvalCollector accumulates test results during eval runs, writes JSON to
~/.gstack-dev/evals/{version}-{branch}-{tier}-{timestamp}.json, prints
a summary table, and automatically compares against the previous run.

- EvalCollector class with addTest() / finalize() / summary table
- findPreviousRun() prefers same branch, falls back to any branch
- compareEvalResults() matches tests by name, detects improved/regressed
- extractToolSummary() counts tool types from transcript events
- formatComparison() renders delta table with per-test + aggregate diffs
- Wire into skill-e2e.test.ts (recordE2E helper) and skill-llm-eval.test.ts
- 19 unit tests for collector + comparison functions
- schema_version: 1 for forward compatibility

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Garry Tan
2026-03-14 03:49:47 -05:00
parent e7347c2f8f
commit 84f52f3bad
4 changed files with 975 additions and 20 deletions
+333
View File
@@ -0,0 +1,333 @@
import { describe, test, expect, beforeEach, afterEach } from 'bun:test';
import * as fs from 'fs';
import * as path from 'path';
import * as os from 'os';
import {
EvalCollector,
extractToolSummary,
findPreviousRun,
compareEvalResults,
formatComparison,
} from './eval-store';
import type { EvalResult, EvalTestEntry } from './eval-store';
// Scratch directory for the test currently running. A fresh one is created
// before every test and removed afterwards, so files never leak between cases.
let tmpDir: string;

beforeEach(() => {
  const prefix = path.join(os.tmpdir(), 'eval-store-test-');
  tmpDir = fs.mkdtempSync(prefix);
});

afterEach(() => {
  // Best-effort cleanup: a failed removal must not fail the test itself.
  try {
    fs.rmSync(tmpDir, { recursive: true, force: true });
  } catch {
    /* ignore */
  }
});
// --- Helper to make a minimal test entry ---
/**
 * Build a passing 'e2e' test entry with fixed defaults.
 * Any field present in `overrides` replaces the corresponding default.
 */
function makeEntry(overrides?: Partial<EvalTestEntry>): EvalTestEntry {
  const defaults: EvalTestEntry = {
    name: 'test-1',
    suite: 'suite-1',
    tier: 'e2e',
    passed: true,
    duration_ms: 1000,
    cost_usd: 0.05,
  };
  return { ...defaults, ...overrides };
}
// --- Helper to make a minimal EvalResult ---
/**
 * Build an 'e2e' EvalResult on branch 'main' holding a single default entry.
 * Any field present in `overrides` replaces the corresponding default; the
 * totals are NOT recomputed from an overridden `tests` array.
 */
function makeResult(overrides?: Partial<EvalResult>): EvalResult {
  const defaults: EvalResult = {
    schema_version: 1,
    version: '0.3.6',
    branch: 'main',
    git_sha: 'abc1234',
    timestamp: '2026-03-14T12:00:00.000Z',
    hostname: 'test-host',
    tier: 'e2e',
    total_tests: 1,
    passed: 1,
    failed: 0,
    total_cost_usd: 0.05,
    total_duration_ms: 1000,
    tests: [makeEntry()],
  };
  return { ...defaults, ...overrides };
}
// --- EvalCollector tests ---
describe('EvalCollector', () => {
  /** Parse the JSON file a collector wrote. */
  const readResult = (filepath: string): EvalResult =>
    JSON.parse(fs.readFileSync(filepath, 'utf-8'));

  test('addTest accumulates entries', async () => {
    const collector = new EvalCollector('e2e', tmpDir);
    collector.addTest(makeEntry({ name: 'a' }));
    collector.addTest(makeEntry({ name: 'b' }));
    collector.addTest(makeEntry({ name: 'c' }));
    // The entries are private, so the written file is the only observable
    // record: every added entry must appear there, in insertion order.
    const data = readResult(await collector.finalize());
    expect(data.tests.map(t => t.name)).toEqual(['a', 'b', 'c']);
  });

  test('finalize writes JSON file to eval dir', async () => {
    const collector = new EvalCollector('e2e', tmpDir);
    collector.addTest(makeEntry());
    const filepath = await collector.finalize();
    expect(filepath).toBeTruthy();
    expect(fs.existsSync(filepath)).toBe(true);
    const data = readResult(filepath);
    expect(data.tests).toHaveLength(1);
    expect(data.tests[0].name).toBe('test-1');
  });

  test('written JSON has correct schema fields', async () => {
    const collector = new EvalCollector('e2e', tmpDir);
    collector.addTest(makeEntry({ passed: true, cost_usd: 0.10, duration_ms: 2000 }));
    collector.addTest(makeEntry({ name: 'test-2', passed: false, cost_usd: 0.05, duration_ms: 1000 }));
    const data = readResult(await collector.finalize());
    expect(data.schema_version).toBe(1);
    expect(data.tier).toBe('e2e');
    expect(data.total_tests).toBe(2);
    expect(data.passed).toBe(1);
    expect(data.failed).toBe(1);
    expect(data.total_cost_usd).toBe(0.15);
    expect(data.total_duration_ms).toBe(3000);
    expect(data.timestamp).toBeTruthy();
    expect(data.hostname).toBeTruthy();
  });

  test('finalize creates directory if missing', async () => {
    const nestedDir = path.join(tmpDir, 'nested', 'deep', 'evals');
    const collector = new EvalCollector('e2e', nestedDir);
    collector.addTest(makeEntry());
    const filepath = await collector.finalize();
    expect(fs.existsSync(filepath)).toBe(true);
  });

  test('double finalize does not write twice', async () => {
    const collector = new EvalCollector('e2e', tmpDir);
    collector.addTest(makeEntry());
    const filepath1 = await collector.finalize();
    const filepath2 = await collector.finalize();
    expect(filepath1).toBeTruthy();
    expect(filepath2).toBe(''); // second call returns empty
    expect(fs.readdirSync(tmpDir).filter(f => f.endsWith('.json'))).toHaveLength(1);
  });

  test('empty collector writes valid file', async () => {
    const collector = new EvalCollector('llm-judge', tmpDir);
    const data = readResult(await collector.finalize());
    expect(data.total_tests).toBe(0);
    expect(data.passed).toBe(0);
    expect(data.tests).toHaveLength(0);
    expect(data.tier).toBe('llm-judge');
  });
});
// --- extractToolSummary tests ---
describe('extractToolSummary', () => {
  // Small builders so each transcript reads as a list of turns.
  const toolUse = (name: string) => ({ type: 'tool_use', name, input: {} });
  const assistantTurn = (...content: object[]) => ({ type: 'assistant', message: { content } });

  test('counts tool types from transcript events', () => {
    const transcript = [
      { type: 'system', subtype: 'init' },
      assistantTurn(toolUse('Bash')),
      { type: 'user', tool_use_result: { stdout: '' } },
      assistantTurn({ type: 'text', text: 'ok' }, toolUse('Read')),
      assistantTurn(toolUse('Bash'), toolUse('Write')),
    ];
    expect(extractToolSummary(transcript)).toEqual({ Bash: 2, Read: 1, Write: 1 });
  });

  test('returns empty object for empty transcript', () => {
    const summary = extractToolSummary([]);
    expect(summary).toEqual({});
  });

  test('handles events with no content array', () => {
    // One assistant event with an empty message, one with no message at all.
    const withoutContent = [{ type: 'assistant', message: {} }, { type: 'assistant' }];
    expect(extractToolSummary(withoutContent)).toEqual({});
  });
});
// --- findPreviousRun tests ---
describe('findPreviousRun', () => {
  /** Write an eval result fixture named `name` into the per-test tmpDir. */
  const writeRun = (name: string, overrides: Partial<EvalResult>): void => {
    fs.writeFileSync(path.join(tmpDir, name), JSON.stringify(makeResult(overrides)));
  };
  // tmpDir is reassigned before every test, so this must be evaluated lazily.
  const currentFile = (): string => path.join(tmpDir, 'current.json');

  test('finds correct file — same branch preferred, most recent', () => {
    writeRun('0.3.5-main-e2e-20260312-100000.json', { branch: 'main', timestamp: '2026-03-12T10:00:00Z' });
    writeRun('0.3.5-feature-e2e-20260313-100000.json', { branch: 'feature', timestamp: '2026-03-13T10:00:00Z' });
    writeRun('0.3.6-feature-e2e-20260314-100000.json', { branch: 'feature', timestamp: '2026-03-14T10:00:00Z' });
    // The newest run overall lives on another branch. Without it the expected
    // file would also be the newest one and the test could not tell "same
    // branch preferred" apart from plain "most recent".
    writeRun('0.3.6-main-e2e-20260315-100000.json', { branch: 'main', timestamp: '2026-03-15T10:00:00Z' });

    // Should prefer feature branch (most recent on same branch)
    const result = findPreviousRun(tmpDir, 'e2e', 'feature', currentFile());
    expect(result).toContain('0.3.6-feature-e2e-20260314');
  });

  test('falls back to different branch when no same-branch match', () => {
    writeRun('0.3.5-main-e2e-20260312-100000.json', { branch: 'main', timestamp: '2026-03-12T10:00:00Z' });
    const result = findPreviousRun(tmpDir, 'e2e', 'new-branch', currentFile());
    expect(result).toContain('0.3.5-main-e2e');
  });

  test('returns null when no prior runs exist', () => {
    const result = findPreviousRun(tmpDir, 'e2e', 'main', currentFile());
    expect(result).toBeNull();
  });

  test('returns null when directory does not exist', () => {
    const result = findPreviousRun('/nonexistent/path', 'e2e', 'main', 'current.json');
    expect(result).toBeNull();
  });

  test('excludes the current file from results', () => {
    const filename = '0.3.6-main-e2e-20260314-100000.json';
    writeRun(filename, { branch: 'main', timestamp: '2026-03-14T10:00:00Z' });
    const result = findPreviousRun(tmpDir, 'e2e', 'main', path.join(tmpDir, filename));
    expect(result).toBeNull(); // only file is excluded
  });

  test('filters by tier', () => {
    writeRun('0.3.6-main-llm-judge-20260314-100000.json', {
      tier: 'llm-judge',
      branch: 'main',
      timestamp: '2026-03-14T10:00:00Z',
    });
    const result = findPreviousRun(tmpDir, 'e2e', 'main', 'current.json');
    expect(result).toBeNull(); // only llm-judge file, looking for e2e
  });
});
// --- compareEvalResults tests ---
describe('compareEvalResults', () => {
  test('detects improved/regressed/unchanged per test', () => {
    // Same three tests in both runs; only the pass/fail flags differ.
    const threeTests = (a: boolean, b: boolean, c: boolean) => [
      makeEntry({ name: 'test-a', passed: a }),
      makeEntry({ name: 'test-b', passed: b }),
      makeEntry({ name: 'test-c', passed: c }),
    ];
    const totals = { total_tests: 3, passed: 2, failed: 1 };
    const before = makeResult({ tests: threeTests(false, true, true), ...totals });
    // test-a improved, test-b regressed, test-c unchanged
    const after = makeResult({ tests: threeTests(true, false, true), ...totals });

    const result = compareEvalResults(before, after, 'before.json', 'after.json');
    const statusOf = (name: string) => result.deltas.find(d => d.name === name)?.status_change;

    expect(result.improved).toBe(1);
    expect(result.regressed).toBe(1);
    expect(result.unchanged).toBe(1);
    expect(statusOf('test-a')).toBe('improved');
    expect(statusOf('test-b')).toBe('regressed');
    expect(statusOf('test-c')).toBe('unchanged');
  });

  test('handles tests present in one run but not the other', () => {
    const passing = (name: string) => makeEntry({ name, passed: true });
    const before = makeResult({ tests: [passing('old-test'), passing('shared')] });
    const after = makeResult({ tests: [passing('shared'), passing('new-test')] });

    const { deltas } = compareEvalResults(before, after, 'before.json', 'after.json');
    expect(deltas).toHaveLength(3); // shared + new-test + old-test (removed)
    const removed = deltas.find(d => d.name.includes('old-test'));
    expect(removed?.name).toContain('removed');
  });

  test('computes cost and duration deltas', () => {
    const before = makeResult({ total_cost_usd: 2.00, total_duration_ms: 60000 });
    const after = makeResult({ total_cost_usd: 1.50, total_duration_ms: 45000 });
    const { total_cost_delta, total_duration_delta } = compareEvalResults(before, after, 'a.json', 'b.json');
    expect(total_cost_delta).toBe(-0.50);
    expect(total_duration_delta).toBe(-15000);
  });
});
// --- formatComparison tests ---
describe('formatComparison', () => {
  test('produces readable output with status arrows', () => {
    // The fixture type is derived from formatComparison's own parameter, so it
    // stays checked against the real shape without needing a name that this
    // file does not import.
    const comparison: Parameters<typeof formatComparison>[0] = {
      before_file: 'before.json',
      after_file: 'after.json',
      before_branch: 'main',
      after_branch: 'feature',
      before_timestamp: '2026-03-13T14:30:00Z',
      after_timestamp: '2026-03-14T14:30:00Z',
      deltas: [
        {
          name: 'browse basic',
          before: { passed: true, cost_usd: 0.07, tool_summary: { Bash: 3 } },
          after: { passed: true, cost_usd: 0.06, tool_summary: { Bash: 4 } },
          status_change: 'unchanged',
        },
        {
          name: 'planted bugs static',
          before: { passed: false, cost_usd: 1.00, detection_rate: 3, tool_summary: {} },
          after: { passed: true, cost_usd: 0.95, detection_rate: 4, tool_summary: {} },
          status_change: 'improved',
        },
      ],
      total_cost_delta: -0.06,
      total_duration_delta: -5000,
      improved: 1,
      regressed: 0,
      unchanged: 1,
      tool_count_before: 3,
      tool_count_after: 4,
    };
    const output = formatComparison(comparison);
    expect(output).toContain('vs previous');
    expect(output).toContain('main');
    expect(output).toContain('1 improved');
    expect(output).toContain('1 unchanged');
    expect(output).toContain('↑'); // improved arrow
    expect(output).toContain('='); // unchanged arrow
  });
});
+466
View File
@@ -0,0 +1,466 @@
/**
* Eval result persistence and comparison.
*
* EvalCollector accumulates test results, writes them to
* ~/.gstack-dev/evals/{version}-{branch}-{tier}-{timestamp}.json,
* prints a summary table, and auto-compares with the previous run.
*
* Comparison functions are exported for reuse by the eval:compare CLI.
*/
import * as fs from 'fs';
import * as path from 'path';
import * as os from 'os';
import { spawnSync } from 'child_process';
// Value written as `schema_version` into every result file produced by EvalCollector.
const SCHEMA_VERSION = 1;
// Directory used when EvalCollector is constructed without an explicit evalDir.
const DEFAULT_EVAL_DIR = path.join(os.homedir(), '.gstack-dev', 'evals');
// --- Interfaces ---
/**
 * One test's recorded outcome, as handed to EvalCollector.addTest() and stored
 * in the `tests` array of an EvalResult.
 *
 * The first six fields are required. The remaining ones are optional and are
 * grouped below by the kind of eval that is expected to fill them in.
 */
export interface EvalTestEntry {
// Test name; compareEvalResults() matches entries across runs by this value.
name: string;
suite: string;
tier: 'e2e' | 'llm-judge';
passed: boolean;
// Summed into EvalResult.total_duration_ms by EvalCollector.finalize().
duration_ms: number;
// Summed (then rounded to cents) into EvalResult.total_cost_usd.
cost_usd: number;
// E2E
// Transcript events; extractToolSummary() counts the `tool_use` items found
// in the `assistant` events of this array.
transcript?: any[];
prompt?: string;
output?: string;
turns_used?: number;
browse_errors?: string[];
// LLM judge
// Printed in the summary table as `<first letter of key>:<value>` pairs.
judge_scores?: Record<string, number>;
judge_reasoning?: string;
// Outcome eval
// Printed as `<detection_rate>/<detected + missed> det`, so presumably a count
// of detected bugs rather than a ratio — TODO confirm with the producer.
detection_rate?: number;
false_positives?: number;
evidence_quality?: number;
detected_bugs?: string[];
missed_bugs?: string[];
error?: string;
}
/**
 * Contents of one persisted eval run file, as assembled and written by
 * EvalCollector.finalize().
 */
export interface EvalResult {
// Format version of this file; finalize() writes SCHEMA_VERSION here.
schema_version: number;
// `version` field of package.json, or 'unknown' when it cannot be read.
version: string;
// Git branch and short SHA at the time of the run, or 'unknown'.
branch: string;
git_sha: string;
// ISO-8601 string from Date#toISOString(); findPreviousRun() orders runs by it.
timestamp: string;
hostname: string;
// findPreviousRun() only considers files whose tier equals the requested one.
tier: 'e2e' | 'llm-judge';
total_tests: number;
passed: number;
// Computed as total_tests - passed.
failed: number;
// Sum of per-test cost_usd, rounded to two decimals.
total_cost_usd: number;
// Sum of per-test duration_ms.
total_duration_ms: number;
tests: EvalTestEntry[];
}
/**
 * Before/after snapshot of a single test, produced by compareEvalResults().
 */
export interface TestDelta {
// Test name. compareEvalResults() appends " (removed)" for a test that exists
// only in the earlier run.
name: string;
// State in the earlier run. For a test with no earlier counterpart this is
// filled with passed=false and cost_usd=0.
before: { passed: boolean; cost_usd: number; turns_used?: number;
detection_rate?: number; tool_summary?: Record<string, number> };
// State in the later run. For a removed test this is passed=false, cost_usd=0.
after: { passed: boolean; cost_usd: number; turns_used?: number;
detection_rate?: number; tool_summary?: Record<string, number> };
// 'improved' = fail -> pass, 'regressed' = pass -> fail. Everything else,
// including new and removed tests, is reported as 'unchanged'.
status_change: 'improved' | 'regressed' | 'unchanged';
}
/**
 * Outcome of comparing two eval runs; returned by compareEvalResults() and
 * rendered by formatComparison().
 */
export interface ComparisonResult {
// File names passed through unchanged from the compareEvalResults() arguments.
before_file: string;
after_file: string;
// Copied from the `branch` / `timestamp` fields of the two EvalResults.
before_branch: string;
after_branch: string;
before_timestamp: string;
after_timestamp: string;
// One entry per test of the later run (in its order), followed by one entry
// per test that only exists in the earlier run.
deltas: TestDelta[];
// later minus earlier, so negative values mean cheaper / faster.
total_cost_delta: number;
total_duration_delta: number;
// Counts of deltas by status_change. `unchanged` also counts new and removed tests.
improved: number;
regressed: number;
unchanged: number;
// Tool calls summed over all tests of the earlier / later run, derived from
// the transcripts via extractToolSummary().
tool_count_before: number;
tool_count_after: number;
}
// --- Comparison functions (exported for eval:compare CLI) ---
/**
 * Extract tool call counts from a transcript.
 *
 * Only events whose `type` is `'assistant'` are inspected. Within each of
 * them, every `message.content` item of type `'tool_use'` is tallied under its
 * `name` (or under `'unknown'` when the name is missing or empty). Events that
 * are null, have no message, or whose content is not an array are skipped.
 *
 * @param transcript - Transcript events to scan.
 * @returns Plain object mapping tool name to call count,
 *   e.g. `{ Bash: 8, Read: 3, Write: 1 }`; `{}` when nothing was counted.
 */
export function extractToolSummary(transcript: any[]): Record<string, number> {
  // Tally in a Map rather than an object literal: a tool called "constructor"
  // or "toString" would otherwise start from the inherited Object.prototype
  // member instead of zero and produce a garbage count.
  const tally = new Map<string, number>();
  for (const event of transcript) {
    if (event?.type !== 'assistant') continue;
    const content = event.message?.content;
    // Content can be absent or a non-array value; nothing to count then.
    if (!Array.isArray(content)) continue;
    for (const item of content) {
      if (item?.type !== 'tool_use') continue;
      const name = String(item.name || 'unknown');
      tally.set(name, (tally.get(name) ?? 0) + 1);
    }
  }
  // Object.fromEntries defines own data properties, so even "__proto__" ends
  // up as an ordinary key of the returned object.
  return Object.fromEntries(tally);
}
/**
 * Find the most recent prior eval file for comparison.
 * Prefers same branch, falls back to any branch.
 *
 * Every `*.json` file in `evalDir` is read and parsed in full; files that
 * cannot be read or parsed, or whose `tier` differs, are ignored. "Most
 * recent" is decided by the `timestamp` field stored in each file (ISO-8601
 * strings, compared by code unit), not by the file name or mtime.
 *
 * @param evalDir - Directory containing eval result files.
 * @param tier - Only files whose `tier` field equals this value are considered.
 * @param branch - Branch whose runs are preferred.
 * @param excludeFile - Path of the run being compared; any file in `evalDir`
 *   with the same base name is skipped.
 * @returns Full path of the chosen file, or `null` when the directory cannot
 *   be read or holds no matching run.
 */
export function findPreviousRun(
  evalDir: string,
  tier: string,
  branch: string,
  excludeFile: string,
): string | null {
  let files: string[];
  try {
    files = fs.readdirSync(evalDir).filter(f => f.endsWith('.json'));
  } catch {
    return null; // dir doesn't exist
  }

  const excludedName = path.basename(excludeFile);
  const candidates: Array<{ file: string; branch: string; timestamp: string }> = [];
  for (const file of files) {
    if (file === excludedName) continue;
    const fullPath = path.join(evalDir, file);
    try {
      const data: unknown = JSON.parse(fs.readFileSync(fullPath, 'utf-8'));
      if (typeof data !== 'object' || data === null) continue;
      const meta = data as Record<string, unknown>;
      if (meta.tier !== tier) continue;
      candidates.push({
        file: fullPath,
        // The files are unvalidated JSON: anything that is not a string is
        // treated as missing so that one odd file cannot break the sort below.
        branch: typeof meta.branch === 'string' ? meta.branch : '',
        timestamp: typeof meta.timestamp === 'string' ? meta.timestamp : '',
      });
    } catch {
      continue; // unreadable or malformed file
    }
  }
  if (candidates.length === 0) return null;

  // Newest first. Plain code-unit comparison keeps the order independent of
  // the process locale; entries without a timestamp sort last.
  candidates.sort((a, b) => (a.timestamp < b.timestamp ? 1 : a.timestamp > b.timestamp ? -1 : 0));

  // Prefer same branch, otherwise the newest run on any branch.
  const chosen = candidates.find(e => e.branch === branch) ?? candidates[0];
  return chosen ? chosen.file : null;
}
/** Total number of tool calls in a per-tool summary. */
function totalToolCalls(summary: Record<string, number>): number {
  return Object.values(summary).reduce((sum, n) => sum + n, 0);
}

/** Build one side (before or after) of a TestDelta from a recorded test entry. */
function deltaSideOf(entry: EvalTestEntry, toolSummary: Record<string, number>): TestDelta['before'] {
  return {
    passed: entry.passed,
    cost_usd: entry.cost_usd,
    turns_used: entry.turns_used,
    detection_rate: entry.detection_rate,
    tool_summary: toolSummary,
  };
}

/** Round a dollar amount to whole cents, never yielding negative zero. */
function roundToCents(amount: number): number {
  return Math.round(amount * 100) / 100 + 0;
}

/**
 * Compare two eval results. Matches tests by name.
 *
 * The returned `deltas` list holds one entry per test of `after` (in its
 * order), followed by one entry named `"<name> (removed)"` for every test that
 * only exists in `before`. A test counts as improved when it went from failing
 * to passing and as regressed when it went from passing to failing; all other
 * cases — including new and removed tests — count as unchanged.
 *
 * @param before - The earlier run.
 * @param after - The later run.
 * @param beforeFile - Label for the earlier run, copied into the result.
 * @param afterFile - Label for the later run, copied into the result.
 * @returns Per-test deltas plus aggregate cost, duration and tool-call figures.
 *   Cost and duration deltas are `after - before`; the cost delta is rounded
 *   to cents, matching how the stored totals are rounded.
 */
export function compareEvalResults(
  before: EvalResult,
  after: EvalResult,
  beforeFile: string,
  afterFile: string,
): ComparisonResult {
  const deltas: TestDelta[] = [];
  let improved = 0, regressed = 0, unchanged = 0;
  let toolCountBefore = 0, toolCountAfter = 0;

  // Index before tests by name. Results come from unvalidated JSON files, so a
  // missing `tests` array is treated as "no tests" rather than crashing.
  const beforeByName = new Map<string, EvalTestEntry>();
  for (const t of before.tests ?? []) {
    beforeByName.set(t.name, t);
  }

  // Walk after tests, match by name
  for (const afterTest of after.tests ?? []) {
    const beforeTest = beforeByName.get(afterTest.name);
    const beforeTools = beforeTest?.transcript ? extractToolSummary(beforeTest.transcript) : {};
    const afterTools = afterTest.transcript ? extractToolSummary(afterTest.transcript) : {};
    toolCountBefore += totalToolCalls(beforeTools);
    toolCountAfter += totalToolCalls(afterTools);

    let statusChange: TestDelta['status_change'] = 'unchanged';
    if (beforeTest && !beforeTest.passed && afterTest.passed) {
      statusChange = 'improved';
      improved++;
    } else if (beforeTest && beforeTest.passed && !afterTest.passed) {
      statusChange = 'regressed';
      regressed++;
    } else {
      // Same outcome as before, or a new test with no prior data.
      unchanged++;
    }

    deltas.push({
      name: afterTest.name,
      before: beforeTest
        ? deltaSideOf(beforeTest, beforeTools)
        : { passed: false, cost_usd: 0, turns_used: undefined, detection_rate: undefined, tool_summary: beforeTools },
      after: deltaSideOf(afterTest, afterTools),
      status_change: statusChange,
    });
    // Whatever is left in the map afterwards was removed in the later run.
    beforeByName.delete(afterTest.name);
  }

  // Tests that were in before but not in after (removed tests)
  for (const [name, beforeTest] of beforeByName) {
    const beforeTools = beforeTest.transcript ? extractToolSummary(beforeTest.transcript) : {};
    toolCountBefore += totalToolCalls(beforeTools);
    unchanged++;
    deltas.push({
      name: `${name} (removed)`,
      before: deltaSideOf(beforeTest, beforeTools),
      after: { passed: false, cost_usd: 0, tool_summary: {} },
      status_change: 'unchanged',
    });
  }

  return {
    before_file: beforeFile,
    after_file: afterFile,
    before_branch: before.branch,
    after_branch: after.branch,
    before_timestamp: before.timestamp,
    after_timestamp: after.timestamp,
    deltas,
    // Subtracting two cent amounts can leave float noise (0.3 - 0.1 is not 0.2).
    total_cost_delta: roundToCents(after.total_cost_usd - before.total_cost_usd),
    total_duration_delta: after.total_duration_ms - before.total_duration_ms,
    improved,
    regressed,
    unchanged,
    tool_count_before: toolCountBefore,
    tool_count_after: toolCountAfter,
  };
}
/**
 * Format a ComparisonResult as a readable string.
 *
 * Layout: a "vs previous" header, one line per test delta
 * (`name  BEFORE → AFTER  arrow  detail`), then aggregate status, cost,
 * duration and tool-call lines, followed by one line for every tool whose
 * total call count changed. The per-test detail shows detection rates when
 * either side has one and the per-test cost otherwise. Arrows are `↑`
 * (improved), `↓` (regressed) and `=` (unchanged).
 *
 * @param c - Comparison to render.
 * @returns Multi-line text starting with a newline and without a trailing one.
 */
export function formatComparison(c: ComparisonResult): string {
  const lines: string[] = [];
  const rule = '─'.repeat(70);

  const ts = c.before_timestamp ? c.before_timestamp.replace('T', ' ').slice(0, 16) : 'unknown';
  lines.push(`\nvs previous: ${c.before_branch}/${c.deltas.length ? 'eval' : ''} (${ts})`);
  lines.push(rule);

  // Per-test deltas
  for (const d of c.deltas) {
    const arrow = d.status_change === 'improved' ? '↑' : d.status_change === 'regressed' ? '↓' : '=';
    const beforeStatus = d.before.passed ? 'PASS' : 'FAIL';
    const afterStatus = d.after.passed ? 'PASS' : 'FAIL';

    // Before and after values are joined with "→"; without a separator
    // "3" and "4" would read as "34".
    let detail: string;
    if (d.before.detection_rate !== undefined || d.after.detection_rate !== undefined) {
      detail = ` ${d.before.detection_rate ?? '?'}→${d.after.detection_rate ?? '?'} det`;
    } else {
      detail = ` $${d.before.cost_usd.toFixed(2)}→$${d.after.cost_usd.toFixed(2)}`;
    }

    const name = d.name.length > 35 ? d.name.slice(0, 32) + '...' : d.name.padEnd(35);
    lines.push(` ${name} ${beforeStatus.padEnd(5)}→ ${afterStatus.padEnd(5)} ${arrow}${detail}`);
  }
  lines.push(rule);

  // Totals
  const parts: string[] = [];
  if (c.improved > 0) parts.push(`${c.improved} improved`);
  if (c.regressed > 0) parts.push(`${c.regressed} regressed`);
  if (c.unchanged > 0) parts.push(`${c.unchanged} unchanged`);
  lines.push(` Status: ${parts.join(', ')}`);

  // Work in whole cents so the sign is decided on the value actually shown,
  // and put it in front of the dollar sign for both directions.
  const costCents = Math.round(c.total_cost_delta * 100);
  const costSign = costCents < 0 ? '-' : '+';
  lines.push(` Cost: ${costSign}$${(Math.abs(costCents) / 100).toFixed(2)}`);

  const durDelta = Math.round(c.total_duration_delta / 1000);
  const durSign = durDelta >= 0 ? '+' : '';
  lines.push(` Duration: ${durSign}${durDelta}s`);

  const toolDelta = c.tool_count_after - c.tool_count_before;
  const toolSign = toolDelta >= 0 ? '+' : '';
  lines.push(` Tool calls: ${c.tool_count_before}→${c.tool_count_after} (${toolSign}${toolDelta})`);

  // Tool breakdown (show tools that changed). Totals are kept in Maps so tool
  // names matching Object.prototype members are summed correctly.
  const totalBefore = new Map<string, number>();
  const totalAfter = new Map<string, number>();
  const addInto = (totals: Map<string, number>, summary?: Record<string, number>): void => {
    for (const [tool, n] of Object.entries(summary ?? {})) {
      totals.set(tool, (totals.get(tool) ?? 0) + n);
    }
  };
  for (const d of c.deltas) {
    addInto(totalBefore, d.before.tool_summary);
    addInto(totalAfter, d.after.tool_summary);
  }
  const allTools = [...new Set([...totalBefore.keys(), ...totalAfter.keys()])].sort();
  for (const tool of allTools) {
    const b = totalBefore.get(tool) ?? 0;
    const a = totalAfter.get(tool) ?? 0;
    if (b !== a) {
      const diff = a - b;
      lines.push(` ${tool}: ${b}→${a} (${diff >= 0 ? '+' : ''}${diff})`);
    }
  }

  return lines.join('\n');
}
// --- EvalCollector ---
/**
 * Read the current git branch name and short commit SHA by running
 * `git rev-parse` (5 second timeout per call).
 * Each value falls back to 'unknown' when git produces no output; both fall
 * back when running git throws.
 */
function getGitInfo(): { branch: string; sha: string } {
  const fallback = 'unknown';
  // Runs `git rev-parse <flag> HEAD` and returns its trimmed stdout.
  const revParse = (flag: string): string => {
    const proc = spawnSync('git', ['rev-parse', flag, 'HEAD'], { stdio: 'pipe', timeout: 5000 });
    return proc.stdout?.toString().trim() || fallback;
  };

  try {
    const branch = revParse('--abbrev-ref');
    const sha = revParse('--short');
    return { branch, sha };
  } catch {
    return { branch: fallback, sha: fallback };
  }
}
/**
 * Read the `version` field of the package.json located two directories above
 * this module. Returns 'unknown' when the file cannot be read or parsed, or
 * when it has no (truthy) version.
 */
function getVersion(): string {
  const fallback = 'unknown';
  try {
    const manifestPath = path.resolve(__dirname, '..', '..', 'package.json');
    const manifestText = fs.readFileSync(manifestPath, 'utf-8');
    const { version } = JSON.parse(manifestText);
    return version || fallback;
  } catch {
    return fallback;
  }
}
/**
 * Compact an ISO-8601 timestamp (as produced by Date#toISOString) into the
 * `YYYYMMDD-HHMMSS` form used in eval file names.
 */
function toFileTimestamp(iso: string): string {
  // "-", ":" and "." all have to go before cutting to 15 characters. If the
  // date dashes stay in, the cut lands after the minutes and two runs within
  // the same minute get the same file name.
  return iso.replace(/[-:.]/g, '').replace('T', '-').slice(0, 15);
}

/**
 * Accumulates test results during an eval run and persists them.
 *
 * Usage: construct one collector per tier, call addTest() for every finished
 * test, then call finalize() once at the end of the run.
 */
export class EvalCollector {
  private tier: 'e2e' | 'llm-judge';
  private tests: EvalTestEntry[] = [];
  private finalized = false;
  private evalDir: string;

  /**
   * @param tier - Tier recorded in the result file and used in its name.
   * @param evalDir - Directory to write into; defaults to DEFAULT_EVAL_DIR.
   */
  constructor(tier: 'e2e' | 'llm-judge', evalDir?: string) {
    this.tier = tier;
    this.evalDir = evalDir || DEFAULT_EVAL_DIR;
  }

  /** Record one finished test. Entries are written in insertion order. */
  addTest(entry: EvalTestEntry): void {
    this.tests.push(entry);
  }

  /**
   * Write the collected results to
   * `{evalDir}/{version}-{branch}-{tier}-{YYYYMMDD-HHMMSS}.json`, print a
   * summary table to stderr and, when an earlier run of the same tier exists,
   * print a comparison against it.
   *
   * Only the first call does any work. A failure while comparing is reported
   * on stderr and does not affect the returned path.
   *
   * @returns Path of the written file, or `''` on every call after the first.
   */
  async finalize(): Promise<string> {
    if (this.finalized) return '';
    this.finalized = true;

    const git = getGitInfo();
    const version = getVersion();
    const timestamp = new Date().toISOString();
    const totalCost = this.tests.reduce((s, t) => s + t.cost_usd, 0);
    const totalDuration = this.tests.reduce((s, t) => s + t.duration_ms, 0);
    const passed = this.tests.filter(t => t.passed).length;

    const result: EvalResult = {
      schema_version: SCHEMA_VERSION,
      version,
      branch: git.branch,
      git_sha: git.sha,
      timestamp,
      hostname: os.hostname(),
      tier: this.tier,
      total_tests: this.tests.length,
      passed,
      failed: this.tests.length - passed,
      // Rounded to cents to drop floating-point noise from the summation.
      total_cost_usd: Math.round(totalCost * 100) / 100,
      total_duration_ms: totalDuration,
      tests: this.tests,
    };

    // Write eval file
    fs.mkdirSync(this.evalDir, { recursive: true });
    const dateStr = toFileTimestamp(timestamp);
    // Branch names may contain "/" and other characters unsafe in file names.
    const safeBranch = git.branch.replace(/[^a-zA-Z0-9._-]/g, '-');
    const filename = `${version}-${safeBranch}-${this.tier}-${dateStr}.json`;
    const filepath = path.join(this.evalDir, filename);
    fs.writeFileSync(filepath, JSON.stringify(result, null, 2) + '\n');

    // Print summary table
    this.printSummary(result, filepath, git);

    // Auto-compare with previous run
    try {
      const prevFile = findPreviousRun(this.evalDir, this.tier, git.branch, filepath);
      if (prevFile) {
        const prevResult: EvalResult = JSON.parse(fs.readFileSync(prevFile, 'utf-8'));
        const comparison = compareEvalResults(prevResult, result, prevFile, filepath);
        process.stderr.write(formatComparison(comparison) + '\n');
      } else {
        process.stderr.write('\nFirst run — no comparison available.\n');
      }
    } catch (err: unknown) {
      // Anything can be thrown; only Error instances carry a message.
      const message = err instanceof Error ? err.message : String(err);
      process.stderr.write(`\nCompare error: ${message}\n`);
    }

    return filepath;
  }

  /**
   * Print the per-test table and totals for `result` to stderr.
   * The detail column shows, in order of preference, the detection count,
   * the number of turns, or the judge scores.
   */
  private printSummary(result: EvalResult, filepath: string, git: { branch: string; sha: string }): void {
    const lines: string[] = [];
    lines.push('');
    lines.push(`Eval Results — v${result.version} @ ${git.branch} (${git.sha}) — ${this.tier}`);
    lines.push('═'.repeat(70));

    for (const t of this.tests) {
      const status = t.passed ? ' PASS ' : ' FAIL ';
      const cost = `$${t.cost_usd.toFixed(2)}`;

      let detail = '';
      if (t.detection_rate !== undefined) {
        detail = `${t.detection_rate}/${(t.detected_bugs?.length || 0) + (t.missed_bugs?.length || 0)} det`;
      } else if (t.turns_used !== undefined) {
        detail = `${t.turns_used} turns`;
      } else if (t.judge_scores) {
        // Each score is abbreviated to the first letter of its key.
        detail = Object.entries(t.judge_scores).map(([k, v]) => `${k[0]}:${v}`).join(' ');
      }

      const name = t.name.length > 38 ? t.name.slice(0, 35) + '...' : t.name.padEnd(38);
      lines.push(` ${name} ${status} ${cost.padStart(6)} ${detail}`);
    }

    lines.push('─'.repeat(70));
    const totalCost = `$${result.total_cost_usd.toFixed(2)}`;
    const totalDur = `${Math.round(result.total_duration_ms / 1000)}s`;
    lines.push(` Total: ${result.passed}/${result.total_tests} passed${' '.repeat(20)}${totalCost.padStart(6)} ${totalDur}`);
    lines.push(`Saved: ${filepath}`);

    process.stderr.write(lines.join('\n') + '\n');
  }
}