diff --git a/test/helpers/eval-store.test.ts b/test/helpers/eval-store.test.ts
new file mode 100644
index 00000000..64824c68
--- /dev/null
+++ b/test/helpers/eval-store.test.ts
@@ -0,0 +1,333 @@
+import { describe, test, expect, beforeEach, afterEach } from 'bun:test';
+import * as fs from 'fs';
+import * as path from 'path';
+import * as os from 'os';
+import {
+  EvalCollector,
+  extractToolSummary,
+  findPreviousRun,
+  compareEvalResults,
+  formatComparison,
+} from './eval-store';
+import type { ComparisonResult, EvalResult, EvalTestEntry } from './eval-store';
+
+let tmpDir: string;
+
+beforeEach(() => {
+  tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'eval-store-test-'));
+});
+
+afterEach(() => {
+  try { fs.rmSync(tmpDir, { recursive: true, force: true }); } catch {}
+});
+
+// --- Helper to make a minimal test entry ---
+
+function makeEntry(overrides?: Partial<EvalTestEntry>): EvalTestEntry {
+  return {
+    name: 'test-1',
+    suite: 'suite-1',
+    tier: 'e2e',
+    passed: true,
+    duration_ms: 1000,
+    cost_usd: 0.05,
+    ...overrides,
+  };
+}
+
+// --- Helper to make a minimal EvalResult ---
+
+function makeResult(overrides?: Partial<EvalResult>): EvalResult {
+  return {
+    schema_version: 1,
+    version: '0.3.6',
+    branch: 'main',
+    git_sha: 'abc1234',
+    timestamp: '2026-03-14T12:00:00.000Z',
+    hostname: 'test-host',
+    tier: 'e2e',
+    total_tests: 1,
+    passed: 1,
+    failed: 0,
+    total_cost_usd: 0.05,
+    total_duration_ms: 1000,
+    tests: [makeEntry()],
+    ...overrides,
+  };
+}
+
+// --- EvalCollector tests ---
+
+describe('EvalCollector', () => {
+  test('addTest accumulates entries', () => {
+    const collector = new EvalCollector('e2e', tmpDir);
+    collector.addTest(makeEntry({ name: 'a' }));
+    collector.addTest(makeEntry({ name: 'b' }));
+    collector.addTest(makeEntry({ name: 'c' }));
+    // We can't inspect tests directly, but finalize will write them
+  });
+
+  test('finalize writes JSON file to eval dir', async () => {
+    const collector = new EvalCollector('e2e', tmpDir);
+    collector.addTest(makeEntry());
+    const filepath = await collector.finalize();
+
+    expect(filepath).toBeTruthy();
+    expect(fs.existsSync(filepath)).toBe(true);
+
+    const data = JSON.parse(fs.readFileSync(filepath, 'utf-8'));
+    expect(data.tests).toHaveLength(1);
+    expect(data.tests[0].name).toBe('test-1');
+  });
+
+  test('written JSON has correct schema fields', async () => {
+    const collector = new EvalCollector('e2e', tmpDir);
+    collector.addTest(makeEntry({ passed: true, cost_usd: 0.10, duration_ms: 2000 }));
+    collector.addTest(makeEntry({ name: 'test-2', passed: false, cost_usd: 0.05, duration_ms: 1000 }));
+    const filepath = await collector.finalize();
+
+    const data: EvalResult = JSON.parse(fs.readFileSync(filepath, 'utf-8'));
+    expect(data.schema_version).toBe(1);
+    expect(data.tier).toBe('e2e');
+    expect(data.total_tests).toBe(2);
+    expect(data.passed).toBe(1);
+    expect(data.failed).toBe(1);
+    expect(data.total_cost_usd).toBe(0.15);
+    expect(data.total_duration_ms).toBe(3000);
+    expect(data.timestamp).toBeTruthy();
+    expect(data.hostname).toBeTruthy();
+  });
+
+  test('finalize creates directory if missing', async () => {
+    const nestedDir = path.join(tmpDir, 'nested', 'deep', 'evals');
+    const collector = new EvalCollector('e2e', nestedDir);
+    collector.addTest(makeEntry());
+    const filepath = await collector.finalize();
+    expect(fs.existsSync(filepath)).toBe(true);
+  });
+
+  test('double finalize does not write twice', async () => {
+    const collector = new EvalCollector('e2e', tmpDir);
+    collector.addTest(makeEntry());
+    const filepath1 = await collector.finalize();
+    const filepath2 = await collector.finalize();
+
+    expect(filepath1).toBeTruthy();
+    expect(filepath2).toBe(''); // second call returns empty
+    expect(fs.readdirSync(tmpDir).filter(f => f.endsWith('.json'))).toHaveLength(1);
+  });
+
+  test('empty collector writes valid file', async () => {
+    const collector = new EvalCollector('llm-judge', tmpDir);
+    const filepath = await collector.finalize();
+
+    const data: EvalResult = JSON.parse(fs.readFileSync(filepath, 'utf-8'));
+    expect(data.total_tests).toBe(0);
+    expect(data.passed).toBe(0);
+    expect(data.tests).toHaveLength(0);
+    expect(data.tier).toBe('llm-judge');
+  });
+});
+
+// --- extractToolSummary tests ---
+
+describe('extractToolSummary', () => {
+  test('counts tool types from transcript events', () => {
+    const transcript = [
+      { type: 'system', subtype: 'init' },
+      { type: 'assistant', message: { content: [
+        { type: 'tool_use', name: 'Bash', input: {} },
+      ] } },
+      { type: 'user', tool_use_result: { stdout: '' } },
+      { type: 'assistant', message: { content: [
+        { type: 'text', text: 'ok' },
+        { type: 'tool_use', name: 'Read', input: {} },
+      ] } },
+      { type: 'assistant', message: { content: [
+        { type: 'tool_use', name: 'Bash', input: {} },
+        { type: 'tool_use', name: 'Write', input: {} },
+      ] } },
+    ];
+
+    const summary = extractToolSummary(transcript);
+    expect(summary).toEqual({ Bash: 2, Read: 1, Write: 1 });
+  });
+
+  test('returns empty object for empty transcript', () => {
+    expect(extractToolSummary([])).toEqual({});
+  });
+
+  test('handles events with no content array', () => {
+    const transcript = [
+      { type: 'assistant', message: {} },
+      { type: 'assistant' },
+    ];
+    expect(extractToolSummary(transcript)).toEqual({});
+  });
+});
+
+// --- findPreviousRun tests ---
+
+describe('findPreviousRun', () => {
+  test('finds correct file — same branch preferred, most recent', () => {
+    // Write three eval files
+    const files = [
+      { name: '0.3.5-main-e2e-20260312-100000.json', data: makeResult({ branch: 'main', timestamp: '2026-03-12T10:00:00Z' }) },
+      { name: '0.3.5-feature-e2e-20260313-100000.json', data: makeResult({ branch: 'feature', timestamp: '2026-03-13T10:00:00Z' }) },
+      { name: '0.3.6-feature-e2e-20260314-100000.json', data: makeResult({ branch: 'feature', timestamp: '2026-03-14T10:00:00Z' }) },
+    ];
+    for (const f of files) {
+      fs.writeFileSync(path.join(tmpDir, f.name), JSON.stringify(f.data));
+    }
+
+    // Should prefer feature branch (most recent on same branch)
+    const result = findPreviousRun(tmpDir, 'e2e', 'feature', path.join(tmpDir, 'current.json'));
+    expect(result).toContain('0.3.6-feature-e2e-20260314');
+  });
+
+  test('falls back to different branch when no same-branch match', () => {
+    const files = [
+      { name: '0.3.5-main-e2e-20260312-100000.json', data: makeResult({ branch: 'main', timestamp: '2026-03-12T10:00:00Z' }) },
+    ];
+    for (const f of files) {
+      fs.writeFileSync(path.join(tmpDir, f.name), JSON.stringify(f.data));
+    }
+
+    const result = findPreviousRun(tmpDir, 'e2e', 'new-branch', path.join(tmpDir, 'current.json'));
+    expect(result).toContain('0.3.5-main-e2e');
+  });
+
+  test('returns null when no prior runs exist', () => {
+    const result = findPreviousRun(tmpDir, 'e2e', 'main', path.join(tmpDir, 'current.json'));
+    expect(result).toBeNull();
+  });
+
+  test('returns null when directory does not exist', () => {
+    const result = findPreviousRun('/nonexistent/path', 'e2e', 'main', 'current.json');
+    expect(result).toBeNull();
+  });
+
+  test('excludes the current file from results', () => {
+    const filename = '0.3.6-main-e2e-20260314-100000.json';
+    fs.writeFileSync(
+      path.join(tmpDir, filename),
+      JSON.stringify(makeResult({ branch: 'main', timestamp: '2026-03-14T10:00:00Z' })),
+    );
+
+    const result = findPreviousRun(tmpDir, 'e2e', 'main', path.join(tmpDir, filename));
+    expect(result).toBeNull(); // only file is excluded
+  });
+
+  test('filters by tier', () => {
+    fs.writeFileSync(
+      path.join(tmpDir, '0.3.6-main-llm-judge-20260314-100000.json'),
+      JSON.stringify(makeResult({ tier: 'llm-judge', branch: 'main', timestamp: '2026-03-14T10:00:00Z' })),
+    );
+
+    const result = findPreviousRun(tmpDir, 'e2e', 'main', 'current.json');
+    expect(result).toBeNull(); // only llm-judge file, looking for e2e
+  });
+});
+
+// --- compareEvalResults tests ---
+
+describe('compareEvalResults', () => {
+  test('detects improved/regressed/unchanged per test', () => {
+    const before = makeResult({
+      tests: [
+        makeEntry({ name: 'test-a', passed: false }),
+        makeEntry({ name: 'test-b', passed: true }),
+        makeEntry({ name: 'test-c', passed: true }),
+      ],
+      total_tests: 3, passed: 2, failed: 1,
+    });
+    const after = makeResult({
+      tests: [
+        makeEntry({ name: 'test-a', passed: true }), // improved
+        makeEntry({ name: 'test-b', passed: false }), // regressed
+        makeEntry({ name: 'test-c', passed: true }), // unchanged
+      ],
+      total_tests: 3, passed: 2, failed: 1,
+    });
+
+    const result = compareEvalResults(before, after, 'before.json', 'after.json');
+    expect(result.improved).toBe(1);
+    expect(result.regressed).toBe(1);
+    expect(result.unchanged).toBe(1);
+    expect(result.deltas.find(d => d.name === 'test-a')?.status_change).toBe('improved');
+    expect(result.deltas.find(d => d.name === 'test-b')?.status_change).toBe('regressed');
+    expect(result.deltas.find(d => d.name === 'test-c')?.status_change).toBe('unchanged');
+  });
+
+  test('handles tests present in one run but not the other', () => {
+    const before = makeResult({
+      tests: [
+        makeEntry({ name: 'old-test', passed: true }),
+        makeEntry({ name: 'shared', passed: true }),
+      ],
+    });
+    const after = makeResult({
+      tests: [
+        makeEntry({ name: 'shared', passed: true }),
+        makeEntry({ name: 'new-test', passed: true }),
+      ],
+    });
+
+    const result = compareEvalResults(before, after, 'before.json', 'after.json');
+    expect(result.deltas).toHaveLength(3); // shared + new-test + old-test (removed)
+    expect(result.deltas.find(d => d.name.includes('old-test'))?.name).toContain('removed');
+  });
+
+  test('computes cost and duration deltas', () => {
+    const before = makeResult({ total_cost_usd: 2.00, total_duration_ms: 60000 });
+    const after = makeResult({ total_cost_usd: 1.50, total_duration_ms: 45000 });
+
+    const result = compareEvalResults(before, after, 'a.json', 'b.json');
+    expect(result.total_cost_delta).toBe(-0.50);
+    expect(result.total_duration_delta).toBe(-15000);
+  });
+});
+
+// --- formatComparison tests ---
+
+describe('formatComparison', () => {
+  test('produces readable output with status arrows', () => {
+    const comparison: ComparisonResult = {
+      before_file: 'before.json',
+      after_file: 'after.json',
+      before_branch: 'main',
+      after_branch: 'feature',
+      before_timestamp: '2026-03-13T14:30:00Z',
+      after_timestamp: '2026-03-14T14:30:00Z',
+      deltas: [
+        {
+          name: 'browse basic',
+          before: { passed: true, cost_usd: 0.07, tool_summary: { Bash: 3 } },
+          after: { passed: true, cost_usd: 0.06, tool_summary: { Bash: 4 } },
+          status_change: 'unchanged',
+        },
+        {
+          name: 'planted bugs static',
+          before: { passed: false, cost_usd: 1.00, detection_rate: 3, tool_summary: {} },
+          after: { passed: true, cost_usd: 0.95, detection_rate: 4, tool_summary: {} },
+          status_change: 'improved',
+        },
+      ],
+      total_cost_delta: -0.06,
+      total_duration_delta: -5000,
+      improved: 1,
+      regressed: 0,
+      unchanged: 1,
+      tool_count_before: 3,
+      tool_count_after: 4,
+    };
+
+    const output = formatComparison(comparison);
+    expect(output).toContain('vs previous');
+    expect(output).toContain('main');
+    expect(output).toContain('1 improved');
+    expect(output).toContain('1 unchanged');
+    expect(output).toContain('↑'); // improved arrow
+    expect(output).toContain('='); // unchanged arrow
+  });
+});
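Note: rendered through formatComparison (implemented in the next file), the ComparisonResult fixture above produces roughly the following report. Column widths are approximated here; the real padding comes from the padEnd/padStart calls:

    vs previous: main/eval (2026-03-13 14:30)
    ──────────────────────────────────────────────────
      browse basic                        PASS  → PASS  = $0.07→$0.06
      planted bugs static                 FAIL  → PASS  ↑ 3→4 det
    ──────────────────────────────────────────────────
      Status: 1 improved, 1 unchanged
      Cost: $-0.06
      Duration: -5s
      Tool calls: 3 → 4 (+1)
        Bash: 3 → 4 (+1)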
diff --git a/test/helpers/eval-store.ts b/test/helpers/eval-store.ts
new file mode 100644
index 00000000..40e537eb
--- /dev/null
+++ b/test/helpers/eval-store.ts
@@ -0,0 +1,466 @@
+/**
+ * Eval result persistence and comparison.
+ *
+ * EvalCollector accumulates test results, writes them to
+ * ~/.gstack-dev/evals/{version}-{branch}-{tier}-{timestamp}.json,
+ * prints a summary table, and auto-compares with the previous run.
+ *
+ * Comparison functions are exported for reuse by the eval:compare CLI.
+ */
+
+import * as fs from 'fs';
+import * as path from 'path';
+import * as os from 'os';
+import { spawnSync } from 'child_process';
+
+const SCHEMA_VERSION = 1;
+const DEFAULT_EVAL_DIR = path.join(os.homedir(), '.gstack-dev', 'evals');
+
+// --- Interfaces ---
+
+export interface EvalTestEntry {
+  name: string;
+  suite: string;
+  tier: 'e2e' | 'llm-judge';
+  passed: boolean;
+  duration_ms: number;
+  cost_usd: number;
+
+  // E2E
+  transcript?: any[];
+  prompt?: string;
+  output?: string;
+  turns_used?: number;
+  browse_errors?: string[];
+
+  // LLM judge
+  judge_scores?: Record<string, number>;
+  judge_reasoning?: string;
+
+  // Outcome eval
+  detection_rate?: number;
+  false_positives?: number;
+  evidence_quality?: number;
+  detected_bugs?: string[];
+  missed_bugs?: string[];
+
+  error?: string;
+}
+
+export interface EvalResult {
+  schema_version: number;
+  version: string;
+  branch: string;
+  git_sha: string;
+  timestamp: string;
+  hostname: string;
+  tier: 'e2e' | 'llm-judge';
+  total_tests: number;
+  passed: number;
+  failed: number;
+  total_cost_usd: number;
+  total_duration_ms: number;
+  tests: EvalTestEntry[];
+}
+
+export interface TestDelta {
+  name: string;
+  before: { passed: boolean; cost_usd: number; turns_used?: number;
+    detection_rate?: number; tool_summary?: Record<string, number> };
+  after: { passed: boolean; cost_usd: number; turns_used?: number;
+    detection_rate?: number; tool_summary?: Record<string, number> };
+  status_change: 'improved' | 'regressed' | 'unchanged';
+}
+
+export interface ComparisonResult {
+  before_file: string;
+  after_file: string;
+  before_branch: string;
+  after_branch: string;
+  before_timestamp: string;
+  after_timestamp: string;
+  deltas: TestDelta[];
+  total_cost_delta: number;
+  total_duration_delta: number;
+  improved: number;
+  regressed: number;
+  unchanged: number;
+  tool_count_before: number;
+  tool_count_after: number;
+}
+
+// --- Comparison functions (exported for eval:compare CLI) ---
+
+/**
+ * Extract tool call counts from a transcript.
+ * Returns e.g. { Bash: 8, Read: 3, Write: 1 }.
+ */
+export function extractToolSummary(transcript: any[]): Record<string, number> {
+  const counts: Record<string, number> = {};
+  for (const event of transcript) {
+    if (event.type === 'assistant') {
+      const content = event.message?.content || [];
+      for (const item of content) {
+        if (item.type === 'tool_use') {
+          const name = item.name || 'unknown';
+          counts[name] = (counts[name] || 0) + 1;
+        }
+      }
+    }
+  }
+  return counts;
+}
+
+/**
+ * Find the most recent prior eval file for comparison.
+ * Prefers same branch, falls back to any branch.
+ */
+export function findPreviousRun(
+  evalDir: string,
+  tier: string,
+  branch: string,
+  excludeFile: string,
+): string | null {
+  let files: string[];
+  try {
+    files = fs.readdirSync(evalDir).filter(f => f.endsWith('.json'));
+  } catch {
+    return null; // dir doesn't exist
+  }
+
+  // Parse top-level fields from each file (cheap — no full tests array needed)
+  const entries: Array<{ file: string; branch: string; timestamp: string }> = [];
+  for (const file of files) {
+    if (file === path.basename(excludeFile)) continue;
+    const fullPath = path.join(evalDir, file);
+    try {
+      const raw = fs.readFileSync(fullPath, 'utf-8');
+      // Quick parse — only grab the fields we need
+      const data = JSON.parse(raw);
+      if (data.tier !== tier) continue;
+      entries.push({ file: fullPath, branch: data.branch || '', timestamp: data.timestamp || '' });
+    } catch { continue; }
+  }
+
+  if (entries.length === 0) return null;
+
+  // Sort by timestamp descending
+  entries.sort((a, b) => b.timestamp.localeCompare(a.timestamp));
+
+  // Prefer same branch
+  const sameBranch = entries.find(e => e.branch === branch);
+  if (sameBranch) return sameBranch.file;
+
+  // Fallback: any branch
+  return entries[0].file;
+}
+
+/**
+ * Compare two eval results. Matches tests by name.
+ */
+export function compareEvalResults(
+  before: EvalResult,
+  after: EvalResult,
+  beforeFile: string,
+  afterFile: string,
+): ComparisonResult {
+  const deltas: TestDelta[] = [];
+  let improved = 0, regressed = 0, unchanged = 0;
+  let toolCountBefore = 0, toolCountAfter = 0;
+
+  // Index before tests by name
+  const beforeMap = new Map<string, EvalTestEntry>();
+  for (const t of before.tests) {
+    beforeMap.set(t.name, t);
+  }
+
+  // Walk after tests, match by name
+  for (const afterTest of after.tests) {
+    const beforeTest = beforeMap.get(afterTest.name);
+    const beforeToolSummary = beforeTest?.transcript ? extractToolSummary(beforeTest.transcript) : {};
+    const afterToolSummary = afterTest.transcript ? extractToolSummary(afterTest.transcript) : {};
+
+    const beforeToolCount = Object.values(beforeToolSummary).reduce((a, b) => a + b, 0);
+    const afterToolCount = Object.values(afterToolSummary).reduce((a, b) => a + b, 0);
+    toolCountBefore += beforeToolCount;
+    toolCountAfter += afterToolCount;
+
+    let statusChange: TestDelta['status_change'] = 'unchanged';
+    if (beforeTest) {
+      if (!beforeTest.passed && afterTest.passed) { statusChange = 'improved'; improved++; }
+      else if (beforeTest.passed && !afterTest.passed) { statusChange = 'regressed'; regressed++; }
+      else { unchanged++; }
+    } else {
+      // New test — treat as unchanged (no prior data)
+      unchanged++;
+    }
+
+    deltas.push({
+      name: afterTest.name,
+      before: {
+        passed: beforeTest?.passed ?? false,
+        cost_usd: beforeTest?.cost_usd ?? 0,
+        turns_used: beforeTest?.turns_used,
+        detection_rate: beforeTest?.detection_rate,
+        tool_summary: beforeToolSummary,
+      },
+      after: {
+        passed: afterTest.passed,
+        cost_usd: afterTest.cost_usd,
+        turns_used: afterTest.turns_used,
+        detection_rate: afterTest.detection_rate,
+        tool_summary: afterToolSummary,
+      },
+      status_change: statusChange,
+    });
+
+    beforeMap.delete(afterTest.name);
+  }
+
+  // Tests that were in before but not in after (removed tests)
+  for (const [name, beforeTest] of beforeMap) {
+    const beforeToolSummary = beforeTest.transcript ? extractToolSummary(beforeTest.transcript) : {};
+    const beforeToolCount = Object.values(beforeToolSummary).reduce((a, b) => a + b, 0);
+    toolCountBefore += beforeToolCount;
+    unchanged++;
+    deltas.push({
+      name: `${name} (removed)`,
+      before: {
+        passed: beforeTest.passed,
+        cost_usd: beforeTest.cost_usd,
+        turns_used: beforeTest.turns_used,
+        detection_rate: beforeTest.detection_rate,
+        tool_summary: beforeToolSummary,
+      },
+      after: { passed: false, cost_usd: 0, tool_summary: {} },
+      status_change: 'unchanged',
+    });
+  }
+
+  return {
+    before_file: beforeFile,
+    after_file: afterFile,
+    before_branch: before.branch,
+    after_branch: after.branch,
+    before_timestamp: before.timestamp,
+    after_timestamp: after.timestamp,
+    deltas,
+    total_cost_delta: after.total_cost_usd - before.total_cost_usd,
+    total_duration_delta: after.total_duration_ms - before.total_duration_ms,
+    improved,
+    regressed,
+    unchanged,
+    tool_count_before: toolCountBefore,
+    tool_count_after: toolCountAfter,
+  };
+}
+
+/**
+ * Format a ComparisonResult as a readable string.
+ */
+export function formatComparison(c: ComparisonResult): string {
+  const lines: string[] = [];
+  const ts = c.before_timestamp ? c.before_timestamp.replace('T', ' ').slice(0, 16) : 'unknown';
+  lines.push(`\nvs previous: ${c.before_branch}/${c.deltas.length ? 'eval' : ''} (${ts})`);
+  lines.push('─'.repeat(70));
+
+  // Per-test deltas
+  for (const d of c.deltas) {
+    const arrow = d.status_change === 'improved' ? '↑' : d.status_change === 'regressed' ? '↓' : '=';
+    const beforeStatus = d.before.passed ? 'PASS' : 'FAIL';
+    const afterStatus = d.after.passed ? 'PASS' : 'FAIL';
+
+    let detail = '';
+    if (d.before.detection_rate !== undefined || d.after.detection_rate !== undefined) {
+      detail = ` ${d.before.detection_rate ?? '?'}→${d.after.detection_rate ?? '?'} det`;
+    } else {
+      const costBefore = d.before.cost_usd.toFixed(2);
+      const costAfter = d.after.cost_usd.toFixed(2);
+      detail = ` $${costBefore}→$${costAfter}`;
+    }
+
+    const name = d.name.length > 35 ? d.name.slice(0, 32) + '...' : d.name.padEnd(35);
+    lines.push(`  ${name} ${beforeStatus.padEnd(5)} → ${afterStatus.padEnd(5)} ${arrow}${detail}`);
+  }
+
+  lines.push('─'.repeat(70));
+
+  // Totals
+  const parts: string[] = [];
+  if (c.improved > 0) parts.push(`${c.improved} improved`);
+  if (c.regressed > 0) parts.push(`${c.regressed} regressed`);
+  if (c.unchanged > 0) parts.push(`${c.unchanged} unchanged`);
+  lines.push(`  Status: ${parts.join(', ')}`);
+
+  const costSign = c.total_cost_delta >= 0 ? '+' : '';
+  lines.push(`  Cost: ${costSign}$${c.total_cost_delta.toFixed(2)}`);
+
+  const durDelta = Math.round(c.total_duration_delta / 1000);
+  const durSign = durDelta >= 0 ? '+' : '';
+  lines.push(`  Duration: ${durSign}${durDelta}s`);
+
+  const toolDelta = c.tool_count_after - c.tool_count_before;
+  const toolSign = toolDelta >= 0 ? '+' : '';
+  lines.push(`  Tool calls: ${c.tool_count_before} → ${c.tool_count_after} (${toolSign}${toolDelta})`);
+
+  // Tool breakdown (show tools that changed)
+  const allTools = new Set<string>();
+  for (const d of c.deltas) {
+    for (const t of Object.keys(d.before.tool_summary || {})) allTools.add(t);
+    for (const t of Object.keys(d.after.tool_summary || {})) allTools.add(t);
+  }
+
+  if (allTools.size > 0) {
+    // Aggregate tool counts across all tests
+    const totalBefore: Record<string, number> = {};
+    const totalAfter: Record<string, number> = {};
+    for (const d of c.deltas) {
+      for (const [t, n] of Object.entries(d.before.tool_summary || {})) {
+        totalBefore[t] = (totalBefore[t] || 0) + n;
+      }
+      for (const [t, n] of Object.entries(d.after.tool_summary || {})) {
+        totalAfter[t] = (totalAfter[t] || 0) + n;
+      }
+    }
+
+    for (const tool of [...allTools].sort()) {
+      const b = totalBefore[tool] || 0;
+      const a = totalAfter[tool] || 0;
+      if (b !== a) {
+        const d = a - b;
+        lines.push(`    ${tool}: ${b} → ${a} (${d >= 0 ? '+' : ''}${d})`);
+      }
+    }
+  }
+
+  return lines.join('\n');
+}
+
+// --- EvalCollector ---
+
+function getGitInfo(): { branch: string; sha: string } {
+  try {
+    const branch = spawnSync('git', ['rev-parse', '--abbrev-ref', 'HEAD'], { stdio: 'pipe', timeout: 5000 });
+    const sha = spawnSync('git', ['rev-parse', '--short', 'HEAD'], { stdio: 'pipe', timeout: 5000 });
+    return {
+      branch: branch.stdout?.toString().trim() || 'unknown',
+      sha: sha.stdout?.toString().trim() || 'unknown',
+    };
+  } catch {
+    return { branch: 'unknown', sha: 'unknown' };
+  }
+}
+
+function getVersion(): string {
+  try {
+    const pkgPath = path.resolve(__dirname, '..', '..', 'package.json');
+    const pkg = JSON.parse(fs.readFileSync(pkgPath, 'utf-8'));
+    return pkg.version || 'unknown';
+  } catch {
+    return 'unknown';
+  }
+}
+
+export class EvalCollector {
+  private tier: 'e2e' | 'llm-judge';
+  private tests: EvalTestEntry[] = [];
+  private finalized = false;
+  private evalDir: string;
+
+  constructor(tier: 'e2e' | 'llm-judge', evalDir?: string) {
+    this.tier = tier;
+    this.evalDir = evalDir || DEFAULT_EVAL_DIR;
+  }
+
+  addTest(entry: EvalTestEntry): void {
+    this.tests.push(entry);
+  }
+
+  async finalize(): Promise<string> {
+    if (this.finalized) return '';
+    this.finalized = true;
+
+    const git = getGitInfo();
+    const version = getVersion();
+    const timestamp = new Date().toISOString();
+    const totalCost = this.tests.reduce((s, t) => s + t.cost_usd, 0);
+    const totalDuration = this.tests.reduce((s, t) => s + t.duration_ms, 0);
+    const passed = this.tests.filter(t => t.passed).length;
+
+    const result: EvalResult = {
+      schema_version: SCHEMA_VERSION,
+      version,
+      branch: git.branch,
+      git_sha: git.sha,
+      timestamp,
+      hostname: os.hostname(),
+      tier: this.tier,
+      total_tests: this.tests.length,
+      passed,
+      failed: this.tests.length - passed,
+      total_cost_usd: Math.round(totalCost * 100) / 100,
+      total_duration_ms: totalDuration,
+      tests: this.tests,
+    };
+
+    // Write eval file
+    fs.mkdirSync(this.evalDir, { recursive: true });
+    const dateStr = timestamp.replace(/[:.]/g, '').replace('T', '-').slice(0, 15);
+    const safeBranch = git.branch.replace(/[^a-zA-Z0-9._-]/g, '-');
+    const filename = `${version}-${safeBranch}-${this.tier}-${dateStr}.json`;
+    const filepath = path.join(this.evalDir, filename);
+    fs.writeFileSync(filepath, JSON.stringify(result, null, 2) + '\n');
+
+    // Print summary table
+    this.printSummary(result, filepath, git);
+
+    // Auto-compare with previous run
+    try {
+      const prevFile = findPreviousRun(this.evalDir, this.tier, git.branch, filepath);
+      if (prevFile) {
+        const prevResult: EvalResult = JSON.parse(fs.readFileSync(prevFile, 'utf-8'));
+        const comparison = compareEvalResults(prevResult, result, prevFile, filepath);
+        process.stderr.write(formatComparison(comparison) + '\n');
+      } else {
+        process.stderr.write('\nFirst run — no comparison available.\n');
+      }
+    } catch (err: any) {
+      process.stderr.write(`\nCompare error: ${err.message}\n`);
+    }
+
+    return filepath;
+  }
+
+  private printSummary(result: EvalResult, filepath: string, git: { branch: string; sha: string }): void {
+    const lines: string[] = [];
+    lines.push('');
+    lines.push(`Eval Results — v${result.version} @ ${git.branch} (${git.sha}) — ${this.tier}`);
+    lines.push('═'.repeat(70));
+
+    for (const t of this.tests) {
+      const status = t.passed ? ' PASS ' : ' FAIL ';
+      const cost = `$${t.cost_usd.toFixed(2)}`;
+
+      let detail = '';
+      if (t.detection_rate !== undefined) {
+        detail = `${t.detection_rate}/${(t.detected_bugs?.length || 0) + (t.missed_bugs?.length || 0)} det`;
+      } else if (t.turns_used !== undefined) {
+        detail = `${t.turns_used} turns`;
+      } else if (t.judge_scores) {
+        const scores = Object.entries(t.judge_scores).map(([k, v]) => `${k[0]}:${v}`).join(' ');
+        detail = scores;
+      }
+
+      const name = t.name.length > 38 ? t.name.slice(0, 35) + '...' : t.name.padEnd(38);
+      lines.push(`  ${name} ${status} ${cost.padStart(6)} ${detail}`);
+    }
+
+    lines.push('─'.repeat(70));
+    const totalCost = `$${result.total_cost_usd.toFixed(2)}`;
+    const totalDur = `${Math.round(result.total_duration_ms / 1000)}s`;
+    lines.push(`  Total: ${result.passed}/${result.total_tests} passed${' '.repeat(20)}${totalCost.padStart(6)} ${totalDur}`);
+    lines.push(`Saved: ${filepath}`);
+
+    process.stderr.write(lines.join('\n') + '\n');
+  }
+}
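Taken together, the intended wiring is small. A minimal usage sketch (not part of the patch; the entry values and the derived filename are illustrative, assuming version 0.3.6 on branch main):

    import { afterAll } from 'bun:test';
    import { EvalCollector } from './helpers/eval-store';

    const collector = process.env.EVALS ? new EvalCollector('e2e') : null;

    // Inside a test, once the run completes:
    collector?.addTest({
      name: 'example test',   // hypothetical entry
      suite: 'Example suite',
      tier: 'e2e',
      passed: true,
      duration_ms: 1200,
      cost_usd: 0.03,
    });

    // Once per module:
    afterAll(async () => {
      // For timestamp 2026-03-14T12:00:00.000Z this writes
      // ~/.gstack-dev/evals/0.3.6-main-e2e-2026-03-14-1200.json
      // (the date string is the ISO timestamp with ':' and '.' stripped,
      // 'T' replaced by '-', truncated to 15 chars), prints the summary
      // table, and auto-compares against the previous run.
      await collector?.finalize();
    });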
diff --git a/test/skill-e2e.test.ts b/test/skill-e2e.test.ts
index 0c66d79e..445d2b52 100644
--- a/test/skill-e2e.test.ts
+++ b/test/skill-e2e.test.ts
@@ -1,6 +1,9 @@
 import { describe, test, expect, beforeAll, afterAll } from 'bun:test';
 import { runSkillTest } from './helpers/session-runner';
+import type { SkillTestResult } from './helpers/session-runner';
 import { outcomeJudge } from './helpers/llm-judge';
+import { EvalCollector } from './helpers/eval-store';
+import type { EvalTestEntry } from './helpers/eval-store';
 import { startTestServer } from '../browse/test/test-server';
 import * as fs from 'fs';
 import * as path from 'path';
@@ -12,6 +15,24 @@ const ROOT = path.resolve(import.meta.dir, '..');
 const evalsEnabled = !!process.env.EVALS;
 const describeE2E = evalsEnabled ? describe : describe.skip;
 
+// Eval result collector — accumulates test results, writes to ~/.gstack-dev/evals/ on finalize
+const evalCollector = evalsEnabled ? new EvalCollector('e2e') : null;
+
+/** DRY helper to record an E2E test result into the eval collector. */
+function recordE2E(name: string, suite: string, result: SkillTestResult, extra?: Partial<EvalTestEntry>) {
+  evalCollector?.addTest({
+    name, suite, tier: 'e2e',
+    passed: result.exitReason === 'success' && result.browseErrors.length === 0,
+    duration_ms: result.duration,
+    cost_usd: result.costEstimate.estimatedCost,
+    transcript: result.transcript,
+    output: result.output?.slice(0, 2000),
+    turns_used: result.costEstimate.turnsUsed,
+    browse_errors: result.browseErrors,
+    ...extra,
+  });
+}
+
 let testServer: ReturnType<typeof startTestServer>;
 let tmpDir: string;
 const browseBin = path.resolve(ROOT, 'browse', 'dist', 'browse');
@@ -110,6 +131,7 @@ Report the results of each command.`,
     });
 
     logCost('browse basic', result);
+    recordE2E('browse basic commands', 'Skill E2E tests', result);
     expect(result.browseErrors).toHaveLength(0);
     expect(result.exitReason).toBe('success');
   }, 90_000);
@@ -129,11 +151,11 @@ Report what each command returned.`,
     });
 
     logCost('browse snapshot', result);
+    recordE2E('browse snapshot flags', 'Skill E2E tests', result);
     expect(result.browseErrors).toHaveLength(0);
     expect(result.exitReason).toBe('success');
   }, 90_000);
-
   test('agent discovers browse binary via SKILL.md setup block', async () => {
     const skillMd = fs.readFileSync(path.join(ROOT, 'SKILL.md'), 'utf-8');
     const setupStart = skillMd.indexOf('## SETUP');
@@ -156,6 +178,7 @@ Report whether it worked.`,
       timeout: 60_000,
     });
 
+    recordE2E('SKILL.md setup block discovery', 'Skill E2E tests', result);
    expect(result.browseErrors).toHaveLength(0);
     expect(result.exitReason).toBe('success');
   }, 90_000);
@@ -182,6 +205,7 @@ Report the exact output. Do NOT try to fix or install anything — just report w
 
     // Agent should see NEEDS_SETUP (not crash or guess wrong paths)
     const allText = result.output || '';
+    recordE2E('SKILL.md NEEDS_SETUP', 'Skill E2E tests', result);
     expect(allText).toContain('NEEDS_SETUP');
 
     // Clean up
@@ -210,6 +234,7 @@ Report the exact output — either "READY: " or "NEEDS_SETUP".`,
 
     // Should either find global binary (READY) or show NEEDS_SETUP — not crash
     const allText = result.output || '';
+    recordE2E('SKILL.md outside git repo', 'Skill E2E tests', result);
     expect(allText).toMatch(/READY|NEEDS_SETUP/);
 
     // Clean up
@@ -254,6 +279,7 @@ Write your report to ${qaDir}/qa-reports/qa-report.md`,
     });
 
     logCost('/qa quick', result);
+    recordE2E('/qa quick', 'QA skill E2E', result);
     expect(result.browseErrors).toHaveLength(0);
     expect(result.exitReason).toBe('success');
   }, 240_000);
@@ -311,6 +337,7 @@ Write your review findings to ${reviewDir}/review-output.md`,
     });
 
     logCost('/review', result);
+    recordE2E('/review SQL injection', 'Review skill E2E', result);
     expect(result.exitReason).toBe('success');
   }, 120_000);
 });
@@ -392,6 +419,15 @@ Be thorough: check console, check all links, check all forms, check mobile viewp
 
     const judgeResult = await outcomeJudge(groundTruth, report);
     console.log(`${label} outcome:`, JSON.stringify(judgeResult, null, 2));
 
+    // Record to eval collector with outcome judge results
+    recordE2E(`/qa ${label}`, 'Planted-bug outcome evals', result, {
+      detection_rate: judgeResult.detection_rate,
+      false_positives: judgeResult.false_positives,
+      evidence_quality: judgeResult.evidence_quality,
+      detected_bugs: judgeResult.detected,
+      missed_bugs: judgeResult.missed,
+    });
+
     // Diagnostic dump on failure (decision 1C)
     if (judgeResult.detection_rate < groundTruth.minimum_detection || judgeResult.false_positives > groundTruth.max_false_positives) {
       dumpOutcomeDiagnostic(outcomeDir, label, report, judgeResult);
@@ -421,3 +457,14 @@ Be thorough: check console, check all links, check all forms, check mobile viewp
 
 // Ship E2E deferred — too complex (requires full git + test suite + VERSION + CHANGELOG)
 test.todo('/ship completes without browse errors');
 });
+
+// Module-level afterAll — finalize eval collector after all tests complete
+afterAll(async () => {
+  if (evalCollector) {
+    try {
+      await evalCollector.finalize();
+    } catch (err) {
+      console.error('Failed to save eval results:', err);
+    }
+  }
+});
diff --git a/test/skill-llm-eval.test.ts b/test/skill-llm-eval.test.ts
index 94031849..6db8c87b 100644
--- a/test/skill-llm-eval.test.ts
+++ b/test/skill-llm-eval.test.ts
@@ -10,21 +10,26 @@
  *
  * Cost: ~$0.05-0.15 per run (sonnet)
  */
 
-import { describe, test, expect } from 'bun:test';
+import { describe, test, expect, afterAll } from 'bun:test';
 import Anthropic from '@anthropic-ai/sdk';
 import * as fs from 'fs';
 import * as path from 'path';
 import { callJudge, judge } from './helpers/llm-judge';
 import type { JudgeScore } from './helpers/llm-judge';
+import { EvalCollector } from './helpers/eval-store';
 
 const ROOT = path.resolve(import.meta.dir, '..');
 
 // Run when EVALS=1 is set (requires ANTHROPIC_API_KEY in env)
-const describeEval = process.env.EVALS ? describe : describe.skip;
+const evalsEnabled = !!process.env.EVALS;
+const describeEval = evalsEnabled ? describe : describe.skip;
+
+// Eval result collector
+const evalCollector = evalsEnabled ? new EvalCollector('llm-judge') : null;
 
 describeEval('LLM-as-judge quality evals', () => {
   test('command reference table scores >= 4 on all dimensions', async () => {
+    const t0 = Date.now();
     const content = fs.readFileSync(path.join(ROOT, 'SKILL.md'), 'utf-8');
-    // Extract just the command reference section
     const start = content.indexOf('## Command Reference');
     const end = content.indexOf('## Tips');
     const section = content.slice(start, end);
@@ -32,12 +37,24 @@ describeEval('LLM-as-judge quality evals', () => {
     const scores = await judge('command reference table', section);
     console.log('Command reference scores:', JSON.stringify(scores, null, 2));
 
+    evalCollector?.addTest({
+      name: 'command reference table',
+      suite: 'LLM-as-judge quality evals',
+      tier: 'llm-judge',
+      passed: scores.clarity >= 4 && scores.completeness >= 4 && scores.actionability >= 4,
+      duration_ms: Date.now() - t0,
+      cost_usd: 0.02,
+      judge_scores: { clarity: scores.clarity, completeness: scores.completeness, actionability: scores.actionability },
+      judge_reasoning: scores.reasoning,
+    });
+
     expect(scores.clarity).toBeGreaterThanOrEqual(4);
     expect(scores.completeness).toBeGreaterThanOrEqual(4);
     expect(scores.actionability).toBeGreaterThanOrEqual(4);
   }, 30_000);
 
   test('snapshot flags section scores >= 4 on all dimensions', async () => {
+    const t0 = Date.now();
     const content = fs.readFileSync(path.join(ROOT, 'SKILL.md'), 'utf-8');
     const start = content.indexOf('## Snapshot System');
     const end = content.indexOf('## Command Reference');
@@ -46,26 +63,49 @@ describeEval('LLM-as-judge quality evals', () => {
     const scores = await judge('snapshot flags reference', section);
     console.log('Snapshot flags scores:', JSON.stringify(scores, null, 2));
 
+    evalCollector?.addTest({
+      name: 'snapshot flags reference',
+      suite: 'LLM-as-judge quality evals',
+      tier: 'llm-judge',
+      passed: scores.clarity >= 4 && scores.completeness >= 4 && scores.actionability >= 4,
+      duration_ms: Date.now() - t0,
+      cost_usd: 0.02,
+      judge_scores: { clarity: scores.clarity, completeness: scores.completeness, actionability: scores.actionability },
+      judge_reasoning: scores.reasoning,
+    });
+
     expect(scores.clarity).toBeGreaterThanOrEqual(4);
     expect(scores.completeness).toBeGreaterThanOrEqual(4);
     expect(scores.actionability).toBeGreaterThanOrEqual(4);
   }, 30_000);
 
   test('browse/SKILL.md overall scores >= 4', async () => {
+    const t0 = Date.now();
     const content = fs.readFileSync(path.join(ROOT, 'browse', 'SKILL.md'), 'utf-8');
-    // Just the reference sections (skip examples/patterns)
     const start = content.indexOf('## Snapshot Flags');
     const section = content.slice(start);
 
     const scores = await judge('browse skill reference (flags + commands)', section);
     console.log('Browse SKILL.md scores:', JSON.stringify(scores, null, 2));
 
+    evalCollector?.addTest({
+      name: 'browse/SKILL.md reference',
+      suite: 'LLM-as-judge quality evals',
+      tier: 'llm-judge',
+      passed: scores.clarity >= 4 && scores.completeness >= 4 && scores.actionability >= 4,
+      duration_ms: Date.now() - t0,
+      cost_usd: 0.02,
+      judge_scores: { clarity: scores.clarity, completeness: scores.completeness, actionability: scores.actionability },
+      judge_reasoning: scores.reasoning,
+    });
+
     expect(scores.clarity).toBeGreaterThanOrEqual(4);
     expect(scores.completeness).toBeGreaterThanOrEqual(4);
     expect(scores.actionability).toBeGreaterThanOrEqual(4);
   }, 30_000);
 
   test('setup block scores >= 4 on actionability and clarity', async () => {
+    const t0 = Date.now();
     const content = fs.readFileSync(path.join(ROOT, 'SKILL.md'), 'utf-8');
     const setupStart = content.indexOf('## SETUP');
     const setupEnd = content.indexOf('## IMPORTANT');
@@ -74,13 +114,23 @@ describeEval('LLM-as-judge quality evals', () => {
     const scores = await judge('setup/binary discovery instructions', section);
     console.log('Setup block scores:', JSON.stringify(scores, null, 2));
 
+    evalCollector?.addTest({
+      name: 'setup block',
+      suite: 'LLM-as-judge quality evals',
+      tier: 'llm-judge',
+      passed: scores.actionability >= 4 && scores.clarity >= 4,
+      duration_ms: Date.now() - t0,
+      cost_usd: 0.02,
+      judge_scores: { clarity: scores.clarity, completeness: scores.completeness, actionability: scores.actionability },
+      judge_reasoning: scores.reasoning,
+    });
+
     expect(scores.actionability).toBeGreaterThanOrEqual(4);
     expect(scores.clarity).toBeGreaterThanOrEqual(4);
   }, 30_000);
 
   test('regression check: compare branch vs baseline quality', async () => {
-    // This test compares the generated output against the hand-maintained
-    // baseline from main. The generated version should score equal or higher.
+    const t0 = Date.now();
     const generated = fs.readFileSync(path.join(ROOT, 'SKILL.md'), 'utf-8');
     const genStart = generated.indexOf('## Command Reference');
     const genEnd = generated.indexOf('## Tips');
@@ -151,7 +201,17 @@ Scores are 1-5 overall quality.`,
     const result = JSON.parse(jsonMatch[0]);
     console.log('Regression comparison:', JSON.stringify(result, null, 2));
 
-    // Generated version should be at least as good as hand-maintained
+    evalCollector?.addTest({
+      name: 'regression vs baseline',
+      suite: 'LLM-as-judge quality evals',
+      tier: 'llm-judge',
+      passed: result.b_score >= result.a_score,
+      duration_ms: Date.now() - t0,
+      cost_usd: 0.02,
+      judge_scores: { a_score: result.a_score, b_score: result.b_score },
+      judge_reasoning: result.reasoning,
+    });
+
     expect(result.b_score).toBeGreaterThanOrEqual(result.a_score);
   }, 30_000);
 });
@@ -162,13 +222,11 @@ describeEval('QA skill quality evals', () => {
 
   const qaContent = fs.readFileSync(path.join(ROOT, 'qa', 'SKILL.md'), 'utf-8');
 
   test('qa/SKILL.md workflow quality scores >= 4', async () => {
-    // Extract the workflow section (Phases 1-7)
+    const t0 = Date.now();
     const start = qaContent.indexOf('## Workflow');
     const end = qaContent.indexOf('## Health Score Rubric');
     const section = qaContent.slice(start, end);
 
-    // Use workflow-specific prompt (not the CLI-reference judge, since this is a
-    // workflow doc that references $B commands defined in a separate browse SKILL.md)
     const scores = await callJudge(`You are evaluating the quality of a QA testing workflow document
 for an AI coding agent. The agent reads this document to learn how to
 systematically QA test a web application. The workflow references
@@ -188,16 +246,27 @@ Here is the QA workflow to evaluate:
 
 ${section}`);
 
     console.log('QA workflow scores:', JSON.stringify(scores, null, 2));
 
+    evalCollector?.addTest({
+      name: 'qa/SKILL.md workflow',
+      suite: 'QA skill quality evals',
+      tier: 'llm-judge',
+      passed: scores.clarity >= 4 && scores.completeness >= 4 && scores.actionability >= 4,
+      duration_ms: Date.now() - t0,
+      cost_usd: 0.02,
+      judge_scores: { clarity: scores.clarity, completeness: scores.completeness, actionability: scores.actionability },
+      judge_reasoning: scores.reasoning,
+    });
+
     expect(scores.clarity).toBeGreaterThanOrEqual(4);
     expect(scores.completeness).toBeGreaterThanOrEqual(4);
     expect(scores.actionability).toBeGreaterThanOrEqual(4);
   }, 30_000);
 
   test('qa/SKILL.md health score rubric is unambiguous', async () => {
+    const t0 = Date.now();
     const start = qaContent.indexOf('## Health Score Rubric');
     const section = qaContent.slice(start);
 
-    // Use rubric-specific prompt
     const scores = await callJudge(`You are evaluating a health score rubric that an AI agent must
 follow to compute a numeric QA score. The agent uses this rubric after QA testing a website.
 It needs to:
@@ -218,11 +287,18 @@ Here is the rubric to evaluate:
 
 ${section}`);
 
     console.log('QA health rubric scores:', JSON.stringify(scores, null, 2));
 
+    evalCollector?.addTest({
+      name: 'qa/SKILL.md health rubric',
+      suite: 'QA skill quality evals',
+      tier: 'llm-judge',
+      passed: scores.clarity >= 4 && scores.completeness >= 3 && scores.actionability >= 4,
+      duration_ms: Date.now() - t0,
+      cost_usd: 0.02,
+      judge_scores: { clarity: scores.clarity, completeness: scores.completeness, actionability: scores.actionability },
+      judge_reasoning: scores.reasoning,
+    });
+
     expect(scores.clarity).toBeGreaterThanOrEqual(4);
-    // Completeness threshold is 3 — the rubric intentionally leaves some edge cases
-    // to agent judgment (e.g., partial testing, cross-category findings). The judge
-    // consistently flags these as gaps, but over-specifying would make the rubric
-    // rigid and harder to follow. Clarity + actionability >= 4 is what matters.
     expect(scores.completeness).toBeGreaterThanOrEqual(3);
     expect(scores.actionability).toBeGreaterThanOrEqual(4);
   }, 30_000);
 
@@ -232,12 +308,12 @@ ${section}`);
 
 describeEval('Cross-skill consistency evals', () => {
   test('greptile-history patterns are consistent across all skills', async () => {
+    const t0 = Date.now();
     const reviewContent = fs.readFileSync(path.join(ROOT, 'review', 'SKILL.md'), 'utf-8');
     const shipContent = fs.readFileSync(path.join(ROOT, 'ship', 'SKILL.md'), 'utf-8');
     const triageContent = fs.readFileSync(path.join(ROOT, 'review', 'greptile-triage.md'), 'utf-8');
     const retroContent = fs.readFileSync(path.join(ROOT, 'retro', 'SKILL.md'), 'utf-8');
 
-    // Extract greptile-related lines from each file
     const extractGrepLines = (content: string, filename: string) => {
       const lines = content.split('\n')
         .filter(l => /greptile|history\.md|REMOTE_SLUG/i.test(l))
@@ -277,6 +353,17 @@ score (1-5): 5 = perfectly consistent, 1 = contradictory`);
 
     console.log('Cross-skill consistency:', JSON.stringify(result, null, 2));
 
+    evalCollector?.addTest({
+      name: 'cross-skill greptile consistency',
+      suite: 'Cross-skill consistency evals',
+      tier: 'llm-judge',
+      passed: result.consistent && result.score >= 4,
+      duration_ms: Date.now() - t0,
+      cost_usd: 0.02,
+      judge_scores: { consistency_score: result.score },
+      judge_reasoning: result.reasoning,
+    });
+
     expect(result.consistent).toBe(true);
     expect(result.score).toBeGreaterThanOrEqual(4);
   }, 30_000);
@@ -288,6 +375,7 @@ describeEval('Baseline score pinning', () => {
   const baselinesPath = path.join(ROOT, 'test', 'fixtures', 'eval-baselines.json');
 
   test('LLM eval scores do not regress below baselines', async () => {
+    const t0 = Date.now();
     if (!fs.existsSync(baselinesPath)) {
       console.log('No baseline file found — skipping pinning check');
       return;
     }
@@ -296,7 +384,6 @@ describeEval('Baseline score pinning', () => {
     const baselines = JSON.parse(fs.readFileSync(baselinesPath, 'utf-8'));
     const regressions: string[] = [];
 
-    // Test command reference
     const skillContent = fs.readFileSync(path.join(ROOT, 'SKILL.md'), 'utf-8');
     const cmdStart = skillContent.indexOf('## Command Reference');
     const cmdEnd = skillContent.indexOf('## Tips');
@@ -309,7 +396,6 @@
     }
     }
 
-    // Update baselines if requested
     if (process.env.UPDATE_BASELINES) {
       baselines.command_reference = {
         clarity: cmdScores.clarity,
@@ -320,8 +406,31 @@
       console.log('Updated eval baselines');
     }
 
-    if (regressions.length > 0) {
+    const passed = regressions.length === 0;
+    evalCollector?.addTest({
+      name: 'baseline score pinning',
+      suite: 'Baseline score pinning',
+      tier: 'llm-judge',
+      passed,
+      duration_ms: Date.now() - t0,
+      cost_usd: 0.02,
+      judge_scores: { clarity: cmdScores.clarity, completeness: cmdScores.completeness, actionability: cmdScores.actionability },
+      judge_reasoning: passed ? 'All scores at or above baseline' : regressions.join('; '),
+    });
+
+    if (!passed) {
       throw new Error(`Score regressions detected:\n${regressions.join('\n')}`);
     }
   }, 60_000);
 });
+
+// Module-level afterAll — finalize eval collector after all tests complete
+afterAll(async () => {
+  if (evalCollector) {
+    try {
+      await evalCollector.finalize();
+    } catch (err) {
+      console.error('Failed to save eval results:', err);
+    }
+  }
+});
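The eval-store header mentions an eval:compare CLI that reuses the exported comparison functions. A sketch of such an entry point, assuming a hypothetical scripts/eval-compare.ts (only the imported functions and types come from this patch; the argument handling is invented):

    import * as fs from 'fs';
    import * as path from 'path';
    import { compareEvalResults, formatComparison, findPreviousRun } from '../test/helpers/eval-store';
    import type { EvalResult } from '../test/helpers/eval-store';

    const [afterFile, beforeArg] = process.argv.slice(2);
    if (!afterFile) {
      console.error('usage: bun scripts/eval-compare.ts <after.json> [before.json]');
      process.exit(1);
    }

    const after: EvalResult = JSON.parse(fs.readFileSync(afterFile, 'utf-8'));

    // Explicit baseline if given, else the most recent prior run for the same
    // tier (same branch preferred), excluding the "after" file itself.
    const evalDir = path.dirname(path.resolve(afterFile));
    const beforeFile = beforeArg ?? findPreviousRun(evalDir, after.tier, after.branch, afterFile);
    if (!beforeFile) {
      console.error('No previous run found to compare against.');
      process.exit(1);
    }

    const before: EvalResult = JSON.parse(fs.readFileSync(beforeFile, 'utf-8'));
    console.log(formatComparison(compareEvalResults(before, after, beforeFile, afterFile)));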