Mirror of https://github.com/garrytan/gstack.git (synced 2026-05-02 03:35:09 +02:00)
feat: eval persistence with auto-compare against previous run
EvalCollector accumulates test results during eval runs, writes JSON to
~/.gstack-dev/evals/{version}-{branch}-{tier}-{timestamp}.json, prints
a summary table, and automatically compares against the previous run.
- EvalCollector class with addTest() / finalize() / summary table
- findPreviousRun() prefers same branch, falls back to any branch
- compareEvalResults() matches tests by name, detects improved/regressed
- extractToolSummary() counts tool types from transcript events
- formatComparison() renders delta table with per-test + aggregate diffs
- Wire into skill-e2e.test.ts (recordE2E helper) and skill-llm-eval.test.ts
- 19 unit tests for collector + comparison functions
- schema_version: 1 for forward compatibility
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
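A minimal usage sketch of the new API (the real wiring is in the diffs below; names and values here are illustrative):

    import { EvalCollector } from './test/helpers/eval-store'; // import path illustrative

    const collector = new EvalCollector('e2e'); // eval dir defaults to ~/.gstack-dev/evals
    collector.addTest({
      name: 'browse basic commands',
      suite: 'Skill E2E tests',
      tier: 'e2e',
      passed: true,
      duration_ms: 42_000, // illustrative numbers
      cost_usd: 0.07,
    });
    // Writes {version}-{branch}-e2e-{timestamp}.json, prints the summary table,
    // and prints a delta table against the previous e2e run (if one exists).
    const filepath = await collector.finalize();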
eval-store.test.ts (new file)
@@ -0,0 +1,333 @@
import { describe, test, expect, beforeEach, afterEach } from 'bun:test';
import * as fs from 'fs';
import * as path from 'path';
import * as os from 'os';
import {
  EvalCollector,
  extractToolSummary,
  findPreviousRun,
  compareEvalResults,
  formatComparison,
} from './eval-store';
import type { EvalResult, EvalTestEntry, ComparisonResult } from './eval-store';

let tmpDir: string;

beforeEach(() => {
  tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'eval-store-test-'));
});

afterEach(() => {
  try { fs.rmSync(tmpDir, { recursive: true, force: true }); } catch {}
});

// --- Helper to make a minimal test entry ---

function makeEntry(overrides?: Partial<EvalTestEntry>): EvalTestEntry {
  return {
    name: 'test-1',
    suite: 'suite-1',
    tier: 'e2e',
    passed: true,
    duration_ms: 1000,
    cost_usd: 0.05,
    ...overrides,
  };
}

// --- Helper to make a minimal EvalResult ---

function makeResult(overrides?: Partial<EvalResult>): EvalResult {
  return {
    schema_version: 1,
    version: '0.3.6',
    branch: 'main',
    git_sha: 'abc1234',
    timestamp: '2026-03-14T12:00:00.000Z',
    hostname: 'test-host',
    tier: 'e2e',
    total_tests: 1,
    passed: 1,
    failed: 0,
    total_cost_usd: 0.05,
    total_duration_ms: 1000,
    tests: [makeEntry()],
    ...overrides,
  };
}

// --- EvalCollector tests ---

describe('EvalCollector', () => {
  test('addTest accumulates entries', () => {
    const collector = new EvalCollector('e2e', tmpDir);
    collector.addTest(makeEntry({ name: 'a' }));
    collector.addTest(makeEntry({ name: 'b' }));
    collector.addTest(makeEntry({ name: 'c' }));
    // We can't inspect tests directly, but finalize will write them
  });

  test('finalize writes JSON file to eval dir', async () => {
    const collector = new EvalCollector('e2e', tmpDir);
    collector.addTest(makeEntry());
    const filepath = await collector.finalize();

    expect(filepath).toBeTruthy();
    expect(fs.existsSync(filepath)).toBe(true);

    const data = JSON.parse(fs.readFileSync(filepath, 'utf-8'));
    expect(data.tests).toHaveLength(1);
    expect(data.tests[0].name).toBe('test-1');
  });

  test('written JSON has correct schema fields', async () => {
    const collector = new EvalCollector('e2e', tmpDir);
    collector.addTest(makeEntry({ passed: true, cost_usd: 0.10, duration_ms: 2000 }));
    collector.addTest(makeEntry({ name: 'test-2', passed: false, cost_usd: 0.05, duration_ms: 1000 }));
    const filepath = await collector.finalize();

    const data: EvalResult = JSON.parse(fs.readFileSync(filepath, 'utf-8'));
    expect(data.schema_version).toBe(1);
    expect(data.tier).toBe('e2e');
    expect(data.total_tests).toBe(2);
    expect(data.passed).toBe(1);
    expect(data.failed).toBe(1);
    expect(data.total_cost_usd).toBe(0.15);
    expect(data.total_duration_ms).toBe(3000);
    expect(data.timestamp).toBeTruthy();
    expect(data.hostname).toBeTruthy();
  });

  test('finalize creates directory if missing', async () => {
    const nestedDir = path.join(tmpDir, 'nested', 'deep', 'evals');
    const collector = new EvalCollector('e2e', nestedDir);
    collector.addTest(makeEntry());
    const filepath = await collector.finalize();
    expect(fs.existsSync(filepath)).toBe(true);
  });

  test('double finalize does not write twice', async () => {
    const collector = new EvalCollector('e2e', tmpDir);
    collector.addTest(makeEntry());
    const filepath1 = await collector.finalize();
    const filepath2 = await collector.finalize();

    expect(filepath1).toBeTruthy();
    expect(filepath2).toBe(''); // second call returns empty
    expect(fs.readdirSync(tmpDir).filter(f => f.endsWith('.json'))).toHaveLength(1);
  });

  test('empty collector writes valid file', async () => {
    const collector = new EvalCollector('llm-judge', tmpDir);
    const filepath = await collector.finalize();

    const data: EvalResult = JSON.parse(fs.readFileSync(filepath, 'utf-8'));
    expect(data.total_tests).toBe(0);
    expect(data.passed).toBe(0);
    expect(data.tests).toHaveLength(0);
    expect(data.tier).toBe('llm-judge');
  });
});

// --- extractToolSummary tests ---

describe('extractToolSummary', () => {
  test('counts tool types from transcript events', () => {
    const transcript = [
      { type: 'system', subtype: 'init' },
      { type: 'assistant', message: { content: [
        { type: 'tool_use', name: 'Bash', input: {} },
      ] } },
      { type: 'user', tool_use_result: { stdout: '' } },
      { type: 'assistant', message: { content: [
        { type: 'text', text: 'ok' },
        { type: 'tool_use', name: 'Read', input: {} },
      ] } },
      { type: 'assistant', message: { content: [
        { type: 'tool_use', name: 'Bash', input: {} },
        { type: 'tool_use', name: 'Write', input: {} },
      ] } },
    ];

    const summary = extractToolSummary(transcript);
    expect(summary).toEqual({ Bash: 2, Read: 1, Write: 1 });
  });

  test('returns empty object for empty transcript', () => {
    expect(extractToolSummary([])).toEqual({});
  });

  test('handles events with no content array', () => {
    const transcript = [
      { type: 'assistant', message: {} },
      { type: 'assistant' },
    ];
    expect(extractToolSummary(transcript)).toEqual({});
  });
});

// --- findPreviousRun tests ---

describe('findPreviousRun', () => {
  test('finds correct file — same branch preferred, most recent', () => {
    // Write three eval files
    const files = [
      { name: '0.3.5-main-e2e-20260312-100000.json', data: makeResult({ branch: 'main', timestamp: '2026-03-12T10:00:00Z' }) },
      { name: '0.3.5-feature-e2e-20260313-100000.json', data: makeResult({ branch: 'feature', timestamp: '2026-03-13T10:00:00Z' }) },
      { name: '0.3.6-feature-e2e-20260314-100000.json', data: makeResult({ branch: 'feature', timestamp: '2026-03-14T10:00:00Z' }) },
    ];
    for (const f of files) {
      fs.writeFileSync(path.join(tmpDir, f.name), JSON.stringify(f.data));
    }

    // Should prefer feature branch (most recent on same branch)
    const result = findPreviousRun(tmpDir, 'e2e', 'feature', path.join(tmpDir, 'current.json'));
    expect(result).toContain('0.3.6-feature-e2e-20260314');
  });

  test('falls back to different branch when no same-branch match', () => {
    const files = [
      { name: '0.3.5-main-e2e-20260312-100000.json', data: makeResult({ branch: 'main', timestamp: '2026-03-12T10:00:00Z' }) },
    ];
    for (const f of files) {
      fs.writeFileSync(path.join(tmpDir, f.name), JSON.stringify(f.data));
    }

    const result = findPreviousRun(tmpDir, 'e2e', 'new-branch', path.join(tmpDir, 'current.json'));
    expect(result).toContain('0.3.5-main-e2e');
  });

  test('returns null when no prior runs exist', () => {
    const result = findPreviousRun(tmpDir, 'e2e', 'main', path.join(tmpDir, 'current.json'));
    expect(result).toBeNull();
  });

  test('returns null when directory does not exist', () => {
    const result = findPreviousRun('/nonexistent/path', 'e2e', 'main', 'current.json');
    expect(result).toBeNull();
  });

  test('excludes the current file from results', () => {
    const filename = '0.3.6-main-e2e-20260314-100000.json';
    fs.writeFileSync(
      path.join(tmpDir, filename),
      JSON.stringify(makeResult({ branch: 'main', timestamp: '2026-03-14T10:00:00Z' })),
    );

    const result = findPreviousRun(tmpDir, 'e2e', 'main', path.join(tmpDir, filename));
    expect(result).toBeNull(); // only file is excluded
  });

  test('filters by tier', () => {
    fs.writeFileSync(
      path.join(tmpDir, '0.3.6-main-llm-judge-20260314-100000.json'),
      JSON.stringify(makeResult({ tier: 'llm-judge', branch: 'main', timestamp: '2026-03-14T10:00:00Z' })),
    );

    const result = findPreviousRun(tmpDir, 'e2e', 'main', 'current.json');
    expect(result).toBeNull(); // only llm-judge file, looking for e2e
  });
});

// --- compareEvalResults tests ---

describe('compareEvalResults', () => {
  test('detects improved/regressed/unchanged per test', () => {
    const before = makeResult({
      tests: [
        makeEntry({ name: 'test-a', passed: false }),
        makeEntry({ name: 'test-b', passed: true }),
        makeEntry({ name: 'test-c', passed: true }),
      ],
      total_tests: 3, passed: 2, failed: 1,
    });
    const after = makeResult({
      tests: [
        makeEntry({ name: 'test-a', passed: true }), // improved
        makeEntry({ name: 'test-b', passed: false }), // regressed
        makeEntry({ name: 'test-c', passed: true }), // unchanged
      ],
      total_tests: 3, passed: 2, failed: 1,
    });

    const result = compareEvalResults(before, after, 'before.json', 'after.json');
    expect(result.improved).toBe(1);
    expect(result.regressed).toBe(1);
    expect(result.unchanged).toBe(1);
    expect(result.deltas.find(d => d.name === 'test-a')?.status_change).toBe('improved');
    expect(result.deltas.find(d => d.name === 'test-b')?.status_change).toBe('regressed');
    expect(result.deltas.find(d => d.name === 'test-c')?.status_change).toBe('unchanged');
  });

  test('handles tests present in one run but not the other', () => {
    const before = makeResult({
      tests: [
        makeEntry({ name: 'old-test', passed: true }),
        makeEntry({ name: 'shared', passed: true }),
      ],
    });
    const after = makeResult({
      tests: [
        makeEntry({ name: 'shared', passed: true }),
        makeEntry({ name: 'new-test', passed: true }),
      ],
    });

    const result = compareEvalResults(before, after, 'before.json', 'after.json');
    expect(result.deltas).toHaveLength(3); // shared + new-test + old-test (removed)
    expect(result.deltas.find(d => d.name.includes('old-test'))?.name).toContain('removed');
  });

  test('computes cost and duration deltas', () => {
    const before = makeResult({ total_cost_usd: 2.00, total_duration_ms: 60000 });
    const after = makeResult({ total_cost_usd: 1.50, total_duration_ms: 45000 });

    const result = compareEvalResults(before, after, 'a.json', 'b.json');
    expect(result.total_cost_delta).toBe(-0.50);
    expect(result.total_duration_delta).toBe(-15000);
  });
});

// --- formatComparison tests ---

describe('formatComparison', () => {
  test('produces readable output with status arrows', () => {
    const comparison: ComparisonResult = {
      before_file: 'before.json',
      after_file: 'after.json',
      before_branch: 'main',
      after_branch: 'feature',
      before_timestamp: '2026-03-13T14:30:00Z',
      after_timestamp: '2026-03-14T14:30:00Z',
      deltas: [
        {
          name: 'browse basic',
          before: { passed: true, cost_usd: 0.07, tool_summary: { Bash: 3 } },
          after: { passed: true, cost_usd: 0.06, tool_summary: { Bash: 4 } },
          status_change: 'unchanged',
        },
        {
          name: 'planted bugs static',
          before: { passed: false, cost_usd: 1.00, detection_rate: 3, tool_summary: {} },
          after: { passed: true, cost_usd: 0.95, detection_rate: 4, tool_summary: {} },
          status_change: 'improved',
        },
      ],
      total_cost_delta: -0.06,
      total_duration_delta: -5000,
      improved: 1,
      regressed: 0,
      unchanged: 1,
      tool_count_before: 3,
      tool_count_after: 4,
    };

    const output = formatComparison(comparison);
    expect(output).toContain('vs previous');
    expect(output).toContain('main');
    expect(output).toContain('1 improved');
    expect(output).toContain('1 unchanged');
    expect(output).toContain('↑'); // improved arrow
    expect(output).toContain('='); // unchanged arrow
  });
});
eval-store.ts (new file)
@@ -0,0 +1,466 @@
/**
 * Eval result persistence and comparison.
 *
 * EvalCollector accumulates test results, writes them to
 * ~/.gstack-dev/evals/{version}-{branch}-{tier}-{timestamp}.json,
 * prints a summary table, and auto-compares with the previous run.
 *
 * Comparison functions are exported for reuse by the eval:compare CLI.
 */

import * as fs from 'fs';
import * as path from 'path';
import * as os from 'os';
import { spawnSync } from 'child_process';

const SCHEMA_VERSION = 1;
const DEFAULT_EVAL_DIR = path.join(os.homedir(), '.gstack-dev', 'evals');

// --- Interfaces ---

export interface EvalTestEntry {
  name: string;
  suite: string;
  tier: 'e2e' | 'llm-judge';
  passed: boolean;
  duration_ms: number;
  cost_usd: number;

  // E2E
  transcript?: any[];
  prompt?: string;
  output?: string;
  turns_used?: number;
  browse_errors?: string[];

  // LLM judge
  judge_scores?: Record<string, number>;
  judge_reasoning?: string;

  // Outcome eval
  detection_rate?: number;
  false_positives?: number;
  evidence_quality?: number;
  detected_bugs?: string[];
  missed_bugs?: string[];

  error?: string;
}

export interface EvalResult {
  schema_version: number;
  version: string;
  branch: string;
  git_sha: string;
  timestamp: string;
  hostname: string;
  tier: 'e2e' | 'llm-judge';
  total_tests: number;
  passed: number;
  failed: number;
  total_cost_usd: number;
  total_duration_ms: number;
  tests: EvalTestEntry[];
}

export interface TestDelta {
  name: string;
  before: { passed: boolean; cost_usd: number; turns_used?: number;
    detection_rate?: number; tool_summary?: Record<string, number> };
  after: { passed: boolean; cost_usd: number; turns_used?: number;
    detection_rate?: number; tool_summary?: Record<string, number> };
  status_change: 'improved' | 'regressed' | 'unchanged';
}

export interface ComparisonResult {
  before_file: string;
  after_file: string;
  before_branch: string;
  after_branch: string;
  before_timestamp: string;
  after_timestamp: string;
  deltas: TestDelta[];
  total_cost_delta: number;
  total_duration_delta: number;
  improved: number;
  regressed: number;
  unchanged: number;
  tool_count_before: number;
  tool_count_after: number;
}

// --- Comparison functions (exported for eval:compare CLI) ---

/**
 * Extract tool call counts from a transcript.
 * Returns e.g. { Bash: 8, Read: 3, Write: 1 }.
 */
export function extractToolSummary(transcript: any[]): Record<string, number> {
  const counts: Record<string, number> = {};
  for (const event of transcript) {
    if (event.type === 'assistant') {
      const content = event.message?.content || [];
      for (const item of content) {
        if (item.type === 'tool_use') {
          const name = item.name || 'unknown';
          counts[name] = (counts[name] || 0) + 1;
        }
      }
    }
  }
  return counts;
}

/**
 * Find the most recent prior eval file for comparison.
 * Prefers same branch, falls back to any branch.
 */
export function findPreviousRun(
  evalDir: string,
  tier: string,
  branch: string,
  excludeFile: string,
): string | null {
  let files: string[];
  try {
    files = fs.readdirSync(evalDir).filter(f => f.endsWith('.json'));
  } catch {
    return null; // dir doesn't exist
  }

  // Read each candidate file, keeping only the top-level fields we need
  const entries: Array<{ file: string; branch: string; timestamp: string }> = [];
  for (const file of files) {
    if (file === path.basename(excludeFile)) continue;
    const fullPath = path.join(evalDir, file);
    try {
      const raw = fs.readFileSync(fullPath, 'utf-8');
      // Full JSON.parse (tests array included); simple, and fine at this scale
      const data = JSON.parse(raw);
      if (data.tier !== tier) continue;
      entries.push({ file: fullPath, branch: data.branch || '', timestamp: data.timestamp || '' });
    } catch { continue; }
  }

  if (entries.length === 0) return null;

  // Sort by timestamp descending
  entries.sort((a, b) => b.timestamp.localeCompare(a.timestamp));

  // Prefer same branch
  const sameBranch = entries.find(e => e.branch === branch);
  if (sameBranch) return sameBranch.file;

  // Fallback: any branch
  return entries[0].file;
}

/**
 * Compare two eval results. Matches tests by name.
 */
export function compareEvalResults(
  before: EvalResult,
  after: EvalResult,
  beforeFile: string,
  afterFile: string,
): ComparisonResult {
  const deltas: TestDelta[] = [];
  let improved = 0, regressed = 0, unchanged = 0;
  let toolCountBefore = 0, toolCountAfter = 0;

  // Index before tests by name
  const beforeMap = new Map<string, EvalTestEntry>();
  for (const t of before.tests) {
    beforeMap.set(t.name, t);
  }

  // Walk after tests, match by name
  for (const afterTest of after.tests) {
    const beforeTest = beforeMap.get(afterTest.name);
    const beforeToolSummary = beforeTest?.transcript ? extractToolSummary(beforeTest.transcript) : {};
    const afterToolSummary = afterTest.transcript ? extractToolSummary(afterTest.transcript) : {};

    const beforeToolCount = Object.values(beforeToolSummary).reduce((a, b) => a + b, 0);
    const afterToolCount = Object.values(afterToolSummary).reduce((a, b) => a + b, 0);
    toolCountBefore += beforeToolCount;
    toolCountAfter += afterToolCount;

    let statusChange: TestDelta['status_change'] = 'unchanged';
    if (beforeTest) {
      if (!beforeTest.passed && afterTest.passed) { statusChange = 'improved'; improved++; }
      else if (beforeTest.passed && !afterTest.passed) { statusChange = 'regressed'; regressed++; }
      else { unchanged++; }
    } else {
      // New test — treat as unchanged (no prior data)
      unchanged++;
    }

    deltas.push({
      name: afterTest.name,
      before: {
        passed: beforeTest?.passed ?? false,
        cost_usd: beforeTest?.cost_usd ?? 0,
        turns_used: beforeTest?.turns_used,
        detection_rate: beforeTest?.detection_rate,
        tool_summary: beforeToolSummary,
      },
      after: {
        passed: afterTest.passed,
        cost_usd: afterTest.cost_usd,
        turns_used: afterTest.turns_used,
        detection_rate: afterTest.detection_rate,
        tool_summary: afterToolSummary,
      },
      status_change: statusChange,
    });

    beforeMap.delete(afterTest.name);
  }

  // Tests that were in before but not in after (removed tests)
  for (const [name, beforeTest] of beforeMap) {
    const beforeToolSummary = beforeTest.transcript ? extractToolSummary(beforeTest.transcript) : {};
    const beforeToolCount = Object.values(beforeToolSummary).reduce((a, b) => a + b, 0);
    toolCountBefore += beforeToolCount;
    unchanged++;
    deltas.push({
      name: `${name} (removed)`,
      before: {
        passed: beforeTest.passed,
        cost_usd: beforeTest.cost_usd,
        turns_used: beforeTest.turns_used,
        detection_rate: beforeTest.detection_rate,
        tool_summary: beforeToolSummary,
      },
      after: { passed: false, cost_usd: 0, tool_summary: {} },
      status_change: 'unchanged',
    });
  }

  return {
    before_file: beforeFile,
    after_file: afterFile,
    before_branch: before.branch,
    after_branch: after.branch,
    before_timestamp: before.timestamp,
    after_timestamp: after.timestamp,
    deltas,
    total_cost_delta: after.total_cost_usd - before.total_cost_usd,
    total_duration_delta: after.total_duration_ms - before.total_duration_ms,
    improved,
    regressed,
    unchanged,
    tool_count_before: toolCountBefore,
    tool_count_after: toolCountAfter,
  };
}

/**
 * Format a ComparisonResult as a readable string.
 */
export function formatComparison(c: ComparisonResult): string {
  const lines: string[] = [];
  const ts = c.before_timestamp ? c.before_timestamp.replace('T', ' ').slice(0, 16) : 'unknown';
  lines.push(`\nvs previous: ${c.before_branch}/${c.deltas.length ? 'eval' : ''} (${ts})`);
  lines.push('─'.repeat(70));

  // Per-test deltas
  for (const d of c.deltas) {
    const arrow = d.status_change === 'improved' ? '↑' : d.status_change === 'regressed' ? '↓' : '=';
    const beforeStatus = d.before.passed ? 'PASS' : 'FAIL';
    const afterStatus = d.after.passed ? 'PASS' : 'FAIL';

    let detail = '';
    if (d.before.detection_rate !== undefined || d.after.detection_rate !== undefined) {
      detail = ` ${d.before.detection_rate ?? '?'}→${d.after.detection_rate ?? '?'} det`;
    } else {
      const costBefore = d.before.cost_usd.toFixed(2);
      const costAfter = d.after.cost_usd.toFixed(2);
      detail = ` $${costBefore}→$${costAfter}`;
    }

    const name = d.name.length > 35 ? d.name.slice(0, 32) + '...' : d.name.padEnd(35);
    lines.push(` ${name} ${beforeStatus.padEnd(5)} → ${afterStatus.padEnd(5)} ${arrow}${detail}`);
  }

  lines.push('─'.repeat(70));

  // Totals
  const parts: string[] = [];
  if (c.improved > 0) parts.push(`${c.improved} improved`);
  if (c.regressed > 0) parts.push(`${c.regressed} regressed`);
  if (c.unchanged > 0) parts.push(`${c.unchanged} unchanged`);
  lines.push(` Status: ${parts.join(', ')}`);

  const costSign = c.total_cost_delta >= 0 ? '+' : '';
  lines.push(` Cost: ${costSign}$${c.total_cost_delta.toFixed(2)}`);

  const durDelta = Math.round(c.total_duration_delta / 1000);
  const durSign = durDelta >= 0 ? '+' : '';
  lines.push(` Duration: ${durSign}${durDelta}s`);

  const toolDelta = c.tool_count_after - c.tool_count_before;
  const toolSign = toolDelta >= 0 ? '+' : '';
  lines.push(` Tool calls: ${c.tool_count_before} → ${c.tool_count_after} (${toolSign}${toolDelta})`);

  // Tool breakdown (show tools that changed)
  const allTools = new Set<string>();
  for (const d of c.deltas) {
    for (const t of Object.keys(d.before.tool_summary || {})) allTools.add(t);
    for (const t of Object.keys(d.after.tool_summary || {})) allTools.add(t);
  }

  if (allTools.size > 0) {
    // Aggregate tool counts across all tests
    const totalBefore: Record<string, number> = {};
    const totalAfter: Record<string, number> = {};
    for (const d of c.deltas) {
      for (const [t, n] of Object.entries(d.before.tool_summary || {})) {
        totalBefore[t] = (totalBefore[t] || 0) + n;
      }
      for (const [t, n] of Object.entries(d.after.tool_summary || {})) {
        totalAfter[t] = (totalAfter[t] || 0) + n;
      }
    }

    for (const tool of [...allTools].sort()) {
      const b = totalBefore[tool] || 0;
      const a = totalAfter[tool] || 0;
      if (b !== a) {
        const d = a - b;
        lines.push(` ${tool}: ${b} → ${a} (${d >= 0 ? '+' : ''}${d})`);
      }
    }
  }

  return lines.join('\n');
}

// --- EvalCollector ---

function getGitInfo(): { branch: string; sha: string } {
  try {
    const branch = spawnSync('git', ['rev-parse', '--abbrev-ref', 'HEAD'], { stdio: 'pipe', timeout: 5000 });
    const sha = spawnSync('git', ['rev-parse', '--short', 'HEAD'], { stdio: 'pipe', timeout: 5000 });
    return {
      branch: branch.stdout?.toString().trim() || 'unknown',
      sha: sha.stdout?.toString().trim() || 'unknown',
    };
  } catch {
    return { branch: 'unknown', sha: 'unknown' };
  }
}

function getVersion(): string {
  try {
    const pkgPath = path.resolve(__dirname, '..', '..', 'package.json');
    const pkg = JSON.parse(fs.readFileSync(pkgPath, 'utf-8'));
    return pkg.version || 'unknown';
  } catch {
    return 'unknown';
  }
}

export class EvalCollector {
  private tier: 'e2e' | 'llm-judge';
  private tests: EvalTestEntry[] = [];
  private finalized = false;
  private evalDir: string;

  constructor(tier: 'e2e' | 'llm-judge', evalDir?: string) {
    this.tier = tier;
    this.evalDir = evalDir || DEFAULT_EVAL_DIR;
  }

  addTest(entry: EvalTestEntry): void {
    this.tests.push(entry);
  }

  async finalize(): Promise<string> {
    if (this.finalized) return '';
    this.finalized = true;

    const git = getGitInfo();
    const version = getVersion();
    const timestamp = new Date().toISOString();
    const totalCost = this.tests.reduce((s, t) => s + t.cost_usd, 0);
    const totalDuration = this.tests.reduce((s, t) => s + t.duration_ms, 0);
    const passed = this.tests.filter(t => t.passed).length;

    const result: EvalResult = {
      schema_version: SCHEMA_VERSION,
      version,
      branch: git.branch,
      git_sha: git.sha,
      timestamp,
      hostname: os.hostname(),
      tier: this.tier,
      total_tests: this.tests.length,
      passed,
      failed: this.tests.length - passed,
      total_cost_usd: Math.round(totalCost * 100) / 100,
      total_duration_ms: totalDuration,
      tests: this.tests,
    };

    // Write eval file
    fs.mkdirSync(this.evalDir, { recursive: true });
    const dateStr = timestamp.replace(/[:.]/g, '').replace('T', '-').slice(0, 15);
    const safeBranch = git.branch.replace(/[^a-zA-Z0-9._-]/g, '-');
    const filename = `${version}-${safeBranch}-${this.tier}-${dateStr}.json`;
    const filepath = path.join(this.evalDir, filename);
    fs.writeFileSync(filepath, JSON.stringify(result, null, 2) + '\n');

    // Print summary table
    this.printSummary(result, filepath, git);

    // Auto-compare with previous run
    try {
      const prevFile = findPreviousRun(this.evalDir, this.tier, git.branch, filepath);
      if (prevFile) {
        const prevResult: EvalResult = JSON.parse(fs.readFileSync(prevFile, 'utf-8'));
        const comparison = compareEvalResults(prevResult, result, prevFile, filepath);
        process.stderr.write(formatComparison(comparison) + '\n');
      } else {
        process.stderr.write('\nFirst run — no comparison available.\n');
      }
    } catch (err: any) {
      process.stderr.write(`\nCompare error: ${err.message}\n`);
    }

    return filepath;
  }

  private printSummary(result: EvalResult, filepath: string, git: { branch: string; sha: string }): void {
    const lines: string[] = [];
    lines.push('');
    lines.push(`Eval Results — v${result.version} @ ${git.branch} (${git.sha}) — ${this.tier}`);
    lines.push('═'.repeat(70));

    for (const t of this.tests) {
      const status = t.passed ? ' PASS ' : ' FAIL ';
      const cost = `$${t.cost_usd.toFixed(2)}`;

      let detail = '';
      if (t.detection_rate !== undefined) {
        detail = `${t.detection_rate}/${(t.detected_bugs?.length || 0) + (t.missed_bugs?.length || 0)} det`;
      } else if (t.turns_used !== undefined) {
        detail = `${t.turns_used} turns`;
      } else if (t.judge_scores) {
        const scores = Object.entries(t.judge_scores).map(([k, v]) => `${k[0]}:${v}`).join(' ');
        detail = scores;
      }

      const name = t.name.length > 38 ? t.name.slice(0, 35) + '...' : t.name.padEnd(38);
      lines.push(` ${name} ${status} ${cost.padStart(6)} ${detail}`);
    }

    lines.push('─'.repeat(70));
    const totalCost = `$${result.total_cost_usd.toFixed(2)}`;
    const totalDur = `${Math.round(result.total_duration_ms / 1000)}s`;
    lines.push(` Total: ${result.passed}/${result.total_tests} passed${' '.repeat(20)}${totalCost.padStart(6)} ${totalDur}`);
    lines.push(`Saved: ${filepath}`);

    process.stderr.write(lines.join('\n') + '\n');
  }
}
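The comparison helpers above are exported for reuse by the eval:compare CLI mentioned in the module doc. A minimal sketch of what such a wrapper could look like (hypothetical file name and argv handling; only the imports and the two exported functions come from this commit):

    // eval-compare.ts (hypothetical); run as: bun eval-compare.ts <before.json> <after.json>
    import * as fs from 'fs';
    import { compareEvalResults, formatComparison } from './eval-store';
    import type { EvalResult } from './eval-store';

    const [beforeFile, afterFile] = process.argv.slice(2);
    const before: EvalResult = JSON.parse(fs.readFileSync(beforeFile, 'utf-8'));
    const after: EvalResult = JSON.parse(fs.readFileSync(afterFile, 'utf-8'));
    console.log(formatComparison(compareEvalResults(before, after, beforeFile, afterFile)));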
skill-e2e.test.ts (+48 -1)
@@ -1,6 +1,9 @@
import { describe, test, expect, beforeAll, afterAll } from 'bun:test';
import { runSkillTest } from './helpers/session-runner';
import type { SkillTestResult } from './helpers/session-runner';
import { outcomeJudge } from './helpers/llm-judge';
import { EvalCollector } from './helpers/eval-store';
import type { EvalTestEntry } from './helpers/eval-store';
import { startTestServer } from '../browse/test/test-server';
import * as fs from 'fs';
import * as path from 'path';
@@ -12,6 +15,24 @@ const ROOT = path.resolve(import.meta.dir, '..');
const evalsEnabled = !!process.env.EVALS;
const describeE2E = evalsEnabled ? describe : describe.skip;

// Eval result collector — accumulates test results, writes to ~/.gstack-dev/evals/ on finalize
const evalCollector = evalsEnabled ? new EvalCollector('e2e') : null;

/** DRY helper to record an E2E test result into the eval collector. */
function recordE2E(name: string, suite: string, result: SkillTestResult, extra?: Partial<EvalTestEntry>) {
  evalCollector?.addTest({
    name, suite, tier: 'e2e',
    passed: result.exitReason === 'success' && result.browseErrors.length === 0,
    duration_ms: result.duration,
    cost_usd: result.costEstimate.estimatedCost,
    transcript: result.transcript,
    output: result.output?.slice(0, 2000),
    turns_used: result.costEstimate.turnsUsed,
    browse_errors: result.browseErrors,
    ...extra,
  });
}

let testServer: ReturnType<typeof startTestServer>;
let tmpDir: string;
const browseBin = path.resolve(ROOT, 'browse', 'dist', 'browse');
@@ -110,6 +131,7 @@ Report the results of each command.`,
  });

  logCost('browse basic', result);
  recordE2E('browse basic commands', 'Skill E2E tests', result);
  expect(result.browseErrors).toHaveLength(0);
  expect(result.exitReason).toBe('success');
}, 90_000);
@@ -129,11 +151,11 @@ Report what each command returned.`,
  });

  logCost('browse snapshot', result);
  recordE2E('browse snapshot flags', 'Skill E2E tests', result);
  expect(result.browseErrors).toHaveLength(0);
  expect(result.exitReason).toBe('success');
}, 90_000);

test('agent discovers browse binary via SKILL.md setup block', async () => {
  const skillMd = fs.readFileSync(path.join(ROOT, 'SKILL.md'), 'utf-8');
  const setupStart = skillMd.indexOf('## SETUP');
@@ -156,6 +178,7 @@ Report whether it worked.`,
    timeout: 60_000,
  });

  recordE2E('SKILL.md setup block discovery', 'Skill E2E tests', result);
  expect(result.browseErrors).toHaveLength(0);
  expect(result.exitReason).toBe('success');
}, 90_000);
@@ -182,6 +205,7 @@ Report the exact output. Do NOT try to fix or install anything — just report w

  // Agent should see NEEDS_SETUP (not crash or guess wrong paths)
  const allText = result.output || '';
  recordE2E('SKILL.md NEEDS_SETUP', 'Skill E2E tests', result);
  expect(allText).toContain('NEEDS_SETUP');

  // Clean up
@@ -210,6 +234,7 @@ Report the exact output — either "READY: <path>" or "NEEDS_SETUP".`,

  // Should either find global binary (READY) or show NEEDS_SETUP — not crash
  const allText = result.output || '';
  recordE2E('SKILL.md outside git repo', 'Skill E2E tests', result);
  expect(allText).toMatch(/READY|NEEDS_SETUP/);

  // Clean up
@@ -254,6 +279,7 @@ Write your report to ${qaDir}/qa-reports/qa-report.md`,
  });

  logCost('/qa quick', result);
  recordE2E('/qa quick', 'QA skill E2E', result);
  expect(result.browseErrors).toHaveLength(0);
  expect(result.exitReason).toBe('success');
}, 240_000);
@@ -311,6 +337,7 @@ Write your review findings to ${reviewDir}/review-output.md`,
  });

  logCost('/review', result);
  recordE2E('/review SQL injection', 'Review skill E2E', result);
  expect(result.exitReason).toBe('success');
}, 120_000);
});
@@ -392,6 +419,15 @@ Be thorough: check console, check all links, check all forms, check mobile viewp
  const judgeResult = await outcomeJudge(groundTruth, report);
  console.log(`${label} outcome:`, JSON.stringify(judgeResult, null, 2));

  // Record to eval collector with outcome judge results
  recordE2E(`/qa ${label}`, 'Planted-bug outcome evals', result, {
    detection_rate: judgeResult.detection_rate,
    false_positives: judgeResult.false_positives,
    evidence_quality: judgeResult.evidence_quality,
    detected_bugs: judgeResult.detected,
    missed_bugs: judgeResult.missed,
  });

  // Diagnostic dump on failure (decision 1C)
  if (judgeResult.detection_rate < groundTruth.minimum_detection || judgeResult.false_positives > groundTruth.max_false_positives) {
    dumpOutcomeDiagnostic(outcomeDir, label, report, judgeResult);
@@ -421,3 +457,14 @@
  // Ship E2E deferred — too complex (requires full git + test suite + VERSION + CHANGELOG)
  test.todo('/ship completes without browse errors');
});

// Module-level afterAll — finalize eval collector after all tests complete
afterAll(async () => {
  if (evalCollector) {
    try {
      await evalCollector.finalize();
    } catch (err) {
      console.error('Failed to save eval results:', err);
    }
  }
});

skill-llm-eval.test.ts (+128 -19)
@@ -10,21 +10,26 @@
 * Cost: ~$0.05-0.15 per run (sonnet)
 */

-import { describe, test, expect } from 'bun:test';
+import { describe, test, expect, afterAll } from 'bun:test';
import Anthropic from '@anthropic-ai/sdk';
import * as fs from 'fs';
import * as path from 'path';
import { callJudge, judge } from './helpers/llm-judge';
import type { JudgeScore } from './helpers/llm-judge';
import { EvalCollector } from './helpers/eval-store';

const ROOT = path.resolve(import.meta.dir, '..');
// Run when EVALS=1 is set (requires ANTHROPIC_API_KEY in env)
-const describeEval = process.env.EVALS ? describe : describe.skip;
+const evalsEnabled = !!process.env.EVALS;
+const describeEval = evalsEnabled ? describe : describe.skip;

// Eval result collector
const evalCollector = evalsEnabled ? new EvalCollector('llm-judge') : null;

describeEval('LLM-as-judge quality evals', () => {
  test('command reference table scores >= 4 on all dimensions', async () => {
    const t0 = Date.now();
    const content = fs.readFileSync(path.join(ROOT, 'SKILL.md'), 'utf-8');
    // Extract just the command reference section
    const start = content.indexOf('## Command Reference');
    const end = content.indexOf('## Tips');
    const section = content.slice(start, end);
@@ -32,12 +37,24 @@ describeEval('LLM-as-judge quality evals', () => {
    const scores = await judge('command reference table', section);
    console.log('Command reference scores:', JSON.stringify(scores, null, 2));

    evalCollector?.addTest({
      name: 'command reference table',
      suite: 'LLM-as-judge quality evals',
      tier: 'llm-judge',
      passed: scores.clarity >= 4 && scores.completeness >= 4 && scores.actionability >= 4,
      duration_ms: Date.now() - t0,
      cost_usd: 0.02,
      judge_scores: { clarity: scores.clarity, completeness: scores.completeness, actionability: scores.actionability },
      judge_reasoning: scores.reasoning,
    });

    expect(scores.clarity).toBeGreaterThanOrEqual(4);
    expect(scores.completeness).toBeGreaterThanOrEqual(4);
    expect(scores.actionability).toBeGreaterThanOrEqual(4);
  }, 30_000);

  test('snapshot flags section scores >= 4 on all dimensions', async () => {
    const t0 = Date.now();
    const content = fs.readFileSync(path.join(ROOT, 'SKILL.md'), 'utf-8');
    const start = content.indexOf('## Snapshot System');
    const end = content.indexOf('## Command Reference');
@@ -46,26 +63,49 @@ describeEval('LLM-as-judge quality evals', () => {
    const scores = await judge('snapshot flags reference', section);
    console.log('Snapshot flags scores:', JSON.stringify(scores, null, 2));

    evalCollector?.addTest({
      name: 'snapshot flags reference',
      suite: 'LLM-as-judge quality evals',
      tier: 'llm-judge',
      passed: scores.clarity >= 4 && scores.completeness >= 4 && scores.actionability >= 4,
      duration_ms: Date.now() - t0,
      cost_usd: 0.02,
      judge_scores: { clarity: scores.clarity, completeness: scores.completeness, actionability: scores.actionability },
      judge_reasoning: scores.reasoning,
    });

    expect(scores.clarity).toBeGreaterThanOrEqual(4);
    expect(scores.completeness).toBeGreaterThanOrEqual(4);
    expect(scores.actionability).toBeGreaterThanOrEqual(4);
  }, 30_000);

  test('browse/SKILL.md overall scores >= 4', async () => {
    const t0 = Date.now();
    const content = fs.readFileSync(path.join(ROOT, 'browse', 'SKILL.md'), 'utf-8');
    // Just the reference sections (skip examples/patterns)
    const start = content.indexOf('## Snapshot Flags');
    const section = content.slice(start);

    const scores = await judge('browse skill reference (flags + commands)', section);
    console.log('Browse SKILL.md scores:', JSON.stringify(scores, null, 2));

    evalCollector?.addTest({
      name: 'browse/SKILL.md reference',
      suite: 'LLM-as-judge quality evals',
      tier: 'llm-judge',
      passed: scores.clarity >= 4 && scores.completeness >= 4 && scores.actionability >= 4,
      duration_ms: Date.now() - t0,
      cost_usd: 0.02,
      judge_scores: { clarity: scores.clarity, completeness: scores.completeness, actionability: scores.actionability },
      judge_reasoning: scores.reasoning,
    });

    expect(scores.clarity).toBeGreaterThanOrEqual(4);
    expect(scores.completeness).toBeGreaterThanOrEqual(4);
    expect(scores.actionability).toBeGreaterThanOrEqual(4);
  }, 30_000);

  test('setup block scores >= 4 on actionability and clarity', async () => {
    const t0 = Date.now();
    const content = fs.readFileSync(path.join(ROOT, 'SKILL.md'), 'utf-8');
    const setupStart = content.indexOf('## SETUP');
    const setupEnd = content.indexOf('## IMPORTANT');
@@ -74,13 +114,23 @@ describeEval('LLM-as-judge quality evals', () => {
    const scores = await judge('setup/binary discovery instructions', section);
    console.log('Setup block scores:', JSON.stringify(scores, null, 2));

    evalCollector?.addTest({
      name: 'setup block',
      suite: 'LLM-as-judge quality evals',
      tier: 'llm-judge',
      passed: scores.actionability >= 4 && scores.clarity >= 4,
      duration_ms: Date.now() - t0,
      cost_usd: 0.02,
      judge_scores: { clarity: scores.clarity, completeness: scores.completeness, actionability: scores.actionability },
      judge_reasoning: scores.reasoning,
    });

    expect(scores.actionability).toBeGreaterThanOrEqual(4);
    expect(scores.clarity).toBeGreaterThanOrEqual(4);
  }, 30_000);

  test('regression check: compare branch vs baseline quality', async () => {
    // This test compares the generated output against the hand-maintained
    // baseline from main. The generated version should score equal or higher.
    const t0 = Date.now();
    const generated = fs.readFileSync(path.join(ROOT, 'SKILL.md'), 'utf-8');
    const genStart = generated.indexOf('## Command Reference');
    const genEnd = generated.indexOf('## Tips');
@@ -151,7 +201,17 @@ Scores are 1-5 overall quality.`,
    const result = JSON.parse(jsonMatch[0]);
    console.log('Regression comparison:', JSON.stringify(result, null, 2));

    // Generated version should be at least as good as hand-maintained
    evalCollector?.addTest({
      name: 'regression vs baseline',
      suite: 'LLM-as-judge quality evals',
      tier: 'llm-judge',
      passed: result.b_score >= result.a_score,
      duration_ms: Date.now() - t0,
      cost_usd: 0.02,
      judge_scores: { a_score: result.a_score, b_score: result.b_score },
      judge_reasoning: result.reasoning,
    });

    expect(result.b_score).toBeGreaterThanOrEqual(result.a_score);
  }, 30_000);
});
@@ -162,13 +222,11 @@ describeEval('QA skill quality evals', () => {
  const qaContent = fs.readFileSync(path.join(ROOT, 'qa', 'SKILL.md'), 'utf-8');

  test('qa/SKILL.md workflow quality scores >= 4', async () => {
    // Extract the workflow section (Phases 1-7)
    const t0 = Date.now();
    const start = qaContent.indexOf('## Workflow');
    const end = qaContent.indexOf('## Health Score Rubric');
    const section = qaContent.slice(start, end);

    // Use workflow-specific prompt (not the CLI-reference judge, since this is a
    // workflow doc that references $B commands defined in a separate browse SKILL.md)
    const scores = await callJudge<JudgeScore>(`You are evaluating the quality of a QA testing workflow document for an AI coding agent.

The agent reads this document to learn how to systematically QA test a web application. The workflow references
@@ -188,16 +246,27 @@ Here is the QA workflow to evaluate:
${section}`);
    console.log('QA workflow scores:', JSON.stringify(scores, null, 2));

    evalCollector?.addTest({
      name: 'qa/SKILL.md workflow',
      suite: 'QA skill quality evals',
      tier: 'llm-judge',
      passed: scores.clarity >= 4 && scores.completeness >= 4 && scores.actionability >= 4,
      duration_ms: Date.now() - t0,
      cost_usd: 0.02,
      judge_scores: { clarity: scores.clarity, completeness: scores.completeness, actionability: scores.actionability },
      judge_reasoning: scores.reasoning,
    });

    expect(scores.clarity).toBeGreaterThanOrEqual(4);
    expect(scores.completeness).toBeGreaterThanOrEqual(4);
    expect(scores.actionability).toBeGreaterThanOrEqual(4);
  }, 30_000);

  test('qa/SKILL.md health score rubric is unambiguous', async () => {
    const t0 = Date.now();
    const start = qaContent.indexOf('## Health Score Rubric');
    const section = qaContent.slice(start);

    // Use rubric-specific prompt
    const scores = await callJudge<JudgeScore>(`You are evaluating a health score rubric that an AI agent must follow to compute a numeric QA score.

The agent uses this rubric after QA testing a website. It needs to:
@@ -218,11 +287,18 @@ Here is the rubric to evaluate:
${section}`);
    console.log('QA health rubric scores:', JSON.stringify(scores, null, 2));

    evalCollector?.addTest({
      name: 'qa/SKILL.md health rubric',
      suite: 'QA skill quality evals',
      tier: 'llm-judge',
      passed: scores.clarity >= 4 && scores.completeness >= 3 && scores.actionability >= 4,
      duration_ms: Date.now() - t0,
      cost_usd: 0.02,
      judge_scores: { clarity: scores.clarity, completeness: scores.completeness, actionability: scores.actionability },
      judge_reasoning: scores.reasoning,
    });

    expect(scores.clarity).toBeGreaterThanOrEqual(4);
    // Completeness threshold is 3 — the rubric intentionally leaves some edge cases
    // to agent judgment (e.g., partial testing, cross-category findings). The judge
    // consistently flags these as gaps, but over-specifying would make the rubric
    // rigid and harder to follow. Clarity + actionability >= 4 is what matters.
    expect(scores.completeness).toBeGreaterThanOrEqual(3);
    expect(scores.actionability).toBeGreaterThanOrEqual(4);
  }, 30_000);
@@ -232,12 +308,12 @@ ${section}`);

describeEval('Cross-skill consistency evals', () => {
  test('greptile-history patterns are consistent across all skills', async () => {
    const t0 = Date.now();
    const reviewContent = fs.readFileSync(path.join(ROOT, 'review', 'SKILL.md'), 'utf-8');
    const shipContent = fs.readFileSync(path.join(ROOT, 'ship', 'SKILL.md'), 'utf-8');
    const triageContent = fs.readFileSync(path.join(ROOT, 'review', 'greptile-triage.md'), 'utf-8');
    const retroContent = fs.readFileSync(path.join(ROOT, 'retro', 'SKILL.md'), 'utf-8');

    // Extract greptile-related lines from each file
    const extractGrepLines = (content: string, filename: string) => {
      const lines = content.split('\n')
        .filter(l => /greptile|history\.md|REMOTE_SLUG/i.test(l))
@@ -277,6 +353,17 @@ score (1-5): 5 = perfectly consistent, 1 = contradictory`);

    console.log('Cross-skill consistency:', JSON.stringify(result, null, 2));

    evalCollector?.addTest({
      name: 'cross-skill greptile consistency',
      suite: 'Cross-skill consistency evals',
      tier: 'llm-judge',
      passed: result.consistent && result.score >= 4,
      duration_ms: Date.now() - t0,
      cost_usd: 0.02,
      judge_scores: { consistency_score: result.score },
      judge_reasoning: result.reasoning,
    });

    expect(result.consistent).toBe(true);
    expect(result.score).toBeGreaterThanOrEqual(4);
  }, 30_000);
@@ -288,6 +375,7 @@ describeEval('Baseline score pinning', () => {
  const baselinesPath = path.join(ROOT, 'test', 'fixtures', 'eval-baselines.json');

  test('LLM eval scores do not regress below baselines', async () => {
    const t0 = Date.now();
    if (!fs.existsSync(baselinesPath)) {
      console.log('No baseline file found — skipping pinning check');
      return;
@@ -296,7 +384,6 @@
    const baselines = JSON.parse(fs.readFileSync(baselinesPath, 'utf-8'));
    const regressions: string[] = [];

    // Test command reference
    const skillContent = fs.readFileSync(path.join(ROOT, 'SKILL.md'), 'utf-8');
    const cmdStart = skillContent.indexOf('## Command Reference');
    const cmdEnd = skillContent.indexOf('## Tips');
@@ -309,7 +396,6 @@
      }
    }

    // Update baselines if requested
    if (process.env.UPDATE_BASELINES) {
      baselines.command_reference = {
        clarity: cmdScores.clarity,
@@ -320,8 +406,31 @@
      console.log('Updated eval baselines');
    }

-    if (regressions.length > 0) {
+    const passed = regressions.length === 0;
    evalCollector?.addTest({
      name: 'baseline score pinning',
      suite: 'Baseline score pinning',
      tier: 'llm-judge',
      passed,
      duration_ms: Date.now() - t0,
      cost_usd: 0.02,
      judge_scores: { clarity: cmdScores.clarity, completeness: cmdScores.completeness, actionability: cmdScores.actionability },
      judge_reasoning: passed ? 'All scores at or above baseline' : regressions.join('; '),
    });

    if (!passed) {
      throw new Error(`Score regressions detected:\n${regressions.join('\n')}`);
    }
  }, 60_000);
});

// Module-level afterAll — finalize eval collector after all tests complete
afterAll(async () => {
  if (evalCollector) {
    try {
      await evalCollector.finalize();
    } catch (err) {
      console.error('Failed to save eval results:', err);
    }
  }
});