feat: eval persistence with auto-compare against previous run

EvalCollector accumulates test results during eval runs, writes JSON to
~/.gstack-dev/evals/{version}-{branch}-{tier}-{timestamp}.json, prints
a summary table, and automatically compares against the previous run.

- EvalCollector class with addTest() / finalize() / summary table
- findPreviousRun() prefers same branch, falls back to any branch
- compareEvalResults() matches tests by name, detects improved/regressed
- extractToolSummary() counts tool types from transcript events
- formatComparison() renders delta table with per-test + aggregate diffs
- Wire into skill-e2e.test.ts (recordE2E helper) and skill-llm-eval.test.ts
- 19 unit tests for collector + comparison functions
- schema_version: 1 for forward compatibility

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Garry Tan
2026-03-14 03:49:47 -05:00
parent e7347c2f8f
commit 84f52f3bad
4 changed files with 975 additions and 20 deletions
+48 -1
View File
@@ -1,6 +1,9 @@
import { describe, test, expect, beforeAll, afterAll } from 'bun:test';
import { runSkillTest } from './helpers/session-runner';
import type { SkillTestResult } from './helpers/session-runner';
import { outcomeJudge } from './helpers/llm-judge';
import { EvalCollector } from './helpers/eval-store';
import type { EvalTestEntry } from './helpers/eval-store';
import { startTestServer } from '../browse/test/test-server';
import * as fs from 'fs';
import * as path from 'path';
@@ -12,6 +15,24 @@ const ROOT = path.resolve(import.meta.dir, '..');
const evalsEnabled = !!process.env.EVALS;
const describeE2E = evalsEnabled ? describe : describe.skip;
// Eval result collector — accumulates test results, writes to ~/.gstack-dev/evals/ on finalize
const evalCollector = evalsEnabled ? new EvalCollector('e2e') : null;
/** DRY helper to record an E2E test result into the eval collector. */
function recordE2E(name: string, suite: string, result: SkillTestResult, extra?: Partial<EvalTestEntry>) {
evalCollector?.addTest({
name, suite, tier: 'e2e',
passed: result.exitReason === 'success' && result.browseErrors.length === 0,
duration_ms: result.duration,
cost_usd: result.costEstimate.estimatedCost,
transcript: result.transcript,
output: result.output?.slice(0, 2000),
turns_used: result.costEstimate.turnsUsed,
browse_errors: result.browseErrors,
...extra,
});
}
let testServer: ReturnType<typeof startTestServer>;
let tmpDir: string;
const browseBin = path.resolve(ROOT, 'browse', 'dist', 'browse');
@@ -110,6 +131,7 @@ Report the results of each command.`,
});
logCost('browse basic', result);
recordE2E('browse basic commands', 'Skill E2E tests', result);
expect(result.browseErrors).toHaveLength(0);
expect(result.exitReason).toBe('success');
}, 90_000);
@@ -129,11 +151,11 @@ Report what each command returned.`,
});
logCost('browse snapshot', result);
recordE2E('browse snapshot flags', 'Skill E2E tests', result);
expect(result.browseErrors).toHaveLength(0);
expect(result.exitReason).toBe('success');
}, 90_000);
test('agent discovers browse binary via SKILL.md setup block', async () => {
const skillMd = fs.readFileSync(path.join(ROOT, 'SKILL.md'), 'utf-8');
const setupStart = skillMd.indexOf('## SETUP');
@@ -156,6 +178,7 @@ Report whether it worked.`,
timeout: 60_000,
});
recordE2E('SKILL.md setup block discovery', 'Skill E2E tests', result);
expect(result.browseErrors).toHaveLength(0);
expect(result.exitReason).toBe('success');
}, 90_000);
@@ -182,6 +205,7 @@ Report the exact output. Do NOT try to fix or install anything — just report w
// Agent should see NEEDS_SETUP (not crash or guess wrong paths)
const allText = result.output || '';
recordE2E('SKILL.md NEEDS_SETUP', 'Skill E2E tests', result);
expect(allText).toContain('NEEDS_SETUP');
// Clean up
@@ -210,6 +234,7 @@ Report the exact output — either "READY: <path>" or "NEEDS_SETUP".`,
// Should either find global binary (READY) or show NEEDS_SETUP — not crash
const allText = result.output || '';
recordE2E('SKILL.md outside git repo', 'Skill E2E tests', result);
expect(allText).toMatch(/READY|NEEDS_SETUP/);
// Clean up
@@ -254,6 +279,7 @@ Write your report to ${qaDir}/qa-reports/qa-report.md`,
});
logCost('/qa quick', result);
recordE2E('/qa quick', 'QA skill E2E', result);
expect(result.browseErrors).toHaveLength(0);
expect(result.exitReason).toBe('success');
}, 240_000);
@@ -311,6 +337,7 @@ Write your review findings to ${reviewDir}/review-output.md`,
});
logCost('/review', result);
recordE2E('/review SQL injection', 'Review skill E2E', result);
expect(result.exitReason).toBe('success');
}, 120_000);
});
@@ -392,6 +419,15 @@ Be thorough: check console, check all links, check all forms, check mobile viewp
const judgeResult = await outcomeJudge(groundTruth, report);
console.log(`${label} outcome:`, JSON.stringify(judgeResult, null, 2));
// Record to eval collector with outcome judge results
recordE2E(`/qa ${label}`, 'Planted-bug outcome evals', result, {
detection_rate: judgeResult.detection_rate,
false_positives: judgeResult.false_positives,
evidence_quality: judgeResult.evidence_quality,
detected_bugs: judgeResult.detected,
missed_bugs: judgeResult.missed,
});
// Diagnostic dump on failure (decision 1C)
if (judgeResult.detection_rate < groundTruth.minimum_detection || judgeResult.false_positives > groundTruth.max_false_positives) {
dumpOutcomeDiagnostic(outcomeDir, label, report, judgeResult);
@@ -421,3 +457,14 @@ Be thorough: check console, check all links, check all forms, check mobile viewp
// Ship E2E deferred — too complex (requires full git + test suite + VERSION + CHANGELOG)
test.todo('/ship completes without browse errors');
});
// Module-level afterAll — finalize eval collector after all tests complete
afterAll(async () => {
if (evalCollector) {
try {
await evalCollector.finalize();
} catch (err) {
console.error('Failed to save eval results:', err);
}
}
});