mirror of
https://github.com/garrytan/gstack.git
synced 2026-05-02 03:35:09 +02:00
f3ee0ee28a
* feat: browser ref staleness detection via async count() validation resolveRef() now checks element count to detect stale refs after page mutations (e.g. SPA navigation). RefEntry stores role+name metadata for better diagnostics. 3 new snapshot tests for staleness detection. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> * feat: qa-only skill, qa fix loop, plan-to-QA artifact flow Add /qa-only (report-only, Edit tool blocked), restructure /qa with find-fix-verify cycle, add {{QA_METHODOLOGY}} DRY placeholder for shared methodology. /plan-eng-review now writes test-plan artifacts to ~/.gstack/projects/<slug>/ for QA consumption. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> * feat: eval efficiency metrics — turns, duration, commentary across all surfaces Add generateCommentary() for natural-language delta interpretation, per-test turns/duration in comparison and summary output, judgePassed unit tests, 3 new E2E tests (qa-only, qa fix loop, plan artifact). Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> * chore: bump version and changelog (v0.4.0) Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> * docs: update ARCHITECTURE, BROWSER, CONTRIBUTING, README for v0.4.0 - ARCHITECTURE: add ref staleness detection section, update RefEntry type - BROWSER: add ref staleness paragraph to snapshot system docs - CONTRIBUTING: update eval tool descriptions with commentary feature - README: fix missing qa-only in project-local uninstall command Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> * docs: add user-facing benefit descriptions to v0.4.0 changelog Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> --------- Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
682 lines
23 KiB
TypeScript
682 lines
23 KiB
TypeScript
/**
|
|
* Eval result persistence and comparison.
|
|
*
|
|
* EvalCollector accumulates test results, writes them to
|
|
* ~/.gstack-dev/evals/{version}-{branch}-{tier}-{timestamp}.json,
|
|
* prints a summary table, and auto-compares with the previous run.
|
|
*
|
|
* Comparison functions are exported for reuse by the eval:compare CLI.
|
|
*/
|
|
|
|
import * as fs from 'fs';
|
|
import * as path from 'path';
|
|
import * as os from 'os';
|
|
import { spawnSync } from 'child_process';
|
|
|
|
const SCHEMA_VERSION = 1;
|
|
const DEFAULT_EVAL_DIR = path.join(os.homedir(), '.gstack-dev', 'evals');
|
|
|
|
// --- Interfaces ---
|
|
|
|
export interface EvalTestEntry {
|
|
name: string;
|
|
suite: string;
|
|
tier: 'e2e' | 'llm-judge';
|
|
passed: boolean;
|
|
duration_ms: number;
|
|
cost_usd: number;
|
|
|
|
// E2E
|
|
transcript?: any[];
|
|
prompt?: string;
|
|
output?: string;
|
|
turns_used?: number;
|
|
browse_errors?: string[];
|
|
|
|
// LLM judge
|
|
judge_scores?: Record<string, number>;
|
|
judge_reasoning?: string;
|
|
|
|
// Machine-readable diagnostics
|
|
exit_reason?: string; // 'success' | 'timeout' | 'error_max_turns' | 'exit_code_N'
|
|
timeout_at_turn?: number; // which turn was active when timeout hit
|
|
last_tool_call?: string; // e.g. "Write(review-output.md)"
|
|
|
|
// Outcome eval
|
|
detection_rate?: number;
|
|
false_positives?: number;
|
|
evidence_quality?: number;
|
|
detected_bugs?: string[];
|
|
missed_bugs?: string[];
|
|
|
|
error?: string;
|
|
}
|
|
|
|
export interface EvalResult {
|
|
schema_version: number;
|
|
version: string;
|
|
branch: string;
|
|
git_sha: string;
|
|
timestamp: string;
|
|
hostname: string;
|
|
tier: 'e2e' | 'llm-judge';
|
|
total_tests: number;
|
|
passed: number;
|
|
failed: number;
|
|
total_cost_usd: number;
|
|
total_duration_ms: number;
|
|
tests: EvalTestEntry[];
|
|
_partial?: boolean; // true for incremental saves, absent in final
|
|
}
|
|
|
|
export interface TestDelta {
|
|
name: string;
|
|
before: { passed: boolean; cost_usd: number; turns_used?: number; duration_ms?: number;
|
|
detection_rate?: number; tool_summary?: Record<string, number> };
|
|
after: { passed: boolean; cost_usd: number; turns_used?: number; duration_ms?: number;
|
|
detection_rate?: number; tool_summary?: Record<string, number> };
|
|
status_change: 'improved' | 'regressed' | 'unchanged';
|
|
}
|
|
|
|
export interface ComparisonResult {
|
|
before_file: string;
|
|
after_file: string;
|
|
before_branch: string;
|
|
after_branch: string;
|
|
before_timestamp: string;
|
|
after_timestamp: string;
|
|
deltas: TestDelta[];
|
|
total_cost_delta: number;
|
|
total_duration_delta: number;
|
|
improved: number;
|
|
regressed: number;
|
|
unchanged: number;
|
|
tool_count_before: number;
|
|
tool_count_after: number;
|
|
}
|
|
|
|
// --- Shared helpers ---
|
|
|
|
/**
|
|
* Determine if a planted-bug eval passed based on judge results vs ground truth thresholds.
|
|
* Centralizes the pass/fail logic so all planted-bug tests use the same criteria.
|
|
*/
|
|
export function judgePassed(
|
|
judgeResult: { detection_rate: number; false_positives: number; evidence_quality: number },
|
|
groundTruth: { minimum_detection: number; max_false_positives: number },
|
|
): boolean {
|
|
return judgeResult.detection_rate >= groundTruth.minimum_detection
|
|
&& judgeResult.false_positives <= groundTruth.max_false_positives
|
|
&& judgeResult.evidence_quality >= 2;
|
|
}
|
|
|
|
// --- Comparison functions (exported for eval:compare CLI) ---
|
|
|
|
/**
|
|
* Extract tool call counts from a transcript.
|
|
* Returns e.g. { Bash: 8, Read: 3, Write: 1 }.
|
|
*/
|
|
export function extractToolSummary(transcript: any[]): Record<string, number> {
|
|
const counts: Record<string, number> = {};
|
|
for (const event of transcript) {
|
|
if (event.type === 'assistant') {
|
|
const content = event.message?.content || [];
|
|
for (const item of content) {
|
|
if (item.type === 'tool_use') {
|
|
const name = item.name || 'unknown';
|
|
counts[name] = (counts[name] || 0) + 1;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
return counts;
|
|
}
|
|
|
|
/**
|
|
* Find the most recent prior eval file for comparison.
|
|
* Prefers same branch, falls back to any branch.
|
|
*/
|
|
export function findPreviousRun(
|
|
evalDir: string,
|
|
tier: string,
|
|
branch: string,
|
|
excludeFile: string,
|
|
): string | null {
|
|
let files: string[];
|
|
try {
|
|
files = fs.readdirSync(evalDir).filter(f => f.endsWith('.json'));
|
|
} catch {
|
|
return null; // dir doesn't exist
|
|
}
|
|
|
|
// Parse top-level fields from each file (cheap — no full tests array needed)
|
|
const entries: Array<{ file: string; branch: string; timestamp: string }> = [];
|
|
for (const file of files) {
|
|
if (file === path.basename(excludeFile)) continue;
|
|
const fullPath = path.join(evalDir, file);
|
|
try {
|
|
const raw = fs.readFileSync(fullPath, 'utf-8');
|
|
// Quick parse — only grab the fields we need
|
|
const data = JSON.parse(raw);
|
|
if (data.tier !== tier) continue;
|
|
entries.push({ file: fullPath, branch: data.branch || '', timestamp: data.timestamp || '' });
|
|
} catch { continue; }
|
|
}
|
|
|
|
if (entries.length === 0) return null;
|
|
|
|
// Sort by timestamp descending
|
|
entries.sort((a, b) => b.timestamp.localeCompare(a.timestamp));
|
|
|
|
// Prefer same branch
|
|
const sameBranch = entries.find(e => e.branch === branch);
|
|
if (sameBranch) return sameBranch.file;
|
|
|
|
// Fallback: any branch
|
|
return entries[0].file;
|
|
}
|
|
|
|
/**
|
|
* Compare two eval results. Matches tests by name.
|
|
*/
|
|
export function compareEvalResults(
|
|
before: EvalResult,
|
|
after: EvalResult,
|
|
beforeFile: string,
|
|
afterFile: string,
|
|
): ComparisonResult {
|
|
const deltas: TestDelta[] = [];
|
|
let improved = 0, regressed = 0, unchanged = 0;
|
|
let toolCountBefore = 0, toolCountAfter = 0;
|
|
|
|
// Index before tests by name
|
|
const beforeMap = new Map<string, EvalTestEntry>();
|
|
for (const t of before.tests) {
|
|
beforeMap.set(t.name, t);
|
|
}
|
|
|
|
// Walk after tests, match by name
|
|
for (const afterTest of after.tests) {
|
|
const beforeTest = beforeMap.get(afterTest.name);
|
|
const beforeToolSummary = beforeTest?.transcript ? extractToolSummary(beforeTest.transcript) : {};
|
|
const afterToolSummary = afterTest.transcript ? extractToolSummary(afterTest.transcript) : {};
|
|
|
|
const beforeToolCount = Object.values(beforeToolSummary).reduce((a, b) => a + b, 0);
|
|
const afterToolCount = Object.values(afterToolSummary).reduce((a, b) => a + b, 0);
|
|
toolCountBefore += beforeToolCount;
|
|
toolCountAfter += afterToolCount;
|
|
|
|
let statusChange: TestDelta['status_change'] = 'unchanged';
|
|
if (beforeTest) {
|
|
if (!beforeTest.passed && afterTest.passed) { statusChange = 'improved'; improved++; }
|
|
else if (beforeTest.passed && !afterTest.passed) { statusChange = 'regressed'; regressed++; }
|
|
else { unchanged++; }
|
|
} else {
|
|
// New test — treat as unchanged (no prior data)
|
|
unchanged++;
|
|
}
|
|
|
|
deltas.push({
|
|
name: afterTest.name,
|
|
before: {
|
|
passed: beforeTest?.passed ?? false,
|
|
cost_usd: beforeTest?.cost_usd ?? 0,
|
|
turns_used: beforeTest?.turns_used,
|
|
duration_ms: beforeTest?.duration_ms,
|
|
detection_rate: beforeTest?.detection_rate,
|
|
tool_summary: beforeToolSummary,
|
|
},
|
|
after: {
|
|
passed: afterTest.passed,
|
|
cost_usd: afterTest.cost_usd,
|
|
turns_used: afterTest.turns_used,
|
|
duration_ms: afterTest.duration_ms,
|
|
detection_rate: afterTest.detection_rate,
|
|
tool_summary: afterToolSummary,
|
|
},
|
|
status_change: statusChange,
|
|
});
|
|
|
|
beforeMap.delete(afterTest.name);
|
|
}
|
|
|
|
// Tests that were in before but not in after (removed tests)
|
|
for (const [name, beforeTest] of beforeMap) {
|
|
const beforeToolSummary = beforeTest.transcript ? extractToolSummary(beforeTest.transcript) : {};
|
|
const beforeToolCount = Object.values(beforeToolSummary).reduce((a, b) => a + b, 0);
|
|
toolCountBefore += beforeToolCount;
|
|
unchanged++;
|
|
deltas.push({
|
|
name: `${name} (removed)`,
|
|
before: {
|
|
passed: beforeTest.passed,
|
|
cost_usd: beforeTest.cost_usd,
|
|
turns_used: beforeTest.turns_used,
|
|
duration_ms: beforeTest.duration_ms,
|
|
detection_rate: beforeTest.detection_rate,
|
|
tool_summary: beforeToolSummary,
|
|
},
|
|
after: { passed: false, cost_usd: 0, tool_summary: {} },
|
|
status_change: 'unchanged',
|
|
});
|
|
}
|
|
|
|
return {
|
|
before_file: beforeFile,
|
|
after_file: afterFile,
|
|
before_branch: before.branch,
|
|
after_branch: after.branch,
|
|
before_timestamp: before.timestamp,
|
|
after_timestamp: after.timestamp,
|
|
deltas,
|
|
total_cost_delta: after.total_cost_usd - before.total_cost_usd,
|
|
total_duration_delta: after.total_duration_ms - before.total_duration_ms,
|
|
improved,
|
|
regressed,
|
|
unchanged,
|
|
tool_count_before: toolCountBefore,
|
|
tool_count_after: toolCountAfter,
|
|
};
|
|
}
|
|
|
|
/**
|
|
* Format a ComparisonResult as a readable string.
|
|
*/
|
|
export function formatComparison(c: ComparisonResult): string {
|
|
const lines: string[] = [];
|
|
const ts = c.before_timestamp ? c.before_timestamp.replace('T', ' ').slice(0, 16) : 'unknown';
|
|
lines.push(`\nvs previous: ${c.before_branch}/${c.deltas.length ? 'eval' : ''} (${ts})`);
|
|
lines.push('─'.repeat(70));
|
|
|
|
// Per-test deltas
|
|
for (const d of c.deltas) {
|
|
const arrow = d.status_change === 'improved' ? '↑' : d.status_change === 'regressed' ? '↓' : '=';
|
|
const beforeStatus = d.before.passed ? 'PASS' : 'FAIL';
|
|
const afterStatus = d.after.passed ? 'PASS' : 'FAIL';
|
|
|
|
// Turns delta
|
|
let turnsDelta = '';
|
|
if (d.before.turns_used !== undefined && d.after.turns_used !== undefined) {
|
|
const td = d.after.turns_used - d.before.turns_used;
|
|
turnsDelta = ` ${d.before.turns_used}→${d.after.turns_used}t`;
|
|
if (td !== 0) turnsDelta += `(${td > 0 ? '+' : ''}${td})`;
|
|
} else if (d.after.turns_used !== undefined) {
|
|
turnsDelta = ` ${d.after.turns_used}t`;
|
|
}
|
|
|
|
// Duration delta
|
|
let durDelta = '';
|
|
if (d.before.duration_ms !== undefined && d.after.duration_ms !== undefined) {
|
|
const bs = Math.round(d.before.duration_ms / 1000);
|
|
const as = Math.round(d.after.duration_ms / 1000);
|
|
const dd = as - bs;
|
|
durDelta = ` ${bs}→${as}s`;
|
|
if (dd !== 0) durDelta += `(${dd > 0 ? '+' : ''}${dd})`;
|
|
} else if (d.after.duration_ms !== undefined) {
|
|
durDelta = ` ${Math.round(d.after.duration_ms / 1000)}s`;
|
|
}
|
|
|
|
let detail = '';
|
|
if (d.before.detection_rate !== undefined || d.after.detection_rate !== undefined) {
|
|
detail = ` ${d.before.detection_rate ?? '?'}→${d.after.detection_rate ?? '?'} det`;
|
|
} else {
|
|
const costBefore = d.before.cost_usd.toFixed(2);
|
|
const costAfter = d.after.cost_usd.toFixed(2);
|
|
detail = ` $${costBefore}→$${costAfter}`;
|
|
}
|
|
|
|
const name = d.name.length > 30 ? d.name.slice(0, 27) + '...' : d.name.padEnd(30);
|
|
lines.push(` ${name} ${beforeStatus.padEnd(5)} → ${afterStatus.padEnd(5)} ${arrow}${detail}${turnsDelta}${durDelta}`);
|
|
}
|
|
|
|
lines.push('─'.repeat(70));
|
|
|
|
// Totals
|
|
const parts: string[] = [];
|
|
if (c.improved > 0) parts.push(`${c.improved} improved`);
|
|
if (c.regressed > 0) parts.push(`${c.regressed} regressed`);
|
|
if (c.unchanged > 0) parts.push(`${c.unchanged} unchanged`);
|
|
lines.push(` Status: ${parts.join(', ')}`);
|
|
|
|
const costSign = c.total_cost_delta >= 0 ? '+' : '';
|
|
lines.push(` Cost: ${costSign}$${c.total_cost_delta.toFixed(2)}`);
|
|
|
|
const durDelta = Math.round(c.total_duration_delta / 1000);
|
|
const durSign = durDelta >= 0 ? '+' : '';
|
|
lines.push(` Duration: ${durSign}${durDelta}s`);
|
|
|
|
const toolDelta = c.tool_count_after - c.tool_count_before;
|
|
const toolSign = toolDelta >= 0 ? '+' : '';
|
|
lines.push(` Tool calls: ${c.tool_count_before} → ${c.tool_count_after} (${toolSign}${toolDelta})`);
|
|
|
|
// Tool breakdown (show tools that changed)
|
|
const allTools = new Set<string>();
|
|
for (const d of c.deltas) {
|
|
for (const t of Object.keys(d.before.tool_summary || {})) allTools.add(t);
|
|
for (const t of Object.keys(d.after.tool_summary || {})) allTools.add(t);
|
|
}
|
|
|
|
if (allTools.size > 0) {
|
|
// Aggregate tool counts across all tests
|
|
const totalBefore: Record<string, number> = {};
|
|
const totalAfter: Record<string, number> = {};
|
|
for (const d of c.deltas) {
|
|
for (const [t, n] of Object.entries(d.before.tool_summary || {})) {
|
|
totalBefore[t] = (totalBefore[t] || 0) + n;
|
|
}
|
|
for (const [t, n] of Object.entries(d.after.tool_summary || {})) {
|
|
totalAfter[t] = (totalAfter[t] || 0) + n;
|
|
}
|
|
}
|
|
|
|
for (const tool of [...allTools].sort()) {
|
|
const b = totalBefore[tool] || 0;
|
|
const a = totalAfter[tool] || 0;
|
|
if (b !== a) {
|
|
const d = a - b;
|
|
lines.push(` ${tool}: ${b} → ${a} (${d >= 0 ? '+' : ''}${d})`);
|
|
}
|
|
}
|
|
}
|
|
|
|
// Commentary — interpret what the deltas mean
|
|
const commentary = generateCommentary(c);
|
|
if (commentary.length > 0) {
|
|
lines.push('');
|
|
lines.push(' Takeaway:');
|
|
for (const line of commentary) {
|
|
lines.push(` ${line}`);
|
|
}
|
|
}
|
|
|
|
return lines.join('\n');
|
|
}
|
|
|
|
/**
|
|
* Generate human-readable commentary interpreting comparison deltas.
|
|
* Pure function — analyzes the numbers and explains what they mean.
|
|
*/
|
|
export function generateCommentary(c: ComparisonResult): string[] {
|
|
const notes: string[] = [];
|
|
|
|
// 1. Regressions are the most important signal — call them out first
|
|
const regressions = c.deltas.filter(d => d.status_change === 'regressed');
|
|
if (regressions.length > 0) {
|
|
for (const d of regressions) {
|
|
notes.push(`REGRESSION: "${d.name}" was passing, now fails. Investigate immediately.`);
|
|
}
|
|
}
|
|
|
|
// 2. Improvements
|
|
const improvements = c.deltas.filter(d => d.status_change === 'improved');
|
|
for (const d of improvements) {
|
|
notes.push(`Fixed: "${d.name}" now passes.`);
|
|
}
|
|
|
|
// 3. Per-test efficiency changes (only for unchanged-status tests — regressions/improvements are already noted)
|
|
const stable = c.deltas.filter(d => d.status_change === 'unchanged' && d.after.passed);
|
|
for (const d of stable) {
|
|
const insights: string[] = [];
|
|
|
|
// Turns
|
|
if (d.before.turns_used !== undefined && d.after.turns_used !== undefined && d.before.turns_used > 0) {
|
|
const turnsDelta = d.after.turns_used - d.before.turns_used;
|
|
const turnsPct = Math.round((turnsDelta / d.before.turns_used) * 100);
|
|
if (Math.abs(turnsPct) >= 20 && Math.abs(turnsDelta) >= 2) {
|
|
if (turnsDelta < 0) {
|
|
insights.push(`${Math.abs(turnsDelta)} fewer turns (${Math.abs(turnsPct)}% more efficient)`);
|
|
} else {
|
|
insights.push(`${turnsDelta} more turns (${turnsPct}% less efficient)`);
|
|
}
|
|
}
|
|
}
|
|
|
|
// Duration
|
|
if (d.before.duration_ms !== undefined && d.after.duration_ms !== undefined && d.before.duration_ms > 0) {
|
|
const durDelta = d.after.duration_ms - d.before.duration_ms;
|
|
const durPct = Math.round((durDelta / d.before.duration_ms) * 100);
|
|
if (Math.abs(durPct) >= 20 && Math.abs(durDelta) >= 5000) {
|
|
if (durDelta < 0) {
|
|
insights.push(`${Math.round(Math.abs(durDelta) / 1000)}s faster`);
|
|
} else {
|
|
insights.push(`${Math.round(durDelta / 1000)}s slower`);
|
|
}
|
|
}
|
|
}
|
|
|
|
// Detection rate
|
|
if (d.before.detection_rate !== undefined && d.after.detection_rate !== undefined) {
|
|
const detDelta = d.after.detection_rate - d.before.detection_rate;
|
|
if (detDelta !== 0) {
|
|
if (detDelta > 0) {
|
|
insights.push(`detecting ${detDelta} more bug${detDelta > 1 ? 's' : ''}`);
|
|
} else {
|
|
insights.push(`detecting ${Math.abs(detDelta)} fewer bug${Math.abs(detDelta) > 1 ? 's' : ''} — check prompt quality`);
|
|
}
|
|
}
|
|
}
|
|
|
|
// Cost
|
|
if (d.before.cost_usd > 0) {
|
|
const costDelta = d.after.cost_usd - d.before.cost_usd;
|
|
const costPct = Math.round((costDelta / d.before.cost_usd) * 100);
|
|
if (Math.abs(costPct) >= 30 && Math.abs(costDelta) >= 0.05) {
|
|
if (costDelta < 0) {
|
|
insights.push(`${Math.abs(costPct)}% cheaper`);
|
|
} else {
|
|
insights.push(`${costPct}% more expensive`);
|
|
}
|
|
}
|
|
}
|
|
|
|
if (insights.length > 0) {
|
|
notes.push(`"${d.name}": ${insights.join(', ')}.`);
|
|
}
|
|
}
|
|
|
|
// 4. Overall summary
|
|
if (c.deltas.length >= 3 && regressions.length === 0) {
|
|
const overallParts: string[] = [];
|
|
|
|
// Total cost
|
|
const totalBefore = c.deltas.reduce((s, d) => s + d.before.cost_usd, 0);
|
|
if (totalBefore > 0) {
|
|
const costPct = Math.round((c.total_cost_delta / totalBefore) * 100);
|
|
if (Math.abs(costPct) >= 10) {
|
|
overallParts.push(`${Math.abs(costPct)}% ${costPct < 0 ? 'cheaper' : 'more expensive'} overall`);
|
|
}
|
|
}
|
|
|
|
// Total duration
|
|
const totalDurBefore = c.deltas.reduce((s, d) => s + (d.before.duration_ms || 0), 0);
|
|
if (totalDurBefore > 0) {
|
|
const durPct = Math.round((c.total_duration_delta / totalDurBefore) * 100);
|
|
if (Math.abs(durPct) >= 10) {
|
|
overallParts.push(`${Math.abs(durPct)}% ${durPct < 0 ? 'faster' : 'slower'}`);
|
|
}
|
|
}
|
|
|
|
// Total turns
|
|
const turnsBefore = c.deltas.reduce((s, d) => s + (d.before.turns_used || 0), 0);
|
|
const turnsAfter = c.deltas.reduce((s, d) => s + (d.after.turns_used || 0), 0);
|
|
if (turnsBefore > 0) {
|
|
const turnsPct = Math.round(((turnsAfter - turnsBefore) / turnsBefore) * 100);
|
|
if (Math.abs(turnsPct) >= 10) {
|
|
overallParts.push(`${Math.abs(turnsPct)}% ${turnsPct < 0 ? 'fewer' : 'more'} turns`);
|
|
}
|
|
}
|
|
|
|
if (overallParts.length > 0) {
|
|
notes.push(`Overall: ${overallParts.join(', ')}. ${regressions.length === 0 ? 'No regressions.' : ''}`);
|
|
} else if (regressions.length === 0) {
|
|
notes.push('Stable run — no significant efficiency changes, no regressions.');
|
|
}
|
|
}
|
|
|
|
return notes;
|
|
}
|
|
|
|
// --- EvalCollector ---
|
|
|
|
function getGitInfo(): { branch: string; sha: string } {
|
|
try {
|
|
const branch = spawnSync('git', ['rev-parse', '--abbrev-ref', 'HEAD'], { stdio: 'pipe', timeout: 5000 });
|
|
const sha = spawnSync('git', ['rev-parse', '--short', 'HEAD'], { stdio: 'pipe', timeout: 5000 });
|
|
return {
|
|
branch: branch.stdout?.toString().trim() || 'unknown',
|
|
sha: sha.stdout?.toString().trim() || 'unknown',
|
|
};
|
|
} catch {
|
|
return { branch: 'unknown', sha: 'unknown' };
|
|
}
|
|
}
|
|
|
|
function getVersion(): string {
|
|
try {
|
|
const pkgPath = path.resolve(__dirname, '..', '..', 'package.json');
|
|
const pkg = JSON.parse(fs.readFileSync(pkgPath, 'utf-8'));
|
|
return pkg.version || 'unknown';
|
|
} catch {
|
|
return 'unknown';
|
|
}
|
|
}
|
|
|
|
export class EvalCollector {
|
|
private tier: 'e2e' | 'llm-judge';
|
|
private tests: EvalTestEntry[] = [];
|
|
private finalized = false;
|
|
private evalDir: string;
|
|
|
|
constructor(tier: 'e2e' | 'llm-judge', evalDir?: string) {
|
|
this.tier = tier;
|
|
this.evalDir = evalDir || DEFAULT_EVAL_DIR;
|
|
}
|
|
|
|
addTest(entry: EvalTestEntry): void {
|
|
this.tests.push(entry);
|
|
this.savePartial();
|
|
}
|
|
|
|
/** Write incremental results after each test. Atomic write, non-fatal. */
|
|
savePartial(): void {
|
|
try {
|
|
const git = getGitInfo();
|
|
const version = getVersion();
|
|
const totalCost = this.tests.reduce((s, t) => s + t.cost_usd, 0);
|
|
const totalDuration = this.tests.reduce((s, t) => s + t.duration_ms, 0);
|
|
const passed = this.tests.filter(t => t.passed).length;
|
|
|
|
const partial: EvalResult = {
|
|
schema_version: SCHEMA_VERSION,
|
|
version,
|
|
branch: git.branch,
|
|
git_sha: git.sha,
|
|
timestamp: new Date().toISOString(),
|
|
hostname: os.hostname(),
|
|
tier: this.tier,
|
|
total_tests: this.tests.length,
|
|
passed,
|
|
failed: this.tests.length - passed,
|
|
total_cost_usd: Math.round(totalCost * 100) / 100,
|
|
total_duration_ms: totalDuration,
|
|
tests: this.tests,
|
|
_partial: true,
|
|
};
|
|
|
|
fs.mkdirSync(this.evalDir, { recursive: true });
|
|
const partialPath = path.join(this.evalDir, '_partial-e2e.json');
|
|
const tmp = partialPath + '.tmp';
|
|
fs.writeFileSync(tmp, JSON.stringify(partial, null, 2) + '\n');
|
|
fs.renameSync(tmp, partialPath);
|
|
} catch { /* non-fatal — partial saves are best-effort */ }
|
|
}
|
|
|
|
async finalize(): Promise<string> {
|
|
if (this.finalized) return '';
|
|
this.finalized = true;
|
|
|
|
const git = getGitInfo();
|
|
const version = getVersion();
|
|
const timestamp = new Date().toISOString();
|
|
const totalCost = this.tests.reduce((s, t) => s + t.cost_usd, 0);
|
|
const totalDuration = this.tests.reduce((s, t) => s + t.duration_ms, 0);
|
|
const passed = this.tests.filter(t => t.passed).length;
|
|
|
|
const result: EvalResult = {
|
|
schema_version: SCHEMA_VERSION,
|
|
version,
|
|
branch: git.branch,
|
|
git_sha: git.sha,
|
|
timestamp,
|
|
hostname: os.hostname(),
|
|
tier: this.tier,
|
|
total_tests: this.tests.length,
|
|
passed,
|
|
failed: this.tests.length - passed,
|
|
total_cost_usd: Math.round(totalCost * 100) / 100,
|
|
total_duration_ms: totalDuration,
|
|
tests: this.tests,
|
|
};
|
|
|
|
// Write eval file
|
|
fs.mkdirSync(this.evalDir, { recursive: true });
|
|
const dateStr = timestamp.replace(/[:.]/g, '').replace('T', '-').slice(0, 15);
|
|
const safeBranch = git.branch.replace(/[^a-zA-Z0-9._-]/g, '-');
|
|
const filename = `${version}-${safeBranch}-${this.tier}-${dateStr}.json`;
|
|
const filepath = path.join(this.evalDir, filename);
|
|
fs.writeFileSync(filepath, JSON.stringify(result, null, 2) + '\n');
|
|
|
|
// Print summary table
|
|
this.printSummary(result, filepath, git);
|
|
|
|
// Auto-compare with previous run
|
|
try {
|
|
const prevFile = findPreviousRun(this.evalDir, this.tier, git.branch, filepath);
|
|
if (prevFile) {
|
|
const prevResult: EvalResult = JSON.parse(fs.readFileSync(prevFile, 'utf-8'));
|
|
const comparison = compareEvalResults(prevResult, result, prevFile, filepath);
|
|
process.stderr.write(formatComparison(comparison) + '\n');
|
|
} else {
|
|
process.stderr.write('\nFirst run — no comparison available.\n');
|
|
}
|
|
} catch (err: any) {
|
|
process.stderr.write(`\nCompare error: ${err.message}\n`);
|
|
}
|
|
|
|
return filepath;
|
|
}
|
|
|
|
private printSummary(result: EvalResult, filepath: string, git: { branch: string; sha: string }): void {
|
|
const lines: string[] = [];
|
|
lines.push('');
|
|
lines.push(`Eval Results — v${result.version} @ ${git.branch} (${git.sha}) — ${this.tier}`);
|
|
lines.push('═'.repeat(70));
|
|
|
|
for (const t of this.tests) {
|
|
const status = t.passed ? ' PASS ' : ' FAIL ';
|
|
const cost = `$${t.cost_usd.toFixed(2)}`;
|
|
const dur = t.duration_ms ? `${Math.round(t.duration_ms / 1000)}s` : '';
|
|
const turns = t.turns_used !== undefined ? `${t.turns_used}t` : '';
|
|
|
|
let detail = '';
|
|
if (t.detection_rate !== undefined) {
|
|
detail = `${t.detection_rate}/${(t.detected_bugs?.length || 0) + (t.missed_bugs?.length || 0)} det`;
|
|
} else if (t.judge_scores) {
|
|
const scores = Object.entries(t.judge_scores).map(([k, v]) => `${k[0]}:${v}`).join(' ');
|
|
detail = scores;
|
|
}
|
|
|
|
const name = t.name.length > 35 ? t.name.slice(0, 32) + '...' : t.name.padEnd(35);
|
|
lines.push(` ${name} ${status} ${cost.padStart(6)} ${turns.padStart(4)} ${dur.padStart(5)} ${detail}`);
|
|
}
|
|
|
|
lines.push('─'.repeat(70));
|
|
const totalCost = `$${result.total_cost_usd.toFixed(2)}`;
|
|
const totalDur = `${Math.round(result.total_duration_ms / 1000)}s`;
|
|
lines.push(` Total: ${result.passed}/${result.total_tests} passed${' '.repeat(20)}${totalCost.padStart(6)} ${totalDur}`);
|
|
lines.push(`Saved: ${filepath}`);
|
|
|
|
process.stderr.write(lines.join('\n') + '\n');
|
|
}
|
|
}
|