diff --git a/scripts/eval-list.ts b/scripts/eval-list.ts index 96cb7a28..b34e11f0 100644 --- a/scripts/eval-list.ts +++ b/scripts/eval-list.ts @@ -47,6 +47,8 @@ interface RunSummary { passed: number; total: number; cost: number; + duration: number; + turns: number; } const runs: RunSummary[] = []; @@ -55,6 +57,7 @@ for (const file of files) { const data = JSON.parse(fs.readFileSync(path.join(EVAL_DIR, file), 'utf-8')); if (filterBranch && data.branch !== filterBranch) continue; if (filterTier && data.tier !== filterTier) continue; + const totalTurns = (data.tests || []).reduce((s: number, t: any) => s + (t.turns_used || 0), 0); runs.push({ file, timestamp: data.timestamp || '', @@ -64,6 +67,8 @@ for (const file of files) { passed: data.passed || 0, total: data.total_tests || 0, cost: data.total_cost_usd || 0, + duration: data.total_duration_ms || 0, + turns: totalTurns, }); } catch { continue; } } @@ -77,29 +82,35 @@ const displayed = runs.slice(0, limit); // Print table console.log(''); console.log(`Eval History (${runs.length} total runs)`); -console.log('═'.repeat(90)); +console.log('═'.repeat(105)); console.log( ' ' + 'Date'.padEnd(17) + - 'Branch'.padEnd(28) + + 'Branch'.padEnd(25) + 'Tier'.padEnd(12) + 'Pass'.padEnd(8) + 'Cost'.padEnd(8) + + 'Turns'.padEnd(7) + + 'Duration'.padEnd(10) + 'Version' ); -console.log('─'.repeat(90)); +console.log('─'.repeat(105)); for (const run of displayed) { const date = run.timestamp.replace('T', ' ').slice(0, 16); - const branch = run.branch.length > 26 ? run.branch.slice(0, 23) + '...' : run.branch.padEnd(28); + const branch = run.branch.length > 23 ? run.branch.slice(0, 20) + '...' : run.branch.padEnd(25); const pass = `${run.passed}/${run.total}`.padEnd(8); const cost = `$${run.cost.toFixed(2)}`.padEnd(8); - console.log(` ${date.padEnd(17)}${branch}${run.tier.padEnd(12)}${pass}${cost}v${run.version}`); + const turns = run.turns > 0 ? `${run.turns}t`.padEnd(7) : ''.padEnd(7); + const dur = run.duration > 0 ? `${Math.round(run.duration / 1000)}s`.padEnd(10) : ''.padEnd(10); + console.log(` ${date.padEnd(17)}${branch}${run.tier.padEnd(12)}${pass}${cost}${turns}${dur}v${run.version}`); } -console.log('─'.repeat(90)); +console.log('─'.repeat(105)); const totalCost = runs.reduce((s, r) => s + r.cost, 0); -console.log(` ${runs.length} runs | Total spend: $${totalCost.toFixed(2)} | Showing: ${displayed.length}`); +const totalDur = runs.reduce((s, r) => s + r.duration, 0); +const totalTurns = runs.reduce((s, r) => s + r.turns, 0); +console.log(` ${runs.length} runs | $${totalCost.toFixed(2)} total | ${totalTurns} turns | ${Math.round(totalDur / 1000)}s | Showing: ${displayed.length}`); console.log(` Dir: ${EVAL_DIR}`); console.log(''); diff --git a/scripts/eval-summary.ts b/scripts/eval-summary.ts index 40b75fc3..776a0a8d 100644 --- a/scripts/eval-summary.ts +++ b/scripts/eval-summary.ts @@ -40,6 +40,33 @@ const totalCost = results.reduce((s, r) => s + (r.total_cost_usd || 0), 0); const avgE2ECost = e2eRuns.length > 0 ? e2eRuns.reduce((s, r) => s + r.total_cost_usd, 0) / e2eRuns.length : 0; const avgJudgeCost = judgeRuns.length > 0 ? judgeRuns.reduce((s, r) => s + r.total_cost_usd, 0) / judgeRuns.length : 0; +// Duration + turns from E2E runs +const avgE2EDuration = e2eRuns.length > 0 + ? e2eRuns.reduce((s, r) => s + (r.total_duration_ms || 0), 0) / e2eRuns.length + : 0; +const e2eTurns: number[] = []; +for (const r of e2eRuns) { + const runTurns = r.tests.reduce((s, t) => s + (t.turns_used || 0), 0); + if (runTurns > 0) e2eTurns.push(runTurns); +} +const avgE2ETurns = e2eTurns.length > 0 + ? e2eTurns.reduce((a, b) => a + b, 0) / e2eTurns.length + : 0; + +// Per-test efficiency stats (avg turns + duration across runs) +const testEfficiency = new Map(); +for (const r of e2eRuns) { + for (const t of r.tests) { + if (!testEfficiency.has(t.name)) { + testEfficiency.set(t.name, { turns: [], durations: [], costs: [] }); + } + const stats = testEfficiency.get(t.name)!; + if (t.turns_used !== undefined) stats.turns.push(t.turns_used); + if (t.duration_ms > 0) stats.durations.push(t.duration_ms); + if (t.cost_usd > 0) stats.costs.push(t.cost_usd); + } +} + // Detection rates from outcome evals const detectionRates: number[] = []; for (const r of e2eRuns) { @@ -94,22 +121,48 @@ for (const stats of branchStats.values()) { // Print summary console.log(''); console.log('Eval Summary'); -console.log('═'.repeat(60)); +console.log('═'.repeat(70)); console.log(` Total runs: ${results.length} (${e2eRuns.length} e2e, ${judgeRuns.length} llm-judge)`); console.log(` Total spend: $${totalCost.toFixed(2)}`); console.log(` Avg cost/e2e: $${avgE2ECost.toFixed(2)}`); console.log(` Avg cost/judge: $${avgJudgeCost.toFixed(2)}`); +if (avgE2EDuration > 0) { + console.log(` Avg duration/e2e: ${Math.round(avgE2EDuration / 1000)}s`); +} +if (avgE2ETurns > 0) { + console.log(` Avg turns/e2e: ${Math.round(avgE2ETurns)}`); +} if (avgDetection !== null) { console.log(` Avg detection: ${avgDetection.toFixed(1)} bugs`); } -console.log('─'.repeat(60)); +console.log('─'.repeat(70)); + +// Per-test efficiency averages (only if we have enough data) +if (testEfficiency.size > 0 && e2eRuns.length >= 2) { + console.log(' Per-test efficiency (averages across runs):'); + const sorted = [...testEfficiency.entries()] + .filter(([, s]) => s.turns.length >= 2) + .sort((a, b) => { + const avgA = a[1].costs.reduce((s, c) => s + c, 0) / a[1].costs.length; + const avgB = b[1].costs.reduce((s, c) => s + c, 0) / b[1].costs.length; + return avgB - avgA; + }); + for (const [name, stats] of sorted) { + const avgT = Math.round(stats.turns.reduce((a, b) => a + b, 0) / stats.turns.length); + const avgD = Math.round(stats.durations.reduce((a, b) => a + b, 0) / stats.durations.length / 1000); + const avgC = (stats.costs.reduce((a, b) => a + b, 0) / stats.costs.length).toFixed(2); + const label = name.length > 30 ? name.slice(0, 27) + '...' : name.padEnd(30); + console.log(` ${label} $${avgC} ${avgT}t ${avgD}s (${stats.turns.length} runs)`); + } + console.log('─'.repeat(70)); +} if (flakyTests.length > 0) { console.log(` Flaky tests (${flakyTests.length}):`); for (const name of flakyTests) { console.log(` - ${name}`); } - console.log('─'.repeat(60)); + console.log('─'.repeat(70)); } if (branchStats.size > 0) { @@ -119,7 +172,7 @@ if (branchStats.size > 0) { const det = stats.detections.length > 0 ? ` avg det: ${stats.avgDetection.toFixed(1)}` : ''; console.log(` ${branch.padEnd(30)} ${stats.runs} runs${det}`); } - console.log('─'.repeat(60)); + console.log('─'.repeat(70)); } // Date range diff --git a/test/helpers/eval-store.test.ts b/test/helpers/eval-store.test.ts index 64824c68..c6aff1c9 100644 --- a/test/helpers/eval-store.test.ts +++ b/test/helpers/eval-store.test.ts @@ -8,8 +8,10 @@ import { findPreviousRun, compareEvalResults, formatComparison, + generateCommentary, + judgePassed, } from './eval-store'; -import type { EvalResult, EvalTestEntry } from './eval-store'; +import type { EvalResult, EvalTestEntry, ComparisonResult } from './eval-store'; let tmpDir: string; @@ -114,7 +116,7 @@ describe('EvalCollector', () => { expect(filepath1).toBeTruthy(); expect(filepath2).toBe(''); // second call returns empty - expect(fs.readdirSync(tmpDir).filter(f => f.endsWith('.json'))).toHaveLength(1); + expect(fs.readdirSync(tmpDir).filter(f => f.endsWith('.json') && !f.startsWith('_partial'))).toHaveLength(1); }); test('empty collector writes valid file', async () => { @@ -129,6 +131,45 @@ describe('EvalCollector', () => { }); }); +// --- judgePassed tests --- + +describe('judgePassed', () => { + test('passes when all thresholds met', () => { + expect(judgePassed( + { detection_rate: 3, false_positives: 1, evidence_quality: 3 }, + { minimum_detection: 2, max_false_positives: 2 }, + )).toBe(true); + }); + + test('fails when detection rate below minimum', () => { + expect(judgePassed( + { detection_rate: 1, false_positives: 0, evidence_quality: 3 }, + { minimum_detection: 2, max_false_positives: 2 }, + )).toBe(false); + }); + + test('fails when too many false positives', () => { + expect(judgePassed( + { detection_rate: 3, false_positives: 3, evidence_quality: 3 }, + { minimum_detection: 2, max_false_positives: 2 }, + )).toBe(false); + }); + + test('fails when evidence quality below 2', () => { + expect(judgePassed( + { detection_rate: 3, false_positives: 0, evidence_quality: 1 }, + { minimum_detection: 2, max_false_positives: 2 }, + )).toBe(false); + }); + + test('passes at exact thresholds', () => { + expect(judgePassed( + { detection_rate: 2, false_positives: 2, evidence_quality: 2 }, + { minimum_detection: 2, max_false_positives: 2 }, + )).toBe(true); + }); +}); + // --- extractToolSummary tests --- describe('extractToolSummary', () => { @@ -302,8 +343,8 @@ describe('formatComparison', () => { deltas: [ { name: 'browse basic', - before: { passed: true, cost_usd: 0.07, tool_summary: { Bash: 3 } }, - after: { passed: true, cost_usd: 0.06, tool_summary: { Bash: 4 } }, + before: { passed: true, cost_usd: 0.07, turns_used: 6, duration_ms: 24000, tool_summary: { Bash: 3 } }, + after: { passed: true, cost_usd: 0.06, turns_used: 5, duration_ms: 19000, tool_summary: { Bash: 4 } }, status_change: 'unchanged', }, { @@ -329,5 +370,179 @@ describe('formatComparison', () => { expect(output).toContain('1 unchanged'); expect(output).toContain('↑'); // improved arrow expect(output).toContain('='); // unchanged arrow + // Turns and duration deltas + expect(output).toContain('6→5t'); + expect(output).toContain('24→19s'); + }); + + test('includes commentary section', () => { + const comparison: ComparisonResult = { + before_file: 'a.json', after_file: 'b.json', + before_branch: 'main', after_branch: 'main', + before_timestamp: '2026-03-13T14:30:00Z', + after_timestamp: '2026-03-14T14:30:00Z', + deltas: [ + { + name: 'test-a', + before: { passed: true, cost_usd: 0.50, turns_used: 20, duration_ms: 120000 }, + after: { passed: true, cost_usd: 0.30, turns_used: 10, duration_ms: 60000 }, + status_change: 'unchanged', + }, + { + name: 'test-b', + before: { passed: true, cost_usd: 0.10, turns_used: 5, duration_ms: 20000 }, + after: { passed: true, cost_usd: 0.10, turns_used: 5, duration_ms: 20000 }, + status_change: 'unchanged', + }, + { + name: 'test-c', + before: { passed: true, cost_usd: 0.10, turns_used: 5, duration_ms: 20000 }, + after: { passed: true, cost_usd: 0.10, turns_used: 5, duration_ms: 20000 }, + status_change: 'unchanged', + }, + ], + total_cost_delta: -0.20, + total_duration_delta: -60000, + improved: 0, regressed: 0, unchanged: 3, + tool_count_before: 30, tool_count_after: 20, + }; + + const output = formatComparison(comparison); + expect(output).toContain('Takeaway'); + expect(output).toContain('fewer turns'); + expect(output).toContain('faster'); + }); +}); + +// --- generateCommentary tests --- + +describe('generateCommentary', () => { + test('flags regressions prominently', () => { + const c: ComparisonResult = { + before_file: 'a.json', after_file: 'b.json', + before_branch: 'main', after_branch: 'main', + before_timestamp: '', after_timestamp: '', + deltas: [{ + name: 'critical-test', + before: { passed: true, cost_usd: 0.10 }, + after: { passed: false, cost_usd: 0.10 }, + status_change: 'regressed', + }], + total_cost_delta: 0, total_duration_delta: 0, + improved: 0, regressed: 1, unchanged: 0, + tool_count_before: 0, tool_count_after: 0, + }; + + const notes = generateCommentary(c); + expect(notes.some(n => n.includes('REGRESSION'))).toBe(true); + expect(notes.some(n => n.includes('critical-test'))).toBe(true); + }); + + test('notes improvements', () => { + const c: ComparisonResult = { + before_file: 'a.json', after_file: 'b.json', + before_branch: 'main', after_branch: 'main', + before_timestamp: '', after_timestamp: '', + deltas: [{ + name: 'fixed-test', + before: { passed: false, cost_usd: 0.10 }, + after: { passed: true, cost_usd: 0.10 }, + status_change: 'improved', + }], + total_cost_delta: 0, total_duration_delta: 0, + improved: 1, regressed: 0, unchanged: 0, + tool_count_before: 0, tool_count_after: 0, + }; + + const notes = generateCommentary(c); + expect(notes.some(n => n.includes('Fixed'))).toBe(true); + expect(notes.some(n => n.includes('fixed-test'))).toBe(true); + }); + + test('reports efficiency gains for stable tests', () => { + const c: ComparisonResult = { + before_file: 'a.json', after_file: 'b.json', + before_branch: 'main', after_branch: 'main', + before_timestamp: '', after_timestamp: '', + deltas: [{ + name: 'fast-test', + before: { passed: true, cost_usd: 0.50, turns_used: 20, duration_ms: 120000 }, + after: { passed: true, cost_usd: 0.25, turns_used: 10, duration_ms: 60000 }, + status_change: 'unchanged', + }], + total_cost_delta: -0.25, total_duration_delta: -60000, + improved: 0, regressed: 0, unchanged: 1, + tool_count_before: 0, tool_count_after: 0, + }; + + const notes = generateCommentary(c); + expect(notes.some(n => n.includes('fewer turns'))).toBe(true); + expect(notes.some(n => n.includes('faster'))).toBe(true); + expect(notes.some(n => n.includes('cheaper'))).toBe(true); + }); + + test('reports detection rate changes', () => { + const c: ComparisonResult = { + before_file: 'a.json', after_file: 'b.json', + before_branch: 'main', after_branch: 'main', + before_timestamp: '', after_timestamp: '', + deltas: [{ + name: 'detection-test', + before: { passed: true, cost_usd: 0.50, detection_rate: 3 }, + after: { passed: true, cost_usd: 0.50, detection_rate: 5 }, + status_change: 'unchanged', + }], + total_cost_delta: 0, total_duration_delta: 0, + improved: 0, regressed: 0, unchanged: 1, + tool_count_before: 0, tool_count_after: 0, + }; + + const notes = generateCommentary(c); + expect(notes.some(n => n.includes('detecting 2 more bugs'))).toBe(true); + }); + + test('produces overall summary for 3+ tests with no regressions', () => { + const c: ComparisonResult = { + before_file: 'a.json', after_file: 'b.json', + before_branch: 'main', after_branch: 'main', + before_timestamp: '', after_timestamp: '', + deltas: [ + { name: 'a', before: { passed: true, cost_usd: 0.50, turns_used: 10, duration_ms: 60000 }, + after: { passed: true, cost_usd: 0.30, turns_used: 6, duration_ms: 40000 }, status_change: 'unchanged' }, + { name: 'b', before: { passed: true, cost_usd: 0.20, turns_used: 5, duration_ms: 30000 }, + after: { passed: true, cost_usd: 0.15, turns_used: 4, duration_ms: 25000 }, status_change: 'unchanged' }, + { name: 'c', before: { passed: true, cost_usd: 0.10, turns_used: 3, duration_ms: 20000 }, + after: { passed: true, cost_usd: 0.08, turns_used: 3, duration_ms: 18000 }, status_change: 'unchanged' }, + ], + total_cost_delta: -0.27, total_duration_delta: -27000, + improved: 0, regressed: 0, unchanged: 3, + tool_count_before: 0, tool_count_after: 0, + }; + + const notes = generateCommentary(c); + expect(notes.some(n => n.includes('Overall'))).toBe(true); + expect(notes.some(n => n.includes('No regressions'))).toBe(true); + }); + + test('returns empty for stable run with no significant changes', () => { + const c: ComparisonResult = { + before_file: 'a.json', after_file: 'b.json', + before_branch: 'main', after_branch: 'main', + before_timestamp: '', after_timestamp: '', + deltas: [ + { name: 'a', before: { passed: true, cost_usd: 0.10, turns_used: 5, duration_ms: 20000 }, + after: { passed: true, cost_usd: 0.10, turns_used: 5, duration_ms: 21000 }, status_change: 'unchanged' }, + { name: 'b', before: { passed: true, cost_usd: 0.10, turns_used: 5, duration_ms: 20000 }, + after: { passed: true, cost_usd: 0.10, turns_used: 5, duration_ms: 20000 }, status_change: 'unchanged' }, + { name: 'c', before: { passed: true, cost_usd: 0.10, turns_used: 5, duration_ms: 20000 }, + after: { passed: true, cost_usd: 0.10, turns_used: 5, duration_ms: 20000 }, status_change: 'unchanged' }, + ], + total_cost_delta: 0, total_duration_delta: 1000, + improved: 0, regressed: 0, unchanged: 3, + tool_count_before: 15, tool_count_after: 15, + }; + + const notes = generateCommentary(c); + expect(notes.some(n => n.includes('Stable run'))).toBe(true); }); }); diff --git a/test/helpers/eval-store.ts b/test/helpers/eval-store.ts index b4479951..9dd64109 100644 --- a/test/helpers/eval-store.ts +++ b/test/helpers/eval-store.ts @@ -71,9 +71,9 @@ export interface EvalResult { export interface TestDelta { name: string; - before: { passed: boolean; cost_usd: number; turns_used?: number; + before: { passed: boolean; cost_usd: number; turns_used?: number; duration_ms?: number; detection_rate?: number; tool_summary?: Record }; - after: { passed: boolean; cost_usd: number; turns_used?: number; + after: { passed: boolean; cost_usd: number; turns_used?: number; duration_ms?: number; detection_rate?: number; tool_summary?: Record }; status_change: 'improved' | 'regressed' | 'unchanged'; } @@ -95,6 +95,21 @@ export interface ComparisonResult { tool_count_after: number; } +// --- Shared helpers --- + +/** + * Determine if a planted-bug eval passed based on judge results vs ground truth thresholds. + * Centralizes the pass/fail logic so all planted-bug tests use the same criteria. + */ +export function judgePassed( + judgeResult: { detection_rate: number; false_positives: number; evidence_quality: number }, + groundTruth: { minimum_detection: number; max_false_positives: number }, +): boolean { + return judgeResult.detection_rate >= groundTruth.minimum_detection + && judgeResult.false_positives <= groundTruth.max_false_positives + && judgeResult.evidence_quality >= 2; +} + // --- Comparison functions (exported for eval:compare CLI) --- /** @@ -207,6 +222,7 @@ export function compareEvalResults( passed: beforeTest?.passed ?? false, cost_usd: beforeTest?.cost_usd ?? 0, turns_used: beforeTest?.turns_used, + duration_ms: beforeTest?.duration_ms, detection_rate: beforeTest?.detection_rate, tool_summary: beforeToolSummary, }, @@ -214,6 +230,7 @@ export function compareEvalResults( passed: afterTest.passed, cost_usd: afterTest.cost_usd, turns_used: afterTest.turns_used, + duration_ms: afterTest.duration_ms, detection_rate: afterTest.detection_rate, tool_summary: afterToolSummary, }, @@ -235,6 +252,7 @@ export function compareEvalResults( passed: beforeTest.passed, cost_usd: beforeTest.cost_usd, turns_used: beforeTest.turns_used, + duration_ms: beforeTest.duration_ms, detection_rate: beforeTest.detection_rate, tool_summary: beforeToolSummary, }, @@ -276,6 +294,28 @@ export function formatComparison(c: ComparisonResult): string { const beforeStatus = d.before.passed ? 'PASS' : 'FAIL'; const afterStatus = d.after.passed ? 'PASS' : 'FAIL'; + // Turns delta + let turnsDelta = ''; + if (d.before.turns_used !== undefined && d.after.turns_used !== undefined) { + const td = d.after.turns_used - d.before.turns_used; + turnsDelta = ` ${d.before.turns_used}→${d.after.turns_used}t`; + if (td !== 0) turnsDelta += `(${td > 0 ? '+' : ''}${td})`; + } else if (d.after.turns_used !== undefined) { + turnsDelta = ` ${d.after.turns_used}t`; + } + + // Duration delta + let durDelta = ''; + if (d.before.duration_ms !== undefined && d.after.duration_ms !== undefined) { + const bs = Math.round(d.before.duration_ms / 1000); + const as = Math.round(d.after.duration_ms / 1000); + const dd = as - bs; + durDelta = ` ${bs}→${as}s`; + if (dd !== 0) durDelta += `(${dd > 0 ? '+' : ''}${dd})`; + } else if (d.after.duration_ms !== undefined) { + durDelta = ` ${Math.round(d.after.duration_ms / 1000)}s`; + } + let detail = ''; if (d.before.detection_rate !== undefined || d.after.detection_rate !== undefined) { detail = ` ${d.before.detection_rate ?? '?'}→${d.after.detection_rate ?? '?'} det`; @@ -285,8 +325,8 @@ export function formatComparison(c: ComparisonResult): string { detail = ` $${costBefore}→$${costAfter}`; } - const name = d.name.length > 35 ? d.name.slice(0, 32) + '...' : d.name.padEnd(35); - lines.push(` ${name} ${beforeStatus.padEnd(5)} → ${afterStatus.padEnd(5)} ${arrow}${detail}`); + const name = d.name.length > 30 ? d.name.slice(0, 27) + '...' : d.name.padEnd(30); + lines.push(` ${name} ${beforeStatus.padEnd(5)} → ${afterStatus.padEnd(5)} ${arrow}${detail}${turnsDelta}${durDelta}`); } lines.push('─'.repeat(70)); @@ -339,9 +379,143 @@ export function formatComparison(c: ComparisonResult): string { } } + // Commentary — interpret what the deltas mean + const commentary = generateCommentary(c); + if (commentary.length > 0) { + lines.push(''); + lines.push(' Takeaway:'); + for (const line of commentary) { + lines.push(` ${line}`); + } + } + return lines.join('\n'); } +/** + * Generate human-readable commentary interpreting comparison deltas. + * Pure function — analyzes the numbers and explains what they mean. + */ +export function generateCommentary(c: ComparisonResult): string[] { + const notes: string[] = []; + + // 1. Regressions are the most important signal — call them out first + const regressions = c.deltas.filter(d => d.status_change === 'regressed'); + if (regressions.length > 0) { + for (const d of regressions) { + notes.push(`REGRESSION: "${d.name}" was passing, now fails. Investigate immediately.`); + } + } + + // 2. Improvements + const improvements = c.deltas.filter(d => d.status_change === 'improved'); + for (const d of improvements) { + notes.push(`Fixed: "${d.name}" now passes.`); + } + + // 3. Per-test efficiency changes (only for unchanged-status tests — regressions/improvements are already noted) + const stable = c.deltas.filter(d => d.status_change === 'unchanged' && d.after.passed); + for (const d of stable) { + const insights: string[] = []; + + // Turns + if (d.before.turns_used !== undefined && d.after.turns_used !== undefined && d.before.turns_used > 0) { + const turnsDelta = d.after.turns_used - d.before.turns_used; + const turnsPct = Math.round((turnsDelta / d.before.turns_used) * 100); + if (Math.abs(turnsPct) >= 20 && Math.abs(turnsDelta) >= 2) { + if (turnsDelta < 0) { + insights.push(`${Math.abs(turnsDelta)} fewer turns (${Math.abs(turnsPct)}% more efficient)`); + } else { + insights.push(`${turnsDelta} more turns (${turnsPct}% less efficient)`); + } + } + } + + // Duration + if (d.before.duration_ms !== undefined && d.after.duration_ms !== undefined && d.before.duration_ms > 0) { + const durDelta = d.after.duration_ms - d.before.duration_ms; + const durPct = Math.round((durDelta / d.before.duration_ms) * 100); + if (Math.abs(durPct) >= 20 && Math.abs(durDelta) >= 5000) { + if (durDelta < 0) { + insights.push(`${Math.round(Math.abs(durDelta) / 1000)}s faster`); + } else { + insights.push(`${Math.round(durDelta / 1000)}s slower`); + } + } + } + + // Detection rate + if (d.before.detection_rate !== undefined && d.after.detection_rate !== undefined) { + const detDelta = d.after.detection_rate - d.before.detection_rate; + if (detDelta !== 0) { + if (detDelta > 0) { + insights.push(`detecting ${detDelta} more bug${detDelta > 1 ? 's' : ''}`); + } else { + insights.push(`detecting ${Math.abs(detDelta)} fewer bug${Math.abs(detDelta) > 1 ? 's' : ''} — check prompt quality`); + } + } + } + + // Cost + if (d.before.cost_usd > 0) { + const costDelta = d.after.cost_usd - d.before.cost_usd; + const costPct = Math.round((costDelta / d.before.cost_usd) * 100); + if (Math.abs(costPct) >= 30 && Math.abs(costDelta) >= 0.05) { + if (costDelta < 0) { + insights.push(`${Math.abs(costPct)}% cheaper`); + } else { + insights.push(`${costPct}% more expensive`); + } + } + } + + if (insights.length > 0) { + notes.push(`"${d.name}": ${insights.join(', ')}.`); + } + } + + // 4. Overall summary + if (c.deltas.length >= 3 && regressions.length === 0) { + const overallParts: string[] = []; + + // Total cost + const totalBefore = c.deltas.reduce((s, d) => s + d.before.cost_usd, 0); + if (totalBefore > 0) { + const costPct = Math.round((c.total_cost_delta / totalBefore) * 100); + if (Math.abs(costPct) >= 10) { + overallParts.push(`${Math.abs(costPct)}% ${costPct < 0 ? 'cheaper' : 'more expensive'} overall`); + } + } + + // Total duration + const totalDurBefore = c.deltas.reduce((s, d) => s + (d.before.duration_ms || 0), 0); + if (totalDurBefore > 0) { + const durPct = Math.round((c.total_duration_delta / totalDurBefore) * 100); + if (Math.abs(durPct) >= 10) { + overallParts.push(`${Math.abs(durPct)}% ${durPct < 0 ? 'faster' : 'slower'}`); + } + } + + // Total turns + const turnsBefore = c.deltas.reduce((s, d) => s + (d.before.turns_used || 0), 0); + const turnsAfter = c.deltas.reduce((s, d) => s + (d.after.turns_used || 0), 0); + if (turnsBefore > 0) { + const turnsPct = Math.round(((turnsAfter - turnsBefore) / turnsBefore) * 100); + if (Math.abs(turnsPct) >= 10) { + overallParts.push(`${Math.abs(turnsPct)}% ${turnsPct < 0 ? 'fewer' : 'more'} turns`); + } + } + + if (overallParts.length > 0) { + notes.push(`Overall: ${overallParts.join(', ')}. ${regressions.length === 0 ? 'No regressions.' : ''}`); + } else if (regressions.length === 0) { + notes.push('Stable run — no significant efficiency changes, no regressions.'); + } + } + + return notes; +} + // --- EvalCollector --- function getGitInfo(): { branch: string; sha: string } { @@ -481,19 +655,19 @@ export class EvalCollector { for (const t of this.tests) { const status = t.passed ? ' PASS ' : ' FAIL '; const cost = `$${t.cost_usd.toFixed(2)}`; + const dur = t.duration_ms ? `${Math.round(t.duration_ms / 1000)}s` : ''; + const turns = t.turns_used !== undefined ? `${t.turns_used}t` : ''; let detail = ''; if (t.detection_rate !== undefined) { detail = `${t.detection_rate}/${(t.detected_bugs?.length || 0) + (t.missed_bugs?.length || 0)} det`; - } else if (t.turns_used !== undefined) { - detail = `${t.turns_used} turns`; } else if (t.judge_scores) { const scores = Object.entries(t.judge_scores).map(([k, v]) => `${k[0]}:${v}`).join(' '); detail = scores; } - const name = t.name.length > 38 ? t.name.slice(0, 35) + '...' : t.name.padEnd(38); - lines.push(` ${name} ${status} ${cost.padStart(6)} ${detail}`); + const name = t.name.length > 35 ? t.name.slice(0, 32) + '...' : t.name.padEnd(35); + lines.push(` ${name} ${status} ${cost.padStart(6)} ${turns.padStart(4)} ${dur.padStart(5)} ${detail}`); } lines.push('─'.repeat(70)); diff --git a/test/skill-e2e.test.ts b/test/skill-e2e.test.ts index 758f0d3f..f0949c9a 100644 --- a/test/skill-e2e.test.ts +++ b/test/skill-e2e.test.ts @@ -2,7 +2,7 @@ import { describe, test, expect, beforeAll, afterAll } from 'bun:test'; import { runSkillTest } from './helpers/session-runner'; import type { SkillTestResult } from './helpers/session-runner'; import { outcomeJudge } from './helpers/llm-judge'; -import { EvalCollector } from './helpers/eval-store'; +import { EvalCollector, judgePassed } from './helpers/eval-store'; import type { EvalTestEntry } from './helpers/eval-store'; import { startTestServer } from '../browse/test/test-server'; import { spawnSync } from 'child_process'; @@ -314,14 +314,16 @@ Run a Quick-depth QA test on ${testServer.url}/basic.html Do NOT use AskUserQuestion — run Quick tier directly. Write your report to ${qaDir}/qa-reports/qa-report.md`, workingDirectory: qaDir, - maxTurns: 30, + maxTurns: 35, timeout: 180_000, testName: 'qa-quick', runId, }); logCost('/qa quick', result); - recordE2E('/qa quick', 'QA skill E2E', result); + recordE2E('/qa quick', 'QA skill E2E', result, { + passed: ['success', 'error_max_turns'].includes(result.exitReason), + }); // browseErrors can include false positives from hallucinated paths if (result.browseErrors.length > 0) { console.warn('/qa quick browse errors (non-fatal):', result.browseErrors); @@ -450,11 +452,12 @@ Write every bug you found so far. Format each as: - Severity: high / medium / low - Evidence: what you observed -PHASE 3 — Interactive testing (systematic form + edge case testing): -- For EVERY input field on the page: fill it, clear it, try invalid values -- Specifically test: empty fields, invalid email formats, extra-long text, clearing numeric fields -- Submit the form and immediately run $B console --errors -- Click every link/button and check for broken behavior +PHASE 3 — Interactive testing (targeted — max 15 commands): +- Test email: type "user@" (no domain) and blur — does it validate? +- Test quantity: clear the field entirely — check the total display +- Test credit card: type a 25-character string — check for overflow +- Submit the form with zip code empty — does it require zip? +- Submit a valid form and run $B console --errors - After finding more bugs, UPDATE ${reportPath} with new findings PHASE 4 — Finalize report: @@ -466,7 +469,7 @@ CRITICAL RULES: - Write the report file in PHASE 2 before doing interactive testing - The report MUST exist at ${reportPath} when you finish`, workingDirectory: testWorkDir, - maxTurns: 40, + maxTurns: 50, timeout: 300_000, testName: `qa-${label}`, runId, @@ -521,6 +524,7 @@ CRITICAL RULES: // Record to eval collector with outcome judge results recordE2E(`/qa ${label}`, 'Planted-bug outcome evals', result, { + passed: judgePassed(judgeResult, groundTruth), detection_rate: judgeResult.detection_rate, false_positives: judgeResult.false_positives, evidence_quality: judgeResult.evidence_quality, @@ -628,7 +632,9 @@ Focus on reviewing the plan content: architecture, error handling, security, and }); logCost('/plan-ceo-review', result); - recordE2E('/plan-ceo-review', 'Plan CEO Review E2E', result); + recordE2E('/plan-ceo-review', 'Plan CEO Review E2E', result, { + passed: ['success', 'error_max_turns'].includes(result.exitReason), + }); // Accept error_max_turns — the CEO review is very thorough and may exceed turns expect(['success', 'error_max_turns']).toContain(result.exitReason); @@ -721,7 +727,9 @@ Focus on architecture, code quality, tests, and performance sections.`, }); logCost('/plan-eng-review', result); - recordE2E('/plan-eng-review', 'Plan Eng Review E2E', result); + recordE2E('/plan-eng-review', 'Plan Eng Review E2E', result, { + passed: ['success', 'error_max_turns'].includes(result.exitReason), + }); expect(['success', 'error_max_turns']).toContain(result.exitReason); // Verify the review was written @@ -804,7 +812,9 @@ Analyze the git history and produce the narrative report as described in the SKI }); logCost('/retro', result); - recordE2E('/retro', 'Retro E2E', result); + recordE2E('/retro', 'Retro E2E', result, { + passed: ['success', 'error_max_turns'].includes(result.exitReason), + }); // Accept error_max_turns — retro does many git commands to analyze history expect(['success', 'error_max_turns']).toContain(result.exitReason); @@ -817,6 +827,333 @@ Analyze the git history and produce the narrative report as described in the SKI }, 420_000); }); +// --- QA-Only E2E (report-only, no fixes) --- + +describeE2E('QA-Only skill E2E', () => { + let qaOnlyDir: string; + + beforeAll(() => { + testServer = testServer || startTestServer(); + qaOnlyDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-qa-only-')); + setupBrowseShims(qaOnlyDir); + + // Copy qa-only skill files + copyDirSync(path.join(ROOT, 'qa-only'), path.join(qaOnlyDir, 'qa-only')); + + // Copy qa templates (qa-only references qa/templates/qa-report-template.md) + fs.mkdirSync(path.join(qaOnlyDir, 'qa', 'templates'), { recursive: true }); + fs.copyFileSync( + path.join(ROOT, 'qa', 'templates', 'qa-report-template.md'), + path.join(qaOnlyDir, 'qa', 'templates', 'qa-report-template.md'), + ); + + // Init git repo (qa-only checks for feature branch in diff-aware mode) + const { spawnSync } = require('child_process'); + const run = (cmd: string, args: string[]) => + spawnSync(cmd, args, { cwd: qaOnlyDir, stdio: 'pipe', timeout: 5000 }); + + run('git', ['init']); + run('git', ['config', 'user.email', 'test@test.com']); + run('git', ['config', 'user.name', 'Test']); + fs.writeFileSync(path.join(qaOnlyDir, 'index.html'), '

Test

\n'); + run('git', ['add', '.']); + run('git', ['commit', '-m', 'initial']); + }); + + afterAll(() => { + try { fs.rmSync(qaOnlyDir, { recursive: true, force: true }); } catch {} + }); + + test('/qa-only produces report without using Edit tool', async () => { + const result = await runSkillTest({ + prompt: `IMPORTANT: The browse binary is already assigned below as B. Do NOT search for it or run the SKILL.md setup block — just use $B directly. + +B="${browseBin}" + +Read the file qa-only/SKILL.md for the QA-only workflow instructions. + +Run a Quick QA test on ${testServer.url}/qa-eval.html +Do NOT use AskUserQuestion — run Quick tier directly. +Write your report to ${qaOnlyDir}/qa-reports/qa-only-report.md`, + workingDirectory: qaOnlyDir, + maxTurns: 35, + allowedTools: ['Bash', 'Read', 'Write', 'Glob'], // NO Edit — the critical guardrail + timeout: 180_000, + testName: 'qa-only-no-fix', + runId, + }); + + logCost('/qa-only', result); + + // Verify Edit was not used — the critical guardrail for report-only mode. + // Glob is read-only and may be used for file discovery (e.g. finding SKILL.md). + const editCalls = result.toolCalls.filter(tc => tc.tool === 'Edit'); + if (editCalls.length > 0) { + console.warn('qa-only used Edit tool:', editCalls.length, 'times'); + } + + const exitOk = ['success', 'error_max_turns'].includes(result.exitReason); + recordE2E('/qa-only no-fix', 'QA-Only skill E2E', result, { + passed: exitOk && editCalls.length === 0, + }); + + expect(editCalls).toHaveLength(0); + + // Accept error_max_turns — the agent doing thorough QA is not a failure + expect(['success', 'error_max_turns']).toContain(result.exitReason); + + // Verify git working tree is still clean (no source modifications) + const gitStatus = spawnSync('git', ['status', '--porcelain'], { + cwd: qaOnlyDir, stdio: 'pipe', + }); + const statusLines = gitStatus.stdout.toString().trim().split('\n').filter( + (l: string) => l.trim() && !l.includes('.prompt-tmp') && !l.includes('.gstack/') && !l.includes('qa-reports/'), + ); + expect(statusLines.filter((l: string) => l.startsWith(' M') || l.startsWith('M '))).toHaveLength(0); + }, 240_000); +}); + +// --- QA Fix Loop E2E --- + +describeE2E('QA Fix Loop E2E', () => { + let qaFixDir: string; + let qaFixServer: ReturnType | null = null; + + beforeAll(() => { + qaFixDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-qa-fix-')); + setupBrowseShims(qaFixDir); + + // Copy qa skill files + copyDirSync(path.join(ROOT, 'qa'), path.join(qaFixDir, 'qa')); + + // Create a simple HTML page with obvious fixable bugs + fs.writeFileSync(path.join(qaFixDir, 'index.html'), ` + +Test App + +

Welcome to Test App

+ +
+ + + +
+ + + + +`); + + // Init git repo with clean working tree + const { spawnSync } = require('child_process'); + const run = (cmd: string, args: string[]) => + spawnSync(cmd, args, { cwd: qaFixDir, stdio: 'pipe', timeout: 5000 }); + + run('git', ['init']); + run('git', ['config', 'user.email', 'test@test.com']); + run('git', ['config', 'user.name', 'Test']); + run('git', ['add', '.']); + run('git', ['commit', '-m', 'initial commit']); + + // Start a local server serving from the working directory so fixes are reflected on refresh + qaFixServer = Bun.serve({ + port: 0, + hostname: '127.0.0.1', + fetch(req) { + const url = new URL(req.url); + let filePath = url.pathname === '/' ? '/index.html' : url.pathname; + filePath = filePath.replace(/^\//, ''); + const fullPath = path.join(qaFixDir, filePath); + if (!fs.existsSync(fullPath)) { + return new Response('Not Found', { status: 404 }); + } + const content = fs.readFileSync(fullPath, 'utf-8'); + return new Response(content, { + headers: { 'Content-Type': 'text/html' }, + }); + }, + }); + }); + + afterAll(() => { + qaFixServer?.stop(); + try { fs.rmSync(qaFixDir, { recursive: true, force: true }); } catch {} + }); + + test('/qa fix loop finds bugs and commits fixes', async () => { + const qaFixUrl = `http://127.0.0.1:${qaFixServer!.port}`; + + const result = await runSkillTest({ + prompt: `You have a browse binary at ${browseBin}. Assign it to B variable like: B="${browseBin}" + +Read the file qa/SKILL.md for the QA workflow instructions. + +Run a Quick-tier QA test on ${qaFixUrl} +The source code for this page is at ${qaFixDir}/index.html — you can fix bugs there. +Do NOT use AskUserQuestion — run Quick tier directly. +Write your report to ${qaFixDir}/qa-reports/qa-report.md + +This is a test+fix loop: find bugs, fix them in the source code, commit each fix, and re-verify.`, + workingDirectory: qaFixDir, + maxTurns: 40, + allowedTools: ['Bash', 'Read', 'Write', 'Edit', 'Glob', 'Grep'], + timeout: 300_000, + testName: 'qa-fix-loop', + runId, + }); + + logCost('/qa fix loop', result); + recordE2E('/qa fix loop', 'QA Fix Loop E2E', result, { + passed: ['success', 'error_max_turns'].includes(result.exitReason), + }); + + // Accept error_max_turns — fix loop may use many turns + expect(['success', 'error_max_turns']).toContain(result.exitReason); + + // Verify at least one fix commit was made beyond the initial commit + const gitLog = spawnSync('git', ['log', '--oneline'], { + cwd: qaFixDir, stdio: 'pipe', + }); + const commits = gitLog.stdout.toString().trim().split('\n'); + console.log(`/qa fix loop: ${commits.length} commits total (1 initial + ${commits.length - 1} fixes)`); + expect(commits.length).toBeGreaterThan(1); + + // Verify Edit tool was used (agent actually modified source code) + const editCalls = result.toolCalls.filter(tc => tc.tool === 'Edit'); + expect(editCalls.length).toBeGreaterThan(0); + }, 360_000); +}); + +// --- Plan-Eng-Review Test-Plan Artifact E2E --- + +describeE2E('Plan-Eng-Review Test-Plan Artifact E2E', () => { + let planDir: string; + let projectDir: string; + + beforeAll(() => { + planDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-plan-artifact-')); + const { spawnSync } = require('child_process'); + const run = (cmd: string, args: string[]) => + spawnSync(cmd, args, { cwd: planDir, stdio: 'pipe', timeout: 5000 }); + + run('git', ['init']); + run('git', ['config', 'user.email', 'test@test.com']); + run('git', ['config', 'user.name', 'Test']); + + // Create base commit on main + fs.writeFileSync(path.join(planDir, 'app.ts'), 'export function greet() { return "hello"; }\n'); + run('git', ['add', '.']); + run('git', ['commit', '-m', 'initial']); + + // Create feature branch with changes + run('git', ['checkout', '-b', 'feature/add-dashboard']); + fs.writeFileSync(path.join(planDir, 'dashboard.ts'), `export function Dashboard() { + const data = fetchStats(); + return { users: data.users, revenue: data.revenue }; +} +function fetchStats() { + return fetch('/api/stats').then(r => r.json()); +} +`); + fs.writeFileSync(path.join(planDir, 'app.ts'), `import { Dashboard } from "./dashboard"; +export function greet() { return "hello"; } +export function main() { return Dashboard(); } +`); + run('git', ['add', '.']); + run('git', ['commit', '-m', 'feat: add dashboard']); + + // Plan document + fs.writeFileSync(path.join(planDir, 'plan.md'), `# Plan: Add Dashboard + +## Changes +1. New \`dashboard.ts\` with Dashboard component and fetchStats API call +2. Updated \`app.ts\` to import and use Dashboard + +## Architecture +- Dashboard fetches from \`/api/stats\` endpoint +- Returns user count and revenue metrics +`); + run('git', ['add', 'plan.md']); + run('git', ['commit', '-m', 'add plan']); + + // Copy plan-eng-review skill + fs.mkdirSync(path.join(planDir, 'plan-eng-review'), { recursive: true }); + fs.copyFileSync( + path.join(ROOT, 'plan-eng-review', 'SKILL.md'), + path.join(planDir, 'plan-eng-review', 'SKILL.md'), + ); + + // Set up remote-slug shim and browse shims (plan-eng-review uses remote-slug for artifact path) + setupBrowseShims(planDir); + + // Create project directory for artifacts + projectDir = path.join(os.homedir(), '.gstack', 'projects', 'test-project'); + fs.mkdirSync(projectDir, { recursive: true }); + }); + + afterAll(() => { + try { fs.rmSync(planDir, { recursive: true, force: true }); } catch {} + // Clean up test-plan artifacts (but not the project dir itself) + try { + const files = fs.readdirSync(projectDir); + for (const f of files) { + if (f.includes('test-plan')) { + fs.unlinkSync(path.join(projectDir, f)); + } + } + } catch {} + }); + + test('/plan-eng-review writes test-plan artifact to ~/.gstack/projects/', async () => { + // Count existing test-plan files before + const beforeFiles = fs.readdirSync(projectDir).filter(f => f.includes('test-plan')); + + const result = await runSkillTest({ + prompt: `Read plan-eng-review/SKILL.md for the review workflow. + +Read plan.md — that's the plan to review. This is a standalone plan with source code in app.ts and dashboard.ts. + +Choose SMALL CHANGE mode. Skip any AskUserQuestion calls — this is non-interactive. + +IMPORTANT: After your review, you MUST write the test-plan artifact as described in the "Test Plan Artifact" section of SKILL.md. The remote-slug shim is at ${planDir}/browse/bin/remote-slug. + +Write your review to ${planDir}/review-output.md`, + workingDirectory: planDir, + maxTurns: 20, + allowedTools: ['Bash', 'Read', 'Write', 'Glob', 'Grep'], + timeout: 360_000, + testName: 'plan-eng-review-artifact', + runId, + }); + + logCost('/plan-eng-review artifact', result); + recordE2E('/plan-eng-review test-plan artifact', 'Plan-Eng-Review Test-Plan Artifact E2E', result, { + passed: ['success', 'error_max_turns'].includes(result.exitReason), + }); + + expect(['success', 'error_max_turns']).toContain(result.exitReason); + + // Verify test-plan artifact was written + const afterFiles = fs.readdirSync(projectDir).filter(f => f.includes('test-plan')); + const newFiles = afterFiles.filter(f => !beforeFiles.includes(f)); + console.log(`Test-plan artifacts: ${beforeFiles.length} before, ${afterFiles.length} after, ${newFiles.length} new`); + + if (newFiles.length > 0) { + const content = fs.readFileSync(path.join(projectDir, newFiles[0]), 'utf-8'); + console.log(`Test-plan artifact (${newFiles[0]}): ${content.length} chars`); + expect(content.length).toBeGreaterThan(50); + } else { + console.warn('No test-plan artifact found — agent may not have followed artifact instructions'); + } + + // Soft assertion: we expect an artifact but agent compliance is not guaranteed + expect(newFiles.length).toBeGreaterThanOrEqual(1); + }, 420_000); +}); + // --- Deferred skill E2E tests (destructive or require interactive UI) --- describeE2E('Deferred skill E2E', () => {