feat: eval efficiency metrics — turns, duration, commentary across all surfaces

Add generateCommentary() for natural-language delta interpretation, per-test turns/duration in the comparison and summary output, judgePassed unit tests, and 3 new E2E tests (qa-only, qa fix loop, plan artifact).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-06-23 18:20:00 +02:00 · 2026-03-15 21:17:12 -05:00
parent 4120537d66
commit 03a6270b9c
5 changed files with 825 additions and 35 deletions
@@ -47,6 +47,8 @@ interface RunSummary {
  passed: number;
  total: number;
  cost: number;
+  duration: number;
+  turns: number;
 }

 const runs: RunSummary[] = [];
@@ -55,6 +57,7 @@ for (const file of files) {
    const data = JSON.parse(fs.readFileSync(path.join(EVAL_DIR, file), 'utf-8'));
    if (filterBranch && data.branch !== filterBranch) continue;
    if (filterTier && data.tier !== filterTier) continue;
+    const totalTurns = (data.tests || []).reduce((s: number, t: any) => s + (t.turns_used || 0), 0);
    runs.push({
      file,
      timestamp: data.timestamp || '',
@@ -64,6 +67,8 @@ for (const file of files) {
      passed: data.passed || 0,
      total: data.total_tests || 0,
      cost: data.total_cost_usd || 0,
+      duration: data.total_duration_ms || 0,
+      turns: totalTurns,
    });
  } catch { continue; }
 }
@@ -77,29 +82,35 @@ const displayed = runs.slice(0, limit);
 // Print table
 console.log('');
 console.log(`Eval History (${runs.length} total runs)`);
-console.log('═'.repeat(90));
+console.log('═'.repeat(105));
 console.log(
  '  ' +
  'Date'.padEnd(17) +
-  'Branch'.padEnd(28) +
+  'Branch'.padEnd(25) +
  'Tier'.padEnd(12) +
  'Pass'.padEnd(8) +
  'Cost'.padEnd(8) +
+  'Turns'.padEnd(7) +
+  'Duration'.padEnd(10) +
  'Version'
 );
-console.log('─'.repeat(90));
+console.log('─'.repeat(105));

 for (const run of displayed) {
  const date = run.timestamp.replace('T', ' ').slice(0, 16);
-  const branch = run.branch.length > 26 ? run.branch.slice(0, 23) + '...' : run.branch.padEnd(28);
+  const branch = run.branch.length > 23 ? run.branch.slice(0, 20) + '...' : run.branch.padEnd(25);
  const pass = `${run.passed}/${run.total}`.padEnd(8);
  const cost = `$${run.cost.toFixed(2)}`.padEnd(8);
-  console.log(`  ${date.padEnd(17)}${branch}${run.tier.padEnd(12)}${pass}${cost}v${run.version}`);
+  const turns = run.turns > 0 ? `${run.turns}t`.padEnd(7) : ''.padEnd(7);
+  const dur = run.duration > 0 ? `${Math.round(run.duration / 1000)}s`.padEnd(10) : ''.padEnd(10);
+  console.log(`  ${date.padEnd(17)}${branch}${run.tier.padEnd(12)}${pass}${cost}${turns}${dur}v${run.version}`);
 }

-console.log('─'.repeat(90));
+console.log('─'.repeat(105));

 const totalCost = runs.reduce((s, r) => s + r.cost, 0);
-console.log(`  ${runs.length} runs | Total spend: $${totalCost.toFixed(2)} | Showing: ${displayed.length}`);
+const totalDur = runs.reduce((s, r) => s + r.duration, 0);
+const totalTurns = runs.reduce((s, r) => s + r.turns, 0);
+console.log(`  ${runs.length} runs | $${totalCost.toFixed(2)} total | ${totalTurns} turns | ${Math.round(totalDur / 1000)}s | Showing: ${displayed.length}`);
 console.log(`  Dir: ${EVAL_DIR}`);
 console.log('');
@@ -40,6 +40,33 @@ const totalCost = results.reduce((s, r) => s + (r.total_cost_usd || 0), 0);
 const avgE2ECost = e2eRuns.length > 0 ? e2eRuns.reduce((s, r) => s + r.total_cost_usd, 0) / e2eRuns.length : 0;
 const avgJudgeCost = judgeRuns.length > 0 ? judgeRuns.reduce((s, r) => s + r.total_cost_usd, 0) / judgeRuns.length : 0;

+// Duration + turns from E2E runs
+const avgE2EDuration = e2eRuns.length > 0
+  ? e2eRuns.reduce((s, r) => s + (r.total_duration_ms || 0), 0) / e2eRuns.length
+  : 0;
+const e2eTurns: number[] = [];
+for (const r of e2eRuns) {
+  const runTurns = r.tests.reduce((s, t) => s + (t.turns_used || 0), 0);
+  if (runTurns > 0) e2eTurns.push(runTurns);
+}
+const avgE2ETurns = e2eTurns.length > 0
+  ? e2eTurns.reduce((a, b) => a + b, 0) / e2eTurns.length
+  : 0;
+
+// Per-test efficiency stats (avg turns + duration across runs)
+const testEfficiency = new Map<string, { turns: number[]; durations: number[]; costs: number[] }>();
+for (const r of e2eRuns) {
+  for (const t of r.tests) {
+    if (!testEfficiency.has(t.name)) {
+      testEfficiency.set(t.name, { turns: [], durations: [], costs: [] });
+    }
+    const stats = testEfficiency.get(t.name)!;
+    if (t.turns_used !== undefined) stats.turns.push(t.turns_used);
+    if (t.duration_ms > 0) stats.durations.push(t.duration_ms);
+    if (t.cost_usd > 0) stats.costs.push(t.cost_usd);
+  }
+}
+
 // Detection rates from outcome evals
 const detectionRates: number[] = [];
 for (const r of e2eRuns) {
@@ -94,22 +121,48 @@ for (const stats of branchStats.values()) {
 // Print summary
 console.log('');
 console.log('Eval Summary');
-console.log('═'.repeat(60));
+console.log('═'.repeat(70));
 console.log(`  Total runs:        ${results.length} (${e2eRuns.length} e2e, ${judgeRuns.length} llm-judge)`);
 console.log(`  Total spend:       $${totalCost.toFixed(2)}`);
 console.log(`  Avg cost/e2e:      $${avgE2ECost.toFixed(2)}`);
 console.log(`  Avg cost/judge:    $${avgJudgeCost.toFixed(2)}`);
+if (avgE2EDuration > 0) {
+  console.log(`  Avg duration/e2e:  ${Math.round(avgE2EDuration / 1000)}s`);
+}
+if (avgE2ETurns > 0) {
+  console.log(`  Avg turns/e2e:     ${Math.round(avgE2ETurns)}`);
+}
 if (avgDetection !== null) {
  console.log(`  Avg detection:     ${avgDetection.toFixed(1)} bugs`);
 }
-console.log('─'.repeat(60));
+console.log('─'.repeat(70));
+
+// Per-test efficiency averages (only if we have enough data)
+if (testEfficiency.size > 0 && e2eRuns.length >= 2) {
+  console.log('  Per-test efficiency (averages across runs):');
+  const sorted = [...testEfficiency.entries()]
+    .filter(([, s]) => s.turns.length >= 2)
+    .sort((a, b) => {
+      const avgA = a[1].costs.reduce((s, c) => s + c, 0) / a[1].costs.length;
+      const avgB = b[1].costs.reduce((s, c) => s + c, 0) / b[1].costs.length;
+      return avgB - avgA;
+    });
+  for (const [name, stats] of sorted) {
+    const avgT = Math.round(stats.turns.reduce((a, b) => a + b, 0) / stats.turns.length);
+    const avgD = Math.round(stats.durations.reduce((a, b) => a + b, 0) / stats.durations.length / 1000);
+    const avgC = (stats.costs.reduce((a, b) => a + b, 0) / stats.costs.length).toFixed(2);
+    const label = name.length > 30 ? name.slice(0, 27) + '...' : name.padEnd(30);
+    console.log(`    ${label}  $${avgC}  ${avgT}t  ${avgD}s  (${stats.turns.length} runs)`);
+  }
+  console.log('─'.repeat(70));
+}

 if (flakyTests.length > 0) {
  console.log(`  Flaky tests (${flakyTests.length}):`);
  for (const name of flakyTests) {
    console.log(`    - ${name}`);
  }
-  console.log('─'.repeat(60));
+  console.log('─'.repeat(70));
 }

 if (branchStats.size > 0) {
@@ -119,7 +172,7 @@ if (branchStats.size > 0) {
    const det = stats.detections.length > 0 ? ` avg det: ${stats.avgDetection.toFixed(1)}` : '';
    console.log(`    ${branch.padEnd(30)} ${stats.runs} runs${det}`);
  }
-  console.log('─'.repeat(60));
+  console.log('─'.repeat(70));
 }

 // Date range