mirror of
https://github.com/garrytan/gstack.git
synced 2026-05-03 12:15:14 +02:00
dc5e0538e5
* refactor: extract gen-skill-docs into modular resolver architecture Break the 3000-line monolith into 10 domain modules under scripts/resolvers/: types, constants, preamble, utility, browse, design, testing, review, codex-helpers, and index. Each module owns one domain of template generation. The preamble module introduces a 4-tier composition system (T1-T4) so skills only pay for the preamble sections they actually need, reducing token usage for lightweight skills by ~40%. Adds a token budget dashboard that prints after every generation run showing per-skill and total token counts. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> * feat: tiered preamble — skills only pay for what they use Tag all 23 templates with preamble-tier (T1-T4). Lightweight skills like /browse and /benchmark get a minimal preamble (~40% fewer tokens), while review skills get the full stack. Regenerate all SKILL.md files. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> * feat: migrate eval storage to project-scoped paths Move eval results and E2E run artifacts from ~/.gstack-dev/evals/ to ~/.gstack/projects/$SLUG/evals/ so each project's eval history lives alongside its other gstack data. Falls back to legacy path if slug detection fails. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> * fix: sync package.json version with VERSION after merge Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> * feat: add WorktreeManager for isolated test environments Reusable platform module (lib/worktree.ts) that creates git worktrees for test isolation and harvests useful changes as patches. Includes SHA-256 dedup, original SHA tracking for committed change detection, and automatic gitignored artifact copying (.agents/, browse/dist/). 12 unit tests covering lifecycle, harvest, dedup, and error handling. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> * feat: integrate worktree isolation into E2E test infrastructure Add createTestWorktree(), harvestAndCleanup(), and describeWithWorktree() helpers to e2e-helpers.ts. Add harvest field to EvalTestEntry for eval-store integration. Register lib/worktree.ts as a global touchfile. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> * feat: run Gemini and Codex E2E tests in worktrees Switch both test suites from cwd: ROOT to worktree isolation. Gemini (--yolo) no longer pollutes the working tree. Codex (read-only) gets worktree for consistency. Useful changes are harvested as patches for cherry-picking. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> * fix: skip symlinks in copyDirSync to prevent infinite recursion Adversarial review caught that .claude/skills/gstack may be a symlink back to the repo root, causing copyDirSync to recurse infinitely when copying gitignored artifacts into worktrees. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> * chore: bump version and changelog (v0.11.12.0) Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> * fix: relax session-awareness assertion to accept structured options The LLM consistently presents well-formatted A/B choices with pros/cons but doesn't always use the exact string "RECOMMENDATION". Accept case-insensitive "recommend", "option a", "which do you want", or "which approach" as equivalent signals of a structured recommendation. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> --------- Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
189 lines
6.5 KiB
TypeScript
189 lines
6.5 KiB
TypeScript
#!/usr/bin/env bun
|
|
/**
|
|
* Aggregate summary of all eval runs from ~/.gstack-dev/evals/
|
|
*
|
|
* Usage: bun run eval:summary
|
|
*/
|
|
|
|
import * as fs from 'fs';
|
|
import * as path from 'path';
|
|
import * as os from 'os';
|
|
import type { EvalResult } from '../test/helpers/eval-store';
|
|
import { getProjectEvalDir } from '../test/helpers/eval-store';
|
|
|
|
const EVAL_DIR = getProjectEvalDir();
|
|
|
|
let files: string[];
|
|
try {
|
|
files = fs.readdirSync(EVAL_DIR).filter(f => f.endsWith('.json'));
|
|
} catch {
|
|
console.log('No eval runs yet. Run: EVALS=1 bun run test:evals');
|
|
process.exit(0);
|
|
}
|
|
|
|
if (files.length === 0) {
|
|
console.log('No eval runs yet. Run: EVALS=1 bun run test:evals');
|
|
process.exit(0);
|
|
}
|
|
|
|
// Load all results
|
|
const results: EvalResult[] = [];
|
|
for (const file of files) {
|
|
try {
|
|
results.push(JSON.parse(fs.readFileSync(path.join(EVAL_DIR, file), 'utf-8')));
|
|
} catch { continue; }
|
|
}
|
|
|
|
// Aggregate stats
|
|
const e2eRuns = results.filter(r => r.tier === 'e2e');
|
|
const judgeRuns = results.filter(r => r.tier === 'llm-judge');
|
|
const totalCost = results.reduce((s, r) => s + (r.total_cost_usd || 0), 0);
|
|
const avgE2ECost = e2eRuns.length > 0 ? e2eRuns.reduce((s, r) => s + r.total_cost_usd, 0) / e2eRuns.length : 0;
|
|
const avgJudgeCost = judgeRuns.length > 0 ? judgeRuns.reduce((s, r) => s + r.total_cost_usd, 0) / judgeRuns.length : 0;
|
|
|
|
// Duration + turns from E2E runs
|
|
const avgE2EDuration = e2eRuns.length > 0
|
|
? e2eRuns.reduce((s, r) => s + (r.total_duration_ms || 0), 0) / e2eRuns.length
|
|
: 0;
|
|
const e2eTurns: number[] = [];
|
|
for (const r of e2eRuns) {
|
|
const runTurns = r.tests.reduce((s, t) => s + (t.turns_used || 0), 0);
|
|
if (runTurns > 0) e2eTurns.push(runTurns);
|
|
}
|
|
const avgE2ETurns = e2eTurns.length > 0
|
|
? e2eTurns.reduce((a, b) => a + b, 0) / e2eTurns.length
|
|
: 0;
|
|
|
|
// Per-test efficiency stats (avg turns + duration across runs)
|
|
const testEfficiency = new Map<string, { turns: number[]; durations: number[]; costs: number[] }>();
|
|
for (const r of e2eRuns) {
|
|
for (const t of r.tests) {
|
|
if (!testEfficiency.has(t.name)) {
|
|
testEfficiency.set(t.name, { turns: [], durations: [], costs: [] });
|
|
}
|
|
const stats = testEfficiency.get(t.name)!;
|
|
if (t.turns_used !== undefined) stats.turns.push(t.turns_used);
|
|
if (t.duration_ms > 0) stats.durations.push(t.duration_ms);
|
|
if (t.cost_usd > 0) stats.costs.push(t.cost_usd);
|
|
}
|
|
}
|
|
|
|
// Detection rates from outcome evals
|
|
const detectionRates: number[] = [];
|
|
for (const r of e2eRuns) {
|
|
for (const t of r.tests) {
|
|
if (t.detection_rate !== undefined) {
|
|
detectionRates.push(t.detection_rate);
|
|
}
|
|
}
|
|
}
|
|
const avgDetection = detectionRates.length > 0
|
|
? detectionRates.reduce((a, b) => a + b, 0) / detectionRates.length
|
|
: null;
|
|
|
|
// Flaky tests (passed in some runs, failed in others)
|
|
const testResults = new Map<string, boolean[]>();
|
|
for (const r of results) {
|
|
for (const t of r.tests) {
|
|
const key = `${r.tier}:${t.name}`;
|
|
if (!testResults.has(key)) testResults.set(key, []);
|
|
testResults.get(key)!.push(t.passed);
|
|
}
|
|
}
|
|
const flakyTests: string[] = [];
|
|
for (const [name, outcomes] of testResults) {
|
|
if (outcomes.length >= 2) {
|
|
const hasPass = outcomes.some(o => o);
|
|
const hasFail = outcomes.some(o => !o);
|
|
if (hasPass && hasFail) flakyTests.push(name);
|
|
}
|
|
}
|
|
|
|
// Branch stats
|
|
const branchStats = new Map<string, { runs: number; avgDetection: number; detections: number[] }>();
|
|
for (const r of e2eRuns) {
|
|
if (!branchStats.has(r.branch)) {
|
|
branchStats.set(r.branch, { runs: 0, avgDetection: 0, detections: [] });
|
|
}
|
|
const stats = branchStats.get(r.branch)!;
|
|
stats.runs++;
|
|
for (const t of r.tests) {
|
|
if (t.detection_rate !== undefined) {
|
|
stats.detections.push(t.detection_rate);
|
|
}
|
|
}
|
|
}
|
|
for (const stats of branchStats.values()) {
|
|
stats.avgDetection = stats.detections.length > 0
|
|
? stats.detections.reduce((a, b) => a + b, 0) / stats.detections.length
|
|
: 0;
|
|
}
|
|
|
|
// Print summary
|
|
console.log('');
|
|
console.log('Eval Summary');
|
|
console.log('═'.repeat(70));
|
|
console.log(` Total runs: ${results.length} (${e2eRuns.length} e2e, ${judgeRuns.length} llm-judge)`);
|
|
console.log(` Total spend: $${totalCost.toFixed(2)}`);
|
|
console.log(` Avg cost/e2e: $${avgE2ECost.toFixed(2)}`);
|
|
console.log(` Avg cost/judge: $${avgJudgeCost.toFixed(2)}`);
|
|
if (avgE2EDuration > 0) {
|
|
console.log(` Avg duration/e2e: ${Math.round(avgE2EDuration / 1000)}s`);
|
|
}
|
|
if (avgE2ETurns > 0) {
|
|
console.log(` Avg turns/e2e: ${Math.round(avgE2ETurns)}`);
|
|
}
|
|
if (avgDetection !== null) {
|
|
console.log(` Avg detection: ${avgDetection.toFixed(1)} bugs`);
|
|
}
|
|
console.log('─'.repeat(70));
|
|
|
|
// Per-test efficiency averages (only if we have enough data)
|
|
if (testEfficiency.size > 0 && e2eRuns.length >= 2) {
|
|
console.log(' Per-test efficiency (averages across runs):');
|
|
const sorted = [...testEfficiency.entries()]
|
|
.filter(([, s]) => s.turns.length >= 2)
|
|
.sort((a, b) => {
|
|
const avgA = a[1].costs.reduce((s, c) => s + c, 0) / a[1].costs.length;
|
|
const avgB = b[1].costs.reduce((s, c) => s + c, 0) / b[1].costs.length;
|
|
return avgB - avgA;
|
|
});
|
|
for (const [name, stats] of sorted) {
|
|
const avgT = Math.round(stats.turns.reduce((a, b) => a + b, 0) / stats.turns.length);
|
|
const avgD = Math.round(stats.durations.reduce((a, b) => a + b, 0) / stats.durations.length / 1000);
|
|
const avgC = (stats.costs.reduce((a, b) => a + b, 0) / stats.costs.length).toFixed(2);
|
|
const label = name.length > 30 ? name.slice(0, 27) + '...' : name.padEnd(30);
|
|
console.log(` ${label} $${avgC} ${avgT}t ${avgD}s (${stats.turns.length} runs)`);
|
|
}
|
|
console.log('─'.repeat(70));
|
|
}
|
|
|
|
if (flakyTests.length > 0) {
|
|
console.log(` Flaky tests (${flakyTests.length}):`);
|
|
for (const name of flakyTests) {
|
|
console.log(` - ${name}`);
|
|
}
|
|
console.log('─'.repeat(70));
|
|
}
|
|
|
|
if (branchStats.size > 0) {
|
|
console.log(' Branches:');
|
|
const sorted = [...branchStats.entries()].sort((a, b) => b[1].avgDetection - a[1].avgDetection);
|
|
for (const [branch, stats] of sorted) {
|
|
const det = stats.detections.length > 0 ? ` avg det: ${stats.avgDetection.toFixed(1)}` : '';
|
|
console.log(` ${branch.padEnd(30)} ${stats.runs} runs${det}`);
|
|
}
|
|
console.log('─'.repeat(70));
|
|
}
|
|
|
|
// Date range
|
|
const timestamps = results.map(r => r.timestamp).filter(Boolean).sort();
|
|
if (timestamps.length > 0) {
|
|
const first = timestamps[0].replace('T', ' ').slice(0, 16);
|
|
const last = timestamps[timestamps.length - 1].replace('T', ' ').slice(0, 16);
|
|
console.log(` Date range: ${first} → ${last}`);
|
|
}
|
|
|
|
console.log(` Dir: ${EVAL_DIR}`);
|
|
console.log('');
|