diff --git a/CLAUDE.md b/CLAUDE.md
index e565a4b6..c6909357 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -5,20 +5,21 @@
 ```bash
 bun install             # install dependencies
 bun test                # run free tests (browse + snapshot + skill validation)
-bun run test:evals      # run paid evals: LLM judge + Agent SDK E2E (~$4/run)
-bun run test:e2e        # run Agent SDK E2E tests only (~$3.85/run)
+bun run test:evals      # run paid evals: LLM judge + E2E (~$4/run)
+bun run test:e2e        # run E2E tests only (~$3.85/run)
 bun run dev             # run CLI in dev mode, e.g. bun run dev goto https://example.com
 bun run build           # gen docs + compile binaries
 bun run gen:skill-docs  # regenerate SKILL.md files from templates
 bun run skill:check     # health dashboard for all skills
 bun run dev:skill       # watch mode: auto-regen + validate on change
+bun run eval:list       # list all eval runs from ~/.gstack-dev/evals/
+bun run eval:compare    # compare two eval runs (auto-picks most recent)
+bun run eval:summary    # aggregate stats across all eval runs
 ```

-`test:evals` requires `ANTHROPIC_API_KEY` and must be run from a plain terminal
-(not inside Claude Code — nested Agent SDK sessions hang).
-
-**Update (v0.3.5):** The session runner now strips CLAUDE* env vars automatically,
-so `test:evals` may work inside Claude Code. If E2E tests hang, run from a plain terminal.
+`test:evals` requires `ANTHROPIC_API_KEY`. E2E tests stream progress in real time
+(tool-by-tool via `--output-format stream-json --verbose`). Results are persisted
+to `~/.gstack-dev/evals/` with auto-comparison against the previous run.

 ## Project structure

@@ -35,12 +36,12 @@ gstack/
 │   ├── skill-check.ts            # Health dashboard
 │   └── dev-skill.ts              # Watch mode
 ├── test/                         # Skill validation + eval tests
-│   ├── helpers/                  # skill-parser.ts, session-runner.ts, llm-judge.ts
+│   ├── helpers/                  # skill-parser.ts, session-runner.ts, llm-judge.ts, eval-store.ts
 │   ├── fixtures/                 # Ground truth JSON, planted-bug fixtures, eval baselines
 │   ├── skill-validation.test.ts  # Tier 1: static validation (free, <1s)
 │   ├── gen-skill-docs.test.ts    # Tier 1: generator quality (free, <1s)
 │   ├── skill-llm-eval.test.ts    # Tier 3: LLM-as-judge (~$0.15/run)
-│   └── skill-e2e.test.ts         # Tier 2: Agent SDK E2E (~$3.85/run)
+│   └── skill-e2e.test.ts         # Tier 2: E2E via claude -p (~$3.85/run)
 ├── ship/                         # Ship workflow skill
 ├── review/                       # PR review skill
 ├── plan-ceo-review/              # /plan-ceo-review skill
diff --git a/TODOS.md b/TODOS.md
index edbc25f5..2b9bda3b 100644
--- a/TODOS.md
+++ b/TODOS.md
@@ -22,3 +22,27 @@
 **Depends on:** v0.3.5 shipping first (the `{{UPDATE_CHECK}}` resolver).
 **Effort:** S (small, ~20 min)
 **Priority:** P2 (prevents drift on next preamble change)
+
+## GitHub Actions eval upload
+
+**What:** Run the eval suite in CI, upload the result JSON as an artifact, and post a summary comment on the PR.
+
+**Why:** Evals currently run only locally. CI integration would catch quality regressions before merge and provide a persistent record of eval results per PR.
+
+**Context:** Requires `ANTHROPIC_API_KEY` in CI secrets. Cost is ~$4/run. The eval persistence system (v0.3.6) writes JSON to `~/.gstack-dev/evals/` — CI would upload these as GitHub Actions artifacts and use `eval:compare` to post a delta comment on the PR.
+
+**Depends on:** Eval persistence shipping (v0.3.6).
+**Effort:** M (medium)
+**Priority:** P2
+
+## Eval web dashboard
+
+**What:** `bun run eval:dashboard` serves a local HTML page with charts: cost trending, detection rate over time, pass/fail history.
+
+**Why:** The CLI tools (`eval:list`, `eval:compare`, `eval:summary`) are good for quick checks, but visual charts are better for spotting trends over many runs.
+
+**Context:** Reads the same `~/.gstack-dev/evals/*.json` files. ~200 lines of HTML + chart.js served via a simple Bun HTTP server. No new dependencies beyond what's already installed.
+
+**Depends on:** Eval persistence + eval:list shipping (v0.3.6).
+**Effort:** M (medium)
+**Priority:** P3 (nice-to-have, revisit after eval system sees regular use)
diff --git a/package.json b/package.json
index ea507c2a..38c9a0b6 100644
--- a/package.json
+++ b/package.json
@@ -17,7 +17,10 @@
     "test:e2e": "EVALS=1 bun test test/skill-e2e.test.ts",
     "skill:check": "bun run scripts/skill-check.ts",
     "dev:skill": "bun run scripts/dev-skill.ts",
-    "start": "bun run browse/src/server.ts"
+    "start": "bun run browse/src/server.ts",
+    "eval:list": "bun run scripts/eval-list.ts",
+    "eval:compare": "bun run scripts/eval-compare.ts",
+    "eval:summary": "bun run scripts/eval-summary.ts"
   },
   "dependencies": {
     "playwright": "^1.58.2",
@@ -37,7 +40,6 @@
     "devtools"
   ],
   "devDependencies": {
-    "@anthropic-ai/claude-agent-sdk": "^0.2.75",
     "@anthropic-ai/sdk": "^0.78.0"
   }
 }
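The three new scripts below all read persisted run JSON through the `EvalResult` type from `test/helpers/eval-store.ts`, which is not part of this diff. For orientation, here is a minimal sketch of the shape they rely on, inferred purely from the fields the scripts access; the canonical definition lives in eval-store.ts and may carry additional fields.

```ts
// Sketch only — inferred from field accesses in eval-list/eval-compare/eval-summary.
// The real definition lives in test/helpers/eval-store.ts.
interface EvalTestResult {
  name: string;
  passed: boolean;
  detection_rate?: number;          // only present for planted-bug outcome evals
}

interface EvalResult {
  schema_version: string | number;  // only compared for equality to warn on mismatch
  timestamp: string;                // ISO 8601; used for sorting and date ranges
  branch: string;
  tier: 'e2e' | 'llm-judge';
  version: string;
  passed: number;
  total_tests: number;
  total_cost_usd: number;
  tests: EvalTestResult[];
}
```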
diff --git a/scripts/eval-compare.ts b/scripts/eval-compare.ts
new file mode 100644
index 00000000..6e2f6a8c
--- /dev/null
+++ b/scripts/eval-compare.ts
@@ -0,0 +1,96 @@
+#!/usr/bin/env bun
+/**
+ * Compare two eval runs from ~/.gstack-dev/evals/
+ *
+ * Usage:
+ *   bun run eval:compare                    # compare two most recent of same tier
+ *   bun run eval:compare <file>             # compare file against its predecessor
+ *   bun run eval:compare <before> <after>   # compare two specific files
+ */
+
+import * as fs from 'fs';
+import * as path from 'path';
+import * as os from 'os';
+import {
+  findPreviousRun,
+  compareEvalResults,
+  formatComparison,
+} from '../test/helpers/eval-store';
+import type { EvalResult } from '../test/helpers/eval-store';
+
+const EVAL_DIR = path.join(os.homedir(), '.gstack-dev', 'evals');
+
+function loadResult(filepath: string): EvalResult {
+  // Resolve relative to EVAL_DIR if not absolute
+  const resolved = path.isAbsolute(filepath) ? filepath : path.join(EVAL_DIR, filepath);
+  if (!fs.existsSync(resolved)) {
+    console.error(`File not found: ${resolved}`);
+    process.exit(1);
+  }
+  return JSON.parse(fs.readFileSync(resolved, 'utf-8'));
+}
+
+const args = process.argv.slice(2);
+
+let beforeFile: string;
+let afterFile: string;
+
+if (args.length === 2) {
+  // Two explicit files
+  beforeFile = args[0];
+  afterFile = args[1];
+} else if (args.length === 1) {
+  // One file — find its predecessor
+  afterFile = args[0];
+  const resolved = path.isAbsolute(afterFile) ? afterFile : path.join(EVAL_DIR, afterFile);
+  const afterResult = loadResult(resolved);
+  const prev = findPreviousRun(EVAL_DIR, afterResult.tier, afterResult.branch, resolved);
+  if (!prev) {
+    console.log('No previous run found to compare against.');
+    process.exit(0);
+  }
+  beforeFile = prev;
+} else {
+  // No args — find two most recent of the same tier
+  let files: string[];
+  try {
+    files = fs.readdirSync(EVAL_DIR)
+      .filter(f => f.endsWith('.json'))
+      .sort()
+      .reverse();
+  } catch {
+    console.log('No eval runs yet. Run: EVALS=1 bun run test:evals');
+    process.exit(0);
+  }
+
+  if (files.length < 2) {
+    console.log('Need at least 2 eval runs to compare. Run evals again.');
+    process.exit(0);
+  }
+
+  // Most recent file
+  afterFile = path.join(EVAL_DIR, files[0]);
+  const afterResult = loadResult(afterFile);
+  const prev = findPreviousRun(EVAL_DIR, afterResult.tier, afterResult.branch, afterFile);
+  if (!prev) {
+    console.log('No previous run of the same tier found to compare against.');
+    process.exit(0);
+  }
+  beforeFile = prev;
+}
+
+const beforeResult = loadResult(beforeFile);
+const afterResult = loadResult(afterFile);
+
+// Warn if different tiers
+if (beforeResult.tier !== afterResult.tier) {
+  console.warn(`Warning: comparing different tiers (${beforeResult.tier} vs ${afterResult.tier})`);
+}
+
+// Warn on schema mismatch
+if (beforeResult.schema_version !== afterResult.schema_version) {
+  console.warn(`Warning: schema version mismatch (${beforeResult.schema_version} vs ${afterResult.schema_version})`);
+}
+
+const comparison = compareEvalResults(beforeResult, afterResult, beforeFile, afterFile);
+console.log(formatComparison(comparison));
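`eval-compare.ts` leans on `findPreviousRun` from eval-store.ts, which this diff does not include. Based only on how it is called above (directory, tier, branch, current file; returns a path or nothing), a plausible sketch looks roughly like the following. This is an illustration, not the actual helper; the matching rules and ordering are assumptions.

```ts
// Illustrative sketch only; the real helper lives in test/helpers/eval-store.ts
// and may differ (e.g. whether branch must match, or how runs are ordered).
import * as fs from 'fs';
import * as path from 'path';

export function findPreviousRun(
  dir: string,
  tier: string,
  branch: string,
  currentFile: string
): string | null {
  let files: string[];
  try {
    files = fs.readdirSync(dir).filter(f => f.endsWith('.json'));
  } catch {
    return null;
  }
  const candidates = files
    .map(f => path.join(dir, f))
    .filter(f => f !== currentFile)
    .filter(f => {
      try {
        const run = JSON.parse(fs.readFileSync(f, 'utf-8'));
        // Assumption: match tier and branch; the real helper may match tier only.
        return run.tier === tier && run.branch === branch;
      } catch {
        return false;
      }
    })
    .sort(); // assumes timestamp-prefixed filenames sort chronologically
  const older = candidates.filter(f => f < currentFile);
  return older.length > 0 ? older[older.length - 1] : null;
}
```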
diff --git a/scripts/eval-list.ts b/scripts/eval-list.ts
new file mode 100644
index 00000000..96cb7a28
--- /dev/null
+++ b/scripts/eval-list.ts
@@ -0,0 +1,105 @@
+#!/usr/bin/env bun
+/**
+ * List eval runs from ~/.gstack-dev/evals/
+ *
+ * Usage: bun run eval:list [--branch <branch>] [--tier e2e|llm-judge] [--limit N]
+ */
+
+import * as fs from 'fs';
+import * as path from 'path';
+import * as os from 'os';
+
+const EVAL_DIR = path.join(os.homedir(), '.gstack-dev', 'evals');
+
+// Parse args
+const args = process.argv.slice(2);
+let filterBranch: string | null = null;
+let filterTier: string | null = null;
+let limit = 20;
+
+for (let i = 0; i < args.length; i++) {
+  if (args[i] === '--branch' && args[i + 1]) { filterBranch = args[++i]; }
+  else if (args[i] === '--tier' && args[i + 1]) { filterTier = args[++i]; }
+  else if (args[i] === '--limit' && args[i + 1]) { limit = parseInt(args[++i], 10); }
+}
+
+// Read eval files
+let files: string[];
+try {
+  files = fs.readdirSync(EVAL_DIR).filter(f => f.endsWith('.json'));
+} catch {
+  console.log('No eval runs yet. Run: EVALS=1 bun run test:evals');
+  process.exit(0);
+}
+
+if (files.length === 0) {
+  console.log('No eval runs yet. Run: EVALS=1 bun run test:evals');
+  process.exit(0);
+}
+
+// Parse top-level fields from each file
+interface RunSummary {
+  file: string;
+  timestamp: string;
+  branch: string;
+  tier: string;
+  version: string;
+  passed: number;
+  total: number;
+  cost: number;
+}
+
+const runs: RunSummary[] = [];
+for (const file of files) {
+  try {
+    const data = JSON.parse(fs.readFileSync(path.join(EVAL_DIR, file), 'utf-8'));
+    if (filterBranch && data.branch !== filterBranch) continue;
+    if (filterTier && data.tier !== filterTier) continue;
+    runs.push({
+      file,
+      timestamp: data.timestamp || '',
+      branch: data.branch || 'unknown',
+      tier: data.tier || 'unknown',
+      version: data.version || '?',
+      passed: data.passed || 0,
+      total: data.total_tests || 0,
+      cost: data.total_cost_usd || 0,
+    });
+  } catch { continue; }
+}
+
+// Sort by timestamp descending
+runs.sort((a, b) => b.timestamp.localeCompare(a.timestamp));
+
+// Apply limit
+const displayed = runs.slice(0, limit);
+
+// Print table
+console.log('');
+console.log(`Eval History (${runs.length} total runs)`);
+console.log('═'.repeat(90));
+console.log(
+  '  ' +
+  'Date'.padEnd(17) +
+  'Branch'.padEnd(28) +
+  'Tier'.padEnd(12) +
+  'Pass'.padEnd(8) +
+  'Cost'.padEnd(8) +
+  'Version'
+);
+console.log('─'.repeat(90));
+
+for (const run of displayed) {
+  const date = run.timestamp.replace('T', ' ').slice(0, 16);
+  // Truncate long branch names, then pad so the columns stay aligned
+  const branch = (run.branch.length > 26 ? run.branch.slice(0, 23) + '...' : run.branch).padEnd(28);
+  const pass = `${run.passed}/${run.total}`.padEnd(8);
+  const cost = `$${run.cost.toFixed(2)}`.padEnd(8);
+  console.log(`  ${date.padEnd(17)}${branch}${run.tier.padEnd(12)}${pass}${cost}v${run.version}`);
+}
+
+console.log('─'.repeat(90));
+
+const totalCost = runs.reduce((s, r) => s + r.cost, 0);
+console.log(`  ${runs.length} runs | Total spend: $${totalCost.toFixed(2)} | Showing: ${displayed.length}`);
+console.log(`  Dir: ${EVAL_DIR}`);
+console.log('');
diff --git a/scripts/eval-summary.ts b/scripts/eval-summary.ts
new file mode 100644
index 00000000..40b75fc3
--- /dev/null
+++ b/scripts/eval-summary.ts
@@ -0,0 +1,134 @@
+#!/usr/bin/env bun
+/**
+ * Aggregate summary of all eval runs from ~/.gstack-dev/evals/
+ *
+ * Usage: bun run eval:summary
+ */
+
+import * as fs from 'fs';
+import * as path from 'path';
+import * as os from 'os';
+import type { EvalResult } from '../test/helpers/eval-store';
+
+const EVAL_DIR = path.join(os.homedir(), '.gstack-dev', 'evals');
+
+let files: string[];
+try {
+  files = fs.readdirSync(EVAL_DIR).filter(f => f.endsWith('.json'));
+} catch {
+  console.log('No eval runs yet. Run: EVALS=1 bun run test:evals');
+  process.exit(0);
+}
+
+if (files.length === 0) {
+  console.log('No eval runs yet. Run: EVALS=1 bun run test:evals');
+  process.exit(0);
+}
+
+// Load all results
+const results: EvalResult[] = [];
+for (const file of files) {
+  try {
+    results.push(JSON.parse(fs.readFileSync(path.join(EVAL_DIR, file), 'utf-8')));
+  } catch { continue; }
+}
+
+// Aggregate stats
+const e2eRuns = results.filter(r => r.tier === 'e2e');
+const judgeRuns = results.filter(r => r.tier === 'llm-judge');
+const totalCost = results.reduce((s, r) => s + (r.total_cost_usd || 0), 0);
+const avgE2ECost = e2eRuns.length > 0 ? e2eRuns.reduce((s, r) => s + r.total_cost_usd, 0) / e2eRuns.length : 0;
+const avgJudgeCost = judgeRuns.length > 0 ? judgeRuns.reduce((s, r) => s + r.total_cost_usd, 0) / judgeRuns.length : 0;
+
+// Detection rates from outcome evals
+const detectionRates: number[] = [];
+for (const r of e2eRuns) {
+  for (const t of r.tests) {
+    if (t.detection_rate !== undefined) {
+      detectionRates.push(t.detection_rate);
+    }
+  }
+}
+const avgDetection = detectionRates.length > 0
+  ? detectionRates.reduce((a, b) => a + b, 0) / detectionRates.length
+  : null;
+
+// Flaky tests (passed in some runs, failed in others)
+const testResults = new Map<string, boolean[]>();
+for (const r of results) {
+  for (const t of r.tests) {
+    const key = `${r.tier}:${t.name}`;
+    if (!testResults.has(key)) testResults.set(key, []);
+    testResults.get(key)!.push(t.passed);
+  }
+}
+const flakyTests: string[] = [];
+for (const [name, outcomes] of testResults) {
+  if (outcomes.length >= 2) {
+    const hasPass = outcomes.some(o => o);
+    const hasFail = outcomes.some(o => !o);
+    if (hasPass && hasFail) flakyTests.push(name);
+  }
+}
+
+// Branch stats
+const branchStats = new Map<string, { runs: number; avgDetection: number; detections: number[] }>();
+for (const r of e2eRuns) {
+  if (!branchStats.has(r.branch)) {
+    branchStats.set(r.branch, { runs: 0, avgDetection: 0, detections: [] });
+  }
+  const stats = branchStats.get(r.branch)!;
+  stats.runs++;
+  for (const t of r.tests) {
+    if (t.detection_rate !== undefined) {
+      stats.detections.push(t.detection_rate);
+    }
+  }
+}
+for (const stats of branchStats.values()) {
+  stats.avgDetection = stats.detections.length > 0
+    ? stats.detections.reduce((a, b) => a + b, 0) / stats.detections.length
+    : 0;
+}
+
+// Print summary
+console.log('');
+console.log('Eval Summary');
+console.log('═'.repeat(60));
+console.log(`  Total runs: ${results.length} (${e2eRuns.length} e2e, ${judgeRuns.length} llm-judge)`);
+console.log(`  Total spend: $${totalCost.toFixed(2)}`);
+console.log(`  Avg cost/e2e: $${avgE2ECost.toFixed(2)}`);
+console.log(`  Avg cost/judge: $${avgJudgeCost.toFixed(2)}`);
+if (avgDetection !== null) {
+  console.log(`  Avg detection: ${avgDetection.toFixed(1)} bugs`);
+}
+console.log('─'.repeat(60));
+
+if (flakyTests.length > 0) {
+  console.log(`  Flaky tests (${flakyTests.length}):`);
+  for (const name of flakyTests) {
+    console.log(`    - ${name}`);
+  }
+  console.log('─'.repeat(60));
+}
+
+if (branchStats.size > 0) {
+  console.log('  Branches:');
+  const sorted = [...branchStats.entries()].sort((a, b) => b[1].avgDetection - a[1].avgDetection);
+  for (const [branch, stats] of sorted) {
+    const det = stats.detections.length > 0 ? `  avg det: ${stats.avgDetection.toFixed(1)}` : '';
+    console.log(`    ${branch.padEnd(30)} ${stats.runs} runs${det}`);
+  }
+  console.log('─'.repeat(60));
+}
+
+// Date range
+const timestamps = results.map(r => r.timestamp).filter(Boolean).sort();
+if (timestamps.length > 0) {
+  const first = timestamps[0].replace('T', ' ').slice(0, 16);
+  const last = timestamps[timestamps.length - 1].replace('T', ' ').slice(0, 16);
+  console.log(`  Date range: ${first} → ${last}`);
+}
+
+console.log(`  Dir: ${EVAL_DIR}`);
+console.log('');
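For context on the "Eval web dashboard" TODO added above: since the persistence files are plain JSON, a minimal Bun server that exposes them to a chart page can stay very small. The sketch below is hypothetical and not part of this change; the filename, port, and endpoint are assumptions, and the real dashboard would serve the ~200 lines of HTML + chart.js described in the TODO.

```ts
#!/usr/bin/env bun
// Hypothetical sketch for the "Eval web dashboard" TODO — not part of this diff.
import * as fs from 'fs';
import * as path from 'path';
import * as os from 'os';

const EVAL_DIR = path.join(os.homedir(), '.gstack-dev', 'evals');

// Load every persisted run; the chart page can aggregate client-side.
function loadRuns() {
  try {
    return fs.readdirSync(EVAL_DIR)
      .filter(f => f.endsWith('.json'))
      .map(f => JSON.parse(fs.readFileSync(path.join(EVAL_DIR, f), 'utf-8')));
  } catch {
    return [];
  }
}

Bun.serve({
  port: 4321, // assumption
  fetch(req) {
    const url = new URL(req.url);
    if (url.pathname === '/runs.json') {
      return Response.json(loadRuns());
    }
    // Placeholder page; the real TODO envisions HTML + chart.js here.
    return new Response('<html><body>eval dashboard placeholder</body></html>', {
      headers: { 'Content-Type': 'text/html' },
    });
  },
});

console.log('Eval dashboard: http://localhost:4321');
```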