mirror of
https://github.com/garrytan/gstack.git
synced 2026-05-01 19:25:10 +02:00
feat: eval CLI tools + docs cleanup
Add eval:list, eval:compare, eval:summary CLI scripts for exploring eval history from ~/.gstack-dev/evals/. eval:compare reuses the shared comparison functions from eval-store.ts. - eval:list: sorted table with branch/tier/cost filters - eval:compare: thin wrapper around compareEvalResults + formatComparison - eval:summary: aggregate stats, flaky test detection, branch rankings - Remove unused @anthropic-ai/claude-agent-sdk from devDependencies - Update CLAUDE.md: streaming docs, eval CLI commands, remove Agent SDK refs - Add GH Actions eval upload (P2) and web dashboard (P3) to TODOS.md Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -5,20 +5,21 @@
|
||||
```bash
|
||||
bun install # install dependencies
|
||||
bun test # run free tests (browse + snapshot + skill validation)
|
||||
bun run test:evals # run paid evals: LLM judge + Agent SDK E2E (~$4/run)
|
||||
bun run test:e2e # run Agent SDK E2E tests only (~$3.85/run)
|
||||
bun run test:evals # run paid evals: LLM judge + E2E (~$4/run)
|
||||
bun run test:e2e # run E2E tests only (~$3.85/run)
|
||||
bun run dev <cmd> # run CLI in dev mode, e.g. bun run dev goto https://example.com
|
||||
bun run build # gen docs + compile binaries
|
||||
bun run gen:skill-docs # regenerate SKILL.md files from templates
|
||||
bun run skill:check # health dashboard for all skills
|
||||
bun run dev:skill # watch mode: auto-regen + validate on change
|
||||
bun run eval:list # list all eval runs from ~/.gstack-dev/evals/
|
||||
bun run eval:compare # compare two eval runs (auto-picks most recent)
|
||||
bun run eval:summary # aggregate stats across all eval runs
|
||||
```
|
||||
|
||||
`test:evals` requires `ANTHROPIC_API_KEY` and must be run from a plain terminal
|
||||
(not inside Claude Code — nested Agent SDK sessions hang).
|
||||
|
||||
**Update (v0.3.5):** The session runner now strips CLAUDE* env vars automatically,
|
||||
so `test:evals` may work inside Claude Code. If E2E tests hang, run from a plain terminal.
|
||||
`test:evals` requires `ANTHROPIC_API_KEY`. E2E tests stream progress in real-time
|
||||
(tool-by-tool via `--output-format stream-json --verbose`). Results are persisted
|
||||
to `~/.gstack-dev/evals/` with auto-comparison against the previous run.
|
||||
|
||||
## Project structure
|
||||
|
||||
@@ -35,12 +36,12 @@ gstack/
|
||||
│ ├── skill-check.ts # Health dashboard
|
||||
│ └── dev-skill.ts # Watch mode
|
||||
├── test/ # Skill validation + eval tests
|
||||
│ ├── helpers/ # skill-parser.ts, session-runner.ts, llm-judge.ts
|
||||
│ ├── helpers/ # skill-parser.ts, session-runner.ts, llm-judge.ts, eval-store.ts
|
||||
│ ├── fixtures/ # Ground truth JSON, planted-bug fixtures, eval baselines
|
||||
│ ├── skill-validation.test.ts # Tier 1: static validation (free, <1s)
|
||||
│ ├── gen-skill-docs.test.ts # Tier 1: generator quality (free, <1s)
|
||||
│ ├── skill-llm-eval.test.ts # Tier 3: LLM-as-judge (~$0.15/run)
|
||||
│ └── skill-e2e.test.ts # Tier 2: Agent SDK E2E (~$3.85/run)
|
||||
│ └── skill-e2e.test.ts # Tier 2: E2E via claude -p (~$3.85/run)
|
||||
├── ship/ # Ship workflow skill
|
||||
├── review/ # PR review skill
|
||||
├── plan-ceo-review/ # /plan-ceo-review skill
|
||||
|
||||
@@ -22,3 +22,27 @@
|
||||
**Depends on:** v0.3.5 shipping first (the `{{UPDATE_CHECK}}` resolver).
|
||||
**Effort:** S (small, ~20 min)
|
||||
**Priority:** P2 (prevents drift on next preamble change)
|
||||
|
||||
## GitHub Actions eval upload
|
||||
|
||||
**What:** Run eval suite in CI, upload result JSON as artifact, post summary comment on PR.
|
||||
|
||||
**Why:** Currently evals only run locally. CI integration would catch quality regressions before merge and provide a persistent record of eval results per PR.
|
||||
|
||||
**Context:** Requires `ANTHROPIC_API_KEY` in CI secrets. Cost is ~$4/run. The eval persistence system (v0.3.6) writes JSON to `~/.gstack-dev/evals/` — CI would upload these as GitHub Actions artifacts and use `eval:compare` to post a delta comment on the PR.
|
||||
|
||||
**Depends on:** Eval persistence shipping (v0.3.6).
|
||||
**Effort:** M (medium)
|
||||
**Priority:** P2
|
||||
|
||||
## Eval web dashboard
|
||||
|
||||
**What:** `bun run eval:dashboard` serves local HTML with charts: cost trending, detection rate over time, pass/fail history.
|
||||
|
||||
**Why:** The CLI tools (`eval:list`, `eval:compare`, `eval:summary`) are good for quick checks but visual charts are better for spotting trends over many runs.
|
||||
|
||||
**Context:** Reads the same `~/.gstack-dev/evals/*.json` files. ~200 lines HTML + chart.js code served via a simple Bun HTTP server. No external dependencies beyond what's already installed.
|
||||
|
||||
**Depends on:** Eval persistence + eval:list shipping (v0.3.6).
|
||||
**Effort:** M (medium)
|
||||
**Priority:** P3 (nice-to-have, revisit after eval system sees regular use)
|
||||
|
||||
+4
-2
@@ -17,7 +17,10 @@
|
||||
"test:e2e": "EVALS=1 bun test test/skill-e2e.test.ts",
|
||||
"skill:check": "bun run scripts/skill-check.ts",
|
||||
"dev:skill": "bun run scripts/dev-skill.ts",
|
||||
"start": "bun run browse/src/server.ts"
|
||||
"start": "bun run browse/src/server.ts",
|
||||
"eval:list": "bun run scripts/eval-list.ts",
|
||||
"eval:compare": "bun run scripts/eval-compare.ts",
|
||||
"eval:summary": "bun run scripts/eval-summary.ts"
|
||||
},
|
||||
"dependencies": {
|
||||
"playwright": "^1.58.2",
|
||||
@@ -37,7 +40,6 @@
|
||||
"devtools"
|
||||
],
|
||||
"devDependencies": {
|
||||
"@anthropic-ai/claude-agent-sdk": "^0.2.75",
|
||||
"@anthropic-ai/sdk": "^0.78.0"
|
||||
}
|
||||
}
|
||||
|
||||
@@ -0,0 +1,96 @@
|
||||
#!/usr/bin/env bun
/**
 * Compare two eval runs from ~/.gstack-dev/evals/
 *
 * Usage:
 *   bun run eval:compare                      # compare two most recent of same tier
 *   bun run eval:compare <file>               # compare file against its predecessor
 *   bun run eval:compare <file-a> <file-b>    # compare two specific files
 */

import * as fs from 'fs';
import * as path from 'path';
import * as os from 'os';
import {
  findPreviousRun,
  compareEvalResults,
  formatComparison,
} from '../test/helpers/eval-store';
import type { EvalResult } from '../test/helpers/eval-store';

const EVAL_DIR = path.join(os.homedir(), '.gstack-dev', 'evals');

/**
 * Load and parse an eval result JSON file.
 * Relative names are resolved inside EVAL_DIR; exits with code 1 if the file is missing.
 */
function loadResult(filepath: string): EvalResult {
  // Resolve relative to EVAL_DIR if not absolute
  const resolved = path.isAbsolute(filepath) ? filepath : path.join(EVAL_DIR, filepath);
  if (!fs.existsSync(resolved)) {
    console.error(`File not found: ${resolved}`);
    process.exit(1);
  }
  return JSON.parse(fs.readFileSync(resolved, 'utf-8'));
}

const args = process.argv.slice(2);

let beforeFile: string;
let afterFile: string;

if (args.length === 2) {
  // Two explicit files
  beforeFile = args[0];
  afterFile = args[1];
} else if (args.length === 1) {
  // One file — find its predecessor of the same tier/branch.
  // Resolve to an absolute path up front so the label passed to
  // compareEvalResults matches the no-arg path (and we load the file once).
  afterFile = path.isAbsolute(args[0]) ? args[0] : path.join(EVAL_DIR, args[0]);
  const afterResult = loadResult(afterFile);
  const prev = findPreviousRun(EVAL_DIR, afterResult.tier, afterResult.branch, afterFile);
  if (!prev) {
    console.log('No previous run found to compare against.');
    process.exit(0);
  }
  beforeFile = prev;
} else {
  // No args — take the most recent run and compare against its predecessor
  // of the same tier (filenames sort chronologically, so sort+reverse = newest first)
  let files: string[];
  try {
    files = fs.readdirSync(EVAL_DIR)
      .filter(f => f.endsWith('.json'))
      .sort()
      .reverse();
  } catch {
    console.log('No eval runs yet. Run: EVALS=1 bun run test:evals');
    process.exit(0);
  }

  if (files.length < 2) {
    console.log('Need at least 2 eval runs to compare. Run evals again.');
    process.exit(0);
  }

  // Most recent file
  afterFile = path.join(EVAL_DIR, files[0]);
  const afterResult = loadResult(afterFile);
  const prev = findPreviousRun(EVAL_DIR, afterResult.tier, afterResult.branch, afterFile);
  if (!prev) {
    console.log('No previous run of the same tier found to compare against.');
    process.exit(0);
  }
  beforeFile = prev;
}

const beforeResult = loadResult(beforeFile);
const afterResult = loadResult(afterFile);

// Warn if different tiers — the delta numbers would be apples-to-oranges
if (beforeResult.tier !== afterResult.tier) {
  console.warn(`Warning: comparing different tiers (${beforeResult.tier} vs ${afterResult.tier})`);
}

// Warn on schema mismatch — fields may have been added/renamed between versions
if (beforeResult.schema_version !== afterResult.schema_version) {
  console.warn(`Warning: schema version mismatch (${beforeResult.schema_version} vs ${afterResult.schema_version})`);
}

const comparison = compareEvalResults(beforeResult, afterResult, beforeFile, afterFile);
console.log(formatComparison(comparison));
|
||||
@@ -0,0 +1,105 @@
|
||||
#!/usr/bin/env bun
/**
 * List eval runs from ~/.gstack-dev/evals/
 *
 * Usage: bun run eval:list [--branch <name>] [--tier e2e|llm-judge] [--limit N]
 */

import * as fs from 'fs';
import * as path from 'path';
import * as os from 'os';

const EVAL_DIR = path.join(os.homedir(), '.gstack-dev', 'evals');

// Parse args
const args = process.argv.slice(2);
let filterBranch: string | null = null;
let filterTier: string | null = null;
let limit = 20;

for (let i = 0; i < args.length; i++) {
  if (args[i] === '--branch' && args[i + 1]) { filterBranch = args[++i]; }
  else if (args[i] === '--tier' && args[i + 1]) { filterTier = args[++i]; }
  else if (args[i] === '--limit' && args[i + 1]) {
    // Ignore a non-numeric --limit; previously NaN made runs.slice(0, NaN)
    // silently print an empty table
    const parsed = Number.parseInt(args[++i], 10);
    if (!Number.isNaN(parsed)) limit = parsed;
  }
}

// Read eval files
let files: string[];
try {
  files = fs.readdirSync(EVAL_DIR).filter(f => f.endsWith('.json'));
} catch {
  console.log('No eval runs yet. Run: EVALS=1 bun run test:evals');
  process.exit(0);
}

if (files.length === 0) {
  console.log('No eval runs yet. Run: EVALS=1 bun run test:evals');
  process.exit(0);
}

// Top-level fields parsed from each run file (missing fields get safe defaults)
interface RunSummary {
  file: string;
  timestamp: string;
  branch: string;
  tier: string;
  version: string;
  passed: number;
  total: number;
  cost: number;
}

const runs: RunSummary[] = [];
for (const file of files) {
  try {
    const data = JSON.parse(fs.readFileSync(path.join(EVAL_DIR, file), 'utf-8'));
    if (filterBranch && data.branch !== filterBranch) continue;
    if (filterTier && data.tier !== filterTier) continue;
    runs.push({
      file,
      timestamp: data.timestamp || '',
      branch: data.branch || 'unknown',
      tier: data.tier || 'unknown',
      version: data.version || '?',
      passed: data.passed || 0,
      total: data.total_tests || 0,
      cost: data.total_cost_usd || 0,
    });
  } catch { continue; } // skip unreadable/corrupt files rather than aborting the listing
}

// Sort by timestamp descending (ISO-8601 timestamps sort lexicographically)
runs.sort((a, b) => b.timestamp.localeCompare(a.timestamp));

// Apply limit
const displayed = runs.slice(0, limit);

// Print table
console.log('');
console.log(`Eval History (${runs.length} total runs)`);
console.log('═'.repeat(90));
console.log(
  ' ' +
  'Date'.padEnd(17) +
  'Branch'.padEnd(28) +
  'Tier'.padEnd(12) +
  'Pass'.padEnd(8) +
  'Cost'.padEnd(8) +
  'Version'
);
console.log('─'.repeat(90));

for (const run of displayed) {
  const date = run.timestamp.replace('T', ' ').slice(0, 16);
  // Fix: pad AFTER truncation — a truncated name is 26 chars and previously
  // skipped padEnd(28), shifting every column to its right out of alignment
  const branch = (run.branch.length > 26 ? run.branch.slice(0, 23) + '...' : run.branch).padEnd(28);
  const pass = `${run.passed}/${run.total}`.padEnd(8);
  const cost = `$${run.cost.toFixed(2)}`.padEnd(8);
  console.log(` ${date.padEnd(17)}${branch}${run.tier.padEnd(12)}${pass}${cost}v${run.version}`);
}

console.log('─'.repeat(90));

const totalCost = runs.reduce((s, r) => s + r.cost, 0);
console.log(` ${runs.length} runs | Total spend: $${totalCost.toFixed(2)} | Showing: ${displayed.length}`);
console.log(` Dir: ${EVAL_DIR}`);
console.log('');
|
||||
@@ -0,0 +1,134 @@
|
||||
#!/usr/bin/env bun
/**
 * Aggregate summary of all eval runs from ~/.gstack-dev/evals/
 *
 * Usage: bun run eval:summary
 */

import * as fs from 'fs';
import * as path from 'path';
import * as os from 'os';
import type { EvalResult } from '../test/helpers/eval-store';

const EVAL_DIR = path.join(os.homedir(), '.gstack-dev', 'evals');

let files: string[];
try {
  files = fs.readdirSync(EVAL_DIR).filter(f => f.endsWith('.json'));
} catch {
  console.log('No eval runs yet. Run: EVALS=1 bun run test:evals');
  process.exit(0);
}

if (files.length === 0) {
  console.log('No eval runs yet. Run: EVALS=1 bun run test:evals');
  process.exit(0);
}

// Load all results, skipping corrupt files
const results: EvalResult[] = [];
for (const file of files) {
  try {
    results.push(JSON.parse(fs.readFileSync(path.join(EVAL_DIR, file), 'utf-8')));
  } catch { continue; }
}

// Aggregate stats
const e2eRuns = results.filter(r => r.tier === 'e2e');
const judgeRuns = results.filter(r => r.tier === 'llm-judge');
const totalCost = results.reduce((s, r) => s + (r.total_cost_usd || 0), 0);
// Guard total_cost_usd with || 0 here too, for consistency with totalCost —
// one old run file without the field should not turn the average into NaN
const avgE2ECost = e2eRuns.length > 0 ? e2eRuns.reduce((s, r) => s + (r.total_cost_usd || 0), 0) / e2eRuns.length : 0;
const avgJudgeCost = judgeRuns.length > 0 ? judgeRuns.reduce((s, r) => s + (r.total_cost_usd || 0), 0) / judgeRuns.length : 0;

// Detection rates from outcome evals.
// `?? []` guards: a run file missing the `tests` array (corrupt/old schema)
// should be skipped, not crash the whole summary.
const detectionRates: number[] = [];
for (const r of e2eRuns) {
  for (const t of r.tests ?? []) {
    if (t.detection_rate !== undefined) {
      detectionRates.push(t.detection_rate);
    }
  }
}
const avgDetection = detectionRates.length > 0
  ? detectionRates.reduce((a, b) => a + b, 0) / detectionRates.length
  : null;

// Flaky tests (passed in some runs, failed in others)
const testResults = new Map<string, boolean[]>();
for (const r of results) {
  for (const t of r.tests ?? []) {
    const key = `${r.tier}:${t.name}`;
    if (!testResults.has(key)) testResults.set(key, []);
    testResults.get(key)!.push(t.passed);
  }
}
const flakyTests: string[] = [];
for (const [name, outcomes] of testResults) {
  // Need at least 2 observations to call a test flaky
  if (outcomes.length >= 2) {
    const hasPass = outcomes.some(o => o);
    const hasFail = outcomes.some(o => !o);
    if (hasPass && hasFail) flakyTests.push(name);
  }
}

// Branch stats (e2e runs only — detection_rate is an e2e metric)
const branchStats = new Map<string, { runs: number; avgDetection: number; detections: number[] }>();
for (const r of e2eRuns) {
  if (!branchStats.has(r.branch)) {
    branchStats.set(r.branch, { runs: 0, avgDetection: 0, detections: [] });
  }
  const stats = branchStats.get(r.branch)!;
  stats.runs++;
  for (const t of r.tests ?? []) {
    if (t.detection_rate !== undefined) {
      stats.detections.push(t.detection_rate);
    }
  }
}
for (const stats of branchStats.values()) {
  stats.avgDetection = stats.detections.length > 0
    ? stats.detections.reduce((a, b) => a + b, 0) / stats.detections.length
    : 0;
}

// Print summary
console.log('');
console.log('Eval Summary');
console.log('═'.repeat(60));
console.log(` Total runs: ${results.length} (${e2eRuns.length} e2e, ${judgeRuns.length} llm-judge)`);
console.log(` Total spend: $${totalCost.toFixed(2)}`);
console.log(` Avg cost/e2e: $${avgE2ECost.toFixed(2)}`);
console.log(` Avg cost/judge: $${avgJudgeCost.toFixed(2)}`);
if (avgDetection !== null) {
  console.log(` Avg detection: ${avgDetection.toFixed(1)} bugs`);
}
console.log('─'.repeat(60));

if (flakyTests.length > 0) {
  console.log(` Flaky tests (${flakyTests.length}):`);
  for (const name of flakyTests) {
    console.log(`   - ${name}`);
  }
  console.log('─'.repeat(60));
}

if (branchStats.size > 0) {
  console.log(' Branches:');
  // Rank branches by average detection, best first
  const sorted = [...branchStats.entries()].sort((a, b) => b[1].avgDetection - a[1].avgDetection);
  for (const [branch, stats] of sorted) {
    const det = stats.detections.length > 0 ? ` avg det: ${stats.avgDetection.toFixed(1)}` : '';
    console.log(`   ${branch.padEnd(30)} ${stats.runs} runs${det}`);
  }
  console.log('─'.repeat(60));
}

// Date range (ISO timestamps sort lexicographically)
const timestamps = results.map(r => r.timestamp).filter(Boolean).sort();
if (timestamps.length > 0) {
  const first = timestamps[0].replace('T', ' ').slice(0, 16);
  const last = timestamps[timestamps.length - 1].replace('T', ' ').slice(0, 16);
  console.log(` Date range: ${first} → ${last}`);
}

console.log(` Dir: ${EVAL_DIR}`);
console.log('');
|
||||
Reference in New Issue
Block a user