mirror of
https://github.com/garrytan/gstack.git
synced 2026-05-01 19:25:10 +02:00
dc5e0538e5
* refactor: extract gen-skill-docs into modular resolver architecture Break the 3000-line monolith into 10 domain modules under scripts/resolvers/: types, constants, preamble, utility, browse, design, testing, review, codex-helpers, and index. Each module owns one domain of template generation. The preamble module introduces a 4-tier composition system (T1-T4) so skills only pay for the preamble sections they actually need, reducing token usage for lightweight skills by ~40%. Adds a token budget dashboard that prints after every generation run showing per-skill and total token counts. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> * feat: tiered preamble — skills only pay for what they use Tag all 23 templates with preamble-tier (T1-T4). Lightweight skills like /browse and /benchmark get a minimal preamble (~40% fewer tokens), while review skills get the full stack. Regenerate all SKILL.md files. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> * feat: migrate eval storage to project-scoped paths Move eval results and E2E run artifacts from ~/.gstack-dev/evals/ to ~/.gstack/projects/$SLUG/evals/ so each project's eval history lives alongside its other gstack data. Falls back to legacy path if slug detection fails. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> * fix: sync package.json version with VERSION after merge Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> * feat: add WorktreeManager for isolated test environments Reusable platform module (lib/worktree.ts) that creates git worktrees for test isolation and harvests useful changes as patches. Includes SHA-256 dedup, original SHA tracking for committed change detection, and automatic gitignored artifact copying (.agents/, browse/dist/). 12 unit tests covering lifecycle, harvest, dedup, and error handling. 
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> * feat: integrate worktree isolation into E2E test infrastructure Add createTestWorktree(), harvestAndCleanup(), and describeWithWorktree() helpers to e2e-helpers.ts. Add harvest field to EvalTestEntry for eval-store integration. Register lib/worktree.ts as a global touchfile. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> * feat: run Gemini and Codex E2E tests in worktrees Switch both test suites from cwd: ROOT to worktree isolation. Gemini (--yolo) no longer pollutes the working tree. Codex (read-only) gets worktree for consistency. Useful changes are harvested as patches for cherry-picking. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> * fix: skip symlinks in copyDirSync to prevent infinite recursion Adversarial review caught that .claude/skills/gstack may be a symlink back to the repo root, causing copyDirSync to recurse infinitely when copying gitignored artifacts into worktrees. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> * chore: bump version and changelog (v0.11.12.0) Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> * fix: relax session-awareness assertion to accept structured options The LLM consistently presents well-formatted A/B choices with pros/cons but doesn't always use the exact string "RECOMMENDATION". Accept case-insensitive "recommend", "option a", "which do you want", or "which approach" as equivalent signals of a structured recommendation. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> --------- Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
118 lines
3.5 KiB
TypeScript
118 lines
3.5 KiB
TypeScript
#!/usr/bin/env bun
|
|
/**
|
|
* List eval runs from the project eval directory (resolved by getProjectEvalDir())
|
|
*
|
|
* Usage: bun run eval:list [--branch <name>] [--tier e2e|llm-judge] [--limit N]
|
|
*/
|
|
|
|
import * as fs from 'fs';
|
|
import * as path from 'path';
|
|
import * as os from 'os';
|
|
import { getProjectEvalDir } from '../test/helpers/eval-store';
|
|
|
|
// Directory scanned for eval result `.json` files and echoed in the report footer.
const EVAL_DIR = getProjectEvalDir();
|
|
|
|
// Parse args

/** Filters and row limit selected on the command line. */
interface CliOptions {
  /** Only list runs whose `branch` equals this value; null disables the filter. */
  filterBranch: string | null;
  /** Only list runs whose `tier` equals this value; null disables the filter. */
  filterTier: string | null;
  /** Maximum number of rows printed in the table. */
  limit: number;
}

/** Row limit used when --limit is absent or its value is rejected. */
const DEFAULT_LIMIT = 20;

/**
 * Parse `--branch <name>`, `--tier <name>` and `--limit <N>` from an argument list.
 *
 * A flag without a following (non-empty) value and any unrecognized argument
 * are ignored. `--limit` is read with parseInt (base 10); a value that does not
 * parse, or parses to a negative number, is reported on stderr and the limit
 * seen so far is kept. Previously such values reached `Array.prototype.slice`
 * unchanged, which printed no rows (NaN) or dropped rows from the end (negative).
 *
 * @param argv Arguments after the script name, e.g. `process.argv.slice(2)`.
 * @returns The selected filters and limit.
 */
function parseCliArgs(argv: readonly string[]): CliOptions {
  const options: CliOptions = {
    filterBranch: null,
    filterTier: null,
    limit: DEFAULT_LIMIT,
  };

  for (let i = 0; i < argv.length; i++) {
    const flag = argv[i];
    const value = argv[i + 1];
    // Every supported flag needs a value; without one the flag is a no-op.
    if (!value) continue;

    switch (flag) {
      case '--branch':
        options.filterBranch = value;
        i++;
        break;
      case '--tier':
        options.filterTier = value;
        i++;
        break;
      case '--limit': {
        i++; // the value is consumed even when it is rejected
        const parsed = Number.parseInt(value, 10);
        if (Number.isNaN(parsed) || parsed < 0) {
          console.warn(`Ignoring invalid --limit value: ${value}`);
        } else {
          options.limit = parsed;
        }
        break;
      }
      default:
        break;
    }
  }

  return options;
}

const args = process.argv.slice(2);
const { filterBranch, filterTier, limit } = parseCliArgs(args);
|
|
|
|
// Read eval files

/**
 * Names of the entries directly inside EVAL_DIR that end in `.json`.
 * A directory that cannot be read is handled exactly like an empty one.
 */
const files: string[] = (() => {
  try {
    return fs.readdirSync(EVAL_DIR).filter(name => name.endsWith('.json'));
  } catch {
    return [];
  }
})();

// Nothing to list: print the hint and exit successfully.
if (files.length === 0) {
  console.log('No eval runs yet. Run: EVALS=1 bun run test:evals');
  process.exit(0);
}
|
|
|
|
// Parse top-level fields from each file

/** One table row, built from the top-level fields of a single eval result file. */
interface RunSummary {
  /** File name (inside EVAL_DIR) the summary was read from. */
  file: string;
  /** `timestamp` from the file, or '' when missing or not a string. */
  timestamp: string;
  /** `branch` from the file, or 'unknown'. */
  branch: string;
  /** `tier` from the file, or 'unknown'. */
  tier: string;
  /** `version` from the file, or '?'. */
  version: string;
  /** `passed` from the file, or 0. */
  passed: number;
  /** `total_tests` from the file, or 0. */
  total: number;
  /** `total_cost_usd` from the file, or 0. */
  cost: number;
  /** `total_duration_ms` from the file, or 0. */
  duration: number;
  /** Sum of `turns_used` over the file's `tests` array. */
  turns: number;
}

/** Return `value` when it is a non-empty string, otherwise `fallback`. */
function stringField(value: unknown, fallback: string): string {
  return typeof value === 'string' && value !== '' ? value : fallback;
}

/** Return `value` when it is a finite number, otherwise 0. */
function numberField(value: unknown): number {
  return typeof value === 'number' && Number.isFinite(value) ? value : 0;
}

/** Sum `turns_used` over the entries of `tests`; non-array input and malformed entries count as 0. */
function sumTurns(tests: unknown): number {
  if (!Array.isArray(tests)) return 0;
  let total = 0;
  for (const entry of tests) {
    if (typeof entry === 'object' && entry !== null) {
      total += numberField((entry as { turns_used?: unknown }).turns_used);
    }
  }
  return total;
}

/**
 * Build a RunSummary from a parsed eval file.
 *
 * Every field is type-checked before use, so a file with e.g. a numeric
 * `timestamp` or a string `total_cost_usd` cannot break the later sort
 * (`localeCompare`) or formatting (`toFixed`); such fields fall back to
 * their defaults instead.
 */
function summarizeRun(file: string, record: Record<string, unknown>): RunSummary {
  return {
    file,
    timestamp: stringField(record.timestamp, ''),
    branch: stringField(record.branch, 'unknown'),
    tier: stringField(record.tier, 'unknown'),
    version: stringField(record.version, '?'),
    passed: numberField(record.passed),
    total: numberField(record.total_tests),
    cost: numberField(record.total_cost_usd),
    duration: numberField(record.total_duration_ms),
    turns: sumTurns(record.tests),
  };
}

const runs: RunSummary[] = [];
for (const file of files) {
  let data: unknown;
  try {
    data = JSON.parse(fs.readFileSync(path.join(EVAL_DIR, file), 'utf-8'));
  } catch {
    // Best effort: a file that cannot be read or is not valid JSON is skipped.
    continue;
  }

  // Only JSON objects can describe a run; skip null and primitive documents.
  if (typeof data !== 'object' || data === null) continue;
  const record = data as Record<string, unknown>;

  // Filters compare the raw field values, so a file lacking the field never matches.
  if (filterBranch && record.branch !== filterBranch) continue;
  if (filterTier && record.tier !== filterTier) continue;

  runs.push(summarizeRun(file, record));
}
|
|
|
|
// Sort by timestamp descending
// NOTE(review): plain string comparison; presumably timestamps are ISO-8601 so
// lexical order matches chronological order — confirm against the eval writer.
runs.sort((a, b) => b.timestamp.localeCompare(a.timestamp));

// Apply limit
const displayed = runs.slice(0, limit);

/** Width of the Branch column, including the gap before the next column. */
const BRANCH_COLUMN_WIDTH = 25;
/** Longest branch name printed in full; longer names are cut and end in '...'. */
const BRANCH_MAX_LENGTH = 23;

/**
 * Format a branch name for the Branch column.
 *
 * Names longer than BRANCH_MAX_LENGTH are shortened to that length (the last
 * three characters being '...'). The result is always padded to the full
 * column width; previously the shortened form was left unpadded, which shifted
 * every following column two characters to the left on those rows.
 */
function formatBranchCell(branch: string): string {
  const shown = branch.length > BRANCH_MAX_LENGTH
    ? branch.slice(0, BRANCH_MAX_LENGTH - 3) + '...'
    : branch;
  return shown.padEnd(BRANCH_COLUMN_WIDTH);
}

// Print table
console.log('');
console.log(`Eval History (${runs.length} total runs)`);
console.log('═'.repeat(105));
console.log(
  ' ' +
  'Date'.padEnd(17) +
  'Branch'.padEnd(25) +
  'Tier'.padEnd(12) +
  'Pass'.padEnd(8) +
  'Cost'.padEnd(8) +
  'Turns'.padEnd(7) +
  'Duration'.padEnd(10) +
  'Version'
);
console.log('─'.repeat(105));

for (const run of displayed) {
  // "YYYY-MM-DDTHH:MM:SS..." -> "YYYY-MM-DD HH:MM" (first 16 characters).
  const date = run.timestamp.replace('T', ' ').slice(0, 16);
  const branch = formatBranchCell(run.branch);
  const pass = `${run.passed}/${run.total}`.padEnd(8);
  const cost = `$${run.cost.toFixed(2)}`.padEnd(8);
  // Zero turns / zero duration are shown as blank cells.
  const turns = run.turns > 0 ? `${run.turns}t`.padEnd(7) : ''.padEnd(7);
  const dur = run.duration > 0 ? `${Math.round(run.duration / 1000)}s`.padEnd(10) : ''.padEnd(10);
  console.log(` ${date.padEnd(17)}${branch}${run.tier.padEnd(12)}${pass}${cost}${turns}${dur}v${run.version}`);
}

console.log('─'.repeat(105));

// Footer totals cover every run that passed the filters, not only the rows shown.
const totalCost = runs.reduce((s, r) => s + r.cost, 0);
const totalDur = runs.reduce((s, r) => s + r.duration, 0);
const totalTurns = runs.reduce((s, r) => s + r.turns, 0);
console.log(` ${runs.length} runs | $${totalCost.toFixed(2)} total | ${totalTurns} turns | ${Math.round(totalDur / 1000)}s | Showing: ${displayed.length}`);
console.log(` Dir: ${EVAL_DIR}`);
console.log('');
|