mirror of
https://github.com/garrytan/gstack.git
synced 2026-05-02 03:35:09 +02:00
dc5e0538e5
* refactor: extract gen-skill-docs into modular resolver architecture Break the 3000-line monolith into 10 domain modules under scripts/resolvers/: types, constants, preamble, utility, browse, design, testing, review, codex-helpers, and index. Each module owns one domain of template generation. The preamble module introduces a 4-tier composition system (T1-T4) so skills only pay for the preamble sections they actually need, reducing token usage for lightweight skills by ~40%. Adds a token budget dashboard that prints after every generation run showing per-skill and total token counts. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> * feat: tiered preamble — skills only pay for what they use Tag all 23 templates with preamble-tier (T1-T4). Lightweight skills like /browse and /benchmark get a minimal preamble (~40% fewer tokens), while review skills get the full stack. Regenerate all SKILL.md files. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> * feat: migrate eval storage to project-scoped paths Move eval results and E2E run artifacts from ~/.gstack-dev/evals/ to ~/.gstack/projects/$SLUG/evals/ so each project's eval history lives alongside its other gstack data. Falls back to legacy path if slug detection fails. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> * fix: sync package.json version with VERSION after merge Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> * feat: add WorktreeManager for isolated test environments Reusable platform module (lib/worktree.ts) that creates git worktrees for test isolation and harvests useful changes as patches. Includes SHA-256 dedup, original SHA tracking for committed change detection, and automatic gitignored artifact copying (.agents/, browse/dist/). 12 unit tests covering lifecycle, harvest, dedup, and error handling. 
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> * feat: integrate worktree isolation into E2E test infrastructure Add createTestWorktree(), harvestAndCleanup(), and describeWithWorktree() helpers to e2e-helpers.ts. Add harvest field to EvalTestEntry for eval-store integration. Register lib/worktree.ts as a global touchfile. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> * feat: run Gemini and Codex E2E tests in worktrees Switch both test suites from cwd: ROOT to worktree isolation. Gemini (--yolo) no longer pollutes the working tree. Codex (read-only) gets worktree for consistency. Useful changes are harvested as patches for cherry-picking. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> * fix: skip symlinks in copyDirSync to prevent infinite recursion Adversarial review caught that .claude/skills/gstack may be a symlink back to the repo root, causing copyDirSync to recurse infinitely when copying gitignored artifacts into worktrees. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> * chore: bump version and changelog (v0.11.12.0) Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> * fix: relax session-awareness assertion to accept structured options The LLM consistently presents well-formatted A/B choices with pros/cons but doesn't always use the exact string "RECOMMENDATION". Accept case-insensitive "recommend", "option a", "which do you want", or "which approach" as equivalent signals of a structured recommendation. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> --------- Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
98 lines
3.0 KiB
TypeScript
98 lines
3.0 KiB
TypeScript
#!/usr/bin/env bun
|
|
/**
 * Compare two eval runs stored in the project eval directory
 * (the path returned by getProjectEvalDir()). Relative file arguments
 * are resolved against that directory.
 *
 * Usage:
 *   bun run eval:compare                     # compare two most recent of same tier
 *   bun run eval:compare <file>              # compare file against its predecessor
 *   bun run eval:compare <file-a> <file-b>   # compare two specific files
 */
|
|
|
|
import * as fs from 'fs';
|
|
import * as path from 'path';
|
|
import * as os from 'os';
|
|
import {
|
|
findPreviousRun,
|
|
compareEvalResults,
|
|
formatComparison,
|
|
getProjectEvalDir,
|
|
} from '../test/helpers/eval-store';
|
|
import type { EvalResult } from '../test/helpers/eval-store';
|
|
|
|
// Directory the eval result files are read from. Relative CLI arguments are
// resolved against it, and the no-argument mode lists its *.json entries.
const EVAL_DIR = getProjectEvalDir();
|
|
|
|
/**
 * Read and parse one eval result file.
 *
 * A relative `filepath` is resolved against EVAL_DIR; an absolute one is used
 * as-is. This is a CLI helper, so failures are reported on stderr and
 * terminate the process with exit code 1 instead of throwing:
 *   - the file does not exist,
 *   - the file cannot be read or is not valid JSON,
 *   - the JSON document is not an object (e.g. `null`, a number, an array).
 *
 * @param filepath Absolute path, or a path relative to EVAL_DIR.
 * @returns The parsed JSON document. Only its top-level shape (an object) is
 *   checked; individual fields are not validated.
 */
function loadResult(filepath: string): EvalResult {
  // Resolve relative to EVAL_DIR if not absolute
  const resolved = path.isAbsolute(filepath) ? filepath : path.join(EVAL_DIR, filepath);

  if (!fs.existsSync(resolved)) {
    console.error(`File not found: ${resolved}`);
    process.exit(1);
  }

  try {
    const parsed: unknown = JSON.parse(fs.readFileSync(resolved, 'utf-8'));
    // A truncated or hand-edited file can still be valid JSON without being an
    // eval result; reject anything that is not a plain object so callers do
    // not fail later with an opaque TypeError when reading its fields.
    if (parsed === null || typeof parsed !== 'object' || Array.isArray(parsed)) {
      throw new Error('expected a JSON object');
    }
    return parsed as EvalResult;
  } catch (err: unknown) {
    const reason = err instanceof Error ? err.message : String(err);
    console.error(`Failed to load eval result ${resolved}: ${reason}`);
    process.exit(1);
  }
}
|
|
|
|
// ---------------------------------------------------------------------------
// CLI entry point: pick the two runs to compare, load them, print the report.
// ---------------------------------------------------------------------------
const args = process.argv.slice(2);

// More than two positional arguments is a usage error. Without this guard the
// extra arguments would be ignored and the script would silently behave as if
// it had been started with no arguments at all.
if (args.length > 2) {
  console.error('Usage: bun run eval:compare [<file> | <file-a> <file-b>]');
  process.exit(1);
}

let beforeFile: string;
let afterFile: string;
// Filled in when the "after" run has to be parsed to look up its predecessor,
// so the same file is not read and parsed a second time further down.
let preloadedAfter: EvalResult | undefined;

if (args.length === 2) {
  // Two explicit files
  beforeFile = args[0];
  afterFile = args[1];
} else if (args.length === 1) {
  // One file — find its predecessor
  afterFile = args[0];
  const resolved = path.isAbsolute(afterFile) ? afterFile : path.join(EVAL_DIR, afterFile);
  preloadedAfter = loadResult(resolved);
  const prev = findPreviousRun(EVAL_DIR, preloadedAfter.tier, preloadedAfter.branch, resolved);
  if (!prev) {
    console.log('No previous run found to compare against.');
    process.exit(0);
  }
  beforeFile = prev;
} else {
  // No args — take the most recent run and look up its predecessor
  let files: string[];
  try {
    // Reverse lexicographic order of the file names; files[0] is treated as
    // the most recent run.
    files = fs.readdirSync(EVAL_DIR)
      .filter(f => f.endsWith('.json'))
      .sort()
      .reverse();
  } catch {
    // Deliberate best-effort: a directory that cannot be listed is reported
    // as "nothing to compare yet" rather than as a failure.
    console.log('No eval runs yet. Run: EVALS=1 bun run test:evals');
    process.exit(0);
  }

  if (files.length < 2) {
    console.log('Need at least 2 eval runs to compare. Run evals again.');
    process.exit(0);
  }

  // Most recent file
  afterFile = path.join(EVAL_DIR, files[0]);
  preloadedAfter = loadResult(afterFile);
  const prev = findPreviousRun(EVAL_DIR, preloadedAfter.tier, preloadedAfter.branch, afterFile);
  if (!prev) {
    console.log('No previous run of the same tier found to compare against.');
    process.exit(0);
  }
  beforeFile = prev;
}

const beforeResult = loadResult(beforeFile);
const afterResult = preloadedAfter ?? loadResult(afterFile);

// Warn if different tiers
if (beforeResult.tier !== afterResult.tier) {
  console.warn(`Warning: comparing different tiers (${beforeResult.tier} vs ${afterResult.tier})`);
}

// Warn on schema mismatch
if (beforeResult.schema_version !== afterResult.schema_version) {
  console.warn(`Warning: schema version mismatch (${beforeResult.schema_version} vs ${afterResult.schema_version})`);
}

const comparison = compareEvalResults(beforeResult, afterResult, beforeFile, afterFile);
console.log(formatComparison(comparison));
|