mirror of
https://github.com/garrytan/gstack.git
synced 2026-05-01 19:25:10 +02:00
76803d789a
Adds comprehensive eval infrastructure: - Tier 1 (free): 13 new static tests — cross-skill path consistency, QA structure validation, greptile format, planted-bug fixture validation - Tier 2 (Agent SDK E2E): /qa quick, /review with pre-built git repo, 3 planted-bug outcome evals (static, SPA, checkout — each with 5 bugs) - Tier 3 (LLM judge): QA workflow quality, health rubric clarity, cross-skill consistency, baseline score pinning New fixtures: 3 HTML pages with 15 total planted bugs, ground truth JSON, review-eval-vuln.rb, eval-baselines.json. Shared llm-judge.ts helper (DRY). Unified EVALS=1 flag replaces SKILL_E2E + ANTHROPIC_API_KEY checks. `bun run test:evals` runs everything that costs money (~$4/run). Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
343 lines
13 KiB
TypeScript
343 lines
13 KiB
TypeScript
import { describe, test, expect, beforeAll, afterAll } from 'bun:test';
|
|
import { runSkillTest } from './helpers/session-runner';
|
|
import { outcomeJudge } from './helpers/llm-judge';
|
|
import { startTestServer } from '../browse/test/test-server';
|
|
import * as fs from 'fs';
|
|
import * as path from 'path';
|
|
import * as os from 'os';
|
|
|
|
const ROOT = path.resolve(import.meta.dir, '..');
|
|
|
|
// Skip unless EVALS=1 (or legacy SKILL_E2E=1). Also skip inside Claude Code /
|
|
// Agent SDK sessions — nested sessions hang because the parent intercepts child subprocesses.
|
|
const isInsideAgentSDK = !!process.env.CLAUDECODE || !!process.env.CLAUDE_CODE_ENTRYPOINT;
|
|
const evalsEnabled = !!(process.env.EVALS || process.env.SKILL_E2E);
|
|
const describeE2E = (evalsEnabled && !isInsideAgentSDK) ? describe : describe.skip;
|
|
|
|
let testServer: ReturnType<typeof startTestServer>;
|
|
let tmpDir: string;
|
|
const browseBin = path.resolve(ROOT, 'browse', 'dist', 'browse');
|
|
|
|
/**
|
|
* Copy a directory tree recursively (files only, follows structure).
|
|
*/
|
|
function copyDirSync(src: string, dest: string) {
|
|
fs.mkdirSync(dest, { recursive: true });
|
|
for (const entry of fs.readdirSync(src, { withFileTypes: true })) {
|
|
const srcPath = path.join(src, entry.name);
|
|
const destPath = path.join(dest, entry.name);
|
|
if (entry.isDirectory()) {
|
|
copyDirSync(srcPath, destPath);
|
|
} else {
|
|
fs.copyFileSync(srcPath, destPath);
|
|
}
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Set up browse shims (binary symlink, find-browse, remote-slug) in a tmpDir.
|
|
*/
|
|
function setupBrowseShims(dir: string) {
|
|
// Symlink browse binary
|
|
const binDir = path.join(dir, 'browse', 'dist');
|
|
fs.mkdirSync(binDir, { recursive: true });
|
|
if (fs.existsSync(browseBin)) {
|
|
fs.symlinkSync(browseBin, path.join(binDir, 'browse'));
|
|
}
|
|
|
|
// find-browse shim
|
|
const findBrowseDir = path.join(dir, 'browse', 'bin');
|
|
fs.mkdirSync(findBrowseDir, { recursive: true });
|
|
fs.writeFileSync(
|
|
path.join(findBrowseDir, 'find-browse'),
|
|
`#!/bin/bash\necho "${browseBin}"\n`,
|
|
{ mode: 0o755 },
|
|
);
|
|
|
|
// remote-slug shim (returns test-project)
|
|
fs.writeFileSync(
|
|
path.join(findBrowseDir, 'remote-slug'),
|
|
`#!/bin/bash\necho "test-project"\n`,
|
|
{ mode: 0o755 },
|
|
);
|
|
}
|
|
|
|
/**
|
|
* Print cost summary after an E2E test.
|
|
*/
|
|
function logCost(label: string, result: { costEstimate: { turnsUsed: number; estimatedTokens: number; estimatedCost: number }; duration: number }) {
|
|
const { turnsUsed, estimatedTokens, estimatedCost } = result.costEstimate;
|
|
const durationSec = Math.round(result.duration / 1000);
|
|
console.log(`${label}: $${estimatedCost.toFixed(2)} (${turnsUsed} turns, ${(estimatedTokens / 1000).toFixed(1)}k tokens, ${durationSec}s)`);
|
|
}
|
|
|
|
/**
|
|
* Dump diagnostic info on planted-bug outcome failure (decision 1C).
|
|
*/
|
|
function dumpOutcomeDiagnostic(dir: string, label: string, report: string, judgeResult: any) {
|
|
try {
|
|
const transcriptDir = path.join(dir, '.gstack', 'test-transcripts');
|
|
fs.mkdirSync(transcriptDir, { recursive: true });
|
|
const timestamp = new Date().toISOString().replace(/[:.]/g, '-');
|
|
fs.writeFileSync(
|
|
path.join(transcriptDir, `${label}-outcome-${timestamp}.json`),
|
|
JSON.stringify({ label, report, judgeResult }, null, 2),
|
|
);
|
|
} catch { /* non-fatal */ }
|
|
}
|
|
|
|
describeE2E('Skill E2E tests', () => {
|
|
beforeAll(() => {
|
|
testServer = startTestServer();
|
|
tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-'));
|
|
setupBrowseShims(tmpDir);
|
|
});
|
|
|
|
afterAll(() => {
|
|
testServer?.server?.stop();
|
|
try { fs.rmSync(tmpDir, { recursive: true, force: true }); } catch {}
|
|
});
|
|
|
|
test('browse basic commands work without errors', async () => {
|
|
const result = await runSkillTest({
|
|
prompt: `You have a browse binary at ${browseBin}. Assign it to B variable and run these commands in sequence:
|
|
1. $B goto ${testServer.url}
|
|
2. $B snapshot -i
|
|
3. $B text
|
|
4. $B screenshot /tmp/skill-e2e-test.png
|
|
Report the results of each command.`,
|
|
workingDirectory: tmpDir,
|
|
maxTurns: 10,
|
|
timeout: 60_000,
|
|
});
|
|
|
|
logCost('browse basic', result);
|
|
expect(result.browseErrors).toHaveLength(0);
|
|
expect(result.exitReason).toBe('success');
|
|
}, 90_000);
|
|
|
|
test('browse snapshot flags all work', async () => {
|
|
const result = await runSkillTest({
|
|
prompt: `You have a browse binary at ${browseBin}. Assign it to B variable and run:
|
|
1. $B goto ${testServer.url}
|
|
2. $B snapshot -i
|
|
3. $B snapshot -c
|
|
4. $B snapshot -D
|
|
5. $B snapshot -i -a -o /tmp/skill-e2e-annotated.png
|
|
Report what each command returned.`,
|
|
workingDirectory: tmpDir,
|
|
maxTurns: 10,
|
|
timeout: 60_000,
|
|
});
|
|
|
|
logCost('browse snapshot', result);
|
|
expect(result.browseErrors).toHaveLength(0);
|
|
expect(result.exitReason).toBe('success');
|
|
}, 90_000);
|
|
});
|
|
|
|
// --- B4: QA skill E2E ---
|
|
|
|
describeE2E('QA skill E2E', () => {
|
|
let qaDir: string;
|
|
|
|
beforeAll(() => {
|
|
testServer = testServer || startTestServer();
|
|
qaDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-qa-'));
|
|
setupBrowseShims(qaDir);
|
|
|
|
// Copy qa skill files into tmpDir
|
|
copyDirSync(path.join(ROOT, 'qa'), path.join(qaDir, 'qa'));
|
|
|
|
// Create report directory
|
|
fs.mkdirSync(path.join(qaDir, 'qa-reports'), { recursive: true });
|
|
});
|
|
|
|
afterAll(() => {
|
|
testServer?.server?.stop();
|
|
try { fs.rmSync(qaDir, { recursive: true, force: true }); } catch {}
|
|
});
|
|
|
|
test('/qa quick completes without browse errors', async () => {
|
|
const result = await runSkillTest({
|
|
prompt: `You have a browse binary at ${browseBin}. Assign it to B variable like: B="${browseBin}"
|
|
|
|
Read the file qa/SKILL.md for the QA workflow instructions.
|
|
|
|
Run a Quick-depth QA test on ${testServer.url}/basic.html
|
|
Do NOT use AskUserQuestion — run Quick tier directly.
|
|
Write your report to ${qaDir}/qa-reports/qa-report.md`,
|
|
workingDirectory: qaDir,
|
|
maxTurns: 20,
|
|
timeout: 120_000,
|
|
});
|
|
|
|
logCost('/qa quick', result);
|
|
expect(result.browseErrors).toHaveLength(0);
|
|
expect(result.exitReason).toBe('success');
|
|
}, 180_000);
|
|
});
|
|
|
|
// --- B5: Review skill E2E ---
|
|
|
|
describeE2E('Review skill E2E', () => {
|
|
let reviewDir: string;
|
|
|
|
beforeAll(() => {
|
|
reviewDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-review-'));
|
|
|
|
// Pre-build a git repo with a vulnerable file on a feature branch (decision 5A)
|
|
const { spawnSync } = require('child_process');
|
|
const run = (cmd: string, args: string[]) =>
|
|
spawnSync(cmd, args, { cwd: reviewDir, stdio: 'pipe', timeout: 5000 });
|
|
|
|
run('git', ['init']);
|
|
run('git', ['config', 'user.email', 'test@test.com']);
|
|
run('git', ['config', 'user.name', 'Test']);
|
|
|
|
// Commit a clean base on main
|
|
fs.writeFileSync(path.join(reviewDir, 'app.rb'), '# clean base\nclass App\nend\n');
|
|
run('git', ['add', 'app.rb']);
|
|
run('git', ['commit', '-m', 'initial commit']);
|
|
|
|
// Create feature branch with vulnerable code
|
|
run('git', ['checkout', '-b', 'feature/add-user-controller']);
|
|
const vulnContent = fs.readFileSync(path.join(ROOT, 'test', 'fixtures', 'review-eval-vuln.rb'), 'utf-8');
|
|
fs.writeFileSync(path.join(reviewDir, 'user_controller.rb'), vulnContent);
|
|
run('git', ['add', 'user_controller.rb']);
|
|
run('git', ['commit', '-m', 'add user controller']);
|
|
|
|
// Copy review skill files
|
|
fs.copyFileSync(path.join(ROOT, 'review', 'SKILL.md'), path.join(reviewDir, 'review-SKILL.md'));
|
|
fs.copyFileSync(path.join(ROOT, 'review', 'checklist.md'), path.join(reviewDir, 'review-checklist.md'));
|
|
fs.copyFileSync(path.join(ROOT, 'review', 'greptile-triage.md'), path.join(reviewDir, 'review-greptile-triage.md'));
|
|
});
|
|
|
|
afterAll(() => {
|
|
try { fs.rmSync(reviewDir, { recursive: true, force: true }); } catch {}
|
|
});
|
|
|
|
test('/review produces findings on SQL injection branch', async () => {
|
|
const result = await runSkillTest({
|
|
prompt: `You are in a git repo on a feature branch with changes against main.
|
|
Read review-SKILL.md for the review workflow instructions.
|
|
Also read review-checklist.md and apply it.
|
|
Run /review on the current diff (git diff main...HEAD).
|
|
Write your review findings to ${reviewDir}/review-output.md`,
|
|
workingDirectory: reviewDir,
|
|
maxTurns: 15,
|
|
timeout: 90_000,
|
|
});
|
|
|
|
logCost('/review', result);
|
|
expect(result.exitReason).toBe('success');
|
|
}, 120_000);
|
|
});
|
|
|
|
// --- B6/B7/B8: Planted-bug outcome evals ---
|
|
|
|
// Outcome evals also need ANTHROPIC_API_KEY for the LLM judge
|
|
const hasApiKey = !!process.env.ANTHROPIC_API_KEY;
|
|
const describeOutcome = (evalsEnabled && !isInsideAgentSDK && hasApiKey) ? describe : describe.skip;
|
|
|
|
describeOutcome('Planted-bug outcome evals', () => {
|
|
let outcomeDir: string;
|
|
|
|
beforeAll(() => {
|
|
testServer = testServer || startTestServer();
|
|
outcomeDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-outcome-'));
|
|
setupBrowseShims(outcomeDir);
|
|
|
|
// Copy qa skill files
|
|
copyDirSync(path.join(ROOT, 'qa'), path.join(outcomeDir, 'qa'));
|
|
});
|
|
|
|
afterAll(() => {
|
|
testServer?.server?.stop();
|
|
try { fs.rmSync(outcomeDir, { recursive: true, force: true }); } catch {}
|
|
});
|
|
|
|
/**
|
|
* Shared planted-bug eval runner.
|
|
* Runs /qa Standard on a fixture page, then scores with outcomeJudge.
|
|
*/
|
|
async function runPlantedBugEval(fixture: string, groundTruthFile: string, label: string) {
|
|
const reportDir = path.join(outcomeDir, `reports-${label}`);
|
|
fs.mkdirSync(path.join(reportDir, 'screenshots'), { recursive: true });
|
|
const reportPath = path.join(reportDir, 'qa-report.md');
|
|
|
|
// Phase 1: Agent SDK runs /qa Standard
|
|
const result = await runSkillTest({
|
|
prompt: `You have a browse binary at ${browseBin}. Assign it to B variable like: B="${browseBin}"
|
|
|
|
Read the file qa/SKILL.md for the QA workflow instructions.
|
|
|
|
Navigate to ${testServer.url}/${fixture} and run a Standard-depth QA test.
|
|
Do NOT use AskUserQuestion — run Standard tier directly.
|
|
Write your report to ${reportPath}
|
|
Save screenshots to ${reportDir}/screenshots/
|
|
|
|
Be thorough: check console, check all links, check all forms, check mobile viewport, check accessibility.`,
|
|
workingDirectory: outcomeDir,
|
|
maxTurns: 25,
|
|
timeout: 180_000,
|
|
});
|
|
|
|
logCost(`/qa ${label}`, result);
|
|
|
|
// Phase 1 assertions: browse mechanics
|
|
expect(result.browseErrors).toHaveLength(0);
|
|
expect(result.exitReason).toBe('success');
|
|
|
|
// Phase 2: Outcome evaluation via LLM judge
|
|
const groundTruth = JSON.parse(
|
|
fs.readFileSync(path.join(ROOT, 'test', 'fixtures', groundTruthFile), 'utf-8'),
|
|
);
|
|
|
|
// Read the generated report (try the expected path, then glob for any .md in reportDir)
|
|
let report: string;
|
|
if (fs.existsSync(reportPath)) {
|
|
report = fs.readFileSync(reportPath, 'utf-8');
|
|
} else {
|
|
// Agent may have named it differently — find any .md in reportDir
|
|
const mdFiles = fs.readdirSync(reportDir).filter(f => f.endsWith('.md'));
|
|
if (mdFiles.length === 0) {
|
|
dumpOutcomeDiagnostic(outcomeDir, label, '(no report file found)', { error: 'missing report' });
|
|
throw new Error(`No report file found in ${reportDir}`);
|
|
}
|
|
report = fs.readFileSync(path.join(reportDir, mdFiles[0]), 'utf-8');
|
|
}
|
|
|
|
const judgeResult = await outcomeJudge(groundTruth, report);
|
|
console.log(`${label} outcome:`, JSON.stringify(judgeResult, null, 2));
|
|
|
|
// Diagnostic dump on failure (decision 1C)
|
|
if (judgeResult.detection_rate < groundTruth.minimum_detection || judgeResult.false_positives > groundTruth.max_false_positives) {
|
|
dumpOutcomeDiagnostic(outcomeDir, label, report, judgeResult);
|
|
}
|
|
|
|
// Phase 2 assertions
|
|
expect(judgeResult.detection_rate).toBeGreaterThanOrEqual(groundTruth.minimum_detection);
|
|
expect(judgeResult.false_positives).toBeLessThanOrEqual(groundTruth.max_false_positives);
|
|
expect(judgeResult.evidence_quality).toBeGreaterThanOrEqual(3);
|
|
}
|
|
|
|
// B6: Static dashboard — broken link, disabled submit, overflow, missing alt, console error
|
|
test('/qa standard finds >= 3 of 5 planted bugs (static)', async () => {
|
|
await runPlantedBugEval('qa-eval.html', 'qa-eval-ground-truth.json', 'b6-static');
|
|
}, 240_000);
|
|
|
|
// B7: SPA — broken route, stale state, async race, missing aria, console warning
|
|
test('/qa standard finds >= 3 of 5 planted SPA bugs', async () => {
|
|
await runPlantedBugEval('qa-eval-spa.html', 'qa-eval-spa-ground-truth.json', 'b7-spa');
|
|
}, 240_000);
|
|
|
|
// B8: Checkout — email regex, NaN total, CC overflow, missing required, stripe error
|
|
test('/qa standard finds >= 3 of 5 planted checkout bugs', async () => {
|
|
await runPlantedBugEval('qa-eval-checkout.html', 'qa-eval-checkout-ground-truth.json', 'b8-checkout');
|
|
}, 240_000);
|
|
|
|
// Ship E2E deferred — too complex (requires full git + test suite + VERSION + CHANGELOG)
|
|
test.todo('/ship completes without browse errors');
|
|
});
|