Files
gstack/test/skill-e2e.test.ts
T
Garry Tan 4063104126 fix: remove false-positive Exit code 1 pattern, fix NEEDS_SETUP test, update QA tests
- Remove /Exit code 1/ from BROWSE_ERROR_PATTERNS — too broad, matches any
  bash command exit code in the transcript (e.g., git diff, test commands).
  Remaining patterns (Unknown command, Unknown snapshot flag, binary not found,
  server failed, no such file) are specific to browse errors.

- Fix NEEDS_SETUP E2E test — accepts READY when global binary exists at
  ~/.claude/skills/gstack/browse/dist/browse (which it does on dev machines).
  Test now verifies the setup block handles missing local binary gracefully.

- Update QA skill structure validation tests to match current qa/SKILL.md
  template content (phases renamed, modes replaced tiers, output structure).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-14 04:48:35 -05:00

475 lines
18 KiB
TypeScript

import { describe, test, expect, beforeAll, afterAll } from 'bun:test';
import { runSkillTest } from './helpers/session-runner';
import type { SkillTestResult } from './helpers/session-runner';
import { outcomeJudge } from './helpers/llm-judge';
import { EvalCollector } from './helpers/eval-store';
import type { EvalTestEntry } from './helpers/eval-store';
import { startTestServer } from '../browse/test/test-server';
import * as fs from 'fs';
import * as path from 'path';
import * as os from 'os';
const ROOT = path.resolve(import.meta.dir, '..');
// Skip unless EVALS=1. Session runner strips CLAUDE* env vars to avoid nested session issues.
const evalsEnabled = !!process.env.EVALS;
const describeE2E = evalsEnabled ? describe : describe.skip;
// Eval result collector — accumulates test results, writes to ~/.gstack-dev/evals/ on finalize
const evalCollector = evalsEnabled ? new EvalCollector('e2e') : null;
/** DRY helper to record an E2E test result into the eval collector. */
function recordE2E(name: string, suite: string, result: SkillTestResult, extra?: Partial<EvalTestEntry>) {
evalCollector?.addTest({
name, suite, tier: 'e2e',
passed: result.exitReason === 'success' && result.browseErrors.length === 0,
duration_ms: result.duration,
cost_usd: result.costEstimate.estimatedCost,
transcript: result.transcript,
output: result.output?.slice(0, 2000),
turns_used: result.costEstimate.turnsUsed,
browse_errors: result.browseErrors,
...extra,
});
}
let testServer: ReturnType<typeof startTestServer>;
let tmpDir: string;
const browseBin = path.resolve(ROOT, 'browse', 'dist', 'browse');
/**
* Copy a directory tree recursively (files only, follows structure).
*/
function copyDirSync(src: string, dest: string) {
fs.mkdirSync(dest, { recursive: true });
for (const entry of fs.readdirSync(src, { withFileTypes: true })) {
const srcPath = path.join(src, entry.name);
const destPath = path.join(dest, entry.name);
if (entry.isDirectory()) {
copyDirSync(srcPath, destPath);
} else {
fs.copyFileSync(srcPath, destPath);
}
}
}
/**
* Set up browse shims (binary symlink, find-browse, remote-slug) in a tmpDir.
*/
function setupBrowseShims(dir: string) {
// Symlink browse binary
const binDir = path.join(dir, 'browse', 'dist');
fs.mkdirSync(binDir, { recursive: true });
if (fs.existsSync(browseBin)) {
fs.symlinkSync(browseBin, path.join(binDir, 'browse'));
}
// find-browse shim
const findBrowseDir = path.join(dir, 'browse', 'bin');
fs.mkdirSync(findBrowseDir, { recursive: true });
fs.writeFileSync(
path.join(findBrowseDir, 'find-browse'),
`#!/bin/bash\necho "${browseBin}"\n`,
{ mode: 0o755 },
);
// remote-slug shim (returns test-project)
fs.writeFileSync(
path.join(findBrowseDir, 'remote-slug'),
`#!/bin/bash\necho "test-project"\n`,
{ mode: 0o755 },
);
}
/**
* Print cost summary after an E2E test.
*/
function logCost(label: string, result: { costEstimate: { turnsUsed: number; estimatedTokens: number; estimatedCost: number }; duration: number }) {
const { turnsUsed, estimatedTokens, estimatedCost } = result.costEstimate;
const durationSec = Math.round(result.duration / 1000);
console.log(`${label}: $${estimatedCost.toFixed(2)} (${turnsUsed} turns, ${(estimatedTokens / 1000).toFixed(1)}k tokens, ${durationSec}s)`);
}
/**
* Dump diagnostic info on planted-bug outcome failure (decision 1C).
*/
function dumpOutcomeDiagnostic(dir: string, label: string, report: string, judgeResult: any) {
try {
const transcriptDir = path.join(dir, '.gstack', 'test-transcripts');
fs.mkdirSync(transcriptDir, { recursive: true });
const timestamp = new Date().toISOString().replace(/[:.]/g, '-');
fs.writeFileSync(
path.join(transcriptDir, `${label}-outcome-${timestamp}.json`),
JSON.stringify({ label, report, judgeResult }, null, 2),
);
} catch { /* non-fatal */ }
}
describeE2E('Skill E2E tests', () => {
beforeAll(() => {
testServer = startTestServer();
tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-'));
setupBrowseShims(tmpDir);
});
afterAll(() => {
testServer?.server?.stop();
try { fs.rmSync(tmpDir, { recursive: true, force: true }); } catch {}
});
test('browse basic commands work without errors', async () => {
const result = await runSkillTest({
prompt: `You have a browse binary at ${browseBin}. Assign it to B variable and run these commands in sequence:
1. $B goto ${testServer.url}
2. $B snapshot -i
3. $B text
4. $B screenshot /tmp/skill-e2e-test.png
Report the results of each command.`,
workingDirectory: tmpDir,
maxTurns: 10,
timeout: 60_000,
});
logCost('browse basic', result);
recordE2E('browse basic commands', 'Skill E2E tests', result);
expect(result.browseErrors).toHaveLength(0);
expect(result.exitReason).toBe('success');
}, 90_000);
test('browse snapshot flags all work', async () => {
const result = await runSkillTest({
prompt: `You have a browse binary at ${browseBin}. Assign it to B variable and run:
1. $B goto ${testServer.url}
2. $B snapshot -i
3. $B snapshot -c
4. $B snapshot -D
5. $B snapshot -i -a -o /tmp/skill-e2e-annotated.png
Report what each command returned.`,
workingDirectory: tmpDir,
maxTurns: 10,
timeout: 60_000,
});
logCost('browse snapshot', result);
recordE2E('browse snapshot flags', 'Skill E2E tests', result);
expect(result.browseErrors).toHaveLength(0);
expect(result.exitReason).toBe('success');
}, 90_000);
test('agent discovers browse binary via SKILL.md setup block', async () => {
const skillMd = fs.readFileSync(path.join(ROOT, 'SKILL.md'), 'utf-8');
const setupStart = skillMd.indexOf('## SETUP');
const setupEnd = skillMd.indexOf('## IMPORTANT');
const setupBlock = skillMd.slice(setupStart, setupEnd);
// Guard: verify we extracted a valid setup block
expect(setupBlock).toContain('browse/dist/browse');
const result = await runSkillTest({
prompt: `Follow these instructions to find the browse binary and run a basic command.
${setupBlock}
After finding the binary, run: $B goto ${testServer.url}
Then run: $B text
Report whether it worked.`,
workingDirectory: tmpDir,
maxTurns: 10,
timeout: 60_000,
});
recordE2E('SKILL.md setup block discovery', 'Skill E2E tests', result);
expect(result.browseErrors).toHaveLength(0);
expect(result.exitReason).toBe('success');
}, 90_000);
test('SKILL.md setup block handles missing local binary gracefully', async () => {
// Create a tmpdir with no browse binary — no local .claude/skills/gstack/browse/dist/browse
const emptyDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-empty-'));
const skillMd = fs.readFileSync(path.join(ROOT, 'SKILL.md'), 'utf-8');
const setupStart = skillMd.indexOf('## SETUP');
const setupEnd = skillMd.indexOf('## IMPORTANT');
const setupBlock = skillMd.slice(setupStart, setupEnd);
const result = await runSkillTest({
prompt: `Follow these instructions exactly. Run the bash code block below and report what it outputs.
${setupBlock}
Report the exact output. Do NOT try to fix or install anything — just report what you see.`,
workingDirectory: emptyDir,
maxTurns: 5,
timeout: 30_000,
});
// Setup block should either find the global binary (READY) or show NEEDS_SETUP.
// On dev machines with gstack installed globally, the fallback path
// ~/.claude/skills/gstack/browse/dist/browse exists, so we get READY.
// The important thing is it doesn't crash or give a confusing error.
const allText = result.output || '';
recordE2E('SKILL.md setup block (no local binary)', 'Skill E2E tests', result);
expect(allText).toMatch(/READY|NEEDS_SETUP/);
expect(result.exitReason).toBe('success');
// Clean up
try { fs.rmSync(emptyDir, { recursive: true, force: true }); } catch {}
}, 60_000);
test('SKILL.md setup block works outside git repo', async () => {
// Create a tmpdir outside any git repo
const nonGitDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-nogit-'));
const skillMd = fs.readFileSync(path.join(ROOT, 'SKILL.md'), 'utf-8');
const setupStart = skillMd.indexOf('## SETUP');
const setupEnd = skillMd.indexOf('## IMPORTANT');
const setupBlock = skillMd.slice(setupStart, setupEnd);
const result = await runSkillTest({
prompt: `Follow these instructions exactly. Run the bash code block below and report what it outputs.
${setupBlock}
Report the exact output — either "READY: <path>" or "NEEDS_SETUP".`,
workingDirectory: nonGitDir,
maxTurns: 5,
timeout: 30_000,
});
// Should either find global binary (READY) or show NEEDS_SETUP — not crash
const allText = result.output || '';
recordE2E('SKILL.md outside git repo', 'Skill E2E tests', result);
expect(allText).toMatch(/READY|NEEDS_SETUP/);
// Clean up
try { fs.rmSync(nonGitDir, { recursive: true, force: true }); } catch {}
}, 60_000);
});
// --- B4: QA skill E2E ---
describeE2E('QA skill E2E', () => {
let qaDir: string;
beforeAll(() => {
testServer = testServer || startTestServer();
qaDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-qa-'));
setupBrowseShims(qaDir);
// Copy qa skill files into tmpDir
copyDirSync(path.join(ROOT, 'qa'), path.join(qaDir, 'qa'));
// Create report directory
fs.mkdirSync(path.join(qaDir, 'qa-reports'), { recursive: true });
});
afterAll(() => {
testServer?.server?.stop();
try { fs.rmSync(qaDir, { recursive: true, force: true }); } catch {}
});
test('/qa quick completes without browse errors', async () => {
const result = await runSkillTest({
prompt: `You have a browse binary at ${browseBin}. Assign it to B variable like: B="${browseBin}"
Read the file qa/SKILL.md for the QA workflow instructions.
Run a Quick-depth QA test on ${testServer.url}/basic.html
Do NOT use AskUserQuestion — run Quick tier directly.
Write your report to ${qaDir}/qa-reports/qa-report.md`,
workingDirectory: qaDir,
maxTurns: 30,
timeout: 180_000,
});
logCost('/qa quick', result);
recordE2E('/qa quick', 'QA skill E2E', result);
expect(result.browseErrors).toHaveLength(0);
expect(result.exitReason).toBe('success');
}, 240_000);
});
// --- B5: Review skill E2E ---
describeE2E('Review skill E2E', () => {
let reviewDir: string;
beforeAll(() => {
reviewDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-review-'));
// Pre-build a git repo with a vulnerable file on a feature branch (decision 5A)
const { spawnSync } = require('child_process');
const run = (cmd: string, args: string[]) =>
spawnSync(cmd, args, { cwd: reviewDir, stdio: 'pipe', timeout: 5000 });
run('git', ['init']);
run('git', ['config', 'user.email', 'test@test.com']);
run('git', ['config', 'user.name', 'Test']);
// Commit a clean base on main
fs.writeFileSync(path.join(reviewDir, 'app.rb'), '# clean base\nclass App\nend\n');
run('git', ['add', 'app.rb']);
run('git', ['commit', '-m', 'initial commit']);
// Create feature branch with vulnerable code
run('git', ['checkout', '-b', 'feature/add-user-controller']);
const vulnContent = fs.readFileSync(path.join(ROOT, 'test', 'fixtures', 'review-eval-vuln.rb'), 'utf-8');
fs.writeFileSync(path.join(reviewDir, 'user_controller.rb'), vulnContent);
run('git', ['add', 'user_controller.rb']);
run('git', ['commit', '-m', 'add user controller']);
// Copy review skill files
fs.copyFileSync(path.join(ROOT, 'review', 'SKILL.md'), path.join(reviewDir, 'review-SKILL.md'));
fs.copyFileSync(path.join(ROOT, 'review', 'checklist.md'), path.join(reviewDir, 'review-checklist.md'));
fs.copyFileSync(path.join(ROOT, 'review', 'greptile-triage.md'), path.join(reviewDir, 'review-greptile-triage.md'));
});
afterAll(() => {
try { fs.rmSync(reviewDir, { recursive: true, force: true }); } catch {}
});
test('/review produces findings on SQL injection branch', async () => {
const result = await runSkillTest({
prompt: `You are in a git repo on a feature branch with changes against main.
Read review-SKILL.md for the review workflow instructions.
Also read review-checklist.md and apply it.
Run /review on the current diff (git diff main...HEAD).
Write your review findings to ${reviewDir}/review-output.md`,
workingDirectory: reviewDir,
maxTurns: 15,
timeout: 90_000,
});
logCost('/review', result);
recordE2E('/review SQL injection', 'Review skill E2E', result);
expect(result.exitReason).toBe('success');
}, 120_000);
});
// --- B6/B7/B8: Planted-bug outcome evals ---
// Outcome evals also need ANTHROPIC_API_KEY for the LLM judge
const hasApiKey = !!process.env.ANTHROPIC_API_KEY;
const describeOutcome = (evalsEnabled && hasApiKey) ? describe : describe.skip;
describeOutcome('Planted-bug outcome evals', () => {
let outcomeDir: string;
beforeAll(() => {
testServer = testServer || startTestServer();
outcomeDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-outcome-'));
setupBrowseShims(outcomeDir);
// Copy qa skill files
copyDirSync(path.join(ROOT, 'qa'), path.join(outcomeDir, 'qa'));
});
afterAll(() => {
testServer?.server?.stop();
try { fs.rmSync(outcomeDir, { recursive: true, force: true }); } catch {}
});
/**
* Shared planted-bug eval runner.
* Runs /qa Standard on a fixture page, then scores with outcomeJudge.
*/
async function runPlantedBugEval(fixture: string, groundTruthFile: string, label: string) {
const reportDir = path.join(outcomeDir, `reports-${label}`);
fs.mkdirSync(path.join(reportDir, 'screenshots'), { recursive: true });
const reportPath = path.join(reportDir, 'qa-report.md');
// Phase 1: runs /qa Standard
const result = await runSkillTest({
prompt: `You have a browse binary at ${browseBin}. Assign it to B variable like: B="${browseBin}"
Read the file qa/SKILL.md for the QA workflow instructions.
Navigate to ${testServer.url}/${fixture} and run a Standard-depth QA test.
Do NOT use AskUserQuestion — run Standard tier directly.
Write your report to ${reportPath}
Save screenshots to ${reportDir}/screenshots/
Be thorough: check console, check all links, check all forms, check mobile viewport, check accessibility.`,
workingDirectory: outcomeDir,
maxTurns: 40,
timeout: 300_000,
});
logCost(`/qa ${label}`, result);
// Phase 1 assertions: browse mechanics
expect(result.browseErrors).toHaveLength(0);
expect(result.exitReason).toBe('success');
// Phase 2: Outcome evaluation via LLM judge
const groundTruth = JSON.parse(
fs.readFileSync(path.join(ROOT, 'test', 'fixtures', groundTruthFile), 'utf-8'),
);
// Read the generated report (try the expected path, then glob for any .md in reportDir)
let report: string;
if (fs.existsSync(reportPath)) {
report = fs.readFileSync(reportPath, 'utf-8');
} else {
// Agent may have named it differently — find any .md in reportDir
const mdFiles = fs.readdirSync(reportDir).filter(f => f.endsWith('.md'));
if (mdFiles.length === 0) {
dumpOutcomeDiagnostic(outcomeDir, label, '(no report file found)', { error: 'missing report' });
throw new Error(`No report file found in ${reportDir}`);
}
report = fs.readFileSync(path.join(reportDir, mdFiles[0]), 'utf-8');
}
const judgeResult = await outcomeJudge(groundTruth, report);
console.log(`${label} outcome:`, JSON.stringify(judgeResult, null, 2));
// Record to eval collector with outcome judge results
recordE2E(`/qa ${label}`, 'Planted-bug outcome evals', result, {
detection_rate: judgeResult.detection_rate,
false_positives: judgeResult.false_positives,
evidence_quality: judgeResult.evidence_quality,
detected_bugs: judgeResult.detected,
missed_bugs: judgeResult.missed,
});
// Diagnostic dump on failure (decision 1C)
if (judgeResult.detection_rate < groundTruth.minimum_detection || judgeResult.false_positives > groundTruth.max_false_positives) {
dumpOutcomeDiagnostic(outcomeDir, label, report, judgeResult);
}
// Phase 2 assertions
expect(judgeResult.detection_rate).toBeGreaterThanOrEqual(groundTruth.minimum_detection);
expect(judgeResult.false_positives).toBeLessThanOrEqual(groundTruth.max_false_positives);
expect(judgeResult.evidence_quality).toBeGreaterThanOrEqual(3);
}
// B6: Static dashboard — broken link, disabled submit, overflow, missing alt, console error
test('/qa standard finds >= 3 of 5 planted bugs (static)', async () => {
await runPlantedBugEval('qa-eval.html', 'qa-eval-ground-truth.json', 'b6-static');
}, 360_000);
// B7: SPA — broken route, stale state, async race, missing aria, console warning
test('/qa standard finds >= 3 of 5 planted SPA bugs', async () => {
await runPlantedBugEval('qa-eval-spa.html', 'qa-eval-spa-ground-truth.json', 'b7-spa');
}, 360_000);
// B8: Checkout — email regex, NaN total, CC overflow, missing required, stripe error
test('/qa standard finds >= 3 of 5 planted checkout bugs', async () => {
await runPlantedBugEval('qa-eval-checkout.html', 'qa-eval-checkout-ground-truth.json', 'b8-checkout');
}, 360_000);
// Ship E2E deferred — too complex (requires full git + test suite + VERSION + CHANGELOG)
test.todo('/ship completes without browse errors');
});
// Module-level afterAll — finalize eval collector after all tests complete
afterAll(async () => {
if (evalCollector) {
try {
await evalCollector.finalize();
} catch (err) {
console.error('Failed to save eval results:', err);
}
}
});