mirror of
https://github.com/garrytan/gstack.git
synced 2026-05-02 11:45:20 +02:00
7ab00588c3
Switch all E2E tests from serial test() to testConcurrentIfSelected() so tests within each file run in parallel. Wall clock drops from ~18min to ~6min (limited by the longest single test, not sequential sum). The concurrent helper was already built in e2e-helpers.ts but never wired up. Each test runs in its own describe block with its own beforeAll/tmpdir — no shared state conflicts. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
195 lines
8.1 KiB
TypeScript
195 lines
8.1 KiB
TypeScript
import { describe, test, expect, beforeAll, afterAll } from 'bun:test';
|
|
import { runSkillTest } from './helpers/session-runner';
|
|
import { outcomeJudge } from './helpers/llm-judge';
|
|
import { judgePassed } from './helpers/eval-store';
|
|
import {
|
|
ROOT, browseBin, runId, evalsEnabled, selectedTests, hasApiKey,
|
|
describeIfSelected, describeE2E, testConcurrentIfSelected,
|
|
copyDirSync, setupBrowseShims, logCost, recordE2E, dumpOutcomeDiagnostic,
|
|
createEvalCollector, finalizeEvalCollector,
|
|
} from './helpers/e2e-helpers';
|
|
import { startTestServer } from '../browse/test/test-server';
|
|
import { spawnSync } from 'child_process';
|
|
import * as fs from 'fs';
|
|
import * as path from 'path';
|
|
import * as os from 'os';
|
|
|
|
const evalCollector = createEvalCollector('e2e-qa-bugs');
|
|
|
|
// --- B6/B7/B8: Planted-bug outcome evals ---
|
|
|
|
// Outcome evals also need ANTHROPIC_API_KEY for the LLM judge
|
|
const describeOutcome = (evalsEnabled && hasApiKey) ? describe : describe.skip;
|
|
|
|
// Wrap describeOutcome with selection — skip if no planted-bug tests are selected
|
|
const outcomeTestNames = ['qa-b6-static', 'qa-b7-spa', 'qa-b8-checkout'];
|
|
const anyOutcomeSelected = selectedTests === null || outcomeTestNames.some(t => selectedTests!.includes(t));
|
|
|
|
let testServer: ReturnType<typeof startTestServer>;
|
|
|
|
(anyOutcomeSelected ? describeOutcome : describe.skip)('Planted-bug outcome evals', () => {
|
|
let outcomeDir: string;
|
|
|
|
beforeAll(() => {
|
|
testServer = startTestServer();
|
|
outcomeDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-outcome-'));
|
|
setupBrowseShims(outcomeDir);
|
|
|
|
// Copy qa skill files
|
|
copyDirSync(path.join(ROOT, 'qa'), path.join(outcomeDir, 'qa'));
|
|
});
|
|
|
|
afterAll(() => {
|
|
testServer?.server?.stop();
|
|
try { fs.rmSync(outcomeDir, { recursive: true, force: true }); } catch {}
|
|
});
|
|
|
|
/**
|
|
* Shared planted-bug eval runner.
|
|
* Gives the agent concise bug-finding instructions (not the full QA workflow),
|
|
* then scores the report with an LLM outcome judge.
|
|
*/
|
|
async function runPlantedBugEval(fixture: string, groundTruthFile: string, label: string) {
|
|
// Each test gets its own isolated working directory to prevent cross-contamination
|
|
// (agents reading previous tests' reports and hallucinating those bugs)
|
|
const testWorkDir = fs.mkdtempSync(path.join(os.tmpdir(), `skill-e2e-${label}-`));
|
|
setupBrowseShims(testWorkDir);
|
|
const reportDir = path.join(testWorkDir, 'reports');
|
|
fs.mkdirSync(path.join(reportDir, 'screenshots'), { recursive: true });
|
|
const reportPath = path.join(reportDir, 'qa-report.md');
|
|
|
|
// Direct bug-finding with browse. Keep prompt concise — no reading long SKILL.md docs.
|
|
// "Write early, update later" pattern ensures report exists even if agent hits max turns.
|
|
const targetUrl = `${testServer.url}/${fixture}`;
|
|
const result = await runSkillTest({
|
|
prompt: `Find bugs on this page: ${targetUrl}
|
|
|
|
Browser binary: B="${browseBin}"
|
|
|
|
PHASE 1 — Quick scan (5 commands max):
|
|
$B goto ${targetUrl}
|
|
$B console --errors
|
|
$B snapshot -i
|
|
$B snapshot -c
|
|
$B accessibility
|
|
|
|
PHASE 2 — Write initial report to ${reportPath}:
|
|
Write every bug you found so far. Format each as:
|
|
- Category: functional / visual / accessibility / console
|
|
- Severity: high / medium / low
|
|
- Evidence: what you observed
|
|
|
|
PHASE 3 — Interactive testing (targeted — max 15 commands):
|
|
- Test email: type "user@" (no domain) and blur — does it validate?
|
|
- Test quantity: clear the field entirely — check the total display
|
|
- Test credit card: type a 25-character string — check for overflow
|
|
- Submit the form with zip code empty — does it require zip?
|
|
- Submit a valid form and run $B console --errors
|
|
- After finding more bugs, UPDATE ${reportPath} with new findings
|
|
|
|
PHASE 4 — Finalize report:
|
|
- UPDATE ${reportPath} with ALL bugs found across all phases
|
|
- Include console errors, form validation issues, visual overflow, missing attributes
|
|
|
|
CRITICAL RULES:
|
|
- ONLY test the page at ${targetUrl} — do not navigate to other sites
|
|
- Write the report file in PHASE 2 before doing interactive testing
|
|
- The report MUST exist at ${reportPath} when you finish`,
|
|
workingDirectory: testWorkDir,
|
|
maxTurns: 50,
|
|
timeout: 300_000,
|
|
testName: `qa-${label}`,
|
|
runId,
|
|
model: 'claude-opus-4-6',
|
|
});
|
|
|
|
logCost(`/qa ${label}`, result);
|
|
|
|
// Phase 1: browse mechanics. Accept error_max_turns — agent may have written
|
|
// a partial report before running out of turns. What matters is detection rate.
|
|
if (result.browseErrors.length > 0) {
|
|
console.warn(`${label} browse errors:`, result.browseErrors);
|
|
}
|
|
if (result.exitReason !== 'success' && result.exitReason !== 'error_max_turns') {
|
|
throw new Error(`${label}: unexpected exit reason: ${result.exitReason}`);
|
|
}
|
|
|
|
// Phase 2: Outcome evaluation via LLM judge
|
|
const groundTruth = JSON.parse(
|
|
fs.readFileSync(path.join(ROOT, 'test', 'fixtures', groundTruthFile), 'utf-8'),
|
|
);
|
|
|
|
// Read the generated report (try expected path, then glob for any .md in reportDir or workDir)
|
|
let report: string | null = null;
|
|
if (fs.existsSync(reportPath)) {
|
|
report = fs.readFileSync(reportPath, 'utf-8');
|
|
} else {
|
|
// Agent may have named it differently — find any .md in reportDir or testWorkDir
|
|
for (const searchDir of [reportDir, testWorkDir]) {
|
|
try {
|
|
const mdFiles = fs.readdirSync(searchDir).filter(f => f.endsWith('.md'));
|
|
if (mdFiles.length > 0) {
|
|
report = fs.readFileSync(path.join(searchDir, mdFiles[0]), 'utf-8');
|
|
break;
|
|
}
|
|
} catch { /* dir may not exist if agent hit max_turns early */ }
|
|
}
|
|
|
|
// Also check the agent's final output for inline report content
|
|
if (!report && result.output && result.output.length > 100) {
|
|
report = result.output;
|
|
}
|
|
}
|
|
|
|
if (!report) {
|
|
dumpOutcomeDiagnostic(testWorkDir, label, '(no report file found)', { error: 'missing report' });
|
|
recordE2E(evalCollector, `/qa ${label}`, 'Planted-bug outcome evals', result, { error: 'no report generated' } as any);
|
|
throw new Error(`No report file found in ${reportDir}`);
|
|
}
|
|
|
|
const judgeResult = await outcomeJudge(groundTruth, report);
|
|
console.log(`${label} outcome:`, JSON.stringify(judgeResult, null, 2));
|
|
|
|
// Record to eval collector with outcome judge results
|
|
recordE2E(evalCollector, `/qa ${label}`, 'Planted-bug outcome evals', result, {
|
|
passed: judgePassed(judgeResult, groundTruth),
|
|
detection_rate: judgeResult.detection_rate,
|
|
false_positives: judgeResult.false_positives,
|
|
evidence_quality: judgeResult.evidence_quality,
|
|
detected_bugs: judgeResult.detected,
|
|
missed_bugs: judgeResult.missed,
|
|
} as any);
|
|
|
|
// Diagnostic dump on failure (decision 1C)
|
|
if (judgeResult.detection_rate < groundTruth.minimum_detection || judgeResult.false_positives > groundTruth.max_false_positives) {
|
|
dumpOutcomeDiagnostic(testWorkDir, label, report, judgeResult);
|
|
}
|
|
|
|
// Phase 2 assertions
|
|
expect(judgeResult.detection_rate).toBeGreaterThanOrEqual(groundTruth.minimum_detection);
|
|
expect(judgeResult.false_positives).toBeLessThanOrEqual(groundTruth.max_false_positives);
|
|
expect(judgeResult.evidence_quality).toBeGreaterThanOrEqual(2);
|
|
}
|
|
|
|
// B6: Static dashboard — broken link, disabled submit, overflow, missing alt, console error
|
|
testConcurrentIfSelected('qa-b6-static', async () => {
|
|
await runPlantedBugEval('qa-eval.html', 'qa-eval-ground-truth.json', 'b6-static');
|
|
}, 360_000);
|
|
|
|
// B7: SPA — broken route, stale state, async race, missing aria, console warning
|
|
testConcurrentIfSelected('qa-b7-spa', async () => {
|
|
await runPlantedBugEval('qa-eval-spa.html', 'qa-eval-spa-ground-truth.json', 'b7-spa');
|
|
}, 360_000);
|
|
|
|
// B8: Checkout — email regex, NaN total, CC overflow, missing required, stripe error
|
|
testConcurrentIfSelected('qa-b8-checkout', async () => {
|
|
await runPlantedBugEval('qa-eval-checkout.html', 'qa-eval-checkout-ground-truth.json', 'b8-checkout');
|
|
}, 360_000);
|
|
|
|
});
|
|
|
|
// Module-level afterAll — finalize eval collector after all tests complete
|
|
afterAll(async () => {
|
|
await finalizeEvalCollector(evalCollector);
|
|
});
|