Files
gstack/test/skill-e2e-qa-bugs.test.ts
T
Garry Tan 351bbb8426 test: bump remaining claude-opus-4-6 → 4-7 references
Mechanical model ID bump across the E2E eval suite. All six in-repo
files that referenced the older opus identifier are updated to match
the model gstack now defaults to. No behavior change beyond the model
ID the test harness asks for.

Contributed by @johnnysoftware7 (#1392).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-10 11:08:26 -07:00

195 lines
8.1 KiB
TypeScript

import { describe, test, expect, beforeAll, afterAll } from 'bun:test';
import { runSkillTest } from './helpers/session-runner';
import { outcomeJudge } from './helpers/llm-judge';
import { judgePassed } from './helpers/eval-store';
import {
ROOT, browseBin, runId, evalsEnabled, selectedTests, hasApiKey,
describeIfSelected, describeE2E, testConcurrentIfSelected,
copyDirSync, setupBrowseShims, logCost, recordE2E, dumpOutcomeDiagnostic,
createEvalCollector, finalizeEvalCollector,
} from './helpers/e2e-helpers';
import { startTestServer } from '../browse/test/test-server';
import { spawnSync } from 'child_process';
import * as fs from 'fs';
import * as path from 'path';
import * as os from 'os';
const evalCollector = createEvalCollector('e2e-qa-bugs');
// --- B6/B7/B8: Planted-bug outcome evals ---
// Outcome evals also need ANTHROPIC_API_KEY for the LLM judge
const describeOutcome = (evalsEnabled && hasApiKey) ? describe : describe.skip;
// Wrap describeOutcome with selection — skip if no planted-bug tests are selected
const outcomeTestNames = ['qa-b6-static', 'qa-b7-spa', 'qa-b8-checkout'];
const anyOutcomeSelected = selectedTests === null || outcomeTestNames.some(t => selectedTests!.includes(t));
let testServer: ReturnType<typeof startTestServer>;
(anyOutcomeSelected ? describeOutcome : describe.skip)('Planted-bug outcome evals', () => {
let outcomeDir: string;
beforeAll(() => {
testServer = startTestServer();
outcomeDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-outcome-'));
setupBrowseShims(outcomeDir);
// Copy qa skill files
copyDirSync(path.join(ROOT, 'qa'), path.join(outcomeDir, 'qa'));
});
afterAll(() => {
testServer?.server?.stop();
try { fs.rmSync(outcomeDir, { recursive: true, force: true }); } catch {}
});
/**
* Shared planted-bug eval runner.
* Gives the agent concise bug-finding instructions (not the full QA workflow),
* then scores the report with an LLM outcome judge.
*/
async function runPlantedBugEval(fixture: string, groundTruthFile: string, label: string) {
// Each test gets its own isolated working directory to prevent cross-contamination
// (agents reading previous tests' reports and hallucinating those bugs)
const testWorkDir = fs.mkdtempSync(path.join(os.tmpdir(), `skill-e2e-${label}-`));
setupBrowseShims(testWorkDir);
const reportDir = path.join(testWorkDir, 'reports');
fs.mkdirSync(path.join(reportDir, 'screenshots'), { recursive: true });
const reportPath = path.join(reportDir, 'qa-report.md');
// Direct bug-finding with browse. Keep prompt concise — no reading long SKILL.md docs.
// "Write early, update later" pattern ensures report exists even if agent hits max turns.
const targetUrl = `${testServer.url}/${fixture}`;
const result = await runSkillTest({
prompt: `Find bugs on this page: ${targetUrl}
Browser binary: B="${browseBin}"
PHASE 1 — Quick scan (5 commands max):
$B goto ${targetUrl}
$B console --errors
$B snapshot -i
$B snapshot -c
$B accessibility
PHASE 2 — Write initial report to ${reportPath}:
Write every bug you found so far. Format each as:
- Category: functional / visual / accessibility / console
- Severity: high / medium / low
- Evidence: what you observed
PHASE 3 — Interactive testing (targeted — max 15 commands):
- Test email: type "user@" (no domain) and blur — does it validate?
- Test quantity: clear the field entirely — check the total display
- Test credit card: type a 25-character string — check for overflow
- Submit the form with zip code empty — does it require zip?
- Submit a valid form and run $B console --errors
- After finding more bugs, UPDATE ${reportPath} with new findings
PHASE 4 — Finalize report:
- UPDATE ${reportPath} with ALL bugs found across all phases
- Include console errors, form validation issues, visual overflow, missing attributes
CRITICAL RULES:
- ONLY test the page at ${targetUrl} — do not navigate to other sites
- Write the report file in PHASE 2 before doing interactive testing
- The report MUST exist at ${reportPath} when you finish`,
workingDirectory: testWorkDir,
maxTurns: 50,
timeout: 300_000,
testName: `qa-${label}`,
runId,
model: 'claude-opus-4-7',
});
logCost(`/qa ${label}`, result);
// Phase 1: browse mechanics. Accept error_max_turns — agent may have written
// a partial report before running out of turns. What matters is detection rate.
if (result.browseErrors.length > 0) {
console.warn(`${label} browse errors:`, result.browseErrors);
}
if (result.exitReason !== 'success' && result.exitReason !== 'error_max_turns') {
throw new Error(`${label}: unexpected exit reason: ${result.exitReason}`);
}
// Phase 2: Outcome evaluation via LLM judge
const groundTruth = JSON.parse(
fs.readFileSync(path.join(ROOT, 'test', 'fixtures', groundTruthFile), 'utf-8'),
);
// Read the generated report (try expected path, then glob for any .md in reportDir or workDir)
let report: string | null = null;
if (fs.existsSync(reportPath)) {
report = fs.readFileSync(reportPath, 'utf-8');
} else {
// Agent may have named it differently — find any .md in reportDir or testWorkDir
for (const searchDir of [reportDir, testWorkDir]) {
try {
const mdFiles = fs.readdirSync(searchDir).filter(f => f.endsWith('.md'));
if (mdFiles.length > 0) {
report = fs.readFileSync(path.join(searchDir, mdFiles[0]), 'utf-8');
break;
}
} catch { /* dir may not exist if agent hit max_turns early */ }
}
// Also check the agent's final output for inline report content
if (!report && result.output && result.output.length > 100) {
report = result.output;
}
}
if (!report) {
dumpOutcomeDiagnostic(testWorkDir, label, '(no report file found)', { error: 'missing report' });
recordE2E(evalCollector, `/qa ${label}`, 'Planted-bug outcome evals', result, { error: 'no report generated' } as any);
throw new Error(`No report file found in ${reportDir}`);
}
const judgeResult = await outcomeJudge(groundTruth, report);
console.log(`${label} outcome:`, JSON.stringify(judgeResult, null, 2));
// Record to eval collector with outcome judge results
recordE2E(evalCollector, `/qa ${label}`, 'Planted-bug outcome evals', result, {
passed: judgePassed(judgeResult, groundTruth),
detection_rate: judgeResult.detection_rate,
false_positives: judgeResult.false_positives,
evidence_quality: judgeResult.evidence_quality,
detected_bugs: judgeResult.detected,
missed_bugs: judgeResult.missed,
} as any);
// Diagnostic dump on failure (decision 1C)
if (judgeResult.detection_rate < groundTruth.minimum_detection || judgeResult.false_positives > groundTruth.max_false_positives) {
dumpOutcomeDiagnostic(testWorkDir, label, report, judgeResult);
}
// Phase 2 assertions
expect(judgeResult.detection_rate).toBeGreaterThanOrEqual(groundTruth.minimum_detection);
expect(judgeResult.false_positives).toBeLessThanOrEqual(groundTruth.max_false_positives);
expect(judgeResult.evidence_quality).toBeGreaterThanOrEqual(2);
}
// B6: Static dashboard — broken link, disabled submit, overflow, missing alt, console error
testConcurrentIfSelected('qa-b6-static', async () => {
await runPlantedBugEval('qa-eval.html', 'qa-eval-ground-truth.json', 'b6-static');
}, 360_000);
// B7: SPA — broken route, stale state, async race, missing aria, console warning
testConcurrentIfSelected('qa-b7-spa', async () => {
await runPlantedBugEval('qa-eval-spa.html', 'qa-eval-spa-ground-truth.json', 'b7-spa');
}, 360_000);
// B8: Checkout — email regex, NaN total, CC overflow, missing required, stripe error
testConcurrentIfSelected('qa-b8-checkout', async () => {
await runPlantedBugEval('qa-eval-checkout.html', 'qa-eval-checkout-ground-truth.json', 'b8-checkout');
}, 360_000);
});
// Module-level afterAll — finalize eval collector after all tests complete
afterAll(async () => {
await finalizeEvalCollector(evalCollector);
});