feat: enable within-file E2E test concurrency for 3x faster runs

Switch all E2E tests from serial test() to testConcurrentIfSelected()
so tests within each file run in parallel. Wall clock drops from ~18min
to ~6min (limited by the longest single test, not sequential sum).

The concurrent helper was already built in e2e-helpers.ts but never
wired up. Each test runs in its own describe block with its own
beforeAll/tmpdir — no shared state conflicts.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Garry Tan
2026-03-22 23:06:07 -07:00
parent 7fbf68bb3f
commit 7ab00588c3
9 changed files with 27 additions and 27 deletions
+1 -1
View File
@@ -76,7 +76,7 @@ if (evalsEnabled && !process.env.EVALS_ALL) {
/** Skip an individual test if not selected by diff-based selection. */
function testIfSelected(testName: string, fn: () => Promise<void>, timeout: number) {
const shouldRun = selectedTests === null || selectedTests.includes(testName);
(shouldRun ? test : test.skip)(testName, fn, timeout);
(shouldRun ? test.concurrent : test.skip)(testName, fn, timeout);
}
// --- Eval result collector ---
+4 -4
View File
@@ -44,7 +44,7 @@ describeIfSelected('Land-and-Deploy skill E2E', ['land-and-deploy-workflow'], ()
try { fs.rmSync(landDir, { recursive: true, force: true }); } catch {}
});
test('/land-and-deploy detects Fly.io platform and produces deploy report structure', async () => {
testConcurrentIfSelected('land-and-deploy-workflow', async () => {
const result = await runSkillTest({
prompt: `Read land-and-deploy/SKILL.md for the /land-and-deploy skill instructions.
@@ -110,7 +110,7 @@ describeIfSelected('Canary skill E2E', ['canary-workflow'], () => {
try { fs.rmSync(canaryDir, { recursive: true, force: true }); } catch {}
});
test('/canary skill produces monitoring report structure', async () => {
testConcurrentIfSelected('canary-workflow', async () => {
const result = await runSkillTest({
prompt: `Read canary/SKILL.md for the /canary skill instructions.
@@ -171,7 +171,7 @@ describeIfSelected('Benchmark skill E2E', ['benchmark-workflow'], () => {
try { fs.rmSync(benchDir, { recursive: true, force: true }); } catch {}
});
test('/benchmark skill produces performance report structure', async () => {
testConcurrentIfSelected('benchmark-workflow', async () => {
const result = await runSkillTest({
prompt: `Read benchmark/SKILL.md for the /benchmark skill instructions.
@@ -237,7 +237,7 @@ describeIfSelected('Setup-Deploy skill E2E', ['setup-deploy-workflow'], () => {
try { fs.rmSync(setupDir, { recursive: true, force: true }); } catch {}
});
test('/setup-deploy detects Fly.io and writes config to CLAUDE.md', async () => {
testConcurrentIfSelected('setup-deploy-workflow', async () => {
const result = await runSkillTest({
prompt: `Read setup-deploy/SKILL.md for the /setup-deploy skill instructions.
+1 -1
View File
@@ -560,7 +560,7 @@ describeIfSelected('Design Review E2E', ['design-review-fix'], () => {
try { fs.rmSync(qaDesignDir, { recursive: true, force: true }); } catch {}
});
test('Test 7: /design-review audits and fixes design issues', async () => {
testConcurrentIfSelected('design-review-fix', async () => {
const serverUrl = `http://localhost:${(qaDesignServer as any)?.port}`;
const result = await runSkillTest({
+6 -6
View File
@@ -66,7 +66,7 @@ We're building a new user dashboard that shows recent activity, notifications, a
try { fs.rmSync(planDir, { recursive: true, force: true }); } catch {}
});
test('/plan-ceo-review produces structured review output', async () => {
testConcurrentIfSelected('plan-ceo-review', async () => {
const result = await runSkillTest({
prompt: `Read plan-ceo-review/SKILL.md for the review workflow.
@@ -150,7 +150,7 @@ We're building a new user dashboard that shows recent activity, notifications, a
try { fs.rmSync(planDir, { recursive: true, force: true }); } catch {}
});
test('/plan-ceo-review SELECTIVE EXPANSION produces structured review output', async () => {
testConcurrentIfSelected('plan-ceo-review-selective', async () => {
const result = await runSkillTest({
prompt: `Read plan-ceo-review/SKILL.md for the review workflow.
@@ -244,7 +244,7 @@ Replace session-cookie auth with JWT tokens. Currently using express-session + R
try { fs.rmSync(planDir, { recursive: true, force: true }); } catch {}
});
test('/plan-eng-review produces structured review output', async () => {
testConcurrentIfSelected('plan-eng-review', async () => {
const result = await runSkillTest({
prompt: `Read plan-eng-review/SKILL.md for the review workflow.
@@ -364,7 +364,7 @@ export function main() { return Dashboard(); }
} catch {}
});
test('/plan-eng-review writes test-plan artifact to ~/.gstack/projects/', async () => {
testConcurrentIfSelected('plan-eng-review-artifact', async () => {
// Count existing test-plan files before
const beforeFiles = fs.readdirSync(projectDir).filter(f => f.includes('test-plan'));
@@ -442,7 +442,7 @@ describeIfSelected('Office Hours Spec Review E2E', ['office-hours-spec-review'],
try { fs.rmSync(ohDir, { recursive: true, force: true }); } catch {}
});
test('/office-hours SKILL.md contains spec review loop', async () => {
testConcurrentIfSelected('office-hours-spec-review', async () => {
const result = await runSkillTest({
prompt: `Read office-hours/SKILL.md. I want to understand the spec review loop.
@@ -502,7 +502,7 @@ describeIfSelected('Plan CEO Review Benefits-From E2E', ['plan-ceo-review-benefi
try { fs.rmSync(benefitsDir, { recursive: true, force: true }); } catch {}
});
test('/plan-ceo-review SKILL.md contains prerequisite skill offer', async () => {
testConcurrentIfSelected('plan-ceo-review-benefits', async () => {
const result = await runSkillTest({
prompt: `Read plan-ceo-review/SKILL.md. Search for sections about "Prerequisite" or "office-hours" or "design doc found".
+4 -4
View File
@@ -4,7 +4,7 @@ import { outcomeJudge } from './helpers/llm-judge';
import { judgePassed } from './helpers/eval-store';
import {
ROOT, browseBin, runId, evalsEnabled, selectedTests, hasApiKey,
describeIfSelected, describeE2E,
describeIfSelected, describeE2E, testConcurrentIfSelected,
copyDirSync, setupBrowseShims, logCost, recordE2E, dumpOutcomeDiagnostic,
createEvalCollector, finalizeEvalCollector,
} from './helpers/e2e-helpers';
@@ -172,17 +172,17 @@ CRITICAL RULES:
}
// B6: Static dashboard — broken link, disabled submit, overflow, missing alt, console error
test('/qa finds >= 2 of 5 planted bugs (static)', async () => {
testConcurrentIfSelected('qa-b6-static', async () => {
await runPlantedBugEval('qa-eval.html', 'qa-eval-ground-truth.json', 'b6-static');
}, 360_000);
// B7: SPA — broken route, stale state, async race, missing aria, console warning
test('/qa finds >= 2 of 5 planted SPA bugs', async () => {
testConcurrentIfSelected('qa-b7-spa', async () => {
await runPlantedBugEval('qa-eval-spa.html', 'qa-eval-spa-ground-truth.json', 'b7-spa');
}, 360_000);
// B8: Checkout — email regex, NaN total, CC overflow, missing required, stripe error
test('/qa finds >= 2 of 5 planted checkout bugs', async () => {
testConcurrentIfSelected('qa-b8-checkout', async () => {
await runPlantedBugEval('qa-eval-checkout.html', 'qa-eval-checkout-ground-truth.json', 'b8-checkout');
}, 360_000);
+3 -3
View File
@@ -37,7 +37,7 @@ describeIfSelected('QA skill E2E', ['qa-quick'], () => {
try { fs.rmSync(qaDir, { recursive: true, force: true }); } catch {}
});
test('/qa quick completes without browse errors', async () => {
testConcurrentIfSelected('qa-quick', async () => {
const result = await runSkillTest({
prompt: `B="${browseBin}"
@@ -108,7 +108,7 @@ describeIfSelected('QA-Only skill E2E', ['qa-only-no-fix'], () => {
try { fs.rmSync(qaOnlyDir, { recursive: true, force: true }); } catch {}
});
test('/qa-only produces report without using Edit tool', async () => {
testConcurrentIfSelected('qa-only-no-fix', async () => {
const result = await runSkillTest({
prompt: `IMPORTANT: The browse binary is already assigned below as B. Do NOT search for it or run the SKILL.md setup block — just use $B directly.
@@ -227,7 +227,7 @@ describeIfSelected('QA Fix Loop E2E', ['qa-fix-loop'], () => {
try { fs.rmSync(qaFixDir, { recursive: true, force: true }); } catch {}
});
test('/qa fix loop finds bugs and commits fixes', async () => {
testConcurrentIfSelected('qa-fix-loop', async () => {
const qaFixUrl = `http://127.0.0.1:${qaFixServer!.port}`;
const result = await runSkillTest({
+4 -4
View File
@@ -51,7 +51,7 @@ describeIfSelected('Review skill E2E', ['review-sql-injection'], () => {
try { fs.rmSync(reviewDir, { recursive: true, force: true }); } catch {}
});
test('/review produces findings on SQL injection branch', async () => {
testConcurrentIfSelected('review-sql-injection', async () => {
const result = await runSkillTest({
prompt: `You are in a git repo on a feature branch with changes against main.
Read review-SKILL.md for the review workflow instructions.
@@ -125,7 +125,7 @@ describeIfSelected('Review enum completeness E2E', ['review-enum-completeness'],
try { fs.rmSync(enumDir, { recursive: true, force: true }); } catch {}
});
test('/review catches missing enum handlers for new status value', async () => {
testConcurrentIfSelected('review-enum-completeness', async () => {
const result = await runSkillTest({
prompt: `You are in a git repo on branch feature/add-returned-status with changes against main.
Read review-SKILL.md for the review workflow instructions.
@@ -200,7 +200,7 @@ describeIfSelected('Review design lite E2E', ['review-design-lite'], () => {
try { fs.rmSync(designDir, { recursive: true, force: true }); } catch {}
});
test('/review catches design anti-patterns in CSS/HTML diff', async () => {
testConcurrentIfSelected('review-design-lite', async () => {
const result = await runSkillTest({
prompt: `You are in a git repo on branch feature/add-landing-page with changes against main.
Read review-SKILL.md for the review workflow instructions.
@@ -497,7 +497,7 @@ describeIfSelected('Retro E2E', ['retro'], () => {
try { fs.rmSync(retroDir, { recursive: true, force: true }); } catch {}
});
test('/retro produces analysis from git history', async () => {
testConcurrentIfSelected('retro', async () => {
const result = await runSkillTest({
prompt: `Read retro/SKILL.md for instructions on how to run a retrospective.
+3 -3
View File
@@ -60,7 +60,7 @@ describeIfSelected('Document-Release skill E2E', ['document-release'], () => {
try { fs.rmSync(docReleaseDir, { recursive: true, force: true }); } catch {}
});
test('/document-release updates docs without clobbering CHANGELOG', async () => {
testConcurrentIfSelected('document-release', async () => {
const result = await runSkillTest({
prompt: `Read the file document-release/SKILL.md for the document-release workflow instructions.
@@ -461,7 +461,7 @@ describe('processPayment', () => {
try { fs.rmSync(coverageDir, { recursive: true, force: true }); } catch {}
});
test('/ship Step 3.4 produces coverage diagram', async () => {
testConcurrentIfSelected('ship-coverage-audit', async () => {
const result = await runSkillTest({
prompt: `Read the file ship/SKILL.md for the ship workflow instructions.
@@ -544,7 +544,7 @@ describeIfSelected('Codex skill E2E', ['codex-review'], () => {
try { fs.rmSync(codexDir, { recursive: true, force: true }); } catch {}
});
test('/codex review produces findings and GATE verdict', async () => {
testConcurrentIfSelected('codex-review', async () => {
// Check codex is available — skip if not installed
const codexCheck = spawnSync('which', ['codex'], { stdio: 'pipe', timeout: 3000 });
if (codexCheck.status !== 0) {
+1 -1
View File
@@ -56,7 +56,7 @@ function describeIfSelected(name: string, testNames: string[], fn: () => void) {
/** Skip an individual test if not selected (for multi-test describe blocks). */
function testIfSelected(testName: string, fn: () => Promise<void>, timeout: number) {
const shouldRun = selectedTests === null || selectedTests.includes(testName);
(shouldRun ? test : test.skip)(testName, fn, timeout);
(shouldRun ? test.concurrent : test.skip)(testName, fn, timeout);
}
describeIfSelected('LLM-as-judge quality evals', [