feat: enable within-file E2E test concurrency for 3x faster runs

Switch all E2E tests from serial test() to testConcurrentIfSelected() so that tests within each file run in parallel. The local testIfSelected() helpers are likewise switched from test to test.concurrent. Wall clock drops from ~18min to ~6min (limited by the longest single test, not the sequential sum). The concurrent helper was already built in e2e-helpers.ts but never wired up. Each test runs in its own describe block with its own beforeAll/tmpdir — no shared-state conflicts. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-05-05 05:05:08 +02:00 · 2026-03-22 23:06:07 -07:00
parent 7fbf68bb3f
commit 7ab00588c3
9 changed files with 27 additions and 27 deletions
@@ -76,7 +76,7 @@ if (evalsEnabled && !process.env.EVALS_ALL) {
 /** Skip an individual test if not selected by diff-based selection. */
 function testIfSelected(testName: string, fn: () => Promise<void>, timeout: number) {
  const shouldRun = selectedTests === null || selectedTests.includes(testName);
-  (shouldRun ? test : test.skip)(testName, fn, timeout);
+  (shouldRun ? test.concurrent : test.skip)(testName, fn, timeout);
 }

 // --- Eval result collector ---
@@ -44,7 +44,7 @@ describeIfSelected('Land-and-Deploy skill E2E', ['land-and-deploy-workflow'], ()
    try { fs.rmSync(landDir, { recursive: true, force: true }); } catch {}
  });

-  test('/land-and-deploy detects Fly.io platform and produces deploy report structure', async () => {
+  testConcurrentIfSelected('land-and-deploy-workflow', async () => {
    const result = await runSkillTest({
      prompt: `Read land-and-deploy/SKILL.md for the /land-and-deploy skill instructions.

@@ -110,7 +110,7 @@ describeIfSelected('Canary skill E2E', ['canary-workflow'], () => {
    try { fs.rmSync(canaryDir, { recursive: true, force: true }); } catch {}
  });

-  test('/canary skill produces monitoring report structure', async () => {
+  testConcurrentIfSelected('canary-workflow', async () => {
    const result = await runSkillTest({
      prompt: `Read canary/SKILL.md for the /canary skill instructions.

@@ -171,7 +171,7 @@ describeIfSelected('Benchmark skill E2E', ['benchmark-workflow'], () => {
    try { fs.rmSync(benchDir, { recursive: true, force: true }); } catch {}
  });

-  test('/benchmark skill produces performance report structure', async () => {
+  testConcurrentIfSelected('benchmark-workflow', async () => {
    const result = await runSkillTest({
      prompt: `Read benchmark/SKILL.md for the /benchmark skill instructions.

@@ -237,7 +237,7 @@ describeIfSelected('Setup-Deploy skill E2E', ['setup-deploy-workflow'], () => {
    try { fs.rmSync(setupDir, { recursive: true, force: true }); } catch {}
  });

-  test('/setup-deploy detects Fly.io and writes config to CLAUDE.md', async () => {
+  testConcurrentIfSelected('setup-deploy-workflow', async () => {
    const result = await runSkillTest({
      prompt: `Read setup-deploy/SKILL.md for the /setup-deploy skill instructions.

@@ -560,7 +560,7 @@ describeIfSelected('Design Review E2E', ['design-review-fix'], () => {
    try { fs.rmSync(qaDesignDir, { recursive: true, force: true }); } catch {}
  });

-  test('Test 7: /design-review audits and fixes design issues', async () => {
+  testConcurrentIfSelected('design-review-fix', async () => {
    const serverUrl = `http://localhost:${(qaDesignServer as any)?.port}`;

    const result = await runSkillTest({
@@ -66,7 +66,7 @@ We're building a new user dashboard that shows recent activity, notifications, a
    try { fs.rmSync(planDir, { recursive: true, force: true }); } catch {}
  });

-  test('/plan-ceo-review produces structured review output', async () => {
+  testConcurrentIfSelected('plan-ceo-review', async () => {
    const result = await runSkillTest({
      prompt: `Read plan-ceo-review/SKILL.md for the review workflow.

@@ -150,7 +150,7 @@ We're building a new user dashboard that shows recent activity, notifications, a
    try { fs.rmSync(planDir, { recursive: true, force: true }); } catch {}
  });

-  test('/plan-ceo-review SELECTIVE EXPANSION produces structured review output', async () => {
+  testConcurrentIfSelected('plan-ceo-review-selective', async () => {
    const result = await runSkillTest({
      prompt: `Read plan-ceo-review/SKILL.md for the review workflow.

@@ -244,7 +244,7 @@ Replace session-cookie auth with JWT tokens. Currently using express-session + R
    try { fs.rmSync(planDir, { recursive: true, force: true }); } catch {}
  });

-  test('/plan-eng-review produces structured review output', async () => {
+  testConcurrentIfSelected('plan-eng-review', async () => {
    const result = await runSkillTest({
      prompt: `Read plan-eng-review/SKILL.md for the review workflow.

@@ -364,7 +364,7 @@ export function main() { return Dashboard(); }
    } catch {}
  });

-  test('/plan-eng-review writes test-plan artifact to ~/.gstack/projects/', async () => {
+  testConcurrentIfSelected('plan-eng-review-artifact', async () => {
    // Count existing test-plan files before
    const beforeFiles = fs.readdirSync(projectDir).filter(f => f.includes('test-plan'));

@@ -442,7 +442,7 @@ describeIfSelected('Office Hours Spec Review E2E', ['office-hours-spec-review'],
    try { fs.rmSync(ohDir, { recursive: true, force: true }); } catch {}
  });

-  test('/office-hours SKILL.md contains spec review loop', async () => {
+  testConcurrentIfSelected('office-hours-spec-review', async () => {
    const result = await runSkillTest({
      prompt: `Read office-hours/SKILL.md. I want to understand the spec review loop.

@@ -502,7 +502,7 @@ describeIfSelected('Plan CEO Review Benefits-From E2E', ['plan-ceo-review-benefi
    try { fs.rmSync(benefitsDir, { recursive: true, force: true }); } catch {}
  });

-  test('/plan-ceo-review SKILL.md contains prerequisite skill offer', async () => {
+  testConcurrentIfSelected('plan-ceo-review-benefits', async () => {
    const result = await runSkillTest({
      prompt: `Read plan-ceo-review/SKILL.md. Search for sections about "Prerequisite" or "office-hours" or "design doc found".

@@ -4,7 +4,7 @@ import { outcomeJudge } from './helpers/llm-judge';
 import { judgePassed } from './helpers/eval-store';
 import {
  ROOT, browseBin, runId, evalsEnabled, selectedTests, hasApiKey,
-  describeIfSelected, describeE2E,
+  describeIfSelected, describeE2E, testConcurrentIfSelected,
  copyDirSync, setupBrowseShims, logCost, recordE2E, dumpOutcomeDiagnostic,
  createEvalCollector, finalizeEvalCollector,
 } from './helpers/e2e-helpers';
@@ -172,17 +172,17 @@ CRITICAL RULES:
  }

  // B6: Static dashboard — broken link, disabled submit, overflow, missing alt, console error
-  test('/qa finds >= 2 of 5 planted bugs (static)', async () => {
+  testConcurrentIfSelected('qa-b6-static', async () => {
    await runPlantedBugEval('qa-eval.html', 'qa-eval-ground-truth.json', 'b6-static');
  }, 360_000);

  // B7: SPA — broken route, stale state, async race, missing aria, console warning
-  test('/qa finds >= 2 of 5 planted SPA bugs', async () => {
+  testConcurrentIfSelected('qa-b7-spa', async () => {
    await runPlantedBugEval('qa-eval-spa.html', 'qa-eval-spa-ground-truth.json', 'b7-spa');
  }, 360_000);

  // B8: Checkout — email regex, NaN total, CC overflow, missing required, stripe error
-  test('/qa finds >= 2 of 5 planted checkout bugs', async () => {
+  testConcurrentIfSelected('qa-b8-checkout', async () => {
    await runPlantedBugEval('qa-eval-checkout.html', 'qa-eval-checkout-ground-truth.json', 'b8-checkout');
  }, 360_000);

@@ -37,7 +37,7 @@ describeIfSelected('QA skill E2E', ['qa-quick'], () => {
    try { fs.rmSync(qaDir, { recursive: true, force: true }); } catch {}
  });

-  test('/qa quick completes without browse errors', async () => {
+  testConcurrentIfSelected('qa-quick', async () => {
    const result = await runSkillTest({
      prompt: `B="${browseBin}"

@@ -108,7 +108,7 @@ describeIfSelected('QA-Only skill E2E', ['qa-only-no-fix'], () => {
    try { fs.rmSync(qaOnlyDir, { recursive: true, force: true }); } catch {}
  });

-  test('/qa-only produces report without using Edit tool', async () => {
+  testConcurrentIfSelected('qa-only-no-fix', async () => {
    const result = await runSkillTest({
      prompt: `IMPORTANT: The browse binary is already assigned below as B. Do NOT search for it or run the SKILL.md setup block — just use $B directly.

@@ -227,7 +227,7 @@ describeIfSelected('QA Fix Loop E2E', ['qa-fix-loop'], () => {
    try { fs.rmSync(qaFixDir, { recursive: true, force: true }); } catch {}
  });

-  test('/qa fix loop finds bugs and commits fixes', async () => {
+  testConcurrentIfSelected('qa-fix-loop', async () => {
    const qaFixUrl = `http://127.0.0.1:${qaFixServer!.port}`;

    const result = await runSkillTest({
@@ -51,7 +51,7 @@ describeIfSelected('Review skill E2E', ['review-sql-injection'], () => {
    try { fs.rmSync(reviewDir, { recursive: true, force: true }); } catch {}
  });

-  test('/review produces findings on SQL injection branch', async () => {
+  testConcurrentIfSelected('review-sql-injection', async () => {
    const result = await runSkillTest({
      prompt: `You are in a git repo on a feature branch with changes against main.
 Read review-SKILL.md for the review workflow instructions.
@@ -125,7 +125,7 @@ describeIfSelected('Review enum completeness E2E', ['review-enum-completeness'],
    try { fs.rmSync(enumDir, { recursive: true, force: true }); } catch {}
  });

-  test('/review catches missing enum handlers for new status value', async () => {
+  testConcurrentIfSelected('review-enum-completeness', async () => {
    const result = await runSkillTest({
      prompt: `You are in a git repo on branch feature/add-returned-status with changes against main.
 Read review-SKILL.md for the review workflow instructions.
@@ -200,7 +200,7 @@ describeIfSelected('Review design lite E2E', ['review-design-lite'], () => {
    try { fs.rmSync(designDir, { recursive: true, force: true }); } catch {}
  });

-  test('/review catches design anti-patterns in CSS/HTML diff', async () => {
+  testConcurrentIfSelected('review-design-lite', async () => {
    const result = await runSkillTest({
      prompt: `You are in a git repo on branch feature/add-landing-page with changes against main.
 Read review-SKILL.md for the review workflow instructions.
@@ -497,7 +497,7 @@ describeIfSelected('Retro E2E', ['retro'], () => {
    try { fs.rmSync(retroDir, { recursive: true, force: true }); } catch {}
  });

-  test('/retro produces analysis from git history', async () => {
+  testConcurrentIfSelected('retro', async () => {
    const result = await runSkillTest({
      prompt: `Read retro/SKILL.md for instructions on how to run a retrospective.

@@ -60,7 +60,7 @@ describeIfSelected('Document-Release skill E2E', ['document-release'], () => {
    try { fs.rmSync(docReleaseDir, { recursive: true, force: true }); } catch {}
  });

-  test('/document-release updates docs without clobbering CHANGELOG', async () => {
+  testConcurrentIfSelected('document-release', async () => {
    const result = await runSkillTest({
      prompt: `Read the file document-release/SKILL.md for the document-release workflow instructions.

@@ -461,7 +461,7 @@ describe('processPayment', () => {
    try { fs.rmSync(coverageDir, { recursive: true, force: true }); } catch {}
  });

-  test('/ship Step 3.4 produces coverage diagram', async () => {
+  testConcurrentIfSelected('ship-coverage-audit', async () => {
    const result = await runSkillTest({
      prompt: `Read the file ship/SKILL.md for the ship workflow instructions.

@@ -544,7 +544,7 @@ describeIfSelected('Codex skill E2E', ['codex-review'], () => {
    try { fs.rmSync(codexDir, { recursive: true, force: true }); } catch {}
  });

-  test('/codex review produces findings and GATE verdict', async () => {
+  testConcurrentIfSelected('codex-review', async () => {
    // Check codex is available — skip if not installed
    const codexCheck = spawnSync('which', ['codex'], { stdio: 'pipe', timeout: 3000 });
    if (codexCheck.status !== 0) {
@@ -56,7 +56,7 @@ function describeIfSelected(name: string, testNames: string[], fn: () => void) {
 /** Skip an individual test if not selected (for multi-test describe blocks). */
 function testIfSelected(testName: string, fn: () => Promise<void>, timeout: number) {
  const shouldRun = selectedTests === null || selectedTests.includes(testName);
-  (shouldRun ? test : test.skip)(testName, fn, timeout);
+  (shouldRun ? test.concurrent : test.skip)(testName, fn, timeout);
 }

 describeIfSelected('LLM-as-judge quality evals', [