mirror of
https://github.com/garrytan/gstack.git
synced 2026-05-05 05:05:08 +02:00
feat: enable within-file E2E test concurrency for 3x faster runs
Switch all E2E tests from serial test() to testConcurrentIfSelected() so tests within each file run in parallel. Wall clock drops from ~18min to ~6min (limited by the longest single test, not the sequential sum).

The concurrent helper was already built in e2e-helpers.ts but never wired up. Each test runs in its own describe block with its own beforeAll/tmpdir — no shared state conflicts.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -76,7 +76,7 @@ if (evalsEnabled && !process.env.EVALS_ALL) {
|
||||
/** Skip an individual test if not selected by diff-based selection. */
|
||||
function testIfSelected(testName: string, fn: () => Promise<void>, timeout: number) {
|
||||
const shouldRun = selectedTests === null || selectedTests.includes(testName);
|
||||
(shouldRun ? test : test.skip)(testName, fn, timeout);
|
||||
(shouldRun ? test.concurrent : test.skip)(testName, fn, timeout);
|
||||
}
|
||||
|
||||
// --- Eval result collector ---
|
||||
|
||||
@@ -44,7 +44,7 @@ describeIfSelected('Land-and-Deploy skill E2E', ['land-and-deploy-workflow'], ()
|
||||
try { fs.rmSync(landDir, { recursive: true, force: true }); } catch {}
|
||||
});
|
||||
|
||||
test('/land-and-deploy detects Fly.io platform and produces deploy report structure', async () => {
|
||||
testConcurrentIfSelected('land-and-deploy-workflow', async () => {
|
||||
const result = await runSkillTest({
|
||||
prompt: `Read land-and-deploy/SKILL.md for the /land-and-deploy skill instructions.
|
||||
|
||||
@@ -110,7 +110,7 @@ describeIfSelected('Canary skill E2E', ['canary-workflow'], () => {
|
||||
try { fs.rmSync(canaryDir, { recursive: true, force: true }); } catch {}
|
||||
});
|
||||
|
||||
test('/canary skill produces monitoring report structure', async () => {
|
||||
testConcurrentIfSelected('canary-workflow', async () => {
|
||||
const result = await runSkillTest({
|
||||
prompt: `Read canary/SKILL.md for the /canary skill instructions.
|
||||
|
||||
@@ -171,7 +171,7 @@ describeIfSelected('Benchmark skill E2E', ['benchmark-workflow'], () => {
|
||||
try { fs.rmSync(benchDir, { recursive: true, force: true }); } catch {}
|
||||
});
|
||||
|
||||
test('/benchmark skill produces performance report structure', async () => {
|
||||
testConcurrentIfSelected('benchmark-workflow', async () => {
|
||||
const result = await runSkillTest({
|
||||
prompt: `Read benchmark/SKILL.md for the /benchmark skill instructions.
|
||||
|
||||
@@ -237,7 +237,7 @@ describeIfSelected('Setup-Deploy skill E2E', ['setup-deploy-workflow'], () => {
|
||||
try { fs.rmSync(setupDir, { recursive: true, force: true }); } catch {}
|
||||
});
|
||||
|
||||
test('/setup-deploy detects Fly.io and writes config to CLAUDE.md', async () => {
|
||||
testConcurrentIfSelected('setup-deploy-workflow', async () => {
|
||||
const result = await runSkillTest({
|
||||
prompt: `Read setup-deploy/SKILL.md for the /setup-deploy skill instructions.
|
||||
|
||||
|
||||
@@ -560,7 +560,7 @@ describeIfSelected('Design Review E2E', ['design-review-fix'], () => {
|
||||
try { fs.rmSync(qaDesignDir, { recursive: true, force: true }); } catch {}
|
||||
});
|
||||
|
||||
test('Test 7: /design-review audits and fixes design issues', async () => {
|
||||
testConcurrentIfSelected('design-review-fix', async () => {
|
||||
const serverUrl = `http://localhost:${(qaDesignServer as any)?.port}`;
|
||||
|
||||
const result = await runSkillTest({
|
||||
|
||||
@@ -66,7 +66,7 @@ We're building a new user dashboard that shows recent activity, notifications, a
|
||||
try { fs.rmSync(planDir, { recursive: true, force: true }); } catch {}
|
||||
});
|
||||
|
||||
test('/plan-ceo-review produces structured review output', async () => {
|
||||
testConcurrentIfSelected('plan-ceo-review', async () => {
|
||||
const result = await runSkillTest({
|
||||
prompt: `Read plan-ceo-review/SKILL.md for the review workflow.
|
||||
|
||||
@@ -150,7 +150,7 @@ We're building a new user dashboard that shows recent activity, notifications, a
|
||||
try { fs.rmSync(planDir, { recursive: true, force: true }); } catch {}
|
||||
});
|
||||
|
||||
test('/plan-ceo-review SELECTIVE EXPANSION produces structured review output', async () => {
|
||||
testConcurrentIfSelected('plan-ceo-review-selective', async () => {
|
||||
const result = await runSkillTest({
|
||||
prompt: `Read plan-ceo-review/SKILL.md for the review workflow.
|
||||
|
||||
@@ -244,7 +244,7 @@ Replace session-cookie auth with JWT tokens. Currently using express-session + R
|
||||
try { fs.rmSync(planDir, { recursive: true, force: true }); } catch {}
|
||||
});
|
||||
|
||||
test('/plan-eng-review produces structured review output', async () => {
|
||||
testConcurrentIfSelected('plan-eng-review', async () => {
|
||||
const result = await runSkillTest({
|
||||
prompt: `Read plan-eng-review/SKILL.md for the review workflow.
|
||||
|
||||
@@ -364,7 +364,7 @@ export function main() { return Dashboard(); }
|
||||
} catch {}
|
||||
});
|
||||
|
||||
test('/plan-eng-review writes test-plan artifact to ~/.gstack/projects/', async () => {
|
||||
testConcurrentIfSelected('plan-eng-review-artifact', async () => {
|
||||
// Count existing test-plan files before
|
||||
const beforeFiles = fs.readdirSync(projectDir).filter(f => f.includes('test-plan'));
|
||||
|
||||
@@ -442,7 +442,7 @@ describeIfSelected('Office Hours Spec Review E2E', ['office-hours-spec-review'],
|
||||
try { fs.rmSync(ohDir, { recursive: true, force: true }); } catch {}
|
||||
});
|
||||
|
||||
test('/office-hours SKILL.md contains spec review loop', async () => {
|
||||
testConcurrentIfSelected('office-hours-spec-review', async () => {
|
||||
const result = await runSkillTest({
|
||||
prompt: `Read office-hours/SKILL.md. I want to understand the spec review loop.
|
||||
|
||||
@@ -502,7 +502,7 @@ describeIfSelected('Plan CEO Review Benefits-From E2E', ['plan-ceo-review-benefi
|
||||
try { fs.rmSync(benefitsDir, { recursive: true, force: true }); } catch {}
|
||||
});
|
||||
|
||||
test('/plan-ceo-review SKILL.md contains prerequisite skill offer', async () => {
|
||||
testConcurrentIfSelected('plan-ceo-review-benefits', async () => {
|
||||
const result = await runSkillTest({
|
||||
prompt: `Read plan-ceo-review/SKILL.md. Search for sections about "Prerequisite" or "office-hours" or "design doc found".
|
||||
|
||||
|
||||
@@ -4,7 +4,7 @@ import { outcomeJudge } from './helpers/llm-judge';
|
||||
import { judgePassed } from './helpers/eval-store';
|
||||
import {
|
||||
ROOT, browseBin, runId, evalsEnabled, selectedTests, hasApiKey,
|
||||
describeIfSelected, describeE2E,
|
||||
describeIfSelected, describeE2E, testConcurrentIfSelected,
|
||||
copyDirSync, setupBrowseShims, logCost, recordE2E, dumpOutcomeDiagnostic,
|
||||
createEvalCollector, finalizeEvalCollector,
|
||||
} from './helpers/e2e-helpers';
|
||||
@@ -172,17 +172,17 @@ CRITICAL RULES:
|
||||
}
|
||||
|
||||
// B6: Static dashboard — broken link, disabled submit, overflow, missing alt, console error
|
||||
test('/qa finds >= 2 of 5 planted bugs (static)', async () => {
|
||||
testConcurrentIfSelected('qa-b6-static', async () => {
|
||||
await runPlantedBugEval('qa-eval.html', 'qa-eval-ground-truth.json', 'b6-static');
|
||||
}, 360_000);
|
||||
|
||||
// B7: SPA — broken route, stale state, async race, missing aria, console warning
|
||||
test('/qa finds >= 2 of 5 planted SPA bugs', async () => {
|
||||
testConcurrentIfSelected('qa-b7-spa', async () => {
|
||||
await runPlantedBugEval('qa-eval-spa.html', 'qa-eval-spa-ground-truth.json', 'b7-spa');
|
||||
}, 360_000);
|
||||
|
||||
// B8: Checkout — email regex, NaN total, CC overflow, missing required, stripe error
|
||||
test('/qa finds >= 2 of 5 planted checkout bugs', async () => {
|
||||
testConcurrentIfSelected('qa-b8-checkout', async () => {
|
||||
await runPlantedBugEval('qa-eval-checkout.html', 'qa-eval-checkout-ground-truth.json', 'b8-checkout');
|
||||
}, 360_000);
|
||||
|
||||
|
||||
@@ -37,7 +37,7 @@ describeIfSelected('QA skill E2E', ['qa-quick'], () => {
|
||||
try { fs.rmSync(qaDir, { recursive: true, force: true }); } catch {}
|
||||
});
|
||||
|
||||
test('/qa quick completes without browse errors', async () => {
|
||||
testConcurrentIfSelected('qa-quick', async () => {
|
||||
const result = await runSkillTest({
|
||||
prompt: `B="${browseBin}"
|
||||
|
||||
@@ -108,7 +108,7 @@ describeIfSelected('QA-Only skill E2E', ['qa-only-no-fix'], () => {
|
||||
try { fs.rmSync(qaOnlyDir, { recursive: true, force: true }); } catch {}
|
||||
});
|
||||
|
||||
test('/qa-only produces report without using Edit tool', async () => {
|
||||
testConcurrentIfSelected('qa-only-no-fix', async () => {
|
||||
const result = await runSkillTest({
|
||||
prompt: `IMPORTANT: The browse binary is already assigned below as B. Do NOT search for it or run the SKILL.md setup block — just use $B directly.
|
||||
|
||||
@@ -227,7 +227,7 @@ describeIfSelected('QA Fix Loop E2E', ['qa-fix-loop'], () => {
|
||||
try { fs.rmSync(qaFixDir, { recursive: true, force: true }); } catch {}
|
||||
});
|
||||
|
||||
test('/qa fix loop finds bugs and commits fixes', async () => {
|
||||
testConcurrentIfSelected('qa-fix-loop', async () => {
|
||||
const qaFixUrl = `http://127.0.0.1:${qaFixServer!.port}`;
|
||||
|
||||
const result = await runSkillTest({
|
||||
|
||||
@@ -51,7 +51,7 @@ describeIfSelected('Review skill E2E', ['review-sql-injection'], () => {
|
||||
try { fs.rmSync(reviewDir, { recursive: true, force: true }); } catch {}
|
||||
});
|
||||
|
||||
test('/review produces findings on SQL injection branch', async () => {
|
||||
testConcurrentIfSelected('review-sql-injection', async () => {
|
||||
const result = await runSkillTest({
|
||||
prompt: `You are in a git repo on a feature branch with changes against main.
|
||||
Read review-SKILL.md for the review workflow instructions.
|
||||
@@ -125,7 +125,7 @@ describeIfSelected('Review enum completeness E2E', ['review-enum-completeness'],
|
||||
try { fs.rmSync(enumDir, { recursive: true, force: true }); } catch {}
|
||||
});
|
||||
|
||||
test('/review catches missing enum handlers for new status value', async () => {
|
||||
testConcurrentIfSelected('review-enum-completeness', async () => {
|
||||
const result = await runSkillTest({
|
||||
prompt: `You are in a git repo on branch feature/add-returned-status with changes against main.
|
||||
Read review-SKILL.md for the review workflow instructions.
|
||||
@@ -200,7 +200,7 @@ describeIfSelected('Review design lite E2E', ['review-design-lite'], () => {
|
||||
try { fs.rmSync(designDir, { recursive: true, force: true }); } catch {}
|
||||
});
|
||||
|
||||
test('/review catches design anti-patterns in CSS/HTML diff', async () => {
|
||||
testConcurrentIfSelected('review-design-lite', async () => {
|
||||
const result = await runSkillTest({
|
||||
prompt: `You are in a git repo on branch feature/add-landing-page with changes against main.
|
||||
Read review-SKILL.md for the review workflow instructions.
|
||||
@@ -497,7 +497,7 @@ describeIfSelected('Retro E2E', ['retro'], () => {
|
||||
try { fs.rmSync(retroDir, { recursive: true, force: true }); } catch {}
|
||||
});
|
||||
|
||||
test('/retro produces analysis from git history', async () => {
|
||||
testConcurrentIfSelected('retro', async () => {
|
||||
const result = await runSkillTest({
|
||||
prompt: `Read retro/SKILL.md for instructions on how to run a retrospective.
|
||||
|
||||
|
||||
@@ -60,7 +60,7 @@ describeIfSelected('Document-Release skill E2E', ['document-release'], () => {
|
||||
try { fs.rmSync(docReleaseDir, { recursive: true, force: true }); } catch {}
|
||||
});
|
||||
|
||||
test('/document-release updates docs without clobbering CHANGELOG', async () => {
|
||||
testConcurrentIfSelected('document-release', async () => {
|
||||
const result = await runSkillTest({
|
||||
prompt: `Read the file document-release/SKILL.md for the document-release workflow instructions.
|
||||
|
||||
@@ -461,7 +461,7 @@ describe('processPayment', () => {
|
||||
try { fs.rmSync(coverageDir, { recursive: true, force: true }); } catch {}
|
||||
});
|
||||
|
||||
test('/ship Step 3.4 produces coverage diagram', async () => {
|
||||
testConcurrentIfSelected('ship-coverage-audit', async () => {
|
||||
const result = await runSkillTest({
|
||||
prompt: `Read the file ship/SKILL.md for the ship workflow instructions.
|
||||
|
||||
@@ -544,7 +544,7 @@ describeIfSelected('Codex skill E2E', ['codex-review'], () => {
|
||||
try { fs.rmSync(codexDir, { recursive: true, force: true }); } catch {}
|
||||
});
|
||||
|
||||
test('/codex review produces findings and GATE verdict', async () => {
|
||||
testConcurrentIfSelected('codex-review', async () => {
|
||||
// Check codex is available — skip if not installed
|
||||
const codexCheck = spawnSync('which', ['codex'], { stdio: 'pipe', timeout: 3000 });
|
||||
if (codexCheck.status !== 0) {
|
||||
|
||||
@@ -56,7 +56,7 @@ function describeIfSelected(name: string, testNames: string[], fn: () => void) {
|
||||
/** Skip an individual test if not selected (for multi-test describe blocks). */
|
||||
function testIfSelected(testName: string, fn: () => Promise<void>, timeout: number) {
|
||||
const shouldRun = selectedTests === null || selectedTests.includes(testName);
|
||||
(shouldRun ? test : test.skip)(testName, fn, timeout);
|
||||
(shouldRun ? test.concurrent : test.skip)(testName, fn, timeout);
|
||||
}
|
||||
|
||||
describeIfSelected('LLM-as-judge quality evals', [
|
||||
|
||||
Reference in New Issue
Block a user