From 5d023d269fe67a7f3207c1c1e03fbc5f93995af7 Mon Sep 17 00:00:00 2001 From: Garry Tan Date: Thu, 26 Mar 2026 22:11:34 -0600 Subject: [PATCH] fix: stabilize flaky E2E tests (browse-basic, ship-base-branch, dashboard-via) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit browse-basic: bump maxTurns 5→7 (agent reads PNG per SKILL.md instruction) ship-base-branch: extract Step 0 only instead of full 1900-line ship/SKILL.md dashboard-via: extract dashboard section only + increase timeout 90s→180s Root cause: copying full SKILL.md files into test fixtures caused context bloat, leading to timeouts and flaky turn limits. Extracting only the relevant section cut dashboard-via from timing out at 240s to finishing in 38s. Co-Authored-By: Claude Opus 4.6 (1M context) --- test/skill-e2e-bws.test.ts | 2 +- test/skill-e2e-review.test.ts | 34 ++++++++++++++++++++-------------- 2 files changed, 21 insertions(+), 15 deletions(-) diff --git a/test/skill-e2e-bws.test.ts b/test/skill-e2e-bws.test.ts index 8c0d4a42..6a611fe7 100644 --- a/test/skill-e2e-bws.test.ts +++ b/test/skill-e2e-bws.test.ts @@ -45,7 +45,7 @@ describeIfSelected('Skill E2E tests', [ 4. $B screenshot /tmp/skill-e2e-test.png Report the results of each command.`, workingDirectory: tmpDir, - maxTurns: 5, + maxTurns: 7, timeout: 60_000, testName: 'browse-basic', runId, diff --git a/test/skill-e2e-review.test.ts b/test/skill-e2e-review.test.ts index b5ad501c..dacd4b16 100644 --- a/test/skill-e2e-review.test.ts +++ b/test/skill-e2e-review.test.ts @@ -340,21 +340,22 @@ Write your findings to ${dir}/review-output.md`, run('git', ['add', 'app.ts'], dir); run('git', ['commit', '-m', 'feat: update to v2'], dir); - // Copy ship skill - fs.copyFileSync(path.join(ROOT, 'ship', 'SKILL.md'), path.join(dir, 'ship-SKILL.md')); + // Extract only Step 0 (base branch detection) from ship/SKILL.md + // (copying the full 1900-line file causes agent context bloat and flaky timeouts) + const fullShipSkill = fs.readFileSync(path.join(ROOT, 'ship', 'SKILL.md'), 'utf-8'); + const step0Start = fullShipSkill.indexOf('## Step 0: Detect platform and base branch'); + const step0End = fullShipSkill.indexOf('## Step 1: Pre-flight'); + const shipSection = fullShipSkill.slice(step0Start, step0End > step0Start ? step0End : undefined); + fs.writeFileSync(path.join(dir, 'ship-SKILL.md'), shipSection); const result = await runSkillTest({ - prompt: `Read ship-SKILL.md for the ship workflow. + prompt: `Read ship-SKILL.md. It contains Step 0 (Detect base branch) from the ship workflow. -Skip the preamble bash block, lake intro, telemetry, and contributor mode sections — go straight to Step 0. +Run the base branch detection. Since there is no remote, gh commands will fail — fall back to main. -Run ONLY Step 0 (Detect base branch) and Step 1 (Pre-flight) from the ship workflow. -Since there is no remote, gh commands will fail — fall back to main. +Then run git diff and git log against the detected base branch. -After completing Step 0 and Step 1, STOP. Do NOT proceed to Step 2 or beyond. -Do NOT push, create PRs, or modify VERSION/CHANGELOG. - -Write a summary of what you detected to ${dir}/ship-preflight.md including: +Write a summary to ${dir}/ship-preflight.md including: - The detected base branch name - The current branch name - The diff stat against the base branch`, @@ -580,8 +581,13 @@ describeIfSelected('Review Dashboard Via Attribution', ['review-dashboard-via'], ].join('\n')); fs.chmodSync(path.join(mockBinDir, 'gstack-review-read'), 0o755); - // Copy ship skill - fs.copyFileSync(path.join(ROOT, 'ship', 'SKILL.md'), path.join(dashDir, 'ship-SKILL.md')); + // Extract only the Review Readiness Dashboard section from ship/SKILL.md + // (copying the full 1900-line file causes agent context bloat and timeouts) + const fullSkill = fs.readFileSync(path.join(ROOT, 'ship', 'SKILL.md'), 'utf-8'); + const dashStart = fullSkill.indexOf('## Review Readiness Dashboard'); + const dashEnd = fullSkill.indexOf('\n---\n', dashStart); + const dashSection = fullSkill.slice(dashStart, dashEnd > dashStart ? dashEnd : undefined); + fs.writeFileSync(path.join(dashDir, 'ship-SKILL.md'), dashSection); }); afterAll(() => { @@ -605,7 +611,7 @@ Skip the preamble, lake intro, telemetry, and all other ship steps. Write the dashboard output to ${dashDir}/dashboard-output.md`, workingDirectory: dashDir, maxTurns: 12, - timeout: 90_000, + timeout: 180_000, testName: 'review-dashboard-via', runId, }); @@ -639,7 +645,7 @@ Write the dashboard output to ${dashDir}/dashboard-output.md`, ); // Ship dashboard should not gate when eng review is clear expect(gateQuestions).toHaveLength(0); - }, 120_000); + }, 240_000); }); // Module-level afterAll — finalize eval collector after all tests complete