fix: stabilize flaky E2E tests (browse-basic, ship-base-branch, dashboard-via)

browse-basic: bump maxTurns 5→7 (the agent spends extra turns reading the PNG, per the SKILL.md instruction)
ship-base-branch: extract Step 0 only instead of copying the full 1900-line ship/SKILL.md
dashboard-via: extract the dashboard section only + increase timeout 90s→180s (per-test timeout 120s→240s)

Root cause: copying full SKILL.md files into test fixtures caused context bloat, leading to timeouts and flaky turn limits. Extracting only the relevant section cut dashboard-via from timing out at 240s to finishing in 38s.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-05-05 13:15:24 +02:00 · 2026-03-26 22:11:34 -06:00
parent eae56ff62a
commit 5d023d269f
2 changed files with 21 additions and 15 deletions
@@ -45,7 +45,7 @@ describeIfSelected('Skill E2E tests', [
 4. $B screenshot /tmp/skill-e2e-test.png
 Report the results of each command.`,
      workingDirectory: tmpDir,
-      maxTurns: 5,
+      maxTurns: 7,
      timeout: 60_000,
      testName: 'browse-basic',
      runId,
@@ -340,21 +340,22 @@ Write your findings to ${dir}/review-output.md`,
    run('git', ['add', 'app.ts'], dir);
    run('git', ['commit', '-m', 'feat: update to v2'], dir);

-    // Copy ship skill
-    fs.copyFileSync(path.join(ROOT, 'ship', 'SKILL.md'), path.join(dir, 'ship-SKILL.md'));
+    // Extract only Step 0 (base branch detection) from ship/SKILL.md
+    // (copying the full 1900-line file causes agent context bloat and flaky timeouts)
+    const fullShipSkill = fs.readFileSync(path.join(ROOT, 'ship', 'SKILL.md'), 'utf-8');
+    const step0Start = fullShipSkill.indexOf('## Step 0: Detect platform and base branch');
+    const step0End = fullShipSkill.indexOf('## Step 1: Pre-flight');
+    const shipSection = fullShipSkill.slice(step0Start, step0End > step0Start ? step0End : undefined);
+    fs.writeFileSync(path.join(dir, 'ship-SKILL.md'), shipSection);

    const result = await runSkillTest({
-      prompt: `Read ship-SKILL.md for the ship workflow.
+      prompt: `Read ship-SKILL.md. It contains Step 0 (Detect base branch) from the ship workflow.

-Skip the preamble bash block, lake intro, telemetry, and contributor mode sections — go straight to Step 0.
+Run the base branch detection. Since there is no remote, gh commands will fail — fall back to main.

-Run ONLY Step 0 (Detect base branch) and Step 1 (Pre-flight) from the ship workflow.
-Since there is no remote, gh commands will fail — fall back to main.
+Then run git diff and git log against the detected base branch.

-After completing Step 0 and Step 1, STOP. Do NOT proceed to Step 2 or beyond.
-Do NOT push, create PRs, or modify VERSION/CHANGELOG.
-
-Write a summary of what you detected to ${dir}/ship-preflight.md including:
+Write a summary to ${dir}/ship-preflight.md including:
 - The detected base branch name
 - The current branch name
 - The diff stat against the base branch`,
@@ -580,8 +581,13 @@ describeIfSelected('Review Dashboard Via Attribution', ['review-dashboard-via'],
    ].join('\n'));
    fs.chmodSync(path.join(mockBinDir, 'gstack-review-read'), 0o755);

-    // Copy ship skill
-    fs.copyFileSync(path.join(ROOT, 'ship', 'SKILL.md'), path.join(dashDir, 'ship-SKILL.md'));
+    // Extract only the Review Readiness Dashboard section from ship/SKILL.md
+    // (copying the full 1900-line file causes agent context bloat and timeouts)
+    const fullSkill = fs.readFileSync(path.join(ROOT, 'ship', 'SKILL.md'), 'utf-8');
+    const dashStart = fullSkill.indexOf('## Review Readiness Dashboard');
+    const dashEnd = fullSkill.indexOf('\n---\n', dashStart);
+    const dashSection = fullSkill.slice(dashStart, dashEnd > dashStart ? dashEnd : undefined);
+    fs.writeFileSync(path.join(dashDir, 'ship-SKILL.md'), dashSection);
  });

  afterAll(() => {
@@ -605,7 +611,7 @@ Skip the preamble, lake intro, telemetry, and all other ship steps.
 Write the dashboard output to ${dashDir}/dashboard-output.md`,
      workingDirectory: dashDir,
      maxTurns: 12,
-      timeout: 90_000,
+      timeout: 180_000,
      testName: 'review-dashboard-via',
      runId,
    });
@@ -639,7 +645,7 @@ Write the dashboard output to ${dashDir}/dashboard-output.md`,
    );
    // Ship dashboard should not gate when eng review is clear
    expect(gateQuestions).toHaveLength(0);
-  }, 120_000);
+  }, 240_000);
 });

 // Module-level afterAll — finalize eval collector after all tests complete