fix: three flaky E2E test fixes

ship-local-workflow: Use `git log --all` on bare remote so we count
commits on feature/ship-test, not just HEAD (main).

setup-cookies-detect: Accept "no browsers detected" as valid on CI
(headless Ubuntu has no browser cookie databases). Increase maxTurns
from 5→8 and make prompt explicit about always writing the file.

routing tests: Apply EVALS_TIER filtering — all routing tests are
periodic but the file had no tier awareness, so they ran under
EVALS_TIER=gate in CI and failed non-deterministically.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Garry Tan
2026-03-24 14:08:38 -07:00
parent 91387d4766
commit 8763787cdf
2 changed files with 53 additions and 22 deletions
+36 -12
View File
@@ -3,7 +3,7 @@ import { runSkillTest } from './helpers/session-runner';
import type { SkillTestResult } from './helpers/session-runner';
import { EvalCollector } from './helpers/eval-store';
import type { EvalTestEntry } from './helpers/eval-store';
import { selectTests, detectBaseBranch, getChangedFiles, E2E_TOUCHFILES, GLOBAL_TOUCHFILES } from './helpers/touchfiles';
import { selectTests, detectBaseBranch, getChangedFiles, E2E_TOUCHFILES, E2E_TIERS, GLOBAL_TOUCHFILES } from './helpers/touchfiles';
import { spawnSync } from 'child_process';
import * as fs from 'fs';
import * as path from 'path';
@@ -42,6 +42,21 @@ if (evalsEnabled && !process.env.EVALS_ALL) {
}
}
// Apply EVALS_TIER filter (same logic as e2e-helpers.ts)
if (evalsEnabled && process.env.EVALS_TIER) {
const tier = process.env.EVALS_TIER as 'gate' | 'periodic';
const tierTests = Object.entries(E2E_TIERS)
.filter(([, t]) => t === tier)
.map(([name]) => name);
if (selectedTests === null) {
selectedTests = tierTests;
} else {
selectedTests = selectedTests.filter(t => tierTests.includes(t));
}
process.stderr.write(`Routing EVALS_TIER=${tier}: ${selectedTests.length} tests\n\n`);
}
// --- Helper functions ---
/** Copy all SKILL.md files for auto-discovery.
@@ -140,6 +155,15 @@ function recordRouting(name: string, result: SkillTestResult, expectedSkill: str
});
}
// Skip individual tests based on selectedTests (diff + tier filtering)
const testIfSelected = (name: string, fn: () => Promise<void>, timeout?: number) => {
if (selectedTests !== null && !selectedTests.includes(name)) {
test.skip(name, () => {});
} else {
test.concurrent(name, fn, timeout);
}
};
// --- Tests ---
describeE2E('Skill Routing E2E — Developer Journey', () => {
@@ -147,7 +171,7 @@ describeE2E('Skill Routing E2E — Developer Journey', () => {
evalCollector?.finalize();
});
test.concurrent('journey-ideation', async () => {
testIfSelected('journey-ideation', async () => {
const tmpDir = createRoutingWorkDir('ideation');
try {
@@ -176,7 +200,7 @@ describeE2E('Skill Routing E2E — Developer Journey', () => {
}
}, 150_000);
test.concurrent('journey-plan-eng', async () => {
testIfSelected('journey-plan-eng', async () => {
const tmpDir = createRoutingWorkDir('plan-eng');
try {
fs.writeFileSync(path.join(tmpDir, 'plan.md'), `# Waitlist App Architecture
@@ -226,7 +250,7 @@ describeE2E('Skill Routing E2E — Developer Journey', () => {
}
}, 150_000);
test.concurrent('journey-think-bigger', async () => {
testIfSelected('journey-think-bigger', async () => {
const tmpDir = createRoutingWorkDir('think-bigger');
try {
fs.writeFileSync(path.join(tmpDir, 'plan.md'), `# Waitlist App Architecture
@@ -277,7 +301,7 @@ describeE2E('Skill Routing E2E — Developer Journey', () => {
}
}, 180_000);
test.concurrent('journey-debug', async () => {
testIfSelected('journey-debug', async () => {
const tmpDir = createRoutingWorkDir('debug');
try {
const run = (cmd: string, args: string[]) =>
@@ -335,7 +359,7 @@ export default app;
}
}, 150_000);
test.concurrent('journey-qa', async () => {
testIfSelected('journey-qa', async () => {
const tmpDir = createRoutingWorkDir('qa');
try {
fs.writeFileSync(path.join(tmpDir, 'package.json'), JSON.stringify({ name: 'waitlist-app', scripts: { dev: 'next dev' } }, null, 2));
@@ -371,7 +395,7 @@ export default app;
}
}, 150_000);
test.concurrent('journey-code-review', async () => {
testIfSelected('journey-code-review', async () => {
const tmpDir = createRoutingWorkDir('code-review');
try {
const run = (cmd: string, args: string[]) =>
@@ -411,7 +435,7 @@ export default app;
}
}, 150_000);
test.concurrent('journey-ship', async () => {
testIfSelected('journey-ship', async () => {
const tmpDir = createRoutingWorkDir('ship');
try {
const run = (cmd: string, args: string[]) =>
@@ -450,7 +474,7 @@ export default app;
}
}, 150_000);
test.concurrent('journey-docs', async () => {
testIfSelected('journey-docs', async () => {
const tmpDir = createRoutingWorkDir('docs');
try {
const run = (cmd: string, args: string[]) =>
@@ -487,7 +511,7 @@ export default app;
}
}, 150_000);
test.concurrent('journey-retro', async () => {
testIfSelected('journey-retro', async () => {
const tmpDir = createRoutingWorkDir('retro');
try {
const run = (cmd: string, args: string[]) =>
@@ -530,7 +554,7 @@ export default app;
}
}, 150_000);
test.concurrent('journey-design-system', async () => {
testIfSelected('journey-design-system', async () => {
const tmpDir = createRoutingWorkDir('design-system');
try {
@@ -559,7 +583,7 @@ export default app;
}
}, 150_000);
test.concurrent('journey-visual-qa', async () => {
testIfSelected('journey-visual-qa', async () => {
const tmpDir = createRoutingWorkDir('visual-qa');
try {
const run = (cmd: string, args: string[]) =>