fix: stabilize three flaky E2E tests

ship-local-workflow: Use `git log --all` on bare remote so we count
commits on feature/ship-test, not just HEAD (main).

setup-cookies-detect: Accept "no browsers detected" as valid on CI
(headless Ubuntu has no browser cookie databases). Increase maxTurns
from 5→8 and the timeouts (session 45s→60s, test 60s→90s), and make the prompt explicit about always writing the file.

routing tests: Apply EVALS_TIER filtering — all routing tests are
periodic but the file had no tier awareness, so they ran under
EVALS_TIER=gate in CI and failed non-deterministically.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Garry Tan
2026-03-24 14:08:38 -07:00
parent 91387d4766
commit 8763787cdf
2 changed files with 53 additions and 22 deletions
+17 -10
View File
@@ -175,9 +175,10 @@ describeIfSelected('Ship workflow E2E', ['ship-local-workflow'], () => {
logCost('/ship local workflow', result);
// Check push succeeded
const remoteLog = spawnSync('git', ['log', '--oneline'], { cwd: shipRemoteDir, stdio: 'pipe' });
const remoteCommits = remoteLog.stdout.toString().trim().split('\n').length;
// Check push succeeded — check the feature branch on the bare remote
// (bare repo HEAD points to main which only has 1 commit; the push goes to feature/ship-test)
const remoteLog = spawnSync('git', ['log', '--oneline', '--all'], { cwd: shipRemoteDir, stdio: 'pipe' });
const remoteCommits = remoteLog.stdout.toString().trim().split('\n').filter(l => l.length > 0).length;
// Check VERSION was bumped
const versionContent = fs.existsSync(path.join(shipWorkDir, 'VERSION'))
@@ -217,12 +218,14 @@ describeIfSelected('Setup Browser Cookies E2E', ['setup-cookies-detect'], () =>
const result = await runSkillTest({
prompt: `Read setup-browser-cookies/SKILL.md for the cookie import workflow.
This is a test environment. List which browsers you can detect on this system by checking for their cookie database files.
Write the detected browsers to ${cookieDir}/detected-browsers.md.
This is a test environment. Check which browsers exist on this system by looking for their cookie database files.
IMPORTANT: You MUST write a file called ${cookieDir}/detected-browsers.md with your findings.
If you find browsers, list them. If you find NO browsers, write "No browsers detected" to the file.
The file must always be created regardless of results.
Do NOT launch the cookie picker UI — just detect and report.`,
workingDirectory: cookieDir,
maxTurns: 5,
timeout: 45_000,
maxTurns: 8,
timeout: 60_000,
testName: 'setup-cookies-detect',
runId,
});
@@ -233,17 +236,21 @@ Do NOT launch the cookie picker UI — just detect and report.`,
const detectExists = fs.existsSync(detectPath);
const detectContent = detectExists ? fs.readFileSync(detectPath, 'utf-8') : '';
const hasBrowserName = /chrome|arc|brave|edge|comet|safari|firefox/i.test(detectContent);
const hasNoBrowsers = /no browser|none|not found|not detected|could not|couldn't/i.test(detectContent);
// On CI (headless Ubuntu), no browsers are installed — "no browsers detected" is valid
const contentValid = hasBrowserName || hasNoBrowsers;
recordE2E(evalCollector, '/setup-browser-cookies detect', 'Setup Browser Cookies E2E', result, {
passed: detectExists && hasBrowserName && ['success', 'error_max_turns'].includes(result.exitReason),
passed: detectExists && contentValid && ['success', 'error_max_turns'].includes(result.exitReason),
});
expect(['success', 'error_max_turns']).toContain(result.exitReason);
expect(detectExists).toBe(true);
if (detectExists) {
expect(hasBrowserName).toBe(true);
expect(contentValid).toBe(true);
}
}, 60_000);
}, 90_000);
});
// --- gstack-upgrade E2E ---
+36 -12
View File
@@ -3,7 +3,7 @@ import { runSkillTest } from './helpers/session-runner';
import type { SkillTestResult } from './helpers/session-runner';
import { EvalCollector } from './helpers/eval-store';
import type { EvalTestEntry } from './helpers/eval-store';
import { selectTests, detectBaseBranch, getChangedFiles, E2E_TOUCHFILES, GLOBAL_TOUCHFILES } from './helpers/touchfiles';
import { selectTests, detectBaseBranch, getChangedFiles, E2E_TOUCHFILES, E2E_TIERS, GLOBAL_TOUCHFILES } from './helpers/touchfiles';
import { spawnSync } from 'child_process';
import * as fs from 'fs';
import * as path from 'path';
@@ -42,6 +42,21 @@ if (evalsEnabled && !process.env.EVALS_ALL) {
}
}
// Apply EVALS_TIER filter (same selection logic as e2e-helpers.ts).
// Keeps only the tests whose E2E_TIERS entry equals the requested tier. When an
// earlier selection already exists (selectedTests !== null), the two selections
// are intersected; otherwise the tier's tests become the selection.
if (evalsEnabled && process.env.EVALS_TIER) {
  // Deliberately a plain string: the env var is unvalidated input, so asserting
  // it to a tier union would only hide typos from the compiler.
  const tier: string = process.env.EVALS_TIER;
  const tierTests = new Set(
    Object.entries(E2E_TIERS)
      .filter(([, testTier]) => testTier === tier)
      .map(([name]) => name),
  );

  if (tierTests.size === 0) {
    // Nothing in E2E_TIERS uses this tier, so every test below will be skipped.
    // Say so explicitly instead of letting the run look like a clean pass.
    process.stderr.write(`Routing EVALS_TIER=${tier}: no E2E_TIERS entry uses this tier — check for a typo\n`);
  }

  selectedTests = selectedTests === null
    ? [...tierTests]
    : selectedTests.filter(t => tierTests.has(t));

  process.stderr.write(`Routing EVALS_TIER=${tier}: ${selectedTests.length} tests\n\n`);
}
// --- Helper functions ---
/** Copy all SKILL.md files for auto-discovery.
@@ -140,6 +155,15 @@ function recordRouting(name: string, result: SkillTestResult, expectedSkill: str
});
}
/**
 * Registers a routing test, honoring the current selection.
 *
 * When `selectedTests` is null (no filtering) or contains `name`, the test is
 * registered with `test.concurrent`; otherwise it is registered as skipped so
 * it still shows up in the report.
 *
 * @param name    Test name; also the key looked up in `selectedTests`.
 * @param fn      Async test body.
 * @param timeout Optional timeout forwarded unchanged to `test.concurrent`.
 */
const testIfSelected = (name: string, fn: () => Promise<void>, timeout?: number): void => {
  const isSelected = selectedTests === null || selectedTests.includes(name);
  if (isSelected) {
    test.concurrent(name, fn, timeout);
    return;
  }
  // Filtered out by the active selection: register as skipped with a no-op body.
  test.skip(name, () => {});
};
// --- Tests ---
describeE2E('Skill Routing E2E — Developer Journey', () => {
@@ -147,7 +171,7 @@ describeE2E('Skill Routing E2E — Developer Journey', () => {
evalCollector?.finalize();
});
test.concurrent('journey-ideation', async () => {
testIfSelected('journey-ideation', async () => {
const tmpDir = createRoutingWorkDir('ideation');
try {
@@ -176,7 +200,7 @@ describeE2E('Skill Routing E2E — Developer Journey', () => {
}
}, 150_000);
test.concurrent('journey-plan-eng', async () => {
testIfSelected('journey-plan-eng', async () => {
const tmpDir = createRoutingWorkDir('plan-eng');
try {
fs.writeFileSync(path.join(tmpDir, 'plan.md'), `# Waitlist App Architecture
@@ -226,7 +250,7 @@ describeE2E('Skill Routing E2E — Developer Journey', () => {
}
}, 150_000);
test.concurrent('journey-think-bigger', async () => {
testIfSelected('journey-think-bigger', async () => {
const tmpDir = createRoutingWorkDir('think-bigger');
try {
fs.writeFileSync(path.join(tmpDir, 'plan.md'), `# Waitlist App Architecture
@@ -277,7 +301,7 @@ describeE2E('Skill Routing E2E — Developer Journey', () => {
}
}, 180_000);
test.concurrent('journey-debug', async () => {
testIfSelected('journey-debug', async () => {
const tmpDir = createRoutingWorkDir('debug');
try {
const run = (cmd: string, args: string[]) =>
@@ -335,7 +359,7 @@ export default app;
}
}, 150_000);
test.concurrent('journey-qa', async () => {
testIfSelected('journey-qa', async () => {
const tmpDir = createRoutingWorkDir('qa');
try {
fs.writeFileSync(path.join(tmpDir, 'package.json'), JSON.stringify({ name: 'waitlist-app', scripts: { dev: 'next dev' } }, null, 2));
@@ -371,7 +395,7 @@ export default app;
}
}, 150_000);
test.concurrent('journey-code-review', async () => {
testIfSelected('journey-code-review', async () => {
const tmpDir = createRoutingWorkDir('code-review');
try {
const run = (cmd: string, args: string[]) =>
@@ -411,7 +435,7 @@ export default app;
}
}, 150_000);
test.concurrent('journey-ship', async () => {
testIfSelected('journey-ship', async () => {
const tmpDir = createRoutingWorkDir('ship');
try {
const run = (cmd: string, args: string[]) =>
@@ -450,7 +474,7 @@ export default app;
}
}, 150_000);
test.concurrent('journey-docs', async () => {
testIfSelected('journey-docs', async () => {
const tmpDir = createRoutingWorkDir('docs');
try {
const run = (cmd: string, args: string[]) =>
@@ -487,7 +511,7 @@ export default app;
}
}, 150_000);
test.concurrent('journey-retro', async () => {
testIfSelected('journey-retro', async () => {
const tmpDir = createRoutingWorkDir('retro');
try {
const run = (cmd: string, args: string[]) =>
@@ -530,7 +554,7 @@ export default app;
}
}, 150_000);
test.concurrent('journey-design-system', async () => {
testIfSelected('journey-design-system', async () => {
const tmpDir = createRoutingWorkDir('design-system');
try {
@@ -559,7 +583,7 @@ export default app;
}
}, 150_000);
test.concurrent('journey-visual-qa', async () => {
testIfSelected('journey-visual-qa', async () => {
const tmpDir = createRoutingWorkDir('visual-qa');
try {
const run = (cmd: string, args: string[]) =>