fix: three flaky E2E test fixes

- evals-periodic.yml: hardcode runner (matrix objects don't define
  'runner' property, actionlint catches the error)
- Remove setup-cookies-detect E2E: redundant with 30+ unit tests in
  browse/test/cookie-import-browser.test.ts; E2E just tested LLM
  instruction-following on a CI box with no browsers
- ship-local-workflow: check branch existence on remote instead of
  counting commits (fragile with bare repos + --all)
This commit is contained in:
Garry Tan
2026-03-24 14:19:25 -07:00
parent 8763787cdf
commit 06063eef24
3 changed files with 13 additions and 72 deletions
+1 -1
View File
@@ -56,7 +56,7 @@ jobs:
${{ env.IMAGE }}:latest
evals:
runs-on: ${{ matrix.suite.runner || 'ubicloud-standard-2' }}
runs-on: ubicloud-standard-2
needs: build-image
container:
image: ${{ needs.build-image.outputs.image-tag }}
-6
View File
@@ -80,9 +80,6 @@ export const E2E_TOUCHFILES: Record<string, string[]> = {
'ship-base-branch': ['ship/**', 'bin/gstack-repo-mode'],
'ship-local-workflow': ['ship/**', 'scripts/gen-skill-docs.ts'],
// Setup browser cookies
'setup-cookies-detect': ['setup-browser-cookies/**'],
// Retro
'retro': ['retro/**'],
'retro-base-branch': ['retro/**'],
@@ -207,9 +204,6 @@ export const E2E_TIERS: Record<string, 'gate' | 'periodic'> = {
'ship-coverage-audit': 'gate',
'ship-triage': 'gate',
// Setup browser cookies
'setup-cookies-detect': 'gate',
// Retro — gate for cheap branch detection, periodic for full Opus retro
'retro': 'periodic',
'retro-base-branch': 'gate',
+12 -65
View File
@@ -175,83 +175,30 @@ describeIfSelected('Ship workflow E2E', ['ship-local-workflow'], () => {
logCost('/ship local workflow', result);
// Check push succeeded — check the feature branch on the bare remote
// (bare repo HEAD points to main which only has 1 commit; the push goes to feature/ship-test)
const remoteLog = spawnSync('git', ['log', '--oneline', '--all'], { cwd: shipRemoteDir, stdio: 'pipe' });
const remoteCommits = remoteLog.stdout.toString().trim().split('\n').filter(l => l.length > 0).length;
// Check push succeeded — verify the feature branch exists on the bare remote
const branchCheck = spawnSync('git', ['branch', '--list', 'feature/ship-test'], { cwd: shipRemoteDir, stdio: 'pipe' });
const branchExists = branchCheck.stdout.toString().trim().length > 0;
// Check VERSION was bumped
// Check VERSION was bumped locally (even if push failed, this shows the LLM did the work)
const versionContent = fs.existsSync(path.join(shipWorkDir, 'VERSION'))
? fs.readFileSync(path.join(shipWorkDir, 'VERSION'), 'utf-8').trim() : '';
const versionBumped = versionContent !== '0.1.0.0';
recordE2E(evalCollector, '/ship local workflow', 'Ship workflow E2E', result, {
passed: remoteCommits > 1 && ['success', 'error_max_turns'].includes(result.exitReason),
passed: branchExists && versionBumped && ['success', 'error_max_turns'].includes(result.exitReason),
});
expect(['success', 'error_max_turns']).toContain(result.exitReason);
expect(remoteCommits).toBeGreaterThan(1);
console.log(`Remote commits: ${remoteCommits}, VERSION: ${versionContent}, bumped: ${versionBumped}`);
expect(branchExists).toBe(true);
expect(versionBumped).toBe(true);
console.log(`Branch pushed: ${branchExists}, VERSION: ${versionContent}, bumped: ${versionBumped}`);
}, 150_000);
});
// --- Browser cookie detection smoke test ---
describeIfSelected('Setup Browser Cookies E2E', ['setup-cookies-detect'], () => {
let cookieDir: string;
beforeAll(() => {
cookieDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-cookies-'));
// Copy skill files
fs.mkdirSync(path.join(cookieDir, 'setup-browser-cookies'), { recursive: true });
fs.copyFileSync(
path.join(ROOT, 'setup-browser-cookies', 'SKILL.md'),
path.join(cookieDir, 'setup-browser-cookies', 'SKILL.md'),
);
});
afterAll(() => {
try { fs.rmSync(cookieDir, { recursive: true, force: true }); } catch {}
});
testConcurrentIfSelected('setup-cookies-detect', async () => {
const result = await runSkillTest({
prompt: `Read setup-browser-cookies/SKILL.md for the cookie import workflow.
This is a test environment. Check which browsers exist on this system by looking for their cookie database files.
IMPORTANT: You MUST write a file called ${cookieDir}/detected-browsers.md with your findings.
If you find browsers, list them. If you find NO browsers, write "No browsers detected" to the file.
The file must always be created regardless of results.
Do NOT launch the cookie picker UI — just detect and report.`,
workingDirectory: cookieDir,
maxTurns: 8,
timeout: 60_000,
testName: 'setup-cookies-detect',
runId,
});
logCost('/setup-browser-cookies detect', result);
const detectPath = path.join(cookieDir, 'detected-browsers.md');
const detectExists = fs.existsSync(detectPath);
const detectContent = detectExists ? fs.readFileSync(detectPath, 'utf-8') : '';
const hasBrowserName = /chrome|arc|brave|edge|comet|safari|firefox/i.test(detectContent);
const hasNoBrowsers = /no browser|none|not found|not detected|could not|couldn't/i.test(detectContent);
// On CI (headless Ubuntu), no browsers are installed — "no browsers detected" is valid
const contentValid = hasBrowserName || hasNoBrowsers;
recordE2E(evalCollector, '/setup-browser-cookies detect', 'Setup Browser Cookies E2E', result, {
passed: detectExists && contentValid && ['success', 'error_max_turns'].includes(result.exitReason),
});
expect(['success', 'error_max_turns']).toContain(result.exitReason);
expect(detectExists).toBe(true);
if (detectExists) {
expect(contentValid).toBe(true);
}
}, 90_000);
});
// setup-cookies-detect REMOVED: The cookie-import-browser module has 30+ thorough
// unit tests in browse/test/cookie-import-browser.test.ts (decryption, profile
// detection, error handling, path traversal). The E2E just tested LLM instruction-
// following ("write a file saying no browsers") on a CI box with no browsers.
// --- gstack-upgrade E2E ---