mirror of
https://github.com/garrytan/gstack.git
synced 2026-05-07 14:06:42 +02:00
fix: three flaky E2E test fixes
- evals-periodic.yml: hardcode the runner (the matrix objects don't define a 'runner' property; actionlint catches the error)
- Remove the setup-cookies-detect E2E: it is redundant with the 30+ unit tests in browse/test/cookie-import-browser.test.ts; the E2E only tested LLM instruction-following on a CI box with no browsers
- ship-local-workflow: check that the branch exists on the remote instead of counting commits (counting is fragile with bare repos + --all)
This commit is contained in:
@@ -56,7 +56,7 @@ jobs:
|
|||||||
${{ env.IMAGE }}:latest
|
${{ env.IMAGE }}:latest
|
||||||
|
|
||||||
evals:
|
evals:
|
||||||
runs-on: ${{ matrix.suite.runner || 'ubicloud-standard-2' }}
|
runs-on: ubicloud-standard-2
|
||||||
needs: build-image
|
needs: build-image
|
||||||
container:
|
container:
|
||||||
image: ${{ needs.build-image.outputs.image-tag }}
|
image: ${{ needs.build-image.outputs.image-tag }}
|
||||||
|
|||||||
@@ -80,9 +80,6 @@ export const E2E_TOUCHFILES: Record<string, string[]> = {
|
|||||||
'ship-base-branch': ['ship/**', 'bin/gstack-repo-mode'],
|
'ship-base-branch': ['ship/**', 'bin/gstack-repo-mode'],
|
||||||
'ship-local-workflow': ['ship/**', 'scripts/gen-skill-docs.ts'],
|
'ship-local-workflow': ['ship/**', 'scripts/gen-skill-docs.ts'],
|
||||||
|
|
||||||
// Setup browser cookies
|
|
||||||
'setup-cookies-detect': ['setup-browser-cookies/**'],
|
|
||||||
|
|
||||||
// Retro
|
// Retro
|
||||||
'retro': ['retro/**'],
|
'retro': ['retro/**'],
|
||||||
'retro-base-branch': ['retro/**'],
|
'retro-base-branch': ['retro/**'],
|
||||||
@@ -207,9 +204,6 @@ export const E2E_TIERS: Record<string, 'gate' | 'periodic'> = {
|
|||||||
'ship-coverage-audit': 'gate',
|
'ship-coverage-audit': 'gate',
|
||||||
'ship-triage': 'gate',
|
'ship-triage': 'gate',
|
||||||
|
|
||||||
// Setup browser cookies
|
|
||||||
'setup-cookies-detect': 'gate',
|
|
||||||
|
|
||||||
// Retro — gate for cheap branch detection, periodic for full Opus retro
|
// Retro — gate for cheap branch detection, periodic for full Opus retro
|
||||||
'retro': 'periodic',
|
'retro': 'periodic',
|
||||||
'retro-base-branch': 'gate',
|
'retro-base-branch': 'gate',
|
||||||
|
|||||||
@@ -175,83 +175,30 @@ describeIfSelected('Ship workflow E2E', ['ship-local-workflow'], () => {
|
|||||||
|
|
||||||
logCost('/ship local workflow', result);
|
logCost('/ship local workflow', result);
|
||||||
|
|
||||||
// Check push succeeded — check the feature branch on the bare remote
|
// Check push succeeded — verify the feature branch exists on the bare remote
|
||||||
// (bare repo HEAD points to main which only has 1 commit; the push goes to feature/ship-test)
|
const branchCheck = spawnSync('git', ['branch', '--list', 'feature/ship-test'], { cwd: shipRemoteDir, stdio: 'pipe' });
|
||||||
const remoteLog = spawnSync('git', ['log', '--oneline', '--all'], { cwd: shipRemoteDir, stdio: 'pipe' });
|
const branchExists = branchCheck.stdout.toString().trim().length > 0;
|
||||||
const remoteCommits = remoteLog.stdout.toString().trim().split('\n').filter(l => l.length > 0).length;
|
|
||||||
|
|
||||||
// Check VERSION was bumped
|
// Check VERSION was bumped locally (even if push failed, this shows the LLM did the work)
|
||||||
const versionContent = fs.existsSync(path.join(shipWorkDir, 'VERSION'))
|
const versionContent = fs.existsSync(path.join(shipWorkDir, 'VERSION'))
|
||||||
? fs.readFileSync(path.join(shipWorkDir, 'VERSION'), 'utf-8').trim() : '';
|
? fs.readFileSync(path.join(shipWorkDir, 'VERSION'), 'utf-8').trim() : '';
|
||||||
const versionBumped = versionContent !== '0.1.0.0';
|
const versionBumped = versionContent !== '0.1.0.0';
|
||||||
|
|
||||||
recordE2E(evalCollector, '/ship local workflow', 'Ship workflow E2E', result, {
|
recordE2E(evalCollector, '/ship local workflow', 'Ship workflow E2E', result, {
|
||||||
passed: remoteCommits > 1 && ['success', 'error_max_turns'].includes(result.exitReason),
|
passed: branchExists && versionBumped && ['success', 'error_max_turns'].includes(result.exitReason),
|
||||||
});
|
});
|
||||||
|
|
||||||
expect(['success', 'error_max_turns']).toContain(result.exitReason);
|
expect(['success', 'error_max_turns']).toContain(result.exitReason);
|
||||||
expect(remoteCommits).toBeGreaterThan(1);
|
expect(branchExists).toBe(true);
|
||||||
console.log(`Remote commits: ${remoteCommits}, VERSION: ${versionContent}, bumped: ${versionBumped}`);
|
expect(versionBumped).toBe(true);
|
||||||
|
console.log(`Branch pushed: ${branchExists}, VERSION: ${versionContent}, bumped: ${versionBumped}`);
|
||||||
}, 150_000);
|
}, 150_000);
|
||||||
});
|
});
|
||||||
|
|
||||||
// --- Browser cookie detection smoke test ---
|
// setup-cookies-detect REMOVED: The cookie-import-browser module has 30+ thorough
|
||||||
|
// unit tests in browse/test/cookie-import-browser.test.ts (decryption, profile
|
||||||
describeIfSelected('Setup Browser Cookies E2E', ['setup-cookies-detect'], () => {
|
// detection, error handling, path traversal). The E2E just tested LLM instruction-
|
||||||
let cookieDir: string;
|
// following ("write a file saying no browsers") on a CI box with no browsers.
|
||||||
|
|
||||||
beforeAll(() => {
|
|
||||||
cookieDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-cookies-'));
|
|
||||||
// Copy skill files
|
|
||||||
fs.mkdirSync(path.join(cookieDir, 'setup-browser-cookies'), { recursive: true });
|
|
||||||
fs.copyFileSync(
|
|
||||||
path.join(ROOT, 'setup-browser-cookies', 'SKILL.md'),
|
|
||||||
path.join(cookieDir, 'setup-browser-cookies', 'SKILL.md'),
|
|
||||||
);
|
|
||||||
});
|
|
||||||
|
|
||||||
afterAll(() => {
|
|
||||||
try { fs.rmSync(cookieDir, { recursive: true, force: true }); } catch {}
|
|
||||||
});
|
|
||||||
|
|
||||||
testConcurrentIfSelected('setup-cookies-detect', async () => {
|
|
||||||
const result = await runSkillTest({
|
|
||||||
prompt: `Read setup-browser-cookies/SKILL.md for the cookie import workflow.
|
|
||||||
|
|
||||||
This is a test environment. Check which browsers exist on this system by looking for their cookie database files.
|
|
||||||
IMPORTANT: You MUST write a file called ${cookieDir}/detected-browsers.md with your findings.
|
|
||||||
If you find browsers, list them. If you find NO browsers, write "No browsers detected" to the file.
|
|
||||||
The file must always be created regardless of results.
|
|
||||||
Do NOT launch the cookie picker UI — just detect and report.`,
|
|
||||||
workingDirectory: cookieDir,
|
|
||||||
maxTurns: 8,
|
|
||||||
timeout: 60_000,
|
|
||||||
testName: 'setup-cookies-detect',
|
|
||||||
runId,
|
|
||||||
});
|
|
||||||
|
|
||||||
logCost('/setup-browser-cookies detect', result);
|
|
||||||
|
|
||||||
const detectPath = path.join(cookieDir, 'detected-browsers.md');
|
|
||||||
const detectExists = fs.existsSync(detectPath);
|
|
||||||
const detectContent = detectExists ? fs.readFileSync(detectPath, 'utf-8') : '';
|
|
||||||
const hasBrowserName = /chrome|arc|brave|edge|comet|safari|firefox/i.test(detectContent);
|
|
||||||
const hasNoBrowsers = /no browser|none|not found|not detected|could not|couldn't/i.test(detectContent);
|
|
||||||
|
|
||||||
// On CI (headless Ubuntu), no browsers are installed — "no browsers detected" is valid
|
|
||||||
const contentValid = hasBrowserName || hasNoBrowsers;
|
|
||||||
|
|
||||||
recordE2E(evalCollector, '/setup-browser-cookies detect', 'Setup Browser Cookies E2E', result, {
|
|
||||||
passed: detectExists && contentValid && ['success', 'error_max_turns'].includes(result.exitReason),
|
|
||||||
});
|
|
||||||
|
|
||||||
expect(['success', 'error_max_turns']).toContain(result.exitReason);
|
|
||||||
expect(detectExists).toBe(true);
|
|
||||||
if (detectExists) {
|
|
||||||
expect(contentValid).toBe(true);
|
|
||||||
}
|
|
||||||
}, 90_000);
|
|
||||||
});
|
|
||||||
|
|
||||||
// --- gstack-upgrade E2E ---
|
// --- gstack-upgrade E2E ---
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user