diff --git a/ARCHITECTURE.md b/ARCHITECTURE.md index 8ffc16aa..3908a2ca 100644 --- a/ARCHITECTURE.md +++ b/ARCHITECTURE.md @@ -69,7 +69,7 @@ The server writes `.gstack/browse.json` (atomic write via tmp + rename, mode 0o6 { "pid": 12345, "port": 34567, "token": "uuid-v4", "startedAt": "...", "binaryVersion": "abc123" } ``` -The CLI reads this file to find the server. If the file is missing, stale, or the PID is dead, the CLI spawns a new server. +The CLI reads this file to find the server. If the file is missing or the server fails an HTTP health check, the CLI spawns a new server. On Windows, PID-based process detection is unreliable in Bun binaries, so the health check (GET /health) is the primary liveness signal on all platforms. ### Port selection diff --git a/CHANGELOG.md b/CHANGELOG.md index 68baa0f8..4a7ff0d9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,31 @@ # Changelog +## [0.11.15.0] - 2026-03-24 — E2E Test Coverage for Plan Reviews & Codex + +### Added + +- **E2E tests verify plan review reports appear at the bottom of plans.** The `/plan-eng-review` review report is now tested end-to-end — if it stops writing `## GSTACK REVIEW REPORT` to the plan file, the test catches it. +- **E2E tests verify Codex is offered in every plan skill.** Four new lightweight tests confirm that `/office-hours`, `/plan-ceo-review`, `/plan-design-review`, and `/plan-eng-review` all check for Codex availability, prompt the user, and handle the fallback when Codex is unavailable. + +### For contributors + +- New E2E tests in `test/skill-e2e-plan.test.ts`: `plan-review-report`, `codex-offered-eng-review`, `codex-offered-ceo-review`, `codex-offered-office-hours`, `codex-offered-design-review` +- Updated touchfile mappings and selection count assertions +- Added `touchfiles` to the documented global touchfile list in CLAUDE.md + +## [0.11.14.0] - 2026-03-24 — Windows Browse Fix + +### Fixed + +- **Browse engine now works on Windows.** Three compounding bugs blocked all Windows `/browse` users: the server process died when the CLI exited (Bun's `unref()` doesn't truly detach on Windows), the health check never ran because `process.kill(pid, 0)` is broken in Bun binaries on Windows, and Chromium's sandbox failed when spawned through the Bun→Node process chain. All three are now fixed. Credits to @fqueiro (PR #191) for identifying the `detached: true` approach. +- **Health check runs first on all platforms.** `ensureServer()` now tries an HTTP health check before falling back to PID-based detection — more reliable on every OS, not just Windows. +- **Startup errors are logged to disk.** When the server fails to start, errors are written to `~/.gstack/browse-startup-error.log` so Windows users (who lose stderr due to process detachment) can debug. +- **Chromium sandbox disabled on Windows.** Chromium's sandbox requires elevated privileges when spawned through the Bun→Node chain — now disabled on Windows only. + +### For contributors + +- New tests for `isServerHealthy()` and startup error logging in `browse/test/config.test.ts` + ## [0.11.13.0] - 2026-03-24 — Worktree Isolation + Infrastructure Elegance ### Added diff --git a/CLAUDE.md b/CLAUDE.md index 25673f4c..492a5adf 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -29,7 +29,7 @@ against the previous run. **Diff-based test selection:** `test:evals` and `test:e2e` auto-select tests based on `git diff` against the base branch. Each test declares its file dependencies in `test/helpers/touchfiles.ts`. Changes to global touchfiles (session-runner, eval-store, -llm-judge, gen-skill-docs) trigger all tests. Use `EVALS_ALL=1` or the `:all` script +llm-judge, gen-skill-docs, touchfiles) trigger all tests. Use `EVALS_ALL=1` or the `:all` script variants to force all tests. Run `eval:select` to preview which tests would run. ## Testing diff --git a/VERSION b/VERSION index 855bffbe..446cced3 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -0.11.13.0 +0.11.15.0 diff --git a/browse/src/browser-manager.ts b/browse/src/browser-manager.ts index caaa5e86..335ff19e 100644 --- a/browse/src/browser-manager.ts +++ b/browse/src/browser-manager.ts @@ -89,6 +89,10 @@ export class BrowserManager { this.browser = await chromium.launch({ headless: useHeadless, + // On Windows, Chromium's sandbox fails when the server is spawned through + // the Bun→Node process chain (GitHub #276). Disable it — local daemon + // browsing user-specified URLs has marginal sandbox benefit. + chromiumSandbox: process.platform !== 'win32', ...(launchArgs.length > 0 ? { args: launchArgs } : {}), }); @@ -492,7 +496,11 @@ export class BrowserManager { // 2. Launch new headed browser (try-catch — if this fails, headless stays running) let newBrowser: Browser; try { - newBrowser = await chromium.launch({ headless: false, timeout: 15000 }); + newBrowser = await chromium.launch({ + headless: false, + timeout: 15000, + chromiumSandbox: process.platform !== 'win32', + }); } catch (err: unknown) { const msg = err instanceof Error ? err.message : String(err); return `ERROR: Cannot open headed browser — ${msg}. Headless browser still running.`; diff --git a/browse/src/cli.ts b/browse/src/cli.ts index 384f4f4d..2d48ecf7 100644 --- a/browse/src/cli.ts +++ b/browse/src/cli.ts @@ -76,6 +76,13 @@ export function resolveNodeServerScript( const NODE_SERVER_SCRIPT = IS_WINDOWS ? resolveNodeServerScript() : null; +// On Windows, hard-fail if server-node.mjs is missing — the Bun path is known broken. +if (IS_WINDOWS && !NODE_SERVER_SCRIPT) { + throw new Error( + 'server-node.mjs not found. Run `bun run build` to generate the Windows server bundle.' + ); +} + interface ServerState { pid: number; port: number; @@ -96,6 +103,19 @@ function readState(): ServerState | null { } function isProcessAlive(pid: number): boolean { + if (IS_WINDOWS) { + // Bun's compiled binary can't signal Windows PIDs (always throws ESRCH). + // Use tasklist as a fallback. Only for one-shot calls — too slow for polling loops. + try { + const result = Bun.spawnSync( + ['tasklist', '/FI', `PID eq ${pid}`, '/NH', '/FO', 'CSV'], + { stdout: 'pipe', stderr: 'pipe', timeout: 3000 } + ); + return result.stdout.toString().includes(`"${pid}"`); + } catch { + return false; + } + } try { process.kill(pid, 0); return true; @@ -104,10 +124,42 @@ function isProcessAlive(pid: number): boolean { } } +/** + * HTTP health check — definitive proof the server is alive and responsive. + * Used in all polling loops instead of isProcessAlive() (which is slow on Windows). + */ +export async function isServerHealthy(port: number): Promise { + try { + const resp = await fetch(`http://127.0.0.1:${port}/health`, { + signal: AbortSignal.timeout(2000), + }); + if (!resp.ok) return false; + const health = await resp.json() as any; + return health.status === 'healthy'; + } catch { + return false; + } +} + // ─── Process Management ───────────────────────────────────────── async function killServer(pid: number): Promise { if (!isProcessAlive(pid)) return; + if (IS_WINDOWS) { + // taskkill /T /F kills the process tree (Node + Chromium) + try { + Bun.spawnSync( + ['taskkill', '/PID', String(pid), '/T', '/F'], + { stdout: 'pipe', stderr: 'pipe', timeout: 5000 } + ); + } catch {} + const deadline = Date.now() + 2000; + while (Date.now() < deadline && isProcessAlive(pid)) { + await Bun.sleep(100); + } + return; + } + try { process.kill(pid, 'SIGTERM'); } catch { return; } // Wait up to 2s for graceful shutdown @@ -127,6 +179,10 @@ async function killServer(pid: number): Promise { * Verifies PID ownership before sending signals. */ function cleanupLegacyState(): void { + // No legacy state on Windows — /tmp and `ps` don't exist, and gstack + // never ran on Windows before the Node.js fallback was added. + if (IS_WINDOWS) return; + try { const files = fs.readdirSync('/tmp').filter(f => f.startsWith('browse-server') && f.endsWith('.json')); for (const file of files) { @@ -164,44 +220,65 @@ function cleanupLegacyState(): void { async function startServer(): Promise { ensureStateDir(config); - // Clean up stale state file + // Clean up stale state file and error log try { fs.unlinkSync(config.stateFile); } catch {} + try { fs.unlinkSync(path.join(config.stateDir, 'browse-startup-error.log')); } catch {} - // Start server as detached background process. - // On Windows, Bun can't launch/connect to Playwright's Chromium (oven-sh/bun#4253, #9911). - // Fall back to running the server under Node.js with Bun API polyfills. - const useNode = IS_WINDOWS && NODE_SERVER_SCRIPT; - const serverCmd = useNode - ? ['node', NODE_SERVER_SCRIPT] - : ['bun', 'run', SERVER_SCRIPT]; - const proc = Bun.spawn(serverCmd, { - stdio: ['ignore', 'pipe', 'pipe'], - env: { ...process.env, BROWSE_STATE_FILE: config.stateFile }, - }); + let proc: any = null; - // Don't hold the CLI open - proc.unref(); + if (IS_WINDOWS && NODE_SERVER_SCRIPT) { + // Windows: Bun.spawn() + proc.unref() doesn't truly detach on Windows — + // when the CLI exits, the server dies with it. Use Node's child_process.spawn + // with { detached: true } instead, which is the gold standard for Windows + // process independence. Credit: PR #191 by @fqueiro. + const launcherCode = + `const{spawn}=require('child_process');` + + `spawn(process.execPath,[${JSON.stringify(NODE_SERVER_SCRIPT)}],` + + `{detached:true,stdio:'ignore',env:Object.assign({},process.env,` + + `{BROWSE_STATE_FILE:${JSON.stringify(config.stateFile)}})}).unref()`; + Bun.spawnSync(['node', '-e', launcherCode], { stdio: 'ignore' }); + } else { + // macOS/Linux: Bun.spawn + unref works correctly + proc = Bun.spawn(['bun', 'run', SERVER_SCRIPT], { + stdio: ['ignore', 'pipe', 'pipe'], + env: { ...process.env, BROWSE_STATE_FILE: config.stateFile }, + }); + proc.unref(); + } - // Wait for state file to appear + // Wait for server to become healthy. + // Use HTTP health check (not isProcessAlive) — it's fast (~instant ECONNREFUSED) + // and works reliably on all platforms including Windows. const start = Date.now(); while (Date.now() - start < MAX_START_WAIT) { const state = readState(); - if (state && isProcessAlive(state.pid)) { + if (state && await isServerHealthy(state.port)) { return state; } await Bun.sleep(100); } - // If we get here, server didn't start in time - // Try to read stderr for error message - const stderr = proc.stderr; - if (stderr) { - const reader = stderr.getReader(); + // Server didn't start in time — try to get error details + if (proc?.stderr) { + // macOS/Linux: read stderr from the spawned process + const reader = proc.stderr.getReader(); const { value } = await reader.read(); if (value) { const errText = new TextDecoder().decode(value); throw new Error(`Server failed to start:\n${errText}`); } + } else { + // Windows: check startup error log (server writes errors to disk since + // stderr is unavailable due to stdio: 'ignore' for detachment) + const errorLogPath = path.join(config.stateDir, 'browse-startup-error.log'); + try { + const errorLog = fs.readFileSync(errorLogPath, 'utf-8').trim(); + if (errorLog) { + throw new Error(`Server failed to start:\n${errorLog}`); + } + } catch (e: any) { + if (e.code !== 'ENOENT') throw e; + } } throw new Error(`Server failed to start within ${MAX_START_WAIT / 1000}s`); } @@ -237,7 +314,10 @@ function acquireServerLock(): (() => void) | null { async function ensureServer(): Promise { const state = readState(); - if (state && isProcessAlive(state.pid)) { + // Health-check-first: HTTP is definitive proof the server is alive and responsive. + // This replaces the PID-gated approach which breaks on Windows (Bun's process.kill + // always throws ESRCH for Windows PIDs in compiled binaries). + if (state && await isServerHealthy(state.port)) { // Check for binary version mismatch (auto-restart on update) const currentVersion = readVersionHash(); if (currentVersion && state.binaryVersion && currentVersion !== state.binaryVersion) { @@ -245,21 +325,7 @@ async function ensureServer(): Promise { await killServer(state.pid); return startServer(); } - - // Server appears alive — do a health check - try { - const resp = await fetch(`http://127.0.0.1:${state.port}/health`, { - signal: AbortSignal.timeout(2000), - }); - if (resp.ok) { - const health = await resp.json() as any; - if (health.status === 'healthy') { - return state; - } - } - } catch { - // Health check failed — server is dead or unhealthy - } + return state; } // Ensure state directory exists before lock acquisition (lock file lives there) @@ -273,7 +339,7 @@ async function ensureServer(): Promise { const start = Date.now(); while (Date.now() - start < MAX_START_WAIT) { const freshState = readState(); - if (freshState && isProcessAlive(freshState.pid)) return freshState; + if (freshState && await isServerHealthy(freshState.port)) return freshState; await Bun.sleep(200); } throw new Error('Timed out waiting for another instance to start the server'); @@ -282,7 +348,7 @@ async function ensureServer(): Promise { try { // Re-read state under lock in case another process just started the server const freshState = readState(); - if (freshState && isProcessAlive(freshState.pid)) { + if (freshState && await isServerHealthy(freshState.port)) { return freshState; } diff --git a/browse/src/server.ts b/browse/src/server.ts index 82af28bd..fe2c27cb 100644 --- a/browse/src/server.ts +++ b/browse/src/server.ts @@ -286,6 +286,13 @@ async function shutdown() { // Handle signals process.on('SIGTERM', shutdown); process.on('SIGINT', shutdown); +// Windows: taskkill /F bypasses SIGTERM, but 'exit' fires for some shutdown paths. +// Defense-in-depth — primary cleanup is the CLI's stale-state detection via health check. +if (process.platform === 'win32') { + process.on('exit', () => { + try { fs.unlinkSync(config.stateFile); } catch {} + }); +} // ─── Start ───────────────────────────────────────────────────── async function start() { @@ -365,5 +372,14 @@ async function start() { start().catch((err) => { console.error(`[browse] Failed to start: ${err.message}`); + // Write error to disk for the CLI to read — on Windows, the CLI can't capture + // stderr because the server is launched with detached: true, stdio: 'ignore'. + try { + const errorLogPath = path.join(config.stateDir, 'browse-startup-error.log'); + fs.mkdirSync(config.stateDir, { recursive: true }); + fs.writeFileSync(errorLogPath, `${new Date().toISOString()} ${err.message}\n${err.stack || ''}\n`); + } catch { + // stateDir may not exist — nothing more we can do + } process.exit(1); }); diff --git a/browse/test/config.test.ts b/browse/test/config.test.ts index 0cbe47fa..b3642694 100644 --- a/browse/test/config.test.ts +++ b/browse/test/config.test.ts @@ -248,3 +248,69 @@ describe('version mismatch detection', () => { expect(shouldRestart).toBe(false); }); }); + +describe('isServerHealthy', () => { + const { isServerHealthy } = require('../src/cli'); + const http = require('http'); + + test('returns true for a healthy server', async () => { + const server = http.createServer((_req: any, res: any) => { + res.writeHead(200, { 'Content-Type': 'application/json' }); + res.end(JSON.stringify({ status: 'healthy' })); + }); + await new Promise(resolve => server.listen(0, resolve)); + const port = server.address().port; + try { + expect(await isServerHealthy(port)).toBe(true); + } finally { + server.close(); + } + }); + + test('returns false for an unhealthy server', async () => { + const server = http.createServer((_req: any, res: any) => { + res.writeHead(200, { 'Content-Type': 'application/json' }); + res.end(JSON.stringify({ status: 'unhealthy' })); + }); + await new Promise(resolve => server.listen(0, resolve)); + const port = server.address().port; + try { + expect(await isServerHealthy(port)).toBe(false); + } finally { + server.close(); + } + }); + + test('returns false when server is not running', async () => { + // Use a port that's almost certainly not in use + expect(await isServerHealthy(59999)).toBe(false); + }); + + test('returns false on non-200 response', async () => { + const server = http.createServer((_req: any, res: any) => { + res.writeHead(500); + res.end('Internal Server Error'); + }); + await new Promise(resolve => server.listen(0, resolve)); + const port = server.address().port; + try { + expect(await isServerHealthy(port)).toBe(false); + } finally { + server.close(); + } + }); +}); + +describe('startup error log', () => { + test('write and read error log', () => { + const tmpDir = path.join(os.tmpdir(), `browse-error-log-test-${Date.now()}`); + fs.mkdirSync(tmpDir, { recursive: true }); + const errorLogPath = path.join(tmpDir, 'browse-startup-error.log'); + const errorMsg = 'Cannot find module playwright'; + fs.writeFileSync(errorLogPath, `2026-03-23T00:00:00.000Z ${errorMsg}\n`); + const content = fs.readFileSync(errorLogPath, 'utf-8').trim(); + expect(content).toContain(errorMsg); + expect(content).toMatch(/^\d{4}-\d{2}-\d{2}T/); // ISO timestamp prefix + fs.rmSync(tmpDir, { recursive: true, force: true }); + }); +}); diff --git a/package.json b/package.json index 1dd1cd2a..ebcd27bb 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "gstack", - "version": "0.11.13.0", + "version": "0.11.14.0", "description": "Garry's Stack — Claude Code skills + fast headless browser. One repo, one install, entire AI engineering workflow.", "license": "MIT", "type": "module", diff --git a/test/helpers/touchfiles.ts b/test/helpers/touchfiles.ts index d0d232a5..931bcda8 100644 --- a/test/helpers/touchfiles.ts +++ b/test/helpers/touchfiles.ts @@ -68,6 +68,13 @@ export const E2E_TOUCHFILES: Record = { 'plan-ceo-review-benefits': ['plan-ceo-review/**', 'scripts/gen-skill-docs.ts'], 'plan-eng-review': ['plan-eng-review/**'], 'plan-eng-review-artifact': ['plan-eng-review/**'], + 'plan-review-report': ['plan-eng-review/**', 'scripts/gen-skill-docs.ts'], + + // Codex offering verification + 'codex-offered-office-hours': ['office-hours/**', 'scripts/gen-skill-docs.ts'], + 'codex-offered-ceo-review': ['plan-ceo-review/**', 'scripts/gen-skill-docs.ts'], + 'codex-offered-design-review': ['plan-design-review/**', 'scripts/gen-skill-docs.ts'], + 'codex-offered-eng-review': ['plan-eng-review/**', 'scripts/gen-skill-docs.ts'], // Ship 'ship-base-branch': ['ship/**', 'bin/gstack-repo-mode'], diff --git a/test/skill-e2e-plan.test.ts b/test/skill-e2e-plan.test.ts index 884fe67b..8953200b 100644 --- a/test/skill-e2e-plan.test.ts +++ b/test/skill-e2e-plan.test.ts @@ -535,6 +535,199 @@ Write your summary to ${benefitsDir}/benefits-summary.md`, }, 180_000); }); +// --- Plan Review Report E2E --- +// Verifies that plan-eng-review writes a "## GSTACK REVIEW REPORT" section +// to the bottom of the plan file (the living review status footer). + +describeIfSelected('Plan Review Report E2E', ['plan-review-report'], () => { + let planDir: string; + + beforeAll(() => { + planDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-review-report-')); + const run = (cmd: string, args: string[]) => + spawnSync(cmd, args, { cwd: planDir, stdio: 'pipe', timeout: 5000 }); + + run('git', ['init', '-b', 'main']); + run('git', ['config', 'user.email', 'test@test.com']); + run('git', ['config', 'user.name', 'Test']); + + fs.writeFileSync(path.join(planDir, 'plan.md'), `# Plan: Add Notifications System + +## Context +We're building a real-time notification system for our SaaS app. + +## Changes +1. WebSocket server for push notifications +2. Notification preferences API +3. Email digest fallback for offline users +4. PostgreSQL table for notification storage + +## Architecture +- WebSocket: Socket.io on Express +- Queue: Bull + Redis for email digests +- Storage: PostgreSQL notifications table +- Frontend: React toast component + +## Open questions +- Retry policy for failed WebSocket delivery? +- Max notifications stored per user? +`); + + run('git', ['add', '.']); + run('git', ['commit', '-m', 'add plan']); + + // Copy plan-eng-review skill + fs.mkdirSync(path.join(planDir, 'plan-eng-review'), { recursive: true }); + fs.copyFileSync( + path.join(ROOT, 'plan-eng-review', 'SKILL.md'), + path.join(planDir, 'plan-eng-review', 'SKILL.md'), + ); + }); + + afterAll(() => { + try { fs.rmSync(planDir, { recursive: true, force: true }); } catch {} + }); + + test('/plan-eng-review writes GSTACK REVIEW REPORT to plan file', async () => { + const result = await runSkillTest({ + prompt: `Read plan-eng-review/SKILL.md for the review workflow. + +Read plan.md — that's the plan to review. This is a standalone plan document, not a codebase — skip any codebase exploration steps. + +Proceed directly to the full review. Skip any AskUserQuestion calls — this is non-interactive. +Skip the preamble bash block, lake intro, telemetry, and contributor mode sections. + +CRITICAL REQUIREMENT: plan.md IS the plan file for this review session. After completing your review, you MUST write a "## GSTACK REVIEW REPORT" section to the END of plan.md, exactly as described in the "Plan File Review Report" section of SKILL.md. If gstack-review-read is not available or returns NO_REVIEWS, write the placeholder table with all four review rows (CEO, Codex, Eng, Design). Use the Edit tool to append to plan.md — do NOT overwrite the existing plan content. + +This review report at the bottom of the plan is the MOST IMPORTANT deliverable of this test.`, + workingDirectory: planDir, + maxTurns: 20, + timeout: 360_000, + testName: 'plan-review-report', + runId, + model: 'claude-opus-4-6', + }); + + logCost('/plan-eng-review report', result); + recordE2E(evalCollector, '/plan-review-report', 'Plan Review Report E2E', result, { + passed: ['success', 'error_max_turns'].includes(result.exitReason), + }); + expect(['success', 'error_max_turns']).toContain(result.exitReason); + + // Verify the review report was written to the plan file + const planContent = fs.readFileSync(path.join(planDir, 'plan.md'), 'utf-8'); + + // Original plan content should still be present + expect(planContent).toContain('# Plan: Add Notifications System'); + expect(planContent).toContain('WebSocket'); + + // Review report section must exist + expect(planContent).toContain('## GSTACK REVIEW REPORT'); + + // Report should be at the bottom of the file + const reportIndex = planContent.lastIndexOf('## GSTACK REVIEW REPORT'); + const afterReport = planContent.slice(reportIndex); + + // Should contain the review table with standard rows + expect(afterReport).toMatch(/\|\s*Review\s*\|/); + expect(afterReport).toContain('CEO Review'); + expect(afterReport).toContain('Eng Review'); + expect(afterReport).toContain('Design Review'); + + console.log('Plan review report found at bottom of plan.md'); + }, 420_000); +}); + +// --- Codex Offering E2E --- +// Verifies that Codex is properly offered (with availability check, user prompt, +// and fallback) in office-hours, plan-ceo-review, plan-design-review, plan-eng-review. + +describeIfSelected('Codex Offering E2E', [ + 'codex-offered-office-hours', 'codex-offered-ceo-review', + 'codex-offered-design-review', 'codex-offered-eng-review', +], () => { + let testDir: string; + + beforeAll(() => { + testDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-codex-offer-')); + const run = (cmd: string, args: string[]) => + spawnSync(cmd, args, { cwd: testDir, stdio: 'pipe', timeout: 5000 }); + + run('git', ['init', '-b', 'main']); + run('git', ['config', 'user.email', 'test@test.com']); + run('git', ['config', 'user.name', 'Test']); + fs.writeFileSync(path.join(testDir, 'README.md'), '# Test Project\n'); + run('git', ['add', '.']); + run('git', ['commit', '-m', 'init']); + + // Copy all 4 SKILL.md files + for (const skill of ['office-hours', 'plan-ceo-review', 'plan-design-review', 'plan-eng-review']) { + fs.mkdirSync(path.join(testDir, skill), { recursive: true }); + fs.copyFileSync( + path.join(ROOT, skill, 'SKILL.md'), + path.join(testDir, skill, 'SKILL.md'), + ); + } + }); + + afterAll(() => { + try { fs.rmSync(testDir, { recursive: true, force: true }); } catch {} + }); + + async function checkCodexOffering(skill: string, testName: string, featureName: string) { + const result = await runSkillTest({ + prompt: `Read ${skill}/SKILL.md. Search for ALL sections related to "codex", "outside voice", or "second opinion". + +Summarize the Codex/${featureName} integration — answer these specific questions: +1. How is Codex availability checked? (what exact bash command?) +2. How is the user prompted? (via AskUserQuestion? what are the options?) +3. What happens when Codex is NOT available? (fallback to subagent? skip entirely?) +4. Is this step blocking (gates the workflow) or optional (can be skipped)? +5. What prompt/context is sent to Codex? + +Write your summary to ${testDir}/${testName}-summary.md`, + workingDirectory: testDir, + maxTurns: 8, + timeout: 120_000, + testName, + runId, + }); + + logCost(`/${skill} codex offering`, result); + recordE2E(evalCollector, `/${testName}`, 'Codex Offering E2E', result); + expect(result.exitReason).toBe('success'); + + const summaryPath = path.join(testDir, `${testName}-summary.md`); + expect(fs.existsSync(summaryPath)).toBe(true); + + const summary = fs.readFileSync(summaryPath, 'utf-8').toLowerCase(); + // All skills should have codex availability check + expect(summary).toMatch(/which codex/); + // All skills should have fallback behavior + expect(summary).toMatch(/fallback|subagent|unavailable|not available|skip/); + // All skills should show it's optional/non-blocking + expect(summary).toMatch(/optional|non.?blocking|skip|not.*required/); + + console.log(`${skill}: Codex offering verified`); + } + + testConcurrentIfSelected('codex-offered-office-hours', async () => { + await checkCodexOffering('office-hours', 'codex-offered-office-hours', 'second opinion'); + }, 180_000); + + testConcurrentIfSelected('codex-offered-ceo-review', async () => { + await checkCodexOffering('plan-ceo-review', 'codex-offered-ceo-review', 'outside voice'); + }, 180_000); + + testConcurrentIfSelected('codex-offered-design-review', async () => { + await checkCodexOffering('plan-design-review', 'codex-offered-design-review', 'design outside voices'); + }, 180_000); + + testConcurrentIfSelected('codex-offered-eng-review', async () => { + await checkCodexOffering('plan-eng-review', 'codex-offered-eng-review', 'outside voice'); + }, 180_000); +}); + // Module-level afterAll — finalize eval collector after all tests complete afterAll(async () => { await finalizeEvalCollector(evalCollector); diff --git a/test/touchfiles.test.ts b/test/touchfiles.test.ts index 0e24b124..69572970 100644 --- a/test/touchfiles.test.ts +++ b/test/touchfiles.test.ts @@ -80,8 +80,9 @@ describe('selectTests', () => { expect(result.selected).toContain('plan-ceo-review-selective'); expect(result.selected).toContain('plan-ceo-review-benefits'); expect(result.selected).toContain('autoplan-core'); - expect(result.selected.length).toBe(4); - expect(result.skipped.length).toBe(Object.keys(E2E_TOUCHFILES).length - 4); + expect(result.selected).toContain('codex-offered-ceo-review'); + expect(result.selected.length).toBe(5); + expect(result.skipped.length).toBe(Object.keys(E2E_TOUCHFILES).length - 5); }); test('global touchfile triggers ALL tests', () => {