Merge remote-tracking branch 'origin/main' into garrytan/fix-startup-noise

2026-05-02 11:45:20 +02:00 · 2026-03-24 13:41:09 -07:00
parent 5a8a602df4 6156122571
commit b8b38ba361
12 changed files with 429 additions and 46 deletions
@@ -69,7 +69,7 @@ The server writes `.gstack/browse.json` (atomic write via tmp + rename, mode 0o6
 { "pid": 12345, "port": 34567, "token": "uuid-v4", "startedAt": "...", "binaryVersion": "abc123" }
 ```

-The CLI reads this file to find the server. If the file is missing, stale, or the PID is dead, the CLI spawns a new server.
+The CLI reads this file to find the server. If the file is missing or the server fails an HTTP health check, the CLI spawns a new server. On Windows, PID-based process detection is unreliable in Bun binaries, so the health check (GET /health) is the primary liveness signal on all platforms.

 ### Port selection

@@ -1,5 +1,31 @@
 # Changelog

+## [0.11.15.0] - 2026-03-24 — E2E Test Coverage for Plan Reviews & Codex
+
+### Added
+
+- **E2E tests verify plan review reports appear at the bottom of plans.** The `/plan-eng-review` review report is now tested end-to-end — if it stops writing `## GSTACK REVIEW REPORT` to the plan file, the test catches it.
+- **E2E tests verify Codex is offered in every plan skill.** Four new lightweight tests confirm that `/office-hours`, `/plan-ceo-review`, `/plan-design-review`, and `/plan-eng-review` all check for Codex availability, prompt the user, and handle the fallback when Codex is unavailable.
+
+### For contributors
+
+- New E2E tests in `test/skill-e2e-plan.test.ts`: `plan-review-report`, `codex-offered-eng-review`, `codex-offered-ceo-review`, `codex-offered-office-hours`, `codex-offered-design-review`
+- Updated touchfile mappings and selection count assertions
+- Added `touchfiles` to the documented global touchfile list in CLAUDE.md
+
+## [0.11.14.0] - 2026-03-24 — Windows Browse Fix
+
+### Fixed
+
+- **Browse engine now works on Windows.** Three compounding bugs blocked all Windows `/browse` users: the server process died when the CLI exited (Bun's `unref()` doesn't truly detach on Windows), the health check never ran because `process.kill(pid, 0)` is broken in Bun binaries on Windows, and Chromium's sandbox failed when spawned through the Bun→Node process chain. All three are now fixed. Credits to @fqueiro (PR #191) for identifying the `detached: true` approach.
+- **Health check runs first on all platforms.** `ensureServer()` now tries an HTTP health check before falling back to PID-based detection — more reliable on every OS, not just Windows.
+- **Startup errors are logged to disk.** When the server fails to start, errors are written to `~/.gstack/browse-startup-error.log` so Windows users (who lose stderr due to process detachment) can debug.
+- **Chromium sandbox disabled on Windows.** Chromium's sandbox requires elevated privileges when spawned through the Bun→Node chain — now disabled on Windows only.
+
+### For contributors
+
+- New tests for `isServerHealthy()` and startup error logging in `browse/test/config.test.ts`
+
 ## [0.11.13.0] - 2026-03-24 — Worktree Isolation + Infrastructure Elegance

 ### Added
@@ -29,7 +29,7 @@ against the previous run.
 **Diff-based test selection:** `test:evals` and `test:e2e` auto-select tests based
 on `git diff` against the base branch. Each test declares its file dependencies in
 `test/helpers/touchfiles.ts`. Changes to global touchfiles (session-runner, eval-store,
-llm-judge, gen-skill-docs) trigger all tests. Use `EVALS_ALL=1` or the `:all` script
+llm-judge, gen-skill-docs, touchfiles) trigger all tests. Use `EVALS_ALL=1` or the `:all` script
 variants to force all tests. Run `eval:select` to preview which tests would run.

 ## Testing
@@ -1 +1 @@
-0.11.13.0
+0.11.15.0
@@ -89,6 +89,10 @@ export class BrowserManager {

    this.browser = await chromium.launch({
      headless: useHeadless,
+      // On Windows, Chromium's sandbox fails when the server is spawned through
+      // the Bun→Node process chain (GitHub #276). Disable it — local daemon
+      // browsing user-specified URLs has marginal sandbox benefit.
+      chromiumSandbox: process.platform !== 'win32',
      ...(launchArgs.length > 0 ? { args: launchArgs } : {}),
    });

@@ -492,7 +496,11 @@ export class BrowserManager {
    // 2. Launch new headed browser (try-catch — if this fails, headless stays running)
    let newBrowser: Browser;
    try {
-      newBrowser = await chromium.launch({ headless: false, timeout: 15000 });
+      newBrowser = await chromium.launch({
+        headless: false,
+        timeout: 15000,
+        chromiumSandbox: process.platform !== 'win32',
+      });
    } catch (err: unknown) {
      const msg = err instanceof Error ? err.message : String(err);
      return `ERROR: Cannot open headed browser — ${msg}. Headless browser still running.`;
@@ -76,6 +76,13 @@ export function resolveNodeServerScript(

 const NODE_SERVER_SCRIPT = IS_WINDOWS ? resolveNodeServerScript() : null;

+// On Windows, hard-fail if server-node.mjs is missing — the Bun path is known broken.
+if (IS_WINDOWS && !NODE_SERVER_SCRIPT) {
+  throw new Error(
+    'server-node.mjs not found. Run `bun run build` to generate the Windows server bundle.'
+  );
+}
+
 interface ServerState {
  pid: number;
  port: number;
@@ -96,6 +103,19 @@ function readState(): ServerState | null {
 }

 function isProcessAlive(pid: number): boolean {
+  if (IS_WINDOWS) {
+    // Bun's compiled binary can't signal Windows PIDs (always throws ESRCH).
+    // Use tasklist as a fallback. Only for one-shot calls — too slow for polling loops.
+    try {
+      const result = Bun.spawnSync(
+        ['tasklist', '/FI', `PID eq ${pid}`, '/NH', '/FO', 'CSV'],
+        { stdout: 'pipe', stderr: 'pipe', timeout: 3000 }
+      );
+      return result.stdout.toString().includes(`"${pid}"`);
+    } catch {
+      return false;
+    }
+  }
  try {
    process.kill(pid, 0);
    return true;
@@ -104,10 +124,42 @@ function isProcessAlive(pid: number): boolean {
  }
 }

+/**
+ * HTTP health check — definitive proof the server is alive and responsive.
+ * Used in all polling loops instead of isProcessAlive() (which is slow on Windows).
+ */
+export async function isServerHealthy(port: number): Promise<boolean> {
+  try {
+    const resp = await fetch(`http://127.0.0.1:${port}/health`, {
+      signal: AbortSignal.timeout(2000),
+    });
+    if (!resp.ok) return false;
+    const health = await resp.json() as any;
+    return health.status === 'healthy';
+  } catch {
+    return false;
+  }
+}
+
 // ─── Process Management ─────────────────────────────────────────
 async function killServer(pid: number): Promise<void> {
  if (!isProcessAlive(pid)) return;

+  if (IS_WINDOWS) {
+    // taskkill /T /F kills the process tree (Node + Chromium)
+    try {
+      Bun.spawnSync(
+        ['taskkill', '/PID', String(pid), '/T', '/F'],
+        { stdout: 'pipe', stderr: 'pipe', timeout: 5000 }
+      );
+    } catch {}
+    const deadline = Date.now() + 2000;
+    while (Date.now() < deadline && isProcessAlive(pid)) {
+      await Bun.sleep(100);
+    }
+    return;
+  }
+
  try { process.kill(pid, 'SIGTERM'); } catch { return; }

  // Wait up to 2s for graceful shutdown
@@ -127,6 +179,10 @@ async function killServer(pid: number): Promise<void> {
 * Verifies PID ownership before sending signals.
 */
 function cleanupLegacyState(): void {
+  // No legacy state on Windows — /tmp and `ps` don't exist, and gstack
+  // never ran on Windows before the Node.js fallback was added.
+  if (IS_WINDOWS) return;
+
  try {
    const files = fs.readdirSync('/tmp').filter(f => f.startsWith('browse-server') && f.endsWith('.json'));
    for (const file of files) {
@@ -164,44 +220,65 @@ function cleanupLegacyState(): void {
 async function startServer(): Promise<ServerState> {
  ensureStateDir(config);

-  // Clean up stale state file
+  // Clean up stale state file and error log
  try { fs.unlinkSync(config.stateFile); } catch {}
+  try { fs.unlinkSync(path.join(config.stateDir, 'browse-startup-error.log')); } catch {}

-  // Start server as detached background process.
-  // On Windows, Bun can't launch/connect to Playwright's Chromium (oven-sh/bun#4253, #9911).
-  // Fall back to running the server under Node.js with Bun API polyfills.
-  const useNode = IS_WINDOWS && NODE_SERVER_SCRIPT;
-  const serverCmd = useNode
-    ? ['node', NODE_SERVER_SCRIPT]
-    : ['bun', 'run', SERVER_SCRIPT];
-  const proc = Bun.spawn(serverCmd, {
-    stdio: ['ignore', 'pipe', 'pipe'],
-    env: { ...process.env, BROWSE_STATE_FILE: config.stateFile },
-  });
+  let proc: any = null;

-  // Don't hold the CLI open
-  proc.unref();
+  if (IS_WINDOWS && NODE_SERVER_SCRIPT) {
+    // Windows: Bun.spawn() + proc.unref() doesn't truly detach on Windows —
+    // when the CLI exits, the server dies with it. Use Node's child_process.spawn
+    // with { detached: true } instead, which is the gold standard for Windows
+    // process independence. Credit: PR #191 by @fqueiro.
+    const launcherCode =
+      `const{spawn}=require('child_process');` +
+      `spawn(process.execPath,[${JSON.stringify(NODE_SERVER_SCRIPT)}],` +
+      `{detached:true,stdio:'ignore',env:Object.assign({},process.env,` +
+      `{BROWSE_STATE_FILE:${JSON.stringify(config.stateFile)}})}).unref()`;
+    Bun.spawnSync(['node', '-e', launcherCode], { stdio: 'ignore' });
+  } else {
+    // macOS/Linux: Bun.spawn + unref works correctly
+    proc = Bun.spawn(['bun', 'run', SERVER_SCRIPT], {
+      stdio: ['ignore', 'pipe', 'pipe'],
+      env: { ...process.env, BROWSE_STATE_FILE: config.stateFile },
+    });
+    proc.unref();
+  }

-  // Wait for state file to appear
+  // Wait for server to become healthy.
+  // Use HTTP health check (not isProcessAlive) — it's fast (~instant ECONNREFUSED)
+  // and works reliably on all platforms including Windows.
  const start = Date.now();
  while (Date.now() - start < MAX_START_WAIT) {
    const state = readState();
-    if (state && isProcessAlive(state.pid)) {
+    if (state && await isServerHealthy(state.port)) {
      return state;
    }
    await Bun.sleep(100);
  }

-  // If we get here, server didn't start in time
-  // Try to read stderr for error message
-  const stderr = proc.stderr;
-  if (stderr) {
-    const reader = stderr.getReader();
+  // Server didn't start in time — try to get error details
+  if (proc?.stderr) {
+    // macOS/Linux: read stderr from the spawned process
+    const reader = proc.stderr.getReader();
    const { value } = await reader.read();
    if (value) {
      const errText = new TextDecoder().decode(value);
      throw new Error(`Server failed to start:\n${errText}`);
    }
+  } else {
+    // Windows: check startup error log (server writes errors to disk since
+    // stderr is unavailable due to stdio: 'ignore' for detachment)
+    const errorLogPath = path.join(config.stateDir, 'browse-startup-error.log');
+    try {
+      const errorLog = fs.readFileSync(errorLogPath, 'utf-8').trim();
+      if (errorLog) {
+        throw new Error(`Server failed to start:\n${errorLog}`);
+      }
+    } catch (e: any) {
+      if (e.code !== 'ENOENT') throw e;
+    }
  }
  throw new Error(`Server failed to start within ${MAX_START_WAIT / 1000}s`);
 }
@@ -237,7 +314,10 @@ function acquireServerLock(): (() => void) | null {
 async function ensureServer(): Promise<ServerState> {
  const state = readState();

-  if (state && isProcessAlive(state.pid)) {
+  // Health-check-first: HTTP is definitive proof the server is alive and responsive.
+  // This replaces the PID-gated approach which breaks on Windows (Bun's process.kill
+  // always throws ESRCH for Windows PIDs in compiled binaries).
+  if (state && await isServerHealthy(state.port)) {
    // Check for binary version mismatch (auto-restart on update)
    const currentVersion = readVersionHash();
    if (currentVersion && state.binaryVersion && currentVersion !== state.binaryVersion) {
@@ -245,21 +325,7 @@ async function ensureServer(): Promise<ServerState> {
      await killServer(state.pid);
      return startServer();
    }
-
-    // Server appears alive — do a health check
-    try {
-      const resp = await fetch(`http://127.0.0.1:${state.port}/health`, {
-        signal: AbortSignal.timeout(2000),
-      });
-      if (resp.ok) {
-        const health = await resp.json() as any;
-        if (health.status === 'healthy') {
-          return state;
-        }
-      }
-    } catch {
-      // Health check failed — server is dead or unhealthy
-    }
+    return state;
  }

  // Ensure state directory exists before lock acquisition (lock file lives there)
@@ -273,7 +339,7 @@ async function ensureServer(): Promise<ServerState> {
    const start = Date.now();
    while (Date.now() - start < MAX_START_WAIT) {
      const freshState = readState();
-      if (freshState && isProcessAlive(freshState.pid)) return freshState;
+      if (freshState && await isServerHealthy(freshState.port)) return freshState;
      await Bun.sleep(200);
    }
    throw new Error('Timed out waiting for another instance to start the server');
@@ -282,7 +348,7 @@ async function ensureServer(): Promise<ServerState> {
  try {
    // Re-read state under lock in case another process just started the server
    const freshState = readState();
-    if (freshState && isProcessAlive(freshState.pid)) {
+    if (freshState && await isServerHealthy(freshState.port)) {
      return freshState;
    }

@@ -286,6 +286,13 @@ async function shutdown() {
 // Handle signals
 process.on('SIGTERM', shutdown);
 process.on('SIGINT', shutdown);
+// Windows: taskkill /F bypasses SIGTERM, but 'exit' fires for some shutdown paths.
+// Defense-in-depth — primary cleanup is the CLI's stale-state detection via health check.
+if (process.platform === 'win32') {
+  process.on('exit', () => {
+    try { fs.unlinkSync(config.stateFile); } catch {}
+  });
+}

 // ─── Start ─────────────────────────────────────────────────────
 async function start() {
@@ -365,5 +372,14 @@ async function start() {

 start().catch((err) => {
  console.error(`[browse] Failed to start: ${err.message}`);
+  // Write error to disk for the CLI to read — on Windows, the CLI can't capture
+  // stderr because the server is launched with detached: true, stdio: 'ignore'.
+  try {
+    const errorLogPath = path.join(config.stateDir, 'browse-startup-error.log');
+    fs.mkdirSync(config.stateDir, { recursive: true });
+    fs.writeFileSync(errorLogPath, `${new Date().toISOString()} ${err.message}\n${err.stack || ''}\n`);
+  } catch {
+    // stateDir may not exist — nothing more we can do
+  }
  process.exit(1);
 });
@@ -248,3 +248,69 @@ describe('version mismatch detection', () => {
    expect(shouldRestart).toBe(false);
  });
 });
+
+describe('isServerHealthy', () => {
+  const { isServerHealthy } = require('../src/cli');
+  const http = require('http');
+
+  test('returns true for a healthy server', async () => {
+    const server = http.createServer((_req: any, res: any) => {
+      res.writeHead(200, { 'Content-Type': 'application/json' });
+      res.end(JSON.stringify({ status: 'healthy' }));
+    });
+    await new Promise<void>(resolve => server.listen(0, resolve));
+    const port = server.address().port;
+    try {
+      expect(await isServerHealthy(port)).toBe(true);
+    } finally {
+      server.close();
+    }
+  });
+
+  test('returns false for an unhealthy server', async () => {
+    const server = http.createServer((_req: any, res: any) => {
+      res.writeHead(200, { 'Content-Type': 'application/json' });
+      res.end(JSON.stringify({ status: 'unhealthy' }));
+    });
+    await new Promise<void>(resolve => server.listen(0, resolve));
+    const port = server.address().port;
+    try {
+      expect(await isServerHealthy(port)).toBe(false);
+    } finally {
+      server.close();
+    }
+  });
+
+  test('returns false when server is not running', async () => {
+    // Use a port that's almost certainly not in use
+    expect(await isServerHealthy(59999)).toBe(false);
+  });
+
+  test('returns false on non-200 response', async () => {
+    const server = http.createServer((_req: any, res: any) => {
+      res.writeHead(500);
+      res.end('Internal Server Error');
+    });
+    await new Promise<void>(resolve => server.listen(0, resolve));
+    const port = server.address().port;
+    try {
+      expect(await isServerHealthy(port)).toBe(false);
+    } finally {
+      server.close();
+    }
+  });
+});
+
+describe('startup error log', () => {
+  test('write and read error log', () => {
+    const tmpDir = path.join(os.tmpdir(), `browse-error-log-test-${Date.now()}`);
+    fs.mkdirSync(tmpDir, { recursive: true });
+    const errorLogPath = path.join(tmpDir, 'browse-startup-error.log');
+    const errorMsg = 'Cannot find module playwright';
+    fs.writeFileSync(errorLogPath, `2026-03-23T00:00:00.000Z ${errorMsg}\n`);
+    const content = fs.readFileSync(errorLogPath, 'utf-8').trim();
+    expect(content).toContain(errorMsg);
+    expect(content).toMatch(/^\d{4}-\d{2}-\d{2}T/); // ISO timestamp prefix
+    fs.rmSync(tmpDir, { recursive: true, force: true });
+  });
+});
@@ -1,6 +1,6 @@
 {
  "name": "gstack",
-  "version": "0.11.13.0",
+  "version": "0.11.14.0",
  "description": "Garry's Stack — Claude Code skills + fast headless browser. One repo, one install, entire AI engineering workflow.",
  "license": "MIT",
  "type": "module",
@@ -68,6 +68,13 @@ export const E2E_TOUCHFILES: Record<string, string[]> = {
  'plan-ceo-review-benefits':  ['plan-ceo-review/**', 'scripts/gen-skill-docs.ts'],
  'plan-eng-review':           ['plan-eng-review/**'],
  'plan-eng-review-artifact':  ['plan-eng-review/**'],
+  'plan-review-report':        ['plan-eng-review/**', 'scripts/gen-skill-docs.ts'],
+
+  // Codex offering verification
+  'codex-offered-office-hours':  ['office-hours/**', 'scripts/gen-skill-docs.ts'],
+  'codex-offered-ceo-review':    ['plan-ceo-review/**', 'scripts/gen-skill-docs.ts'],
+  'codex-offered-design-review': ['plan-design-review/**', 'scripts/gen-skill-docs.ts'],
+  'codex-offered-eng-review':    ['plan-eng-review/**', 'scripts/gen-skill-docs.ts'],

  // Ship
  'ship-base-branch': ['ship/**', 'bin/gstack-repo-mode'],
@@ -535,6 +535,199 @@ Write your summary to ${benefitsDir}/benefits-summary.md`,
  }, 180_000);
 });

+// --- Plan Review Report E2E ---
+// Verifies that plan-eng-review writes a "## GSTACK REVIEW REPORT" section
+// to the bottom of the plan file (the living review status footer).
+
+describeIfSelected('Plan Review Report E2E', ['plan-review-report'], () => {
+  let planDir: string;
+
+  beforeAll(() => {
+    planDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-review-report-'));
+    const run = (cmd: string, args: string[]) =>
+      spawnSync(cmd, args, { cwd: planDir, stdio: 'pipe', timeout: 5000 });
+
+    run('git', ['init', '-b', 'main']);
+    run('git', ['config', 'user.email', 'test@test.com']);
+    run('git', ['config', 'user.name', 'Test']);
+
+    fs.writeFileSync(path.join(planDir, 'plan.md'), `# Plan: Add Notifications System
+
+## Context
+We're building a real-time notification system for our SaaS app.
+
+## Changes
+1. WebSocket server for push notifications
+2. Notification preferences API
+3. Email digest fallback for offline users
+4. PostgreSQL table for notification storage
+
+## Architecture
+- WebSocket: Socket.io on Express
+- Queue: Bull + Redis for email digests
+- Storage: PostgreSQL notifications table
+- Frontend: React toast component
+
+## Open questions
+- Retry policy for failed WebSocket delivery?
+- Max notifications stored per user?
+`);
+
+    run('git', ['add', '.']);
+    run('git', ['commit', '-m', 'add plan']);
+
+    // Copy plan-eng-review skill
+    fs.mkdirSync(path.join(planDir, 'plan-eng-review'), { recursive: true });
+    fs.copyFileSync(
+      path.join(ROOT, 'plan-eng-review', 'SKILL.md'),
+      path.join(planDir, 'plan-eng-review', 'SKILL.md'),
+    );
+  });
+
+  afterAll(() => {
+    try { fs.rmSync(planDir, { recursive: true, force: true }); } catch {}
+  });
+
+  test('/plan-eng-review writes GSTACK REVIEW REPORT to plan file', async () => {
+    const result = await runSkillTest({
+      prompt: `Read plan-eng-review/SKILL.md for the review workflow.
+
+Read plan.md — that's the plan to review. This is a standalone plan document, not a codebase — skip any codebase exploration steps.
+
+Proceed directly to the full review. Skip any AskUserQuestion calls — this is non-interactive.
+Skip the preamble bash block, lake intro, telemetry, and contributor mode sections.
+
+CRITICAL REQUIREMENT: plan.md IS the plan file for this review session. After completing your review, you MUST write a "## GSTACK REVIEW REPORT" section to the END of plan.md, exactly as described in the "Plan File Review Report" section of SKILL.md. If gstack-review-read is not available or returns NO_REVIEWS, write the placeholder table with all four review rows (CEO, Codex, Eng, Design). Use the Edit tool to append to plan.md — do NOT overwrite the existing plan content.
+
+This review report at the bottom of the plan is the MOST IMPORTANT deliverable of this test.`,
+      workingDirectory: planDir,
+      maxTurns: 20,
+      timeout: 360_000,
+      testName: 'plan-review-report',
+      runId,
+      model: 'claude-opus-4-6',
+    });
+
+    logCost('/plan-eng-review report', result);
+    recordE2E(evalCollector, '/plan-review-report', 'Plan Review Report E2E', result, {
+      passed: ['success', 'error_max_turns'].includes(result.exitReason),
+    });
+    expect(['success', 'error_max_turns']).toContain(result.exitReason);
+
+    // Verify the review report was written to the plan file
+    const planContent = fs.readFileSync(path.join(planDir, 'plan.md'), 'utf-8');
+
+    // Original plan content should still be present
+    expect(planContent).toContain('# Plan: Add Notifications System');
+    expect(planContent).toContain('WebSocket');
+
+    // Review report section must exist
+    expect(planContent).toContain('## GSTACK REVIEW REPORT');
+
+    // Report should be at the bottom of the file
+    const reportIndex = planContent.lastIndexOf('## GSTACK REVIEW REPORT');
+    const afterReport = planContent.slice(reportIndex);
+
+    // Should contain the review table with standard rows
+    expect(afterReport).toMatch(/\|\s*Review\s*\|/);
+    expect(afterReport).toContain('CEO Review');
+    expect(afterReport).toContain('Eng Review');
+    expect(afterReport).toContain('Design Review');
+
+    console.log('Plan review report found at bottom of plan.md');
+  }, 420_000);
+});
+
+// --- Codex Offering E2E ---
+// Verifies that Codex is properly offered (with availability check, user prompt,
+// and fallback) in office-hours, plan-ceo-review, plan-design-review, plan-eng-review.
+
+describeIfSelected('Codex Offering E2E', [
+  'codex-offered-office-hours', 'codex-offered-ceo-review',
+  'codex-offered-design-review', 'codex-offered-eng-review',
+], () => {
+  let testDir: string;
+
+  beforeAll(() => {
+    testDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-codex-offer-'));
+    const run = (cmd: string, args: string[]) =>
+      spawnSync(cmd, args, { cwd: testDir, stdio: 'pipe', timeout: 5000 });
+
+    run('git', ['init', '-b', 'main']);
+    run('git', ['config', 'user.email', 'test@test.com']);
+    run('git', ['config', 'user.name', 'Test']);
+    fs.writeFileSync(path.join(testDir, 'README.md'), '# Test Project\n');
+    run('git', ['add', '.']);
+    run('git', ['commit', '-m', 'init']);
+
+    // Copy all 4 SKILL.md files
+    for (const skill of ['office-hours', 'plan-ceo-review', 'plan-design-review', 'plan-eng-review']) {
+      fs.mkdirSync(path.join(testDir, skill), { recursive: true });
+      fs.copyFileSync(
+        path.join(ROOT, skill, 'SKILL.md'),
+        path.join(testDir, skill, 'SKILL.md'),
+      );
+    }
+  });
+
+  afterAll(() => {
+    try { fs.rmSync(testDir, { recursive: true, force: true }); } catch {}
+  });
+
+  async function checkCodexOffering(skill: string, testName: string, featureName: string) {
+    const result = await runSkillTest({
+      prompt: `Read ${skill}/SKILL.md. Search for ALL sections related to "codex", "outside voice", or "second opinion".
+
+Summarize the Codex/${featureName} integration — answer these specific questions:
+1. How is Codex availability checked? (what exact bash command?)
+2. How is the user prompted? (via AskUserQuestion? what are the options?)
+3. What happens when Codex is NOT available? (fallback to subagent? skip entirely?)
+4. Is this step blocking (gates the workflow) or optional (can be skipped)?
+5. What prompt/context is sent to Codex?
+
+Write your summary to ${testDir}/${testName}-summary.md`,
+      workingDirectory: testDir,
+      maxTurns: 8,
+      timeout: 120_000,
+      testName,
+      runId,
+    });
+
+    logCost(`/${skill} codex offering`, result);
+    recordE2E(evalCollector, `/${testName}`, 'Codex Offering E2E', result);
+    expect(result.exitReason).toBe('success');
+
+    const summaryPath = path.join(testDir, `${testName}-summary.md`);
+    expect(fs.existsSync(summaryPath)).toBe(true);
+
+    const summary = fs.readFileSync(summaryPath, 'utf-8').toLowerCase();
+    // All skills should have codex availability check
+    expect(summary).toMatch(/which codex/);
+    // All skills should have fallback behavior
+    expect(summary).toMatch(/fallback|subagent|unavailable|not available|skip/);
+    // All skills should show it's optional/non-blocking
+    expect(summary).toMatch(/optional|non.?blocking|skip|not.*required/);
+
+    console.log(`${skill}: Codex offering verified`);
+  }
+
+  testConcurrentIfSelected('codex-offered-office-hours', async () => {
+    await checkCodexOffering('office-hours', 'codex-offered-office-hours', 'second opinion');
+  }, 180_000);
+
+  testConcurrentIfSelected('codex-offered-ceo-review', async () => {
+    await checkCodexOffering('plan-ceo-review', 'codex-offered-ceo-review', 'outside voice');
+  }, 180_000);
+
+  testConcurrentIfSelected('codex-offered-design-review', async () => {
+    await checkCodexOffering('plan-design-review', 'codex-offered-design-review', 'design outside voices');
+  }, 180_000);
+
+  testConcurrentIfSelected('codex-offered-eng-review', async () => {
+    await checkCodexOffering('plan-eng-review', 'codex-offered-eng-review', 'outside voice');
+  }, 180_000);
+});
+
 // Module-level afterAll — finalize eval collector after all tests complete
 afterAll(async () => {
  await finalizeEvalCollector(evalCollector);
@@ -80,8 +80,9 @@ describe('selectTests', () => {
    expect(result.selected).toContain('plan-ceo-review-selective');
    expect(result.selected).toContain('plan-ceo-review-benefits');
    expect(result.selected).toContain('autoplan-core');
-    expect(result.selected.length).toBe(4);
-    expect(result.skipped.length).toBe(Object.keys(E2E_TOUCHFILES).length - 4);
+    expect(result.selected).toContain('codex-offered-ceo-review');
+    expect(result.selected.length).toBe(5);
+    expect(result.skipped.length).toBe(Object.keys(E2E_TOUCHFILES).length - 5);
  });

  test('global touchfile triggers ALL tests', () => {