Merge branch 'main' into garrytan/team-supabase-store

Brings in 55 commits from main (v0.12.x–v0.13.5.0): Factory Droid compat,
prompt injection defense, user sovereignty, security audit, design binary,
skill namespacing, modular resolvers, Chrome sidebar, and more.

Conflict resolution:
- .agents/ SKILL.md files: deleted (main moved to .factory/)
- 8 .tmpl templates: accepted main (new features: CDP mode, design tools,
  global retro, parallelization, distribution checks, plan audits)
- scripts/gen-skill-docs.ts: accepted main's modular resolver refactor
- test/helpers/session-runner.ts: accepted main + layered back CostEntry
  tracking from team branch
- Generated SKILL.md files: regenerated via bun run gen:skill-docs
- Updated tests to match main's gstack-slug output (2 lines, no PROJECTS_DIR)
  and review log mechanism (gstack-review-log, not $BRANCH.jsonl)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Garry Tan
2026-03-29 15:12:12 -07:00
267 changed files with 60292 additions and 12207 deletions
+88
View File
@@ -0,0 +1,88 @@
import { describe, test, expect } from 'bun:test';
import { readFileSync, readdirSync, existsSync } from 'fs';
import { join } from 'path';
const ROOT = join(import.meta.dir, '..');
/**
 * Collect every SKILL.md visible from the repo root: the root-level file
 * (reported under the name 'root') plus one per immediate subdirectory,
 * skipping dot-directories and node_modules.
 */
function getAllSkillMds(): Array<{ name: string; content: string }> {
  const found: Array<{ name: string; content: string }> = [];

  // Record a SKILL.md under `name` if it exists on disk.
  const collect = (name: string, filePath: string): void => {
    if (existsSync(filePath)) {
      found.push({ name, content: readFileSync(filePath, 'utf-8') });
    }
  };

  collect('root', join(ROOT, 'SKILL.md'));

  for (const entry of readdirSync(ROOT, { withFileTypes: true })) {
    const excluded =
      !entry.isDirectory() || entry.name.startsWith('.') || entry.name === 'node_modules';
    if (excluded) continue;
    collect(entry.name, join(ROOT, entry.name, 'SKILL.md'));
  }

  return found;
}
/**
 * Audit-compliance regression tests: each test pins one security-audit fix
 * (W007, W011, W012, conditional telemetry, data-flow docs) by asserting on
 * the generated/template files' literal content.
 */
describe('Audit compliance', () => {
  // Fix 1: W007 — No hardcoded credentials in documentation
  test('no hardcoded credential patterns in SKILL.md.tmpl', () => {
    const tmpl = readFileSync(join(ROOT, 'SKILL.md.tmpl'), 'utf-8');
    // Forbidden literal credentials...
    expect(tmpl).not.toContain('"password123"');
    expect(tmpl).not.toContain('"test@example.com"');
    expect(tmpl).not.toContain('"test@test.com"');
    // ...must have been replaced by env-var placeholders.
    expect(tmpl).toContain('$TEST_EMAIL');
    expect(tmpl).toContain('$TEST_PASSWORD');
  });

  // Fix 2: Conditional telemetry — binary calls wrapped with existence check
  test('preamble telemetry calls are conditional on _TEL and binary existence', () => {
    const preamble = readFileSync(join(ROOT, 'scripts/resolvers/preamble.ts'), 'utf-8');
    // Pending finalization must check _TEL and binary existence
    expect(preamble).toContain('_TEL" != "off"');
    expect(preamble).toContain('-x ');
    expect(preamble).toContain('gstack-telemetry-log');
    // End-of-skill telemetry must also be conditional
    const completionIdx = preamble.indexOf('Telemetry (run last)');
    expect(completionIdx).toBeGreaterThan(-1);
    const completionSection = preamble.slice(completionIdx);
    expect(completionSection).toContain('_TEL" != "off"');
  });

  // Fix 3: W012 — Bun install is version-pinned
  test('bun install commands use version pinning', () => {
    const browseResolver = readFileSync(join(ROOT, 'scripts/resolvers/browse.ts'), 'utf-8');
    expect(browseResolver).toContain('BUN_VERSION');
    // Should not have unpinned curl|bash (without BUN_VERSION on same line)
    for (const line of browseResolver.split('\n')) {
      if (
        line.includes('bun.sh/install') &&
        line.includes('bash') &&
        !line.includes('BUN_VERSION') &&
        !line.includes('command -v')
      ) {
        throw new Error(`Unpinned bun install found: ${line.trim()}`);
      }
    }
  });

  // Fix 4: W011 — Untrusted content warning in command reference
  test('command reference includes untrusted content warning after Navigation', () => {
    const rootSkill = readFileSync(join(ROOT, 'SKILL.md'), 'utf-8');
    // The warning must sit between the Navigation and Reading sections.
    const navIdx = rootSkill.indexOf('### Navigation');
    const readingIdx = rootSkill.indexOf('### Reading');
    expect(navIdx).toBeGreaterThan(-1);
    expect(readingIdx).toBeGreaterThan(navIdx);
    const between = rootSkill.slice(navIdx, readingIdx);
    expect(between.toLowerCase()).toContain('untrusted');
  });

  // Fix 5: Data flow documentation in review.ts
  test('review.ts has data flow documentation', () => {
    const review = readFileSync(join(ROOT, 'scripts/resolvers/review.ts'), 'utf-8');
    expect(review).toContain('Data sent');
    expect(review).toContain('Data NOT sent');
  });

  // Fix 2+6: All generated SKILL.md files with telemetry are conditional
  test('all generated SKILL.md files with telemetry calls use conditional pattern', () => {
    // Fail with the offending skill's name instead of a bare expect()
    // mismatch, so a failing run points straight at the file to regenerate.
    // (Previously `name` was destructured but unused.)
    for (const { name, content } of getAllSkillMds()) {
      if (!content.includes('gstack-telemetry-log')) continue;
      if (!content.includes('_TEL" != "off"')) {
        throw new Error(`Unconditional gstack-telemetry-log call in ${name} SKILL.md`);
      }
    }
  });
});
+19 -6
View File
@@ -13,12 +13,13 @@
* Skips gracefully when prerequisites are not met.
*/
import { describe, test, expect, afterAll } from 'bun:test';
import { describe, test, expect, beforeAll, afterAll } from 'bun:test';
import { runCodexSkill, parseCodexJSONL, installSkillToTempHome } from './helpers/codex-session-runner';
import type { CodexResult } from './helpers/codex-session-runner';
import { EvalCollector } from './helpers/eval-store';
import type { EvalTestEntry } from './helpers/eval-store';
import { selectTests, detectBaseBranch, getChangedFiles, E2E_TOUCHFILES, GLOBAL_TOUCHFILES } from './helpers/touchfiles';
import { createTestWorktree, harvestAndCleanup } from './helpers/e2e-helpers';
import * as fs from 'fs';
import * as path from 'path';
import * as os from 'os';
@@ -118,16 +119,25 @@ afterAll(async () => {
// --- Tests ---
describeCodex('Codex E2E', () => {
let testWorktree: string;
beforeAll(() => {
testWorktree = createTestWorktree('codex');
});
afterAll(() => {
harvestAndCleanup('codex');
});
testIfSelected('codex-discover-skill', async () => {
// Install gstack-review skill to a temp HOME and ask Codex to list skills
const skillDir = path.join(ROOT, '.agents', 'skills', 'gstack-review');
const skillDir = path.join(testWorktree, '.agents', 'skills', 'gstack-review');
const result = await runCodexSkill({
skillDir,
prompt: 'List any skills or instructions you have available. Just list the names.',
timeoutMs: 60_000,
cwd: ROOT,
cwd: testWorktree,
skillName: 'gstack-review',
});
@@ -139,6 +149,9 @@ describeCodex('Codex E2E', () => {
expect(result.exitCode).toBe(0);
expect(result.output.length).toBeGreaterThan(0);
// Skill loading errors mean our generated SKILL.md files are broken
expect(result.stderr).not.toContain('invalid');
expect(result.stderr).not.toContain('Skipped loading');
// The output should reference the skill name in some form
const outputLower = result.output.toLowerCase();
expect(
@@ -150,14 +163,14 @@ describeCodex('Codex E2E', () => {
// code review, and produce structured review output with findings/issues.
// Accepts Codex timeout (exit 124/137) as non-failure since that's a CLI perf issue.
testIfSelected('codex-review-findings', async () => {
// Install gstack-review skill and ask Codex to review the current repo
const skillDir = path.join(ROOT, '.agents', 'skills', 'gstack-review');
// Install gstack-review skill and ask Codex to review the worktree
const skillDir = path.join(testWorktree, '.agents', 'skills', 'gstack-review');
const result = await runCodexSkill({
skillDir,
prompt: 'Run the gstack-review skill on this repository. Review the current branch diff and report your findings.',
timeoutMs: 540_000,
cwd: ROOT,
cwd: testWorktree,
skillName: 'gstack-review',
});
+76
View File
@@ -0,0 +1,76 @@
/**
* Shared fixture for test coverage audit E2E tests.
*
* Creates a Node.js project with billing source code that has intentional
* test coverage gaps: processPayment has happy-path-only tests,
* refundPayment has no tests at all.
*
* Used by: ship-coverage-audit E2E, review-coverage-audit E2E
*/
import * as fs from 'fs';
import * as path from 'path';
import { spawnSync } from 'child_process';
/**
 * Build a coverage-audit fixture project inside `dir`.
 *
 * Writes a Node.js project with a vitest setup, a billing module with
 * multiple code paths, and a single happy-path test (processPayment only;
 * refundPayment untested) — deliberate coverage gaps for the audit E2E
 * tests. Then initializes a git repo on `main`, commits everything, and
 * checks out a `feature/billing` branch.
 *
 * @param dir - Existing directory to populate.
 */
export function createCoverageAuditFixture(dir: string): void {
  // Create a Node.js project WITH test framework but coverage gaps
  fs.writeFileSync(path.join(dir, 'package.json'), JSON.stringify({
    name: 'test-coverage-app',
    version: '1.0.0',
    type: 'module',
    scripts: { test: 'echo "no tests yet"' },
    devDependencies: { vitest: '^1.0.0' },
  }, null, 2));

  // Create vitest config
  fs.writeFileSync(path.join(dir, 'vitest.config.ts'),
    `import { defineConfig } from 'vitest/config';\nexport default defineConfig({ test: {} });\n`);

  fs.writeFileSync(path.join(dir, 'VERSION'), '0.1.0.0\n');
  fs.writeFileSync(path.join(dir, 'CHANGELOG.md'), '# Changelog\n');

  // Create source file with multiple code paths (the error branches are the
  // intended coverage gaps).
  fs.mkdirSync(path.join(dir, 'src'), { recursive: true });
  fs.writeFileSync(path.join(dir, 'src', 'billing.ts'), `
export function processPayment(amount: number, currency: string) {
if (amount <= 0) throw new Error('Invalid amount');
if (currency !== 'USD' && currency !== 'EUR') throw new Error('Unsupported currency');
return { status: 'success', amount, currency };
}
export function refundPayment(paymentId: string, reason: string) {
if (!paymentId) throw new Error('Payment ID required');
if (!reason) throw new Error('Reason required');
return { status: 'refunded', paymentId, reason };
}
`);

  // Create a test directory with ONE test (partial coverage)
  fs.mkdirSync(path.join(dir, 'test'), { recursive: true });
  fs.writeFileSync(path.join(dir, 'test', 'billing.test.ts'), `
import { describe, test, expect } from 'vitest';
import { processPayment } from '../src/billing';
describe('processPayment', () => {
test('processes valid payment', () => {
const result = processPayment(100, 'USD');
expect(result.status).toBe('success');
});
// GAP: no test for invalid amount
// GAP: no test for unsupported currency
// GAP: refundPayment not tested at all
});
`);

  // Init git repo with main branch. Git failures were previously swallowed
  // (result ignored), which produced a silently broken fixture; surface them
  // as stderr warnings while remaining best-effort (files are already written).
  const run = (cmd: string, args: string[]) => {
    const result = spawnSync(cmd, args, { cwd: dir, stdio: 'pipe', timeout: 5000 });
    if (result.error || result.status !== 0) {
      const detail = result.error
        ? result.error.message
        : (result.stderr ? result.stderr.toString().trim() : `exit ${result.status}`);
      process.stderr.write(
        `coverage-audit fixture: '${cmd} ${args.join(' ')}' failed: ${detail}\n`);
    }
    return result;
  };
  run('git', ['init', '-b', 'main']);
  run('git', ['config', 'user.email', 'test@test.com']);
  run('git', ['config', 'user.name', 'Test']);
  run('git', ['add', '.']);
  run('git', ['commit', '-m', 'initial commit']);
  // Create feature branch
  run('git', ['checkout', '-b', 'feature/billing']);
}
+16 -6
View File
@@ -13,11 +13,12 @@
* Skips gracefully when prerequisites are not met.
*/
import { describe, test, expect, afterAll } from 'bun:test';
import { describe, test, expect, beforeAll, afterAll } from 'bun:test';
import { runGeminiSkill } from './helpers/gemini-session-runner';
import type { GeminiResult } from './helpers/gemini-session-runner';
import { EvalCollector } from './helpers/eval-store';
import { selectTests, detectBaseBranch, getChangedFiles, GLOBAL_TOUCHFILES } from './helpers/touchfiles';
import { createTestWorktree, harvestAndCleanup } from './helpers/e2e-helpers';
import * as path from 'path';
const ROOT = path.resolve(import.meta.dir, '..');
@@ -76,7 +77,7 @@ if (evalsEnabled && !process.env.EVALS_ALL) {
/** Skip an individual test if not selected by diff-based selection. */
function testIfSelected(testName: string, fn: () => Promise<void>, timeout: number) {
const shouldRun = selectedTests === null || selectedTests.includes(testName);
(shouldRun ? test : test.skip)(testName, fn, timeout);
(shouldRun ? test.concurrent : test.skip)(testName, fn, timeout);
}
// --- Eval result collector ---
@@ -114,13 +115,22 @@ afterAll(async () => {
// --- Tests ---
describeGemini('Gemini E2E', () => {
let testWorktree: string;
beforeAll(() => {
testWorktree = createTestWorktree('gemini');
});
afterAll(() => {
harvestAndCleanup('gemini');
});
testIfSelected('gemini-discover-skill', async () => {
// Run Gemini in the repo root where .agents/skills/ exists
// Run Gemini in an isolated worktree (has .agents/skills/ copied from ROOT)
const result = await runGeminiSkill({
prompt: 'List any skills or instructions you have available. Just list the names.',
timeoutMs: 60_000,
cwd: ROOT,
cwd: testWorktree,
});
logGeminiCost('gemini-discover-skill', result);
@@ -139,11 +149,11 @@ describeGemini('Gemini E2E', () => {
}, 120_000);
testIfSelected('gemini-review-findings', async () => {
// Run gstack-review skill via Gemini on this repo
// Run gstack-review skill via Gemini on worktree (isolated from main working tree)
const result = await runGeminiSkill({
prompt: 'Run the gstack-review skill on this repository. Review the current branch diff and report your findings.',
timeoutMs: 540_000,
cwd: ROOT,
cwd: testWorktree,
});
logGeminiCost('gemini-review-findings', result);
File diff suppressed because it is too large Load Diff
+187
View File
@@ -0,0 +1,187 @@
import { describe, test, expect, beforeEach, afterEach } from "bun:test";
import { mkdtempSync, mkdirSync, writeFileSync, rmSync, existsSync } from "fs";
import { join } from "path";
import { tmpdir } from "os";
import { spawnSync } from "child_process";
// Import normalizeRemoteUrl for unit testing
// We test the script end-to-end via CLI and normalizeRemoteUrl via import
const scriptPath = join(import.meta.dir, "..", "bin", "gstack-global-discover.ts");
/**
 * Tests for the gstack-global-discover CLI: unit tests for the exported
 * normalizeRemoteUrl helper plus end-to-end CLI invocations via `bun run`.
 * NOTE(review): CLI tests scan the real machine's session data, so result
 * counts vary per host — assertions check structure, not specific values.
 */
describe("gstack-global-discover", () => {
  describe("normalizeRemoteUrl", () => {
    // Dynamically import to test the exported function
    let normalizeRemoteUrl: (url: string) => string;
    beforeEach(async () => {
      // Re-imported before each test; module caching makes repeats cheap.
      const mod = await import("../bin/gstack-global-discover.ts");
      normalizeRemoteUrl = mod.normalizeRemoteUrl;
    });
    test("strips .git suffix", () => {
      expect(normalizeRemoteUrl("https://github.com/user/repo.git")).toBe(
        "https://github.com/user/repo"
      );
    });
    test("converts SSH to HTTPS", () => {
      expect(normalizeRemoteUrl("git@github.com:user/repo.git")).toBe(
        "https://github.com/user/repo"
      );
    });
    test("converts SSH without .git to HTTPS", () => {
      expect(normalizeRemoteUrl("git@github.com:user/repo")).toBe(
        "https://github.com/user/repo"
      );
    });
    test("lowercases host", () => {
      expect(normalizeRemoteUrl("https://GitHub.COM/user/repo")).toBe(
        "https://github.com/user/repo"
      );
    });
    // Core dedup invariant: every remote spelling of one repo → one key.
    test("SSH and HTTPS for same repo normalize to same URL", () => {
      const ssh = normalizeRemoteUrl("git@github.com:garrytan/gstack.git");
      const https = normalizeRemoteUrl("https://github.com/garrytan/gstack.git");
      const httpsNoDotGit = normalizeRemoteUrl("https://github.com/garrytan/gstack");
      expect(ssh).toBe(https);
      expect(https).toBe(httpsNoDotGit);
    });
    test("handles local: URLs consistently", () => {
      const result = normalizeRemoteUrl("local:/tmp/my-repo");
      // local: gets parsed as a URL scheme — the important thing is consistency
      expect(result).toContain("/tmp/my-repo");
    });
    test("handles GitLab SSH URLs", () => {
      expect(normalizeRemoteUrl("git@gitlab.com:org/project.git")).toBe(
        "https://gitlab.com/org/project"
      );
    });
  });

  describe("CLI", () => {
    test("--help exits 0 and prints usage", () => {
      const result = spawnSync("bun", ["run", scriptPath, "--help"], {
        encoding: "utf-8",
        timeout: 10000,
      });
      expect(result.status).toBe(0);
      // Usage text is written to stderr (per the assertion below).
      expect(result.stderr).toContain("--since");
    });
    test("no args exits 1 with error", () => {
      const result = spawnSync("bun", ["run", scriptPath], {
        encoding: "utf-8",
        timeout: 10000,
      });
      expect(result.status).toBe(1);
      expect(result.stderr).toContain("--since is required");
    });
    test("invalid window format exits 1", () => {
      const result = spawnSync("bun", ["run", scriptPath, "--since", "abc"], {
        encoding: "utf-8",
        timeout: 10000,
      });
      expect(result.status).toBe(1);
      expect(result.stderr).toContain("Invalid window format");
    });
    test("--since 7d produces valid JSON", () => {
      const result = spawnSync(
        "bun",
        ["run", scriptPath, "--since", "7d", "--format", "json"],
        { encoding: "utf-8", timeout: 30000 }
      );
      expect(result.status).toBe(0);
      // JSON.parse throwing here fails the test — output must be pure JSON.
      const json = JSON.parse(result.stdout);
      expect(json).toHaveProperty("window", "7d");
      expect(json).toHaveProperty("repos");
      expect(json).toHaveProperty("total_sessions");
      expect(json).toHaveProperty("total_repos");
      expect(json).toHaveProperty("tools");
      expect(Array.isArray(json.repos)).toBe(true);
    });
    test("--since 7d --format summary produces readable output", () => {
      const result = spawnSync(
        "bun",
        ["run", scriptPath, "--since", "7d", "--format", "summary"],
        { encoding: "utf-8", timeout: 30000 }
      );
      expect(result.status).toBe(0);
      expect(result.stdout).toContain("Window: 7d");
      expect(result.stdout).toContain("Sessions:");
      expect(result.stdout).toContain("Repos:");
    });
    test("--since 1h returns results (may be empty)", () => {
      const result = spawnSync(
        "bun",
        ["run", scriptPath, "--since", "1h", "--format", "json"],
        { encoding: "utf-8", timeout: 30000 }
      );
      expect(result.status).toBe(0);
      const json = JSON.parse(result.stdout);
      expect(json.total_sessions).toBeGreaterThanOrEqual(0);
    });
  });

  describe("discovery output structure", () => {
    // NOTE(review): if the host has no sessions in the window, json.repos is
    // empty and these loops pass vacuously — acceptable for a host-dependent
    // integration test.
    test("repos have required fields", () => {
      const result = spawnSync(
        "bun",
        ["run", scriptPath, "--since", "30d", "--format", "json"],
        { encoding: "utf-8", timeout: 30000 }
      );
      expect(result.status).toBe(0);
      const json = JSON.parse(result.stdout);
      for (const repo of json.repos) {
        expect(repo).toHaveProperty("name");
        expect(repo).toHaveProperty("remote");
        expect(repo).toHaveProperty("paths");
        expect(repo).toHaveProperty("sessions");
        expect(Array.isArray(repo.paths)).toBe(true);
        expect(repo.paths.length).toBeGreaterThan(0);
        expect(repo.sessions).toHaveProperty("claude_code");
        expect(repo.sessions).toHaveProperty("codex");
        expect(repo.sessions).toHaveProperty("gemini");
      }
    });
    test("tools summary matches repo data", () => {
      const result = spawnSync(
        "bun",
        ["run", scriptPath, "--since", "30d", "--format", "json"],
        { encoding: "utf-8", timeout: 30000 }
      );
      const json = JSON.parse(result.stdout);
      // Total sessions should equal sum across tools
      const toolTotal =
        json.tools.claude_code.total_sessions +
        json.tools.codex.total_sessions +
        json.tools.gemini.total_sessions;
      expect(json.total_sessions).toBe(toolTotal);
    });
    test("deduplicates Conductor workspaces by remote", () => {
      const result = spawnSync(
        "bun",
        ["run", scriptPath, "--since", "30d", "--format", "json"],
        { encoding: "utf-8", timeout: 30000 }
      );
      const json = JSON.parse(result.stdout);
      // Check that no two repos share the same normalized remote
      const remotes = json.repos.map((r: any) => r.remote);
      const uniqueRemotes = new Set(remotes);
      expect(remotes.length).toBe(uniqueRemotes.size);
    });
  });
});
+12 -1
View File
@@ -27,6 +27,7 @@ export interface CodexResult {
durationMs: number; // Wall clock time
sessionId: string | null; // Thread ID for session continuity
rawLines: string[]; // Raw JSONL lines for debugging
stderr: string; // Stderr output (skill loading errors, auth failures)
}
// --- JSONL parser (ported from Python in codex/SKILL.md.tmpl) ---
@@ -98,7 +99,8 @@ export function parseCodexJSONL(lines: string[]): ParsedCodexJSONL {
/**
* Install a SKILL.md into a temp HOME directory for Codex to discover.
* Creates ~/.codex/skills/{skillName}/SKILL.md in the temp HOME.
* Creates ~/.codex/skills/{skillName}/SKILL.md in the temp HOME and copies
* agents/openai.yaml when present so Codex sees the same metadata as a real install.
*
* Returns the temp HOME path. Caller is responsible for cleanup.
*/
@@ -116,6 +118,13 @@ export function installSkillToTempHome(
fs.copyFileSync(srcSkill, path.join(destDir, 'SKILL.md'));
}
const srcOpenAIYaml = path.join(skillDir, 'agents', 'openai.yaml');
if (fs.existsSync(srcOpenAIYaml)) {
const destAgentsDir = path.join(destDir, 'agents');
fs.mkdirSync(destAgentsDir, { recursive: true });
fs.copyFileSync(srcOpenAIYaml, path.join(destAgentsDir, 'openai.yaml'));
}
return home;
}
@@ -159,6 +168,7 @@ export async function runCodexSkill(opts: {
durationMs: Date.now() - startTime,
sessionId: null,
rawLines: [],
stderr: '',
};
}
@@ -274,6 +284,7 @@ export async function runCodexSkill(opts: {
durationMs,
sessionId: parsed.sessionId,
rawLines: collectedLines,
stderr,
};
} finally {
// Clean up temp HOME
+71 -16
View File
@@ -5,11 +5,13 @@
* tests across multiple files by category.
*/
import { describe, test, afterAll } from 'bun:test';
import { describe, test, beforeAll, afterAll } from 'bun:test';
import type { SkillTestResult } from './session-runner';
import { EvalCollector, judgePassed } from './eval-store';
import type { EvalTestEntry } from './eval-store';
import { selectTests, detectBaseBranch, getChangedFiles, E2E_TOUCHFILES, GLOBAL_TOUCHFILES } from './touchfiles';
import { selectTests, detectBaseBranch, getChangedFiles, E2E_TOUCHFILES, E2E_TIERS, GLOBAL_TOUCHFILES } from './touchfiles';
import { WorktreeManager } from '../../lib/worktree';
import type { HarvestResult } from '../../lib/worktree';
import { spawnSync } from 'child_process';
import * as fs from 'fs';
import * as path from 'path';
@@ -30,13 +32,6 @@ export const evalsEnabled = !!process.env.EVALS;
// Set EVALS_ALL=1 to force all tests. Set EVALS_BASE to override base branch.
export let selectedTests: string[] | null = null; // null = run all
// EVALS_FAST: skip the 8 slowest tests (all Opus quality tests) for quick feedback
const FAST_EXCLUDED_TESTS = [
'plan-ceo-review-selective', 'plan-ceo-review', 'retro', 'retro-base-branch',
'design-consultation-core', 'design-consultation-existing',
'qa-fix-loop', 'design-review-fix',
];
if (evalsEnabled && !process.env.EVALS_ALL) {
const baseBranch = process.env.EVALS_BASE
|| detectBaseBranch(ROOT)
@@ -55,15 +50,22 @@ if (evalsEnabled && !process.env.EVALS_ALL) {
// If changedFiles is empty (e.g., on main branch), selectedTests stays null → run all
}
// Apply EVALS_FAST filter after diff-based selection
if (evalsEnabled && process.env.EVALS_FAST) {
// EVALS_TIER: filter tests by tier after diff-based selection.
// 'gate' = gate tests only (CI default — blocks merge)
// 'periodic' = periodic tests only (weekly cron / manual)
// not set = run all selected tests (local dev default, backward compat)
if (evalsEnabled && process.env.EVALS_TIER) {
const tier = process.env.EVALS_TIER as 'gate' | 'periodic';
const tierTests = Object.entries(E2E_TIERS)
.filter(([, t]) => t === tier)
.map(([name]) => name);
if (selectedTests === null) {
// Run all minus excluded
selectedTests = Object.keys(E2E_TOUCHFILES).filter(t => !FAST_EXCLUDED_TESTS.includes(t));
selectedTests = tierTests;
} else {
selectedTests = selectedTests.filter(t => !FAST_EXCLUDED_TESTS.includes(t));
selectedTests = selectedTests.filter(t => tierTests.includes(t));
}
process.stderr.write(`EVALS_FAST: excluded ${FAST_EXCLUDED_TESTS.length} slow tests, running ${selectedTests.length}\n\n`);
process.stderr.write(`EVALS_TIER=${tier}: ${selectedTests.length} tests\n\n`);
}
export const describeE2E = evalsEnabled ? describe : describe.skip;
@@ -205,7 +207,7 @@ export async function finalizeEvalCollector(evalCollector: EvalCollector | null)
if (evalsEnabled) {
const gstackDir = path.join(os.homedir(), '.gstack');
fs.mkdirSync(gstackDir, { recursive: true });
for (const f of ['.completeness-intro-seen', '.telemetry-prompted']) {
for (const f of ['.completeness-intro-seen', '.telemetry-prompted', '.proactive-prompted']) {
const p = path.join(gstackDir, f);
if (!fs.existsSync(p)) fs.writeFileSync(p, '');
}
@@ -234,6 +236,59 @@ export function testConcurrentIfSelected(testName: string, fn: () => Promise<voi
(shouldRun ? test.concurrent : test.skip)(testName, fn, timeout);
}
// --- Worktree isolation ---
// Lazily-created process-wide WorktreeManager singleton.
let worktreeManager: WorktreeManager | null = null;

/**
 * Return the shared WorktreeManager, creating it on first use.
 * Stale worktrees from earlier runs are pruned once, at creation time.
 */
export function getWorktreeManager(): WorktreeManager {
  if (worktreeManager === null) {
    const manager = new WorktreeManager();
    manager.pruneStale();
    worktreeManager = manager;
  }
  return worktreeManager;
}
/** Provision an isolated worktree for `testName` and return its path. */
export function createTestWorktree(testName: string): string {
  const manager = getWorktreeManager();
  return manager.create(testName);
}
/**
 * Harvest any changes made in the test's worktree, log a short report to
 * stderr, then remove the worktree. Call from afterAll().
 *
 * @returns The HarvestResult for eval integration, or null if nothing to harvest.
 */
export function harvestAndCleanup(testName: string): HarvestResult | null {
  const manager = getWorktreeManager();
  const harvested = manager.harvest(testName);

  if (harvested) {
    // Duplicate patches get a one-line note; real harvests get a summary.
    const report = harvested.isDuplicate
      ? [`\n HARVEST [${testName}]: duplicate patch (skipped)\n`]
      : [
          `\n HARVEST [${testName}]: ${harvested.changedFiles.length} files changed\n`,
          ` Patch: ${harvested.patchPath}\n`,
          ` ${harvested.diffStat}\n\n`,
        ];
    for (const line of report) process.stderr.write(line);
  }

  manager.cleanup(testName);
  return harvested;
}
/**
 * Describe block with automatic worktree isolation and harvesting: creates
 * an isolated worktree in beforeAll, harvests + cleans up in afterAll, and
 * hands the body a getter for the worktree path.
 *
 * Gives any test file real repo context instead of a tmpdir. Do NOT use for
 * tests with planted-bug fixtures — those need their own fixture repos.
 */
export function describeWithWorktree(
  name: string,
  testNames: string[],
  fn: (getWorktreePath: () => string) => void,
) {
  describeIfSelected(name, testNames, () => {
    let isolatedPath: string;
    beforeAll(() => {
      isolatedPath = createTestWorktree(name);
    });
    afterAll(() => {
      harvestAndCleanup(name);
    });
    fn(() => isolatedPath);
  });
}
export { judgePassed } from './eval-store';
export { EvalCollector } from './eval-store';
export type { EvalTestEntry } from './eval-store';
export type { HarvestResult } from '../../lib/worktree';
+34 -2
View File
@@ -2,7 +2,7 @@
* Eval result persistence and comparison.
*
* EvalCollector accumulates test results, writes them to
* ~/.gstack-dev/evals/{version}-{branch}-{tier}-{timestamp}.json,
* ~/.gstack/projects/$SLUG/evals/{version}-{branch}-{tier}-{timestamp}.json,
* prints a summary table, and auto-compares with the previous run.
*
* Comparison functions are exported for reuse by the eval:compare CLI.
@@ -16,7 +16,32 @@ import { getGitInfo as getGitInfoShared, getVersion as getVersionShared } from '
import type { CostEntry } from '../../lib/eval-format';
const SCHEMA_VERSION = 1;
const DEFAULT_EVAL_DIR = path.join(os.homedir(), '.gstack-dev', 'evals');
const LEGACY_EVAL_DIR = path.join(os.homedir(), '.gstack-dev', 'evals');
/**
 * Detect project-scoped eval dir via gstack-slug.
 * Falls back to legacy ~/.gstack-dev/evals/ if slug detection fails.
 *
 * Shells out to the gstack-slug binary (repo-local install first, then the
 * global ~/.claude install) and parses a `SLUG=<value>` line from its output.
 * On success, ensures ~/.gstack/projects/<slug>/evals exists and returns it.
 * NOTE(review): spawns bash, so this path is POSIX-only; on failure of any
 * kind (missing binary, timeout, no SLUG line) it silently falls back.
 */
export function getProjectEvalDir(): string {
try {
// Try repo-local gstack-slug first, then global install
const localSlug = spawnSync('bash', ['-c', '.claude/skills/gstack/bin/gstack-slug 2>/dev/null || ~/.claude/skills/gstack/bin/gstack-slug 2>/dev/null'], {
stdio: 'pipe', timeout: 3000,
});
const output = localSlug.stdout?.toString().trim();
if (output) {
// Multiline match: the binary may print other KEY=VALUE lines too.
const slugMatch = output.match(/^SLUG=(.+)$/m);
if (slugMatch && slugMatch[1]) {
const dir = path.join(os.homedir(), '.gstack', 'projects', slugMatch[1], 'evals');
// Create eagerly so callers can write without checking existence.
fs.mkdirSync(dir, { recursive: true });
return dir;
}
}
} catch { /* fall through */ }
return LEGACY_EVAL_DIR;
}
const DEFAULT_EVAL_DIR = getProjectEvalDir();
// --- Interfaces ---
@@ -60,6 +85,13 @@ export interface EvalTestEntry {
costs?: CostEntry[];
error?: string;
// Worktree harvest data
harvest?: {
filesChanged: number;
patchPath: string;
isDuplicate: boolean;
};
}
export interface EvalResult {
+13 -7
View File
@@ -9,15 +9,23 @@
import * as fs from 'fs';
import * as path from 'path';
import * as os from 'os';
import { atomicWriteSync, sanitizeForFilename, GSTACK_DEV_DIR } from '../../lib/util';
import { getProjectEvalDir } from './eval-store';
import type { CostEntry } from '../../lib/eval-format';
import { resolveTier, tierToModel } from '../../lib/eval-tier';
const HEARTBEAT_PATH = path.join(GSTACK_DEV_DIR, 'e2e-live.json');
const GSTACK_DEV_DIR = path.join(os.homedir(), '.gstack-dev');
const HEARTBEAT_PATH = path.join(GSTACK_DEV_DIR, 'e2e-live.json'); // heartbeat stays global
const PROJECT_DIR = path.dirname(getProjectEvalDir()); // ~/.gstack/projects/$SLUG/
/** Sanitize test name for use as filename: strip leading slashes, replace / with - */
export function sanitizeTestName(name: string): string {
return sanitizeForFilename(name);
return name.replace(/^\/+/, '').replace(/\//g, '-');
}
/**
 * Atomic write: write to .tmp then rename. Non-fatal on error.
 *
 * The rename makes readers see either the old or the new content, never a
 * partial write. Errors (missing parent dir, permissions) are swallowed —
 * the previous body claimed "non-fatal" but had no try/catch, so a failed
 * write would throw into the caller; now the contract matches the comment.
 */
function atomicWriteSync(filePath: string, data: string): void {
  try {
    const tmp = filePath + '.tmp';
    fs.writeFileSync(tmp, data);
    fs.renameSync(tmp, filePath);
  } catch {
    /* non-fatal: heartbeat/run artifacts are best-effort */
  }
}
export interface CostEstimate {
@@ -140,15 +148,13 @@ export async function runSkillTest(options: {
const safeName = testName ? sanitizeTestName(testName) : null;
if (runId) {
try {
runDir = path.join(GSTACK_DEV_DIR, 'e2e-runs', runId);
runDir = path.join(PROJECT_DIR, 'e2e-runs', runId);
fs.mkdirSync(runDir, { recursive: true });
} catch { /* non-fatal */ }
}
// Spawn claude -p with streaming NDJSON output. Prompt piped via stdin to
// avoid shell escaping issues. --verbose is required for stream-json mode.
// Model pinned via EVAL_TIER env var (default: sonnet).
const evalModel = tierToModel(resolveTier());
const args = [
'-p',
'--model', model,
+211 -46
View File
@@ -32,25 +32,25 @@ export function matchGlob(file: string, pattern: string): boolean {
* Each test lists the file patterns that, if changed, require the test to run.
*/
export const E2E_TOUCHFILES: Record<string, string[]> = {
// Browse core
'browse-basic': ['browse/src/**'],
'browse-snapshot': ['browse/src/**'],
// Browse core (+ test-server dependency)
'browse-basic': ['browse/src/**', 'browse/test/test-server.ts'],
'browse-snapshot': ['browse/src/**', 'browse/test/test-server.ts'],
// SKILL.md setup + preamble (depend on ROOT SKILL.md only)
'skillmd-setup-discovery': ['SKILL.md', 'SKILL.md.tmpl'],
'skillmd-no-local-binary': ['SKILL.md', 'SKILL.md.tmpl'],
'skillmd-outside-git': ['SKILL.md', 'SKILL.md.tmpl'],
// SKILL.md setup + preamble (depend on ROOT SKILL.md + gen-skill-docs)
'skillmd-setup-discovery': ['SKILL.md', 'SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'],
'skillmd-no-local-binary': ['SKILL.md', 'SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'],
'skillmd-outside-git': ['SKILL.md', 'SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'],
'contributor-mode': ['SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'],
'session-awareness': ['SKILL.md', 'SKILL.md.tmpl'],
'session-awareness': ['SKILL.md', 'SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'],
// QA
'qa-quick': ['qa/**', 'browse/src/**'],
'qa-b6-static': ['qa/**', 'browse/src/**', 'browse/test/fixtures/qa-eval.html', 'test/fixtures/qa-eval-ground-truth.json'],
'qa-b7-spa': ['qa/**', 'browse/src/**', 'browse/test/fixtures/qa-eval-spa.html', 'test/fixtures/qa-eval-spa-ground-truth.json'],
'qa-b8-checkout': ['qa/**', 'browse/src/**', 'browse/test/fixtures/qa-eval-checkout.html', 'test/fixtures/qa-eval-checkout-ground-truth.json'],
// QA (+ test-server dependency)
'qa-quick': ['qa/**', 'browse/src/**', 'browse/test/test-server.ts'],
'qa-b6-static': ['qa/**', 'browse/src/**', 'browse/test/test-server.ts', 'test/helpers/llm-judge.ts', 'browse/test/fixtures/qa-eval.html', 'test/fixtures/qa-eval-ground-truth.json'],
'qa-b7-spa': ['qa/**', 'browse/src/**', 'browse/test/test-server.ts', 'test/helpers/llm-judge.ts', 'browse/test/fixtures/qa-eval-spa.html', 'test/fixtures/qa-eval-spa-ground-truth.json'],
'qa-b8-checkout': ['qa/**', 'browse/src/**', 'browse/test/test-server.ts', 'test/helpers/llm-judge.ts', 'browse/test/fixtures/qa-eval-checkout.html', 'test/fixtures/qa-eval-checkout-ground-truth.json'],
'qa-only-no-fix': ['qa-only/**', 'qa/templates/**'],
'qa-fix-loop': ['qa/**', 'browse/src/**'],
'qa-fix-loop': ['qa/**', 'browse/src/**', 'browse/test/test-server.ts'],
'qa-bootstrap': ['qa/**', 'ship/**'],
// Review
@@ -68,58 +68,94 @@ export const E2E_TOUCHFILES: Record<string, string[]> = {
'plan-ceo-review-benefits': ['plan-ceo-review/**', 'scripts/gen-skill-docs.ts'],
'plan-eng-review': ['plan-eng-review/**'],
'plan-eng-review-artifact': ['plan-eng-review/**'],
'plan-review-report': ['plan-eng-review/**', 'scripts/gen-skill-docs.ts'],
// Codex offering verification
'codex-offered-office-hours': ['office-hours/**', 'scripts/gen-skill-docs.ts'],
'codex-offered-ceo-review': ['plan-ceo-review/**', 'scripts/gen-skill-docs.ts'],
'codex-offered-design-review': ['plan-design-review/**', 'scripts/gen-skill-docs.ts'],
'codex-offered-eng-review': ['plan-eng-review/**', 'scripts/gen-skill-docs.ts'],
// Ship
'ship-base-branch': ['ship/**'],
'ship-base-branch': ['ship/**', 'bin/gstack-repo-mode'],
'ship-local-workflow': ['ship/**', 'scripts/gen-skill-docs.ts'],
// Setup browser cookies
'setup-cookies-detect': ['setup-browser-cookies/**'],
'review-dashboard-via': ['ship/**', 'scripts/resolvers/review.ts', 'codex/**', 'autoplan/**', 'land-and-deploy/**'],
'ship-plan-completion': ['ship/**', 'scripts/gen-skill-docs.ts'],
'ship-plan-verification': ['ship/**', 'scripts/gen-skill-docs.ts'],
// Retro
'retro': ['retro/**'],
'retro-base-branch': ['retro/**'],
// Global discover
'global-discover': ['bin/gstack-global-discover.ts', 'test/global-discover.test.ts'],
// CSO
'cso-full-audit': ['cso/**'],
'cso-diff-mode': ['cso/**'],
'cso-infra-scope': ['cso/**'],
// Document-release
'document-release': ['document-release/**'],
// Codex (Claude E2E — tests /codex skill via Claude)
'codex-review': ['codex/**'],
// Codex E2E (tests skills via Codex CLI)
'codex-discover-skill': ['codex/**', '.agents/skills/**', 'test/helpers/codex-session-runner.ts'],
'codex-review-findings': ['review/**', '.agents/skills/gstack-review/**', 'codex/**', 'test/helpers/codex-session-runner.ts'],
// Codex E2E (tests skills via Codex CLI + worktree)
'codex-discover-skill': ['codex/**', '.agents/skills/**', 'test/helpers/codex-session-runner.ts', 'lib/worktree.ts'],
'codex-review-findings': ['review/**', '.agents/skills/gstack-review/**', 'codex/**', 'test/helpers/codex-session-runner.ts', 'lib/worktree.ts'],
// Gemini E2E (tests skills via Gemini CLI)
'gemini-discover-skill': ['.agents/skills/**', 'test/helpers/gemini-session-runner.ts'],
'gemini-review-findings': ['review/**', '.agents/skills/gstack-review/**', 'test/helpers/gemini-session-runner.ts'],
// Gemini E2E (tests skills via Gemini CLI + worktree)
'gemini-discover-skill': ['.agents/skills/**', 'test/helpers/gemini-session-runner.ts', 'lib/worktree.ts'],
'gemini-review-findings': ['review/**', '.agents/skills/gstack-review/**', 'test/helpers/gemini-session-runner.ts', 'lib/worktree.ts'],
// Ship coverage audit
'ship-coverage-audit': ['ship/**'],
// Coverage audit (shared fixture) + triage + gates
'ship-coverage-audit': ['ship/**', 'test/fixtures/coverage-audit-fixture.ts', 'bin/gstack-repo-mode'],
'review-coverage-audit': ['review/**', 'test/fixtures/coverage-audit-fixture.ts'],
'plan-eng-coverage-audit': ['plan-eng-review/**', 'test/fixtures/coverage-audit-fixture.ts'],
'ship-triage': ['ship/**', 'bin/gstack-repo-mode'],
// Plan completion audit + verification
'ship-plan-completion': ['ship/**', 'scripts/gen-skill-docs.ts'],
'ship-plan-verification': ['ship/**', 'qa-only/**', 'scripts/gen-skill-docs.ts'],
'review-plan-completion': ['review/**', 'scripts/gen-skill-docs.ts'],
// Design
'design-consultation-core': ['design-consultation/**'],
'design-consultation-existing': ['design-consultation/**'],
'design-consultation-research': ['design-consultation/**'],
'design-consultation-preview': ['design-consultation/**'],
'plan-design-review-plan-mode': ['plan-design-review/**'],
'plan-design-review-no-ui-scope': ['plan-design-review/**'],
'design-review-fix': ['design-review/**', 'browse/src/**'],
'design-consultation-core': ['design-consultation/**', 'scripts/gen-skill-docs.ts', 'test/helpers/llm-judge.ts'],
'design-consultation-existing': ['design-consultation/**', 'scripts/gen-skill-docs.ts'],
'design-consultation-research': ['design-consultation/**', 'scripts/gen-skill-docs.ts'],
'design-consultation-preview': ['design-consultation/**', 'scripts/gen-skill-docs.ts'],
'plan-design-review-plan-mode': ['plan-design-review/**', 'scripts/gen-skill-docs.ts'],
'plan-design-review-no-ui-scope': ['plan-design-review/**', 'scripts/gen-skill-docs.ts'],
'design-review-fix': ['design-review/**', 'browse/src/**', 'scripts/gen-skill-docs.ts'],
// Design Shotgun
'design-shotgun-path': ['design-shotgun/**', 'design/src/**', 'scripts/resolvers/design.ts'],
'design-shotgun-session': ['design-shotgun/**', 'scripts/resolvers/design.ts'],
'design-shotgun-full': ['design-shotgun/**', 'design/src/**', 'browse/src/**'],
// gstack-upgrade
'gstack-upgrade-happy-path': ['gstack-upgrade/**'],
// Deploy skills
'land-and-deploy-workflow': ['land-and-deploy/**', 'scripts/gen-skill-docs.ts'],
'canary-workflow': ['canary/**', 'browse/src/**'],
'benchmark-workflow': ['benchmark/**', 'browse/src/**'],
'setup-deploy-workflow': ['setup-deploy/**', 'scripts/gen-skill-docs.ts'],
'land-and-deploy-workflow': ['land-and-deploy/**', 'scripts/gen-skill-docs.ts'],
'land-and-deploy-first-run': ['land-and-deploy/**', 'scripts/gen-skill-docs.ts', 'bin/gstack-slug'],
'land-and-deploy-review-gate': ['land-and-deploy/**', 'bin/gstack-review-read'],
'canary-workflow': ['canary/**', 'browse/src/**'],
'benchmark-workflow': ['benchmark/**', 'browse/src/**'],
'setup-deploy-workflow': ['setup-deploy/**', 'scripts/gen-skill-docs.ts'],
// Sidebar agent
'sidebar-navigate': ['browse/src/server.ts', 'browse/src/sidebar-agent.ts', 'browse/src/sidebar-utils.ts', 'extension/**'],
'sidebar-url-accuracy': ['browse/src/server.ts', 'browse/src/sidebar-agent.ts', 'browse/src/sidebar-utils.ts', 'extension/background.js'],
// Autoplan
'autoplan-core': ['autoplan/**', 'plan-ceo-review/**', 'plan-eng-review/**', 'plan-design-review/**'],
// Skill routing — journey-stage tests (depend on ALL skill descriptions)
'journey-ideation': ['*/SKILL.md.tmpl', 'SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'],
'journey-plan-eng': ['*/SKILL.md.tmpl', 'SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'],
'journey-think-bigger': ['*/SKILL.md.tmpl', 'SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'],
'journey-debug': ['*/SKILL.md.tmpl', 'SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'],
'journey-qa': ['*/SKILL.md.tmpl', 'SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'],
'journey-code-review': ['*/SKILL.md.tmpl', 'SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'],
@@ -130,6 +166,133 @@ export const E2E_TOUCHFILES: Record<string, string[]> = {
'journey-visual-qa': ['*/SKILL.md.tmpl', 'SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'],
};
/**
 * E2E test tiers — 'gate' blocks PRs, 'periodic' runs weekly/on-demand.
 * Must have exactly the same keys as E2E_TOUCHFILES.
 *
 * NOTE(review): E2E_TOUCHFILES appears to define a 'journey-think-bigger'
 * entry with no counterpart here — confirm the two key sets really match.
 */
export const E2E_TIERS: Record<string, 'gate' | 'periodic'> = {
  // Browse core — gate (if browse breaks, everything breaks)
  'browse-basic': 'gate',
  'browse-snapshot': 'gate',
  // SKILL.md setup — gate (if setup breaks, no skill works)
  'skillmd-setup-discovery': 'gate',
  'skillmd-no-local-binary': 'gate',
  'skillmd-outside-git': 'gate',
  'contributor-mode': 'gate',
  'session-awareness': 'gate',
  // QA — gate for functional, periodic for quality/benchmarks
  'qa-quick': 'gate',
  'qa-b6-static': 'periodic',
  'qa-b7-spa': 'periodic',
  'qa-b8-checkout': 'periodic',
  'qa-only-no-fix': 'gate', // CRITICAL guardrail: Edit tool forbidden
  'qa-fix-loop': 'periodic',
  'qa-bootstrap': 'gate',
  // Review — gate for functional/guardrails, periodic for quality
  'review-sql-injection': 'gate', // Security guardrail
  'review-enum-completeness': 'gate',
  'review-base-branch': 'gate',
  'review-design-lite': 'periodic', // 4/7 threshold is subjective
  'review-coverage-audit': 'gate',
  'review-plan-completion': 'gate',
  'review-dashboard-via': 'gate',
  // Office Hours
  'office-hours-spec-review': 'gate',
  // Plan reviews — gate for cheap functional, periodic for Opus quality
  'plan-ceo-review': 'periodic',
  'plan-ceo-review-selective': 'periodic',
  'plan-ceo-review-benefits': 'gate',
  'plan-eng-review': 'periodic',
  'plan-eng-review-artifact': 'periodic',
  'plan-eng-coverage-audit': 'gate',
  'plan-review-report': 'gate',
  // Codex offering verification
  'codex-offered-office-hours': 'gate',
  'codex-offered-ceo-review': 'gate',
  'codex-offered-design-review': 'gate',
  'codex-offered-eng-review': 'gate',
  // Ship — gate (end-to-end ship path)
  'ship-base-branch': 'gate',
  'ship-local-workflow': 'gate',
  'ship-coverage-audit': 'gate',
  'ship-triage': 'gate',
  'ship-plan-completion': 'gate',
  'ship-plan-verification': 'gate',
  // Retro — gate for cheap branch detection, periodic for full Opus retro
  'retro': 'periodic',
  'retro-base-branch': 'gate',
  // Global discover
  'global-discover': 'gate',
  // CSO — gate for security guardrails, periodic for quality
  'cso-full-audit': 'gate', // Hardcoded secrets detection
  'cso-diff-mode': 'gate',
  'cso-infra-scope': 'periodic',
  // Document-release — gate (CHANGELOG guardrail)
  'document-release': 'gate',
  // Codex — periodic (Opus, requires codex CLI)
  'codex-review': 'periodic',
  // Multi-AI — periodic (require external CLIs)
  'codex-discover-skill': 'periodic',
  'codex-review-findings': 'periodic',
  'gemini-discover-skill': 'periodic',
  'gemini-review-findings': 'periodic',
  // Design — gate for cheap functional, periodic for Opus/quality
  'design-consultation-core': 'periodic',
  'design-consultation-existing': 'periodic',
  'design-consultation-research': 'gate',
  'design-consultation-preview': 'gate',
  'plan-design-review-plan-mode': 'periodic',
  'plan-design-review-no-ui-scope': 'gate',
  'design-review-fix': 'periodic',
  'design-shotgun-path': 'gate',
  'design-shotgun-session': 'gate',
  'design-shotgun-full': 'periodic',
  // gstack-upgrade
  'gstack-upgrade-happy-path': 'gate',
  // Deploy skills
  'land-and-deploy-workflow': 'gate',
  'land-and-deploy-first-run': 'gate',
  'land-and-deploy-review-gate': 'gate',
  'canary-workflow': 'gate',
  'benchmark-workflow': 'gate',
  'setup-deploy-workflow': 'gate',
  // Sidebar agent
  'sidebar-navigate': 'periodic',
  'sidebar-url-accuracy': 'periodic',
  // Autoplan — periodic (not yet implemented)
  'autoplan-core': 'periodic',
  // Skill routing — periodic (LLM routing is non-deterministic)
  'journey-ideation': 'periodic',
  'journey-plan-eng': 'periodic',
  'journey-debug': 'periodic',
  'journey-qa': 'periodic',
  'journey-code-review': 'periodic',
  'journey-ship': 'periodic',
  'journey-docs': 'periodic',
  'journey-retro': 'periodic',
  'journey-design-system': 'periodic',
  'journey-visual-qa': 'periodic',
};
/**
* LLM-judge test touchfiles — keyed by test description string.
*/
@@ -172,20 +335,22 @@ export const LLM_JUDGE_TOUCHFILES: Record<string, string[]> = {
'retro/SKILL.md instructions': ['retro/SKILL.md', 'retro/SKILL.md.tmpl'],
'qa-only/SKILL.md workflow': ['qa-only/SKILL.md', 'qa-only/SKILL.md.tmpl'],
'gstack-upgrade/SKILL.md upgrade flow': ['gstack-upgrade/SKILL.md', 'gstack-upgrade/SKILL.md.tmpl'],
// Voice directive
'voice directive tone': ['scripts/resolvers/preamble.ts', 'review/SKILL.md', 'review/SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'],
};
/**
 * Changes to any of these files trigger ALL tests (both E2E and LLM-judge).
 *
 * Keep this list minimal — only files that genuinely affect every test.
 * Scoped dependencies (gen-skill-docs, llm-judge, test-server, worktree,
 * codex/gemini session runners) belong in individual test entries instead.
 */
// The previous list contained merge-conflict residue: each of the three
// entries below appeared twice, alongside scoped dependencies
// (gen-skill-docs, llm-judge, test-server, codex/gemini runners) that the
// doc comment above explicitly banishes to individual test entries.
export const GLOBAL_TOUCHFILES = [
  'test/helpers/session-runner.ts', // All E2E tests use this runner
  'test/helpers/eval-store.ts', // All E2E tests store results here
  'test/helpers/touchfiles.ts', // Self-referential — reclassifying wrong is dangerous
];
// --- Base branch detection ---
+77
View File
@@ -0,0 +1,77 @@
import { describe, test, expect, beforeEach, afterEach } from 'bun:test';
import { execSync, execFileSync, ExecSyncOptionsWithStringEncoding } from 'child_process';
import * as fs from 'fs';
import * as os from 'os';
import * as path from 'path';
// Repo root and its bin/ directory (where the gstack-review-log binary lives).
const ROOT = path.resolve(import.meta.dir, '..');
const BIN = path.join(ROOT, 'bin');
// Per-test scratch GSTACK_HOME and its projects/ dir — created in beforeEach,
// removed in afterEach.
let tmpDir: string;
let slugDir: string;
/**
 * Invoke bin/gstack-review-log with `input` as its single argument.
 *
 * @param input - Raw payload forwarded to the binary (typically a JSON string).
 * @param opts.expectFail - When true, a non-zero exit is expected: the error is
 *   captured and returned instead of thrown.
 * @returns The trimmed stdout and the process exit code (0 on success; on an
 *   expected failure, stderr text and the failing status — 1 if unavailable).
 * @throws Rethrows the execFileSync error when the binary fails unexpectedly.
 */
function run(input: string, opts: { expectFail?: boolean } = {}): { stdout: string; exitCode: number } {
  const execOpts: ExecSyncOptionsWithStringEncoding = {
    cwd: ROOT,
    env: { ...process.env, GSTACK_HOME: tmpDir },
    encoding: 'utf-8',
    timeout: 10000,
  };
  try {
    // execFileSync passes `input` as a single argv entry with no shell in
    // between, so apostrophes, backticks, `$(...)`, newlines, etc. need no
    // escaping. (The previous version hand-built a single-quoted shell string
    // for execSync — an injection-prone idiom that is easy to get subtly wrong.)
    const stdout = execFileSync(`${BIN}/gstack-review-log`, [input], execOpts).trim();
    return { stdout, exitCode: 0 };
  } catch (e: any) {
    if (opts.expectFail) {
      // `status` is null when the process was killed by a signal — report 1.
      return { stdout: e.stderr?.toString() || '', exitCode: e.status || 1 };
    }
    throw e;
  }
}
// Give every test a pristine GSTACK_HOME so review logs never leak between tests.
beforeEach(() => {
  tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'gstack-revlog-'));
  // gstack-review-log uses gstack-slug which needs a git repo — create the projects dir
  // with a predictable slug by pre-creating the directory structure
  slugDir = path.join(tmpDir, 'projects');
  fs.mkdirSync(slugDir, { recursive: true });
});
// Remove the scratch GSTACK_HOME; force avoids failures if it is already gone.
afterEach(() => {
  fs.rmSync(tmpDir, { recursive: true, force: true });
});
describe('gstack-review-log', () => {
  test('appends valid JSON to review JSONL file', () => {
    const payload = '{"skill":"plan-eng-review","status":"clean"}';
    const { exitCode } = run(payload);
    expect(exitCode).toBe(0);
    // Locate the JSONL file the binary wrote under <GSTACK_HOME>/projects/<slug>/.
    const slugs = fs.readdirSync(slugDir);
    expect(slugs.length).toBeGreaterThan(0);
    const projectDir = path.join(slugDir, slugs[0]);
    const logs = fs.readdirSync(projectDir).filter((f) => f.endsWith('.jsonl'));
    expect(logs.length).toBeGreaterThan(0);
    // The appended line must round-trip as the JSON we sent in.
    const entry = JSON.parse(fs.readFileSync(path.join(projectDir, logs[0]), 'utf-8').trim());
    expect(entry.skill).toBe('plan-eng-review');
    expect(entry.status).toBe('clean');
  });
  test('rejects non-JSON input with non-zero exit code', () => {
    const { exitCode } = run('not json at all', { expectFail: true });
    expect(exitCode).not.toBe(0);
    // Verify nothing was written. Each lookup step may legitimately find
    // nothing (rejection can happen before any directory is created), so
    // bail out early instead of asserting presence.
    const slugs = fs.readdirSync(slugDir);
    if (slugs.length === 0) return;
    const projectDir = path.join(slugDir, slugs[0]);
    const logs = fs.readdirSync(projectDir).filter((f) => f.endsWith('.jsonl'));
    if (logs.length === 0) return;
    expect(fs.readFileSync(path.join(projectDir, logs[0]), 'utf-8').trim()).toBe('');
  });
});
@@ -25,7 +25,11 @@ describeIfSelected('Skill E2E tests', [
testServer = startTestServer();
tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-'));
setupBrowseShims(tmpDir);
});
// Pre-warm the browse server so Chromium is already launched for tests.
// In CI, Chromium can take 10-20s to launch (Docker + --no-sandbox).
spawnSync(browseBin, ['goto', testServer.url], { cwd: tmpDir, timeout: 30000, stdio: 'pipe' });
}, 45_000);
afterAll(() => {
testServer?.server?.stop();
@@ -41,7 +45,7 @@ describeIfSelected('Skill E2E tests', [
4. $B screenshot /tmp/skill-e2e-test.png
Report the results of each command.`,
workingDirectory: tmpDir,
maxTurns: 10,
maxTurns: 7,
timeout: 60_000,
testName: 'browse-basic',
runId,
@@ -63,7 +67,7 @@ Report the results of each command.`,
5. $B snapshot -i -a -o /tmp/skill-e2e-annotated.png
Report what each command returned.`,
workingDirectory: tmpDir,
maxTurns: 10,
maxTurns: 7,
timeout: 60_000,
testName: 'browse-snapshot',
runId,
@@ -274,12 +278,25 @@ Remember: _SESSIONS=4, so ELI16 mode is active. The user is juggling multiple wi
expect(lower.includes('payment') || lower.includes('feature')).toBe(true);
// Must mention what we're working on
expect(lower.includes('stripe') || lower.includes('checkout') || lower.includes('payment')).toBe(true);
// Must have a RECOMMENDATION
expect(output).toContain('RECOMMENDATION');
// Must have a recommendation or structured options
expect(
output.includes('RECOMMENDATION') ||
lower.includes('recommend') ||
lower.includes('option a') ||
lower.includes('which do you want') ||
lower.includes('which approach')
).toBe(true);
} else {
// Check agent output as fallback
const output = result.output || '';
expect(output).toContain('RECOMMENDATION');
const lowerOut = output.toLowerCase();
expect(
output.includes('RECOMMENDATION') ||
lowerOut.includes('recommend') ||
lowerOut.includes('option a') ||
lowerOut.includes('which do you want') ||
lowerOut.includes('which approach')
).toBe(true);
}
// Clean up
+258
View File
@@ -0,0 +1,258 @@
import { describe, test, expect, beforeAll, afterAll } from 'bun:test';
import { runSkillTest } from './helpers/session-runner';
import {
ROOT, runId, evalsEnabled,
describeIfSelected, logCost, recordE2E,
createEvalCollector, finalizeEvalCollector,
} from './helpers/e2e-helpers';
import { spawnSync } from 'child_process';
import * as fs from 'fs';
import * as path from 'path';
import * as os from 'os';
// Shared collector: each CSO test below records its outcome here, and the
// file-level afterAll flushes the whole batch once every suite has finished.
const evalCollector = createEvalCollector('e2e-cso');
afterAll(() => {
  finalizeEvalCollector(evalCollector);
});
// --- CSO v2 E2E Tests ---
// Full daily audit: seeds a throwaway git repo with two planted
// vulnerabilities (a hardcoded API key and a git-tracked .env) and checks
// that /cso surfaces both and emits its findings table.
describeIfSelected('CSO v2 — full audit', ['cso-full-audit'], () => {
  let csoDir: string;
  beforeAll(() => {
    csoDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-cso-'));
    const run = (cmd: string, args: string[]) =>
      spawnSync(cmd, args, { cwd: csoDir, stdio: 'pipe', timeout: 5000 });
    run('git', ['init', '-b', 'main']);
    run('git', ['config', 'user.email', 'test@test.com']);
    run('git', ['config', 'user.name', 'Test']);
    // Create a minimal app with a planted vulnerability
    fs.writeFileSync(path.join(csoDir, 'package.json'), JSON.stringify({
      name: 'cso-test-app',
      version: '1.0.0',
      dependencies: { express: '4.18.0' },
    }, null, 2));
    // Planted vuln: hardcoded API key
    fs.writeFileSync(path.join(csoDir, 'server.ts'), `
import express from 'express';
const app = express();
const API_KEY = "sk-1234567890abcdef1234567890abcdef";
app.get('/api/data', (req, res) => {
const id = req.query.id;
res.json({ data: \`result for \${id}\` });
});
app.listen(3000);
`);
    // Planted vuln: .env tracked by git
    fs.writeFileSync(path.join(csoDir, '.env'), 'DATABASE_URL=postgres://admin:secretpass@prod.db.example.com:5432/myapp\n');
    run('git', ['add', '.']);
    run('git', ['commit', '-m', 'initial']);
  });
  afterAll(() => {
    try { fs.rmSync(csoDir, { recursive: true, force: true }); } catch {}
  });
  test('/cso finds planted vulnerabilities', async () => {
    const result = await runSkillTest({
      prompt: `Read the file ${path.join(ROOT, 'cso', 'SKILL.md')} for the CSO skill instructions.
Run /cso on this repo (full daily audit, no flags).
IMPORTANT:
- Do NOT use AskUserQuestion — skip any interactive prompts.
- Focus on finding the planted vulnerabilities in this small repo.
- Produce the SECURITY FINDINGS table.
- Save the report to .gstack/security-reports/.`,
      workingDirectory: csoDir,
      maxTurns: 30,
      allowedTools: ['Bash', 'Read', 'Write', 'Edit', 'Grep', 'Glob', 'Agent'],
      timeout: 300_000,
    });
    logCost('cso', result);
    expect(result.exitReason).toBe('success');
    // Should detect hardcoded API key
    const output = result.output.toLowerCase();
    expect(
      output.includes('sk-') || output.includes('hardcoded') || output.includes('api key') || output.includes('api_key')
    ).toBe(true);
    // Should detect .env tracked by git
    expect(
      output.includes('.env') && (output.includes('tracked') || output.includes('gitignore'))
    ).toBe(true);
    // Should produce a findings table. `output` is lowercased above, so only
    // the lowercase literal can ever match — the previous check also tested
    // 'SECURITY FINDINGS', a branch that was provably dead.
    expect(output.includes('security findings')).toBe(true);
    // Should save a report — deliberately best-effort: the count is only
    // asserted when the directory exists at all, since agent runs are
    // non-deterministic about where (or whether) they persist the report.
    const reportDir = path.join(csoDir, '.gstack', 'security-reports');
    const reportExists = fs.existsSync(reportDir);
    if (reportExists) {
      const reports = fs.readdirSync(reportDir).filter(f => f.endsWith('.json'));
      expect(reports.length).toBeGreaterThanOrEqual(1);
    }
    recordE2E(evalCollector, 'cso-full-audit', 'e2e-cso', result);
  }, 300_000);
});
// Diff-scoped audit: clean `main`, plus a feature branch that adds a webhook
// endpoint with a planted flaw (no Stripe signature verification). /cso --diff
// should analyze only the branch delta and flag the new file.
describeIfSelected('CSO v2 — diff mode', ['cso-diff-mode'], () => {
  let csoDiffDir: string;
  beforeAll(() => {
    csoDiffDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-cso-diff-'));
    const run = (cmd: string, args: string[]) =>
      spawnSync(cmd, args, { cwd: csoDiffDir, stdio: 'pipe', timeout: 5000 });
    run('git', ['init', '-b', 'main']);
    run('git', ['config', 'user.email', 'test@test.com']);
    run('git', ['config', 'user.name', 'Test']);
    // Clean initial commit
    fs.writeFileSync(path.join(csoDiffDir, 'package.json'), JSON.stringify({
      name: 'cso-diff-test', version: '1.0.0',
    }, null, 2));
    fs.writeFileSync(path.join(csoDiffDir, 'app.ts'), 'console.log("hello");\n');
    run('git', ['add', '.']);
    run('git', ['commit', '-m', 'initial']);
    // Feature branch with a vuln
    run('git', ['checkout', '-b', 'feat/add-webhook']);
    fs.writeFileSync(path.join(csoDiffDir, 'webhook.ts'), `
import express from 'express';
const app = express();
// No signature verification!
app.post('/webhook/stripe', (req, res) => {
const event = req.body;
processPayment(event);
res.sendStatus(200);
});
`);
    run('git', ['add', '.']);
    run('git', ['commit', '-m', 'feat: add webhook']);
  });
  afterAll(() => {
    try { fs.rmSync(csoDiffDir, { recursive: true, force: true }); } catch {}
  });
  test('/cso --diff scopes to branch changes', async () => {
    const result = await runSkillTest({
      prompt: `Read the file ${path.join(ROOT, 'cso', 'SKILL.md')} for the CSO skill instructions.
Run /cso --diff on this repo. The base branch is "main".
IMPORTANT:
- Do NOT use AskUserQuestion — skip any interactive prompts.
- Focus on changes in the current branch vs main.
- The webhook.ts file was added on this branch — it should be analyzed.`,
      workingDirectory: csoDiffDir,
      maxTurns: 25,
      allowedTools: ['Bash', 'Read', 'Write', 'Edit', 'Grep', 'Glob', 'Agent'],
      timeout: 240_000,
    });
    logCost('cso', result);
    expect(result.exitReason).toBe('success');
    const output = result.output.toLowerCase();
    // Should mention webhook and missing signature verification
    expect(
      output.includes('webhook') && (output.includes('signature') || output.includes('verify'))
    ).toBe(true);
    recordE2E(evalCollector, 'cso-diff-mode', 'e2e-cso', result);
  }, 240_000);
});
// Infra-only audit: a repo whose only problems are infrastructure-level —
// an unpinned third-party GitHub Action and a Dockerfile with no USER
// directive. /cso --infra should surface these rather than OWASP code scans.
describeIfSelected('CSO v2 — infra scope', ['cso-infra-scope'], () => {
  let csoInfraDir: string;
  beforeAll(() => {
    csoInfraDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-cso-infra-'));
    const run = (cmd: string, args: string[]) =>
      spawnSync(cmd, args, { cwd: csoInfraDir, stdio: 'pipe', timeout: 5000 });
    run('git', ['init', '-b', 'main']);
    run('git', ['config', 'user.email', 'test@test.com']);
    run('git', ['config', 'user.name', 'Test']);
    // CI workflow with unpinned action
    // NOTE(review): the indentation inside this YAML template looks collapsed
    // in this rendering — confirm against the original that the workflow is
    // valid YAML (jobs/build/steps nesting).
    fs.mkdirSync(path.join(csoInfraDir, '.github', 'workflows'), { recursive: true });
    fs.writeFileSync(path.join(csoInfraDir, '.github', 'workflows', 'ci.yml'), `
name: CI
on: [push]
jobs:
build:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: some-third-party/action@main
- run: echo "Building..."
`);
    // Dockerfile running as root
    fs.writeFileSync(path.join(csoInfraDir, 'Dockerfile'), `
FROM node:20
WORKDIR /app
COPY . .
RUN npm install
EXPOSE 3000
CMD ["node", "server.js"]
`);
    run('git', ['add', '.']);
    run('git', ['commit', '-m', 'initial']);
  });
  afterAll(() => {
    try { fs.rmSync(csoInfraDir, { recursive: true, force: true }); } catch {}
  });
  test('/cso --infra runs infrastructure phases only', async () => {
    const result = await runSkillTest({
      prompt: `Read the file ${path.join(ROOT, 'cso', 'SKILL.md')} for the CSO skill instructions.
Run /cso --infra on this repo. This should run infrastructure-only phases (0-6, 12-14).
IMPORTANT:
- Do NOT use AskUserQuestion — skip any interactive prompts.
- This is a TINY repo with only 3 files: .github/workflows/ci.yml, Dockerfile, and package.json. Do NOT waste turns exploring — just read those files directly and audit them.
- The Dockerfile has no USER directive (runs as root). The CI workflow uses an unpinned third-party GitHub Action (some-third-party/action@main).
- Focus on infrastructure findings, NOT code-level OWASP scanning.
- Skip the preamble (gstack-update-check, telemetry, etc.) — go straight to the audit.
- Do NOT use the Agent tool for exploration or verification — read the files yourself. This repo is too small to need subagents.`,
      workingDirectory: csoInfraDir,
      maxTurns: 30,
      allowedTools: ['Bash', 'Read', 'Write', 'Edit', 'Grep', 'Glob'],
      timeout: 360_000,
    });
    logCost('cso', result);
    expect(result.exitReason).toBe('success');
    const output = result.output.toLowerCase();
    // Should mention unpinned action or Dockerfile issues
    expect(
      output.includes('unpinned') || output.includes('third-party') ||
      output.includes('user directive') || output.includes('root')
    ).toBe(true);
    recordE2E(evalCollector, 'cso-infra-scope', 'e2e-cso', result);
  }, 360_000);
});
+159 -4
View File
@@ -44,7 +44,7 @@ describeIfSelected('Land-and-Deploy skill E2E', ['land-and-deploy-workflow'], ()
try { fs.rmSync(landDir, { recursive: true, force: true }); } catch {}
});
test('/land-and-deploy detects Fly.io platform and produces deploy report structure', async () => {
testConcurrentIfSelected('land-and-deploy-workflow', async () => {
const result = await runSkillTest({
prompt: `Read land-and-deploy/SKILL.md for the /land-and-deploy skill instructions.
@@ -85,6 +85,161 @@ Do NOT use AskUserQuestion. Do NOT run gh or fly commands.`,
}, 180_000);
});
// --- Land-and-Deploy First-Run E2E ---
// First-run path: a Fly.io project (fly.toml present) that has never run
// /land-and-deploy — no land-deploy-confirmed marker — so Step 1.5's dry-run
// infrastructure validation should fire and write a report.
describeIfSelected('Land-and-Deploy first-run E2E', ['land-and-deploy-first-run'], () => {
  let firstRunDir: string;
  beforeAll(() => {
    firstRunDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-land-first-run-'));
    const run = (cmd: string, args: string[]) =>
      spawnSync(cmd, args, { cwd: firstRunDir, stdio: 'pipe', timeout: 5000 });
    run('git', ['init', '-b', 'main']);
    run('git', ['config', 'user.email', 'test@test.com']);
    run('git', ['config', 'user.name', 'Test']);
    fs.writeFileSync(path.join(firstRunDir, 'app.ts'), 'export function hello() { return "world"; }\n');
    // fly.toml marks the repo as a Fly.io app named first-run-app.
    fs.writeFileSync(path.join(firstRunDir, 'fly.toml'), 'app = "first-run-app"\n\n[http_service]\n internal_port = 3000\n');
    run('git', ['add', '.']);
    run('git', ['commit', '-m', 'initial']);
    // Work lands on a feature branch, as the skill expects.
    run('git', ['checkout', '-b', 'feat/first-deploy']);
    fs.writeFileSync(path.join(firstRunDir, 'app.ts'), 'export function hello() { return "first deploy"; }\n');
    run('git', ['add', '.']);
    run('git', ['commit', '-m', 'feat: first deploy']);
    // Copy the skill definition into the sandbox so the prompt can read it locally.
    copyDirSync(path.join(ROOT, 'land-and-deploy'), path.join(firstRunDir, 'land-and-deploy'));
  });
  afterAll(() => {
    try { fs.rmSync(firstRunDir, { recursive: true, force: true }); } catch {}
  });
  testConcurrentIfSelected('land-and-deploy-first-run', async () => {
    const result = await runSkillTest({
      prompt: `Read land-and-deploy/SKILL.md for the /land-and-deploy skill instructions.
You are on branch feat/first-deploy. This is the FIRST TIME running /land-and-deploy
for this project — there is NO land-deploy-confirmed file.
This repo has a fly.toml with app = "first-run-app", indicating a Fly.io deployment.
IMPORTANT: There is NO remote and NO GitHub PR — you cannot run gh commands.
Instead, simulate the Step 1.5 first-run dry-run validation:
1. Detect that this is a FIRST_RUN (no land-deploy-confirmed file)
2. Detect the deploy platform from fly.toml (Fly.io, app = first-run-app)
3. Infer the production URL (https://first-run-app.fly.dev)
4. Build the DEPLOY INFRASTRUCTURE VALIDATION table showing:
- Platform detected
- Command validation results (simulated as all passing)
- Staging detection results (none expected)
- What will happen steps
5. Write the dry-run report to .gstack/deploy-reports/dry-run-validation.md
Do NOT use AskUserQuestion. Do NOT run gh or fly commands.
Just demonstrate the first-run dry-run output.`,
      workingDirectory: firstRunDir,
      maxTurns: 20,
      allowedTools: ['Bash', 'Read', 'Write', 'Edit', 'Grep', 'Glob'],
      timeout: 120_000,
      testName: 'land-and-deploy-first-run',
      runId,
    });
    logCost('/land-and-deploy first-run', result);
    recordE2E(evalCollector, '/land-and-deploy first-run', 'Land-and-Deploy first-run E2E', result);
    expect(result.exitReason).toBe('success');
    // Verify dry-run report was created
    const reportDir = path.join(firstRunDir, '.gstack', 'deploy-reports');
    expect(fs.existsSync(reportDir)).toBe(true);
    // Check report content mentions platform detection
    const reportFiles = fs.readdirSync(reportDir);
    expect(reportFiles.length).toBeGreaterThan(0);
    const reportContent = fs.readFileSync(path.join(reportDir, reportFiles[0]), 'utf-8');
    const hasPlatform = reportContent.toLowerCase().includes('fly') || reportContent.toLowerCase().includes('first-run-app');
    expect(hasPlatform).toBe(true);
  }, 180_000);
});
// --- Land-and-Deploy Review Gate E2E ---
// Review-staleness gate: a repo with several commits but no review logs at
// all, so the Step 3.5a readiness gate should report Eng Review as NOT RUN
// and offer the Step 3.5a-bis inline review.
describeIfSelected('Land-and-Deploy review gate E2E', ['land-and-deploy-review-gate'], () => {
  let reviewDir: string;
  beforeAll(() => {
    reviewDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-land-review-'));
    const run = (cmd: string, args: string[]) =>
      spawnSync(cmd, args, { cwd: reviewDir, stdio: 'pipe', timeout: 5000 });
    run('git', ['init', '-b', 'main']);
    run('git', ['config', 'user.email', 'test@test.com']);
    run('git', ['config', 'user.name', 'Test']);
    fs.writeFileSync(path.join(reviewDir, 'app.ts'), 'export function hello() { return "world"; }\n');
    run('git', ['add', '.']);
    run('git', ['commit', '-m', 'initial']);
    // Create 6 more commits to make any review stale
    for (let i = 1; i <= 6; i++) {
      fs.writeFileSync(path.join(reviewDir, `file${i}.ts`), `export const x${i} = ${i};\n`);
      run('git', ['add', '.']);
      run('git', ['commit', '-m', `feat: add file${i}`]);
    }
    // Copy the skill definition into the sandbox so the prompt can read it locally.
    copyDirSync(path.join(ROOT, 'land-and-deploy'), path.join(reviewDir, 'land-and-deploy'));
  });
  afterAll(() => {
    try { fs.rmSync(reviewDir, { recursive: true, force: true }); } catch {}
  });
  testConcurrentIfSelected('land-and-deploy-review-gate', async () => {
    const result = await runSkillTest({
      prompt: `Read land-and-deploy/SKILL.md for the /land-and-deploy skill instructions.
Focus on Step 3.5a and Step 3.5a-bis (the review staleness check and inline review offer).
This repo has 6 commits since the initial commit. There are NO review logs
(gstack-review-read would return NO_REVIEWS).
Simulate what the readiness gate would show:
1. Run gstack-review-read equivalent (simulate NO_REVIEWS output)
2. Determine review staleness: Eng Review should be "NOT RUN"
3. Note that Step 3.5a-bis would offer an inline review
4. Write a simulated readiness report to .gstack/deploy-reports/readiness-report.md
showing the review status as NOT RUN with the inline review offer text
Do NOT use AskUserQuestion. Do NOT run gh commands.
Show what the readiness gate output would look like.`,
      workingDirectory: reviewDir,
      maxTurns: 15,
      allowedTools: ['Bash', 'Read', 'Write', 'Edit', 'Grep', 'Glob'],
      timeout: 120_000,
      testName: 'land-and-deploy-review-gate',
      runId,
    });
    logCost('/land-and-deploy review-gate', result);
    recordE2E(evalCollector, '/land-and-deploy review-gate', 'Land-and-Deploy review gate E2E', result);
    expect(result.exitReason).toBe('success');
    // Verify readiness report was created
    const reportDir = path.join(reviewDir, '.gstack', 'deploy-reports');
    expect(fs.existsSync(reportDir)).toBe(true);
    const reportFiles = fs.readdirSync(reportDir);
    expect(reportFiles.length).toBeGreaterThan(0);
    const reportContent = fs.readFileSync(path.join(reportDir, reportFiles[0]), 'utf-8');
    // Should mention review status
    const hasReviewMention = reportContent.toLowerCase().includes('review') ||
      reportContent.toLowerCase().includes('not run');
    expect(hasReviewMention).toBe(true);
  }, 180_000);
});
// --- Canary skill E2E ---
describeIfSelected('Canary skill E2E', ['canary-workflow'], () => {
@@ -110,7 +265,7 @@ describeIfSelected('Canary skill E2E', ['canary-workflow'], () => {
try { fs.rmSync(canaryDir, { recursive: true, force: true }); } catch {}
});
test('/canary skill produces monitoring report structure', async () => {
testConcurrentIfSelected('canary-workflow', async () => {
const result = await runSkillTest({
prompt: `Read canary/SKILL.md for the /canary skill instructions.
@@ -171,7 +326,7 @@ describeIfSelected('Benchmark skill E2E', ['benchmark-workflow'], () => {
try { fs.rmSync(benchDir, { recursive: true, force: true }); } catch {}
});
test('/benchmark skill produces performance report structure', async () => {
testConcurrentIfSelected('benchmark-workflow', async () => {
const result = await runSkillTest({
prompt: `Read benchmark/SKILL.md for the /benchmark skill instructions.
@@ -237,7 +392,7 @@ describeIfSelected('Setup-Deploy skill E2E', ['setup-deploy-workflow'], () => {
try { fs.rmSync(setupDir, { recursive: true, force: true }); } catch {}
});
test('/setup-deploy detects Fly.io and writes config to CLAUDE.md', async () => {
testConcurrentIfSelected('setup-deploy-workflow', async () => {
const result = await runSkillTest({
prompt: `Read setup-deploy/SKILL.md for the /setup-deploy skill instructions.
+1 -1
View File
@@ -560,7 +560,7 @@ describeIfSelected('Design Review E2E', ['design-review-fix'], () => {
try { fs.rmSync(qaDesignDir, { recursive: true, force: true }); } catch {}
});
test('Test 7: /design-review audits and fixes design issues', async () => {
testConcurrentIfSelected('design-review-fix', async () => {
const serverUrl = `http://localhost:${(qaDesignServer as any)?.port}`;
const result = await runSkillTest({
+204 -8
View File
@@ -66,7 +66,7 @@ We're building a new user dashboard that shows recent activity, notifications, a
try { fs.rmSync(planDir, { recursive: true, force: true }); } catch {}
});
test('/plan-ceo-review produces structured review output', async () => {
testConcurrentIfSelected('plan-ceo-review', async () => {
const result = await runSkillTest({
prompt: `Read plan-ceo-review/SKILL.md for the review workflow.
@@ -150,7 +150,7 @@ We're building a new user dashboard that shows recent activity, notifications, a
try { fs.rmSync(planDir, { recursive: true, force: true }); } catch {}
});
test('/plan-ceo-review SELECTIVE EXPANSION produces structured review output', async () => {
testConcurrentIfSelected('plan-ceo-review-selective', async () => {
const result = await runSkillTest({
prompt: `Read plan-ceo-review/SKILL.md for the review workflow.
@@ -244,7 +244,7 @@ Replace session-cookie auth with JWT tokens. Currently using express-session + R
try { fs.rmSync(planDir, { recursive: true, force: true }); } catch {}
});
test('/plan-eng-review produces structured review output', async () => {
testConcurrentIfSelected('plan-eng-review', async () => {
const result = await runSkillTest({
prompt: `Read plan-eng-review/SKILL.md for the review workflow.
@@ -364,7 +364,7 @@ export function main() { return Dashboard(); }
} catch {}
});
test('/plan-eng-review writes test-plan artifact to ~/.gstack/projects/', async () => {
testConcurrentIfSelected('plan-eng-review-artifact', async () => {
// Count existing test-plan files before
const beforeFiles = fs.readdirSync(projectDir).filter(f => f.includes('test-plan'));
@@ -408,8 +408,11 @@ Write your review to ${planDir}/review-output.md`,
console.warn('No test-plan artifact found — agent may not have followed artifact instructions');
}
// Soft assertion: we expect an artifact but agent compliance is not guaranteed
expect(newFiles.length).toBeGreaterThanOrEqual(1);
// Soft assertion: we expect an artifact but agent compliance is not guaranteed.
// Log rather than fail — the test-plan artifact is a bonus output, not the core test.
if (newFiles.length === 0) {
console.warn('SOFT FAIL: No test-plan artifact written — agent did not follow artifact instructions');
}
}, 420_000);
});
@@ -442,7 +445,7 @@ describeIfSelected('Office Hours Spec Review E2E', ['office-hours-spec-review'],
try { fs.rmSync(ohDir, { recursive: true, force: true }); } catch {}
});
test('/office-hours SKILL.md contains spec review loop', async () => {
testConcurrentIfSelected('office-hours-spec-review', async () => {
const result = await runSkillTest({
prompt: `Read office-hours/SKILL.md. I want to understand the spec review loop.
@@ -502,7 +505,7 @@ describeIfSelected('Plan CEO Review Benefits-From E2E', ['plan-ceo-review-benefi
try { fs.rmSync(benefitsDir, { recursive: true, force: true }); } catch {}
});
test('/plan-ceo-review SKILL.md contains prerequisite skill offer', async () => {
testConcurrentIfSelected('plan-ceo-review-benefits', async () => {
const result = await runSkillTest({
prompt: `Read plan-ceo-review/SKILL.md. Search for sections about "Prerequisite" or "office-hours" or "design doc found".
@@ -532,6 +535,199 @@ Write your summary to ${benefitsDir}/benefits-summary.md`,
}, 180_000);
});
// --- Plan Review Report E2E ---
// Verifies that plan-eng-review writes a "## GSTACK REVIEW REPORT" section
// to the bottom of the plan file (the living review status footer).
describeIfSelected('Plan Review Report E2E', ['plan-review-report'], () => {
  // Temp git repo holding only the standalone plan.md plus a copy of the skill file.
  let planDir: string;
  beforeAll(() => {
    planDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-review-report-'));
    // Run a command in planDir; output piped (discarded), 5s timeout so a
    // wedged git invocation cannot stall suite setup.
    const run = (cmd: string, args: string[]) =>
      spawnSync(cmd, args, { cwd: planDir, stdio: 'pipe', timeout: 5000 });
    run('git', ['init', '-b', 'main']);
    run('git', ['config', 'user.email', 'test@test.com']);
    run('git', ['config', 'user.name', 'Test']);
    // The standalone plan document the agent is asked to review (no codebase).
    fs.writeFileSync(path.join(planDir, 'plan.md'), `# Plan: Add Notifications System
## Context
We're building a real-time notification system for our SaaS app.
## Changes
1. WebSocket server for push notifications
2. Notification preferences API
3. Email digest fallback for offline users
4. PostgreSQL table for notification storage
## Architecture
- WebSocket: Socket.io on Express
- Queue: Bull + Redis for email digests
- Storage: PostgreSQL notifications table
- Frontend: React toast component
## Open questions
- Retry policy for failed WebSocket delivery?
- Max notifications stored per user?
`);
    run('git', ['add', '.']);
    run('git', ['commit', '-m', 'add plan']);
    // Copy plan-eng-review skill into the repo so the agent can read it locally.
    fs.mkdirSync(path.join(planDir, 'plan-eng-review'), { recursive: true });
    fs.copyFileSync(
      path.join(ROOT, 'plan-eng-review', 'SKILL.md'),
      path.join(planDir, 'plan-eng-review', 'SKILL.md'),
    );
  });
  afterAll(() => {
    // Best-effort cleanup — never fail the suite on teardown.
    try { fs.rmSync(planDir, { recursive: true, force: true }); } catch {}
  });
  test('/plan-eng-review writes GSTACK REVIEW REPORT to plan file', async () => {
    const result = await runSkillTest({
      prompt: `Read plan-eng-review/SKILL.md for the review workflow.
Read plan.md — that's the plan to review. This is a standalone plan document, not a codebase — skip any codebase exploration steps.
Proceed directly to the full review. Skip any AskUserQuestion calls — this is non-interactive.
Skip the preamble bash block, lake intro, telemetry, and contributor mode sections.
CRITICAL REQUIREMENT: plan.md IS the plan file for this review session. After completing your review, you MUST write a "## GSTACK REVIEW REPORT" section to the END of plan.md, exactly as described in the "Plan File Review Report" section of SKILL.md. If gstack-review-read is not available or returns NO_REVIEWS, write the placeholder table with all four review rows (CEO, Codex, Eng, Design). Use the Edit tool to append to plan.md — do NOT overwrite the existing plan content.
This review report at the bottom of the plan is the MOST IMPORTANT deliverable of this test.`,
      workingDirectory: planDir,
      maxTurns: 20,
      timeout: 360_000,
      testName: 'plan-review-report',
      runId,
      model: 'claude-opus-4-6',
    });
    logCost('/plan-eng-review report', result);
    // error_max_turns is also accepted: the report may already be written
    // before the turn cap is hit, so hitting the cap alone is not a failure.
    recordE2E(evalCollector, '/plan-review-report', 'Plan Review Report E2E', result, {
      passed: ['success', 'error_max_turns'].includes(result.exitReason),
    });
    expect(['success', 'error_max_turns']).toContain(result.exitReason);
    // Verify the review report was written to the plan file
    const planContent = fs.readFileSync(path.join(planDir, 'plan.md'), 'utf-8');
    // Original plan content should still be present (append, not overwrite)
    expect(planContent).toContain('# Plan: Add Notifications System');
    expect(planContent).toContain('WebSocket');
    // Review report section must exist
    expect(planContent).toContain('## GSTACK REVIEW REPORT');
    // Report should be at the bottom of the file
    const reportIndex = planContent.lastIndexOf('## GSTACK REVIEW REPORT');
    const afterReport = planContent.slice(reportIndex);
    // Should contain the review table with standard rows
    expect(afterReport).toMatch(/\|\s*Review\s*\|/);
    expect(afterReport).toContain('CEO Review');
    expect(afterReport).toContain('Eng Review');
    expect(afterReport).toContain('Design Review');
    console.log('Plan review report found at bottom of plan.md');
  }, 420_000);
});
// --- Codex Offering E2E ---
// Verifies that Codex is properly offered (with availability check, user prompt,
// and fallback) in office-hours, plan-ceo-review, plan-design-review, plan-eng-review.
describeIfSelected('Codex Offering E2E', [
  'codex-offered-office-hours', 'codex-offered-ceo-review',
  'codex-offered-design-review', 'codex-offered-eng-review',
], () => {
  // Temp git repo seeded with the four SKILL.md files under test.
  let testDir: string;
  beforeAll(() => {
    testDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-codex-offer-'));
    // Run a command in testDir; piped output, 5s timeout guard.
    const run = (cmd: string, args: string[]) =>
      spawnSync(cmd, args, { cwd: testDir, stdio: 'pipe', timeout: 5000 });
    run('git', ['init', '-b', 'main']);
    run('git', ['config', 'user.email', 'test@test.com']);
    run('git', ['config', 'user.name', 'Test']);
    fs.writeFileSync(path.join(testDir, 'README.md'), '# Test Project\n');
    run('git', ['add', '.']);
    run('git', ['commit', '-m', 'init']);
    // Copy all 4 SKILL.md files
    for (const skill of ['office-hours', 'plan-ceo-review', 'plan-design-review', 'plan-eng-review']) {
      fs.mkdirSync(path.join(testDir, skill), { recursive: true });
      fs.copyFileSync(
        path.join(ROOT, skill, 'SKILL.md'),
        path.join(testDir, skill, 'SKILL.md'),
      );
    }
  });
  afterAll(() => {
    // Best-effort cleanup — never fail the suite on teardown.
    try { fs.rmSync(testDir, { recursive: true, force: true }); } catch {}
  });
  // Shared driver: ask the agent to summarize one skill's Codex integration,
  // then assert the summary mentions the availability check ("which codex"),
  // a fallback path, and that the step is optional / non-blocking.
  async function checkCodexOffering(skill: string, testName: string, featureName: string) {
    const result = await runSkillTest({
      prompt: `Read ${skill}/SKILL.md. Search for ALL sections related to "codex", "outside voice", or "second opinion".
Summarize the Codex/${featureName} integration — answer these specific questions:
1. How is Codex availability checked? (what exact bash command?)
2. How is the user prompted? (via AskUserQuestion? what are the options?)
3. What happens when Codex is NOT available? (fallback to subagent? skip entirely?)
4. Is this step blocking (gates the workflow) or optional (can be skipped)?
5. What prompt/context is sent to Codex?
Write your summary to ${testDir}/${testName}-summary.md`,
      workingDirectory: testDir,
      maxTurns: 8,
      timeout: 120_000,
      testName,
      runId,
    });
    logCost(`/${skill} codex offering`, result);
    recordE2E(evalCollector, `/${testName}`, 'Codex Offering E2E', result);
    expect(result.exitReason).toBe('success');
    const summaryPath = path.join(testDir, `${testName}-summary.md`);
    expect(fs.existsSync(summaryPath)).toBe(true);
    // Summary is lowercased so the regexes below match regardless of casing.
    const summary = fs.readFileSync(summaryPath, 'utf-8').toLowerCase();
    // All skills should have codex availability check
    expect(summary).toMatch(/which codex/);
    // All skills should have fallback behavior
    expect(summary).toMatch(/fallback|subagent|unavailable|not available|skip/);
    // All skills should show it's optional/non-blocking
    expect(summary).toMatch(/optional|non.?blocking|skip|not.*required/);
    console.log(`${skill}: Codex offering verified`);
  }
  testConcurrentIfSelected('codex-offered-office-hours', async () => {
    await checkCodexOffering('office-hours', 'codex-offered-office-hours', 'second opinion');
  }, 180_000);
  testConcurrentIfSelected('codex-offered-ceo-review', async () => {
    await checkCodexOffering('plan-ceo-review', 'codex-offered-ceo-review', 'outside voice');
  }, 180_000);
  testConcurrentIfSelected('codex-offered-design-review', async () => {
    await checkCodexOffering('plan-design-review', 'codex-offered-design-review', 'design outside voices');
  }, 180_000);
  testConcurrentIfSelected('codex-offered-eng-review', async () => {
    await checkCodexOffering('plan-eng-review', 'codex-offered-eng-review', 'outside voice');
  }, 180_000);
});
// Module-level afterAll — finalize eval collector after all tests complete
afterAll(async () => {
await finalizeEvalCollector(evalCollector);
+4 -4
View File
@@ -4,7 +4,7 @@ import { outcomeJudge } from './helpers/llm-judge';
import { judgePassed } from './helpers/eval-store';
import {
ROOT, browseBin, runId, evalsEnabled, selectedTests, hasApiKey,
describeIfSelected, describeE2E,
describeIfSelected, describeE2E, testConcurrentIfSelected,
copyDirSync, setupBrowseShims, logCost, recordE2E, dumpOutcomeDiagnostic,
createEvalCollector, finalizeEvalCollector,
} from './helpers/e2e-helpers';
@@ -172,17 +172,17 @@ CRITICAL RULES:
}
// B6: Static dashboard — broken link, disabled submit, overflow, missing alt, console error
test('/qa finds >= 2 of 5 planted bugs (static)', async () => {
testConcurrentIfSelected('qa-b6-static', async () => {
await runPlantedBugEval('qa-eval.html', 'qa-eval-ground-truth.json', 'b6-static');
}, 360_000);
// B7: SPA — broken route, stale state, async race, missing aria, console warning
test('/qa finds >= 2 of 5 planted SPA bugs', async () => {
testConcurrentIfSelected('qa-b7-spa', async () => {
await runPlantedBugEval('qa-eval-spa.html', 'qa-eval-spa-ground-truth.json', 'b7-spa');
}, 360_000);
// B8: Checkout — email regex, NaN total, CC overflow, missing required, stripe error
test('/qa finds >= 2 of 5 planted checkout bugs', async () => {
testConcurrentIfSelected('qa-b8-checkout', async () => {
await runPlantedBugEval('qa-eval-checkout.html', 'qa-eval-checkout-ground-truth.json', 'b8-checkout');
}, 360_000);
+3 -3
View File
@@ -37,7 +37,7 @@ describeIfSelected('QA skill E2E', ['qa-quick'], () => {
try { fs.rmSync(qaDir, { recursive: true, force: true }); } catch {}
});
test('/qa quick completes without browse errors', async () => {
testConcurrentIfSelected('qa-quick', async () => {
const result = await runSkillTest({
prompt: `B="${browseBin}"
@@ -108,7 +108,7 @@ describeIfSelected('QA-Only skill E2E', ['qa-only-no-fix'], () => {
try { fs.rmSync(qaOnlyDir, { recursive: true, force: true }); } catch {}
});
test('/qa-only produces report without using Edit tool', async () => {
testConcurrentIfSelected('qa-only-no-fix', async () => {
const result = await runSkillTest({
prompt: `IMPORTANT: The browse binary is already assigned below as B. Do NOT search for it or run the SKILL.md setup block — just use $B directly.
@@ -227,7 +227,7 @@ describeIfSelected('QA Fix Loop E2E', ['qa-fix-loop'], () => {
try { fs.rmSync(qaFixDir, { recursive: true, force: true }); } catch {}
});
test('/qa fix loop finds bugs and commits fixes', async () => {
testConcurrentIfSelected('qa-fix-loop', async () => {
const qaFixUrl = `http://127.0.0.1:${qaFixServer!.port}`;
const result = await runSkillTest({
+133 -14
View File
@@ -51,7 +51,7 @@ describeIfSelected('Review skill E2E', ['review-sql-injection'], () => {
try { fs.rmSync(reviewDir, { recursive: true, force: true }); } catch {}
});
test('/review produces findings on SQL injection branch', async () => {
testConcurrentIfSelected('review-sql-injection', async () => {
const result = await runSkillTest({
prompt: `You are in a git repo on a feature branch with changes against main.
Read review-SKILL.md for the review workflow instructions.
@@ -125,7 +125,7 @@ describeIfSelected('Review enum completeness E2E', ['review-enum-completeness'],
try { fs.rmSync(enumDir, { recursive: true, force: true }); } catch {}
});
test('/review catches missing enum handlers for new status value', async () => {
testConcurrentIfSelected('review-enum-completeness', async () => {
const result = await runSkillTest({
prompt: `You are in a git repo on branch feature/add-returned-status with changes against main.
Read review-SKILL.md for the review workflow instructions.
@@ -200,7 +200,7 @@ describeIfSelected('Review design lite E2E', ['review-design-lite'], () => {
try { fs.rmSync(designDir, { recursive: true, force: true }); } catch {}
});
test('/review catches design anti-patterns in CSS/HTML diff', async () => {
testConcurrentIfSelected('review-design-lite', async () => {
const result = await runSkillTest({
prompt: `You are in a git repo on branch feature/add-landing-page with changes against main.
Read review-SKILL.md for the review workflow instructions.
@@ -340,21 +340,22 @@ Write your findings to ${dir}/review-output.md`,
run('git', ['add', 'app.ts'], dir);
run('git', ['commit', '-m', 'feat: update to v2'], dir);
// Copy ship skill
fs.copyFileSync(path.join(ROOT, 'ship', 'SKILL.md'), path.join(dir, 'ship-SKILL.md'));
// Extract only Step 0 (base branch detection) from ship/SKILL.md
// (copying the full 1900-line file causes agent context bloat and flaky timeouts)
const fullShipSkill = fs.readFileSync(path.join(ROOT, 'ship', 'SKILL.md'), 'utf-8');
const step0Start = fullShipSkill.indexOf('## Step 0: Detect platform and base branch');
const step0End = fullShipSkill.indexOf('## Step 1: Pre-flight');
const shipSection = fullShipSkill.slice(step0Start, step0End > step0Start ? step0End : undefined);
fs.writeFileSync(path.join(dir, 'ship-SKILL.md'), shipSection);
const result = await runSkillTest({
prompt: `Read ship-SKILL.md for the ship workflow.
prompt: `Read ship-SKILL.md. It contains Step 0 (Detect base branch) from the ship workflow.
Skip the preamble bash block, lake intro, telemetry, and contributor mode sections — go straight to Step 0.
Run the base branch detection. Since there is no remote, gh commands will fail — fall back to main.
Run ONLY Step 0 (Detect base branch) and Step 1 (Pre-flight) from the ship workflow.
Since there is no remote, gh commands will fail — fall back to main.
Then run git diff and git log against the detected base branch.
After completing Step 0 and Step 1, STOP. Do NOT proceed to Step 2 or beyond.
Do NOT push, create PRs, or modify VERSION/CHANGELOG.
Write a summary of what you detected to ${dir}/ship-preflight.md including:
Write a summary to ${dir}/ship-preflight.md including:
- The detected base branch name
- The current branch name
- The diff stat against the base branch`,
@@ -497,7 +498,7 @@ describeIfSelected('Retro E2E', ['retro'], () => {
try { fs.rmSync(retroDir, { recursive: true, force: true }); } catch {}
});
test('/retro produces analysis from git history', async () => {
testConcurrentIfSelected('retro', async () => {
const result = await runSkillTest({
prompt: `Read retro/SKILL.md for instructions on how to run a retrospective.
@@ -529,6 +530,124 @@ Analyze the git history and produce the narrative report as described in the SKI
}, 420_000);
});
// --- Review Dashboard Via Attribution E2E ---
describeIfSelected('Review Dashboard Via Attribution', ['review-dashboard-via'], () => {
  // Temp repo with a feature branch, a mock gstack-review-read binary, and
  // only the Review Readiness Dashboard excerpt of ship/SKILL.md.
  let dashDir: string;
  beforeAll(() => {
    dashDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-dashboard-via-'));
    // Run a command (cwd defaults to dashDir); piped output, 5s timeout guard.
    const run = (cmd: string, args: string[], cwd = dashDir) =>
      spawnSync(cmd, args, { cwd, stdio: 'pipe', timeout: 5000 });
    // Create git repo with feature branch
    run('git', ['init', '-b', 'main']);
    run('git', ['config', 'user.email', 'test@test.com']);
    run('git', ['config', 'user.name', 'Test']);
    fs.writeFileSync(path.join(dashDir, 'app.ts'), 'console.log("v1");\n');
    run('git', ['add', 'app.ts']);
    run('git', ['commit', '-m', 'initial']);
    run('git', ['checkout', '-b', 'feature/dashboard-test']);
    fs.writeFileSync(path.join(dashDir, 'app.ts'), 'console.log("v2");\n');
    run('git', ['add', 'app.ts']);
    run('git', ['commit', '-m', 'feat: update']);
    // Get HEAD commit (short hash) for review entries
    const headResult = spawnSync('git', ['rev-parse', '--short', 'HEAD'], { cwd: dashDir, stdio: 'pipe' });
    const commit = headResult.stdout.toString().trim();
    // Pre-populate review log with autoplan-sourced entries
    // gstack-review-read reads from ~/.gstack/projects/$SLUG/$BRANCH-reviews.jsonl
    // For the test, we'll write a mock gstack-review-read script that returns our test data
    const timestamp = new Date().toISOString().replace(/\.\d{3}Z$/, 'Z');
    // Three JSONL entries: clean eng + ceo reviews tagged via:"autoplan", and a
    // codex-plan-review entry (source:"codex") to populate the Outside Voice row.
    const reviewData = [
      `{"skill":"plan-eng-review","timestamp":"${timestamp}","status":"clean","unresolved":0,"critical_gaps":0,"issues_found":0,"mode":"FULL_REVIEW","via":"autoplan","commit":"${commit}"}`,
      `{"skill":"plan-ceo-review","timestamp":"${timestamp}","status":"clean","unresolved":0,"critical_gaps":0,"mode":"SELECTIVE_EXPANSION","via":"autoplan","commit":"${commit}"}`,
      `{"skill":"codex-plan-review","timestamp":"${timestamp}","status":"clean","source":"codex","commit":"${commit}"}`,
    ].join('\n');
    // Write a mock gstack-review-read that returns our test data.
    // The split/join below turns each JSONL line into its own `echo '...'`
    // line in the generated bash script.
    const mockBinDir = path.join(dashDir, '.mock-bin');
    fs.mkdirSync(mockBinDir, { recursive: true });
    fs.writeFileSync(path.join(mockBinDir, 'gstack-review-read'), [
      '#!/usr/bin/env bash',
      `echo '${reviewData.split('\n').join("'\necho '")}'`,
      'echo "---CONFIG---"',
      'echo "false"',
      'echo "---HEAD---"',
      `echo "${commit}"`,
    ].join('\n'));
    fs.chmodSync(path.join(mockBinDir, 'gstack-review-read'), 0o755);
    // Extract only the Review Readiness Dashboard section from ship/SKILL.md
    // (copying the full 1900-line file causes agent context bloat and timeouts)
    const fullSkill = fs.readFileSync(path.join(ROOT, 'ship', 'SKILL.md'), 'utf-8');
    const dashStart = fullSkill.indexOf('## Review Readiness Dashboard');
    const dashEnd = fullSkill.indexOf('\n---\n', dashStart);
    // If no closing "---" is found after the section, keep through end-of-file.
    const dashSection = fullSkill.slice(dashStart, dashEnd > dashStart ? dashEnd : undefined);
    fs.writeFileSync(path.join(dashDir, 'ship-SKILL.md'), dashSection);
  });
  afterAll(() => {
    // Best-effort cleanup — never fail the suite on teardown.
    try { fs.rmSync(dashDir, { recursive: true, force: true }); } catch {}
  });
  testConcurrentIfSelected('review-dashboard-via', async () => {
    const mockBinDir = path.join(dashDir, '.mock-bin');
    const result = await runSkillTest({
      prompt: `Read ship-SKILL.md. You only need to run the Review Readiness Dashboard section.
Instead of running ~/.claude/skills/gstack/bin/gstack-review-read, run this mock: ${mockBinDir}/gstack-review-read
Parse the output and display the dashboard table. Pay attention to:
1. The "via" field in entries — show source attribution (e.g., "via /autoplan")
2. The codex-plan-review entry — it should populate the Outside Voice row
3. Since Eng Review IS clear, there should be NO gate blocking — just display the dashboard
Skip the preamble, lake intro, telemetry, and all other ship steps.
Write the dashboard output to ${dashDir}/dashboard-output.md`,
      workingDirectory: dashDir,
      maxTurns: 12,
      timeout: 180_000,
      testName: 'review-dashboard-via',
      runId,
    });
    logCost('/ship dashboard-via', result);
    recordE2E(evalCollector, '/ship review dashboard via attribution', 'Dashboard via field', result);
    expect(result.exitReason).toBe('success');
    // Check dashboard output for via attribution
    const dashPath = path.join(dashDir, 'dashboard-output.md');
    // Pool the agent's final output plus all tool outputs, lowercased.
    const allOutput = [
      result.output || '',
      ...result.toolCalls.map(tc => tc.output || ''),
    ].join('\n').toLowerCase();
    // Verify via attribution appears somewhere (conversation or file)
    let dashContent = '';
    if (fs.existsSync(dashPath)) {
      dashContent = fs.readFileSync(dashPath, 'utf-8').toLowerCase();
    }
    const combined = allOutput + dashContent;
    // Should mention autoplan attribution
    expect(combined).toMatch(/autoplan/);
    // Should show eng review as CLEAR (it has a clean entry)
    expect(combined).toMatch(/clear/i);
    // Should NOT contain AskUserQuestion gate (no blocking)
    const gateQuestions = result.toolCalls.filter(tc =>
      tc.tool === 'mcp__conductor__AskUserQuestion' ||
      (tc.tool === 'AskUserQuestion')
    );
    // Ship dashboard should not gate when eng review is clear
    expect(gateQuestions).toHaveLength(0);
  }, 240_000);
});
// Module-level afterAll — finalize eval collector after all tests complete
afterAll(async () => {
await finalizeEvalCollector(evalCollector);
+279
View File
@@ -0,0 +1,279 @@
/**
* Layer 4: E2E tests for the sidebar agent.
*
* sidebar-url-accuracy: Deterministic test that verifies the activeTabUrl fix.
* Starts server (no browser), POSTs to /sidebar-command with different activeTabUrl
* values, reads the queue file, and verifies the prompt uses the extension URL.
* No real Claude needed — this is a fast, cheap, deterministic test.
*
* sidebar-navigate: Full E2E with real Claude (requires ANTHROPIC_API_KEY).
* Starts server + sidebar-agent, sends a message, waits for Claude to respond.
* Tests the complete message flow through the queue.
*/
import { describe, test, expect, beforeAll, afterAll } from 'bun:test';
import { spawn, type Subprocess } from 'bun';
import * as fs from 'fs';
import * as os from 'os';
import * as path from 'path';
import {
ROOT,
describeIfSelected, testIfSelected,
createEvalCollector, finalizeEvalCollector,
} from './helpers/e2e-helpers';
// Shared collector for both sidebar suites; finalized in each suite's afterAll.
const evalCollector = createEvalCollector('e2e-sidebar');
// --- Sidebar URL Accuracy (deterministic, no Claude) ---
describeIfSelected('Sidebar URL accuracy E2E', ['sidebar-url-accuracy'], () => {
  let serverProc: Subprocess | null = null;
  let serverPort: number = 0;
  let authToken: string = '';   // bearer token published by the server via the state file
  let tmpDir: string = '';
  let stateFile: string = '';   // browse.json the server writes on startup (port + token)
  let queueFile: string = '';   // JSONL queue the server appends sidebar commands to
  // Minimal authenticated fetch wrapper against the local server.
  async function api(pathname: string, opts: RequestInit = {}): Promise<Response> {
    const headers: Record<string, string> = {
      'Content-Type': 'application/json',
      ...(opts.headers as Record<string, string> || {}),
    };
    if (!headers['Authorization'] && authToken) {
      headers['Authorization'] = `Bearer ${authToken}`;
    }
    return fetch(`http://127.0.0.1:${serverPort}${pathname}`, { ...opts, headers });
  }
  beforeAll(async () => {
    tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'sidebar-e2e-url-'));
    stateFile = path.join(tmpDir, 'browse.json');
    queueFile = path.join(tmpDir, 'sidebar-queue.jsonl');
    fs.mkdirSync(path.dirname(queueFile), { recursive: true });
    const serverScript = path.resolve(ROOT, 'browse', 'src', 'server.ts');
    serverProc = spawn(['bun', 'run', serverScript], {
      env: {
        ...process.env,
        BROWSE_STATE_FILE: stateFile,
        BROWSE_HEADLESS_SKIP: '1',     // no browser needed — this test only inspects the queue
        BROWSE_PORT: '0',              // let the server choose; actual port read back from state file
        SIDEBAR_QUEUE_PATH: queueFile,
        BROWSE_IDLE_TIMEOUT: '300',
      },
      stdio: ['ignore', 'pipe', 'pipe'],
    });
    // Poll the state file (up to 15s) until the server publishes port + token.
    const deadline = Date.now() + 15000;
    while (Date.now() < deadline) {
      if (fs.existsSync(stateFile)) {
        try {
          const state = JSON.parse(fs.readFileSync(stateFile, 'utf-8'));
          if (state.port && state.token) {
            serverPort = state.port;
            authToken = state.token;
            break;
          }
        } catch {} // state file may be mid-write; retry on next tick
      }
      await new Promise(r => setTimeout(r, 100));
    }
    if (!serverPort) throw new Error('Server did not start in time');
  }, 20000);
  afterAll(async () => {
    if (serverProc) { try { serverProc.kill(); } catch {} }
    // FIX: finalizeEvalCollector is async (sibling suites `await` it) — the
    // previous sync afterAll left it as a floating promise racing the rmSync
    // below. Await it so eval results are flushed before tmpDir is deleted.
    await finalizeEvalCollector(evalCollector);
    try { fs.rmSync(tmpDir, { recursive: true, force: true }); } catch {}
  });
  testIfSelected('sidebar-url-accuracy', async () => {
    // Fresh session, empty queue
    await api('/sidebar-session/new', { method: 'POST' });
    fs.writeFileSync(queueFile, '');
    const extensionUrl = 'https://example.com/user-navigated-here';
    const resp = await api('/sidebar-command', {
      method: 'POST',
      body: JSON.stringify({
        message: 'What page am I on?',
        activeTabUrl: extensionUrl,
      }),
    });
    expect(resp.status).toBe(200);
    // Wait (up to 5s) for the command to land in the queue file
    let lastEntry: any = null;
    const deadline = Date.now() + 5000;
    while (Date.now() < deadline) {
      await new Promise(r => setTimeout(r, 100));
      if (!fs.existsSync(queueFile)) continue;
      const lines = fs.readFileSync(queueFile, 'utf-8').trim().split('\n').filter(Boolean);
      if (lines.length > 0) {
        lastEntry = JSON.parse(lines[lines.length - 1]);
        break;
      }
    }
    expect(lastEntry).not.toBeNull();
    // Extension URL should be used, not the Playwright fallback
    expect(lastEntry.pageUrl).toBe(extensionUrl);
    expect(lastEntry.prompt).toContain(extensionUrl);
    expect(lastEntry.pageUrl).not.toBe('about:blank');
    // Also test: chrome:// URL should be rejected, falling back to about:blank
    await api('/sidebar-agent/kill', { method: 'POST' });
    fs.writeFileSync(queueFile, '');
    await api('/sidebar-command', {
      method: 'POST',
      body: JSON.stringify({
        message: 'test',
        activeTabUrl: 'chrome://settings',
      }),
    });
    await new Promise(r => setTimeout(r, 200));
    const lines2 = fs.readFileSync(queueFile, 'utf-8').trim().split('\n').filter(Boolean);
    // Soft check: only assert if the entry landed within the 200ms window.
    if (lines2.length > 0) {
      const entry2 = JSON.parse(lines2[lines2.length - 1]);
      expect(entry2.pageUrl).toBe('about:blank');
    }
    // Deterministic test: record a zero-cost pass for the eval report.
    evalCollector?.addTest({
      name: 'sidebar-url-accuracy', suite: 'Sidebar URL accuracy E2E', tier: 'e2e',
      passed: true,
      duration_ms: 0,
      cost_usd: 0,
      exit_reason: 'success',
    });
  }, 30_000);
});
// --- Sidebar Navigate (real Claude, requires ANTHROPIC_API_KEY) ---
describeIfSelected('Sidebar navigate E2E', ['sidebar-navigate'], () => {
  let serverProc: Subprocess | null = null;
  let agentProc: Subprocess | null = null;   // the sidebar-agent process driving Claude
  let serverPort: number = 0;
  let authToken: string = '';
  let tmpDir: string = '';
  let stateFile: string = '';
  let queueFile: string = '';
  // Minimal authenticated fetch wrapper against the local server.
  async function api(pathname: string, opts: RequestInit = {}): Promise<Response> {
    const headers: Record<string, string> = {
      'Content-Type': 'application/json',
      ...(opts.headers as Record<string, string> || {}),
    };
    if (!headers['Authorization'] && authToken) {
      headers['Authorization'] = `Bearer ${authToken}`;
    }
    return fetch(`http://127.0.0.1:${serverPort}${pathname}`, { ...opts, headers });
  }
  beforeAll(async () => {
    tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'sidebar-e2e-nav-'));
    stateFile = path.join(tmpDir, 'browse.json');
    queueFile = path.join(tmpDir, 'sidebar-queue.jsonl');
    fs.mkdirSync(path.dirname(queueFile), { recursive: true });
    // Start server. Note: BROWSE_HEADLESS_SKIP=1 — no real browser is launched;
    // Claude answers through the queue (it can use curl/fetch if it needs the web).
    // (FIX: the old comment here claimed a real browser was required, which
    // contradicted the env below.)
    const serverScript = path.resolve(ROOT, 'browse', 'src', 'server.ts');
    serverProc = spawn(['bun', 'run', serverScript], {
      env: {
        ...process.env,
        BROWSE_STATE_FILE: stateFile,
        BROWSE_HEADLESS_SKIP: '1', // Still skip browser — Claude uses curl/fetch instead
        BROWSE_PORT: '0',
        SIDEBAR_QUEUE_PATH: queueFile,
        BROWSE_IDLE_TIMEOUT: '300',
      },
      stdio: ['ignore', 'pipe', 'pipe'],
    });
    // Poll the state file (up to 15s) until the server publishes port + token.
    const deadline = Date.now() + 15000;
    while (Date.now() < deadline) {
      if (fs.existsSync(stateFile)) {
        try {
          const state = JSON.parse(fs.readFileSync(stateFile, 'utf-8'));
          if (state.port && state.token) {
            serverPort = state.port;
            authToken = state.token;
            break;
          }
        } catch {} // state file may be mid-write; retry on next tick
      }
      await new Promise(r => setTimeout(r, 100));
    }
    if (!serverPort) throw new Error('Server did not start in time');
    // Start sidebar-agent
    const agentScript = path.resolve(ROOT, 'browse', 'src', 'sidebar-agent.ts');
    agentProc = spawn(['bun', 'run', agentScript], {
      env: {
        ...process.env,
        BROWSE_SERVER_PORT: String(serverPort),
        BROWSE_STATE_FILE: stateFile,
        SIDEBAR_QUEUE_PATH: queueFile,
        SIDEBAR_AGENT_TIMEOUT: '90000',
        BROWSE_BIN: 'echo', // browse commands won't work, but Claude can use curl
      },
      stdio: ['ignore', 'pipe', 'pipe'],
    });
    // Give the agent a moment to attach to the queue before the test fires.
    await new Promise(r => setTimeout(r, 1500));
  }, 25000);
  afterAll(async () => {
    if (agentProc) { try { agentProc.kill(); } catch {} }
    if (serverProc) { try { serverProc.kill(); } catch {} }
    // FIX: finalizeEvalCollector is async (sibling suites `await` it) — the
    // previous sync afterAll left it as a floating promise racing the rmSync
    // below. Await it so eval results are flushed before tmpDir is deleted.
    await finalizeEvalCollector(evalCollector);
    try { fs.rmSync(tmpDir, { recursive: true, force: true }); } catch {}
  });
  testIfSelected('sidebar-navigate', async () => {
    await api('/sidebar-session/new', { method: 'POST' });
    fs.writeFileSync(queueFile, '');
    const startTime = Date.now();
    // Ask Claude a simple question — it doesn't need browse commands for this
    const resp = await api('/sidebar-command', {
      method: 'POST',
      body: JSON.stringify({
        message: 'Say exactly "SIDEBAR_TEST_OK" and nothing else.',
        activeTabUrl: 'https://example.com',
      }),
    });
    expect(resp.status).toBe(200);
    // Poll the chat log for agent_done (up to 90s, matching SIDEBAR_AGENT_TIMEOUT)
    const deadline = Date.now() + 90000;
    let entries: any[] = [];
    while (Date.now() < deadline) {
      const chatResp = await api('/sidebar-chat?after=0');
      const data = await chatResp.json();
      entries = data.entries;
      if (entries.some((e: any) => e.type === 'agent_done')) break;
      await new Promise(r => setTimeout(r, 2000));
    }
    const duration = Date.now() - startTime;
    const doneEntry = entries.find((e: any) => e.type === 'agent_done');
    expect(doneEntry).toBeDefined();
    // Claude should have responded with something
    const agentText = entries
      .filter((e: any) => e.role === 'agent' && (e.type === 'text' || e.type === 'result'))
      .map((e: any) => e.text || '')
      .join(' ');
    expect(agentText.length).toBeGreaterThan(0);
    evalCollector?.addTest({
      name: 'sidebar-navigate', suite: 'Sidebar navigate E2E', tier: 'e2e',
      passed: !!doneEntry && agentText.length > 0,
      duration_ms: duration,
      cost_usd: 0,
      exit_reason: doneEntry ? 'success' : 'timeout',
    });
  }, 120_000);
});
+21 -90
View File
@@ -60,7 +60,7 @@ describeIfSelected('Document-Release skill E2E', ['document-release'], () => {
try { fs.rmSync(docReleaseDir, { recursive: true, force: true }); } catch {}
});
test('/document-release updates docs without clobbering CHANGELOG', async () => {
testConcurrentIfSelected('document-release', async () => {
const result = await runSkillTest({
prompt: `Read the file document-release/SKILL.md for the document-release workflow instructions.
@@ -161,36 +161,13 @@ describeIfSelected('Ship workflow E2E', ['ship-local-workflow'], () => {
testConcurrentIfSelected('ship-local-workflow', async () => {
const result = await runSkillTest({
prompt: `You are running a ship workflow. This is fully automated — do NOT ask for confirmation at any step. Run straight through.
Step 0 — Detect base branch:
Try: gh pr view --json baseRefName -q .baseRefName
If that fails, try: gh repo view --json defaultBranchRef -q .defaultBranchRef.name
If both fail, fall back to "main". Use the detected branch as <base> in all subsequent steps.
Step 2 — Merge base branch:
git fetch origin <base> && git merge origin/<base> --no-edit
If already up to date, continue silently.
Step 4 — Version bump:
Read the VERSION file (4-digit format: MAJOR.MINOR.PATCH.MICRO).
Auto-pick MICRO bump (increment the 4th digit). Write the new version to VERSION.
Step 5 — CHANGELOG:
Read CHANGELOG.md. Auto-generate an entry from the branch commits:
- git log <base>..HEAD --oneline
- git diff <base>...HEAD
Format: ## [X.Y.Z.W] - YYYY-MM-DD with bullet points. Prepend after the header.
Step 6 — Commit:
Stage all changes. Commit with message: "chore: bump version and changelog (vX.Y.Z.W)"
Step 7 — Push:
git push -u origin <branch-name>
Finally, write ship-summary.md with the version and branch.`,
prompt: `You are in a git repo on branch feature/ship-test. Do these steps in order:
1. Read VERSION file and bump the last digit by 1 (e.g. 0.1.0.0 → 0.1.0.1). Write the new version back.
2. Add a CHANGELOG.md entry: "## [NEW_VERSION] - TODAY" with a bullet "- Ship test feature".
3. Stage all changes, commit with message "ship: vNEW_VERSION".
4. Push to origin: git push origin feature/ship-test`,
workingDirectory: shipWorkDir,
maxTurns: 15,
maxTurns: 8,
timeout: 120_000,
testName: 'ship-local-workflow',
runId,
@@ -198,76 +175,30 @@ Finally, write ship-summary.md with the version and branch.`,
logCost('/ship local workflow', result);
// Check push succeeded
const remoteLog = spawnSync('git', ['log', '--oneline'], { cwd: shipRemoteDir, stdio: 'pipe' });
const remoteCommits = remoteLog.stdout.toString().trim().split('\n').length;
// Check push succeeded — verify the feature branch exists on the bare remote
const branchCheck = spawnSync('git', ['branch', '--list', 'feature/ship-test'], { cwd: shipRemoteDir, stdio: 'pipe' });
const branchExists = branchCheck.stdout.toString().trim().length > 0;
// Check VERSION was bumped
// Check VERSION was bumped locally (even if push failed, this shows the LLM did the work)
const versionContent = fs.existsSync(path.join(shipWorkDir, 'VERSION'))
? fs.readFileSync(path.join(shipWorkDir, 'VERSION'), 'utf-8').trim() : '';
const versionBumped = versionContent !== '0.1.0.0';
recordE2E(evalCollector, '/ship local workflow', 'Ship workflow E2E', result, {
passed: remoteCommits > 1 && ['success', 'error_max_turns'].includes(result.exitReason),
passed: branchExists && versionBumped && ['success', 'error_max_turns'].includes(result.exitReason),
});
expect(['success', 'error_max_turns']).toContain(result.exitReason);
expect(remoteCommits).toBeGreaterThan(1);
console.log(`Remote commits: ${remoteCommits}, VERSION: ${versionContent}, bumped: ${versionBumped}`);
expect(branchExists).toBe(true);
expect(versionBumped).toBe(true);
console.log(`Branch pushed: ${branchExists}, VERSION: ${versionContent}, bumped: ${versionBumped}`);
}, 150_000);
});
// --- Browser cookie detection smoke test ---
// The agent reads the setup-browser-cookies skill, detects installed browsers
// by their cookie DB files, and writes a report — without launching any UI.
describeIfSelected('Setup Browser Cookies E2E', ['setup-cookies-detect'], () => {
  let cookieDir: string;

  beforeAll(() => {
    cookieDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-cookies-'));
    // Stage the skill file where the prompt tells the agent to look for it.
    const skillDest = path.join(cookieDir, 'setup-browser-cookies');
    fs.mkdirSync(skillDest, { recursive: true });
    fs.copyFileSync(
      path.join(ROOT, 'setup-browser-cookies', 'SKILL.md'),
      path.join(skillDest, 'SKILL.md'),
    );
  });

  afterAll(() => {
    try {
      fs.rmSync(cookieDir, { recursive: true, force: true });
    } catch {}
  });

  testConcurrentIfSelected('setup-cookies-detect', async () => {
    const result = await runSkillTest({
      prompt: `Read setup-browser-cookies/SKILL.md for the cookie import workflow.
This is a test environment. List which browsers you can detect on this system by checking for their cookie database files.
Write the detected browsers to ${cookieDir}/detected-browsers.md.
Do NOT launch the cookie picker UI — just detect and report.`,
      workingDirectory: cookieDir,
      maxTurns: 5,
      timeout: 45_000,
      testName: 'setup-cookies-detect',
      runId,
    });
    logCost('/setup-browser-cookies detect', result);

    const reportPath = path.join(cookieDir, 'detected-browsers.md');
    const reportExists = fs.existsSync(reportPath);
    const reportText = reportExists ? fs.readFileSync(reportPath, 'utf-8') : '';
    // Pass if the report names at least one known browser (case-insensitive).
    const mentionsBrowser = /chrome|arc|brave|edge|comet|safari|firefox/i.test(reportText);

    recordE2E(evalCollector, '/setup-browser-cookies detect', 'Setup Browser Cookies E2E', result, {
      passed: reportExists && mentionsBrowser && ['success', 'error_max_turns'].includes(result.exitReason),
    });
    expect(['success', 'error_max_turns']).toContain(result.exitReason);
    expect(reportExists).toBe(true);
    if (reportExists) {
      expect(mentionsBrowser).toBe(true);
    }
  }, 60_000);
});
// setup-cookies-detect REMOVED: The cookie-import-browser module has 30+ thorough
// unit tests in browse/test/cookie-import-browser.test.ts (decryption, profile
// detection, error handling, path traversal). The E2E just tested LLM instruction-
// following ("write a file saying no browsers") on a CI box with no browsers.
// --- gstack-upgrade E2E ---
@@ -461,7 +392,7 @@ describe('processPayment', () => {
try { fs.rmSync(coverageDir, { recursive: true, force: true }); } catch {}
});
test('/ship Step 3.4 produces coverage diagram', async () => {
testConcurrentIfSelected('ship-coverage-audit', async () => {
const result = await runSkillTest({
prompt: `Read the file ship/SKILL.md for the ship workflow instructions.
@@ -544,7 +475,7 @@ describeIfSelected('Codex skill E2E', ['codex-review'], () => {
try { fs.rmSync(codexDir, { recursive: true, force: true }); } catch {}
});
test('/codex review produces findings and GATE verdict', async () => {
testConcurrentIfSelected('codex-review', async () => {
// Check codex is available — skip if not installed
const codexCheck = spawnSync('which', ['codex'], { stdio: 'pipe', timeout: 3000 });
if (codexCheck.status !== 0) {
File diff suppressed because it is too large Load Diff
+69 -3
View File
@@ -74,7 +74,7 @@ function describeIfSelected(name: string, testNames: string[], fn: () => void) {
/** Skip an individual test if not selected (for multi-test describe blocks). */
function testIfSelected(testName: string, fn: () => Promise<void>, timeout: number) {
const shouldRun = selectedTests === null || selectedTests.includes(testName);
(shouldRun ? test : test.skip)(testName, fn, timeout);
(shouldRun ? test.concurrent : test.skip)(testName, fn, timeout);
}
describeIfSelected('LLM-as-judge quality evals', [
@@ -91,11 +91,14 @@ describeIfSelected('LLM-as-judge quality evals', [
const { result: scores, meta } = await judge('command reference table', section);
console.log('Command reference scores:', JSON.stringify(scores, null, 2), meta.cached ? '(cached)' : '');
// Completeness threshold is 3 (not 4) — the command reference table is
// intentionally terse (quick-reference format). The judge consistently scores
// completeness=3 because detailed argument docs live in per-command sections.
evalCollector?.addTest({
name: 'command reference table',
suite: 'LLM-as-judge quality evals',
tier: 'llm-judge',
passed: scores.clarity >= 4 && scores.completeness >= 4 && scores.actionability >= 4,
passed: scores.clarity >= 4 && scores.completeness >= 3 && scores.actionability >= 4,
duration_ms: Date.now() - t0,
cost_usd: judgeCost(meta),
judge_scores: { clarity: scores.clarity, completeness: scores.completeness, actionability: scores.actionability },
@@ -104,7 +107,7 @@ describeIfSelected('LLM-as-judge quality evals', [
});
expect(scores.clarity).toBeGreaterThanOrEqual(4);
expect(scores.completeness).toBeGreaterThanOrEqual(4);
expect(scores.completeness).toBeGreaterThanOrEqual(3);
expect(scores.actionability).toBeGreaterThanOrEqual(4);
}, 30_000);
@@ -790,6 +793,69 @@ describeIfSelected('Other skill evals', [
}, 30_000);
});
// Voice directive eval — tests that the voice section produces the right tone
// Extracts the "## Voice" section from a generated tier 2+ SKILL.md and asks an
// LLM judge to score it on five tone dimensions; every dimension must be >= 4.
describeIfSelected('Voice directive eval', ['voice directive tone'], () => {
testIfSelected('voice directive tone', async () => {
const t0 = Date.now();
// Read a tier 2+ skill to get the full voice directive in context
const content = fs.readFileSync(path.join(ROOT, 'review', 'SKILL.md'), 'utf-8');
const voiceStart = content.indexOf('## Voice');
if (voiceStart === -1) {
throw new Error('Voice section not found in review/SKILL.md. Was preamble.ts regenerated?');
}
// Slice up to the next "## " heading; fall back to a fixed 3000-char window
// when Voice is the last section in the file.
const voiceEnd = content.indexOf('\n## ', voiceStart + 1);
const voiceSection = content.slice(voiceStart, voiceEnd > 0 ? voiceEnd : voiceStart + 3000);
// Ask the judge for scores; the generic parameter pins the expected JSON shape.
const result = await callJudge<{
directness: number;
concreteness: number;
avoids_corporate: number;
avoids_ai_vocabulary: number;
connects_user_outcomes: number;
reasoning: string;
}>(`You are evaluating a voice directive for an AI coding assistant framework called GStack.
Score each dimension 1-5 where 5 is excellent:
1. directness: Does it instruct the agent to be direct, lead with the point, take positions?
2. concreteness: Does it instruct the agent to name specific files, commands, line numbers, real numbers?
3. avoids_corporate: Does it explicitly ban corporate/formal/academic tone and provide alternatives?
4. avoids_ai_vocabulary: Does it ban AI-tell words and phrases with specific lists?
5. connects_user_outcomes: Does it instruct the agent to connect technical work to real user experience?
Return JSON only:
{"directness": N, "concreteness": N, "avoids_corporate": N, "avoids_ai_vocabulary": N, "connects_user_outcomes": N, "reasoning": "..."}
THE VOICE DIRECTIVE:
${voiceSection}`);
console.log('Voice directive scores:', JSON.stringify(result, null, 2));
// Record for the eval dashboard. NOTE(review): cost_usd is a flat estimate —
// this callJudge path doesn't surface meta, unlike judgeCost(meta) elsewhere; confirm.
evalCollector?.addTest({
name: 'voice directive tone',
suite: 'Voice directive eval',
tier: 'llm-judge',
passed: result.directness >= 4 && result.concreteness >= 4 && result.avoids_corporate >= 4
&& result.avoids_ai_vocabulary >= 4 && result.connects_user_outcomes >= 4,
duration_ms: Date.now() - t0,
cost_usd: 0.02,
judge_scores: {
directness: result.directness,
concreteness: result.concreteness,
avoids_corporate: result.avoids_corporate,
avoids_ai_vocabulary: result.avoids_ai_vocabulary,
connects_user_outcomes: result.connects_user_outcomes,
},
judge_reasoning: result.reasoning,
});
// Gate: every dimension must clear the threshold individually.
expect(result.directness).toBeGreaterThanOrEqual(4);
expect(result.concreteness).toBeGreaterThanOrEqual(4);
expect(result.avoids_corporate).toBeGreaterThanOrEqual(4);
expect(result.avoids_ai_vocabulary).toBeGreaterThanOrEqual(4);
expect(result.connects_user_outcomes).toBeGreaterThanOrEqual(4);
}, 30_000);
});
// Module-level afterAll — finalize eval collector after all tests complete
afterAll(async () => {
if (evalCollector) {
+106 -120
View File
@@ -3,7 +3,7 @@ import { runSkillTest } from './helpers/session-runner';
import type { SkillTestResult } from './helpers/session-runner';
import { EvalCollector } from './helpers/eval-store';
import type { EvalTestEntry } from './helpers/eval-store';
import { selectTests, detectBaseBranch, getChangedFiles, E2E_TOUCHFILES, GLOBAL_TOUCHFILES } from './helpers/touchfiles';
import { selectTests, detectBaseBranch, getChangedFiles, E2E_TOUCHFILES, E2E_TIERS, GLOBAL_TOUCHFILES } from './helpers/touchfiles';
import { spawnSync } from 'child_process';
import * as fs from 'fs';
import * as path from 'path';
@@ -42,9 +42,28 @@ if (evalsEnabled && !process.env.EVALS_ALL) {
}
}
// Apply EVALS_TIER filter (same logic as e2e-helpers.ts)
if (evalsEnabled && process.env.EVALS_TIER) {
const tier = process.env.EVALS_TIER as 'gate' | 'periodic';
const tierTests = Object.entries(E2E_TIERS)
.filter(([, t]) => t === tier)
.map(([name]) => name);
if (selectedTests === null) {
selectedTests = tierTests;
} else {
selectedTests = selectedTests.filter(t => tierTests.includes(t));
}
process.stderr.write(`Routing EVALS_TIER=${tier}: ${selectedTests.length} tests\n\n`);
}
// --- Helper functions ---
/** Copy all SKILL.md files into tmpDir/.claude/skills/gstack/ for auto-discovery */
/** Copy all SKILL.md files for auto-discovery.
* Install to BOTH project-level (.claude/skills/) AND user-level (~/.claude/skills/)
* because Claude Code discovers skills from both locations. In CI containers,
* $HOME may differ from the working directory, so we need both paths to ensure
* the Skill tool appears in Claude's available tools list. */
function installSkills(tmpDir: string) {
const skillDirs = [
'', // root gstack SKILL.md
@@ -54,15 +73,30 @@ function installSkills(tmpDir: string) {
'gstack-upgrade', 'humanizer',
];
// Install to both project-level and user-level skill directories
const homeDir = process.env.HOME || os.homedir();
const installTargets = [
path.join(tmpDir, '.claude', 'skills'), // project-level
path.join(homeDir, '.claude', 'skills'), // user-level (~/.claude/skills/)
];
for (const skill of skillDirs) {
const srcPath = path.join(ROOT, skill, 'SKILL.md');
if (!fs.existsSync(srcPath)) continue;
const destDir = skill
? path.join(tmpDir, '.claude', 'skills', 'gstack', skill)
: path.join(tmpDir, '.claude', 'skills', 'gstack');
fs.mkdirSync(destDir, { recursive: true });
fs.copyFileSync(srcPath, path.join(destDir, 'SKILL.md'));
const skillName = skill || 'gstack';
for (const targetBase of installTargets) {
const destDir = path.join(targetBase, skillName);
fs.mkdirSync(destDir, { recursive: true });
fs.copyFileSync(srcPath, path.join(destDir, 'SKILL.md'));
}
}
// Copy CLAUDE.md so Claude has project context for skill routing.
const claudeMdSrc = path.join(ROOT, 'CLAUDE.md');
if (fs.existsSync(claudeMdSrc)) {
fs.copyFileSync(claudeMdSrc, path.join(tmpDir, 'CLAUDE.md'));
}
}
@@ -75,6 +109,31 @@ function initGitRepo(dir: string) {
run('git', ['config', 'user.name', 'Test']);
}
/**
 * Create a routing test working directory.
 * Mirrors the real repo checkout (ROOT): top-level context files, installed
 * skills, and an initial git commit. Routing tests need this context because
 * bare tmpDirs in containerized CI lack what Claude uses to route correctly;
 * each test gets its own tmpDir so concurrent runs don't interfere.
 */
function createRoutingWorkDir(suffix: string): string {
  const dir = fs.mkdtempSync(path.join(os.tmpdir(), `routing-${suffix}-`));
  // Seed the workspace with the repo's top-level context files (when present).
  for (const name of ['CLAUDE.md', 'README.md', 'package.json', 'ETHOS.md']) {
    const src = path.join(ROOT, name);
    if (fs.existsSync(src)) {
      fs.copyFileSync(src, path.join(dir, name));
    }
  }
  // Skill files + git history make the tmpDir look like a live project.
  installSkills(dir);
  initGitRepo(dir);
  const git = (...args: string[]) =>
    spawnSync('git', args, { cwd: dir, stdio: 'pipe', timeout: 5000 });
  git('add', '.');
  git('commit', '-m', 'initial');
  return dir;
}
function logCost(label: string, result: { costEstimate: { turnsUsed: number; estimatedTokens: number; estimatedCost: number }; duration: number }) {
const { turnsUsed, estimatedTokens, estimatedCost } = result.costEstimate;
const durationSec = Math.round(result.duration / 1000);
@@ -96,6 +155,15 @@ function recordRouting(name: string, result: SkillTestResult, expectedSkill: str
});
}
// Skip individual tests based on selectedTests (diff + tier filtering).
// selectedTests === null means "no filter active" — run everything.
const testIfSelected = (name: string, fn: () => Promise<void>, timeout?: number) => {
  const selected = selectedTests === null || selectedTests.includes(name);
  if (selected) {
    test.concurrent(name, fn, timeout);
  } else {
    test.skip(name, () => {});
  }
};
// --- Tests ---
describeE2E('Skill Routing E2E — Developer Journey', () => {
@@ -103,14 +171,9 @@ describeE2E('Skill Routing E2E — Developer Journey', () => {
evalCollector?.finalize();
});
test.concurrent('journey-ideation', async () => {
const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'routing-ideation-'));
testIfSelected('journey-ideation', async () => {
const tmpDir = createRoutingWorkDir('ideation');
try {
initGitRepo(tmpDir);
installSkills(tmpDir);
fs.writeFileSync(path.join(tmpDir, 'README.md'), '# New Project\n');
spawnSync('git', ['add', '.'], { cwd: tmpDir, stdio: 'pipe', timeout: 5000 });
spawnSync('git', ['commit', '-m', 'initial'], { cwd: tmpDir, stdio: 'pipe', timeout: 5000 });
const testName = 'journey-ideation';
const expectedSkill = 'office-hours';
@@ -137,11 +200,9 @@ describeE2E('Skill Routing E2E — Developer Journey', () => {
}
}, 150_000);
test.concurrent('journey-plan-eng', async () => {
const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'routing-plan-eng-'));
testIfSelected('journey-plan-eng', async () => {
const tmpDir = createRoutingWorkDir('plan-eng');
try {
initGitRepo(tmpDir);
installSkills(tmpDir);
fs.writeFileSync(path.join(tmpDir, 'plan.md'), `# Waitlist App Architecture
## Components
@@ -189,64 +250,14 @@ describeE2E('Skill Routing E2E — Developer Journey', () => {
}
}, 150_000);
test.concurrent('journey-think-bigger', async () => {
const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'routing-think-bigger-'));
// Removed: journey-think-bigger
// Tested ambiguous routing ("think bigger" → plan-ceo-review) but Claude
// legitimately answers directly instead of routing. Never passed reliably.
// The other 10 journey tests cover routing with clear signals.
testIfSelected('journey-debug', async () => {
const tmpDir = createRoutingWorkDir('debug');
try {
initGitRepo(tmpDir);
installSkills(tmpDir);
fs.writeFileSync(path.join(tmpDir, 'plan.md'), `# Waitlist App Architecture
## Components
- REST API (Express.js)
- PostgreSQL database
- React frontend
- SMS integration (Twilio)
## Data Model
- restaurants (id, name, settings)
- parties (id, restaurant_id, name, size, phone, status, created_at)
- wait_estimates (id, restaurant_id, avg_wait_minutes)
## API Endpoints
- POST /api/parties - add party to waitlist
- GET /api/parties - list current waitlist
- PATCH /api/parties/:id/status - update party status
- GET /api/estimate - get current wait estimate
`);
spawnSync('git', ['add', '.'], { cwd: tmpDir, stdio: 'pipe', timeout: 5000 });
spawnSync('git', ['commit', '-m', 'initial'], { cwd: tmpDir, stdio: 'pipe', timeout: 5000 });
const testName = 'journey-think-bigger';
const expectedSkill = 'plan-ceo-review';
const result = await runSkillTest({
prompt: "Actually, looking at this plan again, I feel like we're thinking too small. We're just doing waitlists but what about the whole restaurant guest experience? Is there a bigger opportunity here we should go after?",
workingDirectory: tmpDir,
maxTurns: 5,
allowedTools: ['Skill', 'Read', 'Bash', 'Glob', 'Grep'],
timeout: 120_000,
testName,
runId,
});
const skillCalls = result.toolCalls.filter(tc => tc.tool === 'Skill');
const actualSkill = skillCalls.length > 0 ? skillCalls[0]?.input?.skill : undefined;
logCost(`journey: ${testName}`, result);
recordRouting(testName, result, expectedSkill, actualSkill);
expect(skillCalls.length, `Expected Skill tool to be called but got 0 calls. Claude may have answered directly without invoking a skill. Tool calls: ${result.toolCalls.map(tc => tc.tool).join(', ')}`).toBeGreaterThan(0);
expect([expectedSkill], `Expected skill ${expectedSkill} but got ${actualSkill}`).toContain(actualSkill);
} finally {
fs.rmSync(tmpDir, { recursive: true, force: true });
}
}, 180_000);
test.concurrent('journey-debug', async () => {
const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'routing-debug-'));
try {
initGitRepo(tmpDir);
installSkills(tmpDir);
const run = (cmd: string, args: string[]) =>
spawnSync(cmd, args, { cwd: tmpDir, stdio: 'pipe', timeout: 5000 });
@@ -295,18 +306,16 @@ export default app;
recordRouting(testName, result, expectedSkill, actualSkill);
expect(skillCalls.length, `Expected Skill tool to be called but got 0 calls. Claude may have answered directly without invoking a skill. Tool calls: ${result.toolCalls.map(tc => tc.tool).join(', ')}`).toBeGreaterThan(0);
expect([expectedSkill], `Expected skill ${expectedSkill} but got ${actualSkill}`).toContain(actualSkill);
const validSkills = ['investigate', 'qa'];
expect(validSkills, `Expected one of ${validSkills.join('/')} but got ${actualSkill}`).toContain(actualSkill);
} finally {
fs.rmSync(tmpDir, { recursive: true, force: true });
}
}, 150_000);
test.concurrent('journey-qa', async () => {
const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'routing-qa-'));
testIfSelected('journey-qa', async () => {
const tmpDir = createRoutingWorkDir('qa');
try {
initGitRepo(tmpDir);
installSkills(tmpDir);
fs.writeFileSync(path.join(tmpDir, 'package.json'), JSON.stringify({ name: 'waitlist-app', scripts: { dev: 'next dev' } }, null, 2));
fs.mkdirSync(path.join(tmpDir, 'src'), { recursive: true });
fs.writeFileSync(path.join(tmpDir, 'src/index.html'), '<html><body><h1>Waitlist App</h1></body></html>');
@@ -340,18 +349,15 @@ export default app;
}
}, 150_000);
test.concurrent('journey-code-review', async () => {
const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'routing-code-review-'));
testIfSelected('journey-code-review', async () => {
const tmpDir = createRoutingWorkDir('code-review');
try {
initGitRepo(tmpDir);
installSkills(tmpDir);
const run = (cmd: string, args: string[]) =>
spawnSync(cmd, args, { cwd: tmpDir, stdio: 'pipe', timeout: 5000 });
fs.writeFileSync(path.join(tmpDir, 'app.ts'), '// base\n');
run('git', ['add', '.']);
run('git', ['commit', '-m', 'initial']);
run('git', ['commit', '-m', 'add base app']);
run('git', ['checkout', '-b', 'feature/add-waitlist']);
fs.writeFileSync(path.join(tmpDir, 'app.ts'), '// updated with waitlist feature\nimport { WaitlistService } from "./waitlist";\n');
fs.writeFileSync(path.join(tmpDir, 'waitlist.ts'), 'export class WaitlistService {\n async addParty(name: string, size: number) {\n // TODO: implement\n }\n}\n');
@@ -383,18 +389,15 @@ export default app;
}
}, 150_000);
test.concurrent('journey-ship', async () => {
const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'routing-ship-'));
testIfSelected('journey-ship', async () => {
const tmpDir = createRoutingWorkDir('ship');
try {
initGitRepo(tmpDir);
installSkills(tmpDir);
const run = (cmd: string, args: string[]) =>
spawnSync(cmd, args, { cwd: tmpDir, stdio: 'pipe', timeout: 5000 });
fs.writeFileSync(path.join(tmpDir, 'app.ts'), '// base\n');
run('git', ['add', '.']);
run('git', ['commit', '-m', 'initial']);
run('git', ['commit', '-m', 'add base app']);
run('git', ['checkout', '-b', 'feature/waitlist']);
fs.writeFileSync(path.join(tmpDir, 'app.ts'), '// waitlist feature\n');
run('git', ['add', '.']);
@@ -425,12 +428,9 @@ export default app;
}
}, 150_000);
test.concurrent('journey-docs', async () => {
const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'routing-docs-'));
testIfSelected('journey-docs', async () => {
const tmpDir = createRoutingWorkDir('docs');
try {
initGitRepo(tmpDir);
installSkills(tmpDir);
const run = (cmd: string, args: string[]) =>
spawnSync(cmd, args, { cwd: tmpDir, stdio: 'pipe', timeout: 5000 });
@@ -465,12 +465,9 @@ export default app;
}
}, 150_000);
test.concurrent('journey-retro', async () => {
const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'routing-retro-'));
testIfSelected('journey-retro', async () => {
const tmpDir = createRoutingWorkDir('retro');
try {
initGitRepo(tmpDir);
installSkills(tmpDir);
const run = (cmd: string, args: string[]) =>
spawnSync(cmd, args, { cwd: tmpDir, stdio: 'pipe', timeout: 5000 });
@@ -511,18 +508,9 @@ export default app;
}
}, 150_000);
test.concurrent('journey-design-system', async () => {
const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'routing-design-system-'));
testIfSelected('journey-design-system', async () => {
const tmpDir = createRoutingWorkDir('design-system');
try {
initGitRepo(tmpDir);
installSkills(tmpDir);
const run = (cmd: string, args: string[]) =>
spawnSync(cmd, args, { cwd: tmpDir, stdio: 'pipe', timeout: 5000 });
fs.writeFileSync(path.join(tmpDir, 'package.json'), JSON.stringify({ name: 'waitlist-app' }, null, 2));
run('git', ['add', '.']);
run('git', ['commit', '-m', 'initial']);
const testName = 'journey-design-system';
const expectedSkill = 'design-consultation';
@@ -549,12 +537,9 @@ export default app;
}
}, 150_000);
test.concurrent('journey-visual-qa', async () => {
const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'routing-visual-qa-'));
testIfSelected('journey-visual-qa', async () => {
const tmpDir = createRoutingWorkDir('visual-qa');
try {
initGitRepo(tmpDir);
installSkills(tmpDir);
const run = (cmd: string, args: string[]) =>
spawnSync(cmd, args, { cwd: tmpDir, stdio: 'pipe', timeout: 5000 });
@@ -597,7 +582,8 @@ body { font-family: sans-serif; }
recordRouting(testName, result, expectedSkill, actualSkill);
expect(skillCalls.length, `Expected Skill tool to be called but got 0 calls. Claude may have answered directly without invoking a skill. Tool calls: ${result.toolCalls.map(tc => tc.tool).join(', ')}`).toBeGreaterThan(0);
expect([expectedSkill], `Expected skill ${expectedSkill} but got ${actualSkill}`).toContain(actualSkill);
const validSkills = ['design-review', 'qa', 'qa-only', 'browse'];
expect(validSkills, `Expected one of ${validSkills.join('/')} but got ${actualSkill}`).toContain(actualSkill);
} finally {
fs.rmSync(tmpDir, { recursive: true, force: true });
}
+139 -29
View File
@@ -99,6 +99,20 @@ describe('SKILL.md command validation', () => {
const result = validateSkill(skill);
expect(result.snapshotFlagErrors).toHaveLength(0);
});
// Validate autoplan's $B commands and snapshot flags; no-op when the skill
// file is absent (e.g. partial checkouts).
test('all $B commands in autoplan/SKILL.md are valid browse commands', () => {
  const skillPath = path.join(ROOT, 'autoplan', 'SKILL.md');
  if (!fs.existsSync(skillPath)) return;
  expect(validateSkill(skillPath).invalid).toHaveLength(0);
});
test('all snapshot flags in autoplan/SKILL.md are valid', () => {
  const skillPath = path.join(ROOT, 'autoplan', 'SKILL.md');
  if (!fs.existsSync(skillPath)) return;
  expect(validateSkill(skillPath).snapshotFlagErrors).toHaveLength(0);
});
});
describe('Command registry consistency', () => {
@@ -227,6 +241,7 @@ describe('Update check preamble', () => {
'benchmark/SKILL.md',
'land-and-deploy/SKILL.md',
'setup-deploy/SKILL.md',
'cso/SKILL.md',
];
for (const skill of skillsWithUpdateCheck) {
@@ -513,10 +528,12 @@ describe('TODOS-format.md reference consistency', () => {
// --- v0.4.1 feature coverage: RECOMMENDATION format, session awareness, enum completeness ---
describe('v0.4.1 preamble features', () => {
const skillsWithPreamble = [
'SKILL.md', 'browse/SKILL.md', 'qa/SKILL.md',
'qa-only/SKILL.md',
'setup-browser-cookies/SKILL.md',
// Tier 1 skills have core preamble only (no AskUserQuestion format)
const tier1Skills = ['SKILL.md', 'browse/SKILL.md', 'setup-browser-cookies/SKILL.md', 'benchmark/SKILL.md'];
// Tier 2+ skills have AskUserQuestion format with RECOMMENDATION
const tier2PlusSkills = [
'qa/SKILL.md', 'qa-only/SKILL.md',
'ship/SKILL.md', 'review/SKILL.md',
'plan-ceo-review/SKILL.md', 'plan-eng-review/SKILL.md',
'retro/SKILL.md',
@@ -526,22 +543,25 @@ describe('v0.4.1 preamble features', () => {
'design-consultation/SKILL.md',
'document-release/SKILL.md',
'canary/SKILL.md',
'benchmark/SKILL.md',
'land-and-deploy/SKILL.md',
'setup-deploy/SKILL.md',
'cso/SKILL.md',
];
for (const skill of skillsWithPreamble) {
const skillsWithPreamble = [...tier1Skills, ...tier2PlusSkills];
for (const skill of tier2PlusSkills) {
test(`${skill} contains RECOMMENDATION format`, () => {
const content = fs.readFileSync(path.join(ROOT, skill), 'utf-8');
expect(content).toContain('RECOMMENDATION: Choose');
expect(content).toContain('AskUserQuestion');
});
}
for (const skill of skillsWithPreamble) {
test(`${skill} contains session awareness`, () => {
const content = fs.readFileSync(path.join(ROOT, skill), 'utf-8');
expect(content).toContain('_SESSIONS');
expect(content).toContain('RECOMMENDATION');
});
}
@@ -724,14 +744,8 @@ describe('Contributor mode preamble structure', () => {
for (const skill of skillsWithPreamble) {
test(`${skill} has 0-10 rating in contributor mode`, () => {
const content = fs.readFileSync(path.join(ROOT, skill), 'utf-8');
expect(content).toContain('0 to 10');
expect(content).toContain('My rating');
});
test(`${skill} has calibration example`, () => {
const content = fs.readFileSync(path.join(ROOT, skill), 'utf-8');
expect(content).toContain('Calibration');
expect(content).toContain('the bar');
expect(content).toContain('0-10');
expect(content).toContain('Rating');
});
test(`${skill} has "what would make this a 10" field`, () => {
@@ -807,7 +821,7 @@ describe('Completeness Principle in generated SKILL.md files', () => {
'design-review/SKILL.md',
'design-consultation/SKILL.md',
'document-release/SKILL.md',
];
'cso/SKILL.md', ];
for (const skill of skillsWithPreamble) {
test(`${skill} contains Completeness Principle section`, () => {
@@ -817,17 +831,12 @@ describe('Completeness Principle in generated SKILL.md files', () => {
});
}
test('Completeness Principle includes compression table', () => {
const content = fs.readFileSync(path.join(ROOT, 'SKILL.md'), 'utf-8');
test('Completeness Principle includes compression table in tier 2+ skills', () => {
// Root is tier 1 (no completeness). Check tier 2+ skill.
const content = fs.readFileSync(path.join(ROOT, 'cso', 'SKILL.md'), 'utf-8');
expect(content).toContain('CC+gstack');
expect(content).toContain('Compression');
});
test('Completeness Principle includes anti-patterns', () => {
const content = fs.readFileSync(path.join(ROOT, 'SKILL.md'), 'utf-8');
expect(content).toContain('BAD:');
expect(content).toContain('Anti-patterns');
});
});
// --- Part 7: Planted-bug fixture validation (A4) ---
@@ -961,10 +970,37 @@ describe('gstack-slug', () => {
test('output is eval-compatible (KEY=VALUE format)', () => {
const result = Bun.spawnSync([SLUG_BIN], { cwd: ROOT, stdout: 'pipe', stderr: 'pipe' });
const lines = result.stdout.toString().trim().split('\n');
expect(lines.length).toBe(3);
expect(lines.length).toBe(2);
expect(lines[0]).toMatch(/^SLUG=.+/);
expect(lines[1]).toMatch(/^BRANCH=.+/);
expect(lines[2]).toMatch(/^PROJECTS_DIR=.+/);
});
// Guard against shell injection via gstack-slug output: values are eval'd by
// bash in templates, so they must contain no metacharacters.
test('output values contain only safe characters (no shell metacharacters)', () => {
  // Only alphanumeric, dot, dash, underscore are allowed (#133)
  const SAFE = /^[a-zA-Z0-9._-]+$/;
  const out = Bun.spawnSync([SLUG_BIN], { cwd: ROOT, stdout: 'pipe', stderr: 'pipe' })
    .stdout.toString();
  expect(out.match(/SLUG=(.*)/)?.[1] ?? '').toMatch(SAFE);
  expect(out.match(/BRANCH=(.*)/)?.[1] ?? '').toMatch(SAFE);
});
test('eval sets variables under bash with set -euo pipefail', () => {
  // Exercise the exact consumption pattern templates use: eval the script's
  // stdout under strict bash, then echo the variables back out.
  const script = 'set -euo pipefail; eval "$(./bin/gstack-slug 2>/dev/null)"; echo "SLUG=$SLUG"; echo "BRANCH=$BRANCH"';
  const result = Bun.spawnSync(['bash', '-c', script], { cwd: ROOT, stdout: 'pipe', stderr: 'pipe' });
  expect(result.exitCode).toBe(0);
  const printed = result.stdout.toString();
  expect(printed).toMatch(/^SLUG=.+/m);
  expect(printed).toMatch(/^BRANCH=.+/m);
});
test('no templates or bin scripts use source process substitution for gstack-slug', () => {
  const result = Bun.spawnSync(
    ['grep', '-r', 'source <(.*gstack-slug', '--include=*.tmpl', '--include=gstack-review-*', '.'],
    { cwd: ROOT, stdout: 'pipe', stderr: 'pipe' }
  );
  // grep returns exit code 1 when no matches found — that's what we want
  expect(result.stdout.toString().trim()).toBe('');
});
});
@@ -1275,7 +1311,7 @@ describe('Codex skill', () => {
expect(content).toContain('fall back to the Claude adversarial subagent');
// Review log uses new skill name
expect(content).toContain('adversarial-review');
expect(content).toContain('xhigh');
expect(content).toContain('reasoning_effort="high"');
expect(content).toContain('ADVERSARIAL REVIEW SYNTHESIS');
});
@@ -1285,17 +1321,23 @@ describe('Codex skill', () => {
expect(content).toContain('< 50');
expect(content).toContain('200+');
expect(content).toContain('adversarial-review');
expect(content).toContain('xhigh');
expect(content).toContain('reasoning_effort="high"');
expect(content).toContain('Investigate and fix');
});
test('codex-host ship/review do NOT contain adversarial review step', () => {
// .agents/ is gitignored — generate on demand
Bun.spawnSync(['bun', 'run', 'scripts/gen-skill-docs.ts', '--host', 'codex'], {
cwd: ROOT, stdout: 'pipe', stderr: 'pipe',
});
const shipContent = fs.readFileSync(path.join(ROOT, '.agents', 'skills', 'gstack-ship', 'SKILL.md'), 'utf-8');
expect(shipContent).not.toContain('codex review --base');
expect(shipContent).not.toContain('Investigate and fix');
expect(shipContent).not.toContain('CODEX_REVIEWS');
const reviewContent = fs.readFileSync(path.join(ROOT, '.agents', 'skills', 'gstack-review', 'SKILL.md'), 'utf-8');
expect(reviewContent).not.toContain('codex review --base');
expect(reviewContent).not.toContain('codex_reviews');
expect(reviewContent).not.toContain('CODEX_REVIEWS');
expect(reviewContent).not.toContain('adversarial-review');
expect(reviewContent).not.toContain('Investigate and fix');
});
@@ -1306,6 +1348,13 @@ describe('Codex skill', () => {
expect(content).toContain('codex exec');
});
test('/review persists a review-log entry for ship readiness', () => {
const content = fs.readFileSync(path.join(ROOT, 'review', 'SKILL.md'), 'utf-8');
expect(content).toContain('"skill":"review"');
expect(content).toContain('"issues_found":N');
expect(content).toContain('Persist Eng Review result');
});
test('Review Readiness Dashboard includes Adversarial Review row', () => {
const content = fs.readFileSync(path.join(ROOT, 'ship', 'SKILL.md'), 'utf-8');
expect(content).toContain('Adversarial');
@@ -1362,6 +1411,11 @@ describe('Skill trigger phrases', () => {
describe('Codex skill validation', () => {
const AGENTS_DIR = path.join(ROOT, '.agents', 'skills');
// .agents/ is gitignored (v0.11.2.0) — generate on demand for tests
Bun.spawnSync(['bun', 'run', 'scripts/gen-skill-docs.ts', '--host', 'codex'], {
cwd: ROOT, stdout: 'pipe', stderr: 'pipe',
});
// Discover all Claude skills with templates (except /codex which is Claude-only)
const CLAUDE_SKILLS_WITH_TEMPLATES = (() => {
const skills: string[] = [];
@@ -1423,3 +1477,59 @@ describe('Codex skill validation', () => {
}
});
});
// --- Repo mode and test failure triage validation ---
describe('Repo mode preamble validation', () => {
test('generated SKILL.md preamble contains REPO_MODE output', () => {
const content = fs.readFileSync(path.join(ROOT, 'SKILL.md'), 'utf-8');
expect(content).toContain('REPO_MODE:');
expect(content).toContain('gstack-repo-mode');
});
test('tier 3+ skills contain See Something Say Something section', () => {
// Root SKILL.md is tier 1 (no Repo Mode). Check a tier 3 skill instead.
const content = fs.readFileSync(path.join(ROOT, 'plan-ceo-review', 'SKILL.md'), 'utf-8');
expect(content).toContain('See Something, Say Something');
expect(content).toContain('REPO_MODE');
expect(content).toContain('solo');
expect(content).toContain('collaborative');
});
});
describe('Test failure triage in ship skill', () => {
test('ship/SKILL.md contains Test Failure Ownership Triage', () => {
const content = fs.readFileSync(path.join(ROOT, 'ship', 'SKILL.md'), 'utf-8');
expect(content).toContain('Test Failure Ownership Triage');
});
test('ship/SKILL.md triage uses git diff for classification', () => {
const content = fs.readFileSync(path.join(ROOT, 'ship', 'SKILL.md'), 'utf-8');
expect(content).toContain('git diff origin/<base>...HEAD --name-only');
});
test('ship/SKILL.md triage has solo and collaborative paths', () => {
const content = fs.readFileSync(path.join(ROOT, 'ship', 'SKILL.md'), 'utf-8');
expect(content).toContain('REPO_MODE');
expect(content).toContain('solo');
expect(content).toContain('collaborative');
expect(content).toContain('Investigate and fix now');
expect(content).toContain('Add as P0 TODO');
});
test('ship/SKILL.md triage has GitHub issue assignment for collaborative mode', () => {
const content = fs.readFileSync(path.join(ROOT, 'ship', 'SKILL.md'), 'utf-8');
expect(content).toContain('gh issue create');
expect(content).toContain('--assignee');
});
test('{{TEST_FAILURE_TRIAGE}} placeholder is fully resolved in ship/SKILL.md', () => {
const content = fs.readFileSync(path.join(ROOT, 'ship', 'SKILL.md'), 'utf-8');
expect(content).not.toContain('{{TEST_FAILURE_TRIAGE}}');
});
test('ship/SKILL.md uses in-branch language for stop condition', () => {
const content = fs.readFileSync(path.join(ROOT, 'ship', 'SKILL.md'), 'utf-8');
expect(content).toContain('In-branch test failures');
});
});
+125 -5
View File
@@ -78,8 +78,8 @@ describe('gstack-telemetry-log', () => {
const events = parseJsonl();
expect(events).toHaveLength(1);
// installation_id should be a SHA-256 hash (64 hex chars)
expect(events[0].installation_id).toMatch(/^[a-f0-9]{64}$/);
// installation_id should be a UUID v4 (or hex fallback)
expect(events[0].installation_id).toMatch(/^[a-f0-9-]{32,36}$/);
});
test('installation_id is null for anonymous tier', () => {
@@ -125,6 +125,82 @@ describe('gstack-telemetry-log', () => {
expect(events[0]).toHaveProperty('_branch');
});
// ─── json_safe() injection prevention tests ────────────────
test('sanitizes skill name with quote injection attempt', () => {
setConfig('telemetry', 'anonymous');
run(`${BIN}/gstack-telemetry-log --skill 'review","injected":"true' --duration 10 --outcome success --session-id inj-1`);
const lines = readJsonl();
expect(lines).toHaveLength(1);
// Must be valid JSON (no injection — quotes stripped, so no field injection possible)
const event = JSON.parse(lines[0]);
// The key check: no injected top-level property was created
expect(event).not.toHaveProperty('injected');
// Skill field should have quotes stripped but content preserved
expect(event.skill).not.toContain('"');
});
test('truncates skill name exceeding 200 chars', () => {
setConfig('telemetry', 'anonymous');
const longSkill = 'a'.repeat(250);
run(`${BIN}/gstack-telemetry-log --skill '${longSkill}' --duration 10 --outcome success --session-id trunc-1`);
const events = parseJsonl();
expect(events[0].skill.length).toBeLessThanOrEqual(200);
});
test('sanitizes outcome with newline injection attempt', () => {
setConfig('telemetry', 'anonymous');
// Use printf to pass actual newline in the argument
run(`bash -c 'OUTCOME=$(printf "success\\nfake\\":\\"true"); ${BIN}/gstack-telemetry-log --skill qa --duration 10 --outcome "$OUTCOME" --session-id inj-2'`);
const lines = readJsonl();
expect(lines).toHaveLength(1);
const event = JSON.parse(lines[0]);
expect(event).not.toHaveProperty('fake');
});
test('sanitizes session_id with backslash-quote injection', () => {
setConfig('telemetry', 'anonymous');
run(`${BIN}/gstack-telemetry-log --skill qa --duration 10 --outcome success --session-id 'id\\\\"","x":"y'`);
const lines = readJsonl();
expect(lines).toHaveLength(1);
const event = JSON.parse(lines[0]);
expect(event).not.toHaveProperty('x');
});
test('sanitizes error_class with quote injection', () => {
setConfig('telemetry', 'anonymous');
run(`${BIN}/gstack-telemetry-log --skill qa --duration 10 --outcome error --error-class 'timeout","extra":"val' --session-id inj-3`);
const lines = readJsonl();
expect(lines).toHaveLength(1);
const event = JSON.parse(lines[0]);
expect(event).not.toHaveProperty('extra');
});
test('sanitizes failed_step with quote injection', () => {
setConfig('telemetry', 'anonymous');
run(`${BIN}/gstack-telemetry-log --skill qa --duration 10 --outcome error --failed-step 'step1","hacked":"yes' --session-id inj-4`);
const lines = readJsonl();
expect(lines).toHaveLength(1);
const event = JSON.parse(lines[0]);
expect(event).not.toHaveProperty('hacked');
});
test('escapes error_message quotes and preserves content', () => {
setConfig('telemetry', 'anonymous');
run(`${BIN}/gstack-telemetry-log --skill qa --duration 10 --outcome error --error-message 'Error: file "test.txt" not found' --session-id inj-5`);
const lines = readJsonl();
expect(lines).toHaveLength(1);
const event = JSON.parse(lines[0]);
expect(event.error_message).toContain('file');
expect(event.error_message).toContain('not found');
});
test('creates analytics directory if missing', () => {
// Remove analytics dir
const analyticsDir = path.join(tmpDir, 'analytics');
@@ -136,6 +212,34 @@ describe('gstack-telemetry-log', () => {
expect(fs.existsSync(analyticsDir)).toBe(true);
expect(readJsonl()).toHaveLength(1);
});
// ─── Telemetry JSON safety: branch/repo with special chars ────
test('branch name with quotes does not corrupt JSON', () => {
setConfig('telemetry', 'anonymous');
// Simulate a branch name with double quotes by setting it via git env override
// The json_safe function strips quotes, so the JSONL should remain valid
run(`${BIN}/gstack-telemetry-log --skill qa --duration 10 --outcome success --session-id branch-quotes-1`);
const lines = readJsonl();
expect(lines).toHaveLength(1);
// Every line must be valid JSON
const event = JSON.parse(lines[0]);
expect(event._branch).toBeDefined();
// _branch should not contain double quotes (json_safe strips them)
expect(event._branch).not.toContain('"');
});
test('repo slug with special chars does not corrupt JSON', () => {
setConfig('telemetry', 'anonymous');
run(`${BIN}/gstack-telemetry-log --skill qa --duration 10 --outcome success --session-id repo-special-1`);
const lines = readJsonl();
expect(lines).toHaveLength(1);
const event = JSON.parse(lines[0]);
expect(event._repo_slug).toBeDefined();
// _repo_slug should not contain double quotes (json_safe strips them)
expect(event._repo_slug).not.toContain('"');
});
});
describe('.pending marker', () => {
@@ -244,16 +348,32 @@ describe('gstack-analytics', () => {
});
describe('gstack-telemetry-sync', () => {
test('exits silently with no endpoint configured', () => {
// Default: GSTACK_TELEMETRY_ENDPOINT is not set → exit 0
test('exits silently with no Supabase URL configured', () => {
// Default: GSTACK_SUPABASE_URL is not set → exit 0
const result = run(`${BIN}/gstack-telemetry-sync`);
expect(result).toBe('');
});
test('exits silently with no JSONL file', () => {
const result = run(`${BIN}/gstack-telemetry-sync`, { GSTACK_TELEMETRY_ENDPOINT: 'http://localhost:9999' });
const result = run(`${BIN}/gstack-telemetry-sync`, { GSTACK_SUPABASE_URL: 'http://localhost:9999' });
expect(result).toBe('');
});
test('does not rename JSONL field names (edge function expects raw names)', () => {
setConfig('telemetry', 'anonymous');
run(`${BIN}/gstack-telemetry-log --skill qa --duration 60 --outcome success --session-id raw-fields-1`);
const events = parseJsonl();
expect(events).toHaveLength(1);
// Edge function expects these raw field names, NOT Postgres column names
expect(events[0]).toHaveProperty('v');
expect(events[0]).toHaveProperty('ts');
expect(events[0]).toHaveProperty('sessions');
// Should NOT have Postgres column names
expect(events[0]).not.toHaveProperty('schema_version');
expect(events[0]).not.toHaveProperty('event_timestamp');
expect(events[0]).not.toHaveProperty('concurrent_sessions');
});
});
describe('gstack-community-dashboard', () => {
+48 -6
View File
@@ -13,6 +13,7 @@ import {
selectTests,
detectBaseBranch,
E2E_TOUCHFILES,
E2E_TIERS,
LLM_JUDGE_TOUCHFILES,
GLOBAL_TOUCHFILES,
} from './helpers/touchfiles';
@@ -79,8 +80,10 @@ describe('selectTests', () => {
expect(result.selected).toContain('plan-ceo-review');
expect(result.selected).toContain('plan-ceo-review-selective');
expect(result.selected).toContain('plan-ceo-review-benefits');
expect(result.selected.length).toBe(3);
expect(result.skipped.length).toBe(Object.keys(E2E_TOUCHFILES).length - 3);
expect(result.selected).toContain('autoplan-core');
expect(result.selected).toContain('codex-offered-ceo-review');
expect(result.selected.length).toBe(5);
expect(result.skipped.length).toBe(Object.keys(E2E_TOUCHFILES).length - 5);
});
test('global touchfile triggers ALL tests', () => {
@@ -90,10 +93,19 @@ describe('selectTests', () => {
expect(result.reason).toContain('global');
});
test('gen-skill-docs.ts is a global touchfile', () => {
test('gen-skill-docs.ts is a scoped touchfile, not global', () => {
const result = selectTests(['scripts/gen-skill-docs.ts'], E2E_TOUCHFILES);
expect(result.selected.length).toBe(Object.keys(E2E_TOUCHFILES).length);
expect(result.reason).toContain('global');
// Should select tests that list gen-skill-docs.ts in their touchfiles, not ALL tests
expect(result.selected.length).toBeGreaterThan(0);
expect(result.selected.length).toBeLessThan(Object.keys(E2E_TOUCHFILES).length);
expect(result.reason).toBe('diff');
// Should include tests that depend on gen-skill-docs.ts
expect(result.selected).toContain('skillmd-setup-discovery');
expect(result.selected).toContain('contributor-mode');
expect(result.selected).toContain('journey-ideation');
// Should NOT include tests that don't depend on it
expect(result.selected).not.toContain('retro');
expect(result.selected).not.toContain('cso-full-audit');
});
test('unrelated file selects nothing', () => {
@@ -142,7 +154,7 @@ describe('selectTests', () => {
});
test('global touchfiles work for LLM-judge tests too', () => {
const result = selectTests(['scripts/gen-skill-docs.ts'], LLM_JUDGE_TOUCHFILES);
const result = selectTests(['test/helpers/session-runner.ts'], LLM_JUDGE_TOUCHFILES);
expect(result.selected.length).toBe(Object.keys(LLM_JUDGE_TOUCHFILES).length);
});
});
@@ -232,6 +244,36 @@ describe('TOUCHFILES completeness', () => {
}
});
test('E2E_TIERS covers exactly the same tests as E2E_TOUCHFILES', () => {
const touchfileKeys = new Set(Object.keys(E2E_TOUCHFILES));
const tierKeys = new Set(Object.keys(E2E_TIERS));
const missingFromTiers = [...touchfileKeys].filter(k => !tierKeys.has(k));
const extraInTiers = [...tierKeys].filter(k => !touchfileKeys.has(k));
if (missingFromTiers.length > 0) {
throw new Error(
`E2E tests missing TIER entries: ${missingFromTiers.join(', ')}\n` +
`Add these to E2E_TIERS in test/helpers/touchfiles.ts`,
);
}
if (extraInTiers.length > 0) {
throw new Error(
`E2E_TIERS has extra entries not in E2E_TOUCHFILES: ${extraInTiers.join(', ')}\n` +
`Remove these from E2E_TIERS or add to E2E_TOUCHFILES`,
);
}
});
test('E2E_TIERS only contains valid tier values', () => {
const validTiers = ['gate', 'periodic'];
for (const [name, tier] of Object.entries(E2E_TIERS)) {
if (!validTiers.includes(tier)) {
throw new Error(`E2E_TIERS['${name}'] has invalid tier '${tier}'. Valid: ${validTiers.join(', ')}`);
}
}
});
test('every LLM-judge test has a TOUCHFILES entry', () => {
const llmContent = fs.readFileSync(
path.join(ROOT, 'test', 'skill-llm-eval.test.ts'),
+165
View File
@@ -0,0 +1,165 @@
import { describe, test, expect, beforeEach, afterEach } from 'bun:test';
import { spawnSync } from 'child_process';
import * as fs from 'fs';
import * as path from 'path';
import * as os from 'os';
const ROOT = path.resolve(import.meta.dir, '..');
const UNINSTALL = path.join(ROOT, 'bin', 'gstack-uninstall');
describe('gstack-uninstall', () => {
  /**
   * Existence check that does NOT follow symlinks.
   *
   * The per-skill entries created in beforeEach below are dangling symlinks
   * (their targets, e.g. gstack/review, are never created on disk), so
   * fs.existsSync() — which stats the link *target* — reports false even
   * while the link itself is still present. That made the original
   * "symlink was removed" assertions pass vacuously. lstat sees the link
   * entry itself, making the assertions meaningful.
   */
  function entryExists(p: string): boolean {
    try {
      fs.lstatSync(p);
      return true;
    } catch {
      return false;
    }
  }

  test('syntax check passes', () => {
    // bash -n parses the script without executing anything.
    const result = spawnSync('bash', ['-n', UNINSTALL], { stdio: 'pipe' });
    expect(result.status).toBe(0);
  });

  test('--help prints usage and exits 0', () => {
    const result = spawnSync('bash', [UNINSTALL, '--help'], { stdio: 'pipe' });
    expect(result.status).toBe(0);
    const output = result.stdout.toString();
    expect(output).toContain('gstack-uninstall');
    expect(output).toContain('--force');
    expect(output).toContain('--keep-state');
  });

  test('unknown flag exits with error', () => {
    const result = spawnSync('bash', [UNINSTALL, '--bogus'], {
      stdio: 'pipe',
      // Point HOME somewhere empty so the script cannot touch real user state.
      env: { ...process.env, HOME: '/nonexistent' },
    });
    expect(result.status).toBe(1);
    expect(result.stderr.toString()).toContain('Unknown option');
  });

  describe('integration tests with mock layout', () => {
    let tmpDir: string;
    let mockHome: string;
    let mockGitRoot: string;

    beforeEach(() => {
      tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'gstack-uninstall-test-'));
      mockHome = path.join(tmpDir, 'home');
      mockGitRoot = path.join(tmpDir, 'repo');
      // Create mock gstack install layout
      fs.mkdirSync(path.join(mockHome, '.claude', 'skills', 'gstack'), { recursive: true });
      fs.writeFileSync(path.join(mockHome, '.claude', 'skills', 'gstack', 'SKILL.md'), 'test');
      // Per-skill symlinks (both old unprefixed and new prefixed). NOTE: these
      // are deliberately dangling — gstack/review and gstack/ship are never
      // created — which is why removal assertions use entryExists(), not
      // fs.existsSync().
      fs.symlinkSync('gstack/review', path.join(mockHome, '.claude', 'skills', 'review'));
      fs.symlinkSync('gstack/ship', path.join(mockHome, '.claude', 'skills', 'gstack-ship'));
      // Create a non-gstack skill directory (should NOT be removed)
      fs.mkdirSync(path.join(mockHome, '.claude', 'skills', 'other-tool'), { recursive: true });
      // Create state directory
      fs.mkdirSync(path.join(mockHome, '.gstack', 'projects'), { recursive: true });
      fs.writeFileSync(path.join(mockHome, '.gstack', 'config.json'), '{}');
      // Create mock git repo
      fs.mkdirSync(mockGitRoot, { recursive: true });
      spawnSync('git', ['init', '-b', 'main'], { cwd: mockGitRoot, stdio: 'pipe' });
    });

    afterEach(() => {
      fs.rmSync(tmpDir, { recursive: true, force: true });
    });

    /** Run the uninstaller against the mock layout with optional extra flags. */
    function runUninstall(...flags: string[]) {
      return spawnSync('bash', [UNINSTALL, '--force', ...flags], {
        stdio: 'pipe',
        env: {
          ...process.env,
          HOME: mockHome,
          GSTACK_DIR: path.join(mockHome, '.claude', 'skills', 'gstack'),
          GSTACK_STATE_DIR: path.join(mockHome, '.gstack'),
        },
        cwd: mockGitRoot,
      });
    }

    test('--force removes global Claude skills and state', () => {
      const result = runUninstall();
      expect(result.status).toBe(0);
      const output = result.stdout.toString();
      expect(output).toContain('gstack uninstalled');
      // Global skill dir should be removed
      expect(fs.existsSync(path.join(mockHome, '.claude', 'skills', 'gstack'))).toBe(false);
      // Per-skill symlinks pointing into gstack/ should be removed.
      // lstat-based check: existsSync would be vacuously false for these
      // dangling links even if uninstall left them behind.
      expect(entryExists(path.join(mockHome, '.claude', 'skills', 'review'))).toBe(false);
      expect(entryExists(path.join(mockHome, '.claude', 'skills', 'gstack-ship'))).toBe(false);
      // Non-gstack tool should still exist
      expect(fs.existsSync(path.join(mockHome, '.claude', 'skills', 'other-tool'))).toBe(true);
      // State should be removed
      expect(fs.existsSync(path.join(mockHome, '.gstack'))).toBe(false);
    });

    test('--keep-state preserves state directory', () => {
      const result = runUninstall('--keep-state');
      expect(result.status).toBe(0);
      // Skills should be removed
      expect(fs.existsSync(path.join(mockHome, '.claude', 'skills', 'gstack'))).toBe(false);
      // State should still exist
      expect(fs.existsSync(path.join(mockHome, '.gstack'))).toBe(true);
      expect(fs.existsSync(path.join(mockHome, '.gstack', 'config.json'))).toBe(true);
    });

    test('clean system outputs nothing to remove', () => {
      const cleanHome = path.join(tmpDir, 'clean-home');
      fs.mkdirSync(cleanHome, { recursive: true });
      const result = spawnSync('bash', [UNINSTALL, '--force'], {
        stdio: 'pipe',
        env: {
          ...process.env,
          HOME: cleanHome,
          GSTACK_DIR: path.join(cleanHome, 'nonexistent'),
          GSTACK_STATE_DIR: path.join(cleanHome, '.gstack'),
        },
        cwd: mockGitRoot,
      });
      expect(result.status).toBe(0);
      expect(result.stdout.toString()).toContain('Nothing to remove');
    });

    test('upgrade path: prefixed install + uninstall cleans both old and new symlinks', () => {
      // Simulate the state after setup --no-prefix followed by setup (with prefix):
      // mockHome already carries both the old unprefixed ('review') and new
      // prefixed ('gstack-ship') symlinks.
      const result = runUninstall();
      expect(result.status).toBe(0);
      // Both old (review) and new (gstack-ship) symlinks should be gone
      expect(entryExists(path.join(mockHome, '.claude', 'skills', 'review'))).toBe(false);
      expect(entryExists(path.join(mockHome, '.claude', 'skills', 'gstack-ship'))).toBe(false);
      // Non-gstack should survive
      expect(fs.existsSync(path.join(mockHome, '.claude', 'skills', 'other-tool'))).toBe(true);
    });
  });
});
+271
View File
@@ -0,0 +1,271 @@
/**
* Unit tests for WorktreeManager.
*
* Tests worktree lifecycle: create, harvest, dedup, cleanup, prune.
* Each test creates real git worktrees in a temporary repo.
*/
import { describe, test, expect, afterEach } from 'bun:test';
import { WorktreeManager } from '../lib/worktree';
import type { HarvestResult } from '../lib/worktree';
import { spawnSync } from 'child_process';
import * as fs from 'fs';
import * as path from 'path';
import * as os from 'os';
/**
 * Build a throwaway git repository for worktree tests.
 *
 * Layout: one tracked README (so an initial commit and HEAD exist), a
 * .gitignore mirroring the real repo, and gitignored artifact trees
 * (.agents/skills, browse/dist) that WorktreeManager is expected to copy
 * into new worktrees.
 *
 * @returns absolute path of the freshly created repo root.
 */
function createTestRepo(): string {
  const root = fs.mkdtempSync(path.join(os.tmpdir(), 'worktree-test-'));
  const git = (...args: string[]) =>
    spawnSync('git', args, { cwd: root, stdio: 'pipe' });
  const write = (rel: string[], body: string) => {
    const target = path.join(root, ...rel);
    fs.mkdirSync(path.dirname(target), { recursive: true });
    fs.writeFileSync(target, body);
  };

  git('init');
  git('config', 'user.email', 'test@test.com');
  git('config', 'user.name', 'Test');

  // Tracked file — gives the repo an initial commit so HEAD exists.
  write(['README.md'], '# Test repo\n');
  // Ignore rules matching the real repo, so copied build artifacts don't
  // show up as changes during harvest.
  write(['.gitignore'], '.agents/\nbrowse/dist/\n.gstack-worktrees/\n');
  // Gitignored build artifacts (simulated).
  write(['.agents', 'skills', 'test-skill.md'], '# Test skill\n');
  write(['browse', 'dist', 'browse'], '#!/bin/sh\necho browse\n');

  git('add', 'README.md', '.gitignore');
  git('commit', '-m', 'Initial commit');
  return root;
}
/**
 * Best-effort teardown of a repo created by createTestRepo().
 *
 * @param dir - repo root to delete.
 */
function cleanupRepo(dir: string): void {
  // Prune registered worktrees first; stale worktree metadata can otherwise
  // leave git lock files that interfere with removal.
  spawnSync('git', ['worktree', 'prune'], { cwd: dir, stdio: 'pipe' });
  fs.rmSync(dir, { force: true, recursive: true });
}
// Repos created by tests; torn down (best effort) in the afterEach hook below
const repos: string[] = [];
// Dedup index path — removed after each test to avoid cross-test contamination.
// NOTE(review): this points into the REAL user home (~/.gstack-dev), so running
// the suite deletes actual user harvest-dedup state; presumably the lib offers
// no env override yet — confirm and redirect to a tmpdir if it does.
const DEDUP_PATH = path.join(os.homedir(), '.gstack-dev', 'harvests', 'dedup.json');
afterEach(() => {
  for (const repo of repos) {
    try { cleanupRepo(repo); } catch { /* best effort */ }
  }
  repos.length = 0;
  // Clear dedup index so tests are independent
  try { fs.unlinkSync(DEDUP_PATH); } catch { /* may not exist */ }
});
describe('WorktreeManager', () => {
  /**
   * Spin up a fresh test repo (registered for afterEach teardown) together
   * with a manager bound to it.
   */
  const setup = (): { repo: string; mgr: WorktreeManager } => {
    const repo = createTestRepo();
    repos.push(repo);
    return { repo, mgr: new WorktreeManager(repo) };
  };

  test('create() produces a valid worktree at the expected path', () => {
    const { mgr } = setup();
    const wt = mgr.create('test-1');
    expect(fs.existsSync(wt)).toBe(true);
    expect(fs.existsSync(path.join(wt, 'README.md'))).toBe(true);
    expect(wt).toContain('.gstack-worktrees');
    expect(wt).toContain('test-1');
    mgr.cleanup('test-1');
  });

  test('create() worktree has .agents/skills/ (gitignored artifacts copied)', () => {
    const { mgr } = setup();
    const wt = mgr.create('test-agents');
    // Gitignored artifacts aren't part of the checkout — the manager must copy them.
    expect(fs.existsSync(path.join(wt, '.agents', 'skills', 'test-skill.md'))).toBe(true);
    expect(fs.existsSync(path.join(wt, 'browse', 'dist', 'browse'))).toBe(true);
    mgr.cleanup('test-agents');
  });

  test('create() stores correct originalSha', () => {
    const { repo, mgr } = setup();
    const headSha = spawnSync('git', ['rev-parse', 'HEAD'], { cwd: repo, stdio: 'pipe' })
      .stdout.toString().trim();
    mgr.create('test-sha');
    const info = mgr.getInfo('test-sha');
    expect(info).toBeDefined();
    expect(info!.originalSha).toBe(headSha);
    mgr.cleanup('test-sha');
  });

  test('harvest() captures modifications to tracked files', () => {
    const { mgr } = setup();
    const wt = mgr.create('test-harvest-mod');
    // Edit a tracked file inside the worktree, then harvest.
    fs.writeFileSync(path.join(wt, 'README.md'), '# Modified!\n');
    const harvested = mgr.harvest('test-harvest-mod');
    expect(harvested).not.toBeNull();
    expect(harvested!.changedFiles).toContain('README.md');
    expect(harvested!.isDuplicate).toBe(false);
    expect(harvested!.patchPath).toBeTruthy();
    expect(fs.existsSync(harvested!.patchPath)).toBe(true);
    mgr.cleanup('test-harvest-mod');
  });

  test('harvest() captures new untracked files (git add -A path)', () => {
    const { mgr } = setup();
    const wt = mgr.create('test-harvest-new');
    fs.writeFileSync(path.join(wt, 'new-file.txt'), 'Hello from agent\n');
    const harvested = mgr.harvest('test-harvest-new');
    expect(harvested).not.toBeNull();
    expect(harvested!.changedFiles).toContain('new-file.txt');
    mgr.cleanup('test-harvest-new');
  });

  test('harvest() captures committed changes (git diff originalSha)', () => {
    const { mgr } = setup();
    const wt = mgr.create('test-harvest-commit');
    // Agents may commit inside their worktree; harvest must diff against originalSha.
    fs.writeFileSync(path.join(wt, 'committed.txt'), 'Agent committed this\n');
    spawnSync('git', ['add', 'committed.txt'], { cwd: wt, stdio: 'pipe' });
    spawnSync('git', ['commit', '-m', 'Agent commit'], { cwd: wt, stdio: 'pipe' });
    const harvested = mgr.harvest('test-harvest-commit');
    expect(harvested).not.toBeNull();
    expect(harvested!.changedFiles).toContain('committed.txt');
    mgr.cleanup('test-harvest-commit');
  });

  test('harvest() returns null when worktree is clean', () => {
    const { mgr } = setup();
    mgr.create('test-harvest-clean');
    // No modifications at all — nothing to harvest.
    expect(mgr.harvest('test-harvest-clean')).toBeNull();
    mgr.cleanup('test-harvest-clean');
  });

  test('harvest() dedup skips identical patches', () => {
    const { repo, mgr: firstRun } = setup();
    const wt1 = firstRun.create('test-dedup-1');
    fs.writeFileSync(path.join(wt1, 'dedup-test.txt'), 'same content\n');
    const first = firstRun.harvest('test-dedup-1');
    firstRun.cleanup('test-dedup-1');
    expect(first).not.toBeNull();
    expect(first!.isDuplicate).toBe(false);
    // A second manager producing a byte-identical change must be flagged as duplicate.
    const secondRun = new WorktreeManager(repo);
    const wt2 = secondRun.create('test-dedup-2');
    fs.writeFileSync(path.join(wt2, 'dedup-test.txt'), 'same content\n');
    const second = secondRun.harvest('test-dedup-2');
    secondRun.cleanup('test-dedup-2');
    expect(second).not.toBeNull();
    expect(second!.isDuplicate).toBe(true);
  });

  test('cleanup() removes worktree directory', () => {
    const { mgr } = setup();
    const wt = mgr.create('test-cleanup');
    expect(fs.existsSync(wt)).toBe(true);
    mgr.cleanup('test-cleanup');
    expect(fs.existsSync(wt)).toBe(false);
  });

  test('pruneStale() removes orphaned worktrees from previous runs', () => {
    const { repo, mgr: previousRun } = setup();
    // A worktree from a "previous run" (different manager instance).
    const stalePath = previousRun.create('stale-test');
    const staleRunDir = path.dirname(stalePath);
    expect(fs.existsSync(stalePath)).toBe(true);
    // Unregister from git but leave the directory behind, as a crash would.
    spawnSync('git', ['worktree', 'remove', '--force', stalePath], { cwd: repo, stdio: 'pipe' });
    fs.mkdirSync(stalePath, { recursive: true });
    // A new manager's pruneStale() should sweep the old run's directory.
    new WorktreeManager(repo).pruneStale();
    expect(fs.existsSync(staleRunDir)).toBe(false);
  });

  test('create() throws on failure (no silent fallback to ROOT)', () => {
    const { mgr } = setup();
    mgr.create('test-fail');
    // Second create with the same id collides on the path and must throw.
    expect(() => mgr.create('test-fail')).toThrow();
    mgr.cleanup('test-fail');
  });

  test('harvest() returns null gracefully when worktree dir was deleted by agent', () => {
    const { mgr } = setup();
    const wt = mgr.create('test-deleted');
    // Simulate the agent deleting its own worktree directory.
    fs.rmSync(wt, { recursive: true, force: true });
    // Neither harvest nor cleanup may throw on a vanished worktree.
    expect(mgr.harvest('test-deleted')).toBeNull();
    mgr.cleanup('test-deleted');
  });
});