Merge remote-tracking branch 'origin/main' into garrytan/trunk-land-skill

# Conflicts: # CHANGELOG.md # VERSION # package.json
2026-06-18 07:40:09 +02:00 · 2026-06-17 08:36:46 -07:00
parent e0bb11dfc9 c7ae63201a
commit 2e102232e4
159 changed files with 15237 additions and 500 deletions
@@ -347,7 +347,13 @@ describe('runAgentSdkTest — options propagation', () => {
    expect(opts.permissionMode).toBe('bypassPermissions');
    expect(opts.allowDangerouslySkipPermissions).toBe(true);
    expect(opts.settingSources).toEqual([]);
-    expect(opts.env).toEqual({ ANTHROPIC_API_KEY: 'fake' });
+    // env is the COMPLETE hermetic env with the per-test override merged
+    // last — partial pass-through was the documented SDK auth-breaker
+    // (Options.env replaces the child's entire environment).
+    expect(opts.env?.ANTHROPIC_API_KEY).toBe('fake');
+    expect(opts.env?.PATH).toBeTruthy();
+    expect(opts.env?.CLAUDE_CONFIG_DIR).toMatch(/\/\.claude$/);
+    expect(opts.env?.GSTACK_HOME).toContain('gstack-home');
    expect(opts.pathToClaudeCodeExecutable).toBe('/fake/path/claude');
  });

@@ -0,0 +1,91 @@
+import { describe, test, expect } from 'bun:test';
+import * as path from 'path';
+import * as fs from 'fs';
+
+// Static tripwires for the B2 render-isolation wiring. These fail CI if a
+// refactor drops a load-bearing line, re-introducing the "dev-setup dirties
+// tracked SKILL.md" drift (or worse, leaks the skip-guard into real installs).
+const ROOT = path.resolve(import.meta.dir, '..'); // one directory above this test file's directory
+const read = (rel: string) => fs.readFileSync(path.join(ROOT, rel), 'utf-8'); // ROOT-relative path -> file text (UTF-8); throws if the file is missing
+
+describe('dev-setup: worktree stays canonical', () => {
+  const devSetup = read('bin/dev-setup'); // script text, read once when this describe body runs; the tests only string-match it, nothing is executed
+
+  test('passes GSTACK_SKIP_GBRAIN_REGEN inline on the nested setup call', () => {
+    expect(devSetup).toContain('GSTACK_SKIP_GBRAIN_REGEN=1 "$GSTACK_LINK/setup"');
+  });
+
+  test('never exports GSTACK_SKIP_GBRAIN_REGEN (would leak into other setup paths)', () => {
+    expect(devSetup).not.toMatch(/export\s+GSTACK_SKIP_GBRAIN_REGEN/); // any "export <whitespace> GSTACK_SKIP_GBRAIN_REGEN" anywhere in the script fails this
+  });
+
+  test('renders the :user variant into an out-dir, not in place', () => {
+    expect(devSetup).toContain('--out-dir'); // two independent substring checks: the flag ...
+    expect(devSetup).toContain('.claude/gstack-rendered'); // ... and the render path; they are not required to sit on the same line
+  });
+
+  test('gates the render on gstack-gbrain-detect --is-ok', () => {
+    expect(devSetup).toContain('--is-ok'); // only the flag text is checked, not which binary receives it
+  });
+});
+
+describe('setup: honors GSTACK_SKIP_GBRAIN_REGEN', () => {
+  const setup = read('setup'); // full text of the top-level setup script, read once for all tests below
+
+  test('skips the in-place :user regen when the guard is set', () => {
+    expect(setup).toContain('${GSTACK_SKIP_GBRAIN_REGEN:-}');
+    // The guard must wrap the in-place render, not the detection persist.
+    const idx = setup.indexOf('GSTACK_SKIP_GBRAIN_REGEN'); // FIRST mention of the bare variable name (may be a comment rather than the guard itself)
+    const after = setup.slice(idx, idx + 600); // 600-char window starting at that first mention
+    expect(after).toContain('leaving tracked SKILL.md canonical'); // the skip message must appear inside that window
+  });
+
+  test('uses a PID-unique detection tmp (no concurrent clobber)', () => {
+    expect(setup).toContain('$DETECTION_FILE.$$.tmp'); // literal shell text; $$ is matched as written, not expanded
+  });
+
+  test('gates detection on the shared --is-ok check', () => {
+    expect(setup).toContain('"$DETECT_BIN" --is-ok');
+  });
+});
+
+describe('gen-skill-docs: section rewrite is gated on --out-dir', () => {
+  const gen = read('scripts/gen-skill-docs.ts'); // generator source as text; inspected statically, never imported or run
+
+  test('rewriteSectionBase is a no-op without --out-dir', () => {
+    expect(gen).toContain('function rewriteSectionBase');
+    const idx = gen.indexOf('function rewriteSectionBase'); // start of the declaration (guaranteed present by the assertion above)
+    const body = gen.slice(idx, idx + 400); // first 400 chars from the declaration; the guard must sit within this window
+    expect(body).toContain('if (!OUT_DIR) return content');
+    expect(body).toContain('sections'); // surgical: regex targets only /sections/ paths
+  });
+});
+
+describe('dev-teardown: removes the untracked render', () => {
+  const teardown = read('bin/dev-teardown'); // teardown script text, string-matched only
+
+  test('rm -rf the gstack-rendered dir', () => {
+    expect(teardown).toContain('gstack-rendered');
+    expect(teardown).toMatch(/rm -rf .*RENDER_DIR/); // "." does not cross newlines, so "rm -rf" and RENDER_DIR must share a line
+  });
+});
+
+describe('.gitignore: render dir is declared untracked', () => {
+  test('.claude/gstack-rendered/ is ignored', () => {
+    expect(read('.gitignore')).toContain('.claude/gstack-rendered/'); // raw substring match on the file text; does not evaluate gitignore semantics (negations, ordering)
+  });
+});
+
+describe('dev-skill: refreshes the render on template change', () => {
+  const devSkill = read('scripts/dev-skill.ts'); // watcher script source as text; checked statically
+
+  test('re-renders the :user variant into the workspace render dir', () => {
+    expect(devSkill).toContain('gstack-rendered'); // three independent substring checks; none asserts they belong to the same command
+    expect(devSkill).toContain('--out-dir');
+    expect(devSkill).toContain('--respect-detection');
+  });
+
+  test('only refreshes when the render dir already exists (never creates it during plain dev)', () => {
+    expect(devSkill).toContain('fs.existsSync(RENDER_DIR)'); // pins the literal guard expression, so renaming RENDER_DIR in the script trips this test
+  });
+});
@@ -0,0 +1,96 @@
+/**
+ * Drift guards for the committed diagram-render bundle (eng-review D2).
+ *
+ * Tier 1 (always, free, <50ms): dist/diagram-render.html must hash to exactly
+ * what dist/BUILD_INFO.json records, and the BUILD_INFO dependency pins must
+ * match package.json. Catches hand-edited dist files and "bumped the pin,
+ * forgot to rebuild" commits.
+ *
+ * Tier 1.5 (always, file hashes only): BUILD_INFO.srcSha256 must match
+ * src/entry.ts + scripts/build.ts on disk. Catches "edited src, forgot to
+ * rebuild" commits without needing node_modules.
+ *
+ * Tier 2 (deep, only where this package's deps are installed): rebuild from
+ * source and compare hashes. Skipped when lib/diagram-render/node_modules is
+ * absent (fresh clone, or any environment that does not run `bun install` in
+ * that dir) or when the local bun version differs from the one recorded at
+ * build time (minifier output is only guaranteed deterministic within a bun
+ * version).
+ */
+import { describe, expect, test } from "bun:test";
+import { createHash } from "node:crypto";
+import { existsSync } from "node:fs";
+import path from "node:path";
+
+const ROOT = path.resolve(import.meta.dir, "..", "lib", "diagram-render"); // nested package root, resolved relative to this test file
+const DIST_HTML = path.join(ROOT, "dist", "diagram-render.html"); // committed bundle
+const BUILD_INFO = path.join(ROOT, "dist", "BUILD_INFO.json"); // build manifest; fields read below: sha256, bytes, deps, srcSha256, bunVersion
+
+describe("diagram-render bundle drift", () => {
+  test("dist hash matches BUILD_INFO (tamper check)", async () => {
+    const html = await Bun.file(DIST_HTML).text();
+    const info = await Bun.file(BUILD_INFO).json();
+    const sha = createHash("sha256").update(html).digest("hex"); // hashes the decoded text, not the raw file bytes
+    expect(sha).toBe(info.sha256);
+    expect(Buffer.byteLength(html)).toBe(info.bytes); // UTF-8 byte length of the decoded text
+  });
+
+  test("BUILD_INFO dependency pins match package.json", async () => {
+    const info = await Bun.file(BUILD_INFO).json();
+    const pkg = await Bun.file(path.join(ROOT, "package.json")).json();
+    expect(info.deps).toEqual(pkg.dependencies); // deep equality over the whole dependencies map
+  });
+
+  test("BUILD_INFO srcSha256 matches src on disk (edited-src-forgot-rebuild guard)", async () => {
+    // The deep rebuild check below needs node_modules, which CI doesn't
+    // install for this nested package — this tier-1.5 fingerprint catches a
+    // src edit committed without a rebuild using nothing but file hashes.
+    const info = await Bun.file(BUILD_INFO).json();
+    const srcSha = createHash("sha256")
+      .update(await Bun.file(path.join(ROOT, "src", "entry.ts")).text()) // order matters: entry.ts is fed first ...
+      .update(await Bun.file(path.join(ROOT, "scripts", "build.ts")).text()) // ... then build.ts; presumably the build script fingerprints in the same order — confirm there
+      .digest("hex");
+    expect(srcSha).toBe(info.srcSha256);
+  });
+
+  test("bundle font stack matches print-css (text-measurement drift guard)", async () => {
+    const entrySrc = await Bun.file(path.join(ROOT, "src", "entry.ts")).text();
+    // Every family print-css composes into the body stack must appear in the
+    // bundle's PRINT_SANS literal — mermaid measures text with these fonts and
+    // the print document lays it out with print-css's; drift = overflowing
+    // labels (eng-review D3).
+    for (const family of [
+      "Helvetica", "Liberation Sans", "Arial",
+      "Hiragino Kaku Gothic ProN", "Noto Sans CJK JP", "Microsoft YaHei",
+      "Apple Color Emoji", "Segoe UI Emoji", "Noto Color Emoji",
+    ]) {
+      expect(entrySrc).toContain(family); // substring anywhere in entry.ts; the list here is hard-coded, not read from print-css
+    }
+  });
+
+  test("page invariants: module script, base href, escaped terminators, error trap", async () => {
+    const html = await Bun.file(DIST_HTML).text();
+    expect(html).toContain('<script type="module">');
+    expect(html).toContain('<base href="https://gstack-render.localhost/">');
+    expect(html).toContain("window.__errors = []");
+    // The inline module must contain no live </script> other than the page's
+    // own closers: head error-trap closer + module closer.
+    const closers = html.match(/<\/script>/g) ?? []; // match() yields null on zero hits, hence the fallback
+    expect(closers.length).toBe(2);
+  });
+
+  const nodeModules = path.join(ROOT, "node_modules");
+  let builtWithSameBun = false;
+  try {
+    const info = require(BUILD_INFO); // synchronous load: the answer is needed while tests are still being registered
+    builtWithSameBun = info.bunVersion === Bun.version;
+  } catch {} // BUILD_INFO missing or unparseable: flag stays false, so the deep check is skipped instead of crashing registration
+  const canDeepCheck = existsSync(nodeModules) && builtWithSameBun; // both conditions required: deps installed AND same bun version as the recorded build
+
+  test.skipIf(!canDeepCheck)(
+    "deep: fresh build reproduces committed dist",
+    async () => {
+      const before = await Bun.file(BUILD_INFO).json();
+      const proc = Bun.spawnSync(["bun", "run", "scripts/build.ts"], { cwd: ROOT }); // NOTE(review): presumably rewrites dist/ and BUILD_INFO.json in place — confirm against scripts/build.ts
+      expect(proc.exitCode).toBe(0);
+      const after = await Bun.file(BUILD_INFO).json(); // re-read after the build to see what it recorded
+      expect(after.sha256).toBe(before.sha256);
+    },
+    60000, // per-test timeout in milliseconds
+  );
+});
@@ -53,6 +53,13 @@ echo "REPO_MODE: $REPO_MODE"
 _SESSION_KIND=$(~/.claude/skills/gstack/bin/gstack-session-kind 2>/dev/null || echo "interactive")
 case "$_SESSION_KIND" in spawned|headless|interactive) ;; *) _SESSION_KIND="interactive" ;; esac
 echo "SESSION_KIND: $_SESSION_KIND"
+# Conductor host: AskUserQuestion is unreliable here (native disabled, MCP
+# variant flaky), so skills render decisions as prose instead of calling the
+# tool. Gated on !headless so an eval/CI run INSIDE Conductor (GSTACK_HEADLESS)
+# still BLOCKs rather than rendering prose to nobody.
+if [ "$_SESSION_KIND" != "headless" ] && { [ -n "${CONDUCTOR_WORKSPACE_PATH:-}" ] || [ -n "${CONDUCTOR_PORT:-}" ]; }; then
+  echo "CONDUCTOR_SESSION: true"
+fi
 _LAKE_SEEN=$([ -f ~/.gstack/.completeness-intro-seen ] && echo "yes" || echo "no")
 echo "LAKE_INTRO: $_LAKE_SEEN"
 _TEL=$(~/.claude/skills/gstack/bin/gstack-config get telemetry 2>/dev/null || true)
@@ -302,7 +309,9 @@ AI orchestrator (e.g., OpenClaw). In spawned sessions:

 "AskUserQuestion" can resolve to two tools at runtime: the **host MCP variant** (e.g. `mcp__conductor__AskUserQuestion` — appears in your tool list when the host registers it) or the **native** Claude Code tool.

-**Rule:** if any `mcp__*__AskUserQuestion` variant is in your tool list, prefer it. Hosts may disable native AUQ via `--disallowedTools AskUserQuestion` (Conductor does, by default) and route through their MCP variant; calling native there silently fails. Same questions/options shape; same decision-brief format applies.
+**Conductor rule (read before the MCP rule):** if `CONDUCTOR_SESSION: true` was echoed by the preamble, do NOT call AskUserQuestion at all — neither native nor any `mcp__*__AskUserQuestion` variant. Render EVERY decision brief as the **prose form** below and STOP. This is proactive, not a reaction to a failure: Conductor disables native AUQ and its MCP variant is flaky (it returns `[Tool result missing due to internal error]`), so prose is the reliable path. **Auto-decide preferences still apply first:** if a `[plan-tune auto-decide] <id> → <option>` result has already surfaced for a question, proceed with that option (no prose). Because in Conductor you go straight to prose without ever calling the tool, this auto-decide-first ordering is enforced HERE, not only by the PreToolUse hook. When you render a Conductor prose brief, also capture it with `bin/gstack-question-log` (the PostToolUse capture hook never fires on a prose path, so `/plan-tune` history/learning depends on this call).
+
+**Rule (non-Conductor):** if any `mcp__*__AskUserQuestion` variant is in your tool list, prefer it. Hosts may disable native AUQ via `--disallowedTools AskUserQuestion` (Conductor does, by default) and route through their MCP variant; calling native there silently fails. Same questions/options shape; same decision-brief format applies.

 If AskUserQuestion is unavailable (no variant in your tool list) OR a call to it fails, do NOT silently auto-decide or write the decision to the plan file as a substitute. Follow the **failure fallback** below.

@@ -324,7 +333,11 @@ Tell three outcomes apart:
 2. **Completeness scores per choice** — explicit `Completeness: X/10` on EACH choice (10 complete, 7 happy-path, 3 shortcut); use the kind-note when options differ in kind not coverage, but never silently drop the score.
 3. **The recommendation and why** — a `Recommendation: <choice> because <reason>` line plus the `(recommended)` marker on that choice.

-Layout: a `D<N>` title + a one-line note that AskUserQuestion failed and to reply with a letter; the issue ELI10; the Recommendation line; then ONE paragraph per choice carrying its `(recommended)` marker, its `Completeness: X/10`, and 2-4 sentences of reasoning — never a bare bullet list; a closing `Net:` line. Split chains / 5+ options: one prose block per per-option call, in sequence. Then STOP and wait — the user's typed answer is the decision. In plan mode this satisfies end-of-turn like a tool call.
+Layout: a `D<N>` title + a one-line note to reply with a letter (in Conductor this is the normal path; elsewhere it means AskUserQuestion was unavailable or errored); the issue ELI10; the Recommendation line; then ONE paragraph per choice carrying its `(recommended)` marker, its `Completeness: X/10`, and 2-4 sentences of reasoning — never a bare bullet list; a closing `Net:` line. Split chains / 5+ options: one prose block per per-option call, in sequence. Then STOP and wait — the user's typed answer is the decision. In plan mode this satisfies end-of-turn like a tool call.
+
+**Continuation — mapping a typed reply back to a brief.** Each brief carries a stable label (`D<N>`, or `D<N>.k` in a split chain). The user references it (e.g. "3.2: B"). A bare letter maps to the single most-recent UNANSWERED brief; if more than one is open (a split chain), do NOT guess — ask which `D<N>.k` it answers. Never apply a bare letter ambiguously across a chain.
+
+**One-way / destructive confirmations in prose.** When the decision is a one-way door (irreversible or destructive — delete, force-push, drop, overwrite), prose is a WEAKER gate than the tool, so make it stronger: require an explicit typed confirmation (the exact option letter or word), state plainly what is irreversible, and NEVER proceed on a vague, partial, or ambiguous reply — re-ask instead. Treat silence or "ok"/"sure" without the explicit choice as not-yet-confirmed.

 ### Format

@@ -408,7 +421,7 @@ Before calling AskUserQuestion, verify:
 - [ ] (recommended) label on one option (even for neutral-posture)
 - [ ] Dual-scale effort labels on effort-bearing options (human / CC)
 - [ ] Net line closes the decision
- [ ] You are calling the tool, not writing prose — unless the documented failure fallback applies (then: prose with the mandatory triad — issue ELI10, per-choice Completeness, Recommendation + `(recommended)` — and a "reply with a letter" instruction, then STOP)
+- [ ] You are calling the tool, not writing prose — unless `CONDUCTOR_SESSION: true` (then prose is the DEFAULT, not the tool) OR the documented failure fallback applies (then: prose with the mandatory triad — issue ELI10, per-choice Completeness, Recommendation + `(recommended)` — and a "reply with a letter" instruction, then STOP)
 - [ ] Non-ASCII characters (CJK / accents) written directly, NOT \u-escaped
 - [ ] If you had 5+ options, you split (or batched into ≤4-groups) — did NOT drop any
 - [ ] If you split, you checked dependencies between options before firing the chain
@@ -39,6 +39,13 @@ echo "REPO_MODE: $REPO_MODE"
 _SESSION_KIND=$($GSTACK_BIN/gstack-session-kind 2>/dev/null || echo "interactive")
 case "$_SESSION_KIND" in spawned|headless|interactive) ;; *) _SESSION_KIND="interactive" ;; esac
 echo "SESSION_KIND: $_SESSION_KIND"
+# Conductor host: AskUserQuestion is unreliable here (native disabled, MCP
+# variant flaky), so skills render decisions as prose instead of calling the
+# tool. Gated on !headless so an eval/CI run INSIDE Conductor (GSTACK_HEADLESS)
+# still BLOCKs rather than rendering prose to nobody.
+if [ "$_SESSION_KIND" != "headless" ] && { [ -n "${CONDUCTOR_WORKSPACE_PATH:-}" ] || [ -n "${CONDUCTOR_PORT:-}" ]; }; then
+  echo "CONDUCTOR_SESSION: true"
+fi
 _LAKE_SEEN=$([ -f ~/.gstack/.completeness-intro-seen ] && echo "yes" || echo "no")
 echo "LAKE_INTRO: $_LAKE_SEEN"
 _TEL=$($GSTACK_BIN/gstack-config get telemetry 2>/dev/null || true)
@@ -288,7 +295,9 @@ AI orchestrator (e.g., OpenClaw). In spawned sessions:

 "AskUserQuestion" can resolve to two tools at runtime: the **host MCP variant** (e.g. `mcp__conductor__AskUserQuestion` — appears in your tool list when the host registers it) or the **native** Claude Code tool.

-**Rule:** if any `mcp__*__AskUserQuestion` variant is in your tool list, prefer it. Hosts may disable native AUQ via `--disallowedTools AskUserQuestion` (Conductor does, by default) and route through their MCP variant; calling native there silently fails. Same questions/options shape; same decision-brief format applies.
+**Conductor rule (read before the MCP rule):** if `CONDUCTOR_SESSION: true` was echoed by the preamble, do NOT call AskUserQuestion at all — neither native nor any `mcp__*__AskUserQuestion` variant. Render EVERY decision brief as the **prose form** below and STOP. This is proactive, not a reaction to a failure: Conductor disables native AUQ and its MCP variant is flaky (it returns `[Tool result missing due to internal error]`), so prose is the reliable path. **Auto-decide preferences still apply first:** if a `[plan-tune auto-decide] <id> → <option>` result has already surfaced for a question, proceed with that option (no prose). Because in Conductor you go straight to prose without ever calling the tool, this auto-decide-first ordering is enforced HERE, not only by the PreToolUse hook. When you render a Conductor prose brief, also capture it with `bin/gstack-question-log` (the PostToolUse capture hook never fires on a prose path, so `/plan-tune` history/learning depends on this call).
+
+**Rule (non-Conductor):** if any `mcp__*__AskUserQuestion` variant is in your tool list, prefer it. Hosts may disable native AUQ via `--disallowedTools AskUserQuestion` (Conductor does, by default) and route through their MCP variant; calling native there silently fails. Same questions/options shape; same decision-brief format applies.

 If AskUserQuestion is unavailable (no variant in your tool list) OR a call to it fails, do NOT silently auto-decide or write the decision to the plan file as a substitute. Follow the **failure fallback** below.

@@ -310,7 +319,11 @@ Tell three outcomes apart:
 2. **Completeness scores per choice** — explicit `Completeness: X/10` on EACH choice (10 complete, 7 happy-path, 3 shortcut); use the kind-note when options differ in kind not coverage, but never silently drop the score.
 3. **The recommendation and why** — a `Recommendation: <choice> because <reason>` line plus the `(recommended)` marker on that choice.

-Layout: a `D<N>` title + a one-line note that AskUserQuestion failed and to reply with a letter; the issue ELI10; the Recommendation line; then ONE paragraph per choice carrying its `(recommended)` marker, its `Completeness: X/10`, and 2-4 sentences of reasoning — never a bare bullet list; a closing `Net:` line. Split chains / 5+ options: one prose block per per-option call, in sequence. Then STOP and wait — the user's typed answer is the decision. In plan mode this satisfies end-of-turn like a tool call.
+Layout: a `D<N>` title + a one-line note to reply with a letter (in Conductor this is the normal path; elsewhere it means AskUserQuestion was unavailable or errored); the issue ELI10; the Recommendation line; then ONE paragraph per choice carrying its `(recommended)` marker, its `Completeness: X/10`, and 2-4 sentences of reasoning — never a bare bullet list; a closing `Net:` line. Split chains / 5+ options: one prose block per per-option call, in sequence. Then STOP and wait — the user's typed answer is the decision. In plan mode this satisfies end-of-turn like a tool call.
+
+**Continuation — mapping a typed reply back to a brief.** Each brief carries a stable label (`D<N>`, or `D<N>.k` in a split chain). The user references it (e.g. "3.2: B"). A bare letter maps to the single most-recent UNANSWERED brief; if more than one is open (a split chain), do NOT guess — ask which `D<N>.k` it answers. Never apply a bare letter ambiguously across a chain.
+
+**One-way / destructive confirmations in prose.** When the decision is a one-way door (irreversible or destructive — delete, force-push, drop, overwrite), prose is a WEAKER gate than the tool, so make it stronger: require an explicit typed confirmation (the exact option letter or word), state plainly what is irreversible, and NEVER proceed on a vague, partial, or ambiguous reply — re-ask instead. Treat silence or "ok"/"sure" without the explicit choice as not-yet-confirmed.

 ### Format

@@ -394,7 +407,7 @@ Before calling AskUserQuestion, verify:
 - [ ] (recommended) label on one option (even for neutral-posture)
 - [ ] Dual-scale effort labels on effort-bearing options (human / CC)
 - [ ] Net line closes the decision
- [ ] You are calling the tool, not writing prose — unless the documented failure fallback applies (then: prose with the mandatory triad — issue ELI10, per-choice Completeness, Recommendation + `(recommended)` — and a "reply with a letter" instruction, then STOP)
+- [ ] You are calling the tool, not writing prose — unless `CONDUCTOR_SESSION: true` (then prose is the DEFAULT, not the tool) OR the documented failure fallback applies (then: prose with the mandatory triad — issue ELI10, per-choice Completeness, Recommendation + `(recommended)` — and a "reply with a letter" instruction, then STOP)
 - [ ] Non-ASCII characters (CJK / accents) written directly, NOT \u-escaped
 - [ ] If you had 5+ options, you split (or batched into ≤4-groups) — did NOT drop any
 - [ ] If you split, you checked dependencies between options before firing the chain
@@ -1277,6 +1290,22 @@ EVAL_JUDGE_TIER=full EVAL_VERBOSE=1 bin/test-lane --eval test/evals/<suite>_eval

 If multiple suites need to run, run them sequentially (each needs a test lane). If the first suite fails, stop immediately — don't burn API cost on remaining suites.

+**Long eval suites (30+ min): launch detached so a turn boundary can't kill them.**
+A plain backgrounded eval lives in the harness's process group and is killed by
+SIGTERM ("polite quit") on a turn boundary, a stopped monitor, or an interruption
+(observed mid-`/ship`: `script terminated by signal SIGTERM`). Run it through
+`$GSTACK_ROOT/bin/gstack-detach` instead — it survives in its own
+session, serializes against other worktrees via a machine lock (no API
+saturation), and writes a guaranteed `### gstack-detach EXIT=<code> ###` sentinel:
+
+```bash
+$GSTACK_ROOT/bin/gstack-detach --label ship-evals --lock gstack-evals --timeout 5400 -- <project eval command>
+```
+
+Then poll the printed log path; break on the `EXIT=` sentinel (covers both pass
+and crash — silence is never success). The detached run survives even if your
+poller is reaped.
+
 **4. Check results:**

 - **If any eval fails:** Show the failures, the cost dashboard, and **STOP**. Do not proceed.
@@ -41,6 +41,13 @@ echo "REPO_MODE: $REPO_MODE"
 _SESSION_KIND=$($GSTACK_BIN/gstack-session-kind 2>/dev/null || echo "interactive")
 case "$_SESSION_KIND" in spawned|headless|interactive) ;; *) _SESSION_KIND="interactive" ;; esac
 echo "SESSION_KIND: $_SESSION_KIND"
+# Conductor host: AskUserQuestion is unreliable here (native disabled, MCP
+# variant flaky), so skills render decisions as prose instead of calling the
+# tool. Gated on !headless so an eval/CI run INSIDE Conductor (GSTACK_HEADLESS)
+# still BLOCKs rather than rendering prose to nobody.
+if [ "$_SESSION_KIND" != "headless" ] && { [ -n "${CONDUCTOR_WORKSPACE_PATH:-}" ] || [ -n "${CONDUCTOR_PORT:-}" ]; }; then
+  echo "CONDUCTOR_SESSION: true"
+fi
 _LAKE_SEEN=$([ -f ~/.gstack/.completeness-intro-seen ] && echo "yes" || echo "no")
 echo "LAKE_INTRO: $_LAKE_SEEN"
 _TEL=$($GSTACK_BIN/gstack-config get telemetry 2>/dev/null || true)
@@ -290,7 +297,9 @@ AI orchestrator (e.g., OpenClaw). In spawned sessions:

 "AskUserQuestion" can resolve to two tools at runtime: the **host MCP variant** (e.g. `mcp__conductor__AskUserQuestion` — appears in your tool list when the host registers it) or the **native** Claude Code tool.

-**Rule:** if any `mcp__*__AskUserQuestion` variant is in your tool list, prefer it. Hosts may disable native AUQ via `--disallowedTools AskUserQuestion` (Conductor does, by default) and route through their MCP variant; calling native there silently fails. Same questions/options shape; same decision-brief format applies.
+**Conductor rule (read before the MCP rule):** if `CONDUCTOR_SESSION: true` was echoed by the preamble, do NOT call AskUserQuestion at all — neither native nor any `mcp__*__AskUserQuestion` variant. Render EVERY decision brief as the **prose form** below and STOP. This is proactive, not a reaction to a failure: Conductor disables native AUQ and its MCP variant is flaky (it returns `[Tool result missing due to internal error]`), so prose is the reliable path. **Auto-decide preferences still apply first:** if a `[plan-tune auto-decide] <id> → <option>` result has already surfaced for a question, proceed with that option (no prose). Because in Conductor you go straight to prose without ever calling the tool, this auto-decide-first ordering is enforced HERE, not only by the PreToolUse hook. When you render a Conductor prose brief, also capture it with `bin/gstack-question-log` (the PostToolUse capture hook never fires on a prose path, so `/plan-tune` history/learning depends on this call).
+
+**Rule (non-Conductor):** if any `mcp__*__AskUserQuestion` variant is in your tool list, prefer it. Hosts may disable native AUQ via `--disallowedTools AskUserQuestion` (Conductor does, by default) and route through their MCP variant; calling native there silently fails. Same questions/options shape; same decision-brief format applies.

 If AskUserQuestion is unavailable (no variant in your tool list) OR a call to it fails, do NOT silently auto-decide or write the decision to the plan file as a substitute. Follow the **failure fallback** below.

@@ -312,7 +321,11 @@ Tell three outcomes apart:
 2. **Completeness scores per choice** — explicit `Completeness: X/10` on EACH choice (10 complete, 7 happy-path, 3 shortcut); use the kind-note when options differ in kind not coverage, but never silently drop the score.
 3. **The recommendation and why** — a `Recommendation: <choice> because <reason>` line plus the `(recommended)` marker on that choice.

-Layout: a `D<N>` title + a one-line note that AskUserQuestion failed and to reply with a letter; the issue ELI10; the Recommendation line; then ONE paragraph per choice carrying its `(recommended)` marker, its `Completeness: X/10`, and 2-4 sentences of reasoning — never a bare bullet list; a closing `Net:` line. Split chains / 5+ options: one prose block per per-option call, in sequence. Then STOP and wait — the user's typed answer is the decision. In plan mode this satisfies end-of-turn like a tool call.
+Layout: a `D<N>` title + a one-line note to reply with a letter (in Conductor this is the normal path; elsewhere it means AskUserQuestion was unavailable or errored); the issue ELI10; the Recommendation line; then ONE paragraph per choice carrying its `(recommended)` marker, its `Completeness: X/10`, and 2-4 sentences of reasoning — never a bare bullet list; a closing `Net:` line. Split chains / 5+ options: one prose block per per-option call, in sequence. Then STOP and wait — the user's typed answer is the decision. In plan mode this satisfies end-of-turn like a tool call.
+
+**Continuation — mapping a typed reply back to a brief.** Each brief carries a stable label (`D<N>`, or `D<N>.k` in a split chain). The user references it (e.g. "3.2: B"). A bare letter maps to the single most-recent UNANSWERED brief; if more than one is open (a split chain), do NOT guess — ask which `D<N>.k` it answers. Never apply a bare letter ambiguously across a chain.
+
+**One-way / destructive confirmations in prose.** When the decision is a one-way door (irreversible or destructive — delete, force-push, drop, overwrite), prose is a WEAKER gate than the tool, so make it stronger: require an explicit typed confirmation (the exact option letter or word), state plainly what is irreversible, and NEVER proceed on a vague, partial, or ambiguous reply — re-ask instead. Treat silence or "ok"/"sure" without the explicit choice as not-yet-confirmed.

 ### Format

@@ -396,7 +409,7 @@ Before calling AskUserQuestion, verify:
 - [ ] (recommended) label on one option (even for neutral-posture)
 - [ ] Dual-scale effort labels on effort-bearing options (human / CC)
 - [ ] Net line closes the decision
- [ ] You are calling the tool, not writing prose — unless the documented failure fallback applies (then: prose with the mandatory triad — issue ELI10, per-choice Completeness, Recommendation + `(recommended)` — and a "reply with a letter" instruction, then STOP)
+- [ ] You are calling the tool, not writing prose — unless `CONDUCTOR_SESSION: true` (then prose is the DEFAULT, not the tool) OR the documented failure fallback applies (then: prose with the mandatory triad — issue ELI10, per-choice Completeness, Recommendation + `(recommended)` — and a "reply with a letter" instruction, then STOP)
 - [ ] Non-ASCII characters (CJK / accents) written directly, NOT \u-escaped
 - [ ] If you had 5+ options, you split (or batched into ≤4-groups) — did NOT drop any
 - [ ] If you split, you checked dependencies between options before firing the chain
@@ -1279,6 +1292,22 @@ EVAL_JUDGE_TIER=full EVAL_VERBOSE=1 bin/test-lane --eval test/evals/<suite>_eval

 If multiple suites need to run, run them sequentially (each needs a test lane). If the first suite fails, stop immediately — don't burn API cost on remaining suites.

+**Long eval suites (30+ min): launch detached so a turn boundary can't kill them.**
+A plain backgrounded eval lives in the harness's process group and is killed by
+SIGTERM ("polite quit") on a turn boundary, a stopped monitor, or an interruption
+(observed mid-`/ship`: `script terminated by signal SIGTERM`). Run it through
+`$GSTACK_ROOT/bin/gstack-detach` instead — it survives in its own
+session, serializes against other worktrees via a machine lock (no API
+saturation), and writes a guaranteed `### gstack-detach EXIT=<code> ###` sentinel:
+
+```bash
+$GSTACK_ROOT/bin/gstack-detach --label ship-evals --lock gstack-evals --timeout 5400 -- <project eval command>
+```
+
+Then poll the printed log path; break on the `EXIT=` sentinel (covers both pass
+and crash — silence is never success). The detached run survives even if your
+poller is reaped.
+
 **4. Check results:**

 - **If any eval fails:** Show the failures, the cost dashboard, and **STOP**. Do not proceed.
@@ -2332,23 +2361,47 @@ For each comment in `comments`:

 Every diff gets adversarial review from both Claude and Codex. LOC is not a proxy for risk — a 5-line auth change can be critical.

-**Detect diff size and tool availability:**
+**Detect diff size:**

 ```bash
 DIFF_BASE=$(git merge-base origin/<base> HEAD)
 DIFF_INS=$(git diff "$DIFF_BASE" --stat | tail -1 | grep -oE '[0-9]+ insertion' | grep -oE '[0-9]+' || echo "0")
 DIFF_DEL=$(git diff "$DIFF_BASE" --stat | tail -1 | grep -oE '[0-9]+ deletion' | grep -oE '[0-9]+' || echo "0")
 DIFF_TOTAL=$((DIFF_INS + DIFF_DEL))
-command -v codex >/dev/null 2>&1 && echo "CODEX_AVAILABLE" || echo "CODEX_NOT_AVAILABLE"
-# Legacy opt-out — only gates Codex passes, Claude always runs
-OLD_CFG=$($GSTACK_ROOT/bin/gstack-config get codex_reviews 2>/dev/null || true)
 echo "DIFF_SIZE: $DIFF_TOTAL"
-echo "OLD_CFG: ${OLD_CFG:-not_set}"
 ```

-If `OLD_CFG` is `disabled`: skip Codex passes only. Claude adversarial subagent still runs (it's free and fast). Jump to the "Claude adversarial subagent" section.
+**Detect the Codex master switch + tool availability:**

-**User override:** If the user explicitly requested "full review", "structured review", or "P1 gate", also run the Codex structured review regardless of diff size.
+```bash
+# Codex preflight: one block (functions sourced here don't persist to later blocks).
+_TEL=$($GSTACK_ROOT/bin/gstack-config get telemetry 2>/dev/null || echo off)
+_CODEX_CFG=$($GSTACK_ROOT/bin/gstack-config get codex_reviews 2>/dev/null || echo enabled)
+source $GSTACK_ROOT/bin/gstack-codex-probe 2>/dev/null || true
+if [ "$_CODEX_CFG" = "disabled" ]; then
+  _CODEX_MODE="disabled"
+elif ! command -v codex >/dev/null 2>&1; then
+  _CODEX_MODE="not_installed"; _gstack_codex_log_event "codex_cli_missing" 2>/dev/null || true
+elif ! _gstack_codex_auth_probe >/dev/null 2>&1; then
+  _CODEX_MODE="not_authed"; _gstack_codex_log_event "codex_auth_failed" 2>/dev/null || true
+else
+  _CODEX_MODE="ready"; _gstack_codex_version_check 2>/dev/null || true
+fi
+echo "CODEX_MODE: $_CODEX_MODE"
+```
+
+Branch on the echoed `CODEX_MODE`:
+- **`disabled`** — the user turned Codex reviews off (`codex_reviews=disabled`). Skip the Codex passes only; the Claude adversarial subagent below STILL runs (it is free and fast). Print: "Codex passes skipped (codex_reviews disabled) — running Claude adversarial only."
+- **`not_installed`** — Codex CLI absent. Print: "Codex not installed — using Claude subagent. Install for cross-model coverage: `npm install -g @openai/codex`." Fall back to the Claude subagent path.
+- **`not_authed`** — installed but no credentials. Print: "Codex installed but not authenticated — using Claude subagent. Run `codex login` or set `$CODEX_API_KEY`." Fall back to the Claude subagent path.
+- **`ready`** — run the Codex pass below.
+
+For this diff-review path, `CODEX_MODE: disabled` means skip the Codex passes ONLY — the
+Claude adversarial subagent below still runs (it's free and fast). `ready` runs the Codex
+passes; `not_installed` / `not_authed` skip them with the printed note and continue with
+Claude only.
+
+**User override:** If the user explicitly requested "full review", "structured review", or "P1 gate", also run the Codex structured review regardless of diff size (still requires `CODEX_MODE: ready`).

 ---

@@ -2369,9 +2422,9 @@ If the subagent fails or times out: "Claude adversarial subagent unavailable. Co

 ---

-### Codex adversarial challenge (always runs when available)
+### Codex adversarial challenge (runs whenever `CODEX_MODE: ready`)

-If Codex is available AND `OLD_CFG` is NOT `disabled`:
+If `CODEX_MODE` is `ready`:

 ```bash
 TMPERR_ADV=$(mktemp /tmp/codex-adv-XXXXXXXX)
@@ -2393,13 +2446,13 @@ Present the full output verbatim. This is informational — it never blocks ship

 **Cleanup:** Run `rm -f "$TMPERR_ADV"` after processing.

-If Codex is NOT available: "Codex CLI not found — running Claude adversarial only. Install Codex for cross-model coverage: `npm install -g @openai/codex`"
+If `CODEX_MODE` is `not_installed` / `not_authed` / `disabled`: the preflight already printed the reason; run Claude adversarial only.

 ---

 ### Codex structured review (large diffs only, 200+ lines)

-If `DIFF_TOTAL >= 200` AND Codex is available AND `OLD_CFG` is NOT `disabled`:
+If `DIFF_TOTAL >= 200` AND `CODEX_MODE` is `ready`:

 ```bash
 TMPERR=$(mktemp /tmp/codex-review-XXXXXXXX)
@@ -16,7 +16,7 @@
 */

 import { describe, it, expect } from "bun:test";
-import { execFileSync } from "child_process";
+import { execFileSync, spawnSync } from "child_process";
 import {
  mkdtempSync,
  mkdirSync,
@@ -47,6 +47,16 @@ function runDetect(env: Partial<NodeJS.ProcessEnv>): string {
  });
 }

+/**
+ * Run detect with --is-ok and return its exit code (never throws).
+ *
+ * Spawns `BUN_BIN run DETECT_BIN --is-ok` synchronously with `env` layered
+ * over process.env, stdin ignored, stdout/stderr piped, and a 15s timeout.
+ *
+ * @param env Variables merged last over process.env for the child.
+ * @returns The child's exit status, or 1 when no status was reported.
+ */
+function runIsOk(env: Partial<NodeJS.ProcessEnv>): number {
+  const r = spawnSync(BUN_BIN, ["run", DETECT_BIN, "--is-ok"], {
+    timeout: 15_000,
+    stdio: ["ignore", "pipe", "pipe"],
+    env: { ...process.env, ...env },
+  });
+  return r.status ?? 1; // a null status (e.g. child ended by a signal) is reported as failure
+}
+
 interface DetectShape {
  gbrain_on_path: boolean;
  gbrain_version: string | null;
@@ -244,3 +254,66 @@ exit 0
    }
  });
 });
+
+describe("bin/gstack-gbrain-detect --is-ok — live gate", () => {
+  it("exits non-zero when gbrain is not on PATH (no-cli)", () => {
+    const tmp = mkdtempSync(join(tmpdir(), "detect-isok-"));
+    try {
+      const code = runIsOk({
+        HOME: tmp, // the same empty temp dir serves as HOME and GSTACK_HOME
+        PATH: "/usr/bin:/bin", // no gbrain
+        GSTACK_HOME: tmp,
+        GSTACK_DETECT_NO_CACHE: "1",
+      });
+      expect(code).not.toBe(0);
+    } finally {
+      rmSync(tmp, { recursive: true, force: true });
+    }
+  });
+
+  it("exits 0 when a fake gbrain reports a healthy engine (ok)", () => {
+    const tmp = mkdtempSync(join(tmpdir(), "detect-isok-"));
+    const bindir = join(tmp, "bin");
+    const home = join(tmp, "home");
+    const configDir = join(home, ".gbrain");
+    try {
+      mkdirSync(bindir, { recursive: true });
+      mkdirSync(configDir, { recursive: true });
+      writeFileSync(join(configDir, "config.json"), JSON.stringify({ engine: "pglite" })); // minimal config under the fake HOME
+      const fake = `#!/bin/sh
+case "$1 $2" in
+  "--version ")        echo "gbrain 0.33.1.0"; exit 0 ;;
+  "sources list")      echo '{"sources":[]}'; exit 0 ;;
+  "doctor "*)          echo '{"status":"ok","checks":[]}'; exit 0 ;;
+esac
+exit 0
+`;
+      const gbrainPath = join(bindir, "gbrain");
+      writeFileSync(gbrainPath, fake);
+      chmodSync(gbrainPath, 0o755); // must be executable to be run as `gbrain` from PATH
+
+      const code = runIsOk({
+        HOME: home,
+        PATH: `${bindir}:/usr/bin:/bin`, // fake gbrain dir is searched first
+        GSTACK_HOME: tmp,
+        GSTACK_DETECT_NO_CACHE: "1",
+      });
+      expect(code).toBe(0);
+    } finally {
+      rmSync(tmp, { recursive: true, force: true });
+    }
+  });
+
+  it("exit code agrees with the JSON gbrain_local_status (no skew)", () => {
+    // Run both surfaces against the same env and assert they never disagree.
+    const tmp = mkdtempSync(join(tmpdir(), "detect-isok-"));
+    try {
+      const env = { HOME: tmp, PATH: "/usr/bin:/bin", GSTACK_HOME: tmp, GSTACK_DETECT_NO_CACHE: "1" };
+      const status = (JSON.parse(runDetect(env)) as DetectShape).gbrain_local_status;
+      const code = runIsOk(env);
+      expect(code === 0).toBe(status === "ok"); // passes when both are true or both are false
+    } finally {
+      rmSync(tmp, { recursive: true, force: true });
+    }
+  });
+});
@@ -0,0 +1,60 @@
+import { describe, test, expect } from 'bun:test';
+import * as path from 'path';
+import * as fs from 'fs';
+
+// Static tripwires for the C (machine-wide) render in `gstack-config
+// gbrain-refresh`. The render mutates the shared global install, so the guards
+// that stop it from touching the wrong directory are load-bearing — these fail
+// CI if any guard is dropped.
+const ROOT = path.resolve(import.meta.dir, '..');
+const SRC = fs.readFileSync(path.join(ROOT, 'bin', 'gstack-config'), 'utf-8');
+
+// Pull out just the gbrain-refresh `ok)` branch so assertions can't be
+// satisfied by unrelated text elsewhere in the file. Returns the text from the
+// first `ok)` after the `gbrain-refresh)` marker up to (not including) the next
+// `;;`; throws if any of the three anchors is missing.
+function okBranch(): string {
+  const start = SRC.indexOf('gbrain-refresh)'); // first occurrence of the subcommand marker
+  const ok = SRC.indexOf('ok)', start); // first `ok)` at or after that marker
+  const end = SRC.indexOf(';;', ok); // first `;;` at or after the `ok)`
+  if (start < 0 || ok < 0 || end < 0) throw new Error('Could not locate gbrain-refresh ok) branch'); // checked after all lookups: a -1 fromIndex just restarts the search at 0
+  return SRC.slice(ok, end);
+}
+
+describe('gstack-config gbrain-refresh: machine-wide render guards', () => {
+  const branch = okBranch(); // extracted once when the describe body runs; throws if the branch can't be located
+
+  test('targets the global install', () => {
+    expect(branch).toContain('$HOME/.claude/skills/gstack');
+  });
+
+  test('refuses a symlinked install (would dirty a dev worktree)', () => {
+    expect(branch).toMatch(/\[ -L "\$INSTALL_DIR" \]/); // matches the literal text `[ -L "$INSTALL_DIR" ]`
+  });
+
+  test('verifies it is a real gstack clone before mutating it', () => {
+    expect(branch).toContain('$INSTALL_DIR/VERSION');
+    expect(branch).toContain('$INSTALL_DIR/package.json');
+  });
+
+  test('requires bun on PATH', () => {
+    expect(branch).toContain('command -v bun');
+  });
+
+  test('renders the :user variant in place into the install', () => {
+    expect(branch).toContain('gen:skill-docs:user --host claude');
+  });
+
+  test('is self-documenting about the reset --hard / re-run cycle', () => {
+    expect(branch).toContain('reset --hard');
+    expect(branch).toContain('gbrain-refresh'); // the branch text itself must mention the command to re-run
+  });
+});
+
+describe('CLAUDE.md: deploy section documents the re-run', () => {
+  test('notes re-running gbrain-refresh after reset --hard', () => {
+    const claudeMd = fs.readFileSync(path.join(ROOT, 'CLAUDE.md'), 'utf-8');
+    const idx = claudeMd.indexOf('## Deploying to the active skill');
+    expect(idx).toBeGreaterThan(-1); // the heading must exist
+    const section = claudeMd.slice(idx, idx + 1200); // only the 1200 characters starting at the heading are searched
+    expect(section).toContain('gbrain-refresh');
+  });
+});
@@ -0,0 +1,84 @@
+import { describe, test, expect } from 'bun:test';
+import { spawnSync } from 'child_process';
+import { createHash } from 'crypto';
+import * as path from 'path';
+import * as fs from 'fs';
+import * as os from 'os';
+
+const ROOT = path.resolve(import.meta.dir, '..');
+
+// Render the gbrain `:user` variant into a temp out-dir, forcing detection ON
+// via a crafted GSTACK_HOME so the test is deterministic regardless of whether
+// the dev machine actually has gbrain installed. Asserts the B2 contract:
+//   (a) the worktree SKILL.md is byte-unchanged (source stays canonical),
+//   (b) the out-dir SKILL.md gained the inline Brain Context Load block,
+//   (c) its section refs point at the out-dir, not ~/.claude/skills/gstack,
+//   (d) bin/ refs are left pointing at the global install,
+//   (e) the out-dir section file gained the Save Results to Brain block.
+describe('gen-skill-docs --out-dir (B2 render isolation)', () => {
+  function hashFile(p: string): string { // hex SHA-256 of the file's bytes
+    return createHash('sha256').update(fs.readFileSync(p)).digest('hex');
+  }
+
+  test('renders :user to out-dir, rewrites section paths, leaves worktree canonical', () => {
+    const tmpHome = fs.mkdtempSync(path.join(os.tmpdir(), 'gstack-home-'));
+    const outDir = fs.mkdtempSync(path.join(os.tmpdir(), 'gstack-out-'));
+    const worktreeSkill = path.join(ROOT, 'ship', 'SKILL.md');
+    const beforeHash = hashFile(worktreeSkill); // snapshot taken before rendering; compared again in (a)
+    try {
+      // Force gbrain detection ON for --respect-detection.
+      fs.writeFileSync(
+        path.join(tmpHome, 'gbrain-detection.json'),
+        JSON.stringify({ gbrain_local_status: 'ok', gbrain_version: '9.9.9' }),
+      );
+
+      const res = spawnSync(
+        'bun',
+        ['run', 'scripts/gen-skill-docs.ts', '--respect-detection', '--host', 'claude', '--out-dir', outDir],
+        { cwd: ROOT, encoding: 'utf-8', timeout: 120_000, env: { ...process.env, GSTACK_HOME: tmpHome } }, // GSTACK_HOME points at the crafted detection file
+      );
+      expect(res.status).toBe(0);
+
+      const outSkill = path.join(outDir, 'ship', 'SKILL.md');
+      const outSection = path.join(outDir, 'ship', 'sections', 'adversarial.md');
+      expect(fs.existsSync(outSkill)).toBe(true);
+      const skillContent = fs.readFileSync(outSkill, 'utf-8');
+
+      // (a) worktree byte-unchanged
+      expect(hashFile(worktreeSkill)).toBe(beforeHash);
+
+      // (b) inline block present in the rendered SKILL.md
+      expect(skillContent).toContain('Brain Context Load');
+
+      // (c) section refs repointed to the out-dir; none left pointing at the install
+      expect(skillContent).toContain(`${outDir}/ship/sections/`);
+      expect(skillContent).not.toContain('~/.claude/skills/gstack/ship/sections/');
+
+      // (d) bin refs are NOT rewritten — they still resolve to the global install
+      expect(skillContent).toContain('~/.claude/skills/gstack/bin/');
+
+      // (e) the SAVE block landed in the rendered section file
+      expect(fs.existsSync(outSection)).toBe(true);
+      expect(fs.readFileSync(outSection, 'utf-8')).toContain('Save Results to Brain');
+    } finally {
+      fs.rmSync(tmpHome, { recursive: true, force: true });
+      fs.rmSync(outDir, { recursive: true, force: true });
+    }
+  });
+
+  test('global extras (proactive-suggestions.json) are NOT written in out-dir mode', () => {
+    const outDir = fs.mkdtempSync(path.join(os.tmpdir(), 'gstack-out-'));
+    try {
+      const res = spawnSync(
+        'bun',
+        ['run', 'scripts/gen-skill-docs.ts', '--host', 'claude', '--out-dir', outDir],
+        { cwd: ROOT, encoding: 'utf-8', timeout: 120_000 }, // inherits the ambient env; no --respect-detection here
+      );
+      expect(res.status).toBe(0);
+      // proactive-suggestions.json lives at a repo path; out-dir mode must skip it.
+      expect(fs.existsSync(path.join(outDir, 'scripts', 'proactive-suggestions.json'))).toBe(false);
+    } finally {
+      fs.rmSync(outDir, { recursive: true, force: true });
+    }
+  });
+});
@@ -0,0 +1,96 @@
+/**
+ * gstack-detach — the eval-infra robustness guard. Pins the four killer fixes:
+ *   1. SIGTERM-proof detachment (runs in a different process group, outlives the launcher)
+ *   2. run-scoped default log path (no shared-/tmp collision between worktrees)
+ *   3. watchdog --timeout (no silent hang) + guaranteed EXIT sentinel
+ *   4. machine-wide --lock serialization (no cross-worktree API saturation)
+ */
+import { describe, test, expect } from 'bun:test';
+import { spawnSync, spawn } from 'child_process';
+import * as fs from 'fs';
+import * as os from 'os';
+import * as path from 'path';
+
+const ROOT = path.resolve(import.meta.dir, '..');
+const DETACH = path.join(ROOT, 'bin', 'gstack-detach');
+
+function ownPgid(): string { // process-group id of this test process as printed by `ps`, or '' when ps produced no output
+  return (spawnSync('ps', ['-o', 'pgid=', '-p', String(process.pid)], { encoding: 'utf-8' }).stdout || '').trim(); // `pgid=` requests the column with an empty header
+}
+function waitFor(pred: () => boolean, ms: number): boolean { // synchronously polls pred until it returns true or ms milliseconds elapse
+  const end = Date.now() + ms;
+  while (Date.now() < end) {
+    if (pred()) return true;
+    spawnSync('sleep', ['0.2']); // blocking pause between polls
+  }
+  return pred(); // one last check once the deadline has passed
+}
+function logHas(p: string, needle: string): boolean { // true when the file at p can be read and contains needle
+  try { return fs.readFileSync(p, 'utf-8').includes(needle); } catch { return false; } // any read error (e.g. file not created yet) counts as "not found"
+}
+
+describe('gstack-detach', () => {
+  test('detaches (different pgid), returns immediately, completes, writes EXIT sentinel', () => {
+    const dir = fs.mkdtempSync(path.join(os.tmpdir(), 'gd-'));
+    const log = path.join(dir, 'run.log');
+    try {
+      const t0 = Date.now();
+      const r = spawnSync(DETACH, ['--log', log, '--', 'bash', '-c', 'sleep 2; echo body-ran'], { encoding: 'utf-8', timeout: 10000 });
+      const elapsed = Date.now() - t0; // wall time of the launcher call only
+      expect(r.status).toBe(0);
+      expect(r.stdout).toContain(`gstack-detach LOG ${log}`);
+      expect(elapsed).toBeLessThan(1500);                         // non-blocking
+      expect(waitFor(() => logHas(log, '### gstack-detach EXIT=0 ###'), 8000)).toBe(true);
+      expect(logHas(log, 'body-ran')).toBe(true);                 // ran to completion after launcher returned
+      const m = fs.readFileSync(log, 'utf-8').match(/pgid=(\d+)/); // first `pgid=<digits>` found in the log
+      expect(m).not.toBeNull();
+      expect(m![1]).not.toBe(ownPgid());                          // detached into its own group
+    } finally { fs.rmSync(dir, { recursive: true, force: true }); }
+  }, 15000);
+
+  test('default log is run-scoped under ~/.gstack-dev/eval-runs (no shared /tmp)', () => {
+    const r = spawnSync(DETACH, ['--label', 'unittest', '--', 'true'], { encoding: 'utf-8', timeout: 10000 }); // no --log, so the default path is used
+    const log = (r.stdout.match(/gstack-detach LOG (\S+)/) || [])[1]; // undefined when the LOG line is absent
+    try {
+      expect(log).toContain('/.gstack-dev/eval-runs/');
+      expect(path.basename(log)).toContain('unittest-');
+      expect(path.basename(log)).toMatch(/-\d+\.log$/);            // pid-unique
+      waitFor(() => logHas(log, '### gstack-detach EXIT=0 ###'), 6000); // NOTE(review): result not asserted — presumably only waits for the run to finish before the log is removed
+    } finally { if (log) fs.rmSync(log, { force: true }); }
+  }, 12000);
+
+  test('watchdog kills a stalled run and records EXIT=timeout (no silent hang)', () => {
+    const dir = fs.mkdtempSync(path.join(os.tmpdir(), 'gd-'));
+    const log = path.join(dir, 'run.log');
+    try {
+      spawnSync(DETACH, ['--log', log, '--timeout', '1', '--', 'sleep', '60'], { encoding: 'utf-8', timeout: 10000 }); // launcher result unused; the assertions read the log
+      expect(waitFor(() => logHas(log, '### gstack-detach EXIT=timeout ###'), 12000)).toBe(true);
+      expect(logHas(log, 'WATCHDOG fired')).toBe(true);
+    } finally { fs.rmSync(dir, { recursive: true, force: true }); }
+  }, 16000);
+
+  test('machine --lock serializes concurrent runs (second WAITS for the first)', () => {
+    const dir = fs.mkdtempSync(path.join(os.tmpdir(), 'gd-'));
+    const lock = `gstack-detach-test-${process.pid}`; // lock name is unique to this test process
+    const logA = path.join(dir, 'a.log');
+    const logB = path.join(dir, 'b.log');
+    try {
+      // First holds the lock for ~3s; second must wait then acquire.
+      spawnSync(DETACH, ['--log', logA, '--lock', lock, '--', 'sleep', '3'], { encoding: 'utf-8', timeout: 10000 });
+      waitFor(() => logHas(logA, "ACQUIRED"), 4000); // NOTE(review): result not asserted — if the first run never logs ACQUIRED the test still proceeds
+      spawnSync(DETACH, ['--log', logB, '--lock', lock, '--', 'echo', 'second-ran'], { encoding: 'utf-8', timeout: 10000 });
+      // Second should report WAITING (first still holds it) then ACQUIRE after release.
+      expect(waitFor(() => logHas(logB, 'WAITING for lock'), 4000)).toBe(true);
+      expect(waitFor(() => logHas(logB, '### gstack-detach EXIT=0 ###'), 12000)).toBe(true);
+      expect(logHas(logB, 'second-ran')).toBe(true);
+    } finally {
+      fs.rmSync(dir, { recursive: true, force: true });
+      fs.rmSync(path.join(os.homedir(), '.gstack', 'locks', `${lock}.lock`), { force: true }); // this path is outside dir, so it is removed separately
+    }
+  }, 20000);
+
+  test('rejects missing command (exit 2)', () => {
+    const r = spawnSync(DETACH, ['--label', 'x'], { encoding: 'utf-8' }); // no `--` and no command
+    expect(r.status).toBe(2);
+  });
+});
@@ -36,6 +36,7 @@ import {
 import * as fs from 'fs';
 import * as path from 'path';
 import { resolveClaudeBinary as resolveClaudeBinaryShared } from '../../browse/src/claude-bin';
+import { hermeticChildEnv } from './hermetic-env';
 import type { SkillTestResult } from './session-runner';

 // ---------------------------------------------------------------------------
@@ -300,12 +301,17 @@ export async function runAgentSdkTest(
  const queryImpl: QueryProvider = opts.queryProvider ?? query;
  const model = opts.model ?? 'claude-opus-4-7';

-  // NOTE on GSTACK_HEADLESS: the SDK child inherits process.env, so headless
-  // classification for eval/E2E runs is set by the `test:gate` / `test:evals`
-  // package.json scripts (scoped to that invocation), NOT mutated here. We must not
-  // pass sdkOpts.env (it breaks the SDK auth pipeline — see CLAUDE.md) and must not
-  // mutate process.env ambiently (it would leak headless into later interactive-path
-  // tests in the same Bun process — Codex review finding).
+  // NOTE on env: the SDK child gets the COMPLETE hermetic env (allowlist
+  // scrub + ANTHROPIC_API_KEY + hermetic CLAUDE_CONFIG_DIR/GSTACK_HOME), with
+  // per-test opts.env merging last. The historical "passing env: breaks SDK
+  // auth" failure (old CLAUDE.md warning) was partial-env replacement —
+  // Options.env REPLACES the child's entire environment, so an object without
+  // the key killed auth. A complete env is safe (validated 2026-06-12 via
+  // query() with hermeticChildEnv(): success, real cost, Bash tool working).
+  // Do not mutate process.env ambiently here (it would leak into later
+  // interactive-path tests in the same Bun process — Codex review finding);
+  // ambient ANTHROPIC_API_KEY mutation by tests still works because the
+  // builder reads process.env at call time.

  let attempt = 0;
  let lastErr: unknown = null;
@@ -356,7 +362,7 @@ export async function runAgentSdkTest(
        permissionMode: resolvedPermissionMode,
        allowDangerouslySkipPermissions: resolvedPermissionMode === 'bypassPermissions',
        settingSources: opts.settingSources ?? [],
-        env: opts.env,
+        env: hermeticChildEnv(opts.env),
        pathToClaudeCodeExecutable: opts.pathToClaudeCodeExecutable,
        ...(hasCanUseTool ? { canUseTool: opts.canUseTool } : {}),
      };
@@ -145,6 +145,9 @@ export const CARVE_GUARDS: Record<string, CarveGuard> = {
    maxSkeletonBytes: 90_000,
    minUnionBytes: 80_000,
    mustContain: ['SCOPE EXPANSION', 'SELECTIVE EXPANSION', 'HOLD SCOPE', 'SCOPE REDUCTION'],
+    // Default-on Codex outside-voice (codexPreflight block + CODEX_MODE branch
+    // prose replacing the smaller opt-in question) lands this ~5.2% over baseline.
+    maxSizeRatio: 1.08,
  },
  'plan-eng-review': {
    skill: 'plan-eng-review',
@@ -162,9 +165,11 @@ export const CARVE_GUARDS: Record<string, CarveGuard> = {
    minUnionBytes: 70_000,
    mustContain: ['Architecture', 'Code Quality', 'Test', 'Performance'],
    // Cross-cutting preamble growth (v1.57.2.0 AUQ-failure prose fallback + the
-    // decision-memory nudge + the v1.57.4.0 Boil-the-Ocean rename) lands this just
-    // over the strict 1.05; small headroom for the shared preamble additions.
-    maxSizeRatio: 1.06,
+    // decision-memory nudge + the v1.57.4.0 Boil-the-Ocean rename) plus the
+    // default-on Codex outside-voice (codexPreflight block + CODEX_MODE branch
+    // prose, replacing the smaller opt-in question) land this at ~6.6% over the
+    // v1.53.0.0 baseline. Headroom for those intentional additions.
+    maxSizeRatio: 1.08,
  },
  'plan-design-review': {
    skill: 'plan-design-review',
@@ -178,7 +183,9 @@ export const CARVE_GUARDS: Record<string, CarveGuard> = {
      gateAfterStop: 'EXIT PLAN MODE GATE',
    },
    behavioral: 'plan',
-    maxSkeletonBytes: 82_000,
+    // +Conductor AUQ-default-prose rule + one-way/continuation safety in the
+    // always-loaded AskUserQuestion Format section.
+    maxSkeletonBytes: 84_000,
    minUnionBytes: 70_000,
    mustContain: ['design', 'visual'],
  },
@@ -194,9 +201,14 @@ export const CARVE_GUARDS: Record<string, CarveGuard> = {
      gateAfterStop: 'EXIT PLAN MODE GATE',
    },
    behavioral: 'plan',
-    maxSkeletonBytes: 76_000,
+    // +Conductor AUQ-default-prose rule + one-way/destructive prose safety +
+    // continuation protocol in the always-loaded AskUserQuestion Format section.
+    maxSkeletonBytes: 78_000,
    minUnionBytes: 70_000,
    mustContain: ['developer experience', 'Getting Started'],
+    // Default-on Codex outside-voice (codexPreflight block + CODEX_MODE branch
+    // prose replacing the smaller opt-in question) lands this ~5.7% over baseline.
+    maxSizeRatio: 1.08,
  },
  'office-hours': {
    skill: 'office-hours',
@@ -229,14 +241,20 @@ export const CARVE_GUARDS: Record<string, CarveGuard> = {
      gateAfterStop: undefined,
    },
    behavioral: 'prompt',
-    maxSkeletonBytes: 50_000,
+    // +Conductor AUQ-default-prose rule + one-way/continuation safety in the
+    // always-loaded AskUserQuestion Format section.
+    maxSkeletonBytes: 53_000,
    minUnionBytes: 55_000,
    mustContain: ['CHANGELOG', 'Diataxis', 'coverage'],
-    // The AUQ-failure prose fallback (v1.57.2.0) adds ~2KB to every skill's
-    // always-loaded preamble; on this small carved skeleton that lands at ~5.9%
-    // over the pre-carve/pre-AUQ v1.53.0.0 baseline. Headroom for the
-    // cross-cutting addition; all other skills keep the strict 1.05 ceiling.
-    maxSizeRatio: 1.08,
+    // Two intentional additions stack on this small skill: the AUQ-failure prose
+    // fallback (v1.57.2.0, ~2KB to every preamble) AND the new default-on Codex
+    // documentation-review section (codexPreflight + prompt + apply-gate, carved
+    // into release-body so the SKELETON stays under maxSkeletonBytes). On a ~55KB
+    // baseline that whole new capability is ~18.6% of union bytes. The doc review
+    // is a deliberate new feature, not preamble creep; the union ceiling is raised
+    // to match while the skeleton budget (maxSkeletonBytes, 53_000 above) still
+    // caps the always-loaded cost.
+    maxSizeRatio: 1.20,
  },
  'design-consultation': {
    skill: 'design-consultation',
@@ -250,7 +268,9 @@ export const CARVE_GUARDS: Record<string, CarveGuard> = {
      gateAfterStop: undefined,
    },
    behavioral: 'prompt',
-    maxSkeletonBytes: 64_000,
+    // +Conductor AUQ-default-prose rule + one-way/continuation safety in the
+    // always-loaded AskUserQuestion Format section.
+    maxSkeletonBytes: 67_000,
    minUnionBytes: 72_000,
    mustContain: ['Typography', 'Color', 'Aesthetic Direction'],
    // Cross-cutting preamble growth (v1.57.2.0 AUQ-failure prose fallback ~2KB +
@@ -286,7 +306,9 @@ export const CARVE_GUARDS: Record<string, CarveGuard> = {
      gateAfterStop: undefined,
    },
    behavioral: 'prompt',
-    maxSkeletonBytes: 70_000,
+    // +Conductor AUQ-default-prose rule + one-way/continuation safety in the
+    // always-loaded AskUserQuestion Format section.
+    maxSkeletonBytes: 73_000,
    minUnionBytes: 72_000,
    mustContain: ['OWASP', 'STRIDE', 'daily', 'comprehensive', 'verif'],
    // cso keeps its mode-dispatch + FP-filtering phases always-loaded, so the
@@ -24,6 +24,7 @@
 import * as fs from 'fs';
 import * as os from 'os';
 import * as path from 'path';
+import { hermeticChildEnv, isHermeticEnabled } from './hermetic-env';

 /** Strip ANSI escapes for pattern-matching against visible text. */
 export function stripAnsi(s: string): string {
@@ -120,6 +121,13 @@ export interface ClaudePtySession {
  exited(): boolean;
  /** Exit code, if known. */
  exitCode(): number | null;
+  /**
+   * The hermetic CLAUDE_CONFIG_DIR this session's claude was pointed at, or
+   * null when EVALS_HERMETIC=0. Forensics: hermetic plan files live under
+   * `<hermeticConfigDir>/plans/` (extractPlanFilePath still matches them —
+   * the dir name ends in `/.claude` by contract).
+   */
+  hermeticConfigDir: string | null;
  /**
   * Send SIGINT, then SIGKILL after 1s. Always safe to call multiple times.
   * Awaits process exit before resolving.
@@ -1143,8 +1151,17 @@ export async function launchClaudePty(
  if (permissionMode !== null) {
    args.push('--permission-mode', permissionMode);
  }
+  // Hermetic children get zero MCP servers; gated on the same call-time
+  // check as the env scrub so EVALS_HERMETIC=0 restores operator MCP too.
+  // Before opts.extraArgs so a test could theoretically supply --mcp-config.
+  const hermetic = isHermeticEnabled();
+  if (hermetic) args.push('--strict-mcp-config');
  if (opts.extraArgs) args.push(...opts.extraArgs);

+  // Hermetic by default (test/helpers/hermetic-env.ts): operator session
+  // context never reaches the child; per-test opts.env merges last.
+  const childEnv = hermeticChildEnv(opts.env);
+
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  const proc = (Bun as any).spawn([claudePath, ...args], {
    terminal: {
@@ -1155,7 +1172,7 @@ export async function launchClaudePty(
      },
    },
    cwd,
-    env: { ...process.env, ...(opts.env ?? {}) },
+    env: childEnv,
  });

  // Track exit so waitForAny can fail fast if claude crashes.
@@ -1307,6 +1324,7 @@ export async function launchClaudePty(
    pid: () => proc.pid as number | undefined,
    exited: () => exited,
    exitCode: () => exitCodeCaptured,
+    hermeticConfigDir: hermetic ? childEnv.CLAUDE_CONFIG_DIR ?? null : null,
    close,
  };
 }
@@ -15,6 +15,7 @@
 import * as fs from 'fs';
 import * as path from 'path';
 import * as os from 'os';
+import { hermeticChildEnv } from './hermetic-env';

 // --- Interfaces ---

@@ -201,15 +202,18 @@ export async function runCodexSkill(opts: {
    // Build codex exec command
    const args = ['exec', prompt, '--json', '-s', sandbox];

-    // Spawn codex with temp HOME so it discovers our installed skill
+    // Spawn codex with temp HOME so it discovers our installed skill.
+    // Hermetic scrub (test/helpers/hermetic-env.ts) with codex's auth surface
+    // re-admitted: codex auths from $HOME/.codex (copied into tempHome above)
+    // plus OPENAI_API_KEY/CODEX_* when present. HOME override merges last.
    const proc = Bun.spawn(['codex', ...args], {
      cwd: cwd || skillDir,
      stdout: 'pipe',
      stderr: 'pipe',
-      env: {
-        ...process.env,
-        HOME: tempHome,
-      },
+      env: hermeticChildEnv(
+        { HOME: tempHome },
+        { extraAllow: ['OPENAI_API_KEY', 'CODEX_*'] },
+      ),
    });

    // Race against timeout
@@ -14,6 +14,7 @@
 */

 import * as path from 'path';
+import { hermeticChildEnv } from './hermetic-env';

 // --- Interfaces ---

@@ -122,11 +123,16 @@ export async function runGeminiSkill(opts: {
  // Build gemini command
  const args = ['-p', prompt, '--output-format', 'stream-json', '--yolo'];

-  // Spawn gemini — uses real HOME for auth, cwd for skill discovery
+  // Spawn gemini — uses real HOME for auth (~/.gemini; HOME is allowlisted),
+  // cwd for skill discovery. Hermetic scrub with gemini's auth surface
+  // re-admitted (previously this spawn inherited the full operator env).
  const proc = Bun.spawn(['gemini', ...args], {
    cwd: cwd || process.cwd(),
    stdout: 'pipe',
    stderr: 'pipe',
+    env: hermeticChildEnv(undefined, {
+      extraAllow: ['GEMINI_API_KEY', 'GOOGLE_API_KEY', 'GOOGLE_APPLICATION_CREDENTIALS', 'GOOGLE_CLOUD_*', 'GEMINI_*'],
+    }),
  });

  // Race against timeout
@@ -0,0 +1,269 @@
+/**
+ * Unit tests for the hermetic child-env builder. Free tier — no API calls.
+ *
+ * Pins three contracts:
+ * 1. Allowlist semantics: contamination vars dropped, basics/auth/network
+ *    kept, overrides merge last, EVALS_HERMETIC=0 is byte-identical legacy.
+ * 2. Seed-config shape: 20-char key suffix, trusted dirs, undefined-key safe.
+ * 3. Dir lifecycle: /.claude suffix (extractPlanFilePath contract —
+ *    claude-pty-runner.ts:191), sync singleton reuse, pid-aware GC.
+ */
+
+import { describe, test, expect, afterAll } from 'bun:test';
+import * as fs from 'fs';
+import * as path from 'path';
+import * as os from 'os';
+import {
+  buildHermeticEnv,
+  buildSeedConfig,
+  isHermeticEnabled,
+  getHermeticDirs,
+  gcStaleHermeticDirs,
+  hermeticChildEnv,
+} from './hermetic-env';
+
+const CONTAMINATED: NodeJS.ProcessEnv = {
+  PATH: '/usr/bin', HOME: '/Users/op', TMPDIR: '/tmp', TERM: 'xterm',
+  ANTHROPIC_API_KEY: 'sk-ant-0123456789abcdefghijklmn',
+  ANTHROPIC_BASE_URL: 'https://proxy.example/api',
+  ANTHROPIC_MODEL: 'sneaky-model-override',
+  EVALS_MODEL: 'claude-sonnet-4-6',
+  GITHUB_ACTIONS: 'true',
+  HTTPS_PROXY: 'http://corp:3128',
+  NODE_EXTRA_CA_CERTS: '/etc/corp.pem',
+  CONDUCTOR_WORKSPACE_PATH: '/Users/op/conductor/ws',
+  CONDUCTOR_SESSION: '1',
+  CLAUDECODE: '1',
+  CLAUDE_CODE_ENTRYPOINT: 'cli',
+  CLAUDE_CONFIG_DIR: '/Users/op/.claude',
+  GSTACK_HOME: '/Users/op/.gstack',
+  GSTACK_HEADLESS_DEFAULT: 'x',
+  MCP_TIMEOUT: '5000',
+  GBRAIN_ENDPOINT: 'http://localhost:1234',
+  OPENAI_API_KEY: 'sk-openai-secret',
+  VOYAGE_API_KEY: 'vg-secret',
+  GH_TOKEN: 'gho_secret',
+  SSH_AUTH_SOCK: '/tmp/ssh.sock',
+  GIT_AUTHOR_NAME: 'Op',
+};
+
+const HERMETIC_VARS = { CLAUDE_CONFIG_DIR: '/x/.claude', GSTACK_HOME: '/x/gstack-home' };
+
+describe('buildHermeticEnv allowlist', () => {
+  const env = buildHermeticEnv(CONTAMINATED, HERMETIC_VARS);
+
+  test('keeps process basics, network, CI, and eval knobs', () => {
+    expect(env.PATH).toBe('/usr/bin');
+    expect(env.HOME).toBe('/Users/op');
+    expect(env.EVALS_MODEL).toBe('claude-sonnet-4-6');
+    expect(env.GITHUB_ACTIONS).toBe('true');
+    expect(env.HTTPS_PROXY).toBe('http://corp:3128');
+    expect(env.NODE_EXTRA_CA_CERTS).toBe('/etc/corp.pem');
+  });
+
+  test('keeps named auth vars but not the broad ANTHROPIC_ prefix', () => {
+    expect(env.ANTHROPIC_API_KEY).toBe(CONTAMINATED.ANTHROPIC_API_KEY);
+    expect(env.ANTHROPIC_BASE_URL).toBe(CONTAMINATED.ANTHROPIC_BASE_URL);
+    expect(env.ANTHROPIC_MODEL).toBeUndefined(); // behavior knob, not auth
+  });
+
+  test('drops session-context and operator-credential vars', () => {
+    for (const k of [
+      'CONDUCTOR_WORKSPACE_PATH', 'CONDUCTOR_SESSION', 'CLAUDECODE',
+      'CLAUDE_CODE_ENTRYPOINT', 'GSTACK_HEADLESS_DEFAULT', 'MCP_TIMEOUT',
+      'GBRAIN_ENDPOINT', 'OPENAI_API_KEY', 'VOYAGE_API_KEY', 'GH_TOKEN',
+      'SSH_AUTH_SOCK', 'GIT_AUTHOR_NAME',
+    ]) {
+      expect(env[k]).toBeUndefined();
+    }
+  });
+
+  test('redirects CLAUDE_CONFIG_DIR and GSTACK_HOME to hermetic values', () => {
+    expect(env.CLAUDE_CONFIG_DIR).toBe('/x/.claude');
+    expect(env.GSTACK_HOME).toBe('/x/gstack-home');
+  });
+
+  test('overrides merge last — per-test re-contamination is deliberate', () => {
+    const e = buildHermeticEnv(CONTAMINATED, HERMETIC_VARS, {
+      CONDUCTOR_WORKSPACE_PATH: '/tmp/test-ws',
+      GSTACK_HOME: '/tmp/test-home',
+      GSTACK_HEADLESS: '',
+    });
+    expect(e.CONDUCTOR_WORKSPACE_PATH).toBe('/tmp/test-ws');
+    expect(e.GSTACK_HOME).toBe('/tmp/test-home');
+    expect(e.GSTACK_HEADLESS).toBe('');
+  });
+
+  test('promotes GSTACK_ANTHROPIC_API_KEY when canonical absent (shared shim fn)', () => {
+    const base = { ...CONTAMINATED } as NodeJS.ProcessEnv;
+    delete base.ANTHROPIC_API_KEY;
+    base.GSTACK_ANTHROPIC_API_KEY = 'sk-ant-promoted-9876543210';
+    const e = buildHermeticEnv(base, HERMETIC_VARS);
+    expect(e.ANTHROPIC_API_KEY).toBe('sk-ant-promoted-9876543210');
+    expect(e.GSTACK_ANTHROPIC_API_KEY).toBeUndefined(); // GSTACK_* still dropped
+  });
+
+  test('extraAllow re-admits exact names and prefixes per runner', () => {
+    const e = buildHermeticEnv(CONTAMINATED, HERMETIC_VARS, undefined, {
+      extraAllow: ['OPENAI_API_KEY', 'GIT_*'],
+    });
+    expect(e.OPENAI_API_KEY).toBe('sk-openai-secret');
+    expect(e.GIT_AUTHOR_NAME).toBe('Op');
+    expect(e.GH_TOKEN).toBeUndefined(); // not in extraAllow
+  });
+
+  test('TERM falls back when base omits it', () => {
+    const base = { ...CONTAMINATED } as NodeJS.ProcessEnv;
+    delete base.TERM;
+    expect(buildHermeticEnv(base, HERMETIC_VARS).TERM).toBe('xterm-256color');
+  });
+});
+
+describe('EVALS_HERMETIC=0 escape hatch', () => {
+  test('returns byte-identical legacy env, overrides still last', () => {
+    const base = { ...CONTAMINATED, EVALS_HERMETIC: '0' } as NodeJS.ProcessEnv;
+    const e = buildHermeticEnv(base, HERMETIC_VARS, { GSTACK_HEADLESS: '1' });
+    // Legacy spread: every base var survives, hermeticVars NOT applied.
+    expect(e.CONDUCTOR_WORKSPACE_PATH).toBe(CONTAMINATED.CONDUCTOR_WORKSPACE_PATH);
+    expect(e.CLAUDE_CONFIG_DIR).toBe('/Users/op/.claude');
+    expect(e.GSTACK_HOME).toBe('/Users/op/.gstack');
+    expect(e.GSTACK_HEADLESS).toBe('1');
+    expect(e).toEqual({ ...(base as Record<string, string>), GSTACK_HEADLESS: '1' });
+  });
+
+  test('isHermeticEnabled reads at call time (ESM-hoist safety)', () => {
+    const prev = process.env.EVALS_HERMETIC;
+    try {
+      process.env.EVALS_HERMETIC = '0';
+      expect(isHermeticEnabled()).toBe(false);
+      process.env.EVALS_HERMETIC = '1';
+      expect(isHermeticEnabled()).toBe(true);
+      delete process.env.EVALS_HERMETIC;
+      expect(isHermeticEnabled()).toBe(true);
+    } finally {
+      if (prev === undefined) delete process.env.EVALS_HERMETIC;
+      else process.env.EVALS_HERMETIC = prev;
+    }
+  });
+});
+
+describe('buildSeedConfig', () => {
+  test('stores only the 20-char key suffix and trusts the given dirs', () => {
+    const seed = buildSeedConfig({
+      apiKey: 'sk-ant-0123456789abcdefghijklmn',
+      trustedDirs: ['/repo/root'],
+    }) as any;
+    expect(seed.hasCompletedOnboarding).toBe(true);
+    const approved = seed.customApiKeyResponses.approved;
+    expect(approved).toHaveLength(1);
+    expect(approved[0]).toHaveLength(20);
+    expect('sk-ant-0123456789abcdefghijklmn'.endsWith(approved[0])).toBe(true);
+    expect(seed.projects['/repo/root'].hasTrustDialogAccepted).toBe(true);
+    expect(seed.projects['/repo/root'].hasCompletedProjectOnboarding).toBe(true);
+  });
+
+  test('apiKey undefined → omits customApiKeyResponses, does not throw', () => {
+    const seed = buildSeedConfig({ apiKey: undefined, trustedDirs: [] }) as any;
+    expect(seed.customApiKeyResponses).toBeUndefined();
+    expect(seed.hasCompletedOnboarding).toBe(true);
+  });
+
+  test('no full key material anywhere in the seed', () => {
+    const key = 'sk-ant-0123456789abcdefghijklmn';
+    const json = JSON.stringify(buildSeedConfig({ apiKey: key, trustedDirs: [] }));
+    expect(json.includes(key)).toBe(false);
+  });
+});
+
+describe('getHermeticDirs lifecycle', () => {
+  test('configDir ends in /.claude — extractPlanFilePath contract', () => {
+    // claude-pty-runner.ts:191 anchors plan paths on `.claude/plans/` under
+    // /var|/tmp prefixes; the dir-name suffix is what keeps PTY plan-mode
+    // tests extracting hermetic plan files with zero extractor changes.
+    const dirs = getHermeticDirs();
+    expect(dirs.configDir.endsWith(`${path.sep}.claude`)).toBe(true);
+    expect(dirs.configDir.startsWith(os.tmpdir())).toBe(true);
+  });
+
+  test('sync singleton: repeat calls return the same dirs', () => {
+    expect(getHermeticDirs()).toBe(getHermeticDirs());
+  });
+
+  test('seeds .claude.json in the config dir', () => {
+    const dirs = getHermeticDirs();
+    const seed = JSON.parse(fs.readFileSync(path.join(dirs.configDir, '.claude.json'), 'utf-8'));
+    expect(seed.hasCompletedOnboarding).toBe(true);
+    const root = path.resolve(__dirname, '..', '..');
+    expect(seed.projects[root].hasTrustDialogAccepted).toBe(true);
+  });
+});
+
+describe('gcStaleHermeticDirs', () => {
+  // Runs `body` against a fresh scratch dir and removes it afterwards even
+  // when an assertion throws, so a failing run does not leak directories
+  // into os.tmpdir().
+  function withScratchDir(prefix: string, body: (tmp: string) => void): void {
+    const tmp = fs.mkdtempSync(path.join(os.tmpdir(), prefix));
+    try {
+      body(tmp);
+    } finally {
+      fs.rmSync(tmp, { recursive: true, force: true });
+    }
+  }
+
+  test('removes dead-pid dirs, keeps live-pid and foreign dirs', () => {
+    withScratchDir('hermetic-gc-test-', (tmp) => {
+      // Find a pid that is definitely dead: spawn-and-reap is overkill; use a
+      // huge pid beyond pid_max on macOS/Linux defaults.
+      const deadPid = 99999999;
+      const dead = path.join(tmp, `gstack-hermetic-${deadPid}-abc`);
+      const live = path.join(tmp, `gstack-hermetic-${process.pid}-abc`);
+      const foreign = path.join(tmp, 'unrelated-dir');
+      const malformed = path.join(tmp, 'gstack-hermetic-notapid-abc');
+      for (const d of [dead, live, foreign, malformed]) fs.mkdirSync(d);
+      // GC only reclaims dirs older than its 1h age floor (PID-reuse guard);
+      // backdate the dead-pid dir's mtime so it qualifies.
+      const old = new Date(Date.now() - 2 * 60 * 60 * 1000);
+      fs.utimesSync(dead, old, old);
+
+      gcStaleHermeticDirs(tmp);
+
+      expect(fs.existsSync(dead)).toBe(false);
+      expect(fs.existsSync(live)).toBe(true);
+      expect(fs.existsSync(foreign)).toBe(true);
+      expect(fs.existsSync(malformed)).toBe(true); // never guess on malformed names
+    });
+  });
+
+  test('keeps a fresh dead-pid dir (PID-reuse grace window)', () => {
+    withScratchDir('hermetic-gc-fresh-', (tmp) => {
+      // Dead pid but just created — must survive GC, else PID reuse could delete
+      // a dir whose original pid exited and got recycled to a live process.
+      const freshDead = path.join(tmp, 'gstack-hermetic-99999999-xyz');
+      fs.mkdirSync(freshDead);
+      gcStaleHermeticDirs(tmp);
+      expect(fs.existsSync(freshDead)).toBe(true);
+    });
+  });
+});
+
+describe('hermeticChildEnv composition', () => {
+  // Runs `body` with process.env.EVALS_HERMETIC pinned to `value`
+  // (undefined = unset) and restores the previous state afterwards. Both
+  // tests pin it explicitly so the suite gives the same result whether or
+  // not the operator exported EVALS_HERMETIC=0 in their shell.
+  function withEvalsHermetic(value: string | undefined, body: () => void): void {
+    const prev = process.env.EVALS_HERMETIC;
+    try {
+      if (value === undefined) delete process.env.EVALS_HERMETIC;
+      else process.env.EVALS_HERMETIC = value;
+      body();
+    } finally {
+      if (prev === undefined) delete process.env.EVALS_HERMETIC;
+      else process.env.EVALS_HERMETIC = prev;
+    }
+  }
+
+  test('hermetic by default: redirects config dirs, drops contamination', () => {
+    // process.env in a real test run may carry CONDUCTOR_*/CLAUDECODE — the
+    // composition must scrub them and point at the singleton dirs.
+    withEvalsHermetic(undefined, () => {
+      const e = hermeticChildEnv({ GSTACK_HEADLESS: '1' });
+      const dirs = getHermeticDirs();
+      expect(e.CLAUDE_CONFIG_DIR).toBe(dirs.configDir);
+      expect(e.GSTACK_HOME).toBe(dirs.gstackHome);
+      expect(e.GSTACK_HEADLESS).toBe('1');
+      expect(e.CLAUDECODE).toBeUndefined();
+      expect(e.CONDUCTOR_WORKSPACE_PATH).toBeUndefined();
+    });
+  });
+
+  test('EVALS_HERMETIC=0: legacy passthrough of live process.env', () => {
+    withEvalsHermetic('0', () => {
+      const e = hermeticChildEnv({ EXTRA: 'x' });
+      expect(e.PATH).toBe(process.env.PATH as string);
+      expect(e.EXTRA).toBe('x');
+      // No hermetic redirection in legacy mode.
+      expect(e.CLAUDE_CONFIG_DIR).toBe(process.env.CLAUDE_CONFIG_DIR as any);
+    });
+  });
+});
+
+afterAll(() => {
+  // The singleton's own exit hook handles runRoot; nothing else to clean.
+});
@@ -0,0 +1,276 @@
+/**
+ * Hermetic child environment for E2E test runners.
+ *
+ * Local E2E runs spawn `claude` (and codex/gemini/SDK) children that, until
+ * this module, inherited the operator's full session context: ~/.claude
+ * (user CLAUDE.md, .claude.json MCP servers incl. gbrain + Conductor,
+ * skills), ~/.gstack decision logs, and CONDUCTOR_-/CLAUDECODE-style env vars.
+ * CI was hermetic only by accident (fresh Docker /home/runner). This module
+ * makes local children see a CI-equivalent clean room by default.
+ *
+ *   operator shell (contaminated)            hermetic child env
+ *   ┌─────────────────────────────┐  buildHermeticEnv()
+ *   │ PATH, HOME, TMPDIR, ...     │── allowlist ─────────► kept
+ *   │ HTTP(S)_PROXY, SSL_CERT_*   │── allowlist ─────────► kept (network)
+ *   │ ANTHROPIC_API_KEY/BASE_URL/ │── named list ────────► kept (auth)
+ *   │   AUTH_TOKEN                │
+ *   │ GSTACK_ANTHROPIC_API_KEY    │── promotedEnv() ─────► ANTHROPIC_API_KEY
+ *   │ CONDUCTOR_*, CLAUDECODE,    │
+ *   │ CLAUDE_*, GSTACK_*, MCP_*,  │── dropped ───────────► ∅
+ *   │ GBRAIN_*, GH_TOKEN, ...     │
+ *   └─────────────────────────────┘
+ *      + per-runner extraAllow (codex: OpenAI vars; gemini: Google vars)
+ *      + CLAUDE_CONFIG_DIR=<runRoot>/.claude  GSTACK_HOME=<runRoot>/gstack-home
+ *      + per-test overrides spread LAST
+ *
+ * Escape hatch: EVALS_HERMETIC=0 restores the legacy contaminated env
+ * byte-identically (runners must also gate --strict-mcp-config on
+ * isHermeticEnabled() so the escape hatch restores args too).
+ *
+ * isHermeticEnabled() is evaluated at CALL time, never at module load —
+ * ESM hoists imports above any in-file `process.env.EVALS_HERMETIC = '0'`
+ * assignment, so a module-load-time read would silently ignore test pins.
+ */
+
+import * as fs from 'fs';
+import * as path from 'path';
+import * as os from 'os';
+import { promotedEnv } from '../../lib/conductor-env-shim';
+import { isProcessAlive } from '../../browse/src/error-handling';
+
+/** Exact env names a hermetic child keeps. Everything not listed (or matched
+ * by a prefix rule below) is dropped. */
+const ALLOW_EXACT = new Set([
+  // Process basics
+  'PATH', 'HOME', 'TMPDIR', 'TERM', 'COLORTERM', 'LANG', 'LC_ALL', 'SHELL',
+  'USER', 'LOGNAME', 'TZ', 'NODE_ENV', 'CI',
+  // Browser/runtime caches the child legitimately shares with the operator
+  'PLAYWRIGHT_BROWSERS_PATH',
+  // Network reachability — without these, children on proxied networks can't
+  // reach the Anthropic API at all
+  'HTTP_PROXY', 'HTTPS_PROXY', 'NO_PROXY',
+  'http_proxy', 'https_proxy', 'no_proxy',
+  'SSL_CERT_FILE', 'SSL_CERT_DIR', 'NODE_EXTRA_CA_CERTS',
+  // Auth — named, NOT the broad ANTHROPIC_* prefix: a prefix rule would
+  // smuggle model/beta/debug knobs that change eval behavior
+  'ANTHROPIC_API_KEY',   // the auth credential evals require
+  'ANTHROPIC_BASE_URL',  // API endpoint override (corp proxies)
+  'ANTHROPIC_AUTH_TOKEN', // bearer-token auth variant
+]);
+
+/** Prefix rules: eval-harness knobs + CI metadata. Deliberately NOT here:
+ * CONDUCTOR_* / CLAUDE_* (incl. CLAUDECODE, CLAUDE_CODE_ENTRYPOINT) /
+ * GSTACK_* / MCP_* / GBRAIN_* — session-context contamination; and operator
+ * credentials (GH_TOKEN, SSH_AUTH_SOCK, GIT_*, OPENAI_API_KEY,
+ * VOYAGE_API_KEY) — CI doesn't have them and eval children have no business
+ * using them. A test that legitimately needs one opts in via its own env
+ * override; a provider runner (codex/gemini) re-admits its auth vars via
+ * opts.extraAllow. */
+const ALLOW_PREFIXES = ['EVALS_', 'GITHUB_'];
+
+export interface HermeticEnvOpts {
+  /** Per-runner additional allowed names (exact match) or prefixes (entries
+   * ending in '*'). Example: codex runner passes ['OPENAI_API_KEY', 'CODEX_*']. */
+  extraAllow?: string[];
+}
+
+/**
+ * Reports whether the hermetic scrub is active for `env`.
+ *
+ * Only the literal string '0' in EVALS_HERMETIC switches it off; an unset
+ * variable or any other value leaves it on. The lookup happens on every call
+ * rather than once at import (see module doc — ESM hoist).
+ */
+export function isHermeticEnabled(env: NodeJS.ProcessEnv = process.env): boolean {
+  const optedOut = env.EVALS_HERMETIC === '0';
+  return !optedOut;
+}
+
+/**
+ * Builds a child env from `base` by allowlist. Pure: no I/O.
+ *
+ * Layers, lowest to highest precedence:
+ *   1. vars of promotedEnv(base) that pass ALLOW_EXACT / ALLOW_PREFIXES or
+ *      the caller's opts.extraAllow (entries ending in '*' are prefixes)
+ *   2. TERM='xterm-256color' when no non-empty TERM survived
+ *   3. hermeticVars
+ *   4. overrides whose value is not undefined
+ * Overrides go LAST so per-test env (GSTACK_HOME, CONDUCTOR_WORKSPACE_PATH,
+ * GSTACK_HEADLESS opt-out) always beats the scrub — that is the documented
+ * re-contamination escape and the wiring tripwire forbids passing raw
+ * process.env through it.
+ *
+ * Escape hatch: when `base` carries EVALS_HERMETIC=0 the result is the legacy
+ * spread — every defined base var, then the defined overrides; hermeticVars
+ * and opts are not consulted.
+ */
+export function buildHermeticEnv(
+  base: NodeJS.ProcessEnv,
+  hermeticVars: Record<string, string>,
+  overrides?: Record<string, string | undefined>,
+  opts?: HermeticEnvOpts,
+): Record<string, string> {
+  const overridePairs = Object.entries(overrides ?? {});
+
+  if (!isHermeticEnabled(base)) {
+    // Legacy passthrough: base first, overrides on top, undefined skipped.
+    const passthrough: Record<string, string> = {};
+    for (const [name, value] of [...Object.entries(base), ...overridePairs]) {
+      if (value !== undefined) passthrough[name] = value;
+    }
+    return passthrough;
+  }
+
+  const promoted = promotedEnv(base);
+
+  // Split the per-runner rules into exact names and '*'-terminated prefixes.
+  const exactExtras = new Set<string>();
+  const prefixExtras: string[] = [];
+  (opts?.extraAllow ?? []).forEach((rule) => {
+    if (rule.endsWith('*')) prefixExtras.push(rule.slice(0, -1));
+    else exactExtras.add(rule);
+  });
+  const allPrefixes = [...ALLOW_PREFIXES, ...prefixExtras];
+  const isAllowed = (name: string): boolean =>
+    ALLOW_EXACT.has(name) ||
+    exactExtras.has(name) ||
+    allPrefixes.some((prefix) => name.startsWith(prefix));
+
+  const scrubbed: Record<string, string> = {};
+  for (const [name, value] of Object.entries(promoted)) {
+    if (value !== undefined && isAllowed(name)) scrubbed[name] = value;
+  }
+  if (!scrubbed.TERM) scrubbed.TERM = 'xterm-256color';
+  Object.assign(scrubbed, hermeticVars);
+  for (const [name, value] of overridePairs) {
+    if (value !== undefined) scrubbed[name] = value;
+  }
+  return scrubbed;
+}
+
+export interface SeedConfigOpts {
+  /** When undefined (operator has no key exported), customApiKeyResponses is
+   * omitted — the child fails auth exactly as it would today, no throw here. */
+  apiKey: string | undefined;
+  trustedDirs: string[];
+}
+
+/**
+ * Builds the minimal $CLAUDE_CONFIG_DIR/.claude.json content for a child
+ * that starts from a fresh config dir.
+ *
+ * Why a seed at all — empirically verified 2026-06-12 on claude 2.1.175:
+ * PRINT MODE (`claude -p`) with ANTHROPIC_API_KEY needs NO seed; a fresh
+ * empty config dir ran non-interactively (exit 0, real cost billed to the
+ * key). The PTY path differs: first-run TUI prompts DO appear there, and
+ * each seeded field silences one of them:
+ * - hasCompletedOnboarding: suppresses the onboarding flow
+ * - customApiKeyResponses.approved: suppresses the "use this API key?"
+ *   prompt; entries are the key's LAST 20 CHARS (shape verified against a
+ *   real ~/.claude.json). Left out when opts.apiKey is undefined or empty.
+ * - projects[dir].hasTrustDialogAccepted: pre-trusts repo-cwd PTY sessions
+ *   (the pty-runner's 15s trust-watcher remains as fallback for temp cwds)
+ * bypassPermissionsModeAccepted was considered and dropped: absent from a
+ * real config even though --dangerously-skip-permissions is in daily use.
+ *
+ * @param opts API key (may be undefined) and the directories to pre-trust.
+ * @returns A plain object ready for JSON.stringify; of the key, only its
+ *   trailing 20 characters are stored.
+ */
+export function buildSeedConfig(opts: SeedConfigOpts): Record<string, unknown> {
+  const trustEntry = (
+    dir: string,
+  ): [string, { hasTrustDialogAccepted: boolean; hasCompletedProjectOnboarding: boolean }] => [
+    dir,
+    { hasTrustDialogAccepted: true, hasCompletedProjectOnboarding: true },
+  ];
+
+  const seed: Record<string, unknown> = {
+    hasCompletedOnboarding: true,
+    projects: Object.fromEntries(opts.trustedDirs.map((dir) => trustEntry(dir))),
+  };
+
+  const key = opts.apiKey;
+  if (!key) return seed;
+
+  seed.customApiKeyResponses = { approved: [key.slice(-20)] };
+  return seed;
+}
+
+export interface HermeticDirs {
+  /** Ends in `/.claude` — load-bearing: extractPlanFilePath in
+   * claude-pty-runner.ts:191 anchors plan-file paths on `.claude/plans/`
+   * under a /var|/tmp prefix. Renaming this segment breaks PTY plan tests. */
+  configDir: string;
+  gstackHome: string;
+  runRoot: string;
+}
+
+const DIR_PREFIX = 'gstack-hermetic-';
+
+let cachedDirs: HermeticDirs | null = null;
+
+/**
+ * Absolute path of the repository root, used as the trusted dir in the seed.
+ * This file sits in <root>/test/helpers, so the root is two levels up.
+ */
+function repoRoot(): string {
+  const twoLevelsUp = path.join('..', '..');
+  return path.resolve(__dirname, twoLevelsUp);
+}
+
+/**
+ * Sync memoized per-process singleton — intentionally NO async gap between
+ * the cache check and create+seed, so concurrent first calls under
+ * `bun test --concurrent` cannot double-create or observe a half-seeded dir.
+ * Shared across all tests in the process: that matches CI's within-job
+ * shared /home/runner (operator isolation, not per-test isolation).
+ *
+ * First call, in order: runs gcStaleHermeticDirs() on the default tmp dir,
+ * creates <tmp>/gstack-hermetic-<pid>-XXXXXX, creates `.claude` and
+ * `gstack-home` inside it, writes `.claude/.claude.json` from
+ * buildSeedConfig(), registers a process 'exit' hook that removes the run
+ * root, then caches the result. Later calls return the same cached object.
+ *
+ * The seed's API key is ANTHROPIC_API_KEY, falling back to
+ * GSTACK_ANTHROPIC_API_KEY only when the former is undefined (`??`); an
+ * empty-string ANTHROPIC_API_KEY therefore yields a seed without
+ * customApiKeyResponses.
+ *
+ * @returns The cached { configDir, gstackHome, runRoot } for this process.
+ * @throws Whatever mkdirSync/writeFileSync throws while seeding; the partial
+ *   run root is removed (best-effort) before the error is rethrown and
+ *   nothing is cached. A failure of mkdtempSync itself propagates directly.
+ */
+export function getHermeticDirs(): HermeticDirs {
+  if (cachedDirs) return cachedDirs;
+
+  gcStaleHermeticDirs();
+
+  // Embed our pid so the GC of future processes can check liveness.
+  const runRoot = fs.mkdtempSync(path.join(os.tmpdir(), `${DIR_PREFIX}${process.pid}-`));
+  const configDir = path.join(runRoot, '.claude');
+  const gstackHome = path.join(runRoot, 'gstack-home');
+
+  // A half-seeded config dir means children hang on first-run prompts until
+  // the test timeout — far worse than failing loudly here. So we throw on
+  // failure, but tear down the partial dir first: an unseeded runRoot named
+  // with our (alive) pid would be skipped by this process's GC and leak until
+  // process exit, so remove it before rethrowing.
+  try {
+    fs.mkdirSync(configDir, { recursive: true });
+    fs.mkdirSync(gstackHome, { recursive: true });
+    const seed = buildSeedConfig({
+      apiKey: process.env.ANTHROPIC_API_KEY ?? process.env.GSTACK_ANTHROPIC_API_KEY,
+      trustedDirs: [repoRoot()],
+    });
+    fs.writeFileSync(path.join(configDir, '.claude.json'), JSON.stringify(seed, null, 2));
+  } catch (err) {
+    try { fs.rmSync(runRoot, { recursive: true, force: true }); } catch { /* best-effort */ }
+    throw err;
+  }
+
+  process.on('exit', () => {
+    // Exit handlers cannot await: sync best-effort removal only. Anything
+    // left behind is reclaimed by the next process's pid-aware GC.
+    try { fs.rmSync(runRoot, { recursive: true, force: true }); } catch { /* GC reclaims */ }
+  });
+
+  cachedDirs = { configDir, gstackHome, runRoot };
+  return cachedDirs;
+}
+
+/** A dir younger than this is never GC'd even if its pid looks dead — guards
+ * against PID reuse deleting a freshly-created dir whose original pid exited
+ * and was recycled to an unrelated live process between create and GC. */
+const GC_MIN_AGE_MS = 60 * 60 * 1000; // 1h
+
+/**
+ * Reclaim leftovers from crashed runs. Two signals, both required: the
+ * embedded pid is dead AND the dir is older than GC_MIN_AGE_MS. Pid-alone
+ * would risk PID-reuse false-deletes of live dirs; age-alone would delete a
+ * live eval run's config out from under it once the run outlasts
+ * GC_MIN_AGE_MS. Exported for tests.
+ *
+ * Only names of the form `gstack-hermetic-<decimal pid>-...` are candidates;
+ * anything else is left alone. All filesystem errors are swallowed: GC is
+ * best-effort and must never fail the caller.
+ *
+ * @param tmpDir Directory to scan; defaults to os.tmpdir().
+ */
+export function gcStaleHermeticDirs(tmpDir: string = os.tmpdir()): void {
+  let entries: string[];
+  try { entries = fs.readdirSync(tmpDir); } catch { return; }
+  // Plain decimal digits only. Number() by itself also accepts '0x1f', '1e3',
+  // '+5' and whitespace-padded text, which would turn a malformed dir name
+  // into a plausible pid and make it eligible for deletion.
+  const decimalPid = /^[1-9]\d*$/;
+  const now = Date.now();
+  for (const name of entries) {
+    if (!name.startsWith(DIR_PREFIX)) continue;
+    const pidStr = name.slice(DIR_PREFIX.length).split('-')[0] ?? '';
+    if (!decimalPid.test(pidStr)) continue; // never guess on malformed names
+    const pid = Number(pidStr);
+    if (!Number.isSafeInteger(pid)) continue; // digit run too long to be a pid
+    if (pid === process.pid || isProcessAlive(pid)) continue;
+    const full = path.join(tmpDir, name);
+    try {
+      if (now - fs.statSync(full).mtimeMs < GC_MIN_AGE_MS) continue; // too fresh
+    } catch { continue; } // vanished or unreadable — leave it
+    try { fs.rmSync(full, { recursive: true, force: true }); } catch { /* best-effort */ }
+  }
+}
+
+/**
+ * What the runners call to build a child env from the live process.env.
+ *
+ * Hermetic mode (default): scrubs process.env, redirects CLAUDE_CONFIG_DIR
+ * and GSTACK_HOME to the singleton hermetic dirs, then applies `overrides`
+ * last. With EVALS_HERMETIC=0 the legacy env comes back untouched apart from
+ * `overrides`, and the hermetic dirs are never created.
+ *
+ * @param overrides Per-test vars merged last; undefined values are skipped.
+ * @param opts Per-runner allowlist additions, forwarded to buildHermeticEnv.
+ */
+export function hermeticChildEnv(
+  overrides?: Record<string, string | undefined>,
+  opts?: HermeticEnvOpts,
+): Record<string, string> {
+  let redirects: Record<string, string> = {};
+  if (isHermeticEnabled()) {
+    // Only touch the filesystem when the scrub is actually on.
+    const { configDir, gstackHome } = getHermeticDirs();
+    redirects = { CLAUDE_CONFIG_DIR: configDir, GSTACK_HOME: gstackHome };
+  }
+  return buildHermeticEnv(process.env, redirects, overrides, opts);
+}
@@ -210,7 +210,11 @@ const MONOLITH_INVARIANTS: ParityInvariant[] = [
    skill: 'review',
    mustContain: ['confidence', 'P1', 'P2'],
    mustHaveHeadings: ['## Preamble', '## When to invoke'],
-    maxSizeRatio: 1.05,
+    // The adversarial step swapped its bare `command -v codex` check for the shared
+    // codexPreflight() block (install + auth tri-state + CODEX_MODE branch prose),
+    // landing ~6.3% over the v1.53.0.0 baseline. Intentional: it adds proper
+    // not-installed vs not-authed handling, not slop.
+    maxSizeRatio: 1.08,
    minBytes: 70_000,
  },
  {
@@ -10,6 +10,7 @@ import * as fs from 'fs';
 import * as path from 'path';
 import * as os from 'os';
 import { getProjectEvalDir } from './eval-store';
+import { hermeticChildEnv, isHermeticEnabled } from './hermetic-env';

 const GSTACK_DEV_DIR = path.join(os.homedir(), '.gstack-dev');
 const HEARTBEAT_PATH = path.join(GSTACK_DEV_DIR, 'e2e-live.json'); // heartbeat stays global
@@ -167,6 +168,10 @@ export async function runSkillTest(options: {
    '--max-turns', String(maxTurns),
    '--allowed-tools', ...allowedTools,
  ];
+  // Hermetic children get zero MCP servers (no --mcp-config is passed).
+  // Gated on the same call-time check as the env scrub so EVALS_HERMETIC=0
+  // restores operator MCP along with the operator env.
+  if (isHermeticEnabled()) args.push('--strict-mcp-config');

  // Write prompt to a temp file OUTSIDE workingDirectory to avoid race conditions
  // where afterAll cleanup deletes the dir before cat reads the file (especially
@@ -176,11 +181,14 @@ export async function runSkillTest(options: {

  const proc = Bun.spawn(['sh', '-c', `cat "${promptFile}" | claude ${args.map(a => `"${a}"`).join(' ')}`], {
    cwd: workingDirectory,
+    // Hermetic by default (see test/helpers/hermetic-env.ts): operator
+    // session context (CONDUCTOR_*, CLAUDECODE, ~/.claude config, ~/.gstack)
+    // never reaches the child; EVALS_HERMETIC=0 restores the legacy env.
    // Default GSTACK_HEADLESS=1 so eval/E2E runs classify as headless (BLOCK on an
    // AskUserQuestion failure rather than emit a prose question no human reads). A
    // suite exercising the INTERACTIVE prose-fallback path opts out by passing
    // `env: { GSTACK_HEADLESS: '' }` — extraEnv wins because it spreads last.
-    env: { ...process.env, GSTACK_HEADLESS: '1', ...extraEnv },
+    env: hermeticChildEnv({ GSTACK_HEADLESS: '1', ...extraEnv }),
    stdout: 'pipe',
    stderr: 'pipe',
  });
@@ -36,6 +36,11 @@ export const E2E_TOUCHFILES: Record<string, string[]> = {
  'browse-basic':    ['browse/src/**', 'browse/test/test-server.ts'],
  'browse-snapshot': ['browse/src/**', 'browse/test/test-server.ts'],

+  // Hermetic isolation canaries (hermetic-env.ts is also a GLOBAL touchfile;
+  // these entries exist so the canaries themselves stay tier-classified)
+  'hermetic-canary':   ['test/helpers/hermetic-env.ts', 'test/helpers/session-runner.ts', 'test/skill-e2e-hermetic-canary.test.ts', 'lib/conductor-env-shim.ts'],
+  'hermetic-sentinel': ['test/helpers/hermetic-env.ts', 'test/helpers/session-runner.ts', 'test/skill-e2e-hermetic-canary.test.ts', 'lib/conductor-env-shim.ts'],
+
  // SKILL.md setup + preamble (depend on ROOT SKILL.md + gen-skill-docs)
  'skillmd-setup-discovery':  ['SKILL.md', 'SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'],
  'skillmd-no-local-binary':  ['SKILL.md', 'SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'],
@@ -111,7 +116,12 @@ export const E2E_TOUCHFILES: Record<string, string[]> = {
  // written a never-ask preference, AUQ should still auto-decide rather than
  // surfacing the question. Touches the question-tuning + preference
  // infrastructure plus the resolvers that own the AUTO_DECIDE preamble.
-  'auto-decide-preserved':        ['scripts/resolvers/question-tuning.ts', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble/generate-completion-status.ts', 'plan-ceo-review/**', 'bin/gstack-question-preference', 'bin/gstack-config', 'bin/gstack-slug', 'test/helpers/claude-pty-runner.ts'],
+  'auto-decide-preserved':        ['scripts/resolvers/question-tuning.ts', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble/generate-preamble-bash.ts', 'scripts/resolvers/preamble/generate-completion-status.ts', 'plan-ceo-review/**', 'bin/gstack-question-preference', 'bin/gstack-config', 'bin/gstack-slug', 'hosts/claude/hooks/question-preference-hook.ts', 'lib/is-conductor.ts', 'test/helpers/claude-pty-runner.ts'],
+
+  // Conductor → prose decision brief (Conductor signal makes prose the default;
+  // the PreToolUse hook denies the flaky tool). Touches the resolver that owns
+  // the Conductor rule, the preamble signal, the hook, and the detection helper.
+  'conductor-prose':              ['scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble/generate-preamble-bash.ts', 'scripts/resolvers/preamble.ts', 'plan-eng-review/**', 'hosts/claude/hooks/question-preference-hook.ts', 'lib/is-conductor.ts', 'test/helpers/claude-pty-runner.ts', 'test/skill-e2e-conductor-prose.test.ts'],

  // Real-PTY E2E batch (#6 new tests on the harness).
  // Each one tests behavior the SDK harness can't observe (rendered TTY,
@@ -291,6 +301,11 @@ export const E2E_TOUCHFILES: Record<string, string[]> = {
  'design-shotgun-session':         ['design-shotgun/**', 'scripts/resolvers/design.ts'],
  'design-shotgun-full':            ['design-shotgun/**', 'design/src/**', 'browse/src/**'],

+  // /diagram (diagram-render bundle consumers). Triplet = deterministic
+  // functional (gate); authoring quality = LLM-judged benchmark (periodic).
+  'diagram-triplet':            ['diagram/**', 'lib/diagram-render/**', 'browse/src/write-commands.ts', 'browse/src/read-commands.ts'],
+  'diagram-authoring-quality':  ['diagram/**', 'lib/diagram-render/**', 'test/helpers/llm-judge.ts'],
+
  // gstack-upgrade
  'gstack-upgrade-happy-path': ['gstack-upgrade/**'],

@@ -435,6 +450,11 @@ export const E2E_TIERS: Record<string, 'gate' | 'periodic'> = {
  'browse-basic': 'gate',
  'browse-snapshot': 'gate',

+  // Hermetic isolation — gate (deterministic env/config assertions; if the
+  // clean room breaks, every other eval's signal is contaminated)
+  'hermetic-canary': 'gate',
+  'hermetic-sentinel': 'gate',
+
  // SKILL.md setup — gate (if setup breaks, no skill works)
  'skillmd-setup-discovery': 'gate',
  'skillmd-no-local-binary': 'gate',
@@ -508,6 +528,7 @@ export const E2E_TIERS: Record<string, 'gate' | 'periodic'> = {
  // v1.21+ auto-mode regression tests
  'office-hours-auto-mode': 'gate',
  'auto-decide-preserved': 'periodic',
+  'conductor-prose': 'periodic',
  'e2e-harness-audit': 'gate',

  // Real-PTY E2E batch — tier classification:
@@ -659,6 +680,10 @@ export const E2E_TIERS: Record<string, 'gate' | 'periodic'> = {
  'design-shotgun-session': 'gate',
  'design-shotgun-full': 'periodic',

+  // /diagram — triplet is deterministic functional, judge is a quality benchmark
+  'diagram-triplet': 'gate',
+  'diagram-authoring-quality': 'periodic',
+
  // gstack-upgrade
  'gstack-upgrade-happy-path': 'gate',

@@ -779,6 +804,7 @@ export const LLM_JUDGE_TOUCHFILES: Record<string, string[]> = {
 */
 export const GLOBAL_TOUCHFILES = [
  'test/helpers/session-runner.ts',  // All E2E tests use this runner
+  'test/helpers/hermetic-env.ts',    // Changes every E2E child's environment
  'test/helpers/eval-store.ts',      // All E2E tests store results here
  'test/helpers/touchfiles.ts',      // Self-referential — reclassifying wrong is dangerous
 ];
@@ -0,0 +1,113 @@
+/**
+ * Static-grep tripwire for the hermetic E2E wiring. Free tier — no API.
+ *
+ * Every E2E runner spawns its child through hermeticChildEnv(); if a refactor
+ * reverts any spawn site to a raw `...process.env` spread (or a callsite
+ * smuggles the operator env back in through the overrides parameter), local
+ * evals silently re-contaminate and nothing fails until a human notices
+ * weird results again — which took three burned suites last time.
+ *
+ * Pattern mirrors browse/test/terminal-agent-pid-identity.test.ts and
+ * browse/test/server-embedder-terminal-port.test.ts: read source files as
+ * text, assert invariants on their contents. Brittle by design — renaming
+ * the helper must force the author to look here.
+ */
+
+import { describe, test, expect } from 'bun:test';
+import * as fs from 'fs';
+import * as path from 'path';
+
+// Repo root (this file lives in test/). import.meta.dir is a decoded
+// filesystem path; URL.pathname stays percent-encoded, so the previous form
+// resolved to a non-existent directory on checkouts with spaces in the path.
+const ROOT = path.resolve(import.meta.dir, '..');
+
+// Runner helper sources scanned by the tripwires below (repo-relative paths).
+const RUNNERS = [
+  'test/helpers/session-runner.ts',
+  'test/helpers/claude-pty-runner.ts',
+  'test/helpers/codex-session-runner.ts',
+  'test/helpers/gemini-session-runner.ts',
+  'test/helpers/agent-sdk-runner.ts',
+];
+
+/** Load a repo-relative file as UTF-8 text. */
+function read(rel: string): string {
+  const absolute = path.join(ROOT, rel);
+  return fs.readFileSync(absolute, 'utf-8');
+}
+
+describe('hermetic wiring tripwire', () => {
+  test('every runner builds its child env via hermeticChildEnv()', () => {
+    // Name the runner in each failure message: a bare "expected true" from
+    // inside this loop would not say which file lost its wiring.
+    for (const rel of RUNNERS) {
+      const src = read(rel);
+      expect(src.includes('hermeticChildEnv('), `${rel}: must call hermeticChildEnv()`).toBe(true);
+      expect(src.includes("from './hermetic-env'"), `${rel}: must import from './hermetic-env'`).toBe(true);
+    }
+  });
+
+  test('no runner spawns a child with a raw process.env spread', () => {
+    // `...process.env` inside an env object is the exact pre-hermetic leak.
+    // hermetic-env.ts itself legitimately READS process.env (call-time
+    // snapshot); the runners must not SPREAD it into a child env.
+    // Whitespace after the spread dots is tolerated so `... process.env`
+    // cannot slip past a literal substring match.
+    const RAW_SPREAD = /\.\.\.\s*process\.env/;
+    for (const rel of RUNNERS) {
+      const offenders = read(rel)
+        .split('\n')
+        .map((line, i) => ({ line, n: i + 1 }))
+        .filter(({ line }) => RAW_SPREAD.test(line));
+      expect(
+        offenders,
+        `${rel} spreads raw process.env into a child env at line(s) ` +
+          offenders.map((o) => o.n).join(', ') +
+          ' — route through hermeticChildEnv() instead',
+      ).toEqual([]);
+    }
+  });
+
+  test('claude runners gate --strict-mcp-config on isHermeticEnabled()', () => {
+    // Zero MCP servers for hermetic children; EVALS_HERMETIC=0 must restore
+    // operator MCP along with the operator env (the flag may not be
+    // unconditional, or the escape hatch lies).
+    for (const rel of ['test/helpers/session-runner.ts', 'test/helpers/claude-pty-runner.ts']) {
+      const src = read(rel);
+      expect(src.includes('--strict-mcp-config')).toBe(true);
+      // Two accepted source shapes: the flag guarded directly by
+      // `if (isHermeticEnabled())`, or guarded by `if (hermetic)` within 200
+      // characters after `const hermetic = isHermeticEnabled();`.
+      const gated =
+        /if\s*\(\s*isHermeticEnabled\(\)\s*\)\s*(args\.push\(\s*)?['"]--strict-mcp-config['"]/.test(src) ||
+        /const hermetic = isHermeticEnabled\(\);[\s\S]{0,200}if\s*\(hermetic\)\s*args\.push\(\s*['"]--strict-mcp-config['"]/.test(src);
+      expect(gated, `${rel}: --strict-mcp-config must be gated on isHermeticEnabled()`).toBe(true);
+    }
+  });
+
+  test('no test callsite passes the whole operator env as a RUNNER override', () => {
+    // Overrides merge last by design (per-test GSTACK_HOME etc.) — passing
+    // process.env itself through that hole defeats the entire scrub. Scoped
+    // to OUR runner calls: unit tests that spawnSync gstack bin scripts with
+    // `...process.env` are test-process spawns, not eval children, and are
+    // legitimately the test's own business.
+    const RUNNER_CALL =
+      /\b(runSkillTest|launchClaudePty|runPlanSkillObservation|runPlanSkillCounting|runPlanSkillFloorCheck|runAgentSdkTest|runCodexSkillTest|runGeminiSkillTest)\s*\(/;
+    const DIRECT_SPAWN = /\b(spawnSync|spawn|execSync|exec|Bun\.spawn|Bun\.spawnSync)\s*\(/;
+    const testDir = path.join(ROOT, 'test');
+    // Collected as "relative/path:line" for the failure message.
+    const offenders: string[] = [];
+    // Recursively scan every *.test.ts under test/, skipping this file.
+    const walk = (dir: string) => {
+      for (const entry of fs.readdirSync(dir, { withFileTypes: true })) {
+        const full = path.join(dir, entry.name);
+        if (entry.isDirectory()) { walk(full); continue; }
+        if (!entry.name.endsWith('.test.ts')) continue;
+        if (entry.name === 'hermetic-wiring.test.ts') continue;
+        const lines = fs.readFileSync(full, 'utf-8').split('\n');
+        for (let i = 0; i < lines.length; i++) {
+          // Per-line match: `env: { ...process.env` or a bare `env: process.env`
+          // (but not `env: process.env.FOO`).
+          // NOTE(review): a spread placed on the line AFTER `env: {` is not
+          // seen by this single-line test — confirm that is acceptable.
+          if (!/env:\s*(\{\s*\.\.\.\s*process\.env|process\.env\b(?!\.))/.test(lines[i])) continue;
+          // Walk backwards to the nearest enclosing call: runner vs direct spawn.
+          // The window is 25 lines; if neither call kind appears in it, the
+          // line is not reported.
+          for (let j = i; j >= Math.max(0, i - 25); j--) {
+            if (DIRECT_SPAWN.test(lines[j])) break; // test's own spawn — fine
+            if (RUNNER_CALL.test(lines[j])) {
+              offenders.push(`${path.relative(ROOT, full)}:${i + 1}`);
+              break;
+            }
+          }
+        }
+      }
+    };
+    walk(testDir);
+    expect(
+      offenders,
+      'These callsites pass the operator env into an eval child, defeating the hermetic scrub: ' +
+        offenders.join(', '),
+    ).toEqual([]);
+  });
+});
@@ -0,0 +1,50 @@
+import { describe, test, expect } from 'bun:test';
+import { isConductor } from '../lib/is-conductor';
+
+// Unit coverage for isConductor(env): the result is driven by the
+// CONDUCTOR_WORKSPACE_PATH and CONDUCTOR_PORT markers in the env it is given.
+describe('is-conductor', () => {
+  test('true when CONDUCTOR_WORKSPACE_PATH is set', () => {
+    expect(isConductor({ CONDUCTOR_WORKSPACE_PATH: '/Users/x/conductor/ws' })).toBe(true);
+  });
+
+  test('true when CONDUCTOR_PORT is set', () => {
+    expect(isConductor({ CONDUCTOR_PORT: '55070' })).toBe(true);
+  });
+
+  test('true when both are set', () => {
+    expect(isConductor({ CONDUCTOR_WORKSPACE_PATH: '/ws', CONDUCTOR_PORT: '55070' })).toBe(true);
+  });
+
+  test('false when neither is set', () => {
+    expect(isConductor({ HOME: '/Users/x', PATH: '/usr/bin' })).toBe(false);
+  });
+
+  test('false on an empty env', () => {
+    expect(isConductor({})).toBe(false);
+  });
+
+  test('false when the vars are present but empty (Codex #1 hardening — empty != set)', () => {
+    expect(isConductor({ CONDUCTOR_WORKSPACE_PATH: '', CONDUCTOR_PORT: '' })).toBe(false);
+  });
+
+  test('reads the passed env at call time, not a module-load snapshot', () => {
+    const env: NodeJS.ProcessEnv = {};
+    expect(isConductor(env)).toBe(false);
+    // mutate AFTER the first call — a call-time read must see the new value
+    env.CONDUCTOR_PORT = '55070';
+    expect(isConductor(env)).toBe(true);
+  });
+
+  test('defaults to process.env when no arg is passed', () => {
+    // Save and clear BOTH markers so the false-case assertion always runs,
+    // even when the suite itself executes inside a real Conductor session.
+    const savedPort = process.env.CONDUCTOR_PORT;
+    const savedWorkspace = process.env.CONDUCTOR_WORKSPACE_PATH;
+    try {
+      delete process.env.CONDUCTOR_WORKSPACE_PATH;
+      process.env.CONDUCTOR_PORT = '12345';
+      expect(isConductor()).toBe(true);
+      delete process.env.CONDUCTOR_PORT;
+      expect(isConductor()).toBe(false);
+    } finally {
+      // Restore the ambient values exactly (unset stays unset).
+      if (savedPort === undefined) delete process.env.CONDUCTOR_PORT;
+      else process.env.CONDUCTOR_PORT = savedPort;
+      if (savedWorkspace === undefined) delete process.env.CONDUCTOR_WORKSPACE_PATH;
+      else process.env.CONDUCTOR_WORKSPACE_PATH = savedWorkspace;
+    }
+  });
+});
@@ -43,6 +43,11 @@ function runHook(stdin: object): { stdout: string; stderr: string; status: numbe
  env.GSTACK_STATE_ROOT = stateRoot;
  env.GSTACK_QUESTION_LOG_NO_DERIVE = '1';
  delete env.GSTACK_HOME;
+  // These cases assert the defer-path memoryContext injection. Strip ambient
+  // Conductor markers so running inside Conductor (CONDUCTOR_WORKSPACE_PATH/PORT
+  // set) doesn't flip the hook into the [conductor] prose deny instead of defer.
+  delete env.CONDUCTOR_WORKSPACE_PATH;
+  delete env.CONDUCTOR_PORT;
  const res = spawnSync(HOOK, [], {
    env,
    input: JSON.stringify({ ...stdin, cwd: fixtureCwd }),
@@ -70,3 +70,13 @@ describe('Preamble composition order', () => {
    expect(out).not.toContain('## AskUserQuestion Format');
  });
 });
+
+// Checks the generated claude preamble text for the Conductor session signal.
+describe('Conductor signal (preamble bash)', () => {
+  test('claude preamble emits CONDUCTOR_SESSION, gated on != headless (Issue 8)', () => {
+    const out = generatePreamble(makeCtx('claude', 2, 'claude'));
+    expect(out).toContain('echo "CONDUCTOR_SESSION: true"');
+    // The emission must be suppressed when the session is headless (eval/CI
+    // inside Conductor must BLOCK, not render prose to nobody).
+    // The regex requires, in this order: the `!= "headless"` comparison on
+    // $_SESSION_KIND, both CONDUCTOR_* variable names, then the emitted marker.
+    expect(out).toMatch(/"\$_SESSION_KIND" != "headless"[\s\S]*CONDUCTOR_WORKSPACE_PATH[\s\S]*CONDUCTOR_PORT[\s\S]*CONDUCTOR_SESSION: true/);
+  });
+});
@@ -60,7 +60,7 @@ function writeGlobalPref(questionId: string, preference: string): void {
  fs.writeFileSync(f, JSON.stringify(prefs, null, 2));
 }

-function runHook(stdin: object, cwd?: string): {
+function runHook(stdin: object, cwd?: string, extraEnv?: Record<string, string>): {
  stdout: string;
  stderr: string;
  status: number;
@@ -72,7 +72,15 @@ function runHook(stdin: object, cwd?: string): {
  }
  env.GSTACK_STATE_ROOT = stateRoot;
  delete env.GSTACK_HOME;
+  // Strip ambient Conductor markers so these cases characterize NON-Conductor
+  // behavior deterministically — otherwise running the suite inside Conductor
+  // (CONDUCTOR_WORKSPACE_PATH/PORT set) would flip every defer into the
+  // [conductor] prose deny. The Conductor cases below opt back in explicitly
+  // via extraEnv.
+  delete env.CONDUCTOR_WORKSPACE_PATH;
+  delete env.CONDUCTOR_PORT;
  env.GSTACK_QUESTION_LOG_NO_DERIVE = '1';
+  if (extraEnv) Object.assign(env, extraEnv);
  const res = spawnSync(HOOK, [], {
    env,
    input: JSON.stringify({ ...stdin, cwd: cwd || fixtureCwd }),
@@ -337,6 +345,108 @@ describe('MCP variant', () => {
  });
 });

+// ----------------------------------------------------------------------
+// Conductor: deny + prose redirect (transport avoidance, not preference)
+// ----------------------------------------------------------------------
+
+describe('Conductor prose redirect', () => {
+  // Passed as runHook's third argument (extraEnv), which is merged into the
+  // hook's env after the ambient Conductor markers have been stripped.
+  const CONDUCTOR = { CONDUCTOR_PORT: '55070' };
+
+  test('two-way, no preference → deny with [conductor] prose directive', () => {
+    const r = runHook({
+      session_id: 'c1',
+      tool_name: 'AskUserQuestion',
+      tool_use_id: 'tu-c1',
+      tool_input: {
+        questions: [
+          { question: '<gstack-qid:test-q> Need approval?', options: ['A) Yes (recommended)', 'B) No'] },
+        ],
+      },
+    }, undefined, CONDUCTOR);
+    expect(r.parsed?.hookSpecificOutput?.permissionDecision).toBe('deny');
+    expect(r.parsed?.hookSpecificOutput?.permissionDecisionReason).toContain('[conductor]');
+    expect(r.parsed?.hookSpecificOutput?.permissionDecisionReason).toMatch(/do not call askuserquestion/i);
+    expect(r.parsed?.hookSpecificOutput?.permissionDecisionReason).toMatch(/reply with a letter/i);
+  });
+
+  test('UNMARKED question (modal path) → deny with prose directive', () => {
+    const r = runHook({
+      session_id: 'c2',
+      tool_name: 'AskUserQuestion',
+      tool_use_id: 'tu-c2',
+      tool_input: {
+        questions: [
+          { question: 'No marker — an ad-hoc question', options: ['A) Yes (recommended)', 'B) No'] },
+        ],
+      },
+    }, undefined, CONDUCTOR);
+    expect(r.parsed?.hookSpecificOutput?.permissionDecision).toBe('deny');
+    expect(r.parsed?.hookSpecificOutput?.permissionDecisionReason).toContain('[conductor]');
+  });
+
+  test('one-way door → deny with prose directive (NOT defer — destructive must reach human via prose)', () => {
+    const r = runHook({
+      session_id: 'c3',
+      tool_name: 'AskUserQuestion',
+      tool_use_id: 'tu-c3',
+      tool_input: {
+        questions: [
+          {
+            question: '<gstack-qid:ship-test-failure-triage> Tests failed.',
+            options: ['A) Fix now (recommended)', 'B) Investigate', 'C) Ack and ship'],
+          },
+        ],
+      },
+    }, undefined, CONDUCTOR);
+    expect(r.parsed?.hookSpecificOutput?.permissionDecision).toBe('deny');
+    expect(r.parsed?.hookSpecificOutput?.permissionDecisionReason).toContain('[conductor]');
+    expect(r.parsed?.hookSpecificOutput?.permissionDecisionReason).toMatch(/typed confirmation/i);
+  });
+
+  test('CONDUCTOR_WORKSPACE_PATH alone also triggers the redirect', () => {
+    // Uses the MCP-namespaced tool name and only the workspace marker.
+    const r = runHook({
+      session_id: 'c4',
+      tool_name: 'mcp__conductor__AskUserQuestion',
+      tool_use_id: 'tu-c4',
+      tool_input: {
+        questions: [{ question: '<gstack-qid:test-q> Pick?', options: ['A) X (recommended)', 'B) Y'] }],
+      },
+    }, undefined, { CONDUCTOR_WORKSPACE_PATH: '/Users/x/conductor/ws' });
+    expect(r.parsed?.hookSpecificOutput?.permissionDecision).toBe('deny');
+    expect(r.parsed?.hookSpecificOutput?.permissionDecisionReason).toContain('[conductor]');
+  });
+
+  test('PRECEDENCE: full never-ask auto-decide still wins over Conductor prose', () => {
+    writeProjectPref('ship-pre-landing-review-fix', 'never-ask');
+    const r = runHook({
+      session_id: 'c5',
+      tool_name: 'AskUserQuestion',
+      tool_use_id: 'tu-c5',
+      tool_input: {
+        questions: [
+          {
+            question: '<gstack-qid:ship-pre-landing-review-fix> Pre-landing review flagged issue.',
+            options: ['A) Fix now (recommended)', 'B) Skip'],
+          },
+        ],
+      },
+    }, undefined, CONDUCTOR);
+    // Both paths answer 'deny'; the reason text is what tells them apart.
+    expect(r.parsed?.hookSpecificOutput?.permissionDecision).toBe('deny');
+    // auto-decide reason, NOT the conductor prose reason
+    expect(r.parsed?.hookSpecificOutput?.permissionDecisionReason).toContain('plan-tune auto-decide');
+    expect(r.parsed?.hookSpecificOutput?.permissionDecisionReason).not.toContain('[conductor]');
+  });
+
+  test('non-AUQ tool in Conductor → still defer (no redirect on unrelated tools)', () => {
+    const r = runHook(
+      { session_id: 'c6', tool_name: 'Bash', tool_use_id: 'tu-c6', tool_input: {} },
+      undefined,
+      CONDUCTOR,
+    );
+    expect(r.parsed?.hookSpecificOutput?.permissionDecision).toBe('defer');
+  });
+});
+
 // ----------------------------------------------------------------------
 // Auto-decided event logging (since PostToolUse never fires on deny)
 // ----------------------------------------------------------------------
@@ -225,8 +225,25 @@ describe('generateAskUserFormat — runtime-failure prose fallback', () => {
    expect(out).toMatch(/must be sent as tool_use, not prose — unless the documented failure fallback/);
  });

-  test('OV2: the self-check "not writing prose" line carries the fallback qualifier', () => {
-    expect(out).toMatch(/not writing prose — unless the documented failure fallback applies/);
+  test('OV2: the self-check "not writing prose" line carries the Conductor + fallback qualifiers', () => {
+    // After the Conductor-default-prose change, the exception is two-pronged:
+    // CONDUCTOR_SESSION makes prose the default, OR the documented failure fallback.
+    expect(out).toMatch(/not writing prose — unless `CONDUCTOR_SESSION: true`[\s\S]*OR the documented failure fallback applies/);
+  });
+
+  // Conductor-default-prose contract (the proactive path, distinct from the
+  // failure fallback). Guards the Tool-resolution rule + self-check wording.
+  test('Conductor: do-not-call rule present in Tool resolution', () => {
+    expect(out).toMatch(/CONDUCTOR_SESSION: true/);
+    expect(out).toMatch(/do NOT call AskUserQuestion at all/);
+    expect(out).toMatch(/Auto-decide preferences still apply first/);
+    expect(out).toMatch(/gstack-question-log/);
+  });
+
+  test('Conductor: one-way prose rule + continuation protocol present', () => {
+    expect(out).toMatch(/one-way\b[\s\S]*typed confirmation/i);
+    expect(out).toMatch(/never proceed on a vague/i);
+    expect(out).toMatch(/Continuation — mapping a typed reply/);
  });
 });

@@ -131,6 +131,11 @@ export const SKILL_COVERAGE: Record<string, SkillCoverage> = {
  'design-consultation': { gate: ['test/skill-coverage-floor.test.ts'], periodic: [] },
  'design-shotgun': { gate: ['test/skill-coverage-floor.test.ts'], periodic: [] },
  'design-html': { gate: ['test/skill-coverage-floor.test.ts'], periodic: [] },
+  diagram: {
+    gate: ['test/skill-e2e-diagram.test.ts', 'test/skill-coverage-floor.test.ts'],
+    periodic: ['test/skill-e2e-diagram.test.ts'],
+    rationale: 'Triplet contract is gate-tier deterministic; authoring-quality judge is periodic (E2E_TIERS: diagram-triplet/diagram-authoring-quality).',
+  },
  cso: {
    gate: ['test/skill-e2e-cso.test.ts', 'test/cso-preserved.test.ts', 'test/skill-coverage-floor.test.ts'],
    periodic: [],
@@ -100,11 +100,19 @@ describeE2E('AUTO_DECIDE opt-in preserved under Conductor flags (periodic)', ()
      }

      // 4. Run /plan-ceo-review with the Conductor flag set + isolated state.
+      //    GSTACK_HOME=tmpHome is REQUIRED: the preference + question_tuning were
+      //    seeded there. Without it the spawned claude reads the real ~/.gstack,
+      //    never sees the never-ask preference, and the test silently exercises
+      //    the wrong state root (pre-existing bug, Codex #9 / Issue 13).
+      //    CONDUCTOR_WORKSPACE_PATH additionally proves auto-decide still WINS
+      //    over the Conductor prose redirect (precedence: settled preference
+      //    beats transport-avoidance).
      const obs = await runPlanSkillObservation({
        skillName: 'plan-ceo-review',
        inPlanMode: true,
        extraArgs: ['--disallowedTools', 'AskUserQuestion'],
        timeoutMs: 300_000,
+        env: { GSTACK_HOME: tmpHome, CONDUCTOR_WORKSPACE_PATH: tmpHome },
      });

      // 5. Pass: 'auto_decided' (the strongest signal) or 'plan_ready' with
@@ -192,13 +192,21 @@ Report the exact output — either "READY: <path>" or "NEEDS_SETUP".`,
    run('git', ['add', '.']);
    run('git', ['commit', '-m', 'initial']);

-    // Copy bin scripts
+    // Copy bin scripts + the lib module they import. gstack-learnings-log
+    // does `import ... from '$SCRIPT_DIR/../lib/jsonl-store.ts'` (v1.57.5.0
+    // injection sanitization) — without lib/ alongside bin/, the script exits
+    // 1 before writing anything, failing this test for a fixture reason, not
+    // a model-behavior reason (root-caused during the v1.58.0.0 ship; fails
+    // identically on main).
    const binDir = path.join(opDir, 'bin');
    fs.mkdirSync(binDir, { recursive: true });
    for (const script of ['gstack-learnings-log', 'gstack-slug']) {
      fs.copyFileSync(path.join(ROOT, 'bin', script), path.join(binDir, script));
      fs.chmodSync(path.join(binDir, script), 0o755);
    }
+    const libDir = path.join(opDir, 'lib');
+    fs.mkdirSync(libDir, { recursive: true });
+    fs.copyFileSync(path.join(ROOT, 'lib', 'jsonl-store.ts'), path.join(libDir, 'jsonl-store.ts'));

    // gstack-learnings-log will create the project dir automatically via gstack-slug

@@ -0,0 +1,69 @@
+/**
+ * Conductor → prose decision brief (periodic-tier, paid, real-PTY).
+ *
+ * Proves the end-to-end behavior: when CONDUCTOR_SESSION is signalled, a skill
+ * that hits a decision renders a PROSE decision brief and waits, instead of
+ * silently skipping the user.
+ *
+ * SCOPE — read before trusting this as the Conductor guard. This is END-TO-END
+ * BEHAVIOR coverage, NOT the discriminating Conductor guarantee:
+ *   - The deterministic guard is test/question-preference-hook.test.ts
+ *     ("Conductor prose redirect") — it spawns the hook with CONDUCTOR_* set and asserts
+ *     the PreToolUse hook denies + redirects. That test CAN fail on unfixed code.
+ *   - The PTY harness here cannot register `mcp__conductor__AskUserQuestion`, so
+ *     it tests "native AUQ unavailable + Conductor signal → prose," NOT "the MCP
+ *     variant exists and must not be called" (Codex #10). Under --disallowedTools
+ *     a present-human interactive session already prose-falls-back, so this test
+ *     is a smoke check that the Conductor path still produces a prose brief, not
+ *     a proof that the Conductor signal (vs the generic fallback) drove it.
+ *
+ * Periodic tier: model-behavior, non-deterministic.
+ */
+
+import { describe, test, expect } from 'bun:test';
+import { runPlanSkillObservation } from './helpers/claude-pty-runner';
+
+// Paid suite: runs only when EVALS is set and the periodic tier is selected.
+const shouldRun = Boolean(process.env.EVALS) && process.env.EVALS_TIER === 'periodic';
+const describeE2E = shouldRun ? describe : describe.skip;
+
+const FLAWED_PLAN = `# Plan: add a "developer-friendly" pricing tier
+
+## Goal
+Increase developer adoption.
+
+## Premise
+No tests mentioned, no rollout plan, no auth check on the upgrade endpoint.
+Adds a Stripe tier, a React pricing page, a Postgres entitlements table, and a
+Redis cache. The team "feels like" it should be cheaper; no developer was asked.
+`;
+
+describeE2E('Conductor renders decisions as prose (periodic)', () => {
+  test('plan-eng-review in a Conductor session surfaces a PROSE decision brief, not a silent skip', async () => {
+    // Conductor stand-in: the native AskUserQuestion tool is disallowed and
+    // the Conductor workspace marker is handed to the child.
+    const obs = await runPlanSkillObservation({
+      skillName: 'plan-eng-review',
+      inPlanMode: true,
+      extraArgs: ['--disallowedTools', 'AskUserQuestion'],
+      env: { CONDUCTOR_WORKSPACE_PATH: '/tmp/conductor-prose-e2e' },
+      initialPlanContent: FLAWED_PLAN,
+      timeoutMs: 300_000,
+    });
+
+    // Shared tail of both failure messages.
+    const detail = `summary: ${obs.summary}\n--- evidence ---\n${obs.evidence}`;
+
+    switch (obs.outcome) {
+      case 'silent_write':
+        // Findings were written to the plan without asking — the exact
+        // failure this test guards against.
+        throw new Error(
+          `Conductor prose regression: skill wrote findings without surfacing a decision.\n${detail}`,
+        );
+      case 'exited':
+      case 'timeout':
+        throw new Error(`Conductor prose test inconclusive: outcome=${obs.outcome}\n${detail}`);
+    }
+
+    // Any remaining outcome passes only if a prose-rendered decision brief
+    // was seen at some point during the run.
+    expect(obs.proseAUQEverObserved).toBe(true);
+  }, 360_000);
+});
@@ -0,0 +1,153 @@
+/**
+ * /diagram skill E2E (paid, claude -p).
+ *
+ * Two tests with deliberately different tiers (eng-review D5):
+ *
+ *   diagram-triplet (gate) — deterministic functional contract: from an
+ *   English ask, the agent following the skill emits a parseable triplet —
+ *   .mmd source, .excalidraw scene with elements, SVG markup, PNG bytes.
+ *   No quality judgment; either the artifacts exist and parse or they don't.
+ *
+ *   diagram-authoring-quality (periodic) — LLM-judged benchmark of the
+ *   authored mermaid itself (faithfulness to the ask, label quality,
+ *   readable size). Non-deterministic by nature → never blocks merge.
+ *
+ * Per the extract-don't-copy fixture rule, the prompt embeds only the skill's
+ * working section (from "# /diagram" onward), not the full generated SKILL.md
+ * with its preamble.
+ */
+import { describe, expect } from 'bun:test';
+import * as fs from 'node:fs';
+import * as path from 'node:path';
+import * as os from 'node:os';
+
+import { runSkillTest } from './helpers/session-runner';
+import {
+  ROOT, browseBin, runId,
+  describeIfSelected, testConcurrentIfSelected,
+  logCost,
+} from './helpers/e2e-helpers';
+import { callJudge } from './helpers/llm-judge';
+
+// Render bundle that setupDir() copies into each test's working directory.
+const BUNDLE = path.join(ROOT, 'lib', 'diagram-render', 'dist', 'diagram-render.html');
+
+/**
+ * Return diagram/SKILL.md from its first "# /diagram" marker to end of file,
+ * i.e. the skill's working section without the generated preamble.
+ * Throws when the marker is absent.
+ */
+function skillExtract(): string {
+  const skillDoc = fs.readFileSync(path.join(ROOT, 'diagram', 'SKILL.md'), 'utf-8');
+  const markerAt = skillDoc.indexOf('# /diagram');
+  if (markerAt === -1) {
+    throw new Error('diagram/SKILL.md missing "# /diagram" section — regenerate skill docs');
+  }
+  return skillDoc.slice(markerAt);
+}
+
+/**
+ * Create a temp working directory pre-staged for a /diagram run: the skill
+ * extract as diagram-skill.md, the render bundle as diagram-render.html, and
+ * an empty out/ directory.
+ *
+ * Callers invoke this before entering their try/finally, so a failure here
+ * (for example a missing bundle) would otherwise leak the half-built temp
+ * dir; it is removed before the error is rethrown.
+ *
+ * @param prefix mkdtemp name prefix inside os.tmpdir()
+ * @returns path of the staged directory
+ */
+function setupDir(prefix: string): string {
+  const dir = fs.mkdtempSync(path.join(os.tmpdir(), prefix));
+  try {
+    fs.writeFileSync(path.join(dir, 'diagram-skill.md'), skillExtract());
+    // Pre-stage the bundle so the test is hermetic (no global install needed in
+    // CI); the prompt tells the agent discovery is already done.
+    fs.copyFileSync(BUNDLE, path.join(dir, 'diagram-render.html'));
+    fs.mkdirSync(path.join(dir, 'out'));
+  } catch (err) {
+    // Best-effort cleanup; never let it mask the staging error.
+    try { fs.rmSync(dir, { recursive: true, force: true }); } catch { /* ignore */ }
+    throw err;
+  }
+  return dir;
+}
+
+/**
+ * Build the agent prompt for a /diagram run.
+ *
+ * @param dir not referenced in the prompt text — every path in it is
+ *   relative (./diagram-skill.md, ./out/...); callers pass the same
+ *   directory to runSkillTest as workingDirectory
+ * @param ask plain-English description of the diagram, appended verbatim
+ * @returns the full prompt string, with browseBin interpolated for $B
+ */
+function basePrompt(dir: string, ask: string): string {
+  return `You have the /diagram skill instructions at ./diagram-skill.md — read them and follow Steps 1-4.
+
+Environment notes (already set up — skip Step 2's bundle discovery):
+- The browse binary is at ${browseBin} — use it wherever the skill says $B.
+- The render bundle is ALREADY staged at ./diagram-render.html in this directory; load it with: ${browseBin} load-html ./diagram-render.html
+- Write all four artifacts into ./out/ with the slug "flow" (out/flow.mmd, out/flow.excalidraw, out/flow.svg, out/flow.png).
+- Do not open any other applications. Do not use the Read tool on the PNG (no inline display needed here).
+
+The diagram to create: ${ask}`;
+}
+
+describeIfSelected('/diagram skill E2E', ['diagram-triplet', 'diagram-authoring-quality'], () => {
+  // Gate-tier functional contract: the run must leave all four artifacts in
+  // out/ and each must be structurally valid. No quality judgment here.
+  testConcurrentIfSelected('diagram-triplet', async () => {
+    const dir = setupDir('diagram-triplet-');
+    try {
+      const result = await runSkillTest({
+        prompt: basePrompt(
+          dir,
+          'a flowchart (graph LR) of a 4-stage pipeline: markdown → prepass → Chromium → PDF.',
+        ),
+        workingDirectory: dir,
+        maxTurns: 25,
+        allowedTools: ['Bash', 'Read', 'Write'],
+        timeout: 240_000,
+        testName: 'diagram-triplet',
+        runId,
+      });
+      logCost('diagram triplet', result);
+      expect(result.exitReason).toBe('success');
+
+      // The deterministic contract: all four artifacts exist and parse.
+      // Accept either Mermaid flowchart keyword (`graph` / `flowchart`) and
+      // any direction, so a valid header the agent happens to pick cannot
+      // fail a merge-blocking test.
+      const mmd = fs.readFileSync(path.join(dir, 'out', 'flow.mmd'), 'utf-8');
+      expect(mmd).toMatch(/(graph|flowchart)\s+(LR|RL|TD|TB|BT)/);
+
+      const scene = JSON.parse(fs.readFileSync(path.join(dir, 'out', 'flow.excalidraw'), 'utf-8'));
+      expect(scene.type).toBe('excalidraw');
+      expect(Array.isArray(scene.elements)).toBe(true);
+      expect(scene.elements.length).toBeGreaterThan(3);
+
+      const svg = fs.readFileSync(path.join(dir, 'out', 'flow.svg'), 'utf-8');
+      expect(svg).toMatch(/<svg/i);
+
+      // PNG magic bytes (0x89 'P' 'N' 'G') plus a size floor.
+      const png = fs.readFileSync(path.join(dir, 'out', 'flow.png'));
+      expect(png.subarray(0, 4)).toEqual(Buffer.from([0x89, 0x50, 0x4e, 0x47]));
+      expect(png.length).toBeGreaterThan(5_000);
+    } finally {
+      try { fs.rmSync(dir, { recursive: true, force: true }); } catch { /* ignore */ }
+    }
+  }, 300_000);
+
+  // Periodic-tier benchmark: runs the skill on a richer ask, then has an LLM
+  // judge score the authored mermaid source.
+  testConcurrentIfSelected('diagram-authoring-quality', async () => {
+    const dir = setupDir('diagram-quality-');
+    try {
+      const result = await runSkillTest({
+        prompt: basePrompt(
+          dir,
+          'how gstack renders diagrams in PDFs: markdown containing mermaid fences goes through a pre-pass that extracts the fences, renders them in a browse daemon tab using an offline bundle, substitutes the SVG back in, inlines local images, and prints via Chromium. Failures become visible diagnostic blocks.',
+        ),
+        workingDirectory: dir,
+        maxTurns: 25,
+        allowedTools: ['Bash', 'Read', 'Write'],
+        timeout: 240_000,
+        testName: 'diagram-authoring-quality',
+        runId,
+      });
+      logCost('diagram authoring quality', result);
+      expect(result.exitReason).toBe('success');
+
+      // Only the .mmd source is judged; the SVG is checked just for presence.
+      const mmd = fs.readFileSync(path.join(dir, 'out', 'flow.mmd'), 'utf-8');
+      const svg = fs.readFileSync(path.join(dir, 'out', 'flow.svg'), 'utf-8');
+      expect(svg).toMatch(/<svg/i);
+
+      const verdict = await callJudge<{ score: number; reasoning: string }>(
+        `You are judging the quality of an agent-authored mermaid diagram.
+
+THE ASK: a diagram of gstack's PDF diagram-rendering flow — mermaid fences are
+extracted by a pre-pass, rendered in a browse tab via an offline bundle,
+substituted back as SVG, images inlined, printed by Chromium, with render
+failures becoming visible diagnostic blocks.
+
+THE AUTHORED MERMAID:
+\`\`\`mermaid
+${mmd}
+\`\`\`
+
+Score 1-10 on: faithfulness to the ask (are the named stages present and
+correctly ordered?), label quality (short node labels, detail on edges),
+and readable size (5-15 nodes, not a wall). A diagram that misses the
+failure/diagnostic path entirely caps at 5 — that path is an explicitly
+named requirement, so omitting it must fail the run.
+
+Respond with JSON: {"score": N, "reasoning": "..."}`,
+      );
+      // eslint-disable-next-line no-console
+      console.log(`[diagram-quality] score=${verdict.score} — ${verdict.reasoning}`);
+      // Pass mark is 6: one above the rubric's cap of 5 for a diagram that
+      // omits the failure/diagnostic path.
+      expect(verdict.score).toBeGreaterThanOrEqual(6);
+    } finally {
+      try { fs.rmSync(dir, { recursive: true, force: true }); } catch { /* ignore */ }
+    }
+  }, 300_000);
+});
@@ -0,0 +1,190 @@
+/**
+ * Hermetic-isolation canaries (gate tier, ~$0.02 each, deterministic).
+ *
+ * Two tests that make the hermeticity claim FALSIFIABLE instead of asserted:
+ *
+ * 1. `hermetic-canary` — env + auth isolation. Plants contamination vars in
+ *    the TEST process env, spawns a child through the real runner, and
+ *    asserts from the Bash tool_result in the stream-json transcript (never
+ *    the model's prose — prose can hallucinate) that the child saw a temp
+ *    `/.claude` config dir, a temp GSTACK_HOME, and none of the planted
+ *    contamination. Auth hermeticity: hard-fails when ANTHROPIC_API_KEY is
+ *    absent (a skip here would be a silent hole), and asserts
+ *    total_cost_usd > 0 — subscription/keychain OAuth reports cost 0, so
+ *    nonzero cost is the discriminator that the API key actually paid
+ *    (verified empirically 2026-06-12; the result record exposes no
+ *    auth-source field, so cost is the best available signal — residual
+ *    gap documented in the plan).
+ *
+ * 2. `hermetic-sentinel` — config isolation, the poisoned-operator probe.
+ *    Builds a FAKE operator config tree (user CLAUDE.md + an mcpServers
+ *    entry) and points the test process's CLAUDE_CONFIG_DIR at it. If the
+ *    hermetic redirect ever breaks, the child loads that poisoned tree and
+ *    the probes fire: init.mcp_servers would list the planted server
+ *    (semantic proof that --strict-mcp-config + the redirect yield ZERO MCP
+ *    servers, not an assumption), and the child's config dir would contain
+ *    the poisoned CLAUDE.md.
+ *
+ * Both canaries double as the seed-schema / CLI version-skew tripwire: a
+ * claude release that changes first-run behavior or config discovery fails
+ * here first, loudly, in the gate tier.
+ */
+
+import { expect, afterAll } from 'bun:test';
+import * as fs from 'fs';
+import * as path from 'path';
+import * as os from 'os';
+import { runSkillTest } from './helpers/session-runner';
+import {
+  describeIfSelected, testIfSelected, createEvalCollector, finalizeEvalCollector,
+  recordE2E, runId, logCost,
+} from './helpers/e2e-helpers';
+
+const evalCollector = createEvalCollector('e2e-hermetic');
+
+// Cheap + deterministic: the canaries assert environment facts, not model
+// quality, so the smallest model is the right tool.
+const CANARY_MODEL = 'claude-haiku-4-5-20251001';
+
+/**
+ * Extract concatenated tool_result text from the stream-json transcript.
+ *
+ * Only events whose `type` is 'user' are scanned. Within each, every content
+ * item of type 'tool_result' contributes either its `content` directly (when
+ * it is a string) or the `text` of each part whose `type` is 'text' (when it
+ * is a list). Parts of any other type are skipped.
+ *
+ * @param transcript Parsed transcript events.
+ * @returns The collected text chunks joined with newlines; '' when none match.
+ */
+function toolResultText(transcript: any[]): string {
+  const chunks: string[] = [];
+  for (const event of transcript) {
+    if (event.type !== 'user') continue;
+    // A missing message/content is treated as an empty list.
+    for (const item of event.message?.content ?? []) {
+      if (item.type !== 'tool_result') continue;
+      // String content is taken as-is; list content is filtered down to text parts.
+      if (typeof item.content === 'string') chunks.push(item.content);
+      else for (const c of item.content ?? []) if (c.type === 'text') chunks.push(c.text);
+    }
+  }
+  return chunks.join('\n');
+}
+
+/**
+ * Return the first transcript event with `type === 'system'` and
+ * `subtype === 'init'`, or `undefined` when the transcript has none.
+ */
+function initEvent(transcript: any[]): any {
+  return transcript.find((e) => e.type === 'system' && e.subtype === 'init');
+}
+
+describeIfSelected('hermetic isolation canaries', ['hermetic-canary', 'hermetic-sentinel'], () => {
+  // Env + auth isolation probe: plants contamination variables in this
+  // process's env, runs a child through runSkillTest, then checks the Bash
+  // tool output, the init event's MCP server list, and the reported cost.
+  testIfSelected('hermetic-canary', async () => {
+    // Auth hermeticity is part of the contract: a missing key must FAIL the
+    // gate, not skip it — a skipped canary is a silent hole.
+    if (!process.env.ANTHROPIC_API_KEY) {
+      throw new Error('hermetic-canary requires ANTHROPIC_API_KEY (source ~/.zshrc); refusing to skip');
+    }
+
+    const workDir = fs.mkdtempSync(path.join(os.tmpdir(), 'hermetic-canary-'));
+    // Plant contamination deterministically — the operator env may or may not
+    // carry these, so set them ourselves and restore after.
+    const planted: Record<string, string> = {
+      CONDUCTOR_WORKSPACE_PATH: '/tmp/poison-conductor-ws',
+      GBRAIN_POISON_PROBE: 'leaked',
+    };
+    // Remember each variable's prior value (possibly undefined) before overwriting it.
+    const prev: Record<string, string | undefined> = {};
+    for (const [k, v] of Object.entries(planted)) { prev[k] = process.env[k]; process.env[k] = v; }
+
+    try {
+      const result = await runSkillTest({
+        prompt: 'Run exactly this bash command and then stop: ' +
+          'echo "CFG=$CLAUDE_CONFIG_DIR"; echo "GH=$GSTACK_HOME"; ' +
+          'echo "CW=$CONDUCTOR_WORKSPACE_PATH"; echo "GP=$GBRAIN_POISON_PROBE"',
+        workingDirectory: workDir,
+        maxTurns: 3,
+        allowedTools: ['Bash'],
+        timeout: 120_000,
+        testName: 'hermetic-canary',
+        runId,
+        model: CANARY_MODEL,
+      });
+      logCost('hermetic-canary', result);
+      recordE2E(evalCollector, 'hermetic-canary', 'e2e-hermetic', result);
+
+      expect(result.exitReason).toBe('success');
+
+      // Deterministic: assert the Bash tool OUTPUT, not the model's prose.
+      const bashOut = toolResultText(result.transcript);
+      const cfg = bashOut.match(/CFG=(\S*)/)?.[1] ?? '';
+      expect(cfg).toMatch(/gstack-hermetic-.*\/\.claude$/);
+      expect(bashOut).toMatch(/GH=\S*gstack-home/);
+      // Planted contamination must not reach the child. CLAUDECODE is NOT
+      // probed here: the child claude CLI sets CLAUDECODE=1 for its own tool
+      // subprocesses (verified empirically — CI behaves identically), so the
+      // Bash tool can't observe our scrub of it; the unit test pins that.
+      expect(bashOut).toMatch(/(^|\n)CW=\s*($|\n)/); // planted Conductor var scrubbed
+      expect(bashOut).toMatch(/(^|\n)GP=\s*($|\n)/); // GBRAIN_* scrubbed
+
+      // Zero MCP servers — semantic, from the init event, not a flag grep.
+      const init = initEvent(result.transcript);
+      expect(init).toBeTruthy();
+      expect(init.mcp_servers ?? []).toHaveLength(0);
+
+      // Auth: nonzero cost = the API key paid (OAuth/keychain reports 0).
+      expect(result.transcript.find((e) => e.type === 'result')?.total_cost_usd).toBeGreaterThan(0);
+    } finally {
+      // Put the planted variables back exactly as they were: delete the ones
+      // that were previously unset, reassign the ones that had a value.
+      for (const [k, v] of Object.entries(prev)) {
+        if (v === undefined) delete process.env[k]; else process.env[k] = v;
+      }
+      fs.rmSync(workDir, { recursive: true, force: true });
+    }
+  }, 180_000);
+
+  // Config isolation probe: points this process's CLAUDE_CONFIG_DIR at a
+  // poisoned temp tree, runs a child through runSkillTest, then checks that
+  // the child reports a different config dir, finds no CLAUDE.md in it, and
+  // lists no MCP servers in its init event.
+  testIfSelected('hermetic-sentinel', async () => {
+    if (!process.env.ANTHROPIC_API_KEY) {
+      throw new Error('hermetic-sentinel requires ANTHROPIC_API_KEY (source ~/.zshrc); refusing to skip');
+    }
+
+    const workDir = fs.mkdtempSync(path.join(os.tmpdir(), 'hermetic-sentinel-'));
+    // Poisoned operator config tree: if the hermetic redirect breaks, the
+    // child discovers this dir and both probes below fire.
+    const poisonRoot = fs.mkdtempSync(path.join(os.tmpdir(), 'hermetic-poison-'));
+    const poisonCfg = path.join(poisonRoot, '.claude');
+    fs.mkdirSync(poisonCfg, { recursive: true });
+    fs.writeFileSync(path.join(poisonCfg, 'CLAUDE.md'), 'POISONED OPERATOR MEMORY — must never load\n');
+    fs.writeFileSync(path.join(poisonCfg, '.claude.json'), JSON.stringify({
+      hasCompletedOnboarding: true,
+      mcpServers: { 'sentinel-mcp': { command: '/usr/bin/true', args: [] } },
+    }));
+    // Save the current value so the finally block can restore it.
+    const prevCfgDir = process.env.CLAUDE_CONFIG_DIR;
+    process.env.CLAUDE_CONFIG_DIR = poisonCfg;
+
+    try {
+      const result = await runSkillTest({
+        prompt: 'Run exactly this bash command and then stop: ' +
+          'echo "CFG=$CLAUDE_CONFIG_DIR"; ' +
+          'if [ -f "$CLAUDE_CONFIG_DIR/CLAUDE.md" ]; then echo "USER_MD=present"; else echo "USER_MD=absent"; fi',
+        workingDirectory: workDir,
+        maxTurns: 3,
+        allowedTools: ['Bash'],
+        timeout: 120_000,
+        testName: 'hermetic-sentinel',
+        runId,
+        model: CANARY_MODEL,
+      });
+      logCost('hermetic-sentinel', result);
+      recordE2E(evalCollector, 'hermetic-sentinel', 'e2e-hermetic', result);
+
+      expect(result.exitReason).toBe('success');
+
+      const bashOut = toolResultText(result.transcript);
+      const cfg = bashOut.match(/CFG=(\S*)/)?.[1] ?? '';
+      // The redirect must beat the poisoned operator value...
+      expect(cfg).not.toBe(poisonCfg);
+      expect(cfg).toMatch(/gstack-hermetic-.*\/\.claude$/);
+      // ...and the active config dir must not carry the poisoned user memory.
+      expect(bashOut).toContain('USER_MD=absent');
+
+      // The planted MCP server must be invisible: zero servers in init.
+      const init = initEvent(result.transcript);
+      expect(init).toBeTruthy();
+      // Entries are reduced to their `name` when they have one, else kept as-is.
+      const servers = (init.mcp_servers ?? []).map((s: any) => s?.name ?? s);
+      expect(servers).toHaveLength(0);
+      expect(JSON.stringify(servers)).not.toContain('sentinel-mcp');
+    } finally {
+      // Restore CLAUDE_CONFIG_DIR (deleting it if it was unset before), then
+      // remove both temp trees created above.
+      if (prevCfgDir === undefined) delete process.env.CLAUDE_CONFIG_DIR;
+      else process.env.CLAUDE_CONFIG_DIR = prevCfgDir;
+      fs.rmSync(workDir, { recursive: true, force: true });
+      fs.rmSync(poisonRoot, { recursive: true, force: true });
+    }
+  }, 180_000);
+});
+
+afterAll(() => finalizeEvalCollector(evalCollector));
@@ -36,6 +36,16 @@ afterEach(() => {
  rmSync(workDir, { recursive: true, force: true });
 });

+// Under `bun test --concurrent`, overlapping tests read the SAME shared
+// `workDir` binding (beforeEach reassigns it mid-flight), so a fixed
+// 'daemon.pid' name collides: the first daemon claims it and every sibling
+// gets already_running against the test process's own (always-alive) pid —
+// the exact failure seen in full gate runs at 15-way concurrency. Unique
+// per-claim pidfiles keep the single-instance semantics under test while
+// removing the cross-test collision.
+// Module-level counter; pre-incremented on every uniquePidfile() call.
+let pidfileSeq = 0;
+// Builds a `daemon-<n>.pid` path under whatever `workDir` is bound to at call
+// time; <n> never repeats within this module, so no two calls share a name.
+const uniquePidfile = () => join(workDir, `daemon-${++pidfileSeq}.pid`);
+
 interface StubState {
  loggedIn: boolean;
  username: string;
@@ -205,7 +215,7 @@ class AppState {
      const daemon = await startDaemon({
        loopbackPort: 0,
        tailnetEnabled: false,
-        pidfilePath: join(workDir, 'daemon.pid'),
+        pidfilePath: uniquePidfile(),
        tunnelProvider: async () => tunnel,
      });
      if ('error' in daemon) throw new Error(daemon.error);
@@ -249,7 +259,7 @@ describe('ios-qa E2E (agent-flow simulation)', () => {
      const daemon = await startDaemon({
        loopbackPort: 0,
        tailnetEnabled: false,
-        pidfilePath: join(workDir, 'daemon.pid'),
+        pidfilePath: uniquePidfile(),
        tunnelProvider: async () => tunnel,
      });
      if ('error' in daemon) throw new Error(daemon.error);
@@ -314,7 +324,7 @@ describe('ios-qa E2E (agent-flow simulation)', () => {
      const daemon = await startDaemon({
        loopbackPort: 0,
        tailnetEnabled: false,
-        pidfilePath: join(workDir, 'daemon.pid'),
+        pidfilePath: uniquePidfile(),
        tunnelProvider: async () => tunnel,
      });
      if ('error' in daemon) throw new Error(daemon.error);
@@ -352,7 +362,7 @@ describe('ios-qa E2E (agent-flow simulation)', () => {
      const daemon = await startDaemon({
        loopbackPort: 0,
        tailnetEnabled: true,
-        pidfilePath: join(workDir, 'daemon.pid'),
+        pidfilePath: uniquePidfile(),
        tunnelProvider: async () => tunnel,
        probeImpl: async () => ({ ok: true, ownIdentity: 'mac@e2e' }),
        whoIsImpl: async () => ({ identity: 'agent@e2e', raw: {} }),
@@ -430,7 +440,7 @@ describe('ios-qa E2E (agent-flow simulation)', () => {
      const daemon = await startDaemon({
        loopbackPort: 0,
        tailnetEnabled: true,
-        pidfilePath: join(workDir, 'daemon.pid'),
+        pidfilePath: uniquePidfile(),
        tunnelProvider: async () => tunnel,
        probeImpl: async () => ({ ok: true, ownIdentity: 'mac@e2e' }),
        whoIsImpl: async () => ({ identity: 'readonly@e2e', raw: {} }),
@@ -546,10 +546,13 @@ async function runWorkflowJudge(opts: {
  // slice markers vanish from the skeleton and the judge scores empty content.
  let content = fs.readFileSync(path.join(ROOT, opts.skillPath), 'utf-8');
  const secDir = path.join(ROOT, path.dirname(opts.skillPath), 'sections');
+  const sectionBodies: string[] = [];
  if (fs.existsSync(secDir)) {
    for (const f of fs.readdirSync(secDir).sort()) {
      if (f.endsWith('.md') && !f.endsWith('.md.tmpl')) {
-        content += '\n' + fs.readFileSync(path.join(secDir, f), 'utf-8');
+        const body = fs.readFileSync(path.join(secDir, f), 'utf-8');
+        sectionBodies.push(body);
+        content += '\n' + body;
      }
    }
  }
@@ -565,6 +568,17 @@ async function runWorkflowJudge(opts: {
    section = content.slice(startIdx);
  }

+  // Two carve shapes exist. plan-eng/plan-design moved the MARKERS into the
+  // section files, so the slice above already reaches the carved content.
+  // document-release instead keeps its markers in the skeleton and carves the
+  // workflow BODY (Steps 2-9 → sections/release-body.md) AFTER the endMarker,
+  // so the marker slice drops it. Re-append any carved section the window
+  // excluded, so the judge always sees the full workflow the agent executes.
+  for (const body of sectionBodies) {
+    const head = body.trim().slice(0, 120);
+    if (head && !section.includes(head)) section += '\n' + body;
+  }
+
  const scores = await callJudge<JudgeScore>(`You are evaluating the quality of ${opts.judgeContext} for an AI coding agent.

 The agent reads this document to learn ${opts.judgeGoal}. It references external tools and files
@@ -1386,15 +1386,16 @@ describe('Codex skill', () => {
    expect(content).toContain('Adversarial review (always-on)');
    // Always-on: both Claude and Codex adversarial
    expect(content).toContain('Claude adversarial subagent (always runs)');
-    expect(content).toContain('Codex adversarial challenge (always runs when available)');
+    expect(content).toContain('Codex adversarial challenge (runs whenever');
    // Claude adversarial subagent dispatch
    expect(content).toContain('Agent tool');
    expect(content).toContain('FIXABLE');
    expect(content).toContain('INVESTIGATE');
-    // Codex availability check
-    expect(content).toContain('CODEX_NOT_AVAILABLE');
-    // OLD_CFG only gates Codex, not Claude
-    expect(content).toContain('skip Codex passes only');
+    // Probe-based availability via the shared codexPreflight() (install + auth)
+    expect(content).toContain('CODEX_MODE');
+    expect(content).toContain('command -v codex'); // install check kept literal
+    // codex_reviews=disabled gates Codex passes only; Claude adversarial still runs
+    expect(content).toContain('skip the Codex passes ONLY');
    // Review log
    expect(content).toContain('adversarial-review');
    expect(content).toContain('reasoning_effort="high"');
@@ -1449,6 +1450,43 @@ describe('Codex skill', () => {
    expect(content).toContain('codex exec');
  });

+  // D5 regression guard: the Codex outside voice is default-on, not opt-in. A future
+  // gen-skill-docs change must not silently reintroduce the "Want an outside voice?"
+  // AskUserQuestion. The CODEX_PLAN_REVIEW content renders into each skill's
+  // sections/review-sections.md (the skeleton points at it). plan-design-review uses
+  // DESIGN_OUTSIDE_VOICES, not CODEX_PLAN_REVIEW, so it is excluded here.
+  test('plan reviews run the Codex outside voice default-on (no opt-in question)', () => {
+    // Each listed skill is checked against its own sections/review-sections.md.
+    for (const skill of ['plan-eng-review', 'plan-ceo-review', 'plan-devex-review']) {
+      const content = fs.readFileSync(
+        path.join(ROOT, skill, 'sections', 'review-sections.md'), 'utf-8');
+      // The opt-in prompt text must be absent...
+      expect(content).not.toContain('Want an outside voice');
+      // ...and the default-on heading plus the two preflight strings must be present.
+      expect(content).toContain('Outside Voice — Independent Plan Challenge (default-on)');
+      expect(content).toContain('CODEX_MODE');
+      expect(content).toContain('command -v codex'); // preflight install check (e2e relies on it)
+    }
+  });
+
+  test('/document-release includes the default-on Codex documentation review', () => {
+    // The doc-review renders into the carved release-body section (kept out of the
+    // always-loaded skeleton to respect the skeleton-byte budget).
+    const content = fs.readFileSync(
+      path.join(ROOT, 'document-release', 'sections', 'release-body.md'), 'utf-8');
+    // All three literal strings must appear in that section file.
+    expect(content).toContain('Codex Documentation Review (default-on)');
+    expect(content).toContain('CODEX_MODE');
+    expect(content).toContain('codex-doc-review');
+  });
+
+  test('codex-host document-release does NOT contain the Codex doc review', () => {
+    // .agents/ is gitignored — generate on demand (codex never invokes itself)
+    const gen = Bun.spawnSync(['bun', 'run', 'scripts/gen-skill-docs.ts', '--host', 'codex'], {
+      cwd: ROOT, stdout: 'pipe', stderr: 'pipe',
+    });
+    // Fail loudly when generation breaks. Otherwise the read below either dies
+    // with an unrelated ENOENT or picks up a stale file from an earlier run,
+    // and the negative assertions would be checking that stale output.
+    if (gen.exitCode !== 0) {
+      throw new Error(
+        `gen-skill-docs --host codex exited ${gen.exitCode}: ${gen.stderr.toString()}`);
+    }
+    const content = fs.readFileSync(
+      path.join(ROOT, '.agents', 'skills', 'gstack-document-release', 'SKILL.md'), 'utf-8');
+    // Neither the review heading nor its identifier may appear in the codex-host render.
+    expect(content).not.toContain('Codex Documentation Review');
+    expect(content).not.toContain('codex-doc-review');
+  });
+
  test('codex review invocations avoid the prompt plus --base argument shape', () => {
    for (const rel of ['codex/SKILL.md', 'review/SKILL.md', 'ship/SKILL.md']) {
      // ship's codex command moved into sections/adversarial.md (T9 carve).