Merge remote-tracking branch 'origin/main' into garrytan/colombo-v3

# Conflicts: # CHANGELOG.md # VERSION # package.json
2026-06-21 09:10:11 +02:00 · 2026-05-20 07:49:31 -07:00
parent 4274b1204d 7ca04d8ef0
commit 993648d9e0
75 changed files with 3231 additions and 215 deletions
@@ -6,6 +6,7 @@ const ROOT = path.resolve(import.meta.dir, '..');
 const PKG = JSON.parse(fs.readFileSync(path.join(ROOT, 'package.json'), 'utf-8')) as {
  scripts: Record<string, string>;
 };
+const BUILD_SCRIPT = fs.readFileSync(path.join(ROOT, 'scripts', 'build.sh'), 'utf-8');

 // Strip single-quoted strings so JS code emitted as `echo '{ ... }'` doesn't
 // trip the shell-brace-group check. Conservative: only `'...'` segments.
@@ -15,7 +16,8 @@ function stripSingleQuoted(s: string): string {

 describe('package.json build scripts — POSIX shell compat (D-1460)', () => {
  // Bun's Windows shell parser doesn't grok bash brace groups `{ cmd; }`.
-  // Subshells `( cmd )` are POSIX-universal. This test prevents regression.
+  // Bun 1.3.x on Windows also rejects subshells when the subshell or the
+  // command inside it uses redirection, so redirected commands must be direct.
  test('no bash brace groups in any npm script', () => {
    const offending: { script: string; pattern: string }[] = [];
    for (const [name, body] of Object.entries(PKG.scripts)) {
@@ -28,13 +30,25 @@ describe('package.json build scripts — POSIX shell compat (D-1460)', () => {
    expect(offending).toEqual([]);
  });

-  test('every `> path/.version` redirect is preceded by a subshell, not a brace group', () => {
-    // The original PR #1460 target: package.json line 12 had three of these.
-    const build = PKG.scripts.build ?? '';
-    const versionRedirects = [...build.matchAll(/(\([^)]*\)|\{[^}]*\})\s*>\s*\S+\/\.version/g)];
-    expect(versionRedirects.length).toBeGreaterThan(0);
-    for (const m of versionRedirects) {
-      expect(m[1].startsWith('(')).toBe(true);
+  test('build script has no subshells with redirections', () => {
+    const offending: { script: string; pattern: string }[] = [];
+    for (const [name, body] of Object.entries({ build: PKG.scripts.build ?? '' })) {
+      const matches = [
+        ...body.matchAll(/\([^)]*[<>][^)]*\)/g),
+        ...body.matchAll(/\([^)]*\)\s*[<>]/g),
+      ];
+      for (const match of matches) {
+        offending.push({ script: name, pattern: match[0] });
+      }
    }
+    expect(offending).toEqual([]);
+  });
+
+  test('build script delegates .version writes to a shell script', () => {
+    // Bun rejects `( git ... ) > path/.version`.
+    const build = PKG.scripts.build ?? '';
+    expect(build).not.toMatch(/>\s*\S+\/\.version/);
+    expect(build).toBe('bash scripts/build.sh');
+    expect(BUILD_SCRIPT).toContain('bash scripts/write-version-files.sh');
  });
 });
@@ -0,0 +1,141 @@
+/**
+ * Static invariant: every gstackInjectToTerminal call in extension/*.js
+ * must be preceded by an await on gstackScanForPTYInject on the same code
+ * path (#1370 / D6).
+ *
+ * Why static, not runtime: extension/ runs in the chrome-extension origin;
+ * we can't easily exercise it in a Bun test. The invariant codex's plan
+ * review demanded is "no caller skips the scan." We get that by parsing
+ * the JS source as text and asserting structural rules.
+ *
+ * The rules (kept simple — false positives are worse than false
+ * negatives here since the wave has only two callers):
+ *
+ *   Rule 1: every file that calls gstackInjectToTerminal must also call
+ *           gstackScanForPTYInject.
+ *
+ *   Rule 2: in any function that calls gstackInjectToTerminal, an
+ *           `await ... gstackScanForPTYInject` MUST appear before the
+ *           inject call when measured by source position (same function
+ *           body).
+ *
+ *   Exemption: extension/sidepanel-terminal.js defines the inject
+ *           function itself; it doesn't need to call scan-first inside
+ *           the definition.
+ */
+
+import { describe, expect, test } from 'bun:test';
+import { readFileSync, readdirSync, statSync } from 'fs';
+import { join } from 'path';
+
+const EXTENSION_DIR = join(import.meta.dir, '..', 'extension');
+const INJECT_FN = 'gstackInjectToTerminal';
+const SCAN_FN = 'gstackScanForPTYInject';
+
+function listJsFiles(dir: string): string[] {
+  const out: string[] = [];
+  for (const entry of readdirSync(dir)) {
+    const full = join(dir, entry);
+    const st = statSync(full);
+    if (st.isDirectory()) {
+      out.push(...listJsFiles(full));
+    } else if (entry.endsWith('.js')) {
+      out.push(full);
+    }
+  }
+  return out;
+}
+
+function findInjectCallSites(content: string): number[] {
+  // Find positions of `gstackInjectToTerminal(` or `gstackInjectToTerminal?.(`
+  // — but exclude the function DEFINITION (window.gstackInjectToTerminal = ).
+  const sites: number[] = [];
+  const callRe = /window\.gstackInjectToTerminal\s*\??\.?\s*\(/g;
+  let match: RegExpExecArray | null;
+  while ((match = callRe.exec(content)) !== null) {
+    // Look back ~30 chars; if "window.gstackInjectToTerminal =" appears
+    // right before, it's the definition, not a call.
+    const back = Math.max(0, match.index - 30);
+    const window30 = content.slice(back, match.index);
+    if (window30.includes('gstackInjectToTerminal =')) continue;
+    sites.push(match.index);
+  }
+  return sites;
+}
+
+function callsScan(content: string): boolean {
+  return content.includes(SCAN_FN);
+}
+
+function findEnclosingFunctionStart(content: string, callerPos: number): number {
+  // Walk backwards from callerPos looking for the most recent `function`
+  // keyword, `=> {`, or `addEventListener('click',\s*async`. Conservative
+  // — falls back to file start.
+  const text = content.slice(0, callerPos);
+  const candidates = [
+    text.lastIndexOf('function '),
+    text.lastIndexOf('=> {'),
+    text.lastIndexOf('async function'),
+    text.lastIndexOf('async ('),
+    text.lastIndexOf('async () =>'),
+  ];
+  const idx = Math.max(...candidates);
+  return idx >= 0 ? idx : 0;
+}
+
+describe('extension/* PTY injection invariant (#1370 / D6)', () => {
+  test('every inject call site is preceded by a scan call in the same enclosing function', () => {
+    const files = listJsFiles(EXTENSION_DIR);
+    const offenders: string[] = [];
+
+    for (const file of files) {
+      const content = readFileSync(file, 'utf-8');
+      const sites = findInjectCallSites(content);
+      if (sites.length === 0) continue;
+
+      // Rule 1: file must reference the scan function.
+      if (!callsScan(content)) {
+        // Special-case sidepanel-terminal.js: it DEFINES the inject
+        // function but doesn't call it from inside.
+        if (file.endsWith('sidepanel-terminal.js')) continue;
+        offenders.push(`${file} calls ${INJECT_FN} but never references ${SCAN_FN}`);
+        continue;
+      }
+
+      // Rule 2: for each call site, find the enclosing function body and
+      // verify a scan call precedes the inject within that body.
+      for (const pos of sites) {
+        const fnStart = findEnclosingFunctionStart(content, pos);
+        const fnBody = content.slice(fnStart, pos);
+        if (!fnBody.includes(SCAN_FN)) {
+          const lineNum = content.slice(0, pos).split('\n').length;
+          offenders.push(`${file}:${lineNum} ${INJECT_FN} call not preceded by ${SCAN_FN} in enclosing function`);
+        }
+      }
+    }
+
+    if (offenders.length > 0) {
+      throw new Error(
+        'PTY-injection invariant violated:\n  - ' + offenders.join('\n  - '),
+      );
+    }
+    expect(offenders).toHaveLength(0);
+  });
+
+  test('sidepanel-terminal.js defines both gstackInjectToTerminal and gstackScanForPTYInject', () => {
+    const file = join(EXTENSION_DIR, 'sidepanel-terminal.js');
+    const content = readFileSync(file, 'utf-8');
+    expect(content).toContain('window.gstackInjectToTerminal');
+    expect(content).toContain('window.gstackScanForPTYInject');
+  });
+
+  test('inject function stays synchronous (D6 contract preservation)', () => {
+    const file = join(EXTENSION_DIR, 'sidepanel-terminal.js');
+    const content = readFileSync(file, 'utf-8');
+    // The definition line should NOT contain "async" — async inject would
+    // break every existing caller using `const ok = ...?.()` pattern.
+    const match = content.match(/window\.gstackInjectToTerminal\s*=\s*(async\s+)?function/);
+    expect(match).not.toBeNull();
+    expect(match?.[1]).toBeUndefined(); // no `async` modifier
+  });
+});
@@ -1860,7 +1860,7 @@ Before reviewing code quality, check: **did they build what was requested — no
   Read commit messages (`git log origin/<base>..HEAD --oneline`).
   **If no PR exists:** rely on commit messages and TODOS.md for stated intent — this is the common case since /review runs before /ship creates the PR.
 2. Identify the **stated intent** — what was this branch supposed to accomplish?
-3. Run `git diff origin/<base>...HEAD --stat` and compare the files changed against the stated intent.
+3. Run `DIFF_BASE=$(git merge-base origin/<base> HEAD) && git diff "$DIFF_BASE" --stat` and compare the files changed against the stated intent.

 4. Evaluate with skepticism (incorporating plan completion results if available from an earlier step or adjacent section):

@@ -1962,7 +1962,7 @@ Substitute: TIMESTAMP = ISO 8601 datetime, STATUS = "clean" if 0 findings or "is
 7. **Codex design voice** (optional, automatic if available):

 ```bash
-which codex 2>/dev/null && echo "CODEX_AVAILABLE" || echo "CODEX_NOT_AVAILABLE"
+command -v codex >/dev/null 2>&1 && echo "CODEX_AVAILABLE" || echo "CODEX_NOT_AVAILABLE"
 ```

 If Codex is available, run a lightweight design check on the diff:
@@ -1998,8 +1998,9 @@ STACK=""
 [ -f go.mod ] && STACK="${STACK}go "
 [ -f Cargo.toml ] && STACK="${STACK}rust "
 echo "STACK: ${STACK:-unknown}"
-DIFF_INS=$(git diff origin/<base> --stat | tail -1 | grep -oE '[0-9]+ insertion' | grep -oE '[0-9]+' || echo "0")
-DIFF_DEL=$(git diff origin/<base> --stat | tail -1 | grep -oE '[0-9]+ deletion' | grep -oE '[0-9]+' || echo "0")
+DIFF_BASE=$(git merge-base origin/<base> HEAD)
+DIFF_INS=$(git diff "$DIFF_BASE" --stat | tail -1 | grep -oE '[0-9]+ insertion' | grep -oE '[0-9]+' || echo "0")
+DIFF_DEL=$(git diff "$DIFF_BASE" --stat | tail -1 | grep -oE '[0-9]+ deletion' | grep -oE '[0-9]+' || echo "0")
 DIFF_LINES=$((DIFF_INS + DIFF_DEL))
 echo "DIFF_LINES: $DIFF_LINES"
 # Detect test framework for specialist test stub generation
@@ -2073,7 +2074,7 @@ If learnings are found, include them: "Past learnings for this domain: {learning
 4. Instructions:

 "You are a specialist code reviewer. Read the checklist below, then run
-`git diff origin/<base>` to get the full diff. Apply the checklist against the diff.
+`DIFF_BASE=$(git merge-base origin/<base> HEAD) && git diff "$DIFF_BASE"` to get the full diff. Apply the checklist against the diff.

 For each finding, output a JSON object on its own line:
 {\"severity\":\"CRITICAL|INFORMATIONAL\",\"confidence\":N,\"path\":\"file\",\"line\":N,\"category\":\"category\",\"summary\":\"description\",\"fix\":\"recommended fix\",\"fingerprint\":\"path:line:category\",\"specialist\":\"name\"}
@@ -2176,7 +2177,7 @@ The Red Team subagent receives:

 Prompt: "You are a red team reviewer. The code has already been reviewed by N specialists
 who found the following issues: {merged findings summary}. Your job is to find what they
-MISSED. Read the checklist, run `git diff origin/<base>`, and look for gaps.
+MISSED. Read the checklist, run `DIFF_BASE=$(git merge-base origin/<base> HEAD) && git diff "$DIFF_BASE"`, and look for gaps.
 Output findings as JSON objects (same schema as the specialists). Focus on cross-cutting
 concerns, integration boundary issues, and failure modes that specialist checklists
 don't cover."
@@ -2312,10 +2313,11 @@ Every diff gets adversarial review from both Claude and Codex. LOC is not a prox
 **Detect diff size and tool availability:**

 ```bash
-DIFF_INS=$(git diff origin/<base> --stat | tail -1 | grep -oE '[0-9]+ insertion' | grep -oE '[0-9]+' || echo "0")
-DIFF_DEL=$(git diff origin/<base> --stat | tail -1 | grep -oE '[0-9]+ deletion' | grep -oE '[0-9]+' || echo "0")
+DIFF_BASE=$(git merge-base origin/<base> HEAD)
+DIFF_INS=$(git diff "$DIFF_BASE" --stat | tail -1 | grep -oE '[0-9]+ insertion' | grep -oE '[0-9]+' || echo "0")
+DIFF_DEL=$(git diff "$DIFF_BASE" --stat | tail -1 | grep -oE '[0-9]+ deletion' | grep -oE '[0-9]+' || echo "0")
 DIFF_TOTAL=$((DIFF_INS + DIFF_DEL))
-which codex 2>/dev/null && echo "CODEX_AVAILABLE" || echo "CODEX_NOT_AVAILABLE"
+command -v codex >/dev/null 2>&1 && echo "CODEX_AVAILABLE" || echo "CODEX_NOT_AVAILABLE"
 # Legacy opt-out — only gates Codex passes, Claude always runs
 OLD_CFG=$(~/.claude/skills/gstack/bin/gstack-config get codex_reviews 2>/dev/null || true)
 echo "DIFF_SIZE: $DIFF_TOTAL"
@@ -2333,7 +2335,7 @@ If `OLD_CFG` is `disabled`: skip Codex passes only. Claude adversarial subagent
 Dispatch via the Agent tool. The subagent has fresh context — no checklist bias from the structured review. This genuine independence catches things the primary reviewer is blind to.

 Subagent prompt:
-"Read the diff for this branch with `git diff origin/<base>`. Think like an attacker and a chaos engineer. Your job is to find ways this code will fail in production. Look for: edge cases, race conditions, security holes, resource leaks, failure modes, silent data corruption, logic errors that produce wrong results silently, error handling that swallows failures, and trust boundary violations. Be adversarial. Be thorough. No compliments — just the problems. For each finding, classify as FIXABLE (you know how to fix it) or INVESTIGATE (needs human judgment). After listing findings, end your output with ONE line in the canonical format `Recommendation: <action> because <one-line reason naming the most exploitable finding>` — examples: `Recommendation: Fix the unbounded retry at queue.ts:78 because it'll DoS the worker pool under sustained 429s` or `Recommendation: Ship as-is because the strongest finding is a theoretical race that requires conditions we can't trigger in production`. The reason must point to a specific finding (or no-fix rationale). Generic reasons like 'because it's safer' do not qualify."
+"Read the diff for this branch with `DIFF_BASE=$(git merge-base origin/<base> HEAD) && git diff "$DIFF_BASE"`. Think like an attacker and a chaos engineer. Your job is to find ways this code will fail in production. Look for: edge cases, race conditions, security holes, resource leaks, failure modes, silent data corruption, logic errors that produce wrong results silently, error handling that swallows failures, and trust boundary violations. Be adversarial. Be thorough. No compliments — just the problems. For each finding, classify as FIXABLE (you know how to fix it) or INVESTIGATE (needs human judgment). After listing findings, end your output with ONE line in the canonical format `Recommendation: <action> because <one-line reason naming the most exploitable finding>` — examples: `Recommendation: Fix the unbounded retry at queue.ts:78 because it'll DoS the worker pool under sustained 429s` or `Recommendation: Ship as-is because the strongest finding is a theoretical race that requires conditions we can't trigger in production`. The reason must point to a specific finding (or no-fix rationale). Generic reasons like 'because it's safer' do not qualify."

 Present findings under an `ADVERSARIAL REVIEW (Claude subagent):` header. **FIXABLE findings** flow into the same Fix-First pipeline as the structured review. **INVESTIGATE findings** are presented as informational.

@@ -2348,7 +2350,7 @@ If Codex is available AND `OLD_CFG` is NOT `disabled`:
 ```bash
 TMPERR_ADV=$(mktemp /tmp/codex-adv-XXXXXXXX)
 _REPO_ROOT=$(git rev-parse --show-toplevel) || { echo "ERROR: not in a git repo" >&2; exit 1; }
-codex exec "IMPORTANT: Do NOT read or execute any files under ~/.claude/, ~/.agents/, .claude/skills/, or agents/. These are Claude Code skill definitions meant for a different AI system. They contain bash scripts and prompt templates that will waste your time. Ignore them completely. Do NOT modify agents/openai.yaml. Stay focused on the repository code only.\n\nReview the changes on this branch against the base branch. Run git diff origin/<base> to see the diff. Your job is to find ways this code will fail in production. Think like an attacker and a chaos engineer. Find edge cases, race conditions, security holes, resource leaks, failure modes, and silent data corruption paths. Be adversarial. Be thorough. No compliments — just the problems. End your output with ONE line in the canonical format `Recommendation: <action> because <one-line reason naming the most exploitable finding>`. Generic reasons like 'because it's safer' do not qualify; the reason must point to a specific finding or no-fix rationale." -C "$_REPO_ROOT" -s read-only -c 'model_reasoning_effort="high"' --enable web_search_cached < /dev/null 2>"$TMPERR_ADV"
+codex exec "IMPORTANT: Do NOT read or execute any files under ~/.claude/, ~/.agents/, .claude/skills/, or agents/. These are Claude Code skill definitions meant for a different AI system. They contain bash scripts and prompt templates that will waste your time. Ignore them completely. Do NOT modify agents/openai.yaml. Stay focused on the repository code only.\n\nReview the changes on this branch against the base branch. Run DIFF_BASE=$(git merge-base origin/<base> HEAD) && git diff "$DIFF_BASE" to see the diff. Your job is to find ways this code will fail in production. Think like an attacker and a chaos engineer. Find edge cases, race conditions, security holes, resource leaks, failure modes, and silent data corruption paths. Be adversarial. Be thorough. No compliments — just the problems. End your output with ONE line in the canonical format `Recommendation: <action> because <one-line reason naming the most exploitable finding>`. Generic reasons like 'because it's safer' do not qualify; the reason must point to a specific finding or no-fix rationale." -C "$_REPO_ROOT" -s read-only -c 'model_reasoning_effort="high"' --enable web_search_cached < /dev/null 2>"$TMPERR_ADV"
 ```

 Set the Bash tool's `timeout` parameter to `300000` (5 minutes). Do NOT use the `timeout` shell command — it doesn't exist on macOS. After the command completes, read stderr:
@@ -2377,7 +2379,7 @@ If `DIFF_TOTAL >= 200` AND Codex is available AND `OLD_CFG` is NOT `disabled`:
 TMPERR=$(mktemp /tmp/codex-review-XXXXXXXX)
 _REPO_ROOT=$(git rev-parse --show-toplevel) || { echo "ERROR: not in a git repo" >&2; exit 1; }
 cd "$_REPO_ROOT"
-codex review "IMPORTANT: Do NOT read or execute any files under ~/.claude/, ~/.agents/, .claude/skills/, or agents/. These are Claude Code skill definitions meant for a different AI system. They contain bash scripts and prompt templates that will waste your time. Ignore them completely. Do NOT modify agents/openai.yaml. Stay focused on the repository code only.\n\nReview the diff against the base branch." --base <base> -c 'model_reasoning_effort="high"' --enable web_search_cached < /dev/null 2>"$TMPERR"
+codex review "IMPORTANT: Do NOT read or execute any files under ~/.claude/, ~/.agents/, .claude/skills/, or agents/. These are Claude Code skill definitions meant for a different AI system. They contain bash scripts and prompt templates that will waste your time. Ignore them completely. Do NOT modify agents/openai.yaml. Stay focused on the repository code only.\n\nReview the changes on this branch against the base branch <base>. Run git diff origin/<base>...HEAD 2>/dev/null || git diff <base>...HEAD to see the diff and review only those changes." -c 'model_reasoning_effort="high"' --enable web_search_cached < /dev/null 2>"$TMPERR"
 ```

 Set the Bash tool's `timeout` parameter to `300000` (5 minutes). Do NOT use the `timeout` shell command — it doesn't exist on macOS. Present output under `CODEX SAYS (code review):` header.
@@ -1822,7 +1822,7 @@ Before reviewing code quality, check: **did they build what was requested — no
   Read commit messages (`git log origin/<base>..HEAD --oneline`).
   **If no PR exists:** rely on commit messages and TODOS.md for stated intent — this is the common case since /review runs before /ship creates the PR.
 2. Identify the **stated intent** — what was this branch supposed to accomplish?
-3. Run `git diff origin/<base>...HEAD --stat` and compare the files changed against the stated intent.
+3. Run `DIFF_BASE=$(git merge-base origin/<base> HEAD) && git diff "$DIFF_BASE" --stat` and compare the files changed against the stated intent.

 4. Evaluate with skepticism (incorporating plan completion results if available from an earlier step or adjacent section):

@@ -1851,7 +1851,7 @@ Before reviewing code quality, check: **did they build what was requested — no
   Read commit messages (`git log origin/<base>..HEAD --oneline`).
   **If no PR exists:** rely on commit messages and TODOS.md for stated intent — this is the common case since /review runs before /ship creates the PR.
 2. Identify the **stated intent** — what was this branch supposed to accomplish?
-3. Run `git diff origin/<base>...HEAD --stat` and compare the files changed against the stated intent.
+3. Run `DIFF_BASE=$(git merge-base origin/<base> HEAD) && git diff "$DIFF_BASE" --stat` and compare the files changed against the stated intent.

 4. Evaluate with skepticism (incorporating plan completion results if available from an earlier step or adjacent section):

@@ -1953,7 +1953,7 @@ Substitute: TIMESTAMP = ISO 8601 datetime, STATUS = "clean" if 0 findings or "is
 7. **Codex design voice** (optional, automatic if available):

 ```bash
-which codex 2>/dev/null && echo "CODEX_AVAILABLE" || echo "CODEX_NOT_AVAILABLE"
+command -v codex >/dev/null 2>&1 && echo "CODEX_AVAILABLE" || echo "CODEX_NOT_AVAILABLE"
 ```

 If Codex is available, run a lightweight design check on the diff:
@@ -1989,8 +1989,9 @@ STACK=""
 [ -f go.mod ] && STACK="${STACK}go "
 [ -f Cargo.toml ] && STACK="${STACK}rust "
 echo "STACK: ${STACK:-unknown}"
-DIFF_INS=$(git diff origin/<base> --stat | tail -1 | grep -oE '[0-9]+ insertion' | grep -oE '[0-9]+' || echo "0")
-DIFF_DEL=$(git diff origin/<base> --stat | tail -1 | grep -oE '[0-9]+ deletion' | grep -oE '[0-9]+' || echo "0")
+DIFF_BASE=$(git merge-base origin/<base> HEAD)
+DIFF_INS=$(git diff "$DIFF_BASE" --stat | tail -1 | grep -oE '[0-9]+ insertion' | grep -oE '[0-9]+' || echo "0")
+DIFF_DEL=$(git diff "$DIFF_BASE" --stat | tail -1 | grep -oE '[0-9]+ deletion' | grep -oE '[0-9]+' || echo "0")
 DIFF_LINES=$((DIFF_INS + DIFF_DEL))
 echo "DIFF_LINES: $DIFF_LINES"
 # Detect test framework for specialist test stub generation
@@ -2064,7 +2065,7 @@ If learnings are found, include them: "Past learnings for this domain: {learning
 4. Instructions:

 "You are a specialist code reviewer. Read the checklist below, then run
-`git diff origin/<base>` to get the full diff. Apply the checklist against the diff.
+`DIFF_BASE=$(git merge-base origin/<base> HEAD) && git diff "$DIFF_BASE"` to get the full diff. Apply the checklist against the diff.

 For each finding, output a JSON object on its own line:
 {\"severity\":\"CRITICAL|INFORMATIONAL\",\"confidence\":N,\"path\":\"file\",\"line\":N,\"category\":\"category\",\"summary\":\"description\",\"fix\":\"recommended fix\",\"fingerprint\":\"path:line:category\",\"specialist\":\"name\"}
@@ -2167,7 +2168,7 @@ The Red Team subagent receives:

 Prompt: "You are a red team reviewer. The code has already been reviewed by N specialists
 who found the following issues: {merged findings summary}. Your job is to find what they
-MISSED. Read the checklist, run `git diff origin/<base>`, and look for gaps.
+MISSED. Read the checklist, run `DIFF_BASE=$(git merge-base origin/<base> HEAD) && git diff "$DIFF_BASE"`, and look for gaps.
 Output findings as JSON objects (same schema as the specialists). Focus on cross-cutting
 concerns, integration boundary issues, and failure modes that specialist checklists
 don't cover."
@@ -2303,10 +2304,11 @@ Every diff gets adversarial review from both Claude and Codex. LOC is not a prox
 **Detect diff size and tool availability:**

 ```bash
-DIFF_INS=$(git diff origin/<base> --stat | tail -1 | grep -oE '[0-9]+ insertion' | grep -oE '[0-9]+' || echo "0")
-DIFF_DEL=$(git diff origin/<base> --stat | tail -1 | grep -oE '[0-9]+ deletion' | grep -oE '[0-9]+' || echo "0")
+DIFF_BASE=$(git merge-base origin/<base> HEAD)
+DIFF_INS=$(git diff "$DIFF_BASE" --stat | tail -1 | grep -oE '[0-9]+ insertion' | grep -oE '[0-9]+' || echo "0")
+DIFF_DEL=$(git diff "$DIFF_BASE" --stat | tail -1 | grep -oE '[0-9]+ deletion' | grep -oE '[0-9]+' || echo "0")
 DIFF_TOTAL=$((DIFF_INS + DIFF_DEL))
-which codex 2>/dev/null && echo "CODEX_AVAILABLE" || echo "CODEX_NOT_AVAILABLE"
+command -v codex >/dev/null 2>&1 && echo "CODEX_AVAILABLE" || echo "CODEX_NOT_AVAILABLE"
 # Legacy opt-out — only gates Codex passes, Claude always runs
 OLD_CFG=$($GSTACK_ROOT/bin/gstack-config get codex_reviews 2>/dev/null || true)
 echo "DIFF_SIZE: $DIFF_TOTAL"
@@ -2324,7 +2326,7 @@ If `OLD_CFG` is `disabled`: skip Codex passes only. Claude adversarial subagent
 Dispatch via the Agent tool. The subagent has fresh context — no checklist bias from the structured review. This genuine independence catches things the primary reviewer is blind to.

 Subagent prompt:
-"Read the diff for this branch with `git diff origin/<base>`. Think like an attacker and a chaos engineer. Your job is to find ways this code will fail in production. Look for: edge cases, race conditions, security holes, resource leaks, failure modes, silent data corruption, logic errors that produce wrong results silently, error handling that swallows failures, and trust boundary violations. Be adversarial. Be thorough. No compliments — just the problems. For each finding, classify as FIXABLE (you know how to fix it) or INVESTIGATE (needs human judgment). After listing findings, end your output with ONE line in the canonical format `Recommendation: <action> because <one-line reason naming the most exploitable finding>` — examples: `Recommendation: Fix the unbounded retry at queue.ts:78 because it'll DoS the worker pool under sustained 429s` or `Recommendation: Ship as-is because the strongest finding is a theoretical race that requires conditions we can't trigger in production`. The reason must point to a specific finding (or no-fix rationale). Generic reasons like 'because it's safer' do not qualify."
+"Read the diff for this branch with `DIFF_BASE=$(git merge-base origin/<base> HEAD) && git diff "$DIFF_BASE"`. Think like an attacker and a chaos engineer. Your job is to find ways this code will fail in production. Look for: edge cases, race conditions, security holes, resource leaks, failure modes, silent data corruption, logic errors that produce wrong results silently, error handling that swallows failures, and trust boundary violations. Be adversarial. Be thorough. No compliments — just the problems. For each finding, classify as FIXABLE (you know how to fix it) or INVESTIGATE (needs human judgment). After listing findings, end your output with ONE line in the canonical format `Recommendation: <action> because <one-line reason naming the most exploitable finding>` — examples: `Recommendation: Fix the unbounded retry at queue.ts:78 because it'll DoS the worker pool under sustained 429s` or `Recommendation: Ship as-is because the strongest finding is a theoretical race that requires conditions we can't trigger in production`. The reason must point to a specific finding (or no-fix rationale). Generic reasons like 'because it's safer' do not qualify."

 Present findings under an `ADVERSARIAL REVIEW (Claude subagent):` header. **FIXABLE findings** flow into the same Fix-First pipeline as the structured review. **INVESTIGATE findings** are presented as informational.

@@ -2339,7 +2341,7 @@ If Codex is available AND `OLD_CFG` is NOT `disabled`:
 ```bash
 TMPERR_ADV=$(mktemp /tmp/codex-adv-XXXXXXXX)
 _REPO_ROOT=$(git rev-parse --show-toplevel) || { echo "ERROR: not in a git repo" >&2; exit 1; }
-codex exec "IMPORTANT: Do NOT read or execute any files under ~/.claude/, ~/.agents/, .factory/skills/, or agents/. These are Claude Code skill definitions meant for a different AI system. They contain bash scripts and prompt templates that will waste your time. Ignore them completely. Do NOT modify agents/openai.yaml. Stay focused on the repository code only.\n\nReview the changes on this branch against the base branch. Run git diff origin/<base> to see the diff. Your job is to find ways this code will fail in production. Think like an attacker and a chaos engineer. Find edge cases, race conditions, security holes, resource leaks, failure modes, and silent data corruption paths. Be adversarial. Be thorough. No compliments — just the problems. End your output with ONE line in the canonical format `Recommendation: <action> because <one-line reason naming the most exploitable finding>`. Generic reasons like 'because it's safer' do not qualify; the reason must point to a specific finding or no-fix rationale." -C "$_REPO_ROOT" -s read-only -c 'model_reasoning_effort="high"' --enable web_search_cached < /dev/null 2>"$TMPERR_ADV"
+codex exec "IMPORTANT: Do NOT read or execute any files under ~/.claude/, ~/.agents/, .factory/skills/, or agents/. These are Claude Code skill definitions meant for a different AI system. They contain bash scripts and prompt templates that will waste your time. Ignore them completely. Do NOT modify agents/openai.yaml. Stay focused on the repository code only.\n\nReview the changes on this branch against the base branch. Run DIFF_BASE=$(git merge-base origin/<base> HEAD) && git diff "$DIFF_BASE" to see the diff. Your job is to find ways this code will fail in production. Think like an attacker and a chaos engineer. Find edge cases, race conditions, security holes, resource leaks, failure modes, and silent data corruption paths. Be adversarial. Be thorough. No compliments — just the problems. End your output with ONE line in the canonical format `Recommendation: <action> because <one-line reason naming the most exploitable finding>`. Generic reasons like 'because it's safer' do not qualify; the reason must point to a specific finding or no-fix rationale." -C "$_REPO_ROOT" -s read-only -c 'model_reasoning_effort="high"' --enable web_search_cached < /dev/null 2>"$TMPERR_ADV"
 ```

 Set the Bash tool's `timeout` parameter to `300000` (5 minutes). Do NOT use the `timeout` shell command — it doesn't exist on macOS. After the command completes, read stderr:
@@ -2368,7 +2370,7 @@ If `DIFF_TOTAL >= 200` AND Codex is available AND `OLD_CFG` is NOT `disabled`:
 TMPERR=$(mktemp /tmp/codex-review-XXXXXXXX)
 _REPO_ROOT=$(git rev-parse --show-toplevel) || { echo "ERROR: not in a git repo" >&2; exit 1; }
 cd "$_REPO_ROOT"
-codex review "IMPORTANT: Do NOT read or execute any files under ~/.claude/, ~/.agents/, .factory/skills/, or agents/. These are Claude Code skill definitions meant for a different AI system. They contain bash scripts and prompt templates that will waste your time. Ignore them completely. Do NOT modify agents/openai.yaml. Stay focused on the repository code only.\n\nReview the diff against the base branch." --base <base> -c 'model_reasoning_effort="high"' --enable web_search_cached < /dev/null 2>"$TMPERR"
+codex review "IMPORTANT: Do NOT read or execute any files under ~/.claude/, ~/.agents/, .factory/skills/, or agents/. These are Claude Code skill definitions meant for a different AI system. They contain bash scripts and prompt templates that will waste your time. Ignore them completely. Do NOT modify agents/openai.yaml. Stay focused on the repository code only.\n\nReview the changes on this branch against the base branch <base>. Run git diff origin/<base>...HEAD 2>/dev/null || git diff <base>...HEAD to see the diff and review only those changes." -c 'model_reasoning_effort="high"' --enable web_search_cached < /dev/null 2>"$TMPERR"
 ```

 Set the Bash tool's `timeout` parameter to `300000` (5 minutes). Do NOT use the `timeout` shell command — it doesn't exist on macOS. Present output under `CODEX SAYS (code review):` header.
@@ -2704,6 +2704,22 @@ describe('codex commands must not use inline $(git rev-parse --show-toplevel) fo
    }
    expect(violations).toEqual([]);
  });
+
+  test('codex review commands pass diff scope through prompt, not --base', () => {
+    const checkedFiles = [
+      'codex/SKILL.md.tmpl',
+      'codex/SKILL.md',
+      'scripts/resolvers/review.ts',
+      'review/SKILL.md',
+      'ship/SKILL.md',
+    ];
+
+    for (const rel of checkedFiles) {
+      const content = fs.readFileSync(path.join(ROOT, rel), 'utf-8');
+      expect(content).not.toContain('--base <base> -c \'model_reasoning_effort="high"\'');
+      expect(content).toContain('Run git diff origin/<base>...HEAD 2>/dev/null || git diff <base>...HEAD');
+    }
+  });
 });

 // ─── Learnings + Confidence Resolver Tests ─────────────────────
@@ -343,4 +343,92 @@ describe("gstack-global-discover", () => {
      expect(remotes.length).toBe(uniqueRemotes.size);
    });
  });
+
+  describe("extractCwdFromJsonl 64KB cap (PR #1169 bug #8)", () => {
+    // Regression: the old 8KB cap landed mid-line on Claude Code sessions with
+    // long headers, JSON.parse threw on the truncated tail, the catch
+    // `continue`d silently, and the project disappeared from discovery.
+    // The fix raised the cap to 64KB AND drops the trailing partial segment
+    // before parsing.
+    let extractCwdFromJsonl: (filePath: string) => string | null;
+    let tmpDir: string;
+
+    beforeEach(async () => {
+      const mod = await import("../bin/gstack-global-discover.ts");
+      extractCwdFromJsonl = mod.extractCwdFromJsonl;
+      tmpDir = mkdtempSync(join(tmpdir(), "pr1169-cwd-"));
+    });
+
+    afterEach(() => {
+      rmSync(tmpDir, { recursive: true, force: true });
+    });
+
+    test("happy path: small JSONL with obj.cwd returns it (sanity)", () => {
+      const filePath = join(tmpDir, "small.jsonl");
+      const line = JSON.stringify({ cwd: "/tmp/repo-small", type: "header" });
+      writeFileSync(filePath, line + "\n");
+      expect(extractCwdFromJsonl(filePath)).toBe("/tmp/repo-small");
+    });
+
+    test("12KB first line with obj.cwd: returns cwd (old 8KB cap returned null)", () => {
+      // Pad a JSONL header so the whole line is ~12KB ending in `}\n`.
+      // Old 8KB read would slice mid-line; JSON.parse on the truncated tail
+      // would throw, the catch would `continue`, and we'd return null.
+      const padding = "x".repeat(12 * 1024);
+      const line = JSON.stringify({
+        cwd: "/tmp/repo-12k",
+        type: "header",
+        notes: padding,
+      });
+      expect(line.length).toBeGreaterThan(8 * 1024);
+      expect(line.length).toBeLessThan(64 * 1024);
+
+      const filePath = join(tmpDir, "header-12k.jsonl");
+      writeFileSync(filePath, line + "\n");
+      expect(extractCwdFromJsonl(filePath)).toBe("/tmp/repo-12k");
+    });
+
+    test("80KB single line (overflows 64KB cap): returns null without crashing", () => {
+      // One line >64KB with no newline inside the read window. The 64KB read
+      // captures a truncated prefix, parts.length === 1, no trailing drop
+      // applies, JSON.parse throws, catch returns null. The fix's
+      // trailing-partial-drop must not crash on this shape.
+      const padding = "y".repeat(80 * 1024);
+      const line = JSON.stringify({ cwd: "/tmp/repo-80k", type: "header", notes: padding });
+      expect(line.length).toBeGreaterThan(64 * 1024);
+
+      const filePath = join(tmpDir, "header-80k.jsonl");
+      writeFileSync(filePath, line + "\n");
+      // Don't throw, just return null.
+      expect(extractCwdFromJsonl(filePath)).toBeNull();
+    });
+
+    test("complete line followed by partial second line: returns first line's cwd", () => {
+      // Line 1 ends cleanly with `\n` well within the cap.
+      // Line 2 is long enough that the 64KB read captures only its incomplete
+      // beginning. The trailing-partial drop must skip the truncated line 2
+      // and not poison the result.
+      const line1 = JSON.stringify({ cwd: "/tmp/repo-line-1", type: "header" });
+      const line2Padding = "z".repeat(80 * 1024);
+      const line2 = JSON.stringify({ cwd: "/tmp/repo-line-2", notes: line2Padding });
+
+      const filePath = join(tmpDir, "header-partial-2.jsonl");
+      writeFileSync(filePath, line1 + "\n" + line2 + "\n");
+      expect(extractCwdFromJsonl(filePath)).toBe("/tmp/repo-line-1");
+    });
+
+    test("missing file: returns null (file read error is swallowed)", () => {
+      const filePath = join(tmpDir, "nonexistent.jsonl");
+      expect(extractCwdFromJsonl(filePath)).toBeNull();
+    });
+
+    test("malformed first line then valid second line within cap: returns second", () => {
+      // Both lines fully within 64KB. First line is not valid JSON; second
+      // is. The function must skip first and return second's cwd.
+      const filePath = join(tmpDir, "bad-then-good.jsonl");
+      const good = JSON.stringify({ cwd: "/tmp/repo-skip-bad" });
+      writeFileSync(filePath, "{ not valid json\n" + good + "\n");
+      expect(extractCwdFromJsonl(filePath)).toBe("/tmp/repo-skip-bad");
+    });
+  });
 });
@@ -7,9 +7,9 @@
 */

 import { describe, it, expect } from "bun:test";
-import { mkdtempSync, writeFileSync, mkdirSync, rmSync } from "fs";
+import { chmodSync, mkdtempSync, writeFileSync, mkdirSync, rmSync } from "fs";
 import { tmpdir } from "os";
-import { join } from "path";
+import { delimiter, join } from "path";
 import { spawnSync } from "child_process";

 const SCRIPT = join(import.meta.dir, "..", "bin", "gstack-brain-context-load.ts");
@@ -27,6 +27,37 @@ function runScript(args: string[], env: Record<string, string> = {}): { stdout:
  };
 }

+function writeFakeGbrain(binDir: string): void {
+  if (process.platform === "win32") {
+    writeFileSync(
+      join(binDir, "gbrain.cmd"),
+      "@echo off\r\nif \"%1\"==\"--version\" (\r\n  echo gbrain 0.test\r\n) else (\r\n  echo fake gbrain %*\r\n)\r\n",
+      "utf-8",
+    );
+    return;
+  }
+
+  const fakeBin = join(binDir, "gbrain");
+  writeFileSync(
+    fakeBin,
+    `#!/bin/sh
+if [ "$1" = "--version" ]; then
+  echo "gbrain 0.test"
+else
+  echo "fake gbrain $*"
+fi
+`,
+    "utf-8",
+  );
+  chmodSync(fakeBin, 0o755);
+}
+
+function prependPath(binDir: string): Record<string, string> {
+  const pathKey = Object.keys(process.env).find((key) => key.toLowerCase() === "path") || "PATH";
+  const currentPath = process.env[pathKey] || "";
+  return { [pathKey]: `${binDir}${delimiter}${currentPath}` };
+}
+
 describe("gstack-brain-context-load CLI", () => {
  it("--help exits 0 with usage", () => {
    const r = runScript(["--help"]);
@@ -204,6 +235,23 @@ gbrain:
 });

 describe("gstack-brain-context-load — graceful gbrain absence", () => {
+  it("uses gbrain when a binary is available on PATH", () => {
+    const dir = mkdtempSync(join(tmpdir(), "gstack-bcl-"));
+    const binDir = join(dir, "bin");
+    mkdirSync(binDir);
+    writeFakeGbrain(binDir);
+
+    try {
+      const r = runScript(["--repo", "test-repo", "--explain"], prependPath(binDir));
+      expect(r.exitCode).toBe(0);
+      expect(r.stderr).toContain("OK");
+      expect(r.stderr).not.toContain("gbrain CLI missing");
+      expect(r.stdout).toContain("fake gbrain list_pages");
+    } finally {
+      rmSync(dir, { recursive: true, force: true });
+    }
+  });
+
  it("vector + list queries still complete (with SKIP) when gbrain CLI is missing", () => {
    // We can't easily un-install gbrain; rely on the helper's own missing-binary
    // detection. The default manifest uses kind: list which calls gbrain. If
@@ -837,4 +837,29 @@ describe("sourceLocalPath", () => {
    });
    expect(sourceLocalPath("any-id", envWithBindir(bindir))).toBeNull();
  });
+
+  // gbrain v0.20+ wraps the response as `{sources: [...]}`. Older versions
+  // returned a flat array. sourceLocalPath was returning null (or crashing
+  // with `list.find is not a function` upstream) because it only handled
+  // the flat-array shape. Pin both shapes here.
+  it("handles {sources: [...]} wrapped shape (gbrain v0.20+)", () => {
+    makeShim(bindir, {
+      "sources list --json": {
+        stdout: JSON.stringify({
+          sources: [
+            { id: "other-source", local_path: "/x" },
+            { id: "target-id", local_path: "/repo/match" },
+          ],
+        }),
+      },
+    });
+    expect(sourceLocalPath("target-id", envWithBindir(bindir))).toBe("/repo/match");
+  });
+
+  it("returns null when the source is missing in the wrapped shape", () => {
+    makeShim(bindir, {
+      "sources list --json": { stdout: JSON.stringify({ sources: [] }) },
+    });
+    expect(sourceLocalPath("missing-id", envWithBindir(bindir))).toBeNull();
+  });
 });
@@ -341,4 +341,41 @@ describe("detectEngineTier", () => {
    const result = detectEngineTier();
    expect(result.engine).toBe("supabase");
  });
+
+  it("parses schema_version:2 doctor JSON via the exec path (regression for #1418)", () => {
+    // Stronger pin than the PATH-stripped fallback above: install a fake
+    // gbrain shim that successfully exits with status 1 (health_score < 100,
+    // mirroring real-world Supabase brains) and emits the v2 doctor JSON
+    // shape — schema_version: 2, status: "warnings", no top-level `engine`.
+    // The parser must still produce a usable EngineDetect by falling back
+    // to GBRAIN_HOME/config.json when `engine` is absent from doctor output.
+    const binDir = mkdtempSync(join(tmpdir(), "gstack-gbrain-shim-"));
+    const shim = join(binDir, "gbrain");
+    writeFileSync(
+      shim,
+      `#!/bin/sh
+if [ "$1" = "doctor" ]; then
+  cat <<'JSON'
+{"schema_version":2,"status":"warnings","health_score":90,"checks":[{"name":"resolver_health","status":"ok","message":"42 skills"}]}
+JSON
+  exit 1
+fi
+if [ "$1" = "--version" ]; then
+  echo "gbrain 0.35.0.0"
+  exit 0
+fi
+exit 0
+`,
+      { mode: 0o755 }
+    );
+    process.env.PATH = `${binDir}:${process.env.PATH || ""}`;
+    writeFileSync(
+      join(testGbrainHome, "config.json"),
+      JSON.stringify({ engine: "pglite" }),
+      "utf-8"
+    );
+    const result = detectEngineTier();
+    expect(result.engine).toBe("pglite");
+    rmSync(binDir, { recursive: true, force: true });
+  });
 });
@@ -41,12 +41,28 @@ describe('gstack-paths', () => {
    expect(got.GSTACK_STATE_ROOT).toBe('/tmp/explicit-state');
  });

-  test('CLAUDE_PLUGIN_DATA wins over HOME when GSTACK_HOME unset', () => {
-    const got = run({
-      CLAUDE_PLUGIN_DATA: '/tmp/plugin-data',
+  test('CLAUDE_PLUGIN_DATA ignored when CLAUDE_PLUGIN_ROOT is absent or non-gstack', () => {
+    // Without CLAUDE_PLUGIN_ROOT, falls through to HOME path.
+    const noRoot = run({ CLAUDE_PLUGIN_DATA: '/tmp/plugin-data', HOME: '/tmp/home' });
+    expect(noRoot.GSTACK_STATE_ROOT).toBe('/tmp/home/.gstack');
+
+    // With a CLAUDE_PLUGIN_ROOT that doesn't contain "gstack" (e.g. the codex plugin),
+    // still falls through to HOME path — this is the cross-plugin contamination scenario.
+    const wrongRoot = run({
+      CLAUDE_PLUGIN_DATA: '/tmp/codex-data',
+      CLAUDE_PLUGIN_ROOT: '/tmp/openai-codex',
      HOME: '/tmp/home',
    });
-    expect(got.GSTACK_STATE_ROOT).toBe('/tmp/plugin-data');
+    expect(wrongRoot.GSTACK_STATE_ROOT).toBe('/tmp/home/.gstack');
+  });
+
+  test('CLAUDE_PLUGIN_DATA respected when CLAUDE_PLUGIN_ROOT identifies gstack', () => {
+    const got = run({
+      CLAUDE_PLUGIN_DATA: '/tmp/gstack-plugin-data',
+      CLAUDE_PLUGIN_ROOT: '/tmp/gstack-garrytan',
+      HOME: '/tmp/home',
+    });
+    expect(got.GSTACK_STATE_ROOT).toBe('/tmp/gstack-plugin-data');
  });

  test('HOME-derived state root when GSTACK_HOME and CLAUDE_PLUGIN_DATA unset', () => {
@@ -0,0 +1,54 @@
+/**
+ * Regression pin for #1346: gstack-memory-ingest must never call the
+ * `gbrain put_page` subcommand (renamed to `put` in gbrain v0.18+).
+ *
+ * The original bug shipped a literal `"put_page"` in execFileSync args,
+ * crashing every transcript ingest against modern gbrain. The fix migrated
+ * the per-file path to `gbrain put <slug>` and later to the batch
+ * `gbrain import <dir>` runner. This test pins both surfaces: source code
+ * must not contain `put_page` outside comments, and any future contributor
+ * adding it back trips the build.
+ */
+
+import { describe, it, expect } from "bun:test";
+import { readFileSync } from "fs";
+import { join } from "path";
+
+const SOURCE_PATH = join(import.meta.dir, "..", "bin", "gstack-memory-ingest.ts");
+
+/**
+ * Strip line comments (`// ...`) and block comments (`/* ... */`) from TS
+ * source so the regression check only inspects executable code. Naive but
+ * sufficient — we don't need full TS parsing, just to ignore the
+ * documentation/changelog mentions of the old subcommand name.
+ *
+ * Order matters: strip block comments first (they may span multiple lines
+ * and contain `//`), then line comments. String-literal awareness is
+ * intentionally skipped — if anyone writes "put_page" inside an active
+ * string they want the test to fail.
+ */
+function stripComments(src: string): string {
+  // Block comments — non-greedy across newlines.
+  const noBlock = src.replace(/\/\*[\s\S]*?\*\//g, "");
+  // Line comments — strip from `//` to end of line.
+  return noBlock.replace(/\/\/[^\n]*/g, "");
+}
+
+describe("gstack-memory-ingest — no put_page in active code (regression for #1346)", () => {
+  it("source file does not call the renamed gbrain put_page subcommand", () => {
+    const src = readFileSync(SOURCE_PATH, "utf-8");
+    const stripped = stripComments(src);
+    expect(stripped).not.toContain("put_page");
+  });
+
+  it("source file does call the canonical gbrain put subcommand or gbrain import", () => {
+    // Sanity check that the file actually uses one of the supported page-write
+    // verbs — guards against accidentally removing all gbrain calls and having
+    // the negative test above pass for the wrong reason.
+    const src = readFileSync(SOURCE_PATH, "utf-8");
+    const stripped = stripComments(src);
+    const callsPut = /\bgbrain\s+put\b/.test(stripped) || /["']put["']/.test(stripped);
+    const callsImport = /\bimport\b/.test(stripped); // `gbrain import` runner
+    expect(callsPut || callsImport).toBe(true);
+  });
+});
@@ -0,0 +1,161 @@
+/**
+ * Regression tests for PR #1169 bugs #2 + #3 — scripts/build-app.sh.
+ *
+ * Bug #2: sed replacement for Chromium rebrand interpolated $APP_NAME without
+ * escaping sed replacement metachars (`&`, `/`, `\`). A name with `/` either
+ * broke the s/// command or got interpreted as sed syntax.
+ *
+ * Bug #3: `DMG_TMP=$(mktemp -d)` was unchecked. On mktemp failure $DMG_TMP
+ * was empty and the next `cp -a "$APP_DIR" "$DMG_TMP/"` would copy the .app
+ * bundle into the filesystem root.
+ *
+ * Bug #2 is verified via a runtime isolation test of the sed-escape sequence
+ * (codex pushback: static-grep for "uses escape helper" is too narrow; the
+ * real invariant is metachar safety end-to-end). Bug #3 is verified via
+ * static check — the entire build flow needs xcrun/hdiutil and can't be
+ * spawned in CI, but the failure-guard shape is what we want to lock.
+ */
+import { describe, expect, test } from "bun:test";
+import * as fs from "node:fs";
+import * as path from "node:path";
+import { spawnSync } from "node:child_process";
+
+const ROOT = path.resolve(import.meta.dir, "..");
+const SCRIPT = path.join(ROOT, "scripts/build-app.sh");
+
+describe("PR #1169 bug #2: build-app.sh sed escape for $APP_NAME", () => {
+  test("escape sequence produces sed-safe output for `&`, `/`, `\\` in APP_NAME", () => {
+    // Mirror the script's escape sequence and run it in isolation against a
+    // hostile name. The escape sequence at line ~98 is:
+    //   APP_NAME_SED_ESCAPED=$(printf '%s' "$APP_NAME" | sed 's/[&/\]/\\&/g')
+    // We assert the resulting string can then be used as a sed replacement
+    // safely — round-trip via a real `sed s///` against a stub strings file.
+
+    const inputs: string[] = [
+      "Foo/Bar&Baz",      // slash + ampersand
+      "Cool\\App",        // backslash
+      "Plain Name",        // no metachars (baseline)
+      "A/B\\C&D",          // all three at once
+      "End/",              // trailing slash
+      "&Start",            // leading ampersand
+    ];
+
+    for (const appName of inputs) {
+      // Bug #2 invariant: the escaped string, used as the replacement half
+      // of `sed s/<needle>/<replacement>/g`, results in the literal appName
+      // appearing in the output.
+      const result = spawnSync(
+        "bash",
+        ["-c",
+          `set -eu
+           APP_NAME="$1"
+           APP_NAME_SED_ESCAPED=$(printf '%s' "$APP_NAME" | sed 's/[&/\\]/\\\\&/g')
+           printf 'Google Chrome for Testing' | sed "s/Google Chrome for Testing/\${APP_NAME_SED_ESCAPED}/g"
+          `,
+          "_",
+          appName,
+        ],
+        { encoding: "utf-8" }
+      );
+
+      expect(result.status).toBe(0);
+      expect(result.stdout).toBe(appName);
+      expect(result.stderr).toBe("");
+    }
+  });
+
+  test("script body still routes APP_NAME through the escape helper before sed", () => {
+    // Belt-and-braces static check: the rebrand block must contain BOTH the
+    // escape line and the sed line referencing the escaped variable.
+    const body = fs.readFileSync(SCRIPT, "utf-8");
+    expect(body).toMatch(/APP_NAME_SED_ESCAPED=\$\(printf '%s' "\$APP_NAME" \| sed/);
+    expect(body).toMatch(/sed -i ''\s*"s\/Google Chrome for Testing\/\$\{APP_NAME_SED_ESCAPED\}\/g"/);
+  });
+
+  test("no bare `$APP_NAME` interpolation directly into the rebrand sed", () => {
+    // Ensure no future refactor reintroduces the bug by interpolating
+    // $APP_NAME straight into the s/// replacement.
+    const body = fs.readFileSync(SCRIPT, "utf-8");
+    expect(body).not.toMatch(/sed -i ''\s*"s\/Google Chrome for Testing\/\$APP_NAME\//);
+    expect(body).not.toMatch(/sed -i ''\s*"s\/Google Chrome for Testing\/\$\{APP_NAME\}\//);
+  });
+});
+
+describe("PR #1169 bug #3: build-app.sh DMG_TMP mktemp failure guard", () => {
+  test("mktemp -d for DMG_TMP is followed by an explicit failure handler", () => {
+    const body = fs.readFileSync(SCRIPT, "utf-8");
+    // The script must assign DMG_TMP and immediately check for failure on
+    // the SAME line via `||`, then validate the path is non-empty and a real
+    // directory before cp.
+    const guard = body.match(
+      /DMG_TMP=\$\(mktemp -d\)\s*\|\|\s*\{[^}]*exit\s+\d/
+    );
+    expect(guard).not.toBeNull();
+  });
+
+  test("DMG_TMP is also validated as non-empty AND a directory before cp", () => {
+    const body = fs.readFileSync(SCRIPT, "utf-8");
+    // After mktemp, a defensive check should reject empty or non-directory
+    // paths (covers cases where mktemp succeeds but returns garbage).
+    expect(body).toMatch(
+      /\[\s*-z\s+"\$DMG_TMP"\s*\][^\n]*\|\|\s*\[\s*!\s+-d\s+"\$DMG_TMP"\s*\]/
+    );
+  });
+
+  test("no `cp -a ... \"$DMG_TMP/\"` before the validation block", () => {
+    const body = fs.readFileSync(SCRIPT, "utf-8");
+    // The cp must come AFTER the validation. Find the line offsets.
+    const mktempIdx = body.search(/DMG_TMP=\$\(mktemp -d\)/);
+    const validationIdx = body.search(
+      /\[\s*-z\s+"\$DMG_TMP"\s*\]/
+    );
+    const cpIdx = body.search(/cp -a "\$APP_DIR" "\$DMG_TMP\//);
+    expect(mktempIdx).toBeGreaterThan(-1);
+    expect(validationIdx).toBeGreaterThan(mktempIdx);
+    expect(cpIdx).toBeGreaterThan(validationIdx);
+  });
+
+  test("runtime: escape function refuses to leave DMG_TMP empty (fake-mktemp PATH stub)", () => {
+    // Codex strongly preferred runtime testing here. The full build-app.sh
+    // depends on xcrun/hdiutil/PlistBuddy — too heavy for CI. Instead, we
+    // extract just the failure-guard shape and run it with a fake mktemp
+    // that always exits 1. Asserts the script exits non-zero before cp.
+
+    const fakeBin = fs.mkdtempSync(path.join("/tmp", "pr1169-fakebin-"));
+    fs.writeFileSync(
+      path.join(fakeBin, "mktemp"),
+      "#!/bin/sh\nexit 1\n",
+      { mode: 0o755 }
+    );
+
+    // The guard, isolated. Mirrors the actual script's logic. Use a regular
+    // string + array of lines so the embedded bash backticks/dollars don't
+    // get interpreted by the JS template-literal parser.
+    const guardScript = [
+      'set -u',
+      'DMG_TMP=$(mktemp -d) || { echo "ERROR: mktemp -d failed — refusing to continue so we don\'t cp into the filesystem root." >&2; exit 1; }',
+      'if [ -z "$DMG_TMP" ] || [ ! -d "$DMG_TMP" ]; then',
+      '  echo "ERROR: mktemp -d returned an invalid path (\'$DMG_TMP\')." >&2',
+      '  exit 1',
+      'fi',
+      '# If we got here, we would run the cp block, which is the bug.',
+      'echo "REACHED_CP_BLOCK_WHICH_IS_THE_BUG" >&2',
+      'exit 0',
+    ].join('\n');
+
+    const result = spawnSync(
+      "bash",
+      ["-c", guardScript],
+      {
+        encoding: "utf-8",
+        env: { ...process.env, PATH: `${fakeBin}:${process.env.PATH}` },
+      }
+    );
+
+    fs.rmSync(fakeBin, { recursive: true, force: true });
+
+    expect(result.status).not.toBe(0);
+    expect(result.stderr).toMatch(/mktemp -d failed|invalid path/);
+    expect(result.stderr).not.toMatch(/REACHED_CP_BLOCK_WHICH_IS_THE_BUG/);
+  });
+});
@@ -0,0 +1,82 @@
+/**
+ * Regression tests for PR #1169 bugs #4 + #5 — predictable `$$`-based tmp
+ * file fallbacks on mktemp failure.
+ *
+ * Per codex's pushback, the real invariant is not just "no `$$` token" — it's
+ * "no `mktemp ... || echo <fallback-path>` shape at all, AND mktemp failure
+ * exits cleanly." A future cleanup could swap `$$` for `$RANDOM` or a
+ * hardcoded path and silently keep the foot-gun. The static checks below
+ * lock the broader invariant.
+ *
+ * Runtime fake-bin tests for these two scripts would require setting up
+ * SUPABASE_URL, JSONL fixtures, rate files, and config state — disproportionate
+ * for the invariant. The static checks pin the actual shape of the bug.
+ */
+import { describe, expect, test } from "bun:test";
+import * as fs from "node:fs";
+import * as path from "node:path";
+
+const ROOT = path.resolve(import.meta.dir, "..");
+
+function readScript(rel: string): string {
+  return fs.readFileSync(path.join(ROOT, rel), "utf-8");
+}
+
+describe("PR #1169 bug #4: gstack-telemetry-sync mktemp fallback", () => {
+  const SCRIPT = "bin/gstack-telemetry-sync";
+
+  test("no `mktemp ... || echo <path>` fallback shape anywhere in the script", () => {
+    const body = readScript(SCRIPT);
+    // Match: mktemp call, optional pipe, then `|| echo <quoted-or-bare-path>`
+    // The fallback shape regardless of what the fallback path looks like
+    // ($$, $RANDOM, hardcoded — all predictable).
+    const fallback = body.match(/mktemp[^|\n]*\|\|\s*echo\s+["']?[^"'\n]*/);
+    expect(fallback).toBeNull();
+  });
+
+  test("no `$$` PID interpolation appears anywhere in a /tmp path literal", () => {
+    const body = readScript(SCRIPT);
+    // Catches any /tmp-style path that uses the PID as part of the name.
+    expect(body).not.toMatch(/\/tmp\/[^"'\s]*\$\$/);
+  });
+
+  test("mktemp failure path exits or skips this run", () => {
+    const body = readScript(SCRIPT);
+    // The mktemp invocation must be guarded by `|| { ... exit 0; }` or
+    // equivalent. Match the multi-line guard immediately after `mktemp`.
+    const guard = body.match(
+      /mktemp\s+[^\n]+\)["']\s*\|\|\s*\{[^}]*exit\s+\d/
+    );
+    expect(guard).not.toBeNull();
+  });
+
+  test("trap cleans up the response file on EXIT (no leftover tmp on success)", () => {
+    const body = readScript(SCRIPT);
+    expect(body).toMatch(/trap\s+['"]rm\s+-f\s+"?\$RESP_FILE/);
+  });
+});
+
+describe("PR #1169 bug #5: supabase/verify-rls.sh mktemp fallback", () => {
+  const SCRIPT = "supabase/verify-rls.sh";
+
+  test("no `mktemp ... || echo <path>` fallback shape", () => {
+    const body = readScript(SCRIPT);
+    const fallback = body.match(/mktemp[^|\n]*\|\|\s*echo\s+["']?[^"'\n]*/);
+    expect(fallback).toBeNull();
+  });
+
+  test("no `$$` PID interpolation in /tmp path literals", () => {
+    const body = readScript(SCRIPT);
+    expect(body).not.toMatch(/\/tmp\/[^"'\s]*\$\$/);
+  });
+
+  test("mktemp failure path returns non-zero from check()", () => {
+    const body = readScript(SCRIPT);
+    // The check function must fail loudly — `return 1` (or `exit`) inside
+    // the mktemp error handler. Same multi-line guard shape.
+    const guard = body.match(
+      /mktemp\s+[^\n]+\)["']\s*\|\|\s*\{[^}]*(?:return|exit)\s+\d/
+    );
+    expect(guard).not.toBeNull();
+  });
+});
@@ -0,0 +1,63 @@
+/**
+ * Regression pin: scripts/resolvers/gbrain.ts must emit `gbrain put <slug>`
+ * (the v0.18+ subcommand), never the renamed `gbrain put_page`. The resolver
+ * output ships into every generated SKILL.md file as user-facing
+ * copy-paste instructions; using the old subcommand teaches every
+ * gstack user to invoke a command that no longer exists.
+ *
+ * Two checks:
+ *   1. Resolver source: scripts/resolvers/gbrain.ts has no `put_page`
+ *      tokens in active strings (comments OK — one annotated reference
+ *      explains the rename for future contributors).
+ *   2. Generated SKILL.md: every tracked SKILL.md file is free of
+ *      `gbrain put_page`. Run `bun run gen:skill-docs` if this fails.
+ */
+
+import { describe, it, expect } from "bun:test";
+import { readFileSync, readdirSync, statSync } from "fs";
+import { join } from "path";
+import { execFileSync } from "child_process";
+
+const REPO_ROOT = join(import.meta.dir, "..");
+const RESOLVER_PATH = join(REPO_ROOT, "scripts", "resolvers", "gbrain.ts");
+
+function stripComments(src: string): string {
+  // Strip block comments first (may span newlines, may contain `//`).
+  const noBlock = src.replace(/\/\*[\s\S]*?\*\//g, "");
+  return noBlock.replace(/\/\/[^\n]*/g, "");
+}
+
+function listTrackedSkillMd(): string[] {
+  const out = execFileSync("git", ["ls-files", "*SKILL.md"], {
+    cwd: REPO_ROOT,
+    encoding: "utf-8",
+  });
+  return out.split("\n").filter((line) => line.trim().length > 0);
+}
+
+describe("scripts/resolvers/gbrain.ts — no put_page in emitted instructions (regression for #1346)", () => {
+  it("resolver source ships only `gbrain put` instructions, not the renamed `put_page`", () => {
+    const src = readFileSync(RESOLVER_PATH, "utf-8");
+    const stripped = stripComments(src);
+    expect(stripped).not.toContain("put_page");
+  });
+
+  it("every tracked SKILL.md file is free of the renamed gbrain put_page subcommand", () => {
+    const files = listTrackedSkillMd();
+    const offenders: string[] = [];
+    for (const f of files) {
+      const content = readFileSync(join(REPO_ROOT, f), "utf-8");
+      if (content.includes("gbrain put_page")) {
+        offenders.push(f);
+      }
+    }
+    if (offenders.length > 0) {
+      throw new Error(
+        `Generated SKILL.md files still reference 'gbrain put_page'. ` +
+          `Run 'bun run gen:skill-docs' to regenerate after editing ` +
+          `scripts/resolvers/gbrain.ts. Offenders:\n  - ${offenders.join("\n  - ")}`,
+      );
+    }
+    expect(offenders).toHaveLength(0);
+  });
+});
@@ -775,8 +775,8 @@ Write your summary to ${testDir}/${testName}-summary.md`,
    expect(fs.existsSync(summaryPath)).toBe(true);

    const summary = fs.readFileSync(summaryPath, 'utf-8').toLowerCase();
-    // All skills should have codex availability check
-    expect(summary).toMatch(/which codex/);
+    // All skills should have codex availability check (command -v per #1197)
+    expect(summary).toMatch(/command -v codex/);
    // All skills should have fallback behavior
    expect(summary).toMatch(/fallback|subagent|unavailable|not available|skip/);
    // All skills should show it's optional/non-blocking
@@ -1325,10 +1325,14 @@ describe('Codex skill', () => {
    expect(content).toContain('gstack-review-log');
  });

-  test('codex/SKILL.md uses which for binary discovery, not hardcoded path', () => {
+  test('codex/SKILL.md uses command -v for binary discovery, not hardcoded path', () => {
    const content = fs.readFileSync(path.join(ROOT, 'codex', 'SKILL.md'), 'utf-8');
-    expect(content).toContain('which codex');
+    expect(content).toContain('command -v codex');
    expect(content).not.toContain('/opt/homebrew/bin/codex');
+    // Defensive: catch any future regression that reintroduces `which codex`,
+    // which fails in environments where `which` isn't on PATH (some Windows
+    // shells, BusyBox-only containers). #1197.
+    expect(content).not.toContain('which codex');
  });

  test('codex/SKILL.md contains error handling for missing binary and auth', () => {
@@ -1421,6 +1425,29 @@ describe('Codex skill', () => {
    expect(content).toContain('codex exec');
  });

+  test('codex review invocations avoid the prompt plus --base argument shape', () => {
+    for (const rel of ['codex/SKILL.md', 'review/SKILL.md', 'ship/SKILL.md']) {
+      const content = fs.readFileSync(path.join(ROOT, rel), 'utf-8');
+      expect(content).not.toContain('--base <base> -c \'model_reasoning_effort="high"\'');
+      expect(content).toContain('Run git diff origin/<base>...HEAD 2>/dev/null || git diff <base>...HEAD');
+    }
+  });
+
+  test('codex review prompts always carry the filesystem boundary (#1503/#1522 regression)', () => {
+    // Pre-#1209, the bare `codex review --base` path stripped the filesystem
+    // boundary instruction, letting Codex spend tokens reading skill files.
+    // #1209's prompt rewrite restored the boundary by routing every default
+    // call through a prompt. Pin both halves so a future refactor can't
+    // regress: (a) the boundary line must appear, (b) the call must be
+    // through `codex review "<prompt>"` not bare `codex review --base`.
+    const boundaryLine =
+      'Do NOT read or execute any files under ~/.claude/, ~/.agents/, .claude/skills/, or agents/';
+    for (const rel of ['codex/SKILL.md', 'review/SKILL.md', 'ship/SKILL.md']) {
+      const content = fs.readFileSync(path.join(ROOT, rel), 'utf-8');
+      expect(content).toContain(boundaryLine);
+    }
+  });
+
  test('/review persists a review-log entry for ship readiness', () => {
    const content = fs.readFileSync(path.join(ROOT, 'review', 'SKILL.md'), 'utf-8');
    expect(content).toContain('"skill":"review"');