Merge branch 'main' into garrytan/team-supabase-store

Brings in 55 commits from main (v0.12.x–v0.13.5.0): Factory Droid compat, prompt injection defense, user sovereignty, security audit, design binary, skill namespacing, modular resolvers, Chrome sidebar, and more. Conflict resolution: - .agents/ SKILL.md files: deleted (main moved to .factory/) - 8 .tmpl templates: accepted main (new features: CDP mode, design tools, global retro, parallelization, distribution checks, plan audits) - scripts/gen-skill-docs.ts: accepted main's modular resolver refactor - test/helpers/session-runner.ts: accepted main + layered back CostEntry tracking from team branch - Generated SKILL.md files: regenerated via bun run gen:skill-docs - Updated tests to match main's gstack-slug output (2 lines, no PROJECTS_DIR) and review log mechanism (gstack-review-log, not $BRANCH.jsonl) Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-05-07 14:06:42 +02:00 · 2026-03-29 15:12:12 -07:00
parent 8444626c6a 484cf1fb3b
commit 15e6d9d8f1
267 changed files with 60292 additions and 12207 deletions
@@ -27,6 +27,7 @@ export interface CodexResult {
  durationMs: number;       // Wall clock time
  sessionId: string | null; // Thread ID for session continuity
  rawLines: string[];       // Raw JSONL lines for debugging
+  stderr: string;           // Stderr output (skill loading errors, auth failures)
 }

 // --- JSONL parser (ported from Python in codex/SKILL.md.tmpl) ---
@@ -98,7 +99,8 @@ export function parseCodexJSONL(lines: string[]): ParsedCodexJSONL {

 /**
 * Install a SKILL.md into a temp HOME directory for Codex to discover.
- * Creates ~/.codex/skills/{skillName}/SKILL.md in the temp HOME.
+ * Creates ~/.codex/skills/{skillName}/SKILL.md in the temp HOME and copies
+ * agents/openai.yaml when present so Codex sees the same metadata as a real install.
 *
 * Returns the temp HOME path. Caller is responsible for cleanup.
 */
@@ -116,6 +118,13 @@ export function installSkillToTempHome(
    fs.copyFileSync(srcSkill, path.join(destDir, 'SKILL.md'));
  }

+  const srcOpenAIYaml = path.join(skillDir, 'agents', 'openai.yaml');
+  if (fs.existsSync(srcOpenAIYaml)) {
+    const destAgentsDir = path.join(destDir, 'agents');
+    fs.mkdirSync(destAgentsDir, { recursive: true });
+    fs.copyFileSync(srcOpenAIYaml, path.join(destAgentsDir, 'openai.yaml'));
+  }
+
  return home;
 }

@@ -159,6 +168,7 @@ export async function runCodexSkill(opts: {
      durationMs: Date.now() - startTime,
      sessionId: null,
      rawLines: [],
+      stderr: '',
    };
  }

@@ -274,6 +284,7 @@ export async function runCodexSkill(opts: {
      durationMs,
      sessionId: parsed.sessionId,
      rawLines: collectedLines,
+      stderr,
    };
  } finally {
    // Clean up temp HOME
@@ -5,11 +5,13 @@
 * tests across multiple files by category.
 */

-import { describe, test, afterAll } from 'bun:test';
+import { describe, test, beforeAll, afterAll } from 'bun:test';
 import type { SkillTestResult } from './session-runner';
 import { EvalCollector, judgePassed } from './eval-store';
 import type { EvalTestEntry } from './eval-store';
-import { selectTests, detectBaseBranch, getChangedFiles, E2E_TOUCHFILES, GLOBAL_TOUCHFILES } from './touchfiles';
+import { selectTests, detectBaseBranch, getChangedFiles, E2E_TOUCHFILES, E2E_TIERS, GLOBAL_TOUCHFILES } from './touchfiles';
+import { WorktreeManager } from '../../lib/worktree';
+import type { HarvestResult } from '../../lib/worktree';
 import { spawnSync } from 'child_process';
 import * as fs from 'fs';
 import * as path from 'path';
@@ -30,13 +32,6 @@ export const evalsEnabled = !!process.env.EVALS;
 // Set EVALS_ALL=1 to force all tests. Set EVALS_BASE to override base branch.
 export let selectedTests: string[] | null = null; // null = run all

-// EVALS_FAST: skip the 8 slowest tests (all Opus quality tests) for quick feedback
-const FAST_EXCLUDED_TESTS = [
-  'plan-ceo-review-selective', 'plan-ceo-review', 'retro', 'retro-base-branch',
-  'design-consultation-core', 'design-consultation-existing',
-  'qa-fix-loop', 'design-review-fix',
-];
-
 if (evalsEnabled && !process.env.EVALS_ALL) {
  const baseBranch = process.env.EVALS_BASE
    || detectBaseBranch(ROOT)
@@ -55,15 +50,22 @@ if (evalsEnabled && !process.env.EVALS_ALL) {
  // If changedFiles is empty (e.g., on main branch), selectedTests stays null → run all
 }

-// Apply EVALS_FAST filter after diff-based selection
-if (evalsEnabled && process.env.EVALS_FAST) {
+// EVALS_TIER: filter tests by tier after diff-based selection.
+// 'gate' = gate tests only (CI default — blocks merge)
+// 'periodic' = periodic tests only (weekly cron / manual)
+// not set = run all selected tests (local dev default, backward compat); any other value matches no tier and selects zero tests
+if (evalsEnabled && process.env.EVALS_TIER) {
+  const tier = process.env.EVALS_TIER as 'gate' | 'periodic';
+  const tierTests = Object.entries(E2E_TIERS)
+    .filter(([, t]) => t === tier)
+    .map(([name]) => name);
+
  if (selectedTests === null) {
-    // Run all minus excluded
-    selectedTests = Object.keys(E2E_TOUCHFILES).filter(t => !FAST_EXCLUDED_TESTS.includes(t));
+    selectedTests = tierTests;
  } else {
-    selectedTests = selectedTests.filter(t => !FAST_EXCLUDED_TESTS.includes(t));
+    selectedTests = selectedTests.filter(t => tierTests.includes(t));
  }
-  process.stderr.write(`EVALS_FAST: excluded ${FAST_EXCLUDED_TESTS.length} slow tests, running ${selectedTests.length}\n\n`);
+  process.stderr.write(`EVALS_TIER=${tier}: ${selectedTests.length} tests\n\n`);
 }

 export const describeE2E = evalsEnabled ? describe : describe.skip;
@@ -205,7 +207,7 @@ export async function finalizeEvalCollector(evalCollector: EvalCollector | null)
 if (evalsEnabled) {
  const gstackDir = path.join(os.homedir(), '.gstack');
  fs.mkdirSync(gstackDir, { recursive: true });
-  for (const f of ['.completeness-intro-seen', '.telemetry-prompted']) {
+  for (const f of ['.completeness-intro-seen', '.telemetry-prompted', '.proactive-prompted']) {
    const p = path.join(gstackDir, f);
    if (!fs.existsSync(p)) fs.writeFileSync(p, '');
  }
@@ -234,6 +236,59 @@ export function testConcurrentIfSelected(testName: string, fn: () => Promise<voi
  (shouldRun ? test.concurrent : test.skip)(testName, fn, timeout);
 }

+// --- Worktree isolation ---
+
+let worktreeManager: WorktreeManager | null = null;
+
+export function getWorktreeManager(): WorktreeManager {
+  if (!worktreeManager) {
+    worktreeManager = new WorktreeManager();
+    worktreeManager.pruneStale();
+  }
+  return worktreeManager;
+}
+
+/** Create an isolated worktree for a test. Returns the worktree path. */
+export function createTestWorktree(testName: string): string {
+  return getWorktreeManager().create(testName);
+}
+
+/** Harvest changes and clean up. Call in afterAll(). Returns the HarvestResult for eval integration, or null when harvest() yields nothing. */
+export function harvestAndCleanup(testName: string): HarvestResult | null {
+  const mgr = getWorktreeManager();
+  const result = mgr.harvest(testName);
+  if (result) {
+    if (result.isDuplicate) {
+      process.stderr.write(`\n  HARVEST [${testName}]: duplicate patch (skipped)\n`);
+    } else {
+      process.stderr.write(`\n  HARVEST [${testName}]: ${result.changedFiles.length} files changed\n`);
+      process.stderr.write(`  Patch: ${result.patchPath}\n`);
+      process.stderr.write(`  ${result.diffStat}\n\n`);
+    }
+  }
+  mgr.cleanup(testName);
+  return result;
+}
+
+/**
+ * Convenience: describe block with automatic worktree isolation + harvest.
+ * Any test file can use this to get real repo context instead of a tmpdir.
+ * Note: tests with planted-bug fixtures should NOT use this — they need their fixture repos.
+ */
+export function describeWithWorktree(
+  name: string,
+  testNames: string[],
+  fn: (getWorktreePath: () => string) => void,
+) {
+  describeIfSelected(name, testNames, () => {
+    let worktreePath: string;
+    beforeAll(() => { worktreePath = createTestWorktree(name); });
+    afterAll(() => { harvestAndCleanup(name); });
+    fn(() => worktreePath);
+  });
+}
+
 export { judgePassed } from './eval-store';
 export { EvalCollector } from './eval-store';
 export type { EvalTestEntry } from './eval-store';
+export type { HarvestResult } from '../../lib/worktree';
@@ -2,7 +2,7 @@
 * Eval result persistence and comparison.
 *
 * EvalCollector accumulates test results, writes them to
- * ~/.gstack-dev/evals/{version}-{branch}-{tier}-{timestamp}.json,
+ * ~/.gstack/projects/$SLUG/evals/{version}-{branch}-{tier}-{timestamp}.json,
 * prints a summary table, and auto-compares with the previous run.
 *
 * Comparison functions are exported for reuse by the eval:compare CLI.
@@ -16,7 +16,32 @@ import { getGitInfo as getGitInfoShared, getVersion as getVersionShared } from '
 import type { CostEntry } from '../../lib/eval-format';

 const SCHEMA_VERSION = 1;
-const DEFAULT_EVAL_DIR = path.join(os.homedir(), '.gstack-dev', 'evals');
+const LEGACY_EVAL_DIR = path.join(os.homedir(), '.gstack-dev', 'evals');
+
+/**
+ * Detect project-scoped eval dir via gstack-slug, creating the directory if it is missing.
+ * Falls back to legacy ~/.gstack-dev/evals/ if slug detection fails.
+ */
+export function getProjectEvalDir(): string {
+  try {
+    // Try repo-local gstack-slug first, then global install
+    const localSlug = spawnSync('bash', ['-c', '.claude/skills/gstack/bin/gstack-slug 2>/dev/null || ~/.claude/skills/gstack/bin/gstack-slug 2>/dev/null'], {
+      stdio: 'pipe', timeout: 3000,
+    });
+    const output = localSlug.stdout?.toString().trim();
+    if (output) {
+      const slugMatch = output.match(/^SLUG=(.+)$/m);
+      if (slugMatch && slugMatch[1]) {
+        const dir = path.join(os.homedir(), '.gstack', 'projects', slugMatch[1], 'evals');
+        fs.mkdirSync(dir, { recursive: true });
+        return dir;
+      }
+    }
+  } catch { /* fall through */ }
+  return LEGACY_EVAL_DIR;
+}
+
+const DEFAULT_EVAL_DIR = getProjectEvalDir();

 // --- Interfaces ---

@@ -60,6 +85,13 @@ export interface EvalTestEntry {
  costs?: CostEntry[];

  error?: string;
+
+  // Worktree harvest data
+  harvest?: {
+    filesChanged: number;
+    patchPath: string;
+    isDuplicate: boolean;
+  };
 }

 export interface EvalResult {
@@ -9,15 +9,23 @@
 import * as fs from 'fs';
 import * as path from 'path';
 import * as os from 'os';
-import { atomicWriteSync, sanitizeForFilename, GSTACK_DEV_DIR } from '../../lib/util';
+import { getProjectEvalDir } from './eval-store';
 import type { CostEntry } from '../../lib/eval-format';
-import { resolveTier, tierToModel } from '../../lib/eval-tier';

-const HEARTBEAT_PATH = path.join(GSTACK_DEV_DIR, 'e2e-live.json');
+const GSTACK_DEV_DIR = path.join(os.homedir(), '.gstack-dev');
+const HEARTBEAT_PATH = path.join(GSTACK_DEV_DIR, 'e2e-live.json'); // heartbeat stays global
+const PROJECT_DIR = path.dirname(getProjectEvalDir()); // ~/.gstack/projects/$SLUG/, or ~/.gstack-dev/ when slug detection falls back to the legacy dir

 /** Sanitize test name for use as filename: strip leading slashes, replace / with - */
 export function sanitizeTestName(name: string): string {
-  return sanitizeForFilename(name);
+  return name.replace(/^\/+/, '').replace(/\//g, '-');
+}
+
+/** Atomic write: write to .tmp then rename. Errors from writeFileSync/renameSync propagate — wrap the call in try/catch if failure must be non-fatal. */
+function atomicWriteSync(filePath: string, data: string): void {
+  const tmp = filePath + '.tmp';
+  fs.writeFileSync(tmp, data);
+  fs.renameSync(tmp, filePath);
 }

 export interface CostEstimate {
@@ -140,15 +148,13 @@ export async function runSkillTest(options: {
  const safeName = testName ? sanitizeTestName(testName) : null;
  if (runId) {
    try {
-      runDir = path.join(GSTACK_DEV_DIR, 'e2e-runs', runId);
+      runDir = path.join(PROJECT_DIR, 'e2e-runs', runId);
      fs.mkdirSync(runDir, { recursive: true });
    } catch { /* non-fatal */ }
  }

  // Spawn claude -p with streaming NDJSON output. Prompt piped via stdin to
  // avoid shell escaping issues. --verbose is required for stream-json mode.
-  // Model pinned via EVAL_TIER env var (default: sonnet).
-  const evalModel = tierToModel(resolveTier());
  const args = [
    '-p',
    '--model', model,
@@ -32,25 +32,25 @@ export function matchGlob(file: string, pattern: string): boolean {
 * Each test lists the file patterns that, if changed, require the test to run.
 */
 export const E2E_TOUCHFILES: Record<string, string[]> = {
-  // Browse core
-  'browse-basic':    ['browse/src/**'],
-  'browse-snapshot': ['browse/src/**'],
+  // Browse core (+ test-server dependency)
+  'browse-basic':    ['browse/src/**', 'browse/test/test-server.ts'],
+  'browse-snapshot': ['browse/src/**', 'browse/test/test-server.ts'],

-  // SKILL.md setup + preamble (depend on ROOT SKILL.md only)
-  'skillmd-setup-discovery':  ['SKILL.md', 'SKILL.md.tmpl'],
-  'skillmd-no-local-binary':  ['SKILL.md', 'SKILL.md.tmpl'],
-  'skillmd-outside-git':      ['SKILL.md', 'SKILL.md.tmpl'],
+  // SKILL.md setup + preamble (depend on ROOT SKILL.md + gen-skill-docs)
+  'skillmd-setup-discovery':  ['SKILL.md', 'SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'],
+  'skillmd-no-local-binary':  ['SKILL.md', 'SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'],
+  'skillmd-outside-git':      ['SKILL.md', 'SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'],

  'contributor-mode':           ['SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'],
-  'session-awareness':        ['SKILL.md', 'SKILL.md.tmpl'],
+  'session-awareness':        ['SKILL.md', 'SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'],

-  // QA
-  'qa-quick':       ['qa/**', 'browse/src/**'],
-  'qa-b6-static':   ['qa/**', 'browse/src/**', 'browse/test/fixtures/qa-eval.html', 'test/fixtures/qa-eval-ground-truth.json'],
-  'qa-b7-spa':      ['qa/**', 'browse/src/**', 'browse/test/fixtures/qa-eval-spa.html', 'test/fixtures/qa-eval-spa-ground-truth.json'],
-  'qa-b8-checkout': ['qa/**', 'browse/src/**', 'browse/test/fixtures/qa-eval-checkout.html', 'test/fixtures/qa-eval-checkout-ground-truth.json'],
+  // QA (+ test-server dependency)
+  'qa-quick':       ['qa/**', 'browse/src/**', 'browse/test/test-server.ts'],
+  'qa-b6-static':   ['qa/**', 'browse/src/**', 'browse/test/test-server.ts', 'test/helpers/llm-judge.ts', 'browse/test/fixtures/qa-eval.html', 'test/fixtures/qa-eval-ground-truth.json'],
+  'qa-b7-spa':      ['qa/**', 'browse/src/**', 'browse/test/test-server.ts', 'test/helpers/llm-judge.ts', 'browse/test/fixtures/qa-eval-spa.html', 'test/fixtures/qa-eval-spa-ground-truth.json'],
+  'qa-b8-checkout': ['qa/**', 'browse/src/**', 'browse/test/test-server.ts', 'test/helpers/llm-judge.ts', 'browse/test/fixtures/qa-eval-checkout.html', 'test/fixtures/qa-eval-checkout-ground-truth.json'],
  'qa-only-no-fix': ['qa-only/**', 'qa/templates/**'],
-  'qa-fix-loop':    ['qa/**', 'browse/src/**'],
+  'qa-fix-loop':    ['qa/**', 'browse/src/**', 'browse/test/test-server.ts'],
  'qa-bootstrap':   ['qa/**', 'ship/**'],

  // Review
@@ -68,58 +68,94 @@ export const E2E_TOUCHFILES: Record<string, string[]> = {
  'plan-ceo-review-benefits':  ['plan-ceo-review/**', 'scripts/gen-skill-docs.ts'],
  'plan-eng-review':           ['plan-eng-review/**'],
  'plan-eng-review-artifact':  ['plan-eng-review/**'],
+  'plan-review-report':        ['plan-eng-review/**', 'scripts/gen-skill-docs.ts'],
+
+  // Codex offering verification
+  'codex-offered-office-hours':  ['office-hours/**', 'scripts/gen-skill-docs.ts'],
+  'codex-offered-ceo-review':    ['plan-ceo-review/**', 'scripts/gen-skill-docs.ts'],
+  'codex-offered-design-review': ['plan-design-review/**', 'scripts/gen-skill-docs.ts'],
+  'codex-offered-eng-review':    ['plan-eng-review/**', 'scripts/gen-skill-docs.ts'],

  // Ship
-  'ship-base-branch':    ['ship/**'],
+  'ship-base-branch': ['ship/**', 'bin/gstack-repo-mode'],
  'ship-local-workflow': ['ship/**', 'scripts/gen-skill-docs.ts'],
-
-  // Setup browser cookies
-  'setup-cookies-detect': ['setup-browser-cookies/**'],
+  'review-dashboard-via': ['ship/**', 'scripts/resolvers/review.ts', 'codex/**', 'autoplan/**', 'land-and-deploy/**'],
+  'ship-plan-completion': ['ship/**', 'scripts/gen-skill-docs.ts'],
+  'ship-plan-verification': ['ship/**', 'scripts/gen-skill-docs.ts'],

  // Retro
  'retro':             ['retro/**'],
  'retro-base-branch': ['retro/**'],

+  // Global discover
+  'global-discover':   ['bin/gstack-global-discover.ts', 'test/global-discover.test.ts'],
+
+  // CSO
+  'cso-full-audit':   ['cso/**'],
+  'cso-diff-mode':    ['cso/**'],
+  'cso-infra-scope':  ['cso/**'],
+
  // Document-release
  'document-release': ['document-release/**'],

  // Codex (Claude E2E — tests /codex skill via Claude)
  'codex-review': ['codex/**'],

-  // Codex E2E (tests skills via Codex CLI)
-  'codex-discover-skill':  ['codex/**', '.agents/skills/**', 'test/helpers/codex-session-runner.ts'],
-  'codex-review-findings': ['review/**', '.agents/skills/gstack-review/**', 'codex/**', 'test/helpers/codex-session-runner.ts'],
+  // Codex E2E (tests skills via Codex CLI + worktree)
+  'codex-discover-skill':  ['codex/**', '.agents/skills/**', 'test/helpers/codex-session-runner.ts', 'lib/worktree.ts'],
+  'codex-review-findings': ['review/**', '.agents/skills/gstack-review/**', 'codex/**', 'test/helpers/codex-session-runner.ts', 'lib/worktree.ts'],

-  // Gemini E2E (tests skills via Gemini CLI)
-  'gemini-discover-skill':  ['.agents/skills/**', 'test/helpers/gemini-session-runner.ts'],
-  'gemini-review-findings': ['review/**', '.agents/skills/gstack-review/**', 'test/helpers/gemini-session-runner.ts'],
+  // Gemini E2E (tests skills via Gemini CLI + worktree)
+  'gemini-discover-skill':  ['.agents/skills/**', 'test/helpers/gemini-session-runner.ts', 'lib/worktree.ts'],
+  'gemini-review-findings': ['review/**', '.agents/skills/gstack-review/**', 'test/helpers/gemini-session-runner.ts', 'lib/worktree.ts'],


-  // Ship coverage audit
-  'ship-coverage-audit': ['ship/**'],
+  // Coverage audit (shared fixture) + triage + gates
+  'ship-coverage-audit': ['ship/**', 'test/fixtures/coverage-audit-fixture.ts', 'bin/gstack-repo-mode'],
+  'review-coverage-audit': ['review/**', 'test/fixtures/coverage-audit-fixture.ts'],
+  'plan-eng-coverage-audit': ['plan-eng-review/**', 'test/fixtures/coverage-audit-fixture.ts'],
+  'ship-triage': ['ship/**', 'bin/gstack-repo-mode'],
+
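+  // Plan completion audit + verification (NOTE: 'ship-plan-completion' and 'ship-plan-verification' are also defined under "Ship" above — duplicate keys; these later entries win)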
+  'ship-plan-completion': ['ship/**', 'scripts/gen-skill-docs.ts'],
+  'ship-plan-verification': ['ship/**', 'qa-only/**', 'scripts/gen-skill-docs.ts'],
+  'review-plan-completion': ['review/**', 'scripts/gen-skill-docs.ts'],

  // Design
-  'design-consultation-core':       ['design-consultation/**'],
-  'design-consultation-existing':   ['design-consultation/**'],
-  'design-consultation-research':   ['design-consultation/**'],
-  'design-consultation-preview':    ['design-consultation/**'],
-  'plan-design-review-plan-mode':   ['plan-design-review/**'],
-  'plan-design-review-no-ui-scope': ['plan-design-review/**'],
-  'design-review-fix':              ['design-review/**', 'browse/src/**'],
+  'design-consultation-core':       ['design-consultation/**', 'scripts/gen-skill-docs.ts', 'test/helpers/llm-judge.ts'],
+  'design-consultation-existing':   ['design-consultation/**', 'scripts/gen-skill-docs.ts'],
+  'design-consultation-research':   ['design-consultation/**', 'scripts/gen-skill-docs.ts'],
+  'design-consultation-preview':    ['design-consultation/**', 'scripts/gen-skill-docs.ts'],
+  'plan-design-review-plan-mode':   ['plan-design-review/**', 'scripts/gen-skill-docs.ts'],
+  'plan-design-review-no-ui-scope': ['plan-design-review/**', 'scripts/gen-skill-docs.ts'],
+  'design-review-fix':              ['design-review/**', 'browse/src/**', 'scripts/gen-skill-docs.ts'],
+
+  // Design Shotgun
+  'design-shotgun-path':            ['design-shotgun/**', 'design/src/**', 'scripts/resolvers/design.ts'],
+  'design-shotgun-session':         ['design-shotgun/**', 'scripts/resolvers/design.ts'],
+  'design-shotgun-full':            ['design-shotgun/**', 'design/src/**', 'browse/src/**'],

  // gstack-upgrade
  'gstack-upgrade-happy-path': ['gstack-upgrade/**'],

  // Deploy skills
-  'land-and-deploy-workflow':   ['land-and-deploy/**', 'scripts/gen-skill-docs.ts'],
-  'canary-workflow':            ['canary/**', 'browse/src/**'],
-  'benchmark-workflow':         ['benchmark/**', 'browse/src/**'],
-  'setup-deploy-workflow':      ['setup-deploy/**', 'scripts/gen-skill-docs.ts'],
+  'land-and-deploy-workflow':      ['land-and-deploy/**', 'scripts/gen-skill-docs.ts'],
+  'land-and-deploy-first-run':     ['land-and-deploy/**', 'scripts/gen-skill-docs.ts', 'bin/gstack-slug'],
+  'land-and-deploy-review-gate':   ['land-and-deploy/**', 'bin/gstack-review-read'],
+  'canary-workflow':               ['canary/**', 'browse/src/**'],
+  'benchmark-workflow':            ['benchmark/**', 'browse/src/**'],
+  'setup-deploy-workflow':         ['setup-deploy/**', 'scripts/gen-skill-docs.ts'],
+
+  // Sidebar agent
+  'sidebar-navigate':              ['browse/src/server.ts', 'browse/src/sidebar-agent.ts', 'browse/src/sidebar-utils.ts', 'extension/**'],
+  'sidebar-url-accuracy':          ['browse/src/server.ts', 'browse/src/sidebar-agent.ts', 'browse/src/sidebar-utils.ts', 'extension/background.js'],
+
+  // Autoplan
+  'autoplan-core':  ['autoplan/**', 'plan-ceo-review/**', 'plan-eng-review/**', 'plan-design-review/**'],

  // Skill routing — journey-stage tests (depend on ALL skill descriptions)
  'journey-ideation':       ['*/SKILL.md.tmpl', 'SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'],
  'journey-plan-eng':       ['*/SKILL.md.tmpl', 'SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'],
-  'journey-think-bigger':   ['*/SKILL.md.tmpl', 'SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'],
  'journey-debug':          ['*/SKILL.md.tmpl', 'SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'],
  'journey-qa':             ['*/SKILL.md.tmpl', 'SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'],
  'journey-code-review':    ['*/SKILL.md.tmpl', 'SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'],
@@ -130,6 +166,133 @@ export const E2E_TOUCHFILES: Record<string, string[]> = {
  'journey-visual-qa':      ['*/SKILL.md.tmpl', 'SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'],
 };

+/**
+ * E2E test tiers — 'gate' blocks PRs, 'periodic' runs weekly/on-demand.
+ * Must have exactly the same keys as E2E_TOUCHFILES.
+ */
+export const E2E_TIERS: Record<string, 'gate' | 'periodic'> = {
+  // Browse core — gate (if browse breaks, everything breaks)
+  'browse-basic': 'gate',
+  'browse-snapshot': 'gate',
+
+  // SKILL.md setup — gate (if setup breaks, no skill works)
+  'skillmd-setup-discovery': 'gate',
+  'skillmd-no-local-binary': 'gate',
+  'skillmd-outside-git': 'gate',
+  'contributor-mode': 'gate',
+  'session-awareness': 'gate',
+
+  // QA — gate for functional, periodic for quality/benchmarks
+  'qa-quick': 'gate',
+  'qa-b6-static': 'periodic',
+  'qa-b7-spa': 'periodic',
+  'qa-b8-checkout': 'periodic',
+  'qa-only-no-fix': 'gate',     // CRITICAL guardrail: Edit tool forbidden
+  'qa-fix-loop': 'periodic',
+  'qa-bootstrap': 'gate',
+
+  // Review — gate for functional/guardrails, periodic for quality
+  'review-sql-injection': 'gate',     // Security guardrail
+  'review-enum-completeness': 'gate',
+  'review-base-branch': 'gate',
+  'review-design-lite': 'periodic',   // 4/7 threshold is subjective
+  'review-coverage-audit': 'gate',
+  'review-plan-completion': 'gate',
+  'review-dashboard-via': 'gate',
+
+  // Office Hours
+  'office-hours-spec-review': 'gate',
+
+  // Plan reviews — gate for cheap functional, periodic for Opus quality
+  'plan-ceo-review': 'periodic',
+  'plan-ceo-review-selective': 'periodic',
+  'plan-ceo-review-benefits': 'gate',
+  'plan-eng-review': 'periodic',
+  'plan-eng-review-artifact': 'periodic',
+  'plan-eng-coverage-audit': 'gate',
+  'plan-review-report': 'gate',
+
+  // Codex offering verification
+  'codex-offered-office-hours': 'gate',
+  'codex-offered-ceo-review': 'gate',
+  'codex-offered-design-review': 'gate',
+  'codex-offered-eng-review': 'gate',
+
+  // Ship — gate (end-to-end ship path)
+  'ship-base-branch': 'gate',
+  'ship-local-workflow': 'gate',
+  'ship-coverage-audit': 'gate',
+  'ship-triage': 'gate',
+  'ship-plan-completion': 'gate',
+  'ship-plan-verification': 'gate',
+
+  // Retro — gate for cheap branch detection, periodic for full Opus retro
+  'retro': 'periodic',
+  'retro-base-branch': 'gate',
+
+  // Global discover
+  'global-discover': 'gate',
+
+  // CSO — gate for security guardrails, periodic for quality
+  'cso-full-audit': 'gate',      // Hardcoded secrets detection
+  'cso-diff-mode': 'gate',
+  'cso-infra-scope': 'periodic',
+
+  // Document-release — gate (CHANGELOG guardrail)
+  'document-release': 'gate',
+
+  // Codex — periodic (Opus, requires codex CLI)
+  'codex-review': 'periodic',
+
+  // Multi-AI — periodic (require external CLIs)
+  'codex-discover-skill': 'periodic',
+  'codex-review-findings': 'periodic',
+  'gemini-discover-skill': 'periodic',
+  'gemini-review-findings': 'periodic',
+
+  // Design — gate for cheap functional, periodic for Opus/quality
+  'design-consultation-core': 'periodic',
+  'design-consultation-existing': 'periodic',
+  'design-consultation-research': 'gate',
+  'design-consultation-preview': 'gate',
+  'plan-design-review-plan-mode': 'periodic',
+  'plan-design-review-no-ui-scope': 'gate',
+  'design-review-fix': 'periodic',
+  'design-shotgun-path': 'gate',
+  'design-shotgun-session': 'gate',
+  'design-shotgun-full': 'periodic',
+
+  // gstack-upgrade
+  'gstack-upgrade-happy-path': 'gate',
+
+  // Deploy skills
+  'land-and-deploy-workflow': 'gate',
+  'land-and-deploy-first-run': 'gate',
+  'land-and-deploy-review-gate': 'gate',
+  'canary-workflow': 'gate',
+  'benchmark-workflow': 'gate',
+  'setup-deploy-workflow': 'gate',
+
+  // Sidebar agent
+  'sidebar-navigate': 'periodic',
+  'sidebar-url-accuracy': 'periodic',
+
+  // Autoplan — periodic (not yet implemented)
+  'autoplan-core': 'periodic',
+
+  // Skill routing — periodic (LLM routing is non-deterministic)
+  'journey-ideation': 'periodic',
+  'journey-plan-eng': 'periodic',
+  'journey-debug': 'periodic',
+  'journey-qa': 'periodic',
+  'journey-code-review': 'periodic',
+  'journey-ship': 'periodic',
+  'journey-docs': 'periodic',
+  'journey-retro': 'periodic',
+  'journey-design-system': 'periodic',
+  'journey-visual-qa': 'periodic',
+};
+
 /**
 * LLM-judge test touchfiles — keyed by test description string.
 */
@@ -172,20 +335,22 @@ export const LLM_JUDGE_TOUCHFILES: Record<string, string[]> = {
  'retro/SKILL.md instructions':          ['retro/SKILL.md', 'retro/SKILL.md.tmpl'],
  'qa-only/SKILL.md workflow':            ['qa-only/SKILL.md', 'qa-only/SKILL.md.tmpl'],
  'gstack-upgrade/SKILL.md upgrade flow': ['gstack-upgrade/SKILL.md', 'gstack-upgrade/SKILL.md.tmpl'],
+
+  // Voice directive
+  'voice directive tone':                 ['scripts/resolvers/preamble.ts', 'review/SKILL.md', 'review/SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'],
 };

 /**
 * Changes to any of these files trigger ALL tests (both E2E and LLM-judge).
+ *
+ * Keep this list minimal — only files that genuinely affect every test.
+ * Scoped dependencies (gen-skill-docs, llm-judge, test-server, worktree,
+ * codex/gemini session runners) belong in individual test entries instead.
 */
 export const GLOBAL_TOUCHFILES = [
-  'test/helpers/session-runner.ts',
-  'test/helpers/codex-session-runner.ts',
-  'test/helpers/gemini-session-runner.ts',
-  'test/helpers/eval-store.ts',
-  'test/helpers/llm-judge.ts',
-  'scripts/gen-skill-docs.ts',
-  'test/helpers/touchfiles.ts',
-  'browse/test/test-server.ts',
+  'test/helpers/session-runner.ts',  // All E2E tests use this runner
+  'test/helpers/eval-store.ts',      // All E2E tests store results here
+  'test/helpers/touchfiles.ts',      // Self-referential — misclassifying a test here is dangerous, so any change reruns everything
 ];

 // --- Base branch detection ---