gstack/test/helpers/touchfiles.ts

/**
 * Diff-based test selection for E2E and LLM-judge evals.
 *
 * Each test declares which source files it depends on ("touchfiles").
 * The test runner checks `git diff` and only runs tests whose
 * dependencies were modified. Override with EVALS_ALL=1 to run everything.
 */

import { spawnSync } from 'child_process';

// --- Glob matching ---

/**
 * Match a file path against a glob pattern.
 *
 * Supported wildcards:
 *   ** — match any number of path segments (any characters, including /)
 *   *  — match within a single segment (no /)
 *
 * Every other character is matched literally. Regex metacharacters
 * (`.`, `+`, `?`, `^`, `$`, `(`, `)`, `[`, `]`, `{`, `}`, `|`, `\`) are
 * escaped, so a pattern such as `a+b.ts` or `app/(group)/page.ts` is never
 * misread as regex syntax and cannot make the RegExp constructor throw.
 *
 * The match is anchored at both ends: the whole path must match the pattern.
 *
 * @param file - Path to test; `/` is treated as the segment separator.
 * @param pattern - Glob pattern using the wildcards described above.
 * @returns `true` when the entire `file` path matches `pattern`.
 */
export function matchGlob(file: string, pattern: string): boolean {
  // Single pass over the pattern: the alternation tries `**` before `*`, so a
  // globstar is never split into two single-segment wildcards, and no
  // placeholder text is needed that could collide with the pattern itself.
  const regexStr = pattern.replace(
    /\*\*|\*|[.+?^${}()|[\]\\]/g,
    (token: string): string => {
      if (token === '**') return '.*';
      if (token === '*') return '[^/]*';
      // Anything else reaching here is a regex metacharacter: match it literally.
      return '\\' + token;
    },
  );
  return new RegExp(`^${regexStr}$`).test(file);
}

// --- Touchfile maps ---

/**
 * E2E test touchfiles — keyed by testName (the string passed to runSkillTest).
 * Each test lists the file patterns that, if changed, require the test to run.
 */
export const E2E_TOUCHFILES: Record<string, string[]> = {
  // Browse core (+ test-server dependency)
  'browse-basic':    ['browse/src/**', 'browse/test/test-server.ts'],
  'browse-snapshot': ['browse/src/**', 'browse/test/test-server.ts'],

  // Hermetic isolation canaries (hermetic-env.ts is also a GLOBAL touchfile;
  // these entries exist so the canaries themselves stay tier-classified)
  'hermetic-canary':   ['test/helpers/hermetic-env.ts', 'test/helpers/session-runner.ts', 'test/skill-e2e-hermetic-canary.test.ts', 'lib/conductor-env-shim.ts'],
  'hermetic-sentinel': ['test/helpers/hermetic-env.ts', 'test/helpers/session-runner.ts', 'test/skill-e2e-hermetic-canary.test.ts', 'lib/conductor-env-shim.ts'],

  // SKILL.md setup + preamble (depend on ROOT SKILL.md + gen-skill-docs)
  'skillmd-setup-discovery':  ['SKILL.md', 'SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'],
  'skillmd-no-local-binary':  ['SKILL.md', 'SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'],
  'skillmd-outside-git':      ['SKILL.md', 'SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'],

  'session-awareness':        ['SKILL.md', 'SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'],
  'operational-learning':     ['scripts/resolvers/preamble.ts', 'bin/gstack-learnings-log'],

  // QA (+ test-server dependency)
  'qa-quick':       ['qa/**', 'browse/src/**', 'browse/test/test-server.ts'],
  'qa-b6-static':   ['qa/**', 'browse/src/**', 'browse/test/test-server.ts', 'test/helpers/llm-judge.ts', 'browse/test/fixtures/qa-eval.html', 'test/fixtures/qa-eval-ground-truth.json'],
  'qa-b7-spa':      ['qa/**', 'browse/src/**', 'browse/test/test-server.ts', 'test/helpers/llm-judge.ts', 'browse/test/fixtures/qa-eval-spa.html', 'test/fixtures/qa-eval-spa-ground-truth.json'],
  'qa-b8-checkout': ['qa/**', 'browse/src/**', 'browse/test/test-server.ts', 'test/helpers/llm-judge.ts', 'browse/test/fixtures/qa-eval-checkout.html', 'test/fixtures/qa-eval-checkout-ground-truth.json'],
  'qa-only-no-fix': ['qa-only/**', 'qa/templates/**'],
  'qa-fix-loop':    ['qa/**', 'browse/src/**', 'browse/test/test-server.ts'],
  'qa-bootstrap':   ['qa/**', 'ship/**'],

  // Review
  'review-sql-injection':     ['review/**', 'test/fixtures/review-eval-vuln.rb'],
  'review-enum-completeness': ['review/**', 'test/fixtures/review-eval-enum*.rb'],
  'review-base-branch':       ['review/**'],
  'review-design-lite':       ['review/**', 'test/fixtures/review-eval-design-slop.*'],

  // Review Army (specialist dispatch)
  'review-army-migration-safety': ['review/**', 'scripts/resolvers/review-army.ts', 'bin/gstack-diff-scope'],
  'review-army-perf-n-plus-one':  ['review/**', 'scripts/resolvers/review-army.ts', 'bin/gstack-diff-scope'],
  'review-army-delivery-audit':   ['review/**', 'scripts/resolvers/review.ts', 'scripts/resolvers/review-army.ts'],
  'review-army-quality-score':    ['review/**', 'scripts/resolvers/review-army.ts'],
  'review-army-json-findings':    ['review/**', 'scripts/resolvers/review-army.ts'],
  'review-army-red-team':         ['review/**', 'scripts/resolvers/review-army.ts'],
  'review-army-consensus':        ['review/**', 'scripts/resolvers/review-army.ts'],

  // Office Hours
  'office-hours-spec-review':     ['office-hours/**', 'scripts/gen-skill-docs.ts'],
  'office-hours-forcing-energy':  ['office-hours/**', 'scripts/resolvers/preamble.ts', 'test/fixtures/mode-posture/**', 'test/helpers/llm-judge.ts'],
  'office-hours-builder-wildness': ['office-hours/**', 'scripts/resolvers/preamble.ts', 'test/fixtures/mode-posture/**', 'test/helpers/llm-judge.ts'],

  // Plan reviews
  'plan-ceo-review':                  ['plan-ceo-review/**'],
  'plan-ceo-review-selective':        ['plan-ceo-review/**'],
  'plan-ceo-review-benefits':         ['plan-ceo-review/**', 'scripts/gen-skill-docs.ts'],
  'plan-ceo-review-expansion-energy': ['plan-ceo-review/**', 'scripts/resolvers/preamble.ts', 'test/fixtures/mode-posture/**', 'test/helpers/llm-judge.ts'],
  'plan-eng-review':           ['plan-eng-review/**'],
  'plan-eng-review-artifact':  ['plan-eng-review/**'],
  'plan-review-report':        ['plan-eng-review/**', 'scripts/gen-skill-docs.ts'],

  // Plan-mode smoke tests — gate-tier safety regression tests. Each test file
  // contains TWO test cases as of v1.21: the baseline plan-mode case and the
  // AskUserQuestion-blocked regression case (--disallowedTools AskUserQuestion
  // parameterized — the flag set Conductor uses by default). Touchfiles
  // include question-tuning.ts and generate-ask-user-format.ts because the
  // AUTO_DECIDE preamble injection lives there and changes can flip the
  // regression test outcome between 'asked' and 'auto_decided'.
  'plan-ceo-review-plan-mode':    ['plan-ceo-review/**', 'scripts/resolvers/preamble/generate-completion-status.ts', 'scripts/resolvers/question-tuning.ts', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble.ts', 'scripts/resolvers/review.ts', 'test/helpers/claude-pty-runner.ts'],
  'plan-eng-review-plan-mode':    ['plan-eng-review/**', 'scripts/resolvers/preamble/generate-completion-status.ts', 'scripts/resolvers/question-tuning.ts', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble.ts', 'scripts/resolvers/review.ts', 'test/helpers/claude-pty-runner.ts'],
  'plan-design-review-plan-mode': ['plan-design-review/**', 'scripts/resolvers/preamble/generate-completion-status.ts', 'scripts/resolvers/question-tuning.ts', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble.ts', 'scripts/resolvers/review.ts', 'test/helpers/claude-pty-runner.ts'],
  'plan-devex-review-plan-mode':  ['plan-devex-review/**', 'scripts/resolvers/preamble/generate-completion-status.ts', 'scripts/resolvers/question-tuning.ts', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble.ts', 'scripts/resolvers/review.ts', 'test/helpers/claude-pty-runner.ts'],
  'plan-mode-no-op':              ['plan-ceo-review/**', 'scripts/resolvers/preamble/generate-completion-status.ts', 'scripts/resolvers/preamble.ts', 'test/helpers/claude-pty-runner.ts'],

  // v1.21+ AskUserQuestion-blocked regression tests — Conductor launches
  // claude with `--disallowedTools AskUserQuestion --permission-mode default`
  // (verified via `ps`); skills must still surface user-decisions through a
  // fallback path (mcp__conductor__AskUserQuestion or plan-file flow) rather
  // than silently auto-deciding. Parameterized regression test cases live
  // INSIDE the existing 4 plan-X-review-plan-mode test files (covered
  // transitively by the entries above). Two new standalone files exist for
  // skills with no prior plan-mode test:
  'office-hours-auto-mode':       ['office-hours/**', 'scripts/resolvers/preamble/generate-completion-status.ts', 'scripts/resolvers/question-tuning.ts', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble.ts', 'test/helpers/claude-pty-runner.ts'],
  'office-hours-phase4-fork':     ['office-hours/**', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble/generate-completion-status.ts', 'scripts/resolvers/preamble.ts', 'scripts/resolvers/question-tuning.ts', 'test/helpers/llm-judge.ts', 'test/skill-e2e-office-hours-phase4.test.ts'],
  'llm-judge-recommendation':     ['test/helpers/llm-judge.ts', 'test/llm-judge-recommendation.test.ts', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'codex/SKILL.md.tmpl', 'scripts/resolvers/review.ts'],
  // v1.21+ AUTO_DECIDE preserve eval (periodic). Verifies the Tool resolution
  // fix doesn't trip the legitimate /plan-tune opt-in path: when the user has
  // written a never-ask preference, AUQ should still auto-decide rather than
  // surfacing the question. Touches the question-tuning + preference
  // infrastructure plus the resolvers that own the AUTO_DECIDE preamble.
  'auto-decide-preserved':        ['scripts/resolvers/question-tuning.ts', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble/generate-preamble-bash.ts', 'scripts/resolvers/preamble/generate-completion-status.ts', 'plan-ceo-review/**', 'bin/gstack-question-preference', 'bin/gstack-config', 'bin/gstack-slug', 'hosts/claude/hooks/question-preference-hook.ts', 'lib/is-conductor.ts', 'test/helpers/claude-pty-runner.ts'],

  // Conductor → prose decision brief (Conductor signal makes prose the default;
  // the PreToolUse hook denies the flaky tool). Touches the resolver that owns
  // the Conductor rule, the preamble signal, the hook, and the detection helper.
  'conductor-prose':              ['scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble/generate-preamble-bash.ts', 'scripts/resolvers/preamble.ts', 'plan-eng-review/**', 'hosts/claude/hooks/question-preference-hook.ts', 'lib/is-conductor.ts', 'test/helpers/claude-pty-runner.ts', 'test/skill-e2e-conductor-prose.test.ts'],

  // Real-PTY E2E batch (#6 new tests on the harness).
  // Each one tests behavior the SDK harness can't observe (rendered TTY,
  // numbered-option lists, multi-phase ordering, idempotency state echo).
  'auq-format-gate':                           ['plan-ceo-review/**', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble/generate-completeness-section.ts', 'scripts/resolvers/preamble.ts', 'test/helpers/auq-sdk-capture.ts', 'test/helpers/session-runner.ts', 'test/helpers/llm-judge.ts'],
  'plan-ceo-mode-routing':       ['plan-ceo-review/**', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble.ts', 'test/helpers/claude-pty-runner.ts'],
  'plan-design-with-ui-scope':   ['plan-design-review/**', 'test/fixtures/plans/ui-heavy-feature.md', 'test/helpers/claude-pty-runner.ts'],
  'budget-regression-pty':       ['test/helpers/eval-store.ts', 'test/skill-budget-regression.test.ts'],
  'ship-idempotency-pty':        ['ship/**', 'bin/gstack-next-version', 'bin/gstack-version-bump', 'scripts/resolvers/sections.ts', 'lib/worktree.ts', 'test/helpers/claude-pty-runner.ts'],
  'ship-section-loading':        ['ship/**', 'scripts/resolvers/sections.ts', 'scripts/gen-skill-docs.ts', 'test/helpers/auq-sdk-capture.ts', 'test/helpers/session-runner.ts'],
  'plan-ceo-section-loading':    ['plan-ceo-review/**', 'scripts/resolvers/sections.ts', 'scripts/gen-skill-docs.ts', 'test/helpers/auq-sdk-capture.ts', 'test/helpers/session-runner.ts'],
  // Data-driven behavioral guard for the 'plan'/'prompt' carves (eng, design,
  // devex, office-hours + future PR2 carves). One file iterating CARVE_GUARDS;
  // the selector sets GSTACK_CARVE_SKILL=<name> to scope cost to the changed
  // skill (D-CODEX A). Touching the registry/helper or sections.ts runs all.
  'carve-section-loading':       ['plan-eng-review/**', 'plan-design-review/**', 'plan-devex-review/**', 'office-hours/**', 'document-release/**', 'design-consultation/**', 'cso/**', 'test/helpers/carve-guards.ts', 'scripts/resolvers/sections.ts', 'scripts/gen-skill-docs.ts', 'test/helpers/auq-sdk-capture.ts', 'test/helpers/session-runner.ts'],
  'autoplan-chain-pty':          ['autoplan/**', 'plan-ceo-review/**', 'plan-design-review/**', 'plan-eng-review/**', 'plan-devex-review/**', 'test/fixtures/plans/ui-heavy-feature.md', 'test/helpers/claude-pty-runner.ts'],
  'e2e-harness-audit':            ['plan-ceo-review/**', 'plan-eng-review/**', 'plan-design-review/**', 'plan-devex-review/**', 'scripts/resolvers/preamble/generate-completion-status.ts', 'test/helpers/agent-sdk-runner.ts', 'test/helpers/claude-pty-runner.ts'],

  // Per-finding AskUserQuestion count + review-report-at-bottom assertion.
  // Each test drives its skill end-to-end; touchfiles include preamble +
  // completion-status resolvers because they affect question cadence and
  // terminal output (the regression surface this test catches).
  'plan-ceo-finding-count':      ['plan-ceo-review/**', 'scripts/resolvers/preamble.ts', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble/generate-completion-status.ts', 'test/helpers/claude-pty-runner.ts', 'test/skill-e2e-plan-ceo-finding-count.test.ts'],
  'plan-eng-finding-count':      ['plan-eng-review/**', 'scripts/resolvers/preamble.ts', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble/generate-completion-status.ts', 'test/helpers/claude-pty-runner.ts', 'test/skill-e2e-plan-eng-finding-count.test.ts'],
  'plan-design-finding-count':   ['plan-design-review/**', 'scripts/resolvers/preamble.ts', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble/generate-completion-status.ts', 'test/helpers/claude-pty-runner.ts', 'test/skill-e2e-plan-design-finding-count.test.ts'],
  'plan-devex-finding-count':    ['plan-devex-review/**', 'scripts/resolvers/preamble.ts', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble/generate-completion-status.ts', 'test/helpers/claude-pty-runner.ts', 'test/skill-e2e-plan-devex-finding-count.test.ts'],

  // Gate-tier reviewCount-floor counterparts. Catch the May 2026 transcript
  // bug (model wrote a plan-mode plan and ExitPlanMode'd without firing any
  // review-phase AskUserQuestion). Uses runPlanSkillFloorCheck — minimal
  // "did agent fire ANY AUQ?" observer that exits early on first non-permission
  // numbered-option render. ~1-3 min typical wall time per test, ~$2-6 total.
  'plan-eng-finding-floor':      ['plan-eng-review/**', 'scripts/resolvers/preamble.ts', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble/generate-completion-status.ts', 'scripts/resolvers/review.ts', 'test/helpers/claude-pty-runner.ts', 'test/fixtures/forcing-finding-seeds.ts', 'test/skill-e2e-plan-eng-finding-floor.test.ts'],
  'plan-ceo-finding-floor':      ['plan-ceo-review/**', 'scripts/resolvers/preamble.ts', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble/generate-completion-status.ts', 'scripts/resolvers/review.ts', 'test/helpers/claude-pty-runner.ts', 'test/fixtures/forcing-finding-seeds.ts', 'test/skill-e2e-plan-ceo-finding-floor.test.ts'],
  'plan-design-finding-floor':   ['plan-design-review/**', 'scripts/resolvers/preamble.ts', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble/generate-completion-status.ts', 'scripts/resolvers/review.ts', 'test/helpers/claude-pty-runner.ts', 'test/fixtures/forcing-finding-seeds.ts', 'test/skill-e2e-plan-design-finding-floor.test.ts'],
  'plan-devex-finding-floor':    ['plan-devex-review/**', 'scripts/resolvers/preamble.ts', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble/generate-completion-status.ts', 'scripts/resolvers/review.ts', 'test/helpers/claude-pty-runner.ts', 'test/fixtures/forcing-finding-seeds.ts', 'test/skill-e2e-plan-devex-finding-floor.test.ts'],

  // Multi-finding batching regression — periodic tier complement to the
  // gate-tier finding-floor. Catches the May 2026 transcript shape where
  // a model fires one AUQ then batches the rest into a "## Decisions to
  // confirm" plan write. runPlanSkillFloorCheck cannot detect that shape
  // (it exits on first AUQ); runPlanSkillCounting can.
  'plan-eng-multi-finding-batching': ['plan-eng-review/**', 'scripts/resolvers/preamble.ts', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble/generate-completion-status.ts', 'scripts/resolvers/review.ts', 'test/helpers/claude-pty-runner.ts', 'test/fixtures/forcing-finding-seeds.ts', 'test/skill-e2e-plan-eng-multi-finding-batching.test.ts'],
  'plan-ceo-split-overflow': ['plan-ceo-review/**', 'scripts/resolvers/preamble.ts', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'bin/gstack-question-preference', 'test/helpers/claude-pty-runner.ts', 'test/fixtures/forcing-finding-seeds.ts', 'test/skill-e2e-plan-ceo-split-overflow.test.ts'],
  'brain-privacy-gate':           ['scripts/resolvers/preamble/generate-brain-sync-block.ts', 'scripts/resolvers/preamble.ts', 'bin/gstack-brain-sync', 'bin/gstack-artifacts-init', 'bin/gstack-config', 'test/helpers/agent-sdk-runner.ts'],

  // /setup-gbrain Path 4 (Remote MCP) — happy + bad-token end-to-end via
  // Agent SDK. Gate-tier (deterministic stub server, fixed inputs); fires
  // when the skill template, the verify helper, the artifacts-init helper,
  // or the detect script changes.
  'setup-gbrain-remote':          ['setup-gbrain/SKILL.md.tmpl', 'bin/gstack-gbrain-mcp-verify', 'bin/gstack-artifacts-init', 'bin/gstack-gbrain-detect', 'test/helpers/agent-sdk-runner.ts'],
  'setup-gbrain-bad-token':       ['setup-gbrain/SKILL.md.tmpl', 'bin/gstack-gbrain-mcp-verify', 'test/helpers/agent-sdk-runner.ts'],
  // v1.34.0.0 split-engine Path 4 + Step 4.5 Yes (local PGLite for code).
  // Periodic-tier per codex #12 (AgentSDK harness is non-deterministic).
  // Fires when the setup-gbrain template, install/verify/init helpers, or
  // the agent-sdk-runner harness changes.
  'setup-gbrain-path4-local-pglite': ['setup-gbrain/SKILL.md.tmpl', 'bin/gstack-gbrain-mcp-verify', 'bin/gstack-gbrain-install', 'bin/gstack-gbrain-detect', 'lib/gbrain-local-status.ts', 'test/helpers/agent-sdk-runner.ts'],

  // AskUserQuestion format regression (RECOMMENDATION + Completeness: N/10)
  // Fires when either template OR the two preamble resolvers change.
  'plan-ceo-review-format-mode':      ['plan-ceo-review/**', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble/generate-completeness-section.ts', 'scripts/resolvers/preamble.ts', 'model-overlays/opus-4-7.md', 'test/helpers/llm-judge.ts'],
  'plan-ceo-review-format-approach':  ['plan-ceo-review/**', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble/generate-completeness-section.ts', 'scripts/resolvers/preamble.ts', 'model-overlays/opus-4-7.md', 'test/helpers/llm-judge.ts'],
  'plan-eng-review-format-coverage':  ['plan-eng-review/**', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble/generate-completeness-section.ts', 'scripts/resolvers/preamble.ts', 'model-overlays/opus-4-7.md', 'test/helpers/llm-judge.ts'],
  'plan-eng-review-format-kind':      ['plan-eng-review/**', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble/generate-completeness-section.ts', 'scripts/resolvers/preamble.ts', 'model-overlays/opus-4-7.md', 'test/helpers/llm-judge.ts'],

  // v1.7.0.0 Pros/Cons format cadence + format + negative-escape evals.
  // Dependencies: same as format-mode + the 4 plan-review templates + overlay.
  // All periodic-tier (non-deterministic Opus 4.7 behavior).
  'plan-ceo-review-prosons-cadence':  ['plan-ceo-review/**', 'plan-eng-review/**', 'plan-design-review/**', 'plan-devex-review/**', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble.ts', 'model-overlays/opus-4-7.md'],
  'plan-review-prosons-format':       ['plan-ceo-review/**', 'plan-eng-review/**', 'plan-design-review/**', 'plan-devex-review/**', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble.ts', 'model-overlays/opus-4-7.md'],
  'plan-review-prosons-hardstop-neg': ['plan-ceo-review/**', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble.ts', 'model-overlays/opus-4-7.md'],
  'plan-review-prosons-neutral-neg':  ['plan-ceo-review/**', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble.ts', 'model-overlays/opus-4-7.md'],

  // Expanded coverage (CT3) — 7 non-plan-review skills inherit Pros/Cons via preamble
  'ship-prosons-format':              ['ship/**', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble.ts', 'model-overlays/opus-4-7.md'],
  'office-hours-prosons-format':      ['office-hours/**', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble.ts', 'model-overlays/opus-4-7.md'],
  'investigate-prosons-format':       ['investigate/**', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble.ts', 'model-overlays/opus-4-7.md'],
  'qa-prosons-format':                ['qa/**', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble.ts', 'model-overlays/opus-4-7.md'],
  'review-prosons-format':            ['review/**', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble.ts', 'model-overlays/opus-4-7.md'],
  'design-review-prosons-format':     ['design-review/**', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble.ts', 'model-overlays/opus-4-7.md'],
  'document-release-prosons-format':  ['document-release/**', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble.ts', 'model-overlays/opus-4-7.md'],

  // /plan-tune (v1 observational)
  'plan-tune-inspect':         ['plan-tune/**', 'scripts/question-registry.ts', 'scripts/psychographic-signals.ts', 'scripts/one-way-doors.ts', 'bin/gstack-question-log', 'bin/gstack-question-preference', 'bin/gstack-developer-profile'],

  // /plan-tune cathedral (T16 — 5 E2E scenarios, all gate per D12)
  'plan-tune-hook-capture':      ['hosts/claude/hooks/**', 'bin/gstack-question-log', 'bin/gstack-developer-profile', 'plan-tune/**'],
  'plan-tune-enforcement':       ['hosts/claude/hooks/**', 'bin/gstack-question-preference', 'scripts/question-registry.ts'],
  'plan-tune-annotation':        ['hosts/claude/hooks/**', 'scripts/declared-annotation.ts', 'scripts/psychographic-signals.ts', 'scripts/question-registry.ts'],
  'plan-tune-codex-import':      ['bin/gstack-codex-session-import', 'bin/gstack-question-log', 'docs/spikes/codex-session-format.md'],
  'plan-tune-dream-cycle':       ['bin/gstack-distill-free-text', 'bin/gstack-distill-apply', 'hosts/claude/hooks/**', 'plan-tune/**'],

  // Codex offering verification
  'codex-offered-office-hours':  ['office-hours/**', 'scripts/gen-skill-docs.ts'],
  'codex-offered-ceo-review':    ['plan-ceo-review/**', 'scripts/gen-skill-docs.ts'],
  'codex-offered-design-review': ['plan-design-review/**', 'scripts/gen-skill-docs.ts'],
  'codex-offered-eng-review':    ['plan-eng-review/**', 'scripts/gen-skill-docs.ts'],

  // Ship
  'ship-base-branch': ['ship/**', 'bin/gstack-repo-mode'],
  'ship-local-workflow': ['ship/**', 'scripts/gen-skill-docs.ts'],
  'review-dashboard-via': ['ship/**', 'scripts/resolvers/review.ts', 'codex/**', 'autoplan/**', 'land-and-deploy/**'],
  'ship-plan-completion': ['ship/**', 'scripts/gen-skill-docs.ts'],
  'ship-plan-verification': ['ship/**', 'scripts/gen-skill-docs.ts'],

  // Retro
  'retro':             ['retro/**'],
  'retro-base-branch': ['retro/**'],

  // Global discover
  'global-discover':   ['bin/gstack-global-discover.ts', 'test/global-discover.test.ts'],

  // CSO
  'cso-full-audit':   ['cso/**'],
  'cso-diff-mode':    ['cso/**'],
  'cso-infra-scope':  ['cso/**'],

  // Learnings
  'learnings-show': ['learn/**', 'bin/gstack-learnings-search', 'bin/gstack-learnings-log', 'scripts/resolvers/learnings.ts'],

  // Session Intelligence (timeline, context recovery, /context-save + /context-restore)
  'timeline-event-flow':            ['bin/gstack-timeline-log', 'bin/gstack-timeline-read'],
  'context-recovery-artifacts':     ['scripts/resolvers/preamble.ts', 'bin/gstack-timeline-log', 'bin/gstack-slug', 'learn/**'],
  'context-save-writes-file':       ['context-save/**', 'bin/gstack-slug'],
  'context-restore-loads-latest':   ['context-restore/**', 'bin/gstack-slug'],

  // Context skills E2E (live-fire, Skill-tool routing path) — see
  // test/skill-e2e-context-skills.test.ts. These are periodic-tier because
  // each one spawns claude -p and costs ~$0.20-$0.40. Collectively they
  // verify the thing the /checkpoint → /context-save rename was for.
  'context-save-routing':                  ['context-save/**', 'scripts/resolvers/preamble.ts'],
  'context-save-then-restore-roundtrip':   ['context-save/**', 'context-restore/**', 'bin/gstack-slug'],
  'context-restore-fragment-match':        ['context-restore/**'],
  'context-restore-empty-state':           ['context-restore/**'],
  'context-restore-list-delegates':        ['context-restore/**'],
  'context-restore-legacy-compat':         ['context-restore/**'],
  'context-save-list-current-branch':      ['context-save/**'],
  'context-save-list-all-branches':        ['context-save/**'],

  // Document-release
  'document-release': ['document-release/**'],

  // Codex (Claude E2E — tests /codex skill via Claude)
  'codex-review': ['codex/**'],

  // Codex E2E (tests skills via Codex CLI + worktree)
  'codex-discover-skill':  ['codex/**', '.agents/skills/**', 'test/helpers/codex-session-runner.ts', 'lib/worktree.ts'],
  'codex-review-findings': ['review/**', '.agents/skills/gstack-review/**', 'codex/**', 'test/helpers/codex-session-runner.ts', 'lib/worktree.ts'],

  // Gemini E2E — smoke test only (Gemini gets lost in worktrees on complex tasks)
  'gemini-smoke':  ['.agents/skills/**', 'test/helpers/gemini-session-runner.ts', 'lib/worktree.ts'],


  // Coverage audit (shared fixture) + triage + gates
  'ship-coverage-audit': ['ship/**', 'test/fixtures/coverage-audit-fixture.ts', 'bin/gstack-repo-mode'],
  'review-coverage-audit': ['review/**', 'test/fixtures/coverage-audit-fixture.ts'],
  'plan-eng-coverage-audit': ['plan-eng-review/**', 'test/fixtures/coverage-audit-fixture.ts'],
  'ship-triage': ['ship/**', 'bin/gstack-repo-mode'],

  // Plan completion audit + verification
  'ship-plan-completion': ['ship/**', 'scripts/gen-skill-docs.ts'],
  'ship-plan-verification': ['ship/**', 'qa-only/**', 'scripts/gen-skill-docs.ts'],
  'ship-idempotency':       ['ship/**', 'scripts/resolvers/utility.ts'],
  'review-plan-completion': ['review/**', 'scripts/gen-skill-docs.ts'],

  // Design
  'design-consultation-core':       ['design-consultation/**', 'scripts/gen-skill-docs.ts', 'test/helpers/llm-judge.ts'],
  'design-consultation-existing':   ['design-consultation/**', 'scripts/gen-skill-docs.ts'],
  'design-consultation-research':   ['design-consultation/**', 'scripts/gen-skill-docs.ts'],
  'design-consultation-preview':    ['design-consultation/**', 'scripts/gen-skill-docs.ts'],
  'plan-design-review-no-ui-scope': ['plan-design-review/**', 'scripts/gen-skill-docs.ts'],
  'design-review-fix':              ['design-review/**', 'browse/src/**', 'scripts/gen-skill-docs.ts'],

  // Design Shotgun
  'design-shotgun-path':            ['design-shotgun/**', 'design/src/**', 'scripts/resolvers/design.ts'],
  'design-shotgun-session':         ['design-shotgun/**', 'scripts/resolvers/design.ts'],
  'design-shotgun-full':            ['design-shotgun/**', 'design/src/**', 'browse/src/**'],

  // /diagram (diagram-render bundle consumers). Triplet = deterministic
  // functional (gate); authoring quality = LLM-judged benchmark (periodic).
  'diagram-triplet':            ['diagram/**', 'lib/diagram-render/**', 'browse/src/write-commands.ts', 'browse/src/read-commands.ts'],
  'diagram-authoring-quality':  ['diagram/**', 'lib/diagram-render/**', 'test/helpers/llm-judge.ts'],

  // gstack-upgrade
  'gstack-upgrade-happy-path': ['gstack-upgrade/**'],

  // Deploy skills
  'land-and-deploy-workflow':      ['land-and-deploy/**', 'scripts/gen-skill-docs.ts'],
  'land-and-deploy-first-run':     ['land-and-deploy/**', 'scripts/gen-skill-docs.ts', 'bin/gstack-slug'],
  'land-and-deploy-review-gate':   ['land-and-deploy/**', 'bin/gstack-review-read'],
  'canary-workflow':               ['canary/**', 'browse/src/**'],
  'benchmark-workflow':            ['benchmark/**', 'browse/src/**'],
  'setup-deploy-workflow':         ['setup-deploy/**', 'scripts/gen-skill-docs.ts'],

  // Sidebar agent
  'sidebar-navigate':              ['browse/src/server.ts', 'browse/src/sidebar-agent.ts', 'browse/src/sidebar-utils.ts', 'extension/**'],
  'sidebar-url-accuracy':          ['browse/src/server.ts', 'browse/src/sidebar-agent.ts', 'browse/src/sidebar-utils.ts', 'extension/background.js'],
  'sidebar-css-interaction':       ['browse/src/server.ts', 'browse/src/sidebar-agent.ts', 'browse/src/write-commands.ts', 'browse/src/read-commands.ts', 'browse/src/cdp-inspector.ts', 'extension/**'],

  // Autoplan
  'autoplan-core':  ['autoplan/**', 'plan-ceo-review/**', 'plan-eng-review/**', 'plan-design-review/**'],
  'autoplan-dual-voice': ['autoplan/**', 'codex/**', 'bin/gstack-codex-probe', 'scripts/resolvers/review.ts', 'scripts/resolvers/design.ts'],

  // Multi-provider benchmark adapters — live API smoke against real claude/codex/gemini CLIs
  'benchmark-providers-live': ['bin/gstack-model-benchmark', 'test/helpers/providers/**', 'test/helpers/benchmark-runner.ts', 'test/helpers/pricing.ts'],

  // Browser-skills Phase 2a — /scrape + /skillify (v1.19.0.0). Gate-tier
  // E2E covers the D1 (provenance guard), D3 (atomic write) contracts plus
  // the basic loop. Shared deps: both skill templates, the D3 helper, the
  // Phase 1 runtime, and the bundled hackernews-frontpage reference (the
  // match-path test relies on it).
  'scrape-match-path': [
    'scrape/**', 'browse/src/browser-skills.ts', 'browse/src/browser-skill-commands.ts',
    'browser-skills/hackernews-frontpage/**',
  ],
  'scrape-prototype-path': [
    'scrape/**', 'browse/src/browser-skills.ts', 'browse/src/browser-skill-commands.ts',
  ],
  'skillify-happy-path': [
    'skillify/**', 'scrape/**', 'browse/src/browser-skill-write.ts',
    'browse/src/browser-skills.ts', 'browse/src/browser-skill-commands.ts',
  ],
  'skillify-provenance-refusal': [
    'skillify/**', 'browse/src/browser-skill-write.ts',
  ],
  'skillify-approval-reject': [
    'skillify/**', 'scrape/**', 'browse/src/browser-skill-write.ts',
  ],

  // Skill routing — journey-stage tests (depend on ALL skill descriptions)
  'journey-ideation':       ['*/SKILL.md.tmpl', 'SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'],
  'journey-plan-eng':       ['*/SKILL.md.tmpl', 'SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'],
  'journey-debug':          ['*/SKILL.md.tmpl', 'SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'],
  'journey-qa':             ['*/SKILL.md.tmpl', 'SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'],
  'journey-code-review':    ['*/SKILL.md.tmpl', 'SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'],
  'journey-ship':           ['*/SKILL.md.tmpl', 'SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'],
  'journey-docs':           ['*/SKILL.md.tmpl', 'SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'],
  'journey-retro':          ['*/SKILL.md.tmpl', 'SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'],
  'journey-design-system':  ['*/SKILL.md.tmpl', 'SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'],
  'journey-visual-qa':      ['*/SKILL.md.tmpl', 'SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'],

  // Opus 4.7 behavior evals — keys match testName: values in the test file.
  // Routing sub-tests use template literal `routing-${c.name}` testNames,
  // which the touchfile completeness scanner skips; they inherit selection
  // from the file-level touchfile entry via GLOBAL_TOUCHFILES.
  'fanout-arm-overlay-on':
    ['model-overlays/claude.md', 'model-overlays/opus-4-7.md', 'scripts/models.ts', 'scripts/resolvers/model-overlay.ts'],
  'fanout-arm-overlay-off':
    ['model-overlays/claude.md', 'model-overlays/opus-4-7.md', 'scripts/models.ts', 'scripts/resolvers/model-overlay.ts'],

  // Overlay efficacy harness (SDK) — measures whether overlay nudges change
  // behavior under @anthropic-ai/claude-agent-sdk (closer to real Claude Code
  // than `claude -p`). testNames in the file are template literals so the
  // completeness scanner doesn't require them; these entries exist for
  // diff-based selection accuracy.
  'overlay-harness-opus-4-7-fanout-toy': [
    'model-overlays/**',
    'test/fixtures/overlay-nudges.ts',
    'test/helpers/agent-sdk-runner.ts',
    'scripts/resolvers/model-overlay.ts',
  ],
  'overlay-harness-opus-4-7-fanout-realistic': [
    'model-overlays/**',
    'test/fixtures/overlay-nudges.ts',
    'test/helpers/agent-sdk-runner.ts',
    'scripts/resolvers/model-overlay.ts',
  ],

  // /ios-qa — agent flow E2E. Daemon + stub StateServer + codegen
  // exercised end-to-end. The no-device path is gate-tier; the with-device
  // path requires GSTACK_HAS_IOS_DEVICE=1 and is periodic-tier.
  'ios-qa-e2e':       ['ios-qa/**', 'ios-fix/**', 'ios-design-review/**', 'ios-clean/**', 'ios-sync/**', 'test/skill-e2e-ios.test.ts'],
  // Swift-build invariant test — requires the Swift toolchain. Compiles the
  // fixture SPM package + runs the XCTest suite that validates the real
  // Swift StateServer implementation (loopback bind, boot token rotation,
  // session lock). Periodic-tier — Swift build is heavier than TS unit tests.
  'ios-qa-swift-build': ['ios-qa/templates/**', 'test/fixtures/ios-qa/FixtureApp/**', 'test/skill-e2e-ios-swift-build.test.ts'],
  // Real-device path — only runs with GSTACK_HAS_IOS_DEVICE=1 + a paired
  // iPhone. Validates the CoreDevice agent + iOS SDK toolchain. Periodic-tier.
  'ios-qa-device':    ['ios-qa/templates/**', 'test/fixtures/ios-qa/FixtureApp/**', 'test/skill-e2e-ios-device.test.ts'],

  // /spec end-to-end via PTY — exercises the full Phase 1→5 pipeline
  // including --execute spawn. Periodic-tier — paid + non-deterministic.
  'spec-execute':     ['spec/**', 'test/skill-e2e-spec-execute.test.ts'],

  // /office-hours brain-writeback path under fake gbrain CLI (v1.50.0.0
  // T7). Drives /office-hours with a regenerated SKILL.md that has the
  // compressed GBRAIN_SAVE_RESULTS block + a fake gbrain on PATH; asserts
  // the agent calls `gbrain put office-hours/<slug>` with valid YAML
  // frontmatter. Touched by anything that changes resolver output, gen
  // pipeline, detection helper, refresh subcommand, or the on-demand
  // docs the resolver points to.
  'office-hours-brain-writeback': [
    'scripts/resolvers/gbrain.ts',
    'scripts/gen-skill-docs.ts',
    'bin/gstack-gbrain-detect',
    'bin/gstack-config',
    'office-hours/SKILL.md.tmpl',
    'docs/gbrain-write-surfaces.md',
    'test/fixtures/office-hours-brain-writeback/**',
    'test/skill-e2e-office-hours-brain-writeback.test.ts',
  ],

  // gbrain CLI real round-trip against a local PGLite store (v1.50.0.0
  // T11). Proves the gbrain CLI persistence contract gstack relies on —
  // a `gbrain put` followed by `gbrain get` returns the body. Skips if
  // VOYAGE_API_KEY is unset OR gbrain CLI not on PATH. Touched by the
  // resolver (which emits the CLI shape) and the test itself.
  'gbrain-roundtrip-local': [
    'scripts/resolvers/gbrain.ts',
    'test/skill-e2e-gbrain-roundtrip-local.test.ts',
  ],

};

/**
 * E2E test tiers — 'gate' blocks PRs, 'periodic' runs weekly/on-demand.
 * Must have exactly the same keys as E2E_TOUCHFILES.
 *
 * Every value is either 'gate' or 'periodic'; the section comments below
 * record why each test (or group of tests) landed in its tier. When adding
 * a test, add the key to both maps so the two stay in sync.
 */
export const E2E_TIERS: Record<string, 'gate' | 'periodic'> = {
  // Browse core — gate (if browse breaks, everything breaks)
  'browse-basic': 'gate',
  'browse-snapshot': 'gate',

  // Hermetic isolation — gate (deterministic env/config assertions; if the
  // clean room breaks, every other eval's signal is contaminated)
  'hermetic-canary': 'gate',
  'hermetic-sentinel': 'gate',

  // SKILL.md setup — gate (if setup breaks, no skill works)
  'skillmd-setup-discovery': 'gate',
  'skillmd-no-local-binary': 'gate',
  'skillmd-outside-git': 'gate',
  'session-awareness': 'gate',
  'operational-learning': 'gate',

  // QA — gate for functional, periodic for quality/benchmarks
  'qa-quick': 'gate',
  'qa-b6-static': 'periodic',
  'qa-b7-spa': 'periodic',
  'qa-b8-checkout': 'periodic',
  'qa-only-no-fix': 'gate',     // CRITICAL guardrail: Edit tool forbidden
  'qa-fix-loop': 'periodic',
  'qa-bootstrap': 'gate',

  // Review — gate for functional/guardrails, periodic for quality
  'review-sql-injection': 'gate',     // Security guardrail
  'review-enum-completeness': 'gate',
  'review-base-branch': 'gate',
  'review-design-lite': 'periodic',   // 4/7 threshold is subjective
  'review-coverage-audit': 'gate',
  'review-plan-completion': 'gate',
  'review-dashboard-via': 'gate',

  // Review Army — gate for core functionality, periodic for multi-specialist
  'review-army-migration-safety': 'gate',   // Specialist activation guardrail
  'review-army-perf-n-plus-one': 'gate',    // Specialist activation guardrail
  'review-army-delivery-audit': 'gate',     // Delivery integrity guardrail
  'review-army-quality-score': 'gate',      // Score computation
  'review-army-json-findings': 'gate',      // JSON schema compliance
  'review-army-red-team': 'periodic',       // Multi-agent coordination
  'review-army-consensus': 'periodic',      // Multi-specialist agreement

  // Office Hours — mixed tiers; see the per-entry notes below
  'office-hours-spec-review': 'gate',
  // Brain-writeback E2E — periodic per cost (claude -p) + non-deterministic
  // (model interprets the gbrain instruction). Matches nearby
  // setup-gbrain-path4-* tier classification.
  'office-hours-brain-writeback': 'periodic',
  // GBrain CLI round-trip — periodic per Voyage embedding cost (~$0.001/run)
  // and external-API-dependency (skips cleanly if VOYAGE_API_KEY unset).
  'gbrain-roundtrip-local': 'periodic',
  'office-hours-forcing-energy': 'gate',       // V1.1 mode-posture regression gate (Sonnet generator)
  // 'office-hours-builder-wildness' retiered to periodic in v1.32 contributor
  // wave: this is an LLM-judge creativity score (axis_a ≥4 on a "wildness"
  // posture). Per CLAUDE.md tier-classification rules, non-deterministic
  // quality benchmarks belong in periodic, not gate. The wave's +21-line
  // CJK preamble cascade (#1205) pushed the score from 5/5 → 3/3 on the
  // same /office-hours BUILDER prompt — same model, same fixture — proving
  // the bar is sensitive to preamble-byte changes that have nothing to do
  // with the test's intent (creativity, not preamble compliance).
  'office-hours-builder-wildness': 'periodic',

  // Plan reviews — gate for cheap functional, periodic for Opus quality
  'plan-ceo-review': 'periodic',
  'plan-ceo-review-selective': 'periodic',
  'plan-ceo-review-benefits': 'gate',
  'plan-ceo-review-expansion-energy': 'gate',  // V1.1 mode-posture regression gate (Opus generator, Sonnet judge)
  'plan-eng-review': 'periodic',
  'plan-eng-review-artifact': 'periodic',
  'plan-eng-coverage-audit': 'gate',
  'plan-review-report': 'gate',

  // Plan-mode handshake. plan-ceo/plan-devex ask-first reliably (gate-tier);
  // plan-eng/plan-design run a long explore/audit before their first
  // AskUserQuestion, so whether they reach a terminal outcome within the 300s
  // budget hinges on stochastic ask-first compliance (~50-67%/run measured).
  // Per the "non-deterministic -> periodic" tiering rule they are periodic:
  // the hardened ask-first gate + the collapsed-form detector lifted them from
  // always-failing to mostly-passing, but they are not deterministic gates.
  'plan-ceo-review-plan-mode': 'gate',
  'plan-eng-review-plan-mode': 'periodic',
  'plan-design-review-plan-mode': 'periodic',
  'plan-devex-review-plan-mode': 'gate',
  'plan-mode-no-op': 'gate',
  // v1.21+ auto-mode regression tests
  'office-hours-auto-mode': 'gate',
  'auto-decide-preserved': 'periodic',
  'conductor-prose': 'periodic',
  'e2e-harness-audit': 'gate',

  // Real-PTY E2E batch — tier classification:
  //   gate: cheap, deterministic, run on every PR
  //   periodic: long-running or expensive (>$3/run), run weekly
  'auq-format-gate':                         'gate',       // ~$0.50/run, SDK capture, single skill probe
  'plan-ceo-mode-routing':     'periodic',   // ~$3/run, deep navigation through 8-12 prior AskUserQuestions
  'plan-design-with-ui-scope': 'gate',       // ~$0.80/run
  'budget-regression-pty':     'gate',       // free, library-only assertion
  'ship-idempotency-pty':      'periodic',   // ~$3/run, real /ship in plan mode
  'ship-section-loading':      'periodic',   // ~$3/run, real /ship; asserts section reads
  'plan-ceo-section-loading':  'periodic',   // ~$3-5/run, real /plan-ceo-review; asserts section read
  'carve-section-loading':     'periodic',   // ~$1-2/skill, data-driven; GSTACK_CARVE_SKILL scopes to one
  'autoplan-chain-pty':        'periodic',   // ~$8/run, all 3 phases sequential

  // Per-finding count + review-report-at-bottom — periodic because each
  // run drives a full skill end-to-end (~25 min, ~$5/run). Sequential
  // execution during calibration; concurrent opt-in only after measured
  // comparison agrees (plan §D15).
  // NOTE: the ceo and devex finding-floor entries in this group are the
  // exceptions — they are classified gate, not periodic.
  'plan-ceo-finding-count':    'periodic',
  'plan-eng-finding-count':    'periodic',
  'plan-design-finding-count': 'periodic',
  'plan-devex-finding-count':  'periodic',
  'plan-eng-finding-floor':    'periodic',  // stochastic ask-first (see plan-mode-handshake note); periodic
  'plan-ceo-finding-floor':    'gate',
  'plan-design-finding-floor': 'periodic',  // stochastic ask-first (see plan-mode-handshake note); periodic
  'plan-devex-finding-floor':  'gate',
  'plan-eng-multi-finding-batching': 'periodic',
  'plan-ceo-split-overflow': 'periodic',

  // Privacy gate for gstack-brain-sync — periodic (non-deterministic LLM call,
  // costs ~$0.30-$0.50 per run, not needed on every commit)
  'brain-privacy-gate': 'periodic',

  // /setup-gbrain Path 4 (Remote MCP) — periodic-tier. The stub HTTP
  // server is deterministic but the model's interpretation of "follow
  // Path 4 only" is not — assertions on which steps the model ran are
  // flaky. The deterministic gate-tier coverage for Path 4 lives in
  // test/setup-gbrain-path4-structure.test.ts (free, <200ms). These
  // E2E tests stay available for on-demand verification of the live
  // model's behavior against a stub MCP server.
  'setup-gbrain-remote': 'periodic',
  'setup-gbrain-bad-token': 'periodic',
  'setup-gbrain-path4-local-pglite': 'periodic',

  // AskUserQuestion format regression — periodic (Opus 4.7 non-deterministic benchmark)
  'plan-ceo-review-format-mode': 'periodic',
  'plan-ceo-review-format-approach': 'periodic',
  'plan-eng-review-format-coverage': 'periodic',
  'plan-eng-review-format-kind': 'periodic',

  // Office-hours Phase 4 silent-auto-decide regression — periodic (Phase 4
  // requires the agent to invent 2-3 architectures, more open-ended than the
  // 4 plan-format cases above). Reclassify to gate if it turns out stable.
  'office-hours-phase4-fork': 'periodic',
  // judgeRecommendation rubric sanity (fixture-based, ~$0.04/run via Haiku)
  'llm-judge-recommendation': 'periodic',

  // v1.7.0.0 Pros/Cons format — cadence + negative-escape evals (all periodic)
  'plan-ceo-review-prosons-cadence': 'periodic',
  'plan-review-prosons-format': 'periodic',
  'plan-review-prosons-hardstop-neg': 'periodic',
  'plan-review-prosons-neutral-neg': 'periodic',

  // CT3 expanded coverage — non-plan-review skills inheriting Pros/Cons (all periodic)
  'ship-prosons-format': 'periodic',
  'office-hours-prosons-format': 'periodic',
  'investigate-prosons-format': 'periodic',
  'qa-prosons-format': 'periodic',
  'review-prosons-format': 'periodic',
  'design-review-prosons-format': 'periodic',
  'document-release-prosons-format': 'periodic',

  // /plan-tune — gate (core v1 DX promise: plain-English intent routing)
  'plan-tune-inspect': 'gate',

  // /plan-tune cathedral (T16 per D12 — all gate)
  'plan-tune-hook-capture': 'gate',
  'plan-tune-enforcement': 'gate',
  'plan-tune-annotation': 'gate',
  'plan-tune-codex-import': 'gate',
  'plan-tune-dream-cycle': 'gate',

  // Codex offering verification — all gate
  'codex-offered-office-hours': 'gate',
  'codex-offered-ceo-review': 'gate',
  'codex-offered-design-review': 'gate',
  'codex-offered-eng-review': 'gate',

  // Session Intelligence — gate for data flow, periodic for agent integration
  'timeline-event-flow': 'gate',                   // Binary data flow (no LLM needed)
  'context-recovery-artifacts': 'gate',            // Preamble reads seeded artifacts
  'context-save-writes-file': 'gate',              // /context-save writes a file
  'context-restore-loads-latest': 'gate',          // Cross-branch newest-by-filename restore

  // Context skills live-fire — periodic (each test spawns claude -p, ~$0.20-$0.40)
  'context-save-routing': 'periodic',              // Proves /context-save routes via Skill tool
  'context-save-then-restore-roundtrip': 'periodic', // Full cycle in one session
  'context-restore-fragment-match': 'periodic',    // /context-restore <fragment>
  'context-restore-empty-state': 'periodic',       // Graceful zero-saves message
  'context-restore-list-delegates': 'periodic',    // /context-restore list redirect
  'context-restore-legacy-compat': 'periodic',     // Pre-rename files still load
  'context-save-list-current-branch': 'periodic',  // Default branch filter
  'context-save-list-all-branches': 'periodic',    // --all flag

  // Ship — gate (end-to-end ship path); ship-idempotency is the one periodic entry
  'ship-base-branch': 'gate',
  'ship-local-workflow': 'gate',
  'ship-coverage-audit': 'gate',
  'ship-triage': 'gate',
  'ship-plan-completion': 'gate',
  'ship-plan-verification': 'gate',
  'ship-idempotency': 'periodic',

  // Retro — gate for cheap branch detection, periodic for full Opus retro
  'retro': 'periodic',
  'retro-base-branch': 'gate',

  // Global discover — gate
  'global-discover': 'gate',

  // CSO — gate for security guardrails, periodic for quality
  'cso-full-audit': 'gate',      // Hardcoded secrets detection
  'cso-diff-mode': 'gate',
  'cso-infra-scope': 'periodic',

  // Learnings — gate (functional guardrail: seeded learnings must appear)
  'learnings-show': 'gate',

  // Document-release — gate (CHANGELOG guardrail)
  'document-release': 'gate',

  // Codex — periodic (Opus, requires codex CLI)
  'codex-review': 'periodic',

  // Multi-AI — periodic (require external CLIs)
  'codex-discover-skill': 'periodic',
  'codex-review-findings': 'periodic',
  'gemini-smoke': 'periodic',

  // Design — gate for cheap functional, periodic for Opus/quality
  'design-consultation-core': 'periodic',
  'design-consultation-existing': 'periodic',
  'design-consultation-research': 'gate',
  'design-consultation-preview': 'gate',
  'plan-design-review-no-ui-scope': 'gate',
  'design-review-fix': 'periodic',
  'design-shotgun-path': 'gate',
  'design-shotgun-session': 'gate',
  'design-shotgun-full': 'periodic',

  // /diagram — triplet is deterministic functional, judge is a quality benchmark
  'diagram-triplet': 'gate',
  'diagram-authoring-quality': 'periodic',

  // gstack-upgrade — gate
  'gstack-upgrade-happy-path': 'gate',

  // Deploy skills — all gate
  'land-and-deploy-workflow': 'gate',
  'land-and-deploy-first-run': 'gate',
  'land-and-deploy-review-gate': 'gate',
  'canary-workflow': 'gate',
  'benchmark-workflow': 'gate',
  'setup-deploy-workflow': 'gate',

  // Sidebar agent — all periodic
  'sidebar-navigate': 'periodic',
  'sidebar-url-accuracy': 'periodic',
  'sidebar-css-interaction': 'periodic',

  // Autoplan — periodic (not yet implemented)
  'autoplan-core': 'periodic',
  'autoplan-dual-voice': 'periodic',

  // Multi-provider benchmark — periodic (requires external CLIs + auth, paid)
  'benchmark-providers-live': 'periodic',

  // Browser-skills Phase 2a — gate (D1/D3 contracts must not silently break)
  'scrape-match-path': 'gate',
  'scrape-prototype-path': 'gate',
  'skillify-happy-path': 'gate',
  'skillify-provenance-refusal': 'gate',
  'skillify-approval-reject': 'gate',

  // Skill routing — periodic (LLM routing is non-deterministic)
  'journey-ideation': 'periodic',
  'journey-plan-eng': 'periodic',
  'journey-debug': 'periodic',
  'journey-qa': 'periodic',
  'journey-code-review': 'periodic',
  'journey-ship': 'periodic',
  'journey-docs': 'periodic',
  'journey-retro': 'periodic',
  'journey-design-system': 'periodic',
  'journey-visual-qa': 'periodic',

  // Opus 4.7 overlay evals — periodic (non-deterministic LLM behavior + Opus cost)
  'fanout-arm-overlay-on': 'periodic',
  'fanout-arm-overlay-off': 'periodic',

  // Overlay efficacy harness (SDK, paid) — periodic only
  'overlay-harness-opus-4-7-fanout-toy': 'periodic',
  'overlay-harness-opus-4-7-fanout-realistic': 'periodic',

  // /ios-qa daemon + codegen — no-device path runs every PR (no hardware
  // dependency, deterministic). with-device path requires GSTACK_HAS_IOS_DEVICE.
  'ios-qa-e2e': 'gate',
  // Swift toolchain only, no device required, but heavier than TS unit tests.
  'ios-qa-swift-build': 'periodic',
  // Requires a real connected + paired iPhone. Manual-trigger only.
  'ios-qa-device': 'periodic',
  // /spec end-to-end PTY pipeline (paid, non-deterministic — periodic-tier).
  'spec-execute': 'periodic',
};

/**
 * LLM-judge test touchfiles — keyed by test description string.
 *
 * Each value is the list of file paths / glob patterns associated with that
 * test. The map has the same `Record<string, string[]>` shape that
 * `selectTests` accepts as its `touchfiles` argument.
 */
export const LLM_JUDGE_TOUCHFILES: Record<string, string[]> = {
  // Root SKILL.md, browse, QA, and cross-skill checks
  'command reference table':          ['SKILL.md', 'SKILL.md.tmpl', 'browse/src/commands.ts'],
  'snapshot flags reference':         ['SKILL.md', 'SKILL.md.tmpl', 'browse/src/snapshot.ts'],
  'browse/SKILL.md reference':        ['browse/SKILL.md', 'browse/SKILL.md.tmpl', 'browse/src/**'],
  'setup block':                      ['SKILL.md', 'SKILL.md.tmpl'],
  'regression vs baseline':           ['SKILL.md', 'SKILL.md.tmpl', 'browse/src/commands.ts', 'test/fixtures/eval-baselines.json'],
  'qa/SKILL.md workflow':             ['qa/SKILL.md', 'qa/SKILL.md.tmpl'],
  'qa/SKILL.md health rubric':        ['qa/SKILL.md', 'qa/SKILL.md.tmpl'],
  'qa/SKILL.md anti-refusal':         ['qa/SKILL.md', 'qa/SKILL.md.tmpl', 'qa-only/SKILL.md', 'qa-only/SKILL.md.tmpl'],
  'cross-skill greptile consistency': ['review/SKILL.md', 'review/SKILL.md.tmpl', 'ship/SKILL.md', 'ship/SKILL.md.tmpl', 'review/greptile-triage.md', 'retro/SKILL.md', 'retro/SKILL.md.tmpl'],
  'baseline score pinning':           ['SKILL.md', 'SKILL.md.tmpl', 'test/fixtures/eval-baselines.json'],

  // Ship & Release
  'ship/SKILL.md workflow':               ['ship/SKILL.md', 'ship/SKILL.md.tmpl'],
  'document-release/SKILL.md workflow':   ['document-release/SKILL.md', 'document-release/SKILL.md.tmpl'],

  // Plan Reviews
  'plan-ceo-review/SKILL.md modes':       ['plan-ceo-review/SKILL.md', 'plan-ceo-review/SKILL.md.tmpl'],
  'plan-eng-review/SKILL.md sections':    ['plan-eng-review/SKILL.md', 'plan-eng-review/SKILL.md.tmpl'],

  // /spec authored-spec quality (paid LLM-judge — periodic-tier).
  'spec authored quality':                ['spec/SKILL.md', 'spec/SKILL.md.tmpl', 'test/fixtures/spec/**'],
  // Plan Reviews (continued) — this entry sits after the /spec entry; key
  // order is kept as-is because it determines the order of selected tests.
  'plan-design-review/SKILL.md passes':   ['plan-design-review/SKILL.md', 'plan-design-review/SKILL.md.tmpl'],

  // Design skills
  'design-review/SKILL.md fix loop':      ['design-review/SKILL.md', 'design-review/SKILL.md.tmpl'],
  'design-consultation/SKILL.md research': ['design-consultation/SKILL.md', 'design-consultation/SKILL.md.tmpl'],

  // Office Hours
  'office-hours/SKILL.md spec review':    ['office-hours/SKILL.md', 'office-hours/SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'],
  'office-hours/SKILL.md design sketch':  ['office-hours/SKILL.md', 'office-hours/SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'],

  // Deploy skills
  'land-and-deploy/SKILL.md workflow':    ['land-and-deploy/SKILL.md', 'land-and-deploy/SKILL.md.tmpl'],
  'canary/SKILL.md monitoring loop':      ['canary/SKILL.md', 'canary/SKILL.md.tmpl'],
  'benchmark/SKILL.md perf collection':   ['benchmark/SKILL.md', 'benchmark/SKILL.md.tmpl'],
  'setup-deploy/SKILL.md platform setup': ['setup-deploy/SKILL.md', 'setup-deploy/SKILL.md.tmpl'],

  // Other skills
  'retro/SKILL.md instructions':          ['retro/SKILL.md', 'retro/SKILL.md.tmpl'],
  'qa-only/SKILL.md workflow':            ['qa-only/SKILL.md', 'qa-only/SKILL.md.tmpl'],
  'gstack-upgrade/SKILL.md upgrade flow': ['gstack-upgrade/SKILL.md', 'gstack-upgrade/SKILL.md.tmpl'],

  // Voice directive
  'voice directive tone':                 ['scripts/resolvers/preamble.ts', 'review/SKILL.md', 'review/SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'],
};

/**
 * Files whose modification selects every test, E2E and LLM-judge alike.
 *
 * Keep the list short: an entry belongs here only if it genuinely affects
 * all tests. Narrower dependencies (gen-skill-docs, llm-judge, test-server,
 * worktree, codex/gemini session runners) should be declared on the
 * individual test entries instead.
 */
export const GLOBAL_TOUCHFILES = [
  // Runner used by all E2E tests.
  'test/helpers/session-runner.ts',
  // Changes the environment of every E2E child.
  'test/helpers/hermetic-env.ts',
  // All E2E tests store their results here.
  'test/helpers/eval-store.ts',
  // Self-referential: a wrong reclassification in this file is dangerous.
  'test/helpers/touchfiles.ts',
];

// --- Base branch detection ---

/**
 * Find the base branch ref for the repository at `cwd`.
 *
 * Candidate refs are probed in priority order with
 * `git rev-parse --verify <ref>` (3s timeout each); probing stops at the
 * first one that exits with status 0.
 *
 * @param cwd Directory in which git is executed.
 * @returns The first ref that verifies, or `null` when none does (including
 *          when git itself cannot be spawned).
 */
export function detectBaseBranch(cwd: string): string | null {
  const candidates = ['origin/main', 'origin/master', 'main', 'master'];
  const firstValid = candidates.find(ref => {
    const { status } = spawnSync('git', ['rev-parse', '--verify', ref], {
      cwd, stdio: 'pipe', timeout: 3000,
    });
    // A spawn failure or timeout leaves status null, which counts as "not found".
    return status === 0;
  });
  return firstValid === undefined ? null : firstValid;
}

/**
 * List the files that differ between the base branch and HEAD.
 *
 * Runs `git diff --name-only <baseBranch>...HEAD` in `cwd` with a 5s timeout.
 *
 * @param baseBranch Ref placed on the left side of the `...HEAD` range.
 * @param cwd Directory in which git is executed.
 * @returns One entry per non-empty output line; an empty array when git
 *          exits non-zero or cannot be run.
 */
export function getChangedFiles(baseBranch: string, cwd: string): string[] {
  const range = `${baseBranch}...HEAD`;
  const diff = spawnSync('git', ['diff', '--name-only', range], {
    cwd, stdio: 'pipe', timeout: 5000,
  });
  if (diff.status !== 0) {
    return [];
  }
  const lines = diff.stdout.toString().trim().split('\n');
  // An empty diff yields a single empty string after split; drop such entries.
  return lines.filter(line => line.length > 0);
}

// --- Test selection ---

/**
 * Decide which tests must run for a given set of changed files.
 *
 * Steps:
 * 1. Look for the first changed file that matches any global touchfile
 *    pattern. If there is one, every test is selected and the reason names
 *    that file (`global: <file>`).
 * 2. Otherwise a test is selected when at least one changed file matches at
 *    least one of its own patterns; all remaining tests are skipped and the
 *    reason is `diff`.
 *
 * @param changedFiles Paths of the changed files to match against patterns.
 * @param touchfiles Map from test name to that test's glob patterns.
 * @param globalTouchfiles Patterns that force a full run; defaults to
 *        GLOBAL_TOUCHFILES.
 * @returns Selected and skipped test names (in `touchfiles` key order) plus
 *          the reason for the selection.
 */
export function selectTests(
  changedFiles: string[],
  touchfiles: Record<string, string[]>,
  globalTouchfiles: string[] = GLOBAL_TOUCHFILES,
): { selected: string[]; skipped: string[]; reason: string } {
  // Step 1: any global touchfile hit short-circuits to "run everything".
  const globalHitIndex = changedFiles.findIndex(changed =>
    globalTouchfiles.some(globalPattern => matchGlob(changed, globalPattern)),
  );
  if (globalHitIndex !== -1) {
    return {
      selected: Object.keys(touchfiles),
      skipped: [],
      reason: `global: ${changedFiles[globalHitIndex]}`,
    };
  }

  // Step 2: partition tests by whether any changed file hits their patterns.
  const selected: string[] = [];
  const skipped: string[] = [];
  for (const [testName, patterns] of Object.entries(touchfiles)) {
    const isTouched = changedFiles.some(changed =>
      patterns.some(pattern => matchGlob(changed, pattern)),
    );
    if (isTouched) {
      selected.push(testName);
    } else {
      skipped.push(testName);
    }
  }

  return { selected, skipped, reason: 'diff' };
}