fix: resolve merge conflicts with main — keep stealth patches + auth token

2026-05-06 13:45:35 +02:00 · 2026-03-30 20:47:51 -07:00
parent 1ddd4104db a0328be04c
commit 08b474ea6f
65 changed files with 5234 additions and 574 deletions
@@ -83,11 +83,15 @@ const OPENAI_LITMUS_CHECKS = [
 // ─── External Host Helpers ───────────────────────────────────

 // Re-export local copy for use in this file (matches codex-helpers.ts)
-function externalSkillName(skillDir: string): string {
+// Accepts optional frontmatter name to support directory/invocation name divergence
+function externalSkillName(skillDir: string, frontmatterName?: string): string {
+  // Root skill (skillDir === '' or '.') always maps to 'gstack' regardless of frontmatter
  if (skillDir === '.' || skillDir === '') return 'gstack';
+  // Use frontmatter name when it differs from directory name (e.g., run-tests/ with name: test)
+  const baseName = frontmatterName && frontmatterName !== skillDir ? frontmatterName : skillDir;
  // Don't double-prefix: gstack-upgrade → gstack-upgrade (not gstack-gstack-upgrade)
-  if (skillDir.startsWith('gstack-')) return skillDir;
-  return `gstack-${skillDir}`;
+  if (baseName.startsWith('gstack-')) return baseName;
+  return `gstack-${baseName}`;
 }

 function extractNameAndDescription(content: string): { name: string; description: string } {
@@ -255,11 +259,12 @@ function processExternalHost(
  skillDir: string,
  extractedDescription: string,
  ctx: TemplateContext,
+  frontmatterName?: string,
 ): { content: string; outputPath: string; outputDir: string; symlinkLoop: boolean } {
  const config = EXTERNAL_HOST_CONFIG[host];
  if (!config) throw new Error(`No external host config for: ${host}`);

-  const name = externalSkillName(skillDir === '.' ? '' : skillDir);
+  const name = externalSkillName(skillDir === '.' ? '' : skillDir, frontmatterName);
  const outputDir = path.join(ROOT, config.hostSubdir, 'skills', name);
  fs.mkdirSync(outputDir, { recursive: true });
  const outputPath = path.join(outputDir, 'SKILL.md');
@@ -324,10 +329,13 @@ function processTemplate(tmplPath: string, host: Host = 'claude'): { outputPath:
  // Determine skill directory relative to ROOT
  const skillDir = path.relative(ROOT, path.dirname(tmplPath));

-  // Extract skill name from frontmatter for TemplateContext
+  // Extract skill name from frontmatter early — needed for both TemplateContext and external host output paths.
+  // When frontmatter name: differs from directory name (e.g., run-tests/ with name: test),
+  // the frontmatter name is used for external skill naming and setup script symlinks.
  const { name: extractedName, description: extractedDescription } = extractNameAndDescription(tmplContent);
  const skillName = extractedName || path.basename(path.dirname(tmplPath));

+
  // Extract benefits-from list from frontmatter (inline YAML: benefits-from: [a, b])
  const benefitsMatch = tmplContent.match(/^benefits-from:\s*\[([^\]]*)\]/m);
  const benefitsFrom = benefitsMatch
@@ -340,15 +348,18 @@ function processTemplate(tmplPath: string, host: Host = 'claude'): { outputPath:

  const ctx: TemplateContext = { skillName, tmplPath, benefitsFrom, host, paths: HOST_PATHS[host], preambleTier };

-  // Replace placeholders
-  let content = tmplContent.replace(/\{\{(\w+)\}\}/g, (match, name) => {
-    const resolver = RESOLVERS[name];
-    if (!resolver) throw new Error(`Unknown placeholder {{${name}}} in ${relTmplPath}`);
-    return resolver(ctx);
+  // Replace placeholders (supports parameterized: {{NAME:arg1:arg2}})
+  let content = tmplContent.replace(/\{\{(\w+(?::[^}]+)?)\}\}/g, (match, fullKey) => {
+    const parts = fullKey.split(':');
+    const resolverName = parts[0];
+    const args = parts.slice(1);
+    const resolver = RESOLVERS[resolverName];
+    if (!resolver) throw new Error(`Unknown placeholder {{${resolverName}}} in ${relTmplPath}`);
+    return args.length > 0 ? resolver(ctx, args) : resolver(ctx);
  });

  // Check for any remaining unresolved placeholders
-  const remaining = content.match(/\{\{(\w+)\}\}/g);
+  const remaining = content.match(/\{\{(\w+(?::[^}]+)?)\}\}/g);
  if (remaining) {
    throw new Error(`Unresolved placeholders in ${relTmplPath}: ${remaining.join(', ')}`);
  }
@@ -359,7 +370,7 @@ function processTemplate(tmplPath: string, host: Host = 'claude'): { outputPath:
  if (host === 'claude') {
    content = transformFrontmatter(content, host);
  } else {
-    const result = processExternalHost(content, tmplContent, host, skillDir, extractedDescription, ctx);
+    const result = processExternalHost(content, tmplContent, host, skillDir, extractedDescription, ctx, extractedName || undefined);
    content = result.content;
    outputPath = result.outputPath;
    symlinkLoop = result.symlinkLoop;
@@ -0,0 +1,48 @@
+import type { TemplateContext } from './types';
+
+/**
+ * {{INVOKE_SKILL:skill-name}} — emits prose instructing Claude to read
+ * another skill's SKILL.md and follow it, skipping preamble sections.
+ *
+ * Supports optional skip= parameter for additional sections to skip:
+ *   {{INVOKE_SKILL:plan-ceo-review:skip=Outside Voice,Design Outside Voices}}
+ */
+export function generateInvokeSkill(ctx: TemplateContext, args?: string[]): string {
+  const skillName = args?.[0];
+  if (!skillName || skillName === '') {
+    throw new Error('{{INVOKE_SKILL}} requires a skill name, e.g. {{INVOKE_SKILL:plan-ceo-review}}');
+  }
+
+  // Parse optional skip= parameter from args[1+]
+  const extraSkips = (args?.slice(1) || [])
+    .filter(a => a.startsWith('skip='))
+    .flatMap(a => a.slice(5).split(','))
+    .map(s => s.trim())
+    .filter(Boolean);
+
+  const DEFAULT_SKIPS = [
+    'Preamble (run first)',
+    'AskUserQuestion Format',
+    'Completeness Principle — Boil the Lake',
+    'Search Before Building',
+    'Contributor Mode',
+    'Completion Status Protocol',
+    'Telemetry (run last)',
+    'Step 0: Detect platform and base branch',
+    'Review Readiness Dashboard',
+    'Plan File Review Report',
+    'Prerequisite Skill Offer',
+    'Plan Status Footer',
+  ];
+
+  const allSkips = [...DEFAULT_SKIPS, ...extraSkips];
+
+  return `Read the \`/${skillName}\` skill file at \`${ctx.paths.skillRoot}/${skillName}/SKILL.md\` using the Read tool.
+
+**If unreadable:** Skip with "Could not load /${skillName} — skipping." and continue.
+
+Follow its instructions from top to bottom, **skipping these sections** (already handled by the parent skill):
+${allSkips.map(s => `- ${s}`).join('\n')}
+
+Execute every other section at full depth. When the loaded skill's instructions are complete, continue with the next step below.`;
+}
@@ -855,31 +855,42 @@ $D compare --images "$_DESIGN_DIR/variant-A.png,$_DESIGN_DIR/variant-B.png,$_DES

 This command generates the board HTML, starts an HTTP server on a random port,
 and opens it in the user's default browser. **Run it in the background** with \`&\`
-because the agent needs to keep running while the user interacts with the board.
+because the server needs to stay running while the user interacts with the board.

-**IMPORTANT: Reading feedback via file polling (not stdout):**
+Parse the port from stderr output: \`SERVE_STARTED: port=XXXXX\`. You need this
+for the board URL and for reloading during regeneration cycles.

-The server writes feedback to files next to the board HTML. The agent polls for these:
+**PRIMARY WAIT: AskUserQuestion with board URL**
+
+After the board is serving, use AskUserQuestion to wait for the user. Include the
+board URL so they can click it if they lost the browser tab:
+
+"I've opened a comparison board with the design variants:
+http://127.0.0.1:<PORT>/ — Rate them, leave comments, remix
+elements you like, and click Submit when you're done. Let me know when you've
+submitted your feedback (or paste your preferences here). If you clicked
+Regenerate or Remix on the board, tell me and I'll generate new variants."
+
+**Do NOT use AskUserQuestion to ask which variant the user prefers.** The comparison
+board IS the chooser. AskUserQuestion is just the blocking wait mechanism.
+
+**After the user responds to AskUserQuestion:**
+
+Check for feedback files next to the board HTML:
 - \`$_DESIGN_DIR/feedback.json\` — written when user clicks Submit (final choice)
 - \`$_DESIGN_DIR/feedback-pending.json\` — written when user clicks Regenerate/Remix/More Like This

-**Polling loop** (run after launching \`$D serve\` in background):
-
 \`\`\`bash
-# Poll for feedback files every 5 seconds (up to 10 minutes)
-for i in $(seq 1 120); do
-  if [ -f "$_DESIGN_DIR/feedback.json" ]; then
-    echo "SUBMIT_RECEIVED"
-    cat "$_DESIGN_DIR/feedback.json"
-    break
-  elif [ -f "$_DESIGN_DIR/feedback-pending.json" ]; then
-    echo "REGENERATE_RECEIVED"
-    cat "$_DESIGN_DIR/feedback-pending.json"
-    rm "$_DESIGN_DIR/feedback-pending.json"
-    break
-  fi
-  sleep 5
-done
+if [ -f "$_DESIGN_DIR/feedback.json" ]; then
+  echo "SUBMIT_RECEIVED"
+  cat "$_DESIGN_DIR/feedback.json"
+elif [ -f "$_DESIGN_DIR/feedback-pending.json" ]; then
+  echo "REGENERATE_RECEIVED"
+  cat "$_DESIGN_DIR/feedback-pending.json"
+  rm "$_DESIGN_DIR/feedback-pending.json"
+else
+  echo "NO_FEEDBACK_FILE"
+fi
 \`\`\`

 The feedback JSON has this shape:
@@ -893,24 +904,30 @@ The feedback JSON has this shape:
 }
 \`\`\`

-**If \`feedback-pending.json\` found (\`"regenerated": true\`):**
+**If \`feedback.json\` found:** The user clicked Submit on the board.
+Read \`preferred\`, \`ratings\`, \`comments\`, \`overall\` from the JSON. Proceed with
+the approved variant.
+
+**If \`feedback-pending.json\` found:** The user clicked Regenerate/Remix on the board.
 1. Read \`regenerateAction\` from the JSON (\`"different"\`, \`"match"\`, \`"more_like_B"\`,
   \`"remix"\`, or custom text)
 2. If \`regenerateAction\` is \`"remix"\`, read \`remixSpec\` (e.g. \`{"layout":"A","colors":"B"}\`)
 3. Generate new variants with \`$D iterate\` or \`$D variants\` using updated brief
 4. Create new board: \`$D compare --images "..." --output "$_DESIGN_DIR/design-board.html"\`
-5. Parse the port from the \`$D serve\` stderr output (\`SERVE_STARTED: port=XXXXX\`),
-   then reload the board in the user's browser (same tab):
+5. Reload the board in the user's browser (same tab):
   \`curl -s -X POST http://127.0.0.1:PORT/api/reload -H 'Content-Type: application/json' -d '{"html":"$_DESIGN_DIR/design-board.html"}'\`
-6. The board auto-refreshes. **Poll again** for the next feedback file.
-7. Repeat until \`feedback.json\` appears (user clicked Submit).
+6. The board auto-refreshes. **AskUserQuestion again** with the same board URL to
+   wait for the next round of feedback. Repeat until \`feedback.json\` appears.

-**If \`feedback.json\` found (\`"regenerated": false\`):**
-1. Read \`preferred\`, \`ratings\`, \`comments\`, \`overall\` from the JSON
-2. Proceed with the approved variant
+**If \`NO_FEEDBACK_FILE\`:** The user typed their preferences directly in the
+AskUserQuestion response instead of using the board. Use their text response
+as the feedback.

-**If \`$D serve\` fails or no feedback within 10 minutes:** Fall back to AskUserQuestion:
-"I've opened the design board. Which variant do you prefer? Any feedback?"
+**POLLING FALLBACK:** Only use polling if \`$D serve\` fails (no port available).
+In that case, show each variant inline using the Read tool (so the user can see them),
+then use AskUserQuestion:
+"The comparison board server failed to start. I've shown the variants above.
+Which do you prefer? Any feedback?"

 **After receiving feedback (any path):** Output a clear summary confirming
 what was understood:
@@ -3,7 +3,7 @@
 * Each resolver takes a TemplateContext and returns the replacement string.
 */

-import type { TemplateContext } from './types';
+import type { TemplateContext, ResolverFn } from './types';

 // Domain modules
 import { generatePreamble } from './preamble';
@@ -11,12 +11,13 @@ import { generateTestFailureTriage } from './preamble';
 import { generateCommandReference, generateSnapshotFlags, generateBrowseSetup } from './browse';
 import { generateDesignMethodology, generateDesignHardRules, generateDesignOutsideVoices, generateDesignReviewLite, generateDesignSketch, generateDesignSetup, generateDesignMockup, generateDesignShotgunLoop } from './design';
 import { generateTestBootstrap, generateTestCoverageAuditPlan, generateTestCoverageAuditShip, generateTestCoverageAuditReview } from './testing';
-import { generateReviewDashboard, generatePlanFileReviewReport, generateSpecReviewLoop, generateBenefitsFrom, generateCodexSecondOpinion, generateAdversarialStep, generateCodexPlanReview, generatePlanCompletionAuditShip, generatePlanCompletionAuditReview, generatePlanVerificationExec } from './review';
-import { generateSlugEval, generateSlugSetup, generateBaseBranchDetect, generateDeployBootstrap, generateQAMethodology, generateCoAuthorTrailer } from './utility';
+import { generateReviewDashboard, generatePlanFileReviewReport, generateSpecReviewLoop, generateBenefitsFrom, generateCodexSecondOpinion, generateAdversarialStep, generateCodexPlanReview, generatePlanCompletionAuditShip, generatePlanCompletionAuditReview, generatePlanVerificationExec, generateScopeDrift } from './review';
+import { generateSlugEval, generateSlugSetup, generateBaseBranchDetect, generateDeployBootstrap, generateQAMethodology, generateCoAuthorTrailer, generateChangelogWorkflow } from './utility';
 import { generateLearningsSearch, generateLearningsLog } from './learnings';
 import { generateConfidenceCalibration } from './confidence';
+import { generateInvokeSkill } from './composition';

-export const RESOLVERS: Record<string, (ctx: TemplateContext) => string> = {
+export const RESOLVERS: Record<string, ResolverFn> = {
  SLUG_EVAL: generateSlugEval,
  SLUG_SETUP: generateSlugSetup,
  COMMAND_REFERENCE: generateCommandReference,
@@ -44,6 +45,7 @@ export const RESOLVERS: Record<string, (ctx: TemplateContext) => string> = {
  BENEFITS_FROM: generateBenefitsFrom,
  CODEX_SECOND_OPINION: generateCodexSecondOpinion,
  ADVERSARIAL_STEP: generateAdversarialStep,
+  SCOPE_DRIFT: generateScopeDrift,
  DEPLOY_BOOTSTRAP: generateDeployBootstrap,
  CODEX_PLAN_REVIEW: generateCodexPlanReview,
  PLAN_COMPLETION_AUDIT_SHIP: generatePlanCompletionAuditShip,
@@ -53,4 +55,6 @@ export const RESOLVERS: Record<string, (ctx: TemplateContext) => string> = {
  LEARNINGS_SEARCH: generateLearningsSearch,
  LEARNINGS_LOG: generateLearningsLog,
  CONFIDENCE_CALIBRATION: generateConfidenceCalibration,
+  INVOKE_SKILL: generateInvokeSkill,
+  CHANGELOG_WORKFLOW: generateChangelogWorkflow,
 };
@@ -77,6 +77,14 @@ if [ -f "$_LEARN_FILE" ]; then
 else
  echo "LEARNINGS: 0"
 fi
+# Check if CLAUDE.md has routing rules
+_HAS_ROUTING="no"
+if [ -f CLAUDE.md ] && grep -q "## Skill routing" CLAUDE.md 2>/dev/null; then
+  _HAS_ROUTING="yes"
+fi
+_ROUTING_DECLINED=$(${ctx.paths.binDir}/gstack-config get routing_declined 2>/dev/null || echo "false")
+echo "HAS_ROUTING: $_HAS_ROUTING"
+echo "ROUTING_DECLINED: $_ROUTING_DECLINED"
 \`\`\``;
 }

@@ -167,6 +175,51 @@ touch ~/.gstack/.proactive-prompted
 This only happens once. If \`PROACTIVE_PROMPTED\` is \`yes\`, skip this entirely.`;
 }

+function generateRoutingInjection(ctx: TemplateContext): string {
+  return `If \`HAS_ROUTING\` is \`no\` AND \`ROUTING_DECLINED\` is \`false\` AND \`PROACTIVE_PROMPTED\` is \`yes\`:
+Check if a CLAUDE.md file exists in the project root. If it does not exist, create it.
+
+Use AskUserQuestion:
+
+> gstack works best when your project's CLAUDE.md includes skill routing rules.
+> This tells Claude to use specialized workflows (like /ship, /investigate, /qa)
+> instead of answering directly. It's a one-time addition, about 15 lines.
+
+Options:
+- A) Add routing rules to CLAUDE.md (recommended)
+- B) No thanks, I'll invoke skills manually
+
+If A: Append this section to the end of CLAUDE.md:
+
+\`\`\`markdown
+
+## Skill routing
+
+When the user's request matches an available skill, ALWAYS invoke it using the Skill
+tool as your FIRST action. Do NOT answer directly, do NOT use other tools first.
+The skill has specialized workflows that produce better results than ad-hoc answers.
+
+Key routing rules:
+- Product ideas, "is this worth building", brainstorming → invoke office-hours
+- Bugs, errors, "why is this broken", 500 errors → invoke investigate
+- Ship, deploy, push, create PR → invoke ship
+- QA, test the site, find bugs → invoke qa
+- Code review, check my diff → invoke review
+- Update docs after shipping → invoke document-release
+- Weekly retro → invoke retro
+- Design system, brand → invoke design-consultation
+- Visual audit, design polish → invoke design-review
+- Architecture review → invoke plan-eng-review
+\`\`\`
+
+Then commit the change: \`git add CLAUDE.md && git commit -m "chore: add gstack skill routing rules to CLAUDE.md"\`
+
+If B: run \`${ctx.paths.binDir}/gstack-config set routing_declined true\`
+Say "No problem. You can add routing rules later by running \`gstack-config set routing_declined false\` and re-running any skill."
+
+This only happens once per project. If \`HAS_ROUTING\` is \`yes\` or \`ROUTING_DECLINED\` is \`true\`, skip this entirely.`;
+}
+
 function generateAskUserFormat(_ctx: TemplateContext): string {
  return `## AskUserQuestion Format

@@ -407,6 +460,21 @@ If you cannot determine the outcome, use "unknown". Both local JSONL and remote
 telemetry only run if telemetry is not off. The remote binary additionally requires
 the binary to exist.

+## Plan Mode Safe Operations
+
+When in plan mode, these operations are always allowed because they produce
+artifacts that inform the plan, not code changes:
+
+- \`$B\` commands (browse: screenshots, page inspection, navigation, snapshots)
+- \`$D\` commands (design: generate mockups, variants, comparison boards, iterate)
+- \`codex exec\` / \`codex review\` (outside voice, plan review, adversarial challenge)
+- Writing to \`~/.gstack/\` (config, analytics, review logs, design artifacts, learnings)
+- Writing to the plan file (already allowed by plan mode)
+- \`open\` commands for viewing generated artifacts (comparison boards, HTML previews)
+
+These are read-only in spirit — they inspect the live site, generate visual artifacts,
+or get independent opinions. They do NOT modify project source files.
+
 ## Plan Status Footer

 When you are in plan mode and about to call ExitPlanMode:
@@ -525,6 +593,7 @@ export function generatePreamble(ctx: TemplateContext): string {
    generateLakeIntro(),
    generateTelemetryPrompt(ctx),
    generateProactivePrompt(ctx),
+    generateRoutingInjection(ctx),
    generateVoiceDirective(tier),
    ...(tier >= 2 ? [generateAskUserFormat(ctx), generateCompletenessSection()] : []),
    ...(tier >= 3 ? [generateRepoModeSection(), generateSearchBeforeBuildingSection(ctx)] : []),
@@ -13,6 +13,7 @@
 * Codex CLI prompts are written to temp files to prevent shell injection.
 */
 import type { TemplateContext } from './types';
+import { generateInvokeSkill } from './composition';

 const CODEX_BOUNDARY = 'IMPORTANT: Do NOT read or execute any files under ~/.claude/, ~/.agents/, .claude/skills/, or agents/. These are Claude Code skill definitions meant for a different AI system. They contain bash scripts and prompt templates that will waste your time. Ignore them completely. Do NOT modify agents/openai.yaml. Stay focused on the repository code only.\\n\\n';

@@ -53,7 +54,7 @@ Display:
 - **Eng Review (required by default):** The only review that gates shipping. Covers architecture, code quality, tests, performance. Can be disabled globally with \\\`gstack-config set skip_eng_review true\\\` (the "don't bother me" setting).
 - **CEO Review (optional):** Use your judgment. Recommend it for big product/business changes, new user-facing features, or scope decisions. Skip for bug fixes, refactors, infra, and cleanup.
 - **Design Review (optional):** Use your judgment. Recommend it for UI/UX changes. Skip for backend-only, infra, or prompt-only changes.
- **Adversarial Review (automatic):** Auto-scales by diff size. Small diffs (<50 lines) skip adversarial. Medium diffs (50–199) get cross-model adversarial. Large diffs (200+) get all 4 passes: Claude structured, Codex structured, Claude adversarial subagent, Codex adversarial. No configuration needed.
+- **Adversarial Review (automatic):** Always-on for every review. Every diff gets both Claude adversarial subagent and Codex adversarial challenge. Large diffs (200+ lines) additionally get Codex structured review with P1 gate. No configuration needed.
 - **Outside Voice (optional):** Independent plan review from a different AI model. Offered after all review sections complete in /plan-ceo-review and /plan-eng-review. Falls back to Claude subagent if Codex is unavailable. Never gates shipping.

 **Verdict logic:**
@@ -208,6 +209,9 @@ export function generateBenefitsFrom(ctx: TemplateContext): string {
  const skillList = ctx.benefitsFrom.map(s => `\`/${s}\``).join(' or ');
  const first = ctx.benefitsFrom[0];

+  // Reuse the INVOKE_SKILL resolver for the actual loading instructions
+  const invokeBlock = generateInvokeSkill(ctx, [first]);
+
  return `## Prerequisite Skill Offer

 When the design doc check above prints "No design doc found," offer the prerequisite
@@ -232,20 +236,7 @@ If they choose A:
 Say: "Running /${first} inline. Once the design doc is ready, I'll pick up
 the review right where we left off."

-Read the ${first} skill file from disk using the Read tool:
-\`~/.claude/skills/gstack/${first}/SKILL.md\`
-
-Follow it inline, **skipping these sections** (already handled by the parent skill):
- Preamble (run first)
- AskUserQuestion Format
- Completeness Principle — Boil the Lake
- Search Before Building
- Contributor Mode
- Completion Status Protocol
- Telemetry (run last)
-
-If the Read fails (file not found), say:
-"Could not load /${first} — proceeding with standard review."
+${invokeBlock}

 After /${first} completes, re-run the design doc check:
 \`\`\`bash
@@ -368,6 +359,50 @@ SECOND OPINION (Claude subagent):
 If A: revise the premise and note the revision. If B: proceed (and note that the user defended this premise with reasoning — this is a founder signal if they articulate WHY they disagree, not just dismiss).`;
 }

+// ─── Scope Drift Detection (shared between /review and /ship) ────────
+
+export function generateScopeDrift(ctx: TemplateContext): string {
+  const isShip = ctx.skillName === 'ship';
+  const stepNum = isShip ? '3.48' : '1.5';
+
+  return `## Step ${stepNum}: Scope Drift Detection
+
+Before reviewing code quality, check: **did they build what was requested — nothing more, nothing less?**
+
+1. Read \`TODOS.md\` (if it exists). Read PR description (\`gh pr view --json body --jq .body 2>/dev/null || true\`).
+   Read commit messages (\`git log origin/<base>..HEAD --oneline\`).
+   **If no PR exists:** rely on commit messages and TODOS.md for stated intent — this is the common case since /review runs before /ship creates the PR.
+2. Identify the **stated intent** — what was this branch supposed to accomplish?
+3. Run \`git diff origin/<base>...HEAD --stat\` and compare the files changed against the stated intent.
+
+4. Evaluate with skepticism (incorporating plan completion results if available from an earlier step or adjacent section):
+
+   **SCOPE CREEP detection:**
+   - Files changed that are unrelated to the stated intent
+   - New features or refactors not mentioned in the plan
+   - "While I was in there..." changes that expand blast radius
+
+   **MISSING REQUIREMENTS detection:**
+   - Requirements from TODOS.md/PR description not addressed in the diff
+   - Test coverage gaps for stated requirements
+   - Partial implementations (started but not finished)
+
+5. Output (before the main review begins):
+   \\\`\\\`\\\`
+   Scope Check: [CLEAN / DRIFT DETECTED / REQUIREMENTS MISSING]
+   Intent: <1-line summary of what was requested>
+   Delivered: <1-line summary of what the diff actually does>
+   [If drift: list each out-of-scope change]
+   [If missing: list each unaddressed requirement]
+   \\\`\\\`\\\`
+
+6. This is **INFORMATIONAL** — does not block the review. Proceed to the next step.
+
+---`;
+}
+
+// ─── Adversarial Review (always-on) ──────────────────────────────────
+
 export function generateAdversarialStep(ctx: TemplateContext): string {
  // Codex host: strip entirely — Codex should never invoke itself
  if (ctx.host === 'codex') return '';
@@ -375,9 +410,9 @@ export function generateAdversarialStep(ctx: TemplateContext): string {
  const isShip = ctx.skillName === 'ship';
  const stepNum = isShip ? '3.8' : '5.7';

-  return `## Step ${stepNum}: Adversarial review (auto-scaled)
+  return `## Step ${stepNum}: Adversarial review (always-on)

-Adversarial review thoroughness scales automatically based on diff size. No configuration needed.
+Every diff gets adversarial review from both Claude and Codex. LOC is not a proxy for risk — a 5-line auth change can be critical.

 **Detect diff size and tool availability:**

@@ -386,30 +421,34 @@ DIFF_INS=$(git diff origin/<base> --stat | tail -1 | grep -oE '[0-9]+ insertion'
 DIFF_DEL=$(git diff origin/<base> --stat | tail -1 | grep -oE '[0-9]+ deletion' | grep -oE '[0-9]+' || echo "0")
 DIFF_TOTAL=$((DIFF_INS + DIFF_DEL))
 which codex 2>/dev/null && echo "CODEX_AVAILABLE" || echo "CODEX_NOT_AVAILABLE"
-# Respect old opt-out
+# Legacy opt-out — only gates Codex passes, Claude always runs
 OLD_CFG=$(~/.claude/skills/gstack/bin/gstack-config get codex_reviews 2>/dev/null || true)
 echo "DIFF_SIZE: $DIFF_TOTAL"
 echo "OLD_CFG: \${OLD_CFG:-not_set}"
 \`\`\`

-If \`OLD_CFG\` is \`disabled\`: skip this step silently. Continue to the next step.
+If \`OLD_CFG\` is \`disabled\`: skip Codex passes only. Claude adversarial subagent still runs (it's free and fast). Jump to the "Claude adversarial subagent" section.

-**User override:** If the user explicitly requested a specific tier (e.g., "run all passes", "paranoid review", "full adversarial", "do all 4 passes", "thorough review"), honor that request regardless of diff size. Jump to the matching tier section.
-
-**Auto-select tier based on diff size:**
- **Small (< 50 lines changed):** Skip adversarial review entirely. Print: "Small diff ($DIFF_TOTAL lines) — adversarial review skipped." Continue to the next step.
- **Medium (50–199 lines changed):** Run Codex adversarial challenge (or Claude adversarial subagent if Codex unavailable). Jump to the "Medium tier" section.
- **Large (200+ lines changed):** Run all remaining passes — Codex structured review + Claude adversarial subagent + Codex adversarial. Jump to the "Large tier" section.
+**User override:** If the user explicitly requested "full review", "structured review", or "P1 gate", also run the Codex structured review regardless of diff size.

 ---

-### Medium tier (50–199 lines)
+### Claude adversarial subagent (always runs)

-Claude's structured review already ran. Now add a **cross-model adversarial challenge**.
+Dispatch via the Agent tool. The subagent has fresh context — no checklist bias from the structured review. This genuine independence catches things the primary reviewer is blind to.

-**If Codex is available:** run the Codex adversarial challenge. **If Codex is NOT available:** fall back to the Claude adversarial subagent instead.
+Subagent prompt:
+"Read the diff for this branch with \`git diff origin/<base>\`. Think like an attacker and a chaos engineer. Your job is to find ways this code will fail in production. Look for: edge cases, race conditions, security holes, resource leaks, failure modes, silent data corruption, logic errors that produce wrong results silently, error handling that swallows failures, and trust boundary violations. Be adversarial. Be thorough. No compliments — just the problems. For each finding, classify as FIXABLE (you know how to fix it) or INVESTIGATE (needs human judgment)."

-**Codex adversarial:**
+Present findings under an \`ADVERSARIAL REVIEW (Claude subagent):\` header. **FIXABLE findings** flow into the same Fix-First pipeline as the structured review. **INVESTIGATE findings** are presented as informational.
+
+If the subagent fails or times out: "Claude adversarial subagent unavailable. Continuing."
+
+---
+
+### Codex adversarial challenge (always runs when available)
+
+If Codex is available AND \`OLD_CFG\` is NOT \`disabled\`:

 \`\`\`bash
 TMPERR_ADV=$(mktemp /tmp/codex-adv-XXXXXXXX)
@@ -429,34 +468,16 @@ Present the full output verbatim. This is informational — it never blocks ship
 - **Timeout:** "Codex timed out after 5 minutes."
 - **Empty response:** "Codex returned no response. Stderr: <paste relevant error>."

-On any Codex error, fall back to the Claude adversarial subagent automatically.
+**Cleanup:** Run \`rm -f "$TMPERR_ADV"\` after processing.

-**Claude adversarial subagent** (fallback when Codex unavailable or errored):
-
-Dispatch via the Agent tool. The subagent has fresh context — no checklist bias from the structured review. This genuine independence catches things the primary reviewer is blind to.
-
-Subagent prompt:
-"Read the diff for this branch with \`git diff origin/<base>\`. Think like an attacker and a chaos engineer. Your job is to find ways this code will fail in production. Look for: edge cases, race conditions, security holes, resource leaks, failure modes, silent data corruption, logic errors that produce wrong results silently, error handling that swallows failures, and trust boundary violations. Be adversarial. Be thorough. No compliments — just the problems. For each finding, classify as FIXABLE (you know how to fix it) or INVESTIGATE (needs human judgment)."
-
-Present findings under an \`ADVERSARIAL REVIEW (Claude subagent):\` header. **FIXABLE findings** flow into the same Fix-First pipeline as the structured review. **INVESTIGATE findings** are presented as informational.
-
-If the subagent fails or times out: "Claude adversarial subagent unavailable. Continuing without adversarial review."
-
-**Persist the review result:**
-\`\`\`bash
-~/.claude/skills/gstack/bin/gstack-review-log '{"skill":"adversarial-review","timestamp":"'"$(date -u +%Y-%m-%dT%H:%M:%SZ)"'","status":"STATUS","source":"SOURCE","tier":"medium","commit":"'"$(git rev-parse --short HEAD)"'"}'
-\`\`\`
-Substitute STATUS: "clean" if no findings, "issues_found" if findings exist. SOURCE: "codex" if Codex ran, "claude" if subagent ran. If both failed, do NOT persist.
-
-**Cleanup:** Run \`rm -f "$TMPERR_ADV"\` after processing (if Codex was used).
+If Codex is NOT available: "Codex CLI not found — running Claude adversarial only. Install Codex for cross-model coverage: \`npm install -g @openai/codex\`"

 ---

-### Large tier (200+ lines)
+### Codex structured review (large diffs only, 200+ lines)

-Claude's structured review already ran. Now run **all three remaining passes** for maximum coverage:
+If \`DIFF_TOTAL >= 200\` AND Codex is available AND \`OLD_CFG\` is NOT \`disabled\`:

-**1. Codex structured review (if available):**
 \`\`\`bash
 TMPERR=$(mktemp /tmp/codex-review-XXXXXXXX)
 _REPO_ROOT=$(git rev-parse --show-toplevel) || { echo "ERROR: not in a git repo" >&2; exit 1; }
@@ -477,34 +498,34 @@ B) Continue — review will still complete

 If A: address the findings${isShip ? '. After fixing, re-run tests (Step 3) since code has changed' : ''}. Re-run \`codex review\` to verify.

-Read stderr for errors (same error handling as medium tier).
+Read stderr for errors (same error handling as Codex adversarial above).

 After stderr: \`rm -f "$TMPERR"\`

-**2. Claude adversarial subagent:** Dispatch a subagent with the adversarial prompt (same prompt as medium tier). This always runs regardless of Codex availability.
-
-**3. Codex adversarial challenge (if available):** Run \`codex exec\` with the adversarial prompt (same as medium tier).
-
-If Codex is not available for steps 1 and 3, note to the user: "Codex CLI not found — large-diff review ran Claude structured + Claude adversarial (2 of 4 passes). Install Codex for full 4-pass coverage: \`npm install -g @openai/codex\`"
-
-**Persist the review result AFTER all passes complete** (not after each sub-step):
-\`\`\`bash
-~/.claude/skills/gstack/bin/gstack-review-log '{"skill":"adversarial-review","timestamp":"'"$(date -u +%Y-%m-%dT%H:%M:%SZ)"'","status":"STATUS","source":"SOURCE","tier":"large","gate":"GATE","commit":"'"$(git rev-parse --short HEAD)"'"}'
-\`\`\`
-Substitute: STATUS = "clean" if no findings across ALL passes, "issues_found" if any pass found issues. SOURCE = "both" if Codex ran, "claude" if only Claude subagent ran. GATE = the Codex structured review gate result ("pass"/"fail"), or "informational" if Codex was unavailable. If all passes failed, do NOT persist.
+If \`DIFF_TOTAL < 200\`: skip this section silently. The Claude + Codex adversarial passes provide sufficient coverage for smaller diffs.

 ---

-### Cross-model synthesis (medium and large tiers)
+### Persist the review result
+
+After all passes complete, persist:
+\`\`\`bash
+~/.claude/skills/gstack/bin/gstack-review-log '{"skill":"adversarial-review","timestamp":"'"$(date -u +%Y-%m-%dT%H:%M:%SZ)"'","status":"STATUS","source":"SOURCE","tier":"always","gate":"GATE","commit":"'"$(git rev-parse --short HEAD)"'"}'
+\`\`\`
+Substitute: STATUS = "clean" if no findings across ALL passes, "issues_found" if any pass found issues. SOURCE = "both" if Codex ran, "claude" if only Claude subagent ran. GATE = the Codex structured review gate result ("pass"/"fail"), "skipped" if diff < 200, or "informational" if Codex was unavailable. If all passes failed, do NOT persist.
+
+---
+
+### Cross-model synthesis

 After all passes complete, synthesize findings across all sources:

 \`\`\`
-ADVERSARIAL REVIEW SYNTHESIS (auto: TIER, N lines):
+ADVERSARIAL REVIEW SYNTHESIS (always-on, N lines):
 ════════════════════════════════════════════════════════════
  High confidence (found by multiple sources): [findings agreed on by >1 pass]
  Unique to Claude structured review: [from earlier step]
-  Unique to Claude adversarial: [from subagent, if ran]
+  Unique to Claude adversarial: [from subagent]
  Unique to Codex: [from codex adversarial or code review, if ran]
  Models used: Claude structured ✓  Claude adversarial ✓/✗  Codex ✓/✗
 ════════════════════════════════════════════════════════════
@@ -628,6 +649,9 @@ For each substantive tension point, use AskUserQuestion:

 > "Cross-model disagreement on [topic]. The review found [X] but the outside voice
 > argues [Y]. [One sentence on what context you might be missing.]"
+>
+> RECOMMENDATION: Choose [A or B] because [one-line reason explaining which argument
+> is more compelling and why]. Completeness: A=X/10, B=Y/10.

 Options:
 - A) Accept the outside voice's recommendation (I'll apply this change)
@@ -40,3 +40,6 @@ export interface TemplateContext {
  paths: HostPaths;
  preambleTier?: number;  // 1-4, controls which preamble sections are included
 }
+
+/** Resolver function signature. args is populated for parameterized placeholders like {{INVOKE_SKILL:name}}. */
+export type ResolverFn = (ctx: TemplateContext, args?: string[]) => string;
@@ -375,3 +375,47 @@ export function generateCoAuthorTrailer(ctx: TemplateContext): string {
  }
  return 'Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>';
 }
+
+export function generateChangelogWorkflow(_ctx: TemplateContext): string {
+  return `## CHANGELOG (auto-generate)
+
+1. Read \`CHANGELOG.md\` header to know the format.
+
+2. **First, enumerate every commit on the branch:**
+   \`\`\`bash
+   git log <base>..HEAD --oneline
+   \`\`\`
+   Copy the full list. Count the commits. You will use this as a checklist.
+
+3. **Read the full diff** to understand what each commit actually changed:
+   \`\`\`bash
+   git diff <base>...HEAD
+   \`\`\`
+
+4. **Group commits by theme** before writing anything. Common themes:
+   - New features / capabilities
+   - Performance improvements
+   - Bug fixes
+   - Dead code removal / cleanup
+   - Infrastructure / tooling / tests
+   - Refactoring
+
+5. **Write the CHANGELOG entry** covering ALL groups:
+   - If existing CHANGELOG entries on the branch already cover some commits, replace them with one unified entry for the new version
+   - Categorize changes into applicable sections:
+     - \`### Added\` — new features
+     - \`### Changed\` — changes to existing functionality
+     - \`### Fixed\` — bug fixes
+     - \`### Removed\` — removed features
+   - Write concise, descriptive bullet points
+   - Insert after the file header (line 5), dated today
+   - Format: \`## [X.Y.Z.W] - YYYY-MM-DD\`
+   - **Voice:** Lead with what the user can now **do** that they couldn't before. Use plain language, not implementation details. Never mention TODOS.md, internal tracking, or contributor-facing details.
+
+6. **Cross-check:** Compare your CHANGELOG entry against the commit list from step 2.
+   Every commit must map to at least one bullet point. If any commit is unrepresented,
+   add it now. If the branch has N commits spanning K themes, the CHANGELOG must
+   reflect all K themes.
+
+**Do NOT ask the user to describe changes.** Infer from the diff and commit history.`;
+}