Merge branch 'main' into garrytan/team-supabase-store

Resolved 4 conflicts:
- scripts/gen-skill-docs.ts: kept ARTIFACT_SETUP + added main's new resolvers (SPEC_REVIEW_LOOP, DESIGN_SKETCH, BENEFITS_FROM, CODEX_REVIEW_STEP). Updated codex review-log to use new paths.
- ship/SKILL.md.tmpl: adopted {{CODEX_REVIEW_STEP}} macro from main
- test/skill-e2e.test.ts: added main's new E2E tests (office-hours spec review, plan-ceo benefits-from) + kept our E2E isolation cleanup

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-05-07 05:56:41 +02:00 · 2026-03-21 09:29:05 -07:00
parent 0ffb4e9ec5 1f4b6fd7a2
commit 2769cd043d
96 changed files with 17868 additions and 342 deletions
@@ -19,9 +19,45 @@ const DRY_RUN = process.argv.includes('--dry-run');

 // ─── Template Context ───────────────────────────────────────

+/** Agent host whose install layout the generated skill docs target. */
+type Host = 'claude' | 'codex';
+
+/**
+ * Resolve the target host from a command-line argument list.
+ *
+ * Accepts `--host=<value>` or `--host <value>`. Only the exact `--host` flag is
+ * recognised, so unrelated flags that merely share the prefix (for example
+ * `--hostname`) are ignored instead of being parsed as a host value.
+ *
+ * @param argv - Argument list to inspect (normally `process.argv`).
+ * @returns `'claude'` when no `--host` flag is present or its value is `claude`;
+ *   `'codex'` when the value is `codex` or its alias `agents`.
+ * @throws Error when the flag is present with any other (or a missing) value.
+ */
+function parseHostArg(argv: readonly string[]): Host {
+  const flagIndex = argv.findIndex(a => a === '--host' || a.startsWith('--host='));
+  if (flagIndex === -1) return 'claude';
+  const flag = argv[flagIndex];
+  // `--host=value` carries the value inline; a bare `--host` takes the next argument.
+  const val = flag.includes('=') ? flag.split('=')[1] : argv[flagIndex + 1];
+  if (val === 'codex' || val === 'agents') return 'codex';
+  if (val === 'claude') return 'claude';
+  throw new Error(`Unknown host: ${val}. Use claude, codex, or agents.`);
+}
+
+/** The raw `--host` / `--host=<value>` argument as passed on the command line, if any. */
+const HOST_ARG = process.argv.find(a => a === '--host' || a.startsWith('--host='));
+/** Host selected for this run; defaults to `claude` when no `--host` flag is given. */
+const HOST: Host = parseHostArg(process.argv);
+
+/** Filesystem locations of the gstack skill install for one host. */
+interface HostPaths {
+  /** Skill root under the user's home directory (written with a leading `~`). */
+  skillRoot: string;
+  /** Skill root relative to a repository checkout; generated shell prefixes it with the git top-level. */
+  localSkillRoot: string;
+  /** Directory holding the gstack helper executables (e.g. `gstack-config`, `gstack-update-check`). */
+  binDir: string;
+  /** Directory containing the `browse` binary. */
+  browseDir: string;
+}
+
+/** Assemble a host's path set from its home-directory root and its repo-local root. */
+function hostPathsFor(skillRoot: string, localSkillRoot: string): HostPaths {
+  return {
+    skillRoot,
+    localSkillRoot,
+    binDir: `${skillRoot}/bin`,
+    browseDir: `${skillRoot}/browse/dist`,
+  };
+}
+
+/** Install locations for each supported host, keyed by host name. */
+const HOST_PATHS: Record<Host, HostPaths> = {
+  claude: hostPathsFor('~/.claude/skills/gstack', '.claude/skills/gstack'),
+  codex: hostPathsFor('~/.codex/skills/gstack', '.agents/skills/gstack'),
+};
+
+/** Per-template values handed to every placeholder resolver. */
 interface TemplateContext {
+  /** Skill name: the frontmatter `name:` value, or the template's directory name when absent. */
  skillName: string;
+  /** Path of the `.tmpl` file being processed. */
  tmplPath: string;
+  /** Skill names parsed from an inline `benefits-from: [a, b]` frontmatter list, if present. */
+  benefitsFrom?: string[];
+  /** Host the output is being generated for. */
+  host: Host;
+  /** Install paths for `host`. */
+  paths: HostPaths;
 }

 // ─── Placeholder Resolvers ──────────────────────────────────
@@ -101,33 +137,44 @@ function generateSnapshotFlags(_ctx: TemplateContext): string {
  return lines.join('\n');
 }

-function generatePreamble(ctx: TemplateContext): string {
+/**
+ * Build the "Preamble (run first)" section: a fenced bash script that runs the
+ * update check, records the session, reads gstack config values (contributor,
+ * proactive, telemetry), prints BRANCH / PROACTIVE / LAKE_INTRO / TELEMETRY /
+ * TEL_PROMPTED markers, and appends a usage line to the analytics log.
+ *
+ * Every helper executable is addressed through `ctx.paths` so the script
+ * follows the selected host's install location.
+ *
+ * @param ctx - Template context; `skillName` is embedded in the analytics JSON
+ *   and `paths` supplies the helper binary locations.
+ * @returns The markdown heading plus the fenced bash block.
+ */
+function generatePreambleBash(ctx: TemplateContext): string {
  return `## Preamble (run first)

 \`\`\`bash
-_UPD=$(~/.claude/skills/gstack/bin/gstack-update-check 2>/dev/null || .claude/skills/gstack/bin/gstack-update-check 2>/dev/null || true)
+_UPD=$(${ctx.paths.binDir}/gstack-update-check 2>/dev/null || ${ctx.paths.localSkillRoot}/bin/gstack-update-check 2>/dev/null || true)
 [ -n "$_UPD" ] && echo "$_UPD" || true
 mkdir -p ~/.gstack/sessions
 touch ~/.gstack/sessions/"$PPID"
 _SESSIONS=$(find ~/.gstack/sessions -mmin -120 -type f 2>/dev/null | wc -l | tr -d ' ')
 find ~/.gstack/sessions -mmin +120 -type f -delete 2>/dev/null || true
-_CONTRIB=$(~/.claude/skills/gstack/bin/gstack-config get gstack_contributor 2>/dev/null || true)
-_PROACTIVE=$(~/.claude/skills/gstack/bin/gstack-config get proactive 2>/dev/null || echo "true")
+_CONTRIB=$(${ctx.paths.binDir}/gstack-config get gstack_contributor 2>/dev/null || true)
+_PROACTIVE=$(${ctx.paths.binDir}/gstack-config get proactive 2>/dev/null || echo "true")
 _BRANCH=$(git branch --show-current 2>/dev/null || echo "unknown")
 echo "BRANCH: $_BRANCH"
 echo "PROACTIVE: $_PROACTIVE"
 _LAKE_SEEN=$([ -f ~/.gstack/.completeness-intro-seen ] && echo "yes" || echo "no")
 echo "LAKE_INTRO: $_LAKE_SEEN"
+_TEL=$(${ctx.paths.binDir}/gstack-config get telemetry 2>/dev/null || true)
+_TEL_PROMPTED=$([ -f ~/.gstack/.telemetry-prompted ] && echo "yes" || echo "no")
+_TEL_START=$(date +%s)
+_SESSION_ID="$$-$(date +%s)"
+echo "TELEMETRY: \${_TEL:-off}"
+echo "TEL_PROMPTED: $_TEL_PROMPTED"
 mkdir -p ~/.gstack/analytics
 echo '{"skill":"${ctx.skillName}","ts":"'$(date -u +%Y-%m-%dT%H:%M:%SZ)'","repo":"'$(basename "$(git rev-parse --show-toplevel 2>/dev/null)" 2>/dev/null || echo "unknown")'"}'  >> ~/.gstack/analytics/skill-usage.jsonl 2>/dev/null || true
-\`\`\`
+for _PF in ~/.gstack/analytics/.pending-*; do [ -f "$_PF" ] && ${ctx.paths.binDir}/gstack-telemetry-log --event-type skill_run --skill _pending_finalize --outcome unknown --session-id "$_SESSION_ID" 2>/dev/null || true; break; done
+\`\`\``;
+}

-If \`PROACTIVE\` is \`"false"\`, do not proactively suggest gstack skills — only invoke
+/**
+ * Build the instructions that follow the preamble script: how to honour the
+ * `PROACTIVE` opt-out and how to react to `UPGRADE_AVAILABLE` / `JUST_UPGRADED`
+ * output. The upgrade skill is referenced under `ctx.paths.skillRoot`.
+ *
+ * @param ctx - Template context supplying the host's skill root.
+ * @returns Two markdown paragraphs separated by a blank line.
+ */
+function generateUpgradeCheck(ctx: TemplateContext): string {
+  return `If \`PROACTIVE\` is \`"false"\`, do not proactively suggest gstack skills — only invoke
 them when the user explicitly asks. The user opted out of proactive suggestions.

-If output shows \`UPGRADE_AVAILABLE <old> <new>\`: read \`~/.claude/skills/gstack/gstack-upgrade/SKILL.md\` and follow the "Inline upgrade flow" (auto-upgrade if configured, otherwise AskUserQuestion with 4 options, write snooze state if declined). If \`JUST_UPGRADED <from> <to>\`: tell user "Running gstack v{to} (just updated!)" and continue.
+If output shows \`UPGRADE_AVAILABLE <old> <new>\`: read \`${ctx.paths.skillRoot}/gstack-upgrade/SKILL.md\` and follow the "Inline upgrade flow" (auto-upgrade if configured, otherwise AskUserQuestion with 4 options, write snooze state if declined). If \`JUST_UPGRADED <from> <to>\`: tell user "Running gstack v{to} (just updated!)" and continue.`;
+}

-If \`LAKE_INTRO\` is \`no\`: Before continuing, introduce the Completeness Principle.
+function generateLakeIntro(): string {
+  return `If \`LAKE_INTRO\` is \`no\`: Before continuing, introduce the Completeness Principle.
 Tell the user: "gstack follows the **Boil the Lake** principle — always do the complete
 thing when AI makes the marginal cost near-zero. Read more: https://garryslist.org/posts/boil-the-ocean"
 Then offer to open the essay in their default browser:
@@ -137,9 +184,46 @@ open https://garryslist.org/posts/boil-the-ocean
 touch ~/.gstack/.completeness-intro-seen
 \`\`\`

-Only run \`open\` if the user says yes. Always run \`touch\` to mark as seen. This only happens once.
+Only run \`open\` if the user says yes. Always run \`touch\` to mark as seen. This only happens once.`;
+}

-## AskUserQuestion Format
+/**
+ * Build the one-time telemetry opt-in instructions: a two-stage AskUserQuestion
+ * flow (community, then anonymous, then off) followed by the marker-file touch
+ * that stops the prompt from being shown again.
+ *
+ * @param ctx - Template context; `paths.binDir` locates `gstack-config`.
+ * @returns The markdown instructions.
+ */
+function generateTelemetryPrompt(ctx: TemplateContext): string {
+  // Shared command prefix; each answer only changes the final mode word.
+  const setTelemetry = `${ctx.paths.binDir}/gstack-config set telemetry`;
+
+  return `If \`TEL_PROMPTED\` is \`no\` AND \`LAKE_INTRO\` is \`yes\`: After the lake intro is handled,
+ask the user about telemetry. Use AskUserQuestion:
+
+> Help gstack get better! Community mode shares usage data (which skills you use, how long
+> they take, crash info) with a stable device ID so we can track trends and fix bugs faster.
+> No code, file paths, or repo names are ever sent.
+> Change anytime with \`gstack-config set telemetry off\`.
+
+Options:
+- A) Help gstack get better! (recommended)
+- B) No thanks
+
+If A: run \`${setTelemetry} community\`
+
+If B: ask a follow-up AskUserQuestion:
+
+> How about anonymous mode? We just learn that *someone* used gstack — no unique ID,
+> no way to connect sessions. Just a counter that helps us know if anyone's out there.
+
+Options:
+- A) Sure, anonymous is fine
+- B) No thanks, fully off
+
+If B→A: run \`${setTelemetry} anonymous\`
+If B→B: run \`${setTelemetry} off\`
+
+Always run:
+\`\`\`bash
+touch ~/.gstack/.telemetry-prompted
+\`\`\`
+
+This only happens once. If \`TEL_PROMPTED\` is \`yes\`, skip this entirely.`;
+}
+
+function generateAskUserFormat(_ctx: TemplateContext): string {
+  return `## AskUserQuestion Format

 **ALWAYS follow this structure for every AskUserQuestion call:**
 1. **Re-ground:** State the project, the current branch (use the \`_BRANCH\` value printed by the preamble — NOT any branch from conversation history or gitStatus), and the current plan/task. (1-2 sentences)
@@ -149,9 +233,11 @@ Only run \`open\` if the user says yes. Always run \`touch\` to mark as seen. Th

 Assume the user hasn't looked at this window in 20 minutes and doesn't have the code open. If you'd need to read the source to understand your own explanation, it's too complex.

-Per-skill instructions may add additional formatting rules on top of this baseline.
+Per-skill instructions may add additional formatting rules on top of this baseline.`;
+}

-## Completeness Principle — Boil the Lake
+function generateCompletenessSection(): string {
+  return `## Completeness Principle — Boil the Lake

 AI-assisted coding makes the marginal cost of completeness near-zero. When you present options:

@@ -174,9 +260,11 @@ AI-assisted coding makes the marginal cost of completeness near-zero. When you p
 - BAD: "Choose B — it covers 90% of the value with less code." (If A is only 70 lines more, choose A.)
 - BAD: "We can skip edge case handling to save time." (Edge case handling costs minutes with CC.)
 - BAD: "Let's defer test coverage to a follow-up PR." (Tests are the cheapest lake to boil.)
- BAD: Quoting only human-team effort: "This would take 2 weeks." (Say: "2 weeks human / ~1 hour CC.")
+- BAD: Quoting only human-team effort: "This would take 2 weeks." (Say: "2 weeks human / ~1 hour CC.")`;
+}

-## Contributor Mode
+function generateContributorMode(): string {
+  return `## Contributor Mode

 If \`_CONTRIB\` is \`true\`: you are in **contributor mode**. You're a gstack user who also helps make it better.

@@ -211,9 +299,11 @@ Hey gstack team — ran into this while using /{skill-name}:
 **Date:** {YYYY-MM-DD} | **Version:** {gstack version} | **Skill:** /{skill}
 \`\`\`

-Slug: lowercase, hyphens, max 60 chars (e.g. \`browse-js-no-await\`). Skip if file already exists. Max 3 reports per session. File inline and continue — don't stop the workflow. Tell user: "Filed gstack field report: {title}"
+Slug: lowercase, hyphens, max 60 chars (e.g. \`browse-js-no-await\`). Skip if file already exists. Max 3 reports per session. File inline and continue — don't stop the workflow. Tell user: "Filed gstack field report: {title}"`;
+}

-## Completion Status Protocol
+function generateCompletionStatus(): string {
+  return `## Completion Status Protocol

 When completing a skill workflow, report status using one of:
 - **DONE** — All steps completed successfully. Evidence provided for each claim.
@@ -236,17 +326,58 @@ STATUS: BLOCKED | NEEDS_CONTEXT
 REASON: [1-2 sentences]
 ATTEMPTED: [what you tried]
 RECOMMENDATION: [what the user should do next]
-\`\`\``;
+\`\`\`
+
+## Telemetry (run last)
+
+After the skill workflow completes (success, error, or abort), log the telemetry event.
+Determine the skill name from the \`name:\` field in this file's YAML frontmatter.
+Determine the outcome from the workflow result (success if completed normally, error
+if it failed, abort if the user interrupted).
+
+**PLAN MODE EXCEPTION — ALWAYS RUN:** This command writes telemetry to
+\`~/.gstack/analytics/\` (user config directory, not project files). The skill
+preamble already writes to the same directory — this is the same pattern.
+Skipping this command loses session duration and outcome data.
+
+Run this bash:
+
+\`\`\`bash
+_TEL_END=$(date +%s)
+_TEL_DUR=$(( _TEL_END - _TEL_START ))
+rm -f ~/.gstack/analytics/.pending-"$_SESSION_ID" 2>/dev/null || true
+~/.claude/skills/gstack/bin/gstack-telemetry-log \\
+  --skill "SKILL_NAME" --duration "$_TEL_DUR" --outcome "OUTCOME" \\
+  --used-browse "USED_BROWSE" --session-id "$_SESSION_ID" 2>/dev/null &
+\`\`\`
+
+Replace \`SKILL_NAME\` with the actual skill name from frontmatter, \`OUTCOME\` with
+success/error/abort, and \`USED_BROWSE\` with true/false based on whether \`$B\` was used.
+If you cannot determine the outcome, use "unknown". This runs in the background and
+never blocks the user.`;
 }

-function generateBrowseSetup(_ctx: TemplateContext): string {
+/**
+ * Assemble the full preamble by concatenating its sections in a fixed order,
+ * separated by one blank line each.
+ *
+ * @param ctx - Template context forwarded to the sections that need it.
+ * @returns The combined preamble markdown.
+ */
+function generatePreamble(ctx: TemplateContext): string {
+  const sections: string[] = [];
+  sections.push(generatePreambleBash(ctx));
+  sections.push(generateUpgradeCheck(ctx));
+  sections.push(generateLakeIntro());
+  sections.push(generateTelemetryPrompt(ctx));
+  sections.push(generateAskUserFormat(ctx));
+  sections.push(generateCompletenessSection());
+  sections.push(generateContributorMode());
+  sections.push(generateCompletionStatus());
+  return sections.join('\n\n');
+}
+
+function generateBrowseSetup(ctx: TemplateContext): string {
  return `## SETUP (run this check BEFORE any browse command)

 \`\`\`bash
 _ROOT=$(git rev-parse --show-toplevel 2>/dev/null)
 B=""
-[ -n "$_ROOT" ] && [ -x "$_ROOT/.claude/skills/gstack/browse/dist/browse" ] && B="$_ROOT/.claude/skills/gstack/browse/dist/browse"
-[ -z "$B" ] && B=~/.claude/skills/gstack/browse/dist/browse
+[ -n "$_ROOT" ] && [ -x "$_ROOT/${ctx.paths.localSkillRoot}/browse/dist/browse" ] && B="$_ROOT/${ctx.paths.localSkillRoot}/browse/dist/browse"
+[ -z "$B" ] && B=${ctx.paths.browseDir}/browse
 if [ -x "$B" ]; then
  echo "READY: $B"
 else
@@ -968,7 +1099,7 @@ Parse the output. Find the most recent entry for each skill (plan-ceo-review, pl
 - **Eng Review (required by default):** The only review that gates shipping. Covers architecture, code quality, tests, performance. Can be disabled globally with \\\`gstack-config set skip_eng_review true\\\` (the "don't bother me" setting).
 - **CEO Review (optional):** Use your judgment. Recommend it for big product/business changes, new user-facing features, or scope decisions. Skip for bug fixes, refactors, infra, and cleanup.
 - **Design Review (optional):** Use your judgment. Recommend it for UI/UX changes. Skip for backend-only, infra, or prompt-only changes.
- **Codex Review (optional):** Independent second opinion from OpenAI Codex CLI. Shows pass/fail gate. Recommend for critical code changes where a second AI perspective adds value. Skip when Codex CLI is not installed.
+- **Codex Review (enabled by default when Codex CLI is installed):** Independent review + adversarial challenge from OpenAI Codex CLI. Shows pass/fail gate. Runs automatically when enabled — configure with \\\`gstack-config set codex_reviews enabled|disabled\\\`.

 **Verdict logic:**
 - **CLEARED**: Eng Review has >= 1 entry within 7 days with status "clean" (or \\\`skip_eng_review\\\` is \\\`true\\\`)
@@ -1145,6 +1276,291 @@ DATE=$(date +%Y-%m-%d)
 \`\`\``;
 }

+/**
+ * Build the "Spec Review Loop" section: instructions for dispatching a
+ * reviewer subagent over a freshly written document, fixing and re-dispatching
+ * (3 iterations max, with a convergence guard), and appending review metrics
+ * to `~/.gstack/analytics/spec-review.jsonl`.
+ *
+ * @param _ctx - Template context. Despite the underscore prefix it IS read:
+ *   `skillName` is embedded in the metrics JSON line.
+ * @returns The markdown section.
+ */
+function generateSpecReviewLoop(_ctx: TemplateContext): string {
+  return `## Spec Review Loop
+
+Before presenting the document to the user for approval, run an adversarial review.
+
+**Step 1: Dispatch reviewer subagent**
+
+Use the Agent tool to dispatch an independent reviewer. The reviewer has fresh context
+and cannot see the brainstorming conversation — only the document. This ensures genuine
+adversarial independence.
+
+Prompt the subagent with:
+- The file path of the document just written
+- "Read this document and review it on 5 dimensions. For each dimension, note PASS or
+  list specific issues with suggested fixes. At the end, output a quality score (1-10)
+  across all dimensions."
+
+**Dimensions:**
+1. **Completeness** — Are all requirements addressed? Missing edge cases?
+2. **Consistency** — Do parts of the document agree with each other? Contradictions?
+3. **Clarity** — Could an engineer implement this without asking questions? Ambiguous language?
+4. **Scope** — Does the document creep beyond the original problem? YAGNI violations?
+5. **Feasibility** — Can this actually be built with the stated approach? Hidden complexity?
+
+The subagent should return:
+- A quality score (1-10)
+- PASS if no issues, or a numbered list of issues with dimension, description, and fix
+
+**Step 2: Fix and re-dispatch**
+
+If the reviewer returns issues:
+1. Fix each issue in the document on disk (use Edit tool)
+2. Re-dispatch the reviewer subagent with the updated document
+3. Maximum 3 iterations total
+
+**Convergence guard:** If the reviewer returns the same issues on consecutive iterations
+(the fix didn't resolve them or the reviewer disagrees with the fix), stop the loop
+and persist those issues as "Reviewer Concerns" in the document rather than looping
+further.
+
+If the subagent fails, times out, or is unavailable — skip the review loop entirely.
+Tell the user: "Spec review unavailable — presenting unreviewed doc." The document is
+already written to disk; the review is a quality bonus, not a gate.
+
+**Step 3: Report and persist metrics**
+
+After the loop completes (PASS, max iterations, or convergence guard):
+
+1. Tell the user the result — summary by default:
+   "Your doc survived N rounds of adversarial review. M issues caught and fixed.
+   Quality score: X/10."
+   If they ask "what did the reviewer find?", show the full reviewer output.
+
+2. If issues remain after max iterations or convergence, add a "## Reviewer Concerns"
+   section to the document listing each unresolved issue. Downstream skills will see this.
+
+3. Append metrics:
+\`\`\`bash
+mkdir -p ~/.gstack/analytics
+echo '{"skill":"${_ctx.skillName}","ts":"'$(date -u +%Y-%m-%dT%H:%M:%SZ)'","iterations":ITERATIONS,"issues_found":FOUND,"issues_fixed":FIXED,"remaining":REMAINING,"quality_score":SCORE}' >> ~/.gstack/analytics/spec-review.jsonl 2>/dev/null || true
+\`\`\`
+Replace ITERATIONS, FOUND, FIXED, REMAINING, SCORE with actual values from the review.`;
+}
+
+/**
+ * Build the "Prerequisite Skill Offer" section for skills that declare
+ * prerequisite skills via `benefitsFrom`.
+ *
+ * @param ctx - Template context; only `benefitsFrom` is read.
+ * @returns The markdown section, or an empty string when no prerequisite
+ *   skills are declared. All listed skills are named in the prompt text; the
+ *   first one is the skill the user is offered to run.
+ */
+function generateBenefitsFrom(ctx: TemplateContext): string {
+  const prerequisites = ctx.benefitsFrom;
+  if (!prerequisites || prerequisites.length === 0) return '';
+
+  const primary = prerequisites[0];
+  const alternatives = prerequisites.map(skill => `\`/${skill}\``).join(' or ');
+
+  return `## Prerequisite Skill Offer
+
+When the design doc check above prints "No design doc found," offer the prerequisite
+skill before proceeding.
+
+Say to the user via AskUserQuestion:
+
+> "No design doc found for this branch. ${alternatives} produces a structured problem
+> statement, premise challenge, and explored alternatives — it gives this review much
+> sharper input to work with. Takes about 10 minutes. The design doc is per-feature,
+> not per-product — it captures the thinking behind this specific change."
+
+Options:
+- A) Run /${primary} first (in another window, then come back)
+- B) Skip — proceed with standard review
+
+If they skip: "No worries — standard review. If you ever want sharper input, try
+/${primary} first next time." Then proceed normally. Do not re-offer later in the session.`;
+}
+
+/**
+ * Build the "Visual Sketch (UI ideas only)" section: instructions for
+ * generating a rough HTML wireframe, rendering it with the browse binary
+ * (`$B`), iterating with the user, and referencing the screenshot in the
+ * design doc.
+ *
+ * @param _ctx - Template context (not read by this resolver).
+ * @returns The markdown section; the text is the same for every skill and host.
+ */
+function generateDesignSketch(_ctx: TemplateContext): string {
+  return `## Visual Sketch (UI ideas only)
+
+If the chosen approach involves user-facing UI (screens, pages, forms, dashboards,
+or interactive elements), generate a rough wireframe to help the user visualize it.
+If the idea is backend-only, infrastructure, or has no UI component — skip this
+section silently.
+
+**Step 1: Gather design context**
+
+1. Check if \`DESIGN.md\` exists in the repo root. If it does, read it for design
+   system constraints (colors, typography, spacing, component patterns). Use these
+   constraints in the wireframe.
+2. Apply core design principles:
+   - **Information hierarchy** — what does the user see first, second, third?
+   - **Interaction states** — loading, empty, error, success, partial
+   - **Edge case paranoia** — what if the name is 47 chars? Zero results? Network fails?
+   - **Subtraction default** — "as little design as possible" (Rams). Every element earns its pixels.
+   - **Design for trust** — every interface element builds or erodes user trust.
+
+**Step 2: Generate wireframe HTML**
+
+Generate a single-page HTML file with these constraints:
+- **Intentionally rough aesthetic** — use system fonts, thin gray borders, no color,
+  hand-drawn-style elements. This is a sketch, not a polished mockup.
+- Self-contained — no external dependencies, no CDN links, inline CSS only
+- Show the core interaction flow (1-3 screens/states max)
+- Include realistic placeholder content (not "Lorem ipsum" — use content that
+  matches the actual use case)
+- Add HTML comments explaining design decisions
+
+Write to a temp file:
+\`\`\`bash
+SKETCH_FILE="/tmp/gstack-sketch-$(date +%s).html"
+\`\`\`
+
+**Step 3: Render and capture**
+
+\`\`\`bash
+$B goto "file://$SKETCH_FILE"
+$B screenshot /tmp/gstack-sketch.png
+\`\`\`
+
+If \`$B\` is not available (browse binary not set up), skip the render step. Tell the
+user: "Visual sketch requires the browse binary. Run the setup script to enable it."
+
+**Step 4: Present and iterate**
+
+Show the screenshot to the user. Ask: "Does this feel right? Want to iterate on the layout?"
+
+If they want changes, regenerate the HTML with their feedback and re-render.
+If they approve or say "good enough," proceed.
+
+**Step 5: Include in design doc**
+
+Reference the wireframe screenshot in the design doc's "Recommended Approach" section.
+The screenshot file at \`/tmp/gstack-sketch.png\` can be referenced by downstream skills
+(\`/plan-design-review\`, \`/design-review\`) to see what was originally envisioned.`;
+}
+
+/**
+ * Build the "Codex review" step: availability and preference check, one-time
+ * adoption prompt, the code-review run with its pass/fail gate, error
+ * handling, review-log persistence, the adversarial challenge, and cleanup.
+ *
+ * Variations by skill:
+ * - `ship` is numbered step 3.8 and tells the agent to re-run tests after fixes;
+ * - every other skill is numbered 5.7 and additionally gets the
+ *   "Cross-model analysis" block.
+ *
+ * @param ctx - Template context; `host` and `skillName` are read.
+ * @returns The markdown step, or an empty string for the `codex` host.
+ */
+function generateCodexReviewStep(ctx: TemplateContext): string {
+  // Codex host: strip entirely — Codex should never invoke itself
+  if (ctx.host === 'codex') return '';
+
+  const isShip = ctx.skillName === 'ship';
+  const stepNum = isShip ? '3.8' : '5.7';
+
+  // NOTE(review): the helper paths below are hard-coded to ~/.claude/skills/gstack/bin
+  // rather than taken from ctx.paths.binDir. With the early return above only the
+  // claude host reaches this template, so the output matches today; revisit if
+  // another non-codex host is added.
+  return `## Step ${stepNum}: Codex review
+
+Check if the Codex CLI is available and read the user's Codex review preference:
+
+\`\`\`bash
+which codex 2>/dev/null && echo "CODEX_AVAILABLE" || echo "CODEX_NOT_AVAILABLE"
+CODEX_REVIEWS_CFG=$(~/.claude/skills/gstack/bin/gstack-config get codex_reviews 2>/dev/null || true)
+echo "CODEX_REVIEWS: \${CODEX_REVIEWS_CFG:-not_set}"
+\`\`\`
+
+If \`CODEX_NOT_AVAILABLE\`: skip this step silently. Continue to the next step.
+
+If \`CODEX_REVIEWS\` is \`disabled\`: skip this step silently. Continue to the next step.
+
+If \`CODEX_REVIEWS\` is \`enabled\`: run both code review and adversarial challenge automatically (no prompt). Jump to the "Run Codex" section below.
+
+If \`CODEX_REVIEWS\` is \`not_set\`: use AskUserQuestion to offer the one-time adoption prompt:
+
+\`\`\`
+GStack recommends enabling Codex code reviews — Codex is the super smart quiet engineer friend who will save your butt.
+
+A) Enable for all future runs (recommended, default)
+B) Try it for now, ask me again later
+C) No thanks, don't ask me again
+\`\`\`
+
+If the user chooses A: persist the setting and run both:
+\`\`\`bash
+~/.claude/skills/gstack/bin/gstack-config set codex_reviews enabled
+\`\`\`
+
+If the user chooses B: run both this time but do not persist any setting.
+
+If the user chooses C: persist the opt-out and skip:
+\`\`\`bash
+~/.claude/skills/gstack/bin/gstack-config set codex_reviews disabled
+\`\`\`
+Then skip this step. Continue to the next step.
+
+### Run Codex
+
+Always run **both** code review and adversarial challenge. Use a 5-minute timeout (\`timeout: 300000\`) on each Bash call.
+
+First, create a temp file for stderr capture:
+\`\`\`bash
+TMPERR=$(mktemp /tmp/codex-review-XXXXXXXX)
+\`\`\`
+
+**Code review:** Run:
+\`\`\`bash
+codex review --base <base> -c 'model_reasoning_effort="xhigh"' --enable web_search_cached 2>"$TMPERR"
+\`\`\`
+
+After the command completes, read stderr for cost/error info:
+\`\`\`bash
+cat "$TMPERR"
+\`\`\`
+
+Present the full output verbatim under a \`CODEX SAYS (code review):\` header:
+
+\`\`\`
+CODEX SAYS (code review):
+════════════════════════════════════════════════════════════
+<full codex output, verbatim — do not truncate or summarize>
+════════════════════════════════════════════════════════════
+GATE: PASS                    Tokens: N | Est. cost: ~$X.XX
+\`\`\`
+
+Check the output for \`[P1]\` markers. If found: \`GATE: FAIL\`. If no \`[P1]\`: \`GATE: PASS\`.
+
+**If GATE is FAIL:** use AskUserQuestion:
+
+\`\`\`
+Codex found N critical issues in the diff.
+
+A) Investigate and fix now (recommended)
+B) Ship anyway — these issues may cause production problems
+\`\`\`
+
+If the user chooses A: read the Codex findings carefully and work to address them${isShip ? '. After fixing, re-run tests (Step 3) since code has changed' : ''}. Then re-run \`codex review\` to verify the gate is now PASS.
+
+If the user chooses B: continue to the next step.
+
+### Error handling (code review)
+
+Before persisting the gate result, check for errors. All errors are non-blocking — Codex is a quality enhancement, not a prerequisite. Check \`$TMPERR\` output (already read above) for error indicators:
+
+- **Auth failure:** If stderr contains "auth", "login", "unauthorized", or "API key", tell the user: "Codex authentication failed. Run \\\`codex login\\\` in your terminal to authenticate via ChatGPT." Do NOT persist a review log entry. Continue to the adversarial step (it will likely fail too, but try anyway).
+- **Timeout:** If the Bash call times out (5 min), tell the user: "Codex timed out after 5 minutes. The diff may be too large or the API may be slow." Do NOT persist a review log entry. Skip to cleanup.
+- **Empty response:** If codex returned no stdout output, tell the user: "Codex returned no response. Stderr: <paste relevant error>." Do NOT persist a review log entry. Skip to cleanup.
+
+**Only if codex produced a real review (non-empty stdout):** Persist the code review result:
+\`\`\`bash
+eval $(~/.claude/skills/gstack/bin/gstack-slug 2>/dev/null)
+mkdir -p $PROJECTS_DIR/$SLUG/reviews
+echo '{"skill":"codex-review","timestamp":"'"$(date -u +%Y-%m-%dT%H:%M:%SZ)"'","status":"STATUS","gate":"GATE","commit":"'"$(git rev-parse --short HEAD)"'"}' >> $PROJECTS_DIR/$SLUG/reviews/$BRANCH.jsonl
+\`\`\`
+
+Substitute: STATUS ("clean" if PASS, "issues_found" if FAIL), GATE ("pass" or "fail").
+
+**Adversarial challenge:** Run:
+\`\`\`bash
+TMPERR_ADV=$(mktemp /tmp/codex-adv-XXXXXXXX)
+codex exec "Review the changes on this branch against the base branch. Run git diff origin/<base> to see the diff. Your job is to find ways this code will fail in production. Think like an attacker and a chaos engineer. Find edge cases, race conditions, security holes, resource leaks, failure modes, and silent data corruption paths. Be adversarial. Be thorough. No compliments — just the problems." -s read-only -c 'model_reasoning_effort="xhigh"' --enable web_search_cached 2>"$TMPERR_ADV"
+\`\`\`
+
+After the command completes, read adversarial stderr:
+\`\`\`bash
+cat "$TMPERR_ADV"
+\`\`\`
+
+Present the full output verbatim under a \`CODEX SAYS (adversarial challenge):\` header. This is informational — it never blocks shipping. If the adversarial command timed out or returned no output, note this to the user and continue.
+${!isShip ? `
+**Cross-model analysis:** After both Codex outputs are presented, compare Codex's findings with your own review findings from the earlier review steps and output:
+
+\`\`\`
+CROSS-MODEL ANALYSIS:
+  Both found: [findings that overlap between Claude and Codex]
+  Only Codex found: [findings unique to Codex]
+  Only Claude found: [findings unique to Claude's review]
+  Agreement rate: X% (N/M total unique findings overlap)
+\`\`\`
+` : ''}
+**Cleanup:** Run \`rm -f "$TMPERR" "$TMPERR_ADV"\` after processing.
+
+---`;
+}
+
 const RESOLVERS: Record<string, (ctx: TemplateContext) => string> = {
  COMMAND_REFERENCE: generateCommandReference,
  SNAPSHOT_FLAGS: generateSnapshotFlags,
@@ -1157,21 +1573,140 @@ const RESOLVERS: Record<string, (ctx: TemplateContext) => string> = {
  REVIEW_DASHBOARD: generateReviewDashboard,
  TEST_BOOTSTRAP: generateTestBootstrap,
  ARTIFACT_SETUP: generateArtifactSetup,
+  SPEC_REVIEW_LOOP: generateSpecReviewLoop,
+  DESIGN_SKETCH: generateDesignSketch,
+  BENEFITS_FROM: generateBenefitsFrom,
+  CODEX_REVIEW_STEP: generateCodexReviewStep,
 };

+// ─── Codex Helpers ───────────────────────────────────────────
+
+/**
+ * Derive the Codex skill name for a skill directory.
+ *
+ * @param skillDir - Skill directory relative to the repo root; `''` or `'.'`
+ *   denotes the root skill.
+ * @returns `'gstack'` for the root skill; the directory name unchanged when it
+ *   already starts with `gstack-`; otherwise the name prefixed with `gstack-`.
+ */
+function codexSkillName(skillDir: string): string {
+  const isRootSkill = skillDir === '' || skillDir === '.';
+  if (isRootSkill) return 'gstack';
+  // Already-prefixed names stay as they are, so gstack-upgrade never becomes gstack-gstack-upgrade.
+  return skillDir.startsWith('gstack-') ? skillDir : `gstack-${skillDir}`;
+}
+
+/**
+ * Transform frontmatter for Codex: keep only name + description.
+ * Strips allowed-tools, hooks, version, and all other fields.
+ *
+ * The description may be written inline (`description: text`) or as a YAML
+ * block scalar. Any block scalar header is accepted — `|`, `>`, and their
+ * chomping/indentation variants such as `|-`, `|+`, `>-` — so the header
+ * itself is never mistaken for the description text. Block text is kept
+ * line-for-line (folded `>` scalars are not re-folded) and always re-emitted
+ * as a `|` block scalar.
+ *
+ * @param content - Full generated skill document, frontmatter first.
+ * @param host - Target host; `'claude'` returns `content` unchanged.
+ * @returns `content` with its frontmatter reduced to `name` and `description`.
+ *   Returned unchanged when the document does not start with a `---` fence or
+ *   the closing fence is missing.
+ */
+function transformFrontmatter(content: string, host: Host): string {
+  if (host === 'claude') return content;
+
+  // Find frontmatter boundaries
+  const fmStart = content.indexOf('---\n');
+  if (fmStart !== 0) return content; // frontmatter must be at the start
+  const fmEnd = content.indexOf('\n---', fmStart + 4);
+  if (fmEnd === -1) return content;
+
+  const frontmatter = content.slice(fmStart + 4, fmEnd);
+  const body = content.slice(fmEnd + 4); // includes the leading \n after ---
+
+  // Parse name
+  const nameMatch = frontmatter.match(/^name:\s*(.+)$/m);
+  const name = nameMatch ? nameMatch[1].trim() : '';
+
+  // "description:" alone, or followed only by a block scalar header
+  // (| or >, optionally with chomping "+"/"-" and/or an indentation digit).
+  const blockScalarStart = /^description:\s*(?:[|>][-+0-9]{0,2})?\s*$/;
+
+  // Parse description — handle both simple and block scalar formats
+  let description = '';
+  const lines = frontmatter.split('\n');
+  let inDescription = false;
+  const descLines: string[] = [];
+  for (const line of lines) {
+    if (blockScalarStart.test(line)) {
+      // Block scalar start: the text is on the indented lines that follow
+      inDescription = true;
+      continue;
+    }
+    if (/^description:\s*\S/.test(line)) {
+      // Simple inline: "description: some text"
+      description = line.replace(/^description:\s*/, '').trim();
+      break;
+    }
+    if (inDescription) {
+      // Block scalar continuation — indented lines (2 spaces) or blank lines
+      if (line === '' || /^\s/.test(line)) {
+        descLines.push(line.replace(/^  /, ''));
+      } else {
+        // End of block scalar — hit a non-indented, non-blank line
+        break;
+      }
+    }
+  }
+  if (descLines.length > 0) {
+    description = descLines.join('\n').trim();
+  }
+
+  // Re-emit Codex frontmatter (name + description only)
+  const indentedDesc = description.split('\n').map(l => `  ${l}`).join('\n');
+  const codexFm = `---\nname: ${name}\ndescription: |\n${indentedDesc}\n---`;
+  return codexFm + body;
+}
+
+/**
+ * Summarise a template's hooks as an inline safety advisory.
+ *
+ * Looks for a line starting with `hooks:` and collects the distinct
+ * double-quoted `matcher: "<Tool>"` names in first-seen order. Each tool maps
+ * to a canned description (Bash, Edit, Write) or a generic fallback.
+ *
+ * @param tmplContent - Raw template text to scan.
+ * @returns A markdown blockquote advisory, or `null` when the template has no
+ *   `hooks:` line or no recognisable matchers.
+ */
+function extractHookSafetyProse(tmplContent: string): string | null {
+  const hasHooks = /^hooks:/m.test(tmplContent);
+  if (!hasHooks) return null;
+
+  // Distinct hooked tool names, in order of first appearance
+  const hookedTools = [
+    ...new Set(Array.from(tmplContent.matchAll(/matcher:\s*"(\w+)"/g), hit => hit[1])),
+  ];
+  if (hookedTools.length === 0) return null;
+
+  // Known tools get a specific description; anything else gets a generic one
+  const toolDescriptions: Record<string, string> = {
+    Bash: 'check bash commands for destructive operations (rm -rf, DROP TABLE, force-push, git reset --hard, etc.) before execution',
+    Edit: 'verify file edits are within the allowed scope boundary before applying',
+    Write: 'verify file writes are within the allowed scope boundary before applying',
+  };
+  const clauses = hookedTools.map(
+    tool => toolDescriptions[tool] || `check ${tool} operations for safety`,
+  );
+  const safetyChecks = clauses.join(', and ');
+
+  return `> **Safety Advisory:** This skill includes safety checks that ${safetyChecks}. When using this skill, always pause and verify before executing potentially destructive operations. If uncertain about a command's safety, ask the user for confirmation before proceeding.`;
+}
+
 // ─── Template Processing ────────────────────────────────────

 const GENERATED_HEADER = `<!-- AUTO-GENERATED from {{SOURCE}} — do not edit directly -->\n<!-- Regenerate: bun run gen:skill-docs -->\n`;

-function processTemplate(tmplPath: string): { outputPath: string; content: string } {
+function processTemplate(tmplPath: string, host: Host = 'claude'): { outputPath: string; content: string } {
  const tmplContent = fs.readFileSync(tmplPath, 'utf-8');
  const relTmplPath = path.relative(ROOT, tmplPath);
-  const outputPath = tmplPath.replace(/\.tmpl$/, '');
+  let outputPath = tmplPath.replace(/\.tmpl$/, '');
+
+  // Determine skill directory relative to ROOT
+  const skillDir = path.relative(ROOT, path.dirname(tmplPath));
+
+  // For codex host, route output to .agents/skills/{codexSkillName}/SKILL.md
+  if (host === 'codex') {
+    const codexName = codexSkillName(skillDir === '.' ? '' : skillDir);
+    const outputDir = path.join(ROOT, '.agents', 'skills', codexName);
+    fs.mkdirSync(outputDir, { recursive: true });
+    outputPath = path.join(outputDir, 'SKILL.md');
+  }

  // Extract skill name from frontmatter for TemplateContext
  const nameMatch = tmplContent.match(/^name:\s*(.+)$/m);
  const skillName = nameMatch ? nameMatch[1].trim() : path.basename(path.dirname(tmplPath));
-  const ctx: TemplateContext = { skillName, tmplPath };
+
+  // Extract benefits-from list from frontmatter (inline YAML: benefits-from: [a, b])
+  const benefitsMatch = tmplContent.match(/^benefits-from:\s*\[([^\]]*)\]/m);
+  const benefitsFrom = benefitsMatch
+    ? benefitsMatch[1].split(',').map(s => s.trim()).filter(Boolean)
+    : undefined;
+
+  const ctx: TemplateContext = { skillName, tmplPath, benefitsFrom, host, paths: HOST_PATHS[host] };

  // Replace placeholders
  let content = tmplContent.replace(/\{\{(\w+)\}\}/g, (match, name) => {
@@ -1186,6 +1721,27 @@ function processTemplate(tmplPath: string): { outputPath: string; content: strin
    throw new Error(`Unresolved placeholders in ${relTmplPath}: ${remaining.join(', ')}`);
  }

+  // For codex host: transform frontmatter and replace Claude-specific paths
+  if (host === 'codex') {
+    // Extract hook safety prose from the raw template (the frontmatter transform below strips hooks)
+    const safetyProse = extractHookSafetyProse(tmplContent);
+
+    // Transform frontmatter: keep only name + description
+    content = transformFrontmatter(content, host);
+
+    // Insert safety advisory at the top of the body (after frontmatter)
+    if (safetyProse) {
+      const bodyStart = content.indexOf('\n---') + 4;
+      content = content.slice(0, bodyStart) + '\n' + safetyProse + '\n' + content.slice(bodyStart);
+    }
+
+    // Replace remaining hardcoded Claude paths with host-appropriate paths
+    content = content.replace(/~\/\.claude\/skills\/gstack/g, ctx.paths.skillRoot);
+    content = content.replace(/\.claude\/skills\/gstack/g, ctx.paths.localSkillRoot);
+    content = content.replace(/\.claude\/skills\/review/g, '.agents/skills/gstack/review');
+    content = content.replace(/\.claude\/skills/g, '.agents/skills');
+  }
+
  // Prepend generated header (after frontmatter)
  const header = GENERATED_HEADER.replace('{{SOURCE}}', path.basename(tmplPath));
  const fmEnd = content.indexOf('---', content.indexOf('---') + 3);
@@ -1203,32 +1759,13 @@ function processTemplate(tmplPath: string): { outputPath: string; content: strin

 function findTemplates(): string[] {
  const templates: string[] = [];
-  const candidates = [
-    path.join(ROOT, 'SKILL.md.tmpl'),
-    path.join(ROOT, 'browse', 'SKILL.md.tmpl'),
-    path.join(ROOT, 'qa', 'SKILL.md.tmpl'),
-    path.join(ROOT, 'qa-only', 'SKILL.md.tmpl'),
-    path.join(ROOT, 'setup-browser-cookies', 'SKILL.md.tmpl'),
-    path.join(ROOT, 'ship', 'SKILL.md.tmpl'),
-    path.join(ROOT, 'review', 'SKILL.md.tmpl'),
-    path.join(ROOT, 'plan-ceo-review', 'SKILL.md.tmpl'),
-    path.join(ROOT, 'plan-eng-review', 'SKILL.md.tmpl'),
-    path.join(ROOT, 'retro', 'SKILL.md.tmpl'),
-    path.join(ROOT, 'office-hours', 'SKILL.md.tmpl'),
-    path.join(ROOT, 'investigate', 'SKILL.md.tmpl'),
-    path.join(ROOT, 'gstack-upgrade', 'SKILL.md.tmpl'),
-    path.join(ROOT, 'plan-design-review', 'SKILL.md.tmpl'),
-    path.join(ROOT, 'design-review', 'SKILL.md.tmpl'),
-    path.join(ROOT, 'design-consultation', 'SKILL.md.tmpl'),
-    path.join(ROOT, 'document-release', 'SKILL.md.tmpl'),
-    path.join(ROOT, 'codex', 'SKILL.md.tmpl'),
-    path.join(ROOT, 'careful', 'SKILL.md.tmpl'),
-    path.join(ROOT, 'freeze', 'SKILL.md.tmpl'),
-    path.join(ROOT, 'guard', 'SKILL.md.tmpl'),
-    path.join(ROOT, 'unfreeze', 'SKILL.md.tmpl'),
-  ];
-  for (const p of candidates) {
-    if (fs.existsSync(p)) templates.push(p);
+  const rootTmpl = path.join(ROOT, 'SKILL.md.tmpl');
+  if (fs.existsSync(rootTmpl)) templates.push(rootTmpl);
+
+  for (const entry of fs.readdirSync(ROOT, { withFileTypes: true })) {
+    if (!entry.isDirectory() || entry.name.startsWith('.') || entry.name === 'node_modules') continue;
+    const tmpl = path.join(ROOT, entry.name, 'SKILL.md.tmpl');
+    if (fs.existsSync(tmpl)) templates.push(tmpl);
  }
  return templates;
 }
@@ -1236,7 +1773,13 @@ function findTemplates(): string[] {
 let hasChanges = false;

 for (const tmplPath of findTemplates()) {
-  const { outputPath, content } = processTemplate(tmplPath);
+  // Skip /codex skill for codex host (self-referential — it's a Claude wrapper around codex exec)
+  if (HOST === 'codex') {
+    const dir = path.basename(path.dirname(tmplPath));
+    if (dir === 'codex') continue;
+  }
+
+  const { outputPath, content } = processTemplate(tmplPath, HOST);
  const relOutput = path.relative(ROOT, outputPath);

  if (DRY_RUN) {
@@ -96,21 +96,67 @@ for (const file of SKILL_FILES) {
  }
 }

+// ─── Codex Skills ───────────────────────────────────────────
+
+const AGENTS_DIR = path.join(ROOT, '.agents', 'skills');
+if (fs.existsSync(AGENTS_DIR)) {
+  console.log('\n  Codex Skills (.agents/skills/):');
+  const codexDirs = fs.readdirSync(AGENTS_DIR, { withFileTypes: true }).filter(e => !e.isFile()).map(e => e.name).sort(); // ignore stray plain files (e.g. .DS_Store)
+  let codexCount = 0;
+  let codexMissing = 0;
+  for (const dir of codexDirs) {
+    const skillMd = path.join(AGENTS_DIR, dir, 'SKILL.md');
+    if (fs.existsSync(skillMd)) {
+      codexCount++;
+      const content = fs.readFileSync(skillMd, 'utf-8');
+      // Quick validation: generated Codex output must not reference Claude's .claude/skills paths
+      const hasClaude = content.includes('.claude/skills');
+      if (hasClaude) {
+        hasErrors = true;
+        console.log(`  \u274c ${dir.padEnd(30)} — contains .claude/skills reference`);
+      } else {
+        console.log(`  \u2705 ${dir.padEnd(30)} — OK`);
+      }
+    } else {
+      codexMissing++;
+      hasErrors = true;
+      console.log(`  \u274c ${dir.padEnd(30)} — SKILL.md missing`);
+    }
+  }
+  console.log(`  Total: ${codexCount} skills, ${codexMissing} missing`);
+} else {
+  console.log('\n  Codex Skills: .agents/skills/ not found (run: bun run gen:skill-docs --host codex)');
+}
+
 // ─── Freshness ──────────────────────────────────────────────

-console.log('\n  Freshness:');
+console.log('\n  Freshness (Claude):');
 try {
  execSync('bun run scripts/gen-skill-docs.ts --dry-run', { cwd: ROOT, stdio: 'pipe' });
-  console.log('  \u2705 All generated files are fresh');
+  console.log('  \u2705 All Claude generated files are fresh');
 } catch (err: any) {
  hasErrors = true;
  const output = err.stdout?.toString() || '';
-  console.log('  \u274c Generated files are stale:');
+  console.log('  \u274c Claude generated files are stale:');
  for (const line of output.split('\n').filter((l: string) => l.startsWith('STALE'))) {
    console.log(`      ${line}`);
  }
  console.log('      Run: bun run gen:skill-docs');
 }

+console.log('\n  Freshness (Codex):');
+try {
+  execSync('bun run scripts/gen-skill-docs.ts --host codex --dry-run', { cwd: ROOT, stdio: 'pipe' });
+  console.log('  \u2705 All Codex generated files are fresh');
+} catch (err: any) {
+  hasErrors = true;
+  const output = err.stdout?.toString() || '';
+  console.log('  \u274c Codex generated files are stale:');
+  for (const line of output.split('\n').filter((l: string) => l.startsWith('STALE'))) {
+    console.log(`      ${line}`);
+  }
+  console.log('      Run: bun run gen:skill-docs --host codex');
+}
+
 console.log('');
 process.exit(hasErrors ? 1 : 0);