diff --git a/office-hours/SKILL.md.tmpl b/office-hours/SKILL.md.tmpl index 33d673c1..03368faf 100644 --- a/office-hours/SKILL.md.tmpl +++ b/office-hours/SKILL.md.tmpl @@ -348,6 +348,10 @@ Use AskUserQuestion to confirm. If the user disagrees with a premise, revise und --- +{{CODEX_SECOND_OPINION}} + +--- + ## Phase 4: Alternatives Generation (MANDATORY) Produce 2-3 distinct implementation approaches. This is NOT optional. @@ -374,6 +378,7 @@ Rules: - One must be the **"minimal viable"** (fewest files, smallest diff, ships fastest). - One must be the **"ideal architecture"** (best long-term trajectory, most elegant). - One can be **creative/lateral** (unexpected approach, different framing of the problem). +- If Codex proposed a prototype in Phase 3.5, consider using it as a starting point for the creative/lateral approach. **RECOMMENDATION:** Choose [X] because [one-line reason]. @@ -397,6 +402,7 @@ Track which of these signals appeared during the session: - Has **domain expertise** — knows this space from the inside - Showed **taste** — cared about getting the details right - Showed **agency** — actually building, not just planning +- **Defended premise with reasoning** against cross-model challenge (kept original premise when Codex disagreed AND articulated specific reasoning for why — dismissal without reasoning does not count) Count the signals. You'll use this count in Phase 6 to determine which tier of closing message to use. @@ -450,6 +456,9 @@ Supersedes: {prior filename — omit this line if first design on this branch} ## Premises {from Phase 3} +## Cross-Model Perspective +{If Codex ran in Phase 3.5: Codex's independent cold read — steelman, key insight, challenged premise, prototype suggestion. Verbatim or close paraphrase of what Codex said. 
If Codex did NOT run (skipped or unavailable): omit this section entirely — do not include it.} + ## Approaches Considered ### Approach A: {name} {from Phase 4} @@ -499,6 +508,9 @@ Supersedes: {prior filename — omit this line if first design on this branch} ## Premises {from Phase 3} +## Cross-Model Perspective +{If Codex ran in Phase 3.5: Codex's independent cold read — coolest version, key insight, existing tools, prototype suggestion. Verbatim or close paraphrase of what Codex said. If Codex did NOT run (skipped or unavailable): omit this section entirely — do not include it.} + ## Approaches Considered ### Approach A: {name} {from Phase 4} diff --git a/scripts/gen-skill-docs.ts b/scripts/gen-skill-docs.ts index 6d4e67bc..fbc31500 100644 --- a/scripts/gen-skill-docs.ts +++ b/scripts/gen-skill-docs.ts @@ -2023,6 +2023,93 @@ The screenshot file at \`/tmp/gstack-sketch.png\` can be referenced by downstrea (\`/plan-design-review\`, \`/design-review\`) to see what was originally envisioned.`; } +function generateCodexSecondOpinion(ctx: TemplateContext): string { + // Codex host: strip entirely — Codex should never invoke itself + if (ctx.host === 'codex') return ''; + + return `## Phase 3.5: Cross-Model Second Opinion (optional) + +**Binary check first — no question if unavailable:** + +\`\`\`bash +which codex 2>/dev/null && echo "CODEX_AVAILABLE" || echo "CODEX_NOT_AVAILABLE" +\`\`\` + +If \`CODEX_NOT_AVAILABLE\`: skip Phase 3.5 entirely — no message, no AskUserQuestion. Proceed directly to Phase 4. + +If \`CODEX_AVAILABLE\`: use AskUserQuestion: + +> Want a second opinion from a different AI model? Codex will independently review your problem statement, key answers, premises, and any landscape findings from this session. It hasn't seen this conversation — it gets a structured summary. Usually takes 2-5 minutes. +> A) Yes, get a second opinion +> B) No, proceed to alternatives + +If B: skip Phase 3.5 entirely. 
Remember that Codex did NOT run (affects design doc, founder signals, and Phase 4 below). + +**If A: Run the Codex cold read.** + +1. Assemble a structured context block from Phases 1-3: + - Mode (Startup or Builder) + - Problem statement (from Phase 1) + - Key answers from Phase 2A/2B (summarize each Q&A in 1-2 sentences, include verbatim user quotes) + - Landscape findings (from Phase 2.75, if search was run) + - Agreed premises (from Phase 3) + - Codebase context (project name, languages, recent activity) + +2. **Write the assembled prompt to a temp file** (prevents shell injection from user-derived content): + +\`\`\`bash +CODEX_PROMPT_FILE=$(mktemp /tmp/gstack-codex-oh-XXXXXXXX.txt) +\`\`\` + +Write the full prompt (context block + instructions) to this file. Use the mode-appropriate variant: + +**Startup mode instructions:** "You are an independent technical advisor reading a transcript of a startup brainstorming session. [CONTEXT BLOCK HERE]. Your job: 1) What is the STRONGEST version of what this person is trying to build? Steelman it in 2-3 sentences. 2) What is the ONE thing from their answers that reveals the most about what they should actually build? Quote it and explain why. 3) Name ONE agreed premise you think is wrong, and what evidence would prove you right. 4) If you had 48 hours and one engineer to build a prototype, what would you build? Be specific — tech stack, features, what you'd skip. Be direct. Be terse. No preamble." + +**Builder mode instructions:** "You are an independent technical advisor reading a transcript of a builder brainstorming session. [CONTEXT BLOCK HERE]. Your job: 1) What is the COOLEST version of this they haven't considered? 2) What's the ONE thing from their answers that reveals what excites them most? Quote it. 3) What existing open source project or tool gets them 50% of the way there — and what's the 50% they'd need to build? 4) If you had a weekend to build this, what would you build first? Be specific. Be direct. 
No preamble." + +3. Run Codex: + +\`\`\`bash +TMPERR_OH=$(mktemp /tmp/codex-oh-err-XXXXXXXX) +codex exec "$(cat "$CODEX_PROMPT_FILE")" -s read-only -c 'model_reasoning_effort="xhigh"' --enable web_search_cached 2>"$TMPERR_OH" +\`\`\` + +Use a 5-minute timeout (\`timeout: 300000\`). After the command completes, read stderr: +\`\`\`bash +cat "$TMPERR_OH" +rm -f "$TMPERR_OH" "$CODEX_PROMPT_FILE" +\`\`\` + +**Error handling:** All errors are non-blocking — Codex second opinion is a quality enhancement, not a prerequisite. +- **Auth failure:** If stderr contains "auth", "login", "unauthorized", or "API key": "Codex authentication failed. Run \\\`codex login\\\` to authenticate. Skipping second opinion." +- **Timeout:** "Codex timed out after 5 minutes. Skipping second opinion." +- **Empty response:** "Codex returned no response. Stderr: <paste relevant stderr>. Skipping second opinion." + +On any error, proceed to Phase 4 — do NOT fall back to a Claude subagent (this is brainstorming, not adversarial review). + +4. **Presentation:** + +\`\`\` +SECOND OPINION (Codex): +════════════════════════════════════════════════════════════ +<full Codex output, verbatim — do not truncate or summarize> +════════════════════════════════════════════════════════════ +\`\`\` + +5. **Cross-model synthesis:** After presenting Codex output, provide 3-5 bullet synthesis: + - Where Claude agrees with Codex + - Where Claude disagrees and why + - Whether Codex's challenged premise changes Claude's recommendation + +6. **Premise revision check:** If Codex challenged an agreed premise, use AskUserQuestion: + +> Codex challenged premise #{N}: "{premise text}". Their argument: "{reasoning}". +> A) Revise this premise based on Codex's input +> B) Keep the original premise — proceed to alternatives + +If A: revise the premise and note the revision. 
If B: proceed (and note that the user defended this premise with reasoning — this is a founder signal if they articulate WHY they disagree, not just dismiss).`; +} + function generateAdversarialStep(ctx: TemplateContext): string { // Codex host: strip entirely — Codex should never invoke itself if (ctx.host === 'codex') return ''; @@ -2222,6 +2309,7 @@ const RESOLVERS: Record<string, (ctx: TemplateContext) => string> = { SPEC_REVIEW_LOOP: generateSpecReviewLoop, DESIGN_SKETCH: generateDesignSketch, BENEFITS_FROM: generateBenefitsFrom, + CODEX_SECOND_OPINION: generateCodexSecondOpinion, CODEX_REVIEW_STEP: generateAdversarialStep, ADVERSARIAL_STEP: generateAdversarialStep, DEPLOY_BOOTSTRAP: generateDeployBootstrap, diff --git a/test/gen-skill-docs.test.ts b/test/gen-skill-docs.test.ts index 496e7963..6bc6ceb3 100644 --- a/test/gen-skill-docs.test.ts +++ b/test/gen-skill-docs.test.ts @@ -664,6 +664,48 @@ describe('DESIGN_SKETCH resolver', () => { }); }); +// --- {{CODEX_SECOND_OPINION}} resolver tests --- + +describe('CODEX_SECOND_OPINION resolver', () => { + const content = fs.readFileSync(path.join(ROOT, 'office-hours', 'SKILL.md'), 'utf-8'); + const codexContent = fs.readFileSync(path.join(ROOT, '.agents', 'skills', 'gstack-office-hours', 'SKILL.md'), 'utf-8'); + + test('Phase 3.5 section appears in office-hours SKILL.md', () => { + expect(content).toContain('Phase 3.5: Cross-Model Second Opinion'); + }); + + test('contains codex exec invocation', () => { + expect(content).toContain('codex exec'); + }); + + test('contains opt-in AskUserQuestion text', () => { + expect(content).toContain('second opinion from a different AI model'); + }); + + test('contains cross-model synthesis instructions', () => { + expect(content).toMatch(/[Ss]ynthesis/); + expect(content).toContain('Where Claude agrees with Codex'); + }); + + test('contains premise revision check', () => { + expect(content).toContain('Codex challenged premise'); + }); + + test('contains error handling for auth, timeout, and empty', () => { + 
expect(content).toMatch(/[Aa]uth.*fail/); + expect(content).toMatch(/[Tt]imeout/); + expect(content).toMatch(/[Ee]mpty response/); + }); + + test('Codex host variant does NOT contain the Phase 3.5 resolver output', () => { + // The resolver returns '' for codex host, so the interactive section is stripped. + // Static template references to "Phase 3.5" in prose/conditionals are fine. + expect(codexContent).not.toContain('Phase 3.5: Cross-Model Second Opinion'); + expect(codexContent).not.toContain('CODEX_NOT_AVAILABLE'); + expect(codexContent).not.toContain('TMPERR_OH'); + }); +}); + // --- {{BENEFITS_FROM}} resolver tests --- describe('BENEFITS_FROM resolver', () => {