From 248cb7da6d2d1b9e36dfa0338e3d5853d5a11ab0 Mon Sep 17 00:00:00 2001 From: Garry Tan Date: Sun, 22 Mar 2026 19:18:00 -0700 Subject: [PATCH] feat(gen-skill-docs): add design outside voices + hard rules resolvers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add generateDesignOutsideVoices() — parallel Codex + Claude subagent dispatch for cross-model design critique with litmus scorecard synthesis. Branches per skillName (plan-design-review, design-review, design-consultation) with task-specific reasoning effort (high for analytical, medium for creative). Add generateDesignHardRules() — OpenAI Frontend Skill hard rules + gstack AI slop blacklist unified into one shared block with classifier step (landing page vs app UI vs hybrid). Extract AI_SLOP_BLACKLIST constant from inline prose in generateDesignMethodology() for DRY. Extend generateDesignReviewLite() with lightweight Codex block. Extend generateDesignSketch() with outside voices opt-in after wireframe. Source: OpenAI "Designing Delightful Frontends with GPT-5.4" (Mar 2026) Co-Authored-By: Claude Opus 4.6 (1M context) --- scripts/gen-skill-docs.ts | 362 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 347 insertions(+), 15 deletions(-) diff --git a/scripts/gen-skill-docs.ts b/scripts/gen-skill-docs.ts index 6d4e67bc..1376866b 100644 --- a/scripts/gen-skill-docs.ts +++ b/scripts/gen-skill-docs.ts @@ -60,6 +60,44 @@ interface TemplateContext { paths: HostPaths; } +// ─── Shared Design Constants ──────────────────────────────── + +/** gstack's 10 AI slop anti-patterns — shared between DESIGN_METHODOLOGY and DESIGN_HARD_RULES */ +const AI_SLOP_BLACKLIST = [ + 'Purple/violet/indigo gradient backgrounds or blue-to-purple color schemes', + '**The 3-column feature grid:** icon-in-colored-circle + bold title + 2-line description, repeated 3x symmetrically. THE most recognizable AI layout.', + 'Icons in colored circles as section decoration (SaaS starter template look)', + 'Centered everything (`text-align: center` on all headings, descriptions, cards)', + 'Uniform bubbly border-radius on every element (same large radius on everything)', + 'Decorative blobs, floating circles, wavy SVG dividers (if a section feels empty, it needs better content, not decoration)', + 'Emoji as design elements (rockets in headings, emoji as bullet points)', + 'Colored left-border on cards (`border-left: 3px solid `)', + 'Generic hero copy ("Welcome to [X]", "Unlock the power of...", "Your all-in-one solution for...")', + 'Cookie-cutter section rhythm (hero → 3 features → testimonials → pricing → CTA, every section same height)', +]; + +/** OpenAI hard rejection criteria (from "Designing Delightful Frontends with GPT-5.4", Mar 2026) */ +const OPENAI_HARD_REJECTIONS = [ + 'Generic SaaS card grid as first impression', + 'Beautiful image with weak brand', + 'Strong headline with no clear action', + 'Busy imagery behind text', + 'Sections repeating same mood statement', + 'Carousel with no narrative purpose', + 'App UI made of stacked cards instead of layout', +]; + +/** OpenAI litmus checks — 7 yes/no tests for cross-model consensus scoring */ +const OPENAI_LITMUS_CHECKS = [ + 'Brand/product unmistakable in first screen?', + 'One strong visual anchor present?', + 'Page understandable by scanning headlines only?', + 'Each section has one job?', + 'Are cards actually necessary?', + 'Does motion improve hierarchy or atmosphere?', + 'Would design feel premium with all decorative shadows removed?', +]; + // ─── Placeholder Resolvers ────────────────────────────────── function generateCommandReference(_ctx: TemplateContext): string { @@ -867,13 +905,40 @@ Minimum 0 per category. 12. **Never refuse to use the browser.** When the user invokes /qa or /qa-only, they are requesting browser-based testing. Never suggest evals, unit tests, or other alternatives as a substitute. Even if the diff appears to have no UI changes, backend changes affect app behavior — always open the browser and test.`; } -function generateDesignReviewLite(_ctx: TemplateContext): string { +function generateDesignReviewLite(ctx: TemplateContext): string { + const litmusList = OPENAI_LITMUS_CHECKS.map((item, i) => `${i + 1}. ${item}`).join(' '); + const rejectionList = OPENAI_HARD_REJECTIONS.map((item, i) => `${i + 1}. ${item}`).join(' '); + // Codex block only for Claude host + const codexBlock = ctx.host === 'codex' ? '' : ` + +7. **Codex design voice** (optional, automatic if available): + +\`\`\`bash +which codex 2>/dev/null && echo "CODEX_AVAILABLE" || echo "CODEX_NOT_AVAILABLE" +\`\`\` + +If Codex is available, run a lightweight design check on the diff: + +\`\`\`bash +TMPERR_DRL=$(mktemp /tmp/codex-drl-XXXXXXXX) +codex exec "Review the git diff on this branch. Run 7 litmus checks (YES/NO each): ${litmusList} Flag any hard rejections: ${rejectionList} 5 most important design findings only. Reference file:line." -s read-only -c 'model_reasoning_effort="high"' --enable web_search_cached 2>"$TMPERR_DRL" +\`\`\` + +Use a 5-minute timeout (\`timeout: 300000\`). After the command completes, read stderr: +\`\`\`bash +cat "$TMPERR_DRL" && rm -f "$TMPERR_DRL" +\`\`\` + +**Error handling:** All errors are non-blocking. On auth failure, timeout, or empty response — skip with a brief note and continue. + +Present Codex output under a \`CODEX (design):\` header, merged with the checklist findings above.`; + return `## Design Review (conditional, diff-scoped) Check if the diff touches frontend files using \`gstack-diff-scope\`: \`\`\`bash -source <(~/.claude/skills/gstack/bin/gstack-diff-scope 2>/dev/null) +source <(${ctx.paths.binDir}/gstack-diff-scope 2>/dev/null) \`\`\` **If \`SCOPE_FRONTEND=false\`:** Skip design review silently. No output. @@ -896,10 +961,10 @@ source <(~/.claude/skills/gstack/bin/gstack-diff-scope 2>/dev/null) 6. **Log the result** for the Review Readiness Dashboard: \`\`\`bash -~/.claude/skills/gstack/bin/gstack-review-log '{"skill":"design-review-lite","timestamp":"TIMESTAMP","status":"STATUS","findings":N,"auto_fixed":M,"commit":"COMMIT"}' +${ctx.paths.binDir}/gstack-review-log '{"skill":"design-review-lite","timestamp":"TIMESTAMP","status":"STATUS","findings":N,"auto_fixed":M,"commit":"COMMIT"}' \`\`\` -Substitute: TIMESTAMP = ISO 8601 datetime, STATUS = "clean" if 0 findings or "issues_found", N = total findings, M = auto-fixed count, COMMIT = output of \`git rev-parse --short HEAD\`.`; +Substitute: TIMESTAMP = ISO 8601 datetime, STATUS = "clean" if 0 findings or "issues_found", N = total findings, M = auto-fixed count, COMMIT = output of \`git rev-parse --short HEAD\`.${codexBlock}`; } // NOTE: design-checklist.md is a subset of this methodology for code-level detection. @@ -1096,16 +1161,7 @@ Apply these at each page. Each finding gets an impact rating (high/medium/polish The test: would a human designer at a respected studio ever ship this? -- Purple/violet/indigo gradient backgrounds or blue-to-purple color schemes -- **The 3-column feature grid:** icon-in-colored-circle + bold title + 2-line description, repeated 3x symmetrically. THE most recognizable AI layout. -- Icons in colored circles as section decoration (SaaS starter template look) -- Centered everything (\`text-align: center\` on all headings, descriptions, cards) -- Uniform bubbly border-radius on every element (same large radius on everything) -- Decorative blobs, floating circles, wavy SVG dividers (if a section feels empty, it needs better content, not decoration) -- Emoji as design elements (rockets in headings, emoji as bullet points) -- Colored left-border on cards (\`border-left: 3px solid \`) -- Generic hero copy ("Welcome to [X]", "Unlock the power of...", "Your all-in-one solution for...") -- Cookie-cutter section rhythm (hero → 3 features → testimonials → pricing → CTA, every section same height) +${AI_SLOP_BLACKLIST.map(item => `- ${item}`).join('\n')} **10. Performance as Design** (6 items) - LCP < 2.0s (web apps), < 1.5s (informational sites) @@ -2020,7 +2076,36 @@ If they approve or say "good enough," proceed. Reference the wireframe screenshot in the design doc's "Recommended Approach" section. The screenshot file at \`/tmp/gstack-sketch.png\` can be referenced by downstream skills -(\`/plan-design-review\`, \`/design-review\`) to see what was originally envisioned.`; +(\`/plan-design-review\`, \`/design-review\`) to see what was originally envisioned. + +**Step 6: Outside design voices** (optional) + +After the wireframe is approved, offer outside design perspectives: + +\`\`\`bash +which codex 2>/dev/null && echo "CODEX_AVAILABLE" || echo "CODEX_NOT_AVAILABLE" +\`\`\` + +If Codex is available, use AskUserQuestion: +> "Want outside design perspectives on the chosen approach? Codex proposes a visual thesis, content plan, and interaction ideas. A Claude subagent proposes an alternative aesthetic direction." +> +> A) Yes — get outside design voices +> B) No — proceed without + +If user chooses A, launch both voices simultaneously: + +1. **Codex** (via Bash, \`model_reasoning_effort="medium"\`): +\`\`\`bash +TMPERR_SKETCH=$(mktemp /tmp/codex-sketch-XXXXXXXX) +codex exec "For this product approach, provide: a visual thesis (one sentence — mood, material, energy), a content plan (hero → support → detail → CTA), and 2 interaction ideas that change page feel. Apply beautiful defaults: composition-first, brand-first, cardless, poster not document. Be opinionated." -s read-only -c 'model_reasoning_effort="medium"' --enable web_search_cached 2>"$TMPERR_SKETCH" +\`\`\` +Use a 5-minute timeout (\`timeout: 300000\`). After completion: \`cat "$TMPERR_SKETCH" && rm -f "$TMPERR_SKETCH"\` + +2. **Claude subagent** (via Agent tool): +"For this product approach, what design direction would you recommend? What aesthetic, typography, and interaction patterns fit? What would make this approach feel inevitable to the user? Be specific — font names, hex colors, spacing values." + +Present Codex output under \`CODEX SAYS (design sketch):\` and subagent output under \`CLAUDE SUBAGENT (design direction):\`. +Error handling: all non-blocking. On failure, skip and continue.`; } function generateAdversarialStep(ctx: TemplateContext): string { @@ -2203,6 +2288,251 @@ in the decision tree below. If you want to persist deploy settings for future runs, suggest the user run \`/setup-deploy\`.`; } +// ─── Design Outside Voices (parallel Codex + Claude subagent) ─────── + +function generateDesignOutsideVoices(ctx: TemplateContext): string { + // Codex host: strip entirely — Codex should never invoke itself + if (ctx.host === 'codex') return ''; + + const rejectionList = OPENAI_HARD_REJECTIONS.map((item, i) => `${i + 1}. ${item}`).join('\n'); + const litmusList = OPENAI_LITMUS_CHECKS.map((item, i) => `${i + 1}. ${item}`).join('\n'); + + // Skill-specific configuration + const isPlanDesignReview = ctx.skillName === 'plan-design-review'; + const isDesignReview = ctx.skillName === 'design-review'; + const isDesignConsultation = ctx.skillName === 'design-consultation'; + + // Determine opt-in behavior and reasoning effort + const isAutomatic = isDesignReview; // design-review runs automatically + const reasoningEffort = isDesignConsultation ? 'medium' : 'high'; // creative vs analytical + + // Build skill-specific Codex prompt + let codexPrompt: string; + let subagentPrompt: string; + + if (isPlanDesignReview) { + codexPrompt = `Read the plan file at [plan-file-path]. Evaluate this plan's UI/UX design against these criteria. + +HARD REJECTION — flag if ANY apply: +${rejectionList} + +LITMUS CHECKS — answer YES or NO for each: +${litmusList} + +HARD RULES — first classify as MARKETING/LANDING PAGE vs APP UI vs HYBRID, then flag violations of the matching rule set: +- MARKETING: First viewport as one composition, brand-first hierarchy, full-bleed hero, 2-3 intentional motions, composition-first layout +- APP UI: Calm surface hierarchy, dense but readable, utility language, minimal chrome +- UNIVERSAL: CSS variables for colors, no default font stacks, one job per section, cards earn existence + +For each finding: what's wrong, what will happen if it ships unresolved, and the specific fix. Be opinionated. No hedging.`; + + subagentPrompt = `Read the plan file at [plan-file-path]. You are an independent senior product designer reviewing this plan. You have NOT seen any prior review. Evaluate: + +1. Information hierarchy: what does the user see first, second, third? Is it right? +2. Missing states: loading, empty, error, success, partial — which are unspecified? +3. User journey: what's the emotional arc? Where does it break? +4. Specificity: does the plan describe SPECIFIC UI ("48px Söhne Bold header, #1a1a1a on white") or generic patterns ("clean modern card-based layout")? +5. What design decisions will haunt the implementer if left ambiguous? + +For each finding: what's wrong, severity (critical/high/medium), and the fix.`; + } else if (isDesignReview) { + codexPrompt = `Review the frontend source code in this repo. Evaluate against these design hard rules: +- Spacing: systematic (design tokens / CSS variables) or magic numbers? +- Typography: expressive purposeful fonts or default stacks? +- Color: CSS variables with defined system, or hardcoded hex scattered? +- Responsive: breakpoints defined? calc(100svh - header) for heroes? Mobile tested? +- A11y: ARIA landmarks, alt text, contrast ratios, 44px touch targets? +- Motion: 2-3 intentional animations, or zero / ornamental only? +- Cards: used only when card IS the interaction? No decorative card grids? + +First classify as MARKETING/LANDING PAGE vs APP UI vs HYBRID, then apply matching rules. + +LITMUS CHECKS — answer YES/NO: +${litmusList} + +HARD REJECTION — flag if ANY apply: +${rejectionList} + +Be specific. Reference file:line for every finding.`; + + subagentPrompt = `Review the frontend source code in this repo. You are an independent senior product designer doing a source-code design audit. Focus on CONSISTENCY PATTERNS across files rather than individual violations: +- Are spacing values systematic across the codebase? +- Is there ONE color system or scattered approaches? +- Do responsive breakpoints follow a consistent set? +- Is the accessibility approach consistent or spotty? + +For each finding: what's wrong, severity (critical/high/medium), and the file:line.`; + } else if (isDesignConsultation) { + codexPrompt = `Given this product context, propose a complete design direction: +- Visual thesis: one sentence describing mood, material, and energy +- Typography: specific font names (not defaults — no Inter/Roboto/Arial/system) + hex colors +- Color system: CSS variables for background, surface, primary text, muted text, accent +- Layout: composition-first, not component-first. First viewport as poster, not document +- Differentiation: 2 deliberate departures from category norms +- Anti-slop: no purple gradients, no 3-column icon grids, no centered everything, no decorative blobs + +Be opinionated. Be specific. Do not hedge. This is YOUR design direction — own it.`; + + subagentPrompt = `Given this product context, propose a design direction that would SURPRISE. What would the cool indie studio do that the enterprise UI team wouldn't? +- Propose an aesthetic direction, typography stack (specific font names), color palette (hex values) +- 2 deliberate departures from category norms +- What emotional reaction should the user have in the first 3 seconds? + +Be bold. Be specific. No hedging.`; + } else { + // Unknown skill — return empty + return ''; + } + + // Build the opt-in section + const optInSection = isAutomatic ? ` +**Automatic:** Outside voices run automatically when Codex is available. No opt-in needed.` : ` +Use AskUserQuestion: +> "Want outside design voices${isPlanDesignReview ? ' before the detailed review' : ''}? Codex evaluates against OpenAI's design hard rules + litmus checks; Claude subagent does an independent ${isDesignConsultation ? 'design direction proposal' : 'completeness review'}." +> +> A) Yes — run outside design voices +> B) No — proceed without + +If user chooses B, skip this step and continue.`; + + // Build the synthesis section + const synthesisSection = isPlanDesignReview ? ` +**Synthesis — Litmus scorecard:** + +\`\`\` +DESIGN OUTSIDE VOICES — LITMUS SCORECARD: +═══════════════════════════════════════════════════════════════ + Check Claude Codex Consensus + ─────────────────────────────────────── ─────── ─────── ───────── + 1. Brand unmistakable in first screen? — — — + 2. One strong visual anchor? — — — + 3. Scannable by headlines only? — — — + 4. Each section has one job? — — — + 5. Cards actually necessary? — — — + 6. Motion improves hierarchy? — — — + 7. Premium without decorative shadows? — — — + ─────────────────────────────────────── ─────── ─────── ───────── + Hard rejections triggered: — — — +═══════════════════════════════════════════════════════════════ +\`\`\` + +Fill in each cell from the Codex and subagent outputs. CONFIRMED = both agree. DISAGREE = models differ. NOT SPEC'D = not enough info to evaluate. + +**Pass integration (respects existing 7-pass contract):** +- Hard rejections → raised as the FIRST items in Pass 1, tagged \`[HARD REJECTION]\` +- Litmus DISAGREE items → raised in the relevant pass with both perspectives +- Litmus CONFIRMED failures → pre-loaded as known issues in the relevant pass +- Passes can skip discovery and go straight to fixing for pre-identified issues` : + isDesignConsultation ? ` +**Synthesis:** Claude main references both Codex and subagent proposals in the Phase 3 proposal. Present: +- Areas of agreement between all three voices (Claude main + Codex + subagent) +- Genuine divergences as creative alternatives for the user to choose from +- "Codex and I agree on X. Codex suggested Y where I'm proposing Z — here's why..."` : ` +**Synthesis — Litmus scorecard:** + +Use the same scorecard format as /plan-design-review (shown above). Fill in from both outputs. +Merge findings into the triage with \`[codex]\` / \`[subagent]\` / \`[cross-model]\` tags.`; + + const escapedCodexPrompt = codexPrompt.replace(/`/g, '\\`').replace(/\$/g, '\\$'); + + return `## Design Outside Voices (parallel) +${optInSection} + +**Check Codex availability:** +\`\`\`bash +which codex 2>/dev/null && echo "CODEX_AVAILABLE" || echo "CODEX_NOT_AVAILABLE" +\`\`\` + +**If Codex is available**, launch both voices simultaneously: + +1. **Codex design voice** (via Bash): +\`\`\`bash +TMPERR_DESIGN=$(mktemp /tmp/codex-design-XXXXXXXX) +codex exec "${escapedCodexPrompt}" -s read-only -c 'model_reasoning_effort="${reasoningEffort}"' --enable web_search_cached 2>"$TMPERR_DESIGN" +\`\`\` +Use a 5-minute timeout (\`timeout: 300000\`). After the command completes, read stderr: +\`\`\`bash +cat "$TMPERR_DESIGN" && rm -f "$TMPERR_DESIGN" +\`\`\` + +2. **Claude design subagent** (via Agent tool): +Dispatch a subagent with this prompt: +"${subagentPrompt}" + +**Error handling (all non-blocking):** +- **Auth failure:** If stderr contains "auth", "login", "unauthorized", or "API key": "Codex authentication failed. Run \`codex login\` to authenticate." +- **Timeout:** "Codex timed out after 5 minutes." +- **Empty response:** "Codex returned no response." +- On any Codex error: proceed with Claude subagent output only, tagged \`[single-model]\`. +- If Claude subagent also fails: "Outside voices unavailable — continuing with primary review." + +Present Codex output under a \`CODEX SAYS (design ${isPlanDesignReview ? 'critique' : isDesignReview ? 'source audit' : 'direction'}):\` header. +Present subagent output under a \`CLAUDE SUBAGENT (design ${isPlanDesignReview ? 'completeness' : isDesignReview ? 'consistency' : 'direction'}):\` header. +${synthesisSection} + +**Log the result:** +\`\`\`bash +${ctx.paths.binDir}/gstack-review-log '{"skill":"design-outside-voices","timestamp":"'"$(date -u +%Y-%m-%dT%H:%M:%SZ)"'","status":"STATUS","source":"SOURCE","commit":"'"$(git rev-parse --short HEAD)"'"}' +\`\`\` +Replace STATUS with "clean" or "issues_found", SOURCE with "codex+subagent", "codex-only", "subagent-only", or "unavailable".`; +} + +// ─── Design Hard Rules (OpenAI framework + gstack slop blacklist) ─── + +function generateDesignHardRules(_ctx: TemplateContext): string { + const slopItems = AI_SLOP_BLACKLIST.map((item, i) => `${i + 1}. ${item}`).join('\n'); + const rejectionItems = OPENAI_HARD_REJECTIONS.map((item, i) => `${i + 1}. ${item}`).join('\n'); + const litmusItems = OPENAI_LITMUS_CHECKS.map((item, i) => `${i + 1}. ${item}`).join('\n'); + + return `### Design Hard Rules + +**Classifier — determine rule set before evaluating:** +- **MARKETING/LANDING PAGE** (hero-driven, brand-forward, conversion-focused) → apply Landing Page Rules +- **APP UI** (workspace-driven, data-dense, task-focused: dashboards, admin, settings) → apply App UI Rules +- **HYBRID** (marketing shell with app-like sections) → apply Landing Page Rules to hero/marketing sections, App UI Rules to functional sections + +**Hard rejection criteria** (instant-fail patterns — flag if ANY apply): +${rejectionItems} + +**Litmus checks** (answer YES/NO for each — used for cross-model consensus scoring): +${litmusItems} + +**Landing page rules** (apply when classifier = MARKETING/LANDING): +- First viewport reads as one composition, not a dashboard +- Brand-first hierarchy: brand > headline > body > CTA +- Typography: expressive, purposeful — no default stacks (Inter, Roboto, Arial, system) +- No flat single-color backgrounds — use gradients, images, subtle patterns +- Hero: full-bleed, edge-to-edge, no inset/tiled/rounded variants +- Hero budget: brand, one headline, one supporting sentence, one CTA group, one image +- No cards in hero. Cards only when card IS the interaction +- One job per section: one purpose, one headline, one short supporting sentence +- Motion: 2-3 intentional motions minimum (entrance, scroll-linked, hover/reveal) +- Color: define CSS variables, avoid purple-on-white defaults, one accent color default +- Copy: product language not design commentary. "If deleting 30% improves it, keep deleting" +- Beautiful defaults: composition-first, brand as loudest text, two typefaces max, cardless by default, first viewport as poster not document + +**App UI rules** (apply when classifier = APP UI): +- Calm surface hierarchy, strong typography, few colors +- Dense but readable, minimal chrome +- Organize: primary workspace, navigation, secondary context, one accent +- Avoid: dashboard-card mosaics, thick borders, decorative gradients, ornamental icons +- Copy: utility language — orientation, status, action. Not mood/brand/aspiration +- Cards only when card IS the interaction +- Section headings state what area is or what user can do ("Selected KPIs", "Plan status") + +**Universal rules** (apply to ALL types): +- Define CSS variables for color system +- No default font stacks (Inter, Roboto, Arial, system) +- One job per section +- "If deleting 30% of the copy improves it, keep deleting" +- Cards earn their existence — no decorative card grids + +**AI Slop blacklist** (the 10 patterns that scream "AI-generated"): +${slopItems} + +Source: [OpenAI "Designing Delightful Frontends with GPT-5.4"](https://developers.openai.com/blog/designing-delightful-frontends-with-gpt-5-4) (Mar 2026) + gstack design methodology.`; +} + const RESOLVERS: Record string> = { COMMAND_REFERENCE: generateCommandReference, SNAPSHOT_FLAGS: generateSnapshotFlags, @@ -2211,6 +2541,8 @@ const RESOLVERS: Record string> = { BASE_BRANCH_DETECT: generateBaseBranchDetect, QA_METHODOLOGY: generateQAMethodology, DESIGN_METHODOLOGY: generateDesignMethodology, + DESIGN_HARD_RULES: generateDesignHardRules, + DESIGN_OUTSIDE_VOICES: generateDesignOutsideVoices, DESIGN_REVIEW_LITE: generateDesignReviewLite, REVIEW_DASHBOARD: generateReviewDashboard, PLAN_FILE_REVIEW_REPORT: generatePlanFileReviewReport,