From 1052711ca6b595d63f963d3a3310d3fe354113e0 Mon Sep 17 00:00:00 2001
From: Garry Tan <garrytan@gmail.com>
Date: Mon, 16 Mar 2026 10:59:01 -0500
Subject: [PATCH] feat: add {{DESIGN_METHODOLOGY}} resolver and register design
 review skills

Add generateDesignMethodology() to gen-skill-docs.ts with 10-category, 80-item
design audit checklist. Register plan-design-review and qa-design-review templates
in findTemplates(). Add both skills to skill-check.ts SKILL_FILES. Add command
and snapshot flag validation tests for both skills in skill-validation.test.ts.
---
 scripts/gen-skill-docs.ts     | 336 ++++++++++++++++++++++++++++++++++
 scripts/skill-check.ts        |   2 +
 test/skill-validation.test.ts |  32 ++++
 3 files changed, 370 insertions(+)

diff --git a/scripts/gen-skill-docs.ts b/scripts/gen-skill-docs.ts
index bafed642..f2b3acc5 100644
--- a/scripts/gen-skill-docs.ts
+++ b/scripts/gen-skill-docs.ts
@@ -450,12 +450,346 @@ Minimum 0 per category.
 10. **Use \`snapshot -C\` for tricky UIs.** Finds clickable divs that the accessibility tree misses.`;
 }
 
+function generateDesignMethodology(): string {
+  return `## Modes
+
+### Full (default)
+Systematic review of all pages reachable from homepage. Visit 5-8 pages. Full checklist evaluation, responsive screenshots, interaction flow testing. Produces complete design audit report with letter grades.
+
+### Quick (\`--quick\`)
+Homepage + 2 key pages only. First Impression + Design System Extraction + abbreviated checklist. Fastest path to a design score.
+
+### Deep (\`--deep\`)
+Comprehensive review: 10-15 pages, every interaction flow, exhaustive checklist. For pre-launch audits or major redesigns.
+
+### Diff-aware (automatic when on a feature branch with no URL)
+When on a feature branch, scope to pages affected by the branch changes:
+1. Analyze the branch diff: \`git diff main...HEAD --name-only\`
+2. Map changed files to affected pages/routes
+3. Detect running app on common local ports (3000, 4000, 8080)
+4. Audit only affected pages, compare design quality before/after
+
+### Regression (\`--regression\` or previous \`design-baseline.json\` found)
+Run full audit, then load previous \`design-baseline.json\`. Compare: per-category grade deltas, new findings, resolved findings. Output regression table in report.
+
+---
+
+## Phase 1: First Impression
+
+The most uniquely designer-like output. Form a gut reaction before analyzing anything.
+
+1. Navigate to the target URL
+2. Take a full-page desktop screenshot: \`$B screenshot "$REPORT_DIR/screenshots/first-impression.png"\`
+3. Write the **First Impression** using this structured critique format:
+   - "The site communicates **[what]**." (what it says at a glance — competence? playfulness? confusion?)
+   - "I notice **[observation]**." (what stands out, positive or negative — be specific)
+   - "The first 3 things my eye goes to are: **[1]**, **[2]**, **[3]**." (hierarchy check — are these intentional?)
+   - "If I had to describe this in one word: **[word]**." (gut verdict)
+
+This is the section users read first. Be opinionated. A designer doesn't hedge — they react.
+
+---
+
+## Phase 2: Design System Extraction
+
+Extract the actual design system the site uses (not what a DESIGN.md says, but what's rendered):
+
+\`\`\`bash
+# Fonts in use (capped at 500 elements to avoid timeout)
+$B js "JSON.stringify([...new Set([...document.querySelectorAll('*')].slice(0,500).map(e => getComputedStyle(e).fontFamily))])"
+
+# Color palette in use
+$B js "JSON.stringify([...new Set([...document.querySelectorAll('*')].slice(0,500).flatMap(e => [getComputedStyle(e).color, getComputedStyle(e).backgroundColor]).filter(c => c !== 'rgba(0, 0, 0, 0)'))])"
+
+# Heading hierarchy
+$B js "JSON.stringify([...document.querySelectorAll('h1,h2,h3,h4,h5,h6')].map(h => ({tag:h.tagName, text:h.textContent.trim().slice(0,50), size:getComputedStyle(h).fontSize, weight:getComputedStyle(h).fontWeight})))"
+
+# Touch target audit (find undersized interactive elements)
+$B js "JSON.stringify([...document.querySelectorAll('a,button,input,[role=button]')].filter(e => {const r=e.getBoundingClientRect(); return r.width>0 && (r.width<44||r.height<44)}).map(e => ({tag:e.tagName, text:(e.textContent||'').trim().slice(0,30), w:Math.round(e.getBoundingClientRect().width), h:Math.round(e.getBoundingClientRect().height)})).slice(0,20))"
+
+# Performance baseline
+$B perf
+\`\`\`
+
+Structure findings as an **Inferred Design System**:
+- **Fonts:** list with usage counts. Flag if >3 distinct font families.
+- **Colors:** palette extracted. Flag if >12 unique non-gray colors. Note warm/cool/mixed.
+- **Heading Scale:** h1-h6 sizes. Flag skipped levels, non-systematic size jumps.
+- **Spacing Patterns:** sample padding/margin values. Flag non-scale values.
+
+After extraction, offer: *"Want me to save this as your DESIGN.md? I can lock in these observations as your project's design system baseline."*
+
+---
+
+## Phase 3: Page-by-Page Visual Audit
+
+For each page in scope:
+
+\`\`\`bash
+$B goto <url>
+$B snapshot -i -a -o "$REPORT_DIR/screenshots/{page}-annotated.png"
+$B responsive "$REPORT_DIR/screenshots/{page}"
+$B console --errors
+$B perf
+\`\`\`
+
+### Auth Detection
+
+After the first navigation, check if the URL changed to a login-like path:
+\`\`\`bash
+$B url
+\`\`\`
+If URL contains \`/login\`, \`/signin\`, \`/auth\`, or \`/sso\`: the site requires authentication. AskUserQuestion: "This site requires authentication. Want to import cookies from your browser? Run \`/setup-browser-cookies\` first if needed."
+
+### Design Audit Checklist (10 categories, ~80 items)
+
+Apply these at each page. Each finding gets an impact rating (high/medium/polish) and category.
+
+**1. Visual Hierarchy & Composition** (8 items)
+- Clear focal point? One primary CTA per view?
+- Eye flows naturally top-left to bottom-right?
+- Visual noise — competing elements fighting for attention?
+- Information density appropriate for content type?
+- Z-index clarity — nothing unexpectedly overlapping?
+- Above-the-fold content communicates purpose in 3 seconds?
+- Squint test: hierarchy still visible when blurred?
+- White space is intentional, not leftover?
+
+**2. Typography** (15 items)
+- Font count <=3 (flag if more)
+- Scale follows ratio (1.25 major third or 1.333 perfect fourth)
+- Line-height: 1.5x body, 1.15-1.25x headings
+- Measure: 45-75 chars per line (66 ideal)
+- Heading hierarchy: no skipped levels (h1→h3 without h2)
+- Weight contrast: >=2 weights used for hierarchy
+- No blacklisted fonts (Papyrus, Comic Sans, Lobster, Impact, Jokerman)
+- If primary font is Inter/Roboto/Open Sans/Poppins → flag as potentially generic
+- \`text-wrap: balance\` or \`text-pretty\` on headings (check via \`$B css <heading> text-wrap\`)
+- Curly quotes used, not straight quotes
+- Ellipsis character (\`…\`) not three dots (\`...\`)
+- \`font-variant-numeric: tabular-nums\` on number columns
+- Body text >= 16px
+- Caption/label >= 12px
+- No letterspacing on lowercase text
+
+**3. Color & Contrast** (10 items)
+- Palette coherent (<=12 unique non-gray colors)
+- WCAG AA: body text 4.5:1, large text (18px+) 3:1, UI components 3:1
+- Semantic colors consistent (success=green, error=red, warning=yellow/amber)
+- No color-only encoding (always add labels, icons, or patterns)
+- Dark mode: surfaces use elevation, not just lightness inversion
+- Dark mode: text off-white (~#E0E0E0), not pure white
+- Primary accent desaturated 10-20% in dark mode
+- \`color-scheme: dark\` on html element (if dark mode present)
+- No red/green only combinations (8% of men have red-green deficiency)
+- Neutral palette is warm or cool consistently — not mixed
+
+**4. Spacing & Layout** (12 items)
+- Grid consistent at all breakpoints
+- Spacing uses a scale (4px or 8px base), not arbitrary values
+- Alignment is consistent — nothing floats outside the grid
+- Rhythm: related items closer together, distinct sections further apart
+- Border-radius hierarchy (not uniform bubbly radius on everything)
+- Inner radius = outer radius - gap (nested elements)
+- No horizontal scroll on mobile
+- Max content width set (no full-bleed body text)
+- \`env(safe-area-inset-*)\` for notch devices
+- URL reflects state (filters, tabs, pagination in query params)
+- Flex/grid used for layout (not JS measurement)
+- Breakpoints: mobile (375), tablet (768), desktop (1024), wide (1440)
+
+**5. Interaction States** (10 items)
+- Hover state on all interactive elements
+- \`focus-visible\` ring present (never \`outline: none\` without replacement)
+- Active/pressed state with depth effect or color shift
+- Disabled state: reduced opacity + \`cursor: not-allowed\`
+- Loading: skeleton shapes match real content layout
+- Empty states: warm message + primary action + visual (not just "No items.")
+- Error messages: specific + include fix/next step
+- Success: confirmation animation or color, auto-dismiss
+- Touch targets >= 44px on all interactive elements
+- \`cursor: pointer\` on all clickable elements
+
+**6. Responsive Design** (8 items)
+- Mobile layout makes *design* sense (not just stacked desktop columns)
+- Touch targets sufficient on mobile (>= 44px)
+- No horizontal scroll on any viewport
+- Images handle responsive (srcset, sizes, or CSS containment)
+- Text readable without zooming on mobile (>= 16px body)
+- Navigation collapses appropriately (hamburger, bottom nav, etc.)
+- Forms usable on mobile (correct input types, no autoFocus on mobile)
+- No \`user-scalable=no\` or \`maximum-scale=1\` in viewport meta
+
+**7. Motion & Animation** (6 items)
+- Easing: ease-out for entering, ease-in for exiting, ease-in-out for moving
+- Duration: 50-700ms range (nothing slower unless page transition)
+- Purpose: every animation communicates something (state change, attention, spatial relationship)
+- \`prefers-reduced-motion\` respected (check: \`$B js "matchMedia('(prefers-reduced-motion: reduce)').matches"\`)
+- No \`transition: all\` — properties listed explicitly
+- Only \`transform\` and \`opacity\` animated (not layout properties like width, height, top, left)
+
+**8. Content & Microcopy** (8 items)
+- Empty states designed with warmth (message + action + illustration/icon)
+- Error messages specific: what happened + why + what to do next
+- Button labels specific ("Save API Key" not "Continue" or "Submit")
+- No placeholder/lorem ipsum text visible in production
+- Truncation handled (\`text-overflow: ellipsis\`, \`line-clamp\`, or \`break-words\`)
+- Active voice ("Install the CLI" not "The CLI will be installed")
+- Loading states end with \`…\` ("Saving…" not "Saving...")
+- Destructive actions have confirmation modal or undo window
+
+**9. AI Slop Detection** (10 anti-patterns — the blacklist)
+
+The test: would a human designer at a respected studio ever ship this?
+
+- Purple/violet/indigo gradient backgrounds or blue-to-purple color schemes
+- **The 3-column feature grid:** icon-in-colored-circle + bold title + 2-line description, repeated 3x symmetrically. THE most recognizable AI layout.
+- Icons in colored circles as section decoration (SaaS starter template look)
+- Centered everything (\`text-align: center\` on all headings, descriptions, cards)
+- Uniform bubbly border-radius on every element (same large radius on everything)
+- Decorative blobs, floating circles, wavy SVG dividers (if a section feels empty, it needs better content, not decoration)
+- Emoji as design elements (rockets in headings, emoji as bullet points)
+- Colored left-border on cards (\`border-left: 3px solid <accent>\`)
+- Generic hero copy ("Welcome to [X]", "Unlock the power of...", "Your all-in-one solution for...")
+- Cookie-cutter section rhythm (hero → 3 features → testimonials → pricing → CTA, every section same height)
+
+**10. Performance as Design** (6 items)
+- LCP < 2.0s (web apps), < 1.5s (informational sites)
+- CLS < 0.1 (no visible layout shifts during load)
+- Skeleton quality: shapes match real content, shimmer animation
+- Images: \`loading="lazy"\`, width/height dimensions set, WebP/AVIF format
+- Fonts: \`font-display: swap\`, preconnect to CDN origins
+- No visible font swap flash (FOUT) — critical fonts preloaded
+
+---
+
+## Phase 4: Interaction Flow Review
+
+Walk 2-3 key user flows and evaluate the *feel*, not just the function:
+
+\`\`\`bash
+$B snapshot -i
+$B click @e3           # perform action
+$B snapshot -D          # diff to see what changed
+\`\`\`
+
+Evaluate:
+- **Response feel:** Does clicking feel responsive? Any delays or missing loading states?
+- **Transition quality:** Are transitions intentional or generic/absent?
+- **Feedback clarity:** Did the action clearly succeed or fail? Is the feedback immediate?
+- **Form polish:** Focus states visible? Validation timing correct? Errors near the source?
+
+---
+
+## Phase 5: Cross-Page Consistency
+
+Compare screenshots and observations across pages for:
+- Navigation bar consistent across all pages?
+- Footer consistent?
+- Component reuse vs one-off designs (same button styled differently on different pages?)
+- Tone consistency (one page playful while another is corporate?)
+- Spacing rhythm carries across pages?
+
+---
+
+## Phase 6: Compile Report
+
+### Output Locations
+
+**Local:** \`.gstack/design-reports/design-audit-{domain}-{YYYY-MM-DD}.md\`
+
+**Project-scoped:**
+\`\`\`bash
+SLUG=$(git remote get-url origin 2>/dev/null | sed 's|.*[:/]\\([^/]*/[^/]*\\)\\.git$|\\1|;s|.*[:/]\\([^/]*/[^/]*\\)$|\\1|' | tr '/' '-')
+mkdir -p ~/.gstack/projects/$SLUG
+\`\`\`
+Write to: \`~/.gstack/projects/{slug}/{user}-{branch}-design-audit-{datetime}.md\`
+
+**Baseline:** Write \`design-baseline.json\` for regression mode:
+\`\`\`json
+{
+  "date": "YYYY-MM-DD",
+  "url": "<target>",
+  "designScore": "B",
+  "aiSlopScore": "C",
+  "categoryGrades": { "hierarchy": "A", "typography": "B", ... },
+  "findings": [{ "id": "FINDING-001", "title": "...", "impact": "high", "category": "typography" }]
+}
+\`\`\`
+
+### Scoring System
+
+**Dual headline scores:**
+- **Design Score: {A-F}** — weighted average of all 10 categories
+- **AI Slop Score: {A-F}** — standalone grade with pithy verdict
+
+**Per-category grades:**
+- **A:** Intentional, polished, delightful. Shows design thinking.
+- **B:** Solid fundamentals, minor inconsistencies. Looks professional.
+- **C:** Functional but generic. No major problems, no design point of view.
+- **D:** Noticeable problems. Feels unfinished or careless.
+- **F:** Actively hurting user experience. Needs significant rework.
+
+**Grade computation:** Each category starts at A. Each High-impact finding drops one letter grade. Each Medium-impact finding drops half a letter grade. Polish findings are noted but do not affect grade. Minimum is F.
+
+**Category weights for Design Score:**
+| Category | Weight |
+|----------|--------|
+| Visual Hierarchy | 15% |
+| Typography | 15% |
+| Spacing & Layout | 15% |
+| Color & Contrast | 10% |
+| Interaction States | 10% |
+| Responsive | 10% |
+| Content Quality | 10% |
+| AI Slop | 5% |
+| Motion | 5% |
+| Performance Feel | 5% |
+
+AI Slop is 5% of Design Score but also graded independently as a headline metric.
+
+### Regression Output
+
+When previous \`design-baseline.json\` exists or \`--regression\` flag is used:
+- Load baseline grades
+- Compare: per-category deltas, new findings, resolved findings
+- Append regression table to report
+
+---
+
+## Design Critique Format
+
+Use structured feedback, not opinions:
+- "I notice..." — observation (e.g., "I notice the primary CTA competes with the secondary action")
+- "I wonder..." — question (e.g., "I wonder if users will understand what 'Process' means here")
+- "What if..." — suggestion (e.g., "What if we moved search to a more prominent position?")
+- "I think... because..." — reasoned opinion (e.g., "I think the spacing between sections is too uniform because it doesn't create hierarchy")
+
+Tie everything to user goals and product objectives. Always suggest specific improvements alongside problems.
+
+---
+
+## Important Rules
+
+1. **Think like a designer, not a QA engineer.** You care whether things feel right, look intentional, and respect the user. You do NOT just care whether things "work."
+2. **Screenshots are evidence.** Every finding needs at least one screenshot. Use annotated screenshots (\`snapshot -a\`) to highlight elements.
+3. **Be specific and actionable.** "Change X to Y because Z" — not "the spacing feels off."
+4. **Never read source code.** Evaluate the rendered site, not the implementation. (Exception: offer to write DESIGN.md from extracted observations.)
+5. **AI Slop detection is your superpower.** Most developers can't evaluate whether their site looks AI-generated. You can. Be direct about it.
+6. **Quick wins matter.** Always include a "Quick Wins" section — the 3-5 highest-impact fixes that take <30 minutes each.
+7. **Use \`snapshot -C\` for tricky UIs.** Finds clickable divs that the accessibility tree misses.
+8. **Responsive is design, not just "not broken."** A stacked desktop layout on mobile is not responsive design — it's lazy. Evaluate whether the mobile layout makes *design* sense.
+9. **Document incrementally.** Write each finding to the report as you find it. Don't batch.
+10. **Depth over breadth.** 5-10 well-documented findings with screenshots and specific suggestions > 20 vague observations.`;
+}
+
 const RESOLVERS: Record<string, () => string> = {
   COMMAND_REFERENCE: generateCommandReference,
   SNAPSHOT_FLAGS: generateSnapshotFlags,
   PREAMBLE: generatePreamble,
   BROWSE_SETUP: generateBrowseSetup,
   QA_METHODOLOGY: generateQAMethodology,
+  DESIGN_METHODOLOGY: generateDesignMethodology,
 };
 
 // ─── Template Processing ────────────────────────────────────
@@ -509,6 +843,8 @@ function findTemplates(): string[] {
     path.join(ROOT, 'plan-eng-review', 'SKILL.md.tmpl'),
     path.join(ROOT, 'retro', 'SKILL.md.tmpl'),
     path.join(ROOT, 'gstack-upgrade', 'SKILL.md.tmpl'),
+    path.join(ROOT, 'plan-design-review', 'SKILL.md.tmpl'),
+    path.join(ROOT, 'qa-design-review', 'SKILL.md.tmpl'),
   ];
   for (const p of candidates) {
     if (fs.existsSync(p)) templates.push(p);
diff --git a/scripts/skill-check.ts b/scripts/skill-check.ts
index 591a0c81..eccb12e8 100644
--- a/scripts/skill-check.ts
+++ b/scripts/skill-check.ts
@@ -27,6 +27,8 @@ const SKILL_FILES = [
   'plan-ceo-review/SKILL.md',
   'plan-eng-review/SKILL.md',
   'setup-browser-cookies/SKILL.md',
+  'plan-design-review/SKILL.md',
+  'qa-design-review/SKILL.md',
 ].filter(f => fs.existsSync(path.join(ROOT, f)));
 
 let hasErrors = false;
diff --git a/test/skill-validation.test.ts b/test/skill-validation.test.ts
index 88e98935..415072e3 100644
--- a/test/skill-validation.test.ts
+++ b/test/skill-validation.test.ts
@@ -57,6 +57,34 @@ describe('SKILL.md command validation', () => {
     const result = validateSkill(qaOnlySkill);
     expect(result.snapshotFlagErrors).toHaveLength(0);
   });
+
+  test('all $B commands in plan-design-review/SKILL.md are valid browse commands', () => {
+    const skill = path.join(ROOT, 'plan-design-review', 'SKILL.md');
+    if (!fs.existsSync(skill)) return;
+    const result = validateSkill(skill);
+    expect(result.invalid).toHaveLength(0);
+  });
+
+  test('all snapshot flags in plan-design-review/SKILL.md are valid', () => {
+    const skill = path.join(ROOT, 'plan-design-review', 'SKILL.md');
+    if (!fs.existsSync(skill)) return;
+    const result = validateSkill(skill);
+    expect(result.snapshotFlagErrors).toHaveLength(0);
+  });
+
+  test('all $B commands in qa-design-review/SKILL.md are valid browse commands', () => {
+    const skill = path.join(ROOT, 'qa-design-review', 'SKILL.md');
+    if (!fs.existsSync(skill)) return;
+    const result = validateSkill(skill);
+    expect(result.invalid).toHaveLength(0);
+  });
+
+  test('all snapshot flags in qa-design-review/SKILL.md are valid', () => {
+    const skill = path.join(ROOT, 'qa-design-review', 'SKILL.md');
+    if (!fs.existsSync(skill)) return;
+    const result = validateSkill(skill);
+    expect(result.snapshotFlagErrors).toHaveLength(0);
+  });
 });
 
 describe('Command registry consistency', () => {
@@ -176,6 +204,8 @@ describe('Update check preamble', () => {
     'ship/SKILL.md', 'review/SKILL.md',
     'plan-ceo-review/SKILL.md', 'plan-eng-review/SKILL.md',
     'retro/SKILL.md',
+    'plan-design-review/SKILL.md',
+    'qa-design-review/SKILL.md',
   ];
 
   for (const skill of skillsWithUpdateCheck) {
@@ -421,6 +451,8 @@ describe('v0.4.1 preamble features', () => {
     'ship/SKILL.md', 'review/SKILL.md',
     'plan-ceo-review/SKILL.md', 'plan-eng-review/SKILL.md',
     'retro/SKILL.md',
+    'plan-design-review/SKILL.md',
+    'qa-design-review/SKILL.md',
   ];
 
   for (const skill of skillsWithPreamble) {