From 54677117cc50bef8f9797798a164f5a32603c0a6 Mon Sep 17 00:00:00 2001 From: Lucas Braud Date: Mon, 16 Mar 2026 09:54:24 +0100 Subject: [PATCH 1/7] Fix build script failure on Windows MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The `rm -f .*.bun-build` glob cleanup step fails on Windows/Git Bash when no files match the pattern, causing `bun run build` to exit with code 1. Since the setup script uses `set -e`, this aborts the entire setup before skill symlinks are created. Adding `|| true` makes the cleanup step non-fatal, which matches the intent — it's just removing stale build artifacts if they exist. --- package.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/package.json b/package.json index a5044b7d..e30ee587 100644 --- a/package.json +++ b/package.json @@ -8,7 +8,7 @@ "browse": "./browse/dist/browse" }, "scripts": { - "build": "bun run gen:skill-docs && bun build --compile browse/src/cli.ts --outfile browse/dist/browse && bun build --compile browse/src/find-browse.ts --outfile browse/dist/find-browse && git rev-parse HEAD > browse/dist/.version && rm -f .*.bun-build", + "build": "bun run gen:skill-docs && bun build --compile browse/src/cli.ts --outfile browse/dist/browse && bun build --compile browse/src/find-browse.ts --outfile browse/dist/find-browse && git rev-parse HEAD > browse/dist/.version && rm -f .*.bun-build || true", "gen:skill-docs": "bun run scripts/gen-skill-docs.ts", "dev": "bun run browse/src/cli.ts", "server": "bun run browse/src/server.ts", From 1a100a2a239a74af03c2bf32abb370419b06416a Mon Sep 17 00:00:00 2001 From: xianren Date: Tue, 17 Mar 2026 22:05:02 +0800 Subject: [PATCH 2/7] fix: eliminate duplicate command sets in chain, improve flush perf and type safety MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Remove duplicate CHAIN_READ/CHAIN_WRITE/CHAIN_META sets from meta-commands.ts and import from commands.ts (single source of truth). The duplicated sets would silently fail to route new commands added to commands.ts. - Replace read+concat+write log flush with fs.appendFileSync — O(new entries) instead of O(total log size) per flush cycle. - Replace `any` types for contextOptions with Playwright's BrowserContextOptions and add proper types for storage state in recreateContext(). Co-Authored-By: Claude Opus 4.6 (1M context) --- browse/src/browser-manager.ts | 20 ++++++++++---------- browse/src/meta-commands.ts | 29 ++++------------------------- browse/src/server.ts | 6 +++--- 3 files changed, 17 insertions(+), 38 deletions(-) diff --git a/browse/src/browser-manager.ts b/browse/src/browser-manager.ts index 260c8219..e094f3a5 100644 --- a/browse/src/browser-manager.ts +++ b/browse/src/browser-manager.ts @@ -15,7 +15,7 @@ * restores state. Falls back to clean slate on any failure. */ -import { chromium, type Browser, type BrowserContext, type Page, type Locator } from 'playwright'; +import { chromium, type Browser, type BrowserContext, type BrowserContextOptions, type Page, type Locator } from 'playwright'; import { addConsoleEntry, addNetworkEntry, addDialogEntry, networkBuffer, type DialogEntry } from './buffers'; export interface RefEntry { @@ -57,7 +57,7 @@ export class BrowserManager { process.exit(1); }); - const contextOptions: any = { + const contextOptions: BrowserContextOptions = { viewport: { width: 1280, height: 720 }, }; if (this.customUserAgent) { @@ -282,7 +282,7 @@ export class BrowserManager { try { // 1. Save state from current context const savedCookies = await this.context.cookies(); - const savedPages: Array<{ url: string; isActive: boolean; storage: any }> = []; + const savedPages: Array<{ url: string; isActive: boolean; storage: { localStorage: Record; sessionStorage: Record } | null }> = []; for (const [id, page] of this.pages) { const url = page.url(); @@ -308,7 +308,7 @@ export class BrowserManager { await this.context.close().catch(() => {}); // 3. Create new context with updated settings - const contextOptions: any = { + const contextOptions: BrowserContextOptions = { viewport: { width: 1280, height: 720 }, }; if (this.customUserAgent) { @@ -340,15 +340,15 @@ export class BrowserManager { // 6. Restore storage if (saved.storage) { try { - await page.evaluate((s: any) => { + await page.evaluate((s: { localStorage: Record; sessionStorage: Record }) => { if (s.localStorage) { for (const [k, v] of Object.entries(s.localStorage)) { - localStorage.setItem(k, v as string); + localStorage.setItem(k, v); } } if (s.sessionStorage) { for (const [k, v] of Object.entries(s.sessionStorage)) { - sessionStorage.setItem(k, v as string); + sessionStorage.setItem(k, v); } } }, saved.storage); @@ -369,13 +369,13 @@ export class BrowserManager { this.clearRefs(); return null; // success - } catch (err: any) { + } catch (err: unknown) { // Fallback: create a clean context + blank tab try { this.pages.clear(); if (this.context) await this.context.close().catch(() => {}); - const contextOptions: any = { + const contextOptions: BrowserContextOptions = { viewport: { width: 1280, height: 720 }, }; if (this.customUserAgent) { @@ -387,7 +387,7 @@ export class BrowserManager { } catch { // If even the fallback fails, we're in trouble — but browser is still alive } - return `Context recreation failed: ${err.message}. Browser reset to blank tab.`; + return `Context recreation failed: ${err instanceof Error ? err.message : String(err)}. Browser reset to blank tab.`; } } diff --git a/browse/src/meta-commands.ts b/browse/src/meta-commands.ts index c17930b3..3c622db9 100644 --- a/browse/src/meta-commands.ts +++ b/browse/src/meta-commands.ts @@ -5,6 +5,7 @@ import type { BrowserManager } from './browser-manager'; import { handleSnapshot } from './snapshot'; import { getCleanText } from './read-commands'; +import { READ_COMMANDS, WRITE_COMMANDS, META_COMMANDS } from './commands'; import * as Diff from 'diff'; import * as fs from 'fs'; import * as path from 'path'; @@ -20,28 +21,6 @@ function validateOutputPath(filePath: string): void { } } -// Command sets for chain routing (mirrors server.ts — kept local to avoid circular import) -const CHAIN_READ = new Set([ - 'text', 'html', 'links', 'forms', 'accessibility', - 'js', 'eval', 'css', 'attrs', - 'console', 'network', 'cookies', 'storage', 'perf', - 'dialog', 'is', -]); -const CHAIN_WRITE = new Set([ - 'goto', 'back', 'forward', 'reload', - 'click', 'fill', 'select', 'hover', 'type', 'press', 'scroll', 'wait', - 'viewport', 'cookie', 'cookie-import', 'header', 'useragent', - 'upload', 'dialog-accept', 'dialog-dismiss', - 'cookie-import-browser', -]); -const CHAIN_META = new Set([ - 'tabs', 'tab', 'newtab', 'closetab', - 'status', 'stop', 'restart', - 'screenshot', 'pdf', 'responsive', - 'chain', 'diff', - 'url', 'snapshot', -]); - export async function handleMetaCommand( command: string, args: string[], @@ -223,9 +202,9 @@ export async function handleMetaCommand( const [name, ...cmdArgs] = cmd; try { let result: string; - if (CHAIN_WRITE.has(name)) result = await handleWriteCommand(name, cmdArgs, bm); - else if (CHAIN_READ.has(name)) result = await handleReadCommand(name, cmdArgs, bm); - else if (CHAIN_META.has(name)) result = await handleMetaCommand(name, cmdArgs, bm, shutdown); + if (WRITE_COMMANDS.has(name)) result = await handleWriteCommand(name, cmdArgs, bm); + else if (READ_COMMANDS.has(name)) result = await handleReadCommand(name, cmdArgs, bm); + else if (META_COMMANDS.has(name)) result = await handleMetaCommand(name, cmdArgs, bm, shutdown); else throw new Error(`Unknown command: ${name}`); results.push(`[${name}] ${result}`); } catch (err: any) { diff --git a/browse/src/server.ts b/browse/src/server.ts index 5e76f421..f30a4881 100644 --- a/browse/src/server.ts +++ b/browse/src/server.ts @@ -104,7 +104,7 @@ async function flushBuffers() { const lines = entries.map(e => `[${new Date(e.timestamp).toISOString()}] [${e.level}] ${e.text}` ).join('\n') + '\n'; - await Bun.write(CONSOLE_LOG_PATH, (await Bun.file(CONSOLE_LOG_PATH).text().catch(() => '')) + lines); + fs.appendFileSync(CONSOLE_LOG_PATH, lines); lastConsoleFlushed = consoleBuffer.totalAdded; } @@ -115,7 +115,7 @@ async function flushBuffers() { const lines = entries.map(e => `[${new Date(e.timestamp).toISOString()}] ${e.method} ${e.url} → ${e.status || 'pending'} (${e.duration || '?'}ms, ${e.size || '?'}B)` ).join('\n') + '\n'; - await Bun.write(NETWORK_LOG_PATH, (await Bun.file(NETWORK_LOG_PATH).text().catch(() => '')) + lines); + fs.appendFileSync(NETWORK_LOG_PATH, lines); lastNetworkFlushed = networkBuffer.totalAdded; } @@ -126,7 +126,7 @@ async function flushBuffers() { const lines = entries.map(e => `[${new Date(e.timestamp).toISOString()}] [${e.type}] "${e.message}" → ${e.action}${e.response ? ` "${e.response}"` : ''}` ).join('\n') + '\n'; - await Bun.write(DIALOG_LOG_PATH, (await Bun.file(DIALOG_LOG_PATH).text().catch(() => '')) + lines); + fs.appendFileSync(DIALOG_LOG_PATH, lines); lastDialogFlushed = dialogBuffer.totalAdded; } } catch { From 17c1c06cd98f78bf0bd25adba2effa8048a76936 Mon Sep 17 00:00:00 2001 From: Garry Tan Date: Tue, 17 Mar 2026 18:45:41 -0500 Subject: [PATCH 3/7] feat: diff-based test selection for E2E and LLM-judge evals (v0.6.1.0) (#139) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * feat: diff-based test selection for E2E and LLM-judge evals Each test declares file dependencies in a TOUCHFILES map. The test runner checks git diff against the base branch and only runs tests whose dependencies were modified. Global touchfiles (session-runner, eval-store, gen-skill-docs) trigger all tests. New scripts: test:e2e:all, test:evals:all, eval:select Co-Authored-By: Claude Opus 4.6 * chore: bump version and changelog (v0.6.1.0) Co-Authored-By: Claude Opus 4.6 * fix: plan-design-review-audit eval — bump turns to 30, add efficiency hints The test was flaky at 20 turns because the agent reads a 300-line SKILL.md, navigates, extracts design data, and writes a report. Added hints to skip preamble/batch commands/write early while still testing the real SKILL.md. Now completes in ~13 turns consistently. Co-Authored-By: Claude Opus 4.6 --------- Co-authored-by: Claude Opus 4.6 --- CHANGELOG.md | 13 ++ CLAUDE.md | 13 +- VERSION | 2 +- package.json | 5 +- scripts/eval-select.ts | 86 ++++++++++++ test/helpers/touchfiles.ts | 178 +++++++++++++++++++++++++ test/skill-e2e.test.ts | 124 ++++++++++++------ test/skill-llm-eval.test.ts | 62 +++++++-- test/touchfiles.test.ts | 253 ++++++++++++++++++++++++++++++++++++ 9 files changed, 681 insertions(+), 55 deletions(-) create mode 100644 scripts/eval-select.ts create mode 100644 test/helpers/touchfiles.ts create mode 100644 test/touchfiles.test.ts diff --git a/CHANGELOG.md b/CHANGELOG.md index be473577..5062711c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,18 @@ # Changelog +## [0.6.1.0] - 2026-03-17 + +### Added + +- **E2E and LLM-judge tests now only run what you changed.** Each test declares which source files it depends on. When you run `bun run test:e2e`, it checks your diff and skips tests whose dependencies weren't touched. A branch that only changes `/retro` now runs 2 tests instead of 31. Use `bun run test:e2e:all` to force everything. +- **`bun run eval:select` previews which tests would run.** See exactly which tests your diff triggers before spending API credits. Supports `--json` for scripting and `--base ` to override the base branch. +- **Completeness guardrail catches forgotten test entries.** A free unit test validates that every `testName` in the E2E and LLM-judge test files has a corresponding entry in the TOUCHFILES map. New tests without entries fail `bun test` immediately — no silent always-run degradation. + +### Changed + +- `test:evals` and `test:e2e` now auto-select based on diff (was: all-or-nothing) +- New `test:evals:all` and `test:e2e:all` scripts for explicit full runs + ## 0.6.1 — 2026-03-17 — Boil the Lake Every gstack skill now follows the **Completeness Principle**: always recommend the diff --git a/CLAUDE.md b/CLAUDE.md index 34868b0a..213be490 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -5,8 +5,11 @@ ```bash bun install # install dependencies bun test # run free tests (browse + snapshot + skill validation) -bun run test:evals # run paid evals: LLM judge + E2E (~$4/run) -bun run test:e2e # run E2E tests only (~$3.85/run) +bun run test:evals # run paid evals: LLM judge + E2E (diff-based, ~$4/run max) +bun run test:evals:all # run ALL paid evals regardless of diff +bun run test:e2e # run E2E tests only (diff-based, ~$3.85/run max) +bun run test:e2e:all # run ALL E2E tests regardless of diff +bun run eval:select # show which tests would run based on current diff bun run dev # run CLI in dev mode, e.g. bun run dev goto https://example.com bun run build # gen docs + compile binaries bun run gen:skill-docs # regenerate SKILL.md files from templates @@ -21,6 +24,12 @@ bun run eval:summary # aggregate stats across all eval runs (tool-by-tool via `--output-format stream-json --verbose`). Results are persisted to `~/.gstack-dev/evals/` with auto-comparison against the previous run. +**Diff-based test selection:** `test:evals` and `test:e2e` auto-select tests based +on `git diff` against the base branch. Each test declares its file dependencies in +`test/helpers/touchfiles.ts`. Changes to global touchfiles (session-runner, eval-store, +llm-judge, gen-skill-docs) trigger all tests. Use `EVALS_ALL=1` or the `:all` script +variants to force all tests. Run `eval:select` to preview which tests would run. + ## Project structure ``` diff --git a/VERSION b/VERSION index ee6cdce3..44e7f9a2 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -0.6.1 +0.6.1.0 diff --git a/package.json b/package.json index a5044b7d..e725b4ab 100644 --- a/package.json +++ b/package.json @@ -14,14 +14,17 @@ "server": "bun run browse/src/server.ts", "test": "bun test browse/test/ test/ --ignore test/skill-e2e.test.ts --ignore test/skill-llm-eval.test.ts", "test:evals": "EVALS=1 bun test test/skill-llm-eval.test.ts test/skill-e2e.test.ts", + "test:evals:all": "EVALS=1 EVALS_ALL=1 bun test test/skill-llm-eval.test.ts test/skill-e2e.test.ts", "test:e2e": "EVALS=1 bun test test/skill-e2e.test.ts", + "test:e2e:all": "EVALS=1 EVALS_ALL=1 bun test test/skill-e2e.test.ts", "skill:check": "bun run scripts/skill-check.ts", "dev:skill": "bun run scripts/dev-skill.ts", "start": "bun run browse/src/server.ts", "eval:list": "bun run scripts/eval-list.ts", "eval:compare": "bun run scripts/eval-compare.ts", "eval:summary": "bun run scripts/eval-summary.ts", - "eval:watch": "bun run scripts/eval-watch.ts" + "eval:watch": "bun run scripts/eval-watch.ts", + "eval:select": "bun run scripts/eval-select.ts" }, "dependencies": { "playwright": "^1.58.2", diff --git a/scripts/eval-select.ts b/scripts/eval-select.ts new file mode 100644 index 00000000..cdbdcc84 --- /dev/null +++ b/scripts/eval-select.ts @@ -0,0 +1,86 @@ +#!/usr/bin/env bun +/** + * Show which E2E and LLM-judge tests would run based on the current git diff. + * + * Usage: + * bun run eval:select # human-readable output + * bun run eval:select --json # machine-readable JSON + * bun run eval:select --base main # override base branch + */ + +import * as path from 'path'; +import { + selectTests, + detectBaseBranch, + getChangedFiles, + E2E_TOUCHFILES, + LLM_JUDGE_TOUCHFILES, + GLOBAL_TOUCHFILES, +} from '../test/helpers/touchfiles'; + +const ROOT = path.resolve(import.meta.dir, '..'); +const args = process.argv.slice(2); +const jsonMode = args.includes('--json'); +const baseIdx = args.indexOf('--base'); +const baseOverride = baseIdx >= 0 ? args[baseIdx + 1] : undefined; + +// Detect base branch +const baseBranch = baseOverride || detectBaseBranch(ROOT) || 'main'; +const changedFiles = getChangedFiles(baseBranch, ROOT); + +if (changedFiles.length === 0) { + if (jsonMode) { + console.log(JSON.stringify({ base: baseBranch, changed_files: 0, e2e: 'all', llm_judge: 'all', reason: 'no diff — would run all tests' })); + } else { + console.log(`Base: ${baseBranch}`); + console.log('No changed files detected — all tests would run.'); + } + process.exit(0); +} + +const e2eSelection = selectTests(changedFiles, E2E_TOUCHFILES, GLOBAL_TOUCHFILES); +const llmSelection = selectTests(changedFiles, LLM_JUDGE_TOUCHFILES, GLOBAL_TOUCHFILES); + +if (jsonMode) { + console.log(JSON.stringify({ + base: baseBranch, + changed_files: changedFiles, + e2e: { + selected: e2eSelection.selected, + skipped: e2eSelection.skipped, + reason: e2eSelection.reason, + count: `${e2eSelection.selected.length}/${Object.keys(E2E_TOUCHFILES).length}`, + }, + llm_judge: { + selected: llmSelection.selected, + skipped: llmSelection.skipped, + reason: llmSelection.reason, + count: `${llmSelection.selected.length}/${Object.keys(LLM_JUDGE_TOUCHFILES).length}`, + }, + }, null, 2)); +} else { + console.log(`Base: ${baseBranch}`); + console.log(`Changed files: ${changedFiles.length}`); + console.log(); + + console.log(`E2E (${e2eSelection.reason}): ${e2eSelection.selected.length}/${Object.keys(E2E_TOUCHFILES).length} tests`); + if (e2eSelection.selected.length > 0 && e2eSelection.selected.length < Object.keys(E2E_TOUCHFILES).length) { + console.log(` Selected: ${e2eSelection.selected.join(', ')}`); + console.log(` Skipped: ${e2eSelection.skipped.join(', ')}`); + } else if (e2eSelection.selected.length === 0) { + console.log(' No E2E tests affected.'); + } else { + console.log(' All E2E tests selected.'); + } + console.log(); + + console.log(`LLM-judge (${llmSelection.reason}): ${llmSelection.selected.length}/${Object.keys(LLM_JUDGE_TOUCHFILES).length} tests`); + if (llmSelection.selected.length > 0 && llmSelection.selected.length < Object.keys(LLM_JUDGE_TOUCHFILES).length) { + console.log(` Selected: ${llmSelection.selected.join(', ')}`); + console.log(` Skipped: ${llmSelection.skipped.join(', ')}`); + } else if (llmSelection.selected.length === 0) { + console.log(' No LLM-judge tests affected.'); + } else { + console.log(' All LLM-judge tests selected.'); + } +} diff --git a/test/helpers/touchfiles.ts b/test/helpers/touchfiles.ts new file mode 100644 index 00000000..30a15579 --- /dev/null +++ b/test/helpers/touchfiles.ts @@ -0,0 +1,178 @@ +/** + * Diff-based test selection for E2E and LLM-judge evals. + * + * Each test declares which source files it depends on ("touchfiles"). + * The test runner checks `git diff` and only runs tests whose + * dependencies were modified. Override with EVALS_ALL=1 to run everything. + */ + +import { spawnSync } from 'child_process'; + +// --- Glob matching --- + +/** + * Match a file path against a glob pattern. + * Supports: + * ** — match any number of path segments + * * — match within a single segment (no /) + */ +export function matchGlob(file: string, pattern: string): boolean { + const regexStr = pattern + .replace(/\./g, '\\.') + .replace(/\*\*/g, '{{GLOBSTAR}}') + .replace(/\*/g, '[^/]*') + .replace(/\{\{GLOBSTAR\}\}/g, '.*'); + return new RegExp(`^${regexStr}$`).test(file); +} + +// --- Touchfile maps --- + +/** + * E2E test touchfiles — keyed by testName (the string passed to runSkillTest). + * Each test lists the file patterns that, if changed, require the test to run. + */ +export const E2E_TOUCHFILES: Record = { + // Browse core + 'browse-basic': ['browse/src/**'], + 'browse-snapshot': ['browse/src/**'], + + // SKILL.md setup + preamble (depend on ROOT SKILL.md only) + 'skillmd-setup-discovery': ['SKILL.md', 'SKILL.md.tmpl'], + 'skillmd-no-local-binary': ['SKILL.md', 'SKILL.md.tmpl'], + 'skillmd-outside-git': ['SKILL.md', 'SKILL.md.tmpl'], + 'contributor-mode': ['SKILL.md', 'SKILL.md.tmpl'], + 'session-awareness': ['SKILL.md', 'SKILL.md.tmpl'], + + // QA + 'qa-quick': ['qa/**', 'browse/src/**'], + 'qa-b6-static': ['qa/**', 'browse/src/**', 'browse/test/fixtures/qa-eval.html', 'test/fixtures/qa-eval-ground-truth.json'], + 'qa-b7-spa': ['qa/**', 'browse/src/**', 'browse/test/fixtures/qa-eval-spa.html', 'test/fixtures/qa-eval-spa-ground-truth.json'], + 'qa-b8-checkout': ['qa/**', 'browse/src/**', 'browse/test/fixtures/qa-eval-checkout.html', 'test/fixtures/qa-eval-checkout-ground-truth.json'], + 'qa-only-no-fix': ['qa-only/**', 'qa/templates/**'], + 'qa-fix-loop': ['qa/**', 'browse/src/**'], + + // Review + 'review-sql-injection': ['review/**', 'test/fixtures/review-eval-vuln.rb'], + 'review-enum-completeness': ['review/**', 'test/fixtures/review-eval-enum*.rb'], + 'review-base-branch': ['review/**'], + + // Plan reviews + 'plan-ceo-review': ['plan-ceo-review/**'], + 'plan-ceo-review-selective': ['plan-ceo-review/**'], + 'plan-eng-review': ['plan-eng-review/**'], + 'plan-eng-review-artifact': ['plan-eng-review/**'], + + // Ship + 'ship-base-branch': ['ship/**'], + + // Retro + 'retro': ['retro/**'], + 'retro-base-branch': ['retro/**'], + + // Document-release + 'document-release': ['document-release/**'], + + // QA bootstrap + 'qa-bootstrap': ['qa/**', 'browse/src/**', 'ship/**'], + + // Ship coverage audit + 'ship-coverage-audit': ['ship/**'], + + // Design + 'design-consultation-core': ['design-consultation/**'], + 'design-consultation-research': ['design-consultation/**'], + 'design-consultation-existing': ['design-consultation/**'], + 'design-consultation-preview': ['design-consultation/**'], + 'plan-design-review-audit': ['plan-design-review/**'], + 'plan-design-review-export': ['plan-design-review/**'], + 'qa-design-review-fix': ['qa-design-review/**', 'browse/src/**'], +}; + +/** + * LLM-judge test touchfiles — keyed by test description string. + */ +export const LLM_JUDGE_TOUCHFILES: Record = { + 'command reference table': ['SKILL.md', 'SKILL.md.tmpl', 'browse/src/commands.ts'], + 'snapshot flags reference': ['SKILL.md', 'SKILL.md.tmpl', 'browse/src/snapshot.ts'], + 'browse/SKILL.md reference': ['browse/SKILL.md', 'browse/SKILL.md.tmpl', 'browse/src/**'], + 'setup block': ['SKILL.md', 'SKILL.md.tmpl'], + 'regression vs baseline': ['SKILL.md', 'SKILL.md.tmpl', 'browse/src/commands.ts', 'test/fixtures/eval-baselines.json'], + 'qa/SKILL.md workflow': ['qa/SKILL.md', 'qa/SKILL.md.tmpl'], + 'qa/SKILL.md health rubric': ['qa/SKILL.md', 'qa/SKILL.md.tmpl'], + 'cross-skill greptile consistency': ['review/SKILL.md', 'review/SKILL.md.tmpl', 'ship/SKILL.md', 'ship/SKILL.md.tmpl', 'review/greptile-triage.md', 'retro/SKILL.md', 'retro/SKILL.md.tmpl'], + 'baseline score pinning': ['SKILL.md', 'SKILL.md.tmpl', 'test/fixtures/eval-baselines.json'], +}; + +/** + * Changes to any of these files trigger ALL tests (both E2E and LLM-judge). + */ +export const GLOBAL_TOUCHFILES = [ + 'test/helpers/session-runner.ts', + 'test/helpers/eval-store.ts', + 'test/helpers/llm-judge.ts', + 'scripts/gen-skill-docs.ts', + 'test/helpers/touchfiles.ts', + 'browse/test/test-server.ts', +]; + +// --- Base branch detection --- + +/** + * Detect the base branch by trying refs in order. + * Returns the first valid ref, or null if none found. + */ +export function detectBaseBranch(cwd: string): string | null { + for (const ref of ['origin/main', 'origin/master', 'main', 'master']) { + const result = spawnSync('git', ['rev-parse', '--verify', ref], { + cwd, stdio: 'pipe', timeout: 3000, + }); + if (result.status === 0) return ref; + } + return null; +} + +/** + * Get list of files changed between base branch and HEAD. + */ +export function getChangedFiles(baseBranch: string, cwd: string): string[] { + const result = spawnSync('git', ['diff', '--name-only', `${baseBranch}...HEAD`], { + cwd, stdio: 'pipe', timeout: 5000, + }); + if (result.status !== 0) return []; + return result.stdout.toString().trim().split('\n').filter(Boolean); +} + +// --- Test selection --- + +/** + * Select tests to run based on changed files. + * + * Algorithm: + * 1. If any changed file matches a global touchfile → run ALL tests + * 2. Otherwise, for each test, check if any changed file matches its patterns + * 3. Return selected + skipped lists with reason + */ +export function selectTests( + changedFiles: string[], + touchfiles: Record, + globalTouchfiles: string[] = GLOBAL_TOUCHFILES, +): { selected: string[]; skipped: string[]; reason: string } { + const allTestNames = Object.keys(touchfiles); + + // Global touchfile hit → run all + for (const file of changedFiles) { + if (globalTouchfiles.some(g => matchGlob(file, g))) { + return { selected: allTestNames, skipped: [], reason: `global: ${file}` }; + } + } + + // Per-test matching + const selected: string[] = []; + const skipped: string[] = []; + for (const [testName, patterns] of Object.entries(touchfiles)) { + const hit = changedFiles.some(f => patterns.some(p => matchGlob(f, p))); + (hit ? selected : skipped).push(testName); + } + + return { selected, skipped, reason: 'diff' }; +} diff --git a/test/skill-e2e.test.ts b/test/skill-e2e.test.ts index 021e41da..338ec2f1 100644 --- a/test/skill-e2e.test.ts +++ b/test/skill-e2e.test.ts @@ -1,10 +1,11 @@ import { describe, test, expect, beforeAll, afterAll } from 'bun:test'; import { runSkillTest } from './helpers/session-runner'; import type { SkillTestResult } from './helpers/session-runner'; -import { outcomeJudge } from './helpers/llm-judge'; +import { outcomeJudge, callJudge } from './helpers/llm-judge'; import { EvalCollector, judgePassed } from './helpers/eval-store'; import type { EvalTestEntry } from './helpers/eval-store'; import { startTestServer } from '../browse/test/test-server'; +import { selectTests, detectBaseBranch, getChangedFiles, E2E_TOUCHFILES, GLOBAL_TOUCHFILES } from './helpers/touchfiles'; import { spawnSync } from 'child_process'; import * as fs from 'fs'; import * as path from 'path'; @@ -21,6 +22,41 @@ const ROOT = path.resolve(import.meta.dir, '..'); const evalsEnabled = !!process.env.EVALS; const describeE2E = evalsEnabled ? describe : describe.skip; +// --- Diff-based test selection --- +// When EVALS_ALL is not set, only run tests whose touchfiles were modified. +// Set EVALS_ALL=1 to force all tests. Set EVALS_BASE to override base branch. +let selectedTests: string[] | null = null; // null = run all + +if (evalsEnabled && !process.env.EVALS_ALL) { + const baseBranch = process.env.EVALS_BASE + || detectBaseBranch(ROOT) + || 'main'; + const changedFiles = getChangedFiles(baseBranch, ROOT); + + if (changedFiles.length > 0) { + const selection = selectTests(changedFiles, E2E_TOUCHFILES, GLOBAL_TOUCHFILES); + selectedTests = selection.selected; + process.stderr.write(`\nE2E selection (${selection.reason}): ${selection.selected.length}/${Object.keys(E2E_TOUCHFILES).length} tests\n`); + if (selection.skipped.length > 0) { + process.stderr.write(` Skipped: ${selection.skipped.join(', ')}\n`); + } + process.stderr.write('\n'); + } + // If changedFiles is empty (e.g., on main branch), selectedTests stays null → run all +} + +/** Wrap a describe block to skip entirely if none of its tests are selected. */ +function describeIfSelected(name: string, testNames: string[], fn: () => void) { + const anySelected = selectedTests === null || testNames.some(t => selectedTests!.includes(t)); + (anySelected ? describeE2E : describe.skip)(name, fn); +} + +/** Skip an individual test if not selected (for multi-test describe blocks). */ +function testIfSelected(testName: string, fn: () => Promise, timeout: number) { + const shouldRun = selectedTests === null || selectedTests.includes(testName); + (shouldRun ? test : test.skip)(testName, fn, timeout); +} + // Eval result collector — accumulates test results, writes to ~/.gstack-dev/evals/ on finalize const evalCollector = evalsEnabled ? new EvalCollector('e2e') : null; @@ -133,7 +169,10 @@ if (evalsEnabled) { } } -describeE2E('Skill E2E tests', () => { +describeIfSelected('Skill E2E tests', [ + 'browse-basic', 'browse-snapshot', 'skillmd-setup-discovery', + 'skillmd-no-local-binary', 'skillmd-outside-git', 'contributor-mode', 'session-awareness', +], () => { beforeAll(() => { testServer = startTestServer(); tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-')); @@ -145,7 +184,7 @@ describeE2E('Skill E2E tests', () => { try { fs.rmSync(tmpDir, { recursive: true, force: true }); } catch {} }); - test('browse basic commands work without errors', async () => { + testIfSelected('browse-basic', async () => { const result = await runSkillTest({ prompt: `You have a browse binary at ${browseBin}. Assign it to B variable and run these commands in sequence: 1. $B goto ${testServer.url} @@ -166,7 +205,7 @@ Report the results of each command.`, expect(result.exitReason).toBe('success'); }, 90_000); - test('browse snapshot flags all work', async () => { + testIfSelected('browse-snapshot', async () => { const result = await runSkillTest({ prompt: `You have a browse binary at ${browseBin}. Assign it to B variable and run: 1. $B goto ${testServer.url} @@ -191,7 +230,7 @@ Report what each command returned.`, expect(result.exitReason).toBe('success'); }, 90_000); - test('agent discovers browse binary via SKILL.md setup block', async () => { + testIfSelected('skillmd-setup-discovery', async () => { const skillMd = fs.readFileSync(path.join(ROOT, 'SKILL.md'), 'utf-8'); const setupStart = skillMd.indexOf('## SETUP'); const setupEnd = skillMd.indexOf('## IMPORTANT'); @@ -220,7 +259,7 @@ Report whether it worked.`, expect(result.exitReason).toBe('success'); }, 90_000); - test('SKILL.md setup block handles missing local binary gracefully', async () => { + testIfSelected('skillmd-no-local-binary', async () => { // Create a tmpdir with no browse binary — no local .claude/skills/gstack/browse/dist/browse const emptyDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-empty-')); @@ -255,7 +294,7 @@ Report the exact output. Do NOT try to fix or install anything — just report w try { fs.rmSync(emptyDir, { recursive: true, force: true }); } catch {} }, 60_000); - test('SKILL.md setup block works outside git repo', async () => { + testIfSelected('skillmd-outside-git', async () => { // Create a tmpdir outside any git repo const nonGitDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-nogit-')); @@ -286,7 +325,7 @@ Report the exact output — either "READY: " or "NEEDS_SETUP".`, try { fs.rmSync(nonGitDir, { recursive: true, force: true }); } catch {} }, 60_000); - test('contributor mode files a report on gstack error', async () => { + testIfSelected('contributor-mode', async () => { const contribDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-contrib-')); const logsDir = path.join(contribDir, 'contributor-logs'); fs.mkdirSync(logsDir, { recursive: true }); @@ -342,7 +381,7 @@ File a contributor report about this issue. Then tell me what you filed.`, try { fs.rmSync(contribDir, { recursive: true, force: true }); } catch {} }, 90_000); - test('session awareness adds ELI16 context when _SESSIONS >= 3', async () => { + testIfSelected('session-awareness', async () => { const sessionDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-session-')); // Set up a git repo so there's project/branch context to reference @@ -413,7 +452,7 @@ Remember: _SESSIONS=4, so ELI16 mode is active. The user is juggling multiple wi // --- B4: QA skill E2E --- -describeE2E('QA skill E2E', () => { +describeIfSelected('QA skill E2E', ['qa-quick'], () => { let qaDir: string; beforeAll(() => { @@ -468,7 +507,7 @@ Write your report to ${qaDir}/qa-reports/qa-report.md`, // --- B5: Review skill E2E --- -describeE2E('Review skill E2E', () => { +describeIfSelected('Review skill E2E', ['review-sql-injection'], () => { let reviewDir: string; beforeAll(() => { @@ -527,7 +566,7 @@ Write your review findings to ${reviewDir}/review-output.md`, // --- Review: Enum completeness E2E --- -describeE2E('Review enum completeness E2E', () => { +describeIfSelected('Review enum completeness E2E', ['review-enum-completeness'], () => { let enumDir: string; beforeAll(() => { @@ -603,7 +642,10 @@ The diff adds a new "returned" status to the Order model. Your job is to check i const hasApiKey = !!process.env.ANTHROPIC_API_KEY; const describeOutcome = (evalsEnabled && hasApiKey) ? describe : describe.skip; -describeOutcome('Planted-bug outcome evals', () => { +// Wrap describeOutcome with selection — skip if no planted-bug tests are selected +const outcomeTestNames = ['qa-b6-static', 'qa-b7-spa', 'qa-b8-checkout']; +const anyOutcomeSelected = selectedTests === null || outcomeTestNames.some(t => selectedTests!.includes(t)); +(anyOutcomeSelected ? describeOutcome : describe.skip)('Planted-bug outcome evals', () => { let outcomeDir: string; beforeAll(() => { @@ -767,7 +809,7 @@ CRITICAL RULES: // --- Plan CEO Review E2E --- -describeE2E('Plan CEO Review E2E', () => { +describeIfSelected('Plan CEO Review E2E', ['plan-ceo-review'], () => { let planDir: string; beforeAll(() => { @@ -854,7 +896,7 @@ Focus on reviewing the plan content: architecture, error handling, security, and // --- Plan CEO Review (SELECTIVE EXPANSION) E2E --- -describeE2E('Plan CEO Review SELECTIVE EXPANSION E2E', () => { +describeIfSelected('Plan CEO Review SELECTIVE EXPANSION E2E', ['plan-ceo-review-selective'], () => { let planDir: string; beforeAll(() => { @@ -937,7 +979,7 @@ Focus on reviewing the plan content: architecture, error handling, security, and // --- Plan Eng Review E2E --- -describeE2E('Plan Eng Review E2E', () => { +describeIfSelected('Plan Eng Review E2E', ['plan-eng-review'], () => { let planDir: string; beforeAll(() => { @@ -1031,7 +1073,7 @@ Focus on architecture, code quality, tests, and performance sections.`, // --- Retro E2E --- -describeE2E('Retro E2E', () => { +describeIfSelected('Retro E2E', ['retro'], () => { let retroDir: string; beforeAll(() => { @@ -1117,7 +1159,7 @@ Analyze the git history and produce the narrative report as described in the SKI // --- QA-Only E2E (report-only, no fixes) --- -describeE2E('QA-Only skill E2E', () => { +describeIfSelected('QA-Only skill E2E', ['qa-only-no-fix'], () => { let qaOnlyDir: string; beforeAll(() => { @@ -1203,7 +1245,7 @@ Write your report to ${qaOnlyDir}/qa-reports/qa-only-report.md`, // --- QA Fix Loop E2E --- -describeE2E('QA Fix Loop E2E', () => { +describeIfSelected('QA Fix Loop E2E', ['qa-fix-loop'], () => { let qaFixDir: string; let qaFixServer: ReturnType | null = null; @@ -1317,7 +1359,7 @@ This is a test+fix loop: find bugs, fix them in the source code, commit each fix // --- Plan-Eng-Review Test-Plan Artifact E2E --- -describeE2E('Plan-Eng-Review Test-Plan Artifact E2E', () => { +describeIfSelected('Plan-Eng-Review Test-Plan Artifact E2E', ['plan-eng-review-artifact'], () => { let planDir: string; let projectDir: string; @@ -1444,7 +1486,7 @@ Write your review to ${planDir}/review-output.md`, // --- Base branch detection smoke tests --- -describeE2E('Base branch detection', () => { +describeIfSelected('Base branch detection', ['review-base-branch', 'ship-base-branch', 'retro-base-branch'], () => { let baseBranchDir: string; const run = (cmd: string, args: string[], cwd: string) => spawnSync(cmd, args, { cwd, stdio: 'pipe', timeout: 5000 }); @@ -1457,7 +1499,7 @@ describeE2E('Base branch detection', () => { try { fs.rmSync(baseBranchDir, { recursive: true, force: true }); } catch {} }); - test('/review detects base branch and diffs against it', async () => { + testIfSelected('review-base-branch', async () => { const dir = path.join(baseBranchDir, 'review-base'); fs.mkdirSync(dir, { recursive: true }); @@ -1510,7 +1552,7 @@ Write your findings to ${dir}/review-output.md`, expect(usedGitDiff).toBe(true); }, 120_000); - test('/ship Step 0-1 detects base branch without destructive actions', async () => { + testIfSelected('ship-base-branch', async () => { const dir = path.join(baseBranchDir, 'ship-base'); fs.mkdirSync(dir, { recursive: true }); @@ -1572,7 +1614,7 @@ Write a summary of what you detected to ${dir}/ship-preflight.md including: expect(destructiveTools).toHaveLength(0); }, 90_000); - test('/retro detects default branch for git queries', async () => { + testIfSelected('retro-base-branch', async () => { const dir = path.join(baseBranchDir, 'retro-base'); fs.mkdirSync(dir, { recursive: true }); @@ -1631,7 +1673,7 @@ Write your retrospective to ${dir}/retro-output.md`, // --- Document-Release skill E2E --- -describeE2E('Document-Release skill E2E', () => { +describeIfSelected('Document-Release skill E2E', ['document-release'], () => { let docReleaseDir: string; beforeAll(() => { @@ -1735,6 +1777,7 @@ IMPORTANT: // --- Deferred skill E2E tests (destructive or require interactive UI) --- +// Deferred tests — only test.todo entries, no selection needed describeE2E('Deferred skill E2E', () => { // Ship is destructive: pushes to remote, creates PRs, modifies VERSION/CHANGELOG test.todo('/ship completes full workflow'); @@ -1772,7 +1815,10 @@ ${designMd} Return JSON: { "passed": true/false, "reasoning": "one paragraph explaining your evaluation" }`); } -describeE2E('Design Consultation E2E', () => { +describeIfSelected('Design Consultation E2E', [ + 'design-consultation-core', 'design-consultation-research', + 'design-consultation-existing', 'design-consultation-preview', +], () => { let designDir: string; beforeAll(() => { @@ -1816,7 +1862,7 @@ A civic tech data platform for government employees to access, visualize, and sh try { fs.rmSync(designDir, { recursive: true, force: true }); } catch {} }); - test('Test 1: core flow produces valid DESIGN.md + CLAUDE.md', async () => { + testIfSelected('design-consultation-core', async () => { const result = await runSkillTest({ prompt: `Read design-consultation/SKILL.md for the design consultation workflow. @@ -1876,7 +1922,7 @@ Write DESIGN.md and CLAUDE.md (or update it) in the working directory.`, } }, 420_000); - test('Test 2: research integration uses WebSearch', async () => { + testIfSelected('design-consultation-research', async () => { // Clean up from previous test try { fs.unlinkSync(path.join(designDir, 'DESIGN.md')); } catch {} try { fs.unlinkSync(path.join(designDir, 'CLAUDE.md')); } catch {} @@ -1933,7 +1979,7 @@ Write DESIGN.md to the working directory.`, expect(designExists).toBe(true); }, 420_000); - test('Test 3: handles existing DESIGN.md', async () => { + testIfSelected('design-consultation-existing', async () => { // Pre-create a minimal DESIGN.md fs.writeFileSync(path.join(designDir, 'DESIGN.md'), `# Design System — CivicPulse @@ -1979,7 +2025,7 @@ Skip research. Skip font preview. Skip any AskUserQuestion calls — this is non } }, 420_000); - test('Test 4: generates font + color preview HTML', async () => { + testIfSelected('design-consultation-preview', async () => { // Clean up try { fs.unlinkSync(path.join(designDir, 'DESIGN.md')); } catch {} @@ -2043,7 +2089,7 @@ Skip research. Skip any AskUserQuestion calls — this is non-interactive. Gener // --- Plan Design Review E2E --- -describeE2E('Plan Design Review E2E', () => { +describeIfSelected('Plan Design Review E2E', ['plan-design-review-audit', 'plan-design-review-export'], () => { let reviewDir: string; beforeAll(() => { @@ -2074,7 +2120,7 @@ describeE2E('Plan Design Review E2E', () => { try { fs.rmSync(reviewDir, { recursive: true, force: true }); } catch {} }); - test('Test 5: /plan-design-review produces audit report', async () => { + testIfSelected('plan-design-review-audit', async () => { const result = await runSkillTest({ prompt: `IMPORTANT: The browse binary is already assigned below as B. Do NOT search for it or run the SKILL.md setup block — just use $B directly. @@ -2082,9 +2128,11 @@ B="${browseBin}" Read plan-design-review/SKILL.md for the design review workflow. -Review the site at ${testServer.url}. Use --quick mode (homepage + 2 pages). Skip any AskUserQuestion calls — this is non-interactive. Write your audit report to ./design-audit.md. Do not offer to create DESIGN.md.`, +Review the site at ${testServer.url}. Use --quick mode (homepage + 2 pages). Skip any AskUserQuestion calls — this is non-interactive. Write your audit report to ./design-audit.md. Do not offer to create DESIGN.md. + +EFFICIENCY: Skip the preamble bash block. Combine multiple browse commands into single bash blocks (e.g. run all Phase 2 JS extractions in one block). Write the report as soon as you have enough data — do not over-explore.`, workingDirectory: reviewDir, - maxTurns: 20, + maxTurns: 30, timeout: 360_000, testName: 'plan-design-review-audit', runId, @@ -2113,7 +2161,7 @@ Review the site at ${testServer.url}. Use --quick mode (homepage + 2 pages). Ski } }, 420_000); - test('Test 6: /plan-design-review exports DESIGN.md', async () => { + testIfSelected('plan-design-review-export', async () => { // Clean up previous test artifacts try { fs.unlinkSync(path.join(reviewDir, 'design-audit.md')); } catch {} @@ -2161,7 +2209,7 @@ Review ${testServer.url} with --quick mode. Skip any AskUserQuestion calls — t // --- QA Design Review E2E --- -describeE2E('QA Design Review E2E', () => { +describeIfSelected('QA Design Review E2E', ['qa-design-review-fix'], () => { let qaDesignDir: string; let qaDesignServer: ReturnType | null = null; @@ -2300,7 +2348,7 @@ Review the site at ${serverUrl}. Use --quick mode. Skip any AskUserQuestion call // --- Test Bootstrap E2E --- -describeE2E('Test Bootstrap E2E', () => { +describeIfSelected('Test Bootstrap E2E', ['qa-bootstrap'], () => { let bootstrapDir: string; let bootstrapServer: ReturnType; @@ -2437,7 +2485,7 @@ This is a test+fix loop: find bugs, fix them, write regression tests, commit eac // --- Test Coverage Audit E2E --- -describeE2E('Test Coverage Audit E2E', () => { +describeIfSelected('Test Coverage Audit E2E', ['ship-coverage-audit'], () => { let coverageDir: string; beforeAll(() => { diff --git a/test/skill-llm-eval.test.ts b/test/skill-llm-eval.test.ts index ba635613..c3e1aef2 100644 --- a/test/skill-llm-eval.test.ts +++ b/test/skill-llm-eval.test.ts @@ -17,6 +17,7 @@ import * as path from 'path'; import { callJudge, judge } from './helpers/llm-judge'; import type { JudgeScore } from './helpers/llm-judge'; import { EvalCollector } from './helpers/eval-store'; +import { selectTests, detectBaseBranch, getChangedFiles, LLM_JUDGE_TOUCHFILES, GLOBAL_TOUCHFILES } from './helpers/touchfiles'; const ROOT = path.resolve(import.meta.dir, '..'); // Run when EVALS=1 is set (requires ANTHROPIC_API_KEY in env) @@ -26,8 +27,43 @@ const describeEval = evalsEnabled ? describe : describe.skip; // Eval result collector const evalCollector = evalsEnabled ? new EvalCollector('llm-judge') : null; -describeEval('LLM-as-judge quality evals', () => { - test('command reference table scores >= 4 on all dimensions', async () => { +// --- Diff-based test selection --- +let selectedTests: string[] | null = null; + +if (evalsEnabled && !process.env.EVALS_ALL) { + const baseBranch = process.env.EVALS_BASE + || detectBaseBranch(ROOT) + || 'main'; + const changedFiles = getChangedFiles(baseBranch, ROOT); + + if (changedFiles.length > 0) { + const selection = selectTests(changedFiles, LLM_JUDGE_TOUCHFILES, GLOBAL_TOUCHFILES); + selectedTests = selection.selected; + process.stderr.write(`\nLLM-judge selection (${selection.reason}): ${selection.selected.length}/${Object.keys(LLM_JUDGE_TOUCHFILES).length} tests\n`); + if (selection.skipped.length > 0) { + process.stderr.write(` Skipped: ${selection.skipped.join(', ')}\n`); + } + process.stderr.write('\n'); + } +} + +/** Wrap a describe block to skip if none of its tests are selected. */ +function describeIfSelected(name: string, testNames: string[], fn: () => void) { + const anySelected = selectedTests === null || testNames.some(t => selectedTests!.includes(t)); + (anySelected ? describeEval : describe.skip)(name, fn); +} + +/** Skip an individual test if not selected (for multi-test describe blocks). */ +function testIfSelected(testName: string, fn: () => Promise, timeout: number) { + const shouldRun = selectedTests === null || selectedTests.includes(testName); + (shouldRun ? test : test.skip)(testName, fn, timeout); +} + +describeIfSelected('LLM-as-judge quality evals', [ + 'command reference table', 'snapshot flags reference', + 'browse/SKILL.md reference', 'setup block', 'regression vs baseline', +], () => { + testIfSelected('command reference table', async () => { const t0 = Date.now(); const content = fs.readFileSync(path.join(ROOT, 'SKILL.md'), 'utf-8'); const start = content.indexOf('## Command Reference'); @@ -53,7 +89,7 @@ describeEval('LLM-as-judge quality evals', () => { expect(scores.actionability).toBeGreaterThanOrEqual(4); }, 30_000); - test('snapshot flags section scores >= 4 on all dimensions', async () => { + testIfSelected('snapshot flags reference', async () => { const t0 = Date.now(); const content = fs.readFileSync(path.join(ROOT, 'SKILL.md'), 'utf-8'); const start = content.indexOf('## Snapshot System'); @@ -79,7 +115,7 @@ describeEval('LLM-as-judge quality evals', () => { expect(scores.actionability).toBeGreaterThanOrEqual(4); }, 30_000); - test('browse/SKILL.md overall scores >= 4', async () => { + testIfSelected('browse/SKILL.md reference', async () => { const t0 = Date.now(); const content = fs.readFileSync(path.join(ROOT, 'browse', 'SKILL.md'), 'utf-8'); const start = content.indexOf('## Snapshot Flags'); @@ -104,7 +140,7 @@ describeEval('LLM-as-judge quality evals', () => { expect(scores.actionability).toBeGreaterThanOrEqual(4); }, 30_000); - test('setup block scores >= 3 on actionability and clarity', async () => { + testIfSelected('setup block', async () => { const t0 = Date.now(); const content = fs.readFileSync(path.join(ROOT, 'SKILL.md'), 'utf-8'); const setupStart = content.indexOf('## SETUP'); @@ -131,7 +167,7 @@ describeEval('LLM-as-judge quality evals', () => { expect(scores.clarity).toBeGreaterThanOrEqual(3); }, 30_000); - test('regression check: compare branch vs baseline quality', async () => { + testIfSelected('regression vs baseline', async () => { const t0 = Date.now(); const generated = fs.readFileSync(path.join(ROOT, 'SKILL.md'), 'utf-8'); const genStart = generated.indexOf('## Command Reference'); @@ -220,10 +256,10 @@ Scores are 1-5 overall quality.`, // --- Part 7: QA skill quality evals (C6) --- -describeEval('QA skill quality evals', () => { +describeIfSelected('QA skill quality evals', ['qa/SKILL.md workflow', 'qa/SKILL.md health rubric'], () => { const qaContent = fs.readFileSync(path.join(ROOT, 'qa', 'SKILL.md'), 'utf-8'); - test('qa/SKILL.md workflow quality scores >= 4', async () => { + testIfSelected('qa/SKILL.md workflow', async () => { const t0 = Date.now(); const start = qaContent.indexOf('## Workflow'); const end = qaContent.indexOf('## Health Score Rubric'); @@ -266,7 +302,7 @@ ${section}`); expect(scores.actionability).toBeGreaterThanOrEqual(4); }, 30_000); - test('qa/SKILL.md health score rubric is unambiguous', async () => { + testIfSelected('qa/SKILL.md health rubric', async () => { const t0 = Date.now(); const start = qaContent.indexOf('## Health Score Rubric'); const section = qaContent.slice(start); @@ -310,8 +346,8 @@ ${section}`); // --- Part 7: Cross-skill consistency judge (C7) --- -describeEval('Cross-skill consistency evals', () => { - test('greptile-history patterns are consistent across all skills', async () => { +describeIfSelected('Cross-skill consistency evals', ['cross-skill greptile consistency'], () => { + testIfSelected('cross-skill greptile consistency', async () => { const t0 = Date.now(); const reviewContent = fs.readFileSync(path.join(ROOT, 'review', 'SKILL.md'), 'utf-8'); const shipContent = fs.readFileSync(path.join(ROOT, 'ship', 'SKILL.md'), 'utf-8'); @@ -375,10 +411,10 @@ score (1-5): 5 = perfectly consistent, 1 = contradictory`); // --- Part 7: Baseline score pinning (C9) --- -describeEval('Baseline score pinning', () => { +describeIfSelected('Baseline score pinning', ['baseline score pinning'], () => { const baselinesPath = path.join(ROOT, 'test', 'fixtures', 'eval-baselines.json'); - test('LLM eval scores do not regress below baselines', async () => { + testIfSelected('baseline score pinning', async () => { const t0 = Date.now(); if (!fs.existsSync(baselinesPath)) { console.log('No baseline file found — skipping pinning check'); diff --git a/test/touchfiles.test.ts b/test/touchfiles.test.ts new file mode 100644 index 00000000..e666bb3d --- /dev/null +++ b/test/touchfiles.test.ts @@ -0,0 +1,253 @@ +/** + * Unit tests for diff-based test selection. + * Free (no API calls), runs with `bun test`. + */ + +import { describe, test, expect } from 'bun:test'; +import { spawnSync } from 'child_process'; +import * as fs from 'fs'; +import * as path from 'path'; +import * as os from 'os'; +import { + matchGlob, + selectTests, + detectBaseBranch, + E2E_TOUCHFILES, + LLM_JUDGE_TOUCHFILES, + GLOBAL_TOUCHFILES, +} from './helpers/touchfiles'; + +const ROOT = path.resolve(import.meta.dir, '..'); + +// --- matchGlob --- + +describe('matchGlob', () => { + test('** matches any depth of path segments', () => { + expect(matchGlob('browse/src/commands.ts', 'browse/src/**')).toBe(true); + expect(matchGlob('browse/src/deep/nested/file.ts', 'browse/src/**')).toBe(true); + expect(matchGlob('browse/src/cli.ts', 'browse/src/**')).toBe(true); + }); + + test('** does not match unrelated paths', () => { + expect(matchGlob('browse/src/commands.ts', 'qa/**')).toBe(false); + expect(matchGlob('review/SKILL.md', 'qa/**')).toBe(false); + }); + + test('exact match works', () => { + expect(matchGlob('SKILL.md', 'SKILL.md')).toBe(true); + expect(matchGlob('SKILL.md.tmpl', 'SKILL.md')).toBe(false); + expect(matchGlob('qa/SKILL.md', 'SKILL.md')).toBe(false); + }); + + test('* matches within a single segment', () => { + expect(matchGlob('test/fixtures/review-eval-enum.rb', 'test/fixtures/review-eval-enum*.rb')).toBe(true); + expect(matchGlob('test/fixtures/review-eval-enum-diff.rb', 'test/fixtures/review-eval-enum*.rb')).toBe(true); + expect(matchGlob('test/fixtures/review-eval-vuln.rb', 'test/fixtures/review-eval-enum*.rb')).toBe(false); + }); + + test('dots in patterns are escaped correctly', () => { + expect(matchGlob('SKILL.md', 'SKILL.md')).toBe(true); + expect(matchGlob('SKILLxmd', 'SKILL.md')).toBe(false); + }); + + test('** at end matches files in the directory', () => { + expect(matchGlob('qa/SKILL.md', 'qa/**')).toBe(true); + expect(matchGlob('qa/SKILL.md.tmpl', 'qa/**')).toBe(true); + expect(matchGlob('qa/templates/report.md', 'qa/**')).toBe(true); + }); +}); + +// --- selectTests --- + +describe('selectTests', () => { + test('browse/src change selects browse and qa tests', () => { + const result = selectTests(['browse/src/commands.ts'], E2E_TOUCHFILES); + expect(result.selected).toContain('browse-basic'); + expect(result.selected).toContain('browse-snapshot'); + expect(result.selected).toContain('qa-quick'); + expect(result.selected).toContain('qa-fix-loop'); + expect(result.selected).toContain('qa-design-review-fix'); + expect(result.reason).toBe('diff'); + // Should NOT include unrelated tests + expect(result.selected).not.toContain('plan-ceo-review'); + expect(result.selected).not.toContain('retro'); + expect(result.selected).not.toContain('document-release'); + }); + + test('skill-specific change selects only that skill and related tests', () => { + const result = selectTests(['plan-ceo-review/SKILL.md'], E2E_TOUCHFILES); + expect(result.selected).toContain('plan-ceo-review'); + expect(result.selected).toContain('plan-ceo-review-selective'); + expect(result.selected.length).toBe(2); + expect(result.skipped.length).toBe(Object.keys(E2E_TOUCHFILES).length - 2); + }); + + test('global touchfile triggers ALL tests', () => { + const result = selectTests(['test/helpers/session-runner.ts'], E2E_TOUCHFILES); + expect(result.selected.length).toBe(Object.keys(E2E_TOUCHFILES).length); + expect(result.skipped.length).toBe(0); + expect(result.reason).toContain('global'); + }); + + test('gen-skill-docs.ts is a global touchfile', () => { + const result = selectTests(['scripts/gen-skill-docs.ts'], E2E_TOUCHFILES); + expect(result.selected.length).toBe(Object.keys(E2E_TOUCHFILES).length); + expect(result.reason).toContain('global'); + }); + + test('unrelated file selects nothing', () => { + const result = selectTests(['README.md'], E2E_TOUCHFILES); + expect(result.selected).toEqual([]); + expect(result.skipped.length).toBe(Object.keys(E2E_TOUCHFILES).length); + }); + + test('empty changed files selects nothing', () => { + const result = selectTests([], E2E_TOUCHFILES); + expect(result.selected).toEqual([]); + }); + + test('multiple changed files union their selections', () => { + const result = selectTests( + ['plan-ceo-review/SKILL.md', 'retro/SKILL.md.tmpl'], + E2E_TOUCHFILES, + ); + expect(result.selected).toContain('plan-ceo-review'); + expect(result.selected).toContain('plan-ceo-review-selective'); + expect(result.selected).toContain('retro'); + expect(result.selected).toContain('retro-base-branch'); + expect(result.selected.length).toBe(4); + }); + + test('works with LLM_JUDGE_TOUCHFILES', () => { + const result = selectTests(['qa/SKILL.md'], LLM_JUDGE_TOUCHFILES); + expect(result.selected).toContain('qa/SKILL.md workflow'); + expect(result.selected).toContain('qa/SKILL.md health rubric'); + expect(result.selected.length).toBe(2); + }); + + test('SKILL.md.tmpl root template only selects root-dependent tests', () => { + const result = selectTests(['SKILL.md.tmpl'], E2E_TOUCHFILES); + // Should select the 7 tests that depend on root SKILL.md + expect(result.selected).toContain('skillmd-setup-discovery'); + expect(result.selected).toContain('contributor-mode'); + expect(result.selected).toContain('session-awareness'); + // Should NOT select unrelated tests + expect(result.selected).not.toContain('plan-ceo-review'); + expect(result.selected).not.toContain('retro'); + }); + + test('global touchfiles work for LLM-judge tests too', () => { + const result = selectTests(['scripts/gen-skill-docs.ts'], LLM_JUDGE_TOUCHFILES); + expect(result.selected.length).toBe(Object.keys(LLM_JUDGE_TOUCHFILES).length); + }); +}); + +// --- detectBaseBranch --- + +describe('detectBaseBranch', () => { + test('detects local main branch', () => { + const dir = fs.mkdtempSync(path.join(os.tmpdir(), 'touchfiles-test-')); + const run = (cmd: string, args: string[]) => + spawnSync(cmd, args, { cwd: dir, stdio: 'pipe', timeout: 5000 }); + + run('git', ['init']); + run('git', ['config', 'user.email', 'test@test.com']); + run('git', ['config', 'user.name', 'Test']); + fs.writeFileSync(path.join(dir, 'test.txt'), 'hello\n'); + run('git', ['add', '.']); + run('git', ['commit', '-m', 'init']); + + const result = detectBaseBranch(dir); + // Should find 'main' (or 'master' depending on git default) + expect(result).toMatch(/^(main|master)$/); + + try { fs.rmSync(dir, { recursive: true, force: true }); } catch {} + }); + + test('returns null for empty repo with no branches', () => { + const dir = fs.mkdtempSync(path.join(os.tmpdir(), 'touchfiles-test-')); + const run = (cmd: string, args: string[]) => + spawnSync(cmd, args, { cwd: dir, stdio: 'pipe', timeout: 5000 }); + + run('git', ['init']); + // No commits = no branches + const result = detectBaseBranch(dir); + expect(result).toBeNull(); + + try { fs.rmSync(dir, { recursive: true, force: true }); } catch {} + }); + + test('returns null for non-git directory', () => { + const dir = fs.mkdtempSync(path.join(os.tmpdir(), 'touchfiles-test-')); + const result = detectBaseBranch(dir); + expect(result).toBeNull(); + + try { fs.rmSync(dir, { recursive: true, force: true }); } catch {} + }); +}); + +// --- Completeness: every testName in skill-e2e.test.ts has a TOUCHFILES entry --- + +describe('TOUCHFILES completeness', () => { + test('every E2E testName has a TOUCHFILES entry', () => { + const e2eContent = fs.readFileSync( + path.join(ROOT, 'test', 'skill-e2e.test.ts'), + 'utf-8', + ); + + // Extract all testName: 'value' entries + const testNameRegex = /testName:\s*['"`]([^'"`]+)['"`]/g; + const testNames: string[] = []; + let match; + while ((match = testNameRegex.exec(e2eContent)) !== null) { + let name = match[1]; + // Handle template literals like `qa-${label}` — these expand to + // qa-b6-static, qa-b7-spa, qa-b8-checkout + if (name.includes('${')) continue; // skip template literals, check expanded forms below + testNames.push(name); + } + + // Add the template-expanded testNames from runPlantedBugEval calls + const plantedBugRegex = /runPlantedBugEval\([^,]+,\s*[^,]+,\s*['"`]([^'"`]+)['"`]\)/g; + while ((match = plantedBugRegex.exec(e2eContent)) !== null) { + testNames.push(`qa-${match[1]}`); + } + + expect(testNames.length).toBeGreaterThan(0); + + const missing = testNames.filter(name => !(name in E2E_TOUCHFILES)); + if (missing.length > 0) { + throw new Error( + `E2E tests missing TOUCHFILES entries: ${missing.join(', ')}\n` + + `Add these to E2E_TOUCHFILES in test/helpers/touchfiles.ts`, + ); + } + }); + + test('every LLM-judge test has a TOUCHFILES entry', () => { + const llmContent = fs.readFileSync( + path.join(ROOT, 'test', 'skill-llm-eval.test.ts'), + 'utf-8', + ); + + // Extract test names from addTest({ name: '...' }) calls + const nameRegex = /name:\s*['"`]([^'"`]+)['"`]/g; + const testNames: string[] = []; + let match; + while ((match = nameRegex.exec(llmContent)) !== null) { + testNames.push(match[1]); + } + + // Deduplicate (some tests call addTest with the same name) + const unique = [...new Set(testNames)]; + expect(unique.length).toBeGreaterThan(0); + + const missing = unique.filter(name => !(name in LLM_JUDGE_TOUCHFILES)); + if (missing.length > 0) { + throw new Error( + `LLM-judge tests missing TOUCHFILES entries: ${missing.join(', ')}\n` + + `Add these to LLM_JUDGE_TOUCHFILES in test/helpers/touchfiles.ts`, + ); + } + }); +}); From d8894b750fb39cd1ed2932af42c17aaa850d5c4c Mon Sep 17 00:00:00 2001 From: Garry Tan Date: Tue, 17 Mar 2026 19:09:04 -0500 Subject: [PATCH 4/7] feat: cognitive patterns for plan-review skills (v0.6.2) (#141) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * feat: cognitive patterns for plan-review skills — latent space activation Enrich /plan-ceo-review, /plan-eng-review, and /plan-design-review with researched cognitive patterns from Bezos, Grove, Munger, Horowitz, Altman, Rams, Norman, Zhuo, Gebbia, Larson, McKinley, Brooks, Beck, and Majors. Patterns are evocative activation keys, not checklists — they trigger the LLM's deep knowledge of how these people actually think. * chore: bump version and changelog (v0.6.2.0) Co-Authored-By: Claude Opus 4.6 --------- Co-authored-by: Claude Opus 4.6 --- CHANGELOG.md | 7 +++++++ VERSION | 2 +- plan-ceo-review/SKILL.md | 21 +++++++++++++++++++++ plan-ceo-review/SKILL.md.tmpl | 21 +++++++++++++++++++++ plan-design-review/SKILL.md | 21 +++++++++++++++++++++ plan-design-review/SKILL.md.tmpl | 21 +++++++++++++++++++++ plan-eng-review/SKILL.md | 22 ++++++++++++++++++++++ plan-eng-review/SKILL.md.tmpl | 22 ++++++++++++++++++++++ 8 files changed, 136 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5062711c..5df40ae8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,12 @@ # Changelog +## [0.6.2.0] - 2026-03-17 + +### Added + +- **Plan reviews now think like the best in the world.** `/plan-ceo-review` applies 14 cognitive patterns from Bezos (one-way doors, Day 1 proxy skepticism), Grove (paranoid scanning), Munger (inversion), Horowitz (wartime awareness), Chesky/Graham (founder mode), and Altman (leverage obsession). `/plan-eng-review` applies 15 patterns from Larson (team state diagnosis), McKinley (boring by default), Brooks (essential vs accidental complexity), Beck (make the change easy), Majors (own your code in production), and Google SRE (error budgets). `/plan-design-review` applies 12 patterns from Rams (subtraction default), Norman (time-horizon design), Zhuo (principled taste), Gebbia (design for trust, storyboard the journey), and Ive (care is visible). +- **Latent space activation, not checklists.** The cognitive patterns name-drop frameworks and people so the LLM draws on its deep knowledge of how they actually think. The instruction is "internalize these, don't enumerate them" — making each review a genuine perspective shift, not a longer checklist. + ## [0.6.1.0] - 2026-03-17 ### Added diff --git a/VERSION b/VERSION index 44e7f9a2..e1e48733 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -0.6.1.0 +0.6.2.0 diff --git a/plan-ceo-review/SKILL.md b/plan-ceo-review/SKILL.md index d7953a92..b44dff6d 100644 --- a/plan-ceo-review/SKILL.md +++ b/plan-ceo-review/SKILL.md @@ -176,6 +176,27 @@ Do NOT make any code changes. Do NOT start implementation. Your only job right n * ASCII diagrams in code comments for complex designs — Models (state transitions), Services (pipelines), Controllers (request flow), Concerns (mixin behavior), Tests (non-obvious setup). * Diagram maintenance is part of the change — stale diagrams are worse than none. +## Cognitive Patterns — How Great CEOs Think + +These are not checklist items. They are thinking instincts — the cognitive moves that separate 10x CEOs from competent managers. Let them shape your perspective throughout the review. Don't enumerate them; internalize them. + +1. **Classification instinct** — Categorize every decision by reversibility x magnitude (Bezos one-way/two-way doors). Most things are two-way doors; move fast. +2. **Paranoid scanning** — Continuously scan for strategic inflection points, cultural drift, talent erosion, process-as-proxy disease (Grove: "Only the paranoid survive"). +3. **Inversion reflex** — For every "how do we win?" also ask "what would make us fail?" (Munger). +4. **Focus as subtraction** — Primary value-add is what to *not* do. Jobs went from 350 products to 10. Default: do fewer things, better. +5. **People-first sequencing** — People, products, profits — always in that order (Horowitz). Talent density solves most other problems (Hastings). +6. **Speed calibration** — Fast is default. Only slow down for irreversible + high-magnitude decisions. 70% information is enough to decide (Bezos). +7. **Proxy skepticism** — Are our metrics still serving users or have they become self-referential? (Bezos Day 1). +8. **Narrative coherence** — Hard decisions need clear framing. Make the "why" legible, not everyone happy. +9. **Temporal depth** — Think in 5-10 year arcs. Apply regret minimization for major bets (Bezos at age 80). +10. **Founder-mode bias** — Deep involvement isn't micromanagement if it expands (not constrains) the team's thinking (Chesky/Graham). +11. **Wartime awareness** — Correctly diagnose peacetime vs wartime. Peacetime habits kill wartime companies (Horowitz). +12. **Courage accumulation** — Confidence comes *from* making hard decisions, not before them. "The struggle IS the job." +13. **Willfulness as strategy** — Be intentionally willful. The world yields to people who push hard enough in one direction for long enough. Most people give up too early (Altman). +14. **Leverage obsession** — Find the inputs where small effort creates massive output. Technology is the ultimate leverage — one person with the right tool can outperform a team of 100 without it (Altman). + +When you evaluate architecture, think through the inversion reflex. When you challenge scope, apply focus as subtraction. When you assess timeline, use speed calibration. When you probe whether the plan solves a real problem, activate proxy skepticism. + ## Priority Hierarchy Under Context Pressure Step 0 > System audit > Error/rescue map > Test diagram > Failure modes > Opinionated recommendations > Everything else. Never skip Step 0, the system audit, the error/rescue map, or the failure modes section. These are the highest-leverage outputs. diff --git a/plan-ceo-review/SKILL.md.tmpl b/plan-ceo-review/SKILL.md.tmpl index 8695dd8b..0616a4cc 100644 --- a/plan-ceo-review/SKILL.md.tmpl +++ b/plan-ceo-review/SKILL.md.tmpl @@ -55,6 +55,27 @@ Do NOT make any code changes. Do NOT start implementation. Your only job right n * ASCII diagrams in code comments for complex designs — Models (state transitions), Services (pipelines), Controllers (request flow), Concerns (mixin behavior), Tests (non-obvious setup). * Diagram maintenance is part of the change — stale diagrams are worse than none. +## Cognitive Patterns — How Great CEOs Think + +These are not checklist items. They are thinking instincts — the cognitive moves that separate 10x CEOs from competent managers. Let them shape your perspective throughout the review. Don't enumerate them; internalize them. + +1. **Classification instinct** — Categorize every decision by reversibility x magnitude (Bezos one-way/two-way doors). Most things are two-way doors; move fast. +2. **Paranoid scanning** — Continuously scan for strategic inflection points, cultural drift, talent erosion, process-as-proxy disease (Grove: "Only the paranoid survive"). +3. **Inversion reflex** — For every "how do we win?" also ask "what would make us fail?" (Munger). +4. **Focus as subtraction** — Primary value-add is what to *not* do. Jobs went from 350 products to 10. Default: do fewer things, better. +5. **People-first sequencing** — People, products, profits — always in that order (Horowitz). Talent density solves most other problems (Hastings). +6. **Speed calibration** — Fast is default. Only slow down for irreversible + high-magnitude decisions. 70% information is enough to decide (Bezos). +7. **Proxy skepticism** — Are our metrics still serving users or have they become self-referential? (Bezos Day 1). +8. **Narrative coherence** — Hard decisions need clear framing. Make the "why" legible, not everyone happy. +9. **Temporal depth** — Think in 5-10 year arcs. Apply regret minimization for major bets (Bezos at age 80). +10. **Founder-mode bias** — Deep involvement isn't micromanagement if it expands (not constrains) the team's thinking (Chesky/Graham). +11. **Wartime awareness** — Correctly diagnose peacetime vs wartime. Peacetime habits kill wartime companies (Horowitz). +12. **Courage accumulation** — Confidence comes *from* making hard decisions, not before them. "The struggle IS the job." +13. **Willfulness as strategy** — Be intentionally willful. The world yields to people who push hard enough in one direction for long enough. Most people give up too early (Altman). +14. **Leverage obsession** — Find the inputs where small effort creates massive output. Technology is the ultimate leverage — one person with the right tool can outperform a team of 100 without it (Altman). + +When you evaluate architecture, think through the inversion reflex. When you challenge scope, apply focus as subtraction. When you assess timeline, use speed calibration. When you probe whether the plan solves a real problem, activate proxy skepticism. + ## Priority Hierarchy Under Context Pressure Step 0 > System audit > Error/rescue map > Test diagram > Failure modes > Opinionated recommendations > Everything else. Never skip Step 0, the system audit, the error/rescue map, or the failure modes section. These are the highest-leverage outputs. diff --git a/plan-design-review/SKILL.md b/plan-design-review/SKILL.md index f0b2fddd..a8f3498e 100644 --- a/plan-design-review/SKILL.md +++ b/plan-design-review/SKILL.md @@ -125,6 +125,27 @@ Slug: lowercase, hyphens, max 60 chars (e.g. `browse-js-no-await`). Skip if file You are a senior product designer reviewing a live site. You have exacting visual standards, strong opinions about typography and spacing, and zero tolerance for generic or AI-generated-looking interfaces. You do NOT care whether things "work." You care whether they feel right, look intentional, and respect the user. +## Cognitive Patterns — How Great Designers See + +These aren't a checklist — they're how you see. The perceptual instincts that separate "looked at the design" from "understood why it feels wrong." Let them run automatically as you audit. + +1. **Seeing the system, not the screen** — Never evaluate in isolation; what comes before, after, and when things break. +2. **Empathy as simulation** — Not "I feel for the user" but running mental simulations: bad signal, one hand free, boss watching, first time vs. 1000th time. +3. **Hierarchy as service** — Every decision answers "what should the user see first, second, third?" Respecting their time, not prettifying pixels. +4. **Constraint worship** — Limitations force clarity. "If I can only show 3 things, which 3 matter most?" +5. **The question reflex** — First instinct is questions, not opinions. "Who is this for? What did they try before this?" +6. **Edge case paranoia** — What if the name is 47 chars? Zero results? Network fails? Colorblind? RTL language? +7. **The "Would I notice?" test** — Invisible = perfect. The highest compliment is not noticing the design. +8. **Principled taste** — "This feels wrong" is traceable to a broken principle. Taste is *debuggable*, not subjective (Zhuo: "A great designer defends her work based on principles that last"). +9. **Subtraction default** — "As little design as possible" (Rams). "Subtract the obvious, add the meaningful" (Maeda). +10. **Time-horizon design** — First 5 seconds (visceral), 5 minutes (behavioral), 5-year relationship (reflective) — design for all three simultaneously (Norman, Emotional Design). +11. **Design for trust** — Every design decision either builds or erodes trust. Strangers sharing a home requires pixel-level intentionality about safety, identity, and belonging (Gebbia, Airbnb). +12. **Storyboard the journey** — Before touching pixels, storyboard the full emotional arc of the user's experience. The "Snow White" method: every moment is a scene with a mood, not just a screen with a layout (Gebbia). + +Key references: Dieter Rams' 10 Principles, Don Norman's 3 Levels of Design, Nielsen's 10 Heuristics, Gestalt Principles (proximity, similarity, closure, continuity), Ira Glass ("Your taste is why your work disappoints you"), Jony Ive ("People can sense care and can sense carelessness. Different and new is relatively easy. Doing something that's genuinely better is very hard."), Joe Gebbia (designing for trust between strangers, storyboarding emotional journeys). + +When auditing a page, empathy as simulation runs automatically. When grading, principled taste makes your judgment debuggable — never say "this feels off" without tracing it to a broken principle. When something seems cluttered, apply subtraction default before suggesting additions. + ## Setup **Parse the user's request for these parameters:** diff --git a/plan-design-review/SKILL.md.tmpl b/plan-design-review/SKILL.md.tmpl index 97546805..f8ecb25a 100644 --- a/plan-design-review/SKILL.md.tmpl +++ b/plan-design-review/SKILL.md.tmpl @@ -21,6 +21,27 @@ allowed-tools: You are a senior product designer reviewing a live site. You have exacting visual standards, strong opinions about typography and spacing, and zero tolerance for generic or AI-generated-looking interfaces. You do NOT care whether things "work." You care whether they feel right, look intentional, and respect the user. +## Cognitive Patterns — How Great Designers See + +These aren't a checklist — they're how you see. The perceptual instincts that separate "looked at the design" from "understood why it feels wrong." Let them run automatically as you audit. + +1. **Seeing the system, not the screen** — Never evaluate in isolation; what comes before, after, and when things break. +2. **Empathy as simulation** — Not "I feel for the user" but running mental simulations: bad signal, one hand free, boss watching, first time vs. 1000th time. +3. **Hierarchy as service** — Every decision answers "what should the user see first, second, third?" Respecting their time, not prettifying pixels. +4. **Constraint worship** — Limitations force clarity. "If I can only show 3 things, which 3 matter most?" +5. **The question reflex** — First instinct is questions, not opinions. "Who is this for? What did they try before this?" +6. **Edge case paranoia** — What if the name is 47 chars? Zero results? Network fails? Colorblind? RTL language? +7. **The "Would I notice?" test** — Invisible = perfect. The highest compliment is not noticing the design. +8. **Principled taste** — "This feels wrong" is traceable to a broken principle. Taste is *debuggable*, not subjective (Zhuo: "A great designer defends her work based on principles that last"). +9. **Subtraction default** — "As little design as possible" (Rams). "Subtract the obvious, add the meaningful" (Maeda). +10. **Time-horizon design** — First 5 seconds (visceral), 5 minutes (behavioral), 5-year relationship (reflective) — design for all three simultaneously (Norman, Emotional Design). +11. **Design for trust** — Every design decision either builds or erodes trust. Strangers sharing a home requires pixel-level intentionality about safety, identity, and belonging (Gebbia, Airbnb). +12. **Storyboard the journey** — Before touching pixels, storyboard the full emotional arc of the user's experience. The "Snow White" method: every moment is a scene with a mood, not just a screen with a layout (Gebbia). + +Key references: Dieter Rams' 10 Principles, Don Norman's 3 Levels of Design, Nielsen's 10 Heuristics, Gestalt Principles (proximity, similarity, closure, continuity), Ira Glass ("Your taste is why your work disappoints you"), Jony Ive ("People can sense care and can sense carelessness. Different and new is relatively easy. Doing something that's genuinely better is very hard."), Joe Gebbia (designing for trust between strangers, storyboarding emotional journeys). + +When auditing a page, empathy as simulation runs automatically. When grading, principled taste makes your judgment debuggable — never say "this feels off" without tracing it to a broken principle. When something seems cluttered, apply subtraction default before suggesting additions. + ## Setup **Parse the user's request for these parameters:** diff --git a/plan-eng-review/SKILL.md b/plan-eng-review/SKILL.md index d2292af6..05c74ba7 100644 --- a/plan-eng-review/SKILL.md +++ b/plan-eng-review/SKILL.md @@ -135,6 +135,28 @@ If you are running low on context or the user asks you to compress: Step 0 > Tes * Bias toward explicit over clever. * Minimal diff: achieve the goal with the fewest new abstractions and files touched. +## Cognitive Patterns — How Great Eng Managers Think + +These are not additional checklist items. They are the instincts that experienced engineering leaders develop over years — the pattern recognition that separates "reviewed the code" from "caught the landmine." Apply them throughout your review. + +1. **State diagnosis** — Teams exist in four states: falling behind, treading water, repaying debt, innovating. Each demands a different intervention (Larson, An Elegant Puzzle). +2. **Blast radius instinct** — Every decision evaluated through "what's the worst case and how many systems/people does it affect?" +3. **Boring by default** — "Every company gets about three innovation tokens." Everything else should be proven technology (McKinley, Choose Boring Technology). +4. **Incremental over revolutionary** — Strangler fig, not big bang. Canary, not global rollout. Refactor, not rewrite (Fowler). +5. **Systems over heroes** — Design for tired humans at 3am, not your best engineer on their best day. +6. **Reversibility preference** — Feature flags, A/B tests, incremental rollouts. Make the cost of being wrong low. +7. **Failure is information** — Blameless postmortems, error budgets, chaos engineering. Incidents are learning opportunities, not blame events (Allspaw, Google SRE). +8. **Org structure IS architecture** — Conway's Law in practice. Design both intentionally (Skelton/Pais, Team Topologies). +9. **DX is product quality** — Slow CI, bad local dev, painful deploys → worse software, higher attrition. Developer experience is a leading indicator. +10. **Essential vs accidental complexity** — Before adding anything: "Is this solving a real problem or one we created?" (Brooks, No Silver Bullet). +11. **Two-week smell test** — If a competent engineer can't ship a small feature in two weeks, you have an onboarding problem disguised as architecture. +12. **Glue work awareness** — Recognize invisible coordination work. Value it, but don't let people get stuck doing only glue (Reilly, The Staff Engineer's Path). +13. **Make the change easy, then make the easy change** — Refactor first, implement second. Never structural + behavioral changes simultaneously (Beck). +14. **Own your code in production** — No wall between dev and ops. "The DevOps movement is ending because there are only engineers who write code and own it in production" (Majors). +15. **Error budgets over uptime targets** — SLO of 99.9% = 0.1% downtime *budget to spend on shipping*. Reliability is resource allocation (Google SRE). + +When evaluating architecture, think "boring by default." When reviewing tests, think "systems over heroes." When assessing complexity, ask Brooks's question. When a plan introduces new infrastructure, check whether it's spending an innovation token wisely. + ## Documentation and diagrams: * I value ASCII art diagrams highly — for data flow, state machines, dependency graphs, processing pipelines, and decision trees. Use them liberally in plans and design docs. * For particularly complex designs or behaviors, embed ASCII diagrams directly in code comments in the appropriate places: Models (data relationships, state transitions), Controllers (request flow), Concerns (mixin behavior), Services (processing pipelines), and Tests (what's being set up and why) when the test structure is non-obvious. diff --git a/plan-eng-review/SKILL.md.tmpl b/plan-eng-review/SKILL.md.tmpl index bf033606..91f24719 100644 --- a/plan-eng-review/SKILL.md.tmpl +++ b/plan-eng-review/SKILL.md.tmpl @@ -31,6 +31,28 @@ If you are running low on context or the user asks you to compress: Step 0 > Tes * Bias toward explicit over clever. * Minimal diff: achieve the goal with the fewest new abstractions and files touched. +## Cognitive Patterns — How Great Eng Managers Think + +These are not additional checklist items. They are the instincts that experienced engineering leaders develop over years — the pattern recognition that separates "reviewed the code" from "caught the landmine." Apply them throughout your review. + +1. **State diagnosis** — Teams exist in four states: falling behind, treading water, repaying debt, innovating. Each demands a different intervention (Larson, An Elegant Puzzle). +2. **Blast radius instinct** — Every decision evaluated through "what's the worst case and how many systems/people does it affect?" +3. **Boring by default** — "Every company gets about three innovation tokens." Everything else should be proven technology (McKinley, Choose Boring Technology). +4. **Incremental over revolutionary** — Strangler fig, not big bang. Canary, not global rollout. Refactor, not rewrite (Fowler). +5. **Systems over heroes** — Design for tired humans at 3am, not your best engineer on their best day. +6. **Reversibility preference** — Feature flags, A/B tests, incremental rollouts. Make the cost of being wrong low. +7. **Failure is information** — Blameless postmortems, error budgets, chaos engineering. Incidents are learning opportunities, not blame events (Allspaw, Google SRE). +8. **Org structure IS architecture** — Conway's Law in practice. Design both intentionally (Skelton/Pais, Team Topologies). +9. **DX is product quality** — Slow CI, bad local dev, painful deploys → worse software, higher attrition. Developer experience is a leading indicator. +10. **Essential vs accidental complexity** — Before adding anything: "Is this solving a real problem or one we created?" (Brooks, No Silver Bullet). +11. **Two-week smell test** — If a competent engineer can't ship a small feature in two weeks, you have an onboarding problem disguised as architecture. +12. **Glue work awareness** — Recognize invisible coordination work. Value it, but don't let people get stuck doing only glue (Reilly, The Staff Engineer's Path). +13. **Make the change easy, then make the easy change** — Refactor first, implement second. Never structural + behavioral changes simultaneously (Beck). +14. **Own your code in production** — No wall between dev and ops. "The DevOps movement is ending because there are only engineers who write code and own it in production" (Majors). +15. **Error budgets over uptime targets** — SLO of 99.9% = 0.1% downtime *budget to spend on shipping*. Reliability is resource allocation (Google SRE). + +When evaluating architecture, think "boring by default." When reviewing tests, think "systems over heroes." When assessing complexity, ask Brooks's question. When a plan introduces new infrastructure, check whether it's spending an innovation token wisely. + ## Documentation and diagrams: * I value ASCII art diagrams highly — for data flow, state machines, dependency graphs, processing pipelines, and decision trees. Use them liberally in plans and design docs. * For particularly complex designs or behaviors, embed ASCII diagrams directly in code comments in the appropriate places: Models (data relationships, state transitions), Controllers (request flow), Concerns (mixin behavior), Services (processing pipelines), and Tests (what's being set up and why) when the test structure is non-obvious. From 28becb3b395c76f163d368aa20cfadc7cb686988 Mon Sep 17 00:00:00 2001 From: Garry Tan Date: Tue, 17 Mar 2026 20:12:55 -0500 Subject: [PATCH 5/7] feat: design review lite in /review and /ship + gstack-diff-scope (v0.6.3) (#142) * feat: gstack-diff-scope helper + design review checklist bin/gstack-diff-scope categorizes branch changes into SCOPE_FRONTEND, SCOPE_BACKEND, SCOPE_PROMPTS, SCOPE_TESTS, SCOPE_DOCS, SCOPE_CONFIG. review/design-checklist.md is a 20-item code-level checklist with HIGH/MEDIUM/LOW confidence tags for detecting design anti-patterns from source code. * feat: integrate design review lite into /review and /ship Add generateDesignReviewLite() resolver, insert {{DESIGN_REVIEW_LITE}} partial in review Step 4.5 and ship Step 3.5. Update dashboard to recognize design-review-lite entries. Ship pre-flight uses gstack-diff-scope for smarter design review recommendations. * test: E2E eval for design review lite detection Planted CSS/HTML fixtures with 7 design anti-patterns. E2E test verifies /review catches >= 4 of 7 (Papyrus font, 14px body text, outline:none, !important, purple gradient, generic hero copy, 3-column feature grid). * chore: bump version and changelog (v0.6.3.0) Co-Authored-By: Claude Opus 4.6 --------- Co-authored-by: Claude Opus 4.6 --- CHANGELOG.md | 9 ++ TODOS.md | 12 +- VERSION | 2 +- bin/gstack-diff-scope | 71 +++++++++++ plan-ceo-review/SKILL.md | 2 +- plan-design-review/SKILL.md | 2 +- plan-eng-review/SKILL.md | 2 +- review/SKILL.md | 41 +++++++ review/SKILL.md.tmpl | 8 ++ review/design-checklist.md | 132 +++++++++++++++++++++ scripts/gen-skill-docs.ts | 42 ++++++- ship/SKILL.md | 48 +++++++- ship/SKILL.md.tmpl | 13 +- test/fixtures/review-eval-design-slop.css | 86 ++++++++++++++ test/fixtures/review-eval-design-slop.html | 41 +++++++ test/helpers/touchfiles.ts | 1 + test/skill-e2e.test.ts | 91 ++++++++++++++ 17 files changed, 587 insertions(+), 16 deletions(-) create mode 100755 bin/gstack-diff-scope create mode 100644 review/design-checklist.md create mode 100644 test/fixtures/review-eval-design-slop.css create mode 100644 test/fixtures/review-eval-design-slop.html diff --git a/CHANGELOG.md b/CHANGELOG.md index 5df40ae8..04f690a3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,14 @@ # Changelog +## [0.6.3.0] - 2026-03-17 + +### Added + +- **Every PR touching frontend code now gets a design review automatically.** `/review` and `/ship` apply a 20-item design checklist against changed CSS, HTML, JSX, and view files. Catches AI slop patterns (purple gradients, 3-column icon grids, generic hero copy), typography issues (body text < 16px, blacklisted fonts), accessibility gaps (`outline: none`), and `!important` abuse. Mechanical CSS fixes are auto-applied; design judgment calls ask you first. +- **`gstack-diff-scope` categorizes what changed in your branch.** Run `eval $(gstack-diff-scope main)` and get `SCOPE_FRONTEND=true/false`, `SCOPE_BACKEND`, `SCOPE_PROMPTS`, `SCOPE_TESTS`, `SCOPE_DOCS`, `SCOPE_CONFIG`. Design review uses it to skip silently on backend-only PRs. Ship pre-flight uses it to recommend design review when frontend files are touched. +- **Design review shows up in the Review Readiness Dashboard.** The dashboard now distinguishes between "LITE" (code-level, runs automatically in /review and /ship) and "FULL" (visual audit via /plan-design-review with browse binary). Both show up as Design Review entries. +- **E2E eval for design review detection.** Planted CSS/HTML fixtures with 7 known anti-patterns (Papyrus font, 14px body text, `outline: none`, `!important`, purple gradient, generic hero copy, 3-column feature grid). The eval verifies `/review` catches at least 4 of 7. + ## [0.6.2.0] - 2026-03-17 ### Added diff --git a/TODOS.md b/TODOS.md index 8616f906..5f771de7 100644 --- a/TODOS.md +++ b/TODOS.md @@ -444,17 +444,17 @@ Shipped as `/design-consultation` on garrytan/design branch. Renamed from `/setu ## Ship Confidence Dashboard -### Smart review relevance detection +### Smart review relevance detection — PARTIALLY SHIPPED -**What:** Auto-detect which of the 4 reviews are relevant based on branch changes (skip Design Review if no CSS/view changes, skip Code Review if plan-only). +~~**What:** Auto-detect which of the 4 reviews are relevant based on branch changes (skip Design Review if no CSS/view changes, skip Code Review if plan-only).~~ -**Why:** Currently dashboard always shows 4 rows. On docs-only changes, "Design Review: NOT YET RUN" is noise. +`bin/gstack-diff-scope` shipped — categorizes diff into SCOPE_FRONTEND, SCOPE_BACKEND, SCOPE_PROMPTS, SCOPE_TESTS, SCOPE_DOCS, SCOPE_CONFIG. Used by design-review-lite to skip when no frontend files changed. Dashboard integration for conditional row display is a follow-up. -**Context:** /plan-design-review and /qa already do file-type detection in diff-aware mode. Could reuse that heuristic. Would require a `gstack-diff-scope` helper or enriching `gstack-slug` to also output change categories. +**Remaining:** Dashboard conditional row display (hide "Design Review: NOT YET RUN" when SCOPE_FRONTEND=false). Extend to Eng Review (skip for docs-only) and CEO Review (skip for config-only). -**Effort:** M +**Effort:** S **Priority:** P3 -**Depends on:** Ship Confidence Dashboard (shipped) +**Depends on:** gstack-diff-scope (shipped) ### /merge skill — review-gated PR merge diff --git a/VERSION b/VERSION index e1e48733..1e40a508 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -0.6.2.0 +0.6.3.0 diff --git a/bin/gstack-diff-scope b/bin/gstack-diff-scope new file mode 100755 index 00000000..ada66c0a --- /dev/null +++ b/bin/gstack-diff-scope @@ -0,0 +1,71 @@ +#!/usr/bin/env bash +# gstack-diff-scope — categorize what changed in the diff against a base branch +# Usage: eval $(gstack-diff-scope main) → sets SCOPE_FRONTEND=true SCOPE_BACKEND=false ... +# Or: gstack-diff-scope main → prints SCOPE_*=... lines +set -euo pipefail + +BASE="${1:-main}" + +# Get changed file list +FILES=$(git diff "${BASE}...HEAD" --name-only 2>/dev/null || git diff "${BASE}" --name-only 2>/dev/null || echo "") + +if [ -z "$FILES" ]; then + echo "SCOPE_FRONTEND=false" + echo "SCOPE_BACKEND=false" + echo "SCOPE_PROMPTS=false" + echo "SCOPE_TESTS=false" + echo "SCOPE_DOCS=false" + echo "SCOPE_CONFIG=false" + exit 0 +fi + +FRONTEND=false +BACKEND=false +PROMPTS=false +TESTS=false +DOCS=false +CONFIG=false + +while IFS= read -r f; do + case "$f" in + # Frontend: CSS, views, components, templates + *.css|*.scss|*.less|*.sass|*.pcss|*.module.css|*.module.scss) FRONTEND=true ;; + *.tsx|*.jsx|*.vue|*.svelte|*.astro) FRONTEND=true ;; + *.erb|*.haml|*.slim|*.hbs|*.ejs) FRONTEND=true ;; + *.html) FRONTEND=true ;; + tailwind.config.*|postcss.config.*) FRONTEND=true ;; + app/views/*|*/components/*|styles/*|css/*|app/assets/stylesheets/*) FRONTEND=true ;; + + # Prompts: prompt builders, system prompts, generation services + *prompt_builder*|*generation_service*|*writer_service*|*designer_service*) PROMPTS=true ;; + *evaluator*|*scorer*|*classifier_service*|*analyzer*) PROMPTS=true ;; + *voice*.rb|*writing*.rb|*prompt*.rb|*token*.rb) PROMPTS=true ;; + app/services/chat_tools/*|app/services/x_thread_tools/*) PROMPTS=true ;; + config/system_prompts/*) PROMPTS=true ;; + + # Tests + *.test.*|*.spec.*|*_test.*|*_spec.*) TESTS=true ;; + test/*|tests/*|spec/*|__tests__/*|cypress/*|e2e/*) TESTS=true ;; + + # Docs + *.md) DOCS=true ;; + + # Config + package.json|package-lock.json|yarn.lock|bun.lockb) CONFIG=true ;; + Gemfile|Gemfile.lock) CONFIG=true ;; + *.yml|*.yaml) CONFIG=true ;; + .github/*) CONFIG=true ;; + requirements.txt|pyproject.toml|go.mod|Cargo.toml|composer.json) CONFIG=true ;; + + # Backend: everything else that's code (excluding views/components already matched) + *.rb|*.py|*.go|*.rs|*.java|*.php|*.ex|*.exs) BACKEND=true ;; + *.ts|*.js) BACKEND=true ;; # Non-component TS/JS is backend + esac +done <<< "$FILES" + +echo "SCOPE_FRONTEND=$FRONTEND" +echo "SCOPE_BACKEND=$BACKEND" +echo "SCOPE_PROMPTS=$PROMPTS" +echo "SCOPE_TESTS=$TESTS" +echo "SCOPE_DOCS=$DOCS" +echo "SCOPE_CONFIG=$CONFIG" diff --git a/plan-ceo-review/SKILL.md b/plan-ceo-review/SKILL.md index b44dff6d..55238e3f 100644 --- a/plan-ceo-review/SKILL.md +++ b/plan-ceo-review/SKILL.md @@ -702,7 +702,7 @@ echo "---CONFIG---" ~/.claude/skills/gstack/bin/gstack-config get skip_eng_review 2>/dev/null || echo "false" ``` -Parse the output. Find the most recent entry for each skill (plan-ceo-review, plan-eng-review, plan-design-review). Ignore entries with timestamps older than 7 days. Display: +Parse the output. Find the most recent entry for each skill (plan-ceo-review, plan-eng-review, plan-design-review, design-review-lite). Ignore entries with timestamps older than 7 days. For Design Review, show whichever is more recent between `plan-design-review` (full visual audit) and `design-review-lite` (code-level check). Append "(FULL)" or "(LITE)" to the status to distinguish. Display: ``` +====================================================================+ diff --git a/plan-design-review/SKILL.md b/plan-design-review/SKILL.md index a8f3498e..1d821bf2 100644 --- a/plan-design-review/SKILL.md +++ b/plan-design-review/SKILL.md @@ -645,7 +645,7 @@ echo "---CONFIG---" ~/.claude/skills/gstack/bin/gstack-config get skip_eng_review 2>/dev/null || echo "false" ``` -Parse the output. Find the most recent entry for each skill (plan-ceo-review, plan-eng-review, plan-design-review). Ignore entries with timestamps older than 7 days. Display: +Parse the output. Find the most recent entry for each skill (plan-ceo-review, plan-eng-review, plan-design-review, design-review-lite). Ignore entries with timestamps older than 7 days. For Design Review, show whichever is more recent between `plan-design-review` (full visual audit) and `design-review-lite` (code-level check). Append "(FULL)" or "(LITE)" to the status to distinguish. Display: ``` +====================================================================+ diff --git a/plan-eng-review/SKILL.md b/plan-eng-review/SKILL.md index 05c74ba7..48fe7230 100644 --- a/plan-eng-review/SKILL.md +++ b/plan-eng-review/SKILL.md @@ -348,7 +348,7 @@ echo "---CONFIG---" ~/.claude/skills/gstack/bin/gstack-config get skip_eng_review 2>/dev/null || echo "false" ``` -Parse the output. Find the most recent entry for each skill (plan-ceo-review, plan-eng-review, plan-design-review). Ignore entries with timestamps older than 7 days. Display: +Parse the output. Find the most recent entry for each skill (plan-ceo-review, plan-eng-review, plan-design-review, design-review-lite). Ignore entries with timestamps older than 7 days. For Design Review, show whichever is more recent between `plan-design-review` (full visual audit) and `design-review-lite` (code-level check). Append "(FULL)" or "(LITE)" to the status to distinguish. Display: ``` +====================================================================+ diff --git a/review/SKILL.md b/review/SKILL.md index 186978ef..5d734594 100644 --- a/review/SKILL.md +++ b/review/SKILL.md @@ -196,6 +196,47 @@ Follow the output format specified in the checklist. Respect the suppressions --- +## Step 4.5: Design Review (conditional) + +## Design Review (conditional, diff-scoped) + +Check if the diff touches frontend files using `gstack-diff-scope`: + +```bash +eval $(~/.claude/skills/gstack/bin/gstack-diff-scope 2>/dev/null) +``` + +**If `SCOPE_FRONTEND=false`:** Skip design review silently. No output. + +**If `SCOPE_FRONTEND=true`:** + +1. **Check for DESIGN.md.** If `DESIGN.md` or `design-system.md` exists in the repo root, read it. All design findings are calibrated against it — patterns blessed in DESIGN.md are not flagged. If not found, use universal design principles. + +2. **Read `.claude/skills/review/design-checklist.md`.** If the file cannot be read, skip design review with a note: "Design checklist not found — skipping design review." + +3. **Read each changed frontend file** (full file, not just diff hunks). Frontend files are identified by the patterns listed in the checklist. + +4. **Apply the design checklist** against the changed files. For each item: + - **[HIGH] mechanical CSS fix** (`outline: none`, `!important`, `font-size < 16px`): classify as AUTO-FIX + - **[HIGH/MEDIUM] design judgment needed**: classify as ASK + - **[LOW] intent-based detection**: present as "Possible — verify visually or run /qa-design-review" + +5. **Include findings** in the review output under a "Design Review" header, following the output format in the checklist. Design findings merge with code review findings into the same Fix-First flow. + +6. **Log the result** for the Review Readiness Dashboard: + +```bash +eval $(~/.claude/skills/gstack/bin/gstack-slug 2>/dev/null) +mkdir -p ~/.gstack/projects/$SLUG +echo '{"skill":"design-review-lite","timestamp":"TIMESTAMP","status":"STATUS","findings":N,"auto_fixed":M}' >> ~/.gstack/projects/$SLUG/$BRANCH-reviews.jsonl +``` + +Substitute: TIMESTAMP = ISO 8601 datetime, STATUS = "clean" if 0 findings or "issues_found", N = total findings, M = auto-fixed count. + +Include any design findings alongside the findings from Step 4. They follow the same Fix-First flow in Step 5 — AUTO-FIX for mechanical CSS fixes, ASK for everything else. + +--- + ## Step 5: Fix-First Review **Every finding gets action — not just critical ones.** diff --git a/review/SKILL.md.tmpl b/review/SKILL.md.tmpl index c122ada1..c1d3fae6 100644 --- a/review/SKILL.md.tmpl +++ b/review/SKILL.md.tmpl @@ -75,6 +75,14 @@ Follow the output format specified in the checklist. Respect the suppressions --- +## Step 4.5: Design Review (conditional) + +{{DESIGN_REVIEW_LITE}} + +Include any design findings alongside the findings from Step 4. They follow the same Fix-First flow in Step 5 — AUTO-FIX for mechanical CSS fixes, ASK for everything else. + +--- + ## Step 5: Fix-First Review **Every finding gets action — not just critical ones.** diff --git a/review/design-checklist.md b/review/design-checklist.md new file mode 100644 index 00000000..bbe49885 --- /dev/null +++ b/review/design-checklist.md @@ -0,0 +1,132 @@ +# Design Review Checklist (Lite) + +> **Subset of DESIGN_METHODOLOGY** — when adding items here, also update `generateDesignMethodology()` in `scripts/gen-skill-docs.ts`, and vice versa. + +## Instructions + +This checklist applies to **source code in the diff** — not rendered output. Read each changed frontend file (full file, not just diff hunks) and flag anti-patterns. + +**Trigger:** Only run this checklist if the diff touches frontend files. Use `gstack-diff-scope` to detect: + +```bash +eval $(~/.claude/skills/gstack/bin/gstack-diff-scope 2>/dev/null) +``` + +If `SCOPE_FRONTEND=false`, skip the entire design review silently. + +**DESIGN.md calibration:** If `DESIGN.md` or `design-system.md` exists in the repo root, read it first. All findings are calibrated against the project's stated design system. Patterns explicitly blessed in DESIGN.md are NOT flagged. If no DESIGN.md exists, use universal design principles. + +--- + +## Confidence Tiers + +Each item is tagged with a detection confidence level: + +- **[HIGH]** — Reliably detectable via grep/pattern match. Definitive findings. +- **[MEDIUM]** — Detectable via pattern aggregation or heuristic. Flag as findings but expect some noise. +- **[LOW]** — Requires understanding visual intent. Present as: "Possible issue — verify visually or run /qa-design-review." + +--- + +## Classification + +**AUTO-FIX** (mechanical CSS fixes only — HIGH confidence, no design judgment needed): +- `outline: none` without replacement → add `outline: revert` or `&:focus-visible { outline: 2px solid currentColor; }` +- `!important` in new CSS → remove and fix specificity +- `font-size` < 16px on body text → bump to 16px + +**ASK** (everything else — requires design judgment): +- All AI slop findings, typography structure, spacing choices, interaction state gaps, DESIGN.md violations + +**LOW confidence items** → present as "Possible: [description]. Verify visually or run /qa-design-review." Never AUTO-FIX. + +--- + +## Output Format + +``` +Design Review: N issues (X auto-fixable, Y need input, Z possible) + +**AUTO-FIXED:** +- [file:line] Problem → fix applied + +**NEEDS INPUT:** +- [file:line] Problem description + Recommended fix: suggested fix + +**POSSIBLE (verify visually):** +- [file:line] Possible issue — verify with /qa-design-review +``` + +If no issues found: `Design Review: No issues found.` + +If no frontend files changed: skip silently, no output. + +--- + +## Categories + +### 1. AI Slop Detection (6 items) — highest priority + +These are the telltale signs of AI-generated UI that no designer at a respected studio would ship. + +- **[MEDIUM]** Purple/violet/indigo gradient backgrounds or blue-to-purple color schemes. Look for `linear-gradient` with values in the `#6366f1`–`#8b5cf6` range, or CSS custom properties resolving to purple/violet. + +- **[LOW]** The 3-column feature grid: icon-in-colored-circle + bold title + 2-line description, repeated 3x symmetrically. Look for a grid/flex container with exactly 3 children that each contain a circular element + heading + paragraph. + +- **[LOW]** Icons in colored circles as section decoration. Look for elements with `border-radius: 50%` + a background color used as decorative containers for icons. + +- **[HIGH]** Centered everything: `text-align: center` on all headings, descriptions, and cards. Grep for `text-align: center` density — if >60% of text containers use center alignment, flag it. + +- **[MEDIUM]** Uniform bubbly border-radius on every element: same large radius (16px+) applied to cards, buttons, inputs, containers uniformly. Aggregate `border-radius` values — if >80% use the same value ≥16px, flag it. + +- **[MEDIUM]** Generic hero copy: "Welcome to [X]", "Unlock the power of...", "Your all-in-one solution for...", "Revolutionize your...", "Streamline your workflow". Grep HTML/JSX content for these patterns. + +### 2. Typography (4 items) + +- **[HIGH]** Body text `font-size` < 16px. Grep for `font-size` declarations on `body`, `p`, `.text`, or base styles. Values below 16px (or 1rem when base is 16px) are flagged. + +- **[HIGH]** More than 3 font families introduced in the diff. Count distinct `font-family` declarations. Flag if >3 unique families appear across changed files. + +- **[HIGH]** Heading hierarchy skipping levels: `h1` followed by `h3` without an `h2` in the same file/component. Check HTML/JSX for heading tags. + +- **[HIGH]** Blacklisted fonts: Papyrus, Comic Sans, Lobster, Impact, Jokerman. Grep `font-family` for these names. + +### 3. Spacing & Layout (4 items) + +- **[MEDIUM]** Arbitrary spacing values not on a 4px or 8px scale, when DESIGN.md specifies a spacing scale. Check `margin`, `padding`, `gap` values against the stated scale. Only flag when DESIGN.md defines a scale. + +- **[MEDIUM]** Fixed widths without responsive handling: `width: NNNpx` on containers without `max-width` or `@media` breakpoints. Risk of horizontal scroll on mobile. + +- **[MEDIUM]** Missing `max-width` on text containers: body text or paragraph containers with no `max-width` set, allowing lines >75 characters. Check for `max-width` on text wrappers. + +- **[HIGH]** `!important` in new CSS rules. Grep for `!important` in added lines. Almost always a specificity escape hatch that should be fixed properly. + +### 4. Interaction States (3 items) + +- **[MEDIUM]** Interactive elements (buttons, links, inputs) missing hover/focus states. Check if `:hover` and `:focus` or `:focus-visible` pseudo-classes exist for new interactive element styles. + +- **[HIGH]** `outline: none` or `outline: 0` without a replacement focus indicator. Grep for `outline:\s*none` or `outline:\s*0`. This removes keyboard accessibility. + +- **[LOW]** Touch targets < 44px on interactive elements. Check `min-height`/`min-width`/`padding` on buttons and links. Requires computing effective size from multiple properties — low confidence from code alone. + +### 5. DESIGN.md Violations (3 items, conditional) + +Only apply if `DESIGN.md` or `design-system.md` exists: + +- **[MEDIUM]** Colors not in the stated palette. Compare color values in changed CSS against the palette defined in DESIGN.md. + +- **[MEDIUM]** Fonts not in the stated typography section. Compare `font-family` values against DESIGN.md's font list. + +- **[MEDIUM]** Spacing values outside the stated scale. Compare `margin`/`padding`/`gap` values against DESIGN.md's spacing scale. + +--- + +## Suppressions + +Do NOT flag: +- Patterns explicitly documented in DESIGN.md as intentional choices +- Third-party/vendor CSS files (node_modules, vendor directories) +- CSS resets or normalize stylesheets +- Test fixture files +- Generated/minified CSS diff --git a/scripts/gen-skill-docs.ts b/scripts/gen-skill-docs.ts index d2e86ecf..cb807111 100644 --- a/scripts/gen-skill-docs.ts +++ b/scripts/gen-skill-docs.ts @@ -519,6 +519,45 @@ Minimum 0 per category. 11. **Show screenshots to the user.** After every \`$B screenshot\`, \`$B snapshot -a -o\`, or \`$B responsive\` command, use the Read tool on the output file(s) so the user can see them inline. For \`responsive\` (3 files), Read all three. This is critical — without it, screenshots are invisible to the user.`; } +function generateDesignReviewLite(): string { + return `## Design Review (conditional, diff-scoped) + +Check if the diff touches frontend files using \`gstack-diff-scope\`: + +\`\`\`bash +eval $(~/.claude/skills/gstack/bin/gstack-diff-scope 2>/dev/null) +\`\`\` + +**If \`SCOPE_FRONTEND=false\`:** Skip design review silently. No output. + +**If \`SCOPE_FRONTEND=true\`:** + +1. **Check for DESIGN.md.** If \`DESIGN.md\` or \`design-system.md\` exists in the repo root, read it. All design findings are calibrated against it — patterns blessed in DESIGN.md are not flagged. If not found, use universal design principles. + +2. **Read \`.claude/skills/review/design-checklist.md\`.** If the file cannot be read, skip design review with a note: "Design checklist not found — skipping design review." + +3. **Read each changed frontend file** (full file, not just diff hunks). Frontend files are identified by the patterns listed in the checklist. + +4. **Apply the design checklist** against the changed files. For each item: + - **[HIGH] mechanical CSS fix** (\`outline: none\`, \`!important\`, \`font-size < 16px\`): classify as AUTO-FIX + - **[HIGH/MEDIUM] design judgment needed**: classify as ASK + - **[LOW] intent-based detection**: present as "Possible — verify visually or run /qa-design-review" + +5. **Include findings** in the review output under a "Design Review" header, following the output format in the checklist. Design findings merge with code review findings into the same Fix-First flow. + +6. **Log the result** for the Review Readiness Dashboard: + +\`\`\`bash +eval $(~/.claude/skills/gstack/bin/gstack-slug 2>/dev/null) +mkdir -p ~/.gstack/projects/$SLUG +echo '{"skill":"design-review-lite","timestamp":"TIMESTAMP","status":"STATUS","findings":N,"auto_fixed":M}' >> ~/.gstack/projects/$SLUG/$BRANCH-reviews.jsonl +\`\`\` + +Substitute: TIMESTAMP = ISO 8601 datetime, STATUS = "clean" if 0 findings or "issues_found", N = total findings, M = auto-fixed count.`; +} + +// NOTE: design-checklist.md is a subset of this methodology for code-level detection. +// When adding items here, also update review/design-checklist.md, and vice versa. function generateDesignMethodology(): string { return `## Modes @@ -865,7 +904,7 @@ echo "---CONFIG---" ~/.claude/skills/gstack/bin/gstack-config get skip_eng_review 2>/dev/null || echo "false" \`\`\` -Parse the output. Find the most recent entry for each skill (plan-ceo-review, plan-eng-review, plan-design-review). Ignore entries with timestamps older than 7 days. Display: +Parse the output. Find the most recent entry for each skill (plan-ceo-review, plan-eng-review, plan-design-review, design-review-lite). Ignore entries with timestamps older than 7 days. For Design Review, show whichever is more recent between \`plan-design-review\` (full visual audit) and \`design-review-lite\` (code-level check). Append "(FULL)" or "(LITE)" to the status to distinguish. Display: \`\`\` +====================================================================+ @@ -1056,6 +1095,7 @@ const RESOLVERS: Record string> = { BASE_BRANCH_DETECT: generateBaseBranchDetect, QA_METHODOLOGY: generateQAMethodology, DESIGN_METHODOLOGY: generateDesignMethodology, + DESIGN_REVIEW_LITE: generateDesignReviewLite, REVIEW_DASHBOARD: generateReviewDashboard, TEST_BOOTSTRAP: generateTestBootstrap, }; diff --git a/ship/SKILL.md b/ship/SKILL.md index e2b524d9..486f32bf 100644 --- a/ship/SKILL.md +++ b/ship/SKILL.md @@ -186,7 +186,7 @@ echo "---CONFIG---" ~/.claude/skills/gstack/bin/gstack-config get skip_eng_review 2>/dev/null || echo "false" ``` -Parse the output. Find the most recent entry for each skill (plan-ceo-review, plan-eng-review, plan-design-review). Ignore entries with timestamps older than 7 days. Display: +Parse the output. Find the most recent entry for each skill (plan-ceo-review, plan-eng-review, plan-design-review, design-review-lite). Ignore entries with timestamps older than 7 days. For Design Review, show whichever is more recent between `plan-design-review` (full visual audit) and `design-review-lite` (code-level check). Append "(FULL)" or "(LITE)" to the status to distinguish. Display: ``` +====================================================================+ @@ -226,7 +226,8 @@ If the Eng Review is NOT "CLEAR": - Show that Eng Review is missing or has open issues - RECOMMENDATION: Choose C if the change is obviously trivial (< 20 lines, typo fix, config-only); Choose B for larger changes - Options: A) Ship anyway B) Abort — run /plan-eng-review first C) Change is too small to need eng review - - If CEO/Design reviews are missing, mention them as informational ("CEO Review not run — recommended for product changes") but do NOT block or recommend aborting for them + - If CEO Review is missing, mention as informational ("CEO Review not run — recommended for product changes") but do NOT block + - For Design Review: run `eval $(~/.claude/skills/gstack/bin/gstack-diff-scope 2>/dev/null)`. If `SCOPE_FRONTEND=true` and no design review (plan-design-review or design-review-lite) exists in the dashboard, mention: "Design Review not run — this PR changes frontend code. The lite design check will run automatically in Step 3.5, but consider running /plan-design-review for a full visual audit." Still never block. 3. **If the user chooses A or C,** persist the decision so future `/ship` runs on this branch skip the gate: ```bash @@ -642,6 +643,43 @@ Review the diff for structural issues that tests don't catch. - **Pass 1 (CRITICAL):** SQL & Data Safety, LLM Output Trust Boundary - **Pass 2 (INFORMATIONAL):** All remaining categories +## Design Review (conditional, diff-scoped) + +Check if the diff touches frontend files using `gstack-diff-scope`: + +```bash +eval $(~/.claude/skills/gstack/bin/gstack-diff-scope 2>/dev/null) +``` + +**If `SCOPE_FRONTEND=false`:** Skip design review silently. No output. + +**If `SCOPE_FRONTEND=true`:** + +1. **Check for DESIGN.md.** If `DESIGN.md` or `design-system.md` exists in the repo root, read it. All design findings are calibrated against it — patterns blessed in DESIGN.md are not flagged. If not found, use universal design principles. + +2. **Read `.claude/skills/review/design-checklist.md`.** If the file cannot be read, skip design review with a note: "Design checklist not found — skipping design review." + +3. **Read each changed frontend file** (full file, not just diff hunks). Frontend files are identified by the patterns listed in the checklist. + +4. **Apply the design checklist** against the changed files. For each item: + - **[HIGH] mechanical CSS fix** (`outline: none`, `!important`, `font-size < 16px`): classify as AUTO-FIX + - **[HIGH/MEDIUM] design judgment needed**: classify as ASK + - **[LOW] intent-based detection**: present as "Possible — verify visually or run /qa-design-review" + +5. **Include findings** in the review output under a "Design Review" header, following the output format in the checklist. Design findings merge with code review findings into the same Fix-First flow. + +6. **Log the result** for the Review Readiness Dashboard: + +```bash +eval $(~/.claude/skills/gstack/bin/gstack-slug 2>/dev/null) +mkdir -p ~/.gstack/projects/$SLUG +echo '{"skill":"design-review-lite","timestamp":"TIMESTAMP","status":"STATUS","findings":N,"auto_fixed":M}' >> ~/.gstack/projects/$SLUG/$BRANCH-reviews.jsonl +``` + +Substitute: TIMESTAMP = ISO 8601 datetime, STATUS = "clean" if 0 findings or "issues_found", N = total findings, M = auto-fixed count. + + Include any design findings alongside the code review findings. They follow the same Fix-First flow below. + 4. **Classify each finding as AUTO-FIX or ASK** per the Fix-First Heuristic in checklist.md. Critical findings lean toward ASK; informational lean toward AUTO-FIX. @@ -863,7 +901,11 @@ gh pr create --base --title ": " --body "$(cat <<'EOF' ## Pre-Landing Review - + + +## Design Review + + ## Eval Results diff --git a/ship/SKILL.md.tmpl b/ship/SKILL.md.tmpl index e059fc6a..f9922147 100644 --- a/ship/SKILL.md.tmpl +++ b/ship/SKILL.md.tmpl @@ -69,7 +69,8 @@ If the Eng Review is NOT "CLEAR": - Show that Eng Review is missing or has open issues - RECOMMENDATION: Choose C if the change is obviously trivial (< 20 lines, typo fix, config-only); Choose B for larger changes - Options: A) Ship anyway B) Abort — run /plan-eng-review first C) Change is too small to need eng review - - If CEO/Design reviews are missing, mention them as informational ("CEO Review not run — recommended for product changes") but do NOT block or recommend aborting for them + - If CEO Review is missing, mention as informational ("CEO Review not run — recommended for product changes") but do NOT block + - For Design Review: run `eval $(~/.claude/skills/gstack/bin/gstack-diff-scope 2>/dev/null)`. If `SCOPE_FRONTEND=true` and no design review (plan-design-review or design-review-lite) exists in the dashboard, mention: "Design Review not run — this PR changes frontend code. The lite design check will run automatically in Step 3.5, but consider running /plan-design-review for a full visual audit." Still never block. 3. **If the user chooses A or C,** persist the decision so future `/ship` runs on this branch skip the gate: ```bash @@ -334,6 +335,10 @@ Review the diff for structural issues that tests don't catch. - **Pass 1 (CRITICAL):** SQL & Data Safety, LLM Output Trust Boundary - **Pass 2 (INFORMATIONAL):** All remaining categories +{{DESIGN_REVIEW_LITE}} + + Include any design findings alongside the code review findings. They follow the same Fix-First flow below. + 4. **Classify each finding as AUTO-FIX or ASK** per the Fix-First Heuristic in checklist.md. Critical findings lean toward ASK; informational lean toward AUTO-FIX. @@ -555,7 +560,11 @@ gh pr create --base --title ": " --body "$(cat <<'EOF' ## Pre-Landing Review - + + +## Design Review + + ## Eval Results diff --git a/test/fixtures/review-eval-design-slop.css b/test/fixtures/review-eval-design-slop.css new file mode 100644 index 00000000..40e055fb --- /dev/null +++ b/test/fixtures/review-eval-design-slop.css @@ -0,0 +1,86 @@ +/* Planted design anti-patterns for E2E eval — 7 issues */ + +/* Issue 1: [HIGH] Blacklisted font (Papyrus) */ +/* Issue 2: [HIGH] Body text < 16px (14px) */ +body { + font-family: 'Papyrus', sans-serif; + font-size: 14px; + margin: 0; + padding: 0; +} + +/* Issue 5: [MEDIUM] Purple/violet gradient background */ +.hero { + background: linear-gradient(135deg, #6366f1, #8b5cf6); + text-align: center; + padding: 80px 20px; + color: white; +} + +.hero h1 { + text-align: center; + font-size: 48px; +} + +.hero p { + text-align: center; + font-size: 20px; +} + +/* Issue 7: [LOW] 3-column feature grid with icon circles */ +.features { + display: grid; + grid-template-columns: repeat(3, 1fr); + gap: 24px; + padding: 60px 40px; + text-align: center; +} + +.feature-card { + border-radius: 24px; + padding: 32px; + text-align: center; + background: #f9fafb; +} + +/* Icon in colored circle — AI slop pattern */ +.icon-circle { + width: 60px; + height: 60px; + border-radius: 50%; + background: #ede9fe; + display: flex; + align-items: center; + justify-content: center; + margin: 0 auto 16px; + font-size: 24px; +} + +/* Issue 3: [HIGH] outline: none without replacement */ +button { + outline: none; + background: #6366f1; + color: white; + border: none; + padding: 12px 24px; + border-radius: 24px; + cursor: pointer; +} + +.small-link { + font-size: 11px; + padding: 4px 8px; +} + +/* Issue 4: [HIGH] !important usage */ +.override { + color: red !important; + margin-left: 10px !important; +} + +.footer { + text-align: center; + padding: 40px; + background: #1e1b4b; + color: white; +} diff --git a/test/fixtures/review-eval-design-slop.html b/test/fixtures/review-eval-design-slop.html new file mode 100644 index 00000000..f05affd1 --- /dev/null +++ b/test/fixtures/review-eval-design-slop.html @@ -0,0 +1,41 @@ + + + + + + + Our Platform + + + +
+

Welcome to Our Platform

+

Your all-in-one solution for everything you need

+ +
+ + +
+
+
+

Feature One

+

A short description of this amazing feature that will change your life.

+
+
+
+

Feature Two

+

Another incredible capability that sets us apart from the competition.

+
+
+
+

Feature Three

+

Yet another powerful tool to streamline your workflow effortlessly.

+
+
+ + + + diff --git a/test/helpers/touchfiles.ts b/test/helpers/touchfiles.ts index 30a15579..84a11da2 100644 --- a/test/helpers/touchfiles.ts +++ b/test/helpers/touchfiles.ts @@ -55,6 +55,7 @@ export const E2E_TOUCHFILES: Record = { 'review-sql-injection': ['review/**', 'test/fixtures/review-eval-vuln.rb'], 'review-enum-completeness': ['review/**', 'test/fixtures/review-eval-enum*.rb'], 'review-base-branch': ['review/**'], + 'review-design-lite': ['review/**', 'test/fixtures/review-eval-design-slop.*'], // Plan reviews 'plan-ceo-review': ['plan-ceo-review/**'], diff --git a/test/skill-e2e.test.ts b/test/skill-e2e.test.ts index 338ec2f1..6a66311b 100644 --- a/test/skill-e2e.test.ts +++ b/test/skill-e2e.test.ts @@ -636,6 +636,97 @@ The diff adds a new "returned" status to the Order model. Your job is to check i }, 120_000); }); +// --- Review: Design review lite E2E --- + +describeE2E('Review design lite E2E', () => { + let designDir: string; + + beforeAll(() => { + designDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-design-lite-')); + + const run = (cmd: string, args: string[]) => + spawnSync(cmd, args, { cwd: designDir, stdio: 'pipe', timeout: 5000 }); + + run('git', ['init']); + run('git', ['config', 'user.email', 'test@test.com']); + run('git', ['config', 'user.name', 'Test']); + + // Commit clean base on main + fs.writeFileSync(path.join(designDir, 'index.html'), '

Clean

\n'); + fs.writeFileSync(path.join(designDir, 'styles.css'), 'body { font-size: 16px; }\n'); + run('git', ['add', '.']); + run('git', ['commit', '-m', 'initial']); + + // Feature branch adds AI slop CSS + HTML + run('git', ['checkout', '-b', 'feature/add-landing-page']); + const slopCss = fs.readFileSync(path.join(ROOT, 'test', 'fixtures', 'review-eval-design-slop.css'), 'utf-8'); + const slopHtml = fs.readFileSync(path.join(ROOT, 'test', 'fixtures', 'review-eval-design-slop.html'), 'utf-8'); + fs.writeFileSync(path.join(designDir, 'styles.css'), slopCss); + fs.writeFileSync(path.join(designDir, 'landing.html'), slopHtml); + run('git', ['add', '.']); + run('git', ['commit', '-m', 'add landing page']); + + // Copy review skill files + fs.copyFileSync(path.join(ROOT, 'review', 'SKILL.md'), path.join(designDir, 'review-SKILL.md')); + fs.copyFileSync(path.join(ROOT, 'review', 'checklist.md'), path.join(designDir, 'review-checklist.md')); + fs.copyFileSync(path.join(ROOT, 'review', 'design-checklist.md'), path.join(designDir, 'review-design-checklist.md')); + fs.copyFileSync(path.join(ROOT, 'review', 'greptile-triage.md'), path.join(designDir, 'review-greptile-triage.md')); + }); + + afterAll(() => { + try { fs.rmSync(designDir, { recursive: true, force: true }); } catch {} + }); + + test('/review catches design anti-patterns in CSS/HTML diff', async () => { + const result = await runSkillTest({ + prompt: `You are in a git repo on branch feature/add-landing-page with changes against main. +Read review-SKILL.md for the review workflow instructions. +Read review-checklist.md for the code review checklist. +Read review-design-checklist.md for the design review checklist. +Run /review on the current diff (git diff main...HEAD). + +The diff adds a landing page with CSS and HTML. Check for both code issues AND design anti-patterns. +Write your review findings to ${designDir}/review-output.md + +Important: The design checklist should catch issues like blacklisted fonts, small font sizes, outline:none, !important, AI slop patterns (purple gradients, generic hero copy, 3-column feature grid), etc.`, + workingDirectory: designDir, + maxTurns: 15, + timeout: 120_000, + testName: 'review-design-lite', + runId, + }); + + logCost('/review design lite', result); + recordE2E('/review design lite', 'Review design lite E2E', result); + expect(result.exitReason).toBe('success'); + + // Verify the review caught at least 4 of 7 planted design issues + const reviewPath = path.join(designDir, 'review-output.md'); + if (fs.existsSync(reviewPath)) { + const review = fs.readFileSync(reviewPath, 'utf-8').toLowerCase(); + let detected = 0; + + // Issue 1: Blacklisted font (Papyrus) — HIGH + if (review.includes('papyrus') || review.includes('blacklisted font') || review.includes('font family')) detected++; + // Issue 2: Body text < 16px — HIGH + if (review.includes('14px') || review.includes('font-size') || review.includes('font size') || review.includes('body text')) detected++; + // Issue 3: outline: none — HIGH + if (review.includes('outline') || review.includes('focus')) detected++; + // Issue 4: !important — HIGH + if (review.includes('!important') || review.includes('important')) detected++; + // Issue 5: Purple gradient — MEDIUM + if (review.includes('gradient') || review.includes('purple') || review.includes('violet') || review.includes('#6366f1') || review.includes('#8b5cf6')) detected++; + // Issue 6: Generic hero copy — MEDIUM + if (review.includes('welcome to') || review.includes('all-in-one') || review.includes('generic') || review.includes('hero copy') || review.includes('ai slop')) detected++; + // Issue 7: 3-column feature grid — LOW + if (review.includes('3-column') || review.includes('three-column') || review.includes('feature grid') || review.includes('icon') || review.includes('circle')) detected++; + + console.log(`Design review detected ${detected}/7 planted issues`); + expect(detected).toBeGreaterThanOrEqual(4); + } + }, 150_000); +}); + // --- B6/B7/B8: Planted-bug outcome evals --- // Outcome evals also need ANTHROPIC_API_KEY for the LLM judge From f91222f5bd14f97f480f98f7f84ecbef7c9f165f Mon Sep 17 00:00:00 2001 From: Garry Tan Date: Tue, 17 Mar 2026 22:23:30 -0500 Subject: [PATCH 6/7] docs: restructure README for faster conversion (#146) * docs: restructure README for faster conversion Move install block to top (first scroll) for visitors who are already sold. Add "Who this is for" self-identification section and "Quick start: 10 minutes" funnel to guide first-time users. Condense install instructions to bold literal commands. Move "See it work" and "The team" below install. Relocate hiring box after "Come ride the wave" section. Strip redundant section dividers. Add governance framing to final CTA and new troubleshooting case for missing skills. Co-Authored-By: Claude Haiku 4.5 * docs: update "Who this is for" to lead with founders and new CC users Co-Authored-By: Claude Opus 4.6 (1M context) --------- Co-authored-by: Claude Haiku 4.5 --- README.md | 93 ++++++++++++++++++++++++++++--------------------------- 1 file changed, 48 insertions(+), 45 deletions(-) diff --git a/README.md b/README.md index c0a5f0b6..1533928b 100644 --- a/README.md +++ b/README.md @@ -22,7 +22,38 @@ I am learning how to get to the edge of what agentic systems can do as of March Fork it. Improve it. Make it yours. Don't player hate, appreciate. ---- +**Who this is for:** +- **Founders and CEOs** — especially technical ones who still want to ship. This is how you build like a team of twenty. +- **First-time Claude Code users** — gstack is the best way to start. Structured roles instead of a blank prompt. +- **Tech leads and staff engineers** — bring rigorous review, QA, and release automation to every PR + +## Quick start: your first 10 minutes + +1. Install gstack (30 seconds — see below) +2. Run `/plan-ceo-review` on any feature idea +3. Run `/review` on any branch with changes +4. Run `/qa` on your staging URL +5. Stop there. You'll know if this is for you. + +Expect first useful run in under 5 minutes on any repo with tests already set up. + +**If you only read one more section, read this one.** + +## Install — takes 30 seconds + +**Requirements:** [Claude Code](https://docs.anthropic.com/en/docs/claude-code), [Git](https://git-scm.com/), [Bun](https://bun.sh/) v1.0+ + +### Step 1: Install on your machine + +Open Claude Code and paste this. Claude does the rest. + +> Install gstack: run **`git clone https://github.com/garrytan/gstack.git ~/.claude/skills/gstack && cd ~/.claude/skills/gstack && ./setup`** then add a "gstack" section to CLAUDE.md that says to use the /browse skill from gstack for all web browsing, never use mcp\_\_claude-in-chrome\_\_\* tools, and lists the available skills: /plan-ceo-review, /plan-eng-review, /plan-design-review, /design-consultation, /review, /ship, /browse, /qa, /qa-only, /qa-design-review, /setup-browser-cookies, /retro, /document-release. Then ask the user if they also want to add gstack to the current project so teammates get it. + +### Step 2: Add to your repo so teammates get it (optional) + +> Add gstack to this project: run **`cp -Rf ~/.claude/skills/gstack .claude/skills/gstack && rm -rf .claude/skills/gstack/.git && cd .claude/skills/gstack && ./setup`** then add a "gstack" section to this project's CLAUDE.md that says to use the /browse skill from gstack for all web browsing, never use mcp\_\_claude-in-chrome\_\_\* tools, lists the available skills: /plan-ceo-review, /plan-eng-review, /plan-design-review, /design-consultation, /review, /ship, /browse, /qa, /qa-only, /qa-design-review, /setup-browser-cookies, /retro, /document-release, and tells Claude that if gstack skills aren't working, run `cd .claude/skills/gstack && ./setup` to build the binary and register skills. + +Real files get committed to your repo (not a submodule), so `git clone` just works. Everything lives inside `.claude/`. Nothing touches your PATH or runs in the background. ## See it work @@ -76,8 +107,6 @@ Claude: Tests: 42 → 51 (+9 new) One feature. Seven commands. The agent reframed the product, ran an 80-item design audit, drew the architecture, wrote 2,400 lines of code, found a race condition I would have missed, auto-fixed two issues, opened a real browser to QA test, found and fixed a bug I didn't know about, wrote 9 tests, and generated a regression test. That is not a copilot. That is a team. ---- - ## The team | Skill | Your specialist | What they do | @@ -98,8 +127,6 @@ One feature. Seven commands. The agent reframed the product, ran an 80-item desi **[Deep dives with examples and philosophy for every skill →](docs/skills.md)** ---- - ## What's new and why it matters **Design is at the heart.** `/design-consultation` doesn't just pick fonts. It researches what's out there in your space, proposes safe choices AND creative risks, generates realistic mockups of your actual product, and writes `DESIGN.md` — and then `/qa-design-review` and `/plan-eng-review` read what you chose. Design decisions flow through the whole system. @@ -112,8 +139,6 @@ One feature. Seven commands. The agent reframed the product, ran an 80-item desi **`/document-release` is the engineer you never had.** It reads every doc file in your project, cross-references the diff, and updates everything that drifted. README, ARCHITECTURE, CONTRIBUTING, CLAUDE.md, TODOS — all kept current automatically. ---- - ## 10 sessions at once gstack is powerful with one session. It is transformative with ten. @@ -122,53 +147,21 @@ gstack is powerful with one session. It is transformative with ten. One person, ten parallel agents, each with the right cognitive mode. That is a different way of building software. ---- - -## Install — takes 30 seconds - -**Requirements:** [Claude Code](https://docs.anthropic.com/en/docs/claude-code), [Git](https://git-scm.com/), [Bun](https://bun.sh/) v1.0+ - -### Step 1: Install on your machine - -Open Claude Code and paste this. Claude does the rest. - -> Install gstack: run `git clone https://github.com/garrytan/gstack.git ~/.claude/skills/gstack && cd ~/.claude/skills/gstack && ./setup` then add a "gstack" section to CLAUDE.md that says to use the /browse skill from gstack for all web browsing, never use mcp\_\_claude-in-chrome\_\_\* tools, and lists the available skills: /plan-ceo-review, /plan-eng-review, /plan-design-review, /design-consultation, /review, /ship, /browse, /qa, /qa-only, /qa-design-review, /setup-browser-cookies, /retro, /document-release. Then ask the user if they also want to add gstack to the current project so teammates get it. - -### Step 2: Add to your repo so teammates get it (optional) - -> Add gstack to this project: run `cp -Rf ~/.claude/skills/gstack .claude/skills/gstack && rm -rf .claude/skills/gstack/.git && cd .claude/skills/gstack && ./setup` then add a "gstack" section to this project's CLAUDE.md that says to use the /browse skill from gstack for all web browsing, never use mcp\_\_claude-in-chrome\_\_\* tools, lists the available skills: /plan-ceo-review, /plan-eng-review, /plan-design-review, /design-consultation, /review, /ship, /browse, /qa, /qa-only, /qa-design-review, /setup-browser-cookies, /retro, /document-release, and tells Claude that if gstack skills aren't working, run `cd .claude/skills/gstack && ./setup` to build the binary and register skills. - -Real files get committed to your repo (not a submodule), so `git clone` just works. Everything lives inside `.claude/`. Nothing touches your PATH or runs in the background. - ---- - -``` -+----------------------------------------------------------------------------+ -| | -| Are you a great software engineer who wants to ship 10K+ LOC/day? | -| | -| Come work at YC: ycombinator.com/software | -| | -| Extremely competitive salary and equity. | -| Now hiring in San Francisco, Dogpatch District. | -| Come join the revolution. | -| | -+----------------------------------------------------------------------------+ -``` - ---- - ## Come ride the wave This is **free, MIT licensed, open source, available now.** No premium tier. No waitlist. No strings. I open sourced how I do development and I am actively upgrading my own software factory here. You can fork it and make it your own. That's the whole point. I want everyone on this journey. +Same tools, different outcome — because gstack gives you structured roles and review gates, not generic agent chaos. That governance is the difference between shipping fast and shipping reckless. + The models are getting better fast. The people who figure out how to work with them now — really work with them, not just dabble — are going to have a massive advantage. This is that window. Let's go. -**[github.com/garrytan/gstack](https://github.com/garrytan/gstack)** — MIT License +Thirteen specialists. All slash commands. All Markdown. All free. **[github.com/garrytan/gstack](https://github.com/garrytan/gstack)** — MIT License ---- +> **We're hiring.** Want to ship 10K+ LOC/day and help harden gstack? +> Come work at YC — [ycombinator.com/software](https://ycombinator.com/software) +> Extremely competitive salary and equity. San Francisco, Dogpatch District. ## Docs @@ -188,6 +181,16 @@ The models are getting better fast. The people who figure out how to work with t **Stale install?** Run `/gstack-upgrade` — or set `auto_upgrade: true` in `~/.gstack/config.yaml` +**Claude says it can't see the skills?** Make sure your project's `CLAUDE.md` has a gstack section. Add this: + +``` +## gstack +Use /browse from gstack for all web browsing. Never use mcp__claude-in-chrome__* tools. +Available skills: /plan-ceo-review, /plan-eng-review, /plan-design-review, +/design-consultation, /review, /ship, /browse, /qa, /qa-only, /qa-design-review, +/setup-browser-cookies, /retro, /document-release. +``` + ## License MIT. Free forever. Go build something. From 78c207efb42c90f84b5e383cd4aba39ecff29896 Mon Sep 17 00:00:00 2001 From: Garry Tan Date: Tue, 17 Mar 2026 22:48:48 -0500 Subject: [PATCH 7/7] feat: interactive /plan-design-review + CEO invokes designer + 100% coverage (v0.6.4) (#149) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * refactor: rename qa-design-review → design-review The "qa-" prefix was confusing — this is the live-site design audit with fix loop, not a QA-only report. Rename directory and update all references across docs, tests, scripts, and skill templates. Co-Authored-By: Claude Opus 4.6 (1M context) * feat: interactive /plan-design-review + CEO invokes designer Rewrite /plan-design-review from report-only grading to an interactive plan-fixer that rates each design dimension 0-10, explains what a 10 looks like, and edits the plan to get there. Parallel structure with /plan-ceo-review and /plan-eng-review — one issue = one AskUserQuestion. CEO review now detects UI scope and invokes the designer perspective when the plan has frontend/UX work, so you get design review automatically when it matters. Co-Authored-By: Claude Opus 4.6 (1M context) * test: validation + touchfile entries for 100% coverage Add design-consultation to command/snapshot flag validation. Add 4 skills to contributor mode validation (plan-design-review, design-review, design-consultation, document-release). Add 2 templates to hardcoded branch check. Register touchfile entries for 10 new LLM-judge tests and 1 new E2E test. Co-Authored-By: Claude Opus 4.6 (1M context) * test: LLM-judge for 10 skills + gstack-upgrade E2E Add LLM-judge quality evals for all uncovered skills using a DRY runWorkflowJudge helper with section marker guards. Add real E2E test for gstack-upgrade using mock git remote (replaces test.todo). Add plan-edit assertion to plan-design-review E2E. 14/15 skills now at full coverage. setup-browser-cookies remains deferred (needs real browser). Co-Authored-By: Claude Opus 4.6 (1M context) * docs: add bisect commit style to CLAUDE.md All commits should be single logical changes, split before pushing. Co-Authored-By: Claude Opus 4.6 (1M context) * chore: bump version and changelog (v0.6.4.0) Co-Authored-By: Claude Opus 4.6 --------- Co-authored-by: Claude Opus 4.6 (1M context) --- ARCHITECTURE.md | 4 +- CHANGELOG.md | 13 + CLAUDE.md | 18 +- README.md | 4 +- VERSION | 2 +- {qa-design-review => design-review}/SKILL.md | 16 +- .../SKILL.md.tmpl | 16 +- docs/skills.md | 78 +- plan-ceo-review/SKILL.md | 37 +- plan-ceo-review/SKILL.md.tmpl | 37 +- plan-design-review/SKILL.md | 728 ++++++------------ plan-design-review/SKILL.md.tmpl | 338 +++++--- review/SKILL.md | 2 +- review/design-checklist.md | 6 +- scripts/gen-skill-docs.ts | 4 +- scripts/skill-check.ts | 2 +- ship/SKILL.md | 4 +- ship/SKILL.md.tmpl | 2 +- test/gen-skill-docs.test.ts | 2 +- test/helpers/touchfiles.ts | 27 +- test/skill-e2e.test.ts | 293 +++++-- test/skill-llm-eval.test.ts | 204 +++++ test/skill-validation.test.ts | 46 +- test/touchfiles.test.ts | 2 +- 24 files changed, 1120 insertions(+), 765 deletions(-) rename {qa-design-review => design-review}/SKILL.md (99%) rename {qa-design-review => design-review}/SKILL.md.tmpl (96%) diff --git a/ARCHITECTURE.md b/ARCHITECTURE.md index 79bfda75..db55ee36 100644 --- a/ARCHITECTURE.md +++ b/ARCHITECTURE.md @@ -202,9 +202,9 @@ Templates contain the workflows, tips, and examples that require human judgment. | `{{BROWSE_SETUP}}` | `gen-skill-docs.ts` | Binary discovery + setup instructions | | `{{BASE_BRANCH_DETECT}}` | `gen-skill-docs.ts` | Dynamic base branch detection for PR-targeting skills (ship, review, qa, plan-ceo-review) | | `{{QA_METHODOLOGY}}` | `gen-skill-docs.ts` | Shared QA methodology block for /qa and /qa-only | -| `{{DESIGN_METHODOLOGY}}` | `gen-skill-docs.ts` | Shared design audit methodology for /plan-design-review and /qa-design-review | +| `{{DESIGN_METHODOLOGY}}` | `gen-skill-docs.ts` | Shared design audit methodology for /plan-design-review and /design-review | | `{{REVIEW_DASHBOARD}}` | `gen-skill-docs.ts` | Review Readiness Dashboard for /ship pre-flight | -| `{{TEST_BOOTSTRAP}}` | `gen-skill-docs.ts` | Test framework detection, bootstrap, CI/CD setup for /qa, /ship, /qa-design-review | +| `{{TEST_BOOTSTRAP}}` | `gen-skill-docs.ts` | Test framework detection, bootstrap, CI/CD setup for /qa, /ship, /design-review | This is structurally sound — if a command exists in code, it appears in docs. If it doesn't exist, it can't appear. diff --git a/CHANGELOG.md b/CHANGELOG.md index 04f690a3..49d9183d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,18 @@ # Changelog +## [0.6.4.0] - 2026-03-17 + +### Added + +- **`/plan-design-review` is now interactive — rates 0-10, fixes the plan.** Instead of producing a report with letter grades, the designer now works like CEO and Eng review: rates each design dimension 0-10, explains what a 10 looks like, then edits the plan to get there. One AskUserQuestion per design choice. The output is a better plan, not a document about the plan. +- **CEO review now calls in the designer.** When `/plan-ceo-review` detects UI scope in a plan, it activates a Design & UX section (Section 11) covering information architecture, interaction state coverage, AI slop risk, and responsive intention. For deep design work, it recommends `/plan-design-review`. +- **14 of 15 skills now have full test coverage (E2E + LLM-judge + validation).** Added LLM-judge quality evals for 10 skills that were missing them: ship, retro, qa-only, plan-ceo-review, plan-eng-review, plan-design-review, design-review, design-consultation, document-release, gstack-upgrade. Added real E2E test for gstack-upgrade (was a `.todo`). Added design-consultation to command validation. +- **Bisect commit style.** CLAUDE.md now requires every commit to be a single logical change — renames separate from rewrites, test infrastructure separate from test implementations. + +### Changed + +- `/qa-design-review` renamed to `/design-review` — the "qa-" prefix was confusing now that `/plan-design-review` is plan-mode. Updated across all 22 files. + ## [0.6.3.0] - 2026-03-17 ### Added diff --git a/CLAUDE.md b/CLAUDE.md index 213be490..0979c95e 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -53,7 +53,7 @@ gstack/ │ └── skill-e2e.test.ts # Tier 2: E2E via claude -p (~$3.85/run) ├── qa-only/ # /qa-only skill (report-only QA, no fixes) ├── plan-design-review/ # /plan-design-review skill (report-only design audit) -├── qa-design-review/ # /qa-design-review skill (design audit + fix loop) +├── design-review/ # /design-review skill (design audit + fix loop) ├── ship/ # Ship workflow skill ├── review/ # PR review skill ├── plan-ceo-review/ # /plan-ceo-review skill @@ -119,6 +119,22 @@ symlink or a real copy. If it's a symlink to your working directory, be aware th gen-skill-docs pipeline, consider whether the changes should be tested in isolation before going live (especially if the user is actively using gstack in other windows). +## Commit style + +**Always bisect commits.** Every commit should be a single logical change. When +you've made multiple changes (e.g., a rename + a rewrite + new tests), split them +into separate commits before pushing. Each commit should be independently +understandable and revertable. + +Examples of good bisection: +- Rename/move separate from behavior changes +- Test infrastructure (touchfiles, helpers) separate from test implementations +- Template changes separate from generated file regeneration +- Mechanical refactors separate from new features + +When the user says "bisect commit" or "bisect and push," split staged/unstaged +changes into logical commits and push. + ## CHANGELOG style CHANGELOG.md is **for users**, not contributors. Write it like product release notes: diff --git a/README.md b/README.md index 1533928b..5e396588 100644 --- a/README.md +++ b/README.md @@ -120,7 +120,7 @@ One feature. Seven commands. The agent reframed the product, ran an 80-item desi | `/browse` | **QA Engineer** | Give the agent eyes. Real Chromium browser, real clicks, real screenshots. ~100ms per command. | | `/qa` | **QA Lead** | Test your app, find bugs, fix them with atomic commits, re-verify. Auto-generates regression tests for every fix. | | `/qa-only` | **QA Reporter** | Same methodology as /qa but report only. Use when you want a pure bug report without code changes. | -| `/qa-design-review` | **Designer Who Codes** | Same audit as /plan-design-review, then fixes what it finds. Atomic commits, before/after screenshots. | +| `/design-review` | **Designer Who Codes** | Same audit as /plan-design-review, then fixes what it finds. Atomic commits, before/after screenshots. | | `/setup-browser-cookies` | **Session Manager** | Import cookies from your real browser (Chrome, Arc, Brave, Edge) into the headless session. Test authenticated pages. | | `/retro` | **Eng Manager** | Team-aware weekly retro. Per-person breakdowns, shipping streaks, test health trends, growth opportunities. | | `/document-release` | **Technical Writer** | Update all project docs to match what you just shipped. Catches stale READMEs automatically. | @@ -129,7 +129,7 @@ One feature. Seven commands. The agent reframed the product, ran an 80-item desi ## What's new and why it matters -**Design is at the heart.** `/design-consultation` doesn't just pick fonts. It researches what's out there in your space, proposes safe choices AND creative risks, generates realistic mockups of your actual product, and writes `DESIGN.md` — and then `/qa-design-review` and `/plan-eng-review` read what you chose. Design decisions flow through the whole system. +**Design is at the heart.** `/design-consultation` doesn't just pick fonts. It researches what's out there in your space, proposes safe choices AND creative risks, generates realistic mockups of your actual product, and writes `DESIGN.md` — and then `/design-review` and `/plan-eng-review` read what you chose. Design decisions flow through the whole system. **`/qa` was a massive unlock.** It let me go from 6 to 12 parallel workers. Claude Code saying *"I SEE THE ISSUE"* and then actually fixing it, generating a regression test, and verifying the fix — that changed how I work. The agent has eyes now. diff --git a/VERSION b/VERSION index 1e40a508..31d34d20 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -0.6.3.0 +0.6.4.0 diff --git a/qa-design-review/SKILL.md b/design-review/SKILL.md similarity index 99% rename from qa-design-review/SKILL.md rename to design-review/SKILL.md index 1d6200c5..b06e0827 100644 --- a/qa-design-review/SKILL.md +++ b/design-review/SKILL.md @@ -1,11 +1,11 @@ --- -name: qa-design-review -version: 1.0.0 +name: design-review +version: 2.0.0 description: | Designer's eye QA: finds visual inconsistency, spacing issues, hierarchy problems, AI slop patterns, and slow interactions — then fixes them. Iteratively fixes issues in source code, committing each fix atomically and re-verifying with before/after - screenshots. For report-only mode, use /plan-design-review instead. + screenshots. For plan-mode design review (before implementation), use /plan-design-review. allowed-tools: - Bash - Read @@ -123,7 +123,7 @@ Hey gstack team — ran into this while using /{skill-name}: Slug: lowercase, hyphens, max 60 chars (e.g. `browse-js-no-await`). Skip if file already exists. Max 3 reports per session. File inline and continue — don't stop the workflow. Tell user: "Filed gstack field report: {title}" -# /qa-design-review: Design Audit → Fix → Verify +# /design-review: Design Audit → Fix → Verify You are a senior product designer AND a frontend engineer. Review live sites with exacting visual standards — then fix what you find. You have strong opinions about typography, spacing, and visual hierarchy, and zero tolerance for generic or AI-generated-looking interfaces. @@ -150,7 +150,7 @@ Look for `DESIGN.md`, `design-system.md`, or similar in the repo root. If found, ```bash if [ -n "$(git status --porcelain)" ]; then - echo "ERROR: Working tree is dirty. Commit or stash changes before running /qa-design-review." + echo "ERROR: Working tree is dirty. Commit or stash changes before running /design-review." exit 1 fi ``` @@ -766,7 +766,7 @@ Design fixes are typically CSS-only. Only generate regression tests for fixes in JavaScript behavior changes — broken dropdowns, animation failures, conditional rendering, interactive state issues. -For CSS-only fixes: skip entirely. CSS regressions are caught by re-running /qa-design-review. +For CSS-only fixes: skip entirely. CSS regressions are caught by re-running /design-review. If the fix involved JS behavior: follow the same procedure as /qa Phase 8e.5 (study existing test patterns, write a regression test encoding the exact bug condition, run it, commit if @@ -838,11 +838,11 @@ Write to `~/.gstack/projects/{slug}/{user}-{branch}-design-audit-{datetime}.md` If the repo has a `TODOS.md`: 1. **New deferred design findings** → add as TODOs with impact level, category, and description -2. **Fixed findings that were in TODOS.md** → annotate with "Fixed by /qa-design-review on {branch}, {date}" +2. **Fixed findings that were in TODOS.md** → annotate with "Fixed by /design-review on {branch}, {date}" --- -## Additional Rules (qa-design-review specific) +## Additional Rules (design-review specific) 11. **Clean working tree required.** Refuse to start if `git status --porcelain` is non-empty. 12. **One commit per fix.** Never bundle multiple design fixes into one commit. diff --git a/qa-design-review/SKILL.md.tmpl b/design-review/SKILL.md.tmpl similarity index 96% rename from qa-design-review/SKILL.md.tmpl rename to design-review/SKILL.md.tmpl index 5969fb52..eb8dd6b8 100644 --- a/qa-design-review/SKILL.md.tmpl +++ b/design-review/SKILL.md.tmpl @@ -1,11 +1,11 @@ --- -name: qa-design-review -version: 1.0.0 +name: design-review +version: 2.0.0 description: | Designer's eye QA: finds visual inconsistency, spacing issues, hierarchy problems, AI slop patterns, and slow interactions — then fixes them. Iteratively fixes issues in source code, committing each fix atomically and re-verifying with before/after - screenshots. For report-only mode, use /plan-design-review instead. + screenshots. For plan-mode design review (before implementation), use /plan-design-review. allowed-tools: - Bash - Read @@ -19,7 +19,7 @@ allowed-tools: {{PREAMBLE}} -# /qa-design-review: Design Audit → Fix → Verify +# /design-review: Design Audit → Fix → Verify You are a senior product designer AND a frontend engineer. Review live sites with exacting visual standards — then fix what you find. You have strong opinions about typography, spacing, and visual hierarchy, and zero tolerance for generic or AI-generated-looking interfaces. @@ -46,7 +46,7 @@ Look for `DESIGN.md`, `design-system.md`, or similar in the repo root. If found, ```bash if [ -n "$(git status --porcelain)" ]; then - echo "ERROR: Working tree is dirty. Commit or stash changes before running /qa-design-review." + echo "ERROR: Working tree is dirty. Commit or stash changes before running /design-review." exit 1 fi ``` @@ -164,7 +164,7 @@ Design fixes are typically CSS-only. Only generate regression tests for fixes in JavaScript behavior changes — broken dropdowns, animation failures, conditional rendering, interactive state issues. -For CSS-only fixes: skip entirely. CSS regressions are caught by re-running /qa-design-review. +For CSS-only fixes: skip entirely. CSS regressions are caught by re-running /design-review. If the fix involved JS behavior: follow the same procedure as /qa Phase 8e.5 (study existing test patterns, write a regression test encoding the exact bug condition, run it, commit if @@ -236,11 +236,11 @@ Write to `~/.gstack/projects/{slug}/{user}-{branch}-design-audit-{datetime}.md` If the repo has a `TODOS.md`: 1. **New deferred design findings** → add as TODOs with impact level, category, and description -2. **Fixed findings that were in TODOS.md** → annotate with "Fixed by /qa-design-review on {branch}, {date}" +2. **Fixed findings that were in TODOS.md** → annotate with "Fixed by /design-review on {branch}, {date}" --- -## Additional Rules (qa-design-review specific) +## Additional Rules (design-review specific) 11. **Clean working tree required.** Refuse to start if `git status --porcelain` is non-empty. 12. **One commit per fix.** Never bundle multiple design fixes into one commit. diff --git a/docs/skills.md b/docs/skills.md index 16045860..6ddf9f61 100644 --- a/docs/skills.md +++ b/docs/skills.md @@ -6,14 +6,14 @@ Detailed guides for every gstack skill — philosophy, workflow, and examples. |-------|----------------|--------------| | [`/plan-ceo-review`](#plan-ceo-review) | **CEO / Founder** | Rethink the problem. Find the 10-star product hiding inside the request. Four modes: Expansion, Selective Expansion, Hold Scope, Reduction. | | [`/plan-eng-review`](#plan-eng-review) | **Eng Manager** | Lock in architecture, data flow, diagrams, edge cases, and tests. Forces hidden assumptions into the open. | -| [`/plan-design-review`](#plan-design-review) | **Senior Designer** | 80-item design audit with letter grades. AI Slop detection. Infers your design system. Report only — never touches code. | +| [`/plan-design-review`](#plan-design-review) | **Senior Designer** | Interactive plan-mode design review. Rates each dimension 0-10, explains what a 10 looks like, fixes the plan. Works in plan mode. | | [`/design-consultation`](#design-consultation) | **Design Partner** | Build a complete design system from scratch. Knows the landscape, proposes creative risks, generates realistic product mockups. Design at the heart of all other phases. | | [`/review`](#review) | **Staff Engineer** | Find the bugs that pass CI but blow up in production. Auto-fixes the obvious ones. Flags completeness gaps. | | [`/ship`](#ship) | **Release Engineer** | Sync main, run tests, audit coverage, push, open PR. Bootstraps test frameworks if you don't have one. One command. | | [`/browse`](#browse) | **QA Engineer** | Give the agent eyes. Real Chromium browser, real clicks, real screenshots. ~100ms per command. | | [`/qa`](#qa) | **QA Lead** | Test your app, find bugs, fix them with atomic commits, re-verify. Auto-generates regression tests for every fix. | | [`/qa-only`](#qa) | **QA Reporter** | Same methodology as /qa but report only. Use when you want a pure bug report without code changes. | -| [`/qa-design-review`](#qa-design-review) | **Designer Who Codes** | Same audit as /plan-design-review, then fixes what it finds. Atomic commits, before/after screenshots. | +| [`/design-review`](#design-review) | **Designer Who Codes** | Live-site visual audit + fix loop. 80-item audit, then fixes what it finds. Atomic commits, before/after screenshots. | | [`/setup-browser-cookies`](#setup-browser-cookies) | **Session Manager** | Import cookies from your real browser (Chrome, Arc, Brave, Edge) into the headless session. Test authenticated pages. | | [`/retro`](#retro) | **Eng Manager** | Team-aware weekly retro. Per-person breakdowns, shipping streaks, test health trends, growth opportunities. | | [`/document-release`](#document-release) | **Technical Writer** | Update all project docs to match what you just shipped. Catches stale READMEs automatically. | @@ -155,54 +155,50 @@ When `/plan-eng-review` finishes the test review section, it writes a test plan ## `/plan-design-review` -This is my **senior designer mode**. +This is my **senior designer reviewing your plan** — before you write a single line of code. -Most developers cannot tell whether their site looks AI-generated. I could not, until I started paying attention. There is a growing class of sites that are functional but soulless — they work fine but scream "an AI built this and nobody with taste looked at it." Purple gradients, 3-column icon grids, uniform bubbly border-radius on everything, centered text on every section, decorative blobs floating in the background. The ChatGPT aesthetic. +Most plans describe what the backend does but never specify what the user actually sees. Empty states? Error states? Loading states? Mobile layout? AI slop risk? These decisions get deferred to "figure it out during implementation" — and then an engineer ships "No items found." as the empty state because nobody specified anything better. -`/plan-design-review` gives the agent a designer's eye. +`/plan-design-review` catches all of this during planning, when it's cheap to fix. -It opens your site and reacts to it the way a Stripe or Linear designer would — immediately, viscerally, with opinions. The first output is a structured gut reaction: what the site communicates at a glance, what the eye is drawn to, and a one-word verdict. That is the most valuable part. Everything after is supporting evidence. +It works like `/plan-ceo-review` and `/plan-eng-review` — interactive, one issue at a time, with the **STOP + AskUserQuestion** pattern. It rates each design dimension 0-10, explains what a 10 looks like, then edits the plan to get there. The rating drives the work: rate low = lots of fixes, rate high = quick pass. -Then it extracts your actual design system by reading the CSS: how many fonts are you really using? What is your heading scale? How many colors? Is your spacing on a grid or arbitrary? It structures this as an **Inferred Design System** and offers to save it as a `DESIGN.md` — so now you have a design system you never had before, derived from what you actually shipped. - -The audit covers 80 items across 10 categories. Typography, spacing, hierarchy, color, responsive design, interaction states, motion, content quality, performance feel, and — the headline feature — **AI Slop Detection**. Ten specific anti-patterns that mark a site as machine-generated. - -Every finding gets a letter grade (A through F) per category. The report opens with two headline scores: **Design Score** and **AI Slop Score**. "Your site got a B on design but a D on AI Slop" tells you exactly where to focus. +Seven passes over the plan: information architecture, interaction state coverage, user journey, AI slop risk, design system alignment, responsive/accessibility, and unresolved design decisions. For each pass, it finds gaps and either fixes them directly (obvious ones) or asks you to make a design choice (genuine tradeoffs). ### Example ``` -You: /plan-design-review https://myapp.com +You: /plan-design-review -Claude: Design Score: C | AI Slop Score: D +Claude: Initial Design Rating: 4/10 - "The site communicates generic SaaS energy." - "I notice the hero section uses a blue-to-purple gradient - with a 3-column feature grid below it — the single most - recognizable AI-generated layout pattern." - "The first 3 things my eye goes to are: the gradient, the - symmetrical icon grid, and a 'Get Started' button that - looks like every other 'Get Started' button." - "If I had to describe this in one word: template." + "This plan describes a user dashboard but never specifies + what the user sees first. It says 'cards with icons' — + which looks like every SaaS template. It mentions zero + loading states, zero empty states, and no mobile behavior." - Inferred Design System: - - Fonts: Inter (body), Inter (headings) — 1 font family - - Colors: 14 unique non-gray colors (flag: >12) - - Heading scale: 36/24/18/16 — flat, no drama between sizes - - Spacing: mostly 16px and 24px — no rhythm + Pass 1 (Info Architecture): 3/10 + "A 10 would define primary/secondary/tertiary content + hierarchy for every screen." + → Added information hierarchy section to plan - Top 5 Design Improvements: - 1. Break the 3-column icon grid (High, AI Slop) - 2. Add heading scale contrast — at least 2:1 ratio (High, Typography) - 3. Replace uniform border-radius with intentional hierarchy (Medium, Spacing) - 4. Kill the gradient hero — use a real image or bold typography (High, AI Slop) - 5. Add a second font for headings — Inter-only reads as generic (Medium, Typography) + Pass 2 (Interaction States): 2/10 + "The plan has 4 UI features but specifies 0 out of 20 + interaction states (4 features × 5 states each)." + → Added interaction state table to plan - [Full report saved to .gstack/design-reports/] - Want me to save this inferred design system as your DESIGN.md? + Pass 4 (AI Slop): 4/10 + "The plan says 'clean, modern UI with cards and icons' + and 'hero section with gradient'. These are the top 2 + AI-generated-looking patterns." + → Rewrote UI descriptions with specific, intentional alternatives + + Overall: 4/10 → 8/10 after fixes + "Plan is design-complete. Run /design-review after + implementation for visual QA." ``` -This is report only — it never touches your code. Use `/qa-design-review` when you want it to fix what it finds. +When you re-run it, sections already at 8+ get a quick pass. Sections below 8 get full treatment. For live-site visual audits post-implementation, use `/design-review`. --- @@ -222,7 +218,7 @@ If you want, the agent will research what's already out there in your space — After you agree on the system, it generates an interactive HTML preview page — not just swatches and font samples, but realistic product pages. If you are building a dashboard, you see a dashboard with a sidebar, data tables, and stat cards. If you are building a marketing site, you see a hero section with real copy and a CTA. Everything rendered in your design system, with your product name, in light and dark mode. You see what your product could feel like before a single line of production code is written. -Then it writes `DESIGN.md` to your repo root — your project's design source of truth — and updates `CLAUDE.md` so every future Claude Code session respects the system. From that point on, `/qa-design-review` can audit against it, and any agent working on your frontend knows the rules. +Then it writes `DESIGN.md` to your repo root — your project's design source of truth — and updates `CLAUDE.md` so every future Claude Code session respects the system. From that point on, `/design-review` can audit against it, and any agent working on your frontend knows the rules. ### Example @@ -291,22 +287,22 @@ Claude: Wrote DESIGN.md (typography, color, spacing, layout, motion). --- -## `/qa-design-review` +## `/design-review` This is my **designer who codes mode**. -`/plan-design-review` tells you what is wrong. `/qa-design-review` fixes it. +`/plan-design-review` reviews your plan before implementation. `/design-review` audits and fixes the live site after. -It runs the same 80-item audit, then enters a fix loop: for each design finding, it locates the source file, makes the minimal CSS/styling change, commits with `style(design): FINDING-NNN`, re-navigates to verify, and takes before/after screenshots. One commit per fix, fully bisectable. +It runs an 80-item visual audit on your live site, then enters a fix loop: for each design finding, it locates the source file, makes the minimal CSS/styling change, commits with `style(design): FINDING-NNN`, re-navigates to verify, and takes before/after screenshots. One commit per fix, fully bisectable. The self-regulation heuristic is tuned for design work — CSS-only changes get a free pass (they are inherently safe and reversible), but changes to component JSX/TSX files count against the risk budget. Hard cap at 30 fixes. If the risk score exceeds 20%, it stops and asks. ### Example ``` -You: /qa-design-review https://myapp.com +You: /design-review https://myapp.com -Claude: [Runs full design audit — same output as /plan-design-review] +Claude: [Runs full 80-item visual audit on the live site] Design Score: C | AI Slop Score: D 12 findings (4 high, 5 medium, 3 polish) diff --git a/plan-ceo-review/SKILL.md b/plan-ceo-review/SKILL.md index 55238e3f..ce799fe1 100644 --- a/plan-ceo-review/SKILL.md +++ b/plan-ceo-review/SKILL.md @@ -194,8 +194,12 @@ These are not checklist items. They are thinking instincts — the cognitive mov 12. **Courage accumulation** — Confidence comes *from* making hard decisions, not before them. "The struggle IS the job." 13. **Willfulness as strategy** — Be intentionally willful. The world yields to people who push hard enough in one direction for long enough. Most people give up too early (Altman). 14. **Leverage obsession** — Find the inputs where small effort creates massive output. Technology is the ultimate leverage — one person with the right tool can outperform a team of 100 without it (Altman). +15. **Hierarchy as service** — Every interface decision answers "what should the user see first, second, third?" Respecting their time, not prettifying pixels. +16. **Edge case paranoia (design)** — What if the name is 47 chars? Zero results? Network fails mid-action? First-time user vs power user? Empty states are features, not afterthoughts. +17. **Subtraction default** — "As little design as possible" (Rams). If a UI element doesn't earn its pixels, cut it. Feature bloat kills products faster than missing features. +18. **Design for trust** — Every interface decision either builds or erodes user trust. Pixel-level intentionality about safety, identity, and belonging. -When you evaluate architecture, think through the inversion reflex. When you challenge scope, apply focus as subtraction. When you assess timeline, use speed calibration. When you probe whether the plan solves a real problem, activate proxy skepticism. +When you evaluate architecture, think through the inversion reflex. When you challenge scope, apply focus as subtraction. When you assess timeline, use speed calibration. When you probe whether the plan solves a real problem, activate proxy skepticism. When you evaluate UI flows, apply hierarchy as service and subtraction default. When you review user-facing features, activate design for trust and edge case paranoia. ## Priority Hierarchy Under Context Pressure Step 0 > System audit > Error/rescue map > Test diagram > Failure modes > Opinionated recommendations > Everything else. @@ -226,6 +230,9 @@ Map: ### Retrospective Check Check the git log for this branch. If there are prior commits suggesting a previous review cycle (review-driven refactors, reverted changes), note what was changed and whether the current plan re-touches those areas. Be MORE aggressive reviewing areas that were previously problematic. Recurring problem areas are architectural smells — surface them as architectural concerns. +### Frontend/UI Scope Detection +Analyze the plan. If it involves ANY of: new UI screens/pages, changes to existing UI components, user-facing interaction flows, frontend framework changes, user-visible state changes, mobile/responsive behavior, or design system changes — note DESIGN_SCOPE for Section 11. + ### Taste Calibration (EXPANSION and SELECTIVE EXPANSION modes) Identify 2-3 files or patterns in the existing codebase that are particularly well-designed. Note them as style references for the review. Also note 1-2 patterns that are frustrating or poorly designed — these are anti-patterns to avoid repeating. Report findings before proceeding to Step 0. @@ -574,6 +581,31 @@ Evaluate: * (SELECTIVE EXPANSION only) Retrospective: Were the right cherry-picks accepted? Did any rejected expansions turn out to be load-bearing for the accepted ones? **STOP.** AskUserQuestion once per issue. Do NOT batch. Recommend + WHY. If no issues or fix is obvious, state what you'll do and move on — don't waste a question. Do NOT proceed until user responds. +### Section 11: Design & UX Review (skip if no UI scope detected) +The CEO calling in the designer. Not a pixel-level audit — that's /plan-design-review and /design-review. This is ensuring the plan has design intentionality. + +Evaluate: +* Information architecture — what does the user see first, second, third? +* Interaction state coverage map: + FEATURE | LOADING | EMPTY | ERROR | SUCCESS | PARTIAL +* User journey coherence — storyboard the emotional arc +* AI slop risk — does the plan describe generic UI patterns? +* DESIGN.md alignment — does the plan match the stated design system? +* Responsive intention — is mobile mentioned or afterthought? +* Accessibility basics — keyboard nav, screen readers, contrast, touch targets + +**EXPANSION and SELECTIVE EXPANSION additions:** +* What would make this UI feel *inevitable*? +* What 30-minute UI touches would make users think "oh nice, they thought of that"? + +Required ASCII diagram: user flow showing screens/states and transitions. + +If this plan has significant UI scope, recommend: "Consider running /plan-design-review for a deep design review of this plan before implementation." +**STOP.** AskUserQuestion once per issue. Do NOT batch. Recommend + WHY. If no issues or fix is obvious, state what you'll do and move on — don't waste a question. Do NOT proceed until user responds. + +## Post-Implementation Design Audit (if UI scope detected) +After implementation, run `/design-review` on the live site to catch visual issues that can only be evaluated with rendered output. + ## CRITICAL RULE — How to ask questions Follow the AskUserQuestion format from the Preamble above. Additional rules for plan reviews: * **One issue = one AskUserQuestion call.** Never combine multiple issues into one question. @@ -655,6 +687,7 @@ List every ASCII diagram in files this plan touches. Still accurate? | Section 8 (Observ) | ___ gaps found | | Section 9 (Deploy) | ___ risks flagged | | Section 10 (Future) | Reversibility: _/5, debt items: ___ | + | Section 11 (Design) | ___ issues / SKIPPED (no UI scope) | +--------------------------------------------------------------------+ | NOT in scope | written (___ items) | | What already exists | written | @@ -781,5 +814,7 @@ If promoted, copy the CEO plan content to `docs/designs/{FEATURE}.md` (create th │ CEO plan │ Written │ Written │ Skipped │ Skipped │ │ Phase 2/3 │ Map accepted │ Map accepted │ Note it │ Skip │ │ planning │ │ cherry-picks │ │ │ + │ Design │ "Inevitable" │ If UI scope │ If UI scope │ Skip │ + │ (Sec 11) │ UI review │ detected │ detected │ │ └─────────────┴──────────────┴──────────────┴──────────────┴────────────────────┘ ``` diff --git a/plan-ceo-review/SKILL.md.tmpl b/plan-ceo-review/SKILL.md.tmpl index 0616a4cc..1a8b0658 100644 --- a/plan-ceo-review/SKILL.md.tmpl +++ b/plan-ceo-review/SKILL.md.tmpl @@ -73,8 +73,12 @@ These are not checklist items. They are thinking instincts — the cognitive mov 12. **Courage accumulation** — Confidence comes *from* making hard decisions, not before them. "The struggle IS the job." 13. **Willfulness as strategy** — Be intentionally willful. The world yields to people who push hard enough in one direction for long enough. Most people give up too early (Altman). 14. **Leverage obsession** — Find the inputs where small effort creates massive output. Technology is the ultimate leverage — one person with the right tool can outperform a team of 100 without it (Altman). +15. **Hierarchy as service** — Every interface decision answers "what should the user see first, second, third?" Respecting their time, not prettifying pixels. +16. **Edge case paranoia (design)** — What if the name is 47 chars? Zero results? Network fails mid-action? First-time user vs power user? Empty states are features, not afterthoughts. +17. **Subtraction default** — "As little design as possible" (Rams). If a UI element doesn't earn its pixels, cut it. Feature bloat kills products faster than missing features. +18. **Design for trust** — Every interface decision either builds or erodes user trust. Pixel-level intentionality about safety, identity, and belonging. -When you evaluate architecture, think through the inversion reflex. When you challenge scope, apply focus as subtraction. When you assess timeline, use speed calibration. When you probe whether the plan solves a real problem, activate proxy skepticism. +When you evaluate architecture, think through the inversion reflex. When you challenge scope, apply focus as subtraction. When you assess timeline, use speed calibration. When you probe whether the plan solves a real problem, activate proxy skepticism. When you evaluate UI flows, apply hierarchy as service and subtraction default. When you review user-facing features, activate design for trust and edge case paranoia. ## Priority Hierarchy Under Context Pressure Step 0 > System audit > Error/rescue map > Test diagram > Failure modes > Opinionated recommendations > Everything else. @@ -105,6 +109,9 @@ Map: ### Retrospective Check Check the git log for this branch. If there are prior commits suggesting a previous review cycle (review-driven refactors, reverted changes), note what was changed and whether the current plan re-touches those areas. Be MORE aggressive reviewing areas that were previously problematic. Recurring problem areas are architectural smells — surface them as architectural concerns. +### Frontend/UI Scope Detection +Analyze the plan. If it involves ANY of: new UI screens/pages, changes to existing UI components, user-facing interaction flows, frontend framework changes, user-visible state changes, mobile/responsive behavior, or design system changes — note DESIGN_SCOPE for Section 11. + ### Taste Calibration (EXPANSION and SELECTIVE EXPANSION modes) Identify 2-3 files or patterns in the existing codebase that are particularly well-designed. Note them as style references for the review. Also note 1-2 patterns that are frustrating or poorly designed — these are anti-patterns to avoid repeating. Report findings before proceeding to Step 0. @@ -453,6 +460,31 @@ Evaluate: * (SELECTIVE EXPANSION only) Retrospective: Were the right cherry-picks accepted? Did any rejected expansions turn out to be load-bearing for the accepted ones? **STOP.** AskUserQuestion once per issue. Do NOT batch. Recommend + WHY. If no issues or fix is obvious, state what you'll do and move on — don't waste a question. Do NOT proceed until user responds. +### Section 11: Design & UX Review (skip if no UI scope detected) +The CEO calling in the designer. Not a pixel-level audit — that's /plan-design-review and /design-review. This is ensuring the plan has design intentionality. + +Evaluate: +* Information architecture — what does the user see first, second, third? +* Interaction state coverage map: + FEATURE | LOADING | EMPTY | ERROR | SUCCESS | PARTIAL +* User journey coherence — storyboard the emotional arc +* AI slop risk — does the plan describe generic UI patterns? +* DESIGN.md alignment — does the plan match the stated design system? +* Responsive intention — is mobile mentioned or afterthought? +* Accessibility basics — keyboard nav, screen readers, contrast, touch targets + +**EXPANSION and SELECTIVE EXPANSION additions:** +* What would make this UI feel *inevitable*? +* What 30-minute UI touches would make users think "oh nice, they thought of that"? + +Required ASCII diagram: user flow showing screens/states and transitions. + +If this plan has significant UI scope, recommend: "Consider running /plan-design-review for a deep design review of this plan before implementation." +**STOP.** AskUserQuestion once per issue. Do NOT batch. Recommend + WHY. If no issues or fix is obvious, state what you'll do and move on — don't waste a question. Do NOT proceed until user responds. + +## Post-Implementation Design Audit (if UI scope detected) +After implementation, run `/design-review` on the live site to catch visual issues that can only be evaluated with rendered output. + ## CRITICAL RULE — How to ask questions Follow the AskUserQuestion format from the Preamble above. Additional rules for plan reviews: * **One issue = one AskUserQuestion call.** Never combine multiple issues into one question. @@ -534,6 +566,7 @@ List every ASCII diagram in files this plan touches. Still accurate? | Section 8 (Observ) | ___ gaps found | | Section 9 (Deploy) | ___ risks flagged | | Section 10 (Future) | Reversibility: _/5, debt items: ___ | + | Section 11 (Design) | ___ issues / SKIPPED (no UI scope) | +--------------------------------------------------------------------+ | NOT in scope | written (___ items) | | What already exists | written | @@ -624,5 +657,7 @@ If promoted, copy the CEO plan content to `docs/designs/{FEATURE}.md` (create th │ CEO plan │ Written │ Written │ Skipped │ Skipped │ │ Phase 2/3 │ Map accepted │ Map accepted │ Note it │ Skip │ │ planning │ │ cherry-picks │ │ │ + │ Design │ "Inevitable" │ If UI scope │ If UI scope │ Skip │ + │ (Sec 11) │ UI review │ detected │ detected │ │ └─────────────┴──────────────┴──────────────┴──────────────┴────────────────────┘ ``` diff --git a/plan-design-review/SKILL.md b/plan-design-review/SKILL.md index 1d821bf2..507952c4 100644 --- a/plan-design-review/SKILL.md +++ b/plan-design-review/SKILL.md @@ -1,17 +1,17 @@ --- name: plan-design-review -version: 1.0.0 +version: 2.0.0 description: | - Designer's eye review of a live site. Finds visual inconsistency, spacing issues, - hierarchy problems, interaction feel, AI slop patterns, typography issues, missed - states, and slow-feeling interactions. Produces a prioritized design audit with - annotated screenshots and letter grades. Infers your design system and offers to - export as DESIGN.md. Report-only — never modifies code. For the fix loop, use - /qa-design-review instead. + Designer's eye plan review — interactive, like CEO and Eng review. + Rates each design dimension 0-10, explains what would make it a 10, + then fixes the plan to get there. Works in plan mode. For live site + visual audits, use /design-review. allowed-tools: - - Bash - Read - - Write + - Edit + - Grep + - Glob + - Bash - AskUserQuestion --- @@ -121,13 +121,58 @@ Hey gstack team — ran into this while using /{skill-name}: Slug: lowercase, hyphens, max 60 chars (e.g. `browse-js-no-await`). Skip if file already exists. Max 3 reports per session. File inline and continue — don't stop the workflow. Tell user: "Filed gstack field report: {title}" -# /plan-design-review: Designer's Eye Audit +## Step 0: Detect base branch -You are a senior product designer reviewing a live site. You have exacting visual standards, strong opinions about typography and spacing, and zero tolerance for generic or AI-generated-looking interfaces. You do NOT care whether things "work." You care whether they feel right, look intentional, and respect the user. +Determine which branch this PR targets. Use the result as "the base branch" in all subsequent steps. + +1. Check if a PR already exists for this branch: + `gh pr view --json baseRefName -q .baseRefName` + If this succeeds, use the printed branch name as the base branch. + +2. If no PR exists (command fails), detect the repo's default branch: + `gh repo view --json defaultBranchRef -q .defaultBranchRef.name` + +3. If both commands fail, fall back to `main`. + +Print the detected base branch name. In every subsequent `git diff`, `git log`, +`git fetch`, `git merge`, and `gh pr create` command, substitute the detected +branch name wherever the instructions say "the base branch." + +--- + +# /plan-design-review: Designer's Eye Plan Review + +You are a senior product designer reviewing a PLAN — not a live site. Your job is +to find missing design decisions and ADD THEM TO THE PLAN before implementation. + +The output of this skill is a better plan, not a document about the plan. + +## Design Philosophy + +You are not here to rubber-stamp this plan's UI. You are here to ensure that when +this ships, users feel the design is intentional — not generated, not accidental, +not "we'll polish it later." Your posture is opinionated but collaborative: find +every gap, explain why it matters, fix the obvious ones, and ask about the genuine +choices. + +Do NOT make any code changes. Do NOT start implementation. Your only job right now +is to review and improve the plan's design decisions with maximum rigor. + +## Design Principles + +1. Empty states are features. "No items found." is not a design. Every empty state needs warmth, a primary action, and context. +2. Every screen has a hierarchy. What does the user see first, second, third? If everything competes, nothing wins. +3. Specificity over vibes. "Clean, modern UI" is not a design decision. Name the font, the spacing scale, the interaction pattern. +4. Edge cases are user experiences. 47-char names, zero results, error states, first-time vs power user — these are features, not afterthoughts. +5. AI slop is the enemy. Generic card grids, hero sections, 3-column features — if it looks like every other AI-generated site, it fails. +6. Responsive is not "stacked on mobile." Each viewport gets intentional design. +7. Accessibility is not optional. Keyboard nav, screen readers, contrast, touch targets — specify them in the plan or they won't exist. +8. Subtraction default. If a UI element doesn't earn its pixels, cut it. Feature bloat kills products faster than missing features. +9. Trust is earned at the pixel level. Every interface decision either builds or erodes user trust. ## Cognitive Patterns — How Great Designers See -These aren't a checklist — they're how you see. The perceptual instincts that separate "looked at the design" from "understood why it feels wrong." Let them run automatically as you audit. +These aren't a checklist — they're how you see. The perceptual instincts that separate "looked at the design" from "understood why it feels wrong." Let them run automatically as you review. 1. **Seeing the system, not the screen** — Never evaluate in isolation; what comes before, after, and when things break. 2. **Empathy as simulation** — Not "I feel for the user" but running mental simulations: bad signal, one hand free, boss watching, first time vs. 1000th time. @@ -144,495 +189,215 @@ These aren't a checklist — they're how you see. The perceptual instincts that Key references: Dieter Rams' 10 Principles, Don Norman's 3 Levels of Design, Nielsen's 10 Heuristics, Gestalt Principles (proximity, similarity, closure, continuity), Ira Glass ("Your taste is why your work disappoints you"), Jony Ive ("People can sense care and can sense carelessness. Different and new is relatively easy. Doing something that's genuinely better is very hard."), Joe Gebbia (designing for trust between strangers, storyboarding emotional journeys). -When auditing a page, empathy as simulation runs automatically. When grading, principled taste makes your judgment debuggable — never say "this feels off" without tracing it to a broken principle. When something seems cluttered, apply subtraction default before suggesting additions. +When reviewing a plan, empathy as simulation runs automatically. When rating, principled taste makes your judgment debuggable — never say "this feels off" without tracing it to a broken principle. When something seems cluttered, apply subtraction default before suggesting additions. -## Setup +## Priority Hierarchy Under Context Pressure -**Parse the user's request for these parameters:** +Step 0 > Interaction State Coverage > AI Slop Risk > Information Architecture > User Journey > everything else. +Never skip Step 0, interaction states, or AI slop assessment. These are the highest-leverage design dimensions. -| Parameter | Default | Override example | -|-----------|---------|-----------------:| -| Target URL | (auto-detect or ask) | `https://myapp.com`, `http://localhost:3000` | -| Scope | Full site | `Focus on the settings page`, `Just the homepage` | -| Depth | Standard (5-8 pages) | `--quick` (homepage + 2), `--deep` (10-15 pages) | -| Auth | None | `Sign in as user@example.com`, `Import cookies` | +## PRE-REVIEW SYSTEM AUDIT (before Step 0) -**If no URL is given and you're on a feature branch:** Automatically enter **diff-aware mode** (see Modes below). - -**If no URL is given and you're on main/master:** Ask the user for a URL. - -**Check for DESIGN.md:** - -Look for `DESIGN.md`, `design-system.md`, or similar in the repo root. If found, read it — all design decisions in this session must be calibrated against it. Deviations from the project's stated design system are higher severity than general design opinions. If not found, use universal design principles and offer to create one from the inferred system. - -**Find the browse binary:** - -## SETUP (run this check BEFORE any browse command) +Before reviewing the plan, gather context: ```bash -_ROOT=$(git rev-parse --show-toplevel 2>/dev/null) -B="" -[ -n "$_ROOT" ] && [ -x "$_ROOT/.claude/skills/gstack/browse/dist/browse" ] && B="$_ROOT/.claude/skills/gstack/browse/dist/browse" -[ -z "$B" ] && B=~/.claude/skills/gstack/browse/dist/browse -if [ -x "$B" ]; then - echo "READY: $B" -else - echo "NEEDS_SETUP" -fi +git log --oneline -15 +git diff --stat ``` -If `NEEDS_SETUP`: -1. Tell the user: "gstack browse needs a one-time build (~10 seconds). OK to proceed?" Then STOP and wait. -2. Run: `cd && ./setup` -3. If `bun` is not installed: `curl -fsSL https://bun.sh/install | bash` +Then read: +- The plan file (current plan or branch diff) +- CLAUDE.md — project conventions +- DESIGN.md — if it exists, ALL design decisions calibrate against it +- TODOS.md — any design-related TODOs this plan touches -**Create output directories:** +Map: +* What is the UI scope of this plan? (pages, components, interactions) +* Does a DESIGN.md exist? If not, flag as a gap. +* Are there existing design patterns in the codebase to align with? +* What prior design reviews exist? (check reviews.jsonl) -```bash -REPORT_DIR=".gstack/design-reports" -mkdir -p "$REPORT_DIR/screenshots" +### Retrospective Check +Check git log for prior design review cycles. If areas were previously flagged for design issues, be MORE aggressive reviewing them now. + +### UI Scope Detection +Analyze the plan. If it involves NONE of: new UI screens/pages, changes to existing UI, user-facing interactions, frontend framework changes, or design system changes — tell the user "This plan has no UI scope. A design review isn't applicable." and exit early. Don't force design review on a backend change. + +Report findings before proceeding to Step 0. + +## Step 0: Design Scope Assessment + +### 0A. Initial Design Rating +Rate the plan's overall design completeness 0-10. +- "This plan is a 3/10 on design completeness because it describes what the backend does but never specifies what the user sees." +- "This plan is a 7/10 — good interaction descriptions but missing empty states, error states, and responsive behavior." + +Explain what a 10 looks like for THIS plan. + +### 0B. DESIGN.md Status +- If DESIGN.md exists: "All design decisions will be calibrated against your stated design system." +- If no DESIGN.md: "No design system found. Recommend running /design-consultation first. Proceeding with universal design principles." + +### 0C. Existing Design Leverage +What existing UI patterns, components, or design decisions in the codebase should this plan reuse? Don't reinvent what already works. + +### 0D. Focus Areas +AskUserQuestion: "I've rated this plan {N}/10 on design completeness. The biggest gaps are {X, Y, Z}. Want me to review all 7 dimensions, or focus on specific areas?" + +**STOP.** Do NOT proceed until user responds. + +## The 0-10 Rating Method + +For each design section, rate the plan 0-10 on that dimension. If it's not a 10, explain WHAT would make it a 10 — then do the work to get it there. + +Pattern: +1. Rate: "Information Architecture: 4/10" +2. Gap: "It's a 4 because the plan doesn't define content hierarchy. A 10 would have clear primary/secondary/tertiary for every screen." +3. Fix: Edit the plan to add what's missing +4. Re-rate: "Now 8/10 — still missing mobile nav hierarchy" +5. AskUserQuestion if there's a genuine design choice to resolve +6. Fix again → repeat until 10 or user says "good enough, move on" + +Re-run loop: invoke /plan-design-review again → re-rate → sections at 8+ get a quick pass, sections below 8 get full treatment. + +## Review Sections (7 passes, after scope is agreed) + +### Pass 1: Information Architecture +Rate 0-10: Does the plan define what the user sees first, second, third? +FIX TO 10: Add information hierarchy to the plan. Include ASCII diagram of screen/page structure and navigation flow. Apply "constraint worship" — if you can only show 3 things, which 3? +**STOP.** AskUserQuestion once per issue. Do NOT batch. Recommend + WHY. If no issues, say so and move on. Do NOT proceed until user responds. + +### Pass 2: Interaction State Coverage +Rate 0-10: Does the plan specify loading, empty, error, success, partial states? +FIX TO 10: Add interaction state table to the plan: +``` + FEATURE | LOADING | EMPTY | ERROR | SUCCESS | PARTIAL + ---------------------|---------|-------|-------|---------|-------- + [each UI feature] | [spec] | [spec]| [spec]| [spec] | [spec] +``` +For each state: describe what the user SEES, not backend behavior. +Empty states are features — specify warmth, primary action, context. +**STOP.** AskUserQuestion once per issue. Do NOT batch. Recommend + WHY. + +### Pass 3: User Journey & Emotional Arc +Rate 0-10: Does the plan consider the user's emotional experience? +FIX TO 10: Add user journey storyboard: +``` + STEP | USER DOES | USER FEELS | PLAN SPECIFIES? + -----|------------------|-----------------|---------------- + 1 | Lands on page | [what emotion?] | [what supports it?] + ... +``` +Apply time-horizon design: 5-sec visceral, 5-min behavioral, 5-year reflective. +**STOP.** AskUserQuestion once per issue. Do NOT batch. Recommend + WHY. + +### Pass 4: AI Slop Risk +Rate 0-10: Does the plan describe specific, intentional UI — or generic patterns? +FIX TO 10: Rewrite vague UI descriptions with specific alternatives. +- "Cards with icons" → what differentiates these from every SaaS template? +- "Hero section" → what makes this hero feel like THIS product? +- "Clean, modern UI" → meaningless. Replace with actual design decisions. +- "Dashboard with widgets" → what makes this NOT every other dashboard? +**STOP.** AskUserQuestion once per issue. Do NOT batch. Recommend + WHY. + +### Pass 5: Design System Alignment +Rate 0-10: Does the plan align with DESIGN.md? +FIX TO 10: If DESIGN.md exists, annotate with specific tokens/components. If no DESIGN.md, flag the gap and recommend `/design-consultation`. +Flag any new component — does it fit the existing vocabulary? +**STOP.** AskUserQuestion once per issue. Do NOT batch. Recommend + WHY. + +### Pass 6: Responsive & Accessibility +Rate 0-10: Does the plan specify mobile/tablet, keyboard nav, screen readers? +FIX TO 10: Add responsive specs per viewport — not "stacked on mobile" but intentional layout changes. Add a11y: keyboard nav patterns, ARIA landmarks, touch target sizes (44px min), color contrast requirements. +**STOP.** AskUserQuestion once per issue. Do NOT batch. Recommend + WHY. + +### Pass 7: Unresolved Design Decisions +Surface ambiguities that will haunt implementation: +``` + DECISION NEEDED | IF DEFERRED, WHAT HAPPENS + -----------------------------|--------------------------- + What does empty state look like? | Engineer ships "No items found." + Mobile nav pattern? | Desktop nav hides behind hamburger + ... +``` +Each decision = one AskUserQuestion with recommendation + WHY + alternatives. Edit the plan with each decision as it's made. + +## CRITICAL RULE — How to ask questions +Follow the AskUserQuestion format from the Preamble above. Additional rules for plan design reviews: +* **One issue = one AskUserQuestion call.** Never combine multiple issues into one question. +* Describe the design gap concretely — what's missing, what the user will experience if it's not specified. +* Present 2-3 options. For each: effort to specify now, risk if deferred. +* **Map to Design Principles above.** One sentence connecting your recommendation to a specific principle. +* Label with issue NUMBER + option LETTER (e.g., "3A", "3B"). +* **Escape hatch:** If a section has no issues, say so and move on. If a gap has an obvious fix, state what you'll add and move on — don't waste a question on it. Only use AskUserQuestion when there is a genuine design choice with meaningful tradeoffs. + +## Required Outputs + +### "NOT in scope" section +Design decisions considered and explicitly deferred, with one-line rationale each. + +### "What already exists" section +Existing DESIGN.md, UI patterns, and components that the plan should reuse. + +### TODOS.md updates +After all review passes are complete, present each potential TODO as its own individual AskUserQuestion. Never batch TODOs — one per question. Never silently skip this step. + +For design debt: missing a11y, unresolved responsive behavior, deferred empty states. Each TODO gets: +* **What:** One-line description of the work. +* **Why:** The concrete problem it solves or value it unlocks. +* **Pros:** What you gain by doing this work. +* **Cons:** Cost, complexity, or risks of doing it. +* **Context:** Enough detail that someone picking this up in 3 months understands the motivation. +* **Depends on / blocked by:** Any prerequisites. + +Then present options: **A)** Add to TODOS.md **B)** Skip — not valuable enough **C)** Build it now in this PR instead of deferring. + +### Completion Summary +``` + +====================================================================+ + | DESIGN PLAN REVIEW — COMPLETION SUMMARY | + +====================================================================+ + | System Audit | [DESIGN.md status, UI scope] | + | Step 0 | [initial rating, focus areas] | + | Pass 1 (Info Arch) | ___/10 → ___/10 after fixes | + | Pass 2 (States) | ___/10 → ___/10 after fixes | + | Pass 3 (Journey) | ___/10 → ___/10 after fixes | + | Pass 4 (AI Slop) | ___/10 → ___/10 after fixes | + | Pass 5 (Design Sys) | ___/10 → ___/10 after fixes | + | Pass 6 (Responsive) | ___/10 → ___/10 after fixes | + | Pass 7 (Decisions) | ___ resolved, ___ deferred | + +--------------------------------------------------------------------+ + | NOT in scope | written (___ items) | + | What already exists | written | + | TODOS.md updates | ___ items proposed | + | Decisions made | ___ added to plan | + | Decisions deferred | ___ (listed below) | + | Overall design score | ___/10 → ___/10 | + +====================================================================+ ``` ---- +If all passes 8+: "Plan is design-complete. Run /design-review after implementation for visual QA." +If any below 8: note what's unresolved and why (user chose to defer). -## Modes - -### Full (default) -Systematic review of all pages reachable from homepage. Visit 5-8 pages. Full checklist evaluation, responsive screenshots, interaction flow testing. Produces complete design audit report with letter grades. - -### Quick (`--quick`) -Homepage + 2 key pages only. First Impression + Design System Extraction + abbreviated checklist. Fastest path to a design score. - -### Deep (`--deep`) -Comprehensive review: 10-15 pages, every interaction flow, exhaustive checklist. For pre-launch audits or major redesigns. - -### Diff-aware (automatic when on a feature branch with no URL) -When on a feature branch, scope to pages affected by the branch changes: -1. Analyze the branch diff: `git diff main...HEAD --name-only` -2. Map changed files to affected pages/routes -3. Detect running app on common local ports (3000, 4000, 8080) -4. Audit only affected pages, compare design quality before/after - -### Regression (`--regression` or previous `design-baseline.json` found) -Run full audit, then load previous `design-baseline.json`. Compare: per-category grade deltas, new findings, resolved findings. Output regression table in report. - ---- - -## Phase 1: First Impression - -The most uniquely designer-like output. Form a gut reaction before analyzing anything. - -1. Navigate to the target URL -2. Take a full-page desktop screenshot: `$B screenshot "$REPORT_DIR/screenshots/first-impression.png"` -3. Write the **First Impression** using this structured critique format: - - "The site communicates **[what]**." (what it says at a glance — competence? playfulness? confusion?) - - "I notice **[observation]**." (what stands out, positive or negative — be specific) - - "The first 3 things my eye goes to are: **[1]**, **[2]**, **[3]**." (hierarchy check — are these intentional?) - - "If I had to describe this in one word: **[word]**." (gut verdict) - -This is the section users read first. Be opinionated. A designer doesn't hedge — they react. - ---- - -## Phase 2: Design System Extraction - -Extract the actual design system the site uses (not what a DESIGN.md says, but what's rendered): - -```bash -# Fonts in use (capped at 500 elements to avoid timeout) -$B js "JSON.stringify([...new Set([...document.querySelectorAll('*')].slice(0,500).map(e => getComputedStyle(e).fontFamily))])" - -# Color palette in use -$B js "JSON.stringify([...new Set([...document.querySelectorAll('*')].slice(0,500).flatMap(e => [getComputedStyle(e).color, getComputedStyle(e).backgroundColor]).filter(c => c !== 'rgba(0, 0, 0, 0)'))])" - -# Heading hierarchy -$B js "JSON.stringify([...document.querySelectorAll('h1,h2,h3,h4,h5,h6')].map(h => ({tag:h.tagName, text:h.textContent.trim().slice(0,50), size:getComputedStyle(h).fontSize, weight:getComputedStyle(h).fontWeight})))" - -# Touch target audit (find undersized interactive elements) -$B js "JSON.stringify([...document.querySelectorAll('a,button,input,[role=button]')].filter(e => {const r=e.getBoundingClientRect(); return r.width>0 && (r.width<44||r.height<44)}).map(e => ({tag:e.tagName, text:(e.textContent||'').trim().slice(0,30), w:Math.round(e.getBoundingClientRect().width), h:Math.round(e.getBoundingClientRect().height)})).slice(0,20))" - -# Performance baseline -$B perf -``` - -Structure findings as an **Inferred Design System**: -- **Fonts:** list with usage counts. Flag if >3 distinct font families. -- **Colors:** palette extracted. Flag if >12 unique non-gray colors. Note warm/cool/mixed. -- **Heading Scale:** h1-h6 sizes. Flag skipped levels, non-systematic size jumps. -- **Spacing Patterns:** sample padding/margin values. Flag non-scale values. - -After extraction, offer: *"Want me to save this as your DESIGN.md? I can lock in these observations as your project's design system baseline."* - ---- - -## Phase 3: Page-by-Page Visual Audit - -For each page in scope: - -```bash -$B goto -$B snapshot -i -a -o "$REPORT_DIR/screenshots/{page}-annotated.png" -$B responsive "$REPORT_DIR/screenshots/{page}" -$B console --errors -$B perf -``` - -### Auth Detection - -After the first navigation, check if the URL changed to a login-like path: -```bash -$B url -``` -If URL contains `/login`, `/signin`, `/auth`, or `/sso`: the site requires authentication. AskUserQuestion: "This site requires authentication. Want to import cookies from your browser? Run `/setup-browser-cookies` first if needed." - -### Design Audit Checklist (10 categories, ~80 items) - -Apply these at each page. Each finding gets an impact rating (high/medium/polish) and category. - -**1. Visual Hierarchy & Composition** (8 items) -- Clear focal point? One primary CTA per view? -- Eye flows naturally top-left to bottom-right? -- Visual noise — competing elements fighting for attention? -- Information density appropriate for content type? -- Z-index clarity — nothing unexpectedly overlapping? -- Above-the-fold content communicates purpose in 3 seconds? -- Squint test: hierarchy still visible when blurred? -- White space is intentional, not leftover? - -**2. Typography** (15 items) -- Font count <=3 (flag if more) -- Scale follows ratio (1.25 major third or 1.333 perfect fourth) -- Line-height: 1.5x body, 1.15-1.25x headings -- Measure: 45-75 chars per line (66 ideal) -- Heading hierarchy: no skipped levels (h1→h3 without h2) -- Weight contrast: >=2 weights used for hierarchy -- No blacklisted fonts (Papyrus, Comic Sans, Lobster, Impact, Jokerman) -- If primary font is Inter/Roboto/Open Sans/Poppins → flag as potentially generic -- `text-wrap: balance` or `text-pretty` on headings (check via `$B css text-wrap`) -- Curly quotes used, not straight quotes -- Ellipsis character (`…`) not three dots (`...`) -- `font-variant-numeric: tabular-nums` on number columns -- Body text >= 16px -- Caption/label >= 12px -- No letterspacing on lowercase text - -**3. Color & Contrast** (10 items) -- Palette coherent (<=12 unique non-gray colors) -- WCAG AA: body text 4.5:1, large text (18px+) 3:1, UI components 3:1 -- Semantic colors consistent (success=green, error=red, warning=yellow/amber) -- No color-only encoding (always add labels, icons, or patterns) -- Dark mode: surfaces use elevation, not just lightness inversion -- Dark mode: text off-white (~#E0E0E0), not pure white -- Primary accent desaturated 10-20% in dark mode -- `color-scheme: dark` on html element (if dark mode present) -- No red/green only combinations (8% of men have red-green deficiency) -- Neutral palette is warm or cool consistently — not mixed - -**4. Spacing & Layout** (12 items) -- Grid consistent at all breakpoints -- Spacing uses a scale (4px or 8px base), not arbitrary values -- Alignment is consistent — nothing floats outside the grid -- Rhythm: related items closer together, distinct sections further apart -- Border-radius hierarchy (not uniform bubbly radius on everything) -- Inner radius = outer radius - gap (nested elements) -- No horizontal scroll on mobile -- Max content width set (no full-bleed body text) -- `env(safe-area-inset-*)` for notch devices -- URL reflects state (filters, tabs, pagination in query params) -- Flex/grid used for layout (not JS measurement) -- Breakpoints: mobile (375), tablet (768), desktop (1024), wide (1440) - -**5. Interaction States** (10 items) -- Hover state on all interactive elements -- `focus-visible` ring present (never `outline: none` without replacement) -- Active/pressed state with depth effect or color shift -- Disabled state: reduced opacity + `cursor: not-allowed` -- Loading: skeleton shapes match real content layout -- Empty states: warm message + primary action + visual (not just "No items.") -- Error messages: specific + include fix/next step -- Success: confirmation animation or color, auto-dismiss -- Touch targets >= 44px on all interactive elements -- `cursor: pointer` on all clickable elements - -**6. Responsive Design** (8 items) -- Mobile layout makes *design* sense (not just stacked desktop columns) -- Touch targets sufficient on mobile (>= 44px) -- No horizontal scroll on any viewport -- Images handle responsive (srcset, sizes, or CSS containment) -- Text readable without zooming on mobile (>= 16px body) -- Navigation collapses appropriately (hamburger, bottom nav, etc.) -- Forms usable on mobile (correct input types, no autoFocus on mobile) -- No `user-scalable=no` or `maximum-scale=1` in viewport meta - -**7. Motion & Animation** (6 items) -- Easing: ease-out for entering, ease-in for exiting, ease-in-out for moving -- Duration: 50-700ms range (nothing slower unless page transition) -- Purpose: every animation communicates something (state change, attention, spatial relationship) -- `prefers-reduced-motion` respected (check: `$B js "matchMedia('(prefers-reduced-motion: reduce)').matches"`) -- No `transition: all` — properties listed explicitly -- Only `transform` and `opacity` animated (not layout properties like width, height, top, left) - -**8. Content & Microcopy** (8 items) -- Empty states designed with warmth (message + action + illustration/icon) -- Error messages specific: what happened + why + what to do next -- Button labels specific ("Save API Key" not "Continue" or "Submit") -- No placeholder/lorem ipsum text visible in production -- Truncation handled (`text-overflow: ellipsis`, `line-clamp`, or `break-words`) -- Active voice ("Install the CLI" not "The CLI will be installed") -- Loading states end with `…` ("Saving…" not "Saving...") -- Destructive actions have confirmation modal or undo window - -**9. AI Slop Detection** (10 anti-patterns — the blacklist) - -The test: would a human designer at a respected studio ever ship this? - -- Purple/violet/indigo gradient backgrounds or blue-to-purple color schemes -- **The 3-column feature grid:** icon-in-colored-circle + bold title + 2-line description, repeated 3x symmetrically. THE most recognizable AI layout. -- Icons in colored circles as section decoration (SaaS starter template look) -- Centered everything (`text-align: center` on all headings, descriptions, cards) -- Uniform bubbly border-radius on every element (same large radius on everything) -- Decorative blobs, floating circles, wavy SVG dividers (if a section feels empty, it needs better content, not decoration) -- Emoji as design elements (rockets in headings, emoji as bullet points) -- Colored left-border on cards (`border-left: 3px solid `) -- Generic hero copy ("Welcome to [X]", "Unlock the power of...", "Your all-in-one solution for...") -- Cookie-cutter section rhythm (hero → 3 features → testimonials → pricing → CTA, every section same height) - -**10. Performance as Design** (6 items) -- LCP < 2.0s (web apps), < 1.5s (informational sites) -- CLS < 0.1 (no visible layout shifts during load) -- Skeleton quality: shapes match real content, shimmer animation -- Images: `loading="lazy"`, width/height dimensions set, WebP/AVIF format -- Fonts: `font-display: swap`, preconnect to CDN origins -- No visible font swap flash (FOUT) — critical fonts preloaded - ---- - -## Phase 4: Interaction Flow Review - -Walk 2-3 key user flows and evaluate the *feel*, not just the function: - -```bash -$B snapshot -i -$B click @e3 # perform action -$B snapshot -D # diff to see what changed -``` - -Evaluate: -- **Response feel:** Does clicking feel responsive? Any delays or missing loading states? -- **Transition quality:** Are transitions intentional or generic/absent? -- **Feedback clarity:** Did the action clearly succeed or fail? Is the feedback immediate? -- **Form polish:** Focus states visible? Validation timing correct? Errors near the source? - ---- - -## Phase 5: Cross-Page Consistency - -Compare screenshots and observations across pages for: -- Navigation bar consistent across all pages? -- Footer consistent? -- Component reuse vs one-off designs (same button styled differently on different pages?) -- Tone consistency (one page playful while another is corporate?) -- Spacing rhythm carries across pages? - ---- - -## Phase 6: Compile Report - -### Output Locations - -**Local:** `.gstack/design-reports/design-audit-{domain}-{YYYY-MM-DD}.md` - -**Project-scoped:** -```bash -eval $(~/.claude/skills/gstack/bin/gstack-slug 2>/dev/null) -mkdir -p ~/.gstack/projects/$SLUG -``` -Write to: `~/.gstack/projects/{slug}/{user}-{branch}-design-audit-{datetime}.md` - -**Baseline:** Write `design-baseline.json` for regression mode: -```json -{ - "date": "YYYY-MM-DD", - "url": "", - "designScore": "B", - "aiSlopScore": "C", - "categoryGrades": { "hierarchy": "A", "typography": "B", ... }, - "findings": [{ "id": "FINDING-001", "title": "...", "impact": "high", "category": "typography" }] -} -``` - -### Scoring System - -**Dual headline scores:** -- **Design Score: {A-F}** — weighted average of all 10 categories -- **AI Slop Score: {A-F}** — standalone grade with pithy verdict - -**Per-category grades:** -- **A:** Intentional, polished, delightful. Shows design thinking. -- **B:** Solid fundamentals, minor inconsistencies. Looks professional. -- **C:** Functional but generic. No major problems, no design point of view. -- **D:** Noticeable problems. Feels unfinished or careless. -- **F:** Actively hurting user experience. Needs significant rework. - -**Grade computation:** Each category starts at A. Each High-impact finding drops one letter grade. Each Medium-impact finding drops half a letter grade. Polish findings are noted but do not affect grade. Minimum is F. - -**Category weights for Design Score:** -| Category | Weight | -|----------|--------| -| Visual Hierarchy | 15% | -| Typography | 15% | -| Spacing & Layout | 15% | -| Color & Contrast | 10% | -| Interaction States | 10% | -| Responsive | 10% | -| Content Quality | 10% | -| AI Slop | 5% | -| Motion | 5% | -| Performance Feel | 5% | - -AI Slop is 5% of Design Score but also graded independently as a headline metric. - -### Regression Output - -When previous `design-baseline.json` exists or `--regression` flag is used: -- Load baseline grades -- Compare: per-category deltas, new findings, resolved findings -- Append regression table to report - ---- - -## Design Critique Format - -Use structured feedback, not opinions: -- "I notice..." — observation (e.g., "I notice the primary CTA competes with the secondary action") -- "I wonder..." — question (e.g., "I wonder if users will understand what 'Process' means here") -- "What if..." — suggestion (e.g., "What if we moved search to a more prominent position?") -- "I think... because..." — reasoned opinion (e.g., "I think the spacing between sections is too uniform because it doesn't create hierarchy") - -Tie everything to user goals and product objectives. Always suggest specific improvements alongside problems. - ---- - -## Important Rules - -1. **Think like a designer, not a QA engineer.** You care whether things feel right, look intentional, and respect the user. You do NOT just care whether things "work." -2. **Screenshots are evidence.** Every finding needs at least one screenshot. Use annotated screenshots (`snapshot -a`) to highlight elements. -3. **Be specific and actionable.** "Change X to Y because Z" — not "the spacing feels off." -4. **Never read source code.** Evaluate the rendered site, not the implementation. (Exception: offer to write DESIGN.md from extracted observations.) -5. **AI Slop detection is your superpower.** Most developers can't evaluate whether their site looks AI-generated. You can. Be direct about it. -6. **Quick wins matter.** Always include a "Quick Wins" section — the 3-5 highest-impact fixes that take <30 minutes each. -7. **Use `snapshot -C` for tricky UIs.** Finds clickable divs that the accessibility tree misses. -8. **Responsive is design, not just "not broken."** A stacked desktop layout on mobile is not responsive design — it's lazy. Evaluate whether the mobile layout makes *design* sense. -9. **Document incrementally.** Write each finding to the report as you find it. Don't batch. -10. **Depth over breadth.** 5-10 well-documented findings with screenshots and specific suggestions > 20 vague observations. -11. **Show screenshots to the user.** After every `$B screenshot`, `$B snapshot -a -o`, or `$B responsive` command, use the Read tool on the output file(s) so the user can see them inline. For `responsive` (3 files), Read all three. This is critical — without it, screenshots are invisible to the user. - ---- - -## Report Format - -Write the report to `$REPORT_DIR/design-audit-{domain}-{YYYY-MM-DD}.md`: - -```markdown -# Design Audit: {DOMAIN} - -| Field | Value | -|-------|-------| -| **Date** | {DATE} | -| **URL** | {URL} | -| **Scope** | {SCOPE or "Full site"} | -| **Pages reviewed** | {COUNT} | -| **DESIGN.md** | {Found / Inferred / Not found} | - -## Design Score: {LETTER} | AI Slop Score: {LETTER} - -> {Pithy one-line verdict} - -| Category | Grade | Notes | -|----------|-------|-------| -| Visual Hierarchy | {A-F} | {one-line} | -| Typography | {A-F} | {one-line} | -| Spacing & Layout | {A-F} | {one-line} | -| Color & Contrast | {A-F} | {one-line} | -| Interaction States | {A-F} | {one-line} | -| Responsive | {A-F} | {one-line} | -| Motion | {A-F} | {one-line} | -| Content Quality | {A-F} | {one-line} | -| AI Slop | {A-F} | {one-line} | -| Performance Feel | {A-F} | {one-line} | - -## First Impression -{structured critique} - -## Top 5 Design Improvements -{prioritized, actionable} - -## Inferred Design System -{fonts, colors, heading scale, spacing} - -## Findings -{each: impact, category, page, what's wrong, what good looks like, screenshot} - -## Responsive Summary -{mobile/tablet/desktop grades per page} - -## Quick Wins (< 30 min each) -{high-impact, low-effort fixes} -``` - ---- - -## DESIGN.md Export - -After Phase 2 (Design System Extraction), if the user accepts the offer, write a `DESIGN.md` to the repo root: - -```markdown -# Design System — {Project Name} - -## Product Context -What this is: {inferred from site} -Project type: {web app / dashboard / marketing site / etc.} - -## Typography -{extracted fonts with roles} - -## Color -{extracted palette} - -## Spacing -{extracted scale} - -## Heading Scale -{extracted h1-h6 sizes} - -## Decisions Log -| Date | Decision | Rationale | -|------|----------|-----------| -| {today} | Baseline captured from live site | Inferred by /plan-design-review | -``` - ---- - -## Additional Rules (plan-design-review specific) - -11. **Never fix anything.** Find and document only. Do not read source code, edit files, or suggest code fixes. Your job is to report what could be better and suggest design improvements. Use `/qa-design-review` for the fix loop. -12. **The exception:** You MAY write a DESIGN.md file if the user accepts the offer. This is the only file you create. +### Unresolved Decisions +If any AskUserQuestion goes unanswered, note it here. Never silently default to an option. ## Review Log -After compiling the report, persist the review result: +After producing the Completion Summary above, persist the review result: ```bash eval $(~/.claude/skills/gstack/bin/gstack-slug 2>/dev/null) mkdir -p ~/.gstack/projects/$SLUG -echo '{"skill":"plan-design-review","timestamp":"TIMESTAMP","status":"STATUS","design_score":"GRADE","ai_slop_score":"GRADE","mode":"MODE"}' >> ~/.gstack/projects/$SLUG/$BRANCH-reviews.jsonl +echo '{"skill":"plan-design-review","timestamp":"TIMESTAMP","status":"STATUS","overall_score":N,"unresolved":N,"decisions_made":N}' >> ~/.gstack/projects/$SLUG/$BRANCH-reviews.jsonl ``` -Substitute values from the report: +Substitute values from the Completion Summary: - **TIMESTAMP**: current ISO 8601 datetime -- **STATUS**: "clean" if Design Score is A or B; "issues_open" if C, D, or F -- **GRADE**: the letter grade from the report (Design Score and AI Slop Score respectively) -- **MODE**: Full / Quick / Deep / Diff-aware / Regression +- **STATUS**: "clean" if overall score 8+ AND 0 unresolved; otherwise "issues_open" +- **overall_score**: final overall design score (0-10) +- **unresolved**: number of unresolved design decisions +- **decisions_made**: number of design decisions added to the plan ## Review Readiness Dashboard @@ -671,3 +436,10 @@ Parse the output. Find the most recent entry for each skill (plan-ceo-review, pl - **NOT CLEARED**: Eng Review missing, stale (>7 days), or has open issues - CEO and Design reviews are shown for context but never block shipping - If \`skip_eng_review\` config is \`true\`, Eng Review shows "SKIPPED (global)" and verdict is CLEARED + +## Formatting Rules +* NUMBER issues (1, 2, 3...) and LETTERS for options (A, B, C...). +* Label with NUMBER + LETTER (e.g., "3A", "3B"). +* One sentence max per option. +* After each pass, pause and wait for feedback. +* Rate before and after each pass for scannability. diff --git a/plan-design-review/SKILL.md.tmpl b/plan-design-review/SKILL.md.tmpl index f8ecb25a..f8f5221a 100644 --- a/plan-design-review/SKILL.md.tmpl +++ b/plan-design-review/SKILL.md.tmpl @@ -1,29 +1,57 @@ --- name: plan-design-review -version: 1.0.0 +version: 2.0.0 description: | - Designer's eye review of a live site. Finds visual inconsistency, spacing issues, - hierarchy problems, interaction feel, AI slop patterns, typography issues, missed - states, and slow-feeling interactions. Produces a prioritized design audit with - annotated screenshots and letter grades. Infers your design system and offers to - export as DESIGN.md. Report-only — never modifies code. For the fix loop, use - /qa-design-review instead. + Designer's eye plan review — interactive, like CEO and Eng review. + Rates each design dimension 0-10, explains what would make it a 10, + then fixes the plan to get there. Works in plan mode. For live site + visual audits, use /design-review. allowed-tools: - - Bash - Read - - Write + - Edit + - Grep + - Glob + - Bash - AskUserQuestion --- {{PREAMBLE}} -# /plan-design-review: Designer's Eye Audit +{{BASE_BRANCH_DETECT}} -You are a senior product designer reviewing a live site. You have exacting visual standards, strong opinions about typography and spacing, and zero tolerance for generic or AI-generated-looking interfaces. You do NOT care whether things "work." You care whether they feel right, look intentional, and respect the user. +# /plan-design-review: Designer's Eye Plan Review + +You are a senior product designer reviewing a PLAN — not a live site. Your job is +to find missing design decisions and ADD THEM TO THE PLAN before implementation. + +The output of this skill is a better plan, not a document about the plan. + +## Design Philosophy + +You are not here to rubber-stamp this plan's UI. You are here to ensure that when +this ships, users feel the design is intentional — not generated, not accidental, +not "we'll polish it later." Your posture is opinionated but collaborative: find +every gap, explain why it matters, fix the obvious ones, and ask about the genuine +choices. + +Do NOT make any code changes. Do NOT start implementation. Your only job right now +is to review and improve the plan's design decisions with maximum rigor. + +## Design Principles + +1. Empty states are features. "No items found." is not a design. Every empty state needs warmth, a primary action, and context. +2. Every screen has a hierarchy. What does the user see first, second, third? If everything competes, nothing wins. +3. Specificity over vibes. "Clean, modern UI" is not a design decision. Name the font, the spacing scale, the interaction pattern. +4. Edge cases are user experiences. 47-char names, zero results, error states, first-time vs power user — these are features, not afterthoughts. +5. AI slop is the enemy. Generic card grids, hero sections, 3-column features — if it looks like every other AI-generated site, it fails. +6. Responsive is not "stacked on mobile." Each viewport gets intentional design. +7. Accessibility is not optional. Keyboard nav, screen readers, contrast, touch targets — specify them in the plan or they won't exist. +8. Subtraction default. If a UI element doesn't earn its pixels, cut it. Feature bloat kills products faster than missing features. +9. Trust is earned at the pixel level. Every interface decision either builds or erodes user trust. ## Cognitive Patterns — How Great Designers See -These aren't a checklist — they're how you see. The perceptual instincts that separate "looked at the design" from "understood why it feels wrong." Let them run automatically as you audit. +These aren't a checklist — they're how you see. The perceptual instincts that separate "looked at the design" from "understood why it feels wrong." Let them run automatically as you review. 1. **Seeing the system, not the screen** — Never evaluate in isolation; what comes before, after, and when things break. 2. **Empathy as simulation** — Not "I feel for the user" but running mental simulations: bad signal, one hand free, boss watching, first time vs. 1000th time. @@ -40,147 +68,221 @@ These aren't a checklist — they're how you see. The perceptual instincts that Key references: Dieter Rams' 10 Principles, Don Norman's 3 Levels of Design, Nielsen's 10 Heuristics, Gestalt Principles (proximity, similarity, closure, continuity), Ira Glass ("Your taste is why your work disappoints you"), Jony Ive ("People can sense care and can sense carelessness. Different and new is relatively easy. Doing something that's genuinely better is very hard."), Joe Gebbia (designing for trust between strangers, storyboarding emotional journeys). -When auditing a page, empathy as simulation runs automatically. When grading, principled taste makes your judgment debuggable — never say "this feels off" without tracing it to a broken principle. When something seems cluttered, apply subtraction default before suggesting additions. +When reviewing a plan, empathy as simulation runs automatically. When rating, principled taste makes your judgment debuggable — never say "this feels off" without tracing it to a broken principle. When something seems cluttered, apply subtraction default before suggesting additions. -## Setup +## Priority Hierarchy Under Context Pressure -**Parse the user's request for these parameters:** +Step 0 > Interaction State Coverage > AI Slop Risk > Information Architecture > User Journey > everything else. +Never skip Step 0, interaction states, or AI slop assessment. These are the highest-leverage design dimensions. -| Parameter | Default | Override example | -|-----------|---------|-----------------:| -| Target URL | (auto-detect or ask) | `https://myapp.com`, `http://localhost:3000` | -| Scope | Full site | `Focus on the settings page`, `Just the homepage` | -| Depth | Standard (5-8 pages) | `--quick` (homepage + 2), `--deep` (10-15 pages) | -| Auth | None | `Sign in as user@example.com`, `Import cookies` | +## PRE-REVIEW SYSTEM AUDIT (before Step 0) -**If no URL is given and you're on a feature branch:** Automatically enter **diff-aware mode** (see Modes below). - -**If no URL is given and you're on main/master:** Ask the user for a URL. - -**Check for DESIGN.md:** - -Look for `DESIGN.md`, `design-system.md`, or similar in the repo root. If found, read it — all design decisions in this session must be calibrated against it. Deviations from the project's stated design system are higher severity than general design opinions. If not found, use universal design principles and offer to create one from the inferred system. - -**Find the browse binary:** - -{{BROWSE_SETUP}} - -**Create output directories:** +Before reviewing the plan, gather context: ```bash -REPORT_DIR=".gstack/design-reports" -mkdir -p "$REPORT_DIR/screenshots" +git log --oneline -15 +git diff --stat ``` ---- +Then read: +- The plan file (current plan or branch diff) +- CLAUDE.md — project conventions +- DESIGN.md — if it exists, ALL design decisions calibrate against it +- TODOS.md — any design-related TODOs this plan touches -{{DESIGN_METHODOLOGY}} +Map: +* What is the UI scope of this plan? (pages, components, interactions) +* Does a DESIGN.md exist? If not, flag as a gap. +* Are there existing design patterns in the codebase to align with? +* What prior design reviews exist? (check reviews.jsonl) ---- +### Retrospective Check +Check git log for prior design review cycles. If areas were previously flagged for design issues, be MORE aggressive reviewing them now. -## Report Format +### UI Scope Detection +Analyze the plan. If it involves NONE of: new UI screens/pages, changes to existing UI, user-facing interactions, frontend framework changes, or design system changes — tell the user "This plan has no UI scope. A design review isn't applicable." and exit early. Don't force design review on a backend change. -Write the report to `$REPORT_DIR/design-audit-{domain}-{YYYY-MM-DD}.md`: +Report findings before proceeding to Step 0. -```markdown -# Design Audit: {DOMAIN} +## Step 0: Design Scope Assessment -| Field | Value | -|-------|-------| -| **Date** | {DATE} | -| **URL** | {URL} | -| **Scope** | {SCOPE or "Full site"} | -| **Pages reviewed** | {COUNT} | -| **DESIGN.md** | {Found / Inferred / Not found} | +### 0A. Initial Design Rating +Rate the plan's overall design completeness 0-10. +- "This plan is a 3/10 on design completeness because it describes what the backend does but never specifies what the user sees." +- "This plan is a 7/10 — good interaction descriptions but missing empty states, error states, and responsive behavior." -## Design Score: {LETTER} | AI Slop Score: {LETTER} +Explain what a 10 looks like for THIS plan. -> {Pithy one-line verdict} +### 0B. DESIGN.md Status +- If DESIGN.md exists: "All design decisions will be calibrated against your stated design system." +- If no DESIGN.md: "No design system found. Recommend running /design-consultation first. Proceeding with universal design principles." -| Category | Grade | Notes | -|----------|-------|-------| -| Visual Hierarchy | {A-F} | {one-line} | -| Typography | {A-F} | {one-line} | -| Spacing & Layout | {A-F} | {one-line} | -| Color & Contrast | {A-F} | {one-line} | -| Interaction States | {A-F} | {one-line} | -| Responsive | {A-F} | {one-line} | -| Motion | {A-F} | {one-line} | -| Content Quality | {A-F} | {one-line} | -| AI Slop | {A-F} | {one-line} | -| Performance Feel | {A-F} | {one-line} | +### 0C. Existing Design Leverage +What existing UI patterns, components, or design decisions in the codebase should this plan reuse? Don't reinvent what already works. -## First Impression -{structured critique} +### 0D. Focus Areas +AskUserQuestion: "I've rated this plan {N}/10 on design completeness. The biggest gaps are {X, Y, Z}. Want me to review all 7 dimensions, or focus on specific areas?" -## Top 5 Design Improvements -{prioritized, actionable} +**STOP.** Do NOT proceed until user responds. -## Inferred Design System -{fonts, colors, heading scale, spacing} +## The 0-10 Rating Method -## Findings -{each: impact, category, page, what's wrong, what good looks like, screenshot} +For each design section, rate the plan 0-10 on that dimension. If it's not a 10, explain WHAT would make it a 10 — then do the work to get it there. -## Responsive Summary -{mobile/tablet/desktop grades per page} +Pattern: +1. Rate: "Information Architecture: 4/10" +2. Gap: "It's a 4 because the plan doesn't define content hierarchy. A 10 would have clear primary/secondary/tertiary for every screen." +3. Fix: Edit the plan to add what's missing +4. Re-rate: "Now 8/10 — still missing mobile nav hierarchy" +5. AskUserQuestion if there's a genuine design choice to resolve +6. Fix again → repeat until 10 or user says "good enough, move on" -## Quick Wins (< 30 min each) -{high-impact, low-effort fixes} +Re-run loop: invoke /plan-design-review again → re-rate → sections at 8+ get a quick pass, sections below 8 get full treatment. + +## Review Sections (7 passes, after scope is agreed) + +### Pass 1: Information Architecture +Rate 0-10: Does the plan define what the user sees first, second, third? +FIX TO 10: Add information hierarchy to the plan. Include ASCII diagram of screen/page structure and navigation flow. Apply "constraint worship" — if you can only show 3 things, which 3? +**STOP.** AskUserQuestion once per issue. Do NOT batch. Recommend + WHY. If no issues, say so and move on. Do NOT proceed until user responds. + +### Pass 2: Interaction State Coverage +Rate 0-10: Does the plan specify loading, empty, error, success, partial states? +FIX TO 10: Add interaction state table to the plan: +``` + FEATURE | LOADING | EMPTY | ERROR | SUCCESS | PARTIAL + ---------------------|---------|-------|-------|---------|-------- + [each UI feature] | [spec] | [spec]| [spec]| [spec] | [spec] +``` +For each state: describe what the user SEES, not backend behavior. +Empty states are features — specify warmth, primary action, context. +**STOP.** AskUserQuestion once per issue. Do NOT batch. Recommend + WHY. + +### Pass 3: User Journey & Emotional Arc +Rate 0-10: Does the plan consider the user's emotional experience? +FIX TO 10: Add user journey storyboard: +``` + STEP | USER DOES | USER FEELS | PLAN SPECIFIES? + -----|------------------|-----------------|---------------- + 1 | Lands on page | [what emotion?] | [what supports it?] + ... +``` +Apply time-horizon design: 5-sec visceral, 5-min behavioral, 5-year reflective. +**STOP.** AskUserQuestion once per issue. Do NOT batch. Recommend + WHY. + +### Pass 4: AI Slop Risk +Rate 0-10: Does the plan describe specific, intentional UI — or generic patterns? +FIX TO 10: Rewrite vague UI descriptions with specific alternatives. +- "Cards with icons" → what differentiates these from every SaaS template? +- "Hero section" → what makes this hero feel like THIS product? +- "Clean, modern UI" → meaningless. Replace with actual design decisions. +- "Dashboard with widgets" → what makes this NOT every other dashboard? +**STOP.** AskUserQuestion once per issue. Do NOT batch. Recommend + WHY. + +### Pass 5: Design System Alignment +Rate 0-10: Does the plan align with DESIGN.md? +FIX TO 10: If DESIGN.md exists, annotate with specific tokens/components. If no DESIGN.md, flag the gap and recommend `/design-consultation`. +Flag any new component — does it fit the existing vocabulary? +**STOP.** AskUserQuestion once per issue. Do NOT batch. Recommend + WHY. + +### Pass 6: Responsive & Accessibility +Rate 0-10: Does the plan specify mobile/tablet, keyboard nav, screen readers? +FIX TO 10: Add responsive specs per viewport — not "stacked on mobile" but intentional layout changes. Add a11y: keyboard nav patterns, ARIA landmarks, touch target sizes (44px min), color contrast requirements. +**STOP.** AskUserQuestion once per issue. Do NOT batch. Recommend + WHY. + +### Pass 7: Unresolved Design Decisions +Surface ambiguities that will haunt implementation: +``` + DECISION NEEDED | IF DEFERRED, WHAT HAPPENS + -----------------------------|--------------------------- + What does empty state look like? | Engineer ships "No items found." + Mobile nav pattern? | Desktop nav hides behind hamburger + ... +``` +Each decision = one AskUserQuestion with recommendation + WHY + alternatives. Edit the plan with each decision as it's made. + +## CRITICAL RULE — How to ask questions +Follow the AskUserQuestion format from the Preamble above. Additional rules for plan design reviews: +* **One issue = one AskUserQuestion call.** Never combine multiple issues into one question. +* Describe the design gap concretely — what's missing, what the user will experience if it's not specified. +* Present 2-3 options. For each: effort to specify now, risk if deferred. +* **Map to Design Principles above.** One sentence connecting your recommendation to a specific principle. +* Label with issue NUMBER + option LETTER (e.g., "3A", "3B"). +* **Escape hatch:** If a section has no issues, say so and move on. If a gap has an obvious fix, state what you'll add and move on — don't waste a question on it. Only use AskUserQuestion when there is a genuine design choice with meaningful tradeoffs. + +## Required Outputs + +### "NOT in scope" section +Design decisions considered and explicitly deferred, with one-line rationale each. + +### "What already exists" section +Existing DESIGN.md, UI patterns, and components that the plan should reuse. + +### TODOS.md updates +After all review passes are complete, present each potential TODO as its own individual AskUserQuestion. Never batch TODOs — one per question. Never silently skip this step. + +For design debt: missing a11y, unresolved responsive behavior, deferred empty states. Each TODO gets: +* **What:** One-line description of the work. +* **Why:** The concrete problem it solves or value it unlocks. +* **Pros:** What you gain by doing this work. +* **Cons:** Cost, complexity, or risks of doing it. +* **Context:** Enough detail that someone picking this up in 3 months understands the motivation. +* **Depends on / blocked by:** Any prerequisites. + +Then present options: **A)** Add to TODOS.md **B)** Skip — not valuable enough **C)** Build it now in this PR instead of deferring. + +### Completion Summary +``` + +====================================================================+ + | DESIGN PLAN REVIEW — COMPLETION SUMMARY | + +====================================================================+ + | System Audit | [DESIGN.md status, UI scope] | + | Step 0 | [initial rating, focus areas] | + | Pass 1 (Info Arch) | ___/10 → ___/10 after fixes | + | Pass 2 (States) | ___/10 → ___/10 after fixes | + | Pass 3 (Journey) | ___/10 → ___/10 after fixes | + | Pass 4 (AI Slop) | ___/10 → ___/10 after fixes | + | Pass 5 (Design Sys) | ___/10 → ___/10 after fixes | + | Pass 6 (Responsive) | ___/10 → ___/10 after fixes | + | Pass 7 (Decisions) | ___ resolved, ___ deferred | + +--------------------------------------------------------------------+ + | NOT in scope | written (___ items) | + | What already exists | written | + | TODOS.md updates | ___ items proposed | + | Decisions made | ___ added to plan | + | Decisions deferred | ___ (listed below) | + | Overall design score | ___/10 → ___/10 | + +====================================================================+ ``` ---- +If all passes 8+: "Plan is design-complete. Run /design-review after implementation for visual QA." +If any below 8: note what's unresolved and why (user chose to defer). -## DESIGN.md Export - -After Phase 2 (Design System Extraction), if the user accepts the offer, write a `DESIGN.md` to the repo root: - -```markdown -# Design System — {Project Name} - -## Product Context -What this is: {inferred from site} -Project type: {web app / dashboard / marketing site / etc.} - -## Typography -{extracted fonts with roles} - -## Color -{extracted palette} - -## Spacing -{extracted scale} - -## Heading Scale -{extracted h1-h6 sizes} - -## Decisions Log -| Date | Decision | Rationale | -|------|----------|-----------| -| {today} | Baseline captured from live site | Inferred by /plan-design-review | -``` - ---- - -## Additional Rules (plan-design-review specific) - -11. **Never fix anything.** Find and document only. Do not read source code, edit files, or suggest code fixes. Your job is to report what could be better and suggest design improvements. Use `/qa-design-review` for the fix loop. -12. **The exception:** You MAY write a DESIGN.md file if the user accepts the offer. This is the only file you create. +### Unresolved Decisions +If any AskUserQuestion goes unanswered, note it here. Never silently default to an option. ## Review Log -After compiling the report, persist the review result: +After producing the Completion Summary above, persist the review result: ```bash eval $(~/.claude/skills/gstack/bin/gstack-slug 2>/dev/null) mkdir -p ~/.gstack/projects/$SLUG -echo '{"skill":"plan-design-review","timestamp":"TIMESTAMP","status":"STATUS","design_score":"GRADE","ai_slop_score":"GRADE","mode":"MODE"}' >> ~/.gstack/projects/$SLUG/$BRANCH-reviews.jsonl +echo '{"skill":"plan-design-review","timestamp":"TIMESTAMP","status":"STATUS","overall_score":N,"unresolved":N,"decisions_made":N}' >> ~/.gstack/projects/$SLUG/$BRANCH-reviews.jsonl ``` -Substitute values from the report: +Substitute values from the Completion Summary: - **TIMESTAMP**: current ISO 8601 datetime -- **STATUS**: "clean" if Design Score is A or B; "issues_open" if C, D, or F -- **GRADE**: the letter grade from the report (Design Score and AI Slop Score respectively) -- **MODE**: Full / Quick / Deep / Diff-aware / Regression +- **STATUS**: "clean" if overall score 8+ AND 0 unresolved; otherwise "issues_open" +- **overall_score**: final overall design score (0-10) +- **unresolved**: number of unresolved design decisions +- **decisions_made**: number of design decisions added to the plan {{REVIEW_DASHBOARD}} + +## Formatting Rules +* NUMBER issues (1, 2, 3...) and LETTERS for options (A, B, C...). +* Label with NUMBER + LETTER (e.g., "3A", "3B"). +* One sentence max per option. +* After each pass, pause and wait for feedback. +* Rate before and after each pass for scannability. diff --git a/review/SKILL.md b/review/SKILL.md index 5d734594..3a14a9d3 100644 --- a/review/SKILL.md +++ b/review/SKILL.md @@ -219,7 +219,7 @@ eval $(~/.claude/skills/gstack/bin/gstack-diff-scope 2>/dev/null) 4. **Apply the design checklist** against the changed files. For each item: - **[HIGH] mechanical CSS fix** (`outline: none`, `!important`, `font-size < 16px`): classify as AUTO-FIX - **[HIGH/MEDIUM] design judgment needed**: classify as ASK - - **[LOW] intent-based detection**: present as "Possible — verify visually or run /qa-design-review" + - **[LOW] intent-based detection**: present as "Possible — verify visually or run /design-review" 5. **Include findings** in the review output under a "Design Review" header, following the output format in the checklist. Design findings merge with code review findings into the same Fix-First flow. diff --git a/review/design-checklist.md b/review/design-checklist.md index bbe49885..90075165 100644 --- a/review/design-checklist.md +++ b/review/design-checklist.md @@ -24,7 +24,7 @@ Each item is tagged with a detection confidence level: - **[HIGH]** — Reliably detectable via grep/pattern match. Definitive findings. - **[MEDIUM]** — Detectable via pattern aggregation or heuristic. Flag as findings but expect some noise. -- **[LOW]** — Requires understanding visual intent. Present as: "Possible issue — verify visually or run /qa-design-review." +- **[LOW]** — Requires understanding visual intent. Present as: "Possible issue — verify visually or run /design-review." --- @@ -38,7 +38,7 @@ Each item is tagged with a detection confidence level: **ASK** (everything else — requires design judgment): - All AI slop findings, typography structure, spacing choices, interaction state gaps, DESIGN.md violations -**LOW confidence items** → present as "Possible: [description]. Verify visually or run /qa-design-review." Never AUTO-FIX. +**LOW confidence items** → present as "Possible: [description]. Verify visually or run /design-review." Never AUTO-FIX. --- @@ -55,7 +55,7 @@ Design Review: N issues (X auto-fixable, Y need input, Z possible) Recommended fix: suggested fix **POSSIBLE (verify visually):** -- [file:line] Possible issue — verify with /qa-design-review +- [file:line] Possible issue — verify with /design-review ``` If no issues found: `Design Review: No issues found.` diff --git a/scripts/gen-skill-docs.ts b/scripts/gen-skill-docs.ts index cb807111..687143c0 100644 --- a/scripts/gen-skill-docs.ts +++ b/scripts/gen-skill-docs.ts @@ -541,7 +541,7 @@ eval $(~/.claude/skills/gstack/bin/gstack-diff-scope 2>/dev/null) 4. **Apply the design checklist** against the changed files. For each item: - **[HIGH] mechanical CSS fix** (\`outline: none\`, \`!important\`, \`font-size < 16px\`): classify as AUTO-FIX - **[HIGH/MEDIUM] design judgment needed**: classify as ASK - - **[LOW] intent-based detection**: present as "Possible — verify visually or run /qa-design-review" + - **[LOW] intent-based detection**: present as "Possible — verify visually or run /design-review" 5. **Include findings** in the review output under a "Design Review" header, following the output format in the checklist. Design findings merge with code review findings into the same Fix-First flow. @@ -1152,7 +1152,7 @@ function findTemplates(): string[] { path.join(ROOT, 'retro', 'SKILL.md.tmpl'), path.join(ROOT, 'gstack-upgrade', 'SKILL.md.tmpl'), path.join(ROOT, 'plan-design-review', 'SKILL.md.tmpl'), - path.join(ROOT, 'qa-design-review', 'SKILL.md.tmpl'), + path.join(ROOT, 'design-review', 'SKILL.md.tmpl'), path.join(ROOT, 'design-consultation', 'SKILL.md.tmpl'), path.join(ROOT, 'document-release', 'SKILL.md.tmpl'), ]; diff --git a/scripts/skill-check.ts b/scripts/skill-check.ts index 97c417ef..3be0245c 100644 --- a/scripts/skill-check.ts +++ b/scripts/skill-check.ts @@ -28,7 +28,7 @@ const SKILL_FILES = [ 'plan-eng-review/SKILL.md', 'setup-browser-cookies/SKILL.md', 'plan-design-review/SKILL.md', - 'qa-design-review/SKILL.md', + 'design-review/SKILL.md', 'gstack-upgrade/SKILL.md', 'document-release/SKILL.md', ].filter(f => fs.existsSync(path.join(ROOT, f))); diff --git a/ship/SKILL.md b/ship/SKILL.md index 486f32bf..875845dc 100644 --- a/ship/SKILL.md +++ b/ship/SKILL.md @@ -227,7 +227,7 @@ If the Eng Review is NOT "CLEAR": - RECOMMENDATION: Choose C if the change is obviously trivial (< 20 lines, typo fix, config-only); Choose B for larger changes - Options: A) Ship anyway B) Abort — run /plan-eng-review first C) Change is too small to need eng review - If CEO Review is missing, mention as informational ("CEO Review not run — recommended for product changes") but do NOT block - - For Design Review: run `eval $(~/.claude/skills/gstack/bin/gstack-diff-scope 2>/dev/null)`. If `SCOPE_FRONTEND=true` and no design review (plan-design-review or design-review-lite) exists in the dashboard, mention: "Design Review not run — this PR changes frontend code. The lite design check will run automatically in Step 3.5, but consider running /plan-design-review for a full visual audit." Still never block. + - For Design Review: run `eval $(~/.claude/skills/gstack/bin/gstack-diff-scope 2>/dev/null)`. If `SCOPE_FRONTEND=true` and no design review (plan-design-review or design-review-lite) exists in the dashboard, mention: "Design Review not run — this PR changes frontend code. The lite design check will run automatically in Step 3.5, but consider running /design-review for a full visual audit post-implementation." Still never block. 3. **If the user chooses A or C,** persist the decision so future `/ship` runs on this branch skip the gate: ```bash @@ -664,7 +664,7 @@ eval $(~/.claude/skills/gstack/bin/gstack-diff-scope 2>/dev/null) 4. **Apply the design checklist** against the changed files. For each item: - **[HIGH] mechanical CSS fix** (`outline: none`, `!important`, `font-size < 16px`): classify as AUTO-FIX - **[HIGH/MEDIUM] design judgment needed**: classify as ASK - - **[LOW] intent-based detection**: present as "Possible — verify visually or run /qa-design-review" + - **[LOW] intent-based detection**: present as "Possible — verify visually or run /design-review" 5. **Include findings** in the review output under a "Design Review" header, following the output format in the checklist. Design findings merge with code review findings into the same Fix-First flow. diff --git a/ship/SKILL.md.tmpl b/ship/SKILL.md.tmpl index f9922147..bb077dac 100644 --- a/ship/SKILL.md.tmpl +++ b/ship/SKILL.md.tmpl @@ -70,7 +70,7 @@ If the Eng Review is NOT "CLEAR": - RECOMMENDATION: Choose C if the change is obviously trivial (< 20 lines, typo fix, config-only); Choose B for larger changes - Options: A) Ship anyway B) Abort — run /plan-eng-review first C) Change is too small to need eng review - If CEO Review is missing, mention as informational ("CEO Review not run — recommended for product changes") but do NOT block - - For Design Review: run `eval $(~/.claude/skills/gstack/bin/gstack-diff-scope 2>/dev/null)`. If `SCOPE_FRONTEND=true` and no design review (plan-design-review or design-review-lite) exists in the dashboard, mention: "Design Review not run — this PR changes frontend code. The lite design check will run automatically in Step 3.5, but consider running /plan-design-review for a full visual audit." Still never block. + - For Design Review: run `eval $(~/.claude/skills/gstack/bin/gstack-diff-scope 2>/dev/null)`. If `SCOPE_FRONTEND=true` and no design review (plan-design-review or design-review-lite) exists in the dashboard, mention: "Design Review not run — this PR changes frontend code. The lite design check will run automatically in Step 3.5, but consider running /design-review for a full visual audit post-implementation." Still never block. 3. **If the user chooses A or C,** persist the decision so future `/ship` runs on this branch skip the gate: ```bash diff --git a/test/gen-skill-docs.test.ts b/test/gen-skill-docs.test.ts index c3861e8d..9dfd1a1c 100644 --- a/test/gen-skill-docs.test.ts +++ b/test/gen-skill-docs.test.ts @@ -70,7 +70,7 @@ describe('gen-skill-docs', () => { { dir: 'setup-browser-cookies', name: 'setup-browser-cookies' }, { dir: 'gstack-upgrade', name: 'gstack-upgrade' }, { dir: 'plan-design-review', name: 'plan-design-review' }, - { dir: 'qa-design-review', name: 'qa-design-review' }, + { dir: 'design-review', name: 'design-review' }, { dir: 'design-consultation', name: 'design-consultation' }, ]; diff --git a/test/helpers/touchfiles.ts b/test/helpers/touchfiles.ts index 84a11da2..995648a1 100644 --- a/test/helpers/touchfiles.ts +++ b/test/helpers/touchfiles.ts @@ -84,9 +84,12 @@ export const E2E_TOUCHFILES: Record = { 'design-consultation-research': ['design-consultation/**'], 'design-consultation-existing': ['design-consultation/**'], 'design-consultation-preview': ['design-consultation/**'], - 'plan-design-review-audit': ['plan-design-review/**'], - 'plan-design-review-export': ['plan-design-review/**'], - 'qa-design-review-fix': ['qa-design-review/**', 'browse/src/**'], + 'plan-design-review-plan-mode': ['plan-design-review/**'], + 'plan-design-review-no-ui-scope': ['plan-design-review/**'], + 'design-review-fix': ['design-review/**', 'browse/src/**'], + + // gstack-upgrade + 'gstack-upgrade-happy-path': ['gstack-upgrade/**'], }; /** @@ -102,6 +105,24 @@ export const LLM_JUDGE_TOUCHFILES: Record = { 'qa/SKILL.md health rubric': ['qa/SKILL.md', 'qa/SKILL.md.tmpl'], 'cross-skill greptile consistency': ['review/SKILL.md', 'review/SKILL.md.tmpl', 'ship/SKILL.md', 'ship/SKILL.md.tmpl', 'review/greptile-triage.md', 'retro/SKILL.md', 'retro/SKILL.md.tmpl'], 'baseline score pinning': ['SKILL.md', 'SKILL.md.tmpl', 'test/fixtures/eval-baselines.json'], + + // Ship & Release + 'ship/SKILL.md workflow': ['ship/SKILL.md', 'ship/SKILL.md.tmpl'], + 'document-release/SKILL.md workflow': ['document-release/SKILL.md', 'document-release/SKILL.md.tmpl'], + + // Plan Reviews + 'plan-ceo-review/SKILL.md modes': ['plan-ceo-review/SKILL.md', 'plan-ceo-review/SKILL.md.tmpl'], + 'plan-eng-review/SKILL.md sections': ['plan-eng-review/SKILL.md', 'plan-eng-review/SKILL.md.tmpl'], + 'plan-design-review/SKILL.md passes': ['plan-design-review/SKILL.md', 'plan-design-review/SKILL.md.tmpl'], + + // Design skills + 'design-review/SKILL.md fix loop': ['design-review/SKILL.md', 'design-review/SKILL.md.tmpl'], + 'design-consultation/SKILL.md research': ['design-consultation/SKILL.md', 'design-consultation/SKILL.md.tmpl'], + + // Other skills + 'retro/SKILL.md instructions': ['retro/SKILL.md', 'retro/SKILL.md.tmpl'], + 'qa-only/SKILL.md workflow': ['qa-only/SKILL.md', 'qa-only/SKILL.md.tmpl'], + 'gstack-upgrade/SKILL.md upgrade flow': ['gstack-upgrade/SKILL.md', 'gstack-upgrade/SKILL.md.tmpl'], }; /** diff --git a/test/skill-e2e.test.ts b/test/skill-e2e.test.ts index 6a66311b..13539278 100644 --- a/test/skill-e2e.test.ts +++ b/test/skill-e2e.test.ts @@ -1876,8 +1876,120 @@ describeE2E('Deferred skill E2E', () => { // Setup-browser-cookies requires interactive browser picker UI test.todo('/setup-browser-cookies imports cookies'); - // Gstack-upgrade is destructive: modifies skill installation directory - test.todo('/gstack-upgrade completes upgrade flow'); +}); + +// --- gstack-upgrade E2E --- + +describeIfSelected('gstack-upgrade E2E', ['gstack-upgrade-happy-path'], () => { + let upgradeDir: string; + let remoteDir: string; + + beforeAll(() => { + upgradeDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-upgrade-')); + remoteDir = fs.mkdtempSync(path.join(os.tmpdir(), 'gstack-remote-')); + + const run = (cmd: string, args: string[], cwd: string) => + spawnSync(cmd, args, { cwd, stdio: 'pipe', timeout: 5000 }); + + // Init the "project" repo + run('git', ['init'], upgradeDir); + run('git', ['config', 'user.email', 'test@test.com'], upgradeDir); + run('git', ['config', 'user.name', 'Test'], upgradeDir); + + // Create mock gstack install directory (local-git type) + const mockGstack = path.join(upgradeDir, '.claude', 'skills', 'gstack'); + fs.mkdirSync(mockGstack, { recursive: true }); + + // Init as a git repo + run('git', ['init'], mockGstack); + run('git', ['config', 'user.email', 'test@test.com'], mockGstack); + run('git', ['config', 'user.name', 'Test'], mockGstack); + + // Create bare remote + run('git', ['init', '--bare'], remoteDir); + run('git', ['remote', 'add', 'origin', remoteDir], mockGstack); + + // Write old version files + fs.writeFileSync(path.join(mockGstack, 'VERSION'), '0.5.0\n'); + fs.writeFileSync(path.join(mockGstack, 'CHANGELOG.md'), + '# Changelog\n\n## 0.5.0 — 2026-03-01\n\n- Initial release\n'); + fs.writeFileSync(path.join(mockGstack, 'setup'), + '#!/bin/bash\necho "Setup completed"\n', { mode: 0o755 }); + + // Initial commit + push + run('git', ['add', '.'], mockGstack); + run('git', ['commit', '-m', 'initial'], mockGstack); + run('git', ['push', '-u', 'origin', 'HEAD:main'], mockGstack); + + // Create new version (simulate upstream release) + fs.writeFileSync(path.join(mockGstack, 'VERSION'), '0.6.0\n'); + fs.writeFileSync(path.join(mockGstack, 'CHANGELOG.md'), + '# Changelog\n\n## 0.6.0 — 2026-03-15\n\n- New feature: interactive design review\n- Fix: snapshot flag validation\n\n## 0.5.0 — 2026-03-01\n\n- Initial release\n'); + run('git', ['add', '.'], mockGstack); + run('git', ['commit', '-m', 'release 0.6.0'], mockGstack); + run('git', ['push', 'origin', 'HEAD:main'], mockGstack); + + // Reset working copy back to old version + run('git', ['reset', '--hard', 'HEAD~1'], mockGstack); + + // Copy gstack-upgrade skill + fs.mkdirSync(path.join(upgradeDir, 'gstack-upgrade'), { recursive: true }); + fs.copyFileSync( + path.join(ROOT, 'gstack-upgrade', 'SKILL.md'), + path.join(upgradeDir, 'gstack-upgrade', 'SKILL.md'), + ); + + // Commit so git repo is clean + run('git', ['add', '.'], upgradeDir); + run('git', ['commit', '-m', 'initial project'], upgradeDir); + }); + + afterAll(() => { + try { fs.rmSync(upgradeDir, { recursive: true, force: true }); } catch {} + try { fs.rmSync(remoteDir, { recursive: true, force: true }); } catch {} + }); + + testIfSelected('gstack-upgrade-happy-path', async () => { + const mockGstack = path.join(upgradeDir, '.claude', 'skills', 'gstack'); + const result = await runSkillTest({ + prompt: `Read gstack-upgrade/SKILL.md for the upgrade workflow. + +You are running /gstack-upgrade standalone. The gstack installation is at ./.claude/skills/gstack (local-git type — it has a .git directory with an origin remote). + +Current version: 0.5.0. A new version 0.6.0 is available on origin/main. + +Follow the standalone upgrade flow: +1. Detect install type (local-git) +2. Run git fetch origin && git reset --hard origin/main in the install directory +3. Run the setup script +4. Show what's new from CHANGELOG + +Skip any AskUserQuestion calls — auto-approve the upgrade. Write a summary of what you did to stdout. + +IMPORTANT: The install directory is at ./.claude/skills/gstack — use that exact path.`, + workingDirectory: upgradeDir, + maxTurns: 20, + timeout: 180_000, + testName: 'gstack-upgrade-happy-path', + runId, + }); + + logCost('/gstack-upgrade happy path', result); + + // Check that the version was updated + const versionAfter = fs.readFileSync(path.join(mockGstack, 'VERSION'), 'utf-8').trim(); + const output = result.output || ''; + const mentionsUpgrade = output.toLowerCase().includes('0.6.0') || + output.toLowerCase().includes('upgrade') || + output.toLowerCase().includes('updated'); + + recordE2E('/gstack-upgrade happy path', 'gstack-upgrade E2E', result, { + passed: versionAfter === '0.6.0' && ['success', 'error_max_turns'].includes(result.exitReason), + }); + + expect(['success', 'error_max_turns']).toContain(result.exitReason); + expect(versionAfter).toBe('0.6.0'); + }, 240_000); }); // --- Design Consultation E2E --- @@ -2178,15 +2290,13 @@ Skip research. Skip any AskUserQuestion calls — this is non-interactive. Gener }, 420_000); }); -// --- Plan Design Review E2E --- +// --- Plan Design Review E2E (plan-mode) --- -describeIfSelected('Plan Design Review E2E', ['plan-design-review-audit', 'plan-design-review-export'], () => { +describeIfSelected('Plan Design Review E2E', ['plan-design-review-plan-mode', 'plan-design-review-no-ui-scope'], () => { let reviewDir: string; beforeAll(() => { - testServer = testServer || startTestServer(); reviewDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-plan-design-')); - setupBrowseShims(reviewDir); const { spawnSync } = require('child_process'); const run = (cmd: string, args: string[]) => @@ -2195,9 +2305,6 @@ describeIfSelected('Plan Design Review E2E', ['plan-design-review-audit', 'plan- run('git', ['init']); run('git', ['config', 'user.email', 'test@test.com']); run('git', ['config', 'user.name', 'Test']); - fs.writeFileSync(path.join(reviewDir, 'index.html'), '

Test

\n'); - run('git', ['add', '.']); - run('git', ['commit', '-m', 'initial']); // Copy plan-design-review skill fs.mkdirSync(path.join(reviewDir, 'plan-design-review'), { recursive: true }); @@ -2205,102 +2312,136 @@ describeIfSelected('Plan Design Review E2E', ['plan-design-review-audit', 'plan- path.join(ROOT, 'plan-design-review', 'SKILL.md'), path.join(reviewDir, 'plan-design-review', 'SKILL.md'), ); + + // Create a plan file with intentional design gaps + fs.writeFileSync(path.join(reviewDir, 'plan.md'), `# Plan: User Dashboard + +## Context +Build a user dashboard that shows account stats, recent activity, and settings. + +## Implementation +1. Create a dashboard page at /dashboard +2. Show user stats (posts, followers, engagement rate) +3. Add a recent activity feed +4. Add a settings panel +5. Use a clean, modern UI with cards and icons +6. Add a hero section at the top with a gradient background + +## Technical Details +- React components with Tailwind CSS +- API endpoint: GET /api/dashboard +- WebSocket for real-time activity updates +`); + + run('git', ['add', '.']); + run('git', ['commit', '-m', 'initial plan']); }); afterAll(() => { try { fs.rmSync(reviewDir, { recursive: true, force: true }); } catch {} }); - testIfSelected('plan-design-review-audit', async () => { + testIfSelected('plan-design-review-plan-mode', async () => { const result = await runSkillTest({ - prompt: `IMPORTANT: The browse binary is already assigned below as B. Do NOT search for it or run the SKILL.md setup block — just use $B directly. + prompt: `Read plan-design-review/SKILL.md for the design review workflow. -B="${browseBin}" +Review the plan in ./plan.md. This plan has several design gaps — it uses vague language like "clean, modern UI" and "cards and icons", mentions a "hero section with gradient" (AI slop), and doesn't specify empty states, error states, loading states, responsive behavior, or accessibility. -Read plan-design-review/SKILL.md for the design review workflow. +Skip the preamble bash block. Skip any AskUserQuestion calls — this is non-interactive. Rate each design dimension 0-10 and explain what would make it a 10. Then EDIT plan.md to add the missing design decisions (interaction state table, empty states, responsive behavior, etc.). -Review the site at ${testServer.url}. Use --quick mode (homepage + 2 pages). Skip any AskUserQuestion calls — this is non-interactive. Write your audit report to ./design-audit.md. Do not offer to create DESIGN.md. - -EFFICIENCY: Skip the preamble bash block. Combine multiple browse commands into single bash blocks (e.g. run all Phase 2 JS extractions in one block). Write the report as soon as you have enough data — do not over-explore.`, +IMPORTANT: Do NOT try to browse any URLs or use a browse binary. This is a plan review, not a live site audit. Just read the plan file, review it, and edit it to fix the gaps.`, workingDirectory: reviewDir, - maxTurns: 30, - timeout: 360_000, - testName: 'plan-design-review-audit', + maxTurns: 15, + timeout: 300_000, + testName: 'plan-design-review-plan-mode', runId, }); - logCost('/plan-design-review audit', result); + logCost('/plan-design-review plan-mode', result); - const reportPath = path.join(reviewDir, 'design-audit.md'); - const reportExists = fs.existsSync(reportPath); - let reportContent = ''; - if (reportExists) { - reportContent = fs.readFileSync(reportPath, 'utf-8'); - } + // Check that the agent produced design ratings (0-10 scale) + const output = result.output || ''; + const hasRatings = /\d+\/10/.test(output); + const hasDesignContent = output.toLowerCase().includes('information architecture') || + output.toLowerCase().includes('interaction state') || + output.toLowerCase().includes('ai slop') || + output.toLowerCase().includes('hierarchy'); - const hasFirstImpression = reportContent.toLowerCase().includes('first impression') || - reportContent.toLowerCase().includes('impression'); + // Check that the plan file was edited (the core new behavior) + const planAfter = fs.readFileSync(path.join(reviewDir, 'plan.md'), 'utf-8'); + const planOriginal = `# Plan: User Dashboard`; + const planWasEdited = planAfter.length > 300; // Original is ~450 chars, edited should be much longer + const planHasDesignAdditions = planAfter.toLowerCase().includes('empty') || + planAfter.toLowerCase().includes('loading') || + planAfter.toLowerCase().includes('error') || + planAfter.toLowerCase().includes('state') || + planAfter.toLowerCase().includes('responsive') || + planAfter.toLowerCase().includes('accessibility'); - recordE2E('/plan-design-review audit', 'Plan Design Review E2E', result, { - passed: reportExists && ['success', 'error_max_turns'].includes(result.exitReason), + recordE2E('/plan-design-review plan-mode', 'Plan Design Review E2E', result, { + passed: hasDesignContent && planWasEdited && ['success', 'error_max_turns'].includes(result.exitReason), }); expect(['success', 'error_max_turns']).toContain(result.exitReason); - expect(reportExists).toBe(true); - if (reportExists) { - expect(reportContent.length).toBeGreaterThan(200); - } - }, 420_000); + // Agent should produce design-relevant output about the plan + expect(hasDesignContent).toBe(true); + // Agent should have edited the plan file to add missing design decisions + expect(planWasEdited).toBe(true); + expect(planHasDesignAdditions).toBe(true); + }, 360_000); - testIfSelected('plan-design-review-export', async () => { - // Clean up previous test artifacts - try { fs.unlinkSync(path.join(reviewDir, 'design-audit.md')); } catch {} + testIfSelected('plan-design-review-no-ui-scope', async () => { + // Write a backend-only plan + fs.writeFileSync(path.join(reviewDir, 'backend-plan.md'), `# Plan: Database Migration + +## Context +Migrate user records from PostgreSQL to a new schema with better indexing. + +## Implementation +1. Create migration to add new columns to users table +2. Backfill data from legacy columns +3. Add database indexes for common query patterns +4. Update ActiveRecord models +5. Run migration in staging first, then production +`); const result = await runSkillTest({ - prompt: `IMPORTANT: The browse binary is already assigned below as B. Do NOT search for it or run the SKILL.md setup block — just use $B directly. + prompt: `Read plan-design-review/SKILL.md for the design review workflow. -B="${browseBin}" +Review the plan in ./backend-plan.md. This is a pure backend database migration plan with no UI changes. -Read plan-design-review/SKILL.md for the design review workflow. +Skip the preamble bash block. Skip any AskUserQuestion calls — this is non-interactive. Write your findings directly to stdout. -Review ${testServer.url} with --quick mode. Skip any AskUserQuestion calls — this is non-interactive. After Phase 2 (Design System Extraction), write a DESIGN.md to the working directory. Also write the audit report to ./design-audit.md.`, +IMPORTANT: Do NOT try to browse any URLs or use a browse binary. This is a plan review, not a live site audit.`, workingDirectory: reviewDir, - maxTurns: 25, - timeout: 360_000, - testName: 'plan-design-review-export', + maxTurns: 10, + timeout: 180_000, + testName: 'plan-design-review-no-ui-scope', runId, }); - logCost('/plan-design-review export', result); + logCost('/plan-design-review no-ui-scope', result); - const designPath = path.join(reviewDir, 'DESIGN.md'); - const reportPath = path.join(reviewDir, 'design-audit.md'); - const designExists = fs.existsSync(designPath); - const reportExists = fs.existsSync(reportPath); + // Agent should detect no UI scope and exit early + const output = result.output || ''; + const detectsNoUI = output.toLowerCase().includes('no ui') || + output.toLowerCase().includes('no frontend') || + output.toLowerCase().includes('no design') || + output.toLowerCase().includes('not applicable') || + output.toLowerCase().includes('backend'); - let designContent = ''; - if (designExists) { - designContent = fs.readFileSync(designPath, 'utf-8'); - } - - const hasTypography = designContent.toLowerCase().includes('typography') || designContent.toLowerCase().includes('font'); - const hasColor = designContent.toLowerCase().includes('color'); - - recordE2E('/plan-design-review export', 'Plan Design Review E2E', result, { - passed: designExists && ['success', 'error_max_turns'].includes(result.exitReason), + recordE2E('/plan-design-review no-ui-scope', 'Plan Design Review E2E', result, { + passed: detectsNoUI && ['success', 'error_max_turns'].includes(result.exitReason), }); expect(['success', 'error_max_turns']).toContain(result.exitReason); - // DESIGN.md export is best-effort — agent may not always produce it - if (designExists) { - expect(hasTypography || hasColor).toBe(true); - } - }, 420_000); + expect(detectsNoUI).toBe(true); + }, 240_000); }); -// --- QA Design Review E2E --- +// --- Design Review E2E (live-site audit + fix) --- -describeIfSelected('QA Design Review E2E', ['qa-design-review-fix'], () => { +describeIfSelected('Design Review E2E', ['design-review-fix'], () => { let qaDesignDir: string; let qaDesignServer: ReturnType | null = null; @@ -2376,11 +2517,11 @@ describeIfSelected('QA Design Review E2E', ['qa-design-review-fix'], () => { }, }); - // Copy qa-design-review skill - fs.mkdirSync(path.join(qaDesignDir, 'qa-design-review'), { recursive: true }); + // Copy design-review skill + fs.mkdirSync(path.join(qaDesignDir, 'design-review'), { recursive: true }); fs.copyFileSync( - path.join(ROOT, 'qa-design-review', 'SKILL.md'), - path.join(qaDesignDir, 'qa-design-review', 'SKILL.md'), + path.join(ROOT, 'design-review', 'SKILL.md'), + path.join(qaDesignDir, 'design-review', 'SKILL.md'), ); }); @@ -2389,7 +2530,7 @@ describeIfSelected('QA Design Review E2E', ['qa-design-review-fix'], () => { try { fs.rmSync(qaDesignDir, { recursive: true, force: true }); } catch {} }); - test('Test 7: /qa-design-review audits and fixes design issues', async () => { + test('Test 7: /design-review audits and fixes design issues', async () => { const serverUrl = `http://localhost:${(qaDesignServer as any)?.port}`; const result = await runSkillTest({ @@ -2397,17 +2538,17 @@ describeIfSelected('QA Design Review E2E', ['qa-design-review-fix'], () => { B="${browseBin}" -Read qa-design-review/SKILL.md for the design review + fix workflow. +Read design-review/SKILL.md for the design review + fix workflow. Review the site at ${serverUrl}. Use --quick mode. Skip any AskUserQuestion calls — this is non-interactive. Fix up to 3 issues max. Write your report to ./design-audit.md.`, workingDirectory: qaDesignDir, maxTurns: 30, timeout: 360_000, - testName: 'qa-design-review-fix', + testName: 'design-review-fix', runId, }); - logCost('/qa-design-review fix', result); + logCost('/design-review fix', result); const reportPath = path.join(qaDesignDir, 'design-audit.md'); const reportExists = fs.existsSync(reportPath); @@ -2419,7 +2560,7 @@ Review the site at ${serverUrl}. Use --quick mode. Skip any AskUserQuestion call const commits = gitLog.stdout.toString().trim().split('\n'); const designFixCommits = commits.filter((c: string) => c.includes('style(design)')); - recordE2E('/qa-design-review fix', 'QA Design Review E2E', result, { + recordE2E('/design-review fix', 'Design Review E2E', result, { passed: ['success', 'error_max_turns'].includes(result.exitReason), }); diff --git a/test/skill-llm-eval.test.ts b/test/skill-llm-eval.test.ts index c3e1aef2..528d5115 100644 --- a/test/skill-llm-eval.test.ts +++ b/test/skill-llm-eval.test.ts @@ -464,6 +464,210 @@ describeIfSelected('Baseline score pinning', ['baseline score pinning'], () => { }, 60_000); }); +// --- Workflow SKILL.md quality evals (10 new tests for 100% coverage) --- + +/** + * DRY helper for workflow SKILL.md judge tests. + * Extracts a section from a SKILL.md file and judges its quality as an agent workflow. + */ +async function runWorkflowJudge(opts: { + testName: string; + suite: string; + skillPath: string; + startMarker: string; + endMarker: string | null; + judgeContext: string; + judgeGoal: string; + thresholds?: { clarity: number; completeness: number; actionability: number }; +}) { + const t0 = Date.now(); + const defaults = { clarity: 4, completeness: 3, actionability: 4 }; + const thresholds = { ...defaults, ...opts.thresholds }; + + const content = fs.readFileSync(path.join(ROOT, opts.skillPath), 'utf-8'); + const startIdx = content.indexOf(opts.startMarker); + if (startIdx === -1) throw new Error(`Start marker not found in ${opts.skillPath}: "${opts.startMarker}"`); + + let section: string; + if (opts.endMarker) { + const endIdx = content.indexOf(opts.endMarker, startIdx); + if (endIdx === -1) throw new Error(`End marker not found in ${opts.skillPath}: "${opts.endMarker}"`); + section = content.slice(startIdx, endIdx); + } else { + section = content.slice(startIdx); + } + + const scores = await callJudge(`You are evaluating the quality of ${opts.judgeContext} for an AI coding agent. + +The agent reads this document to learn ${opts.judgeGoal}. It references external tools and files +that are documented separately — do NOT penalize for missing external definitions. + +Rate on three dimensions (1-5 scale): +- **clarity** (1-5): Can an agent follow the instructions without ambiguity? +- **completeness** (1-5): Are all steps, decision points, and outputs well-defined? +- **actionability** (1-5): Can an agent execute this workflow and produce the expected deliverables? + +Respond with ONLY valid JSON: +{"clarity": N, "completeness": N, "actionability": N, "reasoning": "brief explanation"} + +Here is the document to evaluate: + +${section}`); + + console.log(`${opts.testName} scores:`, JSON.stringify(scores, null, 2)); + + evalCollector?.addTest({ + name: opts.testName, + suite: opts.suite, + tier: 'llm-judge', + passed: scores.clarity >= thresholds.clarity && scores.completeness >= thresholds.completeness && scores.actionability >= thresholds.actionability, + duration_ms: Date.now() - t0, + cost_usd: 0.02, + judge_scores: { clarity: scores.clarity, completeness: scores.completeness, actionability: scores.actionability }, + judge_reasoning: scores.reasoning, + }); + + expect(scores.clarity).toBeGreaterThanOrEqual(thresholds.clarity); + expect(scores.completeness).toBeGreaterThanOrEqual(thresholds.completeness); + expect(scores.actionability).toBeGreaterThanOrEqual(thresholds.actionability); +} + +// Block 1: Ship & Release skills +describeIfSelected('Ship & Release skill evals', ['ship/SKILL.md workflow', 'document-release/SKILL.md workflow'], () => { + testIfSelected('ship/SKILL.md workflow', async () => { + await runWorkflowJudge({ + testName: 'ship/SKILL.md workflow', + suite: 'Ship & Release skill evals', + skillPath: 'ship/SKILL.md', + startMarker: '# Ship:', + endMarker: '## Important Rules', + judgeContext: 'a ship/release workflow document', + judgeGoal: 'how to create a PR: merge base branch, run tests, review diff, bump version, update changelog, push, and open PR', + }); + }, 30_000); + + testIfSelected('document-release/SKILL.md workflow', async () => { + await runWorkflowJudge({ + testName: 'document-release/SKILL.md workflow', + suite: 'Ship & Release skill evals', + skillPath: 'document-release/SKILL.md', + startMarker: '# Document Release:', + endMarker: '## Important Rules', + judgeContext: 'a post-ship documentation update workflow', + judgeGoal: 'how to audit and update project documentation after code ships: README, ARCHITECTURE, CONTRIBUTING, CLAUDE.md, CHANGELOG, TODOS', + }); + }, 30_000); +}); + +// Block 2: Plan Review skills +describeIfSelected('Plan Review skill evals', [ + 'plan-ceo-review/SKILL.md modes', 'plan-eng-review/SKILL.md sections', 'plan-design-review/SKILL.md passes', +], () => { + testIfSelected('plan-ceo-review/SKILL.md modes', async () => { + await runWorkflowJudge({ + testName: 'plan-ceo-review/SKILL.md modes', + suite: 'Plan Review skill evals', + skillPath: 'plan-ceo-review/SKILL.md', + startMarker: '## Step 0: Nuclear Scope Challenge', + endMarker: '## Review Sections', + judgeContext: 'a CEO/founder plan review framework with 4 scope modes', + judgeGoal: 'how to conduct a CEO-perspective plan review: challenge scope, select a mode (Expansion, Selective Expansion, Hold Scope, Reduction), then review sections interactively', + }); + }, 30_000); + + testIfSelected('plan-eng-review/SKILL.md sections', async () => { + await runWorkflowJudge({ + testName: 'plan-eng-review/SKILL.md sections', + suite: 'Plan Review skill evals', + skillPath: 'plan-eng-review/SKILL.md', + startMarker: '## BEFORE YOU START:', + endMarker: '## CRITICAL RULE', + judgeContext: 'an engineering plan review framework with 4 review sections', + judgeGoal: 'how to review a plan for architecture quality, code quality, test coverage, and performance — walking through each section interactively with AskUserQuestion', + }); + }, 30_000); + + testIfSelected('plan-design-review/SKILL.md passes', async () => { + await runWorkflowJudge({ + testName: 'plan-design-review/SKILL.md passes', + suite: 'Plan Review skill evals', + skillPath: 'plan-design-review/SKILL.md', + startMarker: '## Review Sections', + endMarker: '## CRITICAL RULE', + judgeContext: 'a design plan review framework with 7 review passes', + judgeGoal: 'how to review a plan for design quality using a 0-10 rating method: rate each dimension, explain what a 10 looks like, edit the plan to fix gaps, then re-rate', + }); + }, 30_000); +}); + +// Block 3: Design skills +describeIfSelected('Design skill evals', ['design-review/SKILL.md fix loop', 'design-consultation/SKILL.md research'], () => { + testIfSelected('design-review/SKILL.md fix loop', async () => { + await runWorkflowJudge({ + testName: 'design-review/SKILL.md fix loop', + suite: 'Design skill evals', + skillPath: 'design-review/SKILL.md', + startMarker: '## Phase 7:', + endMarker: '## Additional Rules', + judgeContext: 'a design audit triage and fix loop workflow', + judgeGoal: 'how to triage design issues by severity, fix them atomically in source code, commit each fix, and re-verify with before/after screenshots', + }); + }, 30_000); + + testIfSelected('design-consultation/SKILL.md research', async () => { + await runWorkflowJudge({ + testName: 'design-consultation/SKILL.md research', + suite: 'Design skill evals', + skillPath: 'design-consultation/SKILL.md', + startMarker: '## Phase 1:', + endMarker: '## Phase 4:', + judgeContext: 'a design consultation research and proposal workflow', + judgeGoal: 'how to gather product context, research the competitive landscape, and produce a complete design system proposal with typography, color, spacing, and motion specifications', + }); + }, 30_000); +}); + +// Block 4: Other skills +describeIfSelected('Other skill evals', [ + 'retro/SKILL.md instructions', 'qa-only/SKILL.md workflow', 'gstack-upgrade/SKILL.md upgrade flow', +], () => { + testIfSelected('retro/SKILL.md instructions', async () => { + await runWorkflowJudge({ + testName: 'retro/SKILL.md instructions', + suite: 'Other skill evals', + skillPath: 'retro/SKILL.md', + startMarker: '## Instructions', + endMarker: '## Compare Mode', + judgeContext: 'an engineering retrospective data gathering and analysis workflow', + judgeGoal: 'how to gather git metrics (commit history, test counts, work patterns), analyze them, produce a structured retro report with praise, growth areas, and trend tracking', + }); + }, 30_000); + + testIfSelected('qa-only/SKILL.md workflow', async () => { + await runWorkflowJudge({ + testName: 'qa-only/SKILL.md workflow', + suite: 'Other skill evals', + skillPath: 'qa-only/SKILL.md', + startMarker: '## Workflow', + endMarker: '## Important Rules', + judgeContext: 'a report-only QA testing workflow', + judgeGoal: 'how to systematically QA test a web application and produce a structured report with health score, screenshots, and repro steps — without fixing anything', + }); + }, 30_000); + + testIfSelected('gstack-upgrade/SKILL.md upgrade flow', async () => { + await runWorkflowJudge({ + testName: 'gstack-upgrade/SKILL.md upgrade flow', + suite: 'Other skill evals', + skillPath: 'gstack-upgrade/SKILL.md', + startMarker: '## Inline upgrade flow', + endMarker: '## Standalone usage', + judgeContext: 'a version upgrade detection and execution workflow', + judgeGoal: 'how to detect install type, compare versions, back up current install, upgrade via git or fresh clone, run setup, and show what changed', + }); + }, 30_000); +}); + // Module-level afterAll — finalize eval collector after all tests complete afterAll(async () => { if (evalCollector) { diff --git a/test/skill-validation.test.ts b/test/skill-validation.test.ts index 81d97d31..bd0e205b 100644 --- a/test/skill-validation.test.ts +++ b/test/skill-validation.test.ts @@ -72,15 +72,29 @@ describe('SKILL.md command validation', () => { expect(result.snapshotFlagErrors).toHaveLength(0); }); - test('all $B commands in qa-design-review/SKILL.md are valid browse commands', () => { - const skill = path.join(ROOT, 'qa-design-review', 'SKILL.md'); + test('all $B commands in design-review/SKILL.md are valid browse commands', () => { + const skill = path.join(ROOT, 'design-review', 'SKILL.md'); if (!fs.existsSync(skill)) return; const result = validateSkill(skill); expect(result.invalid).toHaveLength(0); }); - test('all snapshot flags in qa-design-review/SKILL.md are valid', () => { - const skill = path.join(ROOT, 'qa-design-review', 'SKILL.md'); + test('all snapshot flags in design-review/SKILL.md are valid', () => { + const skill = path.join(ROOT, 'design-review', 'SKILL.md'); + if (!fs.existsSync(skill)) return; + const result = validateSkill(skill); + expect(result.snapshotFlagErrors).toHaveLength(0); + }); + + test('all $B commands in design-consultation/SKILL.md are valid browse commands', () => { + const skill = path.join(ROOT, 'design-consultation', 'SKILL.md'); + if (!fs.existsSync(skill)) return; + const result = validateSkill(skill); + expect(result.invalid).toHaveLength(0); + }); + + test('all snapshot flags in design-consultation/SKILL.md are valid', () => { + const skill = path.join(ROOT, 'design-consultation', 'SKILL.md'); if (!fs.existsSync(skill)) return; const result = validateSkill(skill); expect(result.snapshotFlagErrors).toHaveLength(0); @@ -205,7 +219,7 @@ describe('Update check preamble', () => { 'plan-ceo-review/SKILL.md', 'plan-eng-review/SKILL.md', 'retro/SKILL.md', 'plan-design-review/SKILL.md', - 'qa-design-review/SKILL.md', + 'design-review/SKILL.md', 'design-consultation/SKILL.md', 'document-release/SKILL.md', ]; @@ -430,6 +444,8 @@ describe('No hardcoded branch names in SKILL templates', () => { 'plan-ceo-review/SKILL.md.tmpl', 'retro/SKILL.md.tmpl', 'document-release/SKILL.md.tmpl', + 'plan-eng-review/SKILL.md.tmpl', + 'plan-design-review/SKILL.md.tmpl', ]; // Patterns that indicate hardcoded 'main' in git commands @@ -513,7 +529,7 @@ describe('v0.4.1 preamble features', () => { 'plan-ceo-review/SKILL.md', 'plan-eng-review/SKILL.md', 'retro/SKILL.md', 'plan-design-review/SKILL.md', - 'qa-design-review/SKILL.md', + 'design-review/SKILL.md', 'design-consultation/SKILL.md', 'document-release/SKILL.md', ]; @@ -543,6 +559,10 @@ describe('Contributor mode preamble structure', () => { 'ship/SKILL.md', 'review/SKILL.md', 'plan-ceo-review/SKILL.md', 'plan-eng-review/SKILL.md', 'retro/SKILL.md', + 'plan-design-review/SKILL.md', + 'design-review/SKILL.md', + 'design-consultation/SKILL.md', + 'document-release/SKILL.md', ]; for (const skill of skillsWithPreamble) { @@ -628,7 +648,7 @@ describe('Completeness Principle in generated SKILL.md files', () => { 'plan-ceo-review/SKILL.md', 'plan-eng-review/SKILL.md', 'retro/SKILL.md', 'plan-design-review/SKILL.md', - 'qa-design-review/SKILL.md', + 'design-review/SKILL.md', 'design-consultation/SKILL.md', 'document-release/SKILL.md', ]; @@ -801,8 +821,8 @@ describe('Test Bootstrap ({{TEST_BOOTSTRAP}}) integration', () => { expect(content).toContain('Step 2.5'); }); - test('TEST_BOOTSTRAP appears in qa-design-review/SKILL.md', () => { - const content = fs.readFileSync(path.join(ROOT, 'qa-design-review', 'SKILL.md'), 'utf-8'); + test('TEST_BOOTSTRAP appears in design-review/SKILL.md', () => { + const content = fs.readFileSync(path.join(ROOT, 'design-review', 'SKILL.md'), 'utf-8'); expect(content).toContain('Test Framework Bootstrap'); }); @@ -843,10 +863,10 @@ describe('Test Bootstrap ({{TEST_BOOTSTRAP}}) integration', () => { expect(content).toContain('100% test coverage'); }); - test('WebSearch is in allowed-tools for qa, ship, qa-design-review', () => { + test('WebSearch is in allowed-tools for qa, ship, design-review', () => { const qa = fs.readFileSync(path.join(ROOT, 'qa', 'SKILL.md'), 'utf-8'); const ship = fs.readFileSync(path.join(ROOT, 'ship', 'SKILL.md'), 'utf-8'); - const qaDesign = fs.readFileSync(path.join(ROOT, 'qa-design-review', 'SKILL.md'), 'utf-8'); + const qaDesign = fs.readFileSync(path.join(ROOT, 'design-review', 'SKILL.md'), 'utf-8'); expect(qa).toContain('WebSearch'); expect(ship).toContain('WebSearch'); expect(qaDesign).toContain('WebSearch'); @@ -869,8 +889,8 @@ describe('Phase 8e.5 regression test generation', () => { expect(content).not.toContain('Never modify tests or CI configuration'); }); - test('qa-design-review has CSS-aware Phase 8e.5 variant', () => { - const content = fs.readFileSync(path.join(ROOT, 'qa-design-review', 'SKILL.md'), 'utf-8'); + test('design-review has CSS-aware Phase 8e.5 variant', () => { + const content = fs.readFileSync(path.join(ROOT, 'design-review', 'SKILL.md'), 'utf-8'); expect(content).toContain('8e.5. Regression Test (design-review variant)'); expect(content).toContain('CSS-only'); expect(content).toContain('test(design): regression test'); diff --git a/test/touchfiles.test.ts b/test/touchfiles.test.ts index e666bb3d..48613d64 100644 --- a/test/touchfiles.test.ts +++ b/test/touchfiles.test.ts @@ -66,7 +66,7 @@ describe('selectTests', () => { expect(result.selected).toContain('browse-snapshot'); expect(result.selected).toContain('qa-quick'); expect(result.selected).toContain('qa-fix-loop'); - expect(result.selected).toContain('qa-design-review-fix'); + expect(result.selected).toContain('design-review-fix'); expect(result.reason).toBe('diff'); // Should NOT include unrelated tests expect(result.selected).not.toContain('plan-ceo-review');