diff --git a/AGENTS.md b/AGENTS.md index d8721745..a6d6de2d 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -6,7 +6,10 @@ designer, QA lead, release engineer, debugger, and more. ## Available skills -Skills live in `.agents/skills/`. Invoke them by name (e.g., `/office-hours`). +Skills live in `.agents/skills/` (or `~/.claude/skills/gstack/` on Claude Code). +Invoke them by name (e.g., `/office-hours`). + +### Plan-mode reviews | Skill | What it does | |-------|-------------| @@ -14,36 +17,95 @@ Skills live in `.agents/skills/`. Invoke them by name (e.g., `/office-hours`). | `/plan-ceo-review` | CEO-level review: find the 10-star product in the request. | | `/plan-eng-review` | Lock architecture, data flow, edge cases, and tests. | | `/plan-design-review` | Rate each design dimension 0-10, explain what a 10 looks like. | +| `/plan-devex-review` | DX-mode review: TTHW, magical moments, friction points, persona traces. | +| `/plan-tune` | Self-tune AskUserQuestion sensitivity per question. | +| `/autoplan` | One command runs CEO → design → eng → DX review. | | `/design-consultation` | Build a complete design system from scratch. | + +### Implementation + review + +| Skill | What it does | +|-------|-------------| | `/review` | Pre-landing PR review. Finds bugs that pass CI but break in prod. | -| `/debug` | Systematic root-cause debugging. No fixes without investigation. | -| `/design-review` | Design audit + fix loop with atomic commits. | +| `/codex` | Second opinion via OpenAI Codex. Review, challenge, or consult modes. | +| `/investigate` | Systematic root-cause debugging. No fixes without investigation. | +| `/design-review` | Live-site visual audit + fix loop with atomic commits. | +| `/design-shotgun` | Generate multiple AI design variants, comparison board, iterate. | +| `/design-html` | Generate production-quality Pretext-native HTML/CSS. | +| `/devex-review` | Live developer experience audit (TTHW measured against the real flow). 
| | `/qa` | Open a real browser, find bugs, fix them, re-verify. | -| `/qa-only` | Same as /qa but report only — no code changes. | -| `/ship` | Run tests, review, push, open PR. One command. | +| `/qa-only` | Same methodology as /qa but report only — no code changes. | + +### Release + deploy + +| Skill | What it does | +|-------|-------------| +| `/ship` | Run tests, review, push, open PR. Workspace-aware version queue. | +| `/land-and-deploy` | Merge the PR, wait for CI and deploy, verify production health. | +| `/canary` | Post-deploy monitoring loop using the browse daemon. | +| `/landing-report` | Read-only dashboard for the workspace-aware ship queue. | | `/document-release` | Update all docs to match what you just shipped. | +| `/setup-deploy` | One-time deploy config detection (Fly.io, Render, Vercel, etc.). | +| `/gstack-upgrade` | Update gstack to the latest version. | + +### Operational + memory + +| Skill | What it does | +|-------|-------------| +| `/context-save` | Save working context (git state, decisions, remaining work). | +| `/context-restore` | Resume from a saved context, even across Conductor workspaces. | +| `/learn` | Manage what gstack learned across sessions. | | `/retro` | Weekly retro with per-person breakdowns and shipping streaks. | +| `/health` | Code quality dashboard (type checker, linter, tests, dead code). | +| `/benchmark` | Performance regression detection (page load, Core Web Vitals). | +| `/benchmark-models` | Cross-model benchmark for skills (Claude, GPT, Gemini side-by-side). | +| `/cso` | OWASP Top 10 + STRIDE security audit. | +| `/setup-gbrain` | Set up gbrain for cross-machine session memory sync. | + +### Browser + agent integration + +| Skill | What it does | +|-------|-------------| | `/browse` | Headless browser — real Chromium, real clicks, ~100ms/command. | +| `/open-gstack-browser` | Launch the visible GStack Browser with sidebar + stealth. 
| | `/setup-browser-cookies` | Import cookies from your real browser for authenticated testing. | +| `/pair-agent` | Pair a remote AI agent (OpenClaw, Codex, etc.) with your browser. | + +### Safety + scoping + +| Skill | What it does | +|-------|-------------| | `/careful` | Warn before destructive commands (rm -rf, DROP TABLE, force-push). | | `/freeze` | Lock edits to one directory. Hard block, not just a warning. | | `/guard` | Activate both careful + freeze at once. | | `/unfreeze` | Remove directory edit restrictions. | -| `/gstack-upgrade` | Update gstack to the latest version. | +| `/make-pdf` | Turn any markdown file into a publication-quality PDF. | ## Build commands ```bash bun install # install dependencies -bun test # run tests (free, <5s) +bun test # run free tests (no API spend) +bun run test:windows # curated Windows-safe subset (runs on windows-latest) bun run build # generate docs + compile binaries bun run gen:skill-docs # regenerate SKILL.md files from templates bun run skill:check # health dashboard for all skills ``` +## Platform support + +- **macOS** + **Linux**: full test suite supported. +- **Windows**: curated Windows-safe subset runs on `windows-latest` via the + `windows-free-tests` CI job. Setup script (`./setup`) requires Git Bash or + MSYS today; native PowerShell support is a future expansion. The `bin/gstack-paths` + helper resolves state roots through `CLAUDE_PLUGIN_DATA` / `GSTACK_HOME` so plugin + installs work on every platform. + ## Key conventions - SKILL.md files are **generated** from `.tmpl` templates. Edit the template, not the output. - Run `bun run gen:skill-docs --host codex` to regenerate Codex-specific output. - The browse binary provides headless browser access. Use `$B <command>` in skills. - Safety skills (careful, freeze, guard) use inline advisory prose — always confirm before destructive operations. +- State paths resolve via `bin/gstack-paths` (sourced via `eval "$(...)"`). 
Honors `GSTACK_HOME`, `CLAUDE_PLUGIN_DATA`, `CLAUDE_PLANS_DIR`. +- The `claude` CLI binary resolves via `browse/src/claude-bin.ts` (`Bun.which()` + `GSTACK_CLAUDE_BIN` override). Set `GSTACK_CLAUDE_BIN=wsl` plus `GSTACK_CLAUDE_BIN_ARGS='["claude"]'` to run Claude through WSL on Windows. diff --git a/docs/skills.md b/docs/skills.md index 71d5b68d..025ee229 100644 --- a/docs/skills.md +++ b/docs/skills.md @@ -25,11 +25,21 @@ Detailed guides for every gstack skill — philosophy, workflow, and examples. | [`/retro`](#retro) | **Eng Manager** | Team-aware weekly retro. Per-person breakdowns, shipping streaks, test health trends, growth opportunities. | | [`/browse`](#browse) | **QA Engineer** | Give the agent eyes. Real Chromium browser, real clicks, real screenshots. ~100ms per command. | | [`/setup-browser-cookies`](#setup-browser-cookies) | **Session Manager** | Import cookies from your real browser (Chrome, Arc, Brave, Edge) into the headless session. Test authenticated pages. | -| [`/autoplan`](#autoplan) | **Review Pipeline** | One command, fully reviewed plan. Runs CEO → design → eng review automatically with encoded decision principles. Surfaces only taste decisions for your approval. | +| [`/autoplan`](#autoplan) | **Review Pipeline** | One command, fully reviewed plan. Runs CEO → design → eng → DX review automatically with encoded decision principles. Surfaces only taste decisions for your approval. | +| [`/plan-devex-review`](#plan-devex-review) | **DX Reviewer** | Plan-stage DX review. TTHW (time-to-hello-world), magical moments, friction points, persona traces. Three modes: Expansion, Polish, Triage. | +| [`/devex-review`](#devex-review) | **DX Reviewer (live)** | Live developer experience audit. Walks the actual onboarding flow, measures TTHW, catches where the docs lie. | +| [`/plan-tune`](#plan-tune) | **Question Tuner** | Self-tune AskUserQuestion sensitivity per question. Mark questions as never-ask, always-ask, or only-for-one-way. 
| | [`/learn`](#learn) | **Memory** | Manage what gstack learned across sessions. Review, search, prune, and export project-specific patterns and preferences. | +| [`/context-save`](#context-save) | **Save State** | Save working context (git state, decisions, remaining work) so any future session can resume. | +| [`/context-restore`](#context-restore) | **Restore State** | Resume from a saved context, even across Conductor workspace handoffs. | +| [`/health`](#health) | **Code Quality Dashboard** | Wraps type checker, linter, tests, dead code detection. Computes a weighted 0-10 score; tracks trends over time. | +| [`/landing-report`](#landing-report) | **Ship Queue Dashboard** | Read-only snapshot of the workspace-aware ship queue. Which version slots are claimed, which sibling workspaces have WIP. | +| [`/benchmark-models`](#benchmark-models) | **Model Benchmark** | Side-by-side cross-model benchmark for skills (Claude vs GPT vs Gemini). Latency, tokens, cost, optional LLM-judged quality. | | | | | | **Multi-AI** | | | | [`/codex`](#codex) | **Second Opinion** | Independent review from OpenAI Codex CLI. Three modes: code review (pass/fail gate), adversarial challenge, and open consultation with session continuity. Cross-model analysis when both `/review` and `/codex` have run. | +| [`/pair-agent`](#pair-agent) | **Remote Agent Bridge** | Pair a remote AI agent (OpenClaw, Codex, Cursor, Hermes) with your browser. Scoped tunnel, locked allowlist, session token. | +| [`/setup-gbrain`](#setup-gbrain) | **Memory Sync** | Set up gbrain for cross-machine session memory sync. One command from zero to live. | | | | | | **Safety & Utility** | | | | [`/careful`](#safety--guardrails) | **Safety Guardrails** | Warns before destructive commands (rm -rf, DROP TABLE, force-push, git reset --hard). Override any warning. Common build cleanups whitelisted. | @@ -39,6 +49,7 @@ Detailed guides for every gstack skill — philosophy, workflow, and examples. 
| [`/open-gstack-browser`](#open-gstack-browser) | **GStack Browser** | Launch GStack Browser with sidebar, anti-bot stealth, auto model routing, cookie import, and Claude Code integration. Watch every action live. | | [`/setup-deploy`](#setup-deploy) | **Deploy Configurator** | One-time setup for `/land-and-deploy`. Detects your platform, production URL, and deploy commands. | | [`/gstack-upgrade`](#gstack-upgrade) | **Self-Updater** | Upgrade gstack to the latest version. Detects global vs vendored install, syncs both, shows what changed. | +| [`/make-pdf`](#make-pdf) | **PDF Generator** | Turn any markdown file into a publication-quality PDF. Proper margins, page numbers, cover pages, clickable TOC. | --- diff --git a/test/skill-validation.test.ts b/test/skill-validation.test.ts index 23b909ae..247d0e2f 100644 --- a/test/skill-validation.test.ts +++ b/test/skill-validation.test.ts @@ -1458,6 +1458,107 @@ describe('Skill trigger phrases', () => { } }); +// ─── Private-path leak detector ────────────────────────────── +// +// Catches accidental references to maintainer-private files in skill output. +// Adapted from the McGluut fork's skill-contract-audit.ts (we don't take the +// whole script — these are the unique checks not already covered by +// test/gen-skill-docs.test.ts:1668-2074 .claude/skills leakage tests). + +describe('Private-path leak detection', () => { + const PRIVATE_PATTERNS: Array<{ pattern: RegExp; label: string }> = [ + { pattern: /coordination-board\.md/i, label: 'coordination-board.md' }, + { pattern: /SEEKING_LOG\.md/, label: 'SEEKING_LOG.md' }, + { pattern: /RATIONAL_SUBJECT\.md/, label: 'RATIONAL_SUBJECT.md' }, + { pattern: /VALUE_SIGNAL_LOOP\.md/, label: 'VALUE_SIGNAL_LOOP.md' }, + { pattern: /C:\\\\LLM Playground\\\\go/i, label: 'C:\\LLM Playground\\go' }, + ]; + + // Walk every SKILL.md and SKILL.md.tmpl in the repo (excluding node_modules, + // generated host outputs, and .git). 
+ function discoverSkillSurface(): string[] { + const results: string[] = []; + function walk(dir: string) { + for (const entry of fs.readdirSync(dir, { withFileTypes: true })) { + if (entry.name.startsWith('.') && entry.name !== '.agents') continue; + if (entry.name === 'node_modules' || entry.name === 'dist') continue; + const full = path.join(dir, entry.name); + if (entry.isDirectory()) { + walk(full); + } else if (entry.name === 'SKILL.md' || entry.name === 'SKILL.md.tmpl') { + results.push(full); + } + } + } + walk(ROOT); + return results; + } + + test('no SKILL.md or SKILL.md.tmpl references private maintainer files', () => { + const files = discoverSkillSurface(); + expect(files.length).toBeGreaterThan(0); + const leaks: string[] = []; + for (const file of files) { + const content = fs.readFileSync(file, 'utf-8'); + for (const { pattern, label } of PRIVATE_PATTERNS) { + if (pattern.test(content)) { + leaks.push(`${path.relative(ROOT, file)} mentions ${label}`); + } + } + } + expect(leaks).toEqual([]); + }); +}); + +// ─── Doc-inventory cross-check ─────────────────────────────── +// +// Every skill directory (with a SKILL.md.tmpl) must appear in both AGENTS.md +// and docs/skills.md. Catches the inventory drift codex flagged (/debug +// → /investigate; missing /autoplan, /context-save, /plan-devex-review, etc.). + +describe('Doc inventory cross-check', () => { + // Skills that don't get user-invocation lines in agent-facing docs. + // - 'qa-only' is a sub-mode of /qa with shared docs. + // - The 5 listed below are infrastructure (model overlays, shipped binary, + // hosts) that don't show up in the user-facing skill table. 
+ const DOC_INVENTORY_EXCLUDE = new Set([ + // Infra / non-skills + 'agents', 'claude', 'connect-chrome', 'contrib', 'hosts', + 'lib', 'model-overlays', 'openclaw', 'supabase', 'scripts', 'test', + ]); + + function discoverSkillDirs(): string[] { + const dirs: string[] = []; + for (const entry of fs.readdirSync(ROOT, { withFileTypes: true })) { + if (!entry.isDirectory()) continue; + if (entry.name.startsWith('.')) continue; + if (DOC_INVENTORY_EXCLUDE.has(entry.name)) continue; + const tmplPath = path.join(ROOT, entry.name, 'SKILL.md.tmpl'); + if (fs.existsSync(tmplPath)) dirs.push(entry.name); + } + return dirs.sort(); + } + + test('every skill is documented in AGENTS.md', () => { + const agents = fs.readFileSync(path.join(ROOT, 'AGENTS.md'), 'utf-8'); + const missing: string[] = []; + for (const skill of discoverSkillDirs()) { + // Match `/skill-name` as a token boundary. + if (!new RegExp(`/${skill}\\b`).test(agents)) missing.push(skill); + } + expect(missing).toEqual([]); + }); + + test('every skill is documented in docs/skills.md', () => { + const docs = fs.readFileSync(path.join(ROOT, 'docs', 'skills.md'), 'utf-8'); + const missing: string[] = []; + for (const skill of discoverSkillDirs()) { + if (!new RegExp(`/${skill}\\b`).test(docs)) missing.push(skill); + } + expect(missing).toEqual([]); + }); +}); + // ─── Codex Skill Validation ────────────────────────────────── describe('Codex skill validation', () => {