From 9870a4ec49078ad3fc150c3d93605401a747af6d Mon Sep 17 00:00:00 2001 From: Garry Tan Date: Tue, 24 Mar 2026 20:32:30 -0700 Subject: [PATCH 1/9] =?UTF-8?q?fix:=20Windows=20browse=20=E2=80=94=20stdio?= =?UTF-8?q?=20array=20format=20for=20Bun=20compatibility=20(v0.11.18.2)=20?= =?UTF-8?q?(#468)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix: use stdio array format for Bun Windows compatibility Bun on Windows requires stdio as ['ignore','ignore','ignore'] array, not 'ignore' string. Fixes #448, #454, #458. Closes #444. Co-Authored-By: Claude Opus 4.6 (1M context) * chore: bump version and changelog (v0.11.18.2) Co-Authored-By: Claude Opus 4.6 --------- Co-authored-by: Claude Opus 4.6 (1M context) --- CHANGELOG.md | 6 ++++++ VERSION | 2 +- browse/src/cli.ts | 4 ++-- 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 90e8335d..56d79bc6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,11 @@ # Changelog +## [0.11.18.2] - 2026-03-24 + +### Fixed + +- **Windows browse daemon fixed.** The browse server wouldn't start on Windows because Bun requires `stdio` as an array (`['ignore', 'ignore', 'ignore']`), not a string (`'ignore'`). Fixes #448, #454, #458. 
+ ## [0.11.18.1] - 2026-03-24 ### Changed diff --git a/VERSION b/VERSION index 53d7c74c..c1e61543 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -0.11.18.1 +0.11.18.2 diff --git a/browse/src/cli.ts b/browse/src/cli.ts index 2d48ecf7..25894a5d 100644 --- a/browse/src/cli.ts +++ b/browse/src/cli.ts @@ -234,9 +234,9 @@ async function startServer(): Promise { const launcherCode = `const{spawn}=require('child_process');` + `spawn(process.execPath,[${JSON.stringify(NODE_SERVER_SCRIPT)}],` + - `{detached:true,stdio:'ignore',env:Object.assign({},process.env,` + + `{detached:true,stdio:['ignore','ignore','ignore'],env:Object.assign({},process.env,` + `{BROWSE_STATE_FILE:${JSON.stringify(config.stateFile)}})}).unref()`; - Bun.spawnSync(['node', '-e', launcherCode], { stdio: 'ignore' }); + Bun.spawnSync(['node', '-e', launcherCode], { stdio: ['ignore', 'ignore', 'ignore'] }); } else { // macOS/Linux: Bun.spawn + unref works correctly proc = Bun.spawn(['bun', 'run', SERVER_SCRIPT], { From aa7daf052ece077ab3d05da3834ad7a029b79bc9 Mon Sep 17 00:00:00 2001 From: Garry Tan Date: Wed, 25 Mar 2026 23:07:07 -0700 Subject: [PATCH 2/9] fix: Codex description limit + wrong-repo bug (v0.11.19.0) (#471) * fix: Codex description limit + wrong-repo bug Move skill routing table from root SKILL.md.tmpl description (1017/1024 chars) to body. Add 900-char warning threshold test to prevent future creep. Add -C flag to all 14 codex exec calls so Codex always runs in the correct git root. Fix pre-existing package.json version mismatch. Co-Authored-By: Claude Opus 4.6 (1M context) * fix: Codex description limit + wrong-repo bug Move skill routing table from root SKILL.md.tmpl description (1017/1024 chars) to body where there's no length limit. Add 900-char warning threshold test. Add -C flag to all codex exec calls so Codex always runs in the correct git root directory. 
Co-Authored-By: Claude Opus 4.6 (1M context) * chore: regenerate SKILL.md files from updated templates Co-Authored-By: Claude Opus 4.6 (1M context) * chore: bump version and changelog (v0.11.19.0) Co-Authored-By: Claude Opus 4.6 (1M context) * fix: Codex wrong-repo + routing table to body + 900-char guard (v0.11.19.0) - Add -C "$(git rev-parse --show-toplevel)" to all 14 codex exec calls so Codex always runs in the correct repo (fixes Conductor multi-workspace bug) - Move skill routing table from description to body in SKILL.md.tmpl (description was already shortened on main; routing table was missing from body) - Add 900-char warning threshold test for Codex descriptions - Bump version + sync package.json Co-Authored-By: Claude Opus 4.6 (1M context) --------- Co-authored-by: Claude Opus 4.6 (1M context) --- CHANGELOG.md | 11 +++++++++++ SKILL.md | 22 ++++++++++++++++++++++ SKILL.md.tmpl | 22 ++++++++++++++++++++++ VERSION | 2 +- autoplan/SKILL.md | 6 +++--- autoplan/SKILL.md.tmpl | 6 +++--- codex/SKILL.md | 6 +++--- codex/SKILL.md.tmpl | 6 +++--- package.json | 2 +- scripts/resolvers/design.ts | 6 +++--- scripts/resolvers/review.ts | 6 +++--- test/gen-skill-docs.test.ts | 18 ++++++++++++++++++ 12 files changed, 93 insertions(+), 20 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 56d79bc6..56620db7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,16 @@ # Changelog +## [0.11.19.0] - 2026-03-24 + +### Fixed + +- **Auto-upgrade no longer breaks.** The root gstack skill description was 7 characters from the Codex 1024-char limit. Every new skill addition pushed it closer. Moved the skill routing table from the description (bounded) to the body (unlimited), dropping from 1017 to 409 chars with 615 chars of headroom. +- **Codex reviews now run in the correct repo.** In multi-workspace setups (like Conductor), Codex could pick up the wrong project directory. All `codex exec` calls now explicitly set `-C` to the git root. 
+ +### Added + +- **900-char early warning test.** A new test fails if any Codex skill description exceeds 900 chars, catching description bloat before it breaks builds. + ## [0.11.18.2] - 2026-03-24 ### Fixed diff --git a/SKILL.md b/SKILL.md index dada1e75..f6d2831e 100644 --- a/SKILL.md +++ b/SKILL.md @@ -297,6 +297,28 @@ If `PROACTIVE` is `false`: do NOT proactively suggest other gstack skills during Only run skills the user explicitly invokes. This preference persists across sessions via `gstack-config`. +If `PROACTIVE` is `true` (default): suggest adjacent gstack skills when relevant to the +user's workflow stage: +- Brainstorming → /office-hours +- Strategy → /plan-ceo-review +- Architecture → /plan-eng-review +- Design → /plan-design-review or /design-consultation +- Auto-review → /autoplan +- Debugging → /investigate +- QA → /qa +- Code review → /review +- Visual audit → /design-review +- Shipping → /ship +- Docs → /document-release +- Retro → /retro +- Second opinion → /codex +- Prod safety → /careful or /guard +- Scoped edits → /freeze or /unfreeze +- Upgrades → /gstack-upgrade + +If the user opts out of suggestions, run `gstack-config set proactive false`. +If they opt back in, run `gstack-config set proactive true`. + # gstack browse: QA Testing & Dogfooding Persistent headless Chromium. First call auto-starts (~3s), then ~100-200ms per command. diff --git a/SKILL.md.tmpl b/SKILL.md.tmpl index fca8fa60..31bd2837 100644 --- a/SKILL.md.tmpl +++ b/SKILL.md.tmpl @@ -20,6 +20,28 @@ If `PROACTIVE` is `false`: do NOT proactively suggest other gstack skills during Only run skills the user explicitly invokes. This preference persists across sessions via `gstack-config`. 
+If `PROACTIVE` is `true` (default): suggest adjacent gstack skills when relevant to the +user's workflow stage: +- Brainstorming → /office-hours +- Strategy → /plan-ceo-review +- Architecture → /plan-eng-review +- Design → /plan-design-review or /design-consultation +- Auto-review → /autoplan +- Debugging → /investigate +- QA → /qa +- Code review → /review +- Visual audit → /design-review +- Shipping → /ship +- Docs → /document-release +- Retro → /retro +- Second opinion → /codex +- Prod safety → /careful or /guard +- Scoped edits → /freeze or /unfreeze +- Upgrades → /gstack-upgrade + +If the user opts out of suggestions, run `gstack-config set proactive false`. +If they opt back in, run `gstack-config set proactive true`. + # gstack browse: QA Testing & Dogfooding Persistent headless Chromium. First call auto-starts (~3s), then ~100-200ms per command. diff --git a/VERSION b/VERSION index c1e61543..d20322e5 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -0.11.18.2 +0.11.19.0 diff --git a/autoplan/SKILL.md b/autoplan/SKILL.md index e9161eab..14874900 100644 --- a/autoplan/SKILL.md +++ b/autoplan/SKILL.md @@ -547,7 +547,7 @@ Override: every AskUserQuestion → auto-decide using the 6 principles. What alternatives were dismissed too quickly? What competitive or market risks are unaddressed? What scope decisions will look foolish in 6 months? Be adversarial. No compliments. Just the strategic blind spots. - File: " -s read-only --enable web_search_cached` + File: " -C "$(git rev-parse --show-toplevel)" -s read-only --enable web_search_cached` Timeout: 10 minutes **Claude CEO subagent** (via Agent tool): @@ -658,7 +658,7 @@ Override: every AskUserQuestion → auto-decide using the 6 principles. accessibility requirements (keyboard nav, contrast, touch targets) specified or aspirational? Does the plan describe specific UI decisions or generic patterns? What design decisions will haunt the implementer if left ambiguous? - Be opinionated. No hedging." 
-s read-only --enable web_search_cached` + Be opinionated. No hedging." -C "$(git rev-parse --show-toplevel)" -s read-only --enable web_search_cached` Timeout: 10 minutes **Claude design subagent** (via Agent tool): @@ -723,7 +723,7 @@ Override: every AskUserQuestion → auto-decide using the 6 principles. CEO: Design: - File: " -s read-only --enable web_search_cached` + File: " -C "$(git rev-parse --show-toplevel)" -s read-only --enable web_search_cached` Timeout: 10 minutes **Claude eng subagent** (via Agent tool): diff --git a/autoplan/SKILL.md.tmpl b/autoplan/SKILL.md.tmpl index b3e0a340..661e8fb0 100644 --- a/autoplan/SKILL.md.tmpl +++ b/autoplan/SKILL.md.tmpl @@ -204,7 +204,7 @@ Override: every AskUserQuestion → auto-decide using the 6 principles. What alternatives were dismissed too quickly? What competitive or market risks are unaddressed? What scope decisions will look foolish in 6 months? Be adversarial. No compliments. Just the strategic blind spots. - File: " -s read-only --enable web_search_cached` + File: " -C "$(git rev-parse --show-toplevel)" -s read-only --enable web_search_cached` Timeout: 10 minutes **Claude CEO subagent** (via Agent tool): @@ -315,7 +315,7 @@ Override: every AskUserQuestion → auto-decide using the 6 principles. accessibility requirements (keyboard nav, contrast, touch targets) specified or aspirational? Does the plan describe specific UI decisions or generic patterns? What design decisions will haunt the implementer if left ambiguous? - Be opinionated. No hedging." -s read-only --enable web_search_cached` + Be opinionated. No hedging." -C "$(git rev-parse --show-toplevel)" -s read-only --enable web_search_cached` Timeout: 10 minutes **Claude design subagent** (via Agent tool): @@ -380,7 +380,7 @@ Override: every AskUserQuestion → auto-decide using the 6 principles. 
CEO: Design: - File: " -s read-only --enable web_search_cached` + File: " -C "$(git rev-parse --show-toplevel)" -s read-only --enable web_search_cached` Timeout: 10 minutes **Claude eng subagent** (via Agent tool): diff --git a/codex/SKILL.md b/codex/SKILL.md index f34b8db4..8bce22e5 100644 --- a/codex/SKILL.md +++ b/codex/SKILL.md @@ -518,7 +518,7 @@ With focus (e.g., "security"): 2. Run codex exec with **JSONL output** to capture reasoning traces and tool calls (5-minute timeout): ```bash -codex exec "" -s read-only -c 'model_reasoning_effort="xhigh"' --enable web_search_cached --json 2>/dev/null | python3 -c " +codex exec "" -C "$(git rev-parse --show-toplevel)" -s read-only -c 'model_reasoning_effort="xhigh"' --enable web_search_cached --json 2>/dev/null | python3 -c " import sys, json for line in sys.stdin: line = line.strip() @@ -603,7 +603,7 @@ THE PLAN: For a **new session:** ```bash -codex exec "" -s read-only -c 'model_reasoning_effort="xhigh"' --enable web_search_cached --json 2>"$TMPERR" | python3 -c " +codex exec "" -C "$(git rev-parse --show-toplevel)" -s read-only -c 'model_reasoning_effort="xhigh"' --enable web_search_cached --json 2>"$TMPERR" | python3 -c " import sys, json for line in sys.stdin: line = line.strip() @@ -636,7 +636,7 @@ for line in sys.stdin: For a **resumed session** (user chose "Continue"): ```bash -codex exec resume "" -s read-only -c 'model_reasoning_effort="xhigh"' --enable web_search_cached --json 2>"$TMPERR" | python3 -c " +codex exec resume "" -C "$(git rev-parse --show-toplevel)" -s read-only -c 'model_reasoning_effort="xhigh"' --enable web_search_cached --json 2>"$TMPERR" | python3 -c " " ``` diff --git a/codex/SKILL.md.tmpl b/codex/SKILL.md.tmpl index c0b7adb1..338df93b 100644 --- a/codex/SKILL.md.tmpl +++ b/codex/SKILL.md.tmpl @@ -159,7 +159,7 @@ With focus (e.g., "security"): 2. 
Run codex exec with **JSONL output** to capture reasoning traces and tool calls (5-minute timeout): ```bash -codex exec "" -s read-only -c 'model_reasoning_effort="xhigh"' --enable web_search_cached --json 2>/dev/null | python3 -c " +codex exec "" -C "$(git rev-parse --show-toplevel)" -s read-only -c 'model_reasoning_effort="xhigh"' --enable web_search_cached --json 2>/dev/null | python3 -c " import sys, json for line in sys.stdin: line = line.strip() @@ -244,7 +244,7 @@ THE PLAN: For a **new session:** ```bash -codex exec "" -s read-only -c 'model_reasoning_effort="xhigh"' --enable web_search_cached --json 2>"$TMPERR" | python3 -c " +codex exec "" -C "$(git rev-parse --show-toplevel)" -s read-only -c 'model_reasoning_effort="xhigh"' --enable web_search_cached --json 2>"$TMPERR" | python3 -c " import sys, json for line in sys.stdin: line = line.strip() @@ -277,7 +277,7 @@ for line in sys.stdin: For a **resumed session** (user chose "Continue"): ```bash -codex exec resume "" -s read-only -c 'model_reasoning_effort="xhigh"' --enable web_search_cached --json 2>"$TMPERR" | python3 -c " +codex exec resume "" -C "$(git rev-parse --show-toplevel)" -s read-only -c 'model_reasoning_effort="xhigh"' --enable web_search_cached --json 2>"$TMPERR" | python3 -c " " ``` diff --git a/package.json b/package.json index 70b40909..f666c9af 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "gstack", - "version": "0.11.17.0", + "version": "0.11.19.0", "description": "Garry's Stack — Claude Code skills + fast headless browser. 
One repo, one install, entire AI engineering workflow.", "license": "MIT", "type": "module", diff --git a/scripts/resolvers/design.ts b/scripts/resolvers/design.ts index 30b1fe2c..c4926112 100644 --- a/scripts/resolvers/design.ts +++ b/scripts/resolvers/design.ts @@ -17,7 +17,7 @@ If Codex is available, run a lightweight design check on the diff: \`\`\`bash TMPERR_DRL=$(mktemp /tmp/codex-drl-XXXXXXXX) -codex exec "Review the git diff on this branch. Run 7 litmus checks (YES/NO each): ${litmusList} Flag any hard rejections: ${rejectionList} 5 most important design findings only. Reference file:line." -s read-only -c 'model_reasoning_effort="high"' --enable web_search_cached 2>"$TMPERR_DRL" +codex exec "Review the git diff on this branch. Run 7 litmus checks (YES/NO each): ${litmusList} Flag any hard rejections: ${rejectionList} 5 most important design findings only. Reference file:line." -C "$(git rev-parse --show-toplevel)" -s read-only -c 'model_reasoning_effort="high"' --enable web_search_cached 2>"$TMPERR_DRL" \`\`\` Use a 5-minute timeout (\`timeout: 300000\`). After the command completes, read stderr: @@ -467,7 +467,7 @@ If user chooses A, launch both voices simultaneously: 1. **Codex** (via Bash, \`model_reasoning_effort="medium"\`): \`\`\`bash TMPERR_SKETCH=$(mktemp /tmp/codex-sketch-XXXXXXXX) -codex exec "For this product approach, provide: a visual thesis (one sentence — mood, material, energy), a content plan (hero → support → detail → CTA), and 2 interaction ideas that change page feel. Apply beautiful defaults: composition-first, brand-first, cardless, poster not document. Be opinionated." -s read-only -c 'model_reasoning_effort="medium"' --enable web_search_cached 2>"$TMPERR_SKETCH" +codex exec "For this product approach, provide: a visual thesis (one sentence — mood, material, energy), a content plan (hero → support → detail → CTA), and 2 interaction ideas that change page feel. 
Apply beautiful defaults: composition-first, brand-first, cardless, poster not document. Be opinionated." -C "$(git rev-parse --show-toplevel)" -s read-only -c 'model_reasoning_effort="medium"' --enable web_search_cached 2>"$TMPERR_SKETCH" \`\`\` Use a 5-minute timeout (\`timeout: 300000\`). After completion: \`cat "$TMPERR_SKETCH" && rm -f "$TMPERR_SKETCH"\` @@ -636,7 +636,7 @@ which codex 2>/dev/null && echo "CODEX_AVAILABLE" || echo "CODEX_NOT_AVAILABLE" 1. **Codex design voice** (via Bash): \`\`\`bash TMPERR_DESIGN=$(mktemp /tmp/codex-design-XXXXXXXX) -codex exec "${escapedCodexPrompt}" -s read-only -c 'model_reasoning_effort="${reasoningEffort}"' --enable web_search_cached 2>"$TMPERR_DESIGN" +codex exec "${escapedCodexPrompt}" -C "$(git rev-parse --show-toplevel)" -s read-only -c 'model_reasoning_effort="${reasoningEffort}"' --enable web_search_cached 2>"$TMPERR_DESIGN" \`\`\` Use a 5-minute timeout (\`timeout: 300000\`). After the command completes, read stderr: \`\`\`bash diff --git a/scripts/resolvers/review.ts b/scripts/resolvers/review.ts index 1831e098..2b83f36d 100644 --- a/scripts/resolvers/review.ts +++ b/scripts/resolvers/review.ts @@ -286,7 +286,7 @@ Write the full prompt (context block + instructions) to this file. Use the mode- \`\`\`bash TMPERR_OH=$(mktemp /tmp/codex-oh-err-XXXXXXXX) -codex exec "$(cat "$CODEX_PROMPT_FILE")" -s read-only -c 'model_reasoning_effort="xhigh"' --enable web_search_cached 2>"$TMPERR_OH" +codex exec "$(cat "$CODEX_PROMPT_FILE")" -C "$(git rev-parse --show-toplevel)" -s read-only -c 'model_reasoning_effort="xhigh"' --enable web_search_cached 2>"$TMPERR_OH" \`\`\` Use a 5-minute timeout (\`timeout: 300000\`). After the command completes, read stderr: @@ -370,7 +370,7 @@ Claude's structured review already ran. Now add a **cross-model adversarial chal \`\`\`bash TMPERR_ADV=$(mktemp /tmp/codex-adv-XXXXXXXX) -codex exec "Review the changes on this branch against the base branch. Run git diff origin/ to see the diff. 
Your job is to find ways this code will fail in production. Think like an attacker and a chaos engineer. Find edge cases, race conditions, security holes, resource leaks, failure modes, and silent data corruption paths. Be adversarial. Be thorough. No compliments — just the problems." -s read-only -c 'model_reasoning_effort="xhigh"' --enable web_search_cached 2>"$TMPERR_ADV" +codex exec "Review the changes on this branch against the base branch. Run git diff origin/ to see the diff. Your job is to find ways this code will fail in production. Think like an attacker and a chaos engineer. Find edge cases, race conditions, security holes, resource leaks, failure modes, and silent data corruption paths. Be adversarial. Be thorough. No compliments — just the problems." -C "$(git rev-parse --show-toplevel)" -s read-only -c 'model_reasoning_effort="xhigh"' --enable web_search_cached 2>"$TMPERR_ADV" \`\`\` Set the Bash tool's \`timeout\` parameter to \`300000\` (5 minutes). Do NOT use the \`timeout\` shell command — it doesn't exist on macOS. After the command completes, read stderr: @@ -525,7 +525,7 @@ THE PLAN: \`\`\`bash TMPERR_PV=$(mktemp /tmp/codex-planreview-XXXXXXXX) -codex exec "" -s read-only -c 'model_reasoning_effort="xhigh"' --enable web_search_cached 2>"$TMPERR_PV" +codex exec "" -C "$(git rev-parse --show-toplevel)" -s read-only -c 'model_reasoning_effort="xhigh"' --enable web_search_cached 2>"$TMPERR_PV" \`\`\` Use a 5-minute timeout (\`timeout: 300000\`). 
After the command completes, read stderr: diff --git a/test/gen-skill-docs.test.ts b/test/gen-skill-docs.test.ts index d8a071a1..c26bb64b 100644 --- a/test/gen-skill-docs.test.ts +++ b/test/gen-skill-docs.test.ts @@ -152,6 +152,24 @@ describe('gen-skill-docs', () => { } }); + test('every Codex SKILL.md description stays under 900-char warning threshold', () => { + const WARN_THRESHOLD = 900; + const agentsDir = path.join(ROOT, '.agents', 'skills'); + if (!fs.existsSync(agentsDir)) return; + const violations: string[] = []; + for (const entry of fs.readdirSync(agentsDir, { withFileTypes: true })) { + if (!entry.isDirectory()) continue; + const skillMd = path.join(agentsDir, entry.name, 'SKILL.md'); + if (!fs.existsSync(skillMd)) continue; + const content = fs.readFileSync(skillMd, 'utf-8'); + const description = extractDescription(content); + if (description.length > WARN_THRESHOLD) { + violations.push(`${entry.name}: ${description.length} chars (limit ${MAX_SKILL_DESCRIPTION_LENGTH}, ${MAX_SKILL_DESCRIPTION_LENGTH - description.length} remaining)`); + } + } + expect(violations).toEqual([]); + }); + test('package.json version matches VERSION file', () => { const pkg = JSON.parse(fs.readFileSync(path.join(ROOT, 'package.json'), 'utf-8')); const version = fs.readFileSync(path.join(ROOT, 'VERSION'), 'utf-8').trim(); From 1bf888d75c6652e4a692a2a175970a9f218cb33f Mon Sep 17 00:00:00 2001 From: Garry Tan Date: Thu, 26 Mar 2026 07:21:15 -0600 Subject: [PATCH 3/9] feat: GitLab support for /retro, /ship, and /document-release (v0.11.20.0) (#508) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * feat: multi-platform BASE_BRANCH_DETECT (GitHub + GitLab + GHE + git-native) Update the shared BASE_BRANCH_DETECT resolver to support GitHub, GitLab, GitHub Enterprise, self-hosted GitLab, and a git-native fallback chain. Platform detection uses remote URL matching plus CLI auth status for custom domains. 
Add glab issue create alternative in test failure triage. Add 7 new test assertions covering GitLab CLI presence, git symbolic-ref fallback, and platform-specific output in retro and ship generated files. Co-Authored-By: Claude Opus 4.6 (1M context) * feat: GitLab support in /retro — use shared BASE_BRANCH_DETECT resolver Replace retro's custom gh-only default branch detection with the shared BASE_BRANCH_DETECT resolver (DRY — same as 10 other skills). Update PR/MR number extraction to match both GitHub #NNN and GitLab !NNN patterns. Remove hardcoded github.com URL from the personal card footer. Regenerate all SKILL.md files affected by the resolver update. Co-Authored-By: Claude Opus 4.6 (1M context) * feat: GitLab MR creation in /ship + /document-release Ship Step 1.5 now checks .gitlab-ci.yml for release workflows alongside GitHub Actions. Step 8 routes to glab mr create on GitLab repos with correct flag mapping (-b, -t, -d). Falls back to manual instructions when no CLI is available. Document-release now reads MR body via glab mr view -F json and updates via glab mr update on GitLab repos. Co-Authored-By: Claude Opus 4.6 (1M context) * chore: add P2 TODO for land-and-deploy GitLab support Track the remaining work to support GitLab in /land-and-deploy — MR merge, CI polling, and deploy workflow detection using glab equivalents. Co-Authored-By: Claude Opus 4.6 (1M context) * fix: adversarial review — GitLab gate, shell safety, MR prefix preservation Three fixes from adversarial review: 1. land-and-deploy: add GitLab gate after Step 0 — prevents detection/ execution mismatch where agent detects GitLab but all subsequent steps are GitHub-only 2. document-release: use heredoc for glab mr update body to avoid shell metacharacter mangling ($, backticks, !) in MR descriptions 3. retro: preserve original #/! 
prefix in PR/MR number extraction — GitLab !42 stays as !42, not incorrectly converted to #42 Co-Authored-By: Claude Opus 4.6 (1M context) * fix: resolve merge conflicts — deduplicate gen-skill-docs resolvers The merge from main created duplicate RESOLVERS records in gen-skill-docs.ts (inline functions shadowing the imported module versions). Removed the inline duplicates so the modular resolvers from scripts/resolvers/ are used. Also added missing E2E_TIERS entries for plan-completion/verification tests. * chore: bump version and changelog (v0.11.20.0) Co-Authored-By: Claude Opus 4.6 --------- Co-authored-by: Claude Opus 4.6 (1M context) --- CHANGELOG.md | 13 ++ SKILL.md | 119 ++++----------- TODOS.md | 12 ++ VERSION | 2 +- autoplan/SKILL.md | 160 +++++++++++---------- benchmark/SKILL.md | 119 ++++----------- browse/SKILL.md | 119 ++++----------- canary/SKILL.md | 162 ++++++++++----------- codex/SKILL.md | 160 +++++++++++---------- cso/SKILL.md | 122 ++++++---------- design-consultation/SKILL.md | 122 +++++++--------- design-review/SKILL.md | 124 +++++++--------- document-release/SKILL.md | 186 ++++++++++++------------ document-release/SKILL.md.tmpl | 24 +++- investigate/SKILL.md | 122 ++++++---------- land-and-deploy/SKILL.md | 162 +++++++++++---------- land-and-deploy/SKILL.md.tmpl | 2 + office-hours/SKILL.md | 124 +++++++--------- package.json | 2 +- plan-ceo-review/SKILL.md | 162 ++++++++++----------- plan-design-review/SKILL.md | 162 ++++++++++----------- plan-eng-review/SKILL.md | 122 +++++++--------- qa-only/SKILL.md | 120 +++++++--------- qa/SKILL.md | 160 +++++++++++---------- retro/SKILL.md | 173 +++++++++++----------- retro/SKILL.md.tmpl | 22 +-- review/SKILL.md | 179 ++++++++++++----------- scripts/gen-skill-docs.ts | 111 +++++--------- scripts/resolvers/preamble.ts | 24 ++-- scripts/resolvers/utility.ts | 40 ++++-- setup-browser-cookies/SKILL.md | 119 ++++----------- setup-deploy/SKILL.md | 122 ++++++---------- ship/SKILL.md | 254 
+++++++++++++++++++++------------ ship/SKILL.md.tmpl | 33 ++++- test/gen-skill-docs.test.ts | 33 +++++ test/helpers/touchfiles.ts | 5 + 36 files changed, 1697 insertions(+), 2000 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 56620db7..acbc55cd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,18 @@ # Changelog +## [0.11.20.0] - 2026-03-26 + +### Added + +- **GitLab support for `/retro` and `/ship`.** You can now run `/ship` on GitLab repos — it creates merge requests via `glab mr create` instead of `gh pr create`. `/retro` detects default branches on both platforms. All 11 skills using `BASE_BRANCH_DETECT` automatically get GitHub, GitLab, and git-native fallback detection. +- **GitHub Enterprise and self-hosted GitLab detection.** If the remote URL doesn't match `github.com` or `gitlab`, gstack checks `gh auth status` / `glab auth status` to detect authenticated platforms — no manual config needed. +- **`/document-release` works on GitLab.** After `/ship` creates a merge request, the auto-invoked `/document-release` reads and updates the MR body via `glab` instead of failing silently. +- **GitLab safety gate for `/land-and-deploy`.** Instead of silently failing on GitLab repos, `/land-and-deploy` now stops early with a clear message that GitLab merge support is not yet implemented. + +### Fixed + +- **Deduplicated gen-skill-docs resolvers.** The template generator had duplicate inline resolver functions that shadowed the modular versions, causing generated SKILL.md files to miss recent resolver updates. 
+ ## [0.11.19.0] - 2026-03-24 ### Fixed diff --git a/SKILL.md b/SKILL.md index f6d2831e..5f8d0f33 100644 --- a/SKILL.md +++ b/SKILL.md @@ -27,9 +27,11 @@ _SESSIONS=$(find ~/.gstack/sessions -mmin -120 -type f 2>/dev/null | wc -l | tr find ~/.gstack/sessions -mmin +120 -type f -delete 2>/dev/null || true _CONTRIB=$(~/.claude/skills/gstack/bin/gstack-config get gstack_contributor 2>/dev/null || true) _PROACTIVE=$(~/.claude/skills/gstack/bin/gstack-config get proactive 2>/dev/null || echo "true") +_PROACTIVE_PROMPTED=$([ -f ~/.gstack/.proactive-prompted ] && echo "yes" || echo "no") _BRANCH=$(git branch --show-current 2>/dev/null || echo "unknown") echo "BRANCH: $_BRANCH" echo "PROACTIVE: $_PROACTIVE" +echo "PROACTIVE_PROMPTED: $_PROACTIVE_PROMPTED" source <(~/.claude/skills/gstack/bin/gstack-repo-mode 2>/dev/null) || true REPO_MODE=${REPO_MODE:-unknown} echo "REPO_MODE: $REPO_MODE" @@ -47,8 +49,11 @@ echo '{"skill":"gstack","ts":"'$(date -u +%Y-%m-%dT%H:%M:%SZ)'","repo":"'$(basen for _PF in $(find ~/.gstack/analytics -maxdepth 1 -name '.pending-*' 2>/dev/null); do [ -f "$_PF" ] && ~/.claude/skills/gstack/bin/gstack-telemetry-log --event-type skill_run --skill _pending_finalize --outcome unknown --session-id "$_SESSION_ID" 2>/dev/null || true; break; done ``` -If `PROACTIVE` is `"false"`, do not proactively suggest gstack skills — only invoke -them when the user explicitly asks. The user opted out of proactive suggestions. +If `PROACTIVE` is `"false"`, do not proactively suggest gstack skills AND do not +auto-invoke skills based on conversation context. Only run skills the user explicitly +types (e.g., /qa, /ship). If you would have auto-invoked a skill, instead briefly say: +"I think /skillname might help here — want me to run it?" and wait for confirmation. +The user opted out of proactive behavior. 
If output shows `UPGRADE_AVAILABLE `: read `~/.claude/skills/gstack/gstack-upgrade/SKILL.md` and follow the "Inline upgrade flow" (auto-upgrade if configured, otherwise AskUserQuestion with 4 options, write snooze state if declined). If `JUST_UPGRADED `: tell user "Running gstack v{to} (just updated!)" and continue. @@ -97,112 +102,44 @@ touch ~/.gstack/.telemetry-prompted This only happens once. If `TEL_PROMPTED` is `yes`, skip this entirely. -## AskUserQuestion Format +If `PROACTIVE_PROMPTED` is `no` AND `TEL_PROMPTED` is `yes`: After telemetry is handled, +ask the user about proactive behavior. Use AskUserQuestion: -**ALWAYS follow this structure for every AskUserQuestion call:** -1. **Re-ground:** State the project, the current branch (use the `_BRANCH` value printed by the preamble — NOT any branch from conversation history or gitStatus), and the current plan/task. (1-2 sentences) -2. **Simplify:** Explain the problem in plain English a smart 16-year-old could follow. No raw function names, no internal jargon, no implementation details. Use concrete examples and analogies. Say what it DOES, not what it's called. -3. **Recommend:** `RECOMMENDATION: Choose [X] because [one-line reason]` — always prefer the complete option over shortcuts (see Completeness Principle). Include `Completeness: X/10` for each option. Calibration: 10 = complete implementation (all edge cases, full coverage), 7 = covers happy path but skips some edges, 3 = shortcut that defers significant work. If both options are 8+, pick the higher; if one is ≤5, flag it. -4. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` -5. **One decision per question:** NEVER combine multiple independent decisions into a single AskUserQuestion. Each decision gets its own call with its own recommendation and focused options. Batching multiple AskUserQuestion calls in rapid succession is fine and often preferred. 
Only after all individual taste decisions are resolved should a final "Approve / Revise / Reject" gate be presented. +> gstack can proactively figure out when you might need a skill while you work — +> like suggesting /qa when you say "does this work?" or /investigate when you hit +> a bug. We recommend keeping this on — it speeds up every part of your workflow. -Assume the user hasn't looked at this window in 20 minutes and doesn't have the code open. If you'd need to read the source to understand your own explanation, it's too complex. +Options: +- A) Keep it on (recommended) +- B) Turn it off — I'll type /commands myself -Per-skill instructions may add additional formatting rules on top of this baseline. +If A: run `~/.claude/skills/gstack/bin/gstack-config set proactive true` +If B: run `~/.claude/skills/gstack/bin/gstack-config set proactive false` -## Completeness Principle — Boil the Lake - -AI-assisted coding makes the marginal cost of completeness near-zero. When you present options: - -- If Option A is the complete implementation (full parity, all edge cases, 100% coverage) and Option B is a shortcut that saves modest effort — **always recommend A**. The delta between 80 lines and 150 lines is meaningless with CC+gstack. "Good enough" is the wrong instinct when "complete" costs minutes more. -- **Lake vs. ocean:** A "lake" is boilable — 100% test coverage for a module, full feature implementation, handling all edge cases, complete error paths. An "ocean" is not — rewriting an entire system from scratch, adding features to dependencies you don't control, multi-quarter platform migrations. Recommend boiling lakes. Flag oceans as out of scope. -- **When estimating effort**, always show both scales: human team time and CC+gstack time. 
The compression ratio varies by task type — use this reference: - -| Task type | Human team | CC+gstack | Compression | -|-----------|-----------|-----------|-------------| -| Boilerplate / scaffolding | 2 days | 15 min | ~100x | -| Test writing | 1 day | 15 min | ~50x | -| Feature implementation | 1 week | 30 min | ~30x | -| Bug fix + regression test | 4 hours | 15 min | ~20x | -| Architecture / design | 2 days | 4 hours | ~5x | -| Research / exploration | 1 day | 3 hours | ~3x | - -- This principle applies to test coverage, error handling, documentation, edge cases, and feature completeness. Don't skip the last 10% to "save time" — with AI, that 10% costs seconds. - -**Anti-patterns — DON'T do this:** -- BAD: "Choose B — it covers 90% of the value with less code." (If A is only 70 lines more, choose A.) -- BAD: "We can skip edge case handling to save time." (Edge case handling costs minutes with CC.) -- BAD: "Let's defer test coverage to a follow-up PR." (Tests are the cheapest lake to boil.) -- BAD: Quoting only human-team effort: "This would take 2 weeks." (Say: "2 weeks human / ~1 hour CC.") - -## Repo Ownership Mode — See Something, Say Something - -`REPO_MODE` from the preamble tells you who owns issues in this repo: - -- **`solo`** — One person does 80%+ of the work. They own everything. When you notice issues outside the current branch's changes (test failures, deprecation warnings, security advisories, linting errors, dead code, env problems), **investigate and offer to fix proactively**. The solo dev is the only person who will fix it. Default to action. -- **`collaborative`** — Multiple active contributors. When you notice issues outside the branch's changes, **flag them via AskUserQuestion** — it may be someone else's responsibility. Default to asking, not fixing. -- **`unknown`** — Treat as collaborative (safer default — ask before fixing). 
- -**See Something, Say Something:** Whenever you notice something that looks wrong during ANY workflow step — not just test failures — flag it briefly. One sentence: what you noticed and its impact. In solo mode, follow up with "Want me to fix it?" In collaborative mode, just flag it and move on. - -Never let a noticed issue silently pass. The whole point is proactive communication. - -## Search Before Building - -Before building infrastructure, unfamiliar patterns, or anything the runtime might have a built-in — **search first.** Read `~/.claude/skills/gstack/ETHOS.md` for the full philosophy. - -**Three layers of knowledge:** -- **Layer 1** (tried and true — in distribution). Don't reinvent the wheel. But the cost of checking is near-zero, and once in a while, questioning the tried-and-true is where brilliance occurs. -- **Layer 2** (new and popular — search for these). But scrutinize: humans are subject to mania. Search results are inputs to your thinking, not answers. -- **Layer 3** (first principles — prize these above all). Original observations derived from reasoning about the specific problem. The most valuable of all. - -**Eureka moment:** When first-principles reasoning reveals conventional wisdom is wrong, name it: -"EUREKA: Everyone does X because [assumption]. But [evidence] shows this is wrong. Y is better because [reasoning]." - -Log eureka moments: +Always run: ```bash -jq -n --arg ts "$(date -u +%Y-%m-%dT%H:%M:%SZ)" --arg skill "SKILL_NAME" --arg branch "$(git branch --show-current 2>/dev/null)" --arg insight "ONE_LINE_SUMMARY" '{ts:$ts,skill:$skill,branch:$branch,insight:$insight}' >> ~/.gstack/analytics/eureka.jsonl 2>/dev/null || true +touch ~/.gstack/.proactive-prompted ``` -Replace SKILL_NAME and ONE_LINE_SUMMARY. Runs inline — don't stop the workflow. -**WebSearch fallback:** If WebSearch is unavailable, skip the search step and note: "Search unavailable — proceeding with in-distribution knowledge only." +This only happens once. 
If `PROACTIVE_PROMPTED` is `yes`, skip this entirely. ## Contributor Mode -If `_CONTRIB` is `true`: you are in **contributor mode**. You're a gstack user who also helps make it better. +If `_CONTRIB` is `true`: you are in **contributor mode**. At the end of each major workflow step, rate your gstack experience 0-10. If not a 10 and there's an actionable bug or improvement — file a field report. -**At the end of each major workflow step** (not after every single command), reflect on the gstack tooling you used. Rate your experience 0 to 10. If it wasn't a 10, think about why. If there is an obvious, actionable bug OR an insightful, interesting thing that could have been done better by gstack code or skill markdown — file a field report. Maybe our contributor will help make us better! - -**Calibration — this is the bar:** For example, `$B js "await fetch(...)"` used to fail with `SyntaxError: await is only valid in async functions` because gstack didn't wrap expressions in async context. Small, but the input was reasonable and gstack should have handled it — that's the kind of thing worth filing. Things less consequential than this, ignore. - -**NOT worth filing:** user's app bugs, network errors to user's URL, auth failures on user's site, user's own JS logic bugs. - -**To file:** write `~/.gstack/contributor-logs/{slug}.md` with **all sections below** (do not truncate — include every section through the Date/Version footer): +**File only:** gstack tooling bugs where the input was reasonable but gstack failed. **Skip:** user app bugs, network errors, auth failures on user's site. 
+**To file:** write `~/.gstack/contributor-logs/{slug}.md`: ``` # {Title} - -Hey gstack team — ran into this while using /{skill-name}: - -**What I was trying to do:** {what the user/agent was attempting} -**What happened instead:** {what actually happened} -**My rating:** {0-10} — {one sentence on why it wasn't a 10} - -## Steps to reproduce +**What I tried:** {action} | **What happened:** {result} | **Rating:** {0-10} +## Repro 1. {step} - -## Raw output -``` -{paste the actual error or unexpected output here} -``` - ## What would make this a 10 -{one sentence: what gstack should have done differently} - -**Date:** {YYYY-MM-DD} | **Version:** {gstack version} | **Skill:** /{skill} +{one sentence} +**Date:** {YYYY-MM-DD} | **Version:** {version} | **Skill:** /{skill} ``` - -Slug: lowercase, hyphens, max 60 chars (e.g. `browse-js-no-await`). Skip if file already exists. Max 3 reports per session. File inline and continue — don't stop the workflow. Tell user: "Filed gstack field report: {title}" +Slug: lowercase hyphens, max 60 chars. Skip if exists. Max 3/session. File inline, don't stop. ## Completion Status Protocol diff --git a/TODOS.md b/TODOS.md index 1c4b88ed..3ee995b6 100644 --- a/TODOS.md +++ b/TODOS.md @@ -168,6 +168,18 @@ Linux cookie import shipped in v0.11.11.0 (Wave 3). Supports Chrome, Chromium, B ## Ship +### GitLab support for /land-and-deploy + +**What:** Add GitLab MR merge + CI polling support to `/land-and-deploy` skill. Currently uses `gh pr view`, `gh pr checks`, `gh pr merge`, and `gh run list/view` in 15+ places — each needs a GitLab conditional path using `glab ci status`, `glab mr merge`, etc. + +**Why:** Without this, GitLab users can `/ship` (create MR) but can't `/land-and-deploy` (merge + verify). Completes the GitLab story end-to-end. + +**Context:** `/retro`, `/ship`, and `/document-release` now support GitLab via the multi-platform `BASE_BRANCH_DETECT` resolver. 
`/land-and-deploy` has deeper GitHub-specific semantics (merge queues, required checks via `gh pr checks`, deploy workflow polling) that have different shapes on GitLab. The `glab` CLI (v1.90.0) supports `glab mr merge`, `glab ci status`, `glab ci view` but with different output formats and no merge queue concept. + +**Effort:** L +**Priority:** P2 +**Depends on:** None (BASE_BRANCH_DETECT multi-platform resolver is already done) + ### Ship log — persistent record of /ship runs **What:** Append structured JSON entry to `.gstack/ship-log.json` at end of every /ship run (version, date, branch, PR URL, review findings, Greptile stats, todos completed, test results). diff --git a/VERSION b/VERSION index d20322e5..508c698a 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -0.11.19.0 +0.11.20.0 diff --git a/autoplan/SKILL.md b/autoplan/SKILL.md index 14874900..d69fc285 100644 --- a/autoplan/SKILL.md +++ b/autoplan/SKILL.md @@ -36,9 +36,11 @@ _SESSIONS=$(find ~/.gstack/sessions -mmin -120 -type f 2>/dev/null | wc -l | tr find ~/.gstack/sessions -mmin +120 -type f -delete 2>/dev/null || true _CONTRIB=$(~/.claude/skills/gstack/bin/gstack-config get gstack_contributor 2>/dev/null || true) _PROACTIVE=$(~/.claude/skills/gstack/bin/gstack-config get proactive 2>/dev/null || echo "true") +_PROACTIVE_PROMPTED=$([ -f ~/.gstack/.proactive-prompted ] && echo "yes" || echo "no") _BRANCH=$(git branch --show-current 2>/dev/null || echo "unknown") echo "BRANCH: $_BRANCH" echo "PROACTIVE: $_PROACTIVE" +echo "PROACTIVE_PROMPTED: $_PROACTIVE_PROMPTED" source <(~/.claude/skills/gstack/bin/gstack-repo-mode 2>/dev/null) || true REPO_MODE=${REPO_MODE:-unknown} echo "REPO_MODE: $REPO_MODE" @@ -56,8 +58,11 @@ echo '{"skill":"autoplan","ts":"'$(date -u +%Y-%m-%dT%H:%M:%SZ)'","repo":"'$(bas for _PF in $(find ~/.gstack/analytics -maxdepth 1 -name '.pending-*' 2>/dev/null); do [ -f "$_PF" ] && ~/.claude/skills/gstack/bin/gstack-telemetry-log --event-type skill_run --skill _pending_finalize --outcome 
unknown --session-id "$_SESSION_ID" 2>/dev/null || true; break; done ``` -If `PROACTIVE` is `"false"`, do not proactively suggest gstack skills — only invoke -them when the user explicitly asks. The user opted out of proactive suggestions. +If `PROACTIVE` is `"false"`, do not proactively suggest gstack skills AND do not +auto-invoke skills based on conversation context. Only run skills the user explicitly +types (e.g., /qa, /ship). If you would have auto-invoked a skill, instead briefly say: +"I think /skillname might help here — want me to run it?" and wait for confirmation. +The user opted out of proactive behavior. If output shows `UPGRADE_AVAILABLE `: read `~/.claude/skills/gstack/gstack-upgrade/SKILL.md` and follow the "Inline upgrade flow" (auto-upgrade if configured, otherwise AskUserQuestion with 4 options, write snooze state if declined). If `JUST_UPGRADED `: tell user "Running gstack v{to} (just updated!)" and continue. @@ -106,6 +111,27 @@ touch ~/.gstack/.telemetry-prompted This only happens once. If `TEL_PROMPTED` is `yes`, skip this entirely. +If `PROACTIVE_PROMPTED` is `no` AND `TEL_PROMPTED` is `yes`: After telemetry is handled, +ask the user about proactive behavior. Use AskUserQuestion: + +> gstack can proactively figure out when you might need a skill while you work — +> like suggesting /qa when you say "does this work?" or /investigate when you hit +> a bug. We recommend keeping this on — it speeds up every part of your workflow. + +Options: +- A) Keep it on (recommended) +- B) Turn it off — I'll type /commands myself + +If A: run `~/.claude/skills/gstack/bin/gstack-config set proactive true` +If B: run `~/.claude/skills/gstack/bin/gstack-config set proactive false` + +Always run: +```bash +touch ~/.gstack/.proactive-prompted +``` + +This only happens once. If `PROACTIVE_PROMPTED` is `yes`, skip this entirely. + ## AskUserQuestion Format **ALWAYS follow this structure for every AskUserQuestion call:** @@ -113,7 +139,6 @@ This only happens once. 
If `TEL_PROMPTED` is `yes`, skip this entirely. 2. **Simplify:** Explain the problem in plain English a smart 16-year-old could follow. No raw function names, no internal jargon, no implementation details. Use concrete examples and analogies. Say what it DOES, not what it's called. 3. **Recommend:** `RECOMMENDATION: Choose [X] because [one-line reason]` — always prefer the complete option over shortcuts (see Completeness Principle). Include `Completeness: X/10` for each option. Calibration: 10 = complete implementation (all edge cases, full coverage), 7 = covers happy path but skips some edges, 3 = shortcut that defers significant work. If both options are 8+, pick the higher; if one is ≤5, flag it. 4. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` -5. **One decision per question:** NEVER combine multiple independent decisions into a single AskUserQuestion. Each decision gets its own call with its own recommendation and focused options. Batching multiple AskUserQuestion calls in rapid succession is fine and often preferred. Only after all individual taste decisions are resolved should a final "Approve / Revise / Reject" gate be presented. Assume the user hasn't looked at this window in 20 minutes and doesn't have the code open. If you'd need to read the source to understand your own explanation, it's too complex. @@ -121,97 +146,54 @@ Per-skill instructions may add additional formatting rules on top of this baseli ## Completeness Principle — Boil the Lake -AI-assisted coding makes the marginal cost of completeness near-zero. When you present options: +AI makes completeness near-free. Always recommend the complete option over shortcuts — the delta is minutes with CC+gstack. A "lake" (100% coverage, all edge cases) is boilable; an "ocean" (full rewrite, multi-quarter migration) is not. Boil lakes, flag oceans. 
-- If Option A is the complete implementation (full parity, all edge cases, 100% coverage) and Option B is a shortcut that saves modest effort — **always recommend A**. The delta between 80 lines and 150 lines is meaningless with CC+gstack. "Good enough" is the wrong instinct when "complete" costs minutes more. -- **Lake vs. ocean:** A "lake" is boilable — 100% test coverage for a module, full feature implementation, handling all edge cases, complete error paths. An "ocean" is not — rewriting an entire system from scratch, adding features to dependencies you don't control, multi-quarter platform migrations. Recommend boiling lakes. Flag oceans as out of scope. -- **When estimating effort**, always show both scales: human team time and CC+gstack time. The compression ratio varies by task type — use this reference: +**Effort reference** — always show both scales: | Task type | Human team | CC+gstack | Compression | |-----------|-----------|-----------|-------------| -| Boilerplate / scaffolding | 2 days | 15 min | ~100x | -| Test writing | 1 day | 15 min | ~50x | -| Feature implementation | 1 week | 30 min | ~30x | -| Bug fix + regression test | 4 hours | 15 min | ~20x | -| Architecture / design | 2 days | 4 hours | ~5x | -| Research / exploration | 1 day | 3 hours | ~3x | +| Boilerplate | 2 days | 15 min | ~100x | +| Tests | 1 day | 15 min | ~50x | +| Feature | 1 week | 30 min | ~30x | +| Bug fix | 4 hours | 15 min | ~20x | -- This principle applies to test coverage, error handling, documentation, edge cases, and feature completeness. Don't skip the last 10% to "save time" — with AI, that 10% costs seconds. +Include `Completeness: X/10` for each option (10=all edge cases, 7=happy path, 3=shortcut). -**Anti-patterns — DON'T do this:** -- BAD: "Choose B — it covers 90% of the value with less code." (If A is only 70 lines more, choose A.) -- BAD: "We can skip edge case handling to save time." (Edge case handling costs minutes with CC.) 
-- BAD: "Let's defer test coverage to a follow-up PR." (Tests are the cheapest lake to boil.) -- BAD: Quoting only human-team effort: "This would take 2 weeks." (Say: "2 weeks human / ~1 hour CC.") +## Repo Ownership — See Something, Say Something -## Repo Ownership Mode — See Something, Say Something +`REPO_MODE` controls how to handle issues outside your branch: +- **`solo`** — You own everything. Investigate and offer to fix proactively. +- **`collaborative`** / **`unknown`** — Flag via AskUserQuestion, don't fix (may be someone else's). -`REPO_MODE` from the preamble tells you who owns issues in this repo: - -- **`solo`** — One person does 80%+ of the work. They own everything. When you notice issues outside the current branch's changes (test failures, deprecation warnings, security advisories, linting errors, dead code, env problems), **investigate and offer to fix proactively**. The solo dev is the only person who will fix it. Default to action. -- **`collaborative`** — Multiple active contributors. When you notice issues outside the branch's changes, **flag them via AskUserQuestion** — it may be someone else's responsibility. Default to asking, not fixing. -- **`unknown`** — Treat as collaborative (safer default — ask before fixing). - -**See Something, Say Something:** Whenever you notice something that looks wrong during ANY workflow step — not just test failures — flag it briefly. One sentence: what you noticed and its impact. In solo mode, follow up with "Want me to fix it?" In collaborative mode, just flag it and move on. - -Never let a noticed issue silently pass. The whole point is proactive communication. +Always flag anything that looks wrong — one sentence, what you noticed and its impact. ## Search Before Building -Before building infrastructure, unfamiliar patterns, or anything the runtime might have a built-in — **search first.** Read `~/.claude/skills/gstack/ETHOS.md` for the full philosophy. 
+Before building anything unfamiliar, **search first.** See `~/.claude/skills/gstack/ETHOS.md`. +- **Layer 1** (tried and true) — don't reinvent. **Layer 2** (new and popular) — scrutinize. **Layer 3** (first principles) — prize above all. -**Three layers of knowledge:** -- **Layer 1** (tried and true — in distribution). Don't reinvent the wheel. But the cost of checking is near-zero, and once in a while, questioning the tried-and-true is where brilliance occurs. -- **Layer 2** (new and popular — search for these). But scrutinize: humans are subject to mania. Search results are inputs to your thinking, not answers. -- **Layer 3** (first principles — prize these above all). Original observations derived from reasoning about the specific problem. The most valuable of all. - -**Eureka moment:** When first-principles reasoning reveals conventional wisdom is wrong, name it: -"EUREKA: Everyone does X because [assumption]. But [evidence] shows this is wrong. Y is better because [reasoning]." - -Log eureka moments: +**Eureka:** When first-principles reasoning contradicts conventional wisdom, name it and log: ```bash jq -n --arg ts "$(date -u +%Y-%m-%dT%H:%M:%SZ)" --arg skill "SKILL_NAME" --arg branch "$(git branch --show-current 2>/dev/null)" --arg insight "ONE_LINE_SUMMARY" '{ts:$ts,skill:$skill,branch:$branch,insight:$insight}' >> ~/.gstack/analytics/eureka.jsonl 2>/dev/null || true ``` -Replace SKILL_NAME and ONE_LINE_SUMMARY. Runs inline — don't stop the workflow. - -**WebSearch fallback:** If WebSearch is unavailable, skip the search step and note: "Search unavailable — proceeding with in-distribution knowledge only." ## Contributor Mode -If `_CONTRIB` is `true`: you are in **contributor mode**. You're a gstack user who also helps make it better. +If `_CONTRIB` is `true`: you are in **contributor mode**. At the end of each major workflow step, rate your gstack experience 0-10. If not a 10 and there's an actionable bug or improvement — file a field report. 
-**At the end of each major workflow step** (not after every single command), reflect on the gstack tooling you used. Rate your experience 0 to 10. If it wasn't a 10, think about why. If there is an obvious, actionable bug OR an insightful, interesting thing that could have been done better by gstack code or skill markdown — file a field report. Maybe our contributor will help make us better! - -**Calibration — this is the bar:** For example, `$B js "await fetch(...)"` used to fail with `SyntaxError: await is only valid in async functions` because gstack didn't wrap expressions in async context. Small, but the input was reasonable and gstack should have handled it — that's the kind of thing worth filing. Things less consequential than this, ignore. - -**NOT worth filing:** user's app bugs, network errors to user's URL, auth failures on user's site, user's own JS logic bugs. - -**To file:** write `~/.gstack/contributor-logs/{slug}.md` with **all sections below** (do not truncate — include every section through the Date/Version footer): +**File only:** gstack tooling bugs where the input was reasonable but gstack failed. **Skip:** user app bugs, network errors, auth failures on user's site. +**To file:** write `~/.gstack/contributor-logs/{slug}.md`: ``` # {Title} - -Hey gstack team — ran into this while using /{skill-name}: - -**What I was trying to do:** {what the user/agent was attempting} -**What happened instead:** {what actually happened} -**My rating:** {0-10} — {one sentence on why it wasn't a 10} - -## Steps to reproduce +**What I tried:** {action} | **What happened:** {result} | **Rating:** {0-10} +## Repro 1. 
{step} - -## Raw output -``` -{paste the actual error or unexpected output here} -``` - ## What would make this a 10 -{one sentence: what gstack should have done differently} - -**Date:** {YYYY-MM-DD} | **Version:** {gstack version} | **Skill:** /{skill} +{one sentence} +**Date:** {YYYY-MM-DD} | **Version:** {version} | **Skill:** /{skill} ``` - -Slug: lowercase, hyphens, max 60 chars (e.g. `browse-js-no-await`). Skip if file already exists. Max 3 reports per session. File inline and continue — don't stop the workflow. Tell user: "Filed gstack field report: {title}" +Slug: lowercase hyphens, max 60 chars. Skip if exists. Max 3/session. File inline, don't stop. ## Completion Status Protocol @@ -302,22 +284,42 @@ Then write a `## GSTACK REVIEW REPORT` section to the end of the plan file: file you are allowed to edit in plan mode. The plan file review report is part of the plan's living status. -## Step 0: Detect base branch +## Step 0: Detect platform and base branch -Determine which branch this PR targets. Use the result as "the base branch" in all subsequent steps. +First, detect the git hosting platform from the remote URL: -1. Check if a PR already exists for this branch: - `gh pr view --json baseRefName -q .baseRefName` - If this succeeds, use the printed branch name as the base branch. +```bash +git remote get-url origin 2>/dev/null +``` -2. If no PR exists (command fails), detect the repo's default branch: - `gh repo view --json defaultBranchRef -q .defaultBranchRef.name` +- If the URL contains "github.com" → platform is **GitHub** +- If the URL contains "gitlab" → platform is **GitLab** +- Otherwise, check CLI availability: + - `gh auth status 2>/dev/null` succeeds → platform is **GitHub** (covers GitHub Enterprise) + - `glab auth status 2>/dev/null` succeeds → platform is **GitLab** (covers self-hosted) + - Neither → **unknown** (use git-native commands only) -3. If both commands fail, fall back to `main`. 
+Determine which branch this PR/MR targets, or the repo's default branch if no +PR/MR exists. Use the result as "the base branch" in all subsequent steps. + +**If GitHub:** +1. `gh pr view --json baseRefName -q .baseRefName` — if succeeds, use it +2. `gh repo view --json defaultBranchRef -q .defaultBranchRef.name` — if succeeds, use it + +**If GitLab:** +1. `glab mr view -F json 2>/dev/null` and extract the `target_branch` field — if succeeds, use it +2. `glab repo view -F json 2>/dev/null` and extract the `default_branch` field — if succeeds, use it + +**Git-native fallback (if unknown platform, or CLI commands fail):** +1. `git symbolic-ref refs/remotes/origin/HEAD 2>/dev/null | sed 's|refs/remotes/origin/||'` +2. If that fails: `git rev-parse --verify origin/main 2>/dev/null` → use `main` +3. If that fails: `git rev-parse --verify origin/master 2>/dev/null` → use `master` + +If all fail, fall back to `main`. Print the detected base branch name. In every subsequent `git diff`, `git log`, -`git fetch`, `git merge`, and `gh pr create` command, substitute the detected -branch name wherever the instructions say "the base branch." +`git fetch`, `git merge`, and PR/MR creation command, substitute the detected +branch name wherever the instructions say "the base branch" or `<default>`.
--- diff --git a/benchmark/SKILL.md b/benchmark/SKILL.md index d9138a03..d6d65ae2 100644 --- a/benchmark/SKILL.md +++ b/benchmark/SKILL.md @@ -29,9 +29,11 @@ _SESSIONS=$(find ~/.gstack/sessions -mmin -120 -type f 2>/dev/null | wc -l | tr find ~/.gstack/sessions -mmin +120 -type f -delete 2>/dev/null || true _CONTRIB=$(~/.claude/skills/gstack/bin/gstack-config get gstack_contributor 2>/dev/null || true) _PROACTIVE=$(~/.claude/skills/gstack/bin/gstack-config get proactive 2>/dev/null || echo "true") +_PROACTIVE_PROMPTED=$([ -f ~/.gstack/.proactive-prompted ] && echo "yes" || echo "no") _BRANCH=$(git branch --show-current 2>/dev/null || echo "unknown") echo "BRANCH: $_BRANCH" echo "PROACTIVE: $_PROACTIVE" +echo "PROACTIVE_PROMPTED: $_PROACTIVE_PROMPTED" source <(~/.claude/skills/gstack/bin/gstack-repo-mode 2>/dev/null) || true REPO_MODE=${REPO_MODE:-unknown} echo "REPO_MODE: $REPO_MODE" @@ -49,8 +51,11 @@ echo '{"skill":"benchmark","ts":"'$(date -u +%Y-%m-%dT%H:%M:%SZ)'","repo":"'$(ba for _PF in $(find ~/.gstack/analytics -maxdepth 1 -name '.pending-*' 2>/dev/null); do [ -f "$_PF" ] && ~/.claude/skills/gstack/bin/gstack-telemetry-log --event-type skill_run --skill _pending_finalize --outcome unknown --session-id "$_SESSION_ID" 2>/dev/null || true; break; done ``` -If `PROACTIVE` is `"false"`, do not proactively suggest gstack skills — only invoke -them when the user explicitly asks. The user opted out of proactive suggestions. +If `PROACTIVE` is `"false"`, do not proactively suggest gstack skills AND do not +auto-invoke skills based on conversation context. Only run skills the user explicitly +types (e.g., /qa, /ship). If you would have auto-invoked a skill, instead briefly say: +"I think /skillname might help here — want me to run it?" and wait for confirmation. +The user opted out of proactive behavior. 
If output shows `UPGRADE_AVAILABLE `: read `~/.claude/skills/gstack/gstack-upgrade/SKILL.md` and follow the "Inline upgrade flow" (auto-upgrade if configured, otherwise AskUserQuestion with 4 options, write snooze state if declined). If `JUST_UPGRADED `: tell user "Running gstack v{to} (just updated!)" and continue. @@ -99,112 +104,44 @@ touch ~/.gstack/.telemetry-prompted This only happens once. If `TEL_PROMPTED` is `yes`, skip this entirely. -## AskUserQuestion Format +If `PROACTIVE_PROMPTED` is `no` AND `TEL_PROMPTED` is `yes`: After telemetry is handled, +ask the user about proactive behavior. Use AskUserQuestion: -**ALWAYS follow this structure for every AskUserQuestion call:** -1. **Re-ground:** State the project, the current branch (use the `_BRANCH` value printed by the preamble — NOT any branch from conversation history or gitStatus), and the current plan/task. (1-2 sentences) -2. **Simplify:** Explain the problem in plain English a smart 16-year-old could follow. No raw function names, no internal jargon, no implementation details. Use concrete examples and analogies. Say what it DOES, not what it's called. -3. **Recommend:** `RECOMMENDATION: Choose [X] because [one-line reason]` — always prefer the complete option over shortcuts (see Completeness Principle). Include `Completeness: X/10` for each option. Calibration: 10 = complete implementation (all edge cases, full coverage), 7 = covers happy path but skips some edges, 3 = shortcut that defers significant work. If both options are 8+, pick the higher; if one is ≤5, flag it. -4. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` -5. **One decision per question:** NEVER combine multiple independent decisions into a single AskUserQuestion. Each decision gets its own call with its own recommendation and focused options. Batching multiple AskUserQuestion calls in rapid succession is fine and often preferred. 
Only after all individual taste decisions are resolved should a final "Approve / Revise / Reject" gate be presented. +> gstack can proactively figure out when you might need a skill while you work — +> like suggesting /qa when you say "does this work?" or /investigate when you hit +> a bug. We recommend keeping this on — it speeds up every part of your workflow. -Assume the user hasn't looked at this window in 20 minutes and doesn't have the code open. If you'd need to read the source to understand your own explanation, it's too complex. +Options: +- A) Keep it on (recommended) +- B) Turn it off — I'll type /commands myself -Per-skill instructions may add additional formatting rules on top of this baseline. +If A: run `~/.claude/skills/gstack/bin/gstack-config set proactive true` +If B: run `~/.claude/skills/gstack/bin/gstack-config set proactive false` -## Completeness Principle — Boil the Lake - -AI-assisted coding makes the marginal cost of completeness near-zero. When you present options: - -- If Option A is the complete implementation (full parity, all edge cases, 100% coverage) and Option B is a shortcut that saves modest effort — **always recommend A**. The delta between 80 lines and 150 lines is meaningless with CC+gstack. "Good enough" is the wrong instinct when "complete" costs minutes more. -- **Lake vs. ocean:** A "lake" is boilable — 100% test coverage for a module, full feature implementation, handling all edge cases, complete error paths. An "ocean" is not — rewriting an entire system from scratch, adding features to dependencies you don't control, multi-quarter platform migrations. Recommend boiling lakes. Flag oceans as out of scope. -- **When estimating effort**, always show both scales: human team time and CC+gstack time. 
The compression ratio varies by task type — use this reference: - -| Task type | Human team | CC+gstack | Compression | -|-----------|-----------|-----------|-------------| -| Boilerplate / scaffolding | 2 days | 15 min | ~100x | -| Test writing | 1 day | 15 min | ~50x | -| Feature implementation | 1 week | 30 min | ~30x | -| Bug fix + regression test | 4 hours | 15 min | ~20x | -| Architecture / design | 2 days | 4 hours | ~5x | -| Research / exploration | 1 day | 3 hours | ~3x | - -- This principle applies to test coverage, error handling, documentation, edge cases, and feature completeness. Don't skip the last 10% to "save time" — with AI, that 10% costs seconds. - -**Anti-patterns — DON'T do this:** -- BAD: "Choose B — it covers 90% of the value with less code." (If A is only 70 lines more, choose A.) -- BAD: "We can skip edge case handling to save time." (Edge case handling costs minutes with CC.) -- BAD: "Let's defer test coverage to a follow-up PR." (Tests are the cheapest lake to boil.) -- BAD: Quoting only human-team effort: "This would take 2 weeks." (Say: "2 weeks human / ~1 hour CC.") - -## Repo Ownership Mode — See Something, Say Something - -`REPO_MODE` from the preamble tells you who owns issues in this repo: - -- **`solo`** — One person does 80%+ of the work. They own everything. When you notice issues outside the current branch's changes (test failures, deprecation warnings, security advisories, linting errors, dead code, env problems), **investigate and offer to fix proactively**. The solo dev is the only person who will fix it. Default to action. -- **`collaborative`** — Multiple active contributors. When you notice issues outside the branch's changes, **flag them via AskUserQuestion** — it may be someone else's responsibility. Default to asking, not fixing. -- **`unknown`** — Treat as collaborative (safer default — ask before fixing). 
- -**See Something, Say Something:** Whenever you notice something that looks wrong during ANY workflow step — not just test failures — flag it briefly. One sentence: what you noticed and its impact. In solo mode, follow up with "Want me to fix it?" In collaborative mode, just flag it and move on. - -Never let a noticed issue silently pass. The whole point is proactive communication. - -## Search Before Building - -Before building infrastructure, unfamiliar patterns, or anything the runtime might have a built-in — **search first.** Read `~/.claude/skills/gstack/ETHOS.md` for the full philosophy. - -**Three layers of knowledge:** -- **Layer 1** (tried and true — in distribution). Don't reinvent the wheel. But the cost of checking is near-zero, and once in a while, questioning the tried-and-true is where brilliance occurs. -- **Layer 2** (new and popular — search for these). But scrutinize: humans are subject to mania. Search results are inputs to your thinking, not answers. -- **Layer 3** (first principles — prize these above all). Original observations derived from reasoning about the specific problem. The most valuable of all. - -**Eureka moment:** When first-principles reasoning reveals conventional wisdom is wrong, name it: -"EUREKA: Everyone does X because [assumption]. But [evidence] shows this is wrong. Y is better because [reasoning]." - -Log eureka moments: +Always run: ```bash -jq -n --arg ts "$(date -u +%Y-%m-%dT%H:%M:%SZ)" --arg skill "SKILL_NAME" --arg branch "$(git branch --show-current 2>/dev/null)" --arg insight "ONE_LINE_SUMMARY" '{ts:$ts,skill:$skill,branch:$branch,insight:$insight}' >> ~/.gstack/analytics/eureka.jsonl 2>/dev/null || true +touch ~/.gstack/.proactive-prompted ``` -Replace SKILL_NAME and ONE_LINE_SUMMARY. Runs inline — don't stop the workflow. -**WebSearch fallback:** If WebSearch is unavailable, skip the search step and note: "Search unavailable — proceeding with in-distribution knowledge only." +This only happens once. 
If `PROACTIVE_PROMPTED` is `yes`, skip this entirely. ## Contributor Mode -If `_CONTRIB` is `true`: you are in **contributor mode**. You're a gstack user who also helps make it better. +If `_CONTRIB` is `true`: you are in **contributor mode**. At the end of each major workflow step, rate your gstack experience 0-10. If not a 10 and there's an actionable bug or improvement — file a field report. -**At the end of each major workflow step** (not after every single command), reflect on the gstack tooling you used. Rate your experience 0 to 10. If it wasn't a 10, think about why. If there is an obvious, actionable bug OR an insightful, interesting thing that could have been done better by gstack code or skill markdown — file a field report. Maybe our contributor will help make us better! - -**Calibration — this is the bar:** For example, `$B js "await fetch(...)"` used to fail with `SyntaxError: await is only valid in async functions` because gstack didn't wrap expressions in async context. Small, but the input was reasonable and gstack should have handled it — that's the kind of thing worth filing. Things less consequential than this, ignore. - -**NOT worth filing:** user's app bugs, network errors to user's URL, auth failures on user's site, user's own JS logic bugs. - -**To file:** write `~/.gstack/contributor-logs/{slug}.md` with **all sections below** (do not truncate — include every section through the Date/Version footer): +**File only:** gstack tooling bugs where the input was reasonable but gstack failed. **Skip:** user app bugs, network errors, auth failures on user's site. 
+**To file:** write `~/.gstack/contributor-logs/{slug}.md`: ``` # {Title} - -Hey gstack team — ran into this while using /{skill-name}: - -**What I was trying to do:** {what the user/agent was attempting} -**What happened instead:** {what actually happened} -**My rating:** {0-10} — {one sentence on why it wasn't a 10} - -## Steps to reproduce +**What I tried:** {action} | **What happened:** {result} | **Rating:** {0-10} +## Repro 1. {step} - -## Raw output -``` -{paste the actual error or unexpected output here} -``` - ## What would make this a 10 -{one sentence: what gstack should have done differently} - -**Date:** {YYYY-MM-DD} | **Version:** {gstack version} | **Skill:** /{skill} +{one sentence} +**Date:** {YYYY-MM-DD} | **Version:** {version} | **Skill:** /{skill} ``` - -Slug: lowercase, hyphens, max 60 chars (e.g. `browse-js-no-await`). Skip if file already exists. Max 3 reports per session. File inline and continue — don't stop the workflow. Tell user: "Filed gstack field report: {title}" +Slug: lowercase hyphens, max 60 chars. Skip if exists. Max 3/session. File inline, don't stop. 
## Completion Status Protocol diff --git a/browse/SKILL.md b/browse/SKILL.md index 91845a99..c52dcaa5 100644 --- a/browse/SKILL.md +++ b/browse/SKILL.md @@ -29,9 +29,11 @@ _SESSIONS=$(find ~/.gstack/sessions -mmin -120 -type f 2>/dev/null | wc -l | tr find ~/.gstack/sessions -mmin +120 -type f -delete 2>/dev/null || true _CONTRIB=$(~/.claude/skills/gstack/bin/gstack-config get gstack_contributor 2>/dev/null || true) _PROACTIVE=$(~/.claude/skills/gstack/bin/gstack-config get proactive 2>/dev/null || echo "true") +_PROACTIVE_PROMPTED=$([ -f ~/.gstack/.proactive-prompted ] && echo "yes" || echo "no") _BRANCH=$(git branch --show-current 2>/dev/null || echo "unknown") echo "BRANCH: $_BRANCH" echo "PROACTIVE: $_PROACTIVE" +echo "PROACTIVE_PROMPTED: $_PROACTIVE_PROMPTED" source <(~/.claude/skills/gstack/bin/gstack-repo-mode 2>/dev/null) || true REPO_MODE=${REPO_MODE:-unknown} echo "REPO_MODE: $REPO_MODE" @@ -49,8 +51,11 @@ echo '{"skill":"browse","ts":"'$(date -u +%Y-%m-%dT%H:%M:%SZ)'","repo":"'$(basen for _PF in $(find ~/.gstack/analytics -maxdepth 1 -name '.pending-*' 2>/dev/null); do [ -f "$_PF" ] && ~/.claude/skills/gstack/bin/gstack-telemetry-log --event-type skill_run --skill _pending_finalize --outcome unknown --session-id "$_SESSION_ID" 2>/dev/null || true; break; done ``` -If `PROACTIVE` is `"false"`, do not proactively suggest gstack skills — only invoke -them when the user explicitly asks. The user opted out of proactive suggestions. +If `PROACTIVE` is `"false"`, do not proactively suggest gstack skills AND do not +auto-invoke skills based on conversation context. Only run skills the user explicitly +types (e.g., /qa, /ship). If you would have auto-invoked a skill, instead briefly say: +"I think /skillname might help here — want me to run it?" and wait for confirmation. +The user opted out of proactive behavior. 
If output shows `UPGRADE_AVAILABLE `: read `~/.claude/skills/gstack/gstack-upgrade/SKILL.md` and follow the "Inline upgrade flow" (auto-upgrade if configured, otherwise AskUserQuestion with 4 options, write snooze state if declined). If `JUST_UPGRADED `: tell user "Running gstack v{to} (just updated!)" and continue. @@ -99,112 +104,44 @@ touch ~/.gstack/.telemetry-prompted This only happens once. If `TEL_PROMPTED` is `yes`, skip this entirely. -## AskUserQuestion Format +If `PROACTIVE_PROMPTED` is `no` AND `TEL_PROMPTED` is `yes`: After telemetry is handled, +ask the user about proactive behavior. Use AskUserQuestion: -**ALWAYS follow this structure for every AskUserQuestion call:** -1. **Re-ground:** State the project, the current branch (use the `_BRANCH` value printed by the preamble — NOT any branch from conversation history or gitStatus), and the current plan/task. (1-2 sentences) -2. **Simplify:** Explain the problem in plain English a smart 16-year-old could follow. No raw function names, no internal jargon, no implementation details. Use concrete examples and analogies. Say what it DOES, not what it's called. -3. **Recommend:** `RECOMMENDATION: Choose [X] because [one-line reason]` — always prefer the complete option over shortcuts (see Completeness Principle). Include `Completeness: X/10` for each option. Calibration: 10 = complete implementation (all edge cases, full coverage), 7 = covers happy path but skips some edges, 3 = shortcut that defers significant work. If both options are 8+, pick the higher; if one is ≤5, flag it. -4. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` -5. **One decision per question:** NEVER combine multiple independent decisions into a single AskUserQuestion. Each decision gets its own call with its own recommendation and focused options. Batching multiple AskUserQuestion calls in rapid succession is fine and often preferred. 
Only after all individual taste decisions are resolved should a final "Approve / Revise / Reject" gate be presented. +> gstack can proactively figure out when you might need a skill while you work — +> like suggesting /qa when you say "does this work?" or /investigate when you hit +> a bug. We recommend keeping this on — it speeds up every part of your workflow. -Assume the user hasn't looked at this window in 20 minutes and doesn't have the code open. If you'd need to read the source to understand your own explanation, it's too complex. +Options: +- A) Keep it on (recommended) +- B) Turn it off — I'll type /commands myself -Per-skill instructions may add additional formatting rules on top of this baseline. +If A: run `~/.claude/skills/gstack/bin/gstack-config set proactive true` +If B: run `~/.claude/skills/gstack/bin/gstack-config set proactive false` -## Completeness Principle — Boil the Lake - -AI-assisted coding makes the marginal cost of completeness near-zero. When you present options: - -- If Option A is the complete implementation (full parity, all edge cases, 100% coverage) and Option B is a shortcut that saves modest effort — **always recommend A**. The delta between 80 lines and 150 lines is meaningless with CC+gstack. "Good enough" is the wrong instinct when "complete" costs minutes more. -- **Lake vs. ocean:** A "lake" is boilable — 100% test coverage for a module, full feature implementation, handling all edge cases, complete error paths. An "ocean" is not — rewriting an entire system from scratch, adding features to dependencies you don't control, multi-quarter platform migrations. Recommend boiling lakes. Flag oceans as out of scope. -- **When estimating effort**, always show both scales: human team time and CC+gstack time. 
The compression ratio varies by task type — use this reference: - -| Task type | Human team | CC+gstack | Compression | -|-----------|-----------|-----------|-------------| -| Boilerplate / scaffolding | 2 days | 15 min | ~100x | -| Test writing | 1 day | 15 min | ~50x | -| Feature implementation | 1 week | 30 min | ~30x | -| Bug fix + regression test | 4 hours | 15 min | ~20x | -| Architecture / design | 2 days | 4 hours | ~5x | -| Research / exploration | 1 day | 3 hours | ~3x | - -- This principle applies to test coverage, error handling, documentation, edge cases, and feature completeness. Don't skip the last 10% to "save time" — with AI, that 10% costs seconds. - -**Anti-patterns — DON'T do this:** -- BAD: "Choose B — it covers 90% of the value with less code." (If A is only 70 lines more, choose A.) -- BAD: "We can skip edge case handling to save time." (Edge case handling costs minutes with CC.) -- BAD: "Let's defer test coverage to a follow-up PR." (Tests are the cheapest lake to boil.) -- BAD: Quoting only human-team effort: "This would take 2 weeks." (Say: "2 weeks human / ~1 hour CC.") - -## Repo Ownership Mode — See Something, Say Something - -`REPO_MODE` from the preamble tells you who owns issues in this repo: - -- **`solo`** — One person does 80%+ of the work. They own everything. When you notice issues outside the current branch's changes (test failures, deprecation warnings, security advisories, linting errors, dead code, env problems), **investigate and offer to fix proactively**. The solo dev is the only person who will fix it. Default to action. -- **`collaborative`** — Multiple active contributors. When you notice issues outside the branch's changes, **flag them via AskUserQuestion** — it may be someone else's responsibility. Default to asking, not fixing. -- **`unknown`** — Treat as collaborative (safer default — ask before fixing). 
- -**See Something, Say Something:** Whenever you notice something that looks wrong during ANY workflow step — not just test failures — flag it briefly. One sentence: what you noticed and its impact. In solo mode, follow up with "Want me to fix it?" In collaborative mode, just flag it and move on. - -Never let a noticed issue silently pass. The whole point is proactive communication. - -## Search Before Building - -Before building infrastructure, unfamiliar patterns, or anything the runtime might have a built-in — **search first.** Read `~/.claude/skills/gstack/ETHOS.md` for the full philosophy. - -**Three layers of knowledge:** -- **Layer 1** (tried and true — in distribution). Don't reinvent the wheel. But the cost of checking is near-zero, and once in a while, questioning the tried-and-true is where brilliance occurs. -- **Layer 2** (new and popular — search for these). But scrutinize: humans are subject to mania. Search results are inputs to your thinking, not answers. -- **Layer 3** (first principles — prize these above all). Original observations derived from reasoning about the specific problem. The most valuable of all. - -**Eureka moment:** When first-principles reasoning reveals conventional wisdom is wrong, name it: -"EUREKA: Everyone does X because [assumption]. But [evidence] shows this is wrong. Y is better because [reasoning]." - -Log eureka moments: +Always run: ```bash -jq -n --arg ts "$(date -u +%Y-%m-%dT%H:%M:%SZ)" --arg skill "SKILL_NAME" --arg branch "$(git branch --show-current 2>/dev/null)" --arg insight "ONE_LINE_SUMMARY" '{ts:$ts,skill:$skill,branch:$branch,insight:$insight}' >> ~/.gstack/analytics/eureka.jsonl 2>/dev/null || true +touch ~/.gstack/.proactive-prompted ``` -Replace SKILL_NAME and ONE_LINE_SUMMARY. Runs inline — don't stop the workflow. -**WebSearch fallback:** If WebSearch is unavailable, skip the search step and note: "Search unavailable — proceeding with in-distribution knowledge only." +This only happens once. 
If `PROACTIVE_PROMPTED` is `yes`, skip this entirely. ## Contributor Mode -If `_CONTRIB` is `true`: you are in **contributor mode**. You're a gstack user who also helps make it better. +If `_CONTRIB` is `true`: you are in **contributor mode**. At the end of each major workflow step, rate your gstack experience 0-10. If not a 10 and there's an actionable bug or improvement — file a field report. -**At the end of each major workflow step** (not after every single command), reflect on the gstack tooling you used. Rate your experience 0 to 10. If it wasn't a 10, think about why. If there is an obvious, actionable bug OR an insightful, interesting thing that could have been done better by gstack code or skill markdown — file a field report. Maybe our contributor will help make us better! - -**Calibration — this is the bar:** For example, `$B js "await fetch(...)"` used to fail with `SyntaxError: await is only valid in async functions` because gstack didn't wrap expressions in async context. Small, but the input was reasonable and gstack should have handled it — that's the kind of thing worth filing. Things less consequential than this, ignore. - -**NOT worth filing:** user's app bugs, network errors to user's URL, auth failures on user's site, user's own JS logic bugs. - -**To file:** write `~/.gstack/contributor-logs/{slug}.md` with **all sections below** (do not truncate — include every section through the Date/Version footer): +**File only:** gstack tooling bugs where the input was reasonable but gstack failed. **Skip:** user app bugs, network errors, auth failures on user's site. 
+**To file:** write `~/.gstack/contributor-logs/{slug}.md`: ``` # {Title} - -Hey gstack team — ran into this while using /{skill-name}: - -**What I was trying to do:** {what the user/agent was attempting} -**What happened instead:** {what actually happened} -**My rating:** {0-10} — {one sentence on why it wasn't a 10} - -## Steps to reproduce +**What I tried:** {action} | **What happened:** {result} | **Rating:** {0-10} +## Repro 1. {step} - -## Raw output -``` -{paste the actual error or unexpected output here} -``` - ## What would make this a 10 -{one sentence: what gstack should have done differently} - -**Date:** {YYYY-MM-DD} | **Version:** {gstack version} | **Skill:** /{skill} +{one sentence} +**Date:** {YYYY-MM-DD} | **Version:** {version} | **Skill:** /{skill} ``` - -Slug: lowercase, hyphens, max 60 chars (e.g. `browse-js-no-await`). Skip if file already exists. Max 3 reports per session. File inline and continue — don't stop the workflow. Tell user: "Filed gstack field report: {title}" +Slug: lowercase hyphens, max 60 chars. Skip if exists. Max 3/session. File inline, don't stop. 
## Completion Status Protocol diff --git a/canary/SKILL.md b/canary/SKILL.md index fe889c74..08903c71 100644 --- a/canary/SKILL.md +++ b/canary/SKILL.md @@ -29,9 +29,11 @@ _SESSIONS=$(find ~/.gstack/sessions -mmin -120 -type f 2>/dev/null | wc -l | tr find ~/.gstack/sessions -mmin +120 -type f -delete 2>/dev/null || true _CONTRIB=$(~/.claude/skills/gstack/bin/gstack-config get gstack_contributor 2>/dev/null || true) _PROACTIVE=$(~/.claude/skills/gstack/bin/gstack-config get proactive 2>/dev/null || echo "true") +_PROACTIVE_PROMPTED=$([ -f ~/.gstack/.proactive-prompted ] && echo "yes" || echo "no") _BRANCH=$(git branch --show-current 2>/dev/null || echo "unknown") echo "BRANCH: $_BRANCH" echo "PROACTIVE: $_PROACTIVE" +echo "PROACTIVE_PROMPTED: $_PROACTIVE_PROMPTED" source <(~/.claude/skills/gstack/bin/gstack-repo-mode 2>/dev/null) || true REPO_MODE=${REPO_MODE:-unknown} echo "REPO_MODE: $REPO_MODE" @@ -49,8 +51,11 @@ echo '{"skill":"canary","ts":"'$(date -u +%Y-%m-%dT%H:%M:%SZ)'","repo":"'$(basen for _PF in $(find ~/.gstack/analytics -maxdepth 1 -name '.pending-*' 2>/dev/null); do [ -f "$_PF" ] && ~/.claude/skills/gstack/bin/gstack-telemetry-log --event-type skill_run --skill _pending_finalize --outcome unknown --session-id "$_SESSION_ID" 2>/dev/null || true; break; done ``` -If `PROACTIVE` is `"false"`, do not proactively suggest gstack skills — only invoke -them when the user explicitly asks. The user opted out of proactive suggestions. +If `PROACTIVE` is `"false"`, do not proactively suggest gstack skills AND do not +auto-invoke skills based on conversation context. Only run skills the user explicitly +types (e.g., /qa, /ship). If you would have auto-invoked a skill, instead briefly say: +"I think /skillname might help here — want me to run it?" and wait for confirmation. +The user opted out of proactive behavior. 
If output shows `UPGRADE_AVAILABLE `: read `~/.claude/skills/gstack/gstack-upgrade/SKILL.md` and follow the "Inline upgrade flow" (auto-upgrade if configured, otherwise AskUserQuestion with 4 options, write snooze state if declined). If `JUST_UPGRADED `: tell user "Running gstack v{to} (just updated!)" and continue. @@ -99,6 +104,27 @@ touch ~/.gstack/.telemetry-prompted This only happens once. If `TEL_PROMPTED` is `yes`, skip this entirely. +If `PROACTIVE_PROMPTED` is `no` AND `TEL_PROMPTED` is `yes`: After telemetry is handled, +ask the user about proactive behavior. Use AskUserQuestion: + +> gstack can proactively figure out when you might need a skill while you work — +> like suggesting /qa when you say "does this work?" or /investigate when you hit +> a bug. We recommend keeping this on — it speeds up every part of your workflow. + +Options: +- A) Keep it on (recommended) +- B) Turn it off — I'll type /commands myself + +If A: run `~/.claude/skills/gstack/bin/gstack-config set proactive true` +If B: run `~/.claude/skills/gstack/bin/gstack-config set proactive false` + +Always run: +```bash +touch ~/.gstack/.proactive-prompted +``` + +This only happens once. If `PROACTIVE_PROMPTED` is `yes`, skip this entirely. + ## AskUserQuestion Format **ALWAYS follow this structure for every AskUserQuestion call:** @@ -106,7 +132,6 @@ This only happens once. If `TEL_PROMPTED` is `yes`, skip this entirely. 2. **Simplify:** Explain the problem in plain English a smart 16-year-old could follow. No raw function names, no internal jargon, no implementation details. Use concrete examples and analogies. Say what it DOES, not what it's called. 3. **Recommend:** `RECOMMENDATION: Choose [X] because [one-line reason]` — always prefer the complete option over shortcuts (see Completeness Principle). Include `Completeness: X/10` for each option. 
Calibration: 10 = complete implementation (all edge cases, full coverage), 7 = covers happy path but skips some edges, 3 = shortcut that defers significant work. If both options are 8+, pick the higher; if one is ≤5, flag it. 4. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` -5. **One decision per question:** NEVER combine multiple independent decisions into a single AskUserQuestion. Each decision gets its own call with its own recommendation and focused options. Batching multiple AskUserQuestion calls in rapid succession is fine and often preferred. Only after all individual taste decisions are resolved should a final "Approve / Revise / Reject" gate be presented. Assume the user hasn't looked at this window in 20 minutes and doesn't have the code open. If you'd need to read the source to understand your own explanation, it's too complex. @@ -114,97 +139,36 @@ Per-skill instructions may add additional formatting rules on top of this baseli ## Completeness Principle — Boil the Lake -AI-assisted coding makes the marginal cost of completeness near-zero. When you present options: +AI makes completeness near-free. Always recommend the complete option over shortcuts — the delta is minutes with CC+gstack. A "lake" (100% coverage, all edge cases) is boilable; an "ocean" (full rewrite, multi-quarter migration) is not. Boil lakes, flag oceans. -- If Option A is the complete implementation (full parity, all edge cases, 100% coverage) and Option B is a shortcut that saves modest effort — **always recommend A**. The delta between 80 lines and 150 lines is meaningless with CC+gstack. "Good enough" is the wrong instinct when "complete" costs minutes more. -- **Lake vs. ocean:** A "lake" is boilable — 100% test coverage for a module, full feature implementation, handling all edge cases, complete error paths. 
An "ocean" is not — rewriting an entire system from scratch, adding features to dependencies you don't control, multi-quarter platform migrations. Recommend boiling lakes. Flag oceans as out of scope. -- **When estimating effort**, always show both scales: human team time and CC+gstack time. The compression ratio varies by task type — use this reference: +**Effort reference** — always show both scales: | Task type | Human team | CC+gstack | Compression | |-----------|-----------|-----------|-------------| -| Boilerplate / scaffolding | 2 days | 15 min | ~100x | -| Test writing | 1 day | 15 min | ~50x | -| Feature implementation | 1 week | 30 min | ~30x | -| Bug fix + regression test | 4 hours | 15 min | ~20x | -| Architecture / design | 2 days | 4 hours | ~5x | -| Research / exploration | 1 day | 3 hours | ~3x | +| Boilerplate | 2 days | 15 min | ~100x | +| Tests | 1 day | 15 min | ~50x | +| Feature | 1 week | 30 min | ~30x | +| Bug fix | 4 hours | 15 min | ~20x | -- This principle applies to test coverage, error handling, documentation, edge cases, and feature completeness. Don't skip the last 10% to "save time" — with AI, that 10% costs seconds. - -**Anti-patterns — DON'T do this:** -- BAD: "Choose B — it covers 90% of the value with less code." (If A is only 70 lines more, choose A.) -- BAD: "We can skip edge case handling to save time." (Edge case handling costs minutes with CC.) -- BAD: "Let's defer test coverage to a follow-up PR." (Tests are the cheapest lake to boil.) -- BAD: Quoting only human-team effort: "This would take 2 weeks." (Say: "2 weeks human / ~1 hour CC.") - -## Repo Ownership Mode — See Something, Say Something - -`REPO_MODE` from the preamble tells you who owns issues in this repo: - -- **`solo`** — One person does 80%+ of the work. They own everything. 
When you notice issues outside the current branch's changes (test failures, deprecation warnings, security advisories, linting errors, dead code, env problems), **investigate and offer to fix proactively**. The solo dev is the only person who will fix it. Default to action. -- **`collaborative`** — Multiple active contributors. When you notice issues outside the branch's changes, **flag them via AskUserQuestion** — it may be someone else's responsibility. Default to asking, not fixing. -- **`unknown`** — Treat as collaborative (safer default — ask before fixing). - -**See Something, Say Something:** Whenever you notice something that looks wrong during ANY workflow step — not just test failures — flag it briefly. One sentence: what you noticed and its impact. In solo mode, follow up with "Want me to fix it?" In collaborative mode, just flag it and move on. - -Never let a noticed issue silently pass. The whole point is proactive communication. - -## Search Before Building - -Before building infrastructure, unfamiliar patterns, or anything the runtime might have a built-in — **search first.** Read `~/.claude/skills/gstack/ETHOS.md` for the full philosophy. - -**Three layers of knowledge:** -- **Layer 1** (tried and true — in distribution). Don't reinvent the wheel. But the cost of checking is near-zero, and once in a while, questioning the tried-and-true is where brilliance occurs. -- **Layer 2** (new and popular — search for these). But scrutinize: humans are subject to mania. Search results are inputs to your thinking, not answers. -- **Layer 3** (first principles — prize these above all). Original observations derived from reasoning about the specific problem. The most valuable of all. - -**Eureka moment:** When first-principles reasoning reveals conventional wisdom is wrong, name it: -"EUREKA: Everyone does X because [assumption]. But [evidence] shows this is wrong. Y is better because [reasoning]." 
- -Log eureka moments: -```bash -jq -n --arg ts "$(date -u +%Y-%m-%dT%H:%M:%SZ)" --arg skill "SKILL_NAME" --arg branch "$(git branch --show-current 2>/dev/null)" --arg insight "ONE_LINE_SUMMARY" '{ts:$ts,skill:$skill,branch:$branch,insight:$insight}' >> ~/.gstack/analytics/eureka.jsonl 2>/dev/null || true -``` -Replace SKILL_NAME and ONE_LINE_SUMMARY. Runs inline — don't stop the workflow. - -**WebSearch fallback:** If WebSearch is unavailable, skip the search step and note: "Search unavailable — proceeding with in-distribution knowledge only." +Include `Completeness: X/10` for each option (10=all edge cases, 7=happy path, 3=shortcut). ## Contributor Mode -If `_CONTRIB` is `true`: you are in **contributor mode**. You're a gstack user who also helps make it better. +If `_CONTRIB` is `true`: you are in **contributor mode**. At the end of each major workflow step, rate your gstack experience 0-10. If not a 10 and there's an actionable bug or improvement — file a field report. -**At the end of each major workflow step** (not after every single command), reflect on the gstack tooling you used. Rate your experience 0 to 10. If it wasn't a 10, think about why. If there is an obvious, actionable bug OR an insightful, interesting thing that could have been done better by gstack code or skill markdown — file a field report. Maybe our contributor will help make us better! - -**Calibration — this is the bar:** For example, `$B js "await fetch(...)"` used to fail with `SyntaxError: await is only valid in async functions` because gstack didn't wrap expressions in async context. Small, but the input was reasonable and gstack should have handled it — that's the kind of thing worth filing. Things less consequential than this, ignore. - -**NOT worth filing:** user's app bugs, network errors to user's URL, auth failures on user's site, user's own JS logic bugs. 
- -**To file:** write `~/.gstack/contributor-logs/{slug}.md` with **all sections below** (do not truncate — include every section through the Date/Version footer): +**File only:** gstack tooling bugs where the input was reasonable but gstack failed. **Skip:** user app bugs, network errors, auth failures on user's site. +**To file:** write `~/.gstack/contributor-logs/{slug}.md`: ``` # {Title} - -Hey gstack team — ran into this while using /{skill-name}: - -**What I was trying to do:** {what the user/agent was attempting} -**What happened instead:** {what actually happened} -**My rating:** {0-10} — {one sentence on why it wasn't a 10} - -## Steps to reproduce +**What I tried:** {action} | **What happened:** {result} | **Rating:** {0-10} +## Repro 1. {step} - -## Raw output -``` -{paste the actual error or unexpected output here} -``` - ## What would make this a 10 -{one sentence: what gstack should have done differently} - -**Date:** {YYYY-MM-DD} | **Version:** {gstack version} | **Skill:** /{skill} +{one sentence} +**Date:** {YYYY-MM-DD} | **Version:** {version} | **Skill:** /{skill} ``` - -Slug: lowercase, hyphens, max 60 chars (e.g. `browse-js-no-await`). Skip if file already exists. Max 3 reports per session. File inline and continue — don't stop the workflow. Tell user: "Filed gstack field report: {title}" +Slug: lowercase hyphens, max 60 chars. Skip if exists. Max 3/session. File inline, don't stop. ## Completion Status Protocol @@ -314,22 +278,42 @@ If `NEEDS_SETUP`: 2. Run: `cd && ./setup` 3. If `bun` is not installed: `curl -fsSL https://bun.sh/install | bash` -## Step 0: Detect base branch +## Step 0: Detect platform and base branch -Determine which branch this PR targets. Use the result as "the base branch" in all subsequent steps. +First, detect the git hosting platform from the remote URL: -1. Check if a PR already exists for this branch: - `gh pr view --json baseRefName -q .baseRefName` - If this succeeds, use the printed branch name as the base branch. 
+```bash +git remote get-url origin 2>/dev/null +``` -2. If no PR exists (command fails), detect the repo's default branch: - `gh repo view --json defaultBranchRef -q .defaultBranchRef.name` +- If the URL contains "github.com" → platform is **GitHub** +- If the URL contains "gitlab" → platform is **GitLab** +- Otherwise, check CLI availability: + - `gh auth status 2>/dev/null` succeeds → platform is **GitHub** (covers GitHub Enterprise) + - `glab auth status 2>/dev/null` succeeds → platform is **GitLab** (covers self-hosted) + - Neither → **unknown** (use git-native commands only) -3. If both commands fail, fall back to `main`. +Determine which branch this PR/MR targets, or the repo's default branch if no +PR/MR exists. Use the result as "the base branch" in all subsequent steps. + +**If GitHub:** +1. `gh pr view --json baseRefName -q .baseRefName` — if succeeds, use it +2. `gh repo view --json defaultBranchRef -q .defaultBranchRef.name` — if succeeds, use it + +**If GitLab:** +1. `glab mr view -F json 2>/dev/null` and extract the `target_branch` field — if succeeds, use it +2. `glab repo view -F json 2>/dev/null` and extract the `default_branch` field — if succeeds, use it + +**Git-native fallback (if unknown platform, or CLI commands fail):** +1. `git symbolic-ref refs/remotes/origin/HEAD 2>/dev/null | sed 's|refs/remotes/origin/||'` +2. If that fails: `git rev-parse --verify origin/main 2>/dev/null` → use `main` +3. If that fails: `git rev-parse --verify origin/master 2>/dev/null` → use `master` + +If all fail, fall back to `main`. Print the detected base branch name. In every subsequent `git diff`, `git log`, -`git fetch`, `git merge`, and `gh pr create` command, substitute the detected -branch name wherever the instructions say "the base branch." +`git fetch`, `git merge`, and PR/MR creation command, substitute the detected +branch name wherever the instructions say "the base branch" or `<default>`.
--- diff --git a/codex/SKILL.md b/codex/SKILL.md index 8bce22e5..6e19cd04 100644 --- a/codex/SKILL.md +++ b/codex/SKILL.md @@ -30,9 +30,11 @@ _SESSIONS=$(find ~/.gstack/sessions -mmin -120 -type f 2>/dev/null | wc -l | tr find ~/.gstack/sessions -mmin +120 -type f -delete 2>/dev/null || true _CONTRIB=$(~/.claude/skills/gstack/bin/gstack-config get gstack_contributor 2>/dev/null || true) _PROACTIVE=$(~/.claude/skills/gstack/bin/gstack-config get proactive 2>/dev/null || echo "true") +_PROACTIVE_PROMPTED=$([ -f ~/.gstack/.proactive-prompted ] && echo "yes" || echo "no") _BRANCH=$(git branch --show-current 2>/dev/null || echo "unknown") echo "BRANCH: $_BRANCH" echo "PROACTIVE: $_PROACTIVE" +echo "PROACTIVE_PROMPTED: $_PROACTIVE_PROMPTED" source <(~/.claude/skills/gstack/bin/gstack-repo-mode 2>/dev/null) || true REPO_MODE=${REPO_MODE:-unknown} echo "REPO_MODE: $REPO_MODE" @@ -50,8 +52,11 @@ echo '{"skill":"codex","ts":"'$(date -u +%Y-%m-%dT%H:%M:%SZ)'","repo":"'$(basena for _PF in $(find ~/.gstack/analytics -maxdepth 1 -name '.pending-*' 2>/dev/null); do [ -f "$_PF" ] && ~/.claude/skills/gstack/bin/gstack-telemetry-log --event-type skill_run --skill _pending_finalize --outcome unknown --session-id "$_SESSION_ID" 2>/dev/null || true; break; done ``` -If `PROACTIVE` is `"false"`, do not proactively suggest gstack skills — only invoke -them when the user explicitly asks. The user opted out of proactive suggestions. +If `PROACTIVE` is `"false"`, do not proactively suggest gstack skills AND do not +auto-invoke skills based on conversation context. Only run skills the user explicitly +types (e.g., /qa, /ship). If you would have auto-invoked a skill, instead briefly say: +"I think /skillname might help here — want me to run it?" and wait for confirmation. +The user opted out of proactive behavior. 
If output shows `UPGRADE_AVAILABLE `: read `~/.claude/skills/gstack/gstack-upgrade/SKILL.md` and follow the "Inline upgrade flow" (auto-upgrade if configured, otherwise AskUserQuestion with 4 options, write snooze state if declined). If `JUST_UPGRADED `: tell user "Running gstack v{to} (just updated!)" and continue. @@ -100,6 +105,27 @@ touch ~/.gstack/.telemetry-prompted This only happens once. If `TEL_PROMPTED` is `yes`, skip this entirely. +If `PROACTIVE_PROMPTED` is `no` AND `TEL_PROMPTED` is `yes`: After telemetry is handled, +ask the user about proactive behavior. Use AskUserQuestion: + +> gstack can proactively figure out when you might need a skill while you work — +> like suggesting /qa when you say "does this work?" or /investigate when you hit +> a bug. We recommend keeping this on — it speeds up every part of your workflow. + +Options: +- A) Keep it on (recommended) +- B) Turn it off — I'll type /commands myself + +If A: run `~/.claude/skills/gstack/bin/gstack-config set proactive true` +If B: run `~/.claude/skills/gstack/bin/gstack-config set proactive false` + +Always run: +```bash +touch ~/.gstack/.proactive-prompted +``` + +This only happens once. If `PROACTIVE_PROMPTED` is `yes`, skip this entirely. + ## AskUserQuestion Format **ALWAYS follow this structure for every AskUserQuestion call:** @@ -107,7 +133,6 @@ This only happens once. If `TEL_PROMPTED` is `yes`, skip this entirely. 2. **Simplify:** Explain the problem in plain English a smart 16-year-old could follow. No raw function names, no internal jargon, no implementation details. Use concrete examples and analogies. Say what it DOES, not what it's called. 3. **Recommend:** `RECOMMENDATION: Choose [X] because [one-line reason]` — always prefer the complete option over shortcuts (see Completeness Principle). Include `Completeness: X/10` for each option. 
Calibration: 10 = complete implementation (all edge cases, full coverage), 7 = covers happy path but skips some edges, 3 = shortcut that defers significant work. If both options are 8+, pick the higher; if one is ≤5, flag it. 4. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` -5. **One decision per question:** NEVER combine multiple independent decisions into a single AskUserQuestion. Each decision gets its own call with its own recommendation and focused options. Batching multiple AskUserQuestion calls in rapid succession is fine and often preferred. Only after all individual taste decisions are resolved should a final "Approve / Revise / Reject" gate be presented. Assume the user hasn't looked at this window in 20 minutes and doesn't have the code open. If you'd need to read the source to understand your own explanation, it's too complex. @@ -115,97 +140,54 @@ Per-skill instructions may add additional formatting rules on top of this baseli ## Completeness Principle — Boil the Lake -AI-assisted coding makes the marginal cost of completeness near-zero. When you present options: +AI makes completeness near-free. Always recommend the complete option over shortcuts — the delta is minutes with CC+gstack. A "lake" (100% coverage, all edge cases) is boilable; an "ocean" (full rewrite, multi-quarter migration) is not. Boil lakes, flag oceans. -- If Option A is the complete implementation (full parity, all edge cases, 100% coverage) and Option B is a shortcut that saves modest effort — **always recommend A**. The delta between 80 lines and 150 lines is meaningless with CC+gstack. "Good enough" is the wrong instinct when "complete" costs minutes more. -- **Lake vs. ocean:** A "lake" is boilable — 100% test coverage for a module, full feature implementation, handling all edge cases, complete error paths. 
An "ocean" is not — rewriting an entire system from scratch, adding features to dependencies you don't control, multi-quarter platform migrations. Recommend boiling lakes. Flag oceans as out of scope. -- **When estimating effort**, always show both scales: human team time and CC+gstack time. The compression ratio varies by task type — use this reference: +**Effort reference** — always show both scales: | Task type | Human team | CC+gstack | Compression | |-----------|-----------|-----------|-------------| -| Boilerplate / scaffolding | 2 days | 15 min | ~100x | -| Test writing | 1 day | 15 min | ~50x | -| Feature implementation | 1 week | 30 min | ~30x | -| Bug fix + regression test | 4 hours | 15 min | ~20x | -| Architecture / design | 2 days | 4 hours | ~5x | -| Research / exploration | 1 day | 3 hours | ~3x | +| Boilerplate | 2 days | 15 min | ~100x | +| Tests | 1 day | 15 min | ~50x | +| Feature | 1 week | 30 min | ~30x | +| Bug fix | 4 hours | 15 min | ~20x | -- This principle applies to test coverage, error handling, documentation, edge cases, and feature completeness. Don't skip the last 10% to "save time" — with AI, that 10% costs seconds. +Include `Completeness: X/10` for each option (10=all edge cases, 7=happy path, 3=shortcut). -**Anti-patterns — DON'T do this:** -- BAD: "Choose B — it covers 90% of the value with less code." (If A is only 70 lines more, choose A.) -- BAD: "We can skip edge case handling to save time." (Edge case handling costs minutes with CC.) -- BAD: "Let's defer test coverage to a follow-up PR." (Tests are the cheapest lake to boil.) -- BAD: Quoting only human-team effort: "This would take 2 weeks." (Say: "2 weeks human / ~1 hour CC.") +## Repo Ownership — See Something, Say Something -## Repo Ownership Mode — See Something, Say Something +`REPO_MODE` controls how to handle issues outside your branch: +- **`solo`** — You own everything. Investigate and offer to fix proactively. 
+- **`collaborative`** / **`unknown`** — Flag via AskUserQuestion, don't fix (may be someone else's). -`REPO_MODE` from the preamble tells you who owns issues in this repo: - -- **`solo`** — One person does 80%+ of the work. They own everything. When you notice issues outside the current branch's changes (test failures, deprecation warnings, security advisories, linting errors, dead code, env problems), **investigate and offer to fix proactively**. The solo dev is the only person who will fix it. Default to action. -- **`collaborative`** — Multiple active contributors. When you notice issues outside the branch's changes, **flag them via AskUserQuestion** — it may be someone else's responsibility. Default to asking, not fixing. -- **`unknown`** — Treat as collaborative (safer default — ask before fixing). - -**See Something, Say Something:** Whenever you notice something that looks wrong during ANY workflow step — not just test failures — flag it briefly. One sentence: what you noticed and its impact. In solo mode, follow up with "Want me to fix it?" In collaborative mode, just flag it and move on. - -Never let a noticed issue silently pass. The whole point is proactive communication. +Always flag anything that looks wrong — one sentence, what you noticed and its impact. ## Search Before Building -Before building infrastructure, unfamiliar patterns, or anything the runtime might have a built-in — **search first.** Read `~/.claude/skills/gstack/ETHOS.md` for the full philosophy. +Before building anything unfamiliar, **search first.** See `~/.claude/skills/gstack/ETHOS.md`. +- **Layer 1** (tried and true) — don't reinvent. **Layer 2** (new and popular) — scrutinize. **Layer 3** (first principles) — prize above all. -**Three layers of knowledge:** -- **Layer 1** (tried and true — in distribution). Don't reinvent the wheel. But the cost of checking is near-zero, and once in a while, questioning the tried-and-true is where brilliance occurs. 
-- **Layer 2** (new and popular — search for these). But scrutinize: humans are subject to mania. Search results are inputs to your thinking, not answers. -- **Layer 3** (first principles — prize these above all). Original observations derived from reasoning about the specific problem. The most valuable of all. - -**Eureka moment:** When first-principles reasoning reveals conventional wisdom is wrong, name it: -"EUREKA: Everyone does X because [assumption]. But [evidence] shows this is wrong. Y is better because [reasoning]." - -Log eureka moments: +**Eureka:** When first-principles reasoning contradicts conventional wisdom, name it and log: ```bash jq -n --arg ts "$(date -u +%Y-%m-%dT%H:%M:%SZ)" --arg skill "SKILL_NAME" --arg branch "$(git branch --show-current 2>/dev/null)" --arg insight "ONE_LINE_SUMMARY" '{ts:$ts,skill:$skill,branch:$branch,insight:$insight}' >> ~/.gstack/analytics/eureka.jsonl 2>/dev/null || true ``` -Replace SKILL_NAME and ONE_LINE_SUMMARY. Runs inline — don't stop the workflow. - -**WebSearch fallback:** If WebSearch is unavailable, skip the search step and note: "Search unavailable — proceeding with in-distribution knowledge only." ## Contributor Mode -If `_CONTRIB` is `true`: you are in **contributor mode**. You're a gstack user who also helps make it better. +If `_CONTRIB` is `true`: you are in **contributor mode**. At the end of each major workflow step, rate your gstack experience 0-10. If not a 10 and there's an actionable bug or improvement — file a field report. -**At the end of each major workflow step** (not after every single command), reflect on the gstack tooling you used. Rate your experience 0 to 10. If it wasn't a 10, think about why. If there is an obvious, actionable bug OR an insightful, interesting thing that could have been done better by gstack code or skill markdown — file a field report. Maybe our contributor will help make us better! 
- -**Calibration — this is the bar:** For example, `$B js "await fetch(...)"` used to fail with `SyntaxError: await is only valid in async functions` because gstack didn't wrap expressions in async context. Small, but the input was reasonable and gstack should have handled it — that's the kind of thing worth filing. Things less consequential than this, ignore. - -**NOT worth filing:** user's app bugs, network errors to user's URL, auth failures on user's site, user's own JS logic bugs. - -**To file:** write `~/.gstack/contributor-logs/{slug}.md` with **all sections below** (do not truncate — include every section through the Date/Version footer): +**File only:** gstack tooling bugs where the input was reasonable but gstack failed. **Skip:** user app bugs, network errors, auth failures on user's site. +**To file:** write `~/.gstack/contributor-logs/{slug}.md`: ``` # {Title} - -Hey gstack team — ran into this while using /{skill-name}: - -**What I was trying to do:** {what the user/agent was attempting} -**What happened instead:** {what actually happened} -**My rating:** {0-10} — {one sentence on why it wasn't a 10} - -## Steps to reproduce +**What I tried:** {action} | **What happened:** {result} | **Rating:** {0-10} +## Repro 1. {step} - -## Raw output -``` -{paste the actual error or unexpected output here} -``` - ## What would make this a 10 -{one sentence: what gstack should have done differently} - -**Date:** {YYYY-MM-DD} | **Version:** {gstack version} | **Skill:** /{skill} +{one sentence} +**Date:** {YYYY-MM-DD} | **Version:** {version} | **Skill:** /{skill} ``` - -Slug: lowercase, hyphens, max 60 chars (e.g. `browse-js-no-await`). Skip if file already exists. Max 3 reports per session. File inline and continue — don't stop the workflow. Tell user: "Filed gstack field report: {title}" +Slug: lowercase hyphens, max 60 chars. Skip if exists. Max 3/session. File inline, don't stop. 
## Completion Status Protocol @@ -296,22 +278,42 @@ Then write a `## GSTACK REVIEW REPORT` section to the end of the plan file: file you are allowed to edit in plan mode. The plan file review report is part of the plan's living status. -## Step 0: Detect base branch +## Step 0: Detect platform and base branch -Determine which branch this PR targets. Use the result as "the base branch" in all subsequent steps. +First, detect the git hosting platform from the remote URL: -1. Check if a PR already exists for this branch: - `gh pr view --json baseRefName -q .baseRefName` - If this succeeds, use the printed branch name as the base branch. +```bash +git remote get-url origin 2>/dev/null +``` -2. If no PR exists (command fails), detect the repo's default branch: - `gh repo view --json defaultBranchRef -q .defaultBranchRef.name` +- If the URL contains "github.com" → platform is **GitHub** +- If the URL contains "gitlab" → platform is **GitLab** +- Otherwise, check CLI availability: + - `gh auth status 2>/dev/null` succeeds → platform is **GitHub** (covers GitHub Enterprise) + - `glab auth status 2>/dev/null` succeeds → platform is **GitLab** (covers self-hosted) + - Neither → **unknown** (use git-native commands only) -3. If both commands fail, fall back to `main`. +Determine which branch this PR/MR targets, or the repo's default branch if no +PR/MR exists. Use the result as "the base branch" in all subsequent steps. + +**If GitHub:** +1. `gh pr view --json baseRefName -q .baseRefName` — if succeeds, use it +2. `gh repo view --json defaultBranchRef -q .defaultBranchRef.name` — if succeeds, use it + +**If GitLab:** +1. `glab mr view -F json 2>/dev/null` and extract the `target_branch` field — if succeeds, use it +2. `glab repo view -F json 2>/dev/null` and extract the `default_branch` field — if succeeds, use it + +**Git-native fallback (if unknown platform, or CLI commands fail):** +1. 
`git symbolic-ref refs/remotes/origin/HEAD 2>/dev/null | sed 's|refs/remotes/origin/||'` +2. If that fails: `git rev-parse --verify origin/main 2>/dev/null` → use `main` +3. If that fails: `git rev-parse --verify origin/master 2>/dev/null` → use `master` + +If all fail, fall back to `main`. Print the detected base branch name. In every subsequent `git diff`, `git log`, -`git fetch`, `git merge`, and `gh pr create` command, substitute the detected -branch name wherever the instructions say "the base branch." +`git fetch`, `git merge`, and PR/MR creation command, substitute the detected +branch name wherever the instructions say "the base branch" or `<default>`. --- diff --git a/cso/SKILL.md b/cso/SKILL.md index c023e1eb..3f092fd6 100644 --- a/cso/SKILL.md +++ b/cso/SKILL.md @@ -33,9 +33,11 @@ _SESSIONS=$(find ~/.gstack/sessions -mmin -120 -type f 2>/dev/null | wc -l | tr find ~/.gstack/sessions -mmin +120 -type f -delete 2>/dev/null || true _CONTRIB=$(~/.claude/skills/gstack/bin/gstack-config get gstack_contributor 2>/dev/null || true) _PROACTIVE=$(~/.claude/skills/gstack/bin/gstack-config get proactive 2>/dev/null || echo "true") +_PROACTIVE_PROMPTED=$([ -f ~/.gstack/.proactive-prompted ] && echo "yes" || echo "no") _BRANCH=$(git branch --show-current 2>/dev/null || echo "unknown") echo "BRANCH: $_BRANCH" echo "PROACTIVE: $_PROACTIVE" +echo "PROACTIVE_PROMPTED: $_PROACTIVE_PROMPTED" source <(~/.claude/skills/gstack/bin/gstack-repo-mode 2>/dev/null) || true REPO_MODE=${REPO_MODE:-unknown} echo "REPO_MODE: $REPO_MODE" @@ -53,8 +55,11 @@ echo '{"skill":"cso","ts":"'$(date -u +%Y-%m-%dT%H:%M:%SZ)'","repo":"'$(basename for _PF in $(find ~/.gstack/analytics -maxdepth 1 -name '.pending-*' 2>/dev/null); do [ -f "$_PF" ] && ~/.claude/skills/gstack/bin/gstack-telemetry-log --event-type skill_run --skill _pending_finalize --outcome unknown --session-id "$_SESSION_ID" 2>/dev/null || true; break; done ``` -If `PROACTIVE` is `"false"`, do not proactively suggest gstack skills — only
invoke -them when the user explicitly asks. The user opted out of proactive suggestions. +If `PROACTIVE` is `"false"`, do not proactively suggest gstack skills AND do not +auto-invoke skills based on conversation context. Only run skills the user explicitly +types (e.g., /qa, /ship). If you would have auto-invoked a skill, instead briefly say: +"I think /skillname might help here — want me to run it?" and wait for confirmation. +The user opted out of proactive behavior. If output shows `UPGRADE_AVAILABLE <old> <new>`: read `~/.claude/skills/gstack/gstack-upgrade/SKILL.md` and follow the "Inline upgrade flow" (auto-upgrade if configured, otherwise AskUserQuestion with 4 options, write snooze state if declined). If `JUST_UPGRADED <from> <to>`: tell user "Running gstack v{to} (just updated!)" and continue. @@ -103,6 +108,27 @@ touch ~/.gstack/.telemetry-prompted This only happens once. If `TEL_PROMPTED` is `yes`, skip this entirely. +If `PROACTIVE_PROMPTED` is `no` AND `TEL_PROMPTED` is `yes`: After telemetry is handled, +ask the user about proactive behavior. Use AskUserQuestion: + +> gstack can proactively figure out when you might need a skill while you work — +> like suggesting /qa when you say "does this work?" or /investigate when you hit +> a bug. We recommend keeping this on — it speeds up every part of your workflow. + +Options: +- A) Keep it on (recommended) +- B) Turn it off — I'll type /commands myself + +If A: run `~/.claude/skills/gstack/bin/gstack-config set proactive true` +If B: run `~/.claude/skills/gstack/bin/gstack-config set proactive false` + +Always run: +```bash +touch ~/.gstack/.proactive-prompted +``` + +This only happens once. If `PROACTIVE_PROMPTED` is `yes`, skip this entirely. + ## AskUserQuestion Format **ALWAYS follow this structure for every AskUserQuestion call:** @@ -110,7 +136,6 @@ This only happens once. If `TEL_PROMPTED` is `yes`, skip this entirely. 2. **Simplify:** Explain the problem in plain English a smart 16-year-old could follow.
No raw function names, no internal jargon, no implementation details. Use concrete examples and analogies. Say what it DOES, not what it's called. 3. **Recommend:** `RECOMMENDATION: Choose [X] because [one-line reason]` — always prefer the complete option over shortcuts (see Completeness Principle). Include `Completeness: X/10` for each option. Calibration: 10 = complete implementation (all edge cases, full coverage), 7 = covers happy path but skips some edges, 3 = shortcut that defers significant work. If both options are 8+, pick the higher; if one is ≤5, flag it. 4. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` -5. **One decision per question:** NEVER combine multiple independent decisions into a single AskUserQuestion. Each decision gets its own call with its own recommendation and focused options. Batching multiple AskUserQuestion calls in rapid succession is fine and often preferred. Only after all individual taste decisions are resolved should a final "Approve / Revise / Reject" gate be presented. Assume the user hasn't looked at this window in 20 minutes and doesn't have the code open. If you'd need to read the source to understand your own explanation, it's too complex. @@ -118,97 +143,36 @@ Per-skill instructions may add additional formatting rules on top of this baseli ## Completeness Principle — Boil the Lake -AI-assisted coding makes the marginal cost of completeness near-zero. When you present options: +AI makes completeness near-free. Always recommend the complete option over shortcuts — the delta is minutes with CC+gstack. A "lake" (100% coverage, all edge cases) is boilable; an "ocean" (full rewrite, multi-quarter migration) is not. Boil lakes, flag oceans. -- If Option A is the complete implementation (full parity, all edge cases, 100% coverage) and Option B is a shortcut that saves modest effort — **always recommend A**. 
The delta between 80 lines and 150 lines is meaningless with CC+gstack. "Good enough" is the wrong instinct when "complete" costs minutes more. -- **Lake vs. ocean:** A "lake" is boilable — 100% test coverage for a module, full feature implementation, handling all edge cases, complete error paths. An "ocean" is not — rewriting an entire system from scratch, adding features to dependencies you don't control, multi-quarter platform migrations. Recommend boiling lakes. Flag oceans as out of scope. -- **When estimating effort**, always show both scales: human team time and CC+gstack time. The compression ratio varies by task type — use this reference: +**Effort reference** — always show both scales: | Task type | Human team | CC+gstack | Compression | |-----------|-----------|-----------|-------------| -| Boilerplate / scaffolding | 2 days | 15 min | ~100x | -| Test writing | 1 day | 15 min | ~50x | -| Feature implementation | 1 week | 30 min | ~30x | -| Bug fix + regression test | 4 hours | 15 min | ~20x | -| Architecture / design | 2 days | 4 hours | ~5x | -| Research / exploration | 1 day | 3 hours | ~3x | +| Boilerplate | 2 days | 15 min | ~100x | +| Tests | 1 day | 15 min | ~50x | +| Feature | 1 week | 30 min | ~30x | +| Bug fix | 4 hours | 15 min | ~20x | -- This principle applies to test coverage, error handling, documentation, edge cases, and feature completeness. Don't skip the last 10% to "save time" — with AI, that 10% costs seconds. - -**Anti-patterns — DON'T do this:** -- BAD: "Choose B — it covers 90% of the value with less code." (If A is only 70 lines more, choose A.) -- BAD: "We can skip edge case handling to save time." (Edge case handling costs minutes with CC.) -- BAD: "Let's defer test coverage to a follow-up PR." (Tests are the cheapest lake to boil.) -- BAD: Quoting only human-team effort: "This would take 2 weeks." 
(Say: "2 weeks human / ~1 hour CC.") - -## Repo Ownership Mode — See Something, Say Something - -`REPO_MODE` from the preamble tells you who owns issues in this repo: - -- **`solo`** — One person does 80%+ of the work. They own everything. When you notice issues outside the current branch's changes (test failures, deprecation warnings, security advisories, linting errors, dead code, env problems), **investigate and offer to fix proactively**. The solo dev is the only person who will fix it. Default to action. -- **`collaborative`** — Multiple active contributors. When you notice issues outside the branch's changes, **flag them via AskUserQuestion** — it may be someone else's responsibility. Default to asking, not fixing. -- **`unknown`** — Treat as collaborative (safer default — ask before fixing). - -**See Something, Say Something:** Whenever you notice something that looks wrong during ANY workflow step — not just test failures — flag it briefly. One sentence: what you noticed and its impact. In solo mode, follow up with "Want me to fix it?" In collaborative mode, just flag it and move on. - -Never let a noticed issue silently pass. The whole point is proactive communication. - -## Search Before Building - -Before building infrastructure, unfamiliar patterns, or anything the runtime might have a built-in — **search first.** Read `~/.claude/skills/gstack/ETHOS.md` for the full philosophy. - -**Three layers of knowledge:** -- **Layer 1** (tried and true — in distribution). Don't reinvent the wheel. But the cost of checking is near-zero, and once in a while, questioning the tried-and-true is where brilliance occurs. -- **Layer 2** (new and popular — search for these). But scrutinize: humans are subject to mania. Search results are inputs to your thinking, not answers. -- **Layer 3** (first principles — prize these above all). Original observations derived from reasoning about the specific problem. The most valuable of all. 
- -**Eureka moment:** When first-principles reasoning reveals conventional wisdom is wrong, name it: -"EUREKA: Everyone does X because [assumption]. But [evidence] shows this is wrong. Y is better because [reasoning]." - -Log eureka moments: -```bash -jq -n --arg ts "$(date -u +%Y-%m-%dT%H:%M:%SZ)" --arg skill "SKILL_NAME" --arg branch "$(git branch --show-current 2>/dev/null)" --arg insight "ONE_LINE_SUMMARY" '{ts:$ts,skill:$skill,branch:$branch,insight:$insight}' >> ~/.gstack/analytics/eureka.jsonl 2>/dev/null || true -``` -Replace SKILL_NAME and ONE_LINE_SUMMARY. Runs inline — don't stop the workflow. - -**WebSearch fallback:** If WebSearch is unavailable, skip the search step and note: "Search unavailable — proceeding with in-distribution knowledge only." +Include `Completeness: X/10` for each option (10=all edge cases, 7=happy path, 3=shortcut). ## Contributor Mode -If `_CONTRIB` is `true`: you are in **contributor mode**. You're a gstack user who also helps make it better. +If `_CONTRIB` is `true`: you are in **contributor mode**. At the end of each major workflow step, rate your gstack experience 0-10. If not a 10 and there's an actionable bug or improvement — file a field report. -**At the end of each major workflow step** (not after every single command), reflect on the gstack tooling you used. Rate your experience 0 to 10. If it wasn't a 10, think about why. If there is an obvious, actionable bug OR an insightful, interesting thing that could have been done better by gstack code or skill markdown — file a field report. Maybe our contributor will help make us better! - -**Calibration — this is the bar:** For example, `$B js "await fetch(...)"` used to fail with `SyntaxError: await is only valid in async functions` because gstack didn't wrap expressions in async context. Small, but the input was reasonable and gstack should have handled it — that's the kind of thing worth filing. Things less consequential than this, ignore. 
- -**NOT worth filing:** user's app bugs, network errors to user's URL, auth failures on user's site, user's own JS logic bugs. - -**To file:** write `~/.gstack/contributor-logs/{slug}.md` with **all sections below** (do not truncate — include every section through the Date/Version footer): +**File only:** gstack tooling bugs where the input was reasonable but gstack failed. **Skip:** user app bugs, network errors, auth failures on user's site. +**To file:** write `~/.gstack/contributor-logs/{slug}.md`: ``` # {Title} - -Hey gstack team — ran into this while using /{skill-name}: - -**What I was trying to do:** {what the user/agent was attempting} -**What happened instead:** {what actually happened} -**My rating:** {0-10} — {one sentence on why it wasn't a 10} - -## Steps to reproduce +**What I tried:** {action} | **What happened:** {result} | **Rating:** {0-10} +## Repro 1. {step} - -## Raw output -``` -{paste the actual error or unexpected output here} -``` - ## What would make this a 10 -{one sentence: what gstack should have done differently} - -**Date:** {YYYY-MM-DD} | **Version:** {gstack version} | **Skill:** /{skill} +{one sentence} +**Date:** {YYYY-MM-DD} | **Version:** {version} | **Skill:** /{skill} ``` - -Slug: lowercase, hyphens, max 60 chars (e.g. `browse-js-no-await`). Skip if file already exists. Max 3 reports per session. File inline and continue — don't stop the workflow. Tell user: "Filed gstack field report: {title}" +Slug: lowercase hyphens, max 60 chars. Skip if exists. Max 3/session. File inline, don't stop. 
## Completion Status Protocol diff --git a/design-consultation/SKILL.md b/design-consultation/SKILL.md index e265f26d..68cdd346 100644 --- a/design-consultation/SKILL.md +++ b/design-consultation/SKILL.md @@ -34,9 +34,11 @@ _SESSIONS=$(find ~/.gstack/sessions -mmin -120 -type f 2>/dev/null | wc -l | tr find ~/.gstack/sessions -mmin +120 -type f -delete 2>/dev/null || true _CONTRIB=$(~/.claude/skills/gstack/bin/gstack-config get gstack_contributor 2>/dev/null || true) _PROACTIVE=$(~/.claude/skills/gstack/bin/gstack-config get proactive 2>/dev/null || echo "true") +_PROACTIVE_PROMPTED=$([ -f ~/.gstack/.proactive-prompted ] && echo "yes" || echo "no") _BRANCH=$(git branch --show-current 2>/dev/null || echo "unknown") echo "BRANCH: $_BRANCH" echo "PROACTIVE: $_PROACTIVE" +echo "PROACTIVE_PROMPTED: $_PROACTIVE_PROMPTED" source <(~/.claude/skills/gstack/bin/gstack-repo-mode 2>/dev/null) || true REPO_MODE=${REPO_MODE:-unknown} echo "REPO_MODE: $REPO_MODE" @@ -54,8 +56,11 @@ echo '{"skill":"design-consultation","ts":"'$(date -u +%Y-%m-%dT%H:%M:%SZ)'","re for _PF in $(find ~/.gstack/analytics -maxdepth 1 -name '.pending-*' 2>/dev/null); do [ -f "$_PF" ] && ~/.claude/skills/gstack/bin/gstack-telemetry-log --event-type skill_run --skill _pending_finalize --outcome unknown --session-id "$_SESSION_ID" 2>/dev/null || true; break; done ``` -If `PROACTIVE` is `"false"`, do not proactively suggest gstack skills — only invoke -them when the user explicitly asks. The user opted out of proactive suggestions. +If `PROACTIVE` is `"false"`, do not proactively suggest gstack skills AND do not +auto-invoke skills based on conversation context. Only run skills the user explicitly +types (e.g., /qa, /ship). If you would have auto-invoked a skill, instead briefly say: +"I think /skillname might help here — want me to run it?" and wait for confirmation. +The user opted out of proactive behavior. 
If output shows `UPGRADE_AVAILABLE <old> <new>`: read `~/.claude/skills/gstack/gstack-upgrade/SKILL.md` and follow the "Inline upgrade flow" (auto-upgrade if configured, otherwise AskUserQuestion with 4 options, write snooze state if declined). If `JUST_UPGRADED <from> <to>`: tell user "Running gstack v{to} (just updated!)" and continue. @@ -104,6 +109,27 @@ touch ~/.gstack/.telemetry-prompted This only happens once. If `TEL_PROMPTED` is `yes`, skip this entirely. +If `PROACTIVE_PROMPTED` is `no` AND `TEL_PROMPTED` is `yes`: After telemetry is handled, +ask the user about proactive behavior. Use AskUserQuestion: + +> gstack can proactively figure out when you might need a skill while you work — +> like suggesting /qa when you say "does this work?" or /investigate when you hit +> a bug. We recommend keeping this on — it speeds up every part of your workflow. + +Options: +- A) Keep it on (recommended) +- B) Turn it off — I'll type /commands myself + +If A: run `~/.claude/skills/gstack/bin/gstack-config set proactive true` +If B: run `~/.claude/skills/gstack/bin/gstack-config set proactive false` + +Always run: +```bash +touch ~/.gstack/.proactive-prompted +``` + +This only happens once. If `PROACTIVE_PROMPTED` is `yes`, skip this entirely. + ## AskUserQuestion Format **ALWAYS follow this structure for every AskUserQuestion call:** @@ -111,7 +137,6 @@ This only happens once. If `TEL_PROMPTED` is `yes`, skip this entirely. 2. **Simplify:** Explain the problem in plain English a smart 16-year-old could follow. No raw function names, no internal jargon, no implementation details. Use concrete examples and analogies. Say what it DOES, not what it's called. 3. **Recommend:** `RECOMMENDATION: Choose [X] because [one-line reason]` — always prefer the complete option over shortcuts (see Completeness Principle). Include `Completeness: X/10` for each option.
Calibration: 10 = complete implementation (all edge cases, full coverage), 7 = covers happy path but skips some edges, 3 = shortcut that defers significant work. If both options are 8+, pick the higher; if one is ≤5, flag it. 4. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` -5. **One decision per question:** NEVER combine multiple independent decisions into a single AskUserQuestion. Each decision gets its own call with its own recommendation and focused options. Batching multiple AskUserQuestion calls in rapid succession is fine and often preferred. Only after all individual taste decisions are resolved should a final "Approve / Revise / Reject" gate be presented. Assume the user hasn't looked at this window in 20 minutes and doesn't have the code open. If you'd need to read the source to understand your own explanation, it's too complex. @@ -119,97 +144,54 @@ Per-skill instructions may add additional formatting rules on top of this baseli ## Completeness Principle — Boil the Lake -AI-assisted coding makes the marginal cost of completeness near-zero. When you present options: +AI makes completeness near-free. Always recommend the complete option over shortcuts — the delta is minutes with CC+gstack. A "lake" (100% coverage, all edge cases) is boilable; an "ocean" (full rewrite, multi-quarter migration) is not. Boil lakes, flag oceans. -- If Option A is the complete implementation (full parity, all edge cases, 100% coverage) and Option B is a shortcut that saves modest effort — **always recommend A**. The delta between 80 lines and 150 lines is meaningless with CC+gstack. "Good enough" is the wrong instinct when "complete" costs minutes more. -- **Lake vs. ocean:** A "lake" is boilable — 100% test coverage for a module, full feature implementation, handling all edge cases, complete error paths. 
An "ocean" is not — rewriting an entire system from scratch, adding features to dependencies you don't control, multi-quarter platform migrations. Recommend boiling lakes. Flag oceans as out of scope. -- **When estimating effort**, always show both scales: human team time and CC+gstack time. The compression ratio varies by task type — use this reference: +**Effort reference** — always show both scales: | Task type | Human team | CC+gstack | Compression | |-----------|-----------|-----------|-------------| -| Boilerplate / scaffolding | 2 days | 15 min | ~100x | -| Test writing | 1 day | 15 min | ~50x | -| Feature implementation | 1 week | 30 min | ~30x | -| Bug fix + regression test | 4 hours | 15 min | ~20x | -| Architecture / design | 2 days | 4 hours | ~5x | -| Research / exploration | 1 day | 3 hours | ~3x | +| Boilerplate | 2 days | 15 min | ~100x | +| Tests | 1 day | 15 min | ~50x | +| Feature | 1 week | 30 min | ~30x | +| Bug fix | 4 hours | 15 min | ~20x | -- This principle applies to test coverage, error handling, documentation, edge cases, and feature completeness. Don't skip the last 10% to "save time" — with AI, that 10% costs seconds. +Include `Completeness: X/10` for each option (10=all edge cases, 7=happy path, 3=shortcut). -**Anti-patterns — DON'T do this:** -- BAD: "Choose B — it covers 90% of the value with less code." (If A is only 70 lines more, choose A.) -- BAD: "We can skip edge case handling to save time." (Edge case handling costs minutes with CC.) -- BAD: "Let's defer test coverage to a follow-up PR." (Tests are the cheapest lake to boil.) -- BAD: Quoting only human-team effort: "This would take 2 weeks." (Say: "2 weeks human / ~1 hour CC.") +## Repo Ownership — See Something, Say Something -## Repo Ownership Mode — See Something, Say Something +`REPO_MODE` controls how to handle issues outside your branch: +- **`solo`** — You own everything. Investigate and offer to fix proactively. 
+- **`collaborative`** / **`unknown`** — Flag via AskUserQuestion, don't fix (may be someone else's). -`REPO_MODE` from the preamble tells you who owns issues in this repo: - -- **`solo`** — One person does 80%+ of the work. They own everything. When you notice issues outside the current branch's changes (test failures, deprecation warnings, security advisories, linting errors, dead code, env problems), **investigate and offer to fix proactively**. The solo dev is the only person who will fix it. Default to action. -- **`collaborative`** — Multiple active contributors. When you notice issues outside the branch's changes, **flag them via AskUserQuestion** — it may be someone else's responsibility. Default to asking, not fixing. -- **`unknown`** — Treat as collaborative (safer default — ask before fixing). - -**See Something, Say Something:** Whenever you notice something that looks wrong during ANY workflow step — not just test failures — flag it briefly. One sentence: what you noticed and its impact. In solo mode, follow up with "Want me to fix it?" In collaborative mode, just flag it and move on. - -Never let a noticed issue silently pass. The whole point is proactive communication. +Always flag anything that looks wrong — one sentence, what you noticed and its impact. ## Search Before Building -Before building infrastructure, unfamiliar patterns, or anything the runtime might have a built-in — **search first.** Read `~/.claude/skills/gstack/ETHOS.md` for the full philosophy. +Before building anything unfamiliar, **search first.** See `~/.claude/skills/gstack/ETHOS.md`. +- **Layer 1** (tried and true) — don't reinvent. **Layer 2** (new and popular) — scrutinize. **Layer 3** (first principles) — prize above all. -**Three layers of knowledge:** -- **Layer 1** (tried and true — in distribution). Don't reinvent the wheel. But the cost of checking is near-zero, and once in a while, questioning the tried-and-true is where brilliance occurs. 
-- **Layer 2** (new and popular — search for these). But scrutinize: humans are subject to mania. Search results are inputs to your thinking, not answers. -- **Layer 3** (first principles — prize these above all). Original observations derived from reasoning about the specific problem. The most valuable of all. - -**Eureka moment:** When first-principles reasoning reveals conventional wisdom is wrong, name it: -"EUREKA: Everyone does X because [assumption]. But [evidence] shows this is wrong. Y is better because [reasoning]." - -Log eureka moments: +**Eureka:** When first-principles reasoning contradicts conventional wisdom, name it and log: ```bash jq -n --arg ts "$(date -u +%Y-%m-%dT%H:%M:%SZ)" --arg skill "SKILL_NAME" --arg branch "$(git branch --show-current 2>/dev/null)" --arg insight "ONE_LINE_SUMMARY" '{ts:$ts,skill:$skill,branch:$branch,insight:$insight}' >> ~/.gstack/analytics/eureka.jsonl 2>/dev/null || true ``` -Replace SKILL_NAME and ONE_LINE_SUMMARY. Runs inline — don't stop the workflow. - -**WebSearch fallback:** If WebSearch is unavailable, skip the search step and note: "Search unavailable — proceeding with in-distribution knowledge only." ## Contributor Mode -If `_CONTRIB` is `true`: you are in **contributor mode**. You're a gstack user who also helps make it better. +If `_CONTRIB` is `true`: you are in **contributor mode**. At the end of each major workflow step, rate your gstack experience 0-10. If not a 10 and there's an actionable bug or improvement — file a field report. -**At the end of each major workflow step** (not after every single command), reflect on the gstack tooling you used. Rate your experience 0 to 10. If it wasn't a 10, think about why. If there is an obvious, actionable bug OR an insightful, interesting thing that could have been done better by gstack code or skill markdown — file a field report. Maybe our contributor will help make us better! 
- -**Calibration — this is the bar:** For example, `$B js "await fetch(...)"` used to fail with `SyntaxError: await is only valid in async functions` because gstack didn't wrap expressions in async context. Small, but the input was reasonable and gstack should have handled it — that's the kind of thing worth filing. Things less consequential than this, ignore. - -**NOT worth filing:** user's app bugs, network errors to user's URL, auth failures on user's site, user's own JS logic bugs. - -**To file:** write `~/.gstack/contributor-logs/{slug}.md` with **all sections below** (do not truncate — include every section through the Date/Version footer): +**File only:** gstack tooling bugs where the input was reasonable but gstack failed. **Skip:** user app bugs, network errors, auth failures on user's site. +**To file:** write `~/.gstack/contributor-logs/{slug}.md`: ``` # {Title} - -Hey gstack team — ran into this while using /{skill-name}: - -**What I was trying to do:** {what the user/agent was attempting} -**What happened instead:** {what actually happened} -**My rating:** {0-10} — {one sentence on why it wasn't a 10} - -## Steps to reproduce +**What I tried:** {action} | **What happened:** {result} | **Rating:** {0-10} +## Repro 1. {step} - -## Raw output -``` -{paste the actual error or unexpected output here} -``` - ## What would make this a 10 -{one sentence: what gstack should have done differently} - -**Date:** {YYYY-MM-DD} | **Version:** {gstack version} | **Skill:** /{skill} +{one sentence} +**Date:** {YYYY-MM-DD} | **Version:** {version} | **Skill:** /{skill} ``` - -Slug: lowercase, hyphens, max 60 chars (e.g. `browse-js-no-await`). Skip if file already exists. Max 3 reports per session. File inline and continue — don't stop the workflow. Tell user: "Filed gstack field report: {title}" +Slug: lowercase hyphens, max 60 chars. Skip if exists. Max 3/session. File inline, don't stop. 
## Completion Status Protocol @@ -454,7 +436,7 @@ codex exec "Given this product context, propose a complete design direction: - Differentiation: 2 deliberate departures from category norms - Anti-slop: no purple gradients, no 3-column icon grids, no centered everything, no decorative blobs -Be opinionated. Be specific. Do not hedge. This is YOUR design direction — own it." -s read-only -c 'model_reasoning_effort="medium"' --enable web_search_cached 2>"$TMPERR_DESIGN" +Be opinionated. Be specific. Do not hedge. This is YOUR design direction — own it." -C "$(git rev-parse --show-toplevel)" -s read-only -c 'model_reasoning_effort="medium"' --enable web_search_cached 2>"$TMPERR_DESIGN" ``` Use a 5-minute timeout (`timeout: 300000`). After the command completes, read stderr: ```bash diff --git a/design-review/SKILL.md b/design-review/SKILL.md index 38341033..5ebc9d1f 100644 --- a/design-review/SKILL.md +++ b/design-review/SKILL.md @@ -34,9 +34,11 @@ _SESSIONS=$(find ~/.gstack/sessions -mmin -120 -type f 2>/dev/null | wc -l | tr find ~/.gstack/sessions -mmin +120 -type f -delete 2>/dev/null || true _CONTRIB=$(~/.claude/skills/gstack/bin/gstack-config get gstack_contributor 2>/dev/null || true) _PROACTIVE=$(~/.claude/skills/gstack/bin/gstack-config get proactive 2>/dev/null || echo "true") +_PROACTIVE_PROMPTED=$([ -f ~/.gstack/.proactive-prompted ] && echo "yes" || echo "no") _BRANCH=$(git branch --show-current 2>/dev/null || echo "unknown") echo "BRANCH: $_BRANCH" echo "PROACTIVE: $_PROACTIVE" +echo "PROACTIVE_PROMPTED: $_PROACTIVE_PROMPTED" source <(~/.claude/skills/gstack/bin/gstack-repo-mode 2>/dev/null) || true REPO_MODE=${REPO_MODE:-unknown} echo "REPO_MODE: $REPO_MODE" @@ -54,8 +56,11 @@ echo '{"skill":"design-review","ts":"'$(date -u +%Y-%m-%dT%H:%M:%SZ)'","repo":"' for _PF in $(find ~/.gstack/analytics -maxdepth 1 -name '.pending-*' 2>/dev/null); do [ -f "$_PF" ] && ~/.claude/skills/gstack/bin/gstack-telemetry-log --event-type skill_run --skill _pending_finalize 
--outcome unknown --session-id "$_SESSION_ID" 2>/dev/null || true; break; done ``` -If `PROACTIVE` is `"false"`, do not proactively suggest gstack skills — only invoke -them when the user explicitly asks. The user opted out of proactive suggestions. +If `PROACTIVE` is `"false"`, do not proactively suggest gstack skills AND do not +auto-invoke skills based on conversation context. Only run skills the user explicitly +types (e.g., /qa, /ship). If you would have auto-invoked a skill, instead briefly say: +"I think /skillname might help here — want me to run it?" and wait for confirmation. +The user opted out of proactive behavior. If output shows `UPGRADE_AVAILABLE <old> <new>`: read `~/.claude/skills/gstack/gstack-upgrade/SKILL.md` and follow the "Inline upgrade flow" (auto-upgrade if configured, otherwise AskUserQuestion with 4 options, write snooze state if declined). If `JUST_UPGRADED <from> <to>`: tell user "Running gstack v{to} (just updated!)" and continue. @@ -104,6 +109,27 @@ touch ~/.gstack/.telemetry-prompted This only happens once. If `TEL_PROMPTED` is `yes`, skip this entirely. +If `PROACTIVE_PROMPTED` is `no` AND `TEL_PROMPTED` is `yes`: After telemetry is handled, +ask the user about proactive behavior. Use AskUserQuestion: + +> gstack can proactively figure out when you might need a skill while you work — +> like suggesting /qa when you say "does this work?" or /investigate when you hit +> a bug. We recommend keeping this on — it speeds up every part of your workflow. + +Options: +- A) Keep it on (recommended) +- B) Turn it off — I'll type /commands myself + +If A: run `~/.claude/skills/gstack/bin/gstack-config set proactive true` +If B: run `~/.claude/skills/gstack/bin/gstack-config set proactive false` + +Always run: +```bash +touch ~/.gstack/.proactive-prompted +``` + +This only happens once. If `PROACTIVE_PROMPTED` is `yes`, skip this entirely.
+ ## AskUserQuestion Format **ALWAYS follow this structure for every AskUserQuestion call:** @@ -111,7 +137,6 @@ This only happens once. If `TEL_PROMPTED` is `yes`, skip this entirely. 2. **Simplify:** Explain the problem in plain English a smart 16-year-old could follow. No raw function names, no internal jargon, no implementation details. Use concrete examples and analogies. Say what it DOES, not what it's called. 3. **Recommend:** `RECOMMENDATION: Choose [X] because [one-line reason]` — always prefer the complete option over shortcuts (see Completeness Principle). Include `Completeness: X/10` for each option. Calibration: 10 = complete implementation (all edge cases, full coverage), 7 = covers happy path but skips some edges, 3 = shortcut that defers significant work. If both options are 8+, pick the higher; if one is ≤5, flag it. 4. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` -5. **One decision per question:** NEVER combine multiple independent decisions into a single AskUserQuestion. Each decision gets its own call with its own recommendation and focused options. Batching multiple AskUserQuestion calls in rapid succession is fine and often preferred. Only after all individual taste decisions are resolved should a final "Approve / Revise / Reject" gate be presented. Assume the user hasn't looked at this window in 20 minutes and doesn't have the code open. If you'd need to read the source to understand your own explanation, it's too complex. @@ -119,97 +144,54 @@ Per-skill instructions may add additional formatting rules on top of this baseli ## Completeness Principle — Boil the Lake -AI-assisted coding makes the marginal cost of completeness near-zero. When you present options: +AI makes completeness near-free. Always recommend the complete option over shortcuts — the delta is minutes with CC+gstack. 
A "lake" (100% coverage, all edge cases) is boilable; an "ocean" (full rewrite, multi-quarter migration) is not. Boil lakes, flag oceans. -- If Option A is the complete implementation (full parity, all edge cases, 100% coverage) and Option B is a shortcut that saves modest effort — **always recommend A**. The delta between 80 lines and 150 lines is meaningless with CC+gstack. "Good enough" is the wrong instinct when "complete" costs minutes more. -- **Lake vs. ocean:** A "lake" is boilable — 100% test coverage for a module, full feature implementation, handling all edge cases, complete error paths. An "ocean" is not — rewriting an entire system from scratch, adding features to dependencies you don't control, multi-quarter platform migrations. Recommend boiling lakes. Flag oceans as out of scope. -- **When estimating effort**, always show both scales: human team time and CC+gstack time. The compression ratio varies by task type — use this reference: +**Effort reference** — always show both scales: | Task type | Human team | CC+gstack | Compression | |-----------|-----------|-----------|-------------| -| Boilerplate / scaffolding | 2 days | 15 min | ~100x | -| Test writing | 1 day | 15 min | ~50x | -| Feature implementation | 1 week | 30 min | ~30x | -| Bug fix + regression test | 4 hours | 15 min | ~20x | -| Architecture / design | 2 days | 4 hours | ~5x | -| Research / exploration | 1 day | 3 hours | ~3x | +| Boilerplate | 2 days | 15 min | ~100x | +| Tests | 1 day | 15 min | ~50x | +| Feature | 1 week | 30 min | ~30x | +| Bug fix | 4 hours | 15 min | ~20x | -- This principle applies to test coverage, error handling, documentation, edge cases, and feature completeness. Don't skip the last 10% to "save time" — with AI, that 10% costs seconds. +Include `Completeness: X/10` for each option (10=all edge cases, 7=happy path, 3=shortcut). -**Anti-patterns — DON'T do this:** -- BAD: "Choose B — it covers 90% of the value with less code." 
(If A is only 70 lines more, choose A.) -- BAD: "We can skip edge case handling to save time." (Edge case handling costs minutes with CC.) -- BAD: "Let's defer test coverage to a follow-up PR." (Tests are the cheapest lake to boil.) -- BAD: Quoting only human-team effort: "This would take 2 weeks." (Say: "2 weeks human / ~1 hour CC.") +## Repo Ownership — See Something, Say Something -## Repo Ownership Mode — See Something, Say Something +`REPO_MODE` controls how to handle issues outside your branch: +- **`solo`** — You own everything. Investigate and offer to fix proactively. +- **`collaborative`** / **`unknown`** — Flag via AskUserQuestion, don't fix (may be someone else's). -`REPO_MODE` from the preamble tells you who owns issues in this repo: - -- **`solo`** — One person does 80%+ of the work. They own everything. When you notice issues outside the current branch's changes (test failures, deprecation warnings, security advisories, linting errors, dead code, env problems), **investigate and offer to fix proactively**. The solo dev is the only person who will fix it. Default to action. -- **`collaborative`** — Multiple active contributors. When you notice issues outside the branch's changes, **flag them via AskUserQuestion** — it may be someone else's responsibility. Default to asking, not fixing. -- **`unknown`** — Treat as collaborative (safer default — ask before fixing). - -**See Something, Say Something:** Whenever you notice something that looks wrong during ANY workflow step — not just test failures — flag it briefly. One sentence: what you noticed and its impact. In solo mode, follow up with "Want me to fix it?" In collaborative mode, just flag it and move on. - -Never let a noticed issue silently pass. The whole point is proactive communication. +Always flag anything that looks wrong — one sentence, what you noticed and its impact. 
## Search Before Building -Before building infrastructure, unfamiliar patterns, or anything the runtime might have a built-in — **search first.** Read `~/.claude/skills/gstack/ETHOS.md` for the full philosophy. +Before building anything unfamiliar, **search first.** See `~/.claude/skills/gstack/ETHOS.md`. +- **Layer 1** (tried and true) — don't reinvent. **Layer 2** (new and popular) — scrutinize. **Layer 3** (first principles) — prize above all. -**Three layers of knowledge:** -- **Layer 1** (tried and true — in distribution). Don't reinvent the wheel. But the cost of checking is near-zero, and once in a while, questioning the tried-and-true is where brilliance occurs. -- **Layer 2** (new and popular — search for these). But scrutinize: humans are subject to mania. Search results are inputs to your thinking, not answers. -- **Layer 3** (first principles — prize these above all). Original observations derived from reasoning about the specific problem. The most valuable of all. - -**Eureka moment:** When first-principles reasoning reveals conventional wisdom is wrong, name it: -"EUREKA: Everyone does X because [assumption]. But [evidence] shows this is wrong. Y is better because [reasoning]." - -Log eureka moments: +**Eureka:** When first-principles reasoning contradicts conventional wisdom, name it and log: ```bash jq -n --arg ts "$(date -u +%Y-%m-%dT%H:%M:%SZ)" --arg skill "SKILL_NAME" --arg branch "$(git branch --show-current 2>/dev/null)" --arg insight "ONE_LINE_SUMMARY" '{ts:$ts,skill:$skill,branch:$branch,insight:$insight}' >> ~/.gstack/analytics/eureka.jsonl 2>/dev/null || true ``` -Replace SKILL_NAME and ONE_LINE_SUMMARY. Runs inline — don't stop the workflow. - -**WebSearch fallback:** If WebSearch is unavailable, skip the search step and note: "Search unavailable — proceeding with in-distribution knowledge only." ## Contributor Mode -If `_CONTRIB` is `true`: you are in **contributor mode**. You're a gstack user who also helps make it better. 
+If `_CONTRIB` is `true`: you are in **contributor mode**. At the end of each major workflow step, rate your gstack experience 0-10. If not a 10 and there's an actionable bug or improvement — file a field report. -**At the end of each major workflow step** (not after every single command), reflect on the gstack tooling you used. Rate your experience 0 to 10. If it wasn't a 10, think about why. If there is an obvious, actionable bug OR an insightful, interesting thing that could have been done better by gstack code or skill markdown — file a field report. Maybe our contributor will help make us better! - -**Calibration — this is the bar:** For example, `$B js "await fetch(...)"` used to fail with `SyntaxError: await is only valid in async functions` because gstack didn't wrap expressions in async context. Small, but the input was reasonable and gstack should have handled it — that's the kind of thing worth filing. Things less consequential than this, ignore. - -**NOT worth filing:** user's app bugs, network errors to user's URL, auth failures on user's site, user's own JS logic bugs. - -**To file:** write `~/.gstack/contributor-logs/{slug}.md` with **all sections below** (do not truncate — include every section through the Date/Version footer): +**File only:** gstack tooling bugs where the input was reasonable but gstack failed. **Skip:** user app bugs, network errors, auth failures on user's site. +**To file:** write `~/.gstack/contributor-logs/{slug}.md`: ``` # {Title} - -Hey gstack team — ran into this while using /{skill-name}: - -**What I was trying to do:** {what the user/agent was attempting} -**What happened instead:** {what actually happened} -**My rating:** {0-10} — {one sentence on why it wasn't a 10} - -## Steps to reproduce +**What I tried:** {action} | **What happened:** {result} | **Rating:** {0-10} +## Repro 1. 
{step} - -## Raw output -``` -{paste the actual error or unexpected output here} -``` - ## What would make this a 10 -{one sentence: what gstack should have done differently} - -**Date:** {YYYY-MM-DD} | **Version:** {gstack version} | **Skill:** /{skill} +{one sentence} +**Date:** {YYYY-MM-DD} | **Version:** {version} | **Skill:** /{skill} ``` - -Slug: lowercase, hyphens, max 60 chars (e.g. `browse-js-no-await`). Skip if file already exists. Max 3 reports per session. File inline and continue — don't stop the workflow. Tell user: "Filed gstack field report: {title}" +Slug: lowercase hyphens, max 60 chars. Skip if exists. Max 3/session. File inline, don't stop. ## Completion Status Protocol @@ -733,7 +715,7 @@ The test: would a human designer at a respected studio ever ship this? **10. Performance as Design** (6 items) - LCP < 2.0s (web apps), < 1.5s (informational sites) - CLS < 0.1 (no visible layout shifts during load) -- Skeleton quality: shapes match real content, shimmer animation +- Skeleton quality: shapes match real content layout, shimmer animation - Images: `loading="lazy"`, width/height dimensions set, WebP/AVIF format - Fonts: `font-display: swap`, preconnect to CDN origins - No visible font swap flash (FOUT) — critical fonts preloaded @@ -994,7 +976,7 @@ HARD REJECTION — flag if ANY apply: 6. Carousel with no narrative purpose 7. App UI made of stacked cards instead of layout -Be specific. Reference file:line for every finding." -s read-only -c 'model_reasoning_effort="high"' --enable web_search_cached 2>"$TMPERR_DESIGN" +Be specific. Reference file:line for every finding." -C "$(git rev-parse --show-toplevel)" -s read-only -c 'model_reasoning_effort="high"' --enable web_search_cached 2>"$TMPERR_DESIGN" ``` Use a 5-minute timeout (`timeout: 300000`). 
After the command completes, read stderr: ```bash diff --git a/document-release/SKILL.md b/document-release/SKILL.md index 1364e4d9..ee08867a 100644 --- a/document-release/SKILL.md +++ b/document-release/SKILL.md @@ -31,9 +31,11 @@ _SESSIONS=$(find ~/.gstack/sessions -mmin -120 -type f 2>/dev/null | wc -l | tr find ~/.gstack/sessions -mmin +120 -type f -delete 2>/dev/null || true _CONTRIB=$(~/.claude/skills/gstack/bin/gstack-config get gstack_contributor 2>/dev/null || true) _PROACTIVE=$(~/.claude/skills/gstack/bin/gstack-config get proactive 2>/dev/null || echo "true") +_PROACTIVE_PROMPTED=$([ -f ~/.gstack/.proactive-prompted ] && echo "yes" || echo "no") _BRANCH=$(git branch --show-current 2>/dev/null || echo "unknown") echo "BRANCH: $_BRANCH" echo "PROACTIVE: $_PROACTIVE" +echo "PROACTIVE_PROMPTED: $_PROACTIVE_PROMPTED" source <(~/.claude/skills/gstack/bin/gstack-repo-mode 2>/dev/null) || true REPO_MODE=${REPO_MODE:-unknown} echo "REPO_MODE: $REPO_MODE" @@ -51,8 +53,11 @@ echo '{"skill":"document-release","ts":"'$(date -u +%Y-%m-%dT%H:%M:%SZ)'","repo" for _PF in $(find ~/.gstack/analytics -maxdepth 1 -name '.pending-*' 2>/dev/null); do [ -f "$_PF" ] && ~/.claude/skills/gstack/bin/gstack-telemetry-log --event-type skill_run --skill _pending_finalize --outcome unknown --session-id "$_SESSION_ID" 2>/dev/null || true; break; done ``` -If `PROACTIVE` is `"false"`, do not proactively suggest gstack skills — only invoke -them when the user explicitly asks. The user opted out of proactive suggestions. +If `PROACTIVE` is `"false"`, do not proactively suggest gstack skills AND do not +auto-invoke skills based on conversation context. Only run skills the user explicitly +types (e.g., /qa, /ship). If you would have auto-invoked a skill, instead briefly say: +"I think /skillname might help here — want me to run it?" and wait for confirmation. +The user opted out of proactive behavior. 
If output shows `UPGRADE_AVAILABLE <old> <new>`: read `~/.claude/skills/gstack/gstack-upgrade/SKILL.md` and follow the "Inline upgrade flow" (auto-upgrade if configured, otherwise AskUserQuestion with 4 options, write snooze state if declined). If `JUST_UPGRADED <from> <to>`: tell user "Running gstack v{to} (just updated!)" and continue. @@ -101,6 +106,27 @@ touch ~/.gstack/.telemetry-prompted This only happens once. If `TEL_PROMPTED` is `yes`, skip this entirely. +If `PROACTIVE_PROMPTED` is `no` AND `TEL_PROMPTED` is `yes`: After telemetry is handled, +ask the user about proactive behavior. Use AskUserQuestion: + +> gstack can proactively figure out when you might need a skill while you work — +> like suggesting /qa when you say "does this work?" or /investigate when you hit +> a bug. We recommend keeping this on — it speeds up every part of your workflow. + +Options: +- A) Keep it on (recommended) +- B) Turn it off — I'll type /commands myself + +If A: run `~/.claude/skills/gstack/bin/gstack-config set proactive true` +If B: run `~/.claude/skills/gstack/bin/gstack-config set proactive false` + +Always run: +```bash +touch ~/.gstack/.proactive-prompted +``` + +This only happens once. If `PROACTIVE_PROMPTED` is `yes`, skip this entirely. + ## AskUserQuestion Format **ALWAYS follow this structure for every AskUserQuestion call:** @@ -108,7 +134,6 @@ This only happens once. If `TEL_PROMPTED` is `yes`, skip this entirely. 2. **Simplify:** Explain the problem in plain English a smart 16-year-old could follow. No raw function names, no internal jargon, no implementation details. Use concrete examples and analogies. Say what it DOES, not what it's called. 3. **Recommend:** `RECOMMENDATION: Choose [X] because [one-line reason]` — always prefer the complete option over shortcuts (see Completeness Principle). Include `Completeness: X/10` for each option.
Calibration: 10 = complete implementation (all edge cases, full coverage), 7 = covers happy path but skips some edges, 3 = shortcut that defers significant work. If both options are 8+, pick the higher; if one is ≤5, flag it. 4. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` -5. **One decision per question:** NEVER combine multiple independent decisions into a single AskUserQuestion. Each decision gets its own call with its own recommendation and focused options. Batching multiple AskUserQuestion calls in rapid succession is fine and often preferred. Only after all individual taste decisions are resolved should a final "Approve / Revise / Reject" gate be presented. Assume the user hasn't looked at this window in 20 minutes and doesn't have the code open. If you'd need to read the source to understand your own explanation, it's too complex. @@ -116,97 +141,36 @@ Per-skill instructions may add additional formatting rules on top of this baseli ## Completeness Principle — Boil the Lake -AI-assisted coding makes the marginal cost of completeness near-zero. When you present options: +AI makes completeness near-free. Always recommend the complete option over shortcuts — the delta is minutes with CC+gstack. A "lake" (100% coverage, all edge cases) is boilable; an "ocean" (full rewrite, multi-quarter migration) is not. Boil lakes, flag oceans. -- If Option A is the complete implementation (full parity, all edge cases, 100% coverage) and Option B is a shortcut that saves modest effort — **always recommend A**. The delta between 80 lines and 150 lines is meaningless with CC+gstack. "Good enough" is the wrong instinct when "complete" costs minutes more. -- **Lake vs. ocean:** A "lake" is boilable — 100% test coverage for a module, full feature implementation, handling all edge cases, complete error paths. 
An "ocean" is not — rewriting an entire system from scratch, adding features to dependencies you don't control, multi-quarter platform migrations. Recommend boiling lakes. Flag oceans as out of scope. -- **When estimating effort**, always show both scales: human team time and CC+gstack time. The compression ratio varies by task type — use this reference: +**Effort reference** — always show both scales: | Task type | Human team | CC+gstack | Compression | |-----------|-----------|-----------|-------------| -| Boilerplate / scaffolding | 2 days | 15 min | ~100x | -| Test writing | 1 day | 15 min | ~50x | -| Feature implementation | 1 week | 30 min | ~30x | -| Bug fix + regression test | 4 hours | 15 min | ~20x | -| Architecture / design | 2 days | 4 hours | ~5x | -| Research / exploration | 1 day | 3 hours | ~3x | +| Boilerplate | 2 days | 15 min | ~100x | +| Tests | 1 day | 15 min | ~50x | +| Feature | 1 week | 30 min | ~30x | +| Bug fix | 4 hours | 15 min | ~20x | -- This principle applies to test coverage, error handling, documentation, edge cases, and feature completeness. Don't skip the last 10% to "save time" — with AI, that 10% costs seconds. - -**Anti-patterns — DON'T do this:** -- BAD: "Choose B — it covers 90% of the value with less code." (If A is only 70 lines more, choose A.) -- BAD: "We can skip edge case handling to save time." (Edge case handling costs minutes with CC.) -- BAD: "Let's defer test coverage to a follow-up PR." (Tests are the cheapest lake to boil.) -- BAD: Quoting only human-team effort: "This would take 2 weeks." (Say: "2 weeks human / ~1 hour CC.") - -## Repo Ownership Mode — See Something, Say Something - -`REPO_MODE` from the preamble tells you who owns issues in this repo: - -- **`solo`** — One person does 80%+ of the work. They own everything. 
When you notice issues outside the current branch's changes (test failures, deprecation warnings, security advisories, linting errors, dead code, env problems), **investigate and offer to fix proactively**. The solo dev is the only person who will fix it. Default to action. -- **`collaborative`** — Multiple active contributors. When you notice issues outside the branch's changes, **flag them via AskUserQuestion** — it may be someone else's responsibility. Default to asking, not fixing. -- **`unknown`** — Treat as collaborative (safer default — ask before fixing). - -**See Something, Say Something:** Whenever you notice something that looks wrong during ANY workflow step — not just test failures — flag it briefly. One sentence: what you noticed and its impact. In solo mode, follow up with "Want me to fix it?" In collaborative mode, just flag it and move on. - -Never let a noticed issue silently pass. The whole point is proactive communication. - -## Search Before Building - -Before building infrastructure, unfamiliar patterns, or anything the runtime might have a built-in — **search first.** Read `~/.claude/skills/gstack/ETHOS.md` for the full philosophy. - -**Three layers of knowledge:** -- **Layer 1** (tried and true — in distribution). Don't reinvent the wheel. But the cost of checking is near-zero, and once in a while, questioning the tried-and-true is where brilliance occurs. -- **Layer 2** (new and popular — search for these). But scrutinize: humans are subject to mania. Search results are inputs to your thinking, not answers. -- **Layer 3** (first principles — prize these above all). Original observations derived from reasoning about the specific problem. The most valuable of all. - -**Eureka moment:** When first-principles reasoning reveals conventional wisdom is wrong, name it: -"EUREKA: Everyone does X because [assumption]. But [evidence] shows this is wrong. Y is better because [reasoning]." 
- -Log eureka moments: -```bash -jq -n --arg ts "$(date -u +%Y-%m-%dT%H:%M:%SZ)" --arg skill "SKILL_NAME" --arg branch "$(git branch --show-current 2>/dev/null)" --arg insight "ONE_LINE_SUMMARY" '{ts:$ts,skill:$skill,branch:$branch,insight:$insight}' >> ~/.gstack/analytics/eureka.jsonl 2>/dev/null || true -``` -Replace SKILL_NAME and ONE_LINE_SUMMARY. Runs inline — don't stop the workflow. - -**WebSearch fallback:** If WebSearch is unavailable, skip the search step and note: "Search unavailable — proceeding with in-distribution knowledge only." +Include `Completeness: X/10` for each option (10=all edge cases, 7=happy path, 3=shortcut). ## Contributor Mode -If `_CONTRIB` is `true`: you are in **contributor mode**. You're a gstack user who also helps make it better. +If `_CONTRIB` is `true`: you are in **contributor mode**. At the end of each major workflow step, rate your gstack experience 0-10. If not a 10 and there's an actionable bug or improvement — file a field report. -**At the end of each major workflow step** (not after every single command), reflect on the gstack tooling you used. Rate your experience 0 to 10. If it wasn't a 10, think about why. If there is an obvious, actionable bug OR an insightful, interesting thing that could have been done better by gstack code or skill markdown — file a field report. Maybe our contributor will help make us better! - -**Calibration — this is the bar:** For example, `$B js "await fetch(...)"` used to fail with `SyntaxError: await is only valid in async functions` because gstack didn't wrap expressions in async context. Small, but the input was reasonable and gstack should have handled it — that's the kind of thing worth filing. Things less consequential than this, ignore. - -**NOT worth filing:** user's app bugs, network errors to user's URL, auth failures on user's site, user's own JS logic bugs. 
- -**To file:** write `~/.gstack/contributor-logs/{slug}.md` with **all sections below** (do not truncate — include every section through the Date/Version footer): +**File only:** gstack tooling bugs where the input was reasonable but gstack failed. **Skip:** user app bugs, network errors, auth failures on user's site. +**To file:** write `~/.gstack/contributor-logs/{slug}.md`: ``` # {Title} - -Hey gstack team — ran into this while using /{skill-name}: - -**What I was trying to do:** {what the user/agent was attempting} -**What happened instead:** {what actually happened} -**My rating:** {0-10} — {one sentence on why it wasn't a 10} - -## Steps to reproduce +**What I tried:** {action} | **What happened:** {result} | **Rating:** {0-10} +## Repro 1. {step} - -## Raw output -``` -{paste the actual error or unexpected output here} -``` - ## What would make this a 10 -{one sentence: what gstack should have done differently} - -**Date:** {YYYY-MM-DD} | **Version:** {gstack version} | **Skill:** /{skill} +{one sentence} +**Date:** {YYYY-MM-DD} | **Version:** {version} | **Skill:** /{skill} ``` - -Slug: lowercase, hyphens, max 60 chars (e.g. `browse-js-no-await`). Skip if file already exists. Max 3 reports per session. File inline and continue — don't stop the workflow. Tell user: "Filed gstack field report: {title}" +Slug: lowercase hyphens, max 60 chars. Skip if exists. Max 3/session. File inline, don't stop. ## Completion Status Protocol @@ -297,22 +261,42 @@ Then write a `## GSTACK REVIEW REPORT` section to the end of the plan file: file you are allowed to edit in plan mode. The plan file review report is part of the plan's living status. -## Step 0: Detect base branch +## Step 0: Detect platform and base branch -Determine which branch this PR targets. Use the result as "the base branch" in all subsequent steps. +First, detect the git hosting platform from the remote URL: -1. 
Check if a PR already exists for this branch: - `gh pr view --json baseRefName -q .baseRefName` - If this succeeds, use the printed branch name as the base branch. +```bash +git remote get-url origin 2>/dev/null +``` -2. If no PR exists (command fails), detect the repo's default branch: - `gh repo view --json defaultBranchRef -q .defaultBranchRef.name` +- If the URL contains "github.com" → platform is **GitHub** +- If the URL contains "gitlab" → platform is **GitLab** +- Otherwise, check CLI availability: + - `gh auth status 2>/dev/null` succeeds → platform is **GitHub** (covers GitHub Enterprise) + - `glab auth status 2>/dev/null` succeeds → platform is **GitLab** (covers self-hosted) + - Neither → **unknown** (use git-native commands only) -3. If both commands fail, fall back to `main`. +Determine which branch this PR/MR targets, or the repo's default branch if no +PR/MR exists. Use the result as "the base branch" in all subsequent steps. + +**If GitHub:** +1. `gh pr view --json baseRefName -q .baseRefName` — if succeeds, use it +2. `gh repo view --json defaultBranchRef -q .defaultBranchRef.name` — if succeeds, use it + +**If GitLab:** +1. `glab mr view -F json 2>/dev/null` and extract the `target_branch` field — if succeeds, use it +2. `glab repo view -F json 2>/dev/null` and extract the `default_branch` field — if succeeds, use it + +**Git-native fallback (if unknown platform, or CLI commands fail):** +1. `git symbolic-ref refs/remotes/origin/HEAD 2>/dev/null | sed 's|refs/remotes/origin/||'` +2. If that fails: `git rev-parse --verify origin/main 2>/dev/null` → use `main` +3. If that fails: `git rev-parse --verify origin/master 2>/dev/null` → use `master` + +If all fail, fall back to `main`. Print the detected base branch name. In every subsequent `git diff`, `git log`, -`git fetch`, `git merge`, and `gh pr create` command, substitute the detected -branch name wherever the instructions say "the base branch." 
+`git fetch`, `git merge`, and PR/MR creation command, substitute the detected +branch name wherever the instructions say "the base branch" or `<default>`. --- @@ -585,14 +569,20 @@ EOF git push ``` -**PR body update (idempotent, race-safe):** +**PR/MR body update (idempotent, race-safe):** -1. Read the existing PR body into a PID-unique tempfile: +1. Read the existing PR/MR body into a PID-unique tempfile (use the platform detected in Step 0): +**If GitHub:** ```bash gh pr view --json body -q .body > /tmp/gstack-pr-body-$$.md ``` +**If GitLab:** +```bash +glab mr view -F json 2>/dev/null | python3 -c "import sys,json; print(json.load(sys.stdin).get('description',''))" > /tmp/gstack-pr-body-$$.md +``` + 2. If the tempfile already contains a `## Documentation` section, replace that section with the updated content. If it does not contain one, append a `## Documentation` section at the end. @@ -602,18 +592,28 @@ gh pr view --json body -q .body > /tmp/gstack-pr-body-$$.md 4. Write the updated body back: +**If GitHub:** ```bash gh pr edit --body-file /tmp/gstack-pr-body-$$.md ``` +**If GitLab:** +Read the contents of `/tmp/gstack-pr-body-$$.md` using the Read tool, then pass it to `glab mr update` using a heredoc to avoid shell metacharacter issues: +```bash +glab mr update -d "$(cat <<'MRBODY' +<paste the file contents here> +MRBODY +)" +``` + 5. Clean up the tempfile: ```bash rm -f /tmp/gstack-pr-body-$$.md ``` -6. If `gh pr view` fails (no PR exists): skip with message "No PR found — skipping body update." -7. If `gh pr edit` fails: warn "Could not update PR body — documentation changes are in the +6. If `gh pr view` / `glab mr view` fails (no PR/MR exists): skip with message "No PR/MR found — skipping body update." +7. If `gh pr edit` / `glab mr update` fails: warn "Could not update PR/MR body — documentation changes are in the commit." and continue.
**Structured doc health summary (final output):** diff --git a/document-release/SKILL.md.tmpl b/document-release/SKILL.md.tmpl index 30cdee0c..5d236ae2 100644 --- a/document-release/SKILL.md.tmpl +++ b/document-release/SKILL.md.tmpl @@ -291,14 +291,20 @@ EOF git push ``` -**PR body update (idempotent, race-safe):** +**PR/MR body update (idempotent, race-safe):** -1. Read the existing PR body into a PID-unique tempfile: +1. Read the existing PR/MR body into a PID-unique tempfile (use the platform detected in Step 0): +**If GitHub:** ```bash gh pr view --json body -q .body > /tmp/gstack-pr-body-$$.md ``` +**If GitLab:** +```bash +glab mr view -F json 2>/dev/null | python3 -c "import sys,json; print(json.load(sys.stdin).get('description',''))" > /tmp/gstack-pr-body-$$.md +``` + 2. If the tempfile already contains a `## Documentation` section, replace that section with the updated content. If it does not contain one, append a `## Documentation` section at the end. @@ -308,18 +314,28 @@ gh pr view --json body -q .body > /tmp/gstack-pr-body-$$.md 4. Write the updated body back: +**If GitHub:** ```bash gh pr edit --body-file /tmp/gstack-pr-body-$$.md ``` +**If GitLab:** +Read the contents of `/tmp/gstack-pr-body-$$.md` using the Read tool, then pass it to `glab mr update` using a heredoc to avoid shell metacharacter issues: +```bash +glab mr update -d "$(cat <<'MRBODY' +<paste the file contents here> +MRBODY +)" +``` + 5. Clean up the tempfile: ```bash rm -f /tmp/gstack-pr-body-$$.md ``` -6. If `gh pr view` fails (no PR exists): skip with message "No PR found — skipping body update." -7. If `gh pr edit` fails: warn "Could not update PR body — documentation changes are in the +6. If `gh pr view` / `glab mr view` fails (no PR/MR exists): skip with message "No PR/MR found — skipping body update." +7. If `gh pr edit` / `glab mr update` fails: warn "Could not update PR/MR body — documentation changes are in the commit." and continue.
**Structured doc health summary (final output):** diff --git a/investigate/SKILL.md b/investigate/SKILL.md index b1df5ca2..4d1cb933 100644 --- a/investigate/SKILL.md +++ b/investigate/SKILL.md @@ -45,9 +45,11 @@ _SESSIONS=$(find ~/.gstack/sessions -mmin -120 -type f 2>/dev/null | wc -l | tr find ~/.gstack/sessions -mmin +120 -type f -delete 2>/dev/null || true _CONTRIB=$(~/.claude/skills/gstack/bin/gstack-config get gstack_contributor 2>/dev/null || true) _PROACTIVE=$(~/.claude/skills/gstack/bin/gstack-config get proactive 2>/dev/null || echo "true") +_PROACTIVE_PROMPTED=$([ -f ~/.gstack/.proactive-prompted ] && echo "yes" || echo "no") _BRANCH=$(git branch --show-current 2>/dev/null || echo "unknown") echo "BRANCH: $_BRANCH" echo "PROACTIVE: $_PROACTIVE" +echo "PROACTIVE_PROMPTED: $_PROACTIVE_PROMPTED" source <(~/.claude/skills/gstack/bin/gstack-repo-mode 2>/dev/null) || true REPO_MODE=${REPO_MODE:-unknown} echo "REPO_MODE: $REPO_MODE" @@ -65,8 +67,11 @@ echo '{"skill":"investigate","ts":"'$(date -u +%Y-%m-%dT%H:%M:%SZ)'","repo":"'$( for _PF in $(find ~/.gstack/analytics -maxdepth 1 -name '.pending-*' 2>/dev/null); do [ -f "$_PF" ] && ~/.claude/skills/gstack/bin/gstack-telemetry-log --event-type skill_run --skill _pending_finalize --outcome unknown --session-id "$_SESSION_ID" 2>/dev/null || true; break; done ``` -If `PROACTIVE` is `"false"`, do not proactively suggest gstack skills — only invoke -them when the user explicitly asks. The user opted out of proactive suggestions. +If `PROACTIVE` is `"false"`, do not proactively suggest gstack skills AND do not +auto-invoke skills based on conversation context. Only run skills the user explicitly +types (e.g., /qa, /ship). If you would have auto-invoked a skill, instead briefly say: +"I think /skillname might help here — want me to run it?" and wait for confirmation. +The user opted out of proactive behavior. 
If output shows `UPGRADE_AVAILABLE <old> <new>`: read `~/.claude/skills/gstack/gstack-upgrade/SKILL.md` and follow the "Inline upgrade flow" (auto-upgrade if configured, otherwise AskUserQuestion with 4 options, write snooze state if declined). If `JUST_UPGRADED <from> <to>`: tell user "Running gstack v{to} (just updated!)" and continue. @@ -115,6 +120,27 @@ touch ~/.gstack/.telemetry-prompted This only happens once. If `TEL_PROMPTED` is `yes`, skip this entirely. +If `PROACTIVE_PROMPTED` is `no` AND `TEL_PROMPTED` is `yes`: After telemetry is handled, +ask the user about proactive behavior. Use AskUserQuestion: + +> gstack can proactively figure out when you might need a skill while you work — +> like suggesting /qa when you say "does this work?" or /investigate when you hit +> a bug. We recommend keeping this on — it speeds up every part of your workflow. + +Options: +- A) Keep it on (recommended) +- B) Turn it off — I'll type /commands myself + +If A: run `~/.claude/skills/gstack/bin/gstack-config set proactive true` +If B: run `~/.claude/skills/gstack/bin/gstack-config set proactive false` + +Always run: +```bash +touch ~/.gstack/.proactive-prompted +``` + +This only happens once. If `PROACTIVE_PROMPTED` is `yes`, skip this entirely. + ## AskUserQuestion Format **ALWAYS follow this structure for every AskUserQuestion call:** @@ -122,7 +148,6 @@ This only happens once. If `TEL_PROMPTED` is `yes`, skip this entirely. 2. **Simplify:** Explain the problem in plain English a smart 16-year-old could follow. No raw function names, no internal jargon, no implementation details. Use concrete examples and analogies. Say what it DOES, not what it's called. 3. **Recommend:** `RECOMMENDATION: Choose [X] because [one-line reason]` — always prefer the complete option over shortcuts (see Completeness Principle). Include `Completeness: X/10` for each option.
Calibration: 10 = complete implementation (all edge cases, full coverage), 7 = covers happy path but skips some edges, 3 = shortcut that defers significant work. If both options are 8+, pick the higher; if one is ≤5, flag it. 4. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` -5. **One decision per question:** NEVER combine multiple independent decisions into a single AskUserQuestion. Each decision gets its own call with its own recommendation and focused options. Batching multiple AskUserQuestion calls in rapid succession is fine and often preferred. Only after all individual taste decisions are resolved should a final "Approve / Revise / Reject" gate be presented. Assume the user hasn't looked at this window in 20 minutes and doesn't have the code open. If you'd need to read the source to understand your own explanation, it's too complex. @@ -130,97 +155,36 @@ Per-skill instructions may add additional formatting rules on top of this baseli ## Completeness Principle — Boil the Lake -AI-assisted coding makes the marginal cost of completeness near-zero. When you present options: +AI makes completeness near-free. Always recommend the complete option over shortcuts — the delta is minutes with CC+gstack. A "lake" (100% coverage, all edge cases) is boilable; an "ocean" (full rewrite, multi-quarter migration) is not. Boil lakes, flag oceans. -- If Option A is the complete implementation (full parity, all edge cases, 100% coverage) and Option B is a shortcut that saves modest effort — **always recommend A**. The delta between 80 lines and 150 lines is meaningless with CC+gstack. "Good enough" is the wrong instinct when "complete" costs minutes more. -- **Lake vs. ocean:** A "lake" is boilable — 100% test coverage for a module, full feature implementation, handling all edge cases, complete error paths. 
An "ocean" is not — rewriting an entire system from scratch, adding features to dependencies you don't control, multi-quarter platform migrations. Recommend boiling lakes. Flag oceans as out of scope. -- **When estimating effort**, always show both scales: human team time and CC+gstack time. The compression ratio varies by task type — use this reference: +**Effort reference** — always show both scales: | Task type | Human team | CC+gstack | Compression | |-----------|-----------|-----------|-------------| -| Boilerplate / scaffolding | 2 days | 15 min | ~100x | -| Test writing | 1 day | 15 min | ~50x | -| Feature implementation | 1 week | 30 min | ~30x | -| Bug fix + regression test | 4 hours | 15 min | ~20x | -| Architecture / design | 2 days | 4 hours | ~5x | -| Research / exploration | 1 day | 3 hours | ~3x | +| Boilerplate | 2 days | 15 min | ~100x | +| Tests | 1 day | 15 min | ~50x | +| Feature | 1 week | 30 min | ~30x | +| Bug fix | 4 hours | 15 min | ~20x | -- This principle applies to test coverage, error handling, documentation, edge cases, and feature completeness. Don't skip the last 10% to "save time" — with AI, that 10% costs seconds. - -**Anti-patterns — DON'T do this:** -- BAD: "Choose B — it covers 90% of the value with less code." (If A is only 70 lines more, choose A.) -- BAD: "We can skip edge case handling to save time." (Edge case handling costs minutes with CC.) -- BAD: "Let's defer test coverage to a follow-up PR." (Tests are the cheapest lake to boil.) -- BAD: Quoting only human-team effort: "This would take 2 weeks." (Say: "2 weeks human / ~1 hour CC.") - -## Repo Ownership Mode — See Something, Say Something - -`REPO_MODE` from the preamble tells you who owns issues in this repo: - -- **`solo`** — One person does 80%+ of the work. They own everything. 
When you notice issues outside the current branch's changes (test failures, deprecation warnings, security advisories, linting errors, dead code, env problems), **investigate and offer to fix proactively**. The solo dev is the only person who will fix it. Default to action. -- **`collaborative`** — Multiple active contributors. When you notice issues outside the branch's changes, **flag them via AskUserQuestion** — it may be someone else's responsibility. Default to asking, not fixing. -- **`unknown`** — Treat as collaborative (safer default — ask before fixing). - -**See Something, Say Something:** Whenever you notice something that looks wrong during ANY workflow step — not just test failures — flag it briefly. One sentence: what you noticed and its impact. In solo mode, follow up with "Want me to fix it?" In collaborative mode, just flag it and move on. - -Never let a noticed issue silently pass. The whole point is proactive communication. - -## Search Before Building - -Before building infrastructure, unfamiliar patterns, or anything the runtime might have a built-in — **search first.** Read `~/.claude/skills/gstack/ETHOS.md` for the full philosophy. - -**Three layers of knowledge:** -- **Layer 1** (tried and true — in distribution). Don't reinvent the wheel. But the cost of checking is near-zero, and once in a while, questioning the tried-and-true is where brilliance occurs. -- **Layer 2** (new and popular — search for these). But scrutinize: humans are subject to mania. Search results are inputs to your thinking, not answers. -- **Layer 3** (first principles — prize these above all). Original observations derived from reasoning about the specific problem. The most valuable of all. - -**Eureka moment:** When first-principles reasoning reveals conventional wisdom is wrong, name it: -"EUREKA: Everyone does X because [assumption]. But [evidence] shows this is wrong. Y is better because [reasoning]." 
- -Log eureka moments: -```bash -jq -n --arg ts "$(date -u +%Y-%m-%dT%H:%M:%SZ)" --arg skill "SKILL_NAME" --arg branch "$(git branch --show-current 2>/dev/null)" --arg insight "ONE_LINE_SUMMARY" '{ts:$ts,skill:$skill,branch:$branch,insight:$insight}' >> ~/.gstack/analytics/eureka.jsonl 2>/dev/null || true -``` -Replace SKILL_NAME and ONE_LINE_SUMMARY. Runs inline — don't stop the workflow. - -**WebSearch fallback:** If WebSearch is unavailable, skip the search step and note: "Search unavailable — proceeding with in-distribution knowledge only." +Include `Completeness: X/10` for each option (10=all edge cases, 7=happy path, 3=shortcut). ## Contributor Mode -If `_CONTRIB` is `true`: you are in **contributor mode**. You're a gstack user who also helps make it better. +If `_CONTRIB` is `true`: you are in **contributor mode**. At the end of each major workflow step, rate your gstack experience 0-10. If not a 10 and there's an actionable bug or improvement — file a field report. -**At the end of each major workflow step** (not after every single command), reflect on the gstack tooling you used. Rate your experience 0 to 10. If it wasn't a 10, think about why. If there is an obvious, actionable bug OR an insightful, interesting thing that could have been done better by gstack code or skill markdown — file a field report. Maybe our contributor will help make us better! - -**Calibration — this is the bar:** For example, `$B js "await fetch(...)"` used to fail with `SyntaxError: await is only valid in async functions` because gstack didn't wrap expressions in async context. Small, but the input was reasonable and gstack should have handled it — that's the kind of thing worth filing. Things less consequential than this, ignore. - -**NOT worth filing:** user's app bugs, network errors to user's URL, auth failures on user's site, user's own JS logic bugs. 
- -**To file:** write `~/.gstack/contributor-logs/{slug}.md` with **all sections below** (do not truncate — include every section through the Date/Version footer): +**File only:** gstack tooling bugs where the input was reasonable but gstack failed. **Skip:** user app bugs, network errors, auth failures on user's site. +**To file:** write `~/.gstack/contributor-logs/{slug}.md`: ``` # {Title} - -Hey gstack team — ran into this while using /{skill-name}: - -**What I was trying to do:** {what the user/agent was attempting} -**What happened instead:** {what actually happened} -**My rating:** {0-10} — {one sentence on why it wasn't a 10} - -## Steps to reproduce +**What I tried:** {action} | **What happened:** {result} | **Rating:** {0-10} +## Repro 1. {step} - -## Raw output -``` -{paste the actual error or unexpected output here} -``` - ## What would make this a 10 -{one sentence: what gstack should have done differently} - -**Date:** {YYYY-MM-DD} | **Version:** {gstack version} | **Skill:** /{skill} +{one sentence} +**Date:** {YYYY-MM-DD} | **Version:** {version} | **Skill:** /{skill} ``` - -Slug: lowercase, hyphens, max 60 chars (e.g. `browse-js-no-await`). Skip if file already exists. Max 3 reports per session. File inline and continue — don't stop the workflow. Tell user: "Filed gstack field report: {title}" +Slug: lowercase hyphens, max 60 chars. Skip if exists. Max 3/session. File inline, don't stop. 
## Completion Status Protocol diff --git a/land-and-deploy/SKILL.md b/land-and-deploy/SKILL.md index 85e52e4e..131c1f2d 100644 --- a/land-and-deploy/SKILL.md +++ b/land-and-deploy/SKILL.md @@ -28,9 +28,11 @@ _SESSIONS=$(find ~/.gstack/sessions -mmin -120 -type f 2>/dev/null | wc -l | tr find ~/.gstack/sessions -mmin +120 -type f -delete 2>/dev/null || true _CONTRIB=$(~/.claude/skills/gstack/bin/gstack-config get gstack_contributor 2>/dev/null || true) _PROACTIVE=$(~/.claude/skills/gstack/bin/gstack-config get proactive 2>/dev/null || echo "true") +_PROACTIVE_PROMPTED=$([ -f ~/.gstack/.proactive-prompted ] && echo "yes" || echo "no") _BRANCH=$(git branch --show-current 2>/dev/null || echo "unknown") echo "BRANCH: $_BRANCH" echo "PROACTIVE: $_PROACTIVE" +echo "PROACTIVE_PROMPTED: $_PROACTIVE_PROMPTED" source <(~/.claude/skills/gstack/bin/gstack-repo-mode 2>/dev/null) || true REPO_MODE=${REPO_MODE:-unknown} echo "REPO_MODE: $REPO_MODE" @@ -48,8 +50,11 @@ echo '{"skill":"land-and-deploy","ts":"'$(date -u +%Y-%m-%dT%H:%M:%SZ)'","repo": for _PF in $(find ~/.gstack/analytics -maxdepth 1 -name '.pending-*' 2>/dev/null); do [ -f "$_PF" ] && ~/.claude/skills/gstack/bin/gstack-telemetry-log --event-type skill_run --skill _pending_finalize --outcome unknown --session-id "$_SESSION_ID" 2>/dev/null || true; break; done ``` -If `PROACTIVE` is `"false"`, do not proactively suggest gstack skills — only invoke -them when the user explicitly asks. The user opted out of proactive suggestions. +If `PROACTIVE` is `"false"`, do not proactively suggest gstack skills AND do not +auto-invoke skills based on conversation context. Only run skills the user explicitly +types (e.g., /qa, /ship). If you would have auto-invoked a skill, instead briefly say: +"I think /skillname might help here — want me to run it?" and wait for confirmation. +The user opted out of proactive behavior. 
If output shows `UPGRADE_AVAILABLE <old> <new>`: read `~/.claude/skills/gstack/gstack-upgrade/SKILL.md` and follow the "Inline upgrade flow" (auto-upgrade if configured, otherwise AskUserQuestion with 4 options, write snooze state if declined). If `JUST_UPGRADED <from> <to>`: tell user "Running gstack v{to} (just updated!)" and continue. @@ -98,6 +103,27 @@ touch ~/.gstack/.telemetry-prompted This only happens once. If `TEL_PROMPTED` is `yes`, skip this entirely. +If `PROACTIVE_PROMPTED` is `no` AND `TEL_PROMPTED` is `yes`: After telemetry is handled, +ask the user about proactive behavior. Use AskUserQuestion: + +> gstack can proactively figure out when you might need a skill while you work — +> like suggesting /qa when you say "does this work?" or /investigate when you hit +> a bug. We recommend keeping this on — it speeds up every part of your workflow. + +Options: +- A) Keep it on (recommended) +- B) Turn it off — I'll type /commands myself + +If A: run `~/.claude/skills/gstack/bin/gstack-config set proactive true` +If B: run `~/.claude/skills/gstack/bin/gstack-config set proactive false` + +Always run: +```bash +touch ~/.gstack/.proactive-prompted +``` + +This only happens once. If `PROACTIVE_PROMPTED` is `yes`, skip this entirely. + ## AskUserQuestion Format **ALWAYS follow this structure for every AskUserQuestion call:** @@ -105,7 +131,6 @@ This only happens once. If `TEL_PROMPTED` is `yes`, skip this entirely. 2. **Simplify:** Explain the problem in plain English a smart 16-year-old could follow. No raw function names, no internal jargon, no implementation details. Use concrete examples and analogies. Say what it DOES, not what it's called. 3. **Recommend:** `RECOMMENDATION: Choose [X] because [one-line reason]` — always prefer the complete option over shortcuts (see Completeness Principle). Include `Completeness: X/10` for each option.
Calibration: 10 = complete implementation (all edge cases, full coverage), 7 = covers happy path but skips some edges, 3 = shortcut that defers significant work. If both options are 8+, pick the higher; if one is ≤5, flag it. 4. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` -5. **One decision per question:** NEVER combine multiple independent decisions into a single AskUserQuestion. Each decision gets its own call with its own recommendation and focused options. Batching multiple AskUserQuestion calls in rapid succession is fine and often preferred. Only after all individual taste decisions are resolved should a final "Approve / Revise / Reject" gate be presented. Assume the user hasn't looked at this window in 20 minutes and doesn't have the code open. If you'd need to read the source to understand your own explanation, it's too complex. @@ -113,97 +138,54 @@ Per-skill instructions may add additional formatting rules on top of this baseli ## Completeness Principle — Boil the Lake -AI-assisted coding makes the marginal cost of completeness near-zero. When you present options: +AI makes completeness near-free. Always recommend the complete option over shortcuts — the delta is minutes with CC+gstack. A "lake" (100% coverage, all edge cases) is boilable; an "ocean" (full rewrite, multi-quarter migration) is not. Boil lakes, flag oceans. -- If Option A is the complete implementation (full parity, all edge cases, 100% coverage) and Option B is a shortcut that saves modest effort — **always recommend A**. The delta between 80 lines and 150 lines is meaningless with CC+gstack. "Good enough" is the wrong instinct when "complete" costs minutes more. -- **Lake vs. ocean:** A "lake" is boilable — 100% test coverage for a module, full feature implementation, handling all edge cases, complete error paths. 
An "ocean" is not — rewriting an entire system from scratch, adding features to dependencies you don't control, multi-quarter platform migrations. Recommend boiling lakes. Flag oceans as out of scope. -- **When estimating effort**, always show both scales: human team time and CC+gstack time. The compression ratio varies by task type — use this reference: +**Effort reference** — always show both scales: | Task type | Human team | CC+gstack | Compression | |-----------|-----------|-----------|-------------| -| Boilerplate / scaffolding | 2 days | 15 min | ~100x | -| Test writing | 1 day | 15 min | ~50x | -| Feature implementation | 1 week | 30 min | ~30x | -| Bug fix + regression test | 4 hours | 15 min | ~20x | -| Architecture / design | 2 days | 4 hours | ~5x | -| Research / exploration | 1 day | 3 hours | ~3x | +| Boilerplate | 2 days | 15 min | ~100x | +| Tests | 1 day | 15 min | ~50x | +| Feature | 1 week | 30 min | ~30x | +| Bug fix | 4 hours | 15 min | ~20x | -- This principle applies to test coverage, error handling, documentation, edge cases, and feature completeness. Don't skip the last 10% to "save time" — with AI, that 10% costs seconds. +Include `Completeness: X/10` for each option (10=all edge cases, 7=happy path, 3=shortcut). -**Anti-patterns — DON'T do this:** -- BAD: "Choose B — it covers 90% of the value with less code." (If A is only 70 lines more, choose A.) -- BAD: "We can skip edge case handling to save time." (Edge case handling costs minutes with CC.) -- BAD: "Let's defer test coverage to a follow-up PR." (Tests are the cheapest lake to boil.) -- BAD: Quoting only human-team effort: "This would take 2 weeks." (Say: "2 weeks human / ~1 hour CC.") +## Repo Ownership — See Something, Say Something -## Repo Ownership Mode — See Something, Say Something +`REPO_MODE` controls how to handle issues outside your branch: +- **`solo`** — You own everything. Investigate and offer to fix proactively. 
+- **`collaborative`** / **`unknown`** — Flag via AskUserQuestion, don't fix (may be someone else's). -`REPO_MODE` from the preamble tells you who owns issues in this repo: - -- **`solo`** — One person does 80%+ of the work. They own everything. When you notice issues outside the current branch's changes (test failures, deprecation warnings, security advisories, linting errors, dead code, env problems), **investigate and offer to fix proactively**. The solo dev is the only person who will fix it. Default to action. -- **`collaborative`** — Multiple active contributors. When you notice issues outside the branch's changes, **flag them via AskUserQuestion** — it may be someone else's responsibility. Default to asking, not fixing. -- **`unknown`** — Treat as collaborative (safer default — ask before fixing). - -**See Something, Say Something:** Whenever you notice something that looks wrong during ANY workflow step — not just test failures — flag it briefly. One sentence: what you noticed and its impact. In solo mode, follow up with "Want me to fix it?" In collaborative mode, just flag it and move on. - -Never let a noticed issue silently pass. The whole point is proactive communication. +Always flag anything that looks wrong — one sentence, what you noticed and its impact. ## Search Before Building -Before building infrastructure, unfamiliar patterns, or anything the runtime might have a built-in — **search first.** Read `~/.claude/skills/gstack/ETHOS.md` for the full philosophy. +Before building anything unfamiliar, **search first.** See `~/.claude/skills/gstack/ETHOS.md`. +- **Layer 1** (tried and true) — don't reinvent. **Layer 2** (new and popular) — scrutinize. **Layer 3** (first principles) — prize above all. -**Three layers of knowledge:** -- **Layer 1** (tried and true — in distribution). Don't reinvent the wheel. But the cost of checking is near-zero, and once in a while, questioning the tried-and-true is where brilliance occurs. 
-- **Layer 2** (new and popular — search for these). But scrutinize: humans are subject to mania. Search results are inputs to your thinking, not answers. -- **Layer 3** (first principles — prize these above all). Original observations derived from reasoning about the specific problem. The most valuable of all. - -**Eureka moment:** When first-principles reasoning reveals conventional wisdom is wrong, name it: -"EUREKA: Everyone does X because [assumption]. But [evidence] shows this is wrong. Y is better because [reasoning]." - -Log eureka moments: +**Eureka:** When first-principles reasoning contradicts conventional wisdom, name it and log: ```bash jq -n --arg ts "$(date -u +%Y-%m-%dT%H:%M:%SZ)" --arg skill "SKILL_NAME" --arg branch "$(git branch --show-current 2>/dev/null)" --arg insight "ONE_LINE_SUMMARY" '{ts:$ts,skill:$skill,branch:$branch,insight:$insight}' >> ~/.gstack/analytics/eureka.jsonl 2>/dev/null || true ``` -Replace SKILL_NAME and ONE_LINE_SUMMARY. Runs inline — don't stop the workflow. - -**WebSearch fallback:** If WebSearch is unavailable, skip the search step and note: "Search unavailable — proceeding with in-distribution knowledge only." ## Contributor Mode -If `_CONTRIB` is `true`: you are in **contributor mode**. You're a gstack user who also helps make it better. +If `_CONTRIB` is `true`: you are in **contributor mode**. At the end of each major workflow step, rate your gstack experience 0-10. If not a 10 and there's an actionable bug or improvement — file a field report. -**At the end of each major workflow step** (not after every single command), reflect on the gstack tooling you used. Rate your experience 0 to 10. If it wasn't a 10, think about why. If there is an obvious, actionable bug OR an insightful, interesting thing that could have been done better by gstack code or skill markdown — file a field report. Maybe our contributor will help make us better! 
- -**Calibration — this is the bar:** For example, `$B js "await fetch(...)"` used to fail with `SyntaxError: await is only valid in async functions` because gstack didn't wrap expressions in async context. Small, but the input was reasonable and gstack should have handled it — that's the kind of thing worth filing. Things less consequential than this, ignore. - -**NOT worth filing:** user's app bugs, network errors to user's URL, auth failures on user's site, user's own JS logic bugs. - -**To file:** write `~/.gstack/contributor-logs/{slug}.md` with **all sections below** (do not truncate — include every section through the Date/Version footer): +**File only:** gstack tooling bugs where the input was reasonable but gstack failed. **Skip:** user app bugs, network errors, auth failures on user's site. +**To file:** write `~/.gstack/contributor-logs/{slug}.md`: ``` # {Title} - -Hey gstack team — ran into this while using /{skill-name}: - -**What I was trying to do:** {what the user/agent was attempting} -**What happened instead:** {what actually happened} -**My rating:** {0-10} — {one sentence on why it wasn't a 10} - -## Steps to reproduce +**What I tried:** {action} | **What happened:** {result} | **Rating:** {0-10} +## Repro 1. {step} - -## Raw output -``` -{paste the actual error or unexpected output here} -``` - ## What would make this a 10 -{one sentence: what gstack should have done differently} - -**Date:** {YYYY-MM-DD} | **Version:** {gstack version} | **Skill:** /{skill} +{one sentence} +**Date:** {YYYY-MM-DD} | **Version:** {version} | **Skill:** /{skill} ``` - -Slug: lowercase, hyphens, max 60 chars (e.g. `browse-js-no-await`). Skip if file already exists. Max 3 reports per session. File inline and continue — don't stop the workflow. Tell user: "Filed gstack field report: {title}" +Slug: lowercase hyphens, max 60 chars. Skip if exists. Max 3/session. File inline, don't stop. ## Completion Status Protocol @@ -313,25 +295,47 @@ If `NEEDS_SETUP`: 2. 
Run: `cd <SKILL_DIR> && ./setup` 3. If `bun` is not installed: `curl -fsSL https://bun.sh/install | bash` -## Step 0: Detect base branch +## Step 0: Detect platform and base branch -Determine which branch this PR targets. Use the result as "the base branch" in all subsequent steps. +First, detect the git hosting platform from the remote URL: -1. Check if a PR already exists for this branch: - `gh pr view --json baseRefName -q .baseRefName` - If this succeeds, use the printed branch name as the base branch. +```bash +git remote get-url origin 2>/dev/null +``` -2. If no PR exists (command fails), detect the repo's default branch: - `gh repo view --json defaultBranchRef -q .defaultBranchRef.name` +- If the URL contains "github.com" → platform is **GitHub** +- If the URL contains "gitlab" → platform is **GitLab** +- Otherwise, check CLI availability: + - `gh auth status 2>/dev/null` succeeds → platform is **GitHub** (covers GitHub Enterprise) + - `glab auth status 2>/dev/null` succeeds → platform is **GitLab** (covers self-hosted) + - Neither → **unknown** (use git-native commands only) -3. If both commands fail, fall back to `main`. +Determine which branch this PR/MR targets, or the repo's default branch if no +PR/MR exists. Use the result as "the base branch" in all subsequent steps. + +**If GitHub:** +1. `gh pr view --json baseRefName -q .baseRefName` — if succeeds, use it +2. `gh repo view --json defaultBranchRef -q .defaultBranchRef.name` — if succeeds, use it + +**If GitLab:** +1. `glab mr view -F json 2>/dev/null` and extract the `target_branch` field — if succeeds, use it +2. `glab repo view -F json 2>/dev/null` and extract the `default_branch` field — if succeeds, use it + +**Git-native fallback (if unknown platform, or CLI commands fail):** +1. `git symbolic-ref refs/remotes/origin/HEAD 2>/dev/null | sed 's|refs/remotes/origin/||'` +2. If that fails: `git rev-parse --verify origin/main 2>/dev/null` → use `main` +3.
If that fails: `git rev-parse --verify origin/master 2>/dev/null` → use `master` + +If all fail, fall back to `main`. Print the detected base branch name. In every subsequent `git diff`, `git log`, -`git fetch`, `git merge`, and `gh pr create` command, substitute the detected -branch name wherever the instructions say "the base branch." +`git fetch`, `git merge`, and PR/MR creation command, substitute the detected +branch name wherever the instructions say "the base branch" or show a placeholder such as `<base>` or `<default>`. --- +**If the platform detected above is GitLab or unknown:** STOP with: "GitLab support for /land-and-deploy is not yet implemented. Run `/ship` to create the MR, then merge manually via the GitLab web UI." Do not proceed. + # /land-and-deploy — Merge, Deploy, Verify You are a **Release Engineer** who has deployed to production thousands of times. You know the two worst feelings in software: the merge that breaks prod, and the merge that sits in queue for 45 minutes while you stare at the screen. Your job is to handle both gracefully — merge efficiently, wait intelligently, verify thoroughly, and give the user a clear verdict. diff --git a/land-and-deploy/SKILL.md.tmpl b/land-and-deploy/SKILL.md.tmpl index a82a75a2..7fcf6797 100644 --- a/land-and-deploy/SKILL.md.tmpl +++ b/land-and-deploy/SKILL.md.tmpl @@ -21,6 +21,8 @@ allowed-tools: {{BASE_BRANCH_DETECT}} +**If the platform detected above is GitLab or unknown:** STOP with: "GitLab support for /land-and-deploy is not yet implemented. Run `/ship` to create the MR, then merge manually via the GitLab web UI." Do not proceed. + # /land-and-deploy — Merge, Deploy, Verify You are a **Release Engineer** who has deployed to production thousands of times. You know the two worst feelings in software: the merge that breaks prod, and the merge that sits in queue for 45 minutes while you stare at the screen. Your job is to handle both gracefully — merge efficiently, wait intelligently, verify thoroughly, and give the user a clear verdict.
diff --git a/office-hours/SKILL.md b/office-hours/SKILL.md index 8bf43efa..9e2debd4 100644 --- a/office-hours/SKILL.md +++ b/office-hours/SKILL.md @@ -36,9 +36,11 @@ _SESSIONS=$(find ~/.gstack/sessions -mmin -120 -type f 2>/dev/null | wc -l | tr find ~/.gstack/sessions -mmin +120 -type f -delete 2>/dev/null || true _CONTRIB=$(~/.claude/skills/gstack/bin/gstack-config get gstack_contributor 2>/dev/null || true) _PROACTIVE=$(~/.claude/skills/gstack/bin/gstack-config get proactive 2>/dev/null || echo "true") +_PROACTIVE_PROMPTED=$([ -f ~/.gstack/.proactive-prompted ] && echo "yes" || echo "no") _BRANCH=$(git branch --show-current 2>/dev/null || echo "unknown") echo "BRANCH: $_BRANCH" echo "PROACTIVE: $_PROACTIVE" +echo "PROACTIVE_PROMPTED: $_PROACTIVE_PROMPTED" source <(~/.claude/skills/gstack/bin/gstack-repo-mode 2>/dev/null) || true REPO_MODE=${REPO_MODE:-unknown} echo "REPO_MODE: $REPO_MODE" @@ -56,8 +58,11 @@ echo '{"skill":"office-hours","ts":"'$(date -u +%Y-%m-%dT%H:%M:%SZ)'","repo":"'$ for _PF in $(find ~/.gstack/analytics -maxdepth 1 -name '.pending-*' 2>/dev/null); do [ -f "$_PF" ] && ~/.claude/skills/gstack/bin/gstack-telemetry-log --event-type skill_run --skill _pending_finalize --outcome unknown --session-id "$_SESSION_ID" 2>/dev/null || true; break; done ``` -If `PROACTIVE` is `"false"`, do not proactively suggest gstack skills — only invoke -them when the user explicitly asks. The user opted out of proactive suggestions. +If `PROACTIVE` is `"false"`, do not proactively suggest gstack skills AND do not +auto-invoke skills based on conversation context. Only run skills the user explicitly +types (e.g., /qa, /ship). If you would have auto-invoked a skill, instead briefly say: +"I think /skillname might help here — want me to run it?" and wait for confirmation. +The user opted out of proactive behavior. 
If output shows `UPGRADE_AVAILABLE <old> <new>`: read `~/.claude/skills/gstack/gstack-upgrade/SKILL.md` and follow the "Inline upgrade flow" (auto-upgrade if configured, otherwise AskUserQuestion with 4 options, write snooze state if declined). If `JUST_UPGRADED <from> <to>`: tell user "Running gstack v{to} (just updated!)" and continue. @@ -106,6 +111,27 @@ touch ~/.gstack/.telemetry-prompted This only happens once. If `TEL_PROMPTED` is `yes`, skip this entirely. +If `PROACTIVE_PROMPTED` is `no` AND `TEL_PROMPTED` is `yes`: After telemetry is handled, +ask the user about proactive behavior. Use AskUserQuestion: + +> gstack can proactively figure out when you might need a skill while you work — +> like suggesting /qa when you say "does this work?" or /investigate when you hit +> a bug. We recommend keeping this on — it speeds up every part of your workflow. + +Options: +- A) Keep it on (recommended) +- B) Turn it off — I'll type /commands myself + +If A: run `~/.claude/skills/gstack/bin/gstack-config set proactive true` +If B: run `~/.claude/skills/gstack/bin/gstack-config set proactive false` + +Always run: +```bash +touch ~/.gstack/.proactive-prompted +``` + +This only happens once. If `PROACTIVE_PROMPTED` is `yes`, skip this entirely. + ## AskUserQuestion Format **ALWAYS follow this structure for every AskUserQuestion call:** @@ -113,7 +139,6 @@ This only happens once. If `TEL_PROMPTED` is `yes`, skip this entirely. 2. **Simplify:** Explain the problem in plain English a smart 16-year-old could follow. No raw function names, no internal jargon, no implementation details. Use concrete examples and analogies. Say what it DOES, not what it's called. 3. **Recommend:** `RECOMMENDATION: Choose [X] because [one-line reason]` — always prefer the complete option over shortcuts (see Completeness Principle). Include `Completeness: X/10` for each option.
Calibration: 10 = complete implementation (all edge cases, full coverage), 7 = covers happy path but skips some edges, 3 = shortcut that defers significant work. If both options are 8+, pick the higher; if one is ≤5, flag it. 4. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` -5. **One decision per question:** NEVER combine multiple independent decisions into a single AskUserQuestion. Each decision gets its own call with its own recommendation and focused options. Batching multiple AskUserQuestion calls in rapid succession is fine and often preferred. Only after all individual taste decisions are resolved should a final "Approve / Revise / Reject" gate be presented. Assume the user hasn't looked at this window in 20 minutes and doesn't have the code open. If you'd need to read the source to understand your own explanation, it's too complex. @@ -121,97 +146,54 @@ Per-skill instructions may add additional formatting rules on top of this baseli ## Completeness Principle — Boil the Lake -AI-assisted coding makes the marginal cost of completeness near-zero. When you present options: +AI makes completeness near-free. Always recommend the complete option over shortcuts — the delta is minutes with CC+gstack. A "lake" (100% coverage, all edge cases) is boilable; an "ocean" (full rewrite, multi-quarter migration) is not. Boil lakes, flag oceans. -- If Option A is the complete implementation (full parity, all edge cases, 100% coverage) and Option B is a shortcut that saves modest effort — **always recommend A**. The delta between 80 lines and 150 lines is meaningless with CC+gstack. "Good enough" is the wrong instinct when "complete" costs minutes more. -- **Lake vs. ocean:** A "lake" is boilable — 100% test coverage for a module, full feature implementation, handling all edge cases, complete error paths. 
An "ocean" is not — rewriting an entire system from scratch, adding features to dependencies you don't control, multi-quarter platform migrations. Recommend boiling lakes. Flag oceans as out of scope. -- **When estimating effort**, always show both scales: human team time and CC+gstack time. The compression ratio varies by task type — use this reference: +**Effort reference** — always show both scales: | Task type | Human team | CC+gstack | Compression | |-----------|-----------|-----------|-------------| -| Boilerplate / scaffolding | 2 days | 15 min | ~100x | -| Test writing | 1 day | 15 min | ~50x | -| Feature implementation | 1 week | 30 min | ~30x | -| Bug fix + regression test | 4 hours | 15 min | ~20x | -| Architecture / design | 2 days | 4 hours | ~5x | -| Research / exploration | 1 day | 3 hours | ~3x | +| Boilerplate | 2 days | 15 min | ~100x | +| Tests | 1 day | 15 min | ~50x | +| Feature | 1 week | 30 min | ~30x | +| Bug fix | 4 hours | 15 min | ~20x | -- This principle applies to test coverage, error handling, documentation, edge cases, and feature completeness. Don't skip the last 10% to "save time" — with AI, that 10% costs seconds. +Include `Completeness: X/10` for each option (10=all edge cases, 7=happy path, 3=shortcut). -**Anti-patterns — DON'T do this:** -- BAD: "Choose B — it covers 90% of the value with less code." (If A is only 70 lines more, choose A.) -- BAD: "We can skip edge case handling to save time." (Edge case handling costs minutes with CC.) -- BAD: "Let's defer test coverage to a follow-up PR." (Tests are the cheapest lake to boil.) -- BAD: Quoting only human-team effort: "This would take 2 weeks." (Say: "2 weeks human / ~1 hour CC.") +## Repo Ownership — See Something, Say Something -## Repo Ownership Mode — See Something, Say Something +`REPO_MODE` controls how to handle issues outside your branch: +- **`solo`** — You own everything. Investigate and offer to fix proactively. 
+- **`collaborative`** / **`unknown`** — Flag via AskUserQuestion, don't fix (may be someone else's). -`REPO_MODE` from the preamble tells you who owns issues in this repo: - -- **`solo`** — One person does 80%+ of the work. They own everything. When you notice issues outside the current branch's changes (test failures, deprecation warnings, security advisories, linting errors, dead code, env problems), **investigate and offer to fix proactively**. The solo dev is the only person who will fix it. Default to action. -- **`collaborative`** — Multiple active contributors. When you notice issues outside the branch's changes, **flag them via AskUserQuestion** — it may be someone else's responsibility. Default to asking, not fixing. -- **`unknown`** — Treat as collaborative (safer default — ask before fixing). - -**See Something, Say Something:** Whenever you notice something that looks wrong during ANY workflow step — not just test failures — flag it briefly. One sentence: what you noticed and its impact. In solo mode, follow up with "Want me to fix it?" In collaborative mode, just flag it and move on. - -Never let a noticed issue silently pass. The whole point is proactive communication. +Always flag anything that looks wrong — one sentence, what you noticed and its impact. ## Search Before Building -Before building infrastructure, unfamiliar patterns, or anything the runtime might have a built-in — **search first.** Read `~/.claude/skills/gstack/ETHOS.md` for the full philosophy. +Before building anything unfamiliar, **search first.** See `~/.claude/skills/gstack/ETHOS.md`. +- **Layer 1** (tried and true) — don't reinvent. **Layer 2** (new and popular) — scrutinize. **Layer 3** (first principles) — prize above all. -**Three layers of knowledge:** -- **Layer 1** (tried and true — in distribution). Don't reinvent the wheel. But the cost of checking is near-zero, and once in a while, questioning the tried-and-true is where brilliance occurs. 
-- **Layer 2** (new and popular — search for these). But scrutinize: humans are subject to mania. Search results are inputs to your thinking, not answers. -- **Layer 3** (first principles — prize these above all). Original observations derived from reasoning about the specific problem. The most valuable of all. - -**Eureka moment:** When first-principles reasoning reveals conventional wisdom is wrong, name it: -"EUREKA: Everyone does X because [assumption]. But [evidence] shows this is wrong. Y is better because [reasoning]." - -Log eureka moments: +**Eureka:** When first-principles reasoning contradicts conventional wisdom, name it and log: ```bash jq -n --arg ts "$(date -u +%Y-%m-%dT%H:%M:%SZ)" --arg skill "SKILL_NAME" --arg branch "$(git branch --show-current 2>/dev/null)" --arg insight "ONE_LINE_SUMMARY" '{ts:$ts,skill:$skill,branch:$branch,insight:$insight}' >> ~/.gstack/analytics/eureka.jsonl 2>/dev/null || true ``` -Replace SKILL_NAME and ONE_LINE_SUMMARY. Runs inline — don't stop the workflow. - -**WebSearch fallback:** If WebSearch is unavailable, skip the search step and note: "Search unavailable — proceeding with in-distribution knowledge only." ## Contributor Mode -If `_CONTRIB` is `true`: you are in **contributor mode**. You're a gstack user who also helps make it better. +If `_CONTRIB` is `true`: you are in **contributor mode**. At the end of each major workflow step, rate your gstack experience 0-10. If not a 10 and there's an actionable bug or improvement — file a field report. -**At the end of each major workflow step** (not after every single command), reflect on the gstack tooling you used. Rate your experience 0 to 10. If it wasn't a 10, think about why. If there is an obvious, actionable bug OR an insightful, interesting thing that could have been done better by gstack code or skill markdown — file a field report. Maybe our contributor will help make us better! 
- -**Calibration — this is the bar:** For example, `$B js "await fetch(...)"` used to fail with `SyntaxError: await is only valid in async functions` because gstack didn't wrap expressions in async context. Small, but the input was reasonable and gstack should have handled it — that's the kind of thing worth filing. Things less consequential than this, ignore. - -**NOT worth filing:** user's app bugs, network errors to user's URL, auth failures on user's site, user's own JS logic bugs. - -**To file:** write `~/.gstack/contributor-logs/{slug}.md` with **all sections below** (do not truncate — include every section through the Date/Version footer): +**File only:** gstack tooling bugs where the input was reasonable but gstack failed. **Skip:** user app bugs, network errors, auth failures on user's site. +**To file:** write `~/.gstack/contributor-logs/{slug}.md`: ``` # {Title} - -Hey gstack team — ran into this while using /{skill-name}: - -**What I was trying to do:** {what the user/agent was attempting} -**What happened instead:** {what actually happened} -**My rating:** {0-10} — {one sentence on why it wasn't a 10} - -## Steps to reproduce +**What I tried:** {action} | **What happened:** {result} | **Rating:** {0-10} +## Repro 1. {step} - -## Raw output -``` -{paste the actual error or unexpected output here} -``` - ## What would make this a 10 -{one sentence: what gstack should have done differently} - -**Date:** {YYYY-MM-DD} | **Version:** {gstack version} | **Skill:** /{skill} +{one sentence} +**Date:** {YYYY-MM-DD} | **Version:** {version} | **Skill:** /{skill} ``` - -Slug: lowercase, hyphens, max 60 chars (e.g. `browse-js-no-await`). Skip if file already exists. Max 3 reports per session. File inline and continue — don't stop the workflow. Tell user: "Filed gstack field report: {title}" +Slug: lowercase hyphens, max 60 chars. Skip if exists. Max 3/session. File inline, don't stop. 
## Completion Status Protocol @@ -688,7 +670,7 @@ Write the full prompt (context block + instructions) to this file. Use the mode- ```bash TMPERR_OH=$(mktemp /tmp/codex-oh-err-XXXXXXXX) -codex exec "$(cat "$CODEX_PROMPT_FILE")" -s read-only -c 'model_reasoning_effort="xhigh"' --enable web_search_cached 2>"$TMPERR_OH" +codex exec "$(cat "$CODEX_PROMPT_FILE")" -C "$(git rev-parse --show-toplevel)" -s read-only -c 'model_reasoning_effort="xhigh"' --enable web_search_cached 2>"$TMPERR_OH" ``` Use a 5-minute timeout (`timeout: 300000`). After the command completes, read stderr: @@ -839,7 +821,7 @@ If user chooses A, launch both voices simultaneously: 1. **Codex** (via Bash, `model_reasoning_effort="medium"`): ```bash TMPERR_SKETCH=$(mktemp /tmp/codex-sketch-XXXXXXXX) -codex exec "For this product approach, provide: a visual thesis (one sentence — mood, material, energy), a content plan (hero → support → detail → CTA), and 2 interaction ideas that change page feel. Apply beautiful defaults: composition-first, brand-first, cardless, poster not document. Be opinionated." -s read-only -c 'model_reasoning_effort="medium"' --enable web_search_cached 2>"$TMPERR_SKETCH" +codex exec "For this product approach, provide: a visual thesis (one sentence — mood, material, energy), a content plan (hero → support → detail → CTA), and 2 interaction ideas that change page feel. Apply beautiful defaults: composition-first, brand-first, cardless, poster not document. Be opinionated." -C "$(git rev-parse --show-toplevel)" -s read-only -c 'model_reasoning_effort="medium"' --enable web_search_cached 2>"$TMPERR_SKETCH" ``` Use a 5-minute timeout (`timeout: 300000`). 
After completion: `cat "$TMPERR_SKETCH" && rm -f "$TMPERR_SKETCH"` diff --git a/package.json b/package.json index f666c9af..130af28f 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "gstack", - "version": "0.11.19.0", + "version": "0.11.20.0", "description": "Garry's Stack — Claude Code skills + fast headless browser. One repo, one install, entire AI engineering workflow.", "license": "MIT", "type": "module", diff --git a/plan-ceo-review/SKILL.md b/plan-ceo-review/SKILL.md index e5d4af6a..d05be05f 100644 --- a/plan-ceo-review/SKILL.md +++ b/plan-ceo-review/SKILL.md @@ -34,9 +34,11 @@ _SESSIONS=$(find ~/.gstack/sessions -mmin -120 -type f 2>/dev/null | wc -l | tr find ~/.gstack/sessions -mmin +120 -type f -delete 2>/dev/null || true _CONTRIB=$(~/.claude/skills/gstack/bin/gstack-config get gstack_contributor 2>/dev/null || true) _PROACTIVE=$(~/.claude/skills/gstack/bin/gstack-config get proactive 2>/dev/null || echo "true") +_PROACTIVE_PROMPTED=$([ -f ~/.gstack/.proactive-prompted ] && echo "yes" || echo "no") _BRANCH=$(git branch --show-current 2>/dev/null || echo "unknown") echo "BRANCH: $_BRANCH" echo "PROACTIVE: $_PROACTIVE" +echo "PROACTIVE_PROMPTED: $_PROACTIVE_PROMPTED" source <(~/.claude/skills/gstack/bin/gstack-repo-mode 2>/dev/null) || true REPO_MODE=${REPO_MODE:-unknown} echo "REPO_MODE: $REPO_MODE" @@ -54,8 +56,11 @@ echo '{"skill":"plan-ceo-review","ts":"'$(date -u +%Y-%m-%dT%H:%M:%SZ)'","repo": for _PF in $(find ~/.gstack/analytics -maxdepth 1 -name '.pending-*' 2>/dev/null); do [ -f "$_PF" ] && ~/.claude/skills/gstack/bin/gstack-telemetry-log --event-type skill_run --skill _pending_finalize --outcome unknown --session-id "$_SESSION_ID" 2>/dev/null || true; break; done ``` -If `PROACTIVE` is `"false"`, do not proactively suggest gstack skills — only invoke -them when the user explicitly asks. The user opted out of proactive suggestions. 
+If `PROACTIVE` is `"false"`, do not proactively suggest gstack skills AND do not +auto-invoke skills based on conversation context. Only run skills the user explicitly +types (e.g., /qa, /ship). If you would have auto-invoked a skill, instead briefly say: +"I think /skillname might help here — want me to run it?" and wait for confirmation. +The user opted out of proactive behavior. If output shows `UPGRADE_AVAILABLE <old> <new>`: read `~/.claude/skills/gstack/gstack-upgrade/SKILL.md` and follow the "Inline upgrade flow" (auto-upgrade if configured, otherwise AskUserQuestion with 4 options, write snooze state if declined). If `JUST_UPGRADED <from> <to>`: tell user "Running gstack v{to} (just updated!)" and continue. @@ -104,6 +109,27 @@ touch ~/.gstack/.telemetry-prompted This only happens once. If `TEL_PROMPTED` is `yes`, skip this entirely. +If `PROACTIVE_PROMPTED` is `no` AND `TEL_PROMPTED` is `yes`: After telemetry is handled, +ask the user about proactive behavior. Use AskUserQuestion: + +> gstack can proactively figure out when you might need a skill while you work — +> like suggesting /qa when you say "does this work?" or /investigate when you hit +> a bug. We recommend keeping this on — it speeds up every part of your workflow. + +Options: +- A) Keep it on (recommended) +- B) Turn it off — I'll type /commands myself + +If A: run `~/.claude/skills/gstack/bin/gstack-config set proactive true` +If B: run `~/.claude/skills/gstack/bin/gstack-config set proactive false` + +Always run: +```bash +touch ~/.gstack/.proactive-prompted +``` + +This only happens once. If `PROACTIVE_PROMPTED` is `yes`, skip this entirely. + ## AskUserQuestion Format **ALWAYS follow this structure for every AskUserQuestion call:** @@ -111,7 +137,6 @@ This only happens once. If `TEL_PROMPTED` is `yes`, skip this entirely. 2. **Simplify:** Explain the problem in plain English a smart 16-year-old could follow. No raw function names, no internal jargon, no implementation details.
Use concrete examples and analogies. Say what it DOES, not what it's called. 3. **Recommend:** `RECOMMENDATION: Choose [X] because [one-line reason]` — always prefer the complete option over shortcuts (see Completeness Principle). Include `Completeness: X/10` for each option. Calibration: 10 = complete implementation (all edge cases, full coverage), 7 = covers happy path but skips some edges, 3 = shortcut that defers significant work. If both options are 8+, pick the higher; if one is ≤5, flag it. 4. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` -5. **One decision per question:** NEVER combine multiple independent decisions into a single AskUserQuestion. Each decision gets its own call with its own recommendation and focused options. Batching multiple AskUserQuestion calls in rapid succession is fine and often preferred. Only after all individual taste decisions are resolved should a final "Approve / Revise / Reject" gate be presented. Assume the user hasn't looked at this window in 20 minutes and doesn't have the code open. If you'd need to read the source to understand your own explanation, it's too complex. @@ -119,97 +144,54 @@ Per-skill instructions may add additional formatting rules on top of this baseli ## Completeness Principle — Boil the Lake -AI-assisted coding makes the marginal cost of completeness near-zero. When you present options: +AI makes completeness near-free. Always recommend the complete option over shortcuts — the delta is minutes with CC+gstack. A "lake" (100% coverage, all edge cases) is boilable; an "ocean" (full rewrite, multi-quarter migration) is not. Boil lakes, flag oceans. -- If Option A is the complete implementation (full parity, all edge cases, 100% coverage) and Option B is a shortcut that saves modest effort — **always recommend A**. The delta between 80 lines and 150 lines is meaningless with CC+gstack. 
"Good enough" is the wrong instinct when "complete" costs minutes more. -- **Lake vs. ocean:** A "lake" is boilable — 100% test coverage for a module, full feature implementation, handling all edge cases, complete error paths. An "ocean" is not — rewriting an entire system from scratch, adding features to dependencies you don't control, multi-quarter platform migrations. Recommend boiling lakes. Flag oceans as out of scope. -- **When estimating effort**, always show both scales: human team time and CC+gstack time. The compression ratio varies by task type — use this reference: +**Effort reference** — always show both scales: | Task type | Human team | CC+gstack | Compression | |-----------|-----------|-----------|-------------| -| Boilerplate / scaffolding | 2 days | 15 min | ~100x | -| Test writing | 1 day | 15 min | ~50x | -| Feature implementation | 1 week | 30 min | ~30x | -| Bug fix + regression test | 4 hours | 15 min | ~20x | -| Architecture / design | 2 days | 4 hours | ~5x | -| Research / exploration | 1 day | 3 hours | ~3x | +| Boilerplate | 2 days | 15 min | ~100x | +| Tests | 1 day | 15 min | ~50x | +| Feature | 1 week | 30 min | ~30x | +| Bug fix | 4 hours | 15 min | ~20x | -- This principle applies to test coverage, error handling, documentation, edge cases, and feature completeness. Don't skip the last 10% to "save time" — with AI, that 10% costs seconds. +Include `Completeness: X/10` for each option (10=all edge cases, 7=happy path, 3=shortcut). -**Anti-patterns — DON'T do this:** -- BAD: "Choose B — it covers 90% of the value with less code." (If A is only 70 lines more, choose A.) -- BAD: "We can skip edge case handling to save time." (Edge case handling costs minutes with CC.) -- BAD: "Let's defer test coverage to a follow-up PR." (Tests are the cheapest lake to boil.) -- BAD: Quoting only human-team effort: "This would take 2 weeks." 
(Say: "2 weeks human / ~1 hour CC.") +## Repo Ownership — See Something, Say Something -## Repo Ownership Mode — See Something, Say Something +`REPO_MODE` controls how to handle issues outside your branch: +- **`solo`** — You own everything. Investigate and offer to fix proactively. +- **`collaborative`** / **`unknown`** — Flag via AskUserQuestion, don't fix (may be someone else's). -`REPO_MODE` from the preamble tells you who owns issues in this repo: - -- **`solo`** — One person does 80%+ of the work. They own everything. When you notice issues outside the current branch's changes (test failures, deprecation warnings, security advisories, linting errors, dead code, env problems), **investigate and offer to fix proactively**. The solo dev is the only person who will fix it. Default to action. -- **`collaborative`** — Multiple active contributors. When you notice issues outside the branch's changes, **flag them via AskUserQuestion** — it may be someone else's responsibility. Default to asking, not fixing. -- **`unknown`** — Treat as collaborative (safer default — ask before fixing). - -**See Something, Say Something:** Whenever you notice something that looks wrong during ANY workflow step — not just test failures — flag it briefly. One sentence: what you noticed and its impact. In solo mode, follow up with "Want me to fix it?" In collaborative mode, just flag it and move on. - -Never let a noticed issue silently pass. The whole point is proactive communication. +Always flag anything that looks wrong — one sentence, what you noticed and its impact. ## Search Before Building -Before building infrastructure, unfamiliar patterns, or anything the runtime might have a built-in — **search first.** Read `~/.claude/skills/gstack/ETHOS.md` for the full philosophy. +Before building anything unfamiliar, **search first.** See `~/.claude/skills/gstack/ETHOS.md`. +- **Layer 1** (tried and true) — don't reinvent. **Layer 2** (new and popular) — scrutinize. 
**Layer 3** (first principles) — prize above all. -**Three layers of knowledge:** -- **Layer 1** (tried and true — in distribution). Don't reinvent the wheel. But the cost of checking is near-zero, and once in a while, questioning the tried-and-true is where brilliance occurs. -- **Layer 2** (new and popular — search for these). But scrutinize: humans are subject to mania. Search results are inputs to your thinking, not answers. -- **Layer 3** (first principles — prize these above all). Original observations derived from reasoning about the specific problem. The most valuable of all. - -**Eureka moment:** When first-principles reasoning reveals conventional wisdom is wrong, name it: -"EUREKA: Everyone does X because [assumption]. But [evidence] shows this is wrong. Y is better because [reasoning]." - -Log eureka moments: +**Eureka:** When first-principles reasoning contradicts conventional wisdom, name it and log: ```bash jq -n --arg ts "$(date -u +%Y-%m-%dT%H:%M:%SZ)" --arg skill "SKILL_NAME" --arg branch "$(git branch --show-current 2>/dev/null)" --arg insight "ONE_LINE_SUMMARY" '{ts:$ts,skill:$skill,branch:$branch,insight:$insight}' >> ~/.gstack/analytics/eureka.jsonl 2>/dev/null || true ``` -Replace SKILL_NAME and ONE_LINE_SUMMARY. Runs inline — don't stop the workflow. - -**WebSearch fallback:** If WebSearch is unavailable, skip the search step and note: "Search unavailable — proceeding with in-distribution knowledge only." ## Contributor Mode -If `_CONTRIB` is `true`: you are in **contributor mode**. You're a gstack user who also helps make it better. +If `_CONTRIB` is `true`: you are in **contributor mode**. At the end of each major workflow step, rate your gstack experience 0-10. If not a 10 and there's an actionable bug or improvement — file a field report. -**At the end of each major workflow step** (not after every single command), reflect on the gstack tooling you used. Rate your experience 0 to 10. If it wasn't a 10, think about why. 
If there is an obvious, actionable bug OR an insightful, interesting thing that could have been done better by gstack code or skill markdown — file a field report. Maybe our contributor will help make us better! - -**Calibration — this is the bar:** For example, `$B js "await fetch(...)"` used to fail with `SyntaxError: await is only valid in async functions` because gstack didn't wrap expressions in async context. Small, but the input was reasonable and gstack should have handled it — that's the kind of thing worth filing. Things less consequential than this, ignore. - -**NOT worth filing:** user's app bugs, network errors to user's URL, auth failures on user's site, user's own JS logic bugs. - -**To file:** write `~/.gstack/contributor-logs/{slug}.md` with **all sections below** (do not truncate — include every section through the Date/Version footer): +**File only:** gstack tooling bugs where the input was reasonable but gstack failed. **Skip:** user app bugs, network errors, auth failures on user's site. +**To file:** write `~/.gstack/contributor-logs/{slug}.md`: ``` # {Title} - -Hey gstack team — ran into this while using /{skill-name}: - -**What I was trying to do:** {what the user/agent was attempting} -**What happened instead:** {what actually happened} -**My rating:** {0-10} — {one sentence on why it wasn't a 10} - -## Steps to reproduce +**What I tried:** {action} | **What happened:** {result} | **Rating:** {0-10} +## Repro 1. {step} - -## Raw output -``` -{paste the actual error or unexpected output here} -``` - ## What would make this a 10 -{one sentence: what gstack should have done differently} - -**Date:** {YYYY-MM-DD} | **Version:** {gstack version} | **Skill:** /{skill} +{one sentence} +**Date:** {YYYY-MM-DD} | **Version:** {version} | **Skill:** /{skill} ``` - -Slug: lowercase, hyphens, max 60 chars (e.g. `browse-js-no-await`). Skip if file already exists. Max 3 reports per session. File inline and continue — don't stop the workflow. 
Tell user: "Filed gstack field report: {title}" +Slug: lowercase hyphens, max 60 chars. Skip if exists. Max 3/session. File inline, don't stop. ## Completion Status Protocol @@ -300,22 +282,42 @@ Then write a `## GSTACK REVIEW REPORT` section to the end of the plan file: file you are allowed to edit in plan mode. The plan file review report is part of the plan's living status. -## Step 0: Detect base branch +## Step 0: Detect platform and base branch -Determine which branch this PR targets. Use the result as "the base branch" in all subsequent steps. +First, detect the git hosting platform from the remote URL: -1. Check if a PR already exists for this branch: - `gh pr view --json baseRefName -q .baseRefName` - If this succeeds, use the printed branch name as the base branch. +```bash +git remote get-url origin 2>/dev/null +``` -2. If no PR exists (command fails), detect the repo's default branch: - `gh repo view --json defaultBranchRef -q .defaultBranchRef.name` +- If the URL contains "github.com" → platform is **GitHub** +- If the URL contains "gitlab" → platform is **GitLab** +- Otherwise, check CLI availability: + - `gh auth status 2>/dev/null` succeeds → platform is **GitHub** (covers GitHub Enterprise) + - `glab auth status 2>/dev/null` succeeds → platform is **GitLab** (covers self-hosted) + - Neither → **unknown** (use git-native commands only) -3. If both commands fail, fall back to `main`. +Determine which branch this PR/MR targets, or the repo's default branch if no +PR/MR exists. Use the result as "the base branch" in all subsequent steps. + +**If GitHub:** +1. `gh pr view --json baseRefName -q .baseRefName` — if succeeds, use it +2. `gh repo view --json defaultBranchRef -q .defaultBranchRef.name` — if succeeds, use it + +**If GitLab:** +1. `glab mr view -F json 2>/dev/null` and extract the `target_branch` field — if succeeds, use it +2. 
`glab repo view -F json 2>/dev/null` and extract the `default_branch` field — if succeeds, use it + +**Git-native fallback (if unknown platform, or CLI commands fail):** +1. `git symbolic-ref refs/remotes/origin/HEAD 2>/dev/null | sed 's|refs/remotes/origin/||'` +2. If that fails: `git rev-parse --verify origin/main 2>/dev/null` → use `main` +3. If that fails: `git rev-parse --verify origin/master 2>/dev/null` → use `master` + +If all fail, fall back to `main`. Print the detected base branch name. In every subsequent `git diff`, `git log`, -`git fetch`, `git merge`, and `gh pr create` command, substitute the detected -branch name wherever the instructions say "the base branch." +`git fetch`, `git merge`, and PR/MR creation command, substitute the detected +branch name wherever the instructions say "the base branch" or `<default>`. --- @@ -1045,7 +1047,7 @@ THE PLAN: ```bash TMPERR_PV=$(mktemp /tmp/codex-planreview-XXXXXXXX) -codex exec "<prompt>" -s read-only -c 'model_reasoning_effort="xhigh"' --enable web_search_cached 2>"$TMPERR_PV" +codex exec "<prompt>" -C "$(git rev-parse --show-toplevel)" -s read-only -c 'model_reasoning_effort="xhigh"' --enable web_search_cached 2>"$TMPERR_PV" ``` Use a 5-minute timeout (`timeout: 300000`).
After the command completes, read stderr: diff --git a/plan-design-review/SKILL.md b/plan-design-review/SKILL.md index 9b45e8c8..5960ea18 100644 --- a/plan-design-review/SKILL.md +++ b/plan-design-review/SKILL.md @@ -32,9 +32,11 @@ _SESSIONS=$(find ~/.gstack/sessions -mmin -120 -type f 2>/dev/null | wc -l | tr find ~/.gstack/sessions -mmin +120 -type f -delete 2>/dev/null || true _CONTRIB=$(~/.claude/skills/gstack/bin/gstack-config get gstack_contributor 2>/dev/null || true) _PROACTIVE=$(~/.claude/skills/gstack/bin/gstack-config get proactive 2>/dev/null || echo "true") +_PROACTIVE_PROMPTED=$([ -f ~/.gstack/.proactive-prompted ] && echo "yes" || echo "no") _BRANCH=$(git branch --show-current 2>/dev/null || echo "unknown") echo "BRANCH: $_BRANCH" echo "PROACTIVE: $_PROACTIVE" +echo "PROACTIVE_PROMPTED: $_PROACTIVE_PROMPTED" source <(~/.claude/skills/gstack/bin/gstack-repo-mode 2>/dev/null) || true REPO_MODE=${REPO_MODE:-unknown} echo "REPO_MODE: $REPO_MODE" @@ -52,8 +54,11 @@ echo '{"skill":"plan-design-review","ts":"'$(date -u +%Y-%m-%dT%H:%M:%SZ)'","rep for _PF in $(find ~/.gstack/analytics -maxdepth 1 -name '.pending-*' 2>/dev/null); do [ -f "$_PF" ] && ~/.claude/skills/gstack/bin/gstack-telemetry-log --event-type skill_run --skill _pending_finalize --outcome unknown --session-id "$_SESSION_ID" 2>/dev/null || true; break; done ``` -If `PROACTIVE` is `"false"`, do not proactively suggest gstack skills — only invoke -them when the user explicitly asks. The user opted out of proactive suggestions. +If `PROACTIVE` is `"false"`, do not proactively suggest gstack skills AND do not +auto-invoke skills based on conversation context. Only run skills the user explicitly +types (e.g., /qa, /ship). If you would have auto-invoked a skill, instead briefly say: +"I think /skillname might help here — want me to run it?" and wait for confirmation. +The user opted out of proactive behavior. 
If output shows `UPGRADE_AVAILABLE `: read `~/.claude/skills/gstack/gstack-upgrade/SKILL.md` and follow the "Inline upgrade flow" (auto-upgrade if configured, otherwise AskUserQuestion with 4 options, write snooze state if declined). If `JUST_UPGRADED `: tell user "Running gstack v{to} (just updated!)" and continue. @@ -102,6 +107,27 @@ touch ~/.gstack/.telemetry-prompted This only happens once. If `TEL_PROMPTED` is `yes`, skip this entirely. +If `PROACTIVE_PROMPTED` is `no` AND `TEL_PROMPTED` is `yes`: After telemetry is handled, +ask the user about proactive behavior. Use AskUserQuestion: + +> gstack can proactively figure out when you might need a skill while you work — +> like suggesting /qa when you say "does this work?" or /investigate when you hit +> a bug. We recommend keeping this on — it speeds up every part of your workflow. + +Options: +- A) Keep it on (recommended) +- B) Turn it off — I'll type /commands myself + +If A: run `~/.claude/skills/gstack/bin/gstack-config set proactive true` +If B: run `~/.claude/skills/gstack/bin/gstack-config set proactive false` + +Always run: +```bash +touch ~/.gstack/.proactive-prompted +``` + +This only happens once. If `PROACTIVE_PROMPTED` is `yes`, skip this entirely. + ## AskUserQuestion Format **ALWAYS follow this structure for every AskUserQuestion call:** @@ -109,7 +135,6 @@ This only happens once. If `TEL_PROMPTED` is `yes`, skip this entirely. 2. **Simplify:** Explain the problem in plain English a smart 16-year-old could follow. No raw function names, no internal jargon, no implementation details. Use concrete examples and analogies. Say what it DOES, not what it's called. 3. **Recommend:** `RECOMMENDATION: Choose [X] because [one-line reason]` — always prefer the complete option over shortcuts (see Completeness Principle). Include `Completeness: X/10` for each option. 
Calibration: 10 = complete implementation (all edge cases, full coverage), 7 = covers happy path but skips some edges, 3 = shortcut that defers significant work. If both options are 8+, pick the higher; if one is ≤5, flag it. 4. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` -5. **One decision per question:** NEVER combine multiple independent decisions into a single AskUserQuestion. Each decision gets its own call with its own recommendation and focused options. Batching multiple AskUserQuestion calls in rapid succession is fine and often preferred. Only after all individual taste decisions are resolved should a final "Approve / Revise / Reject" gate be presented. Assume the user hasn't looked at this window in 20 minutes and doesn't have the code open. If you'd need to read the source to understand your own explanation, it's too complex. @@ -117,97 +142,54 @@ Per-skill instructions may add additional formatting rules on top of this baseli ## Completeness Principle — Boil the Lake -AI-assisted coding makes the marginal cost of completeness near-zero. When you present options: +AI makes completeness near-free. Always recommend the complete option over shortcuts — the delta is minutes with CC+gstack. A "lake" (100% coverage, all edge cases) is boilable; an "ocean" (full rewrite, multi-quarter migration) is not. Boil lakes, flag oceans. -- If Option A is the complete implementation (full parity, all edge cases, 100% coverage) and Option B is a shortcut that saves modest effort — **always recommend A**. The delta between 80 lines and 150 lines is meaningless with CC+gstack. "Good enough" is the wrong instinct when "complete" costs minutes more. -- **Lake vs. ocean:** A "lake" is boilable — 100% test coverage for a module, full feature implementation, handling all edge cases, complete error paths. 
An "ocean" is not — rewriting an entire system from scratch, adding features to dependencies you don't control, multi-quarter platform migrations. Recommend boiling lakes. Flag oceans as out of scope. -- **When estimating effort**, always show both scales: human team time and CC+gstack time. The compression ratio varies by task type — use this reference: +**Effort reference** — always show both scales: | Task type | Human team | CC+gstack | Compression | |-----------|-----------|-----------|-------------| -| Boilerplate / scaffolding | 2 days | 15 min | ~100x | -| Test writing | 1 day | 15 min | ~50x | -| Feature implementation | 1 week | 30 min | ~30x | -| Bug fix + regression test | 4 hours | 15 min | ~20x | -| Architecture / design | 2 days | 4 hours | ~5x | -| Research / exploration | 1 day | 3 hours | ~3x | +| Boilerplate | 2 days | 15 min | ~100x | +| Tests | 1 day | 15 min | ~50x | +| Feature | 1 week | 30 min | ~30x | +| Bug fix | 4 hours | 15 min | ~20x | -- This principle applies to test coverage, error handling, documentation, edge cases, and feature completeness. Don't skip the last 10% to "save time" — with AI, that 10% costs seconds. +Include `Completeness: X/10` for each option (10=all edge cases, 7=happy path, 3=shortcut). -**Anti-patterns — DON'T do this:** -- BAD: "Choose B — it covers 90% of the value with less code." (If A is only 70 lines more, choose A.) -- BAD: "We can skip edge case handling to save time." (Edge case handling costs minutes with CC.) -- BAD: "Let's defer test coverage to a follow-up PR." (Tests are the cheapest lake to boil.) -- BAD: Quoting only human-team effort: "This would take 2 weeks." (Say: "2 weeks human / ~1 hour CC.") +## Repo Ownership — See Something, Say Something -## Repo Ownership Mode — See Something, Say Something +`REPO_MODE` controls how to handle issues outside your branch: +- **`solo`** — You own everything. Investigate and offer to fix proactively. 
+- **`collaborative`** / **`unknown`** — Flag via AskUserQuestion, don't fix (may be someone else's). -`REPO_MODE` from the preamble tells you who owns issues in this repo: - -- **`solo`** — One person does 80%+ of the work. They own everything. When you notice issues outside the current branch's changes (test failures, deprecation warnings, security advisories, linting errors, dead code, env problems), **investigate and offer to fix proactively**. The solo dev is the only person who will fix it. Default to action. -- **`collaborative`** — Multiple active contributors. When you notice issues outside the branch's changes, **flag them via AskUserQuestion** — it may be someone else's responsibility. Default to asking, not fixing. -- **`unknown`** — Treat as collaborative (safer default — ask before fixing). - -**See Something, Say Something:** Whenever you notice something that looks wrong during ANY workflow step — not just test failures — flag it briefly. One sentence: what you noticed and its impact. In solo mode, follow up with "Want me to fix it?" In collaborative mode, just flag it and move on. - -Never let a noticed issue silently pass. The whole point is proactive communication. +Always flag anything that looks wrong — one sentence, what you noticed and its impact. ## Search Before Building -Before building infrastructure, unfamiliar patterns, or anything the runtime might have a built-in — **search first.** Read `~/.claude/skills/gstack/ETHOS.md` for the full philosophy. +Before building anything unfamiliar, **search first.** See `~/.claude/skills/gstack/ETHOS.md`. +- **Layer 1** (tried and true) — don't reinvent. **Layer 2** (new and popular) — scrutinize. **Layer 3** (first principles) — prize above all. -**Three layers of knowledge:** -- **Layer 1** (tried and true — in distribution). Don't reinvent the wheel. But the cost of checking is near-zero, and once in a while, questioning the tried-and-true is where brilliance occurs. 
-- **Layer 2** (new and popular — search for these). But scrutinize: humans are subject to mania. Search results are inputs to your thinking, not answers. -- **Layer 3** (first principles — prize these above all). Original observations derived from reasoning about the specific problem. The most valuable of all. - -**Eureka moment:** When first-principles reasoning reveals conventional wisdom is wrong, name it: -"EUREKA: Everyone does X because [assumption]. But [evidence] shows this is wrong. Y is better because [reasoning]." - -Log eureka moments: +**Eureka:** When first-principles reasoning contradicts conventional wisdom, name it and log: ```bash jq -n --arg ts "$(date -u +%Y-%m-%dT%H:%M:%SZ)" --arg skill "SKILL_NAME" --arg branch "$(git branch --show-current 2>/dev/null)" --arg insight "ONE_LINE_SUMMARY" '{ts:$ts,skill:$skill,branch:$branch,insight:$insight}' >> ~/.gstack/analytics/eureka.jsonl 2>/dev/null || true ``` -Replace SKILL_NAME and ONE_LINE_SUMMARY. Runs inline — don't stop the workflow. - -**WebSearch fallback:** If WebSearch is unavailable, skip the search step and note: "Search unavailable — proceeding with in-distribution knowledge only." ## Contributor Mode -If `_CONTRIB` is `true`: you are in **contributor mode**. You're a gstack user who also helps make it better. +If `_CONTRIB` is `true`: you are in **contributor mode**. At the end of each major workflow step, rate your gstack experience 0-10. If not a 10 and there's an actionable bug or improvement — file a field report. -**At the end of each major workflow step** (not after every single command), reflect on the gstack tooling you used. Rate your experience 0 to 10. If it wasn't a 10, think about why. If there is an obvious, actionable bug OR an insightful, interesting thing that could have been done better by gstack code or skill markdown — file a field report. Maybe our contributor will help make us better! 
- -**Calibration — this is the bar:** For example, `$B js "await fetch(...)"` used to fail with `SyntaxError: await is only valid in async functions` because gstack didn't wrap expressions in async context. Small, but the input was reasonable and gstack should have handled it — that's the kind of thing worth filing. Things less consequential than this, ignore. - -**NOT worth filing:** user's app bugs, network errors to user's URL, auth failures on user's site, user's own JS logic bugs. - -**To file:** write `~/.gstack/contributor-logs/{slug}.md` with **all sections below** (do not truncate — include every section through the Date/Version footer): +**File only:** gstack tooling bugs where the input was reasonable but gstack failed. **Skip:** user app bugs, network errors, auth failures on user's site. +**To file:** write `~/.gstack/contributor-logs/{slug}.md`: ``` # {Title} - -Hey gstack team — ran into this while using /{skill-name}: - -**What I was trying to do:** {what the user/agent was attempting} -**What happened instead:** {what actually happened} -**My rating:** {0-10} — {one sentence on why it wasn't a 10} - -## Steps to reproduce +**What I tried:** {action} | **What happened:** {result} | **Rating:** {0-10} +## Repro 1. {step} - -## Raw output -``` -{paste the actual error or unexpected output here} -``` - ## What would make this a 10 -{one sentence: what gstack should have done differently} - -**Date:** {YYYY-MM-DD} | **Version:** {gstack version} | **Skill:** /{skill} +{one sentence} +**Date:** {YYYY-MM-DD} | **Version:** {version} | **Skill:** /{skill} ``` - -Slug: lowercase, hyphens, max 60 chars (e.g. `browse-js-no-await`). Skip if file already exists. Max 3 reports per session. File inline and continue — don't stop the workflow. Tell user: "Filed gstack field report: {title}" +Slug: lowercase hyphens, max 60 chars. Skip if exists. Max 3/session. File inline, don't stop. 
## Completion Status Protocol @@ -298,22 +280,42 @@ Then write a `## GSTACK REVIEW REPORT` section to the end of the plan file: file you are allowed to edit in plan mode. The plan file review report is part of the plan's living status. -## Step 0: Detect base branch +## Step 0: Detect platform and base branch -Determine which branch this PR targets. Use the result as "the base branch" in all subsequent steps. +First, detect the git hosting platform from the remote URL: -1. Check if a PR already exists for this branch: - `gh pr view --json baseRefName -q .baseRefName` - If this succeeds, use the printed branch name as the base branch. +```bash +git remote get-url origin 2>/dev/null +``` -2. If no PR exists (command fails), detect the repo's default branch: - `gh repo view --json defaultBranchRef -q .defaultBranchRef.name` +- If the URL contains "github.com" → platform is **GitHub** +- If the URL contains "gitlab" → platform is **GitLab** +- Otherwise, check CLI availability: + - `gh auth status 2>/dev/null` succeeds → platform is **GitHub** (covers GitHub Enterprise) + - `glab auth status 2>/dev/null` succeeds → platform is **GitLab** (covers self-hosted) + - Neither → **unknown** (use git-native commands only) -3. If both commands fail, fall back to `main`. +Determine which branch this PR/MR targets, or the repo's default branch if no +PR/MR exists. Use the result as "the base branch" in all subsequent steps. + +**If GitHub:** +1. `gh pr view --json baseRefName -q .baseRefName` — if succeeds, use it +2. `gh repo view --json defaultBranchRef -q .defaultBranchRef.name` — if succeeds, use it + +**If GitLab:** +1. `glab mr view -F json 2>/dev/null` and extract the `target_branch` field — if succeeds, use it +2. `glab repo view -F json 2>/dev/null` and extract the `default_branch` field — if succeeds, use it + +**Git-native fallback (if unknown platform, or CLI commands fail):** +1. 
`git symbolic-ref refs/remotes/origin/HEAD 2>/dev/null | sed 's|refs/remotes/origin/||'` +2. If that fails: `git rev-parse --verify origin/main 2>/dev/null` → use `main` +3. If that fails: `git rev-parse --verify origin/master 2>/dev/null` → use `master` + +If all fail, fall back to `main`. Print the detected base branch name. In every subsequent `git diff`, `git log`, -`git fetch`, `git merge`, and `gh pr create` command, substitute the detected -branch name wherever the instructions say "the base branch." +`git fetch`, `git merge`, and PR/MR creation command, substitute the detected +branch name wherever the instructions say "the base branch" or `<default>`. --- @@ -468,7 +470,7 @@ HARD RULES — first classify as MARKETING/LANDING PAGE vs APP UI vs HYBRID, the - APP UI: Calm surface hierarchy, dense but readable, utility language, minimal chrome - UNIVERSAL: CSS variables for colors, no default font stacks, one job per section, cards earn existence -For each finding: what's wrong, what will happen if it ships unresolved, and the specific fix. Be opinionated. No hedging." -s read-only -c 'model_reasoning_effort="high"' --enable web_search_cached 2>"$TMPERR_DESIGN" +For each finding: what's wrong, what will happen if it ships unresolved, and the specific fix. Be opinionated. No hedging." -C "$(git rev-parse --show-toplevel)" -s read-only -c 'model_reasoning_effort="high"' --enable web_search_cached 2>"$TMPERR_DESIGN" ``` Use a 5-minute timeout (`timeout: 300000`).
After the command completes, read stderr: ```bash diff --git a/plan-eng-review/SKILL.md b/plan-eng-review/SKILL.md index 53bf7112..0b61d5f6 100644 --- a/plan-eng-review/SKILL.md +++ b/plan-eng-review/SKILL.md @@ -33,9 +33,11 @@ _SESSIONS=$(find ~/.gstack/sessions -mmin -120 -type f 2>/dev/null | wc -l | tr find ~/.gstack/sessions -mmin +120 -type f -delete 2>/dev/null || true _CONTRIB=$(~/.claude/skills/gstack/bin/gstack-config get gstack_contributor 2>/dev/null || true) _PROACTIVE=$(~/.claude/skills/gstack/bin/gstack-config get proactive 2>/dev/null || echo "true") +_PROACTIVE_PROMPTED=$([ -f ~/.gstack/.proactive-prompted ] && echo "yes" || echo "no") _BRANCH=$(git branch --show-current 2>/dev/null || echo "unknown") echo "BRANCH: $_BRANCH" echo "PROACTIVE: $_PROACTIVE" +echo "PROACTIVE_PROMPTED: $_PROACTIVE_PROMPTED" source <(~/.claude/skills/gstack/bin/gstack-repo-mode 2>/dev/null) || true REPO_MODE=${REPO_MODE:-unknown} echo "REPO_MODE: $REPO_MODE" @@ -53,8 +55,11 @@ echo '{"skill":"plan-eng-review","ts":"'$(date -u +%Y-%m-%dT%H:%M:%SZ)'","repo": for _PF in $(find ~/.gstack/analytics -maxdepth 1 -name '.pending-*' 2>/dev/null); do [ -f "$_PF" ] && ~/.claude/skills/gstack/bin/gstack-telemetry-log --event-type skill_run --skill _pending_finalize --outcome unknown --session-id "$_SESSION_ID" 2>/dev/null || true; break; done ``` -If `PROACTIVE` is `"false"`, do not proactively suggest gstack skills — only invoke -them when the user explicitly asks. The user opted out of proactive suggestions. +If `PROACTIVE` is `"false"`, do not proactively suggest gstack skills AND do not +auto-invoke skills based on conversation context. Only run skills the user explicitly +types (e.g., /qa, /ship). If you would have auto-invoked a skill, instead briefly say: +"I think /skillname might help here — want me to run it?" and wait for confirmation. +The user opted out of proactive behavior. 
If output shows `UPGRADE_AVAILABLE `: read `~/.claude/skills/gstack/gstack-upgrade/SKILL.md` and follow the "Inline upgrade flow" (auto-upgrade if configured, otherwise AskUserQuestion with 4 options, write snooze state if declined). If `JUST_UPGRADED `: tell user "Running gstack v{to} (just updated!)" and continue. @@ -103,6 +108,27 @@ touch ~/.gstack/.telemetry-prompted This only happens once. If `TEL_PROMPTED` is `yes`, skip this entirely. +If `PROACTIVE_PROMPTED` is `no` AND `TEL_PROMPTED` is `yes`: After telemetry is handled, +ask the user about proactive behavior. Use AskUserQuestion: + +> gstack can proactively figure out when you might need a skill while you work — +> like suggesting /qa when you say "does this work?" or /investigate when you hit +> a bug. We recommend keeping this on — it speeds up every part of your workflow. + +Options: +- A) Keep it on (recommended) +- B) Turn it off — I'll type /commands myself + +If A: run `~/.claude/skills/gstack/bin/gstack-config set proactive true` +If B: run `~/.claude/skills/gstack/bin/gstack-config set proactive false` + +Always run: +```bash +touch ~/.gstack/.proactive-prompted +``` + +This only happens once. If `PROACTIVE_PROMPTED` is `yes`, skip this entirely. + ## AskUserQuestion Format **ALWAYS follow this structure for every AskUserQuestion call:** @@ -110,7 +136,6 @@ This only happens once. If `TEL_PROMPTED` is `yes`, skip this entirely. 2. **Simplify:** Explain the problem in plain English a smart 16-year-old could follow. No raw function names, no internal jargon, no implementation details. Use concrete examples and analogies. Say what it DOES, not what it's called. 3. **Recommend:** `RECOMMENDATION: Choose [X] because [one-line reason]` — always prefer the complete option over shortcuts (see Completeness Principle). Include `Completeness: X/10` for each option. 
Calibration: 10 = complete implementation (all edge cases, full coverage), 7 = covers happy path but skips some edges, 3 = shortcut that defers significant work. If both options are 8+, pick the higher; if one is ≤5, flag it. 4. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` -5. **One decision per question:** NEVER combine multiple independent decisions into a single AskUserQuestion. Each decision gets its own call with its own recommendation and focused options. Batching multiple AskUserQuestion calls in rapid succession is fine and often preferred. Only after all individual taste decisions are resolved should a final "Approve / Revise / Reject" gate be presented. Assume the user hasn't looked at this window in 20 minutes and doesn't have the code open. If you'd need to read the source to understand your own explanation, it's too complex. @@ -118,97 +143,54 @@ Per-skill instructions may add additional formatting rules on top of this baseli ## Completeness Principle — Boil the Lake -AI-assisted coding makes the marginal cost of completeness near-zero. When you present options: +AI makes completeness near-free. Always recommend the complete option over shortcuts — the delta is minutes with CC+gstack. A "lake" (100% coverage, all edge cases) is boilable; an "ocean" (full rewrite, multi-quarter migration) is not. Boil lakes, flag oceans. -- If Option A is the complete implementation (full parity, all edge cases, 100% coverage) and Option B is a shortcut that saves modest effort — **always recommend A**. The delta between 80 lines and 150 lines is meaningless with CC+gstack. "Good enough" is the wrong instinct when "complete" costs minutes more. -- **Lake vs. ocean:** A "lake" is boilable — 100% test coverage for a module, full feature implementation, handling all edge cases, complete error paths. 
An "ocean" is not — rewriting an entire system from scratch, adding features to dependencies you don't control, multi-quarter platform migrations. Recommend boiling lakes. Flag oceans as out of scope. -- **When estimating effort**, always show both scales: human team time and CC+gstack time. The compression ratio varies by task type — use this reference: +**Effort reference** — always show both scales: | Task type | Human team | CC+gstack | Compression | |-----------|-----------|-----------|-------------| -| Boilerplate / scaffolding | 2 days | 15 min | ~100x | -| Test writing | 1 day | 15 min | ~50x | -| Feature implementation | 1 week | 30 min | ~30x | -| Bug fix + regression test | 4 hours | 15 min | ~20x | -| Architecture / design | 2 days | 4 hours | ~5x | -| Research / exploration | 1 day | 3 hours | ~3x | +| Boilerplate | 2 days | 15 min | ~100x | +| Tests | 1 day | 15 min | ~50x | +| Feature | 1 week | 30 min | ~30x | +| Bug fix | 4 hours | 15 min | ~20x | -- This principle applies to test coverage, error handling, documentation, edge cases, and feature completeness. Don't skip the last 10% to "save time" — with AI, that 10% costs seconds. +Include `Completeness: X/10` for each option (10=all edge cases, 7=happy path, 3=shortcut). -**Anti-patterns — DON'T do this:** -- BAD: "Choose B — it covers 90% of the value with less code." (If A is only 70 lines more, choose A.) -- BAD: "We can skip edge case handling to save time." (Edge case handling costs minutes with CC.) -- BAD: "Let's defer test coverage to a follow-up PR." (Tests are the cheapest lake to boil.) -- BAD: Quoting only human-team effort: "This would take 2 weeks." (Say: "2 weeks human / ~1 hour CC.") +## Repo Ownership — See Something, Say Something -## Repo Ownership Mode — See Something, Say Something +`REPO_MODE` controls how to handle issues outside your branch: +- **`solo`** — You own everything. Investigate and offer to fix proactively. 
+- **`collaborative`** / **`unknown`** — Flag via AskUserQuestion, don't fix (may be someone else's). -`REPO_MODE` from the preamble tells you who owns issues in this repo: - -- **`solo`** — One person does 80%+ of the work. They own everything. When you notice issues outside the current branch's changes (test failures, deprecation warnings, security advisories, linting errors, dead code, env problems), **investigate and offer to fix proactively**. The solo dev is the only person who will fix it. Default to action. -- **`collaborative`** — Multiple active contributors. When you notice issues outside the branch's changes, **flag them via AskUserQuestion** — it may be someone else's responsibility. Default to asking, not fixing. -- **`unknown`** — Treat as collaborative (safer default — ask before fixing). - -**See Something, Say Something:** Whenever you notice something that looks wrong during ANY workflow step — not just test failures — flag it briefly. One sentence: what you noticed and its impact. In solo mode, follow up with "Want me to fix it?" In collaborative mode, just flag it and move on. - -Never let a noticed issue silently pass. The whole point is proactive communication. +Always flag anything that looks wrong — one sentence, what you noticed and its impact. ## Search Before Building -Before building infrastructure, unfamiliar patterns, or anything the runtime might have a built-in — **search first.** Read `~/.claude/skills/gstack/ETHOS.md` for the full philosophy. +Before building anything unfamiliar, **search first.** See `~/.claude/skills/gstack/ETHOS.md`. +- **Layer 1** (tried and true) — don't reinvent. **Layer 2** (new and popular) — scrutinize. **Layer 3** (first principles) — prize above all. -**Three layers of knowledge:** -- **Layer 1** (tried and true — in distribution). Don't reinvent the wheel. But the cost of checking is near-zero, and once in a while, questioning the tried-and-true is where brilliance occurs. 
-- **Layer 2** (new and popular — search for these). But scrutinize: humans are subject to mania. Search results are inputs to your thinking, not answers. -- **Layer 3** (first principles — prize these above all). Original observations derived from reasoning about the specific problem. The most valuable of all. - -**Eureka moment:** When first-principles reasoning reveals conventional wisdom is wrong, name it: -"EUREKA: Everyone does X because [assumption]. But [evidence] shows this is wrong. Y is better because [reasoning]." - -Log eureka moments: +**Eureka:** When first-principles reasoning contradicts conventional wisdom, name it and log: ```bash jq -n --arg ts "$(date -u +%Y-%m-%dT%H:%M:%SZ)" --arg skill "SKILL_NAME" --arg branch "$(git branch --show-current 2>/dev/null)" --arg insight "ONE_LINE_SUMMARY" '{ts:$ts,skill:$skill,branch:$branch,insight:$insight}' >> ~/.gstack/analytics/eureka.jsonl 2>/dev/null || true ``` -Replace SKILL_NAME and ONE_LINE_SUMMARY. Runs inline — don't stop the workflow. - -**WebSearch fallback:** If WebSearch is unavailable, skip the search step and note: "Search unavailable — proceeding with in-distribution knowledge only." ## Contributor Mode -If `_CONTRIB` is `true`: you are in **contributor mode**. You're a gstack user who also helps make it better. +If `_CONTRIB` is `true`: you are in **contributor mode**. At the end of each major workflow step, rate your gstack experience 0-10. If not a 10 and there's an actionable bug or improvement — file a field report. -**At the end of each major workflow step** (not after every single command), reflect on the gstack tooling you used. Rate your experience 0 to 10. If it wasn't a 10, think about why. If there is an obvious, actionable bug OR an insightful, interesting thing that could have been done better by gstack code or skill markdown — file a field report. Maybe our contributor will help make us better! 
- -**Calibration — this is the bar:** For example, `$B js "await fetch(...)"` used to fail with `SyntaxError: await is only valid in async functions` because gstack didn't wrap expressions in async context. Small, but the input was reasonable and gstack should have handled it — that's the kind of thing worth filing. Things less consequential than this, ignore. - -**NOT worth filing:** user's app bugs, network errors to user's URL, auth failures on user's site, user's own JS logic bugs. - -**To file:** write `~/.gstack/contributor-logs/{slug}.md` with **all sections below** (do not truncate — include every section through the Date/Version footer): +**File only:** gstack tooling bugs where the input was reasonable but gstack failed. **Skip:** user app bugs, network errors, auth failures on user's site. +**To file:** write `~/.gstack/contributor-logs/{slug}.md`: ``` # {Title} - -Hey gstack team — ran into this while using /{skill-name}: - -**What I was trying to do:** {what the user/agent was attempting} -**What happened instead:** {what actually happened} -**My rating:** {0-10} — {one sentence on why it wasn't a 10} - -## Steps to reproduce +**What I tried:** {action} | **What happened:** {result} | **Rating:** {0-10} +## Repro 1. {step} - -## Raw output -``` -{paste the actual error or unexpected output here} -``` - ## What would make this a 10 -{one sentence: what gstack should have done differently} - -**Date:** {YYYY-MM-DD} | **Version:** {gstack version} | **Skill:** /{skill} +{one sentence} +**Date:** {YYYY-MM-DD} | **Version:** {version} | **Skill:** /{skill} ``` - -Slug: lowercase, hyphens, max 60 chars (e.g. `browse-js-no-await`). Skip if file already exists. Max 3 reports per session. File inline and continue — don't stop the workflow. Tell user: "Filed gstack field report: {title}" +Slug: lowercase hyphens, max 60 chars. Skip if exists. Max 3/session. File inline, don't stop. 
## Completion Status Protocol @@ -723,7 +705,7 @@ THE PLAN: ```bash TMPERR_PV=$(mktemp /tmp/codex-planreview-XXXXXXXX) -codex exec "<prompt>" -s read-only -c 'model_reasoning_effort="xhigh"' --enable web_search_cached 2>"$TMPERR_PV" +codex exec "<prompt>" -C "$(git rev-parse --show-toplevel)" -s read-only -c 'model_reasoning_effort="xhigh"' --enable web_search_cached 2>"$TMPERR_PV" ``` Use a 5-minute timeout (`timeout: 300000`). After the command completes, read stderr: diff --git a/qa-only/SKILL.md b/qa-only/SKILL.md index 6736211e..1129d52a 100644 --- a/qa-only/SKILL.md +++ b/qa-only/SKILL.md @@ -29,9 +29,11 @@ _SESSIONS=$(find ~/.gstack/sessions -mmin -120 -type f 2>/dev/null | wc -l | tr find ~/.gstack/sessions -mmin +120 -type f -delete 2>/dev/null || true _CONTRIB=$(~/.claude/skills/gstack/bin/gstack-config get gstack_contributor 2>/dev/null || true) _PROACTIVE=$(~/.claude/skills/gstack/bin/gstack-config get proactive 2>/dev/null || echo "true") +_PROACTIVE_PROMPTED=$([ -f ~/.gstack/.proactive-prompted ] && echo "yes" || echo "no") _BRANCH=$(git branch --show-current 2>/dev/null || echo "unknown") echo "BRANCH: $_BRANCH" echo "PROACTIVE: $_PROACTIVE" +echo "PROACTIVE_PROMPTED: $_PROACTIVE_PROMPTED" source <(~/.claude/skills/gstack/bin/gstack-repo-mode 2>/dev/null) || true REPO_MODE=${REPO_MODE:-unknown} echo "REPO_MODE: $REPO_MODE" @@ -49,8 +51,11 @@ echo '{"skill":"qa-only","ts":"'$(date -u +%Y-%m-%dT%H:%M:%SZ)'","repo":"'$(base for _PF in $(find ~/.gstack/analytics -maxdepth 1 -name '.pending-*' 2>/dev/null); do [ -f "$_PF" ] && ~/.claude/skills/gstack/bin/gstack-telemetry-log --event-type skill_run --skill _pending_finalize --outcome unknown --session-id "$_SESSION_ID" 2>/dev/null || true; break; done ``` -If `PROACTIVE` is `"false"`, do not proactively suggest gstack skills — only invoke -them when the user explicitly asks. The user opted out of proactive suggestions.
+If `PROACTIVE` is `"false"`, do not proactively suggest gstack skills AND do not +auto-invoke skills based on conversation context. Only run skills the user explicitly +types (e.g., /qa, /ship). If you would have auto-invoked a skill, instead briefly say: +"I think /skillname might help here — want me to run it?" and wait for confirmation. +The user opted out of proactive behavior. If output shows `UPGRADE_AVAILABLE <old> <new>`: read `~/.claude/skills/gstack/gstack-upgrade/SKILL.md` and follow the "Inline upgrade flow" (auto-upgrade if configured, otherwise AskUserQuestion with 4 options, write snooze state if declined). If `JUST_UPGRADED <from> <to>`: tell user "Running gstack v{to} (just updated!)" and continue. @@ -99,6 +104,27 @@ touch ~/.gstack/.telemetry-prompted This only happens once. If `TEL_PROMPTED` is `yes`, skip this entirely. +If `PROACTIVE_PROMPTED` is `no` AND `TEL_PROMPTED` is `yes`: After telemetry is handled, +ask the user about proactive behavior. Use AskUserQuestion: + +> gstack can proactively figure out when you might need a skill while you work — +> like suggesting /qa when you say "does this work?" or /investigate when you hit +> a bug. We recommend keeping this on — it speeds up every part of your workflow. + +Options: +- A) Keep it on (recommended) +- B) Turn it off — I'll type /commands myself + +If A: run `~/.claude/skills/gstack/bin/gstack-config set proactive true` +If B: run `~/.claude/skills/gstack/bin/gstack-config set proactive false` + +Always run: +```bash +touch ~/.gstack/.proactive-prompted +``` + +This only happens once. If `PROACTIVE_PROMPTED` is `yes`, skip this entirely. + ## AskUserQuestion Format **ALWAYS follow this structure for every AskUserQuestion call:** @@ -106,7 +132,6 @@ This only happens once. If `TEL_PROMPTED` is `yes`, skip this entirely. 2. **Simplify:** Explain the problem in plain English a smart 16-year-old could follow. No raw function names, no internal jargon, no implementation details.
Use concrete examples and analogies. Say what it DOES, not what it's called. 3. **Recommend:** `RECOMMENDATION: Choose [X] because [one-line reason]` — always prefer the complete option over shortcuts (see Completeness Principle). Include `Completeness: X/10` for each option. Calibration: 10 = complete implementation (all edge cases, full coverage), 7 = covers happy path but skips some edges, 3 = shortcut that defers significant work. If both options are 8+, pick the higher; if one is ≤5, flag it. 4. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` -5. **One decision per question:** NEVER combine multiple independent decisions into a single AskUserQuestion. Each decision gets its own call with its own recommendation and focused options. Batching multiple AskUserQuestion calls in rapid succession is fine and often preferred. Only after all individual taste decisions are resolved should a final "Approve / Revise / Reject" gate be presented. Assume the user hasn't looked at this window in 20 minutes and doesn't have the code open. If you'd need to read the source to understand your own explanation, it's too complex. @@ -114,97 +139,54 @@ Per-skill instructions may add additional formatting rules on top of this baseli ## Completeness Principle — Boil the Lake -AI-assisted coding makes the marginal cost of completeness near-zero. When you present options: +AI makes completeness near-free. Always recommend the complete option over shortcuts — the delta is minutes with CC+gstack. A "lake" (100% coverage, all edge cases) is boilable; an "ocean" (full rewrite, multi-quarter migration) is not. Boil lakes, flag oceans. -- If Option A is the complete implementation (full parity, all edge cases, 100% coverage) and Option B is a shortcut that saves modest effort — **always recommend A**. The delta between 80 lines and 150 lines is meaningless with CC+gstack. 
"Good enough" is the wrong instinct when "complete" costs minutes more. -- **Lake vs. ocean:** A "lake" is boilable — 100% test coverage for a module, full feature implementation, handling all edge cases, complete error paths. An "ocean" is not — rewriting an entire system from scratch, adding features to dependencies you don't control, multi-quarter platform migrations. Recommend boiling lakes. Flag oceans as out of scope. -- **When estimating effort**, always show both scales: human team time and CC+gstack time. The compression ratio varies by task type — use this reference: +**Effort reference** — always show both scales: | Task type | Human team | CC+gstack | Compression | |-----------|-----------|-----------|-------------| -| Boilerplate / scaffolding | 2 days | 15 min | ~100x | -| Test writing | 1 day | 15 min | ~50x | -| Feature implementation | 1 week | 30 min | ~30x | -| Bug fix + regression test | 4 hours | 15 min | ~20x | -| Architecture / design | 2 days | 4 hours | ~5x | -| Research / exploration | 1 day | 3 hours | ~3x | +| Boilerplate | 2 days | 15 min | ~100x | +| Tests | 1 day | 15 min | ~50x | +| Feature | 1 week | 30 min | ~30x | +| Bug fix | 4 hours | 15 min | ~20x | -- This principle applies to test coverage, error handling, documentation, edge cases, and feature completeness. Don't skip the last 10% to "save time" — with AI, that 10% costs seconds. +Include `Completeness: X/10` for each option (10=all edge cases, 7=happy path, 3=shortcut). -**Anti-patterns — DON'T do this:** -- BAD: "Choose B — it covers 90% of the value with less code." (If A is only 70 lines more, choose A.) -- BAD: "We can skip edge case handling to save time." (Edge case handling costs minutes with CC.) -- BAD: "Let's defer test coverage to a follow-up PR." (Tests are the cheapest lake to boil.) -- BAD: Quoting only human-team effort: "This would take 2 weeks." 
(Say: "2 weeks human / ~1 hour CC.") +## Repo Ownership — See Something, Say Something -## Repo Ownership Mode — See Something, Say Something +`REPO_MODE` controls how to handle issues outside your branch: +- **`solo`** — You own everything. Investigate and offer to fix proactively. +- **`collaborative`** / **`unknown`** — Flag via AskUserQuestion, don't fix (may be someone else's). -`REPO_MODE` from the preamble tells you who owns issues in this repo: - -- **`solo`** — One person does 80%+ of the work. They own everything. When you notice issues outside the current branch's changes (test failures, deprecation warnings, security advisories, linting errors, dead code, env problems), **investigate and offer to fix proactively**. The solo dev is the only person who will fix it. Default to action. -- **`collaborative`** — Multiple active contributors. When you notice issues outside the branch's changes, **flag them via AskUserQuestion** — it may be someone else's responsibility. Default to asking, not fixing. -- **`unknown`** — Treat as collaborative (safer default — ask before fixing). - -**See Something, Say Something:** Whenever you notice something that looks wrong during ANY workflow step — not just test failures — flag it briefly. One sentence: what you noticed and its impact. In solo mode, follow up with "Want me to fix it?" In collaborative mode, just flag it and move on. - -Never let a noticed issue silently pass. The whole point is proactive communication. +Always flag anything that looks wrong — one sentence, what you noticed and its impact. ## Search Before Building -Before building infrastructure, unfamiliar patterns, or anything the runtime might have a built-in — **search first.** Read `~/.claude/skills/gstack/ETHOS.md` for the full philosophy. +Before building anything unfamiliar, **search first.** See `~/.claude/skills/gstack/ETHOS.md`. +- **Layer 1** (tried and true) — don't reinvent. **Layer 2** (new and popular) — scrutinize. 
**Layer 3** (first principles) — prize above all. -**Three layers of knowledge:** -- **Layer 1** (tried and true — in distribution). Don't reinvent the wheel. But the cost of checking is near-zero, and once in a while, questioning the tried-and-true is where brilliance occurs. -- **Layer 2** (new and popular — search for these). But scrutinize: humans are subject to mania. Search results are inputs to your thinking, not answers. -- **Layer 3** (first principles — prize these above all). Original observations derived from reasoning about the specific problem. The most valuable of all. - -**Eureka moment:** When first-principles reasoning reveals conventional wisdom is wrong, name it: -"EUREKA: Everyone does X because [assumption]. But [evidence] shows this is wrong. Y is better because [reasoning]." - -Log eureka moments: +**Eureka:** When first-principles reasoning contradicts conventional wisdom, name it and log: ```bash jq -n --arg ts "$(date -u +%Y-%m-%dT%H:%M:%SZ)" --arg skill "SKILL_NAME" --arg branch "$(git branch --show-current 2>/dev/null)" --arg insight "ONE_LINE_SUMMARY" '{ts:$ts,skill:$skill,branch:$branch,insight:$insight}' >> ~/.gstack/analytics/eureka.jsonl 2>/dev/null || true ``` -Replace SKILL_NAME and ONE_LINE_SUMMARY. Runs inline — don't stop the workflow. - -**WebSearch fallback:** If WebSearch is unavailable, skip the search step and note: "Search unavailable — proceeding with in-distribution knowledge only." ## Contributor Mode -If `_CONTRIB` is `true`: you are in **contributor mode**. You're a gstack user who also helps make it better. +If `_CONTRIB` is `true`: you are in **contributor mode**. At the end of each major workflow step, rate your gstack experience 0-10. If not a 10 and there's an actionable bug or improvement — file a field report. -**At the end of each major workflow step** (not after every single command), reflect on the gstack tooling you used. Rate your experience 0 to 10. If it wasn't a 10, think about why. 
If there is an obvious, actionable bug OR an insightful, interesting thing that could have been done better by gstack code or skill markdown — file a field report. Maybe our contributor will help make us better! - -**Calibration — this is the bar:** For example, `$B js "await fetch(...)"` used to fail with `SyntaxError: await is only valid in async functions` because gstack didn't wrap expressions in async context. Small, but the input was reasonable and gstack should have handled it — that's the kind of thing worth filing. Things less consequential than this, ignore. - -**NOT worth filing:** user's app bugs, network errors to user's URL, auth failures on user's site, user's own JS logic bugs. - -**To file:** write `~/.gstack/contributor-logs/{slug}.md` with **all sections below** (do not truncate — include every section through the Date/Version footer): +**File only:** gstack tooling bugs where the input was reasonable but gstack failed. **Skip:** user app bugs, network errors, auth failures on user's site. +**To file:** write `~/.gstack/contributor-logs/{slug}.md`: ``` # {Title} - -Hey gstack team — ran into this while using /{skill-name}: - -**What I was trying to do:** {what the user/agent was attempting} -**What happened instead:** {what actually happened} -**My rating:** {0-10} — {one sentence on why it wasn't a 10} - -## Steps to reproduce +**What I tried:** {action} | **What happened:** {result} | **Rating:** {0-10} +## Repro 1. {step} - -## Raw output -``` -{paste the actual error or unexpected output here} -``` - ## What would make this a 10 -{one sentence: what gstack should have done differently} - -**Date:** {YYYY-MM-DD} | **Version:** {gstack version} | **Skill:** /{skill} +{one sentence} +**Date:** {YYYY-MM-DD} | **Version:** {version} | **Skill:** /{skill} ``` - -Slug: lowercase, hyphens, max 60 chars (e.g. `browse-js-no-await`). Skip if file already exists. Max 3 reports per session. File inline and continue — don't stop the workflow. 
Tell user: "Filed gstack field report: {title}" +Slug: lowercase hyphens, max 60 chars. Skip if exists. Max 3/session. File inline, don't stop. ## Completion Status Protocol diff --git a/qa/SKILL.md b/qa/SKILL.md index 290c89af..af9279c5 100644 --- a/qa/SKILL.md +++ b/qa/SKILL.md @@ -35,9 +35,11 @@ _SESSIONS=$(find ~/.gstack/sessions -mmin -120 -type f 2>/dev/null | wc -l | tr find ~/.gstack/sessions -mmin +120 -type f -delete 2>/dev/null || true _CONTRIB=$(~/.claude/skills/gstack/bin/gstack-config get gstack_contributor 2>/dev/null || true) _PROACTIVE=$(~/.claude/skills/gstack/bin/gstack-config get proactive 2>/dev/null || echo "true") +_PROACTIVE_PROMPTED=$([ -f ~/.gstack/.proactive-prompted ] && echo "yes" || echo "no") _BRANCH=$(git branch --show-current 2>/dev/null || echo "unknown") echo "BRANCH: $_BRANCH" echo "PROACTIVE: $_PROACTIVE" +echo "PROACTIVE_PROMPTED: $_PROACTIVE_PROMPTED" source <(~/.claude/skills/gstack/bin/gstack-repo-mode 2>/dev/null) || true REPO_MODE=${REPO_MODE:-unknown} echo "REPO_MODE: $REPO_MODE" @@ -55,8 +57,11 @@ echo '{"skill":"qa","ts":"'$(date -u +%Y-%m-%dT%H:%M:%SZ)'","repo":"'$(basename for _PF in $(find ~/.gstack/analytics -maxdepth 1 -name '.pending-*' 2>/dev/null); do [ -f "$_PF" ] && ~/.claude/skills/gstack/bin/gstack-telemetry-log --event-type skill_run --skill _pending_finalize --outcome unknown --session-id "$_SESSION_ID" 2>/dev/null || true; break; done ``` -If `PROACTIVE` is `"false"`, do not proactively suggest gstack skills — only invoke -them when the user explicitly asks. The user opted out of proactive suggestions. +If `PROACTIVE` is `"false"`, do not proactively suggest gstack skills AND do not +auto-invoke skills based on conversation context. Only run skills the user explicitly +types (e.g., /qa, /ship). If you would have auto-invoked a skill, instead briefly say: +"I think /skillname might help here — want me to run it?" and wait for confirmation. +The user opted out of proactive behavior. 
If output shows `UPGRADE_AVAILABLE <old> <new>`: read `~/.claude/skills/gstack/gstack-upgrade/SKILL.md` and follow the "Inline upgrade flow" (auto-upgrade if configured, otherwise AskUserQuestion with 4 options, write snooze state if declined). If `JUST_UPGRADED <from> <to>`: tell user "Running gstack v{to} (just updated!)" and continue. @@ -105,6 +110,27 @@ touch ~/.gstack/.telemetry-prompted This only happens once. If `TEL_PROMPTED` is `yes`, skip this entirely. +If `PROACTIVE_PROMPTED` is `no` AND `TEL_PROMPTED` is `yes`: After telemetry is handled, +ask the user about proactive behavior. Use AskUserQuestion: + +> gstack can proactively figure out when you might need a skill while you work — +> like suggesting /qa when you say "does this work?" or /investigate when you hit +> a bug. We recommend keeping this on — it speeds up every part of your workflow. + +Options: +- A) Keep it on (recommended) +- B) Turn it off — I'll type /commands myself + +If A: run `~/.claude/skills/gstack/bin/gstack-config set proactive true` +If B: run `~/.claude/skills/gstack/bin/gstack-config set proactive false` + +Always run: +```bash +touch ~/.gstack/.proactive-prompted +``` + +This only happens once. If `PROACTIVE_PROMPTED` is `yes`, skip this entirely. + ## AskUserQuestion Format **ALWAYS follow this structure for every AskUserQuestion call:** @@ -112,7 +138,6 @@ This only happens once. If `TEL_PROMPTED` is `yes`, skip this entirely. 2. **Simplify:** Explain the problem in plain English a smart 16-year-old could follow. No raw function names, no internal jargon, no implementation details. Use concrete examples and analogies. Say what it DOES, not what it's called. 3. **Recommend:** `RECOMMENDATION: Choose [X] because [one-line reason]` — always prefer the complete option over shortcuts (see Completeness Principle). Include `Completeness: X/10` for each option.
Calibration: 10 = complete implementation (all edge cases, full coverage), 7 = covers happy path but skips some edges, 3 = shortcut that defers significant work. If both options are 8+, pick the higher; if one is ≤5, flag it. 4. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` -5. **One decision per question:** NEVER combine multiple independent decisions into a single AskUserQuestion. Each decision gets its own call with its own recommendation and focused options. Batching multiple AskUserQuestion calls in rapid succession is fine and often preferred. Only after all individual taste decisions are resolved should a final "Approve / Revise / Reject" gate be presented. Assume the user hasn't looked at this window in 20 minutes and doesn't have the code open. If you'd need to read the source to understand your own explanation, it's too complex. @@ -120,97 +145,54 @@ Per-skill instructions may add additional formatting rules on top of this baseli ## Completeness Principle — Boil the Lake -AI-assisted coding makes the marginal cost of completeness near-zero. When you present options: +AI makes completeness near-free. Always recommend the complete option over shortcuts — the delta is minutes with CC+gstack. A "lake" (100% coverage, all edge cases) is boilable; an "ocean" (full rewrite, multi-quarter migration) is not. Boil lakes, flag oceans. -- If Option A is the complete implementation (full parity, all edge cases, 100% coverage) and Option B is a shortcut that saves modest effort — **always recommend A**. The delta between 80 lines and 150 lines is meaningless with CC+gstack. "Good enough" is the wrong instinct when "complete" costs minutes more. -- **Lake vs. ocean:** A "lake" is boilable — 100% test coverage for a module, full feature implementation, handling all edge cases, complete error paths. 
An "ocean" is not — rewriting an entire system from scratch, adding features to dependencies you don't control, multi-quarter platform migrations. Recommend boiling lakes. Flag oceans as out of scope. -- **When estimating effort**, always show both scales: human team time and CC+gstack time. The compression ratio varies by task type — use this reference: +**Effort reference** — always show both scales: | Task type | Human team | CC+gstack | Compression | |-----------|-----------|-----------|-------------| -| Boilerplate / scaffolding | 2 days | 15 min | ~100x | -| Test writing | 1 day | 15 min | ~50x | -| Feature implementation | 1 week | 30 min | ~30x | -| Bug fix + regression test | 4 hours | 15 min | ~20x | -| Architecture / design | 2 days | 4 hours | ~5x | -| Research / exploration | 1 day | 3 hours | ~3x | +| Boilerplate | 2 days | 15 min | ~100x | +| Tests | 1 day | 15 min | ~50x | +| Feature | 1 week | 30 min | ~30x | +| Bug fix | 4 hours | 15 min | ~20x | -- This principle applies to test coverage, error handling, documentation, edge cases, and feature completeness. Don't skip the last 10% to "save time" — with AI, that 10% costs seconds. +Include `Completeness: X/10` for each option (10=all edge cases, 7=happy path, 3=shortcut). -**Anti-patterns — DON'T do this:** -- BAD: "Choose B — it covers 90% of the value with less code." (If A is only 70 lines more, choose A.) -- BAD: "We can skip edge case handling to save time." (Edge case handling costs minutes with CC.) -- BAD: "Let's defer test coverage to a follow-up PR." (Tests are the cheapest lake to boil.) -- BAD: Quoting only human-team effort: "This would take 2 weeks." (Say: "2 weeks human / ~1 hour CC.") +## Repo Ownership — See Something, Say Something -## Repo Ownership Mode — See Something, Say Something +`REPO_MODE` controls how to handle issues outside your branch: +- **`solo`** — You own everything. Investigate and offer to fix proactively. 
+- **`collaborative`** / **`unknown`** — Flag via AskUserQuestion, don't fix (may be someone else's). -`REPO_MODE` from the preamble tells you who owns issues in this repo: - -- **`solo`** — One person does 80%+ of the work. They own everything. When you notice issues outside the current branch's changes (test failures, deprecation warnings, security advisories, linting errors, dead code, env problems), **investigate and offer to fix proactively**. The solo dev is the only person who will fix it. Default to action. -- **`collaborative`** — Multiple active contributors. When you notice issues outside the branch's changes, **flag them via AskUserQuestion** — it may be someone else's responsibility. Default to asking, not fixing. -- **`unknown`** — Treat as collaborative (safer default — ask before fixing). - -**See Something, Say Something:** Whenever you notice something that looks wrong during ANY workflow step — not just test failures — flag it briefly. One sentence: what you noticed and its impact. In solo mode, follow up with "Want me to fix it?" In collaborative mode, just flag it and move on. - -Never let a noticed issue silently pass. The whole point is proactive communication. +Always flag anything that looks wrong — one sentence, what you noticed and its impact. ## Search Before Building -Before building infrastructure, unfamiliar patterns, or anything the runtime might have a built-in — **search first.** Read `~/.claude/skills/gstack/ETHOS.md` for the full philosophy. +Before building anything unfamiliar, **search first.** See `~/.claude/skills/gstack/ETHOS.md`. +- **Layer 1** (tried and true) — don't reinvent. **Layer 2** (new and popular) — scrutinize. **Layer 3** (first principles) — prize above all. -**Three layers of knowledge:** -- **Layer 1** (tried and true — in distribution). Don't reinvent the wheel. But the cost of checking is near-zero, and once in a while, questioning the tried-and-true is where brilliance occurs. 
-- **Layer 2** (new and popular — search for these). But scrutinize: humans are subject to mania. Search results are inputs to your thinking, not answers. -- **Layer 3** (first principles — prize these above all). Original observations derived from reasoning about the specific problem. The most valuable of all. - -**Eureka moment:** When first-principles reasoning reveals conventional wisdom is wrong, name it: -"EUREKA: Everyone does X because [assumption]. But [evidence] shows this is wrong. Y is better because [reasoning]." - -Log eureka moments: +**Eureka:** When first-principles reasoning contradicts conventional wisdom, name it and log: ```bash jq -n --arg ts "$(date -u +%Y-%m-%dT%H:%M:%SZ)" --arg skill "SKILL_NAME" --arg branch "$(git branch --show-current 2>/dev/null)" --arg insight "ONE_LINE_SUMMARY" '{ts:$ts,skill:$skill,branch:$branch,insight:$insight}' >> ~/.gstack/analytics/eureka.jsonl 2>/dev/null || true ``` -Replace SKILL_NAME and ONE_LINE_SUMMARY. Runs inline — don't stop the workflow. - -**WebSearch fallback:** If WebSearch is unavailable, skip the search step and note: "Search unavailable — proceeding with in-distribution knowledge only." ## Contributor Mode -If `_CONTRIB` is `true`: you are in **contributor mode**. You're a gstack user who also helps make it better. +If `_CONTRIB` is `true`: you are in **contributor mode**. At the end of each major workflow step, rate your gstack experience 0-10. If not a 10 and there's an actionable bug or improvement — file a field report. -**At the end of each major workflow step** (not after every single command), reflect on the gstack tooling you used. Rate your experience 0 to 10. If it wasn't a 10, think about why. If there is an obvious, actionable bug OR an insightful, interesting thing that could have been done better by gstack code or skill markdown — file a field report. Maybe our contributor will help make us better! 
- -**Calibration — this is the bar:** For example, `$B js "await fetch(...)"` used to fail with `SyntaxError: await is only valid in async functions` because gstack didn't wrap expressions in async context. Small, but the input was reasonable and gstack should have handled it — that's the kind of thing worth filing. Things less consequential than this, ignore. - -**NOT worth filing:** user's app bugs, network errors to user's URL, auth failures on user's site, user's own JS logic bugs. - -**To file:** write `~/.gstack/contributor-logs/{slug}.md` with **all sections below** (do not truncate — include every section through the Date/Version footer): +**File only:** gstack tooling bugs where the input was reasonable but gstack failed. **Skip:** user app bugs, network errors, auth failures on user's site. +**To file:** write `~/.gstack/contributor-logs/{slug}.md`: ``` # {Title} - -Hey gstack team — ran into this while using /{skill-name}: - -**What I was trying to do:** {what the user/agent was attempting} -**What happened instead:** {what actually happened} -**My rating:** {0-10} — {one sentence on why it wasn't a 10} - -## Steps to reproduce +**What I tried:** {action} | **What happened:** {result} | **Rating:** {0-10} +## Repro 1. {step} - -## Raw output -``` -{paste the actual error or unexpected output here} -``` - ## What would make this a 10 -{one sentence: what gstack should have done differently} - -**Date:** {YYYY-MM-DD} | **Version:** {gstack version} | **Skill:** /{skill} +{one sentence} +**Date:** {YYYY-MM-DD} | **Version:** {version} | **Skill:** /{skill} ``` - -Slug: lowercase, hyphens, max 60 chars (e.g. `browse-js-no-await`). Skip if file already exists. Max 3 reports per session. File inline and continue — don't stop the workflow. Tell user: "Filed gstack field report: {title}" +Slug: lowercase hyphens, max 60 chars. Skip if exists. Max 3/session. File inline, don't stop. 
## Completion Status Protocol @@ -301,22 +283,42 @@ Then write a `## GSTACK REVIEW REPORT` section to the end of the plan file: file you are allowed to edit in plan mode. The plan file review report is part of the plan's living status. -## Step 0: Detect base branch +## Step 0: Detect platform and base branch -Determine which branch this PR targets. Use the result as "the base branch" in all subsequent steps. +First, detect the git hosting platform from the remote URL: -1. Check if a PR already exists for this branch: - `gh pr view --json baseRefName -q .baseRefName` - If this succeeds, use the printed branch name as the base branch. +```bash +git remote get-url origin 2>/dev/null +``` -2. If no PR exists (command fails), detect the repo's default branch: - `gh repo view --json defaultBranchRef -q .defaultBranchRef.name` +- If the URL contains "github.com" → platform is **GitHub** +- If the URL contains "gitlab" → platform is **GitLab** +- Otherwise, check CLI availability: + - `gh auth status 2>/dev/null` succeeds → platform is **GitHub** (covers GitHub Enterprise) + - `glab auth status 2>/dev/null` succeeds → platform is **GitLab** (covers self-hosted) + - Neither → **unknown** (use git-native commands only) -3. If both commands fail, fall back to `main`. +Determine which branch this PR/MR targets, or the repo's default branch if no +PR/MR exists. Use the result as "the base branch" in all subsequent steps. + +**If GitHub:** +1. `gh pr view --json baseRefName -q .baseRefName` — if succeeds, use it +2. `gh repo view --json defaultBranchRef -q .defaultBranchRef.name` — if succeeds, use it + +**If GitLab:** +1. `glab mr view -F json 2>/dev/null` and extract the `target_branch` field — if succeeds, use it +2. `glab repo view -F json 2>/dev/null` and extract the `default_branch` field — if succeeds, use it + +**Git-native fallback (if unknown platform, or CLI commands fail):** +1. 
`git symbolic-ref refs/remotes/origin/HEAD 2>/dev/null | sed 's|refs/remotes/origin/||'` +2. If that fails: `git rev-parse --verify origin/main 2>/dev/null` → use `main` +3. If that fails: `git rev-parse --verify origin/master 2>/dev/null` → use `master` + +If all fail, fall back to `main`. Print the detected base branch name. In every subsequent `git diff`, `git log`, -`git fetch`, `git merge`, and `gh pr create` command, substitute the detected -branch name wherever the instructions say "the base branch." +`git fetch`, `git merge`, and PR/MR creation command, substitute the detected +branch name wherever the instructions say "the base branch" or ``. --- diff --git a/retro/SKILL.md b/retro/SKILL.md index 806ffde3..8741fb30 100644 --- a/retro/SKILL.md +++ b/retro/SKILL.md @@ -29,9 +29,11 @@ _SESSIONS=$(find ~/.gstack/sessions -mmin -120 -type f 2>/dev/null | wc -l | tr find ~/.gstack/sessions -mmin +120 -type f -delete 2>/dev/null || true _CONTRIB=$(~/.claude/skills/gstack/bin/gstack-config get gstack_contributor 2>/dev/null || true) _PROACTIVE=$(~/.claude/skills/gstack/bin/gstack-config get proactive 2>/dev/null || echo "true") +_PROACTIVE_PROMPTED=$([ -f ~/.gstack/.proactive-prompted ] && echo "yes" || echo "no") _BRANCH=$(git branch --show-current 2>/dev/null || echo "unknown") echo "BRANCH: $_BRANCH" echo "PROACTIVE: $_PROACTIVE" +echo "PROACTIVE_PROMPTED: $_PROACTIVE_PROMPTED" source <(~/.claude/skills/gstack/bin/gstack-repo-mode 2>/dev/null) || true REPO_MODE=${REPO_MODE:-unknown} echo "REPO_MODE: $REPO_MODE" @@ -49,8 +51,11 @@ echo '{"skill":"retro","ts":"'$(date -u +%Y-%m-%dT%H:%M:%SZ)'","repo":"'$(basena for _PF in $(find ~/.gstack/analytics -maxdepth 1 -name '.pending-*' 2>/dev/null); do [ -f "$_PF" ] && ~/.claude/skills/gstack/bin/gstack-telemetry-log --event-type skill_run --skill _pending_finalize --outcome unknown --session-id "$_SESSION_ID" 2>/dev/null || true; break; done ``` -If `PROACTIVE` is `"false"`, do not proactively suggest gstack skills — 
only invoke -them when the user explicitly asks. The user opted out of proactive suggestions. +If `PROACTIVE` is `"false"`, do not proactively suggest gstack skills AND do not +auto-invoke skills based on conversation context. Only run skills the user explicitly +types (e.g., /qa, /ship). If you would have auto-invoked a skill, instead briefly say: +"I think /skillname might help here — want me to run it?" and wait for confirmation. +The user opted out of proactive behavior. If output shows `UPGRADE_AVAILABLE <old> <new>`: read `~/.claude/skills/gstack/gstack-upgrade/SKILL.md` and follow the "Inline upgrade flow" (auto-upgrade if configured, otherwise AskUserQuestion with 4 options, write snooze state if declined). If `JUST_UPGRADED <from> <to>`: tell user "Running gstack v{to} (just updated!)" and continue. @@ -99,6 +104,27 @@ touch ~/.gstack/.telemetry-prompted This only happens once. If `TEL_PROMPTED` is `yes`, skip this entirely. +If `PROACTIVE_PROMPTED` is `no` AND `TEL_PROMPTED` is `yes`: After telemetry is handled, +ask the user about proactive behavior. Use AskUserQuestion: + +> gstack can proactively figure out when you might need a skill while you work — +> like suggesting /qa when you say "does this work?" or /investigate when you hit +> a bug. We recommend keeping this on — it speeds up every part of your workflow. + +Options: +- A) Keep it on (recommended) +- B) Turn it off — I'll type /commands myself + +If A: run `~/.claude/skills/gstack/bin/gstack-config set proactive true` +If B: run `~/.claude/skills/gstack/bin/gstack-config set proactive false` + +Always run: +```bash +touch ~/.gstack/.proactive-prompted +``` + +This only happens once. If `PROACTIVE_PROMPTED` is `yes`, skip this entirely. + ## AskUserQuestion Format **ALWAYS follow this structure for every AskUserQuestion call:** @@ -106,7 +132,6 @@ This only happens once. If `TEL_PROMPTED` is `yes`, skip this entirely. 2. **Simplify:** Explain the problem in plain English a smart 16-year-old could follow.
No raw function names, no internal jargon, no implementation details. Use concrete examples and analogies. Say what it DOES, not what it's called. 3. **Recommend:** `RECOMMENDATION: Choose [X] because [one-line reason]` — always prefer the complete option over shortcuts (see Completeness Principle). Include `Completeness: X/10` for each option. Calibration: 10 = complete implementation (all edge cases, full coverage), 7 = covers happy path but skips some edges, 3 = shortcut that defers significant work. If both options are 8+, pick the higher; if one is ≤5, flag it. 4. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` -5. **One decision per question:** NEVER combine multiple independent decisions into a single AskUserQuestion. Each decision gets its own call with its own recommendation and focused options. Batching multiple AskUserQuestion calls in rapid succession is fine and often preferred. Only after all individual taste decisions are resolved should a final "Approve / Revise / Reject" gate be presented. Assume the user hasn't looked at this window in 20 minutes and doesn't have the code open. If you'd need to read the source to understand your own explanation, it's too complex. @@ -114,97 +139,36 @@ Per-skill instructions may add additional formatting rules on top of this baseli ## Completeness Principle — Boil the Lake -AI-assisted coding makes the marginal cost of completeness near-zero. When you present options: +AI makes completeness near-free. Always recommend the complete option over shortcuts — the delta is minutes with CC+gstack. A "lake" (100% coverage, all edge cases) is boilable; an "ocean" (full rewrite, multi-quarter migration) is not. Boil lakes, flag oceans. -- If Option A is the complete implementation (full parity, all edge cases, 100% coverage) and Option B is a shortcut that saves modest effort — **always recommend A**. 
The delta between 80 lines and 150 lines is meaningless with CC+gstack. "Good enough" is the wrong instinct when "complete" costs minutes more. -- **Lake vs. ocean:** A "lake" is boilable — 100% test coverage for a module, full feature implementation, handling all edge cases, complete error paths. An "ocean" is not — rewriting an entire system from scratch, adding features to dependencies you don't control, multi-quarter platform migrations. Recommend boiling lakes. Flag oceans as out of scope. -- **When estimating effort**, always show both scales: human team time and CC+gstack time. The compression ratio varies by task type — use this reference: +**Effort reference** — always show both scales: | Task type | Human team | CC+gstack | Compression | |-----------|-----------|-----------|-------------| -| Boilerplate / scaffolding | 2 days | 15 min | ~100x | -| Test writing | 1 day | 15 min | ~50x | -| Feature implementation | 1 week | 30 min | ~30x | -| Bug fix + regression test | 4 hours | 15 min | ~20x | -| Architecture / design | 2 days | 4 hours | ~5x | -| Research / exploration | 1 day | 3 hours | ~3x | +| Boilerplate | 2 days | 15 min | ~100x | +| Tests | 1 day | 15 min | ~50x | +| Feature | 1 week | 30 min | ~30x | +| Bug fix | 4 hours | 15 min | ~20x | -- This principle applies to test coverage, error handling, documentation, edge cases, and feature completeness. Don't skip the last 10% to "save time" — with AI, that 10% costs seconds. - -**Anti-patterns — DON'T do this:** -- BAD: "Choose B — it covers 90% of the value with less code." (If A is only 70 lines more, choose A.) -- BAD: "We can skip edge case handling to save time." (Edge case handling costs minutes with CC.) -- BAD: "Let's defer test coverage to a follow-up PR." (Tests are the cheapest lake to boil.) -- BAD: Quoting only human-team effort: "This would take 2 weeks." 
(Say: "2 weeks human / ~1 hour CC.") - -## Repo Ownership Mode — See Something, Say Something - -`REPO_MODE` from the preamble tells you who owns issues in this repo: - -- **`solo`** — One person does 80%+ of the work. They own everything. When you notice issues outside the current branch's changes (test failures, deprecation warnings, security advisories, linting errors, dead code, env problems), **investigate and offer to fix proactively**. The solo dev is the only person who will fix it. Default to action. -- **`collaborative`** — Multiple active contributors. When you notice issues outside the branch's changes, **flag them via AskUserQuestion** — it may be someone else's responsibility. Default to asking, not fixing. -- **`unknown`** — Treat as collaborative (safer default — ask before fixing). - -**See Something, Say Something:** Whenever you notice something that looks wrong during ANY workflow step — not just test failures — flag it briefly. One sentence: what you noticed and its impact. In solo mode, follow up with "Want me to fix it?" In collaborative mode, just flag it and move on. - -Never let a noticed issue silently pass. The whole point is proactive communication. - -## Search Before Building - -Before building infrastructure, unfamiliar patterns, or anything the runtime might have a built-in — **search first.** Read `~/.claude/skills/gstack/ETHOS.md` for the full philosophy. - -**Three layers of knowledge:** -- **Layer 1** (tried and true — in distribution). Don't reinvent the wheel. But the cost of checking is near-zero, and once in a while, questioning the tried-and-true is where brilliance occurs. -- **Layer 2** (new and popular — search for these). But scrutinize: humans are subject to mania. Search results are inputs to your thinking, not answers. -- **Layer 3** (first principles — prize these above all). Original observations derived from reasoning about the specific problem. The most valuable of all. 
- -**Eureka moment:** When first-principles reasoning reveals conventional wisdom is wrong, name it: -"EUREKA: Everyone does X because [assumption]. But [evidence] shows this is wrong. Y is better because [reasoning]." - -Log eureka moments: -```bash -jq -n --arg ts "$(date -u +%Y-%m-%dT%H:%M:%SZ)" --arg skill "SKILL_NAME" --arg branch "$(git branch --show-current 2>/dev/null)" --arg insight "ONE_LINE_SUMMARY" '{ts:$ts,skill:$skill,branch:$branch,insight:$insight}' >> ~/.gstack/analytics/eureka.jsonl 2>/dev/null || true -``` -Replace SKILL_NAME and ONE_LINE_SUMMARY. Runs inline — don't stop the workflow. - -**WebSearch fallback:** If WebSearch is unavailable, skip the search step and note: "Search unavailable — proceeding with in-distribution knowledge only." +Include `Completeness: X/10` for each option (10=all edge cases, 7=happy path, 3=shortcut). ## Contributor Mode -If `_CONTRIB` is `true`: you are in **contributor mode**. You're a gstack user who also helps make it better. +If `_CONTRIB` is `true`: you are in **contributor mode**. At the end of each major workflow step, rate your gstack experience 0-10. If not a 10 and there's an actionable bug or improvement — file a field report. -**At the end of each major workflow step** (not after every single command), reflect on the gstack tooling you used. Rate your experience 0 to 10. If it wasn't a 10, think about why. If there is an obvious, actionable bug OR an insightful, interesting thing that could have been done better by gstack code or skill markdown — file a field report. Maybe our contributor will help make us better! - -**Calibration — this is the bar:** For example, `$B js "await fetch(...)"` used to fail with `SyntaxError: await is only valid in async functions` because gstack didn't wrap expressions in async context. Small, but the input was reasonable and gstack should have handled it — that's the kind of thing worth filing. Things less consequential than this, ignore. 
- -**NOT worth filing:** user's app bugs, network errors to user's URL, auth failures on user's site, user's own JS logic bugs. - -**To file:** write `~/.gstack/contributor-logs/{slug}.md` with **all sections below** (do not truncate — include every section through the Date/Version footer): +**File only:** gstack tooling bugs where the input was reasonable but gstack failed. **Skip:** user app bugs, network errors, auth failures on user's site. +**To file:** write `~/.gstack/contributor-logs/{slug}.md`: ``` # {Title} - -Hey gstack team — ran into this while using /{skill-name}: - -**What I was trying to do:** {what the user/agent was attempting} -**What happened instead:** {what actually happened} -**My rating:** {0-10} — {one sentence on why it wasn't a 10} - -## Steps to reproduce +**What I tried:** {action} | **What happened:** {result} | **Rating:** {0-10} +## Repro 1. {step} - -## Raw output -``` -{paste the actual error or unexpected output here} -``` - ## What would make this a 10 -{one sentence: what gstack should have done differently} - -**Date:** {YYYY-MM-DD} | **Version:** {gstack version} | **Skill:** /{skill} +{one sentence} +**Date:** {YYYY-MM-DD} | **Version:** {version} | **Skill:** /{skill} ``` - -Slug: lowercase, hyphens, max 60 chars (e.g. `browse-js-no-await`). Skip if file already exists. Max 3 reports per session. File inline and continue — don't stop the workflow. Tell user: "Filed gstack field report: {title}" +Slug: lowercase hyphens, max 60 chars. Skip if exists. Max 3/session. File inline, don't stop. ## Completion Status Protocol @@ -295,13 +259,42 @@ Then write a `## GSTACK REVIEW REPORT` section to the end of the plan file: file you are allowed to edit in plan mode. The plan file review report is part of the plan's living status. 
-## Detect default branch +## Step 0: Detect platform and base branch -Before gathering data, detect the repo's default branch name: -`gh repo view --json defaultBranchRef -q .defaultBranchRef.name` +First, detect the git hosting platform from the remote URL: -If this fails, fall back to `main`. Use the detected name wherever the instructions -say `origin/<default>` below. +```bash +git remote get-url origin 2>/dev/null +``` + +- If the URL contains "github.com" → platform is **GitHub** +- If the URL contains "gitlab" → platform is **GitLab** +- Otherwise, check CLI availability: + - `gh auth status 2>/dev/null` succeeds → platform is **GitHub** (covers GitHub Enterprise) + - `glab auth status 2>/dev/null` succeeds → platform is **GitLab** (covers self-hosted) + - Neither → **unknown** (use git-native commands only) + +Determine which branch this PR/MR targets, or the repo's default branch if no +PR/MR exists. Use the result as "the base branch" in all subsequent steps. + +**If GitHub:** +1. `gh pr view --json baseRefName -q .baseRefName` — if succeeds, use it +2. `gh repo view --json defaultBranchRef -q .defaultBranchRef.name` — if succeeds, use it + +**If GitLab:** +1. `glab mr view -F json 2>/dev/null` and extract the `target_branch` field — if succeeds, use it +2. `glab repo view -F json 2>/dev/null` and extract the `default_branch` field — if succeeds, use it + +**Git-native fallback (if unknown platform, or CLI commands fail):** +1. `git symbolic-ref refs/remotes/origin/HEAD 2>/dev/null | sed 's|refs/remotes/origin/||'` +2. If that fails: `git rev-parse --verify origin/main 2>/dev/null` → use `main` +3. If that fails: `git rev-parse --verify origin/master 2>/dev/null` → use `master` + +If all fail, fall back to `main`. + +Print the detected base branch name. In every subsequent `git diff`, `git log`, +`git fetch`, `git merge`, and PR/MR creation command, substitute the detected +branch name wherever the instructions say "the base branch" or `<base>`. 
--- @@ -372,8 +365,8 @@ git log origin/ --since="" --format="%at|%aN|%ai|%s" | sort -n # 4. Files most frequently changed (hotspot analysis) git log origin/ --since="" --format="" --name-only | grep -v '^$' | sort | uniq -c | sort -rn -# 5. PR numbers from commit messages (extract #NNN patterns) -git log origin/ --since="" --format="%s" | grep -oE '#[0-9]+' | sed 's/^#//' | sort -n | uniq | sed 's/^/#/' +# 5. PR/MR numbers from commit messages (GitHub #NNN, GitLab !NNN) +git log origin/ --since="" --format="%s" | grep -oE '[#!][0-9]+' | sort -t'#' -k1 | uniq # 6. Per-author file hotspots (who touches what) git log origin/ --since="" --format="AUTHOR:%aN" --name-only @@ -866,8 +859,8 @@ git -C log origin/$DEFAULT --since="T00:00:00" --format="%at| # Per-author commit counts git -C shortlog origin/$DEFAULT --since="T00:00:00" -sn --no-merges -# PR numbers from commit messages -git -C log origin/$DEFAULT --since="T00:00:00" --format="%s" | grep -oE '#[0-9]+' | sort -n | uniq +# PR/MR numbers from commit messages (GitHub #NNN, GitLab !NNN) +git -C log origin/$DEFAULT --since="T00:00:00" --format="%s" | grep -oE '[#!][0-9]+' | sort -t'#' -k1 | uniq ``` For repos that fail (deleted paths, network errors): skip and note "N repos could not be reached." @@ -945,7 +938,7 @@ align cleanly. Never truncate project names. 
║ • [1-line description of second theme] ║ • [1-line description of third theme] ║ -║ Powered by gstack · github.com/garrytan/gstack +║ Powered by gstack ╚═══════════════════════════════════════════════════════════════ ``` @@ -1074,7 +1067,7 @@ Use the Write tool to save JSON to `~/.gstack/retros/global-${today}-${next}.jso "projects": [ { "name": "gstack", - "remote": "https://github.com/garrytan/gstack", + "remote": "", "commits": 47, "insertions": 3200, "deletions": 800, diff --git a/retro/SKILL.md.tmpl b/retro/SKILL.md.tmpl index dae967ef..cc4f53fa 100644 --- a/retro/SKILL.md.tmpl +++ b/retro/SKILL.md.tmpl @@ -18,15 +18,7 @@ allowed-tools: {{PREAMBLE}} -## Detect default branch - -Before gathering data, detect the repo's default branch name: -`gh repo view --json defaultBranchRef -q .defaultBranchRef.name` - -If this fails, fall back to `main`. Use the detected name wherever the instructions -say `origin/` below. - ---- +{{BASE_BRANCH_DETECT}} # /retro — Weekly Engineering Retrospective @@ -95,8 +87,8 @@ git log origin/ --since="" --format="%at|%aN|%ai|%s" | sort -n # 4. Files most frequently changed (hotspot analysis) git log origin/ --since="" --format="" --name-only | grep -v '^$' | sort | uniq -c | sort -rn -# 5. PR numbers from commit messages (extract #NNN patterns) -git log origin/ --since="" --format="%s" | grep -oE '#[0-9]+' | sed 's/^#//' | sort -n | uniq | sed 's/^/#/' +# 5. PR/MR numbers from commit messages (GitHub #NNN, GitLab !NNN) +git log origin/ --since="" --format="%s" | grep -oE '[#!][0-9]+' | sort -t'#' -k1 | uniq # 6. 
Per-author file hotspots (who touches what) git log origin/ --since="" --format="AUTHOR:%aN" --name-only @@ -589,8 +581,8 @@ git -C log origin/$DEFAULT --since="T00:00:00" --format="%at| # Per-author commit counts git -C shortlog origin/$DEFAULT --since="T00:00:00" -sn --no-merges -# PR numbers from commit messages -git -C log origin/$DEFAULT --since="T00:00:00" --format="%s" | grep -oE '#[0-9]+' | sort -n | uniq +# PR/MR numbers from commit messages (GitHub #NNN, GitLab !NNN) +git -C log origin/$DEFAULT --since="T00:00:00" --format="%s" | grep -oE '[#!][0-9]+' | sort -t'#' -k1 | uniq ``` For repos that fail (deleted paths, network errors): skip and note "N repos could not be reached." @@ -668,7 +660,7 @@ align cleanly. Never truncate project names. ║ • [1-line description of second theme] ║ • [1-line description of third theme] ║ -║ Powered by gstack · github.com/garrytan/gstack +║ Powered by gstack ╚═══════════════════════════════════════════════════════════════ ``` @@ -797,7 +789,7 @@ Use the Write tool to save JSON to `~/.gstack/retros/global-${today}-${next}.jso "projects": [ { "name": "gstack", - "remote": "https://github.com/garrytan/gstack", + "remote": "", "commits": 47, "insertions": 3200, "deletions": 800, diff --git a/review/SKILL.md b/review/SKILL.md index a58e8627..3c28ed6c 100644 --- a/review/SKILL.md +++ b/review/SKILL.md @@ -32,9 +32,11 @@ _SESSIONS=$(find ~/.gstack/sessions -mmin -120 -type f 2>/dev/null | wc -l | tr find ~/.gstack/sessions -mmin +120 -type f -delete 2>/dev/null || true _CONTRIB=$(~/.claude/skills/gstack/bin/gstack-config get gstack_contributor 2>/dev/null || true) _PROACTIVE=$(~/.claude/skills/gstack/bin/gstack-config get proactive 2>/dev/null || echo "true") +_PROACTIVE_PROMPTED=$([ -f ~/.gstack/.proactive-prompted ] && echo "yes" || echo "no") _BRANCH=$(git branch --show-current 2>/dev/null || echo "unknown") echo "BRANCH: $_BRANCH" echo "PROACTIVE: $_PROACTIVE" +echo "PROACTIVE_PROMPTED: $_PROACTIVE_PROMPTED" source 
<(~/.claude/skills/gstack/bin/gstack-repo-mode 2>/dev/null) || true REPO_MODE=${REPO_MODE:-unknown} echo "REPO_MODE: $REPO_MODE" @@ -52,8 +54,11 @@ echo '{"skill":"review","ts":"'$(date -u +%Y-%m-%dT%H:%M:%SZ)'","repo":"'$(basen for _PF in $(find ~/.gstack/analytics -maxdepth 1 -name '.pending-*' 2>/dev/null); do [ -f "$_PF" ] && ~/.claude/skills/gstack/bin/gstack-telemetry-log --event-type skill_run --skill _pending_finalize --outcome unknown --session-id "$_SESSION_ID" 2>/dev/null || true; break; done ``` -If `PROACTIVE` is `"false"`, do not proactively suggest gstack skills — only invoke -them when the user explicitly asks. The user opted out of proactive suggestions. +If `PROACTIVE` is `"false"`, do not proactively suggest gstack skills AND do not +auto-invoke skills based on conversation context. Only run skills the user explicitly +types (e.g., /qa, /ship). If you would have auto-invoked a skill, instead briefly say: +"I think /skillname might help here — want me to run it?" and wait for confirmation. +The user opted out of proactive behavior. If output shows `UPGRADE_AVAILABLE `: read `~/.claude/skills/gstack/gstack-upgrade/SKILL.md` and follow the "Inline upgrade flow" (auto-upgrade if configured, otherwise AskUserQuestion with 4 options, write snooze state if declined). If `JUST_UPGRADED `: tell user "Running gstack v{to} (just updated!)" and continue. @@ -102,6 +107,27 @@ touch ~/.gstack/.telemetry-prompted This only happens once. If `TEL_PROMPTED` is `yes`, skip this entirely. +If `PROACTIVE_PROMPTED` is `no` AND `TEL_PROMPTED` is `yes`: After telemetry is handled, +ask the user about proactive behavior. Use AskUserQuestion: + +> gstack can proactively figure out when you might need a skill while you work — +> like suggesting /qa when you say "does this work?" or /investigate when you hit +> a bug. We recommend keeping this on — it speeds up every part of your workflow. 
+ +Options: +- A) Keep it on (recommended) +- B) Turn it off — I'll type /commands myself + +If A: run `~/.claude/skills/gstack/bin/gstack-config set proactive true` +If B: run `~/.claude/skills/gstack/bin/gstack-config set proactive false` + +Always run: +```bash +touch ~/.gstack/.proactive-prompted +``` + +This only happens once. If `PROACTIVE_PROMPTED` is `yes`, skip this entirely. + ## AskUserQuestion Format **ALWAYS follow this structure for every AskUserQuestion call:** @@ -109,7 +135,6 @@ This only happens once. If `TEL_PROMPTED` is `yes`, skip this entirely. 2. **Simplify:** Explain the problem in plain English a smart 16-year-old could follow. No raw function names, no internal jargon, no implementation details. Use concrete examples and analogies. Say what it DOES, not what it's called. 3. **Recommend:** `RECOMMENDATION: Choose [X] because [one-line reason]` — always prefer the complete option over shortcuts (see Completeness Principle). Include `Completeness: X/10` for each option. Calibration: 10 = complete implementation (all edge cases, full coverage), 7 = covers happy path but skips some edges, 3 = shortcut that defers significant work. If both options are 8+, pick the higher; if one is ≤5, flag it. 4. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` -5. **One decision per question:** NEVER combine multiple independent decisions into a single AskUserQuestion. Each decision gets its own call with its own recommendation and focused options. Batching multiple AskUserQuestion calls in rapid succession is fine and often preferred. Only after all individual taste decisions are resolved should a final "Approve / Revise / Reject" gate be presented. Assume the user hasn't looked at this window in 20 minutes and doesn't have the code open. If you'd need to read the source to understand your own explanation, it's too complex. 
@@ -117,97 +142,54 @@ Per-skill instructions may add additional formatting rules on top of this baseli ## Completeness Principle — Boil the Lake -AI-assisted coding makes the marginal cost of completeness near-zero. When you present options: +AI makes completeness near-free. Always recommend the complete option over shortcuts — the delta is minutes with CC+gstack. A "lake" (100% coverage, all edge cases) is boilable; an "ocean" (full rewrite, multi-quarter migration) is not. Boil lakes, flag oceans. -- If Option A is the complete implementation (full parity, all edge cases, 100% coverage) and Option B is a shortcut that saves modest effort — **always recommend A**. The delta between 80 lines and 150 lines is meaningless with CC+gstack. "Good enough" is the wrong instinct when "complete" costs minutes more. -- **Lake vs. ocean:** A "lake" is boilable — 100% test coverage for a module, full feature implementation, handling all edge cases, complete error paths. An "ocean" is not — rewriting an entire system from scratch, adding features to dependencies you don't control, multi-quarter platform migrations. Recommend boiling lakes. Flag oceans as out of scope. -- **When estimating effort**, always show both scales: human team time and CC+gstack time. 
The compression ratio varies by task type — use this reference: +**Effort reference** — always show both scales: | Task type | Human team | CC+gstack | Compression | |-----------|-----------|-----------|-------------| -| Boilerplate / scaffolding | 2 days | 15 min | ~100x | -| Test writing | 1 day | 15 min | ~50x | -| Feature implementation | 1 week | 30 min | ~30x | -| Bug fix + regression test | 4 hours | 15 min | ~20x | -| Architecture / design | 2 days | 4 hours | ~5x | -| Research / exploration | 1 day | 3 hours | ~3x | +| Boilerplate | 2 days | 15 min | ~100x | +| Tests | 1 day | 15 min | ~50x | +| Feature | 1 week | 30 min | ~30x | +| Bug fix | 4 hours | 15 min | ~20x | -- This principle applies to test coverage, error handling, documentation, edge cases, and feature completeness. Don't skip the last 10% to "save time" — with AI, that 10% costs seconds. +Include `Completeness: X/10` for each option (10=all edge cases, 7=happy path, 3=shortcut). -**Anti-patterns — DON'T do this:** -- BAD: "Choose B — it covers 90% of the value with less code." (If A is only 70 lines more, choose A.) -- BAD: "We can skip edge case handling to save time." (Edge case handling costs minutes with CC.) -- BAD: "Let's defer test coverage to a follow-up PR." (Tests are the cheapest lake to boil.) -- BAD: Quoting only human-team effort: "This would take 2 weeks." (Say: "2 weeks human / ~1 hour CC.") +## Repo Ownership — See Something, Say Something -## Repo Ownership Mode — See Something, Say Something +`REPO_MODE` controls how to handle issues outside your branch: +- **`solo`** — You own everything. Investigate and offer to fix proactively. +- **`collaborative`** / **`unknown`** — Flag via AskUserQuestion, don't fix (may be someone else's). -`REPO_MODE` from the preamble tells you who owns issues in this repo: - -- **`solo`** — One person does 80%+ of the work. They own everything. 
When you notice issues outside the current branch's changes (test failures, deprecation warnings, security advisories, linting errors, dead code, env problems), **investigate and offer to fix proactively**. The solo dev is the only person who will fix it. Default to action. -- **`collaborative`** — Multiple active contributors. When you notice issues outside the branch's changes, **flag them via AskUserQuestion** — it may be someone else's responsibility. Default to asking, not fixing. -- **`unknown`** — Treat as collaborative (safer default — ask before fixing). - -**See Something, Say Something:** Whenever you notice something that looks wrong during ANY workflow step — not just test failures — flag it briefly. One sentence: what you noticed and its impact. In solo mode, follow up with "Want me to fix it?" In collaborative mode, just flag it and move on. - -Never let a noticed issue silently pass. The whole point is proactive communication. +Always flag anything that looks wrong — one sentence, what you noticed and its impact. ## Search Before Building -Before building infrastructure, unfamiliar patterns, or anything the runtime might have a built-in — **search first.** Read `~/.claude/skills/gstack/ETHOS.md` for the full philosophy. +Before building anything unfamiliar, **search first.** See `~/.claude/skills/gstack/ETHOS.md`. +- **Layer 1** (tried and true) — don't reinvent. **Layer 2** (new and popular) — scrutinize. **Layer 3** (first principles) — prize above all. -**Three layers of knowledge:** -- **Layer 1** (tried and true — in distribution). Don't reinvent the wheel. But the cost of checking is near-zero, and once in a while, questioning the tried-and-true is where brilliance occurs. -- **Layer 2** (new and popular — search for these). But scrutinize: humans are subject to mania. Search results are inputs to your thinking, not answers. -- **Layer 3** (first principles — prize these above all). 
Original observations derived from reasoning about the specific problem. The most valuable of all. - -**Eureka moment:** When first-principles reasoning reveals conventional wisdom is wrong, name it: -"EUREKA: Everyone does X because [assumption]. But [evidence] shows this is wrong. Y is better because [reasoning]." - -Log eureka moments: +**Eureka:** When first-principles reasoning contradicts conventional wisdom, name it and log: ```bash jq -n --arg ts "$(date -u +%Y-%m-%dT%H:%M:%SZ)" --arg skill "SKILL_NAME" --arg branch "$(git branch --show-current 2>/dev/null)" --arg insight "ONE_LINE_SUMMARY" '{ts:$ts,skill:$skill,branch:$branch,insight:$insight}' >> ~/.gstack/analytics/eureka.jsonl 2>/dev/null || true ``` -Replace SKILL_NAME and ONE_LINE_SUMMARY. Runs inline — don't stop the workflow. - -**WebSearch fallback:** If WebSearch is unavailable, skip the search step and note: "Search unavailable — proceeding with in-distribution knowledge only." ## Contributor Mode -If `_CONTRIB` is `true`: you are in **contributor mode**. You're a gstack user who also helps make it better. +If `_CONTRIB` is `true`: you are in **contributor mode**. At the end of each major workflow step, rate your gstack experience 0-10. If not a 10 and there's an actionable bug or improvement — file a field report. -**At the end of each major workflow step** (not after every single command), reflect on the gstack tooling you used. Rate your experience 0 to 10. If it wasn't a 10, think about why. If there is an obvious, actionable bug OR an insightful, interesting thing that could have been done better by gstack code or skill markdown — file a field report. Maybe our contributor will help make us better! - -**Calibration — this is the bar:** For example, `$B js "await fetch(...)"` used to fail with `SyntaxError: await is only valid in async functions` because gstack didn't wrap expressions in async context. 
Small, but the input was reasonable and gstack should have handled it — that's the kind of thing worth filing. Things less consequential than this, ignore. - -**NOT worth filing:** user's app bugs, network errors to user's URL, auth failures on user's site, user's own JS logic bugs. - -**To file:** write `~/.gstack/contributor-logs/{slug}.md` with **all sections below** (do not truncate — include every section through the Date/Version footer): +**File only:** gstack tooling bugs where the input was reasonable but gstack failed. **Skip:** user app bugs, network errors, auth failures on user's site. +**To file:** write `~/.gstack/contributor-logs/{slug}.md`: ``` # {Title} - -Hey gstack team — ran into this while using /{skill-name}: - -**What I was trying to do:** {what the user/agent was attempting} -**What happened instead:** {what actually happened} -**My rating:** {0-10} — {one sentence on why it wasn't a 10} - -## Steps to reproduce +**What I tried:** {action} | **What happened:** {result} | **Rating:** {0-10} +## Repro 1. {step} - -## Raw output -``` -{paste the actual error or unexpected output here} -``` - ## What would make this a 10 -{one sentence: what gstack should have done differently} - -**Date:** {YYYY-MM-DD} | **Version:** {gstack version} | **Skill:** /{skill} +{one sentence} +**Date:** {YYYY-MM-DD} | **Version:** {version} | **Skill:** /{skill} ``` - -Slug: lowercase, hyphens, max 60 chars (e.g. `browse-js-no-await`). Skip if file already exists. Max 3 reports per session. File inline and continue — don't stop the workflow. Tell user: "Filed gstack field report: {title}" +Slug: lowercase hyphens, max 60 chars. Skip if exists. Max 3/session. File inline, don't stop. ## Completion Status Protocol @@ -298,22 +280,42 @@ Then write a `## GSTACK REVIEW REPORT` section to the end of the plan file: file you are allowed to edit in plan mode. The plan file review report is part of the plan's living status. 
-## Step 0: Detect base branch +## Step 0: Detect platform and base branch -Determine which branch this PR targets. Use the result as "the base branch" in all subsequent steps. +First, detect the git hosting platform from the remote URL: -1. Check if a PR already exists for this branch: - `gh pr view --json baseRefName -q .baseRefName` - If this succeeds, use the printed branch name as the base branch. +```bash +git remote get-url origin 2>/dev/null +``` -2. If no PR exists (command fails), detect the repo's default branch: - `gh repo view --json defaultBranchRef -q .defaultBranchRef.name` +- If the URL contains "github.com" → platform is **GitHub** +- If the URL contains "gitlab" → platform is **GitLab** +- Otherwise, check CLI availability: + - `gh auth status 2>/dev/null` succeeds → platform is **GitHub** (covers GitHub Enterprise) + - `glab auth status 2>/dev/null` succeeds → platform is **GitLab** (covers self-hosted) + - Neither → **unknown** (use git-native commands only) -3. If both commands fail, fall back to `main`. +Determine which branch this PR/MR targets, or the repo's default branch if no +PR/MR exists. Use the result as "the base branch" in all subsequent steps. + +**If GitHub:** +1. `gh pr view --json baseRefName -q .baseRefName` — if succeeds, use it +2. `gh repo view --json defaultBranchRef -q .defaultBranchRef.name` — if succeeds, use it + +**If GitLab:** +1. `glab mr view -F json 2>/dev/null` and extract the `target_branch` field — if succeeds, use it +2. `glab repo view -F json 2>/dev/null` and extract the `default_branch` field — if succeeds, use it + +**Git-native fallback (if unknown platform, or CLI commands fail):** +1. `git symbolic-ref refs/remotes/origin/HEAD 2>/dev/null | sed 's|refs/remotes/origin/||'` +2. If that fails: `git rev-parse --verify origin/main 2>/dev/null` → use `main` +3. If that fails: `git rev-parse --verify origin/master 2>/dev/null` → use `master` + +If all fail, fall back to `main`. 
Print the detected base branch name. In every subsequent `git diff`, `git log`, -`git fetch`, `git merge`, and `gh pr create` command, substitute the detected -branch name wherever the instructions say "the base branch." +`git fetch`, `git merge`, and PR/MR creation command, substitute the detected +branch name wherever the instructions say "the base branch" or `<base>`. --- @@ -573,7 +575,7 @@ If Codex is available, run a lightweight design check on the diff: ```bash TMPERR_DRL=$(mktemp /tmp/codex-drl-XXXXXXXX) -codex exec "Review the git diff on this branch. Run 7 litmus checks (YES/NO each): 1. Brand/product unmistakable in first screen? 2. One strong visual anchor present? 3. Page understandable by scanning headlines only? 4. Each section has one job? 5. Are cards actually necessary? 6. Does motion improve hierarchy or atmosphere? 7. Would design feel premium with all decorative shadows removed? Flag any hard rejections: 1. Generic SaaS card grid as first impression 2. Beautiful image with weak brand 3. Strong headline with no clear action 4. Busy imagery behind text 5. Sections repeating same mood statement 6. Carousel with no narrative purpose 7. App UI made of stacked cards instead of layout 5 most important design findings only. Reference file:line." -s read-only -c 'model_reasoning_effort="high"' --enable web_search_cached 2>"$TMPERR_DRL" +codex exec "Review the git diff on this branch. Run 7 litmus checks (YES/NO each): 1. Brand/product unmistakable in first screen? 2. One strong visual anchor present? 3. Page understandable by scanning headlines only? 4. Each section has one job? 5. Are cards actually necessary? 6. Does motion improve hierarchy or atmosphere? 7. Would design feel premium with all decorative shadows removed? Flag any hard rejections: 1. Generic SaaS card grid as first impression 2. Beautiful image with weak brand 3. Strong headline with no clear action 4. Busy imagery behind text 5. Sections repeating same mood statement 6. 
Carousel with no narrative purpose 7. App UI made of stacked cards instead of layout 5 most important design findings only. Reference file:line." -C "$(git rev-parse --show-toplevel)" -s read-only -c 'model_reasoning_effort="high"' --enable web_search_cached 2>"$TMPERR_DRL" ``` Use a 5-minute timeout (`timeout: 300000`). After the command completes, read stderr: @@ -762,6 +764,21 @@ If no test framework detected → include gaps as INFORMATIONAL findings only, n **Diff is test-only changes:** Skip Step 4.75 entirely: "No new application code paths to audit." +### Coverage Warning + +After producing the coverage diagram, check the coverage percentage. Read CLAUDE.md for a `## Test Coverage` section with a `Minimum:` field. If not found, use default: 60%. + +If coverage is below the minimum threshold, output a prominent warning **before** the regular review findings: + +``` +⚠️ COVERAGE WARNING: AI-assessed coverage is {X}%. {N} code paths untested. +Consider writing tests before running /ship. +``` + +This is INFORMATIONAL — does not block /review. But it makes low coverage visible early so the developer can address it before reaching the /ship coverage gate. + +If coverage percentage cannot be determined, skip the warning silently. + This step subsumes the "Test Gaps" category from Pass 2 — do not duplicate findings between the checklist Test Gaps item and this coverage diagram. Include any coverage gaps alongside the findings from Step 4 and Step 4.5. They follow the same Fix-First flow — gaps are INFORMATIONAL findings. --- @@ -916,7 +933,7 @@ Claude's structured review already ran. Now add a **cross-model adversarial chal ```bash TMPERR_ADV=$(mktemp /tmp/codex-adv-XXXXXXXX) -codex exec "Review the changes on this branch against the base branch. Run git diff origin/ to see the diff. Your job is to find ways this code will fail in production. Think like an attacker and a chaos engineer. 
Find edge cases, race conditions, security holes, resource leaks, failure modes, and silent data corruption paths. Be adversarial. Be thorough. No compliments — just the problems." -s read-only -c 'model_reasoning_effort="xhigh"' --enable web_search_cached 2>"$TMPERR_ADV" +codex exec "Review the changes on this branch against the base branch. Run git diff origin/ to see the diff. Your job is to find ways this code will fail in production. Think like an attacker and a chaos engineer. Find edge cases, race conditions, security holes, resource leaks, failure modes, and silent data corruption paths. Be adversarial. Be thorough. No compliments — just the problems." -C "$(git rev-parse --show-toplevel)" -s read-only -c 'model_reasoning_effort="xhigh"' --enable web_search_cached 2>"$TMPERR_ADV" ``` Set the Bash tool's `timeout` parameter to `300000` (5 minutes). Do NOT use the `timeout` shell command — it doesn't exist on macOS. After the command completes, read stderr: diff --git a/scripts/gen-skill-docs.ts b/scripts/gen-skill-docs.ts index 8d483dad..81cd7476 100644 --- a/scripts/gen-skill-docs.ts +++ b/scripts/gen-skill-docs.ts @@ -34,35 +34,7 @@ const HOST: Host = (() => { throw new Error(`Unknown host: ${val}. 
Use claude, codex, or agents.`); })(); -interface HostPaths { - skillRoot: string; - localSkillRoot: string; - binDir: string; - browseDir: string; -} - -const HOST_PATHS: Record = { - claude: { - skillRoot: '~/.claude/skills/gstack', - localSkillRoot: '.claude/skills/gstack', - binDir: '~/.claude/skills/gstack/bin', - browseDir: '~/.claude/skills/gstack/browse/dist', - }, - codex: { - skillRoot: '$GSTACK_ROOT', - localSkillRoot: '.agents/skills/gstack', - binDir: '$GSTACK_BIN', - browseDir: '$GSTACK_BROWSE', - }, -}; - -interface TemplateContext { - skillName: string; - tmplPath: string; - benefitsFrom?: string[]; - host: Host; - paths: HostPaths; -} +// HostPaths, HOST_PATHS, and TemplateContext imported from ./resolvers/types (line 7-8) // ─── Shared Design Constants ──────────────────────────────── @@ -620,22 +592,42 @@ If \`NEEDS_SETUP\`: } function generateBaseBranchDetect(_ctx: TemplateContext): string { - return `## Step 0: Detect base branch + return `## Step 0: Detect platform and base branch -Determine which branch this PR targets. Use the result as "the base branch" in all subsequent steps. +First, detect the git hosting platform from the remote URL: -1. Check if a PR already exists for this branch: - \`gh pr view --json baseRefName -q .baseRefName\` - If this succeeds, use the printed branch name as the base branch. +\`\`\`bash +git remote get-url origin 2>/dev/null +\`\`\` -2. If no PR exists (command fails), detect the repo's default branch: - \`gh repo view --json defaultBranchRef -q .defaultBranchRef.name\` +- If the URL contains "github.com" → platform is **GitHub** +- If the URL contains "gitlab" → platform is **GitLab** +- Otherwise, check CLI availability: + - \`gh auth status 2>/dev/null\` succeeds → platform is **GitHub** (covers GitHub Enterprise) + - \`glab auth status 2>/dev/null\` succeeds → platform is **GitLab** (covers self-hosted) + - Neither → **unknown** (use git-native commands only) -3. 
If both commands fail, fall back to \`main\`. +Determine which branch this PR/MR targets, or the repo's default branch if no +PR/MR exists. Use the result as "the base branch" in all subsequent steps. + +**If GitHub:** +1. \`gh pr view --json baseRefName -q .baseRefName\` — if succeeds, use it +2. \`gh repo view --json defaultBranchRef -q .defaultBranchRef.name\` — if succeeds, use it + +**If GitLab:** +1. \`glab mr view -F json 2>/dev/null\` and extract the \`target_branch\` field — if succeeds, use it +2. \`glab repo view -F json 2>/dev/null\` and extract the \`default_branch\` field — if succeeds, use it + +**Git-native fallback (if unknown platform, or CLI commands fail):** +1. \`git symbolic-ref refs/remotes/origin/HEAD 2>/dev/null | sed 's|refs/remotes/origin/||'\` +2. If that fails: \`git rev-parse --verify origin/main 2>/dev/null\` → use \`main\` +3. If that fails: \`git rev-parse --verify origin/master 2>/dev/null\` → use \`master\` + +If all fail, fall back to \`main\`. Print the detected base branch name. In every subsequent \`git diff\`, \`git log\`, -\`git fetch\`, \`git merge\`, and \`gh pr create\` command, substitute the detected -branch name wherever the instructions say "the base branch." +\`git fetch\`, \`git merge\`, and PR/MR creation command, substitute the detected +branch name wherever the instructions say "the base branch" or \`\`. 
---`; } @@ -2793,46 +2785,7 @@ ${slopItems} Source: [OpenAI "Designing Delightful Frontends with GPT-5.4"](https://developers.openai.com/blog/designing-delightful-frontends-with-gpt-5-4) (Mar 2026) + gstack design methodology.`; } -function generateSlugEval(ctx: TemplateContext): string { - return `eval "$(${ctx.paths.binDir}/gstack-slug 2>/dev/null)"`; -} - -function generateSlugSetup(ctx: TemplateContext): string { - return `eval "$(${ctx.paths.binDir}/gstack-slug 2>/dev/null)" && mkdir -p ~/.gstack/projects/$SLUG`; -} - -const RESOLVERS: Record string> = { - SLUG_EVAL: generateSlugEval, - SLUG_SETUP: generateSlugSetup, - COMMAND_REFERENCE: generateCommandReference, - SNAPSHOT_FLAGS: generateSnapshotFlags, - PREAMBLE: generatePreamble, - BROWSE_SETUP: generateBrowseSetup, - BASE_BRANCH_DETECT: generateBaseBranchDetect, - QA_METHODOLOGY: generateQAMethodology, - DESIGN_METHODOLOGY: generateDesignMethodology, - DESIGN_HARD_RULES: generateDesignHardRules, - DESIGN_OUTSIDE_VOICES: generateDesignOutsideVoices, - DESIGN_REVIEW_LITE: generateDesignReviewLite, - REVIEW_DASHBOARD: generateReviewDashboard, - PLAN_FILE_REVIEW_REPORT: generatePlanFileReviewReport, - TEST_BOOTSTRAP: generateTestBootstrap, - TEST_COVERAGE_AUDIT_PLAN: generateTestCoverageAuditPlan, - TEST_COVERAGE_AUDIT_SHIP: generateTestCoverageAuditShip, - TEST_COVERAGE_AUDIT_REVIEW: generateTestCoverageAuditReview, - TEST_FAILURE_TRIAGE: generateTestFailureTriage, - SPEC_REVIEW_LOOP: generateSpecReviewLoop, - DESIGN_SKETCH: generateDesignSketch, - BENEFITS_FROM: generateBenefitsFrom, - CODEX_SECOND_OPINION: generateCodexSecondOpinion, - CODEX_REVIEW_STEP: generateAdversarialStep, - ADVERSARIAL_STEP: generateAdversarialStep, - DEPLOY_BOOTSTRAP: generateDeployBootstrap, - CODEX_PLAN_REVIEW: generateCodexPlanReview, - PLAN_COMPLETION_AUDIT_SHIP: generatePlanCompletionAuditShip, - PLAN_COMPLETION_AUDIT_REVIEW: generatePlanCompletionAuditReview, - PLAN_VERIFICATION_EXEC: generatePlanVerificationExec, -}; +// 
RESOLVERS imported from ./resolvers/index (line 19) — do not redeclare here // ─── Codex Helpers ─────────────────────────────────────────── diff --git a/scripts/resolvers/preamble.ts b/scripts/resolvers/preamble.ts index 76573422..44126771 100644 --- a/scripts/resolvers/preamble.ts +++ b/scripts/resolvers/preamble.ts @@ -250,14 +250,22 @@ Use AskUserQuestion: git log --format="%an (%ae)" -1 -- \`\`\` If these are different people, prefer the production code author — they likely introduced the regression. -- Create a GitHub issue assigned to that person: - \`\`\`bash - gh issue create \\ - --title "Pre-existing test failure: " \\ - --body "Found failing on branch . Failure is pre-existing.\\n\\n**Error:**\\n\`\`\`\\n\\n\`\`\`\\n\\n**Last modified by:** \\n**Noticed by:** gstack /ship on " \\ - --assignee "" - \`\`\` -- If \`gh\` is not available or \`--assignee\` fails (user not in org, etc.), create the issue without assignee and note who should look at it in the body. +- Create an issue assigned to that person (use the platform detected in Step 0): + - **If GitHub:** + \`\`\`bash + gh issue create \\ + --title "Pre-existing test failure: " \\ + --body "Found failing on branch . Failure is pre-existing.\\n\\n**Error:**\\n\`\`\`\\n\\n\`\`\`\\n\\n**Last modified by:** \\n**Noticed by:** gstack /ship on " \\ + --assignee "" + \`\`\` + - **If GitLab:** + \`\`\`bash + glab issue create \\ + -t "Pre-existing test failure: " \\ + -d "Found failing on branch . Failure is pre-existing.\\n\\n**Error:**\\n\`\`\`\\n\\n\`\`\`\\n\\n**Last modified by:** \\n**Noticed by:** gstack /ship on " \\ + -a "" + \`\`\` +- If neither CLI is available or \`--assignee\`/\`-a\` fails (user not in org, etc.), create the issue without assignee and note who should look at it in the body. - Continue with the workflow. 
**If "Skip":** diff --git a/scripts/resolvers/utility.ts b/scripts/resolvers/utility.ts index 03e72e21..6cd912f2 100644 --- a/scripts/resolvers/utility.ts +++ b/scripts/resolvers/utility.ts @@ -9,22 +9,42 @@ export function generateSlugSetup(ctx: TemplateContext): string { } export function generateBaseBranchDetect(_ctx: TemplateContext): string { - return `## Step 0: Detect base branch + return `## Step 0: Detect platform and base branch -Determine which branch this PR targets. Use the result as "the base branch" in all subsequent steps. +First, detect the git hosting platform from the remote URL: -1. Check if a PR already exists for this branch: - \`gh pr view --json baseRefName -q .baseRefName\` - If this succeeds, use the printed branch name as the base branch. +\`\`\`bash +git remote get-url origin 2>/dev/null +\`\`\` -2. If no PR exists (command fails), detect the repo's default branch: - \`gh repo view --json defaultBranchRef -q .defaultBranchRef.name\` +- If the URL contains "github.com" → platform is **GitHub** +- If the URL contains "gitlab" → platform is **GitLab** +- Otherwise, check CLI availability: + - \`gh auth status 2>/dev/null\` succeeds → platform is **GitHub** (covers GitHub Enterprise) + - \`glab auth status 2>/dev/null\` succeeds → platform is **GitLab** (covers self-hosted) + - Neither → **unknown** (use git-native commands only) -3. If both commands fail, fall back to \`main\`. +Determine which branch this PR/MR targets, or the repo's default branch if no +PR/MR exists. Use the result as "the base branch" in all subsequent steps. + +**If GitHub:** +1. \`gh pr view --json baseRefName -q .baseRefName\` — if succeeds, use it +2. \`gh repo view --json defaultBranchRef -q .defaultBranchRef.name\` — if succeeds, use it + +**If GitLab:** +1. \`glab mr view -F json 2>/dev/null\` and extract the \`target_branch\` field — if succeeds, use it +2. 
\`glab repo view -F json 2>/dev/null\` and extract the \`default_branch\` field — if succeeds, use it + +**Git-native fallback (if unknown platform, or CLI commands fail):** +1. \`git symbolic-ref refs/remotes/origin/HEAD 2>/dev/null | sed 's|refs/remotes/origin/||'\` +2. If that fails: \`git rev-parse --verify origin/main 2>/dev/null\` → use \`main\` +3. If that fails: \`git rev-parse --verify origin/master 2>/dev/null\` → use \`master\` + +If all fail, fall back to \`main\`. Print the detected base branch name. In every subsequent \`git diff\`, \`git log\`, -\`git fetch\`, \`git merge\`, and \`gh pr create\` command, substitute the detected -branch name wherever the instructions say "the base branch." +\`git fetch\`, \`git merge\`, and PR/MR creation command, substitute the detected +branch name wherever the instructions say "the base branch" or \`\`. ---`; } diff --git a/setup-browser-cookies/SKILL.md b/setup-browser-cookies/SKILL.md index 85c1ce20..85815c91 100644 --- a/setup-browser-cookies/SKILL.md +++ b/setup-browser-cookies/SKILL.md @@ -26,9 +26,11 @@ _SESSIONS=$(find ~/.gstack/sessions -mmin -120 -type f 2>/dev/null | wc -l | tr find ~/.gstack/sessions -mmin +120 -type f -delete 2>/dev/null || true _CONTRIB=$(~/.claude/skills/gstack/bin/gstack-config get gstack_contributor 2>/dev/null || true) _PROACTIVE=$(~/.claude/skills/gstack/bin/gstack-config get proactive 2>/dev/null || echo "true") +_PROACTIVE_PROMPTED=$([ -f ~/.gstack/.proactive-prompted ] && echo "yes" || echo "no") _BRANCH=$(git branch --show-current 2>/dev/null || echo "unknown") echo "BRANCH: $_BRANCH" echo "PROACTIVE: $_PROACTIVE" +echo "PROACTIVE_PROMPTED: $_PROACTIVE_PROMPTED" source <(~/.claude/skills/gstack/bin/gstack-repo-mode 2>/dev/null) || true REPO_MODE=${REPO_MODE:-unknown} echo "REPO_MODE: $REPO_MODE" @@ -46,8 +48,11 @@ echo '{"skill":"setup-browser-cookies","ts":"'$(date -u +%Y-%m-%dT%H:%M:%SZ)'"," for _PF in $(find ~/.gstack/analytics -maxdepth 1 -name '.pending-*' 2>/dev/null); 
do [ -f "$_PF" ] && ~/.claude/skills/gstack/bin/gstack-telemetry-log --event-type skill_run --skill _pending_finalize --outcome unknown --session-id "$_SESSION_ID" 2>/dev/null || true; break; done ``` -If `PROACTIVE` is `"false"`, do not proactively suggest gstack skills — only invoke -them when the user explicitly asks. The user opted out of proactive suggestions. +If `PROACTIVE` is `"false"`, do not proactively suggest gstack skills AND do not +auto-invoke skills based on conversation context. Only run skills the user explicitly +types (e.g., /qa, /ship). If you would have auto-invoked a skill, instead briefly say: +"I think /skillname might help here — want me to run it?" and wait for confirmation. +The user opted out of proactive behavior. If output shows `UPGRADE_AVAILABLE `: read `~/.claude/skills/gstack/gstack-upgrade/SKILL.md` and follow the "Inline upgrade flow" (auto-upgrade if configured, otherwise AskUserQuestion with 4 options, write snooze state if declined). If `JUST_UPGRADED `: tell user "Running gstack v{to} (just updated!)" and continue. @@ -96,112 +101,44 @@ touch ~/.gstack/.telemetry-prompted This only happens once. If `TEL_PROMPTED` is `yes`, skip this entirely. -## AskUserQuestion Format +If `PROACTIVE_PROMPTED` is `no` AND `TEL_PROMPTED` is `yes`: After telemetry is handled, +ask the user about proactive behavior. Use AskUserQuestion: -**ALWAYS follow this structure for every AskUserQuestion call:** -1. **Re-ground:** State the project, the current branch (use the `_BRANCH` value printed by the preamble — NOT any branch from conversation history or gitStatus), and the current plan/task. (1-2 sentences) -2. **Simplify:** Explain the problem in plain English a smart 16-year-old could follow. No raw function names, no internal jargon, no implementation details. Use concrete examples and analogies. Say what it DOES, not what it's called. -3. 
**Recommend:** `RECOMMENDATION: Choose [X] because [one-line reason]` — always prefer the complete option over shortcuts (see Completeness Principle). Include `Completeness: X/10` for each option. Calibration: 10 = complete implementation (all edge cases, full coverage), 7 = covers happy path but skips some edges, 3 = shortcut that defers significant work. If both options are 8+, pick the higher; if one is ≤5, flag it. -4. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` -5. **One decision per question:** NEVER combine multiple independent decisions into a single AskUserQuestion. Each decision gets its own call with its own recommendation and focused options. Batching multiple AskUserQuestion calls in rapid succession is fine and often preferred. Only after all individual taste decisions are resolved should a final "Approve / Revise / Reject" gate be presented. +> gstack can proactively figure out when you might need a skill while you work — +> like suggesting /qa when you say "does this work?" or /investigate when you hit +> a bug. We recommend keeping this on — it speeds up every part of your workflow. -Assume the user hasn't looked at this window in 20 minutes and doesn't have the code open. If you'd need to read the source to understand your own explanation, it's too complex. +Options: +- A) Keep it on (recommended) +- B) Turn it off — I'll type /commands myself -Per-skill instructions may add additional formatting rules on top of this baseline. +If A: run `~/.claude/skills/gstack/bin/gstack-config set proactive true` +If B: run `~/.claude/skills/gstack/bin/gstack-config set proactive false` -## Completeness Principle — Boil the Lake - -AI-assisted coding makes the marginal cost of completeness near-zero. 
When you present options: - -- If Option A is the complete implementation (full parity, all edge cases, 100% coverage) and Option B is a shortcut that saves modest effort — **always recommend A**. The delta between 80 lines and 150 lines is meaningless with CC+gstack. "Good enough" is the wrong instinct when "complete" costs minutes more. -- **Lake vs. ocean:** A "lake" is boilable — 100% test coverage for a module, full feature implementation, handling all edge cases, complete error paths. An "ocean" is not — rewriting an entire system from scratch, adding features to dependencies you don't control, multi-quarter platform migrations. Recommend boiling lakes. Flag oceans as out of scope. -- **When estimating effort**, always show both scales: human team time and CC+gstack time. The compression ratio varies by task type — use this reference: - -| Task type | Human team | CC+gstack | Compression | -|-----------|-----------|-----------|-------------| -| Boilerplate / scaffolding | 2 days | 15 min | ~100x | -| Test writing | 1 day | 15 min | ~50x | -| Feature implementation | 1 week | 30 min | ~30x | -| Bug fix + regression test | 4 hours | 15 min | ~20x | -| Architecture / design | 2 days | 4 hours | ~5x | -| Research / exploration | 1 day | 3 hours | ~3x | - -- This principle applies to test coverage, error handling, documentation, edge cases, and feature completeness. Don't skip the last 10% to "save time" — with AI, that 10% costs seconds. - -**Anti-patterns — DON'T do this:** -- BAD: "Choose B — it covers 90% of the value with less code." (If A is only 70 lines more, choose A.) -- BAD: "We can skip edge case handling to save time." (Edge case handling costs minutes with CC.) -- BAD: "Let's defer test coverage to a follow-up PR." (Tests are the cheapest lake to boil.) -- BAD: Quoting only human-team effort: "This would take 2 weeks." 
(Say: "2 weeks human / ~1 hour CC.") - -## Repo Ownership Mode — See Something, Say Something - -`REPO_MODE` from the preamble tells you who owns issues in this repo: - -- **`solo`** — One person does 80%+ of the work. They own everything. When you notice issues outside the current branch's changes (test failures, deprecation warnings, security advisories, linting errors, dead code, env problems), **investigate and offer to fix proactively**. The solo dev is the only person who will fix it. Default to action. -- **`collaborative`** — Multiple active contributors. When you notice issues outside the branch's changes, **flag them via AskUserQuestion** — it may be someone else's responsibility. Default to asking, not fixing. -- **`unknown`** — Treat as collaborative (safer default — ask before fixing). - -**See Something, Say Something:** Whenever you notice something that looks wrong during ANY workflow step — not just test failures — flag it briefly. One sentence: what you noticed and its impact. In solo mode, follow up with "Want me to fix it?" In collaborative mode, just flag it and move on. - -Never let a noticed issue silently pass. The whole point is proactive communication. - -## Search Before Building - -Before building infrastructure, unfamiliar patterns, or anything the runtime might have a built-in — **search first.** Read `~/.claude/skills/gstack/ETHOS.md` for the full philosophy. - -**Three layers of knowledge:** -- **Layer 1** (tried and true — in distribution). Don't reinvent the wheel. But the cost of checking is near-zero, and once in a while, questioning the tried-and-true is where brilliance occurs. -- **Layer 2** (new and popular — search for these). But scrutinize: humans are subject to mania. Search results are inputs to your thinking, not answers. -- **Layer 3** (first principles — prize these above all). Original observations derived from reasoning about the specific problem. The most valuable of all. 
- -**Eureka moment:** When first-principles reasoning reveals conventional wisdom is wrong, name it: -"EUREKA: Everyone does X because [assumption]. But [evidence] shows this is wrong. Y is better because [reasoning]." - -Log eureka moments: +Always run: ```bash -jq -n --arg ts "$(date -u +%Y-%m-%dT%H:%M:%SZ)" --arg skill "SKILL_NAME" --arg branch "$(git branch --show-current 2>/dev/null)" --arg insight "ONE_LINE_SUMMARY" '{ts:$ts,skill:$skill,branch:$branch,insight:$insight}' >> ~/.gstack/analytics/eureka.jsonl 2>/dev/null || true +touch ~/.gstack/.proactive-prompted ``` -Replace SKILL_NAME and ONE_LINE_SUMMARY. Runs inline — don't stop the workflow. -**WebSearch fallback:** If WebSearch is unavailable, skip the search step and note: "Search unavailable — proceeding with in-distribution knowledge only." +This only happens once. If `PROACTIVE_PROMPTED` is `yes`, skip this entirely. ## Contributor Mode -If `_CONTRIB` is `true`: you are in **contributor mode**. You're a gstack user who also helps make it better. +If `_CONTRIB` is `true`: you are in **contributor mode**. At the end of each major workflow step, rate your gstack experience 0-10. If not a 10 and there's an actionable bug or improvement — file a field report. -**At the end of each major workflow step** (not after every single command), reflect on the gstack tooling you used. Rate your experience 0 to 10. If it wasn't a 10, think about why. If there is an obvious, actionable bug OR an insightful, interesting thing that could have been done better by gstack code or skill markdown — file a field report. Maybe our contributor will help make us better! - -**Calibration — this is the bar:** For example, `$B js "await fetch(...)"` used to fail with `SyntaxError: await is only valid in async functions` because gstack didn't wrap expressions in async context. Small, but the input was reasonable and gstack should have handled it — that's the kind of thing worth filing. Things less consequential than this, ignore. 
- -**NOT worth filing:** user's app bugs, network errors to user's URL, auth failures on user's site, user's own JS logic bugs. - -**To file:** write `~/.gstack/contributor-logs/{slug}.md` with **all sections below** (do not truncate — include every section through the Date/Version footer): +**File only:** gstack tooling bugs where the input was reasonable but gstack failed. **Skip:** user app bugs, network errors, auth failures on user's site. +**To file:** write `~/.gstack/contributor-logs/{slug}.md`: ``` # {Title} - -Hey gstack team — ran into this while using /{skill-name}: - -**What I was trying to do:** {what the user/agent was attempting} -**What happened instead:** {what actually happened} -**My rating:** {0-10} — {one sentence on why it wasn't a 10} - -## Steps to reproduce +**What I tried:** {action} | **What happened:** {result} | **Rating:** {0-10} +## Repro 1. {step} - -## Raw output -``` -{paste the actual error or unexpected output here} -``` - ## What would make this a 10 -{one sentence: what gstack should have done differently} - -**Date:** {YYYY-MM-DD} | **Version:** {gstack version} | **Skill:** /{skill} +{one sentence} +**Date:** {YYYY-MM-DD} | **Version:** {version} | **Skill:** /{skill} ``` - -Slug: lowercase, hyphens, max 60 chars (e.g. `browse-js-no-await`). Skip if file already exists. Max 3 reports per session. File inline and continue — don't stop the workflow. Tell user: "Filed gstack field report: {title}" +Slug: lowercase hyphens, max 60 chars. Skip if exists. Max 3/session. File inline, don't stop. 
## Completion Status Protocol diff --git a/setup-deploy/SKILL.md b/setup-deploy/SKILL.md index 9eba4479..e5c94278 100644 --- a/setup-deploy/SKILL.md +++ b/setup-deploy/SKILL.md @@ -32,9 +32,11 @@ _SESSIONS=$(find ~/.gstack/sessions -mmin -120 -type f 2>/dev/null | wc -l | tr find ~/.gstack/sessions -mmin +120 -type f -delete 2>/dev/null || true _CONTRIB=$(~/.claude/skills/gstack/bin/gstack-config get gstack_contributor 2>/dev/null || true) _PROACTIVE=$(~/.claude/skills/gstack/bin/gstack-config get proactive 2>/dev/null || echo "true") +_PROACTIVE_PROMPTED=$([ -f ~/.gstack/.proactive-prompted ] && echo "yes" || echo "no") _BRANCH=$(git branch --show-current 2>/dev/null || echo "unknown") echo "BRANCH: $_BRANCH" echo "PROACTIVE: $_PROACTIVE" +echo "PROACTIVE_PROMPTED: $_PROACTIVE_PROMPTED" source <(~/.claude/skills/gstack/bin/gstack-repo-mode 2>/dev/null) || true REPO_MODE=${REPO_MODE:-unknown} echo "REPO_MODE: $REPO_MODE" @@ -52,8 +54,11 @@ echo '{"skill":"setup-deploy","ts":"'$(date -u +%Y-%m-%dT%H:%M:%SZ)'","repo":"'$ for _PF in $(find ~/.gstack/analytics -maxdepth 1 -name '.pending-*' 2>/dev/null); do [ -f "$_PF" ] && ~/.claude/skills/gstack/bin/gstack-telemetry-log --event-type skill_run --skill _pending_finalize --outcome unknown --session-id "$_SESSION_ID" 2>/dev/null || true; break; done ``` -If `PROACTIVE` is `"false"`, do not proactively suggest gstack skills — only invoke -them when the user explicitly asks. The user opted out of proactive suggestions. +If `PROACTIVE` is `"false"`, do not proactively suggest gstack skills AND do not +auto-invoke skills based on conversation context. Only run skills the user explicitly +types (e.g., /qa, /ship). If you would have auto-invoked a skill, instead briefly say: +"I think /skillname might help here — want me to run it?" and wait for confirmation. +The user opted out of proactive behavior. 
If output shows `UPGRADE_AVAILABLE `: read `~/.claude/skills/gstack/gstack-upgrade/SKILL.md` and follow the "Inline upgrade flow" (auto-upgrade if configured, otherwise AskUserQuestion with 4 options, write snooze state if declined). If `JUST_UPGRADED `: tell user "Running gstack v{to} (just updated!)" and continue. @@ -102,6 +107,27 @@ touch ~/.gstack/.telemetry-prompted This only happens once. If `TEL_PROMPTED` is `yes`, skip this entirely. +If `PROACTIVE_PROMPTED` is `no` AND `TEL_PROMPTED` is `yes`: After telemetry is handled, +ask the user about proactive behavior. Use AskUserQuestion: + +> gstack can proactively figure out when you might need a skill while you work — +> like suggesting /qa when you say "does this work?" or /investigate when you hit +> a bug. We recommend keeping this on — it speeds up every part of your workflow. + +Options: +- A) Keep it on (recommended) +- B) Turn it off — I'll type /commands myself + +If A: run `~/.claude/skills/gstack/bin/gstack-config set proactive true` +If B: run `~/.claude/skills/gstack/bin/gstack-config set proactive false` + +Always run: +```bash +touch ~/.gstack/.proactive-prompted +``` + +This only happens once. If `PROACTIVE_PROMPTED` is `yes`, skip this entirely. + ## AskUserQuestion Format **ALWAYS follow this structure for every AskUserQuestion call:** @@ -109,7 +135,6 @@ This only happens once. If `TEL_PROMPTED` is `yes`, skip this entirely. 2. **Simplify:** Explain the problem in plain English a smart 16-year-old could follow. No raw function names, no internal jargon, no implementation details. Use concrete examples and analogies. Say what it DOES, not what it's called. 3. **Recommend:** `RECOMMENDATION: Choose [X] because [one-line reason]` — always prefer the complete option over shortcuts (see Completeness Principle). Include `Completeness: X/10` for each option. 
Calibration: 10 = complete implementation (all edge cases, full coverage), 7 = covers happy path but skips some edges, 3 = shortcut that defers significant work. If both options are 8+, pick the higher; if one is ≤5, flag it. 4. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` -5. **One decision per question:** NEVER combine multiple independent decisions into a single AskUserQuestion. Each decision gets its own call with its own recommendation and focused options. Batching multiple AskUserQuestion calls in rapid succession is fine and often preferred. Only after all individual taste decisions are resolved should a final "Approve / Revise / Reject" gate be presented. Assume the user hasn't looked at this window in 20 minutes and doesn't have the code open. If you'd need to read the source to understand your own explanation, it's too complex. @@ -117,97 +142,36 @@ Per-skill instructions may add additional formatting rules on top of this baseli ## Completeness Principle — Boil the Lake -AI-assisted coding makes the marginal cost of completeness near-zero. When you present options: +AI makes completeness near-free. Always recommend the complete option over shortcuts — the delta is minutes with CC+gstack. A "lake" (100% coverage, all edge cases) is boilable; an "ocean" (full rewrite, multi-quarter migration) is not. Boil lakes, flag oceans. -- If Option A is the complete implementation (full parity, all edge cases, 100% coverage) and Option B is a shortcut that saves modest effort — **always recommend A**. The delta between 80 lines and 150 lines is meaningless with CC+gstack. "Good enough" is the wrong instinct when "complete" costs minutes more. -- **Lake vs. ocean:** A "lake" is boilable — 100% test coverage for a module, full feature implementation, handling all edge cases, complete error paths. 
An "ocean" is not — rewriting an entire system from scratch, adding features to dependencies you don't control, multi-quarter platform migrations. Recommend boiling lakes. Flag oceans as out of scope. -- **When estimating effort**, always show both scales: human team time and CC+gstack time. The compression ratio varies by task type — use this reference: +**Effort reference** — always show both scales: | Task type | Human team | CC+gstack | Compression | |-----------|-----------|-----------|-------------| -| Boilerplate / scaffolding | 2 days | 15 min | ~100x | -| Test writing | 1 day | 15 min | ~50x | -| Feature implementation | 1 week | 30 min | ~30x | -| Bug fix + regression test | 4 hours | 15 min | ~20x | -| Architecture / design | 2 days | 4 hours | ~5x | -| Research / exploration | 1 day | 3 hours | ~3x | +| Boilerplate | 2 days | 15 min | ~100x | +| Tests | 1 day | 15 min | ~50x | +| Feature | 1 week | 30 min | ~30x | +| Bug fix | 4 hours | 15 min | ~20x | -- This principle applies to test coverage, error handling, documentation, edge cases, and feature completeness. Don't skip the last 10% to "save time" — with AI, that 10% costs seconds. - -**Anti-patterns — DON'T do this:** -- BAD: "Choose B — it covers 90% of the value with less code." (If A is only 70 lines more, choose A.) -- BAD: "We can skip edge case handling to save time." (Edge case handling costs minutes with CC.) -- BAD: "Let's defer test coverage to a follow-up PR." (Tests are the cheapest lake to boil.) -- BAD: Quoting only human-team effort: "This would take 2 weeks." (Say: "2 weeks human / ~1 hour CC.") - -## Repo Ownership Mode — See Something, Say Something - -`REPO_MODE` from the preamble tells you who owns issues in this repo: - -- **`solo`** — One person does 80%+ of the work. They own everything. 
When you notice issues outside the current branch's changes (test failures, deprecation warnings, security advisories, linting errors, dead code, env problems), **investigate and offer to fix proactively**. The solo dev is the only person who will fix it. Default to action. -- **`collaborative`** — Multiple active contributors. When you notice issues outside the branch's changes, **flag them via AskUserQuestion** — it may be someone else's responsibility. Default to asking, not fixing. -- **`unknown`** — Treat as collaborative (safer default — ask before fixing). - -**See Something, Say Something:** Whenever you notice something that looks wrong during ANY workflow step — not just test failures — flag it briefly. One sentence: what you noticed and its impact. In solo mode, follow up with "Want me to fix it?" In collaborative mode, just flag it and move on. - -Never let a noticed issue silently pass. The whole point is proactive communication. - -## Search Before Building - -Before building infrastructure, unfamiliar patterns, or anything the runtime might have a built-in — **search first.** Read `~/.claude/skills/gstack/ETHOS.md` for the full philosophy. - -**Three layers of knowledge:** -- **Layer 1** (tried and true — in distribution). Don't reinvent the wheel. But the cost of checking is near-zero, and once in a while, questioning the tried-and-true is where brilliance occurs. -- **Layer 2** (new and popular — search for these). But scrutinize: humans are subject to mania. Search results are inputs to your thinking, not answers. -- **Layer 3** (first principles — prize these above all). Original observations derived from reasoning about the specific problem. The most valuable of all. - -**Eureka moment:** When first-principles reasoning reveals conventional wisdom is wrong, name it: -"EUREKA: Everyone does X because [assumption]. But [evidence] shows this is wrong. Y is better because [reasoning]." 
- -Log eureka moments: -```bash -jq -n --arg ts "$(date -u +%Y-%m-%dT%H:%M:%SZ)" --arg skill "SKILL_NAME" --arg branch "$(git branch --show-current 2>/dev/null)" --arg insight "ONE_LINE_SUMMARY" '{ts:$ts,skill:$skill,branch:$branch,insight:$insight}' >> ~/.gstack/analytics/eureka.jsonl 2>/dev/null || true -``` -Replace SKILL_NAME and ONE_LINE_SUMMARY. Runs inline — don't stop the workflow. - -**WebSearch fallback:** If WebSearch is unavailable, skip the search step and note: "Search unavailable — proceeding with in-distribution knowledge only." +Include `Completeness: X/10` for each option (10=all edge cases, 7=happy path, 3=shortcut). ## Contributor Mode -If `_CONTRIB` is `true`: you are in **contributor mode**. You're a gstack user who also helps make it better. +If `_CONTRIB` is `true`: you are in **contributor mode**. At the end of each major workflow step, rate your gstack experience 0-10. If not a 10 and there's an actionable bug or improvement — file a field report. -**At the end of each major workflow step** (not after every single command), reflect on the gstack tooling you used. Rate your experience 0 to 10. If it wasn't a 10, think about why. If there is an obvious, actionable bug OR an insightful, interesting thing that could have been done better by gstack code or skill markdown — file a field report. Maybe our contributor will help make us better! - -**Calibration — this is the bar:** For example, `$B js "await fetch(...)"` used to fail with `SyntaxError: await is only valid in async functions` because gstack didn't wrap expressions in async context. Small, but the input was reasonable and gstack should have handled it — that's the kind of thing worth filing. Things less consequential than this, ignore. - -**NOT worth filing:** user's app bugs, network errors to user's URL, auth failures on user's site, user's own JS logic bugs. 
- -**To file:** write `~/.gstack/contributor-logs/{slug}.md` with **all sections below** (do not truncate — include every section through the Date/Version footer): +**File only:** gstack tooling bugs where the input was reasonable but gstack failed. **Skip:** user app bugs, network errors, auth failures on user's site. +**To file:** write `~/.gstack/contributor-logs/{slug}.md`: ``` # {Title} - -Hey gstack team — ran into this while using /{skill-name}: - -**What I was trying to do:** {what the user/agent was attempting} -**What happened instead:** {what actually happened} -**My rating:** {0-10} — {one sentence on why it wasn't a 10} - -## Steps to reproduce +**What I tried:** {action} | **What happened:** {result} | **Rating:** {0-10} +## Repro 1. {step} - -## Raw output -``` -{paste the actual error or unexpected output here} -``` - ## What would make this a 10 -{one sentence: what gstack should have done differently} - -**Date:** {YYYY-MM-DD} | **Version:** {gstack version} | **Skill:** /{skill} +{one sentence} +**Date:** {YYYY-MM-DD} | **Version:** {version} | **Skill:** /{skill} ``` - -Slug: lowercase, hyphens, max 60 chars (e.g. `browse-js-no-await`). Skip if file already exists. Max 3 reports per session. File inline and continue — don't stop the workflow. Tell user: "Filed gstack field report: {title}" +Slug: lowercase hyphens, max 60 chars. Skip if exists. Max 3/session. File inline, don't stop. 
## Completion Status Protocol diff --git a/ship/SKILL.md b/ship/SKILL.md index af2ea565..8999bf84 100644 --- a/ship/SKILL.md +++ b/ship/SKILL.md @@ -30,9 +30,11 @@ _SESSIONS=$(find ~/.gstack/sessions -mmin -120 -type f 2>/dev/null | wc -l | tr find ~/.gstack/sessions -mmin +120 -type f -delete 2>/dev/null || true _CONTRIB=$(~/.claude/skills/gstack/bin/gstack-config get gstack_contributor 2>/dev/null || true) _PROACTIVE=$(~/.claude/skills/gstack/bin/gstack-config get proactive 2>/dev/null || echo "true") +_PROACTIVE_PROMPTED=$([ -f ~/.gstack/.proactive-prompted ] && echo "yes" || echo "no") _BRANCH=$(git branch --show-current 2>/dev/null || echo "unknown") echo "BRANCH: $_BRANCH" echo "PROACTIVE: $_PROACTIVE" +echo "PROACTIVE_PROMPTED: $_PROACTIVE_PROMPTED" source <(~/.claude/skills/gstack/bin/gstack-repo-mode 2>/dev/null) || true REPO_MODE=${REPO_MODE:-unknown} echo "REPO_MODE: $REPO_MODE" @@ -50,8 +52,11 @@ echo '{"skill":"ship","ts":"'$(date -u +%Y-%m-%dT%H:%M:%SZ)'","repo":"'$(basenam for _PF in $(find ~/.gstack/analytics -maxdepth 1 -name '.pending-*' 2>/dev/null); do [ -f "$_PF" ] && ~/.claude/skills/gstack/bin/gstack-telemetry-log --event-type skill_run --skill _pending_finalize --outcome unknown --session-id "$_SESSION_ID" 2>/dev/null || true; break; done ``` -If `PROACTIVE` is `"false"`, do not proactively suggest gstack skills — only invoke -them when the user explicitly asks. The user opted out of proactive suggestions. +If `PROACTIVE` is `"false"`, do not proactively suggest gstack skills AND do not +auto-invoke skills based on conversation context. Only run skills the user explicitly +types (e.g., /qa, /ship). If you would have auto-invoked a skill, instead briefly say: +"I think /skillname might help here — want me to run it?" and wait for confirmation. +The user opted out of proactive behavior. 
If output shows `UPGRADE_AVAILABLE <old> <new>`: read `~/.claude/skills/gstack/gstack-upgrade/SKILL.md` and follow the "Inline upgrade flow" (auto-upgrade if configured, otherwise AskUserQuestion with 4 options, write snooze state if declined). If `JUST_UPGRADED <from> <to>`: tell user "Running gstack v{to} (just updated!)" and continue. @@ -100,6 +105,27 @@ touch ~/.gstack/.telemetry-prompted This only happens once. If `TEL_PROMPTED` is `yes`, skip this entirely. +If `PROACTIVE_PROMPTED` is `no` AND `TEL_PROMPTED` is `yes`: After telemetry is handled, +ask the user about proactive behavior. Use AskUserQuestion: + +> gstack can proactively figure out when you might need a skill while you work — +> like suggesting /qa when you say "does this work?" or /investigate when you hit +> a bug. We recommend keeping this on — it speeds up every part of your workflow. + +Options: +- A) Keep it on (recommended) +- B) Turn it off — I'll type /commands myself + +If A: run `~/.claude/skills/gstack/bin/gstack-config set proactive true` +If B: run `~/.claude/skills/gstack/bin/gstack-config set proactive false` + +Always run: +```bash +touch ~/.gstack/.proactive-prompted +``` + +This only happens once. If `PROACTIVE_PROMPTED` is `yes`, skip this entirely. + ## AskUserQuestion Format **ALWAYS follow this structure for every AskUserQuestion call:** @@ -107,7 +133,6 @@ This only happens once. If `TEL_PROMPTED` is `yes`, skip this entirely. 2. **Simplify:** Explain the problem in plain English a smart 16-year-old could follow. No raw function names, no internal jargon, no implementation details. Use concrete examples and analogies. Say what it DOES, not what it's called. 3. **Recommend:** `RECOMMENDATION: Choose [X] because [one-line reason]` — always prefer the complete option over shortcuts (see Completeness Principle). Include `Completeness: X/10` for each option. 
Calibration: 10 = complete implementation (all edge cases, full coverage), 7 = covers happy path but skips some edges, 3 = shortcut that defers significant work. If both options are 8+, pick the higher; if one is ≤5, flag it. 4. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` -5. **One decision per question:** NEVER combine multiple independent decisions into a single AskUserQuestion. Each decision gets its own call with its own recommendation and focused options. Batching multiple AskUserQuestion calls in rapid succession is fine and often preferred. Only after all individual taste decisions are resolved should a final "Approve / Revise / Reject" gate be presented. Assume the user hasn't looked at this window in 20 minutes and doesn't have the code open. If you'd need to read the source to understand your own explanation, it's too complex. @@ -115,97 +140,54 @@ Per-skill instructions may add additional formatting rules on top of this baseli ## Completeness Principle — Boil the Lake -AI-assisted coding makes the marginal cost of completeness near-zero. When you present options: +AI makes completeness near-free. Always recommend the complete option over shortcuts — the delta is minutes with CC+gstack. A "lake" (100% coverage, all edge cases) is boilable; an "ocean" (full rewrite, multi-quarter migration) is not. Boil lakes, flag oceans. -- If Option A is the complete implementation (full parity, all edge cases, 100% coverage) and Option B is a shortcut that saves modest effort — **always recommend A**. The delta between 80 lines and 150 lines is meaningless with CC+gstack. "Good enough" is the wrong instinct when "complete" costs minutes more. -- **Lake vs. ocean:** A "lake" is boilable — 100% test coverage for a module, full feature implementation, handling all edge cases, complete error paths. 
An "ocean" is not — rewriting an entire system from scratch, adding features to dependencies you don't control, multi-quarter platform migrations. Recommend boiling lakes. Flag oceans as out of scope. -- **When estimating effort**, always show both scales: human team time and CC+gstack time. The compression ratio varies by task type — use this reference: +**Effort reference** — always show both scales: | Task type | Human team | CC+gstack | Compression | |-----------|-----------|-----------|-------------| -| Boilerplate / scaffolding | 2 days | 15 min | ~100x | -| Test writing | 1 day | 15 min | ~50x | -| Feature implementation | 1 week | 30 min | ~30x | -| Bug fix + regression test | 4 hours | 15 min | ~20x | -| Architecture / design | 2 days | 4 hours | ~5x | -| Research / exploration | 1 day | 3 hours | ~3x | +| Boilerplate | 2 days | 15 min | ~100x | +| Tests | 1 day | 15 min | ~50x | +| Feature | 1 week | 30 min | ~30x | +| Bug fix | 4 hours | 15 min | ~20x | -- This principle applies to test coverage, error handling, documentation, edge cases, and feature completeness. Don't skip the last 10% to "save time" — with AI, that 10% costs seconds. +Include `Completeness: X/10` for each option (10=all edge cases, 7=happy path, 3=shortcut). -**Anti-patterns — DON'T do this:** -- BAD: "Choose B — it covers 90% of the value with less code." (If A is only 70 lines more, choose A.) -- BAD: "We can skip edge case handling to save time." (Edge case handling costs minutes with CC.) -- BAD: "Let's defer test coverage to a follow-up PR." (Tests are the cheapest lake to boil.) -- BAD: Quoting only human-team effort: "This would take 2 weeks." (Say: "2 weeks human / ~1 hour CC.") +## Repo Ownership — See Something, Say Something -## Repo Ownership Mode — See Something, Say Something +`REPO_MODE` controls how to handle issues outside your branch: +- **`solo`** — You own everything. Investigate and offer to fix proactively. 
+- **`collaborative`** / **`unknown`** — Flag via AskUserQuestion, don't fix (may be someone else's). -`REPO_MODE` from the preamble tells you who owns issues in this repo: - -- **`solo`** — One person does 80%+ of the work. They own everything. When you notice issues outside the current branch's changes (test failures, deprecation warnings, security advisories, linting errors, dead code, env problems), **investigate and offer to fix proactively**. The solo dev is the only person who will fix it. Default to action. -- **`collaborative`** — Multiple active contributors. When you notice issues outside the branch's changes, **flag them via AskUserQuestion** — it may be someone else's responsibility. Default to asking, not fixing. -- **`unknown`** — Treat as collaborative (safer default — ask before fixing). - -**See Something, Say Something:** Whenever you notice something that looks wrong during ANY workflow step — not just test failures — flag it briefly. One sentence: what you noticed and its impact. In solo mode, follow up with "Want me to fix it?" In collaborative mode, just flag it and move on. - -Never let a noticed issue silently pass. The whole point is proactive communication. +Always flag anything that looks wrong — one sentence, what you noticed and its impact. ## Search Before Building -Before building infrastructure, unfamiliar patterns, or anything the runtime might have a built-in — **search first.** Read `~/.claude/skills/gstack/ETHOS.md` for the full philosophy. +Before building anything unfamiliar, **search first.** See `~/.claude/skills/gstack/ETHOS.md`. +- **Layer 1** (tried and true) — don't reinvent. **Layer 2** (new and popular) — scrutinize. **Layer 3** (first principles) — prize above all. -**Three layers of knowledge:** -- **Layer 1** (tried and true — in distribution). Don't reinvent the wheel. But the cost of checking is near-zero, and once in a while, questioning the tried-and-true is where brilliance occurs. 
-- **Layer 2** (new and popular — search for these). But scrutinize: humans are subject to mania. Search results are inputs to your thinking, not answers. -- **Layer 3** (first principles — prize these above all). Original observations derived from reasoning about the specific problem. The most valuable of all. - -**Eureka moment:** When first-principles reasoning reveals conventional wisdom is wrong, name it: -"EUREKA: Everyone does X because [assumption]. But [evidence] shows this is wrong. Y is better because [reasoning]." - -Log eureka moments: +**Eureka:** When first-principles reasoning contradicts conventional wisdom, name it and log: ```bash jq -n --arg ts "$(date -u +%Y-%m-%dT%H:%M:%SZ)" --arg skill "SKILL_NAME" --arg branch "$(git branch --show-current 2>/dev/null)" --arg insight "ONE_LINE_SUMMARY" '{ts:$ts,skill:$skill,branch:$branch,insight:$insight}' >> ~/.gstack/analytics/eureka.jsonl 2>/dev/null || true ``` -Replace SKILL_NAME and ONE_LINE_SUMMARY. Runs inline — don't stop the workflow. - -**WebSearch fallback:** If WebSearch is unavailable, skip the search step and note: "Search unavailable — proceeding with in-distribution knowledge only." ## Contributor Mode -If `_CONTRIB` is `true`: you are in **contributor mode**. You're a gstack user who also helps make it better. +If `_CONTRIB` is `true`: you are in **contributor mode**. At the end of each major workflow step, rate your gstack experience 0-10. If not a 10 and there's an actionable bug or improvement — file a field report. -**At the end of each major workflow step** (not after every single command), reflect on the gstack tooling you used. Rate your experience 0 to 10. If it wasn't a 10, think about why. If there is an obvious, actionable bug OR an insightful, interesting thing that could have been done better by gstack code or skill markdown — file a field report. Maybe our contributor will help make us better! 
- -**Calibration — this is the bar:** For example, `$B js "await fetch(...)"` used to fail with `SyntaxError: await is only valid in async functions` because gstack didn't wrap expressions in async context. Small, but the input was reasonable and gstack should have handled it — that's the kind of thing worth filing. Things less consequential than this, ignore. - -**NOT worth filing:** user's app bugs, network errors to user's URL, auth failures on user's site, user's own JS logic bugs. - -**To file:** write `~/.gstack/contributor-logs/{slug}.md` with **all sections below** (do not truncate — include every section through the Date/Version footer): +**File only:** gstack tooling bugs where the input was reasonable but gstack failed. **Skip:** user app bugs, network errors, auth failures on user's site. +**To file:** write `~/.gstack/contributor-logs/{slug}.md`: ``` # {Title} - -Hey gstack team — ran into this while using /{skill-name}: - -**What I was trying to do:** {what the user/agent was attempting} -**What happened instead:** {what actually happened} -**My rating:** {0-10} — {one sentence on why it wasn't a 10} - -## Steps to reproduce +**What I tried:** {action} | **What happened:** {result} | **Rating:** {0-10} +## Repro 1. {step} - -## Raw output -``` -{paste the actual error or unexpected output here} -``` - ## What would make this a 10 -{one sentence: what gstack should have done differently} - -**Date:** {YYYY-MM-DD} | **Version:** {gstack version} | **Skill:** /{skill} +{one sentence} +**Date:** {YYYY-MM-DD} | **Version:** {version} | **Skill:** /{skill} ``` - -Slug: lowercase, hyphens, max 60 chars (e.g. `browse-js-no-await`). Skip if file already exists. Max 3 reports per session. File inline and continue — don't stop the workflow. Tell user: "Filed gstack field report: {title}" +Slug: lowercase hyphens, max 60 chars. Skip if exists. Max 3/session. File inline, don't stop. 
## Completion Status Protocol @@ -296,22 +278,42 @@ Then write a `## GSTACK REVIEW REPORT` section to the end of the plan file: file you are allowed to edit in plan mode. The plan file review report is part of the plan's living status. -## Step 0: Detect base branch +## Step 0: Detect platform and base branch -Determine which branch this PR targets. Use the result as "the base branch" in all subsequent steps. +First, detect the git hosting platform from the remote URL: -1. Check if a PR already exists for this branch: - `gh pr view --json baseRefName -q .baseRefName` - If this succeeds, use the printed branch name as the base branch. +```bash +git remote get-url origin 2>/dev/null +``` -2. If no PR exists (command fails), detect the repo's default branch: - `gh repo view --json defaultBranchRef -q .defaultBranchRef.name` +- If the URL contains "github.com" → platform is **GitHub** +- If the URL contains "gitlab" → platform is **GitLab** +- Otherwise, check CLI availability: + - `gh auth status 2>/dev/null` succeeds → platform is **GitHub** (covers GitHub Enterprise) + - `glab auth status 2>/dev/null` succeeds → platform is **GitLab** (covers self-hosted) + - Neither → **unknown** (use git-native commands only) -3. If both commands fail, fall back to `main`. +Determine which branch this PR/MR targets, or the repo's default branch if no +PR/MR exists. Use the result as "the base branch" in all subsequent steps. + +**If GitHub:** +1. `gh pr view --json baseRefName -q .baseRefName` — if succeeds, use it +2. `gh repo view --json defaultBranchRef -q .defaultBranchRef.name` — if succeeds, use it + +**If GitLab:** +1. `glab mr view -F json 2>/dev/null` and extract the `target_branch` field — if succeeds, use it +2. `glab repo view -F json 2>/dev/null` and extract the `default_branch` field — if succeeds, use it + +**Git-native fallback (if unknown platform, or CLI commands fail):** +1. 
`git symbolic-ref refs/remotes/origin/HEAD 2>/dev/null | sed 's|refs/remotes/origin/||'` +2. If that fails: `git rev-parse --verify origin/main 2>/dev/null` → use `main` +3. If that fails: `git rev-parse --verify origin/master 2>/dev/null` → use `master` + +If all fail, fall back to `main`. Print the detected base branch name. In every subsequent `git diff`, `git log`, -`git fetch`, `git merge`, and `gh pr create` command, substitute the detected -branch name wherever the instructions say "the base branch." +`git fetch`, `git merge`, and PR/MR creation command, substitute the detected +branch name wherever the instructions say "the base branch" or ``. --- @@ -437,12 +439,13 @@ service with existing deployment — verify that a distribution pipeline exists. 2. If new artifact detected, check for a release workflow: ```bash ls .github/workflows/ 2>/dev/null | grep -iE 'release|publish|dist' + grep -qE 'release|publish|deploy' .gitlab-ci.yml 2>/dev/null && echo "GITLAB_CI_RELEASE" ``` 3. **If no release pipeline exists and a new artifact was added:** Use AskUserQuestion: - "This PR adds a new binary/tool but there's no CI/CD pipeline to build and publish it. Users won't be able to download the artifact after merge." - - A) Add a release workflow now (GitHub Actions cross-platform build + GitHub Releases) + - A) Add a release workflow now (CI/CD release pipeline — GitHub Actions or GitLab CI depending on platform) - B) Defer — add to TODOS.md - C) Not needed — this is internal/web-only, existing deployment covers it @@ -722,14 +725,22 @@ Use AskUserQuestion: git log --format="%an (%ae)" -1 -- ``` If these are different people, prefer the production code author — they likely introduced the regression. -- Create a GitHub issue assigned to that person: - ```bash - gh issue create \ - --title "Pre-existing test failure: " \ - --body "Found failing on branch . 
Failure is pre-existing.\n\n**Error:**\n```\n\n```\n\n**Last modified by:** \n**Noticed by:** gstack /ship on " \ - --assignee "" - ``` -- If `gh` is not available or `--assignee` fails (user not in org, etc.), create the issue without assignee and note who should look at it in the body. +- Create an issue assigned to that person (use the platform detected in Step 0): + - **If GitHub:** + ```bash + gh issue create \ + --title "Pre-existing test failure: " \ + --body "Found failing on branch . Failure is pre-existing.\n\n**Error:**\n```\n\n```\n\n**Last modified by:** \n**Noticed by:** gstack /ship on " \ + --assignee "" + ``` + - **If GitLab:** + ```bash + glab issue create \ + -t "Pre-existing test failure: " \ + -d "Found failing on branch . Failure is pre-existing.\n\n**Error:**\n```\n\n```\n\n**Last modified by:** \n**Noticed by:** gstack /ship on " \ + -a "" + ``` +- If neither CLI is available or `--assignee`/`-a` fails (user not in org, etc.), create the issue without assignee and note who should look at it in the body. - Continue with the workflow. **If "Skip":** @@ -999,6 +1010,39 @@ find . -name '*.test.*' -o -name '*.spec.*' -o -name '*_test.*' -o -name '*_spec For PR body: `Tests: {before} → {after} (+{delta} new)` Coverage line: `Test Coverage Audit: N new code paths. M covered (X%). K tests generated, J committed.` +**7. Coverage gate:** + +Before proceeding, check CLAUDE.md for a `## Test Coverage` section with `Minimum:` and `Target:` fields. If found, use those percentages. Otherwise use defaults: Minimum = 60%, Target = 80%. + +Using the coverage percentage from the diagram in substep 4 (the `COVERAGE: X/Y (Z%)` line): + +- **>= target:** Pass. "Coverage gate: PASS ({X}%)." Continue. +- **>= minimum, < target:** Use AskUserQuestion: + - "AI-assessed coverage is {X}%. {N} code paths are untested. Target is {target}%." + - RECOMMENDATION: Choose A because untested code paths are where production bugs hide. 
+ - Options: + A) Generate more tests for remaining gaps (recommended) + B) Ship anyway — I accept the coverage risk + C) These paths don't need tests — mark as intentionally uncovered + - If A: Loop back to substep 5 (generate tests) targeting the remaining gaps. After second pass, if still below target, present AskUserQuestion again with updated numbers. Maximum 2 generation passes total. + - If B: Continue. Include in PR body: "Coverage gate: {X}% — user accepted risk." + - If C: Continue. Include in PR body: "Coverage gate: {X}% — {N} paths intentionally uncovered." + +- **< minimum:** Use AskUserQuestion: + - "AI-assessed coverage is critically low ({X}%). {N} of {M} code paths have no tests. Minimum threshold is {minimum}%." + - RECOMMENDATION: Choose A because coverage below the {minimum}% minimum leaves too many code paths untested to ship safely. + - Options: + A) Generate tests for remaining gaps (recommended) + B) Override — ship with low coverage (I understand the risk) + - If A: Loop back to substep 5. Maximum 2 passes. If still below minimum after 2 passes, present the override choice again. + - If B: Continue. Include in PR body: "Coverage gate: OVERRIDDEN at {X}%." + +**Coverage percentage undetermined:** If the coverage diagram doesn't produce a clear numeric percentage (ambiguous output, parse error), **skip the gate** with: "Coverage gate: could not determine percentage — skipping." Do not default to 0% or block. + +**Test-only diffs:** Skip the gate (same as the existing fast-path). + +**100% coverage:** "Coverage gate: PASS (100%)." Continue. + ### Test Plan Artifact After producing the coverage diagram, write a test plan artifact so `/qa` and `/qa-only` can consume it: @@ -1262,7 +1306,7 @@ If Codex is available, run a lightweight design check on the diff: ```bash TMPERR_DRL=$(mktemp /tmp/codex-drl-XXXXXXXX) -codex exec "Review the git diff on this branch. Run 7 litmus checks (YES/NO each): 1. Brand/product unmistakable in first screen? 2. 
One strong visual anchor present? 3. Page understandable by scanning headlines only? 4. Each section has one job? 5. Are cards actually necessary? 6. Does motion improve hierarchy or atmosphere? 7. Would design feel premium with all decorative shadows removed? Flag any hard rejections: 1. Generic SaaS card grid as first impression 2. Beautiful image with weak brand 3. Strong headline with no clear action 4. Busy imagery behind text 5. Sections repeating same mood statement 6. Carousel with no narrative purpose 7. App UI made of stacked cards instead of layout 5 most important design findings only. Reference file:line." -s read-only -c 'model_reasoning_effort="high"' --enable web_search_cached 2>"$TMPERR_DRL" +codex exec "Review the git diff on this branch. Run 7 litmus checks (YES/NO each): 1. Brand/product unmistakable in first screen? 2. One strong visual anchor present? 3. Page understandable by scanning headlines only? 4. Each section has one job? 5. Are cards actually necessary? 6. Does motion improve hierarchy or atmosphere? 7. Would design feel premium with all decorative shadows removed? Flag any hard rejections: 1. Generic SaaS card grid as first impression 2. Beautiful image with weak brand 3. Strong headline with no clear action 4. Busy imagery behind text 5. Sections repeating same mood statement 6. Carousel with no narrative purpose 7. App UI made of stacked cards instead of layout 5 most important design findings only. Reference file:line." -C "$(git rev-parse --show-toplevel)" -s read-only -c 'model_reasoning_effort="high"' --enable web_search_cached 2>"$TMPERR_DRL" ``` Use a 5-minute timeout (`timeout: 300000`). After the command completes, read stderr: @@ -1377,7 +1421,7 @@ Claude's structured review already ran. Now add a **cross-model adversarial chal ```bash TMPERR_ADV=$(mktemp /tmp/codex-adv-XXXXXXXX) -codex exec "Review the changes on this branch against the base branch. Run git diff origin/ to see the diff. 
Your job is to find ways this code will fail in production. Think like an attacker and a chaos engineer. Find edge cases, race conditions, security holes, resource leaks, failure modes, and silent data corruption paths. Be adversarial. Be thorough. No compliments — just the problems." -s read-only -c 'model_reasoning_effort="xhigh"' --enable web_search_cached 2>"$TMPERR_ADV" +codex exec "Review the changes on this branch against the base branch. Run git diff origin/ to see the diff. Your job is to find ways this code will fail in production. Think like an attacker and a chaos engineer. Find edge cases, race conditions, security holes, resource leaks, failure modes, and silent data corruption paths. Be adversarial. Be thorough. No compliments — just the problems." -C "$(git rev-parse --show-toplevel)" -s read-only -c 'model_reasoning_effort="xhigh"' --enable web_search_cached 2>"$TMPERR_ADV" ``` Set the Bash tool's `timeout` parameter to `300000` (5 minutes). Do NOT use the `timeout` shell command — it doesn't exist on macOS. After the command completes, read stderr: @@ -1641,12 +1685,13 @@ git push -u origin --- -## Step 8: Create PR +## Step 8: Create PR/MR -Create a pull request using `gh`: +Create a pull request (GitHub) or merge request (GitLab) using the platform detected in Step 0. -```bash -gh pr create --base --title ": " --body "$(cat <<'EOF' +The PR/MR body should contain these sections: + +``` ## Summary @@ -1690,11 +1735,30 @@ gh pr create --base --title ": " --body "$(cat <<'EOF' - [x] All Vitest tests pass (N tests) 🤖 Generated with [Claude Code](https://claude.com/claude-code) +``` + +**If GitHub:** + +```bash +gh pr create --base --title ": " --body "$(cat <<'EOF' + EOF )" ``` -**Output the PR URL** — then proceed to Step 8.5. +**If GitLab:** + +```bash +glab mr create -b -t ": " -d "$(cat <<'EOF' + +EOF +)" +``` + +**If neither CLI is available:** +Print the branch name, remote URL, and instruct the user to create the PR/MR manually via the web UI. 
Do not stop — the code is pushed and ready. + +**Output the PR/MR URL** — then proceed to Step 8.5. --- diff --git a/ship/SKILL.md.tmpl b/ship/SKILL.md.tmpl index 7f82c64d..d630e330 100644 --- a/ship/SKILL.md.tmpl +++ b/ship/SKILL.md.tmpl @@ -100,12 +100,13 @@ service with existing deployment — verify that a distribution pipeline exists. 2. If new artifact detected, check for a release workflow: ```bash ls .github/workflows/ 2>/dev/null | grep -iE 'release|publish|dist' + grep -qE 'release|publish|deploy' .gitlab-ci.yml 2>/dev/null && echo "GITLAB_CI_RELEASE" ``` 3. **If no release pipeline exists and a new artifact was added:** Use AskUserQuestion: - "This PR adds a new binary/tool but there's no CI/CD pipeline to build and publish it. Users won't be able to download the artifact after merge." - - A) Add a release workflow now (GitHub Actions cross-platform build + GitHub Releases) + - A) Add a release workflow now (CI/CD release pipeline — GitHub Actions or GitLab CI depending on platform) - B) Defer — add to TODOS.md - C) Not needed — this is internal/web-only, existing deployment covers it @@ -485,12 +486,13 @@ git push -u origin --- -## Step 8: Create PR +## Step 8: Create PR/MR -Create a pull request using `gh`: +Create a pull request (GitHub) or merge request (GitLab) using the platform detected in Step 0. -```bash -gh pr create --base --title ": " --body "$(cat <<'EOF' +The PR/MR body should contain these sections: + +``` ## Summary @@ -534,11 +536,30 @@ gh pr create --base --title ": " --body "$(cat <<'EOF' - [x] All Vitest tests pass (N tests) 🤖 Generated with [Claude Code](https://claude.com/claude-code) +``` + +**If GitHub:** + +```bash +gh pr create --base --title ": " --body "$(cat <<'EOF' + EOF )" ``` -**Output the PR URL** — then proceed to Step 8.5. 
+**If GitLab:** + +```bash +glab mr create -b -t ": " -d "$(cat <<'EOF' + +EOF +)" +``` + +**If neither CLI is available:** +Print the branch name, remote URL, and instruct the user to create the PR/MR manually via the web UI. Do not stop — the code is pushed and ready. + +**Output the PR/MR URL** — then proceed to Step 8.5. --- diff --git a/test/gen-skill-docs.test.ts b/test/gen-skill-docs.test.ts index c26bb64b..cab12413 100644 --- a/test/gen-skill-docs.test.ts +++ b/test/gen-skill-docs.test.ts @@ -351,6 +351,39 @@ describe('BASE_BRANCH_DETECT resolver', () => { test('resolver output uses "the base branch" phrasing', () => { expect(shipContent).toContain('the base branch'); }); + + test('resolver output contains GitLab CLI commands', () => { + expect(shipContent).toContain('glab'); + }); + + test('resolver output contains git-native fallback', () => { + expect(shipContent).toContain('git symbolic-ref'); + }); + + test('resolver output mentions GitLab platform', () => { + expect(shipContent).toMatch(/gitlab/i); + }); +}); + +describe('GitLab support in generated skills', () => { + const retroContent = fs.readFileSync(path.join(ROOT, 'retro', 'SKILL.md'), 'utf-8'); + const shipSkillContent = fs.readFileSync(path.join(ROOT, 'ship', 'SKILL.md'), 'utf-8'); + + test('retro contains GitLab MR number extraction', () => { + expect(retroContent).toContain('[#!]'); + }); + + test('retro uses BASE_BRANCH_DETECT (contains glab)', () => { + expect(retroContent).toContain('glab'); + }); + + test('ship contains glab mr create', () => { + expect(shipSkillContent).toContain('glab mr create'); + }); + + test('ship checks .gitlab-ci.yml', () => { + expect(shipSkillContent).toContain('.gitlab-ci.yml'); + }); }); /** diff --git a/test/helpers/touchfiles.ts b/test/helpers/touchfiles.ts index d61ae164..585e9dd3 100644 --- a/test/helpers/touchfiles.ts +++ b/test/helpers/touchfiles.ts @@ -79,6 +79,8 @@ export const E2E_TOUCHFILES: Record = { // Ship 'ship-base-branch': ['ship/**', 
'bin/gstack-repo-mode'], 'ship-local-workflow': ['ship/**', 'scripts/gen-skill-docs.ts'], + 'ship-plan-completion': ['ship/**', 'scripts/gen-skill-docs.ts'], + 'ship-plan-verification': ['ship/**', 'scripts/gen-skill-docs.ts'], // Retro 'retro': ['retro/**'], @@ -184,6 +186,7 @@ export const E2E_TIERS: Record = { 'review-base-branch': 'gate', 'review-design-lite': 'periodic', // 4/7 threshold is subjective 'review-coverage-audit': 'gate', + 'review-plan-completion': 'gate', // Office Hours 'office-hours-spec-review': 'gate', @@ -208,6 +211,8 @@ export const E2E_TIERS: Record = { 'ship-local-workflow': 'gate', 'ship-coverage-audit': 'gate', 'ship-triage': 'gate', + 'ship-plan-completion': 'gate', + 'ship-plan-verification': 'gate', // Retro — gate for cheap branch detection, periodic for full Opus retro 'retro': 'periodic', From 997f7b1da6a19879fa5bc79c4fe5f71900b8c19f Mon Sep 17 00:00:00 2001 From: Garry Tan Date: Thu, 26 Mar 2026 08:31:53 -0600 Subject: [PATCH 4/9] =?UTF-8?q?fix:=20review=20log=20architecture=20?= =?UTF-8?q?=E2=80=94=20close=20gaps,=20add=20attribution=20(v0.11.21.0)=20?= =?UTF-8?q?(#512)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix: review log architecture — close gaps, fix orphans, add attribution - Ship Step 3.5 now logs its code review to the review log (via:"ship") - Remove eng review gate — ship runs its own review in Step 3.5 - Dashboard Outside Voice row mapped to codex-plan-review - Dashboard shows via source attribution (e.g., "via /autoplan") - land-and-deploy checks all 8 review skill types (was 5) - codex-review log gets commit field for staleness detection - autoplan uses placeholder tokens instead of hardcoded "clean" - Document autoplan-voices as audit-trail-only in review.ts - E2E test for dashboard via attribution * chore: bump version and changelog (v0.11.21.0) Co-Authored-By: Claude Opus 4.6 --------- Co-authored-by: Claude Opus 4.6 --- CHANGELOG.md | 11 ++++ VERSION | 2 +- 
autoplan/SKILL.md | 12 ++-- autoplan/SKILL.md.tmpl | 12 ++-- codex/SKILL.md | 2 +- codex/SKILL.md.tmpl | 2 +- land-and-deploy/SKILL.md | 5 +- land-and-deploy/SKILL.md.tmpl | 5 +- plan-ceo-review/SKILL.md | 8 ++- plan-design-review/SKILL.md | 8 ++- plan-eng-review/SKILL.md | 8 ++- scripts/resolvers/review.ts | 8 ++- ship/SKILL.md | 40 ++++++------ ship/SKILL.md.tmpl | 32 +++++----- test/helpers/touchfiles.ts | 1 + test/skill-e2e-review.test.ts | 113 ++++++++++++++++++++++++++++++++++ 16 files changed, 209 insertions(+), 60 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index acbc55cd..68199eb1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,16 @@ # Changelog +## [0.11.21.0] - 2026-03-26 + +### Fixed + +- **`/autoplan` reviews now show up on the `/ship` review dashboard.** When `/autoplan` ran full CEO + Design + Eng reviews, `/ship` still showed "0 runs" for Eng Review because autoplan-logged entries weren't being read correctly. Now the dashboard shows source attribution (e.g., "CLEAR (PLAN via /autoplan)") so you can see exactly which tool satisfied each review. +- **`/ship` no longer tells you to "run /review first."** Ship runs its own pre-landing review in Step 3.5 — asking you to run the same review separately was redundant. The gate is removed; ship just does it. +- **`/land-and-deploy` now checks all 8 review types.** Previously missed `review`, `adversarial-review`, and `codex-plan-review` — if you only ran `/review` (not `/plan-eng-review`), land-and-deploy wouldn't see it. +- **Dashboard Outside Voice row now works.** Was showing "0 runs" even after outside voices ran in `/plan-ceo-review` or `/plan-eng-review`. Now correctly maps to `codex-plan-review` entries. +- **`/codex review` now tracks staleness.** Added the `commit` field to codex review log entries so the dashboard can detect when a codex review is outdated.
+- **`/autoplan` no longer hardcodes "clean" status.** Review log entries from autoplan used to always record `status:"clean"` even when issues were found. Now uses proper placeholder tokens that Claude substitutes with real values. + ## [0.11.20.0] - 2026-03-26 ### Added diff --git a/VERSION b/VERSION index 508c698a..5e1d8ddf 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -0.11.20.0 +0.11.21.0 diff --git a/autoplan/SKILL.md b/autoplan/SKILL.md index d69fc285..aee5d372 100644 --- a/autoplan/SKILL.md +++ b/autoplan/SKILL.md @@ -929,24 +929,24 @@ AskUserQuestion options: ## Completion: Write Review Logs -On approval, write 3 separate review log entries so /ship's dashboard recognizes them: +On approval, write 3 separate review log entries so /ship's dashboard recognizes them. +Replace STATUS and N with actual values from each review phase; TIMESTAMP and COMMIT are filled in by the shell variables set in the block below. +STATUS is "clean" if no unresolved issues, "issues_open" otherwise. ```bash COMMIT=$(git rev-parse --short HEAD 2>/dev/null) TIMESTAMP=$(date -u +%Y-%m-%dT%H:%M:%SZ) -~/.claude/skills/gstack/bin/gstack-review-log '{"skill":"plan-ceo-review","timestamp":"'"$TIMESTAMP"'","status":"clean","unresolved":0,"critical_gaps":0,"mode":"SELECTIVE_EXPANSION","via":"autoplan","commit":"'"$COMMIT"'"}' +~/.claude/skills/gstack/bin/gstack-review-log '{"skill":"plan-ceo-review","timestamp":"'"$TIMESTAMP"'","status":"STATUS","unresolved":N,"critical_gaps":N,"mode":"SELECTIVE_EXPANSION","via":"autoplan","commit":"'"$COMMIT"'"}' -~/.claude/skills/gstack/bin/gstack-review-log '{"skill":"plan-eng-review","timestamp":"'"$TIMESTAMP"'","status":"clean","unresolved":0,"critical_gaps":0,"issues_found":0,"mode":"FULL_REVIEW","via":"autoplan","commit":"'"$COMMIT"'"}' +~/.claude/skills/gstack/bin/gstack-review-log '{"skill":"plan-eng-review","timestamp":"'"$TIMESTAMP"'","status":"STATUS","unresolved":N,"critical_gaps":N,"issues_found":N,"mode":"FULL_REVIEW","via":"autoplan","commit":"'"$COMMIT"'"}' ``` If Phase 2 ran (UI scope): ```bash
-~/.claude/skills/gstack/bin/gstack-review-log '{"skill":"plan-design-review","timestamp":"'"$TIMESTAMP"'","status":"clean","unresolved":0,"via":"autoplan","commit":"'"$COMMIT"'"}' +~/.claude/skills/gstack/bin/gstack-review-log '{"skill":"plan-design-review","timestamp":"'"$TIMESTAMP"'","status":"STATUS","unresolved":N,"via":"autoplan","commit":"'"$COMMIT"'"}' ``` -Replace field values with actual counts from the review. - Dual voice logs (one per phase that ran): ```bash ~/.claude/skills/gstack/bin/gstack-review-log '{"skill":"autoplan-voices","timestamp":"'"$TIMESTAMP"'","status":"STATUS","source":"SOURCE","phase":"ceo","via":"autoplan","consensus_confirmed":N,"consensus_disagree":N,"commit":"'"$COMMIT"'"}' diff --git a/autoplan/SKILL.md.tmpl b/autoplan/SKILL.md.tmpl index 661e8fb0..7cf78ced 100644 --- a/autoplan/SKILL.md.tmpl +++ b/autoplan/SKILL.md.tmpl @@ -584,24 +584,24 @@ AskUserQuestion options: ## Completion: Write Review Logs -On approval, write 3 separate review log entries so /ship's dashboard recognizes them: +On approval, write 3 separate review log entries so /ship's dashboard recognizes them. +Replace STATUS and N with actual values from each review phase; TIMESTAMP and COMMIT are filled in by the shell variables set in the block below. +STATUS is "clean" if no unresolved issues, "issues_open" otherwise.
```bash COMMIT=$(git rev-parse --short HEAD 2>/dev/null) TIMESTAMP=$(date -u +%Y-%m-%dT%H:%M:%SZ) -~/.claude/skills/gstack/bin/gstack-review-log '{"skill":"plan-ceo-review","timestamp":"'"$TIMESTAMP"'","status":"clean","unresolved":0,"critical_gaps":0,"mode":"SELECTIVE_EXPANSION","via":"autoplan","commit":"'"$COMMIT"'"}' +~/.claude/skills/gstack/bin/gstack-review-log '{"skill":"plan-ceo-review","timestamp":"'"$TIMESTAMP"'","status":"STATUS","unresolved":N,"critical_gaps":N,"mode":"SELECTIVE_EXPANSION","via":"autoplan","commit":"'"$COMMIT"'"}' -~/.claude/skills/gstack/bin/gstack-review-log '{"skill":"plan-eng-review","timestamp":"'"$TIMESTAMP"'","status":"clean","unresolved":0,"critical_gaps":0,"issues_found":0,"mode":"FULL_REVIEW","via":"autoplan","commit":"'"$COMMIT"'"}' +~/.claude/skills/gstack/bin/gstack-review-log '{"skill":"plan-eng-review","timestamp":"'"$TIMESTAMP"'","status":"STATUS","unresolved":N,"critical_gaps":N,"issues_found":N,"mode":"FULL_REVIEW","via":"autoplan","commit":"'"$COMMIT"'"}' ``` If Phase 2 ran (UI scope): ```bash -~/.claude/skills/gstack/bin/gstack-review-log '{"skill":"plan-design-review","timestamp":"'"$TIMESTAMP"'","status":"clean","unresolved":0,"via":"autoplan","commit":"'"$COMMIT"'"}' +~/.claude/skills/gstack/bin/gstack-review-log '{"skill":"plan-design-review","timestamp":"'"$TIMESTAMP"'","status":"STATUS","unresolved":N,"via":"autoplan","commit":"'"$COMMIT"'"}' ``` -Replace field values with actual counts from the review. - Dual voice logs (one per phase that ran): ```bash ~/.claude/skills/gstack/bin/gstack-review-log '{"skill":"autoplan-voices","timestamp":"'"$TIMESTAMP"'","status":"STATUS","source":"SOURCE","phase":"ceo","via":"autoplan","consensus_confirmed":N,"consensus_disagree":N,"commit":"'"$COMMIT"'"}' diff --git a/codex/SKILL.md b/codex/SKILL.md index 6e19cd04..ec9eea7c 100644 --- a/codex/SKILL.md +++ b/codex/SKILL.md @@ -423,7 +423,7 @@ CROSS-MODEL ANALYSIS: 7. 
Persist the review result: ```bash -~/.claude/skills/gstack/bin/gstack-review-log '{"skill":"codex-review","timestamp":"TIMESTAMP","status":"STATUS","gate":"GATE","findings":N,"findings_fixed":N}' +~/.claude/skills/gstack/bin/gstack-review-log '{"skill":"codex-review","timestamp":"TIMESTAMP","status":"STATUS","gate":"GATE","findings":N,"findings_fixed":N,"commit":"'"$(git rev-parse --short HEAD)"'"}' ``` Substitute: TIMESTAMP (ISO 8601), STATUS ("clean" if PASS, "issues_found" if FAIL), diff --git a/codex/SKILL.md.tmpl b/codex/SKILL.md.tmpl index 338df93b..77021c82 100644 --- a/codex/SKILL.md.tmpl +++ b/codex/SKILL.md.tmpl @@ -127,7 +127,7 @@ CROSS-MODEL ANALYSIS: 7. Persist the review result: ```bash -~/.claude/skills/gstack/bin/gstack-review-log '{"skill":"codex-review","timestamp":"TIMESTAMP","status":"STATUS","gate":"GATE","findings":N,"findings_fixed":N}' +~/.claude/skills/gstack/bin/gstack-review-log '{"skill":"codex-review","timestamp":"TIMESTAMP","status":"STATUS","gate":"GATE","findings":N,"findings_fixed":N,"commit":"'"$(git rev-parse --short HEAD)"'"}' ``` Substitute: TIMESTAMP (ISO 8601), STATUS ("clean" if PASS, "issues_found" if FAIL), diff --git a/land-and-deploy/SKILL.md b/land-and-deploy/SKILL.md index 131c1f2d..d5f2c8d6 100644 --- a/land-and-deploy/SKILL.md +++ b/land-and-deploy/SKILL.md @@ -447,7 +447,8 @@ Collect evidence for each check below. Track warnings (yellow) and blockers (red ``` Parse the output. For each review skill (plan-eng-review, plan-ceo-review, -plan-design-review, design-review-lite, codex-review): +plan-design-review, design-review-lite, codex-review, review, adversarial-review, +codex-plan-review): 1. Find the most recent entry within the last 7 days. 2. Extract its `commit` field. 
@@ -594,7 +595,7 @@ Use AskUserQuestion: - C) Merge anyway — I understand the risks (Completeness: 3/10) If the user chooses B: **STOP.** List exactly what needs to be done: -- If reviews are stale: "Re-run /plan-eng-review (or /review) to review current code." +- If reviews are stale: "Re-run `/plan-eng-review`, `/review`, or `/autoplan` to review current code." - If E2E not run: "Run `bun run test:e2e` to verify." - If docs not updated: "Run /document-release to update documentation." - If PR body stale: "Update the PR body to reflect current changes." diff --git a/land-and-deploy/SKILL.md.tmpl b/land-and-deploy/SKILL.md.tmpl index 7fcf6797..2af2acba 100644 --- a/land-and-deploy/SKILL.md.tmpl +++ b/land-and-deploy/SKILL.md.tmpl @@ -134,7 +134,8 @@ Collect evidence for each check below. Track warnings (yellow) and blockers (red ``` Parse the output. For each review skill (plan-eng-review, plan-ceo-review, -plan-design-review, design-review-lite, codex-review): +plan-design-review, design-review-lite, codex-review, review, adversarial-review, +codex-plan-review): 1. Find the most recent entry within the last 7 days. 2. Extract its `commit` field. @@ -281,7 +282,7 @@ Use AskUserQuestion: - C) Merge anyway — I understand the risks (Completeness: 3/10) If the user chooses B: **STOP.** List exactly what needs to be done: -- If reviews are stale: "Re-run /plan-eng-review (or /review) to review current code." +- If reviews are stale: "Re-run `/plan-eng-review`, `/review`, or `/autoplan` to review current code." - If E2E not run: "Run `bun run test:e2e` to verify." - If docs not updated: "Run /document-release to update documentation." - If PR body stale: "Update the PR body to reflect current changes." 
diff --git a/plan-ceo-review/SKILL.md b/plan-ceo-review/SKILL.md index d05be05f..c092ebc1 100644 --- a/plan-ceo-review/SKILL.md +++ b/plan-ceo-review/SKILL.md @@ -1262,7 +1262,13 @@ After completing the review, read the review log and config to display the dashb ~/.claude/skills/gstack/bin/gstack-review-read ``` -Parse the output. Find the most recent entry for each skill (plan-ceo-review, plan-eng-review, review, plan-design-review, design-review-lite, adversarial-review, codex-review, codex-plan-review). Ignore entries with timestamps older than 7 days. For the Eng Review row, show whichever is more recent between `review` (diff-scoped pre-landing review) and `plan-eng-review` (plan-stage architecture review). Append "(DIFF)" or "(PLAN)" to the status to distinguish. For the Adversarial row, show whichever is more recent between `adversarial-review` (new auto-scaled) and `codex-review` (legacy). For Design Review, show whichever is more recent between `plan-design-review` (full visual audit) and `design-review-lite` (code-level check). Append "(FULL)" or "(LITE)" to the status to distinguish. Display: +Parse the output. Find the most recent entry for each skill (plan-ceo-review, plan-eng-review, review, plan-design-review, design-review-lite, adversarial-review, codex-review, codex-plan-review). Ignore entries with timestamps older than 7 days. For the Eng Review row, show whichever is more recent between `review` (diff-scoped pre-landing review) and `plan-eng-review` (plan-stage architecture review). Append "(DIFF)" or "(PLAN)" to the status to distinguish. For the Adversarial row, show whichever is more recent between `adversarial-review` (new auto-scaled) and `codex-review` (legacy). For Design Review, show whichever is more recent between `plan-design-review` (full visual audit) and `design-review-lite` (code-level check). Append "(FULL)" or "(LITE)" to the status to distinguish. 
For the Outside Voice row, show the most recent `codex-plan-review` entry — this captures outside voices from both /plan-ceo-review and /plan-eng-review. + +**Source attribution:** If the most recent entry for a skill has a \`"via"\` field, append it to the status label in parentheses. Examples: `plan-eng-review` with `via:"autoplan"` shows as "CLEAR (PLAN via /autoplan)". `review` with `via:"ship"` shows as "CLEAR (DIFF via /ship)". Entries without a `via` field show as "CLEAR (PLAN)" or "CLEAR (DIFF)" as before. + +Note: `autoplan-voices` and `design-outside-voices` entries are audit-trail-only (forensic data for cross-model consensus analysis). They do not appear in the dashboard and are not checked by any consumer. + +Display: ``` +====================================================================+ diff --git a/plan-design-review/SKILL.md b/plan-design-review/SKILL.md index 5960ea18..3ff7d9f8 100644 --- a/plan-design-review/SKILL.md +++ b/plan-design-review/SKILL.md @@ -768,7 +768,13 @@ After completing the review, read the review log and config to display the dashb ~/.claude/skills/gstack/bin/gstack-review-read ``` -Parse the output. Find the most recent entry for each skill (plan-ceo-review, plan-eng-review, review, plan-design-review, design-review-lite, adversarial-review, codex-review, codex-plan-review). Ignore entries with timestamps older than 7 days. For the Eng Review row, show whichever is more recent between `review` (diff-scoped pre-landing review) and `plan-eng-review` (plan-stage architecture review). Append "(DIFF)" or "(PLAN)" to the status to distinguish. For the Adversarial row, show whichever is more recent between `adversarial-review` (new auto-scaled) and `codex-review` (legacy). For Design Review, show whichever is more recent between `plan-design-review` (full visual audit) and `design-review-lite` (code-level check). Append "(FULL)" or "(LITE)" to the status to distinguish. Display: +Parse the output. 
Find the most recent entry for each skill (plan-ceo-review, plan-eng-review, review, plan-design-review, design-review-lite, adversarial-review, codex-review, codex-plan-review). Ignore entries with timestamps older than 7 days. For the Eng Review row, show whichever is more recent between `review` (diff-scoped pre-landing review) and `plan-eng-review` (plan-stage architecture review). Append "(DIFF)" or "(PLAN)" to the status to distinguish. For the Adversarial row, show whichever is more recent between `adversarial-review` (new auto-scaled) and `codex-review` (legacy). For Design Review, show whichever is more recent between `plan-design-review` (full visual audit) and `design-review-lite` (code-level check). Append "(FULL)" or "(LITE)" to the status to distinguish. For the Outside Voice row, show the most recent `codex-plan-review` entry — this captures outside voices from both /plan-ceo-review and /plan-eng-review. + +**Source attribution:** If the most recent entry for a skill has a \`"via"\` field, append it to the status label in parentheses. Examples: `plan-eng-review` with `via:"autoplan"` shows as "CLEAR (PLAN via /autoplan)". `review` with `via:"ship"` shows as "CLEAR (DIFF via /ship)". Entries without a `via` field show as "CLEAR (PLAN)" or "CLEAR (DIFF)" as before. + +Note: `autoplan-voices` and `design-outside-voices` entries are audit-trail-only (forensic data for cross-model consensus analysis). They do not appear in the dashboard and are not checked by any consumer. + +Display: ``` +====================================================================+ diff --git a/plan-eng-review/SKILL.md b/plan-eng-review/SKILL.md index 0b61d5f6..5b57c16f 100644 --- a/plan-eng-review/SKILL.md +++ b/plan-eng-review/SKILL.md @@ -870,7 +870,13 @@ After completing the review, read the review log and config to display the dashb ~/.claude/skills/gstack/bin/gstack-review-read ``` -Parse the output. 
Find the most recent entry for each skill (plan-ceo-review, plan-eng-review, review, plan-design-review, design-review-lite, adversarial-review, codex-review, codex-plan-review). Ignore entries with timestamps older than 7 days. For the Eng Review row, show whichever is more recent between `review` (diff-scoped pre-landing review) and `plan-eng-review` (plan-stage architecture review). Append "(DIFF)" or "(PLAN)" to the status to distinguish. For the Adversarial row, show whichever is more recent between `adversarial-review` (new auto-scaled) and `codex-review` (legacy). For Design Review, show whichever is more recent between `plan-design-review` (full visual audit) and `design-review-lite` (code-level check). Append "(FULL)" or "(LITE)" to the status to distinguish. Display: +Parse the output. Find the most recent entry for each skill (plan-ceo-review, plan-eng-review, review, plan-design-review, design-review-lite, adversarial-review, codex-review, codex-plan-review). Ignore entries with timestamps older than 7 days. For the Eng Review row, show whichever is more recent between `review` (diff-scoped pre-landing review) and `plan-eng-review` (plan-stage architecture review). Append "(DIFF)" or "(PLAN)" to the status to distinguish. For the Adversarial row, show whichever is more recent between `adversarial-review` (new auto-scaled) and `codex-review` (legacy). For Design Review, show whichever is more recent between `plan-design-review` (full visual audit) and `design-review-lite` (code-level check). Append "(FULL)" or "(LITE)" to the status to distinguish. For the Outside Voice row, show the most recent `codex-plan-review` entry — this captures outside voices from both /plan-ceo-review and /plan-eng-review. + +**Source attribution:** If the most recent entry for a skill has a \`"via"\` field, append it to the status label in parentheses. Examples: `plan-eng-review` with `via:"autoplan"` shows as "CLEAR (PLAN via /autoplan)". 
`review` with `via:"ship"` shows as "CLEAR (DIFF via /ship)". Entries without a `via` field show as "CLEAR (PLAN)" or "CLEAR (DIFF)" as before. + +Note: `autoplan-voices` and `design-outside-voices` entries are audit-trail-only (forensic data for cross-model consensus analysis). They do not appear in the dashboard and are not checked by any consumer. + +Display: ``` +====================================================================+ diff --git a/scripts/resolvers/review.ts b/scripts/resolvers/review.ts index 2b83f36d..86da3b86 100644 --- a/scripts/resolvers/review.ts +++ b/scripts/resolvers/review.ts @@ -9,7 +9,13 @@ After completing the review, read the review log and config to display the dashb ~/.claude/skills/gstack/bin/gstack-review-read \`\`\` -Parse the output. Find the most recent entry for each skill (plan-ceo-review, plan-eng-review, review, plan-design-review, design-review-lite, adversarial-review, codex-review, codex-plan-review). Ignore entries with timestamps older than 7 days. For the Eng Review row, show whichever is more recent between \`review\` (diff-scoped pre-landing review) and \`plan-eng-review\` (plan-stage architecture review). Append "(DIFF)" or "(PLAN)" to the status to distinguish. For the Adversarial row, show whichever is more recent between \`adversarial-review\` (new auto-scaled) and \`codex-review\` (legacy). For Design Review, show whichever is more recent between \`plan-design-review\` (full visual audit) and \`design-review-lite\` (code-level check). Append "(FULL)" or "(LITE)" to the status to distinguish. Display: +Parse the output. Find the most recent entry for each skill (plan-ceo-review, plan-eng-review, review, plan-design-review, design-review-lite, adversarial-review, codex-review, codex-plan-review). Ignore entries with timestamps older than 7 days. For the Eng Review row, show whichever is more recent between \`review\` (diff-scoped pre-landing review) and \`plan-eng-review\` (plan-stage architecture review). 
Append "(DIFF)" or "(PLAN)" to the status to distinguish. For the Adversarial row, show whichever is more recent between \`adversarial-review\` (new auto-scaled) and \`codex-review\` (legacy). For Design Review, show whichever is more recent between \`plan-design-review\` (full visual audit) and \`design-review-lite\` (code-level check). Append "(FULL)" or "(LITE)" to the status to distinguish. For the Outside Voice row, show the most recent \`codex-plan-review\` entry — this captures outside voices from both /plan-ceo-review and /plan-eng-review. + +**Source attribution:** If the most recent entry for a skill has a \\\`"via"\\\` field, append it to the status label in parentheses. Examples: \`plan-eng-review\` with \`via:"autoplan"\` shows as "CLEAR (PLAN via /autoplan)". \`review\` with \`via:"ship"\` shows as "CLEAR (DIFF via /ship)". Entries without a \`via\` field show as "CLEAR (PLAN)" or "CLEAR (DIFF)" as before. + +Note: \`autoplan-voices\` and \`design-outside-voices\` entries are audit-trail-only (forensic data for cross-model consensus analysis). They do not appear in the dashboard and are not checked by any consumer. + +Display: \`\`\` +====================================================================+ diff --git a/ship/SKILL.md b/ship/SKILL.md index 8999bf84..0fbc474f 100644 --- a/ship/SKILL.md +++ b/ship/SKILL.md @@ -364,7 +364,13 @@ After completing the review, read the review log and config to display the dashb ~/.claude/skills/gstack/bin/gstack-review-read ``` -Parse the output. Find the most recent entry for each skill (plan-ceo-review, plan-eng-review, review, plan-design-review, design-review-lite, adversarial-review, codex-review, codex-plan-review). Ignore entries with timestamps older than 7 days. For the Eng Review row, show whichever is more recent between `review` (diff-scoped pre-landing review) and `plan-eng-review` (plan-stage architecture review). Append "(DIFF)" or "(PLAN)" to the status to distinguish. 
For the Adversarial row, show whichever is more recent between `adversarial-review` (new auto-scaled) and `codex-review` (legacy). For Design Review, show whichever is more recent between `plan-design-review` (full visual audit) and `design-review-lite` (code-level check). Append "(FULL)" or "(LITE)" to the status to distinguish. Display: +Parse the output. Find the most recent entry for each skill (plan-ceo-review, plan-eng-review, review, plan-design-review, design-review-lite, adversarial-review, codex-review, codex-plan-review). Ignore entries with timestamps older than 7 days. For the Eng Review row, show whichever is more recent between `review` (diff-scoped pre-landing review) and `plan-eng-review` (plan-stage architecture review). Append "(DIFF)" or "(PLAN)" to the status to distinguish. For the Adversarial row, show whichever is more recent between `adversarial-review` (new auto-scaled) and `codex-review` (legacy). For Design Review, show whichever is more recent between `plan-design-review` (full visual audit) and `design-review-lite` (code-level check). Append "(FULL)" or "(LITE)" to the status to distinguish. For the Outside Voice row, show the most recent `codex-plan-review` entry — this captures outside voices from both /plan-ceo-review and /plan-eng-review. + +**Source attribution:** If the most recent entry for a skill has a \`"via"\` field, append it to the status label in parentheses. Examples: `plan-eng-review` with `via:"autoplan"` shows as "CLEAR (PLAN via /autoplan)". `review` with `via:"ship"` shows as "CLEAR (DIFF via /ship)". Entries without a `via` field show as "CLEAR (PLAN)" or "CLEAR (DIFF)" as before. + +Note: `autoplan-voices` and `design-outside-voices` entries are audit-trail-only (forensic data for cross-model consensus analysis). They do not appear in the dashboard and are not checked by any consumer. + +Display: ``` +====================================================================+ @@ -403,26 +409,15 @@ Parse the output. 
Find the most recent entry for each skill (plan-ceo-review, pl If the Eng Review is NOT "CLEAR": -1. **Check for a prior override on this branch:** - ```bash - eval "$(~/.claude/skills/gstack/bin/gstack-slug 2>/dev/null)" - grep '"skill":"ship-review-override"' ~/.gstack/projects/$SLUG/$BRANCH-reviews.jsonl 2>/dev/null || echo "NO_OVERRIDE" - ``` - If an override exists, display the dashboard and note "Review gate previously accepted — continuing." Do NOT ask again. +Print: "No prior eng review found — ship will run its own pre-landing review in Step 3.5." -2. **If no override exists,** use AskUserQuestion: - - Show that Eng Review is missing or has open issues - - RECOMMENDATION: Choose C if the change is obviously trivial (< 20 lines, typo fix, config-only); Choose B for larger changes - - Options: A) Ship anyway B) Abort — run /review or /plan-eng-review first C) Change is too small to need eng review - - If CEO Review is missing, mention as informational ("CEO Review not run — recommended for product changes") but do NOT block - - For Design Review: run `source <(~/.claude/skills/gstack/bin/gstack-diff-scope 2>/dev/null)`. If `SCOPE_FRONTEND=true` and no design review (plan-design-review or design-review-lite) exists in the dashboard, mention: "Design Review not run — this PR changes frontend code. The lite design check will run automatically in Step 3.5, but consider running /design-review for a full visual audit post-implementation." Still never block. +Check diff size: `git diff ...HEAD --stat | tail -1`. If the diff is >200 lines, add: "Note: This is a large diff. Consider running `/plan-eng-review` or `/autoplan` for architecture-level review before shipping." -3. 
**If the user chooses A or C,** persist the decision so future `/ship` runs on this branch skip the gate: - ```bash - eval "$(~/.claude/skills/gstack/bin/gstack-slug 2>/dev/null)" - echo '{"skill":"ship-review-override","timestamp":"'"$(date -u +%Y-%m-%dT%H:%M:%SZ)"'","decision":"USER_CHOICE"}' >> ~/.gstack/projects/$SLUG/$BRANCH-reviews.jsonl - ``` - Substitute USER_CHOICE with "ship_anyway" or "not_relevant". +If CEO Review is missing, mention as informational ("CEO Review not run — recommended for product changes") but do NOT block. + +For Design Review: run `source <(~/.claude/skills/gstack/bin/gstack-diff-scope 2>/dev/null)`. If `SCOPE_FRONTEND=true` and no design review (plan-design-review or design-review-lite) exists in the dashboard, mention: "Design Review not run — this PR changes frontend code. The lite design check will run automatically in Step 3.5, but consider running /design-review for a full visual audit post-implementation." Still never block. + +Continue to Step 1.5 — do NOT block or ask. Ship runs its own review in Step 3.5. --- @@ -1340,6 +1335,13 @@ Present Codex output under a `CODEX (design):` header, merged with the checklist If no issues found: `Pre-Landing Review: No issues found.` +9. Persist the review result to the review log: +```bash +~/.claude/skills/gstack/bin/gstack-review-log '{"skill":"review","timestamp":"TIMESTAMP","status":"STATUS","issues_found":N,"critical":N,"informational":N,"commit":"'"$(git rev-parse --short HEAD)"'","via":"ship"}' +``` +Substitute TIMESTAMP (ISO 8601), STATUS ("clean" if no issues, "issues_found" otherwise), +and N values from the summary counts above. The `via:"ship"` distinguishes from standalone `/review` runs. + Save the review output — it goes into the PR body in Step 8. --- diff --git a/ship/SKILL.md.tmpl b/ship/SKILL.md.tmpl index d630e330..7f545cd9 100644 --- a/ship/SKILL.md.tmpl +++ b/ship/SKILL.md.tmpl @@ -64,26 +64,15 @@ You are running the `/ship` workflow. 
This is a **non-interactive, fully automat If the Eng Review is NOT "CLEAR": -1. **Check for a prior override on this branch:** - ```bash - {{SLUG_EVAL}} - grep '"skill":"ship-review-override"' ~/.gstack/projects/$SLUG/$BRANCH-reviews.jsonl 2>/dev/null || echo "NO_OVERRIDE" - ``` - If an override exists, display the dashboard and note "Review gate previously accepted — continuing." Do NOT ask again. +Print: "No prior eng review found — ship will run its own pre-landing review in Step 3.5." -2. **If no override exists,** use AskUserQuestion: - - Show that Eng Review is missing or has open issues - - RECOMMENDATION: Choose C if the change is obviously trivial (< 20 lines, typo fix, config-only); Choose B for larger changes - - Options: A) Ship anyway B) Abort — run /review or /plan-eng-review first C) Change is too small to need eng review - - If CEO Review is missing, mention as informational ("CEO Review not run — recommended for product changes") but do NOT block - - For Design Review: run `source <(~/.claude/skills/gstack/bin/gstack-diff-scope 2>/dev/null)`. If `SCOPE_FRONTEND=true` and no design review (plan-design-review or design-review-lite) exists in the dashboard, mention: "Design Review not run — this PR changes frontend code. The lite design check will run automatically in Step 3.5, but consider running /design-review for a full visual audit post-implementation." Still never block. +Check diff size: `git diff ...HEAD --stat | tail -1`. If the diff is >200 lines, add: "Note: This is a large diff. Consider running `/plan-eng-review` or `/autoplan` for architecture-level review before shipping." -3. 
**If the user chooses A or C,** persist the decision so future `/ship` runs on this branch skip the gate: - ```bash - {{SLUG_EVAL}} - echo '{"skill":"ship-review-override","timestamp":"'"$(date -u +%Y-%m-%dT%H:%M:%SZ)"'","decision":"USER_CHOICE"}' >> ~/.gstack/projects/$SLUG/$BRANCH-reviews.jsonl - ``` - Substitute USER_CHOICE with "ship_anyway" or "not_relevant". +If CEO Review is missing, mention as informational ("CEO Review not run — recommended for product changes") but do NOT block. + +For Design Review: run `source <(~/.claude/skills/gstack/bin/gstack-diff-scope 2>/dev/null)`. If `SCOPE_FRONTEND=true` and no design review (plan-design-review or design-review-lite) exists in the dashboard, mention: "Design Review not run — this PR changes frontend code. The lite design check will run automatically in Step 3.5, but consider running /design-review for a full visual audit post-implementation." Still never block. + +Continue to Step 1.5 — do NOT block or ask. Ship runs its own review in Step 3.5. --- @@ -275,6 +264,13 @@ Review the diff for structural issues that tests don't catch. If no issues found: `Pre-Landing Review: No issues found.` +9. Persist the review result to the review log: +```bash +~/.claude/skills/gstack/bin/gstack-review-log '{"skill":"review","timestamp":"TIMESTAMP","status":"STATUS","issues_found":N,"critical":N,"informational":N,"commit":"'"$(git rev-parse --short HEAD)"'","via":"ship"}' +``` +Substitute TIMESTAMP (ISO 8601), STATUS ("clean" if no issues, "issues_found" otherwise), +and N values from the summary counts above. The `via:"ship"` distinguishes from standalone `/review` runs. + Save the review output — it goes into the PR body in Step 8. 
--- diff --git a/test/helpers/touchfiles.ts b/test/helpers/touchfiles.ts index 585e9dd3..d1a0fa57 100644 --- a/test/helpers/touchfiles.ts +++ b/test/helpers/touchfiles.ts @@ -79,6 +79,7 @@ export const E2E_TOUCHFILES: Record = { // Ship 'ship-base-branch': ['ship/**', 'bin/gstack-repo-mode'], 'ship-local-workflow': ['ship/**', 'scripts/gen-skill-docs.ts'], + 'review-dashboard-via': ['ship/**', 'scripts/resolvers/review.ts', 'codex/**', 'autoplan/**', 'land-and-deploy/**'], 'ship-plan-completion': ['ship/**', 'scripts/gen-skill-docs.ts'], 'ship-plan-verification': ['ship/**', 'scripts/gen-skill-docs.ts'], diff --git a/test/skill-e2e-review.test.ts b/test/skill-e2e-review.test.ts index b1d5442d..b5ad501c 100644 --- a/test/skill-e2e-review.test.ts +++ b/test/skill-e2e-review.test.ts @@ -529,6 +529,119 @@ Analyze the git history and produce the narrative report as described in the SKI }, 420_000); }); +// --- Review Dashboard Via Attribution E2E --- + +describeIfSelected('Review Dashboard Via Attribution', ['review-dashboard-via'], () => { + let dashDir: string; + + beforeAll(() => { + dashDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-dashboard-via-')); + const run = (cmd: string, args: string[], cwd = dashDir) => + spawnSync(cmd, args, { cwd, stdio: 'pipe', timeout: 5000 }); + + // Create git repo with feature branch + run('git', ['init', '-b', 'main']); + run('git', ['config', 'user.email', 'test@test.com']); + run('git', ['config', 'user.name', 'Test']); + + fs.writeFileSync(path.join(dashDir, 'app.ts'), 'console.log("v1");\n'); + run('git', ['add', 'app.ts']); + run('git', ['commit', '-m', 'initial']); + + run('git', ['checkout', '-b', 'feature/dashboard-test']); + fs.writeFileSync(path.join(dashDir, 'app.ts'), 'console.log("v2");\n'); + run('git', ['add', 'app.ts']); + run('git', ['commit', '-m', 'feat: update']); + + // Get HEAD commit for review entries + const headResult = spawnSync('git', ['rev-parse', '--short', 'HEAD'], { cwd: dashDir, stdio: 'pipe' 
}); + const commit = headResult.stdout.toString().trim(); + + // Pre-populate review log with autoplan-sourced entries + // gstack-review-read reads from ~/.gstack/projects/$SLUG/$BRANCH-reviews.jsonl + // For the test, we'll write a mock gstack-review-read script that returns our test data + const timestamp = new Date().toISOString().replace(/\.\d{3}Z$/, 'Z'); + const reviewData = [ + `{"skill":"plan-eng-review","timestamp":"${timestamp}","status":"clean","unresolved":0,"critical_gaps":0,"issues_found":0,"mode":"FULL_REVIEW","via":"autoplan","commit":"${commit}"}`, + `{"skill":"plan-ceo-review","timestamp":"${timestamp}","status":"clean","unresolved":0,"critical_gaps":0,"mode":"SELECTIVE_EXPANSION","via":"autoplan","commit":"${commit}"}`, + `{"skill":"codex-plan-review","timestamp":"${timestamp}","status":"clean","source":"codex","commit":"${commit}"}`, + ].join('\n'); + + // Write a mock gstack-review-read that returns our test data + const mockBinDir = path.join(dashDir, '.mock-bin'); + fs.mkdirSync(mockBinDir, { recursive: true }); + fs.writeFileSync(path.join(mockBinDir, 'gstack-review-read'), [ + '#!/usr/bin/env bash', + `echo '${reviewData.split('\n').join("'\necho '")}'`, + 'echo "---CONFIG---"', + 'echo "false"', + 'echo "---HEAD---"', + `echo "${commit}"`, + ].join('\n')); + fs.chmodSync(path.join(mockBinDir, 'gstack-review-read'), 0o755); + + // Copy ship skill + fs.copyFileSync(path.join(ROOT, 'ship', 'SKILL.md'), path.join(dashDir, 'ship-SKILL.md')); + }); + + afterAll(() => { + try { fs.rmSync(dashDir, { recursive: true, force: true }); } catch {} + }); + + testConcurrentIfSelected('review-dashboard-via', async () => { + const mockBinDir = path.join(dashDir, '.mock-bin'); + + const result = await runSkillTest({ + prompt: `Read ship-SKILL.md. You only need to run the Review Readiness Dashboard section. 
+ +Instead of running ~/.claude/skills/gstack/bin/gstack-review-read, run this mock: ${mockBinDir}/gstack-review-read + +Parse the output and display the dashboard table. Pay attention to: +1. The "via" field in entries — show source attribution (e.g., "via /autoplan") +2. The codex-plan-review entry — it should populate the Outside Voice row +3. Since Eng Review IS clear, there should be NO gate blocking — just display the dashboard + +Skip the preamble, lake intro, telemetry, and all other ship steps. +Write the dashboard output to ${dashDir}/dashboard-output.md`, + workingDirectory: dashDir, + maxTurns: 12, + timeout: 90_000, + testName: 'review-dashboard-via', + runId, + }); + + logCost('/ship dashboard-via', result); + recordE2E(evalCollector, '/ship review dashboard via attribution', 'Dashboard via field', result); + expect(result.exitReason).toBe('success'); + + // Check dashboard output for via attribution + const dashPath = path.join(dashDir, 'dashboard-output.md'); + const allOutput = [ + result.output || '', + ...result.toolCalls.map(tc => tc.output || ''), + ].join('\n').toLowerCase(); + + // Verify via attribution appears somewhere (conversation or file) + let dashContent = ''; + if (fs.existsSync(dashPath)) { + dashContent = fs.readFileSync(dashPath, 'utf-8').toLowerCase(); + } + const combined = allOutput + dashContent; + + // Should mention autoplan attribution + expect(combined).toMatch(/autoplan/); + // Should show eng review as CLEAR (it has a clean entry) + expect(combined).toMatch(/clear/i); + // Should NOT contain AskUserQuestion gate (no blocking) + const gateQuestions = result.toolCalls.filter(tc => + tc.tool === 'mcp__conductor__AskUserQuestion' || + (tc.tool === 'AskUserQuestion') + ); + // Ship dashboard should not gate when eng review is clear + expect(gateQuestions).toHaveLength(0); + }, 120_000); +}); + // Module-level afterAll — finalize eval collector after all tests complete afterAll(async () => { await 
finalizeEvalCollector(evalCollector); From 7665adf4fe8b13ad40b687b53ef66b7bc551147f Mon Sep 17 00:00:00 2001 From: Garry Tan Date: Thu, 26 Mar 2026 11:15:24 -0600 Subject: [PATCH 5/9] feat: headed mode + sidebar agent + Chrome extension (v0.12.0) (#517) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * feat: CDP connect — control real Chrome/Comet via Playwright Add `connectCDP()` to BrowserManager: connects to a running browser via Chrome DevTools Protocol. All existing browse commands work unchanged through Playwright's abstraction layer. - chrome-launcher.ts: browser discovery, CDP probe, auto-relaunch with rollback - browser-manager.ts: connectCDP(), mode guards (close/closeTab/recreateContext/handoff), auto-reconnect on browser restart, getRefMap() for extension API - server.ts: CDP branch in start(), /health gains mode field, /refs endpoint, idle timer only resets on /command (not passive endpoints) Co-Authored-By: Claude Opus 4.6 (1M context) * feat: browse connect/disconnect/focus CLI commands - connect: pre-server command that discovers browser, starts server in CDP mode - disconnect: drops CDP connection, restarts in headless mode - focus: brings browser window to foreground via osascript (macOS) - status: now shows Mode: cdp | launched | headed - startServer() accepts extra env vars for CDP URL/port passthrough Co-Authored-By: Claude Opus 4.6 (1M context) * feat: CDP-aware skill templates — skip cookie import in real browser mode Skills now check `$B status` for CDP mode and skip: - /qa: cookie import prompt, user-agent override, headless workarounds - /design-review: cookie import for authenticated pages - /setup-browser-cookies: returns "not needed" in CDP mode Regenerated SKILL.md files from updated templates. 
Co-Authored-By: Claude Opus 4.6 (1M context) * feat: activity streaming — SSE endpoint for Chrome extension Side Panel Real-time browse command feed via Server-Sent Events: - activity.ts: ActivityEntry type, CircularBuffer (capacity 1000), privacy filtering (redacts passwords, auth tokens, sensitive URL params), cursor-based gap detection, async subscriber notification - server.ts: /activity/stream SSE, /activity/history REST, handleCommand instrumented with command_start/command_end events - 18 unit tests for filterArgs privacy, emitActivity, subscribe lifecycle Co-Authored-By: Claude Opus 4.6 (1M context) * feat: Chrome extension Side Panel + Conductor API proposal Chrome extension (Manifest V3, sideload): - Side Panel with live activity feed, @ref overlays, dark terminal aesthetic - Background worker: health polling, SSE relay, ref fetching - Popup: port config, connection status, side panel launcher - Content script: floating ref panel with @ref badges Conductor API proposal (docs/designs/CONDUCTOR_SESSION_API.md): - SSE endpoint for full Claude Code session mirroring in Side Panel - Discovery via HTTP endpoint (not filesystem — extensions can't read files) TODOS.md: add $B watch, multi-agent tabs, cross-platform CDP, Web Store publishing. Mark CDP mode as shipped. Co-Authored-By: Claude Opus 4.6 (1M context) * fix: detect Conductor runtime, skip osascript quit for sandboxed apps macOS App Management blocks Electron apps (Conductor) from quitting other apps via osascript. Now detects the runtime environment: - terminal/claude-code/codex: can manage apps freely - conductor: prints manual restart instructions + polls for 60s detectRuntime() checks env vars and parent process. When Chrome needs restart but we can't quit it, prints step-by-step instructions and waits for the user to restart Chrome with --remote-debugging-port. 
Co-Authored-By: Claude Opus 4.6 (1M context) * fix: detect Conductor via actual env vars (CONDUCTOR_WORKSPACE_NAME) Previous detection checked CONDUCTOR_WORKSPACE_ID which doesn't exist. Conductor sets CONDUCTOR_WORKSPACE_NAME, CONDUCTOR_BIN_DIR, CONDUCTOR_PORT, and __CFBundleIdentifier=com.conductor.app. Check these FIRST because Conductor sessions also have ANTHROPIC_API_KEY (which was matching claude-code). Co-Authored-By: Claude Opus 4.6 (1M context) * feat: connection status pill — floating indicator when gstack controls Chrome Small pill in bottom-right corner of every page: "● gstack · 3 refs" Shows when connected via CDP, fades to 30% opacity after 3s, full on hover. Disappears entirely when disconnected. Background worker now notifies content scripts on connect/disconnect state changes so the pill appears/disappears without polling. Co-Authored-By: Claude Opus 4.6 (1M context) * fix: Chrome requires --user-data-dir for remote debugging Chrome refuses --remote-debugging-port without an explicit --user-data-dir. Add userDataDir to BrowserBinary registry (macOS Application Support paths) and pass it in both auto-launch and manual restart instructions. Fix double-quoting in CLI manual restart instructions. Co-Authored-By: Claude Opus 4.6 (1M context) * fix: Chrome must be fully quit before launching with --remote-debugging-port Chrome refuses to enable CDP on its default profile when another instance is running (even with explicit --user-data-dir). The only reliable path: fully quit Chrome first, then relaunch with the flag. Updated instructions to emphasize this clearly with verification step. Co-Authored-By: Claude Opus 4.6 (1M context) * feat: bin/chrome-cdp — quit Chrome and relaunch with CDP in one command Quits Chrome gracefully, waits for full exit, relaunches with --remote-debugging-port, polls until CDP is ready. 
Usage: chrome-cdp [port] Co-Authored-By: Claude Opus 4.6 (1M context) * fix: use Playwright channel:chrome instead of broken connectOverCDP Playwright's connectOverCDP hangs with Chrome 146 due to CDP protocol version mismatch. Switch to channel:'chrome' which uses Playwright's native pipe protocol to launch the system Chrome binary directly. This is simpler and more reliable: - No CDP port discovery needed - No --remote-debugging-port or --user-data-dir hassles - $B connect just works — launches real Chrome headed window - All Playwright APIs (snapshot, click, fill) work unchanged bin/chrome-cdp updated with symlinked profile approach (kept for manual CDP use cases, but $B connect no longer needs it). Co-Authored-By: Claude Opus 4.6 (1M context) * feat: green border + gstack label on controlled Chrome window Injects a 2px green border and small "gstack" label on every page loaded in the controlled Chrome window via context.addInitScript(). Users can instantly tell which Chrome window Claude controls. Also fixes close() for channel:chrome mode (uses browser.close() not browser.disconnect() which doesn't exist). Co-Authored-By: Claude Opus 4.6 (1M context) * chore: cleanup chrome-launcher runtime detection, remove puppeteer-core dep Co-Authored-By: Claude Opus 4.6 (1M context) * style(design): redesign controlled Chrome indicator Replace crude green border + label with polished indicator: - 2px shimmer gradient at top edge (green→cyan→green, 3s loop) - Floating pill bottom-right with frosted glass bg, fades to 25% opacity after 4s so it doesn't compete with page content - prefers-reduced-motion disables shimmer animation - Much more subtle — looks like a developer tool, not broken CSS Co-Authored-By: Claude Opus 4.6 (1M context) * docs: document real browser mode + Chrome extension in BROWSER.md and README.md BROWSER.md: new sections for connect/disconnect/focus commands, Chrome extension Side Panel install, CDP-aware skills, activity streaming. 
Updated command reference table, key components, env vars, source map. README.md: updated /browse description, added "Real browser mode" to What's New section. Co-Authored-By: Claude Opus 4.6 (1M context) * docs: step-by-step Chrome extension install guide in BROWSER.md Replace terse bullet points with numbered walkthrough covering: developer mode toggle, load unpacked, macOS file picker tip (Cmd+Shift+G), pin extension, configure port, open side panel. Added troubleshooting section. Co-Authored-By: Claude Opus 4.6 (1M context) * docs: add Cmd+Shift+. tip for hidden folders in macOS file picker macOS hides folders starting with . by default. Added both shortcuts: Cmd+Shift+G (paste path directly) and Cmd+Shift+. (show hidden files). Co-Authored-By: Claude Opus 4.6 (1M context) * docs: integrate hidden folder tips into the install flow naturally Move Cmd+Shift+G and Cmd+Shift+. tips inline with the file picker step instead of as a separate tip block after it. Co-Authored-By: Claude Opus 4.6 (1M context) * feat: auto-load Chrome extension when $B connect launches Chrome Extension auto-loads via --load-extension flag — no manual chrome://extensions install needed. findExtensionPath() checks repo root, global install, and dev paths. Also adds bin/gstack-extension helper for manual install in regular Chrome, and rewrites BROWSER.md install docs with auto-load as primary path. Co-Authored-By: Claude Opus 4.6 (1M context) * feat: /connect-chrome skill — one command to launch Chrome with Side Panel New skill that runs $B connect, verifies the connection, guides the user to open the Side Panel, and demos the live activity feed. Extension auto-loads via --load-extension so no manual chrome://extensions install needed. Co-Authored-By: Claude Opus 4.6 (1M context) * fix: use launchPersistentContext for Chrome extension loading Playwright's chromium.launch() silently ignores --load-extension. 
Switch to launchPersistentContext with ignoreDefaultArgs to remove --disable-extensions flag. Use bundled Chromium (real Chrome blocks unpacked extensions). Fixed port 34567 for CDP mode so the extension auto-connects. Co-Authored-By: Claude Opus 4.6 (1M context) * feat: sync extension to DESIGN.md — amber accent, zinc neutrals, grain texture Import design system from gstack-website. Update all extension colors: green (#4ade80) → amber (#F59E0B/#FBBF24), zinc gray neutrals, grain texture overlay. Regenerate icons as amber "G" monogram on dark background. Co-Authored-By: Claude Opus 4.6 (1M context) * feat: sidebar chat with Claude Code — icon opens side panel directly Replace popup flyout with direct side panel open on icon click. Primary UI is now a chat interface that sends messages to Claude Code via file queue. Activity/Refs tabs moved behind a debug toggle in the footer. Command bar with history, auto-poll for responses, amber design system. Co-Authored-By: Claude Opus 4.6 (1M context) * feat: sidebar agent — Claude-powered chat backend via file queue Add /sidebar-command, /sidebar-response, and /sidebar-chat endpoints to the browse server. sidebar-agent.ts watches the command queue file, spawns claude -p with browse context for each message, and streams responses back to the sidebar chat. Co-Authored-By: Claude Opus 4.6 (1M context) * fix: remove duplicate gstack pill overlay, hide crash restore bubble The addInitScript indicator and the extension's content script were both injecting bottom-right pills, causing duplicates. Remove the pill from addInitScript (extension handles it). Replace --restore-last-session with --hide-crash-restore-bubble to suppress the "Chromium didn't shut down correctly" dialog. 
Co-Authored-By: Claude Opus 4.6 (1M context) * fix: state file authority — CDP server cannot be silently replaced Hardens the connect/disconnect lifecycle: - ensureServer() refuses to auto-start headless when CDP server is alive - $B connect does full cleanup: SIGTERM → 2s → SIGKILL, profile locks, state - shutdown() cleans Chromium SingletonLock/Socket/Cookie files - uncaughtException/unhandledRejection handlers do emergency cleanup This prevents the bug where a headless server overwrites the CDP server's state file, causing $B commands to hit the wrong browser. Co-Authored-By: Claude Opus 4.6 (1M context) * feat: sidebar agent streaming events + session state management Enhance sidebar-agent.ts with: - Live streaming of claude -p events (tool_use, text, result) to sidebar - Session state file for BROWSE_STATE_FILE propagation to claude subprocess - Improved logging (stderr, exit codes, event types) - stdin.end() to prevent claude waiting for input - summarizeToolInput() with path shortening for compact sidebar display Co-Authored-By: Claude Opus 4.6 (1M context) * feat: sidebar chat UI — streaming events, agent status, reconnect retry Sidebar panel improvements: - Chat tab renders streaming agent events (tool_use, text, result) - Thinking dots animation while agent processes - Agent error display with styled error blocks - tryConnect() with 2s retry loop for initial connection - Debug tabs (Activity/Refs) hidden behind gear toggle - Clear chat button - Compact tool call display with path shortening Co-Authored-By: Claude Opus 4.6 (1M context) * feat: server-integrated sidebar agent with sessions and message queue Move the sidebar agent from a separate bun process into server.ts: - Agent spawns claude -p directly when messages arrive via /sidebar-command - In-memory chat buffer backed by per-session chat.jsonl on disk - Session manager: create, load, persist, list sessions - Message queue (cap 5) with agent status tracking (idle/processing/hung) - Stop/kill 
endpoints with queue dismiss support - /health now returns agent status + session info - All sidebar endpoints require Bearer auth - Agent killed on server shutdown - 120s timeout detects hung claude processes Eliminates: file-queue polling, separate sidebar-agent.ts process, stale auth tokens, state file conflicts between processes. Co-Authored-By: Claude Opus 4.6 (1M context) * feat: extension auth + token flow for server-integrated agent Update Chrome extension to use Bearer auth on all sidebar endpoints: - background.js captures auth token from /health, exposes via getToken msg - background.js sets openPanelOnActionClick for direct side panel access - sidepanel.js gets token from background, sends in all fetch headers - Health broadcasts include token so sidebar auto-authenticates - Removes popup from manifest — icon click opens side panel directly Co-Authored-By: Claude Opus 4.6 (1M context) * feat: self-healing sidebar — reconnect banner, state machine, copy button Sidebar UI now handles disconnection gracefully: - Connection state machine: connected → reconnecting → dead - Amber pulsing banner during reconnect (2s retry, 30 attempts) - Red "Server offline" banner with Reconnect + Copy /connect-chrome buttons - Green "Reconnected" toast that fades after 3s on successful reconnect - Copy button lets user paste /connect-chrome into any Claude Code session Co-Authored-By: Claude Opus 4.6 (1M context) * fix: crash handling — save session, kill agent, distinct exit codes Hardened shutdown/crash behavior: - Browser disconnect exits with code 2 (distinct from crash code 1) - emergencyCleanup kills agent subprocess and saves session state - Clean shutdown saves session before exit (chat history persists) - Clear user message on browser disconnect: "Run $B connect to reconnect" Co-Authored-By: Claude Opus 4.6 (1M context) * feat: worktree-per-session isolation for sidebar agent Each sidebar session gets an isolated git worktree so the agent's file operations don't 
conflict with the user's working directory: - createWorktree() creates detached HEAD worktree in ~/.gstack/worktrees/ - Falls back to main cwd for non-git repos or on creation failure - Handles collision cleanup from prior crashes - removeWorktree() cleans up on session switch and shutdown - worktreePath persisted in session.json Co-Authored-By: Claude Opus 4.6 (1M context) * fix(qa): ISSUE-001 — disconnect blocked by CDP guard in ensureServer $B disconnect was routed through ensureServer() which refused to start a headless server when a CDP state file existed. Disconnect is now handled before ensureServer() (like connect), with force-kill + cleanup fallback when the CDP server is unresponsive. Co-Authored-By: Claude Opus 4.6 (1M context) * fix: resolve claude binary path for daemon-spawned agent The browse server runs as a daemon and may not inherit the user's shell PATH. Add findClaudeBin() that checks ~/.local/bin/claude (standard install location), which claude, and common system paths. Shows a clear error in the sidebar chat if claude CLI is not found. Co-Authored-By: Claude Opus 4.6 (1M context) * fix: resolve claude symlinks + check Conductor bundled binary posix_spawn fails on symlinks in compiled bun binaries. Now: - Checks Conductor app's bundled binary first (not a symlink) - Scans ~/.local/share/claude/versions/ for direct versioned binaries - Uses fs.realpathSync() to resolve symlinks before spawning Co-Authored-By: Claude Opus 4.6 (1M context) * fix: compiled bun binary cannot posix_spawn — use external agent process Compiled bun binaries fail posix_spawn on ALL executables (even /bin/bash). The server now writes to an agent queue file, and a separate non-compiled bun process (sidebar-agent.ts) reads the queue, spawns claude, and POSTs events back via /sidebar-agent/event. 
Changes: - server.ts: spawnClaude writes to queue file instead of spawning directly - server.ts: new /sidebar-agent/event endpoint for agent → server relay - server.ts: fix result event field name (event.text vs event.result) - sidebar-agent.ts: rewritten to poll queue file, relay events via HTTP - cli.ts: $B connect auto-starts sidebar-agent as non-compiled bun process Co-Authored-By: Claude Opus 4.6 (1M context) * feat: loading spinner on sidebar open while connecting to server Shows an amber spinner with "Connecting..." when the sidebar first opens, replacing the empty state. After the first successful /sidebar-chat poll: - If chat history exists: renders it immediately - If no history: shows the welcome message Prevents the jarring empty-then-populated flash on sidebar open. Co-Authored-By: Claude Opus 4.6 (1M context) * feat: zero-friction side panel — auto-open on install, pill is clickable Three changes to eliminate manual side panel setup: - Auto-open side panel on extension install/update (onInstalled listener) - gstack pill (bottom-right) is now clickable — opens the side panel - Pill has pointer-events: auto so clicks always register (was: none) User no longer needs to find the puzzle piece icon, pin the extension, or know the side panel exists. It opens automatically on first launch and can be re-opened by clicking the floating gstack pill. Co-Authored-By: Claude Opus 4.6 (1M context) * refactor: kill CDP naming, delete chrome-launcher.ts dead code The connectCDP() method and connectionMode: 'cdp' naming was a legacy artifact — real Chrome was tried but failed (silently blocks --load-extension), so the implementation already used Playwright's bundled Chromium via launchPersistentContext(). The naming was misleading. 
Changes: - Delete chrome-launcher.ts (361 LOC) — only import was in unreachable attemptReconnect() method - Delete dead attemptReconnect() and reconnecting field - Delete preExistingTabIds (was for protecting real Chrome tabs we never connect to) - Rename connectCDP() → launchHeaded() - Rename connectionMode: 'cdp' → 'headed' across all files - Replace BROWSE_CDP_URL/BROWSE_CDP_PORT env vars with BROWSE_HEADED=1 - Regenerate SKILL.md files for updated command descriptions - Move BrowserManager unit tests to browser-manager-unit.test.ts Co-Authored-By: Claude Opus 4.6 (1M context) * feat: converge handoff into connect — extension loads on handoff Handoff now uses launchPersistentContext() with extension auto-loading, same as the connect/launchHeaded() path. This means when the agent gets stuck (2FA, CAPTCHA) and hands off to the user, the Chrome extension + side panel are available automatically. Before: handoff used chromium.launch() + newContext() — no extension After: handoff uses chromium.launchPersistentContext() — extension loads Also sets connectionMode to 'headed' and disables dialog auto-accept on handoff, matching connect behavior. Co-Authored-By: Claude Opus 4.6 (1M context) * feat: gate sidebar chat behind --chat flag $B connect (default): headed Chromium + extension with Activity + Refs tabs only. No separate agent spawned. Clean, no confusion. $B connect --chat: same + Chat tab with standalone claude -p agent. Shows experimental banner: "Standalone mode — this is a separate agent from your workspace." 
Implementation: - cli.ts: parse --chat, set BROWSE_SIDEBAR_CHAT env, conditionally spawn sidebar-agent - server.ts: gate /sidebar-* routes behind chatEnabled, return 403 when disabled, include chatEnabled in /health response - sidepanel.js: applyChatEnabled() hides/shows Chat tab + banner - background.js: forward chatEnabled from health response - sidepanel.html/css: experimental banner with amber styling Co-Authored-By: Claude Opus 4.6 (1M context) * feat: file drop relay + $B inbox command Sidebar agent now writes structured messages to .context/sidebar-inbox/ when processing user input. The workspace agent can read these via $B inbox to see what the user reported from the browser. File drop format: .context/sidebar-inbox/{timestamp}-observation.json { type, timestamp, page: {url}, userMessage, sidebarSessionId } Atomic writes (tmp + rename) prevent partial reads. $B inbox --clear removes messages after display. Co-Authored-By: Claude Opus 4.6 (1M context) * feat: $B watch — passive observation mode Claude enters read-only mode and captures periodic snapshots (every 5s) while the user browses. Mutation commands (click, fill, etc.) are blocked during watch. $B watch stop exits and returns a summary with the last snapshot. Requires headed mode ($B connect). This is the inverse of the scout pattern — the workspace agent watches through the browser instead of the sidebar relaying to it. 
Co-Authored-By: Claude Opus 4.6 (1M context) * test: add coverage for sidebar-agent, file-drop, and watch mode 33 new tests covering: - Sidebar agent queue parsing (valid/malformed/empty JSONL) - writeToInbox file drop (directory creation, atomic writes, JSON format) - Inbox command (display, sorting, --clear, malformed file handling) - Watch mode state machine (start/stop cycles, snapshots, duration) Co-Authored-By: Claude Opus 4.6 (1M context) * docs: TODOS cleanup + Chrome vs Chromium exploration doc - Update TODOS.md: mark CDP mode, $B watch, sidebar scout as SHIPPED - Delete dead "cross-platform CDP browser discovery" TODO - Rename dependencies from "CDP connect" to "headed mode" - Add docs/designs/CHROME_VS_CHROMIUM_EXPLORATION.md memorializing the architecture exploration and decision to use Playwright Chromium Co-Authored-By: Claude Opus 4.6 (1M context) * docs: add Conductor Chrome sidebar integration design doc Co-Authored-By: Claude Opus 4.6 (1M context) * fix: sidebar-agent validates cwd before spawning claude The queue entry may reference a worktree that was cleaned up between sessions. Now falls back to process.cwd() if the path doesn't exist, preventing silent spawn failures. Co-Authored-By: Claude Opus 4.6 (1M context) * fix: gen-skill-docs resolver merge + preamble tier gate + plan file discovery The local RESOLVERS record in gen-skill-docs.ts was shadowing the imported canonical resolvers, causing stale test coverage and preamble generators to be used instead of the authoritative versions in resolvers/. 
Changes: - Merge imported RESOLVERS with local overrides (spread + override pattern) - Fix preamble tier gate: tier 1 skills no longer get AskUserQuestion format - Make plan file discovery host-agnostic (search multiple plan dirs) - Add missing E2E tier entries for ship/review plan completion tests Co-Authored-By: Claude Opus 4.6 (1M context) * feat: ungate sidebar agent + raise timeout to 5 minutes (v0.12.0) Sidebar chat is now always available in headed mode — no --chat flag needed. Agent tasks get 5 minutes instead of 2, enabling multi-page workflows like navigating directories and filling forms across pages. Changes: - cli.ts: remove --chat flag, always set BROWSE_SIDEBAR_CHAT=1, always spawn agent - server.ts: remove chatEnabled gate (403 response), raise AGENT_TIMEOUT_MS to 300s - sidebar-agent.ts: raise child process timeout from 120s to 300s Co-Authored-By: Claude Opus 4.6 (1M context) * docs: headed mode + sidebar agent documentation (v0.12.0) - README: sidebar agent section, personal automation example (school parent portal), two auth paths (manual login + cookie import), DevTools MCP mention - BROWSER.md: sidebar agent section with usage, timeout, session isolation, authentication, and random delay documentation - connect-chrome template: add sidebar chat onboarding step - CHANGELOG: v0.12.0 entry covering headed mode, sidebar agent, extension - VERSION: bump to 0.12.0.0 - TODOS: Chrome DevTools MCP integration as P0 Co-Authored-By: Claude Opus 4.6 (1M context) * chore: regenerate SKILL.md files Generated from updated templates + resolver merge. Key changes: - Tier 1 skills no longer include AskUserQuestion format section - Ship/review skills now include coverage gate with thresholds - Connect-chrome skill includes sidebar chat onboarding step - Plan file discovery uses host-agnostic paths Co-Authored-By: Claude Opus 4.6 (1M context) * chore: regenerate Codex connect-chrome skill Updated preamble with proactive prompt and sidebar chat onboarding step. 
Co-Authored-By: Claude Opus 4.6 (1M context) * feat: network idle, state persistence, iframe support, chain pipe format (v0.12.1.0) (#516) * feat: network idle detection + chain pipe format - Upgrade click/fill/select from domcontentloaded to networkidle wait (2s timeout, best-effort). Catches XHR/fetch triggered by interactions. - Add pipe-delimited format to chain as JSON fallback: $B chain 'goto url | click @e5 | snapshot -ic' - Add post-loop networkidle wait in chain when last command was a write. - Frame-aware: commands use target (getActiveFrameOrPage) for locator ops, page-only ops (goto/back/forward/reload) guard against frame context. Co-Authored-By: Claude Opus 4.6 (1M context) * feat: $B state save/load + $B frame — new browse commands - state save/load: persist cookies + URLs to .gstack/browse-states/{name}.json File perms 0o600, name sanitized to [a-zA-Z0-9_-]. V1 skips localStorage (breaks on load-before-navigate). Load replaces session via closeAllPages(). - frame: switch command context to iframe via CSS selector, @ref, --name, or --url. 'frame main' returns to main frame. Execution target abstraction (getActiveFrameOrPage) across read-commands, snapshot, and write-commands. - Frame context cleared on tab switch, navigation, resume, and handoff. - Snapshot shows [Context: iframe src="..."] header when in frame. 
Co-Authored-By: Claude Opus 4.6 (1M context) * test: add tests for network idle, chain pipe format, state, and frame - Network idle: click on fetch button waits for XHR, static click is fast - Chain pipe: pipe-delimited commands, quoted args, JSON still works - State: save/load round-trip, name sanitization, missing state error - Frame: switch to iframe + back, snapshot context header, fill in frame, goto-in-frame guard, usage error New fixtures: network-idle.html (fetch + static buttons), iframe.html (srcdoc) Co-Authored-By: Claude Opus 4.6 (1M context) * fix: review fixes — iframe ref scoping, detached frame recovery, state validation - snapshot.ts: ref locators, cursor-interactive scan, and cursor locator now use target (frame-aware) instead of page — fixes @ref clicking in iframes - browser-manager.ts: getActiveFrameOrPage auto-recovers from detached frames via isDetached() check - meta-commands.ts: state load resets activeFrame, elementHandle disposed after contentFrame(), state file schema validation (cookies + pages arrays), filter empty pipe segments in chain tokenizer - write-commands.ts: upload command uses target.locator() for frame support Co-Authored-By: Claude Opus 4.6 (1M context) * chore: regenerate SKILL.md files + rebuild binary Co-Authored-By: Claude Opus 4.6 (1M context) * chore: bump version and changelog (v0.12.1.0) Co-Authored-By: Claude Opus 4.6 --------- Co-authored-by: Claude Opus 4.6 (1M context) --------- Co-authored-by: Claude Opus 4.6 (1M context) --- .agents/skills/gstack-connect-chrome/SKILL.md | 411 ++++++++++ BROWSER.md | 124 +++ CHANGELOG.md | 46 ++ DESIGN.md | 86 ++ README.md | 34 +- SKILL.md | 7 + TODOS.md | 75 +- VERSION | 2 +- bin/chrome-cdp | 68 ++ bin/gstack-extension | 65 ++ browse/SKILL.md | 7 + browse/src/activity.ts | 208 +++++ browse/src/browser-manager.ts | 340 +++++++- browse/src/cli.ts | 152 +++- browse/src/commands.ts | 17 + browse/src/meta-commands.ts | 284 ++++++- browse/src/read-commands.ts | 38 +- 
browse/src/server.ts | 766 +++++++++++++++++- browse/src/sidebar-agent.ts | 278 +++++++ browse/src/snapshot.ts | 23 +- browse/src/write-commands.ts | 36 +- browse/test/activity.test.ts | 120 +++ browse/test/browser-manager-unit.test.ts | 17 + browse/test/commands.test.ts | 242 +++++- browse/test/file-drop.test.ts | 271 +++++++ browse/test/fixtures/iframe.html | 30 + browse/test/fixtures/network-idle.html | 30 + browse/test/sidebar-agent.test.ts | 199 +++++ browse/test/watch.test.ts | 129 +++ connect-chrome/SKILL.md | 412 ++++++++++ connect-chrome/SKILL.md.tmpl | 136 ++++ design-review/SKILL.md | 6 + design-review/SKILL.md.tmpl | 6 + .../designs/CHROME_VS_CHROMIUM_EXPLORATION.md | 84 ++ .../CONDUCTOR_CHROME_SIDEBAR_INTEGRATION.md | 57 ++ docs/designs/CONDUCTOR_SESSION_API.md | 108 +++ extension/background.js | 237 ++++++ extension/content.css | 124 +++ extension/content.js | 150 ++++ extension/icons/icon-128.png | Bin 0 -> 2839 bytes extension/icons/icon-16.png | Bin 0 -> 400 bytes extension/icons/icon-48.png | Bin 0 -> 1106 bytes extension/manifest.json | 31 + extension/popup.html | 98 +++ extension/popup.js | 60 ++ extension/sidepanel.css | 704 ++++++++++++++++ extension/sidepanel.html | 84 ++ extension/sidepanel.js | 661 +++++++++++++++ package.json | 5 +- qa/SKILL.md | 6 + qa/SKILL.md.tmpl | 6 + review/SKILL.md | 16 +- scripts/gen-skill-docs.ts | 9 +- scripts/resolvers/review.ts | 16 +- setup-browser-cookies/SKILL.md | 8 + setup-browser-cookies/SKILL.md.tmpl | 8 + ship/SKILL.md | 16 +- test/helpers/touchfiles.ts | 1 + test/skill-validation.test.ts | 5 - 59 files changed, 7008 insertions(+), 151 deletions(-) create mode 100644 .agents/skills/gstack-connect-chrome/SKILL.md create mode 100644 DESIGN.md create mode 100755 bin/chrome-cdp create mode 100755 bin/gstack-extension create mode 100644 browse/src/activity.ts create mode 100644 browse/src/sidebar-agent.ts create mode 100644 browse/test/activity.test.ts create mode 100644 
browse/test/browser-manager-unit.test.ts create mode 100644 browse/test/file-drop.test.ts create mode 100644 browse/test/fixtures/iframe.html create mode 100644 browse/test/fixtures/network-idle.html create mode 100644 browse/test/sidebar-agent.test.ts create mode 100644 browse/test/watch.test.ts create mode 100644 connect-chrome/SKILL.md create mode 100644 connect-chrome/SKILL.md.tmpl create mode 100644 docs/designs/CHROME_VS_CHROMIUM_EXPLORATION.md create mode 100644 docs/designs/CONDUCTOR_CHROME_SIDEBAR_INTEGRATION.md create mode 100644 docs/designs/CONDUCTOR_SESSION_API.md create mode 100644 extension/background.js create mode 100644 extension/content.css create mode 100644 extension/content.js create mode 100644 extension/icons/icon-128.png create mode 100644 extension/icons/icon-16.png create mode 100644 extension/icons/icon-48.png create mode 100644 extension/manifest.json create mode 100644 extension/popup.html create mode 100644 extension/popup.js create mode 100644 extension/sidepanel.css create mode 100644 extension/sidepanel.html create mode 100644 extension/sidepanel.js diff --git a/.agents/skills/gstack-connect-chrome/SKILL.md b/.agents/skills/gstack-connect-chrome/SKILL.md new file mode 100644 index 00000000..b1dfc989 --- /dev/null +++ b/.agents/skills/gstack-connect-chrome/SKILL.md @@ -0,0 +1,411 @@ +--- +name: connect-chrome +description: | + Launch real Chrome controlled by gstack with the Side Panel extension auto-loaded. + One command: connects Claude to a visible Chrome window where you can watch every + action in real time. The extension shows a live activity feed in the Side Panel. + Use when asked to "connect chrome", "open chrome", "real browser", "launch chrome", + "side panel", or "control my browser". 
+--- + + + +## Preamble (run first) + +```bash +_ROOT=$(git rev-parse --show-toplevel 2>/dev/null) +GSTACK_ROOT="$HOME/.codex/skills/gstack" +[ -n "$_ROOT" ] && [ -d "$_ROOT/.agents/skills/gstack" ] && GSTACK_ROOT="$_ROOT/.agents/skills/gstack" +GSTACK_BIN="$GSTACK_ROOT/bin" +GSTACK_BROWSE="$GSTACK_ROOT/browse/dist" +_UPD=$($GSTACK_BIN/gstack-update-check 2>/dev/null || .agents/skills/gstack/bin/gstack-update-check 2>/dev/null || true) +[ -n "$_UPD" ] && echo "$_UPD" || true +mkdir -p ~/.gstack/sessions +touch ~/.gstack/sessions/"$PPID" +_SESSIONS=$(find ~/.gstack/sessions -mmin -120 -type f 2>/dev/null | wc -l | tr -d ' ') +find ~/.gstack/sessions -mmin +120 -type f -delete 2>/dev/null || true +_CONTRIB=$($GSTACK_BIN/gstack-config get gstack_contributor 2>/dev/null || true) +_PROACTIVE=$($GSTACK_BIN/gstack-config get proactive 2>/dev/null || echo "true") +_PROACTIVE_PROMPTED=$([ -f ~/.gstack/.proactive-prompted ] && echo "yes" || echo "no") +_BRANCH=$(git branch --show-current 2>/dev/null || echo "unknown") +echo "BRANCH: $_BRANCH" +echo "PROACTIVE: $_PROACTIVE" +echo "PROACTIVE_PROMPTED: $_PROACTIVE_PROMPTED" +source <($GSTACK_BIN/gstack-repo-mode 2>/dev/null) || true +REPO_MODE=${REPO_MODE:-unknown} +echo "REPO_MODE: $REPO_MODE" +_LAKE_SEEN=$([ -f ~/.gstack/.completeness-intro-seen ] && echo "yes" || echo "no") +echo "LAKE_INTRO: $_LAKE_SEEN" +_TEL=$($GSTACK_BIN/gstack-config get telemetry 2>/dev/null || true) +_TEL_PROMPTED=$([ -f ~/.gstack/.telemetry-prompted ] && echo "yes" || echo "no") +_TEL_START=$(date +%s) +_SESSION_ID="$$-$(date +%s)" +echo "TELEMETRY: ${_TEL:-off}" +echo "TEL_PROMPTED: $_TEL_PROMPTED" +mkdir -p ~/.gstack/analytics +echo '{"skill":"connect-chrome","ts":"'$(date -u +%Y-%m-%dT%H:%M:%SZ)'","repo":"'$(basename "$(git rev-parse --show-toplevel 2>/dev/null)" 2>/dev/null || echo "unknown")'"}' >> ~/.gstack/analytics/skill-usage.jsonl 2>/dev/null || true +# zsh-compatible: use find instead of glob to avoid NOMATCH error +for _PF in $(find 
~/.gstack/analytics -maxdepth 1 -name '.pending-*' 2>/dev/null); do [ -f "$_PF" ] && $GSTACK_BIN/gstack-telemetry-log --event-type skill_run --skill _pending_finalize --outcome unknown --session-id "$_SESSION_ID" 2>/dev/null || true; break; done +``` + +If `PROACTIVE` is `"false"`, do not proactively suggest gstack skills AND do not +auto-invoke skills based on conversation context. Only run skills the user explicitly +types (e.g., /qa, /ship). If you would have auto-invoked a skill, instead briefly say: +"I think /skillname might help here — want me to run it?" and wait for confirmation. +The user opted out of proactive behavior. + +If output shows `UPGRADE_AVAILABLE <old> <new>`: read `$GSTACK_ROOT/gstack-upgrade/SKILL.md` and follow the "Inline upgrade flow" (auto-upgrade if configured, otherwise AskUserQuestion with 4 options, write snooze state if declined). If `JUST_UPGRADED <from> <to>`: tell user "Running gstack v{to} (just updated!)" and continue. + +If `LAKE_INTRO` is `no`: Before continuing, introduce the Completeness Principle. +Tell the user: "gstack follows the **Boil the Lake** principle — always do the complete +thing when AI makes the marginal cost near-zero. Read more: https://garryslist.org/posts/boil-the-ocean" +Then offer to open the essay in their default browser: + +```bash +open https://garryslist.org/posts/boil-the-ocean +touch ~/.gstack/.completeness-intro-seen +``` + +Only run `open` if the user says yes. Always run `touch` to mark as seen. This only happens once. + +If `TEL_PROMPTED` is `no` AND `LAKE_INTRO` is `yes`: After the lake intro is handled, +ask the user about telemetry. Use AskUserQuestion: + +> Help gstack get better! Community mode shares usage data (which skills you use, how long +> they take, crash info) with a stable device ID so we can track trends and fix bugs faster. +> No code, file paths, or repo names are ever sent. +> Change anytime with `gstack-config set telemetry off`. + +Options: +- A) Help gstack get better!
(recommended) +- B) No thanks + +If A: run `$GSTACK_BIN/gstack-config set telemetry community` + +If B: ask a follow-up AskUserQuestion: + +> How about anonymous mode? We just learn that *someone* used gstack — no unique ID, +> no way to connect sessions. Just a counter that helps us know if anyone's out there. + +Options: +- A) Sure, anonymous is fine +- B) No thanks, fully off + +If B→A: run `$GSTACK_BIN/gstack-config set telemetry anonymous` +If B→B: run `$GSTACK_BIN/gstack-config set telemetry off` + +Always run: +```bash +touch ~/.gstack/.telemetry-prompted +``` + +This only happens once. If `TEL_PROMPTED` is `yes`, skip this entirely. + +If `PROACTIVE_PROMPTED` is `no` AND `TEL_PROMPTED` is `yes`: After telemetry is handled, +ask the user about proactive behavior. Use AskUserQuestion: + +> gstack can proactively figure out when you might need a skill while you work — +> like suggesting /qa when you say "does this work?" or /investigate when you hit +> a bug. We recommend keeping this on — it speeds up every part of your workflow. + +Options: +- A) Keep it on (recommended) +- B) Turn it off — I'll type /commands myself + +If A: run `$GSTACK_BIN/gstack-config set proactive true` +If B: run `$GSTACK_BIN/gstack-config set proactive false` + +Always run: +```bash +touch ~/.gstack/.proactive-prompted +``` + +This only happens once. If `PROACTIVE_PROMPTED` is `yes`, skip this entirely. + +## AskUserQuestion Format + +**ALWAYS follow this structure for every AskUserQuestion call:** +1. **Re-ground:** State the project, the current branch (use the `_BRANCH` value printed by the preamble — NOT any branch from conversation history or gitStatus), and the current plan/task. (1-2 sentences) +2. **Simplify:** Explain the problem in plain English a smart 16-year-old could follow. No raw function names, no internal jargon, no implementation details. Use concrete examples and analogies. Say what it DOES, not what it's called. +3. 
**Recommend:** `RECOMMENDATION: Choose [X] because [one-line reason]` — always prefer the complete option over shortcuts (see Completeness Principle). Include `Completeness: X/10` for each option. Calibration: 10 = complete implementation (all edge cases, full coverage), 7 = covers happy path but skips some edges, 3 = shortcut that defers significant work. If both options are 8+, pick the higher; if one is ≤5, flag it. +4. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` + +Assume the user hasn't looked at this window in 20 minutes and doesn't have the code open. If you'd need to read the source to understand your own explanation, it's too complex. + +Per-skill instructions may add additional formatting rules on top of this baseline. + +## Completeness Principle — Boil the Lake + +AI makes completeness near-free. Always recommend the complete option over shortcuts — the delta is minutes with CC+gstack. A "lake" (100% coverage, all edge cases) is boilable; an "ocean" (full rewrite, multi-quarter migration) is not. Boil lakes, flag oceans. + +**Effort reference** — always show both scales: + +| Task type | Human team | CC+gstack | Compression | +|-----------|-----------|-----------|-------------| +| Boilerplate | 2 days | 15 min | ~100x | +| Tests | 1 day | 15 min | ~50x | +| Feature | 1 week | 30 min | ~30x | +| Bug fix | 4 hours | 15 min | ~20x | + +Include `Completeness: X/10` for each option (10=all edge cases, 7=happy path, 3=shortcut). + +## Repo Ownership — See Something, Say Something + +`REPO_MODE` controls how to handle issues outside your branch: +- **`solo`** — You own everything. Investigate and offer to fix proactively. +- **`collaborative`** / **`unknown`** — Flag via AskUserQuestion, don't fix (may be someone else's). + +Always flag anything that looks wrong — one sentence, what you noticed and its impact. 
+ +## Search Before Building + +Before building anything unfamiliar, **search first.** See `$GSTACK_ROOT/ETHOS.md`. +- **Layer 1** (tried and true) — don't reinvent. **Layer 2** (new and popular) — scrutinize. **Layer 3** (first principles) — prize above all. + +**Eureka:** When first-principles reasoning contradicts conventional wisdom, name it and log: +```bash +jq -n --arg ts "$(date -u +%Y-%m-%dT%H:%M:%SZ)" --arg skill "SKILL_NAME" --arg branch "$(git branch --show-current 2>/dev/null)" --arg insight "ONE_LINE_SUMMARY" '{ts:$ts,skill:$skill,branch:$branch,insight:$insight}' >> ~/.gstack/analytics/eureka.jsonl 2>/dev/null || true +``` + +## Contributor Mode + +If `_CONTRIB` is `true`: you are in **contributor mode**. At the end of each major workflow step, rate your gstack experience 0-10. If not a 10 and there's an actionable bug or improvement — file a field report. + +**File only:** gstack tooling bugs where the input was reasonable but gstack failed. **Skip:** user app bugs, network errors, auth failures on user's site. + +**To file:** write `~/.gstack/contributor-logs/{slug}.md`: +``` +# {Title} +**What I tried:** {action} | **What happened:** {result} | **Rating:** {0-10} +## Repro +1. {step} +## What would make this a 10 +{one sentence} +**Date:** {YYYY-MM-DD} | **Version:** {version} | **Skill:** /{skill} +``` +Slug: lowercase hyphens, max 60 chars. Skip if exists. Max 3/session. File inline, don't stop. + +## Completion Status Protocol + +When completing a skill workflow, report status using one of: +- **DONE** — All steps completed successfully. Evidence provided for each claim. +- **DONE_WITH_CONCERNS** — Completed, but with issues the user should know about. List each concern. +- **BLOCKED** — Cannot proceed. State what is blocking and what was tried. +- **NEEDS_CONTEXT** — Missing information required to continue. State exactly what you need. 
+ +### Escalation + +It is always OK to stop and say "this is too hard for me" or "I'm not confident in this result." + +Bad work is worse than no work. You will not be penalized for escalating. +- If you have attempted a task 3 times without success, STOP and escalate. +- If you are uncertain about a security-sensitive change, STOP and escalate. +- If the scope of work exceeds what you can verify, STOP and escalate. + +Escalation format: +``` +STATUS: BLOCKED | NEEDS_CONTEXT +REASON: [1-2 sentences] +ATTEMPTED: [what you tried] +RECOMMENDATION: [what the user should do next] +``` + +## Telemetry (run last) + +After the skill workflow completes (success, error, or abort), log the telemetry event. +Determine the skill name from the `name:` field in this file's YAML frontmatter. +Determine the outcome from the workflow result (success if completed normally, error +if it failed, abort if the user interrupted). + +**PLAN MODE EXCEPTION — ALWAYS RUN:** This command writes telemetry to +`~/.gstack/analytics/` (user config directory, not project files). The skill +preamble already writes to the same directory — this is the same pattern. +Skipping this command loses session duration and outcome data. + +Run this bash: + +```bash +_TEL_END=$(date +%s) +_TEL_DUR=$(( _TEL_END - _TEL_START )) +rm -f ~/.gstack/analytics/.pending-"$_SESSION_ID" 2>/dev/null || true +$GSTACK_ROOT/bin/gstack-telemetry-log \ + --skill "SKILL_NAME" --duration "$_TEL_DUR" --outcome "OUTCOME" \ + --used-browse "USED_BROWSE" --session-id "$_SESSION_ID" 2>/dev/null & +``` + +Replace `SKILL_NAME` with the actual skill name from frontmatter, `OUTCOME` with +success/error/abort, and `USED_BROWSE` with true/false based on whether `$B` was used. +If you cannot determine the outcome, use "unknown". This runs in the background and +never blocks the user. + +## Plan Status Footer + +When you are in plan mode and about to call ExitPlanMode: + +1. 
Check if the plan file already has a `## GSTACK REVIEW REPORT` section. +2. If it DOES — skip (a review skill already wrote a richer report). +3. If it does NOT — run this command: + +\`\`\`bash +$GSTACK_ROOT/bin/gstack-review-read +\`\`\` + +Then write a `## GSTACK REVIEW REPORT` section to the end of the plan file: + +- If the output contains review entries (JSONL lines before `---CONFIG---`): format the + standard report table with runs/status/findings per skill, same format as the review + skills use. +- If the output is `NO_REVIEWS` or empty: write this placeholder table: + +\`\`\`markdown +## GSTACK REVIEW REPORT + +| Review | Trigger | Why | Runs | Status | Findings | +|--------|---------|-----|------|--------|----------| +| CEO Review | \`/plan-ceo-review\` | Scope & strategy | 0 | — | — | +| Codex Review | \`/codex review\` | Independent 2nd opinion | 0 | — | — | +| Eng Review | \`/plan-eng-review\` | Architecture & tests (required) | 0 | — | — | +| Design Review | \`/plan-design-review\` | UI/UX gaps | 0 | — | — | + +**VERDICT:** NO REVIEWS YET — run \`/autoplan\` for full review pipeline, or individual reviews above. +\`\`\` + +**PLAN MODE EXCEPTION — ALWAYS RUN:** This writes to the plan file, which is the one +file you are allowed to edit in plan mode. The plan file review report is part of the +plan's living status. + +# /connect-chrome — Launch Real Chrome with Side Panel + +Connect Claude to a visible Chrome window with the gstack extension auto-loaded. +You see every click, every navigation, every action in real time. + +## SETUP (run this check BEFORE any browse command) + +```bash +_ROOT=$(git rev-parse --show-toplevel 2>/dev/null) +B="" +[ -n "$_ROOT" ] && [ -x "$_ROOT/.agents/skills/gstack/browse/dist/browse" ] && B="$_ROOT/.agents/skills/gstack/browse/dist/browse" +[ -z "$B" ] && B=$GSTACK_BROWSE/browse +if [ -x "$B" ]; then + echo "READY: $B" +else + echo "NEEDS_SETUP" +fi +``` + +If `NEEDS_SETUP`: +1. 
Tell the user: "gstack browse needs a one-time build (~10 seconds). OK to proceed?" Then STOP and wait. +2. Run: `cd <SKILL_DIR> && ./setup` +3. If `bun` is not installed: `curl -fsSL https://bun.sh/install | bash` + +## Step 1: Connect + +```bash +$B connect +``` + +This launches your system Chrome via Playwright with: +- A visible window (headed mode, not headless) +- The gstack Chrome extension pre-loaded +- A green shimmer line + "gstack" pill so you know which window is controlled + +If Chrome is already running, the server restarts in headed mode with a fresh +Chrome instance. Your regular Chrome stays untouched. + +After connecting, print the output to the user. + +## Step 2: Verify + +```bash +$B status +``` + +Confirm the output shows `Mode: cdp`. Print the port number — the user may need +it for the Side Panel. + +## Step 3: Guide the user to the Side Panel + +Use AskUserQuestion: + +> Chrome is launched with gstack control. You should see a green shimmer line at the +> top of the Chrome window and a small "gstack" pill in the bottom-right corner. +> +> The Side Panel extension is pre-loaded. To open it: +> 1. Look for the **puzzle piece icon** (Extensions) in Chrome's toolbar +> 2. Click it → find **gstack browse** → click the **pin icon** to pin it +> 3. Click the **gstack icon** in the toolbar +> 4. Click **Open Side Panel** +> +> The Side Panel shows a live feed of every browse command in real time. +> +> **Port:** The browse server is on port {PORT} — the extension auto-detects it +> if you're using the Playwright-controlled Chrome. If the badge stays gray, click +> the gstack icon and enter port {PORT} manually. + +Options: +- A) I can see the Side Panel — let's go! +- B) I can see Chrome but can't find the extension +- C) Something went wrong + +If B: Tell the user: +> The extension should be auto-loaded, but Chrome sometimes doesn't show it +> immediately. Try: +> 1. Type `chrome://extensions` in the address bar +> 2.
Look for "gstack browse" — it should be listed and enabled +> 3. If not listed, click "Load unpacked" → navigate to the extension folder +> (press Cmd+Shift+G in the file picker, paste this path): +> `{EXTENSION_PATH}` +> +> Then pin it from the puzzle piece icon and open the Side Panel. + +If C: Run `$B status` and show the output. Check if the server is healthy. + +## Step 4: Demo + +After the user confirms the Side Panel is working, run a quick demo so they +can see the activity feed in action: + +```bash +$B goto https://news.ycombinator.com +``` + +Wait 2 seconds, then: + +```bash +$B snapshot -i +``` + +Tell the user: "Check the Side Panel — you should see the `goto` and `snapshot` +commands appear in the activity feed. Every command Claude runs will show up here +in real time." + +## Step 5: Sidebar chat + +After the activity feed demo, tell the user about the sidebar chat: + +> The Side Panel also has a **chat tab**. Try typing a message like "take a +> snapshot and describe this page." A child Claude instance will execute your +> request in the browser — you'll see the commands appear in the activity feed. +> +> The sidebar agent can navigate pages, click buttons, fill forms, and read +> content. Each task gets up to 5 minutes. It runs in an isolated session, so +> it won't interfere with this Claude Code window. + +## Step 6: What's next + +Tell the user: + +> You're all set! Chrome is under Claude's control with the Side Panel showing +> live activity and a chat sidebar for direct commands. Here's what you can do: +> +> - **Chat in the sidebar** — type natural language instructions and Claude +> executes them in the browser +> - **Run any browse command** — `$B goto`, `$B click`, `$B snapshot` — and +> watch it happen in Chrome + the Side Panel +> - **Use /qa or /design-review** — they'll run in the visible Chrome window +> instead of headless. No cookie import needed. 
+> - **`$B focus`** — bring Chrome to the foreground anytime +> - **`$B disconnect`** — return to headless mode when done + +Then proceed with whatever the user asked to do. If they didn't specify a task, +ask what they'd like to test or browse. diff --git a/BROWSER.md b/BROWSER.md index 086d2278..8f626948 100644 --- a/BROWSER.md +++ b/BROWSER.md @@ -18,6 +18,7 @@ This document covers the command reference and internals of gstack's headless br | Cookies | `cookie-import`, `cookie-import-browser` | Import cookies from file or real browser | | Multi-step | `chain` (JSON from stdin) | Batch commands in one call | | Handoff | `handoff [reason]`, `resume` | Switch to visible Chrome for user takeover | +| Real browser | `connect`, `disconnect`, `focus` | Control real Chrome, visible window | All selector arguments accept CSS selectors, `@e` refs after `snapshot`, or `@c` refs after `snapshot -C`. 50+ commands total plus cookie import. @@ -70,6 +71,7 @@ browse/ │ ├── cookie-import-browser.ts # Decrypt + import cookies from real Chromium browsers │ ├── cookie-picker-routes.ts # HTTP routes for interactive cookie picker UI │ ├── cookie-picker-ui.ts # Self-contained HTML/CSS/JS for cookie picker +│ ├── activity.ts # Activity streaming (SSE) for Chrome extension │ └── buffers.ts # CircularBuffer + console/network/dialog capture ├── test/ # Integration tests + HTML fixtures └── dist/ @@ -124,6 +126,125 @@ The server hooks into Playwright's `page.on('console')`, `page.on('response')`, The `console`, `network`, and `dialog` commands read from the in-memory buffers, not disk. +### Real browser mode (`connect`) + +Instead of headless Chromium, `connect` launches your real Chrome as a headed window controlled by Playwright. You see everything Claude does in real time. 
+ +```bash +$B connect # launch real Chrome, headed +$B goto https://app.com # navigates in the visible window +$B snapshot -i # refs from the real page +$B click @e3 # clicks in the real window +$B focus # bring Chrome window to foreground (macOS) +$B status # shows Mode: cdp +$B disconnect # back to headless mode +``` + +The window has a subtle green shimmer line at the top edge and a floating "gstack" pill in the bottom-right corner so you always know which Chrome window is being controlled. + +**How it works:** Playwright's `channel: 'chrome'` launches your system Chrome binary via a native pipe protocol — not CDP WebSocket. All existing browse commands work unchanged because they go through Playwright's abstraction layer. + +**When to use it:** +- QA testing where you want to watch Claude click through your app +- Design review where you need to see exactly what Claude sees +- Debugging where headless behavior differs from real Chrome +- Demos where you're sharing your screen + +**Commands:** + +| Command | What it does | +|---------|-------------| +| `connect` | Launch real Chrome, restart server in headed mode | +| `disconnect` | Close real Chrome, restart in headless mode | +| `focus` | Bring Chrome to foreground (macOS). `focus @e3` also scrolls element into view | +| `status` | Shows `Mode: cdp` when connected, `Mode: launched` when headless | + +**CDP-aware skills:** When in real-browser mode, `/qa` and `/design-review` automatically skip cookie import prompts and headless workarounds. + +### Chrome extension (Side Panel) + +A Chrome extension that shows a live activity feed of browse commands in a Side Panel, plus @ref overlays on the page. + +#### Automatic install (recommended) + +When you run `$B connect`, the extension **auto-loads** into the Playwright-controlled Chrome window. No manual steps needed — the Side Panel is immediately available. 
+ +```bash +$B connect # launches Chrome with extension pre-loaded +# Click the gstack icon in toolbar → Open Side Panel +``` + +The port is auto-configured. You're done. + +#### Manual install (for your regular Chrome) + +If you want the extension in your everyday Chrome (not the Playwright-controlled one), run: + +```bash +bin/gstack-extension # opens chrome://extensions, copies path to clipboard +``` + +Or do it manually: + +1. **Go to `chrome://extensions`** in Chrome's address bar +2. **Toggle "Developer mode" ON** (top-right corner) +3. **Click "Load unpacked"** — a file picker opens +4. **Navigate to the extension folder:** Press **Cmd+Shift+G** in the file picker to open "Go to folder", then paste one of these paths: + - Global install: `~/.claude/skills/gstack/extension` + - Dev/source: `<gstack-repo>/extension` + + Press Enter, then click **Select**. + + (Tip: macOS hides folders starting with `.` — press **Cmd+Shift+.** in the file picker to reveal them if you prefer to navigate manually.) + +5. **Pin it:** Click the puzzle piece icon (Extensions) in the toolbar → pin "gstack browse" +6. **Set the port:** Click the gstack icon → enter the port from `$B status` or `.gstack/browse.json` +7. **Open Side Panel:** Click the gstack icon → "Open Side Panel" + +#### What you get + +| Feature | What it does | +|---------|-------------| +| **Toolbar badge** | Green dot when the browse server is reachable, gray when not | +| **Side Panel** | Live scrolling feed of every browse command — shows command name, args, duration, status (success/error) | +| **Refs tab** | After `$B snapshot`, shows the current @ref list (role + name) | +| **@ref overlays** | Floating panel on the page showing current refs | +| **Connection pill** | Small "gstack" pill in the bottom-right corner of every page when connected | + +#### Troubleshooting + +- **Badge stays gray:** Check that the port is correct.
The browse server may have restarted on a different port — re-run `$B status` and update the port in the popup. +- **Side Panel is empty:** The feed only shows activity after the extension connects. Run a browse command (`$B snapshot`) to see it appear. +- **Extension disappeared after Chrome update:** Sideloaded extensions persist across updates. If it's gone, reload it from Step 3. + +### Sidebar agent + +The Chrome side panel includes a chat interface. Type a message and a child Claude instance executes it in the browser. The sidebar agent has access to `Bash`, `Read`, `Glob`, and `Grep` tools (same as Claude Code, minus `Edit` and `Write` ... read-only by design). + +**How it works:** + +1. You type a message in the side panel chat +2. The extension POSTs to the local browse server (`/sidebar-command`) +3. The server queues the message and the sidebar-agent process spawns `claude -p` with your message + the current page context +4. Claude executes browse commands via Bash (`$B snapshot`, `$B click @e3`, etc.) +5. Progress streams back to the side panel in real time + +**What you can do:** +- "Take a snapshot and describe what you see" +- "Click the Login button, fill in test@example.com / password123, and submit" +- "Go through every row in this table and extract the names and emails" +- "Navigate to Settings > Account and screenshot it" + +**Timeout:** Each task gets up to 5 minutes. Multi-page workflows (navigating a directory, filling forms across pages) work within this window. If a task times out, the side panel shows an error and you can retry or break it into smaller steps. + +**Session isolation:** Each sidebar session runs in its own git worktree. The sidebar agent won't interfere with your main Claude Code session. + +**Authentication:** The sidebar agent uses the same browser session as headed mode. Two options: +1. Log in manually in the headed browser ... your session persists for the sidebar agent +2. 
Import cookies from your real Chrome via `/setup-browser-cookies` + +**Random delays:** If you need the agent to pause between actions (e.g., to avoid rate limits), use `sleep` in bash or `$B wait <milliseconds>`. + ### User handoff When the headless browser can't proceed (CAPTCHA, MFA, complex auth), `handoff` opens a visible Chrome window at the exact same page with all cookies, localStorage, and tabs preserved. The user solves the problem manually, then `resume` returns control to the agent with a fresh snapshot. @@ -171,6 +292,8 @@ No port collisions. No shared state. Each project is fully isolated. | `BROWSE_IDLE_TIMEOUT` | 1800000 (30 min) | Idle shutdown timeout in ms | | `BROWSE_STATE_FILE` | `.gstack/browse.json` | Path to state file (CLI passes to server) | | `BROWSE_SERVER_SCRIPT` | auto-detected | Path to server.ts | +| `BROWSE_CDP_URL` | (none) | Set to `channel:chrome` for real browser mode | +| `BROWSE_CDP_PORT` | 0 | CDP port (used internally) | ### Performance @@ -250,6 +373,7 @@ Tests spin up a local HTTP server (`browse/test/test-server.ts`) serving HTML fi | `browse/src/cookie-import-browser.ts` | Decrypt Chromium cookies from macOS and Linux browser profiles using platform-specific safe-storage key lookup. Auto-detects installed browsers. | | `browse/src/cookie-picker-routes.ts` | HTTP routes for `/cookie-picker/*` — browser list, domain search, import, remove. | | `browse/src/cookie-picker-ui.ts` | Self-contained HTML generator for the interactive cookie picker (dark theme, no frameworks). | +| `browse/src/activity.ts` | Activity streaming — `ActivityEntry` type, `CircularBuffer`, privacy filtering, SSE subscriber management. | | `browse/src/buffers.ts` | `CircularBuffer` (O(1) ring buffer) + console/network/dialog capture with async disk flush. 
| ### Deploying to the active skill diff --git a/CHANGELOG.md b/CHANGELOG.md index 68199eb1..2f989493 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,51 @@ # Changelog +## [0.12.1.0] - 2026-03-26 — Smarter Browsing: Network Idle, State Persistence, Iframes + +Every click, fill, and select now waits for the page to settle before returning. No more stale snapshots because an XHR was still in-flight. Chain accepts pipe-delimited format for faster multi-step flows. You can save and restore browser sessions (cookies + open tabs). And iframe content is now reachable. + +### Added + +- **Network idle detection.** `click`, `fill`, and `select` auto-wait up to 2s for network requests to settle before returning. Catches XHR/fetch triggered by interactions. Uses Playwright's built-in `waitForLoadState('networkidle')`, not a custom tracker. + +- **`$B state save/load`.** Save your browser session (cookies + open tabs) to a named file, load it back later. Files stored at `.gstack/browse-states/{name}.json` with 0o600 permissions. V1 saves cookies + URLs only (not localStorage, which breaks on load-before-navigate). Load replaces the current session, not merge. + +- **`$B frame` command.** Switch command context into an iframe: `$B frame iframe`, `$B frame --name checkout`, `$B frame --url stripe`, or `$B frame @e5`. All subsequent commands (click, fill, snapshot, etc.) operate inside the iframe. `$B frame main` returns to the main page. Snapshot shows `[Context: iframe src="..."]` header. Detached frames auto-recover. + +- **Chain pipe format.** Chain now accepts `$B chain 'goto url | click @e5 | snapshot -ic'` as a fallback when JSON parsing fails. Pipe-delimited with quote-aware tokenization. + +### Changed + +- **Chain post-loop idle wait.** After executing all commands in a chain, if the last was a write command, chain waits for network idle before returning. 
+ +### Fixed + +- **Iframe ref scoping.** Snapshot ref locators, cursor-interactive scan, and cursor locators now use the frame-aware target instead of always scoping to the main page. +- **Detached frame recovery.** `getActiveFrameOrPage()` checks `isDetached()` and auto-recovers. +- **State load resets frame context.** Loading a saved state clears the active frame reference. +- **elementHandle leak in frame command.** Now properly disposed after getting contentFrame. +- **Upload command frame-aware.** `upload` uses the frame-aware target for file input locators. + +## [0.12.0.0] - 2026-03-26 — Headed Mode + Sidebar Agent + +You can now watch Claude work in a real Chrome window and direct it from a sidebar chat. + +### Added + +- **Headed mode with sidebar agent.** `$B connect` launches a visible Chrome window with the gstack extension. The Side Panel shows a live activity feed of every command AND a chat interface where you type natural language instructions. A child Claude instance executes your requests in the browser ... navigate pages, click buttons, fill forms, extract data. Each task gets up to 5 minutes. + +- **Personal automation.** The sidebar agent handles repetitive browser tasks beyond dev workflows. Browse your kid's school parent portal and add parent contact info to Google Contacts. Fill out vendor onboarding forms. Extract data from dashboards. Log in once in the headed browser or import cookies from your real Chrome with `/setup-browser-cookies`. + +- **Chrome extension.** Toolbar badge (green=connected, gray=not), Side Panel with activity feed + chat + refs tab, @ref overlays on the page, and a connection pill showing which window gstack controls. Auto-loads when you run `$B connect`. + +- **`/connect-chrome` skill.** Guided setup: launches Chrome, verifies the extension, demos the activity feed, and introduces the sidebar chat. + +### Changed + +- **Sidebar agent ungated.** Previously required `--chat` flag. Now always available in headed mode. 
The sidebar agent has the same security model as Claude Code itself (Bash, Read, Glob, Grep on localhost). + +- **Agent timeout raised to 5 minutes.** Multi-page tasks (navigating directories, filling forms across pages) need more than the previous 2-minute limit. + ## [0.11.21.0] - 2026-03-26 ### Fixed diff --git a/DESIGN.md b/DESIGN.md new file mode 100644 index 00000000..d1f3ce3d --- /dev/null +++ b/DESIGN.md @@ -0,0 +1,86 @@ +# Design System — gstack + +## Product Context +- **What this is:** Community website for gstack — a CLI tool that turns Claude Code into a virtual engineering team +- **Who it's for:** Developers discovering gstack, existing community members +- **Space/industry:** Developer tools (peers: Linear, Raycast, Warp, Zed) +- **Project type:** Community dashboard + marketing site + +## Aesthetic Direction +- **Direction:** Industrial/Utilitarian — function-first, data-dense, monospace as personality font +- **Decoration level:** Intentional — subtle noise/grain texture on surfaces for materiality +- **Mood:** Serious tool built by someone who cares about craft. Warm, not cold. The CLI heritage IS the brand. +- **Reference sites:** formulae.brew.sh (competitor, but ours is live and interactive), Linear (dark + restrained), Warp (warm accents) + +## Typography +- **Display/Hero:** Satoshi (Black 900 / Bold 700) — geometric with warmth, distinctive letterforms (the lowercase 'a' and 'g'). Not Inter, not Geist. Loaded from Fontshare CDN. +- **Body:** DM Sans (Regular 400 / Medium 500 / Semibold 600) — clean, readable, slightly friendlier than geometric display. Loaded from Google Fonts. +- **UI/Labels:** DM Sans (same as body) +- **Data/Tables:** JetBrains Mono (Regular 400 / Medium 500) — the personality font. Supports tabular-nums. Monospace should be prominent, not hidden in code blocks. Loaded from Google Fonts. +- **Code:** JetBrains Mono +- **Loading:** Google Fonts for DM Sans + JetBrains Mono, Fontshare for Satoshi. Use `display=swap`. 
+- **Scale:** + - Hero: 72px / clamp(40px, 6vw, 72px) + - H1: 48px + - H2: 32px + - H3: 24px + - H4: 18px + - Body: 16px + - Small: 14px + - Caption: 13px + - Micro: 12px + - Nano: 11px (JetBrains Mono labels) + +## Color +- **Approach:** Restrained — amber accent is rare and meaningful. Dashboard data gets the color; chrome stays neutral. +- **Primary (dark mode):** amber-500 #F59E0B — warm, energetic, reads as "terminal cursor" +- **Primary (light mode):** amber-600 #D97706 — darker for contrast against white backgrounds +- **Primary text accent (dark mode):** amber-400 #FBBF24 +- **Primary text accent (light mode):** amber-700 #B45309 +- **Neutrals:** Cool zinc grays + - zinc-50: #FAFAFA (lightest) + - zinc-400: #A1A1AA + - zinc-600: #52525B + - zinc-800: #27272A + - Surface (dark): #141414 + - Base (dark): #0C0C0C + - Surface (light): #FFFFFF + - Base (light): #FAFAF9 +- **Semantic:** success #22C55E, warning #F59E0B, error #EF4444, info #3B82F6 +- **Dark mode:** Default. Near-black base (#0C0C0C), surface cards at #141414, borders at #262626. +- **Light mode:** Warm stone base (#FAFAF9), white surface cards, stone borders (#E7E5E4). Amber accent shifts to amber-600 for contrast. + +## Spacing +- **Base unit:** 4px +- **Density:** Comfortable — not cramped (not Bloomberg Terminal), not spacious (not a marketing site) +- **Scale:** 2xs(2px) xs(4px) sm(8px) md(16px) lg(24px) xl(32px) 2xl(48px) 3xl(64px) + +## Layout +- **Approach:** Grid-disciplined for dashboard, editorial hero for landing page +- **Grid:** 12 columns at lg+, 1 column at mobile +- **Max content width:** 1200px (6xl) +- **Border radius:** sm:4px, md:8px, lg:12px, full:9999px + - Cards/panels: lg (12px) + - Buttons/inputs: md (8px) + - Badges/pills: full (9999px) + - Skill bars: sm (4px) + +## Motion +- **Approach:** Minimal-functional — only transitions that aid comprehension. The dashboard's live feed IS the motion. 
+- **Easing:** enter(ease-out / cubic-bezier(0.16,1,0.3,1)) exit(ease-in) move(ease-in-out) +- **Duration:** micro(50-100ms) short(150ms) medium(250ms) long(400ms) +- **Animated elements:** live feed dot pulse (2s infinite), skill bar fill (600ms ease-out), hover states (150ms) + +## Grain Texture +Apply a subtle noise overlay to the entire page for materiality: +- Dark mode: opacity 0.03 +- Light mode: opacity 0.02 +- Use SVG feTurbulence filter as a CSS background-image on body::after +- pointer-events: none, position: fixed, z-index: 9999 + +## Decisions Log +| Date | Decision | Rationale | +|------|----------|-----------| +| 2026-03-21 | Initial design system | Created by /design-consultation. Industrial aesthetic, warm amber accent, Satoshi + DM Sans + JetBrains Mono. | +| 2026-03-21 | Light mode amber-600 | amber-500 too bright/washed against white; amber-700 too brown/umber. amber-600 is the sweet spot. | +| 2026-03-21 | Grain texture | Adds materiality to flat dark surfaces. Prevents the "generic SaaS template" sameness. | diff --git a/README.md b/README.md index fd81d78c..aad62290 100644 --- a/README.md +++ b/README.md @@ -157,7 +157,7 @@ Each skill feeds into the next. `/office-hours` writes a design doc that `/plan- | `/benchmark` | **Performance Engineer** | Baseline page load times, Core Web Vitals, and resource sizes. Compare before/after on every PR. | | `/document-release` | **Technical Writer** | Update all project docs to match what you just shipped. Catches stale READMEs automatically. | | `/retro` | **Eng Manager** | Team-aware weekly retro. Per-person breakdowns, shipping streaks, test health trends, growth opportunities. `/retro global` runs across all your projects and AI tools (Claude Code, Codex, Gemini). | -| `/browse` | **QA Engineer** | Real Chromium browser, real clicks, real screenshots. ~100ms per command. | +| `/browse` | **QA Engineer** | Give the agent eyes. Real Chromium browser, real clicks, real screenshots. ~100ms per command. 
`$B connect` launches your real Chrome as a headed window — watch every action live. | | `/setup-browser-cookies` | **Session Manager** | Import cookies from your real browser (Chrome, Arc, Brave, Edge) into the headless session. Test authenticated pages. | | `/autoplan` | **Review Pipeline** | One command, fully reviewed plan. Runs CEO → design → eng review automatically with encoded decision principles. Surfaces only taste decisions for your approval. | @@ -179,7 +179,37 @@ Each skill feeds into the next. `/office-hours` writes a design doc that `/plan- gstack works well with one sprint. It gets interesting with ten running at once. -[Conductor](https://conductor.build) runs multiple Claude Code sessions in parallel — each in its own isolated workspace. One session on `/office-hours`, another on `/review`, a third implementing a feature, a fourth running `/qa`. All at the same time. The sprint structure is what makes parallelism work — without a process, ten agents is ten sources of chaos. With a process, each agent knows exactly what to do and when to stop. +**Design is at the heart.** `/design-consultation` doesn't just pick fonts. It researches what's out there in your space, proposes safe choices AND creative risks, generates realistic mockups of your actual product, and writes `DESIGN.md` — and then `/design-review` and `/plan-eng-review` read what you chose. Design decisions flow through the whole system. + +**`/qa` was a massive unlock.** It let me go from 6 to 12 parallel workers. Claude Code saying *"I SEE THE ISSUE"* and then actually fixing it, generating a regression test, and verifying the fix — that changed how I work. The agent has eyes now. + +**Smart review routing.** Just like at a well-run startup: CEO doesn't have to look at infra bug fixes, design review isn't needed for backend changes. gstack tracks what reviews are run, figures out what's appropriate, and just does the smart thing. 
The Review Readiness Dashboard tells you where you stand before you ship. + +**Test everything.** `/ship` bootstraps test frameworks from scratch if your project doesn't have one. Every `/ship` run produces a coverage audit. Every `/qa` bug fix generates a regression test. 100% test coverage is the goal — tests make vibe coding safe instead of yolo coding. + +**`/document-release` is the engineer you never had.** It reads every doc file in your project, cross-references the diff, and updates everything that drifted. README, ARCHITECTURE, CONTRIBUTING, CLAUDE.md, TODOS — all kept current automatically. And now `/ship` auto-invokes it — docs stay current without an extra command. + +**Real browser mode.** `$B connect` launches your actual Chrome as a headed window controlled by Playwright. You watch Claude click, fill, and navigate in real time — same window, same screen. A subtle green shimmer at the top edge tells you which Chrome window gstack controls. All existing browse commands work unchanged. `$B disconnect` returns to headless. A Chrome extension Side Panel shows a live activity feed of every command and a chat sidebar where you can direct Claude. This is co-presence — Claude isn't remote-controlling a hidden browser, it's sitting next to you in the same cockpit. + +**Sidebar agent — your AI browser assistant.** Type natural language instructions in the Chrome side panel and a child Claude instance executes them. "Navigate to the settings page and screenshot it." "Fill out this form with test data." "Go through every item in this list and extract the prices." Each task gets up to 5 minutes. The sidebar agent runs in an isolated session, so it won't interfere with your main Claude Code window. It's like having a second pair of hands in the browser. + +**Personal automation.** The sidebar agent isn't just for dev workflows. Example: "Browse my kid's school parent portal and add all the other parents' names, phone numbers, and photos to my Google Contacts." 
Two ways to get authenticated: (1) log in once in the headed browser — your session persists, or (2) run `/setup-browser-cookies` to import cookies from your real Chrome. Once authenticated, Claude navigates the directory, extracts the data, and creates the contacts. + +**Browser handoff when the AI gets stuck.** Hit a CAPTCHA, auth wall, or MFA prompt? `$B handoff` opens a visible Chrome at the exact same page with all your cookies and tabs intact. Solve the problem, tell Claude you're done, `$B resume` picks up right where it left off. The agent even suggests it automatically after 3 consecutive failures. + +**Multi-AI second opinion.** `/codex` gets an independent review from OpenAI's Codex CLI — a completely different AI looking at the same diff. Three modes: code review with a pass/fail gate, adversarial challenge that actively tries to break your code, and open consultation with session continuity. When both `/review` (Claude) and `/codex` (OpenAI) have reviewed the same branch, you get a cross-model analysis showing which findings overlap and which are unique to each. + +**Safety guardrails on demand.** Say "be careful" and `/careful` warns before any destructive command — rm -rf, DROP TABLE, force-push, git reset --hard. `/freeze` locks edits to one directory while debugging so Claude can't accidentally "fix" unrelated code. `/guard` activates both. `/investigate` auto-freezes to the module being investigated. + +**Proactive skill suggestions.** gstack notices what stage you're in — brainstorming, reviewing, debugging, testing — and suggests the right skill. Don't like it? Say "stop suggesting" and it remembers across sessions. + +## 10-15 parallel sprints + +gstack is powerful with one sprint. It is transformative with ten running at once. + +[Conductor](https://conductor.build) runs multiple Claude Code sessions in parallel — each in its own isolated workspace. 
One session running `/office-hours` on a new idea, another doing `/review` on a PR, a third implementing a feature, a fourth running `/qa` on staging, and six more on other branches. All at the same time. I regularly run 10-15 parallel sprints — that's the practical max right now. + +The sprint structure is what makes parallelism work. Without a process, ten agents is ten sources of chaos. With a process — think, plan, build, review, test, ship — each agent knows exactly what to do and when to stop. You manage them the way a CEO manages a team: check in on the decisions that matter, let the rest run. --- diff --git a/SKILL.md b/SKILL.md index 5f8d0f33..b3f1ce3d 100644 --- a/SKILL.md +++ b/SKILL.md @@ -591,6 +591,9 @@ Refs are invalidated on navigation — run `snapshot` again after `goto`. | Command | Description | |---------|-------------| | `chain` | Run commands from JSON stdin. Format: [["cmd","arg1",...],...] | +| `frame <sel|@ref|--name n|--url pattern|main>` | Switch to iframe context (or main to return) | +| `inbox [--clear]` | List messages from sidebar scout inbox | +| `watch [stop]` | Passive observation — periodic snapshots while user browses | ### Tabs | Command | Description | @@ -603,9 +606,13 @@ Refs are invalidated on navigation — run `snapshot` again after `goto`. 
### Server | Command | Description | |---------|-------------| +| `connect` | Launch headed Chromium with Chrome extension | +| `disconnect` | Disconnect headed browser, return to headless mode | +| `focus [@ref]` | Bring headed browser window to foreground (macOS) | | `handoff [message]` | Open visible Chrome at current page for user takeover | | `restart` | Restart server | | `resume` | Re-snapshot after user takeover, return control to AI | +| `state save|load <name>` | Save/load browser state (cookies + URLs) | | `status` | Health check | | `stop` | Shutdown server | diff --git a/TODOS.md b/TODOS.md index 3ee995b6..8458a98a 100644 --- a/TODOS.md +++ b/TODOS.md @@ -14,6 +14,26 @@ **Priority:** P2 **Depends on:** Blog post about Search Before Building +## Chrome DevTools MCP Integration + +### Real Chrome session access + +**What:** Integrate Chrome DevTools MCP to connect to the user's real Chrome session with real cookies, real state, no Playwright middleman. + +**Why:** Right now, headed mode launches a fresh Chromium profile. Users must log in manually or import cookies. Chrome DevTools MCP connects to the user's actual Chrome ... instant access to every authenticated site. This is the future of browser automation for AI agents. + +**Context:** Google shipped Chrome DevTools MCP in Chrome 146+ (June 2025). It provides screenshots, console messages, performance traces, Lighthouse audits, and full page interaction through the user's real browser. gstack should use it for real-session access while keeping Playwright for headless CI/testing workflows. + +Potential new skills: +- `/debug-browser`: JS error tracing with source-mapped stack traces +- `/perf-debug`: performance traces, Core Web Vitals, network waterfall + +May replace `/setup-browser-cookies` for most use cases since the user's real cookies are already there. 
+ +**Effort:** L (human: ~2 weeks / CC: ~2 hours) +**Priority:** P0 +**Depends on:** Chrome 146+, DevTools MCP server installed + ## Browse ### Bundle server.ts into compiled binary @@ -60,17 +80,14 @@ **Effort:** S **Priority:** P3 -### State persistence +### State persistence — SHIPPED -**What:** Save/load cookies + localStorage to JSON files for reproducible test sessions. +~~**What:** Save/load cookies + localStorage to JSON files for reproducible test sessions.~~ -**Why:** Enables "resume where I left off" for QA sessions and repeatable auth states. +`$B state save/load` ships in v0.12.1.0. V1 saves cookies + URLs only (not localStorage, which breaks on load-before-navigate). Files at `.gstack/browse-states/{name}.json` with 0o600 permissions. Load replaces session (closes all pages first). Name sanitized to `[a-zA-Z0-9_-]`. -**Context:** The `saveState()`/`restoreState()` helpers from the handoff feature (browser-manager.ts) already capture cookies + localStorage + sessionStorage + URLs. Adding file I/O on top is ~20 lines. - -**Effort:** S -**Priority:** P3 -**Depends on:** Sessions +**Remaining:** V2 localStorage support (needs pre-navigation injection strategy). +**Completed:** v0.12.1.0 (2026-03-26) ### Auth vault @@ -82,14 +99,13 @@ **Priority:** P3 **Depends on:** Sessions, state persistence -### Iframe support +### Iframe support — SHIPPED -**What:** `frame ` and `frame main` commands for cross-frame interaction. +~~**What:** `frame ` and `frame main` commands for cross-frame interaction.~~ -**Why:** Many web apps use iframes (embeds, payment forms, ads). Currently invisible to browse. +`$B frame` ships in v0.12.1.0. Supports CSS selector, @ref, `--name`, and `--url` pattern matching. Execution target abstraction (`getActiveFrameOrPage()`) across all read/write/snapshot commands. Frame context cleared on navigation, tab switch, resume. Detached frame auto-recovery. 
Page-only operations (goto, screenshot, viewport) throw a clear error when in frame context. -**Effort:** M -**Priority:** P4 +**Completed:** v0.12.1.0 (2026-03-26) ### Semantic locators @@ -145,14 +161,39 @@ **Effort:** L **Priority:** P4 -### CDP mode +### Headed mode with Chrome extension — SHIPPED -**What:** Connect to already-running Chrome/Electron apps via Chrome DevTools Protocol. +`$B connect` launches Playwright's bundled Chromium in headed mode with the gstack Chrome extension auto-loaded. `$B handoff` now produces the same result (extension + side panel). Sidebar chat was originally gated behind a `--chat` flag; as of v0.12.0.0 it is always available in headed mode. -**Why:** Test production apps, Electron apps, and existing browser sessions without launching new instances. +### `$B watch` — SHIPPED -**Effort:** M +Claude observes user browsing in passive read-only mode with periodic snapshots. `$B watch stop` exits with summary. Mutation commands blocked during watch. + +### Sidebar scout / file drop relay — SHIPPED + +Sidebar agent writes structured messages to `.context/sidebar-inbox/`. Workspace agent reads via `$B inbox`. Message format: `{type, timestamp, page, userMessage, sidebarSessionId}`. + +### Multi-agent tab isolation + +**What:** Two Claude sessions connect to the same browser, each operating on different tabs. No cross-contamination. + +**Why:** Enables parallel /qa + /design-review on different tabs in the same browser. + +**Context:** Requires tab ownership model for concurrent headed connections. Playwright may not cleanly support two persistent contexts. Needs investigation. + +**Effort:** L (human: ~2 weeks / CC: ~2 hours) +**Priority:** P3 +**Depends on:** Headed mode (shipped) + +### Chrome Web Store publishing + +**What:** Publish the gstack browse Chrome extension to Chrome Web Store for easier install. + +**Why:** Currently sideloaded via chrome://extensions. Web Store makes install one-click. 
+ +**Effort:** S +**Priority:** P4 +**Depends on:** Chrome extension proving value via sideloading ### Linux cookie decryption — PARTIALLY SHIPPED diff --git a/VERSION b/VERSION index 5e1d8ddf..ba9b59b5 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -0.11.21.0 +0.12.1.0 diff --git a/bin/chrome-cdp b/bin/chrome-cdp new file mode 100755 index 00000000..9c1ad717 --- /dev/null +++ b/bin/chrome-cdp @@ -0,0 +1,68 @@
+#!/bin/bash
+# Launch Chrome with CDP (remote debugging) enabled.
+# Usage: chrome-cdp [port]
+#
+# Chrome refuses --remote-debugging-port on its default data directory.
+# We create a separate data dir with a symlink to the user's real profile,
+# so Chrome thinks it's non-default but uses the same cookies/extensions.
+
+PORT="${1:-9222}"
+CHROME="/Applications/Google Chrome.app/Contents/MacOS/Google Chrome"
+REAL_PROFILE="$HOME/Library/Application Support/Google/Chrome"
+CDP_DATA_DIR="$HOME/.gstack/cdp-profile/chrome"
+
+if ! [ -f "$CHROME" ]; then
+  echo "Chrome not found at $CHROME" >&2
+  exit 1
+fi
+
+# Check if Chrome is running (it must be restarted to pick up the CDP flags)
+if pgrep -f "Google Chrome" >/dev/null 2>&1; then
+  echo "Chrome is still running. Quitting..."
+  osascript -e 'tell application "Google Chrome" to quit' 2>/dev/null
+
+  # Wait for it to fully exit (20 polls, 0.5s apart)
+  for i in $(seq 1 20); do
+    pgrep -f "Google Chrome" >/dev/null 2>&1 || break
+    sleep 0.5
+  done
+
+  if pgrep -f "Google Chrome" >/dev/null 2>&1; then
+    echo "Chrome won't quit. Force-killing..." >&2
+    pkill -f "Google Chrome"
+    sleep 1
+  fi
+fi
+
+# Set up CDP data dir with symlinked profile
+# Chrome requires a "non-default" data dir for --remote-debugging-port.
+# We symlink the real Default profile so cookies/extensions carry over.
+mkdir -p "$CDP_DATA_DIR"
+if [ -d "$REAL_PROFILE/Default" ] && ! [ -e "$CDP_DATA_DIR/Default" ]; then
+  ln -s "$REAL_PROFILE/Default" "$CDP_DATA_DIR/Default"
+  echo "Linked real Chrome profile into CDP data dir"
+fi
+# Also link Local State (contains crypto keys for cookie decryption, etc.)
+if [ -f "$REAL_PROFILE/Local State" ] && ! [ -e "$CDP_DATA_DIR/Local State" ]; then
+  ln -s "$REAL_PROFILE/Local State" "$CDP_DATA_DIR/Local State"
+fi
+
+echo "Launching Chrome with CDP on port $PORT..."
+"$CHROME" \
+  --remote-debugging-port="$PORT" \
+  --user-data-dir="$CDP_DATA_DIR" \
+  --restore-last-session &
+disown
+
+# Wait for CDP to be available (-f: an HTTP error from some other server on this port does not count as ready)
+for i in $(seq 1 30); do
+  if curl -sf "http://127.0.0.1:$PORT/json/version" >/dev/null 2>&1; then
+    echo "CDP ready on port $PORT"
+    echo "Run: \$B connect chrome"
+    exit 0
+  fi
+  sleep 1
+done
+
+echo "CDP not available after 30s." >&2
+exit 1
diff --git a/bin/gstack-extension b/bin/gstack-extension new file mode 100755 index 00000000..8d0a62af --- /dev/null +++ b/bin/gstack-extension @@ -0,0 +1,65 @@
+#!/bin/bash
+# gstack-extension — helper to install the Chrome extension
+#
+# When using $B connect, the extension auto-loads. This script is for
+# installing it in your regular Chrome (not the Playwright-controlled one).
+
+set -e
+
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+REPO_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
+
+# Find the extension directory
+EXT_DIR=""
+if [ -f "$REPO_ROOT/extension/manifest.json" ]; then
+  EXT_DIR="$REPO_ROOT/extension"
+elif [ -f "$HOME/.claude/skills/gstack/extension/manifest.json" ]; then
+  EXT_DIR="$HOME/.claude/skills/gstack/extension"
+fi
+
+if [ -z "$EXT_DIR" ]; then
+  echo "Error: extension/ directory not found." 
+  echo "Expected at: $REPO_ROOT/extension/ or ~/.claude/skills/gstack/extension/"
+  exit 1
+fi
+
+# Copy path to clipboard (best-effort: a missing or failing pbcopy must not abort the script under set -e)
+echo -n "$EXT_DIR" | pbcopy 2>/dev/null || true
+
+# Get browse server port (optional: stays empty when the state file has no port; tolerates whitespace after the colon)
+PORT=""
+STATE_FILE="$REPO_ROOT/.gstack/browse.json"
+if [ -f "$STATE_FILE" ]; then
+  PORT=$(grep -o '"port":[[:space:]]*[0-9][0-9]*' "$STATE_FILE" | grep -o '[0-9][0-9]*' || true)
+fi
+
+echo "gstack Chrome Extension Setup"
+echo "=============================="
+echo ""
+echo "Extension path (copied to clipboard):"
+echo " $EXT_DIR"
+echo ""
+
+if [ -n "$PORT" ]; then
+  echo "Browse server port: $PORT"
+  echo ""
+fi
+
+echo "Quick install (if using \$B connect):"
+echo " The extension auto-loads when you run \$B connect."
+echo " No manual installation needed!"
+echo ""
+echo "Manual install (for your regular Chrome):"
+echo ""
+echo " 1. Opening chrome://extensions now..."
+
+# Open chrome://extensions (AppleScript first, then `open`, then fall back to printing instructions)
+osascript -e 'tell application "Google Chrome" to open location "chrome://extensions"' 2>/dev/null || \
+  open "chrome://extensions" 2>/dev/null || \
+  echo " Could not open Chrome. Navigate to chrome://extensions manually."
+
+echo " 2. Toggle 'Developer mode' ON (top-right)"
+echo " 3. Click 'Load unpacked'"
+echo " 4. In the file picker: Cmd+Shift+G → paste (path is in your clipboard) → Enter → Select"
+echo " 5. Click the gstack puzzle icon in toolbar → enter port: ${PORT:-<check \$B status>}"
+echo " 6. Click 'Open Side Panel'"
diff --git a/browse/SKILL.md b/browse/SKILL.md index c52dcaa5..399aec3a 100644 --- a/browse/SKILL.md +++ b/browse/SKILL.md @@ -474,6 +474,9 @@ Refs are invalidated on navigation — run `snapshot` again after `goto`. | Command | Description | |---------|-------------| | `chain` | Run commands from JSON stdin. Format: [["cmd","arg1",...],...] 
| +| `frame <sel|@ref|--name n|--url pattern|main>` | Switch to iframe context (or main to return) | +| `inbox [--clear]` | List messages from sidebar scout inbox | +| `watch [stop]` | Passive observation — periodic snapshots while user browses | ### Tabs | Command | Description | @@ -486,8 +489,12 @@ Refs are invalidated on navigation — run `snapshot` again after `goto`. ### Server | Command | Description | |---------|-------------| +| `connect` | Launch headed Chromium with Chrome extension | +| `disconnect` | Disconnect headed browser, return to headless mode | +| `focus [@ref]` | Bring headed browser window to foreground (macOS) | | `handoff [message]` | Open visible Chrome at current page for user takeover | | `restart` | Restart server | | `resume` | Re-snapshot after user takeover, return control to AI | +| `state save|load <name>` | Save/load browser state (cookies + URLs) | | `status` | Health check | | `stop` | Shutdown server | diff --git a/browse/src/activity.ts b/browse/src/activity.ts new file mode 100644 index 00000000..e76467d4 --- /dev/null +++ b/browse/src/activity.ts @@ -0,0 +1,208 @@ +/** + * Activity streaming — real-time feed of browse commands for the Chrome extension Side Panel + * + * Architecture: + * handleCommand() ──► emitActivity(command_start) + * ──► emitActivity(command_end) + * wirePageEvents() ──► emitActivity(navigation) + * + * GET /activity/stream?after=ID ──► SSE via ReadableStream + * GET /activity/history?limit=N ──► REST fallback + * + * Privacy: filterArgs() redacts passwords, auth tokens, and sensitive query params. + * Backpressure: subscribers notified via queueMicrotask (never blocks command path). + * Gap detection: client sends ?after=ID, server detects if ring buffer overflowed. 
+ */
+
+import { CircularBuffer } from './buffers';
+
+// ─── Types ──────────────────────────────────────────────────────
+
+export interface ActivityEntry {
+  id: number; // assigned by emitActivity() from a module-level counter
+  timestamp: number; // Date.now() at emit time
+  type: 'command_start' | 'command_end' | 'navigation' | 'error';
+  command?: string;
+  args?: string[]; // stored after filterArgs() redaction
+  url?: string;
+  duration?: number;
+  status?: 'ok' | 'error';
+  error?: string;
+  result?: string; // stored after truncateResult() (200 chars + '...')
+  tabs?: number;
+  mode?: string;
+}
+
+// ─── Buffer & Subscribers ───────────────────────────────────────
+
+const BUFFER_CAPACITY = 1000;
+const activityBuffer = new CircularBuffer<ActivityEntry>(BUFFER_CAPACITY);
+let nextId = 1;
+
+type ActivitySubscriber = (entry: ActivityEntry) => void;
+const subscribers = new Set<ActivitySubscriber>();
+
+// ─── Privacy Filtering ─────────────────────────────────────────
+
+const SENSITIVE_COMMANDS = new Set(['fill', 'type', 'cookie', 'header']);
+const SENSITIVE_PARAM_PATTERN = /(?:^|[^a-z0-9])(?:password|token|secret|key|auth|bearer|api[_-]?key)(?:$|[^a-z0-9])/i; // "_" counts as a separator, so access_token / client_secret match (\b did not)
+
+/**
+ * Redact sensitive data from command args before streaming. 
+ */ +export function filterArgs(command: string, args: string[]): string[] { + if (!args || args.length === 0) return args; + + // fill: redact the value (last arg) for password-type fields + if (command === 'fill' && args.length >= 2) { + const selector = args[0]; + // If the selector suggests a password field, redact the value + if (/password|passwd|secret|token/i.test(selector)) { + return [selector, '[REDACTED]']; + } + return args; + } + + // header: redact Authorization and other sensitive headers + if (command === 'header' && args.length >= 1) { + const headerLine = args[0]; + if (/^(authorization|x-api-key|cookie|set-cookie)/i.test(headerLine)) { + const colonIdx = headerLine.indexOf(':'); + if (colonIdx > 0) { + return [headerLine.substring(0, colonIdx + 1) + '[REDACTED]']; + } + } + return args; + } + + // cookie: redact cookie values + if (command === 'cookie' && args.length >= 1) { + const cookieStr = args[0]; + const eqIdx = cookieStr.indexOf('='); + if (eqIdx > 0) { + return [cookieStr.substring(0, eqIdx + 1) + '[REDACTED]']; + } + return args; + } + + // type: always redact (could be a password field) + if (command === 'type') { + return ['[REDACTED]']; + } + + // URL args: redact sensitive query params + return args.map(arg => { + if (arg.startsWith('http://') || arg.startsWith('https://')) { + try { + const url = new URL(arg); + let redacted = false; + for (const key of url.searchParams.keys()) { + if (SENSITIVE_PARAM_PATTERN.test(key)) { + url.searchParams.set(key, '[REDACTED]'); + redacted = true; + } + } + return redacted ? url.toString() : arg; + } catch { + return arg; + } + } + return arg; + }); +} + +/** + * Truncate result text for streaming (max 200 chars). 
+ */ +function truncateResult(result: string | undefined): string | undefined { + if (!result) return undefined; + if (result.length <= 200) return result; + return result.substring(0, 200) + '...'; +} + +// ─── Public API ───────────────────────────────────────────────── + +/** + * Emit an activity event. Backpressure-safe: subscribers notified asynchronously. + */ +export function emitActivity(entry: Omit): ActivityEntry { + const full: ActivityEntry = { + ...entry, + id: nextId++, + timestamp: Date.now(), + args: entry.args ? filterArgs(entry.command || '', entry.args) : undefined, + result: truncateResult(entry.result), + }; + activityBuffer.push(full); + + // Notify subscribers asynchronously — never block the command path + for (const notify of subscribers) { + queueMicrotask(() => { + try { notify(full); } catch { /* subscriber error — don't crash */ } + }); + } + + return full; +} + +/** + * Subscribe to live activity events. Returns unsubscribe function. + */ +export function subscribe(fn: ActivitySubscriber): () => void { + subscribers.add(fn); + return () => subscribers.delete(fn); +} + +/** + * Get recent activity entries after the given cursor ID. + * Returns entries and gap info if the buffer has overflowed. + */ +export function getActivityAfter(afterId: number): { + entries: ActivityEntry[]; + gap: boolean; + gapFrom?: number; + availableFrom?: number; + totalAdded: number; +} { + const total = activityBuffer.totalAdded; + const allEntries = activityBuffer.toArray(); + + if (afterId === 0) { + return { entries: allEntries, gap: false, totalAdded: total }; + } + + // Check for gap: if afterId is too old and has been evicted + const oldestId = allEntries.length > 0 ? 
allEntries[0].id : nextId; + if (afterId < oldestId) { + return { + entries: allEntries, + gap: true, + gapFrom: afterId + 1, + availableFrom: oldestId, + totalAdded: total, + }; + } + + // Filter to entries after the cursor + const filtered = allEntries.filter(e => e.id > afterId); + return { entries: filtered, gap: false, totalAdded: total }; +} + +/** + * Get the N most recent activity entries. + */ +export function getActivityHistory(limit: number = 50): { + entries: ActivityEntry[]; + totalAdded: number; +} { + const allEntries = activityBuffer.toArray(); + const sliced = limit < allEntries.length ? allEntries.slice(-limit) : allEntries; + return { entries: sliced, totalAdded: activityBuffer.totalAdded }; +} + +/** + * Get subscriber count (for debugging/health). + */ +export function getSubscriberCount(): number { + return subscribers.size; +} diff --git a/browse/src/browser-manager.ts b/browse/src/browser-manager.ts index 335ff19e..1ef58e36 100644 --- a/browse/src/browser-manager.ts +++ b/browse/src/browser-manager.ts @@ -61,6 +61,88 @@ export class BrowserManager { private isHeaded: boolean = false; private consecutiveFailures: number = 0; + // ─── Watch Mode ───────────────────────────────────────── + private watching = false; + public watchInterval: ReturnType | null = null; + private watchSnapshots: string[] = []; + private watchStartTime: number = 0; + + // ─── Headed State ──────────────────────────────────────── + private connectionMode: 'launched' | 'headed' = 'launched'; + private intentionalDisconnect = false; + + getConnectionMode(): 'launched' | 'headed' { return this.connectionMode; } + + // ─── Watch Mode Methods ───────────────────────────────── + isWatching(): boolean { return this.watching; } + + startWatch(): void { + this.watching = true; + this.watchSnapshots = []; + this.watchStartTime = Date.now(); + } + + stopWatch(): { snapshots: string[]; duration: number } { + this.watching = false; + if (this.watchInterval) { + 
clearInterval(this.watchInterval); + this.watchInterval = null; + } + const snapshots = this.watchSnapshots; + const duration = Date.now() - this.watchStartTime; + this.watchSnapshots = []; + this.watchStartTime = 0; + return { snapshots, duration }; + } + + addWatchSnapshot(snapshot: string): void { + this.watchSnapshots.push(snapshot); + } + + /** + * Find the gstack Chrome extension directory. + * Checks: repo root /extension, global install, dev install. + */ + private findExtensionPath(): string | null { + const fs = require('fs'); + const path = require('path'); + const candidates = [ + // Relative to this source file (dev mode: browse/src/ -> ../../extension) + path.resolve(__dirname, '..', '..', 'extension'), + // Global gstack install + path.join(process.env.HOME || '', '.claude', 'skills', 'gstack', 'extension'), + // Git repo root (detected via BROWSE_STATE_FILE location) + (() => { + const stateFile = process.env.BROWSE_STATE_FILE || ''; + if (stateFile) { + const repoRoot = path.resolve(path.dirname(stateFile), '..'); + return path.join(repoRoot, '.claude', 'skills', 'gstack', 'extension'); + } + return ''; + })(), + ].filter(Boolean); + + for (const candidate of candidates) { + try { + if (fs.existsSync(path.join(candidate, 'manifest.json'))) { + return candidate; + } + } catch {} + } + return null; + } + + /** + * Get the ref map for external consumers (e.g., /refs endpoint). + */ + getRefMap(): Array<{ ref: string; role: string; name: string }> { + const refs: Array<{ ref: string; role: string; name: string }> = []; + for (const [ref, entry] of this.refMap) { + refs.push({ ref, role: entry.role, name: entry.name }); + } + return refs; + } + async launch() { // ─── Extension Support ──────────────────────────────────── // BROWSE_EXTENSIONS_DIR points to an unpacked Chrome extension directory. 
@@ -119,15 +201,140 @@ export class BrowserManager { await this.newTab(); } - async close() { + // ─── Headed Mode ───────────────────────────────────────────── + /** + * Launch Playwright's bundled Chromium in headed mode with the gstack + * Chrome extension auto-loaded. Uses launchPersistentContext() which + * is required for extension loading (launch() + newContext() can't + * load extensions). + * + * The browser launches headed with a visible window — the user sees + * every action Claude takes in real time. + */ + async launchHeaded(): Promise { + // Clear old state before repopulating + this.pages.clear(); + this.refMap.clear(); + this.nextTabId = 1; + + // Find the gstack extension directory for auto-loading + const extensionPath = this.findExtensionPath(); + const launchArgs = ['--hide-crash-restore-bubble']; + if (extensionPath) { + launchArgs.push(`--disable-extensions-except=${extensionPath}`); + launchArgs.push(`--load-extension=${extensionPath}`); + } + + // Launch headed Chromium via Playwright's persistent context. + // Extensions REQUIRE launchPersistentContext (not launch + newContext). + // Real Chrome (executablePath/channel) silently blocks --load-extension, + // so we use Playwright's bundled Chromium which reliably loads extensions. 
+ const fs = require('fs'); + const path = require('path'); + const userDataDir = path.join(process.env.HOME || '/tmp', '.gstack', 'chromium-profile'); + fs.mkdirSync(userDataDir, { recursive: true }); + + this.context = await chromium.launchPersistentContext(userDataDir, { + headless: false, + args: launchArgs, + viewport: null, // Use browser's default viewport (real window size) + // Playwright adds flags that block extension loading + ignoreDefaultArgs: [ + '--disable-extensions', + '--disable-component-extensions-with-background-pages', + ], + }); + this.browser = this.context.browser(); + this.connectionMode = 'headed'; + this.intentionalDisconnect = false; + + // Inject visual indicator — subtle top-edge amber gradient + // Extension's content script handles the floating pill + const indicatorScript = () => { + const injectIndicator = () => { + if (document.getElementById('gstack-ctrl')) return; + + const topLine = document.createElement('div'); + topLine.id = 'gstack-ctrl'; + topLine.style.cssText = ` + position: fixed; top: 0; left: 0; right: 0; height: 2px; + background: linear-gradient(90deg, #F59E0B, #FBBF24, #F59E0B); + background-size: 200% 100%; + animation: gstack-shimmer 3s linear infinite; + pointer-events: none; z-index: 2147483647; + opacity: 0.8; + `; + + const style = document.createElement('style'); + style.textContent = ` + @keyframes gstack-shimmer { + 0% { background-position: 200% 0; } + 100% { background-position: -200% 0; } + } + @media (prefers-reduced-motion: reduce) { + #gstack-ctrl { animation: none !important; } + } + `; + + document.documentElement.appendChild(style); + document.documentElement.appendChild(topLine); + }; + if (document.readyState === 'loading') { + document.addEventListener('DOMContentLoaded', injectIndicator); + } else { + injectIndicator(); + } + }; + await this.context.addInitScript(indicatorScript); + + // Persistent context opens a default page — adopt it instead of creating a new one + const existingPages = 
this.context.pages(); + if (existingPages.length > 0) { + const page = existingPages[0]; + const id = this.nextTabId++; + this.pages.set(id, page); + this.activeTabId = id; + this.wirePageEvents(page); + // Inject indicator on restored page (addInitScript only fires on new navigations) + try { await page.evaluate(indicatorScript); } catch {} + } else { + await this.newTab(); + } + + // Browser disconnect handler — exit code 2 distinguishes from crashes (1) if (this.browser) { - // Remove disconnect handler to avoid exit during intentional close - this.browser.removeAllListeners('disconnected'); - // Timeout: headed browser.close() can hang on macOS - await Promise.race([ - this.browser.close(), - new Promise(resolve => setTimeout(resolve, 5000)), - ]).catch(() => {}); + this.browser.on('disconnected', () => { + if (this.intentionalDisconnect) return; + console.error('[browse] Real browser disconnected (user closed or crashed).'); + console.error('[browse] Run `$B connect` to reconnect.'); + process.exit(2); + }); + } + + // Headed mode defaults + this.dialogAutoAccept = false; // Don't dismiss user's real dialogs + this.isHeaded = true; + this.consecutiveFailures = 0; + } + + async close() { + if (this.browser || (this.connectionMode === 'headed' && this.context)) { + if (this.connectionMode === 'headed') { + // Headed/persistent context mode: close the context (which closes the browser) + this.intentionalDisconnect = true; + if (this.browser) this.browser.removeAllListeners('disconnected'); + await Promise.race([ + this.context ? 
this.context.close() : Promise.resolve(), + new Promise(resolve => setTimeout(resolve, 5000)), + ]).catch(() => {}); + } else { + // Launched mode: close the browser we spawned + this.browser.removeAllListeners('disconnected'); + await Promise.race([ + this.browser.close(), + new Promise(resolve => setTimeout(resolve, 5000)), + ]).catch(() => {}); + } this.browser = null; } } @@ -195,6 +402,7 @@ export class BrowserManager { switchTab(id: number): void { if (!this.pages.has(id)) throw new Error(`Tab ${id} not found`); this.activeTabId = id; + this.activeFrame = null; // Frame context is per-tab } getTabCount(): number { @@ -324,6 +532,42 @@ export class BrowserManager { return this.customUserAgent; } + // ─── Lifecycle helpers ─────────────────────────────── + /** + * Close all open pages and clear the pages map. + * Used by state load to replace the current session. + */ + async closeAllPages(): Promise { + for (const page of this.pages.values()) { + await page.close().catch(() => {}); + } + this.pages.clear(); + this.clearRefs(); + } + + // ─── Frame context ───────────────────────────────── + private activeFrame: import('playwright').Frame | null = null; + + setFrame(frame: import('playwright').Frame | null): void { + this.activeFrame = frame; + } + + getFrame(): import('playwright').Frame | null { + return this.activeFrame; + } + + /** + * Returns the active frame if set, otherwise the current page. + * Use this for operations that work on both Page and Frame (locator, evaluate, etc.). + */ + getActiveFrameOrPage(): import('playwright').Page | import('playwright').Frame { + // Auto-recover from detached frames (iframe removed/navigated) + if (this.activeFrame?.isDetached()) { + this.activeFrame = null; + } + return this.activeFrame ?? this.getPage(); + } + // ─── State Save/Restore (shared by recreateContext + handoff) ─ /** * Capture browser state: cookies, localStorage, sessionStorage, URLs, active tab. 
@@ -416,6 +660,9 @@ export class BrowserManager { * Falls back to a clean slate on any failure. */ async recreateContext(): Promise { + if (this.connectionMode === 'headed') { + throw new Error('Cannot recreate context in headed mode. Use disconnect first.'); + } if (!this.browser || !this.context) { throw new Error('Browser not launched'); } @@ -482,7 +729,7 @@ export class BrowserManager { * If step 2 fails → return error, headless browser untouched */ async handoff(message: string): Promise { - if (this.isHeaded) { + if (this.connectionMode === 'headed' || this.isHeaded) { return `HANDOFF: Already in headed mode at ${this.getCurrentUrl()}`; } if (!this.browser || !this.context) { @@ -493,53 +740,68 @@ export class BrowserManager { const state = await this.saveState(); const currentUrl = this.getCurrentUrl(); - // 2. Launch new headed browser (try-catch — if this fails, headless stays running) - let newBrowser: Browser; + // 2. Launch new headed browser with extension (same as launchHeaded) + // Uses launchPersistentContext so the extension auto-loads. 
+ let newContext: BrowserContext; try { - newBrowser = await chromium.launch({ + const fs = require('fs'); + const path = require('path'); + const extensionPath = this.findExtensionPath(); + const launchArgs = ['--hide-crash-restore-bubble']; + if (extensionPath) { + launchArgs.push(`--disable-extensions-except=${extensionPath}`); + launchArgs.push(`--load-extension=${extensionPath}`); + console.log(`[browse] Handoff: loading extension from ${extensionPath}`); + } else { + console.log('[browse] Handoff: extension not found — headed mode without side panel'); + } + + const userDataDir = path.join(process.env.HOME || '/tmp', '.gstack', 'chromium-profile'); + fs.mkdirSync(userDataDir, { recursive: true }); + + newContext = await chromium.launchPersistentContext(userDataDir, { headless: false, + args: launchArgs, + viewport: null, + ignoreDefaultArgs: [ + '--disable-extensions', + '--disable-component-extensions-with-background-pages', + ], timeout: 15000, - chromiumSandbox: process.platform !== 'win32', }); } catch (err: unknown) { const msg = err instanceof Error ? err.message : String(err); return `ERROR: Cannot open headed browser — ${msg}. Headless browser still running.`; } - // 3. Create context and restore state into new headed browser + // 3. 
Restore state into new headed browser try { - const contextOptions: BrowserContextOptions = { - viewport: { width: 1280, height: 720 }, - }; - if (this.customUserAgent) { - contextOptions.userAgent = this.customUserAgent; - } - const newContext = await newBrowser.newContext(contextOptions); + // Swap to new browser/context before restoreState (it uses this.context) + const oldBrowser = this.browser; + + this.context = newContext; + this.browser = newContext.browser(); + this.pages.clear(); + this.connectionMode = 'headed'; if (Object.keys(this.extraHeaders).length > 0) { await newContext.setExtraHTTPHeaders(this.extraHeaders); } - // Swap to new browser/context before restoreState (it uses this.context) - const oldBrowser = this.browser; - const oldContext = this.context; - - this.browser = newBrowser; - this.context = newContext; - this.pages.clear(); - // Register crash handler on new browser - this.browser.on('disconnected', () => { - console.error('[browse] FATAL: Chromium process crashed or was killed. Server exiting.'); - console.error('[browse] Console/network logs flushed to .gstack/browse-*.log'); - process.exit(1); - }); + if (this.browser) { + this.browser.on('disconnected', () => { + if (this.intentionalDisconnect) return; + console.error('[browse] FATAL: Chromium process crashed or was killed. Server exiting.'); + process.exit(1); + }); + } await this.restoreState(state); this.isHeaded = true; + this.dialogAutoAccept = false; // User controls dialogs in headed mode - // 4. Close old headless browser (fire-and-forget — close() can hang - // when another Playwright instance is active, so we don't await it) + // 4. Close old headless browser (fire-and-forget) oldBrowser.removeAllListeners('disconnected'); oldBrowser.close().catch(() => {}); @@ -549,8 +811,8 @@ export class BrowserManager { `STATUS: Waiting for user. 
Run 'resume' when done.`, ].join('\n'); } catch (err: unknown) { - // Restore failed — close the new browser, keep old one - await newBrowser.close().catch(() => {}); + // Restore failed — close the new context, keep old state + await newContext.close().catch(() => {}); const msg = err instanceof Error ? err.message : String(err); return `ERROR: Handoff failed during state restore — ${msg}. Headless browser still running.`; } @@ -564,6 +826,7 @@ export class BrowserManager { resume(): void { this.clearRefs(); this.resetFailures(); + this.activeFrame = null; } getIsHeaded(): boolean { @@ -593,6 +856,7 @@ export class BrowserManager { page.on('framenavigated', (frame) => { if (frame === page.mainFrame()) { this.clearRefs(); + this.activeFrame = null; // Navigation invalidates frame context } }); diff --git a/browse/src/cli.ts b/browse/src/cli.ts index 25894a5d..28e4a79e 100644 --- a/browse/src/cli.ts +++ b/browse/src/cli.ts @@ -90,6 +90,7 @@ interface ServerState { startedAt: string; serverPath: string; binaryVersion?: string; + mode?: 'launched' | 'headed'; } // ─── State File ──────────────────────────────────────────────── @@ -217,7 +218,7 @@ function cleanupLegacyState(): void { } // ─── Server Lifecycle ────────────────────────────────────────── -async function startServer(): Promise { +async function startServer(extraEnv?: Record): Promise { ensureStateDir(config); // Clean up stale state file and error log @@ -241,7 +242,7 @@ async function startServer(): Promise { // macOS/Linux: Bun.spawn + unref works correctly proc = Bun.spawn(['bun', 'run', SERVER_SCRIPT], { stdio: ['ignore', 'pipe', 'pipe'], - env: { ...process.env, BROWSE_STATE_FILE: config.stateFile }, + env: { ...process.env, BROWSE_STATE_FILE: config.stateFile, ...extraEnv }, }); proc.unref(); } @@ -328,6 +329,15 @@ async function ensureServer(): Promise { return state; } + // Guard: never silently replace a headed server with a headless one. 
+ // Headed mode means a user-visible Chrome window is (or was) controlled. + // Silently replacing it would be confusing — tell the user to reconnect. + if (state && state.mode === 'headed' && isProcessAlive(state.pid)) { + console.error(`[browse] Headed server running (PID ${state.pid}) but not responding.`); + console.error(`[browse] Run '$B connect' to restart.`); + process.exit(1); + } + // Ensure state directory exists before lock acquisition (lock file lives there) ensureStateDir(config); @@ -471,6 +481,144 @@ Refs: After 'snapshot', use @e1, @e2... as selectors: const command = args[0]; const commandArgs = args.slice(1); + // ─── Headed Connect (pre-server command) ──────────────────── + // connect must be handled BEFORE ensureServer() because it needs + // to restart the server in headed mode with the Chrome extension. + if (command === 'connect') { + // Check if already in headed mode and healthy + const existingState = readState(); + if (existingState && existingState.mode === 'headed' && isProcessAlive(existingState.pid)) { + try { + const resp = await fetch(`http://127.0.0.1:${existingState.port}/health`, { + signal: AbortSignal.timeout(2000), + }); + if (resp.ok) { + console.log('Already connected in headed mode.'); + process.exit(0); + } + } catch { + // Headed server alive but not responding — kill and restart + } + } + + // Kill ANY existing server (SIGTERM → wait 2s → SIGKILL) + if (existingState && isProcessAlive(existingState.pid)) { + try { process.kill(existingState.pid, 'SIGTERM'); } catch {} + await new Promise(resolve => setTimeout(resolve, 2000)); + if (isProcessAlive(existingState.pid)) { + try { process.kill(existingState.pid, 'SIGKILL'); } catch {} + await new Promise(resolve => setTimeout(resolve, 1000)); + } + } + + // Clean up Chromium profile locks (can persist after crashes) + const profileDir = path.join(process.env.HOME || '/tmp', '.gstack', 'chromium-profile'); + for (const lockFile of ['SingletonLock', 'SingletonSocket', 
'SingletonCookie']) { + try { fs.unlinkSync(path.join(profileDir, lockFile)); } catch {} + } + + // Delete stale state file + try { fs.unlinkSync(config.stateFile); } catch {} + + console.log('Launching headed Chromium with extension + sidebar agent...'); + try { + // Start server in headed mode with extension auto-loaded + // Use a well-known port so the Chrome extension auto-connects + const serverEnv: Record = { + BROWSE_HEADED: '1', + BROWSE_PORT: '34567', + BROWSE_SIDEBAR_CHAT: '1', + }; + const newState = await startServer(serverEnv); + + // Print connected status + const resp = await fetch(`http://127.0.0.1:${newState.port}/command`, { + method: 'POST', + headers: { + 'Content-Type': 'application/json', + 'Authorization': `Bearer ${newState.token}`, + }, + body: JSON.stringify({ command: 'status', args: [] }), + signal: AbortSignal.timeout(5000), + }); + const status = await resp.text(); + console.log(`Connected to real Chrome\n${status}`); + + // Auto-start sidebar agent + const agentScript = path.resolve(__dirname, 'sidebar-agent.ts'); + try { + // Clear old agent queue + const agentQueue = path.join(process.env.HOME || '/tmp', '.gstack', 'sidebar-agent-queue.jsonl'); + try { fs.writeFileSync(agentQueue, ''); } catch {} + + const agentProc = Bun.spawn(['bun', 'run', agentScript], { + cwd: config.projectDir, + env: { + ...process.env, + BROWSE_BIN: path.resolve(__dirname, '..', 'dist', 'browse'), + BROWSE_STATE_FILE: config.stateFile, + BROWSE_SERVER_PORT: String(newState.port), + }, + stdio: ['ignore', 'ignore', 'ignore'], + }); + agentProc.unref(); + console.log(`[browse] Sidebar agent started (PID: ${agentProc.pid})`); + } catch (err: any) { + console.error(`[browse] Sidebar agent failed to start: ${err.message}`); + console.error(`[browse] Run manually: bun run ${agentScript}`); + } + } catch (err: any) { + console.error(`[browse] Connect failed: ${err.message}`); + process.exit(1); + } + process.exit(0); + } + + // ─── Headed Disconnect (pre-server 
command) ───────────────── + // disconnect must be handled BEFORE ensureServer() because the headed + // guard blocks all commands when the server is unresponsive. + if (command === 'disconnect') { + const existingState = readState(); + if (!existingState || existingState.mode !== 'headed') { + console.log('Not in headed mode — nothing to disconnect.'); + process.exit(0); + } + // Try graceful shutdown via server + try { + const resp = await fetch(`http://127.0.0.1:${existingState.port}/command`, { + method: 'POST', + headers: { + 'Content-Type': 'application/json', + 'Authorization': `Bearer ${existingState.token}`, + }, + body: JSON.stringify({ command: 'disconnect', args: [] }), + signal: AbortSignal.timeout(3000), + }); + if (resp.ok) { + console.log('Disconnected from real browser.'); + process.exit(0); + } + } catch { + // Server not responding — force cleanup + } + // Force kill + cleanup + if (isProcessAlive(existingState.pid)) { + try { process.kill(existingState.pid, 'SIGTERM'); } catch {} + await new Promise(resolve => setTimeout(resolve, 2000)); + if (isProcessAlive(existingState.pid)) { + try { process.kill(existingState.pid, 'SIGKILL'); } catch {} + } + } + // Clean profile locks and state file + const profileDir = path.join(process.env.HOME || '/tmp', '.gstack', 'chromium-profile'); + for (const lockFile of ['SingletonLock', 'SingletonSocket', 'SingletonCookie']) { + try { fs.unlinkSync(path.join(profileDir, lockFile)); } catch {} + } + try { fs.unlinkSync(config.stateFile); } catch {} + console.log('Disconnected (server was unresponsive — force cleaned).'); + process.exit(0); + } + // Special case: chain reads from stdin if (command === 'chain' && commandArgs.length === 0) { const stdin = await Bun.stdin.text(); diff --git a/browse/src/commands.ts b/browse/src/commands.ts index 81c8f61a..15244538 100644 --- a/browse/src/commands.ts +++ b/browse/src/commands.ts @@ -31,6 +31,11 @@ export const META_COMMANDS = new Set([ 'chain', 'diff', 'url', 
'snapshot', 'handoff', 'resume', + 'connect', 'disconnect', 'focus', + 'inbox', + 'watch', + 'state', + 'frame', ]); export const ALL_COMMANDS = new Set([...READ_COMMANDS, ...WRITE_COMMANDS, ...META_COMMANDS]); @@ -98,6 +103,18 @@ export const COMMAND_DESCRIPTIONS: Record' }, + // Frame + 'frame': { category: 'Meta', description: 'Switch to iframe context (or main to return)', usage: 'frame ' }, }; // Load-time validation: descriptions must cover exactly the command sets diff --git a/browse/src/meta-commands.ts b/browse/src/meta-commands.ts index 16ed7f84..4388491a 100644 --- a/browse/src/meta-commands.ts +++ b/browse/src/meta-commands.ts @@ -11,6 +11,8 @@ import * as Diff from 'diff'; import * as fs from 'fs'; import * as path from 'path'; import { TEMP_DIR, isPathWithin } from './platform'; +import { resolveConfig } from './config'; +import type { Frame } from 'playwright'; // Security: Path validation to prevent path traversal attacks const SAFE_DIRECTORIES = [TEMP_DIR, process.cwd()]; @@ -23,6 +25,25 @@ export function validateOutputPath(filePath: string): void { } } +/** Tokenize a pipe segment respecting double-quoted strings. 
*/ +function tokenizePipeSegment(segment: string): string[] { + const tokens: string[] = []; + let current = ''; + let inQuote = false; + for (let i = 0; i < segment.length; i++) { + const ch = segment[i]; + if (ch === '"') { + inQuote = !inQuote; + } else if (ch === ' ' && !inQuote) { + if (current) { tokens.push(current); current = ''; } + } else { + current += ch; + } + } + if (current) tokens.push(current); + return tokens; +} + export async function handleMetaCommand( command: string, args: string[], @@ -61,8 +82,10 @@ export async function handleMetaCommand( case 'status': { const page = bm.getPage(); const tabs = bm.getTabCount(); + const mode = bm.getConnectionMode(); return [ `Status: healthy`, + `Mode: ${mode}`, `URL: ${page.url()}`, `Tabs: ${tabs}`, `PID: ${process.pid}`, @@ -185,35 +208,54 @@ export async function handleMetaCommand( case 'chain': { // Read JSON array from args[0] (if provided) or expect it was passed as body const jsonStr = args[0]; - if (!jsonStr) throw new Error('Usage: echo \'[["goto","url"],["text"]]\' | browse chain'); + if (!jsonStr) throw new Error( + 'Usage: echo \'[["goto","url"],["text"]]\' | browse chain\n' + + ' or: browse chain \'goto url | click @e5 | snapshot -ic\'' + ); let commands: string[][]; try { commands = JSON.parse(jsonStr); + if (!Array.isArray(commands)) throw new Error('not array'); } catch { - throw new Error('Invalid JSON. 
Expected: [["command", "arg1", "arg2"], ...]'); + // Fallback: pipe-delimited format "goto url | click @e5 | snapshot -ic" + commands = jsonStr.split(' | ') + .filter(seg => seg.trim().length > 0) + .map(seg => tokenizePipeSegment(seg.trim())); } - if (!Array.isArray(commands)) throw new Error('Expected JSON array of commands'); - const results: string[] = []; const { handleReadCommand } = await import('./read-commands'); const { handleWriteCommand } = await import('./write-commands'); + let lastWasWrite = false; for (const cmd of commands) { const [name, ...cmdArgs] = cmd; try { let result: string; - if (WRITE_COMMANDS.has(name)) result = await handleWriteCommand(name, cmdArgs, bm); - else if (READ_COMMANDS.has(name)) result = await handleReadCommand(name, cmdArgs, bm); - else if (META_COMMANDS.has(name)) result = await handleMetaCommand(name, cmdArgs, bm, shutdown); - else throw new Error(`Unknown command: ${name}`); + if (WRITE_COMMANDS.has(name)) { + result = await handleWriteCommand(name, cmdArgs, bm); + lastWasWrite = true; + } else if (READ_COMMANDS.has(name)) { + result = await handleReadCommand(name, cmdArgs, bm); + lastWasWrite = false; + } else if (META_COMMANDS.has(name)) { + result = await handleMetaCommand(name, cmdArgs, bm, shutdown); + lastWasWrite = false; + } else { + throw new Error(`Unknown command: ${name}`); + } results.push(`[${name}] ${result}`); } catch (err: any) { results.push(`[${name}] ERROR: ${err.message}`); } } + // Wait for network to settle after write commands before returning + if (lastWasWrite) { + await bm.getPage().waitForLoadState('networkidle', { timeout: 2000 }).catch(() => {}); + } + return results.join('\n\n'); } @@ -263,6 +305,232 @@ export async function handleMetaCommand( return `RESUMED\n${snapshot}`; } + // ─── Headed Mode ────────────────────────────────────── + case 'connect': { + // connect is handled as a pre-server command in cli.ts + // If we get here, server is already running — tell the user + if 
(bm.getConnectionMode() === 'headed') { + return 'Already in headed mode with extension.'; + } + return 'The connect command must be run from the CLI (not sent to a running server). Run: $B connect'; + } + + case 'disconnect': { + if (bm.getConnectionMode() !== 'headed') { + return 'Not in headed mode — nothing to disconnect.'; + } + // Signal that we want a restart in headless mode + console.log('[browse] Disconnecting headed browser. Restarting in headless mode.'); + await shutdown(); + return 'Disconnected. Server will restart in headless mode on next command.'; + } + + case 'focus': { + if (bm.getConnectionMode() !== 'headed') { + return 'focus requires headed mode. Run `$B connect` first.'; + } + try { + const { execSync } = await import('child_process'); + // Try common Chromium-based browser app names to bring to foreground + const appNames = ['Comet', 'Google Chrome', 'Arc', 'Brave Browser', 'Microsoft Edge']; + let activated = false; + for (const appName of appNames) { + try { + execSync(`osascript -e 'tell application "${appName}" to activate'`, { stdio: 'pipe', timeout: 3000 }); + activated = true; + break; + } catch { + // Try next browser + } + } + + if (!activated) { + return 'Could not bring browser to foreground. macOS only.'; + } + + // If a ref was passed, scroll it into view + if (args.length > 0 && args[0].startsWith('@')) { + try { + const resolved = await bm.resolveRef(args[0]); + if ('locator' in resolved) { + await resolved.locator.scrollIntoViewIfNeeded({ timeout: 5000 }); + return `Browser activated. Scrolled ${args[0]} into view.`; + } + } catch { + // Ref not found — still activated the browser + } + } + + return 'Browser window activated.'; + } catch (err: any) { + return `focus failed: ${err.message}. 
macOS only.`; + } + } + + // ─── Watch ────────────────────────────────────────── + case 'watch': { + if (args[0] === 'stop') { + if (!bm.isWatching()) return 'Not currently watching.'; + const result = bm.stopWatch(); + const durationSec = Math.round(result.duration / 1000); + return [ + `WATCH STOPPED (${durationSec}s, ${result.snapshots.length} snapshots)`, + '', + 'Last snapshot:', + result.snapshots.length > 0 ? result.snapshots[result.snapshots.length - 1] : '(none)', + ].join('\n'); + } + + if (bm.isWatching()) return 'Already watching. Run `$B watch stop` to stop.'; + if (bm.getConnectionMode() !== 'headed') { + return 'watch requires headed mode. Run `$B connect` first.'; + } + + bm.startWatch(); + return 'WATCHING — observing user browsing. Periodic snapshots every 5s.\nRun `$B watch stop` to stop and get summary.'; + } + + // ─── Inbox ────────────────────────────────────────── + case 'inbox': { + const { execSync } = await import('child_process'); + let gitRoot: string; + try { + gitRoot = execSync('git rev-parse --show-toplevel', { encoding: 'utf-8', stdio: ['pipe', 'pipe', 'pipe'] }).trim(); + } catch { + return 'Not in a git repository — cannot locate inbox.'; + } + + const inboxDir = path.join(gitRoot, '.context', 'sidebar-inbox'); + if (!fs.existsSync(inboxDir)) return 'Inbox empty.'; + + const files = fs.readdirSync(inboxDir) + .filter(f => f.endsWith('.json') && !f.startsWith('.')) + .sort() + .reverse(); // newest first + + if (files.length === 0) return 'Inbox empty.'; + + const messages: { timestamp: string; url: string; userMessage: string }[] = []; + for (const file of files) { + try { + const data = JSON.parse(fs.readFileSync(path.join(inboxDir, file), 'utf-8')); + messages.push({ + timestamp: data.timestamp || '', + url: data.page?.url || 'unknown', + userMessage: data.userMessage || '', + }); + } catch { + // Skip malformed files + } + } + + if (messages.length === 0) return 'Inbox empty.'; + + const lines: string[] = []; + 
lines.push(`SIDEBAR INBOX (${messages.length} message${messages.length === 1 ? '' : 's'})`); + lines.push('────────────────────────────────'); + + for (const msg of messages) { + const ts = msg.timestamp ? `[${msg.timestamp}]` : '[unknown]'; + lines.push(`${ts} ${msg.url}`); + lines.push(` "${msg.userMessage}"`); + lines.push(''); + } + + lines.push('────────────────────────────────'); + + // Handle --clear flag + if (args.includes('--clear')) { + for (const file of files) { + try { fs.unlinkSync(path.join(inboxDir, file)); } catch {} + } + lines.push(`Cleared ${files.length} message${files.length === 1 ? '' : 's'}.`); + } + + return lines.join('\n'); + } + + // ─── State ──────────────────────────────────────── + case 'state': { + const [action, name] = args; + if (!action || !name) throw new Error('Usage: state save|load '); + + // Sanitize name: alphanumeric + hyphens + underscores only + if (!/^[a-zA-Z0-9_-]+$/.test(name)) { + throw new Error('State name must be alphanumeric (a-z, 0-9, _, -)'); + } + + const config = resolveConfig(); + const stateDir = path.join(config.stateDir, 'browse-states'); + fs.mkdirSync(stateDir, { recursive: true }); + const statePath = path.join(stateDir, `${name}.json`); + + if (action === 'save') { + const state = await bm.saveState(); + // V1: cookies + URLs only (not localStorage — breaks on load-before-navigate) + const saveData = { + version: 1, + cookies: state.cookies, + pages: state.pages.map(p => ({ url: p.url, isActive: p.isActive })), + }; + fs.writeFileSync(statePath, JSON.stringify(saveData, null, 2), { mode: 0o600 }); + return `State saved: ${statePath} (${state.cookies.length} cookies, ${state.pages.length} pages — treat as sensitive)`; + } + + if (action === 'load') { + if (!fs.existsSync(statePath)) throw new Error(`State not found: ${statePath}`); + const data = JSON.parse(fs.readFileSync(statePath, 'utf-8')); + if (!Array.isArray(data.cookies) || !Array.isArray(data.pages)) { + throw new Error('Invalid state file: 
expected cookies and pages arrays'); + } + // Close existing pages, then restore (replace, not merge) + bm.setFrame(null); + await bm.closeAllPages(); + await bm.restoreState({ + cookies: data.cookies, + pages: data.pages.map((p: any) => ({ ...p, storage: null })), + }); + return `State loaded: ${data.cookies.length} cookies, ${data.pages.length} pages`; + } + + throw new Error('Usage: state save|load '); + } + + // ─── Frame ─────────────────────────────────────── + case 'frame': { + const target = args[0]; + if (!target) throw new Error('Usage: frame '); + + if (target === 'main') { + bm.setFrame(null); + bm.clearRefs(); + return 'Switched to main frame'; + } + + const page = bm.getPage(); + let frame: Frame | null = null; + + if (target === '--name') { + if (!args[1]) throw new Error('Usage: frame --name '); + frame = page.frame({ name: args[1] }); + } else if (target === '--url') { + if (!args[1]) throw new Error('Usage: frame --url '); + frame = page.frame({ url: new RegExp(args[1]) }); + } else { + // CSS selector or @ref for the iframe element + const resolved = await bm.resolveRef(target); + const locator = 'locator' in resolved ? resolved.locator : page.locator(resolved.selector); + const elementHandle = await locator.elementHandle({ timeout: 5000 }); + frame = await elementHandle?.contentFrame() ?? 
null; + await elementHandle?.dispose(); + } + + if (!frame) throw new Error(`Frame not found: ${target}`); + bm.setFrame(frame); + bm.clearRefs(); + return `Switched to frame: ${frame.url()}`; + } + default: throw new Error(`Unknown meta command: ${command}`); } diff --git a/browse/src/read-commands.ts b/browse/src/read-commands.ts index 5d93156c..802c3813 100644 --- a/browse/src/read-commands.ts +++ b/browse/src/read-commands.ts @@ -7,7 +7,7 @@ import type { BrowserManager } from './browser-manager'; import { consoleBuffer, networkBuffer, dialogBuffer } from './buffers'; -import type { Page } from 'playwright'; +import type { Page, Frame } from 'playwright'; import * as fs from 'fs'; import * as path from 'path'; import { TEMP_DIR, isPathWithin } from './platform'; @@ -57,7 +57,7 @@ export function validateReadPath(filePath: string): void { * Extract clean text from a page (strips script/style/noscript/svg). * Exported for DRY reuse in meta-commands (diff). */ -export async function getCleanText(page: Page): Promise { +export async function getCleanText(page: Page | Frame): Promise { return await page.evaluate(() => { const body = document.body; if (!body) return ''; @@ -77,10 +77,12 @@ export async function handleReadCommand( bm: BrowserManager ): Promise { const page = bm.getPage(); + // Frame-aware target for content extraction + const target = bm.getActiveFrameOrPage(); switch (command) { case 'text': { - return await getCleanText(page); + return await getCleanText(target); } case 'html': { @@ -90,13 +92,19 @@ export async function handleReadCommand( if ('locator' in resolved) { return await resolved.locator.innerHTML({ timeout: 5000 }); } - return await page.innerHTML(resolved.selector); + return await target.locator(resolved.selector).innerHTML({ timeout: 5000 }); } - return await page.content(); + // page.content() is page-only; use evaluate for frame compat + const doctype = await target.evaluate(() => { + const dt = document.doctype; + return dt ? 
`` : ''; + }); + const html = await target.evaluate(() => document.documentElement.outerHTML); + return doctype ? `${doctype}\n${html}` : html; } case 'links': { - const links = await page.evaluate(() => + const links = await target.evaluate(() => [...document.querySelectorAll('a[href]')].map(a => ({ text: a.textContent?.trim().slice(0, 120) || '', href: (a as HTMLAnchorElement).href, @@ -106,7 +114,7 @@ export async function handleReadCommand( } case 'forms': { - const forms = await page.evaluate(() => { + const forms = await target.evaluate(() => { return [...document.querySelectorAll('form')].map((form, i) => { const fields = [...form.querySelectorAll('input, select, textarea')].map(el => { const input = el as HTMLInputElement; @@ -136,7 +144,7 @@ export async function handleReadCommand( } case 'accessibility': { - const snapshot = await page.locator("body").ariaSnapshot(); + const snapshot = await target.locator("body").ariaSnapshot(); return snapshot; } @@ -144,7 +152,7 @@ export async function handleReadCommand( const expr = args[0]; if (!expr) throw new Error('Usage: browse js '); const wrapped = wrapForEvaluate(expr); - const result = await page.evaluate(wrapped); + const result = await target.evaluate(wrapped); return typeof result === 'object' ? JSON.stringify(result, null, 2) : String(result ?? ''); } @@ -155,7 +163,7 @@ export async function handleReadCommand( if (!fs.existsSync(filePath)) throw new Error(`File not found: ${filePath}`); const code = fs.readFileSync(filePath, 'utf-8'); const wrapped = wrapForEvaluate(code); - const result = await page.evaluate(wrapped); + const result = await target.evaluate(wrapped); return typeof result === 'object' ? JSON.stringify(result, null, 2) : String(result ?? 
''); } @@ -170,7 +178,7 @@ export async function handleReadCommand( ); return value; } - const value = await page.evaluate( + const value = await target.evaluate( ([sel, prop]) => { const el = document.querySelector(sel); if (!el) return `Element not found: ${sel}`; @@ -195,7 +203,7 @@ export async function handleReadCommand( }); return JSON.stringify(attrs, null, 2); } - const attrs = await page.evaluate((sel) => { + const attrs = await target.evaluate((sel: string) => { const el = document.querySelector(sel); if (!el) return `Element not found: ${sel}`; const result: Record = {}; @@ -253,7 +261,7 @@ export async function handleReadCommand( if ('locator' in resolved) { locator = resolved.locator; } else { - locator = page.locator(resolved.selector); + locator = target.locator(resolved.selector); } switch (property) { @@ -283,10 +291,10 @@ export async function handleReadCommand( if (args[0] === 'set' && args[1]) { const key = args[1]; const value = args[2] || ''; - await page.evaluate(([k, v]) => localStorage.setItem(k, v), [key, value]); + await target.evaluate(([k, v]: string[]) => localStorage.setItem(k, v), [key, value]); return `Set localStorage["${key}"]`; } - const storage = await page.evaluate(() => ({ + const storage = await target.evaluate(() => ({ localStorage: { ...localStorage }, sessionStorage: { ...sessionStorage }, })); diff --git a/browse/src/server.ts b/browse/src/server.ts index fe2c27cb..fe288e9e 100644 --- a/browse/src/server.ts +++ b/browse/src/server.ts @@ -19,8 +19,11 @@ import { handleWriteCommand } from './write-commands'; import { handleMetaCommand } from './meta-commands'; import { handleCookiePickerRoute } from './cookie-picker-routes'; import { COMMAND_DESCRIPTIONS } from './commands'; -import { SNAPSHOT_FLAGS } from './snapshot'; +import { handleSnapshot, SNAPSHOT_FLAGS } from './snapshot'; import { resolveConfig, ensureStateDir, readVersionHash } from './config'; +import { emitActivity, subscribe, getActivityAfter, 
getActivityHistory, getSubscriberCount } from './activity'; +// Bun.spawn used instead of child_process.spawn (compiled bun binaries +// fail posix_spawn on all executables including /bin/bash) import * as fs from 'fs'; import * as path from 'path'; import * as crypto from 'crypto'; @@ -33,6 +36,7 @@ ensureStateDir(config); const AUTH_TOKEN = crypto.randomUUID(); const BROWSE_PORT = parseInt(process.env.BROWSE_PORT || '0', 10); const IDLE_TIMEOUT_MS = parseInt(process.env.BROWSE_IDLE_TIMEOUT || '1800000', 10); // 30 min +// Sidebar chat is always enabled in headed mode (ungated in v0.12.0) function validateAuth(req: Request): boolean { const header = req.headers.get('authorization'); @@ -87,6 +91,377 @@ export { consoleBuffer, networkBuffer, dialogBuffer, addConsoleEntry, addNetwork const CONSOLE_LOG_PATH = config.consoleLog; const NETWORK_LOG_PATH = config.networkLog; const DIALOG_LOG_PATH = config.dialogLog; + +// ─── Sidebar Agent (integrated — no separate process) ───────────── + +interface ChatEntry { + id: number; + ts: string; + role: 'user' | 'assistant' | 'agent'; + message?: string; + type?: string; + tool?: string; + input?: string; + text?: string; + error?: string; +} + +interface SidebarSession { + id: string; + name: string; + claudeSessionId: string | null; + worktreePath: string | null; + createdAt: string; + lastActiveAt: string; +} + +const SESSIONS_DIR = path.join(process.env.HOME || '/tmp', '.gstack', 'sidebar-sessions'); +const AGENT_TIMEOUT_MS = 300_000; // 5 minutes — multi-page tasks need time +const MAX_QUEUE = 5; + +let sidebarSession: SidebarSession | null = null; +let agentProcess: ChildProcess | null = null; +let agentStatus: 'idle' | 'processing' | 'hung' = 'idle'; +let agentStartTime: number | null = null; +let messageQueue: Array<{message: string, ts: string}> = []; +let currentMessage: string | null = null; +let chatBuffer: ChatEntry[] = []; +let chatNextId = 0; + +// Find the browse binary for the claude subprocess system prompt 
+function findBrowseBin(): string { + const candidates = [ + path.resolve(__dirname, '..', 'dist', 'browse'), + path.resolve(__dirname, '..', '..', '.claude', 'skills', 'gstack', 'browse', 'dist', 'browse'), + path.join(process.env.HOME || '', '.claude', 'skills', 'gstack', 'browse', 'dist', 'browse'), + ]; + for (const c of candidates) { + try { if (fs.existsSync(c)) return c; } catch {} + } + return 'browse'; // fallback to PATH +} + +const BROWSE_BIN = findBrowseBin(); + +function findClaudeBin(): string | null { + const home = process.env.HOME || ''; + const candidates = [ + // Conductor app bundled binary (not a symlink — works reliably) + path.join(home, 'Library', 'Application Support', 'com.conductor.app', 'bin', 'claude'), + // Direct versioned binary (not a symlink) + ...(() => { + try { + const versionsDir = path.join(home, '.local', 'share', 'claude', 'versions'); + const entries = fs.readdirSync(versionsDir).filter(e => /^\d/.test(e)).sort().reverse(); + return entries.map(e => path.join(versionsDir, e)); + } catch { return []; } + })(), + // Standard install (symlink — resolve it) + path.join(home, '.local', 'bin', 'claude'), + '/usr/local/bin/claude', + '/opt/homebrew/bin/claude', + ]; + // Also check if 'claude' is in current PATH + try { + const proc = Bun.spawnSync(['which', 'claude'], { stdout: 'pipe', stderr: 'pipe', timeout: 2000 }); + if (proc.exitCode === 0) { + const p = proc.stdout.toString().trim(); + if (p) candidates.unshift(p); + } + } catch {} + for (const c of candidates) { + try { + if (!fs.existsSync(c)) continue; + // Resolve symlinks — posix_spawn can fail on symlinks in compiled bun binaries + return fs.realpathSync(c); + } catch {} + } + return null; +} + +function shortenPath(str: string): string { + return str + .replace(new RegExp(BROWSE_BIN.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'), 'g'), '$B') + .replace(/\/Users\/[^/]+/g, '~') + .replace(/\/conductor\/workspaces\/[^/]+\/[^/]+/g, '') + .replace(/\.claude\/skills\/gstack\//g, 
'') + .replace(/browse\/dist\/browse/g, '$B'); +} + +function summarizeToolInput(tool: string, input: any): string { + if (!input) return ''; + if (tool === 'Bash' && input.command) { + let cmd = shortenPath(input.command); + return cmd.length > 80 ? cmd.slice(0, 80) + '…' : cmd; + } + if (tool === 'Read' && input.file_path) return shortenPath(input.file_path); + if (tool === 'Edit' && input.file_path) return shortenPath(input.file_path); + if (tool === 'Write' && input.file_path) return shortenPath(input.file_path); + if (tool === 'Grep' && input.pattern) return `/${input.pattern}/`; + if (tool === 'Glob' && input.pattern) return input.pattern; + try { return shortenPath(JSON.stringify(input)).slice(0, 60); } catch { return ''; } +} + +function addChatEntry(entry: Omit): ChatEntry { + const full: ChatEntry = { ...entry, id: chatNextId++ }; + chatBuffer.push(full); + // Persist to disk (best-effort) + if (sidebarSession) { + const chatFile = path.join(SESSIONS_DIR, sidebarSession.id, 'chat.jsonl'); + try { fs.appendFileSync(chatFile, JSON.stringify(full) + '\n'); } catch {} + } + return full; +} + +function loadSession(): SidebarSession | null { + try { + const activeFile = path.join(SESSIONS_DIR, 'active.json'); + const activeData = JSON.parse(fs.readFileSync(activeFile, 'utf-8')); + const sessionFile = path.join(SESSIONS_DIR, activeData.id, 'session.json'); + const session = JSON.parse(fs.readFileSync(sessionFile, 'utf-8')) as SidebarSession; + // Load chat history + const chatFile = path.join(SESSIONS_DIR, session.id, 'chat.jsonl'); + try { + const lines = fs.readFileSync(chatFile, 'utf-8').split('\n').filter(Boolean); + chatBuffer = lines.map(line => { try { return JSON.parse(line); } catch { return null; } }).filter(Boolean); + chatNextId = chatBuffer.length > 0 ? Math.max(...chatBuffer.map(e => e.id)) + 1 : 0; + } catch {} + return session; + } catch { + return null; + } +} + +/** + * Create a git worktree for session isolation. 
+ * Falls back to null (use main cwd) if: + * - not in a git repo + * - git worktree add fails (submodules, LFS, permissions) + * - worktree dir already exists (collision from prior crash) + */ +function createWorktree(sessionId: string): string | null { + try { + // Check if we're in a git repo + const gitCheck = Bun.spawnSync(['git', 'rev-parse', '--show-toplevel'], { + stdout: 'pipe', stderr: 'pipe', timeout: 3000, + }); + if (gitCheck.exitCode !== 0) return null; + const repoRoot = gitCheck.stdout.toString().trim(); + + const worktreeDir = path.join(process.env.HOME || '/tmp', '.gstack', 'worktrees', sessionId.slice(0, 8)); + + // Clean up if dir exists from prior crash + if (fs.existsSync(worktreeDir)) { + Bun.spawnSync(['git', 'worktree', 'remove', '--force', worktreeDir], { + cwd: repoRoot, stdout: 'pipe', stderr: 'pipe', timeout: 5000, + }); + try { fs.rmSync(worktreeDir, { recursive: true, force: true }); } catch {} + } + + // Get current branch/commit + const headCheck = Bun.spawnSync(['git', 'rev-parse', 'HEAD'], { + cwd: repoRoot, stdout: 'pipe', stderr: 'pipe', timeout: 3000, + }); + if (headCheck.exitCode !== 0) return null; + const head = headCheck.stdout.toString().trim(); + + // Create worktree (detached HEAD — no branch conflicts) + const result = Bun.spawnSync(['git', 'worktree', 'add', '--detach', worktreeDir, head], { + cwd: repoRoot, stdout: 'pipe', stderr: 'pipe', timeout: 10000, + }); + + if (result.exitCode !== 0) { + console.log(`[browse] Worktree creation failed: ${result.stderr.toString().trim()}`); + return null; + } + + console.log(`[browse] Created worktree: ${worktreeDir}`); + return worktreeDir; + } catch (err: any) { + console.log(`[browse] Worktree creation error: ${err.message}`); + return null; + } +} + +function removeWorktree(worktreePath: string | null): void { + if (!worktreePath) return; + try { + const gitCheck = Bun.spawnSync(['git', 'rev-parse', '--show-toplevel'], { + stdout: 'pipe', stderr: 'pipe', timeout: 3000, + }); 
+ if (gitCheck.exitCode === 0) { + Bun.spawnSync(['git', 'worktree', 'remove', '--force', worktreePath], { + cwd: gitCheck.stdout.toString().trim(), stdout: 'pipe', stderr: 'pipe', timeout: 5000, + }); + } + // Cleanup dir if git worktree remove didn't + try { fs.rmSync(worktreePath, { recursive: true, force: true }); } catch {} + } catch {} +} + +function createSession(): SidebarSession { + const id = crypto.randomUUID(); + const worktreePath = createWorktree(id); + const session: SidebarSession = { + id, + name: 'Chrome sidebar', + claudeSessionId: null, + worktreePath, + createdAt: new Date().toISOString(), + lastActiveAt: new Date().toISOString(), + }; + const sessionDir = path.join(SESSIONS_DIR, id); + fs.mkdirSync(sessionDir, { recursive: true }); + fs.writeFileSync(path.join(sessionDir, 'session.json'), JSON.stringify(session, null, 2)); + fs.writeFileSync(path.join(sessionDir, 'chat.jsonl'), ''); + fs.writeFileSync(path.join(SESSIONS_DIR, 'active.json'), JSON.stringify({ id })); + chatBuffer = []; + chatNextId = 0; + return session; +} + +function saveSession(): void { + if (!sidebarSession) return; + sidebarSession.lastActiveAt = new Date().toISOString(); + const sessionFile = path.join(SESSIONS_DIR, sidebarSession.id, 'session.json'); + try { fs.writeFileSync(sessionFile, JSON.stringify(sidebarSession, null, 2)); } catch {} +} + +function listSessions(): Array { + try { + const dirs = fs.readdirSync(SESSIONS_DIR).filter(d => d !== 'active.json'); + return dirs.map(d => { + try { + const session = JSON.parse(fs.readFileSync(path.join(SESSIONS_DIR, d, 'session.json'), 'utf-8')); + let chatLines = 0; + try { chatLines = fs.readFileSync(path.join(SESSIONS_DIR, d, 'chat.jsonl'), 'utf-8').split('\n').filter(Boolean).length; } catch {} + return { ...session, chatLines }; + } catch { return null; } + }).filter(Boolean); + } catch { return []; } +} + +function processAgentEvent(event: any): void { + if (event.type === 'system' && event.session_id && sidebarSession 
&& !sidebarSession.claudeSessionId) { + // Capture session_id from first claude init event for --resume + sidebarSession.claudeSessionId = event.session_id; + saveSession(); + } + + if (event.type === 'assistant' && event.message?.content) { + for (const block of event.message.content) { + if (block.type === 'tool_use') { + addChatEntry({ ts: new Date().toISOString(), role: 'agent', type: 'tool_use', tool: block.name, input: summarizeToolInput(block.name, block.input) }); + } else if (block.type === 'text' && block.text) { + addChatEntry({ ts: new Date().toISOString(), role: 'agent', type: 'text', text: block.text }); + } + } + } + + if (event.type === 'content_block_start' && event.content_block?.type === 'tool_use') { + addChatEntry({ ts: new Date().toISOString(), role: 'agent', type: 'tool_use', tool: event.content_block.name, input: summarizeToolInput(event.content_block.name, event.content_block.input) }); + } + + if (event.type === 'content_block_delta' && event.delta?.type === 'text_delta' && event.delta.text) { + addChatEntry({ ts: new Date().toISOString(), role: 'agent', type: 'text_delta', text: event.delta.text }); + } + + if (event.type === 'result') { + addChatEntry({ ts: new Date().toISOString(), role: 'agent', type: 'result', text: event.text || event.result || '' }); + } +} + +function spawnClaude(userMessage: string): void { + agentStatus = 'processing'; + agentStartTime = Date.now(); + currentMessage = userMessage; + + const pageUrl = browserManager.getCurrentUrl() || 'about:blank'; + const B = BROWSE_BIN; + const systemPrompt = [ + 'You are a browser assistant running in a Chrome sidebar.', + `Current page: ${pageUrl}`, + `Browse binary: ${B}`, + '', + 'Commands (run via bash):', + ` ${B} goto ${B} click <@ref> ${B} fill <@ref> `, + ` ${B} snapshot -i ${B} text ${B} screenshot`, + ` ${B} back ${B} forward ${B} reload`, + '', + 'Rules: run snapshot -i before clicking. 
Keep responses SHORT.', + ].join('\n'); + + const prompt = `${systemPrompt}\n\nUser: ${userMessage}`; + const args = ['-p', prompt, '--output-format', 'stream-json', '--verbose', + '--allowedTools', 'Bash,Read,Glob,Grep']; + if (sidebarSession?.claudeSessionId) { + args.push('--resume', sidebarSession.claudeSessionId); + } + + addChatEntry({ ts: new Date().toISOString(), role: 'agent', type: 'agent_start' }); + + // Compiled bun binaries CANNOT spawn external processes (posix_spawn + // fails with ENOENT on everything, including /bin/bash). Instead, + // write the command to a queue file that the sidebar-agent process + // (running as non-compiled bun) picks up and spawns claude. + const gstackDir = path.join(process.env.HOME || '/tmp', '.gstack'); + const agentQueue = path.join(gstackDir, 'sidebar-agent-queue.jsonl'); + const entry = JSON.stringify({ + ts: new Date().toISOString(), + message: userMessage, + prompt, + args, + stateFile: config.stateFile, + cwd: (sidebarSession as any)?.worktreePath || process.cwd(), + sessionId: sidebarSession?.claudeSessionId || null, + }); + try { + fs.mkdirSync(gstackDir, { recursive: true }); + fs.appendFileSync(agentQueue, entry + '\n'); + } catch (err: any) { + addChatEntry({ ts: new Date().toISOString(), role: 'agent', type: 'agent_error', error: `Failed to queue: ${err.message}` }); + agentStatus = 'idle'; + agentStartTime = null; + currentMessage = null; + return; + } + // The sidebar-agent.ts process polls this file and spawns claude. + // It POST events back via /sidebar-event which processAgentEvent handles. + // Agent status transitions happen when we receive agent_done/agent_error events. 
+} + +function killAgent(): void { + if (agentProcess) { + try { agentProcess.kill('SIGTERM'); } catch {} + setTimeout(() => { try { agentProcess?.kill('SIGKILL'); } catch {} }, 3000); + } + agentProcess = null; + agentStartTime = null; + currentMessage = null; + agentStatus = 'idle'; +} + +// Agent health check — detect hung processes +let agentHealthInterval: ReturnType | null = null; +function startAgentHealthCheck(): void { + agentHealthInterval = setInterval(() => { + if (agentStatus === 'processing' && agentStartTime && Date.now() - agentStartTime > AGENT_TIMEOUT_MS) { + agentStatus = 'hung'; + console.log(`[browse] Sidebar agent hung (>${AGENT_TIMEOUT_MS / 1000}s)`); + } + }, 10000); +} + +// Initialize session on startup +function initSidebarSession(): void { + fs.mkdirSync(SESSIONS_DIR, { recursive: true }); + sidebarSession = loadSession(); + if (!sidebarSession) { + sidebarSession = createSession(); + } + console.log(`[browse] Sidebar session: ${sidebarSession.id} (${chatBuffer.length} chat entries loaded)`); + startAgentHealthCheck(); +} let lastConsoleFlushed = 0; let lastNetworkFlushed = 0; let lastDialogFlushed = 0; @@ -224,6 +599,27 @@ async function handleCommand(body: any): Promise { }); } + // Block mutation commands while watching (read-only observation mode) + if (browserManager.isWatching() && WRITE_COMMANDS.has(command)) { + return new Response(JSON.stringify({ + error: 'Cannot run mutation commands while watching. 
Run `$B watch stop` first.', + }), { + status: 400, + headers: { 'Content-Type': 'application/json' }, + }); + } + + // Activity: emit command_start + const startTime = Date.now(); + emitActivity({ + type: 'command_start', + command, + args, + url: browserManager.getCurrentUrl(), + tabs: browserManager.getTabCount(), + mode: browserManager.getConnectionMode(), + }); + try { let result: string; @@ -233,6 +629,22 @@ async function handleCommand(body: any): Promise { result = await handleWriteCommand(command, args, browserManager); } else if (META_COMMANDS.has(command)) { result = await handleMetaCommand(command, args, browserManager, shutdown); + // Start periodic snapshot interval when watch mode begins + if (command === 'watch' && args[0] !== 'stop' && browserManager.isWatching()) { + const watchInterval = setInterval(async () => { + if (!browserManager.isWatching()) { + clearInterval(watchInterval); + return; + } + try { + const snapshot = await handleSnapshot(['-i'], browserManager); + browserManager.addWatchSnapshot(snapshot); + } catch { + // Page may be navigating — skip this snapshot + } + }, 5000); + browserManager.watchInterval = watchInterval; + } } else if (command === 'help') { const helpText = generateHelpText(); return new Response(helpText, { @@ -249,12 +661,38 @@ async function handleCommand(body: any): Promise { }); } + // Activity: emit command_end (success) + emitActivity({ + type: 'command_end', + command, + args, + url: browserManager.getCurrentUrl(), + duration: Date.now() - startTime, + status: 'ok', + result: result, + tabs: browserManager.getTabCount(), + mode: browserManager.getConnectionMode(), + }); + browserManager.resetFailures(); return new Response(result, { status: 200, headers: { 'Content-Type': 'text/plain' }, }); } catch (err: any) { + // Activity: emit command_end (error) + emitActivity({ + type: 'command_end', + command, + args, + url: browserManager.getCurrentUrl(), + duration: Date.now() - startTime, + status: 'error', + 
error: err.message, + tabs: browserManager.getTabCount(), + mode: browserManager.getConnectionMode(), + }); + browserManager.incrementFailures(); let errorMsg = wrapError(err); const hint = browserManager.getFailureHint(); @@ -271,12 +709,25 @@ async function shutdown() { isShuttingDown = true; console.log('[browse] Shutting down...'); + // Stop watch mode if active + if (browserManager.isWatching()) browserManager.stopWatch(); + killAgent(); + messageQueue = []; + saveSession(); // Persist chat history before exit + if (sidebarSession?.worktreePath) removeWorktree(sidebarSession.worktreePath); + if (agentHealthInterval) clearInterval(agentHealthInterval); clearInterval(flushInterval); clearInterval(idleCheckInterval); await flushBuffers(); // Final flush (async now) await browserManager.close(); + // Clean up Chromium profile locks (prevent SingletonLock on next launch) + const profileDir = path.join(process.env.HOME || '/tmp', '.gstack', 'chromium-profile'); + for (const lockFile of ['SingletonLock', 'SingletonSocket', 'SingletonCookie']) { + try { fs.unlinkSync(path.join(profileDir, lockFile)); } catch {} + } + // Clean up state file try { fs.unlinkSync(config.stateFile); } catch {} @@ -294,6 +745,32 @@ if (process.platform === 'win32') { }); } +// Emergency cleanup for crashes (OOM, uncaught exceptions, browser disconnect) +function emergencyCleanup() { + if (isShuttingDown) return; + isShuttingDown = true; + // Kill agent subprocess if running + try { killAgent(); } catch {} + // Save session state so chat history persists across crashes + try { saveSession(); } catch {} + // Clean Chromium profile locks + const profileDir = path.join(process.env.HOME || '/tmp', '.gstack', 'chromium-profile'); + for (const lockFile of ['SingletonLock', 'SingletonSocket', 'SingletonCookie']) { + try { fs.unlinkSync(path.join(profileDir, lockFile)); } catch {} + } + try { fs.unlinkSync(config.stateFile); } catch {} +} +process.on('uncaughtException', (err) => { + 
console.error('[browse] FATAL uncaught exception:', err.message); + emergencyCleanup(); + process.exit(1); +}); +process.on('unhandledRejection', (err: any) => { + console.error('[browse] FATAL unhandled rejection:', err?.message || err); + emergencyCleanup(); + process.exit(1); +}); + // ─── Start ───────────────────────────────────────────────────── async function start() { // Clear old log files @@ -303,16 +780,20 @@ async function start() { const port = await findPort(); - // Launch browser - await browserManager.launch(); + // Launch browser (headless or headed with extension) + const headed = process.env.BROWSE_HEADED === '1'; + if (headed) { + await browserManager.launchHeaded(); + console.log(`[browse] Launched headed Chromium with extension`); + } else { + await browserManager.launch(); + } const startTime = Date.now(); const server = Bun.serve({ port, hostname: '127.0.0.1', fetch: async (req) => { - resetIdleTimer(); - const url = new URL(req.url); // Cookie picker routes — no auth required (localhost-only) @@ -320,21 +801,285 @@ async function start() { return handleCookiePickerRoute(url, req, browserManager); } - // Health check — no auth required (now async) + // Health check — no auth required, does NOT reset idle timer if (url.pathname === '/health') { const healthy = await browserManager.isHealthy(); return new Response(JSON.stringify({ status: healthy ? 'healthy' : 'unhealthy', + mode: browserManager.getConnectionMode(), uptime: Math.floor((Date.now() - startTime) / 1000), tabs: browserManager.getTabCount(), currentUrl: browserManager.getCurrentUrl(), + token: AUTH_TOKEN, // Extension uses this for Bearer auth + chatEnabled: true, + agent: { + status: agentStatus, + runningFor: agentStartTime ? Date.now() - agentStartTime : null, + currentMessage, + queueLength: messageQueue.length, + }, + session: sidebarSession ? 
{ id: sidebarSession.id, name: sidebarSession.name } : null, }), { status: 200, headers: { 'Content-Type': 'application/json' }, }); } - // All other endpoints require auth + // Refs endpoint — no auth required (localhost-only), does NOT reset idle timer + if (url.pathname === '/refs') { + const refs = browserManager.getRefMap(); + return new Response(JSON.stringify({ + refs, + url: browserManager.getCurrentUrl(), + mode: browserManager.getConnectionMode(), + }), { + status: 200, + headers: { + 'Content-Type': 'application/json', + 'Access-Control-Allow-Origin': '*', + }, + }); + } + + // Activity stream — SSE, no auth (localhost-only), does NOT reset idle timer + if (url.pathname === '/activity/stream') { + const afterId = parseInt(url.searchParams.get('after') || '0', 10); + const encoder = new TextEncoder(); + + const stream = new ReadableStream({ + start(controller) { + // 1. Gap detection + replay + const { entries, gap, gapFrom, availableFrom } = getActivityAfter(afterId); + if (gap) { + controller.enqueue(encoder.encode(`event: gap\ndata: ${JSON.stringify({ gapFrom, availableFrom })}\n\n`)); + } + for (const entry of entries) { + controller.enqueue(encoder.encode(`event: activity\ndata: ${JSON.stringify(entry)}\n\n`)); + } + + // 2. Subscribe for live events + const unsubscribe = subscribe((entry) => { + try { + controller.enqueue(encoder.encode(`event: activity\ndata: ${JSON.stringify(entry)}\n\n`)); + } catch { + unsubscribe(); + } + }); + + // 3. Heartbeat every 15s + const heartbeat = setInterval(() => { + try { + controller.enqueue(encoder.encode(`: heartbeat\n\n`)); + } catch { + clearInterval(heartbeat); + unsubscribe(); + } + }, 15000); + + // 4. 
Cleanup on disconnect + req.signal.addEventListener('abort', () => { + clearInterval(heartbeat); + unsubscribe(); + try { controller.close(); } catch {} + }); + }, + }); + + return new Response(stream, { + headers: { + 'Content-Type': 'text/event-stream', + 'Cache-Control': 'no-cache', + 'Connection': 'keep-alive', + 'Access-Control-Allow-Origin': '*', + }, + }); + } + + // Activity history — REST, no auth (localhost-only), does NOT reset idle timer + if (url.pathname === '/activity/history') { + const limit = parseInt(url.searchParams.get('limit') || '50', 10); + const { entries, totalAdded } = getActivityHistory(limit); + return new Response(JSON.stringify({ entries, totalAdded, subscribers: getSubscriberCount() }), { + status: 200, + headers: { + 'Content-Type': 'application/json', + 'Access-Control-Allow-Origin': '*', + }, + }); + } + + // ─── Sidebar endpoints (auth required — token from /health) ──── + + // Sidebar routes are always available in headed mode (ungated in v0.12.0) + + // Sidebar chat history — read from in-memory buffer + if (url.pathname === '/sidebar-chat') { + if (!validateAuth(req)) { + return new Response(JSON.stringify({ error: 'Unauthorized' }), { status: 401, headers: { 'Content-Type': 'application/json' } }); + } + const afterId = parseInt(url.searchParams.get('after') || '0', 10); + const entries = chatBuffer.filter(e => e.id >= afterId); + return new Response(JSON.stringify({ entries, total: chatNextId }), { + status: 200, + headers: { 'Content-Type': 'application/json', 'Access-Control-Allow-Origin': '*' }, + }); + } + + // Sidebar → server: user message → queue or process immediately + if (url.pathname === '/sidebar-command' && req.method === 'POST') { + if (!validateAuth(req)) { + return new Response(JSON.stringify({ error: 'Unauthorized' }), { status: 401, headers: { 'Content-Type': 'application/json' } }); + } + const body = await req.json(); + const msg = body.message?.trim(); + if (!msg) { + return new Response(JSON.stringify({ 
error: 'Empty message' }), { status: 400, headers: { 'Content-Type': 'application/json' } }); + } + const ts = new Date().toISOString(); + addChatEntry({ ts, role: 'user', message: msg }); + if (sidebarSession) { sidebarSession.lastActiveAt = ts; saveSession(); } + + if (agentStatus === 'idle') { + spawnClaude(msg); + return new Response(JSON.stringify({ ok: true, processing: true }), { + status: 200, headers: { 'Content-Type': 'application/json' }, + }); + } else if (messageQueue.length < MAX_QUEUE) { + messageQueue.push({ message: msg, ts }); + return new Response(JSON.stringify({ ok: true, queued: true, position: messageQueue.length }), { + status: 200, headers: { 'Content-Type': 'application/json' }, + }); + } else { + return new Response(JSON.stringify({ error: 'Queue full (max 5)' }), { + status: 429, headers: { 'Content-Type': 'application/json' }, + }); + } + } + + // Clear sidebar chat + if (url.pathname === '/sidebar-chat/clear' && req.method === 'POST') { + if (!validateAuth(req)) { + return new Response(JSON.stringify({ error: 'Unauthorized' }), { status: 401, headers: { 'Content-Type': 'application/json' } }); + } + chatBuffer = []; + chatNextId = 0; + if (sidebarSession) { + try { fs.writeFileSync(path.join(SESSIONS_DIR, sidebarSession.id, 'chat.jsonl'), ''); } catch {} + } + return new Response(JSON.stringify({ ok: true }), { status: 200, headers: { 'Content-Type': 'application/json' } }); + } + + // Kill hung agent + if (url.pathname === '/sidebar-agent/kill' && req.method === 'POST') { + if (!validateAuth(req)) { + return new Response(JSON.stringify({ error: 'Unauthorized' }), { status: 401, headers: { 'Content-Type': 'application/json' } }); + } + killAgent(); + addChatEntry({ ts: new Date().toISOString(), role: 'agent', type: 'agent_error', error: 'Killed by user' }); + // Process next in queue + if (messageQueue.length > 0) { + const next = messageQueue.shift()!; + spawnClaude(next.message); + } + return new Response(JSON.stringify({ ok: true 
}), { status: 200, headers: { 'Content-Type': 'application/json' } }); + } + + // Stop agent (user-initiated) — queued messages remain for dismissal + if (url.pathname === '/sidebar-agent/stop' && req.method === 'POST') { + if (!validateAuth(req)) { + return new Response(JSON.stringify({ error: 'Unauthorized' }), { status: 401, headers: { 'Content-Type': 'application/json' } }); + } + killAgent(); + addChatEntry({ ts: new Date().toISOString(), role: 'agent', type: 'agent_error', error: 'Stopped by user' }); + return new Response(JSON.stringify({ ok: true, queuedMessages: messageQueue.length }), { + status: 200, headers: { 'Content-Type': 'application/json' }, + }); + } + + // Dismiss a queued message by index + if (url.pathname === '/sidebar-queue/dismiss' && req.method === 'POST') { + if (!validateAuth(req)) { + return new Response(JSON.stringify({ error: 'Unauthorized' }), { status: 401, headers: { 'Content-Type': 'application/json' } }); + } + const body = await req.json(); + const idx = body.index; + if (typeof idx === 'number' && idx >= 0 && idx < messageQueue.length) { + messageQueue.splice(idx, 1); + } + return new Response(JSON.stringify({ ok: true, queueLength: messageQueue.length }), { + status: 200, headers: { 'Content-Type': 'application/json' }, + }); + } + + // Session info + if (url.pathname === '/sidebar-session') { + if (!validateAuth(req)) { + return new Response(JSON.stringify({ error: 'Unauthorized' }), { status: 401, headers: { 'Content-Type': 'application/json' } }); + } + return new Response(JSON.stringify({ + session: sidebarSession, + agent: { status: agentStatus, runningFor: agentStartTime ? 
Date.now() - agentStartTime : null, currentMessage, queueLength: messageQueue.length, queue: messageQueue }, + }), { status: 200, headers: { 'Content-Type': 'application/json' } }); + } + + // Create new session + if (url.pathname === '/sidebar-session/new' && req.method === 'POST') { + if (!validateAuth(req)) { + return new Response(JSON.stringify({ error: 'Unauthorized' }), { status: 401, headers: { 'Content-Type': 'application/json' } }); + } + killAgent(); + messageQueue = []; + // Clean up old session's worktree before creating new one + if (sidebarSession?.worktreePath) removeWorktree(sidebarSession.worktreePath); + sidebarSession = createSession(); + return new Response(JSON.stringify({ ok: true, session: sidebarSession }), { + status: 200, headers: { 'Content-Type': 'application/json' }, + }); + } + + // List all sessions + if (url.pathname === '/sidebar-session/list') { + if (!validateAuth(req)) { + return new Response(JSON.stringify({ error: 'Unauthorized' }), { status: 401, headers: { 'Content-Type': 'application/json' } }); + } + return new Response(JSON.stringify({ sessions: listSessions(), activeId: sidebarSession?.id }), { + status: 200, headers: { 'Content-Type': 'application/json' }, + }); + } + + // Agent event relay — sidebar-agent.ts POSTs events here + if (url.pathname === '/sidebar-agent/event' && req.method === 'POST') { + if (!validateAuth(req)) { + return new Response(JSON.stringify({ error: 'Unauthorized' }), { status: 401, headers: { 'Content-Type': 'application/json' } }); + } + const body = await req.json(); + processAgentEvent(body); + // Handle agent lifecycle events + if (body.type === 'agent_done' || body.type === 'agent_error') { + agentProcess = null; + agentStartTime = null; + currentMessage = null; + if (body.type === 'agent_done') { + addChatEntry({ ts: new Date().toISOString(), role: 'agent', type: 'agent_done' }); + } + // Process next queued message + if (messageQueue.length > 0) { + const next = messageQueue.shift()!; + 
spawnClaude(next.message); + } else { + agentStatus = 'idle'; + } + } + // Capture claude session ID for --resume + if (body.claudeSessionId && sidebarSession && !sidebarSession.claudeSessionId) { + sidebarSession.claudeSessionId = body.claudeSessionId; + saveSession(); + } + return new Response(JSON.stringify({ ok: true }), { status: 200, headers: { 'Content-Type': 'application/json' } }); + } + + // ─── Auth-required endpoints ────────────────────────────────── + if (!validateAuth(req)) { return new Response(JSON.stringify({ error: 'Unauthorized' }), { status: 401, @@ -343,6 +1088,7 @@ async function start() { } if (url.pathname === '/command' && req.method === 'POST') { + resetIdleTimer(); // Only commands reset idle timer const body = await req.json(); return handleCommand(body); } @@ -352,13 +1098,14 @@ async function start() { }); // Write state file (atomic: write .tmp then rename) - const state = { + const state: Record = { pid: process.pid, port, token: AUTH_TOKEN, startedAt: new Date().toISOString(), serverPath: path.resolve(import.meta.dir, 'server.ts'), binaryVersion: readVersionHash() || undefined, + mode: browserManager.getConnectionMode(), }; const tmpFile = config.stateFile + '.tmp'; fs.writeFileSync(tmpFile, JSON.stringify(state, null, 2), { mode: 0o600 }); @@ -368,6 +1115,9 @@ async function start() { console.log(`[browse] Server running on http://127.0.0.1:${port} (PID: ${process.pid})`); console.log(`[browse] State file: ${config.stateFile}`); console.log(`[browse] Idle timeout: ${IDLE_TIMEOUT_MS / 1000}s`); + + // Initialize sidebar session (load existing or create new) + initSidebarSession(); } start().catch((err) => { diff --git a/browse/src/sidebar-agent.ts b/browse/src/sidebar-agent.ts new file mode 100644 index 00000000..6f28f5f4 --- /dev/null +++ b/browse/src/sidebar-agent.ts @@ -0,0 +1,278 @@ +/** + * Sidebar Agent — polls agent-queue from server, spawns claude -p for each + * message, streams live events back to the server via 
/sidebar-agent/event.
+ *
+ * This runs as a NON-COMPILED bun process because compiled bun binaries
+ * cannot posix_spawn external executables. The server writes to the queue
+ * file, this process reads it and spawns claude.
+ *
+ * Usage: BROWSE_BIN=/path/to/browse bun run browse/src/sidebar-agent.ts
+ */
+
+import { execSync, spawn } from 'child_process';
+import * as fs from 'fs';
+import * as path from 'path';
+
+const QUEUE = path.join(process.env.HOME || '/tmp', '.gstack', 'sidebar-agent-queue.jsonl');
+const SERVER_PORT = parseInt(process.env.BROWSE_SERVER_PORT || '34567', 10);
+const SERVER_URL = `http://127.0.0.1:${SERVER_PORT}`;
+const POLL_MS = 500; // Fast polling — server already did the user-facing response
+const B = process.env.BROWSE_BIN || path.resolve(__dirname, '../../.claude/skills/gstack/browse/dist/browse');
+
+// Number of (non-empty) queue lines already consumed by poll().
+let lastLine = 0;
+// Bearer token read from the server's /health response; null until fetched.
+let authToken: string | null = null;
+// True while a queue entry is being handled; poll() is a no-op meanwhile.
+let isProcessing = false;
+
+// ─── File drop relay ──────────────────────────────────────────
+
+/** Return the git work-tree root of the current directory, or null when `git rev-parse` fails. */
+function getGitRoot(): string | null {
+  try {
+    return execSync('git rev-parse --show-toplevel', { encoding: 'utf-8', stdio: ['pipe', 'pipe', 'pipe'] }).trim();
+  } catch {
+    return null;
+  }
+}
+
+/**
+ * Write the user's message as an "observation" JSON file into
+ * `<git root>/.context/sidebar-inbox/`. The file is written to a dot-prefixed
+ * `.tmp` name first and then renamed, so readers never see a partial file.
+ * Logs and returns without writing when not inside a git repo.
+ *
+ * @param message   Text stored as `userMessage`.
+ * @param pageUrl   Stored as `page.url`; 'unknown' when omitted.
+ * @param sessionId Stored as `sidebarSessionId`; 'unknown' when omitted.
+ */
+function writeToInbox(message: string, pageUrl?: string, sessionId?: string): void {
+  const gitRoot = getGitRoot();
+  if (!gitRoot) {
+    console.error('[sidebar-agent] Cannot write to inbox — not in a git repo');
+    return;
+  }
+
+  const inboxDir = path.join(gitRoot, '.context', 'sidebar-inbox');
+  fs.mkdirSync(inboxDir, { recursive: true });
+
+  const now = new Date();
+  // ':' is replaced so the ISO timestamp is usable as a file name.
+  const timestamp = now.toISOString().replace(/:/g, '-');
+  const filename = `${timestamp}-observation.json`;
+  const tmpFile = path.join(inboxDir, `.${filename}.tmp`);
+  const finalFile = path.join(inboxDir, filename);
+
+  const inboxMessage = {
+    type: 'observation',
+    timestamp: now.toISOString(),
+    page: { url: pageUrl || 'unknown', title: '' },
+    userMessage: message,
+    sidebarSessionId: sessionId || 'unknown',
+  };
+
+  fs.writeFileSync(tmpFile, JSON.stringify(inboxMessage, null, 2));
+  fs.renameSync(tmpFile, finalFile);
+  console.log(`[sidebar-agent] Wrote inbox message: ${filename}`);
+}
+
+// ─── Auth ────────────────────────────────────────────────────────
+
+/**
+ * Fetch the auth token from the server's /health endpoint (3s timeout) and
+ * cache it in `authToken`.
+ *
+ * @returns The token, or null when the request fails, is not ok, or carries no token.
+ */
+async function refreshToken(): Promise<string | null> {
+  try {
+    const resp = await fetch(`${SERVER_URL}/health`, { signal: AbortSignal.timeout(3000) });
+    if (!resp.ok) return null;
+    const data = await resp.json() as { token?: string };
+    authToken = data.token || null;
+    return authToken;
+  } catch {
+    return null;
+  }
+}
+
+// ─── Event relay to server ──────────────────────────────────────
+
+/**
+ * POST one event to the server's /sidebar-agent/event endpoint.
+ *
+ * Never rejects: network failures are logged and swallowed, and the event is
+ * dropped when no token can be obtained. If the server answers 401 the cached
+ * token is refreshed from /health and the POST is retried once, so a token
+ * that went stale does not silently discard every later event.
+ */
+async function sendEvent(event: Record<string, unknown>): Promise<void> {
+  if (!authToken) await refreshToken();
+  if (!authToken) return;
+
+  const post = (token: string) => fetch(`${SERVER_URL}/sidebar-agent/event`, {
+    method: 'POST',
+    headers: {
+      'Content-Type': 'application/json',
+      'Authorization': `Bearer ${token}`,
+    },
+    body: JSON.stringify(event),
+  });
+
+  try {
+    const resp = await post(authToken);
+    if (resp.status === 401) {
+      const fresh = await refreshToken();
+      if (fresh) await post(fresh);
+    }
+  } catch (err) {
+    console.error('[sidebar-agent] Failed to send event:', err);
+  }
+}
+
+// ─── Claude subprocess ──────────────────────────────────────────
+
+// Matches the literal browse binary path B (regex metacharacters escaped). Built once, not per call.
+const B_PATTERN = new RegExp(B.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'), 'g');
+
+/**
+ * Shorten a string for display: the browse binary path becomes `$B`,
+ * `/Users/<name>` becomes `~`, and conductor-workspace and gstack skill
+ * path prefixes are removed.
+ */
+function shorten(str: string): string {
+  return str
+    .replace(B_PATTERN, '$B')
+    .replace(/\/Users\/[^/]+/g, '~')
+    .replace(/\/conductor\/workspaces\/[^/]+\/[^/]+/g, '')
+    .replace(/\.claude\/skills\/gstack\//g, '')
+    .replace(/browse\/dist\/browse/g, '$B');
+}
+
+/**
+ * Produce a one-line summary of a tool call's input.
+ *
+ * @param tool  Tool name as reported in the stream (Bash, Read, Edit, Write, Grep, Glob, …).
+ * @param input The tool's input object; a falsy value yields ''.
+ * @returns The shortened command (truncated to 80 chars plus '…'), file path or
+ *          pattern; for other tools, the first 60 chars of the shortened JSON.
+ */
+function summarizeToolInput(tool: string, input: any): string {
+  if (!input) return '';
+  if (tool === 'Bash' && input.command) {
+    const cmd = shorten(input.command);
+    return cmd.length > 80 ? cmd.slice(0, 80) + '…' : cmd;
+  }
+  if (tool === 'Read' && input.file_path) return shorten(input.file_path);
+  if (tool === 'Edit' && input.file_path) return shorten(input.file_path);
+  if (tool === 'Write' && input.file_path) return shorten(input.file_path);
+  if (tool === 'Grep' && input.pattern) return `/${input.pattern}/`;
+  if (tool === 'Glob' && input.pattern) return input.pattern;
+  try { return shorten(JSON.stringify(input)).slice(0, 60); } catch { return ''; }
+}
+
+/**
+ * Translate one parsed stream-json event from claude into zero or more
+ * sendEvent() calls (session id, tool_use, text, text_delta, result).
+ * Event types not matched below are ignored.
+ */
+async function handleStreamEvent(event: any): Promise<void> {
+  if (event.type === 'system' && event.session_id) {
+    // Relay claude session ID for --resume support
+    await sendEvent({ type: 'system', claudeSessionId: event.session_id });
+  }
+
+  if (event.type === 'assistant' && event.message?.content) {
+    for (const block of event.message.content) {
+      if (block.type === 'tool_use') {
+        await sendEvent({ type: 'tool_use', tool: block.name, input: summarizeToolInput(block.name, block.input) });
+      } else if (block.type === 'text' && block.text) {
+        await sendEvent({ type: 'text', text: block.text });
+      }
+    }
+  }
+
+  if (event.type === 'content_block_start' && event.content_block?.type === 'tool_use') {
+    await sendEvent({ type: 'tool_use', tool: event.content_block.name, input: summarizeToolInput(event.content_block.name, event.content_block.input) });
+  }
+
+  if (event.type === 'content_block_delta' && event.delta?.type === 'text_delta' && event.delta.text) {
+    await sendEvent({ type: 'text_delta', text: event.delta.text });
+  }
+
+  if (event.type === 'result') {
+    await sendEvent({ type: 'result', text: event.result || '' });
+  }
+}
+
+/**
+ * Run one queue entry through `claude -p` and relay its stream-json output to
+ * the server.
+ *
+ * Sends `agent_start`, then exactly one terminal event: `agent_done` when the
+ * process closes, or `agent_error` on a spawn 'error' or when the 300s
+ * watchdog kills the process. The watchdog is cleared as soon as the run
+ * settles, so a run that finished normally can no longer emit a late
+ * "Timed out after 300s" error. Stream events are relayed one at a time in
+ * the order they were read, and the terminal event is sent after them.
+ * `isProcessing` is true from entry until the terminal event has been sent.
+ *
+ * @param queueEntry Parsed queue line; `prompt`, `stateFile` and `cwd` are read.
+ */
+async function askClaude(queueEntry: any): Promise<void> {
+  const { prompt, stateFile, cwd } = queueEntry;
+
+  isProcessing = true;
+  await sendEvent({ type: 'agent_start' });
+
+  return new Promise<void>((resolve) => {
+    // Build args fresh — don't trust --resume from queue (session may be stale)
+    const claudeArgs = ['-p', prompt, '--output-format', 'stream-json', '--verbose',
+      '--allowedTools', 'Bash,Read,Glob,Grep'];
+
+    // Validate cwd exists — queue may reference a stale worktree
+    let effectiveCwd = cwd || process.cwd();
+    try { fs.accessSync(effectiveCwd); } catch { effectiveCwd = process.cwd(); }
+
+    const proc = spawn('claude', claudeArgs, {
+      stdio: ['pipe', 'pipe', 'pipe'],
+      cwd: effectiveCwd,
+      env: { ...process.env, BROWSE_STATE_FILE: stateFile || '' },
+    });
+
+    proc.stdin.end();
+
+    let settled = false;
+    // Serializes relayed events so they reach the server in stream order.
+    let relay: Promise<void> = Promise.resolve();
+
+    // Parse one stream-json line and append it to the relay chain.
+    // Unparseable lines are skipped; nothing is relayed once the run has settled.
+    const enqueue = (raw: string): void => {
+      if (settled) return;
+      let parsed: any;
+      try { parsed = JSON.parse(raw); } catch { return; }
+      relay = relay.then(() => handleStreamEvent(parsed)).catch(() => {});
+    };
+
+    // Send the terminal event once: 'close', 'error' and the watchdog can all fire for one run.
+    const finish = (event: Record<string, unknown>): void => {
+      if (settled) return;
+      settled = true;
+      clearTimeout(watchdog);
+      relay
+        .then(() => sendEvent(event))
+        .then(() => {
+          isProcessing = false;
+          resolve();
+        });
+    };
+
+    // Timeout after 300 seconds (5 min — multi-page tasks need time)
+    const watchdog = setTimeout(() => {
+      try { proc.kill(); } catch {}
+      finish({ type: 'agent_error', error: 'Timed out after 300s' });
+    }, 300000);
+
+    let buffer = '';
+
+    proc.stdout.on('data', (data: Buffer) => {
+      buffer += data.toString();
+      const lines = buffer.split('\n');
+      // The last piece may be a partial line; keep it for the next chunk.
+      buffer = lines.pop() || '';
+      for (const line of lines) {
+        if (!line.trim()) continue;
+        enqueue(line);
+      }
+    });
+
+    proc.stderr.on('data', () => {}); // Claude logs to stderr, ignore
+
+    proc.on('close', () => {
+      if (buffer.trim()) enqueue(buffer);
+      finish({ type: 'agent_done' });
+    });
+
+    proc.on('error', (err) => {
+      finish({ type: 'agent_error', error: err.message });
+    });
+  });
+}
+
+// ─── Poll loop ───────────────────────────────────────────────────
+
+/** Read the queue file and return its non-empty lines; [] when it cannot be read. */
+function readQueueLines(): string[] {
+  try {
+    return fs.readFileSync(QUEUE, 'utf-8').split('\n').filter(Boolean);
+  } catch { return []; }
+}
+
+/** Number of non-empty lines currently in the queue file (0 when unreadable). */
+function countLines(): number {
+  return readQueueLines().length;
+}
+
+/** Return the n-th (1-based) non-empty queue line, or null when it does not exist. */
+function readLine(n: number): string | null {
+  return readQueueLines()[n - 1] || null;
+}
+
+/**
+ * Process queue lines appended since the last poll, one at a time.
+ *
+ * Lines that are not JSON, or that carry neither `message` nor `prompt`, are
+ * skipped. The inbox write is best-effort: a failure there is logged and the
+ * entry is still handed to askClaude(). If askClaude() itself rejects,
+ * `isProcessing` is reset so later polls are not blocked forever.
+ */
+async function poll() {
+  if (isProcessing) return; // One at a time — server handles queuing
+
+  const current = countLines();
+  if (current <= lastLine) return;
+
+  while (lastLine < current && !isProcessing) {
+    lastLine++;
+    const line = readLine(lastLine);
+    if (!line) continue;
+
+    let entry: any;
+    try { entry = JSON.parse(line); } catch { continue; }
+    if (!entry.message && !entry.prompt) continue;
+
+    const text = entry.message || entry.prompt;
+    console.log(`[sidebar-agent] Processing: "${text}"`);
+    // Write to inbox so workspace agent can pick it up
+    try {
+      writeToInbox(text, entry.pageUrl, entry.sessionId);
+    } catch (err) {
+      console.error('[sidebar-agent] Failed to write inbox message:', err);
+    }
+    try {
+      await askClaude(entry);
+    } catch (err) {
+      console.error(`[sidebar-agent] Error:`, err);
+      await sendEvent({ type: 'agent_error', error: String(err) });
+      isProcessing = false;
+    }
+  }
+}
+
+// ─── Main ────────────────────────────────────────────────────────
+
+/**
+ * Ensure the queue file exists, skip everything already in it, fetch the
+ * auth token, and start polling every POLL_MS milliseconds.
+ */
+async function main() {
+  const dir = path.dirname(QUEUE);
+  fs.mkdirSync(dir, { recursive: true });
+  if (!fs.existsSync(QUEUE)) fs.writeFileSync(QUEUE, '');
+
+  // Start after the existing lines so old entries are not replayed.
+  lastLine = countLines();
+  await refreshToken();
+
+  console.log(`[sidebar-agent] Started. Watching ${QUEUE} from line ${lastLine}`);
+  console.log(`[sidebar-agent] Server: ${SERVER_URL}`);
+  console.log(`[sidebar-agent] Browse binary: ${B}`);
+
+  setInterval(poll, POLL_MS);
+}
+
+main().catch(console.error);
diff --git a/browse/src/snapshot.ts b/browse/src/snapshot.ts index 24380bad..840cd686 100644 --- a/browse/src/snapshot.ts +++ b/browse/src/snapshot.ts @@ -17,7 +17,7 @@ * Later: "click @e3" → look up Locator → locator.click() */ -import type { Page, Locator } from 'playwright'; +import type { Page, Frame, Locator } from 'playwright'; import type { BrowserManager, RefEntry } from './browser-manager'; import * as Diff from 'diff'; import { TEMP_DIR, isPathWithin } from './platform'; @@ -136,15 +136,18 @@ export async function handleSnapshot( ): Promise { const opts = parseSnapshotArgs(args); const page = bm.getPage(); + // Frame-aware target for accessibility tree + const target = bm.getActiveFrameOrPage(); + const inFrame = bm.getFrame() !== null; // Get accessibility tree via ariaSnapshot let
rootLocator: Locator; if (opts.selector) { - rootLocator = page.locator(opts.selector); + rootLocator = target.locator(opts.selector); const count = await rootLocator.count(); if (count === 0) throw new Error(`Selector not found: ${opts.selector}`); } else { - rootLocator = page.locator('body'); + rootLocator = target.locator('body'); } const ariaText = await rootLocator.ariaSnapshot(); @@ -205,11 +208,11 @@ export async function handleSnapshot( let locator: Locator; if (opts.selector) { - locator = page.locator(opts.selector).getByRole(node.role as any, { + locator = target.locator(opts.selector).getByRole(node.role as any, { name: node.name || undefined, }); } else { - locator = page.getByRole(node.role as any, { + locator = target.getByRole(node.role as any, { name: node.name || undefined, }); } @@ -233,7 +236,7 @@ export async function handleSnapshot( // ─── Cursor-interactive scan (-C) ───────────────────────── if (opts.cursorInteractive) { try { - const cursorElements = await page.evaluate(() => { + const cursorElements = await target.evaluate(() => { const STANDARD_INTERACTIVE = new Set([ 'A', 'BUTTON', 'INPUT', 'SELECT', 'TEXTAREA', 'SUMMARY', 'DETAILS', ]); @@ -287,7 +290,7 @@ export async function handleSnapshot( let cRefCounter = 1; for (const elem of cursorElements) { const ref = `c${cRefCounter++}`; - const locator = page.locator(elem.selector); + const locator = target.locator(elem.selector); refMap.set(ref, { locator, role: 'cursor-interactive', name: elem.text }); output.push(`@${ref} [${elem.reason}] "${elem.text}"`); } @@ -394,5 +397,11 @@ export async function handleSnapshot( // Store for future diffs bm.setLastSnapshot(snapshotText); + // Add frame context header when operating inside an iframe + if (inFrame) { + const frameUrl = bm.getFrame()?.url() ?? 
'unknown'; + output.unshift(`[Context: iframe src="${frameUrl}"]`); + } + return output.join('\n'); } diff --git a/browse/src/write-commands.ts b/browse/src/write-commands.ts index 3e80c7fd..02413daf 100644 --- a/browse/src/write-commands.ts +++ b/browse/src/write-commands.ts @@ -18,9 +18,13 @@ export async function handleWriteCommand( bm: BrowserManager ): Promise { const page = bm.getPage(); + // Frame-aware target for locator-based operations (click, fill, etc.) + const target = bm.getActiveFrameOrPage(); + const inFrame = bm.getFrame() !== null; switch (command) { case 'goto': { + if (inFrame) throw new Error('Cannot use goto inside a frame. Run \'frame main\' first.'); const url = args[0]; if (!url) throw new Error('Usage: browse goto '); await validateNavigationUrl(url); @@ -30,16 +34,19 @@ export async function handleWriteCommand( } case 'back': { + if (inFrame) throw new Error('Cannot use back inside a frame. Run \'frame main\' first.'); await page.goBack({ waitUntil: 'domcontentloaded', timeout: 15000 }); return `Back → ${page.url()}`; } case 'forward': { + if (inFrame) throw new Error('Cannot use forward inside a frame. Run \'frame main\' first.'); await page.goForward({ waitUntil: 'domcontentloaded', timeout: 15000 }); return `Forward → ${page.url()}`; } case 'reload': { + if (inFrame) throw new Error('Cannot use reload inside a frame. Run \'frame main\' first.'); await page.reload({ waitUntil: 'domcontentloaded', timeout: 15000 }); return `Reloaded ${page.url()}`; } @@ -73,15 +80,14 @@ export async function handleWriteCommand( if ('locator' in resolved) { await resolved.locator.click({ timeout: 5000 }); } else { - await page.click(resolved.selector, { timeout: 5000 }); + await target.locator(resolved.selector).click({ timeout: 5000 }); } } catch (err: any) { // Enhanced error guidance: clicking