diff --git a/.github/workflows/pr-title-sync.yml b/.github/workflows/pr-title-sync.yml
new file mode 100644
index 00000000..023f5f66
--- /dev/null
+++ b/.github/workflows/pr-title-sync.yml
@@ -0,0 +1,69 @@
+# Keeps a PR title of the form "vX.Y.Z.W <rest>" in step with the VERSION file.
+# Titles without that four-part v-prefix are treated as custom and left alone.
+name: PR Title Sync
+
+on:
+  pull_request:
+    types: [opened, synchronize, edited]
+    paths:
+      - 'VERSION'
+
+concurrency:
+  group: pr-title-sync-${{ github.event.pull_request.number }}
+  cancel-in-progress: true
+
+jobs:
+  sync:
+    name: Sync PR title to VERSION
+    runs-on: ubuntu-latest
+    permissions:
+      contents: read
+      pull-requests: write
+    # Skip bot-triggered runs. Also skip fork PRs: their GITHUB_TOKEN is
+    # read-only, so the title edit below would fail the job instead of working.
+    if: github.actor != 'github-actions[bot]' && github.event.pull_request.head.repo.full_name == github.repository
+    steps:
+      - name: Checkout PR head
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 1
+          ref: ${{ github.event.pull_request.head.sha }}
+
+      - name: Read VERSION + current title
+        id: inspect
+        run: |
+          set -euo pipefail
+          VERSION=$(tr -d '[:space:]' < VERSION)
+          TITLE=$(jq -r '.pull_request.title' "$GITHUB_EVENT_PATH")
+          echo "version=$VERSION" >> "$GITHUB_OUTPUT"
+          # Only rewrite titles that ALREADY follow the v prefix pattern.
+          # Custom titles (no prefix) are left alone — user kept them intentionally.
+          if printf '%s' "$TITLE" | grep -qE '^v[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+ '; then
+            # PREFIX is the first word (the old version tag); REST is the title without it.
+            PREFIX=$(printf '%s' "$TITLE" | awk '{print $1}')
+            REST=$(printf '%s' "$TITLE" | sed 's/^v[0-9][0-9.]* //')
+            {
+              echo "prefix=$PREFIX"
+              echo "rest=$REST"
+              echo "eligible=true"
+            } >> "$GITHUB_OUTPUT"
+          else
+            echo "eligible=false" >> "$GITHUB_OUTPUT"
+          fi
+
+      - name: Rewrite title if version changed
+        if: steps.inspect.outputs.eligible == 'true'
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          PR_NUM: ${{ github.event.pull_request.number }}
+          NEW_V: ${{ steps.inspect.outputs.version }}
+          OLD_PREFIX: ${{ steps.inspect.outputs.prefix }}
+          REST: ${{ steps.inspect.outputs.rest }}
+        run: |
+          if [ "v$NEW_V" = "$OLD_PREFIX" ]; then
+            echo "Title already matches v$NEW_V; no change."
+            exit 0
+          fi
+          NEW_TITLE="v$NEW_V $REST"
+          echo "Rewriting: $OLD_PREFIX ... → v$NEW_V ..."
+          gh pr edit "$PR_NUM" --title "$NEW_TITLE"
diff --git a/.github/workflows/version-gate.yml b/.github/workflows/version-gate.yml
new file mode 100644
index 00000000..262baf6e
--- /dev/null
+++ b/.github/workflows/version-gate.yml
@@ -0,0 +1,89 @@
+# Gate: checks that a PR's VERSION is not stale versus the queue reported by
+# bin/gstack-next-version. A failed queue lookup fails open (offline marker).
+name: Version Gate
+
+on:
+  pull_request:
+    paths:
+      - 'VERSION'
+      - 'CHANGELOG.md'
+      - 'package.json'
+
+concurrency:
+  group: version-gate-${{ github.event.pull_request.number }}
+  cancel-in-progress: true
+
+jobs:
+  check:
+    name: Check VERSION is not stale vs queue
+    runs-on: ubuntu-latest
+    permissions:
+      contents: read
+      pull-requests: read
+    steps:
+      - name: Checkout PR head
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+          ref: ${{ github.event.pull_request.head.sha }}
+
+      - name: Setup Bun
+        uses: oven-sh/setup-bun@v2
+
+      # Expression values reach the scripts below through env vars rather than
+      # inline interpolation: the base ref and VERSION contents are text from
+      # the event payload / PR files and must never be parsed as shell syntax.
+      - name: Read versions
+        id: versions
+        env:
+          BASE_REF: ${{ github.event.pull_request.base.ref }}
+        run: |
+          set -euo pipefail
+          PR_VERSION=$(tr -d '[:space:]' < VERSION)
+          git fetch origin "$BASE_REF" --depth=1 --quiet || true
+          # A missing base VERSION (or a failed fetch) falls back to 0.0.0.0.
+          BASE_VERSION=$(git show "origin/$BASE_REF:VERSION" 2>/dev/null | tr -d '[:space:]' || echo "0.0.0.0")
+          {
+            echo "pr_version=$PR_VERSION"
+            echo "base_version=$BASE_VERSION"
+            echo "base_ref=$BASE_REF"
+          } >> "$GITHUB_OUTPUT"
+
+      - name: Detect bump level
+        id: bump
+        env:
+          BASE_VERSION: ${{ steps.versions.outputs.base_version }}
+          PR_VERSION: ${{ steps.versions.outputs.pr_version }}
+        run: |
+          LEVEL=$(bun run scripts/detect-bump.ts "$BASE_VERSION" "$PR_VERSION")
+          echo "level=$LEVEL" >> "$GITHUB_OUTPUT"
+
+      - name: Query queue (util) — fail-open on error
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          BASE_REF: ${{ steps.versions.outputs.base_ref }}
+          BUMP_LEVEL: ${{ steps.bump.outputs.level }}
+          BASE_VERSION: ${{ steps.versions.outputs.base_version }}
+          PR_NUMBER: ${{ github.event.pull_request.number }}
+        run: |
+          set +e
+          bun run bin/gstack-next-version \
+            --base "$BASE_REF" \
+            --bump "$BUMP_LEVEL" \
+            --current-version "$BASE_VERSION" \
+            --workspace-root null \
+            --exclude-pr "$PR_NUMBER" \
+            > next.json 2> next.err
+          RC=$?
+          if [ "$RC" != "0" ] || [ ! -s next.json ]; then
+            echo '{"offline":true}' > next.json
+            echo "::warning::util exit=$RC — failing open. stderr:"
+            cat next.err || true
+          fi
+
+      - name: Compare PR VERSION to next free slot
+        env:
+          PR_VERSION: ${{ steps.versions.outputs.pr_version }}
+          PR_NUMBER: ${{ github.event.pull_request.number }}
+        run: |
+          bun run scripts/compare-pr-version.ts next.json "$PR_NUMBER"
diff --git a/.gitignore b/.gitignore
index bb6e841a..979bc17c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -20,6 +20,10 @@ bin/gstack-global-discover
 .gbrain/
 .context/
 extension/.auth.json
+# xterm assets are vendored from npm at build time; not source-of-truth.
+extension/lib/xterm.js
+extension/lib/xterm.css
+extension/lib/xterm-addon-fit.js
 .gstack-worktrees/
 /tmp/
 *.log
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
new file mode 100644
index 00000000..7e5e1fa3
--- /dev/null
+++ b/.gitlab-ci.yml
@@ -0,0 +1,79 @@
+# GitLab CI parity for workspace-aware ship.
+# Mirrors .github/workflows/version-gate.yml and pr-title-sync.yml.
+# Projects that mirror to GitLab get the same protection as GitHub.
+
+stages:
+  - check
+
+variables:
+  BUN_VERSION: "1.3.10"
+
+# Shared install steps. unzip is included because the Bun installer needs it.
+.setup-bun: &setup-bun
+  - apt-get update -qq && apt-get install -qq -y curl jq git unzip
+  - curl -fsSL https://bun.sh/install | bash -s "bun-v$BUN_VERSION"
+  - export PATH="$HOME/.bun/bin:$PATH"
+
+version-gate:
+  stage: check
+  image: debian:stable-slim
+  rules:
+    - if: '$CI_PIPELINE_SOURCE == "merge_request_event"'
+      changes:
+        - VERSION
+        - CHANGELOG.md
+        - package.json
+  script:
+    - *setup-bun
+    - PR_VERSION=$(tr -d '[:space:]' < VERSION)
+    # Fetch the target branch so origin/<target> resolves, mirroring the GitHub
+    # workflow; otherwise BASE_VERSION can silently fall back to 0.0.0.0.
+    - TARGET="$CI_MERGE_REQUEST_TARGET_BRANCH_NAME"
+    - git fetch origin "+refs/heads/$TARGET:refs/remotes/origin/$TARGET" --depth=1 --quiet || true
+    - BASE_VERSION=$(git show "origin/$CI_MERGE_REQUEST_TARGET_BRANCH_NAME:VERSION" 2>/dev/null | tr -d '[:space:]' || echo "0.0.0.0")
+    - LEVEL=$(bun run scripts/detect-bump.ts "$BASE_VERSION" "$PR_VERSION")
+    # Util fail-open: on non-zero exit, emit offline marker
+    - |
+      set +e
+      bun run bin/gstack-next-version \
+        --base "$CI_MERGE_REQUEST_TARGET_BRANCH_NAME" \
+        --bump "$LEVEL" \
+        --current-version "$BASE_VERSION" \
+        --workspace-root null \
+        --exclude-pr "$CI_MERGE_REQUEST_IID" \
+        > next.json
+      RC=$?
+      if [ "$RC" != "0" ] || [ ! -s next.json ]; then
+        echo '{"offline":true}' > next.json
+        echo "WARNING: util exit=$RC — failing open"
+      fi
+      set -e
+    - PR_VERSION="$PR_VERSION" bun run scripts/compare-pr-version.ts next.json "$CI_MERGE_REQUEST_IID"
+
+pr-title-sync:
+  stage: check
+  image: debian:stable-slim
+  rules:
+    - if: '$CI_PIPELINE_SOURCE == "merge_request_event"'
+      changes:
+        - VERSION
+  script:
+    # NOTE(review): glab needs API credentials (for example a GITLAB_TOKEN CI/CD
+    # variable) to update the MR; none is set in this file — confirm one exists.
+    - apt-get update -qq && apt-get install -qq -y curl jq git
+    - curl -fsSL https://gitlab.com/gitlab-org/cli/-/releases/permalink/latest/downloads/glab_linux_amd64.deb -o glab.deb && dpkg -i glab.deb
+    - VERSION=$(tr -d '[:space:]' < VERSION)
+    - TITLE="$CI_MERGE_REQUEST_TITLE"
+    - |
+      if printf '%s' "$TITLE" | grep -qE '^v[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+ '; then
+        PREFIX=$(printf '%s' "$TITLE" | awk '{print $1}')
+        REST=$(printf '%s' "$TITLE" | sed 's/^v[0-9][0-9.]* //')
+        if [ "v$VERSION" != "$PREFIX" ]; then
+          echo "Rewriting: $PREFIX ... → v$VERSION ..."
+ glab mr update "$CI_MERGE_REQUEST_IID" -t "v$VERSION $REST" + else + echo "Title already matches v$VERSION; no change." + fi + else + echo "Title does not use v prefix — leaving alone." + fi diff --git a/CHANGELOG.md b/CHANGELOG.md index e1e1aef1..9d089a69 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -64,60 +64,595 @@ Phase 2 (next release) is where the productivity gain lives. The agent prototype - The canonical SDK at `browse/src/browse-client.ts` and the sibling at `browser-skills/hackernews-frontpage/_lib/browse-client.ts` MUST be byte-identical. The skill-validation test fails the build otherwise. When the canonical SDK changes, update every bundled skill's `_lib/` copy. - Phase 2 design questions are tracked in `docs/designs/BROWSER_SKILLS_V1.md` ("Phase 2 sketch"). Specifically: synthesis from lossy activity feed, and Bun runtime distribution for user-authored skills landing on machines without Bun. -## [1.8.0.0] - 2026-04-25 +## [1.15.0.0] - 2026-04-26 -## **Two new browser primitives that compound the agent over time. Per-site notes save what works once and reuse it. Raw CDP gets a tightly-scoped escape hatch.** +## **Real-PTY test harness ships. 11 plan-mode E2E tests, 23 unit tests, and 50K fewer tokens per invocation.** -The agent learns LinkedIn's iframe trick once and remembers it next session. That's the whole pitch. `$B domain-skill save` writes a per-site markdown note keyed to the active tab's hostname; future sessions on that host get the note injected into their prompt. New skills land quarantined, auto-promote to active after 3 successful uses without classifier flags, and stay per-project unless you explicitly promote to global. Storage piggybacks on `/learn`'s JSONL so the same tooling works. - -`$B cdp ` is the escape hatch when curated commands miss. Deny-default by construction: ~25 read-only methods are pre-allowed (Accessibility tree, DOM/CSS inspection, Performance metrics, screenshots, viewport overrides). 
Adding a method requires a PR with a one-line justification. Dangerous methods that would be RCE or silent exfil if exposed (`Runtime.evaluate`, `Page.navigate`, `Network.getResponseBody`, `Browser.close`, `Target.attachToTarget`, etc.) are intentionally absent and verified absent by a unit test. - -Both features went through CEO review (9 decisions), DevEx review (5/10 to 8/10), Eng review, and a brutal Codex outside-voice pass. Codex's pass rolled back significant scope: a planned "agents author their own gstack commands" expansion was deferred to a P1 TODO with "needs out-of-process isolation design" attached, because in-daemon agent-authored TypeScript can't be safely contained with AST + approval gate alone. The shipped scope is what the security model actually defends. +Two big pieces of engineering in one release. The headline is a real-PTY test harness — 654 lines of TypeScript on top of `Bun.spawn({terminal:})` — that drives the actual `claude` binary and parses rendered terminal frames. Six new E2E tests on the harness cover behaviors that were structurally unreachable before: format compliance for every gstack `AskUserQuestion`, plan-design UI-scope detection (positive coverage), tool-budget regression vs prior runs, `/ship` end-to-end idempotency against a real git fixture, `/plan-ceo` answer-routing, and `/autoplan` phase sequencing. The branch nets ~11.6K lines smaller against `main` while adding ~1,450 lines of new TypeScript test code — preamble resolvers were rewritten to keep every semantic rule in less prose, and the test surface that catches AskUserQuestion drift expanded from zero to gate-tier on every PR. ### The numbers that matter -Source: 30 unit tests in `browse/test/` (`domain-skills-storage.test.ts`, `cdp-allowlist.test.ts`, `cdp-mutex.test.ts`, `telemetry.test.ts`), all passing in under one second. +Branch totals come from `git diff --shortstat origin/main..HEAD`. 
Token-level reduction comes from regenerating every `SKILL.md` against the rewritten resolvers (`bun run gen:skill-docs --host all`). E2E numbers come from `EVALS=1 EVALS_TIER=gate bun test test/skill-e2e-*.test.ts` on a clean working tree. -| Surface | Shape | +| Metric | Δ | |---|---| -| New `$B` commands | `domain-skill` (8 subcommands), `cdp` | -| New modules | 7 (`domain-skills.ts`, `domain-skill-commands.ts`, `cdp-allowlist.ts`, `cdp-bridge.ts`, `cdp-commands.ts`, `project-slug.ts`, `telemetry.ts`) | -| Lines of agent-facing TypeScript shipped | ~1100 (including 350+ of allowlist + state machine + mutex tests) | -| Curated CDP allowlist size | 25 methods, deny-default | -| Dangerous CDP surfaces verified absent | 18 (Runtime/Debugger/Page navigation/Network exfil/Browser/Target) | -| Codex outside-voice findings resolved | 7 of 20 (12 mooted by T1 scope drop) | -| State-machine transitions covered by tests | 6 (save to quarantined, 3-use auto-promote, classifier-flag-blocks-promotion, promote-to-global, rollback, tombstone) | +| Net branch size vs `main` | **−11,609 lines** (89 files, +7,240 / −18,849) | +| New test files added | **8 files** (1 harness unit-test + 7 E2E tests) | +| New test code shipped | **~1,453 lines** of TypeScript | +| Real-PTY harness module | **654 lines** in `test/helpers/claude-pty-runner.ts` | +| Per-invocation token savings | **−196K tokens (−25%)** on cold reads | +| `plan-ceo-review` preamble | **−43%** (54 KB → 31 KB) | +| Plan-mode E2E test count | **5 → 11** | +| New gate-tier paid E2E tests | **+3** (format compliance, design-with-UI, budget regression) | +| New periodic-tier paid E2E tests | **+3** (mode-routing, ship-idempotency, autoplan-chain) | +| Helper unit test coverage | **+23 tests** for parser + budget primitives | +| All free tests | **49 pass, 0 fail** | + +| Skill class | Per-invocation surface | Δ | +|---|---|---| +| Tier-≥3 plan reviews (full preamble) | ~50 KB → ~30 KB | −40% | +| Tier-1 quick skills | ~12 KB 
→ ~9 KB | −25% | + +Every gstack invocation now sends ~50K fewer tokens to the model on cold reads — that's roughly a quarter of a typical 200K context window freed up for actual work. Tier-≥3 plan reviews keep their full functional surface (Brain Sync, Context Recovery, Routing Injection) and still lose almost half the bytes. ### What this means for builders -Domain skills are how an agent gets faster on a site over time. The first time it figures out LinkedIn's apply-button iframe, it costs minutes. Save that as a skill and the next session starts already knowing it. Across a sprint of repetitive site work, you'll feel the compounding inside a week. To opt into cross-project compounding (your LinkedIn skill follows you to every project, for instance), one explicit `$B domain-skill promote-to-global` per skill. Never silent, because Codex correctly argued that silent cross-project leakage is a privacy and contamination vector. +Three new classes of regression that were previously impossible to catch now block every PR. **Format drift**: a missing `Recommendation:` line or absent Pros/Cons bullet on an `AskUserQuestion` is caught against the real rendered terminal — not the model's claim about what it would have shown. **Conditional skill paths**: `/plan-design-review` had to early-exit when there's no UI scope, but until this release nothing tested the *positive* path; a regression that flipped the detector to "early-exit always" could have shipped silently. **Tool-budget regressions**: a preamble change that makes any skill burn 2× its prior tool calls fails a free, branch-scoped assertion that runs on every `bun test`. -`$B cdp` exists for the rare case you need raw CDP. Use it when curated commands don't fit, file a PR adding the method to the allowlist when you're done so the next agent doesn't need it. 
Or, if you don't want gstack's rails at all, the README now plugs [browser-use/browser-harness-js](https://github.com/browser-use/browser-harness-js): different philosophy, different tradeoffs, also good. +The harness itself is a reusable primitive. `runPlanSkillObservation()` watches plan-mode terminal output and classifies outcomes as `asked` / `plan_ready` / `silent_write` / `exited` / `timeout`. Three periodic-tier tests built on top of it cover the heavier cases — multi-phase chain ordering, ship idempotency state-machine end-to-end, and answer routing through 8-12 sequential prompts — that don't fit a per-PR budget but run weekly. Pull, run `bun run gen:skill-docs --host all`, and every skill invocation is meaningfully smaller and meaningfully better-tested than the prior release. ### Itemized changes #### Added -- `$B domain-skill save|list|show|edit|promote-to-global|rollback|rm`. Host derived from active tab's top-level origin, closing a confused-deputy class of bugs. Body via stdin or `--from-file`, never inline argv. -- `$B cdp [json-params]`. Deny-default allowlist (`browse/src/cdp-allowlist.ts`). Output for data-exfil methods wrapped in UNTRUSTED envelope. -- Two-tier CDP mutex in `browser-manager.ts`: per-tab plus global escalation, 5-second acquire timeout with `try/finally` release. -- Lightweight telemetry in `~/.gstack/analytics/browse-telemetry.jsonl` for `domain_skill_*` and `cdp_method_*` signals. Fire-and-forget. Hostname and method only. `GSTACK_TELEMETRY_OFF=1` silences. -- Sidebar-agent prompt context now injects per-project plus global domain skills matching the active tab's hostname, wrapped in UNTRUSTED markers. -- `docs/domain-skills.md` reference plus error lookup table. -- README plug for browser-harness-js as the no-rails alternative. +- `test/helpers/claude-pty-runner.ts`: real-PTY test harness using `Bun.spawn({terminal:})` (Bun 1.3.10+ has built-in PTY — no `node-pty`, no native modules). 
Exposes `launchClaudePty()` for raw session control and `runPlanSkillObservation()` as the high-level contract for plan-mode skill tests. +- `parseNumberedOptions(visible)` and `isPermissionDialogVisible(visible)` helpers in `claude-pty-runner.ts`. Tests can now look up an option index by its label without hard-coding positions, and auto-grant Claude Code's file-edit / workspace-trust / bash-permission dialogs that fire during preamble side-effects. +- `findBudgetRegressions()` and `assertNoBudgetRegression()` in `test/helpers/eval-store.ts`. Pure functions returning tests that grew >2× in tools or turns vs the prior eval run, with floors at 5 prior tools / 3 prior turns to avoid noise. Env override `GSTACK_BUDGET_RATIO`. +- 6 new real-PTY E2E tests on the harness: + - `skill-e2e-ask-user-question-format-compliance.test.ts` (gate, ~$0.50/run): asserts every gstack `AskUserQuestion` rendering contains the 7 mandated format elements (ELI10, Recommendation, Pros/Cons with ✅/❌, Net, `(recommended)` label). + - `skill-e2e-plan-design-with-ui.test.ts` (gate, ~$0.80/run): positive coverage for `/plan-design-review` UI-scope detection. Counterpart to the existing no-UI early-exit test — without it, a regression that flips the detector to "early-exit always" would ship undetected. + - `skill-budget-regression.test.ts` (gate, free): branch-scoped library-only assertion that no skill burns >2× tools or turns vs its prior recorded run. + - `skill-e2e-plan-ceo-mode-routing.test.ts` (periodic, ~$3/run): verifies AskUserQuestion answer routing — HOLD SCOPE picks routes to rigor language, SCOPE EXPANSION picks route to expansion language. + - `skill-e2e-ship-idempotency.test.ts` (periodic, ~$3/run): runs `/ship` end-to-end against a real git fixture with `STATE: ALREADY_BUMPED` baked in; asserts no double-bump, no double-commit, no fixture mutation. 
+ - `skill-e2e-autoplan-chain.test.ts` (periodic, ~$8/run): asserts `/autoplan` phase ordering by tee'ing timestamps as each `**Phase N complete.**` marker appears. +- `test/helpers-unit.test.ts`: 23 unit tests covering `parseNumberedOptions` edge cases (empty, partial paint, >9 options, stale-vs-fresh anchoring) and `findBudgetRegressions` (noise floor, env override, missing tool data). +- `test/fixtures/plans/ui-heavy-feature.md`: planted plan with explicit UI scope keywords for the new design-with-UI test. +- Auto-handling of the workspace-trust dialog so tests run in temp directories without manual intervention. +- Outcome contract: `asked` | `plan_ready` | `silent_write` | `exited` | `timeout`. Tests pass on `asked` or `plan_ready`, fail on the rest. #### Changed -- `browse/src/server.ts` `spawnClaude` is now async to await `readSkill`. The system prompt has a one-line introduction to `$B domain-skill` so agents discover the feature. -- `browse/src/commands.ts` registers `domain-skill` and `cdp` as META commands. +- 18 preamble resolvers compressed: `generate-ask-user-format.ts`, `generate-brain-sync-block.ts`, `generate-completeness-section.ts`, `generate-completion-status.ts`, `generate-confusion-protocol.ts`, `generate-context-health.ts`, `generate-context-recovery.ts`, `generate-continuous-checkpoint.ts`, `generate-lake-intro.ts`, `generate-preamble-bash.ts`, `generate-proactive-prompt.ts`, `generate-routing-injection.ts`, `generate-telemetry-prompt.ts`, `generate-upgrade-check.ts`, `generate-vendoring-deprecation.ts`, `generate-voice-directive.ts`, `generate-writing-style-migration.ts`, `generate-writing-style.ts`. +- All 47 generated `SKILL.md` files regenerated; 3 ship golden fixtures regenerated. +- Plan-* skills retain full preamble surface (Brain Sync, Context Recovery, Routing Injection) — the early slim attempt that cut these was reverted after diagnosing them as load-bearing. 
+- 5 existing plan-mode tests (`plan-ceo`, `plan-eng`, `plan-design`, `plan-devex`, `plan-mode-no-op`) rewritten onto the new harness with a 300s observation budget. All 5 verify-pass under `EVALS=1 EVALS_TIER=gate` against the real `claude` binary in 790s sequential. +- `isNumberedOptionListVisible` regex tolerates whitespace collapse from TTY cursor-positioning escapes (`\x1b[40C`) which `stripAnsi` removes — `\b2\.` was failing on word-to-word transitions where stripped output read `text2.`. + +#### Fixed + +- `scripts/skill-check.ts`: new `isRepoRootSymlink()` helper so dev installs that mount the repo root at `host/skills/gstack` (e.g., codex's `.agents/skills/gstack`) get skipped instead of double-counted. +- `test/skill-validation.test.ts`: known-large-fixture exemption keeps `browse/test/fixtures/security-bench-haiku-responses.json` (27 MB BrowseSafe-Bench replay fixture, intentional) out of the size warning. + +#### Removed + +- `test/helpers/plan-mode-helpers.ts`: superseded by `claude-pty-runner.ts`. Zero callers remained after the rewrite. #### For contributors -- `browse/src/domain-skills.ts` is the storage layer. Tests in `browse/test/domain-skills-storage.test.ts` lock in the state machine. -- Adding a CDP method: edit `browse/src/cdp-allowlist.ts`, add `{domain, method, scope, output, justification}`. The `cdp-allowlist.test.ts` linter enforces all four fields. -- The full review trail (CEO + DevEx + Eng + Codex) is in `~/.claude/plans/system-instruction-you-are-working-drifting-alpaca.md` for posterity. +- `test/helpers/touchfiles.ts`: 5 plan-mode test selections + e2e-harness-audit selection now point at `claude-pty-runner.ts` instead of the deleted helper. 6 new entries (`ask-user-question-format-pty`, `plan-ceo-mode-routing`, `plan-design-with-ui-scope`, `budget-regression-pty`, `ship-idempotency-pty`, `autoplan-chain-pty`) with tier classifications: 3 gate, 3 periodic. 
+- `test/e2e-harness-audit.test.ts`: recognizes `runPlanSkillObservation` as a valid coverage path alongside the legacy `canUseTool` / `runPlanModeSkillTest` patterns. +- New unit test: `test/gen-skill-docs.test.ts` asserts plan-review preambles stay under 33 KB and the slim Voice section preserves its load-bearing semantic contract (lead-with-the-point, name-the-file, user-outcome framing, no-corporate, no-AI-vocab, user-sovereignty). +- `test/touchfiles.test.ts`: skill-specific change selection count updated 15 → 18 to match the 6 new touchfile entries that depend on `plan-ceo-review/**`. -## [1.7.0.0] - 2026-04-22 +## [1.14.0.0] - 2026-04-25 + +## **The gstack browser sidebar is now an interactive Claude Code REPL with live tab awareness.** + +Open the side panel and Claude Code is right there in a real terminal. Type, watch the agent work, switch browser tabs and Claude sees the change. The old one-shot chat queue is gone. Two-way conversation, slash commands, `/resume`, ANSI colors, all of it. Plus a `$B tab-each` command that fans out a single browse command across every open tab and returns per-tab JSON results. + +### The numbers that matter + +| Metric | Before | After | Δ | +|---|---|---|---| +| Sidebar surfaces | Chat (one-shot `claude -p`) + 3 debug | Terminal (live PTY) + 3 debug | -1 surface, +interactive | +| Subprocesses spawned per session | Many (one per chat message) | One (PTY claude, lazy-spawned) | -N | +| Lines in `extension/sidepanel.js` | 1969 | 1042 | -47% | +| Total diff | — | 27 files, +2875 / -3885 | -1010 net | +| New unit + integration + regression tests | 0 | 56+ | +56 | +| Live `tabs.json` push latency | n/a (no live state) | <50ms after `chrome.tabs` event | new capability | + +### What this means for builders + +Open the sidebar, type. Real PTY means slash commands, `/resume`, real ANSI rendering, real claude process lifecycle. 
Switch browser tabs while Claude is running and `/tabs.json` + `active-tab.json` update in place — Claude reads them, no need to ask `$B tabs`. Need to do the same thing on every tab? `$B tab-each ` returns a JSON array, original active tab restored when done, no OS focus stealing. + +The old chat queue is gone. `sidebar-agent.ts`, `/sidebar-command`, `/sidebar-chat`, `/sidebar-agent/event` all deleted. The Cleanup / Screenshot / Cookies toolbar buttons survive in the Terminal pane — Cleanup pipes its prompt straight into the live PTY via `window.gstackInjectToTerminal()` instead of spawning yet another `claude -p`. + +### Itemized changes + +#### Added + +- **Interactive Terminal sidebar tab.** xterm.js + a non-compiled `terminal-agent.ts` Bun process that spawns claude with `Bun.spawn({terminal: {rows, cols, data}})`. Auto-connects when the side panel opens, no keypress needed. +- **`$B tab-each `** — fan-out helper for multi-tab work. Returns `{command, args, total, results: [{tabId, url, title, status, output}]}`. Skips chrome:// pages, scope-checks the inner command before iterating, restores the original active tab in a `finally` block, never pulls focus away from the user's foreground app. +- **Live tab state files.** `/tabs.json` (full list with id, url, title, active, pinned, audible, windowId) and `/active-tab.json` (current active). Updated atomically on every `chrome.tabs` event (activated, created, removed, URL/title change). Claude reads on demand instead of running `$B tabs`. +- **Tab-awareness system prompt** injected via `claude --append-system-prompt` at spawn so the model knows about the state files and the `$B tab-each` command without being told. +- **Always-visible Restart button** in the Terminal toolbar. Force-restart claude any time, not just from the "session ended" state. + +#### Changed +- **Sidebar is Terminal-only.** No more `Terminal | Chat` primary tab nav. 
Activity / Refs / Inspector still live behind the `debug` toggle in the footer. Quick-actions (🧹 Cleanup / 📸 Screenshot / 🍪 Cookies) moved into the Terminal toolbar. +- **WebSocket auth uses `Sec-WebSocket-Protocol`** instead of cookies. Browsers can't set `Authorization` on WS upgrades, and `SameSite=Strict` cookies don't survive the cross-port jump from server.ts:34567 to the agent's random port from a chrome-extension origin. The token rides on `new WebSocket(url, [`gstack-pty.`])` and the agent echoes the protocol back (Chromium closes connections that don't pick a protocol). +- **Cleanup button now drives the live PTY.** Clicking "🧹 Cleanup" injects the cleanup prompt straight into claude via `window.gstackInjectToTerminal()`. The Inspector "Send to Code" action uses the same path. No more `/sidebar-command` POSTs. +- **Repaint after debug-tab close.** xterm.js doesn't auto-redraw when its container flips from `display: none` back to `display: flex`. A MutationObserver on `#tab-terminal`'s class attribute now forces a `fitAddon.fit() + term.refresh() + resize` push when the pane becomes visible. + +#### Removed +- **`browse/src/sidebar-agent.ts`** — the one-shot `claude -p` queue worker. ~900 lines. +- **Server endpoints**: `/sidebar-command`, `/sidebar-chat[/clear]`, `/sidebar-agent/{event,kill,stop}`, `/sidebar-tabs[/switch]`, `/sidebar-session{,/new,/list}`, `/sidebar-queue/dismiss`. ~600 lines. +- **Chat-related state** in server.ts: `ChatEntry`, `SidebarSession`, `TabAgentState`, `pickSidebarModel`, `addChatEntry`, `processAgentEvent`, `killAgent`, the agent-health watchdog, `chatBuffer`, the per-tab agent map. +- **Chat UI in sidepanel.html**: primary-tab nav, `
`, the chat input bar, the experimental "Browser co-pilot" banner, the security event banner, the `clear-chat` footer button. +- **Five obsolete test files**: `sidebar-agent.test.ts`, `sidebar-agent-roundtrip.test.ts`, `security-e2e-fullstack.test.ts`, `security-review-fullstack.test.ts`, `security-review-sidepanel-e2e.test.ts`. Plus 5 chat-only describe blocks inside surviving security tests (loadSession session-ID validation, switchChatTab DocumentFragment, pollChat reentrancy, sidebar-tabs URL sanitization, agent queue security). + +#### For contributors +- **`browse/src/pty-session-cookie.ts`** mirrors `sse-session-cookie.ts`. Same TTL, same opportunistic pruning, separate registry (PTY tokens must never be valid as SSE tokens or vice versa). +- **`docs/designs/SIDEBAR_MESSAGE_FLOW.md`** rewritten around the Terminal flow: WebSocket upgrade, dual-token model (`AUTH_TOKEN` for `/pty-session`, `gstack-pty.` for `/ws`, `INTERNAL_TOKEN` for server↔agent loopback), threat-model boundary (Terminal tab bypasses the prompt-injection stack on purpose; user keystrokes are the trust source). +- **`browse/test/terminal-agent.test.ts`** (16 tests) + `terminal-agent-integration.test.ts` (real `/bin/bash` PTY round-trip, raw `Sec-WebSocket-Protocol` upgrade verification) + `tab-each.test.ts` (10 tests with mock `BrowserManager`) + `sidebar-tabs.test.ts` (27 structural assertions locking the chat-rip invariants). +- **CLAUDE.md** updated with the dual-token model, the cookie-vs-protocol rationale, and the cross-pane injection pattern. +- **`vendor:xterm`** build step copies `xterm@5.x` and `xterm-addon-fit` from `node_modules/` into `extension/lib/` at build time. xterm files are gitignored. +- **TODOS.md** carries three v1.1+ follow-ups: PTY session survival across sidebar reload (Issue 1C deferred), `/health` `AUTH_TOKEN` distribution audit (codex finding, pre-existing soft leak), and dropping the now-dead `security-classifier.ts` ML pipeline. 
+ +## [1.13.0.0] - 2026-04-25 + +## **`/gstack-claude` gives non-Claude hosts a read-only outside voice.** + +This release adds the reverse of `/codex`: external hosts can now ask Claude for review, adversarial challenge, or read-only consultation without handing nested Claude mutation tools. + +### Added + +- `claude/SKILL.md.tmpl`: new external-only `/gstack-claude` skill with `review`, `challenge`, and `consult` modes. +- Review and challenge mode feed the detected base-branch diff to `claude -p --tools ""` with `--disable-slash-commands`. +- Consult mode allows only `Read,Grep,Glob`, explicitly disallows `Bash,Edit,Write`, saves `.context/claude-session-id`, and can resume the prior consult session. +- Claude prompt transport now uses a `/tmp/gstack-claude-prompt-*` file piped over stdin with cleanup. +- Auth checks require the `claude` CLI plus either `~/.claude/.credentials.json` or `ANTHROPIC_API_KEY`. +- JSON output parsing extracts `result`, `usage`, `model`, `session_id`, and `is_error`. + +### Fixed + +- `hosts/claude.ts`: excludes the Claude outside-voice skill from Claude-host generation. +- `test/brain-sync.test.ts`: the `GSTACK_HOME` isolation test now snapshots and preserves the real config file instead of assuming local machine state. +- `claude/SKILL.md.tmpl`: uses `mktemp` for diff capture in review/challenge mode instead of a `$$`-based temp path, avoiding collisions across concurrent invocations. + +### Changed + +- `test/skill-validation.test.ts`: the tracked-file-size check is now advisory. Large fixtures remain allowed in git and are reported as `[size-warning]` instead of failing the suite. +- `test/gen-skill-docs.test.ts`: generation coverage now asserts external host docs include `gstack-claude/SKILL.md` while Claude host output omits `claude/SKILL.md`. + +## [1.12.2.0] - 2026-04-24 + +## **`/setup-gbrain` polish: PATH parsing, repo init order, MCP user scope.** + +Small refinements to the /setup-gbrain onboarding path. 
+ +### Fixed +- `bin/gstack-gbrain-install`: parse `gbrain --version` output with `awk '{print $NF}'` so the D19 PATH-shadow check compares just the version number. +- `bin/gstack-brain-init`: omit `--source` from `gh repo create`. Later steps handle `git init` + remote setup explicitly. +- `setup-gbrain` Step 9: smoke test uses `gbrain put ` with body piped on stdin. +- `setup-gbrain` Step 5a: MCP registers with `--scope user` and an absolute path to the gbrain binary, so `mcp__gbrain__*` tools are available in every Claude Code session on the machine. + +### Changed +- `test/gstack-brain-init-gh-mock.test.ts`: asserts `--source` is absent from the `gh repo create` call. + +## [1.12.1.0] - 2026-04-24 + +## **Plan-mode review skills run the review directly, no more "exit and rerun" prompt.** + +Before this release, `/plan-eng-review` (and the three other `interactive: true` review skills) greeted plan-mode users with an A/B/C handshake asking them to exit plan mode and rerun, or cancel. That handshake was vestigial: the preamble already contains an authoritative "Skill Invocation During Plan Mode" rule saying AskUserQuestion satisfies plan mode's end-of-turn requirement. Two contradictory rules, the bossy one at the top won, the review never ran. This release deletes the bossier rule and hoists the correct one to position 1 of the preamble so skills run straight through. + +### What shipped + +The vestigial `scripts/resolvers/preamble/generate-plan-mode-handshake.ts` resolver is deleted. The "Plan Mode Safe Operations" and "Skill Invocation During Plan Mode" blocks are split out of `generate-completion-status.ts` into a sibling `generatePlanModeInfo()` export in the same module, then wired at preamble position 1 where the handshake used to live. The "you see this first" positioning stays; only the content changes. Four dead plan-mode-handshake question-registry IDs are removed. 
The `interactive: true` frontmatter flag stays on the four review skill templates because `test/e2e-harness-audit.test.ts` reads it to classify which skills must have `canUseTool` coverage, per codex outside-voice review. + +The four per-skill plan-mode E2E tests are rewritten as smoke tests that assert Step 0's actual scope-mode question fires (not an A/B/C handshake), no Write/Edit before the first AskUserQuestion, and no early `ExitPlanMode`. The write-guard helper from the old `plan-mode-handshake-helpers.ts` is preserved in the renamed `plan-mode-helpers.ts` so silent-bypass regressions still get caught. `test/skill-e2e-plan-mode-no-op.test.ts` is kept for the opposite coverage case: the plan-mode-info block stays quiet outside plan mode. `test/gen-skill-docs.test.ts` now scans every generated `SKILL.md` across all 9 host subdirs (`.agents/`, `.openclaw/`, `.kiro/`, etc.) and asserts `## Plan Mode Handshake` is absent. That's a sub-second unit gate blocking any future PR from re-introducing the resolver. + +### The numbers that matter + +Source: `bun test` on HEAD against the pre-change baseline. 
+ +| Metric | Before | After | Δ | +|---|---|---|---| +| Preamble resolvers | 19 (handshake + completion-status) | 18 (completion-status owns both functions) | -1 module | +| Handshake lines in generated SKILL.md | 92 per skill × 4 skills = 368 | 0 | -368 | +| Question-registry entries | 51 | 47 | -4 dead entries | +| Plan-mode gate-tier tests | 5 handshake-asserting | 5 smoke + no-op + write-guard | same count, stronger assertions | +| Multi-host handshake-absence unit test | none | 1 (scans 9 host dirs, <1s) | new regression gate | +| `bun test` on changed files | 360 gen-skill-docs pass | 360 gen-skill-docs pass | no regression | + +The preamble position for the new `## Skill Invocation During Plan Mode` section lands at line ~127 of every `plan-*-review/SKILL.md` (first ~15% of the file), before the upgrade check and onboarding gates, so the authoritative plan-mode rule is the first thing the model reads after bash env setup. + +### What this means for plan-mode users + +Invoke `/plan-eng-review` from plan mode. You get the scope-mode question (`SCOPE EXPANSION` / `SELECTIVE EXPANSION` / `HOLD SCOPE` / `SCOPE REDUCTION`) immediately, the review runs, each finding gets its own `AskUserQuestion`, `ExitPlanMode` fires at the end. No two-step "exit and rerun" friction. Same for `/plan-ceo-review`, `/plan-design-review`, `/plan-devex-review`. + +### Itemized changes + +#### Fixed + +- `/plan-eng-review`, `/plan-ceo-review`, `/plan-design-review`, `/plan-devex-review` no longer show an A/B/C handshake prompt when invoked in plan mode. Each skill runs its interactive review directly, with every finding gated by `AskUserQuestion` just like outside plan mode. + +#### Changed + +- The "Plan Mode Safe Operations" and "Skill Invocation During Plan Mode" preamble sections are now emitted at position 1 (right after the bash env setup) instead of at the tail of the completion-status block. 
All skills see these two sections earlier in the preamble; nothing else changes about the content. +- `test/helpers/plan-mode-handshake-helpers.ts` is renamed to `test/helpers/plan-mode-helpers.ts`. The exported API is renamed from `runPlanModeHandshakeTest` to `runPlanModeSkillTest` and from `assertHandshakeShape` to `assertNotHandshakeShape`. The write-guard detection (no `Write`/`Edit` tool call before the first `AskUserQuestion`) is preserved and extended with `ExitPlanMode`-before-ask detection. + +#### Removed + +- `scripts/resolvers/preamble/generate-plan-mode-handshake.ts` deleted (vestigial, superseded by `generatePlanModeInfo` in `generate-completion-status.ts`). +- Four question-registry entries removed from `scripts/question-registry.ts`: `plan-ceo-review-plan-mode-handshake`, `plan-eng-review-plan-mode-handshake`, `plan-design-review-plan-mode-handshake`, `plan-devex-review-plan-mode-handshake`. These IDs are no longer emitted by any skill; keeping them in the registry was dead weight. + +#### For contributors + +- `test/gen-skill-docs.test.ts` now has a "plan-mode-info resolver" describe block that (a) scans every generated `SKILL.md` under the repo root plus every host subdir (`.agents/`, `.openclaw/`, `.opencode/`, `.factory/`, `.hermes/`, `.kiro/`, `.cursor/`, `.slate/`) and asserts `## Plan Mode Handshake` is absent, and (b) asserts `## Skill Invocation During Plan Mode` lands in the first 15,000 bytes of each of the four review skills' generated `SKILL.md`. Both assertions run on every `bun test`. Any PR that re-introduces the handshake resolver fails CI immediately. +- The `interactive: true` frontmatter flag on the four review skill templates is preserved. It still has a reader: `test/e2e-harness-audit.test.ts` uses it to enforce `canUseTool` coverage on interactive review E2E tests. Removing the flag was part of the initial plan; codex outside-voice review caught the downstream dependency during review and that decision was reversed. 
+ +## [1.12.0.0] - 2026-04-24 + +## **`/setup-gbrain` — any coding agent goes from zero to "gbrain is running, and I can call it" in under five minutes.** + +gstack v1.9.0.0 shipped `gbrain-sync`, which assumed a `gbrain` CLI was already installed. That was fine on Garry's machine (he'd manually cloned `~/git/gbrain`), broken for everyone else. This release closes the onboarding gap: one skill, three paths (local PGLite, existing Supabase URL, or Supabase auto-provision via the Management API), an MCP registration step for Claude Code, a per-remote trust triad (read-write / read-only / deny) so multi-client consultants don't mingle brains, and a reusable secret-sink test harness other skills can import when they start handling secrets. + +### What shipped + +Six new `bin/` helpers and one new skill template. `bin/gstack-gbrain-repo-policy` stores per-remote ingest tiers at `~/.gstack/gbrain-repo-policy.json` with a `_schema_version: 2` field so future migrations are deterministic (the first one — legacy `allow` → `read-write` — already runs on first read of any pre-D3 file). `bin/gstack-gbrain-detect` emits the full state as JSON so the skill can skip steps that are already done. `bin/gstack-gbrain-install` probes `~/git/gbrain` and `~/gbrain` before cloning fresh (fixes the day-one dup-clone footgun on the author's own machine) and fails hard on PATH shadowing with a three-option remediation menu instead of warn-and-continue. `bin/gstack-gbrain-lib.sh` extracts the `read_secret_to_env` helper used for both PAT collection and pooler-URL paste — one canonical implementation of the stty-echo-off + SIGINT-restore + env-var-only pattern. `bin/gstack-gbrain-supabase-verify` rejects direct-connection URLs (IPv6-only, fails in most environments) with exit code 3 so the caller's retry UX is distinct from a generic format error. 
`bin/gstack-gbrain-supabase-provision` wraps the Management API — list-orgs, create, poll, pooler-url, list-orphans, delete-project — with full HTTP error coverage (401/403/402/409/429/5xx), exponential backoff, and `--cleanup-orphans` support for the rare case where someone kills setup mid-provision. + +The skill template itself threads these together into a single interactive flow. PAT collection shows the full scope disclosure verbatim before the read-s prompt, explains that the token grants access to every project in the user's Supabase account, and emits a revocation reminder at the end. Path 1's pooler-URL paste gets the same hygiene plus a redacted preview (host / port / database visible, password masked). Switching between engines wraps `gbrain migrate` in `timeout 180s` with an actionable message on deadlock. Concurrent-run protection via `mkdir ~/.gstack/.setup-gbrain.lock.d`. Telemetry records scenario, install result, MCP opt-in, trust tier — all enumerated categorical values, never free-form strings that could leak secrets. + +`/health` gets a new GBrain dimension (weight 10%, wrapped in `timeout 5s`) alongside type-check / lint / tests / dead-code / shell-linter. The dimension is omitted — not red — when gbrain isn't installed, so running `/health` on a non-gbrain machine doesn't penalize that choice. + +`test/helpers/secret-sink-harness.ts` is new infrastructure. Runs a subprocess with a seeded secret, captures stdout / stderr / files-under-HOME / telemetry-JSONL, and asserts the seed never appears in any channel via four match rules (exact + URL-decoded + first-12-char prefix + base64). Seven positive-control tests prove the harness catches leaks in every covered channel; four negative controls run real setup-gbrain bins with seeded secrets and confirm nothing escapes. Any future skill that handles secrets can import `runWithSecretSink` and run the same pattern. 
+ +### The numbers that matter + +Source: `bun test` against Slices 1–7's five new test files. + +| Suite | Tests | Time | +|---|---|---| +| `gbrain-repo-policy.test.ts` | 24 | ~1.2s | +| `gbrain-detect-install.test.ts` | 15 | ~1.0s | +| `gbrain-lib-verify.test.ts` | 22 | ~0.2s | +| `gbrain-supabase-provision.test.ts` | 28 | ~13.8s | +| `secret-sink-harness.test.ts` | 11 | ~7.0s | +| **Total** | **100** | **~23s** | + +Every HTTP error path for the Supabase Management API is covered by a mock-server fixture. Every secret-bearing bin is exercised with a distinctive seed through the leak harness. + +### What this means for Claude Code users + +Previously: install gbrain manually, hope nothing was shadowing on PATH, paste the pooler URL into an echoing prompt, figure out MCP registration yourself. Now: one command, three paths, PAT-handled-correctly auto-provision, MCP registered for Claude Code automatically, trust tiers for multi-client work, leak-tested end-to-end. Run `/setup-gbrain`. + +### Itemized changes + +#### Added +- `/setup-gbrain` skill (`setup-gbrain/SKILL.md.tmpl`) — full onboarding flow with path selection, PAT-scoped disclosure, redacted URL preview, concurrent-run lock, SIGINT recovery with `--resume-provision`, and `--cleanup-orphans` subcommand. +- `bin/gstack-gbrain-repo-policy` — per-remote trust triad (read-write / read-only / deny), schema-versioned file format, atomic writes, corrupt-file quarantine. +- `bin/gstack-gbrain-detect` — JSON state reporter for skill branching. +- `bin/gstack-gbrain-install` — D5 detect-first installer, D19 PATH-shadow fail-hard validator, pinned gbrain commit. +- `bin/gstack-gbrain-lib.sh` — shared `read_secret_to_env` bash helper. +- `bin/gstack-gbrain-supabase-verify` — structural URL validator with distinct exit for direct-connection rejects. 
+- `bin/gstack-gbrain-supabase-provision` — Management API wrapper (list-orgs / create / wait / pooler-url / list-orphans / delete-project) with full HTTP error coverage and retry+backoff. +- `test/helpers/secret-sink-harness.ts` — reusable negative-space leak-testing harness. + +#### Changed +- `/health` skill adds a GBrain composite dimension (weight 10%, wrapped in `timeout 5s`). Existing category weights rebalanced to keep the composite score on the 0–10 scale; historical JSONL entries without a `gbrain` field read as `null` for trend comparison. + +#### For contributors +- Pre-Impl Gate 1 verified Supabase Management API shape before any code was written. Corrected two wrong endpoint assumptions (`POST /v1/projects` not `/v1/organizations/{ref}/projects`; `/config/database/pooler` not `/config/database`) and confirmed gbrain's `--non-interactive` + `GBRAIN_DATABASE_URL` env var are real. Documented in the plan file. +- Review discipline: CEO review + Codex outside voice + Eng review all passed in plan mode before any code landed (3 reviews, 21 D-decisions, 0 unresolved gaps). + +## [1.11.1.0] - 2026-04-23 + +## **Plan mode stopped silently rubber-stamping your reviews. The forcing questions actually fire now.** + +If you ran `/plan-ceo-review` or any interactive review skill while in plan mode, the skill used to read your diff, skip every STOP gate, write a plan file, and exit. Zero AskUserQuestion calls. Zero mode selection. Zero per-section decisions. The skill's interactive contract got outranked by plan mode's system-reminder, which tells the model to run its own workflow and ignore everything else. This release adds a preamble-level STOP gate that fires before any analysis, so you always get the interactive review the skill was designed to run. 
+ +### What shipped + +Four interactive review skills (plan-ceo-review, plan-eng-review, plan-design-review, plan-devex-review) now emit a two-option AskUserQuestion the moment plan mode is detected: exit-and-rerun interactively, or cancel. No silent bypass. The gate is classified one-way-door in the question registry so `/plan-tune` preferences can't auto-decide past it. Outcome gets logged to `~/.gstack/analytics/skill-usage.jsonl` synchronously when the handshake fires, so A-exit and C-cancel are captured even though they terminate the skill before the end-of-run telemetry block. + +The test harness got a canUseTool extension built on Anthropic's Agent SDK (already installed at v0.2.117). When a test supplies a canUseTool callback, `test/helpers/agent-sdk-runner.ts` flips `permissionMode` from `bypassPermissions` to `default` so the callback actually fires. This is the foundation for asserting AskUserQuestion content end-to-end, which gstack's E2E tests previously couldn't do at all. They had to instruct the model to skip AskUserQuestion entirely. Every future interactive-skill test builds on this. + +### The numbers that matter + +Source: new unit tests in `test/gen-skill-docs.test.ts` (8 tests covering handshake presence, absence, composition ordering, 0C-bis STOP block) and `test/agent-sdk-runner.test.ts` (6 tests covering canUseTool + permission-mode + passThrough helper). All 14 pass locally in <250ms, free tier. 
+ +| Surface | Before | After | +|---|---|---| +| Claude skills rendering the handshake | 0 | 4 (plan-ceo, plan-eng, plan-design, plan-devex) | +| Non-Claude host outputs with handshake text | N/A | 0 (host-scoped via `ctx.host === 'claude'` check) | +| E2E tests that can assert AskUserQuestion content | 0 | 1 harness primitive, ready for every interactive skill | +| Plan-mode entry to any of 4 review skills | Silent bypass | Two-option STOP gate | +| Step 0C-bis in plan-ceo-review | No STOP block, could drift to 0F | Explicit `**STOP.**` block matching 0F pattern | +| Post-handshake telemetry outcomes captured | Neither A-exit nor C-cancel | Both (synchronous write before ExitPlanMode) | + +### What this means for builders + +If you're running gstack in plan mode on a PR review, you'll see one question before the skill does anything: "Exit plan mode and run interactively, or cancel?" Pick A, press esc-esc, rerun the skill in normal mode, get the full interactive review you expected. Pick C to bail cleanly. No more silent rubber-stamp. + +If you're building new interactive skills (yours or contributing to gstack), you can now write real E2E tests that assert on AskUserQuestion shape and routing via the canUseTool harness. See `test/agent-sdk-runner.test.ts` for the pattern and `test/helpers/agent-sdk-runner.ts` for the API. + +### Itemized changes + +#### Fixed + +- Plan mode no longer silently skips AskUserQuestion gates in `/plan-ceo-review`, `/plan-eng-review`, `/plan-design-review`, or `/plan-devex-review`. A preamble-level handshake fires as the first thing the skill does when the plan-mode system-reminder is present, forcing a user choice before any analysis or plan-file writes. +- `/plan-ceo-review` Step 0C-bis now has an explicit STOP block matching the pattern used at Step 0F, so the approach-selection question can't be silently skipped when the skill continues to mode selection. 
+ +#### Added + +- New resolver `scripts/resolvers/preamble/generate-plan-mode-handshake.ts` emits the handshake prose and telemetry bash. Host-scoped to Claude only via `ctx.host === 'claude'` check. Opt-in per skill via `interactive: true` in frontmatter. +- New frontmatter field `interactive: boolean` on skill templates. Generator-only input parsed by `scripts/gen-skill-docs.ts`, never written to generated SKILL.md output (follows the `preamble-tier` precedent). +- New question registry entries `plan-{ceo,eng,design,devex}-review-plan-mode-handshake` with `door_type: 'one-way'` in `scripts/question-registry.ts`. Question-tuning `never-ask` preferences cannot suppress this gate. +- New telemetry field `plan_mode_handshake` in `~/.gstack/analytics/skill-usage.jsonl` with outcomes `fired`, `A-exit`, `C-cancel` written synchronously as the handshake fires. Captures outcomes that would otherwise terminate the skill before end-of-run telemetry runs. +- `test/helpers/agent-sdk-runner.ts` extended with optional `canUseTool` callback parameter. When supplied, flips `permissionMode` to `default`, auto-adds `AskUserQuestion` to `allowedTools`, and passes the callback to the SDK. Exports `passThroughNonAskUserQuestion` helper for tests that only want to assert on AskUserQuestion but auto-allow other tools. + +#### For contributors + +- Added 5 unit tests in `test/gen-skill-docs.test.ts` verifying handshake presence in 4 interactive skills, absence in non-interactive skills, absence in non-Claude host outputs, composition ordering (handshake precedes upgrade-check), and 0C-bis STOP block wiring. +- Added 6 unit tests in `test/agent-sdk-runner.test.ts` verifying permission-mode flip, allowedTools auto-injection, canUseTool callback propagation, and pass-through helper behavior. +- Added 6 gate-tier entries to `test/helpers/touchfiles.ts` covering the new E2E test surface. 
Dependency glob fires any of the new tests when: the relevant skill template, the handshake resolver, preamble composition, the question registry, the one-way-door classifier, or the agent-sdk-runner changes. +- Filed 2 P1/P2 follow-ups in `TODOS.md`: structural STOP-Ask forcing function across all skills (broader class of bug beyond plan-mode entry), and extending `interactive: true` audit to non-review interactive skills like `/office-hours`, `/codex`, `/investigate`, `/qa`. + +## [1.11.0.0] - 2026-04-23 + +## **Workspace-aware ship. Two open PRs can't both claim the same VERSION anymore.** + +If you run gstack in multiple Conductor windows at once, you've probably seen this: two branches bump to the same version, whoever merges second silently overwrites the first one's CHANGELOG entry or lands with a duplicate header, and nobody notices until a `grep "^## \["` later. This release makes that collision impossible by construction. `/ship` now queries the open PR queue, sees what versions are already claimed, and picks the next free slot at your chosen bump level. If a collision is detected between ship and land, the land step aborts and tells you to rerun `/ship` rather than silently overwriting. A new `/landing-report` command shows the whole queue on demand. + +### What changes for you + +Run `/ship` in one Conductor window while another has an open PR claiming v1.7.0.0. Your ship now sees the claim, renders a queue table, and picks the next free slot above it (same bump level). The PR title starts with `v<X.Y.Z.W>` so landing order is visible in `gh pr list` without opening each PR. If a sibling workspace has uncommitted work at a higher VERSION and looks active (commit in the last 24h), `/ship` asks whether to wait for them or advance past. If the queue shifts between ship and merge, CI's new version-gate catches it, and rerunning `/ship` rewrites VERSION, package.json, CHANGELOG, and the PR title atomically.
This very release dogfooded the drift path: the original ship at v1.8.0.0 went stale when three other PRs landed first, and the merge-back-to-main rebump (v1.8.0.0 → v1.11.0.0) happened via the same queue-aware codepath it introduces. + +### What shipped (by the numbers) + +- `bin/gstack-next-version` — ~390-line Bun/TS util. 21 passing fixture tests covering happy path, 8 collision scenarios, offline fallback, fork-PR filtering, sibling activity detection, self-PR auto-exclusion. +- Host parity: GitHub + GitLab both supported. CI gates: `.github/workflows/version-gate.yml`, `.github/workflows/pr-title-sync.yml`, plus `.gitlab-ci.yml` mirror. +- Fail-open semantics on util errors (network, auth, bug). A gstack bug never freezes your merge queue. Fail-closed on confirmed collisions. +- `/landing-report` skill — read-only dashboard showing queue, siblings, and what all four bump levels would claim. +- `workspace_root` config key, default `$HOME/conductor/workspaces`, null disables sibling scan for non-Conductor users. + +### What this means for teams running parallel workspaces + +If you're routinely running 3-10 Conductor windows against the same repo, this is the capability that lets the model scale. Before: you mostly got away with it because you noticed collisions by eye. After: the queue is an observable surface, and the system refuses to ship a stale version. `/landing-report` is the new "where am I in line" check when you're about to open PR #6 for the day. Run it before `/ship` if you want to see what's coming without shipping. + +### Itemized changes + +#### Added + +- `bin/gstack-next-version`. Host-aware (GitHub + GitLab + unknown) VERSION allocator. Queries open PRs, fetches each PR's VERSION at head (bounded concurrency, 10 parallel), scans sibling Conductor worktrees, picks the next free slot. Pure reader, never writes files. Supports `--exclude-pr <N>` to filter out the PR being checked (prevents self-reference when CI runs against the PR's own VERSION).
+- `scripts/detect-bump.ts`, `scripts/compare-pr-version.ts`. CI gate helpers. Three exit paths: pass, block on confirmed collision, fail-open on util errors. +- `.github/workflows/version-gate.yml`. Merge-time collision gate. Runs when VERSION/CHANGELOG/package.json changes on a PR. +- `.github/workflows/pr-title-sync.yml`. Auto-rewrites PR title when VERSION changes on push, only for titles already carrying the `v<X.Y.Z.W>` prefix (custom titles left alone, idempotent). +- `.gitlab-ci.yml`. GitLab CI parity. Both jobs mirrored with the same fail-open semantics. +- `landing-report/SKILL.md.tmpl`. New `/landing-report` or `/gstack-landing-report` skill. Read-only dashboard. +- `bin/gstack-config`. New `workspace_root` key. Default `$HOME/conductor/workspaces`, `null` disables sibling scan. + +#### Changed + +- `ship/SKILL.md.tmpl` Step 12. Queue-aware VERSION pick in FRESH path, drift detection in ALREADY_BUMPED path. On detected drift the user is prompted to rebump, which runs the full metadata path (VERSION + package.json + CHANGELOG header + PR title) atomically so nothing goes stale. +- `ship/SKILL.md.tmpl` Step 19. PR title format is now `v<X.Y.Z.W> <type>: <summary>`, version ALWAYS first. Rerun path updates the title (not just the body) when VERSION changed. Both GitHub and GitLab paths. +- `land-and-deploy/SKILL.md.tmpl`. New Step 3.4 pre-merge drift detection. Aborts with a clear rerun-/ship instruction rather than auto-mutating files. Rerunning `/ship` is the clean path because ship owns the full metadata flow. +- `review/SKILL.md.tmpl`. New Step 3.4 advisory one-liner showing queue status. Non-blocking. +- `CLAUDE.md`. Versioning invariant paragraph. Documents that VERSION is a monotonic sequence, not a strict semver commitment, and queue-advance within a bump level is permitted. + +#### Fixed + +- Self-reference bug in the version gate.
The first live CI run (PR #1168 at v1.8.0.0) was rejected as "stale" because the util counted the PR being checked as a queued claim, inflating the next slot by one. Fixed with `--exclude-pr` flag + `gh pr view` auto-detect so the util silently filters the current branch's PR. Caught and fixed in the same ship — exactly the dogfood loop the release is designed for. + +#### For contributors + +- `test/gstack-next-version.test.ts`. 21 pure-function tests (parseVersion / bumpVersion / cmpVersion / pickNextSlot with 8 collision scenarios / markActiveSiblings 4 cases) plus a CLI smoke test against the live repo. +- Golden ship fixtures refreshed for all three hosts (claude, codex, factory) after Step 12 and Step 19 template changes. This is exactly the blast radius Codex flagged during the CEO review (cross-model tension #8), handled in the same PR rather than as a follow-up. + +## **Plan mode stopped silently rubber-stamping your reviews. The forcing questions actually fire now.** + +If you ran `/plan-ceo-review` or any interactive review skill while in plan mode, the skill used to read your diff, skip every STOP gate, write a plan file, and exit. Zero AskUserQuestion calls. Zero mode selection. Zero per-section decisions. The skill's interactive contract got outranked by plan mode's system-reminder, which tells the model to run its own workflow and ignore everything else. This release adds a preamble-level STOP gate that fires before any analysis, so you always get the interactive review the skill was designed to run. + +### What shipped + +Four interactive review skills (plan-ceo-review, plan-eng-review, plan-design-review, plan-devex-review) now emit a two-option AskUserQuestion the moment plan mode is detected: exit-and-rerun interactively, or cancel. No silent bypass. The gate is classified one-way-door in the question registry so `/plan-tune` preferences can't auto-decide past it. 
Outcome gets logged to `~/.gstack/analytics/skill-usage.jsonl` synchronously when the handshake fires, so A-exit and C-cancel are captured even though they terminate the skill before the end-of-run telemetry block. + +The test harness got a canUseTool extension built on Anthropic's Agent SDK (already installed at v0.2.117). When a test supplies a canUseTool callback, `test/helpers/agent-sdk-runner.ts` flips `permissionMode` from `bypassPermissions` to `default` so the callback actually fires. This is the foundation for asserting AskUserQuestion content end-to-end, which gstack's E2E tests previously couldn't do at all. They had to instruct the model to skip AskUserQuestion entirely. Every future interactive-skill test builds on this. + +### The numbers that matter + +Source: new unit tests in `test/gen-skill-docs.test.ts` (8 tests covering handshake presence, absence, composition ordering, 0C-bis STOP block) and `test/agent-sdk-runner.test.ts` (6 tests covering canUseTool + permission-mode + passThrough helper). All 14 pass locally in <250ms, free tier. + +| Surface | Before | After | +|---|---|---| +| Claude skills rendering the handshake | 0 | 4 (plan-ceo, plan-eng, plan-design, plan-devex) | +| Non-Claude host outputs with handshake text | N/A | 0 (host-scoped via `ctx.host === 'claude'` check) | +| E2E tests that can assert AskUserQuestion content | 0 | 1 harness primitive, ready for every interactive skill | +| Plan-mode entry to any of 4 review skills | Silent bypass | Two-option STOP gate | +| Step 0C-bis in plan-ceo-review | No STOP block, could drift to 0F | Explicit `**STOP.**` block matching 0F pattern | +| Post-handshake telemetry outcomes captured | Neither A-exit nor C-cancel | Both (synchronous write before ExitPlanMode) | + +### What this means for builders + +If you're running gstack in plan mode on a PR review, you'll see one question before the skill does anything: "Exit plan mode and run interactively, or cancel?" 
Pick A, press esc-esc, rerun the skill in normal mode, get the full interactive review you expected. Pick C to bail cleanly. No more silent rubber-stamp. + +If you're building new interactive skills (yours or contributing to gstack), you can now write real E2E tests that assert on AskUserQuestion shape and routing via the canUseTool harness. See `test/agent-sdk-runner.test.ts` for the pattern and `test/helpers/agent-sdk-runner.ts` for the API. + +### Itemized changes + +#### Fixed + +- Plan mode no longer silently skips AskUserQuestion gates in `/plan-ceo-review`, `/plan-eng-review`, `/plan-design-review`, or `/plan-devex-review`. A preamble-level handshake fires as the first thing the skill does when the plan-mode system-reminder is present, forcing a user choice before any analysis or plan-file writes. +- `/plan-ceo-review` Step 0C-bis now has an explicit STOP block matching the pattern used at Step 0F, so the approach-selection question can't be silently skipped when the skill continues to mode selection. + +#### Added + +- New resolver `scripts/resolvers/preamble/generate-plan-mode-handshake.ts` emits the handshake prose and telemetry bash. Host-scoped to Claude only via `ctx.host === 'claude'` check. Opt-in per skill via `interactive: true` in frontmatter. +- New frontmatter field `interactive: boolean` on skill templates. Generator-only input parsed by `scripts/gen-skill-docs.ts`, never written to generated SKILL.md output (follows the `preamble-tier` precedent). +- New question registry entry `plan-mode-handshake` with `door_type: 'one-way'` in `scripts/question-registry.ts`. Question-tuning `never-ask` preferences cannot suppress this gate. +- New telemetry field `plan_mode_handshake` in `~/.gstack/analytics/skill-usage.jsonl` with outcomes `fired`, `A-exit`, `C-cancel` written synchronously as the handshake fires. Captures outcomes that would otherwise terminate the skill before end-of-run telemetry runs. 
+- `test/helpers/agent-sdk-runner.ts` extended with optional `canUseTool` callback parameter. When supplied, flips `permissionMode` to `default`, auto-adds `AskUserQuestion` to `allowedTools`, and passes the callback to the SDK. Exports `passThroughNonAskUserQuestion` helper for tests that only want to assert on AskUserQuestion but auto-allow other tools. + +#### For contributors + +- Added 8 unit tests in `test/gen-skill-docs.test.ts` verifying handshake presence in 4 interactive skills, absence in non-interactive skills, absence in non-Claude host outputs, composition ordering (handshake precedes upgrade-check), and 0C-bis STOP block wiring. +- Added 6 unit tests in `test/agent-sdk-runner.test.ts` verifying permission-mode flip, allowedTools auto-injection, canUseTool callback propagation, and pass-through helper behavior. +- Added 6 gate-tier entries to `test/helpers/touchfiles.ts` covering the new E2E test surface. Dependency glob fires any of the new tests when: the relevant skill template, the handshake resolver, preamble composition, the question registry, the one-way-door classifier, or the agent-sdk-runner changes. +- Filed 2 P1/P2 follow-ups in `TODOS.md`: structural STOP-Ask forcing function across all skills (broader class of bug beyond plan-mode entry), and extending `interactive: true` audit to non-review interactive skills like `/office-hours`, `/codex`, `/investigate`, `/qa`. + +## [1.10.1.0] - 2026-04-23 + +## **We tried to make Opus 4.7 faster with a prompt. Measurement said it got slower. Pulled the bullet.** + +gstack shipped a "Fan out explicitly" overlay nudge in `model-overlays/opus-4-7.md` +back in v1.5.2.0. The idea: tell Opus 4.7 to emit multiple tool calls in one +assistant turn instead of one per turn, so "read three files" takes one API +round-trip instead of three. Sounded obvious. 
This release removes that +bullet after measuring that it actively hurt performance, and ships the eval +harness we used to prove it so you can measure your own overlay changes. + +### The numbers that matter + +Source: new `test/skill-e2e-overlay-harness.test.ts`, N=10 trials per arm per +fixture, 40 trials per run, ~$3 per run. Pinned to `claude-opus-4-7` via +Anthropic's published Agent SDK (`@anthropic-ai/claude-agent-sdk@0.2.117`) +with `pathToClaudeCodeExecutable` set to the locally-installed `claude` binary +(2.1.118). Metric: number of parallel `tool_use` blocks in the first assistant +turn. + +| Prompt text in overlay | First-turn fanout rate (toy: read 3 files) | Lift vs baseline | +|---|---|---| +| No overlay (default Claude Code system prompt only) | **70%** (7/10) | baseline | +| gstack's original "Fan out explicitly" nudge (v1.5.2.0 through v1.6.3.0) | 10% (1/10) | **-60%** | +| Anthropic's own canonical `<use_parallel_tool_calls>` text from their parallel-tool-use docs | **0%** (0/10) | **-70%** | + +On a realistic multi-file audit prompt (`read app.ts + config.ts + README.md, +glob src/*.ts, summarize`), Opus 4.7 never fanned out in the first turn at all, +regardless of overlay. Zero of 20 trials. The nudge had nothing to grip. + +Total cost of the investigation: **$7** across three eval runs. + +### What this means for you + +If you ship system-prompt nudges for Claude, measure them. Anthropic's own +published best-practice text dropped our fanout rate to zero. That's not a +claim about Anthropic, it's a claim about measurement: the model, the SDK, +the binary, and the context all move under the advice, and the advice sits +still. The harness is in the repo now. Run +`EVALS=1 EVALS_TIER=periodic bun test test/skill-e2e-overlay-harness.test.ts`. +Three dollars per run. + +### Itemized changes + +#### Fixed + +- `model-overlays/opus-4-7.md` — removed the "Fan out explicitly" block.
The + other three nudges (effort-match, batch questions, literal interpretation) + are untested and stay in for now. They're candidates for their own + measurement in a follow-up PR. + +#### Added + +- `test/skill-e2e-overlay-harness.test.ts` — periodic-tier eval that iterates a + typed fixture registry and runs A/B arms through `@anthropic-ai/claude-agent-sdk`. + Uses SDK preset `claude_code` so the arms include Claude Code's real system + prompt; overlay-ON appends the resolved overlay text. Saves per-trial raw + event streams for forensic recovery. Gated on both `EVALS=1` and + `EVALS_TIER=periodic`. +- `test/fixtures/overlay-nudges.ts` — typed `OverlayFixture` registry with + strict validator. Adding a future nudge to measure = one fixture entry. + First two fixtures: `opus-4-7-fanout-toy` and `opus-4-7-fanout-realistic`. +- `test/helpers/agent-sdk-runner.ts` — parametric SDK wrapper with explicit + `AgentSdkResult` types, process-level API concurrency semaphore, and + three-shape 429 retry (thrown error, result-message error, mid-stream + `SDKRateLimitEvent`). Binary pinning via `pathToClaudeCodeExecutable`. +- `test/agent-sdk-runner.test.ts` — 36 free-tier unit tests covering happy + path, all three rate-limit shapes, persistent-429 `RateLimitExhaustedError`, + non-429 propagation, options propagation, concurrency cap, and every + validator rejection case. +- `scripts/preflight-agent-sdk.ts` — 20-line sanity check that confirms the + SDK loads, `claude-opus-4-7` is a live API model, the `SDKMessage` event + shape matches assumptions, and the overlay resolver produces the expected + text. Run manually before paid runs if you suspect drift. Costs ~$0.013. +- `@anthropic-ai/claude-agent-sdk@0.2.117` in `devDependencies`. Exact pin, + no caret — SDK event shapes can drift on minor versions. 
+ +#### Changed + +- `scripts/resolvers/model-overlay.ts` — exported `readOverlay` so the eval + harness can resolve `{{INHERIT:claude}}` directives without synthesizing a + full `TemplateContext`. + +#### For contributors + +- `test/helpers/touchfiles.ts` — registered the new eval in both + `E2E_TOUCHFILES` (deps: `model-overlays/**`, `overlay-nudges.ts`, runner, + resolver) and `E2E_TIERS` (`periodic`). Passes the + `test/touchfiles.test.ts` completeness check. +- The harness is deliberately parametric. Adding a second overlay nudge + measurement (for the remaining three nudges in `opus-4-7.md`, or any + future nudge in any overlay file) is a single entry in + `test/fixtures/overlay-nudges.ts`. Total incremental effort: ~15 minutes + per fixture. + +## [1.10.0.0] - 2026-04-23 + +## **Plan reviews walk you through each issue again, and every question is now a real decision brief.** + +v1.6.4.0 broke something nobody wrote down. Plan reviews on Opus 4.7 silently stopped asking questions one at a time. They turned into a report: here are 6 findings, end of turn. The interactive dialogue that made `/plan-ceo-review`, `/plan-eng-review`, and the rest useful quietly evaporated. v1.10.0.0 restores that, and bundles a format upgrade so every `AskUserQuestion` now renders as a numbered decision brief with ELI10, stakes, recommendation, per-option pros / cons (✅ / ❌), and a closing "Net:" line that frames the trade-off in one sentence. + +### What changes for you + +Run `/plan-ceo-review` or `/plan-eng-review` on a plan with 3 findings. You get 3 separate AskUserQuestion prompts, one per finding, with the full Pros / Cons shape. Pick the option in 5 seconds, or expand the pros / cons if you want to think about it. Every review finding becomes a decision you actually made, not a bullet point you skimmed. 
The reference shape matches the D2 memory-design question Garry hand-crafted for his own use, now baked into every tier-2 skill via the preamble resolver, so `/ship`, `/office-hours`, `/investigate`, and the rest inherit it for free. + +### The numbers that matter + +Measured across the v1.10.0.0 fix. Verify any claim with `git log 1.9.0.0..1.10.0.0 --oneline` and `bun test` against the pinned commit SHA. + +| Metric | v1.6.4.0 | v1.10.0.0 | Δ | +|---|---|---|---| +| `AskUserQuestion` renders above model overlay in SKILL.md | no | **yes** | ordering inverted | +| Escape-hatch sites hardened across plan-review templates | 0 | **16** | +16 | +| Gate-tier unit tests pinning the format contract | 0 | **30** | +30 (runs in 16ms, $0) | +| Periodic evals defending against escape-hatch abuse | 0 | **4** | +4 (2 positive, 2 negative-case) | +| Cross-model review findings incorporated before landing | N/A | **5 of 8** | Codex caught real bugs CEO+Eng missed | + +Two of the five Codex findings were load-bearing. (1) The overlay reorder theory wasn't enough on its own. The `(recommended)` label on a neutral-posture question had to stay, because `question-tuning.ts:29` reads it to power AUTO_DECIDE. Omitting it would have silently broken auto-decide on every cherry-pick prompt. (2) The "31 sites global replace" in the original plan was factually wrong. Actual count, verified with `rg`, is 16 sites across 4 templates, and eng/design/devex templates used different phrasing than CEO. Without the audit, the fix would have shipped half-applied. + +### What this means for anyone running plan reviews on Opus 4.7 + +Upgrade and re-run your next plan review. You should see D-numbered prompts (D1, D2, D3...) with ELI10 paragraphs, stakes lines, and ✅ / ❌ bullet blocks per option. If you don't, check that `bun run gen:skill-docs` regenerated cleanly after the upgrade, and verify the `Pros / cons:` header renders in `plan-ceo-review/SKILL.md`. 
Complete plan reviews that used to take 20 minutes and produced a report now take 10 minutes and produce a row of decisions. + +### Itemized changes + +#### Added + +- New Pros / Cons decision-brief format for every `AskUserQuestion` across all tier-2+ skills. Rendering: `D<N>` header, ELI10, "Stakes if we pick wrong:", Recommendation, per-option `✅ / ❌` bullets with minimum 2 pros + 1 con, closing `Net:` synthesis line. Lands in `scripts/resolvers/preamble/generate-ask-user-format.ts` so every skill inherits it. +- Hard-stop escape for destructive one-way choices: single bullet `✅ No cons — this is a hard-stop choice`. +- Neutral-posture handling for SELECTIVE EXPANSION cherry-picks and taste calls: `Recommendation: — this is a taste call, no strong preference either way` with `(recommended)` label preserved on the default to keep AUTO_DECIDE working. +- Three gate-tier unit test files (`test/preamble-compose.test.ts`, `test/resolver-ask-user-format.test.ts`, `test/model-overlay-opus-4-7.test.ts`) that pin the composition order, format contract, and overlay text. Run in <100ms on every `bun test`. +- Four periodic-tier Pros/Cons eval cases in `test/skill-e2e-plan-prosons.test.ts` including two negative-case assertions that catch escape-hatch abuse before it drifts. +- Touchfiles entries (`test/helpers/touchfiles.ts`) for all new eval cases plus expanded-coverage stubs for 7 additional skills. + +#### Fixed + +- Plan-review cadence regression on Opus 4.7. `/plan-ceo-review`, `/plan-eng-review`, `/plan-design-review`, and `/plan-devex-review` now actually pause after each finding and call `AskUserQuestion` as a tool_use instead of batching everything into one summary report. Root cause: `generateModelOverlay` rendered above `generateAskUserFormat` in `scripts/resolvers/preamble.ts`, so the overlay's "Batch your questions" directive registered as the ambient default before the pacing rule.
Fixed by reordering the section array and rewriting the overlay directive as "Pace questions to the skill". +- Escape-hatch collapse: "If no issues or fix is obvious, state what you'll do and move on, don't waste a question" at 16 sites across 4 templates let Opus 4.7's literal interpreter classify every finding as self-dismissable. Tightened per-template: zero findings gets "No issues, moving on"; findings require AskUserQuestion as a tool_use. + +#### Changed + +- `test/skill-e2e-plan-format.test.ts`: extended with v1.10.0.0 format token regexes (D-number, ELI10, Stakes, Pros/cons, Net). Existing RECOMMENDATION check loosened to accept mixed-case "Recommendation:". +- `test/skill-validation.test.ts`: format assertions updated from "RECOMMENDATION: Choose" to the new Pros/Cons token set. +- Golden fixtures regenerated: `test/fixtures/golden/claude-ship-SKILL.md`, `codex-ship-SKILL.md`, `factory-ship-SKILL.md`. + +#### For contributors + +- Outside-voice Codex review (`codex exec` with `model_reasoning_effort="high"`) caught two factual bugs in the original plan: the "31 sites" count (actually 16) and the AUTO_DECIDE contract break on neutral-posture questions. 5 of 8 Codex findings incorporated, 1 rejected (kept defense in depth on the composition reorder), 1 declined (HOLD SCOPE mode lock). +- Follow-up: true multi-turn cadence eval (3 findings produce 3 distinct AskUserQuestion invocations across turns) requires new harness support for multi-capture. Filed in NOT-in-scope. Current single-capture eval covers format + escape-hatch abuse but not cadence itself. +- Follow-up: expanded-coverage eval cases for `/ship`, `/office-hours`, `/investigate`, `/qa`, `/review`, `/design-review`, `/document-release`. Touchfiles entries exist; test blocks will land per-skill in follow-up PRs. +- D-numbering is a model-level instruction, not a runtime counter. `TemplateContext` has no state for it. 
Drift over long sessions is expected; a registry (deferred to TODOs) is the long-term fix. + +## [1.9.0.0] - 2026-04-23 ## **Your gstack memory now travels with you. Cross-machine brain via a private git repo + optional GBrain indexing, no daemon, no credential leaks.** @@ -192,6 +727,7 @@ Work on the laptop Monday. Switch to the desktop Tuesday. Skill preamble sees th - `test/brain-sync.test.ts` — 12 of 27 tests pass on first bun-test run; remaining 15 hit bun-test's 5s default timeout (spawnSync-heavy git operations). Behaviors verified via integration smokes during implementation. Test infrastructure needs a 30s per-test timeout wrapper. - Three unmerged team-sync branches (`garrytan/team-supabase-store`, `garrytan/fix-team-setup`, `garrytan/team-install-mode`) should be formally closed if team-sync isn't landing — flagged in the CEO plan. - Pre-existing golden-file regression test failure in `test/host-config.test.ts` (Codex ship skill baseline) exists on `main` too — unrelated to this PR, tracked separately. + ## [1.6.4.0] - 2026-04-22 ## **Sidebar prompt-injection defense got half as noisy, half as trusting of any single classifier.** diff --git a/CLAUDE.md b/CLAUDE.md index b77b304f..2e5ae567 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -26,6 +26,26 @@ bun run slop:diff # slop findings in files changed on this branch only `test:evals` requires `ANTHROPIC_API_KEY`. Codex E2E tests (`test/codex-e2e.test.ts`) use Codex's own auth from `~/.codex/` config — no `OPENAI_API_KEY` env var needed. + +**Where the keys live on this machine.** Conductor workspaces don't inherit the +user's interactive shell env, so `ANTHROPIC_API_KEY` and `OPENAI_API_KEY` aren't +in the default process env. 
Before running any paid eval / E2E, source them from +`~/.zshrc` (that's where Garry keeps them): + +```bash +bash -c ' + eval "$(grep -E "^export (ANTHROPIC_API_KEY|OPENAI_API_KEY)=" ~/.zshrc)" + export ANTHROPIC_API_KEY OPENAI_API_KEY + EVALS=1 EVALS_TIER=periodic bun test test/skill-e2e-<name>.test.ts +' +``` + +Do not echo the key value anywhere (stdout, logs, shell history). The grep+eval +pattern keeps it in process env only. When passing to a test's Agent SDK, do NOT +pass `env: {...}` to `runAgentSdkTest` — the SDK's auth pipeline doesn't pick up +the key the same way when env is supplied as an object (confirmed failure mode). +Instead, mutate `process.env.ANTHROPIC_API_KEY` ambiently before the call and +restore in `finally`. E2E tests stream progress in real-time (tool-by-tool via `--output-format stream-json --verbose`). Results are persisted to `~/.gstack-dev/evals/` with auto-comparison against the previous run. @@ -205,12 +225,35 @@ When you need to interact with a browser (QA, dogfooding, cookie setup), use the project uses. **Sidebar architecture:** Before modifying `sidepanel.js`, `background.js`, -`content.js`, `sidebar-agent.ts`, or sidebar-related server endpoints, read -`docs/designs/SIDEBAR_MESSAGE_FLOW.md`. It documents the full initialization -timeline, message flow, auth token chain, tab concurrency model, and known -failure modes. The sidebar spans 5 files across 2 codebases (extension + server) -with non-obvious ordering dependencies. The doc exists to prevent the kind of -silent failures that come from not understanding the cross-component flow. +`content.js`, `terminal-agent.ts`, or sidebar-related server endpoints, +read `docs/designs/SIDEBAR_MESSAGE_FLOW.md`. The sidebar has one primary +surface — the **Terminal** pane (interactive `claude` PTY) — with +Activity / Refs / Inspector as debug overlays behind the footer's +`debug` toggle.
The chat queue path was ripped once the PTY proved out; +`sidebar-agent.ts` and the `/sidebar-command` / `/sidebar-chat` / +`/sidebar-agent/event` endpoints are gone. The doc covers the WS auth +flow, dual-token model, and threat-model boundary — silent failures +here usually trace to not understanding the cross-component flow. + +**WebSocket auth uses Sec-WebSocket-Protocol, not cookies.** Browsers +can't set `Authorization` on a WebSocket upgrade, but they CAN set +`Sec-WebSocket-Protocol` via `new WebSocket(url, [token])`. The agent +reads it, validates against `validTokens`, and MUST echo the protocol +back in the upgrade response — without the echo, Chromium closes the +connection immediately. `Set-Cookie: gstack_pty=...` is kept as a +fallback for non-browser callers (the cross-port `SameSite=Strict` +cookie path doesn't survive from a chrome-extension origin). + +**Cross-pane PTY injection.** The toolbar's Cleanup button and the +Inspector's "Send to Code" action both pipe text into the live claude +PTY via `window.gstackInjectToTerminal(text)`, exposed by +`sidepanel-terminal.js`. No `/sidebar-command` POST — the live REPL is +the only execution surface in the sidebar now. + +**`/health` MUST NOT surface any shell-grant token.** It already leaks +`AUTH_TOKEN` to localhost callers in headed mode (a v1.1+ TODO). Don't +make that worse by adding the PTY session token there. PTY auth flows +through `POST /pty-session` only. **Transport-layer security** (v1.6.0.0+). When `pair-agent` starts an ngrok tunnel, the daemon binds two HTTP listeners: a local listener (127.0.0.1, full command @@ -407,6 +450,41 @@ No auto-merging. No "I'll just clean this up." ## CHANGELOG + VERSION style +**Versioning invariant (workspace-aware ship).** VERSION is a monotonic ordered +release identifier, not a strict semver commitment. The bump level +(major/minor/patch/micro) expresses intent at ship time. 
Queue-advancing past a +claimed version within the same bump level is explicitly permitted — if branch A +claims v1.7.0.0 as a MINOR and branch B is also a MINOR, B lands at v1.8.0.0 +(still a MINOR relative to main). Downstream consumers must NOT rely on +"MINOR = feature-only, PATCH = fix-only" as a strict contract. This is why +`bin/gstack-next-version` advances within the chosen bump level rather than +repicking the level when collisions happen. + +**Scale-aware bumps — use common sense.** When the diff is big, bump MINOR (or +MAJOR), not PATCH. PATCH is for bug fixes and small additions; MINOR is for +substantial new capability or substantial reduction; MAJOR is for breaking +changes. Rough guideposts (don't treat as rules, treat as smell-checks): + +- **PATCH (X.Y.Z+1.0)**: bug fix, doc tweak, small additive change, single + test/file added. Net diff under ~500 lines, no new user-facing capability. +- **MINOR (X.Y+1.0.0)**: new capability shipped (skill, harness, command, big + refactor), substantial code reduction (compression, migration), or coordinated + multi-file change. Net diff over ~2000 lines added/removed, OR a user-visible + feature you'd put in a tweet. +- **MAJOR (X+1.0.0.0)**: breaking change to public surface (CLI flag rename, + skill removed, config format changed), OR a release big enough to be the + headline of a blog post. + +If you find yourself debating "is 10K added + 24K removed really a PATCH?" — it +isn't. Bump MINOR. Same for "this adds a whole new test harness with 6 new E2E +tests + helper utilities" — MINOR. The bump level is communication to the user +about what kind of release this is; don't undersell it. + +When merging origin/main brings a higher VERSION, re-evaluate the bump level +against the SCALE of your branch's work, not just whether main moved forward. +If main bumped MINOR and your branch is also a substantial change, you bump +MINOR again on top (e.g., main at v1.14.0.0, your branch lands v1.15.0.0). 
+ **VERSION and CHANGELOG are branch-scoped.** Every feature branch that ships gets its own version bump and CHANGELOG entry. The entry describes what THIS branch adds — not what was already on main. @@ -653,3 +731,21 @@ The active skill lives at `~/.claude/skills/gstack/`. After making changes: Or copy the binaries directly: - `cp browse/dist/browse ~/.claude/skills/gstack/browse/dist/browse` - `cp design/dist/design ~/.claude/skills/gstack/design/dist/design` + +## Skill routing + +When the user's request matches an available skill, invoke it via the Skill tool. When in doubt, invoke the skill. + +Key routing rules: +- Product ideas/brainstorming → invoke /office-hours +- Strategy/scope → invoke /plan-ceo-review +- Architecture → invoke /plan-eng-review +- Design system/plan review → invoke /design-consultation or /plan-design-review +- Full review pipeline → invoke /autoplan +- Bugs/errors → invoke /investigate +- QA/testing site behavior → invoke /qa or /qa-only +- Code review/diff check → invoke /review +- Visual polish → invoke /design-review +- Ship/deploy/PR → invoke /ship or /land-and-deploy +- Save progress → invoke /context-save +- Resume context → invoke /context-restore diff --git a/README.md b/README.md index 3177d56c..426c8468 100644 --- a/README.md +++ b/README.md @@ -48,7 +48,7 @@ Fork it. Improve it. Make it yours. And if you want to hate on free open source Open Claude Code and paste this. Claude does the rest. 
-> Install gstack: run **`git clone --single-branch --depth 1 https://github.com/garrytan/gstack.git ~/.claude/skills/gstack && cd ~/.claude/skills/gstack && ./setup`** then add a "gstack" section to CLAUDE.md that says to use the /browse skill from gstack for all web browsing, never use mcp\_\_claude-in-chrome\_\_\* tools, and lists the available skills: /office-hours, /plan-ceo-review, /plan-eng-review, /plan-design-review, /design-consultation, /design-shotgun, /design-html, /review, /ship, /land-and-deploy, /canary, /benchmark, /browse, /connect-chrome, /qa, /qa-only, /design-review, /setup-browser-cookies, /setup-deploy, /retro, /investigate, /document-release, /codex, /cso, /autoplan, /plan-devex-review, /devex-review, /careful, /freeze, /guard, /unfreeze, /gstack-upgrade, /learn. Then ask the user if they also want to add gstack to the current project so teammates get it. +> Install gstack: run **`git clone --single-branch --depth 1 https://github.com/garrytan/gstack.git ~/.claude/skills/gstack && cd ~/.claude/skills/gstack && ./setup`** then add a "gstack" section to CLAUDE.md that says to use the /browse skill from gstack for all web browsing, never use mcp\_\_claude-in-chrome\_\_\* tools, and lists the available skills: /office-hours, /plan-ceo-review, /plan-eng-review, /plan-design-review, /design-consultation, /design-shotgun, /design-html, /review, /ship, /land-and-deploy, /canary, /benchmark, /browse, /connect-chrome, /qa, /qa-only, /design-review, /setup-browser-cookies, /setup-deploy, /setup-gbrain, /retro, /investigate, /document-release, /codex, /cso, /autoplan, /plan-devex-review, /devex-review, /careful, /freeze, /guard, /unfreeze, /gstack-upgrade, /learn. Then ask the user if they also want to add gstack to the current project so teammates get it. ### Step 2: Team mode — auto-update for shared repos (recommended) @@ -225,6 +225,7 @@ Each skill feeds into the next. 
`/office-hours` writes a design doc that `/plan- | `/unfreeze` | **Unlock** — remove the `/freeze` boundary. | | `/open-gstack-browser` | **GStack Browser** — launch GStack Browser with sidebar, anti-bot stealth, auto model routing (Sonnet for actions, Opus for analysis), one-click cookie import, and Claude Code integration. Clean up pages, take smart screenshots, edit CSS, and pass info back to your terminal. | | `/setup-deploy` | **Deploy Configurator** — one-time setup for `/land-and-deploy`. Detects your platform, production URL, and deploy commands. | +| `/setup-gbrain` | **GBrain Onboarding** — from zero to running gbrain in under 5 minutes. PGLite local, Supabase existing URL, or auto-provision a new Supabase project via Management API. MCP registration for Claude Code + per-repo trust triad (read-write/read-only/deny). [Full guide](USING_GBRAIN_WITH_GSTACK.md). | | `/gstack-upgrade` | **Self-Updater** — upgrade gstack to latest. Detects global vs vendored install, syncs both, shows what changed. | ### New binaries (v0.19) @@ -368,34 +369,39 @@ I open sourced how I build software. You can fork it and make it your own. > Come work at YC — [ycombinator.com/software](https://ycombinator.com/software) > Extremely competitive salary and equity. San Francisco, Dogpatch District. -## Cross-machine memory with GBrain sync +## GBrain — persistent knowledge for your coding agent -gstack accumulates a lot of useful state on your laptop: learnings, CEO -plans, design docs, retros, developer profile. Today, all of that dies when -you switch machines. **GBrain sync** optionally pushes a curated, secret-scanned -subset to a private git repo so your memory follows you, and (if you use -GBrain) becomes indexable there. +[GBrain](https://github.com/garrytan/gbrain) is a persistent knowledge base for AI agents — think of it as the memory your agent actually keeps between sessions. GStack gives you a one-command path from zero to "it's running, my agent can call it." 
-One command to turn it on: +```bash +/setup-gbrain +``` + +Three paths, pick one: + +- **Supabase, existing URL** — your cloud agent already provisioned a brain; paste the Session Pooler URL, now this laptop uses the same data. +- **Supabase, auto-provision** — paste a Supabase Personal Access Token; the skill creates a new project, polls to healthy, fetches the pooler URL, hands it to `gbrain init`. ~90 seconds end-to-end. +- **PGLite local** — zero accounts, zero network, ~30 seconds. Isolated brain on this Mac only. Great for try-first; migrate to Supabase later with `/setup-gbrain --switch`. + +After init, the skill offers to register gbrain as an MCP server for Claude Code (`claude mcp add gbrain -- gbrain serve`) so `gbrain search`, `gbrain put_page`, etc. show up as first-class typed tools — not bash shell-outs. + +**Per-remote trust policy.** Each repo on your machine gets one of three tiers: + +- `read-write` — agent can search the brain AND write new pages back from this repo +- `read-only` — agent can search but never writes (best for multi-client consultants: search the shared brain, don't contaminate it with Client A's work while in Client B's repo) +- `deny` — no gbrain interaction at all + +The skill asks once per repo. The decision is sticky across worktrees and branches of the same remote. + +**GStack memory sync (different feature, same private-repo infra).** Optionally pushes your gstack state (learnings, CEO plans, design docs, retros, developer profile) to a private git repo so your memory follows you across machines, with a one-time privacy prompt (everything allowlisted / artifacts only / off) and a defense-in-depth secret scanner that blocks AWS keys, tokens, PEM blocks, and JWTs before they leave your machine. ```bash gstack-brain-init ``` -That creates a private GitHub repo (or any git remote you prefer — -GitLab, Gitea, self-hosted). Every skill run syncs the queue at its -start and end boundaries. No daemon, no background process. 
A one-time -privacy prompt asks how much you want to share (everything allowlisted / -artifacts only / off). Secret-shaped content (AWS keys, GitHub tokens, -PEM blocks, JWTs, etc.) is blocked from sync before it leaves your -machine. +**Full monty — every scenario, every flag, every bin helper, every troubleshooting step:** [USING_GBRAIN_WITH_GSTACK.md](USING_GBRAIN_WITH_GSTACK.md) -New machine? Copy `~/.gstack-brain-remote.txt` over, run -`gstack-brain-restore`, and yesterday's learnings surface on today's -laptop. - -Full guide: [docs/gbrain-sync.md](docs/gbrain-sync.md) • -Error index: [docs/gbrain-sync-errors.md](docs/gbrain-sync-errors.md) +Other references: [docs/gbrain-sync.md](docs/gbrain-sync.md) (sync-specific guide) • [docs/gbrain-sync-errors.md](docs/gbrain-sync-errors.md) (error index) ## Docs @@ -403,6 +409,7 @@ Error index: [docs/gbrain-sync-errors.md](docs/gbrain-sync-errors.md) |-----|---------------| | [Skill Deep Dives](docs/skills.md) | Philosophy, examples, and workflow for every skill (includes Greptile integration) | | [Builder Ethos](ETHOS.md) | Builder philosophy: Boil the Lake, Search Before Building, three layers of knowledge | +| [Using GBrain with GStack](USING_GBRAIN_WITH_GSTACK.md) | Every path, flag, bin helper, and troubleshooting step for `/setup-gbrain` | | [GBrain Sync](docs/gbrain-sync.md) | Cross-machine memory setup, privacy modes, troubleshooting | | [Architecture](ARCHITECTURE.md) | Design decisions and system internals | | [Browser Reference](BROWSER.md) | Full command reference for `/browse` | @@ -447,8 +454,8 @@ Use /browse from gstack for all web browsing. 
Never use mcp__claude-in-chrome__* Available skills: /office-hours, /plan-ceo-review, /plan-eng-review, /plan-design-review, /design-consultation, /design-shotgun, /design-html, /review, /ship, /land-and-deploy, /canary, /benchmark, /browse, /open-gstack-browser, /qa, /qa-only, /design-review, -/setup-browser-cookies, /setup-deploy, /retro, /investigate, /document-release, /codex, -/cso, /autoplan, /pair-agent, /careful, /freeze, /guard, /unfreeze, /gstack-upgrade, /learn. +/setup-browser-cookies, /setup-deploy, /setup-gbrain, /retro, /investigate, /document-release, +/codex, /cso, /autoplan, /pair-agent, /careful, /freeze, /guard, /unfreeze, /gstack-upgrade, /learn. ``` ## License diff --git a/SKILL.md b/SKILL.md index eee3153a..83e512ea 100644 --- a/SKILL.md +++ b/SKILL.md @@ -49,19 +49,15 @@ _TEL_START=$(date +%s) _SESSION_ID="$$-$(date +%s)" echo "TELEMETRY: ${_TEL:-off}" echo "TEL_PROMPTED: $_TEL_PROMPTED" -# Writing style verbosity (V1: default = ELI10, terse = tighter V0 prose. -# Read on every skill run so terse mode takes effect without a restart.) _EXPLAIN_LEVEL=$(~/.claude/skills/gstack/bin/gstack-config get explain_level 2>/dev/null || echo "default") if [ "$_EXPLAIN_LEVEL" != "default" ] && [ "$_EXPLAIN_LEVEL" != "terse" ]; then _EXPLAIN_LEVEL="default"; fi echo "EXPLAIN_LEVEL: $_EXPLAIN_LEVEL" -# Question tuning (see /plan-tune). Observational only in V1. 
_QUESTION_TUNING=$(~/.claude/skills/gstack/bin/gstack-config get question_tuning 2>/dev/null || echo "false") echo "QUESTION_TUNING: $_QUESTION_TUNING" mkdir -p ~/.gstack/analytics if [ "$_TEL" != "off" ]; then echo '{"skill":"gstack","ts":"'$(date -u +%Y-%m-%dT%H:%M:%SZ)'","repo":"'$(basename "$(git rev-parse --show-toplevel 2>/dev/null)" 2>/dev/null || echo "unknown")'"}' >> ~/.gstack/analytics/skill-usage.jsonl 2>/dev/null || true fi -# zsh-compatible: use find instead of glob to avoid NOMATCH error for _PF in $(find ~/.gstack/analytics -maxdepth 1 -name '.pending-*' 2>/dev/null); do if [ -f "$_PF" ]; then if [ "$_TEL" != "off" ] && [ -x "~/.claude/skills/gstack/bin/gstack-telemetry-log" ]; then @@ -71,7 +67,6 @@ for _PF in $(find ~/.gstack/analytics -maxdepth 1 -name '.pending-*' 2>/dev/null fi break done -# Learnings count eval "$(~/.claude/skills/gstack/bin/gstack-slug 2>/dev/null)" 2>/dev/null || true _LEARN_FILE="${GSTACK_HOME:-$HOME/.gstack}/projects/${SLUG:-unknown}/learnings.jsonl" if [ -f "$_LEARN_FILE" ]; then @@ -83,9 +78,7 @@ if [ -f "$_LEARN_FILE" ]; then else echo "LEARNINGS: 0" fi -# Session timeline: record skill start (local-only, never sent anywhere) ~/.claude/skills/gstack/bin/gstack-timeline-log '{"skill":"gstack","event":"started","branch":"'"$_BRANCH"'","session":"'"$_SESSION_ID"'"}' 2>/dev/null & -# Check if CLAUDE.md has routing rules _HAS_ROUTING="no" if [ -f CLAUDE.md ] && grep -q "## Skill routing" CLAUDE.md 2>/dev/null; then _HAS_ROUTING="yes" @@ -93,7 +86,6 @@ fi _ROUTING_DECLINED=$(~/.claude/skills/gstack/bin/gstack-config get routing_declined 2>/dev/null || echo "false") echo "HAS_ROUTING: $_HAS_ROUTING" echo "ROUTING_DECLINED: $_ROUTING_DECLINED" -# Vendoring deprecation: detect if CWD has a vendored gstack copy _VENDORED="no" if [ -d ".claude/skills/gstack" ] && [ ! 
-L ".claude/skills/gstack" ]; then if [ -f ".claude/skills/gstack/VERSION" ] || [ -d ".claude/skills/gstack/.git" ]; then @@ -102,66 +94,38 @@ if [ -d ".claude/skills/gstack" ] && [ ! -L ".claude/skills/gstack" ]; then fi echo "VENDORED_GSTACK: $_VENDORED" echo "MODEL_OVERLAY: claude" -# Checkpoint mode (explicit = no auto-commit, continuous = WIP commits as you go) _CHECKPOINT_MODE=$(~/.claude/skills/gstack/bin/gstack-config get checkpoint_mode 2>/dev/null || echo "explicit") _CHECKPOINT_PUSH=$(~/.claude/skills/gstack/bin/gstack-config get checkpoint_push 2>/dev/null || echo "false") echo "CHECKPOINT_MODE: $_CHECKPOINT_MODE" echo "CHECKPOINT_PUSH: $_CHECKPOINT_PUSH" -# Detect spawned session (OpenClaw or other orchestrator) [ -n "$OPENCLAW_SESSION" ] && echo "SPAWNED_SESSION: true" || true ``` -If `PROACTIVE` is `"false"`, do not proactively suggest gstack skills AND do not -auto-invoke skills based on conversation context. Only run skills the user explicitly -types (e.g., /qa, /ship). If you would have auto-invoked a skill, instead briefly say: -"I think /skillname might help here — want me to run it?" and wait for confirmation. -The user opted out of proactive behavior. +## Plan Mode Safe Operations -If `SKILL_PREFIX` is `"true"`, the user has namespaced skill names. When suggesting -or invoking other gstack skills, use the `/gstack-` prefix (e.g., `/gstack-qa` instead -of `/qa`, `/gstack-ship` instead of `/ship`). Disk paths are unaffected — always use -`~/.claude/skills/gstack/[skill-name]/SKILL.md` for reading skill files. +In plan mode, allowed because they inform the plan: `$B`, `$D`, `codex exec`/`codex review`, writes to `~/.gstack/`, writes to the plan file, and `open` for generated artifacts. + +## Skill Invocation During Plan Mode + +If the user invokes a skill in plan mode, the skill takes precedence over generic plan mode behavior. 
**Treat the skill file as executable instructions, not reference.** Follow it step by step starting from Step 0; the first AskUserQuestion is the workflow entering plan mode, not a violation of it. AskUserQuestion satisfies plan mode's end-of-turn requirement. At a STOP point, stop immediately. Do not continue the workflow or call ExitPlanMode there. Commands marked "PLAN MODE EXCEPTION — ALWAYS RUN" execute. Call ExitPlanMode only after the skill workflow completes, or if the user tells you to cancel the skill or leave plan mode. + +If `PROACTIVE` is `"false"`, do not auto-invoke or proactively suggest skills. If a skill seems useful, ask: "I think /skillname might help here — want me to run it?" + +If `SKILL_PREFIX` is `"true"`, suggest/invoke `/gstack-*` names. Disk paths stay `~/.claude/skills/gstack/[skill-name]/SKILL.md`. If output shows `UPGRADE_AVAILABLE `: read `~/.claude/skills/gstack/gstack-upgrade/SKILL.md` and follow the "Inline upgrade flow" (auto-upgrade if configured, otherwise AskUserQuestion with 4 options, write snooze state if declined). -If output shows `JUST_UPGRADED ` AND `SPAWNED_SESSION` is NOT set: tell -the user "Running gstack v{to} (just updated!)" and then check for new features to -surface. For each per-feature marker below, if the marker file is missing AND the -feature is plausibly useful for this user, use AskUserQuestion to let them try it. -Fire once per feature per user, NOT once per upgrade. +If output shows `JUST_UPGRADED `: print "Running gstack v{to} (just updated!)". If `SPAWNED_SESSION` is true, skip feature discovery. -**In spawned sessions (`SPAWNED_SESSION` = "true"): SKIP feature discovery entirely.** -Just print "Running gstack v{to}" and continue. Orchestrators do not want interactive -prompts from sub-sessions. +Feature discovery, max one prompt per session: +- Missing `~/.claude/skills/gstack/.feature-prompted-continuous-checkpoint`: AskUserQuestion for Continuous checkpoint auto-commits. 
If accepted, run `~/.claude/skills/gstack/bin/gstack-config set checkpoint_mode continuous`. Always touch marker. +- Missing `~/.claude/skills/gstack/.feature-prompted-model-overlay`: inform "Model overlays are active. MODEL_OVERLAY shows the patch." Always touch marker. -**Feature discovery markers and prompts** (one at a time, max one per session): +After upgrade prompts, continue workflow. -1. `~/.claude/skills/gstack/.feature-prompted-continuous-checkpoint` → - Prompt: "Continuous checkpoint auto-commits your work as you go with `WIP:` prefix - so you never lose progress to a crash. Local-only by default — doesn't push - anywhere unless you turn that on. Want to try it?" - Options: A) Enable continuous mode, B) Show me first (print the section from - the preamble Continuous Checkpoint Mode), C) Skip. - If A: run `~/.claude/skills/gstack/bin/gstack-config set checkpoint_mode continuous`. - Always: `touch ~/.claude/skills/gstack/.feature-prompted-continuous-checkpoint` +If `WRITING_STYLE_PENDING` is `yes`: ask once about writing style: -2. `~/.claude/skills/gstack/.feature-prompted-model-overlay` → - Inform only (no prompt): "Model overlays are active. `MODEL_OVERLAY: {model}` - shown in the preamble output tells you which behavioral patch is applied. - Override with `--model` when regenerating skills (e.g., `bun run gen:skill-docs - --model gpt-5.4`). Default is claude." - Always: `touch ~/.claude/skills/gstack/.feature-prompted-model-overlay` - -After handling JUST_UPGRADED (prompts done or skipped), continue with the skill -workflow. - -If `WRITING_STYLE_PENDING` is `yes`: You're on the first skill run after upgrading -to gstack v1. Ask the user once about the new default writing style. Use AskUserQuestion: - -> v1 prompts = simpler. Technical terms get a one-sentence gloss on first use, -> questions are framed in outcome terms, sentences are shorter. -> -> Keep the new default, or prefer the older tighter prose? 
+> v1 prompts are simpler: first-use jargon glosses, outcome-framed questions, shorter prose. Keep default or restore terse? Options: - A) Keep the new default (recommended — good writing helps everyone) @@ -176,27 +140,20 @@ rm -f ~/.gstack/.writing-style-prompt-pending touch ~/.gstack/.writing-style-prompted ``` -This only happens once. If `WRITING_STYLE_PENDING` is `no`, skip this entirely. +Skip if `WRITING_STYLE_PENDING` is `no`. -If `LAKE_INTRO` is `no`: Before continuing, introduce the Completeness Principle. -Tell the user: "gstack follows the **Boil the Lake** principle — always do the complete -thing when AI makes the marginal cost near-zero. Read more: https://garryslist.org/posts/boil-the-ocean" -Then offer to open the essay in their default browser: +If `LAKE_INTRO` is `no`: say "gstack follows the **Boil the Lake** principle — do the complete thing when AI makes marginal cost near-zero. Read more: https://garryslist.org/posts/boil-the-ocean" Offer to open: ```bash open https://garryslist.org/posts/boil-the-ocean touch ~/.gstack/.completeness-intro-seen ``` -Only run `open` if the user says yes. Always run `touch` to mark as seen. This only happens once. +Only run `open` if yes. Always run `touch`. -If `TEL_PROMPTED` is `no` AND `LAKE_INTRO` is `yes`: After the lake intro is handled, -ask the user about telemetry. Use AskUserQuestion: +If `TEL_PROMPTED` is `no` AND `LAKE_INTRO` is `yes`: ask telemetry once via AskUserQuestion: -> Help gstack get better! Community mode shares usage data (which skills you use, how long -> they take, crash info) with a stable device ID so we can track trends and fix bugs faster. -> No code, file paths, or repo names are ever sent. -> Change anytime with `gstack-config set telemetry off`. +> Help gstack get better. Share usage data only: skill, duration, crashes, stable device ID. No code, file paths, or repo names. Options: - A) Help gstack get better! 
(recommended) @@ -204,10 +161,9 @@ Options: If A: run `~/.claude/skills/gstack/bin/gstack-config set telemetry community` -If B: ask a follow-up AskUserQuestion: +If B: ask follow-up: -> How about anonymous mode? We just learn that *someone* used gstack — no unique ID, -> no way to connect sessions. Just a counter that helps us know if anyone's out there. +> Anonymous mode sends only aggregate usage, no unique ID. Options: - A) Sure, anonymous is fine @@ -221,14 +177,11 @@ Always run: touch ~/.gstack/.telemetry-prompted ``` -This only happens once. If `TEL_PROMPTED` is `yes`, skip this entirely. +Skip if `TEL_PROMPTED` is `yes`. -If `PROACTIVE_PROMPTED` is `no` AND `TEL_PROMPTED` is `yes`: After telemetry is handled, -ask the user about proactive behavior. Use AskUserQuestion: +If `PROACTIVE_PROMPTED` is `no` AND `TEL_PROMPTED` is `yes`: ask once: -> gstack can proactively figure out when you might need a skill while you work — -> like suggesting /qa when you say "does this work?" or /investigate when you hit -> a bug. We recommend keeping this on — it speeds up every part of your workflow. +> Let gstack proactively suggest skills, like /qa for "does this work?" or /investigate for bugs? Options: - A) Keep it on (recommended) @@ -242,7 +195,7 @@ Always run: touch ~/.gstack/.proactive-prompted ``` -This only happens once. If `PROACTIVE_PROMPTED` is `yes`, skip this entirely. +Skip if `PROACTIVE_PROMPTED` is `yes`. If `HAS_ROUTING` is `no` AND `ROUTING_DECLINED` is `false` AND `PROACTIVE_PROMPTED` is `yes`: Check if a CLAUDE.md file exists in the project root. If it does not exist, create it. @@ -250,8 +203,6 @@ Check if a CLAUDE.md file exists in the project root. If it does not exist, crea Use AskUserQuestion: > gstack works best when your project's CLAUDE.md includes skill routing rules. -> This tells Claude to use specialized workflows (like /ship, /investigate, /qa) -> instead of answering directly. It's a one-time addition, about 15 lines. 
Options: - A) Add routing rules to CLAUDE.md (recommended) @@ -263,63 +214,33 @@ If A: Append this section to the end of CLAUDE.md: ## Skill routing -When the user's request matches an available skill, invoke it via the Skill tool. The -skill has multi-step workflows, checklists, and quality gates that produce better -results than an ad-hoc answer. When in doubt, invoke the skill. A false positive is -cheaper than a false negative. +When the user's request matches an available skill, invoke it via the Skill tool. When in doubt, invoke the skill. Key routing rules: -- Product ideas, "is this worth building", brainstorming → invoke /office-hours -- Strategy, scope, "think bigger", "what should we build" → invoke /plan-ceo-review -- Architecture, "does this design make sense" → invoke /plan-eng-review -- Design system, brand, "how should this look" → invoke /design-consultation -- Design review of a plan → invoke /plan-design-review -- Developer experience of a plan → invoke /plan-devex-review -- "Review everything", full review pipeline → invoke /autoplan -- Bugs, errors, "why is this broken", "wtf", "this doesn't work" → invoke /investigate -- Test the site, find bugs, "does this work" → invoke /qa (or /qa-only for report only) -- Code review, check the diff, "look at my changes" → invoke /review -- Visual polish, design audit, "this looks off" → invoke /design-review -- Developer experience audit, try onboarding → invoke /devex-review -- Ship, deploy, create a PR, "send it" → invoke /ship -- Merge + deploy + verify → invoke /land-and-deploy -- Configure deployment → invoke /setup-deploy -- Post-deploy monitoring → invoke /canary -- Update docs after shipping → invoke /document-release -- Weekly retro, "how'd we do" → invoke /retro -- Second opinion, codex review → invoke /codex -- Safety mode, careful mode, lock it down → invoke /careful or /guard -- Restrict edits to a directory → invoke /freeze or /unfreeze -- Upgrade gstack → invoke /gstack-upgrade -- Save 
progress, "save my work" → invoke /context-save -- Resume, restore, "where was I" → invoke /context-restore -- Security audit, OWASP, "is this secure" → invoke /cso -- Make a PDF, document, publication → invoke /make-pdf -- Launch real browser for QA → invoke /open-gstack-browser -- Import cookies for authenticated testing → invoke /setup-browser-cookies -- Performance regression, page speed, benchmarks → invoke /benchmark -- Review what gstack has learned → invoke /learn -- Tune question sensitivity → invoke /plan-tune -- Code quality dashboard → invoke /health +- Product ideas/brainstorming → invoke /office-hours +- Strategy/scope → invoke /plan-ceo-review +- Architecture → invoke /plan-eng-review +- Design system/plan review → invoke /design-consultation or /plan-design-review +- Full review pipeline → invoke /autoplan +- Bugs/errors → invoke /investigate +- QA/testing site behavior → invoke /qa or /qa-only +- Code review/diff check → invoke /review +- Visual polish → invoke /design-review +- Ship/deploy/PR → invoke /ship or /land-and-deploy +- Save progress → invoke /context-save +- Resume context → invoke /context-restore ``` Then commit the change: `git add CLAUDE.md && git commit -m "chore: add gstack skill routing rules to CLAUDE.md"` -If B: run `~/.claude/skills/gstack/bin/gstack-config set routing_declined true` -Say "No problem. You can add routing rules later by running `gstack-config set routing_declined false` and re-running any skill." +If B: run `~/.claude/skills/gstack/bin/gstack-config set routing_declined true` and say they can re-enable with `gstack-config set routing_declined false`. -This only happens once per project. If `HAS_ROUTING` is `yes` or `ROUTING_DECLINED` is `true`, skip this entirely. +This only happens once per project. Skip if `HAS_ROUTING` is `yes` or `ROUTING_DECLINED` is `true`. -If `VENDORED_GSTACK` is `yes`: This project has a vendored copy of gstack at -`.claude/skills/gstack/`. Vendoring is deprecated. 
We will not keep vendored copies -up to date, so this project's gstack will fall behind. - -Use AskUserQuestion (one-time per project, check for `~/.gstack/.vendoring-warned-$SLUG` marker): +If `VENDORED_GSTACK` is `yes`, warn once via AskUserQuestion unless `~/.gstack/.vendoring-warned-$SLUG` exists: > This project has gstack vendored in `.claude/skills/gstack/`. Vendoring is deprecated. -> We won't keep this copy up to date, so you'll fall behind on new features and fixes. -> -> Want to migrate to team mode? It takes about 30 seconds. +> Migrate to team mode? Options: - A) Yes, migrate to team mode now @@ -340,7 +261,7 @@ eval "$(~/.claude/skills/gstack/bin/gstack-slug 2>/dev/null)" 2>/dev/null || tru touch ~/.gstack/.vendoring-warned-${SLUG:-unknown} ``` -This only happens once per project. If the marker file exists, skip entirely. +If marker exists, skip. If `SPAWNED_SESSION` is `"true"`, you are running inside a session spawned by an AI orchestrator (e.g., OpenClaw). In spawned sessions: @@ -352,10 +273,6 @@ AI orchestrator (e.g., OpenClaw). In spawned sessions: ## GBrain Sync (skill start) ```bash -# gbrain-sync: drain pending writes, pull once per day. Silent no-op when -# the feature isn't initialized or gbrain_sync_mode is "off". See -# docs/gbrain-sync.md. - _GSTACK_HOME="${GSTACK_HOME:-$HOME/.gstack}" _BRAIN_REMOTE_FILE="$HOME/.gstack-brain-remote.txt" _BRAIN_SYNC_BIN="~/.claude/skills/gstack/bin/gstack-brain-sync" @@ -363,7 +280,6 @@ _BRAIN_CONFIG_BIN="~/.claude/skills/gstack/bin/gstack-config" _BRAIN_SYNC_MODE=$("$_BRAIN_CONFIG_BIN" get gbrain_sync_mode 2>/dev/null || echo off) -# New-machine hint: URL file present, local .git missing, sync not yet enabled. if [ -f "$_BRAIN_REMOTE_FILE" ] && [ ! -d "$_GSTACK_HOME/.git" ] && [ "$_BRAIN_SYNC_MODE" = "off" ]; then _BRAIN_NEW_URL=$(head -1 "$_BRAIN_REMOTE_FILE" 2>/dev/null | tr -d '[:space:]') if [ -n "$_BRAIN_NEW_URL" ]; then @@ -372,9 +288,7 @@ if [ -f "$_BRAIN_REMOTE_FILE" ] && [ ! 
-d "$_GSTACK_HOME/.git" ] && [ "$_BRAIN_S fi fi -# Active-sync path. if [ -d "$_GSTACK_HOME/.git" ] && [ "$_BRAIN_SYNC_MODE" != "off" ]; then - # Once-per-day pull. _BRAIN_LAST_PULL_FILE="$_GSTACK_HOME/.brain-last-pull" _BRAIN_NOW=$(date +%s) _BRAIN_DO_PULL=1 @@ -387,11 +301,9 @@ if [ -d "$_GSTACK_HOME/.git" ] && [ "$_BRAIN_SYNC_MODE" != "off" ]; then ( cd "$_GSTACK_HOME" && git fetch origin >/dev/null 2>&1 && git merge --ff-only "origin/$(git rev-parse --abbrev-ref HEAD)" >/dev/null 2>&1 ) || true echo "$_BRAIN_NOW" > "$_BRAIN_LAST_PULL_FILE" fi - # Drain pending queue, push. "$_BRAIN_SYNC_BIN" --once 2>/dev/null || true fi -# Status line — always emitted, easy to grep. if [ -d "$_GSTACK_HOME/.git" ] && [ "$_BRAIN_SYNC_MODE" != "off" ]; then _BRAIN_QUEUE_DEPTH=0 [ -f "$_GSTACK_HOME/.brain-queue.jsonl" ] && _BRAIN_QUEUE_DEPTH=$(wc -l < "$_GSTACK_HOME/.brain-queue.jsonl" | tr -d ' ') @@ -405,24 +317,16 @@ fi -**Privacy stop-gate (fires ONCE per machine).** +Privacy stop-gate: if output shows `BRAIN_SYNC: off`, `gbrain_sync_mode_prompted` is `false`, and gbrain is on PATH or `gbrain doctor --fast --json` works, ask once: -If the bash output shows `BRAIN_SYNC: off` AND the config value -`gbrain_sync_mode_prompted` is `false` AND gbrain is detected on this host -(either `gbrain doctor --fast --json` succeeds or the `gbrain` binary is in PATH), -fire a one-time privacy gate via AskUserQuestion: - -> gstack can publish your session memory (learnings, plans, designs, retros) to a -> private GitHub repo that GBrain indexes across your machines. Higher tiers -> include behavioral data (session timelines, developer profile). How much do you -> want to sync? +> gstack can publish your session memory to a private GitHub repo that GBrain indexes across machines. How much should sync? 
Options: -- A) Everything allowlisted (recommended — maximum cross-machine memory) -- B) Only artifacts (plans, designs, retros, learnings) — skip timelines and profile -- C) Decline — keep everything local +- A) Everything allowlisted (recommended) +- B) Only artifacts +- C) Decline, keep everything local -After the user answers, run (substituting the chosen value): +After answer: ```bash # Chosen mode: full | artifacts-only | off @@ -430,17 +334,9 @@ After the user answers, run (substituting the chosen value): "$_BRAIN_CONFIG_BIN" set gbrain_sync_mode_prompted true ``` -If A or B was chosen AND `~/.gstack/.git` doesn't exist, ask a follow-up: -"Set up the GBrain sync repo now? (runs `gstack-brain-init`)" -- A) Yes, run it now -- B) Show me the command, I'll run it myself +If A/B and `~/.gstack/.git` is missing, ask whether to run `gstack-brain-init`. Do not block the skill. -Do not block the skill. Emit the question, continue the skill workflow. The -next skill run picks up wherever this left off. - -**At skill END (before the telemetry block),** run these bash commands to -catch artifact writes (design docs, plans, retros) that skipped the writer -shims, plus drain any still-pending queue entries: +At skill END before telemetry: ```bash "~/.claude/skills/gstack/bin/gstack-brain-sync" --discover-new 2>/dev/null || true @@ -468,66 +364,38 @@ equivalents (cat, sed, find, grep). The dedicated tools are cheaper and clearer. ## Voice -**Tone:** direct, concrete, sharp, never corporate, never academic. Sound like a builder, not a consultant. Name the file, the function, the command. No filler, no throat-clearing. +Direct, concrete, builder-to-builder. Name the file, function, command, and user-visible impact. No filler. -**Writing rules:** No em dashes (use commas, periods, "..."). No AI vocabulary (delve, crucial, robust, comprehensive, nuanced, etc.). Short paragraphs. End with what to do. +No em dashes. 
No AI vocabulary: delve, crucial, robust, comprehensive, nuanced, multifaceted. Never corporate or academic. Short paragraphs. End with what to do. -The user always has context you don't. Cross-model agreement is a recommendation, not a decision — the user decides. +The user has context you do not. Cross-model agreement is a recommendation, not a decision. The user decides. ## Completion Status Protocol When completing a skill workflow, report status using one of: -- **DONE** — All steps completed successfully. Evidence provided for each claim. -- **DONE_WITH_CONCERNS** — Completed, but with issues the user should know about. List each concern. -- **BLOCKED** — Cannot proceed. State what is blocking and what was tried. -- **NEEDS_CONTEXT** — Missing information required to continue. State exactly what you need. +- **DONE** — completed with evidence. +- **DONE_WITH_CONCERNS** — completed, but list concerns. +- **BLOCKED** — cannot proceed; state blocker and what was tried. +- **NEEDS_CONTEXT** — missing info; state exactly what is needed. -### Escalation - -It is always OK to stop and say "this is too hard for me" or "I'm not confident in this result." - -Bad work is worse than no work. You will not be penalized for escalating. -- If you have attempted a task 3 times without success, STOP and escalate. -- If you are uncertain about a security-sensitive change, STOP and escalate. -- If the scope of work exceeds what you can verify, STOP and escalate. - -Escalation format: -``` -STATUS: BLOCKED | NEEDS_CONTEXT -REASON: [1-2 sentences] -ATTEMPTED: [what you tried] -RECOMMENDATION: [what the user should do next] -``` +Escalate after 3 failed attempts, uncertain security-sensitive changes, or scope you cannot verify. Format: `STATUS`, `REASON`, `ATTEMPTED`, `RECOMMENDATION`. ## Operational Self-Improvement -Before completing, reflect on this session: -- Did any commands fail unexpectedly? -- Did you take a wrong approach and have to backtrack? 
-- Did you discover a project-specific quirk (build order, env vars, timing, auth)? -- Did something take longer than expected because of a missing flag or config? - -If yes, log an operational learning for future sessions: +Before completing, if you discovered a durable project quirk or command fix that would save 5+ minutes next time, log it: ```bash ~/.claude/skills/gstack/bin/gstack-learnings-log '{"skill":"SKILL_NAME","type":"operational","key":"SHORT_KEY","insight":"DESCRIPTION","confidence":N,"source":"observed"}' ``` -Replace SKILL_NAME with the current skill name. Only log genuine operational discoveries. -Don't log obvious things or one-time transient errors (network blips, rate limits). -A good test: would knowing this save 5+ minutes in a future session? If yes, log it. +Do not log obvious facts or one-time transient errors. ## Telemetry (run last) -After the skill workflow completes (success, error, or abort), log the telemetry event. -Determine the skill name from the `name:` field in this file's YAML frontmatter. -Determine the outcome from the workflow result (success if completed normally, error -if it failed, abort if the user interrupted). +After workflow completion, log telemetry. Use skill `name:` from frontmatter. OUTCOME is success/error/abort/unknown. **PLAN MODE EXCEPTION — ALWAYS RUN:** This command writes telemetry to -`~/.gstack/analytics/` (user config directory, not project files). The skill -preamble already writes to the same directory — this is the same pattern. -Skipping this command loses session duration and outcome data. +`~/.gstack/analytics/`, matching preamble analytics writes. Run this bash: @@ -549,34 +417,11 @@ if [ "$_TEL" != "off" ] && [ -x ~/.claude/skills/gstack/bin/gstack-telemetry-log fi ``` -Replace `SKILL_NAME` with the actual skill name from frontmatter, `OUTCOME` with -success/error/abort, and `USED_BROWSE` with true/false based on whether `$B` was used. -If you cannot determine the outcome, use "unknown". 
The local JSONL always logs. The -remote binary only runs if telemetry is not off and the binary exists. - -## Plan Mode Safe Operations - -In plan mode, these are always allowed (they inform the plan, don't modify source): -`$B` (browse), `$D` (design), `codex exec`/`codex review`, writes to `~/.gstack/`, -writes to the plan file, `open` for generated artifacts. - -## Skill Invocation During Plan Mode - -If the user invokes a skill in plan mode, that skill takes precedence over generic plan mode behavior. Treat it as executable instructions, not reference. Follow step -by step. AskUserQuestion calls satisfy plan mode's end-of-turn requirement. At a STOP -point, stop immediately. Do not continue the workflow past a STOP point and do not call ExitPlanMode there. Commands marked "PLAN -MODE EXCEPTION — ALWAYS RUN" execute. Other writes need to be already permitted -above or explicitly exception-marked. Call ExitPlanMode only after the skill -workflow completes — only then call ExitPlanMode (or if the user tells you to cancel the skill or leave plan mode). +Replace `SKILL_NAME`, `OUTCOME`, and `USED_BROWSE` before running. ## Plan Status Footer -In plan mode, before ExitPlanMode: if the plan file lacks a `## GSTACK REVIEW REPORT` -section, run `~/.claude/skills/gstack/bin/gstack-review-read` and append a report. -With JSONL entries (before `---CONFIG---`), format the standard runs/status/findings -table. With `NO_REVIEWS` or empty, append a 5-row placeholder table (CEO/Codex/Eng/ -Design/DX Review) with all zeros and verdict "NO REVIEWS YET — run `/autoplan`". -If a richer review report already exists, skip — review skills wrote it. +In plan mode before ExitPlanMode: if the plan file lacks `## GSTACK REVIEW REPORT`, run `~/.claude/skills/gstack/bin/gstack-review-read` and append the standard runs/status/findings table. With `NO_REVIEWS` or empty, append a 5-row placeholder with verdict "NO REVIEWS YET — run `/autoplan`". If a richer report exists, skip. 
PLAN MODE EXCEPTION — always allowed (it's the plan file). @@ -1038,6 +883,7 @@ Refs are invalidated on navigation — run `snapshot` again after `goto`. | `closetab [id]` | Close tab | | `newtab [url] [--json]` | Open new tab. With --json, returns {"tabId":N,"url":...} for programmatic use (make-pdf). | | `tab <id>` | Switch to tab | +| `tab-each <command> [args...]` | Run a command on every open tab. Returns JSON with per-tab results. | | `tabs` | List open tabs | ### Server diff --git a/TODOS.md b/TODOS.md index a6d7fce4..579b8e3f 100644 --- a/TODOS.md +++ b/TODOS.md @@ -159,7 +159,75 @@ **Depends on:** v1.8.0.0 telemetry in production. P1 self-authoring commands. --- +## Sidebar Terminal (cc-pty-import follow-ups) +### v1.1: PTY session survives sidebar reload + +**What:** Today the Terminal tab's PTY dies with the WebSocket — sidebar +reload, side-panel close, even a quick navigate-away in another tab close +the session. v1.1 should key the PTY on a tab/session id so a reload +reattaches to the existing claude process and you keep `/resume` history. + +**Why:** Mid-task resilience. When you've been pair-programming with claude +for 20 minutes and an accidental Cmd-R blows it away, the cost is real. + +**Pros:** Better UX, fewer interrupted sessions. **Cons:** Session-tracking +state, ghost-process risk, lifecycle bugs (when DOES the PTY actually go +away?). v1 chose the simple "PTY dies with WS" model deliberately. + +**Context:** /plan-eng-review Issue 1C decision (cc-pty-import branch, +2026-04-25). v1 ships with phoenix's lifecycle. **Depends on:** +cc-pty-import landed. + +**Priority:** P2 (nice-to-have). +**Effort:** M. Likely needs a per-tab session map keyed by chrome.tabs.id +plus a TTL so abandoned PTYs eventually exit. + +--- + +### v1.1+: Audit `/health` token distribution + +**What:** Codex's outside-voice review on cc-pty-import flagged that +`/health` already surfaces `AUTH_TOKEN` to any localhost caller in headed +mode (`server.ts:1657`). 
That's a pre-existing soft leak — anything +running on localhost gets the root token by hitting `/health`. + +**Why:** cc-pty-import sidesteps it by NOT putting the PTY token there +(uses an HttpOnly cookie path instead). But the underlying leak is still +shippable surface. A second extension or a localhost web app could +currently scrape `AUTH_TOKEN` and hit any browse-server endpoint. + +**Pros:** Closes a real privilege-escalation path on multi-extension +machines. **Cons:** Either we tighten the gate (Origin must be OUR +extension id, not just any chrome-extension://) or we move bootstrap +discovery off `/health` entirely. Either has migration cost for tests +and the existing extension. + +**Context:** codex finding #2 on cc-pty-import plan-eng review. Not in +scope of that PR; deliberately deferred to keep PTY-import small. + +**Priority:** P2. +**Effort:** M. + +--- + +## Testing + +## P1: Structural STOP-Ask forcing function across all skills + +**What:** Design and implement a structural forcing function that catches when a skill mandates per-issue AskUserQuestion but the model silently substitutes batch-synthesis. Candidate mechanisms: question-count assertion (skill declares expected question count in frontmatter; post-run audit logs if model fired ` form (literal space after the scheme name) slipped past the scanner. Added an optional `(Bearer |Basic |Token )?` prefix to the pattern. Validated against 5 positive cases (including the regression fixture) + 3 negative cases (short tokens, non-secret keys, random JSON). The 7-pattern secret scanner now passes all fixtures including bearer-json. +- **Added `test/gstack-brain-init-gh-mock.test.ts`** — 8 tests exercising the `gh` CLI auto-create path that previously had zero coverage. Stubs `gh` on PATH to record every call, asserts `gh repo create --private --description "..." --source ` fires with the computed `gstack-brain-` default name. 
Covers: happy path, fall-through-to-`gh repo view` when create hits already-exists, user-provided-URL-bypasses-gh, gh-not-on-path prompts for URL, gh-not-authed prompts for URL, idempotent `--remote` re-runs, conflicting-remote rejection. +- **Added `test/skill-e2e-brain-privacy-gate.test.ts`** — periodic-tier E2E (~$0.30-$0.50/run). Stages a fake `gbrain` on PATH + `gbrain_sync_mode_prompted=false` in config, runs a real skill via `runAgentSdkTest`, intercepts tool-use via `canUseTool`, and asserts the preamble fires the 3-option privacy AskUserQuestion with canonical prose ("publish session memory" / "artifact" / "decline"). Second test asserts the gate is silent when `prompted=true` (idempotency-within-session). +- **Registered `brain-privacy-gate` in `test/helpers/touchfiles.ts`** (periodic tier) with dependency tracking on `scripts/resolvers/preamble/generate-brain-sync-block.ts`, `bin/gstack-brain-sync`, `bin/gstack-brain-init`, `bin/gstack-config`, and the Agent SDK runner. Diff-based selection will re-run the E2E whenever any of those change. 
+ +**Completed:** v1.12.0.0 (2026-04-24) + +--- + +### Overlay efficacy harness + Opus 4.7 fanout nudge removal (v1.10.1.0) +- Built `test/skill-e2e-overlay-harness.test.ts`, a parametric periodic-tier eval that drives `@anthropic-ai/claude-agent-sdk` and measures first-turn fanout rate (overlay-ON vs overlay-OFF) across registered fixtures +- Measured the original "Fan out explicitly" overlay nudge: baseline Opus 4.7 = 70% first-turn fanout on toy prompt, with our nudge = 10%, with Anthropic's own canonical `<use_parallel_tool_calls>` text = 0% +- Removed the counterproductive nudge from `model-overlays/opus-4-7.md` +- Shipped 36-test free-tier unit suite for the SDK runner + strict fixture validator +- Registered `overlay-harness-opus-4-7-fanout-{toy,realistic}` in E2E_TOUCHFILES and E2E_TIERS +- Total investigation cost: ~$7 across 3 eval runs +**Completed:** v1.10.1.0 + ### CI eval pipeline (v0.9.9.0) - GitHub Actions eval upload on Ubicloud runners ($0.006/run) - Within-file test concurrency (test() → testConcurrentIfSelected()) diff --git a/USING_GBRAIN_WITH_GSTACK.md b/USING_GBRAIN_WITH_GSTACK.md new file mode 100644 index 00000000..f0dfb14c --- /dev/null +++ b/USING_GBRAIN_WITH_GSTACK.md @@ -0,0 +1,291 @@ +# Using GBrain with GStack + +Your coding agent, with a memory it actually keeps. + +[GBrain](https://github.com/garrytan/gbrain) is a persistent knowledge base designed for AI agents. It stores what your agent learns, what you've decided, what worked and what didn't, and lets the agent search all of it on demand. GStack gives you a one-command path from zero to "gbrain is running, and my agent can call it" — with paths for try-it-local, share-with-your-team, and everything between. + +This is the full monty: every scenario, every flag, every helper bin, every troubleshooting step. For the quick pitch, see the [README's GBrain section](README.md#gbrain--persistent-knowledge-for-your-coding-agent). 
For error codes and sync-specific issues, see [docs/gbrain-sync.md](docs/gbrain-sync.md). + +--- + +## The one-command install + +```bash +/setup-gbrain +``` + +That's it. The skill detects your current state, asks three questions at most, and walks you through install, init, MCP registration for Claude Code, and per-repo trust policy. On a clean Mac with nothing installed it finishes in under five minutes. On a Mac where something's already set up it takes seconds (it detects the existing state and skips done work). + +## The three paths + +You pick one when the skill asks "Where should your brain live?" + +### Path 1: Supabase, you already have a connection string + +Best for: you (or a teammate's cloud agent) already provisioned a Supabase brain and you want this local machine to use the same data. + +**What happens:** Paste the Session Pooler URL (Settings → Database → Connection Pooler → Session → copy URI, port 6543). The skill reads it with echo off, shows you a redacted preview (`aws-0-us-east-1.pooler.supabase.com:6543/postgres` — host visible, password masked), hands it to `gbrain init` via the `GBRAIN_DATABASE_URL` environment variable, and the URL is never written to argv or your shell history. + +**Trust warning:** Pasting this URL gives your local Claude Code full read/write access to every page in the shared brain. If that's not the trust level you want, pick PGLite local (Path 3) instead and accept the brains are disjoint. + +### Path 2a: Supabase, auto-provision a new project + +Best for: fresh Supabase account, you want a clean new project with zero clicking. + +**What happens:** You paste a Supabase Personal Access Token (PAT). The skill shows you the scope disclosure first — *the token grants full access to every project in your Supabase account, not just the one we're about to create*. 
It lists your organizations, asks which one and which region (default `us-east-1`), generates a database password, calls `POST /v1/projects`, polls `GET /v1/projects/{ref}` every 5 seconds until the project is `ACTIVE_HEALTHY` (180s timeout), fetches the pooler URL, hands it to `gbrain init`. End-to-end: ~90 seconds. + +At the end: explicit reminder to revoke the PAT at https://supabase.com/dashboard/account/tokens. The skill already discarded it from memory. + +**If you Ctrl-C mid-provision:** The SIGINT trap prints your in-flight project ref + a resume command. You can delete the orphan at the Supabase dashboard, or run `/setup-gbrain --resume-provision <ref>` to pick up where you left off. + +### Path 2b: Supabase, create manually + +Best for: you'd rather click through supabase.com yourself than paste a PAT. + +**What happens:** The skill walks you through the four manual steps (signup → new project → wait ~2 min → copy Session Pooler URL), then takes over from Path 1's paste step. Same security treatment as Path 1. + +### Path 3: PGLite local + +Best for: try-it-first, no account, no cloud, no sharing. Or a dedicated "this Mac's brain" that stays isolated from any cloud agent. + +**What happens:** `gbrain init --pglite`. Brain lives at `~/.gbrain/brain.pglite`. No network calls. Done in 30 seconds. + +This is the best first choice if you just want to see what gbrain feels like before committing to cloud. You can always migrate later with `/setup-gbrain --switch`. + +## MCP registration for Claude Code + +By default the skill asks "Give Claude Code a typed tool surface for gbrain?" If you say yes, it runs: + +```bash +claude mcp add gbrain -- gbrain serve +``` + +That registers gbrain's stdio MCP server with Claude Code. Now `gbrain search`, `gbrain put_page`, `gbrain get_page`, etc. show up as first-class tools in every session, not bash shell-outs. + +**If `claude` is not on PATH**, the skill skips MCP registration gracefully with a manual-register hint. 
The CLI resolver still works from any skill that shells out to `gbrain` — MCP is an upgrade, not a prerequisite. + +**Other local agents** (Cursor, Codex CLI, etc.) need their own MCP registration. The skill is Claude-Code-targeted for v1; other hosts can register `gbrain serve` manually in their own MCP config. + +## Per-remote trust policy (the triad) + +Every repo on your machine gets a policy decision: **read-write**, **read-only**, or **deny**. + +- **read-write** — your agent can `gbrain search` from this repo's context AND write new pages back to the brain. Default for your own projects. +- **read-only** — your agent can search the brain but never writes new pages from this repo's sessions. Ideal for multi-client consultants: search the shared brain, don't contaminate it with Client A's code while you're in Client B's repo. +- **deny** — no gbrain interaction at all. The repo is invisible to gbrain tooling. + +The skill asks once per repo the first time you run a gstack skill there. After that the decision is sticky — every worktree + branch of the same git remote shares the same policy, so you set it once and it follows you. + +SSH and HTTPS remote variants collapse to the same key: `https://github.com/foo/bar.git` and `git@github.com:foo/bar.git` are the same repo. + +**To change a policy:** + +```bash +/setup-gbrain --repo # re-prompt for this repo only + +# Or directly: +~/.claude/skills/gstack/bin/gstack-gbrain-repo-policy set "github.com/foo/bar" read-only +``` + +**To see every policy:** + +```bash +~/.claude/skills/gstack/bin/gstack-gbrain-repo-policy list +``` + +Storage: `~/.gstack/gbrain-repo-policy.json`, mode 0600, schema-versioned so future migrations stay deterministic. + +## Switching engines later + +Picked PGLite and now want to join a team brain? One command: + +```bash +/setup-gbrain --switch +``` + +The skill runs `gbrain migrate --to supabase --url "$URL"` wrapped in `timeout 180s`. 
Migration is bidirectional (Supabase → PGLite also works) and lossless — pages, chunks, embeddings, links, tags, and timeline all copy. Your original brain is preserved as a backup. + +**If migration hangs:** another gstack session may be holding a lock on the source brain. The timeout fires at 3 minutes with an actionable message. Close other workspaces and re-run. + +## GStack memory sync (a separate concern) + +This is different from gbrain itself. Your gstack state (`~/.gstack/` — learnings, plans, retros, timeline, developer profile) is machine-local by default. "GStack memory sync" optionally pushes a curated, secret-scanned subset to a private git repo so your memory follows you across machines — and, if you're running gbrain, that git repo becomes indexable there too. + +Turn it on with: + +```bash +gstack-brain-init +``` + +You'll get a one-time privacy prompt: **everything allowlisted** / **artifacts only** (plans, designs, retros, learnings — skip behavioral data like timelines) / **off**. Every skill run syncs the queue at start and end — no daemon, no background process. + +Secret-shaped content (AWS keys, GitHub tokens, PEM blocks, JWTs, bearer tokens) is blocked from sync before it leaves your machine. + +**On a new machine:** Copy `~/.gstack-brain-remote.txt` over, run `gstack-brain-restore`, and yesterday's learnings surface on today's laptop. + +Full guide: [docs/gbrain-sync.md](docs/gbrain-sync.md). Error index: [docs/gbrain-sync-errors.md](docs/gbrain-sync-errors.md). + +`/setup-gbrain` offers to wire this up for you at the end of initial setup — it's one more AskUserQuestion, and it integrates with the same private-repo infrastructure. 
+ +## Cleanup orphan projects + +If you Ctrl-C'd mid-provision, tried three different names before settling on one, or otherwise accumulated gbrain-shaped Supabase projects you don't use, there's a subcommand for that: + +```bash +/setup-gbrain --cleanup-orphans +``` + +The skill re-collects a PAT (one-time, discarded after), lists every project in your Supabase account whose name starts with `gbrain` and whose ref doesn't match your active `~/.gbrain/config.json` pooler URL. For each orphan it asks per-project: *"Delete orphan project `<name>` (`<ref>`, created `<date>`)?"* — no batching, no "delete all" shortcut. The active brain is never offered for deletion. + +## Command + flag reference + +### `/setup-gbrain` entry modes + +| Invocation | What it does | +|---|---| +| `/setup-gbrain` | Full flow: detect state, pick path, install, init, MCP, policy, optional memory-sync | +| `/setup-gbrain --repo` | Flip the per-remote trust policy for the current repo only | +| `/setup-gbrain --switch` | Migrate engine (PGLite ↔ Supabase) without re-running the other steps | +| `/setup-gbrain --resume-provision <ref>` | Resume a path-2a auto-provision that was interrupted during polling | +| `/setup-gbrain --cleanup-orphans` | List + per-project delete of orphan Supabase projects | + +### Bin helpers (for scripting) + +| Bin | Purpose | +|---|---| +| `gstack-gbrain-detect` | Emit current state as JSON: gbrain on PATH, version, config engine, doctor status, sync mode | +| `gstack-gbrain-install` | Detect-first installer (probes `~/git/gbrain`, `~/gbrain`, then fresh clone). Has `--dry-run` and `--validate-only` flags. PATH-shadow check exits 3 with remediation menu. | +| `gstack-gbrain-lib.sh` | Sourced, not executed. Provides `read_secret_to_env VARNAME "prompt" [--echo-redacted ""]` | +| `gstack-gbrain-supabase-verify` | Structural URL check. Rejects direct-connection URLs (`db.*.supabase.co:5432`) with exit 3 | +| `gstack-gbrain-supabase-provision` | Management API wrapper.
Subcommands: `list-orgs`, `create`, `wait`, `pooler-url`, `list-orphans`, `delete-project`. All require `SUPABASE_ACCESS_TOKEN` in env. `create` and `pooler-url` also require `DB_PASS`. `--json` mode available on every subcommand. | +| `gstack-gbrain-repo-policy` | Per-remote trust triad. Subcommands: `get`, `set`, `list`, `normalize` | + +### gbrain CLI (upstream tool) + +Gbrain itself ships with these that gstack wraps: + +| Command | Purpose | +|---|---| +| `gbrain init --pglite` | Initialize a local PGLite brain | +| `gbrain init --non-interactive` | Initialize via env (`GBRAIN_DATABASE_URL` or `DATABASE_URL`). Never pass a URL as argv — it'll leak to shell history. | +| `gbrain doctor --json` | Health check. Returns `{status: "ok"|"warnings"|"error", health_score: 0-100, checks: [...]}` | +| `gbrain migrate --to supabase --url ...` | Move a PGLite brain to Supabase (lossless, preserves source as backup) | +| `gbrain migrate --to pglite` | Reverse migration | +| `gbrain search "query"` | Search the brain | +| `gbrain put_page --title "..." --tags "a,b" <<<"content"` | Write a page | +| `gbrain get_page ""` | Fetch a page | +| `gbrain serve` | Start the MCP stdio server (used by `claude mcp add`) | + +### Config files + state + +| Path | What lives there | +|---|---| +| `~/.gbrain/config.json` | Engine (pglite/postgres), database URL or path, API keys. Mode 0600. Written by `gbrain init`. | +| `~/.gstack/gbrain-repo-policy.json` | Per-remote trust triad. Schema v2. Mode 0600. | +| `~/.gstack/.setup-gbrain.lock.d` | Concurrent-run lock (atomic mkdir). Released on normal exit + SIGINT. 
| +| `~/.gstack/.brain-queue.jsonl` | Pending sync entries for gstack memory sync | +| `~/.gstack/.brain-last-push` | Timestamp of last sync push (for `/health` scoring) | +| `~/.gstack-brain-remote.txt` | URL of your gstack memory sync remote (safe to copy between machines) | +| `~/.gstack/.setup-gbrain-inflight.json` | Reserved for future `--resume-provision` persisted state | + +### Environment variables + +| Var | Where it's read | What it does | +|---|---|---| +| `SUPABASE_ACCESS_TOKEN` | `gstack-gbrain-supabase-provision` | PAT for Management API calls. Discarded after each setup run. | +| `DB_PASS` | `gstack-gbrain-supabase-provision` (create, pooler-url) | Generated DB password. Never in argv. | +| `GBRAIN_DATABASE_URL` | `gbrain init`, `gbrain doctor`, etc. | Postgres connection string (Supabase pooler URL for us). Env takes precedence over `~/.gbrain/config.json`. | +| `DATABASE_URL` | `gbrain init` (fallback) | Same semantics as `GBRAIN_DATABASE_URL`; checked second. | +| `SUPABASE_API_BASE` | `gstack-gbrain-supabase-provision` | Override the Management API host. Used by tests to point at a mock server. | +| `GBRAIN_INSTALL_DIR` | `gstack-gbrain-install` | Override default install path (`~/gbrain`) | +| `GSTACK_HOME` | every bin helper | Override `~/.gstack` state dir. Heavy test use. | + +## Security model + +One rule for every secret this skill touches: **env var only, never argv, never logged, never written to disk by us.** The only persistent storage is gbrain's own `~/.gbrain/config.json` at mode 0600, which is gbrain's discipline, not ours. 
+ +**Enforced in code:** + +- CI grep test in `test/skill-validation.test.ts` fails the build if `$SUPABASE_ACCESS_TOKEN` or `$GBRAIN_DATABASE_URL` appears in an argv position +- CI grep test fails if `--insecure`, `-k`, or `NODE_TLS_REJECT_UNAUTHORIZED=0` appear in `bin/gstack-gbrain-supabase-provision` +- `set +x` at the top of the provision helper prevents debug tracing from leaking PAT +- Telemetry payload contains only enumerated categorical values (scenario, install result, MCP opt-in, trust tier) — never free-form strings that could contain secrets + +**Enforced via tests:** + +- `test/secret-sink-harness.test.ts` runs every secret-handling bin with a seeded secret and asserts the seed never appears in any captured channel (stdout, stderr, files under `$HOME`, telemetry JSONL). Four match rules per seed: exact, URL-decoded, first-12-char prefix, base64. +- Positive controls in the same test file deliberately leak seeds in every covered channel and assert the harness catches each one. Without the positive controls, a harness that silently under-reports would look identical to a working harness. + +**What you can still leak** (the honest limits of v1): + +- If you paste a secret into a normal chat message outside `read -s`, it's in the conversation transcript and any host-side logging +- The leak harness doesn't dump subprocess environment — a bin that `env >> ~/.log` would evade detection (no bin in v1 does this; grep tests prevent it) +- Your shell's own `HISTFILE` behavior is your shell's, not ours — we never pass secrets to argv so they don't land there via our code, but nothing stops you from pasting one into a raw `curl` command yourself + +## Troubleshooting + +### "PATH SHADOWING DETECTED" during install + +Another `gbrain` binary is earlier in PATH than the one the installer just linked. The installer's version check caught it. 
Fix one of: + +- `rm $(which gbrain)` if you don't need the other one +- Prepend `~/.bun/bin` to PATH in your shell rc so the linked binary wins +- Set `GBRAIN_INSTALL_DIR` to the shadowing binary's install directory and re-run + +Then re-run `/setup-gbrain`. + +### "rejected direct-connection URL" + +You pasted a `db.<ref>.supabase.co:5432` URL. Those are IPv6-only and fail in most environments. Use the Session Pooler URL instead: Supabase dashboard → Settings → Database → Connection Pooler → **Session** → copy URI (port 6543). + +### Auto-provision times out at 180s + +The Supabase project is still initializing. Your ref was printed in the exit message. Wait a minute, then: + +```bash +/setup-gbrain --resume-provision <ref> +``` + +The skill re-collects a PAT, skips project creation, resumes polling. + +### "Another `/setup-gbrain` instance is running" + +You have a stale lock directory. If you're sure no other instance is actually running: + +```bash +rm -rf ~/.gstack/.setup-gbrain.lock.d +``` + +Then re-run. + +### Legacy `allow` values in the policy file + +You edited `~/.gstack/gbrain-repo-policy.json` by hand with legacy `allow` values? No problem. On the next read, gstack auto-migrates `allow` → `read-write` and adds `_schema_version: 2`. One log line on stderr, idempotent, deterministic. + +### `gbrain doctor` says "warnings" + +`/health` treats that as yellow, not red. Check `gbrain doctor --json | jq .checks` to see which sub-checks are warning. Typical causes: resolver MECE overlap (skill names clashing) or DB connection not yet configured. + +### Switching PGLite → Supabase hangs + +Another gstack session in a sibling Conductor workspace may be holding a lock on your local PGLite file via its preamble's `gstack-brain-sync` call. Close other workspaces, re-run `/setup-gbrain --switch`. The timeout is bounded at 180s so you'll never actually wait forever.
+ +## Why this design + +**Why per-remote trust triad and not binary allow/deny?** Multi-client consultants need search without write-back. A freelance dev working on Client A in the morning and Client B in the afternoon can't let A's code insights leak into a brain Client B can search. Read-only solves that cleanly. + +**Why not bundle gbrain into gstack?** Gbrain is a separate, actively-developed project with its own release cadence, schema migrations, and MCP surface. Bundling would mean gstack has to gate gbrain updates, which slows gbrain improvements from reaching users. Separate-but-integrated lets each ship on its own cadence. + +**Why `gbrain init --non-interactive` via env var and not a flag?** Connection strings contain database passwords. Passing them as argv lands the password in `ps`, shell history, and process listings. Env-var handoff keeps the secret in process memory only. Gbrain supports both `GBRAIN_DATABASE_URL` and `DATABASE_URL`; we use the former to avoid collisions with non-gbrain tooling. + +**Why fail-hard on PATH shadowing instead of warn-and-continue?** A shadowed `gbrain` means every subsequent command calls a different binary than the one we just installed. That's a silent version-drift bug that surfaces as mysterious feature gaps weeks later. Setup skills have one job — set up a working environment. Refusing to install into a broken one is the setup-skill-correct behavior. + +**Why not auto-import every repo?** Privacy + noise. An auto-import preamble hook that ingests every repo you touch would: (a) leak work code into a shared brain without consent, and (b) clog search with throwaway repos. The per-remote policy makes ingestion an explicit, per-repo decision. `/setup-gbrain` doesn't install any auto-import hook today — but the policy store is forward-compatible for one later. + +## Related skills + next steps + +- `/health` — includes a GBrain dimension (doctor status, sync queue depth, last-push age) in its 0-10 composite score. 
The dimension is omitted when gbrain isn't installed; running `/health` on a non-gbrain machine doesn't penalize that choice. +- `/gstack-upgrade` — keeps gstack itself up to date. Does NOT upgrade gbrain independently. To bump gbrain, update `PINNED_COMMIT` in `bin/gstack-gbrain-install` and re-run `/setup-gbrain`. +- `/retro` — weekly retrospective pulls learnings and plans from your gbrain when memory sync is on, letting the retro reference cross-machine history. + +Run `/setup-gbrain` and see what sticks. diff --git a/autoplan/SKILL.md b/autoplan/SKILL.md index c4ceeee9..6a8ad3b2 100644 --- a/autoplan/SKILL.md +++ b/autoplan/SKILL.md @@ -58,19 +58,15 @@ _TEL_START=$(date +%s) _SESSION_ID="$$-$(date +%s)" echo "TELEMETRY: ${_TEL:-off}" echo "TEL_PROMPTED: $_TEL_PROMPTED" -# Writing style verbosity (V1: default = ELI10, terse = tighter V0 prose. -# Read on every skill run so terse mode takes effect without a restart.) _EXPLAIN_LEVEL=$(~/.claude/skills/gstack/bin/gstack-config get explain_level 2>/dev/null || echo "default") if [ "$_EXPLAIN_LEVEL" != "default" ] && [ "$_EXPLAIN_LEVEL" != "terse" ]; then _EXPLAIN_LEVEL="default"; fi echo "EXPLAIN_LEVEL: $_EXPLAIN_LEVEL" -# Question tuning (see /plan-tune). Observational only in V1. 
_QUESTION_TUNING=$(~/.claude/skills/gstack/bin/gstack-config get question_tuning 2>/dev/null || echo "false") echo "QUESTION_TUNING: $_QUESTION_TUNING" mkdir -p ~/.gstack/analytics if [ "$_TEL" != "off" ]; then echo '{"skill":"autoplan","ts":"'$(date -u +%Y-%m-%dT%H:%M:%SZ)'","repo":"'$(basename "$(git rev-parse --show-toplevel 2>/dev/null)" 2>/dev/null || echo "unknown")'"}' >> ~/.gstack/analytics/skill-usage.jsonl 2>/dev/null || true fi -# zsh-compatible: use find instead of glob to avoid NOMATCH error for _PF in $(find ~/.gstack/analytics -maxdepth 1 -name '.pending-*' 2>/dev/null); do if [ -f "$_PF" ]; then if [ "$_TEL" != "off" ] && [ -x "~/.claude/skills/gstack/bin/gstack-telemetry-log" ]; then @@ -80,7 +76,6 @@ for _PF in $(find ~/.gstack/analytics -maxdepth 1 -name '.pending-*' 2>/dev/null fi break done -# Learnings count eval "$(~/.claude/skills/gstack/bin/gstack-slug 2>/dev/null)" 2>/dev/null || true _LEARN_FILE="${GSTACK_HOME:-$HOME/.gstack}/projects/${SLUG:-unknown}/learnings.jsonl" if [ -f "$_LEARN_FILE" ]; then @@ -92,9 +87,7 @@ if [ -f "$_LEARN_FILE" ]; then else echo "LEARNINGS: 0" fi -# Session timeline: record skill start (local-only, never sent anywhere) ~/.claude/skills/gstack/bin/gstack-timeline-log '{"skill":"autoplan","event":"started","branch":"'"$_BRANCH"'","session":"'"$_SESSION_ID"'"}' 2>/dev/null & -# Check if CLAUDE.md has routing rules _HAS_ROUTING="no" if [ -f CLAUDE.md ] && grep -q "## Skill routing" CLAUDE.md 2>/dev/null; then _HAS_ROUTING="yes" @@ -102,7 +95,6 @@ fi _ROUTING_DECLINED=$(~/.claude/skills/gstack/bin/gstack-config get routing_declined 2>/dev/null || echo "false") echo "HAS_ROUTING: $_HAS_ROUTING" echo "ROUTING_DECLINED: $_ROUTING_DECLINED" -# Vendoring deprecation: detect if CWD has a vendored gstack copy _VENDORED="no" if [ -d ".claude/skills/gstack" ] && [ ! 
-L ".claude/skills/gstack" ]; then if [ -f ".claude/skills/gstack/VERSION" ] || [ -d ".claude/skills/gstack/.git" ]; then @@ -111,66 +103,38 @@ if [ -d ".claude/skills/gstack" ] && [ ! -L ".claude/skills/gstack" ]; then fi echo "VENDORED_GSTACK: $_VENDORED" echo "MODEL_OVERLAY: claude" -# Checkpoint mode (explicit = no auto-commit, continuous = WIP commits as you go) _CHECKPOINT_MODE=$(~/.claude/skills/gstack/bin/gstack-config get checkpoint_mode 2>/dev/null || echo "explicit") _CHECKPOINT_PUSH=$(~/.claude/skills/gstack/bin/gstack-config get checkpoint_push 2>/dev/null || echo "false") echo "CHECKPOINT_MODE: $_CHECKPOINT_MODE" echo "CHECKPOINT_PUSH: $_CHECKPOINT_PUSH" -# Detect spawned session (OpenClaw or other orchestrator) [ -n "$OPENCLAW_SESSION" ] && echo "SPAWNED_SESSION: true" || true ``` -If `PROACTIVE` is `"false"`, do not proactively suggest gstack skills AND do not -auto-invoke skills based on conversation context. Only run skills the user explicitly -types (e.g., /qa, /ship). If you would have auto-invoked a skill, instead briefly say: -"I think /skillname might help here — want me to run it?" and wait for confirmation. -The user opted out of proactive behavior. +## Plan Mode Safe Operations -If `SKILL_PREFIX` is `"true"`, the user has namespaced skill names. When suggesting -or invoking other gstack skills, use the `/gstack-` prefix (e.g., `/gstack-qa` instead -of `/qa`, `/gstack-ship` instead of `/ship`). Disk paths are unaffected — always use -`~/.claude/skills/gstack/[skill-name]/SKILL.md` for reading skill files. +In plan mode, allowed because they inform the plan: `$B`, `$D`, `codex exec`/`codex review`, writes to `~/.gstack/`, writes to the plan file, and `open` for generated artifacts. + +## Skill Invocation During Plan Mode + +If the user invokes a skill in plan mode, the skill takes precedence over generic plan mode behavior. 
**Treat the skill file as executable instructions, not reference.** Follow it step by step starting from Step 0; the first AskUserQuestion is the workflow entering plan mode, not a violation of it. AskUserQuestion satisfies plan mode's end-of-turn requirement. At a STOP point, stop immediately. Do not continue the workflow or call ExitPlanMode there. Commands marked "PLAN MODE EXCEPTION — ALWAYS RUN" execute. Call ExitPlanMode only after the skill workflow completes, or if the user tells you to cancel the skill or leave plan mode. + +If `PROACTIVE` is `"false"`, do not auto-invoke or proactively suggest skills. If a skill seems useful, ask: "I think /skillname might help here — want me to run it?" + +If `SKILL_PREFIX` is `"true"`, suggest/invoke `/gstack-*` names. Disk paths stay `~/.claude/skills/gstack/[skill-name]/SKILL.md`. If output shows `UPGRADE_AVAILABLE `: read `~/.claude/skills/gstack/gstack-upgrade/SKILL.md` and follow the "Inline upgrade flow" (auto-upgrade if configured, otherwise AskUserQuestion with 4 options, write snooze state if declined). -If output shows `JUST_UPGRADED ` AND `SPAWNED_SESSION` is NOT set: tell -the user "Running gstack v{to} (just updated!)" and then check for new features to -surface. For each per-feature marker below, if the marker file is missing AND the -feature is plausibly useful for this user, use AskUserQuestion to let them try it. -Fire once per feature per user, NOT once per upgrade. +If output shows `JUST_UPGRADED `: print "Running gstack v{to} (just updated!)". If `SPAWNED_SESSION` is true, skip feature discovery. -**In spawned sessions (`SPAWNED_SESSION` = "true"): SKIP feature discovery entirely.** -Just print "Running gstack v{to}" and continue. Orchestrators do not want interactive -prompts from sub-sessions. +Feature discovery, max one prompt per session: +- Missing `~/.claude/skills/gstack/.feature-prompted-continuous-checkpoint`: AskUserQuestion for Continuous checkpoint auto-commits. 
If accepted, run `~/.claude/skills/gstack/bin/gstack-config set checkpoint_mode continuous`. Always touch marker. +- Missing `~/.claude/skills/gstack/.feature-prompted-model-overlay`: inform "Model overlays are active. MODEL_OVERLAY shows the patch." Always touch marker. -**Feature discovery markers and prompts** (one at a time, max one per session): +After upgrade prompts, continue workflow. -1. `~/.claude/skills/gstack/.feature-prompted-continuous-checkpoint` → - Prompt: "Continuous checkpoint auto-commits your work as you go with `WIP:` prefix - so you never lose progress to a crash. Local-only by default — doesn't push - anywhere unless you turn that on. Want to try it?" - Options: A) Enable continuous mode, B) Show me first (print the section from - the preamble Continuous Checkpoint Mode), C) Skip. - If A: run `~/.claude/skills/gstack/bin/gstack-config set checkpoint_mode continuous`. - Always: `touch ~/.claude/skills/gstack/.feature-prompted-continuous-checkpoint` +If `WRITING_STYLE_PENDING` is `yes`: ask once about writing style: -2. `~/.claude/skills/gstack/.feature-prompted-model-overlay` → - Inform only (no prompt): "Model overlays are active. `MODEL_OVERLAY: {model}` - shown in the preamble output tells you which behavioral patch is applied. - Override with `--model` when regenerating skills (e.g., `bun run gen:skill-docs - --model gpt-5.4`). Default is claude." - Always: `touch ~/.claude/skills/gstack/.feature-prompted-model-overlay` - -After handling JUST_UPGRADED (prompts done or skipped), continue with the skill -workflow. - -If `WRITING_STYLE_PENDING` is `yes`: You're on the first skill run after upgrading -to gstack v1. Ask the user once about the new default writing style. Use AskUserQuestion: - -> v1 prompts = simpler. Technical terms get a one-sentence gloss on first use, -> questions are framed in outcome terms, sentences are shorter. -> -> Keep the new default, or prefer the older tighter prose? 
+> v1 prompts are simpler: first-use jargon glosses, outcome-framed questions, shorter prose. Keep default or restore terse? Options: - A) Keep the new default (recommended — good writing helps everyone) @@ -185,27 +149,20 @@ rm -f ~/.gstack/.writing-style-prompt-pending touch ~/.gstack/.writing-style-prompted ``` -This only happens once. If `WRITING_STYLE_PENDING` is `no`, skip this entirely. +Skip if `WRITING_STYLE_PENDING` is `no`. -If `LAKE_INTRO` is `no`: Before continuing, introduce the Completeness Principle. -Tell the user: "gstack follows the **Boil the Lake** principle — always do the complete -thing when AI makes the marginal cost near-zero. Read more: https://garryslist.org/posts/boil-the-ocean" -Then offer to open the essay in their default browser: +If `LAKE_INTRO` is `no`: say "gstack follows the **Boil the Lake** principle — do the complete thing when AI makes marginal cost near-zero. Read more: https://garryslist.org/posts/boil-the-ocean" Offer to open: ```bash open https://garryslist.org/posts/boil-the-ocean touch ~/.gstack/.completeness-intro-seen ``` -Only run `open` if the user says yes. Always run `touch` to mark as seen. This only happens once. +Only run `open` if yes. Always run `touch`. -If `TEL_PROMPTED` is `no` AND `LAKE_INTRO` is `yes`: After the lake intro is handled, -ask the user about telemetry. Use AskUserQuestion: +If `TEL_PROMPTED` is `no` AND `LAKE_INTRO` is `yes`: ask telemetry once via AskUserQuestion: -> Help gstack get better! Community mode shares usage data (which skills you use, how long -> they take, crash info) with a stable device ID so we can track trends and fix bugs faster. -> No code, file paths, or repo names are ever sent. -> Change anytime with `gstack-config set telemetry off`. +> Help gstack get better. Share usage data only: skill, duration, crashes, stable device ID. No code, file paths, or repo names. Options: - A) Help gstack get better! 
(recommended) @@ -213,10 +170,9 @@ Options: If A: run `~/.claude/skills/gstack/bin/gstack-config set telemetry community` -If B: ask a follow-up AskUserQuestion: +If B: ask follow-up: -> How about anonymous mode? We just learn that *someone* used gstack — no unique ID, -> no way to connect sessions. Just a counter that helps us know if anyone's out there. +> Anonymous mode sends only aggregate usage, no unique ID. Options: - A) Sure, anonymous is fine @@ -230,14 +186,11 @@ Always run: touch ~/.gstack/.telemetry-prompted ``` -This only happens once. If `TEL_PROMPTED` is `yes`, skip this entirely. +Skip if `TEL_PROMPTED` is `yes`. -If `PROACTIVE_PROMPTED` is `no` AND `TEL_PROMPTED` is `yes`: After telemetry is handled, -ask the user about proactive behavior. Use AskUserQuestion: +If `PROACTIVE_PROMPTED` is `no` AND `TEL_PROMPTED` is `yes`: ask once: -> gstack can proactively figure out when you might need a skill while you work — -> like suggesting /qa when you say "does this work?" or /investigate when you hit -> a bug. We recommend keeping this on — it speeds up every part of your workflow. +> Let gstack proactively suggest skills, like /qa for "does this work?" or /investigate for bugs? Options: - A) Keep it on (recommended) @@ -251,7 +204,7 @@ Always run: touch ~/.gstack/.proactive-prompted ``` -This only happens once. If `PROACTIVE_PROMPTED` is `yes`, skip this entirely. +Skip if `PROACTIVE_PROMPTED` is `yes`. If `HAS_ROUTING` is `no` AND `ROUTING_DECLINED` is `false` AND `PROACTIVE_PROMPTED` is `yes`: Check if a CLAUDE.md file exists in the project root. If it does not exist, create it. @@ -259,8 +212,6 @@ Check if a CLAUDE.md file exists in the project root. If it does not exist, crea Use AskUserQuestion: > gstack works best when your project's CLAUDE.md includes skill routing rules. -> This tells Claude to use specialized workflows (like /ship, /investigate, /qa) -> instead of answering directly. It's a one-time addition, about 15 lines. 
Options: - A) Add routing rules to CLAUDE.md (recommended) @@ -272,63 +223,33 @@ If A: Append this section to the end of CLAUDE.md: ## Skill routing -When the user's request matches an available skill, invoke it via the Skill tool. The -skill has multi-step workflows, checklists, and quality gates that produce better -results than an ad-hoc answer. When in doubt, invoke the skill. A false positive is -cheaper than a false negative. +When the user's request matches an available skill, invoke it via the Skill tool. When in doubt, invoke the skill. Key routing rules: -- Product ideas, "is this worth building", brainstorming → invoke /office-hours -- Strategy, scope, "think bigger", "what should we build" → invoke /plan-ceo-review -- Architecture, "does this design make sense" → invoke /plan-eng-review -- Design system, brand, "how should this look" → invoke /design-consultation -- Design review of a plan → invoke /plan-design-review -- Developer experience of a plan → invoke /plan-devex-review -- "Review everything", full review pipeline → invoke /autoplan -- Bugs, errors, "why is this broken", "wtf", "this doesn't work" → invoke /investigate -- Test the site, find bugs, "does this work" → invoke /qa (or /qa-only for report only) -- Code review, check the diff, "look at my changes" → invoke /review -- Visual polish, design audit, "this looks off" → invoke /design-review -- Developer experience audit, try onboarding → invoke /devex-review -- Ship, deploy, create a PR, "send it" → invoke /ship -- Merge + deploy + verify → invoke /land-and-deploy -- Configure deployment → invoke /setup-deploy -- Post-deploy monitoring → invoke /canary -- Update docs after shipping → invoke /document-release -- Weekly retro, "how'd we do" → invoke /retro -- Second opinion, codex review → invoke /codex -- Safety mode, careful mode, lock it down → invoke /careful or /guard -- Restrict edits to a directory → invoke /freeze or /unfreeze -- Upgrade gstack → invoke /gstack-upgrade -- Save 
progress, "save my work" → invoke /context-save -- Resume, restore, "where was I" → invoke /context-restore -- Security audit, OWASP, "is this secure" → invoke /cso -- Make a PDF, document, publication → invoke /make-pdf -- Launch real browser for QA → invoke /open-gstack-browser -- Import cookies for authenticated testing → invoke /setup-browser-cookies -- Performance regression, page speed, benchmarks → invoke /benchmark -- Review what gstack has learned → invoke /learn -- Tune question sensitivity → invoke /plan-tune -- Code quality dashboard → invoke /health +- Product ideas/brainstorming → invoke /office-hours +- Strategy/scope → invoke /plan-ceo-review +- Architecture → invoke /plan-eng-review +- Design system/plan review → invoke /design-consultation or /plan-design-review +- Full review pipeline → invoke /autoplan +- Bugs/errors → invoke /investigate +- QA/testing site behavior → invoke /qa or /qa-only +- Code review/diff check → invoke /review +- Visual polish → invoke /design-review +- Ship/deploy/PR → invoke /ship or /land-and-deploy +- Save progress → invoke /context-save +- Resume context → invoke /context-restore ``` Then commit the change: `git add CLAUDE.md && git commit -m "chore: add gstack skill routing rules to CLAUDE.md"` -If B: run `~/.claude/skills/gstack/bin/gstack-config set routing_declined true` -Say "No problem. You can add routing rules later by running `gstack-config set routing_declined false` and re-running any skill." +If B: run `~/.claude/skills/gstack/bin/gstack-config set routing_declined true` and say they can re-enable with `gstack-config set routing_declined false`. -This only happens once per project. If `HAS_ROUTING` is `yes` or `ROUTING_DECLINED` is `true`, skip this entirely. +This only happens once per project. Skip if `HAS_ROUTING` is `yes` or `ROUTING_DECLINED` is `true`. -If `VENDORED_GSTACK` is `yes`: This project has a vendored copy of gstack at -`.claude/skills/gstack/`. Vendoring is deprecated. 
We will not keep vendored copies -up to date, so this project's gstack will fall behind. - -Use AskUserQuestion (one-time per project, check for `~/.gstack/.vendoring-warned-$SLUG` marker): +If `VENDORED_GSTACK` is `yes`, warn once via AskUserQuestion unless `~/.gstack/.vendoring-warned-$SLUG` exists: > This project has gstack vendored in `.claude/skills/gstack/`. Vendoring is deprecated. -> We won't keep this copy up to date, so you'll fall behind on new features and fixes. -> -> Want to migrate to team mode? It takes about 30 seconds. +> Migrate to team mode? Options: - A) Yes, migrate to team mode now @@ -349,7 +270,7 @@ eval "$(~/.claude/skills/gstack/bin/gstack-slug 2>/dev/null)" 2>/dev/null || tru touch ~/.gstack/.vendoring-warned-${SLUG:-unknown} ``` -This only happens once per project. If the marker file exists, skip entirely. +If marker exists, skip. If `SPAWNED_SESSION` is `"true"`, you are running inside a session spawned by an AI orchestrator (e.g., OpenClaw). In spawned sessions: @@ -358,13 +279,58 @@ AI orchestrator (e.g., OpenClaw). In spawned sessions: - Focus on completing the task and reporting results via prose output. - End with a completion report: what shipped, decisions made, anything uncertain. +## AskUserQuestion Format + +Every AskUserQuestion is a decision brief and must be sent as tool_use, not prose. + +``` +D +Project/branch/task: <1 short grounding sentence using _BRANCH> +ELI10: +Stakes if we pick wrong: +Recommendation: because +Completeness: A=X/10, B=Y/10 (or: Note: options differ in kind, not coverage — no completeness score) +Pros / cons: +A)