diff --git a/.github/workflows/pr-title-sync.yml b/.github/workflows/pr-title-sync.yml new file mode 100644 index 00000000..023f5f66 --- /dev/null +++ b/.github/workflows/pr-title-sync.yml @@ -0,0 +1,64 @@ +name: PR Title Sync + +on: + pull_request: + types: [opened, synchronize, edited] + paths: + - 'VERSION' + +concurrency: + group: pr-title-sync-${{ github.event.pull_request.number }} + cancel-in-progress: true + +jobs: + sync: + name: Sync PR title to VERSION + runs-on: ubuntu-latest + permissions: + contents: read + pull-requests: write + if: github.actor != 'github-actions[bot]' + steps: + - name: Checkout PR head + uses: actions/checkout@v4 + with: + fetch-depth: 1 + ref: ${{ github.event.pull_request.head.sha }} + + - name: Read VERSION + current title + id: inspect + run: | + set -euo pipefail + VERSION=$(cat VERSION | tr -d '[:space:]') + TITLE=$(jq -r '.pull_request.title' "$GITHUB_EVENT_PATH") + echo "version=$VERSION" >> "$GITHUB_OUTPUT" + # Only rewrite titles that ALREADY follow the v prefix pattern. + # Custom titles (no prefix) are left alone — user kept them intentionally. + if printf '%s' "$TITLE" | grep -qE '^v[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+ '; then + PREFIX=$(printf '%s' "$TITLE" | awk '{print $1}') + REST=$(printf '%s' "$TITLE" | sed 's/^v[0-9][0-9.]* //') + { + echo "prefix=$PREFIX" + echo "rest=$REST" + echo "eligible=true" + } >> "$GITHUB_OUTPUT" + else + echo "eligible=false" >> "$GITHUB_OUTPUT" + fi + + - name: Rewrite title if version changed + if: steps.inspect.outputs.eligible == 'true' + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + PR_NUM: ${{ github.event.pull_request.number }} + NEW_V: ${{ steps.inspect.outputs.version }} + OLD_PREFIX: ${{ steps.inspect.outputs.prefix }} + REST: ${{ steps.inspect.outputs.rest }} + run: | + if [ "v$NEW_V" = "$OLD_PREFIX" ]; then + echo "Title already matches v$NEW_V; no change." + exit 0 + fi + NEW_TITLE="v$NEW_V $REST" + echo "Rewriting: $OLD_PREFIX ... → v$NEW_V ..." + gh pr edit "$PR_NUM" --title "$NEW_TITLE" diff --git a/.github/workflows/version-gate.yml b/.github/workflows/version-gate.yml new file mode 100644 index 00000000..262baf6e --- /dev/null +++ b/.github/workflows/version-gate.yml @@ -0,0 +1,74 @@ +name: Version Gate + +on: + pull_request: + paths: + - 'VERSION' + - 'CHANGELOG.md' + - 'package.json' + +concurrency: + group: version-gate-${{ github.event.pull_request.number }} + cancel-in-progress: true + +jobs: + check: + name: Check VERSION is not stale vs queue + runs-on: ubuntu-latest + permissions: + contents: read + pull-requests: read + steps: + - name: Checkout PR head + uses: actions/checkout@v4 + with: + fetch-depth: 0 + ref: ${{ github.event.pull_request.head.sha }} + + - name: Setup Bun + uses: oven-sh/setup-bun@v2 + + - name: Read versions + id: versions + run: | + set -euo pipefail + PR_VERSION=$(cat VERSION | tr -d '[:space:]') + BASE_REF="${{ github.event.pull_request.base.ref }}" + git fetch origin "$BASE_REF" --depth=1 --quiet || true + BASE_VERSION=$(git show "origin/$BASE_REF:VERSION" 2>/dev/null | tr -d '[:space:]' || echo "0.0.0.0") + { + echo "pr_version=$PR_VERSION" + echo "base_version=$BASE_VERSION" + echo "base_ref=$BASE_REF" + } >> "$GITHUB_OUTPUT" + + - name: Detect bump level + id: bump + run: | + LEVEL=$(bun run scripts/detect-bump.ts "${{ steps.versions.outputs.base_version }}" "${{ steps.versions.outputs.pr_version }}") + echo "level=$LEVEL" >> "$GITHUB_OUTPUT" + + - name: Query queue (util) — fail-open on error + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + set +e + bun run bin/gstack-next-version \ + --base "${{ steps.versions.outputs.base_ref }}" \ + --bump "${{ steps.bump.outputs.level }}" \ + --current-version "${{ steps.versions.outputs.base_version }}" \ + --workspace-root null \ + --exclude-pr "${{ github.event.pull_request.number }}" \ + > next.json 2> next.err + RC=$? + if [ "$RC" != "0" ] || [ ! -s next.json ]; then + echo '{"offline":true}' > next.json + echo "::warning::util exit=$RC — failing open. stderr:" + cat next.err || true + fi + + - name: Compare PR VERSION to next free slot + env: + PR_VERSION: ${{ steps.versions.outputs.pr_version }} + run: | + bun run scripts/compare-pr-version.ts next.json "${{ github.event.pull_request.number }}" diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml new file mode 100644 index 00000000..7e5e1fa3 --- /dev/null +++ b/.gitlab-ci.yml @@ -0,0 +1,72 @@ +# GitLab CI parity for workspace-aware ship. +# Mirrors .github/workflows/version-gate.yml and pr-title-sync.yml. +# Projects that mirror to GitLab get the same protection as GitHub. + +stages: + - check + +variables: + BUN_VERSION: "1.3.10" + +.setup-bun: &setup-bun + - apt-get update -qq && apt-get install -qq -y curl jq git + - curl -fsSL https://bun.sh/install | bash -s "bun-v$BUN_VERSION" + - export PATH="$HOME/.bun/bin:$PATH" + +version-gate: + stage: check + image: debian:stable-slim + rules: + - if: '$CI_PIPELINE_SOURCE == "merge_request_event"' + changes: + - VERSION + - CHANGELOG.md + - package.json + script: + - *setup-bun + - PR_VERSION=$(cat VERSION | tr -d '[:space:]') + - BASE_VERSION=$(git show "origin/$CI_MERGE_REQUEST_TARGET_BRANCH_NAME:VERSION" 2>/dev/null | tr -d '[:space:]' || echo "0.0.0.0") + - LEVEL=$(bun run scripts/detect-bump.ts "$BASE_VERSION" "$PR_VERSION") + # Util fail-open: on non-zero exit, emit offline marker + - | + set +e + bun run bin/gstack-next-version \ + --base "$CI_MERGE_REQUEST_TARGET_BRANCH_NAME" \ + --bump "$LEVEL" \ + --current-version "$BASE_VERSION" \ + --workspace-root null \ + --exclude-pr "$CI_MERGE_REQUEST_IID" \ + > next.json + RC=$? + if [ "$RC" != "0" ] || [ ! -s next.json ]; then + echo '{"offline":true}' > next.json + echo "WARNING: util exit=$RC — failing open" + fi + set -e + - PR_VERSION="$PR_VERSION" bun run scripts/compare-pr-version.ts next.json "$CI_MERGE_REQUEST_IID" + +pr-title-sync: + stage: check + image: debian:stable-slim + rules: + - if: '$CI_PIPELINE_SOURCE == "merge_request_event"' + changes: + - VERSION + script: + - apt-get update -qq && apt-get install -qq -y curl jq git + - curl -fsSL https://gitlab.com/gitlab-org/cli/-/releases/permalink/latest/downloads/glab_linux_amd64.deb -o glab.deb && dpkg -i glab.deb + - VERSION=$(cat VERSION | tr -d '[:space:]') + - TITLE="$CI_MERGE_REQUEST_TITLE" + - | + if printf '%s' "$TITLE" | grep -qE '^v[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+ '; then + PREFIX=$(printf '%s' "$TITLE" | awk '{print $1}') + REST=$(printf '%s' "$TITLE" | sed 's/^v[0-9][0-9.]* //') + if [ "v$VERSION" != "$PREFIX" ]; then + echo "Rewriting: $PREFIX ... → v$VERSION ..." + glab mr update "$CI_MERGE_REQUEST_IID" -t "v$VERSION $REST" + else + echo "Title already matches v$VERSION; no change." + fi + else + echo "Title does not use v prefix — leaving alone." + fi diff --git a/CHANGELOG.md b/CHANGELOG.md index 27c832ca..9212bdd0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,312 @@ # Changelog +## [1.11.1.0] - 2026-04-23 + +## **Plan mode stopped silently rubber-stamping your reviews. The forcing questions actually fire now.** + +If you ran `/plan-ceo-review` or any interactive review skill while in plan mode, the skill used to read your diff, skip every STOP gate, write a plan file, and exit. Zero AskUserQuestion calls. Zero mode selection. Zero per-section decisions. The skill's interactive contract got outranked by plan mode's system-reminder, which tells the model to run its own workflow and ignore everything else. This release adds a preamble-level STOP gate that fires before any analysis, so you always get the interactive review the skill was designed to run. + +### What shipped + +Four interactive review skills (plan-ceo-review, plan-eng-review, plan-design-review, plan-devex-review) now emit a two-option AskUserQuestion the moment plan mode is detected: exit-and-rerun interactively, or cancel. No silent bypass. The gate is classified one-way-door in the question registry so `/plan-tune` preferences can't auto-decide past it. Outcome gets logged to `~/.gstack/analytics/skill-usage.jsonl` synchronously when the handshake fires, so A-exit and C-cancel are captured even though they terminate the skill before the end-of-run telemetry block. + +The test harness got a canUseTool extension built on Anthropic's Agent SDK (already installed at v0.2.117). When a test supplies a canUseTool callback, `test/helpers/agent-sdk-runner.ts` flips `permissionMode` from `bypassPermissions` to `default` so the callback actually fires. This is the foundation for asserting AskUserQuestion content end-to-end, which gstack's E2E tests previously couldn't do at all. They had to instruct the model to skip AskUserQuestion entirely. Every future interactive-skill test builds on this. + +### The numbers that matter + +Source: new unit tests in `test/gen-skill-docs.test.ts` (8 tests covering handshake presence, absence, composition ordering, 0C-bis STOP block) and `test/agent-sdk-runner.test.ts` (6 tests covering canUseTool + permission-mode + passThrough helper). All 14 pass locally in <250ms, free tier. + +| Surface | Before | After | +|---|---|---| +| Claude skills rendering the handshake | 0 | 4 (plan-ceo, plan-eng, plan-design, plan-devex) | +| Non-Claude host outputs with handshake text | N/A | 0 (host-scoped via `ctx.host === 'claude'` check) | +| E2E tests that can assert AskUserQuestion content | 0 | 1 harness primitive, ready for every interactive skill | +| Plan-mode entry to any of 4 review skills | Silent bypass | Two-option STOP gate | +| Step 0C-bis in plan-ceo-review | No STOP block, could drift to 0F | Explicit `**STOP.**` block matching 0F pattern | +| Post-handshake telemetry outcomes captured | Neither A-exit nor C-cancel | Both (synchronous write before ExitPlanMode) | + +### What this means for builders + +If you're running gstack in plan mode on a PR review, you'll see one question before the skill does anything: "Exit plan mode and run interactively, or cancel?" Pick A, press esc-esc, rerun the skill in normal mode, get the full interactive review you expected. Pick C to bail cleanly. No more silent rubber-stamp. + +If you're building new interactive skills (yours or contributing to gstack), you can now write real E2E tests that assert on AskUserQuestion shape and routing via the canUseTool harness. See `test/agent-sdk-runner.test.ts` for the pattern and `test/helpers/agent-sdk-runner.ts` for the API. + +### Itemized changes + +#### Fixed + +- Plan mode no longer silently skips AskUserQuestion gates in `/plan-ceo-review`, `/plan-eng-review`, `/plan-design-review`, or `/plan-devex-review`. A preamble-level handshake fires as the first thing the skill does when the plan-mode system-reminder is present, forcing a user choice before any analysis or plan-file writes. +- `/plan-ceo-review` Step 0C-bis now has an explicit STOP block matching the pattern used at Step 0F, so the approach-selection question can't be silently skipped when the skill continues to mode selection. + +#### Added + +- New resolver `scripts/resolvers/preamble/generate-plan-mode-handshake.ts` emits the handshake prose and telemetry bash. Host-scoped to Claude only via `ctx.host === 'claude'` check. Opt-in per skill via `interactive: true` in frontmatter. +- New frontmatter field `interactive: boolean` on skill templates. Generator-only input parsed by `scripts/gen-skill-docs.ts`, never written to generated SKILL.md output (follows the `preamble-tier` precedent). +- New question registry entries `plan-{ceo,eng,design,devex}-review-plan-mode-handshake` with `door_type: 'one-way'` in `scripts/question-registry.ts`. Question-tuning `never-ask` preferences cannot suppress this gate. +- New telemetry field `plan_mode_handshake` in `~/.gstack/analytics/skill-usage.jsonl` with outcomes `fired`, `A-exit`, `C-cancel` written synchronously as the handshake fires. Captures outcomes that would otherwise terminate the skill before end-of-run telemetry runs. +- `test/helpers/agent-sdk-runner.ts` extended with optional `canUseTool` callback parameter. When supplied, flips `permissionMode` to `default`, auto-adds `AskUserQuestion` to `allowedTools`, and passes the callback to the SDK. Exports `passThroughNonAskUserQuestion` helper for tests that only want to assert on AskUserQuestion but auto-allow other tools. + +#### For contributors + +- Added 5 unit tests in `test/gen-skill-docs.test.ts` verifying handshake presence in 4 interactive skills, absence in non-interactive skills, absence in non-Claude host outputs, composition ordering (handshake precedes upgrade-check), and 0C-bis STOP block wiring. +- Added 6 unit tests in `test/agent-sdk-runner.test.ts` verifying permission-mode flip, allowedTools auto-injection, canUseTool callback propagation, and pass-through helper behavior. +- Added 6 gate-tier entries to `test/helpers/touchfiles.ts` covering the new E2E test surface. Dependency glob fires any of the new tests when: the relevant skill template, the handshake resolver, preamble composition, the question registry, the one-way-door classifier, or the agent-sdk-runner changes. +- Filed 2 P1/P2 follow-ups in `TODOS.md`: structural STOP-Ask forcing function across all skills (broader class of bug beyond plan-mode entry), and extending `interactive: true` audit to non-review interactive skills like `/office-hours`, `/codex`, `/investigate`, `/qa`. + +## [1.11.0.0] - 2026-04-23 + +## **Workspace-aware ship. Two open PRs can't both claim the same VERSION anymore.** + +If you run gstack in multiple Conductor windows at once, you've probably seen this: two branches bump to the same version, whoever merges second silently overwrites the first one's CHANGELOG entry or lands with a duplicate header, and nobody notices until a `grep "^## \["` later. This release makes that collision impossible by construction. `/ship` now queries the open PR queue, sees what versions are already claimed, and picks the next free slot at your chosen bump level. If a collision is detected between ship and land, the land step aborts and tells you to rerun `/ship` rather than silently overwriting. A new `/landing-report` command shows the whole queue on demand. + +### What changes for you + +Run `/ship` in one Conductor window while another has an open PR claiming v1.7.0.0. Your ship now sees the claim, renders a queue table, and picks the next free slot above it (same bump level). The PR title starts with `v` so landing order is visible in `gh pr list` without opening each PR. If a sibling workspace has uncommitted work at a higher VERSION and looks active (commit in the last 24h), `/ship` asks whether to wait for them or advance past. If the queue shifts between ship and merge, CI's new version-gate catches it, and rerunning `/ship` rewrites VERSION, package.json, CHANGELOG, and the PR title atomically. This very release dogfooded the drift path: the original ship at v1.8.0.0 went stale when three other PRs landed first, and the merge-back-to-main rebump (v1.8.0.0 → v1.11.0.0) happened via the same queue-aware codepath it introduces. + +### What shipped (by the numbers) + +- `bin/gstack-next-version` — ~390-line Bun/TS util. 21 passing fixture tests covering happy path, 8 collision scenarios, offline fallback, fork-PR filtering, sibling activity detection, self-PR auto-exclusion. +- Host parity: GitHub + GitLab both supported. CI gates: `.github/workflows/version-gate.yml`, `.github/workflows/pr-title-sync.yml`, plus `.gitlab-ci.yml` mirror. +- Fail-open semantics on util errors (network, auth, bug). A gstack bug never freezes your merge queue. Fail-closed on confirmed collisions. +- `/landing-report` skill — read-only dashboard showing queue, siblings, and what all four bump levels would claim. +- `workspace_root` config key, default `$HOME/conductor/workspaces`, null disables sibling scan for non-Conductor users. + +### What this means for teams running parallel workspaces + +If you're routinely running 3-10 Conductor windows against the same repo, this is the capability that lets the model scale. Before: you mostly got away with it because you noticed collisions by eye. After: the queue is an observable surface, and the system refuses to ship a stale version. `/landing-report` is the new "where am I in line" check when you're about to open PR #6 for the day. Run it before `/ship` if you want to see what's coming without shipping. + +### Itemized changes + +#### Added + +- `bin/gstack-next-version`. Host-aware (GitHub + GitLab + unknown) VERSION allocator. Queries open PRs, fetches each PR's VERSION at head (bounded concurrency, 10 parallel), scans sibling Conductor worktrees, picks the next free slot. Pure reader, never writes files. Supports `--exclude-pr ` to filter out the PR being checked (prevents self-reference when CI runs against the PR's own VERSION). +- `scripts/detect-bump.ts`, `scripts/compare-pr-version.ts`. CI gate helpers. Three exit paths: pass, block on confirmed collision, fail-open on util errors. +- `.github/workflows/version-gate.yml`. Merge-time collision gate. Runs when VERSION/CHANGELOG/package.json changes on a PR. +- `.github/workflows/pr-title-sync.yml`. Auto-rewrites PR title when VERSION changes on push, only for titles already carrying the `v` prefix (custom titles left alone, idempotent). +- `.gitlab-ci.yml`. GitLab CI parity. Both jobs mirrored with the same fail-open semantics. +- `landing-report/SKILL.md.tmpl`. New `/landing-report` or `/gstack-landing-report` skill. Read-only dashboard. +- `bin/gstack-config`. New `workspace_root` key. Default `$HOME/conductor/workspaces`, `null` disables sibling scan. + +#### Changed + +- `ship/SKILL.md.tmpl` Step 12. Queue-aware VERSION pick in FRESH path, drift detection in ALREADY_BUMPED path. On detected drift the user is prompted to rebump, which runs the full metadata path (VERSION + package.json + CHANGELOG header + PR title) atomically so nothing goes stale. +- `ship/SKILL.md.tmpl` Step 19. PR title format is now `v : `, version ALWAYS first. Rerun path updates the title (not just the body) when VERSION changed. Both GitHub and GitLab paths. +- `land-and-deploy/SKILL.md.tmpl`. New Step 3.4 pre-merge drift detection. Aborts with a clear rerun-/ship instruction rather than auto-mutating files. Rerunning `/ship` is the clean path because ship owns the full metadata flow. +- `review/SKILL.md.tmpl`. New Step 3.4 advisory one-liner showing queue status. Non-blocking. +- `CLAUDE.md`. Versioning invariant paragraph. Documents that VERSION is a monotonic sequence, not a strict semver commitment, and queue-advance within a bump level is permitted. + +#### Fixed + +- Self-reference bug in the version gate. The first live CI run (PR #1168 at v1.8.0.0) was rejected as "stale" because the util counted the PR being checked as a queued claim, inflating the next slot by one. Fixed with `--exclude-pr` flag + `gh pr view` auto-detect so the util silently filters the current branch's PR. Caught and fixed in the same ship — exactly the dogfood loop the release is designed for. + +#### For contributors + +- `test/gstack-next-version.test.ts`. 21 pure-function tests (parseVersion / bumpVersion / cmpVersion / pickNextSlot with 8 collision scenarios / markActiveSiblings 4 cases) plus a CLI smoke test against the live repo. +- Golden ship fixtures refreshed for all three hosts (claude, codex, factory) after Step 12 and Step 19 template changes. This is exactly the blast radius Codex flagged during the CEO review (cross-model tension #8), handled in the same PR rather than as a follow-up. + +## **Plan mode stopped silently rubber-stamping your reviews. The forcing questions actually fire now.** + +If you ran `/plan-ceo-review` or any interactive review skill while in plan mode, the skill used to read your diff, skip every STOP gate, write a plan file, and exit. Zero AskUserQuestion calls. Zero mode selection. Zero per-section decisions. The skill's interactive contract got outranked by plan mode's system-reminder, which tells the model to run its own workflow and ignore everything else. This release adds a preamble-level STOP gate that fires before any analysis, so you always get the interactive review the skill was designed to run. + +### What shipped + +Four interactive review skills (plan-ceo-review, plan-eng-review, plan-design-review, plan-devex-review) now emit a two-option AskUserQuestion the moment plan mode is detected: exit-and-rerun interactively, or cancel. No silent bypass. The gate is classified one-way-door in the question registry so `/plan-tune` preferences can't auto-decide past it. Outcome gets logged to `~/.gstack/analytics/skill-usage.jsonl` synchronously when the handshake fires, so A-exit and C-cancel are captured even though they terminate the skill before the end-of-run telemetry block. + +The test harness got a canUseTool extension built on Anthropic's Agent SDK (already installed at v0.2.117). When a test supplies a canUseTool callback, `test/helpers/agent-sdk-runner.ts` flips `permissionMode` from `bypassPermissions` to `default` so the callback actually fires. This is the foundation for asserting AskUserQuestion content end-to-end, which gstack's E2E tests previously couldn't do at all. They had to instruct the model to skip AskUserQuestion entirely. Every future interactive-skill test builds on this. + +### The numbers that matter + +Source: new unit tests in `test/gen-skill-docs.test.ts` (8 tests covering handshake presence, absence, composition ordering, 0C-bis STOP block) and `test/agent-sdk-runner.test.ts` (6 tests covering canUseTool + permission-mode + passThrough helper). All 14 pass locally in <250ms, free tier. + +| Surface | Before | After | +|---|---|---| +| Claude skills rendering the handshake | 0 | 4 (plan-ceo, plan-eng, plan-design, plan-devex) | +| Non-Claude host outputs with handshake text | N/A | 0 (host-scoped via `ctx.host === 'claude'` check) | +| E2E tests that can assert AskUserQuestion content | 0 | 1 harness primitive, ready for every interactive skill | +| Plan-mode entry to any of 4 review skills | Silent bypass | Two-option STOP gate | +| Step 0C-bis in plan-ceo-review | No STOP block, could drift to 0F | Explicit `**STOP.**` block matching 0F pattern | +| Post-handshake telemetry outcomes captured | Neither A-exit nor C-cancel | Both (synchronous write before ExitPlanMode) | + +### What this means for builders + +If you're running gstack in plan mode on a PR review, you'll see one question before the skill does anything: "Exit plan mode and run interactively, or cancel?" Pick A, press esc-esc, rerun the skill in normal mode, get the full interactive review you expected. Pick C to bail cleanly. No more silent rubber-stamp. + +If you're building new interactive skills (yours or contributing to gstack), you can now write real E2E tests that assert on AskUserQuestion shape and routing via the canUseTool harness. See `test/agent-sdk-runner.test.ts` for the pattern and `test/helpers/agent-sdk-runner.ts` for the API. + +### Itemized changes + +#### Fixed + +- Plan mode no longer silently skips AskUserQuestion gates in `/plan-ceo-review`, `/plan-eng-review`, `/plan-design-review`, or `/plan-devex-review`. A preamble-level handshake fires as the first thing the skill does when the plan-mode system-reminder is present, forcing a user choice before any analysis or plan-file writes. +- `/plan-ceo-review` Step 0C-bis now has an explicit STOP block matching the pattern used at Step 0F, so the approach-selection question can't be silently skipped when the skill continues to mode selection. + +#### Added + +- New resolver `scripts/resolvers/preamble/generate-plan-mode-handshake.ts` emits the handshake prose and telemetry bash. Host-scoped to Claude only via `ctx.host === 'claude'` check. Opt-in per skill via `interactive: true` in frontmatter. +- New frontmatter field `interactive: boolean` on skill templates. Generator-only input parsed by `scripts/gen-skill-docs.ts`, never written to generated SKILL.md output (follows the `preamble-tier` precedent). +- New question registry entry `plan-mode-handshake` with `door_type: 'one-way'` in `scripts/question-registry.ts`. Question-tuning `never-ask` preferences cannot suppress this gate. +- New telemetry field `plan_mode_handshake` in `~/.gstack/analytics/skill-usage.jsonl` with outcomes `fired`, `A-exit`, `C-cancel` written synchronously as the handshake fires. Captures outcomes that would otherwise terminate the skill before end-of-run telemetry runs. +- `test/helpers/agent-sdk-runner.ts` extended with optional `canUseTool` callback parameter. When supplied, flips `permissionMode` to `default`, auto-adds `AskUserQuestion` to `allowedTools`, and passes the callback to the SDK. Exports `passThroughNonAskUserQuestion` helper for tests that only want to assert on AskUserQuestion but auto-allow other tools. + +#### For contributors + +- Added 8 unit tests in `test/gen-skill-docs.test.ts` verifying handshake presence in 4 interactive skills, absence in non-interactive skills, absence in non-Claude host outputs, composition ordering (handshake precedes upgrade-check), and 0C-bis STOP block wiring. +- Added 6 unit tests in `test/agent-sdk-runner.test.ts` verifying permission-mode flip, allowedTools auto-injection, canUseTool callback propagation, and pass-through helper behavior. +- Added 6 gate-tier entries to `test/helpers/touchfiles.ts` covering the new E2E test surface. Dependency glob fires any of the new tests when: the relevant skill template, the handshake resolver, preamble composition, the question registry, the one-way-door classifier, or the agent-sdk-runner changes. +- Filed 2 P1/P2 follow-ups in `TODOS.md`: structural STOP-Ask forcing function across all skills (broader class of bug beyond plan-mode entry), and extending `interactive: true` audit to non-review interactive skills like `/office-hours`, `/codex`, `/investigate`, `/qa`. + +## [1.10.1.0] - 2026-04-23 + +## **We tried to make Opus 4.7 faster with a prompt. Measurement said it got slower. Pulled the bullet.** + +gstack shipped a "Fan out explicitly" overlay nudge in `model-overlays/opus-4-7.md` +back in v1.5.2.0. The idea: tell Opus 4.7 to emit multiple tool calls in one +assistant turn instead of one per turn, so "read three files" takes one API +round-trip instead of three. Sounded obvious. This release removes that +bullet after measuring that it actively hurt performance, and ships the eval +harness we used to prove it so you can measure your own overlay changes. + +### The numbers that matter + +Source: new `test/skill-e2e-overlay-harness.test.ts`, N=10 trials per arm per +fixture, 40 trials per run, ~$3 per run. Pinned to `claude-opus-4-7` via +Anthropic's published Agent SDK (`@anthropic-ai/claude-agent-sdk@0.2.117`) +with `pathToClaudeCodeExecutable` set to the locally-installed `claude` binary +(2.1.118). Metric: number of parallel `tool_use` blocks in the first assistant +turn. + +| Prompt text in overlay | First-turn fanout rate (toy: read 3 files) | Lift vs baseline | +|---|---|---| +| No overlay (default Claude Code system prompt only) | **70%** (7/10) | baseline | +| gstack's original "Fan out explicitly" nudge (v1.5.2.0 through v1.6.3.0) | 10% (1/10) | **-60%** | +| Anthropic's own canonical `` text from their parallel-tool-use docs | **0%** (0/10) | **-70%** | + +On a realistic multi-file audit prompt (`read app.ts + config.ts + README.md, +glob src/*.ts, summarize`), Opus 4.7 never fanned out in the first turn at all, +regardless of overlay. Zero of 20 trials. The nudge had nothing to grip. + +Total cost of the investigation: **$7** across three eval runs. + +### What this means for you + +If you ship system-prompt nudges for Claude, measure them. Anthropic's own +published best-practice text dropped our fanout rate to zero. That's not a +claim about Anthropic, it's a claim about measurement: the model, the SDK, +the binary, and the context all move under the advice, and the advice sits +still. The harness is in the repo now. Run +`EVALS=1 EVALS_TIER=periodic bun test test/skill-e2e-overlay-harness.test.ts`. +Three dollars per run. + +### Itemized changes + +#### Fixed + +- `model-overlays/opus-4-7.md` — removed the "Fan out explicitly" block. The + other three nudges (effort-match, batch questions, literal interpretation) + are untested and stay in for now. They're candidates for their own + measurement in a follow-up PR. + +#### Added + +- `test/skill-e2e-overlay-harness.test.ts` — periodic-tier eval that iterates a + typed fixture registry and runs A/B arms through `@anthropic-ai/claude-agent-sdk`. + Uses SDK preset `claude_code` so the arms include Claude Code's real system + prompt; overlay-ON appends the resolved overlay text. Saves per-trial raw + event streams for forensic recovery. Gated on both `EVALS=1` and + `EVALS_TIER=periodic`. +- `test/fixtures/overlay-nudges.ts` — typed `OverlayFixture` registry with + strict validator. Adding a future nudge to measure = one fixture entry. + First two fixtures: `opus-4-7-fanout-toy` and `opus-4-7-fanout-realistic`. +- `test/helpers/agent-sdk-runner.ts` — parametric SDK wrapper with explicit + `AgentSdkResult` types, process-level API concurrency semaphore, and + three-shape 429 retry (thrown error, result-message error, mid-stream + `SDKRateLimitEvent`). Binary pinning via `pathToClaudeCodeExecutable`. +- `test/agent-sdk-runner.test.ts` — 36 free-tier unit tests covering happy + path, all three rate-limit shapes, persistent-429 `RateLimitExhaustedError`, + non-429 propagation, options propagation, concurrency cap, and every + validator rejection case. +- `scripts/preflight-agent-sdk.ts` — 20-line sanity check that confirms the + SDK loads, `claude-opus-4-7` is a live API model, the `SDKMessage` event + shape matches assumptions, and the overlay resolver produces the expected + text. Run manually before paid runs if you suspect drift. Costs ~$0.013. +- `@anthropic-ai/claude-agent-sdk@0.2.117` in `devDependencies`. Exact pin, + no caret — SDK event shapes can drift on minor versions. + +#### Changed + +- `scripts/resolvers/model-overlay.ts` — exported `readOverlay` so the eval + harness can resolve `{{INHERIT:claude}}` directives without synthesizing a + full `TemplateContext`. + +#### For contributors + +- `test/helpers/touchfiles.ts` — registered the new eval in both + `E2E_TOUCHFILES` (deps: `model-overlays/**`, `overlay-nudges.ts`, runner, + resolver) and `E2E_TIERS` (`periodic`). Passes the + `test/touchfiles.test.ts` completeness check. +- The harness is deliberately parametric. Adding a second overlay nudge + measurement (for the remaining three nudges in `opus-4-7.md`, or any + future nudge in any overlay file) is a single entry in + `test/fixtures/overlay-nudges.ts`. Total incremental effort: ~15 minutes + per fixture. + +## [1.10.0.0] - 2026-04-23 + +## **Plan reviews walk you through each issue again, and every question is now a real decision brief.** + +v1.6.4.0 broke something nobody wrote down. Plan reviews on Opus 4.7 silently stopped asking questions one at a time. They turned into a report: here are 6 findings, end of turn. The interactive dialogue that made `/plan-ceo-review`, `/plan-eng-review`, and the rest useful quietly evaporated. v1.10.0.0 restores that, and bundles a format upgrade so every `AskUserQuestion` now renders as a numbered decision brief with ELI10, stakes, recommendation, per-option pros / cons (✅ / ❌), and a closing "Net:" line that frames the trade-off in one sentence. + +### What changes for you + +Run `/plan-ceo-review` or `/plan-eng-review` on a plan with 3 findings. You get 3 separate AskUserQuestion prompts, one per finding, with the full Pros / Cons shape. Pick the option in 5 seconds, or expand the pros / cons if you want to think about it. Every review finding becomes a decision you actually made, not a bullet point you skimmed. The reference shape matches the D2 memory-design question Garry hand-crafted for his own use, now baked into every tier-2 skill via the preamble resolver, so `/ship`, `/office-hours`, `/investigate`, and the rest inherit it for free. + +### The numbers that matter + +Measured across the v1.10.0.0 fix. Verify any claim with `git log 1.9.0.0..1.10.0.0 --oneline` and `bun test` against the pinned commit SHA. + +| Metric | v1.6.4.0 | v1.10.0.0 | Δ | +|---|---|---|---| +| `AskUserQuestion` renders above model overlay in SKILL.md | no | **yes** | ordering inverted | +| Escape-hatch sites hardened across plan-review templates | 0 | **16** | +16 | +| Gate-tier unit tests pinning the format contract | 0 | **30** | +30 (runs in 16ms, $0) | +| Periodic evals defending against escape-hatch abuse | 0 | **4** | +4 (2 positive, 2 negative-case) | +| Cross-model review findings incorporated before landing | N/A | **5 of 8** | Codex caught real bugs CEO+Eng missed | + +Two of the five Codex findings were load-bearing. (1) The overlay reorder theory wasn't enough on its own. The `(recommended)` label on a neutral-posture question had to stay, because `question-tuning.ts:29` reads it to power AUTO_DECIDE. Omitting it would have silently broken auto-decide on every cherry-pick prompt. (2) The "31 sites global replace" in the original plan was factually wrong. Actual count, verified with `rg`, is 16 sites across 4 templates, and eng/design/devex templates used different phrasing than CEO. Without the audit, the fix would have shipped half-applied. + +### What this means for anyone running plan reviews on Opus 4.7 + +Upgrade and re-run your next plan review. You should see D-numbered prompts (D1, D2, D3...) with ELI10 paragraphs, stakes lines, and ✅ / ❌ bullet blocks per option. If you don't, check that `bun run gen:skill-docs` regenerated cleanly after the upgrade, and verify the `Pros / cons:` header renders in `plan-ceo-review/SKILL.md`. Complete plan reviews that used to take 20 minutes and produced a report now take 10 minutes and produce a row of decisions. + +### Itemized changes + +#### Added + +- New Pros / Cons decision-brief format for every `AskUserQuestion` across all tier-2+ skills. Rendering: `D` header, ELI10, "Stakes if we pick wrong:", Recommendation, per-option `✅ / ❌` bullets with minimum 2 pros + 1 con, closing `Net:` synthesis line. Lands in `scripts/resolvers/preamble/generate-ask-user-format.ts` so every skill inherits it. +- Hard-stop escape for destructive one-way choices: single bullet `✅ No cons — this is a hard-stop choice`. +- Neutral-posture handling for SELECTIVE EXPANSION cherry-picks and taste calls: `Recommendation: — this is a taste call, no strong preference either way` with `(recommended)` label preserved on the default to keep AUTO_DECIDE working. +- Three gate-tier unit tests (`test/preamble-compose.test.ts`, `test/resolver-ask-user-format.test.ts`, `test/model-overlay-opus-4-7.test.ts`) that pin the composition order, format contract, and overlay text. Run in <100ms on every `bun test`. +- Four periodic-tier Pros/Cons eval cases in `test/skill-e2e-plan-prosons.test.ts` including two negative-case assertions that catch escape-hatch abuse before it drifts. +- Touchfiles entries (`test/helpers/touchfiles.ts`) for all new eval cases plus expanded-coverage stubs for 7 additional skills. + +#### Fixed + +- Plan-review cadence regression on Opus 4.7. `/plan-ceo-review`, `/plan-eng-review`, `/plan-design-review`, and `/plan-devex-review` now actually pause after each finding and call `AskUserQuestion` as a tool_use instead of batching everything into one summary report. Root cause: `generateModelOverlay` rendered above `generateAskUserFormat` in `scripts/resolvers/preamble.ts`, so the overlay's "Batch your questions" directive registered as the ambient default before the pacing rule. Fixed by reordering the section array and rewriting the overlay directive as "Pace questions to the skill". +- Escape-hatch collapse: "If no issues or fix is obvious, state what you'll do and move on, don't waste a question" at 16 sites across 4 templates let Opus 4.7's literal interpreter classify every finding as self-dismissable. Tightened per-template: zero findings gets "No issues, moving on"; findings require AskUserQuestion as a tool_use. + +#### Changed + +- `test/skill-e2e-plan-format.test.ts`: extended with v1.10.0.0 format token regexes (D-number, ELI10, Stakes, Pros/cons, Net). Existing RECOMMENDATION check loosened to accept mixed-case "Recommendation:". +- `test/skill-validation.test.ts`: format assertions updated from "RECOMMENDATION: Choose" to the new Pros/Cons token set. +- Golden fixtures regenerated: `test/fixtures/golden/claude-ship-SKILL.md`, `codex-ship-SKILL.md`, `factory-ship-SKILL.md`. + +#### For contributors + +- Outside-voice Codex review (`codex exec` with `model_reasoning_effort="high"`) caught two factual bugs in the original plan: the "31 sites" count (actually 16) and the AUTO_DECIDE contract break on neutral-posture questions. 5 of 8 Codex findings incorporated, 1 rejected (kept defense in depth on the composition reorder), 1 declined (HOLD SCOPE mode lock). +- Follow-up: true multi-turn cadence eval (3 findings produce 3 distinct AskUserQuestion invocations across turns) requires new harness support for multi-capture. Filed in NOT-in-scope. Current single-capture eval covers format + escape-hatch abuse but not cadence itself. +- Follow-up: expanded-coverage eval cases for `/ship`, `/office-hours`, `/investigate`, `/qa`, `/review`, `/design-review`, `/document-release`. Touchfiles entries exist; test blocks will land per-skill in follow-up PRs. +- D-numbering is a model-level instruction, not a runtime counter. `TemplateContext` has no state for it. Drift over long sessions is expected; a registry (deferred to TODOs) is the long-term fix. + ## [1.9.0.0] - 2026-04-23 ## **Your gstack memory now travels with you. Cross-machine brain via a private git repo + optional GBrain indexing, no daemon, no credential leaks.** @@ -75,6 +382,7 @@ Work on the laptop Monday. Switch to the desktop Tuesday. Skill preamble sees th - `test/brain-sync.test.ts` — 12 of 27 tests pass on first bun-test run; remaining 15 hit bun-test's 5s default timeout (spawnSync-heavy git operations). Behaviors verified via integration smokes during implementation. Test infrastructure needs a 30s per-test timeout wrapper. - Three unmerged team-sync branches (`garrytan/team-supabase-store`, `garrytan/fix-team-setup`, `garrytan/team-install-mode`) should be formally closed if team-sync isn't landing — flagged in the CEO plan. - Pre-existing golden-file regression test failure in `test/host-config.test.ts` (Codex ship skill baseline) exists on `main` too — unrelated to this PR, tracked separately. + ## [1.6.4.0] - 2026-04-22 ## **Sidebar prompt-injection defense got half as noisy, half as trusting of any single classifier.** diff --git a/CLAUDE.md b/CLAUDE.md index b77b304f..ca1c5b99 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -407,6 +407,16 @@ No auto-merging. No "I'll just clean this up." ## CHANGELOG + VERSION style +**Versioning invariant (workspace-aware ship).** VERSION is a monotonic ordered +release identifier, not a strict semver commitment. The bump level +(major/minor/patch/micro) expresses intent at ship time. Queue-advancing past a +claimed version within the same bump level is explicitly permitted — if branch A +claims v1.7.0.0 as a MINOR and branch B is also a MINOR, B lands at v1.8.0.0 +(still a MINOR relative to main). Downstream consumers must NOT rely on +"MINOR = feature-only, PATCH = fix-only" as a strict contract. This is why +`bin/gstack-next-version` advances within the chosen bump level rather than +repicking the level when collisions happen. + **VERSION and CHANGELOG are branch-scoped.** Every feature branch that ships gets its own version bump and CHANGELOG entry. The entry describes what THIS branch adds — not what was already on main. diff --git a/TODOS.md b/TODOS.md index 5256ec29..5264574c 100644 --- a/TODOS.md +++ b/TODOS.md @@ -1,5 +1,57 @@ # TODOS +## Testing + +### `security-bench-haiku-responses.json` is 27MB, violates the 2MB tracked-file gate + +**What:** `browse/test/fixtures/security-bench-haiku-responses.json` landed on main at v1.6.4.0 (PR #1135) at 27MB. The `no compiled binaries in git > git tracks no files larger than 2MB` gate in `test/skill-validation.test.ts:1623` fails on main and on every feature branch that merges main afterward. + +**Why:** The fixture is a legitimate CI replay corpus (real Haiku responses from the 500-case BrowseSafe-Bench) used to verify the ensemble classifier deterministically. But 13x over the 2MB limit means it will keep failing the validation test for every future ship. + +**Pros:** Removes a pre-existing failure that wastes a triage slot in every /ship run. + +**Cons:** Moving to git-lfs adds a dependency. Splitting into chunks risks breaking the bench test. External hosting adds a CI fetch step. + +**Context:** Noticed during workspace-aware-ship /ship on 2026-04-23 when the post-merge test suite flagged this single failure. Introduced on main in PR #1135 (`v1.6.4.0: cut Haiku classifier FP from 44% to 23%`), commit d75402bb. Two reasonable paths: (a) split into multiple ≤2MB chunks and load them in the bench test, (b) move to git-lfs. + +**Effort:** M (human: ~2-3h / CC: ~20 min) +**Priority:** P1 (not blocking ship, but every future /ship triages the same failure) +**Depends on:** nothing + +--- + +## P1: Structural STOP-Ask forcing function across all skills (v1.11.1.0 follow-up) + +**What:** Design and implement a structural forcing function that catches when a skill mandates per-issue AskUserQuestion but the model silently substitutes batch-synthesis. Candidate mechanisms: question-count assertion (skill declares expected question count in frontmatter; post-run audit logs if model fired ` text = 0% +- Removed the counterproductive nudge from `model-overlays/opus-4-7.md` +- Shipped 36-test free-tier unit suite for the SDK runner + strict fixture validator +- Registered `overlay-harness-opus-4-7-fanout-{toy,realistic}` in E2E_TOUCHFILES and E2E_TIERS +- Total investigation cost: ~$7 across 3 eval runs +**Completed:** v1.10.1.0 + ### CI eval pipeline (v0.9.9.0) - GitHub Actions eval upload on Ubicloud runners ($0.006/run) - Within-file test concurrency (test() → testConcurrentIfSelected()) diff --git a/VERSION b/VERSION index dcafa494..1b915da2 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.9.0.0 +1.11.1.0 diff --git a/autoplan/SKILL.md b/autoplan/SKILL.md index c4ceeee9..a4f67770 100644 --- a/autoplan/SKILL.md +++ b/autoplan/SKILL.md @@ -358,6 +358,135 @@ AI orchestrator (e.g., OpenClaw). In spawned sessions: - Focus on completing the task and reporting results via prose output. - End with a completion report: what shipped, decisions made, anything uncertain. +## AskUserQuestion Format + +**ALWAYS follow this structure for every AskUserQuestion call. Every element is non-skippable. If you find yourself about to skip any of them, stop and back up.** + +### Required shape + +Every AskUserQuestion reads like a decision brief, not a bullet list: + +``` +D + +ELI10: + +Stakes if we pick wrong: + +Recommendation: because + +Completeness: A=X/10, B=Y/10 (or: Note: options differ in kind, not coverage — no completeness score) + +Pros / cons: + +A)