From b3eaffce073aca37541434b23e2ac04306a80794 Mon Sep 17 00:00:00 2001 From: Garry Tan Date: Thu, 16 Apr 2026 23:14:03 -0700 Subject: [PATCH] =?UTF-8?q?feat:=20context=20rot=20defense=20for=20/ship?= =?UTF-8?q?=20=E2=80=94=20subagent=20isolation=20+=20clean=20step=20number?= =?UTF-8?q?ing=20(v0.18.1.0)=20(#1030)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * refactor: renumber /ship steps to clean integers (1-20) Replaces fractional step numbers (1.5, 2.5, 3.25, 3.4, 3.45, 3.47, 3.48, 3.5, 3.55, 3.56, 3.57, 3.75, 3.8, 5.5, 6.5, 8.5, 8.75) with clean integers 1 through 20, plus allowed resolver sub-steps 8.1, 8.2, 9.1, 9.2, 9.3. Fractional numbering signaled "optional appendix" and contributed to /ship's habit of skipping late-stage steps. Affects: - ship/SKILL.md.tmpl (all headings + ~30 cross-references) - scripts/resolvers/review.ts (ship-side 3.47/3.48/3.57/3.8 conditionals) - scripts/resolvers/review-army.ts (ship-side 3.55/3.56 conditionals) - scripts/resolvers/testing.ts (ship-side 2.5/3.4 references, 5 sites) - scripts/resolvers/utility.ts (CHANGELOG heading gets Step 13 prefix) - test/gen-skill-docs.test.ts (5 step-number assertions updated) - test/skill-validation.test.ts (3 step-number assertions updated) /review step numbering (1.5, 2.5, 4.5, 5.5-5.8) intentionally unchanged — only the ship-side of each isShip conditional was updated. Co-Authored-By: Claude Opus 4.7 (1M context) * feat: subagent isolation for /ship's 4 context-heaviest sub-workflows Fights context rot. By late /ship, the parent context is bloated with 500-1,750 lines of intermediate tool output from tests, coverage audits, reviews, adversarial checks, and PR body construction. The model is at its least intelligent when it reaches doc-sync — which is why /document-release was being skipped ~80% of the time. Applies subagent dispatch (proven pattern from Review Army at Step 9.1 and Adversarial at Step 11) to four sub-workflows where the parent only needs the conclusion, not the intermediate output: - Step 7 (Test Coverage Audit) — subagent returns coverage_pct, gaps, diagram, tests_added - Step 8 (Plan Completion Audit) — subagent returns total_items, done, changed, deferred, summary - Step 10 (Greptile Triage) — subagent fetches + classifies, parent handles user interaction and commits fixes (AskUserQuestion + Edit can't run in subagents) - Step 18 (Documentation Sync) — subagent invokes full /document-release skill in fresh context; parent embeds documentation_section in PR body Sequencing fix for Step 18: runs AFTER Step 17 (Push) and BEFORE Step 19 (Create PR). The PR is created once from final HEAD with the ## Documentation section baked into the initial body — no create-then- re-edit dance, no race conditions with document-release's own PR body editor. Adds "You are NOT done" guardrail after Step 17 (Push) to break the natural stopping point that currently causes doc-release skips. Each subagent falls back to inline execution if it fails or returns invalid JSON. /ship never blocks on subagent failure. Co-Authored-By: Claude Opus 4.7 (1M context) * test: regression guard for /ship step numbering Three regression guards in skill-validation.test.ts to prevent future drift back to fractional step numbering: 1. ship/SKILL.md.tmpl contains no fractional step numbers except the allowed resolver sub-steps (8.1, 8.2, 9.1, 9.2, 9.3). A contributor adding "Step 3.75" next month will fail this test with a clear error. 2. ship/SKILL.md main headings use clean integer step numbers. If a renumber accidentally leaves a decimal heading, this catches it. 3. review/SKILL.md step numbers unchanged — regression guard for the resolver conditionals in review.ts/review-army.ts. If a future edit accidentally touches the review-side of an isShip ternary, /review's fractional numbering (1.5, 4.5, 5.7) would vanish. This test catches that cross-contamination. Co-Authored-By: Claude Opus 4.7 (1M context) * docs: sync ship step references after renumber CLAUDE.md: "At /ship time (Step 5)" → "(Step 13)" — CHANGELOG is now explicitly Step 13 after the renumber (was implicit between old Step 4 and Step 5.5). TODOS.md: "Step 3.4 coverage audit" → "Step 7" — references the open TODO for auto-upgrading ★-rated tests, which hooks into the coverage audit step. Both are historical references to ship's step numbering that became stale when clean integer renumbering landed in 566d42c2. Co-Authored-By: Claude Opus 4.7 (1M context) * test: update golden ship skill baselines after renumber + subagent refactor The golden fixtures at test/fixtures/golden/{claude,codex,factory}-ship-SKILL.md regression-test that generated ship/SKILL.md output matches a committed baseline. After renumbering steps to clean integers and converting 4 sub-workflows to subagent dispatches, the generated output changed substantially — refresh the baselines to reflect the new expected output. Co-Authored-By: Claude Opus 4.7 (1M context) * chore: bump version and changelog (v0.18.1.0) Co-Authored-By: Claude Opus 4.7 (1M context) * chore: gitignore Claude Code harness runtime artifacts .claude/scheduled_tasks.lock appears when ScheduleWakeup fires. It's a runtime lock file owned by the Claude Code harness, not project source. Add .claude/*.lock too so future harness artifacts in that directory don't need their own gitignore entries. Co-Authored-By: Claude Opus 4.7 (1M context) --------- Co-authored-by: Claude Opus 4.7 (1M context) --- .gitignore | 2 + CHANGELOG.md | 14 ++ CLAUDE.md | 2 +- TODOS.md | 2 +- VERSION | 2 +- design-review/SKILL.md | 2 +- package.json | 2 +- qa/SKILL.md | 2 +- scripts/resolvers/review-army.ts | 12 +- scripts/resolvers/review.ts | 18 +- scripts/resolvers/testing.ts | 10 +- scripts/resolvers/utility.ts | 2 +- ship/SKILL.md | 273 +++++++++++++-------- ship/SKILL.md.tmpl | 235 +++++++++++------- test/fixtures/golden/claude-ship-SKILL.md | 273 +++++++++++++-------- test/fixtures/golden/codex-ship-SKILL.md | 261 ++++++++++++-------- test/fixtures/golden/factory-ship-SKILL.md | 273 +++++++++++++-------- test/gen-skill-docs.test.ts | 12 +- test/skill-validation.test.ts | 55 ++++- 19 files changed, 900 insertions(+), 552 deletions(-) diff --git a/.gitignore b/.gitignore index c0ab4c16..e1098789 100644 --- a/.gitignore +++ b/.gitignore @@ -6,6 +6,8 @@ design/dist/ bin/gstack-global-discover .gstack/ .claude/skills/ +.claude/scheduled_tasks.lock +.claude/*.lock .agents/ .factory/ .kiro/ diff --git a/CHANGELOG.md b/CHANGELOG.md index 75f09431..e2f9a4ed 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,19 @@ # Changelog +## [0.18.2.0] - 2026-04-17 + +### Fixed +- **`/ship` stops skipping `/document-release` ~80% of the time.** The old Step 8.5 told Claude to `cat` a 2500-line external skill file *after* the PR URL was already output, at which point the model had 500-1,750 lines of intermediate tool output in context and was at its least intelligent. Now `/ship` dispatches `/document-release` as a subagent that runs in a fresh context window, *before* creating the PR, so the `## Documentation` section gets baked into the initial PR body instead of a create-then-re-edit dance. The result: documentation actually syncs on every ship. + +### Changed +- **`/ship`'s 4 heaviest sub-workflows now run in isolated subagent contexts.** Coverage audit (Step 7), plan completion audit (Step 8), Greptile triage (Step 10), and documentation sync (Step 18) each dispatch a subagent that gets a fresh context window. The parent only sees the conclusion (structured JSON), not the intermediate file reads. This is the pattern Anthropic's "Using Claude Code: Session Management and 1M Context" blog post recommends for fighting context rot: "Will I need this tool output again, or just the conclusion? If just the conclusion, use a subagent." +- **`/ship` step numbers are clean integers 1-20 instead of fractional (`3.47`, `8.5`, `8.75`).** Fractional step numbers signaled "optional appendix" to the model and contributed to late-stage steps getting skipped. Clean integers feel mandatory. Resolver sub-steps that are genuinely nested (Plan Verification 8.1, Scope Drift 8.2, Review Army 9.1/9.2, Cross-review dedup 9.3) are preserved. +- **`/ship` now prints "You are NOT done" after push.** Breaks the natural stopping point where the model was treating a pushed branch as mission-accomplished and skipping doc sync + PR creation. + +### For contributors +- New regression guards in `test/skill-validation.test.ts` prevent drift back to fractional step numbers and catch cross-contamination between `/ship` and `/review` resolver conditionals. +- Ship template restructure: old Step 8.5 (post-PR doc sync with `cat` delegation) replaced by new Step 18 (pre-PR subagent dispatch that invokes full `/document-release` skill with its CHANGELOG clobber protections, doc exclusions, risky-change gates, and race-safe PR body editing). Codex caught that the original plan's reimplementation dropped those protections; this version reuses the real `/document-release`. + ## [0.18.1.0] - 2026-04-16 ### Fixed diff --git a/CLAUDE.md b/CLAUDE.md index 4d9fb300..074b6122 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -339,7 +339,7 @@ own version bump and CHANGELOG entry. The entry describes what THIS branch adds not what was already on main. **When to write the CHANGELOG entry:** -- At `/ship` time (Step 5), not during development or mid-branch. +- At `/ship` time (Step 13), not during development or mid-branch. - The entry covers ALL commits on this branch vs the base branch. - Never fold new work into an existing CHANGELOG entry from a prior version that already landed on main. If main has v0.10.0.0 and your branch adds features, diff --git a/TODOS.md b/TODOS.md index 7bb06d01..54f5d31b 100644 --- a/TODOS.md +++ b/TODOS.md @@ -396,7 +396,7 @@ Linux cookie import shipped in v0.11.11.0 (Wave 3). Supports Chrome, Chromium, B ### Auto-upgrade weak tests (★) to strong tests (★★★) -**What:** When Step 3.4 coverage audit identifies existing ★-rated tests (smoke/trivial assertions), generate improved versions testing edge cases and error paths. +**What:** When Step 7 coverage audit identifies existing ★-rated tests (smoke/trivial assertions), generate improved versions testing edge cases and error paths. **Why:** Many codebases have tests that technically exist but don't catch real bugs — `expect(component).toBeDefined()` isn't testing behavior. Upgrading these closes the gap between "has tests" and "has good tests." diff --git a/VERSION b/VERSION index 72ad141a..51534b8f 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -0.18.1.0 +0.18.2.0 diff --git a/design-review/SKILL.md b/design-review/SKILL.md index f2c136f9..cc1f0d16 100644 --- a/design-review/SKILL.md +++ b/design-review/SKILL.md @@ -690,7 +690,7 @@ ls -d test/ tests/ spec/ __tests__/ cypress/ e2e/ 2>/dev/null **If test framework detected** (config files or test directories found): Print "Test framework detected: {name} ({N} existing tests). Skipping bootstrap." Read 2-3 existing test files to learn conventions (naming, imports, assertion style, setup patterns). -Store conventions as prose context for use in Phase 8e.5 or Step 3.4. **Skip the rest of bootstrap.** +Store conventions as prose context for use in Phase 8e.5 or Step 7. **Skip the rest of bootstrap.** **If BOOTSTRAP_DECLINED** appears: Print "Test bootstrap previously declined — skipping." **Skip the rest of bootstrap.** diff --git a/package.json b/package.json index 68edadf1..6bd3facb 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "gstack", - "version": "0.18.1.0", + "version": "0.18.2.0", "description": "Garry's Stack — Claude Code skills + fast headless browser. One repo, one install, entire AI engineering workflow.", "license": "MIT", "type": "module", diff --git a/qa/SKILL.md b/qa/SKILL.md index 3a04bd78..dbeb5dde 100644 --- a/qa/SKILL.md +++ b/qa/SKILL.md @@ -732,7 +732,7 @@ ls -d test/ tests/ spec/ __tests__/ cypress/ e2e/ 2>/dev/null **If test framework detected** (config files or test directories found): Print "Test framework detected: {name} ({N} existing tests). Skipping bootstrap." Read 2-3 existing test files to learn conventions (naming, imports, assertion style, setup patterns). -Store conventions as prose context for use in Phase 8e.5 or Step 3.4. **Skip the rest of bootstrap.** +Store conventions as prose context for use in Phase 8e.5 or Step 7. **Skip the rest of bootstrap.** **If BOOTSTRAP_DECLINED** appears: Print "Test bootstrap previously declined — skipping." **Skip the rest of bootstrap.** diff --git a/scripts/resolvers/review-army.ts b/scripts/resolvers/review-army.ts index 1240b839..516ce3c8 100644 --- a/scripts/resolvers/review-army.ts +++ b/scripts/resolvers/review-army.ts @@ -13,8 +13,8 @@ import type { TemplateContext } from './types'; function generateSpecialistSelection(ctx: TemplateContext): string { const isShip = ctx.skillName === 'ship'; - const stepSel = isShip ? '3.55' : '4.5'; - const stepMerge = isShip ? '3.56' : '4.6'; + const stepSel = isShip ? '9.1' : '4.5'; + const stepMerge = isShip ? '9.2' : '4.6'; const nextStep = isShip ? 'the Fix-First flow (item 4)' : 'Step 5'; return `## Step ${stepSel}: Review Army — Specialist Dispatch @@ -134,10 +134,10 @@ CHECKLIST: function generateFindingsMerge(ctx: TemplateContext): string { const isShip = ctx.skillName === 'ship'; - const stepMerge = isShip ? '3.56' : '4.6'; - const stepSel = isShip ? '3.55' : '4.5'; + const stepMerge = isShip ? '9.2' : '4.6'; + const stepSel = isShip ? '9.1' : '4.5'; const fixFirstRef = isShip ? 'the Fix-First flow (item 4)' : 'Step 5 Fix-First'; - const critPassRef = isShip ? 'the checklist pass (Step 3.5)' : 'the CRITICAL pass findings from Step 4'; + const critPassRef = isShip ? 'the checklist pass (Step 9)' : 'the CRITICAL pass findings from Step 4'; const persistRef = isShip ? 'the review-log persist' : 'the review-log entry in Step 5.8'; return `### Step ${stepMerge}: Collect and merge findings @@ -202,7 +202,7 @@ Remember these stats — you will need them for the review-log entry in Step 5.8 function generateRedTeam(ctx: TemplateContext): string { const isShip = ctx.skillName === 'ship'; - const stepMerge = isShip ? '3.56' : '4.6'; + const stepMerge = isShip ? '9.2' : '4.6'; const fixFirstRef = isShip ? 'the Fix-First flow (item 4)' : 'Step 5 Fix-First'; return `### Red Team dispatch (conditional) diff --git a/scripts/resolvers/review.ts b/scripts/resolvers/review.ts index cbc8053c..57c5596c 100644 --- a/scripts/resolvers/review.ts +++ b/scripts/resolvers/review.ts @@ -368,7 +368,7 @@ If A: revise the premise and note the revision. If B: proceed (and note that the export function generateScopeDrift(ctx: TemplateContext): string { const isShip = ctx.skillName === 'ship'; - const stepNum = isShip ? '3.48' : '1.5'; + const stepNum = isShip ? '8.2' : '1.5'; return `## Step ${stepNum}: Scope Drift Detection @@ -413,7 +413,7 @@ export function generateAdversarialStep(ctx: TemplateContext): string { if (ctx.host === 'codex') return ''; const isShip = ctx.skillName === 'ship'; - const stepNum = isShip ? '3.8' : '5.7'; + const stepNum = isShip ? '11' : '5.7'; return `## Step ${stepNum}: Adversarial review (always-on) @@ -501,7 +501,7 @@ A) Investigate and fix now (recommended) B) Continue — review will still complete \`\`\` -If A: address the findings${isShip ? '. After fixing, re-run tests (Step 3) since code has changed' : ''}. Re-run \`codex review\` to verify. +If A: address the findings${isShip ? '. After fixing, re-run tests (Step 5) since code has changed' : ''}. Re-run \`codex review\` to verify. Read stderr for errors (same error handling as Codex adversarial above). @@ -917,16 +917,16 @@ export function generatePlanCompletionAuditReview(_ctx: TemplateContext): string // ─── Plan Verification Execution ────────────────────────────────────── export function generatePlanVerificationExec(_ctx: TemplateContext): string { - return `## Step 3.47: Plan Verification + return `## Step 8.1: Plan Verification Automatically verify the plan's testing/verification steps using the \`/qa-only\` skill. ### 1. Check for verification section -Using the plan file already discovered in Step 3.45, look for a verification section. Match any of these headings: \`## Verification\`, \`## Test plan\`, \`## Testing\`, \`## How to test\`, \`## Manual testing\`, or any section with verification-flavored items (URLs to visit, things to check visually, interactions to test). +Using the plan file already discovered in Step 8, look for a verification section. Match any of these headings: \`## Verification\`, \`## Test plan\`, \`## Testing\`, \`## How to test\`, \`## Manual testing\`, or any section with verification-flavored items (URLs to visit, things to check visually, interactions to test). **If no verification section found:** Skip with "No verification steps found in plan — skipping auto-verification." -**If no plan file was found in Step 3.45:** Skip (already handled). +**If no plan file was found in Step 8:** Skip (already handled). ### 2. Check for running dev server @@ -971,7 +971,7 @@ Follow the /qa-only workflow with these modifications: ### 5. Include in PR body -Add a \`## Verification Results\` section to the PR body (Step 8): +Add a \`## Verification Results\` section to the PR body (Step 19): - If verification ran: summary of results (N PASS, M FAIL, K SKIPPED) - If skipped: reason for skipping (no plan, no server, no verification section)`; } @@ -980,9 +980,9 @@ Add a \`## Verification Results\` section to the PR body (Step 8): export function generateCrossReviewDedup(ctx: TemplateContext): string { const isShip = ctx.skillName === 'ship'; - const stepNum = isShip ? '3.57' : '5.0'; + const stepNum = isShip ? '9.3' : '5.0'; const findingsRef = isShip - ? 'the checklist pass (Step 3.5) and specialist review (Step 3.55-3.56)' + ? 'the checklist pass (Step 9) and specialist review (Step 9.1-9.2)' : 'Step 4 critical pass and Step 4.5-4.6 specialists'; return `### Step ${stepNum}: Cross-review finding dedup diff --git a/scripts/resolvers/testing.ts b/scripts/resolvers/testing.ts index da1381c2..f372aee1 100644 --- a/scripts/resolvers/testing.ts +++ b/scripts/resolvers/testing.ts @@ -28,7 +28,7 @@ ls -d test/ tests/ spec/ __tests__/ cypress/ e2e/ 2>/dev/null **If test framework detected** (config files or test directories found): Print "Test framework detected: {name} ({N} existing tests). Skipping bootstrap." Read 2-3 existing test files to learn conventions (naming, imports, assertion style, setup patterns). -Store conventions as prose context for use in Phase 8e.5 or Step 3.4. **Skip the rest of bootstrap.** +Store conventions as prose context for use in Phase 8e.5 or Step 7. **Skip the rest of bootstrap.** **If BOOTSTRAP_DECLINED** appears: Print "Test bootstrap previously declined — skipping." **Skip the rest of bootstrap.** @@ -213,7 +213,7 @@ ls jest.config.* vitest.config.* playwright.config.* cypress.config.* .rspec pyt ls -d test/ tests/ spec/ __tests__/ cypress/ e2e/ 2>/dev/null \`\`\` -3. **If no framework detected:**${mode === 'ship' ? ' falls through to the Test Framework Bootstrap step (Step 2.5) which handles full setup.' : ' still produce the coverage diagram, but skip test generation.'}`); +3. **If no framework detected:**${mode === 'ship' ? ' falls through to the Test Framework Bootstrap step (Step 4) which handles full setup.' : ' still produce the coverage diagram, but skip test generation.'}`); // ── Before/after count (ship only) ── if (mode === 'ship') { @@ -379,7 +379,7 @@ GAPS: 8 paths need tests (2 need E2E, 1 needs eval) ───────────────────────────────── \`\`\` -**Fast path:** All paths covered → "${mode === 'ship' ? 'Step 3.4' : mode === 'review' ? 'Step 4.75' : 'Test review'}: All new code paths have test coverage ✓" Continue.`); +**Fast path:** All paths covered → "${mode === 'ship' ? 'Step 7' : mode === 'review' ? 'Step 4.75' : 'Test review'}: All new code paths have test coverage ✓" Continue.`); // ── Mode-specific action section ── if (mode === 'plan') { @@ -432,7 +432,7 @@ This file is consumed by \`/qa\` and \`/qa-only\` as primary test input. Include sections.push(` **5. Generate tests for uncovered paths:** -If test framework detected (or bootstrapped in Step 2.5): +If test framework detected (or bootstrapped in Step 4): - Prioritize error handlers and edge cases first (happy paths are more likely already tested) - Read 2-3 existing test files to match conventions exactly - Generate unit tests. Mock all external dependencies (DB, API, Redis). @@ -446,7 +446,7 @@ Caps: 30 code paths max, 20 tests generated max (code + user flow combined), 2-m If no test framework AND user declined bootstrap → diagram only, no generation. Note: "Test generation skipped — no test framework configured." -**Diff is test-only changes:** Skip Step 3.4 entirely: "No new application code paths to audit." +**Diff is test-only changes:** Skip Step 7 entirely: "No new application code paths to audit." **6. After-count and coverage summary:** diff --git a/scripts/resolvers/utility.ts b/scripts/resolvers/utility.ts index c3e6d690..83934b07 100644 --- a/scripts/resolvers/utility.ts +++ b/scripts/resolvers/utility.ts @@ -373,7 +373,7 @@ export function generateCoAuthorTrailer(ctx: TemplateContext): string { } export function generateChangelogWorkflow(_ctx: TemplateContext): string { - return `## CHANGELOG (auto-generate) + return `## Step 13: CHANGELOG (auto-generate) 1. Read \`CHANGELOG.md\` header to know the format. diff --git a/ship/SKILL.md b/ship/SKILL.md index 61a6b87e..0d97b858 100644 --- a/ship/SKILL.md +++ b/ship/SKILL.md @@ -624,17 +624,17 @@ You are running the `/ship` workflow. This is a **non-interactive, fully automat - Merge conflicts that can't be auto-resolved (stop, show conflicts) - In-branch test failures (pre-existing failures are triaged, not auto-blocking) - Pre-landing review finds ASK items that need user judgment -- MINOR or MAJOR version bump needed (ask — see Step 4) +- MINOR or MAJOR version bump needed (ask — see Step 12) - Greptile review comments that need user decision (complex fixes, false positives) -- AI-assessed coverage below minimum threshold (hard gate with user override — see Step 3.4) -- Plan items NOT DONE with no user override (see Step 3.45) -- Plan verification failures (see Step 3.47) -- TODOS.md missing and user wants to create one (ask — see Step 5.5) -- TODOS.md disorganized and user wants to reorganize (ask — see Step 5.5) +- AI-assessed coverage below minimum threshold (hard gate with user override — see Step 7) +- Plan items NOT DONE with no user override (see Step 8) +- Plan verification failures (see Step 8.1) +- TODOS.md missing and user wants to create one (ask — see Step 14) +- TODOS.md disorganized and user wants to reorganize (ask — see Step 14) **Never stop for:** - Uncommitted changes (always include them) -- Version bump choice (auto-pick MICRO or PATCH — see Step 4) +- Version bump choice (auto-pick MICRO or PATCH — see Step 12) - CHANGELOG content (auto-generate from diff) - Commit message approval (auto-commit) - Multi-file changesets (auto-split into bisectable commits) @@ -647,9 +647,9 @@ Re-running `/ship` means "run the whole checklist again." Every verification ste (tests, coverage audit, plan completion, pre-landing review, adversarial review, VERSION/CHANGELOG check, TODOS, document-release) runs on every invocation. Only *actions* are idempotent: -- Step 4: If VERSION already bumped, skip the bump but still read the version -- Step 7: If already pushed, skip the push command -- Step 8: If PR exists, update the body instead of creating a new PR +- Step 12: If VERSION already bumped, skip the bump but still read the version +- Step 17: If already pushed, skip the push command +- Step 19: If PR exists, update the body instead of creating a new PR Never skip a verification step because a prior `/ship` run already performed it. --- @@ -717,19 +717,19 @@ Display: If the Eng Review is NOT "CLEAR": -Print: "No prior eng review found — ship will run its own pre-landing review in Step 3.5." +Print: "No prior eng review found — ship will run its own pre-landing review in Step 9." Check diff size: `git diff ...HEAD --stat | tail -1`. If the diff is >200 lines, add: "Note: This is a large diff. Consider running `/plan-eng-review` or `/autoplan` for architecture-level review before shipping." If CEO Review is missing, mention as informational ("CEO Review not run — recommended for product changes") but do NOT block. -For Design Review: run `source <(~/.claude/skills/gstack/bin/gstack-diff-scope 2>/dev/null)`. If `SCOPE_FRONTEND=true` and no design review (plan-design-review or design-review-lite) exists in the dashboard, mention: "Design Review not run — this PR changes frontend code. The lite design check will run automatically in Step 3.5, but consider running /design-review for a full visual audit post-implementation." Still never block. +For Design Review: run `source <(~/.claude/skills/gstack/bin/gstack-diff-scope 2>/dev/null)`. If `SCOPE_FRONTEND=true` and no design review (plan-design-review or design-review-lite) exists in the dashboard, mention: "Design Review not run — this PR changes frontend code. The lite design check will run automatically in Step 9, but consider running /design-review for a full visual audit post-implementation." Still never block. -Continue to Step 1.5 — do NOT block or ask. Ship runs its own review in Step 3.5. +Continue to Step 2 — do NOT block or ask. Ship runs its own review in Step 9. --- -## Step 1.5: Distribution Pipeline Check +## Step 2: Distribution Pipeline Check If the diff introduces a new standalone artifact (CLI binary, library package, tool) — not a web service with existing deployment — verify that a distribution pipeline exists. @@ -757,7 +757,7 @@ service with existing deployment — verify that a distribution pipeline exists. --- -## Step 2: Merge the base branch (BEFORE tests) +## Step 3: Merge the base branch (BEFORE tests) Fetch and merge the base branch into the feature branch so tests run against the merged state: @@ -771,7 +771,7 @@ git fetch origin && git merge origin/ --no-edit --- -## Step 2.5: Test Framework Bootstrap +## Step 4: Test Framework Bootstrap ## Test Framework Bootstrap @@ -800,7 +800,7 @@ ls -d test/ tests/ spec/ __tests__/ cypress/ e2e/ 2>/dev/null **If test framework detected** (config files or test directories found): Print "Test framework detected: {name} ({N} existing tests). Skipping bootstrap." Read 2-3 existing test files to learn conventions (naming, imports, assertion style, setup patterns). -Store conventions as prose context for use in Phase 8e.5 or Step 3.4. **Skip the rest of bootstrap.** +Store conventions as prose context for use in Phase 8e.5 or Step 7. **Skip the rest of bootstrap.** **If BOOTSTRAP_DECLINED** appears: Print "Test bootstrap previously declined — skipping." **Skip the rest of bootstrap.** @@ -929,7 +929,7 @@ Only commit if there are changes. Stage all bootstrap files (config, test direct --- -## Step 3: Run tests (on merged code) +## Step 5: Run tests (on merged code) **Do NOT run `RAILS_ENV=test bin/rails db:migrate`** — `bin/test-lane` already calls `db:test:prepare` internally, which loads the schema into the correct lane database. @@ -1051,13 +1051,13 @@ Use AskUserQuestion: - Continue with the workflow. - Note in output: "Pre-existing test failure skipped: " -**After triage:** If any in-branch failures remain unfixed, **STOP**. Do not proceed. If all failures were pre-existing and handled (fixed, TODOed, assigned, or skipped), continue to Step 3.25. +**After triage:** If any in-branch failures remain unfixed, **STOP**. Do not proceed. If all failures were pre-existing and handled (fixed, TODOed, assigned, or skipped), continue to Step 6. **If all pass:** Continue silently — just note the counts briefly. --- -## Step 3.25: Eval Suites (conditional) +## Step 6: Eval Suites (conditional) Evals are mandatory when prompt-related files change. Skip this step entirely if no prompt files are in the diff. @@ -1076,7 +1076,7 @@ Match against these patterns (from CLAUDE.md): - `config/system_prompts/*.txt` - `test/evals/**/*` (eval infrastructure changes affect all suites) -**If no matches:** Print "No prompt-related files changed — skipping evals." and continue to Step 3.5. +**If no matches:** Print "No prompt-related files changed — skipping evals." and continue to Step 9. **2. Identify affected eval suites:** @@ -1106,9 +1106,9 @@ If multiple suites need to run, run them sequentially (each needs a test lane). **4. Check results:** - **If any eval fails:** Show the failures, the cost dashboard, and **STOP**. Do not proceed. -- **If all pass:** Note pass counts and cost. Continue to Step 3.5. +- **If all pass:** Note pass counts and cost. Continue to Step 9. -**5. Save eval output** — include eval results and cost dashboard in the PR body (Step 8). +**5. Save eval output** — include eval results and cost dashboard in the PR body (Step 19). **Tier reference (for context — /ship always uses `full`):** | Tier | When | Speed (cached) | Cost | @@ -1119,9 +1119,15 @@ If multiple suites need to run, run them sequentially (each needs a test lane). --- -## Step 3.4: Test Coverage Audit +## Step 7: Test Coverage Audit -100% coverage is the goal — every untested path is a path where bugs hide and vibe coding becomes yolo coding. Evaluate what was ACTUALLY coded (from the diff), not what was planned. +**Dispatch this step as a subagent** using the Agent tool with `subagent_type: "general-purpose"`. The subagent runs the coverage audit in a fresh context window — the parent only sees the conclusion, not intermediate file reads. This is context-rot defense. + +**Subagent prompt:** Pass the following instructions to the subagent, with `` substituted with the base branch: + +> You are running a ship-workflow test coverage audit. Run `git diff ...HEAD` as needed. Do not commit or push — report only. +> +> 100% coverage is the goal — every untested path is a path where bugs hide and vibe coding becomes yolo coding. Evaluate what was ACTUALLY coded (from the diff), not what was planned. ### Test Framework Detection @@ -1143,7 +1149,7 @@ ls jest.config.* vitest.config.* playwright.config.* cypress.config.* .rspec pyt ls -d test/ tests/ spec/ __tests__/ cypress/ e2e/ 2>/dev/null ``` -3. **If no framework detected:** falls through to the Test Framework Bootstrap step (Step 2.5) which handles full setup. +3. **If no framework detected:** falls through to the Test Framework Bootstrap step (Step 4) which handles full setup. **0. Before/after test count:** @@ -1285,11 +1291,11 @@ GAPS: 8 paths need tests (2 need E2E, 1 needs eval) ───────────────────────────────── ``` -**Fast path:** All paths covered → "Step 3.4: All new code paths have test coverage ✓" Continue. +**Fast path:** All paths covered → "Step 7: All new code paths have test coverage ✓" Continue. **5. Generate tests for uncovered paths:** -If test framework detected (or bootstrapped in Step 2.5): +If test framework detected (or bootstrapped in Step 4): - Prioritize error handlers and edge cases first (happy paths are more likely already tested) - Read 2-3 existing test files to match conventions exactly - Generate unit tests. Mock all external dependencies (DB, API, Redis). @@ -1303,7 +1309,7 @@ Caps: 30 code paths max, 20 tests generated max (code + user flow combined), 2-m If no test framework AND user declined bootstrap → diagram only, no generation. Note: "Test generation skipped — no test framework configured." -**Diff is test-only changes:** Skip Step 3.4 entirely: "No new application code paths to audit." +**Diff is test-only changes:** Skip Step 7 entirely: "No new application code paths to audit." **6. After-count and coverage summary:** @@ -1378,12 +1384,30 @@ Repo: {owner/repo} ## Critical Paths - {end-to-end flow that must work} ``` +> +> After your analysis, output a single JSON object on the LAST LINE of your response (no other text after it): +> `{"coverage_pct":N,"gaps":N,"diagram":"","tests_added":["path",...]}` + +**Parent processing:** + +1. Read the subagent's final output. Parse the LAST line as JSON. +2. Store `coverage_pct` (for Step 20 metrics), `gaps` (user summary), `tests_added` (for the commit). +3. Embed `diagram` verbatim in the PR body's `## Test Coverage` section (Step 19). +4. Print a one-line summary: `Coverage: {coverage_pct}%, {gaps} gaps. {tests_added.length} tests added.` + +**If the subagent fails, times out, or returns invalid JSON:** Fall back to running the audit inline in the parent. Do not block /ship on subagent failure — partial results are better than none. --- -## Step 3.45: Plan Completion Audit +## Step 8: Plan Completion Audit -### Plan File Discovery +**Dispatch this step as a subagent** using the Agent tool with `subagent_type: "general-purpose"`. The subagent reads the plan file and every referenced code file in its own fresh context. Parent gets only the conclusion. + +**Subagent prompt:** Pass these instructions to the subagent: + +> You are running a ship-workflow plan completion audit. The base branch is ``. Use `git diff ...HEAD` to see what shipped. Do not commit or push — report only. +> +> ### Plan File Discovery 1. **Conversation context (primary):** Check if there is an active plan file in this conversation. The host agent's system messages include plan file paths when in plan mode. If found, use it directly — this is the most reliable signal. @@ -1499,19 +1523,31 @@ After producing the completion checklist: **No plan file found:** Skip entirely. "No plan file detected — skipping plan completion audit." **Include in PR body (Step 8):** Add a `## Plan Completion` section with the checklist summary. +> +> After your analysis, output a single JSON object on the LAST LINE of your response (no other text after it): +> `{"total_items":N,"done":N,"changed":N,"deferred":N,"summary":""}` + +**Parent processing:** + +1. Parse the LAST line of the subagent's output as JSON. +2. Store `done`, `deferred` for Step 20 metrics; use `summary` in PR body. +3. If `deferred > 0` and no user override, present the deferred items via AskUserQuestion before continuing. +4. Embed `summary` in PR body's `## Plan Completion` section (Step 19). + +**If the subagent fails or returns invalid JSON:** Fall back to running the audit inline. Never block /ship on subagent failure. --- -## Step 3.47: Plan Verification +## Step 8.1: Plan Verification Automatically verify the plan's testing/verification steps using the `/qa-only` skill. ### 1. Check for verification section -Using the plan file already discovered in Step 3.45, look for a verification section. Match any of these headings: `## Verification`, `## Test plan`, `## Testing`, `## How to test`, `## Manual testing`, or any section with verification-flavored items (URLs to visit, things to check visually, interactions to test). +Using the plan file already discovered in Step 8, look for a verification section. Match any of these headings: `## Verification`, `## Test plan`, `## Testing`, `## How to test`, `## Manual testing`, or any section with verification-flavored items (URLs to visit, things to check visually, interactions to test). **If no verification section found:** Skip with "No verification steps found in plan — skipping auto-verification." -**If no plan file was found in Step 3.45:** Skip (already handled). +**If no plan file was found in Step 8:** Skip (already handled). ### 2. Check for running dev server @@ -1556,7 +1592,7 @@ Follow the /qa-only workflow with these modifications: ### 5. Include in PR body -Add a `## Verification Results` section to the PR body (Step 8): +Add a `## Verification Results` section to the PR body (Step 19): - If verification ran: summary of results (N PASS, M FAIL, K SKIPPED) - If skipped: reason for skipping (no plan, no server, no verification section) @@ -1598,7 +1634,7 @@ matches a past learning, display: This makes the compounding visible. The user should see that gstack is getting smarter on their codebase over time. -## Step 3.48: Scope Drift Detection +## Step 8.2: Scope Drift Detection Before reviewing code quality, check: **did they build what was requested — nothing more, nothing less?** @@ -1635,7 +1671,7 @@ Before reviewing code quality, check: **did they build what was requested — no --- -## Step 3.5: Pre-Landing Review +## Step 9: Pre-Landing Review Review the diff for structural issues that tests don't catch. @@ -1730,7 +1766,7 @@ Present Codex output under a `CODEX (design):` header, merged with the checklist Include any design findings alongside the code review findings. They follow the same Fix-First flow below. -## Step 3.55: Review Army — Specialist Dispatch +## Step 9.1: Review Army — Specialist Dispatch ### Detect stack and scope @@ -1847,7 +1883,7 @@ CHECKLIST: --- -### Step 3.56: Collect and merge findings +### Step 9.2: Collect and merge findings After all specialist subagents complete, collect their outputs. @@ -1893,7 +1929,7 @@ SPECIALIST REVIEW: N findings (X critical, Y informational) from Z specialists PR Quality Score: X/10 ``` -These findings flow into the Fix-First flow (item 4) alongside the checklist pass (Step 3.5). +These findings flow into the Fix-First flow (item 4) alongside the checklist pass (Step 9). The Fix-First heuristic applies identically — specialist findings follow the same AUTO-FIX vs ASK classification. **Compile per-specialist stats:** @@ -1917,7 +1953,7 @@ If activated, dispatch one more subagent via the Agent tool (foreground, not bac The Red Team subagent receives: 1. The red-team checklist from `~/.claude/skills/gstack/review/specialists/red-team.md` -2. The merged specialist findings from Step 3.56 (so it knows what was already caught) +2. The merged specialist findings from Step 9.2 (so it knows what was already caught) 3. The git diff command Prompt: "You are a red team reviewer. The code has already been reviewed by N specialists @@ -1933,7 +1969,7 @@ the Fix-First flow (item 4). Red Team findings are tagged with `"specialist":"re If the Red Team returns NO FINDINGS, note: "Red Team review: no additional issues found." If the Red Team subagent fails or times out, skip silently and continue. -### Step 3.57: Cross-review finding dedup +### Step 9.3: Cross-review finding dedup Before classifying findings, check if any were previously skipped by the user in a prior review on this branch. @@ -1953,7 +1989,7 @@ If skipped fingerprints exist, get the list of files changed since that review: git diff --name-only HEAD ``` -For each current finding (from both the checklist pass (Step 3.5) and specialist review (Step 3.55-3.56)), check: +For each current finding (from both the checklist pass (Step 9) and specialist review (Step 9.1-9.2)), check: - Does its fingerprint match a previously skipped finding? - Is the finding's file path NOT in the changed-files set? @@ -1967,7 +2003,7 @@ If no prior reviews exist or none have a `findings` array, skip this step silent Output a summary header: `Pre-Landing Review: N issues (X critical, Y informational)` -4. **Classify each finding from both the checklist pass and specialist review (Step 3.55-3.56) as AUTO-FIX or ASK** per the Fix-First Heuristic in +4. **Classify each finding from both the checklist pass and specialist review (Step 9.1-Step 9.2) as AUTO-FIX or ASK** per the Fix-First Heuristic in checklist.md. Critical findings lean toward ASK; informational lean toward AUTO-FIX. 5. **Auto-fix all AUTO-FIX items.** Apply each fix. Output one line per fix: @@ -1981,7 +2017,7 @@ Output a summary header: `Pre-Landing Review: N issues (X critical, Y informatio 7. **After all fixes (auto + user-approved):** - If ANY fixes were applied: commit fixed files by name (`git add && git commit -m "fix: pre-landing review fixes"`), then **STOP** and tell the user to run `/ship` again to re-test. - - If no fixes applied (all ASK items skipped, or no issues found): continue to Step 4. + - If no fixes applied (all ASK items skipped, or no issues found): continue to Step 12. 8. Output summary: `Pre-Landing Review: N issues — M auto-fixed, K asked (J fixed, L skipped)` @@ -1993,27 +2029,38 @@ Output a summary header: `Pre-Landing Review: N issues (X critical, Y informatio ``` Substitute TIMESTAMP (ISO 8601), STATUS ("clean" if no issues, "issues_found" otherwise), and N values from the summary counts above. The `via:"ship"` distinguishes from standalone `/review` runs. -- `quality_score` = the PR Quality Score computed in Step 3.56 (e.g., 7.5). If specialists were skipped (small diff), use `10.0` -- `specialists` = the per-specialist stats object compiled in Step 3.56. Each specialist that was considered gets an entry: `{"dispatched":true/false,"findings":N,"critical":N,"informational":N}` if dispatched, or `{"dispatched":false,"reason":"scope|gated"}` if skipped. Example: `{"testing":{"dispatched":true,"findings":2,"critical":0,"informational":2},"security":{"dispatched":false,"reason":"scope"}}` +- `quality_score` = the PR Quality Score computed in Step 9.2 (e.g., 7.5). If specialists were skipped (small diff), use `10.0` +- `specialists` = the per-specialist stats object compiled in Step 9.2. Each specialist that was considered gets an entry: `{"dispatched":true/false,"findings":N,"critical":N,"informational":N}` if dispatched, or `{"dispatched":false,"reason":"scope|gated"}` if skipped. Example: `{"testing":{"dispatched":true,"findings":2,"critical":0,"informational":2},"security":{"dispatched":false,"reason":"scope"}}` - `findings` = array of per-finding records. For each finding (from checklist pass and specialists), include: `{"fingerprint":"path:line:category","severity":"CRITICAL|INFORMATIONAL","action":"ACTION"}`. ACTION is `"auto-fixed"`, `"fixed"` (user approved), or `"skipped"` (user chose Skip). -Save the review output — it goes into the PR body in Step 8. +Save the review output — it goes into the PR body in Step 19. --- -## Step 3.75: Address Greptile review comments (if PR exists) +## Step 10: Address Greptile review comments (if PR exists) -Read `.claude/skills/review/greptile-triage.md` and follow the fetch, filter, classify, and **escalation detection** steps. +**Dispatch the fetch + classification as a subagent** using the Agent tool with `subagent_type: "general-purpose"`. The subagent pulls every Greptile comment, runs the escalation detection algorithm, and classifies each comment. Parent receives a structured list and handles user interaction + file edits. -**If no PR exists, `gh` fails, API returns an error, or there are zero Greptile comments:** Skip this step silently. Continue to Step 4. +**Subagent prompt:** -**If Greptile comments are found:** +> You are classifying Greptile review comments for a /ship workflow. Read `.claude/skills/review/greptile-triage.md` and follow the fetch, filter, classify, and **escalation detection** steps. Do NOT fix code, do NOT reply to comments, do NOT commit — report only. +> +> For each comment, assign: `classification` (`valid_actionable`, `already_fixed`, `false_positive`, `suppressed`), `escalation_tier` (1 or 2), the file:line or [top-level] tag, body summary, and permalink URL. +> +> If no PR exists, `gh` fails, the API errors, or there are zero comments, output: `{"total":0,"comments":[]}` and stop. +> +> Otherwise, output a single JSON object on the LAST LINE of your response: +> `{"total":N,"comments":[{"classification":"...","escalation_tier":N,"ref":"file:line","summary":"...","permalink":"url"},...]}` -Include a Greptile summary in your output: `+ N Greptile comments (X valid, Y fixed, Z FP)` +**Parent processing:** -Before replying to any comment, run the **Escalation Detection** algorithm from greptile-triage.md to determine whether to use Tier 1 (friendly) or Tier 2 (firm) reply templates. +Parse the LAST line as JSON. -For each classified comment: +If `total` is 0, skip this step silently. Continue to Step 12. + +Otherwise, print: `+ {total} Greptile comments ({valid_actionable} valid, {already_fixed} already fixed, {false_positive} FP)`. + +For each comment in `comments`: **VALID & ACTIONABLE:** Use AskUserQuestion with: - The comment (file:line or [top-level] + body summary + permalink URL) @@ -2036,11 +2083,11 @@ For each classified comment: **SUPPRESSED:** Skip silently — these are known false positives from previous triage. -**After all comments are resolved:** If any fixes were applied, the tests from Step 3 are now stale. **Re-run tests** (Step 3) before continuing to Step 4. If no fixes were applied, continue to Step 4. +**After all comments are resolved:** If any fixes were applied, the tests from Step 5 are now stale. **Re-run tests** (Step 5) before continuing to Step 12. If no fixes were applied, continue to Step 12. --- -## Step 3.8: Adversarial review (always-on) +## Step 11: Adversarial review (always-on) Every diff gets adversarial review from both Claude and Codex. LOC is not a proxy for risk — a 5-line auth change can be critical. @@ -2126,7 +2173,7 @@ A) Investigate and fix now (recommended) B) Continue — review will still complete ``` -If A: address the findings. After fixing, re-run tests (Step 3) since code has changed. Re-run `codex review` to verify. +If A: address the findings. After fixing, re-run tests (Step 5) since code has changed. Re-run `codex review` to verify. Read stderr for errors (same error handling as Codex adversarial above). @@ -2192,7 +2239,7 @@ already knows. A good test: would this insight save time in a future session? If -## Step 4: Version bump (auto-decide) +## Step 12: Version bump (auto-decide) **Idempotency check:** Before bumping, compare VERSION against the base branch. @@ -2223,7 +2270,7 @@ If output shows `ALREADY_BUMPED`, VERSION was already bumped on this branch (pri --- -## CHANGELOG (auto-generate) +## Step 13: CHANGELOG (auto-generate) 1. Read `CHANGELOG.md` header to know the format. @@ -2267,7 +2314,7 @@ If output shows `ALREADY_BUMPED`, VERSION was already bumped on this branch (pri --- -## Step 5.5: TODOS.md (auto-update) +## Step 14: TODOS.md (auto-update) Cross-reference the project's TODOS.md against the changes being shipped. Mark completed items automatically; prompt only if the file is missing or disorganized. @@ -2279,7 +2326,7 @@ Read `.claude/skills/review/TODOS-format.md` for the canonical format reference. - Message: "GStack recommends maintaining a TODOS.md organized by skill/component, then priority (P0 at top through P4, then Completed at bottom). See TODOS-format.md for the full format. Would you like to create one?" - Options: A) Create it now, B) Skip for now - If A: Create `TODOS.md` with a skeleton (# TODOS heading + ## Completed section). Continue to step 3. -- If B: Skip the rest of Step 5.5. Continue to Step 6. +- If B: Skip the rest of Step 14. Continue to Step 15. **2. Check structure and organization:** @@ -2318,11 +2365,11 @@ For each TODO item, check if the changes in this PR complete it by: **6. Defensive:** If TODOS.md cannot be written (permission error, disk full), warn the user and continue. Never stop the ship workflow for a TODOS failure. -Save this summary — it goes into the PR body in Step 8. +Save this summary — it goes into the PR body in Step 19. --- -## Step 6: Commit (bisectable chunks) +## Step 15: Commit (bisectable chunks) **Goal:** Create small, logical commits that work well with `git bisect` and help LLMs understand what changed. @@ -2360,13 +2407,13 @@ EOF --- -## Step 6.5: Verification Gate +## Step 16: Verification Gate **IRON LAW: NO COMPLETION CLAIMS WITHOUT FRESH VERIFICATION EVIDENCE.** Before pushing, re-verify if code changed during Steps 4-6: -1. **Test verification:** If ANY code changed after Step 3's test run (fixes from review findings, CHANGELOG edits don't count), re-run the test suite. Paste fresh output. Stale output from Step 3 is NOT acceptable. +1. **Test verification:** If ANY code changed after Step 5's test run (fixes from review findings, CHANGELOG edits don't count), re-run the test suite. Paste fresh output. Stale output from Step 5 is NOT acceptable. 2. **Build verification:** If the project has a build step, run it. Paste output. @@ -2376,13 +2423,13 @@ Before pushing, re-verify if code changed during Steps 4-6: - "I already tested earlier" → Code changed since then. Test again. - "It's a trivial change" → Trivial changes break production. -**If tests fail here:** STOP. Do not push. Fix the issue and return to Step 3. +**If tests fail here:** STOP. Do not push. Fix the issue and return to Step 5. Claiming work is complete without verification is dishonesty, not efficiency. --- -## Step 7: Push +## Step 17: Push **Idempotency check:** Check if the branch is already pushed and up to date. @@ -2394,15 +2441,44 @@ echo "LOCAL: $LOCAL REMOTE: $REMOTE" [ "$LOCAL" = "$REMOTE" ] && echo "ALREADY_PUSHED" || echo "PUSH_NEEDED" ``` -If `ALREADY_PUSHED`, skip the push but continue to Step 8. Otherwise push with upstream tracking: +If `ALREADY_PUSHED`, skip the push but continue to Step 18. Otherwise push with upstream tracking: ```bash git push -u origin ``` +**You are NOT done.** The code is pushed but documentation sync and PR creation are mandatory final steps. Continue to Step 18. + --- -## Step 8: Create PR/MR +## Step 18: Documentation sync (via subagent, before PR creation) + +**Dispatch /document-release as a subagent** using the Agent tool with `subagent_type: "general-purpose"`. The subagent gets a fresh context window — zero rot from the preceding 17 steps. It also runs the **full** `/document-release` workflow (with CHANGELOG clobber protection, doc exclusions, risky-change gates, named staging, race-safe PR body editing) rather than a weaker reimplementation. + +**Sequencing:** This step runs AFTER Step 17 (Push) and BEFORE Step 19 (Create PR). The PR is created once from final HEAD with the `## Documentation` section baked into the initial body. No create-then-re-edit dance. + +**Subagent prompt:** + +> You are executing the /document-release workflow after a code push. Read the full skill file `${HOME}/.claude/skills/gstack/document-release/SKILL.md` and execute its complete workflow end-to-end, including CHANGELOG clobber protection, doc exclusions, risky-change gates, and named staging. Do NOT attempt to edit the PR body — no PR exists yet. Branch: ``, base: ``. +> +> After completing the workflow, output a single JSON object on the LAST LINE of your response (no other text after it): +> `{"files_updated":["README.md","CLAUDE.md",...],"commit_sha":"abc1234","pushed":true,"documentation_section":""}` +> +> If no documentation files needed updating, output: +> `{"files_updated":[],"commit_sha":null,"pushed":false,"documentation_section":null}` + +**Parent processing:** + +1. Parse the LAST line of the subagent's output as JSON. +2. Store `documentation_section` — Step 19 embeds it in the PR body (or omits the section if null). +3. If `files_updated` is non-empty, print: `Documentation synced: {files_updated.length} files updated, committed as {commit_sha}`. +4. If `files_updated` is empty, print: `Documentation is current — no updates needed.` + +**If the subagent fails or returns invalid JSON:** Print a warning and proceed to Step 19 without a `## Documentation` section. Do not block /ship on subagent failure. The user can run `/document-release` manually after the PR lands. + +--- + +## Step 19: Create PR/MR **Idempotency check:** Check if a PR/MR already exists for this branch. @@ -2416,7 +2492,7 @@ gh pr view --json url,number,state -q 'if .state == "OPEN" then "PR #\(.number): glab mr view -F json 2>/dev/null | jq -r 'if .state == "opened" then "MR_EXISTS" else "NO_MR" end' 2>/dev/null || echo "NO_MR" ``` -If an **open** PR/MR already exists: **update** the PR body using `gh pr edit --body "..."` (GitHub) or `glab mr update -d "..."` (GitLab). Always regenerate the PR body from scratch using this run's fresh results (test output, coverage audit, review findings, adversarial review, TODOS summary). Never reuse stale PR body content from a prior run. Print the existing URL and continue to Step 8.5. +If an **open** PR/MR already exists: **update** the PR body using `gh pr edit --body "..."` (GitHub) or `glab mr update -d "..."` (GitLab). Always regenerate the PR body from scratch using this run's fresh results (test output, coverage audit, review findings, adversarial review, TODOS summary, documentation_section from Step 18). Never reuse stale PR body content from a prior run. Print the existing URL and continue to Step 20. If no PR/MR exists: create a pull request (GitHub) or merge request (GitLab) using the platform detected in Step 0. @@ -2432,11 +2508,11 @@ must appear in at least one section. If a commit's work isn't reflected in the s you missed it.> ## Test Coverage - - + + ## Pre-Landing Review - + ## Design Review @@ -2448,19 +2524,19 @@ you missed it.> ## Greptile Review - + ## Scope Drift ## Plan Completion - + ## Verification Results - + @@ -2470,6 +2546,10 @@ you missed it.> +## Documentation + + + ## Test plan - [x] All Rails tests pass (N runs, 0 failures) - [x] All Vitest tests pass (N tests) @@ -2498,34 +2578,11 @@ EOF **If neither CLI is available:** Print the branch name, remote URL, and instruct the user to create the PR/MR manually via the web UI. Do not stop — the code is pushed and ready. -**Output the PR/MR URL** — then proceed to Step 8.5. +**Output the PR/MR URL** — then proceed to Step 20. --- -## Step 8.5: Auto-invoke /document-release - -After the PR is created, automatically sync project documentation. Read the -`document-release/SKILL.md` skill file (adjacent to this skill's directory) and -execute its full workflow: - -1. Read the `/document-release` skill: `cat ${CLAUDE_SKILL_DIR}/../document-release/SKILL.md` -2. Follow its instructions — it reads all .md files in the project, cross-references - the diff, and updates anything that drifted (README, ARCHITECTURE, CONTRIBUTING, - CLAUDE.md, TODOS, etc.) -3. If any docs were updated, commit the changes and push to the same branch: - ```bash - git add -A && git commit -m "docs: sync documentation with shipped changes" && git push - ``` -4. If no docs needed updating, say "Documentation is current — no updates needed." - -This step is automatic. Do not ask the user for confirmation. The goal is zero-friction -doc updates — the user runs `/ship` and documentation stays current without a separate command. - -If Step 8.5 created a docs commit, re-edit the PR/MR body to include the latest commit SHA in the summary. This ensures the PR body reflects the truly final state after document-release. - ---- - -## Step 8.75: Persist ship metrics +## Step 20: Persist ship metrics Log coverage and plan completion data so `/retro` can track trends: @@ -2540,10 +2597,10 @@ echo '{"skill":"ship","timestamp":"'"$(date -u +%Y-%m-%dT%H:%M:%SZ)"'","coverage ``` Substitute from earlier steps: -- **COVERAGE_PCT**: coverage percentage from Step 3.4 diagram (integer, or -1 if undetermined) -- **PLAN_TOTAL**: total plan items extracted in Step 3.45 (0 if no plan file) -- **PLAN_DONE**: count of DONE + CHANGED items from Step 3.45 (0 if no plan file) -- **VERIFY_RESULT**: "pass", "fail", or "skipped" from Step 3.47 +- **COVERAGE_PCT**: coverage percentage from Step 7 diagram (integer, or -1 if undetermined) +- **PLAN_TOTAL**: total plan items extracted in Step 8 (0 if no plan file) +- **PLAN_DONE**: count of DONE + CHANGED items from Step 8 (0 if no plan file) +- **VERIFY_RESULT**: "pass", "fail", or "skipped" from Step 8.1 - **VERSION**: from the VERSION file - **BRANCH**: current branch name @@ -2562,6 +2619,6 @@ This step is automatic — never skip it, never ask for confirmation. - **Split commits for bisectability** — each commit = one logical change. - **TODOS.md completion detection must be conservative.** Only mark items as completed when the diff clearly shows the work is done. - **Use Greptile reply templates from greptile-triage.md.** Every reply includes evidence (inline diff, code references, re-rank suggestion). Never post vague replies. -- **Never push without fresh verification evidence.** If code changed after Step 3 tests, re-run before pushing. -- **Step 3.4 generates coverage tests.** They must pass before committing. Never commit failing tests. +- **Never push without fresh verification evidence.** If code changed after Step 5 tests, re-run before pushing. +- **Step 7 generates coverage tests.** They must pass before committing. Never commit failing tests. - **The goal is: user says `/ship`, next thing they see is the review + PR URL + auto-synced docs.** diff --git a/ship/SKILL.md.tmpl b/ship/SKILL.md.tmpl index 0af2ea62..e262d74e 100644 --- a/ship/SKILL.md.tmpl +++ b/ship/SKILL.md.tmpl @@ -41,17 +41,17 @@ You are running the `/ship` workflow. This is a **non-interactive, fully automat - Merge conflicts that can't be auto-resolved (stop, show conflicts) - In-branch test failures (pre-existing failures are triaged, not auto-blocking) - Pre-landing review finds ASK items that need user judgment -- MINOR or MAJOR version bump needed (ask — see Step 4) +- MINOR or MAJOR version bump needed (ask — see Step 12) - Greptile review comments that need user decision (complex fixes, false positives) -- AI-assessed coverage below minimum threshold (hard gate with user override — see Step 3.4) -- Plan items NOT DONE with no user override (see Step 3.45) -- Plan verification failures (see Step 3.47) -- TODOS.md missing and user wants to create one (ask — see Step 5.5) -- TODOS.md disorganized and user wants to reorganize (ask — see Step 5.5) +- AI-assessed coverage below minimum threshold (hard gate with user override — see Step 7) +- Plan items NOT DONE with no user override (see Step 8) +- Plan verification failures (see Step 8.1) +- TODOS.md missing and user wants to create one (ask — see Step 14) +- TODOS.md disorganized and user wants to reorganize (ask — see Step 14) **Never stop for:** - Uncommitted changes (always include them) -- Version bump choice (auto-pick MICRO or PATCH — see Step 4) +- Version bump choice (auto-pick MICRO or PATCH — see Step 12) - CHANGELOG content (auto-generate from diff) - Commit message approval (auto-commit) - Multi-file changesets (auto-split into bisectable commits) @@ -64,9 +64,9 @@ Re-running `/ship` means "run the whole checklist again." Every verification ste (tests, coverage audit, plan completion, pre-landing review, adversarial review, VERSION/CHANGELOG check, TODOS, document-release) runs on every invocation. Only *actions* are idempotent: -- Step 4: If VERSION already bumped, skip the bump but still read the version -- Step 7: If already pushed, skip the push command -- Step 8: If PR exists, update the body instead of creating a new PR +- Step 12: If VERSION already bumped, skip the bump but still read the version +- Step 17: If already pushed, skip the push command +- Step 19: If PR exists, update the body instead of creating a new PR Never skip a verification step because a prior `/ship` run already performed it. --- @@ -85,19 +85,19 @@ Never skip a verification step because a prior `/ship` run already performed it. If the Eng Review is NOT "CLEAR": -Print: "No prior eng review found — ship will run its own pre-landing review in Step 3.5." +Print: "No prior eng review found — ship will run its own pre-landing review in Step 9." Check diff size: `git diff ...HEAD --stat | tail -1`. If the diff is >200 lines, add: "Note: This is a large diff. Consider running `/plan-eng-review` or `/autoplan` for architecture-level review before shipping." If CEO Review is missing, mention as informational ("CEO Review not run — recommended for product changes") but do NOT block. -For Design Review: run `source <(~/.claude/skills/gstack/bin/gstack-diff-scope 2>/dev/null)`. If `SCOPE_FRONTEND=true` and no design review (plan-design-review or design-review-lite) exists in the dashboard, mention: "Design Review not run — this PR changes frontend code. The lite design check will run automatically in Step 3.5, but consider running /design-review for a full visual audit post-implementation." Still never block. +For Design Review: run `source <(~/.claude/skills/gstack/bin/gstack-diff-scope 2>/dev/null)`. If `SCOPE_FRONTEND=true` and no design review (plan-design-review or design-review-lite) exists in the dashboard, mention: "Design Review not run — this PR changes frontend code. The lite design check will run automatically in Step 9, but consider running /design-review for a full visual audit post-implementation." Still never block. -Continue to Step 1.5 — do NOT block or ask. Ship runs its own review in Step 3.5. +Continue to Step 2 — do NOT block or ask. Ship runs its own review in Step 9. --- -## Step 1.5: Distribution Pipeline Check +## Step 2: Distribution Pipeline Check If the diff introduces a new standalone artifact (CLI binary, library package, tool) — not a web service with existing deployment — verify that a distribution pipeline exists. @@ -125,7 +125,7 @@ service with existing deployment — verify that a distribution pipeline exists. --- -## Step 2: Merge the base branch (BEFORE tests) +## Step 3: Merge the base branch (BEFORE tests) Fetch and merge the base branch into the feature branch so tests run against the merged state: @@ -139,13 +139,13 @@ git fetch origin && git merge origin/ --no-edit --- -## Step 2.5: Test Framework Bootstrap +## Step 4: Test Framework Bootstrap {{TEST_BOOTSTRAP}} --- -## Step 3: Run tests (on merged code) +## Step 5: Run tests (on merged code) **Do NOT run `RAILS_ENV=test bin/rails db:migrate`** — `bin/test-lane` already calls `db:test:prepare` internally, which loads the schema into the correct lane database. @@ -165,13 +165,13 @@ After both complete, read the output files and check pass/fail. {{TEST_FAILURE_TRIAGE}} -**After triage:** If any in-branch failures remain unfixed, **STOP**. Do not proceed. If all failures were pre-existing and handled (fixed, TODOed, assigned, or skipped), continue to Step 3.25. +**After triage:** If any in-branch failures remain unfixed, **STOP**. Do not proceed. If all failures were pre-existing and handled (fixed, TODOed, assigned, or skipped), continue to Step 6. **If all pass:** Continue silently — just note the counts briefly. --- -## Step 3.25: Eval Suites (conditional) +## Step 6: Eval Suites (conditional) Evals are mandatory when prompt-related files change. Skip this step entirely if no prompt files are in the diff. @@ -190,7 +190,7 @@ Match against these patterns (from CLAUDE.md): - `config/system_prompts/*.txt` - `test/evals/**/*` (eval infrastructure changes affect all suites) -**If no matches:** Print "No prompt-related files changed — skipping evals." and continue to Step 3.5. +**If no matches:** Print "No prompt-related files changed — skipping evals." and continue to Step 9. **2. Identify affected eval suites:** @@ -220,9 +220,9 @@ If multiple suites need to run, run them sequentially (each needs a test lane). **4. Check results:** - **If any eval fails:** Show the failures, the cost dashboard, and **STOP**. Do not proceed. -- **If all pass:** Note pass counts and cost. Continue to Step 3.5. +- **If all pass:** Note pass counts and cost. Continue to Step 9. -**5. Save eval output** — include eval results and cost dashboard in the PR body (Step 8). +**5. Save eval output** — include eval results and cost dashboard in the PR body (Step 19). **Tier reference (for context — /ship always uses `full`):** | Tier | When | Speed (cached) | Cost | @@ -233,15 +233,51 @@ If multiple suites need to run, run them sequentially (each needs a test lane). --- -## Step 3.4: Test Coverage Audit +## Step 7: Test Coverage Audit -{{TEST_COVERAGE_AUDIT_SHIP}} +**Dispatch this step as a subagent** using the Agent tool with `subagent_type: "general-purpose"`. The subagent runs the coverage audit in a fresh context window — the parent only sees the conclusion, not intermediate file reads. This is context-rot defense. + +**Subagent prompt:** Pass the following instructions to the subagent, with `` substituted with the base branch: + +> You are running a ship-workflow test coverage audit. Run `git diff ...HEAD` as needed. Do not commit or push — report only. +> +> {{TEST_COVERAGE_AUDIT_SHIP}} +> +> After your analysis, output a single JSON object on the LAST LINE of your response (no other text after it): +> `{"coverage_pct":N,"gaps":N,"diagram":"","tests_added":["path",...]}` + +**Parent processing:** + +1. Read the subagent's final output. Parse the LAST line as JSON. +2. Store `coverage_pct` (for Step 20 metrics), `gaps` (user summary), `tests_added` (for the commit). +3. Embed `diagram` verbatim in the PR body's `## Test Coverage` section (Step 19). +4. Print a one-line summary: `Coverage: {coverage_pct}%, {gaps} gaps. {tests_added.length} tests added.` + +**If the subagent fails, times out, or returns invalid JSON:** Fall back to running the audit inline in the parent. Do not block /ship on subagent failure — partial results are better than none. --- -## Step 3.45: Plan Completion Audit +## Step 8: Plan Completion Audit -{{PLAN_COMPLETION_AUDIT_SHIP}} +**Dispatch this step as a subagent** using the Agent tool with `subagent_type: "general-purpose"`. The subagent reads the plan file and every referenced code file in its own fresh context. Parent gets only the conclusion. + +**Subagent prompt:** Pass these instructions to the subagent: + +> You are running a ship-workflow plan completion audit. The base branch is ``. Use `git diff ...HEAD` to see what shipped. Do not commit or push — report only. +> +> {{PLAN_COMPLETION_AUDIT_SHIP}} +> +> After your analysis, output a single JSON object on the LAST LINE of your response (no other text after it): +> `{"total_items":N,"done":N,"changed":N,"deferred":N,"summary":""}` + +**Parent processing:** + +1. Parse the LAST line of the subagent's output as JSON. +2. Store `done`, `deferred` for Step 20 metrics; use `summary` in PR body. +3. If `deferred > 0` and no user override, present the deferred items via AskUserQuestion before continuing. +4. Embed `summary` in PR body's `## Plan Completion` section (Step 19). + +**If the subagent fails or returns invalid JSON:** Fall back to running the audit inline. Never block /ship on subagent failure. --- @@ -253,7 +289,7 @@ If multiple suites need to run, run them sequentially (each needs a test lane). --- -## Step 3.5: Pre-Landing Review +## Step 9: Pre-Landing Review Review the diff for structural issues that tests don't catch. @@ -275,7 +311,7 @@ Review the diff for structural issues that tests don't catch. {{CROSS_REVIEW_DEDUP}} -4. **Classify each finding from both the checklist pass and specialist review (Step 3.55-3.56) as AUTO-FIX or ASK** per the Fix-First Heuristic in +4. **Classify each finding from both the checklist pass and specialist review (Step 9.1-Step 9.2) as AUTO-FIX or ASK** per the Fix-First Heuristic in checklist.md. Critical findings lean toward ASK; informational lean toward AUTO-FIX. 5. **Auto-fix all AUTO-FIX items.** Apply each fix. Output one line per fix: @@ -289,7 +325,7 @@ Review the diff for structural issues that tests don't catch. 7. **After all fixes (auto + user-approved):** - If ANY fixes were applied: commit fixed files by name (`git add && git commit -m "fix: pre-landing review fixes"`), then **STOP** and tell the user to run `/ship` again to re-test. - - If no fixes applied (all ASK items skipped, or no issues found): continue to Step 4. + - If no fixes applied (all ASK items skipped, or no issues found): continue to Step 12. 8. Output summary: `Pre-Landing Review: N issues — M auto-fixed, K asked (J fixed, L skipped)` @@ -301,27 +337,38 @@ Review the diff for structural issues that tests don't catch. ``` Substitute TIMESTAMP (ISO 8601), STATUS ("clean" if no issues, "issues_found" otherwise), and N values from the summary counts above. The `via:"ship"` distinguishes from standalone `/review` runs. -- `quality_score` = the PR Quality Score computed in Step 3.56 (e.g., 7.5). If specialists were skipped (small diff), use `10.0` -- `specialists` = the per-specialist stats object compiled in Step 3.56. Each specialist that was considered gets an entry: `{"dispatched":true/false,"findings":N,"critical":N,"informational":N}` if dispatched, or `{"dispatched":false,"reason":"scope|gated"}` if skipped. Example: `{"testing":{"dispatched":true,"findings":2,"critical":0,"informational":2},"security":{"dispatched":false,"reason":"scope"}}` +- `quality_score` = the PR Quality Score computed in Step 9.2 (e.g., 7.5). If specialists were skipped (small diff), use `10.0` +- `specialists` = the per-specialist stats object compiled in Step 9.2. Each specialist that was considered gets an entry: `{"dispatched":true/false,"findings":N,"critical":N,"informational":N}` if dispatched, or `{"dispatched":false,"reason":"scope|gated"}` if skipped. Example: `{"testing":{"dispatched":true,"findings":2,"critical":0,"informational":2},"security":{"dispatched":false,"reason":"scope"}}` - `findings` = array of per-finding records. For each finding (from checklist pass and specialists), include: `{"fingerprint":"path:line:category","severity":"CRITICAL|INFORMATIONAL","action":"ACTION"}`. ACTION is `"auto-fixed"`, `"fixed"` (user approved), or `"skipped"` (user chose Skip). -Save the review output — it goes into the PR body in Step 8. +Save the review output — it goes into the PR body in Step 19. --- -## Step 3.75: Address Greptile review comments (if PR exists) +## Step 10: Address Greptile review comments (if PR exists) -Read `.claude/skills/review/greptile-triage.md` and follow the fetch, filter, classify, and **escalation detection** steps. +**Dispatch the fetch + classification as a subagent** using the Agent tool with `subagent_type: "general-purpose"`. The subagent pulls every Greptile comment, runs the escalation detection algorithm, and classifies each comment. Parent receives a structured list and handles user interaction + file edits. -**If no PR exists, `gh` fails, API returns an error, or there are zero Greptile comments:** Skip this step silently. Continue to Step 4. +**Subagent prompt:** -**If Greptile comments are found:** +> You are classifying Greptile review comments for a /ship workflow. Read `.claude/skills/review/greptile-triage.md` and follow the fetch, filter, classify, and **escalation detection** steps. Do NOT fix code, do NOT reply to comments, do NOT commit — report only. +> +> For each comment, assign: `classification` (`valid_actionable`, `already_fixed`, `false_positive`, `suppressed`), `escalation_tier` (1 or 2), the file:line or [top-level] tag, body summary, and permalink URL. +> +> If no PR exists, `gh` fails, the API errors, or there are zero comments, output: `{"total":0,"comments":[]}` and stop. +> +> Otherwise, output a single JSON object on the LAST LINE of your response: +> `{"total":N,"comments":[{"classification":"...","escalation_tier":N,"ref":"file:line","summary":"...","permalink":"url"},...]}` -Include a Greptile summary in your output: `+ N Greptile comments (X valid, Y fixed, Z FP)` +**Parent processing:** -Before replying to any comment, run the **Escalation Detection** algorithm from greptile-triage.md to determine whether to use Tier 1 (friendly) or Tier 2 (firm) reply templates. +Parse the LAST line as JSON. -For each classified comment: +If `total` is 0, skip this step silently. Continue to Step 12. + +Otherwise, print: `+ {total} Greptile comments ({valid_actionable} valid, {already_fixed} already fixed, {false_positive} FP)`. + +For each comment in `comments`: **VALID & ACTIONABLE:** Use AskUserQuestion with: - The comment (file:line or [top-level] + body summary + permalink URL) @@ -344,7 +391,7 @@ For each classified comment: **SUPPRESSED:** Skip silently — these are known false positives from previous triage. -**After all comments are resolved:** If any fixes were applied, the tests from Step 3 are now stale. **Re-run tests** (Step 3) before continuing to Step 4. If no fixes were applied, continue to Step 4. +**After all comments are resolved:** If any fixes were applied, the tests from Step 5 are now stale. **Re-run tests** (Step 5) before continuing to Step 12. If no fixes were applied, continue to Step 12. --- @@ -354,7 +401,7 @@ For each classified comment: {{GBRAIN_SAVE_RESULTS}} -## Step 4: Version bump (auto-decide) +## Step 12: Version bump (auto-decide) **Idempotency check:** Before bumping, compare VERSION against the base branch. @@ -389,7 +436,7 @@ If output shows `ALREADY_BUMPED`, VERSION was already bumped on this branch (pri --- -## Step 5.5: TODOS.md (auto-update) +## Step 14: TODOS.md (auto-update) Cross-reference the project's TODOS.md against the changes being shipped. Mark completed items automatically; prompt only if the file is missing or disorganized. @@ -401,7 +448,7 @@ Read `.claude/skills/review/TODOS-format.md` for the canonical format reference. - Message: "GStack recommends maintaining a TODOS.md organized by skill/component, then priority (P0 at top through P4, then Completed at bottom). See TODOS-format.md for the full format. Would you like to create one?" - Options: A) Create it now, B) Skip for now - If A: Create `TODOS.md` with a skeleton (# TODOS heading + ## Completed section). Continue to step 3. -- If B: Skip the rest of Step 5.5. Continue to Step 6. +- If B: Skip the rest of Step 14. Continue to Step 15. **2. Check structure and organization:** @@ -440,11 +487,11 @@ For each TODO item, check if the changes in this PR complete it by: **6. Defensive:** If TODOS.md cannot be written (permission error, disk full), warn the user and continue. Never stop the ship workflow for a TODOS failure. -Save this summary — it goes into the PR body in Step 8. +Save this summary — it goes into the PR body in Step 19. --- -## Step 6: Commit (bisectable chunks) +## Step 15: Commit (bisectable chunks) **Goal:** Create small, logical commits that work well with `git bisect` and help LLMs understand what changed. @@ -482,13 +529,13 @@ EOF --- -## Step 6.5: Verification Gate +## Step 16: Verification Gate **IRON LAW: NO COMPLETION CLAIMS WITHOUT FRESH VERIFICATION EVIDENCE.** Before pushing, re-verify if code changed during Steps 4-6: -1. **Test verification:** If ANY code changed after Step 3's test run (fixes from review findings, CHANGELOG edits don't count), re-run the test suite. Paste fresh output. Stale output from Step 3 is NOT acceptable. +1. **Test verification:** If ANY code changed after Step 5's test run (fixes from review findings, CHANGELOG edits don't count), re-run the test suite. Paste fresh output. Stale output from Step 5 is NOT acceptable. 2. **Build verification:** If the project has a build step, run it. Paste output. @@ -498,13 +545,13 @@ Before pushing, re-verify if code changed during Steps 4-6: - "I already tested earlier" → Code changed since then. Test again. - "It's a trivial change" → Trivial changes break production. -**If tests fail here:** STOP. Do not push. Fix the issue and return to Step 3. +**If tests fail here:** STOP. Do not push. Fix the issue and return to Step 5. Claiming work is complete without verification is dishonesty, not efficiency. --- -## Step 7: Push +## Step 17: Push **Idempotency check:** Check if the branch is already pushed and up to date. @@ -516,15 +563,44 @@ echo "LOCAL: $LOCAL REMOTE: $REMOTE" [ "$LOCAL" = "$REMOTE" ] && echo "ALREADY_PUSHED" || echo "PUSH_NEEDED" ``` -If `ALREADY_PUSHED`, skip the push but continue to Step 8. Otherwise push with upstream tracking: +If `ALREADY_PUSHED`, skip the push but continue to Step 18. Otherwise push with upstream tracking: ```bash git push -u origin ``` +**You are NOT done.** The code is pushed but documentation sync and PR creation are mandatory final steps. Continue to Step 18. + --- -## Step 8: Create PR/MR +## Step 18: Documentation sync (via subagent, before PR creation) + +**Dispatch /document-release as a subagent** using the Agent tool with `subagent_type: "general-purpose"`. The subagent gets a fresh context window — zero rot from the preceding 17 steps. It also runs the **full** `/document-release` workflow (with CHANGELOG clobber protection, doc exclusions, risky-change gates, named staging, race-safe PR body editing) rather than a weaker reimplementation. + +**Sequencing:** This step runs AFTER Step 17 (Push) and BEFORE Step 19 (Create PR). The PR is created once from final HEAD with the `## Documentation` section baked into the initial body. No create-then-re-edit dance. + +**Subagent prompt:** + +> You are executing the /document-release workflow after a code push. Read the full skill file `${HOME}/.claude/skills/gstack/document-release/SKILL.md` and execute its complete workflow end-to-end, including CHANGELOG clobber protection, doc exclusions, risky-change gates, and named staging. Do NOT attempt to edit the PR body — no PR exists yet. Branch: ``, base: ``. +> +> After completing the workflow, output a single JSON object on the LAST LINE of your response (no other text after it): +> `{"files_updated":["README.md","CLAUDE.md",...],"commit_sha":"abc1234","pushed":true,"documentation_section":""}` +> +> If no documentation files needed updating, output: +> `{"files_updated":[],"commit_sha":null,"pushed":false,"documentation_section":null}` + +**Parent processing:** + +1. Parse the LAST line of the subagent's output as JSON. +2. Store `documentation_section` — Step 19 embeds it in the PR body (or omits the section if null). +3. If `files_updated` is non-empty, print: `Documentation synced: {files_updated.length} files updated, committed as {commit_sha}`. +4. If `files_updated` is empty, print: `Documentation is current — no updates needed.` + +**If the subagent fails or returns invalid JSON:** Print a warning and proceed to Step 19 without a `## Documentation` section. Do not block /ship on subagent failure. The user can run `/document-release` manually after the PR lands. + +--- + +## Step 19: Create PR/MR **Idempotency check:** Check if a PR/MR already exists for this branch. @@ -538,7 +614,7 @@ gh pr view --json url,number,state -q 'if .state == "OPEN" then "PR #\(.number): glab mr view -F json 2>/dev/null | jq -r 'if .state == "opened" then "MR_EXISTS" else "NO_MR" end' 2>/dev/null || echo "NO_MR" ``` -If an **open** PR/MR already exists: **update** the PR body using `gh pr edit --body "..."` (GitHub) or `glab mr update -d "..."` (GitLab). Always regenerate the PR body from scratch using this run's fresh results (test output, coverage audit, review findings, adversarial review, TODOS summary). Never reuse stale PR body content from a prior run. Print the existing URL and continue to Step 8.5. +If an **open** PR/MR already exists: **update** the PR body using `gh pr edit --body "..."` (GitHub) or `glab mr update -d "..."` (GitLab). Always regenerate the PR body from scratch using this run's fresh results (test output, coverage audit, review findings, adversarial review, TODOS summary, documentation_section from Step 18). Never reuse stale PR body content from a prior run. Print the existing URL and continue to Step 20. If no PR/MR exists: create a pull request (GitHub) or merge request (GitLab) using the platform detected in Step 0. @@ -554,11 +630,11 @@ must appear in at least one section. If a commit's work isn't reflected in the s you missed it.> ## Test Coverage - - + + ## Pre-Landing Review - + ## Design Review @@ -570,19 +646,19 @@ you missed it.> ## Greptile Review - + ## Scope Drift ## Plan Completion - + ## Verification Results - + @@ -592,6 +668,10 @@ you missed it.> +## Documentation + + + ## Test plan - [x] All Rails tests pass (N runs, 0 failures) - [x] All Vitest tests pass (N tests) @@ -620,34 +700,11 @@ EOF **If neither CLI is available:** Print the branch name, remote URL, and instruct the user to create the PR/MR manually via the web UI. Do not stop — the code is pushed and ready. -**Output the PR/MR URL** — then proceed to Step 8.5. +**Output the PR/MR URL** — then proceed to Step 20. --- -## Step 8.5: Auto-invoke /document-release - -After the PR is created, automatically sync project documentation. Read the -`document-release/SKILL.md` skill file (adjacent to this skill's directory) and -execute its full workflow: - -1. Read the `/document-release` skill: `cat ${CLAUDE_SKILL_DIR}/../document-release/SKILL.md` -2. Follow its instructions — it reads all .md files in the project, cross-references - the diff, and updates anything that drifted (README, ARCHITECTURE, CONTRIBUTING, - CLAUDE.md, TODOS, etc.) -3. If any docs were updated, commit the changes and push to the same branch: - ```bash - git add -A && git commit -m "docs: sync documentation with shipped changes" && git push - ``` -4. If no docs needed updating, say "Documentation is current — no updates needed." - -This step is automatic. Do not ask the user for confirmation. The goal is zero-friction -doc updates — the user runs `/ship` and documentation stays current without a separate command. - -If Step 8.5 created a docs commit, re-edit the PR/MR body to include the latest commit SHA in the summary. This ensures the PR body reflects the truly final state after document-release. - ---- - -## Step 8.75: Persist ship metrics +## Step 20: Persist ship metrics Log coverage and plan completion data so `/retro` can track trends: @@ -662,10 +719,10 @@ echo '{"skill":"ship","timestamp":"'"$(date -u +%Y-%m-%dT%H:%M:%SZ)"'","coverage ``` Substitute from earlier steps: -- **COVERAGE_PCT**: coverage percentage from Step 3.4 diagram (integer, or -1 if undetermined) -- **PLAN_TOTAL**: total plan items extracted in Step 3.45 (0 if no plan file) -- **PLAN_DONE**: count of DONE + CHANGED items from Step 3.45 (0 if no plan file) -- **VERIFY_RESULT**: "pass", "fail", or "skipped" from Step 3.47 +- **COVERAGE_PCT**: coverage percentage from Step 7 diagram (integer, or -1 if undetermined) +- **PLAN_TOTAL**: total plan items extracted in Step 8 (0 if no plan file) +- **PLAN_DONE**: count of DONE + CHANGED items from Step 8 (0 if no plan file) +- **VERIFY_RESULT**: "pass", "fail", or "skipped" from Step 8.1 - **VERSION**: from the VERSION file - **BRANCH**: current branch name @@ -684,6 +741,6 @@ This step is automatic — never skip it, never ask for confirmation. - **Split commits for bisectability** — each commit = one logical change. - **TODOS.md completion detection must be conservative.** Only mark items as completed when the diff clearly shows the work is done. - **Use Greptile reply templates from greptile-triage.md.** Every reply includes evidence (inline diff, code references, re-rank suggestion). Never post vague replies. -- **Never push without fresh verification evidence.** If code changed after Step 3 tests, re-run before pushing. -- **Step 3.4 generates coverage tests.** They must pass before committing. Never commit failing tests. +- **Never push without fresh verification evidence.** If code changed after Step 5 tests, re-run before pushing. +- **Step 7 generates coverage tests.** They must pass before committing. Never commit failing tests. - **The goal is: user says `/ship`, next thing they see is the review + PR URL + auto-synced docs.** diff --git a/test/fixtures/golden/claude-ship-SKILL.md b/test/fixtures/golden/claude-ship-SKILL.md index 61a6b87e..0d97b858 100644 --- a/test/fixtures/golden/claude-ship-SKILL.md +++ b/test/fixtures/golden/claude-ship-SKILL.md @@ -624,17 +624,17 @@ You are running the `/ship` workflow. This is a **non-interactive, fully automat - Merge conflicts that can't be auto-resolved (stop, show conflicts) - In-branch test failures (pre-existing failures are triaged, not auto-blocking) - Pre-landing review finds ASK items that need user judgment -- MINOR or MAJOR version bump needed (ask — see Step 4) +- MINOR or MAJOR version bump needed (ask — see Step 12) - Greptile review comments that need user decision (complex fixes, false positives) -- AI-assessed coverage below minimum threshold (hard gate with user override — see Step 3.4) -- Plan items NOT DONE with no user override (see Step 3.45) -- Plan verification failures (see Step 3.47) -- TODOS.md missing and user wants to create one (ask — see Step 5.5) -- TODOS.md disorganized and user wants to reorganize (ask — see Step 5.5) +- AI-assessed coverage below minimum threshold (hard gate with user override — see Step 7) +- Plan items NOT DONE with no user override (see Step 8) +- Plan verification failures (see Step 8.1) +- TODOS.md missing and user wants to create one (ask — see Step 14) +- TODOS.md disorganized and user wants to reorganize (ask — see Step 14) **Never stop for:** - Uncommitted changes (always include them) -- Version bump choice (auto-pick MICRO or PATCH — see Step 4) +- Version bump choice (auto-pick MICRO or PATCH — see Step 12) - CHANGELOG content (auto-generate from diff) - Commit message approval (auto-commit) - Multi-file changesets (auto-split into bisectable commits) @@ -647,9 +647,9 @@ Re-running `/ship` means "run the whole checklist again." Every verification ste (tests, coverage audit, plan completion, pre-landing review, adversarial review, VERSION/CHANGELOG check, TODOS, document-release) runs on every invocation. Only *actions* are idempotent: -- Step 4: If VERSION already bumped, skip the bump but still read the version -- Step 7: If already pushed, skip the push command -- Step 8: If PR exists, update the body instead of creating a new PR +- Step 12: If VERSION already bumped, skip the bump but still read the version +- Step 17: If already pushed, skip the push command +- Step 19: If PR exists, update the body instead of creating a new PR Never skip a verification step because a prior `/ship` run already performed it. --- @@ -717,19 +717,19 @@ Display: If the Eng Review is NOT "CLEAR": -Print: "No prior eng review found — ship will run its own pre-landing review in Step 3.5." +Print: "No prior eng review found — ship will run its own pre-landing review in Step 9." Check diff size: `git diff ...HEAD --stat | tail -1`. If the diff is >200 lines, add: "Note: This is a large diff. Consider running `/plan-eng-review` or `/autoplan` for architecture-level review before shipping." If CEO Review is missing, mention as informational ("CEO Review not run — recommended for product changes") but do NOT block. -For Design Review: run `source <(~/.claude/skills/gstack/bin/gstack-diff-scope 2>/dev/null)`. If `SCOPE_FRONTEND=true` and no design review (plan-design-review or design-review-lite) exists in the dashboard, mention: "Design Review not run — this PR changes frontend code. The lite design check will run automatically in Step 3.5, but consider running /design-review for a full visual audit post-implementation." Still never block. +For Design Review: run `source <(~/.claude/skills/gstack/bin/gstack-diff-scope 2>/dev/null)`. If `SCOPE_FRONTEND=true` and no design review (plan-design-review or design-review-lite) exists in the dashboard, mention: "Design Review not run — this PR changes frontend code. The lite design check will run automatically in Step 9, but consider running /design-review for a full visual audit post-implementation." Still never block. -Continue to Step 1.5 — do NOT block or ask. Ship runs its own review in Step 3.5. +Continue to Step 2 — do NOT block or ask. Ship runs its own review in Step 9. --- -## Step 1.5: Distribution Pipeline Check +## Step 2: Distribution Pipeline Check If the diff introduces a new standalone artifact (CLI binary, library package, tool) — not a web service with existing deployment — verify that a distribution pipeline exists. @@ -757,7 +757,7 @@ service with existing deployment — verify that a distribution pipeline exists. --- -## Step 2: Merge the base branch (BEFORE tests) +## Step 3: Merge the base branch (BEFORE tests) Fetch and merge the base branch into the feature branch so tests run against the merged state: @@ -771,7 +771,7 @@ git fetch origin && git merge origin/ --no-edit --- -## Step 2.5: Test Framework Bootstrap +## Step 4: Test Framework Bootstrap ## Test Framework Bootstrap @@ -800,7 +800,7 @@ ls -d test/ tests/ spec/ __tests__/ cypress/ e2e/ 2>/dev/null **If test framework detected** (config files or test directories found): Print "Test framework detected: {name} ({N} existing tests). Skipping bootstrap." Read 2-3 existing test files to learn conventions (naming, imports, assertion style, setup patterns). -Store conventions as prose context for use in Phase 8e.5 or Step 3.4. **Skip the rest of bootstrap.** +Store conventions as prose context for use in Phase 8e.5 or Step 7. **Skip the rest of bootstrap.** **If BOOTSTRAP_DECLINED** appears: Print "Test bootstrap previously declined — skipping." **Skip the rest of bootstrap.** @@ -929,7 +929,7 @@ Only commit if there are changes. Stage all bootstrap files (config, test direct --- -## Step 3: Run tests (on merged code) +## Step 5: Run tests (on merged code) **Do NOT run `RAILS_ENV=test bin/rails db:migrate`** — `bin/test-lane` already calls `db:test:prepare` internally, which loads the schema into the correct lane database. @@ -1051,13 +1051,13 @@ Use AskUserQuestion: - Continue with the workflow. - Note in output: "Pre-existing test failure skipped: " -**After triage:** If any in-branch failures remain unfixed, **STOP**. Do not proceed. If all failures were pre-existing and handled (fixed, TODOed, assigned, or skipped), continue to Step 3.25. +**After triage:** If any in-branch failures remain unfixed, **STOP**. Do not proceed. If all failures were pre-existing and handled (fixed, TODOed, assigned, or skipped), continue to Step 6. **If all pass:** Continue silently — just note the counts briefly. --- -## Step 3.25: Eval Suites (conditional) +## Step 6: Eval Suites (conditional) Evals are mandatory when prompt-related files change. Skip this step entirely if no prompt files are in the diff. @@ -1076,7 +1076,7 @@ Match against these patterns (from CLAUDE.md): - `config/system_prompts/*.txt` - `test/evals/**/*` (eval infrastructure changes affect all suites) -**If no matches:** Print "No prompt-related files changed — skipping evals." and continue to Step 3.5. +**If no matches:** Print "No prompt-related files changed — skipping evals." and continue to Step 9. **2. Identify affected eval suites:** @@ -1106,9 +1106,9 @@ If multiple suites need to run, run them sequentially (each needs a test lane). **4. Check results:** - **If any eval fails:** Show the failures, the cost dashboard, and **STOP**. Do not proceed. -- **If all pass:** Note pass counts and cost. Continue to Step 3.5. +- **If all pass:** Note pass counts and cost. Continue to Step 9. -**5. Save eval output** — include eval results and cost dashboard in the PR body (Step 8). +**5. Save eval output** — include eval results and cost dashboard in the PR body (Step 19). **Tier reference (for context — /ship always uses `full`):** | Tier | When | Speed (cached) | Cost | @@ -1119,9 +1119,15 @@ If multiple suites need to run, run them sequentially (each needs a test lane). --- -## Step 3.4: Test Coverage Audit +## Step 7: Test Coverage Audit -100% coverage is the goal — every untested path is a path where bugs hide and vibe coding becomes yolo coding. Evaluate what was ACTUALLY coded (from the diff), not what was planned. +**Dispatch this step as a subagent** using the Agent tool with `subagent_type: "general-purpose"`. The subagent runs the coverage audit in a fresh context window — the parent only sees the conclusion, not intermediate file reads. This is context-rot defense. + +**Subagent prompt:** Pass the following instructions to the subagent, with `` substituted with the base branch: + +> You are running a ship-workflow test coverage audit. Run `git diff ...HEAD` as needed. Do not commit or push — report only. +> +> 100% coverage is the goal — every untested path is a path where bugs hide and vibe coding becomes yolo coding. Evaluate what was ACTUALLY coded (from the diff), not what was planned. ### Test Framework Detection @@ -1143,7 +1149,7 @@ ls jest.config.* vitest.config.* playwright.config.* cypress.config.* .rspec pyt ls -d test/ tests/ spec/ __tests__/ cypress/ e2e/ 2>/dev/null ``` -3. **If no framework detected:** falls through to the Test Framework Bootstrap step (Step 2.5) which handles full setup. +3. **If no framework detected:** falls through to the Test Framework Bootstrap step (Step 4) which handles full setup. **0. Before/after test count:** @@ -1285,11 +1291,11 @@ GAPS: 8 paths need tests (2 need E2E, 1 needs eval) ───────────────────────────────── ``` -**Fast path:** All paths covered → "Step 3.4: All new code paths have test coverage ✓" Continue. +**Fast path:** All paths covered → "Step 7: All new code paths have test coverage ✓" Continue. **5. Generate tests for uncovered paths:** -If test framework detected (or bootstrapped in Step 2.5): +If test framework detected (or bootstrapped in Step 4): - Prioritize error handlers and edge cases first (happy paths are more likely already tested) - Read 2-3 existing test files to match conventions exactly - Generate unit tests. Mock all external dependencies (DB, API, Redis). @@ -1303,7 +1309,7 @@ Caps: 30 code paths max, 20 tests generated max (code + user flow combined), 2-m If no test framework AND user declined bootstrap → diagram only, no generation. Note: "Test generation skipped — no test framework configured." -**Diff is test-only changes:** Skip Step 3.4 entirely: "No new application code paths to audit." +**Diff is test-only changes:** Skip Step 7 entirely: "No new application code paths to audit." **6. After-count and coverage summary:** @@ -1378,12 +1384,30 @@ Repo: {owner/repo} ## Critical Paths - {end-to-end flow that must work} ``` +> +> After your analysis, output a single JSON object on the LAST LINE of your response (no other text after it): +> `{"coverage_pct":N,"gaps":N,"diagram":"","tests_added":["path",...]}` + +**Parent processing:** + +1. Read the subagent's final output. Parse the LAST line as JSON. +2. Store `coverage_pct` (for Step 20 metrics), `gaps` (user summary), `tests_added` (for the commit). +3. Embed `diagram` verbatim in the PR body's `## Test Coverage` section (Step 19). +4. Print a one-line summary: `Coverage: {coverage_pct}%, {gaps} gaps. {tests_added.length} tests added.` + +**If the subagent fails, times out, or returns invalid JSON:** Fall back to running the audit inline in the parent. Do not block /ship on subagent failure — partial results are better than none. --- -## Step 3.45: Plan Completion Audit +## Step 8: Plan Completion Audit -### Plan File Discovery +**Dispatch this step as a subagent** using the Agent tool with `subagent_type: "general-purpose"`. The subagent reads the plan file and every referenced code file in its own fresh context. Parent gets only the conclusion. + +**Subagent prompt:** Pass these instructions to the subagent: + +> You are running a ship-workflow plan completion audit. The base branch is ``. Use `git diff ...HEAD` to see what shipped. Do not commit or push — report only. +> +> ### Plan File Discovery 1. **Conversation context (primary):** Check if there is an active plan file in this conversation. The host agent's system messages include plan file paths when in plan mode. If found, use it directly — this is the most reliable signal. @@ -1499,19 +1523,31 @@ After producing the completion checklist: **No plan file found:** Skip entirely. "No plan file detected — skipping plan completion audit." **Include in PR body (Step 8):** Add a `## Plan Completion` section with the checklist summary. +> +> After your analysis, output a single JSON object on the LAST LINE of your response (no other text after it): +> `{"total_items":N,"done":N,"changed":N,"deferred":N,"summary":""}` + +**Parent processing:** + +1. Parse the LAST line of the subagent's output as JSON. +2. Store `done`, `deferred` for Step 20 metrics; use `summary` in PR body. +3. If `deferred > 0` and no user override, present the deferred items via AskUserQuestion before continuing. +4. Embed `summary` in PR body's `## Plan Completion` section (Step 19). + +**If the subagent fails or returns invalid JSON:** Fall back to running the audit inline. Never block /ship on subagent failure. --- -## Step 3.47: Plan Verification +## Step 8.1: Plan Verification Automatically verify the plan's testing/verification steps using the `/qa-only` skill. ### 1. Check for verification section -Using the plan file already discovered in Step 3.45, look for a verification section. Match any of these headings: `## Verification`, `## Test plan`, `## Testing`, `## How to test`, `## Manual testing`, or any section with verification-flavored items (URLs to visit, things to check visually, interactions to test). +Using the plan file already discovered in Step 8, look for a verification section. Match any of these headings: `## Verification`, `## Test plan`, `## Testing`, `## How to test`, `## Manual testing`, or any section with verification-flavored items (URLs to visit, things to check visually, interactions to test). **If no verification section found:** Skip with "No verification steps found in plan — skipping auto-verification." -**If no plan file was found in Step 3.45:** Skip (already handled). +**If no plan file was found in Step 8:** Skip (already handled). ### 2. Check for running dev server @@ -1556,7 +1592,7 @@ Follow the /qa-only workflow with these modifications: ### 5. Include in PR body -Add a `## Verification Results` section to the PR body (Step 8): +Add a `## Verification Results` section to the PR body (Step 19): - If verification ran: summary of results (N PASS, M FAIL, K SKIPPED) - If skipped: reason for skipping (no plan, no server, no verification section) @@ -1598,7 +1634,7 @@ matches a past learning, display: This makes the compounding visible. The user should see that gstack is getting smarter on their codebase over time. -## Step 3.48: Scope Drift Detection +## Step 8.2: Scope Drift Detection Before reviewing code quality, check: **did they build what was requested — nothing more, nothing less?** @@ -1635,7 +1671,7 @@ Before reviewing code quality, check: **did they build what was requested — no --- -## Step 3.5: Pre-Landing Review +## Step 9: Pre-Landing Review Review the diff for structural issues that tests don't catch. @@ -1730,7 +1766,7 @@ Present Codex output under a `CODEX (design):` header, merged with the checklist Include any design findings alongside the code review findings. They follow the same Fix-First flow below. -## Step 3.55: Review Army — Specialist Dispatch +## Step 9.1: Review Army — Specialist Dispatch ### Detect stack and scope @@ -1847,7 +1883,7 @@ CHECKLIST: --- -### Step 3.56: Collect and merge findings +### Step 9.2: Collect and merge findings After all specialist subagents complete, collect their outputs. @@ -1893,7 +1929,7 @@ SPECIALIST REVIEW: N findings (X critical, Y informational) from Z specialists PR Quality Score: X/10 ``` -These findings flow into the Fix-First flow (item 4) alongside the checklist pass (Step 3.5). +These findings flow into the Fix-First flow (item 4) alongside the checklist pass (Step 9). The Fix-First heuristic applies identically — specialist findings follow the same AUTO-FIX vs ASK classification. **Compile per-specialist stats:** @@ -1917,7 +1953,7 @@ If activated, dispatch one more subagent via the Agent tool (foreground, not bac The Red Team subagent receives: 1. The red-team checklist from `~/.claude/skills/gstack/review/specialists/red-team.md` -2. The merged specialist findings from Step 3.56 (so it knows what was already caught) +2. The merged specialist findings from Step 9.2 (so it knows what was already caught) 3. The git diff command Prompt: "You are a red team reviewer. The code has already been reviewed by N specialists @@ -1933,7 +1969,7 @@ the Fix-First flow (item 4). Red Team findings are tagged with `"specialist":"re If the Red Team returns NO FINDINGS, note: "Red Team review: no additional issues found." If the Red Team subagent fails or times out, skip silently and continue. -### Step 3.57: Cross-review finding dedup +### Step 9.3: Cross-review finding dedup Before classifying findings, check if any were previously skipped by the user in a prior review on this branch. @@ -1953,7 +1989,7 @@ If skipped fingerprints exist, get the list of files changed since that review: git diff --name-only HEAD ``` -For each current finding (from both the checklist pass (Step 3.5) and specialist review (Step 3.55-3.56)), check: +For each current finding (from both the checklist pass (Step 9) and specialist review (Step 9.1-9.2)), check: - Does its fingerprint match a previously skipped finding? - Is the finding's file path NOT in the changed-files set? @@ -1967,7 +2003,7 @@ If no prior reviews exist or none have a `findings` array, skip this step silent Output a summary header: `Pre-Landing Review: N issues (X critical, Y informational)` -4. **Classify each finding from both the checklist pass and specialist review (Step 3.55-3.56) as AUTO-FIX or ASK** per the Fix-First Heuristic in +4. **Classify each finding from both the checklist pass and specialist review (Step 9.1-Step 9.2) as AUTO-FIX or ASK** per the Fix-First Heuristic in checklist.md. Critical findings lean toward ASK; informational lean toward AUTO-FIX. 5. **Auto-fix all AUTO-FIX items.** Apply each fix. Output one line per fix: @@ -1981,7 +2017,7 @@ Output a summary header: `Pre-Landing Review: N issues (X critical, Y informatio 7. **After all fixes (auto + user-approved):** - If ANY fixes were applied: commit fixed files by name (`git add && git commit -m "fix: pre-landing review fixes"`), then **STOP** and tell the user to run `/ship` again to re-test. - - If no fixes applied (all ASK items skipped, or no issues found): continue to Step 4. + - If no fixes applied (all ASK items skipped, or no issues found): continue to Step 12. 8. Output summary: `Pre-Landing Review: N issues — M auto-fixed, K asked (J fixed, L skipped)` @@ -1993,27 +2029,38 @@ Output a summary header: `Pre-Landing Review: N issues (X critical, Y informatio ``` Substitute TIMESTAMP (ISO 8601), STATUS ("clean" if no issues, "issues_found" otherwise), and N values from the summary counts above. The `via:"ship"` distinguishes from standalone `/review` runs. -- `quality_score` = the PR Quality Score computed in Step 3.56 (e.g., 7.5). If specialists were skipped (small diff), use `10.0` -- `specialists` = the per-specialist stats object compiled in Step 3.56. Each specialist that was considered gets an entry: `{"dispatched":true/false,"findings":N,"critical":N,"informational":N}` if dispatched, or `{"dispatched":false,"reason":"scope|gated"}` if skipped. Example: `{"testing":{"dispatched":true,"findings":2,"critical":0,"informational":2},"security":{"dispatched":false,"reason":"scope"}}` +- `quality_score` = the PR Quality Score computed in Step 9.2 (e.g., 7.5). If specialists were skipped (small diff), use `10.0` +- `specialists` = the per-specialist stats object compiled in Step 9.2. Each specialist that was considered gets an entry: `{"dispatched":true/false,"findings":N,"critical":N,"informational":N}` if dispatched, or `{"dispatched":false,"reason":"scope|gated"}` if skipped. Example: `{"testing":{"dispatched":true,"findings":2,"critical":0,"informational":2},"security":{"dispatched":false,"reason":"scope"}}` - `findings` = array of per-finding records. For each finding (from checklist pass and specialists), include: `{"fingerprint":"path:line:category","severity":"CRITICAL|INFORMATIONAL","action":"ACTION"}`. ACTION is `"auto-fixed"`, `"fixed"` (user approved), or `"skipped"` (user chose Skip). -Save the review output — it goes into the PR body in Step 8. +Save the review output — it goes into the PR body in Step 19. --- -## Step 3.75: Address Greptile review comments (if PR exists) +## Step 10: Address Greptile review comments (if PR exists) -Read `.claude/skills/review/greptile-triage.md` and follow the fetch, filter, classify, and **escalation detection** steps. +**Dispatch the fetch + classification as a subagent** using the Agent tool with `subagent_type: "general-purpose"`. The subagent pulls every Greptile comment, runs the escalation detection algorithm, and classifies each comment. Parent receives a structured list and handles user interaction + file edits. -**If no PR exists, `gh` fails, API returns an error, or there are zero Greptile comments:** Skip this step silently. Continue to Step 4. +**Subagent prompt:** -**If Greptile comments are found:** +> You are classifying Greptile review comments for a /ship workflow. Read `.claude/skills/review/greptile-triage.md` and follow the fetch, filter, classify, and **escalation detection** steps. Do NOT fix code, do NOT reply to comments, do NOT commit — report only. +> +> For each comment, assign: `classification` (`valid_actionable`, `already_fixed`, `false_positive`, `suppressed`), `escalation_tier` (1 or 2), the file:line or [top-level] tag, body summary, and permalink URL. +> +> If no PR exists, `gh` fails, the API errors, or there are zero comments, output: `{"total":0,"comments":[]}` and stop. +> +> Otherwise, output a single JSON object on the LAST LINE of your response: +> `{"total":N,"comments":[{"classification":"...","escalation_tier":N,"ref":"file:line","summary":"...","permalink":"url"},...]}` -Include a Greptile summary in your output: `+ N Greptile comments (X valid, Y fixed, Z FP)` +**Parent processing:** -Before replying to any comment, run the **Escalation Detection** algorithm from greptile-triage.md to determine whether to use Tier 1 (friendly) or Tier 2 (firm) reply templates. +Parse the LAST line as JSON. -For each classified comment: +If `total` is 0, skip this step silently. Continue to Step 12. + +Otherwise, print: `+ {total} Greptile comments ({valid_actionable} valid, {already_fixed} already fixed, {false_positive} FP)`. + +For each comment in `comments`: **VALID & ACTIONABLE:** Use AskUserQuestion with: - The comment (file:line or [top-level] + body summary + permalink URL) @@ -2036,11 +2083,11 @@ For each classified comment: **SUPPRESSED:** Skip silently — these are known false positives from previous triage. -**After all comments are resolved:** If any fixes were applied, the tests from Step 3 are now stale. **Re-run tests** (Step 3) before continuing to Step 4. If no fixes were applied, continue to Step 4. +**After all comments are resolved:** If any fixes were applied, the tests from Step 5 are now stale. **Re-run tests** (Step 5) before continuing to Step 12. If no fixes were applied, continue to Step 12. --- -## Step 3.8: Adversarial review (always-on) +## Step 11: Adversarial review (always-on) Every diff gets adversarial review from both Claude and Codex. LOC is not a proxy for risk — a 5-line auth change can be critical. @@ -2126,7 +2173,7 @@ A) Investigate and fix now (recommended) B) Continue — review will still complete ``` -If A: address the findings. After fixing, re-run tests (Step 3) since code has changed. Re-run `codex review` to verify. +If A: address the findings. After fixing, re-run tests (Step 5) since code has changed. Re-run `codex review` to verify. Read stderr for errors (same error handling as Codex adversarial above). @@ -2192,7 +2239,7 @@ already knows. A good test: would this insight save time in a future session? If -## Step 4: Version bump (auto-decide) +## Step 12: Version bump (auto-decide) **Idempotency check:** Before bumping, compare VERSION against the base branch. @@ -2223,7 +2270,7 @@ If output shows `ALREADY_BUMPED`, VERSION was already bumped on this branch (pri --- -## CHANGELOG (auto-generate) +## Step 13: CHANGELOG (auto-generate) 1. Read `CHANGELOG.md` header to know the format. @@ -2267,7 +2314,7 @@ If output shows `ALREADY_BUMPED`, VERSION was already bumped on this branch (pri --- -## Step 5.5: TODOS.md (auto-update) +## Step 14: TODOS.md (auto-update) Cross-reference the project's TODOS.md against the changes being shipped. Mark completed items automatically; prompt only if the file is missing or disorganized. @@ -2279,7 +2326,7 @@ Read `.claude/skills/review/TODOS-format.md` for the canonical format reference. - Message: "GStack recommends maintaining a TODOS.md organized by skill/component, then priority (P0 at top through P4, then Completed at bottom). See TODOS-format.md for the full format. Would you like to create one?" - Options: A) Create it now, B) Skip for now - If A: Create `TODOS.md` with a skeleton (# TODOS heading + ## Completed section). Continue to step 3. -- If B: Skip the rest of Step 5.5. Continue to Step 6. +- If B: Skip the rest of Step 14. Continue to Step 15. **2. Check structure and organization:** @@ -2318,11 +2365,11 @@ For each TODO item, check if the changes in this PR complete it by: **6. Defensive:** If TODOS.md cannot be written (permission error, disk full), warn the user and continue. Never stop the ship workflow for a TODOS failure. -Save this summary — it goes into the PR body in Step 8. +Save this summary — it goes into the PR body in Step 19. --- -## Step 6: Commit (bisectable chunks) +## Step 15: Commit (bisectable chunks) **Goal:** Create small, logical commits that work well with `git bisect` and help LLMs understand what changed. @@ -2360,13 +2407,13 @@ EOF --- -## Step 6.5: Verification Gate +## Step 16: Verification Gate **IRON LAW: NO COMPLETION CLAIMS WITHOUT FRESH VERIFICATION EVIDENCE.** Before pushing, re-verify if code changed during Steps 4-6: -1. **Test verification:** If ANY code changed after Step 3's test run (fixes from review findings, CHANGELOG edits don't count), re-run the test suite. Paste fresh output. Stale output from Step 3 is NOT acceptable. +1. **Test verification:** If ANY code changed after Step 5's test run (fixes from review findings, CHANGELOG edits don't count), re-run the test suite. Paste fresh output. Stale output from Step 5 is NOT acceptable. 2. **Build verification:** If the project has a build step, run it. Paste output. @@ -2376,13 +2423,13 @@ Before pushing, re-verify if code changed during Steps 4-6: - "I already tested earlier" → Code changed since then. Test again. - "It's a trivial change" → Trivial changes break production. -**If tests fail here:** STOP. Do not push. Fix the issue and return to Step 3. +**If tests fail here:** STOP. Do not push. Fix the issue and return to Step 5. Claiming work is complete without verification is dishonesty, not efficiency. --- -## Step 7: Push +## Step 17: Push **Idempotency check:** Check if the branch is already pushed and up to date. @@ -2394,15 +2441,44 @@ echo "LOCAL: $LOCAL REMOTE: $REMOTE" [ "$LOCAL" = "$REMOTE" ] && echo "ALREADY_PUSHED" || echo "PUSH_NEEDED" ``` -If `ALREADY_PUSHED`, skip the push but continue to Step 8. Otherwise push with upstream tracking: +If `ALREADY_PUSHED`, skip the push but continue to Step 18. Otherwise push with upstream tracking: ```bash git push -u origin ``` +**You are NOT done.** The code is pushed but documentation sync and PR creation are mandatory final steps. Continue to Step 18. + --- -## Step 8: Create PR/MR +## Step 18: Documentation sync (via subagent, before PR creation) + +**Dispatch /document-release as a subagent** using the Agent tool with `subagent_type: "general-purpose"`. The subagent gets a fresh context window — zero rot from the preceding 17 steps. It also runs the **full** `/document-release` workflow (with CHANGELOG clobber protection, doc exclusions, risky-change gates, named staging, race-safe PR body editing) rather than a weaker reimplementation. + +**Sequencing:** This step runs AFTER Step 17 (Push) and BEFORE Step 19 (Create PR). The PR is created once from final HEAD with the `## Documentation` section baked into the initial body. No create-then-re-edit dance. + +**Subagent prompt:** + +> You are executing the /document-release workflow after a code push. Read the full skill file `${HOME}/.claude/skills/gstack/document-release/SKILL.md` and execute its complete workflow end-to-end, including CHANGELOG clobber protection, doc exclusions, risky-change gates, and named staging. Do NOT attempt to edit the PR body — no PR exists yet. Branch: ``, base: ``. +> +> After completing the workflow, output a single JSON object on the LAST LINE of your response (no other text after it): +> `{"files_updated":["README.md","CLAUDE.md",...],"commit_sha":"abc1234","pushed":true,"documentation_section":""}` +> +> If no documentation files needed updating, output: +> `{"files_updated":[],"commit_sha":null,"pushed":false,"documentation_section":null}` + +**Parent processing:** + +1. Parse the LAST line of the subagent's output as JSON. +2. Store `documentation_section` — Step 19 embeds it in the PR body (or omits the section if null). +3. If `files_updated` is non-empty, print: `Documentation synced: {files_updated.length} files updated, committed as {commit_sha}`. +4. If `files_updated` is empty, print: `Documentation is current — no updates needed.` + +**If the subagent fails or returns invalid JSON:** Print a warning and proceed to Step 19 without a `## Documentation` section. Do not block /ship on subagent failure. The user can run `/document-release` manually after the PR lands. + +--- + +## Step 19: Create PR/MR **Idempotency check:** Check if a PR/MR already exists for this branch. @@ -2416,7 +2492,7 @@ gh pr view --json url,number,state -q 'if .state == "OPEN" then "PR #\(.number): glab mr view -F json 2>/dev/null | jq -r 'if .state == "opened" then "MR_EXISTS" else "NO_MR" end' 2>/dev/null || echo "NO_MR" ``` -If an **open** PR/MR already exists: **update** the PR body using `gh pr edit --body "..."` (GitHub) or `glab mr update -d "..."` (GitLab). Always regenerate the PR body from scratch using this run's fresh results (test output, coverage audit, review findings, adversarial review, TODOS summary). Never reuse stale PR body content from a prior run. Print the existing URL and continue to Step 8.5. +If an **open** PR/MR already exists: **update** the PR body using `gh pr edit --body "..."` (GitHub) or `glab mr update -d "..."` (GitLab). Always regenerate the PR body from scratch using this run's fresh results (test output, coverage audit, review findings, adversarial review, TODOS summary, documentation_section from Step 18). Never reuse stale PR body content from a prior run. Print the existing URL and continue to Step 20. If no PR/MR exists: create a pull request (GitHub) or merge request (GitLab) using the platform detected in Step 0. @@ -2432,11 +2508,11 @@ must appear in at least one section. If a commit's work isn't reflected in the s you missed it.> ## Test Coverage - - + + ## Pre-Landing Review - + ## Design Review @@ -2448,19 +2524,19 @@ you missed it.> ## Greptile Review - + ## Scope Drift ## Plan Completion - + ## Verification Results - + @@ -2470,6 +2546,10 @@ you missed it.> +## Documentation + + + ## Test plan - [x] All Rails tests pass (N runs, 0 failures) - [x] All Vitest tests pass (N tests) @@ -2498,34 +2578,11 @@ EOF **If neither CLI is available:** Print the branch name, remote URL, and instruct the user to create the PR/MR manually via the web UI. Do not stop — the code is pushed and ready. -**Output the PR/MR URL** — then proceed to Step 8.5. +**Output the PR/MR URL** — then proceed to Step 20. --- -## Step 8.5: Auto-invoke /document-release - -After the PR is created, automatically sync project documentation. Read the -`document-release/SKILL.md` skill file (adjacent to this skill's directory) and -execute its full workflow: - -1. Read the `/document-release` skill: `cat ${CLAUDE_SKILL_DIR}/../document-release/SKILL.md` -2. Follow its instructions — it reads all .md files in the project, cross-references - the diff, and updates anything that drifted (README, ARCHITECTURE, CONTRIBUTING, - CLAUDE.md, TODOS, etc.) -3. If any docs were updated, commit the changes and push to the same branch: - ```bash - git add -A && git commit -m "docs: sync documentation with shipped changes" && git push - ``` -4. If no docs needed updating, say "Documentation is current — no updates needed." - -This step is automatic. Do not ask the user for confirmation. The goal is zero-friction -doc updates — the user runs `/ship` and documentation stays current without a separate command. - -If Step 8.5 created a docs commit, re-edit the PR/MR body to include the latest commit SHA in the summary. This ensures the PR body reflects the truly final state after document-release. - ---- - -## Step 8.75: Persist ship metrics +## Step 20: Persist ship metrics Log coverage and plan completion data so `/retro` can track trends: @@ -2540,10 +2597,10 @@ echo '{"skill":"ship","timestamp":"'"$(date -u +%Y-%m-%dT%H:%M:%SZ)"'","coverage ``` Substitute from earlier steps: -- **COVERAGE_PCT**: coverage percentage from Step 3.4 diagram (integer, or -1 if undetermined) -- **PLAN_TOTAL**: total plan items extracted in Step 3.45 (0 if no plan file) -- **PLAN_DONE**: count of DONE + CHANGED items from Step 3.45 (0 if no plan file) -- **VERIFY_RESULT**: "pass", "fail", or "skipped" from Step 3.47 +- **COVERAGE_PCT**: coverage percentage from Step 7 diagram (integer, or -1 if undetermined) +- **PLAN_TOTAL**: total plan items extracted in Step 8 (0 if no plan file) +- **PLAN_DONE**: count of DONE + CHANGED items from Step 8 (0 if no plan file) +- **VERIFY_RESULT**: "pass", "fail", or "skipped" from Step 8.1 - **VERSION**: from the VERSION file - **BRANCH**: current branch name @@ -2562,6 +2619,6 @@ This step is automatic — never skip it, never ask for confirmation. - **Split commits for bisectability** — each commit = one logical change. - **TODOS.md completion detection must be conservative.** Only mark items as completed when the diff clearly shows the work is done. - **Use Greptile reply templates from greptile-triage.md.** Every reply includes evidence (inline diff, code references, re-rank suggestion). Never post vague replies. -- **Never push without fresh verification evidence.** If code changed after Step 3 tests, re-run before pushing. -- **Step 3.4 generates coverage tests.** They must pass before committing. Never commit failing tests. +- **Never push without fresh verification evidence.** If code changed after Step 5 tests, re-run before pushing. +- **Step 7 generates coverage tests.** They must pass before committing. Never commit failing tests. - **The goal is: user says `/ship`, next thing they see is the review + PR URL + auto-synced docs.** diff --git a/test/fixtures/golden/codex-ship-SKILL.md b/test/fixtures/golden/codex-ship-SKILL.md index 11bf4253..e0281770 100644 --- a/test/fixtures/golden/codex-ship-SKILL.md +++ b/test/fixtures/golden/codex-ship-SKILL.md @@ -613,17 +613,17 @@ You are running the `/ship` workflow. This is a **non-interactive, fully automat - Merge conflicts that can't be auto-resolved (stop, show conflicts) - In-branch test failures (pre-existing failures are triaged, not auto-blocking) - Pre-landing review finds ASK items that need user judgment -- MINOR or MAJOR version bump needed (ask — see Step 4) +- MINOR or MAJOR version bump needed (ask — see Step 12) - Greptile review comments that need user decision (complex fixes, false positives) -- AI-assessed coverage below minimum threshold (hard gate with user override — see Step 3.4) -- Plan items NOT DONE with no user override (see Step 3.45) -- Plan verification failures (see Step 3.47) -- TODOS.md missing and user wants to create one (ask — see Step 5.5) -- TODOS.md disorganized and user wants to reorganize (ask — see Step 5.5) +- AI-assessed coverage below minimum threshold (hard gate with user override — see Step 7) +- Plan items NOT DONE with no user override (see Step 8) +- Plan verification failures (see Step 8.1) +- TODOS.md missing and user wants to create one (ask — see Step 14) +- TODOS.md disorganized and user wants to reorganize (ask — see Step 14) **Never stop for:** - Uncommitted changes (always include them) -- Version bump choice (auto-pick MICRO or PATCH — see Step 4) +- Version bump choice (auto-pick MICRO or PATCH — see Step 12) - CHANGELOG content (auto-generate from diff) - Commit message approval (auto-commit) - Multi-file changesets (auto-split into bisectable commits) @@ -636,9 +636,9 @@ Re-running `/ship` means "run the whole checklist again." Every verification ste (tests, coverage audit, plan completion, pre-landing review, adversarial review, VERSION/CHANGELOG check, TODOS, document-release) runs on every invocation. Only *actions* are idempotent: -- Step 4: If VERSION already bumped, skip the bump but still read the version -- Step 7: If already pushed, skip the push command -- Step 8: If PR exists, update the body instead of creating a new PR +- Step 12: If VERSION already bumped, skip the bump but still read the version +- Step 17: If already pushed, skip the push command +- Step 19: If PR exists, update the body instead of creating a new PR Never skip a verification step because a prior `/ship` run already performed it. --- @@ -706,19 +706,19 @@ Display: If the Eng Review is NOT "CLEAR": -Print: "No prior eng review found — ship will run its own pre-landing review in Step 3.5." +Print: "No prior eng review found — ship will run its own pre-landing review in Step 9." Check diff size: `git diff ...HEAD --stat | tail -1`. If the diff is >200 lines, add: "Note: This is a large diff. Consider running `/plan-eng-review` or `/autoplan` for architecture-level review before shipping." If CEO Review is missing, mention as informational ("CEO Review not run — recommended for product changes") but do NOT block. -For Design Review: run `source <($GSTACK_ROOT/bin/gstack-diff-scope 2>/dev/null)`. If `SCOPE_FRONTEND=true` and no design review (plan-design-review or design-review-lite) exists in the dashboard, mention: "Design Review not run — this PR changes frontend code. The lite design check will run automatically in Step 3.5, but consider running /design-review for a full visual audit post-implementation." Still never block. +For Design Review: run `source <($GSTACK_ROOT/bin/gstack-diff-scope 2>/dev/null)`. If `SCOPE_FRONTEND=true` and no design review (plan-design-review or design-review-lite) exists in the dashboard, mention: "Design Review not run — this PR changes frontend code. The lite design check will run automatically in Step 9, but consider running /design-review for a full visual audit post-implementation." Still never block. -Continue to Step 1.5 — do NOT block or ask. Ship runs its own review in Step 3.5. +Continue to Step 2 — do NOT block or ask. Ship runs its own review in Step 9. --- -## Step 1.5: Distribution Pipeline Check +## Step 2: Distribution Pipeline Check If the diff introduces a new standalone artifact (CLI binary, library package, tool) — not a web service with existing deployment — verify that a distribution pipeline exists. @@ -746,7 +746,7 @@ service with existing deployment — verify that a distribution pipeline exists. --- -## Step 2: Merge the base branch (BEFORE tests) +## Step 3: Merge the base branch (BEFORE tests) Fetch and merge the base branch into the feature branch so tests run against the merged state: @@ -760,7 +760,7 @@ git fetch origin && git merge origin/ --no-edit --- -## Step 2.5: Test Framework Bootstrap +## Step 4: Test Framework Bootstrap ## Test Framework Bootstrap @@ -789,7 +789,7 @@ ls -d test/ tests/ spec/ __tests__/ cypress/ e2e/ 2>/dev/null **If test framework detected** (config files or test directories found): Print "Test framework detected: {name} ({N} existing tests). Skipping bootstrap." Read 2-3 existing test files to learn conventions (naming, imports, assertion style, setup patterns). -Store conventions as prose context for use in Phase 8e.5 or Step 3.4. **Skip the rest of bootstrap.** +Store conventions as prose context for use in Phase 8e.5 or Step 7. **Skip the rest of bootstrap.** **If BOOTSTRAP_DECLINED** appears: Print "Test bootstrap previously declined — skipping." **Skip the rest of bootstrap.** @@ -918,7 +918,7 @@ Only commit if there are changes. Stage all bootstrap files (config, test direct --- -## Step 3: Run tests (on merged code) +## Step 5: Run tests (on merged code) **Do NOT run `RAILS_ENV=test bin/rails db:migrate`** — `bin/test-lane` already calls `db:test:prepare` internally, which loads the schema into the correct lane database. @@ -1040,13 +1040,13 @@ Use AskUserQuestion: - Continue with the workflow. - Note in output: "Pre-existing test failure skipped: " -**After triage:** If any in-branch failures remain unfixed, **STOP**. Do not proceed. If all failures were pre-existing and handled (fixed, TODOed, assigned, or skipped), continue to Step 3.25. +**After triage:** If any in-branch failures remain unfixed, **STOP**. Do not proceed. If all failures were pre-existing and handled (fixed, TODOed, assigned, or skipped), continue to Step 6. **If all pass:** Continue silently — just note the counts briefly. --- -## Step 3.25: Eval Suites (conditional) +## Step 6: Eval Suites (conditional) Evals are mandatory when prompt-related files change. Skip this step entirely if no prompt files are in the diff. @@ -1065,7 +1065,7 @@ Match against these patterns (from CLAUDE.md): - `config/system_prompts/*.txt` - `test/evals/**/*` (eval infrastructure changes affect all suites) -**If no matches:** Print "No prompt-related files changed — skipping evals." and continue to Step 3.5. +**If no matches:** Print "No prompt-related files changed — skipping evals." and continue to Step 9. **2. Identify affected eval suites:** @@ -1095,9 +1095,9 @@ If multiple suites need to run, run them sequentially (each needs a test lane). **4. Check results:** - **If any eval fails:** Show the failures, the cost dashboard, and **STOP**. Do not proceed. -- **If all pass:** Note pass counts and cost. Continue to Step 3.5. +- **If all pass:** Note pass counts and cost. Continue to Step 9. -**5. Save eval output** — include eval results and cost dashboard in the PR body (Step 8). +**5. Save eval output** — include eval results and cost dashboard in the PR body (Step 19). **Tier reference (for context — /ship always uses `full`):** | Tier | When | Speed (cached) | Cost | @@ -1108,9 +1108,15 @@ If multiple suites need to run, run them sequentially (each needs a test lane). --- -## Step 3.4: Test Coverage Audit +## Step 7: Test Coverage Audit -100% coverage is the goal — every untested path is a path where bugs hide and vibe coding becomes yolo coding. Evaluate what was ACTUALLY coded (from the diff), not what was planned. +**Dispatch this step as a subagent** using the Agent tool with `subagent_type: "general-purpose"`. The subagent runs the coverage audit in a fresh context window — the parent only sees the conclusion, not intermediate file reads. This is context-rot defense. + +**Subagent prompt:** Pass the following instructions to the subagent, with `` substituted with the base branch: + +> You are running a ship-workflow test coverage audit. Run `git diff ...HEAD` as needed. Do not commit or push — report only. +> +> 100% coverage is the goal — every untested path is a path where bugs hide and vibe coding becomes yolo coding. Evaluate what was ACTUALLY coded (from the diff), not what was planned. ### Test Framework Detection @@ -1132,7 +1138,7 @@ ls jest.config.* vitest.config.* playwright.config.* cypress.config.* .rspec pyt ls -d test/ tests/ spec/ __tests__/ cypress/ e2e/ 2>/dev/null ``` -3. **If no framework detected:** falls through to the Test Framework Bootstrap step (Step 2.5) which handles full setup. +3. **If no framework detected:** falls through to the Test Framework Bootstrap step (Step 4) which handles full setup. **0. Before/after test count:** @@ -1274,11 +1280,11 @@ GAPS: 8 paths need tests (2 need E2E, 1 needs eval) ───────────────────────────────── ``` -**Fast path:** All paths covered → "Step 3.4: All new code paths have test coverage ✓" Continue. +**Fast path:** All paths covered → "Step 7: All new code paths have test coverage ✓" Continue. **5. Generate tests for uncovered paths:** -If test framework detected (or bootstrapped in Step 2.5): +If test framework detected (or bootstrapped in Step 4): - Prioritize error handlers and edge cases first (happy paths are more likely already tested) - Read 2-3 existing test files to match conventions exactly - Generate unit tests. Mock all external dependencies (DB, API, Redis). @@ -1292,7 +1298,7 @@ Caps: 30 code paths max, 20 tests generated max (code + user flow combined), 2-m If no test framework AND user declined bootstrap → diagram only, no generation. Note: "Test generation skipped — no test framework configured." -**Diff is test-only changes:** Skip Step 3.4 entirely: "No new application code paths to audit." +**Diff is test-only changes:** Skip Step 7 entirely: "No new application code paths to audit." **6. After-count and coverage summary:** @@ -1367,12 +1373,30 @@ Repo: {owner/repo} ## Critical Paths - {end-to-end flow that must work} ``` +> +> After your analysis, output a single JSON object on the LAST LINE of your response (no other text after it): +> `{"coverage_pct":N,"gaps":N,"diagram":"","tests_added":["path",...]}` + +**Parent processing:** + +1. Read the subagent's final output. Parse the LAST line as JSON. +2. Store `coverage_pct` (for Step 20 metrics), `gaps` (user summary), `tests_added` (for the commit). +3. Embed `diagram` verbatim in the PR body's `## Test Coverage` section (Step 19). +4. Print a one-line summary: `Coverage: {coverage_pct}%, {gaps} gaps. {tests_added.length} tests added.` + +**If the subagent fails, times out, or returns invalid JSON:** Fall back to running the audit inline in the parent. Do not block /ship on subagent failure — partial results are better than none. --- -## Step 3.45: Plan Completion Audit +## Step 8: Plan Completion Audit -### Plan File Discovery +**Dispatch this step as a subagent** using the Agent tool with `subagent_type: "general-purpose"`. The subagent reads the plan file and every referenced code file in its own fresh context. Parent gets only the conclusion. + +**Subagent prompt:** Pass these instructions to the subagent: + +> You are running a ship-workflow plan completion audit. The base branch is ``. Use `git diff ...HEAD` to see what shipped. Do not commit or push — report only. +> +> ### Plan File Discovery 1. **Conversation context (primary):** Check if there is an active plan file in this conversation. The host agent's system messages include plan file paths when in plan mode. If found, use it directly — this is the most reliable signal. @@ -1488,19 +1512,31 @@ After producing the completion checklist: **No plan file found:** Skip entirely. "No plan file detected — skipping plan completion audit." **Include in PR body (Step 8):** Add a `## Plan Completion` section with the checklist summary. +> +> After your analysis, output a single JSON object on the LAST LINE of your response (no other text after it): +> `{"total_items":N,"done":N,"changed":N,"deferred":N,"summary":""}` + +**Parent processing:** + +1. Parse the LAST line of the subagent's output as JSON. +2. Store `done`, `deferred` for Step 20 metrics; use `summary` in PR body. +3. If `deferred > 0` and no user override, present the deferred items via AskUserQuestion before continuing. +4. Embed `summary` in PR body's `## Plan Completion` section (Step 19). + +**If the subagent fails or returns invalid JSON:** Fall back to running the audit inline. Never block /ship on subagent failure. --- -## Step 3.47: Plan Verification +## Step 8.1: Plan Verification Automatically verify the plan's testing/verification steps using the `/qa-only` skill. ### 1. Check for verification section -Using the plan file already discovered in Step 3.45, look for a verification section. Match any of these headings: `## Verification`, `## Test plan`, `## Testing`, `## How to test`, `## Manual testing`, or any section with verification-flavored items (URLs to visit, things to check visually, interactions to test). +Using the plan file already discovered in Step 8, look for a verification section. Match any of these headings: `## Verification`, `## Test plan`, `## Testing`, `## How to test`, `## Manual testing`, or any section with verification-flavored items (URLs to visit, things to check visually, interactions to test). **If no verification section found:** Skip with "No verification steps found in plan — skipping auto-verification." -**If no plan file was found in Step 3.45:** Skip (already handled). +**If no plan file was found in Step 8:** Skip (already handled). ### 2. Check for running dev server @@ -1545,7 +1581,7 @@ Follow the /qa-only workflow with these modifications: ### 5. Include in PR body -Add a `## Verification Results` section to the PR body (Step 8): +Add a `## Verification Results` section to the PR body (Step 19): - If verification ran: summary of results (N PASS, M FAIL, K SKIPPED) - If skipped: reason for skipping (no plan, no server, no verification section) @@ -1560,7 +1596,7 @@ $GSTACK_BIN/gstack-learnings-search --limit 10 2>/dev/null || true If learnings are found, incorporate them into your analysis. When a review finding matches a past learning, note it: "Prior learning applied: [key] (confidence N, from [date])" -## Step 3.48: Scope Drift Detection +## Step 8.2: Scope Drift Detection Before reviewing code quality, check: **did they build what was requested — nothing more, nothing less?** @@ -1597,7 +1633,7 @@ Before reviewing code quality, check: **did they build what was requested — no --- -## Step 3.5: Pre-Landing Review +## Step 9: Pre-Landing Review Review the diff for structural issues that tests don't catch. @@ -1671,7 +1707,7 @@ Substitute: TIMESTAMP = ISO 8601 datetime, STATUS = "clean" if 0 findings or "is -### Step 3.57: Cross-review finding dedup +### Step 9.3: Cross-review finding dedup Before classifying findings, check if any were previously skipped by the user in a prior review on this branch. @@ -1691,7 +1727,7 @@ If skipped fingerprints exist, get the list of files changed since that review: git diff --name-only HEAD ``` -For each current finding (from both the checklist pass (Step 3.5) and specialist review (Step 3.55-3.56)), check: +For each current finding (from both the checklist pass (Step 9) and specialist review (Step 9.1-9.2)), check: - Does its fingerprint match a previously skipped finding? - Is the finding's file path NOT in the changed-files set? @@ -1705,7 +1741,7 @@ If no prior reviews exist or none have a `findings` array, skip this step silent Output a summary header: `Pre-Landing Review: N issues (X critical, Y informational)` -4. **Classify each finding from both the checklist pass and specialist review (Step 3.55-3.56) as AUTO-FIX or ASK** per the Fix-First Heuristic in +4. **Classify each finding from both the checklist pass and specialist review (Step 9.1-Step 9.2) as AUTO-FIX or ASK** per the Fix-First Heuristic in checklist.md. Critical findings lean toward ASK; informational lean toward AUTO-FIX. 5. **Auto-fix all AUTO-FIX items.** Apply each fix. Output one line per fix: @@ -1719,7 +1755,7 @@ Output a summary header: `Pre-Landing Review: N issues (X critical, Y informatio 7. **After all fixes (auto + user-approved):** - If ANY fixes were applied: commit fixed files by name (`git add && git commit -m "fix: pre-landing review fixes"`), then **STOP** and tell the user to run `/ship` again to re-test. - - If no fixes applied (all ASK items skipped, or no issues found): continue to Step 4. + - If no fixes applied (all ASK items skipped, or no issues found): continue to Step 12. 8. Output summary: `Pre-Landing Review: N issues — M auto-fixed, K asked (J fixed, L skipped)` @@ -1731,27 +1767,38 @@ $GSTACK_ROOT/bin/gstack-review-log '{"skill":"review","timestamp":"TIMESTAMP","s ``` Substitute TIMESTAMP (ISO 8601), STATUS ("clean" if no issues, "issues_found" otherwise), and N values from the summary counts above. The `via:"ship"` distinguishes from standalone `/review` runs. -- `quality_score` = the PR Quality Score computed in Step 3.56 (e.g., 7.5). If specialists were skipped (small diff), use `10.0` -- `specialists` = the per-specialist stats object compiled in Step 3.56. Each specialist that was considered gets an entry: `{"dispatched":true/false,"findings":N,"critical":N,"informational":N}` if dispatched, or `{"dispatched":false,"reason":"scope|gated"}` if skipped. Example: `{"testing":{"dispatched":true,"findings":2,"critical":0,"informational":2},"security":{"dispatched":false,"reason":"scope"}}` +- `quality_score` = the PR Quality Score computed in Step 9.2 (e.g., 7.5). If specialists were skipped (small diff), use `10.0` +- `specialists` = the per-specialist stats object compiled in Step 9.2. Each specialist that was considered gets an entry: `{"dispatched":true/false,"findings":N,"critical":N,"informational":N}` if dispatched, or `{"dispatched":false,"reason":"scope|gated"}` if skipped. Example: `{"testing":{"dispatched":true,"findings":2,"critical":0,"informational":2},"security":{"dispatched":false,"reason":"scope"}}` - `findings` = array of per-finding records. For each finding (from checklist pass and specialists), include: `{"fingerprint":"path:line:category","severity":"CRITICAL|INFORMATIONAL","action":"ACTION"}`. ACTION is `"auto-fixed"`, `"fixed"` (user approved), or `"skipped"` (user chose Skip). -Save the review output — it goes into the PR body in Step 8. +Save the review output — it goes into the PR body in Step 19. --- -## Step 3.75: Address Greptile review comments (if PR exists) +## Step 10: Address Greptile review comments (if PR exists) -Read `.agents/skills/gstack/review/greptile-triage.md` and follow the fetch, filter, classify, and **escalation detection** steps. +**Dispatch the fetch + classification as a subagent** using the Agent tool with `subagent_type: "general-purpose"`. The subagent pulls every Greptile comment, runs the escalation detection algorithm, and classifies each comment. Parent receives a structured list and handles user interaction + file edits. -**If no PR exists, `gh` fails, API returns an error, or there are zero Greptile comments:** Skip this step silently. Continue to Step 4. +**Subagent prompt:** -**If Greptile comments are found:** +> You are classifying Greptile review comments for a /ship workflow. Read `.agents/skills/gstack/review/greptile-triage.md` and follow the fetch, filter, classify, and **escalation detection** steps. Do NOT fix code, do NOT reply to comments, do NOT commit — report only. +> +> For each comment, assign: `classification` (`valid_actionable`, `already_fixed`, `false_positive`, `suppressed`), `escalation_tier` (1 or 2), the file:line or [top-level] tag, body summary, and permalink URL. +> +> If no PR exists, `gh` fails, the API errors, or there are zero comments, output: `{"total":0,"comments":[]}` and stop. +> +> Otherwise, output a single JSON object on the LAST LINE of your response: +> `{"total":N,"comments":[{"classification":"...","escalation_tier":N,"ref":"file:line","summary":"...","permalink":"url"},...]}` -Include a Greptile summary in your output: `+ N Greptile comments (X valid, Y fixed, Z FP)` +**Parent processing:** -Before replying to any comment, run the **Escalation Detection** algorithm from greptile-triage.md to determine whether to use Tier 1 (friendly) or Tier 2 (firm) reply templates. +Parse the LAST line as JSON. -For each classified comment: +If `total` is 0, skip this step silently. Continue to Step 12. + +Otherwise, print: `+ {total} Greptile comments ({valid_actionable} valid, {already_fixed} already fixed, {false_positive} FP)`. + +For each comment in `comments`: **VALID & ACTIONABLE:** Use AskUserQuestion with: - The comment (file:line or [top-level] + body summary + permalink URL) @@ -1774,7 +1821,7 @@ For each classified comment: **SUPPRESSED:** Skip silently — these are known false positives from previous triage. -**After all comments are resolved:** If any fixes were applied, the tests from Step 3 are now stale. **Re-run tests** (Step 3) before continuing to Step 4. If no fixes were applied, continue to Step 4. +**After all comments are resolved:** If any fixes were applied, the tests from Step 5 are now stale. **Re-run tests** (Step 5) before continuing to Step 12. If no fixes were applied, continue to Step 12. --- @@ -1807,7 +1854,7 @@ already knows. A good test: would this insight save time in a future session? If -## Step 4: Version bump (auto-decide) +## Step 12: Version bump (auto-decide) **Idempotency check:** Before bumping, compare VERSION against the base branch. @@ -1838,7 +1885,7 @@ If output shows `ALREADY_BUMPED`, VERSION was already bumped on this branch (pri --- -## CHANGELOG (auto-generate) +## Step 13: CHANGELOG (auto-generate) 1. Read `CHANGELOG.md` header to know the format. @@ -1882,7 +1929,7 @@ If output shows `ALREADY_BUMPED`, VERSION was already bumped on this branch (pri --- -## Step 5.5: TODOS.md (auto-update) +## Step 14: TODOS.md (auto-update) Cross-reference the project's TODOS.md against the changes being shipped. Mark completed items automatically; prompt only if the file is missing or disorganized. @@ -1894,7 +1941,7 @@ Read `.agents/skills/gstack/review/TODOS-format.md` for the canonical format ref - Message: "GStack recommends maintaining a TODOS.md organized by skill/component, then priority (P0 at top through P4, then Completed at bottom). See TODOS-format.md for the full format. Would you like to create one?" - Options: A) Create it now, B) Skip for now - If A: Create `TODOS.md` with a skeleton (# TODOS heading + ## Completed section). Continue to step 3. -- If B: Skip the rest of Step 5.5. Continue to Step 6. +- If B: Skip the rest of Step 14. Continue to Step 15. **2. Check structure and organization:** @@ -1933,11 +1980,11 @@ For each TODO item, check if the changes in this PR complete it by: **6. Defensive:** If TODOS.md cannot be written (permission error, disk full), warn the user and continue. Never stop the ship workflow for a TODOS failure. -Save this summary — it goes into the PR body in Step 8. +Save this summary — it goes into the PR body in Step 19. --- -## Step 6: Commit (bisectable chunks) +## Step 15: Commit (bisectable chunks) **Goal:** Create small, logical commits that work well with `git bisect` and help LLMs understand what changed. @@ -1975,13 +2022,13 @@ EOF --- -## Step 6.5: Verification Gate +## Step 16: Verification Gate **IRON LAW: NO COMPLETION CLAIMS WITHOUT FRESH VERIFICATION EVIDENCE.** Before pushing, re-verify if code changed during Steps 4-6: -1. **Test verification:** If ANY code changed after Step 3's test run (fixes from review findings, CHANGELOG edits don't count), re-run the test suite. Paste fresh output. Stale output from Step 3 is NOT acceptable. +1. **Test verification:** If ANY code changed after Step 5's test run (fixes from review findings, CHANGELOG edits don't count), re-run the test suite. Paste fresh output. Stale output from Step 5 is NOT acceptable. 2. **Build verification:** If the project has a build step, run it. Paste output. @@ -1991,13 +2038,13 @@ Before pushing, re-verify if code changed during Steps 4-6: - "I already tested earlier" → Code changed since then. Test again. - "It's a trivial change" → Trivial changes break production. -**If tests fail here:** STOP. Do not push. Fix the issue and return to Step 3. +**If tests fail here:** STOP. Do not push. Fix the issue and return to Step 5. Claiming work is complete without verification is dishonesty, not efficiency. --- -## Step 7: Push +## Step 17: Push **Idempotency check:** Check if the branch is already pushed and up to date. @@ -2009,15 +2056,44 @@ echo "LOCAL: $LOCAL REMOTE: $REMOTE" [ "$LOCAL" = "$REMOTE" ] && echo "ALREADY_PUSHED" || echo "PUSH_NEEDED" ``` -If `ALREADY_PUSHED`, skip the push but continue to Step 8. Otherwise push with upstream tracking: +If `ALREADY_PUSHED`, skip the push but continue to Step 18. Otherwise push with upstream tracking: ```bash git push -u origin ``` +**You are NOT done.** The code is pushed but documentation sync and PR creation are mandatory final steps. Continue to Step 18. + --- -## Step 8: Create PR/MR +## Step 18: Documentation sync (via subagent, before PR creation) + +**Dispatch /document-release as a subagent** using the Agent tool with `subagent_type: "general-purpose"`. The subagent gets a fresh context window — zero rot from the preceding 17 steps. It also runs the **full** `/document-release` workflow (with CHANGELOG clobber protection, doc exclusions, risky-change gates, named staging, race-safe PR body editing) rather than a weaker reimplementation. + +**Sequencing:** This step runs AFTER Step 17 (Push) and BEFORE Step 19 (Create PR). The PR is created once from final HEAD with the `## Documentation` section baked into the initial body. No create-then-re-edit dance. + +**Subagent prompt:** + +> You are executing the /document-release workflow after a code push. Read the full skill file `${HOME}/.agents/skills/gstack/document-release/SKILL.md` and execute its complete workflow end-to-end, including CHANGELOG clobber protection, doc exclusions, risky-change gates, and named staging. Do NOT attempt to edit the PR body — no PR exists yet. Branch: ``, base: ``. +> +> After completing the workflow, output a single JSON object on the LAST LINE of your response (no other text after it): +> `{"files_updated":["README.md","CLAUDE.md",...],"commit_sha":"abc1234","pushed":true,"documentation_section":""}` +> +> If no documentation files needed updating, output: +> `{"files_updated":[],"commit_sha":null,"pushed":false,"documentation_section":null}` + +**Parent processing:** + +1. Parse the LAST line of the subagent's output as JSON. +2. Store `documentation_section` — Step 19 embeds it in the PR body (or omits the section if null). +3. If `files_updated` is non-empty, print: `Documentation synced: {files_updated.length} files updated, committed as {commit_sha}`. +4. If `files_updated` is empty, print: `Documentation is current — no updates needed.` + +**If the subagent fails or returns invalid JSON:** Print a warning and proceed to Step 19 without a `## Documentation` section. Do not block /ship on subagent failure. The user can run `/document-release` manually after the PR lands. + +--- + +## Step 19: Create PR/MR **Idempotency check:** Check if a PR/MR already exists for this branch. @@ -2031,7 +2107,7 @@ gh pr view --json url,number,state -q 'if .state == "OPEN" then "PR #\(.number): glab mr view -F json 2>/dev/null | jq -r 'if .state == "opened" then "MR_EXISTS" else "NO_MR" end' 2>/dev/null || echo "NO_MR" ``` -If an **open** PR/MR already exists: **update** the PR body using `gh pr edit --body "..."` (GitHub) or `glab mr update -d "..."` (GitLab). Always regenerate the PR body from scratch using this run's fresh results (test output, coverage audit, review findings, adversarial review, TODOS summary). Never reuse stale PR body content from a prior run. Print the existing URL and continue to Step 8.5. +If an **open** PR/MR already exists: **update** the PR body using `gh pr edit --body "..."` (GitHub) or `glab mr update -d "..."` (GitLab). Always regenerate the PR body from scratch using this run's fresh results (test output, coverage audit, review findings, adversarial review, TODOS summary, documentation_section from Step 18). Never reuse stale PR body content from a prior run. Print the existing URL and continue to Step 20. If no PR/MR exists: create a pull request (GitHub) or merge request (GitLab) using the platform detected in Step 0. @@ -2047,11 +2123,11 @@ must appear in at least one section. If a commit's work isn't reflected in the s you missed it.> ## Test Coverage - - + + ## Pre-Landing Review - + ## Design Review @@ -2063,19 +2139,19 @@ you missed it.> ## Greptile Review - + ## Scope Drift ## Plan Completion - + ## Verification Results - + @@ -2085,6 +2161,10 @@ you missed it.> +## Documentation + + + ## Test plan - [x] All Rails tests pass (N runs, 0 failures) - [x] All Vitest tests pass (N tests) @@ -2113,34 +2193,11 @@ EOF **If neither CLI is available:** Print the branch name, remote URL, and instruct the user to create the PR/MR manually via the web UI. Do not stop — the code is pushed and ready. -**Output the PR/MR URL** — then proceed to Step 8.5. +**Output the PR/MR URL** — then proceed to Step 20. --- -## Step 8.5: Auto-invoke /document-release - -After the PR is created, automatically sync project documentation. Read the -`document-release/SKILL.md` skill file (adjacent to this skill's directory) and -execute its full workflow: - -1. Read the `/document-release` skill: `cat ${CLAUDE_SKILL_DIR}/../document-release/SKILL.md` -2. Follow its instructions — it reads all .md files in the project, cross-references - the diff, and updates anything that drifted (README, ARCHITECTURE, CONTRIBUTING, - CLAUDE.md, TODOS, etc.) -3. If any docs were updated, commit the changes and push to the same branch: - ```bash - git add -A && git commit -m "docs: sync documentation with shipped changes" && git push - ``` -4. If no docs needed updating, say "Documentation is current — no updates needed." - -This step is automatic. Do not ask the user for confirmation. The goal is zero-friction -doc updates — the user runs `/ship` and documentation stays current without a separate command. - -If Step 8.5 created a docs commit, re-edit the PR/MR body to include the latest commit SHA in the summary. This ensures the PR body reflects the truly final state after document-release. - ---- - -## Step 8.75: Persist ship metrics +## Step 20: Persist ship metrics Log coverage and plan completion data so `/retro` can track trends: @@ -2155,10 +2212,10 @@ echo '{"skill":"ship","timestamp":"'"$(date -u +%Y-%m-%dT%H:%M:%SZ)"'","coverage ``` Substitute from earlier steps: -- **COVERAGE_PCT**: coverage percentage from Step 3.4 diagram (integer, or -1 if undetermined) -- **PLAN_TOTAL**: total plan items extracted in Step 3.45 (0 if no plan file) -- **PLAN_DONE**: count of DONE + CHANGED items from Step 3.45 (0 if no plan file) -- **VERIFY_RESULT**: "pass", "fail", or "skipped" from Step 3.47 +- **COVERAGE_PCT**: coverage percentage from Step 7 diagram (integer, or -1 if undetermined) +- **PLAN_TOTAL**: total plan items extracted in Step 8 (0 if no plan file) +- **PLAN_DONE**: count of DONE + CHANGED items from Step 8 (0 if no plan file) +- **VERIFY_RESULT**: "pass", "fail", or "skipped" from Step 8.1 - **VERSION**: from the VERSION file - **BRANCH**: current branch name @@ -2177,6 +2234,6 @@ This step is automatic — never skip it, never ask for confirmation. - **Split commits for bisectability** — each commit = one logical change. - **TODOS.md completion detection must be conservative.** Only mark items as completed when the diff clearly shows the work is done. - **Use Greptile reply templates from greptile-triage.md.** Every reply includes evidence (inline diff, code references, re-rank suggestion). Never post vague replies. -- **Never push without fresh verification evidence.** If code changed after Step 3 tests, re-run before pushing. -- **Step 3.4 generates coverage tests.** They must pass before committing. Never commit failing tests. +- **Never push without fresh verification evidence.** If code changed after Step 5 tests, re-run before pushing. +- **Step 7 generates coverage tests.** They must pass before committing. Never commit failing tests. - **The goal is: user says `/ship`, next thing they see is the review + PR URL + auto-synced docs.** diff --git a/test/fixtures/golden/factory-ship-SKILL.md b/test/fixtures/golden/factory-ship-SKILL.md index dc6f10ce..74da5ce0 100644 --- a/test/fixtures/golden/factory-ship-SKILL.md +++ b/test/fixtures/golden/factory-ship-SKILL.md @@ -615,17 +615,17 @@ You are running the `/ship` workflow. This is a **non-interactive, fully automat - Merge conflicts that can't be auto-resolved (stop, show conflicts) - In-branch test failures (pre-existing failures are triaged, not auto-blocking) - Pre-landing review finds ASK items that need user judgment -- MINOR or MAJOR version bump needed (ask — see Step 4) +- MINOR or MAJOR version bump needed (ask — see Step 12) - Greptile review comments that need user decision (complex fixes, false positives) -- AI-assessed coverage below minimum threshold (hard gate with user override — see Step 3.4) -- Plan items NOT DONE with no user override (see Step 3.45) -- Plan verification failures (see Step 3.47) -- TODOS.md missing and user wants to create one (ask — see Step 5.5) -- TODOS.md disorganized and user wants to reorganize (ask — see Step 5.5) +- AI-assessed coverage below minimum threshold (hard gate with user override — see Step 7) +- Plan items NOT DONE with no user override (see Step 8) +- Plan verification failures (see Step 8.1) +- TODOS.md missing and user wants to create one (ask — see Step 14) +- TODOS.md disorganized and user wants to reorganize (ask — see Step 14) **Never stop for:** - Uncommitted changes (always include them) -- Version bump choice (auto-pick MICRO or PATCH — see Step 4) +- Version bump choice (auto-pick MICRO or PATCH — see Step 12) - CHANGELOG content (auto-generate from diff) - Commit message approval (auto-commit) - Multi-file changesets (auto-split into bisectable commits) @@ -638,9 +638,9 @@ Re-running `/ship` means "run the whole checklist again." Every verification ste (tests, coverage audit, plan completion, pre-landing review, adversarial review, VERSION/CHANGELOG check, TODOS, document-release) runs on every invocation. Only *actions* are idempotent: -- Step 4: If VERSION already bumped, skip the bump but still read the version -- Step 7: If already pushed, skip the push command -- Step 8: If PR exists, update the body instead of creating a new PR +- Step 12: If VERSION already bumped, skip the bump but still read the version +- Step 17: If already pushed, skip the push command +- Step 19: If PR exists, update the body instead of creating a new PR Never skip a verification step because a prior `/ship` run already performed it. --- @@ -708,19 +708,19 @@ Display: If the Eng Review is NOT "CLEAR": -Print: "No prior eng review found — ship will run its own pre-landing review in Step 3.5." +Print: "No prior eng review found — ship will run its own pre-landing review in Step 9." Check diff size: `git diff ...HEAD --stat | tail -1`. If the diff is >200 lines, add: "Note: This is a large diff. Consider running `/plan-eng-review` or `/autoplan` for architecture-level review before shipping." If CEO Review is missing, mention as informational ("CEO Review not run — recommended for product changes") but do NOT block. -For Design Review: run `source <($GSTACK_ROOT/bin/gstack-diff-scope 2>/dev/null)`. If `SCOPE_FRONTEND=true` and no design review (plan-design-review or design-review-lite) exists in the dashboard, mention: "Design Review not run — this PR changes frontend code. The lite design check will run automatically in Step 3.5, but consider running /design-review for a full visual audit post-implementation." Still never block. +For Design Review: run `source <($GSTACK_ROOT/bin/gstack-diff-scope 2>/dev/null)`. If `SCOPE_FRONTEND=true` and no design review (plan-design-review or design-review-lite) exists in the dashboard, mention: "Design Review not run — this PR changes frontend code. The lite design check will run automatically in Step 9, but consider running /design-review for a full visual audit post-implementation." Still never block. -Continue to Step 1.5 — do NOT block or ask. Ship runs its own review in Step 3.5. +Continue to Step 2 — do NOT block or ask. Ship runs its own review in Step 9. --- -## Step 1.5: Distribution Pipeline Check +## Step 2: Distribution Pipeline Check If the diff introduces a new standalone artifact (CLI binary, library package, tool) — not a web service with existing deployment — verify that a distribution pipeline exists. @@ -748,7 +748,7 @@ service with existing deployment — verify that a distribution pipeline exists. --- -## Step 2: Merge the base branch (BEFORE tests) +## Step 3: Merge the base branch (BEFORE tests) Fetch and merge the base branch into the feature branch so tests run against the merged state: @@ -762,7 +762,7 @@ git fetch origin && git merge origin/ --no-edit --- -## Step 2.5: Test Framework Bootstrap +## Step 4: Test Framework Bootstrap ## Test Framework Bootstrap @@ -791,7 +791,7 @@ ls -d test/ tests/ spec/ __tests__/ cypress/ e2e/ 2>/dev/null **If test framework detected** (config files or test directories found): Print "Test framework detected: {name} ({N} existing tests). Skipping bootstrap." Read 2-3 existing test files to learn conventions (naming, imports, assertion style, setup patterns). -Store conventions as prose context for use in Phase 8e.5 or Step 3.4. **Skip the rest of bootstrap.** +Store conventions as prose context for use in Phase 8e.5 or Step 7. **Skip the rest of bootstrap.** **If BOOTSTRAP_DECLINED** appears: Print "Test bootstrap previously declined — skipping." **Skip the rest of bootstrap.** @@ -920,7 +920,7 @@ Only commit if there are changes. Stage all bootstrap files (config, test direct --- -## Step 3: Run tests (on merged code) +## Step 5: Run tests (on merged code) **Do NOT run `RAILS_ENV=test bin/rails db:migrate`** — `bin/test-lane` already calls `db:test:prepare` internally, which loads the schema into the correct lane database. @@ -1042,13 +1042,13 @@ Use AskUserQuestion: - Continue with the workflow. - Note in output: "Pre-existing test failure skipped: " -**After triage:** If any in-branch failures remain unfixed, **STOP**. Do not proceed. If all failures were pre-existing and handled (fixed, TODOed, assigned, or skipped), continue to Step 3.25. +**After triage:** If any in-branch failures remain unfixed, **STOP**. Do not proceed. If all failures were pre-existing and handled (fixed, TODOed, assigned, or skipped), continue to Step 6. **If all pass:** Continue silently — just note the counts briefly. --- -## Step 3.25: Eval Suites (conditional) +## Step 6: Eval Suites (conditional) Evals are mandatory when prompt-related files change. Skip this step entirely if no prompt files are in the diff. @@ -1067,7 +1067,7 @@ Match against these patterns (from CLAUDE.md): - `config/system_prompts/*.txt` - `test/evals/**/*` (eval infrastructure changes affect all suites) -**If no matches:** Print "No prompt-related files changed — skipping evals." and continue to Step 3.5. +**If no matches:** Print "No prompt-related files changed — skipping evals." and continue to Step 9. **2. Identify affected eval suites:** @@ -1097,9 +1097,9 @@ If multiple suites need to run, run them sequentially (each needs a test lane). **4. Check results:** - **If any eval fails:** Show the failures, the cost dashboard, and **STOP**. Do not proceed. -- **If all pass:** Note pass counts and cost. Continue to Step 3.5. +- **If all pass:** Note pass counts and cost. Continue to Step 9. -**5. Save eval output** — include eval results and cost dashboard in the PR body (Step 8). +**5. Save eval output** — include eval results and cost dashboard in the PR body (Step 19). **Tier reference (for context — /ship always uses `full`):** | Tier | When | Speed (cached) | Cost | @@ -1110,9 +1110,15 @@ If multiple suites need to run, run them sequentially (each needs a test lane). --- -## Step 3.4: Test Coverage Audit +## Step 7: Test Coverage Audit -100% coverage is the goal — every untested path is a path where bugs hide and vibe coding becomes yolo coding. Evaluate what was ACTUALLY coded (from the diff), not what was planned. +**Dispatch this step as a subagent** using the Agent tool with `subagent_type: "general-purpose"`. The subagent runs the coverage audit in a fresh context window — the parent only sees the conclusion, not intermediate file reads. This is context-rot defense. + +**Subagent prompt:** Pass the following instructions to the subagent, with `` substituted with the base branch: + +> You are running a ship-workflow test coverage audit. Run `git diff ...HEAD` as needed. Do not commit or push — report only. +> +> 100% coverage is the goal — every untested path is a path where bugs hide and vibe coding becomes yolo coding. Evaluate what was ACTUALLY coded (from the diff), not what was planned. ### Test Framework Detection @@ -1134,7 +1140,7 @@ ls jest.config.* vitest.config.* playwright.config.* cypress.config.* .rspec pyt ls -d test/ tests/ spec/ __tests__/ cypress/ e2e/ 2>/dev/null ``` -3. **If no framework detected:** falls through to the Test Framework Bootstrap step (Step 2.5) which handles full setup. +3. **If no framework detected:** falls through to the Test Framework Bootstrap step (Step 4) which handles full setup. **0. Before/after test count:** @@ -1276,11 +1282,11 @@ GAPS: 8 paths need tests (2 need E2E, 1 needs eval) ───────────────────────────────── ``` -**Fast path:** All paths covered → "Step 3.4: All new code paths have test coverage ✓" Continue. +**Fast path:** All paths covered → "Step 7: All new code paths have test coverage ✓" Continue. **5. Generate tests for uncovered paths:** -If test framework detected (or bootstrapped in Step 2.5): +If test framework detected (or bootstrapped in Step 4): - Prioritize error handlers and edge cases first (happy paths are more likely already tested) - Read 2-3 existing test files to match conventions exactly - Generate unit tests. Mock all external dependencies (DB, API, Redis). @@ -1294,7 +1300,7 @@ Caps: 30 code paths max, 20 tests generated max (code + user flow combined), 2-m If no test framework AND user declined bootstrap → diagram only, no generation. Note: "Test generation skipped — no test framework configured." -**Diff is test-only changes:** Skip Step 3.4 entirely: "No new application code paths to audit." +**Diff is test-only changes:** Skip Step 7 entirely: "No new application code paths to audit." **6. After-count and coverage summary:** @@ -1369,12 +1375,30 @@ Repo: {owner/repo} ## Critical Paths - {end-to-end flow that must work} ``` +> +> After your analysis, output a single JSON object on the LAST LINE of your response (no other text after it): +> `{"coverage_pct":N,"gaps":N,"diagram":"","tests_added":["path",...]}` + +**Parent processing:** + +1. Read the subagent's final output. Parse the LAST line as JSON. +2. Store `coverage_pct` (for Step 20 metrics), `gaps` (user summary), `tests_added` (for the commit). +3. Embed `diagram` verbatim in the PR body's `## Test Coverage` section (Step 19). +4. Print a one-line summary: `Coverage: {coverage_pct}%, {gaps} gaps. {tests_added.length} tests added.` + +**If the subagent fails, times out, or returns invalid JSON:** Fall back to running the audit inline in the parent. Do not block /ship on subagent failure — partial results are better than none. --- -## Step 3.45: Plan Completion Audit +## Step 8: Plan Completion Audit -### Plan File Discovery +**Dispatch this step as a subagent** using the Agent tool with `subagent_type: "general-purpose"`. The subagent reads the plan file and every referenced code file in its own fresh context. Parent gets only the conclusion. + +**Subagent prompt:** Pass these instructions to the subagent: + +> You are running a ship-workflow plan completion audit. The base branch is ``. Use `git diff ...HEAD` to see what shipped. Do not commit or push — report only. +> +> ### Plan File Discovery 1. **Conversation context (primary):** Check if there is an active plan file in this conversation. The host agent's system messages include plan file paths when in plan mode. If found, use it directly — this is the most reliable signal. @@ -1490,19 +1514,31 @@ After producing the completion checklist: **No plan file found:** Skip entirely. "No plan file detected — skipping plan completion audit." **Include in PR body (Step 8):** Add a `## Plan Completion` section with the checklist summary. +> +> After your analysis, output a single JSON object on the LAST LINE of your response (no other text after it): +> `{"total_items":N,"done":N,"changed":N,"deferred":N,"summary":""}` + +**Parent processing:** + +1. Parse the LAST line of the subagent's output as JSON. +2. Store `done`, `deferred` for Step 20 metrics; use `summary` in PR body. +3. If `deferred > 0` and no user override, present the deferred items via AskUserQuestion before continuing. +4. Embed `summary` in PR body's `## Plan Completion` section (Step 19). + +**If the subagent fails or returns invalid JSON:** Fall back to running the audit inline. Never block /ship on subagent failure. --- -## Step 3.47: Plan Verification +## Step 8.1: Plan Verification Automatically verify the plan's testing/verification steps using the `/qa-only` skill. ### 1. Check for verification section -Using the plan file already discovered in Step 3.45, look for a verification section. Match any of these headings: `## Verification`, `## Test plan`, `## Testing`, `## How to test`, `## Manual testing`, or any section with verification-flavored items (URLs to visit, things to check visually, interactions to test). +Using the plan file already discovered in Step 8, look for a verification section. Match any of these headings: `## Verification`, `## Test plan`, `## Testing`, `## How to test`, `## Manual testing`, or any section with verification-flavored items (URLs to visit, things to check visually, interactions to test). **If no verification section found:** Skip with "No verification steps found in plan — skipping auto-verification." -**If no plan file was found in Step 3.45:** Skip (already handled). +**If no plan file was found in Step 8:** Skip (already handled). ### 2. Check for running dev server @@ -1547,7 +1583,7 @@ Follow the /qa-only workflow with these modifications: ### 5. Include in PR body -Add a `## Verification Results` section to the PR body (Step 8): +Add a `## Verification Results` section to the PR body (Step 19): - If verification ran: summary of results (N PASS, M FAIL, K SKIPPED) - If skipped: reason for skipping (no plan, no server, no verification section) @@ -1589,7 +1625,7 @@ matches a past learning, display: This makes the compounding visible. The user should see that gstack is getting smarter on their codebase over time. -## Step 3.48: Scope Drift Detection +## Step 8.2: Scope Drift Detection Before reviewing code quality, check: **did they build what was requested — nothing more, nothing less?** @@ -1626,7 +1662,7 @@ Before reviewing code quality, check: **did they build what was requested — no --- -## Step 3.5: Pre-Landing Review +## Step 9: Pre-Landing Review Review the diff for structural issues that tests don't catch. @@ -1721,7 +1757,7 @@ Present Codex output under a `CODEX (design):` header, merged with the checklist Include any design findings alongside the code review findings. They follow the same Fix-First flow below. -## Step 3.55: Review Army — Specialist Dispatch +## Step 9.1: Review Army — Specialist Dispatch ### Detect stack and scope @@ -1838,7 +1874,7 @@ CHECKLIST: --- -### Step 3.56: Collect and merge findings +### Step 9.2: Collect and merge findings After all specialist subagents complete, collect their outputs. @@ -1884,7 +1920,7 @@ SPECIALIST REVIEW: N findings (X critical, Y informational) from Z specialists PR Quality Score: X/10 ``` -These findings flow into the Fix-First flow (item 4) alongside the checklist pass (Step 3.5). +These findings flow into the Fix-First flow (item 4) alongside the checklist pass (Step 9). The Fix-First heuristic applies identically — specialist findings follow the same AUTO-FIX vs ASK classification. **Compile per-specialist stats:** @@ -1908,7 +1944,7 @@ If activated, dispatch one more subagent via the Agent tool (foreground, not bac The Red Team subagent receives: 1. The red-team checklist from `$GSTACK_ROOT/review/specialists/red-team.md` -2. The merged specialist findings from Step 3.56 (so it knows what was already caught) +2. The merged specialist findings from Step 9.2 (so it knows what was already caught) 3. The git diff command Prompt: "You are a red team reviewer. The code has already been reviewed by N specialists @@ -1924,7 +1960,7 @@ the Fix-First flow (item 4). Red Team findings are tagged with `"specialist":"re If the Red Team returns NO FINDINGS, note: "Red Team review: no additional issues found." If the Red Team subagent fails or times out, skip silently and continue. -### Step 3.57: Cross-review finding dedup +### Step 9.3: Cross-review finding dedup Before classifying findings, check if any were previously skipped by the user in a prior review on this branch. @@ -1944,7 +1980,7 @@ If skipped fingerprints exist, get the list of files changed since that review: git diff --name-only HEAD ``` -For each current finding (from both the checklist pass (Step 3.5) and specialist review (Step 3.55-3.56)), check: +For each current finding (from both the checklist pass (Step 9) and specialist review (Step 9.1-9.2)), check: - Does its fingerprint match a previously skipped finding? - Is the finding's file path NOT in the changed-files set? @@ -1958,7 +1994,7 @@ If no prior reviews exist or none have a `findings` array, skip this step silent Output a summary header: `Pre-Landing Review: N issues (X critical, Y informational)` -4. **Classify each finding from both the checklist pass and specialist review (Step 3.55-3.56) as AUTO-FIX or ASK** per the Fix-First Heuristic in +4. **Classify each finding from both the checklist pass and specialist review (Step 9.1-Step 9.2) as AUTO-FIX or ASK** per the Fix-First Heuristic in checklist.md. Critical findings lean toward ASK; informational lean toward AUTO-FIX. 5. **Auto-fix all AUTO-FIX items.** Apply each fix. Output one line per fix: @@ -1972,7 +2008,7 @@ Output a summary header: `Pre-Landing Review: N issues (X critical, Y informatio 7. **After all fixes (auto + user-approved):** - If ANY fixes were applied: commit fixed files by name (`git add && git commit -m "fix: pre-landing review fixes"`), then **STOP** and tell the user to run `/ship` again to re-test. - - If no fixes applied (all ASK items skipped, or no issues found): continue to Step 4. + - If no fixes applied (all ASK items skipped, or no issues found): continue to Step 12. 8. Output summary: `Pre-Landing Review: N issues — M auto-fixed, K asked (J fixed, L skipped)` @@ -1984,27 +2020,38 @@ $GSTACK_ROOT/bin/gstack-review-log '{"skill":"review","timestamp":"TIMESTAMP","s ``` Substitute TIMESTAMP (ISO 8601), STATUS ("clean" if no issues, "issues_found" otherwise), and N values from the summary counts above. The `via:"ship"` distinguishes from standalone `/review` runs. -- `quality_score` = the PR Quality Score computed in Step 3.56 (e.g., 7.5). If specialists were skipped (small diff), use `10.0` -- `specialists` = the per-specialist stats object compiled in Step 3.56. Each specialist that was considered gets an entry: `{"dispatched":true/false,"findings":N,"critical":N,"informational":N}` if dispatched, or `{"dispatched":false,"reason":"scope|gated"}` if skipped. Example: `{"testing":{"dispatched":true,"findings":2,"critical":0,"informational":2},"security":{"dispatched":false,"reason":"scope"}}` +- `quality_score` = the PR Quality Score computed in Step 9.2 (e.g., 7.5). If specialists were skipped (small diff), use `10.0` +- `specialists` = the per-specialist stats object compiled in Step 9.2. Each specialist that was considered gets an entry: `{"dispatched":true/false,"findings":N,"critical":N,"informational":N}` if dispatched, or `{"dispatched":false,"reason":"scope|gated"}` if skipped. Example: `{"testing":{"dispatched":true,"findings":2,"critical":0,"informational":2},"security":{"dispatched":false,"reason":"scope"}}` - `findings` = array of per-finding records. For each finding (from checklist pass and specialists), include: `{"fingerprint":"path:line:category","severity":"CRITICAL|INFORMATIONAL","action":"ACTION"}`. ACTION is `"auto-fixed"`, `"fixed"` (user approved), or `"skipped"` (user chose Skip). -Save the review output — it goes into the PR body in Step 8. +Save the review output — it goes into the PR body in Step 19. --- -## Step 3.75: Address Greptile review comments (if PR exists) +## Step 10: Address Greptile review comments (if PR exists) -Read `.factory/skills/gstack/review/greptile-triage.md` and follow the fetch, filter, classify, and **escalation detection** steps. +**Dispatch the fetch + classification as a subagent** using the Agent tool with `subagent_type: "general-purpose"`. The subagent pulls every Greptile comment, runs the escalation detection algorithm, and classifies each comment. Parent receives a structured list and handles user interaction + file edits. -**If no PR exists, `gh` fails, API returns an error, or there are zero Greptile comments:** Skip this step silently. Continue to Step 4. +**Subagent prompt:** -**If Greptile comments are found:** +> You are classifying Greptile review comments for a /ship workflow. Read `.factory/skills/gstack/review/greptile-triage.md` and follow the fetch, filter, classify, and **escalation detection** steps. Do NOT fix code, do NOT reply to comments, do NOT commit — report only. +> +> For each comment, assign: `classification` (`valid_actionable`, `already_fixed`, `false_positive`, `suppressed`), `escalation_tier` (1 or 2), the file:line or [top-level] tag, body summary, and permalink URL. +> +> If no PR exists, `gh` fails, the API errors, or there are zero comments, output: `{"total":0,"comments":[]}` and stop. +> +> Otherwise, output a single JSON object on the LAST LINE of your response: +> `{"total":N,"comments":[{"classification":"...","escalation_tier":N,"ref":"file:line","summary":"...","permalink":"url"},...]}` -Include a Greptile summary in your output: `+ N Greptile comments (X valid, Y fixed, Z FP)` +**Parent processing:** -Before replying to any comment, run the **Escalation Detection** algorithm from greptile-triage.md to determine whether to use Tier 1 (friendly) or Tier 2 (firm) reply templates. +Parse the LAST line as JSON. -For each classified comment: +If `total` is 0, skip this step silently. Continue to Step 12. + +Otherwise, print: `+ {total} Greptile comments ({valid_actionable} valid, {already_fixed} already fixed, {false_positive} FP)`. + +For each comment in `comments`: **VALID & ACTIONABLE:** Use AskUserQuestion with: - The comment (file:line or [top-level] + body summary + permalink URL) @@ -2027,11 +2074,11 @@ For each classified comment: **SUPPRESSED:** Skip silently — these are known false positives from previous triage. -**After all comments are resolved:** If any fixes were applied, the tests from Step 3 are now stale. **Re-run tests** (Step 3) before continuing to Step 4. If no fixes were applied, continue to Step 4. +**After all comments are resolved:** If any fixes were applied, the tests from Step 5 are now stale. **Re-run tests** (Step 5) before continuing to Step 12. If no fixes were applied, continue to Step 12. --- -## Step 3.8: Adversarial review (always-on) +## Step 11: Adversarial review (always-on) Every diff gets adversarial review from both Claude and Codex. LOC is not a proxy for risk — a 5-line auth change can be critical. @@ -2117,7 +2164,7 @@ A) Investigate and fix now (recommended) B) Continue — review will still complete ``` -If A: address the findings. After fixing, re-run tests (Step 3) since code has changed. Re-run `codex review` to verify. +If A: address the findings. After fixing, re-run tests (Step 5) since code has changed. Re-run `codex review` to verify. Read stderr for errors (same error handling as Codex adversarial above). @@ -2183,7 +2230,7 @@ already knows. A good test: would this insight save time in a future session? If -## Step 4: Version bump (auto-decide) +## Step 12: Version bump (auto-decide) **Idempotency check:** Before bumping, compare VERSION against the base branch. @@ -2214,7 +2261,7 @@ If output shows `ALREADY_BUMPED`, VERSION was already bumped on this branch (pri --- -## CHANGELOG (auto-generate) +## Step 13: CHANGELOG (auto-generate) 1. Read `CHANGELOG.md` header to know the format. @@ -2258,7 +2305,7 @@ If output shows `ALREADY_BUMPED`, VERSION was already bumped on this branch (pri --- -## Step 5.5: TODOS.md (auto-update) +## Step 14: TODOS.md (auto-update) Cross-reference the project's TODOS.md against the changes being shipped. Mark completed items automatically; prompt only if the file is missing or disorganized. @@ -2270,7 +2317,7 @@ Read `.factory/skills/gstack/review/TODOS-format.md` for the canonical format re - Message: "GStack recommends maintaining a TODOS.md organized by skill/component, then priority (P0 at top through P4, then Completed at bottom). See TODOS-format.md for the full format. Would you like to create one?" - Options: A) Create it now, B) Skip for now - If A: Create `TODOS.md` with a skeleton (# TODOS heading + ## Completed section). Continue to step 3. -- If B: Skip the rest of Step 5.5. Continue to Step 6. +- If B: Skip the rest of Step 14. Continue to Step 15. **2. Check structure and organization:** @@ -2309,11 +2356,11 @@ For each TODO item, check if the changes in this PR complete it by: **6. Defensive:** If TODOS.md cannot be written (permission error, disk full), warn the user and continue. Never stop the ship workflow for a TODOS failure. -Save this summary — it goes into the PR body in Step 8. +Save this summary — it goes into the PR body in Step 19. --- -## Step 6: Commit (bisectable chunks) +## Step 15: Commit (bisectable chunks) **Goal:** Create small, logical commits that work well with `git bisect` and help LLMs understand what changed. @@ -2351,13 +2398,13 @@ EOF --- -## Step 6.5: Verification Gate +## Step 16: Verification Gate **IRON LAW: NO COMPLETION CLAIMS WITHOUT FRESH VERIFICATION EVIDENCE.** Before pushing, re-verify if code changed during Steps 4-6: -1. **Test verification:** If ANY code changed after Step 3's test run (fixes from review findings, CHANGELOG edits don't count), re-run the test suite. Paste fresh output. Stale output from Step 3 is NOT acceptable. +1. **Test verification:** If ANY code changed after Step 5's test run (fixes from review findings, CHANGELOG edits don't count), re-run the test suite. Paste fresh output. Stale output from Step 5 is NOT acceptable. 2. **Build verification:** If the project has a build step, run it. Paste output. @@ -2367,13 +2414,13 @@ Before pushing, re-verify if code changed during Steps 4-6: - "I already tested earlier" → Code changed since then. Test again. - "It's a trivial change" → Trivial changes break production. -**If tests fail here:** STOP. Do not push. Fix the issue and return to Step 3. +**If tests fail here:** STOP. Do not push. Fix the issue and return to Step 5. Claiming work is complete without verification is dishonesty, not efficiency. --- -## Step 7: Push +## Step 17: Push **Idempotency check:** Check if the branch is already pushed and up to date. @@ -2385,15 +2432,44 @@ echo "LOCAL: $LOCAL REMOTE: $REMOTE" [ "$LOCAL" = "$REMOTE" ] && echo "ALREADY_PUSHED" || echo "PUSH_NEEDED" ``` -If `ALREADY_PUSHED`, skip the push but continue to Step 8. Otherwise push with upstream tracking: +If `ALREADY_PUSHED`, skip the push but continue to Step 18. Otherwise push with upstream tracking: ```bash git push -u origin ``` +**You are NOT done.** The code is pushed but documentation sync and PR creation are mandatory final steps. Continue to Step 18. + --- -## Step 8: Create PR/MR +## Step 18: Documentation sync (via subagent, before PR creation) + +**Dispatch /document-release as a subagent** using the Agent tool with `subagent_type: "general-purpose"`. The subagent gets a fresh context window — zero rot from the preceding 17 steps. It also runs the **full** `/document-release` workflow (with CHANGELOG clobber protection, doc exclusions, risky-change gates, named staging, race-safe PR body editing) rather than a weaker reimplementation. + +**Sequencing:** This step runs AFTER Step 17 (Push) and BEFORE Step 19 (Create PR). The PR is created once from final HEAD with the `## Documentation` section baked into the initial body. No create-then-re-edit dance. + +**Subagent prompt:** + +> You are executing the /document-release workflow after a code push. Read the full skill file `${HOME}/.factory/skills/gstack/document-release/SKILL.md` and execute its complete workflow end-to-end, including CHANGELOG clobber protection, doc exclusions, risky-change gates, and named staging. Do NOT attempt to edit the PR body — no PR exists yet. Branch: ``, base: ``. +> +> After completing the workflow, output a single JSON object on the LAST LINE of your response (no other text after it): +> `{"files_updated":["README.md","CLAUDE.md",...],"commit_sha":"abc1234","pushed":true,"documentation_section":""}` +> +> If no documentation files needed updating, output: +> `{"files_updated":[],"commit_sha":null,"pushed":false,"documentation_section":null}` + +**Parent processing:** + +1. Parse the LAST line of the subagent's output as JSON. +2. Store `documentation_section` — Step 19 embeds it in the PR body (or omits the section if null). +3. If `files_updated` is non-empty, print: `Documentation synced: {files_updated.length} files updated, committed as {commit_sha}`. +4. If `files_updated` is empty, print: `Documentation is current — no updates needed.` + +**If the subagent fails or returns invalid JSON:** Print a warning and proceed to Step 19 without a `## Documentation` section. Do not block /ship on subagent failure. The user can run `/document-release` manually after the PR lands. + +--- + +## Step 19: Create PR/MR **Idempotency check:** Check if a PR/MR already exists for this branch. @@ -2407,7 +2483,7 @@ gh pr view --json url,number,state -q 'if .state == "OPEN" then "PR #\(.number): glab mr view -F json 2>/dev/null | jq -r 'if .state == "opened" then "MR_EXISTS" else "NO_MR" end' 2>/dev/null || echo "NO_MR" ``` -If an **open** PR/MR already exists: **update** the PR body using `gh pr edit --body "..."` (GitHub) or `glab mr update -d "..."` (GitLab). Always regenerate the PR body from scratch using this run's fresh results (test output, coverage audit, review findings, adversarial review, TODOS summary). Never reuse stale PR body content from a prior run. Print the existing URL and continue to Step 8.5. +If an **open** PR/MR already exists: **update** the PR body using `gh pr edit --body "..."` (GitHub) or `glab mr update -d "..."` (GitLab). Always regenerate the PR body from scratch using this run's fresh results (test output, coverage audit, review findings, adversarial review, TODOS summary, documentation_section from Step 18). Never reuse stale PR body content from a prior run. Print the existing URL and continue to Step 20. If no PR/MR exists: create a pull request (GitHub) or merge request (GitLab) using the platform detected in Step 0. @@ -2423,11 +2499,11 @@ must appear in at least one section. If a commit's work isn't reflected in the s you missed it.> ## Test Coverage - - + + ## Pre-Landing Review - + ## Design Review @@ -2439,19 +2515,19 @@ you missed it.> ## Greptile Review - + ## Scope Drift ## Plan Completion - + ## Verification Results - + @@ -2461,6 +2537,10 @@ you missed it.> +## Documentation + + + ## Test plan - [x] All Rails tests pass (N runs, 0 failures) - [x] All Vitest tests pass (N tests) @@ -2489,34 +2569,11 @@ EOF **If neither CLI is available:** Print the branch name, remote URL, and instruct the user to create the PR/MR manually via the web UI. Do not stop — the code is pushed and ready. -**Output the PR/MR URL** — then proceed to Step 8.5. +**Output the PR/MR URL** — then proceed to Step 20. --- -## Step 8.5: Auto-invoke /document-release - -After the PR is created, automatically sync project documentation. Read the -`document-release/SKILL.md` skill file (adjacent to this skill's directory) and -execute its full workflow: - -1. Read the `/document-release` skill: `cat ${CLAUDE_SKILL_DIR}/../document-release/SKILL.md` -2. Follow its instructions — it reads all .md files in the project, cross-references - the diff, and updates anything that drifted (README, ARCHITECTURE, CONTRIBUTING, - CLAUDE.md, TODOS, etc.) -3. If any docs were updated, commit the changes and push to the same branch: - ```bash - git add -A && git commit -m "docs: sync documentation with shipped changes" && git push - ``` -4. If no docs needed updating, say "Documentation is current — no updates needed." - -This step is automatic. Do not ask the user for confirmation. The goal is zero-friction -doc updates — the user runs `/ship` and documentation stays current without a separate command. - -If Step 8.5 created a docs commit, re-edit the PR/MR body to include the latest commit SHA in the summary. This ensures the PR body reflects the truly final state after document-release. - ---- - -## Step 8.75: Persist ship metrics +## Step 20: Persist ship metrics Log coverage and plan completion data so `/retro` can track trends: @@ -2531,10 +2588,10 @@ echo '{"skill":"ship","timestamp":"'"$(date -u +%Y-%m-%dT%H:%M:%SZ)"'","coverage ``` Substitute from earlier steps: -- **COVERAGE_PCT**: coverage percentage from Step 3.4 diagram (integer, or -1 if undetermined) -- **PLAN_TOTAL**: total plan items extracted in Step 3.45 (0 if no plan file) -- **PLAN_DONE**: count of DONE + CHANGED items from Step 3.45 (0 if no plan file) -- **VERIFY_RESULT**: "pass", "fail", or "skipped" from Step 3.47 +- **COVERAGE_PCT**: coverage percentage from Step 7 diagram (integer, or -1 if undetermined) +- **PLAN_TOTAL**: total plan items extracted in Step 8 (0 if no plan file) +- **PLAN_DONE**: count of DONE + CHANGED items from Step 8 (0 if no plan file) +- **VERIFY_RESULT**: "pass", "fail", or "skipped" from Step 8.1 - **VERSION**: from the VERSION file - **BRANCH**: current branch name @@ -2553,6 +2610,6 @@ This step is automatic — never skip it, never ask for confirmation. - **Split commits for bisectability** — each commit = one logical change. - **TODOS.md completion detection must be conservative.** Only mark items as completed when the diff clearly shows the work is done. - **Use Greptile reply templates from greptile-triage.md.** Every reply includes evidence (inline diff, code references, re-rank suggestion). Never post vague replies. -- **Never push without fresh verification evidence.** If code changed after Step 3 tests, re-run before pushing. -- **Step 3.4 generates coverage tests.** They must pass before committing. Never commit failing tests. +- **Never push without fresh verification evidence.** If code changed after Step 5 tests, re-run before pushing. +- **Step 7 generates coverage tests.** They must pass before committing. Never commit failing tests. - **The goal is: user says `/ship`, next thing they see is the review + PR URL + auto-synced docs.** diff --git a/test/gen-skill-docs.test.ts b/test/gen-skill-docs.test.ts index a555104d..2e0814ae 100644 --- a/test/gen-skill-docs.test.ts +++ b/test/gen-skill-docs.test.ts @@ -752,13 +752,13 @@ describe('TEST_COVERAGE_AUDIT placeholders', () => { test('ship SKILL.md contains review army specialist dispatch', () => { expect(shipSkill).toContain('Specialist Dispatch'); - expect(shipSkill).toContain('Step 3.55'); - expect(shipSkill).toContain('Step 3.56'); + expect(shipSkill).toContain('Step 9.1'); + expect(shipSkill).toContain('Step 9.2'); }); test('ship SKILL.md contains cross-review finding dedup', () => { expect(shipSkill).toContain('Cross-review finding dedup'); - expect(shipSkill).toContain('Step 3.57'); + expect(shipSkill).toContain('Step 9.3'); }); test('ship SKILL.md contains re-run idempotency behavior', () => { @@ -839,7 +839,7 @@ describe('PLAN_COMPLETION_AUDIT placeholders', () => { test('ship SKILL.md contains plan completion audit step', () => { expect(shipSkill).toContain('Plan Completion Audit'); - expect(shipSkill).toContain('Step 3.45'); + expect(shipSkill).toContain('Step 8'); }); test('review SKILL.md contains plan completion in scope drift', () => { @@ -888,7 +888,7 @@ describe('PLAN_VERIFICATION_EXEC placeholder', () => { const shipSkill = fs.readFileSync(path.join(ROOT, 'ship', 'SKILL.md'), 'utf-8'); test('ship SKILL.md contains plan verification step', () => { - expect(shipSkill).toContain('Step 3.47'); + expect(shipSkill).toContain('Step 8.1'); expect(shipSkill).toContain('Plan Verification'); }); @@ -946,7 +946,7 @@ describe('Ship metrics logging', () => { const shipSkill = fs.readFileSync(path.join(ROOT, 'ship', 'SKILL.md'), 'utf-8'); test('ship SKILL.md contains metrics persistence step', () => { - expect(shipSkill).toContain('Step 8.75'); + expect(shipSkill).toContain('Step 20'); expect(shipSkill).toContain('coverage_pct'); expect(shipSkill).toContain('plan_items_total'); expect(shipSkill).toContain('plan_items_done'); diff --git a/test/skill-validation.test.ts b/test/skill-validation.test.ts index c78c1873..6515d08b 100644 --- a/test/skill-validation.test.ts +++ b/test/skill-validation.test.ts @@ -1005,7 +1005,7 @@ describe('Test Bootstrap ({{TEST_BOOTSTRAP}}) integration', () => { test('TEST_BOOTSTRAP appears in ship/SKILL.md', () => { const content = fs.readFileSync(path.join(ROOT, 'ship', 'SKILL.md'), 'utf-8'); expect(content).toContain('Test Framework Bootstrap'); - expect(content).toContain('Step 2.5'); + expect(content).toContain('Step 4'); }); test('TEST_BOOTSTRAP appears in design-review/SKILL.md', () => { @@ -1100,9 +1100,9 @@ describe('Phase 8e.5 regression test generation', () => { // --- Step 3.4 coverage audit validation --- describe('Step 3.4 test coverage audit', () => { - test('ship/SKILL.md contains Step 3.4', () => { + test('ship/SKILL.md contains Step 7', () => { const content = fs.readFileSync(path.join(ROOT, 'ship', 'SKILL.md'), 'utf-8'); - expect(content).toContain('Step 3.4: Test Coverage Audit'); + expect(content).toContain('Step 7: Test Coverage Audit'); expect(content).toContain('CODE PATH COVERAGE'); }); @@ -1127,7 +1127,7 @@ describe('Step 3.4 test coverage audit', () => { test('ship rules include test generation rule', () => { const content = fs.readFileSync(path.join(ROOT, 'ship', 'SKILL.md'), 'utf-8'); - expect(content).toContain('Step 3.4 generates coverage tests'); + expect(content).toContain('Step 7 generates coverage tests'); expect(content).toContain('Never commit failing tests'); }); @@ -1161,6 +1161,53 @@ describe('Step 3.4 test coverage audit', () => { }); }); +// --- Ship step numbering regression guard --- + +describe('ship step numbering', () => { + // Allowed sub-steps that are resolver-generated and intentionally nested: + // 8.1 (Plan Verification), 8.2 (Scope Drift), 9.1 (Review Army), 9.2 (Findings Merge), 9.3 (Cross-review dedup) + const ALLOWED_SUBSTEPS = new Set(['8.1', '8.2', '9.1', '9.2', '9.3']); + + test('ship/SKILL.md.tmpl contains no unexpected fractional step numbers', () => { + const tmpl = fs.readFileSync(path.join(ROOT, 'ship', 'SKILL.md.tmpl'), 'utf-8'); + // Match "Step X.Y" where X.Y is a decimal step reference (e.g., "Step 3.47", "Step 8.1") + const matches = Array.from(tmpl.matchAll(/Step (\d+\.\d+)/g)); + const violations = matches + .map((m) => m[1]) + .filter((n) => !ALLOWED_SUBSTEPS.has(n)); + if (violations.length > 0) { + const unique = Array.from(new Set(violations)).sort(); + throw new Error( + `ship/SKILL.md.tmpl contains fractional step numbers that are not in the allowed sub-step list.\n` + + ` Found: ${unique.join(', ')}\n` + + ` Allowed sub-steps: ${Array.from(ALLOWED_SUBSTEPS).sort().join(', ')}\n` + + ` Fix: use clean integer step numbers (1-20), or add to ALLOWED_SUBSTEPS if intentional.` + ); + } + }); + + test('ship/SKILL.md main headings use clean integer step numbers', () => { + const skill = fs.readFileSync(path.join(ROOT, 'ship', 'SKILL.md'), 'utf-8'); + // Headings like "## Step 7: Test Coverage Audit" — NOT sub-steps like "## Step 8.1:" + const headings = Array.from(skill.matchAll(/^## Step (\d+(?:\.\d+)?):/gm)).map( + (m) => m[1] + ); + const fractional = headings.filter((n) => n.includes('.')); + const unexpected = fractional.filter((n) => !ALLOWED_SUBSTEPS.has(n)); + expect(unexpected).toEqual([]); + }); + + test('review/SKILL.md step numbers unchanged (regression guard for resolver conditionals)', () => { + const skill = fs.readFileSync(path.join(ROOT, 'review', 'SKILL.md'), 'utf-8'); + // /review uses its own fractional numbering: 1.5, 2.5, 4.5, 5.5, 5.6, 5.7, 5.8 + // If the ship-side renumber accidentally touched the review-side of resolver conditionals, + // these would vanish. This test catches that. + expect(skill).toContain('## Step 1.5: Scope Drift Detection'); + expect(skill).toContain('## Step 4.5: Review Army'); + expect(skill).toContain('## Step 5.7: Adversarial review'); + }); +}); + // --- Retro test health validation --- describe('Retro test health tracking', () => {