From e6fd776a37ff4f92aa1d058ca975de5d2ece9187 Mon Sep 17 00:00:00 2001 From: Garry Tan Date: Sun, 26 Apr 2026 04:36:51 -0700 Subject: [PATCH] feat(test): 3 periodic-tier real-PTY E2E tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit skill-e2e-plan-ceo-mode-routing.test.ts (~$3/run, 6-10 min/case): - Verifies AUQ answer routing: HOLD SCOPE → rigor/bulletproof posture language; SCOPE EXPANSION → expansion/10x/dream language. Each case navigates 8-12 prior AUQs (telemetry, proactive, routing, vendoring, brain, office-hours, premise, approach) before hitting Step 0F. - Periodic, not gate: navigation phase too slow for PR-blocking. V2 expansion to 4 modes (SELECTIVE + REDUCTION) when nav is faster. skill-e2e-ship-idempotency.test.ts (~$3/run, 5-10 min): - Builds a real git fixture with VERSION 0.0.2 already bumped, matching package.json, CHANGELOG entry, pushed to a local bare remote. Runs /ship in plan mode and asserts STATE: ALREADY_BUMPED echoes from the Step 12 idempotency check, OR plan_ready terminates without mutation. - Snapshots VERSION + package.json + CHANGELOG entry count + commit count + branch HEAD before/after; fails if any changed. skill-e2e-autoplan-chain.test.ts (~$8/run, 12-18 min): - Asserts /autoplan phases run sequentially: tees timestamps as each "**Phase N complete.**" marker first appears. Phase 1 (CEO) must precede Phase 3 (Eng); Phase 2 (Design) is optional but if it appears, must sit between 1 and 3. - Auto-grants permission dialogs that fire during phase transitions. All three auto-handle permission dialogs (preamble side-effects on fresh user envs without .feature-prompted-* markers). 
Co-Authored-By: Claude Opus 4.7 (1M context)
---
 test/skill-e2e-autoplan-chain.test.ts        | 176 ++++++++++++
 test/skill-e2e-plan-ceo-mode-routing.test.ts | 204 ++++++++++++++
 test/skill-e2e-ship-idempotency.test.ts      | 271 +++++++++++++++++++
 3 files changed, 651 insertions(+)
 create mode 100644 test/skill-e2e-autoplan-chain.test.ts
 create mode 100644 test/skill-e2e-plan-ceo-mode-routing.test.ts
 create mode 100644 test/skill-e2e-ship-idempotency.test.ts

diff --git a/test/skill-e2e-autoplan-chain.test.ts b/test/skill-e2e-autoplan-chain.test.ts
new file mode 100644
index 00000000..adf85803
--- /dev/null
+++ b/test/skill-e2e-autoplan-chain.test.ts
@@ -0,0 +1,176 @@
+/**
+ * /autoplan cross-skill chain (periodic, paid, real-PTY).
+ *
+ * Asserts: when /autoplan runs against a plan fixture, the phase markers
+ * the autoplan template emits appear in the correct order:
+ *
+ *   "**Phase 1 complete." (CEO) →
+ *   "**Phase 2 complete." (Design — only if UI scope detected) →
+ *   "**Phase 3 complete." (Eng) →
+ *   "**Phase 3.5 complete." (DX — optional, skipped if no DX scope)
+ *
+ * Why this exists: each individual phase has its own plan-mode smoke
+ * test. Nothing verifies the SEQUENCING — that phases don't run in
+ * parallel, that Phase 3 doesn't start before Phase 1 ends, that
+ * conditional phases (Design, DX) are skipped when their scope is absent.
+ * A regression where the autoplan template wires phases concurrently
+ * would not be caught by per-phase tests.
+ *
+ * Approach: tee timestamps as each "**Phase N complete." marker first
+ * appears in the visible buffer. Assert observed ordering. Phase 2 is
+ * optional — UI-heavy fixture should make it run; backend-only fixtures
+ * should make it skip.
+ *
+ * Cost: ~$5-8/run, 10-15 min wall clock. Periodic — runs weekly.
+ */
+
+import { describe, test, expect } from 'bun:test';
+import { spawnSync } from 'child_process';
+import * as fs from 'fs';
+import * as path from 'path';
+import * as os from 'os';
+import {
+  launchClaudePty,
+  isPlanReadyVisible,
+  isPermissionDialogVisible,
+  isNumberedOptionListVisible,
+} from './helpers/claude-pty-runner';
+
+const shouldRun = !!process.env.EVALS && process.env.EVALS_TIER === 'periodic';
+const describeE2E = shouldRun ? describe : describe.skip;
+
+const ROOT = path.resolve(import.meta.dir, '..');
+const UI_FIXTURE = path.join(ROOT, 'test', 'fixtures', 'plans', 'ui-heavy-feature.md');
+
+interface PhaseHit {
+  phase: number;
+  ts: number; // first-seen time in ms, strictly increasing in discovery order
+}
+
+describeE2E('/autoplan chain ordering (periodic)', () => {
+  test(
+    'phases run sequentially: Phase 1 (CEO) before Phase 3 (Eng), Phase 2 (Design) between when present',
+    async () => {
+      // UI-heavy fixture so Phase 2 runs.
+      const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'gstack-autoplan-chain-'));
+      try {
+        const gitRun = (args: string[]) =>
+          spawnSync('git', args, { cwd: tempDir, stdio: 'pipe', timeout: 5000 });
+        gitRun(['init', '-b', 'main']);
+        gitRun(['config', 'user.email', 'test@test.com']);
+        gitRun(['config', 'user.name', 'Test']);
+
+        const plansDir = path.join(tempDir, '.claude', 'plans');
+        fs.mkdirSync(plansDir, { recursive: true });
+        fs.copyFileSync(UI_FIXTURE, path.join(plansDir, 'ui-heavy-feature.md'));
+        fs.writeFileSync(path.join(tempDir, 'README.md'), '# Autoplan chain fixture\n');
+        gitRun(['add', '.']);
+        gitRun(['commit', '-m', 'init UI-heavy fixture']);
+
+        const session = await launchClaudePty({
+          permissionMode: 'plan',
+          cwd: tempDir,
+          timeoutMs: 1_080_000, // 18 min, slightly above test budget
+        });
+
+        const hits: PhaseHit[] = [];
+        let outcome: 'chain_complete' | 'plan_ready' | 'timeout' | 'exited' = 'timeout';
+        let evidence = '';
+
+        try {
+          await Bun.sleep(8000);
+          const since = session.mark();
+          session.send('/autoplan\r');
+
+          const budgetMs = 900_000; // 15 min
+          const start = Date.now();
+          // Phase markers in autoplan/SKILL.md (lines 1126, 1211, 1331, 1437):
+          // "**Phase 1 complete." / "**Phase 2 complete." / "**Phase 3 complete." / "**Phase 3.5 complete."
+          const phasePattern = /\*\*Phase\s+(\d+(?:\.\d+)?)\s+complete\.?\*\*/g;
+
+          let lastPermSig = '';
+          while (Date.now() - start < budgetMs) {
+            await Bun.sleep(5000);
+            const visible = session.visibleSince(since);
+            evidence = visible.slice(-3000); // refreshed every poll so a timeout still reports context
+            if (session.exited()) {
+              outcome = 'exited';
+              break;
+            }
+
+            // Auto-grant any permission dialog so autoplan can keep moving
+            // through its phases. The autoplan template auto-decides AUQs
+            // it owns; only permission prompts (file/tool grants) need our
+            // hand-pressing. Classify on tail to avoid stale matches.
+            const recentTail = visible.slice(-1500);
+            if (isNumberedOptionListVisible(recentTail) && isPermissionDialogVisible(recentTail)) {
+              const sig = visible.slice(-500);
+              if (sig !== lastPermSig) {
+                lastPermSig = sig;
+                session.send('1\r');
+                await Bun.sleep(2000);
+                continue;
+              }
+            }
+
+            // Re-scan for phase markers we haven't recorded yet. Markers found in
+            // the same poll share a millisecond, so ts is forced strictly upward.
+            phasePattern.lastIndex = 0;
+            let m: RegExpExecArray | null;
+            while ((m = phasePattern.exec(visible)) !== null) {
+              const phaseNum = parseFloat(m[1] ?? '0');
+              if (Number.isNaN(phaseNum)) continue;
+              if (hits.some(h => h.phase === phaseNum)) continue;
+              const prevTs = hits[hits.length - 1]?.ts ?? 0;
+              hits.push({ phase: phaseNum, ts: Math.max(Date.now(), prevTs + 1) });
+            }
+
+            // Terminal: Phase 3 (Eng) seen — chain reached the required end.
+            if (hits.some(h => h.phase === 3)) {
+              outcome = 'chain_complete';
+              break;
+            }
+
+            // Plan-ready as a fallback terminal — autoplan finished without
+            // surfacing a Phase 3 marker. This is a regression surface.
+            if (isPlanReadyVisible(visible)) {
+              outcome = 'plan_ready';
+              break;
+            }
+          }
+        } finally {
+          await session.close();
+        }
+
+        if (outcome === 'exited' || outcome === 'timeout') {
+          throw new Error(
+            `autoplan chain test FAILED: outcome=${outcome}, hits=${JSON.stringify(hits)}\n` +
+              `--- evidence (last 3KB) ---\n${evidence}`,
+          );
+        }
+
+        // Phase 3 (Eng) MUST have been seen.
+        const ceo = hits.find(h => h.phase === 1);
+        const design = hits.find(h => h.phase === 2);
+        const eng = hits.find(h => h.phase === 3);
+        if (!ceo || !eng) {
+          throw new Error(
+            `Required phase markers missing. Saw: ${JSON.stringify(hits)}\n` +
+              `--- evidence ---\n${evidence}`,
+          );
+        }
+
+        // Sequencing: CEO must end before Eng ends. Design (if observed)
+        // must end after CEO and before Eng.
+        expect(ceo.ts).toBeLessThan(eng.ts);
+        if (design) {
+          expect(design.ts).toBeGreaterThan(ceo.ts);
+          expect(design.ts).toBeLessThan(eng.ts);
+        }
+      } finally {
+        try { fs.rmSync(tempDir, { recursive: true, force: true }); } catch { /* ignore */ }
+      }
+    },
+    1_200_000, // 20 min absolute test ceiling
+  );
+});
diff --git a/test/skill-e2e-plan-ceo-mode-routing.test.ts b/test/skill-e2e-plan-ceo-mode-routing.test.ts
new file mode 100644
index 00000000..adb75449
--- /dev/null
+++ b/test/skill-e2e-plan-ceo-mode-routing.test.ts
@@ -0,0 +1,204 @@
+/**
+ * /plan-ceo-review mode-routing E2E (periodic, paid, real-PTY).
+ *
+ * Asserts: when /plan-ceo-review reaches its Step 0F mode-selection
+ * AskUserQuestion and the user picks HOLD SCOPE or SCOPE EXPANSION,
+ * the downstream rendered output reflects that mode's distinctive
+ * posture language.
+ *
+ * Why this exists: existing tests verify that the question fires. Nothing
+ * verifies the answer actually routes. A regression where Step 0F shows
+ * the question but the agent ignores the choice (e.g. always defaults
+ * to EXPANSION) would not be caught by any prior test.
+ *
+ * Tier: periodic (not gate).
Each run navigates 8-12 prior AUQs (telemetry,
+ * proactive, routing, vendoring, brain, office-hours, premise×3, approach)
+ * before reaching Step 0F. At ~30s per AUQ that's a 4-6 min navigation
+ * phase per case. The full 2-case suite runs ~12-15 min, $3-4. Too slow
+ * for gate-tier; weekly is fine.
+ *
+ * Mode coverage: HOLD SCOPE + SCOPE EXPANSION cover the two posture poles
+ * (rigor vs ambition). SELECTIVE EXPANSION and SCOPE REDUCTION are V2 once
+ * the navigation phase is shorter or has a deterministic fast-path through
+ * Step 0A/0C-bis.
+ *
+ * Posture assertions: each mode has distinct downstream language. The
+ * checks below are deliberately permissive — they catch the binary
+ * "did the mode posture even apply" question, not Opus-specific phrasing.
+ *
+ *   HOLD SCOPE        — "rigor" or "bulletproof" or "hold scope"
+ *   SCOPE EXPANSION   — "expansion" or "10x" or "delight" or "dream"
+ */
+
+import { describe, test } from 'bun:test';
+import {
+  launchClaudePty,
+  isNumberedOptionListVisible,
+  isPermissionDialogVisible,
+  parseNumberedOptions,
+  isPlanReadyVisible,
+  type ClaudePtySession,
+} from './helpers/claude-pty-runner';
+
+const shouldRun = !!process.env.EVALS && process.env.EVALS_TIER === 'periodic';
+const describeE2E = shouldRun ? describe : describe.skip;
+
+const MODE_RE = /HOLD SCOPE|SCOPE EXPANSION|SELECTIVE EXPANSION|SCOPE REDUCTION/i;
+
+interface ModeCase {
+  mode: 'HOLD SCOPE' | 'SCOPE EXPANSION';
+  /** Regex applied to visible-since-mode-pick text. At least one must match. */
+  postureRe: RegExp;
+}
+
+// NOTE(review): "hold scope" and "expansion" also appear in the Step 0F option
+// labels. If the TUI re-renders or echoes the picked label after the pick
+// marker, those alternatives match trivially — confirm on a real transcript.
+const CASES: ModeCase[] = [
+  { mode: 'HOLD SCOPE', postureRe: /\b(rigor|bulletproof|hold\s*scope|maximum\s+rigor)\b/i },
+  { mode: 'SCOPE EXPANSION', postureRe: /\b(expansion|10x|delight|dream|cathedral|opt[\s-]?in)\b/i },
+];
+
+/**
+ * Navigate prior AUQs by picking option 1 until we hit an AUQ whose
+ * options match one of the 4 mode names. Returns the option index
+ * matching `targetMode`, with the buffer marker pointing AT that AUQ.
+ *
+ * Throws if we don't reach the mode AUQ within `maxNav` prior AUQs or
+ * the overall budget.
+ */
+async function navigateToModeAuq(
+  session: ClaudePtySession,
+  since: number,
+  targetMode: ModeCase['mode'],
+  opts: { maxNav?: number; budgetMs?: number } = {},
+): Promise<{ modeIndex: number; visibleAtMode: string }> {
+  // /plan-ceo-review's mode AUQ (Step 0F) sits behind several preamble
+  // and Step 0A-0C-bis gates: telemetry, proactive, routing, vendoring,
+  // brain privacy, office-hours offer, premise challenge (3 questions),
+  // approach selection. 12 hops is the conservative ceiling.
+  const maxNav = opts.maxNav ?? 12;
+  const budgetMs = opts.budgetMs ?? 420_000;
+  const start = Date.now();
+  let priorAnswered = 0;
+  let lastSeenList: Array<{ index: number; label: string }> = [];
+
+  while (Date.now() - start < budgetMs) {
+    if (session.exited()) {
+      throw new Error(
+        `claude exited (code=${session.exitCode()}) during nav.\n` +
+          `Last visible:\n${session.visibleSince(since).slice(-2000)}`,
+      );
+    }
+    await Bun.sleep(2000);
+    const visible = session.visibleSince(since);
+    if (!isNumberedOptionListVisible(visible)) continue;
+    const options = parseNumberedOptions(visible);
+    if (options.length < 2) continue;
+
+    // Has the rendered list changed since last poll? If not, we're seeing
+    // the same prompt and shouldn't double-press.
+    const sig = options.map(o => `${o.index}:${o.label}`).join('|');
+    const lastSig = lastSeenList.map(o => `${o.index}:${o.label}`).join('|');
+    if (sig === lastSig) continue;
+    lastSeenList = options;
+
+    // Is THIS the mode AUQ?
+    if (options.some(o => MODE_RE.test(o.label))) {
+      const target = options.find(o => o.label.toUpperCase().includes(targetMode));
+      if (!target) {
+        throw new Error(
+          `Mode AUQ rendered but target "${targetMode}" not in option labels:\n` +
+            options.map(o => ` ${o.index}. ${o.label}`).join('\n'),
+        );
+      }
+      return { modeIndex: target.index, visibleAtMode: visible };
+    }
+
+    // Permission dialog? Grant with "1" but don't count it against nav budget.
+    // Classify on the recent tail only — old permission text persists in
+    // visibleSince and would re-trigger forever.
+    if (isPermissionDialogVisible(visible.slice(-1500))) {
+      session.send('1\r');
+      await Bun.sleep(1500);
+      continue;
+    }
+
+    // Not the mode AUQ — answer with option 1 (recommended) and continue.
+    if (priorAnswered >= maxNav) {
+      throw new Error(
+        `Navigated ${maxNav} prior AUQs without reaching the mode AUQ. ` +
+          `Last list:\n${options.map(o => ` ${o.index}. ${o.label}`).join('\n')}`,
+      );
+    }
+    priorAnswered++;
+    session.send('1\r');
+    // Give the agent a beat to advance before re-polling.
+    await Bun.sleep(2000);
+  }
+  throw new Error(`Mode AUQ not reached within ${budgetMs}ms`);
+}
+
+describeE2E('/plan-ceo-review mode routing (periodic)', () => {
+  for (const c of CASES) {
+    test(
+      `mode "${c.mode}" routes to its distinctive posture`,
+      async () => {
+        const session = await launchClaudePty({
+          permissionMode: 'plan',
+          timeoutMs: 720_000, // 12 min: above 8s settle + 420s nav + 240s downstream budgets
+        });
+        try {
+          await Bun.sleep(8000);
+          const since = session.mark();
+          session.send('/plan-ceo-review\r');
+
+          const { modeIndex } = await navigateToModeAuq(session, since, c.mode);
+
+          // Snapshot the visible buffer at mode-pick time, then send the index.
+          const sincePick = session.mark();
+          session.send(`${modeIndex}\r`);
+
+          // Wait for downstream evidence: either next AUQ or plan_ready or
+          // a posture-distinctive substring shows up.
+          const budgetMs = 240_000;
+          const start = Date.now();
+          let postureMatched = false;
+          let planReadySeen = false;
+          let downstreamSnapshot = '';
+          while (Date.now() - start < budgetMs) {
+            await Bun.sleep(2500);
+            if (session.exited()) {
+              throw new Error(
+                `claude exited (code=${session.exitCode()}) after mode pick.\n` +
+                  `Downstream:\n${session.visibleSince(sincePick).slice(-2000)}`,
+              );
+            }
+            downstreamSnapshot = session.visibleSince(sincePick);
+            if (c.postureRe.test(downstreamSnapshot)) {
+              postureMatched = true;
+              break;
+            }
+            // Don't bail early on plan_ready alone — the posture text may
+            // arrive as the agent finishes writing the plan. Only break
+            // once we either match posture or run the clock. Just remember
+            // that plan-ready was reached so the failure message can tell
+            // "finished without posture language" from "still working".
+            if (isPlanReadyVisible(downstreamSnapshot)) planReadySeen = true;
+          }
+          if (!postureMatched) {
+            throw new Error(
+              `Mode "${c.mode}" routing FAILED: no posture match for ${c.postureRe.source}.\n` +
+                `plan_ready seen before giving up: ${planReadySeen}\n` +
+                `--- downstream visible since mode pick (last 3KB) ---\n` +
+                downstreamSnapshot.slice(-3000),
+            );
+          }
+        } finally {
+          await session.close();
+        }
+      },
+      780_000, // 13 min test ceiling, above the PTY timeout
+    );
+  }
+});
diff --git a/test/skill-e2e-ship-idempotency.test.ts b/test/skill-e2e-ship-idempotency.test.ts
new file mode 100644
index 00000000..e4e3b049
--- /dev/null
+++ b/test/skill-e2e-ship-idempotency.test.ts
@@ -0,0 +1,271 @@
+/**
+ * /ship idempotency E2E (periodic, paid, real-PTY).
+ *
+ * Asserts: when /ship runs against a branch that has ALREADY been bumped
+ * (VERSION ahead of base AND package.json synced AND a CHANGELOG entry
+ * exists for the bumped version), the workflow:
+ *
+ *   1. Detects ALREADY_BUMPED state via the Step 12 idempotency check
+ *   2. Does NOT echo STATE: FRESH (which would trigger a second bump)
+ *   3. Does NOT mutate the fixture's VERSION file
+ *   4.
Does NOT append a duplicate CHANGELOG [0.0.2] entry
+ *   5. Does NOT create a new "chore: bump version" commit
+ *
+ * Why real-PTY: the existing ship-idempotency test in skill-e2e.test.ts
+ * uses the SDK harness with a synthetic prompt asking the agent to "run
+ * ONLY the idempotency checks." This test exercises the actual /ship
+ * skill end-to-end against a real git fixture so a regression that
+ * silently re-bumps despite the check passing would be caught.
+ *
+ * Plan-mode framing: we run /ship in plan mode so the agent cannot push,
+ * commit, or open PRs. The Step 12 idempotency check is read-only
+ * (reads VERSION + package.json + git rev-parse) and runs fine in plan
+ * mode. The plan-ready output serves as the terminal signal — the agent
+ * has done its analysis and produced a plan describing what it would do.
+ *
+ * If the agent decides to bump or push despite the fixture's
+ * ALREADY_BUMPED state, that intent surfaces in the plan or in
+ * tool-call attempts, which we detect.
+ *
+ * Cost: ~$2-4/run. Periodic tier — long, runs weekly.
+ */
+
+import { describe, test, expect } from 'bun:test';
+import { spawnSync } from 'child_process';
+import * as fs from 'fs';
+import * as path from 'path';
+import * as os from 'os';
+import {
+  launchClaudePty,
+  isPermissionDialogVisible,
+  isNumberedOptionListVisible,
+} from './helpers/claude-pty-runner';
+
+const shouldRun = !!process.env.EVALS && process.env.EVALS_TIER === 'periodic';
+const describeE2E = shouldRun ? describe : describe.skip;
+
+interface ShipFixture {
+  workTree: string;
+  bareRemote: string;
+  /** Full bash log of `git` and helper commands run during setup. */
+  setupLog: string[];
+}
+
+/**
+ * Build a self-contained git fixture representing an already-shipped state:
+ *   - main branch at VERSION 0.0.1, with one CHANGELOG entry [0.0.1]
+ *   - feat/already-shipped branch at VERSION 0.0.2 (bumped + synced),
+ *     CHANGELOG has [0.0.2] entry on top of [0.0.1], one feature commit
+ *   - bareRemote is the origin; both branches are pushed
+ *
+ * Returns the work-tree dir for /ship to operate on.
+ */
+function buildShippedFixture(): ShipFixture {
+  const root = fs.mkdtempSync(path.join(os.tmpdir(), 'gstack-ship-fixture-'));
+  const workTree = path.join(root, 'workspace');
+  const bareRemote = path.join(root, 'origin.git');
+  fs.mkdirSync(workTree, { recursive: true });
+
+  const setupLog: string[] = [];
+  const sh = (cmd: string, cwd: string): void => {
+    setupLog.push(`[${cwd}] ${cmd}`);
+    const result = spawnSync('bash', ['-c', cmd], { cwd, stdio: 'pipe', timeout: 15_000 });
+    if (result.status !== 0) {
+      // status is null when bash failed to spawn or timed out; stderr can be empty then.
+      const stderr = result.stderr?.toString() || result.error?.message || '';
+      // Best-effort: don't leave a half-built fixture behind in the tmp dir.
+      try { fs.rmSync(root, { recursive: true, force: true }); } catch { /* ignore */ }
+      throw new Error(`fixture setup failed at "${cmd}":\n${stderr}\n--- log ---\n${setupLog.join('\n')}`);
+    }
+  };
+
+  // Bare remote.
+  sh(`git init --bare "${bareRemote}"`, root);
+
+  // Initial commit on main.
+  sh('git init -b main', workTree);
+  sh('git config user.email "test@test.com"', workTree);
+  sh('git config user.name "Test"', workTree);
+  sh('git config commit.gpgsign false', workTree);
+
+  fs.writeFileSync(path.join(workTree, 'VERSION'), '0.0.1\n');
+  fs.writeFileSync(
+    path.join(workTree, 'package.json'),
+    JSON.stringify({ name: 'fixture', version: '0.0.1', private: true }, null, 2) + '\n',
+  );
+  fs.writeFileSync(
+    path.join(workTree, 'CHANGELOG.md'),
+    `# Changelog\n\n## [0.0.1] - 2026-01-01\n\n- Initial release\n`,
+  );
+  fs.writeFileSync(path.join(workTree, 'README.md'), '# Fixture\n');
+
+  sh('git add VERSION package.json CHANGELOG.md README.md', workTree);
+  sh('git commit -m "chore: initial release v0.0.1"', workTree);
+  sh(`git remote add origin "${bareRemote}"`, workTree);
+  sh('git push -u origin main', workTree);
+
+  // Feature branch with ALREADY_BUMPED state.
+  sh('git checkout -b feat/already-shipped', workTree);
+  fs.writeFileSync(path.join(workTree, 'VERSION'), '0.0.2\n');
+  fs.writeFileSync(
+    path.join(workTree, 'package.json'),
+    JSON.stringify({ name: 'fixture', version: '0.0.2', private: true }, null, 2) + '\n',
+  );
+  fs.writeFileSync(
+    path.join(workTree, 'CHANGELOG.md'),
+    `# Changelog\n\n## [0.0.2] - 2026-04-25\n\n**Feature shipped.**\n\nAdded the new feature.\n\n## [0.0.1] - 2026-01-01\n\n- Initial release\n`,
+  );
+  fs.writeFileSync(path.join(workTree, 'feature.md'), '# Feature\n\nAlready shipped.\n');
+
+  sh('git add VERSION package.json CHANGELOG.md feature.md', workTree);
+  sh('git commit -m "feat: add new feature\n\nbumps VERSION to 0.0.2"', workTree);
+  sh('git push -u origin feat/already-shipped', workTree);
+
+  return { workTree, bareRemote, setupLog };
+}
+
+/** Snapshot the load-bearing fixture state so we can compare post-run. */
+interface FixtureSnapshot {
+  versionFile: string;
+  packageVersion: string;
+  changelogEntryCount: number;
+  bumpCommitCount: number;
+  branchHead: string;
+}
+
+/**
+ * Capture the state /ship must leave untouched: VERSION contents, the
+ * package.json version, the count of `## [0.0.2]` CHANGELOG headings, the
+ * count of "chore: bump version" subjects in main..HEAD, and the HEAD sha.
+ */
+function snapshotFixture(workTree: string): FixtureSnapshot {
+  const versionFile = fs.readFileSync(path.join(workTree, 'VERSION'), 'utf-8').trim();
+  const pkg = JSON.parse(fs.readFileSync(path.join(workTree, 'package.json'), 'utf-8'));
+  const changelog = fs.readFileSync(path.join(workTree, 'CHANGELOG.md'), 'utf-8');
+  // Count `## [0.0.2]` headings — should stay at 1 across re-runs.
+  const changelogEntryCount = (changelog.match(/^##\s*\[0\.0\.2\]/gm) ?? []).length;
+  const head = spawnSync('git', ['rev-parse', 'HEAD'], { cwd: workTree, stdio: 'pipe' });
+  const branchHead = head.stdout?.toString().trim() ?? '';
+  // Count "chore: bump version" commits on this branch since main.
+  const log = spawnSync(
+    'git', ['log', '--format=%s', 'main..HEAD'],
+    { cwd: workTree, stdio: 'pipe' },
+  );
+  const subjects = log.stdout?.toString() ?? '';
+  const bumpCommitCount = subjects.split('\n').filter(s => /chore:\s*bump\s+version/i.test(s)).length;
+  return { versionFile, packageVersion: pkg.version, changelogEntryCount, bumpCommitCount, branchHead };
+}
+
+describeE2E('/ship idempotency E2E (periodic, real-PTY)', () => {
+  test(
+    'rerunning /ship on an already-shipped branch detects ALREADY_BUMPED and does not mutate fixture',
+    async () => {
+      const fixture = buildShippedFixture();
+      // try/finally from here on: the fixture root is removed even if launch or snapshotting throws.
+      try {
+        const before = snapshotFixture(fixture.workTree);
+
+        const session = await launchClaudePty({
+          permissionMode: 'plan',
+          cwd: fixture.workTree,
+          timeoutMs: 720_000,
+          // Disable network-y pieces so the agent can't reach actual github.
+          env: { GH_TOKEN: 'mock-not-real', NO_COLOR: '1' },
+        });
+
+        let outcome: 'detected' | 'plan_ready' | 'attempted_mutation' | 'timeout' | 'exited' = 'timeout';
+        let evidence = '';
+
+        try {
+          await Bun.sleep(8000);
+          const since = session.mark();
+          session.send('/ship\r');
+
+          const budgetMs = 600_000;
+          const start = Date.now();
+          let lastPermSig = '';
+          while (Date.now() - start < budgetMs) {
+            await Bun.sleep(3000);
+            const visible = session.visibleSince(since);
+            evidence = visible.slice(-3000); // refreshed every poll so a timeout still reports context
+            if (session.exited()) {
+              outcome = 'exited';
+              break;
+            }
+
+            // Auto-grant any permission dialogs the preamble triggers
+            // (e.g. touch on a marker file claude considers sensitive).
+            // Classify on the recent tail; don't double-press the same render.
+            const tail = visible.slice(-1500);
+            if (isNumberedOptionListVisible(tail) && isPermissionDialogVisible(tail)) {
+              const sig = visible.slice(-500);
+              if (sig !== lastPermSig) {
+                lastPermSig = sig;
+                session.send('1\r');
+                await Bun.sleep(1500);
+                continue;
+              }
+            }
+
+            // Positive: the idempotency-check echoed ALREADY_BUMPED.
+            if (/STATE:\s*ALREADY_BUMPED/.test(visible)) {
+              outcome = 'detected';
+              break;
+            }
+
+            // Negative regressions:
+            //   - bump-action bash block ran (would echo on FRESH path)
+            //   - agent attempted git commit -m "chore: bump version"
+            //   - agent attempted git push
+            //   - agent rendered an Edit/Write to CHANGELOG.md or VERSION (acceptable in plan mode but flagged here)
+            if (
+              /STATE:\s*FRESH(?![\w-])/i.test(visible) ||
+              /git\s+commit\s+.*chore:\s*bump\s+version/i.test(visible) ||
+              /git\s+push.*origin/i.test(visible)
+            ) {
+              outcome = 'attempted_mutation';
+              break;
+            }
+
+            // Plan-ready outcome (acceptable terminal): the agent finished
+            // analysis. We'll accept this if no mutation signals showed up.
+            if (/ready to execute|Would you like to proceed/i.test(visible)) {
+              outcome = 'plan_ready';
+              break;
+            }
+          }
+        } finally {
+          await session.close();
+        }
+
+        // Verify fixture was not mutated regardless of outcome.
+        const after = snapshotFixture(fixture.workTree);
+
+        if (outcome === 'attempted_mutation') {
+          throw new Error(
+            `/ship attempted to mutate already-shipped state.\n` +
+              `--- evidence (last 3KB) ---\n${evidence}\n` +
+              `--- before ---\n${JSON.stringify(before, null, 2)}\n` +
+              `--- after ---\n${JSON.stringify(after, null, 2)}`,
+          );
+        }
+        if (outcome === 'exited') {
+          throw new Error(`claude exited unexpectedly.\n--- evidence ---\n${evidence}`);
+        }
+        if (outcome === 'timeout') {
+          throw new Error(
+            `Timed out before any terminal outcome.\n--- evidence (last 3KB) ---\n${evidence}`,
+          );
+        }
+        // Detected or plan_ready — both are acceptable terminal outcomes.
+        expect(['detected', 'plan_ready']).toContain(outcome);
+        // Fixture must not have been mutated regardless of outcome.
+        expect(after).toEqual(before);
+      } finally {
+        // Clean up fixture root.
+        try { fs.rmSync(path.dirname(fixture.workTree), { recursive: true, force: true }); } catch { /* ignore */ }
+      }
+    },
+    900_000, // 15 min wall clock
+  );
+});