diff --git a/test/helpers/touchfiles.ts b/test/helpers/touchfiles.ts index 35f82dee8..5e6c1ecad 100644 --- a/test/helpers/touchfiles.ts +++ b/test/helpers/touchfiles.ts @@ -120,7 +120,8 @@ export const E2E_TOUCHFILES: Record = { 'plan-ceo-mode-routing': ['plan-ceo-review/**', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble.ts', 'test/helpers/claude-pty-runner.ts'], 'plan-design-with-ui-scope': ['plan-design-review/**', 'test/fixtures/plans/ui-heavy-feature.md', 'test/helpers/claude-pty-runner.ts'], 'budget-regression-pty': ['test/helpers/eval-store.ts', 'test/skill-budget-regression.test.ts'], - 'ship-idempotency-pty': ['ship/**', 'bin/gstack-next-version', 'lib/worktree.ts', 'test/helpers/claude-pty-runner.ts'], + 'ship-idempotency-pty': ['ship/**', 'bin/gstack-next-version', 'bin/gstack-version-bump', 'scripts/resolvers/sections.ts', 'lib/worktree.ts', 'test/helpers/claude-pty-runner.ts'], + 'ship-section-loading': ['ship/**', 'scripts/resolvers/sections.ts', 'scripts/gen-skill-docs.ts', 'test/helpers/required-reads.ts', 'test/helpers/transcript-section-logger.ts', 'test/helpers/claude-pty-runner.ts'], 'autoplan-chain-pty': ['autoplan/**', 'plan-ceo-review/**', 'plan-design-review/**', 'plan-eng-review/**', 'plan-devex-review/**', 'test/fixtures/plans/ui-heavy-feature.md', 'test/helpers/claude-pty-runner.ts'], 'e2e-harness-audit': ['plan-ceo-review/**', 'plan-eng-review/**', 'plan-design-review/**', 'plan-devex-review/**', 'scripts/resolvers/preamble/generate-completion-status.ts', 'test/helpers/agent-sdk-runner.ts', 'test/helpers/claude-pty-runner.ts'], @@ -472,6 +473,7 @@ export const E2E_TIERS: Record = { 'plan-design-with-ui-scope': 'gate', // ~$0.80/run 'budget-regression-pty': 'gate', // free, library-only assertion 'ship-idempotency-pty': 'periodic', // ~$3/run, real /ship in plan mode + 'ship-section-loading': 'periodic', // ~$3/run, real /ship; asserts section reads 'autoplan-chain-pty': 'periodic', // ~$8/run, all 3 
phases sequential // Per-finding count + review-report-at-bottom — periodic because each diff --git a/test/skill-e2e-ship-idempotency.test.ts b/test/skill-e2e-ship-idempotency.test.ts index e4e3b049c..daed1f1d7 100644 --- a/test/skill-e2e-ship-idempotency.test.ts +++ b/test/skill-e2e-ship-idempotency.test.ts @@ -197,20 +197,26 @@ describeE2E('/ship idempotency E2E (periodic, real-PTY)', () => { } } - // Positive: the idempotency-check echoed ALREADY_BUMPED. - if (/STATE:\s*ALREADY_BUMPED/.test(visible)) { + // Positive: idempotency classify reported ALREADY_BUMPED. Post-carve + // (T9), Step 12 runs `gstack-version-bump classify` which emits JSON + // (`"state":"ALREADY_BUMPED"`); the legacy inline bash echoed + // `STATE: ALREADY_BUMPED`. Accept either so the test survives the carve. + if (/STATE:\s*ALREADY_BUMPED|"state":\s*"ALREADY_BUMPED"/.test(visible)) { outcome = 'detected'; evidence = visible.slice(-3000); break; } // Negative regressions: - // - bump-action bash block ran (would echo on FRESH path) + // - classify reported FRESH (CLI JSON or legacy echo) → would re-bump // - agent attempted git commit -m "chore: bump version" // - agent attempted git push - // - agent rendered an Edit/Write to CHANGELOG.md or VERSION (acceptable in plan mode but flagged here) + // - agent ran the CLI write path (gstack-version-bump write) — a + // re-bump on an already-shipped branch if ( + /"state":\s*"FRESH"/.test(visible) || /STATE:\s*FRESH(?![\w-])/i.test(visible) || + /gstack-version-bump\s+write/i.test(visible) || /git\s+commit\s+.*chore:\s*bump\s+version/i.test(visible) || /git\s+push.*origin/i.test(visible) ) { diff --git a/test/skill-e2e-ship-section-loading.test.ts b/test/skill-e2e-ship-section-loading.test.ts new file mode 100644 index 000000000..67355ee90 --- /dev/null +++ b/test/skill-e2e-ship-section-loading.test.ts @@ -0,0 +1,120 @@ +/** + * /ship section-loading E2E (periodic, paid, real-PTY) — v2 plan T9 mitigation + * layer 5, the ONLY CI-failing guard 
against silent section-skip. + * + * After the carve, ship is a skeleton whose STOP-Read directives point at + * sections/*.md. This test runs the REAL /ship skill in plan mode against a + * fresh version-changing fixture and asserts the agent actually Read the + * sections its situation requires (review-army + changelog at minimum — every + * version-changing ship needs the pre-landing review and a CHANGELOG entry). + * + * Runs against the INSTALLED skill at ~/.claude/skills/gstack/ship (Codex + * outside-voice #5: an E2E that reads repo paths would miss install-layout + * 404s). Section reads are detected from the PTY scrollback — when the agent + * Reads a section the tool render shows the `sections/<name>.md` path. + * + * Plan-mode framing keeps the agent from committing/pushing; producing a plan + * is the terminal signal. Cost: ~$2-4/run. Periodic tier. + * + * Situation matrix (T1 = B): this file covers the fresh version-changing ship; + * the already-bumped re-run is covered by skill-e2e-ship-idempotency.test.ts, + * and a no-plan-file variant can be added to FIXTURES below. + */ + +import { describe, test, expect } from 'bun:test'; +import { spawnSync } from 'child_process'; +import * as fs from 'fs'; +import * as path from 'path'; +import * as os from 'os'; +import { + launchClaudePty, + isPermissionDialogVisible, + isNumberedOptionListVisible, +} from './helpers/claude-pty-runner'; + +const shouldRun = !!process.env.EVALS && process.env.EVALS_TIER === 'periodic'; +const describeE2E = shouldRun ? describe : describe.skip; + +/** Fresh fixture: feature branch with a real change but VERSION still == base, + * so /ship must bump (FRESH) and walk the full pre-landing + changelog flow. 
/* (end of the fixture overview comment that begins on the preceding line) */
/**
 * Build a throwaway git fixture for a fresh, version-changing ship.
 *
 * Layout created under a new temp directory (`root`):
 *   - `origin.git`  — bare repository registered as the `origin` remote
 *   - `workspace`   — checkout whose `main` holds VERSION / package.json /
 *                     CHANGELOG.md at 0.0.1, with branch `feat/new-thing`
 *                     checked out, one extra commit on it, and pushed.
 *
 * VERSION is intentionally not touched on the feature branch.
 *
 * @returns `workTree` (the checkout) and `root` (the temp directory that
 *   contains both the checkout and the bare remote; delete it to clean up).
 * @throws Error when any setup command exits non-zero, is killed (including
 *   by the 15 s timeout), or cannot be spawned. The temp directory is removed
 *   before the error propagates, so a failed setup leaves nothing behind.
 */
function buildFreshFixture(): { workTree: string; root: string } {
  const root = fs.mkdtempSync(path.join(os.tmpdir(), 'gstack-ship-secload-'));
  try {
    const workTree = path.join(root, 'workspace');
    const bareRemote = path.join(root, 'origin.git');
    fs.mkdirSync(workTree, { recursive: true });

    // Run one bash command line in `cwd`; anything but exit status 0 is fatal.
    const sh = (cmd: string, cwd: string): void => {
      const r = spawnSync('bash', ['-c', cmd], { cwd, stdio: 'pipe', timeout: 15_000 });
      if (r.status === 0) return;
      // On timeout or spawn failure `status` is null and stderr may be empty,
      // so surface `error` / `signal` as well — otherwise the message is blank.
      const detail = [
        r.error ? `spawn error: ${r.error.message}` : '',
        r.signal ? `killed by ${r.signal}` : '',
        r.stderr?.toString() ?? '',
      ]
        .filter((part) => part.length > 0)
        .join('\n');
      throw new Error(`fixture setup failed at "${cmd}":\n${detail}`);
    };

    sh(`git init --bare "${bareRemote}"`, root);
    sh('git init -b main', workTree);
    sh('git config user.email "t@t.com" && git config user.name "T" && git config commit.gpgsign false', workTree);

    // Base commit on main: everything agrees on version 0.0.1.
    fs.writeFileSync(path.join(workTree, 'VERSION'), '0.0.1\n');
    fs.writeFileSync(
      path.join(workTree, 'package.json'),
      JSON.stringify({ name: 'fx', version: '0.0.1', private: true }, null, 2) + '\n',
    );
    fs.writeFileSync(
      path.join(workTree, 'CHANGELOG.md'),
      '# Changelog\n\n## [0.0.1] - 2026-01-01\n\n- Initial release\n',
    );
    fs.writeFileSync(path.join(workTree, 'app.js'), '// base\n');
    sh('git add -A && git commit -m "chore: initial v0.0.1"', workTree);
    sh(`git remote add origin "${bareRemote}" && git push -u origin main`, workTree);

    // Feature branch: a real code change, VERSION untouched → FRESH (needs a bump).
    sh('git checkout -b feat/new-thing', workTree);
    fs.writeFileSync(path.join(workTree, 'app.js'), '// base\nexport function newThing() { return 42; }\n');
    fs.writeFileSync(path.join(workTree, 'app.test.js'), 'test("newThing", () => {});\n');
    sh('git add -A && git commit -m "feat: add newThing"', workTree);
    sh('git push -u origin feat/new-thing', workTree);

    return { workTree, root };
  } catch (err) {
    // Do not leak the half-built fixture; cleanup is best-effort so the
    // original setup error is the one the caller sees.
    try {
      fs.rmSync(root, { recursive: true, force: true });
    } catch {
      /* ignore */
    }
    throw err;
  }
}

// Sections every version-changing ship must consult.
/** Section file names that must show up in the scrollback for the test to pass. */
const REQUIRED_SECTIONS: readonly string[] = ['review-army.md', 'changelog.md'];

/** Matches a section path in the scrollback; capture group 1 is the `.md` file name. */
const SECTION_PATH_RE = /sections\/([A-Za-z0-9._-]+\.md)/g;

/** Any of these strings in the scrollback is treated as "the plan was produced". */
const PLAN_READY_RE = /ready to execute|Would you like to proceed|GSTACK REVIEW REPORT/i;

/** Add every section file name mentioned in `scrollback` to `seen`. */
function recordSectionReads(scrollback: string, seen: Set<string>): void {
  for (const match of scrollback.matchAll(SECTION_PATH_RE)) {
    const fileName = match[1];
    if (fileName !== undefined) seen.add(fileName);
  }
}

describeE2E('/ship section-loading E2E (periodic, real-PTY, installed skill)', () => {
  test(
    'fresh version-changing ship Reads the required sections',
    async () => {
      const { workTree, root } = buildFreshFixture();
      const readSections = new Set<string>();
      let planReady = false;
      let session: Awaited<ReturnType<typeof launchClaudePty>> | undefined;

      try {
        // Launch inside the try so a failed launch still removes the fixture.
        const pty = await launchClaudePty({
          permissionMode: 'plan',
          cwd: workTree,
          timeoutMs: 720_000,
          env: { GH_TOKEN: 'mock-not-real', NO_COLOR: '1' },
        });
        session = pty;

        await Bun.sleep(8000);
        const since = pty.mark();
        pty.send('/ship\r');

        const start = Date.now();
        let lastPermSig = '';
        while (Date.now() - start < 600_000) {
          await Bun.sleep(3000);
          if (pty.exited()) break;

          const visible = pty.visibleSince(since);
          const tail = visible.slice(-1500);

          // Answer a permission dialog once per distinct rendering: the last
          // 500 chars act as a signature so the same dialog is not re-answered.
          if (isNumberedOptionListVisible(tail) && isPermissionDialogVisible(tail)) {
            const sig = visible.slice(-500);
            if (sig !== lastPermSig) {
              lastPermSig = sig;
              pty.send('1\r');
              await Bun.sleep(1500);
              continue;
            }
          }

          // Detect section reads from the scrollback (tool render shows the path).
          recordSectionReads(visible, readSections);
          if (PLAN_READY_RE.test(visible)) {
            planReady = true;
            break;
          }
        }

        // Final sweep: the loop leaves on exit or timeout without looking at
        // what was rendered during the last poll interval.
        // NOTE(review): assumes visibleSince() still returns the captured
        // buffer after the session has exited — confirm in claude-pty-runner.
        const finalView = pty.visibleSince(since);
        recordSectionReads(finalView, readSections);
        if (!planReady && PLAN_READY_RE.test(finalView)) planReady = true;
      } finally {
        try {
          await session?.close();
        } finally {
          // Remove the fixture even when close() throws.
          try {
            fs.rmSync(root, { recursive: true, force: true });
          } catch {
            /* ignore */
          }
        }
      }

      const missing = REQUIRED_SECTIONS.filter((s) => !readSections.has(s));
      expect({ planReady, read: [...readSections], missing }).toEqual({
        planReady: true,
        read: expect.any(Array),
        missing: [],
      });
    },
    900_000,
  );
});