mirror of
https://github.com/garrytan/gstack.git
synced 2026-06-17 07:10:12 +02:00
test(ship): section-loading E2E + idempotency CLI detection (T9)
- skill-e2e-ship-section-loading.test.ts (new, periodic): runs real /ship in plan mode against a fresh version-changing fixture and asserts that the agent Read the required sections (review-army + changelog). Runs against the INSTALLED skill (~/.claude/skills/gstack/ship), not repo paths, so install-layout 404s surface [Codex outside-voice #5]. Layer-5 mechanical guard against silent section-skip.
- skill-e2e-ship-idempotency.test.ts: detection updated for the carve — Step 12 now runs `gstack-version-bump classify` (JSON "state":"ALREADY_BUMPED") instead of the inline bash echo (STATE: ALREADY_BUMPED). Accept both; add a `gstack-version-bump write` re-bump regression signal.
- touchfiles: register ship-section-loading (periodic) + extend idempotency deps with bin/gstack-version-bump + scripts/resolvers/sections.ts.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -120,7 +120,8 @@ export const E2E_TOUCHFILES: Record<string, string[]> = {
|
||||
'plan-ceo-mode-routing': ['plan-ceo-review/**', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble.ts', 'test/helpers/claude-pty-runner.ts'],
|
||||
'plan-design-with-ui-scope': ['plan-design-review/**', 'test/fixtures/plans/ui-heavy-feature.md', 'test/helpers/claude-pty-runner.ts'],
|
||||
'budget-regression-pty': ['test/helpers/eval-store.ts', 'test/skill-budget-regression.test.ts'],
|
||||
'ship-idempotency-pty': ['ship/**', 'bin/gstack-next-version', 'lib/worktree.ts', 'test/helpers/claude-pty-runner.ts'],
|
||||
'ship-idempotency-pty': ['ship/**', 'bin/gstack-next-version', 'bin/gstack-version-bump', 'scripts/resolvers/sections.ts', 'lib/worktree.ts', 'test/helpers/claude-pty-runner.ts'],
|
||||
'ship-section-loading': ['ship/**', 'scripts/resolvers/sections.ts', 'scripts/gen-skill-docs.ts', 'test/helpers/required-reads.ts', 'test/helpers/transcript-section-logger.ts', 'test/helpers/claude-pty-runner.ts'],
|
||||
'autoplan-chain-pty': ['autoplan/**', 'plan-ceo-review/**', 'plan-design-review/**', 'plan-eng-review/**', 'plan-devex-review/**', 'test/fixtures/plans/ui-heavy-feature.md', 'test/helpers/claude-pty-runner.ts'],
|
||||
'e2e-harness-audit': ['plan-ceo-review/**', 'plan-eng-review/**', 'plan-design-review/**', 'plan-devex-review/**', 'scripts/resolvers/preamble/generate-completion-status.ts', 'test/helpers/agent-sdk-runner.ts', 'test/helpers/claude-pty-runner.ts'],
|
||||
|
||||
@@ -472,6 +473,7 @@ export const E2E_TIERS: Record<string, 'gate' | 'periodic'> = {
|
||||
'plan-design-with-ui-scope': 'gate', // ~$0.80/run
|
||||
'budget-regression-pty': 'gate', // free, library-only assertion
|
||||
'ship-idempotency-pty': 'periodic', // ~$3/run, real /ship in plan mode
|
||||
'ship-section-loading': 'periodic', // ~$3/run, real /ship; asserts section reads
|
||||
'autoplan-chain-pty': 'periodic', // ~$8/run, all 3 phases sequential
|
||||
|
||||
// Per-finding count + review-report-at-bottom — periodic because each
|
||||
|
||||
@@ -197,20 +197,26 @@ describeE2E('/ship idempotency E2E (periodic, real-PTY)', () => {
|
||||
}
|
||||
}
|
||||
|
||||
// Positive: the idempotency-check echoed ALREADY_BUMPED.
|
||||
if (/STATE:\s*ALREADY_BUMPED/.test(visible)) {
|
||||
// Positive: idempotency classify reported ALREADY_BUMPED. Post-carve
|
||||
// (T9), Step 12 runs `gstack-version-bump classify` which emits JSON
|
||||
// (`"state":"ALREADY_BUMPED"`); the legacy inline bash echoed
|
||||
// `STATE: ALREADY_BUMPED`. Accept either so the test survives the carve.
|
||||
if (/STATE:\s*ALREADY_BUMPED|"state":\s*"ALREADY_BUMPED"/.test(visible)) {
|
||||
outcome = 'detected';
|
||||
evidence = visible.slice(-3000);
|
||||
break;
|
||||
}
|
||||
|
||||
// Negative regressions:
|
||||
// - bump-action bash block ran (would echo on FRESH path)
|
||||
// - classify reported FRESH (CLI JSON or legacy echo) → would re-bump
|
||||
// - agent attempted git commit -m "chore: bump version"
|
||||
// - agent attempted git push
|
||||
// - agent rendered an Edit/Write to CHANGELOG.md or VERSION (acceptable in plan mode but flagged here)
|
||||
// - agent ran the CLI write path (gstack-version-bump write) — a
|
||||
// re-bump on an already-shipped branch
|
||||
if (
|
||||
/"state":\s*"FRESH"/.test(visible) ||
|
||||
/STATE:\s*FRESH(?![\w-])/i.test(visible) ||
|
||||
/gstack-version-bump\s+write/i.test(visible) ||
|
||||
/git\s+commit\s+.*chore:\s*bump\s+version/i.test(visible) ||
|
||||
/git\s+push.*origin/i.test(visible)
|
||||
) {
|
||||
|
||||
@@ -0,0 +1,120 @@
|
||||
/**
|
||||
* /ship section-loading E2E (periodic, paid, real-PTY) — v2 plan T9 mitigation
|
||||
* layer 5, the ONLY CI-failing guard against silent section-skip.
|
||||
*
|
||||
* After the carve, ship is a skeleton whose STOP-Read directives point at
|
||||
* sections/*.md. This test runs the REAL /ship skill in plan mode against a
|
||||
* fresh version-changing fixture and asserts the agent actually Read the
|
||||
* sections its situation requires (review-army + changelog at minimum — every
|
||||
* version-changing ship needs the pre-landing review and a CHANGELOG entry).
|
||||
*
|
||||
* Runs against the INSTALLED skill at ~/.claude/skills/gstack/ship (Codex
|
||||
* outside-voice #5: an E2E that reads repo paths would miss install-layout
|
||||
* 404s). Section reads are detected from the PTY scrollback — when the agent
|
||||
* Reads a section the tool render shows the `sections/<file>.md` path.
|
||||
*
|
||||
* Plan-mode framing keeps the agent from committing/pushing; producing a plan
|
||||
* is the terminal signal. Cost: ~$2-4/run. Periodic tier.
|
||||
*
|
||||
* Situation matrix (T1 = B): this file covers the fresh version-changing ship;
|
||||
* the already-bumped re-run is covered by skill-e2e-ship-idempotency.test.ts,
|
||||
* and a no-plan-file variant can be added to FIXTURES below.
|
||||
*/
|
||||
|
||||
import { describe, test, expect } from 'bun:test';
|
||||
import { spawnSync } from 'child_process';
|
||||
import * as fs from 'fs';
|
||||
import * as path from 'path';
|
||||
import * as os from 'os';
|
||||
import {
|
||||
launchClaudePty,
|
||||
isPermissionDialogVisible,
|
||||
isNumberedOptionListVisible,
|
||||
} from './helpers/claude-pty-runner';
|
||||
|
||||
// Paid real-PTY suite: it only runs when EVALS is set to a non-empty value AND
// EVALS_TIER is exactly 'periodic'; otherwise the suite is registered as skipped.
const shouldRun = Boolean(process.env.EVALS) && process.env.EVALS_TIER === 'periodic';

const describeE2E = shouldRun ? describe : describe.skip;
|
||||
|
||||
/**
 * Builds the "fresh" fixture: a git work tree on a feature branch that carries
 * a real code change while VERSION still equals the base branch, so /ship must
 * bump (FRESH) and walk the full pre-landing + changelog flow.
 *
 * Layout created under a new temp directory (`root`):
 *   - `origin.git`  — bare repository used as the `origin` remote
 *   - `workspace`   — work tree with `main` and `feat/new-thing` both pushed
 *
 * @returns the work tree path and the temp root; the caller owns `root` and is
 *          responsible for removing it once the test is done.
 * @throws Error if any setup command fails, cannot be spawned, or times out.
 *         The temp root is removed before the error propagates so a failed
 *         setup does not leak directories.
 */
function buildFreshFixture(): { workTree: string; root: string } {
  const root = fs.mkdtempSync(path.join(os.tmpdir(), 'gstack-ship-secload-'));
  const workTree = path.join(root, 'workspace');
  const bareRemote = path.join(root, 'origin.git');

  // Runs one bash command line in `cwd`; throws with every available failure
  // detail. `status` is null when the process was never started or was killed
  // (e.g. by the 15s timeout), in which case stderr alone says nothing useful.
  const sh = (cmd: string, cwd: string): void => {
    const r = spawnSync('bash', ['-c', cmd], { cwd, stdio: 'pipe', timeout: 15_000 });
    if (r.status === 0) return;
    const details = [
      r.error ? `spawn error: ${r.error.message}` : '',
      r.signal ? `killed by signal ${r.signal}` : '',
      r.stderr?.toString() ?? '',
    ]
      .filter(part => part.length > 0)
      .join('\n');
    throw new Error(`fixture setup failed at "${cmd}":\n${details}`);
  };

  try {
    fs.mkdirSync(workTree, { recursive: true });
    sh(`git init --bare "${bareRemote}"`, root);
    sh('git init -b main', workTree);
    sh('git config user.email "t@t.com" && git config user.name "T" && git config commit.gpgsign false', workTree);

    // Base commit on main: VERSION, package.json and CHANGELOG all at 0.0.1.
    fs.writeFileSync(path.join(workTree, 'VERSION'), '0.0.1\n');
    fs.writeFileSync(path.join(workTree, 'package.json'), JSON.stringify({ name: 'fx', version: '0.0.1', private: true }, null, 2) + '\n');
    fs.writeFileSync(path.join(workTree, 'CHANGELOG.md'), '# Changelog\n\n## [0.0.1] - 2026-01-01\n\n- Initial release\n');
    fs.writeFileSync(path.join(workTree, 'app.js'), '// base\n');
    sh('git add -A && git commit -m "chore: initial v0.0.1"', workTree);
    sh(`git remote add origin "${bareRemote}" && git push -u origin main`, workTree);

    // Feature branch: a real code change, VERSION untouched → FRESH (needs a bump).
    sh('git checkout -b feat/new-thing', workTree);
    fs.writeFileSync(path.join(workTree, 'app.js'), '// base\nexport function newThing() { return 42; }\n');
    fs.writeFileSync(path.join(workTree, 'app.test.js'), 'test("newThing", () => {});\n');
    sh('git add -A && git commit -m "feat: add newThing"', workTree);
    sh('git push -u origin feat/new-thing', workTree);
  } catch (err: unknown) {
    // Best-effort cleanup: the caller never receives `root` on failure, so
    // nobody else could remove it.
    try { fs.rmSync(root, { recursive: true, force: true }); } catch { /* ignore */ }
    throw err;
  }

  return { workTree, root };
}
|
||||
|
||||
// Section files (basenames under sections/) that any version-changing ship is
// expected to consult: the pre-landing review and the CHANGELOG instructions.
const REQUIRED_SECTIONS = [
  'review-army.md',
  'changelog.md',
];
|
||||
|
||||
describeE2E('/ship section-loading E2E (periodic, real-PTY, installed skill)', () => {
  test(
    'fresh version-changing ship Reads the required sections',
    async () => {
      const { workTree, root } = buildFreshFixture();
      const readSections = new Set<string>();
      let planReady = false;

      // The outer try/finally owns the fixture directory: it is removed even
      // when the PTY launch throws or session.close() rejects.
      try {
        const session = await launchClaudePty({
          permissionMode: 'plan',
          cwd: workTree,
          timeoutMs: 720_000,
          env: { GH_TOKEN: 'mock-not-real', NO_COLOR: '1' },
        });

        try {
          // Fixed pause before typing the command — presumably lets the TUI
          // finish starting; confirm against claude-pty-runner.
          await Bun.sleep(8000);
          const since = session.mark();
          session.send('/ship\r');

          const start = Date.now();
          let lastPermSig = '';
          // Poll the scrollback every 3s for up to 10 minutes.
          while (Date.now() - start < 600_000) {
            await Bun.sleep(3000);
            if (session.exited()) break;

            const visible = session.visibleSince(since);
            const tail = visible.slice(-1500);

            if (isNumberedOptionListVisible(tail) && isPermissionDialogVisible(tail)) {
              // Answer each distinct dialog once: if the last 500 chars are
              // unchanged since the previous answer, fall through instead of
              // re-sending the same keystroke.
              const sig = visible.slice(-500);
              if (sig !== lastPermSig) {
                lastPermSig = sig;
                session.send('1\r');
                await Bun.sleep(1500);
                continue;
              }
            }

            // Detect section reads from the scrollback (tool render shows the path).
            for (const m of visible.matchAll(/sections\/([A-Za-z0-9._-]+\.md)/g)) readSections.add(m[1]);

            // Any of these phrases is treated as the plan-produced terminal signal.
            if (/ready to execute|Would you like to proceed|GSTACK REVIEW REPORT/i.test(visible)) {
              planReady = true;
              break;
            }
          }
        } finally {
          await session.close();
        }
      } finally {
        try { fs.rmSync(root, { recursive: true, force: true }); } catch { /* ignore */ }
      }

      // A single structured assertion so a failure shows which sections were
      // read and which required ones are missing.
      const missing = REQUIRED_SECTIONS.filter(s => !readSections.has(s));
      expect({ planReady, read: [...readSections], missing }).toEqual({
        planReady: true,
        read: expect.any(Array),
        missing: [],
      });
    },
    900_000,
  );
});
|
||||
Reference in New Issue
Block a user