gstack/test/skill-e2e-ship-idempotency.test.ts

/**
 * /ship idempotency E2E (periodic, paid, real-PTY).
 *
 * Asserts: when /ship runs against a branch that has ALREADY been bumped
 * (VERSION ahead of base AND package.json synced AND a CHANGELOG entry
 * exists for the bumped version), the workflow:
 *
 *   1. Detects ALREADY_BUMPED state via the Step 12 idempotency check
 *   2. Does NOT echo STATE: FRESH (which would trigger a second bump)
 *   3. Does NOT mutate the fixture's VERSION file
 *   4. Does NOT append a duplicate CHANGELOG [0.0.2] entry
 *   5. Does NOT create a new "chore: bump version" commit
 *
 * Why real-PTY: the existing ship-idempotency test in skill-e2e.test.ts
 * uses the SDK harness with a synthetic prompt asking the agent to "run
 * ONLY the idempotency checks." This test exercises the actual /ship
 * skill end-to-end against a real git fixture so a regression that
 * silently re-bumps despite the check passing would be caught.
 *
 * Plan-mode framing: we run /ship in plan mode so the agent cannot push,
 * commit, or open PRs. The Step 12 idempotency check is read-only
 * (reads VERSION + package.json + git rev-parse) and runs fine in plan
 * mode. The plan-ready output serves as the terminal signal — the agent
 * has done its analysis and produced a plan describing what it would do.
 *
 * If the agent decides to bump or push despite the fixture's
 * ALREADY_BUMPED state, that intent surfaces in the plan or in
 * tool-call attempts, which we detect.
 *
 * Cost: ~$2-4/run. Periodic tier — long, runs weekly.
 */

import { describe, test, expect } from 'bun:test';
import { spawnSync } from 'child_process';
import * as fs from 'fs';
import * as path from 'path';
import * as os from 'os';
import {
  launchClaudePty,
  isPermissionDialogVisible,
  isNumberedOptionListVisible,
} from './helpers/claude-pty-runner';

const shouldRun = !!process.env.EVALS && process.env.EVALS_TIER === 'periodic';
const describeE2E = shouldRun ? describe : describe.skip;

interface ShipFixture {
  workTree: string;
  bareRemote: string;
  /** Full bash log of `git` and helper commands run during setup. */
  setupLog: string[];
}

/**
 * Build a self-contained git fixture representing an already-shipped state:
 *   - main branch at VERSION 0.0.1, with one CHANGELOG entry [0.0.1]
 *   - feat/already-shipped branch at VERSION 0.0.2 (bumped + synced),
 *     CHANGELOG has [0.0.2] entry on top of [0.0.1], one feature commit
 *   - bareRemote is the origin; both branches are pushed
 *
 * Returns the work-tree dir for /ship to operate on.
 */
function buildShippedFixture(): ShipFixture {
  const root = fs.mkdtempSync(path.join(os.tmpdir(), 'gstack-ship-fixture-'));
  const workTree = path.join(root, 'workspace');
  const bareRemote = path.join(root, 'origin.git');
  fs.mkdirSync(workTree, { recursive: true });

  const setupLog: string[] = [];
  const sh = (cmd: string, cwd: string): void => {
    setupLog.push(`[${cwd}] ${cmd}`);
    const result = spawnSync('bash', ['-c', cmd], { cwd, stdio: 'pipe', timeout: 15_000 });
    if (result.status !== 0) {
      const stderr = result.stderr?.toString() ?? '';
      throw new Error(`fixture setup failed at "${cmd}":\n${stderr}\n--- log ---\n${setupLog.join('\n')}`);
    }
  };

  // Bare remote.
  sh(`git init --bare "${bareRemote}"`, root);

  // Initial commit on main.
  sh('git init -b main', workTree);
  sh('git config user.email "test@test.com"', workTree);
  sh('git config user.name "Test"', workTree);
  sh('git config commit.gpgsign false', workTree);

  fs.writeFileSync(path.join(workTree, 'VERSION'), '0.0.1\n');
  fs.writeFileSync(
    path.join(workTree, 'package.json'),
    JSON.stringify({ name: 'fixture', version: '0.0.1', private: true }, null, 2) + '\n',
  );
  fs.writeFileSync(
    path.join(workTree, 'CHANGELOG.md'),
    `# Changelog\n\n## [0.0.1] - 2026-01-01\n\n- Initial release\n`,
  );
  fs.writeFileSync(path.join(workTree, 'README.md'), '# Fixture\n');

  sh('git add VERSION package.json CHANGELOG.md README.md', workTree);
  sh('git commit -m "chore: initial release v0.0.1"', workTree);
  sh(`git remote add origin "${bareRemote}"`, workTree);
  sh('git push -u origin main', workTree);

  // Feature branch with ALREADY_BUMPED state.
  sh('git checkout -b feat/already-shipped', workTree);
  fs.writeFileSync(path.join(workTree, 'VERSION'), '0.0.2\n');
  fs.writeFileSync(
    path.join(workTree, 'package.json'),
    JSON.stringify({ name: 'fixture', version: '0.0.2', private: true }, null, 2) + '\n',
  );
  fs.writeFileSync(
    path.join(workTree, 'CHANGELOG.md'),
    `# Changelog\n\n## [0.0.2] - 2026-04-25\n\n**Feature shipped.**\n\nAdded the new feature.\n\n## [0.0.1] - 2026-01-01\n\n- Initial release\n`,
  );
  fs.writeFileSync(path.join(workTree, 'feature.md'), '# Feature\n\nAlready shipped.\n');

  sh('git add VERSION package.json CHANGELOG.md feature.md', workTree);
  sh('git commit -m "feat: add new feature\n\nbumps VERSION to 0.0.2"', workTree);
  sh('git push -u origin feat/already-shipped', workTree);

  return { workTree, bareRemote, setupLog };
}

/** Snapshot the load-bearing fixture state so we can compare post-run. */
interface FixtureSnapshot {
  versionFile: string;
  packageVersion: string;
  changelogEntryCount: number;
  bumpCommitCount: number;
  branchHead: string;
}

function snapshotFixture(workTree: string): FixtureSnapshot {
  const versionFile = fs.readFileSync(path.join(workTree, 'VERSION'), 'utf-8').trim();
  const pkg = JSON.parse(fs.readFileSync(path.join(workTree, 'package.json'), 'utf-8'));
  const changelog = fs.readFileSync(path.join(workTree, 'CHANGELOG.md'), 'utf-8');
  // Count `## [0.0.2]` headings — should stay at 1 across re-runs.
  const changelogEntryCount = (changelog.match(/^##\s*\[0\.0\.2\]/gm) ?? []).length;
  const head = spawnSync('git', ['rev-parse', 'HEAD'], { cwd: workTree, stdio: 'pipe' });
  const branchHead = head.stdout?.toString().trim() ?? '';
  // Count "chore: bump version" commits on this branch since main.
  const log = spawnSync(
    'git', ['log', '--format=%s', 'main..HEAD'],
    { cwd: workTree, stdio: 'pipe' },
  );
  const subjects = log.stdout?.toString() ?? '';
  const bumpCommitCount = subjects.split('\n').filter(s => /chore:\s*bump\s+version/i.test(s)).length;
  return { versionFile, packageVersion: pkg.version, changelogEntryCount, bumpCommitCount, branchHead };
}

describeE2E('/ship idempotency E2E (periodic, real-PTY)', () => {
  test(
    'rerunning /ship on an already-shipped branch detects ALREADY_BUMPED and does not mutate fixture',
    async () => {
      const fixture = buildShippedFixture();
      const before = snapshotFixture(fixture.workTree);

      const session = await launchClaudePty({
        permissionMode: 'plan',
        cwd: fixture.workTree,
        timeoutMs: 720_000,
        // Disable network-y pieces so the agent can't reach actual github.
        env: { GH_TOKEN: 'mock-not-real', NO_COLOR: '1' },
      });

      let outcome: 'detected' | 'plan_ready' | 'attempted_mutation' | 'timeout' | 'exited' = 'timeout';
      let evidence = '';

      try {
        await Bun.sleep(8000);
        const since = session.mark();
        session.send('/ship\r');

        const budgetMs = 600_000;
        const start = Date.now();
        let lastPermSig = '';
        while (Date.now() - start < budgetMs) {
          await Bun.sleep(3000);
          if (session.exited()) {
            outcome = 'exited';
            evidence = session.visibleSince(since).slice(-3000);
            break;
          }
          const visible = session.visibleSince(since);

          // Auto-grant any permission dialogs the preamble triggers
          // (e.g. touch on a marker file claude considers sensitive).
          // Classify on the recent tail; don't double-press the same render.
          const tail = visible.slice(-1500);
          if (isNumberedOptionListVisible(tail) && isPermissionDialogVisible(tail)) {
            const sig = visible.slice(-500);
            if (sig !== lastPermSig) {
              lastPermSig = sig;
              session.send('1\r');
              await Bun.sleep(1500);
              continue;
            }
          }

          // Positive: the idempotency-check echoed ALREADY_BUMPED.
          if (/STATE:\s*ALREADY_BUMPED/.test(visible)) {
            outcome = 'detected';
            evidence = visible.slice(-3000);
            break;
          }

          // Negative regressions:
          //   - bump-action bash block ran (would echo on FRESH path)
          //   - agent attempted git commit -m "chore: bump version"
          //   - agent attempted git push
          //   - agent rendered an Edit/Write to CHANGELOG.md or VERSION (acceptable in plan mode but flagged here)
          if (
            /STATE:\s*FRESH(?![\w-])/i.test(visible) ||
            /git\s+commit\s+.*chore:\s*bump\s+version/i.test(visible) ||
            /git\s+push.*origin/i.test(visible)
          ) {
            outcome = 'attempted_mutation';
            evidence = visible.slice(-3000);
            break;
          }

          // Plan-ready outcome (acceptable terminal): the agent finished
          // analysis. We'll accept this if no mutation signals showed up.
          if (/ready to execute|Would you like to proceed/i.test(visible)) {
            outcome = 'plan_ready';
            evidence = visible.slice(-3000);
            break;
          }
        }
      } finally {
        await session.close();
      }

      // Verify fixture was not mutated regardless of outcome.
      const after = snapshotFixture(fixture.workTree);
      const fixtureStable =
        after.versionFile === before.versionFile &&
        after.packageVersion === before.packageVersion &&
        after.changelogEntryCount === before.changelogEntryCount &&
        after.bumpCommitCount === before.bumpCommitCount &&
        after.branchHead === before.branchHead;

      try {
        if (outcome === 'attempted_mutation') {
          throw new Error(
            `/ship attempted to mutate already-shipped state.\n` +
              `--- evidence (last 3KB) ---\n${evidence}\n` +
              `--- before ---\n${JSON.stringify(before, null, 2)}\n` +
              `--- after  ---\n${JSON.stringify(after, null, 2)}`,
          );
        }
        if (outcome === 'exited') {
          throw new Error(`claude exited unexpectedly.\n--- evidence ---\n${evidence}`);
        }
        if (outcome === 'timeout') {
          throw new Error(
            `Timed out before any terminal outcome.\n--- evidence (last 3KB) ---\n${evidence}`,
          );
        }
        // Detected or plan_ready — both are acceptable terminal outcomes.
        expect(['detected', 'plan_ready']).toContain(outcome);
        // Fixture must not have been mutated regardless of outcome.
        expect(fixtureStable).toBe(true);
      } finally {
        // Clean up fixture root.
        try { fs.rmSync(path.dirname(fixture.workTree), { recursive: true, force: true }); } catch { /* ignore */ }
      }
    },
    900_000, // 15 min wall clock
  );
});