mirror of
https://github.com/garrytan/gstack.git
synced 2026-05-06 21:46:40 +02:00
e6fd776a37
skill-e2e-plan-ceo-mode-routing.test.ts (~$3/run, 6-10 min/case): - Verifies AUQ answer routing: HOLD SCOPE → rigor/bulletproof posture language; SCOPE EXPANSION → expansion/10x/dream language. Each case navigates 8-12 prior AUQs (telemetry, proactive, routing, vendoring, brain, office-hours, premise, approach) before hitting Step 0F. - Periodic, not gate: navigation phase too slow for PR-blocking. V2 expansion to 4 modes (SELECTIVE + REDUCTION) when nav is faster. skill-e2e-ship-idempotency.test.ts (~$3/run, 5-10 min): - Builds a real git fixture with VERSION 0.0.2 already bumped, matching package.json, CHANGELOG entry, pushed to a local bare remote. Runs /ship in plan mode and asserts STATE: ALREADY_BUMPED echoes from the Step 12 idempotency check, OR plan_ready terminates without mutation. - Snapshots VERSION + package.json + CHANGELOG entry count + commit count + branch HEAD before/after; fails if any changed. skill-e2e-autoplan-chain.test.ts (~$8/run, 12-18 min): - Asserts /autoplan phases run sequentially: tees timestamps as each "**Phase N complete.**" marker first appears. Phase 1 (CEO) must precede Phase 3 (Eng); Phase 2 (Design) is optional but if it appears, must sit between 1 and 3. - Auto-grants permission dialogs that fire during phase transitions. All three auto-handle permission dialogs (preamble side-effects on fresh user envs without .feature-prompted-* markers). Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
177 lines
6.8 KiB
TypeScript
177 lines
6.8 KiB
TypeScript
/**
 * /autoplan cross-skill chain (periodic, paid, real-PTY).
 *
 * Asserts: when /autoplan runs against a plan fixture, the phase markers
 * the autoplan template emits appear in the correct order:
 *
 *   "**Phase 1 complete.**" (CEO) →
 *   "**Phase 2 complete.**" (Design — only if UI scope detected) →
 *   "**Phase 3 complete.**" (Eng) →
 *   "**Phase 3.5 complete.**" (DX — optional, skipped if no DX scope;
 *   not asserted here because the test stops once Phase 3 is observed)
 *
 * Why this exists: each individual phase has its own plan-mode smoke
 * test. Nothing verifies the SEQUENCING — that phases don't run in
 * parallel, that Phase 3 doesn't start before Phase 1 ends, that
 * conditional phases (Design, DX) are skipped when their scope is absent.
 * A regression where the autoplan template wires phases concurrently
 * would not be caught by per-phase tests.
 *
 * Approach: tee timestamps as each "**Phase N complete.**" marker first
 * appears in the visible buffer. Assert observed ordering. Phase 2 is
 * optional — UI-heavy fixture should make it run; backend-only fixtures
 * should make it skip.
 *
 * Cost: ~$5-8/run, 10-15 min wall clock. Periodic — runs weekly.
 */
|
|
|
|
import { describe, test, expect } from 'bun:test';
|
|
import { spawnSync } from 'child_process';
|
|
import * as fs from 'fs';
|
|
import * as path from 'path';
|
|
import * as os from 'os';
|
|
import {
|
|
launchClaudePty,
|
|
isPlanReadyVisible,
|
|
isPermissionDialogVisible,
|
|
isNumberedOptionListVisible,
|
|
} from './helpers/claude-pty-runner';
|
|
|
|
// Opt-in gate for this paid suite: it runs only when EVALS is set to a
// non-empty value AND EVALS_TIER is exactly 'periodic'. Otherwise the suite
// is registered through `describe.skip`.
const shouldRun = !!process.env.EVALS && process.env.EVALS_TIER === 'periodic';
const describeE2E = shouldRun ? describe : describe.skip;
|
|
|
|
// ROOT is the parent of the directory that contains this test file
// (`import.meta.dir` is a Bun-specific property).
const ROOT = path.resolve(import.meta.dir, '..');
// Plan fixture that gets copied into the temporary repo's .claude/plans directory.
const UI_FIXTURE = path.join(ROOT, 'test', 'fixtures', 'plans', 'ui-heavy-feature.md');
|
|
|
|
/** First sighting of a "Phase N complete" marker in the session output. */
interface PhaseHit {
  /** Phase number parsed from the marker; fractional for sub-phases such as 3.5. */
  phase: number;
  /** `Date.now()` value (ms since epoch) taken when the marker was first recorded. */
  ts: number;
}
|
|
|
|
// End-to-end check that /autoplan emits its phase-complete markers in order.
// Flow: build a throwaway git repo holding the UI-heavy plan fixture, launch a
// plan-mode PTY session in it, send `/autoplan`, then poll the visible output
// every 5 s (auto-granting permission dialogs) until Phase 3 is seen, the plan
// is ready, the session exits, or the 15 min budget runs out.
describeE2E('/autoplan chain ordering (periodic)', () => {
  test(
    'phases run sequentially: Phase 1 (CEO) before Phase 3 (Eng), Phase 2 (Design) between when present',
    async () => {
      // UI-heavy fixture so Phase 2 runs.
      const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'gstack-autoplan-chain-'));
      try {
        // Fail fast on fixture setup: a git step that errors, times out, or
        // exits non-zero would otherwise be ignored and the (paid) session
        // would run against a half-built repository.
        const gitRun = (args: string[]) => {
          const res = spawnSync('git', args, { cwd: tempDir, stdio: 'pipe', timeout: 5000 });
          if (res.error || res.status !== 0) {
            const detail = res.error?.message ?? res.stderr?.toString() ?? '';
            throw new Error(`git ${args.join(' ')} failed (status=${res.status}): ${detail}`);
          }
          return res;
        };
        gitRun(['init', '-b', 'main']);
        gitRun(['config', 'user.email', 'test@test.com']);
        gitRun(['config', 'user.name', 'Test']);

        const plansDir = path.join(tempDir, '.claude', 'plans');
        fs.mkdirSync(plansDir, { recursive: true });
        fs.copyFileSync(UI_FIXTURE, path.join(plansDir, 'ui-heavy-feature.md'));
        fs.writeFileSync(path.join(tempDir, 'README.md'), '# Autoplan chain fixture\n');
        gitRun(['add', '.']);
        gitRun(['commit', '-m', 'init UI-heavy fixture']);

        const session = await launchClaudePty({
          permissionMode: 'plan',
          cwd: tempDir,
          timeoutMs: 1_080_000, // 18 min, slightly above test budget
        });

        // Markers in first-observation order; at most one entry per phase number.
        const hits: PhaseHit[] = [];
        let outcome: 'chain_complete' | 'plan_ready' | 'timeout' | 'exited' = 'timeout';
        let evidence = '';

        try {
          // Presumably gives the freshly launched session time to become
          // ready for input before the command is typed.
          await Bun.sleep(8000);
          const since = session.mark();
          session.send('/autoplan\r');

          const budgetMs = 900_000; // 15 min
          const start = Date.now();
          // Phase markers in autoplan/SKILL.md (lines 1126, 1211, 1331, 1437):
          // "**Phase 1 complete.**" / "**Phase 2 complete.**" / "**Phase 3 complete.**" / "**Phase 3.5 complete.**"
          // The trailing period is optional; the closing "**" is required.
          const phasePattern = /\*\*Phase\s+(\d+(?:\.\d+)?)\s+complete\.?\*\*/g;

          let lastPermSig = '';
          while (Date.now() - start < budgetMs) {
            await Bun.sleep(5000);
            if (session.exited()) {
              outcome = 'exited';
              evidence = session.visibleSince(since).slice(-3000);
              break;
            }
            const visible = session.visibleSince(since);

            // Auto-grant any permission dialog so autoplan can keep moving
            // through its phases. The autoplan template auto-decides AUQs
            // it owns; only permission prompts (file/tool grants) need our
            // hand-pressing. Classify on tail to avoid stale matches.
            const recentTail = visible.slice(-1500);
            if (isNumberedOptionListVisible(recentTail) && isPermissionDialogVisible(recentTail)) {
              // Only answer when the last 500 chars changed since the previous
              // grant, so one dialog is not answered twice.
              const sig = visible.slice(-500);
              if (sig !== lastPermSig) {
                lastPermSig = sig;
                session.send('1\r');
                await Bun.sleep(2000);
                continue;
              }
            }

            // Re-scan the whole buffer for phase markers not yet recorded.
            // matchAll iterates over a clone of the global regex, so no manual
            // lastIndex reset is needed between polls.
            for (const m of visible.matchAll(phasePattern)) {
              const phaseNum = parseFloat(m[1] ?? '0');
              if (Number.isNaN(phaseNum)) continue;
              if (hits.some(h => h.phase === phaseNum)) continue;
              hits.push({ phase: phaseNum, ts: Date.now() });
            }

            // Terminal: Phase 3 (Eng) seen — chain reached the required end.
            if (hits.some(h => h.phase === 3)) {
              outcome = 'chain_complete';
              evidence = visible.slice(-3000);
              break;
            }

            // Plan-ready as a fallback terminal — autoplan finished without
            // surfacing a Phase 3 marker. This is a regression surface.
            if (isPlanReadyVisible(visible)) {
              outcome = 'plan_ready';
              evidence = visible.slice(-3000);
              break;
            }
          }

          // Budget exhausted without a terminal signal: capture the tail now,
          // while the session is still open, so the timeout failure below
          // carries real evidence instead of an empty string.
          if (outcome === 'timeout') {
            evidence = session.visibleSince(since).slice(-3000);
          }
        } finally {
          await session.close();
        }

        if (outcome === 'exited' || outcome === 'timeout') {
          throw new Error(
            `autoplan chain test FAILED: outcome=${outcome}, hits=${JSON.stringify(hits)}\n` +
              `--- evidence (last 3KB) ---\n${evidence}`,
          );
        }

        // Phase 1 (CEO) and Phase 3 (Eng) MUST both have been seen. Reaching
        // this point with outcome 'plan_ready' means Phase 3 never surfaced.
        const ceo = hits.find(h => h.phase === 1);
        const design = hits.find(h => h.phase === 2);
        const eng = hits.find(h => h.phase === 3);
        if (!ceo || !eng) {
          throw new Error(
            `Required phase markers missing (outcome=${outcome}). Saw: ${JSON.stringify(hits)}\n` +
              `--- evidence ---\n${evidence}`,
          );
        }

        // Sequencing: CEO must end before Eng ends. Design (if observed)
        // must end after CEO and before Eng.
        //
        // The primary check is first-observation order in `hits`. Markers that
        // first show up in the same 5 s poll are recorded in buffer order and
        // can share a Date.now() millisecond, so a strict timestamp comparison
        // would fail spuriously. Timestamps are still checked, non-strictly.
        const ceoIdx = hits.indexOf(ceo);
        const engIdx = hits.indexOf(eng);
        expect(ceoIdx).toBeLessThan(engIdx);
        expect(ceo.ts).toBeLessThanOrEqual(eng.ts);
        if (design) {
          const designIdx = hits.indexOf(design);
          expect(designIdx).toBeGreaterThan(ceoIdx);
          expect(designIdx).toBeLessThan(engIdx);
          expect(design.ts).toBeGreaterThanOrEqual(ceo.ts);
          expect(design.ts).toBeLessThanOrEqual(eng.ts);
        }
      } finally {
        // Best-effort cleanup of the throwaway repo; a failure here must not mask the test result.
        try { fs.rmSync(tempDir, { recursive: true, force: true }); } catch { /* ignore */ }
      }
    },
    1_200_000, // 20 min absolute test ceiling
  );
});
|