refactor(plan-ceo-review): carve review body into on-demand section

Carve the largest skill (138,838 B) into a skeleton + one on-demand section, the documented next Phase B target after /ship (v2_PLAN.md:216). - sections/review-sections.md(.tmpl): the 11-section deep review, codex/ outside-voice rules, how-to-ask, Required Outputs, registries, Completion Summary, Review Log, REVIEW_DASHBOARD, PLAN_FILE_REVIEW_REPORT, Next Steps, docs/designs promotion, Formatting Rules, and the Mode Quick Reference. - sections/manifest.json: passive registry (CM2), one entry. - SKILL.md.tmpl: {{SECTION_INDEX}} after the system audit, a single {{SECTION:review-sections}} STOP-Read after Step 0 mode selection, and a Section self-check. All of Step 0 (the scope/mode conversation) stays in the always-loaded skeleton; only EXIT_PLAN_MODE_GATE follows the section. Measured: always-loaded skeleton 138,838 -> 80,731 B (-42%, ~14.4K tokens off every invocation). Union (skeleton + section) 139,110 B, behavior held. Boundary honors Codex P1: nothing review-governing (formatting rules, mode reference, how-to-ask, required outputs) sits in the skeleton below the STOP. Housekeeping resolvers ride in the section, matching the ship precedent (adversarial.md carries LEARNINGS_LOG + GBRAIN_SAVE_RESULTS). Tests (atomic with the carve — skill-docs.yml gates gen:skill-docs freshness on every push, so source + regen + tests must land together): - parity-harness: plan-ceo flipped to sectioned, maxSkeletonBytes 90_000 (measured 80,731 + headroom); content/minBytes run against the union. - skill-size-budget: plan-ceo-review added to SECTIONS_EXTRACTED. - section-manifest-consistency: generalized to discover every carved skill, vars computed per-skill-case (Codex P2). - skill-ceo-section-ordering (new, gate): per-PR static guard — STOP after Step 0, review body absent from skeleton, report writer in the section, nothing review-governing below the STOP. - skill-e2e-plan-ceo-review-section-loading (new, periodic): refreshes the installed skill first (Codex P1), drives full Step 0, asserts the section is Read before the report. - gen-skill-docs + skill-validation: read the skeleton+sections union for carved skills so relocated prose still counts. - touchfiles: plan-ceo-section-loading registered (periodic). Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-25 19:20:00 +02:00 · 2026-05-31 08:54:56 -07:00
parent 3bef43bc5a
commit ab66193e2e
14 changed files with 1831 additions and 1457 deletions
@@ -0,0 +1,191 @@
+/**
+ * /plan-ceo-review section-loading E2E (periodic, paid, real-PTY) — v2 plan
+ * Phase B carve backstop. The per-PR guard is the free static test
+ * skill-ceo-section-ordering.test.ts; THIS is the behavioral proof that a real
+ * agent actually Reads the carved section instead of working from memory.
+ *
+ * After the carve, plan-ceo-review is a skeleton whose single STOP-Read directive
+ * (fired after Step 0 mode selection) points at sections/review-sections.md. This
+ * test runs the REAL /plan-ceo-review skill in plan mode against a fixture branch
+ * that has a plan worth reviewing, drives Step 0 to HOLD SCOPE (the simplest mode
+ * that still requires all 11 review sections), and asserts the agent Read
+ * review-sections.md before producing the review report.
+ *
+ * Codex outside-voice P1 fixes vs the naive port of the ship test:
+ *  - REFRESH THE INSTALL FIRST. The skill loads from the installed copy at
+ *    ~/.claude/skills/gstack/plan-ceo-review (a real copy on dev machines, fresh
+ *    on CI). A test that didn't refresh would assert against the pre-carve
+ *    monolith and trivially "pass" with zero section reads. beforeAll copies the
+ *    freshly-generated skeleton + sections into the install; afterAll restores the
+ *    prior state so a local run doesn't leave the active skill mutated.
+ *  - HANDLE THE FULL STEP 0. plan-ceo's Step 0 can fire a system audit, WebSearch,
+ *    and several AskUserQuestion calls before mode selection — the answer loop
+ *    replies to every permission dialog / numbered list, not just two.
+ *
+ * Plan-mode framing keeps the agent from editing/committing. Cost: ~$3-5/run.
+ * Periodic tier.
+ */
+
+import { describe, test, expect } from 'bun:test';
+import { spawnSync } from 'child_process';
+import * as fs from 'fs';
+import * as path from 'path';
+import * as os from 'os';
+import {
+  launchClaudePty,
+  isPermissionDialogVisible,
+  isNumberedOptionListVisible,
+} from './helpers/claude-pty-runner';
+
+const shouldRun = !!process.env.EVALS && process.env.EVALS_TIER === 'periodic';
+const describeE2E = shouldRun ? describe : describe.skip;
+
+const REPO_ROOT = path.resolve(import.meta.dir, '..');
+const INSTALL_DIR = path.join(os.homedir(), '.claude', 'skills', 'gstack', 'plan-ceo-review');
+
+// Sections every plan-ceo-review run must consult after Step 0.
+const REQUIRED_SECTIONS = ['review-sections.md'];
+
+/** Copy the freshly-generated skeleton + sections into the installed skill so the
+ *  PTY agent loads the carve under test. Returns a restore() that puts the install
+ *  back exactly as it was (content of SKILL.md + presence/content of sections/). */
+function refreshInstall(): () => void {
+  const repoSkill = path.join(REPO_ROOT, 'plan-ceo-review', 'SKILL.md');
+  const repoSections = path.join(REPO_ROOT, 'plan-ceo-review', 'sections');
+  const installSkill = path.join(INSTALL_DIR, 'SKILL.md');
+  const installSections = path.join(INSTALL_DIR, 'sections');
+
+  // Snapshot prior state for restore.
+  const priorSkill = fs.existsSync(installSkill) ? fs.readFileSync(installSkill) : null;
+  const hadSections = fs.existsSync(installSections);
+  const priorSections: Record<string, Buffer> = {};
+  if (hadSections) {
+    for (const f of fs.readdirSync(installSections)) {
+      priorSections[f] = fs.readFileSync(path.join(installSections, f));
+    }
+  }
+
+  // Apply: skeleton + every generated section file (.md) + manifest.
+  fs.mkdirSync(INSTALL_DIR, { recursive: true });
+  fs.copyFileSync(repoSkill, installSkill);
+  fs.mkdirSync(installSections, { recursive: true });
+  for (const f of fs.readdirSync(repoSections)) {
+    if (f.endsWith('.md.tmpl')) continue; // install carries generated files, not templates
+    fs.copyFileSync(path.join(repoSections, f), path.join(installSections, f));
+  }
+
+  return function restore(): void {
+    try {
+      if (priorSkill) fs.writeFileSync(installSkill, priorSkill);
+      if (hadSections) {
+        // Restore the prior section files; drop any we added.
+        for (const f of fs.readdirSync(installSections)) {
+          if (!(f in priorSections)) fs.rmSync(path.join(installSections, f), { force: true });
+        }
+        for (const [f, buf] of Object.entries(priorSections)) {
+          fs.writeFileSync(path.join(installSections, f), buf);
+        }
+      } else {
+        fs.rmSync(installSections, { recursive: true, force: true });
+      }
+    } catch { /* best-effort restore */ }
+  };
+}
+
+/** Fixture: a feature branch with a real change + a plan file worth reviewing. */
+function buildPlanFixture(): { workTree: string; root: string } {
+  const root = fs.mkdtempSync(path.join(os.tmpdir(), 'gstack-ceo-secload-'));
+  const workTree = path.join(root, 'workspace');
+  const bareRemote = path.join(root, 'origin.git');
+  fs.mkdirSync(workTree, { recursive: true });
+  const sh = (cmd: string, cwd: string): void => {
+    const r = spawnSync('bash', ['-c', cmd], { cwd, stdio: 'pipe', timeout: 15_000 });
+    if (r.status !== 0) throw new Error(`fixture setup failed at "${cmd}":\n${r.stderr?.toString()}`);
+  };
+  sh(`git init --bare "${bareRemote}"`, root);
+  sh('git init -b main', workTree);
+  sh('git config user.email "t@t.com" && git config user.name "T" && git config commit.gpgsign false', workTree);
+  fs.writeFileSync(path.join(workTree, 'app.js'), '// base\n');
+  sh('git add -A && git commit -m "chore: initial"', workTree);
+  sh(`git remote add origin "${bareRemote}" && git push -u origin main`, workTree);
+  // Feature branch with a real change + a plan describing it (something to review).
+  sh('git checkout -b feat/cache-layer', workTree);
+  fs.writeFileSync(
+    path.join(workTree, 'PLAN.md'),
+    [
+      '# Plan: add an in-memory cache layer',
+      '',
+      '## Context',
+      'Reads hit the DB on every request. Add a process-local LRU cache in front of',
+      'the read path to cut DB load.',
+      '',
+      '## Approach',
+      '- Wrap the read repository in a cache that stores the last 1000 keys.',
+      '- Invalidate on write.',
+      '',
+      '## Out of scope',
+      'Distributed cache, cross-process coherence.',
+      '',
+    ].join('\n'),
+  );
+  fs.writeFileSync(path.join(workTree, 'app.js'), '// base\nexport function read(k) { return db.get(k); }\n');
+  sh('git add -A && git commit -m "feat: cache layer plan + stub"', workTree);
+  sh('git push -u origin feat/cache-layer', workTree);
+  return { workTree, root };
+}
+
+describeE2E('/plan-ceo-review section-loading E2E (periodic, real-PTY, installed skill)', () => {
+  test(
+    'a real review Reads the carved section before producing the report',
+    async () => {
+      const restore = refreshInstall();
+      const { workTree, root } = buildPlanFixture();
+      const session = await launchClaudePty({
+        permissionMode: 'plan',
+        cwd: workTree,
+        timeoutMs: 900_000,
+        env: { NO_COLOR: '1' },
+      });
+
+      const readSections = new Set<string>();
+      let reportReady = false;
+      try {
+        await Bun.sleep(8000);
+        const since = session.mark();
+        // HOLD SCOPE = simplest mode that still walks all 11 review sections.
+        session.send('/plan-ceo-review review PLAN.md, hold scope\r');
+        const start = Date.now();
+        let lastPermSig = '';
+        while (Date.now() - start < 780_000) {
+          await Bun.sleep(3000);
+          if (session.exited()) break;
+          const visible = session.visibleSince(since);
+          const tail = visible.slice(-1500);
+          // Answer EVERY permission dialog / numbered option list (system audit,
+          // WebSearch, and the several Step 0 questions) by taking option 1.
+          if (isNumberedOptionListVisible(tail) && isPermissionDialogVisible(tail)) {
+            const sig = visible.slice(-500);
+            if (sig !== lastPermSig) { lastPermSig = sig; session.send('1\r'); await Bun.sleep(1500); continue; }
+          }
+          for (const m of visible.matchAll(/sections\/([A-Za-z0-9._-]+\.md)/g)) readSections.add(m[1]);
+          if (/GSTACK REVIEW REPORT|COMPLETION SUMMARY|ready to execute/i.test(visible)) {
+            reportReady = true;
+            break;
+          }
+        }
+      } finally {
+        await session.close();
+        try { fs.rmSync(root, { recursive: true, force: true }); } catch { /* ignore */ }
+        restore();
+      }
+
+      const missing = REQUIRED_SECTIONS.filter(s => !readSections.has(s));
+      expect({ reportReady, read: [...readSections], missing }).toEqual({
+        reportReady: true,
+        read: expect.any(Array),
+        missing: [],
+      });
+    },
+    1_020_000,
+  );
+});