test: T2 data-driven behavioral section-loading guard (periodic)

One file iterating CARVE_GUARDS, one test() per skill with GSTACK_CARVE_SKILL cost-scoping (D-CODEX A). 'external' carves (ship, plan-ceo-review) keep their bespoke tests; testNames are aligned to their touchfile keys. Registered in touchfiles. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-17 23:30:09 +02:00 · 2026-06-07 17:52:03 -07:00
parent f736095c46
commit 2da4ca6dc2
4 changed files with 105 additions and 2 deletions
@@ -0,0 +1,97 @@
+/**
+ * T2 — data-driven behavioral section-loading guard (PERIODIC tier, paid, SDK capture).
+ *
+ * The behavioral proof that a REAL agent actually Reads each carved skill's
+ * required sections at runtime — not just that the skeleton structure looks right
+ * (that's E2, free, per-PR). One file iterating the canonical CARVE_GUARDS
+ * registry (EQ2): registry membership IS the test, so "registered ⇒ asserted" is
+ * structural — a carve can't be registered yet behaviorally unguarded.
+ *
+ * Per codex refined-plan pass:
+ *   #2 — ONE test() per skill, each with its own timeout + named failure output;
+ *        a hung claude -p fails only its skill, not the whole file.
+ *   #3 / D-CODEX(A) — GSTACK_CARVE_SKILL=<name> runs only that skill's case, so
+ *        the touchfile selector can scope cost to the changed skill; unset runs all.
+ *   #7 — each case drives the run with the registry's `scenario` (built to force
+ *        the STOP-Read path) and asserts the required sections were Read.
+ *
+ * 'external' skills (ship, plan-ceo-review) have bespoke fixtures (git state,
+ * Step-0 mode loop) and keep their dedicated tests; E1 asserts those exist.
+ */
+
+import { describe, test, expect } from 'bun:test';
+import { setupSkillDir, skillFromWorktree, captureSectionReads } from './helpers/auq-sdk-capture';
+import { CARVE_GUARDS } from './helpers/carve-guards';
+
+const shouldRun = !!process.env.EVALS && process.env.EVALS_TIER === 'periodic'; // gate: EVALS must be set AND the tier must be exactly 'periodic'
+const describeE2E = shouldRun ? describe : describe.skip; // when the gate is closed the whole suite is registered as skipped
+const runId = `carve-section-loading-${process.env.EVALS_RUN_ID ?? 'local'}`; // handed to captureSectionReads; suffix is 'local' when EVALS_RUN_ID is unset
+const only = process.env.GSTACK_CARVE_SKILL?.trim(); // optional single-skill filter; unset or blank is falsy, so every case runs
+
+// Generic plan fixture, written as PLAN.md for guards whose `behavioral` is 'plan' (the review family).
+const PLAN_MD = [
+  '# Plan: add an in-memory cache layer',
+  '',
+  '## Context',
+  'Reads hit the DB on every request. Add a process-local LRU cache in front of the',
+  'read path to cut DB load.',
+  '',
+  '## Approach',
+  '- Wrap the read repository in a cache that stores the last 1000 keys.',
+  '- Invalidate on write.',
+  '',
+  '## Out of scope',
+  'Distributed cache, cross-process coherence.',
+  '',
+].join('\n');
+
+describeE2E('carve behavioral section-loading (periodic, SDK capture)', () => {
+  for (const guard of Object.values(CARVE_GUARDS)) { // registers one test() per registry entry that survives the two filters below
+    // 'external' carves are skipped here: they keep their dedicated bespoke tests (E1 verifies those exist).
+    if (guard.behavioral === 'external') continue;
+    // Cost-scoped selection: when GSTACK_CARVE_SKILL is set, register only that skill. A value matching no non-external skill registers zero tests.
+    if (only && only !== guard.skill) continue;
+
+    test(
+      `${guard.skill}: a real run Reads ${guard.requiredReads.join(', ')}`,
+      async () => {
+        const { skillMd, sectionsFrom } = skillFromWorktree(guard.skill);
+        const fixtures = guard.behavioral === 'plan' ? { 'PLAN.md': PLAN_MD } : {}; // only 'plan' guards receive the PLAN.md fixture
+        const planDir = setupSkillDir({ // NOTE(review): planDir is never removed in this block — confirm a helper cleans it up
+          skillName: guard.skill,
+          skillMd,
+          sectionsFrom,
+          fixtures,
+          tmpPrefix: `gstack-${guard.skill}-secload-`,
+        });
+
+        const { readSections, reportProduced, output } = await captureSectionReads({
+          planDir,
+          skillName: guard.skill,
+          scenario: guard.scenario,
+          reportMarker: /report|review|summary|design doc|handoff/i,
+          testName: `${guard.skill} section-loading`,
+          runId,
+        });
+
+        const missing = guard.requiredReads.filter((s) => !readSections.has(s)); // required sections absent from the captured reads
+        // Named failure output (codex #2): one object comparison so a failure diff shows skill, expected, observed and missing together.
+        expect({
+          skill: guard.skill,
+          reportProduced,
+          expected: guard.requiredReads,
+          observed: [...readSections],
+          missing,
+        }).toEqual({
+          skill: guard.skill,
+          reportProduced: true,
+          expected: guard.requiredReads,
+          observed: expect.any(Array), // informational only: any array passes; it is here to appear in the failure diff
+          missing: [],
+        });
+        expect(output.trim().length).toBeGreaterThan(200); // the trimmed output must be longer than 200 characters
+      },
+      360_000, // timeout argument for this test() (360000 ms = 6 minutes if milliseconds); each skill gets its own
+    );
+  }
+});