test: canonical CARVE_GUARDS registry; derive parity + size-budget from it

Single source of truth for the carved-skill set + per-skill invariants (EQ1). parity-harness.ts sectioned entries and skill-size-budget.ts SECTIONS_EXTRACTED now derive from it instead of hand-maintained lists. Closes a pre-existing drift: plan-devex-review was in SECTIONS_EXTRACTED but had no sectioned parity invariant; now generated. carve-guards.ts is a pure leaf data module (import type only) to avoid an import cycle. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-17 07:10:12 +02:00 · 2026-06-07 17:52:02 -07:00
parent cab774cced
commit da9f6852ea
3 changed files with 235 additions and 90 deletions
@@ -0,0 +1,200 @@
+/**
+ * Canonical carved-skill guard registry — the single source of truth for which
+ * skills are carved (skeleton SKILL.md + on-demand sections/*.md) and what each
+ * carve must guarantee.
+ *
+ * PURE LEAF DATA MODULE (codex outside-voice #1, refined-plan pass): this file
+ * has NO runtime imports — `import type` only. parity-harness.ts and
+ * skill-size-budget.test.ts derive their carved-skill lists FROM here (no
+ * parallel hand-maintained lists), so a runtime import back into either of them
+ * would create a cycle. Keep it data.
+ *
+ * Consumers:
+ *   - test/carve-section-ordering.test.ts   (E2, gate)  → staticInvariants
+ *   - test/carve-section-loading.test.ts    (T2, periodic) → requiredReads + scenario
+ *   - test/carve-guard-completeness.test.ts (E1, gate)  → the set must equal the
+ *                                                          filesystem carved set
+ *   - test/carve-guards-negative.test.ts    (ET1, gate) → injects a broken fixture
+ *   - test/helpers/parity-harness.ts        → sectioned/maxSkeletonBytes/minBytes/mustContain
+ *   - test/skill-size-budget.test.ts        → SECTIONS_EXTRACTED = CARVED_SKILLS
+ *
+ * Adding a carve = add one entry here (atomically, in the same commit as the
+ * skeleton + manifest + sections — codex #4 — so E1's bidirectional parity never
+ * false-positives mid-commit).
+ */
+
+/** Static (skeleton-shape) invariants the per-PR ordering guard (E2) asserts. */
+export interface CarveStaticInvariants {
+  /**
+   * Substrings that MUST remain in the always-loaded skeleton. Empty = skip
+   * (the skill has no distinctive pre-STOP anchor worth pinning beyond the
+   * universal STOP/section-index checks E2 already runs).
+   */
+  mustStayInSkeleton: string[];
+  /**
+   * Substrings that MUST be in the union (skeleton + sections) but MUST NOT be in
+   * the skeleton — i.e. the heavy body that the carve relocated. Empty = skip.
+   */
+  mustMoveToSection: string[];
+  /**
+   * If set, this marker must appear in the skeleton AFTER the last STOP-Read
+   * directive (e.g. the EXIT PLAN MODE GATE that fires once section work returns).
+   * Undefined = the skill has no post-STOP gate (operational/conversational carve).
+   */
+  gateAfterStop?: string;
+}
+
+export interface CarveGuard {
+  skill: string;
+  /** Section .md filenames the manifest lists and the skeleton must STOP-Read. */
+  expectedSections: string[];
+  /**
+   * Sections the behavioral test (T2) asserts the agent actually Read when driven
+   * by `scenario`. A non-empty subset of expectedSections — the ones the scenario
+   * is built to require. The registry owns this so "registered ⇒ asserted" is
+   * structural (codex #2), not policed.
+   */
+  requiredReads: string[];
+  /**
+   * Fixture prompt that drives a real `claude -p` run down the STOP-Read path for
+   * this skill (codex #7). The behavioral test asserts the run reached the STOP
+   * (read requiredReads), not merely that nothing was read.
+   */
+  scenario: string;
+  staticInvariants: CarveStaticInvariants;
+  /**
+   * How the behavioral guard (T2) exercises this skill:
+   *  - 'plan'     → write a PLAN.md fixture, run the review against it
+   *  - 'prompt'   → no fixture file; the scenario prompt alone drives the run
+   *  - 'external' → covered by a dedicated bespoke test (complex fixtures, e.g.
+   *                 ship's git/VERSION/CHANGELOG state). The data-driven loop
+   *                 skips it; E1 asserts `externalTest` exists instead.
+   */
+  behavioral: 'plan' | 'prompt' | 'external';
+  /** Required when behavioral === 'external': path (repo-relative) to the dedicated test. */
+  externalTest?: string;
+  /** Parity: max bytes for the always-loaded skeleton (asserts the carve shrank it). */
+  maxSkeletonBytes: number;
+  /** Parity: min bytes for the skeleton+sections union (total behavior preserved). */
+  minUnionBytes: number;
+  /** Parity: content phrases the union must preserve. */
+  mustContain: string[];
+}
+
+export const CARVE_GUARDS: Record<string, CarveGuard> = {
+  ship: {
+    skill: 'ship',
+    expectedSections: [
+      'tests.md',
+      'test-coverage.md',
+      'plan-completion.md',
+      'review-army.md',
+      'greptile.md',
+      'adversarial.md',
+      'changelog.md',
+      'pr-body.md',
+    ],
+    requiredReads: ['review-army.md', 'changelog.md'],
+    scenario:
+      'This is a FRESH version-changing ship: the branch has a real code change, VERSION still equals the base version (needs a bump), and CHANGELOG.md needs a new entry. Follow the skill flow for a version-changing ship: run the pre-landing review and prepare the CHANGELOG entry. Produce the ship plan / review report. Do NOT actually commit, push, or open a PR.',
+    staticInvariants: {
+      mustStayInSkeleton: [],
+      mustMoveToSection: [],
+      // ship is operational (multi-STOP, not a plan review); no single post-STOP gate.
+      gateAfterStop: undefined,
+    },
+    behavioral: 'external',
+    externalTest: 'test/skill-e2e-ship-section-loading.test.ts',
+    maxSkeletonBytes: 90_000,
+    minUnionBytes: 120_000,
+    mustContain: ['VERSION', 'CHANGELOG', 'review', 'merge', 'PR'],
+  },
+  'plan-ceo-review': {
+    skill: 'plan-ceo-review',
+    expectedSections: ['review-sections.md'],
+    requiredReads: ['review-sections.md'],
+    scenario:
+      'Review the plan in PLAN.md. Hold the current scope (HOLD SCOPE mode) — do not challenge or expand scope. Run the full CEO review and produce the review report.',
+    staticInvariants: {
+      mustStayInSkeleton: ['## Step 0: Nuclear Scope Challenge'],
+      mustMoveToSection: ['### Section 1: Architecture Review', '## Mode Quick Reference'],
+      gateAfterStop: 'EXIT PLAN MODE GATE',
+    },
+    behavioral: 'external',
+    externalTest: 'test/skill-e2e-plan-ceo-review-section-loading.test.ts',
+    maxSkeletonBytes: 90_000,
+    minUnionBytes: 80_000,
+    mustContain: ['SCOPE EXPANSION', 'SELECTIVE EXPANSION', 'HOLD SCOPE', 'SCOPE REDUCTION'],
+  },
+  'plan-eng-review': {
+    skill: 'plan-eng-review',
+    expectedSections: ['review-sections.md'],
+    requiredReads: ['review-sections.md'],
+    scenario:
+      'Review the plan in PLAN.md. Accept the current scope. Run the full engineering review (architecture, code quality, tests, performance) and produce the review report.',
+    staticInvariants: {
+      mustStayInSkeleton: ['### Step 0: Scope Challenge'],
+      mustMoveToSection: ['### 1. Architecture review'],
+      gateAfterStop: 'EXIT PLAN MODE GATE',
+    },
+    behavioral: 'plan',
+    maxSkeletonBytes: 62_000,
+    minUnionBytes: 70_000,
+    mustContain: ['Architecture', 'Code Quality', 'Test', 'Performance'],
+  },
+  'plan-design-review': {
+    skill: 'plan-design-review',
+    expectedSections: ['review-sections.md'],
+    requiredReads: ['review-sections.md'],
+    scenario:
+      'Review the plan in PLAN.md for design and UX. Accept the current scope. Run the full design review passes and produce the review report.',
+    staticInvariants: {
+      mustStayInSkeleton: [],
+      mustMoveToSection: ['### Pass 1: Information Architecture'],
+      gateAfterStop: 'EXIT PLAN MODE GATE',
+    },
+    behavioral: 'plan',
+    maxSkeletonBytes: 82_000,
+    minUnionBytes: 70_000,
+    mustContain: ['design', 'visual'],
+  },
+  'plan-devex-review': {
+    skill: 'plan-devex-review',
+    expectedSections: ['review-sections.md'],
+    requiredReads: ['review-sections.md'],
+    scenario:
+      'Review the plan in PLAN.md for developer experience. Accept the current scope. Run the full DX review passes and produce the review report.',
+    staticInvariants: {
+      mustStayInSkeleton: [],
+      mustMoveToSection: ['### Pass 1: Getting Started Experience'],
+      gateAfterStop: 'EXIT PLAN MODE GATE',
+    },
+    behavioral: 'plan',
+    maxSkeletonBytes: 76_000,
+    minUnionBytes: 70_000,
+    mustContain: ['developer experience', 'Getting Started'],
+  },
+  'office-hours': {
+    skill: 'office-hours',
+    expectedSections: ['design-and-handoff.md'],
+    requiredReads: ['design-and-handoff.md'],
+    scenario:
+      'Run office hours for this product idea through to the end: have the diagnostic conversation, explore alternatives, then write the design doc and run the relationship handoff (Phases 5-6).',
+    staticInvariants: {
+      mustStayInSkeleton: [],
+      mustMoveToSection: [],
+      // office-hours is conversational; the design-doc/handoff section has no
+      // post-STOP review gate in the skeleton.
+      gateAfterStop: undefined,
+    },
+    behavioral: 'prompt',
+    maxSkeletonBytes: 96_000,
+    minUnionBytes: 70_000,
+    mustContain: ['design doc', 'problem statement'],
+  },
+};
+
+/** Sorted carved-skill names. Consumers derive their lists from this — no parallel lists. */
+export const CARVED_SKILLS: readonly string[] = Object.freeze(
+  Object.keys(CARVE_GUARDS).sort(),
+);
@@ -22,6 +22,7 @@ import * as fs from 'fs';
 import * as path from 'path';
 import type { ParityBaseline, SkillBaselineEntry } from './capture-parity-baseline';
 import { captureBaseline } from './capture-parity-baseline';
+import { CARVE_GUARDS } from './carve-guards';

 export interface ParityInvariant {
  skill: string;
@@ -198,7 +199,12 @@ export function runParityChecks(opts: {
 * Each entry pins what must-not-break in a skill family. Extend as future
 * skills land. Phase B (v2.0.0.0) adds LLM-judge invariants on top of these.
 */
-export const PARITY_INVARIANTS: ParityInvariant[] = [
+/**
+ * Monolith (non-carved) invariants — hand-written. Carved-skill invariants are
+ * generated from CARVE_GUARDS below (single source of truth), so they never drift
+ * from the size-budget / static / behavioral guards.
+ */
+const MONOLITH_INVARIANTS: ParityInvariant[] = [
  {
    skill: 'cso',
    mustContain: ['OWASP', 'STRIDE', 'daily', 'comprehensive', 'verif'],
@@ -206,78 +212,6 @@ export const PARITY_INVARIANTS: ParityInvariant[] = [
    maxSizeRatio: 1.05,
    minBytes: 30_000,
  },
-  {
-    // Carved (v2 plan T9): skeleton SKILL.md + sections/*.md. Content checks run
-    // against the union (relocated phrases still count); size floors run against
-    // the union (total behavior preserved); maxSkeletonBytes asserts the
-    // always-loaded skeleton actually shrank from the ~167KB monolith.
-    skill: 'ship',
-    sectioned: true,
-    maxSkeletonBytes: 90_000,
-    mustContain: [
-      'VERSION',
-      'CHANGELOG',
-      'review',
-      'merge',
-      'PR',
-    ],
-    mustHaveHeadings: ['## Preamble', '## When to invoke'],
-    maxSizeRatio: 1.05,
-    minBytes: 120_000,
-  },
-  {
-    // Carved (v2 plan T9): skeleton SKILL.md + sections/review-sections.md.
-    // Content + size floors run against the union (relocated prose still counts);
-    // maxSkeletonBytes asserts the always-loaded skeleton shrank from the ~138KB
-    // monolith to ~81KB (measured 80,731 B, -42%). Headroom to 90KB so a small
-    // skeleton edit doesn't trip CI, but a 10KB regression does.
-    skill: 'plan-ceo-review',
-    sectioned: true,
-    maxSkeletonBytes: 90_000,
-    mustContain: [
-      'SCOPE EXPANSION',
-      'SELECTIVE EXPANSION',
-      'HOLD SCOPE',
-      'SCOPE REDUCTION',
-    ],
-    mustHaveHeadings: ['## Preamble', '## When to invoke'],
-    maxSizeRatio: 1.05,
-    minBytes: 80_000,
-  },
-  {
-    // Carved (v2 plan T9): skeleton + sections/review-sections.md. The 4-section
-    // review, outside voice, and required outputs moved to the section; content
-    // checks run against the union. Skeleton shrank 106,984 -> 54,892 B (-48.7%);
-    // maxSkeletonBytes 62KB = measured + headroom.
-    skill: 'plan-eng-review',
-    sectioned: true,
-    maxSkeletonBytes: 62_000,
-    mustContain: [
-      'Architecture',
-      'Code Quality',
-      'Test',
-      'Performance',
-    ],
-    mustHaveHeadings: ['## Preamble', '## When to invoke'],
-    maxSizeRatio: 1.05,
-    minBytes: 70_000,
-  },
-  {
-    // Carved (v2 plan T9): skeleton + sections/review-sections.md. The 7 design
-    // passes + required outputs moved to the section; content checks run against
-    // the union. Skeleton shrank 112,057 -> 76,024 B (-32.2%); maxSkeletonBytes
-    // 82KB = measured + headroom.
-    skill: 'plan-design-review',
-    sectioned: true,
-    maxSkeletonBytes: 82_000,
-    mustContain: [
-      'design',
-      'visual',
-    ],
-    mustHaveHeadings: ['## Preamble', '## When to invoke'],
-    maxSizeRatio: 1.05,
-    minBytes: 70_000,
-  },
  {
    skill: 'review',
    mustContain: ['confidence', 'P1', 'P2'],
@@ -299,21 +233,6 @@ export const PARITY_INVARIANTS: ParityInvariant[] = [
    maxSizeRatio: 1.05,
    minBytes: 30_000,
  },
-  {
-    // Carved (v2 plan T9): skeleton SKILL.md + sections/design-and-handoff.md.
-    // Phase 5 (design doc) + Phase 6 (handoff) moved into the section, so
-    // 'design doc' / 'problem statement' now live there — content checks run
-    // against the union. maxSkeletonBytes asserts the always-loaded skeleton
-    // shrank from the ~118KB monolith to ~89KB (measured 88,975 B, -24.8%);
-    // headroom to 96KB so a small skeleton edit doesn't trip CI.
-    skill: 'office-hours',
-    sectioned: true,
-    maxSkeletonBytes: 96_000,
-    mustContain: ['design doc', 'problem statement'],
-    mustHaveHeadings: ['## Preamble', '## When to invoke'],
-    maxSizeRatio: 1.05,
-    minBytes: 70_000,
-  },
  {
    skill: 'autoplan',
    mustContain: ['ceo', 'eng', 'design'],
@@ -322,3 +241,27 @@ export const PARITY_INVARIANTS: ParityInvariant[] = [
    minBytes: 70_000,
  },
 ];
+
+/**
+ * Carved-skill invariants, GENERATED from the canonical CARVE_GUARDS registry
+ * (EQ1: single source of truth). Each carve's skeleton-shrink floor
+ * (maxSkeletonBytes), union floor (minUnionBytes), and content invariants
+ * (mustContain) live in carve-guards.ts; this just projects them into the parity
+ * shape. Adding a carve there auto-adds its union guard here — which is how
+ * plan-devex-review (previously in SECTIONS_EXTRACTED but missing a sectioned
+ * parity invariant) is now guarded.
+ */
+const CARVED_INVARIANTS: ParityInvariant[] = Object.values(CARVE_GUARDS).map((g) => ({
+  skill: g.skill,
+  sectioned: true,
+  maxSkeletonBytes: g.maxSkeletonBytes,
+  minBytes: g.minUnionBytes,
+  mustContain: g.mustContain,
+  mustHaveHeadings: ['## Preamble', '## When to invoke'],
+  maxSizeRatio: 1.05,
+}));
+
+export const PARITY_INVARIANTS: ParityInvariant[] = [
+  ...MONOLITH_INVARIANTS,
+  ...CARVED_INVARIANTS,
+];
@@ -33,6 +33,7 @@ import * as fs from 'fs';
 import * as path from 'path';
 import { captureBaseline, type ParityBaseline } from './helpers/capture-parity-baseline';
 import { logBudgetOverride } from './helpers/budget-override';
+import { CARVED_SKILLS } from './helpers/carve-guards';

 const REPO_ROOT = path.resolve(import.meta.dir, '..');
 const BASELINE_PATH = path.join(REPO_ROOT, 'test', 'fixtures', 'parity-baseline-v1.47.0.0.json');
@@ -161,9 +162,10 @@ describe('SKILL.md size budget regression (gate, free)', () => {
    const MIN_RATIO = 0.80; // a skill at <80% of its v1.44 size signals mass-deletion
    // Carved skills (v2 plan T9): the skeleton SKILL.md intentionally shrinks
    // because prose moved into sections/*.md. The union size is guarded instead
-    // by the sectioned ship invariant in parity-harness.ts (minBytes on the
+    // by the sectioned invariant in parity-harness.ts (minBytes on the
    // skeleton+sections union), so exempt the skeleton from the body-strip floor.
-    const SECTIONS_EXTRACTED = new Set<string>(['ship', 'plan-ceo-review', 'office-hours', 'plan-eng-review', 'plan-design-review', 'plan-devex-review']);
+    // EQ1: derived from the canonical CARVE_GUARDS registry — no parallel list.
+    const SECTIONS_EXTRACTED = new Set<string>(CARVED_SKILLS);

    const undershoots: Array<{
      skill: string; beforeBytes: number; afterBytes: number; ratio: number;