From da9f6852eacda135bca257606de555071e3274a2 Mon Sep 17 00:00:00 2001 From: Garry Tan Date: Sun, 7 Jun 2026 17:52:02 -0700 Subject: [PATCH] test: canonical CARVE_GUARDS registry; derive parity + size-budget from it Single source of truth for the carved-skill set + per-skill invariants (EQ1). parity-harness.ts sectioned entries and skill-size-budget.ts SECTIONS_EXTRACTED now derive from it instead of hand-maintained lists. Closes a pre-existing drift: plan-devex-review was in SECTIONS_EXTRACTED but had no sectioned parity invariant; now generated. carve-guards.ts is a pure leaf data module (import type only) to avoid an import cycle. Co-Authored-By: Claude Opus 4.8 (1M context) --- test/helpers/carve-guards.ts | 200 +++++++++++++++++++++++++++++++++ test/helpers/parity-harness.ts | 119 +++++--------------- test/skill-size-budget.test.ts | 6 +- 3 files changed, 235 insertions(+), 90 deletions(-) create mode 100644 test/helpers/carve-guards.ts diff --git a/test/helpers/carve-guards.ts b/test/helpers/carve-guards.ts new file mode 100644 index 000000000..8dfd04a04 --- /dev/null +++ b/test/helpers/carve-guards.ts @@ -0,0 +1,200 @@ +/** + * Canonical carved-skill guard registry — the single source of truth for which + * skills are carved (skeleton SKILL.md + on-demand sections/*.md) and what each + * carve must guarantee. + * + * PURE LEAF DATA MODULE (codex outside-voice #1, refined-plan pass): this file + * has NO runtime imports — `import type` only. parity-harness.ts and + * skill-size-budget.test.ts derive their carved-skill lists FROM here (no + * parallel hand-maintained lists), so a runtime import back into either of them + * would create a cycle. Keep it data. + * + * Consumers: + * - test/carve-section-ordering.test.ts (E2, gate) → staticInvariants + * - test/carve-section-loading.test.ts (T2, periodic) → requiredReads + scenario + * - test/carve-guard-completeness.test.ts (E1, gate) → the set must equal the + * filesystem carved set + * - test/carve-guards-negative.test.ts (ET1, gate) → injects a broken fixture + * - test/helpers/parity-harness.ts → sectioned/maxSkeletonBytes/minBytes/mustContain + * - test/skill-size-budget.test.ts → SECTIONS_EXTRACTED = CARVED_SKILLS + * + * Adding a carve = add one entry here (atomically, in the same commit as the + * skeleton + manifest + sections — codex #4 — so E1's bidirectional parity never + * false-positives mid-commit). + */ + +/** Static (skeleton-shape) invariants the per-PR ordering guard (E2) asserts. */ +export interface CarveStaticInvariants { + /** + * Substrings that MUST remain in the always-loaded skeleton. Empty = skip + * (the skill has no distinctive pre-STOP anchor worth pinning beyond the + * universal STOP/section-index checks E2 already runs). + */ + mustStayInSkeleton: string[]; + /** + * Substrings that MUST be in the union (skeleton + sections) but MUST NOT be in + * the skeleton — i.e. the heavy body that the carve relocated. Empty = skip. + */ + mustMoveToSection: string[]; + /** + * If set, this marker must appear in the skeleton AFTER the last STOP-Read + * directive (e.g. the EXIT PLAN MODE GATE that fires once section work returns). + * Undefined = the skill has no post-STOP gate (operational/conversational carve). + */ + gateAfterStop?: string; +} + +export interface CarveGuard { + skill: string; + /** Section .md filenames the manifest lists and the skeleton must STOP-Read. */ + expectedSections: string[]; + /** + * Sections the behavioral test (T2) asserts the agent actually Read when driven + * by `scenario`. A non-empty subset of expectedSections — the ones the scenario + * is built to require. The registry owns this so "registered ⇒ asserted" is + * structural (codex #2), not policed. + */ + requiredReads: string[]; + /** + * Fixture prompt that drives a real `claude -p` run down the STOP-Read path for + * this skill (codex #7). The behavioral test asserts the run reached the STOP + * (read requiredReads), not merely that nothing was read. + */ + scenario: string; + staticInvariants: CarveStaticInvariants; + /** + * How the behavioral guard (T2) exercises this skill: + * - 'plan' → write a PLAN.md fixture, run the review against it + * - 'prompt' → no fixture file; the scenario prompt alone drives the run + * - 'external' → covered by a dedicated bespoke test (complex fixtures, e.g. + * ship's git/VERSION/CHANGELOG state). The data-driven loop + * skips it; E1 asserts `externalTest` exists instead. + */ + behavioral: 'plan' | 'prompt' | 'external'; + /** Required when behavioral === 'external': path (repo-relative) to the dedicated test. */ + externalTest?: string; + /** Parity: max bytes for the always-loaded skeleton (asserts the carve shrank it). */ + maxSkeletonBytes: number; + /** Parity: min bytes for the skeleton+sections union (total behavior preserved). */ + minUnionBytes: number; + /** Parity: content phrases the union must preserve. */ + mustContain: string[]; +} + +export const CARVE_GUARDS: Record = { + ship: { + skill: 'ship', + expectedSections: [ + 'tests.md', + 'test-coverage.md', + 'plan-completion.md', + 'review-army.md', + 'greptile.md', + 'adversarial.md', + 'changelog.md', + 'pr-body.md', + ], + requiredReads: ['review-army.md', 'changelog.md'], + scenario: + 'This is a FRESH version-changing ship: the branch has a real code change, VERSION still equals the base version (needs a bump), and CHANGELOG.md needs a new entry. Follow the skill flow for a version-changing ship: run the pre-landing review and prepare the CHANGELOG entry. Produce the ship plan / review report. Do NOT actually commit, push, or open a PR.', + staticInvariants: { + mustStayInSkeleton: [], + mustMoveToSection: [], + // ship is operational (multi-STOP, not a plan review); no single post-STOP gate. + gateAfterStop: undefined, + }, + behavioral: 'external', + externalTest: 'test/skill-e2e-ship-section-loading.test.ts', + maxSkeletonBytes: 90_000, + minUnionBytes: 120_000, + mustContain: ['VERSION', 'CHANGELOG', 'review', 'merge', 'PR'], + }, + 'plan-ceo-review': { + skill: 'plan-ceo-review', + expectedSections: ['review-sections.md'], + requiredReads: ['review-sections.md'], + scenario: + 'Review the plan in PLAN.md. Hold the current scope (HOLD SCOPE mode) — do not challenge or expand scope. Run the full CEO review and produce the review report.', + staticInvariants: { + mustStayInSkeleton: ['## Step 0: Nuclear Scope Challenge'], + mustMoveToSection: ['### Section 1: Architecture Review', '## Mode Quick Reference'], + gateAfterStop: 'EXIT PLAN MODE GATE', + }, + behavioral: 'external', + externalTest: 'test/skill-e2e-plan-ceo-review-section-loading.test.ts', + maxSkeletonBytes: 90_000, + minUnionBytes: 80_000, + mustContain: ['SCOPE EXPANSION', 'SELECTIVE EXPANSION', 'HOLD SCOPE', 'SCOPE REDUCTION'], + }, + 'plan-eng-review': { + skill: 'plan-eng-review', + expectedSections: ['review-sections.md'], + requiredReads: ['review-sections.md'], + scenario: + 'Review the plan in PLAN.md. Accept the current scope. Run the full engineering review (architecture, code quality, tests, performance) and produce the review report.', + staticInvariants: { + mustStayInSkeleton: ['### Step 0: Scope Challenge'], + mustMoveToSection: ['### 1. Architecture review'], + gateAfterStop: 'EXIT PLAN MODE GATE', + }, + behavioral: 'plan', + maxSkeletonBytes: 62_000, + minUnionBytes: 70_000, + mustContain: ['Architecture', 'Code Quality', 'Test', 'Performance'], + }, + 'plan-design-review': { + skill: 'plan-design-review', + expectedSections: ['review-sections.md'], + requiredReads: ['review-sections.md'], + scenario: + 'Review the plan in PLAN.md for design and UX. Accept the current scope. Run the full design review passes and produce the review report.', + staticInvariants: { + mustStayInSkeleton: [], + mustMoveToSection: ['### Pass 1: Information Architecture'], + gateAfterStop: 'EXIT PLAN MODE GATE', + }, + behavioral: 'plan', + maxSkeletonBytes: 82_000, + minUnionBytes: 70_000, + mustContain: ['design', 'visual'], + }, + 'plan-devex-review': { + skill: 'plan-devex-review', + expectedSections: ['review-sections.md'], + requiredReads: ['review-sections.md'], + scenario: + 'Review the plan in PLAN.md for developer experience. Accept the current scope. Run the full DX review passes and produce the review report.', + staticInvariants: { + mustStayInSkeleton: [], + mustMoveToSection: ['### Pass 1: Getting Started Experience'], + gateAfterStop: 'EXIT PLAN MODE GATE', + }, + behavioral: 'plan', + maxSkeletonBytes: 76_000, + minUnionBytes: 70_000, + mustContain: ['developer experience', 'Getting Started'], + }, + 'office-hours': { + skill: 'office-hours', + expectedSections: ['design-and-handoff.md'], + requiredReads: ['design-and-handoff.md'], + scenario: + 'Run office hours for this product idea through to the end: have the diagnostic conversation, explore alternatives, then write the design doc and run the relationship handoff (Phases 5-6).', + staticInvariants: { + mustStayInSkeleton: [], + mustMoveToSection: [], + // office-hours is conversational; the design-doc/handoff section has no + // post-STOP review gate in the skeleton. + gateAfterStop: undefined, + }, + behavioral: 'prompt', + maxSkeletonBytes: 96_000, + minUnionBytes: 70_000, + mustContain: ['design doc', 'problem statement'], + }, +}; + +/** Sorted carved-skill names. Consumers derive their lists from this — no parallel lists. */ +export const CARVED_SKILLS: readonly string[] = Object.freeze( + Object.keys(CARVE_GUARDS).sort(), +); diff --git a/test/helpers/parity-harness.ts b/test/helpers/parity-harness.ts index d50c609a0..010ab4993 100644 --- a/test/helpers/parity-harness.ts +++ b/test/helpers/parity-harness.ts @@ -22,6 +22,7 @@ import * as fs from 'fs'; import * as path from 'path'; import type { ParityBaseline, SkillBaselineEntry } from './capture-parity-baseline'; import { captureBaseline } from './capture-parity-baseline'; +import { CARVE_GUARDS } from './carve-guards'; export interface ParityInvariant { skill: string; @@ -198,7 +199,12 @@ export function runParityChecks(opts: { * Each entry pins what must-not-break in a skill family. Extend as future * skills land. Phase B (v2.0.0.0) adds LLM-judge invariants on top of these. */ -export const PARITY_INVARIANTS: ParityInvariant[] = [ +/** + * Monolith (non-carved) invariants — hand-written. Carved-skill invariants are + * generated from CARVE_GUARDS below (single source of truth), so they never drift + * from the size-budget / static / behavioral guards. + */ +const MONOLITH_INVARIANTS: ParityInvariant[] = [ { skill: 'cso', mustContain: ['OWASP', 'STRIDE', 'daily', 'comprehensive', 'verif'], @@ -206,78 +212,6 @@ export const PARITY_INVARIANTS: ParityInvariant[] = [ maxSizeRatio: 1.05, minBytes: 30_000, }, - { - // Carved (v2 plan T9): skeleton SKILL.md + sections/*.md. Content checks run - // against the union (relocated phrases still count); size floors run against - // the union (total behavior preserved); maxSkeletonBytes asserts the - // always-loaded skeleton actually shrank from the ~167KB monolith. - skill: 'ship', - sectioned: true, - maxSkeletonBytes: 90_000, - mustContain: [ - 'VERSION', - 'CHANGELOG', - 'review', - 'merge', - 'PR', - ], - mustHaveHeadings: ['## Preamble', '## When to invoke'], - maxSizeRatio: 1.05, - minBytes: 120_000, - }, - { - // Carved (v2 plan T9): skeleton SKILL.md + sections/review-sections.md. - // Content + size floors run against the union (relocated prose still counts); - // maxSkeletonBytes asserts the always-loaded skeleton shrank from the ~138KB - // monolith to ~81KB (measured 80,731 B, -42%). Headroom to 90KB so a small - // skeleton edit doesn't trip CI, but a 10KB regression does. - skill: 'plan-ceo-review', - sectioned: true, - maxSkeletonBytes: 90_000, - mustContain: [ - 'SCOPE EXPANSION', - 'SELECTIVE EXPANSION', - 'HOLD SCOPE', - 'SCOPE REDUCTION', - ], - mustHaveHeadings: ['## Preamble', '## When to invoke'], - maxSizeRatio: 1.05, - minBytes: 80_000, - }, - { - // Carved (v2 plan T9): skeleton + sections/review-sections.md. The 4-section - // review, outside voice, and required outputs moved to the section; content - // checks run against the union. Skeleton shrank 106,984 -> 54,892 B (-48.7%); - // maxSkeletonBytes 62KB = measured + headroom. - skill: 'plan-eng-review', - sectioned: true, - maxSkeletonBytes: 62_000, - mustContain: [ - 'Architecture', - 'Code Quality', - 'Test', - 'Performance', - ], - mustHaveHeadings: ['## Preamble', '## When to invoke'], - maxSizeRatio: 1.05, - minBytes: 70_000, - }, - { - // Carved (v2 plan T9): skeleton + sections/review-sections.md. The 7 design - // passes + required outputs moved to the section; content checks run against - // the union. Skeleton shrank 112,057 -> 76,024 B (-32.2%); maxSkeletonBytes - // 82KB = measured + headroom. - skill: 'plan-design-review', - sectioned: true, - maxSkeletonBytes: 82_000, - mustContain: [ - 'design', - 'visual', - ], - mustHaveHeadings: ['## Preamble', '## When to invoke'], - maxSizeRatio: 1.05, - minBytes: 70_000, - }, { skill: 'review', mustContain: ['confidence', 'P1', 'P2'], @@ -299,21 +233,6 @@ export const PARITY_INVARIANTS: ParityInvariant[] = [ maxSizeRatio: 1.05, minBytes: 30_000, }, - { - // Carved (v2 plan T9): skeleton SKILL.md + sections/design-and-handoff.md. - // Phase 5 (design doc) + Phase 6 (handoff) moved into the section, so - // 'design doc' / 'problem statement' now live there — content checks run - // against the union. maxSkeletonBytes asserts the always-loaded skeleton - // shrank from the ~118KB monolith to ~89KB (measured 88,975 B, -24.8%); - // headroom to 96KB so a small skeleton edit doesn't trip CI. - skill: 'office-hours', - sectioned: true, - maxSkeletonBytes: 96_000, - mustContain: ['design doc', 'problem statement'], - mustHaveHeadings: ['## Preamble', '## When to invoke'], - maxSizeRatio: 1.05, - minBytes: 70_000, - }, { skill: 'autoplan', mustContain: ['ceo', 'eng', 'design'], @@ -322,3 +241,27 @@ export const PARITY_INVARIANTS: ParityInvariant[] = [ minBytes: 70_000, }, ]; + +/** + * Carved-skill invariants, GENERATED from the canonical CARVE_GUARDS registry + * (EQ1: single source of truth). Each carve's skeleton-shrink floor + * (maxSkeletonBytes), union floor (minUnionBytes), and content invariants + * (mustContain) live in carve-guards.ts; this just projects them into the parity + * shape. Adding a carve there auto-adds its union guard here — which is how + * plan-devex-review (previously in SECTIONS_EXTRACTED but missing a sectioned + * parity invariant) is now guarded. + */ +const CARVED_INVARIANTS: ParityInvariant[] = Object.values(CARVE_GUARDS).map((g) => ({ + skill: g.skill, + sectioned: true, + maxSkeletonBytes: g.maxSkeletonBytes, + minBytes: g.minUnionBytes, + mustContain: g.mustContain, + mustHaveHeadings: ['## Preamble', '## When to invoke'], + maxSizeRatio: 1.05, +})); + +export const PARITY_INVARIANTS: ParityInvariant[] = [ + ...MONOLITH_INVARIANTS, + ...CARVED_INVARIANTS, +]; diff --git a/test/skill-size-budget.test.ts b/test/skill-size-budget.test.ts index 8ae5bd268..95d04d65c 100644 --- a/test/skill-size-budget.test.ts +++ b/test/skill-size-budget.test.ts @@ -33,6 +33,7 @@ import * as fs from 'fs'; import * as path from 'path'; import { captureBaseline, type ParityBaseline } from './helpers/capture-parity-baseline'; import { logBudgetOverride } from './helpers/budget-override'; +import { CARVED_SKILLS } from './helpers/carve-guards'; const REPO_ROOT = path.resolve(import.meta.dir, '..'); const BASELINE_PATH = path.join(REPO_ROOT, 'test', 'fixtures', 'parity-baseline-v1.47.0.0.json'); @@ -161,9 +162,10 @@ describe('SKILL.md size budget regression (gate, free)', () => { const MIN_RATIO = 0.80; // a skill at <80% of its v1.44 size signals mass-deletion // Carved skills (v2 plan T9): the skeleton SKILL.md intentionally shrinks // because prose moved into sections/*.md. The union size is guarded instead - // by the sectioned ship invariant in parity-harness.ts (minBytes on the + // by the sectioned invariant in parity-harness.ts (minBytes on the // skeleton+sections union), so exempt the skeleton from the body-strip floor. - const SECTIONS_EXTRACTED = new Set(['ship', 'plan-ceo-review', 'office-hours', 'plan-eng-review', 'plan-design-review', 'plan-devex-review']); + // EQ1: derived from the canonical CARVE_GUARDS registry — no parallel list. + const SECTIONS_EXTRACTED = new Set(CARVED_SKILLS); const undershoots: Array<{ skill: string; beforeBytes: number; afterBytes: number; ratio: number;