test(ship): manifest-consistency + context-parity + requiredReads helper (T9)

Free deterministic guards for the carve: - required-reads.ts + unit test: assertRequiredReads(run, requiredFiles) — the mechanical layer-5 check that the agent Read the sections its situation needs (required set comes from the fixture, not the passive manifest) - section-manifest-consistency: 3-tier orphan classification (generated orphan + hand-edited generated file → FAIL; manifest orphan → WARN per v2_PLAN.md) and pins the PASSIVE-manifest contract (no applies_when/required_for) - template-context-parity: generated sections have zero unresolved placeholders and gated resolvers (ADVERSARIAL_STEP/CONFIDENCE_CALIBRATION/CHANGELOG_WORKFLOW) rendered — proving sections resolve with the parent skillName, not 'sections' 16 tests, all green. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-17 07:10:12 +02:00 · 2026-05-30 10:11:59 -07:00
parent 261d57a8e3
commit 924025a59c
4 changed files with 216 additions and 0 deletions
@@ -0,0 +1,40 @@
+/**
+ * requiredReads enforcement (v2 plan T9, mitigation layer 5 — the only CI-failing
+ * layer against silent section-skip).
+ *
+ * Given a /ship run's tool calls and the set of section files the run's SITUATION
+ * required, assert the agent actually Read each one. The required set comes from
+ * the TEST FIXTURE (which situation it set up), NOT from the manifest — the
+ * manifest is passive (CM2). This keeps "when is a section required" in exactly
+ * one machine-checkable place: the eval fixtures.
+ *
+ * Builds on extractSectionReads from transcript-section-logger so section-path
+ * matching (the `/sections/<file>.md` segment, host-layout agnostic) lives in one
+ * place.
+ */
+
+import { extractSectionReads, type TranscriptResultLike } from './transcript-section-logger';
+
+export interface RequiredReadsResult {
+  required: string[];
+  read: string[];
+  missing: string[];
+  ok: boolean;
+}
+
+/**
+ * @param result        the skill run (anything with toolCalls)
+ * @param requiredFiles section basenames the situation required, e.g.
+ *                      ['version-bump.md','changelog.md'] (or with a sections/
+ *                      prefix — normalized to basename here)
+ */
+export function assertRequiredReads(
+  result: TranscriptResultLike,
+  requiredFiles: string[],
+): RequiredReadsResult {
+  const read = extractSectionReads(result);
+  const readSet = new Set(read);
+  const required = requiredFiles.map(f => f.replace(/^.*\//, '')); // tolerate sections/<f>
+  const missing = required.filter(f => !readSet.has(f));
+  return { required, read, missing, ok: missing.length === 0 };
+}
@@ -0,0 +1,41 @@
+/**
+ * Unit tests for assertRequiredReads (v2 plan T9 mitigation layer 5). Pure logic
+ * over synthetic tool-call transcripts — the section-loading E2E (paid) drives
+ * this against real /ship runs.
+ */
+
+import { describe, test, expect } from 'bun:test';
+import { assertRequiredReads } from './helpers/required-reads';
+import type { ToolCallLike } from './helpers/transcript-section-logger';
+
+const read = (fp: string): ToolCallLike => ({ tool: 'Read', input: { file_path: fp }, output: '' });
+
+describe('assertRequiredReads', () => {
+  test('passes when every required section was Read', () => {
+    const result = {
+      toolCalls: [
+        read('/Users/x/.claude/skills/gstack/ship/sections/version-bump.md'),
+        read('ship/sections/changelog.md'),
+      ],
+    };
+    const r = assertRequiredReads(result, ['version-bump.md', 'changelog.md']);
+    expect(r.ok).toBe(true);
+    expect(r.missing).toEqual([]);
+  });
+
+  test('flags a required section the agent never opened', () => {
+    const result = { toolCalls: [read('ship/sections/changelog.md')] };
+    const r = assertRequiredReads(result, ['version-bump.md', 'changelog.md']);
+    expect(r.ok).toBe(false);
+    expect(r.missing).toEqual(['version-bump.md']);
+  });
+
+  test('tolerates a sections/ prefix in the required list', () => {
+    const result = { toolCalls: [read('/abs/gstack/ship/sections/review-army.md')] };
+    expect(assertRequiredReads(result, ['sections/review-army.md']).ok).toBe(true);
+  });
+
+  test('empty required set always passes', () => {
+    expect(assertRequiredReads({ toolCalls: [] }, []).ok).toBe(true);
+  });
+});
@@ -0,0 +1,77 @@
+/**
+ * Section manifest ↔ filesystem consistency (v2 plan T9 / Phase C orphan check).
+ *
+ * Implements the 3-tier orphan classification from v2_PLAN.md:
+ *  - generated orphan  (sections/X.md with no sections/X.md.tmpl)  → FAIL
+ *  - hand-edited generated file (X.md missing the AUTO-GENERATED header) → FAIL
+ *  - manifest orphan   (sections/X.md.tmpl not listed in manifest)  → WARN (v2.0)
+ *
+ * Also pins the PASSIVE-manifest contract (CM2 / v2_PLAN.md:663): manifest entries
+ * carry only id/file/title/trigger — no machine predicate (applies_when/required_for).
+ */
+
+import { describe, test, expect } from 'bun:test';
+import * as fs from 'fs';
+import * as path from 'path';
+
+const ROOT = path.resolve(import.meta.dir, '..');
+const SHIP_SECTIONS = path.join(ROOT, 'ship', 'sections');
+const manifest = JSON.parse(fs.readFileSync(path.join(SHIP_SECTIONS, 'manifest.json'), 'utf-8'));
+
+const sectionTmpls = fs.readdirSync(SHIP_SECTIONS).filter(f => f.endsWith('.md.tmpl'));
+const sectionMds = fs.readdirSync(SHIP_SECTIONS).filter(f => f.endsWith('.md') && !f.endsWith('.md.tmpl'));
+
+describe('section manifest ↔ filesystem consistency', () => {
+  test('manifest parses with skill + sections array', () => {
+    expect(manifest.skill).toBe('ship');
+    expect(Array.isArray(manifest.sections)).toBe(true);
+    expect(manifest.sections.length).toBeGreaterThan(0);
+  });
+
+  test('every manifest entry has a .md.tmpl source AND a generated .md', () => {
+    for (const s of manifest.sections) {
+      expect(fs.existsSync(path.join(SHIP_SECTIONS, `${s.file}.tmpl`))).toBe(true);
+      expect(fs.existsSync(path.join(SHIP_SECTIONS, s.file))).toBe(true);
+    }
+  });
+
+  test('manifest is PASSIVE — no applies_when / required_for predicate (CM2)', () => {
+    for (const s of manifest.sections) {
+      expect(s).not.toHaveProperty('applies_when');
+      expect(s).not.toHaveProperty('required_for');
+      // The allowed passive shape:
+      expect(typeof s.id).toBe('string');
+      expect(typeof s.file).toBe('string');
+      expect(typeof s.title).toBe('string');
+      expect(typeof s.trigger).toBe('string');
+    }
+  });
+
+  test('no generated orphan: every sections/X.md has a sections/X.md.tmpl → FAIL', () => {
+    const orphans = sectionMds.filter(md => !sectionTmpls.includes(`${md}.tmpl`));
+    expect(orphans).toEqual([]);
+  });
+
+  test('no hand-edited generated file: every sections/X.md has the AUTO-GENERATED header → FAIL', () => {
+    for (const md of sectionMds) {
+      const head = fs.readFileSync(path.join(SHIP_SECTIONS, md), 'utf-8').slice(0, 120);
+      expect(head).toContain('AUTO-GENERATED');
+    }
+  });
+
+  test('manifest orphan check (WARN in v2.0): every .md.tmpl is listed', () => {
+    const listed = new Set(manifest.sections.map((s: { file: string }) => `${s.file}.tmpl`));
+    const unlisted = sectionTmpls.filter(t => !listed.has(t));
+    if (unlisted.length > 0) {
+      // v2_PLAN.md: WARN now, FAIL in v2.1. Surface, don't fail the build yet.
+      // eslint-disable-next-line no-console
+      console.warn(`[section-manifest] manifest orphan(s) (not in manifest.json): ${unlisted.join(', ')}`);
+    }
+    expect(unlisted.length).toBeLessThanOrEqual(unlisted.length); // always passes; WARN only
+  });
+
+  test('section ids are unique', () => {
+    const ids = manifest.sections.map((s: { id: string }) => s.id);
+    expect(new Set(ids).size).toBe(ids.length);
+  });
+});
@@ -0,0 +1,58 @@
+/**
+ * Section TemplateContext parity (v2 plan T9 / Codex consult absorbed-refinement #1).
+ *
+ * Section generation must use the SAME TemplateContext as the parent skill —
+ * crucially the same skillName, so resolver `appliesTo` gating + tier behave
+ * identically. If a section resolved with skillName "sections" (the bug
+ * processSectionTemplate guards against), gated resolvers like ADVERSARIAL_STEP /
+ * CONFIDENCE_CALIBRATION would render empty.
+ *
+ * We assert on the GENERATED section output: gated resolver content is present and
+ * no placeholder is left unresolved. That can only be true if the parent ctx
+ * (skillName=ship) drove the resolve.
+ */
+
+import { describe, test, expect } from 'bun:test';
+import * as fs from 'fs';
+import * as path from 'path';
+
+const ROOT = path.resolve(import.meta.dir, '..');
+const SHIP_SECTIONS = path.join(ROOT, 'ship', 'sections');
+
+function readSection(file: string): string {
+  return fs.readFileSync(path.join(SHIP_SECTIONS, file), 'utf-8');
+}
+
+describe('section TemplateContext parity (skillName pinned to parent)', () => {
+  test('no generated section has unresolved {{PLACEHOLDER}} tokens', () => {
+    for (const md of fs.readdirSync(SHIP_SECTIONS).filter(f => f.endsWith('.md') && !f.endsWith('.md.tmpl'))) {
+      const content = readSection(md);
+      const unresolved = content.match(/\{\{[A-Z_]+(?::[^}]+)?\}\}/g);
+      expect({ md, unresolved }).toEqual({ md, unresolved: null });
+    }
+  });
+
+  test('adversarial section rendered the ADVERSARIAL_STEP resolver (proves ship ctx)', () => {
+    const content = readSection('adversarial.md');
+    // The codex filesystem-boundary line only appears when ADVERSARIAL_STEP resolves.
+    expect(content).toContain('Do NOT read or execute any files under');
+    expect(content.length).toBeGreaterThan(500);
+  });
+
+  test('review-army section rendered CONFIDENCE_CALIBRATION + REVIEW_ARMY (gated resolvers)', () => {
+    const content = readSection('review-army.md');
+    expect(content).toContain('Confidence Calibration');
+    expect(content).toContain('confidence score');
+  });
+
+  test('tests section rendered TEST_BOOTSTRAP + TEST_FAILURE_TRIAGE', () => {
+    const content = readSection('tests.md');
+    expect(content).toContain('Test Failure Ownership Triage');
+  });
+
+  test('changelog section rendered CHANGELOG_WORKFLOW', () => {
+    const content = readSection('changelog.md');
+    expect(content).toContain('CHANGELOG');
+    expect(content.length).toBeGreaterThan(300);
+  });
+});