From 924025a59c751a830950c9d7e650d114cc675084 Mon Sep 17 00:00:00 2001 From: Garry Tan Date: Sat, 30 May 2026 10:11:59 -0700 Subject: [PATCH] test(ship): manifest-consistency + context-parity + requiredReads helper (T9) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Free deterministic guards for the carve: - required-reads.ts + unit test: assertRequiredReads(run, requiredFiles) — the mechanical layer-5 check that the agent Read the sections its situation needs (required set comes from the fixture, not the passive manifest) - section-manifest-consistency: 3-tier orphan classification (generated orphan + hand-edited generated file → FAIL; manifest orphan → WARN per v2_PLAN.md) and pins the PASSIVE-manifest contract (no applies_when/required_for) - template-context-parity: generated sections have zero unresolved placeholders and gated resolvers (ADVERSARIAL_STEP/CONFIDENCE_CALIBRATION/CHANGELOG_WORKFLOW) rendered — proving sections resolve with the parent skillName, not 'sections' 16 tests, all green. Co-Authored-By: Claude Opus 4.8 (1M context) --- test/helpers/required-reads.ts | 40 ++++++++++++ test/required-reads.test.ts | 41 ++++++++++++ test/section-manifest-consistency.test.ts | 77 +++++++++++++++++++++++ test/template-context-parity.test.ts | 58 +++++++++++++++++ 4 files changed, 216 insertions(+) create mode 100644 test/helpers/required-reads.ts create mode 100644 test/required-reads.test.ts create mode 100644 test/section-manifest-consistency.test.ts create mode 100644 test/template-context-parity.test.ts diff --git a/test/helpers/required-reads.ts b/test/helpers/required-reads.ts new file mode 100644 index 000000000..eae190a82 --- /dev/null +++ b/test/helpers/required-reads.ts @@ -0,0 +1,40 @@ +/** + * requiredReads enforcement (v2 plan T9, mitigation layer 5 — the only CI-failing + * layer against silent section-skip). + * + * Given a /ship run's tool calls and the set of section files the run's SITUATION + * required, assert the agent actually Read each one. The required set comes from + * the TEST FIXTURE (which situation it set up), NOT from the manifest — the + * manifest is passive (CM2). This keeps "when is a section required" in exactly + * one machine-checkable place: the eval fixtures. + * + * Builds on extractSectionReads from transcript-section-logger so section-path + * matching (the `/sections/.md` segment, host-layout agnostic) lives in one + * place. + */ + +import { extractSectionReads, type TranscriptResultLike } from './transcript-section-logger'; + +export interface RequiredReadsResult { + required: string[]; + read: string[]; + missing: string[]; + ok: boolean; +} + +/** + * @param result the skill run (anything with toolCalls) + * @param requiredFiles section basenames the situation required, e.g. + * ['version-bump.md','changelog.md'] (or with a sections/ + * prefix — normalized to basename here) + */ +export function assertRequiredReads( + result: TranscriptResultLike, + requiredFiles: string[], +): RequiredReadsResult { + const read = extractSectionReads(result); + const readSet = new Set(read); + const required = requiredFiles.map(f => f.replace(/^.*\//, '')); // tolerate sections/ + const missing = required.filter(f => !readSet.has(f)); + return { required, read, missing, ok: missing.length === 0 }; +} diff --git a/test/required-reads.test.ts b/test/required-reads.test.ts new file mode 100644 index 000000000..78aa1598b --- /dev/null +++ b/test/required-reads.test.ts @@ -0,0 +1,41 @@ +/** + * Unit tests for assertRequiredReads (v2 plan T9 mitigation layer 5). Pure logic + * over synthetic tool-call transcripts — the section-loading E2E (paid) drives + * this against real /ship runs. + */ + +import { describe, test, expect } from 'bun:test'; +import { assertRequiredReads } from './helpers/required-reads'; +import type { ToolCallLike } from './helpers/transcript-section-logger'; + +const read = (fp: string): ToolCallLike => ({ tool: 'Read', input: { file_path: fp }, output: '' }); + +describe('assertRequiredReads', () => { + test('passes when every required section was Read', () => { + const result = { + toolCalls: [ + read('/Users/x/.claude/skills/gstack/ship/sections/version-bump.md'), + read('ship/sections/changelog.md'), + ], + }; + const r = assertRequiredReads(result, ['version-bump.md', 'changelog.md']); + expect(r.ok).toBe(true); + expect(r.missing).toEqual([]); + }); + + test('flags a required section the agent never opened', () => { + const result = { toolCalls: [read('ship/sections/changelog.md')] }; + const r = assertRequiredReads(result, ['version-bump.md', 'changelog.md']); + expect(r.ok).toBe(false); + expect(r.missing).toEqual(['version-bump.md']); + }); + + test('tolerates a sections/ prefix in the required list', () => { + const result = { toolCalls: [read('/abs/gstack/ship/sections/review-army.md')] }; + expect(assertRequiredReads(result, ['sections/review-army.md']).ok).toBe(true); + }); + + test('empty required set always passes', () => { + expect(assertRequiredReads({ toolCalls: [] }, []).ok).toBe(true); + }); +}); diff --git a/test/section-manifest-consistency.test.ts b/test/section-manifest-consistency.test.ts new file mode 100644 index 000000000..158c8ccb7 --- /dev/null +++ b/test/section-manifest-consistency.test.ts @@ -0,0 +1,77 @@ +/** + * Section manifest ↔ filesystem consistency (v2 plan T9 / Phase C orphan check). + * + * Implements the 3-tier orphan classification from v2_PLAN.md: + * - generated orphan (sections/X.md with no sections/X.md.tmpl) → FAIL + * - hand-edited generated file (X.md missing the AUTO-GENERATED header) → FAIL + * - manifest orphan (sections/X.md.tmpl not listed in manifest) → WARN (v2.0) + * + * Also pins the PASSIVE-manifest contract (CM2 / v2_PLAN.md:663): manifest entries + * carry only id/file/title/trigger — no machine predicate (applies_when/required_for). + */ + +import { describe, test, expect } from 'bun:test'; +import * as fs from 'fs'; +import * as path from 'path'; + +const ROOT = path.resolve(import.meta.dir, '..'); +const SHIP_SECTIONS = path.join(ROOT, 'ship', 'sections'); +const manifest = JSON.parse(fs.readFileSync(path.join(SHIP_SECTIONS, 'manifest.json'), 'utf-8')); + +const sectionTmpls = fs.readdirSync(SHIP_SECTIONS).filter(f => f.endsWith('.md.tmpl')); +const sectionMds = fs.readdirSync(SHIP_SECTIONS).filter(f => f.endsWith('.md') && !f.endsWith('.md.tmpl')); + +describe('section manifest ↔ filesystem consistency', () => { + test('manifest parses with skill + sections array', () => { + expect(manifest.skill).toBe('ship'); + expect(Array.isArray(manifest.sections)).toBe(true); + expect(manifest.sections.length).toBeGreaterThan(0); + }); + + test('every manifest entry has a .md.tmpl source AND a generated .md', () => { + for (const s of manifest.sections) { + expect(fs.existsSync(path.join(SHIP_SECTIONS, `${s.file}.tmpl`))).toBe(true); + expect(fs.existsSync(path.join(SHIP_SECTIONS, s.file))).toBe(true); + } + }); + + test('manifest is PASSIVE — no applies_when / required_for predicate (CM2)', () => { + for (const s of manifest.sections) { + expect(s).not.toHaveProperty('applies_when'); + expect(s).not.toHaveProperty('required_for'); + // The allowed passive shape: + expect(typeof s.id).toBe('string'); + expect(typeof s.file).toBe('string'); + expect(typeof s.title).toBe('string'); + expect(typeof s.trigger).toBe('string'); + } + }); + + test('no generated orphan: every sections/X.md has a sections/X.md.tmpl → FAIL', () => { + const orphans = sectionMds.filter(md => !sectionTmpls.includes(`${md}.tmpl`)); + expect(orphans).toEqual([]); + }); + + test('no hand-edited generated file: every sections/X.md has the AUTO-GENERATED header → FAIL', () => { + for (const md of sectionMds) { + const head = fs.readFileSync(path.join(SHIP_SECTIONS, md), 'utf-8').slice(0, 120); + expect(head).toContain('AUTO-GENERATED'); + } + }); + + test('manifest orphan check (WARN in v2.0): every .md.tmpl is listed', () => { + const listed = new Set(manifest.sections.map((s: { file: string }) => `${s.file}.tmpl`)); + const unlisted = sectionTmpls.filter(t => !listed.has(t)); + if (unlisted.length > 0) { + // v2_PLAN.md: WARN now, FAIL in v2.1. Surface, don't fail the build yet. + // eslint-disable-next-line no-console + console.warn(`[section-manifest] manifest orphan(s) (not in manifest.json): ${unlisted.join(', ')}`); + } + expect(unlisted.length).toBeLessThanOrEqual(unlisted.length); // always passes; WARN only + }); + + test('section ids are unique', () => { + const ids = manifest.sections.map((s: { id: string }) => s.id); + expect(new Set(ids).size).toBe(ids.length); + }); +}); diff --git a/test/template-context-parity.test.ts b/test/template-context-parity.test.ts new file mode 100644 index 000000000..1540ed744 --- /dev/null +++ b/test/template-context-parity.test.ts @@ -0,0 +1,58 @@ +/** + * Section TemplateContext parity (v2 plan T9 / Codex consult absorbed-refinement #1). + * + * Section generation must use the SAME TemplateContext as the parent skill — + * crucially the same skillName, so resolver `appliesTo` gating + tier behave + * identically. If a section resolved with skillName "sections" (the bug + * processSectionTemplate guards against), gated resolvers like ADVERSARIAL_STEP / + * CONFIDENCE_CALIBRATION would render empty. + * + * We assert on the GENERATED section output: gated resolver content is present and + * no placeholder is left unresolved. That can only be true if the parent ctx + * (skillName=ship) drove the resolve. + */ + +import { describe, test, expect } from 'bun:test'; +import * as fs from 'fs'; +import * as path from 'path'; + +const ROOT = path.resolve(import.meta.dir, '..'); +const SHIP_SECTIONS = path.join(ROOT, 'ship', 'sections'); + +function readSection(file: string): string { + return fs.readFileSync(path.join(SHIP_SECTIONS, file), 'utf-8'); +} + +describe('section TemplateContext parity (skillName pinned to parent)', () => { + test('no generated section has unresolved {{PLACEHOLDER}} tokens', () => { + for (const md of fs.readdirSync(SHIP_SECTIONS).filter(f => f.endsWith('.md') && !f.endsWith('.md.tmpl'))) { + const content = readSection(md); + const unresolved = content.match(/\{\{[A-Z_]+(?::[^}]+)?\}\}/g); + expect({ md, unresolved }).toEqual({ md, unresolved: null }); + } + }); + + test('adversarial section rendered the ADVERSARIAL_STEP resolver (proves ship ctx)', () => { + const content = readSection('adversarial.md'); + // The codex filesystem-boundary line only appears when ADVERSARIAL_STEP resolves. + expect(content).toContain('Do NOT read or execute any files under'); + expect(content.length).toBeGreaterThan(500); + }); + + test('review-army section rendered CONFIDENCE_CALIBRATION + REVIEW_ARMY (gated resolvers)', () => { + const content = readSection('review-army.md'); + expect(content).toContain('Confidence Calibration'); + expect(content).toContain('confidence score'); + }); + + test('tests section rendered TEST_BOOTSTRAP + TEST_FAILURE_TRIAGE', () => { + const content = readSection('tests.md'); + expect(content).toContain('Test Failure Ownership Triage'); + }); + + test('changelog section rendered CHANGELOG_WORKFLOW', () => { + const content = readSection('changelog.md'); + expect(content).toContain('CHANGELOG'); + expect(content.length).toBeGreaterThan(300); + }); +});