mirror of
https://github.com/garrytan/gstack.git
synced 2026-06-17 07:10:12 +02:00
test(ship): manifest-consistency + context-parity + requiredReads helper (T9)
Free deterministic guards for the carve: - required-reads.ts + unit test: assertRequiredReads(run, requiredFiles) — the mechanical layer-5 check that the agent Read the sections its situation needs (required set comes from the fixture, not the passive manifest) - section-manifest-consistency: 3-tier orphan classification (generated orphan + hand-edited generated file → FAIL; manifest orphan → WARN per v2_PLAN.md) and pins the PASSIVE-manifest contract (no applies_when/required_for) - template-context-parity: generated sections have zero unresolved placeholders and gated resolvers (ADVERSARIAL_STEP/CONFIDENCE_CALIBRATION/CHANGELOG_WORKFLOW) rendered — proving sections resolve with the parent skillName, not 'sections' 16 tests, all green. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,40 @@
|
||||
/**
|
||||
* requiredReads enforcement (v2 plan T9, mitigation layer 5 — the only CI-failing
|
||||
* layer against silent section-skip).
|
||||
*
|
||||
* Given a /ship run's tool calls and the set of section files the run's SITUATION
|
||||
* required, assert the agent actually Read each one. The required set comes from
|
||||
* the TEST FIXTURE (which situation it set up), NOT from the manifest — the
|
||||
* manifest is passive (CM2). This keeps "when is a section required" in exactly
|
||||
* one machine-checkable place: the eval fixtures.
|
||||
*
|
||||
* Builds on extractSectionReads from transcript-section-logger so section-path
|
||||
* matching (the `/sections/<file>.md` segment, host-layout agnostic) lives in one
|
||||
* place.
|
||||
*/
|
||||
|
||||
import { extractSectionReads, type TranscriptResultLike } from './transcript-section-logger';
|
||||
|
||||
export interface RequiredReadsResult {
|
||||
required: string[];
|
||||
read: string[];
|
||||
missing: string[];
|
||||
ok: boolean;
|
||||
}
|
||||
|
||||
/**
|
||||
* @param result the skill run (anything with toolCalls)
|
||||
* @param requiredFiles section basenames the situation required, e.g.
|
||||
* ['version-bump.md','changelog.md'] (or with a sections/
|
||||
* prefix — normalized to basename here)
|
||||
*/
|
||||
export function assertRequiredReads(
|
||||
result: TranscriptResultLike,
|
||||
requiredFiles: string[],
|
||||
): RequiredReadsResult {
|
||||
const read = extractSectionReads(result);
|
||||
const readSet = new Set(read);
|
||||
const required = requiredFiles.map(f => f.replace(/^.*\//, '')); // tolerate sections/<f>
|
||||
const missing = required.filter(f => !readSet.has(f));
|
||||
return { required, read, missing, ok: missing.length === 0 };
|
||||
}
|
||||
@@ -0,0 +1,41 @@
|
||||
/**
|
||||
* Unit tests for assertRequiredReads (v2 plan T9 mitigation layer 5). Pure logic
|
||||
* over synthetic tool-call transcripts — the section-loading E2E (paid) drives
|
||||
* this against real /ship runs.
|
||||
*/
|
||||
|
||||
import { describe, test, expect } from 'bun:test';
|
||||
import { assertRequiredReads } from './helpers/required-reads';
|
||||
import type { ToolCallLike } from './helpers/transcript-section-logger';
|
||||
|
||||
const read = (fp: string): ToolCallLike => ({ tool: 'Read', input: { file_path: fp }, output: '' });
|
||||
|
||||
describe('assertRequiredReads', () => {
|
||||
test('passes when every required section was Read', () => {
|
||||
const result = {
|
||||
toolCalls: [
|
||||
read('/Users/x/.claude/skills/gstack/ship/sections/version-bump.md'),
|
||||
read('ship/sections/changelog.md'),
|
||||
],
|
||||
};
|
||||
const r = assertRequiredReads(result, ['version-bump.md', 'changelog.md']);
|
||||
expect(r.ok).toBe(true);
|
||||
expect(r.missing).toEqual([]);
|
||||
});
|
||||
|
||||
test('flags a required section the agent never opened', () => {
|
||||
const result = { toolCalls: [read('ship/sections/changelog.md')] };
|
||||
const r = assertRequiredReads(result, ['version-bump.md', 'changelog.md']);
|
||||
expect(r.ok).toBe(false);
|
||||
expect(r.missing).toEqual(['version-bump.md']);
|
||||
});
|
||||
|
||||
test('tolerates a sections/ prefix in the required list', () => {
|
||||
const result = { toolCalls: [read('/abs/gstack/ship/sections/review-army.md')] };
|
||||
expect(assertRequiredReads(result, ['sections/review-army.md']).ok).toBe(true);
|
||||
});
|
||||
|
||||
test('empty required set always passes', () => {
|
||||
expect(assertRequiredReads({ toolCalls: [] }, []).ok).toBe(true);
|
||||
});
|
||||
});
|
||||
@@ -0,0 +1,77 @@
|
||||
/**
|
||||
* Section manifest ↔ filesystem consistency (v2 plan T9 / Phase C orphan check).
|
||||
*
|
||||
* Implements the 3-tier orphan classification from v2_PLAN.md:
|
||||
* - generated orphan (sections/X.md with no sections/X.md.tmpl) → FAIL
|
||||
* - hand-edited generated file (X.md missing the AUTO-GENERATED header) → FAIL
|
||||
* - manifest orphan (sections/X.md.tmpl not listed in manifest) → WARN (v2.0)
|
||||
*
|
||||
* Also pins the PASSIVE-manifest contract (CM2 / v2_PLAN.md:663): manifest entries
|
||||
* carry only id/file/title/trigger — no machine predicate (applies_when/required_for).
|
||||
*/
|
||||
|
||||
import { describe, test, expect } from 'bun:test';
|
||||
import * as fs from 'fs';
|
||||
import * as path from 'path';
|
||||
|
||||
const ROOT = path.resolve(import.meta.dir, '..');
|
||||
const SHIP_SECTIONS = path.join(ROOT, 'ship', 'sections');
|
||||
const manifest = JSON.parse(fs.readFileSync(path.join(SHIP_SECTIONS, 'manifest.json'), 'utf-8'));
|
||||
|
||||
const sectionTmpls = fs.readdirSync(SHIP_SECTIONS).filter(f => f.endsWith('.md.tmpl'));
|
||||
const sectionMds = fs.readdirSync(SHIP_SECTIONS).filter(f => f.endsWith('.md') && !f.endsWith('.md.tmpl'));
|
||||
|
||||
describe('section manifest ↔ filesystem consistency', () => {
|
||||
test('manifest parses with skill + sections array', () => {
|
||||
expect(manifest.skill).toBe('ship');
|
||||
expect(Array.isArray(manifest.sections)).toBe(true);
|
||||
expect(manifest.sections.length).toBeGreaterThan(0);
|
||||
});
|
||||
|
||||
test('every manifest entry has a .md.tmpl source AND a generated .md', () => {
|
||||
for (const s of manifest.sections) {
|
||||
expect(fs.existsSync(path.join(SHIP_SECTIONS, `${s.file}.tmpl`))).toBe(true);
|
||||
expect(fs.existsSync(path.join(SHIP_SECTIONS, s.file))).toBe(true);
|
||||
}
|
||||
});
|
||||
|
||||
test('manifest is PASSIVE — no applies_when / required_for predicate (CM2)', () => {
|
||||
for (const s of manifest.sections) {
|
||||
expect(s).not.toHaveProperty('applies_when');
|
||||
expect(s).not.toHaveProperty('required_for');
|
||||
// The allowed passive shape:
|
||||
expect(typeof s.id).toBe('string');
|
||||
expect(typeof s.file).toBe('string');
|
||||
expect(typeof s.title).toBe('string');
|
||||
expect(typeof s.trigger).toBe('string');
|
||||
}
|
||||
});
|
||||
|
||||
test('no generated orphan: every sections/X.md has a sections/X.md.tmpl → FAIL', () => {
|
||||
const orphans = sectionMds.filter(md => !sectionTmpls.includes(`${md}.tmpl`));
|
||||
expect(orphans).toEqual([]);
|
||||
});
|
||||
|
||||
test('no hand-edited generated file: every sections/X.md has the AUTO-GENERATED header → FAIL', () => {
|
||||
for (const md of sectionMds) {
|
||||
const head = fs.readFileSync(path.join(SHIP_SECTIONS, md), 'utf-8').slice(0, 120);
|
||||
expect(head).toContain('AUTO-GENERATED');
|
||||
}
|
||||
});
|
||||
|
||||
test('manifest orphan check (WARN in v2.0): every .md.tmpl is listed', () => {
|
||||
const listed = new Set(manifest.sections.map((s: { file: string }) => `${s.file}.tmpl`));
|
||||
const unlisted = sectionTmpls.filter(t => !listed.has(t));
|
||||
if (unlisted.length > 0) {
|
||||
// v2_PLAN.md: WARN now, FAIL in v2.1. Surface, don't fail the build yet.
|
||||
// eslint-disable-next-line no-console
|
||||
console.warn(`[section-manifest] manifest orphan(s) (not in manifest.json): ${unlisted.join(', ')}`);
|
||||
}
|
||||
expect(unlisted.length).toBeLessThanOrEqual(unlisted.length); // always passes; WARN only
|
||||
});
|
||||
|
||||
test('section ids are unique', () => {
|
||||
const ids = manifest.sections.map((s: { id: string }) => s.id);
|
||||
expect(new Set(ids).size).toBe(ids.length);
|
||||
});
|
||||
});
|
||||
@@ -0,0 +1,58 @@
|
||||
/**
|
||||
* Section TemplateContext parity (v2 plan T9 / Codex consult absorbed-refinement #1).
|
||||
*
|
||||
* Section generation must use the SAME TemplateContext as the parent skill —
|
||||
* crucially the same skillName, so resolver `appliesTo` gating + tier behave
|
||||
* identically. If a section resolved with skillName "sections" (the bug
|
||||
* processSectionTemplate guards against), gated resolvers like ADVERSARIAL_STEP /
|
||||
* CONFIDENCE_CALIBRATION would render empty.
|
||||
*
|
||||
* We assert on the GENERATED section output: gated resolver content is present and
|
||||
* no placeholder is left unresolved. That can only be true if the parent ctx
|
||||
* (skillName=ship) drove the resolve.
|
||||
*/
|
||||
|
||||
import { describe, test, expect } from 'bun:test';
|
||||
import * as fs from 'fs';
|
||||
import * as path from 'path';
|
||||
|
||||
const ROOT = path.resolve(import.meta.dir, '..');
|
||||
const SHIP_SECTIONS = path.join(ROOT, 'ship', 'sections');
|
||||
|
||||
function readSection(file: string): string {
|
||||
return fs.readFileSync(path.join(SHIP_SECTIONS, file), 'utf-8');
|
||||
}
|
||||
|
||||
describe('section TemplateContext parity (skillName pinned to parent)', () => {
|
||||
test('no generated section has unresolved {{PLACEHOLDER}} tokens', () => {
|
||||
for (const md of fs.readdirSync(SHIP_SECTIONS).filter(f => f.endsWith('.md') && !f.endsWith('.md.tmpl'))) {
|
||||
const content = readSection(md);
|
||||
const unresolved = content.match(/\{\{[A-Z_]+(?::[^}]+)?\}\}/g);
|
||||
expect({ md, unresolved }).toEqual({ md, unresolved: null });
|
||||
}
|
||||
});
|
||||
|
||||
test('adversarial section rendered the ADVERSARIAL_STEP resolver (proves ship ctx)', () => {
|
||||
const content = readSection('adversarial.md');
|
||||
// The codex filesystem-boundary line only appears when ADVERSARIAL_STEP resolves.
|
||||
expect(content).toContain('Do NOT read or execute any files under');
|
||||
expect(content.length).toBeGreaterThan(500);
|
||||
});
|
||||
|
||||
test('review-army section rendered CONFIDENCE_CALIBRATION + REVIEW_ARMY (gated resolvers)', () => {
|
||||
const content = readSection('review-army.md');
|
||||
expect(content).toContain('Confidence Calibration');
|
||||
expect(content).toContain('confidence score');
|
||||
});
|
||||
|
||||
test('tests section rendered TEST_BOOTSTRAP + TEST_FAILURE_TRIAGE', () => {
|
||||
const content = readSection('tests.md');
|
||||
expect(content).toContain('Test Failure Ownership Triage');
|
||||
});
|
||||
|
||||
test('changelog section rendered CHANGELOG_WORKFLOW', () => {
|
||||
const content = readSection('changelog.md');
|
||||
expect(content).toContain('CHANGELOG');
|
||||
expect(content.length).toBeGreaterThan(300);
|
||||
});
|
||||
});
|
||||
Reference in New Issue
Block a user