test(parity): T0a — capture v1.44.1 baseline + capture helper + diff utility

Cathedral parity-eval suite primitive. captureBaseline() walks every
top-level SKILL.md and records bytes, lines, estimated tokens, frontmatter
description length, and eval coverage. diffBaselines() reports per-skill
delta + total corpus delta + catalog tokens delta.

Locks the v1.44.1 reference snapshot at test/fixtures/parity-baseline-v1.44.1.json.
After Phase A+B+C land, scripts/capture-baseline.ts --tag v1.45.0.0 produces
a comparable snapshot; diff supplies the real numbers the v2 CHANGELOG quotes.
Never invent baseline numbers; ship them only if they came from a real run.

v1.44.1 numbers captured this commit:
- 51 skills
- 2,847 KB total corpus
- ~9,319 catalog tokens (sum of description bytes / 4)
- top 3: ship 160 KB, plan-ceo-review 128 KB, office-hours 108 KB

Test plan:
- bun test test/helpers/capture-parity-baseline.test.ts passes 4/4
- The baseline JSON file is committed so reviewers can audit v1→v2 numbers

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Garry Tan
2026-05-25 20:29:47 -07:00
parent 74bc80545f
commit e274e5ec82
4 changed files with 998 additions and 0 deletions
@@ -0,0 +1,90 @@
/**
* Unit tests for parity baseline capture.
*
* Free. Reads the live repo state via captureBaseline() and asserts
* shape + invariants, not specific numbers (which drift release-over-release).
*/
import { describe, test, expect } from 'bun:test';
import * as fs from 'fs';
import * as path from 'path';
import { captureBaseline, diffBaselines, type ParityBaseline } from './capture-parity-baseline';
const REPO_ROOT = path.resolve(import.meta.dir, '..', '..');
// Test suite for captureBaseline() / diffBaselines(). The first two tests run
// against the live repository at REPO_ROOT; the third uses hand-built fixtures;
// the fourth reads the committed v1.44.1 snapshot from test/fixtures.
describe('capture-parity-baseline', () => {
// Shape + invariant checks on a fresh capture; thresholds are deliberately
// loose lower bounds rather than exact counts.
test('produces a shaped baseline for the current repo', () => {
const baseline = captureBaseline({ repoRoot: REPO_ROOT, tag: 'unit-test' });
expect(baseline.tag).toBe('unit-test');
expect(baseline.totalSkills).toBeGreaterThan(20);
expect(baseline.totalCorpusBytes).toBeGreaterThan(100_000);
expect(baseline.topHeaviest.length).toBeGreaterThan(0);
// captureBaseline() truncates the heaviest list to 10 entries.
expect(baseline.topHeaviest.length).toBeLessThanOrEqual(10);
expect(baseline.topHeaviest[0]!.skillMdBytes).toBeGreaterThan(0);
// Top 1 should be ≥ Top 2 (sort invariant)
if (baseline.topHeaviest.length >= 2) {
expect(baseline.topHeaviest[0]!.skillMdBytes).toBeGreaterThanOrEqual(
baseline.topHeaviest[1]!.skillMdBytes,
);
}
});
// Every captured entry must carry positive byte, line, and token figures.
test('each skill entry has byte + line + token estimates', () => {
const baseline = captureBaseline({ repoRoot: REPO_ROOT });
for (const skill of Object.values(baseline.skills)) {
expect(skill.skillMdBytes).toBeGreaterThan(0);
expect(skill.skillMdLines).toBeGreaterThan(0);
expect(skill.estTokens).toBeGreaterThan(0);
// ~4 chars/token heuristic
// NOTE(review): the negative precision argument makes this a loose
// comparison — confirm the exact tolerance against the matcher docs.
expect(skill.estTokens).toBeCloseTo(skill.skillMdBytes / 4, -2);
}
});
// Pure-data test: diffBaselines() is fed two synthetic snapshots, so the
// expected deltas below are exact.
test('diffBaselines returns expected deltas', () => {
const before: ParityBaseline = {
tag: 'before',
capturedAt: '2026-01-01T00:00:00Z',
capturedFromCommit: 'abc',
capturedFromBranch: 'main',
totalSkills: 2,
totalCorpusBytes: 1000,
estTotalCatalogTokens: 100,
topHeaviest: [],
skills: {
foo: { skill: 'foo', skillMdBytes: 600, skillMdLines: 10, estTokens: 150, tmplBytes: 300, descriptionLen: 50, hasGateEval: true, hasPeriodicEval: false },
bar: { skill: 'bar', skillMdBytes: 400, skillMdLines: 8, estTokens: 100, tmplBytes: 200, descriptionLen: 30, hasGateEval: false, hasPeriodicEval: false },
},
};
// Same snapshot with smaller totals and smaller per-skill byte counts
// (foo 600 → 400, bar 400 → 300).
const after: ParityBaseline = {
...before,
tag: 'after',
totalCorpusBytes: 700,
estTotalCatalogTokens: 60,
skills: {
foo: { ...before.skills.foo!, skillMdBytes: 400 },
bar: { ...before.skills.bar!, skillMdBytes: 300 },
},
};
const diff = diffBaselines(before, after);
// Totals are diffed from the top-level fields, not re-summed from skills.
expect(diff.totalCorpusDelta).toBe(-300);
expect(diff.totalCorpusDeltaPct).toBeCloseTo(-30, 1);
expect(diff.catalogTokensDelta).toBe(-40);
expect(diff.perSkill.length).toBe(2);
// Sorted by abs delta descending
expect(diff.perSkill[0]!.skill).toBe('foo');
expect(diff.perSkill[0]!.deltaBytes).toBe(-200);
expect(diff.perSkill[1]!.skill).toBe('bar');
});
// Guards the committed reference snapshot: it must exist, carry the
// v1.44.1 tag, and clear the lower bounds asserted below.
test('v1.44.1 baseline file exists with expected shape', () => {
const baselinePath = path.join(REPO_ROOT, 'test', 'fixtures', 'parity-baseline-v1.44.1.json');
expect(fs.existsSync(baselinePath)).toBe(true);
// The parsed JSON is only cast to ParityBaseline, not validated; the
// assertions below are the only shape checks performed.
const baseline = JSON.parse(fs.readFileSync(baselinePath, 'utf-8')) as ParityBaseline;
expect(baseline.tag).toBe('v1.44.1');
expect(baseline.totalSkills).toBeGreaterThan(40);
// Document the v1.44.1 snapshot as the v1→v2 baseline reference.
// Compression in v1.45+ should drop totalCorpusBytes; this assertion
// anchors the "v1 was XX MB" claim in the CHANGELOG to a real file.
expect(baseline.totalCorpusBytes).toBeGreaterThan(2_000_000);
});
});
+231
View File
@@ -0,0 +1,231 @@
/**
* Parity baseline capture — cathedral parity-eval suite primitive.
*
* Snapshots the current state of every top-level SKILL.md: byte count, line
* count, estimated token count, frontmatter description length, eval
* coverage. The output JSON is the v1.44 baseline that v2 must beat on
* compression AND match (or exceed) on parity.
*
* The figures in the v2.0.0.0 CHANGELOG numbers table are read from a
* baseline JSON captured by this script. Never invent baseline numbers;
* ship them only if they came from a real captureBaseline() run.
*
* Usage:
* bun run scripts/capture-baseline.ts # write default path
* bun run scripts/capture-baseline.ts --out PATH # write custom path
* bun run scripts/capture-baseline.ts --tag v1.44.1 # tag the snapshot
*/
import * as fs from 'fs';
import * as path from 'path';
import { execSync } from 'child_process';
/** Measurements recorded for a single top-level skill directory. */
export interface SkillBaselineEntry {
// Name of the top-level directory that contains the SKILL.md.
skill: string;
// UTF-8 byte length of SKILL.md.
skillMdBytes: number;
// Number of '\n'-separated segments in SKILL.md (a trailing newline
// therefore contributes one extra, empty segment).
skillMdLines: number;
estTokens: number; // ~4 chars/token heuristic
tmplBytes: number | null; // null when no .tmpl exists (vendored or non-Claude)
descriptionLen: number; // bytes in frontmatter description field
// True when a scanned skill-e2e-*.test.ts file referencing this skill was
// classified as a gate test.
hasGateEval: boolean;
// True when such a file was classified as periodic based on its file name.
hasPeriodicEval: boolean;
}
/** A full corpus snapshot as returned by captureBaseline() and stored as JSON. */
export interface ParityBaseline {
// Caller-supplied label for the snapshot; 'untagged' when none was given.
tag: string;
// ISO-8601 timestamp taken when the snapshot was built.
capturedAt: string;
// Short git commit hash of the repo, or 'unknown' if git could not be queried.
capturedFromCommit: string;
// Current git branch name, or 'unknown' if git could not be queried.
capturedFromBranch: string;
// Number of top-level directories that contain a SKILL.md.
totalSkills: number;
// Sum of skillMdBytes across all captured skills.
totalCorpusBytes: number;
estTotalCatalogTokens: number; // sum of all description lengths / 4
topHeaviest: SkillBaselineEntry[]; // sorted desc by skillMdBytes
// All captured entries keyed by skill directory name.
skills: Record<string, SkillBaselineEntry>;
}
/** Inputs accepted by captureBaseline(). */
export interface CaptureOptions {
// Directory whose immediate subdirectories are scanned for SKILL.md files;
// also used as the working directory for the git queries.
repoRoot: string;
// Optional label stored on the snapshot; defaults to 'untagged'.
tag?: string;
}
/**
 * Extract the frontmatter `description` value from a SKILL.md file.
 *
 * Supports an inline value (`description: text`), an inline value continued on
 * indented lines, and YAML block scalars (`|`, `>`, with an optional `+`/`-`
 * chomping indicator). Each collected line is trimmed and the lines are joined
 * with `\n`; quotes around an inline value are kept as written.
 *
 * @param content Full text of the SKILL.md file (LF or CRLF line endings).
 * @returns The description text, or an empty string when the file has no
 *   frontmatter, the frontmatter is unterminated, or it has no description.
 */
function extractDescription(content: string): string {
  // Normalize CRLF so files with Windows line endings are parsed too.
  const text = content.replace(/\r\n/g, '\n');
  if (!text.startsWith('---\n')) return '';
  // Search from index 3 so an empty frontmatter ("---\n---") still terminates.
  const fmEnd = text.indexOf('\n---', 3);
  if (fmEnd === -1) return '';

  const descLines: string[] = [];
  let inDescription = false;
  for (const line of text.slice(4, fmEnd).split('\n')) {
    if (!inDescription) {
      // YAML needs whitespace after the colon, so `description:foo` is not a key.
      const header = /^description:(?:\s+(.*))?$/.exec(line);
      if (!header) continue;
      inDescription = true;
      const inline = (header[1] ?? '').trim();
      // A bare key or a block-scalar indicator carries no description text itself.
      if (inline !== '' && !/^[|>][+-]?$/.test(inline)) descLines.push(inline);
      continue;
    }
    // Any unindented key ends the description. This includes hyphenated keys
    // and keys with nothing after the colon (e.g. `allowed-tools:` followed by
    // a list), which would otherwise be swallowed into the description.
    if (/^[\w-]+:(?:\s|$)/.test(line)) break;
    descLines.push(line.trim());
  }
  return descLines.join('\n').trim();
}
/**
 * Rough token estimate for a byte count, assuming 4 bytes per token and
 * rounding to the nearest integer.
 */
function estimateTokens(bytes: number): number {
  const BYTES_PER_TOKEN = 4;
  const approx = bytes / BYTES_PER_TOKEN;
  return Math.round(approx);
}
/**
 * List the immediate subdirectories of `repoRoot` that hold a SKILL.md.
 *
 * Dot-directories, `node_modules`, and `docs` are never considered. The
 * returned names are sorted with the default string ordering.
 */
function discoverSkillDirs(repoRoot: string): string[] {
  const excluded = new Set(['node_modules', 'docs']);
  return fs
    .readdirSync(repoRoot, { withFileTypes: true })
    .filter(
      (entry) =>
        entry.isDirectory() && !entry.name.startsWith('.') && !excluded.has(entry.name),
    )
    .map((entry) => entry.name)
    .filter((name) => fs.existsSync(path.join(repoRoot, name, 'SKILL.md')))
    .sort();
}
/**
 * Infer E2E eval coverage per skill by scanning `test/skill-e2e-*.test.ts`.
 *
 * A skill counts as referenced by a test file when the file contains either
 * its slash form (`/name`, not followed by another word character or hyphen,
 * so `/qa-only` does not count for `qa`) or its name as a complete quoted
 * string (single, double, or backtick quotes). Skill names are matched
 * literally, with regex metacharacters escaped.
 *
 * Tier inference is a file-name heuristic: files whose name contains `chain`,
 * `multi`, `idempotency`, or `finding-floor` are treated as periodic; every
 * other matching file is treated as a gate test. A skill may end up in both
 * sets if it is referenced from files of both kinds.
 *
 * @param repoRoot Repository root; only `<repoRoot>/test` is scanned (not recursively).
 * @param skills Skill directory names to look for.
 * @returns Sets of skill names with gate / periodic coverage; both are empty
 *   when `<repoRoot>/test` does not exist.
 */
function discoverEvalCoverage(repoRoot: string, skills: string[]): {
  gate: Set<string>;
  periodic: Set<string>;
} {
  const gate = new Set<string>();
  const periodic = new Set<string>();
  const testDir = path.join(repoRoot, 'test');
  if (!fs.existsSync(testDir)) return { gate, periodic };

  const escapeRegExp = (raw: string): string => raw.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
  // Compile one matcher per skill up front instead of once per (file, skill) pair.
  const matchers = skills.map((skill) => {
    const name = escapeRegExp(skill);
    // The negative lookahead keeps `/qa` from matching inside `/qa-only`.
    return { skill, re: new RegExp(`(?:/${name}(?![\\w-])|['"\`]${name}['"\`])`) };
  });
  const periodicMarkers = ['chain', 'multi', 'idempotency', 'finding-floor'];

  const testFiles = fs
    .readdirSync(testDir)
    .filter((f) => f.startsWith('skill-e2e-') && f.endsWith('.test.ts'));
  for (const file of testFiles) {
    const content = fs.readFileSync(path.join(testDir, file), 'utf-8');
    // The tier depends only on the file name, so pick the target set once per file.
    const bucket = periodicMarkers.some((marker) => file.includes(marker)) ? periodic : gate;
    for (const { skill, re } of matchers) {
      if (re.test(content)) bucket.add(skill);
    }
  }
  return { gate, periodic };
}
/**
 * Read the short HEAD commit hash and the current branch name by running git
 * in `repoRoot`. If either command fails, both fields fall back to 'unknown'.
 */
function getGitInfo(repoRoot: string): { commit: string; branch: string } {
  const run = (command: string): string =>
    execSync(command, { cwd: repoRoot, encoding: 'utf-8' }).trim();
  try {
    // Properties are evaluated in order, so the commit lookup runs first.
    return {
      commit: run('git rev-parse --short HEAD'),
      branch: run('git rev-parse --abbrev-ref HEAD'),
    };
  } catch {
    return { commit: 'unknown', branch: 'unknown' };
  }
}
/**
 * Build a parity snapshot of every top-level skill under `opts.repoRoot`.
 *
 * For each discovered skill directory this reads SKILL.md (and SKILL.md.tmpl
 * when present), records its size figures and eval coverage, and accumulates
 * corpus-wide totals. Git metadata and the capture timestamp are added last.
 *
 * @param opts Repository root to scan and an optional snapshot tag.
 * @returns The assembled snapshot; `tag` is 'untagged' when none was supplied.
 */
export function captureBaseline(opts: CaptureOptions): ParityBaseline {
  const root = opts.repoRoot;
  const names = discoverSkillDirs(root);
  const coverage = discoverEvalCoverage(root, names);

  const byName: Record<string, SkillBaselineEntry> = {};
  let corpusBytes = 0;
  let descriptionBytes = 0;

  names.forEach((name) => {
    const markdown = fs.readFileSync(path.join(root, name, 'SKILL.md'), 'utf-8');
    const markdownBytes = Buffer.byteLength(markdown, 'utf-8');
    const lineCount = markdown.split('\n').length;
    const descriptionLen = Buffer.byteLength(extractDescription(markdown), 'utf-8');

    // The template is optional; its size is null when the file is absent.
    const templatePath = path.join(root, name, 'SKILL.md.tmpl');
    let tmplBytes: number | null = null;
    if (fs.existsSync(templatePath)) {
      tmplBytes = Buffer.byteLength(fs.readFileSync(templatePath, 'utf-8'), 'utf-8');
    }

    byName[name] = {
      skill: name,
      skillMdBytes: markdownBytes,
      skillMdLines: lineCount,
      estTokens: estimateTokens(markdownBytes),
      tmplBytes,
      descriptionLen,
      hasGateEval: coverage.gate.has(name),
      hasPeriodicEval: coverage.periodic.has(name),
    };
    corpusBytes += markdownBytes;
    descriptionBytes += descriptionLen;
  });

  // Ten largest SKILL.md files, biggest first.
  const heaviest = [...Object.values(byName)]
    .sort((left, right) => right.skillMdBytes - left.skillMdBytes)
    .slice(0, 10);

  const { commit, branch } = getGitInfo(root);
  return {
    tag: opts.tag ?? 'untagged',
    capturedAt: new Date().toISOString(),
    capturedFromCommit: commit,
    capturedFromBranch: branch,
    totalSkills: names.length,
    totalCorpusBytes: corpusBytes,
    estTotalCatalogTokens: estimateTokens(descriptionBytes),
    topHeaviest: heaviest,
    skills: byName,
  };
}
/** Result of diffBaselines(): corpus-level and per-skill size deltas (after − before). */
export interface BaselineDiff {
// Change in totalCorpusBytes.
totalCorpusDelta: number;
// totalCorpusDelta as a percentage of the "before" total; 0 when that total is 0.
totalCorpusDeltaPct: number;
// Change in estTotalCatalogTokens.
catalogTokensDelta: number;
// catalogTokensDelta as a percentage of the "before" value; 0 when that value is 0.
catalogTokensDeltaPct: number;
// One row per skill present in either snapshot, ordered by largest absolute
// byte change first. A skill missing from one side is counted as 0 bytes there.
perSkill: Array<{
skill: string;
beforeBytes: number;
afterBytes: number;
deltaBytes: number;
// Percentage change relative to beforeBytes; 0 when beforeBytes is 0.
deltaPct: number;
}>;
}
/**
 * Compare two snapshots and report how sizes changed from `before` to `after`.
 *
 * Corpus and catalog-token deltas come from the snapshots' top-level totals.
 * Per-skill rows cover the union of skills in both snapshots (a skill absent
 * from one side counts as 0 bytes there) and are ordered by largest absolute
 * byte change first. Percentages are relative to the "before" value and are
 * reported as 0 when that value is 0.
 */
export function diffBaselines(before: ParityBaseline, after: ParityBaseline): BaselineDiff {
  const percentOf = (delta: number, base: number): number =>
    base ? (delta / base) * 100 : 0;
  const bytesIn = (snapshot: ParityBaseline, name: string): number =>
    snapshot.skills[name]?.skillMdBytes ?? 0;

  const corpusDelta = after.totalCorpusBytes - before.totalCorpusBytes;
  const tokensDelta = after.estTotalCatalogTokens - before.estTotalCatalogTokens;

  const names = new Set([...Object.keys(before.skills), ...Object.keys(after.skills)]);
  const rows: BaselineDiff['perSkill'] = Array.from(names, (skill) => {
    const beforeBytes = bytesIn(before, skill);
    const afterBytes = bytesIn(after, skill);
    const deltaBytes = afterBytes - beforeBytes;
    return {
      skill,
      beforeBytes,
      afterBytes,
      deltaBytes,
      deltaPct: percentOf(deltaBytes, beforeBytes),
    };
  });
  rows.sort((left, right) => Math.abs(right.deltaBytes) - Math.abs(left.deltaBytes));

  return {
    totalCorpusDelta: corpusDelta,
    totalCorpusDeltaPct: percentOf(corpusDelta, before.totalCorpusBytes),
    catalogTokensDelta: tokensDelta,
    catalogTokensDeltaPct: percentOf(tokensDelta, before.estTotalCatalogTokens),
    perSkill: rows,
  };
}