mirror of
https://github.com/garrytan/gstack.git
synced 2026-06-19 00:00:13 +02:00
ebebc95a34
Adds the harness that the v2_PLAN.md cathedral parity-eval suite is built
on. Compares CURRENT SKILL.md output to v1.44.1 baseline along three axes:
- STRUCTURE: frontmatter shape (catalog trim landed, "## When to invoke" present)
- CONTENT: must-preserve phrases per skill family (cso: OWASP/STRIDE;
  plan-ceo: SCOPE EXPANSION/HOLD SCOPE/REDUCTION; ship:
  VERSION/CHANGELOG/PR; etc.)
- SIZE: per-skill byte budget (maxSizeRatio + minBytes guards)
The PARITY_INVARIANTS registry pins 10 load-bearing skills (cso, ship,
plan-*-review, review, qa, investigate, office-hours, autoplan). Each entry
declares what must NOT regress; future compression that strips these
phrases or shrinks a skill past its minBytes cliff fails CI.
Periodic-tier LLM-judge parity (paid, ~$0.20/skill) lands in v2.0.0.0
sections/ phase. Same registry, same harness, judge added on top.
Test plan:
- bun test test/parity-suite.test.ts: 10/10 invariants pass vs v1.44.1
- Per-skill failures get actionable per-line breakdown so a reviewer can
see which phrase / heading / size limit went sideways
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
50 lines
1.7 KiB
TypeScript
50 lines
1.7 KiB
TypeScript
/**
 * Cathedral parity suite — gate-tier (free, structural + content checks).
 *
 * Runs every PARITY_INVARIANTS check against the current SKILL.md output
 * vs the v1.44.1 baseline. Failures get an actionable, per-skill report
 * showing missing phrases, missing headings, and size ratios.
 *
 * Periodic-tier LLM-judge parity (paid) lands in Phase B (v2.0.0.0)
 * alongside the sections/ extraction. Plumbing is in parity-harness.ts.
 */
|
|
|
|
import { describe, test, expect } from 'bun:test';
|
|
import * as fs from 'fs';
|
|
import * as path from 'path';
|
|
import { runParityChecks, PARITY_INVARIANTS } from './helpers/parity-harness';
|
|
import type { ParityBaseline } from './helpers/capture-parity-baseline';
|
|
|
|
/** Repository root, resolved one level above `import.meta.dir`. */
const REPO_ROOT = path.resolve(import.meta.dir, '..');

/** Path of the v1.44.1 parity baseline fixture, under `test/fixtures` in the repo. */
const BASELINE_PATH = path.join(
  REPO_ROOT,
  'test',
  'fixtures',
  'parity-baseline-v1.44.1.json',
);
|
|
|
|
/**
 * Gate-tier parity suite: runs every PARITY_INVARIANTS entry through
 * `runParityChecks` against the baseline fixture at BASELINE_PATH and fails
 * with a per-skill breakdown when any skill does not pass.
 */
describe('parity suite vs v1.44.1 baseline (gate, free)', () => {
  // Kept as its own test so a missing fixture is reported separately from
  // an actual parity failure.
  test('baseline exists', () => {
    expect(fs.existsSync(BASELINE_PATH)).toBe(true);
  });

  test('all PARITY_INVARIANTS pass', () => {
    const baseline: ParityBaseline = JSON.parse(fs.readFileSync(BASELINE_PATH, 'utf-8'));
    const report = runParityChecks({
      repoRoot: REPO_ROOT,
      baseline,
      invariants: PARITY_INVARIANTS,
    });

    // Always print the summary line, pass or fail.
    // eslint-disable-next-line no-console
    console.log(
      `[parity] ${report.passed}/${report.totalChecks} skills passed parity vs ${baseline.tag}`,
    );

    if (report.failed === 0) return;

    // One entry per failing skill, with each of its failure lines listed
    // underneath the skill name.
    const failureMessages = report.details
      .filter(d => !d.passed)
      .map(d => ` ${d.skill}:\n - ${d.failures.join('\n - ')}`)
      .join('\n');

    // Report the tag read from the baseline file (same value the summary
    // line prints) instead of a hard-coded version string, so the two
    // messages cannot disagree.
    throw new Error(
      `${report.failed} skill(s) failed parity checks vs ${baseline.tag}:\n${failureMessages}`,
    );
  });
});
|