mirror of
https://github.com/garrytan/gstack.git
synced 2026-06-17 23:30:09 +02:00
296937d466
Phase 0 deliverable — eval-first foundation. Two new test files plus the
registry:
1. test/skill-coverage-matrix.ts — single source of truth mapping each
skill to its gate-tier + periodic-tier test files. SKILL_COVERAGE
record with 51 entries; every gstack skill on disk has at least one
gate-tier entry.
2. test/skill-coverage-matrix.test.ts — CI gate. Asserts every skill on
disk has a registry entry AND that gate[] is non-empty. Catches
"skill added but eval not registered" the moment a new SKILL.md
lands.
3. test/skill-coverage-floor.test.ts — per-skill structural compliance
(FREE, file-IO only). For each of 51 skills, verifies:
- SKILL.md exists
- Frontmatter well-formed (name + description fields)
- Catalog-trim contract (inline description ≤ 250 chars, or block form)
- Generated header present (edit .tmpl, not .md)
- Body ≥ 200 bytes (non-trivial content)
- No unresolved {{TEMPLATE}} placeholders leaked
The "floor" is the minimum eval that every skill ships with. Skills that
need deeper behavioral testing get additional entries in their coverage
record (e.g., ship has skill-e2e-ship-idempotency + workflow + floor).
Future skills only need to add the floor entry and the matrix gate
unblocks them.
Codex 2nd-pass critique #1 mitigation: eval-first floor is structural
compliance (the testable part) — judgment-skill behavior gets layered
periodic-tier evals on top. We don't pretend the floor proves
correctness, only that the skill structurally compiles.
Test plan:
- bun test test/skill-coverage-matrix.test.ts: 4 pass (matrix shape + coverage)
- bun test test/skill-coverage-floor.test.ts: 309 pass (6 checks × 51 skills + 3 registry-level)
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
154 lines
6.0 KiB
TypeScript
154 lines
6.0 KiB
TypeScript
/**
|
|
* Skill coverage floor — gate-tier, free, runs every PR.
|
|
*
|
|
* Phase 0 of the cathedral parity-eval suite: structural-compliance smoke
|
|
* test that covers every gstack skill with file-IO assertions. The intent
|
|
* is "every skill ships with at least one CI-blocking check" — even when
|
|
* a skill doesn't (yet) have a behavioral E2E test, this floor catches
|
|
* frontmatter regressions, missing generated header, empty/trivial bodies,
|
|
* and dangling SKILL.md.tmpl-without-SKILL.md mismatches.
|
|
*
|
|
* Pairs with test/skill-coverage-matrix.ts (the registry) and
|
|
* test/parity-suite.test.ts (the content-invariant suite). Together,
|
|
* v1.45.0.0 ships with: floor (this file) + matrix (registry CI gate)
|
|
* + invariants (content per skill family) + size budget. That's the
|
|
* eval-first foundation the v2.0.0.0 sections/ work builds on.
|
|
*/
|
|
|
|
import { describe, test, expect } from 'bun:test';
|
|
import * as fs from 'fs';
|
|
import * as path from 'path';
|
|
import { SKILL_COVERAGE } from './skill-coverage-matrix';
|
|
|
|
const REPO_ROOT = path.resolve(import.meta.dir, '..');
|
|
|
|
function readSkillMd(skill: string): string | null {
|
|
const p = path.join(REPO_ROOT, skill, 'SKILL.md');
|
|
try {
|
|
return fs.readFileSync(p, 'utf-8');
|
|
} catch {
|
|
return null;
|
|
}
|
|
}
|
|
|
|
function listSkillDirs(): string[] {
|
|
const entries = fs.readdirSync(REPO_ROOT, { withFileTypes: true });
|
|
return entries
|
|
.filter(e => e.isDirectory() && !e.name.startsWith('.'))
|
|
.filter(e => e.name !== 'node_modules' && e.name !== 'docs' && e.name !== 'test')
|
|
.filter(e => fs.existsSync(path.join(REPO_ROOT, e.name, 'SKILL.md')))
|
|
.map(e => e.name)
|
|
.sort();
|
|
}
|
|
|
|
describe('skill-coverage-floor: every skill passes structural compliance', () => {
|
|
const skills = listSkillDirs();
|
|
|
|
test('skill registry mentions every skill on disk', () => {
|
|
const onDisk = new Set(skills);
|
|
const inRegistry = new Set(Object.keys(SKILL_COVERAGE));
|
|
const missingFromRegistry: string[] = [];
|
|
for (const s of onDisk) {
|
|
if (!inRegistry.has(s)) missingFromRegistry.push(s);
|
|
}
|
|
if (missingFromRegistry.length > 0) {
|
|
throw new Error(
|
|
`Skills on disk missing from test/skill-coverage-matrix.ts: ${missingFromRegistry.join(', ')}. ` +
|
|
`Add an entry to SKILL_COVERAGE with at least 'test/skill-coverage-floor.test.ts' in gate[].`,
|
|
);
|
|
}
|
|
});
|
|
|
|
test('every registry entry has at least one gate-tier test', () => {
|
|
const missingGate: string[] = [];
|
|
for (const [skill, coverage] of Object.entries(SKILL_COVERAGE)) {
|
|
if (!coverage.gate || coverage.gate.length === 0) missingGate.push(skill);
|
|
}
|
|
if (missingGate.length > 0) {
|
|
throw new Error(
|
|
`Skills with no gate-tier eval: ${missingGate.join(', ')}. ` +
|
|
`Eval-first foundation requires at least one CI-blocking check per skill.`,
|
|
);
|
|
}
|
|
});
|
|
|
|
test('every gate-tier test path referenced in registry exists on disk', () => {
|
|
const missing: string[] = [];
|
|
for (const [skill, coverage] of Object.entries(SKILL_COVERAGE)) {
|
|
for (const testPath of [...coverage.gate, ...coverage.periodic]) {
|
|
const fullPath = path.join(REPO_ROOT, testPath);
|
|
if (!fs.existsSync(fullPath)) {
|
|
missing.push(`${skill} → ${testPath}`);
|
|
}
|
|
}
|
|
}
|
|
if (missing.length > 0) {
|
|
throw new Error(`Registry references missing test files:\n ${missing.join('\n ')}`);
|
|
}
|
|
});
|
|
|
|
// Per-skill structural compliance (file IO only, no LLM)
|
|
for (const skill of skills) {
|
|
describe(`skill: ${skill}`, () => {
|
|
test('SKILL.md exists', () => {
|
|
const content = readSkillMd(skill);
|
|
expect(content).not.toBeNull();
|
|
});
|
|
|
|
test('frontmatter is well-formed and contains name + description', () => {
|
|
const content = readSkillMd(skill)!;
|
|
expect(content.startsWith('---\n')).toBe(true);
|
|
const fmEnd = content.indexOf('\n---', 4);
|
|
expect(fmEnd).toBeGreaterThan(0);
|
|
const fm = content.slice(4, fmEnd);
|
|
// name: ...
|
|
expect(/^name:\s*\S/m.test(fm)).toBe(true);
|
|
// description: ... (either inline or block form)
|
|
expect(/^description:\s*(\S|\|)/m.test(fm)).toBe(true);
|
|
});
|
|
|
|
test('frontmatter description fits the catalog-trim contract', () => {
|
|
const content = readSkillMd(skill)!;
|
|
const fmEnd = content.indexOf('\n---', 4);
|
|
const fm = content.slice(4, fmEnd);
|
|
// Inline form: description: <one line>
|
|
const inlineMatch = fm.match(/^description:\s+(.+)$/m);
|
|
// Block form: description: |\n multiline
|
|
const blockMatch = fm.match(/^description:\s*\|/m);
|
|
if (inlineMatch) {
|
|
// Catalog-trimmed: should be ≤ 250 chars
|
|
expect(inlineMatch[1].length).toBeLessThanOrEqual(250);
|
|
} else if (blockMatch) {
|
|
// Block form is acceptable for small skills (under-120-chars baseline
|
|
// didn't trigger catalog trim). No size cap here; the parity-suite
|
|
// and size-budget tests handle bytes.
|
|
} else {
|
|
throw new Error(`${skill}: description field is not in inline or block form`);
|
|
}
|
|
});
|
|
|
|
test('generated header present (only edit .tmpl, not .md)', () => {
|
|
const content = readSkillMd(skill)!;
|
|
expect(content).toContain('AUTO-GENERATED from SKILL.md.tmpl');
|
|
});
|
|
|
|
test('body is non-trivial (≥ 200 bytes after frontmatter)', () => {
|
|
const content = readSkillMd(skill)!;
|
|
const fmEnd = content.indexOf('\n---', 4);
|
|
const body = content.slice(fmEnd + 5).trim();
|
|
expect(body.length).toBeGreaterThanOrEqual(200);
|
|
});
|
|
|
|
test('no unresolved {{TEMPLATE}} placeholders leaked into output', () => {
|
|
const content = readSkillMd(skill)!;
|
|
const leaks = content.match(/\{\{[A-Z_]+(?::[^}]+)?\}\}/g);
|
|
if (leaks) {
|
|
throw new Error(
|
|
`${skill}: ${leaks.length} unresolved placeholder(s) in generated SKILL.md: ${leaks.slice(0, 3).join(', ')}${leaks.length > 3 ? ', ...' : ''}`,
|
|
);
|
|
}
|
|
});
|
|
});
|
|
}
|
|
});
|