Merge remote-tracking branch 'origin/main' into garrytan/askuserquestion-split-on-overflow

2026-06-18 15:50:11 +02:00 · 2026-05-26 22:27:54 -07:00
parent d0d8cb2db6 f8bb59094d
commit e08e5fa8aa
107 changed files with 10060 additions and 3885 deletions
@@ -0,0 +1,116 @@
+/**
+ * Unit tests for budget-override audit logger.
+ *
+ * The audit trail is the only check on `EVALS_BUDGET_OVERRIDE_REASON` and
+ * `GSTACK_SIZE_BUDGET_OVERRIDE_REASON` — if the logger silently drops events,
+ * overrides become invisible and the budget gates are theater. These tests
+ * pin the contract: every override produces exactly one JSONL line with
+ * timestamp + scope + reason + CI provenance.
+ */
+
+import { describe, test, expect, beforeEach } from 'bun:test';
+import * as fs from 'fs';
+import * as path from 'path';
+import * as os from 'os';
+import { logBudgetOverride } from './budget-override';
+
+// Isolated GSTACK_HOME for this file so the logger writes under a temp dir.
+// NOTE(review): the override is applied at import time and never reverted —
+// confirm no other test file in the same process depends on GSTACK_HOME.
+const TMP_HOME = fs.mkdtempSync(path.join(os.tmpdir(), 'budget-override-test-'));
+process.env.GSTACK_HOME = TMP_HOME;
+const AUDIT_PATH = path.join(TMP_HOME, 'analytics', 'spend-overrides.jsonl');
+
+/** CI-provenance env vars that these tests set or clear. */
+const PROVENANCE_ENV_VARS = [
+  'CI',
+  'GITHUB_ACTIONS',
+  'GITHUB_REF_NAME',
+  'GITHUB_SHA',
+  'CI_RUNNER',
+  'CI_COMMIT_REF_NAME',
+  'CI_COMMIT_SHORT_SHA',
+];
+
+/**
+ * Run `fn` with the given env overrides applied (`undefined` = unset), then
+ * restore the previous values. Restoration happens in `finally`, so a failing
+ * assertion cannot leak env state into later tests, and a real CI run keeps
+ * its own CI / GITHUB_* values after the test finishes.
+ */
+function withEnv(overrides: Record<string, string | undefined>, fn: () => void): void {
+  const saved = new Map<string, string | undefined>();
+  for (const [key, value] of Object.entries(overrides)) {
+    saved.set(key, process.env[key]);
+    if (value === undefined) delete process.env[key];
+    else process.env[key] = value;
+  }
+  try {
+    fn();
+  } finally {
+    for (const [key, value] of saved) {
+      if (value === undefined) delete process.env[key];
+      else process.env[key] = value;
+    }
+  }
+}
+
+/** Read the audit file and parse its single JSONL record. */
+function readSingleEntry(): Record<string, unknown> {
+  return JSON.parse(fs.readFileSync(AUDIT_PATH, 'utf-8').trim());
+}
+
+describe('logBudgetOverride', () => {
+  beforeEach(() => {
+    // Start each test with a clean audit file
+    try { fs.unlinkSync(AUDIT_PATH); } catch { /* doesn't exist */ }
+  });
+
+  test('writes one JSONL line per call with required fields', () => {
+    logBudgetOverride({
+      scope: 'evals-cost-cap-e2e',
+      reason: 'model price went up, will rebase the cap next sprint',
+      details: { tier: 'e2e', cap: 25, observed_cost_usd: 31.4 },
+    });
+
+    expect(fs.existsSync(AUDIT_PATH)).toBe(true);
+    const lines = fs.readFileSync(AUDIT_PATH, 'utf-8').split('\n').filter(Boolean);
+    expect(lines.length).toBe(1);
+    const entry = JSON.parse(lines[0]!);
+    expect(entry.scope).toBe('evals-cost-cap-e2e');
+    expect(entry.reason).toBe('model price went up, will rebase the cap next sprint');
+    expect(entry.details).toEqual({ tier: 'e2e', cap: 25, observed_cost_usd: 31.4 });
+    expect(typeof entry.timestamp).toBe('string');
+    expect(entry.timestamp).toMatch(/^\d{4}-\d{2}-\d{2}T/);
+  });
+
+  test('captures CI provenance when CI env is set', () => {
+    withEnv(
+      {
+        CI: 'true',
+        GITHUB_ACTIONS: 'true',
+        GITHUB_REF_NAME: 'feature/x',
+        GITHUB_SHA: 'deadbeefcafe1234',
+      },
+      () => {
+        logBudgetOverride({ scope: 'skill-size-budget', reason: 'big diff bake-in' });
+
+        const entry = readSingleEntry();
+        expect(entry.ci).toBe(true);
+        expect(entry.runner).toBe('github-actions');
+        expect(entry.branch).toBe('feature/x');
+        // Only the first 8 characters of the SHA are recorded.
+        expect(entry.commit).toBe('deadbeef');
+      },
+    );
+  });
+
+  test('defaults provenance to local when CI is unset', () => {
+    const allUnset = Object.fromEntries(PROVENANCE_ENV_VARS.map(name => [name, undefined]));
+    withEnv(allUnset, () => {
+      logBudgetOverride({ scope: 'skill-size-budget-corpus', reason: 'local dev test' });
+
+      const entry = readSingleEntry();
+      expect(entry.ci).toBe(false);
+      expect(entry.runner).toBe('local');
+      expect(entry.branch).toBe('unknown');
+      expect(entry.commit).toBe('unknown');
+    });
+  });
+
+  test('append-only: multiple calls produce multiple lines', () => {
+    logBudgetOverride({ scope: 's1', reason: 'r1' });
+    logBudgetOverride({ scope: 's2', reason: 'r2' });
+    logBudgetOverride({ scope: 's3', reason: 'r3' });
+
+    const lines = fs.readFileSync(AUDIT_PATH, 'utf-8').split('\n').filter(Boolean);
+    expect(lines.length).toBe(3);
+    const scopes = lines.map(l => JSON.parse(l).scope);
+    expect(scopes).toEqual(['s1', 's2', 's3']);
+  });
+
+  test('defaults details to an empty object when entry.details is absent', () => {
+    logBudgetOverride({ scope: 'plain', reason: 'no details' });
+    const entry = readSingleEntry();
+    expect(entry.details).toEqual({});
+  });
+
+  test('never throws even when audit directory is missing — creates it', () => {
+    // Remove the analytics dir to force mkdir
+    try { fs.rmSync(path.join(TMP_HOME, 'analytics'), { recursive: true, force: true }); } catch { /* */ }
+    expect(() => logBudgetOverride({ scope: 'recreate', reason: 'test' })).not.toThrow();
+    expect(fs.existsSync(AUDIT_PATH)).toBe(true);
+  });
+
+  test('survives an unwritable audit path (logs warning, does not throw)', () => {
+    // Point GSTACK_HOME at a path inside a file (illegal directory location)
+    const bogusFile = path.join(TMP_HOME, 'not-a-dir.txt');
+    fs.writeFileSync(bogusFile, 'just a file');
+    withEnv({ GSTACK_HOME: bogusFile }, () => {
+      expect(() => logBudgetOverride({ scope: 'unwritable', reason: 'fs error path' })).not.toThrow();
+    });
+  });
+});
@@ -0,0 +1,50 @@
+/**
+ * Budget override audit trail (v1.45.0.0 T5).
+ *
+ * Records uses of GSTACK_SIZE_BUDGET_OVERRIDE_REASON or
+ * EVALS_BUDGET_OVERRIDE_REASON so a reviewer can see what was waived,
+ * from where (CI runner, branch, commit), and why. Append-only JSONL at
+ * $GSTACK_HOME/analytics/spend-overrides.jsonl (GSTACK_HOME defaults to ~/.gstack).
+ *
+ * Why audit: a hard cap with no escape valve becomes operationally hostile
+ * (legit price changes, longer transcripts, new required evals can all
+ * blow the cap). An escape valve with no audit becomes "everyone overrides
+ * everything and we lose the gate." This module is the audit half.
+ */
+
+import * as fs from 'fs';
+import * as path from 'path';
+import * as os from 'os';
+
+/** Caller-supplied description of a single budget override. */
+export interface BudgetOverrideEntry {
+  /** Which budget gate was waived, e.g. 'skill-size-budget', 'evals-cost-cap'. */
+  scope: string;
+  /** User-supplied justification (the value of the REASON env var). */
+  reason: string;
+  /** Optional supporting data such as numbers / regressions; logged as `{}` when absent. */
+  details?: Record<string, unknown>;
+}
+
+/**
+ * Resolve the audit log location: `<root>/analytics/spend-overrides.jsonl`.
+ * `<root>` is GSTACK_HOME when it is set to a non-empty value, otherwise
+ * `.gstack` under the current user's home directory. The environment is
+ * consulted on every call.
+ */
+function getAuditPath(): string {
+  const override = process.env.GSTACK_HOME;
+  const root = override ? override : path.join(os.homedir(), '.gstack');
+  return path.join(root, 'analytics', 'spend-overrides.jsonl');
+}
+
+/**
+ * Append one audit record for a budget override to the JSONL audit log.
+ *
+ * The record holds an ISO-8601 timestamp, the caller's scope / reason /
+ * details, and CI provenance read from the environment (`ci`, `runner`,
+ * `branch`, `commit`), falling back to 'local' / 'unknown' when unset.
+ *
+ * Best-effort by design: any failure (unwritable path, mkdir error) is
+ * reported through console.warn and never thrown, so an audit-write problem
+ * cannot fail the caller. If `details` itself cannot be serialized (circular
+ * reference, BigInt), the record is still written with `details` replaced by
+ * `{ serialization_error: <message> }` so the override stays visible.
+ *
+ * @param entry scope, reason and optional details of the override
+ */
+export function logBudgetOverride(entry: BudgetOverrideEntry): void {
+  try {
+    const auditPath = getAuditPath();
+    fs.mkdirSync(path.dirname(auditPath), { recursive: true });
+
+    // Probe details separately: losing the payload is acceptable, losing the
+    // whole audit event is not.
+    let details: Record<string, unknown> = entry.details ?? {};
+    try {
+      JSON.stringify(details);
+    } catch (err) {
+      details = { serialization_error: err instanceof Error ? err.message : String(err) };
+    }
+
+    const line = JSON.stringify({
+      timestamp: new Date().toISOString(),
+      scope: entry.scope,
+      reason: entry.reason,
+      details,
+      // Capture provenance: who/where/which CI ran
+      ci: process.env.CI === 'true',
+      runner: process.env.GITHUB_ACTIONS ? 'github-actions' : process.env.CI_RUNNER || 'local',
+      branch: process.env.GITHUB_REF_NAME || process.env.CI_COMMIT_REF_NAME || 'unknown',
+      // Only the first 8 characters of GITHUB_SHA are kept.
+      commit: process.env.GITHUB_SHA?.slice(0, 8) || process.env.CI_COMMIT_SHORT_SHA || 'unknown',
+    }) + '\n';
+    fs.appendFileSync(auditPath, line);
+  } catch (err) {
+    // Best-effort logging; never fail the caller on audit-write errors.
+    const message = err instanceof Error ? err.message : String(err);
+    // eslint-disable-next-line no-console
+    console.warn(`[budget-override] could not write audit log: ${message}`);
+  }
+}
@@ -0,0 +1,90 @@
+/**
+ * Unit tests for parity baseline capture.
+ *
+ * Free. Reads the live repo state via captureBaseline() and asserts
+ * shape + invariants, not specific numbers (which drift release-over-release).
+ */
+
+import { describe, test, expect } from 'bun:test';
+import * as fs from 'fs';
+import * as path from 'path';
+import { captureBaseline, diffBaselines, type ParityBaseline } from './capture-parity-baseline';
+
+// Repo root, resolved two directories above this test file.
+const REPO_ROOT = path.resolve(import.meta.dir, '..', '..');
+
+describe('capture-parity-baseline', () => {
+  test('produces a shaped baseline for the current repo', () => {
+    const baseline = captureBaseline({ repoRoot: REPO_ROOT, tag: 'unit-test' });
+    expect(baseline.tag).toBe('unit-test');
+    expect(baseline.totalSkills).toBeGreaterThan(20);
+    expect(baseline.totalCorpusBytes).toBeGreaterThan(100_000);
+    expect(baseline.topHeaviest.length).toBeGreaterThan(0);
+    expect(baseline.topHeaviest.length).toBeLessThanOrEqual(10);
+    expect(baseline.topHeaviest[0]!.skillMdBytes).toBeGreaterThan(0);
+    // Sort invariant: every entry is at least as heavy as the one after it.
+    for (let i = 1; i < baseline.topHeaviest.length; i++) {
+      expect(baseline.topHeaviest[i - 1]!.skillMdBytes).toBeGreaterThanOrEqual(
+        baseline.topHeaviest[i]!.skillMdBytes,
+      );
+    }
+  });
+
+  test('each skill entry has byte + line + token estimates', () => {
+    const baseline = captureBaseline({ repoRoot: REPO_ROOT });
+    for (const skill of Object.values(baseline.skills)) {
+      expect(skill.skillMdBytes).toBeGreaterThan(0);
+      expect(skill.skillMdLines).toBeGreaterThan(0);
+      expect(skill.estTokens).toBeGreaterThan(0);
+      // ~4 chars/token heuristic, rounded to the nearest integer. Asserted
+      // exactly: a loose toBeCloseTo tolerance would hide a wrong divisor.
+      expect(skill.estTokens).toBe(Math.round(skill.skillMdBytes / 4));
+    }
+  });
+
+  test('diffBaselines returns expected deltas', () => {
+    const before: ParityBaseline = {
+      tag: 'before',
+      capturedAt: '2026-01-01T00:00:00Z',
+      capturedFromCommit: 'abc',
+      capturedFromBranch: 'main',
+      totalSkills: 2,
+      totalCorpusBytes: 1000,
+      estTotalCatalogTokens: 100,
+      topHeaviest: [],
+      skills: {
+        foo: { skill: 'foo', skillMdBytes: 600, skillMdLines: 10, estTokens: 150, tmplBytes: 300, descriptionLen: 50, hasGateEval: true, hasPeriodicEval: false },
+        bar: { skill: 'bar', skillMdBytes: 400, skillMdLines: 8, estTokens: 100, tmplBytes: 200, descriptionLen: 30, hasGateEval: false, hasPeriodicEval: false },
+      },
+    };
+    const after: ParityBaseline = {
+      ...before,
+      tag: 'after',
+      totalCorpusBytes: 700,
+      estTotalCatalogTokens: 60,
+      skills: {
+        foo: { ...before.skills.foo!, skillMdBytes: 400 },
+        bar: { ...before.skills.bar!, skillMdBytes: 300 },
+      },
+    };
+    const diff = diffBaselines(before, after);
+    expect(diff.totalCorpusDelta).toBe(-300);
+    expect(diff.totalCorpusDeltaPct).toBeCloseTo(-30, 1);
+    expect(diff.catalogTokensDelta).toBe(-40);
+    expect(diff.catalogTokensDeltaPct).toBeCloseTo(-40, 1);
+    expect(diff.perSkill.length).toBe(2);
+    // Sorted by abs delta descending
+    expect(diff.perSkill[0]!.skill).toBe('foo');
+    expect(diff.perSkill[0]!.deltaBytes).toBe(-200);
+    expect(diff.perSkill[1]!.skill).toBe('bar');
+    expect(diff.perSkill[1]!.deltaBytes).toBe(-100);
+    expect(diff.perSkill[1]!.deltaPct).toBeCloseTo(-25, 1);
+  });
+
+  test('v1.44.1 baseline file exists with expected shape', () => {
+    const baselinePath = path.join(REPO_ROOT, 'test', 'fixtures', 'parity-baseline-v1.44.1.json');
+    expect(fs.existsSync(baselinePath)).toBe(true);
+    const baseline = JSON.parse(fs.readFileSync(baselinePath, 'utf-8')) as ParityBaseline;
+    expect(baseline.tag).toBe('v1.44.1');
+    expect(baseline.totalSkills).toBeGreaterThan(40);
+    // Document the v1.44.1 snapshot as the v1→v2 baseline reference.
+    // Compression in v1.45+ should drop totalCorpusBytes; this assertion
+    // anchors the "v1 was XX MB" claim in the CHANGELOG to a real file.
+    expect(baseline.totalCorpusBytes).toBeGreaterThan(2_000_000);
+  });
+});
@@ -0,0 +1,231 @@
+/**
+ * Parity baseline capture — cathedral parity-eval suite primitive.
+ *
+ * Snapshots the current state of every top-level SKILL.md: byte count, line
+ * count, estimated token count, frontmatter description length, eval
+ * coverage. The output JSON is the v1.44 baseline that v2 must beat on
+ * compression AND match (or exceed) on parity.
+ *
+ * The numbers quoted in the v2.0.0.0 CHANGELOG numbers table are read
+ * from a baseline JSON captured by this script. Never invent baseline
+ * numbers; ship them only if they came from a real captureBaseline() run.
+ *
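+ * Usage (through a CLI wrapper script; this module itself only exports the
+ * captureBaseline / diffBaselines library functions):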
+ *   bun run scripts/capture-baseline.ts                    # write default path
+ *   bun run scripts/capture-baseline.ts --out PATH         # write custom path
+ *   bun run scripts/capture-baseline.ts --tag v1.44.1      # tag the snapshot
+ */
+
+import * as fs from 'fs';
+import * as path from 'path';
+import { execSync } from 'child_process';
+
+/** Size and eval-coverage snapshot of one skill's SKILL.md. */
+export interface SkillBaselineEntry {
+  /** Skill name (the top-level directory that contains SKILL.md). */
+  skill: string;
+  /** UTF-8 byte length of SKILL.md. */
+  skillMdBytes: number;
+  /** Number of '\n'-separated segments in SKILL.md. */
+  skillMdLines: number;
+  /** Estimated tokens, using the ~4 chars/token heuristic. */
+  estTokens: number;
+  /** UTF-8 byte length of SKILL.md.tmpl; null when no .tmpl exists (vendored or non-Claude). */
+  tmplBytes: number | null;
+  /** Bytes in the frontmatter description field. */
+  descriptionLen: number;
+  /** True when a gate-classified E2E test file references this skill. */
+  hasGateEval: boolean;
+  /** True when a periodic-classified E2E test file references this skill. */
+  hasPeriodicEval: boolean;
+}
+
+/** Whole-repo snapshot produced by captureBaseline(). */
+export interface ParityBaseline {
+  /** Caller-supplied label for the snapshot ('untagged' when none was given). */
+  tag: string;
+  /** ISO-8601 timestamp of when the snapshot was taken. */
+  capturedAt: string;
+  /** Short git commit hash of HEAD, or 'unknown' when git info is unavailable. */
+  capturedFromCommit: string;
+  /** Git branch name of HEAD, or 'unknown' when git info is unavailable. */
+  capturedFromBranch: string;
+  /** Number of skill directories captured. */
+  totalSkills: number;
+  /** Sum of skillMdBytes across all skills. */
+  totalCorpusBytes: number;
+  /** Sum of all description lengths / 4. */
+  estTotalCatalogTokens: number;
+  /** At most ten entries, sorted descending by skillMdBytes. */
+  topHeaviest: SkillBaselineEntry[];
+  /** Per-skill entries keyed by skill name. */
+  skills: Record<string, SkillBaselineEntry>;
+}
+
+/** Inputs to captureBaseline(). */
+export interface CaptureOptions {
+  /** Directory whose immediate subdirectories are scanned for SKILL.md files. */
+  repoRoot: string;
+  /** Optional snapshot label; captureBaseline() records 'untagged' when omitted. */
+  tag?: string;
+}
+
+/**
+ * Extract the frontmatter `description` value from SKILL.md content.
+ *
+ * Supports an inline value (`description: text`), optionally followed by
+ * continuation lines, and YAML block scalars (`description: |`, `>`, `|-`,
+ * `>-`, ...). Collection stops at the next top-level key, including
+ * hyphenated keys and keys with no inline value such as `allowed-tools:`,
+ * so later frontmatter fields are never counted as description text.
+ * Every collected line is trimmed and the lines are joined with '\n'.
+ *
+ * @param content full text of a SKILL.md file (LF or CRLF line endings)
+ * @returns the description text, or '' when there is no frontmatter, the
+ *          frontmatter is unterminated, or it has no description field
+ */
+function extractDescription(content: string): string {
+  // Normalize CRLF so Windows-checked-out files are parsed like LF files.
+  const text = content.replace(/\r\n/g, '\n');
+  if (!text.startsWith('---\n')) return '';
+  const fmEnd = text.indexOf('\n---', 4);
+  if (fmEnd === -1) return '';
+
+  const descLines: string[] = [];
+  let inDescription = false;
+  for (const line of text.slice(4, fmEnd).split('\n')) {
+    if (!inDescription) {
+      // `description:` must be followed by whitespace or end of line.
+      const match = /^description:(?:\s+(.*))?$/.exec(line);
+      if (!match) continue;
+      inDescription = true;
+      const inline = (match[1] ?? '').trim();
+      // A bare block-scalar indicator (|, >, |-, >+, |2, ...) is syntax, not text.
+      if (inline !== '' && !/^[|>][+-]?\d*$/.test(inline)) descLines.push(inline);
+      continue;
+    }
+    // Any unindented `key:` line starts the next frontmatter field.
+    if (/^[A-Za-z_][\w.-]*:(\s|$)/.test(line)) break;
+    descLines.push(line.trim());
+  }
+  return descLines.join('\n').trim();
+}
+
+/**
+ * Rough token estimate for a byte count: one token per four bytes, rounded
+ * to the nearest integer. Crude, but matches existing budget-regression usage.
+ */
+function estimateTokens(bytes: number): number {
+  const quarter = bytes / 4;
+  return Math.round(quarter);
+}
+
+/**
+ * List the skills to capture: names of immediate subdirectories of
+ * `repoRoot` that contain a SKILL.md file, sorted with the default string
+ * sort. Dot-directories, `node_modules` and `docs` are never considered.
+ */
+function discoverSkillDirs(repoRoot: string): string[] {
+  const skipped = new Set(['node_modules', 'docs']);
+  return fs
+    .readdirSync(repoRoot, { withFileTypes: true })
+    .filter(entry => entry.isDirectory() && !entry.name.startsWith('.') && !skipped.has(entry.name))
+    .map(entry => entry.name)
+    .filter(name => fs.existsSync(path.join(repoRoot, name, 'SKILL.md')))
+    .sort();
+}
+
+/**
+ * Infer E2E eval coverage per skill by scanning `<repoRoot>/test`.
+ *
+ * Only files named `skill-e2e-*.test.ts` are read. A file covers a skill
+ * when its text contains the slash form `/<skill>` (not followed by a word
+ * character or '-', so `/qa-only` does not count for `qa`) or the skill name
+ * as a complete quoted string ('…', "…" or `…`). Skill names are
+ * regex-escaped before matching.
+ *
+ * Tier is inferred from the file name: names containing 'chain', 'multi',
+ * 'idempotency' or 'finding-floor' count as periodic, everything else as gate.
+ * A skill may land in both sets.
+ *
+ * @param repoRoot repository root containing the `test` directory
+ * @param skills   skill names to look for
+ * @returns the sets of skills with gate / periodic coverage (both empty when
+ *          the test directory does not exist)
+ */
+function discoverEvalCoverage(repoRoot: string, skills: string[]): {
+  gate: Set<string>;
+  periodic: Set<string>;
+} {
+  const gate = new Set<string>();
+  const periodic = new Set<string>();
+  const testDir = path.join(repoRoot, 'test');
+  if (!fs.existsSync(testDir)) return { gate, periodic };
+  const testFiles = fs.readdirSync(testDir).filter(f => f.startsWith('skill-e2e-') && f.endsWith('.test.ts'));
+
+  // Compile one matcher per skill up front rather than per file x skill.
+  const matchers = skills.map(skill => {
+    const name = skill.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
+    return { skill, re: new RegExp(`(/${name}(?![\\w-])|['"\`]${name}['"\`])`) };
+  });
+  const periodicMarkers = ['chain', 'multi', 'idempotency', 'finding-floor'];
+
+  for (const file of testFiles) {
+    const content = fs.readFileSync(path.join(testDir, file), 'utf-8');
+    // Crude tier inference from known-periodic file-name markers.
+    const tier = periodicMarkers.some(marker => file.includes(marker)) ? periodic : gate;
+    for (const { skill, re } of matchers) {
+      if (re.test(content)) tier.add(skill);
+    }
+  }
+  return { gate, periodic };
+}
+
+/**
+ * Read the short commit hash and branch name of HEAD in `repoRoot`.
+ *
+ * Returns 'unknown' for both fields when either git command fails (git not
+ * installed, not a repository, missing directory). git's stderr is discarded
+ * so an expected failure does not print `fatal: ...` noise to the console.
+ */
+function getGitInfo(repoRoot: string): { commit: string; branch: string } {
+  const git = (args: string): string =>
+    execSync(`git ${args}`, {
+      cwd: repoRoot,
+      encoding: 'utf-8',
+      // stdin unused, stdout captured, stderr silenced.
+      stdio: ['ignore', 'pipe', 'ignore'],
+    }).trim();
+  try {
+    const commit = git('rev-parse --short HEAD');
+    const branch = git('rev-parse --abbrev-ref HEAD');
+    return { commit, branch };
+  } catch {
+    return { commit: 'unknown', branch: 'unknown' };
+  }
+}
+
+/**
+ * Snapshot every skill under `opts.repoRoot` into a ParityBaseline.
+ *
+ * For each discovered skill directory this reads SKILL.md (and SKILL.md.tmpl
+ * when present), records byte / line / estimated-token counts, the
+ * frontmatter description length and eval coverage, then adds corpus totals,
+ * the ten heaviest skills and the current git commit / branch.
+ */
+export function captureBaseline(opts: CaptureOptions): ParityBaseline {
+  const repoRoot = opts.repoRoot;
+  const names = discoverSkillDirs(repoRoot);
+  const coverage = discoverEvalCoverage(repoRoot, names);
+
+  const skills: Record<string, SkillBaselineEntry> = {};
+  let corpusBytes = 0;
+  let descriptionBytes = 0;
+
+  names.forEach((name) => {
+    const skillDir = path.join(repoRoot, name);
+    const markdown = fs.readFileSync(path.join(skillDir, 'SKILL.md'), 'utf-8');
+    const mdBytes = Buffer.byteLength(markdown, 'utf-8');
+    const descriptionLen = Buffer.byteLength(extractDescription(markdown), 'utf-8');
+
+    // Template size is optional: null marks a skill without a .tmpl file.
+    const templatePath = path.join(skillDir, 'SKILL.md.tmpl');
+    let tmplBytes: number | null = null;
+    if (fs.existsSync(templatePath)) {
+      tmplBytes = Buffer.byteLength(fs.readFileSync(templatePath, 'utf-8'), 'utf-8');
+    }
+
+    skills[name] = {
+      skill: name,
+      skillMdBytes: mdBytes,
+      skillMdLines: markdown.split('\n').length,
+      estTokens: estimateTokens(mdBytes),
+      tmplBytes,
+      descriptionLen,
+      hasGateEval: coverage.gate.has(name),
+      hasPeriodicEval: coverage.periodic.has(name),
+    };
+    corpusBytes += mdBytes;
+    descriptionBytes += descriptionLen;
+  });
+
+  // Copy before sorting so the ranking is independent of `skills`.
+  const ranked = [...Object.values(skills)];
+  ranked.sort((left, right) => right.skillMdBytes - left.skillMdBytes);
+
+  const { commit, branch } = getGitInfo(repoRoot);
+  return {
+    tag: opts.tag ?? 'untagged',
+    capturedAt: new Date().toISOString(),
+    capturedFromCommit: commit,
+    capturedFromBranch: branch,
+    totalSkills: names.length,
+    totalCorpusBytes: corpusBytes,
+    estTotalCatalogTokens: estimateTokens(descriptionBytes),
+    topHeaviest: ranked.slice(0, 10),
+    skills,
+  };
+}
+
+/** Result of diffing two baselines; useful for v2 vs v1.44 deltas. */
+export interface BaselineDiff {
+  /** after.totalCorpusBytes - before.totalCorpusBytes. */
+  totalCorpusDelta: number;
+  /** totalCorpusDelta as a percentage of the before value; 0 when the before value is 0. */
+  totalCorpusDeltaPct: number;
+  /** after.estTotalCatalogTokens - before.estTotalCatalogTokens. */
+  catalogTokensDelta: number;
+  /** catalogTokensDelta as a percentage of the before value; 0 when the before value is 0. */
+  catalogTokensDeltaPct: number;
+  /** One row per skill present in either baseline, sorted by |deltaBytes| descending. */
+  perSkill: Array<{
+    skill: string;
+    /** SKILL.md bytes in the before baseline (0 when the skill is absent there). */
+    beforeBytes: number;
+    /** SKILL.md bytes in the after baseline (0 when the skill is absent there). */
+    afterBytes: number;
+    /** afterBytes - beforeBytes. */
+    deltaBytes: number;
+    /** deltaBytes as a percentage of beforeBytes; 0 when beforeBytes is 0. */
+    deltaPct: number;
+  }>;
+}
+
+/**
+ * Compare two baselines and report corpus-level, catalog-token and per-skill
+ * byte deltas (after minus before). Percentages are relative to the before
+ * value and are 0 when that value is 0. A skill missing from one side counts
+ * as 0 bytes there. Per-skill rows are ordered by absolute byte delta,
+ * largest first.
+ */
+export function diffBaselines(before: ParityBaseline, after: ParityBaseline): BaselineDiff {
+  // Percentage change relative to `base`; a zero base yields 0 instead of dividing.
+  const percentOf = (delta: number, base: number): number => (base ? (delta / base) * 100 : 0);
+
+  const totalCorpusDelta = after.totalCorpusBytes - before.totalCorpusBytes;
+  const catalogTokensDelta = after.estTotalCatalogTokens - before.estTotalCatalogTokens;
+
+  // Union of skill names, before-side names first.
+  const names = new Set([...Object.keys(before.skills), ...Object.keys(after.skills)]);
+  const perSkill: BaselineDiff['perSkill'] = Array.from(names, (skill) => {
+    const beforeBytes = before.skills[skill]?.skillMdBytes ?? 0;
+    const afterBytes = after.skills[skill]?.skillMdBytes ?? 0;
+    const deltaBytes = afterBytes - beforeBytes;
+    return {
+      skill,
+      beforeBytes,
+      afterBytes,
+      deltaBytes,
+      deltaPct: percentOf(deltaBytes, beforeBytes),
+    };
+  });
+  perSkill.sort((left, right) => Math.abs(right.deltaBytes) - Math.abs(left.deltaBytes));
+
+  return {
+    totalCorpusDelta,
+    totalCorpusDeltaPct: percentOf(totalCorpusDelta, before.totalCorpusBytes),
+    catalogTokensDelta,
+    catalogTokensDeltaPct: percentOf(catalogTokensDelta, before.estTotalCatalogTokens),
+    perSkill,
+  };
+}
@@ -0,0 +1,230 @@
+/**
+ * Cathedral parity-eval harness (v1.45.0.0 T0b).
+ *
+ * Compares CURRENT SKILL.md output to a v1.44.1 golden baseline along three
+ * axes: STRUCTURE (frontmatter shape), CONTENT (must-preserve phrases per
+ * skill family), and SIZE (per-skill byte budget). The fourth axis —
+ * BEHAVIORAL parity via LLM-as-judge — runs on top of this harness in the
+ * periodic-tier eval suite (paid, ~$0.20 per skill judge call).
+ *
+ * The structural + content checks ship in v1.45.0.0 as the foundation; the
+ * LLM-judge layer lands in v2.0.0.0 alongside the sections/ pattern. Both
+ * use this module's APIs.
+ *
+ * Why a separate harness from skill-size-budget.test.ts: that one enforces
+ * size discipline only. This module supports content invariants per skill
+ * family (e.g., cso must preserve OWASP/STRIDE; plan-ceo must preserve
+ * mode-selection phrasing) so future compression can't silently strip
+ * load-bearing prose even when size stays within ratio.
+ */
+
+import * as fs from 'fs';
+import * as path from 'path';
+import type { ParityBaseline, SkillBaselineEntry } from './capture-parity-baseline';
+import { captureBaseline } from './capture-parity-baseline';
+
+/** Must-not-break rules for one skill; every field except `skill` is optional. */
+export interface ParityInvariant {
+  /** Skill name; also the directory under the repo root whose SKILL.md is read. */
+  skill: string;
+  /** Phrases that MUST appear in the generated SKILL.md (case-insensitive substring). */
+  mustContain?: string[];
+  /** Markdown H2 headings that MUST appear (matched case-sensitively). */
+  mustHaveHeadings?: string[];
+  /**
+   * Maximum byte size growth ratio vs baseline (current bytes / baseline bytes).
+   * 1.0 = no growth allowed. Only checked when a baseline entry is supplied.
+   */
+  maxSizeRatio?: number;
+  /** Minimum byte size (catches over-stripping cliffs). */
+  minBytes?: number;
+}
+
+/** Outcome of checking one skill against its invariant. */
+export interface ParityCheckResult {
+  /** Skill the result belongs to. */
+  skill: string;
+  /** True exactly when `failures` is empty. */
+  passed: boolean;
+  /** One human-readable message per violated rule. */
+  failures: string[];
+}
+
+/**
+ * Check one skill against its invariant and collect every violation.
+ *
+ * SIZE: `maxSizeRatio` compares current vs baseline SKILL.md bytes and is
+ * skipped when no baseline entry is supplied; `minBytes` is checked against
+ * the current entry alone.
+ *
+ * CONTENT: when phrases or headings are required, SKILL.md is re-read from
+ * `<repoRoot>/<skill>/SKILL.md`. Phrases match as case-insensitive
+ * substrings. A heading matches only when some line starts with it, so
+ * `### Preamble` or a mid-sentence mention does not satisfy `## Preamble`,
+ * while `## Preamble (run first)` does. An empty file is still checked and
+ * reports every required phrase / heading as missing. An unreadable file
+ * yields a single "cannot read" failure.
+ *
+ * @param invariant rules for the skill
+ * @param current   freshly captured entry for the skill
+ * @param baseline  baseline entry, or undefined when the skill is new
+ * @param repoRoot  repository root used to locate SKILL.md
+ * @returns the skill name, pass flag and failure messages
+ */
+export function checkSkillParity(
+  invariant: ParityInvariant,
+  current: SkillBaselineEntry,
+  baseline: SkillBaselineEntry | undefined,
+  repoRoot: string,
+): ParityCheckResult {
+  const failures: string[] = [];
+
+  // SIZE checks
+  if (invariant.maxSizeRatio !== undefined && baseline) {
+    const ratio = current.skillMdBytes / baseline.skillMdBytes;
+    if (ratio > invariant.maxSizeRatio) {
+      failures.push(`size ratio ${ratio.toFixed(3)} > maxSizeRatio ${invariant.maxSizeRatio}`);
+    }
+  }
+  if (invariant.minBytes !== undefined && current.skillMdBytes < invariant.minBytes) {
+    failures.push(`size ${current.skillMdBytes} < minBytes ${invariant.minBytes}`);
+  }
+
+  // CONTENT checks (read live file for fresh content)
+  const phrases = invariant.mustContain ?? [];
+  const headings = invariant.mustHaveHeadings ?? [];
+  if (phrases.length > 0 || headings.length > 0) {
+    const skillMdPath = path.join(repoRoot, invariant.skill, 'SKILL.md');
+    let content: string | null = null;
+    try {
+      content = fs.readFileSync(skillMdPath, 'utf-8');
+    } catch (err) {
+      const message = err instanceof Error ? err.message : String(err);
+      failures.push(`cannot read ${skillMdPath}: ${message}`);
+    }
+    // Compare against null: an empty string is a readable file that must
+    // fail its content checks, not one to skip.
+    if (content !== null) {
+      const lower = content.toLowerCase();
+      const lines = content.split(/\r?\n/);
+      for (const phrase of phrases) {
+        if (!lower.includes(phrase.toLowerCase())) {
+          failures.push(`missing required phrase: "${phrase}"`);
+        }
+      }
+      for (const heading of headings) {
+        if (!lines.some(line => line.startsWith(heading))) {
+          failures.push(`missing required heading: "${heading}"`);
+        }
+      }
+    }
+  }
+
+  return {
+    skill: invariant.skill,
+    passed: failures.length === 0,
+    failures,
+  };
+}
+
+/** Aggregate result of runParityChecks(). */
+export interface ParityReport {
+  /** Tag of the baseline the current state was compared against. */
+  baselineTag: string;
+  /** capturedAt timestamp of the freshly captured current state. */
+  currentCapturedAt: string;
+  /** Number of entries in `details` (one per invariant). */
+  totalChecks: number;
+  /** Count of entries in `details` with passed === true. */
+  passed: number;
+  /** Count of entries in `details` with passed === false. */
+  failed: number;
+  /** Per-invariant results, in the order the invariants were supplied. */
+  details: ParityCheckResult[];
+}
+
+/**
+ * Capture the current repo state and evaluate every invariant against it.
+ *
+ * A skill missing from the current capture fails immediately. The message
+ * distinguishes a skill that existed in the baseline ("skill removed") from
+ * one that is in neither snapshot ("skill not found", e.g. a misspelled
+ * invariant). Skills present in the current capture go to checkSkillParity,
+ * with the baseline entry possibly undefined.
+ *
+ * @param opts.repoRoot   repository root to capture
+ * @param opts.baseline   previously captured baseline to compare against
+ * @param opts.invariants rules to evaluate, one result per rule
+ * @returns totals plus per-invariant results in input order
+ */
+export function runParityChecks(opts: {
+  repoRoot: string;
+  baseline: ParityBaseline;
+  invariants: ParityInvariant[];
+}): ParityReport {
+  const { repoRoot, baseline, invariants } = opts;
+  const current = captureBaseline({ repoRoot });
+  const details: ParityCheckResult[] = invariants.map((invariant) => {
+    const baselineEntry = baseline.skills[invariant.skill];
+    const currentEntry = current.skills[invariant.skill];
+    if (currentEntry) {
+      return checkSkillParity(invariant, currentEntry, baselineEntry, repoRoot);
+    }
+    const failure = baselineEntry
+      ? `skill removed: ${invariant.skill} present in baseline but not current state`
+      : `skill not found: ${invariant.skill} absent from both baseline and current state`;
+    return { skill: invariant.skill, passed: false, failures: [failure] };
+  });
+  const passed = details.filter(d => d.passed).length;
+  return {
+    baselineTag: baseline.tag,
+    currentCapturedAt: current.capturedAt,
+    totalChecks: details.length,
+    passed,
+    failed: details.length - passed,
+    details,
+  };
+}
+
+/** H2 headings required of every skill in the standard registry. */
+const STANDARD_HEADINGS: readonly string[] = ['## Preamble', '## When to invoke'];
+
+/**
+ * Build a registry entry with the shared defaults: the standard headings and
+ * a 5% growth ceiling (maxSizeRatio 1.05). Each entry gets its own copy of
+ * the headings array.
+ */
+function standardInvariant(skill: string, minBytes: number, mustContain: string[]): ParityInvariant {
+  return {
+    skill,
+    mustContain,
+    mustHaveHeadings: [...STANDARD_HEADINGS],
+    maxSizeRatio: 1.05,
+    minBytes,
+  };
+}
+
+/**
+ * Standard invariant registry — the v1.45.0.0 set.
+ *
+ * Each entry pins what must-not-break in a skill family: required phrases,
+ * required headings, a growth ceiling and a minimum size. Extend as future
+ * skills land. Phase B (v2.0.0.0) adds LLM-judge invariants on top of these.
+ *
+ * NOTE(review): mustContain is documented as a case-insensitive substring
+ * match, so short tokens such as 'PR', 'eng' or 'ceo' also match inside
+ * longer words — confirm these are specific enough.
+ */
+export const PARITY_INVARIANTS: ParityInvariant[] = [
+  standardInvariant('cso', 30_000, ['OWASP', 'STRIDE', 'daily', 'comprehensive', 'verif']),
+  standardInvariant('ship', 80_000, ['VERSION', 'CHANGELOG', 'review', 'merge', 'PR']),
+  standardInvariant('plan-ceo-review', 80_000, [
+    'SCOPE EXPANSION',
+    'SELECTIVE EXPANSION',
+    'HOLD SCOPE',
+    'SCOPE REDUCTION',
+  ]),
+  standardInvariant('plan-eng-review', 70_000, ['Architecture', 'Code Quality', 'Test', 'Performance']),
+  standardInvariant('plan-design-review', 70_000, ['design', 'visual']),
+  standardInvariant('review', 70_000, ['confidence', 'P1', 'P2']),
+  standardInvariant('qa', 50_000, ['bug', 'browse', 'fix']),
+  standardInvariant('investigate', 30_000, ['root cause', 'hypothes']),
+  standardInvariant('office-hours', 70_000, ['design doc', 'problem statement']),
+  standardInvariant('autoplan', 70_000, ['ceo', 'eng', 'design']),
+];
@@ -374,6 +374,10 @@ export const E2E_TOUCHFILES: Record<string, string[]> = {
  // Real-device path — only runs with GSTACK_HAS_IOS_DEVICE=1 + a paired
  // iPhone. Validates the CoreDevice agent + iOS SDK toolchain. Periodic-tier.
  'ios-qa-device':    ['ios-qa/templates/**', 'test/fixtures/ios-qa/FixtureApp/**', 'test/skill-e2e-ios-device.test.ts'],
+
+  // /spec end-to-end via PTY — exercises the full Phase 1→5 pipeline
+  // including --execute spawn. Periodic-tier — paid + non-deterministic.
+  'spec-execute':     ['spec/**', 'test/skill-e2e-spec-execute.test.ts'],
 };

 /**
@@ -649,6 +653,8 @@ export const E2E_TIERS: Record<string, 'gate' | 'periodic'> = {
  'ios-qa-swift-build': 'periodic',
  // Requires a real connected + paired iPhone. Manual-trigger only.
  'ios-qa-device': 'periodic',
+  // /spec end-to-end PTY pipeline (paid, non-deterministic — periodic-tier).
+  'spec-execute': 'periodic',
 };

 /**
@@ -673,6 +679,9 @@ export const LLM_JUDGE_TOUCHFILES: Record<string, string[]> = {
  // Plan Reviews
  'plan-ceo-review/SKILL.md modes':       ['plan-ceo-review/SKILL.md', 'plan-ceo-review/SKILL.md.tmpl'],
  'plan-eng-review/SKILL.md sections':    ['plan-eng-review/SKILL.md', 'plan-eng-review/SKILL.md.tmpl'],
+
+  // /spec authored-spec quality (paid LLM-judge — periodic-tier).
+  'spec authored quality':                ['spec/SKILL.md', 'spec/SKILL.md.tmpl', 'test/fixtures/spec/**'],
  'plan-design-review/SKILL.md passes':   ['plan-design-review/SKILL.md', 'plan-design-review/SKILL.md.tmpl'],

  // Design skills