Files
gstack/test/gen-skill-docs-idempotency.test.ts
T
Garry Tan b5f75c18c7 test(coverage): close 5 remaining v1.46.0.0 test gaps (A-E)
Five behaviors that v1.46 ships but had no test coverage. All now pinned.

A) --host all idempotency (test/gen-skill-docs-idempotency.test.ts)
   The default test ran Claude host only. Non-Claude hosts (Codex, Factory,
   Cursor, OpenClaw, GBrain, Slate, OpenCode, Hermes, Kiro) each have their
   own output paths and could carry their own non-deterministic fields. We
   hit a "--host all needed for freshness check" mid-/ship. Now: two
   consecutive `bun run gen:skill-docs --host all` runs must produce
   byte-identical outputs across a per-host sample (.agents/, .cursor/,
   .factory/, .gbrain/). Catches per-host adapter regressions before CI.

B) --catalog-mode=full opt-out (test/catalog-mode-full.test.ts)
   The legacy escape hatch had zero tests. 6 new tests across two layers:
   static (CATALOG_MODE_ARG parsed; conditional gate present; default is
   "trim"; invalid value throws) + smoke (actual --catalog-mode=full run
   produces a multi-line `description: |` block + omits "## When to invoke"
   body section; mutates the working tree then restores in a finally block).

C) parity-baseline-v1.44.1.json integrity (test/parity-baseline-integrity.test.ts)
   The baseline is the source of every v1→v2 number cited in the
   CHANGELOG v1.46.0.0 entry. Anyone could edit it without test failure
   until now. 8 new tests pin: existence, tag, capturedFromCommit
   allowlist, expected v1.44 numbers (51 skills, ~2,915 KB, ~9,319
   catalog tokens), CHANGELOG references this file by path, per-skill
   shape, and a SHA256 byte-stability hash. Any edit fails with a clear
   "if intentional, update EXPECTED_HASH AND the CHANGELOG numbers" signal.

D) Live appliesTo gate end-to-end (test/resolver-entry.test.ts extended)
   The unwrapResolver unit tests covered the function; the gen-skill-docs.ts
   substitution loop that USES the gate had no integration coverage. 6 new
   tests simulate the exact 4-line shape from gen-skill-docs.ts:457-467
   against synthetic registries: plain-function fires unconditionally,
   gated fires when true / empty-string when false, mixed registries
   compose, parameterized resolvers respect gates, unknown resolvers throw.

E) Per-skill min-size floor (test/skill-size-budget.test.ts extended)
   The existing 200-byte body coverage-floor is a noise floor — a skill
   that lost 99.75% of content still passes. 1 new test asserts every
   skill stays ≥80% of its v1.44.1 baseline size (the parity-suite
   content invariants only covered 10 of 51 skills; the remaining 41
   were uncovered). SECTIONS_EXTRACTED hook in place for v2.0.0.0 when
   the sections/ pattern legitimately shrinks ship/plan-ceo/etc. past
   the floor.

Test plan:
- bun test focused 17-file suite: 1202 pass, 0 fail
  (+23 new tests vs the pre-fill 1179 baseline)
- catalog-mode=full mutates working tree then restores cleanly
- --host all idempotency runs two full gen passes in <1s on this machine

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-26 11:49:59 -07:00

160 lines
5.9 KiB
TypeScript

/**
* Idempotency test for gen-skill-docs (regression for v1.45.0.0 timestamp flap).
*
* Running `bun run gen:skill-docs` twice in a row must produce a no-op on
* the second run: every output file is byte-identical to itself. Without
* this gate, CI freshness checks flap whenever someone introduces a
* timestamp, a random seed, or any other non-deterministic field into a
* generated artifact.
*
* v1.45.0.0 shipped with a `generated_at` ISO timestamp in
* scripts/proactive-suggestions.json that updated every run. CI freshness
* checks failed because the committed file's timestamp never matched the
* latest gen. Fixed in 43e18af4 — this test pins the contract going forward.
*
* The test pays a small cost (two default gen-skill-docs runs at ~3s total,
* plus one --dry-run and two --host all runs) but catches a class of bugs
* that's invisible until CI fails.
*/
import { describe, test, expect } from 'bun:test';
import { spawnSync } from 'child_process';
import * as fs from 'fs';
import * as path from 'path';
/**
 * Repository root: the parent of the directory containing this test module
 * (`import.meta.dir` is Bun's absolute path to the current file's directory).
 * Used as the working directory for gen runs and as the base for the
 * relative paths listed below.
 */
const REPO_ROOT = path.resolve(import.meta.dir, '..');
/**
 * Files that gen-skill-docs writes and that must be byte-stable across runs.
 *
 * Paths are relative to REPO_ROOT. This is the default file list for
 * `snapshot()`, which silently skips entries that do not exist on disk, so a
 * wrong or stale path here is not reported — it is simply never compared.
 */
const STABLE_OUTPUTS = [
'scripts/proactive-suggestions.json',
'SKILL.md',
'ship/SKILL.md',
'plan-ceo-review/SKILL.md',
'office-hours/SKILL.md',
'gstack/llms.txt',
];
/**
 * Sampled outputs for the `--host all` idempotency test: three files shared
 * with the default list plus one canonical `gstack-ship/SKILL.md` for each of
 * four non-Claude host directories (.agents/, .cursor/, .factory/, .gbrain/).
 *
 * The full host-all run is described as also touching .hermes/, .kiro/,
 * .openclaw/, .opencode/ and .slate/. Those directories are NOT sampled
 * here, so non-determinism confined to them goes undetected by this list.
 * Sampling one file per host keeps the check cheap compared with
 * snapshotting hundreds of files.
 *
 * NOTE(review): consider adding one sample per remaining host once their
 * output paths are confirmed.
 */
const STABLE_HOST_ALL_OUTPUTS = [
'scripts/proactive-suggestions.json',
'SKILL.md',
'ship/SKILL.md',
'.agents/skills/gstack-ship/SKILL.md',
'.cursor/skills/gstack-ship/SKILL.md',
'.factory/skills/gstack-ship/SKILL.md',
'.gbrain/skills/gstack-ship/SKILL.md',
];
/**
 * Runs `bun run gen:skill-docs` synchronously with REPO_ROOT as the working
 * directory and a 120s timeout.
 *
 * @param extraArgs Extra CLI arguments appended after the script name
 *   (for example `['--host', 'all']`).
 * @returns `exitCode` is the child's exit status, or -1 when there is none
 *   (the process could not be spawned, timed out, or was killed by a signal).
 *   `stderr` is the captured stderr text; when the spawn itself failed or the
 *   child was terminated by a signal, a one-line description of that is
 *   appended so a -1 exit code is never returned without an explanation.
 */
function runGen(extraArgs: string[] = []): { exitCode: number; stderr: string } {
  const result = spawnSync('bun', ['run', 'gen:skill-docs', ...extraArgs], {
    cwd: REPO_ROOT,
    stdio: ['ignore', 'pipe', 'pipe'],
    timeout: 120_000,
  });

  const diagnostics: string[] = [];
  const capturedStderr = result.stderr?.toString() ?? '';
  if (capturedStderr) diagnostics.push(capturedStderr);
  // `error` is set when the child could not be spawned or hit the timeout;
  // `signal` is set when it was killed. Neither shows up in the child's own
  // stderr, so without these lines the caller would see an empty message.
  if (result.error) diagnostics.push(`spawn error: ${result.error.message}`);
  if (result.signal) diagnostics.push(`terminated by signal ${result.signal}`);

  return {
    exitCode: result.status ?? -1,
    stderr: diagnostics.join('\n'),
  };
}
/**
 * Reads the current contents of the given repo-relative files.
 *
 * @param files Paths relative to REPO_ROOT; defaults to STABLE_OUTPUTS.
 * @returns A map from each relative path to its UTF-8 decoded contents.
 *   Paths that do not exist on disk are left out of the map rather than
 *   reported, so callers that need every file present must check for that
 *   themselves.
 */
function snapshot(files: string[] = STABLE_OUTPUTS): Map<string, string> {
  const contentsByPath = new Map<string, string>();
  files.forEach((relPath) => {
    const absPath = path.join(REPO_ROOT, relPath);
    if (!fs.existsSync(absPath)) return;
    contentsByPath.set(relPath, fs.readFileSync(absPath, 'utf-8'));
  });
  return contentsByPath;
}
/**
 * Lists every file whose content differs between two snapshots.
 *
 * The comparison runs over the union of both key sets, so a file that exists
 * in only one snapshot (created or deleted between the two runs) is reported
 * as well as a file whose content changed.
 */
function findFlapping(before: Map<string, string>, after: Map<string, string>): string[] {
  const allFiles = new Set([...before.keys(), ...after.keys()]);
  return [...allFiles].filter((file) => before.get(file) !== after.get(file));
}

/**
 * Fails the current test when a gen run did not exit with code 0, including
 * the stderr captured by `runGen` so the cause is visible in the test output.
 */
function assertGenSucceeded(run: { exitCode: number; stderr: string }, label: string): void {
  if (run.exitCode !== 0) {
    throw new Error(`${label} exited with code ${run.exitCode}\n${run.stderr}`);
  }
}

describe('gen-skill-docs idempotency', () => {
  test('two consecutive runs produce byte-identical outputs (no flapping fields)', () => {
    const firstRun = runGen();
    assertGenSucceeded(firstRun, 'first gen-skill-docs run');
    const after1 = snapshot();
    expect(after1.size).toBeGreaterThan(0);

    const secondRun = runGen();
    assertGenSucceeded(secondRun, 'second gen-skill-docs run');
    const after2 = snapshot();

    // Compare each stable output byte-for-byte.
    const flapping = findFlapping(after1, after2);
    if (flapping.length > 0) {
      throw new Error(
        `${flapping.length} file(s) changed between two consecutive gen-skill-docs runs (flapping):\n` +
          flapping.map(f => ` - ${f}`).join('\n') +
          `\nLikely cause: a non-deterministic field (timestamp, random ID, ` +
          `filesystem-iteration order) leaked into the generated output. CI freshness ` +
          `checks (git diff --exit-code) will fail unpredictably until this is fixed.`,
      );
    }
  }, 180_000); // 3 min test budget for two gen runs

  test('--dry-run after a fresh gen reports zero stale files', () => {
    // Pre-condition: working tree gen must be fresh (idempotency test above ran first).
    // If a contributor introduces a non-deterministic field, this dry-run reports STALE.
    const result = spawnSync('bun', ['run', 'gen:skill-docs', '--dry-run'], {
      cwd: REPO_ROOT,
      stdio: ['ignore', 'pipe', 'pipe'],
      timeout: 60_000,
    });
    if (result.status !== 0) {
      // Surface stderr: a bare "expected 0, received 1" gives no clue why the run failed.
      throw new Error(
        `gen-skill-docs --dry-run exited with status ${result.status}\n` +
          (result.stderr?.toString() ?? ''),
      );
    }
    const stdout = result.stdout?.toString() ?? '';
    // STALE: prefix means a file would change. Count them.
    const staleLines = stdout.split('\n').filter(l => l.startsWith('STALE:'));
    if (staleLines.length > 0) {
      throw new Error(
        `--dry-run reports ${staleLines.length} stale file(s) after a fresh gen:\n` +
          staleLines.map(l => ` ${l}`).join('\n') +
          `\nRun \`bun run gen:skill-docs\` and commit the result.`,
      );
    }
  }, 90_000);

  test('--host all idempotency: every host output is byte-stable across two runs', () => {
    // Gap A: the default test above runs Claude host only. Non-Claude hosts
    // (Codex, Factory, Cursor, OpenClaw, GBrain, Slate, OpenCode, Hermes,
    // Kiro) have their own output paths and could carry their own
    // non-deterministic fields. We hit a "--host all needed for freshness
    // check" mid-/ship; this test pins the contract for the sampled hosts.
    const firstRun = runGen(['--host', 'all']);
    assertGenSucceeded(firstRun, 'first --host all gen run');
    const after1 = snapshot(STABLE_HOST_ALL_OUTPUTS);

    // snapshot() silently skips files that do not exist, and the sample list
    // includes files the default host also writes. A plain "size > 0" check
    // would therefore pass even if no per-host file was produced, leaving the
    // per-host comparison empty. Require every sampled file to be present.
    const missing = STABLE_HOST_ALL_OUTPUTS.filter(file => !after1.has(file));
    if (missing.length > 0) {
      throw new Error(
        `${missing.length} sampled file(s) missing after a --host all gen run:\n` +
          missing.map(f => ` - ${f}`).join('\n') +
          `\nEither the host adapter stopped writing the file or the sample path is stale.`,
      );
    }

    const secondRun = runGen(['--host', 'all']);
    assertGenSucceeded(secondRun, 'second --host all gen run');
    const after2 = snapshot(STABLE_HOST_ALL_OUTPUTS);

    const flapping = findFlapping(after1, after2);
    if (flapping.length > 0) {
      throw new Error(
        `${flapping.length} file(s) changed between two consecutive --host all gen runs:\n` +
          flapping.map(f => ` - ${f}`).join('\n') +
          `\nLikely cause: a non-deterministic field leaked into a non-Claude host adapter ` +
          `(scripts/host-adapters/*.ts). CI freshness checks for that host will flap.`,
      );
    }
  }, 300_000); // 5 min test budget for two host-all runs
});