mirror of
https://github.com/garrytan/gstack.git
synced 2026-06-20 08:40:11 +02:00
Merge remote-tracking branch 'origin/main' into garrytan/plan-flag-unresolved-issues
This commit is contained in:
@@ -0,0 +1,131 @@
|
||||
/**
|
||||
* auq-error-fallback-hook — the OV3:B runtime reliability layer.
|
||||
*
|
||||
* Two layers of testing:
|
||||
* - PURE functions (isErrorResponse, directiveFor): deterministic, the core logic.
|
||||
* - INTEGRATION: spawn the hook as a PostToolUse process with synthetic stdin and
|
||||
* a controlled env, assert it injects the right directive on an error result and
|
||||
* stays inert on a real answer.
|
||||
*
|
||||
* NOTE: whether the Claude Code PLATFORM invokes PostToolUse on an MCP
|
||||
* transport/missing-result error is unverified (could not force the Conductor
|
||||
* bug in a harness — see docs/spikes/claude-code-hook-mutation.md). These tests
|
||||
* pin the hook's BEHAVIOR given it is invoked; the platform trigger is the
|
||||
* documented residual risk. The hook is inert if never invoked.
|
||||
*/
|
||||
import { describe, test, expect } from 'bun:test';
|
||||
import { spawnSync } from 'child_process';
|
||||
import * as path from 'path';
|
||||
import { isErrorResponse, directiveFor } from '../hosts/claude/hooks/auq-error-fallback-hook.ts';
|
||||
|
||||
const HOOK = path.resolve(__dirname, '..', 'hosts', 'claude', 'hooks', 'auq-error-fallback-hook.ts');
|
||||
|
||||
describe('isErrorResponse — only clear failures, never a real answer', () => {
|
||||
test('null / undefined / empty string are failures', () => {
|
||||
expect(isErrorResponse(null)).toBe(true);
|
||||
expect(isErrorResponse(undefined)).toBe(true);
|
||||
expect(isErrorResponse('')).toBe(true);
|
||||
expect(isErrorResponse(' ')).toBe(true);
|
||||
});
|
||||
|
||||
test('the Conductor missing-result string is a failure', () => {
|
||||
expect(isErrorResponse('[Tool result missing due to internal error]')).toBe(true);
|
||||
});
|
||||
|
||||
test('is_error: true / error-field / sentinel-in-content are failures', () => {
|
||||
expect(isErrorResponse({ is_error: true })).toBe(true);
|
||||
expect(isErrorResponse({ isError: true })).toBe(true);
|
||||
expect(isErrorResponse({ error: 'boom' })).toBe(true);
|
||||
expect(isErrorResponse({ content: 'Tool result missing due to internal error' })).toBe(true);
|
||||
});
|
||||
|
||||
test('a real answer is NOT a failure (no false trigger)', () => {
|
||||
expect(isErrorResponse({ answers: [{ option_label: 'A' }] })).toBe(false);
|
||||
expect(isErrorResponse('A')).toBe(false);
|
||||
// a choice that coincidentally contains "error" must not trip it
|
||||
expect(isErrorResponse({ answers: [{ option_label: 'Fix the error' }] })).toBe(false);
|
||||
expect(isErrorResponse('Investigate the login error')).toBe(false);
|
||||
});
|
||||
|
||||
test('Codex review: narrow detection — generic "error"/"is_error" substrings do NOT trigger', () => {
|
||||
// A real answer mentioning "internal error" must not be read as a failure.
|
||||
expect(isErrorResponse('Investigate the internal error')).toBe(false);
|
||||
// A serialized success payload containing the substring is_error:false must not trigger.
|
||||
expect(isErrorResponse('{"is_error": false, "answer": "A"}')).toBe(false);
|
||||
expect(isErrorResponse({ is_error: false })).toBe(false);
|
||||
expect(isErrorResponse({ content: 'The page had an internal error we fixed' })).toBe(false);
|
||||
});
|
||||
});
|
||||
|
||||
describe('directiveFor — per-session-kind instruction', () => {
|
||||
test('interactive directive demands the prose triad', () => {
|
||||
const d = directiveFor('interactive');
|
||||
expect(d).toMatch(/ELI10/);
|
||||
expect(d).toMatch(/Completeness: X\/10/);
|
||||
expect(d).toMatch(/\(recommended\)/);
|
||||
expect(d).toMatch(/reply with a letter/i);
|
||||
expect(d).toMatch(/STOP/);
|
||||
});
|
||||
|
||||
test('headless directive BLOCKs', () => {
|
||||
expect(directiveFor('headless')).toMatch(/BLOCKED — AskUserQuestion unavailable/);
|
||||
});
|
||||
|
||||
test('spawned directive auto-chooses', () => {
|
||||
expect(directiveFor('spawned')).toMatch(/auto-choose/i);
|
||||
});
|
||||
});
|
||||
|
||||
/**
 * Spawn the hook with synthetic stdin + controlled env; parse its JSON stdout.
 *
 * @param stdin - Payload serialized to JSON and piped to the hook's stdin.
 * @param env - Extra environment variables for the child. Only PATH is inherited
 *   from the test process, so the session-kind variables under test are the only
 *   ones the hook can see.
 * @returns The `hookSpecificOutput` object from the hook's stdout, or `{}` when the
 *   hook printed nothing (or printed JSON without that key).
 * @throws If the process could not be spawned, exited non-zero / was killed, or
 *   printed stdout that is not valid JSON.
 */
function runHook(stdin: object, env: Record<string, string>): { additionalContext?: string } {
  const res = spawnSync('bun', [HOOK], {
    input: JSON.stringify(stdin),
    encoding: 'utf-8',
    env: { PATH: process.env.PATH ?? '/usr/bin:/bin', ...env },
  });

  // A spawn failure (e.g. `bun` not on PATH) leaves stdout null. Treating that as
  // "no output" would make the "no injection" tests pass without the hook running.
  if (res.error) throw res.error;

  // Likewise a crashed hook prints nothing on stdout; surface it instead of
  // reading the empty output as "inert".
  if (res.status !== 0) {
    throw new Error(
      `hook exited with status ${res.status} (signal ${res.signal}): ${(res.stderr ?? '').trim()}`,
    );
  }

  const stdout = (res.stdout ?? '').trim();
  if (stdout === '') return {};

  const parsed = JSON.parse(stdout) as { hookSpecificOutput?: { additionalContext?: string } } | null;
  return parsed?.hookSpecificOutput ?? {};
}
|
||||
|
||||
describe('hook integration — invoked as PostToolUse', () => {
|
||||
test('error result + headless env → injects BLOCK directive', () => {
|
||||
const out = runHook(
|
||||
{ tool_name: 'mcp__conductor__AskUserQuestion', tool_response: '[Tool result missing due to internal error]' },
|
||||
{ GSTACK_HEADLESS: '1' },
|
||||
);
|
||||
expect(out.additionalContext).toMatch(/BLOCKED — AskUserQuestion unavailable/);
|
||||
});
|
||||
|
||||
test('error result + interactive env → injects prose-triad directive', () => {
|
||||
const out = runHook(
|
||||
{ tool_name: 'AskUserQuestion', tool_response: null },
|
||||
{ CONDUCTOR_PORT: '55010' },
|
||||
);
|
||||
expect(out.additionalContext).toMatch(/render the decision as a PROSE message/i);
|
||||
expect(out.additionalContext).toMatch(/Completeness: X\/10/);
|
||||
});
|
||||
|
||||
test('error result + spawned env → injects auto-choose directive', () => {
|
||||
const out = runHook(
|
||||
{ tool_name: 'AskUserQuestion', tool_response: { is_error: true } },
|
||||
{ OPENCLAW_SESSION: '1' },
|
||||
);
|
||||
expect(out.additionalContext).toMatch(/auto-choose/i);
|
||||
});
|
||||
|
||||
test('SUCCESSFUL answer → no injection (inert on real answers)', () => {
|
||||
const out = runHook(
|
||||
{ tool_name: 'AskUserQuestion', tool_response: { answers: [{ option_label: 'A' }] } },
|
||||
{ GSTACK_HEADLESS: '1' },
|
||||
);
|
||||
expect(out.additionalContext).toBeUndefined();
|
||||
});
|
||||
|
||||
test('non-AUQ tool → defers (no injection)', () => {
|
||||
const out = runHook(
|
||||
{ tool_name: 'Bash', tool_response: null },
|
||||
{ GSTACK_HEADLESS: '1' },
|
||||
);
|
||||
expect(out.additionalContext).toBeUndefined();
|
||||
});
|
||||
});
|
||||
@@ -47,6 +47,11 @@ const MANDATORY: Array<{ name: string; re: RegExp }> = [
|
||||
{ name: 'Completeness coverage rule', re: /Completeness\s*:/i },
|
||||
{ name: 'kind-vs-coverage rule', re: /options differ in kind/i },
|
||||
{ name: 'Self-check checklist', re: /Self-check before emitting/i },
|
||||
// The runtime-failure fallback must be ALWAYS-LOADED too: when an AUQ call errors
|
||||
// mid-skill, the model needs the prose-fallback rule in context that instant, not
|
||||
// stranded in an on-demand section. Same guarantee as the format spec above.
|
||||
{ name: 'AUQ-failure fallback subsection', re: /When AskUserQuestion is unavailable or a call fails/i },
|
||||
{ name: 'fallback SESSION_KIND branch', re: /SESSION_KIND/ },
|
||||
];
|
||||
|
||||
/** Per-skill AUQ rules that govern review-finding cadence. A carve may move
|
||||
|
||||
@@ -0,0 +1,22 @@
|
||||
/**
|
||||
* E1 — carve-guard completeness meta-guard (GATE tier, free).
|
||||
*
|
||||
* Makes the carve gap impossible to reopen: every skill carved on disk (owns a
|
||||
* sections/manifest.json) MUST be in the canonical CARVE_GUARDS registry, and
|
||||
* vice-versa. Because the static (E2) and behavioral (T2) guards are data-driven
|
||||
* FROM the registry, registry membership IS guard coverage — so this set-parity
|
||||
* check is the whole game (codex #2: no need to grep test source). Carve a 7th
|
||||
* skill without a registry entry and this fails CI.
|
||||
*/
|
||||
|
||||
import { describe, test, expect } from 'bun:test';
|
||||
import * as path from 'path';
|
||||
import { checkCompleteness } from './helpers/carve-guard-checks';
|
||||
|
||||
const ROOT = path.resolve(import.meta.dir, '..');
|
||||
|
||||
describe('carve-guard completeness (gate, free)', () => {
|
||||
test('filesystem carved set == CARVE_GUARDS set, and every entry is consistent', () => {
|
||||
expect(checkCompleteness(ROOT)).toEqual([]);
|
||||
});
|
||||
});
|
||||
@@ -0,0 +1,100 @@
|
||||
/**
|
||||
* ET1 — guard-of-guards negative tests (GATE tier, free).
|
||||
*
|
||||
* Proves the guards actually BITE. The happy-path E1/E2 tests prove the real
|
||||
* skills pass; these prove a BROKEN carve fails. Without this, a logic bug in
|
||||
* checkOrdering/checkCompleteness would pass silently and protect nothing — the
|
||||
* exact silent-pass failure class this whole effort exists to kill.
|
||||
*
|
||||
* The checks take an injectable `root` (codex #5), so we point the REAL guard
|
||||
* functions at a temp fixture dir broken three ways — not at a wrapper.
|
||||
*/
|
||||
|
||||
import { describe, test, expect, beforeAll, afterAll } from 'bun:test';
|
||||
import * as fs from 'fs';
|
||||
import * as os from 'os';
|
||||
import * as path from 'path';
|
||||
import type { CarveGuard } from './helpers/carve-guards';
|
||||
import { checkOrdering, checkCompleteness, discoverCarvedSkills } from './helpers/carve-guard-checks';
|
||||
|
||||
let root = '';
|
||||
|
||||
/**
 * Write a syntactically-valid carved skill fixture under the shared temp `root`.
 *
 * Creates `<root>/<skill>/sections/manifest.json`, `<root>/<skill>/sections/body.md`
 * and `<root>/<skill>/SKILL.md`. Each flag toggles one piece of the fixture:
 *
 * @param skill - Directory name of the fixture; also written into the manifest and
 *   the skeleton's H1.
 * @param opts.stop - Include the `> **STOP.**` Read directive in the skeleton.
 * @param opts.autoGen - Prefix the section file with the `<!-- AUTO-GENERATED -->` header.
 * @param opts.leakBody - Also write `MOVED_MARKER` into the skeleton (it is always
 *   present in the section body).
 */
function writeCarve(skill: string, opts: { stop: boolean; autoGen: boolean; leakBody: boolean }) {
  const { stop, autoGen, leakBody } = opts;
  const skillDir = path.join(root, skill);
  const sectionsDir = path.join(skillDir, 'sections');
  fs.mkdirSync(sectionsDir, { recursive: true });

  // Manifest: a single section, `body.md`.
  const manifest = {
    skill,
    sections: [{ id: 'body', file: 'body.md', title: 'Body', trigger: 'doing the work' }],
  };
  fs.writeFileSync(path.join(sectionsDir, 'manifest.json'), JSON.stringify(manifest));

  // Section body, optionally carrying the generated-file header.
  let sectionText = '## Heavy Body\nThe real work lives here. MOVED_MARKER.\n';
  if (autoGen) {
    sectionText = '<!-- AUTO-GENERATED -->\n' + sectionText;
  }
  fs.writeFileSync(path.join(sectionsDir, 'body.md'), sectionText);

  // Skeleton: fixed preamble, then the optional STOP line and leak, then the gate heading.
  const skeleton = [`# ${skill}\n## Step 0: Setup\nstays here\n## Section index\n| When | Read |\n`];
  if (stop) {
    skeleton.push('> **STOP.** Before doing the work, Read `sections/body.md` and execute it.\n');
  }
  if (leakBody) {
    skeleton.push('MOVED_MARKER\n');
  }
  skeleton.push('## EXIT PLAN MODE GATE\n');
  fs.writeFileSync(path.join(skillDir, 'SKILL.md'), skeleton.join(''));
}
|
||||
|
||||
/**
 * Build the registry-shaped guard for a fixture written by `writeCarve`.
 *
 * The values mirror what that fixture contains: one section file (`body.md`), the
 * `## Step 0: Setup` heading in the skeleton, `MOVED_MARKER` in the section body, and
 * the `EXIT PLAN MODE GATE` heading after the STOP line. The byte limits are set
 * wide open (999_999 max, 0 min) and `mustContain` is empty.
 *
 * @param skill - Fixture skill name to bind the guard to.
 * @returns A fresh `CarveGuard` object (new arrays on every call).
 */
const guardFor = (skill: string): CarveGuard => {
  const sectionFile = 'body.md';

  const fixtureGuard: CarveGuard = {
    skill,
    expectedSections: [sectionFile],
    requiredReads: [sectionFile],
    scenario: 'do the work',
    staticInvariants: {
      mustStayInSkeleton: ['## Step 0: Setup'],
      mustMoveToSection: ['MOVED_MARKER'],
      gateAfterStop: 'EXIT PLAN MODE GATE',
    },
    maxSkeletonBytes: 999_999,
    minUnionBytes: 0,
    mustContain: [],
  };

  return fixtureGuard;
};
|
||||
|
||||
beforeAll(() => {
|
||||
root = fs.mkdtempSync(path.join(os.tmpdir(), 'carve-neg-'));
|
||||
});
|
||||
afterAll(() => {
|
||||
fs.rmSync(root, { recursive: true, force: true });
|
||||
});
|
||||
|
||||
describe('guard-of-guards — the guards bite (gate, free)', () => {
  /**
   * Write the fixture carve `skill`, run `body` against it, then remove it.
   *
   * The removal sits in `finally` so a failing assertion cannot leave a stale carve
   * in the shared temp `root` for a later test's discovery to pick up.
   */
  const withCarve = (
    skill: string,
    opts: { stop: boolean; autoGen: boolean; leakBody: boolean },
    body: () => void,
  ): void => {
    try {
      writeCarve(skill, opts);
      body();
    } finally {
      fs.rmSync(path.join(root, skill), { recursive: true, force: true });
    }
  };

  test('a well-formed fixture carve passes checkOrdering (control)', () => {
    withCarve('goodskill', { stop: true, autoGen: true, leakBody: false }, () => {
      expect(checkOrdering(root, guardFor('goodskill'))).toEqual([]);
    });
  });

  test('E2 fails when the STOP-Read directive is removed', () => {
    withCarve('nostopskill', { stop: false, autoGen: true, leakBody: false }, () => {
      const failures = checkOrdering(root, guardFor('nostopskill'));
      expect(failures.some((f) => f.includes('no STOP-Read directive'))).toBe(true);
    });
  });

  test('E2 fails when heavy body leaks back into the skeleton', () => {
    withCarve('leakskill', { stop: true, autoGen: true, leakBody: true }, () => {
      const failures = checkOrdering(root, guardFor('leakskill'));
      expect(failures.some((f) => f.includes('still in the skeleton'))).toBe(true);
    });
  });

  test('E2 fails when a section is hand-edited (no AUTO-GENERATED header)', () => {
    withCarve('handeditskill', { stop: true, autoGen: false, leakBody: false }, () => {
      const failures = checkOrdering(root, guardFor('handeditskill'));
      expect(failures.some((f) => f.includes('hand-edited'))).toBe(true);
    });
  });

  test('E1 fails when a skill is carved on disk but missing from the registry', () => {
    withCarve('unregisteredskill', { stop: true, autoGen: true, leakBody: false }, () => {
      // Discovery sees it...
      expect(discoverCarvedSkills(root)).toContain('unregisteredskill');
      // ...and completeness flags it as an unguarded carve.
      const failures = checkCompleteness(root);
      expect(failures.some((f) => f.includes('unregisteredskill') && f.includes('NOT in CARVE_GUARDS'))).toBe(true);
    });
  });
});
|
||||
@@ -0,0 +1,97 @@
|
||||
/**
|
||||
* T2 — data-driven behavioral section-loading guard (PERIODIC tier, paid, SDK capture).
|
||||
*
|
||||
* The behavioral proof that a REAL agent actually Reads each carved skill's
|
||||
* required sections at runtime — not just that the skeleton structure looks right
|
||||
* (that's E2, free, per-PR). One file iterating the canonical CARVE_GUARDS
|
||||
* registry (EQ2): registry membership IS the test, so "registered ⇒ asserted" is
|
||||
* structural — a carve can't be registered yet behaviorally unguarded.
|
||||
*
|
||||
* Per codex refined-plan pass:
|
||||
* #2 — ONE test() per skill, each with its own timeout + named failure output;
|
||||
* a hung claude -p fails only its skill, not the whole file.
|
||||
* #3 / D-CODEX(A) — GSTACK_CARVE_SKILL=<name> runs only that skill's case, so
|
||||
* the touchfile selector can scope cost to the changed skill; unset runs all.
|
||||
* #7 — each case drives the run with the registry's `scenario` (built to force
|
||||
* the STOP-Read path) and asserts the required sections were Read.
|
||||
*
|
||||
* 'external' skills (ship, plan-ceo-review) have bespoke fixtures (git state,
|
||||
* Step-0 mode loop) and keep their dedicated tests; E1 asserts those exist.
|
||||
*/
|
||||
|
||||
import { describe, test, expect } from 'bun:test';
|
||||
import { setupSkillDir, skillFromWorktree, captureSectionReads } from './helpers/auq-sdk-capture';
|
||||
import { CARVE_GUARDS } from './helpers/carve-guards';
|
||||
|
||||
const shouldRun = !!process.env.EVALS && process.env.EVALS_TIER === 'periodic';
|
||||
const describeE2E = shouldRun ? describe : describe.skip;
|
||||
const runId = `carve-section-loading-${process.env.EVALS_RUN_ID ?? 'local'}`;
|
||||
const only = process.env.GSTACK_CARVE_SKILL?.trim();
|
||||
|
||||
// A generic plan fixture for 'plan' behavioral skills (the review family).
|
||||
const PLAN_MD = [
|
||||
'# Plan: add an in-memory cache layer',
|
||||
'',
|
||||
'## Context',
|
||||
'Reads hit the DB on every request. Add a process-local LRU cache in front of the',
|
||||
'read path to cut DB load.',
|
||||
'',
|
||||
'## Approach',
|
||||
'- Wrap the read repository in a cache that stores the last 1000 keys.',
|
||||
'- Invalidate on write.',
|
||||
'',
|
||||
'## Out of scope',
|
||||
'Distributed cache, cross-process coherence.',
|
||||
'',
|
||||
].join('\n');
|
||||
|
||||
describeE2E('carve behavioral section-loading (periodic, SDK capture)', () => {
  const guards = Object.values(CARVE_GUARDS);

  // Silent-pass guard: a GSTACK_CARVE_SKILL value that matches no registry entry
  // filters out every case below, and the suite would report green having run
  // nothing. Naming an 'external' carve is still fine — it is registered, its
  // behavioral test just lives elsewhere.
  if (only) {
    test(`GSTACK_CARVE_SKILL="${only}" names a registered carve`, () => {
      expect(guards.map((g) => g.skill)).toContain(only);
    });
  }

  for (const guard of guards) {
    // 'external' carves keep their dedicated bespoke tests (E1 verifies those exist).
    if (guard.behavioral === 'external') continue;
    // Cost-scoped selection: when GSTACK_CARVE_SKILL is set, run only that skill.
    if (only && only !== guard.skill) continue;

    test(
      `${guard.skill}: a real run Reads ${guard.requiredReads.join(', ')}`,
      async () => {
        const { skillMd, sectionsFrom } = skillFromWorktree(guard.skill);
        // 'plan' skills get the generic PLAN.md fixture; others run with no fixtures.
        const fixtures = guard.behavioral === 'plan' ? { 'PLAN.md': PLAN_MD } : {};
        const planDir = setupSkillDir({
          skillName: guard.skill,
          skillMd,
          sectionsFrom,
          fixtures,
          tmpPrefix: `gstack-${guard.skill}-secload-`,
        });

        const { readSections, reportProduced, output } = await captureSectionReads({
          planDir,
          skillName: guard.skill,
          scenario: guard.scenario,
          reportMarker: /report|review|summary|design doc|handoff/i,
          testName: `${guard.skill} section-loading`,
          runId,
        });

        const missing = guard.requiredReads.filter((s) => !readSections.has(s));
        // Named failure output (codex #2): skill + expected + observed.
        expect({
          skill: guard.skill,
          reportProduced,
          expected: guard.requiredReads,
          observed: [...readSections],
          missing,
        }).toEqual({
          skill: guard.skill,
          reportProduced: true,
          expected: guard.requiredReads,
          observed: expect.any(Array),
          missing: [],
        });
        expect(output.trim().length).toBeGreaterThan(200);
      },
      // Per-test timeout (ms): a hung run fails only its own skill's case.
      360_000,
    );
  }
});
|
||||
@@ -0,0 +1,27 @@
|
||||
/**
|
||||
* E2 — carve static ordering guard (GATE tier, free, deterministic).
|
||||
*
|
||||
* The per-PR mechanical backstop for EVERY carved skill: it fails CI the moment a
|
||||
* regen drops/weakens a skeleton's STOP-Read directive, strands a section, leaks
|
||||
* heavy body back into the skeleton, or moves a post-STOP gate above the STOP.
|
||||
*
|
||||
* Data-driven from the canonical CARVE_GUARDS registry (EQ1) with per-skill
|
||||
* invariants (codex outside-voice #3 — NOT a copy of the ceo-specific test, which
|
||||
* this generalizes and retires). One test() per skill so a failure names the skill.
|
||||
*/
|
||||
|
||||
import { describe, test, expect } from 'bun:test';
|
||||
import * as path from 'path';
|
||||
import { CARVE_GUARDS } from './helpers/carve-guards';
|
||||
import { checkOrdering } from './helpers/carve-guard-checks';
|
||||
|
||||
const ROOT = path.resolve(import.meta.dir, '..');
|
||||
|
||||
describe('carve static ordering (gate, free)', () => {
|
||||
for (const guard of Object.values(CARVE_GUARDS)) {
|
||||
test(`${guard.skill}: skeleton routes to sections correctly`, () => {
|
||||
const failures = checkOrdering(ROOT, guard);
|
||||
expect({ skill: guard.skill, failures }).toEqual({ skill: guard.skill, failures: [] });
|
||||
});
|
||||
}
|
||||
});
|
||||
+74
-53
@@ -1,16 +1,22 @@
|
||||
/**
|
||||
* cso security-guidance preservation test (v1.45.0.0 T6).
|
||||
* cso security-guidance preservation test.
|
||||
*
|
||||
* The cso skill carries load-bearing security prose: OWASP Top 10 mappings,
|
||||
* STRIDE threat-model phrasing, "do not auto-fix without user approval"
|
||||
* gates. Codex 2nd-pass critique #9: "cso exemption too broad ... should
|
||||
* still get resolver dedup, catalog trim, sectioning if safe, and targeted
|
||||
* evals around must-not-miss checks."
|
||||
* cso carries load-bearing security prose: OWASP Top 10 mappings, STRIDE
|
||||
* threat-model phrasing, mode dispatch, and false-positive-filtering exceptions
|
||||
* that must NOT be auto-discarded.
|
||||
*
|
||||
* This test pins the must-not-miss checks. cso gets the same resolver gate
|
||||
* (T2), jargon dedup (T3), and catalog trim (T4) as every other skill — but
|
||||
* its security-guidance body content stays intact. Future compression work
|
||||
* that would strip this content fails CI here.
|
||||
* cso is now carved (skeleton SKILL.md + sections/audit-phases.md). The
|
||||
* scope-dependent audit phases (2-11) moved to the section; the mode dispatch
|
||||
* (## Arguments, ## Mode Resolution), the always-run phases (0, 1), and the
|
||||
* FP-filtering exceptions (Phase 12) stay always-loaded in the skeleton.
|
||||
*
|
||||
* Two distinct guarantees (codex outside-voice #5 — earliest-use, not loose
|
||||
* substrings):
|
||||
* 1. PRESERVATION — the security phrases survive somewhere in the union
|
||||
* (skeleton + sections); a carve relocates, it never drops.
|
||||
* 2. ALWAYS-LOADED CONTRACT — dispatch + FP-filtering directives stay in the
|
||||
* skeleton, and mode dispatch precedes any STOP-Read (a directive that
|
||||
* decides which sections to read can't sit behind the STOP that reads them).
|
||||
*/
|
||||
|
||||
import { describe, test, expect } from 'bun:test';
|
||||
@@ -18,69 +24,84 @@ import * as fs from 'fs';
|
||||
import * as path from 'path';
|
||||
|
||||
const REPO_ROOT = path.resolve(import.meta.dir, '..');
|
||||
const CSO_SKILL = path.join(REPO_ROOT, 'cso', 'SKILL.md');
|
||||
const CSO_DIR = path.join(REPO_ROOT, 'cso');
|
||||
const CSO_SKELETON = path.join(CSO_DIR, 'SKILL.md');
|
||||
|
||||
const MUST_PRESERVE_PHRASES = [
|
||||
// OWASP / STRIDE positioning
|
||||
'OWASP',
|
||||
'STRIDE',
|
||||
// Mode discipline
|
||||
'daily',
|
||||
'comprehensive',
|
||||
// Severity language
|
||||
'confidence',
|
||||
// Active verification requirement (codex critique: "active verification")
|
||||
'verif', // covers "verify", "verification", "verified"
|
||||
];
|
||||
/** Read the cso skeleton (`cso/SKILL.md`) as UTF-8 text. Throws if the file is missing. */
function readSkeleton(): string {
  const skeletonText = fs.readFileSync(CSO_SKELETON, { encoding: 'utf-8' });
  return skeletonText;
}
|
||||
/**
 * Read the skeleton plus every generated section as one string.
 *
 * Starts with the skeleton text, then appends each `cso/sections/*.md` file in
 * sorted filename order, each preceded by a newline. Files ending in `.md.tmpl`
 * are excluded. A missing `sections/` directory yields the skeleton alone.
 */
function readUnion(): string {
  const parts = [readSkeleton()];
  const sectionsDir = path.join(CSO_DIR, 'sections');

  if (!fs.existsSync(sectionsDir)) {
    return parts[0];
  }

  const names = fs.readdirSync(sectionsDir).sort();
  for (const name of names) {
    const isSectionMd = name.endsWith('.md') && !name.endsWith('.md.tmpl');
    if (!isSectionMd) continue;
    parts.push(fs.readFileSync(path.join(sectionsDir, name), 'utf-8'));
  }

  return parts.join('\n');
}
|
||||
|
||||
const MUST_PRESERVE_HEADINGS = [
|
||||
'## Preamble', // from PREAMBLE resolver
|
||||
];
|
||||
// Security content that must survive the carve (checked against the UNION).
|
||||
const MUST_PRESERVE_PHRASES = ['OWASP', 'STRIDE', 'daily', 'comprehensive', 'confidence', 'verif'];
|
||||
|
||||
describe('cso skill preserves load-bearing security guidance', () => {
|
||||
test('cso/SKILL.md exists and is non-trivial', () => {
|
||||
expect(fs.existsSync(CSO_SKILL)).toBe(true);
|
||||
const content = fs.readFileSync(CSO_SKILL, 'utf-8');
|
||||
// cso is a content-heavy security skill; under 30 KB suggests stripping went too far.
|
||||
expect(content.length).toBeGreaterThan(30_000);
|
||||
test('cso skeleton exists and is non-trivial', () => {
|
||||
expect(fs.existsSync(CSO_SKELETON)).toBe(true);
|
||||
// Skeleton stays substantial: dispatch + always-run phases + FP filtering +
|
||||
// report phases are all always-loaded. Under 30 KB means too much moved out.
|
||||
expect(readSkeleton().length).toBeGreaterThan(30_000);
|
||||
});
|
||||
|
||||
test('cso preserves required security phrases (case-insensitive)', () => {
|
||||
const content = fs.readFileSync(CSO_SKILL, 'utf-8').toLowerCase();
|
||||
const missing: string[] = [];
|
||||
for (const phrase of MUST_PRESERVE_PHRASES) {
|
||||
if (!content.includes(phrase.toLowerCase())) missing.push(phrase);
|
||||
}
|
||||
test('security phrases survive in the union (skeleton + sections)', () => {
|
||||
const union = readUnion().toLowerCase();
|
||||
const missing = MUST_PRESERVE_PHRASES.filter((p) => !union.includes(p.toLowerCase()));
|
||||
if (missing.length > 0) {
|
||||
throw new Error(
|
||||
`cso/SKILL.md is missing required security phrases: ${missing.join(', ')}. ` +
|
||||
`These are load-bearing for the skill's audit posture. If you intentionally ` +
|
||||
`removed them, update this test with the new phrasing.`,
|
||||
`cso union is missing required security phrases: ${missing.join(', ')}. ` +
|
||||
`These are load-bearing. A carve relocates them; it must not drop them.`,
|
||||
);
|
||||
}
|
||||
});
|
||||
|
||||
test('cso preserves required headings', () => {
|
||||
const content = fs.readFileSync(CSO_SKILL, 'utf-8');
|
||||
for (const heading of MUST_PRESERVE_HEADINGS) {
|
||||
expect(content).toContain(heading);
|
||||
test('ALWAYS-LOADED: mode dispatch + FP-filtering stay in the skeleton', () => {
|
||||
const skeleton = readSkeleton();
|
||||
// Dispatch must be always-loaded — the agent resolves scope before reading sections.
|
||||
expect(skeleton).toContain('## Arguments');
|
||||
expect(skeleton).toContain('## Mode Resolution');
|
||||
// FP-filtering with its critical exceptions is mandatory and must not be on-demand.
|
||||
expect(skeleton).toContain('Phase 12');
|
||||
// The "SKILL.md files are NOT documentation" exception is a must-not-miss
|
||||
// security directive (skill supply-chain findings); it stays always-loaded.
|
||||
expect(skeleton).toContain('NOT documentation');
|
||||
});
|
||||
|
||||
test('EARLIEST-USE: mode dispatch precedes any STOP-Read directive (codex #6)', () => {
|
||||
const skeleton = readSkeleton();
|
||||
const stop = skeleton.indexOf('> **STOP.**');
|
||||
const modeRes = skeleton.indexOf('## Mode Resolution');
|
||||
const args = skeleton.indexOf('## Arguments');
|
||||
expect(modeRes).toBeGreaterThan(-1);
|
||||
expect(args).toBeGreaterThan(-1);
|
||||
if (stop >= 0) {
|
||||
// A dispatch directive stranded after the STOP can't govern which sections to read.
|
||||
expect(args).toBeLessThan(stop);
|
||||
expect(modeRes).toBeLessThan(stop);
|
||||
}
|
||||
});
|
||||
|
||||
test('cso catalog trim landed (frontmatter description ≤ 200 chars)', () => {
|
||||
const content = fs.readFileSync(CSO_SKILL, 'utf-8');
|
||||
const content = readSkeleton();
|
||||
const fmMatch = content.match(/^---\n([\s\S]*?)\n---/);
|
||||
expect(fmMatch).not.toBeNull();
|
||||
const fm = fmMatch![1];
|
||||
const descMatch = fm.match(/^description:\s+(.+)$/m);
|
||||
expect(descMatch).not.toBeNull();
|
||||
const desc = descMatch![1].trim();
|
||||
expect(desc.length).toBeLessThanOrEqual(200);
|
||||
expect(desc).toContain('(gstack)');
|
||||
const desc = fmMatch![1].match(/^description:\s+(.+)$/m);
|
||||
expect(desc).not.toBeNull();
|
||||
expect(desc![1].trim().length).toBeLessThanOrEqual(200);
|
||||
expect(desc![1]).toContain('(gstack)');
|
||||
});
|
||||
|
||||
test('cso routing prose moved to "## When to invoke" body section', () => {
|
||||
const content = fs.readFileSync(CSO_SKILL, 'utf-8');
|
||||
expect(content).toContain('## When to invoke this skill');
|
||||
expect(readSkeleton()).toContain('## When to invoke this skill');
|
||||
});
|
||||
});
|
||||
|
||||
@@ -14,7 +14,20 @@ import { HOST_PATHS } from "../scripts/resolvers/types";
|
||||
import { PATTERNS } from "../lib/redact-patterns";
|
||||
|
||||
const ROOT = path.resolve(import.meta.dir, "..");
|
||||
const CSO = fs.readFileSync(path.join(ROOT, "cso", "SKILL.md"), "utf-8");
|
||||
// cso is carved (skeleton + sections/audit-phases.md). The Secrets Archaeology
|
||||
// prose + secret prefixes moved into the section; check the union so relocated
|
||||
// content still counts.
|
||||
/**
 * Return a skill's generated `SKILL.md` concatenated with every `sections/*.md`
 * file (sorted by filename, newline-separated; `.md.tmpl` files excluded).
 * A skill without a `sections/` directory yields `SKILL.md` alone.
 *
 * @param skill - Skill directory name under ROOT.
 */
function unionSkill(skill: string): string {
  const pieces = [fs.readFileSync(path.join(ROOT, skill, "SKILL.md"), "utf-8")];
  const sectionsDir = path.join(ROOT, skill, "sections");

  if (!fs.existsSync(sectionsDir)) {
    return pieces[0];
  }

  const sectionNames = fs
    .readdirSync(sectionsDir)
    .sort()
    .filter((name) => name.endsWith(".md") && !name.endsWith(".md.tmpl"));
  for (const name of sectionNames) {
    pieces.push(fs.readFileSync(path.join(sectionsDir, name), "utf-8"));
  }

  return pieces.join("\n");
}
|
||||
const CSO = unionSkill("cso");
|
||||
const ctx = { skillName: "cso", tmplPath: "", host: "claude" as const, paths: HOST_PATHS["claude"] };
|
||||
|
||||
describe("cso/spec taxonomy alignment", () => {
|
||||
|
||||
@@ -6,7 +6,21 @@ import * as fs from "fs";
|
||||
import * as path from "path";
|
||||
|
||||
const ROOT = path.resolve(import.meta.dir, "..");
|
||||
const RELEASE = fs.readFileSync(path.join(ROOT, "document-release", "SKILL.md.tmpl"), "utf-8");
|
||||
// document-release is carved (skeleton + sections/release-body.md). Step 9
|
||||
// (commit + PR-body redaction scan) moved into the section template; check the
|
||||
// union of SKILL.md.tmpl + sections/*.md.tmpl so the scan-before-edit ordering
|
||||
// still verifies. document-generate is NOT carved (plain .md.tmpl).
|
||||
/**
 * Return a skill's `SKILL.md.tmpl` concatenated with every `sections/*.md.tmpl`
 * file (sorted by filename, newline-separated). A skill without a `sections/`
 * directory yields the skeleton template alone.
 *
 * @param skill - Skill directory name under ROOT.
 */
function unionTmpl(skill: string): string {
  const pieces = [fs.readFileSync(path.join(ROOT, skill, "SKILL.md.tmpl"), "utf-8")];
  const sectionsDir = path.join(ROOT, skill, "sections");

  if (!fs.existsSync(sectionsDir)) {
    return pieces[0];
  }

  const templateNames = fs
    .readdirSync(sectionsDir)
    .sort()
    .filter((name) => name.endsWith(".md.tmpl"));
  for (const name of templateNames) {
    pieces.push(fs.readFileSync(path.join(sectionsDir, name), "utf-8"));
  }

  return pieces.join("\n");
}
|
||||
const RELEASE = unionTmpl("document-release");
|
||||
const GENERATE = fs.readFileSync(path.join(ROOT, "document-generate", "SKILL.md.tmpl"), "utf-8");
|
||||
|
||||
describe("/document-release redaction", () => {
|
||||
|
||||
+27
-4
@@ -50,6 +50,9 @@ echo "SKILL_PREFIX: $_SKILL_PREFIX"
|
||||
source <(~/.claude/skills/gstack/bin/gstack-repo-mode 2>/dev/null) || true
|
||||
REPO_MODE=${REPO_MODE:-unknown}
|
||||
echo "REPO_MODE: $REPO_MODE"
|
||||
_SESSION_KIND=$(~/.claude/skills/gstack/bin/gstack-session-kind 2>/dev/null || echo "interactive")
|
||||
case "$_SESSION_KIND" in spawned|headless|interactive) ;; *) _SESSION_KIND="interactive" ;; esac
|
||||
echo "SESSION_KIND: $_SESSION_KIND"
|
||||
_LAKE_SEEN=$([ -f ~/.gstack/.completeness-intro-seen ] && echo "yes" || echo "no")
|
||||
echo "LAKE_INTRO: $_LAKE_SEEN"
|
||||
_TEL=$(~/.claude/skills/gstack/bin/gstack-config get telemetry 2>/dev/null || true)
|
||||
@@ -129,7 +132,7 @@ In plan mode, allowed because they inform the plan: `$B`, `$D`, `codex exec`/`co
|
||||
|
||||
## Skill Invocation During Plan Mode
|
||||
|
||||
If the user invokes a skill in plan mode, the skill takes precedence over generic plan mode behavior. **Treat the skill file as executable instructions, not reference.** Follow it step by step starting from Step 0; the first AskUserQuestion is the workflow entering plan mode, not a violation of it. AskUserQuestion (any variant — `mcp__*__AskUserQuestion` or native; see "AskUserQuestion Format → Tool resolution") satisfies plan mode's end-of-turn requirement. If no variant is callable, the skill is BLOCKED — stop and report `BLOCKED — AskUserQuestion unavailable` per the AskUserQuestion Format rule. At a STOP point, stop immediately. Do not continue the workflow or call ExitPlanMode there. Commands marked "PLAN MODE EXCEPTION — ALWAYS RUN" execute. Call ExitPlanMode only after the skill workflow completes, or if the user tells you to cancel the skill or leave plan mode.
|
||||
If the user invokes a skill in plan mode, the skill takes precedence over generic plan mode behavior. **Treat the skill file as executable instructions, not reference.** Follow it step by step starting from Step 0; the first AskUserQuestion is the workflow entering plan mode, not a violation of it. AskUserQuestion (any variant — `mcp__*__AskUserQuestion` or native; see "AskUserQuestion Format → Tool resolution") satisfies plan mode's end-of-turn requirement. If AskUserQuestion is unavailable or a call fails, follow the AskUserQuestion Format failure fallback: `headless` → BLOCKED; `interactive` → the prose fallback (also satisfies end-of-turn). At a STOP point, stop immediately. Do not continue the workflow or call ExitPlanMode there. Commands marked "PLAN MODE EXCEPTION — ALWAYS RUN" execute. Call ExitPlanMode only after the skill workflow completes, or if the user tells you to cancel the skill or leave plan mode.
|
||||
|
||||
If `PROACTIVE` is `"false"`, do not auto-invoke or proactively suggest skills. If a skill seems useful, ask: "I think /skillname might help here — want me to run it?"
|
||||
|
||||
@@ -301,11 +304,31 @@ AI orchestrator (e.g., OpenClaw). In spawned sessions:
|
||||
|
||||
**Rule:** if any `mcp__*__AskUserQuestion` variant is in your tool list, prefer it. Hosts may disable native AUQ via `--disallowedTools AskUserQuestion` (Conductor does, by default) and route through their MCP variant; calling native there silently fails. Same questions/options shape; same decision-brief format applies.
|
||||
|
||||
**If no AskUserQuestion variant appears in your tool list, this skill is BLOCKED.** Stop, report `BLOCKED — AskUserQuestion unavailable`, and wait for the user. Do not write decisions to the plan file as a substitute, do not emit them as prose and stop, and do not silently auto-decide (only `/plan-tune` AUTO_DECIDE opt-ins authorize auto-picking).
|
||||
If AskUserQuestion is unavailable (no variant in your tool list) OR a call to it fails, do NOT silently auto-decide or write the decision to the plan file as a substitute. Follow the **failure fallback** below.
|
||||
|
||||
### When AskUserQuestion is unavailable or a call fails
|
||||
|
||||
Tell three outcomes apart — a real answer (use it as the decision), plus the two special cases below:
|
||||
|
||||
1. **Auto-decide denial (NOT a failure).** The result contains `[plan-tune auto-decide] <id> → <option>` — the preference hook working as designed. Proceed with that option. Do NOT retry, do NOT fall back to prose.
|
||||
2. **Genuine failure** — no variant in your tool list, OR the variant is present but the call returns an error / missing result (MCP transport error, empty result, host bug — e.g. Conductor's MCP AskUserQuestion is flaky and returns `[Tool result missing due to internal error]`).
|
||||
- If it was present and **errored** (not absent), retry the SAME call **once** — but only if no answer could have surfaced (a missing-result error can arrive after the user already saw the question; retrying would double-prompt, so if it may have reached them, treat as pending, don't retry).
|
||||
- Then branch on `SESSION_KIND` (echoed by the preamble; empty/absent ⇒ `interactive`):
|
||||
- `spawned` → defer to the **Spawned session** block: auto-choose the recommended option. Never prose, never BLOCKED.
|
||||
- `headless` → `BLOCKED — AskUserQuestion unavailable`; stop and wait (no human can answer).
|
||||
- `interactive` → **prose fallback** (below).
|
||||
|
||||
**Prose fallback — render the decision brief as a markdown message, not a tool call.** Same information as the tool format below, different structure (paragraphs, not ✅/❌ bullets). It MUST surface this triad:
|
||||
|
||||
1. **A clear ELI10 of the issue itself** — plain English on what's being decided and why it matters (the question, not per-choice), naming the stakes. Lead with it.
|
||||
2. **Completeness scores per choice** — explicit `Completeness: X/10` on EACH choice (10 complete, 7 happy-path, 3 shortcut); use the kind-note when options differ in kind not coverage, but never silently drop the score.
|
||||
3. **The recommendation and why** — a `Recommendation: <choice> because <reason>` line plus the `(recommended)` marker on that choice.
|
||||
|
||||
Layout: a `D<N>` title + a one-line note that AskUserQuestion failed and to reply with a letter; the issue ELI10; the Recommendation line; then ONE paragraph per choice carrying its `(recommended)` marker, its `Completeness: X/10`, and 2-4 sentences of reasoning — never a bare bullet list; a closing `Net:` line. Split chains / 5+ options: one prose block per per-option call, in sequence. Then STOP and wait — the user's typed answer is the decision. In plan mode this satisfies end-of-turn like a tool call.
|
||||
|
||||
### Format
|
||||
|
||||
Every AskUserQuestion is a decision brief and must be sent as tool_use, not prose.
|
||||
Every AskUserQuestion is a decision brief and must be sent as tool_use, not prose — unless the documented failure fallback above applies (interactive session + the call is unavailable/erroring), in which case the prose fallback is the correct output.
|
||||
|
||||
```
|
||||
D<N> — <one-line question title>
|
||||
@@ -385,7 +408,7 @@ Before calling AskUserQuestion, verify:
|
||||
- [ ] (recommended) label on one option (even for neutral-posture)
|
||||
- [ ] Dual-scale effort labels on effort-bearing options (human / CC)
|
||||
- [ ] Net line closes the decision
|
||||
- [ ] You are calling the tool, not writing prose
|
||||
- [ ] You are calling the tool, not writing prose — unless the documented failure fallback applies (then: prose with the mandatory triad — issue ELI10, per-choice Completeness, Recommendation + `(recommended)` — and a "reply with a letter" instruction, then STOP)
|
||||
- [ ] Non-ASCII characters (CJK / accents) written directly, NOT \u-escaped
|
||||
- [ ] If you had 5+ options, you split (or batched into ≤4-groups) — did NOT drop any
|
||||
- [ ] If you split, you checked dependencies between options before firing the chain
|
||||
|
||||
+27
-4
@@ -36,6 +36,9 @@ echo "SKILL_PREFIX: $_SKILL_PREFIX"
|
||||
source <($GSTACK_BIN/gstack-repo-mode 2>/dev/null) || true
|
||||
REPO_MODE=${REPO_MODE:-unknown}
|
||||
echo "REPO_MODE: $REPO_MODE"
|
||||
_SESSION_KIND=$($GSTACK_BIN/gstack-session-kind 2>/dev/null || echo "interactive")
|
||||
case "$_SESSION_KIND" in spawned|headless|interactive) ;; *) _SESSION_KIND="interactive" ;; esac
|
||||
echo "SESSION_KIND: $_SESSION_KIND"
|
||||
_LAKE_SEEN=$([ -f ~/.gstack/.completeness-intro-seen ] && echo "yes" || echo "no")
|
||||
echo "LAKE_INTRO: $_LAKE_SEEN"
|
||||
_TEL=$($GSTACK_BIN/gstack-config get telemetry 2>/dev/null || true)
|
||||
@@ -115,7 +118,7 @@ In plan mode, allowed because they inform the plan: `$B`, `$D`, `codex exec`/`co
|
||||
|
||||
## Skill Invocation During Plan Mode
|
||||
|
||||
If the user invokes a skill in plan mode, the skill takes precedence over generic plan mode behavior. **Treat the skill file as executable instructions, not reference.** Follow it step by step starting from Step 0; the first AskUserQuestion is the workflow entering plan mode, not a violation of it. AskUserQuestion (any variant — `mcp__*__AskUserQuestion` or native; see "AskUserQuestion Format → Tool resolution") satisfies plan mode's end-of-turn requirement. If no variant is callable, the skill is BLOCKED — stop and report `BLOCKED — AskUserQuestion unavailable` per the AskUserQuestion Format rule. At a STOP point, stop immediately. Do not continue the workflow or call ExitPlanMode there. Commands marked "PLAN MODE EXCEPTION — ALWAYS RUN" execute. Call ExitPlanMode only after the skill workflow completes, or if the user tells you to cancel the skill or leave plan mode.
|
||||
If the user invokes a skill in plan mode, the skill takes precedence over generic plan mode behavior. **Treat the skill file as executable instructions, not reference.** Follow it step by step starting from Step 0; the first AskUserQuestion is the workflow entering plan mode, not a violation of it. AskUserQuestion (any variant — `mcp__*__AskUserQuestion` or native; see "AskUserQuestion Format → Tool resolution") satisfies plan mode's end-of-turn requirement. If AskUserQuestion is unavailable or a call fails, follow the AskUserQuestion Format failure fallback: `headless` → BLOCKED; `interactive` → the prose fallback (also satisfies end-of-turn). At a STOP point, stop immediately. Do not continue the workflow or call ExitPlanMode there. Commands marked "PLAN MODE EXCEPTION — ALWAYS RUN" execute. Call ExitPlanMode only after the skill workflow completes, or if the user tells you to cancel the skill or leave plan mode.
|
||||
|
||||
If `PROACTIVE` is `"false"`, do not auto-invoke or proactively suggest skills. If a skill seems useful, ask: "I think /skillname might help here — want me to run it?"
|
||||
|
||||
@@ -287,11 +290,31 @@ AI orchestrator (e.g., OpenClaw). In spawned sessions:
|
||||
|
||||
**Rule:** if any `mcp__*__AskUserQuestion` variant is in your tool list, prefer it. Hosts may disable native AUQ via `--disallowedTools AskUserQuestion` (Conductor does, by default) and route through their MCP variant; calling native there silently fails. Same questions/options shape; same decision-brief format applies.
|
||||
|
||||
**If no AskUserQuestion variant appears in your tool list, this skill is BLOCKED.** Stop, report `BLOCKED — AskUserQuestion unavailable`, and wait for the user. Do not write decisions to the plan file as a substitute, do not emit them as prose and stop, and do not silently auto-decide (only `/plan-tune` AUTO_DECIDE opt-ins authorize auto-picking).
|
||||
If AskUserQuestion is unavailable (no variant in your tool list) OR a call to it fails, do NOT silently auto-decide or write the decision to the plan file as a substitute. Follow the **failure fallback** below.
|
||||
|
||||
### When AskUserQuestion is unavailable or a call fails
|
||||
|
||||
Tell three outcomes apart — a real answer (use it as the decision), plus the two special cases below:
|
||||
|
||||
1. **Auto-decide denial (NOT a failure).** The result contains `[plan-tune auto-decide] <id> → <option>` — the preference hook working as designed. Proceed with that option. Do NOT retry, do NOT fall back to prose.
|
||||
2. **Genuine failure** — no variant in your tool list, OR the variant is present but the call returns an error / missing result (MCP transport error, empty result, host bug — e.g. Conductor's MCP AskUserQuestion is flaky and returns `[Tool result missing due to internal error]`).
|
||||
- If it was present and **errored** (not absent), retry the SAME call **once** — but only if no answer could have surfaced (a missing-result error can arrive after the user already saw the question; retrying would double-prompt, so if it may have reached them, treat as pending, don't retry).
|
||||
- Then branch on `SESSION_KIND` (echoed by the preamble; empty/absent ⇒ `interactive`):
|
||||
- `spawned` → defer to the **Spawned session** block: auto-choose the recommended option. Never prose, never BLOCKED.
|
||||
- `headless` → `BLOCKED — AskUserQuestion unavailable`; stop and wait (no human can answer).
|
||||
- `interactive` → **prose fallback** (below).
|
||||
|
||||
**Prose fallback — render the decision brief as a markdown message, not a tool call.** Same information as the tool format below, different structure (paragraphs, not ✅/❌ bullets). It MUST surface this triad:
|
||||
|
||||
1. **A clear ELI10 of the issue itself** — plain English on what's being decided and why it matters (the question, not per-choice), naming the stakes. Lead with it.
|
||||
2. **Completeness scores per choice** — explicit `Completeness: X/10` on EACH choice (10 complete, 7 happy-path, 3 shortcut); use the kind-note when options differ in kind not coverage, but never silently drop the score.
|
||||
3. **The recommendation and why** — a `Recommendation: <choice> because <reason>` line plus the `(recommended)` marker on that choice.
|
||||
|
||||
Layout: a `D<N>` title + a one-line note that AskUserQuestion failed and to reply with a letter; the issue ELI10; the Recommendation line; then ONE paragraph per choice carrying its `(recommended)` marker, its `Completeness: X/10`, and 2-4 sentences of reasoning — never a bare bullet list; a closing `Net:` line. Split chains / 5+ options: one prose block per per-option call, in sequence. Then STOP and wait — the user's typed answer is the decision. In plan mode this satisfies end-of-turn like a tool call.
|
||||
|
||||
### Format
|
||||
|
||||
Every AskUserQuestion is a decision brief and must be sent as tool_use, not prose.
|
||||
Every AskUserQuestion is a decision brief and must be sent as tool_use, not prose — unless the documented failure fallback above applies (interactive session + the call is unavailable/erroring), in which case the prose fallback is the correct output.
|
||||
|
||||
```
|
||||
D<N> — <one-line question title>
|
||||
@@ -371,7 +394,7 @@ Before calling AskUserQuestion, verify:
|
||||
- [ ] (recommended) label on one option (even for neutral-posture)
|
||||
- [ ] Dual-scale effort labels on effort-bearing options (human / CC)
|
||||
- [ ] Net line closes the decision
|
||||
- [ ] You are calling the tool, not writing prose
|
||||
- [ ] You are calling the tool, not writing prose — unless the documented failure fallback applies (then: prose with the mandatory triad — issue ELI10, per-choice Completeness, Recommendation + `(recommended)` — and a "reply with a letter" instruction, then STOP)
|
||||
- [ ] Non-ASCII characters (CJK / accents) written directly, NOT \u-escaped
|
||||
- [ ] If you had 5+ options, you split (or batched into ≤4-groups) — did NOT drop any
|
||||
- [ ] If you split, you checked dependencies between options before firing the chain
|
||||
|
||||
+27
-4
@@ -38,6 +38,9 @@ echo "SKILL_PREFIX: $_SKILL_PREFIX"
|
||||
source <($GSTACK_BIN/gstack-repo-mode 2>/dev/null) || true
|
||||
REPO_MODE=${REPO_MODE:-unknown}
|
||||
echo "REPO_MODE: $REPO_MODE"
|
||||
_SESSION_KIND=$($GSTACK_BIN/gstack-session-kind 2>/dev/null || echo "interactive")
|
||||
case "$_SESSION_KIND" in spawned|headless|interactive) ;; *) _SESSION_KIND="interactive" ;; esac
|
||||
echo "SESSION_KIND: $_SESSION_KIND"
|
||||
_LAKE_SEEN=$([ -f ~/.gstack/.completeness-intro-seen ] && echo "yes" || echo "no")
|
||||
echo "LAKE_INTRO: $_LAKE_SEEN"
|
||||
_TEL=$($GSTACK_BIN/gstack-config get telemetry 2>/dev/null || true)
|
||||
@@ -117,7 +120,7 @@ In plan mode, allowed because they inform the plan: `$B`, `$D`, `codex exec`/`co
|
||||
|
||||
## Skill Invocation During Plan Mode
|
||||
|
||||
If the user invokes a skill in plan mode, the skill takes precedence over generic plan mode behavior. **Treat the skill file as executable instructions, not reference.** Follow it step by step starting from Step 0; the first AskUserQuestion is the workflow entering plan mode, not a violation of it. AskUserQuestion (any variant — `mcp__*__AskUserQuestion` or native; see "AskUserQuestion Format → Tool resolution") satisfies plan mode's end-of-turn requirement. If no variant is callable, the skill is BLOCKED — stop and report `BLOCKED — AskUserQuestion unavailable` per the AskUserQuestion Format rule. At a STOP point, stop immediately. Do not continue the workflow or call ExitPlanMode there. Commands marked "PLAN MODE EXCEPTION — ALWAYS RUN" execute. Call ExitPlanMode only after the skill workflow completes, or if the user tells you to cancel the skill or leave plan mode.
|
||||
If the user invokes a skill in plan mode, the skill takes precedence over generic plan mode behavior. **Treat the skill file as executable instructions, not reference.** Follow it step by step starting from Step 0; the first AskUserQuestion is the workflow entering plan mode, not a violation of it. AskUserQuestion (any variant — `mcp__*__AskUserQuestion` or native; see "AskUserQuestion Format → Tool resolution") satisfies plan mode's end-of-turn requirement. If AskUserQuestion is unavailable or a call fails, follow the AskUserQuestion Format failure fallback: `headless` → BLOCKED; `interactive` → the prose fallback (also satisfies end-of-turn). At a STOP point, stop immediately. Do not continue the workflow or call ExitPlanMode there. Commands marked "PLAN MODE EXCEPTION — ALWAYS RUN" execute. Call ExitPlanMode only after the skill workflow completes, or if the user tells you to cancel the skill or leave plan mode.
|
||||
|
||||
If `PROACTIVE` is `"false"`, do not auto-invoke or proactively suggest skills. If a skill seems useful, ask: "I think /skillname might help here — want me to run it?"
|
||||
|
||||
@@ -289,11 +292,31 @@ AI orchestrator (e.g., OpenClaw). In spawned sessions:
|
||||
|
||||
**Rule:** if any `mcp__*__AskUserQuestion` variant is in your tool list, prefer it. Hosts may disable native AUQ via `--disallowedTools AskUserQuestion` (Conductor does, by default) and route through their MCP variant; calling native there silently fails. Same questions/options shape; same decision-brief format applies.
|
||||
|
||||
**If no AskUserQuestion variant appears in your tool list, this skill is BLOCKED.** Stop, report `BLOCKED — AskUserQuestion unavailable`, and wait for the user. Do not write decisions to the plan file as a substitute, do not emit them as prose and stop, and do not silently auto-decide (only `/plan-tune` AUTO_DECIDE opt-ins authorize auto-picking).
|
||||
If AskUserQuestion is unavailable (no variant in your tool list) OR a call to it fails, do NOT silently auto-decide or write the decision to the plan file as a substitute. Follow the **failure fallback** below.
|
||||
|
||||
### When AskUserQuestion is unavailable or a call fails
|
||||
|
||||
Tell three outcomes apart — a real answer (use it as the decision), plus the two special cases below:
|
||||
|
||||
1. **Auto-decide denial (NOT a failure).** The result contains `[plan-tune auto-decide] <id> → <option>` — the preference hook working as designed. Proceed with that option. Do NOT retry, do NOT fall back to prose.
|
||||
2. **Genuine failure** — no variant in your tool list, OR the variant is present but the call returns an error / missing result (MCP transport error, empty result, host bug — e.g. Conductor's MCP AskUserQuestion is flaky and returns `[Tool result missing due to internal error]`).
|
||||
- If it was present and **errored** (not absent), retry the SAME call **once** — but only if no answer could have surfaced (a missing-result error can arrive after the user already saw the question; retrying would double-prompt, so if it may have reached them, treat as pending, don't retry).
|
||||
- Then branch on `SESSION_KIND` (echoed by the preamble; empty/absent ⇒ `interactive`):
|
||||
- `spawned` → defer to the **Spawned session** block: auto-choose the recommended option. Never prose, never BLOCKED.
|
||||
- `headless` → `BLOCKED — AskUserQuestion unavailable`; stop and wait (no human can answer).
|
||||
- `interactive` → **prose fallback** (below).
|
||||
|
||||
**Prose fallback — render the decision brief as a markdown message, not a tool call.** Same information as the tool format below, different structure (paragraphs, not ✅/❌ bullets). It MUST surface this triad:
|
||||
|
||||
1. **A clear ELI10 of the issue itself** — plain English on what's being decided and why it matters (the question, not per-choice), naming the stakes. Lead with it.
|
||||
2. **Completeness scores per choice** — explicit `Completeness: X/10` on EACH choice (10 complete, 7 happy-path, 3 shortcut); use the kind-note when options differ in kind not coverage, but never silently drop the score.
|
||||
3. **The recommendation and why** — a `Recommendation: <choice> because <reason>` line plus the `(recommended)` marker on that choice.
|
||||
|
||||
Layout: a `D<N>` title + a one-line note that AskUserQuestion failed and to reply with a letter; the issue ELI10; the Recommendation line; then ONE paragraph per choice carrying its `(recommended)` marker, its `Completeness: X/10`, and 2-4 sentences of reasoning — never a bare bullet list; a closing `Net:` line. Split chains / 5+ options: one prose block per per-option call, in sequence. Then STOP and wait — the user's typed answer is the decision. In plan mode this satisfies end-of-turn like a tool call.
|
||||
|
||||
### Format
|
||||
|
||||
Every AskUserQuestion is a decision brief and must be sent as tool_use, not prose.
|
||||
Every AskUserQuestion is a decision brief and must be sent as tool_use, not prose — unless the documented failure fallback above applies (interactive session + the call is unavailable/erroring), in which case the prose fallback is the correct output.
|
||||
|
||||
```
|
||||
D<N> — <one-line question title>
|
||||
@@ -373,7 +396,7 @@ Before calling AskUserQuestion, verify:
|
||||
- [ ] (recommended) label on one option (even for neutral-posture)
|
||||
- [ ] Dual-scale effort labels on effort-bearing options (human / CC)
|
||||
- [ ] Net line closes the decision
|
||||
- [ ] You are calling the tool, not writing prose
|
||||
- [ ] You are calling the tool, not writing prose — unless the documented failure fallback applies (then: prose with the mandatory triad — issue ELI10, per-choice Completeness, Recommendation + `(recommended)` — and a "reply with a letter" instruction, then STOP)
|
||||
- [ ] Non-ASCII characters (CJK / accents) written directly, NOT \u-escaped
|
||||
- [ ] If you had 5+ options, you split (or batched into ≤4-groups) — did NOT drop any
|
||||
- [ ] If you split, you checked dependencies between options before firing the chain
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
import { describe, test, expect } from 'bun:test';
|
||||
import { describe, test, expect, beforeAll } from 'bun:test';
|
||||
import { COMMAND_DESCRIPTIONS } from '../browse/src/commands';
|
||||
import { SNAPSHOT_FLAGS } from '../browse/src/snapshot';
|
||||
import * as fs from 'fs';
|
||||
@@ -2125,6 +2125,21 @@ describe('Factory generation (--host factory)', () => {
|
||||
import { ALL_HOST_CONFIGS, getExternalHosts } from '../hosts/index';
|
||||
|
||||
describe('Parameterized host smoke tests', () => {
|
||||
// Bring every external host's generated output up to date before any of the
// per-host `--dry-run` freshness assertions run. Those host dirs
// (.agents/.factory/.cursor/...) are gitignored, regenerated artifacts, so the
// freshness check is effectively an idempotency/determinism check: it still
// catches non-deterministic generation, but it no longer flakes on stale
// on-disk state left behind when no `gen --host all` prestep was run (the
// canonical `bun test` does not run one). The tracked-claude freshness test
// (`generated files are fresh`) runs earlier and is unaffected.
beforeAll(() => {
  for (const host of getExternalHosts()) {
    const regenCommand = ['bun', 'run', 'scripts/gen-skill-docs.ts', '--host', host.name];
    // Output is captured (piped) and the result is intentionally not inspected.
    Bun.spawnSync(regenCommand, {
      cwd: ROOT,
      stdout: 'pipe',
      stderr: 'pipe',
    });
  }
});
|
||||
|
||||
for (const hostConfig of getExternalHosts()) {
|
||||
describe(`${hostConfig.displayName} (--host ${hostConfig.name})`, () => {
|
||||
const hostDir = path.join(ROOT, hostConfig.hostSubdir, 'skills');
|
||||
@@ -2208,6 +2223,16 @@ describe('Parameterized host smoke tests', () => {
|
||||
// ─── --host all tests ────────────────────────────────────────
|
||||
|
||||
describe('--host all', () => {
|
||||
// Determinism guard matching the parameterized host block: regenerate each
// external host first so that `--host all --dry-run` reports FRESH no matter
// what state a previous run left on disk.
beforeAll(() => {
  for (const host of getExternalHosts()) {
    const regenCommand = ['bun', 'run', 'scripts/gen-skill-docs.ts', '--host', host.name];
    // Output is captured (piped) and the result is intentionally not inspected.
    Bun.spawnSync(regenCommand, {
      cwd: ROOT,
      stdout: 'pipe',
      stderr: 'pipe',
    });
  }
});
|
||||
|
||||
test('--host all generates for all registered hosts', () => {
|
||||
const result = Bun.spawnSync(['bun', 'run', 'scripts/gen-skill-docs.ts', '--host', 'all', '--dry-run'], {
|
||||
cwd: ROOT, stdout: 'pipe', stderr: 'pipe',
|
||||
|
||||
@@ -0,0 +1,70 @@
|
||||
/**
|
||||
* gstack-session-kind — classifies the session so skills know whether a human can
|
||||
* answer an AskUserQuestion. Drives the AUQ-failure fallback branch:
|
||||
* spawned → auto-choose (orchestrator)
|
||||
* headless → BLOCK on AUQ failure
|
||||
* interactive → prose fallback on AUQ failure
|
||||
*
|
||||
* These permutations are the contract the resolver rule depends on. Run with a
|
||||
* SCRUBBED env (the test process itself runs inside Conductor, so CONDUCTOR_* /
|
||||
* CLAUDE_CODE_* would leak in and contaminate the classification).
|
||||
*
|
||||
* Free, deterministic, gate-tier.
|
||||
*/
|
||||
import { describe, test, expect } from 'bun:test';
|
||||
import { execFileSync } from 'child_process';
|
||||
import * as path from 'path';
|
||||
|
||||
const BIN = path.resolve(__dirname, '..', 'bin', 'gstack-session-kind');
|
||||
|
||||
/**
 * Invoke the session-kind helper in a scrubbed environment.
 *
 * The child receives ONLY `PATH` (falling back to `/usr/bin:/bin` when the
 * parent has none, so bash still resolves) plus the caller-supplied variables;
 * on a key collision the supplied value wins.
 *
 * @param env Environment variables to expose to the helper.
 * @returns The helper's stdout with surrounding whitespace trimmed.
 */
function kind(env: Record<string, string>): string {
  const scrubbedEnv = { PATH: process.env.PATH ?? '/usr/bin:/bin', ...env };
  const stdout = execFileSync(BIN, [], { env: scrubbedEnv, encoding: 'utf-8' });
  return stdout.trim();
}
|
||||
|
||||
// Classification contract for the session-kind helper. Each case runs the
// helper with a scrubbed env (see `kind`) and pins the resulting label.
describe('gstack-session-kind', () => {
  test('OPENCLAW_SESSION → spawned (highest precedence)', () => {
    const alone = kind({ OPENCLAW_SESSION: '1' });
    expect(alone).toBe('spawned');

    // The orchestrator marker must still win when headless and Conductor
    // markers are present at the same time.
    const withOtherMarkers = kind({
      OPENCLAW_SESSION: '1',
      GSTACK_HEADLESS: '1',
      CONDUCTOR_PORT: '5',
    });
    expect(withOtherMarkers).toBe('spawned');
  });

  test('GSTACK_HEADLESS → headless', () => {
    const result = kind({ GSTACK_HEADLESS: '1' });
    expect(result).toBe('headless');
  });

  test('CONDUCTOR_* → interactive (a human host is present)', () => {
    const conductorEnvs: Record<string, string>[] = [
      { CONDUCTOR_WORKSPACE_PATH: '/tmp/ws' },
      { CONDUCTOR_PORT: '55010' },
    ];
    for (const env of conductorEnvs) {
      expect(kind(env)).toBe('interactive');
    }
  });

  test('CLAUDE_CODE_ENTRYPOINT=cli → interactive', () => {
    const result = kind({ CLAUDE_CODE_ENTRYPOINT: 'cli' });
    expect(result).toBe('interactive');
  });

  test('interactive host beats CI markers', () => {
    const result = kind({ CONDUCTOR_PORT: '5', CI: '1' });
    expect(result).toBe('interactive');
  });

  test('CI / GITHUB_ACTIONS with no host → headless', () => {
    const ciEnvs: Record<string, string>[] = [{ CI: '1' }, { GITHUB_ACTIONS: 'true' }];
    for (const env of ciEnvs) {
      expect(kind(env)).toBe('headless');
    }
  });

  test('GSTACK_HEADLESS beats CONDUCTOR (explicit override wins)', () => {
    const result = kind({ GSTACK_HEADLESS: '1', CONDUCTOR_PORT: '5' });
    expect(result).toBe('headless');
  });

  test('bare env → interactive (degrade-safe default)', () => {
    const result = kind({});
    expect(result).toBe('interactive');
  });

  test('empty GSTACK_HEADLESS is treated as unset (interactive)', () => {
    // The resolver/helper guard on -n, so an empty value must NOT be read as
    // headless. Harness suites rely on this as the opt-out that lets them
    // exercise the interactive branch.
    const result = kind({ GSTACK_HEADLESS: '' });
    expect(result).toBe('interactive');
  });
});
|
||||
@@ -300,6 +300,13 @@ export async function runAgentSdkTest(
|
||||
const queryImpl: QueryProvider = opts.queryProvider ?? query;
|
||||
const model = opts.model ?? 'claude-opus-4-7';
|
||||
|
||||
// NOTE on GSTACK_HEADLESS: the SDK child inherits process.env, so headless
|
||||
// classification for eval/E2E runs is set by the `test:gate` / `test:evals`
|
||||
// package.json scripts (scoped to that invocation), NOT mutated here. We must not
|
||||
// pass sdkOpts.env (it breaks the SDK auth pipeline — see CLAUDE.md) and must not
|
||||
// mutate process.env ambiently (it would leak headless into later interactive-path
|
||||
// tests in the same Bun process — Codex review finding).
|
||||
|
||||
let attempt = 0;
|
||||
let lastErr: unknown = null;
|
||||
|
||||
|
||||
@@ -0,0 +1,177 @@
|
||||
/**
|
||||
* Pure carve-guard check functions, with an injectable `root` (codex
|
||||
* outside-voice #5, refined-plan pass) so the negative tests (T5) can point the
|
||||
* REAL guards at a broken fixture dir instead of testing a wrapper.
|
||||
*
|
||||
* Used by:
|
||||
* - test/carve-section-ordering.test.ts (E2) → checkOrdering
|
||||
* - test/carve-guard-completeness.test.ts (E1) → discoverCarvedSkills + checkCompleteness
|
||||
* - test/carve-guards-negative.test.ts (T5) → both, against a fixture root
|
||||
*
|
||||
* Imports only the leaf data module (carve-guards.ts) + node stdlib — no cycle.
|
||||
*/
|
||||
|
||||
import * as fs from 'fs';
|
||||
import * as path from 'path';
|
||||
import { CARVE_GUARDS, type CarveGuard } from './carve-guards';
|
||||
|
||||
/**
 * List the carved skills beneath `root`.
 *
 * A carved skill is an immediate subdirectory of `root` that contains a
 * `sections/manifest.json`. `root` is a parameter so tests can aim this at a
 * fixture tree instead of the real repo.
 *
 * @param root Directory whose immediate children are scanned.
 * @returns Matching directory names, sorted with the default `Array.prototype.sort` order.
 */
export function discoverCarvedSkills(root: string): string[] {
  const carved: string[] = [];
  for (const entry of fs.readdirSync(root, { withFileTypes: true })) {
    if (!entry.isDirectory()) continue;
    const manifestPath = path.join(root, entry.name, 'sections', 'manifest.json');
    if (fs.existsSync(manifestPath)) {
      carved.push(entry.name);
    }
  }
  return carved.sort();
}
|
||||
|
||||
/**
 * Read a skill's skeleton file.
 *
 * @returns The UTF-8 text of `<root>/<skill>/SKILL.md`; the underlying
 *          `readFileSync` error propagates if the file cannot be read.
 */
function readSkeleton(root: string, skill: string): string {
  const skeletonPath = path.join(root, skill, 'SKILL.md');
  return fs.readFileSync(skeletonPath, { encoding: 'utf-8' });
}
|
||||
|
||||
/**
 * Concatenate a skill's skeleton (SKILL.md) with every `sections/*.md` file, so
 * that content relocated into a section still counts as present.
 *
 * Section files are appended in sorted filename order, each preceded by a
 * newline. A skill without a `sections` directory yields just the skeleton.
 *
 * @returns The combined text; read errors from the skeleton or any section
 *          file propagate to the caller.
 */
function readUnion(root: string, skill: string): string {
  const parts = [readSkeleton(root, skill)];
  const dir = path.join(root, skill, 'sections');
  if (fs.existsSync(dir)) {
    for (const f of fs.readdirSync(dir).sort()) {
      // `.md.tmpl` templates are excluded by this same test: a name that ends
      // in `.md` cannot also end in `.md.tmpl`, so no separate check is needed.
      if (f.endsWith('.md')) {
        parts.push(fs.readFileSync(path.join(dir, f), 'utf-8'));
      }
    }
  }
  return parts.join('\n');
}
|
||||
|
||||
const STOP = '> **STOP.**';
|
||||
|
||||
/**
 * Static ordering invariants for one carved skill.
 *
 * Pure with respect to `root`, so it runs identically against the real repo or
 * a fixture directory. Filesystem read errors (skeleton, section union, or an
 * individual section header) are reported as failure strings rather than
 * thrown, so callers always get a list back.
 *
 * @param root  Directory that contains the skill directories.
 * @param guard Registry entry describing the skill and its invariants.
 * @returns A list of failure strings; an empty list means every check passed.
 */
export function checkOrdering(root: string, guard: CarveGuard): string[] {
  const failures: string[] = [];

  let skeleton: string;
  try {
    skeleton = readSkeleton(root, guard.skill);
  } catch (err) {
    return [`cannot read ${guard.skill}/SKILL.md: ${(err as Error).message}`];
  }

  // The union read touches every sections/*.md file; report an unreadable one
  // as a failure, the same way an unreadable skeleton is reported above.
  let union: string;
  try {
    union = readUnion(root, guard.skill);
  } catch (err) {
    return [`cannot read ${guard.skill} skeleton+sections union: ${(err as Error).message}`];
  }

  // 1. The skeleton routes to sections via a Section index + STOP-Read directives.
  if (!skeleton.includes('## Section index')) {
    failures.push('skeleton is missing the "## Section index" table');
  }
  if (!skeleton.includes(STOP)) {
    failures.push('skeleton has no STOP-Read directive');
  }

  // 2. Every expected section is referenced by path AND generated (AUTO-GENERATED).
  for (const file of guard.expectedSections) {
    if (!skeleton.includes(`sections/${file}`)) {
      failures.push(`skeleton does not reference sections/${file}`);
    }
    const secPath = path.join(root, guard.skill, 'sections', file);
    if (!fs.existsSync(secPath)) {
      failures.push(`section file missing: sections/${file}`);
      continue;
    }
    // Only the first 200 characters are inspected for the generated-file marker.
    let header: string;
    try {
      header = fs.readFileSync(secPath, 'utf-8').slice(0, 200);
    } catch (err) {
      failures.push(`cannot read sections/${file}: ${(err as Error).message}`);
      continue;
    }
    if (!header.includes('AUTO-GENERATED')) {
      failures.push(`sections/${file} is hand-edited (no AUTO-GENERATED header)`);
    }
  }

  // 3. Pre-STOP anchors stay in the skeleton.
  for (const anchor of guard.staticInvariants.mustStayInSkeleton) {
    if (!skeleton.includes(anchor)) {
      failures.push(`mustStayInSkeleton anchor missing from skeleton: "${anchor}"`);
    }
  }

  // 3b. Earliest-use: dispatch directives must appear BEFORE the first STOP
  // (codex #6 — a directive that governs which sections to read can't sit after
  // the STOP that reads them). When the skeleton has no STOP at all, only the
  // anchor's presence is checked here; the missing STOP is reported in step 1.
  const firstStopIdx = skeleton.indexOf(STOP);
  for (const anchor of guard.staticInvariants.mustPrecedeStop ?? []) {
    const at = skeleton.indexOf(anchor);
    if (at < 0) {
      failures.push(`mustPrecedeStop anchor missing from skeleton: "${anchor}"`);
    } else if (firstStopIdx >= 0 && at > firstStopIdx) {
      failures.push(`mustPrecedeStop anchor "${anchor}" appears AFTER the STOP (stranded)`);
    }
  }

  // 4. Heavy body moved out of the skeleton but is preserved in the union.
  for (const moved of guard.staticInvariants.mustMoveToSection) {
    if (skeleton.includes(moved)) {
      failures.push(`mustMoveToSection marker is still in the skeleton: "${moved}"`);
    }
    if (!union.includes(moved)) {
      failures.push(`mustMoveToSection marker absent from the union (lost): "${moved}"`);
    }
  }

  // 5. The post-STOP gate fires after the last STOP (review skills).
  const gate = guard.staticInvariants.gateAfterStop;
  if (gate) {
    // Gate must fire after the LAST STOP (once all section work returns), not just
    // the first — for multi-STOP skeletons a gate between two STOPs is stranded.
    const lastStop = skeleton.lastIndexOf(STOP);
    const lastGate = skeleton.lastIndexOf(gate);
    if (lastGate < 0) {
      failures.push(`gateAfterStop marker missing from skeleton: "${gate}"`);
    } else if (lastStop >= 0 && lastGate < lastStop) {
      failures.push(`gateAfterStop "${gate}" appears before the last STOP (stranded above it)`);
    }
  }

  return failures;
}
|
||||
|
||||
/**
 * Completeness (E1): the set of skills carved on disk must equal the set of
 * skills registered in CARVE_GUARDS — in both directions — and every registry
 * entry must be internally consistent. Pure: the only input is `root`.
 *
 * Checks, in order:
 *   1. Every discovered carve has a registry entry (no unguarded carve).
 *   2. Every registry entry is carved on disk (no stale entry).
 *   3. Per entry: `skill` matches its registry key, `expectedSections` and
 *      `requiredReads` are non-empty, every required read is an expected
 *      section, and an 'external' behavioral guard names a test file that
 *      exists under `root`.
 *
 * @param root Repository root used for discovery and for resolving `externalTest`.
 * @returns Human-readable failure messages; an empty array means every check passed.
 */
export function checkCompleteness(root: string): string[] {
  const failures: string[] = [];
  const discovered = new Set(discoverCarvedSkills(root));
  const registered = new Set(Object.keys(CARVE_GUARDS));

  for (const skill of discovered) {
    if (!registered.has(skill)) {
      failures.push(`carved on disk but NOT in CARVE_GUARDS (unguarded carve): ${skill}`);
    }
  }
  for (const skill of registered) {
    if (!discovered.has(skill)) {
      failures.push(`in CARVE_GUARDS but not carved on disk (stale registry entry): ${skill}`);
    }
  }

  for (const [skill, g] of Object.entries(CARVE_GUARDS)) {
    // The carved-skill list is derived from registry KEYS while the parity
    // projection reads `g.skill`; a mismatch would silently guard the wrong skill.
    if (g.skill !== skill) {
      failures.push(`${skill}: guard.skill "${g.skill}" does not match its registry key`);
    }
    if (g.expectedSections.length === 0) {
      failures.push(`${skill}: expectedSections is empty`);
    }
    if (g.requiredReads.length === 0) {
      failures.push(`${skill}: requiredReads is empty (behavioral guard would be decorative)`);
    }
    for (const r of g.requiredReads) {
      if (!g.expectedSections.includes(r)) {
        failures.push(`${skill}: requiredRead "${r}" is not in expectedSections`);
      }
    }
    // Behavioral guard exists: 'plan'/'prompt' are covered structurally by the
    // data-driven loop (registry membership IS coverage); 'external' must name a
    // dedicated test file that actually exists on disk.
    if (g.behavioral === 'external') {
      if (!g.externalTest) {
        failures.push(`${skill}: behavioral 'external' but no externalTest path`);
      } else if (!fs.existsSync(path.join(root, g.externalTest))) {
        failures.push(`${skill}: externalTest missing on disk: ${g.externalTest}`);
      }
    }
  }

  return failures;
}
|
||||
@@ -0,0 +1,290 @@
|
||||
/**
|
||||
* Canonical carved-skill guard registry — the single source of truth for which
|
||||
* skills are carved (skeleton SKILL.md + on-demand sections/*.md) and what each
|
||||
* carve must guarantee.
|
||||
*
|
||||
* PURE LEAF DATA MODULE (codex outside-voice #1, refined-plan pass): this file
|
||||
* has NO runtime imports — `import type` only. parity-harness.ts and
|
||||
* skill-size-budget.test.ts derive their carved-skill lists FROM here (no
|
||||
* parallel hand-maintained lists), so a runtime import back into either of them
|
||||
* would create a cycle. Keep it data.
|
||||
*
|
||||
* Consumers:
|
||||
* - test/carve-section-ordering.test.ts (E2, gate) → staticInvariants
|
||||
* - test/carve-section-loading.test.ts (T2, periodic) → requiredReads + scenario
|
||||
* - test/carve-guard-completeness.test.ts (E1, gate) → the set must equal the
|
||||
* filesystem carved set
|
||||
* - test/carve-guards-negative.test.ts (ET1, gate) → injects a broken fixture
|
||||
* - test/helpers/parity-harness.ts → sectioned/maxSkeletonBytes/minBytes/mustContain
|
||||
* - test/skill-size-budget.test.ts → SECTIONS_EXTRACTED = CARVED_SKILLS
|
||||
*
|
||||
* Adding a carve = add one entry here (atomically, in the same commit as the
|
||||
* skeleton + manifest + sections — codex #4 — so E1's bidirectional parity never
|
||||
* false-positives mid-commit).
|
||||
*/
|
||||
|
||||
/**
 * Skeleton-shape invariants checked statically by the per-PR ordering guard (E2).
 * All values are plain substrings looked up in the skeleton and/or the
 * skeleton + sections union.
 */
export interface CarveStaticInvariants {
  /**
   * Anchors that have to remain in the always-loaded skeleton. An empty list
   * disables the check (the skill has no distinctive pre-STOP anchor worth
   * pinning beyond the universal STOP/section-index checks E2 already runs).
   */
  mustStayInSkeleton: string[];

  /**
   * Markers for the heavy body the carve relocated: each must be present in the
   * union (skeleton + sections) and absent from the skeleton. An empty list
   * disables the check.
   */
  mustMoveToSection: string[];

  /**
   * Anchors that have to appear in the skeleton BEFORE the first STOP-Read
   * (earliest-use, codex #6). For cso these are the mode-dispatch directives
   * (## Arguments, ## Mode Resolution): a dispatch directive stranded after the
   * STOP cannot govern which sections get read. Empty or undefined disables the
   * check, which is the case for most skills.
   */
  mustPrecedeStop?: string[];

  /**
   * When set, this marker has to appear in the skeleton AFTER the last STOP-Read
   * directive (e.g. the EXIT PLAN MODE GATE that fires once section work
   * returns). Undefined means the skill has no post-STOP gate
   * (operational/conversational carve).
   */
  gateAfterStop?: string;
}
|
||||
|
||||
/** One registry entry: everything the guards need to know about a single carved skill. */
export interface CarveGuard {
  /** Skill name. */
  skill: string;

  /** Section .md filenames the manifest lists and the skeleton must STOP-Read. */
  expectedSections: string[];

  /**
   * The sections the behavioral test (T2) asserts the agent actually Read when
   * driven by `scenario`: a non-empty subset of `expectedSections`, namely the
   * ones the scenario is built to require. Owning this in the registry makes
   * "registered ⇒ asserted" structural (codex #2) rather than policed.
   */
  requiredReads: string[];

  /**
   * Fixture prompt that steers a real `claude -p` run down this skill's
   * STOP-Read path (codex #7). The behavioral test asserts the run reached the
   * STOP (read `requiredReads`), not merely that nothing was read.
   */
  scenario: string;

  /** Static skeleton-shape invariants for this carve. */
  staticInvariants: CarveStaticInvariants;

  /**
   * How the behavioral guard (T2) exercises this skill:
   *   - 'plan'     → write a PLAN.md fixture, run the review against it
   *   - 'prompt'   → no fixture file; the scenario prompt alone drives the run
   *   - 'external' → covered by a dedicated bespoke test (complex fixtures, e.g.
   *                  ship's git/VERSION/CHANGELOG state). The data-driven loop
   *                  skips it; E1 asserts `externalTest` exists instead.
   */
  behavioral: 'plan' | 'prompt' | 'external';

  /** Repo-relative path to the dedicated test; required when `behavioral === 'external'`. */
  externalTest?: string;

  /** Parity: byte ceiling for the always-loaded skeleton (asserts the carve shrank it). */
  maxSkeletonBytes: number;

  /** Parity: byte floor for the skeleton+sections union (total behavior preserved). */
  minUnionBytes: number;

  /** Parity: content phrases the union must preserve. */
  mustContain: string[];

  /**
   * Parity: optional per-skill override of the union size-growth ceiling versus
   * the v1.53.0.0 baseline (default 1.05). Raised only when a deliberate
   * cross-cutting preamble feature legitimately grows a smaller carved skeleton
   * past 5%.
   */
  maxSizeRatio?: number;
}
|
||||
|
||||
/** Post-STOP gate marker used by every plan-*-review carve below. */
const EXIT_PLAN_MODE_GATE = 'EXIT PLAN MODE GATE';

/** Single section file used by every plan-*-review carve below. */
const REVIEW_SECTIONS_FILE = 'review-sections.md';

/** The registry itself: one entry per carved skill, keyed by skill name. */
export const CARVE_GUARDS: Record<string, CarveGuard> = {
  'ship': {
    skill: 'ship',
    expectedSections: [
      'tests.md',
      'test-coverage.md',
      'plan-completion.md',
      'review-army.md',
      'greptile.md',
      'adversarial.md',
      'changelog.md',
      'pr-body.md',
    ],
    requiredReads: ['review-army.md', 'changelog.md'],
    scenario:
      'This is a FRESH version-changing ship: the branch has a real code change, VERSION still equals the base version (needs a bump), and CHANGELOG.md needs a new entry. Follow the skill flow for a version-changing ship: run the pre-landing review and prepare the CHANGELOG entry. Produce the ship plan / review report. Do NOT actually commit, push, or open a PR.',
    staticInvariants: {
      // PR-title-version invariant: has to stay always-loaded. The v1.54.0.0
      // carve stranded it in pr-body.md and PRs started landing with bare titles
      // (CI backstop: test/pr-title-sync-workflow-safety.test.ts).
      mustStayInSkeleton: ['v$NEW_VERSION', 'gstack-pr-title-rewrite'],
      // The full create/update procedure stays carved into pr-body.md (absent
      // from the skeleton, present in the union). Pins BOTH PR paths: the create
      // path and the idempotent update path.
      mustMoveToSection: ['gh pr create --base', 'gh pr edit --title'],
      // Operational, multi-STOP skill (not a plan review): no single post-STOP gate.
      gateAfterStop: undefined,
    },
    behavioral: 'external',
    externalTest: 'test/skill-e2e-ship-section-loading.test.ts',
    maxSkeletonBytes: 90_000,
    minUnionBytes: 120_000,
    mustContain: ['VERSION', 'CHANGELOG', 'review', 'merge', 'PR'],
  },

  'plan-ceo-review': {
    skill: 'plan-ceo-review',
    expectedSections: [REVIEW_SECTIONS_FILE],
    requiredReads: [REVIEW_SECTIONS_FILE],
    scenario:
      'Review the plan in PLAN.md. Hold the current scope (HOLD SCOPE mode) — do not challenge or expand scope. Run the full CEO review and produce the review report.',
    staticInvariants: {
      mustStayInSkeleton: ['## Step 0: Nuclear Scope Challenge'],
      mustMoveToSection: ['### Section 1: Architecture Review', '## Mode Quick Reference'],
      gateAfterStop: EXIT_PLAN_MODE_GATE,
    },
    behavioral: 'external',
    externalTest: 'test/skill-e2e-plan-ceo-review-section-loading.test.ts',
    maxSkeletonBytes: 90_000,
    minUnionBytes: 80_000,
    mustContain: ['SCOPE EXPANSION', 'SELECTIVE EXPANSION', 'HOLD SCOPE', 'SCOPE REDUCTION'],
  },

  'plan-eng-review': {
    skill: 'plan-eng-review',
    expectedSections: [REVIEW_SECTIONS_FILE],
    requiredReads: [REVIEW_SECTIONS_FILE],
    scenario:
      'Review the plan in PLAN.md. Accept the current scope. Run the full engineering review (architecture, code quality, tests, performance) and produce the review report.',
    staticInvariants: {
      mustStayInSkeleton: ['### Step 0: Scope Challenge'],
      mustMoveToSection: ['### 1. Architecture review'],
      gateAfterStop: EXIT_PLAN_MODE_GATE,
    },
    behavioral: 'plan',
    maxSkeletonBytes: 62_000,
    minUnionBytes: 70_000,
    mustContain: ['Architecture', 'Code Quality', 'Test', 'Performance'],
  },

  'plan-design-review': {
    skill: 'plan-design-review',
    expectedSections: [REVIEW_SECTIONS_FILE],
    requiredReads: [REVIEW_SECTIONS_FILE],
    scenario:
      'Review the plan in PLAN.md for design and UX. Accept the current scope. Run the full design review passes and produce the review report.',
    staticInvariants: {
      mustStayInSkeleton: [],
      mustMoveToSection: ['### Pass 1: Information Architecture'],
      gateAfterStop: EXIT_PLAN_MODE_GATE,
    },
    behavioral: 'plan',
    maxSkeletonBytes: 82_000,
    minUnionBytes: 70_000,
    mustContain: ['design', 'visual'],
  },

  'plan-devex-review': {
    skill: 'plan-devex-review',
    expectedSections: [REVIEW_SECTIONS_FILE],
    requiredReads: [REVIEW_SECTIONS_FILE],
    scenario:
      'Review the plan in PLAN.md for developer experience. Accept the current scope. Run the full DX review passes and produce the review report.',
    staticInvariants: {
      mustStayInSkeleton: [],
      mustMoveToSection: ['### Pass 1: Getting Started Experience'],
      gateAfterStop: EXIT_PLAN_MODE_GATE,
    },
    behavioral: 'plan',
    maxSkeletonBytes: 76_000,
    minUnionBytes: 70_000,
    mustContain: ['developer experience', 'Getting Started'],
  },

  'office-hours': {
    skill: 'office-hours',
    expectedSections: ['design-and-handoff.md'],
    requiredReads: ['design-and-handoff.md'],
    scenario:
      'Run office hours for this product idea through to the end: have the diagnostic conversation, explore alternatives, then write the design doc and run the relationship handoff (Phases 5-6).',
    staticInvariants: {
      mustStayInSkeleton: [],
      mustMoveToSection: [],
      // Conversational skill: the design-doc/handoff section has no post-STOP
      // review gate in the skeleton.
      gateAfterStop: undefined,
    },
    behavioral: 'prompt',
    maxSkeletonBytes: 96_000,
    minUnionBytes: 70_000,
    mustContain: ['design doc', 'problem statement'],
  },

  'document-release': {
    skill: 'document-release',
    expectedSections: ['release-body.md'],
    requiredReads: ['release-body.md'],
    scenario:
      'A PR has shipped a new CLI flag and touched README.md and CHANGELOG.md. Skip the git pre-flight shell commands (assume the diff adds --new-flag and updates those two docs). Run the documentation workflow: build the coverage map, then audit the docs, apply updates, and polish the CHANGELOG voice. Produce the documentation health summary.',
    staticInvariants: {
      mustStayInSkeleton: ['## Step 1: Pre-flight', '## Step 1.5: Coverage Map'],
      mustMoveToSection: [
        '## Step 2: Per-File Documentation Audit',
        '## Step 5: CHANGELOG Voice Polish',
      ],
      // Operational skill (no plan-mode review gate).
      gateAfterStop: undefined,
    },
    behavioral: 'prompt',
    maxSkeletonBytes: 50_000,
    minUnionBytes: 55_000,
    mustContain: ['CHANGELOG', 'Diataxis', 'coverage'],
    // The AUQ-failure prose fallback (v1.57.2.0) adds ~2KB to every skill's
    // always-loaded preamble; on this small carved skeleton that lands at ~5.9%
    // over the pre-carve/pre-AUQ v1.53.0.0 baseline. This is headroom for that
    // cross-cutting addition; all other skills keep the strict 1.05 ceiling.
    maxSizeRatio: 1.08,
  },

  'design-consultation': {
    skill: 'design-consultation',
    expectedSections: ['proposal-and-preview.md'],
    requiredReads: ['proposal-and-preview.md'],
    scenario:
      'The user gave product context (a B2B analytics dashboard for ops teams) and declined the research phase. Skip browser/design tool setup. Proceed to build the complete design-system proposal, then write DESIGN.md. Produce the proposal and the DESIGN.md content.',
    staticInvariants: {
      mustStayInSkeleton: [
        '## Phase 0: Pre-checks',
        '## Phase 1: Product Context',
        '## Phase 2: Research',
      ],
      mustMoveToSection: ['## Phase 3: The Complete Proposal', '## Phase 6: Write DESIGN.md'],
      gateAfterStop: undefined,
    },
    behavioral: 'prompt',
    maxSkeletonBytes: 64_000,
    minUnionBytes: 72_000,
    mustContain: ['Typography', 'Color', 'Aesthetic Direction'],
  },

  'cso': {
    skill: 'cso',
    expectedSections: ['audit-phases.md'],
    requiredReads: ['audit-phases.md'],
    scenario:
      'Run a security audit on this repository in --owasp mode (OWASP Top 10 only). Resolve the mode, do the Phase 0 stack detection and Phase 1 attack-surface census, then run the scoped audit phases and produce the findings report. Skip any step that needs network access.',
    staticInvariants: {
      // Dispatch + always-run + FP-filtering phases are ALWAYS loaded (security).
      mustStayInSkeleton: [
        '## Arguments',
        '## Mode Resolution',
        '### Phase 0',
        '### Phase 1',
        '### Phase 12',
        '### Phase 13',
        '### Phase 14',
      ],
      // Earliest-use: mode must be resolvable before any section is read (codex #6).
      mustPrecedeStop: ['## Arguments', '## Mode Resolution'],
      // Scope-dependent audit detail moved to the section.
      mustMoveToSection: [
        '### Phase 2: Secrets Archaeology',
        '### Phase 9: OWASP Top 10 Assessment',
        '### Phase 10: STRIDE Threat Model',
      ],
      gateAfterStop: undefined,
    },
    behavioral: 'prompt',
    maxSkeletonBytes: 70_000,
    minUnionBytes: 72_000,
    mustContain: ['OWASP', 'STRIDE', 'daily', 'comprehensive', 'verif'],
  },
};
|
||||
|
||||
// Registry keys in default string sort order; frozen below before being exported.
const carvedSkillNames = Object.keys(CARVE_GUARDS);
carvedSkillNames.sort();

/**
 * Carved-skill names, sorted and frozen. Consumers derive their lists from this
 * export instead of maintaining parallel lists.
 */
export const CARVED_SKILLS: readonly string[] = Object.freeze(carvedSkillNames);
|
||||
@@ -22,6 +22,7 @@ import * as fs from 'fs';
|
||||
import * as path from 'path';
|
||||
import type { ParityBaseline, SkillBaselineEntry } from './capture-parity-baseline';
|
||||
import { captureBaseline } from './capture-parity-baseline';
|
||||
import { CARVE_GUARDS } from './carve-guards';
|
||||
|
||||
export interface ParityInvariant {
|
||||
skill: string;
|
||||
@@ -198,86 +199,13 @@ export function runParityChecks(opts: {
|
||||
* Each entry pins what must-not-break in a skill family. Extend as future
|
||||
* skills land. Phase B (v2.0.0.0) adds LLM-judge invariants on top of these.
|
||||
*/
|
||||
export const PARITY_INVARIANTS: ParityInvariant[] = [
|
||||
{
|
||||
skill: 'cso',
|
||||
mustContain: ['OWASP', 'STRIDE', 'daily', 'comprehensive', 'verif'],
|
||||
mustHaveHeadings: ['## Preamble', '## When to invoke'],
|
||||
maxSizeRatio: 1.05,
|
||||
minBytes: 30_000,
|
||||
},
|
||||
{
|
||||
// Carved (v2 plan T9): skeleton SKILL.md + sections/*.md. Content checks run
|
||||
// against the union (relocated phrases still count); size floors run against
|
||||
// the union (total behavior preserved); maxSkeletonBytes asserts the
|
||||
// always-loaded skeleton actually shrank from the ~167KB monolith.
|
||||
skill: 'ship',
|
||||
sectioned: true,
|
||||
maxSkeletonBytes: 90_000,
|
||||
mustContain: [
|
||||
'VERSION',
|
||||
'CHANGELOG',
|
||||
'review',
|
||||
'merge',
|
||||
'PR',
|
||||
],
|
||||
mustHaveHeadings: ['## Preamble', '## When to invoke'],
|
||||
maxSizeRatio: 1.05,
|
||||
minBytes: 120_000,
|
||||
},
|
||||
{
|
||||
// Carved (v2 plan T9): skeleton SKILL.md + sections/review-sections.md.
|
||||
// Content + size floors run against the union (relocated prose still counts);
|
||||
// maxSkeletonBytes asserts the always-loaded skeleton shrank from the ~138KB
|
||||
// monolith to ~81KB (measured 80,731 B, -42%). Headroom to 90KB so a small
|
||||
// skeleton edit doesn't trip CI, but a 10KB regression does.
|
||||
skill: 'plan-ceo-review',
|
||||
sectioned: true,
|
||||
maxSkeletonBytes: 90_000,
|
||||
mustContain: [
|
||||
'SCOPE EXPANSION',
|
||||
'SELECTIVE EXPANSION',
|
||||
'HOLD SCOPE',
|
||||
'SCOPE REDUCTION',
|
||||
],
|
||||
mustHaveHeadings: ['## Preamble', '## When to invoke'],
|
||||
maxSizeRatio: 1.05,
|
||||
minBytes: 80_000,
|
||||
},
|
||||
{
|
||||
// Carved (v2 plan T9): skeleton + sections/review-sections.md. The 4-section
|
||||
// review, outside voice, and required outputs moved to the section; content
|
||||
// checks run against the union. Skeleton shrank 106,984 -> 54,892 B (-48.7%);
|
||||
// maxSkeletonBytes 62KB = measured + headroom.
|
||||
skill: 'plan-eng-review',
|
||||
sectioned: true,
|
||||
maxSkeletonBytes: 62_000,
|
||||
mustContain: [
|
||||
'Architecture',
|
||||
'Code Quality',
|
||||
'Test',
|
||||
'Performance',
|
||||
],
|
||||
mustHaveHeadings: ['## Preamble', '## When to invoke'],
|
||||
maxSizeRatio: 1.05,
|
||||
minBytes: 70_000,
|
||||
},
|
||||
{
|
||||
// Carved (v2 plan T9): skeleton + sections/review-sections.md. The 7 design
|
||||
// passes + required outputs moved to the section; content checks run against
|
||||
// the union. Skeleton shrank 112,057 -> 76,024 B (-32.2%); maxSkeletonBytes
|
||||
// 82KB = measured + headroom.
|
||||
skill: 'plan-design-review',
|
||||
sectioned: true,
|
||||
maxSkeletonBytes: 82_000,
|
||||
mustContain: [
|
||||
'design',
|
||||
'visual',
|
||||
],
|
||||
mustHaveHeadings: ['## Preamble', '## When to invoke'],
|
||||
maxSizeRatio: 1.05,
|
||||
minBytes: 70_000,
|
||||
},
|
||||
/**
|
||||
* Monolith (non-carved) invariants — hand-written. Carved-skill invariants are
|
||||
* generated from CARVE_GUARDS below (single source of truth), so they never drift
|
||||
* from the size-budget / static / behavioral guards.
|
||||
*/
|
||||
const MONOLITH_INVARIANTS: ParityInvariant[] = [
|
||||
// cso is now carved — its invariant is generated from CARVE_GUARDS below.
|
||||
{
|
||||
skill: 'review',
|
||||
mustContain: ['confidence', 'P1', 'P2'],
|
||||
@@ -299,21 +227,6 @@ export const PARITY_INVARIANTS: ParityInvariant[] = [
|
||||
maxSizeRatio: 1.05,
|
||||
minBytes: 30_000,
|
||||
},
|
||||
{
|
||||
// Carved (v2 plan T9): skeleton SKILL.md + sections/design-and-handoff.md.
|
||||
// Phase 5 (design doc) + Phase 6 (handoff) moved into the section, so
|
||||
// 'design doc' / 'problem statement' now live there — content checks run
|
||||
// against the union. maxSkeletonBytes asserts the always-loaded skeleton
|
||||
// shrank from the ~118KB monolith to ~89KB (measured 88,975 B, -24.8%);
|
||||
// headroom to 96KB so a small skeleton edit doesn't trip CI.
|
||||
skill: 'office-hours',
|
||||
sectioned: true,
|
||||
maxSkeletonBytes: 96_000,
|
||||
mustContain: ['design doc', 'problem statement'],
|
||||
mustHaveHeadings: ['## Preamble', '## When to invoke'],
|
||||
maxSizeRatio: 1.05,
|
||||
minBytes: 70_000,
|
||||
},
|
||||
{
|
||||
skill: 'autoplan',
|
||||
mustContain: ['ceo', 'eng', 'design'],
|
||||
@@ -322,3 +235,27 @@ export const PARITY_INVARIANTS: ParityInvariant[] = [
|
||||
minBytes: 70_000,
|
||||
},
|
||||
];
|
||||
|
||||
/**
 * Parity invariants for carved skills, GENERATED from the canonical CARVE_GUARDS
 * registry (EQ1: single source of truth). The skeleton ceiling
 * (maxSkeletonBytes), union floor (minUnionBytes) and content phrases
 * (mustContain) are owned by carve-guards.ts; this only reshapes each entry into
 * a ParityInvariant. Registering a carve there therefore adds its union guard
 * here automatically — which is how plan-devex-review (previously in
 * SECTIONS_EXTRACTED but missing a sectioned parity invariant) is now guarded.
 */
const CARVED_INVARIANTS: ParityInvariant[] = Object.values(CARVE_GUARDS).map(
  ({ skill, maxSkeletonBytes, minUnionBytes, mustContain, maxSizeRatio }) => ({
    skill,
    sectioned: true,
    maxSkeletonBytes,
    minBytes: minUnionBytes,
    mustContain,
    mustHaveHeadings: ['## Preamble', '## When to invoke'],
    // Per-skill override when the registry sets one; otherwise the 1.05 default.
    maxSizeRatio: maxSizeRatio ?? 1.05,
  }),
);
|
||||
|
||||
/** All parity invariants: the hand-written monolith entries followed by the generated carved entries. */
export const PARITY_INVARIANTS: ParityInvariant[] = MONOLITH_INVARIANTS.concat(CARVED_INVARIANTS);
|
||||
|
||||
@@ -52,6 +52,9 @@ export class ClaudeAdapter implements ProviderAdapter {
|
||||
timeout: opts.timeoutMs,
|
||||
encoding: 'utf-8',
|
||||
maxBuffer: 32 * 1024 * 1024,
|
||||
// Default GSTACK_HEADLESS=1 so a benchmark run classifies as headless (an
|
||||
// AskUserQuestion failure BLOCKs rather than emitting unanswerable prose).
|
||||
env: { ...process.env, GSTACK_HEADLESS: '1' },
|
||||
});
|
||||
const parsed = this.parseOutput(out);
|
||||
return {
|
||||
|
||||
@@ -176,7 +176,11 @@ export async function runSkillTest(options: {
|
||||
|
||||
const proc = Bun.spawn(['sh', '-c', `cat "${promptFile}" | claude ${args.map(a => `"${a}"`).join(' ')}`], {
|
||||
cwd: workingDirectory,
|
||||
env: extraEnv ? { ...process.env, ...extraEnv } : undefined,
|
||||
// Default GSTACK_HEADLESS=1 so eval/E2E runs classify as headless (BLOCK on an
|
||||
// AskUserQuestion failure rather than emit a prose question no human reads). A
|
||||
// suite exercising the INTERACTIVE prose-fallback path opts out by passing
|
||||
// `env: { GSTACK_HEADLESS: '' }` — extraEnv wins because it spreads last.
|
||||
env: { ...process.env, GSTACK_HEADLESS: '1', ...extraEnv },
|
||||
stdout: 'pipe',
|
||||
stderr: 'pipe',
|
||||
});
|
||||
|
||||
@@ -123,6 +123,11 @@ export const E2E_TOUCHFILES: Record<string, string[]> = {
|
||||
'ship-idempotency-pty': ['ship/**', 'bin/gstack-next-version', 'bin/gstack-version-bump', 'scripts/resolvers/sections.ts', 'lib/worktree.ts', 'test/helpers/claude-pty-runner.ts'],
|
||||
'ship-section-loading': ['ship/**', 'scripts/resolvers/sections.ts', 'scripts/gen-skill-docs.ts', 'test/helpers/auq-sdk-capture.ts', 'test/helpers/session-runner.ts'],
|
||||
'plan-ceo-section-loading': ['plan-ceo-review/**', 'scripts/resolvers/sections.ts', 'scripts/gen-skill-docs.ts', 'test/helpers/auq-sdk-capture.ts', 'test/helpers/session-runner.ts'],
|
||||
// Data-driven behavioral guard for the 'plan'/'prompt' carves (eng, design,
|
||||
// devex, office-hours + future PR2 carves). One file iterating CARVE_GUARDS;
|
||||
// the selector sets GSTACK_CARVE_SKILL=<name> to scope cost to the changed
|
||||
// skill (D-CODEX A). Touching the registry/helper or sections.ts runs all.
|
||||
'carve-section-loading': ['plan-eng-review/**', 'plan-design-review/**', 'plan-devex-review/**', 'office-hours/**', 'document-release/**', 'design-consultation/**', 'cso/**', 'test/helpers/carve-guards.ts', 'scripts/resolvers/sections.ts', 'scripts/gen-skill-docs.ts', 'test/helpers/auq-sdk-capture.ts', 'test/helpers/session-runner.ts'],
|
||||
'autoplan-chain-pty': ['autoplan/**', 'plan-ceo-review/**', 'plan-design-review/**', 'plan-eng-review/**', 'plan-devex-review/**', 'test/fixtures/plans/ui-heavy-feature.md', 'test/helpers/claude-pty-runner.ts'],
|
||||
'e2e-harness-audit': ['plan-ceo-review/**', 'plan-eng-review/**', 'plan-design-review/**', 'plan-devex-review/**', 'scripts/resolvers/preamble/generate-completion-status.ts', 'test/helpers/agent-sdk-runner.ts', 'test/helpers/claude-pty-runner.ts'],
|
||||
|
||||
@@ -512,6 +517,7 @@ export const E2E_TIERS: Record<string, 'gate' | 'periodic'> = {
|
||||
'ship-idempotency-pty': 'periodic', // ~$3/run, real /ship in plan mode
|
||||
'ship-section-loading': 'periodic', // ~$3/run, real /ship; asserts section reads
|
||||
'plan-ceo-section-loading': 'periodic', // ~$3-5/run, real /plan-ceo-review; asserts section read
|
||||
'carve-section-loading': 'periodic', // ~$1-2/skill, data-driven; GSTACK_CARVE_SKILL scopes to one
|
||||
'autoplan-chain-pty': 'periodic', // ~$8/run, all 3 phases sequential
|
||||
|
||||
// Per-finding count + review-report-at-bottom — periodic because each
|
||||
|
||||
@@ -0,0 +1,135 @@
|
||||
/**
|
||||
* pr-title-sync.yml is a `pull_request_target` workflow — static injection
|
||||
* tripwire (gate, free).
|
||||
*
|
||||
* The anxiety this kills: `pull_request_target` runs with a WRITE token in the
|
||||
* base-repo context, even for fork PRs. That is what lets this workflow rewrite
|
||||
* fork-PR titles (the backstop). It is also the single most dangerous workflow
|
||||
* trigger in GitHub Actions. Two classic footguns turn it into remote code
|
||||
* execution / token theft, and `actionlint` catches NEITHER:
|
||||
*
|
||||
* 1. Checking out the PR head (`actions/checkout` with a `ref:` pointing at
|
||||
* `pull_request.head` / `head_ref`) and then running anything from it —
|
||||
* that executes attacker-controlled fork code with the write token.
|
||||
* 2. Interpolating an attacker-controlled `${{ github.event.pull_request.* }}`
|
||||
* field directly INSIDE a `run:` block — the title/body are attacker-
|
||||
* controlled and the `${{ }}` is expanded into the shell before execution,
|
||||
* so a crafted title runs as code. Those fields MUST arrive via `env:` and
|
||||
* be referenced as `"$VAR"` (shell-quoted), never inlined.
|
||||
*
|
||||
* This tripwire reads the workflow file directly and fails CI if either pattern
|
||||
* reappears. Mirrors the static-grep invariant tests in browse/test
|
||||
* (terminal-agent-pid-identity, server-sanitize-surrogates).
|
||||
*
|
||||
* Note: `gh api ... -q '.head.sha'` inside a run block is SAFE (reading PR
|
||||
* metadata as data via a jq filter string, not `${{ }}` interpolation), so we
|
||||
* ban the interpolation form specifically, not the literal substring `head.sha`.
|
||||
*/
|
||||
import { describe, test, expect } from 'bun:test';
|
||||
import * as fs from 'node:fs';
|
||||
import * as path from 'node:path';
|
||||
|
||||
const WORKFLOW = path.resolve(__dirname, '..', '.github', 'workflows', 'pr-title-sync.yml');
|
||||
|
||||
/** Indentation width (count of leading spaces) of a line. Tabs are not counted. */
function indent(line: string): number {
  return /^ */.exec(line)?.[0].length ?? 0;
}

/**
 * Return the lines that live inside a `run:` step, each tagged with its 1-based
 * line number.
 *
 * Recognised forms:
 *   - `run: <inline command>`                → one entry holding the command text
 *   - `run: |` / `run: >` block scalars, including chomping (`-`/`+`) and
 *     indentation (`1`-`9`) indicators and a trailing `# comment`
 *   - a bare `run:` followed by more-indented lines
 *   - any of the above written as the first key of a step (`- run: ...`)
 *
 * Both literal (`|`) and folded (`>`) scalars must be scanned: either one ends up
 * as shell, so missing one would leave the injection check blind to it.
 *
 * @param content Full text of the workflow file (LF or CRLF line endings).
 * @returns Entries in file order. Blank lines encountered while inside a block
 *          are included as-is.
 */
function runBlockLines(content: string): Array<{ n: number; text: string }> {
  // A `run:` key, optionally introduced by a sequence dash. Group 1 is everything
  // before `run`, so its length is the column of the key itself.
  const runKey = /^(\s*(?:-\s+)?)run:\s*(.*)$/;
  // Block scalar header: `|` or `>`, optional chomping/indentation indicators in
  // either order, optional trailing comment.
  const blockHeader = /^[|>](?:[+-][1-9]?|[1-9][+-]?)?\s*(?:#.*)?$/;

  const lines = content.split(/\r?\n/);
  const out: Array<{ n: number; text: string }> = [];
  let inRun = false;
  let runIndent = -1;

  for (let i = 0; i < lines.length; i++) {
    const line = lines[i] ?? '';
    const n = i + 1;

    if (inRun) {
      // Blank lines and anything indented deeper than the `run:` key belong to
      // the script. This is checked BEFORE key detection so a script line that
      // happens to start with `run:` does not terminate the block.
      if (line.trim() === '' || indent(line) > runIndent) {
        out.push({ n, text: line });
        continue;
      }
      // Dedent to the key's column or shallower: the block is over. Fall through
      // so this same line can open a new `run:` step.
      inRun = false;
    }

    const key = runKey.exec(line);
    if (!key) continue;
    const [, prefix = '', value = ''] = key;

    if (value.trim() === '' || blockHeader.test(value.trim())) {
      inRun = true;
      runIndent = prefix.length;
    } else {
      out.push({ n, text: value });
    }
  }
  return out;
}
|
||||
|
||||
describe('pr-title-sync.yml pull_request_target safety', () => {
  // Read defensively: an unguarded readFileSync here throws while the suite is
  // being collected, so a missing workflow would surface as a load error instead
  // of a clean failure of the 'workflow file exists' test below.
  const content = fs.existsSync(WORKFLOW) ? fs.readFileSync(WORKFLOW, 'utf-8') : '';

  test('workflow file exists', () => {
    expect(fs.existsSync(WORKFLOW)).toBe(true);
  });

  test('does NOT check out the PR head ref (no fork-code execution)', () => {
    const offenders: string[] = [];
    content.split('\n').forEach((line, i) => {
      // A checkout `ref:` (or any `ref:`) pointing at the PR head is the footgun.
      if (/ref:\s*\$\{\{[^}]*(pull_request\.head|head_ref)/.test(line)) {
        offenders.push(`  L${i + 1}: ${line.trim()}`);
      }
    });
    if (offenders.length > 0) {
      throw new Error(
        `pr-title-sync.yml checks out the PR head under pull_request_target — that ` +
          `runs attacker-controlled fork code with a write token. Check out the base ` +
          `repo (no ref:) and read PR-head data via the API instead.\n` +
          offenders.join('\n'),
      );
    }
  });

  test('does NOT interpolate ${{ github.event.pull_request.* }} inside a run: block', () => {
    const offenders: string[] = [];
    for (const { n, text } of runBlockLines(content)) {
      if (/\$\{\{\s*github\.event\.pull_request/.test(text)) {
        offenders.push(`  L${n}: ${text.trim()}`);
      }
    }
    if (offenders.length > 0) {
      throw new Error(
        `pr-title-sync.yml inlines an attacker-controlled PR field into a run: block ` +
          `— a crafted PR title/body executes as shell. Pass it via env: and ` +
          `reference "$VAR" (shell-quoted) instead.\n` +
          offenders.join('\n'),
      );
    }
  });

  test('uses pull_request_target (the hardening is actually present)', () => {
    // Positive assertion: if someone reverts to plain pull_request, the fork
    // backstop silently stops working (read-only token). Keep it intentional.
    expect(content).toMatch(/\bpull_request_target\b/);
  });

  test('passes the PR title through env:, not raw interpolation', () => {
    // The safe pattern: OLD_TITLE: ${{ github.event.pull_request.title }} in an
    // env: mapping, consumed as "$OLD_TITLE" in script. This only checks that an
    // env: key and a title reference both exist somewhere in the file; the
    // run-block test above is what rejects the unsafe inline form.
    expect(content).toMatch(/env:/);
    expect(content).toMatch(/github\.event\.pull_request\.title/);
  });
});
|
||||
@@ -16,6 +16,8 @@
|
||||
* for the weekly periodic eval to notice.
|
||||
*/
|
||||
import { describe, test, expect } from 'bun:test';
|
||||
import * as fs from 'fs';
|
||||
import * as path from 'path';
|
||||
import type { TemplateContext } from '../scripts/resolvers/types';
|
||||
import { HOST_PATHS } from '../scripts/resolvers/types';
|
||||
import { generateAskUserFormat } from '../scripts/resolvers/preamble/generate-ask-user-format';
|
||||
@@ -161,3 +163,88 @@ describe('generateAskUserFormat — 5+ option split rule (slim inline + docs poi
|
||||
expect(out).toContain('**Non-ASCII characters');
|
||||
});
|
||||
});
|
||||
|
||||
/**
 * Pins the wording of the runtime-failure section produced by
 * generateAskUserFormat. Each test matches literal phrases or patterns in the
 * generated text, so rewording the generator requires updating these in step.
 */
describe('generateAskUserFormat — runtime-failure prose fallback', () => {
  // Generated once and shared: every test below only reads this string.
  const rendered = generateAskUserFormat(makeCtx());

  test('documents the unavailable/failed subsection', () => {
    const heading = /When AskUserQuestion is unavailable or a call fails/i;
    expect(rendered).toMatch(heading);
  });

  test('carves out the auto-decide denial as NOT a failure', () => {
    expect(rendered).toContain('[plan-tune auto-decide]');
    expect(rendered).toMatch(/NOT a failure/i);
    // An auto-decide denial must also be explicitly excluded from the prose fallback.
    expect(rendered).toMatch(/Do NOT[\s\S]{0,40}fall back to prose|never prose/i);
  });

  test('retries the errored call exactly once before degrading', () => {
    expect(rendered).toMatch(/retry the SAME call \*\*once\*\*|retry the same call.*once/i);
    // The retry has to be guarded so the user is not asked twice.
    expect(rendered).toMatch(/double-prompt|no answer could have surfaced/i);
  });

  test('branches on SESSION_KIND: spawned / headless / interactive', () => {
    expect(rendered).toContain('SESSION_KIND');
    // One pattern per branch, checked in order; the last covers the
    // empty/absent SESSION_KIND case degrading to interactive.
    const branchPatterns = [
      /`spawned`[\s\S]*auto-choose/,
      /`headless`[\s\S]*BLOCKED/,
      /`interactive`[\s\S]*prose fallback/,
      /empty\/absent[\s\S]{0,40}interactive/i,
    ];
    for (const pattern of branchPatterns) {
      expect(rendered).toMatch(pattern);
    }
  });

  // The three tests below cover the mandatory triad required of the plain-text output.
  test('prose fallback mandates the triad: issue ELI10', () => {
    expect(rendered).toMatch(/ELI10 of the issue itself/i);
  });

  test('prose fallback mandates the triad: per-choice Completeness score', () => {
    expect(rendered).toMatch(/Completeness scores per choice/i);
    expect(rendered).toMatch(/Completeness: X\/10.*EACH choice|on EACH choice/i);
  });

  test('prose fallback mandates the triad: recommendation + (recommended) marker', () => {
    expect(rendered).toMatch(/Recommendation: <choice> because/);
    expect(rendered).toMatch(/\(recommended\)`? marker on that choice/);
  });

  test('prose fallback is one paragraph per choice, not a bare bullet list', () => {
    expect(rendered).toMatch(/ONE paragraph per choice/i);
    expect(rendered).toMatch(/never a bare bullet list/i);
  });

  test('prose fallback tells the user to reply with a letter, then STOP', () => {
    expect(rendered).toMatch(/reply with a letter/i);
    expect(rendered).toMatch(/STOP and wait/i);
  });

  // OV2: wherever the text says "tool_use, not prose" it must also carry the
  // fallback exception, otherwise the fallback contradicts the main rule. These
  // two tests fail if a later edit drops the qualifier.
  test('OV2: the Format line qualifies "not prose" with the fallback exception', () => {
    expect(rendered).toMatch(/must be sent as tool_use, not prose — unless the documented failure fallback/);
  });

  test('OV2: the self-check "not writing prose" line carries the fallback qualifier', () => {
    expect(rendered).toMatch(/not writing prose — unless the documented failure fallback applies/);
  });
});
|
||||
|
||||
/**
 * CQ2 cross-file invariant: the auto-decide prefix string must appear both in
 * the question-preference hook's source file and in the text produced by
 * generateAskUserFormat. The hook is checked by reading its source as text,
 * not by executing it.
 */
describe('CQ2 — cross-file invariant: auto-decide prefix matches the hook', () => {
  // Single source of truth for the literal under test, so the tests in this
  // suite cannot drift apart from each other.
  const PREFIX = '[plan-tune auto-decide]';

  const out = generateAskUserFormat(makeCtx());
  const hookSrc = fs.readFileSync(
    path.resolve(__dirname, '..', 'hosts', 'claude', 'hooks', 'question-preference-hook.ts'),
    'utf-8',
  );

  test('the hook actually emits the [plan-tune auto-decide] prefix', () => {
    expect(hookSrc).toContain(PREFIX);
  });

  test('the resolver references the exact same prefix the hook emits', () => {
    // If a future edit reworded the hook reason, this catches the drift: the prose
    // fallback would stop recognizing the auto-decide denial as not-a-failure.
    // Asserted separately (rather than as one combined boolean) so a failure
    // names the side that lost the prefix.
    expect(hookSrc).toContain(PREFIX);
    expect(out).toContain(PREFIX);
  });
});
|
||||
|
||||
@@ -74,7 +74,7 @@ describeE2E('/plan-ceo-review section-loading E2E (periodic, SDK capture)', () =
|
||||
'Review the plan in PLAN.md. Hold the current scope (HOLD SCOPE mode) — do not challenge or expand scope. Run the full CEO review and produce the review report.',
|
||||
requiredSections: REQUIRED_SECTIONS,
|
||||
reportMarker: /GSTACK REVIEW REPORT|COMPLETION SUMMARY|review/i,
|
||||
testName: '/plan-ceo-review section-loading',
|
||||
testName: 'plan-ceo-section-loading',
|
||||
runId,
|
||||
});
|
||||
|
||||
|
||||
@@ -65,7 +65,7 @@ describeE2E('/ship section-loading E2E (periodic, SDK capture)', () => {
|
||||
'This is a FRESH version-changing ship: the branch has a real code change (app.js gained a new function with a test), VERSION still equals the base version (0.0.1, so it needs a bump), and CHANGELOG.md needs a new entry. Follow the skill\'s flow for a version-changing ship: run the pre-landing review and prepare the CHANGELOG entry. Produce the ship plan / review report. Do NOT actually commit, push, or open a PR.',
|
||||
requiredSections: REQUIRED_SECTIONS,
|
||||
reportMarker: /version|changelog|review|ship/i,
|
||||
testName: '/ship section-loading',
|
||||
testName: 'ship-section-loading',
|
||||
runId,
|
||||
});
|
||||
|
||||
|
||||
@@ -33,6 +33,7 @@ import * as fs from 'fs';
|
||||
import * as path from 'path';
|
||||
import { captureBaseline, type ParityBaseline } from './helpers/capture-parity-baseline';
|
||||
import { logBudgetOverride } from './helpers/budget-override';
|
||||
import { CARVED_SKILLS } from './helpers/carve-guards';
|
||||
|
||||
const REPO_ROOT = path.resolve(import.meta.dir, '..');
|
||||
const BASELINE_PATH = path.join(REPO_ROOT, 'test', 'fixtures', 'parity-baseline-v1.47.0.0.json');
|
||||
@@ -161,9 +162,10 @@ describe('SKILL.md size budget regression (gate, free)', () => {
|
||||
const MIN_RATIO = 0.80; // a skill at <80% of its v1.44 size signals mass-deletion
|
||||
// Carved skills (v2 plan T9): the skeleton SKILL.md intentionally shrinks
|
||||
// because prose moved into sections/*.md. The union size is guarded instead
|
||||
// by the sectioned ship invariant in parity-harness.ts (minBytes on the
|
||||
// by the sectioned invariant in parity-harness.ts (minBytes on the
|
||||
// skeleton+sections union), so exempt the skeleton from the body-strip floor.
|
||||
const SECTIONS_EXTRACTED = new Set<string>(['ship', 'plan-ceo-review', 'office-hours', 'plan-eng-review', 'plan-design-review', 'plan-devex-review']);
|
||||
// EQ1: derived from the canonical CARVE_GUARDS registry — no parallel list.
|
||||
const SECTIONS_EXTRACTED = new Set<string>(CARVED_SKILLS);
|
||||
|
||||
const undershoots: Array<{
|
||||
skill: string; beforeBytes: number; afterBytes: number; ratio: number;
|
||||
|
||||
@@ -111,7 +111,16 @@ describe('/spec quality gate fallback', () => {
|
||||
|
||||
describe('/spec fail-closed redaction (shared engine)', () => {
|
||||
test('the full taxonomy (with secret prefixes) lives in the generated /cso doc', () => {
|
||||
const cso = fs.readFileSync(path.join(ROOT, 'cso', 'SKILL.md'), 'utf-8');
|
||||
// cso is carved — the Secrets Archaeology prose + prefixes moved into
|
||||
// sections/audit-phases.md; read the skeleton+sections union.
|
||||
const csoDir = path.join(ROOT, 'cso');
|
||||
let cso = fs.readFileSync(path.join(csoDir, 'SKILL.md'), 'utf-8');
|
||||
const secDir = path.join(csoDir, 'sections');
|
||||
if (fs.existsSync(secDir)) {
|
||||
for (const f of fs.readdirSync(secDir).sort()) {
|
||||
if (f.endsWith('.md') && !f.endsWith('.md.tmpl')) cso += '\n' + fs.readFileSync(path.join(secDir, f), 'utf-8');
|
||||
}
|
||||
}
|
||||
expect(cso).toContain('AKIA');
|
||||
expect(cso).toMatch(/ghp_|gho_|ghs_/);
|
||||
expect(cso).toContain('sk-ant-');
|
||||
|
||||
Reference in New Issue
Block a user