test(plan-tune): 5 cathedral E2E scenarios + touchfile registration

Plan-tune cathedral T16 (per D12 — all 5 in gate tier). One consolidated
file with five describeIfSelected scenarios, each selectable by its own
touchfile entry so they only run when the relevant code changes (or
EVALS_ALL=1 forces all):

  plan-tune-hook-capture     — PostToolUse hook fires → question-log fills
  plan-tune-enforcement      — never-ask + marker + 2-way → deny+reason
                               + auto-decided event logged
  plan-tune-annotation       — declared profile + memory nugget
                               → additionalContext surfaced on defer
  plan-tune-codex-import     — synthetic JSONL → import bin → log with
                               source=codex-import-marker
  plan-tune-dream-cycle      — apply proposal → re-fire question
                               → memory injected via additionalContext

Each scenario fixtures an isolated git repo + bins + scripts + hooks
under tmp, then exercises the cathedral chain end-to-end against real
on-disk binaries (no mocks at the bin layer). GSTACK_STATE_ROOT keeps
the user's real ~/.gstack untouched.

These five complement the existing unit tests by proving the full
sub-process chain works (not just individual functions in isolation).
They DON'T spawn claude -p because the cathedral's substrate behavior is
deterministic — agent compliance is no longer the variable. The existing
test/skill-e2e-plan-tune.test.ts (plan-tune-inspect) still covers the
LLM-driven intent-routing behavior.

Cost: each scenario runs in ~1s at $0 because it makes no claude -p invocations.
Touchfile-gated, so they only run on PRs that touch cathedral code.

Also fixes a bug found by the E2E: question-log-hook didn't pass the
incoming tool call's cwd to spawnSync when invoking gstack-question-log,
so the bin used the hook process's cwd (the repo root) instead of the
session's cwd. Result: log writes landed in the wrong project bucket.
Fix mirrors the same cwd-passing pattern from question-preference-hook.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Garry Tan
2026-05-27 07:58:58 -07:00
parent 3eab8939e3
commit d50d4741b7
3 changed files with 477 additions and 2 deletions
+5 -2
View File
@@ -204,7 +204,7 @@ function detectSkill(cwd: string | undefined): string {
return 'unknown';
}
function spawnLog(payload: Record<string, unknown>): void {
function spawnLog(payload: Record<string, unknown>, cwd?: string): void {
// Locate the bin relative to this script's directory.
const here = path.dirname(new URL(import.meta.url).pathname);
// hosts/claude/hooks/ -> ../../../bin/
@@ -214,6 +214,9 @@ function spawnLog(payload: Record<string, unknown>): void {
encoding: 'utf-8',
stdio: ['ignore', 'pipe', 'pipe'],
timeout: 3000,
// Run from the originating tool call's cwd so gstack-slug resolves to
// the project the user is actually in, not the hook script's location.
cwd: cwd && fs.existsSync(cwd) ? cwd : undefined,
});
if (res.status !== 0) {
logHookError(`gstack-question-log exited ${res.status}: ${res.stderr || res.stdout}`);
@@ -274,7 +277,7 @@ async function main(): Promise<void> {
if (recommended) payload.recommended = recommended.slice(0, 64);
if (choice.free_text) payload.free_text = String(choice.free_text);
spawnLog(payload);
spawnLog(payload, stdin.cwd);
}
process.exit(0);
+14
View File
@@ -191,6 +191,13 @@ export const E2E_TOUCHFILES: Record<string, string[]> = {
// /plan-tune (v1 observational)
'plan-tune-inspect': ['plan-tune/**', 'scripts/question-registry.ts', 'scripts/psychographic-signals.ts', 'scripts/one-way-doors.ts', 'bin/gstack-question-log', 'bin/gstack-question-preference', 'bin/gstack-developer-profile'],
// /plan-tune cathedral (T16 — 5 E2E scenarios, all gate per D12)
'plan-tune-hook-capture': ['hosts/claude/hooks/**', 'bin/gstack-question-log', 'bin/gstack-developer-profile', 'plan-tune/**'],
'plan-tune-enforcement': ['hosts/claude/hooks/**', 'bin/gstack-question-preference', 'scripts/question-registry.ts'],
'plan-tune-annotation': ['hosts/claude/hooks/**', 'scripts/declared-annotation.ts', 'scripts/psychographic-signals.ts', 'scripts/question-registry.ts'],
'plan-tune-codex-import': ['bin/gstack-codex-session-import', 'bin/gstack-question-log', 'docs/spikes/codex-session-format.md'],
'plan-tune-dream-cycle': ['bin/gstack-distill-free-text', 'bin/gstack-distill-apply', 'hosts/claude/hooks/**', 'plan-tune/**'],
// Codex offering verification
'codex-offered-office-hours': ['office-hours/**', 'scripts/gen-skill-docs.ts'],
'codex-offered-ceo-review': ['plan-ceo-review/**', 'scripts/gen-skill-docs.ts'],
@@ -528,6 +535,13 @@ export const E2E_TIERS: Record<string, 'gate' | 'periodic'> = {
// /plan-tune — gate (core v1 DX promise: plain-English intent routing)
'plan-tune-inspect': 'gate',
// /plan-tune cathedral (T16 per D12 — all gate)
'plan-tune-hook-capture': 'gate',
'plan-tune-enforcement': 'gate',
'plan-tune-annotation': 'gate',
'plan-tune-codex-import': 'gate',
'plan-tune-dream-cycle': 'gate',
// Codex offering verification
'codex-offered-office-hours': 'gate',
'codex-offered-ceo-review': 'gate',
+458
View File
@@ -0,0 +1,458 @@
/**
 * /plan-tune cathedral E2E (T16) — 5 scenarios, all gate tier per D12.
 *
 * Each scenario verifies that the cathedral's substrate works end-to-end
 * by spawning the real hook scripts and bins as sub-processes against an
 * isolated fixture repo under tmp. No `claude -p` session is started: the
 * hooks are fed a synthetic tool-call payload on stdin instead. Unit tests in
 * test/{question-log-hook, question-preference-hook, declared-annotation,
 * distill-*}.test.ts cover the individual functions; this file proves the
 * full sub-process chain (hook -> bin -> on-disk state) works together.
 *
 * Touchfile registration in test/helpers/touchfiles.ts:
 * - plan-tune-hook-capture
 * - plan-tune-enforcement
 * - plan-tune-annotation
 * - plan-tune-codex-import
 * - plan-tune-dream-cycle
 *
 * Each scenario uses GSTACK_STATE_ROOT to isolate from the user's real
 * ~/.gstack (per cathedral T1 + Codex D16 fix). Cost: $0 per scenario, since
 * no LLM is invoked.
 */
import { beforeAll, afterAll, expect } from 'bun:test';
import {
ROOT,
describeIfSelected,
testConcurrentIfSelected,
copyDirSync,
createEvalCollector,
finalizeEvalCollector,
} from './helpers/e2e-helpers';
import { spawnSync } from 'child_process';
import * as fs from 'fs';
import * as path from 'path';
import * as os from 'os';
// Eval collector for this test file, registered under the
// 'e2e-plan-tune-cathedral' label. None of the scenarios below record into it
// directly; it is only created here and finalized once after all suites ran.
const collector = createEvalCollector('e2e-plan-tune-cathedral');
// NOTE(review): if finalizeEvalCollector returns a promise, it is not awaited
// by this hook — confirm against the helper's signature.
afterAll(() => {
finalizeEvalCollector(collector);
});
/**
 * Build a throwaway fixture project containing everything the cathedral
 * chain needs on disk.
 *
 * Steps, in order:
 *  1. create a fresh temp directory named with `prefix`, plus a
 *     `.gstack-state` directory inside it;
 *  2. turn it into a git repo with one commit (so gstack-slug resolves a
 *     deterministic slug);
 *  3. copy the required bins from `ROOT/bin` (marked executable) and the
 *     scripts they import from `ROOT/scripts` — files missing from the
 *     checkout are skipped silently;
 *  4. copy the whole `hosts/claude/hooks` directory.
 *
 * @param prefix - Name prefix handed to `fs.mkdtempSync` under the OS tmp dir.
 * @returns `workDir` (fixture root), `stateRoot` (`<workDir>/.gstack-state`)
 *   and `slug` (basename of `workDir` with every character outside
 *   `[a-zA-Z0-9._-]` removed; presumably mirrors what gstack-slug derives —
 *   confirm against the bin).
 */
function scaffoldFixture(prefix: string): { workDir: string; stateRoot: string; slug: string } {
  const workDir = fs.mkdtempSync(path.join(os.tmpdir(), prefix));
  const stateRoot = path.join(workDir, '.gstack-state');
  fs.mkdirSync(stateRoot, { recursive: true });

  // Run a git sub-command inside the fixture; exit status is not inspected.
  const git = (...args: string[]): void => {
    spawnSync('git', args, { cwd: workDir, stdio: 'pipe' });
  };
  git('init', '-b', 'main');
  git('config', 'user.email', 't@t.com');
  git('config', 'user.name', 'T');
  fs.writeFileSync(path.join(workDir, 'README.md'), '# cathedral fixture\n');
  git('add', '.');
  git('commit', '-m', 'init');

  // Bins: copied one by one and made executable.
  const binNames = [
    'gstack-slug',
    'gstack-config',
    'gstack-paths',
    'gstack-question-log',
    'gstack-question-preference',
    'gstack-developer-profile',
    'gstack-codex-session-import',
    'gstack-distill-free-text',
    'gstack-distill-apply',
  ];
  const binDir = path.join(workDir, 'bin');
  fs.mkdirSync(binDir, { recursive: true });
  binNames.forEach((name) => {
    const from = path.join(ROOT, 'bin', name);
    if (!fs.existsSync(from)) return;
    const to = path.join(binDir, name);
    fs.copyFileSync(from, to);
    fs.chmodSync(to, 0o755);
  });

  // Scripts that the bins import.
  const scriptNames = [
    'question-registry.ts',
    'psychographic-signals.ts',
    'archetypes.ts',
    'one-way-doors.ts',
    'declared-annotation.ts',
  ];
  const scriptsDir = path.join(workDir, 'scripts');
  fs.mkdirSync(scriptsDir, { recursive: true });
  scriptNames.forEach((name) => {
    const from = path.join(ROOT, 'scripts', name);
    if (!fs.existsSync(from)) return;
    fs.copyFileSync(from, path.join(scriptsDir, name));
  });

  // Hooks directory, copied wholesale.
  const hooksSubPath = ['hosts', 'claude', 'hooks'];
  copyDirSync(path.join(ROOT, ...hooksSubPath), path.join(workDir, ...hooksSubPath));

  return {
    workDir,
    stateRoot,
    slug: path.basename(workDir).replace(/[^a-zA-Z0-9._-]/g, ''),
  };
}
/**
 * Remove a fixture directory tree created by a scenario.
 *
 * Best-effort: any error raised by the removal is swallowed so that teardown
 * never fails a test run.
 *
 * @param workDir - Root of the fixture to delete recursively.
 */
function cleanupFixture(workDir: string): void {
  const removalOptions = { recursive: true, force: true };
  try {
    fs.rmSync(workDir, removalOptions);
  } catch (_ignored) {
    // Deliberately ignored — a leftover temp dir is harmless.
  }
}
// ---------------------------------------------------------------------------
// Scenario 1: Hook capture — PostToolUse hook writes to question-log.jsonl
// ---------------------------------------------------------------------------
describeIfSelected('PlanTune cathedral E2E: hook capture', ['plan-tune-hook-capture'], () => {
  let fixture: ReturnType<typeof scaffoldFixture>;

  beforeAll(() => {
    fixture = scaffoldFixture('cathedral-cap-');
  });

  afterAll(() => {
    cleanupFixture(fixture.workDir);
  });

  testConcurrentIfSelected('hook directly invoked → log fills', async () => {
    // Direct hook invocation simulates Claude Code's PostToolUse delivery.
    // E2E verifies the hook + bin chain works against real bins on disk
    // (the unit test exercises this with mocks).
    const hookPath = path.join(fixture.workDir, 'hosts', 'claude', 'hooks', 'question-log-hook');
    const payload = {
      session_id: 'cathedral-e2e-cap',
      tool_name: 'AskUserQuestion',
      tool_use_id: 'tu-cap-1',
      tool_input: {
        questions: [
          {
            question:
              'D1 — Cathedral E2E capture <gstack-qid:ship-test-failure-triage>\nRecommendation: A',
            options: ['A) Fix now (recommended)', 'B) Investigate'],
          },
        ],
      },
      tool_response: { answers: [{ option_label: 'A) Fix now (recommended)' }] },
      // The fixture location reaches the hook only through this field; the
      // spawn below does not set a process cwd.
      cwd: fixture.workDir,
    };
    const res = spawnSync(hookPath, [], {
      env: {
        ...process.env,
        GSTACK_STATE_ROOT: fixture.stateRoot,
        GSTACK_QUESTION_LOG_NO_DERIVE: '1',
      },
      input: JSON.stringify(payload),
      encoding: 'utf-8',
    });
    expect(res.status).toBe(0);

    // The event must land in the fixture project's bucket under the isolated
    // state root.
    const logPath = path.join(fixture.stateRoot, 'projects', fixture.slug, 'question-log.jsonl');
    expect(fs.existsSync(logPath)).toBe(true);

    // Drop empty lines before counting: split() on an empty file still yields
    // one (empty) element, which would make the length check vacuous and turn
    // an empty log into a confusing JSON.parse error instead of a clear
    // assertion failure.
    const lines = fs.readFileSync(logPath, 'utf-8').trim().split('\n').filter(Boolean);
    expect(lines.length).toBeGreaterThanOrEqual(1);
    const evt = JSON.parse(lines[0]);
    expect(evt.source).toBe('hook');
    expect(evt.question_id).toBe('ship-test-failure-triage');
  });
});
// ---------------------------------------------------------------------------
// Scenario 2: Enforcement — never-ask preference + marker + 2-way → deny
// ---------------------------------------------------------------------------
describeIfSelected('PlanTune cathedral E2E: enforcement', ['plan-tune-enforcement'], () => {
  let fixture: ReturnType<typeof scaffoldFixture>;

  beforeAll(() => {
    fixture = scaffoldFixture('cathedral-enf-');
    // Pre-seed a never-ask preference for the question id used in the test.
    const projectDir = path.join(fixture.stateRoot, 'projects', fixture.slug);
    fs.mkdirSync(projectDir, { recursive: true });
    const preferences = { 'ship-changelog-voice-polish': 'never-ask' };
    fs.writeFileSync(
      path.join(projectDir, 'question-preferences.json'),
      JSON.stringify(preferences),
    );
  });

  afterAll(() => {
    cleanupFixture(fixture.workDir);
  });

  testConcurrentIfSelected('PreToolUse hook denies + logs auto-decided event', async () => {
    const { workDir, stateRoot, slug } = fixture;
    const hookPath = path.join(workDir, 'hosts', 'claude', 'hooks', 'question-preference-hook');

    // Synthetic AskUserQuestion call carrying the marker for the seeded id.
    const markedQuestion = {
      question: '<gstack-qid:ship-changelog-voice-polish> Polish CHANGELOG entry?',
      options: ['A) Accept (recommended)', 'B) Skip'],
    };
    const hookInput = JSON.stringify({
      session_id: 'cathedral-e2e-enf',
      tool_name: 'AskUserQuestion',
      tool_use_id: 'tu-enf-1',
      tool_input: { questions: [markedQuestion] },
      cwd: workDir,
    });
    const hookEnv = {
      ...process.env,
      GSTACK_STATE_ROOT: stateRoot,
      GSTACK_QUESTION_LOG_NO_DERIVE: '1',
    };

    const res = spawnSync(hookPath, [], { env: hookEnv, input: hookInput, encoding: 'utf-8' });
    expect(res.status).toBe(0);

    // The hook answers on stdout with a deny decision naming the chosen option.
    const decision = JSON.parse(res.stdout || '{}').hookSpecificOutput;
    expect(decision?.permissionDecision).toBe('deny');
    expect(decision?.permissionDecisionReason).toContain('Accept');

    // Exactly one auto-decided event must have been logged for that question.
    const logPath = path.join(stateRoot, 'projects', slug, 'question-log.jsonl');
    expect(fs.existsSync(logPath)).toBe(true);
    const rawLog = fs.readFileSync(logPath, 'utf-8');
    const events = rawLog
      .trim()
      .split('\n')
      .filter((line) => line.length > 0)
      .map((line) => JSON.parse(line));
    const autoDecided = events.filter((event) => event.source === 'auto-decided');
    expect(autoDecided.length).toBe(1);
    expect(autoDecided[0].question_id).toBe('ship-changelog-voice-polish');
  });
});
// ---------------------------------------------------------------------------
// Scenario 3: Annotation — declared profile injected via additionalContext
// ---------------------------------------------------------------------------
describeIfSelected('PlanTune cathedral E2E: annotation', ['plan-tune-annotation'], () => {
  let fixture: ReturnType<typeof scaffoldFixture>;

  beforeAll(() => {
    fixture = scaffoldFixture('cathedral-ann-');
    const { stateRoot } = fixture;

    // A strongly declared profile, intended to annotate any question whose
    // signal_key is detail-preference.
    const declaredProfile = { declared: { detail_preference: 0.9 } };
    fs.writeFileSync(
      path.join(stateRoot, 'developer-profile.json'),
      JSON.stringify(declaredProfile),
    );

    // One memory nugget seeded for the matching signal_key.
    const seededMemory = {
      nuggets: [
        {
          nugget: 'User prefers verbose explanations with tradeoffs',
          applies_to_signal_keys: ['detail-preference'],
          applied_at: new Date().toISOString(),
        },
      ],
    };
    fs.writeFileSync(
      path.join(stateRoot, 'free-text-memory.json'),
      JSON.stringify(seededMemory),
    );
  });

  afterAll(() => {
    cleanupFixture(fixture.workDir);
  });

  testConcurrentIfSelected('PreToolUse hook surfaces memory nugget on defer', async () => {
    const { workDir, stateRoot } = fixture;
    const hookPath = path.join(workDir, 'hosts', 'claude', 'hooks', 'question-preference-hook');

    const markedQuestion = {
      question: '<gstack-qid:ship-todos-reorganize> Reorganize TODOs?',
      options: ['A) Accept (recommended)', 'B) Skip'],
    };
    const hookInput = JSON.stringify({
      session_id: 'cathedral-e2e-ann',
      tool_name: 'AskUserQuestion',
      tool_use_id: 'tu-ann-1',
      tool_input: { questions: [markedQuestion] },
      cwd: workDir,
    });
    const hookEnv = {
      ...process.env,
      GSTACK_STATE_ROOT: stateRoot,
      GSTACK_QUESTION_LOG_NO_DERIVE: '1',
    };

    const res = spawnSync(hookPath, [], { env: hookEnv, input: hookInput, encoding: 'utf-8' });
    expect(res.status).toBe(0);

    // With no blocking preference the hook defers, and the seeded nugget text
    // must show up in the additional context it returns.
    const decision = JSON.parse(res.stdout || '{}').hookSpecificOutput;
    expect(decision?.permissionDecision).toBe('defer');
    expect(decision?.additionalContext).toContain('verbose explanations');
  });
});
// ---------------------------------------------------------------------------
// Scenario 4: Codex import — JSONL session → import bin → log fills
// ---------------------------------------------------------------------------
describeIfSelected('PlanTune cathedral E2E: codex import', ['plan-tune-codex-import'], () => {
  let fixture: ReturnType<typeof scaffoldFixture>;
  let sessionFile: string;

  beforeAll(() => {
    fixture = scaffoldFixture('cathedral-cdx-');
    sessionFile = path.join(fixture.workDir, 'rollout-cathedral.jsonl');

    // Synthetic session: a meta record, one agent message carrying a marked
    // question, then the user's one-letter answer.
    const sessionMeta = {
      type: 'session_meta',
      payload: { id: 'cathedral-sess-1', cwd: fixture.workDir },
    };
    const agentQuestion = {
      timestamp: new Date().toISOString(),
      type: 'event_msg',
      payload: {
        type: 'agent_message',
        message:
          'D1 — Cathedral import <gstack-qid:plan-eng-review-scope-reduce>\nRecommendation: A\nA) Reduce (recommended)\nB) Keep',
      },
    };
    const userAnswer = {
      timestamp: new Date().toISOString(),
      type: 'event_msg',
      payload: { type: 'user_message', message: 'A' },
    };
    const records = [sessionMeta, agentQuestion, userAnswer];
    const jsonl = records.map((record) => JSON.stringify(record)).join('\n') + '\n';
    fs.writeFileSync(sessionFile, jsonl);
  });

  afterAll(() => {
    cleanupFixture(fixture.workDir);
  });

  testConcurrentIfSelected('importer extracts events with codex-import-marker source', async () => {
    const { workDir, stateRoot, slug } = fixture;
    const importerBin = path.join(workDir, 'bin', 'gstack-codex-session-import');
    const importerEnv = {
      ...process.env,
      GSTACK_STATE_ROOT: stateRoot,
      GSTACK_QUESTION_LOG_NO_DERIVE: '1',
    };

    // The importer runs from inside the fixture repo.
    const res = spawnSync(importerBin, [sessionFile], {
      env: importerEnv,
      encoding: 'utf-8',
      cwd: workDir,
    });
    expect(res.status).toBe(0);
    expect(res.stdout).toContain('IMPORTED: 1');

    // Exactly one event, tagged as coming from the marker-based import.
    const logPath = path.join(stateRoot, 'projects', slug, 'question-log.jsonl');
    expect(fs.existsSync(logPath)).toBe(true);
    const rawLog = fs.readFileSync(logPath, 'utf-8');
    const events = rawLog
      .trim()
      .split('\n')
      .filter((line) => line.length > 0)
      .map((line) => JSON.parse(line));
    expect(events.length).toBe(1);
    const [imported] = events;
    expect(imported.source).toBe('codex-import-marker');
    expect(imported.question_id).toBe('plan-eng-review-scope-reduce');
  });
});
// ---------------------------------------------------------------------------
// Scenario 5: Dream cycle round-trip — capture → distill (mocked) → apply →
// re-fire → memory injection
// ---------------------------------------------------------------------------
describeIfSelected('PlanTune cathedral E2E: dream cycle', ['plan-tune-dream-cycle'], () => {
  let fixture: ReturnType<typeof scaffoldFixture>;

  beforeAll(() => {
    fixture = scaffoldFixture('cathedral-dream-');
    // The proposals file is seeded directly (the SDK call is exercised by the
    // unit test); this scenario covers the apply → re-fire round-trip on top
    // of a known proposal shape.
    const projectDir = path.join(fixture.stateRoot, 'projects', fixture.slug);
    fs.mkdirSync(projectDir, { recursive: true });
    const seededProposals = {
      generated_at: new Date().toISOString(),
      source_event_count: 1,
      proposals: [
        {
          kind: 'memory-nugget',
          confidence: 0.95,
          nugget: 'User wants every fix tested before shipping',
          applies_to_signal_keys: ['test-discipline'],
          source_quotes: ['always add tests for any fix'],
        },
      ],
    };
    fs.writeFileSync(
      path.join(projectDir, 'distillation-proposals.json'),
      JSON.stringify(seededProposals),
    );
  });

  afterAll(() => {
    cleanupFixture(fixture.workDir);
  });

  testConcurrentIfSelected('apply → re-fire → memory injected via additionalContext', async () => {
    const { workDir, stateRoot } = fixture;

    // Step 1: apply proposal #0 through gstack-distill-apply.
    const applyBin = path.join(workDir, 'bin', 'gstack-distill-apply');
    const applyRes = spawnSync(applyBin, ['--proposal', '0'], {
      env: { ...process.env, GSTACK_STATE_ROOT: stateRoot },
      encoding: 'utf-8',
      cwd: workDir,
    });
    expect(applyRes.status).toBe(0);

    // The memory file must now exist and hold exactly that one nugget.
    const memPath = path.join(stateRoot, 'free-text-memory.json');
    expect(fs.existsSync(memPath)).toBe(true);
    const memory = JSON.parse(fs.readFileSync(memPath, 'utf-8'));
    expect(memory.nuggets.length).toBe(1);

    // Step 2: re-fire a question whose signal_key matches the nugget. The
    // PreToolUse hook should surface the nugget via additionalContext.
    const hookPath = path.join(workDir, 'hosts', 'claude', 'hooks', 'question-preference-hook');
    const markedQuestion = {
      question: '<gstack-qid:plan-eng-review-test-gap> Add tests for this gap?',
      options: ['A) Add (recommended)', 'B) Skip'],
    };
    const hookInput = JSON.stringify({
      session_id: 'cathedral-e2e-dream',
      tool_name: 'AskUserQuestion',
      tool_use_id: 'tu-dream-1',
      tool_input: { questions: [markedQuestion] },
      cwd: workDir,
    });
    const hookEnv = {
      ...process.env,
      GSTACK_STATE_ROOT: stateRoot,
      GSTACK_QUESTION_LOG_NO_DERIVE: '1',
    };

    const hookRes = spawnSync(hookPath, [], {
      env: hookEnv,
      input: hookInput,
      encoding: 'utf-8',
    });
    expect(hookRes.status).toBe(0);
    const decision = JSON.parse(hookRes.stdout || '{}').hookSpecificOutput;
    expect(decision?.additionalContext).toContain('User wants every fix tested');
  });
});