diff --git a/test/helpers/touchfiles.ts b/test/helpers/touchfiles.ts index ca9957c0e..3902b968e 100644 --- a/test/helpers/touchfiles.ts +++ b/test/helpers/touchfiles.ts @@ -111,7 +111,12 @@ export const E2E_TOUCHFILES: Record = { // written a never-ask preference, AUQ should still auto-decide rather than // surfacing the question. Touches the question-tuning + preference // infrastructure plus the resolvers that own the AUTO_DECIDE preamble. - 'auto-decide-preserved': ['scripts/resolvers/question-tuning.ts', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble/generate-completion-status.ts', 'plan-ceo-review/**', 'bin/gstack-question-preference', 'bin/gstack-config', 'bin/gstack-slug', 'test/helpers/claude-pty-runner.ts'], + 'auto-decide-preserved': ['scripts/resolvers/question-tuning.ts', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble/generate-preamble-bash.ts', 'scripts/resolvers/preamble/generate-completion-status.ts', 'plan-ceo-review/**', 'bin/gstack-question-preference', 'bin/gstack-config', 'bin/gstack-slug', 'hosts/claude/hooks/question-preference-hook.ts', 'lib/is-conductor.ts', 'test/helpers/claude-pty-runner.ts'], + + // Conductor → prose decision brief (Conductor signal makes prose the default; + // the PreToolUse hook denies the flaky tool). Touches the resolver that owns + // the Conductor rule, the preamble signal, the hook, and the detection helper. + 'conductor-prose': ['scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble/generate-preamble-bash.ts', 'scripts/resolvers/preamble.ts', 'plan-eng-review/**', 'hosts/claude/hooks/question-preference-hook.ts', 'lib/is-conductor.ts', 'test/helpers/claude-pty-runner.ts', 'test/skill-e2e-conductor-prose.test.ts'], // Real-PTY E2E batch (#6 new tests on the harness). // Each one tests behavior the SDK harness can't observe (rendered TTY, @@ -505,6 +510,7 @@ export const E2E_TIERS: Record = { // v1.21+ auto-mode regression tests 'office-hours-auto-mode': 'gate', 'auto-decide-preserved': 'periodic', + 'conductor-prose': 'periodic', 'e2e-harness-audit': 'gate', // Real-PTY E2E batch — tier classification: diff --git a/test/skill-e2e-auto-decide-preserved.test.ts b/test/skill-e2e-auto-decide-preserved.test.ts index 8b773d5fc..7e74d8438 100644 --- a/test/skill-e2e-auto-decide-preserved.test.ts +++ b/test/skill-e2e-auto-decide-preserved.test.ts @@ -100,11 +100,19 @@ describeE2E('AUTO_DECIDE opt-in preserved under Conductor flags (periodic)', () } // 4. Run /plan-ceo-review with the Conductor flag set + isolated state. + // GSTACK_HOME=tmpHome is REQUIRED: the preference + question_tuning were + // seeded there. Without it the spawned claude reads the real ~/.gstack, + // never sees the never-ask preference, and the test silently exercises + // the wrong state root (pre-existing bug, Codex #9 / Issue 13). + // CONDUCTOR_WORKSPACE_PATH additionally proves auto-decide still WINS + // over the Conductor prose redirect (precedence: settled preference + // beats transport-avoidance). const obs = await runPlanSkillObservation({ skillName: 'plan-ceo-review', inPlanMode: true, extraArgs: ['--disallowedTools', 'AskUserQuestion'], timeoutMs: 300_000, + env: { GSTACK_HOME: tmpHome, CONDUCTOR_WORKSPACE_PATH: tmpHome }, }); // 5. Pass: 'auto_decided' (the strongest signal) or 'plan_ready' with diff --git a/test/skill-e2e-conductor-prose.test.ts b/test/skill-e2e-conductor-prose.test.ts new file mode 100644 index 000000000..a130dcab5 --- /dev/null +++ b/test/skill-e2e-conductor-prose.test.ts @@ -0,0 +1,69 @@ +/** + * Conductor → prose decision brief (periodic-tier, paid, real-PTY). + * + * Proves the end-to-end behavior: when CONDUCTOR_SESSION is signalled, a skill + * that hits a decision renders a PROSE decision brief and waits, instead of + * silently skipping the user. + * + * SCOPE — read before trusting this as the Conductor guard. This is END-TO-END + * BEHAVIOR coverage, NOT the discriminating Conductor guarantee: + * - The deterministic guard is test/question-preference-hook.test.ts + * ("Conductor prose redirect") — it sets process.env.CONDUCTOR_* and asserts + * the PreToolUse hook denies + redirects. That test CAN fail on unfixed code. + * - The PTY harness here cannot register `mcp__conductor__AskUserQuestion`, so + * it tests "native AUQ unavailable + Conductor signal → prose," NOT "the MCP + * variant exists and must not be called" (Codex #10). Under --disallowedTools + * a present-human interactive session already prose-falls-back, so this test + * is a smoke check that the Conductor path still produces a prose brief, not + * a proof that the Conductor signal (vs the generic fallback) drove it. + * + * Periodic tier: model-behavior, non-deterministic. + */ + +import { describe, test, expect } from 'bun:test'; +import { runPlanSkillObservation } from './helpers/claude-pty-runner'; + +const shouldRun = !!process.env.EVALS && process.env.EVALS_TIER === 'periodic'; +const describeE2E = shouldRun ? describe : describe.skip; + +const FLAWED_PLAN = `# Plan: add a "developer-friendly" pricing tier + +## Goal +Increase developer adoption. + +## Premise +No tests mentioned, no rollout plan, no auth check on the upgrade endpoint. +Adds a Stripe tier, a React pricing page, a Postgres entitlements table, and a +Redis cache. The team "feels like" it should be cheaper; no developer was asked. +`; + +describeE2E('Conductor renders decisions as prose (periodic)', () => { + test('plan-eng-review in a Conductor session surfaces a PROSE decision brief, not a silent skip', async () => { + const obs = await runPlanSkillObservation({ + skillName: 'plan-eng-review', + inPlanMode: true, + // Mimic Conductor: native AUQ disabled + the Conductor env signal present. + extraArgs: ['--disallowedTools', 'AskUserQuestion'], + env: { CONDUCTOR_WORKSPACE_PATH: '/tmp/conductor-prose-e2e' }, + initialPlanContent: FLAWED_PLAN, + timeoutMs: 300_000, + }); + + // The decision must reach the human as prose. 'silent_write' (wrote findings + // to the plan without asking) is the precise failure we guard against. + if (obs.outcome === 'silent_write') { + throw new Error( + `Conductor prose regression: skill wrote findings without surfacing a decision.\n` + + `summary: ${obs.summary}\n--- evidence ---\n${obs.evidence}`, + ); + } + if (obs.outcome === 'exited' || obs.outcome === 'timeout') { + throw new Error( + `Conductor prose test inconclusive: outcome=${obs.outcome}\n` + + `summary: ${obs.summary}\n--- evidence ---\n${obs.evidence}`, + ); + } + // A prose-rendered decision brief was observed at some point in the run. + expect(obs.proseAUQEverObserved).toBe(true); + }, 360_000); +});