test: E2E for Conductor prose + fix auto-decide-preserved GSTACK_HOME bug

- New skill-e2e-conductor-prose (periodic): Conductor env + plan-eng-review surfaces a prose decision brief, not a silent skip. Header documents that this is end-to-end behavior coverage; the deterministic Conductor guard is the question-preference-hook unit test (the PTY harness can't register the MCP variant — Codex #10).
- Fix the pre-existing bug in auto-decide-preserved: it seeded the never-ask preference under GSTACK_HOME=tmpHome but never passed GSTACK_HOME into the PTY run, so the spawned claude read the real ~/.gstack and the preference was inert (Codex #9). Now passes GSTACK_HOME + CONDUCTOR_WORKSPACE_PATH to prove auto-decide still wins over the Conductor prose redirect.
- Register both in touchfiles (periodic tier).

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
2026-08-02 20:38:37 +02:00 · 2026-06-11 20:19:10 -07:00
parent ec63a2d25b
commit 788f35f021
3 changed files with 84 additions and 1 deletions
@@ -111,7 +111,12 @@ export const E2E_TOUCHFILES: Record<string, string[]> = {
  // written a never-ask preference, AUQ should still auto-decide rather than
  // surfacing the question. Touches the question-tuning + preference
  // infrastructure plus the resolvers that own the AUTO_DECIDE preamble.
-  'auto-decide-preserved':        ['scripts/resolvers/question-tuning.ts', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble/generate-completion-status.ts', 'plan-ceo-review/**', 'bin/gstack-question-preference', 'bin/gstack-config', 'bin/gstack-slug', 'test/helpers/claude-pty-runner.ts'],
+  'auto-decide-preserved':        ['scripts/resolvers/question-tuning.ts', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble/generate-preamble-bash.ts', 'scripts/resolvers/preamble/generate-completion-status.ts', 'plan-ceo-review/**', 'bin/gstack-question-preference', 'bin/gstack-config', 'bin/gstack-slug', 'hosts/claude/hooks/question-preference-hook.ts', 'lib/is-conductor.ts', 'test/helpers/claude-pty-runner.ts'],
+
+  // Conductor → prose decision brief (Conductor signal makes prose the default;
+  // the PreToolUse hook denies the flaky tool). Touches the resolver that owns
+  // the Conductor rule, the preamble signal, the hook, and the detection helper.
+  'conductor-prose':              ['scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble/generate-preamble-bash.ts', 'scripts/resolvers/preamble.ts', 'plan-eng-review/**', 'hosts/claude/hooks/question-preference-hook.ts', 'lib/is-conductor.ts', 'test/helpers/claude-pty-runner.ts', 'test/skill-e2e-conductor-prose.test.ts'],

  // Real-PTY E2E batch (#6 new tests on the harness).
  // Each one tests behavior the SDK harness can't observe (rendered TTY,
@@ -505,6 +510,7 @@ export const E2E_TIERS: Record<string, 'gate' | 'periodic'> = {
  // v1.21+ auto-mode regression tests
  'office-hours-auto-mode': 'gate',
  'auto-decide-preserved': 'periodic',
+  'conductor-prose': 'periodic',
  'e2e-harness-audit': 'gate',

  // Real-PTY E2E batch — tier classification:
@@ -100,11 +100,19 @@ describeE2E('AUTO_DECIDE opt-in preserved under Conductor flags (periodic)', ()
      }

      // 4. Run /plan-ceo-review with the Conductor flag set + isolated state.
+      //    GSTACK_HOME=tmpHome is REQUIRED: the preference + question_tuning were
+      //    seeded there. Without it the spawned claude reads the real ~/.gstack,
+      //    never sees the never-ask preference, and the test silently exercises
+      //    the wrong state root (pre-existing bug, Codex #9 / Issue 13).
+      //    CONDUCTOR_WORKSPACE_PATH additionally proves auto-decide still WINS
+      //    over the Conductor prose redirect (precedence: settled preference
+      //    beats transport-avoidance).
      const obs = await runPlanSkillObservation({
        skillName: 'plan-ceo-review',
        inPlanMode: true,
        extraArgs: ['--disallowedTools', 'AskUserQuestion'],
        timeoutMs: 300_000,
+        env: { GSTACK_HOME: tmpHome, CONDUCTOR_WORKSPACE_PATH: tmpHome },
      });

      // 5. Pass: 'auto_decided' (the strongest signal) or 'plan_ready' with
@@ -0,0 +1,69 @@
+/**
+ * Conductor → prose decision brief (periodic-tier, paid, real-PTY).
+ *
+ * Proves the end-to-end behavior: when CONDUCTOR_SESSION is signalled, a skill
+ * that hits a decision renders a PROSE decision brief and waits, instead of
+ * silently skipping the user.
+ *
+ * SCOPE — read before trusting this as the Conductor guard. This is END-TO-END
+ * BEHAVIOR coverage, NOT the discriminating Conductor guarantee:
+ *   - The deterministic guard is test/question-preference-hook.test.ts
+ *     ("Conductor prose redirect") — it sets process.env.CONDUCTOR_* and asserts
+ *     the PreToolUse hook denies + redirects. That test CAN fail on unfixed code.
+ *   - The PTY harness here cannot register `mcp__conductor__AskUserQuestion`, so
+ *     it tests "native AUQ unavailable + Conductor signal → prose," NOT "the MCP
+ *     variant exists and must not be called" (Codex #10). Under --disallowedTools,
+ *     an interactive session with a human present already falls back to prose, so
+ *     this test is a smoke check that the Conductor path still yields a prose brief,
+ *     not proof that the Conductor signal (vs. the generic fallback) drove it.
+ *
+ * Periodic tier: model-behavior, non-deterministic.
+ */
+
+import { describe, test, expect } from 'bun:test';
+import { runPlanSkillObservation } from './helpers/claude-pty-runner';
+
+const shouldRun = !!process.env.EVALS && process.env.EVALS_TIER === 'periodic'; // opt-in: EVALS must be set AND EVALS_TIER must be exactly 'periodic'
+const describeE2E = shouldRun ? describe : describe.skip; // otherwise the suite is registered via describe.skip
+
+const FLAWED_PLAN = `# Plan: add a "developer-friendly" pricing tier
+
+## Goal
+Increase developer adoption.
+
+## Premise
+No tests mentioned, no rollout plan, no auth check on the upgrade endpoint.
+Adds a Stripe tier, a React pricing page, a Postgres entitlements table, and a
+Redis cache. The team "feels like" it should be cheaper; no developer was asked.
+`; // Passed to the runner as initialPlanContent; its visible gaps (no tests, no rollout, no auth check) are presumably there to force a decision point.
+
+describeE2E('Conductor renders decisions as prose (periodic)', () => {
+  test('plan-eng-review in a Conductor session surfaces a PROSE decision brief, not a silent skip', async () => {
+    const obs = await runPlanSkillObservation({
+      skillName: 'plan-eng-review',
+      inPlanMode: true,
+      // Mimic Conductor: the native AskUserQuestion tool is disallowed and a Conductor env var is set.
+      extraArgs: ['--disallowedTools', 'AskUserQuestion'],
+      env: { CONDUCTOR_WORKSPACE_PATH: '/tmp/conductor-prose-e2e' },
+      initialPlanContent: FLAWED_PLAN,
+      timeoutMs: 300_000, // smaller than the 360_000 timeout handed to test() at the end of this block
+    });
+
+    // The decision must reach the human as prose. 'silent_write' (wrote findings
+    // to the plan without asking) is the precise failure this test guards against.
+    if (obs.outcome === 'silent_write') {
+      throw new Error(
+        `Conductor prose regression: skill wrote findings without surfacing a decision.\n` +
+          `summary: ${obs.summary}\n--- evidence ---\n${obs.evidence}`,
+      );
+    }
+    if (obs.outcome === 'exited' || obs.outcome === 'timeout') { // no verdict either way: fail as inconclusive, with evidence
+      throw new Error(
+        `Conductor prose test inconclusive: outcome=${obs.outcome}\n` +
+          `summary: ${obs.summary}\n--- evidence ---\n${obs.evidence}`,
+      );
+    }
+    // Any remaining outcome passes only if a prose-rendered decision brief was observed at some point in the run.
+    expect(obs.proseAUQEverObserved).toBe(true);
+  }, 360_000);
+});