// gstack/test/skill-e2e-conductor-prose.test.ts

/**
 * Conductor → prose decision brief (periodic-tier, paid, real-PTY).
 *
 * Proves the end-to-end behavior: when CONDUCTOR_SESSION is signalled (this
 * test passes CONDUCTOR_WORKSPACE_PATH through the runner's `env` option), a
 * skill that hits a decision renders a PROSE decision brief and waits, instead
 * of silently skipping the user.
 *
 * SCOPE — read before trusting this as the Conductor guard. This is END-TO-END
 * BEHAVIOR coverage, NOT the discriminating Conductor guarantee:
 *   - The deterministic guard is test/question-preference-hook.test.ts
 *     ("Conductor prose redirect") — it sets process.env.CONDUCTOR_* and asserts
 *     the PreToolUse hook denies + redirects. That test CAN fail on unfixed code.
 *   - The PTY harness here cannot register `mcp__conductor__AskUserQuestion`, so
 *     it tests "native AUQ unavailable + Conductor signal → prose," NOT "the MCP
 *     variant exists and must not be called" (Codex #10). Under --disallowedTools
 *     a present-human interactive session already prose-falls-back, so this test
 *     is a smoke check that the Conductor path still produces a prose brief, not
 *     a proof that the Conductor signal (vs the generic fallback) drove it.
 *
 * Periodic tier: model-behavior, non-deterministic.
 */

import { describe, test, expect } from 'bun:test';
import { runPlanSkillObservation } from './helpers/claude-pty-runner';

// Opt-in gate for this paid, real-PTY suite: EVALS must be set to a non-empty
// value AND EVALS_TIER must be exactly 'periodic'.
const shouldRun =
  Boolean(process.env.EVALS) && process.env.EVALS_TIER === 'periodic';
// Suite registrar: when the gate is closed the suite is still declared, but via
// describe.skip; when it is open the regular describe is used.
const describeE2E = !shouldRun ? describe.skip : describe;

/**
 * Plan fixture handed to the skill as the initial plan content.
 *
 * The plan text itself lists its own gaps (no tests, no rollout plan, no auth
 * check, an unvalidated premise). It is assembled line by line and joined with
 * '\n'; the trailing empty entry supplies the final newline.
 */
const FLAWED_PLAN = [
  '# Plan: add a "developer-friendly" pricing tier',
  '',
  '## Goal',
  'Increase developer adoption.',
  '',
  '## Premise',
  'No tests mentioned, no rollout plan, no auth check on the upgrade endpoint.',
  'Adds a Stripe tier, a React pricing page, a Postgres entitlements table, and a',
  'Redis cache. The team "feels like" it should be cheaper; no developer was asked.',
  '',
].join('\n');

describeE2E('Conductor renders decisions as prose (periodic)', () => {
  /**
   * Runs plan-eng-review in plan mode over FLAWED_PLAN with the native
   * AskUserQuestion tool disallowed and a Conductor env variable set, then
   * classifies the observation:
   *   - 'silent_write'       -> hard failure (the regression this test guards).
   *   - 'exited' / 'timeout' -> inconclusive run, reported as a failure.
   *   - anything else        -> must have shown a prose decision brief at
   *                             some point during the run.
   */
  test(
    'plan-eng-review in a Conductor session surfaces a PROSE decision brief, not a silent skip',
    async () => {
      // Conductor stand-in: native AUQ is switched off and the Conductor
      // workspace variable is present in the env handed to the runner.
      const conductorLikeOptions = {
        skillName: 'plan-eng-review',
        inPlanMode: true,
        extraArgs: ['--disallowedTools', 'AskUserQuestion'],
        env: { CONDUCTOR_WORKSPACE_PATH: '/tmp/conductor-prose-e2e' },
        initialPlanContent: FLAWED_PLAN,
        timeoutMs: 300_000,
      };
      const observation = await runPlanSkillObservation(conductorLikeOptions);

      // Shared tail of both failure messages; built lazily so it is only
      // evaluated when a failure is actually being reported.
      const diagnostics = (): string =>
        `summary: ${observation.summary}\n--- evidence ---\n${observation.evidence}`;

      switch (observation.outcome) {
        // Findings were written into the plan without asking the human:
        // exactly the failure mode this test exists to catch.
        case 'silent_write':
          throw new Error(
            `Conductor prose regression: skill wrote findings without surfacing a decision.\n${diagnostics()}`,
          );
        // The run ended without a usable signal either way.
        case 'exited':
        case 'timeout':
          throw new Error(
            `Conductor prose test inconclusive: outcome=${observation.outcome}\n${diagnostics()}`,
          );
      }

      // For every remaining outcome, the run must have displayed a
      // prose-rendered decision brief at least once.
      expect(observation.proseAUQEverObserved).toBe(true);
    },
    // Test-level timeout sits above the harness's 300_000 ms timeoutMs
    // (presumably so the harness can report its own 'timeout' outcome first).
    360_000,
  );
});