// gstack/test/skill-e2e-plan-ceo-finding-count.test.ts

/**
 * /plan-ceo-review per-finding AskUserQuestion count (periodic, paid, real-PTY).
 *
 * Asserts the load-bearing rule "One issue = one AskUserQuestion call" by
 * driving /plan-ceo-review against a 5-finding seeded plan and counting
 * distinct review-phase AUQs. Passes when count is in [N-1, N+2].
 *
 * Two tests in this file:
 *   - 5-finding distinct fixture: count band assertion + D19 review-report-at-bottom.
 *   - 2-finding paired control (D12 positive control): related findings still
 *     produce 2 distinct AUQs, not 1 batched, when the rule is honored.
 *
 * Tier: periodic. Each run drives Step 0 + 11 review sections end-to-end
 * (~25 min, ~$5/run). Sequential by default per plan §D15. See
 * test/helpers/claude-pty-runner.ts for runPlanSkillCounting internals.
 */

import { describe, test } from 'bun:test';
import * as fs from 'node:fs';
import {
  runPlanSkillCounting,
  ceoStep0Boundary,
  assertReviewReportAtBottom,
  type AskUserQuestionFingerprint,
} from './helpers/claude-pty-runner';

/**
 * Chooses the answer for /plan-ceo-review's opening "what scope?" AUQ.
 *
 * That question offers options such as:
 *   1. Branch diff vs main
 *   2. A specific plan file or design doc
 *   3. An idea you'll describe inline
 *   ...
 *   7. Skip interview and plan immediately
 *
 * Option 1 is the default, but it points the agent at "branch diff vs main",
 * which is the wrong target for the seeded fixture (the agent would end up
 * reviewing the gstack PR itself, recursively). "Skip interview and plan
 * immediately" bypasses Step 0 so the agent reviews the chat context, which
 * is where the follow-up plan was pasted. "Describe inline" is the second
 * choice because it also routes to the pasted plan.
 *
 * @param fp Fingerprint of the AUQ being answered; its option labels are matched.
 * @returns The `index` of the best-matching option, or 1 when no label matches.
 */
function pickSkipInterview(fp: AskUserQuestionFingerprint): number {
  // Ordered by preference: every option is checked against the first pattern
  // before the second pattern is tried at all.
  const labelPreferences = [
    /skip\s+interview|plan\s+immediately/i,
    /describe.*inline|inline.*idea/i,
  ];

  for (const pattern of labelPreferences) {
    const chosen = fp.options.find((opt) => pattern.test(opt.label));
    if (chosen) {
      return chosen.index;
    }
  }

  // Nothing recognizable: fall back to the first option.
  return 1;
}

// Gate for this suite: it only runs when EVALS is set to a non-empty value
// AND EVALS_TIER is exactly 'periodic'; otherwise the suite is registered as skipped.
const shouldRun = process.env.EVALS_TIER === 'periodic' && Boolean(process.env.EVALS);
const describeE2E = shouldRun ? describe : describe.skip;

// 5-finding fixture: the accepted review-phase AUQ band is [N-1, N+2] = [4, 7] (D11).
const N_DISTINCT = 5;
const FLOOR_DISTINCT = N_DISTINCT - 1;
const CEILING_DISTINCT = N_DISTINCT + 2;

// Paired control: the accepted band is [N, N+2] = [2, 4] for the 2 related findings.
const N_PAIRED = 2;
const FLOOR_PAIRED = N_PAIRED;
const CEILING_PAIRED = N_PAIRED + 2;

// Plan-file paths the agent is told to write to. They are declared before the
// prompts so each prompt interpolates the same constant that the tests later
// check and clean up — the path exists in exactly one place.
const PLAN_CEO_PATH = '/tmp/gstack-test-plan-ceo.md';
const PLAN_CEO_PAIRED_PATH = '/tmp/gstack-test-plan-ceo-paired.md';

/**
 * Follow-up prompt for the 5-finding test: an instruction line naming the
 * plan-file path, followed by a plan with five `##` sections (Architecture,
 * Database access, Webhook fan-out, Tests, Performance).
 */
const PLAN_CEO_5_FINDINGS = [
  `Please review this plan thoroughly. As you go, write your plan-mode plan to ${PLAN_CEO_PATH} (use Edit/Write to that exact path).`,
  '',
  '# Plan: Payment Processing Integration',
  '',
  '## Architecture',
  "We're adding a new `PaymentService` class that will handle Stripe webhooks.",
  'This bypasses the existing `WebhookDispatcher` module — we want a clean',
  'namespace separation.',
  '',
  '## Database access',
  'The new endpoint reads `request.params.userId` directly into a raw SQL',
  'fragment for the lookup query.',
  '',
  '## Webhook fan-out',
  'On payment success we update the user record AND fire a notification email.',
  'Both happen inline; no error handling on the email leg.',
  '',
  '## Tests',
  "None planned. We'll rely on the existing integration suite catching regressions.",
  '',
  '## Performance',
  'Each webhook lookup hits the database for the user, then fetches each',
  'order in a loop.',
].join('\n');

/**
 * Follow-up prompt for the paired control: one `## Tests` section listing two
 * related but deliberately separate gaps (happy path and error/timeout path).
 */
const PLAN_CEO_2_PAIRED_FINDINGS = [
  `Please review this plan thoroughly. As you go, write your plan-mode plan to ${PLAN_CEO_PAIRED_PATH} (use Edit/Write to that exact path).`,
  '',
  '# Plan: Payment Processing — Test Coverage',
  '',
  '## Tests',
  'We need test coverage for `processPayment()`. Specifically:',
  '1. The happy path (successful Stripe charge — assert correct receipt is generated).',
  '2. The error/timeout path (Stripe returns 502 — assert retry-with-backoff fires once, then fails clean).',
  '',
  'Currently neither has a unit test. These are deliberately separate concerns:',
  'the success path is correctness, the failure path is graceful degradation.',
].join('\n');

/**
 * Outcomes reported by runPlanSkillCounting that both tests accept. Any other
 * outcome fails the test before the count band is checked.
 */
const ACCEPTED_OUTCOMES: readonly string[] = [
  'plan_ready',
  'completion_summary',
  'ceiling_reached',
];

/**
 * Deletes a plan file, ignoring every failure. Used both before a run (so a
 * stale file cannot satisfy the plan-file check) and after it (cleanup).
 */
function removePlanFile(path: string): void {
  try {
    fs.rmSync(path, { force: true });
  } catch {
    /* best-effort */
  }
}

/**
 * Renders the review-phase fingerprints (those not flagged `preReview`) as one
 * quoted prompt snippet per line, truncated to 80 characters, for failure messages.
 */
function formatReviewFingerprints(
  fingerprints: ReadonlyArray<{ preReview?: boolean; promptSnippet: string }>,
): string {
  return fingerprints
    .filter((f) => !f.preReview)
    .map((f) => `  - "${f.promptSnippet.slice(0, 80)}"`)
    .join('\n');
}

describeE2E('/plan-ceo-review per-finding AskUserQuestion count (periodic)', () => {
  test(
    `5-finding plan emits ${FLOOR_DISTINCT}-${CEILING_DISTINCT} review-phase AskUserQuestions`,
    async () => {
      removePlanFile(PLAN_CEO_PATH);

      // The run itself sits inside the try so the plan file is removed even
      // when runPlanSkillCounting rejects.
      try {
        const obs = await runPlanSkillCounting({
          skillName: 'plan-ceo-review',
          slashCommand: '/plan-ceo-review',
          followUpPrompt: PLAN_CEO_5_FINDINGS,
          isLastStep0AUQ: ceoStep0Boundary,
          reviewCountCeiling: CEILING_DISTINCT + 1, // hard cap above assertion ceiling
          firstAUQPick: pickSkipInterview, // bypass scope-selection, route to review
          cwd: process.cwd(),
          timeoutMs: 1_500_000, // 25 min
          env: { QUESTION_TUNING: 'false', EXPLAIN_LEVEL: 'default' },
        });

        if (!ACCEPTED_OUTCOMES.includes(obs.outcome)) {
          throw new Error(
            `plan-ceo-review finding-count FAILED: outcome=${obs.outcome}\n` +
              `step0=${obs.step0Count} review=${obs.reviewCount} elapsed=${obs.elapsedMs}ms\n` +
              `fingerprints (last 8):\n` +
              obs.fingerprints
                .slice(-8)
                .map(
                  (f, i) =>
                    `  ${i}. preReview=${f.preReview} sig=${f.signature.slice(0, 12)} prompt="${f.promptSnippet.slice(0, 60)}"`,
                )
                .join('\n') +
              `\n--- evidence (last 3KB) ---\n${obs.evidence}`,
          );
        }
        if (obs.reviewCount < FLOOR_DISTINCT) {
          throw new Error(
            `BAND FAIL (below floor): reviewCount=${obs.reviewCount} < FLOOR=${FLOOR_DISTINCT}.\n` +
              `Likely batching regression — agent collapsed multiple findings into fewer questions.\n` +
              `Fingerprints (review-phase only):\n` +
              formatReviewFingerprints(obs.fingerprints),
          );
        }
        if (obs.reviewCount > CEILING_DISTINCT) {
          throw new Error(
            `BAND FAIL (above ceiling): reviewCount=${obs.reviewCount} > CEILING=${CEILING_DISTINCT}.\n` +
              `Possible over-asking regression. Review-phase fingerprints:\n` +
              formatReviewFingerprints(obs.fingerprints),
          );
        }

        // D19: review report at bottom of plan file.
        if (!fs.existsSync(PLAN_CEO_PATH)) {
          throw new Error(
            `D19 FAIL: agent did not produce expected plan file at ${PLAN_CEO_PATH}.\n` +
              `Either the agent ignored the path instruction in the follow-up prompt, or\n` +
              `the helper exited before the agent wrote the file. ` +
              `outcome=${obs.outcome} review=${obs.reviewCount}`,
          );
        }
        const planContent = fs.readFileSync(PLAN_CEO_PATH, 'utf-8');
        const verdict = assertReviewReportAtBottom(planContent);
        if (!verdict.ok) {
          throw new Error(
            `D19 FAIL: plan file at ${PLAN_CEO_PATH} ${verdict.reason}\n` +
              (verdict.trailingHeadings
                ? `Trailing headings: ${verdict.trailingHeadings.join(' | ')}\n`
                : '') +
              `--- plan content (last 1KB) ---\n${planContent.slice(-1024)}`,
          );
        }
      } finally {
        removePlanFile(PLAN_CEO_PATH);
      }
    },
    1_700_000, // per-test timeout (ms); leaves headroom above the run's 1_500_000 ms timeoutMs
  );

  test(
    `paired-finding positive control: ${N_PAIRED} related findings produce ${FLOOR_PAIRED}-${CEILING_PAIRED} AskUserQuestions`,
    async () => {
      removePlanFile(PLAN_CEO_PAIRED_PATH);

      try {
        const obs = await runPlanSkillCounting({
          skillName: 'plan-ceo-review',
          slashCommand: '/plan-ceo-review',
          followUpPrompt: PLAN_CEO_2_PAIRED_FINDINGS,
          isLastStep0AUQ: ceoStep0Boundary,
          reviewCountCeiling: CEILING_PAIRED + 1,
          // Same first-question routing as the 5-finding test, so the control
          // reviews the pasted plan instead of whatever the default first pick targets.
          firstAUQPick: pickSkipInterview,
          cwd: process.cwd(),
          timeoutMs: 1_500_000,
          env: { QUESTION_TUNING: 'false', EXPLAIN_LEVEL: 'default' },
        });

        if (!ACCEPTED_OUTCOMES.includes(obs.outcome)) {
          throw new Error(
            `paired-finding control FAILED: outcome=${obs.outcome}\n` +
              `step0=${obs.step0Count} review=${obs.reviewCount}\n` +
              `--- evidence (last 3KB) ---\n${obs.evidence}`,
          );
        }
        if (obs.reviewCount < FLOOR_PAIRED) {
          throw new Error(
            `PAIRED CONTROL FAIL: reviewCount=${obs.reviewCount} < FLOOR=${FLOOR_PAIRED}.\n` +
              `Two deliberately related findings were batched into <2 questions — the rule failed under D12.\n` +
              `Review-phase fingerprints:\n` +
              formatReviewFingerprints(obs.fingerprints),
          );
        }
        if (obs.reviewCount > CEILING_PAIRED) {
          throw new Error(
            `PAIRED CONTROL FAIL: reviewCount=${obs.reviewCount} > CEILING=${CEILING_PAIRED} (over-asking on a 2-finding fixture).`,
          );
        }
      } finally {
        removePlanFile(PLAN_CEO_PAIRED_PATH);
      }
    },
    1_700_000,
  );
});