test: add four per-finding count E2E tests (plan-ceo + eng + design + devex)

Each test drives its plan-* skill through Step 0, then asserts that the review-phase AskUserQuestion (AUQ) count falls in [N-1, N+2] for an N=5 seeded plan, plus D19: the produced plan file has "## GSTACK REVIEW REPORT" as its last "## " heading. plan-ceo also runs a paired-finding positive control: 2 deliberately related findings should still produce 2 distinct AUQs, not 1 batched. The tests are periodic-tier (skipped unless EVALS is set, e.g. EVALS=1, and EVALS_TIER=periodic) and run sequentially per plan §D15. Each fixture is a plan defined inline in the TypeScript test file and delivered as a follow-up message after the slash command, per the proven pattern at skill-e2e-plan-design-with-ui.test.ts. The calibration loop (5 runs per skill) and the manual pre-merge negative check (D7 + D12) are required before merge per plan §Verification. NOT yet run. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-02 03:35:09 +02:00 · 2026-04-28 20:12:32 -07:00
parent 0b04ca8486
commit f479134fba
4 changed files with 628 additions and 0 deletions
@@ -0,0 +1,224 @@
+/**
+ * /plan-ceo-review per-finding AskUserQuestion count (periodic, paid, real-PTY).
+ *
+ * Asserts the load-bearing rule "One issue = one AskUserQuestion call" by
+ * driving /plan-ceo-review against a 5-finding seeded plan and counting
+ * distinct review-phase AUQs. Passes when count is in [N-1, N+2].
+ *
+ * Two tests in this file:
+ *   - 5-finding distinct fixture: count band assertion + D19 review-report-at-bottom.
+ *   - 2-finding paired control (D12 positive control): related findings still
+ *     produce 2 distinct AUQs, not 1 batched, when the rule is honored.
+ *
+ * Tier: periodic. Each run drives Step 0 + 11 review sections end-to-end
+ * (~25 min, ~$5/run). Sequential by default per plan §D15. See
+ * test/helpers/claude-pty-runner.ts for runPlanSkillCounting internals.
+ */
+
+import { describe, test } from 'bun:test';
+import * as fs from 'node:fs';
+import {
+  runPlanSkillCounting,
+  ceoStep0Boundary,
+  assertReviewReportAtBottom,
+} from './helpers/claude-pty-runner';
+
+const shouldRun = !!process.env.EVALS && process.env.EVALS_TIER === 'periodic'; // opt-in gate for this paid suite
+const describeE2E = shouldRun ? describe : describe.skip; // suite is skipped when the gate is off
+
+const N_DISTINCT = 5; // findings seeded in PLAN_CEO_5_FINDINGS (one per "## " section)
+const FLOOR_DISTINCT = N_DISTINCT - 1; // 4 (D11)
+const CEILING_DISTINCT = N_DISTINCT + 2; // 7 (D11)
+
+const N_PAIRED = 2; // findings seeded in PLAN_CEO_2_PAIRED_FINDINGS (the two numbered items)
+const FLOOR_PAIRED = N_PAIRED; // 2: the control tolerates no batching (D12)
+const CEILING_PAIRED = N_PAIRED + 2; // 4: same +2 over-asking headroom as the distinct band
+
+const PLAN_CEO_PATH = '/tmp/gstack-test-plan-ceo.md'; // interpolated into the prompt below, so prompt and checks cannot drift
+const PLAN_CEO_PAIRED_PATH = '/tmp/gstack-test-plan-ceo-paired.md'; // same, for the paired-control prompt
+
+const PLAN_CEO_5_FINDINGS = [
+  `Please review this plan thoroughly. As you go, write your plan-mode plan to ${PLAN_CEO_PATH} (use Edit/Write to that exact path).`,
+  '',
+  '# Plan: Payment Processing Integration',
+  '',
+  '## Architecture',
+  "We're adding a new `PaymentService` class that will handle Stripe webhooks.",
+  'This bypasses the existing `WebhookDispatcher` module — we want a clean',
+  'namespace separation.',
+  '',
+  '## Database access',
+  'The new endpoint reads `request.params.userId` directly into a raw SQL',
+  'fragment for the lookup query.',
+  '',
+  '## Webhook fan-out',
+  'On payment success we update the user record AND fire a notification email.',
+  'Both happen inline; no error handling on the email leg.',
+  '',
+  '## Tests',
+  "None planned. We'll rely on the existing integration suite catching regressions.",
+  '',
+  '## Performance',
+  'Each webhook lookup hits the database for the user, then fetches each',
+  'order in a loop.',
+].join('\n');
+
+const PLAN_CEO_2_PAIRED_FINDINGS = [
+  `Please review this plan thoroughly. As you go, write your plan-mode plan to ${PLAN_CEO_PAIRED_PATH} (use Edit/Write to that exact path).`,
+  '',
+  '# Plan: Payment Processing — Test Coverage',
+  '',
+  '## Tests',
+  'We need test coverage for `processPayment()`. Specifically:',
+  '1. The happy path (successful Stripe charge — assert correct receipt is generated).',
+  '2. The error/timeout path (Stripe returns 502 — assert retry-with-backoff fires once, then fails clean).',
+  '',
+  'Currently neither has a unit test. These are deliberately separate concerns:',
+  'the success path is correctness, the failure path is graceful degradation.',
+].join('\n');
+
+describeE2E('/plan-ceo-review per-finding AskUserQuestion count (periodic)', () => { // two paid PTY runs; skipped unless gated on
+  test(
+    `5-finding plan emits ${FLOOR_DISTINCT}-${CEILING_DISTINCT} review-phase AskUserQuestions`,
+    async () => {
+      try {
+        fs.rmSync(PLAN_CEO_PATH, { force: true });
+      } catch {
+        /* best-effort: clear any plan file left by an earlier run before the D19 check */
+      }
+
+      try { // wraps the run too, so the finally cleanup fires even if the runner throws
+        const obs = await runPlanSkillCounting({
+          skillName: 'plan-ceo-review',
+          slashCommand: '/plan-ceo-review',
+          followUpPrompt: PLAN_CEO_5_FINDINGS,
+          isLastStep0AUQ: ceoStep0Boundary,
+          reviewCountCeiling: CEILING_DISTINCT + 1, // hard cap above assertion ceiling
+          cwd: process.cwd(),
+          timeoutMs: 1_500_000, // 25 min
+          env: { QUESTION_TUNING: 'false', EXPLAIN_LEVEL: 'default' },
+        });
+
+        if (!['plan_ready', 'completion_summary', 'ceiling_reached'].includes(obs.outcome)) { // any other outcome fails with diagnostics
+          throw new Error(
+            `plan-ceo-review finding-count FAILED: outcome=${obs.outcome}\n` +
+              `step0=${obs.step0Count} review=${obs.reviewCount} elapsed=${obs.elapsedMs}ms\n` +
+              `fingerprints (last 8):\n` +
+              obs.fingerprints
+                .slice(-8)
+                .map(
+                  (f, i) =>
+                    `  ${i}. preReview=${f.preReview} sig=${f.signature.slice(0, 12)} prompt="${f.promptSnippet.slice(0, 60)}"`,
+                )
+                .join('\n') +
+              `\n--- evidence (last 3KB) ---\n${obs.evidence}`,
+          );
+        }
+        if (obs.reviewCount < FLOOR_DISTINCT) { // band floor: fewer questions than N-1
+          throw new Error(
+            `BAND FAIL (below floor): reviewCount=${obs.reviewCount} < FLOOR=${FLOOR_DISTINCT}.\n` +
+              `Likely batching regression — agent collapsed multiple findings into fewer questions.\n` +
+              `Fingerprints (review-phase only):\n` +
+              obs.fingerprints
+                .filter((f) => !f.preReview)
+                .map((f) => `  - "${f.promptSnippet.slice(0, 80)}"`)
+                .join('\n'),
+          );
+        }
+        if (obs.reviewCount > CEILING_DISTINCT) { // band ceiling: more questions than N+2
+          throw new Error(
+            `BAND FAIL (above ceiling): reviewCount=${obs.reviewCount} > CEILING=${CEILING_DISTINCT}.\n` +
+              `Possible over-asking regression. Review-phase fingerprints:\n` +
+              obs.fingerprints
+                .filter((f) => !f.preReview)
+                .map((f) => `  - "${f.promptSnippet.slice(0, 80)}"`)
+                .join('\n'),
+          );
+        }
+
+        // D19: review report at bottom of plan file.
+        if (!fs.existsSync(PLAN_CEO_PATH)) {
+          throw new Error(
+            `D19 FAIL: agent did not produce expected plan file at ${PLAN_CEO_PATH}.\n` +
+              `Either the agent ignored the path instruction in the follow-up prompt, or\n` +
+              `the helper exited before the agent wrote the file. ` +
+              `outcome=${obs.outcome} review=${obs.reviewCount}`,
+          );
+        }
+        const planContent = fs.readFileSync(PLAN_CEO_PATH, 'utf-8');
+        const verdict = assertReviewReportAtBottom(planContent); // returns a verdict object; the test throws on !ok
+        if (!verdict.ok) {
+          throw new Error(
+            `D19 FAIL: plan file at ${PLAN_CEO_PATH} ${verdict.reason}\n` +
+              (verdict.trailingHeadings
+                ? `Trailing headings: ${verdict.trailingHeadings.join(' | ')}\n`
+                : '') +
+              `--- plan content (last 1KB) ---\n${planContent.slice(-1024)}`,
+          );
+        }
+      } finally {
+        try {
+          fs.rmSync(PLAN_CEO_PATH, { force: true });
+        } catch {
+          /* best-effort: a failed cleanup must not mask the test result */
+        }
+      }
+    },
+    1_700_000, // test-level timeout (~28 min), above the runner's 25 min timeoutMs
+  );
+
+  test(
+    `paired-finding positive control: ${N_PAIRED} related findings produce ${FLOOR_PAIRED}-${CEILING_PAIRED} AskUserQuestions`,
+    async () => {
+      try {
+        fs.rmSync(PLAN_CEO_PAIRED_PATH, { force: true });
+      } catch {
+        /* best-effort: clear any plan file left by an earlier run */
+      }
+
+      try { // wraps the run too, so the finally cleanup fires even if the runner throws
+        const obs = await runPlanSkillCounting({
+          skillName: 'plan-ceo-review',
+          slashCommand: '/plan-ceo-review',
+          followUpPrompt: PLAN_CEO_2_PAIRED_FINDINGS,
+          isLastStep0AUQ: ceoStep0Boundary,
+          reviewCountCeiling: CEILING_PAIRED + 1, // hard cap above assertion ceiling
+          cwd: process.cwd(),
+          timeoutMs: 1_500_000, // 25 min
+          env: { QUESTION_TUNING: 'false', EXPLAIN_LEVEL: 'default' },
+        });
+
+        if (!['plan_ready', 'completion_summary', 'ceiling_reached'].includes(obs.outcome)) { // any other outcome fails with diagnostics
+          throw new Error(
+            `paired-finding control FAILED: outcome=${obs.outcome}\n` +
+              `step0=${obs.step0Count} review=${obs.reviewCount}\n` +
+              `--- evidence (last 3KB) ---\n${obs.evidence}`,
+          );
+        }
+        if (obs.reviewCount < FLOOR_PAIRED) { // the two related findings were merged into fewer questions
+          throw new Error(
+            `PAIRED CONTROL FAIL: reviewCount=${obs.reviewCount} < FLOOR=${FLOOR_PAIRED}.\n` +
+              `Two deliberately related findings were batched into <2 questions — the rule failed under D12.\n` +
+              `Review-phase fingerprints:\n` +
+              obs.fingerprints
+                .filter((f) => !f.preReview)
+                .map((f) => `  - "${f.promptSnippet.slice(0, 80)}"`)
+                .join('\n'),
+          );
+        }
+        if (obs.reviewCount > CEILING_PAIRED) { // no D19 check in this control; only the count band is asserted
+          throw new Error(
+            `PAIRED CONTROL FAIL: reviewCount=${obs.reviewCount} > CEILING=${CEILING_PAIRED} (over-asking on a 2-finding fixture).`,
+          );
+        }
+      } finally {
+        try {
+          fs.rmSync(PLAN_CEO_PAIRED_PATH, { force: true });
+        } catch {
+          /* best-effort: a failed cleanup must not mask the test result */
+        }
+      }
+    },
+    1_700_000, // test-level timeout (~28 min), above the runner's 25 min timeoutMs
+  );
+});
@@ -0,0 +1,135 @@
+/**
+ * /plan-design-review per-finding AskUserQuestion count (periodic, paid, real-PTY).
+ *
+ * Same shape as skill-e2e-plan-ceo-finding-count: drives /plan-design-review
+ * against a 5-finding seeded plan and asserts review-phase AUQ count ∈ [N-1, N+2].
+ * Plus D19: review report at bottom of produced plan file.
+ *
+ * Tier: periodic (~25 min, ~$5/run). Sequential by default per plan §D15.
+ */
+
+import { describe, test } from 'bun:test';
+import * as fs from 'node:fs';
+import {
+  runPlanSkillCounting,
+  designStep0Boundary,
+  assertReviewReportAtBottom,
+} from './helpers/claude-pty-runner';
+
+const shouldRun = !!process.env.EVALS && process.env.EVALS_TIER === 'periodic'; // opt-in gate for this paid suite
+const describeE2E = shouldRun ? describe : describe.skip; // suite is skipped when the gate is off
+
+const N = 5; // findings seeded in PLAN_DESIGN_5_FINDINGS (one per "## " section)
+const FLOOR = N - 1; // 4
+const CEILING = N + 2; // 7
+
+const PLAN_DESIGN_PATH = '/tmp/gstack-test-plan-design.md'; // interpolated into the prompt below, so prompt and checks cannot drift
+
+const PLAN_DESIGN_5_FINDINGS = [
+  `Please review this plan thoroughly. As you go, write your plan-mode plan to ${PLAN_DESIGN_PATH} (use Edit/Write to that exact path).`,
+  '',
+  '# Plan: Settings Page UI redesign',
+  '',
+  '## Visual Hierarchy',
+  'The "Save" button is rendered with the same size, weight, and color as',
+  'three other buttons in the page header (Reset, Cancel, Export). Nothing',
+  'tells the user which is the primary action.',
+  '',
+  '## Spacing',
+  'Between sections we have 24px in some places, 32px in others, and 16px',
+  'in a third — no consistent vertical rhythm.',
+  '',
+  '## Color',
+  'The error message uses red text on a light pink background. Contrast',
+  'ratio is approximately 3:1 (below WCAG AA).',
+  '',
+  '## Typography',
+  'We use 14px, 16px, and 18px font sizes across the form labels. Two',
+  'sizes would suffice and create stronger hierarchy.',
+  '',
+  '## Motion',
+  'The "Save" action takes 2-5 seconds with no loading indicator. Users',
+  'see a frozen page; we should add a spinner or skeleton state.',
+].join('\n');
+
+describeE2E('/plan-design-review per-finding AskUserQuestion count (periodic)', () => { // one paid PTY run; skipped unless gated on
+  test(
+    `5-finding plan emits ${FLOOR}-${CEILING} review-phase AskUserQuestions`,
+    async () => {
+      try {
+        fs.rmSync(PLAN_DESIGN_PATH, { force: true });
+      } catch {
+        /* best-effort: clear any plan file left by an earlier run before the D19 check */
+      }
+
+      try { // wraps the run too, so the finally cleanup fires even if the runner throws
+        const obs = await runPlanSkillCounting({
+          skillName: 'plan-design-review',
+          slashCommand: '/plan-design-review',
+          followUpPrompt: PLAN_DESIGN_5_FINDINGS,
+          isLastStep0AUQ: designStep0Boundary,
+          reviewCountCeiling: CEILING + 1, // hard cap above assertion ceiling
+          cwd: process.cwd(),
+          timeoutMs: 1_500_000, // 25 min
+          env: { QUESTION_TUNING: 'false', EXPLAIN_LEVEL: 'default' },
+        });
+
+        if (!['plan_ready', 'completion_summary', 'ceiling_reached'].includes(obs.outcome)) { // any other outcome fails with diagnostics
+          throw new Error(
+            `plan-design-review finding-count FAILED: outcome=${obs.outcome}\n` +
+              `step0=${obs.step0Count} review=${obs.reviewCount} elapsed=${obs.elapsedMs}ms\n` +
+              `fingerprints (last 8):\n` +
+              obs.fingerprints
+                .slice(-8)
+                .map(
+                  (f, i) =>
+                    `  ${i}. preReview=${f.preReview} sig=${f.signature.slice(0, 12)} prompt="${f.promptSnippet.slice(0, 60)}"`,
+                )
+                .join('\n') +
+              `\n--- evidence (last 3KB) ---\n${obs.evidence}`,
+          );
+        }
+        if (obs.reviewCount < FLOOR) { // band floor: fewer questions than N-1
+          throw new Error(
+            `BAND FAIL (below floor): reviewCount=${obs.reviewCount} < FLOOR=${FLOOR}.\n` +
+              `Likely batching regression. Review-phase fingerprints:\n` +
+              obs.fingerprints
+                .filter((f) => !f.preReview)
+                .map((f) => `  - "${f.promptSnippet.slice(0, 80)}"`)
+                .join('\n'),
+          );
+        }
+        if (obs.reviewCount > CEILING) { // band ceiling: more questions than N+2
+          throw new Error(
+            `BAND FAIL (above ceiling): reviewCount=${obs.reviewCount} > CEILING=${CEILING}.`,
+          );
+        }
+
+        if (!fs.existsSync(PLAN_DESIGN_PATH)) { // D19 precondition: the agent must have written the plan file
+          throw new Error(
+            `D19 FAIL: agent did not produce expected plan file at ${PLAN_DESIGN_PATH}. ` +
+              `outcome=${obs.outcome} review=${obs.reviewCount}`,
+          );
+        }
+        const planContent = fs.readFileSync(PLAN_DESIGN_PATH, 'utf-8');
+        const verdict = assertReviewReportAtBottom(planContent); // returns a verdict object; the test throws on !ok
+        if (!verdict.ok) {
+          throw new Error(
+            `D19 FAIL: plan file at ${PLAN_DESIGN_PATH} ${verdict.reason}\n` +
+              (verdict.trailingHeadings
+                ? `Trailing headings: ${verdict.trailingHeadings.join(' | ')}\n`
+                : '') +
+              `--- plan content (last 1KB) ---\n${planContent.slice(-1024)}`,
+          );
+        }
+      } finally {
+        try {
+          fs.rmSync(PLAN_DESIGN_PATH, { force: true });
+        } catch {
+          /* best-effort: a failed cleanup must not mask the test result */
+        }
+      }
+    },
+    1_700_000, // test-level timeout (~28 min), above the runner's 25 min timeoutMs
+  );
+});
@@ -0,0 +1,135 @@
+/**
+ * /plan-devex-review per-finding AskUserQuestion count (periodic, paid, real-PTY).
+ *
+ * Same shape as skill-e2e-plan-ceo-finding-count: drives /plan-devex-review
+ * against a 5-finding seeded plan and asserts review-phase AUQ count ∈ [N-1, N+2].
+ * Plus D19: review report at bottom of produced plan file.
+ *
+ * Tier: periodic (~25 min, ~$5/run). Sequential by default per plan §D15.
+ */
+
+import { describe, test } from 'bun:test';
+import * as fs from 'node:fs';
+import {
+  runPlanSkillCounting,
+  devexStep0Boundary,
+  assertReviewReportAtBottom,
+} from './helpers/claude-pty-runner';
+
+const shouldRun = !!process.env.EVALS && process.env.EVALS_TIER === 'periodic'; // opt-in gate for this paid suite
+const describeE2E = shouldRun ? describe : describe.skip; // suite is skipped when the gate is off
+
+const N = 5; // findings seeded in PLAN_DEVEX_5_FINDINGS (one per "## " section)
+const FLOOR = N - 1; // 4
+const CEILING = N + 2; // 7
+
+const PLAN_DEVEX_PATH = '/tmp/gstack-test-plan-devex.md'; // interpolated into the prompt below, so prompt and checks cannot drift
+
+const PLAN_DEVEX_5_FINDINGS = [
+  `Please review this plan thoroughly. As you go, write your plan-mode plan to ${PLAN_DEVEX_PATH} (use Edit/Write to that exact path).`,
+  '',
+  '# Plan: Public SDK Beta Launch',
+  '',
+  '## Persona',
+  "The plan doesn't specify which developer persona is the target — we're",
+  "shipping for \"everyone,\" which means we tune for nobody.",
+  '',
+  '## TTHW (time to hello world)',
+  'Time-to-hello-world is not measured. No benchmark data referenced. We',
+  "don't know if first-run takes 5 minutes or 50.",
+  '',
+  '## Friction Point',
+  'First-run currently requires a 5-minute mandatory CI step before the',
+  'developer can run their first eval. There is no way to skip it.',
+  '',
+  '## Magical Moment',
+  'Getting-started flow has no delight beat. Pure documentation, no',
+  'interactive demo, no "ah-ha" moment that makes the developer trust us.',
+  '',
+  '## Competitive Blind Spot',
+  "The plan doesn't reference how peer SDKs (LangChain, Semantic Kernel,",
+  'OpenAI) handle this DX surface. We may be reinventing worse versions',
+  'of solved problems.',
+].join('\n');
+
+describeE2E('/plan-devex-review per-finding AskUserQuestion count (periodic)', () => { // one paid PTY run; skipped unless gated on
+  test(
+    `5-finding plan emits ${FLOOR}-${CEILING} review-phase AskUserQuestions`,
+    async () => {
+      try {
+        fs.rmSync(PLAN_DEVEX_PATH, { force: true });
+      } catch {
+        /* best-effort: clear any plan file left by an earlier run before the D19 check */
+      }
+
+      try { // wraps the run too, so the finally cleanup fires even if the runner throws
+        const obs = await runPlanSkillCounting({
+          skillName: 'plan-devex-review',
+          slashCommand: '/plan-devex-review',
+          followUpPrompt: PLAN_DEVEX_5_FINDINGS,
+          isLastStep0AUQ: devexStep0Boundary,
+          reviewCountCeiling: CEILING + 1, // hard cap above assertion ceiling
+          cwd: process.cwd(),
+          timeoutMs: 1_500_000, // 25 min
+          env: { QUESTION_TUNING: 'false', EXPLAIN_LEVEL: 'default' },
+        });
+
+        if (!['plan_ready', 'completion_summary', 'ceiling_reached'].includes(obs.outcome)) { // any other outcome fails with diagnostics
+          throw new Error(
+            `plan-devex-review finding-count FAILED: outcome=${obs.outcome}\n` +
+              `step0=${obs.step0Count} review=${obs.reviewCount} elapsed=${obs.elapsedMs}ms\n` +
+              `fingerprints (last 8):\n` +
+              obs.fingerprints
+                .slice(-8)
+                .map(
+                  (f, i) =>
+                    `  ${i}. preReview=${f.preReview} sig=${f.signature.slice(0, 12)} prompt="${f.promptSnippet.slice(0, 60)}"`,
+                )
+                .join('\n') +
+              `\n--- evidence (last 3KB) ---\n${obs.evidence}`,
+          );
+        }
+        if (obs.reviewCount < FLOOR) { // band floor: fewer questions than N-1
+          throw new Error(
+            `BAND FAIL (below floor): reviewCount=${obs.reviewCount} < FLOOR=${FLOOR}.\n` +
+              `Likely batching regression. Review-phase fingerprints:\n` +
+              obs.fingerprints
+                .filter((f) => !f.preReview)
+                .map((f) => `  - "${f.promptSnippet.slice(0, 80)}"`)
+                .join('\n'),
+          );
+        }
+        if (obs.reviewCount > CEILING) { // band ceiling: more questions than N+2
+          throw new Error(
+            `BAND FAIL (above ceiling): reviewCount=${obs.reviewCount} > CEILING=${CEILING}.`,
+          );
+        }
+
+        if (!fs.existsSync(PLAN_DEVEX_PATH)) { // D19 precondition: the agent must have written the plan file
+          throw new Error(
+            `D19 FAIL: agent did not produce expected plan file at ${PLAN_DEVEX_PATH}. ` +
+              `outcome=${obs.outcome} review=${obs.reviewCount}`,
+          );
+        }
+        const planContent = fs.readFileSync(PLAN_DEVEX_PATH, 'utf-8');
+        const verdict = assertReviewReportAtBottom(planContent); // returns a verdict object; the test throws on !ok
+        if (!verdict.ok) {
+          throw new Error(
+            `D19 FAIL: plan file at ${PLAN_DEVEX_PATH} ${verdict.reason}\n` +
+              (verdict.trailingHeadings
+                ? `Trailing headings: ${verdict.trailingHeadings.join(' | ')}\n`
+                : '') +
+              `--- plan content (last 1KB) ---\n${planContent.slice(-1024)}`,
+          );
+        }
+      } finally {
+        try {
+          fs.rmSync(PLAN_DEVEX_PATH, { force: true });
+        } catch {
+          /* best-effort: a failed cleanup must not mask the test result */
+        }
+      }
+    },
+    1_700_000, // test-level timeout (~28 min), above the runner's 25 min timeoutMs
+  );
+});
@@ -0,0 +1,134 @@
+/**
+ * /plan-eng-review per-finding AskUserQuestion count (periodic, paid, real-PTY).
+ *
+ * Same shape as skill-e2e-plan-ceo-finding-count: drives /plan-eng-review
+ * against a 5-finding seeded plan and asserts review-phase AUQ count ∈ [N-1, N+2].
+ * Plus D19: review report at bottom of produced plan file.
+ *
+ * Tier: periodic (~25 min, ~$5/run). Sequential by default per plan §D15.
+ */
+
+import { describe, test } from 'bun:test';
+import * as fs from 'node:fs';
+import {
+  runPlanSkillCounting,
+  engStep0Boundary,
+  assertReviewReportAtBottom,
+} from './helpers/claude-pty-runner';
+
+const shouldRun = !!process.env.EVALS && process.env.EVALS_TIER === 'periodic'; // opt-in gate for this paid suite
+const describeE2E = shouldRun ? describe : describe.skip; // suite is skipped when the gate is off
+
+const N = 5; // findings seeded in PLAN_ENG_5_FINDINGS (one per "## " section)
+const FLOOR = N - 1; // 4
+const CEILING = N + 2; // 7
+
+const PLAN_ENG_PATH = '/tmp/gstack-test-plan-eng.md'; // interpolated into the prompt below, so prompt and checks cannot drift
+
+const PLAN_ENG_5_FINDINGS = [
+  `Please review this plan thoroughly. As you go, write your plan-mode plan to ${PLAN_ENG_PATH} (use Edit/Write to that exact path).`,
+  '',
+  '# Plan: Multi-tenant Auth Refactor',
+  '',
+  '## Architecture',
+  'Two new services (`AuthBroker` and `SessionMint`) share a global mutable',
+  '`AuthCache` instance via module-level export. Both services mutate it.',
+  '',
+  '## Code quality',
+  'The `validateAndDispatch()` function is 60 lines with three nested',
+  'try/catch blocks; each catch swallows a different error class.',
+  '',
+  '## Tests',
+  'The existing `legacyAuthFlow()` will get rewritten as part of this work;',
+  'no regression test for the prior behavior is planned.',
+  '',
+  '## Performance',
+  'Token validation issues 5 sequential API calls to the IDP; they could be',
+  'parallelized via Promise.all trivially (calls are independent).',
+  '',
+  '## Architecture (scope smell)',
+  'This touches 12 files and introduces 4 new classes (TokenStore,',
+  'SessionMint, AuthCache, RequestPolicy). Worth flagging the complexity check.',
+].join('\n');
+
+describeE2E('/plan-eng-review per-finding AskUserQuestion count (periodic)', () => { // one paid PTY run; skipped unless gated on
+  test(
+    `5-finding plan emits ${FLOOR}-${CEILING} review-phase AskUserQuestions`,
+    async () => {
+      try {
+        fs.rmSync(PLAN_ENG_PATH, { force: true });
+      } catch {
+        /* best-effort: clear any plan file left by an earlier run before the D19 check */
+      }
+
+      try { // wraps the run too, so the finally cleanup fires even if the runner throws
+        const obs = await runPlanSkillCounting({
+          skillName: 'plan-eng-review',
+          slashCommand: '/plan-eng-review',
+          followUpPrompt: PLAN_ENG_5_FINDINGS,
+          isLastStep0AUQ: engStep0Boundary,
+          reviewCountCeiling: CEILING + 1, // hard cap above assertion ceiling
+          cwd: process.cwd(),
+          timeoutMs: 1_500_000, // 25 min
+          env: { QUESTION_TUNING: 'false', EXPLAIN_LEVEL: 'default' },
+        });
+
+        if (!['plan_ready', 'completion_summary', 'ceiling_reached'].includes(obs.outcome)) { // any other outcome fails with diagnostics
+          throw new Error(
+            `plan-eng-review finding-count FAILED: outcome=${obs.outcome}\n` +
+              `step0=${obs.step0Count} review=${obs.reviewCount} elapsed=${obs.elapsedMs}ms\n` +
+              `fingerprints (last 8):\n` +
+              obs.fingerprints
+                .slice(-8)
+                .map(
+                  (f, i) =>
+                    `  ${i}. preReview=${f.preReview} sig=${f.signature.slice(0, 12)} prompt="${f.promptSnippet.slice(0, 60)}"`,
+                )
+                .join('\n') +
+              `\n--- evidence (last 3KB) ---\n${obs.evidence}`,
+          );
+        }
+        if (obs.reviewCount < FLOOR) { // band floor: fewer questions than N-1
+          throw new Error(
+            `BAND FAIL (below floor): reviewCount=${obs.reviewCount} < FLOOR=${FLOOR}.\n` +
+              `Likely batching regression. Review-phase fingerprints:\n` +
+              obs.fingerprints
+                .filter((f) => !f.preReview)
+                .map((f) => `  - "${f.promptSnippet.slice(0, 80)}"`)
+                .join('\n'),
+          );
+        }
+        if (obs.reviewCount > CEILING) { // band ceiling: more questions than N+2
+          throw new Error(
+            `BAND FAIL (above ceiling): reviewCount=${obs.reviewCount} > CEILING=${CEILING}.`,
+          );
+        }
+
+        if (!fs.existsSync(PLAN_ENG_PATH)) { // D19 precondition: the agent must have written the plan file
+          throw new Error(
+            `D19 FAIL: agent did not produce expected plan file at ${PLAN_ENG_PATH}. ` +
+              `outcome=${obs.outcome} review=${obs.reviewCount}`,
+          );
+        }
+        const planContent = fs.readFileSync(PLAN_ENG_PATH, 'utf-8');
+        const verdict = assertReviewReportAtBottom(planContent); // returns a verdict object; the test throws on !ok
+        if (!verdict.ok) {
+          throw new Error(
+            `D19 FAIL: plan file at ${PLAN_ENG_PATH} ${verdict.reason}\n` +
+              (verdict.trailingHeadings
+                ? `Trailing headings: ${verdict.trailingHeadings.join(' | ')}\n`
+                : '') +
+              `--- plan content (last 1KB) ---\n${planContent.slice(-1024)}`,
+          );
+        }
+      } finally {
+        try {
+          fs.rmSync(PLAN_ENG_PATH, { force: true });
+        } catch {
+          /* best-effort: a failed cleanup must not mask the test result */
+        }
+      }
+    },
+    1_700_000, // test-level timeout (~28 min), above the runner's 25 min timeoutMs
+  );
+});