From f479134fba68a18264d8cc69e5ce6d9472976cba Mon Sep 17 00:00:00 2001
From: Garry Tan
Date: Tue, 28 Apr 2026 20:12:32 -0700
Subject: [PATCH] test: add four per-finding count E2E tests (plan-ceo + eng + design + devex)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Each test drives its plan-* skill through Step 0, then asserts that the
review-phase AskUserQuestion (AUQ) count falls in [N-1, N+2] for an N=5
seeded plan. Each test also checks D19: the produced plan file must end
with "## GSTACK REVIEW REPORT" as its last "## " heading.

plan-ceo also runs a paired-finding positive control: 2 deliberately
related findings should still produce 2 distinct AUQs, not 1 batched AUQ.

Periodic tier: the suites are gate-skipped unless EVALS is set (e.g.
EVALS=1) and EVALS_TIER=periodic. Execution is sequential per plan §D15.

Each fixture is inline TypeScript content delivered as a follow-up
message after the slash command, per the proven pattern in
skill-e2e-plan-design-with-ui.test.ts.

The calibration loop (5 runs per skill) and the manual pre-merge negative
check (D7 + D12) are required before merge per plan §Verification.
Neither has been run yet.

Co-Authored-By: Claude Opus 4.7 (1M context) --- test/skill-e2e-plan-ceo-finding-count.test.ts | 224 ++++++++++++++++++ ...kill-e2e-plan-design-finding-count.test.ts | 135 +++++++++++ ...skill-e2e-plan-devex-finding-count.test.ts | 135 +++++++++++ test/skill-e2e-plan-eng-finding-count.test.ts | 134 +++++++++++ 4 files changed, 628 insertions(+) create mode 100644 test/skill-e2e-plan-ceo-finding-count.test.ts create mode 100644 test/skill-e2e-plan-design-finding-count.test.ts create mode 100644 test/skill-e2e-plan-devex-finding-count.test.ts create mode 100644 test/skill-e2e-plan-eng-finding-count.test.ts diff --git a/test/skill-e2e-plan-ceo-finding-count.test.ts b/test/skill-e2e-plan-ceo-finding-count.test.ts new file mode 100644 index 00000000..0bc94d9c --- /dev/null +++ b/test/skill-e2e-plan-ceo-finding-count.test.ts @@ -0,0 +1,224 @@ +/** + * /plan-ceo-review per-finding AskUserQuestion count (periodic, paid, real-PTY). + * + * Asserts the load-bearing rule "One issue = one AskUserQuestion call" by + * driving /plan-ceo-review against a 5-finding seeded plan and counting + * distinct review-phase AUQs. Passes when count is in [N-1, N+2]. + * + * Two tests in this file: + * - 5-finding distinct fixture: count band assertion + D19 review-report-at-bottom. + * - 2-finding paired control (D12 positive control): related findings still + * produce 2 distinct AUQs, not 1 batched, when the rule is honored. + * + * Tier: periodic. Each run drives Step 0 + 11 review sections end-to-end + * (~25 min, ~$5/run). Sequential by default per plan §D15. See + * test/helpers/claude-pty-runner.ts for runPlanSkillCounting internals. + */ + +import { describe, test } from 'bun:test'; +import * as fs from 'node:fs'; +import { + runPlanSkillCounting, + ceoStep0Boundary, + assertReviewReportAtBottom, +} from './helpers/claude-pty-runner'; + +const shouldRun = !!process.env.EVALS && process.env.EVALS_TIER === 'periodic'; +const describeE2E = shouldRun ? 
describe : describe.skip; + +const N_DISTINCT = 5; +const FLOOR_DISTINCT = N_DISTINCT - 1; // 4 (D11) +const CEILING_DISTINCT = N_DISTINCT + 2; // 7 (D11) + +const N_PAIRED = 2; +const FLOOR_PAIRED = 2; +const CEILING_PAIRED = 4; + +const PLAN_CEO_5_FINDINGS = [ + 'Please review this plan thoroughly. As you go, write your plan-mode plan to /tmp/gstack-test-plan-ceo.md (use Edit/Write to that exact path).', + '', + '# Plan: Payment Processing Integration', + '', + '## Architecture', + "We're adding a new `PaymentService` class that will handle Stripe webhooks.", + 'This bypasses the existing `WebhookDispatcher` module — we want a clean', + 'namespace separation.', + '', + '## Database access', + 'The new endpoint reads `request.params.userId` directly into a raw SQL', + 'fragment for the lookup query.', + '', + '## Webhook fan-out', + 'On payment success we update the user record AND fire a notification email.', + 'Both happen inline; no error handling on the email leg.', + '', + '## Tests', + "None planned. We'll rely on the existing integration suite catching regressions.", + '', + '## Performance', + 'Each webhook lookup hits the database for the user, then fetches each', + 'order in a loop.', +].join('\n'); + +const PLAN_CEO_2_PAIRED_FINDINGS = [ + 'Please review this plan thoroughly. As you go, write your plan-mode plan to /tmp/gstack-test-plan-ceo-paired.md (use Edit/Write to that exact path).', + '', + '# Plan: Payment Processing — Test Coverage', + '', + '## Tests', + 'We need test coverage for `processPayment()`. Specifically:', + '1. The happy path (successful Stripe charge — assert correct receipt is generated).', + '2. The error/timeout path (Stripe returns 502 — assert retry-with-backoff fires once, then fails clean).', + '', + 'Currently neither has a unit test. 
These are deliberately separate concerns:', + 'the success path is correctness, the failure path is graceful degradation.', +].join('\n'); + +const PLAN_CEO_PATH = '/tmp/gstack-test-plan-ceo.md'; +const PLAN_CEO_PAIRED_PATH = '/tmp/gstack-test-plan-ceo-paired.md'; + +describeE2E('/plan-ceo-review per-finding AskUserQuestion count (periodic)', () => { + test( + `5-finding plan emits ${FLOOR_DISTINCT}-${CEILING_DISTINCT} review-phase AskUserQuestions`, + async () => { + try { + fs.rmSync(PLAN_CEO_PATH, { force: true }); + } catch { + /* best-effort */ + } + + const obs = await runPlanSkillCounting({ + skillName: 'plan-ceo-review', + slashCommand: '/plan-ceo-review', + followUpPrompt: PLAN_CEO_5_FINDINGS, + isLastStep0AUQ: ceoStep0Boundary, + reviewCountCeiling: CEILING_DISTINCT + 1, // hard cap above assertion ceiling + cwd: process.cwd(), + timeoutMs: 1_500_000, // 25 min + env: { QUESTION_TUNING: 'false', EXPLAIN_LEVEL: 'default' }, + }); + + try { + if (!['plan_ready', 'completion_summary', 'ceiling_reached'].includes(obs.outcome)) { + throw new Error( + `plan-ceo-review finding-count FAILED: outcome=${obs.outcome}\n` + + `step0=${obs.step0Count} review=${obs.reviewCount} elapsed=${obs.elapsedMs}ms\n` + + `fingerprints (last 8):\n` + + obs.fingerprints + .slice(-8) + .map( + (f, i) => + ` ${i}. 
preReview=${f.preReview} sig=${f.signature.slice(0, 12)} prompt="${f.promptSnippet.slice(0, 60)}"`, + ) + .join('\n') + + `\n--- evidence (last 3KB) ---\n${obs.evidence}`, + ); + } + if (obs.reviewCount < FLOOR_DISTINCT) { + throw new Error( + `BAND FAIL (below floor): reviewCount=${obs.reviewCount} < FLOOR=${FLOOR_DISTINCT}.\n` + + `Likely batching regression — agent collapsed multiple findings into fewer questions.\n` + + `Fingerprints (review-phase only):\n` + + obs.fingerprints + .filter((f) => !f.preReview) + .map((f) => ` - "${f.promptSnippet.slice(0, 80)}"`) + .join('\n'), + ); + } + if (obs.reviewCount > CEILING_DISTINCT) { + throw new Error( + `BAND FAIL (above ceiling): reviewCount=${obs.reviewCount} > CEILING=${CEILING_DISTINCT}.\n` + + `Possible over-asking regression. Review-phase fingerprints:\n` + + obs.fingerprints + .filter((f) => !f.preReview) + .map((f) => ` - "${f.promptSnippet.slice(0, 80)}"`) + .join('\n'), + ); + } + + // D19: review report at bottom of plan file. + if (!fs.existsSync(PLAN_CEO_PATH)) { + throw new Error( + `D19 FAIL: agent did not produce expected plan file at ${PLAN_CEO_PATH}.\n` + + `Either the agent ignored the path instruction in the follow-up prompt, or\n` + + `the helper exited before the agent wrote the file. ` + + `outcome=${obs.outcome} review=${obs.reviewCount}`, + ); + } + const planContent = fs.readFileSync(PLAN_CEO_PATH, 'utf-8'); + const verdict = assertReviewReportAtBottom(planContent); + if (!verdict.ok) { + throw new Error( + `D19 FAIL: plan file at ${PLAN_CEO_PATH} ${verdict.reason}\n` + + (verdict.trailingHeadings + ? 
`Trailing headings: ${verdict.trailingHeadings.join(' | ')}\n` + : '') + + `--- plan content (last 1KB) ---\n${planContent.slice(-1024)}`, + ); + } + } finally { + try { + fs.rmSync(PLAN_CEO_PATH, { force: true }); + } catch { + /* best-effort */ + } + } + }, + 1_700_000, + ); + + test( + `paired-finding positive control: ${N_PAIRED} related findings produce ${FLOOR_PAIRED}-${CEILING_PAIRED} AskUserQuestions`, + async () => { + try { + fs.rmSync(PLAN_CEO_PAIRED_PATH, { force: true }); + } catch { + /* best-effort */ + } + + const obs = await runPlanSkillCounting({ + skillName: 'plan-ceo-review', + slashCommand: '/plan-ceo-review', + followUpPrompt: PLAN_CEO_2_PAIRED_FINDINGS, + isLastStep0AUQ: ceoStep0Boundary, + reviewCountCeiling: CEILING_PAIRED + 1, + cwd: process.cwd(), + timeoutMs: 1_500_000, + env: { QUESTION_TUNING: 'false', EXPLAIN_LEVEL: 'default' }, + }); + + try { + if (!['plan_ready', 'completion_summary', 'ceiling_reached'].includes(obs.outcome)) { + throw new Error( + `paired-finding control FAILED: outcome=${obs.outcome}\n` + + `step0=${obs.step0Count} review=${obs.reviewCount}\n` + + `--- evidence (last 3KB) ---\n${obs.evidence}`, + ); + } + if (obs.reviewCount < FLOOR_PAIRED) { + throw new Error( + `PAIRED CONTROL FAIL: reviewCount=${obs.reviewCount} < FLOOR=${FLOOR_PAIRED}.\n` + + `Two deliberately related findings were batched into <2 questions — the rule failed under D12.\n` + + `Review-phase fingerprints:\n` + + obs.fingerprints + .filter((f) => !f.preReview) + .map((f) => ` - "${f.promptSnippet.slice(0, 80)}"`) + .join('\n'), + ); + } + if (obs.reviewCount > CEILING_PAIRED) { + throw new Error( + `PAIRED CONTROL FAIL: reviewCount=${obs.reviewCount} > CEILING=${CEILING_PAIRED} (over-asking on a 2-finding fixture).`, + ); + } + } finally { + try { + fs.rmSync(PLAN_CEO_PAIRED_PATH, { force: true }); + } catch { + /* best-effort */ + } + } + }, + 1_700_000, + ); +}); diff --git a/test/skill-e2e-plan-design-finding-count.test.ts 
b/test/skill-e2e-plan-design-finding-count.test.ts new file mode 100644 index 00000000..ef0d9b68 --- /dev/null +++ b/test/skill-e2e-plan-design-finding-count.test.ts @@ -0,0 +1,135 @@ +/** + * /plan-design-review per-finding AskUserQuestion count (periodic, paid, real-PTY). + * + * Same shape as skill-e2e-plan-ceo-finding-count: drives /plan-design-review + * against a 5-finding seeded plan and asserts review-phase AUQ count ∈ [N-1, N+2]. + * Plus D19: review report at bottom of produced plan file. + * + * Tier: periodic (~25 min, ~$5/run). Sequential by default per plan §D15. + */ + +import { describe, test } from 'bun:test'; +import * as fs from 'node:fs'; +import { + runPlanSkillCounting, + designStep0Boundary, + assertReviewReportAtBottom, +} from './helpers/claude-pty-runner'; + +const shouldRun = !!process.env.EVALS && process.env.EVALS_TIER === 'periodic'; +const describeE2E = shouldRun ? describe : describe.skip; + +const N = 5; +const FLOOR = N - 1; +const CEILING = N + 2; + +const PLAN_DESIGN_5_FINDINGS = [ + 'Please review this plan thoroughly. As you go, write your plan-mode plan to /tmp/gstack-test-plan-design.md (use Edit/Write to that exact path).', + '', + '# Plan: Settings Page UI redesign', + '', + '## Visual Hierarchy', + 'The "Save" button is rendered with the same size, weight, and color as', + 'three other buttons in the page header (Reset, Cancel, Export). Nothing', + 'tells the user which is the primary action.', + '', + '## Spacing', + 'Between sections we have 24px in some places, 32px in others, and 16px', + 'in a third — no consistent vertical rhythm.', + '', + '## Color', + 'The error message uses red text on a light pink background. Contrast', + 'ratio is approximately 3:1 (below WCAG AA).', + '', + '## Typography', + 'We use 14px, 16px, and 18px font sizes across the form labels. Two', + 'sizes would suffice and create stronger hierarchy.', + '', + '## Motion', + 'The "Save" action takes 2-5 seconds with no loading indicator. 
Users', + 'see a frozen page; we should add a spinner or skeleton state.', +].join('\n'); + +const PLAN_DESIGN_PATH = '/tmp/gstack-test-plan-design.md'; + +describeE2E('/plan-design-review per-finding AskUserQuestion count (periodic)', () => { + test( + `5-finding plan emits ${FLOOR}-${CEILING} review-phase AskUserQuestions`, + async () => { + try { + fs.rmSync(PLAN_DESIGN_PATH, { force: true }); + } catch { + /* best-effort */ + } + + const obs = await runPlanSkillCounting({ + skillName: 'plan-design-review', + slashCommand: '/plan-design-review', + followUpPrompt: PLAN_DESIGN_5_FINDINGS, + isLastStep0AUQ: designStep0Boundary, + reviewCountCeiling: CEILING + 1, + cwd: process.cwd(), + timeoutMs: 1_500_000, + env: { QUESTION_TUNING: 'false', EXPLAIN_LEVEL: 'default' }, + }); + + try { + if (!['plan_ready', 'completion_summary', 'ceiling_reached'].includes(obs.outcome)) { + throw new Error( + `plan-design-review finding-count FAILED: outcome=${obs.outcome}\n` + + `step0=${obs.step0Count} review=${obs.reviewCount} elapsed=${obs.elapsedMs}ms\n` + + `fingerprints (last 8):\n` + + obs.fingerprints + .slice(-8) + .map( + (f, i) => + ` ${i}. preReview=${f.preReview} sig=${f.signature.slice(0, 12)} prompt="${f.promptSnippet.slice(0, 60)}"`, + ) + .join('\n') + + `\n--- evidence (last 3KB) ---\n${obs.evidence}`, + ); + } + if (obs.reviewCount < FLOOR) { + throw new Error( + `BAND FAIL (below floor): reviewCount=${obs.reviewCount} < FLOOR=${FLOOR}.\n` + + `Likely batching regression. Review-phase fingerprints:\n` + + obs.fingerprints + .filter((f) => !f.preReview) + .map((f) => ` - "${f.promptSnippet.slice(0, 80)}"`) + .join('\n'), + ); + } + if (obs.reviewCount > CEILING) { + throw new Error( + `BAND FAIL (above ceiling): reviewCount=${obs.reviewCount} > CEILING=${CEILING}.`, + ); + } + + if (!fs.existsSync(PLAN_DESIGN_PATH)) { + throw new Error( + `D19 FAIL: agent did not produce expected plan file at ${PLAN_DESIGN_PATH}. 
` + + `outcome=${obs.outcome} review=${obs.reviewCount}`, + ); + } + const planContent = fs.readFileSync(PLAN_DESIGN_PATH, 'utf-8'); + const verdict = assertReviewReportAtBottom(planContent); + if (!verdict.ok) { + throw new Error( + `D19 FAIL: plan file at ${PLAN_DESIGN_PATH} ${verdict.reason}\n` + + (verdict.trailingHeadings + ? `Trailing headings: ${verdict.trailingHeadings.join(' | ')}\n` + : '') + + `--- plan content (last 1KB) ---\n${planContent.slice(-1024)}`, + ); + } + } finally { + try { + fs.rmSync(PLAN_DESIGN_PATH, { force: true }); + } catch { + /* best-effort */ + } + } + }, + 1_700_000, + ); +}); diff --git a/test/skill-e2e-plan-devex-finding-count.test.ts b/test/skill-e2e-plan-devex-finding-count.test.ts new file mode 100644 index 00000000..e4b3f8e7 --- /dev/null +++ b/test/skill-e2e-plan-devex-finding-count.test.ts @@ -0,0 +1,135 @@ +/** + * /plan-devex-review per-finding AskUserQuestion count (periodic, paid, real-PTY). + * + * Same shape as skill-e2e-plan-ceo-finding-count: drives /plan-devex-review + * against a 5-finding seeded plan and asserts review-phase AUQ count ∈ [N-1, N+2]. + * Plus D19: review report at bottom of produced plan file. + * + * Tier: periodic (~25 min, ~$5/run). Sequential by default per plan §D15. + */ + +import { describe, test } from 'bun:test'; +import * as fs from 'node:fs'; +import { + runPlanSkillCounting, + devexStep0Boundary, + assertReviewReportAtBottom, +} from './helpers/claude-pty-runner'; + +const shouldRun = !!process.env.EVALS && process.env.EVALS_TIER === 'periodic'; +const describeE2E = shouldRun ? describe : describe.skip; + +const N = 5; +const FLOOR = N - 1; +const CEILING = N + 2; + +const PLAN_DEVEX_5_FINDINGS = [ + 'Please review this plan thoroughly. 
As you go, write your plan-mode plan to /tmp/gstack-test-plan-devex.md (use Edit/Write to that exact path).', + '', + '# Plan: Public SDK Beta Launch', + '', + '## Persona', + "The plan doesn't specify which developer persona is the target — we're", + "shipping for \"everyone,\" which means we tune for nobody.", + '', + '## TTHW (time to hello world)', + 'Time-to-hello-world is not measured. No benchmark data referenced. We', + "don't know if first-run takes 5 minutes or 50.", + '', + '## Friction Point', + 'First-run currently requires a 5-minute mandatory CI step before the', + 'developer can run their first eval. There is no way to skip it.', + '', + '## Magical Moment', + 'Getting-started flow has no delight beat. Pure documentation, no', + 'interactive demo, no "ah-ha" moment that makes the developer trust us.', + '', + '## Competitive Blind Spot', + "The plan doesn't reference how peer SDKs (LangChain, Semantic Kernel,", + 'OpenAI) handle this DX surface. We may be reinventing worse versions', + 'of solved problems.', +].join('\n'); + +const PLAN_DEVEX_PATH = '/tmp/gstack-test-plan-devex.md'; + +describeE2E('/plan-devex-review per-finding AskUserQuestion count (periodic)', () => { + test( + `5-finding plan emits ${FLOOR}-${CEILING} review-phase AskUserQuestions`, + async () => { + try { + fs.rmSync(PLAN_DEVEX_PATH, { force: true }); + } catch { + /* best-effort */ + } + + const obs = await runPlanSkillCounting({ + skillName: 'plan-devex-review', + slashCommand: '/plan-devex-review', + followUpPrompt: PLAN_DEVEX_5_FINDINGS, + isLastStep0AUQ: devexStep0Boundary, + reviewCountCeiling: CEILING + 1, + cwd: process.cwd(), + timeoutMs: 1_500_000, + env: { QUESTION_TUNING: 'false', EXPLAIN_LEVEL: 'default' }, + }); + + try { + if (!['plan_ready', 'completion_summary', 'ceiling_reached'].includes(obs.outcome)) { + throw new Error( + `plan-devex-review finding-count FAILED: outcome=${obs.outcome}\n` + + `step0=${obs.step0Count} review=${obs.reviewCount} 
elapsed=${obs.elapsedMs}ms\n` + + `fingerprints (last 8):\n` + + obs.fingerprints + .slice(-8) + .map( + (f, i) => + ` ${i}. preReview=${f.preReview} sig=${f.signature.slice(0, 12)} prompt="${f.promptSnippet.slice(0, 60)}"`, + ) + .join('\n') + + `\n--- evidence (last 3KB) ---\n${obs.evidence}`, + ); + } + if (obs.reviewCount < FLOOR) { + throw new Error( + `BAND FAIL (below floor): reviewCount=${obs.reviewCount} < FLOOR=${FLOOR}.\n` + + `Likely batching regression. Review-phase fingerprints:\n` + + obs.fingerprints + .filter((f) => !f.preReview) + .map((f) => ` - "${f.promptSnippet.slice(0, 80)}"`) + .join('\n'), + ); + } + if (obs.reviewCount > CEILING) { + throw new Error( + `BAND FAIL (above ceiling): reviewCount=${obs.reviewCount} > CEILING=${CEILING}.`, + ); + } + + if (!fs.existsSync(PLAN_DEVEX_PATH)) { + throw new Error( + `D19 FAIL: agent did not produce expected plan file at ${PLAN_DEVEX_PATH}. ` + + `outcome=${obs.outcome} review=${obs.reviewCount}`, + ); + } + const planContent = fs.readFileSync(PLAN_DEVEX_PATH, 'utf-8'); + const verdict = assertReviewReportAtBottom(planContent); + if (!verdict.ok) { + throw new Error( + `D19 FAIL: plan file at ${PLAN_DEVEX_PATH} ${verdict.reason}\n` + + (verdict.trailingHeadings + ? `Trailing headings: ${verdict.trailingHeadings.join(' | ')}\n` + : '') + + `--- plan content (last 1KB) ---\n${planContent.slice(-1024)}`, + ); + } + } finally { + try { + fs.rmSync(PLAN_DEVEX_PATH, { force: true }); + } catch { + /* best-effort */ + } + } + }, + 1_700_000, + ); +}); diff --git a/test/skill-e2e-plan-eng-finding-count.test.ts b/test/skill-e2e-plan-eng-finding-count.test.ts new file mode 100644 index 00000000..93b8ba68 --- /dev/null +++ b/test/skill-e2e-plan-eng-finding-count.test.ts @@ -0,0 +1,134 @@ +/** + * /plan-eng-review per-finding AskUserQuestion count (periodic, paid, real-PTY). 
+ * + * Same shape as skill-e2e-plan-ceo-finding-count: drives /plan-eng-review + * against a 5-finding seeded plan and asserts review-phase AUQ count ∈ [N-1, N+2]. + * Plus D19: review report at bottom of produced plan file. + * + * Tier: periodic (~25 min, ~$5/run). Sequential by default per plan §D15. + */ + +import { describe, test } from 'bun:test'; +import * as fs from 'node:fs'; +import { + runPlanSkillCounting, + engStep0Boundary, + assertReviewReportAtBottom, +} from './helpers/claude-pty-runner'; + +const shouldRun = !!process.env.EVALS && process.env.EVALS_TIER === 'periodic'; +const describeE2E = shouldRun ? describe : describe.skip; + +const N = 5; +const FLOOR = N - 1; // 4 +const CEILING = N + 2; // 7 + +const PLAN_ENG_5_FINDINGS = [ + 'Please review this plan thoroughly. As you go, write your plan-mode plan to /tmp/gstack-test-plan-eng.md (use Edit/Write to that exact path).', + '', + '# Plan: Multi-tenant Auth Refactor', + '', + '## Architecture', + 'Two new services (`AuthBroker` and `SessionMint`) share a global mutable', + '`AuthCache` instance via module-level export. Both services mutate it.', + '', + '## Code quality', + 'The `validateAndDispatch()` function is 60 lines with three nested', + 'try/catch blocks; each catch swallows a different error class.', + '', + '## Tests', + 'The existing `legacyAuthFlow()` will get rewritten as part of this work;', + 'no regression test for the prior behavior is planned.', + '', + '## Performance', + 'Token validation issues 5 sequential API calls to the IDP; they could be', + 'parallelized via Promise.all trivially (calls are independent).', + '', + '## Architecture (scope smell)', + 'This touches 12 files and introduces 4 new classes (TokenStore,', + 'SessionMint, AuthCache, RequestPolicy). 
Worth flagging the complexity check.', +].join('\n'); + +const PLAN_ENG_PATH = '/tmp/gstack-test-plan-eng.md'; + +describeE2E('/plan-eng-review per-finding AskUserQuestion count (periodic)', () => { + test( + `5-finding plan emits ${FLOOR}-${CEILING} review-phase AskUserQuestions`, + async () => { + try { + fs.rmSync(PLAN_ENG_PATH, { force: true }); + } catch { + /* best-effort */ + } + + const obs = await runPlanSkillCounting({ + skillName: 'plan-eng-review', + slashCommand: '/plan-eng-review', + followUpPrompt: PLAN_ENG_5_FINDINGS, + isLastStep0AUQ: engStep0Boundary, + reviewCountCeiling: CEILING + 1, + cwd: process.cwd(), + timeoutMs: 1_500_000, + env: { QUESTION_TUNING: 'false', EXPLAIN_LEVEL: 'default' }, + }); + + try { + if (!['plan_ready', 'completion_summary', 'ceiling_reached'].includes(obs.outcome)) { + throw new Error( + `plan-eng-review finding-count FAILED: outcome=${obs.outcome}\n` + + `step0=${obs.step0Count} review=${obs.reviewCount} elapsed=${obs.elapsedMs}ms\n` + + `fingerprints (last 8):\n` + + obs.fingerprints + .slice(-8) + .map( + (f, i) => + ` ${i}. preReview=${f.preReview} sig=${f.signature.slice(0, 12)} prompt="${f.promptSnippet.slice(0, 60)}"`, + ) + .join('\n') + + `\n--- evidence (last 3KB) ---\n${obs.evidence}`, + ); + } + if (obs.reviewCount < FLOOR) { + throw new Error( + `BAND FAIL (below floor): reviewCount=${obs.reviewCount} < FLOOR=${FLOOR}.\n` + + `Likely batching regression. Review-phase fingerprints:\n` + + obs.fingerprints + .filter((f) => !f.preReview) + .map((f) => ` - "${f.promptSnippet.slice(0, 80)}"`) + .join('\n'), + ); + } + if (obs.reviewCount > CEILING) { + throw new Error( + `BAND FAIL (above ceiling): reviewCount=${obs.reviewCount} > CEILING=${CEILING}.`, + ); + } + + if (!fs.existsSync(PLAN_ENG_PATH)) { + throw new Error( + `D19 FAIL: agent did not produce expected plan file at ${PLAN_ENG_PATH}. 
` + + `outcome=${obs.outcome} review=${obs.reviewCount}`, + ); + } + const planContent = fs.readFileSync(PLAN_ENG_PATH, 'utf-8'); + const verdict = assertReviewReportAtBottom(planContent); + if (!verdict.ok) { + throw new Error( + `D19 FAIL: plan file at ${PLAN_ENG_PATH} ${verdict.reason}\n` + + (verdict.trailingHeadings + ? `Trailing headings: ${verdict.trailingHeadings.join(' | ')}\n` + : '') + + `--- plan content (last 1KB) ---\n${planContent.slice(-1024)}`, + ); + } + } finally { + try { + fs.rmSync(PLAN_ENG_PATH, { force: true }); + } catch { + /* best-effort */ + } + } + }, + 1_700_000, + ); +});