test: add four per-finding count E2E tests (plan-ceo + eng + design + devex)

Each test drives its plan-* skill through Step 0 then asserts the
review-phase AskUserQuestion count falls in [N-1, N+2] for an N=5
seeded plan, plus D19: produced plan file ends with
"## GSTACK REVIEW REPORT" as its last "## " heading.

plan-ceo also runs a paired-finding positive control: 2 deliberately
related findings should still produce 2 distinct AUQs, not 1 batched.

Periodic-tier (gate-skipped unless EVALS is set and EVALS_TIER=periodic).
Sequential execution per plan §D15. Each fixture is an inline Markdown plan
(held as a TypeScript string array joined with newlines) delivered as a
follow-up message after the slash command, per the proven pattern at
skill-e2e-plan-design-with-ui.test.ts.

The calibration loop (5 runs per skill) and the manual pre-merge negative
check (D7 + D12) are both required before merge per plan §Verification.
Neither has been run yet.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Garry Tan
2026-04-28 20:12:32 -07:00
parent 0b04ca8486
commit f479134fba
4 changed files with 628 additions and 0 deletions
@@ -0,0 +1,224 @@
/**
* /plan-ceo-review per-finding AskUserQuestion count (periodic, paid, real-PTY).
*
* Asserts the load-bearing rule "One issue = one AskUserQuestion call" by
* driving /plan-ceo-review against a 5-finding seeded plan and counting
* distinct review-phase AUQs. Passes when count is in [N-1, N+2].
*
* Two tests in this file:
* - 5-finding distinct fixture: count band assertion + D19 review-report-at-bottom.
* - 2-finding paired control (D12 positive control): related findings still
* produce 2 distinct AUQs, not 1 batched, when the rule is honored.
*
* Tier: periodic. Each run drives Step 0 + 11 review sections end-to-end
* (~25 min, ~$5/run). Sequential by default per plan §D15. See
* test/helpers/claude-pty-runner.ts for runPlanSkillCounting internals.
*/
import { describe, test } from 'bun:test';
import * as fs from 'node:fs';
import {
runPlanSkillCounting,
ceoStep0Boundary,
assertReviewReportAtBottom,
} from './helpers/claude-pty-runner';
// Periodic-tier gate: the suite is registered for real only when EVALS is set
// to a non-empty value and EVALS_TIER is exactly 'periodic'; in every other
// environment it is registered through describe.skip.
const shouldRun = process.env.EVALS_TIER === 'periodic' && Boolean(process.env.EVALS);
const describeE2E = shouldRun ? describe : describe.skip;

// Distinct-findings fixture: N seeded findings, accepted band [N-1, N+2].
const N_DISTINCT = 5;
const FLOOR_DISTINCT = N_DISTINCT - 1; // 4 (D11)
const CEILING_DISTINCT = N_DISTINCT + 2; // 7 (D11)

// Paired-findings positive control: two related findings, accepted band [2, 4].
const N_PAIRED = 2;
const FLOOR_PAIRED = 2;
const CEILING_PAIRED = 4;
// Scratch files the agent is instructed to write its plan-mode plan to.
// Declared ahead of the fixtures so each prompt interpolates the very path the
// test later reads and deletes; the instruction and the assertion cannot drift.
const PLAN_CEO_PATH = '/tmp/gstack-test-plan-ceo.md';
const PLAN_CEO_PAIRED_PATH = '/tmp/gstack-test-plan-ceo-paired.md';

// Follow-up prompt for the distinct-findings test: a write-to-path instruction
// followed by a seeded plan with five "## " sections.
const PLAN_CEO_5_FINDINGS = [
  `Please review this plan thoroughly. As you go, write your plan-mode plan to ${PLAN_CEO_PATH} (use Edit/Write to that exact path).`,
  '',
  '# Plan: Payment Processing Integration',
  '',
  '## Architecture',
  "We're adding a new `PaymentService` class that will handle Stripe webhooks.",
  'This bypasses the existing `WebhookDispatcher` module — we want a clean',
  'namespace separation.',
  '',
  '## Database access',
  'The new endpoint reads `request.params.userId` directly into a raw SQL',
  'fragment for the lookup query.',
  '',
  '## Webhook fan-out',
  'On payment success we update the user record AND fire a notification email.',
  'Both happen inline; no error handling on the email leg.',
  '',
  '## Tests',
  "None planned. We'll rely on the existing integration suite catching regressions.",
  '',
  '## Performance',
  'Each webhook lookup hits the database for the user, then fetches each',
  'order in a loop.',
].join('\n');

// Follow-up prompt for the paired positive control: a single "## Tests"
// section that names two deliberately related, but separate, coverage gaps.
const PLAN_CEO_2_PAIRED_FINDINGS = [
  `Please review this plan thoroughly. As you go, write your plan-mode plan to ${PLAN_CEO_PAIRED_PATH} (use Edit/Write to that exact path).`,
  '',
  '# Plan: Payment Processing — Test Coverage',
  '',
  '## Tests',
  'We need test coverage for `processPayment()`. Specifically:',
  '1. The happy path (successful Stripe charge — assert correct receipt is generated).',
  '2. The error/timeout path (Stripe returns 502 — assert retry-with-backoff fires once, then fails clean).',
  '',
  'Currently neither has a unit test. These are deliberately separate concerns:',
  'the success path is correctness, the failure path is graceful degradation.',
].join('\n');
describeE2E('/plan-ceo-review per-finding AskUserQuestion count (periodic)', () => {
  // Outcomes after which the counts are asserted on; any other outcome fails
  // the test immediately with run diagnostics.
  const ACCEPTED_OUTCOMES = ['plan_ready', 'completion_summary', 'ceiling_reached'];

  // Distinct-findings case: band assertion on the review-phase count, then
  // D19 (review report is the last "## " heading of the produced plan file).
  test(
    `5-finding plan emits ${FLOOR_DISTINCT}-${CEILING_DISTINCT} review-phase AskUserQuestions`,
    async () => {
      // Drop any plan file left by an earlier run so the D19 check can only
      // see a file written during this run.
      try {
        fs.rmSync(PLAN_CEO_PATH, { force: true });
      } catch {
        /* best-effort */
      }

      const obs = await runPlanSkillCounting({
        skillName: 'plan-ceo-review',
        slashCommand: '/plan-ceo-review',
        followUpPrompt: PLAN_CEO_5_FINDINGS,
        isLastStep0AUQ: ceoStep0Boundary,
        reviewCountCeiling: CEILING_DISTINCT + 1, // hard cap above assertion ceiling
        cwd: process.cwd(),
        timeoutMs: 1_500_000, // 25 min
        env: { QUESTION_TUNING: 'false', EXPLAIN_LEVEL: 'default' },
      });

      // One bullet per fingerprint not flagged preReview, prompt cut to 80
      // chars. Kept lazy so a passing run never renders it.
      const reviewPhaseBullets = () =>
        obs.fingerprints
          .filter((f) => !f.preReview)
          .map((f) => ` - "${f.promptSnippet.slice(0, 80)}"`)
          .join('\n');

      try {
        if (!ACCEPTED_OUTCOMES.includes(obs.outcome)) {
          const recentFingerprints = obs.fingerprints
            .slice(-8)
            .map(
              (f, i) =>
                ` ${i}. preReview=${f.preReview} sig=${f.signature.slice(0, 12)} prompt="${f.promptSnippet.slice(0, 60)}"`,
            )
            .join('\n');
          throw new Error(
            `plan-ceo-review finding-count FAILED: outcome=${obs.outcome}\n` +
              `step0=${obs.step0Count} review=${obs.reviewCount} elapsed=${obs.elapsedMs}ms\n` +
              `fingerprints (last 8):\n` +
              recentFingerprints +
              `\n--- evidence (last 3KB) ---\n${obs.evidence}`,
          );
        }

        if (obs.reviewCount < FLOOR_DISTINCT) {
          throw new Error(
            `BAND FAIL (below floor): reviewCount=${obs.reviewCount} < FLOOR=${FLOOR_DISTINCT}.\n` +
              `Likely batching regression — agent collapsed multiple findings into fewer questions.\n` +
              `Fingerprints (review-phase only):\n` +
              reviewPhaseBullets(),
          );
        }

        if (obs.reviewCount > CEILING_DISTINCT) {
          throw new Error(
            `BAND FAIL (above ceiling): reviewCount=${obs.reviewCount} > CEILING=${CEILING_DISTINCT}.\n` +
              `Possible over-asking regression. Review-phase fingerprints:\n` +
              reviewPhaseBullets(),
          );
        }

        // D19: review report at bottom of plan file.
        if (!fs.existsSync(PLAN_CEO_PATH)) {
          throw new Error(
            `D19 FAIL: agent did not produce expected plan file at ${PLAN_CEO_PATH}.\n` +
              `Either the agent ignored the path instruction in the follow-up prompt, or\n` +
              `the helper exited before the agent wrote the file. ` +
              `outcome=${obs.outcome} review=${obs.reviewCount}`,
          );
        }
        const planContent = fs.readFileSync(PLAN_CEO_PATH, 'utf-8');
        const verdict = assertReviewReportAtBottom(planContent);
        if (!verdict.ok) {
          const trailing = verdict.trailingHeadings
            ? `Trailing headings: ${verdict.trailingHeadings.join(' | ')}\n`
            : '';
          throw new Error(
            `D19 FAIL: plan file at ${PLAN_CEO_PATH} ${verdict.reason}\n` +
              trailing +
              `--- plan content (last 1KB) ---\n${planContent.slice(-1024)}`,
          );
        }
      } finally {
        // Always remove the scratch file, pass or fail.
        try {
          fs.rmSync(PLAN_CEO_PATH, { force: true });
        } catch {
          /* best-effort */
        }
      }
    },
    1_700_000, // per-test timeout, above the 1_500_000 ms runner timeout
  );

  // D12 positive control: two related findings must still yield at least two
  // review-phase questions. No D19 check in this test.
  test(
    `paired-finding positive control: ${N_PAIRED} related findings produce ${FLOOR_PAIRED}-${CEILING_PAIRED} AskUserQuestions`,
    async () => {
      try {
        fs.rmSync(PLAN_CEO_PAIRED_PATH, { force: true });
      } catch {
        /* best-effort */
      }

      const obs = await runPlanSkillCounting({
        skillName: 'plan-ceo-review',
        slashCommand: '/plan-ceo-review',
        followUpPrompt: PLAN_CEO_2_PAIRED_FINDINGS,
        isLastStep0AUQ: ceoStep0Boundary,
        reviewCountCeiling: CEILING_PAIRED + 1,
        cwd: process.cwd(),
        timeoutMs: 1_500_000,
        env: { QUESTION_TUNING: 'false', EXPLAIN_LEVEL: 'default' },
      });

      // Same rendering as the distinct-findings test; lazy for the same reason.
      const reviewPhaseBullets = () =>
        obs.fingerprints
          .filter((f) => !f.preReview)
          .map((f) => ` - "${f.promptSnippet.slice(0, 80)}"`)
          .join('\n');

      try {
        if (!ACCEPTED_OUTCOMES.includes(obs.outcome)) {
          throw new Error(
            `paired-finding control FAILED: outcome=${obs.outcome}\n` +
              `step0=${obs.step0Count} review=${obs.reviewCount}\n` +
              `--- evidence (last 3KB) ---\n${obs.evidence}`,
          );
        }

        if (obs.reviewCount < FLOOR_PAIRED) {
          throw new Error(
            `PAIRED CONTROL FAIL: reviewCount=${obs.reviewCount} < FLOOR=${FLOOR_PAIRED}.\n` +
              `Two deliberately related findings were batched into <2 questions — the rule failed under D12.\n` +
              `Review-phase fingerprints:\n` +
              reviewPhaseBullets(),
          );
        }

        if (obs.reviewCount > CEILING_PAIRED) {
          // Report the offending prompts here too, as every other band failure does.
          throw new Error(
            `PAIRED CONTROL FAIL: reviewCount=${obs.reviewCount} > CEILING=${CEILING_PAIRED} (over-asking on a 2-finding fixture).\n` +
              `Review-phase fingerprints:\n` +
              reviewPhaseBullets(),
          );
        }
      } finally {
        try {
          fs.rmSync(PLAN_CEO_PAIRED_PATH, { force: true });
        } catch {
          /* best-effort */
        }
      }
    },
    1_700_000,
  );
});
@@ -0,0 +1,135 @@
/**
* /plan-design-review per-finding AskUserQuestion count (periodic, paid, real-PTY).
*
* Same shape as skill-e2e-plan-ceo-finding-count: drives /plan-design-review
* against a 5-finding seeded plan and asserts review-phase AUQ count ∈ [N-1, N+2].
* Plus D19: review report at bottom of produced plan file.
*
* Tier: periodic (~25 min, ~$5/run). Sequential by default per plan §D15.
*/
import { describe, test } from 'bun:test';
import * as fs from 'node:fs';
import {
runPlanSkillCounting,
designStep0Boundary,
assertReviewReportAtBottom,
} from './helpers/claude-pty-runner';
// Periodic-tier gate: the suite is registered for real only when EVALS is set
// to a non-empty value and EVALS_TIER is exactly 'periodic'; in every other
// environment it is registered through describe.skip.
const shouldRun = process.env.EVALS_TIER === 'periodic' && Boolean(process.env.EVALS);
const describeE2E = shouldRun ? describe : describe.skip;

// N seeded findings; the accepted review-phase count band is [N-1, N+2].
const N = 5;
const FLOOR = N - 1; // 4
const CEILING = N + 2; // 7
// Scratch file the agent is instructed to write its plan-mode plan to.
// Declared ahead of the fixture so the prompt interpolates the very path the
// test later reads and deletes; the instruction and the assertion cannot drift.
const PLAN_DESIGN_PATH = '/tmp/gstack-test-plan-design.md';

// Follow-up prompt: a write-to-path instruction followed by a seeded plan with
// five "## " sections.
const PLAN_DESIGN_5_FINDINGS = [
  `Please review this plan thoroughly. As you go, write your plan-mode plan to ${PLAN_DESIGN_PATH} (use Edit/Write to that exact path).`,
  '',
  '# Plan: Settings Page UI redesign',
  '',
  '## Visual Hierarchy',
  'The "Save" button is rendered with the same size, weight, and color as',
  'three other buttons in the page header (Reset, Cancel, Export). Nothing',
  'tells the user which is the primary action.',
  '',
  '## Spacing',
  'Between sections we have 24px in some places, 32px in others, and 16px',
  'in a third — no consistent vertical rhythm.',
  '',
  '## Color',
  'The error message uses red text on a light pink background. Contrast',
  'ratio is approximately 3:1 (below WCAG AA).',
  '',
  '## Typography',
  'We use 14px, 16px, and 18px font sizes across the form labels. Two',
  'sizes would suffice and create stronger hierarchy.',
  '',
  '## Motion',
  'The "Save" action takes 2-5 seconds with no loading indicator. Users',
  'see a frozen page; we should add a spinner or skeleton state.',
].join('\n');
describeE2E('/plan-design-review per-finding AskUserQuestion count (periodic)', () => {
  // Outcomes after which the counts are asserted on; any other outcome fails
  // the test immediately with run diagnostics.
  const ACCEPTED_OUTCOMES = ['plan_ready', 'completion_summary', 'ceiling_reached'];

  // Band assertion on the review-phase count, then D19 (review report is the
  // last "## " heading of the produced plan file).
  test(
    `5-finding plan emits ${FLOOR}-${CEILING} review-phase AskUserQuestions`,
    async () => {
      // Drop any plan file left by an earlier run so the D19 check can only
      // see a file written during this run.
      try {
        fs.rmSync(PLAN_DESIGN_PATH, { force: true });
      } catch {
        /* best-effort */
      }

      const obs = await runPlanSkillCounting({
        skillName: 'plan-design-review',
        slashCommand: '/plan-design-review',
        followUpPrompt: PLAN_DESIGN_5_FINDINGS,
        isLastStep0AUQ: designStep0Boundary,
        reviewCountCeiling: CEILING + 1, // hard cap above assertion ceiling
        cwd: process.cwd(),
        timeoutMs: 1_500_000, // 25 min
        env: { QUESTION_TUNING: 'false', EXPLAIN_LEVEL: 'default' },
      });

      // One bullet per fingerprint not flagged preReview, prompt cut to 80
      // chars. Kept lazy so a passing run never renders it.
      const reviewPhaseBullets = () =>
        obs.fingerprints
          .filter((f) => !f.preReview)
          .map((f) => ` - "${f.promptSnippet.slice(0, 80)}"`)
          .join('\n');

      try {
        if (!ACCEPTED_OUTCOMES.includes(obs.outcome)) {
          const recentFingerprints = obs.fingerprints
            .slice(-8)
            .map(
              (f, i) =>
                ` ${i}. preReview=${f.preReview} sig=${f.signature.slice(0, 12)} prompt="${f.promptSnippet.slice(0, 60)}"`,
            )
            .join('\n');
          throw new Error(
            `plan-design-review finding-count FAILED: outcome=${obs.outcome}\n` +
              `step0=${obs.step0Count} review=${obs.reviewCount} elapsed=${obs.elapsedMs}ms\n` +
              `fingerprints (last 8):\n` +
              recentFingerprints +
              `\n--- evidence (last 3KB) ---\n${obs.evidence}`,
          );
        }

        if (obs.reviewCount < FLOOR) {
          throw new Error(
            `BAND FAIL (below floor): reviewCount=${obs.reviewCount} < FLOOR=${FLOOR}.\n` +
              `Likely batching regression. Review-phase fingerprints:\n` +
              reviewPhaseBullets(),
          );
        }

        if (obs.reviewCount > CEILING) {
          // List the offending prompts, as the below-floor failure already does.
          throw new Error(
            `BAND FAIL (above ceiling): reviewCount=${obs.reviewCount} > CEILING=${CEILING}.\n` +
              `Possible over-asking regression. Review-phase fingerprints:\n` +
              reviewPhaseBullets(),
          );
        }

        // D19: review report at bottom of plan file.
        if (!fs.existsSync(PLAN_DESIGN_PATH)) {
          throw new Error(
            `D19 FAIL: agent did not produce expected plan file at ${PLAN_DESIGN_PATH}. ` +
              `outcome=${obs.outcome} review=${obs.reviewCount}`,
          );
        }
        const planContent = fs.readFileSync(PLAN_DESIGN_PATH, 'utf-8');
        const verdict = assertReviewReportAtBottom(planContent);
        if (!verdict.ok) {
          const trailing = verdict.trailingHeadings
            ? `Trailing headings: ${verdict.trailingHeadings.join(' | ')}\n`
            : '';
          throw new Error(
            `D19 FAIL: plan file at ${PLAN_DESIGN_PATH} ${verdict.reason}\n` +
              trailing +
              `--- plan content (last 1KB) ---\n${planContent.slice(-1024)}`,
          );
        }
      } finally {
        // Always remove the scratch file, pass or fail.
        try {
          fs.rmSync(PLAN_DESIGN_PATH, { force: true });
        } catch {
          /* best-effort */
        }
      }
    },
    1_700_000, // per-test timeout, above the 1_500_000 ms runner timeout
  );
});
@@ -0,0 +1,135 @@
/**
* /plan-devex-review per-finding AskUserQuestion count (periodic, paid, real-PTY).
*
* Same shape as skill-e2e-plan-ceo-finding-count: drives /plan-devex-review
* against a 5-finding seeded plan and asserts review-phase AUQ count ∈ [N-1, N+2].
* Plus D19: review report at bottom of produced plan file.
*
* Tier: periodic (~25 min, ~$5/run). Sequential by default per plan §D15.
*/
import { describe, test } from 'bun:test';
import * as fs from 'node:fs';
import {
runPlanSkillCounting,
devexStep0Boundary,
assertReviewReportAtBottom,
} from './helpers/claude-pty-runner';
// Periodic-tier gate: the suite is registered for real only when EVALS is set
// to a non-empty value and EVALS_TIER is exactly 'periodic'; in every other
// environment it is registered through describe.skip.
const shouldRun = process.env.EVALS_TIER === 'periodic' && Boolean(process.env.EVALS);
const describeE2E = shouldRun ? describe : describe.skip;

// N seeded findings; the accepted review-phase count band is [N-1, N+2].
const N = 5;
const FLOOR = N - 1; // 4
const CEILING = N + 2; // 7
// Scratch file the agent is instructed to write its plan-mode plan to.
// Declared ahead of the fixture so the prompt interpolates the very path the
// test later reads and deletes; the instruction and the assertion cannot drift.
const PLAN_DEVEX_PATH = '/tmp/gstack-test-plan-devex.md';

// Follow-up prompt: a write-to-path instruction followed by a seeded plan with
// five "## " sections.
const PLAN_DEVEX_5_FINDINGS = [
  `Please review this plan thoroughly. As you go, write your plan-mode plan to ${PLAN_DEVEX_PATH} (use Edit/Write to that exact path).`,
  '',
  '# Plan: Public SDK Beta Launch',
  '',
  '## Persona',
  "The plan doesn't specify which developer persona is the target — we're",
  "shipping for \"everyone,\" which means we tune for nobody.",
  '',
  '## TTHW (time to hello world)',
  'Time-to-hello-world is not measured. No benchmark data referenced. We',
  "don't know if first-run takes 5 minutes or 50.",
  '',
  '## Friction Point',
  'First-run currently requires a 5-minute mandatory CI step before the',
  'developer can run their first eval. There is no way to skip it.',
  '',
  '## Magical Moment',
  'Getting-started flow has no delight beat. Pure documentation, no',
  'interactive demo, no "ah-ha" moment that makes the developer trust us.',
  '',
  '## Competitive Blind Spot',
  "The plan doesn't reference how peer SDKs (LangChain, Semantic Kernel,",
  'OpenAI) handle this DX surface. We may be reinventing worse versions',
  'of solved problems.',
].join('\n');
describeE2E('/plan-devex-review per-finding AskUserQuestion count (periodic)', () => {
  // Outcomes after which the counts are asserted on; any other outcome fails
  // the test immediately with run diagnostics.
  const ACCEPTED_OUTCOMES = ['plan_ready', 'completion_summary', 'ceiling_reached'];

  // Band assertion on the review-phase count, then D19 (review report is the
  // last "## " heading of the produced plan file).
  test(
    `5-finding plan emits ${FLOOR}-${CEILING} review-phase AskUserQuestions`,
    async () => {
      // Drop any plan file left by an earlier run so the D19 check can only
      // see a file written during this run.
      try {
        fs.rmSync(PLAN_DEVEX_PATH, { force: true });
      } catch {
        /* best-effort */
      }

      const obs = await runPlanSkillCounting({
        skillName: 'plan-devex-review',
        slashCommand: '/plan-devex-review',
        followUpPrompt: PLAN_DEVEX_5_FINDINGS,
        isLastStep0AUQ: devexStep0Boundary,
        reviewCountCeiling: CEILING + 1, // hard cap above assertion ceiling
        cwd: process.cwd(),
        timeoutMs: 1_500_000, // 25 min
        env: { QUESTION_TUNING: 'false', EXPLAIN_LEVEL: 'default' },
      });

      // One bullet per fingerprint not flagged preReview, prompt cut to 80
      // chars. Kept lazy so a passing run never renders it.
      const reviewPhaseBullets = () =>
        obs.fingerprints
          .filter((f) => !f.preReview)
          .map((f) => ` - "${f.promptSnippet.slice(0, 80)}"`)
          .join('\n');

      try {
        if (!ACCEPTED_OUTCOMES.includes(obs.outcome)) {
          const recentFingerprints = obs.fingerprints
            .slice(-8)
            .map(
              (f, i) =>
                ` ${i}. preReview=${f.preReview} sig=${f.signature.slice(0, 12)} prompt="${f.promptSnippet.slice(0, 60)}"`,
            )
            .join('\n');
          throw new Error(
            `plan-devex-review finding-count FAILED: outcome=${obs.outcome}\n` +
              `step0=${obs.step0Count} review=${obs.reviewCount} elapsed=${obs.elapsedMs}ms\n` +
              `fingerprints (last 8):\n` +
              recentFingerprints +
              `\n--- evidence (last 3KB) ---\n${obs.evidence}`,
          );
        }

        if (obs.reviewCount < FLOOR) {
          throw new Error(
            `BAND FAIL (below floor): reviewCount=${obs.reviewCount} < FLOOR=${FLOOR}.\n` +
              `Likely batching regression. Review-phase fingerprints:\n` +
              reviewPhaseBullets(),
          );
        }

        if (obs.reviewCount > CEILING) {
          // List the offending prompts, as the below-floor failure already does.
          throw new Error(
            `BAND FAIL (above ceiling): reviewCount=${obs.reviewCount} > CEILING=${CEILING}.\n` +
              `Possible over-asking regression. Review-phase fingerprints:\n` +
              reviewPhaseBullets(),
          );
        }

        // D19: review report at bottom of plan file.
        if (!fs.existsSync(PLAN_DEVEX_PATH)) {
          throw new Error(
            `D19 FAIL: agent did not produce expected plan file at ${PLAN_DEVEX_PATH}. ` +
              `outcome=${obs.outcome} review=${obs.reviewCount}`,
          );
        }
        const planContent = fs.readFileSync(PLAN_DEVEX_PATH, 'utf-8');
        const verdict = assertReviewReportAtBottom(planContent);
        if (!verdict.ok) {
          const trailing = verdict.trailingHeadings
            ? `Trailing headings: ${verdict.trailingHeadings.join(' | ')}\n`
            : '';
          throw new Error(
            `D19 FAIL: plan file at ${PLAN_DEVEX_PATH} ${verdict.reason}\n` +
              trailing +
              `--- plan content (last 1KB) ---\n${planContent.slice(-1024)}`,
          );
        }
      } finally {
        // Always remove the scratch file, pass or fail.
        try {
          fs.rmSync(PLAN_DEVEX_PATH, { force: true });
        } catch {
          /* best-effort */
        }
      }
    },
    1_700_000, // per-test timeout, above the 1_500_000 ms runner timeout
  );
});
@@ -0,0 +1,134 @@
/**
* /plan-eng-review per-finding AskUserQuestion count (periodic, paid, real-PTY).
*
* Same shape as skill-e2e-plan-ceo-finding-count: drives /plan-eng-review
* against a 5-finding seeded plan and asserts review-phase AUQ count ∈ [N-1, N+2].
* Plus D19: review report at bottom of produced plan file.
*
* Tier: periodic (~25 min, ~$5/run). Sequential by default per plan §D15.
*/
import { describe, test } from 'bun:test';
import * as fs from 'node:fs';
import {
runPlanSkillCounting,
engStep0Boundary,
assertReviewReportAtBottom,
} from './helpers/claude-pty-runner';
// Periodic-tier gate: the suite is registered for real only when EVALS is set
// to a non-empty value and EVALS_TIER is exactly 'periodic'; in every other
// environment it is registered through describe.skip.
const shouldRun = process.env.EVALS_TIER === 'periodic' && Boolean(process.env.EVALS);
const describeE2E = shouldRun ? describe : describe.skip;

// N seeded findings; the accepted review-phase count band is [N-1, N+2].
const N = 5;
const FLOOR = N - 1; // 4
const CEILING = N + 2; // 7
// Scratch file the agent is instructed to write its plan-mode plan to.
// Declared ahead of the fixture so the prompt interpolates the very path the
// test later reads and deletes; the instruction and the assertion cannot drift.
const PLAN_ENG_PATH = '/tmp/gstack-test-plan-eng.md';

// Follow-up prompt: a write-to-path instruction followed by a seeded plan with
// five "## " sections.
const PLAN_ENG_5_FINDINGS = [
  `Please review this plan thoroughly. As you go, write your plan-mode plan to ${PLAN_ENG_PATH} (use Edit/Write to that exact path).`,
  '',
  '# Plan: Multi-tenant Auth Refactor',
  '',
  '## Architecture',
  'Two new services (`AuthBroker` and `SessionMint`) share a global mutable',
  '`AuthCache` instance via module-level export. Both services mutate it.',
  '',
  '## Code quality',
  'The `validateAndDispatch()` function is 60 lines with three nested',
  'try/catch blocks; each catch swallows a different error class.',
  '',
  '## Tests',
  'The existing `legacyAuthFlow()` will get rewritten as part of this work;',
  'no regression test for the prior behavior is planned.',
  '',
  '## Performance',
  'Token validation issues 5 sequential API calls to the IDP; they could be',
  'parallelized via Promise.all trivially (calls are independent).',
  '',
  '## Architecture (scope smell)',
  'This touches 12 files and introduces 4 new classes (TokenStore,',
  'SessionMint, AuthCache, RequestPolicy). Worth flagging the complexity check.',
].join('\n');
describeE2E('/plan-eng-review per-finding AskUserQuestion count (periodic)', () => {
  // Outcomes after which the counts are asserted on; any other outcome fails
  // the test immediately with run diagnostics.
  const ACCEPTED_OUTCOMES = ['plan_ready', 'completion_summary', 'ceiling_reached'];

  // Band assertion on the review-phase count, then D19 (review report is the
  // last "## " heading of the produced plan file).
  test(
    `5-finding plan emits ${FLOOR}-${CEILING} review-phase AskUserQuestions`,
    async () => {
      // Drop any plan file left by an earlier run so the D19 check can only
      // see a file written during this run.
      try {
        fs.rmSync(PLAN_ENG_PATH, { force: true });
      } catch {
        /* best-effort */
      }

      const obs = await runPlanSkillCounting({
        skillName: 'plan-eng-review',
        slashCommand: '/plan-eng-review',
        followUpPrompt: PLAN_ENG_5_FINDINGS,
        isLastStep0AUQ: engStep0Boundary,
        reviewCountCeiling: CEILING + 1, // hard cap above assertion ceiling
        cwd: process.cwd(),
        timeoutMs: 1_500_000, // 25 min
        env: { QUESTION_TUNING: 'false', EXPLAIN_LEVEL: 'default' },
      });

      // One bullet per fingerprint not flagged preReview, prompt cut to 80
      // chars. Kept lazy so a passing run never renders it.
      const reviewPhaseBullets = () =>
        obs.fingerprints
          .filter((f) => !f.preReview)
          .map((f) => ` - "${f.promptSnippet.slice(0, 80)}"`)
          .join('\n');

      try {
        if (!ACCEPTED_OUTCOMES.includes(obs.outcome)) {
          const recentFingerprints = obs.fingerprints
            .slice(-8)
            .map(
              (f, i) =>
                ` ${i}. preReview=${f.preReview} sig=${f.signature.slice(0, 12)} prompt="${f.promptSnippet.slice(0, 60)}"`,
            )
            .join('\n');
          throw new Error(
            `plan-eng-review finding-count FAILED: outcome=${obs.outcome}\n` +
              `step0=${obs.step0Count} review=${obs.reviewCount} elapsed=${obs.elapsedMs}ms\n` +
              `fingerprints (last 8):\n` +
              recentFingerprints +
              `\n--- evidence (last 3KB) ---\n${obs.evidence}`,
          );
        }

        if (obs.reviewCount < FLOOR) {
          throw new Error(
            `BAND FAIL (below floor): reviewCount=${obs.reviewCount} < FLOOR=${FLOOR}.\n` +
              `Likely batching regression. Review-phase fingerprints:\n` +
              reviewPhaseBullets(),
          );
        }

        if (obs.reviewCount > CEILING) {
          // List the offending prompts, as the below-floor failure already does.
          throw new Error(
            `BAND FAIL (above ceiling): reviewCount=${obs.reviewCount} > CEILING=${CEILING}.\n` +
              `Possible over-asking regression. Review-phase fingerprints:\n` +
              reviewPhaseBullets(),
          );
        }

        // D19: review report at bottom of plan file.
        if (!fs.existsSync(PLAN_ENG_PATH)) {
          throw new Error(
            `D19 FAIL: agent did not produce expected plan file at ${PLAN_ENG_PATH}. ` +
              `outcome=${obs.outcome} review=${obs.reviewCount}`,
          );
        }
        const planContent = fs.readFileSync(PLAN_ENG_PATH, 'utf-8');
        const verdict = assertReviewReportAtBottom(planContent);
        if (!verdict.ok) {
          const trailing = verdict.trailingHeadings
            ? `Trailing headings: ${verdict.trailingHeadings.join(' | ')}\n`
            : '';
          throw new Error(
            `D19 FAIL: plan file at ${PLAN_ENG_PATH} ${verdict.reason}\n` +
              trailing +
              `--- plan content (last 1KB) ---\n${planContent.slice(-1024)}`,
          );
        }
      } finally {
        // Always remove the scratch file, pass or fail.
        try {
          fs.rmSync(PLAN_ENG_PATH, { force: true });
        } catch {
          /* best-effort */
        }
      }
    },
    1_700_000, // per-test timeout, above the 1_500_000 ms runner timeout
  );
});