diff --git a/test/fixtures/forcing-finding-seeds.ts b/test/fixtures/forcing-finding-seeds.ts
index fec887935..8d52c858d 100644
--- a/test/fixtures/forcing-finding-seeds.ts
+++ b/test/fixtures/forcing-finding-seeds.ts
@@ -81,3 +81,42 @@ export const FORCING_FLOOR_DEVEX = [
   '',
   'No quickstart command, no hosted sandbox, no copy-pasteable curl example.',
 ].join('\n');
+
+/**
+ * Multi-finding batching regression seed (periodic tier).
+ *
+ * Mirrors the May 2026 transcript bug shape: 4 distinct non-trivial findings
+ * spread across plan-eng-review's standard sections (Architecture, Code
+ * Quality, Tests, Performance). Each finding is independent — there is no
+ * legitimate reason to batch them into a single AskUserQuestion.
+ *
+ * Used by test/skill-e2e-plan-eng-multi-finding-batching.test.ts to assert
+ * the agent fires >= 3 review-phase AUQs (i.e., does NOT batch them into a
+ * "## Decisions to confirm" section + ExitPlanMode). Floor of 3 (not 4) is
+ * the [N-1] tolerance from the existing finding-count band convention.
+ */
+export const FORCING_BATCHING_ENG = [
+  'Please review this plan thoroughly. As you go, write your plan-mode plan to /tmp/gstack-test-plan-eng-batching.md (use Edit/Write to that exact path).',
+  '',
+  '# Plan: Add background job retry framework',
+  '',
+  '## Architecture',
+  "We'll roll a custom exponential-backoff scheduler inline in each worker",
+  "rather than use the existing job library's built-in retry hooks. Same",
+  'shape as the library version, but we want full control over the curve.',
+  '',
+  '## Code quality',
+  'The retry envelope (compute delay, log attempt, dispatch) is duplicated',
+  'across 5 worker files with copy-pasted bodies. We will leave the',
+  'duplication for now and refactor "later."',
+  '',
+  '## Tests',
+  'The existing `processWebhookJob()` flow gets rewritten as part of this',
+  'change. No regression test for the prior at-most-once delivery guarantee',
+  'is planned.',
+  '',
+  '## Performance',
+  'On every retry we re-fetch the full job payload from the database, then',
+  'iterate the payload to recompute the dependency graph. Could cache the',
+  'graph on the first attempt; not planned.',
+].join('\n');
diff --git a/test/helpers/touchfiles.ts b/test/helpers/touchfiles.ts
index 18c25e0bc..1d18bee46 100644
--- a/test/helpers/touchfiles.ts
+++ b/test/helpers/touchfiles.ts
@@ -143,6 +143,13 @@ export const E2E_TOUCHFILES: Record = {
   'plan-ceo-finding-floor': ['plan-ceo-review/**', 'scripts/resolvers/preamble.ts', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble/generate-completion-status.ts', 'scripts/resolvers/review.ts', 'test/helpers/claude-pty-runner.ts', 'test/fixtures/forcing-finding-seeds.ts', 'test/skill-e2e-plan-ceo-finding-floor.test.ts'],
   'plan-design-finding-floor': ['plan-design-review/**', 'scripts/resolvers/preamble.ts', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble/generate-completion-status.ts', 'scripts/resolvers/review.ts', 'test/helpers/claude-pty-runner.ts', 'test/fixtures/forcing-finding-seeds.ts', 'test/skill-e2e-plan-design-finding-floor.test.ts'],
   'plan-devex-finding-floor': ['plan-devex-review/**', 'scripts/resolvers/preamble.ts', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble/generate-completion-status.ts', 'scripts/resolvers/review.ts', 'test/helpers/claude-pty-runner.ts', 'test/fixtures/forcing-finding-seeds.ts', 'test/skill-e2e-plan-devex-finding-floor.test.ts'],
+
+  // Multi-finding batching regression — periodic tier complement to the
+  // gate-tier finding-floor. Catches the May 2026 transcript shape where
+  // a model fires one AUQ then batches the rest into a "## Decisions to
+  // confirm" plan write. runPlanSkillFloorCheck cannot detect that shape
+  // (it exits on first AUQ); runPlanSkillCounting can.
+  'plan-eng-multi-finding-batching': ['plan-eng-review/**', 'scripts/resolvers/preamble.ts', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble/generate-completion-status.ts', 'scripts/resolvers/review.ts', 'test/helpers/claude-pty-runner.ts', 'test/fixtures/forcing-finding-seeds.ts', 'test/skill-e2e-plan-eng-multi-finding-batching.test.ts'],
   'brain-privacy-gate': ['scripts/resolvers/preamble/generate-brain-sync-block.ts', 'scripts/resolvers/preamble.ts', 'bin/gstack-brain-sync', 'bin/gstack-artifacts-init', 'bin/gstack-config', 'test/helpers/agent-sdk-runner.ts'],
 
   // /setup-gbrain Path 4 (Remote MCP) — happy + bad-token end-to-end via
@@ -443,6 +450,7 @@ export const E2E_TIERS: Record = {
   'plan-ceo-finding-floor': 'gate',
   'plan-design-finding-floor': 'gate',
   'plan-devex-finding-floor': 'gate',
+  'plan-eng-multi-finding-batching': 'periodic',
 
   // Privacy gate for gstack-brain-sync — periodic (non-deterministic LLM call,
   // costs ~$0.30-$0.50 per run, not needed on every commit)
diff --git a/test/skill-e2e-plan-eng-multi-finding-batching.test.ts b/test/skill-e2e-plan-eng-multi-finding-batching.test.ts
new file mode 100644
index 000000000..1eef660a9
--- /dev/null
+++ b/test/skill-e2e-plan-eng-multi-finding-batching.test.ts
@@ -0,0 +1,107 @@
+/**
+ * /plan-eng-review multi-finding batching regression (periodic, paid, real-PTY).
+ *
+ * Catches the specific shape of the May 2026 transcript bug that the
+ * single-finding gate-tier floor test cannot detect: a model that fires
+ * one AskUserQuestion and then batches the remaining findings into a
+ * single "## Decisions to confirm" plan write + ExitPlanMode.
+ *
+ * Why a separate test from skill-e2e-plan-eng-finding-floor:
+ *   - The gate-tier floor (runPlanSkillFloorCheck) exits on the first AUQ
+ *     render and returns success. A model that fires once-then-batches
+ *     would pass that test trivially.
+ *   - This test uses runPlanSkillCounting at periodic tier (~25 min budget,
+ *     N-AUQ tracking, ceiling-bounded retries) to actually count distinct
+ *     review-phase AUQs and assert the model fires one per finding.
+ *
+ * Why a separate test from skill-e2e-plan-eng-finding-count (the existing
+ * 5-finding count test):
+ *   - The fixture here mirrors the D1-D4 transcript shape (4 findings) and
+ *     the floor matches that exact threshold (3, the [N-1] tolerance band).
+ *     This is the tightest regression test for the original bug class —
+ *     not a band-around-N test, but a "did the agent batch?" test.
+ *
+ * Tier: periodic (~25 min, ~$5/run). Sequential by default.
+ */
+
+import { describe, test } from 'bun:test';
+import * as fs from 'node:fs';
+import {
+  runPlanSkillCounting,
+  engStep0Boundary,
+} from './helpers/claude-pty-runner';
+import { FORCING_BATCHING_ENG } from './fixtures/forcing-finding-seeds';
+
+const shouldRun = !!process.env.EVALS && process.env.EVALS_TIER === 'periodic';
+const describeE2E = shouldRun ? describe : describe.skip;
+
+/** Number of independent findings seeded in the FORCING_BATCHING_ENG plan. */
+const N = 4;
+/** Minimum review-phase AUQs required: the [N-1] tolerance band (3 when N is 4). */
+const FLOOR = N - 1;
+
+/** Must stay in sync with the plan path named in the FORCING_BATCHING_ENG prompt. */
+const PLAN_PATH = '/tmp/gstack-test-plan-eng-batching.md';
+
+/** Budget handed to runPlanSkillCounting (25 min). */
+const RUN_TIMEOUT_MS = 1_500_000;
+/** bun:test timeout; kept above RUN_TIMEOUT_MS, presumably so the runner can report first. */
+const TEST_TIMEOUT_MS = 1_700_000;
+
+/** Removes the plan file the agent may have written; any failure is ignored (best-effort). */
+function removePlanFile(): void {
+  try {
+    fs.rmSync(PLAN_PATH, { force: true });
+  } catch {
+    /* best-effort */
+  }
+}
+
+describeE2E('/plan-eng-review multi-finding batching regression (periodic)', () => {
+  test(
+    `${N}-finding plan emits >= ${FLOOR} review-phase AskUserQuestions (no batching)`,
+    async () => {
+      // Start clean so a stale plan file from an earlier run cannot leak into this one.
+      removePlanFile();
+
+      // The runner call sits inside the try so the plan file is removed even if it rejects.
+      try {
+        const obs = await runPlanSkillCounting({
+          skillName: 'plan-eng-review',
+          slashCommand: '/plan-eng-review',
+          followUpPrompt: FORCING_BATCHING_ENG,
+          isLastStep0AUQ: engStep0Boundary,
+          reviewCountCeiling: N + 3, // hard cap above floor + tolerance
+          cwd: process.cwd(),
+          timeoutMs: RUN_TIMEOUT_MS,
+          env: { QUESTION_TUNING: 'false', EXPLAIN_LEVEL: 'default' },
+        });
+
+        if (!['plan_ready', 'completion_summary', 'ceiling_reached'].includes(obs.outcome)) {
+          throw new Error(
+            `multi-finding batching test FAILED: outcome=${obs.outcome}\n` +
+              `step0=${obs.step0Count} review=${obs.reviewCount} elapsed=${obs.elapsedMs}ms\n` +
+              `--- evidence (last 3KB) ---\n${obs.evidence}`,
+          );
+        }
+        if (obs.reviewCount < FLOOR) {
+          const reviewPrompts = obs.fingerprints
+            .filter((f) => !f.preReview)
+            .map((f) => ` - "${f.promptSnippet.slice(0, 80)}"`);
+          throw new Error(
+            `BATCHING REGRESSION: reviewCount=${obs.reviewCount} < FLOOR=${FLOOR}.\n` +
+              `Agent surfaced fewer review-phase AUQs than findings — this is the\n` +
+              `May 2026 transcript bug shape: model batched multiple findings into\n` +
+              `a single plan write + ExitPlanMode instead of asking one per finding.\n` +
+              `Review-phase fingerprints:\n` +
+              (reviewPrompts.length > 0 ? reviewPrompts.join('\n') : ' - (none)') +
+              `\n--- evidence (last 3KB) ---\n${obs.evidence}`,
+          );
+        }
+      } finally {
+        removePlanFile();
+      }
+    },
+    TEST_TIMEOUT_MS,
+  );
+});