/**
 * /plan-design-review with UI scope (gate, paid, real-PTY).
 *
 * Counterpart to the existing no-UI early-exit test. When the input plan
 * DOES describe UI changes, /plan-design-review must NOT early-exit and
 * must reach either a real skill numbered-option AskUserQuestion (its first
 * design-rating question) or the plan-ready prompt, with the captured
 * evidence NOT echoing the early-exit phrase.
 *
 * Why: today we only test the negative path (no-UI → early-exit). A
 * regression that flips the UI-detection logic — making EVERY plan early-
 * exit — would pass the no-UI test (vacuously) and ship undetected. This
 * test is the positive coverage.
 *
 * How: launch claude in plan mode in the gstack repo cwd (so the skill
 * registry is loaded). Send /plan-design-review on its own, then a
 * follow-up message that summarizes the UI-heavy plan and references the
 * fixture path, so the skill reviews that plan rather than git diff or
 * .claude/plans/. (The command and the plan are sent separately because
 * Claude Code rejects slash commands with trailing arguments unless the
 * skill defines them.) Drive past permission dialogs. Wait for a numbered-
 * option list that is NOT a permission dialog, or for the plan-ready
 * prompt. Assert evidence does NOT contain "no UI scope" or
 * "isn't applicable".
 */

import { describe, test } from 'bun:test';
import * as path from 'path';
import {
  launchClaudePty,
  isNumberedOptionListVisible,
  isPermissionDialogVisible,
  parseNumberedOptions,
  isPlanReadyVisible,
} from './helpers/claude-pty-runner';

// Paid E2E: only runs when EVALS is set AND the tier is exactly 'gate'.
const shouldRun = !!process.env.EVALS && process.env.EVALS_TIER === 'gate';
const describeE2E = shouldRun ? describe : describe.skip;

const ROOT = path.resolve(import.meta.dir, '..');
const FIXTURE = path.join(ROOT, 'test', 'fixtures', 'plans', 'ui-heavy-feature.md');

describeE2E('/plan-design-review with UI scope (gate)', () => {
  test(
    'reaches a real skill AskUserQuestion (or plan_ready) without echoing the no-UI early-exit phrase',
    async () => {
      // Timing knobs (ms). Startup + command gap + poll budget stay below the
      // session timeout, which in turn stays below the 540s test timeout.
      const SESSION_TIMEOUT_MS = 480_000;
      const STARTUP_WAIT_MS = 8000; // presumably lets the TUI finish booting — TODO confirm
      const AFTER_COMMAND_WAIT_MS = 3000;
      const POLL_BUDGET_MS = 360_000;
      const POLL_INTERVAL_MS = 2500;
      const AFTER_GRANT_WAIT_MS = 1500;

      // Buffer window sizes (characters taken from the end of the output).
      const CLASSIFY_TAIL_CHARS = 2500;
      const PERMISSION_SIG_CHARS = 500;
      const EVIDENCE_CHARS = 3000;
      const DEBUG_CHARS = 4000;

      const fixtureRelPath = path.relative(ROOT, FIXTURE);
      const session = await launchClaudePty({
        permissionMode: 'plan',
        cwd: ROOT,
        timeoutMs: SESSION_TIMEOUT_MS,
      });

      // Stays 'timeout' unless the polling loop reaches a terminal state.
      let outcome: 'real_question' | 'plan_ready' | 'timeout' | 'exited' = 'timeout';
      let evidence = '';
      let debugBuffer = ''; // captured at end so timeout error has data

      try {
        await Bun.sleep(STARTUP_WAIT_MS);
        // Marker handed to visibleSince() below; every buffer read in this
        // test is taken relative to it.
        const since = session.mark();

        // Send the slash command alone first; then provide the UI-heavy
        // plan content as a follow-up message. Claude Code rejects slash
        // commands with trailing arguments unless the skill defines them.
        session.send('/plan-design-review\r');
        await Bun.sleep(AFTER_COMMAND_WAIT_MS);
        session.send(
          `Please review this plan for UI scope:\n\n` +
          `Title: User Dashboard Page\n` +
          `New React page UserDashboard.tsx with three subcomponents: ` +
          `ActivityFeed, NotificationsPanel, QuickActions. ` +
          `Tailwind CSS responsive layout (mobile/desktop breakpoints), ` +
          `loading skeletons, empty states, hover states on every interactive element, ` +
          `modal dialog for "mark all read", toast notifications for action feedback. ` +
          `Reference plan file: ${fixtureRelPath}\r`
        );

        const start = Date.now();
        // Tail of the buffer at the moment we last granted a permission
        // dialog; used to avoid answering the same rendering twice.
        let lastPermSig = '';

        while (Date.now() - start < POLL_BUDGET_MS) {
          await Bun.sleep(POLL_INTERVAL_MS);

          if (session.exited()) {
            outcome = 'exited';
            evidence = session.visibleSince(since).slice(-EVIDENCE_CHARS);
            break;
          }

          const visible = session.visibleSince(since);
          // Classify the recent tail only — old permission text persists
          // in visibleSince(since) and would otherwise re-trigger forever.
          const recentTail = visible.slice(-CLASSIFY_TAIL_CHARS);

          // Real skill AskUserQuestion visible (not a permission dialog)?
          if (
            isNumberedOptionListVisible(recentTail) &&
            parseNumberedOptions(recentTail).length >= 2 &&
            !isPermissionDialogVisible(recentTail)
          ) {
            outcome = 'real_question';
            evidence = visible.slice(-EVIDENCE_CHARS);
            break;
          }

          // Permission dialog: grant once per unique rendering.
          if (isPermissionDialogVisible(recentTail)) {
            const sig = visible.slice(-PERMISSION_SIG_CHARS);
            if (sig !== lastPermSig) {
              lastPermSig = sig;
              session.send('1\r');
              await Bun.sleep(AFTER_GRANT_WAIT_MS);
              continue;
            }
            // Same rendering as last time: fall through to the plan-ready
            // check instead of re-sending the grant.
          }

          // Plan-ready terminal — also acceptable (skill ran end-to-end
          // and surfaced claude's "Ready to execute" prompt).
          if (isPlanReadyVisible(visible)) {
            outcome = 'plan_ready';
            evidence = visible.slice(-EVIDENCE_CHARS);
            break;
          }
        }

        // Capture buffer state at end so a timeout error has diagnostic data.
        debugBuffer = session.visibleSince(since).slice(-DEBUG_CHARS);
      } finally {
        // Always tear the session down, even if polling threw.
        await session.close();
      }

      // PASS: real_question or plan_ready, AND evidence does NOT echo the
      // early-exit phrase.
      if (outcome === 'exited' || outcome === 'timeout') {
        // Label the dump with what actually happened; an early process exit
        // is not a timeout and should not be reported as one.
        const failurePoint = outcome === 'exited' ? 'exit' : 'timeout';
        throw new Error(
          `plan-design-review with UI scope FAILED: outcome=${outcome}\n` +
          `--- buffer at ${failurePoint} (last 4KB) ---\n${debugBuffer || evidence}`,
        );
      }

      // Phrases the skill prints when it decides a plan has no UI work.
      const NO_UI_PHRASE = /no\s+UI\s+scope|isn'?t\s+applicable/i;
      if (NO_UI_PHRASE.test(evidence)) {
        throw new Error(
          `plan-design-review early-exited despite UI-heavy fixture.\n` +
          `--- evidence (last 3KB) ---\n${evidence}`,
        );
      }
    },
    540_000,
  );
});