mirror of
https://github.com/garrytan/gstack.git
synced 2026-05-02 11:45:20 +02:00
9ce9e10aae
Per user feedback: don't shorten AskUserQuestion to AUQ — the abbreviation reads as cryptic. Apply across all the new code from this branch: - Rename test/skill-e2e-auq-format-compliance.test.ts → test/skill-e2e-ask-user-question-format-compliance.test.ts - Touchfile entry auq-format-pty → ask-user-question-format-pty (touchfiles.ts + matching assertion in touchfiles.test.ts) - Function rename navigateToModeAuq → navigateToModeAskUserQuestion - Variable auqVisible → askUserQuestionVisible - Outcome literal 'real_auq' → 'real_question' - All comments + JSDoc + CHANGELOG entry write AskUserQuestion in full - "AUQs" plural → "AskUserQuestions" No behavior change. 49/49 free tests still pass. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
144 lines
5.5 KiB
TypeScript
144 lines
5.5 KiB
TypeScript
/**
|
|
* /plan-design-review with UI scope (gate, paid, real-PTY).
|
|
*
|
|
* Counterpart to the existing no-UI early-exit test. When the input plan
|
|
* DOES describe UI changes, /plan-design-review must NOT early-exit and
|
|
* must reach a real skill numbered-option AskUserQuestion (its first design-rating
|
|
* question), with the captured evidence NOT echoing the early-exit phrase.
|
|
*
|
|
* Why: today we only test the negative path (no-UI → early-exit). A
|
|
* regression that flips the UI-detection logic — making EVERY plan early-
|
|
* exit — would pass the no-UI test (vacuously) and ship undetected. This
|
|
* test is the positive coverage.
|
|
*
|
|
* How: launch claude in plan mode in the gstack repo cwd (so the skill
* registry is loaded). Send /plan-design-review on its own, then a
* follow-up message that summarizes the UI-heavy plan and names the
* fixture path, so the skill reviews that plan rather than git diff or
* .claude/plans/. Drive past permission dialogs. Wait for a numbered-
* option list that is NOT a permission dialog (the plan-ready prompt is
* also accepted). Assert evidence does NOT contain "no UI scope".
|
|
*/
|
|
|
|
import { describe, test } from 'bun:test';
|
|
import * as path from 'path';
|
|
import {
|
|
launchClaudePty,
|
|
isNumberedOptionListVisible,
|
|
isPermissionDialogVisible,
|
|
parseNumberedOptions,
|
|
isPlanReadyVisible,
|
|
} from './helpers/claude-pty-runner';
|
|
|
|
// Opt-in switch for this paid, real-PTY test: the suite only runs when EVALS
// is set to a non-empty value AND EVALS_TIER is exactly 'gate'. Otherwise it
// is registered through describe.skip.
const shouldRun = Boolean(process.env.EVALS) && process.env.EVALS_TIER === 'gate';
const describeE2E = shouldRun ? describe : describe.skip;

// ROOT is resolved one level above import.meta.dir; it is used both as the
// PTY working directory and as the base for the fixture's relative path.
const ROOT = path.resolve(import.meta.dir, '..');

// UI-heavy plan fixture referenced in the prompt sent to the skill.
const FIXTURE_SEGMENTS = ['test', 'fixtures', 'plans', 'ui-heavy-feature.md'];
const FIXTURE = path.join(ROOT, ...FIXTURE_SEGMENTS);
|
|
|
|
describeE2E('/plan-design-review with UI scope (gate)', () => {
  test(
    'reaches a real skill AskUserQuestion (or plan_ready) without echoing the no-UI early-exit phrase',
    async () => {
      // Path of the fixture relative to the PTY's cwd, quoted in the prompt.
      const fixtureRelPath = path.relative(ROOT, FIXTURE);

      const session = await launchClaudePty({
        permissionMode: 'plan',
        cwd: ROOT,
        timeoutMs: 480_000,
      });

      // 'timeout' is the default: it stays in place unless the poll loop
      // below observes one of the other three states.
      let outcome: 'real_question' | 'plan_ready' | 'timeout' | 'exited' = 'timeout';
      let evidence = '';
      // Snapshot taken after polling so a failure message always has data.
      let debugBuffer = '';

      try {
        // Fixed pause before the first keystrokes are sent.
        await Bun.sleep(8000);
        const startMark = session.mark();

        // The slash command goes out alone; the UI-heavy plan content is a
        // follow-up message. Claude Code rejects slash commands with
        // trailing arguments unless the skill defines them.
        session.send('/plan-design-review\r');
        await Bun.sleep(3000);

        const planPrompt = [
          `Please review this plan for UI scope:\n\n`,
          `Title: User Dashboard Page\n`,
          `New React page UserDashboard.tsx with three subcomponents: `,
          `ActivityFeed, NotificationsPanel, QuickActions. `,
          `Tailwind CSS responsive layout (mobile/desktop breakpoints), `,
          `loading skeletons, empty states, hover states on every interactive element, `,
          `modal dialog for "mark all read", toast notifications for action feedback. `,
          `Reference plan file: ${fixtureRelPath}\r`,
        ].join('');
        session.send(planPrompt);

        const pollBudgetMs = 360_000;
        const deadline = Date.now() + pollBudgetMs;
        // Trailing screen text of the last permission dialog we answered.
        let lastGrantedDialog = '';

        while (Date.now() < deadline) {
          await Bun.sleep(2500);

          if (session.exited()) {
            outcome = 'exited';
            evidence = session.visibleSince(startMark).slice(-3000);
            break;
          }

          const screen = session.visibleSince(startMark);

          // Only the recent tail is classified: old permission text persists
          // in the full capture and would otherwise re-trigger forever.
          const tail = screen.slice(-2500);
          const tailSignature = screen.slice(-500);

          // A numbered list with at least two options that is not a
          // permission dialog counts as the skill's own AskUserQuestion.
          const looksLikeSkillQuestion =
            isNumberedOptionListVisible(tail) &&
            parseNumberedOptions(tail).length >= 2 &&
            !isPermissionDialogVisible(tail);
          if (looksLikeSkillQuestion) {
            outcome = 'real_question';
            evidence = screen.slice(-3000);
            break;
          }

          // Permission dialog: answer with option 1, once per unique rendering.
          if (isPermissionDialogVisible(tail) && tailSignature !== lastGrantedDialog) {
            lastGrantedDialog = tailSignature;
            session.send('1\r');
            await Bun.sleep(1500);
            continue;
          }

          // Plan-ready terminal state is also acceptable (the skill ran
          // end-to-end and surfaced claude's "Ready to execute" prompt).
          // This check looks at the whole capture, not just the tail.
          if (isPlanReadyVisible(screen)) {
            outcome = 'plan_ready';
            evidence = screen.slice(-3000);
            break;
          }
        }

        // Captured on every path out of the loop, including timeout.
        debugBuffer = session.visibleSince(startMark).slice(-4000);
      } finally {
        await session.close();
      }

      // PASS requires real_question or plan_ready, AND evidence that does
      // not echo the early-exit phrase.
      if (outcome !== 'real_question' && outcome !== 'plan_ready') {
        throw new Error(
          `plan-design-review with UI scope FAILED: outcome=${outcome}\n--- buffer at timeout (last 4KB) ---\n${debugBuffer || evidence}`,
        );
      }

      const NO_UI_PHRASE = /no\s+UI\s+scope|isn'?t\s+applicable/i;
      const echoedEarlyExit = NO_UI_PHRASE.test(evidence);
      if (echoedEarlyExit) {
        throw new Error(
          `plan-design-review early-exited despite UI-heavy fixture.\n--- evidence (last 3KB) ---\n${evidence}`,
        );
      }
    },
    540_000,
  );
});
|