mirror of
https://github.com/garrytan/gstack.git
synced 2026-05-06 13:45:35 +02:00
2b1a0da7c1
skill-e2e-auq-format-compliance.test.ts (~$0.50/run, 90-130s): - Asserts /plan-ceo-review's first AUQ contains all 7 mandated format elements (ELI10, Recommendation, Pros/Cons with ✅/❌, Net, (recommended) label). Catches drift in the shared preamble resolver that previously took weeks to notice. - Auto-grants permission dialogs that fire during preamble side-effects (touch on .feature-prompted markers in fresh user environments). - Verified PASS in 126s. skill-e2e-plan-design-with-ui.test.ts (~$0.80/run, 50-90s): - Counterpart to the existing no-UI early-exit test. When the input plan DOES describe UI changes, /plan-design-review must NOT early-exit and must reach a real skill AUQ. - Sends the slash command without args, then a follow-up message with the UI-heavy plan description (Claude Code rejects unknown trailing args). Asserts evidence does NOT contain "no UI scope". - Verified PASS in 54s. skill-budget-regression.test.ts (free, gate): - Library-only assertion. Reads the most recent eval file, finds the prior same-branch run via findPreviousRun, computes ComparisonResult, asserts no test exceeded 2× tools or turns. - Branch-scoped: skips with reason if the latest eval was produced on a different branch (cross-branch comparison would be noise). - First-run grace (vacuous pass) when no prior data exists. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
144 lines
5.4 KiB
TypeScript
144 lines
5.4 KiB
TypeScript
/**
 * /plan-design-review with UI scope (gate, paid, real-PTY).
 *
 * Counterpart to the existing no-UI early-exit test. When the input plan
 * DOES describe UI changes, /plan-design-review must NOT early-exit and
 * must reach a real skill numbered-option AUQ (its first design-rating
 * question), with the captured evidence NOT echoing the early-exit phrase.
 *
 * Why: today we only test the negative path (no-UI → early-exit). A
 * regression that flips the UI-detection logic — making EVERY plan early-
 * exit — would pass the no-UI test (vacuously) and ship undetected. This
 * test is the positive coverage.
 *
 * How: launch claude in plan mode in the gstack repo cwd (so the skill
 * registry is loaded). Send /plan-design-review on its own, then a
 * follow-up message that describes the UI-heavy plan and references the
 * fixture path (Claude Code rejects slash commands with trailing
 * arguments unless the skill defines them). Drive past permission
 * dialogs. Wait for a numbered-option list that is NOT a permission
 * dialog, or for the plan-ready prompt. Assert evidence does NOT contain
 * the early-exit phrase ("no UI scope" / "isn't applicable").
 */
|
|
|
|
import { describe, test } from 'bun:test';
|
|
import * as path from 'path';
|
|
import {
|
|
launchClaudePty,
|
|
isNumberedOptionListVisible,
|
|
isPermissionDialogVisible,
|
|
parseNumberedOptions,
|
|
isPlanReadyVisible,
|
|
} from './helpers/claude-pty-runner';
|
|
|
|
// Paid E2E: the suite is only registered for real when EVALS is set and
// EVALS_TIER is exactly 'gate'; otherwise it is registered as skipped.
const { EVALS, EVALS_TIER } = process.env;
const shouldRun = Boolean(EVALS) && EVALS_TIER === 'gate';
const describeE2E = shouldRun ? describe : describe.skip;

// ROOT is the parent of this file's directory; it is used as the cwd of the
// claude session and as the base for the fixture's relative path.
const ROOT = path.resolve(import.meta.dir, '..');
const FIXTURE = path.join(ROOT, 'test', 'fixtures', 'plans', 'ui-heavy-feature.md');
|
|
|
|
describeE2E('/plan-design-review with UI scope (gate)', () => {
  /**
   * Positive-path coverage for /plan-design-review's UI detection.
   *
   * Drives a real claude PTY session in plan mode, sends the skill command
   * followed by a UI-heavy plan description, and polls the visible buffer
   * until one of four outcomes is reached:
   *   - 'real_auq'   : a numbered-option list that is not a permission dialog
   *   - 'plan_ready' : the plan-ready prompt is visible
   *   - 'exited'     : the session process exited
   *   - 'timeout'    : the polling budget ran out
   *
   * Fails (throws) on 'exited' / 'timeout', or when the captured evidence
   * contains the no-UI early-exit phrase.
   */
  test(
    'reaches a real skill AUQ (or plan_ready) without echoing the no-UI early-exit phrase',
    async () => {
      // Phrases that indicate the skill took the no-UI early-exit path.
      const NO_UI_PHRASE = /no\s+UI\s+scope|isn'?t\s+applicable/i;

      const fixtureRelPath = path.relative(ROOT, FIXTURE);

      const session = await launchClaudePty({
        permissionMode: 'plan',
        cwd: ROOT,
        timeoutMs: 480_000,
      });

      let outcome: 'real_auq' | 'plan_ready' | 'timeout' | 'exited' = 'timeout';
      let evidence = '';
      let debugBuffer = ''; // captured after polling so the failure error has data

      try {
        // NOTE(review): presumably gives the TUI time to finish starting
        // before any input is sent — confirm against the PTY helper.
        await Bun.sleep(8000);
        // All reads below go through visibleSince(since).
        const since = session.mark();

        // Send the slash command alone first; then provide the UI-heavy
        // plan content as a follow-up message. Claude Code rejects slash
        // commands with trailing arguments unless the skill defines them.
        session.send('/plan-design-review\r');
        await Bun.sleep(3000);
        session.send(
          `Please review this plan for UI scope:\n\n` +
            `Title: User Dashboard Page\n` +
            `New React page UserDashboard.tsx with three subcomponents: ` +
            `ActivityFeed, NotificationsPanel, QuickActions. ` +
            `Tailwind CSS responsive layout (mobile/desktop breakpoints), ` +
            `loading skeletons, empty states, hover states on every interactive element, ` +
            `modal dialog for "mark all read", toast notifications for action feedback. ` +
            `Reference plan file: ${fixtureRelPath}\r`
        );

        const budgetMs = 360_000;
        const start = Date.now();
        // Tail of the buffer at the moment a permission dialog was last
        // granted; used to avoid answering the same rendering twice.
        let lastPermSig = '';

        while (Date.now() - start < budgetMs) {
          await Bun.sleep(2500);

          if (session.exited()) {
            outcome = 'exited';
            evidence = session.visibleSince(since).slice(-3000);
            break;
          }

          const visible = session.visibleSince(since);

          // Classify the recent tail only — old permission text persists
          // in visibleSince(since) and would otherwise re-trigger forever.
          const recentTail = visible.slice(-2500);

          // Real skill AUQ visible (not a permission dialog)?
          if (
            isNumberedOptionListVisible(recentTail) &&
            parseNumberedOptions(recentTail).length >= 2 &&
            !isPermissionDialogVisible(recentTail)
          ) {
            outcome = 'real_auq';
            evidence = visible.slice(-3000);
            break;
          }

          // Permission dialog: grant once per unique rendering.
          if (isPermissionDialogVisible(recentTail)) {
            const sig = visible.slice(-500);
            if (sig !== lastPermSig) {
              lastPermSig = sig;
              session.send('1\r');
              await Bun.sleep(1500);
              continue;
            }
            // Same rendering as the one already answered: fall through to
            // the plan-ready check instead of sending another keystroke.
          }

          // Plan-ready terminal — also acceptable (skill ran end-to-end
          // and surfaced claude's "Ready to execute" prompt).
          if (isPlanReadyVisible(visible)) {
            outcome = 'plan_ready';
            evidence = visible.slice(-3000);
            break;
          }
        }

        // Capture buffer state once polling ends (any outcome) so a
        // failure error has diagnostic data.
        debugBuffer = session.visibleSince(since).slice(-4000);
      } finally {
        // Always tear the PTY session down, even if polling threw.
        await session.close();
      }

      // PASS: real_auq or plan_ready, AND evidence does NOT echo the
      // early-exit phrase.
      if (outcome === 'exited' || outcome === 'timeout') {
        // The label is outcome-neutral: this branch also fires when the
        // session exited, not only when the budget ran out.
        throw new Error(
          `plan-design-review with UI scope FAILED: outcome=${outcome}\n` +
            `--- buffer at end of polling (last 4KB) ---\n${debugBuffer || evidence}`,
        );
      }

      if (NO_UI_PHRASE.test(evidence)) {
        throw new Error(
          `plan-design-review early-exited despite UI-heavy fixture.\n` +
            `--- evidence (last 3KB) ---\n${evidence}`,
        );
      }
    },
    540_000,
  );
});
|