diff --git a/test/skill-budget-regression.test.ts b/test/skill-budget-regression.test.ts new file mode 100644 index 00000000..651f0918 --- /dev/null +++ b/test/skill-budget-regression.test.ts @@ -0,0 +1,148 @@ +/** + * Tool-budget regression test (gate, free). + * + * Asserts: no test in the most recent eval run grew its tool calls or + * turns by more than 2× vs the prior recorded run. Pure library — does + * not spawn `claude` or pay any API cost. Reads the project eval dir + * (~/.gstack/projects//evals/) and compares the latest run against + * its predecessor. + * + * First-run grace: if there's no prior run, the test passes vacuously. + * The purpose is to catch a SECOND-run regression — a real-world scenario + * is "preamble change shipped, /qa eval went from 30 tool calls to 90". + * + * Why two metrics (tools and turns): a regression that adds tool calls + * usually reflects an inefficient skill prompt; a regression that adds + * turns reflects a skill that is hesitating or losing track. Either is + * worth catching. We use a noise floor (5 tool calls / 3 turns) to + * avoid flagging tests that started tiny and got slightly bigger. + * + * Override: GSTACK_BUDGET_RATIO= (default 2.0). + * + * Skipping: only the gate-level CI-blocking variant runs in EVALS_TIER=gate. + * The same logic runs anywhere `bun test` is invoked because comparison + * is free — no LLM cost. + */ + +import { describe, test } from 'bun:test'; +import { spawnSync } from 'child_process'; +import * as fs from 'fs'; +import * as path from 'path'; +import { + getProjectEvalDir, + findPreviousRun, + compareEvalResults, + assertNoBudgetRegression, + type EvalResult, +} from './helpers/eval-store'; + +function currentGitBranch(): string { + try { + const result = spawnSync('git', ['rev-parse', '--abbrev-ref', 'HEAD'], { + stdio: 'pipe', timeout: 3000, + }); + return result.stdout?.toString().trim() || 'unknown'; + } catch { + return 'unknown'; + } +} + +interface LatestRun { + filepath: string; + result: EvalResult; +} + +/** Find the most recent finalized (non-_partial) eval file for a tier. */ +function findLatestRun(evalDir: string, tier: 'e2e' | 'llm-judge'): LatestRun | null { + let entries: string[]; + try { + entries = fs.readdirSync(evalDir); + } catch { + return null; + } + const candidates: Array<{ filepath: string; timestamp: string }> = []; + for (const f of entries) { + if (!f.endsWith('.json')) continue; + if (f.startsWith('_partial')) continue; + const fullPath = path.join(evalDir, f); + try { + const data = JSON.parse(fs.readFileSync(fullPath, 'utf-8')) as EvalResult; + if (data.tier !== tier) continue; + candidates.push({ filepath: fullPath, timestamp: data.timestamp ?? '' }); + } catch { /* ignore corrupt */ } + } + if (candidates.length === 0) return null; + candidates.sort((a, b) => b.timestamp.localeCompare(a.timestamp)); + const top = candidates[0]!; + return { + filepath: top.filepath, + result: JSON.parse(fs.readFileSync(top.filepath, 'utf-8')) as EvalResult, + }; +} + +function checkTier(tier: 'e2e' | 'llm-judge'): void { + const evalDir = getProjectEvalDir(); + const latest = findLatestRun(evalDir, tier); + if (!latest) { + // eslint-disable-next-line no-console + console.log(`[budget-regression:${tier}] no current run in ${evalDir} — skipping`); + return; + } + // Branch alignment: only assert when the latest eval was actually + // produced by THIS checkout's branch. Cross-branch comparison would + // measure noise from unrelated work. Pre-existing eval history from + // other branches is not our regression to fix. + const myBranch = currentGitBranch(); + if (latest.result.branch !== myBranch) { + // eslint-disable-next-line no-console + console.log( + `[budget-regression:${tier}] latest eval is from "${latest.result.branch}" ` + + `but current branch is "${myBranch}" — skipping (run evals on this branch first)`, + ); + return; + } + const branch = latest.result.branch; + const priorPath = findPreviousRun(evalDir, tier, branch, latest.filepath); + if (!priorPath) { + // eslint-disable-next-line no-console + console.log(`[budget-regression:${tier}] no prior run found — first-run grace`); + return; + } + let prior: EvalResult; + try { + prior = JSON.parse(fs.readFileSync(priorPath, 'utf-8')) as EvalResult; + } catch (err) { + // eslint-disable-next-line no-console + console.warn(`[budget-regression:${tier}] could not read prior ${priorPath}: ${(err as Error).message}`); + return; + } + // Branch-scoped: only compare same-branch history. Cross-branch + // comparison is noisy (different branches do different work). If + // findPreviousRun fell back to another branch, treat as no prior. + if (prior.branch !== branch) { + // eslint-disable-next-line no-console + console.log( + `[budget-regression:${tier}] no same-branch prior (latest on "${branch}", prior on "${prior.branch}") — skipping`, + ); + return; + } + const comparison = compareEvalResults(prior, latest.result, priorPath, latest.filepath); + // Throws on regression. + assertNoBudgetRegression(comparison); + // eslint-disable-next-line no-console + console.log( + `[budget-regression:${tier}] OK — ${comparison.deltas.length} test(s) compared, ` + + `${comparison.tool_count_before}→${comparison.tool_count_after} tools, ` + + `cost Δ $${comparison.total_cost_delta.toFixed(2)}`, + ); +} + +describe('tool budget regression (gate, free)', () => { + test('no e2e test exceeds 2× prior tool calls or turns', () => { + checkTier('e2e'); + }); + + test('no llm-judge test exceeds 2× prior tool calls or turns', () => { + checkTier('llm-judge'); + }); +}); diff --git a/test/skill-e2e-auq-format-compliance.test.ts b/test/skill-e2e-auq-format-compliance.test.ts new file mode 100644 index 00000000..233246a0 --- /dev/null +++ b/test/skill-e2e-auq-format-compliance.test.ts @@ -0,0 +1,196 @@ +/** + * AskUserQuestion format-compliance smoke (gate, paid, real-PTY). + * + * Asserts: when /plan-ceo-review fires its first AskUserQuestion in plan + * mode, the rendered TTY output contains every element the preamble + * format spec mandates (scripts/resolvers/preamble/generate-ask-user-format.ts + * + voice directive): + * + * 1. ELI10 prose paragraph + * 2. "Recommendation:" line + * 3. Pros/Cons header + * 4. ✅ pro bullet AND ❌ con bullet + * 5. "Net:" closer line + * 6. "(recommended)" label on one option + * + * Why real-PTY: the existing skill-e2e-plan-format tests cover what the + * AGENT writes via the SDK (capture-to-file harness). This test covers + * what the USER actually sees in the terminal — different bug class + * (e.g., AUQ tool truncates long prose, conductor renderer mangles + * bullets, model collapses sections under token pressure). Two layers + * of defense for a format-discipline regression that previously ate ~6 + * weeks of compliance drift before it was noticed. + * + * Trigger choice: /plan-ceo-review fires its mode-selection AUQ + * deterministically and early (Step 0F), so we don't need to drive + * through any prior questions to reach a format check. + * + * See test/helpers/claude-pty-runner.ts for runner internals. + */ + +import { describe, test, expect } from 'bun:test'; +import { + launchClaudePty, + isNumberedOptionListVisible, + isPermissionDialogVisible, + parseNumberedOptions, +} from './helpers/claude-pty-runner'; + +const shouldRun = !!process.env.EVALS && process.env.EVALS_TIER === 'gate'; +const describeE2E = shouldRun ? describe : describe.skip; + +// Format predicates. Permissive on whitespace and capitalization. +// Tightening these is V2 if real drift is observed. +const ELI10_RE = /ELI10\s*:/i; +const RECOMMEND_RE = /Recommendation\s*:/i; +const PROS_CONS_RE = /Pros\s*\/\s*cons\s*:/i; +const PRO_BULLET_RE = /✅/; +const CON_BULLET_RE = /❌/; +const NET_LINE_RE = /^[\s|]*Net\s*:/im; +const RECOMMENDED_LBL = /\(recommended\)/i; + +interface FormatGap { + field: string; + re: RegExp; +} + +function findFormatGaps(visible: string): FormatGap[] { + const checks: FormatGap[] = [ + { field: 'ELI10:', re: ELI10_RE }, + { field: 'Recommendation:', re: RECOMMEND_RE }, + { field: 'Pros / cons:', re: PROS_CONS_RE }, + { field: '✅ pro bullet', re: PRO_BULLET_RE }, + { field: '❌ con bullet', re: CON_BULLET_RE }, + { field: 'Net:', re: NET_LINE_RE }, + { field: '(recommended) label', re: RECOMMENDED_LBL }, + ]; + return checks.filter(c => !c.re.test(visible)); +} + +describeE2E('AskUserQuestion format compliance (gate)', () => { + test( + 'first AUQ from /plan-ceo-review contains all 7 mandated format elements', + async () => { + const session = await launchClaudePty({ + permissionMode: 'plan', + timeoutMs: 360_000, + }); + + try { + // Boot grace + auto trust-dialog handler. + await Bun.sleep(8000); + const since = session.mark(); + session.send('/plan-ceo-review\r'); + + // Wait for a SKILL AUQ. Strategy: poll the visible buffer until it + // contains both a numbered-option list AND the format markers we + // expect (ELI10 + Recommendation). When both are present, it IS a + // real format-compliant AUQ — not a permission dialog or trust + // prompt. + // + // While polling, auto-grant any permission dialogs we see in the + // recent tail (preamble side-effects: touch on a sensitive file, + // etc) so the agent isn't blocked. + const budgetMs = 300_000; + const start = Date.now(); + let captured = ''; + let auqVisible = false; + let lastPermSig = ''; + // Snapshot debug counters every poll so the timeout error shows + // WHY we never matched (cursor-found vs markers-found discrepancy). + let debugCursorSeen = 0; + let debugMarkersSeen = 0; + let debugBothSeen = 0; + + while (Date.now() - start < budgetMs) { + await Bun.sleep(2000); + if (session.exited()) { + throw new Error( + `claude exited (code=${session.exitCode()}) before AUQ rendered.\n` + + `Last visible:\n${session.visibleSince(since).slice(-2000)}`, + ); + } + const visible = session.visibleSince(since); + // Marker check: anywhere in the post-slash region. Since `since` + // is set right after sending /plan-ceo-review, there's no stale + // AUQ above this line — the only AUQ that can produce these + // markers is the current one. + const hasEli10 = /ELI10\s*:/i.test(visible); + const hasRecommend = /Recommendation\s*:/i.test(visible); + + // Cursor check: a numbered option list near the bottom of the + // buffer means the AUQ is currently rendered (not scrolled away). + const cursorTail = visible.slice(-4000); + const hasCursor = isNumberedOptionListVisible(cursorTail) && + parseNumberedOptions(cursorTail).length >= 2; + + if (hasCursor) debugCursorSeen++; + if (hasEli10 && hasRecommend) debugMarkersSeen++; + + // Permission dialog branch: grant once per unique rendering, but + // only when we don't already have format markers visible (so we + // don't accidentally grant a permission inside a real AUQ). + if ( + hasCursor && + !(hasEli10 && hasRecommend) && + isPermissionDialogVisible(cursorTail) + ) { + const sig = visible.slice(-500); + if (sig !== lastPermSig) { + lastPermSig = sig; + session.send('1\r'); + await Bun.sleep(1500); + continue; + } + } + + // Real AUQ check: cursor visible AND markers present anywhere in + // the post-slash region. + if (hasCursor && hasEli10 && hasRecommend) { + debugBothSeen++; + captured = visible; + auqVisible = true; + break; + } + } + if (!auqVisible) { + throw new Error( + `AUQ not rendered within ${budgetMs}ms.\n` + + `Debug counts: cursorSeen=${debugCursorSeen} markersSeen=${debugMarkersSeen} bothSeen=${debugBothSeen}\n` + + `Last visible (4KB):\n${session.visibleSince(since).slice(-4000)}`, + ); + } + const gaps = findFormatGaps(captured); + if (gaps.length > 0) { + // Surface the captured text last 3KB on failure for debugging. + const tail = captured.slice(-3000); + throw new Error( + `AUQ format compliance FAILED — missing ${gaps.length} mandated field(s):\n` + + gaps.map(g => ` - ${g.field} (regex: ${g.re.source})`).join('\n') + + `\n--- captured (last 3KB) ---\n${tail}`, + ); + } + + // Sanity: the parsed option list contains at least 2 options and + // one of them carries the (recommended) marker. + const opts = parseNumberedOptions(captured); + expect(opts.length).toBeGreaterThanOrEqual(2); + const hasRecommended = opts.some(o => /\(recommended\)/i.test(o.label)); + if (!hasRecommended) { + // It's also acceptable for the (recommended) marker to live in + // prose above the box (some renderers wrap labels). The text-level + // RECOMMENDED_LBL check above already covers that case. + // Surface a friendlier message if the box itself missed it. + // (This is non-fatal because findFormatGaps already passed.) + // eslint-disable-next-line no-console + console.warn( + '(recommended) label appears in prose but not on a parsed option label — acceptable but watch for drift', + ); + } + } finally { + await session.close(); + } + }, + 420_000, + ); +}); diff --git a/test/skill-e2e-plan-design-with-ui.test.ts b/test/skill-e2e-plan-design-with-ui.test.ts new file mode 100644 index 00000000..bb56b143 --- /dev/null +++ b/test/skill-e2e-plan-design-with-ui.test.ts @@ -0,0 +1,143 @@ +/** + * /plan-design-review with UI scope (gate, paid, real-PTY). + * + * Counterpart to the existing no-UI early-exit test. When the input plan + * DOES describe UI changes, /plan-design-review must NOT early-exit and + * must reach a real skill numbered-option AUQ (its first design-rating + * question), with the captured evidence NOT echoing the early-exit phrase. + * + * Why: today we only test the negative path (no-UI → early-exit). A + * regression that flips the UI-detection logic — making EVERY plan early- + * exit — would pass the no-UI test (vacuously) and ship undetected. This + * test is the positive coverage. + * + * How: launch claude in plan mode in the gstack repo cwd (so the skill + * registry is loaded). Send /plan-design-review with the fixture path + * inline so the skill reviews the UI-heavy plan rather than git diff or + * .claude/plans/. Drive past permission dialogs. Wait for a numbered- + * option list that is NOT a permission dialog. Assert evidence does NOT + * contain "no UI scope". + */ + +import { describe, test } from 'bun:test'; +import * as path from 'path'; +import { + launchClaudePty, + isNumberedOptionListVisible, + isPermissionDialogVisible, + parseNumberedOptions, + isPlanReadyVisible, +} from './helpers/claude-pty-runner'; + +const shouldRun = !!process.env.EVALS && process.env.EVALS_TIER === 'gate'; +const describeE2E = shouldRun ? describe : describe.skip; + +const ROOT = path.resolve(import.meta.dir, '..'); +const FIXTURE = path.join(ROOT, 'test', 'fixtures', 'plans', 'ui-heavy-feature.md'); + +describeE2E('/plan-design-review with UI scope (gate)', () => { + test( + 'reaches a real skill AUQ (or plan_ready) without echoing the no-UI early-exit phrase', + async () => { + const fixtureRelPath = path.relative(ROOT, FIXTURE); + + const session = await launchClaudePty({ + permissionMode: 'plan', + cwd: ROOT, + timeoutMs: 480_000, + }); + + let outcome: 'real_auq' | 'plan_ready' | 'timeout' | 'exited' = 'timeout'; + let evidence = ''; + let debugBuffer = ''; // captured at end so timeout error has data + + try { + await Bun.sleep(8000); + const since = session.mark(); + // Send the slash command alone first; then provide the UI-heavy + // plan content as a follow-up message. Claude Code rejects slash + // commands with trailing arguments unless the skill defines them. + session.send('/plan-design-review\r'); + await Bun.sleep(3000); + session.send( + `Please review this plan for UI scope:\n\n` + + `Title: User Dashboard Page\n` + + `New React page UserDashboard.tsx with three subcomponents: ` + + `ActivityFeed, NotificationsPanel, QuickActions. ` + + `Tailwind CSS responsive layout (mobile/desktop breakpoints), ` + + `loading skeletons, empty states, hover states on every interactive element, ` + + `modal dialog for "mark all read", toast notifications for action feedback. ` + + `Reference plan file: ${fixtureRelPath}\r` + ); + + const budgetMs = 360_000; + const start = Date.now(); + let lastPermSig = ''; + while (Date.now() - start < budgetMs) { + await Bun.sleep(2500); + if (session.exited()) { + outcome = 'exited'; + evidence = session.visibleSince(since).slice(-3000); + break; + } + const visible = session.visibleSince(since); + + // Classify the recent tail only — old permission text persists + // in visibleSince(since) and would otherwise re-trigger forever. + const recentTail = visible.slice(-2500); + + // Real skill AUQ visible (not a permission dialog)? + if ( + isNumberedOptionListVisible(recentTail) && + parseNumberedOptions(recentTail).length >= 2 && + !isPermissionDialogVisible(recentTail) + ) { + outcome = 'real_auq'; + evidence = visible.slice(-3000); + break; + } + + // Permission dialog: grant once per unique rendering. + if (isPermissionDialogVisible(recentTail)) { + const sig = visible.slice(-500); + if (sig !== lastPermSig) { + lastPermSig = sig; + session.send('1\r'); + await Bun.sleep(1500); + continue; + } + } + + // Plan-ready terminal — also acceptable (skill ran end-to-end + // and surfaced claude's "Ready to execute" prompt). + if (isPlanReadyVisible(visible)) { + outcome = 'plan_ready'; + evidence = visible.slice(-3000); + break; + } + } + // Capture buffer state at end so a timeout error has diagnostic data. + debugBuffer = session.visibleSince(since).slice(-4000); + } finally { + await session.close(); + } + + // PASS: real_auq or plan_ready, AND evidence does NOT echo the + // early-exit phrase. + if (outcome === 'exited' || outcome === 'timeout') { + throw new Error( + `plan-design-review with UI scope FAILED: outcome=${outcome}\n` + + `--- buffer at timeout (last 4KB) ---\n${debugBuffer || evidence}`, + ); + } + const NO_UI_PHRASE = /no\s+UI\s+scope|isn'?t\s+applicable/i; + if (NO_UI_PHRASE.test(evidence)) { + throw new Error( + `plan-design-review early-exited despite UI-heavy fixture.\n` + + `--- evidence (last 3KB) ---\n${evidence}`, + ); + } + }, + 540_000, + ); +});