feat(test): 3 gate-tier real-PTY E2E tests

skill-e2e-auq-format-compliance.test.ts (~$0.50/run, 90-130s):
- Asserts /plan-ceo-review's first AUQ contains all 7 mandated format
  elements (ELI10, Recommendation, Pros/Cons header, ✅ pro bullet,
  ❌ con bullet, Net, (recommended) label). Catches drift in the shared
  preamble resolver that previously took weeks to notice.
- Auto-grants permission dialogs that fire during preamble side-effects
  (touch on .feature-prompted markers in fresh user environments).
- Verified PASS in 126s.

skill-e2e-plan-design-with-ui.test.ts (~$0.80/run, 50-90s):
- Counterpart to the existing no-UI early-exit test. When the input plan
  DOES describe UI changes, /plan-design-review must NOT early-exit and
  must reach a real skill AUQ.
- Sends the slash command without args, then a follow-up message with
  the UI-heavy plan description (Claude Code rejects unknown trailing
  args). Asserts evidence does NOT contain "no UI scope".
- Verified PASS in 54s.

skill-budget-regression.test.ts (free, gate):
- Library-only assertion. Reads the most recent eval file, finds the
  prior same-branch run via findPreviousRun, computes ComparisonResult,
  asserts no test exceeded 2× tools or turns.
- Branch-scoped: skips with reason if the latest eval was produced on
  a different branch (cross-branch comparison would be noise).
- First-run grace (vacuous pass) when no prior data exists.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Garry Tan
2026-04-26 04:36:35 -07:00
parent 137b11f39a
commit 2b1a0da7c1
3 changed files with 487 additions and 0 deletions
+148
View File
@@ -0,0 +1,148 @@
/**
* Tool-budget regression test (gate, free).
*
* Asserts: no test in the most recent eval run grew its tool calls or
* turns by more than 2× vs the prior recorded run. Pure library — does
* not spawn `claude` or pay any API cost. Reads the project eval dir
* (~/.gstack/projects/<slug>/evals/) and compares the latest run against
* its predecessor.
*
* First-run grace: if there's no prior run, the test passes vacuously.
* The purpose is to catch a SECOND-run regression — a real-world scenario
* is "preamble change shipped, /qa eval went from 30 tool calls to 90".
*
* Why two metrics (tools and turns): a regression that adds tool calls
* usually reflects an inefficient skill prompt; a regression that adds
* turns reflects a skill that is hesitating or losing track. Either is
* worth catching. We use a noise floor (5 tool calls / 3 turns) to
* avoid flagging tests that started tiny and got slightly bigger.
*
* Override: GSTACK_BUDGET_RATIO=<n> (default 2.0).
*
* Skipping: none. Unlike the paid gate-tier E2E tests, this file is not
* gated on EVALS / EVALS_TIER: it runs anywhere `bun test` is invoked,
* because the comparison is free — no LLM cost.
*/
import { describe, test } from 'bun:test';
import { spawnSync } from 'child_process';
import * as fs from 'fs';
import * as path from 'path';
import {
getProjectEvalDir,
findPreviousRun,
compareEvalResults,
assertNoBudgetRegression,
type EvalResult,
} from './helpers/eval-store';
/**
 * Name of the branch checked out in the current working directory, obtained
 * from `git rev-parse --abbrev-ref HEAD`.
 *
 * @returns The trimmed stdout of the git call, or 'unknown' when git produced
 *          no output or the spawn itself threw.
 */
function currentGitBranch(): string {
  const FALLBACK = 'unknown';
  let branchName: string | undefined;
  try {
    const git = spawnSync('git', ['rev-parse', '--abbrev-ref', 'HEAD'], {
      stdio: 'pipe',
      timeout: 3000,
    });
    branchName = git.stdout?.toString().trim();
  } catch {
    return FALLBACK;
  }
  // Empty or missing stdout (e.g. git failed) maps to the fallback.
  return branchName ? branchName : FALLBACK;
}
/** An eval file on disk paired with its parsed contents (the shape findLatestRun returns). */
interface LatestRun {
/** Path of the eval JSON file; findLatestRun builds it by joining the eval dir and the file name. */
filepath: string;
/** Contents of that file, JSON-parsed and typed as EvalResult (the shape is not validated at runtime). */
result: EvalResult;
}
/**
 * Find the most recent finalized (non-_partial) eval file for a tier.
 *
 * Scans `evalDir` for `*.json` files, skips names starting with `_partial`
 * and anything that cannot be read or parsed as a JSON object, keeps the
 * entries whose `tier` matches, and picks the one with the greatest
 * `timestamp` string. Each file is read and parsed exactly once, so the
 * returned `result` is the same data the selection was based on.
 *
 * @param evalDir Directory to scan. A missing or unreadable directory yields null.
 * @param tier    Eval tier to select.
 * @returns The newest matching run (path + parsed contents), or null when no
 *          file matches. On equal timestamps the entry listed first wins.
 */
function findLatestRun(evalDir: string, tier: 'e2e' | 'llm-judge'): LatestRun | null {
  let entries: string[];
  try {
    entries = fs.readdirSync(evalDir);
  } catch {
    return null;
  }

  let newest: { filepath: string; timestamp: string; result: EvalResult } | null = null;
  for (const name of entries) {
    if (!name.endsWith('.json') || name.startsWith('_partial')) continue;

    const filepath = path.join(evalDir, name);
    let parsed: unknown;
    try {
      parsed = JSON.parse(fs.readFileSync(filepath, 'utf-8'));
    } catch {
      continue; // unreadable or corrupt file — ignore
    }
    // Valid JSON that is not an object (null, number, string) is not an eval file.
    if (typeof parsed !== 'object' || parsed === null) continue;

    const result = parsed as EvalResult;
    if (result.tier !== tier) continue;

    // A missing or non-string timestamp sorts as oldest instead of crashing
    // the comparison.
    const rawTimestamp: unknown = result.timestamp;
    const timestamp = typeof rawTimestamp === 'string' ? rawTimestamp : '';

    if (newest === null || timestamp.localeCompare(newest.timestamp) > 0) {
      newest = { filepath, timestamp, result };
    }
  }

  return newest === null ? null : { filepath: newest.filepath, result: newest.result };
}
/**
 * Budget-regression check for a single eval tier.
 *
 * Loads the newest finalized eval for `tier`, locates its predecessor via
 * findPreviousRun, and hands the comparison to assertNoBudgetRegression.
 * Returns quietly (after logging the reason) when there is nothing
 * meaningful to compare:
 *   - no eval file exists for this tier,
 *   - the newest eval was recorded on a branch other than the current checkout's,
 *   - no prior run exists (first-run grace),
 *   - the prior file cannot be read or parsed,
 *   - the prior run was recorded on a different branch.
 *
 * @param tier Eval tier to check.
 * @throws Whatever assertNoBudgetRegression throws when it detects a regression.
 */
function checkTier(tier: 'e2e' | 'llm-judge'): void {
  const evalDir = getProjectEvalDir();
  const latest = findLatestRun(evalDir, tier);
  if (!latest) {
    // eslint-disable-next-line no-console
    console.log(`[budget-regression:${tier}] no current run in ${evalDir} — skipping`);
    return;
  }

  // Branch alignment: only assert when the latest eval was actually
  // produced by THIS checkout's branch. Cross-branch comparison would
  // measure noise from unrelated work. Pre-existing eval history from
  // other branches is not our regression to fix.
  const myBranch = currentGitBranch();
  if (latest.result.branch !== myBranch) {
    // eslint-disable-next-line no-console
    console.log(
      `[budget-regression:${tier}] latest eval is from "${latest.result.branch}" ` +
        `but current branch is "${myBranch}" — skipping (run evals on this branch first)`,
    );
    return;
  }

  const branch = latest.result.branch;
  const priorPath = findPreviousRun(evalDir, tier, branch, latest.filepath);
  if (!priorPath) {
    // eslint-disable-next-line no-console
    console.log(`[budget-regression:${tier}] no prior run found — first-run grace`);
    return;
  }

  let prior: EvalResult;
  try {
    prior = JSON.parse(fs.readFileSync(priorPath, 'utf-8')) as EvalResult;
  } catch (err) {
    // Best-effort: an unreadable prior is reported but does not fail the gate.
    // eslint-disable-next-line no-console
    console.warn(`[budget-regression:${tier}] could not read prior ${priorPath}: ${(err as Error).message}`);
    return;
  }

  // Branch-scoped: only compare same-branch history. Cross-branch
  // comparison is noisy (different branches do different work). If
  // findPreviousRun fell back to another branch, treat as no prior.
  if (prior.branch !== branch) {
    // eslint-disable-next-line no-console
    console.log(
      `[budget-regression:${tier}] no same-branch prior (latest on "${branch}", prior on "${prior.branch}") — skipping`,
    );
    return;
  }

  const comparison = compareEvalResults(prior, latest.result, priorPath, latest.filepath);
  // Throws on regression.
  assertNoBudgetRegression(comparison);

  // The before/after counts need a separator; without it 30 and 90 would
  // print as "3090 tools".
  // eslint-disable-next-line no-console
  console.log(
    `[budget-regression:${tier}] OK — ${comparison.deltas.length} test(s) compared, ` +
      `${comparison.tool_count_before} → ${comparison.tool_count_after} tools, ` +
      `cost Δ $${comparison.total_cost_delta.toFixed(2)}`,
  );
}
// Registers one test per eval tier. Each test only delegates to checkTier,
// so any error checkTier throws fails that tier's test.
describe('tool budget regression (gate, free)', () => {
  const tiers = ['e2e', 'llm-judge'] as const;
  for (const tier of tiers) {
    test(`no ${tier} test exceeds 2× prior tool calls or turns`, () => {
      checkTier(tier);
    });
  }
});
@@ -0,0 +1,196 @@
/**
* AskUserQuestion format-compliance smoke (gate, paid, real-PTY).
*
* Asserts: when /plan-ceo-review fires its first AskUserQuestion in plan
* mode, the rendered TTY output contains every element the preamble
* format spec mandates (scripts/resolvers/preamble/generate-ask-user-format.ts
* + voice directive):
*
* 1. "ELI10:" prose paragraph
* 2. "Recommendation:" line
* 3. "Pros / cons:" header
* 4. ✅ pro bullet
* 5. ❌ con bullet
* 6. "Net:" closer line
* 7. "(recommended)" label on one option
*
* Why real-PTY: the existing skill-e2e-plan-format tests cover what the
* AGENT writes via the SDK (capture-to-file harness). This test covers
* what the USER actually sees in the terminal — different bug class
* (e.g., AUQ tool truncates long prose, conductor renderer mangles
* bullets, model collapses sections under token pressure). Two layers
* of defense for a format-discipline regression that previously ate ~6
* weeks of compliance drift before it was noticed.
*
* Trigger choice: /plan-ceo-review fires its mode-selection AUQ
* deterministically and early (Step 0F), so we don't need to drive
* through any prior questions to reach a format check.
*
* See test/helpers/claude-pty-runner.ts for runner internals.
*/
import { describe, test, expect } from 'bun:test';
import {
launchClaudePty,
isNumberedOptionListVisible,
isPermissionDialogVisible,
parseNumberedOptions,
} from './helpers/claude-pty-runner';
// Paid suite: runs only when EVALS is set to a non-empty value AND
// EVALS_TIER is exactly 'gate'; otherwise it is registered via describe.skip.
const shouldRun = Boolean(process.env.EVALS) && process.env.EVALS_TIER === 'gate';
const describeE2E = !shouldRun ? describe.skip : describe;
// Format predicates. Deliberately lenient about whitespace and letter case;
// tightening them is a V2 task if real drift shows up.
const ELI10_RE = /ELI10\s*:/i;
const RECOMMEND_RE = /Recommendation\s*:/i;
const PROS_CONS_RE = /Pros\s*\/\s*cons\s*:/i;
const PRO_BULLET_RE = /✅/;
const CON_BULLET_RE = /❌/;
// "Net:" must start a line, optionally preceded by whitespace and/or '|' characters.
const NET_LINE_RE = /^[\s|]*Net\s*:/im;
const RECOMMENDED_LBL = /\(recommended\)/i;

/** One mandated format element: a display name plus the regex that detects it. */
interface FormatGap {
  field: string;
  re: RegExp;
}

/**
 * Report which mandated AUQ format elements are absent from `visible`.
 *
 * @param visible Text to scan.
 * @returns The checks whose regex did not match, in declaration order;
 *          an empty array when every element is present.
 */
function findFormatGaps(visible: string): FormatGap[] {
  const mandated: Array<[string, RegExp]> = [
    ['ELI10:', ELI10_RE],
    ['Recommendation:', RECOMMEND_RE],
    ['Pros / cons:', PROS_CONS_RE],
    ['✅ pro bullet', PRO_BULLET_RE],
    ['❌ con bullet', CON_BULLET_RE],
    ['Net:', NET_LINE_RE],
    ['(recommended) label', RECOMMENDED_LBL],
  ];

  const missing: FormatGap[] = [];
  for (const [field, re] of mandated) {
    if (re.test(visible)) continue;
    missing.push({ field, re });
  }
  return missing;
}
// Gate E2E: launches a real-PTY claude session in plan mode, sends
// /plan-ceo-review, polls until a numbered-option list is visible together
// with the ELI10 + Recommendation markers, then checks the captured text
// against every predicate in findFormatGaps. The polling predicate reuses the
// module-level regex constants so it cannot drift from the final assertion.
describeE2E('AskUserQuestion format compliance (gate)', () => {
  test(
    'first AUQ from /plan-ceo-review contains all 7 mandated format elements',
    async () => {
      const session = await launchClaudePty({
        permissionMode: 'plan',
        timeoutMs: 360_000,
      });
      try {
        // Boot grace + auto trust-dialog handler.
        await Bun.sleep(8000);
        const since = session.mark();
        session.send('/plan-ceo-review\r');

        // Wait for a SKILL AUQ. Strategy: poll the visible buffer until it
        // contains both a numbered-option list AND the format markers we
        // expect (ELI10 + Recommendation). When both are present, it IS a
        // real format-compliant AUQ — not a permission dialog or trust
        // prompt.
        //
        // While polling, auto-grant any permission dialogs we see in the
        // recent tail (preamble side-effects: touch on a sensitive file,
        // etc) so the agent isn't blocked.
        const budgetMs = 300_000;
        const start = Date.now();
        let captured = '';
        let auqVisible = false;
        let lastPermSig = '';
        // Per-poll tallies so the timeout error shows WHY we never matched
        // (cursor-found vs markers-found discrepancy).
        let debugCursorSeen = 0;
        let debugMarkersSeen = 0;
        let debugBothSeen = 0;

        while (Date.now() - start < budgetMs) {
          await Bun.sleep(2000);
          if (session.exited()) {
            throw new Error(
              `claude exited (code=${session.exitCode()}) before AUQ rendered.\n` +
                `Last visible:\n${session.visibleSince(since).slice(-2000)}`,
            );
          }
          const visible = session.visibleSince(since);

          // Marker check: anywhere in the post-slash region. Since `since`
          // is set right after sending /plan-ceo-review, there's no stale
          // AUQ above this line — the only AUQ that can produce these
          // markers is the current one.
          const hasEli10 = ELI10_RE.test(visible);
          const hasRecommend = RECOMMEND_RE.test(visible);

          // Cursor check: a numbered option list near the bottom of the
          // buffer means the AUQ is currently rendered (not scrolled away).
          const cursorTail = visible.slice(-4000);
          const hasCursor = isNumberedOptionListVisible(cursorTail) &&
            parseNumberedOptions(cursorTail).length >= 2;

          if (hasCursor) debugCursorSeen++;
          if (hasEli10 && hasRecommend) debugMarkersSeen++;

          // Permission dialog branch: grant once per unique rendering, but
          // only when we don't already have format markers visible (so we
          // don't accidentally grant a permission inside a real AUQ).
          if (
            hasCursor &&
            !(hasEli10 && hasRecommend) &&
            isPermissionDialogVisible(cursorTail)
          ) {
            // The last 500 chars act as a signature of this rendering.
            const sig = visible.slice(-500);
            if (sig !== lastPermSig) {
              lastPermSig = sig;
              session.send('1\r');
              await Bun.sleep(1500);
              continue;
            }
          }

          // Real AUQ check: cursor visible AND markers present anywhere in
          // the post-slash region.
          if (hasCursor && hasEli10 && hasRecommend) {
            debugBothSeen++;
            captured = visible;
            auqVisible = true;
            break;
          }
        }

        if (!auqVisible) {
          throw new Error(
            `AUQ not rendered within ${budgetMs}ms.\n` +
              `Debug counts: cursorSeen=${debugCursorSeen} markersSeen=${debugMarkersSeen} bothSeen=${debugBothSeen}\n` +
              `Last visible (4KB):\n${session.visibleSince(since).slice(-4000)}`,
          );
        }

        const gaps = findFormatGaps(captured);
        if (gaps.length > 0) {
          // Surface the captured text last 3KB on failure for debugging.
          const tail = captured.slice(-3000);
          throw new Error(
            `AUQ format compliance FAILED — missing ${gaps.length} mandated field(s):\n` +
              gaps.map(g => ` - ${g.field} (regex: ${g.re.source})`).join('\n') +
              `\n--- captured (last 3KB) ---\n${tail}`,
          );
        }

        // Sanity: the parsed option list contains at least 2 options and
        // one of them carries the (recommended) marker.
        const opts = parseNumberedOptions(captured);
        expect(opts.length).toBeGreaterThanOrEqual(2);
        const hasRecommended = opts.some(o => RECOMMENDED_LBL.test(o.label));
        if (!hasRecommended) {
          // It's also acceptable for the (recommended) marker to live in
          // prose above the box (some renderers wrap labels). The text-level
          // RECOMMENDED_LBL check above already covers that case.
          // Surface a friendlier message if the box itself missed it.
          // (This is non-fatal because findFormatGaps already passed.)
          // eslint-disable-next-line no-console
          console.warn(
            '(recommended) label appears in prose but not on a parsed option label — acceptable but watch for drift',
          );
        }
      } finally {
        // Always tear the PTY session down, pass or fail.
        await session.close();
      }
    },
    420_000,
  );
});
+143
View File
@@ -0,0 +1,143 @@
/**
* /plan-design-review with UI scope (gate, paid, real-PTY).
*
* Counterpart to the existing no-UI early-exit test. When the input plan
* DOES describe UI changes, /plan-design-review must NOT early-exit and
* must reach a real skill numbered-option AUQ (its first design-rating
* question), with the captured evidence NOT echoing the early-exit phrase.
*
* Why: today we only test the negative path (no-UI → early-exit). A
* regression that flips the UI-detection logic — making EVERY plan early-
* exit — would pass the no-UI test (vacuously) and ship undetected. This
* test is the positive coverage.
*
* How: launch claude in plan mode in the gstack repo cwd (so the skill
* registry is loaded). Send /plan-design-review on its own, then a
* follow-up message that describes the UI-heavy plan and names the fixture
* path, so the skill reviews that plan rather than git diff or
* .claude/plans/. Drive past permission dialogs. Wait for a numbered-
* option list that is NOT a permission dialog (or for the plan-ready
* prompt). Assert evidence does NOT contain "no UI scope" or
* "isn't applicable".
*/
import { describe, test } from 'bun:test';
import * as path from 'path';
import {
launchClaudePty,
isNumberedOptionListVisible,
isPermissionDialogVisible,
parseNumberedOptions,
isPlanReadyVisible,
} from './helpers/claude-pty-runner';
// Paid suite: runs only when EVALS is set to a non-empty value AND
// EVALS_TIER is exactly 'gate'; otherwise it is registered via describe.skip.
const shouldRun = Boolean(process.env.EVALS) && process.env.EVALS_TIER === 'gate';
const describeE2E = !shouldRun ? describe.skip : describe;

// ROOT is one level above import.meta.dir; it is used as the session cwd.
const ROOT = path.resolve(import.meta.dir, '..');
// UI-heavy plan fixture, referenced by relative path in the follow-up message.
const FIXTURE = path.join(ROOT, ...['test', 'fixtures', 'plans', 'ui-heavy-feature.md']);
// Gate E2E: launches a real-PTY claude session in plan mode with cwd=ROOT,
// sends /plan-design-review followed by a UI-heavy plan description, and
// polls until one of: a numbered-option list that is not a permission dialog
// ('real_auq'), the plan-ready prompt ('plan_ready'), process exit ('exited'),
// or the polling budget running out ('timeout'). Fails on exited/timeout, or
// when the captured evidence matches the early-exit phrase.
describeE2E('/plan-design-review with UI scope (gate)', () => {
test(
'reaches a real skill AUQ (or plan_ready) without echoing the no-UI early-exit phrase',
async () => {
// Relative path is what gets quoted to the agent in the follow-up message.
const fixtureRelPath = path.relative(ROOT, FIXTURE);
const session = await launchClaudePty({
permissionMode: 'plan',
cwd: ROOT,
timeoutMs: 480_000,
});
// 'timeout' is the default: it stays in place unless the loop below
// classifies the session as something else before the budget runs out.
let outcome: 'real_auq' | 'plan_ready' | 'timeout' | 'exited' = 'timeout';
let evidence = '';
let debugBuffer = ''; // captured at end so timeout error has data
try {
// Fixed wait before the first input is sent.
await Bun.sleep(8000);
const since = session.mark();
// Send the slash command alone first; then provide the UI-heavy
// plan content as a follow-up message. Claude Code rejects slash
// commands with trailing arguments unless the skill defines them.
session.send('/plan-design-review\r');
await Bun.sleep(3000);
session.send(
`Please review this plan for UI scope:\n\n` +
`Title: User Dashboard Page\n` +
`New React page UserDashboard.tsx with three subcomponents: ` +
`ActivityFeed, NotificationsPanel, QuickActions. ` +
`Tailwind CSS responsive layout (mobile/desktop breakpoints), ` +
`loading skeletons, empty states, hover states on every interactive element, ` +
`modal dialog for "mark all read", toast notifications for action feedback. ` +
`Reference plan file: ${fixtureRelPath}\r`
);
// Polling budget (6 min) is shorter than the session timeout (8 min)
// and the test timeout (9 min) configured in this test.
const budgetMs = 360_000;
const start = Date.now();
// Signature of the last permission rendering we answered, so the same
// rendering is not granted twice.
let lastPermSig = '';
while (Date.now() - start < budgetMs) {
await Bun.sleep(2500);
if (session.exited()) {
outcome = 'exited';
evidence = session.visibleSince(since).slice(-3000);
break;
}
const visible = session.visibleSince(since);
// Classify the recent tail only — old permission text persists
// in visibleSince(since) and would otherwise re-trigger forever.
const recentTail = visible.slice(-2500);
// Real skill AUQ visible (not a permission dialog)?
// This check runs before the permission branch and before the
// plan-ready branch, so it wins when more than one would match.
if (
isNumberedOptionListVisible(recentTail) &&
parseNumberedOptions(recentTail).length >= 2 &&
!isPermissionDialogVisible(recentTail)
) {
outcome = 'real_auq';
evidence = visible.slice(-3000);
break;
}
// Permission dialog: grant once per unique rendering.
if (isPermissionDialogVisible(recentTail)) {
// The last 500 chars of output serve as the rendering's signature.
const sig = visible.slice(-500);
if (sig !== lastPermSig) {
lastPermSig = sig;
// Sends "1" + Enter to the dialog, then pauses before the next poll.
// NOTE(review): presumably option 1 is the grant/allow choice — confirm against the dialog.
session.send('1\r');
await Bun.sleep(1500);
continue;
}
}
// Plan-ready terminal — also acceptable (skill ran end-to-end
// and surfaced claude's "Ready to execute" prompt).
// Unlike the checks above, this one scans the whole post-mark buffer.
if (isPlanReadyVisible(visible)) {
outcome = 'plan_ready';
evidence = visible.slice(-3000);
break;
}
}
// Capture buffer state at end so a timeout error has diagnostic data.
debugBuffer = session.visibleSince(since).slice(-4000);
} finally {
// Always tear the PTY session down; assertions run after close.
await session.close();
}
// PASS: real_auq or plan_ready, AND evidence does NOT echo the
// early-exit phrase.
if (outcome === 'exited' || outcome === 'timeout') {
// The same message (labelled "buffer at timeout") is used for both
// the exited and the timeout outcome.
throw new Error(
`plan-design-review with UI scope FAILED: outcome=${outcome}\n` +
`--- buffer at timeout (last 4KB) ---\n${debugBuffer || evidence}`,
);
}
// Matches "no UI scope" or "isn't"/"isnt applicable", case-insensitively,
// in the last 3000 chars captured at classification time.
const NO_UI_PHRASE = /no\s+UI\s+scope|isn'?t\s+applicable/i;
if (NO_UI_PHRASE.test(evidence)) {
throw new Error(
`plan-design-review early-exited despite UI-heavy fixture.\n` +
`--- evidence (last 3KB) ---\n${evidence}`,
);
}
},
// Per-test timeout in ms (9 min).
540_000,
);
});