feat(test): 3 gate-tier real-PTY E2E tests

skill-e2e-auq-format-compliance.test.ts (~$0.50/run, 90-130s): - Asserts /plan-ceo-review's first AUQ contains all 7 mandated format elements (ELI10, Recommendation, Pros/Cons with ✅/❌, Net, (recommended) label). Catches drift in the shared preamble resolver that previously took weeks to notice. - Auto-grants permission dialogs that fire during preamble side-effects (touch on .feature-prompted markers in fresh user environments). - Verified PASS in 126s. skill-e2e-plan-design-with-ui.test.ts (~$0.80/run, 50-90s): - Counterpart to the existing no-UI early-exit test. When the input plan DOES describe UI changes, /plan-design-review must NOT early-exit and must reach a real skill AUQ. - Sends the slash command without args, then a follow-up message with the UI-heavy plan description (Claude Code rejects unknown trailing args). Asserts evidence does NOT contain "no UI scope". - Verified PASS in 54s. skill-budget-regression.test.ts (free, gate): - Library-only assertion. Reads the most recent eval file, finds the prior same-branch run via findPreviousRun, computes ComparisonResult, asserts no test exceeded 2× tools or turns. - Branch-scoped: skips with reason if the latest eval was produced on a different branch (cross-branch comparison would be noise). - First-run grace (vacuous pass) when no prior data exists. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-06-19 16:20:09 +02:00 · 2026-04-26 04:36:35 -07:00
parent 137b11f39a
commit 2b1a0da7c1
3 changed files with 487 additions and 0 deletions
@@ -0,0 +1,148 @@
+/**
+ * Tool-budget regression test (gate, free).
+ *
+ * Asserts: no test in the most recent eval run grew its tool calls or
+ * turns by more than 2× vs the prior recorded run. Pure library — does
+ * not spawn `claude` or pay any API cost. Reads the project eval dir
+ * (~/.gstack/projects/<slug>/evals/) and compares the latest run against
+ * its predecessor.
+ *
+ * First-run grace: if there's no prior run, the test passes vacuously.
+ * The purpose is to catch a SECOND-run regression — a real-world scenario
+ * is "preamble change shipped, /qa eval went from 30 tool calls to 90".
+ *
+ * Why two metrics (tools and turns): a regression that adds tool calls
+ * usually reflects an inefficient skill prompt; a regression that adds
+ * turns reflects a skill that is hesitating or losing track. Either is
+ * worth catching. We use a noise floor (5 tool calls / 3 turns) to
+ * avoid flagging tests that started tiny and got slightly bigger.
+ *
+ * Override: GSTACK_BUDGET_RATIO=<n> (default 2.0).
+ *
+ * Skipping: only the gate-level CI-blocking variant runs in EVALS_TIER=gate.
+ * The same logic runs anywhere `bun test` is invoked because comparison
+ * is free — no LLM cost.
+ */
+
+import { describe, test } from 'bun:test';
+import { spawnSync } from 'child_process';
+import * as fs from 'fs';
+import * as path from 'path';
+import {
+  getProjectEvalDir,
+  findPreviousRun,
+  compareEvalResults,
+  assertNoBudgetRegression,
+  type EvalResult,
+} from './helpers/eval-store';
+
+function currentGitBranch(): string {
+  try {
+    const result = spawnSync('git', ['rev-parse', '--abbrev-ref', 'HEAD'], {
+      stdio: 'pipe', timeout: 3000,
+    });
+    return result.stdout?.toString().trim() || 'unknown';
+  } catch {
+    return 'unknown';
+  }
+}
+
+interface LatestRun {
+  filepath: string;
+  result: EvalResult;
+}
+
+/** Find the most recent finalized (non-_partial) eval file for a tier. */
+function findLatestRun(evalDir: string, tier: 'e2e' | 'llm-judge'): LatestRun | null {
+  let entries: string[];
+  try {
+    entries = fs.readdirSync(evalDir);
+  } catch {
+    return null;
+  }
+  const candidates: Array<{ filepath: string; timestamp: string }> = [];
+  for (const f of entries) {
+    if (!f.endsWith('.json')) continue;
+    if (f.startsWith('_partial')) continue;
+    const fullPath = path.join(evalDir, f);
+    try {
+      const data = JSON.parse(fs.readFileSync(fullPath, 'utf-8')) as EvalResult;
+      if (data.tier !== tier) continue;
+      candidates.push({ filepath: fullPath, timestamp: data.timestamp ?? '' });
+    } catch { /* ignore corrupt */ }
+  }
+  if (candidates.length === 0) return null;
+  candidates.sort((a, b) => b.timestamp.localeCompare(a.timestamp));
+  const top = candidates[0]!;
+  return {
+    filepath: top.filepath,
+    result: JSON.parse(fs.readFileSync(top.filepath, 'utf-8')) as EvalResult,
+  };
+}
+
+function checkTier(tier: 'e2e' | 'llm-judge'): void {
+  const evalDir = getProjectEvalDir();
+  const latest = findLatestRun(evalDir, tier);
+  if (!latest) {
+    // eslint-disable-next-line no-console
+    console.log(`[budget-regression:${tier}] no current run in ${evalDir} — skipping`);
+    return;
+  }
+  // Branch alignment: only assert when the latest eval was actually
+  // produced by THIS checkout's branch. Cross-branch comparison would
+  // measure noise from unrelated work. Pre-existing eval history from
+  // other branches is not our regression to fix.
+  const myBranch = currentGitBranch();
+  if (latest.result.branch !== myBranch) {
+    // eslint-disable-next-line no-console
+    console.log(
+      `[budget-regression:${tier}] latest eval is from "${latest.result.branch}" ` +
+      `but current branch is "${myBranch}" — skipping (run evals on this branch first)`,
+    );
+    return;
+  }
+  const branch = latest.result.branch;
+  const priorPath = findPreviousRun(evalDir, tier, branch, latest.filepath);
+  if (!priorPath) {
+    // eslint-disable-next-line no-console
+    console.log(`[budget-regression:${tier}] no prior run found — first-run grace`);
+    return;
+  }
+  let prior: EvalResult;
+  try {
+    prior = JSON.parse(fs.readFileSync(priorPath, 'utf-8')) as EvalResult;
+  } catch (err) {
+    // eslint-disable-next-line no-console
+    console.warn(`[budget-regression:${tier}] could not read prior ${priorPath}: ${(err as Error).message}`);
+    return;
+  }
+  // Branch-scoped: only compare same-branch history. Cross-branch
+  // comparison is noisy (different branches do different work). If
+  // findPreviousRun fell back to another branch, treat as no prior.
+  if (prior.branch !== branch) {
+    // eslint-disable-next-line no-console
+    console.log(
+      `[budget-regression:${tier}] no same-branch prior (latest on "${branch}", prior on "${prior.branch}") — skipping`,
+    );
+    return;
+  }
+  const comparison = compareEvalResults(prior, latest.result, priorPath, latest.filepath);
+  // Throws on regression.
+  assertNoBudgetRegression(comparison);
+  // eslint-disable-next-line no-console
+  console.log(
+    `[budget-regression:${tier}] OK — ${comparison.deltas.length} test(s) compared, ` +
+    `${comparison.tool_count_before}→${comparison.tool_count_after} tools, ` +
+    `cost Δ $${comparison.total_cost_delta.toFixed(2)}`,
+  );
+}
+
+describe('tool budget regression (gate, free)', () => {
+  test('no e2e test exceeds 2× prior tool calls or turns', () => {
+    checkTier('e2e');
+  });
+
+  test('no llm-judge test exceeds 2× prior tool calls or turns', () => {
+    checkTier('llm-judge');
+  });
+});
@@ -0,0 +1,196 @@
+/**
+ * AskUserQuestion format-compliance smoke (gate, paid, real-PTY).
+ *
+ * Asserts: when /plan-ceo-review fires its first AskUserQuestion in plan
+ * mode, the rendered TTY output contains every element the preamble
+ * format spec mandates (scripts/resolvers/preamble/generate-ask-user-format.ts
+ * + voice directive):
+ *
+ *   1. ELI10 prose paragraph
+ *   2. "Recommendation:" line
+ *   3. Pros/Cons header
+ *   4. ✅ pro bullet AND ❌ con bullet
+ *   5. "Net:" closer line
+ *   6. "(recommended)" label on one option
+ *
+ * Why real-PTY: the existing skill-e2e-plan-format tests cover what the
+ * AGENT writes via the SDK (capture-to-file harness). This test covers
+ * what the USER actually sees in the terminal — different bug class
+ * (e.g., AUQ tool truncates long prose, conductor renderer mangles
+ * bullets, model collapses sections under token pressure). Two layers
+ * of defense for a format-discipline regression that previously ate ~6
+ * weeks of compliance drift before it was noticed.
+ *
+ * Trigger choice: /plan-ceo-review fires its mode-selection AUQ
+ * deterministically and early (Step 0F), so we don't need to drive
+ * through any prior questions to reach a format check.
+ *
+ * See test/helpers/claude-pty-runner.ts for runner internals.
+ */
+
+import { describe, test, expect } from 'bun:test';
+import {
+  launchClaudePty,
+  isNumberedOptionListVisible,
+  isPermissionDialogVisible,
+  parseNumberedOptions,
+} from './helpers/claude-pty-runner';
+
+const shouldRun = !!process.env.EVALS && process.env.EVALS_TIER === 'gate';
+const describeE2E = shouldRun ? describe : describe.skip;
+
+// Format predicates. Permissive on whitespace and capitalization.
+// Tightening these is V2 if real drift is observed.
+const ELI10_RE        = /ELI10\s*:/i;
+const RECOMMEND_RE    = /Recommendation\s*:/i;
+const PROS_CONS_RE    = /Pros\s*\/\s*cons\s*:/i;
+const PRO_BULLET_RE   = /✅/;
+const CON_BULLET_RE   = /❌/;
+const NET_LINE_RE     = /^[\s|]*Net\s*:/im;
+const RECOMMENDED_LBL = /\(recommended\)/i;
+
+interface FormatGap {
+  field: string;
+  re: RegExp;
+}
+
+function findFormatGaps(visible: string): FormatGap[] {
+  const checks: FormatGap[] = [
+    { field: 'ELI10:', re: ELI10_RE },
+    { field: 'Recommendation:', re: RECOMMEND_RE },
+    { field: 'Pros / cons:', re: PROS_CONS_RE },
+    { field: '✅ pro bullet', re: PRO_BULLET_RE },
+    { field: '❌ con bullet', re: CON_BULLET_RE },
+    { field: 'Net:', re: NET_LINE_RE },
+    { field: '(recommended) label', re: RECOMMENDED_LBL },
+  ];
+  return checks.filter(c => !c.re.test(visible));
+}
+
+describeE2E('AskUserQuestion format compliance (gate)', () => {
+  test(
+    'first AUQ from /plan-ceo-review contains all 7 mandated format elements',
+    async () => {
+      const session = await launchClaudePty({
+        permissionMode: 'plan',
+        timeoutMs: 360_000,
+      });
+
+      try {
+        // Boot grace + auto trust-dialog handler.
+        await Bun.sleep(8000);
+        const since = session.mark();
+        session.send('/plan-ceo-review\r');
+
+        // Wait for a SKILL AUQ. Strategy: poll the visible buffer until it
+        // contains both a numbered-option list AND the format markers we
+        // expect (ELI10 + Recommendation). When both are present, it IS a
+        // real format-compliant AUQ — not a permission dialog or trust
+        // prompt.
+        //
+        // While polling, auto-grant any permission dialogs we see in the
+        // recent tail (preamble side-effects: touch on a sensitive file,
+        // etc) so the agent isn't blocked.
+        const budgetMs = 300_000;
+        const start = Date.now();
+        let captured = '';
+        let auqVisible = false;
+        let lastPermSig = '';
+        // Snapshot debug counters every poll so the timeout error shows
+        // WHY we never matched (cursor-found vs markers-found discrepancy).
+        let debugCursorSeen = 0;
+        let debugMarkersSeen = 0;
+        let debugBothSeen = 0;
+
+        while (Date.now() - start < budgetMs) {
+          await Bun.sleep(2000);
+          if (session.exited()) {
+            throw new Error(
+              `claude exited (code=${session.exitCode()}) before AUQ rendered.\n` +
+                `Last visible:\n${session.visibleSince(since).slice(-2000)}`,
+            );
+          }
+          const visible = session.visibleSince(since);
+          // Marker check: anywhere in the post-slash region. Since `since`
+          // is set right after sending /plan-ceo-review, there's no stale
+          // AUQ above this line — the only AUQ that can produce these
+          // markers is the current one.
+          const hasEli10 = /ELI10\s*:/i.test(visible);
+          const hasRecommend = /Recommendation\s*:/i.test(visible);
+
+          // Cursor check: a numbered option list near the bottom of the
+          // buffer means the AUQ is currently rendered (not scrolled away).
+          const cursorTail = visible.slice(-4000);
+          const hasCursor = isNumberedOptionListVisible(cursorTail) &&
+                            parseNumberedOptions(cursorTail).length >= 2;
+
+          if (hasCursor) debugCursorSeen++;
+          if (hasEli10 && hasRecommend) debugMarkersSeen++;
+
+          // Permission dialog branch: grant once per unique rendering, but
+          // only when we don't already have format markers visible (so we
+          // don't accidentally grant a permission inside a real AUQ).
+          if (
+            hasCursor &&
+            !(hasEli10 && hasRecommend) &&
+            isPermissionDialogVisible(cursorTail)
+          ) {
+            const sig = visible.slice(-500);
+            if (sig !== lastPermSig) {
+              lastPermSig = sig;
+              session.send('1\r');
+              await Bun.sleep(1500);
+              continue;
+            }
+          }
+
+          // Real AUQ check: cursor visible AND markers present anywhere in
+          // the post-slash region.
+          if (hasCursor && hasEli10 && hasRecommend) {
+            debugBothSeen++;
+            captured = visible;
+            auqVisible = true;
+            break;
+          }
+        }
+        if (!auqVisible) {
+          throw new Error(
+            `AUQ not rendered within ${budgetMs}ms.\n` +
+              `Debug counts: cursorSeen=${debugCursorSeen} markersSeen=${debugMarkersSeen} bothSeen=${debugBothSeen}\n` +
+              `Last visible (4KB):\n${session.visibleSince(since).slice(-4000)}`,
+          );
+        }
+        const gaps = findFormatGaps(captured);
+        if (gaps.length > 0) {
+          // Surface the captured text last 3KB on failure for debugging.
+          const tail = captured.slice(-3000);
+          throw new Error(
+            `AUQ format compliance FAILED — missing ${gaps.length} mandated field(s):\n` +
+              gaps.map(g => `  - ${g.field} (regex: ${g.re.source})`).join('\n') +
+              `\n--- captured (last 3KB) ---\n${tail}`,
+          );
+        }
+
+        // Sanity: the parsed option list contains at least 2 options and
+        // one of them carries the (recommended) marker.
+        const opts = parseNumberedOptions(captured);
+        expect(opts.length).toBeGreaterThanOrEqual(2);
+        const hasRecommended = opts.some(o => /\(recommended\)/i.test(o.label));
+        if (!hasRecommended) {
+          // It's also acceptable for the (recommended) marker to live in
+          // prose above the box (some renderers wrap labels). The text-level
+          // RECOMMENDED_LBL check above already covers that case.
+          // Surface a friendlier message if the box itself missed it.
+          // (This is non-fatal because findFormatGaps already passed.)
+          // eslint-disable-next-line no-console
+          console.warn(
+            '(recommended) label appears in prose but not on a parsed option label — acceptable but watch for drift',
+          );
+        }
+      } finally {
+        await session.close();
+      }
+    },
+    420_000,
+  );
+});
@@ -0,0 +1,143 @@
+/**
+ * /plan-design-review with UI scope (gate, paid, real-PTY).
+ *
+ * Counterpart to the existing no-UI early-exit test. When the input plan
+ * DOES describe UI changes, /plan-design-review must NOT early-exit and
+ * must reach a real skill numbered-option AUQ (its first design-rating
+ * question), with the captured evidence NOT echoing the early-exit phrase.
+ *
+ * Why: today we only test the negative path (no-UI → early-exit). A
+ * regression that flips the UI-detection logic — making EVERY plan early-
+ * exit — would pass the no-UI test (vacuously) and ship undetected. This
+ * test is the positive coverage.
+ *
+ * How: launch claude in plan mode in the gstack repo cwd (so the skill
+ * registry is loaded). Send /plan-design-review with the fixture path
+ * inline so the skill reviews the UI-heavy plan rather than git diff or
+ * .claude/plans/. Drive past permission dialogs. Wait for a numbered-
+ * option list that is NOT a permission dialog. Assert evidence does NOT
+ * contain "no UI scope".
+ */
+
+import { describe, test } from 'bun:test';
+import * as path from 'path';
+import {
+  launchClaudePty,
+  isNumberedOptionListVisible,
+  isPermissionDialogVisible,
+  parseNumberedOptions,
+  isPlanReadyVisible,
+} from './helpers/claude-pty-runner';
+
+const shouldRun = !!process.env.EVALS && process.env.EVALS_TIER === 'gate';
+const describeE2E = shouldRun ? describe : describe.skip;
+
+const ROOT = path.resolve(import.meta.dir, '..');
+const FIXTURE = path.join(ROOT, 'test', 'fixtures', 'plans', 'ui-heavy-feature.md');
+
+describeE2E('/plan-design-review with UI scope (gate)', () => {
+  test(
+    'reaches a real skill AUQ (or plan_ready) without echoing the no-UI early-exit phrase',
+    async () => {
+      const fixtureRelPath = path.relative(ROOT, FIXTURE);
+
+      const session = await launchClaudePty({
+        permissionMode: 'plan',
+        cwd: ROOT,
+        timeoutMs: 480_000,
+      });
+
+      let outcome: 'real_auq' | 'plan_ready' | 'timeout' | 'exited' = 'timeout';
+      let evidence = '';
+      let debugBuffer = ''; // captured at end so timeout error has data
+
+      try {
+        await Bun.sleep(8000);
+        const since = session.mark();
+        // Send the slash command alone first; then provide the UI-heavy
+        // plan content as a follow-up message. Claude Code rejects slash
+        // commands with trailing arguments unless the skill defines them.
+        session.send('/plan-design-review\r');
+        await Bun.sleep(3000);
+        session.send(
+          `Please review this plan for UI scope:\n\n` +
+          `Title: User Dashboard Page\n` +
+          `New React page UserDashboard.tsx with three subcomponents: ` +
+          `ActivityFeed, NotificationsPanel, QuickActions. ` +
+          `Tailwind CSS responsive layout (mobile/desktop breakpoints), ` +
+          `loading skeletons, empty states, hover states on every interactive element, ` +
+          `modal dialog for "mark all read", toast notifications for action feedback. ` +
+          `Reference plan file: ${fixtureRelPath}\r`
+        );
+
+        const budgetMs = 360_000;
+        const start = Date.now();
+        let lastPermSig = '';
+        while (Date.now() - start < budgetMs) {
+          await Bun.sleep(2500);
+          if (session.exited()) {
+            outcome = 'exited';
+            evidence = session.visibleSince(since).slice(-3000);
+            break;
+          }
+          const visible = session.visibleSince(since);
+
+          // Classify the recent tail only — old permission text persists
+          // in visibleSince(since) and would otherwise re-trigger forever.
+          const recentTail = visible.slice(-2500);
+
+          // Real skill AUQ visible (not a permission dialog)?
+          if (
+            isNumberedOptionListVisible(recentTail) &&
+            parseNumberedOptions(recentTail).length >= 2 &&
+            !isPermissionDialogVisible(recentTail)
+          ) {
+            outcome = 'real_auq';
+            evidence = visible.slice(-3000);
+            break;
+          }
+
+          // Permission dialog: grant once per unique rendering.
+          if (isPermissionDialogVisible(recentTail)) {
+            const sig = visible.slice(-500);
+            if (sig !== lastPermSig) {
+              lastPermSig = sig;
+              session.send('1\r');
+              await Bun.sleep(1500);
+              continue;
+            }
+          }
+
+          // Plan-ready terminal — also acceptable (skill ran end-to-end
+          // and surfaced claude's "Ready to execute" prompt).
+          if (isPlanReadyVisible(visible)) {
+            outcome = 'plan_ready';
+            evidence = visible.slice(-3000);
+            break;
+          }
+        }
+        // Capture buffer state at end so a timeout error has diagnostic data.
+        debugBuffer = session.visibleSince(since).slice(-4000);
+      } finally {
+        await session.close();
+      }
+
+      // PASS: real_auq or plan_ready, AND evidence does NOT echo the
+      // early-exit phrase.
+      if (outcome === 'exited' || outcome === 'timeout') {
+        throw new Error(
+          `plan-design-review with UI scope FAILED: outcome=${outcome}\n` +
+            `--- buffer at timeout (last 4KB) ---\n${debugBuffer || evidence}`,
+        );
+      }
+      const NO_UI_PHRASE = /no\s+UI\s+scope|isn'?t\s+applicable/i;
+      if (NO_UI_PHRASE.test(evidence)) {
+        throw new Error(
+          `plan-design-review early-exited despite UI-heavy fixture.\n` +
+            `--- evidence (last 3KB) ---\n${evidence}`,
+        );
+      }
+    },
+    540_000,
+  );
+});