mirror of
https://github.com/garrytan/gstack.git
synced 2026-05-07 05:56:41 +02:00
2b1a0da7c1
skill-e2e-auq-format-compliance.test.ts (~$0.50/run, 90-130s): - Asserts /plan-ceo-review's first AUQ contains all 7 mandated format elements (ELI10, Recommendation, Pros/Cons with ✅/❌, Net, (recommended) label). Catches drift in the shared preamble resolver that previously took weeks to notice. - Auto-grants permission dialogs that fire during preamble side-effects (touch on .feature-prompted markers in fresh user environments). - Verified PASS in 126s. skill-e2e-plan-design-with-ui.test.ts (~$0.80/run, 50-90s): - Counterpart to the existing no-UI early-exit test. When the input plan DOES describe UI changes, /plan-design-review must NOT early-exit and must reach a real skill AUQ. - Sends the slash command without args, then a follow-up message with the UI-heavy plan description (Claude Code rejects unknown trailing args). Asserts evidence does NOT contain "no UI scope". - Verified PASS in 54s. skill-budget-regression.test.ts (free, gate): - Library-only assertion. Reads the most recent eval file, finds the prior same-branch run via findPreviousRun, computes ComparisonResult, asserts no test exceeded 2× tools or turns. - Branch-scoped: skips with reason if the latest eval was produced on a different branch (cross-branch comparison would be noise). - First-run grace (vacuous pass) when no prior data exists. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
197 lines
7.7 KiB
TypeScript
197 lines
7.7 KiB
TypeScript
/**
 * AskUserQuestion format-compliance smoke (gate, paid, real-PTY).
 *
 * Asserts: when /plan-ceo-review fires its first AskUserQuestion in plan
 * mode, the rendered TTY output contains every element the preamble
 * format spec mandates (scripts/resolvers/preamble/generate-ask-user-format.ts
 * + voice directive):
 *
 *   1. ELI10 prose paragraph
 *   2. "Recommendation:" line
 *   3. Pros/Cons header
 *   4. ✅ pro bullet AND ❌ con bullet
 *   5. "Net:" closer line
 *   6. "(recommended)" label on one option
 *
 * Item 4 is checked as two separate elements (one ✅, one ❌), which is why
 * the test title and findFormatGaps count 7 mandated elements in total.
 *
 * Why real-PTY: the existing skill-e2e-plan-format tests cover what the
 * AGENT writes via the SDK (capture-to-file harness). This test covers
 * what the USER actually sees in the terminal — a different bug class
 * (e.g., AUQ tool truncates long prose, conductor renderer mangles
 * bullets, model collapses sections under token pressure). Two layers
 * of defense for a format-discipline regression that previously ate ~6
 * weeks of compliance drift before it was noticed.
 *
 * Trigger choice: /plan-ceo-review fires its mode-selection AUQ
 * deterministically and early (Step 0F), so we don't need to drive
 * through any prior questions to reach a format check.
 *
 * See test/helpers/claude-pty-runner.ts for runner internals.
 */
|
|
|
|
import { describe, test, expect } from 'bun:test';
|
|
import {
|
|
launchClaudePty,
|
|
isNumberedOptionListVisible,
|
|
isPermissionDialogVisible,
|
|
parseNumberedOptions,
|
|
} from './helpers/claude-pty-runner';
|
|
|
|
// This suite is opt-in: it only runs when EVALS is set to a non-empty value
// and EVALS_TIER is exactly 'gate'. Otherwise the whole describe block is
// registered as skipped.
const shouldRun = !!process.env.EVALS && process.env.EVALS_TIER === 'gate';
const describeE2E = shouldRun ? describe : describe.skip;
|
|
|
|
// Format predicates. Permissive on whitespace and capitalization.
// Tightening these is V2 if real drift is observed.
//
// None of these patterns is global or sticky, so `.test()` keeps no state
// between calls and each one can be reused safely across polls.

// Labelled sections: case-insensitive, optional whitespace before the colon.
const ELI10_RE = /ELI10\s*:/i;
const RECOMMEND_RE = /Recommendation\s*:/i;
const PROS_CONS_RE = /Pros\s*\/\s*cons\s*:/i;

// Bullet markers: the emoji may appear anywhere in the text.
const PRO_BULLET_RE = /✅/;
const CON_BULLET_RE = /❌/;

// "Net:" must start a line (multiline flag); leading whitespace and `|`
// characters before it are tolerated.
const NET_LINE_RE = /^[\s|]*Net\s*:/im;

// Option label marker, matched case-insensitively.
const RECOMMENDED_LBL = /\(recommended\)/i;
|
|
|
|
/** One mandated format element together with the pattern that detects it. */
interface FormatGap {
  /** Human-readable name of the element, used in failure messages. */
  readonly field: string;
  /** Pattern that must match the rendered output for the element to count as present. */
  readonly re: RegExp;
}
|
|
|
|
/**
 * Lists the mandated format elements that are missing from rendered output.
 *
 * Each check passes if its pattern matches anywhere in `visible`; the checks
 * are independent of one another and of the order the elements appear in.
 *
 * @param visible - Rendered terminal text to inspect.
 * @returns One entry per element whose pattern did not match, in table order;
 *   an empty array when all seven elements are present.
 */
function findFormatGaps(visible: string): FormatGap[] {
  const checks: readonly FormatGap[] = [
    { field: 'ELI10:', re: ELI10_RE },
    { field: 'Recommendation:', re: RECOMMEND_RE },
    { field: 'Pros / cons:', re: PROS_CONS_RE },
    { field: '✅ pro bullet', re: PRO_BULLET_RE },
    { field: '❌ con bullet', re: CON_BULLET_RE },
    { field: 'Net:', re: NET_LINE_RE },
    { field: '(recommended) label', re: RECOMMENDED_LBL },
  ];
  return checks.filter(({ re }) => !re.test(visible));
}
|
|
|
|
/**
 * Drives a real PTY session in plan mode, sends /plan-ceo-review, polls the
 * rendered output until a numbered option list and the ELI10/Recommendation
 * markers are both visible, then asserts that every mandated format element
 * is present in what was captured.
 */
describeE2E('AskUserQuestion format compliance (gate)', () => {
  test(
    'first AUQ from /plan-ceo-review contains all 7 mandated format elements',
    async () => {
      const session = await launchClaudePty({
        permissionMode: 'plan',
        timeoutMs: 360_000,
      });

      try {
        // Boot grace + auto trust-dialog handler.
        await Bun.sleep(8000);
        const since = session.mark();
        session.send('/plan-ceo-review\r');

        // Wait for a SKILL AUQ. Strategy: poll the visible buffer until it
        // contains both a numbered-option list AND the format markers we
        // expect (ELI10 + Recommendation). When both are present, it IS a
        // real format-compliant AUQ — not a permission dialog or trust
        // prompt.
        //
        // While polling, auto-grant any permission dialogs we see in the
        // recent tail (preamble side-effects: touch on a sensitive file,
        // etc) so the agent isn't blocked.
        const budgetMs = 300_000;
        const start = Date.now();
        let captured = '';
        let auqVisible = false;
        let lastPermSig = '';
        // Debug counters surfaced in the timeout error so it shows WHY we
        // never matched (cursor-found vs markers-found discrepancy).
        // NOTE: debugBothSeen is only incremented right before the success
        // `break`, so it always reads 0 in the timeout message.
        let debugCursorSeen = 0;
        let debugMarkersSeen = 0;
        let debugBothSeen = 0;

        while (Date.now() - start < budgetMs) {
          await Bun.sleep(2000);
          if (session.exited()) {
            throw new Error(
              `claude exited (code=${session.exitCode()}) before AUQ rendered.\n` +
                `Last visible:\n${session.visibleSince(since).slice(-2000)}`,
            );
          }
          const visible = session.visibleSince(since);

          // Marker check: anywhere in the post-slash region. Since `since`
          // is set right after sending /plan-ceo-review, there's no stale
          // AUQ above this line — the only AUQ that can produce these
          // markers is the current one. Uses the shared module-level
          // patterns so the poll and findFormatGaps cannot drift apart.
          const hasMarkers = ELI10_RE.test(visible) && RECOMMEND_RE.test(visible);

          // Cursor check: a numbered option list near the bottom of the
          // buffer means the AUQ is currently rendered (not scrolled away).
          const cursorTail = visible.slice(-4000);
          const hasCursor =
            isNumberedOptionListVisible(cursorTail) &&
            parseNumberedOptions(cursorTail).length >= 2;

          if (hasCursor) debugCursorSeen++;
          if (hasMarkers) debugMarkersSeen++;

          // Permission dialog branch: grant once per unique rendering, but
          // only when we don't already have format markers visible (so we
          // don't accidentally grant a permission inside a real AUQ).
          if (hasCursor && !hasMarkers && isPermissionDialogVisible(cursorTail)) {
            const sig = visible.slice(-500);
            if (sig !== lastPermSig) {
              lastPermSig = sig;
              session.send('1\r');
              await Bun.sleep(1500);
              continue;
            }
          }

          // Real AUQ check: cursor visible AND markers present anywhere in
          // the post-slash region.
          if (hasCursor && hasMarkers) {
            debugBothSeen++;
            captured = visible;
            auqVisible = true;
            break;
          }
        }

        if (!auqVisible) {
          throw new Error(
            `AUQ not rendered within ${budgetMs}ms.\n` +
              `Debug counts: cursorSeen=${debugCursorSeen} markersSeen=${debugMarkersSeen} bothSeen=${debugBothSeen}\n` +
              `Last visible (4KB):\n${session.visibleSince(since).slice(-4000)}`,
          );
        }

        const gaps = findFormatGaps(captured);
        if (gaps.length > 0) {
          // Surface the captured text last 3KB on failure for debugging.
          const tail = captured.slice(-3000);
          throw new Error(
            `AUQ format compliance FAILED — missing ${gaps.length} mandated field(s):\n` +
              gaps.map(g => ` - ${g.field} (regex: ${g.re.source})`).join('\n') +
              `\n--- captured (last 3KB) ---\n${tail}`,
          );
        }

        // Sanity: the parsed option list contains at least 2 options and
        // one of them carries the (recommended) marker.
        const opts = parseNumberedOptions(captured);
        expect(opts.length).toBeGreaterThanOrEqual(2);
        const hasRecommended = opts.some(o => /\(recommended\)/i.test(o.label));
        if (!hasRecommended) {
          // It's also acceptable for the (recommended) marker to live in
          // prose above the box (some renderers wrap labels). The text-level
          // RECOMMENDED_LBL check above already covers that case.
          // Surface a friendlier message if the box itself missed it.
          // (This is non-fatal because findFormatGaps already passed.)
          // eslint-disable-next-line no-console
          console.warn(
            '(recommended) label appears in prose but not on a parsed option label — acceptable but watch for drift',
          );
        }
      } finally {
        // Always tear the PTY session down, including on thrown failures.
        await session.close();
      }
    },
    420_000,
  );
});
|