mirror of
https://github.com/garrytan/gstack.git
synced 2026-05-06 13:45:35 +02:00
test: spell out AskUserQuestion everywhere instead of AUQ
Per user feedback: don't shorten AskUserQuestion to AUQ — the abbreviation reads as cryptic. Apply across all the new code from this branch:

- Rename test/skill-e2e-auq-format-compliance.test.ts → test/skill-e2e-ask-user-question-format-compliance.test.ts
- Touchfile entry auq-format-pty → ask-user-question-format-pty (touchfiles.ts + matching assertion in touchfiles.test.ts)
- Function rename navigateToModeAuq → navigateToModeAskUserQuestion
- Variable auqVisible → askUserQuestionVisible
- Outcome literal 'real_auq' → 'real_question'
- All comments + JSDoc + CHANGELOG entry write AskUserQuestion in full
- "AUQs" plural → "AskUserQuestions"

No behavior change. 49/49 free tests still pass.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -3,7 +3,7 @@
|
||||
*
|
||||
* - parseNumberedOptions(visible)
|
||||
* Parses `❯ 1.` / ` 2.` numbered-option lines out of TTY text.
|
||||
* Used by the AUQ format-compliance and mode-routing tests to look
|
||||
* Used by the AskUserQuestion format-compliance and mode-routing tests to look
|
||||
* up an option index by its label without hard-coding positions.
|
||||
*
|
||||
* - findBudgetRegressions / assertNoBudgetRegression(comparison)
|
||||
@@ -117,7 +117,7 @@ describe('parseNumberedOptions', () => {
|
||||
|
||||
test('anchors on LAST cursor when both stale and fresh fit in the tail', () => {
|
||||
// Both lists fit in the same 4KB tail (small buffer). The granted
|
||||
// permission dialog options come first, the real AUQ comes second.
|
||||
// permission dialog options come first, the real AskUserQuestion comes second.
|
||||
// We must return the FRESH options, not the STALE ones.
|
||||
const visible = [
|
||||
'❯ 1. STALE_grant',
|
||||
|
||||
@@ -143,7 +143,7 @@ export function isPlanReadyVisible(visible: string): boolean {
|
||||
* option list (so isNumberedOptionListVisible matches them) but they
|
||||
* are NOT a skill's AskUserQuestion — they're claude asking the user
|
||||
* whether to grant a tool/file permission. Tests that look for skill
|
||||
* AUQs must explicitly skip these.
|
||||
* AskUserQuestions must explicitly skip these.
|
||||
*
|
||||
* Both English phrases below are stable across recent Claude Code
|
||||
* versions. The check is permissive on whitespace because TTY rendering
|
||||
@@ -206,13 +206,13 @@ export function parseNumberedOptions(
|
||||
// visually reads "1. Option" can come through as "1.Option".
|
||||
const optionRe = /^[\s❯]*([1-9])\.\s*(\S.*?)\s*$/;
|
||||
// We anchor on the LATEST `❯ 1.` line in the buffer — the cursor marker
|
||||
// for the active AUQ. Older numbered lists (e.g., a granted permission
|
||||
// for the active AskUserQuestion. Older numbered lists (e.g., a granted permission
|
||||
// dialog still in scrollback) sit above it and must be ignored. Without
|
||||
// this, parseNumberedOptions returns stale options after the dialog is
|
||||
// dismissed.
|
||||
const lines = tail.split('\n');
|
||||
// Anchor on the LAST `❯ 1.` line (cursor is on option 1 of the active
|
||||
// AUQ). Greedy character classes don't help here — we need a literal
|
||||
// AskUserQuestion). Greedy character classes don't help here — we need a literal
|
||||
// `❯` after optional leading whitespace.
|
||||
let cursorLineIdx = -1;
|
||||
for (let i = lines.length - 1; i >= 0; i--) {
|
||||
|
||||
@@ -96,7 +96,7 @@ export const E2E_TOUCHFILES: Record<string, string[]> = {
|
||||
// Real-PTY E2E batch (#6 new tests on the harness).
|
||||
// Each one tests behavior the SDK harness can't observe (rendered TTY,
|
||||
// numbered-option lists, multi-phase ordering, idempotency state echo).
|
||||
'auq-format-pty': ['plan-ceo-review/**', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble/generate-completeness-section.ts', 'scripts/resolvers/preamble.ts', 'test/helpers/claude-pty-runner.ts'],
|
||||
'ask-user-question-format-pty': ['plan-ceo-review/**', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble/generate-completeness-section.ts', 'scripts/resolvers/preamble.ts', 'test/helpers/claude-pty-runner.ts'],
|
||||
'plan-ceo-mode-routing': ['plan-ceo-review/**', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble.ts', 'test/helpers/claude-pty-runner.ts'],
|
||||
'plan-design-with-ui-scope': ['plan-design-review/**', 'test/fixtures/plans/ui-heavy-feature.md', 'test/helpers/claude-pty-runner.ts'],
|
||||
'budget-regression-pty': ['test/helpers/eval-store.ts', 'test/skill-budget-regression.test.ts'],
|
||||
@@ -351,8 +351,8 @@ export const E2E_TIERS: Record<string, 'gate' | 'periodic'> = {
|
||||
// Real-PTY E2E batch — tier classification:
|
||||
// gate: cheap, deterministic, run on every PR
|
||||
// periodic: long-running or expensive (>$3/run), run weekly
|
||||
'auq-format-pty': 'gate', // ~$0.50/run, single skill probe
|
||||
'plan-ceo-mode-routing': 'periodic', // ~$3/run, deep navigation through 8-12 prior AUQs
|
||||
'ask-user-question-format-pty': 'gate', // ~$0.50/run, single skill probe
|
||||
'plan-ceo-mode-routing': 'periodic', // ~$3/run, deep navigation through 8-12 prior AskUserQuestions
|
||||
'plan-design-with-ui-scope': 'gate', // ~$0.80/run
|
||||
'budget-regression-pty': 'gate', // free, library-only assertion
|
||||
'ship-idempotency-pty': 'periodic', // ~$3/run, real /ship in plan mode
|
||||
|
||||
+15
-15
@@ -16,12 +16,12 @@
|
||||
* Why real-PTY: the existing skill-e2e-plan-format tests cover what the
|
||||
* AGENT writes via the SDK (capture-to-file harness). This test covers
|
||||
* what the USER actually sees in the terminal — different bug class
|
||||
* (e.g., AUQ tool truncates long prose, conductor renderer mangles
|
||||
* (e.g., AskUserQuestion tool truncates long prose, conductor renderer mangles
|
||||
* bullets, model collapses sections under token pressure). Two layers
|
||||
* of defense for a format-discipline regression that previously ate ~6
|
||||
* weeks of compliance drift before it was noticed.
|
||||
*
|
||||
* Trigger choice: /plan-ceo-review fires its mode-selection AUQ
|
||||
* Trigger choice: /plan-ceo-review fires its mode-selection AskUserQuestion
|
||||
* deterministically and early (Step 0F), so we don't need to drive
|
||||
* through any prior questions to reach a format check.
|
||||
*
|
||||
@@ -69,7 +69,7 @@ function findFormatGaps(visible: string): FormatGap[] {
|
||||
|
||||
describeE2E('AskUserQuestion format compliance (gate)', () => {
|
||||
test(
|
||||
'first AUQ from /plan-ceo-review contains all 7 mandated format elements',
|
||||
'first AskUserQuestion from /plan-ceo-review contains all 7 mandated format elements',
|
||||
async () => {
|
||||
const session = await launchClaudePty({
|
||||
permissionMode: 'plan',
|
||||
@@ -82,10 +82,10 @@ describeE2E('AskUserQuestion format compliance (gate)', () => {
|
||||
const since = session.mark();
|
||||
session.send('/plan-ceo-review\r');
|
||||
|
||||
// Wait for a SKILL AUQ. Strategy: poll the visible buffer until it
|
||||
// Wait for a SKILL AskUserQuestion. Strategy: poll the visible buffer until it
|
||||
// contains both a numbered-option list AND the format markers we
|
||||
// expect (ELI10 + Recommendation). When both are present, it IS a
|
||||
// real format-compliant AUQ — not a permission dialog or trust
|
||||
// real format-compliant AskUserQuestion — not a permission dialog or trust
|
||||
// prompt.
|
||||
//
|
||||
// While polling, auto-grant any permission dialogs we see in the
|
||||
@@ -94,7 +94,7 @@ describeE2E('AskUserQuestion format compliance (gate)', () => {
|
||||
const budgetMs = 300_000;
|
||||
const start = Date.now();
|
||||
let captured = '';
|
||||
let auqVisible = false;
|
||||
let askUserQuestionVisible = false;
|
||||
let lastPermSig = '';
|
||||
// Snapshot debug counters every poll so the timeout error shows
|
||||
// WHY we never matched (cursor-found vs markers-found discrepancy).
|
||||
@@ -106,20 +106,20 @@ describeE2E('AskUserQuestion format compliance (gate)', () => {
|
||||
await Bun.sleep(2000);
|
||||
if (session.exited()) {
|
||||
throw new Error(
|
||||
`claude exited (code=${session.exitCode()}) before AUQ rendered.\n` +
|
||||
`claude exited (code=${session.exitCode()}) before AskUserQuestion rendered.\n` +
|
||||
`Last visible:\n${session.visibleSince(since).slice(-2000)}`,
|
||||
);
|
||||
}
|
||||
const visible = session.visibleSince(since);
|
||||
// Marker check: anywhere in the post-slash region. Since `since`
|
||||
// is set right after sending /plan-ceo-review, there's no stale
|
||||
// AUQ above this line — the only AUQ that can produce these
|
||||
// AskUserQuestion above this line — the only AskUserQuestion that can produce these
|
||||
// markers is the current one.
|
||||
const hasEli10 = /ELI10\s*:/i.test(visible);
|
||||
const hasRecommend = /Recommendation\s*:/i.test(visible);
|
||||
|
||||
// Cursor check: a numbered option list near the bottom of the
|
||||
// buffer means the AUQ is currently rendered (not scrolled away).
|
||||
// buffer means the AskUserQuestion is currently rendered (not scrolled away).
|
||||
const cursorTail = visible.slice(-4000);
|
||||
const hasCursor = isNumberedOptionListVisible(cursorTail) &&
|
||||
parseNumberedOptions(cursorTail).length >= 2;
|
||||
@@ -129,7 +129,7 @@ describeE2E('AskUserQuestion format compliance (gate)', () => {
|
||||
|
||||
// Permission dialog branch: grant once per unique rendering, but
|
||||
// only when we don't already have format markers visible (so we
|
||||
// don't accidentally grant a permission inside a real AUQ).
|
||||
// don't accidentally grant a permission inside a real AskUserQuestion).
|
||||
if (
|
||||
hasCursor &&
|
||||
!(hasEli10 && hasRecommend) &&
|
||||
@@ -144,18 +144,18 @@ describeE2E('AskUserQuestion format compliance (gate)', () => {
|
||||
}
|
||||
}
|
||||
|
||||
// Real AUQ check: cursor visible AND markers present anywhere in
|
||||
// Real AskUserQuestion check: cursor visible AND markers present anywhere in
|
||||
// the post-slash region.
|
||||
if (hasCursor && hasEli10 && hasRecommend) {
|
||||
debugBothSeen++;
|
||||
captured = visible;
|
||||
auqVisible = true;
|
||||
askUserQuestionVisible = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!auqVisible) {
|
||||
if (!askUserQuestionVisible) {
|
||||
throw new Error(
|
||||
`AUQ not rendered within ${budgetMs}ms.\n` +
|
||||
`AskUserQuestion not rendered within ${budgetMs}ms.\n` +
|
||||
`Debug counts: cursorSeen=${debugCursorSeen} markersSeen=${debugMarkersSeen} bothSeen=${debugBothSeen}\n` +
|
||||
`Last visible (4KB):\n${session.visibleSince(since).slice(-4000)}`,
|
||||
);
|
||||
@@ -165,7 +165,7 @@ describeE2E('AskUserQuestion format compliance (gate)', () => {
|
||||
// Surface the captured text last 3KB on failure for debugging.
|
||||
const tail = captured.slice(-3000);
|
||||
throw new Error(
|
||||
`AUQ format compliance FAILED — missing ${gaps.length} mandated field(s):\n` +
|
||||
`AskUserQuestion format compliance FAILED — missing ${gaps.length} mandated field(s):\n` +
|
||||
gaps.map(g => ` - ${g.field} (regex: ${g.re.source})`).join('\n') +
|
||||
`\n--- captured (last 3KB) ---\n${tail}`,
|
||||
);
|
||||
@@ -99,7 +99,7 @@ describeE2E('/autoplan chain ordering (periodic)', () => {
|
||||
const visible = session.visibleSince(since);
|
||||
|
||||
// Auto-grant any permission dialog so autoplan can keep moving
|
||||
// through its phases. The autoplan template auto-decides AUQs
|
||||
// through its phases. The autoplan template auto-decides AskUserQuestions
|
||||
// it owns; only permission prompts (file/tool grants) need our
|
||||
// hand-pressing. Classify on tail to avoid stale matches.
|
||||
const recentTail = visible.slice(-1500);
|
||||
|
||||
@@ -11,9 +11,9 @@
|
||||
* the question but the agent ignores the choice (e.g. always defaults
|
||||
* to EXPANSION) would not be caught by any prior test.
|
||||
*
|
||||
* Tier: periodic (not gate). Each run navigates 8-12 prior AUQs (telemetry,
|
||||
* Tier: periodic (not gate). Each run navigates 8-12 prior AskUserQuestions (telemetry,
|
||||
* proactive, routing, vendoring, brain, office-hours, premise×3, approach)
|
||||
* before reaching Step 0F. At ~30s per AUQ that's a 4-6 min navigation
|
||||
* before reaching Step 0F. At ~30s per AskUserQuestion that's a 4-6 min navigation
|
||||
* phase per case. The full 2-case suite runs ~12-15 min, $3-4. Too slow
|
||||
* for gate-tier; weekly is fine.
|
||||
*
|
||||
@@ -57,20 +57,20 @@ const CASES: ModeCase[] = [
|
||||
];
|
||||
|
||||
/**
|
||||
* Navigate prior AUQs by picking option 1 until we hit an AUQ whose
|
||||
* Navigate prior AskUserQuestions by picking option 1 until we hit an AskUserQuestion whose
|
||||
* options match one of the 4 mode names. Returns the option index
|
||||
* matching `targetMode`, with the buffer marker pointing AT that AUQ.
|
||||
* matching `targetMode`, with the buffer marker pointing AT that AskUserQuestion.
|
||||
*
|
||||
* Throws if we don't reach the mode AUQ within `maxNav` prior AUQs or
|
||||
* Throws if we don't reach the mode AskUserQuestion within `maxNav` prior AskUserQuestions or
|
||||
* the overall budget.
|
||||
*/
|
||||
async function navigateToModeAuq(
|
||||
async function navigateToModeAskUserQuestion(
|
||||
session: ClaudePtySession,
|
||||
since: number,
|
||||
targetMode: ModeCase['mode'],
|
||||
opts: { maxNav?: number; budgetMs?: number } = {},
|
||||
): Promise<{ modeIndex: number; visibleAtMode: string }> {
|
||||
// /plan-ceo-review's mode AUQ (Step 0F) sits behind several preamble
|
||||
// /plan-ceo-review's mode AskUserQuestion (Step 0F) sits behind several preamble
|
||||
// and Step 0A-0C-bis gates: telemetry, proactive, routing, vendoring,
|
||||
// brain privacy, office-hours offer, premise challenge (3 questions),
|
||||
// approach selection. 12 hops is the conservative ceiling.
|
||||
@@ -100,12 +100,12 @@ async function navigateToModeAuq(
|
||||
if (sig === lastSig) continue;
|
||||
lastSeenList = opts;
|
||||
|
||||
// Is THIS the mode AUQ?
|
||||
// Is THIS the mode AskUserQuestion?
|
||||
if (opts.some(o => MODE_RE.test(o.label))) {
|
||||
const target = opts.find(o => o.label.toUpperCase().includes(targetMode));
|
||||
if (!target) {
|
||||
throw new Error(
|
||||
`Mode AUQ rendered but target "${targetMode}" not in option labels:\n` +
|
||||
`Mode AskUserQuestion rendered but target "${targetMode}" not in option labels:\n` +
|
||||
opts.map(o => ` ${o.index}. ${o.label}`).join('\n'),
|
||||
);
|
||||
}
|
||||
@@ -121,10 +121,10 @@ async function navigateToModeAuq(
|
||||
continue;
|
||||
}
|
||||
|
||||
// Not the mode AUQ — answer with option 1 (recommended) and continue.
|
||||
// Not the mode AskUserQuestion — answer with option 1 (recommended) and continue.
|
||||
if (priorAnswered >= maxNav) {
|
||||
throw new Error(
|
||||
`Navigated ${maxNav} prior AUQs without reaching the mode AUQ. ` +
|
||||
`Navigated ${maxNav} prior AskUserQuestions without reaching the mode AskUserQuestion. ` +
|
||||
`Last list:\n${opts.map(o => ` ${o.index}. ${o.label}`).join('\n')}`,
|
||||
);
|
||||
}
|
||||
@@ -133,7 +133,7 @@ async function navigateToModeAuq(
|
||||
// Give the agent a beat to advance before re-polling.
|
||||
await Bun.sleep(2000);
|
||||
}
|
||||
throw new Error(`Mode AUQ not reached within ${budgetMs}ms`);
|
||||
throw new Error(`Mode AskUserQuestion not reached within ${budgetMs}ms`);
|
||||
}
|
||||
|
||||
describeE2E('/plan-ceo-review mode routing (gate)', () => {
|
||||
@@ -150,13 +150,13 @@ describeE2E('/plan-ceo-review mode routing (gate)', () => {
|
||||
const since = session.mark();
|
||||
session.send('/plan-ceo-review\r');
|
||||
|
||||
const { modeIndex } = await navigateToModeAuq(session, since, c.mode);
|
||||
const { modeIndex } = await navigateToModeAskUserQuestion(session, since, c.mode);
|
||||
|
||||
// Snapshot the visible buffer at mode-pick time, then send the index.
|
||||
const sincePick = session.rawOutput().length;
|
||||
session.send(`${modeIndex}\r`);
|
||||
|
||||
// Wait for downstream evidence: either next AUQ or plan_ready or
|
||||
// Wait for downstream evidence: either next AskUserQuestion or plan_ready or
|
||||
// a posture-distinctive substring shows up.
|
||||
const budgetMs = 240_000;
|
||||
const start = Date.now();
|
||||
@@ -183,7 +183,7 @@ describeE2E('/plan-ceo-review mode routing (gate)', () => {
|
||||
isNumberedOptionListVisible(downstreamSnapshot) &&
|
||||
!c.postureRe.test(downstreamSnapshot)
|
||||
) {
|
||||
// Plan-ready AND a follow-up AUQ are both visible but
|
||||
// Plan-ready AND a follow-up AskUserQuestion are both visible but
|
||||
// posture text has not appeared yet. Keep polling for a bit.
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
*
|
||||
* Counterpart to the existing no-UI early-exit test. When the input plan
|
||||
* DOES describe UI changes, /plan-design-review must NOT early-exit and
|
||||
* must reach a real skill numbered-option AUQ (its first design-rating
|
||||
* must reach a real skill numbered-option AskUserQuestion (its first design-rating
|
||||
* question), with the captured evidence NOT echoing the early-exit phrase.
|
||||
*
|
||||
* Why: today we only test the negative path (no-UI → early-exit). A
|
||||
@@ -37,7 +37,7 @@ const FIXTURE = path.join(ROOT, 'test', 'fixtures', 'plans', 'ui-heavy-feature.m
|
||||
|
||||
describeE2E('/plan-design-review with UI scope (gate)', () => {
|
||||
test(
|
||||
'reaches a real skill AUQ (or plan_ready) without echoing the no-UI early-exit phrase',
|
||||
'reaches a real skill AskUserQuestion (or plan_ready) without echoing the no-UI early-exit phrase',
|
||||
async () => {
|
||||
const fixtureRelPath = path.relative(ROOT, FIXTURE);
|
||||
|
||||
@@ -47,7 +47,7 @@ describeE2E('/plan-design-review with UI scope (gate)', () => {
|
||||
timeoutMs: 480_000,
|
||||
});
|
||||
|
||||
let outcome: 'real_auq' | 'plan_ready' | 'timeout' | 'exited' = 'timeout';
|
||||
let outcome: 'real_question' | 'plan_ready' | 'timeout' | 'exited' = 'timeout';
|
||||
let evidence = '';
|
||||
let debugBuffer = ''; // captured at end so timeout error has data
|
||||
|
||||
@@ -86,13 +86,13 @@ describeE2E('/plan-design-review with UI scope (gate)', () => {
|
||||
// in visibleSince(since) and would otherwise re-trigger forever.
|
||||
const recentTail = visible.slice(-2500);
|
||||
|
||||
// Real skill AUQ visible (not a permission dialog)?
|
||||
// Real skill AskUserQuestion visible (not a permission dialog)?
|
||||
if (
|
||||
isNumberedOptionListVisible(recentTail) &&
|
||||
parseNumberedOptions(recentTail).length >= 2 &&
|
||||
!isPermissionDialogVisible(recentTail)
|
||||
) {
|
||||
outcome = 'real_auq';
|
||||
outcome = 'real_question';
|
||||
evidence = visible.slice(-3000);
|
||||
break;
|
||||
}
|
||||
@@ -122,7 +122,7 @@ describeE2E('/plan-design-review with UI scope (gate)', () => {
|
||||
await session.close();
|
||||
}
|
||||
|
||||
// PASS: real_auq or plan_ready, AND evidence does NOT echo the
|
||||
// PASS: real_question or plan_ready, AND evidence does NOT echo the
|
||||
// early-exit phrase.
|
||||
if (outcome === 'exited' || outcome === 'timeout') {
|
||||
throw new Error(
|
||||
|
||||
@@ -94,7 +94,7 @@ describe('selectTests', () => {
|
||||
expect(result.selected).toContain('plan-review-prosons-hardstop-neg');
|
||||
expect(result.selected).toContain('plan-review-prosons-neutral-neg');
|
||||
// v1.13.x real-PTY E2E batch entries that also depend on plan-ceo-review/**
|
||||
expect(result.selected).toContain('auq-format-pty');
|
||||
expect(result.selected).toContain('ask-user-question-format-pty');
|
||||
expect(result.selected).toContain('plan-ceo-mode-routing');
|
||||
expect(result.selected).toContain('autoplan-chain-pty');
|
||||
expect(result.selected.length).toBe(18);
|
||||
|
||||
Reference in New Issue
Block a user