mirror of
https://github.com/garrytan/gstack.git
synced 2026-06-18 15:50:11 +02:00
feat(auq): prose fallback when AskUserQuestion fails (interactive sessions)
On a genuine AUQ failure (tool absent, or present-but-erroring like Conductor's flaky MCP returning '[Tool result missing due to internal error]'): retry once, then branch on SESSION_KIND — spawned auto-chooses, headless BLOCKs, interactive renders a prose decision brief the user answers by typing a letter. The prose fallback MUST surface the triad: a clear ELI10 of the issue, a per-choice Completeness score, and a recommendation+why (one paragraph per choice). Carves out the [plan-tune auto-decide] denial as NOT a failure, and qualifies the former 'tool_use, not prose' assertions so the rule isn't self-contradicting. Tests pin the triad, the SESSION_KIND branch, the OV2 collision guard, the always-loaded guarantee, and a cross-file invariant on the auto-decide prefix. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -47,6 +47,11 @@ const MANDATORY: Array<{ name: string; re: RegExp }> = [
|
||||
{ name: 'Completeness coverage rule', re: /Completeness\s*:/i },
|
||||
{ name: 'kind-vs-coverage rule', re: /options differ in kind/i },
|
||||
{ name: 'Self-check checklist', re: /Self-check before emitting/i },
|
||||
// The runtime-failure fallback must be ALWAYS-LOADED too: when an AUQ call errors
|
||||
// mid-skill, the model needs the prose-fallback rule in context that instant, not
|
||||
// stranded in an on-demand section. Same guarantee as the format spec above.
|
||||
{ name: 'AUQ-failure fallback subsection', re: /When AskUserQuestion is unavailable or a call fails/i },
|
||||
{ name: 'fallback SESSION_KIND branch', re: /SESSION_KIND/ },
|
||||
];
|
||||
|
||||
/** Per-skill AUQ rules that govern review-finding cadence. A carve may move
|
||||
|
||||
@@ -16,6 +16,8 @@
|
||||
* for the weekly periodic eval to notice.
|
||||
*/
|
||||
import { describe, test, expect } from 'bun:test';
|
||||
import * as fs from 'fs';
|
||||
import * as path from 'path';
|
||||
import type { TemplateContext } from '../scripts/resolvers/types';
|
||||
import { HOST_PATHS } from '../scripts/resolvers/types';
|
||||
import { generateAskUserFormat } from '../scripts/resolvers/preamble/generate-ask-user-format';
|
||||
@@ -161,3 +163,88 @@ describe('generateAskUserFormat — 5+ option split rule (slim inline + docs poi
|
||||
expect(out).toContain('**Non-ASCII characters');
|
||||
});
|
||||
});
|
||||
|
||||
describe('generateAskUserFormat — runtime-failure prose fallback', () => {
|
||||
const out = generateAskUserFormat(makeCtx());
|
||||
|
||||
test('documents the unavailable/failed subsection', () => {
|
||||
expect(out).toMatch(/When AskUserQuestion is unavailable or a call fails/i);
|
||||
});
|
||||
|
||||
test('carves out the auto-decide denial as NOT a failure', () => {
|
||||
expect(out).toContain('[plan-tune auto-decide]');
|
||||
expect(out).toMatch(/NOT a failure/i);
|
||||
// and explicitly: do not fall back to prose on an auto-decide denial
|
||||
expect(out).toMatch(/Do NOT[\s\S]{0,40}fall back to prose|never prose/i);
|
||||
});
|
||||
|
||||
test('retries the errored call exactly once before degrading', () => {
|
||||
expect(out).toMatch(/retry the SAME call \*\*once\*\*|retry the same call.*once/i);
|
||||
// idempotency guard against double-prompting
|
||||
expect(out).toMatch(/double-prompt|no answer could have surfaced/i);
|
||||
});
|
||||
|
||||
test('branches on SESSION_KIND: spawned / headless / interactive', () => {
|
||||
expect(out).toContain('SESSION_KIND');
|
||||
expect(out).toMatch(/`spawned`[\s\S]*auto-choose/);
|
||||
expect(out).toMatch(/`headless`[\s\S]*BLOCKED/);
|
||||
expect(out).toMatch(/`interactive`[\s\S]*prose fallback/);
|
||||
// empty/absent SESSION_KIND degrades to interactive
|
||||
expect(out).toMatch(/empty\/absent[\s\S]{0,40}interactive/i);
|
||||
});
|
||||
|
||||
// The mandatory triad the user explicitly required for the plain-text output.
|
||||
test('prose fallback mandates the triad: issue ELI10', () => {
|
||||
expect(out).toMatch(/ELI10 of the issue itself/i);
|
||||
});
|
||||
|
||||
test('prose fallback mandates the triad: per-choice Completeness score', () => {
|
||||
expect(out).toMatch(/Completeness scores per choice/i);
|
||||
expect(out).toMatch(/Completeness: X\/10.*EACH choice|on EACH choice/i);
|
||||
});
|
||||
|
||||
test('prose fallback mandates the triad: recommendation + (recommended) marker', () => {
|
||||
expect(out).toMatch(/Recommendation: <choice> because/);
|
||||
expect(out).toMatch(/\(recommended\)`? marker on that choice/);
|
||||
});
|
||||
|
||||
test('prose fallback is one paragraph per choice, not a bare bullet list', () => {
|
||||
expect(out).toMatch(/ONE paragraph per choice/i);
|
||||
expect(out).toMatch(/never a bare bullet list/i);
|
||||
});
|
||||
|
||||
test('prose fallback tells the user to reply with a letter, then STOP', () => {
|
||||
expect(out).toMatch(/reply with a letter/i);
|
||||
expect(out).toMatch(/STOP and wait/i);
|
||||
});
|
||||
|
||||
// OV2: the former "tool_use, not prose" assertions must carry the qualifier so the
|
||||
// fallback is not self-contradicting. Guards against the instruction collision
|
||||
// silently returning on a future edit.
|
||||
test('OV2: the Format line qualifies "not prose" with the fallback exception', () => {
|
||||
expect(out).toMatch(/must be sent as tool_use, not prose — unless the documented failure fallback/);
|
||||
});
|
||||
|
||||
test('OV2: the self-check "not writing prose" line carries the fallback qualifier', () => {
|
||||
expect(out).toMatch(/not writing prose — unless the documented failure fallback applies/);
|
||||
});
|
||||
});
|
||||
|
||||
describe('CQ2 — cross-file invariant: auto-decide prefix matches the hook', () => {
|
||||
const out = generateAskUserFormat(makeCtx());
|
||||
const hookSrc = fs.readFileSync(
|
||||
path.resolve(__dirname, '..', 'hosts', 'claude', 'hooks', 'question-preference-hook.ts'),
|
||||
'utf-8',
|
||||
);
|
||||
|
||||
test('the hook actually emits the [plan-tune auto-decide] prefix', () => {
|
||||
expect(hookSrc).toContain('[plan-tune auto-decide]');
|
||||
});
|
||||
|
||||
test('the resolver references the exact same prefix the hook emits', () => {
|
||||
// If a future edit reworded the hook reason, this catches the drift: the prose
|
||||
// fallback would stop recognizing the auto-decide denial as not-a-failure.
|
||||
const PREFIX = '[plan-tune auto-decide]';
|
||||
expect(hookSrc.includes(PREFIX) && out.includes(PREFIX)).toBe(true);
|
||||
});
|
||||
});
|
||||
|
||||
Reference in New Issue
Block a user