diff --git a/docs/spikes/claude-code-hook-mutation.md b/docs/spikes/claude-code-hook-mutation.md index 70a4ae18a..9d91e16cd 100644 --- a/docs/spikes/claude-code-hook-mutation.md +++ b/docs/spikes/claude-code-hook-mutation.md @@ -127,6 +127,38 @@ required for our hook to fire there. (PostToolUse hooks can also set `additionalContext` to append to the tool result; we don't need this for v1 capture.) +## PostToolUse on tool error (AUQ-failure fallback, OV3:B) — UNVERIFIED + +The AUQ-failure prose fallback adds a defensive PostToolUse hook +(`hosts/claude/hooks/auq-error-fallback-hook.ts`) that, when an AskUserQuestion +call returns an error / missing result, injects `additionalContext` reminding the +model to run the prose fallback per `SESSION_KIND`. It uses the same +`additionalContext` mechanism documented above. + +**Open question we could NOT settle in a harness:** does Claude Code invoke +PostToolUse hooks when an MCP tool call returns a transport/missing-result error +(the Conductor bug surfaces `[Tool result missing due to internal error]`)? The +docs above cover PostToolUse on *success*. We could not force the Conductor +internal MCP failure on demand to observe it. + +**Decision (OV3:B = A):** build the hook defensively anyway. +- It is **inert on success** (only fires when `isErrorResponse(tool_response)` is + true) and **inert if the platform never invokes it** on the error path. +- The prompt-level fallback in `generate-ask-user-format.ts` covers the case + regardless — the hook is a reliability *layer*, not the mechanism. +- Its decision logic is unit-tested deterministically + (`test/auq-error-fallback-hook.test.ts`): given a synthetic error `tool_response` + + each `SESSION_KIND`, it emits the correct directive; given a real answer it + defers. + +**Recommended manual / partial spike (to close the gap later):** register a +throwaway PostToolUse hook that logs on fire, then (a) trigger a normal tool +error (e.g. a failing `Bash` call) to confirm PostToolUse fires on tool errors at +all, and (b) reproduce the Conductor MCP AUQ failure and check the log. If (b) +confirms a fire, promote the hook from "defensive/inert" to "verified". Until +then, treat the runtime layer as best-effort and the prompt-level fallback as the +guaranteed path. + ## Settings.json snippet for T8 hook installer ```json diff --git a/hosts/claude/hooks/auq-error-fallback-hook b/hosts/claude/hooks/auq-error-fallback-hook new file mode 100755 index 000000000..063d7fce9 --- /dev/null +++ b/hosts/claude/hooks/auq-error-fallback-hook @@ -0,0 +1,7 @@ +#!/usr/bin/env bash +# Bash shim — Claude Code hooks run `command` strings via /bin/sh, so this +# wrapper makes the TypeScript hook executable via bun. Settings.json +# references this file directly. +set -e +HERE="$(cd "$(dirname "$0")" && pwd)" +exec bun "$HERE/auq-error-fallback-hook.ts" diff --git a/hosts/claude/hooks/auq-error-fallback-hook.ts b/hosts/claude/hooks/auq-error-fallback-hook.ts new file mode 100755 index 000000000..be0900830 --- /dev/null +++ b/hosts/claude/hooks/auq-error-fallback-hook.ts @@ -0,0 +1,197 @@ +#!/usr/bin/env bun +/** + * PostToolUse hook for AskUserQuestion — runtime reliability layer for the + * AUQ-failure prose fallback (OV3:B). + * + * When an AskUserQuestion call comes back as an ERROR / missing result (the + * Conductor MCP bug returns `[Tool result missing due to internal error]`), this + * hook injects `additionalContext` reminding the model of the failure-fallback + * rule, tailored to the session kind. It does NOT render the prose itself — the + * model still emits it. The hook only guarantees the reminder fires at the moment + * of failure, instead of relying on the model noticing the error result and + * recalling the echoed SESSION_KIND. + * + * DEFENSIVE / INERT-IF-UNSUPPORTED: it is unverified whether Claude Code invokes + * PostToolUse hooks when an MCP tool returns a transport/missing-result error (we + * could not force that Conductor-internal failure in a harness — see + * docs/spikes/claude-code-hook-mutation.md §"PostToolUse on tool error"). If the + * platform does NOT fire the hook on that path, this is simply never invoked — no + * harm; the prompt-level fallback in generate-ask-user-format.ts still covers it. + * On a SUCCESSFUL AskUserQuestion (a real answer), the hook defers (no output). + * + * Triggered by ~/.claude/settings.json (registered by `setup` next to + * question-log-hook): + * PostToolUse matcher "(AskUserQuestion|mcp__.*__AskUserQuestion)" + * + * Invariants: + * - Always exits 0. A failing hook MUST NOT block the user's session. + * - Never triggers on a successful answer (would corrupt a normal AUQ). + * - Errors land in ~/.gstack/hook-errors.log. + */ +import * as fs from 'fs'; +import * as path from 'path'; +import * as os from 'os'; +import { spawnSync } from 'child_process'; + +interface HookStdin { + tool_name?: string; + tool_response?: unknown; + cwd?: string; +} + +function stateRoot(): string { + return ( + process.env.GSTACK_STATE_ROOT || + process.env.GSTACK_HOME || + path.join(os.homedir(), '.gstack') + ); +} + +function logHookError(msg: string): void { + try { + const sr = stateRoot(); + fs.mkdirSync(sr, { recursive: true }); + fs.appendFileSync( + path.join(sr, 'hook-errors.log'), + `${new Date().toISOString()} auq-error-fallback-hook: ${msg}\n`, + ); + } catch { + // last-resort swallow + } +} + +function readStdin(): Promise { + return new Promise((resolve) => { + let buf = ''; + process.stdin.setEncoding('utf-8'); + process.stdin.on('data', (chunk) => (buf += chunk)); + process.stdin.on('end', () => resolve(buf)); + process.stdin.on('error', () => resolve(buf)); + setTimeout(() => resolve(buf), 2000); + }); +} + +/** No-op output — let the tool result stand untouched. */ +function defer(): void { + process.stdout.write( + JSON.stringify({ hookSpecificOutput: { hookEventName: 'PostToolUse' } }), + ); + process.exit(0); +} + +function inject(additionalContext: string): void { + process.stdout.write( + JSON.stringify({ + hookSpecificOutput: { hookEventName: 'PostToolUse', additionalContext }, + }), + ); + process.exit(0); +} + +/** + * Decide whether the tool_response is an ERROR / missing result rather than a + * real answer. Conservative: only flag clear failure shapes, so a successful + * AskUserQuestion (which carries the user's choice) is never misread as failure. + */ +export function isErrorResponse(response: unknown): boolean { + if (response === null || response === undefined) return true; + if (typeof response === 'string') { + const s = response.trim(); + if (s === '') return true; + return /tool result missing|internal error|\bis_error\b/i.test(s); + } + if (typeof response === 'object') { + const rec = response as Record; + if (rec.is_error === true || rec.isError === true || rec.error) return true; + // Some hosts wrap the payload as { content: "..." } or { content: [{text}] }. + const content = rec.content; + if (typeof content === 'string') return isErrorResponse(content); + if (Array.isArray(content)) { + const text = content + .map((c) => (typeof c === 'string' ? c : (c as Record)?.text ?? '')) + .join(' '); + if (text.trim() === '') return false; // empty content array on success is ambiguous; don't trigger + return /tool result missing|internal error/i.test(text); + } + } + return false; +} + +/** Resolve SESSION_KIND via the shared helper (same classification the preamble + * echoes). Falls back to 'interactive' (degrade-safe) on any failure. */ +export function sessionKind(cwd?: string): 'spawned' | 'headless' | 'interactive' { + try { + const here = path.dirname(new URL(import.meta.url).pathname); + const bin = path.resolve(here, '..', '..', '..', 'bin', 'gstack-session-kind'); + const res = spawnSync(bin, [], { + encoding: 'utf-8', + timeout: 3000, + cwd: cwd && fs.existsSync(cwd) ? cwd : undefined, + }); + const out = (res.stdout || '').trim(); + if (out === 'spawned' || out === 'headless' || out === 'interactive') return out; + } catch (e) { + logHookError(`sessionKind failed: ${(e as Error).message}`); + } + return 'interactive'; +} + +/** The directive injected per session kind. Exported for unit testing. */ +export function directiveFor(kind: 'spawned' | 'headless' | 'interactive'): string { + const lead = + 'The AskUserQuestion call did not return a usable answer (error / missing result). ' + + 'Per the AskUserQuestion failure-fallback rule: '; + switch (kind) { + case 'spawned': + return ( + lead + + 'SESSION_KIND=spawned — auto-choose the recommended option per the Spawned session block. ' + + 'Do not emit prose, do not BLOCK.' + ); + case 'headless': + return ( + lead + + 'SESSION_KIND=headless — report `BLOCKED — AskUserQuestion unavailable` and stop; no human can answer.' + ); + case 'interactive': + default: + return ( + lead + + 'SESSION_KIND=interactive — render the decision as a PROSE message now: a clear ELI10 of the issue, ' + + 'then a Recommendation line, then ONE paragraph per choice carrying its `(recommended)` marker, its ' + + '`Completeness: X/10`, and 2-4 sentences of reasoning. Tell the user to reply with a letter, then STOP. ' + + '(Retry the call once first only if no answer could have surfaced.)' + ); + } +} + +async function main(): Promise { + const raw = await readStdin(); + if (!raw.trim()) return defer(); + + let stdin: HookStdin; + try { + stdin = JSON.parse(raw); + } catch (e) { + logHookError(`stdin parse failed: ${(e as Error).message}`); + return defer(); + } + + const toolName = stdin.tool_name || ''; + if (toolName !== 'AskUserQuestion' && !/^mcp__.+__AskUserQuestion$/.test(toolName)) { + return defer(); + } + + if (!isErrorResponse(stdin.tool_response)) return defer(); + + inject(directiveFor(sessionKind(stdin.cwd))); +} + +// Only run the stdin→stdout pipeline when executed as a hook, not when imported +// by the unit test (which exercises the exported pure functions). +if (import.meta.main) { + main().catch((e) => { + logHookError(`main crash: ${(e as Error).message}`); + defer(); + }); +} diff --git a/setup b/setup index 37991eda7..1fce574ce 100755 --- a/setup +++ b/setup @@ -1317,6 +1317,7 @@ fi # already registered under that tag, the install is a no-op (no prompt). PLAN_TUNE_LOG_HOOK="$SOURCE_GSTACK_DIR/hosts/claude/hooks/question-log-hook" PLAN_TUNE_PREF_HOOK="$SOURCE_GSTACK_DIR/hosts/claude/hooks/question-preference-hook" +AUQ_ERROR_FALLBACK_HOOK="$SOURCE_GSTACK_DIR/hosts/claude/hooks/auq-error-fallback-hook" PLAN_TUNE_INSTALL_MARKER="$HOME/.gstack/.plan-tune-hooks-prompted" if [ "$NO_TEAM_MODE" -ne 1 ] \ @@ -1364,6 +1365,17 @@ if [ "$NO_TEAM_MODE" -ne 1 ] \ --command "$PLAN_TUNE_PREF_HOOK" \ --source plan-tune-cathedral \ --timeout 5 + # AUQ-failure prose-fallback reliability hook (OV3:B). Fires only when an + # AskUserQuestion call returns an error/missing result; inert on success and + # inert if the platform doesn't invoke PostToolUse on tool errors. + if [ -x "$AUQ_ERROR_FALLBACK_HOOK" ]; then + "$SETTINGS_HOOK" add-event \ + --event PostToolUse \ + --matcher '(AskUserQuestion|mcp__.*__AskUserQuestion)' \ + --command "$AUQ_ERROR_FALLBACK_HOOK" \ + --source plan-tune-cathedral \ + --timeout 5 + fi } if [ "$ALREADY_INSTALLED" -eq 1 ]; then diff --git a/test/auq-error-fallback-hook.test.ts b/test/auq-error-fallback-hook.test.ts new file mode 100644 index 000000000..48a996edb --- /dev/null +++ b/test/auq-error-fallback-hook.test.ts @@ -0,0 +1,122 @@ +/** + * auq-error-fallback-hook — the OV3:B runtime reliability layer. + * + * Two layers of testing: + * - PURE functions (isErrorResponse, directiveFor): deterministic, the core logic. + * - INTEGRATION: spawn the hook as a PostToolUse process with synthetic stdin and + * a controlled env, assert it injects the right directive on an error result and + * stays inert on a real answer. + * + * NOTE: whether the Claude Code PLATFORM invokes PostToolUse on an MCP + * transport/missing-result error is unverified (could not force the Conductor + * bug in a harness — see docs/spikes/claude-code-hook-mutation.md). These tests + * pin the hook's BEHAVIOR given it is invoked; the platform trigger is the + * documented residual risk. The hook is inert if never invoked. + */ +import { describe, test, expect } from 'bun:test'; +import { spawnSync } from 'child_process'; +import * as path from 'path'; +import { isErrorResponse, directiveFor } from '../hosts/claude/hooks/auq-error-fallback-hook.ts'; + +const HOOK = path.resolve(__dirname, '..', 'hosts', 'claude', 'hooks', 'auq-error-fallback-hook.ts'); + +describe('isErrorResponse — only clear failures, never a real answer', () => { + test('null / undefined / empty string are failures', () => { + expect(isErrorResponse(null)).toBe(true); + expect(isErrorResponse(undefined)).toBe(true); + expect(isErrorResponse('')).toBe(true); + expect(isErrorResponse(' ')).toBe(true); + }); + + test('the Conductor missing-result string is a failure', () => { + expect(isErrorResponse('[Tool result missing due to internal error]')).toBe(true); + }); + + test('is_error / error object shapes are failures', () => { + expect(isErrorResponse({ is_error: true })).toBe(true); + expect(isErrorResponse({ isError: true })).toBe(true); + expect(isErrorResponse({ error: 'boom' })).toBe(true); + expect(isErrorResponse({ content: '...internal error...' })).toBe(true); + }); + + test('a real answer is NOT a failure (no false trigger)', () => { + expect(isErrorResponse({ answers: [{ option_label: 'A' }] })).toBe(false); + expect(isErrorResponse('A')).toBe(false); + // a choice that coincidentally contains the word "error" must not trip it + expect(isErrorResponse({ answers: [{ option_label: 'Fix the error' }] })).toBe(false); + expect(isErrorResponse('Investigate the login error')).toBe(false); + }); +}); + +describe('directiveFor — per-session-kind instruction', () => { + test('interactive directive demands the prose triad', () => { + const d = directiveFor('interactive'); + expect(d).toMatch(/ELI10/); + expect(d).toMatch(/Completeness: X\/10/); + expect(d).toMatch(/\(recommended\)/); + expect(d).toMatch(/reply with a letter/i); + expect(d).toMatch(/STOP/); + }); + + test('headless directive BLOCKs', () => { + expect(directiveFor('headless')).toMatch(/BLOCKED — AskUserQuestion unavailable/); + }); + + test('spawned directive auto-chooses', () => { + expect(directiveFor('spawned')).toMatch(/auto-choose/i); + }); +}); + +/** Spawn the hook with synthetic stdin + controlled env; parse its JSON stdout. */ +function runHook(stdin: object, env: Record): { additionalContext?: string } { + const res = spawnSync('bun', [HOOK], { + input: JSON.stringify(stdin), + encoding: 'utf-8', + env: { PATH: process.env.PATH ?? '/usr/bin:/bin', ...env }, + }); + const parsed = JSON.parse(res.stdout || '{}'); + return parsed.hookSpecificOutput ?? {}; +} + +describe('hook integration — invoked as PostToolUse', () => { + test('error result + headless env → injects BLOCK directive', () => { + const out = runHook( + { tool_name: 'mcp__conductor__AskUserQuestion', tool_response: '[Tool result missing due to internal error]' }, + { GSTACK_HEADLESS: '1' }, + ); + expect(out.additionalContext).toMatch(/BLOCKED — AskUserQuestion unavailable/); + }); + + test('error result + interactive env → injects prose-triad directive', () => { + const out = runHook( + { tool_name: 'AskUserQuestion', tool_response: null }, + { CONDUCTOR_PORT: '55010' }, + ); + expect(out.additionalContext).toMatch(/render the decision as a PROSE message/i); + expect(out.additionalContext).toMatch(/Completeness: X\/10/); + }); + + test('error result + spawned env → injects auto-choose directive', () => { + const out = runHook( + { tool_name: 'AskUserQuestion', tool_response: { is_error: true } }, + { OPENCLAW_SESSION: '1' }, + ); + expect(out.additionalContext).toMatch(/auto-choose/i); + }); + + test('SUCCESSFUL answer → no injection (inert on real answers)', () => { + const out = runHook( + { tool_name: 'AskUserQuestion', tool_response: { answers: [{ option_label: 'A' }] } }, + { GSTACK_HEADLESS: '1' }, + ); + expect(out.additionalContext).toBeUndefined(); + }); + + test('non-AUQ tool → defers (no injection)', () => { + const out = runHook( + { tool_name: 'Bash', tool_response: null }, + { GSTACK_HEADLESS: '1' }, + ); + expect(out.additionalContext).toBeUndefined(); + }); +});