fix(auq): harden error-fallback hook + harness per adversarial review

Codex pre-landing review found three real issues:
- The PostToolUse fallback hook shared source 'plan-tune-cathedral' with the
  question-log hook (same event+matcher); gstack-settings-hook replaces the entry,
  so it would have clobbered plan-tune capture. Give it its own 'auq-error-fallback'
  source (separate entry, both run); ALREADY_INSTALLED now requires both sources.
- isErrorResponse triggered on any string containing 'internal error'/'is_error',
  so a real answer or a {"is_error": false} payload could fire the fallback after a
  successful question. Narrow it to the missing-result sentinel + boolean is_error.
- The SDK runner mutated process.env.GSTACK_HEADLESS process-wide (leaked headless
  into later tests). Removed; GSTACK_HEADLESS=1 now lives in the eval package.json
  scripts, scoped to the invocation and inherited by the SDK child.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
Garry Tan
2026-06-07 19:53:01 -07:00
parent 0ddaf27aa2
commit 6a6900f15b
5 changed files with 53 additions and 32 deletions
+12 -3
View File
@@ -32,20 +32,29 @@ describe('isErrorResponse — only clear failures, never a real answer', () => {
expect(isErrorResponse('[Tool result missing due to internal error]')).toBe(true);
});
test('is_error / error object shapes are failures', () => {
test('is_error: true / error-field / sentinel-in-content are failures', () => {
expect(isErrorResponse({ is_error: true })).toBe(true);
expect(isErrorResponse({ isError: true })).toBe(true);
expect(isErrorResponse({ error: 'boom' })).toBe(true);
expect(isErrorResponse({ content: '...internal error...' })).toBe(true);
expect(isErrorResponse({ content: 'Tool result missing due to internal error' })).toBe(true);
});
test('a real answer is NOT a failure (no false trigger)', () => {
expect(isErrorResponse({ answers: [{ option_label: 'A' }] })).toBe(false);
expect(isErrorResponse('A')).toBe(false);
// a choice that coincidentally contains the word "error" must not trip it
// a choice that coincidentally contains "error" must not trip it
expect(isErrorResponse({ answers: [{ option_label: 'Fix the error' }] })).toBe(false);
expect(isErrorResponse('Investigate the login error')).toBe(false);
});
test('Codex review: narrow detection — generic "error"/"is_error" substrings do NOT trigger', () => {
// A real answer mentioning "internal error" must not be read as a failure.
expect(isErrorResponse('Investigate the internal error')).toBe(false);
// A serialized success payload containing the substring is_error:false must not trigger.
expect(isErrorResponse('{"is_error": false, "answer": "A"}')).toBe(false);
expect(isErrorResponse({ is_error: false })).toBe(false);
expect(isErrorResponse({ content: 'The page had an internal error we fixed' })).toBe(false);
});
});
describe('directiveFor — per-session-kind instruction', () => {
+6 -7
View File
@@ -300,13 +300,12 @@ export async function runAgentSdkTest(
const queryImpl: QueryProvider = opts.queryProvider ?? query;
const model = opts.model ?? 'claude-opus-4-7';
// Default GSTACK_HEADLESS=1 so SDK-driven eval/E2E runs classify as headless: an
// AskUserQuestion failure BLOCKs instead of emitting a prose question no human can
// answer. Set ambiently (the SDK child inherits process.env) rather than via
// sdkOpts.env — passing an env object to the SDK breaks its auth pipeline (see
// CLAUDE.md). A suite testing the interactive prose-fallback path sets
// process.env.GSTACK_HEADLESS='' before calling.
if (process.env.GSTACK_HEADLESS === undefined) process.env.GSTACK_HEADLESS = '1';
// NOTE on GSTACK_HEADLESS: the SDK child inherits process.env, so headless
// classification for eval/E2E runs is set by the `test:gate` / `test:evals`
// package.json scripts (scoped to that invocation), NOT mutated here. We must not
// pass sdkOpts.env (it breaks the SDK auth pipeline see CLAUDE.md) and must not
// mutate process.env ambiently (it would leak headless into later interactive-path
// tests in the same Bun process — Codex review finding).
let attempt = 0;
let lastErr: unknown = null;