mirror of
https://github.com/garrytan/gstack.git
synced 2026-06-18 07:40:09 +02:00
fix(auq): harden error-fallback hook + harness per adversarial review
Codex pre-landing review found three real issues:
- The PostToolUse fallback hook shared source 'plan-tune-cathedral' with the
question-log hook (same event+matcher); gstack-settings-hook replaces the entry,
so it would have clobbered plan-tune capture. Give it its own 'auq-error-fallback'
source (separate entry, both run); ALREADY_INSTALLED now requires both sources.
- isErrorResponse triggered on any string containing 'internal error'/'is_error',
so a real answer or a {"is_error": false} payload could fire the fallback after a
successful question. Narrow it to the missing-result sentinel + boolean is_error.
- The SDK runner mutated process.env.GSTACK_HEADLESS process-wide (leaked headless
into later tests). Removed; GSTACK_HEADLESS=1 now lives in the eval package.json
scripts, scoped to the invocation and inherited by the SDK child.
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -32,20 +32,29 @@ describe('isErrorResponse — only clear failures, never a real answer', () => {
|
||||
expect(isErrorResponse('[Tool result missing due to internal error]')).toBe(true);
|
||||
});
|
||||
|
||||
test('is_error / error object shapes are failures', () => {
|
||||
test('is_error: true / error-field / sentinel-in-content are failures', () => {
|
||||
expect(isErrorResponse({ is_error: true })).toBe(true);
|
||||
expect(isErrorResponse({ isError: true })).toBe(true);
|
||||
expect(isErrorResponse({ error: 'boom' })).toBe(true);
|
||||
expect(isErrorResponse({ content: '...internal error...' })).toBe(true);
|
||||
expect(isErrorResponse({ content: 'Tool result missing due to internal error' })).toBe(true);
|
||||
});
|
||||
|
||||
test('a real answer is NOT a failure (no false trigger)', () => {
|
||||
expect(isErrorResponse({ answers: [{ option_label: 'A' }] })).toBe(false);
|
||||
expect(isErrorResponse('A')).toBe(false);
|
||||
// a choice that coincidentally contains the word "error" must not trip it
|
||||
// a choice that coincidentally contains "error" must not trip it
|
||||
expect(isErrorResponse({ answers: [{ option_label: 'Fix the error' }] })).toBe(false);
|
||||
expect(isErrorResponse('Investigate the login error')).toBe(false);
|
||||
});
|
||||
|
||||
test('Codex review: narrow detection — generic "error"/"is_error" substrings do NOT trigger', () => {
|
||||
// A real answer mentioning "internal error" must not be read as a failure.
|
||||
expect(isErrorResponse('Investigate the internal error')).toBe(false);
|
||||
// A serialized success payload containing the substring is_error:false must not trigger.
|
||||
expect(isErrorResponse('{"is_error": false, "answer": "A"}')).toBe(false);
|
||||
expect(isErrorResponse({ is_error: false })).toBe(false);
|
||||
expect(isErrorResponse({ content: 'The page had an internal error we fixed' })).toBe(false);
|
||||
});
|
||||
});
|
||||
|
||||
describe('directiveFor — per-session-kind instruction', () => {
|
||||
|
||||
@@ -300,13 +300,12 @@ export async function runAgentSdkTest(
|
||||
const queryImpl: QueryProvider = opts.queryProvider ?? query;
|
||||
const model = opts.model ?? 'claude-opus-4-7';
|
||||
|
||||
// Default GSTACK_HEADLESS=1 so SDK-driven eval/E2E runs classify as headless: an
|
||||
// AskUserQuestion failure BLOCKs instead of emitting a prose question no human can
|
||||
// answer. Set ambiently (the SDK child inherits process.env) rather than via
|
||||
// sdkOpts.env — passing an env object to the SDK breaks its auth pipeline (see
|
||||
// CLAUDE.md). A suite testing the interactive prose-fallback path sets
|
||||
// process.env.GSTACK_HEADLESS='' before calling.
|
||||
if (process.env.GSTACK_HEADLESS === undefined) process.env.GSTACK_HEADLESS = '1';
|
||||
// NOTE on GSTACK_HEADLESS: the SDK child inherits process.env, so headless
|
||||
// classification for eval/E2E runs is set by the `test:gate` / `test:evals`
|
||||
// package.json scripts (scoped to that invocation), NOT mutated here. We must not
|
||||
// pass sdkOpts.env (it breaks the SDK auth pipeline — see CLAUDE.md) and must not
|
||||
// mutate process.env ambiently (it would leak headless into later interactive-path
|
||||
// tests in the same Bun process — Codex review finding).
|
||||
|
||||
let attempt = 0;
|
||||
let lastErr: unknown = null;
|
||||
|
||||
Reference in New Issue
Block a user