diff --git a/hosts/claude/hooks/auq-error-fallback-hook.ts b/hosts/claude/hooks/auq-error-fallback-hook.ts index be0900830..502c9d486 100755 --- a/hosts/claude/hooks/auq-error-fallback-hook.ts +++ b/hosts/claude/hooks/auq-error-fallback-hook.ts @@ -98,20 +98,25 @@ export function isErrorResponse(response: unknown): boolean { if (typeof response === 'string') { const s = response.trim(); if (s === '') return true; - return /tool result missing|internal error|\bis_error\b/i.test(s); + // Match ONLY the specific missing-result sentinel phrase, not any string that + // merely contains "error" — a real answer like "Investigate the internal error" + // must NOT trigger the fallback. (Codex review finding.) + return /tool result missing/i.test(s); } if (typeof response === 'object') { const rec = response as Record; - if (rec.is_error === true || rec.isError === true || rec.error) return true; + // Structured flag must be the boolean true — not the substring "is_error" inside + // a serialized success payload like '{"is_error": false}'. + if (rec.is_error === true || rec.isError === true) return true; + if (typeof rec.error === 'string' && rec.error.trim() !== '') return true; // Some hosts wrap the payload as { content: "..." } or { content: [{text}] }. const content = rec.content; - if (typeof content === 'string') return isErrorResponse(content); + if (typeof content === 'string') return /tool result missing/i.test(content); if (Array.isArray(content)) { const text = content .map((c) => (typeof c === 'string' ? c : (c as Record)?.text ?? '')) .join(' '); - if (text.trim() === '') return false; // empty content array on success is ambiguous; don't trigger - return /tool result missing|internal error/i.test(text); + return /tool result missing/i.test(text); } } return false; diff --git a/package.json b/package.json index a9440ba15..e1fb1cf70 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "gstack", - "version": "1.57.0.0", + "version": "1.57.2.0", "description": "Garry's Stack — Claude Code skills + fast headless browser. One repo, one install, entire AI engineering workflow.", "license": "MIT", "type": "module", @@ -20,16 +20,16 @@ "test": "bun test browse/test/ test/ make-pdf/test/ --ignore 'test/skill-e2e-*.test.ts' --ignore test/skill-llm-eval.test.ts --ignore test/skill-routing-e2e.test.ts --ignore test/codex-e2e.test.ts --ignore test/gemini-e2e.test.ts && (bun run slop:diff 2>/dev/null || true)", "test:free": "bun run scripts/test-free-shards.ts", "test:windows": "bun run scripts/test-free-shards.ts --windows-only", - "test:evals": "EVALS=1 bun test --retry 2 --concurrent --max-concurrency ${EVALS_CONCURRENCY:-15} test/skill-llm-eval.test.ts test/skill-e2e-*.test.ts test/skill-routing-e2e.test.ts test/codex-e2e.test.ts test/gemini-e2e.test.ts", - "test:evals:all": "EVALS=1 EVALS_ALL=1 bun test --retry 2 --concurrent --max-concurrency ${EVALS_CONCURRENCY:-15} test/skill-llm-eval.test.ts test/skill-e2e-*.test.ts test/skill-routing-e2e.test.ts test/codex-e2e.test.ts test/gemini-e2e.test.ts", - "test:e2e": "EVALS=1 bun test --retry 2 --concurrent --max-concurrency ${EVALS_CONCURRENCY:-15} test/skill-e2e-*.test.ts test/skill-routing-e2e.test.ts test/codex-e2e.test.ts test/gemini-e2e.test.ts", - "test:e2e:all": "EVALS=1 EVALS_ALL=1 bun test --retry 2 --concurrent --max-concurrency ${EVALS_CONCURRENCY:-15} test/skill-e2e-*.test.ts test/skill-routing-e2e.test.ts test/codex-e2e.test.ts test/gemini-e2e.test.ts", - "test:gate": "EVALS=1 EVALS_TIER=gate bun test --retry 2 --concurrent --max-concurrency ${EVALS_CONCURRENCY:-15} test/skill-llm-eval.test.ts test/skill-e2e-*.test.ts test/skill-routing-e2e.test.ts test/codex-e2e.test.ts test/gemini-e2e.test.ts", - "test:periodic": "EVALS=1 EVALS_TIER=periodic EVALS_ALL=1 bun test --retry 2 --concurrent --max-concurrency ${EVALS_CONCURRENCY:-15} test/skill-e2e-*.test.ts test/skill-routing-e2e.test.ts test/codex-e2e.test.ts test/gemini-e2e.test.ts", - "test:codex": "EVALS=1 bun test test/codex-e2e.test.ts", - "test:codex:all": "EVALS=1 EVALS_ALL=1 bun test test/codex-e2e.test.ts", - "test:gemini": "EVALS=1 bun test test/gemini-e2e.test.ts", - "test:gemini:all": "EVALS=1 EVALS_ALL=1 bun test test/gemini-e2e.test.ts", + "test:evals": "EVALS=1 GSTACK_HEADLESS=1 bun test --retry 2 --concurrent --max-concurrency ${EVALS_CONCURRENCY:-15} test/skill-llm-eval.test.ts test/skill-e2e-*.test.ts test/skill-routing-e2e.test.ts test/codex-e2e.test.ts test/gemini-e2e.test.ts", + "test:evals:all": "EVALS=1 GSTACK_HEADLESS=1 EVALS_ALL=1 bun test --retry 2 --concurrent --max-concurrency ${EVALS_CONCURRENCY:-15} test/skill-llm-eval.test.ts test/skill-e2e-*.test.ts test/skill-routing-e2e.test.ts test/codex-e2e.test.ts test/gemini-e2e.test.ts", + "test:e2e": "EVALS=1 GSTACK_HEADLESS=1 bun test --retry 2 --concurrent --max-concurrency ${EVALS_CONCURRENCY:-15} test/skill-e2e-*.test.ts test/skill-routing-e2e.test.ts test/codex-e2e.test.ts test/gemini-e2e.test.ts", + "test:e2e:all": "EVALS=1 GSTACK_HEADLESS=1 EVALS_ALL=1 bun test --retry 2 --concurrent --max-concurrency ${EVALS_CONCURRENCY:-15} test/skill-e2e-*.test.ts test/skill-routing-e2e.test.ts test/codex-e2e.test.ts test/gemini-e2e.test.ts", + "test:gate": "EVALS=1 GSTACK_HEADLESS=1 EVALS_TIER=gate bun test --retry 2 --concurrent --max-concurrency ${EVALS_CONCURRENCY:-15} test/skill-llm-eval.test.ts test/skill-e2e-*.test.ts test/skill-routing-e2e.test.ts test/codex-e2e.test.ts test/gemini-e2e.test.ts", + "test:periodic": "EVALS=1 GSTACK_HEADLESS=1 EVALS_TIER=periodic EVALS_ALL=1 bun test --retry 2 --concurrent --max-concurrency ${EVALS_CONCURRENCY:-15} test/skill-e2e-*.test.ts test/skill-routing-e2e.test.ts test/codex-e2e.test.ts test/gemini-e2e.test.ts", + "test:codex": "EVALS=1 GSTACK_HEADLESS=1 bun test test/codex-e2e.test.ts", + "test:codex:all": "EVALS=1 GSTACK_HEADLESS=1 EVALS_ALL=1 bun test test/codex-e2e.test.ts", + "test:gemini": "EVALS=1 GSTACK_HEADLESS=1 bun test test/gemini-e2e.test.ts", + "test:gemini:all": "EVALS=1 GSTACK_HEADLESS=1 EVALS_ALL=1 bun test test/gemini-e2e.test.ts", "skill:check": "bun run scripts/skill-check.ts", "dev:skill": "bun run scripts/dev-skill.ts", "start": "bun run browse/src/server.ts", diff --git a/setup b/setup index 1fce574ce..0c180f7bf 100755 --- a/setup +++ b/setup @@ -1325,9 +1325,13 @@ if [ "$NO_TEAM_MODE" -ne 1 ] \ && [ -x "$PLAN_TUNE_LOG_HOOK" ] \ && [ -x "$PLAN_TUNE_PREF_HOOK" ]; then - # Already installed? Check the settings.json for our source tag. + # Already installed? Require BOTH the plan-tune source AND the AUQ-error-fallback + # source — so an existing install that predates the fallback hook re-runs the + # install (which is idempotent for the plan-tune hooks) and picks up the new one. ALREADY_INSTALLED=0 - if "$SETTINGS_HOOK" list-sources 2>/dev/null | grep -q "plan-tune-cathedral"; then + _HOOK_SOURCES=$("$SETTINGS_HOOK" list-sources 2>/dev/null || true) + if printf '%s' "$_HOOK_SOURCES" | grep -q "plan-tune-cathedral" \ + && printf '%s' "$_HOOK_SOURCES" | grep -q "auq-error-fallback"; then ALREADY_INSTALLED=1 fi @@ -1365,15 +1369,19 @@ if [ "$NO_TEAM_MODE" -ne 1 ] \ --command "$PLAN_TUNE_PREF_HOOK" \ --source plan-tune-cathedral \ --timeout 5 - # AUQ-failure prose-fallback reliability hook (OV3:B). Fires only when an - # AskUserQuestion call returns an error/missing result; inert on success and - # inert if the platform doesn't invoke PostToolUse on tool errors. + # AskUserQuestion-failure prose-fallback reliability hook (OV3:B). Fires only when + # an AskUserQuestion call returns an error/missing result; inert on success and + # inert if the platform doesn't invoke PostToolUse on tool errors. MUST use its + # OWN source tag: gstack-settings-hook dedupes by (event, matcher, source) and + # REPLACES the entry's hooks, so sharing 'plan-tune-cathedral' would overwrite the + # question-log capture hook (same event+matcher). A distinct source = a second + # PostToolUse entry; both run in parallel. if [ -x "$AUQ_ERROR_FALLBACK_HOOK" ]; then "$SETTINGS_HOOK" add-event \ --event PostToolUse \ --matcher '(AskUserQuestion|mcp__.*__AskUserQuestion)' \ --command "$AUQ_ERROR_FALLBACK_HOOK" \ - --source plan-tune-cathedral \ + --source auq-error-fallback \ --timeout 5 fi } diff --git a/test/auq-error-fallback-hook.test.ts b/test/auq-error-fallback-hook.test.ts index 48a996edb..ee6d317f5 100644 --- a/test/auq-error-fallback-hook.test.ts +++ b/test/auq-error-fallback-hook.test.ts @@ -32,20 +32,29 @@ describe('isErrorResponse — only clear failures, never a real answer', () => { expect(isErrorResponse('[Tool result missing due to internal error]')).toBe(true); }); - test('is_error / error object shapes are failures', () => { + test('is_error: true / error-field / sentinel-in-content are failures', () => { expect(isErrorResponse({ is_error: true })).toBe(true); expect(isErrorResponse({ isError: true })).toBe(true); expect(isErrorResponse({ error: 'boom' })).toBe(true); - expect(isErrorResponse({ content: '...internal error...' })).toBe(true); + expect(isErrorResponse({ content: 'Tool result missing due to internal error' })).toBe(true); }); test('a real answer is NOT a failure (no false trigger)', () => { expect(isErrorResponse({ answers: [{ option_label: 'A' }] })).toBe(false); expect(isErrorResponse('A')).toBe(false); - // a choice that coincidentally contains the word "error" must not trip it + // a choice that coincidentally contains "error" must not trip it expect(isErrorResponse({ answers: [{ option_label: 'Fix the error' }] })).toBe(false); expect(isErrorResponse('Investigate the login error')).toBe(false); }); + + test('Codex review: narrow detection — generic "error"/"is_error" substrings do NOT trigger', () => { + // A real answer mentioning "internal error" must not be read as a failure. + expect(isErrorResponse('Investigate the internal error')).toBe(false); + // A serialized success payload containing the substring is_error:false must not trigger. + expect(isErrorResponse('{"is_error": false, "answer": "A"}')).toBe(false); + expect(isErrorResponse({ is_error: false })).toBe(false); + expect(isErrorResponse({ content: 'The page had an internal error we fixed' })).toBe(false); + }); }); describe('directiveFor — per-session-kind instruction', () => { diff --git a/test/helpers/agent-sdk-runner.ts b/test/helpers/agent-sdk-runner.ts index c35585438..fc50f7884 100644 --- a/test/helpers/agent-sdk-runner.ts +++ b/test/helpers/agent-sdk-runner.ts @@ -300,13 +300,12 @@ export async function runAgentSdkTest( const queryImpl: QueryProvider = opts.queryProvider ?? query; const model = opts.model ?? 'claude-opus-4-7'; - // Default GSTACK_HEADLESS=1 so SDK-driven eval/E2E runs classify as headless: an - // AskUserQuestion failure BLOCKs instead of emitting a prose question no human can - // answer. Set ambiently (the SDK child inherits process.env) rather than via - // sdkOpts.env — passing an env object to the SDK breaks its auth pipeline (see - // CLAUDE.md). A suite testing the interactive prose-fallback path sets - // process.env.GSTACK_HEADLESS='' before calling. - if (process.env.GSTACK_HEADLESS === undefined) process.env.GSTACK_HEADLESS = '1'; + // NOTE on GSTACK_HEADLESS: the SDK child inherits process.env, so headless + // classification for eval/E2E runs is set by the `test:gate` / `test:evals` + // package.json scripts (scoped to that invocation), NOT mutated here. We must not + // pass sdkOpts.env (it breaks the SDK auth pipeline — see CLAUDE.md) and must not + // mutate process.env ambiently (it would leak headless into later interactive-path + // tests in the same Bun process — Codex review finding). let attempt = 0; let lastErr: unknown = null;