mirror of
https://github.com/garrytan/gstack.git
synced 2026-06-17 07:10:12 +02:00
fix(auq): harden error-fallback hook + harness per adversarial review
Codex pre-landing review found three real issues:
- The PostToolUse fallback hook shared source 'plan-tune-cathedral' with the
question-log hook (same event+matcher); gstack-settings-hook replaces the entry,
so it would have clobbered plan-tune capture. Give it its own 'auq-error-fallback'
source (separate entry, both run); ALREADY_INSTALLED now requires both sources.
- isErrorResponse triggered on any string containing 'internal error'/'is_error',
so a real answer or a {"is_error": false} payload could fire the fallback after a
successful question. Narrow it to the missing-result sentinel + boolean is_error.
- The SDK runner mutated process.env.GSTACK_HEADLESS process-wide (leaked headless
into later tests). Removed; GSTACK_HEADLESS=1 now lives in the eval package.json
scripts, scoped to the invocation and inherited by the SDK child.
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -98,20 +98,25 @@ export function isErrorResponse(response: unknown): boolean {
|
||||
if (typeof response === 'string') {
|
||||
const s = response.trim();
|
||||
if (s === '') return true;
|
||||
return /tool result missing|internal error|\bis_error\b/i.test(s);
|
||||
// Match ONLY the specific missing-result sentinel phrase, not any string that
|
||||
// merely contains "error" — a real answer like "Investigate the internal error"
|
||||
// must NOT trigger the fallback. (Codex review finding.)
|
||||
return /tool result missing/i.test(s);
|
||||
}
|
||||
if (typeof response === 'object') {
|
||||
const rec = response as Record<string, unknown>;
|
||||
if (rec.is_error === true || rec.isError === true || rec.error) return true;
|
||||
// Structured flag must be the boolean true — not the substring "is_error" inside
|
||||
// a serialized success payload like '{"is_error": false}'.
|
||||
if (rec.is_error === true || rec.isError === true) return true;
|
||||
if (typeof rec.error === 'string' && rec.error.trim() !== '') return true;
|
||||
// Some hosts wrap the payload as { content: "..." } or { content: [{text}] }.
|
||||
const content = rec.content;
|
||||
if (typeof content === 'string') return isErrorResponse(content);
|
||||
if (typeof content === 'string') return /tool result missing/i.test(content);
|
||||
if (Array.isArray(content)) {
|
||||
const text = content
|
||||
.map((c) => (typeof c === 'string' ? c : (c as Record<string, unknown>)?.text ?? ''))
|
||||
.join(' ');
|
||||
if (text.trim() === '') return false; // empty content array on success is ambiguous; don't trigger
|
||||
return /tool result missing|internal error/i.test(text);
|
||||
return /tool result missing/i.test(text);
|
||||
}
|
||||
}
|
||||
return false;
|
||||
|
||||
+11
-11
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "gstack",
|
||||
"version": "1.57.0.0",
|
||||
"version": "1.57.2.0",
|
||||
"description": "Garry's Stack — Claude Code skills + fast headless browser. One repo, one install, entire AI engineering workflow.",
|
||||
"license": "MIT",
|
||||
"type": "module",
|
||||
@@ -20,16 +20,16 @@
|
||||
"test": "bun test browse/test/ test/ make-pdf/test/ --ignore 'test/skill-e2e-*.test.ts' --ignore test/skill-llm-eval.test.ts --ignore test/skill-routing-e2e.test.ts --ignore test/codex-e2e.test.ts --ignore test/gemini-e2e.test.ts && (bun run slop:diff 2>/dev/null || true)",
|
||||
"test:free": "bun run scripts/test-free-shards.ts",
|
||||
"test:windows": "bun run scripts/test-free-shards.ts --windows-only",
|
||||
"test:evals": "EVALS=1 bun test --retry 2 --concurrent --max-concurrency ${EVALS_CONCURRENCY:-15} test/skill-llm-eval.test.ts test/skill-e2e-*.test.ts test/skill-routing-e2e.test.ts test/codex-e2e.test.ts test/gemini-e2e.test.ts",
|
||||
"test:evals:all": "EVALS=1 EVALS_ALL=1 bun test --retry 2 --concurrent --max-concurrency ${EVALS_CONCURRENCY:-15} test/skill-llm-eval.test.ts test/skill-e2e-*.test.ts test/skill-routing-e2e.test.ts test/codex-e2e.test.ts test/gemini-e2e.test.ts",
|
||||
"test:e2e": "EVALS=1 bun test --retry 2 --concurrent --max-concurrency ${EVALS_CONCURRENCY:-15} test/skill-e2e-*.test.ts test/skill-routing-e2e.test.ts test/codex-e2e.test.ts test/gemini-e2e.test.ts",
|
||||
"test:e2e:all": "EVALS=1 EVALS_ALL=1 bun test --retry 2 --concurrent --max-concurrency ${EVALS_CONCURRENCY:-15} test/skill-e2e-*.test.ts test/skill-routing-e2e.test.ts test/codex-e2e.test.ts test/gemini-e2e.test.ts",
|
||||
"test:gate": "EVALS=1 EVALS_TIER=gate bun test --retry 2 --concurrent --max-concurrency ${EVALS_CONCURRENCY:-15} test/skill-llm-eval.test.ts test/skill-e2e-*.test.ts test/skill-routing-e2e.test.ts test/codex-e2e.test.ts test/gemini-e2e.test.ts",
|
||||
"test:periodic": "EVALS=1 EVALS_TIER=periodic EVALS_ALL=1 bun test --retry 2 --concurrent --max-concurrency ${EVALS_CONCURRENCY:-15} test/skill-e2e-*.test.ts test/skill-routing-e2e.test.ts test/codex-e2e.test.ts test/gemini-e2e.test.ts",
|
||||
"test:codex": "EVALS=1 bun test test/codex-e2e.test.ts",
|
||||
"test:codex:all": "EVALS=1 EVALS_ALL=1 bun test test/codex-e2e.test.ts",
|
||||
"test:gemini": "EVALS=1 bun test test/gemini-e2e.test.ts",
|
||||
"test:gemini:all": "EVALS=1 EVALS_ALL=1 bun test test/gemini-e2e.test.ts",
|
||||
"test:evals": "EVALS=1 GSTACK_HEADLESS=1 bun test --retry 2 --concurrent --max-concurrency ${EVALS_CONCURRENCY:-15} test/skill-llm-eval.test.ts test/skill-e2e-*.test.ts test/skill-routing-e2e.test.ts test/codex-e2e.test.ts test/gemini-e2e.test.ts",
|
||||
"test:evals:all": "EVALS=1 GSTACK_HEADLESS=1 EVALS_ALL=1 bun test --retry 2 --concurrent --max-concurrency ${EVALS_CONCURRENCY:-15} test/skill-llm-eval.test.ts test/skill-e2e-*.test.ts test/skill-routing-e2e.test.ts test/codex-e2e.test.ts test/gemini-e2e.test.ts",
|
||||
"test:e2e": "EVALS=1 GSTACK_HEADLESS=1 bun test --retry 2 --concurrent --max-concurrency ${EVALS_CONCURRENCY:-15} test/skill-e2e-*.test.ts test/skill-routing-e2e.test.ts test/codex-e2e.test.ts test/gemini-e2e.test.ts",
|
||||
"test:e2e:all": "EVALS=1 GSTACK_HEADLESS=1 EVALS_ALL=1 bun test --retry 2 --concurrent --max-concurrency ${EVALS_CONCURRENCY:-15} test/skill-e2e-*.test.ts test/skill-routing-e2e.test.ts test/codex-e2e.test.ts test/gemini-e2e.test.ts",
|
||||
"test:gate": "EVALS=1 GSTACK_HEADLESS=1 EVALS_TIER=gate bun test --retry 2 --concurrent --max-concurrency ${EVALS_CONCURRENCY:-15} test/skill-llm-eval.test.ts test/skill-e2e-*.test.ts test/skill-routing-e2e.test.ts test/codex-e2e.test.ts test/gemini-e2e.test.ts",
|
||||
"test:periodic": "EVALS=1 GSTACK_HEADLESS=1 EVALS_TIER=periodic EVALS_ALL=1 bun test --retry 2 --concurrent --max-concurrency ${EVALS_CONCURRENCY:-15} test/skill-e2e-*.test.ts test/skill-routing-e2e.test.ts test/codex-e2e.test.ts test/gemini-e2e.test.ts",
|
||||
"test:codex": "EVALS=1 GSTACK_HEADLESS=1 bun test test/codex-e2e.test.ts",
|
||||
"test:codex:all": "EVALS=1 GSTACK_HEADLESS=1 EVALS_ALL=1 bun test test/codex-e2e.test.ts",
|
||||
"test:gemini": "EVALS=1 GSTACK_HEADLESS=1 bun test test/gemini-e2e.test.ts",
|
||||
"test:gemini:all": "EVALS=1 GSTACK_HEADLESS=1 EVALS_ALL=1 bun test test/gemini-e2e.test.ts",
|
||||
"skill:check": "bun run scripts/skill-check.ts",
|
||||
"dev:skill": "bun run scripts/dev-skill.ts",
|
||||
"start": "bun run browse/src/server.ts",
|
||||
|
||||
@@ -1325,9 +1325,13 @@ if [ "$NO_TEAM_MODE" -ne 1 ] \
|
||||
&& [ -x "$PLAN_TUNE_LOG_HOOK" ] \
|
||||
&& [ -x "$PLAN_TUNE_PREF_HOOK" ]; then
|
||||
|
||||
# Already installed? Check the settings.json for our source tag.
|
||||
# Already installed? Require BOTH the plan-tune source AND the AUQ-error-fallback
|
||||
# source — so an existing install that predates the fallback hook re-runs the
|
||||
# install (which is idempotent for the plan-tune hooks) and picks up the new one.
|
||||
ALREADY_INSTALLED=0
|
||||
if "$SETTINGS_HOOK" list-sources 2>/dev/null | grep -q "plan-tune-cathedral"; then
|
||||
_HOOK_SOURCES=$("$SETTINGS_HOOK" list-sources 2>/dev/null || true)
|
||||
if printf '%s' "$_HOOK_SOURCES" | grep -q "plan-tune-cathedral" \
|
||||
&& printf '%s' "$_HOOK_SOURCES" | grep -q "auq-error-fallback"; then
|
||||
ALREADY_INSTALLED=1
|
||||
fi
|
||||
|
||||
@@ -1365,15 +1369,19 @@ if [ "$NO_TEAM_MODE" -ne 1 ] \
|
||||
--command "$PLAN_TUNE_PREF_HOOK" \
|
||||
--source plan-tune-cathedral \
|
||||
--timeout 5
|
||||
# AUQ-failure prose-fallback reliability hook (OV3:B). Fires only when an
|
||||
# AskUserQuestion call returns an error/missing result; inert on success and
|
||||
# inert if the platform doesn't invoke PostToolUse on tool errors.
|
||||
# AskUserQuestion-failure prose-fallback reliability hook (OV3:B). Fires only when
|
||||
# an AskUserQuestion call returns an error/missing result; inert on success and
|
||||
# inert if the platform doesn't invoke PostToolUse on tool errors. MUST use its
|
||||
# OWN source tag: gstack-settings-hook dedupes by (event, matcher, source) and
|
||||
# REPLACES the entry's hooks, so sharing 'plan-tune-cathedral' would overwrite the
|
||||
# question-log capture hook (same event+matcher). A distinct source = a second
|
||||
# PostToolUse entry; both run in parallel.
|
||||
if [ -x "$AUQ_ERROR_FALLBACK_HOOK" ]; then
|
||||
"$SETTINGS_HOOK" add-event \
|
||||
--event PostToolUse \
|
||||
--matcher '(AskUserQuestion|mcp__.*__AskUserQuestion)' \
|
||||
--command "$AUQ_ERROR_FALLBACK_HOOK" \
|
||||
--source plan-tune-cathedral \
|
||||
--source auq-error-fallback \
|
||||
--timeout 5
|
||||
fi
|
||||
}
|
||||
|
||||
@@ -32,20 +32,29 @@ describe('isErrorResponse — only clear failures, never a real answer', () => {
|
||||
expect(isErrorResponse('[Tool result missing due to internal error]')).toBe(true);
|
||||
});
|
||||
|
||||
test('is_error / error object shapes are failures', () => {
|
||||
test('is_error: true / error-field / sentinel-in-content are failures', () => {
|
||||
expect(isErrorResponse({ is_error: true })).toBe(true);
|
||||
expect(isErrorResponse({ isError: true })).toBe(true);
|
||||
expect(isErrorResponse({ error: 'boom' })).toBe(true);
|
||||
expect(isErrorResponse({ content: '...internal error...' })).toBe(true);
|
||||
expect(isErrorResponse({ content: 'Tool result missing due to internal error' })).toBe(true);
|
||||
});
|
||||
|
||||
test('a real answer is NOT a failure (no false trigger)', () => {
|
||||
expect(isErrorResponse({ answers: [{ option_label: 'A' }] })).toBe(false);
|
||||
expect(isErrorResponse('A')).toBe(false);
|
||||
// a choice that coincidentally contains the word "error" must not trip it
|
||||
// a choice that coincidentally contains "error" must not trip it
|
||||
expect(isErrorResponse({ answers: [{ option_label: 'Fix the error' }] })).toBe(false);
|
||||
expect(isErrorResponse('Investigate the login error')).toBe(false);
|
||||
});
|
||||
|
||||
test('Codex review: narrow detection — generic "error"/"is_error" substrings do NOT trigger', () => {
|
||||
// A real answer mentioning "internal error" must not be read as a failure.
|
||||
expect(isErrorResponse('Investigate the internal error')).toBe(false);
|
||||
// A serialized success payload containing the substring is_error:false must not trigger.
|
||||
expect(isErrorResponse('{"is_error": false, "answer": "A"}')).toBe(false);
|
||||
expect(isErrorResponse({ is_error: false })).toBe(false);
|
||||
expect(isErrorResponse({ content: 'The page had an internal error we fixed' })).toBe(false);
|
||||
});
|
||||
});
|
||||
|
||||
describe('directiveFor — per-session-kind instruction', () => {
|
||||
|
||||
@@ -300,13 +300,12 @@ export async function runAgentSdkTest(
|
||||
const queryImpl: QueryProvider = opts.queryProvider ?? query;
|
||||
const model = opts.model ?? 'claude-opus-4-7';
|
||||
|
||||
// Default GSTACK_HEADLESS=1 so SDK-driven eval/E2E runs classify as headless: an
|
||||
// AskUserQuestion failure BLOCKs instead of emitting a prose question no human can
|
||||
// answer. Set ambiently (the SDK child inherits process.env) rather than via
|
||||
// sdkOpts.env — passing an env object to the SDK breaks its auth pipeline (see
|
||||
// CLAUDE.md). A suite testing the interactive prose-fallback path sets
|
||||
// process.env.GSTACK_HEADLESS='' before calling.
|
||||
if (process.env.GSTACK_HEADLESS === undefined) process.env.GSTACK_HEADLESS = '1';
|
||||
// NOTE on GSTACK_HEADLESS: the SDK child inherits process.env, so headless
|
||||
// classification for eval/E2E runs is set by the `test:gate` / `test:evals`
|
||||
// package.json scripts (scoped to that invocation), NOT mutated here. We must not
|
||||
// pass sdkOpts.env (it breaks the SDK auth pipeline — see CLAUDE.md) and must not
|
||||
// mutate process.env ambiently (it would leak headless into later interactive-path
|
||||
// tests in the same Bun process — Codex review finding).
|
||||
|
||||
let attempt = 0;
|
||||
let lastErr: unknown = null;
|
||||
|
||||
Reference in New Issue
Block a user