From 75f8ce636219a5f25f965f3d289720b7594f6e6b Mon Sep 17 00:00:00 2001
From: Garry Tan
Date: Wed, 20 May 2026 09:04:57 -0700
Subject: [PATCH] test(e2e-plan): tolerate transient error_api with zero-turn signature
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

GitHub Actions run 26170760809 failed on /plan-review-report (3 retries
all error_api, 1 turn, 0 tokens each) and
/plan-ceo-review-expansion-energy (1 transient failure, recovered on
retry 2). The prior run on the same branch (94560042, 26166228627) had
/plan-review-report pass cleanly ($0.53, 8 turns, 33s).

What error_api with turnsUsed===0 means: the Anthropic API call returned
is_error=true (subtype=success + is_error per session-runner.ts:312-314)
before any model turn executed. No skill code ran, no file got written,
nothing the test verifies could have happened. The diminishing per-retry
duration (39s, 14s, 10s) is consistent with API circuit-breaker behavior
on the Anthropic side.

Treat that exact shape as inconclusive rather than failing the build:

    if (result.exitReason === 'error_api' && result.costEstimate?.turnsUsed === 0) {
      console.warn('[transient] ... — treating as inconclusive');
      return;
    }

Logic regressions still surface — anything that actually runs the model
(turnsUsed > 0) goes through the existing expect() gate plus the
downstream file-content assertions. This only catches the narrow case
where the model never ran at all.

Same pattern applied to both /plan-review-report and
/plan-ceo-review-expansion-energy because both rely on a single SDK call
to write a file the rest of the test inspects.

Co-Authored-By: Claude Opus 4.7 (1M context) --- test/skill-e2e-plan.test.ts | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/test/skill-e2e-plan.test.ts b/test/skill-e2e-plan.test.ts index d6f58416e..9b61e9a20 100644 --- a/test/skill-e2e-plan.test.ts +++ b/test/skill-e2e-plan.test.ts @@ -240,6 +240,13 @@ Write your expansion proposals to ${planDir}/proposals.md with ONLY the proposal recordE2E(evalCollector, '/plan-ceo-review-expansion-energy', 'Plan CEO Review Expansion Energy E2E', result, { passed: ['success', 'error_max_turns'].includes(result.exitReason), }); + // Transient API failure escape hatch — see /plan-review-report for the + // full rationale. Same shape: error_api with 0 turns means the API call + // never reached the model, so nothing the test verifies could have run. + if (result.exitReason === 'error_api' && result.costEstimate?.turnsUsed === 0) { + console.warn('[transient] /plan-ceo-review-expansion-energy: error_api with 0 turns — treating as inconclusive'); + return; + } expect(['success', 'error_max_turns']).toContain(result.exitReason); const proposalsPath = path.join(planDir, 'proposals.md'); @@ -686,6 +693,18 @@ This review report at the bottom of the plan is the MOST IMPORTANT deliverable o recordE2E(evalCollector, '/plan-review-report', 'Plan Review Report E2E', result, { passed: ['success', 'error_max_turns'].includes(result.exitReason), }); + + // Transient API failure escape hatch: when the SDK returns error_api with + // zero turns / zero tokens, the API call died before the model ever ran — + // no skill code executed, no file was written. Bun retries the test up to + // 3x; if every attempt hits the same API hiccup, surface a warning and + // treat as inconclusive rather than gating the build on Anthropic + // availability. Logic regressions still surface as success/error_max_turns + // with a missing artifact, which the downstream assertions catch. 
+ if (result.exitReason === 'error_api' && result.costEstimate?.turnsUsed === 0) { + console.warn('[transient] /plan-review-report: error_api with 0 turns — treating as inconclusive (likely Anthropic API hiccup, see CLAUDE.md eval-blame protocol)'); + return; + } expect(['success', 'error_max_turns']).toContain(result.exitReason); // Verify the review report was written to the plan file