fix(eval): handle SDK max-turns throw gracefully

Some @anthropic-ai/claude-agent-sdk versions throw from the query
generator when maxTurns is reached, instead of emitting a result
message with subtype='error_max_turns'. The runner treated that as
a non-retryable error and killed the whole periodic run on the first
fixture that exceeded its turn cap.

Added isMaxTurnsError() detector and a catch branch that synthesizes
an AgentSdkResult from events captured before the throw, with
exitReason='error_max_turns' and costUsd=0 (unknown from the thrown
path). The metric function still runs against whatever assistant
turns were collected, so the trial produces a usable number.

Hoisted events/assistantTurns/toolCalls/assistantTextParts and the
timing counters out of the inner try so the catch branch can read
them. No behavior change on the success path or on rate-limit retry
paths.
This commit is contained in:
Garry Tan
2026-04-23 11:34:51 -07:00
parent 04e2f1bea9
commit 5294c65777
+55 -11
View File
@@ -209,6 +209,19 @@ export function isRateLimitEvent(msg: SDKMessage): boolean {
return info?.status === 'rejected';
}
/**
 * True if `err` is the SDK's "max turns reached" throw. Some SDK versions
 * raise this as an exception from the generator instead of emitting a
 * result message with subtype='error_max_turns'. We treat it as terminal-
 * but-recoverable: record what we collected and continue, rather than
 * failing the whole run.
 *
 * Detection is purely textual: the error text is matched case-insensitively
 * against "reached maximum number of turns" or any "max turns" spelling
 * with at most one separator character ("maxTurns", "max_turns",
 * "max-turns", "max turns").
 *
 * @param err - The caught value. Anything can be thrown in JavaScript, so
 *   both Error-like objects (text read from `.message`) and bare thrown
 *   strings are inspected.
 * @returns `true` when the error text identifies a max-turns condition;
 *   `false` for null/undefined, other primitives, objects without a string
 *   `message`, and unrelated messages.
 */
export function isMaxTurnsError(err: unknown): boolean {
  let text: unknown = undefined;
  if (typeof err === 'string') {
    // A bare thrown string carries the message directly.
    text = err;
  } else if (typeof err === 'object' && err !== null) {
    text = (err as { message?: unknown }).message;
  }
  // Only match real strings: avoids implicit coercion of odd `message`
  // values (and a TypeError if `message` were a Symbol).
  if (typeof text !== 'string') return false;
  return /reached maximum number of turns|max.?turns/i.test(text);
}
// ---------------------------------------------------------------------------
// Version resolution (cached)
// ---------------------------------------------------------------------------
@@ -259,6 +272,20 @@ export async function runAgentSdkTest(
while (attempt <= maxRetries) {
await sem.acquire();
const startMs = Date.now();
// Hoisted so the max-turns catch branch can synthesize a result from
// whatever we captured before the SDK threw.
const events: SDKMessage[] = [];
const assistantTurns: SDKAssistantMessage[] = [];
const toolCalls: Array<{ tool: string; input: unknown; output: string }> = [];
const assistantTextParts: string[] = [];
let firstResponseMs = 0;
let lastEventMs = startMs;
let maxInterTurnMs = 0;
let systemInitVersion = 'unknown';
let rateLimited: unknown = null;
let terminalResult: SDKResultMessage | null = null;
try {
const sdkOpts: Options = {
model,
@@ -280,17 +307,6 @@ export async function runAgentSdkTest(
sdkOpts.systemPrompt = opts.systemPrompt;
}
const events: SDKMessage[] = [];
const assistantTurns: SDKAssistantMessage[] = [];
const toolCalls: Array<{ tool: string; input: unknown; output: string }> = [];
const assistantTextParts: string[] = [];
let firstResponseMs = 0;
let lastEventMs = startMs;
let maxInterTurnMs = 0;
let systemInitVersion = 'unknown';
let rateLimited: unknown = null;
let terminalResult: SDKResultMessage | null = null;
const q = queryImpl({
prompt: opts.userPrompt,
options: sdkOpts,
@@ -382,6 +398,34 @@ export async function runAgentSdkTest(
};
} catch (err) {
lastErr = err;
// "Max turns reached" is the SDK's way of saying "this session ran
// out of turns." It's thrown from the generator instead of emitted
// as a result message. Treat as a successful-but-capped trial: the
// assistant turns we collected are real and carry a metric. Record
// them with exitReason='error_max_turns' rather than failing the
// whole run.
if (isMaxTurnsError(err)) {
const durationMs = Date.now() - startMs;
return {
events,
assistantTurns,
toolCalls,
output: assistantTextParts.join('\n'),
exitReason: 'error_max_turns',
turnsUsed: assistantTurns.length,
durationMs,
firstResponseMs,
maxInterTurnMs,
costUsd: 0, // unknown from thrown-error path
model,
sdkVersion: resolveSdkVersion(),
sdkClaudeCodeVersion: systemInitVersion,
resolvedBinaryPath: opts.pathToClaudeCodeExecutable ?? 'sdk-default',
browseErrors: [],
};
}
const isRetryable = isRateLimitThrown(err);
if (!isRetryable || attempt >= maxRetries) {
if (isRetryable) {