diff --git a/internal/einomcp/mcp_tools.go b/internal/einomcp/mcp_tools.go index 9fcd2b7a..2718f18d 100644 --- a/internal/einomcp/mcp_tools.go +++ b/internal/einomcp/mcp_tools.go @@ -108,7 +108,13 @@ func runMCPToolInvocation( var args map[string]interface{} if argumentsInJSON != "" && argumentsInJSON != "null" { if err := json.Unmarshal([]byte(argumentsInJSON), &args); err != nil { - return "", fmt.Errorf("invalid tool arguments JSON: %w", err) + // Return soft error (nil error) so the eino graph continues and the LLM can self-correct, + // instead of a hard error that terminates the iteration loop. + return ToolErrorPrefix + fmt.Sprintf( + "Invalid tool arguments JSON: %s\n\nPlease ensure the arguments are a valid JSON object "+ + "(double-quoted keys, matched braces, no trailing commas) and retry.\n\n"+ + "(工具参数 JSON 解析失败:%s。请确保 arguments 是合法的 JSON 对象并重试。)", + err.Error(), err.Error()), nil } } if args == nil { diff --git a/internal/multiagent/runner.go b/internal/multiagent/runner.go index dc45e20a..3967e3d3 100644 --- a/internal/multiagent/runner.go +++ b/internal/multiagent/runner.go @@ -302,34 +302,20 @@ func RunDeepAgent( var lastRunMsgs []adk.Message var lastAssistant string + // retryHints tracks the corrective hint to append for each retry attempt. + // Index i corresponds to the hint that will be appended on attempt i+1. + var retryHints []adk.Message + attemptLoop: - for attempt := 0; attempt < maxToolCallArgumentsJSONAttempts; attempt++ { - msgs := make([]adk.Message, 0, len(baseMsgs)+attempt) + for attempt := 0; attempt < maxToolCallRecoveryAttempts; attempt++ { + msgs := make([]adk.Message, 0, len(baseMsgs)+len(retryHints)) msgs = append(msgs, baseMsgs...) - for i := 0; i < attempt; i++ { - msgs = append(msgs, toolCallArgumentsJSONRetryHint()) - } + msgs = append(msgs, retryHints...) 
if attempt > 0 { mcpIDsMu.Lock() mcpIDs = mcpIDs[:0] mcpIDsMu.Unlock() - if logger != nil { - logger.Warn("eino DeepAgent: 工具参数 JSON 被接口拒绝,追加提示后重试", - zap.Int("attempt", attempt), - zap.Int("maxAttempts", maxToolCallArgumentsJSONAttempts)) - } - if progress != nil { - // 使用专用事件类型 eino_recovery,便于前端时间线展示(progress 仅改标题,不进时间线) - progress("eino_recovery", toolCallArgumentsJSONRecoveryTimelineMessage(attempt), map[string]interface{}{ - "conversationId": conversationID, - "source": "eino", - "einoRetry": attempt, - "runIndex": attempt + 1, // 第几轮完整运行(1 为首次,重试后递增) - "maxRuns": maxToolCallArgumentsJSONAttempts, - "reason": "invalid_tool_arguments_json", - }) - } } // 仅保留主代理最后一次 assistant 输出;每轮重试重置,避免拼接失败轮次的片段。 @@ -357,12 +343,48 @@ attemptLoop: continue } if ev.Err != nil { - if isRecoverableToolCallArgumentsJSONError(ev.Err) && attempt+1 < maxToolCallArgumentsJSONAttempts { + canRetry := attempt+1 < maxToolCallRecoveryAttempts + + // Recoverable: API-level JSON argument validation error. + if canRetry && isRecoverableToolCallArgumentsJSONError(ev.Err) { if logger != nil { logger.Warn("eino: recoverable tool-call JSON error from model/API", zap.Error(ev.Err), zap.Int("attempt", attempt)) } + retryHints = append(retryHints, toolCallArgumentsJSONRetryHint()) + if progress != nil { + progress("eino_recovery", toolCallArgumentsJSONRecoveryTimelineMessage(attempt), map[string]interface{}{ + "conversationId": conversationID, + "source": "eino", + "einoRetry": attempt, + "runIndex": attempt + 1, + "maxRuns": maxToolCallRecoveryAttempts, + "reason": "invalid_tool_arguments_json", + }) + } continue attemptLoop } + + // Recoverable: tool execution error (unknown sub-agent, tool not found, bad JSON in args, etc.). 
+ if canRetry && isRecoverableToolExecutionError(ev.Err) { + if logger != nil { + logger.Warn("eino: recoverable tool execution error, will retry with corrective hint", + zap.Error(ev.Err), zap.Int("attempt", attempt)) + } + retryHints = append(retryHints, toolExecutionRetryHint()) + if progress != nil { + progress("eino_recovery", toolExecutionRecoveryTimelineMessage(attempt), map[string]interface{}{ + "conversationId": conversationID, + "source": "eino", + "einoRetry": attempt, + "runIndex": attempt + 1, + "maxRuns": maxToolCallRecoveryAttempts, + "reason": "tool_execution_error", + }) + } + continue attemptLoop + } + + // Non-recoverable error. if progress != nil { progress("error", ev.Err.Error(), map[string]interface{}{ "conversationId": conversationID, diff --git a/internal/multiagent/tool_args_json_retry.go b/internal/multiagent/tool_args_json_retry.go index 119797e6..9f97a0f0 100644 --- a/internal/multiagent/tool_args_json_retry.go +++ b/internal/multiagent/tool_args_json_retry.go @@ -7,9 +7,10 @@ import ( "github.com/cloudwego/eino/schema" ) -// maxToolCallArgumentsJSONAttempts 含首次运行:首次 + 自动重试次数。 +// maxToolCallRecoveryAttempts 含首次运行:首次 + 自动重试次数。 // 例如为 3 表示最多共 3 次完整 DeepAgent 运行(2 次失败后各追加一条纠错提示)。 -const maxToolCallArgumentsJSONAttempts = 3 +// 该常量同时用于 JSON 参数错误和工具执行错误(如子代理名称不存在)的恢复重试。 +const maxToolCallRecoveryAttempts = 3 // toolCallArgumentsJSONRetryHint 追加在用户消息后,提示模型输出合法 JSON 工具参数(部分云厂商会在流式阶段校验 arguments)。 func toolCallArgumentsJSONRetryHint() *schema.Message { @@ -24,7 +25,7 @@ func toolCallArgumentsJSONRecoveryTimelineMessage(attempt int) string { "接口拒绝了无效的工具参数 JSON。已向对话追加系统提示并要求模型重新生成合法的 function.arguments。"+ "当前为第 %d/%d 轮完整运行。\n\n"+ "The API rejected invalid JSON in tool arguments. A system hint was appended. 
This is full run %d of %d.",
-		attempt+1, maxToolCallArgumentsJSONAttempts, attempt+1, maxToolCallArgumentsJSONAttempts,
+		attempt+1, maxToolCallRecoveryAttempts, attempt+1, maxToolCallRecoveryAttempts,
 	)
 }
 
diff --git a/internal/multiagent/tool_execution_retry.go b/internal/multiagent/tool_execution_retry.go
new file mode 100644
index 00000000..c79f8a66
--- /dev/null
+++ b/internal/multiagent/tool_execution_retry.go
@@ -0,0 +1,91 @@
+package multiagent
+
+import (
+	"fmt"
+	"strings"
+
+	"github.com/cloudwego/eino/schema"
+)
+
+// isRecoverableToolExecutionError reports whether err looks like a tool-call mistake
+// made by the LLM that a retry with a corrective hint can fix: an unknown sub-agent
+// type, an unregistered tool name, or tool arguments that are not valid JSON.
+//
+// Classification is a case-insensitive substring match on err.Error(), so causes
+// wrapped by an outer error are seen as well. A nil err is never recoverable.
+//
+// NOTE(review): the matched phrases are assumed to mirror the messages emitted by
+// eino's deep/task_tool.go and compose/tool_node.go; confirm them against the
+// pinned eino version when upgrading.
+func isRecoverableToolExecutionError(err error) bool {
+	if err == nil {
+		return false
+	}
+	s := strings.ToLower(err.Error())
+
+	switch {
+	case strings.Contains(s, "subagent type") && strings.Contains(s, "not found"):
+		// Sub-agent type not found (from deep/task_tool.go).
+		return true
+	case toolNotFoundInOneLayer(s):
+		// Tool not found in toolsNode indexes (from compose/tool_node.go, when
+		// UnknownToolsHandler is nil).
+		return true
+	case strings.Contains(s, "invalid tool arguments json"):
+		// Invalid tool arguments JSON (from einomcp/mcp_tools.go or eino internals).
+		return true
+	case strings.Contains(s, "failed to unmarshal") && strings.Contains(s, "json"):
+		// Failed to unmarshal task tool input JSON (from deep/task_tool.go).
+		return true
+	}
+
+	// A wrapped stream/invoke failure is recoverable only when it points at argument
+	// decoding. A bare "not found" inside such a wrapper is deliberately not enough:
+	// it may be a runtime failure of the tool itself (e.g. "command not found" or an
+	// HTTP 404), which the name/JSON hint cannot fix.
+	wrapped := strings.Contains(s, "failed to stream tool call") ||
+		strings.Contains(s, "failed to invoke tool")
+	return wrapped && (strings.Contains(s, "json") || strings.Contains(s, "unmarshal"))
+}
+
+// toolNotFoundInOneLayer reports whether any single layer of a lower-cased error
+// chain (layers are separated by ": ", the conventional wrap separator) contains
+// both "tool" and "not found". Requiring both words in the same layer stops an
+// unrelated "not found" raised while a tool was running from being read as an
+// unknown tool name.
+func toolNotFoundInOneLayer(s string) bool {
+	for _, layer := range strings.Split(s, ": ") {
+		if strings.Contains(layer, "tool") && strings.Contains(layer, "not found") {
+			return true
+		}
+	}
+	return false
+}
+
+// toolExecutionRetryHint returns a user message appended to the conversation to prompt
+// the LLM to correct its tool call after a tool execution error.
+func toolExecutionRetryHint() *schema.Message {
+	return schema.UserMessage(`[System] Your previous tool call failed because:
+- The tool or sub-agent name you used does not exist, OR
+- The tool call arguments were not valid JSON.
+
+Please carefully review the available tools and sub-agents listed in your context, use only exact registered names (case-sensitive), and ensure all arguments are well-formed JSON objects. Then retry your action.
+
+[系统提示] 上一次工具调用失败,可能原因:
+- 你使用的工具名或子代理名称不存在;
+- 工具调用参数不是合法 JSON。
+
+请仔细检查上下文中列出的可用工具和子代理名称(须完全匹配、区分大小写),确保所有参数均为合法的 JSON 对象,然后重新执行。`)
+}
+
+// toolExecutionRecoveryTimelineMessage returns a message for the eino_recovery event
+// displayed in the UI timeline when a tool execution error triggers a retry.
+func toolExecutionRecoveryTimelineMessage(attempt int) string {
+	return fmt.Sprintf(
+		"工具调用执行失败(工具/子代理名称不存在或参数 JSON 无效)。已向对话追加纠错提示并要求模型重新生成。"+
+		"当前为第 %d/%d 轮完整运行。\n\n"+
+		"Tool call execution failed (unknown tool/sub-agent name or invalid JSON arguments). "+
+		"A corrective hint was appended. This is full run %d of %d.",
+		attempt+1, maxToolCallRecoveryAttempts, attempt+1, maxToolCallRecoveryAttempts,
+	)
+}