From a81da85aa120982efb8da1018328b18e99f640b9 Mon Sep 17 00:00:00 2001 From: xiaosu Date: Wed, 13 May 2026 15:22:40 +0800 Subject: [PATCH 1/3] fix: replace RuntimeError with cancel_tool to prevent memory corruption on tool limit When tool calls reach the 20-call limit, raising RuntimeError in AfterToolCallEvent breaks the message history, leaving toolUse blocks without matching toolResult blocks. When AgentCore Memory restores this corrupted history in subsequent requests, Bedrock's ConverseStream API rejects it with ValidationException. Fix: Use BeforeToolCallEvent with event.cancel_tool instead. This cancels the tool gracefully by returning an error message to the model, which then responds using already-gathered information. The conversation history remains consistent and Memory can safely restore it. --- main.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/main.py b/main.py index 8d09e8d..4a21045 100644 --- a/main.py +++ b/main.py @@ -176,15 +176,18 @@ async def invoke(payload): # Limit tool calls to prevent infinite loops tool_call_count = {"n": 0} - from strands.hooks import AfterToolCallEvent + from strands.hooks import BeforeToolCallEvent - def check_tool_limit(event: AfterToolCallEvent): + def check_tool_limit(event: BeforeToolCallEvent): tool_call_count["n"] += 1 - if tool_call_count["n"] >= 20: + if tool_call_count["n"] > 20: logger.warning(f"⚠️ Tool call limit reached (20)") - raise RuntimeError("工具调用次数超过上限(20次),已强制停止。请简化问题后重试。") + event.cancel_tool = ( + "工具调用次数已超过上限(20次)。" + "DO NOT CALL ANY MORE TOOLS. 请直接根据已有信息回答用户。" + ) - agent.hooks.add_callback(AfterToolCallEvent, check_tool_limit) + agent.hooks.add_callback(BeforeToolCallEvent, check_tool_limit) healthy_status.value = "HealthyBusy" logger.info(f"🚀 Agent job starts | actor={actor_id} session={session_id}") From c085cc4e7103e3cc9f290a6235d1b4885946da0f Mon Sep 17 00:00:00 2001 From: xiaosu Date: Fri, 15 May 2026 10:15:54 +0800 Subject: [PATCH 2/3] fix: add message history validation to handle orphaned toolUse from Memory When MCP tool calls are interrupted (timeout, network error), Memory saves incomplete history with toolUse but no toolResult. On restoration, Strands SDK's repair logic can add incorrect toolResult counts (strands-agents/sdk-python#2296), causing Bedrock API rejection. Add fix_message_history() that validates toolUse/toolResult pairing before each invocation and corrects any mismatches. --- main.py | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/main.py b/main.py index 4a21045..f82ed1f 100644 --- a/main.py +++ b/main.py @@ -189,6 +189,41 @@ def check_tool_limit(event: BeforeToolCallEvent): agent.hooks.add_callback(BeforeToolCallEvent, check_tool_limit) + # Fix corrupted message history from Memory restoration + # Workaround for https://github.com/strands-agents/sdk-python/issues/2296 + def fix_message_history(agent): + """Validate and fix toolUse/toolResult pairing in restored history.""" + messages = getattr(agent, 'messages', None) + if not messages or len(messages) < 2: + return + for i, msg in enumerate(messages): + content = msg.get("content", []) + if msg.get("role") == "assistant" and i + 1 < len(messages): + tool_use_ids = [b["toolUse"]["toolUseId"] for b in content if "toolUse" in b] + if not tool_use_ids: + continue + next_msg = messages[i + 1] + if next_msg.get("role") != "user": + continue + next_content = next_msg.get("content", []) + tool_results = [b for b in next_content if "toolResult" in b] + if len(tool_results) != len(tool_use_ids): + logger.warning(f"⚠️ Fixing toolUse/toolResult mismatch at msg {i}: " + f"{len(tool_use_ids)} toolUse vs {len(tool_results)} toolResult") + non_tool = [b for b in next_content if "toolResult" not in b] + fixed_results = [] + for tid in tool_use_ids: + existing = next((b for b in tool_results + if b.get("toolResult", {}).get("toolUseId") == tid), None) + if existing: + fixed_results.append(existing) + else: + fixed_results.append({"toolResult": {"toolUseId": tid, + "content": [{"text": "Tool execution was interrupted."}], "status": "error"}}) + messages[i + 1]["content"] = non_tool + fixed_results + + fix_message_history(agent) + healthy_status.value = "HealthyBusy" logger.info(f"🚀 Agent job starts | actor={actor_id} session={session_id}") From 3d9cfe94575625ae6e184ba3eb03819fafef414b Mon Sep 17 00:00:00 2001 From: xiaosu Date: Fri, 15 May 2026 10:22:31 +0800 Subject: [PATCH 3/3] fix: move message history fix to BeforeModelCallEvent hook The previous fix_message_history() ran after Agent creation but before invoke_async(). However, session_manager restores history inside invoke_async(), so the fix ran too early. Move to BeforeModelCallEvent hook which fires right before each model call, after history restoration and SDK's own (buggy) repair logic. This ensures messages are always valid when sent to Bedrock. --- main.py | 67 ++++++++++++++++++++++++++++----------------------------- 1 file changed, 33 insertions(+), 34 deletions(-) diff --git a/main.py b/main.py index f82ed1f..e3fb980 100644 --- a/main.py +++ b/main.py @@ -176,7 +176,7 @@ async def invoke(payload): # Limit tool calls to prevent infinite loops tool_call_count = {"n": 0} - from strands.hooks import BeforeToolCallEvent + from strands.hooks import BeforeToolCallEvent, BeforeModelCallEvent def check_tool_limit(event: BeforeToolCallEvent): tool_call_count["n"] += 1 @@ -187,42 +187,41 @@ def check_tool_limit(event: BeforeToolCallEvent): "DO NOT CALL ANY MORE TOOLS. 请直接根据已有信息回答用户。" ) - agent.hooks.add_callback(BeforeToolCallEvent, check_tool_limit) + def fix_messages_before_model(event: BeforeModelCallEvent): + """Fix toolUse/toolResult mismatch right before model call. - # Fix corrupted message history from Memory restoration - # Workaround for https://github.com/strands-agents/sdk-python/issues/2296 - def fix_message_history(agent): - """Validate and fix toolUse/toolResult pairing in restored history.""" - messages = getattr(agent, 'messages', None) + Workaround for https://github.com/strands-agents/sdk-python/issues/2296 + """ + messages = agent.messages if not messages or len(messages) < 2: return - for i, msg in enumerate(messages): - content = msg.get("content", []) - if msg.get("role") == "assistant" and i + 1 < len(messages): - tool_use_ids = [b["toolUse"]["toolUseId"] for b in content if "toolUse" in b] - if not tool_use_ids: - continue - next_msg = messages[i + 1] - if next_msg.get("role") != "user": - continue - next_content = next_msg.get("content", []) - tool_results = [b for b in next_content if "toolResult" in b] - if len(tool_results) != len(tool_use_ids): - logger.warning(f"⚠️ Fixing toolUse/toolResult mismatch at msg {i}: " - f"{len(tool_use_ids)} toolUse vs {len(tool_results)} toolResult") - non_tool = [b for b in next_content if "toolResult" not in b] - fixed_results = [] - for tid in tool_use_ids: - existing = next((b for b in tool_results - if b.get("toolResult", {}).get("toolUseId") == tid), None) - if existing: - fixed_results.append(existing) - else: - fixed_results.append({"toolResult": {"toolUseId": tid, - "content": [{"text": "Tool execution was interrupted."}], "status": "error"}}) - messages[i + 1]["content"] = non_tool + fixed_results - - fix_message_history(agent) + for i in range(len(messages) - 1): + msg = messages[i] + if msg.get("role") != "assistant": + continue + tool_use_ids = [b["toolUse"]["toolUseId"] for b in msg.get("content", []) if "toolUse" in b] + if not tool_use_ids: + continue + next_msg = messages[i + 1] + if next_msg.get("role") != "user": + continue + next_content = next_msg.get("content", []) + tool_results = [b for b in next_content if "toolResult" in b] + if len(tool_results) == len(tool_use_ids): + continue + logger.warning(f"⚠️ Fixing toolUse/toolResult mismatch at msg {i}: " + f"{len(tool_use_ids)} toolUse vs {len(tool_results)} toolResult") + non_tool = [b for b in next_content if "toolResult" not in b] + fixed_results = [] + for tid in tool_use_ids: + existing = next((b for b in tool_results + if b.get("toolResult", {}).get("toolUseId") == tid), None) + fixed_results.append(existing if existing else {"toolResult": {"toolUseId": tid, + "content": [{"text": "Tool execution was interrupted."}], "status": "error"}}) + messages[i + 1]["content"] = non_tool + fixed_results + + agent.hooks.add_callback(BeforeToolCallEvent, check_tool_limit) + agent.hooks.add_callback(BeforeModelCallEvent, fix_messages_before_model) healthy_status.value = "HealthyBusy" logger.info(f"🚀 Agent job starts | actor={actor_id} session={session_id}")