dataelement · Y1fe1Zh0u · Apr 27, 2026 · May 12, 2026 · May 12, 2026
diff --git a/backend/alembic/versions/add_llm_context_window_tokens.py b/backend/alembic/versions/add_llm_context_window_tokens.py
@@ -0,0 +1,22 @@
+"""add llm context_window_tokens
+
+Revision ID: add_llm_context_window_tokens
+Revises: add_user_tenant_onboarding
+Create Date: 2026-05-12 00:00:00.000000
+"""
+
+from alembic import op
+
+
+revision = "add_llm_context_window_tokens"  # this migration's id ("Revision ID" in the docstring)
+down_revision = "add_user_tenant_onboarding"  # parent migration ("Revises" in the docstring)
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:  # add llm_models.context_window_tokens (INTEGER) only if it is missing
+    op.execute("ALTER TABLE llm_models ADD COLUMN IF NOT EXISTS context_window_tokens INTEGER")
+
+
+def downgrade() -> None:  # drop llm_models.context_window_tokens only if it is present
+    op.execute("ALTER TABLE llm_models DROP COLUMN IF EXISTS context_window_tokens")
diff --git a/backend/app/api/enterprise.py b/backend/app/api/enterprise.py
@@ -177,6 +177,7 @@ async def add_llm_model(
         enabled=data.enabled,
         supports_vision=data.supports_vision,
         max_output_tokens=data.max_output_tokens,
+        context_window_tokens=data.context_window_tokens,
         request_timeout=data.request_timeout,
         tenant_id=uuid.UUID(tid) if tid else None,
     )
@@ -321,6 +322,8 @@ async def update_llm_model(
             model.supports_vision = data.supports_vision
         if hasattr(data, 'max_output_tokens') and data.max_output_tokens is not None:
             model.max_output_tokens = data.max_output_tokens
+        if 'context_window_tokens' in data.model_fields_set:
+            model.context_window_tokens = data.context_window_tokens
         if hasattr(data, 'request_timeout') and data.request_timeout is not None:
             model.request_timeout = data.request_timeout
 

diff --git a/backend/app/api/feishu.py b/backend/app/api/feishu.py
@@ -18,6 +18,11 @@
 from app.models.identity import IdentityProvider
 from app.schemas.schemas import ChannelConfigCreate, ChannelConfigOut, TokenResponse, UserOut
 from app.services.feishu_service import feishu_service
+from app.services.history_window import (
+    token_budget_from_context_window,
+    truncate_by_message_count,
+    truncate_by_token_budget,
+)
 
 router = APIRouter(tags=["feishu"])
 
@@ -1633,9 +1638,23 @@ async def _call_agent_llm(
     messages: list[dict] = []
     from app.models.agent import DEFAULT_CONTEXT_WINDOW_SIZE
     ctx_size = agent.context_window_size or DEFAULT_CONTEXT_WINDOW_SIZE
+    user_message = {"role": "user", "content": user_text}
     if history:
-        messages.extend(_normalize_history_messages(history)[-ctx_size:])
-    messages.append({"role": "user", "content": user_text})
+        _normalized_history = _normalize_history_messages(history)
+        _conversation = [*_normalized_history, user_message]
+        _token_budget = token_budget_from_context_window(
+            getattr(model, "context_window_tokens", None)
+        )
+        if _token_budget:
+            messages.extend(
+                truncate_by_token_budget(_conversation, ctx_size, _token_budget)
+            )
+        else:
+            messages.extend(
+                truncate_by_message_count(_conversation, ctx_size)
+            )
+    else:
+        messages.append(user_message)
 
     # Use actual user_id so the system prompt knows who it's chatting with
     effective_user_id = user_id or agent_id

diff --git a/backend/app/api/websocket.py b/backend/app/api/websocket.py
@@ -19,6 +19,11 @@
 from app.models.llm import LLMModel
 from app.models.user import User
 from app.services.chat_session_service import ensure_primary_platform_session
+from app.services.history_window import (
+    token_budget_from_context_window,
+    truncate_by_message_count,
+    truncate_by_token_budget,
+)
 from app.services.llm import call_llm, call_llm_with_failover
 
 router = APIRouter(tags=["websocket"])
@@ -775,10 +780,22 @@ async def _call_with_failover():
                         async def _on_failover(reason: str):
                             await websocket.send_json({"type": "info", "content": f"Primary model error, {reason}"})
 
-                        # To prevent tool call message pairs(assistant + tool) from being broken down.
-                        _truncated = conversation[-ctx_size:]
-                        while _truncated and _truncated[0].get("role") == "tool":
-                            _truncated.pop(0)
+                        # Block-aware truncation: keep at most `ctx_size` recent messages (and
+                        # apply a token budget when the model sets `context_window_tokens`),
+                        # with assistant.tool_calls ↔ role=tool blocks kept atomic. Naive
+                        # [-ctx_size:] slicing can leave orphan tool messages at the head, which
+                        # OpenAI rejects: "No tool call found for function call output" (issue #446).
+                        _token_budget = token_budget_from_context_window(
+                            getattr(effective_llm_model, "context_window_tokens", None)
+                        )
+                        if _token_budget:
+                            _truncated = truncate_by_token_budget(
+                                conversation,
+                                ctx_size,
+                                _token_budget,
+                            )
+                        else:
+                            _truncated = truncate_by_message_count(conversation, ctx_size)
 
                         # Per-(user, agent) onboarding. With no row, prepend the
                         # greeting prompt and mark the pair as "greeted" once it

diff --git a/backend/app/models/llm.py b/backend/app/models/llm.py
@@ -28,6 +28,8 @@ class LLMModel(Base):
     temperature: Mapped[float | None] = mapped_column(Float, nullable=True)
     request_timeout: Mapped[int | None] = mapped_column(Integer, nullable=True)  # Request timeout in seconds, default 120
     max_output_tokens: Mapped[int | None] = mapped_column(Integer, nullable=True)  # Per-model output token limit override
+    # Model input context window, in tokens. When set, history truncation also applies a token budget.
+    context_window_tokens: Mapped[int | None] = mapped_column(Integer, nullable=True)
     created_at: Mapped[datetime] = mapped_column(DateTime(timezone=True), server_default=func.now())
     updated_at: Mapped[datetime] = mapped_column(
         DateTime(timezone=True), server_default=func.now(), onupdate=func.now()

diff --git a/backend/app/schemas/schemas.py b/backend/app/schemas/schemas.py
@@ -399,6 +399,7 @@ class LLMModelCreate(BaseModel):
     enabled: bool = True
     supports_vision: bool = False
     max_output_tokens: int | None = None
+    context_window_tokens: int | None = Field(None, ge=1)
     request_timeout: int | None = None
 
 class LLMModelUpdate(BaseModel):
@@ -412,6 +413,7 @@ class LLMModelUpdate(BaseModel):
     enabled: bool | None = None
     supports_vision: bool | None = None
     max_output_tokens: int | None = None
+    context_window_tokens: int | None = Field(None, ge=1)
     request_timeout: int | None = None
 
 
@@ -427,6 +429,7 @@ class LLMModelOut(BaseModel):
     enabled: bool
     supports_vision: bool = False
     max_output_tokens: int | None = None
+    context_window_tokens: int | None = None
     request_timeout: int | None = None
     created_at: datetime
 

diff --git a/backend/app/services/history_window.py b/backend/app/services/history_window.py
@@ -0,0 +1,223 @@
+"""Tool-block-safe conversation history truncation.
+
+Replaces naive ``conversation[-N:]`` slicing with a walker that keeps
+``assistant.tool_calls`` and their matching ``role="tool"`` messages as an
+atomic block — never half a pair, never orphan tool messages.
+
+Why: OpenAI Responses API and Chat Completions both reject input where a
+``function_call_output`` / ``role="tool"`` message has no matching
+``function_call`` / ``assistant.tool_calls`` earlier in the input. Naive
+``[-N:]`` slicing can leave such orphans at the head when the cut lands
+between an assistant message and its tool results. This is the failure mode
+reported in issue #446.
+
+Tool results must be in the contiguous tool-result run immediately after
+their owning assistant. A tool message inserted elsewhere (from malformed
+persistence or upstream truncation) is dropped, not folded into an adjacent
+block. This makes the helper robust against orphans at any position, not just
+at the slice head.
+
+Incomplete assistant tool-call blocks are also dropped. If an assistant
+declares multiple tool calls, every declared ``tool_call_id`` must have a
+matching ``role="tool"`` result before the next non-tool message. This mirrors
+the API contract enforced by OpenAI-compatible providers and avoids sending
+synthetic/fake tool results into weaker models' context.
+
+Input is expected to be in OpenAI chat-completion format (post-reorganization
+from DB ``role="tool_call"`` rows).
+"""
+
+from __future__ import annotations
+
+import json
+from typing import Any
+
+from app.services.token_tracker import estimate_tokens_from_chars
+
+
+TOKEN_BUDGET_CONTEXT_RATIO = 0.8  # default share of a model's context window used as the history budget
+
+
+def _assistant_tool_call_ids(message: dict[str, Any]) -> list[str]:
+    """Collect the usable tool-call ids an assistant message declares.
+
+    Returns ``[]`` for non-assistant messages and when ``tool_calls`` is not a
+    list. Entries that are not dicts, or whose ``id`` is not a non-empty
+    string, are skipped; order and duplicates are preserved.
+    """
+    if message.get("role") != "assistant":
+        return []
+    declared = message.get("tool_calls")
+    if not isinstance(declared, list):
+        return []
+    # Read each candidate id once, then keep only the non-empty strings.
+    candidate_ids = (entry.get("id") for entry in declared if isinstance(entry, dict))
+    return [call_id for call_id in candidate_ids if isinstance(call_id, str) and call_id]
+
+
+def _safe_history_blocks(messages: list[dict[str, Any]]) -> list[list[dict[str, Any]]]:
+    """Split ``messages`` into API-safe blocks, preserving original order.
+
+    A block is either one plain message or an assistant message with tool calls
+    followed by its contiguous ``role="tool"`` results, one per declared id.
+    Dropped: orphan, duplicate or non-matching tool results; tool-call blocks
+    with a missing result; and assistants whose non-empty ``tool_calls`` list
+    has an entry without a usable id, since no result can ever answer it.
+    Input messages are never mutated; kept messages are the same dict objects.
+    """
+    blocks: list[list[dict[str, Any]]] = []
+    i = 0
+    n = len(messages)
+
+    while i < n:
+        message = messages[i]
+        role = message.get("role")
+
+        if role == "tool":
+            # Orphan or delayed tool result: invalid unless it sits in the run
+            # immediately after its owning assistant, which is consumed below.
+            i += 1
+            continue
+
+        tool_call_ids = _assistant_tool_call_ids(message)
+        declared = message.get("tool_calls")
+        # A non-empty ``tool_calls`` list marks a tool caller even if ids are unusable.
+        is_tool_caller = (
+            role == "assistant" and isinstance(declared, list) and bool(declared)
+        )
+        if not is_tool_caller:
+            # Plain message (including ``tool_calls=[]``): a block of its own.
+            blocks.append([message])
+            i += 1
+            continue
+
+        pending = set(tool_call_ids)
+        block = [message]
+        j = i + 1
+        # Consume the whole contiguous tool run so rejected results cannot
+        # resurface later as orphans; keep the first match per declared id.
+        while j < n and messages[j].get("role") == "tool":
+            tool_call_id = messages[j].get("tool_call_id")
+            if isinstance(tool_call_id, str) and tool_call_id in pending:
+                pending.discard(tool_call_id)
+                block.append(messages[j])
+            j += 1
+
+        # Keep the block only when every declared call has a usable id and a
+        # result; broken blocks are dropped rather than patched with fake results.
+        if len(tool_call_ids) == len(declared) and not pending:
+            blocks.append(block)
+        i = j
+
+    return blocks
+
+
+def token_budget_from_context_window(
+    context_window_tokens: int | None,
+    ratio: float = TOKEN_BUDGET_CONTEXT_RATIO,
+) -> int | None:
+    """Derive the history token budget from a model's context window, if one is set."""
+    if context_window_tokens and context_window_tokens > 0:
+        share = min(max(ratio, 0.0), 1.0)  # clamp the fraction into [0, 1]
+        return max(int(context_window_tokens * share), 1)  # never below one token
+    return None  # missing or non-positive window: the caller gets no token budget
+
+
+def _estimate_message_tokens(message: dict[str, Any]) -> int:
+    """Estimate a message's tokens from the character length of its compact JSON form."""
+    try:
+        char_count = len(json.dumps(message, ensure_ascii=False, separators=(",", ":")))
+    except (TypeError, ValueError):
+        char_count = len(str(message))  # not JSON-serializable: measure its str() form
+    return estimate_tokens_from_chars(char_count)
+
+
+def _estimate_block_tokens(block: list[dict[str, Any]]) -> int:  # sum of per-message estimates
+    return sum(map(_estimate_message_tokens, block))
+
+
+def truncate_by_message_count(
+    messages: list[dict[str, Any]],
+    max_messages: int,
+) -> list[dict[str, Any]]:
+    """Return the newest API-safe blocks that fit within ``max_messages``.
+
+    The conversation is first split into atomic blocks:
+      - one message that is neither a tool result nor a tool-calling
+        assistant (user / system / plain assistant text), or
+      - an ``assistant`` carrying ``tool_calls`` together with its contiguous
+        matching ``role="tool"`` results.
+
+    Orphan tool results and incomplete tool-call blocks never reach the
+    output, whatever the budget; either shape triggers the #446 class of
+    provider errors. Blocks are then taken newest-first and selection stops
+    at the first block that does not fit, so older history is never used to
+    fill a gap and a block is never split.
+
+    Args:
+        messages: Conversation in OpenAI chat format; may be empty.
+        max_messages: Upper bound on the number of returned messages.
+            Values ``<= 0`` return ``[]``.
+
+    Returns:
+        A new list (the input is not mutated) holding at most
+        ``max_messages`` messages from the tail of ``messages``, in their
+        original order.
+    """
+    if max_messages <= 0 or not messages:
+        return []
+
+    kept_newest_first: list[list[dict[str, Any]]] = []
+    remaining = max_messages
+    for block in reversed(_safe_history_blocks(messages)):
+        if len(block) > remaining:
+            # Too big for what is left: stop rather than split the block.
+            break
+        kept_newest_first.append(block)
+        remaining -= len(block)
+
+    return [message for block in reversed(kept_newest_first) for message in block]
+
+
+def truncate_by_token_budget(
+    messages: list[dict[str, Any]],
+    max_messages: int,
+    max_tokens: int,
+    *,
+    keep_latest_block: bool = True,
+) -> list[dict[str, Any]]:
+    """Return the newest API-safe blocks that fit both budgets.
+
+    Blocks are taken newest-first while they fit the remaining message count
+    and the remaining estimated-token budget; selection stops at the first
+    block that exceeds either one. This is a drop-only fallback, not summary
+    compaction: older blocks are discarded, never summarized, and tool-call
+    blocks stay atomic.
+
+    Args:
+        messages: Conversation in OpenAI chat format; may be empty.
+        max_messages: Cap on returned messages; ``<= 0`` returns ``[]``.
+        max_tokens: Cap on estimated tokens; ``<= 0`` returns ``[]``.
+        keep_latest_block: When true (the default) the newest block is kept
+            even if it alone exceeds a budget, so the current turn survives.
+
+    Returns:
+        A new list of the selected messages in their original order.
+    """
+    if max_messages <= 0 or max_tokens <= 0 or not messages:
+        return []
+
+    kept_newest_first: list[list[dict[str, Any]]] = []
+    messages_left, tokens_left = max_messages, max_tokens
+    for position, block in enumerate(reversed(_safe_history_blocks(messages))):
+        count, cost = len(block), _estimate_block_tokens(block)
+        fits = count <= messages_left and cost <= tokens_left
+        # Only the newest block may overshoot; an exhausted budget is floored at 0.
+        forced = keep_latest_block and position == 0
+        if not (fits or forced):
+            break
+        kept_newest_first.append(block)
+        messages_left = max(messages_left - count, 0)
+        tokens_left = max(tokens_left - cost, 0)
+
+    return [message for block in reversed(kept_newest_first) for message in block]