Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 22 additions & 0 deletions backend/alembic/versions/add_llm_context_window_tokens.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
"""add llm context_window_tokens

Revision ID: add_llm_context_window_tokens
Revises: add_user_tenant_onboarding
Create Date: 2026-05-12 00:00:00.000000
"""

from alembic import op


revision = "add_llm_context_window_tokens"
down_revision = "add_user_tenant_onboarding"
branch_labels = None
depends_on = None


def upgrade() -> None:
    """Add the ``context_window_tokens`` INTEGER column to ``llm_models``.

    The statement uses ``IF NOT EXISTS`` so it is a no-op when the column is
    already present.
    """
    statement = "ALTER TABLE llm_models ADD COLUMN IF NOT EXISTS context_window_tokens INTEGER"
    op.execute(statement)


def downgrade() -> None:
    """Remove the ``context_window_tokens`` column from ``llm_models``.

    The statement uses ``IF EXISTS`` so it is a no-op when the column is
    already absent.
    """
    statement = "ALTER TABLE llm_models DROP COLUMN IF EXISTS context_window_tokens"
    op.execute(statement)
3 changes: 3 additions & 0 deletions backend/app/api/enterprise.py
Original file line number Diff line number Diff line change
Expand Up @@ -177,6 +177,7 @@ async def add_llm_model(
enabled=data.enabled,
supports_vision=data.supports_vision,
max_output_tokens=data.max_output_tokens,
context_window_tokens=data.context_window_tokens,
request_timeout=data.request_timeout,
tenant_id=uuid.UUID(tid) if tid else None,
)
Expand Down Expand Up @@ -321,6 +322,8 @@ async def update_llm_model(
model.supports_vision = data.supports_vision
if hasattr(data, 'max_output_tokens') and data.max_output_tokens is not None:
model.max_output_tokens = data.max_output_tokens
if 'context_window_tokens' in data.model_fields_set:
model.context_window_tokens = data.context_window_tokens
if hasattr(data, 'request_timeout') and data.request_timeout is not None:
model.request_timeout = data.request_timeout

Expand Down
23 changes: 21 additions & 2 deletions backend/app/api/feishu.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,11 @@
from app.models.identity import IdentityProvider
from app.schemas.schemas import ChannelConfigCreate, ChannelConfigOut, TokenResponse, UserOut
from app.services.feishu_service import feishu_service
from app.services.history_window import (
token_budget_from_context_window,
truncate_by_message_count,
truncate_by_token_budget,
)

router = APIRouter(tags=["feishu"])

Expand Down Expand Up @@ -1633,9 +1638,23 @@ async def _call_agent_llm(
messages: list[dict] = []
from app.models.agent import DEFAULT_CONTEXT_WINDOW_SIZE
ctx_size = agent.context_window_size or DEFAULT_CONTEXT_WINDOW_SIZE
user_message = {"role": "user", "content": user_text}
if history:
messages.extend(_normalize_history_messages(history)[-ctx_size:])
messages.append({"role": "user", "content": user_text})
_normalized_history = _normalize_history_messages(history)
_conversation = [*_normalized_history, user_message]
_token_budget = token_budget_from_context_window(
getattr(model, "context_window_tokens", None)
)
if _token_budget:
messages.extend(
truncate_by_token_budget(_conversation, ctx_size, _token_budget)
)
else:
messages.extend(
truncate_by_message_count(_conversation, ctx_size)
)
else:
messages.append(user_message)

# Use actual user_id so the system prompt knows who it's chatting with
effective_user_id = user_id or agent_id
Expand Down
25 changes: 21 additions & 4 deletions backend/app/api/websocket.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,11 @@
from app.models.llm import LLMModel
from app.models.user import User
from app.services.chat_session_service import ensure_primary_platform_session
from app.services.history_window import (
token_budget_from_context_window,
truncate_by_message_count,
truncate_by_token_budget,
)
from app.services.llm import call_llm, call_llm_with_failover

router = APIRouter(tags=["websocket"])
Expand Down Expand Up @@ -775,10 +780,22 @@ async def _call_with_failover():
async def _on_failover(reason: str):
await websocket.send_json({"type": "info", "content": f"Primary model error, {reason}"})

# To prevent tool call message pairs(assistant + tool) from being broken down.
_truncated = conversation[-ctx_size:]
while _truncated and _truncated[0].get("role") == "tool":
_truncated.pop(0)
# Pair-aware truncation: keep the last `ctx_size` messages while
# preserving assistant.tool_calls ↔ role=tool blocks atomically.
# Naive [-ctx_size:] slicing can leave orphan tool messages at the
# head when the cut lands mid-pair, which OpenAI rejects with
# "No tool call found for function call output" (issue #446).
_token_budget = token_budget_from_context_window(
getattr(effective_llm_model, "context_window_tokens", None)
)
if _token_budget:
_truncated = truncate_by_token_budget(
conversation,
ctx_size,
_token_budget,
)
else:
_truncated = truncate_by_message_count(conversation, ctx_size)

# Per-(user, agent) onboarding. With no row, prepend the
# greeting prompt and mark the pair as "greeted" once it
Expand Down
2 changes: 2 additions & 0 deletions backend/app/models/llm.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@ class LLMModel(Base):
temperature: Mapped[float | None] = mapped_column(Float, nullable=True)
request_timeout: Mapped[int | None] = mapped_column(Integer, nullable=True) # Request timeout in seconds, default 120
max_output_tokens: Mapped[int | None] = mapped_column(Integer, nullable=True) # Per-model output token limit override
# Model input context window used for local history fallback.
context_window_tokens: Mapped[int | None] = mapped_column(Integer, nullable=True)
created_at: Mapped[datetime] = mapped_column(DateTime(timezone=True), server_default=func.now())
updated_at: Mapped[datetime] = mapped_column(
DateTime(timezone=True), server_default=func.now(), onupdate=func.now()
Expand Down
3 changes: 3 additions & 0 deletions backend/app/schemas/schemas.py
Original file line number Diff line number Diff line change
Expand Up @@ -399,6 +399,7 @@ class LLMModelCreate(BaseModel):
enabled: bool = True
supports_vision: bool = False
max_output_tokens: int | None = None
context_window_tokens: int | None = Field(None, ge=1)
request_timeout: int | None = None

class LLMModelUpdate(BaseModel):
Expand All @@ -412,6 +413,7 @@ class LLMModelUpdate(BaseModel):
enabled: bool | None = None
supports_vision: bool | None = None
max_output_tokens: int | None = None
context_window_tokens: int | None = Field(None, ge=1)
request_timeout: int | None = None


Expand All @@ -427,6 +429,7 @@ class LLMModelOut(BaseModel):
enabled: bool
supports_vision: bool = False
max_output_tokens: int | None = None
context_window_tokens: int | None = None
request_timeout: int | None = None
created_at: datetime

Expand Down
223 changes: 223 additions & 0 deletions backend/app/services/history_window.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,223 @@
"""Tool-block-safe conversation history truncation.

Replaces naive ``conversation[-N:]`` slicing with a walker that keeps
``assistant.tool_calls`` and their matching ``role="tool"`` messages as an
atomic block — never half a pair, never orphan tool messages.

Why: OpenAI Responses API and Chat Completions both reject input where a
``function_call_output`` / ``role="tool"`` message has no matching
``function_call`` / ``assistant.tool_calls`` earlier in the input. Naive
``[-N:]`` slicing can leave such orphans at the head when the cut lands
between an assistant message and its tool results. This is the failure mode
reported in issue #446.

Tool results must be in the contiguous tool-result run immediately after
their owning assistant. A tool message inserted elsewhere (from malformed
persistence or upstream truncation) is dropped, not folded into an adjacent
block. This makes the helper robust against orphans at any position, not just
at the slice head.

Incomplete assistant tool-call blocks are also dropped. If an assistant
declares multiple tool calls, every declared ``tool_call_id`` must have a
matching ``role="tool"`` result before the next non-tool message. This mirrors
the API contract enforced by OpenAI-compatible providers and avoids sending
synthetic/fake tool results into weaker models' context.

Input is expected to be in OpenAI chat-completion format (post-reorganization
from DB ``role="tool_call"`` rows).
"""

from __future__ import annotations

import json
from typing import Any

from app.services.token_tracker import estimate_tokens_from_chars


TOKEN_BUDGET_CONTEXT_RATIO = 0.8


def _assistant_tool_call_ids(message: dict[str, Any]) -> list[str]:
    """Collect the usable tool-call ids an assistant message declares.

    Only messages whose ``role`` is ``"assistant"`` and whose ``tool_calls``
    value is a list are inspected. Within that list, entries that are not
    dicts, or whose ``id`` is not a non-empty string, are skipped.

    Returns:
        The ids in declaration order (duplicates are preserved); an empty
        list when nothing usable is declared.
    """
    declared = message.get("tool_calls") if message.get("role") == "assistant" else None
    if not isinstance(declared, list):
        return []

    # Pull the raw ``id`` of every dict-shaped entry, then keep only the
    # non-empty strings.
    raw_ids = (entry.get("id") for entry in declared if isinstance(entry, dict))
    return [raw_id for raw_id in raw_ids if isinstance(raw_id, str) and raw_id]


def _safe_history_blocks(messages: list[dict[str, Any]]) -> list[list[dict[str, Any]]]:
    """Group ``messages`` into API-safe blocks, preserving original order.

    Every message that is not ``role="tool"`` and declares no tool-call ids
    becomes a one-element block. An assistant message that declares tool-call
    ids opens a tool block; the contiguous ``role="tool"`` messages right
    after it are consumed, and those whose ``tool_call_id`` matches a declared
    id not yet answered are attached. The tool block is emitted only when
    every declared id has been answered; otherwise the assistant and its
    partial results are discarded. Tool messages that are orphaned, duplicated
    or carry an unknown id are dropped.
    """
    safe_blocks: list[list[dict[str, Any]]] = []
    open_block: list[dict[str, Any]] | None = None  # tool block being filled
    outstanding: set[str] = set()  # declared ids still lacking a result

    for entry in messages:
        if entry.get("role") == "tool":
            # A tool result is only meaningful inside the run that directly
            # follows its owning assistant; anywhere else it is an orphan.
            if open_block is not None:
                call_id = entry.get("tool_call_id")
                if isinstance(call_id, str) and call_id in outstanding:
                    outstanding.discard(call_id)
                    open_block.append(entry)
            continue

        # Any non-tool message ends the current tool-result run. Keep the
        # block only if it is complete; never invent missing results.
        if open_block is not None:
            if not outstanding:
                safe_blocks.append(open_block)
            open_block = None

        declared_ids = _assistant_tool_call_ids(entry)
        if declared_ids:
            open_block = [entry]
            outstanding = set(declared_ids)
        else:
            safe_blocks.append([entry])

    # Flush a tool block that runs up to the end of the conversation.
    if open_block is not None and not outstanding:
        safe_blocks.append(open_block)

    return safe_blocks


def token_budget_from_context_window(
    context_window_tokens: int | None,
    ratio: float = TOKEN_BUDGET_CONTEXT_RATIO,
) -> int | None:
    """Derive the history token budget from a model's explicit context window.

    Args:
        context_window_tokens: Configured context window size; a falsy or
            non-positive value means "not configured".
        ratio: Fraction of the window to use as the budget. Values outside
            ``[0, 1]`` are clamped into that range.

    Returns:
        ``None`` when no usable window is configured, otherwise the window
        scaled by ``ratio`` and truncated to an int, never less than 1.
    """
    if context_window_tokens and not context_window_tokens <= 0:
        # Clamp the ratio into [0, 1] before scaling.
        if ratio < 0.0:
            ratio = 0.0
        elif ratio > 1.0:
            ratio = 1.0
        scaled = int(context_window_tokens * ratio)
        return scaled if scaled >= 1 else 1
    return None


def _estimate_message_tokens(message: dict[str, Any]) -> int:
    """Return a rough token estimate for a single OpenAI-format message.

    The message is rendered as compact JSON (non-ASCII characters kept as-is);
    if it cannot be serialized, its ``str()`` form is used instead. The
    character count of that text is handed to ``estimate_tokens_from_chars``.
    """
    try:
        rendered = json.dumps(
            message,
            ensure_ascii=False,
            separators=(",", ":"),
        )
    except (TypeError, ValueError):
        # Not JSON-serializable: fall back to the plain string form so an
        # estimate is still produced.
        rendered = str(message)

    char_count = len(rendered)
    return estimate_tokens_from_chars(char_count)


def _estimate_block_tokens(block: list[dict[str, Any]]) -> int:
    """Return the summed token estimate of every message in ``block``."""
    total = 0
    for entry in block:
        total = total + _estimate_message_tokens(entry)
    return total


def truncate_by_message_count(
    messages: list[dict[str, Any]],
    max_messages: int,
) -> list[dict[str, Any]]:
    """Return the most recent messages that fit in ``max_messages``.

    The conversation is first split into atomic blocks by
    ``_safe_history_blocks``: either one non-tool message, or an assistant
    with ``tool_calls`` together with all of its matching contiguous
    ``role="tool"`` results. Orphan tool messages and incomplete tool-call
    blocks are already removed at that stage, independent of the budget.

    Blocks are then taken from the newest backwards while they fit. The first
    block that does not fit ends the selection; a block is never split, and
    older blocks are not considered after that point.

    Args:
        messages: Conversation in OpenAI chat format. May be empty.
        max_messages: Upper bound on the number of returned messages.
            Values ``<= 0`` yield ``[]``.

    Returns:
        A new list (the input is not mutated) holding at most
        ``max_messages`` messages from the tail of the safe history, in
        original order.
    """
    if max_messages <= 0 or not messages:
        return []

    blocks = _safe_history_blocks(messages)

    # Walk a cut index back from the end while the next-older block fits.
    remaining = max_messages
    cut = len(blocks)
    while cut > 0 and len(blocks[cut - 1]) <= remaining:
        cut -= 1
        remaining -= len(blocks[cut])

    return [entry for block in blocks[cut:] for entry in block]


def truncate_by_token_budget(
    messages: list[dict[str, Any]],
    max_messages: int,
    max_tokens: int,
    *,
    keep_latest_block: bool = True,
) -> list[dict[str, Any]]:
    """Return the most recent API-safe blocks that fit both budgets.

    Old blocks are dropped, never summarized, and tool-call blocks stay
    atomic. Blocks are taken from the newest backwards; selection stops at the
    first block that exceeds either the remaining message count or the
    remaining estimated tokens.

    Args:
        messages: Conversation in OpenAI chat format. May be empty.
        max_messages: Upper bound on returned messages; ``<= 0`` yields ``[]``.
        max_tokens: Upper bound on estimated tokens; ``<= 0`` yields ``[]``.
        keep_latest_block: When true, the newest block is always kept even if
            it alone exceeds a budget, so the current turn is not lost.

    Returns:
        A new list of the selected messages in original order.
    """
    if max_messages <= 0 or max_tokens <= 0 or not messages:
        return []

    blocks = _safe_history_blocks(messages)
    messages_left = max_messages
    tokens_left = max_tokens
    cut = len(blocks)  # blocks[cut:] is the selected tail

    for age, block in enumerate(reversed(blocks)):
        block_size = len(block)
        block_tokens = _estimate_block_tokens(block)

        fits = block_size <= messages_left and block_tokens <= tokens_left
        # Only the newest block (age 0) may be forced in over budget.
        forced = keep_latest_block and age == 0
        if not (fits or forced):
            break

        cut -= 1
        # A fitting block never drives a budget negative, so clamping at zero
        # only takes effect for a forced, over-budget newest block.
        messages_left = max(messages_left - block_size, 0)
        tokens_left = max(tokens_left - block_tokens, 0)

    return [entry for block in blocks[cut:] for entry in block]
Loading