@@ -25,11 +25,17 @@ class LangChainModelRunner(Runner):
:meth:`run`.
"""

def __init__(self, llm: BaseChatModel, config_messages: Optional[List[LDMessage]] = None):
def __init__(
self,
llm: BaseChatModel,
config_messages: Optional[List[LDMessage]] = None,
multi_turn: bool = True,
):
self._llm = llm
self._chat_history = InMemoryChatMessageHistory(
messages=cast(List[BaseMessage], convert_messages_to_langchain(config_messages or []))
)
self._multi_turn = multi_turn

def get_llm(self) -> BaseChatModel:
"""
@@ -61,7 +67,7 @@ async def run(
else:
result = await self._run_completion(langchain_messages)

if result.metrics.success and result.content:
if result.metrics.success and result.content and self._multi_turn:
self._chat_history.add_user_message(input)
self._chat_history.add_ai_message(result.content)
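In effect, a turn is recorded only when the call succeeded, produced content, and the runner was built with multi-turn mode on. A minimal sketch of that guard in isolation; the helper name and the shape of `result` are illustrative, not SDK API:

def record_turn(history, user_input, result, multi_turn: bool) -> None:
    # Illustrative only: skip failed calls, empty responses, and single-shot runners.
    if multi_turn and result.metrics.success and result.content:
        history.add_user_message(user_input)    # InMemoryChatMessageHistory methods
        history.add_ai_message(result.content)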

@@ -61,13 +61,16 @@ def create_agent_graph(
)
return LangGraphAgentGraphRunner(graph_def, tools)

def create_model(self, config: AIConfigKind) -> LangChainModelRunner:
def create_model(self, config: AIConfigKind, multi_turn: bool = True) -> LangChainModelRunner:
"""
Create a configured LangChainModelRunner for the given AI config.

:param config: The LaunchDarkly AI configuration
:param multi_turn: When ``True`` (the default) the runner accumulates
successful exchanges into its conversation history. Pass ``False`` to
keep history fixed at the configured baseline across ``run()`` calls.
:return: LangChainModelRunner ready to invoke the model
"""
llm = create_langchain_model(config)
config_messages = list(getattr(config, 'messages', None) or [])
return LangChainModelRunner(llm, config_messages)
return LangChainModelRunner(llm, config_messages, multi_turn=multi_turn)
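A hedged usage sketch of the new flag at the provider level; `provider` and `ai_config` are placeholders for an already-constructed LangChain provider and a fetched AI config:

chat_runner = provider.create_model(ai_config)                    # default: history grows with each successful turn
single_shot = provider.create_model(ai_config, multi_turn=False)  # history stays at the configured baseline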
@@ -281,6 +281,41 @@ async def test_accumulates_history_across_successful_calls(self, mock_llm):
assert second_call_messages[1].content == 'First response'
assert second_call_messages[2].content == 'Second question'

@pytest.mark.asyncio
async def test_multi_turn_false_does_not_accumulate_history(self, mock_llm):
"""When multi_turn=False the runner must not append to history on success."""
mock_llm.ainvoke = AsyncMock(side_effect=[
AIMessage(content='First response'),
AIMessage(content='Second response'),
])
provider = LangChainModelRunner(mock_llm, multi_turn=False)
baseline_len = len(provider._chat_history.messages)

await provider.run('First question')
assert len(provider._chat_history.messages) == baseline_len

await provider.run('Second question')
assert len(provider._chat_history.messages) == baseline_len

second_call_messages = mock_llm.ainvoke.call_args_list[1][0][0]
assert len(second_call_messages) == 1
assert second_call_messages[0].content == 'Second question'

@pytest.mark.asyncio
async def test_multi_turn_default_accumulates_history(self, mock_llm):
"""Default behavior (multi_turn omitted) still accumulates history (preserves PR #166)."""
mock_llm.ainvoke = AsyncMock(side_effect=[
AIMessage(content='First response'),
AIMessage(content='Second response'),
])
provider = LangChainModelRunner(mock_llm)
baseline_len = len(provider._chat_history.messages)

await provider.run('First question')
await provider.run('Second question')
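# Two successful turns record (user + assistant) x 2 = 4 messages beyond the baseline.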

assert len(provider._chat_history.messages) == baseline_len + 4

@pytest.mark.asyncio
async def test_does_not_accumulate_history_on_failed_call(self, mock_llm):
"""Should not add to history when the call fails."""
@@ -29,11 +29,13 @@ def __init__(
model_name: str,
parameters: Dict[str, Any],
config_messages: Optional[List[LDMessage]] = None,
multi_turn: bool = True,
):
self._client = client
self._model_name = model_name
self._parameters = parameters
self._history: List[LDMessage] = list(config_messages or [])
self._multi_turn = multi_turn

async def run(
self,
@@ -58,7 +60,7 @@ async def run(
else:
result = await self._run_completion(messages)

if result.metrics.success and result.content:
if result.metrics.success and result.content and self._multi_turn:
self._history.append(user_message)
self._history.append(LDMessage(role='assistant', content=result.content))

@@ -84,7 +84,7 @@ def create_agent_graph(
from ldai_openai.openai_agent_graph_runner import OpenAIAgentGraphRunner
return OpenAIAgentGraphRunner(graph_def, tools)

def create_model(self, config: AIConfigKind) -> OpenAIModelRunner:
def create_model(self, config: AIConfigKind, multi_turn: bool = True) -> OpenAIModelRunner:
"""
Create a configured OpenAIModelRunner for the given AI config.

@@ -93,6 +93,9 @@ def create_model(self, config: AIConfigKind) -> OpenAIModelRunner:
needed; all other fields are passed through from the config.

:param config: The LaunchDarkly AI configuration
:param multi_turn: When ``True`` (the default) the runner accumulates
successful exchanges into its conversation history. Pass ``False`` to
keep history fixed at the configured baseline across ``run()`` calls.
:return: OpenAIModelRunner ready to invoke the model
"""
model_name, parameters = self._extract_model_config(config)
@@ -101,7 +104,9 @@ def create_model(self, config: AIConfigKind) -> OpenAIModelRunner:
if tool_defs:
parameters['tools'] = normalize_tool_types(tool_defs)
config_messages = list(getattr(config, 'messages', None) or [])
return OpenAIModelRunner(self._client, model_name, parameters, config_messages)
return OpenAIModelRunner(
self._client, model_name, parameters, config_messages, multi_turn=multi_turn
)

def get_client(self) -> AsyncOpenAI:
"""
@@ -234,6 +234,65 @@ def make_response(text: str):
{'role': 'user', 'content': 'Second question'},
]

@pytest.mark.asyncio
async def test_multi_turn_false_does_not_accumulate_history(self, mock_client):
"""When multi_turn=False the runner must not append to history on success."""
def make_response(text: str):
r = MagicMock()
r.context_wrapper = None
r.choices = [MagicMock()]
r.choices[0].message = MagicMock()
r.choices[0].message.content = text
r.usage = None
return r

mock_client.chat = MagicMock()
mock_client.chat.completions = MagicMock()
mock_client.chat.completions.create = AsyncMock(side_effect=[
make_response('First response'),
make_response('Second response'),
])

provider = OpenAIModelRunner(mock_client, 'gpt-4o', {}, multi_turn=False)
baseline_len = len(provider._history)

await provider.run('First question')
assert len(provider._history) == baseline_len

await provider.run('Second question')
assert len(provider._history) == baseline_len

# Each call must see only the configured baseline, never the prior turn.
second_call_messages = mock_client.chat.completions.create.call_args_list[1].kwargs['messages']
assert second_call_messages == [{'role': 'user', 'content': 'Second question'}]

@pytest.mark.asyncio
async def test_multi_turn_default_accumulates_history(self, mock_client):
"""Default behavior (multi_turn omitted) still accumulates history (preserves PR #166)."""
def make_response(text: str):
r = MagicMock()
r.context_wrapper = None
r.choices = [MagicMock()]
r.choices[0].message = MagicMock()
r.choices[0].message.content = text
r.usage = None
return r

mock_client.chat = MagicMock()
mock_client.chat.completions = MagicMock()
mock_client.chat.completions.create = AsyncMock(side_effect=[
make_response('First response'),
make_response('Second response'),
])

provider = OpenAIModelRunner(mock_client, 'gpt-4o', {})
baseline_len = len(provider._history)

await provider.run('First question')
await provider.run('Second question')

assert len(provider._history) == baseline_len + 4

@pytest.mark.asyncio
async def test_does_not_accumulate_history_on_failed_call(self, mock_client):
"""Should not add to history when the call fails."""
4 changes: 3 additions & 1 deletion packages/sdk/server-ai/src/ldai/client.py
@@ -339,7 +339,9 @@ def _create_judge_instance(
if not judge_config.enabled:
return None

provider = RunnerFactory.create_model(judge_config, default_ai_provider)
provider = RunnerFactory.create_model(
judge_config, default_ai_provider, multi_turn=False
)
if not provider:
return None

8 changes: 7 additions & 1 deletion packages/sdk/server-ai/src/ldai/judge/__init__.py
@@ -132,13 +132,19 @@ async def evaluate_messages(
"""
Evaluates an AI response from chat messages and response.

The conversation is rendered for the judge by joining each message as
``"{role}: {content}"`` on newlines, preserving who said what so the
judge can distinguish user turns from assistant turns.

:param messages: Array of messages representing the conversation history
:param response: The runner result to be evaluated
:param sampling_ratio: Sampling ratio (0-1) to determine if evaluation should be processed.
When ``None`` (the default), falls back to ``self.sample_rate``.
:return: The result of the judge evaluation.
"""
input_text = '\r\n'.join([msg.content for msg in messages]) if messages else ''
input_text = (
'\n'.join(f'{msg.role}: {msg.content}' for msg in messages) if messages else ''
)
output_text = response.content

return await self.evaluate(input_text, output_text, sampling_ratio)
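A worked illustration of the new rendering, assuming LDMessage is importable as in the tests:

messages = [LDMessage(role='user', content='hi'), LDMessage(role='assistant', content='hello')]
# Old behavior: '\r\n'.join of contents only       -> 'hi\r\nhello' (roles discarded)
# New behavior: role-prefixed lines joined on '\n' -> 'user: hi\nassistant: hello'
input_text = '\n'.join(f'{m.role}: {m.content}' for m in messages)
assert input_text == 'user: hi\nassistant: hello'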
7 changes: 6 additions & 1 deletion packages/sdk/server-ai/src/ldai/providers/ai_provider.py
@@ -15,13 +15,18 @@ class AIProvider(ABC):
create_model(), create_agent(), and create_agent_graph().
"""

def create_model(self, config: Any) -> Optional[Any]:
def create_model(self, config: Any, multi_turn: bool = True) -> Optional[Any]:
"""
Create a configured model executor for the given AI config.

Default implementation warns. Provider implementations should override this method.

:param config: The LaunchDarkly AI configuration
:param multi_turn: When ``True`` (the default) the returned runner should
accumulate conversation history across successful ``run()`` calls.
When ``False`` each invocation starts from the same baseline history,
which is required for callers that share one runner across
independent invocations (e.g. judges).
:return: Configured model runner instance, or None if unsupported
"""
log.warning('create_model not implemented by this provider')
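For third-party providers, honoring the flag comes down to whether the runner they return appends to its own history. A hedged sketch; every name below except AIProvider.create_model is a placeholder, not SDK API:

class EchoProvider(AIProvider):
    # Hypothetical provider used only to illustrate the contract.
    def create_model(self, config: Any, multi_turn: bool = True) -> Optional[Any]:
        class EchoRunner:
            def __init__(self) -> None:
                self.history: list = []

            async def run(self, user_input: str) -> str:
                reply = f'echo: {user_input}'
                if multi_turn:  # accumulate only when multi-turn mode is requested
                    self.history += [user_input, reply]
                return reply

        return EchoRunner()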
10 changes: 9 additions & 1 deletion packages/sdk/server-ai/src/ldai/providers/runner_factory.py
@@ -120,17 +120,25 @@ def _get_providers_to_try(
def create_model(
config: AIConfigKind,
default_ai_provider: Optional[str] = None,
multi_turn: bool = True,
) -> Optional[Runner]:
"""
Create a model executor for the given AI completion config.

:param config: LaunchDarkly AI config (completion or judge)
:param default_ai_provider: Optional provider override ('openai', 'langchain', …)
:param multi_turn: When ``True`` (the default) the returned runner appends
each successful exchange to its history so subsequent ``run()`` calls
include the prior conversation. Set ``False`` for callers that share a
single runner across independent invocations (for example, judges) so
each call starts from the same baseline history.
:return: Configured Runner ready to invoke the model, or None
"""
provider_name = config.provider.name.lower() if config.provider else None
providers = RunnerFactory._get_providers_to_try(default_ai_provider, provider_name)
return RunnerFactory._with_fallback(providers, lambda p: p.create_model(config))
return RunnerFactory._with_fallback(
providers, lambda p: p.create_model(config, multi_turn=multi_turn)
)

@staticmethod
def create_agent(
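The factory threads the same flag through whichever provider ends up handling the config; a hedged usage sketch (config names are placeholders):

chat_runner = RunnerFactory.create_model(completion_config)                                     # default: accumulates turns
judge_runner = RunnerFactory.create_model(judge_config, default_ai_provider, multi_turn=False)  # judges reuse one runner across independent evaluations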
74 changes: 74 additions & 0 deletions packages/sdk/server-ai/tests/test_judge.py
@@ -1,5 +1,6 @@
"""Tests for Judge functionality."""

from typing import List
from unittest.mock import AsyncMock, MagicMock, call, patch

import pytest
@@ -543,6 +544,79 @@ async def test_evaluate_messages_calls_evaluate(
assert result.success is True
assert tracker.track_metrics_of_async.called

@pytest.mark.asyncio
async def test_evaluate_messages_preserves_roles_in_input(
self, judge_config_with_key: AIJudgeConfig, mock_runner
):
"""evaluate_messages must forward role-prefixed lines to evaluate()."""
messages = [
LDMessage(role='user', content='hi'),
LDMessage(role='assistant', content='hello'),
]
chat_response = RunnerResult(content='reply', metrics=LDAIMetrics(success=True))

judge = Judge(judge_config_with_key, mock_runner)
with patch.object(judge, 'evaluate', new=AsyncMock(return_value=JudgeResult(judge_config_key='judge-config'))) as mock_evaluate:
await judge.evaluate_messages(messages, chat_response)

mock_evaluate.assert_called_once()
args, _ = mock_evaluate.call_args
assert args[0] == 'user: hi\nassistant: hello'
assert args[1] == 'reply'


class TestJudgeRunnerNonMultiTurn:
"""Successive evaluate() calls must not contaminate each other.

The Judge shares one runner across evaluations, so the runner must be
stateless across calls — RunnerFactory.create_model(..., multi_turn=False)
is what guarantees that at the client layer. These tests verify the Judge
itself does not accidentally mutate the runner's history and that two
evaluations see the same baseline.
"""

@pytest.mark.asyncio
async def test_two_evaluations_do_not_contaminate_history(
self, judge_config_with_key: AIJudgeConfig, tracker: LDAIConfigTracker
):
"""A judge bound to a non-multi-turn runner must run the same baseline twice."""
# Stand in a fake runner that records the history it would expose to the
# LLM at the moment run() is called. With multi_turn=False the recorded
# baseline should be identical across calls.
seen_baselines: List[List[LDMessage]] = []

class _FakeRunner:
def __init__(self):
self._history: List[LDMessage] = []
self._multi_turn = False

async def run(self, input, output_type=None): # type: ignore[no-untyped-def]
# Snapshot history as seen at call time.
seen_baselines.append(list(self._history))
return RunnerResult(
content='ok',
metrics=LDAIMetrics(success=True),
parsed={'score': 0.9, 'reasoning': 'fine'},
)

runner = _FakeRunner()

async def _await_fn(_metric_fn, fn):
return await fn()

tracker.track_metrics_of_async = AsyncMock(side_effect=_await_fn)
judge = Judge(judge_config_with_key, runner) # type: ignore[arg-type]

await judge.evaluate('first input', 'first output')
await judge.evaluate('second input', 'second output')

assert len(seen_baselines) == 2
# Both runs see the same baseline (empty in this fake; the point is
# they're equal — no contamination from the prior turn).
assert seen_baselines[0] == seen_baselines[1]
# And the runner's history never grew because multi_turn is False.
assert runner._history == []


class TestJudgeConfigStripsLegacyMessages:
"""Tests for ``LDAIClient.judge_config()`` legacy-message stripping.