@@ -25,11 +25,17 @@ class LangChainModelRunner(Runner):
:meth:`run`.
"""

def __init__(self, llm: BaseChatModel, config_messages: Optional[List[LDMessage]] = None):
def __init__(
self,
llm: BaseChatModel,
config_messages: Optional[List[LDMessage]] = None,
multi_turn: bool = True,
):
self._llm = llm
self._chat_history = InMemoryChatMessageHistory(
messages=cast(List[BaseMessage], convert_messages_to_langchain(config_messages or []))
)
self._multi_turn = multi_turn

def get_llm(self) -> BaseChatModel:
"""
@@ -61,7 +67,7 @@ async def run(
else:
result = await self._run_completion(langchain_messages)

if result.metrics.success and result.content:
if result.metrics.success and result.content and self._multi_turn:
self._chat_history.add_user_message(input)
self._chat_history.add_ai_message(result.content)
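In effect, a turn is recorded only when the call succeeded, produced content, and the runner was built with multi-turn mode on. A minimal sketch of that guard in isolation; the helper name and the shape of `result` are illustrative, not SDK API:

def record_turn(history, user_input, result, multi_turn: bool) -> None:
    # Illustrative only: skip failed calls, empty responses, and single-shot runners.
    if multi_turn and result.metrics.success and result.content:
        history.add_user_message(user_input)    # InMemoryChatMessageHistory methods
        history.add_ai_message(result.content)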

@@ -61,13 +61,16 @@ def create_agent_graph(
)
return LangGraphAgentGraphRunner(graph_def, tools)

def create_model(self, config: AIConfigKind) -> LangChainModelRunner:
def create_model(self, config: AIConfigKind, multi_turn: bool = True) -> LangChainModelRunner:
"""
Create a configured LangChainModelRunner for the given AI config.

:param config: The LaunchDarkly AI configuration
:param multi_turn: When ``True`` (the default) the runner accumulates
successful exchanges into its conversation history. Pass ``False`` to
keep history fixed at the configured baseline across ``run()`` calls.
:return: LangChainModelRunner ready to invoke the model
"""
llm = create_langchain_model(config)
config_messages = list(getattr(config, 'messages', None) or [])
return LangChainModelRunner(llm, config_messages)
return LangChainModelRunner(llm, config_messages, multi_turn=multi_turn)
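A hedged usage sketch of the new flag at the provider level; `provider` and `ai_config` are placeholders for an already-constructed LangChain provider and a fetched AI config:

chat_runner = provider.create_model(ai_config)                    # default: history grows with each successful turn
single_shot = provider.create_model(ai_config, multi_turn=False)  # history stays at the configured baseline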
@@ -281,6 +281,41 @@ async def test_accumulates_history_across_successful_calls(self, mock_llm):
assert second_call_messages[1].content == 'First response'
assert second_call_messages[2].content == 'Second question'

@pytest.mark.asyncio
async def test_multi_turn_false_does_not_accumulate_history(self, mock_llm):
"""When multi_turn=False the runner must not append to history on success."""
mock_llm.ainvoke = AsyncMock(side_effect=[
AIMessage(content='First response'),
AIMessage(content='Second response'),
])
provider = LangChainModelRunner(mock_llm, multi_turn=False)
baseline_len = len(provider._chat_history.messages)

await provider.run('First question')
assert len(provider._chat_history.messages) == baseline_len

await provider.run('Second question')
assert len(provider._chat_history.messages) == baseline_len

second_call_messages = mock_llm.ainvoke.call_args_list[1][0][0]
assert len(second_call_messages) == 1
assert second_call_messages[0].content == 'Second question'

@pytest.mark.asyncio
async def test_multi_turn_default_accumulates_history(self, mock_llm):
"""Default behavior (multi_turn omitted) still accumulates history (preserves PR #166)."""
mock_llm.ainvoke = AsyncMock(side_effect=[
AIMessage(content='First response'),
AIMessage(content='Second response'),
])
provider = LangChainModelRunner(mock_llm)
baseline_len = len(provider._chat_history.messages)

await provider.run('First question')
await provider.run('Second question')
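# Two successful turns record (user + assistant) x 2 = 4 messages beyond the baseline.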

assert len(provider._chat_history.messages) == baseline_len + 4

@pytest.mark.asyncio
async def test_does_not_accumulate_history_on_failed_call(self, mock_llm):
"""Should not add to history when the call fails."""
@@ -29,11 +29,13 @@ def __init__(
model_name: str,
parameters: Dict[str, Any],
config_messages: Optional[List[LDMessage]] = None,
multi_turn: bool = True,
):
self._client = client
self._model_name = model_name
self._parameters = parameters
self._history: List[LDMessage] = list(config_messages or [])
self._multi_turn = multi_turn

async def run(
self,
@@ -58,7 +60,7 @@ async def run(
else:
result = await self._run_completion(messages)

if result.metrics.success and result.content:
if result.metrics.success and result.content and self._multi_turn:
self._history.append(user_message)
self._history.append(LDMessage(role='assistant', content=result.content))

@@ -84,7 +84,7 @@ def create_agent_graph(
from ldai_openai.openai_agent_graph_runner import OpenAIAgentGraphRunner
return OpenAIAgentGraphRunner(graph_def, tools)

def create_model(self, config: AIConfigKind) -> OpenAIModelRunner:
def create_model(self, config: AIConfigKind, multi_turn: bool = True) -> OpenAIModelRunner:
"""
Create a configured OpenAIModelRunner for the given AI config.

@@ -93,6 +93,9 @@ def create_model(self, config: AIConfigKind) -> OpenAIModelRunner:
needed; all other fields are passed through from the config.

:param config: The LaunchDarkly AI configuration
:param multi_turn: When ``True`` (the default) the runner accumulates
successful exchanges into its conversation history. Pass ``False`` to
keep history fixed at the configured baseline across ``run()`` calls.
:return: OpenAIModelRunner ready to invoke the model
"""
model_name, parameters = self._extract_model_config(config)
@@ -101,7 +104,9 @@ def create_model(self, config: AIConfigKind) -> OpenAIModelRunner:
if tool_defs:
parameters['tools'] = normalize_tool_types(tool_defs)
config_messages = list(getattr(config, 'messages', None) or [])
return OpenAIModelRunner(self._client, model_name, parameters, config_messages)
return OpenAIModelRunner(
self._client, model_name, parameters, config_messages, multi_turn=multi_turn
)

def get_client(self) -> AsyncOpenAI:
"""
@@ -234,6 +234,65 @@ def make_response(text: str):
{'role': 'user', 'content': 'Second question'},
]

@pytest.mark.asyncio
async def test_multi_turn_false_does_not_accumulate_history(self, mock_client):
"""When multi_turn=False the runner must not append to history on success."""
def make_response(text: str):
r = MagicMock()
r.context_wrapper = None
r.choices = [MagicMock()]
r.choices[0].message = MagicMock()
r.choices[0].message.content = text
r.usage = None
return r

mock_client.chat = MagicMock()
mock_client.chat.completions = MagicMock()
mock_client.chat.completions.create = AsyncMock(side_effect=[
make_response('First response'),
make_response('Second response'),
])

provider = OpenAIModelRunner(mock_client, 'gpt-4o', {}, multi_turn=False)
baseline_len = len(provider._history)

await provider.run('First question')
assert len(provider._history) == baseline_len

await provider.run('Second question')
assert len(provider._history) == baseline_len

# Each call must see only the configured baseline, never the prior turn.
second_call_messages = mock_client.chat.completions.create.call_args_list[1].kwargs['messages']
assert second_call_messages == [{'role': 'user', 'content': 'Second question'}]

@pytest.mark.asyncio
async def test_multi_turn_default_accumulates_history(self, mock_client):
"""Default behavior (multi_turn omitted) still accumulates history (preserves PR #166)."""
def make_response(text: str):
r = MagicMock()
r.context_wrapper = None
r.choices = [MagicMock()]
r.choices[0].message = MagicMock()
r.choices[0].message.content = text
r.usage = None
return r

mock_client.chat = MagicMock()
mock_client.chat.completions = MagicMock()
mock_client.chat.completions.create = AsyncMock(side_effect=[
make_response('First response'),
make_response('Second response'),
])

provider = OpenAIModelRunner(mock_client, 'gpt-4o', {})
baseline_len = len(provider._history)

await provider.run('First question')
await provider.run('Second question')

assert len(provider._history) == baseline_len + 4

@pytest.mark.asyncio
async def test_does_not_accumulate_history_on_failed_call(self, mock_client):
"""Should not add to history when the call fails."""
4 changes: 3 additions & 1 deletion packages/sdk/server-ai/src/ldai/client.py
@@ -339,7 +339,9 @@ def _create_judge_instance(
if not judge_config.enabled:
return None

provider = RunnerFactory.create_model(judge_config, default_ai_provider)
provider = RunnerFactory.create_model(
judge_config, default_ai_provider, multi_turn=False
)
if not provider:
return None

8 changes: 7 additions & 1 deletion packages/sdk/server-ai/src/ldai/judge/__init__.py
@@ -132,13 +132,19 @@ async def evaluate_messages(
"""
Evaluates an AI response from chat messages and response.

The conversation is rendered for the judge by joining each message as
``"{role}: {content}"`` on newlines, preserving who said what so the
judge can distinguish user turns from assistant turns.

:param messages: Array of messages representing the conversation history
:param response: The runner result to be evaluated
:param sampling_ratio: Sampling ratio (0-1) to determine if evaluation should be processed.
When ``None`` (the default), falls back to ``self.sample_rate``.
:return: The result of the judge evaluation.
"""
input_text = '\r\n'.join([msg.content for msg in messages]) if messages else ''
input_text = (
'\n'.join(f'{msg.role}: {msg.content}' for msg in messages) if messages else ''
)
output_text = response.content

return await self.evaluate(input_text, output_text, sampling_ratio)
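A worked illustration of the new rendering, assuming LDMessage is importable as in the tests:

messages = [LDMessage(role='user', content='hi'), LDMessage(role='assistant', content='hello')]
# Old behavior: '\r\n'.join of contents only       -> 'hi\r\nhello' (roles discarded)
# New behavior: role-prefixed lines joined on '\n' -> 'user: hi\nassistant: hello'
input_text = '\n'.join(f'{m.role}: {m.content}' for m in messages)
assert input_text == 'user: hi\nassistant: hello'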
7 changes: 6 additions & 1 deletion packages/sdk/server-ai/src/ldai/providers/ai_provider.py
@@ -15,13 +15,18 @@ class AIProvider(ABC):
create_model(), create_agent(), and create_agent_graph().
"""

def create_model(self, config: Any) -> Optional[Any]:
def create_model(self, config: Any, multi_turn: bool = True) -> Optional[Any]:
"""
Create a configured model executor for the given AI config.

Default implementation warns. Provider implementations should override this method.

:param config: The LaunchDarkly AI configuration
:param multi_turn: When ``True`` (the default) the returned runner should
accumulate conversation history across successful ``run()`` calls.
When ``False`` each invocation starts from the same baseline history,
which is required for callers that share one runner across
independent invocations (e.g. judges).
:return: Configured model runner instance, or None if unsupported
"""
log.warning('create_model not implemented by this provider')
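For third-party providers, honoring the flag comes down to whether the runner they return appends to its own history. A hedged sketch; every name below except AIProvider.create_model is a placeholder, not SDK API:

class EchoProvider(AIProvider):
    # Hypothetical provider used only to illustrate the contract.
    def create_model(self, config: Any, multi_turn: bool = True) -> Optional[Any]:
        class EchoRunner:
            def __init__(self) -> None:
                self.history: list = []

            async def run(self, user_input: str) -> str:
                reply = f'echo: {user_input}'
                if multi_turn:  # accumulate only when multi-turn mode is requested
                    self.history += [user_input, reply]
                return reply

        return EchoRunner()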
10 changes: 9 additions & 1 deletion packages/sdk/server-ai/src/ldai/providers/runner_factory.py
@@ -120,17 +120,25 @@ def _get_providers_to_try(
def create_model(
config: AIConfigKind,
default_ai_provider: Optional[str] = None,
multi_turn: bool = True,
) -> Optional[Runner]:
"""
Create a model executor for the given AI completion config.

:param config: LaunchDarkly AI config (completion or judge)
:param default_ai_provider: Optional provider override ('openai', 'langchain', …)
:param multi_turn: When ``True`` (the default) the returned runner appends
each successful exchange to its history so subsequent ``run()`` calls
include the prior conversation. Set ``False`` for callers that share a
single runner across independent invocations (for example, judges) so
each call starts from the same baseline history.
:return: Configured Runner ready to invoke the model, or None
"""
provider_name = config.provider.name.lower() if config.provider else None
providers = RunnerFactory._get_providers_to_try(default_ai_provider, provider_name)
return RunnerFactory._with_fallback(providers, lambda p: p.create_model(config))
return RunnerFactory._with_fallback(
providers, lambda p: p.create_model(config, multi_turn=multi_turn)
)

@staticmethod
def create_agent(
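The factory threads the same flag through whichever provider ends up handling the config; a hedged usage sketch (config names are placeholders):

chat_runner = RunnerFactory.create_model(completion_config)                                     # default: accumulates turns
judge_runner = RunnerFactory.create_model(judge_config, default_ai_provider, multi_turn=False)  # judges reuse one runner across independent evaluations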
74 changes: 74 additions & 0 deletions packages/sdk/server-ai/tests/test_judge.py
@@ -1,5 +1,6 @@
"""Tests for Judge functionality."""

from typing import List
from unittest.mock import AsyncMock, MagicMock, call, patch

import pytest
@@ -543,6 +544,79 @@ async def test_evaluate_messages_calls_evaluate(
assert result.success is True
assert tracker.track_metrics_of_async.called

@pytest.mark.asyncio
async def test_evaluate_messages_preserves_roles_in_input(
self, judge_config_with_key: AIJudgeConfig, mock_runner
):
"""evaluate_messages must forward role-prefixed lines to evaluate()."""
messages = [
LDMessage(role='user', content='hi'),
LDMessage(role='assistant', content='hello'),
]
chat_response = RunnerResult(content='reply', metrics=LDAIMetrics(success=True))

judge = Judge(judge_config_with_key, mock_runner)
with patch.object(judge, 'evaluate', new=AsyncMock(return_value=JudgeResult(judge_config_key='judge-config'))) as mock_evaluate:
await judge.evaluate_messages(messages, chat_response)

mock_evaluate.assert_called_once()
args, _ = mock_evaluate.call_args
assert args[0] == 'user: hi\nassistant: hello'
assert args[1] == 'reply'


class TestJudgeRunnerNonMultiTurn:
"""Successive evaluate() calls must not contaminate each other.

The Judge shares one runner across evaluations, so the runner must be
stateless across calls — RunnerFactory.create_model(..., multi_turn=False)
is what guarantees that at the client layer. These tests verify the Judge
itself does not accidentally mutate the runner's history and that two
evaluations see the same baseline.
"""

@pytest.mark.asyncio
async def test_two_evaluations_do_not_contaminate_history(
self, judge_config_with_key: AIJudgeConfig, tracker: LDAIConfigTracker
):
"""A judge bound to a non-multi-turn runner must run the same baseline twice."""
# Stand in a fake runner that records the history it would expose to the
# LLM at the moment run() is called. With multi_turn=False the recorded
# baseline should be identical across calls.
seen_baselines: List[List[LDMessage]] = []

class _FakeRunner:
def __init__(self):
self._history: List[LDMessage] = []
self._multi_turn = False

async def run(self, input, output_type=None): # type: ignore[no-untyped-def]
# Snapshot history as seen at call time.
seen_baselines.append(list(self._history))
return RunnerResult(
content='ok',
metrics=LDAIMetrics(success=True),
parsed={'score': 0.9, 'reasoning': 'fine'},
)

runner = _FakeRunner()

async def _await_fn(_metric_fn, fn):
return await fn()

tracker.track_metrics_of_async = AsyncMock(side_effect=_await_fn)
judge = Judge(judge_config_with_key, runner) # type: ignore[arg-type]

await judge.evaluate('first input', 'first output')
await judge.evaluate('second input', 'second output')

assert len(seen_baselines) == 2
# Both runs see the same baseline (empty in this fake; the point is
# they're equal — no contamination from the prior turn).
assert seen_baselines[0] == seen_baselines[1]
# And the runner's history never grew because multi_turn is False.
assert runner._history == []


class TestJudgeConfigStripsLegacyMessages:
"""Tests for ``LDAIClient.judge_config()`` legacy-message stripping.