diff --git a/py/src/braintrust/devserver/server.py b/py/src/braintrust/devserver/server.py index a141fc79..fe9f1295 100644 --- a/py/src/braintrust/devserver/server.py +++ b/py/src/braintrust/devserver/server.py @@ -2,6 +2,7 @@ import json import sys import textwrap +import warnings from typing import Any @@ -313,7 +314,20 @@ def create_app(evaluators: list[Evaluator[Any, Any, Any]], org_name: str | None Configured Starlette app """ global _all_evaluators - _all_evaluators = {evaluator.eval_name: evaluator for evaluator in evaluators} + # Build the registry explicitly so a duplicate eval_name is surfaced (issue #366) + # rather than silently dropped by a dict comprehension. Keep the FIRST registration, + # mirroring how duplicate reporters are handled in cli/eval.py. + _all_evaluators = {} + for evaluator in evaluators: + if evaluator.eval_name in _all_evaluators: + warnings.warn( + f"Multiple evaluators registered with eval_name {evaluator.eval_name!r}; " + f"keeping the first and skipping the duplicate. " + f"Give each Eval(...) a unique name to register both.", + stacklevel=2, + ) + continue + _all_evaluators[evaluator.eval_name] = evaluator routes = [ Route("/", endpoint=index), @@ -345,9 +359,10 @@ def run_dev_server( org_name: Optional organization name to restrict access to """ print(f"Starting dev server on http://{host}:{port}") - print(f"Loaded {len(evaluators)} evaluator(s): {[e.eval_name for e in evaluators]}") app = create_app(evaluators, org_name=org_name) + # Report the count AFTER de-duplication so the log matches what /list actually serves (issue #366). + print(f"Loaded {len(_all_evaluators)} evaluator(s): {list(_all_evaluators.keys())}") uvicorn.run(app, host=host, port=port) diff --git a/py/src/braintrust/devserver/test_server_integration.py b/py/src/braintrust/devserver/test_server_integration.py index 4a013684..b91dc37f 100644 --- a/py/src/braintrust/devserver/test_server_integration.py +++ b/py/src/braintrust/devserver/test_server_integration.py @@ -428,3 +428,45 @@ async def fake_eval_async(*, project_id, **_kwargs): assert response.status_code == 200 assert captured["project_id"] == "request-explicit-project-id" + + +def test_create_app_warns_and_keeps_first_on_duplicate_eval_name(): + """create_app must warn (not silently drop) when two evaluators share an eval_name, + keep the FIRST registration, and end up with exactly one entry. + + Regression test for: + https://github.com/braintrustdata/braintrust-sdk-python/issues/366 + """ + if not has_devserver_installed(): + pytest.skip("Devserver dependencies not installed (requires .[cli])") + + from braintrust import Evaluator + from braintrust.devserver import server as devserver_module + from braintrust.devserver.server import create_app + + first = Evaluator( + project_name="SharedProject", + eval_name="shared-name", + data=lambda: [{"input": "a", "expected": "a"}], + task=lambda input, hooks: input, + scores=[], + experiment_name=None, + metadata=None, + ) + second = Evaluator( + project_name="SharedProject", + eval_name="shared-name", # intentionally the same eval_name + data=lambda: [{"input": "b", "expected": "b"}], + task=lambda input, hooks: "wrong", + scores=[], + experiment_name=None, + metadata=None, + ) + + with pytest.warns(UserWarning, match="shared-name"): + create_app([first, second]) + + # First registration kept, duplicate skipped — exactly one entry, matching what + # GET /list (and now the startup log) report. + assert devserver_module._all_evaluators["shared-name"] is first + assert len(devserver_module._all_evaluators) == 1