braintrustdata · srijanarya · May 30, 2026
diff --git a/py/src/braintrust/devserver/server.py b/py/src/braintrust/devserver/server.py
@@ -2,6 +2,7 @@
 import json
 import sys
 import textwrap
+import warnings
 from typing import Any
 
 
@@ -313,7 +314,20 @@ def create_app(evaluators: list[Evaluator[Any, Any, Any]], org_name: str | None
         Configured Starlette app
     """
     global _all_evaluators
-    _all_evaluators = {evaluator.eval_name: evaluator for evaluator in evaluators}
+    # Build the registry explicitly so a duplicate eval_name is surfaced (issue #366)
+    # rather than silently dropped by a dict comprehension. Keep the FIRST registration,
+    # mirroring how duplicate reporters are handled in cli/eval.py.
+    _all_evaluators = {}
+    for evaluator in evaluators:
+        if evaluator.eval_name in _all_evaluators:
+            warnings.warn(
+                f"Multiple evaluators registered with eval_name {evaluator.eval_name!r}; "
+                f"keeping the first and skipping the duplicate. "
+                f"Give each Eval(...) a unique name to register both.",
+                stacklevel=2,
+            )
+            continue
+        _all_evaluators[evaluator.eval_name] = evaluator
 
     routes = [
         Route("/", endpoint=index),
@@ -345,9 +359,10 @@ def run_dev_server(
         org_name: Optional organization name to restrict access to
     """
     print(f"Starting dev server on http://{host}:{port}")
-    print(f"Loaded {len(evaluators)} evaluator(s): {[e.eval_name for e in evaluators]}")
 
     app = create_app(evaluators, org_name=org_name)
+    # Report the count AFTER de-duplication so the log matches what /list actually serves (issue #366).
+    print(f"Loaded {len(_all_evaluators)} evaluator(s): {list(_all_evaluators.keys())}")
     uvicorn.run(app, host=host, port=port)
 
 

diff --git a/py/src/braintrust/devserver/test_server_integration.py b/py/src/braintrust/devserver/test_server_integration.py
@@ -428,3 +428,45 @@ async def fake_eval_async(*, project_id, **_kwargs):
 
     assert response.status_code == 200
     assert captured["project_id"] == "request-explicit-project-id"
+
+
+def test_create_app_warns_and_keeps_first_on_duplicate_eval_name():
+    """Duplicate eval_name values given to create_app raise a warning instead of vanishing.
+
+    Two evaluators sharing one eval_name are registered together; the earlier one must be
+    the entry left in the module-level registry, and that registry must hold one entry.
+
+    Regression test for:
+    https://github.com/braintrustdata/braintrust-sdk-python/issues/366
+    """
+    if not has_devserver_installed():
+        pytest.skip("Devserver dependencies not installed (requires .[cli])")
+
+    from braintrust import Evaluator
+    from braintrust.devserver import server as devserver_module
+    from braintrust.devserver.server import create_app
+
+    def build(value, task):
+        # Fresh Evaluator per call; only the single data row and the task differ.
+        return Evaluator(
+            project_name="SharedProject",
+            eval_name="shared-name",
+            data=lambda: [{"input": value, "expected": value}],
+            task=task,
+            scores=[],
+            experiment_name=None,
+            metadata=None,
+        )
+
+    original = build("a", lambda input, hooks: input)
+    duplicate = build("b", lambda input, hooks: "wrong")  # intentionally the same eval_name
+
+    # Registering both must emit a UserWarning whose message names the shared eval_name.
+    with pytest.warns(UserWarning, match="shared-name"):
+        create_app([original, duplicate])
+
+    registry = devserver_module._all_evaluators
+    # The earlier registration must win and the later one be skipped, so the registry
+    # ends up holding a single entry under the shared name.
+    assert len(registry) == 1
+    assert registry["shared-name"] is original