Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 17 additions & 2 deletions py/src/braintrust/devserver/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import json
import sys
import textwrap
import warnings
from typing import Any


Expand Down Expand Up @@ -313,7 +314,20 @@ def create_app(evaluators: list[Evaluator[Any, Any, Any]], org_name: str | None
Configured Starlette app
"""
global _all_evaluators
_all_evaluators = {evaluator.eval_name: evaluator for evaluator in evaluators}
# Build the registry explicitly so a duplicate eval_name is surfaced (issue #366)
# rather than silently dropped by a dict comprehension. Keep the FIRST registration,
# mirroring how duplicate reporters are handled in cli/eval.py.
_all_evaluators = {}
for evaluator in evaluators:
if evaluator.eval_name in _all_evaluators:
warnings.warn(
f"Multiple evaluators registered with eval_name {evaluator.eval_name!r}; "
f"keeping the first and skipping the duplicate. "
f"Give each Eval(...) a unique name to register both.",
stacklevel=2,
)
continue
_all_evaluators[evaluator.eval_name] = evaluator

routes = [
Route("/", endpoint=index),
Expand Down Expand Up @@ -345,9 +359,10 @@ def run_dev_server(
org_name: Optional organization name to restrict access to
"""
print(f"Starting dev server on http://{host}:{port}")
print(f"Loaded {len(evaluators)} evaluator(s): {[e.eval_name for e in evaluators]}")

app = create_app(evaluators, org_name=org_name)
# Report the count AFTER de-duplication so the log matches what /list actually serves (issue #366).
print(f"Loaded {len(_all_evaluators)} evaluator(s): {list(_all_evaluators.keys())}")
uvicorn.run(app, host=host, port=port)


Expand Down
42 changes: 42 additions & 0 deletions py/src/braintrust/devserver/test_server_integration.py
Original file line number Diff line number Diff line change
Expand Up @@ -428,3 +428,45 @@ async def fake_eval_async(*, project_id, **_kwargs):

assert response.status_code == 200
assert captured["project_id"] == "request-explicit-project-id"


def test_create_app_warns_and_keeps_first_on_duplicate_eval_name():
    """Duplicate eval_name registrations warn and resolve to the first evaluator.

    Two evaluators sharing one eval_name are handed to create_app. The call is
    expected to emit a UserWarning whose message mentions that name, and the
    module-level registry is expected to hold a single entry afterwards: the
    evaluator that was passed first.

    Regression test for:
    https://github.com/braintrustdata/braintrust-sdk-python/issues/366
    """
    if not has_devserver_installed():
        pytest.skip("Devserver dependencies not installed (requires .[cli])")

    from braintrust import Evaluator
    from braintrust.devserver import server as devserver_module
    from braintrust.devserver.server import create_app

    duplicated_name = "shared-name"

    def build_evaluator(value, task):
        # Every call creates fresh argument objects (notably a new scores list),
        # so the two evaluators share nothing except their names.
        return Evaluator(
            project_name="SharedProject",
            eval_name=duplicated_name,
            data=lambda: [{"input": value, "expected": value}],
            task=task,
            scores=[],
            experiment_name=None,
            metadata=None,
        )

    kept = build_evaluator("a", lambda input, hooks: input)
    # Same eval_name on purpose; this one should be skipped by create_app.
    skipped = build_evaluator("b", lambda input, hooks: "wrong")

    with pytest.warns(UserWarning, match=duplicated_name):
        create_app([kept, skipped])

    # Exactly one entry survives, and it is the first registration; this is the
    # state GET /list (and the startup log) are meant to reflect.
    registry = devserver_module._all_evaluators
    assert registry[duplicated_name] is kept
    assert len(registry) == 1