@@ -46,9 +46,7 @@ def test_defaults(self) -> None:
4646 assert result .error is None
4747
def test_with_error(self) -> None:
    """A failed EvalResult keeps its failure flag and its error message."""
    fields = {
        "task_name": "t1",
        "success": False,
        "duration_ms": 50.0,
        "error": "timeout",
    }
    failed = EvalResult(**fields)
    assert failed.success is False
    assert failed.error == "timeout"
5452
@@ -66,10 +64,16 @@ def test_empty(self) -> None:
6664 assert br .total_tokens == 0
6765
6866 def test_all_passed (self ) -> None :
69- br = BenchmarkResults (results = [
70- EvalResult (task_name = "t1" , success = True , duration_ms = 100.0 , tokens_in = 10 , tokens_out = 5 ),
71- EvalResult (task_name = "t2" , success = True , duration_ms = 200.0 , tokens_in = 20 , tokens_out = 10 ),
72- ])
67+ br = BenchmarkResults (
68+ results = [
69+ EvalResult (
70+ task_name = "t1" , success = True , duration_ms = 100.0 , tokens_in = 10 , tokens_out = 5
71+ ),
72+ EvalResult (
73+ task_name = "t2" , success = True , duration_ms = 200.0 , tokens_in = 20 , tokens_out = 10
74+ ),
75+ ]
76+ )
7377 assert br .total_tasks == 2
7478 assert br .passed == 2
7579 assert br .failed == 0
@@ -78,39 +82,49 @@ def test_all_passed(self) -> None:
7882 assert br .total_tokens == 45
7983
def test_mixed_results(self) -> None:
    """One passing and one failing result give passed=1, failed=1, pass_rate=0.5."""
    ok = EvalResult(task_name="t1", success=True, duration_ms=100.0)
    bad = EvalResult(task_name="t2", success=False, duration_ms=200.0, error="fail")
    outcome = BenchmarkResults(results=[ok, bad])
    assert outcome.passed == 1
    assert outcome.failed == 1
    assert outcome.pass_rate == 0.5
8894
def test_by_category(self) -> None:
    """by_category() groups two "math/..." tasks and one "general/..." task."""
    # by_category splits on "/" — names without "/" get "general"
    # (every name used here contains "/", so that fallback is not exercised).
    specs = [
        ("math/add", True),
        ("math/mul", True),
        ("general/weather", False),
    ]
    entries = [
        EvalResult(task_name=name, success=ok, duration_ms=100.0)
        for name, ok in specs
    ]
    grouped = BenchmarkResults(results=entries).by_category()
    assert len(grouped["math"]) == 2
    assert len(grouped["general"]) == 1
99107
def test_summary(self) -> None:
    """summary() for a single passing task mentions pass count, duration and tokens."""
    only = EvalResult(
        task_name="t1",
        success=True,
        duration_ms=100.0,
        tokens_in=10,
        tokens_out=5,
    )
    text = BenchmarkResults(results=[only]).summary()
    assert "1/1 passed" in text
    assert "100ms" in text
    # total tokens (10 in + 5 out)
    assert "15" in text
108120
109121 def test_summary_with_failures (self ) -> None :
110- br = BenchmarkResults (results = [
111- EvalResult (task_name = "t1" , success = True , duration_ms = 100.0 ),
112- EvalResult (task_name = "t2" , success = False , duration_ms = 50.0 , error = "bad output" ),
113- ])
122+ br = BenchmarkResults (
123+ results = [
124+ EvalResult (task_name = "t1" , success = True , duration_ms = 100.0 ),
125+ EvalResult (task_name = "t2" , success = False , duration_ms = 50.0 , error = "bad output" ),
126+ ]
127+ )
114128 summary = br .summary ()
115129 assert "1/2 passed" in summary
116130 assert "Failures:" in summary
@@ -178,21 +192,25 @@ def test_no_reset(self) -> None:
178192
def test_validation_pass(self) -> None:
    """A task whose validate callback accepts the response is counted as passed."""

    def mentions_temperature(r):
        # Validation hook: accept only responses containing "temperature".
        return "temperature" in r.response

    # Presumably the mock agent replies with the given text — confirm in _make_mock_agent.
    agent = _make_mock_agent(response="The temperature is 72F")
    task = EvalTask(name="t1", prompt="weather?", validate=mentions_temperature)
    results = run_benchmark(agent, [task])
    assert results.passed == 1
188204
189205 def test_validation_fail (self ) -> None :
190206 agent = _make_mock_agent (response = "ok" )
191- tasks = [EvalTask (
192- name = "t1" ,
193- prompt = "weather?" ,
194- validate = lambda r : "temperature" in r .response ,
195- )]
207+ tasks = [
208+ EvalTask (
209+ name = "t1" ,
210+ prompt = "weather?" ,
211+ validate = lambda r : "temperature" in r .response ,
212+ )
213+ ]
196214 results = run_benchmark (agent , tasks )
197215 assert results .passed == 0
198216 assert results .failed == 1
@@ -253,11 +271,13 @@ async def test_validation_fail(self) -> None:
253271 resp .turns_taken = 1
254272 resp .duration = "1.0s"
255273 agent .chat .return_value = resp
256- tasks = [EvalTask (
257- name = "t1" ,
258- prompt = "test" ,
259- validate = lambda r : "target" in r .response ,
260- )]
274+ tasks = [
275+ EvalTask (
276+ name = "t1" ,
277+ prompt = "test" ,
278+ validate = lambda r : "target" in r .response ,
279+ )
280+ ]
261281 results = await run_benchmark_async (agent , tasks )
262282 assert results .failed == 1
263283
0 commit comments