
Commit 090163b

jsondai authored and copybara-github committed
chore: GenAI Client(evals) - simplify create eval run evaluation interface for user simulation
PiperOrigin-RevId: 885436116
1 parent 981a551 commit 090163b

9 files changed: 667 additions & 550 deletions
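In short, this commit replaces the flat `instruction`/`tool_declarations` fields on `types.evals.AgentInfo` with a dict of named `AgentConfig` entries plus a `root_agent_id`, and threads an optional `user_simulator_config` through `create_evaluation_run`. A minimal sketch of the new call shape, assembled from the replay tests below — the client construction and all resource names are illustrative placeholders, and the metric shows only the fields visible in this diff (the real `GENERAL_QUALITY_METRIC` constant carries more configuration):

```python
import vertexai
from vertexai import types

# Placeholder client setup; the replay tests below receive `client` from a fixture.
client = vertexai.Client(project="my-project", location="us-central1")

# Only the fields visible in this diff are shown here.
general_quality = types.EvaluationRunMetric(
    metric_config=types.UnifiedMetric(
        llm_based_metric_spec=types.LLMBasedMetricSpec(
            metric_prompt_template=(
                "\nEvaluate the fluency of the response. Provide a score from 1-5."
            )
        )
    ),
)

agent_info = types.evals.AgentInfo(
    agent_resource_name="projects/123/locations/us-central1/reasoningEngines/456",
    name="agent-1",
    agents={
        "agent-1": types.evals.AgentConfig(
            agent_id="agent-1",
            instruction="agent-1 instruction",
        )
    },
    root_agent_id="agent-1",
)

evaluation_run = client.evals.create_evaluation_run(
    name="my-eval-run",
    display_name="my-eval-run",
    dataset=types.EvaluationRunDataSource(
        evaluation_set="projects/123/locations/us-central1/evaluationSets/789"
    ),
    dest="gs://my-bucket/eval_run_output",
    metrics=[general_quality],
    agent_info=agent_info,
    user_simulator_config=types.evals.UserSimulatorConfig(max_turn=5),
)
```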


tests/unit/vertexai/genai/replays/test_create_evaluation_run.py

Lines changed: 114 additions & 14 deletions
@@ -17,8 +17,8 @@
 from tests.unit.vertexai.genai.replays import pytest_helper
 from vertexai import types
 from google.genai import types as genai_types
-import pytest
 import pandas as pd
+import pytest

 GCS_DEST = "gs://lakeyk-limited-bucket/eval_run_output"
 GENERAL_QUALITY_METRIC = types.EvaluationRunMetric(
@@ -42,7 +42,7 @@
     metric_config=types.UnifiedMetric(
         llm_based_metric_spec=types.LLMBasedMetricSpec(
             metric_prompt_template=(
-                "\nEvaluate the fluency of the response. Provide a score from 1-5."
+                "\nEvaluate the fluency of the response. Provide a score from" " 1-5."
             )
         )
     ),
@@ -65,7 +65,7 @@
     ),
 )
 INFERENCE_CONFIG = types.EvaluationRunInferenceConfig(
-    model="projects/503583131166/locations/us-central1/publishers/google/models/gemini-2.5-flash"
+    model="projects/977012026409/locations/us-central1/publishers/google/models/gemini-2.5-flash"
 )
 TOOL = genai_types.Tool(
     function_declarations=[
@@ -80,10 +80,16 @@
     ]
 )
 AGENT_INFO = types.evals.AgentInfo(
-    agent_resource_name="projects/123/locations/us-central1/reasoningEngines/456",
+    agent_resource_name=("projects/123/locations/us-central1/reasoningEngines/456"),
     name="agent-1",
-    instruction="agent-1 instruction",
-    tool_declarations=[TOOL],
+    agents={
+        "agent-1": types.evals.AgentConfig(
+            agent_id="agent-1",
+            instruction="agent-1 instruction",
+            tools=[TOOL],
+        )
+    },
+    root_agent_id="agent-1",
 )
 DEFAULT_PROMPT_TEMPLATE = "{prompt}"
 INPUT_DF_WITH_CONTEXT_AND_HISTORY = pd.DataFrame(
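The hunk above is the heart of the interface change: the flat `instruction`/`tool_declarations` fields give way to an `agents` dict of `AgentConfig` objects plus a `root_agent_id`. The dict-plus-root shape presumably exists so a single `AgentInfo` can describe multi-agent systems; this test registers only one agent, but a hypothetical two-agent setup (names and wiring invented for illustration, only the field names are taken from this diff) would look like:

```python
multi_agent_info = types.evals.AgentInfo(
    agent_resource_name="projects/123/locations/us-central1/reasoningEngines/456",
    name="travel-system",
    agents={
        "planner": types.evals.AgentConfig(
            agent_id="planner",
            instruction="Plan the trip and delegate bookings to booking-agent.",
        ),
        "booking-agent": types.evals.AgentConfig(
            agent_id="booking-agent",
            instruction="Book flights with the available tools.",
            tools=[TOOL],  # the genai_types.Tool defined earlier in this file
        ),
    },
    root_agent_id="planner",  # the conversation entry point
)
```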
@@ -96,9 +102,9 @@
     }
 )
 CANDIDATE_NAME = "candidate_1"
-MODEL_NAME = "projects/503583131166/locations/us-central1/publishers/google/models/gemini-2.5-flash"
+MODEL_NAME = "projects/977012026409/locations/us-central1/publishers/google/models/gemini-2.5-flash"
 EVAL_SET_NAME = (
-    "projects/503583131166/locations/us-central1/evaluationSets/6619939608513740800"
+    "projects/977012026409/locations/us-central1/evaluationSets/6619939608513740800"
 )


@@ -140,12 +146,11 @@ def test_create_eval_run_data_source_evaluation_set(client):
     assert evaluation_run.inference_configs[
         AGENT_INFO.name
     ] == types.EvaluationRunInferenceConfig(
-        agent_config=types.EvaluationRunAgentConfig(
-            developer_instruction=genai_types.Content(
-                parts=[genai_types.Part(text="agent-1 instruction")]
-            ),
-            tools=[TOOL],
-        )
+        agent_configs=AGENT_INFO.agents,
+        agent_run_config=types.AgentRunConfig(
+            agent_engine=AGENT_INFO.agent_resource_name,
+            user_simulator_config={"max_turn": 5},
+        ),
     )
     assert evaluation_run.labels == {
         "vertex-ai-evaluation-agent-engine-id": "456",
@@ -202,6 +207,53 @@ def test_create_eval_run_data_source_bigquery_request_set(client):
     assert evaluation_run.error is None


+def test_create_eval_run_with_user_simulator_config(client):
+    """Tests that create_evaluation_run() creates a correctly structured EvaluationRun with user_simulator_config."""
+    client._api_client._http_options.api_version = "v1beta1"
+    evaluation_run = client.evals.create_evaluation_run(
+        name="test_user_simulator_config",
+        display_name="test_user_simulator_config",
+        dataset=types.EvaluationRunDataSource(
+            evaluation_set="projects/977012026409/locations/us-central1/evaluationSets/3885168317211607040"
+        ),
+        dest=GCS_DEST,
+        metrics=[GENERAL_QUALITY_METRIC],
+        agent_info=AGENT_INFO,
+        user_simulator_config=types.evals.UserSimulatorConfig(
+            max_turn=5,
+        ),
+        labels={"label1": "value1"},
+    )
+    assert isinstance(evaluation_run, types.EvaluationRun)
+    assert evaluation_run.display_name == "test_user_simulator_config"
+    assert evaluation_run.state == types.EvaluationRunState.PENDING
+    assert isinstance(evaluation_run.data_source, types.EvaluationRunDataSource)
+    assert (
+        evaluation_run.data_source.evaluation_set
+        == "projects/977012026409/locations/us-central1/evaluationSets/3885168317211607040"
+    )
+    assert evaluation_run.evaluation_config == types.EvaluationRunConfig(
+        output_config=genai_types.OutputConfig(
+            gcs_destination=genai_types.GcsDestination(output_uri_prefix=GCS_DEST)
+        ),
+        metrics=[GENERAL_QUALITY_METRIC],
+    )
+    assert evaluation_run.inference_configs[
+        AGENT_INFO.name
+    ] == types.EvaluationRunInferenceConfig(
+        agent_configs=AGENT_INFO.agents,
+        agent_run_config=types.AgentRunConfig(
+            agent_engine=AGENT_INFO.agent_resource_name,
+            user_simulator_config=types.evals.UserSimulatorConfig(max_turn=5),
+        ),
+    )
+    assert evaluation_run.labels == {
+        "vertex-ai-evaluation-agent-engine-id": "456",
+        "label1": "value1",
+    }
+    assert evaluation_run.error is None
+
+
 def test_create_eval_run_with_inference_configs(client):
     """Tests that create_evaluation_run() creates a correctly structured EvaluationRun with inference_configs."""
     client._api_client._http_options.api_version = "v1beta1"
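The expected labels in this new test are worth a second look: the caller passes only `{"label1": "value1"}`, yet the run also carries `vertex-ai-evaluation-agent-engine-id: "456"`, matching the trailing segment of `AGENT_INFO.agent_resource_name`. A plausible derivation, which the assertions imply but the diff does not show:

```python
# Assumed label derivation; the client-side implementation is not part of this diff.
engine_id = AGENT_INFO.agent_resource_name.rsplit("/", 1)[-1]  # -> "456"
expected_labels = {
    "vertex-ai-evaluation-agent-engine-id": engine_id,
    "label1": "value1",  # user-supplied labels are merged in alongside it
}
```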
@@ -668,6 +720,54 @@ async def test_create_eval_run_async(client):
     assert evaluation_run.error is None


+@pytest.mark.asyncio
+async def test_create_eval_run_async_with_user_simulator_config(client):
+    """Tests that create_evaluation_run() creates a correctly structured EvaluationRun with user_simulator_config asynchronously."""
+    client._api_client._http_options.api_version = "v1beta1"
+    evaluation_run = await client.aio.evals.create_evaluation_run(
+        name="test_user_simulator_config_async",
+        display_name="test_user_simulator_config_async",
+        dataset=types.EvaluationRunDataSource(
+            evaluation_set="projects/977012026409/locations/us-central1/evaluationSets/3885168317211607040"
+        ),
+        dest=GCS_DEST,
+        metrics=[GENERAL_QUALITY_METRIC],
+        agent_info=AGENT_INFO,
+        user_simulator_config=types.evals.UserSimulatorConfig(
+            max_turn=5,
+        ),
+        labels={"label1": "value1"},
+    )
+    assert isinstance(evaluation_run, types.EvaluationRun)
+    assert evaluation_run.display_name == "test_user_simulator_config_async"
+    assert evaluation_run.state == types.EvaluationRunState.PENDING
+    assert isinstance(evaluation_run.data_source, types.EvaluationRunDataSource)
+    assert (
+        evaluation_run.data_source.evaluation_set
+        == "projects/977012026409/locations/us-central1/evaluationSets/3885168317211607040"
+    )
+    assert evaluation_run.evaluation_config == types.EvaluationRunConfig(
+        output_config=genai_types.OutputConfig(
+            gcs_destination=genai_types.GcsDestination(output_uri_prefix=GCS_DEST)
+        ),
+        metrics=[GENERAL_QUALITY_METRIC],
+    )
+    assert evaluation_run.inference_configs[
+        AGENT_INFO.name
+    ] == types.EvaluationRunInferenceConfig(
+        agent_configs=AGENT_INFO.agents,
+        agent_run_config=types.AgentRunConfig(
+            agent_engine=AGENT_INFO.agent_resource_name,
+            user_simulator_config=types.evals.UserSimulatorConfig(max_turn=5),
+        ),
+    )
+    assert evaluation_run.labels == {
+        "label1": "value1",
+        "vertex-ai-evaluation-agent-engine-id": "456",
+    }
+    assert evaluation_run.error is None
+
+
 @pytest.mark.asyncio
 async def test_create_eval_run_async_with_inference_configs(client):
     """Tests that create_evaluation_run() creates a correctly structured EvaluationRun with inference_configs asynchronously."""

tests/unit/vertexai/genai/replays/test_generate_user_scenarios.py

Lines changed: 46 additions & 54 deletions
@@ -22,24 +22,27 @@
 def test_gen_user_scenarios(client):
     """Tests that generate_user_scenarios() correctly calls the API and parses the response."""
     eval_dataset = client.evals.generate_user_scenarios(
-        agents={
-            "booking-agent": types.evals.AgentConfig(
-                agent_id="booking-agent",
-                agent_type="service_agent",
-                description="An agent capable of booking flights and hotels.",
-                instruction="You are a helpful travel assistant. Use tools to find flights.",
-                tools=[
-                    {
-                        "function_declarations": [
-                            {
-                                "name": "search_flights",
-                                "description": "Search for available flights.",
-                            }
-                        ]
-                    }
-                ],
-            )
-        },
+        agent_info=types.evals.AgentInfo(
+            agents={
+                "booking-agent": types.evals.AgentConfig(
+                    agent_id="booking-agent",
+                    agent_type="service_agent",
+                    description="An agent capable of booking flights and hotels.",
+                    instruction="You are a helpful travel assistant. Use tools to find flights.",
+                    tools=[
+                        {
+                            "function_declarations": [
+                                {
+                                    "name": "search_flights",
+                                    "description": "Search for available flights.",
+                                }
+                            ]
+                        }
+                    ],
+                )
+            },
+            root_agent_id="booking-agent",
+        ),
         user_scenario_generation_config=types.evals.UserScenarioGenerationConfig(
             user_scenario_count=2,
             simulation_instruction=(
@@ -49,18 +52,11 @@ def test_gen_user_scenarios(client):
             environment_data="Today is Monday. Flights to Paris are available.",
             model_name="gemini-2.5-flash",
         ),
-        root_agent_id="booking-agent",
     )
     assert isinstance(eval_dataset, types.EvaluationDataset)
     assert len(eval_dataset.eval_cases) == 2
-    assert (
-        eval_dataset.eval_cases[0].user_scenario.starting_prompt
-        == "I want to find a flight from New York to London."
-    )
-    assert (
-        eval_dataset.eval_cases[0].user_scenario.conversation_plan
-        == "Actually, I meant Paris, not London. Please search for flights to Paris."
-    )
+    assert eval_dataset.eval_cases[0].user_scenario.starting_prompt
+    assert eval_dataset.eval_cases[0].user_scenario.conversation_plan


 pytest_plugins = ("pytest_asyncio",)
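Alongside the signature change, the exact-string assertions on generated scenarios are replaced with truthiness checks, presumably because generated scenario text is nondeterministic and would make the replay brittle. A slightly stricter variant (hypothetical, not part of this commit) could still pin down the shape without pinning the content:

```python
# Hypothetical stricter checks; not in this commit.
scenario = eval_dataset.eval_cases[0].user_scenario
assert isinstance(scenario.starting_prompt, str) and scenario.starting_prompt.strip()
assert isinstance(scenario.conversation_plan, str) and scenario.conversation_plan.strip()
```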
@@ -70,24 +66,27 @@ def test_gen_user_scenarios(client):
 async def test_gen_user_scenarios_async(client):
     """Tests that generate_user_scenarios() async correctly calls the API and parses the response."""
     eval_dataset = await client.aio.evals.generate_user_scenarios(
-        agents={
-            "booking-agent": types.evals.AgentConfig(
-                agent_id="booking-agent",
-                agent_type="service_agent",
-                description="An agent capable of booking flights and hotels.",
-                instruction="You are a helpful travel assistant. Use tools to find flights.",
-                tools=[
-                    {
-                        "function_declarations": [
-                            {
-                                "name": "search_flights",
-                                "description": "Search for available flights.",
-                            }
-                        ]
-                    }
-                ],
-            )
-        },
+        agent_info=types.evals.AgentInfo(
+            agents={
+                "booking-agent": types.evals.AgentConfig(
+                    agent_id="booking-agent",
+                    agent_type="service_agent",
+                    description="An agent capable of booking flights and hotels.",
+                    instruction="You are a helpful travel assistant. Use tools to find flights.",
+                    tools=[
+                        {
+                            "function_declarations": [
+                                {
+                                    "name": "search_flights",
+                                    "description": "Search for available flights.",
+                                }
+                            ]
+                        }
+                    ],
+                )
+            },
+            root_agent_id="booking-agent",
+        ),
         user_scenario_generation_config=types.evals.UserScenarioGenerationConfig(
             user_scenario_count=2,
             simulation_instruction=(
@@ -97,18 +96,11 @@ async def test_gen_user_scenarios_async(client):
             environment_data="Today is Monday. Flights to Paris are available.",
             model_name="gemini-2.5-flash",
         ),
-        root_agent_id="booking-agent",
     )
     assert isinstance(eval_dataset, types.EvaluationDataset)
     assert len(eval_dataset.eval_cases) == 2
-    assert (
-        eval_dataset.eval_cases[1].user_scenario.starting_prompt
-        == "Find me a flight from Boston to Rome for next month."
-    )
-    assert (
-        eval_dataset.eval_cases[1].user_scenario.conversation_plan
-        == "Wait, change of plans. I need to go to Milan instead, and it needs to be a round trip, returning two weeks after departure."
-    )
+    assert eval_dataset.eval_cases[1].user_scenario.starting_prompt
+    assert eval_dataset.eval_cases[1].user_scenario.conversation_plan


 pytestmark = pytest_helper.setup(
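Across both test files, the payoff of the refactor is that one `AgentInfo` value now describes the agent system for both scenario generation and evaluation runs, instead of each API taking its own `agents`/`root_agent_id` parameters. A sketch of that reuse, with placeholder resource names and the same assumed client setup as the sketch at the top of this page:

```python
agent_info = types.evals.AgentInfo(
    agent_resource_name="projects/123/locations/us-central1/reasoningEngines/456",
    name="booking-agent",
    agents={
        "booking-agent": types.evals.AgentConfig(
            agent_id="booking-agent",
            instruction="You are a helpful travel assistant.",
        )
    },
    root_agent_id="booking-agent",
)

# Generate synthetic user scenarios for the agent system...
scenarios = client.evals.generate_user_scenarios(
    agent_info=agent_info,
    user_scenario_generation_config=types.evals.UserScenarioGenerationConfig(
        user_scenario_count=2,
        model_name="gemini-2.5-flash",
    ),
)

# ...and drive a simulated-user evaluation run with the very same AgentInfo.
run = client.evals.create_evaluation_run(
    name="booking-agent-eval",
    display_name="booking-agent-eval",
    dataset=types.EvaluationRunDataSource(
        evaluation_set="projects/123/locations/us-central1/evaluationSets/789"
    ),
    dest="gs://my-bucket/eval_run_output",
    metrics=[general_quality],  # as sketched at the top of the page
    agent_info=agent_info,
    user_simulator_config=types.evals.UserSimulatorConfig(max_turn=5),
)
```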
