Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
105 changes: 102 additions & 3 deletions tests/unit/vertexai/genai/replays/test_create_evaluation_run.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,8 @@
from tests.unit.vertexai.genai.replays import pytest_helper
from vertexai import types
from google.genai import types as genai_types
import pytest
import pandas as pd
import pytest

GCS_DEST = "gs://lakeyk-limited-bucket/eval_run_output"
GENERAL_QUALITY_METRIC = types.EvaluationRunMetric(
Expand All @@ -42,7 +42,7 @@
metric_config=types.UnifiedMetric(
llm_based_metric_spec=genai_types.LLMBasedMetricSpec(
metric_prompt_template=(
"\nEvaluate the fluency of the response. Provide a score from 1-5."
"\nEvaluate the fluency of the response. Provide a score from" " 1-5."
)
)
),
Expand Down Expand Up @@ -80,7 +80,7 @@
]
)
AGENT_INFO = types.evals.AgentInfo(
agent_resource_name="projects/123/locations/us-central1/reasoningEngines/456",
agent_resource_name=("projects/123/locations/us-central1/reasoningEngines/456"),
name="agent-1",
agents={
"agent-1": types.evals.AgentConfig(
Expand Down Expand Up @@ -147,6 +147,10 @@ def test_create_eval_run_data_source_evaluation_set(client):
AGENT_INFO.name
] == types.EvaluationRunInferenceConfig(
agent_configs=AGENT_INFO.agents,
agent_run_config=types.AgentRunConfig(
agent_engine=AGENT_INFO.agent_resource_name,
user_simulator_config={"max_turn": 5},
),
)
assert evaluation_run.labels == {
"vertex-ai-evaluation-agent-engine-id": "456",
Expand Down Expand Up @@ -203,6 +207,53 @@ def test_create_eval_run_data_source_bigquery_request_set(client):
assert evaluation_run.error is None


def test_create_eval_run_with_user_simulator_config(client):
    """Verifies the EvaluationRun returned when user_simulator_config is supplied.

    The run is created from an existing evaluation set with AGENT_INFO and a
    UserSimulatorConfig(max_turn=5). The test then checks the returned run's
    display name, state, data source, evaluation config, per-agent inference
    config (including the agent run config), labels and error field.
    """
    # This test sets the client to the v1beta1 API version before the call.
    client._api_client._http_options.api_version = "v1beta1"

    run_name = "test_user_simulator_config"
    evaluation_set_name = "projects/977012026409/locations/us-central1/evaluationSets/3885168317211607040"

    evaluation_run = client.evals.create_evaluation_run(
        name=run_name,
        display_name=run_name,
        dataset=types.EvaluationRunDataSource(evaluation_set=evaluation_set_name),
        dest=GCS_DEST,
        metrics=[GENERAL_QUALITY_METRIC],
        agent_info=AGENT_INFO,
        user_simulator_config=types.evals.UserSimulatorConfig(max_turn=5),
        labels={"label1": "value1"},
    )

    # Basic shape of the returned run.
    assert isinstance(evaluation_run, types.EvaluationRun)
    assert evaluation_run.display_name == run_name
    assert evaluation_run.state == types.EvaluationRunState.PENDING

    # The evaluation set passed in must be echoed back unchanged.
    assert isinstance(evaluation_run.data_source, types.EvaluationRunDataSource)
    assert evaluation_run.data_source.evaluation_set == evaluation_set_name

    expected_evaluation_config = types.EvaluationRunConfig(
        output_config=genai_types.OutputConfig(
            gcs_destination=genai_types.GcsDestination(output_uri_prefix=GCS_DEST)
        ),
        metrics=[GENERAL_QUALITY_METRIC],
    )
    assert evaluation_run.evaluation_config == expected_evaluation_config

    # The inference config is keyed by the agent name and must carry the
    # user simulator settings inside its agent run config.
    expected_inference_config = types.EvaluationRunInferenceConfig(
        agent_configs=AGENT_INFO.agents,
        agent_run_config=types.AgentRunConfig(
            agent_engine=AGENT_INFO.agent_resource_name,
            user_simulator_config=types.evals.UserSimulatorConfig(max_turn=5),
        ),
    )
    assert (
        evaluation_run.inference_configs[AGENT_INFO.name] == expected_inference_config
    )

    # User labels are kept and the agent-engine id label is present.
    assert evaluation_run.labels == {
        "vertex-ai-evaluation-agent-engine-id": "456",
        "label1": "value1",
    }
    assert evaluation_run.error is None


def test_create_eval_run_with_inference_configs(client):
"""Tests that create_evaluation_run() creates a correctly structured EvaluationRun with inference_configs."""
client._api_client._http_options.api_version = "v1beta1"
Expand Down Expand Up @@ -669,6 +720,54 @@ async def test_create_eval_run_async(client):
assert evaluation_run.error is None


@pytest.mark.asyncio
async def test_create_eval_run_async_with_user_simulator_config(client):
    """Verifies the async create_evaluation_run() path with user_simulator_config.

    The run is created through ``client.aio.evals`` from an existing evaluation
    set with AGENT_INFO and a UserSimulatorConfig(max_turn=5). The test then
    checks the returned run's display name, state, data source, evaluation
    config, per-agent inference config, labels and error field.
    """
    # This test sets the client to the v1beta1 API version before the call.
    client._api_client._http_options.api_version = "v1beta1"

    run_name = "test_user_simulator_config_async"
    evaluation_set_name = "projects/977012026409/locations/us-central1/evaluationSets/3885168317211607040"

    evaluation_run = await client.aio.evals.create_evaluation_run(
        name=run_name,
        display_name=run_name,
        dataset=types.EvaluationRunDataSource(evaluation_set=evaluation_set_name),
        dest=GCS_DEST,
        metrics=[GENERAL_QUALITY_METRIC],
        agent_info=AGENT_INFO,
        user_simulator_config=types.evals.UserSimulatorConfig(max_turn=5),
        labels={"label1": "value1"},
    )

    # Basic shape of the returned run.
    assert isinstance(evaluation_run, types.EvaluationRun)
    assert evaluation_run.display_name == run_name
    assert evaluation_run.state == types.EvaluationRunState.PENDING

    # The evaluation set passed in must be echoed back unchanged.
    assert isinstance(evaluation_run.data_source, types.EvaluationRunDataSource)
    assert evaluation_run.data_source.evaluation_set == evaluation_set_name

    expected_evaluation_config = types.EvaluationRunConfig(
        output_config=genai_types.OutputConfig(
            gcs_destination=genai_types.GcsDestination(output_uri_prefix=GCS_DEST)
        ),
        metrics=[GENERAL_QUALITY_METRIC],
    )
    assert evaluation_run.evaluation_config == expected_evaluation_config

    # The inference config is keyed by the agent name and must carry the
    # user simulator settings inside its agent run config.
    expected_inference_config = types.EvaluationRunInferenceConfig(
        agent_configs=AGENT_INFO.agents,
        agent_run_config=types.AgentRunConfig(
            agent_engine=AGENT_INFO.agent_resource_name,
            user_simulator_config=types.evals.UserSimulatorConfig(max_turn=5),
        ),
    )
    assert (
        evaluation_run.inference_configs[AGENT_INFO.name] == expected_inference_config
    )

    # User labels are kept and the agent-engine id label is present.
    assert evaluation_run.labels == {
        "label1": "value1",
        "vertex-ai-evaluation-agent-engine-id": "456",
    }
    assert evaluation_run.error is None


@pytest.mark.asyncio
async def test_create_eval_run_async_with_inference_configs(client):
"""Tests that create_evaluation_run() creates a correctly structured EvaluationRun with inference_configs asynchronously."""
Expand Down
121 changes: 101 additions & 20 deletions vertexai/_genai/_evals_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -283,15 +283,66 @@ def _resolve_dataset(
api_client: BaseApiClient,
dataset: Union[types.EvaluationRunDataSource, types.EvaluationDataset],
dest: str,
agent_info_pydantic: Optional[types.evals.AgentInfo] = None,
parsed_agent_info: Optional[types.evals.AgentInfo] = None,
) -> types.EvaluationRunDataSource:
"""Resolves dataset for the evaluation run."""
if isinstance(dataset, types.EvaluationDataset):
candidate_name = _get_candidate_name(dataset, agent_info_pydantic)
candidate_name = _get_candidate_name(dataset, parsed_agent_info)
eval_df = dataset.eval_dataset_df
if eval_df is None and dataset.eval_cases:
rows = []
for case in dataset.eval_cases:
row: dict[str, Any] = {}
if case.prompt:
row[_evals_constant.PROMPT] = (
_evals_data_converters._get_content_text(case.prompt)
)

if (
case.responses
and len(case.responses) > 0
and case.responses[0].response
):
row[_evals_constant.RESPONSE] = (
_evals_data_converters._get_content_text(
case.responses[0].response
)
)

if case.reference and case.reference.response:
row[_evals_constant.REFERENCE] = (
_evals_data_converters._get_content_text(
case.reference.response
)
)

if case.agent_data:
row[AGENT_DATA] = case.agent_data

if case.intermediate_events:
row[_evals_constant.INTERMEDIATE_EVENTS] = [
{CONTENT: event.content}
for event in case.intermediate_events
if event.content
]

if case.user_scenario:
if case.user_scenario.starting_prompt:
row[_evals_constant.STARTING_PROMPT] = (
case.user_scenario.starting_prompt
)
if case.user_scenario.conversation_plan:
row[_evals_constant.CONVERSATION_PLAN] = (
case.user_scenario.conversation_plan
)

rows.append(row)
eval_df = pd.DataFrame(rows)

eval_set = _create_evaluation_set_from_dataframe(
api_client,
dest,
dataset.eval_dataset_df,
eval_df,
candidate_name,
)
dataset = types.EvaluationRunDataSource(evaluation_set=eval_set.name)
Expand Down Expand Up @@ -339,15 +390,34 @@ def _resolve_inference_configs(
inference_configs: Optional[
dict[str, types.EvaluationRunInferenceConfigOrDict]
] = None,
agent_info_pydantic: Optional[types.evals.AgentInfo] = None,
parsed_agent_info: Optional[types.evals.AgentInfo] = None,
) -> Optional[dict[str, types.EvaluationRunInferenceConfigOrDict]]:
"""Resolves inference configs for the evaluation run."""
# Resolve agent config
if agent_info_pydantic and agent_info_pydantic.name:
inference_configs = {}
inference_configs[agent_info_pydantic.name] = (
types.EvaluationRunInferenceConfig(agent_configs=agent_info_pydantic.agents)
)
if parsed_agent_info and parsed_agent_info.name:
if inference_configs is None:
inference_configs = {}

# We might have used "candidate-1" as a placeholder key in the caller,
# let's migrate it to the agent name, or if it doesn't exist, just create it.
if "candidate-1" in inference_configs:
inference_configs[parsed_agent_info.name] = inference_configs.pop(
"candidate-1"
)

if parsed_agent_info.name not in inference_configs:
inference_configs[parsed_agent_info.name] = (
types.EvaluationRunInferenceConfig(
agent_configs=parsed_agent_info.agents
)
)
else:
config = inference_configs[parsed_agent_info.name]
if isinstance(config, dict):
config["agent_configs"] = parsed_agent_info.agents
else:
config.agent_configs = parsed_agent_info.agents

# Resolve prompt template data
if inference_configs:
for inference_config in inference_configs.values():
Expand Down Expand Up @@ -381,33 +451,33 @@ def _resolve_inference_configs(

def _add_evaluation_run_labels(
labels: Optional[dict[str, str]] = None,
agent_info_pydantic: Optional[types.evals.AgentInfo] = None,
parsed_agent_info: Optional[types.evals.AgentInfo] = None,
) -> Optional[dict[str, str]]:
"""Adds labels to the evaluation run."""
if agent_info_pydantic and agent_info_pydantic.agent_resource_name:
if parsed_agent_info and parsed_agent_info.agent_resource_name:
labels = labels or {}
labels["vertex-ai-evaluation-agent-engine-id"] = (
agent_info_pydantic.agent_resource_name.split("reasoningEngines/")[-1]
parsed_agent_info.agent_resource_name.split("reasoningEngines/")[-1]
)
return labels


def _get_candidate_name(
dataset: types.EvaluationDataset,
agent_info_pydantic: Optional[types.evals.AgentInfo] = None,
parsed_agent_info: Optional[types.evals.AgentInfo] = None,
) -> Optional[str]:
"""Internal helper to get candidate name."""
if agent_info_pydantic is not None and (
if parsed_agent_info is not None and (
dataset.candidate_name
and agent_info_pydantic
and agent_info_pydantic.name
and dataset.candidate_name != agent_info_pydantic.name
and parsed_agent_info
and parsed_agent_info.name
and dataset.candidate_name != parsed_agent_info.name
):
logger.warning(
"Evaluation dataset candidate_name and agent_info.name are different. Please make sure this is intended."
)
elif dataset.candidate_name is None and agent_info_pydantic:
return agent_info_pydantic.name
elif dataset.candidate_name is None and parsed_agent_info:
return parsed_agent_info.name
return dataset.candidate_name or None


Expand Down Expand Up @@ -2406,10 +2476,21 @@ def _create_evaluation_set_from_dataframe(

candidate_responses = []
if _evals_constant.RESPONSE in row or agent_data_obj or intermediate_events:
# Resolve the oneof conflict: prioritize agent_data over flat text
response_text = row.get(_evals_constant.RESPONSE) or None

if agent_data_obj and response_text:
logger.info(
"Both 'response' and 'agent_data' columns found in the evaluation dataset. "
"Prioritizing 'agent_data' and omitting 'response' text to satisfy "
"CandidateResponse protobuf oneof constraints."
)
response_text = None

candidate_responses.append(
types.CandidateResponse(
candidate=candidate_name or "Candidate 1",
text=row.get(_evals_constant.RESPONSE) or None,
text=response_text,
events=intermediate_events or None,
agent_data=agent_data_obj,
)
Expand Down
24 changes: 15 additions & 9 deletions vertexai/_genai/_evals_data_converters.py
Original file line number Diff line number Diff line change
Expand Up @@ -672,26 +672,32 @@ def get_dataset_converter(
raise ValueError("Unsupported dataset schema: %s" % dataset_schema)


def _get_first_part_text(content: genai_types.Content) -> str:
"""Safely extracts text from the first part of a content."""
def _get_content_text(content: genai_types.Content) -> str:
"""Safely extracts text from all parts of a content.

If the content has multiple parts, text from all parts is concatenated.
If a part is not text, it is ignored. If no text parts are found,
an empty string is returned.
"""
text_parts = []
if (
content
and hasattr(content, "parts")
and isinstance(content.parts, list)
and content.parts
):
first_part = content.parts[0]
if hasattr(first_part, "text"):
return str(first_part.text)
return ""
for part in content.parts:
if hasattr(part, "text") and part.text is not None:
text_parts.append(str(part.text))
return "".join(text_parts)


def _get_text_from_reference(
reference: Optional[types.ResponseCandidate],
) -> Optional[str]:
"""Safely extracts text from a reference field."""
if reference and hasattr(reference, "response") and reference.response:
return _get_first_part_text(reference.response)
return _get_content_text(reference.response)
return None


Expand All @@ -703,8 +709,8 @@ def _validate_case_consistency(
) -> None:
"""Logs warnings if prompt or reference mismatches occur."""
if base_case.prompt != current_case.prompt:
base_prompt_text_preview = _get_first_part_text(base_case.prompt)[:50]
current_prompt_text_preview = _get_first_part_text(current_case.prompt)[:50]
base_prompt_text_preview = _get_content_text(base_case.prompt)[:50]
current_prompt_text_preview = _get_content_text(current_case.prompt)[:50]
logger.warning(
"Prompt mismatch for case index %d between base dataset (0)"
" and dataset %d. Using prompt from base. Base prompt"
Expand Down
Loading
Loading