12 changes: 6 additions & 6 deletions vertexai/_genai/_evals_common.py
@@ -71,7 +71,7 @@


@contextlib.contextmanager
def _temp_logger_level(logger_name: str, level: int):
def _temp_logger_level(logger_name: str, level: int) -> None: # type: ignore[misc]
"""Temporarily sets the level of a logger."""
logger_instance = logging.getLogger(logger_name)
original_level = logger_instance.getEffectiveLevel()
@@ -95,7 +95,7 @@ def _get_api_client_with_location(
location,
api_client.location,
)
return vertexai.Client(
return vertexai.Client( # type: ignore[no-any-return]
project=api_client.project,
location=location,
credentials=api_client._credentials,
@@ -1798,10 +1798,10 @@ def _convert_evaluation_run_results(
api_client: BaseApiClient,
evaluation_run_results: types.EvaluationRunResults,
inference_configs: Optional[dict[str, types.EvaluationRunInferenceConfig]] = None,
) -> Union[list[types.EvaluationItem], types.EvaluationResult]:
) -> Optional[types.EvaluationResult]:
"""Retrieves an EvaluationItem from the EvaluationRunResults."""
if not evaluation_run_results or not evaluation_run_results.evaluation_set:
return []
return None

evals_module = evals.Evals(api_client_=api_client)
eval_set = evals_module.get_evaluation_set(
@@ -1823,10 +1823,10 @@ async def _convert_evaluation_run_results_async(
api_client: BaseApiClient,
evaluation_run_results: types.EvaluationRunResults,
inference_configs: Optional[dict[str, types.EvaluationRunInferenceConfig]] = None,
) -> Union[list[types.EvaluationItem], types.EvaluationResult]:
) -> Optional[types.EvaluationResult]:
"""Retrieves an EvaluationItem from the EvaluationRunResults."""
if not evaluation_run_results or not evaluation_run_results.evaluation_set:
return []
return None

evals_module = evals.AsyncEvals(api_client_=api_client)
eval_set = await evals_module.get_evaluation_set(
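For context on the first hunk: _temp_logger_level is a @contextlib.contextmanager generator, which this PR annotates as -> None behind a type: ignore[misc]. The conventional annotation for such a generator is Iterator[None]; a minimal standalone sketch of the same pattern (hypothetical name, not the code in this module) looks like this:

import contextlib
import logging
from typing import Iterator


@contextlib.contextmanager
def temp_logger_level(logger_name: str, level: int) -> Iterator[None]:
    """Temporarily sets a logger's level and restores the original on exit."""
    logger_instance = logging.getLogger(logger_name)
    original_level = logger_instance.getEffectiveLevel()
    logger_instance.setLevel(level)
    try:
        yield
    finally:
        logger_instance.setLevel(original_level)


# Usage: silence a noisy logger for the duration of a block.
with temp_logger_level("vertexai_genai.evals", logging.WARNING):
    pass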
8 changes: 4 additions & 4 deletions vertexai/_genai/_evals_metric_handlers.py
@@ -617,15 +617,15 @@ def _build_pointwise_input(

def _add_autorater_config(self, payload: dict[str, Any]) -> None:
"""Adds autorater config to the request payload if specified."""
autorater_config = {}
autorater_config: dict[str, Any] = {}
if self.metric.judge_model:
autorater_config["autorater_model"] = self.metric.judge_model
if self.metric.judge_model_generation_config:
autorater_config["generation_config"] = (
self.metric.judge_model_generation_config
)
if self.metric.judge_model_sampling_count:
autorater_config["sampling_count"] = self.metric.judge_model_sampling_count # type: ignore[assignment]
autorater_config["sampling_count"] = self.metric.judge_model_sampling_count

if not autorater_config:
return
@@ -989,11 +989,11 @@ def _build_request_payload(
agent_data=PredefinedMetricHandler._eval_case_to_agent_data(eval_case),
)

request_payload = {
request_payload: dict[str, Any] = {
"instance": instance_payload,
}

autorater_config = {}
autorater_config: dict[str, Any] = {}
if self.metric.judge_model:
autorater_config["autorater_model"] = self.metric.judge_model
if self.metric.judge_model_generation_config:
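For context on this file: the dict[str, Any] annotations address mypy's partial-type inference, where an empty dict's value type is pinned by the first assignment and a later assignment of a different type is reported with error code [assignment] (the error the removed type: ignore was suppressing). A minimal sketch with hypothetical values:

from typing import Any

# Without an explicit annotation, mypy infers the value type from the first
# assignment, so the later int assignment is flagged:
#
#     autorater_config = {}
#     autorater_config["autorater_model"] = "judge-model"  # inferred dict[str, str]
#     autorater_config["sampling_count"] = 4               # error: [assignment]
#
# Annotating the dict up front keeps every assignment valid:
autorater_config: dict[str, Any] = {}
autorater_config["autorater_model"] = "judge-model"  # hypothetical value
autorater_config["sampling_count"] = 4               # hypothetical value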
64 changes: 37 additions & 27 deletions vertexai/_genai/evals.py
@@ -36,7 +36,7 @@
try:
from google.adk.agents import LlmAgent
except ImportError:
LlmAgent = None # type: ignore[assignment]
LlmAgent = None


logger = logging.getLogger("vertexai_genai.evals")
@@ -1216,10 +1216,10 @@ def evaluate(
types.EvaluationDatasetOrDict,
list[types.EvaluationDatasetOrDict],
],
metrics: list[types.MetricOrDict] = None,
metrics: Optional[list[types.MetricOrDict]] = None,
location: Optional[str] = None,
config: Optional[types.EvaluateMethodConfigOrDict] = None,
**kwargs,
**kwargs: Any,
) -> types.EvaluationResult:
"""Evaluates candidate responses in the provided dataset(s) using the specified metrics.

@@ -1625,24 +1625,28 @@ def create_evaluation_run(
raise ValueError(
"At most one of agent_info or inference_configs can be provided."
)
if agent_info and isinstance(agent_info, dict):
agent_info = types.evals.AgentInfo.model_validate(agent_info)
if type(dataset).__name__ == "EvaluationDataset":
agent_info_pydantic: types.evals.AgentInfo = types.evals.AgentInfo()
if agent_info:
if isinstance(agent_info, dict):
agent_info_pydantic = types.evals.AgentInfo.model_validate(agent_info)
else:
agent_info_pydantic = agent_info
if isinstance(dataset, types.EvaluationDataset):
if dataset.eval_dataset_df is None:
raise ValueError(
"EvaluationDataset must have eval_dataset_df populated."
)
if (
if agent_info_pydantic is not None and (
dataset.candidate_name
and agent_info
and agent_info.name
and dataset.candidate_name != agent_info.name
and agent_info_pydantic
and agent_info_pydantic.name
and dataset.candidate_name != agent_info_pydantic.name
):
logger.warning(
"Evaluation dataset candidate_name and agent_info.name are different. Please make sure this is intended."
)
elif dataset.candidate_name is None and agent_info:
dataset.candidate_name = agent_info.name
elif dataset.candidate_name is None and agent_info_pydantic:
dataset.candidate_name = agent_info_pydantic.name
eval_set = _evals_common._create_evaluation_set_from_dataframe(
self._api_client, dest, dataset.eval_dataset_df, dataset.candidate_name
)
@@ -1656,20 +1660,26 @@
evaluation_config = types.EvaluationRunConfig(
output_config=output_config, metrics=resolved_metrics
)
if agent_info:
if agent_info_pydantic and agent_info_pydantic.name is not None:
inference_configs = {}
inference_configs[agent_info.name] = types.EvaluationRunInferenceConfig(
agent_config=types.EvaluationRunAgentConfig(
developer_instruction=genai_types.Content(
parts=[genai_types.Part(text=agent_info.instruction)]
),
tools=agent_info.tool_declarations,
inference_configs[agent_info_pydantic.name] = (
types.EvaluationRunInferenceConfig(
agent_config=types.EvaluationRunAgentConfig(
developer_instruction=genai_types.Content(
parts=[
genai_types.Part(text=agent_info_pydantic.instruction)
]
),
tools=agent_info_pydantic.tool_declarations,
)
)
)
if agent_info.agent_resource_name:
if agent_info_pydantic.agent_resource_name:
labels = labels or {}
labels["vertex-ai-evaluation-agent-engine-id"] = (
agent_info.agent_resource_name.split("reasoningEngines/")[-1]
agent_info_pydantic.agent_resource_name.split("reasoningEngines/")[
-1
]
)
if not name:
name = f"evaluation_run_{uuid.uuid4()}"
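For context on the hunk above: agent_info may arrive as a dict or as a types.evals.AgentInfo model, and the new code normalizes it into a single pydantic value (agent_info_pydantic) before use. A self-contained sketch of that shape, with AgentInfo reduced to a hypothetical stand-in model:

from typing import Optional, Union

from pydantic import BaseModel


class AgentInfo(BaseModel):
    """Hypothetical stand-in for types.evals.AgentInfo (fields simplified)."""

    name: Optional[str] = None
    instruction: Optional[str] = None


def normalize_agent_info(
    agent_info: Optional[Union[AgentInfo, dict]] = None,
) -> AgentInfo:
    """Coerces dict input via model_validate; falls back to an empty model."""
    if not agent_info:
        return AgentInfo()
    if isinstance(agent_info, dict):
        return AgentInfo.model_validate(agent_info)
    return agent_info


# Usage (hypothetical values):
info = normalize_agent_info({"name": "my-agent", "instruction": "Answer briefly."})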
@@ -2487,12 +2497,12 @@ async def create_evaluation_run(
)
if agent_info and isinstance(agent_info, dict):
agent_info = types.evals.AgentInfo.model_validate(agent_info)
if type(dataset).__name__ == "EvaluationDataset":
if isinstance(dataset, types.EvaluationDataset):
if dataset.eval_dataset_df is None:
raise ValueError(
"EvaluationDataset must have eval_dataset_df populated."
)
if (
if agent_info is not None and (
dataset.candidate_name
and agent_info.name
and dataset.candidate_name != agent_info.name
@@ -2515,7 +2525,7 @@
evaluation_config = types.EvaluationRunConfig(
output_config=output_config, metrics=resolved_metrics
)
if agent_info:
if agent_info and agent_info.name is not None:
inference_configs = {}
inference_configs[agent_info.name] = types.EvaluationRunInferenceConfig(
agent_config=types.EvaluationRunAgentConfig(
@@ -2533,7 +2543,7 @@
if not name:
name = f"evaluation_run_{uuid.uuid4()}"

result = await self._create_evaluation_run( # type: ignore[no-any-return]
result = await self._create_evaluation_run(
name=name,
display_name=display_name or name,
data_source=dataset,
@@ -2645,7 +2655,7 @@ async def create_evaluation_item(
Returns:
The evaluation item.
"""
result = await self._create_evaluation_item( # type: ignore[no-any-return]
result = await self._create_evaluation_item(
evaluation_item_type=evaluation_item_type,
gcs_uri=gcs_uri,
display_name=display_name,
@@ -2676,7 +2686,7 @@ async def create_evaluation_set(
Returns:
The evaluation set.
"""
result = await self._create_evaluation_set( # type: ignore[no-any-return]
result = await self._create_evaluation_set(
evaluation_items=evaluation_items,
display_name=display_name,
config=config,
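For context on evals.py overall: several hunks replace type(dataset).__name__ == "EvaluationDataset" with isinstance(dataset, types.EvaluationDataset). Besides being more idiomatic, isinstance lets a type checker narrow the variable inside the branch. A minimal sketch with a hypothetical stand-in class:

from typing import Optional


class EvaluationDataset:
    """Hypothetical stand-in for types.EvaluationDataset."""

    def __init__(self, eval_dataset_df: Optional[object] = None) -> None:
        self.eval_dataset_df = eval_dataset_df


def validate_dataset(dataset: object) -> None:
    if isinstance(dataset, EvaluationDataset):
        # isinstance() narrows `dataset` to EvaluationDataset for mypy, so the
        # attribute access below is type-checked.
        if dataset.eval_dataset_df is None:
            raise ValueError("EvaluationDataset must have eval_dataset_df populated.")
    # A string comparison such as type(dataset).__name__ == "EvaluationDataset"
    # performs no narrowing, so mypy would reject the same attribute access on
    # an object-typed parameter.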