From 20843dfef9796b92d35f5de8e71263dea860ac69 Mon Sep 17 00:00:00 2001
From: Sara Robinson
Date: Fri, 30 Jan 2026 11:05:44 -0800
Subject: [PATCH] chore: Resolve all evals mypy errors

PiperOrigin-RevId: 863313260
---
 vertexai/_genai/_evals_common.py          | 12 ++---
 vertexai/_genai/_evals_metric_handlers.py |  8 +--
 vertexai/_genai/evals.py                  | 64 +++++++++++++----------
 3 files changed, 47 insertions(+), 37 deletions(-)

diff --git a/vertexai/_genai/_evals_common.py b/vertexai/_genai/_evals_common.py
index 5d9a854b81..f33320324a 100644
--- a/vertexai/_genai/_evals_common.py
+++ b/vertexai/_genai/_evals_common.py
@@ -71,7 +71,7 @@
 
 
 @contextlib.contextmanager
-def _temp_logger_level(logger_name: str, level: int):
+def _temp_logger_level(logger_name: str, level: int) -> None:  # type: ignore[misc]
     """Temporarily sets the level of a logger."""
     logger_instance = logging.getLogger(logger_name)
     original_level = logger_instance.getEffectiveLevel()
@@ -95,7 +95,7 @@ def _get_api_client_with_location(
             location,
             api_client.location,
         )
-    return vertexai.Client(
+    return vertexai.Client(  # type: ignore[no-any-return]
         project=api_client.project,
         location=location,
         credentials=api_client._credentials,
@@ -1798,10 +1798,10 @@ def _convert_evaluation_run_results(
     api_client: BaseApiClient,
     evaluation_run_results: types.EvaluationRunResults,
     inference_configs: Optional[dict[str, types.EvaluationRunInferenceConfig]] = None,
-) -> Union[list[types.EvaluationItem], types.EvaluationResult]:
+) -> Optional[types.EvaluationResult]:
     """Retrieves an EvaluationItem from the EvaluationRunResults."""
     if not evaluation_run_results or not evaluation_run_results.evaluation_set:
-        return []
+        return None
 
     evals_module = evals.Evals(api_client_=api_client)
     eval_set = evals_module.get_evaluation_set(
@@ -1823,10 +1823,10 @@ async def _convert_evaluation_run_results_async(
     api_client: BaseApiClient,
     evaluation_run_results: types.EvaluationRunResults,
     inference_configs: Optional[dict[str, types.EvaluationRunInferenceConfig]] = None,
-) -> Union[list[types.EvaluationItem], types.EvaluationResult]:
+) -> Optional[types.EvaluationResult]:
     """Retrieves an EvaluationItem from the EvaluationRunResults."""
     if not evaluation_run_results or not evaluation_run_results.evaluation_set:
-        return []
+        return None
 
     evals_module = evals.AsyncEvals(api_client_=api_client)
     eval_set = await evals_module.get_evaluation_set(
diff --git a/vertexai/_genai/_evals_metric_handlers.py b/vertexai/_genai/_evals_metric_handlers.py
index 9475846fdb..9f4494c74d 100644
--- a/vertexai/_genai/_evals_metric_handlers.py
+++ b/vertexai/_genai/_evals_metric_handlers.py
@@ -617,7 +617,7 @@ def _build_pointwise_input(
 
     def _add_autorater_config(self, payload: dict[str, Any]) -> None:
         """Adds autorater config to the request payload if specified."""
-        autorater_config = {}
+        autorater_config: dict[str, Any] = {}
         if self.metric.judge_model:
             autorater_config["autorater_model"] = self.metric.judge_model
         if self.metric.judge_model_generation_config:
@@ -625,7 +625,7 @@ def _add_autorater_config(self, payload: dict[str, Any]) -> None:
                 self.metric.judge_model_generation_config
             )
         if self.metric.judge_model_sampling_count:
-            autorater_config["sampling_count"] = self.metric.judge_model_sampling_count  # type: ignore[assignment]
+            autorater_config["sampling_count"] = self.metric.judge_model_sampling_count
 
         if not autorater_config:
             return
@@ -989,11 +989,11 @@ def _build_request_payload(
             agent_data=PredefinedMetricHandler._eval_case_to_agent_data(eval_case),
         )
 
-        request_payload = {
+        request_payload: dict[str, Any] = {
            "instance": instance_payload,
        }
 
-        autorater_config = {}
+        autorater_config: dict[str, Any] = {}
         if self.metric.judge_model:
             autorater_config["autorater_model"] = self.metric.judge_model
         if self.metric.judge_model_generation_config:
diff --git a/vertexai/_genai/evals.py b/vertexai/_genai/evals.py
index 67bbf44ffa..3632628b87 100644
--- a/vertexai/_genai/evals.py
+++ b/vertexai/_genai/evals.py
@@ -36,7 +36,7 @@
 try:
     from google.adk.agents import LlmAgent
 except ImportError:
-    LlmAgent = None  # type: ignore[assignment]
+    LlmAgent = None
 
 logger = logging.getLogger("vertexai_genai.evals")
 
@@ -1216,10 +1216,10 @@ def evaluate(
             types.EvaluationDatasetOrDict,
             list[types.EvaluationDatasetOrDict],
         ],
-        metrics: list[types.MetricOrDict] = None,
+        metrics: Optional[list[types.MetricOrDict]] = None,
         location: Optional[str] = None,
         config: Optional[types.EvaluateMethodConfigOrDict] = None,
-        **kwargs,
+        **kwargs: Any,
     ) -> types.EvaluationResult:
         """Evaluates candidate responses in the provided dataset(s) using the specified metrics.
 
@@ -1625,24 +1625,28 @@ def create_evaluation_run(
             raise ValueError(
                 "At most one of agent_info or inference_configs can be provided."
             )
-        if agent_info and isinstance(agent_info, dict):
-            agent_info = types.evals.AgentInfo.model_validate(agent_info)
-        if type(dataset).__name__ == "EvaluationDataset":
+        agent_info_pydantic: types.evals.AgentInfo = types.evals.AgentInfo()
+        if agent_info:
+            if isinstance(agent_info, dict):
+                agent_info_pydantic = types.evals.AgentInfo.model_validate(agent_info)
+            else:
+                agent_info_pydantic = agent_info
+        if isinstance(dataset, types.EvaluationDataset):
             if dataset.eval_dataset_df is None:
                 raise ValueError(
                     "EvaluationDataset must have eval_dataset_df populated."
                 )
-            if (
+            if agent_info_pydantic is not None and (
                 dataset.candidate_name
-                and agent_info
-                and agent_info.name
-                and dataset.candidate_name != agent_info.name
+                and agent_info_pydantic
+                and agent_info_pydantic.name
+                and dataset.candidate_name != agent_info_pydantic.name
             ):
                 logger.warning(
                     "Evaluation dataset candidate_name and agent_info.name are different. Please make sure this is intended."
                 )
-            elif dataset.candidate_name is None and agent_info:
-                dataset.candidate_name = agent_info.name
+            elif dataset.candidate_name is None and agent_info_pydantic:
+                dataset.candidate_name = agent_info_pydantic.name
             eval_set = _evals_common._create_evaluation_set_from_dataframe(
                 self._api_client, dest, dataset.eval_dataset_df, dataset.candidate_name
             )
@@ -1656,20 +1660,26 @@ def create_evaluation_run(
         evaluation_config = types.EvaluationRunConfig(
             output_config=output_config, metrics=resolved_metrics
         )
-        if agent_info:
+        if agent_info_pydantic and agent_info_pydantic.name is not None:
             inference_configs = {}
-            inference_configs[agent_info.name] = types.EvaluationRunInferenceConfig(
-                agent_config=types.EvaluationRunAgentConfig(
-                    developer_instruction=genai_types.Content(
-                        parts=[genai_types.Part(text=agent_info.instruction)]
-                    ),
-                    tools=agent_info.tool_declarations,
+            inference_configs[agent_info_pydantic.name] = (
+                types.EvaluationRunInferenceConfig(
+                    agent_config=types.EvaluationRunAgentConfig(
+                        developer_instruction=genai_types.Content(
+                            parts=[
+                                genai_types.Part(text=agent_info_pydantic.instruction)
+                            ]
+                        ),
+                        tools=agent_info_pydantic.tool_declarations,
+                    )
                 )
             )
-            if agent_info.agent_resource_name:
+            if agent_info_pydantic.agent_resource_name:
                 labels = labels or {}
                 labels["vertex-ai-evaluation-agent-engine-id"] = (
-                    agent_info.agent_resource_name.split("reasoningEngines/")[-1]
+                    agent_info_pydantic.agent_resource_name.split("reasoningEngines/")[
+                        -1
+                    ]
                 )
         if not name:
             name = f"evaluation_run_{uuid.uuid4()}"
@@ -2487,12 +2497,12 @@ async def create_evaluation_run(
             )
         if agent_info and isinstance(agent_info, dict):
             agent_info = types.evals.AgentInfo.model_validate(agent_info)
-        if type(dataset).__name__ == "EvaluationDataset":
+        if isinstance(dataset, types.EvaluationDataset):
             if dataset.eval_dataset_df is None:
                 raise ValueError(
                     "EvaluationDataset must have eval_dataset_df populated."
                 )
-            if (
+            if agent_info is not None and (
                 dataset.candidate_name
                 and agent_info.name
                 and dataset.candidate_name != agent_info.name
@@ -2515,7 +2525,7 @@ async def create_evaluation_run(
         evaluation_config = types.EvaluationRunConfig(
             output_config=output_config, metrics=resolved_metrics
         )
-        if agent_info:
+        if agent_info and agent_info.name is not None:
             inference_configs = {}
             inference_configs[agent_info.name] = types.EvaluationRunInferenceConfig(
                 agent_config=types.EvaluationRunAgentConfig(
@@ -2533,7 +2543,7 @@ async def create_evaluation_run(
         if not name:
             name = f"evaluation_run_{uuid.uuid4()}"
 
-        result = await self._create_evaluation_run(  # type: ignore[no-any-return]
+        result = await self._create_evaluation_run(
            name=name,
            display_name=display_name or name,
            data_source=dataset,
@@ -2645,7 +2655,7 @@ async def create_evaluation_item(
         Returns:
             The evaluation item.
         """
-        result = await self._create_evaluation_item(  # type: ignore[no-any-return]
+        result = await self._create_evaluation_item(
            evaluation_item_type=evaluation_item_type,
             gcs_uri=gcs_uri,
             display_name=display_name,
@@ -2676,7 +2686,7 @@ async def create_evaluation_set(
         Returns:
             The evaluation set.
         """
-        result = await self._create_evaluation_set(  # type: ignore[no-any-return]
+        result = await self._create_evaluation_set(
             evaluation_items=evaluation_items,
             display_name=display_name,
             config=config,