diff --git a/src/google/adk/evaluation/eval_metrics.py b/src/google/adk/evaluation/eval_metrics.py
index 50c3473c3a..aa1af8ac39 100644
--- a/src/google/adk/evaluation/eval_metrics.py
+++ b/src/google/adk/evaluation/eval_metrics.py
@@ -115,6 +115,19 @@ class BaseCriterion(BaseModel):
       description="The threshold to be used by the metric.",
   )
 
+  include_intermediate_responses_in_final: bool = Field(
+      default=False,
+      description=(
+          "Whether to evaluate the full agent response including intermediate"
+          " natural language text (e.g. text emitted before tool calls) in"
+          " addition to the final response. By default, only the final"
+          " response text is sent to the judge. When True, text from all"
+          " intermediate invocation events is concatenated with the final"
+          " response before evaluation. This is useful for agents that emit"
+          " text both before and after tool calls within a single invocation."
+      ),
+  )
+
 
 class LlmAsAJudgeCriterion(BaseCriterion):
   """Criterion when using LLM-As-A-Judge metric."""
diff --git a/src/google/adk/evaluation/llm_as_judge_utils.py b/src/google/adk/evaluation/llm_as_judge_utils.py
index cf1309ca38..0986f2bed0 100644
--- a/src/google/adk/evaluation/llm_as_judge_utils.py
+++ b/src/google/adk/evaluation/llm_as_judge_utils.py
@@ -26,6 +26,8 @@
 from .common import EvalBaseModel
 from .eval_case import get_all_tool_calls_with_responses
 from .eval_case import IntermediateDataType
+from .eval_case import Invocation
+from .eval_case import InvocationEvents
 from .eval_metrics import RubricScore
 from .evaluator import EvalStatus
 
@@ -44,8 +46,42 @@ class Label(enum.Enum):
 
 
 def get_text_from_content(
-    content: Optional[genai_types.Content],
+    content: Optional[Union[genai_types.Content, Invocation]],
+    *,
+    include_intermediate_responses_in_final: bool = False,
 ) -> Optional[str]:
+  """Returns the text carried by a `Content` or by an `Invocation`.
+
+  Args:
+    content: A `Content`, whose text parts are joined with newlines, or an
+      `Invocation`, whose final response supplies the text.
+    include_intermediate_responses_in_final: Only consulted for an
+      `Invocation`. When True, text found in the invocation's intermediate
+      events (e.g. natural language emitted before tool calls) is placed, in
+      event order, ahead of the final response text.
+
+  Returns:
+    The selected text joined with newlines. None if a `Content` is missing or
+    has no parts, or if the flag is set and the invocation holds no text.
+  """
+  if isinstance(content, Invocation):
+    if not include_intermediate_responses_in_final:
+      # Default: an Invocation is treated exactly like its final response.
+      return get_text_from_content(content.final_response)
+
+    intermediate_data = content.intermediate_data
+    # Only `InvocationEvents` contributes text; any other value is skipped.
+    events = (
+        intermediate_data.invocation_events
+        if isinstance(intermediate_data, InvocationEvents)
+        else []
+    )
+    # Intermediate texts keep event order; the final response goes last.
+    candidates = [get_text_from_content(event.content) for event in events]
+    candidates.append(get_text_from_content(content.final_response))
+    texts = [text for text in candidates if text]
+    return "\n".join(texts) if texts else None
+
   if content and content.parts:
     return "\n".join([p.text for p in content.parts if p.text])
 
diff --git a/src/google/adk/evaluation/rubric_based_final_response_quality_v1.py b/src/google/adk/evaluation/rubric_based_final_response_quality_v1.py
index df01aba4ff..135b2b9593 100644
--- a/src/google/adk/evaluation/rubric_based_final_response_quality_v1.py
+++ b/src/google/adk/evaluation/rubric_based_final_response_quality_v1.py
@@ -274,7 +274,18 @@ def format_auto_rater_prompt(
     """Returns the autorater prompt."""
     self.create_effective_rubrics_list(actual_invocation.rubrics)
     user_input = get_text_from_content(actual_invocation.user_content)
-    final_response = get_text_from_content(actual_invocation.final_response)
+
+    criterion = self._eval_metric.criterion
+    include_intermediate = getattr(
+        criterion, "include_intermediate_responses_in_final", False
+    )
+    final_response = (
+        get_text_from_content(
+            actual_invocation,
+            include_intermediate_responses_in_final=include_intermediate,
+        )
+        or ""
+    )
 
     rubrics_text = "\n".join([
         f"* {r.rubric_content.text_property}"
diff --git a/tests/unittests/evaluation/test_llm_as_judge_utils.py b/tests/unittests/evaluation/test_llm_as_judge_utils.py
index e5327cf454..4b53a2dc43 100644
--- a/tests/unittests/evaluation/test_llm_as_judge_utils.py
+++ b/tests/unittests/evaluation/test_llm_as_judge_utils.py
@@ -19,6 +19,7 @@
 from google.adk.evaluation.app_details import AgentDetails
 from google.adk.evaluation.app_details import AppDetails
 from google.adk.evaluation.eval_case import IntermediateData
+from google.adk.evaluation.eval_case import Invocation
 from google.adk.evaluation.eval_case import InvocationEvent
 from google.adk.evaluation.eval_case import InvocationEvents
 from google.adk.evaluation.eval_rubrics import RubricScore
@@ -88,6 +89,44 @@ def test_get_text_from_content_with_mixed_parts():
   assert get_text_from_content(content) == "Hello\nWorld"
 
 
+def test_get_text_from_content_with_invocation_include_intermediate_responses_in_final():
+  """Checks get_text_from_content on an Invocation for both flag values."""
+  intermediate_text = "Let me check."
+  final_response_text = "Done."
+
+  def _text_content(text):
+    return genai_types.Content(parts=[genai_types.Part(text=text)])
+
+  tool_call_content = genai_types.Content(
+      parts=[
+          genai_types.Part(function_call=genai_types.FunctionCall(name="t"))
+      ]
+  )
+  invocation = Invocation(
+      user_content=_text_content("user"),
+      intermediate_data=InvocationEvents(
+          invocation_events=[
+              InvocationEvent(
+                  author="agent", content=_text_content(intermediate_text)
+              ),
+              # Carries no text, so it must not contribute to the result.
+              InvocationEvent(author="tool", content=tool_call_content),
+          ]
+      ),
+      final_response=_text_content(final_response_text),
+  )
+
+  # Default (flag off): the intermediate text is left out.
+  only_final = get_text_from_content(invocation)
+  assert only_final == final_response_text
+
+  # Flag on: the intermediate text precedes the final response text.
+  with_intermediate = get_text_from_content(
+      invocation, include_intermediate_responses_in_final=True
+  )
+  assert with_intermediate == f"{intermediate_text}\n{final_response_text}"
+
+
 def test_get_eval_status_with_none_score():
   """Tests get_eval_status returns NOT_EVALUATED for a None score."""
   assert get_eval_status(score=None, threshold=0.5) == EvalStatus.NOT_EVALUATED