From 29e560fdec230b5fdfbfd523411341e251090371 Mon Sep 17 00:00:00 2001 From: Sid Gupta Date: Mon, 13 Apr 2026 23:06:20 -0700 Subject: [PATCH 1/7] feat(eval): add evaluate_full_response option to rubric-based evaluation --- src/google/adk/evaluation/eval_metrics.py | 19 ++++++++++ .../rubric_based_final_response_quality_v1.py | 35 ++++++++++++++++++- 2 files changed, 53 insertions(+), 1 deletion(-) diff --git a/src/google/adk/evaluation/eval_metrics.py b/src/google/adk/evaluation/eval_metrics.py index eb7c7e36cb..5895ad807a 100644 --- a/src/google/adk/evaluation/eval_metrics.py +++ b/src/google/adk/evaluation/eval_metrics.py @@ -61,6 +61,12 @@ class PrebuiltMetrics(Enum): PER_TURN_USER_SIMULATOR_QUALITY_V1 = "per_turn_user_simulator_quality_v1" + MULTI_TURN_TASK_SUCCESS_V1 = "multi_turn_task_success_v1" + + MULTI_TURN_TRAJECTORY_QUALITY_V1 = "multi_turn_trajectory_quality_v1" + + MULTI_TURN_TOOL_USE_QUALITY_V1 = "multi_turn_tool_use_quality_v1" + MetricName: TypeAlias = Union[str, PrebuiltMetrics] Threshold: TypeAlias = float @@ -138,6 +144,19 @@ class RubricsBasedCriterion(BaseCriterion): ), ) + evaluate_full_response: bool = Field( + default=False, + description=( + "Whether to evaluate the full agent response including intermediate" + " natural language text (e.g. text emitted before tool calls) in" + " addition to the final response. By default, only the final" + " response text is sent to the judge. When True, text from all" + " intermediate invocation events is concatenated with the final" + " response before evaluation. This is useful for agents that emit" + " text both before and after tool calls within a single invocation." + ), + ) + class HallucinationsCriterion(BaseCriterion): """Criterion to use when evaluating agents response for hallucinations.""" diff --git a/src/google/adk/evaluation/rubric_based_final_response_quality_v1.py b/src/google/adk/evaluation/rubric_based_final_response_quality_v1.py index df01aba4ff..5dc6320da3 100644 --- a/src/google/adk/evaluation/rubric_based_final_response_quality_v1.py +++ b/src/google/adk/evaluation/rubric_based_final_response_quality_v1.py @@ -274,7 +274,18 @@ def format_auto_rater_prompt( """Returns the autorater prompt.""" self.create_effective_rubrics_list(actual_invocation.rubrics) user_input = get_text_from_content(actual_invocation.user_content) - final_response = get_text_from_content(actual_invocation.final_response) + + # When evaluate_full_response is enabled, include text from intermediate + # invocation events (e.g. text emitted before tool calls) in addition to + # the final response. This is useful for agents that stream text, call + # tools, then stream more text within a single invocation. + criterion = self._eval_metric.criterion + evaluate_full = getattr(criterion, "evaluate_full_response", False) + + if evaluate_full: + final_response = self._get_full_response_text(actual_invocation) + else: + final_response = get_text_from_content(actual_invocation.final_response) rubrics_text = "\n".join([ f"* {r.rubric_content.text_property}" @@ -310,3 +321,25 @@ def format_auto_rater_prompt( ) return auto_rater_prompt + + @staticmethod + def _get_full_response_text(invocation: Invocation) -> str: + """Concatenates all NL text from invocation events and the final response. + + When an agent emits text before a tool call (e.g. presenting a plan), + that text is stored in intermediate_data.invocation_events but not in + final_response. This method collects text from both sources to give the + judge a complete picture of the agent's output. + """ + parts = [] + if invocation.intermediate_data and isinstance( + invocation.intermediate_data, InvocationEvents + ): + for evt in invocation.intermediate_data.invocation_events: + text = get_text_from_content(evt.content) + if text: + parts.append(text) + final_text = get_text_from_content(invocation.final_response) + if final_text: + parts.append(final_text) + return "\n\n".join(parts) From 95f314808ab359c8d94d442cc12e188b71a5bbf2 Mon Sep 17 00:00:00 2001 From: Sid Gupta Date: Tue, 14 Apr 2026 14:33:25 -0700 Subject: [PATCH 2/7] fix --- .../adk/evaluation/rubric_based_final_response_quality_v1.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/google/adk/evaluation/rubric_based_final_response_quality_v1.py b/src/google/adk/evaluation/rubric_based_final_response_quality_v1.py index 5dc6320da3..d8cdeb452b 100644 --- a/src/google/adk/evaluation/rubric_based_final_response_quality_v1.py +++ b/src/google/adk/evaluation/rubric_based_final_response_quality_v1.py @@ -285,7 +285,7 @@ def format_auto_rater_prompt( if evaluate_full: final_response = self._get_full_response_text(actual_invocation) else: - final_response = get_text_from_content(actual_invocation.final_response) + final_response = get_text_from_content(actual_invocation.final_response) or "" rubrics_text = "\n".join([ f"* {r.rubric_content.text_property}" From c84f65311bb6bf88a03b94fa11370c448d25debe Mon Sep 17 00:00:00 2001 From: Sid Gupta Date: Thu, 16 Apr 2026 21:08:59 -0700 Subject: [PATCH 3/7] rename field, add to BaseCriterion class --- src/google/adk/evaluation/eval_metrics.py | 26 ++++++------ .../adk/evaluation/llm_as_judge_utils.py | 34 +++++++++++++-- .../rubric_based_final_response_quality_v1.py | 42 +++++-------------- 3 files changed, 54 insertions(+), 48 deletions(-) diff --git a/src/google/adk/evaluation/eval_metrics.py b/src/google/adk/evaluation/eval_metrics.py index 5895ad807a..aa1af8ac39 100644 --- a/src/google/adk/evaluation/eval_metrics.py +++ b/src/google/adk/evaluation/eval_metrics.py @@ -115,6 +115,19 @@ class BaseCriterion(BaseModel): description="The threshold to be used by the metric.", ) + include_intermediate_responses_in_final: bool = Field( + default=False, + description=( + "Whether to evaluate the full agent response including intermediate" + " natural language text (e.g. text emitted before tool calls) in" + " addition to the final response. By default, only the final" + " response text is sent to the judge. When True, text from all" + " intermediate invocation events is concatenated with the final" + " response before evaluation. This is useful for agents that emit" + " text both before and after tool calls within a single invocation." + ), + ) + class LlmAsAJudgeCriterion(BaseCriterion): """Criterion when using LLM-As-A-Judge metric.""" @@ -144,19 +157,6 @@ class RubricsBasedCriterion(BaseCriterion): ), ) - evaluate_full_response: bool = Field( - default=False, - description=( - "Whether to evaluate the full agent response including intermediate" - " natural language text (e.g. text emitted before tool calls) in" - " addition to the final response. By default, only the final" - " response text is sent to the judge. When True, text from all" - " intermediate invocation events is concatenated with the final" - " response before evaluation. This is useful for agents that emit" - " text both before and after tool calls within a single invocation." - ), - ) - class HallucinationsCriterion(BaseCriterion): """Criterion to use when evaluating agents response for hallucinations.""" diff --git a/src/google/adk/evaluation/llm_as_judge_utils.py b/src/google/adk/evaluation/llm_as_judge_utils.py index cf1309ca38..72fdcdcfac 100644 --- a/src/google/adk/evaluation/llm_as_judge_utils.py +++ b/src/google/adk/evaluation/llm_as_judge_utils.py @@ -26,6 +26,8 @@ from .common import EvalBaseModel from .eval_case import get_all_tool_calls_with_responses from .eval_case import IntermediateDataType +from .eval_case import Invocation +from .eval_case import InvocationEvents from .eval_metrics import RubricScore from .evaluator import EvalStatus @@ -44,10 +46,36 @@ class Label(enum.Enum): def get_text_from_content( - content: Optional[genai_types.Content], + source: Optional[Union[genai_types.Content, Invocation]], + *, + include_intermediate_responses_in_final: bool = False, ) -> Optional[str]: - if content and content.parts: - return "\n".join([p.text for p in content.parts if p.text]) + """Extracts text from a `Content` or an `Invocation`. + + When `source` is a `Content`, returns the concatenated text of its parts. + + When `source` is an `Invocation`, returns the text of the invocation's final + response. If `include_intermediate_responses_in_final` is True, text from + intermediate invocation events (e.g. natural language emitted before tool + calls) is concatenated with the final response text. + """ + if source is None: + return None + if isinstance(source, Invocation): + if not include_intermediate_responses_in_final: + return get_text_from_content(source.final_response) + parts: list[str] = [] + if isinstance(source.intermediate_data, InvocationEvents): + for event in source.intermediate_data.invocation_events: + text = get_text_from_content(event.content) + if text: + parts.append(text) + final_text = get_text_from_content(source.final_response) + if final_text: + parts.append(final_text) + return "\n\n".join(parts) if parts else None + if source.parts: + return "\n".join([p.text for p in source.parts if p.text]) def get_eval_status(score: Optional[float], threshold: float) -> EvalStatus: diff --git a/src/google/adk/evaluation/rubric_based_final_response_quality_v1.py b/src/google/adk/evaluation/rubric_based_final_response_quality_v1.py index d8cdeb452b..135b2b9593 100644 --- a/src/google/adk/evaluation/rubric_based_final_response_quality_v1.py +++ b/src/google/adk/evaluation/rubric_based_final_response_quality_v1.py @@ -275,17 +275,17 @@ def format_auto_rater_prompt( self.create_effective_rubrics_list(actual_invocation.rubrics) user_input = get_text_from_content(actual_invocation.user_content) - # When evaluate_full_response is enabled, include text from intermediate - # invocation events (e.g. text emitted before tool calls) in addition to - # the final response. This is useful for agents that stream text, call - # tools, then stream more text within a single invocation. criterion = self._eval_metric.criterion - evaluate_full = getattr(criterion, "evaluate_full_response", False) - - if evaluate_full: - final_response = self._get_full_response_text(actual_invocation) - else: - final_response = get_text_from_content(actual_invocation.final_response) or "" + include_intermediate = getattr( + criterion, "include_intermediate_responses_in_final", False + ) + final_response = ( + get_text_from_content( + actual_invocation, + include_intermediate_responses_in_final=include_intermediate, + ) + or "" + ) rubrics_text = "\n".join([ f"* {r.rubric_content.text_property}" @@ -321,25 +321,3 @@ def format_auto_rater_prompt( ) return auto_rater_prompt - - @staticmethod - def _get_full_response_text(invocation: Invocation) -> str: - """Concatenates all NL text from invocation events and the final response. - - When an agent emits text before a tool call (e.g. presenting a plan), - that text is stored in intermediate_data.invocation_events but not in - final_response. This method collects text from both sources to give the - judge a complete picture of the agent's output. - """ - parts = [] - if invocation.intermediate_data and isinstance( - invocation.intermediate_data, InvocationEvents - ): - for evt in invocation.intermediate_data.invocation_events: - text = get_text_from_content(evt.content) - if text: - parts.append(text) - final_text = get_text_from_content(invocation.final_response) - if final_text: - parts.append(final_text) - return "\n\n".join(parts) From aad242f18dae1fbb466af0aa1fc29b7b343a28ef Mon Sep 17 00:00:00 2001 From: Sid Gupta Date: Thu, 16 Apr 2026 21:18:17 -0700 Subject: [PATCH 4/7] rename --- .../adk/evaluation/llm_as_judge_utils.py | 22 +++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/src/google/adk/evaluation/llm_as_judge_utils.py b/src/google/adk/evaluation/llm_as_judge_utils.py index 72fdcdcfac..b9c6da2d0c 100644 --- a/src/google/adk/evaluation/llm_as_judge_utils.py +++ b/src/google/adk/evaluation/llm_as_judge_utils.py @@ -46,36 +46,36 @@ class Label(enum.Enum): def get_text_from_content( - source: Optional[Union[genai_types.Content, Invocation]], + content: Optional[Union[genai_types.Content, Invocation]], *, include_intermediate_responses_in_final: bool = False, ) -> Optional[str]: """Extracts text from a `Content` or an `Invocation`. - When `source` is a `Content`, returns the concatenated text of its parts. + When `content` is a `Content`, returns the concatenated text of its parts. - When `source` is an `Invocation`, returns the text of the invocation's final + When `content` is an `Invocation`, returns the text of the invocation's final response. If `include_intermediate_responses_in_final` is True, text from intermediate invocation events (e.g. natural language emitted before tool calls) is concatenated with the final response text. """ - if source is None: + if content is None: return None - if isinstance(source, Invocation): + if isinstance(content, Invocation): if not include_intermediate_responses_in_final: - return get_text_from_content(source.final_response) + return get_text_from_content(content.final_response) parts: list[str] = [] - if isinstance(source.intermediate_data, InvocationEvents): - for event in source.intermediate_data.invocation_events: + if isinstance(content.intermediate_data, InvocationEvents): + for event in content.intermediate_data.invocation_events: text = get_text_from_content(event.content) if text: parts.append(text) - final_text = get_text_from_content(source.final_response) + final_text = get_text_from_content(content.final_response) if final_text: parts.append(final_text) return "\n\n".join(parts) if parts else None - if source.parts: - return "\n".join([p.text for p in source.parts if p.text]) + if content.parts: + return "\n".join([p.text for p in content.parts if p.text]) def get_eval_status(score: Optional[float], threshold: float) -> EvalStatus: From f938def7d0f3f06fdd3b832a5754a62760c5910f Mon Sep 17 00:00:00 2001 From: Sid Gupta Date: Thu, 16 Apr 2026 21:29:34 -0700 Subject: [PATCH 5/7] fix --- src/google/adk/evaluation/llm_as_judge_utils.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/src/google/adk/evaluation/llm_as_judge_utils.py b/src/google/adk/evaluation/llm_as_judge_utils.py index b9c6da2d0c..a68a5d1af0 100644 --- a/src/google/adk/evaluation/llm_as_judge_utils.py +++ b/src/google/adk/evaluation/llm_as_judge_utils.py @@ -59,22 +59,26 @@ def get_text_from_content( intermediate invocation events (e.g. natural language emitted before tool calls) is concatenated with the final response text. """ - if content is None: - return None - if isinstance(content, Invocation): + if isinstance(content, Invocation): if not include_intermediate_responses_in_final: + # Flag off: revert to basic plain-Content behavior. return get_text_from_content(content.final_response) + parts: list[str] = [] if isinstance(content.intermediate_data, InvocationEvents): + # Walk intermediate events in order; collect text parts. for event in content.intermediate_data.invocation_events: text = get_text_from_content(event.content) if text: parts.append(text) + # Then fetch the final response text and append it to the end. final_text = get_text_from_content(content.final_response) if final_text: parts.append(final_text) + return "\n\n".join(parts) if parts else None - if content.parts: + + if content and content.parts: return "\n".join([p.text for p in content.parts if p.text]) From 97a4168a5c35b56ec5f217f5c83a7f2bcb1fa011 Mon Sep 17 00:00:00 2001 From: Sid Gupta Date: Thu, 16 Apr 2026 21:37:12 -0700 Subject: [PATCH 6/7] add test --- .../adk/evaluation/llm_as_judge_utils.py | 2 +- .../evaluation/test_llm_as_judge_utils.py | 44 +++++++++++++++++++ 2 files changed, 45 insertions(+), 1 deletion(-) diff --git a/src/google/adk/evaluation/llm_as_judge_utils.py b/src/google/adk/evaluation/llm_as_judge_utils.py index a68a5d1af0..4966e98f15 100644 --- a/src/google/adk/evaluation/llm_as_judge_utils.py +++ b/src/google/adk/evaluation/llm_as_judge_utils.py @@ -76,7 +76,7 @@ def get_text_from_content( if final_text: parts.append(final_text) - return "\n\n".join(parts) if parts else None + return "\n".join(parts) if parts else None if content and content.parts: return "\n".join([p.text for p in content.parts if p.text]) diff --git a/tests/unittests/evaluation/test_llm_as_judge_utils.py b/tests/unittests/evaluation/test_llm_as_judge_utils.py index e5327cf454..4b53a2dc43 100644 --- a/tests/unittests/evaluation/test_llm_as_judge_utils.py +++ b/tests/unittests/evaluation/test_llm_as_judge_utils.py @@ -19,6 +19,7 @@ from google.adk.evaluation.app_details import AgentDetails from google.adk.evaluation.app_details import AppDetails from google.adk.evaluation.eval_case import IntermediateData +from google.adk.evaluation.eval_case import Invocation from google.adk.evaluation.eval_case import InvocationEvent from google.adk.evaluation.eval_case import InvocationEvents from google.adk.evaluation.eval_rubrics import RubricScore @@ -88,6 +89,49 @@ def test_get_text_from_content_with_mixed_parts(): assert get_text_from_content(content) == "Hello\nWorld" +def test_get_text_from_content_with_invocation_include_intermediate_responses_in_final(): + """Tests get_text_from_content on an Invocation with and without the flag.""" + intermediate_text = "Let me check." + final_response_text = "Done." + invocation = Invocation( + user_content=genai_types.Content(parts=[genai_types.Part(text="user")]), + intermediate_data=InvocationEvents( + invocation_events=[ + InvocationEvent( + author="agent", + content=genai_types.Content( + parts=[genai_types.Part(text=intermediate_text)] + ), + ), + InvocationEvent( + author="tool", + content=genai_types.Content( + parts=[ + genai_types.Part( + function_call=genai_types.FunctionCall(name="t") + ) + ] + ), + ), + ] + ), + final_response=genai_types.Content( + parts=[genai_types.Part(text=final_response_text)] + ), + ) + + # Flag off (default): only the final response text is returned. + assert get_text_from_content(invocation) == final_response_text + + # Flag on: intermediate text is concatenated before the final response. + assert ( + get_text_from_content( + invocation, include_intermediate_responses_in_final=True + ) + == f"{intermediate_text}\n{final_response_text}" + ) + + def test_get_eval_status_with_none_score(): """Tests get_eval_status returns NOT_EVALUATED for a None score.""" assert get_eval_status(score=None, threshold=0.5) == EvalStatus.NOT_EVALUATED From 898da3d72cdf782d5ccd8c7d9811b346808bc36e Mon Sep 17 00:00:00 2001 From: Sid Gupta Date: Thu, 16 Apr 2026 22:41:34 -0700 Subject: [PATCH 7/7] fix pyink test: --- src/google/adk/evaluation/llm_as_judge_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/google/adk/evaluation/llm_as_judge_utils.py b/src/google/adk/evaluation/llm_as_judge_utils.py index 4966e98f15..0986f2bed0 100644 --- a/src/google/adk/evaluation/llm_as_judge_utils.py +++ b/src/google/adk/evaluation/llm_as_judge_utils.py @@ -59,7 +59,7 @@ def get_text_from_content( intermediate invocation events (e.g. natural language emitted before tool calls) is concatenated with the final response text. """ - if isinstance(content, Invocation): + if isinstance(content, Invocation): if not include_intermediate_responses_in_final: # Flag off: revert to basic plain-Content behavior. return get_text_from_content(content.final_response)