From 29e560fdec230b5fdfbfd523411341e251090371 Mon Sep 17 00:00:00 2001
From: Sid Gupta <sid.gupta@joinhandshake.com>
Date: Mon, 13 Apr 2026 23:06:20 -0700
Subject: [PATCH 1/7] feat(eval): add evaluate_full_response option to
 rubric-based evaluation

---
 src/google/adk/evaluation/eval_metrics.py     | 19 ++++++++++
 .../rubric_based_final_response_quality_v1.py | 35 ++++++++++++++++++-
 2 files changed, 53 insertions(+), 1 deletion(-)

diff --git a/src/google/adk/evaluation/eval_metrics.py b/src/google/adk/evaluation/eval_metrics.py
index eb7c7e36cb..5895ad807a 100644
--- a/src/google/adk/evaluation/eval_metrics.py
+++ b/src/google/adk/evaluation/eval_metrics.py
@@ -61,6 +61,12 @@ class PrebuiltMetrics(Enum):
 
   PER_TURN_USER_SIMULATOR_QUALITY_V1 = "per_turn_user_simulator_quality_v1"
 
+  MULTI_TURN_TASK_SUCCESS_V1 = "multi_turn_task_success_v1"
+
+  MULTI_TURN_TRAJECTORY_QUALITY_V1 = "multi_turn_trajectory_quality_v1"
+
+  MULTI_TURN_TOOL_USE_QUALITY_V1 = "multi_turn_tool_use_quality_v1"
+
 
 MetricName: TypeAlias = Union[str, PrebuiltMetrics]
 Threshold: TypeAlias = float
@@ -138,6 +144,19 @@ class RubricsBasedCriterion(BaseCriterion):
       ),
   )
 
+  evaluate_full_response: bool = Field(
+      default=False,
+      description=(
+          "Whether to evaluate the full agent response including intermediate"
+          " natural language text (e.g. text emitted before tool calls) in"
+          " addition to the final response. By default, only the final"
+          " response text is sent to the judge. When True, text from all"
+          " intermediate invocation events is concatenated with the final"
+          " response before evaluation. This is useful for agents that emit"
+          " text both before and after tool calls within a single invocation."
+      ),
+  )
+
 
 class HallucinationsCriterion(BaseCriterion):
   """Criterion to use when evaluating agents response for hallucinations."""
diff --git a/src/google/adk/evaluation/rubric_based_final_response_quality_v1.py b/src/google/adk/evaluation/rubric_based_final_response_quality_v1.py
index df01aba4ff..5dc6320da3 100644
--- a/src/google/adk/evaluation/rubric_based_final_response_quality_v1.py
+++ b/src/google/adk/evaluation/rubric_based_final_response_quality_v1.py
@@ -274,7 +274,18 @@ def format_auto_rater_prompt(
     """Returns the autorater prompt."""
     self.create_effective_rubrics_list(actual_invocation.rubrics)
     user_input = get_text_from_content(actual_invocation.user_content)
-    final_response = get_text_from_content(actual_invocation.final_response)
+
+    # When evaluate_full_response is enabled, include text from intermediate
+    # invocation events (e.g. text emitted before tool calls) in addition to
+    # the final response. This is useful for agents that stream text, call
+    # tools, then stream more text within a single invocation.
+    criterion = self._eval_metric.criterion
+    evaluate_full = getattr(criterion, "evaluate_full_response", False)
+
+    if evaluate_full:
+      final_response = self._get_full_response_text(actual_invocation)
+    else:
+      final_response = get_text_from_content(actual_invocation.final_response)
 
     rubrics_text = "\n".join([
         f"*  {r.rubric_content.text_property}"
@@ -310,3 +321,25 @@ def format_auto_rater_prompt(
     )
 
     return auto_rater_prompt
+
+  @staticmethod
+  def _get_full_response_text(invocation: Invocation) -> str:
+    """Concatenates all NL text from invocation events and the final response.
+
+    When an agent emits text before a tool call (e.g. presenting a plan),
+    that text is stored in intermediate_data.invocation_events but not in
+    final_response. This method collects text from both sources to give the
+    judge a complete picture of the agent's output.
+    """
+    parts = []
+    if invocation.intermediate_data and isinstance(
+        invocation.intermediate_data, InvocationEvents
+    ):
+      for evt in invocation.intermediate_data.invocation_events:
+        text = get_text_from_content(evt.content)
+        if text:
+          parts.append(text)
+    final_text = get_text_from_content(invocation.final_response)
+    if final_text:
+      parts.append(final_text)
+    return "\n\n".join(parts)

From 95f314808ab359c8d94d442cc12e188b71a5bbf2 Mon Sep 17 00:00:00 2001
From: Sid Gupta <sid.gupta@joinhandshake.com>
Date: Tue, 14 Apr 2026 14:33:25 -0700
Subject: [PATCH 2/7] fix: default final_response to empty string when no text

---
 .../adk/evaluation/rubric_based_final_response_quality_v1.py    | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/google/adk/evaluation/rubric_based_final_response_quality_v1.py b/src/google/adk/evaluation/rubric_based_final_response_quality_v1.py
index 5dc6320da3..d8cdeb452b 100644
--- a/src/google/adk/evaluation/rubric_based_final_response_quality_v1.py
+++ b/src/google/adk/evaluation/rubric_based_final_response_quality_v1.py
@@ -285,7 +285,7 @@ def format_auto_rater_prompt(
     if evaluate_full:
       final_response = self._get_full_response_text(actual_invocation)
     else:
-      final_response = get_text_from_content(actual_invocation.final_response)
+      final_response = get_text_from_content(actual_invocation.final_response) or ""
 
     rubrics_text = "\n".join([
         f"*  {r.rubric_content.text_property}"

From c84f65311bb6bf88a03b94fa11370c448d25debe Mon Sep 17 00:00:00 2001
From: Sid Gupta <sid.gupta@joinhandshake.com>
Date: Thu, 16 Apr 2026 21:08:59 -0700
Subject: [PATCH 3/7] rename field, add to BaseCriterion class

---
 src/google/adk/evaluation/eval_metrics.py     | 26 ++++++------
 .../adk/evaluation/llm_as_judge_utils.py      | 34 +++++++++++++--
 .../rubric_based_final_response_quality_v1.py | 42 +++++--------------
 3 files changed, 54 insertions(+), 48 deletions(-)

diff --git a/src/google/adk/evaluation/eval_metrics.py b/src/google/adk/evaluation/eval_metrics.py
index 5895ad807a..aa1af8ac39 100644
--- a/src/google/adk/evaluation/eval_metrics.py
+++ b/src/google/adk/evaluation/eval_metrics.py
@@ -115,6 +115,19 @@ class BaseCriterion(BaseModel):
       description="The threshold to be used by the metric.",
   )
 
+  include_intermediate_responses_in_final: bool = Field(
+      default=False,
+      description=(
+          "Whether to evaluate the full agent response including intermediate"
+          " natural language text (e.g. text emitted before tool calls) in"
+          " addition to the final response. By default, only the final"
+          " response text is sent to the judge. When True, text from all"
+          " intermediate invocation events is concatenated with the final"
+          " response before evaluation. This is useful for agents that emit"
+          " text both before and after tool calls within a single invocation."
+      ),
+  )
+
 
 class LlmAsAJudgeCriterion(BaseCriterion):
   """Criterion when using LLM-As-A-Judge metric."""
@@ -144,19 +157,6 @@ class RubricsBasedCriterion(BaseCriterion):
       ),
   )
 
-  evaluate_full_response: bool = Field(
-      default=False,
-      description=(
-          "Whether to evaluate the full agent response including intermediate"
-          " natural language text (e.g. text emitted before tool calls) in"
-          " addition to the final response. By default, only the final"
-          " response text is sent to the judge. When True, text from all"
-          " intermediate invocation events is concatenated with the final"
-          " response before evaluation. This is useful for agents that emit"
-          " text both before and after tool calls within a single invocation."
-      ),
-  )
-
 
 class HallucinationsCriterion(BaseCriterion):
   """Criterion to use when evaluating agents response for hallucinations."""
diff --git a/src/google/adk/evaluation/llm_as_judge_utils.py b/src/google/adk/evaluation/llm_as_judge_utils.py
index cf1309ca38..72fdcdcfac 100644
--- a/src/google/adk/evaluation/llm_as_judge_utils.py
+++ b/src/google/adk/evaluation/llm_as_judge_utils.py
@@ -26,6 +26,8 @@
 from .common import EvalBaseModel
 from .eval_case import get_all_tool_calls_with_responses
 from .eval_case import IntermediateDataType
+from .eval_case import Invocation
+from .eval_case import InvocationEvents
 from .eval_metrics import RubricScore
 from .evaluator import EvalStatus
 
@@ -44,10 +46,36 @@ class Label(enum.Enum):
 
 
 def get_text_from_content(
-    content: Optional[genai_types.Content],
+    source: Optional[Union[genai_types.Content, Invocation]],
+    *,
+    include_intermediate_responses_in_final: bool = False,
 ) -> Optional[str]:
-  if content and content.parts:
-    return "\n".join([p.text for p in content.parts if p.text])
+  """Extracts text from a `Content` or an `Invocation`.
+
+  When `source` is a `Content`, returns the concatenated text of its parts.
+
+  When `source` is an `Invocation`, returns the text of the invocation's final
+  response. If `include_intermediate_responses_in_final` is True, text from
+  intermediate invocation events (e.g. natural language emitted before tool
+  calls) is concatenated with the final response text.
+  """
+  if source is None:
+    return None
+  if isinstance(source, Invocation):
+    if not include_intermediate_responses_in_final:
+      return get_text_from_content(source.final_response)
+    parts: list[str] = []
+    if isinstance(source.intermediate_data, InvocationEvents):
+      for event in source.intermediate_data.invocation_events:
+        text = get_text_from_content(event.content)
+        if text:
+          parts.append(text)
+    final_text = get_text_from_content(source.final_response)
+    if final_text:
+      parts.append(final_text)
+    return "\n\n".join(parts) if parts else None
+  if source.parts:
+    return "\n".join([p.text for p in source.parts if p.text])
 
 
 def get_eval_status(score: Optional[float], threshold: float) -> EvalStatus:
diff --git a/src/google/adk/evaluation/rubric_based_final_response_quality_v1.py b/src/google/adk/evaluation/rubric_based_final_response_quality_v1.py
index d8cdeb452b..135b2b9593 100644
--- a/src/google/adk/evaluation/rubric_based_final_response_quality_v1.py
+++ b/src/google/adk/evaluation/rubric_based_final_response_quality_v1.py
@@ -275,17 +275,17 @@ def format_auto_rater_prompt(
     self.create_effective_rubrics_list(actual_invocation.rubrics)
     user_input = get_text_from_content(actual_invocation.user_content)
 
-    # When evaluate_full_response is enabled, include text from intermediate
-    # invocation events (e.g. text emitted before tool calls) in addition to
-    # the final response. This is useful for agents that stream text, call
-    # tools, then stream more text within a single invocation.
     criterion = self._eval_metric.criterion
-    evaluate_full = getattr(criterion, "evaluate_full_response", False)
-
-    if evaluate_full:
-      final_response = self._get_full_response_text(actual_invocation)
-    else:
-      final_response = get_text_from_content(actual_invocation.final_response) or ""
+    include_intermediate = getattr(
+        criterion, "include_intermediate_responses_in_final", False
+    )
+    final_response = (
+        get_text_from_content(
+            actual_invocation,
+            include_intermediate_responses_in_final=include_intermediate,
+        )
+        or ""
+    )
 
     rubrics_text = "\n".join([
         f"*  {r.rubric_content.text_property}"
@@ -321,25 +321,3 @@ def format_auto_rater_prompt(
     )
 
     return auto_rater_prompt
-
-  @staticmethod
-  def _get_full_response_text(invocation: Invocation) -> str:
-    """Concatenates all NL text from invocation events and the final response.
-
-    When an agent emits text before a tool call (e.g. presenting a plan),
-    that text is stored in intermediate_data.invocation_events but not in
-    final_response. This method collects text from both sources to give the
-    judge a complete picture of the agent's output.
-    """
-    parts = []
-    if invocation.intermediate_data and isinstance(
-        invocation.intermediate_data, InvocationEvents
-    ):
-      for evt in invocation.intermediate_data.invocation_events:
-        text = get_text_from_content(evt.content)
-        if text:
-          parts.append(text)
-    final_text = get_text_from_content(invocation.final_response)
-    if final_text:
-      parts.append(final_text)
-    return "\n\n".join(parts)

From aad242f18dae1fbb466af0aa1fc29b7b343a28ef Mon Sep 17 00:00:00 2001
From: Sid Gupta <sid.gupta@joinhandshake.com>
Date: Thu, 16 Apr 2026 21:18:17 -0700
Subject: [PATCH 4/7] rename get_text_from_content parameter source to content

---
 .../adk/evaluation/llm_as_judge_utils.py      | 22 +++++++++----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/src/google/adk/evaluation/llm_as_judge_utils.py b/src/google/adk/evaluation/llm_as_judge_utils.py
index 72fdcdcfac..b9c6da2d0c 100644
--- a/src/google/adk/evaluation/llm_as_judge_utils.py
+++ b/src/google/adk/evaluation/llm_as_judge_utils.py
@@ -46,36 +46,36 @@ class Label(enum.Enum):
 
 
 def get_text_from_content(
-    source: Optional[Union[genai_types.Content, Invocation]],
+    content: Optional[Union[genai_types.Content, Invocation]],
     *,
     include_intermediate_responses_in_final: bool = False,
 ) -> Optional[str]:
   """Extracts text from a `Content` or an `Invocation`.
 
-  When `source` is a `Content`, returns the concatenated text of its parts.
+  When `content` is a `Content`, returns the concatenated text of its parts.
 
-  When `source` is an `Invocation`, returns the text of the invocation's final
+  When `content` is an `Invocation`, returns the text of the invocation's final
   response. If `include_intermediate_responses_in_final` is True, text from
   intermediate invocation events (e.g. natural language emitted before tool
   calls) is concatenated with the final response text.
   """
-  if source is None:
+  if content is None:
     return None
-  if isinstance(source, Invocation):
+  if isinstance(content, Invocation):
     if not include_intermediate_responses_in_final:
-      return get_text_from_content(source.final_response)
+      return get_text_from_content(content.final_response)
     parts: list[str] = []
-    if isinstance(source.intermediate_data, InvocationEvents):
-      for event in source.intermediate_data.invocation_events:
+    if isinstance(content.intermediate_data, InvocationEvents):
+      for event in content.intermediate_data.invocation_events:
         text = get_text_from_content(event.content)
         if text:
           parts.append(text)
-    final_text = get_text_from_content(source.final_response)
+    final_text = get_text_from_content(content.final_response)
     if final_text:
       parts.append(final_text)
     return "\n\n".join(parts) if parts else None
-  if source.parts:
-    return "\n".join([p.text for p in source.parts if p.text])
+  if content.parts:
+    return "\n".join([p.text for p in content.parts if p.text])
 
 
 def get_eval_status(score: Optional[float], threshold: float) -> EvalStatus:

From f938def7d0f3f06fdd3b832a5754a62760c5910f Mon Sep 17 00:00:00 2001
From: Sid Gupta <sid.gupta@joinhandshake.com>
Date: Thu, 16 Apr 2026 21:29:34 -0700
Subject: [PATCH 5/7] fix: restore `content and content.parts` guard, add comments

---
 src/google/adk/evaluation/llm_as_judge_utils.py | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/src/google/adk/evaluation/llm_as_judge_utils.py b/src/google/adk/evaluation/llm_as_judge_utils.py
index b9c6da2d0c..a68a5d1af0 100644
--- a/src/google/adk/evaluation/llm_as_judge_utils.py
+++ b/src/google/adk/evaluation/llm_as_judge_utils.py
@@ -59,22 +59,26 @@ def get_text_from_content(
   intermediate invocation events (e.g. natural language emitted before tool
   calls) is concatenated with the final response text.
   """
-  if content is None:
-    return None
-  if isinstance(content, Invocation):
+  if isinstance(content, Invocation):    
     if not include_intermediate_responses_in_final:
+      # Flag off: revert to basic plain-Content behavior.
       return get_text_from_content(content.final_response)
+
     parts: list[str] = []
     if isinstance(content.intermediate_data, InvocationEvents):
+      # Walk intermediate events in order; collect text parts.
       for event in content.intermediate_data.invocation_events:
         text = get_text_from_content(event.content)
         if text:
           parts.append(text)
+    # Then fetch the final response text and append it to the end.
     final_text = get_text_from_content(content.final_response)
     if final_text:
       parts.append(final_text)
+
     return "\n\n".join(parts) if parts else None
-  if content.parts:
+
+  if content and content.parts:
     return "\n".join([p.text for p in content.parts if p.text])
 
 

From 97a4168a5c35b56ec5f217f5c83a7f2bcb1fa011 Mon Sep 17 00:00:00 2001
From: Sid Gupta <sid.gupta@joinhandshake.com>
Date: Thu, 16 Apr 2026 21:37:12 -0700
Subject: [PATCH 6/7] add Invocation test; join intermediate text with single newline

---
 .../adk/evaluation/llm_as_judge_utils.py      |  2 +-
 .../evaluation/test_llm_as_judge_utils.py     | 44 +++++++++++++++++++
 2 files changed, 45 insertions(+), 1 deletion(-)

diff --git a/src/google/adk/evaluation/llm_as_judge_utils.py b/src/google/adk/evaluation/llm_as_judge_utils.py
index a68a5d1af0..4966e98f15 100644
--- a/src/google/adk/evaluation/llm_as_judge_utils.py
+++ b/src/google/adk/evaluation/llm_as_judge_utils.py
@@ -76,7 +76,7 @@ def get_text_from_content(
     if final_text:
       parts.append(final_text)
 
-    return "\n\n".join(parts) if parts else None
+    return "\n".join(parts) if parts else None
 
   if content and content.parts:
     return "\n".join([p.text for p in content.parts if p.text])
diff --git a/tests/unittests/evaluation/test_llm_as_judge_utils.py b/tests/unittests/evaluation/test_llm_as_judge_utils.py
index e5327cf454..4b53a2dc43 100644
--- a/tests/unittests/evaluation/test_llm_as_judge_utils.py
+++ b/tests/unittests/evaluation/test_llm_as_judge_utils.py
@@ -19,6 +19,7 @@
 from google.adk.evaluation.app_details import AgentDetails
 from google.adk.evaluation.app_details import AppDetails
 from google.adk.evaluation.eval_case import IntermediateData
+from google.adk.evaluation.eval_case import Invocation
 from google.adk.evaluation.eval_case import InvocationEvent
 from google.adk.evaluation.eval_case import InvocationEvents
 from google.adk.evaluation.eval_rubrics import RubricScore
@@ -88,6 +89,49 @@ def test_get_text_from_content_with_mixed_parts():
   assert get_text_from_content(content) == "Hello\nWorld"
 
 
+def test_get_text_from_content_with_invocation_include_intermediate_responses_in_final():
+  """Tests get_text_from_content on an Invocation with and without the flag."""
+  intermediate_text = "Let me check."
+  final_response_text = "Done."
+  invocation = Invocation(
+      user_content=genai_types.Content(parts=[genai_types.Part(text="user")]),
+      intermediate_data=InvocationEvents(
+          invocation_events=[
+              InvocationEvent(
+                  author="agent",
+                  content=genai_types.Content(
+                      parts=[genai_types.Part(text=intermediate_text)]
+                  ),
+              ),
+              InvocationEvent(
+                  author="tool",
+                  content=genai_types.Content(
+                      parts=[
+                          genai_types.Part(
+                              function_call=genai_types.FunctionCall(name="t")
+                          )
+                      ]
+                  ),
+              ),
+          ]
+      ),
+      final_response=genai_types.Content(
+          parts=[genai_types.Part(text=final_response_text)]
+      ),
+  )
+
+  # Flag off (default): only the final response text is returned.
+  assert get_text_from_content(invocation) == final_response_text
+
+  # Flag on: intermediate text is concatenated before the final response.
+  assert (
+      get_text_from_content(
+          invocation, include_intermediate_responses_in_final=True
+      )
+      == f"{intermediate_text}\n{final_response_text}"
+  )
+
+
 def test_get_eval_status_with_none_score():
   """Tests get_eval_status returns NOT_EVALUATED for a None score."""
   assert get_eval_status(score=None, threshold=0.5) == EvalStatus.NOT_EVALUATED

From 898da3d72cdf782d5ccd8c7d9811b346808bc36e Mon Sep 17 00:00:00 2001
From: Sid Gupta <sid.gupta@joinhandshake.com>
Date: Thu, 16 Apr 2026 22:41:34 -0700
Subject: [PATCH 7/7] fix pyink formatting: remove trailing whitespace

---
 src/google/adk/evaluation/llm_as_judge_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/google/adk/evaluation/llm_as_judge_utils.py b/src/google/adk/evaluation/llm_as_judge_utils.py
index 4966e98f15..0986f2bed0 100644
--- a/src/google/adk/evaluation/llm_as_judge_utils.py
+++ b/src/google/adk/evaluation/llm_as_judge_utils.py
@@ -59,7 +59,7 @@ def get_text_from_content(
   intermediate invocation events (e.g. natural language emitted before tool
   calls) is concatenated with the final response text.
   """
-  if isinstance(content, Invocation):    
+  if isinstance(content, Invocation):
     if not include_intermediate_responses_in_final:
       # Flag off: revert to basic plain-Content behavior.
       return get_text_from_content(content.final_response)