From b3f900da15cad68c3c76a8cbe31be31117da286d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arnaud=20He=CC=81ritier?= <arnaud.heritier@docker.com>
Date: Wed, 11 Mar 2026 13:43:44 +0100
Subject: [PATCH 1/4] fix(#2053): filter sub-agent streaming events from parent
 session persistence

During task transfers, sub-agent streaming events (AgentChoiceEvent,
AgentChoiceReasoningEvent, MessageAddedEvent) were forwarded through
the parent session's event channel. The PersistentRuntime's
handleEvent checked sess.IsSubSession(), but sess was always the
parent session, so the guard never triggered. This caused sub-agent
assistant messages to be persisted directly into the parent session's
message history, corrupting it.

On session restore, the parent session contained interleaved sub-agent
messages with tool_use blocks that had no corresponding tool_result
messages in the parent context, causing Anthropic API errors:
"unexpected tool_use_id found in tool_result blocks".

Add SessionID field to AgentChoiceEvent and AgentChoiceReasoningEvent,
and filter all streaming/message events by comparing the event's
SessionID against the parent session's ID. Events from sub-sessions
are now silently skipped during persistence (they are persisted
separately via SubSessionCompletedEvent).

Assisted-By: docker-agent
---
 pkg/runtime/event.go              | 16 ++++++++++------
 pkg/runtime/persistent_runtime.go |  9 +++++++++
 pkg/runtime/runtime_test.go       | 26 +++++++++++++-------------
 pkg/runtime/streaming.go          |  4 ++--
 4 files changed, 34 insertions(+), 21 deletions(-)

diff --git a/pkg/runtime/event.go b/pkg/runtime/event.go
index 5a1160bb0..81cef421f 100644
--- a/pkg/runtime/event.go
+++ b/pkg/runtime/event.go
@@ -138,29 +138,33 @@ func StreamStarted(sessionID, agentName string) Event {
 }
 
 type AgentChoiceEvent struct {
-	Type    string `json:"type"`
-	Content string `json:"content"`
+	Type      string `json:"type"`
+	Content   string `json:"content"`
+	SessionID string `json:"session_id,omitempty"`
 	AgentContext
 }
 
-func AgentChoice(agentName, content string) Event {
+func AgentChoice(agentName, sessionID, content string) Event {
 	return &AgentChoiceEvent{
 		Type:         "agent_choice",
 		Content:      content,
+		SessionID:    sessionID,
 		AgentContext: newAgentContext(agentName),
 	}
 }
 
 type AgentChoiceReasoningEvent struct {
-	Type    string `json:"type"`
-	Content string `json:"content"`
+	Type      string `json:"type"`
+	Content   string `json:"content"`
+	SessionID string `json:"session_id,omitempty"`
 	AgentContext
 }
 
-func AgentChoiceReasoning(agentName, content string) Event {
+func AgentChoiceReasoning(agentName, sessionID, content string) Event {
 	return &AgentChoiceReasoningEvent{
 		Type:         "agent_choice_reasoning",
 		Content:      content,
+		SessionID:    sessionID,
 		AgentContext: newAgentContext(agentName),
 	}
 }
diff --git a/pkg/runtime/persistent_runtime.go b/pkg/runtime/persistent_runtime.go
index fea609fa2..f47ffce6d 100644
--- a/pkg/runtime/persistent_runtime.go
+++ b/pkg/runtime/persistent_runtime.go
@@ -73,6 +73,9 @@ func (r *PersistentRuntime) handleEvent(ctx context.Context, sess *session.Sessi
 
 	switch e := event.(type) {
 	case *AgentChoiceEvent:
+		if e.SessionID != sess.ID {
+			return
+		}
 		// Accumulate streaming content
 		streaming.content.WriteString(e.Content)
 		streaming.agentName = e.AgentName
@@ -80,6 +83,9 @@ func (r *PersistentRuntime) handleEvent(ctx context.Context, sess *session.Sessi
 		r.persistStreamingContent(ctx, sess.ID, streaming)
 
 	case *AgentChoiceReasoningEvent:
+		if e.SessionID != sess.ID {
+			return
+		}
 		// Accumulate streaming reasoning content
 		streaming.reasoningContent.WriteString(e.Content)
 		streaming.agentName = e.AgentName
@@ -98,6 +104,9 @@ func (r *PersistentRuntime) handleEvent(ctx context.Context, sess *session.Sessi
 		}
 
 	case *MessageAddedEvent:
+		if e.SessionID != sess.ID {
+			return
+		}
 		// Finalize the streaming message with complete metadata
 		if streaming.messageID != 0 {
 			// Update the existing streaming message with final content
diff --git a/pkg/runtime/runtime_test.go b/pkg/runtime/runtime_test.go
index a1e651ecf..88e5c9824 100644
--- a/pkg/runtime/runtime_test.go
+++ b/pkg/runtime/runtime_test.go
@@ -282,7 +282,7 @@ func TestSimple(t *testing.T) {
 		UserMessage("Hi", sess.ID, nil, 0),
 		StreamStarted(sess.ID, "root"),
 		ToolsetInfo(0, false, "root"),
-		AgentChoice("root", "Hello"),
+		AgentChoice("root", sess.ID, "Hello"),
 		MessageAdded(sess.ID, msgAdded.Message, "root"),
 		NewTokenUsageEvent(sess.ID, "root", &Usage{InputTokens: 3, OutputTokens: 2, ContextLength: 5, LastMessage: &MessageUsage{
 			Usage: chat.Usage{InputTokens: 3, OutputTokens: 2},
@@ -321,11 +321,11 @@ func TestMultipleContentChunks(t *testing.T) {
 		UserMessage("Please greet me", sess.ID, nil, 0),
 		StreamStarted(sess.ID, "root"),
 		ToolsetInfo(0, false, "root"),
-		AgentChoice("root", "Hello "),
-		AgentChoice("root", "there, "),
-		AgentChoice("root", "how "),
-		AgentChoice("root", "are "),
-		AgentChoice("root", "you?"),
+		AgentChoice("root", sess.ID, "Hello "),
+		AgentChoice("root", sess.ID, "there, "),
+		AgentChoice("root", sess.ID, "how "),
+		AgentChoice("root", sess.ID, "are "),
+		AgentChoice("root", sess.ID, "you?"),
 		MessageAdded(sess.ID, msgAdded.Message, "root"),
 		NewTokenUsageEvent(sess.ID, "root", &Usage{InputTokens: 8, OutputTokens: 12, ContextLength: 20, LastMessage: &MessageUsage{
 			Usage: chat.Usage{InputTokens: 8, OutputTokens: 12},
@@ -362,9 +362,9 @@ func TestWithReasoning(t *testing.T) {
 		UserMessage("Hi", sess.ID, nil, 0),
 		StreamStarted(sess.ID, "root"),
 		ToolsetInfo(0, false, "root"),
-		AgentChoiceReasoning("root", "Let me think about this..."),
-		AgentChoiceReasoning("root", " I should respond politely."),
-		AgentChoice("root", "Hello, how can I help you?"),
+		AgentChoiceReasoning("root", sess.ID, "Let me think about this..."),
+		AgentChoiceReasoning("root", sess.ID, " I should respond politely."),
+		AgentChoice("root", sess.ID, "Hello, how can I help you?"),
 		MessageAdded(sess.ID, msgAdded.Message, "root"),
 		NewTokenUsageEvent(sess.ID, "root", &Usage{InputTokens: 10, OutputTokens: 15, ContextLength: 25, LastMessage: &MessageUsage{
 			Usage: chat.Usage{InputTokens: 10, OutputTokens: 15},
@@ -402,10 +402,10 @@ func TestMixedContentAndReasoning(t *testing.T) {
 		UserMessage("Hi there", sess.ID, nil, 0),
 		StreamStarted(sess.ID, "root"),
 		ToolsetInfo(0, false, "root"),
-		AgentChoiceReasoning("root", "The user wants a greeting"),
-		AgentChoice("root", "Hello!"),
-		AgentChoiceReasoning("root", " I should be friendly"),
-		AgentChoice("root", " How can I help you today?"),
+		AgentChoiceReasoning("root", sess.ID, "The user wants a greeting"),
+		AgentChoice("root", sess.ID, "Hello!"),
+		AgentChoiceReasoning("root", sess.ID, " I should be friendly"),
+		AgentChoice("root", sess.ID, " How can I help you today?"),
 		MessageAdded(sess.ID, msgAdded.Message, "root"),
 		NewTokenUsageEvent(sess.ID, "root", &Usage{InputTokens: 15, OutputTokens: 20, ContextLength: 35, LastMessage: &MessageUsage{
 			Usage: chat.Usage{InputTokens: 15, OutputTokens: 20},
diff --git a/pkg/runtime/streaming.go b/pkg/runtime/streaming.go
index 8f7d44960..3d06a8916 100644
--- a/pkg/runtime/streaming.go
+++ b/pkg/runtime/streaming.go
@@ -164,7 +164,7 @@ func (r *LocalRuntime) handleStream(ctx context.Context, stream chat.MessageStre
 		}
 
 		if choice.Delta.ReasoningContent != "" {
-			events <- AgentChoiceReasoning(a.Name(), choice.Delta.ReasoningContent)
+			events <- AgentChoiceReasoning(a.Name(), sess.ID, choice.Delta.ReasoningContent)
 			fullReasoningContent.WriteString(choice.Delta.ReasoningContent)
 		}
 
@@ -174,7 +174,7 @@ func (r *LocalRuntime) handleStream(ctx context.Context, stream chat.MessageStre
 		}
 
 		if choice.Delta.Content != "" {
-			events <- AgentChoice(a.Name(), choice.Delta.Content)
+			events <- AgentChoice(a.Name(), sess.ID, choice.Delta.Content)
 			fullContent.WriteString(choice.Delta.Content)
 		}
 	}

From e3c28195728c50fe463374cfc53f992ad19a4515 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arnaud=20He=CC=81ritier?= <arnaud.heritier@docker.com>
Date: Wed, 11 Mar 2026 13:44:56 +0100
Subject: [PATCH 2/4] fix(#2053): add pendingAssistantToolUse guard to beta
 message converter
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The non-beta convertMessages has a pendingAssistantToolUse flag that
only includes tool_result user messages when they immediately follow
an assistant message with tool_use blocks. Orphan tool results from
corrupted session history are silently dropped.

The beta convertBetaMessages had no such guard — every tool role
message was unconditionally converted to a tool_result block. When
the session history contained orphan tool results (e.g. from
sub-agent messages that leaked into the parent session), they passed
straight through to the Anthropic API, causing:

  "unexpected tool_use_id found in tool_result blocks"

Add the same pendingAssistantToolUse tracking to convertBetaMessages
to match the non-beta converter behavior.

Assisted-By: docker-agent
---
 .../provider/anthropic/beta_converter.go      | 49 ++++++++++---------
 1 file changed, 25 insertions(+), 24 deletions(-)

diff --git a/pkg/model/provider/anthropic/beta_converter.go b/pkg/model/provider/anthropic/beta_converter.go
index fa1c04dcf..1f3abd6b6 100644
--- a/pkg/model/provider/anthropic/beta_converter.go
+++ b/pkg/model/provider/anthropic/beta_converter.go
@@ -23,6 +23,7 @@ import (
 // blocks from the same assistant message MUST be grouped into a single user message.
 func (c *Client) convertBetaMessages(ctx context.Context, messages []chat.Message) ([]anthropic.BetaMessageParam, error) {
 	var betaMessages []anthropic.BetaMessageParam
+	pendingAssistantToolUse := false
 
 	for i := 0; i < len(messages); i++ {
 		msg := &messages[i]
@@ -75,20 +76,19 @@ func (c *Client) convertBetaMessages(ctx context.Context, messages []chat.Messag
 			}
 
 			// Add tool calls
-			if len(msg.ToolCalls) > 0 {
-				for _, toolCall := range msg.ToolCalls {
-					var inpts map[string]any
-					if err := json.Unmarshal([]byte(toolCall.Function.Arguments), &inpts); err != nil {
-						inpts = map[string]any{}
-					}
-					contentBlocks = append(contentBlocks, anthropic.BetaContentBlockParamUnion{
-						OfToolUse: &anthropic.BetaToolUseBlockParam{
-							ID:    toolCall.ID,
-							Input: inpts,
-							Name:  toolCall.Function.Name,
-						},
-					})
+			hasToolCalls := len(msg.ToolCalls) > 0
+			for _, toolCall := range msg.ToolCalls {
+				var inpts map[string]any
+				if err := json.Unmarshal([]byte(toolCall.Function.Arguments), &inpts); err != nil {
+					inpts = map[string]any{}
 				}
+				contentBlocks = append(contentBlocks, anthropic.BetaContentBlockParamUnion{
+					OfToolUse: &anthropic.BetaToolUseBlockParam{
+						ID:    toolCall.ID,
+						Input: inpts,
+						Name:  toolCall.Function.Name,
+					},
+				})
 			}
 
 			if len(contentBlocks) > 0 {
@@ -97,28 +97,29 @@ func (c *Client) convertBetaMessages(ctx context.Context, messages []chat.Messag
 					Content: contentBlocks,
 				})
 			}
+			pendingAssistantToolUse = hasToolCalls
 			continue
 		}
 		if msg.Role == chat.MessageRoleTool {
 			// Collect consecutive tool messages and merge them into a single user message
 			// This is required by Anthropic API: all tool_result blocks for tool_use blocks
 			// from the same assistant message must be in the same user message
-			toolResultBlocks := []anthropic.BetaContentBlockParamUnion{
-				convertBetaToolResultBlock(msg),
-			}
-
-			// Look ahead for consecutive tool messages and merge them
-			j := i + 1
+			var toolResultBlocks []anthropic.BetaContentBlockParamUnion
+			j := i
 			for j < len(messages) && messages[j].Role == chat.MessageRoleTool {
 				toolResultBlocks = append(toolResultBlocks, convertBetaToolResultBlock(&messages[j]))
 				j++
 			}
 
-			// Add the merged user message with all tool results
-			betaMessages = append(betaMessages, anthropic.BetaMessageParam{
-				Role:    anthropic.BetaMessageParamRoleUser,
-				Content: toolResultBlocks,
-			})
+			// Only include tool results if they follow an assistant message with tool_use.
+			// Orphan tool_result blocks (e.g. from corrupted session history) are dropped.
+			if pendingAssistantToolUse && len(toolResultBlocks) > 0 {
+				betaMessages = append(betaMessages, anthropic.BetaMessageParam{
+					Role:    anthropic.BetaMessageParamRoleUser,
+					Content: toolResultBlocks,
+				})
+			}
+			pendingAssistantToolUse = false
 
 			// Skip the messages we've already processed
 			i = j - 1

From fee4f1b531de884809d4abfda9440f33f8c875ac Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arnaud=20He=CC=81ritier?= <arnaud.heritier@docker.com>
Date: Wed, 11 Mar 2026 13:46:27 +0100
Subject: [PATCH 3/4] fix(#2053): add reverse validation and merge-based repair
 for tool sequencing

validateSequencing only checked the forward direction: every assistant
tool_use must have a matching tool_result in the next user message. It
did not check the reverse: every tool_result must reference a tool_use
in the immediately preceding assistant message. Orphan tool_results
passed validation silently.

Additionally, repairSequencing inserted a synthetic user message with
missing tool_results *before* an existing user message that already
had partial tool_results. This split tool_results across two user
messages, causing the existing results to become orphaned:

  assistant(tool_use: A, B)
  synthetic_user(tool_result: B)   <- repair inserted this
  user(tool_result: A)             <- A now orphaned (previous is synthetic_user)

Two fixes:

1. validateSequencing now also checks the reverse direction: for each
   user message containing tool_result blocks, verify the immediately
   preceding message is an assistant with corresponding tool_use IDs.

2. repairSequencing now merges missing tool_results into the existing
   next user message instead of inserting a separate synthetic message.
   This keeps all tool_results in a single user message adjacent to the
   assistant, satisfying both forward and reverse invariants.

Assisted-By: docker-agent
---
 pkg/model/provider/anthropic/client.go | 131 +++++++++++++++++++------
 1 file changed, 103 insertions(+), 28 deletions(-)

diff --git a/pkg/model/provider/anthropic/client.go b/pkg/model/provider/anthropic/client.go
index 10e05b701..9a637b9f6 100644
--- a/pkg/model/provider/anthropic/client.go
+++ b/pkg/model/provider/anthropic/client.go
@@ -776,67 +776,103 @@ func contentArray(m map[string]any) []any {
 	return nil
 }
 
-// validateSequencing generically validates that every assistant message with tool_use blocks
-// is immediately followed by a user message with corresponding tool_result blocks.
-// It works on both standard (MessageParam) and Beta (BetaMessageParam) types by
-// marshaling to map[string]any for inspection.
+// validateSequencing generically validates that:
+//  1. Every assistant message with tool_use blocks is immediately followed by a user
+//     message with corresponding tool_result blocks.
+//  2. Every user message with tool_result blocks is immediately preceded by an assistant
+//     message that contains the corresponding tool_use blocks.
 func validateSequencing[T any](msgs []T) error {
 	for i := range msgs {
 		m, ok := marshalToMap(msgs[i])
-		if !ok || m["role"] != "assistant" {
+		if !ok {
 			continue
 		}
 
-		toolUseIDs := collectToolUseIDs(contentArray(m))
-		if len(toolUseIDs) == 0 {
-			continue
-		}
+		// Forward check: assistant tool_use → next user must have matching tool_results
+		if m["role"] == "assistant" {
+			toolUseIDs := collectToolUseIDs(contentArray(m))
+			if len(toolUseIDs) == 0 {
+				continue
+			}
 
-		if i+1 >= len(msgs) {
-			slog.Warn("Anthropic sequencing invalid: assistant tool_use present but no next user tool_result message", "assistant_index", i)
-			return errors.New("assistant tool_use present but no subsequent user message with tool_result blocks")
-		}
+			if i+1 >= len(msgs) {
+				slog.Warn("Anthropic sequencing invalid: assistant tool_use present but no next user tool_result message", "assistant_index", i)
+				return errors.New("assistant tool_use present but no subsequent user message with tool_result blocks")
+			}
+
+			next, ok := marshalToMap(msgs[i+1])
+			if !ok || next["role"] != "user" {
+				slog.Warn("Anthropic sequencing invalid: next message after assistant tool_use is not user", "assistant_index", i, "next_role", next["role"])
+				return errors.New("assistant tool_use must be followed by a user message containing corresponding tool_result blocks")
+			}
 
-		next, ok := marshalToMap(msgs[i+1])
-		if !ok || next["role"] != "user" {
-			slog.Warn("Anthropic sequencing invalid: next message after assistant tool_use is not user", "assistant_index", i, "next_role", next["role"])
-			return errors.New("assistant tool_use must be followed by a user message containing corresponding tool_result blocks")
+			toolResultIDs := collectToolResultIDs(contentArray(next))
+			missing := differenceIDs(toolUseIDs, toolResultIDs)
+			if len(missing) > 0 {
+				slog.Warn("Anthropic sequencing invalid: missing tool_result for tool_use id in next user message", "assistant_index", i, "tool_use_id", missing[0], "missing_count", len(missing))
+				return fmt.Errorf("missing tool_result for tool_use id %s in the next user message", missing[0])
+			}
 		}
 
-		toolResultIDs := collectToolResultIDs(contentArray(next))
-		missing := differenceIDs(toolUseIDs, toolResultIDs)
-		if len(missing) > 0 {
-			slog.Warn("Anthropic sequencing invalid: missing tool_result for tool_use id in next user message", "assistant_index", i, "tool_use_id", missing[0], "missing_count", len(missing))
-			return fmt.Errorf("missing tool_result for tool_use id %s in the next user message", missing[0])
+		// Reverse check: user with tool_result → previous message must be assistant with matching tool_use
+		if m["role"] == "user" {
+			toolResultIDs := collectToolResultIDs(contentArray(m))
+			if len(toolResultIDs) == 0 {
+				continue
+			}
+
+			if i == 0 {
+				slog.Warn("Anthropic sequencing invalid: user tool_result with no preceding assistant message", "user_index", i)
+				return errors.New("user tool_result blocks with no preceding assistant message")
+			}
+
+			prev, ok := marshalToMap(msgs[i-1])
+			if !ok || prev["role"] != "assistant" {
+				slog.Warn("Anthropic sequencing invalid: user tool_result not preceded by assistant", "user_index", i)
+				return errors.New("user tool_result blocks must be preceded by an assistant message with corresponding tool_use blocks")
+			}
+
+			toolUseIDs := collectToolUseIDs(contentArray(prev))
+			orphan := differenceIDs(toolResultIDs, toolUseIDs)
+			if len(orphan) > 0 {
+				slog.Warn("Anthropic sequencing invalid: orphan tool_result referencing non-existent tool_use", "user_index", i, "tool_use_id", orphan[0], "orphan_count", len(orphan))
+				return fmt.Errorf("orphan tool_result for tool_use id %s: no matching tool_use in preceding assistant message", orphan[0])
+			}
 		}
 	}
 	return nil
 }
 
 // repairSequencing generically inserts a synthetic user message after any assistant
-// tool_use message that is missing corresponding tool_result blocks. The makeSynthetic
+// tool_use message that is missing corresponding tool_result blocks. When the next
+// message is already a user message with partial tool_results, the missing results
+// are merged into that message to avoid splitting results across messages (which
+// would cause the API to reject orphan tool_result references). The makeSynthetic
 // callback builds the appropriate user message type for the remaining tool_use IDs.
 func repairSequencing[T any](msgs []T, makeSynthetic func(toolUseIDs map[string]struct{}) T) []T {
 	if len(msgs) == 0 {
 		return msgs
 	}
 	repaired := make([]T, 0, len(msgs)+2)
-	for i := range msgs {
-		repaired = append(repaired, msgs[i])
-
+	for i := 0; i < len(msgs); i++ {
 		m, ok := marshalToMap(msgs[i])
 		if !ok || m["role"] != "assistant" {
+			repaired = append(repaired, msgs[i])
 			continue
 		}
 
+		repaired = append(repaired, msgs[i])
+
 		toolUseIDs := collectToolUseIDs(contentArray(m))
 		if len(toolUseIDs) == 0 {
 			continue
 		}
 
-		// Remove any IDs that already have results in the next user message
+		// Check if the next message is a user message with some tool_results
+		hasNextUser := false
 		if i+1 < len(msgs) {
 			if next, ok := marshalToMap(msgs[i+1]); ok && next["role"] == "user" {
+				hasNextUser = true
 				toolResultIDs := collectToolResultIDs(contentArray(next))
 				for id := range toolResultIDs {
 					delete(toolUseIDs, id)
@@ -844,7 +880,46 @@ func repairSequencing[T any](msgs []T, makeSynthetic func(toolUseIDs map[string]
 			}
 		}
 
-		if len(toolUseIDs) > 0 {
+		if len(toolUseIDs) == 0 {
+			continue
+		}
+
+		if hasNextUser {
+			// Merge the synthetic results into the existing next user message
+			// by replacing it with a combined message (synthetic first, then original).
+			// This avoids splitting tool_results across separate user messages.
+			synthetic := makeSynthetic(toolUseIDs)
+			synthMap, _ := marshalToMap(synthetic)
+			nextMap, _ := marshalToMap(msgs[i+1])
+
+			synthContent := contentArray(synthMap)
+			nextContent := contentArray(nextMap)
+
+			mergedContent := append(synthContent, nextContent...)
+			nextMap["content"] = mergedContent
+
+			// Re-marshal the merged message back into the typed parameter
+			mergedBytes, err := json.Marshal(nextMap)
+			if err != nil {
+				slog.Warn("Failed to marshal merged tool_result message, inserting synthetic instead", "error", err)
+				repaired = append(repaired, synthetic)
+				i++ // skip msgs[i+1] — it would split tool_results across user messages
+				continue
+			}
+			var merged T
+			if err := json.Unmarshal(mergedBytes, &merged); err != nil {
+				slog.Warn("Failed to unmarshal merged tool_result message, inserting synthetic instead", "error", err)
+				repaired = append(repaired, synthetic)
+				i++ // skip msgs[i+1] — it would split tool_results across user messages
+				continue
+			}
+
+			slog.Debug("Merged synthetic tool_results into existing user message",
+				"assistant_index", i,
+				"missing_count", len(toolUseIDs))
+			// Replace the next message so we emit the merged version
+			msgs[i+1] = merged
+		} else {
 			slog.Debug("Inserting synthetic user message for missing tool_results",
 				"assistant_index", i,
 				"missing_count", len(toolUseIDs))

From 7220241f9117f1b6c9130c02458f06d96469cecd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arnaud=20He=CC=81ritier?= <arnaud.heritier@docker.com>
Date: Wed, 11 Mar 2026 13:49:04 +0100
Subject: [PATCH 4/4] test(#2053): add coverage for tool_use/tool_result
 sequencing fixes

Add tests covering:

- Reverse validation: orphan tool_results referencing non-existent
  tool_use in the preceding assistant message are detected.
- Reverse validation: tool_results with no preceding assistant
  message at all are detected.
- Repair merge: when partial tool_results exist in the next user
  message, missing results are merged into it (not inserted as a
  separate synthetic message that would break sequencing).
- Repair insert: when there is no next user message at all, a
  synthetic user message is correctly inserted.
- Beta converter: orphan tool_results (no preceding assistant
  tool_use) are dropped by the pendingAssistantToolUse guard.
- Beta converter: normal tool_use/tool_result flow continues to work
  correctly with the guard in place.
- Runtime event tests updated for SessionID field added to
  AgentChoiceEvent and AgentChoiceReasoningEvent.

Assisted-By: docker-agent
---
 pkg/model/provider/anthropic/client_test.go | 205 ++++++++++++++++++++
 1 file changed, 205 insertions(+)

diff --git a/pkg/model/provider/anthropic/client_test.go b/pkg/model/provider/anthropic/client_test.go
index 3e53a15a4..6335a27b4 100644
--- a/pkg/model/provider/anthropic/client_test.go
+++ b/pkg/model/provider/anthropic/client_test.go
@@ -600,3 +600,208 @@ func TestExtractSystemBlocksCacheControl(t *testing.T) {
 	assert.Equal(t, "ephemeral", string(blocks[3].CacheControl.Type))
 	assert.Empty(t, string(blocks[3].CacheControl.TTL))
 }
+
+func TestValidateSequencing_ReverseOrphanToolResult(t *testing.T) {
+	// A user message with tool_result that references a tool_use_id not present
+	// in the preceding assistant message should be caught by reverse validation.
+	msgs := []anthropic.MessageParam{
+		anthropic.NewUserMessage(anthropic.NewTextBlock("start")),
+		anthropic.NewAssistantMessage(
+			anthropic.ContentBlockParamUnion{
+				OfToolUse: &anthropic.ToolUseBlockParam{
+					ID:    "tool-A",
+					Input: map[string]any{},
+					Name:  "read_file",
+				},
+			},
+		),
+		// User message with tool_results for A (matching) and B (orphan)
+		anthropic.NewUserMessage(
+			anthropic.NewToolResultBlock("tool-A", "result-A", false),
+			anthropic.NewToolResultBlock("tool-B", "result-B", false),
+		),
+	}
+
+	err := validateAnthropicSequencing(msgs)
+	require.Error(t, err)
+	assert.Contains(t, err.Error(), "orphan tool_result")
+	assert.Contains(t, err.Error(), "tool-B")
+}
+
+func TestValidateSequencing_ReverseNoAssistantBeforeToolResult(t *testing.T) {
+	// A user message with tool_result as the first message should fail.
+	msgs := []anthropic.MessageParam{
+		anthropic.NewUserMessage(
+			anthropic.NewToolResultBlock("tool-A", "result-A", false),
+		),
+	}
+
+	err := validateAnthropicSequencing(msgs)
+	require.Error(t, err)
+	assert.Contains(t, err.Error(), "no preceding assistant message")
+}
+
+func TestRepairSequencing_MergesIntoExistingUserMessage(t *testing.T) {
+	// When an assistant has tool_use A and B, but the next user message only has
+	// tool_result for A, repair should merge a synthetic tool_result for B into
+	// the same user message rather than inserting a separate synthetic message.
+	msgs := []anthropic.MessageParam{
+		anthropic.NewUserMessage(anthropic.NewTextBlock("start")),
+		anthropic.NewAssistantMessage(
+			anthropic.ContentBlockParamUnion{
+				OfToolUse: &anthropic.ToolUseBlockParam{
+					ID:    "tool-A",
+					Input: map[string]any{},
+					Name:  "t1",
+				},
+			},
+			anthropic.ContentBlockParamUnion{
+				OfToolUse: &anthropic.ToolUseBlockParam{
+					ID:    "tool-B",
+					Input: map[string]any{},
+					Name:  "t2",
+				},
+			},
+		),
+		// Next user message only has tool_result for A, missing B
+		anthropic.NewUserMessage(
+			anthropic.NewToolResultBlock("tool-A", "result-A", false),
+		),
+		anthropic.NewUserMessage(anthropic.NewTextBlock("continue")),
+	}
+
+	// Should fail validation
+	require.Error(t, validateAnthropicSequencing(msgs))
+
+	// Repair
+	repaired := repairAnthropicSequencing(msgs)
+
+	// Should pass validation after repair
+	require.NoError(t, validateAnthropicSequencing(repaired))
+
+	// The total message count should NOT increase (merged, not inserted)
+	assert.Len(t, repaired, 4, "repair should merge into existing user message, not insert a new one")
+
+	// Verify the user message at index 2 now has both tool_results
+	b, err := json.Marshal(repaired[2])
+	require.NoError(t, err)
+	var m map[string]any
+	require.NoError(t, json.Unmarshal(b, &m))
+
+	content, ok := m["content"].([]any)
+	require.True(t, ok)
+
+	toolResultIDs := make(map[string]struct{})
+	for _, c := range content {
+		if cb, ok := c.(map[string]any); ok {
+			if cb["type"] == "tool_result" {
+				if id, _ := cb["tool_use_id"].(string); id != "" {
+					toolResultIDs[id] = struct{}{}
+				}
+			}
+		}
+	}
+	assert.Contains(t, toolResultIDs, "tool-A")
+	assert.Contains(t, toolResultIDs, "tool-B")
+}
+
+func TestRepairSequencing_InsertsWhenNoNextUserMessage(t *testing.T) {
+	// When an assistant has tool_use but there's no following user message at all,
+	// repair should insert a synthetic user message.
+	msgs := []anthropic.MessageParam{
+		anthropic.NewUserMessage(anthropic.NewTextBlock("start")),
+		anthropic.NewAssistantMessage(
+			anthropic.ContentBlockParamUnion{
+				OfToolUse: &anthropic.ToolUseBlockParam{
+					ID:    "tool-X",
+					Input: map[string]any{},
+					Name:  "do_thing",
+				},
+			},
+		),
+	}
+
+	require.Error(t, validateAnthropicSequencing(msgs))
+
+	repaired := repairAnthropicSequencing(msgs)
+
+	require.NoError(t, validateAnthropicSequencing(repaired))
+	assert.Len(t, repaired, 3, "should insert a synthetic user message")
+}
+
+func TestConvertBetaMessages_DropsOrphanToolResults(t *testing.T) {
+	// When a tool result message appears without a preceding assistant message
+	// with tool_use, the beta converter should drop it.
+	msgs := []chat.Message{
+		{Role: chat.MessageRoleUser, Content: "start"},
+		{Role: chat.MessageRoleAssistant, Content: "sure, let me help"},
+		// Orphan tool result — previous assistant has no tool_use
+		{Role: chat.MessageRoleTool, ToolCallID: "tool-orphan", Content: "orphan result"},
+		{Role: chat.MessageRoleUser, Content: "continue"},
+	}
+
+	converted, err := testClient().convertBetaMessages(t.Context(), msgs)
+	require.NoError(t, err)
+
+	// Should have: user(start), assistant(text), user(continue)
+	// The orphan tool result should be dropped
+	require.Len(t, converted, 3)
+
+	for _, msg := range converted {
+		b, err := json.Marshal(msg)
+		require.NoError(t, err)
+		var m map[string]any
+		require.NoError(t, json.Unmarshal(b, &m))
+		content, _ := m["content"].([]any)
+		for _, c := range content {
+			if cb, ok := c.(map[string]any); ok {
+				assert.NotEqual(t, "tool_result", cb["type"],
+					"orphan tool_result should not be included in beta messages")
+			}
+		}
+	}
+}
+
+func TestConvertBetaMessages_IncludesToolResultsAfterToolUse(t *testing.T) {
+	// Normal case: assistant with tool_use followed by tool results should work.
+	msgs := []chat.Message{
+		{Role: chat.MessageRoleUser, Content: "start"},
+		{
+			Role: chat.MessageRoleAssistant,
+			ToolCalls: []tools.ToolCall{
+				{ID: "tool-1", Function: tools.FunctionCall{Name: "read_file", Arguments: "{}"}},
+				{ID: "tool-2", Function: tools.FunctionCall{Name: "write_file", Arguments: "{}"}},
+			},
+		},
+		{Role: chat.MessageRoleTool, ToolCallID: "tool-1", Content: "file content"},
+		{Role: chat.MessageRoleTool, ToolCallID: "tool-2", Content: "ok"},
+		{Role: chat.MessageRoleUser, Content: "done"},
+	}
+
+	converted, err := testClient().convertBetaMessages(t.Context(), msgs)
+	require.NoError(t, err)
+
+	// Should have: user(start), assistant(tool_use x2), user(tool_result x2), user(done)
+	require.Len(t, converted, 4)
+
+	// Verify the tool results are in the third message
+	b, err := json.Marshal(converted[2])
+	require.NoError(t, err)
+	var m map[string]any
+	require.NoError(t, json.Unmarshal(b, &m))
+	assert.Equal(t, "user", m["role"])
+	content, ok := m["content"].([]any)
+	require.True(t, ok)
+	assert.Len(t, content, 2)
+
+	ids := make(map[string]struct{})
+	for _, c := range content {
+		if cb, ok := c.(map[string]any); ok && cb["type"] == "tool_result" {
+			if id, _ := cb["tool_use_id"].(string); id != "" {
+				ids[id] = struct{}{}
+			}
+		}
+	}
+	assert.Contains(t, ids, "tool-1")
+	assert.Contains(t, ids, "tool-2")
+}