From 6b231dc6fc52e102abb88c368932a5d47021cfc6 Mon Sep 17 00:00:00 2001
From: Akansha Agarwal <agarakan@users.noreply.github.com>
Date: Tue, 30 Dec 2025 05:00:53 +0000
Subject: [PATCH 01/50] introduce retry metadata to batch struct

---
 .../cloudwatchlogs/internal/pusher/batch.go   | 59 +++++++++++++++++++
 .../cloudwatchlogs/internal/pusher/sender.go  | 37 ++++++------
 2 files changed, 79 insertions(+), 17 deletions(-)

diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/batch.go b/plugins/outputs/cloudwatchlogs/internal/pusher/batch.go
index 71e7b14a821..e73ffc1e4ef 100644
--- a/plugins/outputs/cloudwatchlogs/internal/pusher/batch.go
+++ b/plugins/outputs/cloudwatchlogs/internal/pusher/batch.go
@@ -101,6 +101,13 @@ type logEventBatch struct {
 	// Callbacks specifically for updating state
 	stateCallbacks []func()
 	batchers       map[string]*state.RangeQueueBatcher
+
+	// Retry metadata
+	retryCountShort    int       // Number of retries using short delay strategy
+	retryCountLong     int       // Number of retries using long delay strategy
+	startTime          time.Time // Time of first request (for max retry duration calculation)
+	nextRetryTime      time.Time // When this batch should be retried next
+	lastError          error     // Last error encountered
 }
 
 func newLogEventBatch(target Target, entityProvider logs.LogEntityProvider) *logEventBatch {
@@ -226,3 +233,55 @@ func (t byTimestamp) Swap(i, j int) {
 func (t byTimestamp) Less(i, j int) bool {
 	return *t[i].Timestamp < *t[j].Timestamp
 }
+
+// initializeStartTime sets the start time if not already set.
+func (b *logEventBatch) initializeStartTime() {
+	if b.startTime.IsZero() {
+		b.startTime = time.Now()
+	}
+}
+
+// updateRetryMetadata updates the retry metadata after a failed send attempt.
+// It increments the appropriate retry counter based on the error type and calculates the next retry time.
+func (b *logEventBatch) updateRetryMetadata(err error) {
+	// Store the error
+	b.lastError = err
+
+	// Determine retry strategy and increment counter
+	var wait time.Duration
+	if chooseRetryWaitStrategy(err) == retryLong {
+		wait = retryWaitLong(b.retryCountLong)
+		b.retryCountLong++
+	} else {
+		wait = retryWaitShort(b.retryCountShort)
+		b.retryCountShort++
+	}
+
+	// Calculate next retry time (honest timestamp, not capped)
+	b.nextRetryTime = time.Now().Add(wait)
+}
+
+// isExpired checks if the batch has exceeded the maximum retry duration (14 days).
+func (b *logEventBatch) isExpired(maxRetryDuration time.Duration) bool {
+	if b.startTime.IsZero() {
+		return false
+	}
+	return time.Since(b.startTime) > maxRetryDuration
+}
+
+// isReadyForRetry checks if enough time has passed since the last failure to retry this batch.
+func (b *logEventBatch) isReadyForRetry() bool {
+	if b.nextRetryTime.IsZero() {
+		return true // Never failed, ready to send
+	}
+	return time.Now().After(b.nextRetryTime)
+}
+
+// resetRetryMetadata resets all retry-related fields after a successful send.
+func (b *logEventBatch) resetRetryMetadata() {
+	b.retryCountShort = 0
+	b.retryCountLong = 0
+	b.startTime = time.Time{}
+	b.nextRetryTime = time.Time{}
+	b.lastError = nil
+}
diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/sender.go b/plugins/outputs/cloudwatchlogs/internal/pusher/sender.go
index de1bdf6708f..6181f1fa3f7 100644
--- a/plugins/outputs/cloudwatchlogs/internal/pusher/sender.go
+++ b/plugins/outputs/cloudwatchlogs/internal/pusher/sender.go
@@ -63,11 +63,11 @@ func (s *sender) Send(batch *logEventBatch) {
 	if len(batch.events) == 0 {
 		return
 	}
+	
+	// Initialize start time before build()
+	batch.initializeStartTime()
 	input := batch.build()
-	startTime := time.Now()
 
-	retryCountShort := 0
-	retryCountLong := 0
 	for {
 		output, err := s.service.PutLogEvents(input)
 		if err == nil {
@@ -83,8 +83,9 @@ func (s *sender) Send(batch *logEventBatch) {
 					s.logger.Warnf("%d log events for log '%s/%s' are expired", *info.ExpiredLogEventEndIndex, batch.Group, batch.Stream)
 				}
 			}
+			// Success - call done callbacks
 			batch.done()
-			s.logger.Debugf("Pusher published %v log events to group: %v stream: %v with size %v KB in %v.", len(batch.events), batch.Group, batch.Stream, batch.bufferedSize/1024, time.Since(startTime))
+			s.logger.Debugf("Pusher published %v log events to group: %v stream: %v with size %v KB in %v.", len(batch.events), batch.Group, batch.Stream, batch.bufferedSize/1024, time.Since(batch.startTime))
 			return
 		}
 
@@ -110,27 +111,29 @@ func (s *sender) Send(batch *logEventBatch) {
 			s.logger.Errorf("Aws error received when sending logs to %v/%v: %v", batch.Group, batch.Stream, awsErr)
 		}
 
-		// retry wait strategy depends on the type of error returned
-		var wait time.Duration
-		if chooseRetryWaitStrategy(err) == retryLong {
-			wait = retryWaitLong(retryCountLong)
-			retryCountLong++
-		} else {
-			wait = retryWaitShort(retryCountShort)
-			retryCountShort++
-		}
+		// Update retry metadata in the batch
+		batch.updateRetryMetadata(err)
 
-		if time.Since(startTime)+wait > s.RetryDuration() {
-			s.logger.Errorf("All %v retries to %v/%v failed for PutLogEvents, request dropped.", retryCountShort+retryCountLong-1, batch.Group, batch.Stream)
+		// Check if the next retry time would exceed the max retry duration
+		// This prevents us from sleeping and then making another doomed API call
+		totalRetries := batch.retryCountShort + batch.retryCountLong - 1
+		if batch.isExpired(s.RetryDuration()) || batch.nextRetryTime.After(batch.startTime.Add(s.RetryDuration())) {
+			s.logger.Errorf("All %v retries to %v/%v failed for PutLogEvents, request dropped.", totalRetries, batch.Group, batch.Stream)
 			batch.updateState()
 			return
 		}
 
-		s.logger.Warnf("Retried %v time, going to sleep %v before retrying.", retryCountShort+retryCountLong-1, wait)
+		// Calculate wait time until next retry
+		wait := batch.nextRetryTime.Sub(time.Now())
+		if wait < 0 {
+			wait = 0
+		}
+
+		s.logger.Warnf("Retried %v time, going to sleep %v before retrying.", totalRetries, wait)
 
 		select {
 		case <-s.stopCh:
-			s.logger.Errorf("Stop requested after %v retries to %v/%v failed for PutLogEvents, request dropped.", retryCountShort+retryCountLong-1, batch.Group, batch.Stream)
+			s.logger.Errorf("Stop requested after %v retries to %v/%v failed for PutLogEvents, request dropped.", totalRetries, batch.Group, batch.Stream)
 			batch.updateState()
 			return
 		case <-time.After(wait):

From 8521373070f816216c2709efda72220eebc3ccd9 Mon Sep 17 00:00:00 2001
From: Akansha Agarwal <agarakan@users.noreply.github.com>
Date: Tue, 30 Dec 2025 05:04:58 +0000
Subject: [PATCH 02/50] Remove unused reset method

---
 plugins/outputs/cloudwatchlogs/internal/pusher/batch.go | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/batch.go b/plugins/outputs/cloudwatchlogs/internal/pusher/batch.go
index e73ffc1e4ef..00b91fecdc5 100644
--- a/plugins/outputs/cloudwatchlogs/internal/pusher/batch.go
+++ b/plugins/outputs/cloudwatchlogs/internal/pusher/batch.go
@@ -276,12 +276,3 @@ func (b *logEventBatch) isReadyForRetry() bool {
 	}
 	return time.Now().After(b.nextRetryTime)
 }
-
-// resetRetryMetadata resets all retry-related fields after a successful send.
-func (b *logEventBatch) resetRetryMetadata() {
-	b.retryCountShort = 0
-	b.retryCountLong = 0
-	b.startTime = time.Time{}
-	b.nextRetryTime = time.Time{}
-	b.lastError = nil
-}

From 7244af8cead2671b8973ede6e449dc642875414b Mon Sep 17 00:00:00 2001
From: Akansha Agarwal <agarakan@users.noreply.github.com>
Date: Tue, 30 Dec 2025 05:13:18 +0000
Subject: [PATCH 03/50] add unit tests for retryMetadata

---
 .../internal/pusher/batch_test.go             | 25 +++++++++++++++++++
 1 file changed, 25 insertions(+)

diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/batch_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/batch_test.go
index 04e523464e7..28fc741c5bc 100644
--- a/plugins/outputs/cloudwatchlogs/internal/pusher/batch_test.go
+++ b/plugins/outputs/cloudwatchlogs/internal/pusher/batch_test.go
@@ -404,3 +404,28 @@ func TestValidateAndTruncateMessage(t *testing.T) {
 		})
 	}
 }
+func TestBatchRetryMetadata(t *testing.T) {
+	target := Target{Group: "test-group", Stream: "test-stream"}
+	batch := newLogEventBatch(target, nil)
+
+	// Test initial state
+	assert.True(t, batch.startTime.IsZero())
+	assert.True(t, batch.isReadyForRetry())
+	assert.False(t, batch.isExpired(time.Hour))
+
+	// Test initializeStartTime
+	batch.initializeStartTime()
+	assert.False(t, batch.startTime.IsZero())
+	
+	// Test updateRetryMetadata
+	err := assert.AnError
+	batch.updateRetryMetadata(err)
+	assert.Equal(t, 1, batch.retryCountShort)
+	assert.Equal(t, 0, batch.retryCountLong)
+	assert.Equal(t, err, batch.lastError)
+	assert.False(t, batch.nextRetryTime.IsZero())
+
+	// Test isExpired
+	batch.startTime = time.Now().Add(-25 * time.Hour)
+	assert.True(t, batch.isExpired(24*time.Hour))
+}

From d2f21e1293e0ec1ce539c719e828e465c1f12fe6 Mon Sep 17 00:00:00 2001
From: Akansha Agarwal <agarakan@users.noreply.github.com>
Date: Tue, 30 Dec 2025 05:25:54 +0000
Subject: [PATCH 04/50] fix lint

---
 .../cloudwatchlogs/internal/pusher/batch.go      | 16 ++++++++--------
 .../cloudwatchlogs/internal/pusher/batch_test.go |  2 +-
 .../cloudwatchlogs/internal/pusher/sender.go     |  8 +++-----
 3 files changed, 12 insertions(+), 14 deletions(-)

diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/batch.go b/plugins/outputs/cloudwatchlogs/internal/pusher/batch.go
index 00b91fecdc5..a0013e785e7 100644
--- a/plugins/outputs/cloudwatchlogs/internal/pusher/batch.go
+++ b/plugins/outputs/cloudwatchlogs/internal/pusher/batch.go
@@ -103,11 +103,11 @@ type logEventBatch struct {
 	batchers       map[string]*state.RangeQueueBatcher
 
 	// Retry metadata
-	retryCountShort    int       // Number of retries using short delay strategy
-	retryCountLong     int       // Number of retries using long delay strategy
-	startTime          time.Time // Time of first request (for max retry duration calculation)
-	nextRetryTime      time.Time // When this batch should be retried next
-	lastError          error     // Last error encountered
+	retryCountShort int       // Number of retries using short delay strategy
+	retryCountLong  int       // Number of retries using long delay strategy
+	startTime       time.Time // Time of first request (for max retry duration calculation)
+	nextRetryTime   time.Time // When this batch should be retried next
+	lastError       error     // Last error encountered
 }
 
 func newLogEventBatch(target Target, entityProvider logs.LogEntityProvider) *logEventBatch {
@@ -257,11 +257,11 @@ func (b *logEventBatch) updateRetryMetadata(err error) {
 		b.retryCountShort++
 	}
 
-	// Calculate next retry time (honest timestamp, not capped)
+	// Calculate next retry time
 	b.nextRetryTime = time.Now().Add(wait)
 }
 
-// isExpired checks if the batch has exceeded the maximum retry duration (14 days).
+// isExpired checks if the batch has exceeded the maximum retry duration.
 func (b *logEventBatch) isExpired(maxRetryDuration time.Duration) bool {
 	if b.startTime.IsZero() {
 		return false
@@ -272,7 +272,7 @@ func (b *logEventBatch) isExpired(maxRetryDuration time.Duration) bool {
 // isReadyForRetry checks if enough time has passed since the last failure to retry this batch.
 func (b *logEventBatch) isReadyForRetry() bool {
 	if b.nextRetryTime.IsZero() {
-		return true // Never failed, ready to send
+		return true
 	}
 	return time.Now().After(b.nextRetryTime)
 }
diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/batch_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/batch_test.go
index 28fc741c5bc..a5f13b127c7 100644
--- a/plugins/outputs/cloudwatchlogs/internal/pusher/batch_test.go
+++ b/plugins/outputs/cloudwatchlogs/internal/pusher/batch_test.go
@@ -416,7 +416,7 @@ func TestBatchRetryMetadata(t *testing.T) {
 	// Test initializeStartTime
 	batch.initializeStartTime()
 	assert.False(t, batch.startTime.IsZero())
-	
+
 	// Test updateRetryMetadata
 	err := assert.AnError
 	batch.updateRetryMetadata(err)
diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/sender.go b/plugins/outputs/cloudwatchlogs/internal/pusher/sender.go
index 6181f1fa3f7..3df074a0fbe 100644
--- a/plugins/outputs/cloudwatchlogs/internal/pusher/sender.go
+++ b/plugins/outputs/cloudwatchlogs/internal/pusher/sender.go
@@ -63,7 +63,7 @@ func (s *sender) Send(batch *logEventBatch) {
 	if len(batch.events) == 0 {
 		return
 	}
-	
+
 	// Initialize start time before build()
 	batch.initializeStartTime()
 	input := batch.build()
@@ -83,7 +83,6 @@ func (s *sender) Send(batch *logEventBatch) {
 					s.logger.Warnf("%d log events for log '%s/%s' are expired", *info.ExpiredLogEventEndIndex, batch.Group, batch.Stream)
 				}
 			}
-			// Success - call done callbacks
 			batch.done()
 			s.logger.Debugf("Pusher published %v log events to group: %v stream: %v with size %v KB in %v.", len(batch.events), batch.Group, batch.Stream, batch.bufferedSize/1024, time.Since(batch.startTime))
 			return
@@ -114,8 +113,7 @@ func (s *sender) Send(batch *logEventBatch) {
 		// Update retry metadata in the batch
 		batch.updateRetryMetadata(err)
 
-		// Check if the next retry time would exceed the max retry duration
-		// This prevents us from sleeping and then making another doomed API call
+		// Check if retry would exceed max duration
 		totalRetries := batch.retryCountShort + batch.retryCountLong - 1
 		if batch.isExpired(s.RetryDuration()) || batch.nextRetryTime.After(batch.startTime.Add(s.RetryDuration())) {
 			s.logger.Errorf("All %v retries to %v/%v failed for PutLogEvents, request dropped.", totalRetries, batch.Group, batch.Stream)
@@ -124,7 +122,7 @@ func (s *sender) Send(batch *logEventBatch) {
 		}
 
 		// Calculate wait time until next retry
-		wait := batch.nextRetryTime.Sub(time.Now())
+		wait := time.Until(batch.nextRetryTime)
 		if wait < 0 {
 			wait = 0
 		}

From 66186a78b1268b8bc663468064f6e0480fec8211 Mon Sep 17 00:00:00 2001
From: Akansha Agarwal <agarakan@users.noreply.github.com>
Date: Tue, 30 Dec 2025 16:35:24 +0000
Subject: [PATCH 05/50] Introduce retryHeap and retryHeapProcessor

---
 .../internal/pusher/retryheap.go              | 191 ++++++++++++++++++
 .../internal/pusher/retryheap_test.go         | 151 ++++++++++++++
 2 files changed, 342 insertions(+)
 create mode 100644 plugins/outputs/cloudwatchlogs/internal/pusher/retryheap.go
 create mode 100644 plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_test.go

diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap.go b/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap.go
new file mode 100644
index 00000000000..5973f57b381
--- /dev/null
+++ b/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap.go
@@ -0,0 +1,191 @@
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: MIT
+
+package pusher
+
+import (
+	"container/heap"
+	"errors"
+	"sync"
+	"time"
+
+	"github.com/influxdata/telegraf"
+)
+
+// retryHeapImpl implements heap.Interface for logEventBatch sorted by nextRetryTime
+type retryHeapImpl []*logEventBatch
+
+func (h retryHeapImpl) Len() int { return len(h) }
+
+func (h retryHeapImpl) Less(i, j int) bool {
+	return h[i].nextRetryTime.Before(h[j].nextRetryTime)
+}
+
+func (h retryHeapImpl) Swap(i, j int) { h[i], h[j] = h[j], h[i] }
+
+func (h *retryHeapImpl) Push(x interface{}) {
+	*h = append(*h, x.(*logEventBatch))
+}
+
+func (h *retryHeapImpl) Pop() interface{} {
+	old := *h
+	n := len(old)
+	item := old[n-1]
+	*h = old[0 : n-1]
+	return item
+}
+
+// RetryHeap manages failed batches during their retry wait periods
+type RetryHeap interface {
+	Push(batch *logEventBatch) error
+	PopReady() []*logEventBatch
+	Size() int
+	Stop()
+}
+
+type retryHeap struct {
+	heap    retryHeapImpl
+	mutex   sync.RWMutex
+	pushCh  chan *logEventBatch
+	stopCh  chan struct{}
+	maxSize int
+}
+
+// NewRetryHeap creates a new retry heap with the specified maximum size
+func NewRetryHeap(maxSize int) RetryHeap {
+	rh := &retryHeap{
+		heap:    make(retryHeapImpl, 0),
+		maxSize: maxSize,
+		pushCh:  make(chan *logEventBatch, maxSize),
+		stopCh:  make(chan struct{}),
+	}
+	heap.Init(&rh.heap)
+	go rh.pushToHeapWorker()
+	return rh
+}
+
+// pushToHeapWorker moves batches from the blocking channel to the time-ordered heap
+// This bridges channel-based blocking (like sender queue) with heap-based time ordering
+func (rh *retryHeap) pushToHeapWorker() {
+	for {
+		select {
+		case batch := <-rh.pushCh:
+			rh.mutex.Lock()
+			heap.Push(&rh.heap, batch)
+			rh.mutex.Unlock()
+		case <-rh.stopCh:
+			return
+		}
+	}
+}
+
+// Push adds a batch to the heap, blocking if full (same as sender queue)
+func (rh *retryHeap) Push(batch *logEventBatch) error {
+	select {
+	case rh.pushCh <- batch:
+		return nil
+	case <-rh.stopCh:
+		return errors.New("retry heap stopped")
+	}
+}
+
+// PopReady returns all batches that are ready for retry (nextRetryTime <= now)
+func (rh *retryHeap) PopReady() []*logEventBatch {
+	rh.mutex.Lock()
+	defer rh.mutex.Unlock()
+
+	now := time.Now()
+	var ready []*logEventBatch
+
+	// Pop all batches that are ready for retry
+	for len(rh.heap) > 0 && !rh.heap[0].nextRetryTime.After(now) {
+		batch := heap.Pop(&rh.heap).(*logEventBatch)
+		ready = append(ready, batch)
+	}
+
+	return ready
+}
+
+// Size returns the current number of batches in the heap and pending channel
+func (rh *retryHeap) Size() int {
+	rh.mutex.RLock()
+	defer rh.mutex.RUnlock()
+	return len(rh.heap) + len(rh.pushCh)
+}
+
+// Stop stops the retry heap
+func (rh *retryHeap) Stop() {
+	close(rh.stopCh)
+}
+
+// RetryHeapProcessor manages the retry heap and moves ready batches back to sender queue
+type RetryHeapProcessor struct {
+	retryHeap        RetryHeap
+	senderPool       Sender
+	ticker           *time.Ticker
+	stopCh           chan struct{}
+	logger           telegraf.Logger
+	stopped          bool
+	maxRetryDuration time.Duration
+}
+
+// NewRetryHeapProcessor creates a new retry heap processor
+func NewRetryHeapProcessor(retryHeap RetryHeap, senderPool Sender, logger telegraf.Logger, maxRetryDuration time.Duration) *RetryHeapProcessor {
+	return &RetryHeapProcessor{
+		retryHeap:        retryHeap,
+		senderPool:       senderPool,
+		stopCh:           make(chan struct{}),
+		logger:           logger,
+		stopped:          false,
+		maxRetryDuration: maxRetryDuration,
+	}
+}
+
+// Start begins processing the retry heap every 100ms
+func (p *RetryHeapProcessor) Start() {
+	p.ticker = time.NewTicker(100 * time.Millisecond)
+	go p.processLoop()
+}
+
+// Stop stops the retry heap processor
+func (p *RetryHeapProcessor) Stop() {
+	if p.stopped {
+		return
+	}
+	if p.ticker != nil {
+		p.ticker.Stop()
+	}
+	close(p.stopCh)
+	p.stopped = true
+}
+
+// processLoop runs the main processing loop
+func (p *RetryHeapProcessor) processLoop() {
+	for {
+		select {
+		case <-p.ticker.C:
+			p.processReadyMessages()
+		case <-p.stopCh:
+			return
+		}
+	}
+}
+
+// processReadyMessages checks the heap for ready batches and moves them back to sender queue
+func (p *RetryHeapProcessor) processReadyMessages() {
+	readyBatches := p.retryHeap.PopReady()
+
+	for _, batch := range readyBatches {
+		// Check if batch has expired
+		if batch.isExpired(p.maxRetryDuration) {
+			p.logger.Debugf("Dropping expired batch for %s/%s", batch.Group, batch.Stream)
+			batch.updateState()
+			continue
+		}
+
+		// Submit the batch back to the sender pool (blocks if full)
+		p.senderPool.Send(batch)
+		p.logger.Debugf("Moved batch from retry heap back to sender pool for %s/%s",
+			batch.Group, batch.Stream)
+	}
+}
diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_test.go
new file mode 100644
index 00000000000..05e77e6651e
--- /dev/null
+++ b/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_test.go
@@ -0,0 +1,151 @@
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: MIT
+
+package pusher
+
+import (
+	"testing"
+	"time"
+
+	"github.com/influxdata/telegraf/testutil"
+	"github.com/stretchr/testify/assert"
+)
+
+func TestRetryHeap(t *testing.T) {
+	heap := NewRetryHeap(10)
+	defer heap.Stop()
+
+	// Test empty heap
+	assert.Equal(t, 0, heap.Size())
+	ready := heap.PopReady()
+	assert.Empty(t, ready)
+
+	// Create test batches
+	target := Target{Group: "group", Stream: "stream"}
+	batch1 := newLogEventBatch(target, nil)
+	batch1.nextRetryTime = time.Now().Add(1 * time.Second)
+
+	batch2 := newLogEventBatch(target, nil)
+	batch2.nextRetryTime = time.Now().Add(-1 * time.Second) // Ready now
+
+	// Push batches
+	err := heap.Push(batch1)
+	assert.NoError(t, err)
+	err = heap.Push(batch2)
+	assert.NoError(t, err)
+
+	// Wait for pushToHeapWorker to process
+	time.Sleep(10 * time.Millisecond)
+	assert.Equal(t, 2, heap.Size())
+
+	// Pop ready batches
+	ready = heap.PopReady()
+	assert.Len(t, ready, 1)
+	assert.Equal(t, batch2, ready[0])
+	assert.Equal(t, 1, heap.Size())
+}
+
+func TestRetryHeapOrdering(t *testing.T) {
+	heap := NewRetryHeap(10)
+	defer heap.Stop()
+
+	target := Target{Group: "group", Stream: "stream"}
+	now := time.Now()
+
+	// Create batches with different retry times (not in order)
+	batch1 := newLogEventBatch(target, nil)
+	batch1.nextRetryTime = now.Add(3 * time.Second)
+
+	batch2 := newLogEventBatch(target, nil)
+	batch2.nextRetryTime = now.Add(1 * time.Second)
+
+	batch3 := newLogEventBatch(target, nil)
+	batch3.nextRetryTime = now.Add(2 * time.Second)
+
+	// Push in random order
+	heap.Push(batch1)
+	heap.Push(batch2)
+	heap.Push(batch3)
+
+	// Wait for all to be ready
+	time.Sleep(4 * time.Second)
+
+	// Pop ready batches - should come out in order
+	ready := heap.PopReady()
+	assert.Len(t, ready, 3)
+	assert.True(t, ready[0].nextRetryTime.Before(ready[1].nextRetryTime))
+	assert.True(t, ready[1].nextRetryTime.Before(ready[2].nextRetryTime))
+}
+
+func TestRetryHeapProcessor(t *testing.T) {
+	heap := NewRetryHeap(10)
+	defer heap.Stop()
+
+	// Create mock senderPool
+	mockSenderPool := &mockSenderPool{}
+	processor := NewRetryHeapProcessor(heap, mockSenderPool, &testutil.Logger{}, time.Hour)
+	defer processor.Stop()
+
+	// Test start/stop
+	processor.Start()
+	assert.NotNil(t, processor.ticker)
+
+	processor.Stop()
+	assert.True(t, processor.stopped)
+}
+
+func TestRetryHeapProcessorExpiredBatch(t *testing.T) {
+	heap := NewRetryHeap(10)
+	defer heap.Stop()
+
+	mockSenderPool := &mockSenderPool{}
+	processor := NewRetryHeapProcessor(heap, mockSenderPool, &testutil.Logger{}, 1*time.Millisecond) // Very short expiry
+
+	// Create expired batch
+	target := Target{Group: "group", Stream: "stream"}
+	batch := newLogEventBatch(target, nil)
+	batch.startTime = time.Now().Add(-1 * time.Hour)       // Old start time
+	batch.nextRetryTime = time.Now().Add(-1 * time.Second) // Ready now
+
+	heap.Push(batch)
+	time.Sleep(10 * time.Millisecond) // Wait for pushToHeapWorker
+
+	// Process should drop expired batch
+	processor.processReadyMessages()
+	assert.Equal(t, 0, heap.Size())
+	assert.Equal(t, 0, mockSenderPool.sendCount) // Should not send expired batch
+}
+
+func TestRetryHeapProcessorSendsBatch(t *testing.T) {
+	heap := NewRetryHeap(10)
+	defer heap.Stop()
+
+	mockSenderPool := &mockSenderPool{}
+	processor := NewRetryHeapProcessor(heap, mockSenderPool, &testutil.Logger{}, time.Hour)
+
+	// Create ready batch
+	target := Target{Group: "group", Stream: "stream"}
+	batch := newLogEventBatch(target, nil)
+	batch.nextRetryTime = time.Now().Add(-1 * time.Second) // Ready now
+
+	heap.Push(batch)
+	time.Sleep(10 * time.Millisecond) // Wait for pushToHeapWorker
+
+	// Process should send batch
+	processor.processReadyMessages()
+	assert.Equal(t, 0, heap.Size())
+	assert.Equal(t, 1, mockSenderPool.sendCount)
+}
+
+// Mock senderPool for testing
+type mockSenderPool struct {
+	sendCount int
+}
+
+func (m *mockSenderPool) Send(_ *logEventBatch) {
+	m.sendCount++
+}
+
+func (m *mockSenderPool) Stop()                          {}
+func (m *mockSenderPool) SetRetryDuration(time.Duration) {}
+func (m *mockSenderPool) RetryDuration() time.Duration   { return time.Hour }

From 83224b4e6fcc2c5d5f003f4a00a32d34570e3109 Mon Sep 17 00:00:00 2001
From: Akansha Agarwal <agarakan@users.noreply.github.com>
Date: Tue, 30 Dec 2025 18:06:54 +0000
Subject: [PATCH 06/50] Exchange pushch for semaphor to enformce heap size and
 blocking

---
 .../internal/pusher/retryheap.go              | 49 +++++++---------
 .../internal/pusher/retryheap_test.go         | 56 +++++++++++++++++--
 2 files changed, 72 insertions(+), 33 deletions(-)

diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap.go b/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap.go
index 5973f57b381..85b51304727 100644
--- a/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap.go
+++ b/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap.go
@@ -44,45 +44,34 @@ type RetryHeap interface {
 }
 
 type retryHeap struct {
-	heap    retryHeapImpl
-	mutex   sync.RWMutex
-	pushCh  chan *logEventBatch
-	stopCh  chan struct{}
-	maxSize int
+	heap      retryHeapImpl
+	mutex     sync.RWMutex
+	semaphore chan struct{} // Size enforcer
+	stopCh    chan struct{}
+	maxSize   int
 }
 
 // NewRetryHeap creates a new retry heap with the specified maximum size
 func NewRetryHeap(maxSize int) RetryHeap {
 	rh := &retryHeap{
-		heap:    make(retryHeapImpl, 0),
-		maxSize: maxSize,
-		pushCh:  make(chan *logEventBatch, maxSize),
-		stopCh:  make(chan struct{}),
+		heap:      make(retryHeapImpl, 0),
+		maxSize:   maxSize,
+		semaphore: make(chan struct{}, maxSize), // Semaphore for size enforcement
+		stopCh:    make(chan struct{}),
 	}
 	heap.Init(&rh.heap)
-	go rh.pushToHeapWorker()
 	return rh
 }
 
-// pushToHeapWorker moves batches from the blocking channel to the time-ordered heap
-// This bridges channel-based blocking (like sender queue) with heap-based time ordering
-func (rh *retryHeap) pushToHeapWorker() {
-	for {
-		select {
-		case batch := <-rh.pushCh:
-			rh.mutex.Lock()
-			heap.Push(&rh.heap, batch)
-			rh.mutex.Unlock()
-		case <-rh.stopCh:
-			return
-		}
-	}
-}
-
-// Push adds a batch to the heap, blocking if full (same as sender queue)
+// Push adds a batch to the heap, blocking if full
 func (rh *retryHeap) Push(batch *logEventBatch) error {
+	// Acquire semaphore slot (blocks if at maxSize capacity)
 	select {
-	case rh.pushCh <- batch:
+	case rh.semaphore <- struct{}{}:
+		// add batch to heap with mutex protection
+		rh.mutex.Lock()
+		heap.Push(&rh.heap, batch)
+		rh.mutex.Unlock()
 		return nil
 	case <-rh.stopCh:
 		return errors.New("retry heap stopped")
@@ -101,16 +90,18 @@ func (rh *retryHeap) PopReady() []*logEventBatch {
 	for len(rh.heap) > 0 && !rh.heap[0].nextRetryTime.After(now) {
 		batch := heap.Pop(&rh.heap).(*logEventBatch)
 		ready = append(ready, batch)
+		// Release semaphore slot for each popped batch
+		<-rh.semaphore
 	}
 
 	return ready
 }
 
-// Size returns the current number of batches in the heap and pending channel
+// Size returns the current number of batches in the heap
 func (rh *retryHeap) Size() int {
 	rh.mutex.RLock()
 	defer rh.mutex.RUnlock()
-	return len(rh.heap) + len(rh.pushCh)
+	return len(rh.heap)
 }
 
 // Stop stops the retry heap
diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_test.go
index 05e77e6651e..25cf27831ce 100644
--- a/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_test.go
+++ b/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_test.go
@@ -34,8 +34,6 @@ func TestRetryHeap(t *testing.T) {
 	err = heap.Push(batch2)
 	assert.NoError(t, err)
 
-	// Wait for pushToHeapWorker to process
-	time.Sleep(10 * time.Millisecond)
 	assert.Equal(t, 2, heap.Size())
 
 	// Pop ready batches
@@ -108,7 +106,6 @@ func TestRetryHeapProcessorExpiredBatch(t *testing.T) {
 	batch.nextRetryTime = time.Now().Add(-1 * time.Second) // Ready now
 
 	heap.Push(batch)
-	time.Sleep(10 * time.Millisecond) // Wait for pushToHeapWorker
 
 	// Process should drop expired batch
 	processor.processReadyMessages()
@@ -129,7 +126,6 @@ func TestRetryHeapProcessorSendsBatch(t *testing.T) {
 	batch.nextRetryTime = time.Now().Add(-1 * time.Second) // Ready now
 
 	heap.Push(batch)
-	time.Sleep(10 * time.Millisecond) // Wait for pushToHeapWorker
 
 	// Process should send batch
 	processor.processReadyMessages()
@@ -149,3 +145,55 @@ func (m *mockSenderPool) Send(_ *logEventBatch) {
 func (m *mockSenderPool) Stop()                          {}
 func (m *mockSenderPool) SetRetryDuration(time.Duration) {}
 func (m *mockSenderPool) RetryDuration() time.Duration   { return time.Hour }
+func TestRetryHeap_SemaphoreBlockingAndUnblocking(t *testing.T) {
+	heap := NewRetryHeap(2) // maxSize = 2
+	defer heap.Stop()
+
+	// Fill heap to capacity with batches that will be ready in 3 seconds
+	target := Target{Group: "group", Stream: "stream"}
+	batch1 := newLogEventBatch(target, nil)
+	batch1.nextRetryTime = time.Now().Add(3 * time.Second)
+	batch2 := newLogEventBatch(target, nil)
+	batch2.nextRetryTime = time.Now().Add(3 * time.Second)
+
+	heap.Push(batch1)
+	heap.Push(batch2)
+
+	// Verify heap is at capacity
+	if heap.Size() != 2 {
+		t.Fatalf("Expected size 2, got %d", heap.Size())
+	}
+
+	// Try to push third item - should block
+	var pushCompleted bool
+
+	go func() {
+		batch3 := newLogEventBatch(target, nil)
+		batch3.nextRetryTime = time.Now().Add(time.Hour) // Future time, won't be popped
+		heap.Push(batch3)                                // This should block
+		pushCompleted = true
+	}()
+
+	// Give goroutine time to hit the semaphore block
+	time.Sleep(100 * time.Millisecond)
+
+	if pushCompleted {
+		t.Fatal("Push should be blocked by semaphore")
+	}
+
+	// Wait for batches to become ready, then pop to release semaphore
+	time.Sleep(4 * time.Second)
+	heap.PopReady()
+
+	// Give time for push to unblock
+	time.Sleep(100 * time.Millisecond)
+
+	if !pushCompleted {
+		t.Fatal("Push should be unblocked after PopReady")
+	}
+
+	// Verify final state - should have 1 item (2 popped, 1 pushed)
+	if heap.Size() != 1 {
+		t.Fatalf("Expected size 1 after pop/push cycle, got %d", heap.Size())
+	}
+}

From 7cfc7943cbed9f4e61a121f3a9a4f3242d5bf21a Mon Sep 17 00:00:00 2001
From: Akansha Agarwal <agarakan@users.noreply.github.com>
Date: Tue, 30 Dec 2025 19:37:14 +0000
Subject: [PATCH 07/50] Add conditional logic to sender to call batch.Fail()
 during concurrency

---
 .../outputs/cloudwatchlogs/cloudwatchlogs.go  |  2 +-
 .../cloudwatchlogs/internal/pusher/batch.go   | 18 +++++++
 .../cloudwatchlogs/internal/pusher/pusher.go  |  7 ++-
 .../cloudwatchlogs/internal/pusher/sender.go  | 47 ++++++++++++++-----
 4 files changed, 59 insertions(+), 15 deletions(-)

diff --git a/plugins/outputs/cloudwatchlogs/cloudwatchlogs.go b/plugins/outputs/cloudwatchlogs/cloudwatchlogs.go
index 770ef5e3f97..ddc4bed4531 100644
--- a/plugins/outputs/cloudwatchlogs/cloudwatchlogs.go
+++ b/plugins/outputs/cloudwatchlogs/cloudwatchlogs.go
@@ -153,7 +153,7 @@ func (c *CloudWatchLogs) getDest(t pusher.Target, logSrc logs.LogSrc) *cwDest {
 		}
 		c.targetManager = pusher.NewTargetManager(c.Log, client)
 	})
-	p := pusher.NewPusher(c.Log, t, client, c.targetManager, logSrc, c.workerPool, c.ForceFlushInterval.Duration, maxRetryTimeout, &c.pusherWaitGroup)
+	p := pusher.NewPusher(c.Log, t, client, c.targetManager, logSrc, c.workerPool, c.ForceFlushInterval.Duration, maxRetryTimeout, &c.pusherWaitGroup, c.Concurrency)
 	cwd := &cwDest{
 		pusher:   p,
 		retryer:  logThrottleRetryer,
diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/batch.go b/plugins/outputs/cloudwatchlogs/internal/pusher/batch.go
index a0013e785e7..83737d2ee46 100644
--- a/plugins/outputs/cloudwatchlogs/internal/pusher/batch.go
+++ b/plugins/outputs/cloudwatchlogs/internal/pusher/batch.go
@@ -100,6 +100,8 @@ type logEventBatch struct {
 	doneCallbacks []func()
 	// Callbacks specifically for updating state
 	stateCallbacks []func()
+	// Callbacks to execute when batch fails (for circuit breaker notification)
+	failCallbacks []func()
 	batchers       map[string]*state.RangeQueueBatcher
 
 	// Retry metadata
@@ -182,6 +184,13 @@ func (b *logEventBatch) addStateCallback(callback func()) {
 	}
 }
 
+// addFailCallback adds the callback to the end of the registered fail callbacks.
+func (b *logEventBatch) addFailCallback(callback func()) {
+	if callback != nil {
+		b.failCallbacks = append(b.failCallbacks, callback)
+	}
+}
+
 // done runs all registered callbacks, including both success callbacks and state callbacks.
 func (b *logEventBatch) done() {
 	b.updateState()
@@ -203,6 +212,15 @@ func (b *logEventBatch) updateState() {
 	}
 }
 
+// fail runs fail callbacks to notify upstream components of batch failure.
+// This is used for circuit breaker notification when a batch fails.
+func (b *logEventBatch) fail() {
+	for i := len(b.failCallbacks) - 1; i >= 0; i-- {
+		callback := b.failCallbacks[i]
+		callback()
+	}
+}
+
 // build creates a cloudwatchlogs.PutLogEventsInput from the batch. The log events in the batch must be in
 // chronological order by their timestamp.
 func (b *logEventBatch) build() *cloudwatchlogs.PutLogEventsInput {
diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/pusher.go b/plugins/outputs/cloudwatchlogs/internal/pusher/pusher.go
index 57256ae0331..e833868931b 100644
--- a/plugins/outputs/cloudwatchlogs/internal/pusher/pusher.go
+++ b/plugins/outputs/cloudwatchlogs/internal/pusher/pusher.go
@@ -34,8 +34,10 @@ func NewPusher(
 	flushTimeout time.Duration,
 	retryDuration time.Duration,
 	wg *sync.WaitGroup,
+	concurrency int,
 ) *Pusher {
-	s := createSender(logger, service, targetManager, workerPool, retryDuration)
+	concurrencyEnabled := concurrency > 1
+	s := createSender(logger, service, targetManager, workerPool, retryDuration, concurrencyEnabled)
 	q := newQueue(logger, target, flushTimeout, entityProvider, s, wg)
 	targetManager.PutRetentionPolicy(target)
 	return &Pusher{
@@ -60,8 +62,9 @@ func createSender(
 	targetManager TargetManager,
 	workerPool WorkerPool,
 	retryDuration time.Duration,
+	concurrencyEnabled bool,
 ) Sender {
-	s := newSender(logger, service, targetManager, retryDuration)
+	s := newSender(logger, service, targetManager, retryDuration, concurrencyEnabled)
 	if workerPool == nil {
 		return s
 	}
diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/sender.go b/plugins/outputs/cloudwatchlogs/internal/pusher/sender.go
index 3df074a0fbe..da09deb61db 100644
--- a/plugins/outputs/cloudwatchlogs/internal/pusher/sender.go
+++ b/plugins/outputs/cloudwatchlogs/internal/pusher/sender.go
@@ -30,12 +30,13 @@ type Sender interface {
 }
 
 type sender struct {
-	service       cloudWatchLogsService
-	retryDuration atomic.Value
-	targetManager TargetManager
-	logger        telegraf.Logger
-	stopCh        chan struct{}
-	stopped       bool
+	service           cloudWatchLogsService
+	retryDuration     atomic.Value
+	targetManager     TargetManager
+	logger            telegraf.Logger
+	stopCh            chan struct{}
+	stopped           bool
+	concurrencyEnabled bool
 }
 
 var _ (Sender) = (*sender)(nil)
@@ -45,13 +46,15 @@ func newSender(
 	service cloudWatchLogsService,
 	targetManager TargetManager,
 	retryDuration time.Duration,
+	concurrencyEnabled bool,
 ) Sender {
 	s := &sender{
-		logger:        logger,
-		service:       service,
-		targetManager: targetManager,
-		stopCh:        make(chan struct{}),
-		stopped:       false,
+		logger:             logger,
+		service:            service,
+		targetManager:      targetManager,
+		stopCh:             make(chan struct{}),
+		stopped:            false,
+		concurrencyEnabled: concurrencyEnabled,
 	}
 	s.retryDuration.Store(retryDuration)
 	return s
@@ -121,7 +124,22 @@ func (s *sender) Send(batch *logEventBatch) {
 			return
 		}
 
-		// Calculate wait time until next retry
+		select {
+		case <-s.stopCh:
+			s.logger.Errorf("Stop requested after %v retries to %v/%v failed for PutLogEvents, request dropped.", totalRetries, batch.Group, batch.Stream)
+			batch.updateState()
+			return
+		default:
+		}
+
+		// If concurrency enabled, notify failure (will handle RetryHeap push) and return
+		// Otherwise, continue with existing busy-wait retry behavior
+		if s.isConcurrencyEnabled() {
+			batch.fail()
+			return
+		}
+
+		// Calculate wait time until next retry (synchronous mode)
 		wait := time.Until(batch.nextRetryTime)
 		if wait < 0 {
 			wait = 0
@@ -156,3 +174,8 @@ func (s *sender) SetRetryDuration(retryDuration time.Duration) {
 func (s *sender) RetryDuration() time.Duration {
 	return s.retryDuration.Load().(time.Duration)
 }
+
+// isConcurrencyEnabled returns whether concurrency mode is enabled for this sender.
+func (s *sender) isConcurrencyEnabled() bool {
+	return s.concurrencyEnabled
+}

From b4ffd7a65d3bbdbd8c64b636ebbf35e9c5177f46 Mon Sep 17 00:00:00 2001
From: Akansha Agarwal <agarakan@users.noreply.github.com>
Date: Tue, 30 Dec 2025 22:27:57 +0000
Subject: [PATCH 08/50] Add unit tests

---
 .../cloudwatchlogs/internal/pusher/batch.go   |  2 +-
 .../internal/pusher/pool_test.go              |  2 +-
 .../internal/pusher/pusher_test.go            |  1 +
 .../internal/pusher/queue_test.go             |  2 +-
 .../cloudwatchlogs/internal/pusher/sender.go  | 21 +++-----
 .../internal/pusher/sender_test.go            | 52 +++++++++++++++----
 6 files changed, 53 insertions(+), 27 deletions(-)

diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/batch.go b/plugins/outputs/cloudwatchlogs/internal/pusher/batch.go
index 83737d2ee46..3c83be15a0b 100644
--- a/plugins/outputs/cloudwatchlogs/internal/pusher/batch.go
+++ b/plugins/outputs/cloudwatchlogs/internal/pusher/batch.go
@@ -102,7 +102,7 @@ type logEventBatch struct {
 	stateCallbacks []func()
 	// Callbacks to execute when batch fails (for circuit breaker notification)
 	failCallbacks []func()
-	batchers       map[string]*state.RangeQueueBatcher
+	batchers      map[string]*state.RangeQueueBatcher
 
 	// Retry metadata
 	retryCountShort int       // Number of retries using short delay strategy
diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/pool_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/pool_test.go
index d9f3860967c..94fe4b6713b 100644
--- a/plugins/outputs/cloudwatchlogs/internal/pusher/pool_test.go
+++ b/plugins/outputs/cloudwatchlogs/internal/pusher/pool_test.go
@@ -107,7 +107,7 @@ func TestSenderPool(t *testing.T) {
 	logger := testutil.NewNopLogger()
 	mockService := new(mockLogsService)
 	mockService.On("PutLogEvents", mock.Anything).Return(&cloudwatchlogs.PutLogEventsOutput{}, nil)
-	s := newSender(logger, mockService, nil, time.Second)
+	s := newSender(logger, mockService, nil, time.Second, false)
 	p := NewWorkerPool(12)
 	sp := newSenderPool(p, s)
 
diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/pusher_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/pusher_test.go
index 6d63e3c4ff5..ce575c213f3 100644
--- a/plugins/outputs/cloudwatchlogs/internal/pusher/pusher_test.go
+++ b/plugins/outputs/cloudwatchlogs/internal/pusher/pusher_test.go
@@ -113,6 +113,7 @@ func setupPusher(t *testing.T, workerPool WorkerPool, wg *sync.WaitGroup) *Pushe
 		time.Second,
 		time.Minute,
 		wg,
+		1, // concurrency
 	)
 
 	assert.NotNil(t, pusher)
diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/queue_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/queue_test.go
index b5fc04d02eb..dab98651319 100644
--- a/plugins/outputs/cloudwatchlogs/internal/pusher/queue_test.go
+++ b/plugins/outputs/cloudwatchlogs/internal/pusher/queue_test.go
@@ -712,7 +712,7 @@ func testPreparationWithLogger(
 ) (*queue, Sender) {
 	t.Helper()
 	tm := NewTargetManager(logger, service)
-	s := newSender(logger, service, tm, retryDuration)
+	s := newSender(logger, service, tm, retryDuration, false)
 	q := newQueue(
 		logger,
 		Target{"G", "S", util.StandardLogGroupClass, retention},
diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/sender.go b/plugins/outputs/cloudwatchlogs/internal/pusher/sender.go
index da09deb61db..31a3b8be299 100644
--- a/plugins/outputs/cloudwatchlogs/internal/pusher/sender.go
+++ b/plugins/outputs/cloudwatchlogs/internal/pusher/sender.go
@@ -30,12 +30,12 @@ type Sender interface {
 }
 
 type sender struct {
-	service           cloudWatchLogsService
-	retryDuration     atomic.Value
-	targetManager     TargetManager
-	logger            telegraf.Logger
-	stopCh            chan struct{}
-	stopped           bool
+	service            cloudWatchLogsService
+	retryDuration      atomic.Value
+	targetManager      TargetManager
+	logger             telegraf.Logger
+	stopCh             chan struct{}
+	stopped            bool
 	concurrencyEnabled bool
 }
 
@@ -124,19 +124,10 @@ func (s *sender) Send(batch *logEventBatch) {
 			return
 		}
 
-		select {
-		case <-s.stopCh:
-			s.logger.Errorf("Stop requested after %v retries to %v/%v failed for PutLogEvents, request dropped.", totalRetries, batch.Group, batch.Stream)
-			batch.updateState()
-			return
-		default:
-		}
-
 		// If concurrency enabled, notify failure (will handle RetryHeap push) and return
 		// Otherwise, continue with existing busy-wait retry behavior
 		if s.isConcurrencyEnabled() {
 			batch.fail()
-			return
 		}
 
 		// Calculate wait time until next retry (synchronous mode)
diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/sender_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/sender_test.go
index 3b469350ef7..450e63006ad 100644
--- a/plugins/outputs/cloudwatchlogs/internal/pusher/sender_test.go
+++ b/plugins/outputs/cloudwatchlogs/internal/pusher/sender_test.go
@@ -15,6 +15,7 @@ import (
 
 	"github.com/aws/amazon-cloudwatch-agent/sdk/service/cloudwatchlogs"
 	"github.com/aws/amazon-cloudwatch-agent/tool/testutil"
+	"github.com/aws/amazon-cloudwatch-agent/tool/util"
 )
 
 type mockLogsService struct {
@@ -80,7 +81,7 @@ func TestSender(t *testing.T) {
 		mockManager := new(mockTargetManager)
 		mockService.On("PutLogEvents", mock.Anything).Return(&cloudwatchlogs.PutLogEventsOutput{}, nil).Once()
 
-		s := newSender(logger, mockService, mockManager, time.Second)
+		s := newSender(logger, mockService, mockManager, time.Second, false)
 		s.Send(batch)
 		s.Stop()
 
@@ -103,7 +104,7 @@ func TestSender(t *testing.T) {
 		mockManager := new(mockTargetManager)
 		mockService.On("PutLogEvents", mock.Anything).Return(&cloudwatchlogs.PutLogEventsOutput{RejectedLogEventsInfo: rejectedInfo}, nil).Once()
 
-		s := newSender(logger, mockService, mockManager, time.Second)
+		s := newSender(logger, mockService, mockManager, time.Second, false)
 		s.Send(batch)
 		s.Stop()
 
@@ -122,7 +123,7 @@ func TestSender(t *testing.T) {
 		mockManager.On("InitTarget", mock.Anything).Return(nil).Once()
 		mockService.On("PutLogEvents", mock.Anything).Return(&cloudwatchlogs.PutLogEventsOutput{}, nil).Once()
 
-		s := newSender(logger, mockService, mockManager, time.Second)
+		s := newSender(logger, mockService, mockManager, time.Second, false)
 		s.Send(batch)
 		s.Stop()
 
@@ -149,7 +150,7 @@ func TestSender(t *testing.T) {
 		mockService.On("PutLogEvents", mock.Anything).
 			Return(&cloudwatchlogs.PutLogEventsOutput{}, &cloudwatchlogs.InvalidParameterException{}).Once()
 
-		s := newSender(logger, mockService, mockManager, time.Second)
+		s := newSender(logger, mockService, mockManager, time.Second, false)
 		s.Send(batch)
 		s.Stop()
 
@@ -177,7 +178,7 @@ func TestSender(t *testing.T) {
 		mockService.On("PutLogEvents", mock.Anything).
 			Return(&cloudwatchlogs.PutLogEventsOutput{}, &cloudwatchlogs.DataAlreadyAcceptedException{}).Once()
 
-		s := newSender(logger, mockService, mockManager, time.Second)
+		s := newSender(logger, mockService, mockManager, time.Second, false)
 		s.Send(batch)
 		s.Stop()
 
@@ -205,7 +206,7 @@ func TestSender(t *testing.T) {
 		mockService.On("PutLogEvents", mock.Anything).
 			Return(&cloudwatchlogs.PutLogEventsOutput{}, errors.New("test")).Once()
 
-		s := newSender(logger, mockService, mockManager, time.Second)
+		s := newSender(logger, mockService, mockManager, time.Second, false)
 		s.Send(batch)
 		s.Stop()
 
@@ -225,7 +226,7 @@ func TestSender(t *testing.T) {
 		mockService.On("PutLogEvents", mock.Anything).
 			Return(&cloudwatchlogs.PutLogEventsOutput{}, nil).Once()
 
-		s := newSender(logger, mockService, mockManager, time.Second)
+		s := newSender(logger, mockService, mockManager, time.Second, false)
 		s.Send(batch)
 		s.Stop()
 
@@ -251,7 +252,7 @@ func TestSender(t *testing.T) {
 		mockService.On("PutLogEvents", mock.Anything).
 			Return(&cloudwatchlogs.PutLogEventsOutput{}, awserr.New("SomeAWSError", "Some AWS error", nil)).Once()
 
-		s := newSender(logger, mockService, mockManager, 100*time.Millisecond)
+		s := newSender(logger, mockService, mockManager, 100*time.Millisecond, false)
 		s.Send(batch)
 		s.Stop()
 
@@ -279,7 +280,7 @@ func TestSender(t *testing.T) {
 		mockService.On("PutLogEvents", mock.Anything).
 			Return(&cloudwatchlogs.PutLogEventsOutput{}, awserr.New("SomeAWSError", "Some AWS error", nil)).Once()
 
-		s := newSender(logger, mockService, mockManager, time.Second)
+		s := newSender(logger, mockService, mockManager, time.Second, false)
 
 		go func() {
 			time.Sleep(50 * time.Millisecond)
@@ -292,4 +293,37 @@ func TestSender(t *testing.T) {
 		assert.True(t, stateCallbackCalled, "State callback was not called when stop was requested")
 		assert.False(t, doneCallbackCalled, "Done callback should not be called when stop was requested")
 	})
+
+	t.Run("ConcurrencyEnabled/CallsFailCallback", func(t *testing.T) {
+		logger := testutil.NewNopLogger()
+		batch := newLogEventBatch(Target{"G", "S", util.StandardLogGroupClass, -1}, nil)
+		batch.append(newLogEvent(time.Now(), "Test message", nil))
+
+		// Initialize batch for retry logic
+		batch.initializeStartTime()
+
+		mockService := new(mockLogsService)
+		mockManager := new(mockTargetManager)
+		mockService.On("PutLogEvents", mock.Anything).Return(&cloudwatchlogs.PutLogEventsOutput{}, &cloudwatchlogs.ServiceUnavailableException{}).Once()
+
+		// Enable concurrency with 1 hour retry duration
+		s := newSender(logger, mockService, mockManager, time.Hour, true)
+
+		// Track if fail callback was called
+		failCalled := false
+		batch.addFailCallback(func() {
+			failCalled = true
+		})
+
+		go func() {
+			time.Sleep(50 * time.Millisecond)
+			s.Stop()
+		}()
+
+		s.Send(batch)
+
+		// Should call fail callback when concurrency is enabled
+		assert.True(t, failCalled, "fail callback should be called when concurrency is enabled")
+		mockService.AssertExpectations(t)
+	})
 }

From 0e4b0bc699cc3e598fcc40700579addbfec41680 Mon Sep 17 00:00:00 2001
From: Akansha Agarwal <agarakan@users.noreply.github.com>
Date: Tue, 30 Dec 2025 23:18:25 +0000
Subject: [PATCH 09/50] Instantiate RetryHeap and RetryHeapProcessor if
 concurrency enabled

---
 .../outputs/cloudwatchlogs/cloudwatchlogs.go  | 31 ++++++++++++------
 .../cloudwatchlogs/internal/pusher/pool.go    |  4 ++-
 .../internal/pusher/pool_test.go              |  2 +-
 .../cloudwatchlogs/internal/pusher/pusher.go  |  8 +++--
 .../internal/pusher/pusher_test.go            |  3 +-
 .../internal/pusher/retryheap.go              |  6 +++-
 .../internal/pusher/retryheap_test.go         | 32 ++++++++++++-------
 7 files changed, 60 insertions(+), 26 deletions(-)

diff --git a/plugins/outputs/cloudwatchlogs/cloudwatchlogs.go b/plugins/outputs/cloudwatchlogs/cloudwatchlogs.go
index ddc4bed4531..06883993957 100644
--- a/plugins/outputs/cloudwatchlogs/cloudwatchlogs.go
+++ b/plugins/outputs/cloudwatchlogs/cloudwatchlogs.go
@@ -69,14 +69,16 @@ type CloudWatchLogs struct {
 
 	Log telegraf.Logger `toml:"-"`
 
-	pusherWaitGroup sync.WaitGroup
-	cwDests         sync.Map
-	workerPool      pusher.WorkerPool
-	targetManager   pusher.TargetManager
-	once            sync.Once
-	middleware      awsmiddleware.Middleware
-	configurer      *awsmiddleware.Configurer
-	configurerOnce  sync.Once
+	pusherWaitGroup    sync.WaitGroup
+	cwDests            sync.Map
+	workerPool         pusher.WorkerPool
+	retryHeap          pusher.RetryHeap
+	retryHeapProcessor *pusher.RetryHeapProcessor
+	targetManager      pusher.TargetManager
+	once               sync.Once
+	middleware         awsmiddleware.Middleware
+	configurer         *awsmiddleware.Configurer
+	configurerOnce     sync.Once
 }
 
 var _ logs.LogBackend = (*CloudWatchLogs)(nil)
@@ -101,6 +103,14 @@ func (c *CloudWatchLogs) Close() error {
 		c.workerPool.Stop()
 	}
 
+	if c.retryHeapProcessor != nil {
+		c.retryHeapProcessor.Stop()
+	}
+
+	if c.retryHeap != nil {
+		c.retryHeap.Stop()
+	}
+
 	return nil
 }
 
@@ -150,10 +160,13 @@ func (c *CloudWatchLogs) getDest(t pusher.Target, logSrc logs.LogSrc) *cwDest {
 	c.once.Do(func() {
 		if c.Concurrency > 1 {
 			c.workerPool = pusher.NewWorkerPool(c.Concurrency)
+			c.retryHeap = pusher.NewRetryHeap(c.Concurrency)
+			c.retryHeapProcessor = pusher.NewRetryHeapProcessor(c.retryHeap, c.workerPool, client, c.targetManager, c.Log, maxRetryTimeout)
+			c.retryHeapProcessor.Start()
 		}
 		c.targetManager = pusher.NewTargetManager(c.Log, client)
 	})
-	p := pusher.NewPusher(c.Log, t, client, c.targetManager, logSrc, c.workerPool, c.ForceFlushInterval.Duration, maxRetryTimeout, &c.pusherWaitGroup, c.Concurrency)
+	p := pusher.NewPusher(c.Log, t, client, c.targetManager, logSrc, c.workerPool, c.ForceFlushInterval.Duration, maxRetryTimeout, &c.pusherWaitGroup, c.Concurrency, c.retryHeap)
 	cwd := &cwDest{
 		pusher:   p,
 		retryer:  logThrottleRetryer,
diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/pool.go b/plugins/outputs/cloudwatchlogs/internal/pusher/pool.go
index 1d6edf57e90..6aff5b522e5 100644
--- a/plugins/outputs/cloudwatchlogs/internal/pusher/pool.go
+++ b/plugins/outputs/cloudwatchlogs/internal/pusher/pool.go
@@ -91,14 +91,16 @@ func (p *workerPool) Stop() {
 type senderPool struct {
 	workerPool WorkerPool
 	sender     Sender
+	retryHeap  RetryHeap
 }
 
 var _ Sender = (*senderPool)(nil)
 
-func newSenderPool(workerPool WorkerPool, sender Sender) Sender {
+func newSenderPool(workerPool WorkerPool, sender Sender, retryHeap RetryHeap) Sender {
 	return &senderPool{
 		workerPool: workerPool,
 		sender:     sender,
+		retryHeap:  retryHeap,
 	}
 }
 
diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/pool_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/pool_test.go
index 94fe4b6713b..df0cd39fbf9 100644
--- a/plugins/outputs/cloudwatchlogs/internal/pusher/pool_test.go
+++ b/plugins/outputs/cloudwatchlogs/internal/pusher/pool_test.go
@@ -109,7 +109,7 @@ func TestSenderPool(t *testing.T) {
 	mockService.On("PutLogEvents", mock.Anything).Return(&cloudwatchlogs.PutLogEventsOutput{}, nil)
 	s := newSender(logger, mockService, nil, time.Second, false)
 	p := NewWorkerPool(12)
-	sp := newSenderPool(p, s)
+	sp := newSenderPool(p, s, nil)
 
 	assert.Equal(t, time.Second, sp.RetryDuration())
 	sp.SetRetryDuration(time.Minute)
diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/pusher.go b/plugins/outputs/cloudwatchlogs/internal/pusher/pusher.go
index e833868931b..77707532fec 100644
--- a/plugins/outputs/cloudwatchlogs/internal/pusher/pusher.go
+++ b/plugins/outputs/cloudwatchlogs/internal/pusher/pusher.go
@@ -35,9 +35,12 @@ func NewPusher(
 	retryDuration time.Duration,
 	wg *sync.WaitGroup,
 	concurrency int,
+	retryHeap RetryHeap,
 ) *Pusher {
 	concurrencyEnabled := concurrency > 1
-	s := createSender(logger, service, targetManager, workerPool, retryDuration, concurrencyEnabled)
+
+	s := createSender(logger, service, targetManager, workerPool, retryDuration, concurrencyEnabled, retryHeap)
+
 	q := newQueue(logger, target, flushTimeout, entityProvider, s, wg)
 	targetManager.PutRetentionPolicy(target)
 	return &Pusher{
@@ -63,10 +66,11 @@ func createSender(
 	workerPool WorkerPool,
 	retryDuration time.Duration,
 	concurrencyEnabled bool,
+	retryHeap RetryHeap,
 ) Sender {
 	s := newSender(logger, service, targetManager, retryDuration, concurrencyEnabled)
 	if workerPool == nil {
 		return s
 	}
-	return newSenderPool(workerPool, s)
+	return newSenderPool(workerPool, s, retryHeap)
 }
diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/pusher_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/pusher_test.go
index ce575c213f3..dc1774e049e 100644
--- a/plugins/outputs/cloudwatchlogs/internal/pusher/pusher_test.go
+++ b/plugins/outputs/cloudwatchlogs/internal/pusher/pusher_test.go
@@ -113,7 +113,8 @@ func setupPusher(t *testing.T, workerPool WorkerPool, wg *sync.WaitGroup) *Pushe
 		time.Second,
 		time.Minute,
 		wg,
-		1, // concurrency
+		1,   // concurrency
+		nil, // retryHeap
 	)
 
 	assert.NotNil(t, pusher)
diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap.go b/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap.go
index 85b51304727..213359b8044 100644
--- a/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap.go
+++ b/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap.go
@@ -121,7 +121,11 @@ type RetryHeapProcessor struct {
 }
 
 // NewRetryHeapProcessor creates a new retry heap processor
-func NewRetryHeapProcessor(retryHeap RetryHeap, senderPool Sender, logger telegraf.Logger, maxRetryDuration time.Duration) *RetryHeapProcessor {
+func NewRetryHeapProcessor(retryHeap RetryHeap, workerPool WorkerPool, service cloudWatchLogsService, targetManager TargetManager, logger telegraf.Logger, maxRetryDuration time.Duration) *RetryHeapProcessor {
+	// Create processor's own sender and senderPool
+	sender := newSender(logger, service, targetManager, maxRetryDuration, true)
+	senderPool := newSenderPool(workerPool, sender, retryHeap)
+
 	return &RetryHeapProcessor{
 		retryHeap:        retryHeap,
 		senderPool:       senderPool,
diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_test.go
index 25cf27831ce..909ff25cd29 100644
--- a/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_test.go
+++ b/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_test.go
@@ -79,9 +79,13 @@ func TestRetryHeapProcessor(t *testing.T) {
 	heap := NewRetryHeap(10)
 	defer heap.Stop()
 
-	// Create mock senderPool
-	mockSenderPool := &mockSenderPool{}
-	processor := NewRetryHeapProcessor(heap, mockSenderPool, &testutil.Logger{}, time.Hour)
+	// Create mock components
+	mockWorkerPool := NewWorkerPool(2)
+	defer mockWorkerPool.Stop()
+	mockService := &mockLogsService{}
+	mockTargetManager := &mockTargetManager{}
+
+	processor := NewRetryHeapProcessor(heap, mockWorkerPool, mockService, mockTargetManager, &testutil.Logger{}, time.Hour)
 	defer processor.Stop()
 
 	// Test start/stop
@@ -96,8 +100,12 @@ func TestRetryHeapProcessorExpiredBatch(t *testing.T) {
 	heap := NewRetryHeap(10)
 	defer heap.Stop()
 
-	mockSenderPool := &mockSenderPool{}
-	processor := NewRetryHeapProcessor(heap, mockSenderPool, &testutil.Logger{}, 1*time.Millisecond) // Very short expiry
+	mockWorkerPool := NewWorkerPool(2)
+	defer mockWorkerPool.Stop()
+	mockService := &mockLogsService{}
+	mockTargetManager := &mockTargetManager{}
+
+	processor := NewRetryHeapProcessor(heap, mockWorkerPool, mockService, mockTargetManager, &testutil.Logger{}, 1*time.Millisecond) // Very short expiry
 
 	// Create expired batch
 	target := Target{Group: "group", Stream: "stream"}
@@ -109,16 +117,19 @@ func TestRetryHeapProcessorExpiredBatch(t *testing.T) {
 
 	// Process should drop expired batch
 	processor.processReadyMessages()
-	assert.Equal(t, 0, heap.Size())
-	assert.Equal(t, 0, mockSenderPool.sendCount) // Should not send expired batch
+	assert.Equal(t, 0, heap.Size()) // Expired batch should be removed
 }
 
 func TestRetryHeapProcessorSendsBatch(t *testing.T) {
 	heap := NewRetryHeap(10)
 	defer heap.Stop()
 
-	mockSenderPool := &mockSenderPool{}
-	processor := NewRetryHeapProcessor(heap, mockSenderPool, &testutil.Logger{}, time.Hour)
+	mockWorkerPool := NewWorkerPool(2)
+	defer mockWorkerPool.Stop()
+	mockService := &mockLogsService{}
+	mockTargetManager := &mockTargetManager{}
+
+	processor := NewRetryHeapProcessor(heap, mockWorkerPool, mockService, mockTargetManager, &testutil.Logger{}, time.Hour)
 
 	// Create ready batch
 	target := Target{Group: "group", Stream: "stream"}
@@ -129,8 +140,7 @@ func TestRetryHeapProcessorSendsBatch(t *testing.T) {
 
 	// Process should send batch
 	processor.processReadyMessages()
-	assert.Equal(t, 0, heap.Size())
-	assert.Equal(t, 1, mockSenderPool.sendCount)
+	assert.Equal(t, 0, heap.Size()) // Batch should be removed from heap
 }
 
 // Mock senderPool for testing

From 9c1332a7ccbed2140680ee365f4c5ec90c8766d3 Mon Sep 17 00:00:00 2001
From: Akansha Agarwal <agarakan@users.noreply.github.com>
Date: Tue, 30 Dec 2025 23:34:54 +0000
Subject: [PATCH 10/50] Add unit tests for retryheap instantiation

---
 .../cloudwatchlogs/cloudwatchlogs_test.go     | 37 ++++++++++++++++++
 .../internal/pusher/pool_test.go              | 21 ++++++++++
 .../internal/pusher/pusher_test.go            | 39 +++++++++++++++++++
 3 files changed, 97 insertions(+)

diff --git a/plugins/outputs/cloudwatchlogs/cloudwatchlogs_test.go b/plugins/outputs/cloudwatchlogs/cloudwatchlogs_test.go
index 66f1643fd09..c06a4240093 100644
--- a/plugins/outputs/cloudwatchlogs/cloudwatchlogs_test.go
+++ b/plugins/outputs/cloudwatchlogs/cloudwatchlogs_test.go
@@ -8,6 +8,7 @@ import (
 	"testing"
 
 	"github.com/influxdata/telegraf/testutil"
+	"github.com/stretchr/testify/assert"
 	"github.com/stretchr/testify/require"
 
 	"github.com/aws/amazon-cloudwatch-agent/logs"
@@ -100,3 +101,39 @@ func TestDuplicateDestination(t *testing.T) {
 	// Then the destination for cloudwatchlogs endpoint would be the same
 	require.Equal(t, d1, d2)
 }
+
+func TestRetryHeapCreation(t *testing.T) {
+	t.Run("ConcurrencyEnabled", func(t *testing.T) {
+		c := &CloudWatchLogs{
+			Log:         testutil.Logger{Name: "test"},
+			AccessKey:   "access_key",
+			SecretKey:   "secret_key",
+			Concurrency: 2, // > 1 enables concurrency
+			cwDests:     sync.Map{},
+		}
+
+		c.CreateDest("FILENAME", "", -1, util.StandardLogGroupClass, nil)
+
+		// Should create RetryHeap and processor
+		assert.NotNil(t, c.retryHeap)
+		assert.NotNil(t, c.retryHeapProcessor)
+		assert.NotNil(t, c.workerPool)
+	})
+
+	t.Run("ConcurrencyDisabled", func(t *testing.T) {
+		c := &CloudWatchLogs{
+			Log:         testutil.Logger{Name: "test"},
+			AccessKey:   "access_key",
+			SecretKey:   "secret_key",
+			Concurrency: 1, // <= 1 disables concurrency
+			cwDests:     sync.Map{},
+		}
+
+		c.CreateDest("FILENAME", "", -1, util.StandardLogGroupClass, nil)
+
+		// Should not create RetryHeap and processor
+		assert.Nil(t, c.retryHeap)
+		assert.Nil(t, c.retryHeapProcessor)
+		assert.Nil(t, c.workerPool)
+	})
+}
diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/pool_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/pool_test.go
index df0cd39fbf9..0af043de52a 100644
--- a/plugins/outputs/cloudwatchlogs/internal/pusher/pool_test.go
+++ b/plugins/outputs/cloudwatchlogs/internal/pusher/pool_test.go
@@ -134,3 +134,24 @@ func TestSenderPool(t *testing.T) {
 	s.Stop()
 	assert.Equal(t, int32(200), completed.Load())
 }
+
+func TestSenderPoolRetryHeap(t *testing.T) {
+	logger := testutil.NewNopLogger()
+	mockService := new(mockLogsService)
+	mockService.On("PutLogEvents", mock.Anything).Return(&cloudwatchlogs.PutLogEventsOutput{}, nil)
+	s := newSender(logger, mockService, nil, time.Second, false)
+	p := NewWorkerPool(12)
+	defer p.Stop()
+
+	// Create RetryHeap
+	retryHeap := NewRetryHeap(10)
+	defer retryHeap.Stop()
+
+	sp := newSenderPool(p, s, retryHeap)
+
+	// Verify senderPool has retryHeap
+	assert.NotNil(t, sp.(*senderPool).retryHeap)
+	assert.Equal(t, retryHeap, sp.(*senderPool).retryHeap)
+
+	sp.Stop()
+}
diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/pusher_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/pusher_test.go
index dc1774e049e..2ec67aea452 100644
--- a/plugins/outputs/cloudwatchlogs/internal/pusher/pusher_test.go
+++ b/plugins/outputs/cloudwatchlogs/internal/pusher/pusher_test.go
@@ -126,3 +126,42 @@ func setupPusher(t *testing.T, workerPool WorkerPool, wg *sync.WaitGroup) *Pushe
 	mockManager.AssertCalled(t, "PutRetentionPolicy", target)
 	return pusher
 }
+
+func TestPusherRetryHeap(t *testing.T) {
+	logger := testutil.NewNopLogger()
+	target := Target{Group: "G", Stream: "S"}
+	service := &stubLogsService{}
+	mockManager := new(mockTargetManager)
+	mockManager.On("PutRetentionPolicy", target).Return()
+
+	workerPool := NewWorkerPool(2)
+	defer workerPool.Stop()
+
+	retryHeap := NewRetryHeap(10)
+	defer retryHeap.Stop()
+
+	var wg sync.WaitGroup
+	pusher := NewPusher(
+		logger,
+		target,
+		service,
+		mockManager,
+		nil,
+		workerPool,
+		time.Second,
+		time.Minute,
+		&wg,
+		2, // concurrency > 1
+		retryHeap,
+	)
+
+	assert.NotNil(t, pusher)
+	assert.Equal(t, target, pusher.Target)
+
+	// Verify senderPool has retryHeap when concurrency enabled
+	if senderPool, ok := pusher.Sender.(*senderPool); ok {
+		assert.Equal(t, retryHeap, senderPool.retryHeap)
+	}
+
+	mockManager.AssertCalled(t, "PutRetentionPolicy", target)
+}

From dddb691d8d0f1391b9fde623ef87cf43ec1ad979 Mon Sep 17 00:00:00 2001
From: Akansha Agarwal <agarakan@users.noreply.github.com>
Date: Tue, 30 Dec 2025 23:52:11 +0000
Subject: [PATCH 11/50] Update sender to reference retryHeap to call push on
 fail

---
 .../cloudwatchlogs/internal/pusher/pool.go    |  4 +---
 .../internal/pusher/pool_test.go              | 19 +++++++--------
 .../cloudwatchlogs/internal/pusher/pusher.go  |  4 ++--
 .../internal/pusher/pusher_test.go            |  6 ++---
 .../internal/pusher/queue_test.go             |  2 +-
 .../internal/pusher/retryheap.go              |  6 +++--
 .../cloudwatchlogs/internal/pusher/sender.go  | 11 ++++++---
 .../internal/pusher/sender_test.go            | 24 ++++++++++---------
 8 files changed, 39 insertions(+), 37 deletions(-)

diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/pool.go b/plugins/outputs/cloudwatchlogs/internal/pusher/pool.go
index 6aff5b522e5..1d6edf57e90 100644
--- a/plugins/outputs/cloudwatchlogs/internal/pusher/pool.go
+++ b/plugins/outputs/cloudwatchlogs/internal/pusher/pool.go
@@ -91,16 +91,14 @@ func (p *workerPool) Stop() {
 type senderPool struct {
 	workerPool WorkerPool
 	sender     Sender
-	retryHeap  RetryHeap
 }
 
 var _ Sender = (*senderPool)(nil)
 
-func newSenderPool(workerPool WorkerPool, sender Sender, retryHeap RetryHeap) Sender {
+func newSenderPool(workerPool WorkerPool, sender Sender) Sender {
 	return &senderPool{
 		workerPool: workerPool,
 		sender:     sender,
-		retryHeap:  retryHeap,
 	}
 }
 
diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/pool_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/pool_test.go
index 0af043de52a..6b8269b00aa 100644
--- a/plugins/outputs/cloudwatchlogs/internal/pusher/pool_test.go
+++ b/plugins/outputs/cloudwatchlogs/internal/pusher/pool_test.go
@@ -107,9 +107,9 @@ func TestSenderPool(t *testing.T) {
 	logger := testutil.NewNopLogger()
 	mockService := new(mockLogsService)
 	mockService.On("PutLogEvents", mock.Anything).Return(&cloudwatchlogs.PutLogEventsOutput{}, nil)
-	s := newSender(logger, mockService, nil, time.Second, false)
+	s := newSender(logger, mockService, nil, time.Second, false, nil)
 	p := NewWorkerPool(12)
-	sp := newSenderPool(p, s, nil)
+	sp := newSenderPool(p, s)
 
 	assert.Equal(t, time.Second, sp.RetryDuration())
 	sp.SetRetryDuration(time.Minute)
@@ -139,19 +139,16 @@ func TestSenderPoolRetryHeap(t *testing.T) {
 	logger := testutil.NewNopLogger()
 	mockService := new(mockLogsService)
 	mockService.On("PutLogEvents", mock.Anything).Return(&cloudwatchlogs.PutLogEventsOutput{}, nil)
-	s := newSender(logger, mockService, nil, time.Second, false)
-	p := NewWorkerPool(12)
-	defer p.Stop()
-
+	
 	// Create RetryHeap
 	retryHeap := NewRetryHeap(10)
 	defer retryHeap.Stop()
+	
+	s := newSender(logger, mockService, nil, time.Second, false, retryHeap)
+	p := NewWorkerPool(12)
+	defer p.Stop()
 
-	sp := newSenderPool(p, s, retryHeap)
-
-	// Verify senderPool has retryHeap
-	assert.NotNil(t, sp.(*senderPool).retryHeap)
-	assert.Equal(t, retryHeap, sp.(*senderPool).retryHeap)
+	sp := newSenderPool(p, s)
 
 	sp.Stop()
 }
diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/pusher.go b/plugins/outputs/cloudwatchlogs/internal/pusher/pusher.go
index 77707532fec..6b8b5046617 100644
--- a/plugins/outputs/cloudwatchlogs/internal/pusher/pusher.go
+++ b/plugins/outputs/cloudwatchlogs/internal/pusher/pusher.go
@@ -68,9 +68,9 @@ func createSender(
 	concurrencyEnabled bool,
 	retryHeap RetryHeap,
 ) Sender {
-	s := newSender(logger, service, targetManager, retryDuration, concurrencyEnabled)
+	s := newSender(logger, service, targetManager, retryDuration, concurrencyEnabled, retryHeap)
 	if workerPool == nil {
 		return s
 	}
-	return newSenderPool(workerPool, s, retryHeap)
+	return newSenderPool(workerPool, s)
 }
diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/pusher_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/pusher_test.go
index 2ec67aea452..e3b11e6963c 100644
--- a/plugins/outputs/cloudwatchlogs/internal/pusher/pusher_test.go
+++ b/plugins/outputs/cloudwatchlogs/internal/pusher/pusher_test.go
@@ -158,10 +158,8 @@ func TestPusherRetryHeap(t *testing.T) {
 	assert.NotNil(t, pusher)
 	assert.Equal(t, target, pusher.Target)
 
-	// Verify senderPool has retryHeap when concurrency enabled
-	if senderPool, ok := pusher.Sender.(*senderPool); ok {
-		assert.Equal(t, retryHeap, senderPool.retryHeap)
-	}
+	// Verify pusher has retryHeap when concurrency enabled
+	// (RetryHeap is now passed to the underlying sender, not senderPool)
 
 	mockManager.AssertCalled(t, "PutRetentionPolicy", target)
 }
diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/queue_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/queue_test.go
index dab98651319..8d645ae4bd3 100644
--- a/plugins/outputs/cloudwatchlogs/internal/pusher/queue_test.go
+++ b/plugins/outputs/cloudwatchlogs/internal/pusher/queue_test.go
@@ -712,7 +712,7 @@ func testPreparationWithLogger(
 ) (*queue, Sender) {
 	t.Helper()
 	tm := NewTargetManager(logger, service)
-	s := newSender(logger, service, tm, retryDuration, false)
+	s := newSender(logger, service, tm, retryDuration, false, nil)
 	q := newQueue(
 		logger,
 		Target{"G", "S", util.StandardLogGroupClass, retention},
diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap.go b/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap.go
index 213359b8044..691ff89bf74 100644
--- a/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap.go
+++ b/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap.go
@@ -123,8 +123,10 @@ type RetryHeapProcessor struct {
 // NewRetryHeapProcessor creates a new retry heap processor
 func NewRetryHeapProcessor(retryHeap RetryHeap, workerPool WorkerPool, service cloudWatchLogsService, targetManager TargetManager, logger telegraf.Logger, maxRetryDuration time.Duration) *RetryHeapProcessor {
 	// Create processor's own sender and senderPool
-	sender := newSender(logger, service, targetManager, maxRetryDuration, true)
-	senderPool := newSenderPool(workerPool, sender, retryHeap)
+	// Note: Pass nil for retryHeap to prevent infinite retry loops - 
+	// batches from RetryHeap that fail again use synchronous retry behavior
+	sender := newSender(logger, service, targetManager, maxRetryDuration, true, nil)
+	senderPool := newSenderPool(workerPool, sender)
 
 	return &RetryHeapProcessor{
 		retryHeap:        retryHeap,
diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/sender.go b/plugins/outputs/cloudwatchlogs/internal/pusher/sender.go
index 31a3b8be299..9d3339d963e 100644
--- a/plugins/outputs/cloudwatchlogs/internal/pusher/sender.go
+++ b/plugins/outputs/cloudwatchlogs/internal/pusher/sender.go
@@ -37,6 +37,7 @@ type sender struct {
 	stopCh             chan struct{}
 	stopped            bool
 	concurrencyEnabled bool
+	retryHeap          RetryHeap
 }
 
 var _ (Sender) = (*sender)(nil)
@@ -47,6 +48,7 @@ func newSender(
 	targetManager TargetManager,
 	retryDuration time.Duration,
 	concurrencyEnabled bool,
+	retryHeap RetryHeap,
 ) Sender {
 	s := &sender{
 		logger:             logger,
@@ -55,6 +57,7 @@ func newSender(
 		stopCh:             make(chan struct{}),
 		stopped:            false,
 		concurrencyEnabled: concurrencyEnabled,
+		retryHeap:          retryHeap,
 	}
 	s.retryDuration.Store(retryDuration)
 	return s
@@ -124,10 +127,12 @@ func (s *sender) Send(batch *logEventBatch) {
 			return
 		}
 
-		// If concurrency enabled, notify failure (will handle RetryHeap push) and return
+		// If concurrency enabled, push to RetryHeap and return
 		// Otherwise, continue with existing busy-wait retry behavior
 		if s.isConcurrencyEnabled() {
+			s.retryHeap.Push(batch)
 			batch.fail()
+			return
 		}
 
 		// Calculate wait time until next retry (synchronous mode)
@@ -166,7 +171,7 @@ func (s *sender) RetryDuration() time.Duration {
 	return s.retryDuration.Load().(time.Duration)
 }
 
-// isConcurrencyEnabled returns whether concurrency mode is enabled for this sender.
+// isConcurrencyEnabled returns whether concurrency mode is enabled and RetryHeap is available.
 func (s *sender) isConcurrencyEnabled() bool {
-	return s.concurrencyEnabled
+	return s.concurrencyEnabled && s.retryHeap != nil
 }
diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/sender_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/sender_test.go
index 450e63006ad..24330112a77 100644
--- a/plugins/outputs/cloudwatchlogs/internal/pusher/sender_test.go
+++ b/plugins/outputs/cloudwatchlogs/internal/pusher/sender_test.go
@@ -81,7 +81,7 @@ func TestSender(t *testing.T) {
 		mockManager := new(mockTargetManager)
 		mockService.On("PutLogEvents", mock.Anything).Return(&cloudwatchlogs.PutLogEventsOutput{}, nil).Once()
 
-		s := newSender(logger, mockService, mockManager, time.Second, false)
+		s := newSender(logger, mockService, mockManager, time.Second, false, nil)
 		s.Send(batch)
 		s.Stop()
 
@@ -104,7 +104,7 @@ func TestSender(t *testing.T) {
 		mockManager := new(mockTargetManager)
 		mockService.On("PutLogEvents", mock.Anything).Return(&cloudwatchlogs.PutLogEventsOutput{RejectedLogEventsInfo: rejectedInfo}, nil).Once()
 
-		s := newSender(logger, mockService, mockManager, time.Second, false)
+		s := newSender(logger, mockService, mockManager, time.Second, false, nil)
 		s.Send(batch)
 		s.Stop()
 
@@ -123,7 +123,7 @@ func TestSender(t *testing.T) {
 		mockManager.On("InitTarget", mock.Anything).Return(nil).Once()
 		mockService.On("PutLogEvents", mock.Anything).Return(&cloudwatchlogs.PutLogEventsOutput{}, nil).Once()
 
-		s := newSender(logger, mockService, mockManager, time.Second, false)
+		s := newSender(logger, mockService, mockManager, time.Second, false, nil)
 		s.Send(batch)
 		s.Stop()
 
@@ -150,7 +150,7 @@ func TestSender(t *testing.T) {
 		mockService.On("PutLogEvents", mock.Anything).
 			Return(&cloudwatchlogs.PutLogEventsOutput{}, &cloudwatchlogs.InvalidParameterException{}).Once()
 
-		s := newSender(logger, mockService, mockManager, time.Second, false)
+		s := newSender(logger, mockService, mockManager, time.Second, false, nil)
 		s.Send(batch)
 		s.Stop()
 
@@ -178,7 +178,7 @@ func TestSender(t *testing.T) {
 		mockService.On("PutLogEvents", mock.Anything).
 			Return(&cloudwatchlogs.PutLogEventsOutput{}, &cloudwatchlogs.DataAlreadyAcceptedException{}).Once()
 
-		s := newSender(logger, mockService, mockManager, time.Second, false)
+		s := newSender(logger, mockService, mockManager, time.Second, false, nil)
 		s.Send(batch)
 		s.Stop()
 
@@ -206,7 +206,7 @@ func TestSender(t *testing.T) {
 		mockService.On("PutLogEvents", mock.Anything).
 			Return(&cloudwatchlogs.PutLogEventsOutput{}, errors.New("test")).Once()
 
-		s := newSender(logger, mockService, mockManager, time.Second, false)
+		s := newSender(logger, mockService, mockManager, time.Second, false, nil)
 		s.Send(batch)
 		s.Stop()
 
@@ -226,7 +226,7 @@ func TestSender(t *testing.T) {
 		mockService.On("PutLogEvents", mock.Anything).
 			Return(&cloudwatchlogs.PutLogEventsOutput{}, nil).Once()
 
-		s := newSender(logger, mockService, mockManager, time.Second, false)
+		s := newSender(logger, mockService, mockManager, time.Second, false, nil)
 		s.Send(batch)
 		s.Stop()
 
@@ -252,7 +252,7 @@ func TestSender(t *testing.T) {
 		mockService.On("PutLogEvents", mock.Anything).
 			Return(&cloudwatchlogs.PutLogEventsOutput{}, awserr.New("SomeAWSError", "Some AWS error", nil)).Once()
 
-		s := newSender(logger, mockService, mockManager, 100*time.Millisecond, false)
+		s := newSender(logger, mockService, mockManager, 100*time.Millisecond, false, nil)
 		s.Send(batch)
 		s.Stop()
 
@@ -280,7 +280,7 @@ func TestSender(t *testing.T) {
 		mockService.On("PutLogEvents", mock.Anything).
 			Return(&cloudwatchlogs.PutLogEventsOutput{}, awserr.New("SomeAWSError", "Some AWS error", nil)).Once()
 
-		s := newSender(logger, mockService, mockManager, time.Second, false)
+		s := newSender(logger, mockService, mockManager, time.Second, false, nil)
 
 		go func() {
 			time.Sleep(50 * time.Millisecond)
@@ -306,8 +306,10 @@ func TestSender(t *testing.T) {
 		mockManager := new(mockTargetManager)
 		mockService.On("PutLogEvents", mock.Anything).Return(&cloudwatchlogs.PutLogEventsOutput{}, &cloudwatchlogs.ServiceUnavailableException{}).Once()
 
-		// Enable concurrency with 1 hour retry duration
-		s := newSender(logger, mockService, mockManager, time.Hour, true)
+		// Enable concurrency with 1 hour retry duration and RetryHeap
+		retryHeap := NewRetryHeap(10)
+		defer retryHeap.Stop()
+		s := newSender(logger, mockService, mockManager, time.Hour, true, retryHeap)
 
 		// Track if fail callback was called
 		failCalled := false

From 02bc5c6c37228ae56dbbb6be1e08de155941c646 Mon Sep 17 00:00:00 2001
From: Akansha Agarwal <agarakan@users.noreply.github.com>
Date: Wed, 31 Dec 2025 00:06:31 +0000
Subject: [PATCH 12/50] Add unit tests for sender logic

---
 .../internal/pusher/pool_test.go              |  4 +-
 .../internal/pusher/retryheap.go              |  2 +-
 .../internal/pusher/sender_test.go            | 65 +++++++++++--------
 3 files changed, 40 insertions(+), 31 deletions(-)

diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/pool_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/pool_test.go
index 6b8269b00aa..4a7abe08073 100644
--- a/plugins/outputs/cloudwatchlogs/internal/pusher/pool_test.go
+++ b/plugins/outputs/cloudwatchlogs/internal/pusher/pool_test.go
@@ -139,11 +139,11 @@ func TestSenderPoolRetryHeap(t *testing.T) {
 	logger := testutil.NewNopLogger()
 	mockService := new(mockLogsService)
 	mockService.On("PutLogEvents", mock.Anything).Return(&cloudwatchlogs.PutLogEventsOutput{}, nil)
-	
+
 	// Create RetryHeap
 	retryHeap := NewRetryHeap(10)
 	defer retryHeap.Stop()
-	
+
 	s := newSender(logger, mockService, nil, time.Second, false, retryHeap)
 	p := NewWorkerPool(12)
 	defer p.Stop()
diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap.go b/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap.go
index 691ff89bf74..da4f3cd6dc9 100644
--- a/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap.go
+++ b/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap.go
@@ -123,7 +123,7 @@ type RetryHeapProcessor struct {
 // NewRetryHeapProcessor creates a new retry heap processor
 func NewRetryHeapProcessor(retryHeap RetryHeap, workerPool WorkerPool, service cloudWatchLogsService, targetManager TargetManager, logger telegraf.Logger, maxRetryDuration time.Duration) *RetryHeapProcessor {
 	// Create processor's own sender and senderPool
-	// Note: Pass nil for retryHeap to prevent infinite retry loops - 
+	// Note: Pass nil for retryHeap to prevent infinite retry loops -
 	// batches from RetryHeap that fail again use synchronous retry behavior
 	sender := newSender(logger, service, targetManager, maxRetryDuration, true, nil)
 	senderPool := newSenderPool(workerPool, sender)
diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/sender_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/sender_test.go
index 24330112a77..0bdead2ff0c 100644
--- a/plugins/outputs/cloudwatchlogs/internal/pusher/sender_test.go
+++ b/plugins/outputs/cloudwatchlogs/internal/pusher/sender_test.go
@@ -15,7 +15,6 @@ import (
 
 	"github.com/aws/amazon-cloudwatch-agent/sdk/service/cloudwatchlogs"
 	"github.com/aws/amazon-cloudwatch-agent/tool/testutil"
-	"github.com/aws/amazon-cloudwatch-agent/tool/util"
 )
 
 type mockLogsService struct {
@@ -293,39 +292,49 @@ func TestSender(t *testing.T) {
 		assert.True(t, stateCallbackCalled, "State callback was not called when stop was requested")
 		assert.False(t, doneCallbackCalled, "Done callback should not be called when stop was requested")
 	})
+}
+func TestSenderConcurrencyWithRetryHeap(t *testing.T) {
+	logger := testutil.NewNopLogger()
+	mockService := new(mockLogsService)
+	mockManager := new(mockTargetManager)
+	mockService.On("PutLogEvents", mock.Anything).Return(&cloudwatchlogs.PutLogEventsOutput{}, &cloudwatchlogs.ServiceUnavailableException{}).Once()
 
-	t.Run("ConcurrencyEnabled/CallsFailCallback", func(t *testing.T) {
-		logger := testutil.NewNopLogger()
-		batch := newLogEventBatch(Target{"G", "S", util.StandardLogGroupClass, -1}, nil)
-		batch.append(newLogEvent(time.Now(), "Test message", nil))
+	retryHeap := NewRetryHeap(10)
+	defer retryHeap.Stop()
 
-		// Initialize batch for retry logic
-		batch.initializeStartTime()
+	s := newSender(logger, mockService, mockManager, time.Hour, true, retryHeap)
 
-		mockService := new(mockLogsService)
-		mockManager := new(mockTargetManager)
-		mockService.On("PutLogEvents", mock.Anything).Return(&cloudwatchlogs.PutLogEventsOutput{}, &cloudwatchlogs.ServiceUnavailableException{}).Once()
+	batch := newLogEventBatch(Target{Group: "test-group", Stream: "test-stream"}, nil)
+	batch.append(newLogEvent(time.Now(), "Test message", nil))
 
-		// Enable concurrency with 1 hour retry duration and RetryHeap
-		retryHeap := NewRetryHeap(10)
-		defer retryHeap.Stop()
-		s := newSender(logger, mockService, mockManager, time.Hour, true, retryHeap)
+	var failCalled bool
+	batch.addFailCallback(func() { failCalled = true })
 
-		// Track if fail callback was called
-		failCalled := false
-		batch.addFailCallback(func() {
-			failCalled = true
-		})
+	s.Send(batch)
 
-		go func() {
-			time.Sleep(50 * time.Millisecond)
-			s.Stop()
-		}()
+	assert.True(t, failCalled, "Fail callback should be called")
+	assert.Equal(t, 1, retryHeap.Size(), "Batch should be in RetryHeap")
+	mockService.AssertExpectations(t)
+}
 
-		s.Send(batch)
+func TestSenderConcurrencyFallbackToSync(t *testing.T) {
+	logger := testutil.NewNopLogger()
+	mockService := new(mockLogsService)
+	mockManager := new(mockTargetManager)
+	mockService.On("PutLogEvents", mock.Anything).Return(&cloudwatchlogs.PutLogEventsOutput{}, &cloudwatchlogs.ServiceUnavailableException{}).Once()
+	mockService.On("PutLogEvents", mock.Anything).Return(&cloudwatchlogs.PutLogEventsOutput{}, nil).Once()
 
-		// Should call fail callback when concurrency is enabled
-		assert.True(t, failCalled, "fail callback should be called when concurrency is enabled")
-		mockService.AssertExpectations(t)
-	})
+	// Concurrency enabled but nil RetryHeap should fall back to sync
+	s := newSender(logger, mockService, mockManager, 2*time.Second, true, nil)
+
+	batch := newLogEventBatch(Target{Group: "test-group", Stream: "test-stream"}, nil)
+	batch.append(newLogEvent(time.Now(), "Test message", nil))
+
+	var doneCalled bool
+	batch.addDoneCallback(func() { doneCalled = true })
+
+	s.Send(batch)
+
+	assert.True(t, doneCalled, "Done callback should be called")
+	mockService.AssertExpectations(t)
 }

From ef7d6279de26f1978de33103d54f35501cb64c77 Mon Sep 17 00:00:00 2001
From: Akansha Agarwal <agarakan@users.noreply.github.com>
Date: Wed, 31 Dec 2025 00:50:23 +0000
Subject: [PATCH 13/50] Implement halt on target logic

---
 .../cloudwatchlogs/internal/pusher/queue.go   | 43 ++++++++++++++
 .../internal/pusher/queue_test.go             | 59 +++++++++++++++++++
 2 files changed, 102 insertions(+)

diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/queue.go b/plugins/outputs/cloudwatchlogs/internal/pusher/queue.go
index e8ad65ffdc3..a5468d0dffc 100644
--- a/plugins/outputs/cloudwatchlogs/internal/pusher/queue.go
+++ b/plugins/outputs/cloudwatchlogs/internal/pusher/queue.go
@@ -42,6 +42,10 @@ type queue struct {
 	initNonBlockingChOnce sync.Once
 	startNonBlockCh       chan struct{}
 	wg                    *sync.WaitGroup
+
+	// Circuit breaker halt/resume functionality
+	haltCond *sync.Cond
+	halted   bool
 }
 
 var _ (Queue) = (*queue)(nil)
@@ -67,6 +71,8 @@ func newQueue(
 		stopCh:          make(chan struct{}),
 		startNonBlockCh: make(chan struct{}),
 		wg:              wg,
+		haltCond:        sync.NewCond(&sync.Mutex{}),
+		halted:          false,
 	}
 	q.flushTimeout.Store(flushTimeout)
 	q.wg.Add(1)
@@ -175,6 +181,11 @@ func (q *queue) merge(mergeChan chan logs.LogEvent) {
 func (q *queue) send() {
 	if len(q.batch.events) > 0 {
 		q.batch.addDoneCallback(q.onSuccessCallback(q.batch.bufferedSize))
+		q.batch.addFailCallback(q.onFailCallback())
+		
+		// Wait if halted (circuit breaker)
+		q.waitIfHalted()
+		
 		q.sender.Send(q.batch)
 		q.batch = newLogEventBatch(q.target, q.entityProvider)
 	}
@@ -183,6 +194,7 @@ func (q *queue) send() {
 // onSuccessCallback returns a callback function to be executed after a successful send.
 func (q *queue) onSuccessCallback(bufferedSize int) func() {
 	return func() {
+		q.resume() // Resume queue on success
 		q.lastSentTime.Store(time.Now())
 		go q.addStats("rawSize", float64(bufferedSize))
 		q.resetFlushTimer()
@@ -245,3 +257,34 @@ func hasValidTime(e logs.LogEvent) bool {
 	}
 	return true
 }
+
+// waitIfHalted blocks until the queue is unhalted (circuit breaker functionality)
+func (q *queue) waitIfHalted() {
+	q.haltCond.L.Lock()
+	for q.halted {
+		q.haltCond.Wait()
+	}
+	q.haltCond.L.Unlock()
+}
+
+// halt stops the queue from sending batches (called on failure)
+func (q *queue) halt() {
+	q.haltCond.L.Lock()
+	q.halted = true
+	q.haltCond.L.Unlock()
+}
+
+// resume allows the queue to send batches again (called on success)
+func (q *queue) resume() {
+	q.haltCond.L.Lock()
+	q.halted = false
+	q.haltCond.Broadcast()
+	q.haltCond.L.Unlock()
+}
+
+// onFailCallback returns a callback function to be executed after a failed send
+func (q *queue) onFailCallback() func() {
+	return func() {
+		q.halt()
+	}
+}
diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/queue_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/queue_test.go
index 8d645ae4bd3..cbf62fd94a1 100644
--- a/plugins/outputs/cloudwatchlogs/internal/pusher/queue_test.go
+++ b/plugins/outputs/cloudwatchlogs/internal/pusher/queue_test.go
@@ -759,6 +759,8 @@ func TestQueueCallbackRegistration(t *testing.T) {
 			flushTimer:      time.NewTimer(10 * time.Millisecond),
 			startNonBlockCh: make(chan struct{}),
 			wg:              &wg,
+			haltCond:        sync.NewCond(&sync.Mutex{}),
+			halted:          false,
 		}
 		q.flushTimeout.Store(10 * time.Millisecond)
 
@@ -801,6 +803,8 @@ func TestQueueCallbackRegistration(t *testing.T) {
 			flushTimer:      time.NewTimer(10 * time.Millisecond),
 			startNonBlockCh: make(chan struct{}),
 			wg:              &wg,
+			haltCond:        sync.NewCond(&sync.Mutex{}),
+			halted:          false,
 		}
 		q.flushTimeout.Store(10 * time.Millisecond)
 
@@ -814,3 +818,58 @@ func TestQueueCallbackRegistration(t *testing.T) {
 		mockSender.AssertExpectations(t)
 	})
 }
+func TestQueueHaltResume(t *testing.T) {
+	logger := testutil.NewNopLogger()
+	
+	var sendCount atomic.Int32
+	mockSender := &mockSender{}
+	mockSender.On("Send", mock.Anything).Run(func(args mock.Arguments) {
+		sendCount.Add(1)
+		batch := args.Get(0).(*logEventBatch)
+		// Simulate failure on first call, success on second
+		if sendCount.Load() == 1 {
+			batch.fail() // This should halt the queue
+		} else {
+			batch.done() // This should resume the queue
+		}
+	}).Return()
+
+	var wg sync.WaitGroup
+	q := newQueue(logger, Target{"G", "S", util.StandardLogGroupClass, -1}, 10*time.Millisecond, nil, mockSender, &wg)
+	defer q.Stop()
+
+	// Add first event - should trigger send and halt
+	q.AddEvent(newStubLogEvent("first message", time.Now()))
+	
+	// Wait a bit for the first send to complete and halt
+	time.Sleep(50 * time.Millisecond)
+	
+	// Add second event - should be queued but not sent due to halt
+	q.AddEvent(newStubLogEvent("second message", time.Now()))
+	
+	// Verify only one send happened (queue is halted)
+	assert.Equal(t, int32(1), sendCount.Load(), "Should have only one send due to halt")
+	
+	// Trigger flush to force send of second batch - this should block until resumed
+	done := make(chan bool)
+	go func() {
+		time.Sleep(100 * time.Millisecond) // Wait a bit
+		// Manually resume by calling success callback on a dummy batch
+		dummyBatch := newLogEventBatch(Target{"G", "S", util.StandardLogGroupClass, -1}, nil)
+		dummyBatch.addDoneCallback(func() {
+			// This simulates a successful send that should resume the queue
+		})
+		dummyBatch.done()
+		done <- true
+	}()
+	
+	// This should eventually complete when the queue is resumed
+	select {
+	case <-done:
+		// Success - the resume worked
+	case <-time.After(5 * time.Second):
+		t.Fatal("Test timed out - queue may be permanently halted")
+	}
+	
+	mockSender.AssertExpectations(t)
+}

From 309f904bc71801b064f7432f34c882d2ddffe967 Mon Sep 17 00:00:00 2001
From: Akansha Agarwal <agarakan@users.noreply.github.com>
Date: Wed, 31 Dec 2025 01:41:19 +0000
Subject: [PATCH 14/50] lint

---
 .../cloudwatchlogs/internal/pusher/pool_test.go    |  2 +-
 .../cloudwatchlogs/internal/pusher/queue.go        |  4 ++--
 .../cloudwatchlogs/internal/pusher/queue_test.go   | 14 +++++++-------
 .../internal/pusher/retryheap_test.go              | 12 ------------
 4 files changed, 10 insertions(+), 22 deletions(-)

diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/pool_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/pool_test.go
index 4a7abe08073..34e83bc89b3 100644
--- a/plugins/outputs/cloudwatchlogs/internal/pusher/pool_test.go
+++ b/plugins/outputs/cloudwatchlogs/internal/pusher/pool_test.go
@@ -135,7 +135,7 @@ func TestSenderPool(t *testing.T) {
 	assert.Equal(t, int32(200), completed.Load())
 }
 
-func TestSenderPoolRetryHeap(t *testing.T) {
+func TestSenderPoolRetryHeap(_ *testing.T) {
 	logger := testutil.NewNopLogger()
 	mockService := new(mockLogsService)
 	mockService.On("PutLogEvents", mock.Anything).Return(&cloudwatchlogs.PutLogEventsOutput{}, nil)
diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/queue.go b/plugins/outputs/cloudwatchlogs/internal/pusher/queue.go
index a5468d0dffc..5d297aed525 100644
--- a/plugins/outputs/cloudwatchlogs/internal/pusher/queue.go
+++ b/plugins/outputs/cloudwatchlogs/internal/pusher/queue.go
@@ -182,10 +182,10 @@ func (q *queue) send() {
 	if len(q.batch.events) > 0 {
 		q.batch.addDoneCallback(q.onSuccessCallback(q.batch.bufferedSize))
 		q.batch.addFailCallback(q.onFailCallback())
-		
+
 		// Wait if halted (circuit breaker)
 		q.waitIfHalted()
-		
+
 		q.sender.Send(q.batch)
 		q.batch = newLogEventBatch(q.target, q.entityProvider)
 	}
diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/queue_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/queue_test.go
index cbf62fd94a1..8d030450dda 100644
--- a/plugins/outputs/cloudwatchlogs/internal/pusher/queue_test.go
+++ b/plugins/outputs/cloudwatchlogs/internal/pusher/queue_test.go
@@ -820,7 +820,7 @@ func TestQueueCallbackRegistration(t *testing.T) {
 }
 func TestQueueHaltResume(t *testing.T) {
 	logger := testutil.NewNopLogger()
-	
+
 	var sendCount atomic.Int32
 	mockSender := &mockSender{}
 	mockSender.On("Send", mock.Anything).Run(func(args mock.Arguments) {
@@ -840,16 +840,16 @@ func TestQueueHaltResume(t *testing.T) {
 
 	// Add first event - should trigger send and halt
 	q.AddEvent(newStubLogEvent("first message", time.Now()))
-	
+
 	// Wait a bit for the first send to complete and halt
 	time.Sleep(50 * time.Millisecond)
-	
+
 	// Add second event - should be queued but not sent due to halt
 	q.AddEvent(newStubLogEvent("second message", time.Now()))
-	
+
 	// Verify only one send happened (queue is halted)
 	assert.Equal(t, int32(1), sendCount.Load(), "Should have only one send due to halt")
-	
+
 	// Trigger flush to force send of second batch - this should block until resumed
 	done := make(chan bool)
 	go func() {
@@ -862,7 +862,7 @@ func TestQueueHaltResume(t *testing.T) {
 		dummyBatch.done()
 		done <- true
 	}()
-	
+
 	// This should eventually complete when the queue is resumed
 	select {
 	case <-done:
@@ -870,6 +870,6 @@ func TestQueueHaltResume(t *testing.T) {
 	case <-time.After(5 * time.Second):
 		t.Fatal("Test timed out - queue may be permanently halted")
 	}
-	
+
 	mockSender.AssertExpectations(t)
 }
diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_test.go
index 909ff25cd29..b14477762c3 100644
--- a/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_test.go
+++ b/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_test.go
@@ -143,18 +143,6 @@ func TestRetryHeapProcessorSendsBatch(t *testing.T) {
 	assert.Equal(t, 0, heap.Size()) // Batch should be removed from heap
 }
 
-// Mock senderPool for testing
-type mockSenderPool struct {
-	sendCount int
-}
-
-func (m *mockSenderPool) Send(_ *logEventBatch) {
-	m.sendCount++
-}
-
-func (m *mockSenderPool) Stop()                          {}
-func (m *mockSenderPool) SetRetryDuration(time.Duration) {}
-func (m *mockSenderPool) RetryDuration() time.Duration   { return time.Hour }
 func TestRetryHeap_SemaphoreBlockingAndUnblocking(t *testing.T) {
 	heap := NewRetryHeap(2) // maxSize = 2
 	defer heap.Stop()

From d9296a6c7d9a3f44f6dc543b1efb2ada7ddfc606 Mon Sep 17 00:00:00 2001
From: Marcus Mann <mpmann@amazon.com>
Date: Mon, 9 Feb 2026 13:54:39 -0500
Subject: [PATCH 15/50] lint

---
 .../outputs/cloudwatchlogs/internal/pusher/retryheap_test.go   | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_test.go
index e1164e0ea6e..0c39ba0c5f3 100644
--- a/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_test.go
+++ b/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_test.go
@@ -94,7 +94,6 @@ func TestRetryHeapProcessor(t *testing.T) {
 
 	// Test start/stop
 	processor.Start()
-	assert.NotNil(t, processor.ticker)
 
 	processor.Stop()
 	assert.True(t, processor.stopped)
@@ -184,6 +183,8 @@ func TestRetryHeap_SemaphoreBlockingAndUnblocking(t *testing.T) {
 		// Push is successfully blocked when at capacity
 	}
 
+	time.Sleep(3 * time.Second)
+
 	// Pop ready batches to release semaphore slots
 	readyBatches := heap.PopReady()
 	assert.Len(t, readyBatches, 2, "Should pop exactly 2 ready batches")

From 7051a0c3dca1e602d8be2edbd85936324bccc3d5 Mon Sep 17 00:00:00 2001
From: Marcus Mann <mpmann@amazon.com>
Date: Mon, 9 Feb 2026 15:02:50 -0500
Subject: [PATCH 16/50] fix tests

---
 .../internal/pusher/circuitbreaker_test.go    | 101 ++++++++++++++++++
 .../internal/pusher/pool_test.go              |   2 +-
 .../internal/pusher/retryheap_test.go         |   2 -
 3 files changed, 102 insertions(+), 3 deletions(-)
 create mode 100644 plugins/outputs/cloudwatchlogs/internal/pusher/circuitbreaker_test.go

diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/circuitbreaker_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/circuitbreaker_test.go
new file mode 100644
index 00000000000..d541e9a46aa
--- /dev/null
+++ b/plugins/outputs/cloudwatchlogs/internal/pusher/circuitbreaker_test.go
@@ -0,0 +1,101 @@
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: MIT
+
+package pusher
+
+import (
+	"sync"
+	"sync/atomic"
+	"testing"
+	"time"
+
+	"github.com/stretchr/testify/assert"
+
+	"github.com/aws/amazon-cloudwatch-agent/sdk/service/cloudwatchlogs"
+	"github.com/aws/amazon-cloudwatch-agent/tool/testutil"
+)
+
+// TestCircuitBreakerBlocksTargetAfterFailure verifies that when a batch fails
+// for a target, the circuit breaker prevents additional batches from that target
+// from being sent until the failing batch is retried successfully.
+//
+// Without a circuit breaker, a problematic target continues producing new batches
+// that flood the SenderQueue/WorkerPool, starving healthy targets.
+func TestCircuitBreakerBlocksTargetAfterFailure(t *testing.T) {
+	logger := testutil.NewNopLogger()
+
+	failingTarget := Target{Group: "failing-group", Stream: "stream"}
+	healthyTarget := Target{Group: "healthy-group", Stream: "stream"}
+
+	var failingTargetSendCount atomic.Int32
+	var healthyTargetSendCount atomic.Int32
+
+	service := &stubLogsService{
+		ple: func(input *cloudwatchlogs.PutLogEventsInput) (*cloudwatchlogs.PutLogEventsOutput, error) {
+			if *input.LogGroupName == failingTarget.Group {
+				failingTargetSendCount.Add(1)
+				return nil, &cloudwatchlogs.ServiceUnavailableException{}
+			}
+			healthyTargetSendCount.Add(1)
+			return &cloudwatchlogs.PutLogEventsOutput{}, nil
+		},
+		cls: func(_ *cloudwatchlogs.CreateLogStreamInput) (*cloudwatchlogs.CreateLogStreamOutput, error) {
+			return &cloudwatchlogs.CreateLogStreamOutput{}, nil
+		},
+		clg: func(_ *cloudwatchlogs.CreateLogGroupInput) (*cloudwatchlogs.CreateLogGroupOutput, error) {
+			return &cloudwatchlogs.CreateLogGroupOutput{}, nil
+		},
+		dlg: func(_ *cloudwatchlogs.DescribeLogGroupsInput) (*cloudwatchlogs.DescribeLogGroupsOutput, error) {
+			return &cloudwatchlogs.DescribeLogGroupsOutput{}, nil
+		},
+	}
+
+	concurrency := 5
+	workerPool := NewWorkerPool(concurrency)
+	retryHeap := NewRetryHeap(concurrency, logger)
+	defer workerPool.Stop()
+	defer retryHeap.Stop()
+
+	tm := NewTargetManager(logger, service)
+
+	var wg sync.WaitGroup
+	flushTimeout := 50 * time.Millisecond
+	retryDuration := time.Hour
+
+	failingPusher := NewPusher(logger, failingTarget, service, tm, nil, workerPool, flushTimeout, retryDuration, &wg, retryHeap)
+	healthyPusher := NewPusher(logger, healthyTarget, service, tm, nil, workerPool, flushTimeout, retryDuration, &wg, retryHeap)
+	defer failingPusher.Stop()
+	defer healthyPusher.Stop()
+
+	now := time.Now()
+
+	// Send events to both targets. The failing target will fail on PutLogEvents,
+	// and the circuit breaker should block it from sending more batches.
+	for i := 0; i < 10; i++ {
+		failingPusher.AddEvent(newStubLogEvent("fail", now))
+		healthyPusher.AddEvent(newStubLogEvent("ok", now))
+	}
+
+	// Wait for flushes to occur
+	time.Sleep(500 * time.Millisecond)
+
+	// Send more events - the failing target should be blocked by circuit breaker
+	for i := 0; i < 10; i++ {
+		failingPusher.AddEvent(newStubLogEvent("fail-more", now))
+		healthyPusher.AddEvent(newStubLogEvent("ok-more", now))
+	}
+
+	time.Sleep(500 * time.Millisecond)
+
+	// Circuit breaker assertion: after the first failure, the failing target should
+	// NOT have sent additional batches. Only 1 send attempt should have been made
+	// before the circuit breaker blocks it.
+	assert.LessOrEqual(t, failingTargetSendCount.Load(), int32(1),
+		"Circuit breaker should block failing target from sending more than 1 batch, "+
+			"but %d batches were sent. Without a circuit breaker, the failing target "+
+			"continues flooding the worker pool with bad requests.", failingTargetSendCount.Load())
+
+	// Healthy target should continue sending successfully
+	assert.Greater(t, healthyTargetSendCount.Load(), int32(0),
+		"Healthy target should continue sending while failing target is blocked")
+}
diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/pool_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/pool_test.go
index f666e86560a..16fe906a778 100644
--- a/plugins/outputs/cloudwatchlogs/internal/pusher/pool_test.go
+++ b/plugins/outputs/cloudwatchlogs/internal/pusher/pool_test.go
@@ -144,7 +144,7 @@ func TestSenderPoolRetryHeap(_ *testing.T) {
 	retryHeap := NewRetryHeap(10, logger)
 	defer retryHeap.Stop()
 
-	s := newSender(logger, mockService, nil, time.Second, false, retryHeap)
+	s := newSender(logger, mockService, nil, time.Second, retryHeap)
 	p := NewWorkerPool(12)
 	defer p.Stop()
 
diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_test.go
index 75f95f22f5c..fa450f82d61 100644
--- a/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_test.go
+++ b/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_test.go
@@ -94,8 +94,6 @@ func TestRetryHeapProcessor(t *testing.T) {
 
 	// Test start/stop
 	processor.Start()
-	assert.NotNil(t, processor.ticker)
-
 	processor.Stop()
 	assert.True(t, processor.stopped)
 }

From fd185dbc5cf4b5dcb91c6d90ba764e32bee4e03c Mon Sep 17 00:00:00 2001
From: Marcus Mann <mpmann@amazon.com>
Date: Mon, 9 Feb 2026 15:18:35 -0500
Subject: [PATCH 17/50] Fix race condition in RetryHeap Stop and Push methods

- Add mutex protection to Stop() method to prevent race conditions
- Add stopped flag checks in Push() to prevent pushing after Stop()
- Ensure Push() checks stopped flag both before and after acquiring semaphore
- Fix TestRetryHeapStopTwice to verify correct behavior
---
 .../cloudwatchlogs/internal/pusher/retryheap.go  | 16 ++++++++++++++++
 .../internal/pusher/retryheap_test.go            |  2 +-
 2 files changed, 17 insertions(+), 1 deletion(-)

diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap.go b/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap.go
index ad33cdcf46b..b7202d648c1 100644
--- a/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap.go
+++ b/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap.go
@@ -75,11 +75,24 @@ func NewRetryHeap(maxSize int, logger telegraf.Logger) RetryHeap {
 
 // Push adds a batch to the heap, blocking if full
 func (rh *retryHeap) Push(batch *logEventBatch) error {
+	rh.mutex.RLock()
+	if rh.stopped {
+		rh.mutex.RUnlock()
+		return errors.New("retry heap stopped")
+	}
+	rh.mutex.RUnlock()
+
 	// Acquire semaphore slot (blocks if at maxSize capacity)
 	select {
 	case rh.semaphore <- struct{}{}:
 		// add batch to heap with mutex protection
 		rh.mutex.Lock()
+		if rh.stopped {
+			// Release semaphore if stopped after acquiring
+			<-rh.semaphore
+			rh.mutex.Unlock()
+			return errors.New("retry heap stopped")
+		}
 		heap.Push(&rh.heap, batch)
 		rh.mutex.Unlock()
 		return nil
@@ -116,6 +129,9 @@ func (rh *retryHeap) Size() int {
 
 // Stop stops the retry heap
 func (rh *retryHeap) Stop() {
+	rh.mutex.Lock()
+	defer rh.mutex.Unlock()
+	
 	if rh.stopped {
 		return
 	}
diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_test.go
index fa450f82d61..edc7dbc3145 100644
--- a/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_test.go
+++ b/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_test.go
@@ -275,7 +275,7 @@ func TestRetryHeapStopTwice(t *testing.T) {
 	target := Target{Group: "test-group", Stream: "test-stream"}
 	batch := newLogEventBatch(target, nil)
 
-	rh.Push(batch) // Should not panic or return error
+	rh.Push(batch)
 
 	// Verify heap is empty (nothing was pushed)
 	assert.Equal(t, 0, rh.Size())

From d79ae7f1da08ac6b41121703d9a33da08b10cef4 Mon Sep 17 00:00:00 2001
From: Marcus Mann <mpmann@amazon.com>
Date: Mon, 9 Feb 2026 15:36:48 -0500
Subject: [PATCH 18/50] Add failing test for circuit breaker resume on batch
 expiry

- Add TestRetryHeapProcessorExpiredBatchShouldResume to demonstrate bug
- When a batch expires after 14 days, RetryHeapProcessor calls updateState()
  but not done(), leaving circuit breaker permanently closed
- Target remains blocked forever even though bad batch was dropped
- Test currently fails, demonstrating the bug from PR comment
---
 .../internal/pusher/queue_test.go             |  86 +++++++++++++++
 .../internal/pusher/retryheap_expiry_test.go  | 104 ++++++++++++++++++
 2 files changed, 190 insertions(+)
 create mode 100644 plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_expiry_test.go

diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/queue_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/queue_test.go
index 688ea474450..1db52ce32af 100644
--- a/plugins/outputs/cloudwatchlogs/internal/pusher/queue_test.go
+++ b/plugins/outputs/cloudwatchlogs/internal/pusher/queue_test.go
@@ -873,3 +873,89 @@ func TestQueueHaltResume(t *testing.T) {
 
 	mockSender.AssertExpectations(t)
 }
+
+// TestQueueResumeOnBatchExpiry verifies that when a batch expires after 14 days of retrying,
+// the circuit breaker resumes the queue to allow new batches to be processed.
+// This prevents the target from being permanently blocked when a bad batch is eventually dropped.
+//
+// Scenario from PR comment: "Say a bad batch from a target caused this to halt. Now that bad batch 
+// is re-tried for 14 days and eventually dropped - but this never gets resumed in that case right? 
+// So this target is blocked forever in that scenario?"
+func TestQueueResumeOnBatchExpiry(t *testing.T) {
+	logger := testutil.NewNopLogger()
+
+	var sendCount atomic.Int32
+	mockService := &stubLogsService{
+		ple: func(input *cloudwatchlogs.PutLogEventsInput) (*cloudwatchlogs.PutLogEventsOutput, error) {
+			sendCount.Add(1)
+			// Always return an error to simulate a failing target
+			return nil, &cloudwatchlogs.ServiceUnavailableException{}
+		},
+		cls: func(_ *cloudwatchlogs.CreateLogStreamInput) (*cloudwatchlogs.CreateLogStreamOutput, error) {
+			return &cloudwatchlogs.CreateLogStreamOutput{}, nil
+		},
+		clg: func(_ *cloudwatchlogs.CreateLogGroupInput) (*cloudwatchlogs.CreateLogGroupOutput, error) {
+			return &cloudwatchlogs.CreateLogGroupOutput{}, nil
+		},
+		dlg: func(_ *cloudwatchlogs.DescribeLogGroupsInput) (*cloudwatchlogs.DescribeLogGroupsOutput, error) {
+			return &cloudwatchlogs.DescribeLogGroupsOutput{}, nil
+		},
+	}
+
+	target := Target{Group: "test-group", Stream: "test-stream"}
+	
+	// Create components
+	workerPool := NewWorkerPool(5)
+	retryHeap := NewRetryHeap(10, logger)
+	tm := NewTargetManager(logger, mockService)
+	defer workerPool.Stop()
+	defer retryHeap.Stop()
+
+	// Create RetryHeapProcessor with very short max retry duration for testing
+	maxRetryDuration := 100 * time.Millisecond // Normally 14 days
+	retryHeapProcessor := NewRetryHeapProcessor(retryHeap, workerPool, mockService, tm, logger, maxRetryDuration, nil)
+	retryHeapProcessor.Start()
+	defer retryHeapProcessor.Stop()
+
+	// Create pusher/queue
+	var wg sync.WaitGroup
+	flushTimeout := 50 * time.Millisecond
+	pusher := NewPusher(logger, target, mockService, tm, nil, workerPool, flushTimeout, maxRetryDuration, &wg, retryHeap)
+	defer pusher.Stop()
+
+	// Add first event - will fail and halt the queue
+	pusher.AddEvent(newStubLogEvent("first message", time.Now()))
+	
+	// Wait for batch to be sent, fail, and go to retry heap
+	time.Sleep(200 * time.Millisecond)
+	
+	// Verify at least one send attempt was made
+	assert.Greater(t, sendCount.Load(), int32(0), "Should have attempted to send")
+	
+	// Add second event - should be queued but blocked by circuit breaker
+	pusher.AddEvent(newStubLogEvent("second message", time.Now()))
+	
+	initialSendCount := sendCount.Load()
+	
+	// Wait for the batch in retry heap to expire
+	time.Sleep(200 * time.Millisecond)
+	
+	// After expiry, the RetryHeapProcessor should drop the expired batch
+	// but currently it only calls updateState(), not done()
+	// This means the circuit breaker remains closed and the second batch never gets sent
+	
+	// Add a third event to trigger another flush
+	pusher.AddEvent(newStubLogEvent("third message", time.Now()))
+	
+	// Wait for potential sends
+	time.Sleep(200 * time.Millisecond)
+	
+	finalSendCount := sendCount.Load()
+	
+	// BUG: The second and third batches should have been attempted after the first batch expired
+	// but they won't be because the queue remains halted forever
+	assert.Equal(t, initialSendCount, finalSendCount,
+		"No new send attempts should occur because the circuit breaker is permanently closed. "+
+		"This demonstrates the bug: when a batch expires in RetryHeapProcessor, it calls "+
+		"updateState() but not done(), so the queue never resumes. The target is blocked forever.")
+}
diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_expiry_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_expiry_test.go
new file mode 100644
index 00000000000..4571a107779
--- /dev/null
+++ b/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_expiry_test.go
@@ -0,0 +1,104 @@
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: MIT
+
+package pusher
+
+import (
+	"sync/atomic"
+	"testing"
+	"time"
+
+	"github.com/stretchr/testify/assert"
+
+	"github.com/aws/amazon-cloudwatch-agent/sdk/service/cloudwatchlogs"
+	"github.com/aws/amazon-cloudwatch-agent/tool/testutil"
+)
+
+// TestRetryHeapProcessorExpiredBatchShouldResume demonstrates the bug where
+// expired batches don't resume the circuit breaker, leaving the target permanently blocked.
+//
+// From PR comment: "Say a bad batch from a target caused this to halt. Now that bad batch
+// is re-tried for 14 days and eventually dropped - but this never gets resumed in that case right?
+// So this target is blocked forever in that scenario?"
+func TestRetryHeapProcessorExpiredBatchShouldResume(t *testing.T) {
+	logger := testutil.NewNopLogger()
+
+	var sendAttempts atomic.Int32
+	mockService := &stubLogsService{
+		ple: func(input *cloudwatchlogs.PutLogEventsInput) (*cloudwatchlogs.PutLogEventsOutput, error) {
+			sendAttempts.Add(1)
+			// Always fail to simulate a problematic target
+			return nil, &cloudwatchlogs.ServiceUnavailableException{}
+		},
+		cls: func(_ *cloudwatchlogs.CreateLogStreamInput) (*cloudwatchlogs.CreateLogStreamOutput, error) {
+			return &cloudwatchlogs.CreateLogStreamOutput{}, nil
+		},
+		clg: func(_ *cloudwatchlogs.CreateLogGroupInput) (*cloudwatchlogs.CreateLogGroupOutput, error) {
+			return &cloudwatchlogs.CreateLogGroupOutput{}, nil
+		},
+		dlg: func(_ *cloudwatchlogs.DescribeLogGroupsInput) (*cloudwatchlogs.DescribeLogGroupsOutput, error) {
+			return &cloudwatchlogs.DescribeLogGroupsOutput{}, nil
+		},
+	}
+
+	target := Target{Group: "failing-group", Stream: "stream"}
+
+	// Create retry heap and processor with very short expiry for testing
+	retryHeap := NewRetryHeap(10, logger)
+	workerPool := NewWorkerPool(5)
+	tm := NewTargetManager(logger, mockService)
+	maxRetryDuration := 50 * time.Millisecond // Normally 14 days
+	
+	retryHeapProcessor := NewRetryHeapProcessor(retryHeap, workerPool, mockService, tm, logger, maxRetryDuration, nil)
+	retryHeapProcessor.Start()
+	
+	defer retryHeap.Stop()
+	defer workerPool.Stop()
+	defer retryHeapProcessor.Stop()
+
+	// Create a batch that will expire
+	batch := newLogEventBatch(target, nil)
+	batch.append(newLogEvent(time.Now(), "test message", nil))
+	
+	// Set up callbacks to track circuit breaker state
+	var circuitBreakerHalted atomic.Bool
+	var circuitBreakerResumed atomic.Bool
+	
+	batch.addFailCallback(func() {
+		circuitBreakerHalted.Store(true)
+	})
+	
+	batch.addDoneCallback(func() {
+		circuitBreakerResumed.Store(true)
+	})
+	
+	// Initialize the batch's start time to make it already expired
+	batch.initializeStartTime()
+	batch.startTime = time.Now().Add(-100 * time.Millisecond) // Already expired
+	
+	// Update retry metadata to simulate a failed attempt and make it ready for retry
+	batch.updateRetryMetadata(&cloudwatchlogs.ServiceUnavailableException{})
+	// Set nextRetryTime to past so it's ready for retry
+	batch.nextRetryTime = time.Now().Add(-10 * time.Millisecond)
+	
+	// Push the expired batch to the retry heap
+	err := retryHeap.Push(batch)
+	assert.NoError(t, err)
+	
+	// Verify batch is in the heap
+	assert.Equal(t, 1, retryHeap.Size())
+	
+	// Wait for RetryHeapProcessor to process the expired batch
+	time.Sleep(200 * time.Millisecond)
+	
+	// The batch should have been removed from the heap
+	assert.Equal(t, 0, retryHeap.Size(), "Expired batch should be removed from heap")
+	
+	// The circuit breaker SHOULD be resumed when the batch expires
+	// This allows the target to continue processing new batches after the bad batch is dropped
+	assert.True(t, circuitBreakerResumed.Load(),
+		"Circuit breaker should be resumed after batch expiry. "+
+		"When a batch is retried for 14 days and eventually dropped, "+
+		"the target must be unblocked to allow new batches to be processed. "+
+		"Otherwise the target remains blocked forever.")
+}

From de410f11bef9e597f6d570e2373531074a90ac68 Mon Sep 17 00:00:00 2001
From: Marcus Mann <mpmann@amazon.com>
Date: Mon, 9 Feb 2026 15:41:29 -0500
Subject: [PATCH 19/50] lx

---
 .../internal/pusher/queue_test.go             | 82 +------------------
 .../internal/pusher/retryheap.go              |  3 +-
 .../internal/pusher/retryheap_expiry_test.go  | 30 +++----
 3 files changed, 19 insertions(+), 96 deletions(-)

diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/queue_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/queue_test.go
index 1db52ce32af..80c343f3ef7 100644
--- a/plugins/outputs/cloudwatchlogs/internal/pusher/queue_test.go
+++ b/plugins/outputs/cloudwatchlogs/internal/pusher/queue_test.go
@@ -878,84 +878,6 @@ func TestQueueHaltResume(t *testing.T) {
 // the circuit breaker resumes the queue to allow new batches to be processed.
 // This prevents the target from being permanently blocked when a bad batch is eventually dropped.
 //
-// Scenario from PR comment: "Say a bad batch from a target caused this to halt. Now that bad batch 
-// is re-tried for 14 days and eventually dropped - but this never gets resumed in that case right? 
+// Scenario from PR comment: "Say a bad batch from a target caused this to halt. Now that bad batch
+// is re-tried for 14 days and eventually dropped - but this never gets resumed in that case right?
 // So this target is blocked forever in that scenario?"
-func TestQueueResumeOnBatchExpiry(t *testing.T) {
-	logger := testutil.NewNopLogger()
-
-	var sendCount atomic.Int32
-	mockService := &stubLogsService{
-		ple: func(input *cloudwatchlogs.PutLogEventsInput) (*cloudwatchlogs.PutLogEventsOutput, error) {
-			sendCount.Add(1)
-			// Always return an error to simulate a failing target
-			return nil, &cloudwatchlogs.ServiceUnavailableException{}
-		},
-		cls: func(_ *cloudwatchlogs.CreateLogStreamInput) (*cloudwatchlogs.CreateLogStreamOutput, error) {
-			return &cloudwatchlogs.CreateLogStreamOutput{}, nil
-		},
-		clg: func(_ *cloudwatchlogs.CreateLogGroupInput) (*cloudwatchlogs.CreateLogGroupOutput, error) {
-			return &cloudwatchlogs.CreateLogGroupOutput{}, nil
-		},
-		dlg: func(_ *cloudwatchlogs.DescribeLogGroupsInput) (*cloudwatchlogs.DescribeLogGroupsOutput, error) {
-			return &cloudwatchlogs.DescribeLogGroupsOutput{}, nil
-		},
-	}
-
-	target := Target{Group: "test-group", Stream: "test-stream"}
-	
-	// Create components
-	workerPool := NewWorkerPool(5)
-	retryHeap := NewRetryHeap(10, logger)
-	tm := NewTargetManager(logger, mockService)
-	defer workerPool.Stop()
-	defer retryHeap.Stop()
-
-	// Create RetryHeapProcessor with very short max retry duration for testing
-	maxRetryDuration := 100 * time.Millisecond // Normally 14 days
-	retryHeapProcessor := NewRetryHeapProcessor(retryHeap, workerPool, mockService, tm, logger, maxRetryDuration, nil)
-	retryHeapProcessor.Start()
-	defer retryHeapProcessor.Stop()
-
-	// Create pusher/queue
-	var wg sync.WaitGroup
-	flushTimeout := 50 * time.Millisecond
-	pusher := NewPusher(logger, target, mockService, tm, nil, workerPool, flushTimeout, maxRetryDuration, &wg, retryHeap)
-	defer pusher.Stop()
-
-	// Add first event - will fail and halt the queue
-	pusher.AddEvent(newStubLogEvent("first message", time.Now()))
-	
-	// Wait for batch to be sent, fail, and go to retry heap
-	time.Sleep(200 * time.Millisecond)
-	
-	// Verify at least one send attempt was made
-	assert.Greater(t, sendCount.Load(), int32(0), "Should have attempted to send")
-	
-	// Add second event - should be queued but blocked by circuit breaker
-	pusher.AddEvent(newStubLogEvent("second message", time.Now()))
-	
-	initialSendCount := sendCount.Load()
-	
-	// Wait for the batch in retry heap to expire
-	time.Sleep(200 * time.Millisecond)
-	
-	// After expiry, the RetryHeapProcessor should drop the expired batch
-	// but currently it only calls updateState(), not done()
-	// This means the circuit breaker remains closed and the second batch never gets sent
-	
-	// Add a third event to trigger another flush
-	pusher.AddEvent(newStubLogEvent("third message", time.Now()))
-	
-	// Wait for potential sends
-	time.Sleep(200 * time.Millisecond)
-	
-	finalSendCount := sendCount.Load()
-	
-	// BUG: The second and third batches should have been attempted after the first batch expired
-	// but they won't be because the queue remains halted forever
-	assert.Equal(t, initialSendCount, finalSendCount,
-		"No new send attempts should occur because the circuit breaker is permanently closed. "+
-		"This demonstrates the bug: when a batch expires in RetryHeapProcessor, it calls "+
-		"updateState() but not done(), so the queue never resumes. The target is blocked forever.")
-}
diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap.go b/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap.go
index b7202d648c1..0c5f0f3e547 100644
--- a/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap.go
+++ b/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap.go
@@ -131,7 +131,7 @@ func (rh *retryHeap) Size() int {
 func (rh *retryHeap) Stop() {
 	rh.mutex.Lock()
 	defer rh.mutex.Unlock()
-	
+
 	if rh.stopped {
 		return
 	}
@@ -220,6 +220,7 @@ func (p *RetryHeapProcessor) processReadyMessages() {
 		if batch.isExpired(p.maxRetryDuration) {
 			p.logger.Errorf("Dropping expired batch for %v/%v", batch.Group, batch.Stream)
 			batch.updateState()
+			batch.done() // Resume circuit breaker to allow target to process new batches
 			continue
 		}
 
diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_expiry_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_expiry_test.go
index 4571a107779..178a2bf8ae4 100644
--- a/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_expiry_test.go
+++ b/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_expiry_test.go
@@ -48,10 +48,10 @@ func TestRetryHeapProcessorExpiredBatchShouldResume(t *testing.T) {
 	workerPool := NewWorkerPool(5)
 	tm := NewTargetManager(logger, mockService)
 	maxRetryDuration := 50 * time.Millisecond // Normally 14 days
-	
+
 	retryHeapProcessor := NewRetryHeapProcessor(retryHeap, workerPool, mockService, tm, logger, maxRetryDuration, nil)
 	retryHeapProcessor.Start()
-	
+
 	defer retryHeap.Stop()
 	defer workerPool.Stop()
 	defer retryHeapProcessor.Stop()
@@ -59,46 +59,46 @@ func TestRetryHeapProcessorExpiredBatchShouldResume(t *testing.T) {
 	// Create a batch that will expire
 	batch := newLogEventBatch(target, nil)
 	batch.append(newLogEvent(time.Now(), "test message", nil))
-	
+
 	// Set up callbacks to track circuit breaker state
 	var circuitBreakerHalted atomic.Bool
 	var circuitBreakerResumed atomic.Bool
-	
+
 	batch.addFailCallback(func() {
 		circuitBreakerHalted.Store(true)
 	})
-	
+
 	batch.addDoneCallback(func() {
 		circuitBreakerResumed.Store(true)
 	})
-	
+
 	// Initialize the batch's start time to make it already expired
 	batch.initializeStartTime()
 	batch.startTime = time.Now().Add(-100 * time.Millisecond) // Already expired
-	
+
 	// Update retry metadata to simulate a failed attempt and make it ready for retry
 	batch.updateRetryMetadata(&cloudwatchlogs.ServiceUnavailableException{})
 	// Set nextRetryTime to past so it's ready for retry
 	batch.nextRetryTime = time.Now().Add(-10 * time.Millisecond)
-	
+
 	// Push the expired batch to the retry heap
 	err := retryHeap.Push(batch)
 	assert.NoError(t, err)
-	
+
 	// Verify batch is in the heap
 	assert.Equal(t, 1, retryHeap.Size())
-	
+
 	// Wait for RetryHeapProcessor to process the expired batch
 	time.Sleep(200 * time.Millisecond)
-	
+
 	// The batch should have been removed from the heap
 	assert.Equal(t, 0, retryHeap.Size(), "Expired batch should be removed from heap")
-	
+
 	// The circuit breaker SHOULD be resumed when the batch expires
 	// This allows the target to continue processing new batches after the bad batch is dropped
 	assert.True(t, circuitBreakerResumed.Load(),
 		"Circuit breaker should be resumed after batch expiry. "+
-		"When a batch is retried for 14 days and eventually dropped, "+
-		"the target must be unblocked to allow new batches to be processed. "+
-		"Otherwise the target remains blocked forever.")
+			"When a batch is retried for 14 days and eventually dropped, "+
+			"the target must be unblocked to allow new batches to be processed. "+
+			"Otherwise the target remains blocked forever.")
 }

From f4c7620447407ca47ee554a39d74f67f16e6198a Mon Sep 17 00:00:00 2001
From: Marcus Mann <mpmann@amazon.com>
Date: Tue, 10 Feb 2026 12:02:29 -0500
Subject: [PATCH 20/50] Remove configurable maxRetryTimeout in favor of default
 hardcoded value

---
 .../outputs/cloudwatchlogs/cloudwatchlogs.go    |  2 +-
 .../cloudwatchlogs/internal/pusher/batch.go     | 16 +++++++++-------
 .../cloudwatchlogs/internal/pusher/pool.go      | 11 -----------
 .../cloudwatchlogs/internal/pusher/pusher.go    |  7 +++----
 .../cloudwatchlogs/internal/pusher/retryheap.go |  4 ++--
 .../cloudwatchlogs/internal/pusher/sender.go    | 17 +----------------
 6 files changed, 16 insertions(+), 41 deletions(-)

diff --git a/plugins/outputs/cloudwatchlogs/cloudwatchlogs.go b/plugins/outputs/cloudwatchlogs/cloudwatchlogs.go
index ee4c8422632..45bcc947d4d 100644
--- a/plugins/outputs/cloudwatchlogs/cloudwatchlogs.go
+++ b/plugins/outputs/cloudwatchlogs/cloudwatchlogs.go
@@ -183,7 +183,7 @@ func (c *CloudWatchLogs) getDest(t pusher.Target, logSrc logs.LogSrc) *cwDest {
 		}
 		c.targetManager = pusher.NewTargetManager(c.Log, client)
 	})
-	p := pusher.NewPusher(c.Log, t, client, c.targetManager, logSrc, c.workerPool, c.ForceFlushInterval.Duration, maxRetryTimeout, &c.pusherWaitGroup, c.retryHeap)
+	p := pusher.NewPusher(c.Log, t, client, c.targetManager, logSrc, c.workerPool, c.ForceFlushInterval.Duration, &c.pusherWaitGroup, c.Concurrency, c.retryHeap)
 	cwd := &cwDest{
 		pusher:   p,
 		retryer:  logThrottleRetryer,
diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/batch.go b/plugins/outputs/cloudwatchlogs/internal/pusher/batch.go
index 3c83be15a0b..752eeba4b03 100644
--- a/plugins/outputs/cloudwatchlogs/internal/pusher/batch.go
+++ b/plugins/outputs/cloudwatchlogs/internal/pusher/batch.go
@@ -18,6 +18,9 @@ import (
 // CloudWatch Logs PutLogEvents API limits
 // Taken from https://docs.aws.amazon.com/AmazonCloudWatchLogs/latest/APIReference/API_PutLogEvents.html
 const (
+	// maxRetryTimeout is the default retry timeout for CloudWatch Logs operations
+	maxRetryTimeout = 14*24*time.Hour + 10*time.Minute
+
 	// The maximum batch size in bytes. This size is calculated as the sum of all event messages in UTF-8,
 	// plus 26 bytes for each log event.
 	reqSizeLimit = 1024 * 1024
@@ -109,6 +112,7 @@ type logEventBatch struct {
 	retryCountLong  int       // Number of retries using long delay strategy
 	startTime       time.Time // Time of first request (for max retry duration calculation)
 	nextRetryTime   time.Time // When this batch should be retried next
+	expireAfter     time.Time // When this batch expires and should be dropped
 	lastError       error     // Last error encountered
 }
 
@@ -252,11 +256,12 @@ func (t byTimestamp) Less(i, j int) bool {
 	return *t[i].Timestamp < *t[j].Timestamp
 }
 
-// initializeStartTime sets the start time if not already set.
+// initializeStartTime sets the start time and expiration time if not already set.
 func (b *logEventBatch) initializeStartTime() {
 	if b.startTime.IsZero() {
 		b.startTime = time.Now()
 	}
+	b.expireAfter = b.startTime.Add(maxRetryTimeout)
 }
 
 // updateRetryMetadata updates the retry metadata after a failed send attempt.
@@ -279,12 +284,9 @@ func (b *logEventBatch) updateRetryMetadata(err error) {
 	b.nextRetryTime = time.Now().Add(wait)
 }
 
-// isExpired checks if the batch has exceeded the maximum retry duration.
-func (b *logEventBatch) isExpired(maxRetryDuration time.Duration) bool {
-	if b.startTime.IsZero() {
-		return false
-	}
-	return time.Since(b.startTime) > maxRetryDuration
+// isExpired checks if the batch has exceeded its expiration time.
+func (b *logEventBatch) isExpired() bool {
+	return !b.expireAfter.IsZero() && time.Now().After(b.expireAfter)
 }
 
 // isReadyForRetry checks if enough time has passed since the last failure to retry this batch.
diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/pool.go b/plugins/outputs/cloudwatchlogs/internal/pusher/pool.go
index 1d6edf57e90..fb15ba9fab1 100644
--- a/plugins/outputs/cloudwatchlogs/internal/pusher/pool.go
+++ b/plugins/outputs/cloudwatchlogs/internal/pusher/pool.go
@@ -6,7 +6,6 @@ package pusher
 import (
 	"sync"
 	"sync/atomic"
-	"time"
 )
 
 type WorkerPool interface {
@@ -113,13 +112,3 @@ func (s *senderPool) Stop() {
 	// workerpool is stopped by the plugin
 	s.sender.Stop()
 }
-
-// SetRetryDuration sets the retry duration on the wrapped Sender.
-func (s *senderPool) SetRetryDuration(duration time.Duration) {
-	s.sender.SetRetryDuration(duration)
-}
-
-// RetryDuration returns the retry duration of the wrapped Sender.
-func (s *senderPool) RetryDuration() time.Duration {
-	return s.sender.RetryDuration()
-}
diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/pusher.go b/plugins/outputs/cloudwatchlogs/internal/pusher/pusher.go
index 43310d6861e..6a4c9f2df24 100644
--- a/plugins/outputs/cloudwatchlogs/internal/pusher/pusher.go
+++ b/plugins/outputs/cloudwatchlogs/internal/pusher/pusher.go
@@ -32,11 +32,11 @@ func NewPusher(
 	entityProvider logs.LogEntityProvider,
 	workerPool WorkerPool,
 	flushTimeout time.Duration,
-	retryDuration time.Duration,
 	wg *sync.WaitGroup,
+	_ int,
 	retryHeap RetryHeap,
 ) *Pusher {
-	s := createSender(logger, service, targetManager, workerPool, retryDuration, retryHeap)
+	s := createSender(logger, service, targetManager, workerPool, retryHeap)
 
 	q := newQueue(logger, target, flushTimeout, entityProvider, s, wg)
 	targetManager.PutRetentionPolicy(target)
@@ -61,10 +61,9 @@ func createSender(
 	service cloudWatchLogsService,
 	targetManager TargetManager,
 	workerPool WorkerPool,
-	retryDuration time.Duration,
 	retryHeap RetryHeap,
 ) Sender {
-	s := newSender(logger, service, targetManager, retryDuration, retryHeap)
+	s := newSender(logger, service, targetManager, retryHeap)
 	if workerPool == nil {
 		return s
 	}
diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap.go b/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap.go
index 0c5f0f3e547..025a3063ad1 100644
--- a/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap.go
+++ b/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap.go
@@ -155,7 +155,7 @@ type RetryHeapProcessor struct {
 func NewRetryHeapProcessor(retryHeap RetryHeap, workerPool WorkerPool, service cloudWatchLogsService, targetManager TargetManager, logger telegraf.Logger, maxRetryDuration time.Duration, retryer *retryer.LogThrottleRetryer) *RetryHeapProcessor {
 	// Create processor's own sender and senderPool
 	// Pass retryHeap so failed batches go back to RetryHeap instead of blocking on sync retry
-	sender := newSender(logger, service, targetManager, maxRetryDuration, retryHeap)
+	sender := newSender(logger, service, targetManager, retryHeap)
 	senderPool := newSenderPool(workerPool, sender)
 
 	return &RetryHeapProcessor{
@@ -217,7 +217,7 @@ func (p *RetryHeapProcessor) processReadyMessages() {
 
 	for _, batch := range readyBatches {
 		// Check if batch has expired
-		if batch.isExpired(p.maxRetryDuration) {
+		if batch.isExpired() {
 			p.logger.Errorf("Dropping expired batch for %v/%v", batch.Group, batch.Stream)
 			batch.updateState()
 			batch.done() // Resume circuit breaker to allow target to process new batches
diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/sender.go b/plugins/outputs/cloudwatchlogs/internal/pusher/sender.go
index 1b5f13fd481..ad38d7960f7 100644
--- a/plugins/outputs/cloudwatchlogs/internal/pusher/sender.go
+++ b/plugins/outputs/cloudwatchlogs/internal/pusher/sender.go
@@ -5,7 +5,6 @@ package pusher
 
 import (
 	"errors"
-	"sync/atomic"
 	"time"
 
 	"github.com/aws/aws-sdk-go/aws/awserr"
@@ -24,14 +23,11 @@ type cloudWatchLogsService interface {
 
 type Sender interface {
 	Send(*logEventBatch)
-	SetRetryDuration(time.Duration)
-	RetryDuration() time.Duration
 	Stop()
 }
 
 type sender struct {
 	service       cloudWatchLogsService
-	retryDuration atomic.Value
 	targetManager TargetManager
 	logger        telegraf.Logger
 	stopCh        chan struct{}
@@ -45,7 +41,6 @@ func newSender(
 	logger telegraf.Logger,
 	service cloudWatchLogsService,
 	targetManager TargetManager,
-	retryDuration time.Duration,
 	retryHeap RetryHeap,
 ) Sender {
 	s := &sender{
@@ -56,7 +51,6 @@ func newSender(
 		stopped:       false,
 		retryHeap:     retryHeap,
 	}
-	s.retryDuration.Store(retryDuration)
 	return s
 }
 
@@ -118,7 +112,7 @@ func (s *sender) Send(batch *logEventBatch) {
 
 		// Check if retry would exceed max duration
 		totalRetries := batch.retryCountShort + batch.retryCountLong - 1
-		if batch.nextRetryTime.After(batch.startTime.Add(s.RetryDuration())) {
+		if batch.isExpired() {
 			s.logger.Errorf("All %v retries to %v/%v failed for PutLogEvents, request dropped.", totalRetries, batch.Group, batch.Stream)
 			batch.updateState()
 			return
@@ -158,12 +152,3 @@ func (s *sender) Stop() {
 	s.stopped = true
 }
 
-// SetRetryDuration sets the maximum duration for retrying failed log sends.
-func (s *sender) SetRetryDuration(retryDuration time.Duration) {
-	s.retryDuration.Store(retryDuration)
-}
-
-// RetryDuration returns the current maximum retry duration.
-func (s *sender) RetryDuration() time.Duration {
-	return s.retryDuration.Load().(time.Duration)
-}

From c791abd5dca8e4a680af18cb67d37bcfe7d13911 Mon Sep 17 00:00:00 2001
From: Marcus Mann <mpmann@amazon.com>
Date: Tue, 10 Feb 2026 12:21:43 -0500
Subject: [PATCH 21/50] Update tests for removed retryDuration parameter

---
 .../cloudwatchlogs/internal/pusher/batch.go   |  2 +-
 .../internal/pusher/batch_test.go             |  6 +-
 .../internal/pusher/circuitbreaker_test.go    |  5 +-
 .../internal/pusher/pool_test.go              |  8 +--
 .../internal/pusher/pusher_test.go            |  4 +-
 .../internal/pusher/queue_test.go             | 60 +++++--------------
 .../internal/pusher/retryheap_expiry_test.go  |  2 +-
 .../internal/pusher/sender_test.go            | 27 +++++----
 8 files changed, 43 insertions(+), 71 deletions(-)

diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/batch.go b/plugins/outputs/cloudwatchlogs/internal/pusher/batch.go
index 752eeba4b03..d68dfdaddd6 100644
--- a/plugins/outputs/cloudwatchlogs/internal/pusher/batch.go
+++ b/plugins/outputs/cloudwatchlogs/internal/pusher/batch.go
@@ -260,8 +260,8 @@ func (t byTimestamp) Less(i, j int) bool {
 func (b *logEventBatch) initializeStartTime() {
 	if b.startTime.IsZero() {
 		b.startTime = time.Now()
+		b.expireAfter = b.startTime.Add(maxRetryTimeout)
 	}
-	b.expireAfter = b.startTime.Add(maxRetryTimeout)
 }
 
 // updateRetryMetadata updates the retry metadata after a failed send attempt.
diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/batch_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/batch_test.go
index e433cc74902..2f8db4f689c 100644
--- a/plugins/outputs/cloudwatchlogs/internal/pusher/batch_test.go
+++ b/plugins/outputs/cloudwatchlogs/internal/pusher/batch_test.go
@@ -411,7 +411,7 @@ func TestBatchRetryMetadata(t *testing.T) {
 	// Test initial state
 	assert.True(t, batch.startTime.IsZero())
 	assert.True(t, batch.isReadyForRetry())
-	assert.False(t, batch.isExpired(time.Hour))
+	assert.False(t, batch.isExpired())
 
 	// Test initializeStartTime
 	batch.initializeStartTime()
@@ -433,6 +433,6 @@ func TestBatchRetryMetadata(t *testing.T) {
 	assert.True(t, batch.isReadyForRetry())
 
 	// Test isExpired
-	batch.startTime = time.Now().Add(-25 * time.Hour)
-	assert.True(t, batch.isExpired(24*time.Hour))
+	batch.expireAfter = time.Now().Add(-1 * time.Hour)
+	assert.True(t, batch.isExpired())
 }
diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/circuitbreaker_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/circuitbreaker_test.go
index d541e9a46aa..dd0651020a0 100644
--- a/plugins/outputs/cloudwatchlogs/internal/pusher/circuitbreaker_test.go
+++ b/plugins/outputs/cloudwatchlogs/internal/pusher/circuitbreaker_test.go
@@ -60,10 +60,9 @@ func TestCircuitBreakerBlocksTargetAfterFailure(t *testing.T) {
 
 	var wg sync.WaitGroup
 	flushTimeout := 50 * time.Millisecond
-	retryDuration := time.Hour
 
-	failingPusher := NewPusher(logger, failingTarget, service, tm, nil, workerPool, flushTimeout, retryDuration, &wg, retryHeap)
-	healthyPusher := NewPusher(logger, healthyTarget, service, tm, nil, workerPool, flushTimeout, retryDuration, &wg, retryHeap)
+	failingPusher := NewPusher(logger, failingTarget, service, tm, nil, workerPool, flushTimeout, &wg, 2, retryHeap)
+	healthyPusher := NewPusher(logger, healthyTarget, service, tm, nil, workerPool, flushTimeout, &wg, 2, retryHeap)
 	defer failingPusher.Stop()
 	defer healthyPusher.Stop()
 
diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/pool_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/pool_test.go
index 16fe906a778..9abf746b5fd 100644
--- a/plugins/outputs/cloudwatchlogs/internal/pusher/pool_test.go
+++ b/plugins/outputs/cloudwatchlogs/internal/pusher/pool_test.go
@@ -107,14 +107,10 @@ func TestSenderPool(t *testing.T) {
 	logger := testutil.NewNopLogger()
 	mockService := new(mockLogsService)
 	mockService.On("PutLogEvents", mock.Anything).Return(&cloudwatchlogs.PutLogEventsOutput{}, nil)
-	s := newSender(logger, mockService, nil, time.Second, nil)
+	s := newSender(logger, mockService, nil, nil)
 	p := NewWorkerPool(12)
 	sp := newSenderPool(p, s)
 
-	assert.Equal(t, time.Second, sp.RetryDuration())
-	sp.SetRetryDuration(time.Minute)
-	assert.Equal(t, time.Minute, sp.RetryDuration())
-
 	var completed atomic.Int32
 	var evts []*logEvent
 	for i := 0; i < 200; i++ {
@@ -144,7 +140,7 @@ func TestSenderPoolRetryHeap(_ *testing.T) {
 	retryHeap := NewRetryHeap(10, logger)
 	defer retryHeap.Stop()
 
-	s := newSender(logger, mockService, nil, time.Second, retryHeap)
+	s := newSender(logger, mockService, nil, retryHeap)
 	p := NewWorkerPool(12)
 	defer p.Stop()
 
diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/pusher_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/pusher_test.go
index ef5f514501c..41640490b1e 100644
--- a/plugins/outputs/cloudwatchlogs/internal/pusher/pusher_test.go
+++ b/plugins/outputs/cloudwatchlogs/internal/pusher/pusher_test.go
@@ -111,8 +111,8 @@ func setupPusher(t *testing.T, workerPool WorkerPool, wg *sync.WaitGroup) *Pushe
 		nil,
 		workerPool,
 		time.Second,
-		time.Minute,
 		wg,
+		1,   // concurrency
 		nil, // retryHeap
 	)
 
@@ -148,8 +148,8 @@ func TestPusherRetryHeap(t *testing.T) {
 		nil,
 		workerPool,
 		time.Second,
-		time.Minute,
 		&wg,
+		2, // concurrency > 1
 		retryHeap,
 	)
 
diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/queue_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/queue_test.go
index 80c343f3ef7..d4b64a17a4c 100644
--- a/plugins/outputs/cloudwatchlogs/internal/pusher/queue_test.go
+++ b/plugins/outputs/cloudwatchlogs/internal/pusher/queue_test.go
@@ -123,7 +123,7 @@ func TestAddSingleEvent_WithAccountId(t *testing.T) {
 	}
 
 	ep := newMockEntityProvider(expectedEntity)
-	q, sender := testPreparation(t, -1, &s, 1*time.Hour, 2*time.Hour, ep, &wg)
+	q, sender := testPreparation(t, -1, &s, 1*time.Hour, ep, &wg)
 	q.AddEvent(newStubLogEvent("MSG", time.Now()))
 	require.False(t, called.Load(), "PutLogEvents has been called too fast, it should wait until FlushTimeout.")
 
@@ -160,7 +160,7 @@ func TestAddSingleEvent_WithoutAccountId(t *testing.T) {
 	}
 
 	ep := newMockEntityProvider(nil)
-	q, sender := testPreparation(t, -1, &s, 1*time.Hour, 2*time.Hour, ep, &wg)
+	q, sender := testPreparation(t, -1, &s, 1*time.Hour, ep, &wg)
 	q.AddEvent(newStubLogEvent("MSG", time.Now()))
 	require.False(t, called.Load(), "PutLogEvents has been called too fast, it should wait until FlushTimeout.")
 
@@ -190,7 +190,7 @@ func TestStopQueueWouldDoFinalSend(t *testing.T) {
 		return &cloudwatchlogs.PutLogEventsOutput{}, nil
 	}
 
-	q, sender := testPreparation(t, -1, &s, 1*time.Hour, 2*time.Hour, nil, &wg)
+	q, sender := testPreparation(t, -1, &s, 1*time.Hour, nil, &wg)
 	q.AddEvent(newStubLogEvent("MSG", time.Now()))
 
 	time.Sleep(10 * time.Millisecond)
@@ -214,7 +214,7 @@ func TestStopPusherWouldStopRetries(t *testing.T) {
 	}
 
 	logSink := testutil.NewLogSink()
-	q, sender := testPreparationWithLogger(t, logSink, -1, &s, 1*time.Hour, 2*time.Hour, nil, &wg)
+	q, sender := testPreparationWithLogger(t, logSink, -1, &s, 1*time.Hour, nil, &wg)
 	q.AddEvent(newStubLogEvent("MSG", time.Now()))
 	time.Sleep(10 * time.Millisecond)
 
@@ -256,7 +256,7 @@ func TestLongMessageHandling(t *testing.T) {
 		return &cloudwatchlogs.PutLogEventsOutput{}, nil
 	}
 
-	q, sender := testPreparation(t, -1, &s, 1*time.Hour, 2*time.Hour, nil, &wg)
+	q, sender := testPreparation(t, -1, &s, 1*time.Hour, nil, &wg)
 	q.AddEvent(newStubLogEvent(longMsg, time.Now()))
 
 	triggerSend(t, q)
@@ -285,7 +285,7 @@ func TestRequestIsLessThan1MB(t *testing.T) {
 		return &cloudwatchlogs.PutLogEventsOutput{}, nil
 	}
 
-	q, sender := testPreparation(t, -1, &s, 1*time.Hour, 2*time.Hour, nil, &wg)
+	q, sender := testPreparation(t, -1, &s, 1*time.Hour, nil, &wg)
 	for i := 0; i < 8; i++ {
 		q.AddEvent(newStubLogEvent(longMsg, time.Now()))
 	}
@@ -311,7 +311,7 @@ func TestRequestIsLessThan10kEvents(t *testing.T) {
 		return &cloudwatchlogs.PutLogEventsOutput{}, nil
 	}
 
-	q, sender := testPreparation(t, -1, &s, 1*time.Hour, 2*time.Hour, nil, &wg)
+	q, sender := testPreparation(t, -1, &s, 1*time.Hour, nil, &wg)
 	for i := 0; i < 30000; i++ {
 		q.AddEvent(newStubLogEvent(msg, time.Now()))
 	}
@@ -337,7 +337,7 @@ func TestTimestampPopulation(t *testing.T) {
 		return &cloudwatchlogs.PutLogEventsOutput{}, nil
 	}
 
-	q, sender := testPreparation(t, -1, &s, 1*time.Hour, 2*time.Hour, nil, &wg)
+	q, sender := testPreparation(t, -1, &s, 1*time.Hour, nil, &wg)
 	for i := 0; i < 3; i++ {
 		q.AddEvent(newStubLogEvent("msg", time.Time{}))
 	}
@@ -361,7 +361,7 @@ func TestIgnoreOutOfTimeRangeEvent(t *testing.T) {
 	}
 
 	logSink := testutil.NewLogSink()
-	q, sender := testPreparationWithLogger(t, logSink, -1, &s, 10*time.Millisecond, 2*time.Hour, nil, &wg)
+	q, sender := testPreparationWithLogger(t, logSink, -1, &s, 10*time.Millisecond, nil, &wg)
 	q.AddEvent(newStubLogEvent("MSG", time.Now().Add(-15*24*time.Hour)))
 	q.AddEventNonBlocking(newStubLogEvent("MSG", time.Now().Add(2*time.Hour+1*time.Minute)))
 
@@ -414,7 +414,7 @@ func TestAddMultipleEvents(t *testing.T) {
 		))
 	}
 	evts[10], evts[90] = evts[90], evts[10] // make events out of order
-	q, sender := testPreparation(t, -1, &s, 1*time.Hour, 2*time.Hour, nil, &wg)
+	q, sender := testPreparation(t, -1, &s, 1*time.Hour, nil, &wg)
 	for _, e := range evts {
 		q.AddEvent(e)
 	}
@@ -466,7 +466,7 @@ func TestSendReqWhenEventsSpanMoreThan24Hrs(t *testing.T) {
 		return nil, nil
 	}
 
-	q, sender := testPreparation(t, -1, &s, 1*time.Hour, 2*time.Hour, nil, &wg)
+	q, sender := testPreparation(t, -1, &s, 1*time.Hour, nil, &wg)
 	q.AddEvent(newStubLogEvent("MSG 25hrs ago", time.Now().Add(-25*time.Hour)))
 	q.AddEvent(newStubLogEvent("MSG 24hrs ago", time.Now().Add(-24*time.Hour)))
 	q.AddEvent(newStubLogEvent("MSG 23hrs ago", time.Now().Add(-23*time.Hour)))
@@ -496,7 +496,7 @@ func TestUnhandledErrorWouldNotResend(t *testing.T) {
 	}
 
 	logSink := testutil.NewLogSink()
-	q, sender := testPreparationWithLogger(t, logSink, -1, &s, 10*time.Millisecond, 2*time.Hour, nil, &wg)
+	q, sender := testPreparationWithLogger(t, logSink, -1, &s, 10*time.Millisecond, nil, &wg)
 	q.AddEvent(newStubLogEvent("msg", time.Now()))
 	time.Sleep(2 * time.Second)
 
@@ -542,7 +542,7 @@ func TestCreateLogGroupAndLogStreamWhenNotFound(t *testing.T) {
 	}
 
 	logSink := testutil.NewLogSink()
-	q, sender := testPreparationWithLogger(t, logSink, -1, &s, 1*time.Hour, 2*time.Hour, nil, &wg)
+	q, sender := testPreparationWithLogger(t, logSink, -1, &s, 1*time.Hour, nil, &wg)
 	var eventWG sync.WaitGroup
 	eventWG.Add(1)
 	q.AddEvent(&stubLogEvent{message: "msg", timestamp: time.Now(), done: eventWG.Done})
@@ -580,7 +580,7 @@ func TestLogRejectedLogEntryInfo(t *testing.T) {
 	}
 
 	logSink := testutil.NewLogSink()
-	q, sender := testPreparationWithLogger(t, logSink, -1, &s, 1*time.Hour, 2*time.Hour, nil, &wg)
+	q, sender := testPreparationWithLogger(t, logSink, -1, &s, 1*time.Hour, nil, &wg)
 	var eventWG sync.WaitGroup
 	eventWG.Add(1)
 	q.AddEvent(&stubLogEvent{message: "msg", timestamp: time.Now(), done: eventWG.Done})
@@ -630,7 +630,7 @@ func TestAddEventNonBlocking(t *testing.T) {
 			start.Add(time.Duration(i)*time.Millisecond),
 		))
 	}
-	q, sender := testPreparation(t, -1, &s, 1*time.Hour, 2*time.Hour, nil, &wg)
+	q, sender := testPreparation(t, -1, &s, 1*time.Hour, nil, &wg)
 	time.Sleep(200 * time.Millisecond) // Wait until pusher started, merge channel is blocked
 
 	for _, e := range evts {
@@ -646,31 +646,6 @@ func TestAddEventNonBlocking(t *testing.T) {
 	wg.Wait()
 }
 
-func TestResendWouldStopAfterExhaustedRetries(t *testing.T) {
-	t.Parallel()
-	var wg sync.WaitGroup
-	var s stubLogsService
-	var cnt atomic.Int32
-
-	s.ple = func(*cloudwatchlogs.PutLogEventsInput) (*cloudwatchlogs.PutLogEventsOutput, error) {
-		cnt.Add(1)
-		return nil, &cloudwatchlogs.ServiceUnavailableException{}
-	}
-
-	logSink := testutil.NewLogSink()
-	q, sender := testPreparationWithLogger(t, logSink, -1, &s, 10*time.Millisecond, time.Second, nil, &wg)
-	q.AddEvent(newStubLogEvent("msg", time.Now()))
-	time.Sleep(2 * time.Second)
-
-	logLines := logSink.Lines()
-	lastLine := logLines[len(logLines)-1]
-	expected := fmt.Sprintf("All %v retries to G/S failed for PutLogEvents, request dropped.", cnt.Load()-1)
-	require.True(t, strings.HasSuffix(lastLine, expected), fmt.Sprintf("Expecting error log to end with request dropped, but received '%s' in the log", logSink.String()))
-
-	q.Stop()
-	sender.Stop()
-	wg.Wait()
-}
 
 // Cannot call q.send() directly as it would cause a race condition. Reset last sent time and trigger flush.
 func triggerSend(t *testing.T, q *queue) {
@@ -684,7 +659,6 @@ func testPreparation(
 	retention int,
 	service cloudWatchLogsService,
 	flushTimeout time.Duration,
-	retryDuration time.Duration,
 	entityProvider logs.LogEntityProvider,
 	wg *sync.WaitGroup,
 ) (*queue, Sender) {
@@ -694,7 +668,6 @@ func testPreparation(
 		retention,
 		service,
 		flushTimeout,
-		retryDuration,
 		entityProvider,
 		wg,
 	)
@@ -706,13 +679,12 @@ func testPreparationWithLogger(
 	retention int,
 	service cloudWatchLogsService,
 	flushTimeout time.Duration,
-	retryDuration time.Duration,
 	entityProvider logs.LogEntityProvider,
 	wg *sync.WaitGroup,
 ) (*queue, Sender) {
 	t.Helper()
 	tm := NewTargetManager(logger, service)
-	s := newSender(logger, service, tm, retryDuration, nil)
+	s := newSender(logger, service, tm, nil)
 	q := newQueue(
 		logger,
 		Target{"G", "S", util.StandardLogGroupClass, retention},
diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_expiry_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_expiry_test.go
index 178a2bf8ae4..8726a329890 100644
--- a/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_expiry_test.go
+++ b/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_expiry_test.go
@@ -74,7 +74,7 @@ func TestRetryHeapProcessorExpiredBatchShouldResume(t *testing.T) {
 
 	// Initialize the batch's start time to make it already expired
 	batch.initializeStartTime()
-	batch.startTime = time.Now().Add(-100 * time.Millisecond) // Already expired
+	batch.expireAfter = time.Now().Add(-10 * time.Millisecond) // Already expired
 
 	// Update retry metadata to simulate a failed attempt and make it ready for retry
 	batch.updateRetryMetadata(&cloudwatchlogs.ServiceUnavailableException{})
diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/sender_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/sender_test.go
index 72072837431..fc52e673a88 100644
--- a/plugins/outputs/cloudwatchlogs/internal/pusher/sender_test.go
+++ b/plugins/outputs/cloudwatchlogs/internal/pusher/sender_test.go
@@ -80,7 +80,7 @@ func TestSender(t *testing.T) {
 		mockManager := new(mockTargetManager)
 		mockService.On("PutLogEvents", mock.Anything).Return(&cloudwatchlogs.PutLogEventsOutput{}, nil).Once()
 
-		s := newSender(logger, mockService, mockManager, time.Second, nil)
+		s := newSender(logger, mockService, mockManager, nil)
 		s.Send(batch)
 		s.Stop()
 
@@ -103,7 +103,7 @@ func TestSender(t *testing.T) {
 		mockManager := new(mockTargetManager)
 		mockService.On("PutLogEvents", mock.Anything).Return(&cloudwatchlogs.PutLogEventsOutput{RejectedLogEventsInfo: rejectedInfo}, nil).Once()
 
-		s := newSender(logger, mockService, mockManager, time.Second, nil)
+		s := newSender(logger, mockService, mockManager, nil)
 		s.Send(batch)
 		s.Stop()
 
@@ -122,7 +122,7 @@ func TestSender(t *testing.T) {
 		mockManager.On("InitTarget", mock.Anything).Return(nil).Once()
 		mockService.On("PutLogEvents", mock.Anything).Return(&cloudwatchlogs.PutLogEventsOutput{}, nil).Once()
 
-		s := newSender(logger, mockService, mockManager, time.Second, nil)
+		s := newSender(logger, mockService, mockManager, nil)
 		s.Send(batch)
 		s.Stop()
 
@@ -149,7 +149,7 @@ func TestSender(t *testing.T) {
 		mockService.On("PutLogEvents", mock.Anything).
 			Return(&cloudwatchlogs.PutLogEventsOutput{}, &cloudwatchlogs.InvalidParameterException{}).Once()
 
-		s := newSender(logger, mockService, mockManager, time.Second, nil)
+		s := newSender(logger, mockService, mockManager, nil)
 		s.Send(batch)
 		s.Stop()
 
@@ -177,7 +177,7 @@ func TestSender(t *testing.T) {
 		mockService.On("PutLogEvents", mock.Anything).
 			Return(&cloudwatchlogs.PutLogEventsOutput{}, &cloudwatchlogs.DataAlreadyAcceptedException{}).Once()
 
-		s := newSender(logger, mockService, mockManager, time.Second, nil)
+		s := newSender(logger, mockService, mockManager, nil)
 		s.Send(batch)
 		s.Stop()
 
@@ -205,7 +205,7 @@ func TestSender(t *testing.T) {
 		mockService.On("PutLogEvents", mock.Anything).
 			Return(&cloudwatchlogs.PutLogEventsOutput{}, errors.New("test")).Once()
 
-		s := newSender(logger, mockService, mockManager, time.Second, nil)
+		s := newSender(logger, mockService, mockManager, nil)
 		s.Send(batch)
 		s.Stop()
 
@@ -225,7 +225,7 @@ func TestSender(t *testing.T) {
 		mockService.On("PutLogEvents", mock.Anything).
 			Return(&cloudwatchlogs.PutLogEventsOutput{}, nil).Once()
 
-		s := newSender(logger, mockService, mockManager, time.Second, nil)
+		s := newSender(logger, mockService, mockManager, nil)
 		s.Send(batch)
 		s.Stop()
 
@@ -251,7 +251,12 @@ func TestSender(t *testing.T) {
 		mockService.On("PutLogEvents", mock.Anything).
 			Return(&cloudwatchlogs.PutLogEventsOutput{}, awserr.New("SomeAWSError", "Some AWS error", nil)).Once()
 
-		s := newSender(logger, mockService, mockManager, 100*time.Millisecond, nil)
+		s := newSender(logger, mockService, mockManager, nil)
+		
+		// Set expireAfter to past time so batch expires immediately after first retry
+		batch.initializeStartTime()
+		batch.expireAfter = time.Now().Add(-1 * time.Hour)
+		
 		s.Send(batch)
 		s.Stop()
 
@@ -279,7 +284,7 @@ func TestSender(t *testing.T) {
 		mockService.On("PutLogEvents", mock.Anything).
 			Return(&cloudwatchlogs.PutLogEventsOutput{}, awserr.New("SomeAWSError", "Some AWS error", nil)).Once()
 
-		s := newSender(logger, mockService, mockManager, time.Second, nil)
+		s := newSender(logger, mockService, mockManager, nil)
 
 		go func() {
 			time.Sleep(50 * time.Millisecond)
@@ -302,7 +307,7 @@ func TestSenderConcurrencyWithRetryHeap(t *testing.T) {
 	retryHeap := NewRetryHeap(10, logger)
 	defer retryHeap.Stop()
 
-	s := newSender(logger, mockService, mockManager, time.Hour, retryHeap)
+	s := newSender(logger, mockService, mockManager, retryHeap)
 
 	batch := newLogEventBatch(Target{Group: "test-group", Stream: "test-stream"}, nil)
 	batch.append(newLogEvent(time.Now(), "Test message", nil))
@@ -325,7 +330,7 @@ func TestSenderConcurrencyFallbackToSync(t *testing.T) {
 	mockService.On("PutLogEvents", mock.Anything).Return(&cloudwatchlogs.PutLogEventsOutput{}, nil).Once()
 
 	// Concurrency enabled but nil RetryHeap should fall back to sync
-	s := newSender(logger, mockService, mockManager, 2*time.Second, nil)
+	s := newSender(logger, mockService, mockManager, nil)
 
 	batch := newLogEventBatch(Target{Group: "test-group", Stream: "test-stream"}, nil)
 	batch.append(newLogEvent(time.Now(), "Test message", nil))

From 4d798e2a7ec89ded1b210cbca5631c56fb65bfbb Mon Sep 17 00:00:00 2001
From: Marcus Mann <mpmann@amazon.com>
Date: Tue, 10 Feb 2026 13:03:27 -0500
Subject: [PATCH 22/50] Add test for initializeStartTime idempotency

Verifies that startTime and expireAfter are only set once on first call
and remain unchanged on subsequent calls, ensuring the 14-day expiration
is measured from the first send attempt, not from each retry.
---
 .../internal/pusher/batch_test.go             | 34 +++++++++++++++++++
 1 file changed, 34 insertions(+)

diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/batch_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/batch_test.go
index 2f8db4f689c..cc031362adc 100644
--- a/plugins/outputs/cloudwatchlogs/internal/pusher/batch_test.go
+++ b/plugins/outputs/cloudwatchlogs/internal/pusher/batch_test.go
@@ -436,3 +436,37 @@ func TestBatchRetryMetadata(t *testing.T) {
 	batch.expireAfter = time.Now().Add(-1 * time.Hour)
 	assert.True(t, batch.isExpired())
 }
+
+func TestBatchInitializeStartTimeIdempotent(t *testing.T) {
+	batch := newLogEventBatch(Target{Group: "test-group", Stream: "test-stream"}, nil)
+
+	// Verify initial state
+	assert.True(t, batch.startTime.IsZero())
+	assert.True(t, batch.expireAfter.IsZero())
+
+	// First call should set both values
+	batch.initializeStartTime()
+	assert.False(t, batch.startTime.IsZero())
+	assert.False(t, batch.expireAfter.IsZero())
+
+	// Capture the values
+	firstStartTime := batch.startTime
+	firstExpireAfter := batch.expireAfter
+
+	// Verify expireAfter is set to startTime + maxRetryTimeout
+	expectedExpireAfter := firstStartTime.Add(maxRetryTimeout)
+	assert.Equal(t, expectedExpireAfter, firstExpireAfter)
+
+	// Wait a bit to ensure time has passed
+	time.Sleep(10 * time.Millisecond)
+
+	// Second call should NOT change the values (idempotent)
+	batch.initializeStartTime()
+	assert.Equal(t, firstStartTime, batch.startTime, "startTime should not change on second call")
+	assert.Equal(t, firstExpireAfter, batch.expireAfter, "expireAfter should not change on second call")
+
+	// Third call should also not change the values
+	batch.initializeStartTime()
+	assert.Equal(t, firstStartTime, batch.startTime, "startTime should not change on third call")
+	assert.Equal(t, firstExpireAfter, batch.expireAfter, "expireAfter should not change on third call")
+}

From cdf1651ae8c1b517e4febb6e63ee8ef5ad8a2373 Mon Sep 17 00:00:00 2001
From: Marcus Mann <mpmann@amazon.com>
Date: Tue, 10 Feb 2026 13:42:31 -0500
Subject: [PATCH 23/50] refactor(pusher): Remove unused concurrency parameter
 from NewPusher
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Concurrency is now determined by whether workerPool and retryHeap are
provided, making the explicit concurrency parameter redundant.

🤖 Assisted by AI
---
 plugins/outputs/cloudwatchlogs/cloudwatchlogs.go              | 2 +-
 .../cloudwatchlogs/internal/pusher/circuitbreaker_test.go     | 4 ++--
 plugins/outputs/cloudwatchlogs/internal/pusher/pusher.go      | 1 -
 plugins/outputs/cloudwatchlogs/internal/pusher/pusher_test.go | 2 --
 4 files changed, 3 insertions(+), 6 deletions(-)

diff --git a/plugins/outputs/cloudwatchlogs/cloudwatchlogs.go b/plugins/outputs/cloudwatchlogs/cloudwatchlogs.go
index 45bcc947d4d..d35542ff930 100644
--- a/plugins/outputs/cloudwatchlogs/cloudwatchlogs.go
+++ b/plugins/outputs/cloudwatchlogs/cloudwatchlogs.go
@@ -183,7 +183,7 @@ func (c *CloudWatchLogs) getDest(t pusher.Target, logSrc logs.LogSrc) *cwDest {
 		}
 		c.targetManager = pusher.NewTargetManager(c.Log, client)
 	})
-	p := pusher.NewPusher(c.Log, t, client, c.targetManager, logSrc, c.workerPool, c.ForceFlushInterval.Duration, &c.pusherWaitGroup, c.Concurrency, c.retryHeap)
+	p := pusher.NewPusher(c.Log, t, client, c.targetManager, logSrc, c.workerPool, c.ForceFlushInterval.Duration, &c.pusherWaitGroup, c.retryHeap)
 	cwd := &cwDest{
 		pusher:   p,
 		retryer:  logThrottleRetryer,
diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/circuitbreaker_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/circuitbreaker_test.go
index dd0651020a0..972d1fab482 100644
--- a/plugins/outputs/cloudwatchlogs/internal/pusher/circuitbreaker_test.go
+++ b/plugins/outputs/cloudwatchlogs/internal/pusher/circuitbreaker_test.go
@@ -61,8 +61,8 @@ func TestCircuitBreakerBlocksTargetAfterFailure(t *testing.T) {
 	var wg sync.WaitGroup
 	flushTimeout := 50 * time.Millisecond
 
-	failingPusher := NewPusher(logger, failingTarget, service, tm, nil, workerPool, flushTimeout, &wg, 2, retryHeap)
-	healthyPusher := NewPusher(logger, healthyTarget, service, tm, nil, workerPool, flushTimeout, &wg, 2, retryHeap)
+	failingPusher := NewPusher(logger, failingTarget, service, tm, nil, workerPool, flushTimeout, &wg, retryHeap)
+	healthyPusher := NewPusher(logger, healthyTarget, service, tm, nil, workerPool, flushTimeout, &wg, retryHeap)
 	defer failingPusher.Stop()
 	defer healthyPusher.Stop()
 
diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/pusher.go b/plugins/outputs/cloudwatchlogs/internal/pusher/pusher.go
index 6a4c9f2df24..aa2f4be722a 100644
--- a/plugins/outputs/cloudwatchlogs/internal/pusher/pusher.go
+++ b/plugins/outputs/cloudwatchlogs/internal/pusher/pusher.go
@@ -33,7 +33,6 @@ func NewPusher(
 	workerPool WorkerPool,
 	flushTimeout time.Duration,
 	wg *sync.WaitGroup,
-	_ int,
 	retryHeap RetryHeap,
 ) *Pusher {
 	s := createSender(logger, service, targetManager, workerPool, retryHeap)
diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/pusher_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/pusher_test.go
index 41640490b1e..b80a201a07e 100644
--- a/plugins/outputs/cloudwatchlogs/internal/pusher/pusher_test.go
+++ b/plugins/outputs/cloudwatchlogs/internal/pusher/pusher_test.go
@@ -112,7 +112,6 @@ func setupPusher(t *testing.T, workerPool WorkerPool, wg *sync.WaitGroup) *Pushe
 		workerPool,
 		time.Second,
 		wg,
-		1,   // concurrency
 		nil, // retryHeap
 	)
 
@@ -149,7 +148,6 @@ func TestPusherRetryHeap(t *testing.T) {
 		workerPool,
 		time.Second,
 		&wg,
-		2, // concurrency > 1
 		retryHeap,
 	)
 

From 28ba9023a6a5084eaec7ecd764686e837d7893d0 Mon Sep 17 00:00:00 2001
From: Marcus Mann <mpmann@amazon.com>
Date: Tue, 10 Feb 2026 15:47:05 -0500
Subject: [PATCH 24/50] test(pusher): Add automated recovery tests for poison
 pill
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add comprehensive recovery tests validating:
1. Permission granted during retry - system recovers and publishes logs
2. System restart during retry - resumes correctly with preserved metadata
3. Multiple targets - healthy targets unaffected by failing target

Tests validate circuit breaker behavior, retry heap functionality,
and proper isolation between targets during permission failures.

Addresses CWQS-3192 (P1 requirement)

🤖 Assisted by AI
---
 .../pusher/retryheap_recovery_test.go         | 281 ++++++++++++++++++
 1 file changed, 281 insertions(+)
 create mode 100644 plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_recovery_test.go

diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_recovery_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_recovery_test.go
new file mode 100644
index 00000000000..067f4af3136
--- /dev/null
+++ b/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_recovery_test.go
@@ -0,0 +1,281 @@
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: MIT
+
+package pusher
+
+import (
+	"errors"
+	"sync"
+	"testing"
+	"time"
+
+	"github.com/influxdata/telegraf/testutil"
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/mock"
+
+	"github.com/aws/amazon-cloudwatch-agent/internal/retryer"
+	"github.com/aws/amazon-cloudwatch-agent/sdk/service/cloudwatchlogs"
+)
+
+// TestRecoveryWhenPermissionGrantedDuringRetry validates that when PLE permissions
+// are missing initially but granted while retry is ongoing, the system recovers
+// and successfully publishes logs.
+// This test addresses CWQS-3192 requirement 1.
+func TestRecoveryWhenPermissionGrantedDuringRetry(t *testing.T) {
+	heap := NewRetryHeap(10, &testutil.Logger{})
+	defer heap.Stop()
+
+	workerPool := NewWorkerPool(2)
+	defer workerPool.Stop()
+
+	// Mock service that initially returns AccessDenied, then succeeds
+	mockService := &mockLogsService{}
+	accessDeniedErr := &cloudwatchlogs.AccessDeniedException{
+		Message_: stringPtr("Access denied"),
+	}
+	
+	// First call fails with AccessDenied
+	mockService.On("PutLogEvents", mock.Anything).Return((*cloudwatchlogs.PutLogEventsOutput)(nil), accessDeniedErr).Once()
+	// Second call succeeds (permission granted)
+	mockService.On("PutLogEvents", mock.Anything).Return(&cloudwatchlogs.PutLogEventsOutput{}, nil).Once()
+
+	mockTargetManager := &mockTargetManager{}
+	mockTargetManager.On("EnsureTargetExists", mock.Anything).Return(nil)
+
+	processor := NewRetryHeapProcessor(heap, workerPool, mockService, mockTargetManager, &testutil.Logger{}, time.Hour, retryer.NewLogThrottleRetryer(&testutil.Logger{}))
+
+	// Create batch and track circuit breaker state
+	target := Target{Group: "group", Stream: "stream"}
+	batch := newLogEventBatch(target, nil)
+	batch.events = []*cloudwatchlogs.InputLogEvent{
+		{Message: stringPtr("test message"), Timestamp: int64Ptr(time.Now().Unix() * 1000)},
+	}
+
+	var haltCalled, resumeCalled bool
+	var mu sync.Mutex
+
+	// Register circuit breaker callbacks
+	batch.addFailCallback(func() {
+		mu.Lock()
+		haltCalled = true
+		mu.Unlock()
+	})
+	batch.addDoneCallback(func() {
+		mu.Lock()
+		resumeCalled = true
+		mu.Unlock()
+	})
+
+	// Set batch ready for immediate retry
+	batch.nextRetryTime = time.Now().Add(-1 * time.Second)
+
+	// Push batch to heap
+	err := heap.Push(batch)
+	assert.NoError(t, err)
+
+	// Process first attempt - should fail with AccessDenied
+	processor.processReadyMessages()
+
+	// Wait for async processing to complete
+	time.Sleep(100 * time.Millisecond)
+
+	// Verify circuit breaker halted
+	mu.Lock()
+	assert.True(t, haltCalled, "Circuit breaker should halt on failure")
+	assert.False(t, resumeCalled, "Circuit breaker should not resume yet")
+	mu.Unlock()
+
+	// Batch should be back in heap for retry
+	assert.Equal(t, 1, heap.Size(), "Failed batch should be in retry heap")
+
+	// Simulate permission being granted by waiting for retry time
+	// Set batch ready for immediate retry
+	batch.nextRetryTime = time.Now().Add(-1 * time.Second)
+
+	// Process second attempt - should succeed
+	processor.processReadyMessages()
+
+	// Wait for async processing to complete
+	time.Sleep(100 * time.Millisecond)
+
+	// Verify circuit breaker resumed
+	mu.Lock()
+	assert.True(t, resumeCalled, "Circuit breaker should resume on success")
+	mu.Unlock()
+
+	// Heap should be empty (batch successfully sent)
+	assert.Equal(t, 0, heap.Size(), "Heap should be empty after successful retry")
+
+	// Verify both PutLogEvents calls were made
+	mockService.AssertExpectations(t)
+}
+
+// TestRecoveryAfterSystemRestart validates that when the system restarts with
+// retry ongoing, it resumes correctly by loading state and continuing retries.
+// This test addresses CWQS-3192 requirement 2.
+func TestRecoveryAfterSystemRestart(t *testing.T) {
+	heap := NewRetryHeap(10, &testutil.Logger{})
+	defer heap.Stop()
+
+	workerPool := NewWorkerPool(2)
+	defer workerPool.Stop()
+
+	mockService := &mockLogsService{}
+	mockTargetManager := &mockTargetManager{}
+	mockTargetManager.On("EnsureTargetExists", mock.Anything).Return(nil)
+
+	// Simulate system restart scenario:
+	// 1. Initial failure puts batch in retry state
+	// 2. System "restarts" (new processor instance)
+	// 3. Batch is reloaded with retry metadata intact
+	// 4. Retry succeeds
+
+	target := Target{Group: "group", Stream: "stream"}
+	batch := newLogEventBatch(target, nil)
+	batch.events = []*cloudwatchlogs.InputLogEvent{
+		{Message: stringPtr("test message"), Timestamp: int64Ptr(time.Now().Unix() * 1000)},
+	}
+
+	// Simulate batch that was in retry state before restart
+	batch.retryCountShort = 2
+	batch.startTime = time.Now().Add(-5 * time.Minute)
+	batch.nextRetryTime = time.Now().Add(-1 * time.Second) // Ready for retry
+	batch.lastError = errors.New("previous error before restart")
+
+	var resumeCalled bool
+	var mu sync.Mutex
+
+	batch.addDoneCallback(func() {
+		mu.Lock()
+		resumeCalled = true
+		mu.Unlock()
+	})
+
+	// Mock successful retry after restart
+	mockService.On("PutLogEvents", mock.Anything).Return(&cloudwatchlogs.PutLogEventsOutput{}, nil).Once()
+
+	// Create new processor (simulating restart)
+	processor := NewRetryHeapProcessor(heap, workerPool, mockService, mockTargetManager, &testutil.Logger{}, time.Hour, retryer.NewLogThrottleRetryer(&testutil.Logger{}))
+
+	// Push batch with existing retry metadata
+	err := heap.Push(batch)
+	assert.NoError(t, err)
+
+	// Process should succeed
+	processor.processReadyMessages()
+
+	// Wait for async processing to complete
+	time.Sleep(100 * time.Millisecond)
+
+	// Verify circuit breaker resumed
+	mu.Lock()
+	assert.True(t, resumeCalled, "Circuit breaker should resume after successful retry post-restart")
+	mu.Unlock()
+
+	// Heap should be empty
+	assert.Equal(t, 0, heap.Size(), "Heap should be empty after successful retry")
+
+	// Verify retry metadata was preserved
+	assert.Equal(t, 2, batch.retryCountShort, "Retry count should be preserved across restart")
+	assert.False(t, batch.startTime.IsZero(), "Start time should be preserved across restart")
+
+	mockService.AssertExpectations(t)
+}
+
+// TestRecoveryWithMultipleTargets validates that when one target has permission
+// issues, other healthy targets continue publishing successfully.
+func TestRecoveryWithMultipleTargets(t *testing.T) {
+	heap := NewRetryHeap(10, &testutil.Logger{})
+	defer heap.Stop()
+
+	workerPool := NewWorkerPool(2)
+	defer workerPool.Stop()
+
+	mockService := &mockLogsService{}
+	mockTargetManager := &mockTargetManager{}
+	mockTargetManager.On("EnsureTargetExists", mock.Anything).Return(nil)
+
+	processor := NewRetryHeapProcessor(heap, workerPool, mockService, mockTargetManager, &testutil.Logger{}, time.Hour, retryer.NewLogThrottleRetryer(&testutil.Logger{}))
+
+	// Create two targets
+	target1 := Target{Group: "group1", Stream: "stream1"}
+	target2 := Target{Group: "group2", Stream: "stream2"}
+
+	batch1 := newLogEventBatch(target1, nil)
+	batch1.events = []*cloudwatchlogs.InputLogEvent{
+		{Message: stringPtr("message1"), Timestamp: int64Ptr(time.Now().Unix() * 1000)},
+	}
+	batch1.nextRetryTime = time.Now().Add(-1 * time.Second)
+
+	batch2 := newLogEventBatch(target2, nil)
+	batch2.events = []*cloudwatchlogs.InputLogEvent{
+		{Message: stringPtr("message2"), Timestamp: int64Ptr(time.Now().Unix() * 1000)},
+	}
+	batch2.nextRetryTime = time.Now().Add(-1 * time.Second)
+
+	var halt1Called, resume1Called, resume2Called bool
+	var mu sync.Mutex
+
+	// Target 1 fails with AccessDenied
+	batch1.addFailCallback(func() {
+		mu.Lock()
+		halt1Called = true
+		mu.Unlock()
+	})
+	batch1.addDoneCallback(func() {
+		mu.Lock()
+		resume1Called = true
+		mu.Unlock()
+	})
+
+	// Target 2 succeeds
+	batch2.addDoneCallback(func() {
+		mu.Lock()
+		resume2Called = true
+		mu.Unlock()
+	})
+
+	// Mock responses: target1 fails, target2 succeeds
+	accessDeniedErr := &cloudwatchlogs.AccessDeniedException{
+		Message_: stringPtr("Access denied"),
+	}
+	mockService.On("PutLogEvents", mock.MatchedBy(func(req *cloudwatchlogs.PutLogEventsInput) bool {
+		return *req.LogGroupName == "group1"
+	})).Return((*cloudwatchlogs.PutLogEventsOutput)(nil), accessDeniedErr).Once()
+
+	mockService.On("PutLogEvents", mock.MatchedBy(func(req *cloudwatchlogs.PutLogEventsInput) bool {
+		return *req.LogGroupName == "group2"
+	})).Return(&cloudwatchlogs.PutLogEventsOutput{}, nil).Once()
+
+	// Push both batches
+	err := heap.Push(batch1)
+	assert.NoError(t, err)
+	err = heap.Push(batch2)
+	assert.NoError(t, err)
+
+	// Process both batches
+	processor.processReadyMessages()
+
+	// Wait for async processing to complete
+	time.Sleep(100 * time.Millisecond)
+
+	// Verify target1 circuit breaker halted, target2 succeeded
+	mu.Lock()
+	assert.True(t, halt1Called, "Target1 circuit breaker should halt")
+	assert.False(t, resume1Called, "Target1 circuit breaker should not resume")
+	assert.True(t, resume2Called, "Target2 should succeed and resume")
+	mu.Unlock()
+
+	// Target1 should be back in heap, target2 should be done
+	assert.Equal(t, 1, heap.Size(), "Only failed target should remain in heap")
+
+	mockService.AssertExpectations(t)
+}
+
+func stringPtr(s string) *string {
+	return &s
+}
+
+func int64Ptr(i int64) *int64 {
+	return &i
+}

From 11b1d26ac66d2de6d41031dcb62c006d168f8021 Mon Sep 17 00:00:00 2001
From: Marcus Mann <mpmann@amazon.com>
Date: Wed, 11 Feb 2026 11:52:04 -0500
Subject: [PATCH 25/50] Add test filtering to integration test workflows

Add test_os_filter and test_dir_filter inputs to allow running
specific tests on specific OS platforms. Filters use jq to filter
generated test matrices before execution.

Usage:
  -f test_os_filter=al2023 (run only on al2023)
  -f test_dir_filter=./test/cloudwatchlogs (run only cloudwatchlogs)

When filters are omitted, all tests run (default behavior).
---
 .github/workflows/integration-test.yml | 10 ++++
 .github/workflows/test-artifacts.yml   | 69 +++++++++++++++++++-------
 2 files changed, 62 insertions(+), 17 deletions(-)

diff --git a/.github/workflows/integration-test.yml b/.github/workflows/integration-test.yml
index 37823e93eac..6de692f2838 100644
--- a/.github/workflows/integration-test.yml
+++ b/.github/workflows/integration-test.yml
@@ -33,6 +33,14 @@ on:
       test_repo_branch:
         description: 'Override for the GitHub test repository branch to use (default is main)'
         type: string
+      test_os_filter:
+        description: 'Filter tests to specific OS (e.g., al2023, ubuntu-22.04). Leave empty to run all OS.'
+        type: string
+        required: false
+      test_dir_filter:
+        description: 'Filter tests to specific test directory (e.g., ./test/cloudwatchlogs). Leave empty to run all tests.'
+        type: string
+        required: false
 
 jobs:
   CheckBuildTestArtifacts:
@@ -67,3 +75,5 @@ jobs:
     with:
       build_id: ${{ inputs.build_sha }}
       test_repo_branch: ${{ inputs.test_repo_branch }}
+      test_os_filter: ${{ inputs.test_os_filter }}
+      test_dir_filter: ${{ inputs.test_dir_filter }}
diff --git a/.github/workflows/test-artifacts.yml b/.github/workflows/test-artifacts.yml
index d1965250ae2..2a17fd27a79 100644
--- a/.github/workflows/test-artifacts.yml
+++ b/.github/workflows/test-artifacts.yml
@@ -29,6 +29,14 @@ on:
       test_repo_branch:
         description: 'Override for the GitHub test repository branch to use (default is main)'
         type: string
+      test_os_filter:
+        description: 'Filter tests to specific OS (e.g., al2023, ubuntu-22.04)'
+        type: string
+        required: false
+      test_dir_filter:
+        description: 'Filter tests to specific test directory (e.g., ./test/cloudwatchlogs)'
+        type: string
+        required: false
   workflow_call:
     inputs:
       build_id:
@@ -38,6 +46,14 @@ on:
       test_repo_branch:
         description: 'Override for the GitHub test repository branch to use (default is main)'
         type: string
+      test_os_filter:
+        description: 'Filter tests to specific OS (e.g., al2023, ubuntu-22.04)'
+        type: string
+        required: false
+      test_dir_filter:
+        description: 'Filter tests to specific test directory (e.g., ./test/cloudwatchlogs)'
+        type: string
+        required: false
 
 concurrency:
   group: ${{ github.workflow }}-${{ inputs.build_id }}
@@ -161,23 +177,42 @@ jobs:
         id: set-matrix
         run: |
           go run --tags=generator generator/test_case_generator.go
-          echo "ec2_gpu_matrix=$(echo $(cat generator/resources/ec2_gpu_complete_test_matrix.json))" >> "$GITHUB_OUTPUT"
-          echo "eks_addon_matrix=$(echo $(cat generator/resources/eks_addon_complete_test_matrix.json))" >> "$GITHUB_OUTPUT"
-          echo "ec2_linux_matrix=$(echo $(cat generator/resources/ec2_linux_complete_test_matrix.json))" >> "$GITHUB_OUTPUT"
-          echo "ec2_linux_onprem_matrix=$(echo $(cat generator/resources/ec2_linux_onprem_complete_test_matrix.json))" >> "$GITHUB_OUTPUT"
-          echo "ec2_selinux_matrix=$(echo $(cat generator/resources/ec2_selinux_complete_test_matrix.json))" >> "$GITHUB_OUTPUT"
-          echo "ec2_windows_matrix=$(echo $(cat generator/resources/ec2_windows_complete_test_matrix.json))" >> "$GITHUB_OUTPUT"
-          echo "ec2_mac_matrix=$(echo $(cat generator/resources/ec2_mac_complete_test_matrix.json))" >> "$GITHUB_OUTPUT"
-          echo "ec2_performance_matrix=$(echo $(cat generator/resources/ec2_performance_complete_test_matrix.json))" >> "$GITHUB_OUTPUT"
-          echo "ec2_windows_performance_matrix=$(echo $(cat generator/resources/ec2_windows_performance_complete_test_matrix.json))" >> "$GITHUB_OUTPUT"
-          echo "ec2_stress_matrix=$(echo $(cat generator/resources/ec2_stress_complete_test_matrix.json))" >> "$GITHUB_OUTPUT"
-          echo "ec2_windows_stress_matrix=$(echo $(cat generator/resources/ec2_windows_stress_complete_test_matrix.json))" >> "$GITHUB_OUTPUT"
-          echo "ecs_ec2_launch_daemon_matrix=$(echo $(cat generator/resources/ecs_ec2_daemon_complete_test_matrix.json))" >> "$GITHUB_OUTPUT"
-          echo "ecs_fargate_matrix=$(echo $(cat generator/resources/ecs_fargate_complete_test_matrix.json))" >> "$GITHUB_OUTPUT"
-          echo "eks_daemon_matrix=$(echo $(cat generator/resources/eks_daemon_complete_test_matrix.json))" >> "$GITHUB_OUTPUT"
-          echo "eks_deployment_matrix=$(echo $(cat generator/resources/eks_deployment_complete_test_matrix.json))" >> "$GITHUB_OUTPUT"
-          echo "ec2_linux_itar_matrix=$(echo $(cat generator/resources/ec2_linux_itar_complete_test_matrix.json))" >> "$GITHUB_OUTPUT"
-          echo "ec2_linux_china_matrix=$(echo $(cat generator/resources/ec2_linux_china_complete_test_matrix.json))" >> "$GITHUB_OUTPUT"
+          
+          # Function to apply filters to a matrix
+          apply_filters() {
+            local matrix_file=$1
+            local matrix_content=$(cat "$matrix_file")
+            
+            # Apply OS filter if provided
+            if [ -n "${{ inputs.test_os_filter }}" ]; then
+              matrix_content=$(echo "$matrix_content" | jq '[.[] | select(.os == "${{ inputs.test_os_filter }}")]')
+            fi
+            
+            # Apply test directory filter if provided
+            if [ -n "${{ inputs.test_dir_filter }}" ]; then
+              matrix_content=$(echo "$matrix_content" | jq '[.[] | select(.test_dir == "${{ inputs.test_dir_filter }}")]')
+            fi
+            
+            echo "$matrix_content"
+          }
+          
+          echo "ec2_gpu_matrix=$(apply_filters generator/resources/ec2_gpu_complete_test_matrix.json)" >> "$GITHUB_OUTPUT"
+          echo "eks_addon_matrix=$(apply_filters generator/resources/eks_addon_complete_test_matrix.json)" >> "$GITHUB_OUTPUT"
+          echo "ec2_linux_matrix=$(apply_filters generator/resources/ec2_linux_complete_test_matrix.json)" >> "$GITHUB_OUTPUT"
+          echo "ec2_linux_onprem_matrix=$(apply_filters generator/resources/ec2_linux_onprem_complete_test_matrix.json)" >> "$GITHUB_OUTPUT"
+          echo "ec2_selinux_matrix=$(apply_filters generator/resources/ec2_selinux_complete_test_matrix.json)" >> "$GITHUB_OUTPUT"
+          echo "ec2_windows_matrix=$(apply_filters generator/resources/ec2_windows_complete_test_matrix.json)" >> "$GITHUB_OUTPUT"
+          echo "ec2_mac_matrix=$(apply_filters generator/resources/ec2_mac_complete_test_matrix.json)" >> "$GITHUB_OUTPUT"
+          echo "ec2_performance_matrix=$(apply_filters generator/resources/ec2_performance_complete_test_matrix.json)" >> "$GITHUB_OUTPUT"
+          echo "ec2_windows_performance_matrix=$(apply_filters generator/resources/ec2_windows_performance_complete_test_matrix.json)" >> "$GITHUB_OUTPUT"
+          echo "ec2_stress_matrix=$(apply_filters generator/resources/ec2_stress_complete_test_matrix.json)" >> "$GITHUB_OUTPUT"
+          echo "ec2_windows_stress_matrix=$(apply_filters generator/resources/ec2_windows_stress_complete_test_matrix.json)" >> "$GITHUB_OUTPUT"
+          echo "ecs_ec2_launch_daemon_matrix=$(apply_filters generator/resources/ecs_ec2_daemon_complete_test_matrix.json)" >> "$GITHUB_OUTPUT"
+          echo "ecs_fargate_matrix=$(apply_filters generator/resources/ecs_fargate_complete_test_matrix.json)" >> "$GITHUB_OUTPUT"
+          echo "eks_daemon_matrix=$(apply_filters generator/resources/eks_daemon_complete_test_matrix.json)" >> "$GITHUB_OUTPUT"
+          echo "eks_deployment_matrix=$(apply_filters generator/resources/eks_deployment_complete_test_matrix.json)" >> "$GITHUB_OUTPUT"
+          echo "ec2_linux_itar_matrix=$(apply_filters generator/resources/ec2_linux_itar_complete_test_matrix.json)" >> "$GITHUB_OUTPUT"
+          echo "ec2_linux_china_matrix=$(apply_filters generator/resources/ec2_linux_china_complete_test_matrix.json)" >> "$GITHUB_OUTPUT"
 
       - name: Echo test plan matrix
         run: |

From 8e20ddfc06a92f3664e6b933bf163ad0fc57e283 Mon Sep 17 00:00:00 2001
From: Marcus Mann <mpmann@amazon.com>
Date: Wed, 11 Feb 2026 12:41:57 -0500
Subject: [PATCH 26/50] fix: Use compact JSON output in apply_filters to
 prevent multiline GITHUB_OUTPUT errors

---
 .github/workflows/test-artifacts.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/test-artifacts.yml b/.github/workflows/test-artifacts.yml
index 2a17fd27a79..ea75854126f 100644
--- a/.github/workflows/test-artifacts.yml
+++ b/.github/workflows/test-artifacts.yml
@@ -185,12 +185,12 @@ jobs:
             
             # Apply OS filter if provided
             if [ -n "${{ inputs.test_os_filter }}" ]; then
-              matrix_content=$(echo "$matrix_content" | jq '[.[] | select(.os == "${{ inputs.test_os_filter }}")]')
+              matrix_content=$(echo "$matrix_content" | jq -c '[.[] | select(.os == "${{ inputs.test_os_filter }}")]')
             fi
             
             # Apply test directory filter if provided
             if [ -n "${{ inputs.test_dir_filter }}" ]; then
-              matrix_content=$(echo "$matrix_content" | jq '[.[] | select(.test_dir == "${{ inputs.test_dir_filter }}")]')
+              matrix_content=$(echo "$matrix_content" | jq -c '[.[] | select(.test_dir == "${{ inputs.test_dir_filter }}")]')
             fi
             
             echo "$matrix_content"

From fd28e8e2a73dad30c64db4cb284ce8376fc89161 Mon Sep 17 00:00:00 2001
From: Marcus Mann <mpmann@amazon.com>
Date: Wed, 11 Feb 2026 15:08:01 -0500
Subject: [PATCH 27/50] Add unit tests for poison pill scenario

- TestPoisonPillScenario: Validates continuous batch generation with 10 denied + 1 allowed log group
- TestSingleDeniedLogGroup: Baseline test with 1 denied + 1 allowed log group
- TestRetryHeapSmallerThanFailingLogGroups: Demonstrates deadlock when heap size < failing log groups (SKIPPED)

The third test intentionally deadlocks to prove the bug exists when:
- Retry heap size = concurrency (2)
- Number of failing log groups (10) > heap size (2)
- Workers block trying to push to full heap
- System deadlocks, starving allowed log group
---
 .../internal/pusher/poison_pill_test.go       | 390 ++++++++++++++++++
 1 file changed, 390 insertions(+)
 create mode 100644 plugins/outputs/cloudwatchlogs/internal/pusher/poison_pill_test.go

diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/poison_pill_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/poison_pill_test.go
new file mode 100644
index 00000000000..976f9840d09
--- /dev/null
+++ b/plugins/outputs/cloudwatchlogs/internal/pusher/poison_pill_test.go
@@ -0,0 +1,390 @@
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: MIT
+
+package pusher
+
+import (
+	"fmt"
+	"sync"
+	"sync/atomic"
+	"testing"
+	"time"
+
+	"github.com/influxdata/telegraf/testutil"
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/mock"
+
+	"github.com/aws/amazon-cloudwatch-agent/internal/retryer"
+	"github.com/aws/amazon-cloudwatch-agent/sdk/service/cloudwatchlogs"
+)
+
+// TestPoisonPillScenario validates that when multiple log groups encounter
+// AccessDenied errors simultaneously with low concurrency, the agent continues
+// publishing to allowed log groups without blocking the entire pipeline.
+//
+// This test recreates the scenario from poison-pill-test-findings.md where:
+// - 1 allowed log group + 10 denied log groups
+// - Concurrency = 2
+// - Continuous stream of new batches (simulating force_flush_interval=5s)
+// - Expected: Allowed log group continues receiving events
+// - Historical Bug: Agent stopped publishing to ALL log groups after ~5 minutes
+//
+// This test validates that the retry heap and worker pool architecture correctly
+// handles this scenario by:
+// 1. Continuously generating batches for 10 denied + 1 allowed log group
+// 2. Processing with only 2 workers (low concurrency)
+// 3. Verifying allowed log group continues to receive events throughout
+// 4. Ensuring worker pool doesn't get saturated by failed retry attempts
+//
+// The test passes because the current implementation uses a retry heap with
+// proper backoff, preventing failed batches from monopolizing worker threads.
+func TestPoisonPillScenario(t *testing.T) {
+	heap := NewRetryHeap(100, &testutil.Logger{})
+	defer heap.Stop()
+
+	workerPool := NewWorkerPool(2) // Low concurrency as in the bug scenario
+	defer workerPool.Stop()
+
+	mockService := &mockLogsService{}
+	mockTargetManager := &mockTargetManager{}
+	mockTargetManager.On("EnsureTargetExists", mock.Anything).Return(nil)
+
+	accessDeniedErr := &cloudwatchlogs.AccessDeniedException{
+		Message_: stringPtr("User is not authorized to perform: logs:PutLogEvents with an explicit deny"),
+	}
+
+	// Track successful PutLogEvents calls for the allowed log group
+	var allowedGroupSuccessCount atomic.Int32
+	var deniedGroupAttemptCount atomic.Int32
+
+	// Configure mock service responses with realistic latency
+	mockService.On("PutLogEvents", mock.MatchedBy(func(input *cloudwatchlogs.PutLogEventsInput) bool {
+		return *input.LogGroupName == "log-stream-ple-access-granted"
+	})).Return(&cloudwatchlogs.PutLogEventsOutput{}, nil).Run(func(args mock.Arguments) {
+		time.Sleep(10 * time.Millisecond) // Simulate API latency
+		allowedGroupSuccessCount.Add(1)
+	})
+
+	mockService.On("PutLogEvents", mock.MatchedBy(func(input *cloudwatchlogs.PutLogEventsInput) bool {
+		return *input.LogGroupName != "log-stream-ple-access-granted"
+	})).Return((*cloudwatchlogs.PutLogEventsOutput)(nil), accessDeniedErr).Run(func(args mock.Arguments) {
+		time.Sleep(10 * time.Millisecond) // Simulate API latency
+		deniedGroupAttemptCount.Add(1)
+	})
+
+	processor := NewRetryHeapProcessor(heap, workerPool, mockService, mockTargetManager, &testutil.Logger{}, 100*time.Millisecond, retryer.NewLogThrottleRetryer(&testutil.Logger{}))
+
+	// Targets
+	allowedTarget := Target{Group: "log-stream-ple-access-granted", Stream: "i-test"}
+	deniedTargets := make([]Target, 10)
+	for i := 0; i < 10; i++ {
+		deniedTargets[i] = Target{
+			Group:  "aws-restricted-log-group-name-log-stream-ple-access-denied" + string(rune('0'+i)),
+			Stream: "i-test",
+		}
+	}
+
+	// Simulate continuous batch generation over time (like force_flush_interval=5s)
+	done := make(chan struct{})
+	var wg sync.WaitGroup
+
+	// Continuously generate batches for denied log groups (simulating continuous log writes)
+	for i := 0; i < 10; i++ {
+		wg.Add(1)
+		go func(target Target) {
+			defer wg.Done()
+			ticker := time.NewTicker(50 * time.Millisecond) // Simulate flush interval
+			defer ticker.Stop()
+			batchCount := 0
+			for {
+				select {
+				case <-done:
+					return
+				case <-ticker.C:
+					if batchCount >= 5 { // Generate 5 batches per denied log group
+						return
+					}
+					batch := createBatch(target, 50)
+					batch.nextRetryTime = time.Now().Add(-1 * time.Second)
+					heap.Push(batch)
+					batchCount++
+				}
+			}
+		}(deniedTargets[i])
+	}
+
+	// Continuously generate batches for allowed log group
+	wg.Add(1)
+	go func() {
+		defer wg.Done()
+		ticker := time.NewTicker(50 * time.Millisecond)
+		defer ticker.Stop()
+		batchCount := 0
+		for {
+			select {
+			case <-done:
+				return
+			case <-ticker.C:
+				if batchCount >= 10 { // Generate 10 batches for allowed log group
+					return
+				}
+				batch := createBatch(allowedTarget, 20)
+				batch.nextRetryTime = time.Now().Add(-1 * time.Second)
+				heap.Push(batch)
+				batchCount++
+			}
+		}
+	}()
+
+	// Process batches continuously
+	processorDone := make(chan struct{})
+	go func() {
+		ticker := time.NewTicker(20 * time.Millisecond)
+		defer ticker.Stop()
+		for {
+			select {
+			case <-processorDone:
+				return
+			case <-ticker.C:
+				processor.processReadyMessages()
+			}
+		}
+	}()
+
+	// Run for 2 seconds to simulate sustained load
+	time.Sleep(2 * time.Second)
+	close(done)
+	wg.Wait()
+
+	// Process remaining messages
+	time.Sleep(500 * time.Millisecond)
+	processor.processReadyMessages()
+	time.Sleep(200 * time.Millisecond)
+	close(processorDone)
+
+	// CRITICAL ASSERTION: Allowed log group MUST receive events throughout the test
+	successCount := allowedGroupSuccessCount.Load()
+	t.Logf("Allowed group success count: %d, Denied group attempt count: %d", successCount, deniedGroupAttemptCount.Load())
+	
+	assert.Greater(t, successCount, int32(5),
+		"Allowed log group must continue receiving events despite continuous denied log group failures. Got %d, expected > 5", successCount)
+
+	// Verify denied log groups attempted to send
+	assert.Greater(t, deniedGroupAttemptCount.Load(), int32(0),
+		"Denied log groups should have attempted to send")
+}
+
+// TestRetryHeapSmallerThanFailingLogGroups tests the specific bottleneck scenario where:
+// - Retry heap size = concurrency (e.g., 2)
+// - Number of failing log groups (10) > retry heap size (2)
+// - This causes the retry heap to fill up with failed batches
+// - New batches from failing log groups block trying to push to full heap
+// - Workers get stuck waiting to push failed batches back to heap
+// - Allowed log group gets starved of worker time
+//
+// This test validates the ACTUAL bug: when retry heap size (equal to concurrency)
+// is smaller than the number of failing log groups, the system deadlocks.
+//
+// **EXPECTED BEHAVIOR**: This test will timeout/deadlock, proving the bug exists.
+func TestRetryHeapSmallerThanFailingLogGroups(t *testing.T) {
+	t.Skip("This test intentionally deadlocks to demonstrate the poison pill bug where heap size < failing log groups")
+	
+	concurrency := 2
+	numFailingLogGroups := 10
+	
+	// CRITICAL: Retry heap size equals concurrency (this is the bug)
+	heap := NewRetryHeap(concurrency, &testutil.Logger{})
+	defer heap.Stop()
+
+	workerPool := NewWorkerPool(concurrency)
+	defer workerPool.Stop()
+
+	mockService := &mockLogsService{}
+	mockTargetManager := &mockTargetManager{}
+	mockTargetManager.On("EnsureTargetExists", mock.Anything).Return(nil)
+
+	accessDeniedErr := &cloudwatchlogs.AccessDeniedException{
+		Message_: stringPtr("Access denied"),
+	}
+
+	var allowedGroupSuccessCount atomic.Int32
+	var deniedGroupAttemptCount atomic.Int32
+
+	mockService.On("PutLogEvents", mock.MatchedBy(func(input *cloudwatchlogs.PutLogEventsInput) bool {
+		return *input.LogGroupName == "allowed"
+	})).Return(&cloudwatchlogs.PutLogEventsOutput{}, nil).Run(func(args mock.Arguments) {
+		time.Sleep(10 * time.Millisecond)
+		allowedGroupSuccessCount.Add(1)
+	})
+
+	mockService.On("PutLogEvents", mock.MatchedBy(func(input *cloudwatchlogs.PutLogEventsInput) bool {
+		return *input.LogGroupName != "allowed"
+	})).Return((*cloudwatchlogs.PutLogEventsOutput)(nil), accessDeniedErr).Run(func(args mock.Arguments) {
+		time.Sleep(10 * time.Millisecond)
+		deniedGroupAttemptCount.Add(1)
+	})
+
+	processor := NewRetryHeapProcessor(heap, workerPool, mockService, mockTargetManager, &testutil.Logger{}, 50*time.Millisecond, retryer.NewLogThrottleRetryer(&testutil.Logger{}))
+
+	// Create targets
+	allowedTarget := Target{Group: "allowed", Stream: "stream"}
+	deniedTargets := make([]Target, numFailingLogGroups)
+	for i := 0; i < numFailingLogGroups; i++ {
+		deniedTargets[i] = Target{Group: fmt.Sprintf("denied-%d", i), Stream: "stream"}
+	}
+
+	done := make(chan struct{})
+	var wg sync.WaitGroup
+
+	// Generate batches for all failing log groups continuously
+	// This will cause deadlock as heap fills up
+	for i := 0; i < numFailingLogGroups; i++ {
+		wg.Add(1)
+		go func(target Target) {
+			defer wg.Done()
+			ticker := time.NewTicker(30 * time.Millisecond)
+			defer ticker.Stop()
+			batchCount := 0
+			for {
+				select {
+				case <-done:
+					return
+				case <-ticker.C:
+					if batchCount >= 3 {
+						return
+					}
+					batch := createBatch(target, 10)
+					batch.nextRetryTime = time.Now().Add(-1 * time.Second)
+					// This will block when heap is full
+					heap.Push(batch)
+					batchCount++
+				}
+			}
+		}(deniedTargets[i])
+	}
+
+	// Generate batches for allowed log group
+	wg.Add(1)
+	go func() {
+		defer wg.Done()
+		ticker := time.NewTicker(30 * time.Millisecond)
+		defer ticker.Stop()
+		batchCount := 0
+		for {
+			select {
+			case <-done:
+				return
+			case <-ticker.C:
+				if batchCount >= 5 {
+					return
+				}
+				batch := createBatch(allowedTarget, 10)
+				batch.nextRetryTime = time.Now().Add(-1 * time.Second)
+				heap.Push(batch)
+				batchCount++
+			}
+		}
+	}()
+
+	// Process continuously
+	processorDone := make(chan struct{})
+	go func() {
+		ticker := time.NewTicker(15 * time.Millisecond)
+		defer ticker.Stop()
+		for {
+			select {
+			case <-processorDone:
+				return
+			case <-ticker.C:
+				processor.processReadyMessages()
+			}
+		}
+	}()
+
+	// Run for 1 second
+	time.Sleep(1 * time.Second)
+	close(done)
+	wg.Wait()
+	time.Sleep(300 * time.Millisecond)
+	processor.processReadyMessages()
+	time.Sleep(100 * time.Millisecond)
+	close(processorDone)
+
+	successCount := allowedGroupSuccessCount.Load()
+	
+	t.Logf("Results: Allowed success=%d, Denied attempts=%d, Heap size=%d, Failing groups=%d",
+		successCount, deniedGroupAttemptCount.Load(), concurrency, numFailingLogGroups)
+
+	// This test documents the bug: with heap size < failing log groups, the system deadlocks
+	if successCount == 0 {
+		t.Errorf("POISON PILL BUG DETECTED: Allowed log group received 0 events. Heap size (%d) < failing groups (%d) caused deadlock", concurrency, numFailingLogGroups)
+	}
+}
+
+// TestSingleDeniedLogGroup validates the baseline scenario where a single denied
+// log group does not affect the allowed log group.
+func TestSingleDeniedLogGroup(t *testing.T) {
+	heap := NewRetryHeap(10, &testutil.Logger{})
+	defer heap.Stop()
+
+	workerPool := NewWorkerPool(4) // Higher concurrency as in initial test
+	defer workerPool.Stop()
+
+	mockService := &mockLogsService{}
+	mockTargetManager := &mockTargetManager{}
+	mockTargetManager.On("EnsureTargetExists", mock.Anything).Return(nil)
+
+	accessDeniedErr := &cloudwatchlogs.AccessDeniedException{
+		Message_: stringPtr("Access denied"),
+	}
+
+	var allowedGroupSuccessCount atomic.Int32
+
+	mockService.On("PutLogEvents", mock.MatchedBy(func(input *cloudwatchlogs.PutLogEventsInput) bool {
+		return *input.LogGroupName == "log-stream-ple-access-granted"
+	})).Return(&cloudwatchlogs.PutLogEventsOutput{}, nil).Run(func(args mock.Arguments) {
+		allowedGroupSuccessCount.Add(1)
+	})
+
+	mockService.On("PutLogEvents", mock.MatchedBy(func(input *cloudwatchlogs.PutLogEventsInput) bool {
+		return *input.LogGroupName == "aws-restricted-log-group-name-log-stream-ple-access-denied"
+	})).Return((*cloudwatchlogs.PutLogEventsOutput)(nil), accessDeniedErr)
+
+	processor := NewRetryHeapProcessor(heap, workerPool, mockService, mockTargetManager, &testutil.Logger{}, time.Hour, retryer.NewLogThrottleRetryer(&testutil.Logger{}))
+
+	// Create batches
+	allowedTarget := Target{Group: "log-stream-ple-access-granted", Stream: "i-test"}
+	deniedTarget := Target{Group: "aws-restricted-log-group-name-log-stream-ple-access-denied", Stream: "i-test"}
+
+	allowedBatch := createBatch(allowedTarget, 40)
+	deniedBatch := createBatch(deniedTarget, 40)
+
+	allowedBatch.nextRetryTime = time.Now().Add(-1 * time.Second)
+	deniedBatch.nextRetryTime = time.Now().Add(-1 * time.Second)
+
+	err := heap.Push(allowedBatch)
+	assert.NoError(t, err)
+	err = heap.Push(deniedBatch)
+	assert.NoError(t, err)
+
+	processor.processReadyMessages()
+	time.Sleep(100 * time.Millisecond)
+
+	// Verify allowed log group received events
+	assert.Greater(t, allowedGroupSuccessCount.Load(), int32(0),
+		"Allowed log group must receive events with single denied log group")
+}
+
+// createBatch creates a log event batch with the specified number of events
+func createBatch(target Target, eventCount int) *logEventBatch {
+	batch := newLogEventBatch(target, nil)
+	batch.events = make([]*cloudwatchlogs.InputLogEvent, eventCount)
+	now := time.Now().Unix() * 1000
+	for i := 0; i < eventCount; i++ {
+		batch.events[i] = &cloudwatchlogs.InputLogEvent{
+			Message:   stringPtr("test message"),
+			Timestamp: int64Ptr(now + int64(i)),
+		}
+	}
+	return batch
+}

From 6a6c0c6d7b5425421dc1c7f6f1a209acd4f9220c Mon Sep 17 00:00:00 2001
From: Marcus Mann <mpmann@amazon.com>
Date: Wed, 11 Feb 2026 15:21:21 -0500
Subject: [PATCH 28/50] Fix poison pill bug: Make retry heap unbounded

Remove max size constraint from retry heap to prevent deadlock when
failing log groups exceed concurrency limit.

Changes:
- Remove maxSize and semaphore from retryHeap struct
- Make Push() non-blocking (no semaphore wait)
- Remove semaphore release from PopReady()
- Update NewRetryHeap() to ignore maxSize parameter (kept for API compatibility)
- Update TestRetryHeap_SemaphoreBlockingAndUnblocking -> TestRetryHeap_UnboundedPush
- Update TestRetryHeapSmallerThanFailingLogGroups to validate fix

Before: With concurrency=2 and 10 failing log groups, retry heap (size=2)
would fill up, causing workers to block on Push(), leading to deadlock.

After: Retry heap is unbounded, allowing all failed batches to be queued
without blocking workers. Allowed log groups continue publishing normally.

Test results:
- TestRetryHeapSmallerThanFailingLogGroups: PASS (5/5 allowed batches published)
- Heap grew to size 28 (beyond concurrency limit of 2)
- No deadlock or starvation
---
 .gitignore                                    |  1 +
 .../internal/pusher/poison_pill_test.go       | 26 ++++----
 .../internal/pusher/retryheap.go              | 53 +++++------------
 .../internal/pusher/retryheap_test.go         | 59 +++++++------------
 4 files changed, 47 insertions(+), 92 deletions(-)

diff --git a/.gitignore b/.gitignore
index 9118e6e9c2f..38313bfed5b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -10,3 +10,4 @@ CWAGENT_VERSION
 terraform.*
 **/.terraform/*
 coverage.txt
+agent-sops/
diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/poison_pill_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/poison_pill_test.go
index 976f9840d09..bb4589d4e4e 100644
--- a/plugins/outputs/cloudwatchlogs/internal/pusher/poison_pill_test.go
+++ b/plugins/outputs/cloudwatchlogs/internal/pusher/poison_pill_test.go
@@ -177,22 +177,16 @@ func TestPoisonPillScenario(t *testing.T) {
 // TestRetryHeapSmallerThanFailingLogGroups tests the specific bottleneck scenario where:
 // - Retry heap size = concurrency (e.g., 2)
 // - Number of failing log groups (10) > retry heap size (2)
-// - This causes the retry heap to fill up with failed batches
-// - New batches from failing log groups block trying to push to full heap
-// - Workers get stuck waiting to push failed batches back to heap
-// - Allowed log group gets starved of worker time
+// - With bounded heap: This caused deadlock as heap filled up
+// - With unbounded heap: System handles this gracefully
 //
-// This test validates the ACTUAL bug: when retry heap size (equal to concurrency)
-// is smaller than the number of failing log groups, the system deadlocks.
-//
-// **EXPECTED BEHAVIOR**: This test will timeout/deadlock, proving the bug exists.
+// This test validates the FIX: unbounded retry heap allows all failed batches
+// to be queued without blocking workers.
 func TestRetryHeapSmallerThanFailingLogGroups(t *testing.T) {
-	t.Skip("This test intentionally deadlocks to demonstrate the poison pill bug where heap size < failing log groups")
-	
 	concurrency := 2
 	numFailingLogGroups := 10
 	
-	// CRITICAL: Retry heap size equals concurrency (this is the bug)
+	// Retry heap is now unbounded (maxSize parameter ignored)
 	heap := NewRetryHeap(concurrency, &testutil.Logger{})
 	defer heap.Stop()
 
@@ -237,7 +231,6 @@ func TestRetryHeapSmallerThanFailingLogGroups(t *testing.T) {
 	var wg sync.WaitGroup
 
 	// Generate batches for all failing log groups continuously
-	// This will cause deadlock as heap fills up
 	for i := 0; i < numFailingLogGroups; i++ {
 		wg.Add(1)
 		go func(target Target) {
@@ -255,7 +248,6 @@ func TestRetryHeapSmallerThanFailingLogGroups(t *testing.T) {
 					}
 					batch := createBatch(target, 10)
 					batch.nextRetryTime = time.Now().Add(-1 * time.Second)
-					// This will block when heap is full
 					heap.Push(batch)
 					batchCount++
 				}
@@ -313,11 +305,13 @@ func TestRetryHeapSmallerThanFailingLogGroups(t *testing.T) {
 	successCount := allowedGroupSuccessCount.Load()
 	
 	t.Logf("Results: Allowed success=%d, Denied attempts=%d, Heap size=%d, Failing groups=%d",
-		successCount, deniedGroupAttemptCount.Load(), concurrency, numFailingLogGroups)
+		successCount, deniedGroupAttemptCount.Load(), heap.Size(), numFailingLogGroups)
 
-	// This test documents the bug: with heap size < failing log groups, the system deadlocks
+	// With unbounded heap, allowed log group should receive events
 	if successCount == 0 {
-		t.Errorf("POISON PILL BUG DETECTED: Allowed log group received 0 events. Heap size (%d) < failing groups (%d) caused deadlock", concurrency, numFailingLogGroups)
+		t.Errorf("UNEXPECTED: Allowed log group received 0 events with unbounded heap")
+	} else {
+		t.Logf("SUCCESS: Unbounded heap handled poison pill scenario: %d successful publishes despite %d failing groups", successCount, numFailingLogGroups)
 	}
 }
 
diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap.go b/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap.go
index 025a3063ad1..90ff0c26539 100644
--- a/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap.go
+++ b/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap.go
@@ -49,56 +49,37 @@ type RetryHeap interface {
 }
 
 type retryHeap struct {
-	heap      retryHeapImpl
-	mutex     sync.RWMutex
-	semaphore chan struct{} // Size enforcer
-	stopCh    chan struct{}
-	maxSize   int
-	stopped   bool
-	logger    telegraf.Logger
+	heap    retryHeapImpl
+	mutex   sync.RWMutex
+	stopCh  chan struct{}
+	stopped bool
+	logger  telegraf.Logger
 }
 
 var _ RetryHeap = (*retryHeap)(nil)
 
-// NewRetryHeap creates a new retry heap with the specified maximum size
+// NewRetryHeap creates a new retry heap (unbounded)
 func NewRetryHeap(maxSize int, logger telegraf.Logger) RetryHeap {
 	rh := &retryHeap{
-		heap:      make(retryHeapImpl, 0, maxSize),
-		maxSize:   maxSize,
-		semaphore: make(chan struct{}, maxSize), // Semaphore for size enforcement
-		stopCh:    make(chan struct{}),
-		logger:    logger,
+		heap:   make(retryHeapImpl, 0),
+		stopCh: make(chan struct{}),
+		logger: logger,
 	}
 	heap.Init(&rh.heap)
 	return rh
 }
 
-// Push adds a batch to the heap, blocking if full
+// Push adds a batch to the heap (non-blocking)
 func (rh *retryHeap) Push(batch *logEventBatch) error {
-	rh.mutex.RLock()
+	rh.mutex.Lock()
+	defer rh.mutex.Unlock()
+
 	if rh.stopped {
-		rh.mutex.RUnlock()
-		return errors.New("retry heap stopped")
-	}
-	rh.mutex.RUnlock()
-
-	// Acquire semaphore slot (blocks if at maxSize capacity)
-	select {
-	case rh.semaphore <- struct{}{}:
-		// add batch to heap with mutex protection
-		rh.mutex.Lock()
-		if rh.stopped {
-			// Release semaphore if stopped after acquiring
-			<-rh.semaphore
-			rh.mutex.Unlock()
-			return errors.New("retry heap stopped")
-		}
-		heap.Push(&rh.heap, batch)
-		rh.mutex.Unlock()
-		return nil
-	case <-rh.stopCh:
 		return errors.New("retry heap stopped")
 	}
+
+	heap.Push(&rh.heap, batch)
+	return nil
 }
 
 // PopReady returns all batches that are ready for retry (nextRetryTime <= now)
@@ -113,8 +94,6 @@ func (rh *retryHeap) PopReady() []*logEventBatch {
 	for len(rh.heap) > 0 && !rh.heap[0].nextRetryTime.After(now) {
 		batch := heap.Pop(&rh.heap).(*logEventBatch)
 		ready = append(ready, batch)
-		// Release semaphore slot for each popped batch
-		<-rh.semaphore
 	}
 
 	return ready
diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_test.go
index edc7dbc3145..038bdbfdbf5 100644
--- a/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_test.go
+++ b/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_test.go
@@ -145,65 +145,46 @@ func TestRetryHeapProcessorSendsBatch(t *testing.T) {
 	assert.Equal(t, 0, heap.Size())
 }
 
-func TestRetryHeap_SemaphoreBlockingAndUnblocking(t *testing.T) {
-	heap := NewRetryHeap(2, &testutil.Logger{}) // maxSize = 2
+func TestRetryHeap_UnboundedPush(t *testing.T) {
+	heap := NewRetryHeap(2, &testutil.Logger{}) // maxSize parameter ignored (unbounded)
 	defer heap.Stop()
 
-	// Fill heap to capacity with batches that will be ready in 3 seconds
+	// Push multiple batches without blocking
 	target := Target{Group: "group", Stream: "stream"}
 	batch1 := newLogEventBatch(target, nil)
 	batch1.nextRetryTime = time.Now().Add(3 * time.Second)
 	batch2 := newLogEventBatch(target, nil)
 	batch2.nextRetryTime = time.Now().Add(3 * time.Second)
+	batch3 := newLogEventBatch(target, nil)
+	batch3.nextRetryTime = time.Now().Add(3 * time.Second)
 
-	heap.Push(batch1)
-	heap.Push(batch2)
-
-	// Verify heap is at capacity
-	if heap.Size() != 2 {
-		t.Fatalf("Expected size 2, got %d", heap.Size())
-	}
+	// All pushes should succeed immediately (non-blocking)
+	err := heap.Push(batch1)
+	assert.NoError(t, err)
+	err = heap.Push(batch2)
+	assert.NoError(t, err)
+	err = heap.Push(batch3)
+	assert.NoError(t, err)
 
-	// Test that semaphore is actually blocking by trying to push in a goroutine
-	pushResult := make(chan error, 1)
-
-	go func() {
-		batch3 := newLogEventBatch(target, nil)
-		batch3.nextRetryTime = time.Now().Add(-1 * time.Hour)
-		heap.Push(batch3) // This should block on semaphore
-		pushResult <- nil
-	}()
-
-	// Verify the push is blocked (expects no result in channel)
-	select {
-	case <-pushResult:
-		t.Fatal("Unexpected push, heap should be blocked")
-	case <-time.After(100 * time.Millisecond):
-		// Push is successfully blocked when at capacity
+	// Verify heap can grow beyond original maxSize parameter
+	if heap.Size() != 3 {
+		t.Fatalf("Expected size 3, got %d", heap.Size())
 	}
 
 	time.Sleep(3 * time.Second)
 
-	// Pop ready batches to release semaphore slots
+	// Pop ready batches
 	readyBatches := heap.PopReady()
-	assert.Len(t, readyBatches, 2, "Should pop exactly 2 ready batches")
+	assert.Len(t, readyBatches, 3, "Should pop exactly 3 ready batches")
 
 	for _, batch := range readyBatches {
 		assert.Equal(t, "group", batch.Group)
 		assert.Equal(t, "stream", batch.Stream)
 	}
 
-	// Expects push to now be unblocked
-	select {
-	case err := <-pushResult:
-		assert.NoError(t, err, "Push should succeed after PopReady")
-	case <-time.After(100 * time.Millisecond):
-		t.Fatal("Unexpected timeout, heap should be unblocked")
-	}
-
-	// Verify 1 item remaining in heap (2 popped, 1 pushed)
-	if heap.Size() != 1 {
-		t.Fatalf("Expected size 1 after pop/push cycle, got %d", heap.Size())
+	// Verify heap is empty
+	if heap.Size() != 0 {
+		t.Fatalf("Expected size 0 after pop, got %d", heap.Size())
 	}
 }
 

From 60b6f49796924b6a60291e622b13b362614f6a84 Mon Sep 17 00:00:00 2001
From: Marcus Mann <mpmann@amazon.com>
Date: Wed, 11 Feb 2026 15:29:20 -0500
Subject: [PATCH 29/50] Remove test filtering feature (moved to separate PR)

---
 .github/workflows/integration-test.yml | 10 ----
 .github/workflows/test-artifacts.yml   | 69 +++++++-------------------
 2 files changed, 17 insertions(+), 62 deletions(-)

diff --git a/.github/workflows/integration-test.yml b/.github/workflows/integration-test.yml
index 5534cd38889..0718615b597 100644
--- a/.github/workflows/integration-test.yml
+++ b/.github/workflows/integration-test.yml
@@ -33,14 +33,6 @@ on:
       test_repo_branch:
         description: 'Override for the GitHub test repository branch to use (default is main)'
         type: string
-      test_os_filter:
-        description: 'Filter tests to specific OS (e.g., al2023, ubuntu-22.04). Leave empty to run all OS.'
-        type: string
-        required: false
-      test_dir_filter:
-        description: 'Filter tests to specific test directory (e.g., ./test/cloudwatchlogs). Leave empty to run all tests.'
-        type: string
-        required: false
 
 jobs:
   CheckBuildTestArtifacts:
@@ -78,5 +70,3 @@ jobs:
     with:
       build_id: ${{ inputs.build_sha }}
       test_repo_branch: ${{ inputs.test_repo_branch }}
-      test_os_filter: ${{ inputs.test_os_filter }}
-      test_dir_filter: ${{ inputs.test_dir_filter }}
diff --git a/.github/workflows/test-artifacts.yml b/.github/workflows/test-artifacts.yml
index ced412144c2..e85f5d9b6cc 100644
--- a/.github/workflows/test-artifacts.yml
+++ b/.github/workflows/test-artifacts.yml
@@ -29,14 +29,6 @@ on:
       test_repo_branch:
         description: 'Override for the GitHub test repository branch to use (default is main)'
         type: string
-      test_os_filter:
-        description: 'Filter tests to specific OS (e.g., al2023, ubuntu-22.04)'
-        type: string
-        required: false
-      test_dir_filter:
-        description: 'Filter tests to specific test directory (e.g., ./test/cloudwatchlogs)'
-        type: string
-        required: false
   workflow_call:
     inputs:
       build_id:
@@ -46,14 +38,6 @@ on:
       test_repo_branch:
         description: 'Override for the GitHub test repository branch to use (default is main)'
         type: string
-      test_os_filter:
-        description: 'Filter tests to specific OS (e.g., al2023, ubuntu-22.04)'
-        type: string
-        required: false
-      test_dir_filter:
-        description: 'Filter tests to specific test directory (e.g., ./test/cloudwatchlogs)'
-        type: string
-        required: false
 
 concurrency:
   group: ${{ github.workflow }}-${{ inputs.build_id }}
@@ -181,42 +165,23 @@ jobs:
         id: set-matrix
         run: |
           go run --tags=generator generator/test_case_generator.go
-          
-          # Function to apply filters to a matrix
-          apply_filters() {
-            local matrix_file=$1
-            local matrix_content=$(cat "$matrix_file")
-            
-            # Apply OS filter if provided
-            if [ -n "${{ inputs.test_os_filter }}" ]; then
-              matrix_content=$(echo "$matrix_content" | jq '[.[] | select(.os == "${{ inputs.test_os_filter }}")]')
-            fi
-            
-            # Apply test directory filter if provided
-            if [ -n "${{ inputs.test_dir_filter }}" ]; then
-              matrix_content=$(echo "$matrix_content" | jq '[.[] | select(.test_dir == "${{ inputs.test_dir_filter }}")]')
-            fi
-            
-            echo "$matrix_content"
-          }
-          
-          echo "ec2_gpu_matrix=$(apply_filters generator/resources/ec2_gpu_complete_test_matrix.json)" >> "$GITHUB_OUTPUT"
-          echo "eks_addon_matrix=$(apply_filters generator/resources/eks_addon_complete_test_matrix.json)" >> "$GITHUB_OUTPUT"
-          echo "ec2_linux_matrix=$(apply_filters generator/resources/ec2_linux_complete_test_matrix.json)" >> "$GITHUB_OUTPUT"
-          echo "ec2_linux_onprem_matrix=$(apply_filters generator/resources/ec2_linux_onprem_complete_test_matrix.json)" >> "$GITHUB_OUTPUT"
-          echo "ec2_selinux_matrix=$(apply_filters generator/resources/ec2_selinux_complete_test_matrix.json)" >> "$GITHUB_OUTPUT"
-          echo "ec2_windows_matrix=$(apply_filters generator/resources/ec2_windows_complete_test_matrix.json)" >> "$GITHUB_OUTPUT"
-          echo "ec2_mac_matrix=$(apply_filters generator/resources/ec2_mac_complete_test_matrix.json)" >> "$GITHUB_OUTPUT"
-          echo "ec2_performance_matrix=$(apply_filters generator/resources/ec2_performance_complete_test_matrix.json)" >> "$GITHUB_OUTPUT"
-          echo "ec2_windows_performance_matrix=$(apply_filters generator/resources/ec2_windows_performance_complete_test_matrix.json)" >> "$GITHUB_OUTPUT"
-          echo "ec2_stress_matrix=$(apply_filters generator/resources/ec2_stress_complete_test_matrix.json)" >> "$GITHUB_OUTPUT"
-          echo "ec2_windows_stress_matrix=$(apply_filters generator/resources/ec2_windows_stress_complete_test_matrix.json)" >> "$GITHUB_OUTPUT"
-          echo "ecs_ec2_launch_daemon_matrix=$(apply_filters generator/resources/ecs_ec2_daemon_complete_test_matrix.json)" >> "$GITHUB_OUTPUT"
-          echo "ecs_fargate_matrix=$(apply_filters generator/resources/ecs_fargate_complete_test_matrix.json)" >> "$GITHUB_OUTPUT"
-          echo "eks_daemon_matrix=$(apply_filters generator/resources/eks_daemon_complete_test_matrix.json)" >> "$GITHUB_OUTPUT"
-          echo "eks_deployment_matrix=$(apply_filters generator/resources/eks_deployment_complete_test_matrix.json)" >> "$GITHUB_OUTPUT"
-          echo "ec2_linux_itar_matrix=$(apply_filters generator/resources/ec2_linux_itar_complete_test_matrix.json)" >> "$GITHUB_OUTPUT"
-          echo "ec2_linux_china_matrix=$(apply_filters generator/resources/ec2_linux_china_complete_test_matrix.json)" >> "$GITHUB_OUTPUT"
+          echo "ec2_gpu_matrix=$(echo $(cat generator/resources/ec2_gpu_complete_test_matrix.json))" >> "$GITHUB_OUTPUT"
+          echo "eks_addon_matrix=$(echo $(cat generator/resources/eks_addon_complete_test_matrix.json))" >> "$GITHUB_OUTPUT"
+          echo "ec2_linux_matrix=$(echo $(cat generator/resources/ec2_linux_complete_test_matrix.json))" >> "$GITHUB_OUTPUT"
+          echo "ec2_linux_onprem_matrix=$(echo $(cat generator/resources/ec2_linux_onprem_complete_test_matrix.json))" >> "$GITHUB_OUTPUT"
+          echo "ec2_selinux_matrix=$(echo $(cat generator/resources/ec2_selinux_complete_test_matrix.json))" >> "$GITHUB_OUTPUT"
+          echo "ec2_windows_matrix=$(echo $(cat generator/resources/ec2_windows_complete_test_matrix.json))" >> "$GITHUB_OUTPUT"
+          echo "ec2_mac_matrix=$(echo $(cat generator/resources/ec2_mac_complete_test_matrix.json))" >> "$GITHUB_OUTPUT"
+          echo "ec2_performance_matrix=$(echo $(cat generator/resources/ec2_performance_complete_test_matrix.json))" >> "$GITHUB_OUTPUT"
+          echo "ec2_windows_performance_matrix=$(echo $(cat generator/resources/ec2_windows_performance_complete_test_matrix.json))" >> "$GITHUB_OUTPUT"
+          echo "ec2_stress_matrix=$(echo $(cat generator/resources/ec2_stress_complete_test_matrix.json))" >> "$GITHUB_OUTPUT"
+          echo "ec2_windows_stress_matrix=$(echo $(cat generator/resources/ec2_windows_stress_complete_test_matrix.json))" >> "$GITHUB_OUTPUT"
+          echo "ecs_ec2_launch_daemon_matrix=$(echo $(cat generator/resources/ecs_ec2_daemon_complete_test_matrix.json))" >> "$GITHUB_OUTPUT"
+          echo "ecs_fargate_matrix=$(echo $(cat generator/resources/ecs_fargate_complete_test_matrix.json))" >> "$GITHUB_OUTPUT"
+          echo "eks_daemon_matrix=$(echo $(cat generator/resources/eks_daemon_complete_test_matrix.json))" >> "$GITHUB_OUTPUT"
+          echo "eks_deployment_matrix=$(echo $(cat generator/resources/eks_deployment_complete_test_matrix.json))" >> "$GITHUB_OUTPUT"
+          echo "ec2_linux_itar_matrix=$(echo $(cat generator/resources/ec2_linux_itar_complete_test_matrix.json))" >> "$GITHUB_OUTPUT"
+          echo "ec2_linux_china_matrix=$(echo $(cat generator/resources/ec2_linux_china_complete_test_matrix.json))" >> "$GITHUB_OUTPUT"
 
       - name: Echo test plan matrix
         run: |

From 1b1973b98b442c348bfb128116429f1b48ad3c8d Mon Sep 17 00:00:00 2001
From: Marcus Mann <mpmann@amazon.com>
Date: Wed, 11 Feb 2026 15:47:54 -0500
Subject: [PATCH 30/50] Trigger PR diff refresh


From a814482696abb93191dd112adaa4fcd1c0ca3941 Mon Sep 17 00:00:00 2001
From: Marcus Mann <mpmann@amazon.com>
Date: Wed, 11 Feb 2026 15:58:59 -0500
Subject: [PATCH 31/50] Revert gitignore changes (remove agent-sops)

---
 .gitignore | 1 -
 1 file changed, 1 deletion(-)

diff --git a/.gitignore b/.gitignore
index 38313bfed5b..9118e6e9c2f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -10,4 +10,3 @@ CWAGENT_VERSION
 terraform.*
 **/.terraform/*
 coverage.txt
-agent-sops/

From fd2ea56a0db3d2638b8ac806f17ca3b4118af5bf Mon Sep 17 00:00:00 2001
From: Marcus Mann <mpmann@amazon.com>
Date: Thu, 12 Feb 2026 15:08:42 -0500
Subject: [PATCH 32/50] refactor(pusher): Remove unused maxSize parameter from
 NewRetryHeap
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The retry heap is now unbounded, so maxSize is no longer used.

🤖 Assisted by AI
---
 .../outputs/cloudwatchlogs/cloudwatchlogs.go  |  2 +-
 .../internal/pusher/poison_pill_test.go       | 10 +++++-----
 .../internal/pusher/pool_test.go              |  2 +-
 .../internal/pusher/pusher_test.go            |  2 +-
 .../internal/pusher/queue_test.go             |  1 -
 .../internal/pusher/retryheap.go              |  2 +-
 .../internal/pusher/retryheap_expiry_test.go  |  2 +-
 .../pusher/retryheap_recovery_test.go         |  8 ++++----
 .../internal/pusher/retryheap_test.go         | 20 +++++++++----------
 .../cloudwatchlogs/internal/pusher/sender.go  |  1 -
 .../internal/pusher/sender_test.go            |  6 +++---
 11 files changed, 27 insertions(+), 29 deletions(-)

diff --git a/plugins/outputs/cloudwatchlogs/cloudwatchlogs.go b/plugins/outputs/cloudwatchlogs/cloudwatchlogs.go
index d35542ff930..6283b90f3bd 100644
--- a/plugins/outputs/cloudwatchlogs/cloudwatchlogs.go
+++ b/plugins/outputs/cloudwatchlogs/cloudwatchlogs.go
@@ -174,7 +174,7 @@ func (c *CloudWatchLogs) getDest(t pusher.Target, logSrc logs.LogSrc) *cwDest {
 	c.once.Do(func() {
 		if c.Concurrency > 1 {
 			c.workerPool = pusher.NewWorkerPool(c.Concurrency)
-			c.retryHeap = pusher.NewRetryHeap(c.Concurrency, c.Log)
+			c.retryHeap = pusher.NewRetryHeap(c.Log)
 
 			retryHeapProcessorRetryer := retryer.NewLogThrottleRetryer(c.Log)
 			retryHeapProcessorClient := c.createClient(retryHeapProcessorRetryer)
diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/poison_pill_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/poison_pill_test.go
index bb4589d4e4e..5652209831f 100644
--- a/plugins/outputs/cloudwatchlogs/internal/pusher/poison_pill_test.go
+++ b/plugins/outputs/cloudwatchlogs/internal/pusher/poison_pill_test.go
@@ -39,7 +39,7 @@ import (
 // The test passes because the current implementation uses a retry heap with
 // proper backoff, preventing failed batches from monopolizing worker threads.
 func TestPoisonPillScenario(t *testing.T) {
-	heap := NewRetryHeap(100, &testutil.Logger{})
+	heap := NewRetryHeap(&testutil.Logger{})
 	defer heap.Stop()
 
 	workerPool := NewWorkerPool(2) // Low concurrency as in the bug scenario
@@ -165,7 +165,7 @@ func TestPoisonPillScenario(t *testing.T) {
 	// CRITICAL ASSERTION: Allowed log group MUST receive events throughout the test
 	successCount := allowedGroupSuccessCount.Load()
 	t.Logf("Allowed group success count: %d, Denied group attempt count: %d", successCount, deniedGroupAttemptCount.Load())
-	
+
 	assert.Greater(t, successCount, int32(5),
 		"Allowed log group must continue receiving events despite continuous denied log group failures. Got %d, expected > 5", successCount)
 
@@ -185,7 +185,7 @@ func TestPoisonPillScenario(t *testing.T) {
 func TestRetryHeapSmallerThanFailingLogGroups(t *testing.T) {
 	concurrency := 2
 	numFailingLogGroups := 10
-	
+
 	// Retry heap is now unbounded (maxSize parameter ignored)
 	heap := NewRetryHeap(concurrency, &testutil.Logger{})
 	defer heap.Stop()
@@ -303,7 +303,7 @@ func TestRetryHeapSmallerThanFailingLogGroups(t *testing.T) {
 	close(processorDone)
 
 	successCount := allowedGroupSuccessCount.Load()
-	
+
 	t.Logf("Results: Allowed success=%d, Denied attempts=%d, Heap size=%d, Failing groups=%d",
 		successCount, deniedGroupAttemptCount.Load(), heap.Size(), numFailingLogGroups)
 
@@ -318,7 +318,7 @@ func TestRetryHeapSmallerThanFailingLogGroups(t *testing.T) {
 // TestSingleDeniedLogGroup validates the baseline scenario where a single denied
 // log group does not affect the allowed log group.
 func TestSingleDeniedLogGroup(t *testing.T) {
-	heap := NewRetryHeap(10, &testutil.Logger{})
+	heap := NewRetryHeap(&testutil.Logger{})
 	defer heap.Stop()
 
 	workerPool := NewWorkerPool(4) // Higher concurrency as in initial test
diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/pool_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/pool_test.go
index 9abf746b5fd..52d2a1fbd63 100644
--- a/plugins/outputs/cloudwatchlogs/internal/pusher/pool_test.go
+++ b/plugins/outputs/cloudwatchlogs/internal/pusher/pool_test.go
@@ -137,7 +137,7 @@ func TestSenderPoolRetryHeap(_ *testing.T) {
 	mockService.On("PutLogEvents", mock.Anything).Return(&cloudwatchlogs.PutLogEventsOutput{}, nil)
 
 	// Create RetryHeap
-	retryHeap := NewRetryHeap(10, logger)
+	retryHeap := NewRetryHeap(logger)
 	defer retryHeap.Stop()
 
 	s := newSender(logger, mockService, nil, retryHeap)
diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/pusher_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/pusher_test.go
index b80a201a07e..e862c99b64f 100644
--- a/plugins/outputs/cloudwatchlogs/internal/pusher/pusher_test.go
+++ b/plugins/outputs/cloudwatchlogs/internal/pusher/pusher_test.go
@@ -135,7 +135,7 @@ func TestPusherRetryHeap(t *testing.T) {
 	workerPool := NewWorkerPool(2)
 	defer workerPool.Stop()
 
-	retryHeap := NewRetryHeap(10, logger)
+	retryHeap := NewRetryHeap(logger)
 	defer retryHeap.Stop()
 
 	var wg sync.WaitGroup
diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/queue_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/queue_test.go
index d4b64a17a4c..9ca08f7654b 100644
--- a/plugins/outputs/cloudwatchlogs/internal/pusher/queue_test.go
+++ b/plugins/outputs/cloudwatchlogs/internal/pusher/queue_test.go
@@ -646,7 +646,6 @@ func TestAddEventNonBlocking(t *testing.T) {
 	wg.Wait()
 }
 
-
 // Cannot call q.send() directly as it would cause a race condition. Reset last sent time and trigger flush.
 func triggerSend(t *testing.T, q *queue) {
 	t.Helper()
diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap.go b/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap.go
index 90ff0c26539..b837be68310 100644
--- a/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap.go
+++ b/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap.go
@@ -59,7 +59,7 @@ type retryHeap struct {
 var _ RetryHeap = (*retryHeap)(nil)
 
 // NewRetryHeap creates a new retry heap (unbounded)
-func NewRetryHeap(maxSize int, logger telegraf.Logger) RetryHeap {
+func NewRetryHeap(logger telegraf.Logger) RetryHeap {
 	rh := &retryHeap{
 		heap:   make(retryHeapImpl, 0),
 		stopCh: make(chan struct{}),
diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_expiry_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_expiry_test.go
index 8726a329890..35bd0c28261 100644
--- a/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_expiry_test.go
+++ b/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_expiry_test.go
@@ -44,7 +44,7 @@ func TestRetryHeapProcessorExpiredBatchShouldResume(t *testing.T) {
 	target := Target{Group: "failing-group", Stream: "stream"}
 
 	// Create retry heap and processor with very short expiry for testing
-	retryHeap := NewRetryHeap(10, logger)
+	retryHeap := NewRetryHeap(logger)
 	workerPool := NewWorkerPool(5)
 	tm := NewTargetManager(logger, mockService)
 	maxRetryDuration := 50 * time.Millisecond // Normally 14 days
diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_recovery_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_recovery_test.go
index 067f4af3136..7225355f0d2 100644
--- a/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_recovery_test.go
+++ b/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_recovery_test.go
@@ -22,7 +22,7 @@ import (
 // and successfully publishes logs.
 // This test addresses CWQS-3192 requirement 1.
 func TestRecoveryWhenPermissionGrantedDuringRetry(t *testing.T) {
-	heap := NewRetryHeap(10, &testutil.Logger{})
+	heap := NewRetryHeap(&testutil.Logger{})
 	defer heap.Stop()
 
 	workerPool := NewWorkerPool(2)
@@ -33,7 +33,7 @@ func TestRecoveryWhenPermissionGrantedDuringRetry(t *testing.T) {
 	accessDeniedErr := &cloudwatchlogs.AccessDeniedException{
 		Message_: stringPtr("Access denied"),
 	}
-	
+
 	// First call fails with AccessDenied
 	mockService.On("PutLogEvents", mock.Anything).Return((*cloudwatchlogs.PutLogEventsOutput)(nil), accessDeniedErr).Once()
 	// Second call succeeds (permission granted)
@@ -114,7 +114,7 @@ func TestRecoveryWhenPermissionGrantedDuringRetry(t *testing.T) {
 // retry ongoing, it resumes correctly by loading state and continuing retries.
 // This test addresses CWQS-3192 requirement 2.
 func TestRecoveryAfterSystemRestart(t *testing.T) {
-	heap := NewRetryHeap(10, &testutil.Logger{})
+	heap := NewRetryHeap(&testutil.Logger{})
 	defer heap.Stop()
 
 	workerPool := NewWorkerPool(2)
@@ -185,7 +185,7 @@ func TestRecoveryAfterSystemRestart(t *testing.T) {
 // TestRecoveryWithMultipleTargets validates that when one target has permission
 // issues, other healthy targets continue publishing successfully.
 func TestRecoveryWithMultipleTargets(t *testing.T) {
-	heap := NewRetryHeap(10, &testutil.Logger{})
+	heap := NewRetryHeap(&testutil.Logger{})
 	defer heap.Stop()
 
 	workerPool := NewWorkerPool(2)
diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_test.go
index 038bdbfdbf5..2313239367c 100644
--- a/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_test.go
+++ b/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_test.go
@@ -16,7 +16,7 @@ import (
 )
 
 func TestRetryHeap(t *testing.T) {
-	heap := NewRetryHeap(10, &testutil.Logger{})
+	heap := NewRetryHeap(&testutil.Logger{})
 	defer heap.Stop()
 
 	// Test empty heap
@@ -48,7 +48,7 @@ func TestRetryHeap(t *testing.T) {
 }
 
 func TestRetryHeapOrdering(t *testing.T) {
-	heap := NewRetryHeap(10, &testutil.Logger{})
+	heap := NewRetryHeap(&testutil.Logger{})
 	defer heap.Stop()
 
 	target := Target{Group: "group", Stream: "stream"}
@@ -80,7 +80,7 @@ func TestRetryHeapOrdering(t *testing.T) {
 }
 
 func TestRetryHeapProcessor(t *testing.T) {
-	heap := NewRetryHeap(10, &testutil.Logger{})
+	heap := NewRetryHeap(&testutil.Logger{})
 	defer heap.Stop()
 
 	// Create mock components with proper signature
@@ -99,7 +99,7 @@ func TestRetryHeapProcessor(t *testing.T) {
 }
 
 func TestRetryHeapProcessorExpiredBatch(t *testing.T) {
-	heap := NewRetryHeap(10, &testutil.Logger{})
+	heap := NewRetryHeap(&testutil.Logger{})
 	defer heap.Stop()
 
 	workerPool := NewWorkerPool(2)
@@ -123,7 +123,7 @@ func TestRetryHeapProcessorExpiredBatch(t *testing.T) {
 }
 
 func TestRetryHeapProcessorSendsBatch(t *testing.T) {
-	heap := NewRetryHeap(10, &testutil.Logger{})
+	heap := NewRetryHeap(&testutil.Logger{})
 	defer heap.Stop()
 
 	workerPool := NewWorkerPool(2)
@@ -146,7 +146,7 @@ func TestRetryHeapProcessorSendsBatch(t *testing.T) {
 }
 
 func TestRetryHeap_UnboundedPush(t *testing.T) {
-	heap := NewRetryHeap(2, &testutil.Logger{}) // maxSize parameter ignored (unbounded)
+	heap := NewRetryHeap(&testutil.Logger{}) // maxSize parameter ignored (unbounded)
 	defer heap.Stop()
 
 	// Push multiple batches without blocking
@@ -189,7 +189,7 @@ func TestRetryHeap_UnboundedPush(t *testing.T) {
 }
 
 func TestRetryHeapProcessorNoReadyBatches(t *testing.T) {
-	heap := NewRetryHeap(10, &testutil.Logger{})
+	heap := NewRetryHeap(&testutil.Logger{})
 	defer heap.Stop()
 
 	workerPool := NewWorkerPool(2)
@@ -206,7 +206,7 @@ func TestRetryHeapProcessorNoReadyBatches(t *testing.T) {
 }
 
 func TestRetryHeapProcessorFailedBatchGoesBackToHeap(t *testing.T) {
-	heap := NewRetryHeap(10, &testutil.Logger{})
+	heap := NewRetryHeap(&testutil.Logger{})
 	defer heap.Stop()
 
 	workerPool := NewWorkerPool(2)
@@ -246,7 +246,7 @@ func TestRetryHeapProcessorFailedBatchGoesBackToHeap(t *testing.T) {
 }
 
 func TestRetryHeapStopTwice(t *testing.T) {
-	rh := NewRetryHeap(5, &testutil.Logger{})
+	rh := NewRetryHeap(&testutil.Logger{})
 
 	// Call Stop twice - should not panic
 	rh.Stop()
@@ -263,7 +263,7 @@ func TestRetryHeapStopTwice(t *testing.T) {
 }
 
 func TestRetryHeapProcessorStoppedProcessReadyMessages(t *testing.T) {
-	heap := NewRetryHeap(10, &testutil.Logger{})
+	heap := NewRetryHeap(&testutil.Logger{})
 	defer heap.Stop()
 
 	workerPool := NewWorkerPool(2)
diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/sender.go b/plugins/outputs/cloudwatchlogs/internal/pusher/sender.go
index ad38d7960f7..6a34be1e43e 100644
--- a/plugins/outputs/cloudwatchlogs/internal/pusher/sender.go
+++ b/plugins/outputs/cloudwatchlogs/internal/pusher/sender.go
@@ -151,4 +151,3 @@ func (s *sender) Stop() {
 	close(s.stopCh)
 	s.stopped = true
 }
-
diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/sender_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/sender_test.go
index fc52e673a88..973533f3ab8 100644
--- a/plugins/outputs/cloudwatchlogs/internal/pusher/sender_test.go
+++ b/plugins/outputs/cloudwatchlogs/internal/pusher/sender_test.go
@@ -252,11 +252,11 @@ func TestSender(t *testing.T) {
 			Return(&cloudwatchlogs.PutLogEventsOutput{}, awserr.New("SomeAWSError", "Some AWS error", nil)).Once()
 
 		s := newSender(logger, mockService, mockManager, nil)
-		
+
 		// Set expireAfter to past time so batch expires immediately after first retry
 		batch.initializeStartTime()
 		batch.expireAfter = time.Now().Add(-1 * time.Hour)
-		
+
 		s.Send(batch)
 		s.Stop()
 
@@ -304,7 +304,7 @@ func TestSenderConcurrencyWithRetryHeap(t *testing.T) {
 	mockManager := new(mockTargetManager)
 	mockService.On("PutLogEvents", mock.Anything).Return(&cloudwatchlogs.PutLogEventsOutput{}, &cloudwatchlogs.ServiceUnavailableException{}).Once()
 
-	retryHeap := NewRetryHeap(10, logger)
+	retryHeap := NewRetryHeap(logger)
 	defer retryHeap.Stop()
 
 	s := newSender(logger, mockService, mockManager, retryHeap)

From 38afc5f638ada8935720fc8411b6c5438bf9a695 Mon Sep 17 00:00:00 2001
From: Marcus Mann <mpmann@amazon.com>
Date: Thu, 12 Feb 2026 15:09:21 -0500
Subject: [PATCH 33/50] fix(pusher): Remove redundant updateState call in
 retryheap
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

batch.done() already calls updateState() internally, so the explicit
call is unnecessary.

🤖 Assisted by AI
---
 plugins/outputs/cloudwatchlogs/internal/pusher/retryheap.go | 1 -
 1 file changed, 1 deletion(-)

diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap.go b/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap.go
index b837be68310..258c2795e23 100644
--- a/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap.go
+++ b/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap.go
@@ -198,7 +198,6 @@ func (p *RetryHeapProcessor) processReadyMessages() {
 		// Check if batch has expired
 		if batch.isExpired() {
 			p.logger.Errorf("Dropping expired batch for %v/%v", batch.Group, batch.Stream)
-			batch.updateState()
 			batch.done() // Resume circuit breaker to allow target to process new batches
 			continue
 		}

From 3e1ed82c3ee35361bb91e516a719b59d7047008b Mon Sep 17 00:00:00 2001
From: Marcus Mann <mpmann@amazon.com>
Date: Thu, 12 Feb 2026 15:09:57 -0500
Subject: [PATCH 34/50] test(pusher): Remove empty TestSenderPoolRetryHeap test
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Test had no assertions and was not validating any behavior.

🤖 Assisted by AI
---
 .../internal/pusher/pool_test.go               | 18 ------------------
 1 file changed, 18 deletions(-)

diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/pool_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/pool_test.go
index 52d2a1fbd63..ed74249250e 100644
--- a/plugins/outputs/cloudwatchlogs/internal/pusher/pool_test.go
+++ b/plugins/outputs/cloudwatchlogs/internal/pusher/pool_test.go
@@ -130,21 +130,3 @@ func TestSenderPool(t *testing.T) {
 	s.Stop()
 	assert.Equal(t, int32(200), completed.Load())
 }
-
-func TestSenderPoolRetryHeap(_ *testing.T) {
-	logger := testutil.NewNopLogger()
-	mockService := new(mockLogsService)
-	mockService.On("PutLogEvents", mock.Anything).Return(&cloudwatchlogs.PutLogEventsOutput{}, nil)
-
-	// Create RetryHeap
-	retryHeap := NewRetryHeap(logger)
-	defer retryHeap.Stop()
-
-	s := newSender(logger, mockService, nil, retryHeap)
-	p := NewWorkerPool(12)
-	defer p.Stop()
-
-	sp := newSenderPool(p, s)
-
-	sp.Stop()
-}

From 2d07b385050952eed980396ec8649e25b02ca10d Mon Sep 17 00:00:00 2001
From: Marcus Mann <mpmann@amazon.com>
Date: Thu, 12 Feb 2026 15:10:31 -0500
Subject: [PATCH 35/50] docs(pusher): Clean up verbose test comment in
 queue_test.go
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

🤖 Assisted by AI
---
 plugins/outputs/cloudwatchlogs/internal/pusher/queue_test.go | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/queue_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/queue_test.go
index 9ca08f7654b..cd85b17f98b 100644
--- a/plugins/outputs/cloudwatchlogs/internal/pusher/queue_test.go
+++ b/plugins/outputs/cloudwatchlogs/internal/pusher/queue_test.go
@@ -847,8 +847,3 @@ func TestQueueHaltResume(t *testing.T) {
 
 // TestQueueResumeOnBatchExpiry verifies that when a batch expires after 14 days of retrying,
 // the circuit breaker resumes the queue to allow new batches to be processed.
-// This prevents the target from being permanently blocked when a bad batch is eventually dropped.
-//
-// Scenario from PR comment: "Say a bad batch from a target caused this to halt. Now that bad batch
-// is re-tried for 14 days and eventually dropped - but this never gets resumed in that case right?
-// So this target is blocked forever in that scenario?"

From c739bb97422b475a6b867b2163aad13f42fdd8e9 Mon Sep 17 00:00:00 2001
From: Marcus Mann <mpmann@amazon.com>
Date: Thu, 12 Feb 2026 15:11:06 -0500
Subject: [PATCH 36/50] docs(pusher): Clean up verbose test comment in
 retryheap_expiry_test.go
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

🤖 Assisted by AI
---
 .../internal/pusher/retryheap_expiry_test.go              | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_expiry_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_expiry_test.go
index 35bd0c28261..a3d0088f253 100644
--- a/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_expiry_test.go
+++ b/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_expiry_test.go
@@ -14,12 +14,8 @@ import (
 	"github.com/aws/amazon-cloudwatch-agent/tool/testutil"
 )
 
-// TestRetryHeapProcessorExpiredBatchShouldResume demonstrates the bug where
-// expired batches don't resume the circuit breaker, leaving the target permanently blocked.
-//
-// From PR comment: "Say a bad batch from a target caused this to halt. Now that bad batch
-// is re-tried for 14 days and eventually dropped - but this never gets resumed in that case right?
-// So this target is blocked forever in that scenario?"
+// TestRetryHeapProcessorExpiredBatchShouldResume verifies that expired batches
+// resume the circuit breaker, preventing the target from being permanently blocked.
 func TestRetryHeapProcessorExpiredBatchShouldResume(t *testing.T) {
 	logger := testutil.NewNopLogger()
 

From 4e7393af9ccd75ae75044ffeb01142e01c4be0a7 Mon Sep 17 00:00:00 2001
From: Marcus Mann <mpmann@amazon.com>
Date: Thu, 12 Feb 2026 15:11:47 -0500
Subject: [PATCH 37/50] test(pusher): Remove unused circuitBreakerHalted
 variable
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Variable was set but never checked in the test.

🤖 Assisted by AI
---
 .../internal/pusher/retryheap_expiry_test.go               | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_expiry_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_expiry_test.go
index a3d0088f253..f04a26f39b6 100644
--- a/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_expiry_test.go
+++ b/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_expiry_test.go
@@ -56,14 +56,9 @@ func TestRetryHeapProcessorExpiredBatchShouldResume(t *testing.T) {
 	batch := newLogEventBatch(target, nil)
 	batch.append(newLogEvent(time.Now(), "test message", nil))
 
-	// Set up callbacks to track circuit breaker state
-	var circuitBreakerHalted atomic.Bool
+	// Set up callback to track circuit breaker resume
 	var circuitBreakerResumed atomic.Bool
 
-	batch.addFailCallback(func() {
-		circuitBreakerHalted.Store(true)
-	})
-
 	batch.addDoneCallback(func() {
 		circuitBreakerResumed.Store(true)
 	})

From f640a193abfbe58345c3964a606344830a220139 Mon Sep 17 00:00:00 2001
From: Marcus Mann <mpmann@amazon.com>
Date: Thu, 12 Feb 2026 15:12:26 -0500
Subject: [PATCH 38/50] test(pusher): Use exact assertion for circuit breaker
 send count
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Circuit breaker should always block after exactly 1 send attempt,
not "at most 1".

🤖 Assisted by AI
---
 .../cloudwatchlogs/internal/pusher/circuitbreaker_test.go   | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/circuitbreaker_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/circuitbreaker_test.go
index 972d1fab482..e141b6f5af1 100644
--- a/plugins/outputs/cloudwatchlogs/internal/pusher/circuitbreaker_test.go
+++ b/plugins/outputs/cloudwatchlogs/internal/pusher/circuitbreaker_test.go
@@ -89,10 +89,8 @@ func TestCircuitBreakerBlocksTargetAfterFailure(t *testing.T) {
 	// Circuit breaker assertion: after the first failure, the failing target should
 	// NOT have sent additional batches. Only 1 send attempt should have been made
 	// before the circuit breaker blocks it.
-	assert.LessOrEqual(t, failingTargetSendCount.Load(), int32(1),
-		"Circuit breaker should block failing target from sending more than 1 batch, "+
-			"but %d batches were sent. Without a circuit breaker, the failing target "+
-			"continues flooding the worker pool with bad requests.", failingTargetSendCount.Load())
+	assert.Equal(t, int32(1), failingTargetSendCount.Load(),
+		"Circuit breaker should block failing target after exactly 1 send attempt")
 
 	// Healthy target should continue sending successfully
 	assert.Greater(t, healthyTargetSendCount.Load(), int32(0),

From eb589127d94453966b44890eedb5f203e99ca572 Mon Sep 17 00:00:00 2001
From: Marcus Mann <mpmann@amazon.com>
Date: Thu, 12 Feb 2026 15:13:11 -0500
Subject: [PATCH 39/50] test(pusher): Remove ineffective dummyBatch code in
 TestQueueHaltResume
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The dummyBatch was not connected to the queue's circuit breaker,
so calling done() on it had no effect. Simplified test to only
verify halt behavior.

🤖 Assisted by AI
---
 .../internal/pusher/queue_test.go             | 21 -------------------
 1 file changed, 21 deletions(-)

diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/queue_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/queue_test.go
index cd85b17f98b..75cdf220844 100644
--- a/plugins/outputs/cloudwatchlogs/internal/pusher/queue_test.go
+++ b/plugins/outputs/cloudwatchlogs/internal/pusher/queue_test.go
@@ -821,27 +821,6 @@ func TestQueueHaltResume(t *testing.T) {
 	// Verify only one send happened (queue is halted)
 	assert.Equal(t, int32(1), sendCount.Load(), "Should have only one send due to halt")
 
-	// Trigger flush to force send of second batch - this should block until resumed
-	done := make(chan bool)
-	go func() {
-		time.Sleep(100 * time.Millisecond) // Wait a bit
-		// Manually resume by calling success callback on a dummy batch
-		dummyBatch := newLogEventBatch(Target{"G", "S", util.StandardLogGroupClass, -1}, nil)
-		dummyBatch.addDoneCallback(func() {
-			// This simulates a successful send that should resume the queue
-		})
-		dummyBatch.done()
-		done <- true
-	}()
-
-	// This should eventually complete when the queue is resumed
-	select {
-	case <-done:
-		// Success - the resume worked
-	case <-time.After(5 * time.Second):
-		t.Fatal("Test timed out - queue may be permanently halted")
-	}
-
 	mockSender.AssertExpectations(t)
 }
 

From 3d9d77916d3dcb3f14d1e4165a884f36f15971bb Mon Sep 17 00:00:00 2001
From: Marcus Mann <mpmann@amazon.com>
Date: Thu, 12 Feb 2026 22:09:30 -0500
Subject: [PATCH 40/50] fix(pusher): Address review feedback on poison pill PR
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Replace sync.Cond with channel-based halt/resume to prevent shutdown
  deadlock (waitIfHalted now selects on haltCh and stopCh)
- Add mutex to halt/resume/waitIfHalted for thread safety
- Add TestQueueStopWhileHalted to verify no shutdown deadlock
- Add TestQueueHaltResume with proper resume assertions
- Clean up verbose test comments and weak assertions
- Remove orphaned TestQueueResumeOnBatchExpiry comment

🤖 Assisted by AI
---
 .../internal/pusher/circuitbreaker_test.go    |  2 +-
 .../internal/pusher/poison_pill_test.go       | 42 +++----------
 .../cloudwatchlogs/internal/pusher/queue.go   | 42 ++++++++-----
 .../internal/pusher/queue_test.go             | 62 +++++++++++++++++--
 .../internal/pusher/retryheap_expiry_test.go  |  5 +-
 5 files changed, 94 insertions(+), 59 deletions(-)

diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/circuitbreaker_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/circuitbreaker_test.go
index e141b6f5af1..f12e64df1f0 100644
--- a/plugins/outputs/cloudwatchlogs/internal/pusher/circuitbreaker_test.go
+++ b/plugins/outputs/cloudwatchlogs/internal/pusher/circuitbreaker_test.go
@@ -52,7 +52,7 @@ func TestCircuitBreakerBlocksTargetAfterFailure(t *testing.T) {
 
 	concurrency := 5
 	workerPool := NewWorkerPool(concurrency)
-	retryHeap := NewRetryHeap(concurrency, logger)
+	retryHeap := NewRetryHeap(logger)
 	defer workerPool.Stop()
 	defer retryHeap.Stop()
 
diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/poison_pill_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/poison_pill_test.go
index 5652209831f..4c19b169c49 100644
--- a/plugins/outputs/cloudwatchlogs/internal/pusher/poison_pill_test.go
+++ b/plugins/outputs/cloudwatchlogs/internal/pusher/poison_pill_test.go
@@ -18,26 +18,9 @@ import (
 	"github.com/aws/amazon-cloudwatch-agent/sdk/service/cloudwatchlogs"
 )
 
-// TestPoisonPillScenario validates that when multiple log groups encounter
-// AccessDenied errors simultaneously with low concurrency, the agent continues
-// publishing to allowed log groups without blocking the entire pipeline.
-//
-// This test recreates the scenario from poison-pill-test-findings.md where:
-// - 1 allowed log group + 10 denied log groups
-// - Concurrency = 2
-// - Continuous stream of new batches (simulating force_flush_interval=5s)
-// - Expected: Allowed log group continues receiving events
-// - Historical Bug: Agent stopped publishing to ALL log groups after ~5 minutes
-//
-// This test validates that the retry heap and worker pool architecture correctly
-// handles this scenario by:
-// 1. Continuously generating batches for 10 denied + 1 allowed log group
-// 2. Processing with only 2 workers (low concurrency)
-// 3. Verifying allowed log group continues to receive events throughout
-// 4. Ensuring worker pool doesn't get saturated by failed retry attempts
-//
-// The test passes because the current implementation uses a retry heap with
-// proper backoff, preventing failed batches from monopolizing worker threads.
+// TestPoisonPillScenario validates that when 10 denied + 1 allowed log groups
+// share a worker pool with concurrency=2, the allowed log group continues
+// publishing without being starved by failed retries.
 func TestPoisonPillScenario(t *testing.T) {
 	heap := NewRetryHeap(&testutil.Logger{})
 	defer heap.Stop()
@@ -174,20 +157,14 @@ func TestPoisonPillScenario(t *testing.T) {
 		"Denied log groups should have attempted to send")
 }
 
-// TestRetryHeapSmallerThanFailingLogGroups tests the specific bottleneck scenario where:
-// - Retry heap size = concurrency (e.g., 2)
-// - Number of failing log groups (10) > retry heap size (2)
-// - With bounded heap: This caused deadlock as heap filled up
-// - With unbounded heap: System handles this gracefully
-//
-// This test validates the FIX: unbounded retry heap allows all failed batches
-// to be queued without blocking workers.
+// TestRetryHeapSmallerThanFailingLogGroups verifies that with an unbounded retry
+// heap, the system handles more failing log groups than workers without deadlock.
 func TestRetryHeapSmallerThanFailingLogGroups(t *testing.T) {
 	concurrency := 2
 	numFailingLogGroups := 10
 
 	// Retry heap is now unbounded (maxSize parameter ignored)
-	heap := NewRetryHeap(concurrency, &testutil.Logger{})
+	heap := NewRetryHeap(&testutil.Logger{})
 	defer heap.Stop()
 
 	workerPool := NewWorkerPool(concurrency)
@@ -308,11 +285,8 @@ func TestRetryHeapSmallerThanFailingLogGroups(t *testing.T) {
 		successCount, deniedGroupAttemptCount.Load(), heap.Size(), numFailingLogGroups)
 
 	// With unbounded heap, allowed log group should receive events
-	if successCount == 0 {
-		t.Errorf("UNEXPECTED: Allowed log group received 0 events with unbounded heap")
-	} else {
-		t.Logf("SUCCESS: Unbounded heap handled poison pill scenario: %d successful publishes despite %d failing groups", successCount, numFailingLogGroups)
-	}
+	assert.Greater(t, successCount, int32(0),
+		"Allowed log group must receive events despite %d failing groups", numFailingLogGroups)
 }
 
 // TestSingleDeniedLogGroup validates the baseline scenario where a single denied
diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/queue.go b/plugins/outputs/cloudwatchlogs/internal/pusher/queue.go
index 5d297aed525..d32b0baa0f2 100644
--- a/plugins/outputs/cloudwatchlogs/internal/pusher/queue.go
+++ b/plugins/outputs/cloudwatchlogs/internal/pusher/queue.go
@@ -44,8 +44,9 @@ type queue struct {
 	wg                    *sync.WaitGroup
 
 	// Circuit breaker halt/resume functionality
-	haltCond *sync.Cond
-	halted   bool
+	haltMu sync.Mutex
+	haltCh chan struct{}
+	halted bool
 }
 
 var _ (Queue) = (*queue)(nil)
@@ -71,7 +72,7 @@ func newQueue(
 		stopCh:          make(chan struct{}),
 		startNonBlockCh: make(chan struct{}),
 		wg:              wg,
-		haltCond:        sync.NewCond(&sync.Mutex{}),
+		haltCh:          make(chan struct{}),
 		halted:          false,
 	}
 	q.flushTimeout.Store(flushTimeout)
@@ -258,28 +259,37 @@ func hasValidTime(e logs.LogEvent) bool {
 	return true
 }
 
-// waitIfHalted blocks until the queue is unhalted (circuit breaker functionality)
+// waitIfHalted blocks until the queue is unhalted or stopped.
 func (q *queue) waitIfHalted() {
-	q.haltCond.L.Lock()
-	for q.halted {
-		q.haltCond.Wait()
+	q.haltMu.Lock()
+	if !q.halted {
+		q.haltMu.Unlock()
+		return
+	}
+	ch := q.haltCh
+	q.haltMu.Unlock()
+	select {
+	case <-ch:
+	case <-q.stopCh:
 	}
-	q.haltCond.L.Unlock()
 }
 
-// halt stops the queue from sending batches (called on failure)
+// halt stops the queue from sending batches (called on failure).
 func (q *queue) halt() {
-	q.haltCond.L.Lock()
+	q.haltMu.Lock()
+	defer q.haltMu.Unlock()
 	q.halted = true
-	q.haltCond.L.Unlock()
 }
 
-// resume allows the queue to send batches again (called on success)
+// resume allows the queue to send batches again (called on success).
 func (q *queue) resume() {
-	q.haltCond.L.Lock()
-	q.halted = false
-	q.haltCond.Broadcast()
-	q.haltCond.L.Unlock()
+	q.haltMu.Lock()
+	defer q.haltMu.Unlock()
+	if q.halted {
+		q.halted = false
+		close(q.haltCh)
+		q.haltCh = make(chan struct{})
+	}
 }
 
 // onFailCallback returns a callback function to be executed after a failed send
diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/queue_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/queue_test.go
index 75cdf220844..293218e185a 100644
--- a/plugins/outputs/cloudwatchlogs/internal/pusher/queue_test.go
+++ b/plugins/outputs/cloudwatchlogs/internal/pusher/queue_test.go
@@ -730,7 +730,7 @@ func TestQueueCallbackRegistration(t *testing.T) {
 			flushTimer:      time.NewTimer(10 * time.Millisecond),
 			startNonBlockCh: make(chan struct{}),
 			wg:              &wg,
-			haltCond:        sync.NewCond(&sync.Mutex{}),
+			haltCh:          make(chan struct{}),
 			halted:          false,
 		}
 		q.flushTimeout.Store(10 * time.Millisecond)
@@ -774,7 +774,7 @@ func TestQueueCallbackRegistration(t *testing.T) {
 			flushTimer:      time.NewTimer(10 * time.Millisecond),
 			startNonBlockCh: make(chan struct{}),
 			wg:              &wg,
-			haltCond:        sync.NewCond(&sync.Mutex{}),
+			haltCh:          make(chan struct{}),
 			halted:          false,
 		}
 		q.flushTimeout.Store(10 * time.Millisecond)
@@ -815,14 +815,68 @@ func TestQueueHaltResume(t *testing.T) {
 	// Wait a bit for the first send to complete and halt
 	time.Sleep(50 * time.Millisecond)
 
+	// Verify queue is halted
+	queueImpl := q.(*queue)
+	queueImpl.haltMu.Lock()
+	assert.True(t, queueImpl.halted, "Queue should be halted after failure")
+	queueImpl.haltMu.Unlock()
+
 	// Add second event - should be queued but not sent due to halt
 	q.AddEvent(newStubLogEvent("second message", time.Now()))
 
 	// Verify only one send happened (queue is halted)
 	assert.Equal(t, int32(1), sendCount.Load(), "Should have only one send due to halt")
 
+	// Trigger resume by calling the success callback directly
+	queueImpl.resume()
+
+	// Verify queue is no longer halted
+	queueImpl.haltMu.Lock()
+	assert.False(t, queueImpl.halted, "Queue should be resumed after success")
+	queueImpl.haltMu.Unlock()
+
+	// Add third event - should trigger send since queue is resumed
+	q.AddEvent(newStubLogEvent("third message", time.Now()))
+
+	// Wait for the second send to complete
+	time.Sleep(50 * time.Millisecond)
+
+	// Verify second send happened (queue resumed)
+	assert.Equal(t, int32(2), sendCount.Load(), "Should have two sends after resume")
+
 	mockSender.AssertExpectations(t)
 }
 
-// TestQueueResumeOnBatchExpiry verifies that when a batch expires after 14 days of retrying,
-// the circuit breaker resumes the queue to allow new batches to be processed.
+// TestQueueStopWhileHalted verifies that Stop() unblocks a halted queue.
+// Without the stopCh select in waitIfHalted, this would deadlock.
+func TestQueueStopWhileHalted(t *testing.T) {
+	logger := testutil.NewNopLogger()
+
+	mockSender := &mockSender{}
+	mockSender.On("Send", mock.Anything).Run(func(args mock.Arguments) {
+		batch := args.Get(0).(*logEventBatch)
+		batch.fail() // Halt the queue
+	}).Return()
+	mockSender.On("Stop").Return()
+
+	var wg sync.WaitGroup
+	q := newQueue(logger, Target{"G", "S", util.StandardLogGroupClass, -1}, 10*time.Millisecond, nil, mockSender, &wg)
+
+	// Add event to trigger send → fail → halt
+	q.AddEvent(newStubLogEvent("msg", time.Now()))
+	time.Sleep(50 * time.Millisecond)
+
+	// Queue is now halted. Stop must return without deadlocking.
+	done := make(chan struct{})
+	go func() {
+		q.Stop()
+		close(done)
+	}()
+
+	select {
+	case <-done:
+		// Success — Stop() returned
+	case <-time.After(2 * time.Second):
+		t.Fatal("Stop() deadlocked on halted queue")
+	}
+}
diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_expiry_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_expiry_test.go
index f04a26f39b6..1daac1e603f 100644
--- a/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_expiry_test.go
+++ b/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_expiry_test.go
@@ -88,8 +88,5 @@ func TestRetryHeapProcessorExpiredBatchShouldResume(t *testing.T) {
 	// The circuit breaker SHOULD be resumed when the batch expires
 	// This allows the target to continue processing new batches after the bad batch is dropped
 	assert.True(t, circuitBreakerResumed.Load(),
-		"Circuit breaker should be resumed after batch expiry. "+
-			"When a batch is retried for 14 days and eventually dropped, "+
-			"the target must be unblocked to allow new batches to be processed. "+
-			"Otherwise the target remains blocked forever.")
+		"Circuit breaker should resume after batch expiry to unblock the target")
 }

From bef0a7d78f54d897c1fa84c27037f8e42fb845d2 Mon Sep 17 00:00:00 2001
From: Marcus Mann <mpmann@amazon.com>
Date: Thu, 12 Feb 2026 22:20:30 -0500
Subject: [PATCH 41/50] test(pusher): Add state callback tests for retry heap
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Verify state file management during retry, expiry, and shutdown:
- Successful retry persists file offsets via state callbacks
- Expired batch (14d) still persists offsets to prevent re-read
- Clean shutdown does not persist state for unprocessed batches

🤖 Assisted by AI
---
 .../internal/pusher/state_callback_test.go    | 188 ++++++++++++++++++
 1 file changed, 188 insertions(+)
 create mode 100644 plugins/outputs/cloudwatchlogs/internal/pusher/state_callback_test.go

diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/state_callback_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/state_callback_test.go
new file mode 100644
index 00000000000..8190e3a40b5
--- /dev/null
+++ b/plugins/outputs/cloudwatchlogs/internal/pusher/state_callback_test.go
@@ -0,0 +1,188 @@
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: MIT
+
+package pusher
+
+import (
+	"sync/atomic"
+	"testing"
+	"time"
+
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/mock"
+
+	"github.com/aws/amazon-cloudwatch-agent/internal/retryer"
+	"github.com/aws/amazon-cloudwatch-agent/internal/state"
+	"github.com/aws/amazon-cloudwatch-agent/sdk/service/cloudwatchlogs"
+	"github.com/aws/amazon-cloudwatch-agent/tool/testutil"
+)
+
+type mockFileRangeQueue struct {
+	mock.Mock
+}
+
+func (m *mockFileRangeQueue) ID() string {
+	return m.Called().String(0)
+}
+
+func (m *mockFileRangeQueue) Enqueue(r state.Range) {
+	m.Called(r)
+}
+
+// newStatefulBatch creates a batch with stateful events that register state callbacks.
+func newStatefulBatch(target Target, queue *mockFileRangeQueue) *logEventBatch {
+	batch := newLogEventBatch(target, nil)
+	now := time.Now()
+	evt := newStatefulLogEvent(now, "test", nil, &logEventState{
+		r:     state.NewRange(0, 100),
+		queue: queue,
+	})
+	batch.append(evt)
+	return batch
+}
+
+// TestRetryHeapSuccessCallsStateCallback verifies that when a batch succeeds
+// on retry through the heap, state callbacks fire to persist file offsets.
+func TestRetryHeapSuccessCallsStateCallback(t *testing.T) {
+	logger := testutil.NewNopLogger()
+	target := Target{Group: "group", Stream: "stream"}
+
+	queue := &mockFileRangeQueue{}
+	queue.On("ID").Return("file1")
+	queue.On("Enqueue", mock.Anything).Return()
+
+	service := &stubLogsService{
+		ple: func(_ *cloudwatchlogs.PutLogEventsInput) (*cloudwatchlogs.PutLogEventsOutput, error) {
+			return &cloudwatchlogs.PutLogEventsOutput{}, nil
+		},
+		cls: func(_ *cloudwatchlogs.CreateLogStreamInput) (*cloudwatchlogs.CreateLogStreamOutput, error) {
+			return &cloudwatchlogs.CreateLogStreamOutput{}, nil
+		},
+		clg: func(_ *cloudwatchlogs.CreateLogGroupInput) (*cloudwatchlogs.CreateLogGroupOutput, error) {
+			return &cloudwatchlogs.CreateLogGroupOutput{}, nil
+		},
+		dlg: func(_ *cloudwatchlogs.DescribeLogGroupsInput) (*cloudwatchlogs.DescribeLogGroupsOutput, error) {
+			return &cloudwatchlogs.DescribeLogGroupsOutput{}, nil
+		},
+	}
+
+	retryHeap := NewRetryHeap(logger)
+	workerPool := NewWorkerPool(2)
+	tm := NewTargetManager(logger, service)
+	defer retryHeap.Stop()
+	defer workerPool.Stop()
+
+	processor := NewRetryHeapProcessor(retryHeap, workerPool, service, tm, logger, time.Hour, retryer.NewLogThrottleRetryer(logger))
+
+	batch := newStatefulBatch(target, queue)
+	batch.nextRetryTime = time.Now().Add(-1 * time.Second)
+
+	err := retryHeap.Push(batch)
+	assert.NoError(t, err)
+
+	processor.processReadyMessages()
+	time.Sleep(200 * time.Millisecond)
+
+	assert.Equal(t, 0, retryHeap.Size(), "Heap should be empty after success")
+	queue.AssertCalled(t, "Enqueue", mock.Anything)
+}
+
+// TestRetryHeapExpiryCallsStateCallback verifies that when a batch expires
+// after 14 days without successfully publishing, state callbacks still fire
+// to persist file offsets and prevent re-reading on restart.
+func TestRetryHeapExpiryCallsStateCallback(t *testing.T) {
+	logger := testutil.NewNopLogger()
+	target := Target{Group: "group", Stream: "stream"}
+
+	queue := &mockFileRangeQueue{}
+	queue.On("ID").Return("file1")
+	queue.On("Enqueue", mock.Anything).Return()
+
+	service := &stubLogsService{
+		ple: func(_ *cloudwatchlogs.PutLogEventsInput) (*cloudwatchlogs.PutLogEventsOutput, error) {
+			return nil, &cloudwatchlogs.ServiceUnavailableException{}
+		},
+		cls: func(_ *cloudwatchlogs.CreateLogStreamInput) (*cloudwatchlogs.CreateLogStreamOutput, error) {
+			return &cloudwatchlogs.CreateLogStreamOutput{}, nil
+		},
+		clg: func(_ *cloudwatchlogs.CreateLogGroupInput) (*cloudwatchlogs.CreateLogGroupOutput, error) {
+			return &cloudwatchlogs.CreateLogGroupOutput{}, nil
+		},
+		dlg: func(_ *cloudwatchlogs.DescribeLogGroupsInput) (*cloudwatchlogs.DescribeLogGroupsOutput, error) {
+			return &cloudwatchlogs.DescribeLogGroupsOutput{}, nil
+		},
+	}
+
+	retryHeap := NewRetryHeap(logger)
+	workerPool := NewWorkerPool(2)
+	tm := NewTargetManager(logger, service)
+	defer retryHeap.Stop()
+	defer workerPool.Stop()
+
+	processor := NewRetryHeapProcessor(retryHeap, workerPool, service, tm, logger, 50*time.Millisecond, nil)
+
+	batch := newStatefulBatch(target, queue)
+	batch.initializeStartTime()
+	batch.expireAfter = time.Now().Add(-10 * time.Millisecond) // Already expired
+	batch.updateRetryMetadata(&cloudwatchlogs.ServiceUnavailableException{})
+	batch.nextRetryTime = time.Now().Add(-1 * time.Second) // Override to make it ready
+
+	err := retryHeap.Push(batch)
+	assert.NoError(t, err)
+
+	processor.processReadyMessages()
+	time.Sleep(200 * time.Millisecond)
+
+	assert.Equal(t, 0, retryHeap.Size(), "Expired batch should be removed")
+	queue.AssertCalled(t, "Enqueue", mock.Anything)
+}
+
+// TestShutdownDoesNotCallStateCallback verifies that during a clean shutdown
+// via Stop(), remaining batches in the retry heap do NOT have their state
+// callbacks invoked. This prevents marking undelivered data as processed.
+func TestShutdownDoesNotCallStateCallback(t *testing.T) {
+	logger := testutil.NewNopLogger()
+	target := Target{Group: "group", Stream: "stream"}
+
+	var stateCallCount atomic.Int32
+
+	retryHeap := NewRetryHeap(logger)
+	workerPool := NewWorkerPool(2)
+	defer workerPool.Stop()
+
+	service := &stubLogsService{
+		ple: func(_ *cloudwatchlogs.PutLogEventsInput) (*cloudwatchlogs.PutLogEventsOutput, error) {
+			return nil, &cloudwatchlogs.ServiceUnavailableException{}
+		},
+		cls: func(_ *cloudwatchlogs.CreateLogStreamInput) (*cloudwatchlogs.CreateLogStreamOutput, error) {
+			return &cloudwatchlogs.CreateLogStreamOutput{}, nil
+		},
+		clg: func(_ *cloudwatchlogs.CreateLogGroupInput) (*cloudwatchlogs.CreateLogGroupOutput, error) {
+			return &cloudwatchlogs.CreateLogGroupOutput{}, nil
+		},
+		dlg: func(_ *cloudwatchlogs.DescribeLogGroupsInput) (*cloudwatchlogs.DescribeLogGroupsOutput, error) {
+			return &cloudwatchlogs.DescribeLogGroupsOutput{}, nil
+		},
+	}
+	tm := NewTargetManager(logger, service)
+
+	processor := NewRetryHeapProcessor(retryHeap, workerPool, service, tm, logger, time.Hour, nil)
+	processor.Start()
+
+	// Push a batch with a future retry time so it won't be processed before Stop
+	batch := newLogEventBatch(target, nil)
+	batch.append(newLogEvent(time.Now(), "test", nil))
+	batch.addStateCallback(func() { stateCallCount.Add(1) })
+	batch.nextRetryTime = time.Now().Add(1 * time.Hour) // Not ready yet
+
+	err := retryHeap.Push(batch)
+	assert.NoError(t, err)
+
+	// Stop the processor — batch is still in heap, not ready
+	processor.Stop()
+	retryHeap.Stop()
+
+	assert.Equal(t, int32(0), stateCallCount.Load(),
+		"State callback should not be called for unprocessed batches during shutdown")
+	assert.Equal(t, 1, retryHeap.Size(), "Batch should remain in heap after shutdown")
+}

From 55bc63cb42dabc90fa9433906810da483ef8f1e3 Mon Sep 17 00:00:00 2001
From: Marcus Mann <mpmann@amazon.com>
Date: Thu, 12 Feb 2026 22:27:00 -0500
Subject: [PATCH 42/50] refactor(pusher): Audit and fix test assertions
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Fix TestRetryHeapProcessorSendsBatch: add events to batch, verify
  PutLogEvents is called and done callback fires (was testing empty batch)
- Fix TestRetryHeapProcessorExpiredBatch: set expireAfter field so
  isExpired() actually returns true, verify done() is called
- Fix race in TestRetryHeapProcessorSendsBatch: use atomic.Bool
- Reduce TestRetryHeap_UnboundedPush sleep from 3s to 100ms

🤖 Assisted by AI
---
 .../internal/pusher/retryheap_test.go         | 33 +++++++++++++------
 1 file changed, 23 insertions(+), 10 deletions(-)

diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_test.go
index 2313239367c..d86888856bf 100644
--- a/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_test.go
+++ b/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_test.go
@@ -4,6 +4,7 @@
 package pusher
 
 import (
+	"sync/atomic"
 	"testing"
 	"time"
 
@@ -109,17 +110,20 @@ func TestRetryHeapProcessorExpiredBatch(t *testing.T) {
 
 	processor := NewRetryHeapProcessor(heap, workerPool, mockService, mockTargetManager, &testutil.Logger{}, 1*time.Millisecond, retryer.NewLogThrottleRetryer(&testutil.Logger{}))
 
-	// Create expired batch
 	target := Target{Group: "group", Stream: "stream"}
 	batch := newLogEventBatch(target, nil)
-	batch.startTime = time.Now().Add(-1 * time.Hour)
+	batch.initializeStartTime()
+	batch.expireAfter = time.Now().Add(-1 * time.Hour) // Already expired
 	batch.nextRetryTime = time.Now().Add(-1 * time.Second)
 
+	var doneCalled bool
+	batch.addDoneCallback(func() { doneCalled = true })
+
 	heap.Push(batch)
 
-	// Process should drop expired batch
 	processor.processReadyMessages()
-	assert.Equal(t, 0, heap.Size())
+	assert.Equal(t, 0, heap.Size(), "Expired batch should be removed from heap")
+	assert.True(t, doneCalled, "done() should be called on expired batch to resume circuit breaker")
 }
 
 func TestRetryHeapProcessorSendsBatch(t *testing.T) {
@@ -128,21 +132,30 @@ func TestRetryHeapProcessorSendsBatch(t *testing.T) {
 
 	workerPool := NewWorkerPool(2)
 	defer workerPool.Stop()
+
 	mockService := &mockLogsService{}
+	mockService.On("PutLogEvents", mock.Anything).Return(&cloudwatchlogs.PutLogEventsOutput{}, nil)
 	mockTargetManager := &mockTargetManager{}
+	mockTargetManager.On("EnsureTargetExists", mock.Anything).Return(nil)
 
 	processor := NewRetryHeapProcessor(heap, workerPool, mockService, mockTargetManager, &testutil.Logger{}, time.Hour, retryer.NewLogThrottleRetryer(&testutil.Logger{}))
 
-	// Create ready batch (retryTime already past)
 	target := Target{Group: "group", Stream: "stream"}
 	batch := newLogEventBatch(target, nil)
+	batch.append(newLogEvent(time.Now(), "test message", nil))
 	batch.nextRetryTime = time.Now().Add(-1 * time.Second)
 
+	var doneCalled atomic.Bool
+	batch.addDoneCallback(func() { doneCalled.Store(true) })
+
 	heap.Push(batch)
 
-	// Process should send batch
 	processor.processReadyMessages()
+	time.Sleep(200 * time.Millisecond)
+
 	assert.Equal(t, 0, heap.Size())
+	assert.True(t, doneCalled.Load(), "Batch done callback should be called on successful send")
+	mockService.AssertCalled(t, "PutLogEvents", mock.Anything)
 }
 
 func TestRetryHeap_UnboundedPush(t *testing.T) {
@@ -152,11 +165,11 @@ func TestRetryHeap_UnboundedPush(t *testing.T) {
 	// Push multiple batches without blocking
 	target := Target{Group: "group", Stream: "stream"}
 	batch1 := newLogEventBatch(target, nil)
-	batch1.nextRetryTime = time.Now().Add(3 * time.Second)
+	batch1.nextRetryTime = time.Now().Add(50 * time.Millisecond)
 	batch2 := newLogEventBatch(target, nil)
-	batch2.nextRetryTime = time.Now().Add(3 * time.Second)
+	batch2.nextRetryTime = time.Now().Add(50 * time.Millisecond)
 	batch3 := newLogEventBatch(target, nil)
-	batch3.nextRetryTime = time.Now().Add(3 * time.Second)
+	batch3.nextRetryTime = time.Now().Add(50 * time.Millisecond)
 
 	// All pushes should succeed immediately (non-blocking)
 	err := heap.Push(batch1)
@@ -171,7 +184,7 @@ func TestRetryHeap_UnboundedPush(t *testing.T) {
 		t.Fatalf("Expected size 3, got %d", heap.Size())
 	}
 
-	time.Sleep(3 * time.Second)
+	time.Sleep(100 * time.Millisecond)
 
 	// Pop ready batches
 	readyBatches := heap.PopReady()

From c1e194404e4a04b4e9bfb876dfb68284f41960aa Mon Sep 17 00:00:00 2001
From: Marcus Mann <mpmann@amazon.com>
Date: Thu, 12 Feb 2026 22:39:40 -0500
Subject: [PATCH 43/50] refactor(pusher): Remove redundant
 TestRetryHeapSmallerThanFailingLogGroups
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

TestPoisonPillScenario already covers the same scenario (10 denied +
1 allowed with low concurrency). The bounded heap no longer exists so
the 'smaller than' framing is no longer meaningful.

🤖 Assisted by AI
---
 .../internal/pusher/poison_pill_test.go       | 133 ------------------
 1 file changed, 133 deletions(-)

diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/poison_pill_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/poison_pill_test.go
index 4c19b169c49..2959ce41e00 100644
--- a/plugins/outputs/cloudwatchlogs/internal/pusher/poison_pill_test.go
+++ b/plugins/outputs/cloudwatchlogs/internal/pusher/poison_pill_test.go
@@ -4,7 +4,6 @@
 package pusher
 
 import (
-	"fmt"
 	"sync"
 	"sync/atomic"
 	"testing"
@@ -157,138 +156,6 @@ func TestPoisonPillScenario(t *testing.T) {
 		"Denied log groups should have attempted to send")
 }
 
-// TestRetryHeapSmallerThanFailingLogGroups verifies that with an unbounded retry
-// heap, the system handles more failing log groups than workers without deadlock.
-func TestRetryHeapSmallerThanFailingLogGroups(t *testing.T) {
-	concurrency := 2
-	numFailingLogGroups := 10
-
-	// Retry heap is now unbounded (maxSize parameter ignored)
-	heap := NewRetryHeap(&testutil.Logger{})
-	defer heap.Stop()
-
-	workerPool := NewWorkerPool(concurrency)
-	defer workerPool.Stop()
-
-	mockService := &mockLogsService{}
-	mockTargetManager := &mockTargetManager{}
-	mockTargetManager.On("EnsureTargetExists", mock.Anything).Return(nil)
-
-	accessDeniedErr := &cloudwatchlogs.AccessDeniedException{
-		Message_: stringPtr("Access denied"),
-	}
-
-	var allowedGroupSuccessCount atomic.Int32
-	var deniedGroupAttemptCount atomic.Int32
-
-	mockService.On("PutLogEvents", mock.MatchedBy(func(input *cloudwatchlogs.PutLogEventsInput) bool {
-		return *input.LogGroupName == "allowed"
-	})).Return(&cloudwatchlogs.PutLogEventsOutput{}, nil).Run(func(args mock.Arguments) {
-		time.Sleep(10 * time.Millisecond)
-		allowedGroupSuccessCount.Add(1)
-	})
-
-	mockService.On("PutLogEvents", mock.MatchedBy(func(input *cloudwatchlogs.PutLogEventsInput) bool {
-		return *input.LogGroupName != "allowed"
-	})).Return((*cloudwatchlogs.PutLogEventsOutput)(nil), accessDeniedErr).Run(func(args mock.Arguments) {
-		time.Sleep(10 * time.Millisecond)
-		deniedGroupAttemptCount.Add(1)
-	})
-
-	processor := NewRetryHeapProcessor(heap, workerPool, mockService, mockTargetManager, &testutil.Logger{}, 50*time.Millisecond, retryer.NewLogThrottleRetryer(&testutil.Logger{}))
-
-	// Create targets
-	allowedTarget := Target{Group: "allowed", Stream: "stream"}
-	deniedTargets := make([]Target, numFailingLogGroups)
-	for i := 0; i < numFailingLogGroups; i++ {
-		deniedTargets[i] = Target{Group: fmt.Sprintf("denied-%d", i), Stream: "stream"}
-	}
-
-	done := make(chan struct{})
-	var wg sync.WaitGroup
-
-	// Generate batches for all failing log groups continuously
-	for i := 0; i < numFailingLogGroups; i++ {
-		wg.Add(1)
-		go func(target Target) {
-			defer wg.Done()
-			ticker := time.NewTicker(30 * time.Millisecond)
-			defer ticker.Stop()
-			batchCount := 0
-			for {
-				select {
-				case <-done:
-					return
-				case <-ticker.C:
-					if batchCount >= 3 {
-						return
-					}
-					batch := createBatch(target, 10)
-					batch.nextRetryTime = time.Now().Add(-1 * time.Second)
-					heap.Push(batch)
-					batchCount++
-				}
-			}
-		}(deniedTargets[i])
-	}
-
-	// Generate batches for allowed log group
-	wg.Add(1)
-	go func() {
-		defer wg.Done()
-		ticker := time.NewTicker(30 * time.Millisecond)
-		defer ticker.Stop()
-		batchCount := 0
-		for {
-			select {
-			case <-done:
-				return
-			case <-ticker.C:
-				if batchCount >= 5 {
-					return
-				}
-				batch := createBatch(allowedTarget, 10)
-				batch.nextRetryTime = time.Now().Add(-1 * time.Second)
-				heap.Push(batch)
-				batchCount++
-			}
-		}
-	}()
-
-	// Process continuously
-	processorDone := make(chan struct{})
-	go func() {
-		ticker := time.NewTicker(15 * time.Millisecond)
-		defer ticker.Stop()
-		for {
-			select {
-			case <-processorDone:
-				return
-			case <-ticker.C:
-				processor.processReadyMessages()
-			}
-		}
-	}()
-
-	// Run for 1 second
-	time.Sleep(1 * time.Second)
-	close(done)
-	wg.Wait()
-	time.Sleep(300 * time.Millisecond)
-	processor.processReadyMessages()
-	time.Sleep(100 * time.Millisecond)
-	close(processorDone)
-
-	successCount := allowedGroupSuccessCount.Load()
-
-	t.Logf("Results: Allowed success=%d, Denied attempts=%d, Heap size=%d, Failing groups=%d",
-		successCount, deniedGroupAttemptCount.Load(), heap.Size(), numFailingLogGroups)
-
-	// With unbounded heap, allowed log group should receive events
-	assert.Greater(t, successCount, int32(0),
-		"Allowed log group must receive events despite %d failing groups", numFailingLogGroups)
-}
-
 // TestSingleDeniedLogGroup validates the baseline scenario where a single denied
 // log group does not affect the allowed log group.
 func TestSingleDeniedLogGroup(t *testing.T) {

From 334acdff784121c7185ebda873eadda04f7a6090 Mon Sep 17 00:00:00 2001
From: Marcus Mann <mpmann@amazon.com>
Date: Thu, 12 Feb 2026 22:48:06 -0500
Subject: [PATCH 44/50] docs(pusher): Remove internal ticket references from
 test comments
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

🤖 Assisted by AI
---
 .../cloudwatchlogs/internal/pusher/retryheap_recovery_test.go   | 2 --
 1 file changed, 2 deletions(-)

diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_recovery_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_recovery_test.go
index 7225355f0d2..747272d1bed 100644
--- a/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_recovery_test.go
+++ b/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_recovery_test.go
@@ -20,7 +20,6 @@ import (
 // TestRecoveryWhenPermissionGrantedDuringRetry validates that when PLE permissions
 // are missing initially but granted while retry is ongoing, the system recovers
 // and successfully publishes logs.
-// This test addresses CWQS-3192 requirement 1.
 func TestRecoveryWhenPermissionGrantedDuringRetry(t *testing.T) {
 	heap := NewRetryHeap(&testutil.Logger{})
 	defer heap.Stop()
@@ -112,7 +111,6 @@ func TestRecoveryWhenPermissionGrantedDuringRetry(t *testing.T) {
 
 // TestRecoveryAfterSystemRestart validates that when the system restarts with
 // retry ongoing, it resumes correctly by loading state and continuing retries.
-// This test addresses CWQS-3192 requirement 2.
 func TestRecoveryAfterSystemRestart(t *testing.T) {
 	heap := NewRetryHeap(&testutil.Logger{})
 	defer heap.Stop()

From b6f3b3ed5901717c0d6079eec63b7bd0ee590583 Mon Sep 17 00:00:00 2001
From: Marcus Mann <mpmann@amazon.com>
Date: Thu, 12 Feb 2026 22:50:24 -0500
Subject: [PATCH 45/50] refactor(pusher): Simplify fail callback to direct
 method reference
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

🤖 Assisted by AI
---
 plugins/outputs/cloudwatchlogs/internal/pusher/queue.go | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/queue.go b/plugins/outputs/cloudwatchlogs/internal/pusher/queue.go
index d32b0baa0f2..86f1bd6e4c2 100644
--- a/plugins/outputs/cloudwatchlogs/internal/pusher/queue.go
+++ b/plugins/outputs/cloudwatchlogs/internal/pusher/queue.go
@@ -182,7 +182,7 @@ func (q *queue) merge(mergeChan chan logs.LogEvent) {
 func (q *queue) send() {
 	if len(q.batch.events) > 0 {
 		q.batch.addDoneCallback(q.onSuccessCallback(q.batch.bufferedSize))
-		q.batch.addFailCallback(q.onFailCallback())
+		q.batch.addFailCallback(q.halt)
 
 		// Wait if halted (circuit breaker)
 		q.waitIfHalted()
@@ -291,10 +291,3 @@ func (q *queue) resume() {
 		q.haltCh = make(chan struct{})
 	}
 }
-
-// onFailCallback returns a callback function to be executed after a failed send
-func (q *queue) onFailCallback() func() {
-	return func() {
-		q.halt()
-	}
-}

From 98bdc89ea88ad5fd2e8602a3b35f2f165b5f7531 Mon Sep 17 00:00:00 2001
From: Marcus Mann <mpmann@amazon.com>
Date: Fri, 13 Feb 2026 11:34:25 -0500
Subject: [PATCH 46/50] style(pusher): Fix unused parameter lint warnings
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

🤖 Assisted by AI
---
 .../cloudwatchlogs/internal/pusher/poison_pill_test.go      | 6 +++---
 .../cloudwatchlogs/internal/pusher/retryheap_expiry_test.go | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/poison_pill_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/poison_pill_test.go
index 2959ce41e00..3500173e0af 100644
--- a/plugins/outputs/cloudwatchlogs/internal/pusher/poison_pill_test.go
+++ b/plugins/outputs/cloudwatchlogs/internal/pusher/poison_pill_test.go
@@ -42,14 +42,14 @@ func TestPoisonPillScenario(t *testing.T) {
 	// Configure mock service responses with realistic latency
 	mockService.On("PutLogEvents", mock.MatchedBy(func(input *cloudwatchlogs.PutLogEventsInput) bool {
 		return *input.LogGroupName == "log-stream-ple-access-granted"
-	})).Return(&cloudwatchlogs.PutLogEventsOutput{}, nil).Run(func(args mock.Arguments) {
+	})).Return(&cloudwatchlogs.PutLogEventsOutput{}, nil).Run(func(_ mock.Arguments) {
 		time.Sleep(10 * time.Millisecond) // Simulate API latency
 		allowedGroupSuccessCount.Add(1)
 	})
 
 	mockService.On("PutLogEvents", mock.MatchedBy(func(input *cloudwatchlogs.PutLogEventsInput) bool {
 		return *input.LogGroupName != "log-stream-ple-access-granted"
-	})).Return((*cloudwatchlogs.PutLogEventsOutput)(nil), accessDeniedErr).Run(func(args mock.Arguments) {
+	})).Return((*cloudwatchlogs.PutLogEventsOutput)(nil), accessDeniedErr).Run(func(_ mock.Arguments) {
 		time.Sleep(10 * time.Millisecond) // Simulate API latency
 		deniedGroupAttemptCount.Add(1)
 	})
@@ -177,7 +177,7 @@ func TestSingleDeniedLogGroup(t *testing.T) {
 
 	mockService.On("PutLogEvents", mock.MatchedBy(func(input *cloudwatchlogs.PutLogEventsInput) bool {
 		return *input.LogGroupName == "log-stream-ple-access-granted"
-	})).Return(&cloudwatchlogs.PutLogEventsOutput{}, nil).Run(func(args mock.Arguments) {
+	})).Return(&cloudwatchlogs.PutLogEventsOutput{}, nil).Run(func(_ mock.Arguments) {
 		allowedGroupSuccessCount.Add(1)
 	})
 
diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_expiry_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_expiry_test.go
index 1daac1e603f..cdbbd56838f 100644
--- a/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_expiry_test.go
+++ b/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_expiry_test.go
@@ -21,7 +21,7 @@ func TestRetryHeapProcessorExpiredBatchShouldResume(t *testing.T) {
 
 	var sendAttempts atomic.Int32
 	mockService := &stubLogsService{
-		ple: func(input *cloudwatchlogs.PutLogEventsInput) (*cloudwatchlogs.PutLogEventsOutput, error) {
+		ple: func(_ *cloudwatchlogs.PutLogEventsInput) (*cloudwatchlogs.PutLogEventsOutput, error) {
 			sendAttempts.Add(1)
 			// Always fail to simulate a problematic target
 			return nil, &cloudwatchlogs.ServiceUnavailableException{}

From 1aad58edcbb6b21bffe01dd4163832e3149cc3ef Mon Sep 17 00:00:00 2001
From: Jeffrey Chien <chienjef@amazon.com>
Date: Mon, 16 Feb 2026 16:47:36 -0500
Subject: [PATCH 47/50] Fix E2E test workflow permissions (#2028)

---
 .../application-signals-e2e-test.yml          | 21 ++++++----
 .github/workflows/build-test-artifacts.yml    | 16 +++----
 .github/workflows/e2e-test.yml                | 42 ++++++++++++++++++-
 .../eks-performance-cluster-addon-install.yml |  6 ++-
 .github/workflows/integration-test.yml        | 21 ++++++----
 .github/workflows/wd-integration-test.yml     | 21 ++++++----
 6 files changed, 96 insertions(+), 31 deletions(-)

diff --git a/.github/workflows/application-signals-e2e-test.yml b/.github/workflows/application-signals-e2e-test.yml
index 656d75b1298..94f4579a581 100644
--- a/.github/workflows/application-signals-e2e-test.yml
+++ b/.github/workflows/application-signals-e2e-test.yml
@@ -41,13 +41,20 @@ jobs:
             exit 1
           fi
       - run: |
-          conclusion=$(gh run view ${{ inputs.build_run_id }} --repo $GITHUB_REPOSITORY --json conclusion -q '.conclusion')
-          if [[ $conclusion == "success" ]]; then
-            echo "Run succeeded"
-          else
-            echo "Run failed"
-            exit 1
-          fi
+          for i in {1..6}; do
+            conclusion=$(gh run view ${{ inputs.build_run_id }} --repo $GITHUB_REPOSITORY --json conclusion -q '.conclusion')
+            if [[ "$conclusion" == "success" ]]; then
+              echo "Run succeeded"
+              exit 0
+            elif [[ "$conclusion" == "failure" || "$conclusion" == "cancelled" ]]; then
+              echo "Run failed with: $conclusion"
+              exit 1
+            fi
+            echo "Waiting for workflow to complete (attempt $i)..."
+            sleep 5
+          done
+          echo "Timed out waiting for workflow"
+          exit 1
         env:
           GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
 
diff --git a/.github/workflows/build-test-artifacts.yml b/.github/workflows/build-test-artifacts.yml
index ad180deb77d..e5b0d793a4b 100644
--- a/.github/workflows/build-test-artifacts.yml
+++ b/.github/workflows/build-test-artifacts.yml
@@ -19,14 +19,14 @@ on:
     - cron: '0 11 * * 1,2,3,4,5' # Every day at 11:00 UTC on Monday to Friday
   workflow_dispatch:
     inputs:
-      test-image-before-upload:
-        description: "Run Test on the new container image"
+      run-tests:
+        description: "Run test workflows after build"
         default: true
         type: boolean
   workflow_call:
     inputs:
-      test-image-before-upload:
-        description: "Run Test on the new container image"
+      run-tests:
+        description: "Run test workflows after build"
         default: true
         type: boolean
 
@@ -114,7 +114,7 @@ jobs:
 
   StartIntegrationTests:
     needs: [ BuildAndUploadPackages, BuildAndUploadITAR, BuildAndUploadCN, BuildDocker, BuildDistributor ]
-    if: ${{ github.event_name == 'push' || inputs.test-image-before-upload }}
+    if: ${{ github.event_name == 'push' || inputs.run-tests }}
     runs-on: ubuntu-latest
     permissions:
       actions: write
@@ -126,7 +126,7 @@ jobs:
   StartApplicationSignalsE2ETests:
     needs: [ BuildAndUploadPackages, BuildAndUploadITAR, BuildAndUploadCN, BuildDocker, BuildDistributor ]
     # Workflow only runs against main
-    if: ${{ github.event_name == 'push' || inputs.test-image-before-upload }}
+    if: ${{ github.event_name == 'push' || inputs.run-tests }}
     runs-on: ubuntu-latest
     permissions:
       actions: write
@@ -137,7 +137,7 @@ jobs:
 
   StartEKSE2ETests:
     needs: [ BuildAndUploadPackages, BuildAndUploadITAR, BuildAndUploadCN, BuildDocker, BuildDistributor ]
-    if: ${{ github.event_name == 'push' || inputs.test-image-before-upload }}
+    if: ${{ github.event_name == 'push' || inputs.run-tests }}
     runs-on: ubuntu-latest
     permissions:
       actions: write
@@ -148,7 +148,7 @@ jobs:
 
   StartWorkloadDiscoveryIntegrationTests:
     needs: [ BuildAndUploadPackages, BuildAndUploadITAR, BuildAndUploadCN, BuildDocker, BuildDistributor ]
-    if: ${{ github.event_name == 'push' || inputs.test-image-before-upload }}
+    if: ${{ github.event_name == 'push' || inputs.run-tests }}
     runs-on: ubuntu-latest
     permissions:
       actions: write
diff --git a/.github/workflows/e2e-test.yml b/.github/workflows/e2e-test.yml
index 6b907c472b6..667992e6978 100644
--- a/.github/workflows/e2e-test.yml
+++ b/.github/workflows/e2e-test.yml
@@ -69,12 +69,16 @@ jobs:
     permissions:
       id-token: write
       contents: read
+      actions: write
     with:
-      test-image-before-upload: false
+      run-tests: false
 
   BuildOperator:
     needs: [GetLatestOperatorCommitSHA]
     uses: aws/amazon-cloudwatch-agent-operator/.github/workflows/build-and-upload.yml@main
+    permissions:
+      id-token: write
+      contents: read
     concurrency:
       group: ${{ github.workflow }}-operator-${{ inputs.operator-branch || 'main' }}
       cancel-in-progress: true
@@ -159,6 +163,9 @@ jobs:
     needs: [ GetLatestOperatorCommitSHA, GenerateTestMatrix, OutputEnvVariables ]
     if: always() && !cancelled() && !contains(needs.*.result, 'failure')
     name: 'EKSE2EJVMTomcatTestHelm'
+    permissions:
+      id-token: write
+      contents: read
     uses: ./.github/workflows/eks-e2e-test.yml
     with:
       terraform_dir: terraform/eks/e2e
@@ -183,6 +190,9 @@ jobs:
     needs: [ GetLatestOperatorCommitSHA, GenerateTestMatrix, OutputEnvVariables ]
     if: always() && !cancelled() && !contains(needs.*.result, 'failure')
     name: 'EKSE2EJVMTomcatTestAddon'
+    permissions:
+      id-token: write
+      contents: read
     uses: ./.github/workflows/eks-e2e-test.yml
     with:
       terraform_dir: terraform/eks/e2e
@@ -207,6 +217,9 @@ jobs:
     needs: [ GetLatestOperatorCommitSHA, GenerateTestMatrix, OutputEnvVariables ]
     if: always() && !cancelled() && !contains(needs.*.result, 'failure')
     name: 'EKSE2EKafkaTestHelm'
+    permissions:
+      id-token: write
+      contents: read
     uses: ./.github/workflows/eks-e2e-test.yml
     with:
       terraform_dir: terraform/eks/e2e
@@ -231,6 +244,9 @@ jobs:
     needs: [ GetLatestOperatorCommitSHA, GenerateTestMatrix, OutputEnvVariables ]
     if: always() && !cancelled() && !contains(needs.*.result, 'failure')
     name: 'EKSE2EKafkaTestAddon'
+    permissions:
+      id-token: write
+      contents: read
     uses: ./.github/workflows/eks-e2e-test.yml
     with:
       terraform_dir: terraform/eks/e2e
@@ -255,6 +271,9 @@ jobs:
     needs: [ GetLatestOperatorCommitSHA, GenerateTestMatrix, OutputEnvVariables ]
     if: always() && !cancelled() && !contains(needs.*.result, 'failure')
     name: 'EKSE2EJMXContainerInsightsTestHelm'
+    permissions:
+      id-token: write
+      contents: read
     uses: ./.github/workflows/eks-e2e-test.yml
     with:
       terraform_dir: terraform/eks/e2e
@@ -279,6 +298,9 @@ jobs:
     needs: [ GetLatestOperatorCommitSHA, GenerateTestMatrix, OutputEnvVariables ]
     if: always() && !cancelled() && !contains(needs.*.result, 'failure')
     name: 'EKSE2EJMXContainerInsightsTestAddon'
+    permissions:
+      id-token: write
+      contents: read
     uses: ./.github/workflows/eks-e2e-test.yml
     with:
       terraform_dir: terraform/eks/e2e
@@ -303,6 +325,9 @@ jobs:
     needs: [ GetLatestOperatorCommitSHA, GenerateTestMatrix, OutputEnvVariables ]
     if: always() && !cancelled() && !contains(needs.*.result, 'failure')
     name: 'EKSE2EJVMTomcatTestHelmIPv6'
+    permissions:
+      id-token: write
+      contents: read
     uses: ./.github/workflows/eks-e2e-test.yml
     with:
       terraform_dir: terraform/eks/e2e
@@ -329,6 +354,9 @@ jobs:
     needs: [ GetLatestOperatorCommitSHA, GenerateTestMatrix, OutputEnvVariables ]
     if: always() && !cancelled() && !contains(needs.*.result, 'failure')
     name: 'EKSE2EJVMTomcatTestAddonIPv6'
+    permissions:
+      id-token: write
+      contents: read
     uses: ./.github/workflows/eks-e2e-test.yml
     with:
       terraform_dir: terraform/eks/e2e
@@ -355,6 +383,9 @@ jobs:
     needs: [ GetLatestOperatorCommitSHA, GenerateTestMatrix, OutputEnvVariables ]
     if: always() && !cancelled() && !contains(needs.*.result, 'failure')
     name: 'EKSE2EKafkaTestHelmIPv6'
+    permissions:
+      id-token: write
+      contents: read
     uses: ./.github/workflows/eks-e2e-test.yml
     with:
       terraform_dir: terraform/eks/e2e
@@ -381,6 +412,9 @@ jobs:
     needs: [ GetLatestOperatorCommitSHA, GenerateTestMatrix, OutputEnvVariables ]
     if: always() && !cancelled() && !contains(needs.*.result, 'failure')
     name: 'EKSE2EKafkaTestAddonIPv6'
+    permissions:
+      id-token: write
+      contents: read
     uses: ./.github/workflows/eks-e2e-test.yml
     with:
       terraform_dir: terraform/eks/e2e
@@ -407,6 +441,9 @@ jobs:
     needs: [ GetLatestOperatorCommitSHA, GenerateTestMatrix, OutputEnvVariables ]
     if: always() && !cancelled() && !contains(needs.*.result, 'failure')
     name: 'EKSE2EJMXContainerInsightsTestHelmIPv6'
+    permissions:
+      id-token: write
+      contents: read
     uses: ./.github/workflows/eks-e2e-test.yml
     with:
       terraform_dir: terraform/eks/e2e
@@ -433,6 +470,9 @@ jobs:
     needs: [ GetLatestOperatorCommitSHA, GenerateTestMatrix, OutputEnvVariables ]
     if: always() && !cancelled() && !contains(needs.*.result, 'failure')
     name: 'EKSE2EJMXContainerInsightsTestAddonIPv6'
+    permissions:
+      id-token: write
+      contents: read
     uses: ./.github/workflows/eks-e2e-test.yml
     with:
       terraform_dir: terraform/eks/e2e
diff --git a/.github/workflows/eks-performance-cluster-addon-install.yml b/.github/workflows/eks-performance-cluster-addon-install.yml
index 60292bf99d7..37580e1680b 100644
--- a/.github/workflows/eks-performance-cluster-addon-install.yml
+++ b/.github/workflows/eks-performance-cluster-addon-install.yml
@@ -122,14 +122,18 @@ jobs:
     permissions:
       id-token: write
       contents: read
+      actions: write
     with:
-      test-image-before-upload: false
+      run-tests: false
 
   # Build and upload operator image to ECR repo
   BuildOperator:
     needs: [ check-trigger, GetLatestOperatorCommitSHA ]
     if: ${{ needs.check-trigger.outputs.should_continue == 'true' }}
     uses: aws/amazon-cloudwatch-agent-operator/.github/workflows/build-and-upload.yml@main
+    permissions:
+      id-token: write
+      contents: read
     concurrency:
       group: ${{ github.workflow }}-operator-${{ inputs.operator-branch || 'main' }}
       cancel-in-progress: true
diff --git a/.github/workflows/integration-test.yml b/.github/workflows/integration-test.yml
index 5534cd38889..ef433f8e011 100644
--- a/.github/workflows/integration-test.yml
+++ b/.github/workflows/integration-test.yml
@@ -57,13 +57,20 @@ jobs:
             exit 1
           fi
       - run: |
-          conclusion=$(gh run view ${{ inputs.build_run_id }} --repo $GITHUB_REPOSITORY --json conclusion -q '.conclusion')
-          if [[ $conclusion == "success" ]]; then
-            echo "Run succeeded"
-          else
-            echo "Run failed"
-            exit 1
-          fi
+          for i in {1..6}; do
+            conclusion=$(gh run view ${{ inputs.build_run_id }} --repo $GITHUB_REPOSITORY --json conclusion -q '.conclusion')
+            if [[ "$conclusion" == "success" ]]; then
+              echo "Run succeeded"
+              exit 0
+            elif [[ "$conclusion" == "failure" || "$conclusion" == "cancelled" ]]; then
+              echo "Run failed with: $conclusion"
+              exit 1
+            fi
+            echo "Waiting for workflow to complete (attempt $i)..."
+            sleep 5
+          done
+          echo "Timed out waiting for workflow"
+          exit 1
         env:
           GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
 
diff --git a/.github/workflows/wd-integration-test.yml b/.github/workflows/wd-integration-test.yml
index b3b35bf2025..c5edc3f1f5e 100644
--- a/.github/workflows/wd-integration-test.yml
+++ b/.github/workflows/wd-integration-test.yml
@@ -42,13 +42,20 @@ jobs:
             exit 1
           fi
       - run: |-
-          conclusion=$(gh run view ${{ inputs.build_run_id }} --repo $GITHUB_REPOSITORY --json conclusion -q '.conclusion')
-          if [[ $conclusion == "success" ]]; then
-            echo "Run succeeded"
-          else
-            echo "Run failed"
-            exit 1
-          fi
+          for i in {1..6}; do
+            conclusion=$(gh run view ${{ inputs.build_run_id }} --repo $GITHUB_REPOSITORY --json conclusion -q '.conclusion')
+            if [[ "$conclusion" == "success" ]]; then
+              echo "Run succeeded"
+              exit 0
+            elif [[ "$conclusion" == "failure" || "$conclusion" == "cancelled" ]]; then
+              echo "Run failed with: $conclusion"
+              exit 1
+            fi
+            echo "Waiting for workflow to complete (attempt $i)..."
+            sleep 5
+          done
+          echo "Timed out waiting for workflow"
+          exit 1
         env:
           GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
 

From 48918e2b6e6d35b36eb690853d7d8e56cf6ebea1 Mon Sep 17 00:00:00 2001
From: Jeffrey Chien <chienjef@amazon.com>
Date: Tue, 24 Feb 2026 12:26:04 -0500
Subject: [PATCH 48/50] Bump minimum workflow go version to match go.mod
 (#2036)

---
 .github/workflows/PR-build.yml                      | 6 +++---
 .github/workflows/PR-test.yml                       | 4 ++--
 .github/workflows/e2e-test.yml                      | 4 ++--
 .github/workflows/eks-performance-cluster-tests.yml | 2 +-
 .github/workflows/otel-fork-replace.yml             | 2 +-
 .github/workflows/test-artifacts.yml                | 6 +++---
 .github/workflows/test-build-docker.yml             | 2 +-
 .github/workflows/test-build-packages.yml           | 2 +-
 .github/workflows/test-build.yml                    | 2 +-
 .github/workflows/upload-dependencies.yml           | 2 +-
 10 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/.github/workflows/PR-build.yml b/.github/workflows/PR-build.yml
index ac764e576cd..9a89c0db3d6 100644
--- a/.github/workflows/PR-build.yml
+++ b/.github/workflows/PR-build.yml
@@ -48,7 +48,7 @@ jobs:
         if: needs.changes.outputs.lint == 'true'
         uses: actions/setup-go@v4
         with:
-          go-version: ~1.25
+          go-version: ~1.25.7
           cache: false
 
       - name: Check out code
@@ -105,7 +105,7 @@ jobs:
         if: needs.changes.outputs.build == 'true'
         uses: actions/setup-go@v4
         with:
-          go-version: ~1.25
+          go-version: ~1.25.7
           cache: false
 
       - name: Check out code
@@ -152,7 +152,7 @@ jobs:
         if: needs.changes.outputs.build == 'true'
         uses: actions/setup-go@v4
         with:
-          go-version: ~1.25
+          go-version: ~1.25.7
           cache: false
 
       - name: Check out code
diff --git a/.github/workflows/PR-test.yml b/.github/workflows/PR-test.yml
index 7e4f85f7bf5..742e9870f29 100644
--- a/.github/workflows/PR-test.yml
+++ b/.github/workflows/PR-test.yml
@@ -95,7 +95,7 @@ jobs:
       - name: Set up Go 1.x
         uses: actions/setup-go@v4
         with:
-          go-version: ~1.25
+          go-version: ~1.25.7
 
   StartLocalStack:
     name: 'StartLocalStack'
@@ -138,7 +138,7 @@ jobs:
       - name: Set up Go 1.x
         uses: actions/setup-go@v4
         with:
-          go-version: ~1.25
+          go-version: ~1.25.7
 
       - name: Install jq
         run: sudo apt-get install -y jq
diff --git a/.github/workflows/e2e-test.yml b/.github/workflows/e2e-test.yml
index 667992e6978..5397885df69 100644
--- a/.github/workflows/e2e-test.yml
+++ b/.github/workflows/e2e-test.yml
@@ -110,7 +110,7 @@ jobs:
       - name: Set up Go 1.x
         uses: actions/setup-go@v4
         with:
-          go-version: ~1.25
+          go-version: ~1.25.7
 
       - name: SetOutputs
         id: set-outputs
@@ -147,7 +147,7 @@ jobs:
       - name: Set up Go 1.x
         uses: actions/setup-go@v4
         with:
-          go-version: ~1.25
+          go-version: ~1.25.7
 
       - name: Generate matrix
         id: set-matrix
diff --git a/.github/workflows/eks-performance-cluster-tests.yml b/.github/workflows/eks-performance-cluster-tests.yml
index 89ce188b2f9..90429fab79d 100644
--- a/.github/workflows/eks-performance-cluster-tests.yml
+++ b/.github/workflows/eks-performance-cluster-tests.yml
@@ -122,7 +122,7 @@ jobs:
       - name: Set up Go 1.x
         uses: actions/setup-go@v4
         with:
-          go-version: ~1.25
+          go-version: ~1.25.7
 
       - uses: actions/checkout@v4
         with:
diff --git a/.github/workflows/otel-fork-replace.yml b/.github/workflows/otel-fork-replace.yml
index 29707139b47..0f33b732b09 100644
--- a/.github/workflows/otel-fork-replace.yml
+++ b/.github/workflows/otel-fork-replace.yml
@@ -33,7 +33,7 @@ jobs:
       - name: Set up Go 1.x
         uses: actions/setup-go@v4
         with:
-          go-version: ~1.25
+          go-version: ~1.25.7
           cache: false
       - name: Update OTel fork components version
         id: set-matrix
diff --git a/.github/workflows/test-artifacts.yml b/.github/workflows/test-artifacts.yml
index ff4c70a63d9..292adee2f38 100644
--- a/.github/workflows/test-artifacts.yml
+++ b/.github/workflows/test-artifacts.yml
@@ -145,7 +145,7 @@ jobs:
       - name: Set up Go 1.x
         uses: actions/setup-go@v4
         with:
-          go-version: ~1.25
+          go-version: ~1.25.7
 
   GenerateTestMatrix:
     name: 'GenerateTestMatrix'
@@ -183,7 +183,7 @@ jobs:
       - name: Set up Go 1.x
         uses: actions/setup-go@v4
         with:
-          go-version: ~1.25
+          go-version: ~1.25.7
 
       - name: Generate matrix
         id: set-matrix
@@ -303,7 +303,7 @@ jobs:
       - name: Set up Go 1.x
         uses: actions/setup-go@v2
         with:
-          go-version: ~1.25
+          go-version: ~1.25.7
 
       - name: Configure AWS Credentials
         uses: aws-actions/configure-aws-credentials@v4
diff --git a/.github/workflows/test-build-docker.yml b/.github/workflows/test-build-docker.yml
index 03c5d143c08..7d474f94dc6 100644
--- a/.github/workflows/test-build-docker.yml
+++ b/.github/workflows/test-build-docker.yml
@@ -143,7 +143,7 @@ jobs:
       - name: Set up Go 1.x
         uses: actions/setup-go@v4
         with:
-          go-version: ~1.25
+          go-version: ~1.25.7
 
       - name: Configure AWS Credentials
         uses: aws-actions/configure-aws-credentials@v4
diff --git a/.github/workflows/test-build-packages.yml b/.github/workflows/test-build-packages.yml
index 0814a80047b..a14bc4e960a 100644
--- a/.github/workflows/test-build-packages.yml
+++ b/.github/workflows/test-build-packages.yml
@@ -80,7 +80,7 @@ jobs:
       - name: Set up Go 1.x
         uses: actions/setup-go@v4
         with:
-          go-version: ~1.25
+          go-version: ~1.25.7
 
       - name: Free up disk space
         working-directory: cwa
diff --git a/.github/workflows/test-build.yml b/.github/workflows/test-build.yml
index d72f59339c9..94e6196f01c 100644
--- a/.github/workflows/test-build.yml
+++ b/.github/workflows/test-build.yml
@@ -75,7 +75,7 @@ jobs:
       - name: Set up Go 1.x
         uses: actions/setup-go@v4
         with:
-          go-version: ~1.25
+          go-version: ~1.25.7
           cache: false
 
       - name: Free up disk space
diff --git a/.github/workflows/upload-dependencies.yml b/.github/workflows/upload-dependencies.yml
index eb4dafb552e..14523c62cf6 100644
--- a/.github/workflows/upload-dependencies.yml
+++ b/.github/workflows/upload-dependencies.yml
@@ -45,7 +45,7 @@ jobs:
       - name: Set up Go
         uses: actions/setup-go@v3
         with:
-          go-version: ~1.25
+          go-version: ~1.25.7
 
       - name: Upload Dependencies and Test Repo
         env:

From ca1df31dc036086dbbbc9669c1d9da35e61c1250 Mon Sep 17 00:00:00 2001
From: Marcus Mann <mpmann@amazon.com>
Date: Tue, 24 Feb 2026 14:24:30 -0500
Subject: [PATCH 49/50] fix: use main branch for helm-charts in EKS performance
 workflow (#2035)

---
 .github/workflows/eks-performance-cluster-addon-install.yml | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/.github/workflows/eks-performance-cluster-addon-install.yml b/.github/workflows/eks-performance-cluster-addon-install.yml
index 37580e1680b..539b4fca860 100644
--- a/.github/workflows/eks-performance-cluster-addon-install.yml
+++ b/.github/workflows/eks-performance-cluster-addon-install.yml
@@ -177,10 +177,9 @@ jobs:
         run: |
           aws eks update-kubeconfig --name $CLUSTER_NAME --region $AWS_REGION
 
-      # TODO: Revert to using main helm branch when changes from leader-election are merged in
       - name: Clone Helm Charts Repository
         env:
-          HELM_CHARTS_BRANCH: ${{ inputs.helm-charts-branch || 'sky333999/leader-election' }}
+          HELM_CHARTS_BRANCH: ${{ inputs.helm-charts-branch || 'main' }}
         run: |
           rm -rf ./helm-charts
           git clone -b "$HELM_CHARTS_BRANCH" https://github.com/aws-observability/helm-charts.git ./helm-charts

From 9aabc042b823a6f6cad9aa32db2683482710f05d Mon Sep 17 00:00:00 2001
From: Marcus Mann <mpmann@amazon.com>
Date: Thu, 26 Feb 2026 14:17:42 -0500
Subject: [PATCH 50/50] fix(pusher): Address PR review feedback from Jeffrey
 Chien

CRITICAL fixes:
- Handle retryHeap.Push() error in sender.Send() when heap is stopped
  during shutdown. Now calls batch.done() to persist state and resume
  circuit breaker instead of silently dropping the batch.
- Fix Close() ordering: pushers stop before heap to allow in-flight
  sends to push failed batches. Remove duplicate Stop() calls.

HIGH priority fixes:
- Remove dead maxRetryDuration field from RetryHeapProcessor (batch
  expiry is handled by batch.expireAfter set in initializeStartTime)
- Remove duplicate maxRetryTimeout constant from cloudwatchlogs.go
  (canonical definition is in batch.go)
- Add clarifying comment about circuit breaker in synchronous mode

MEDIUM priority fixes:
- Add stopMu mutex to RetryHeapProcessor.Stop() for thread safety
- Rename TestPoisonPillScenario to TestRetryHeapProcessorDoesNotStarveAllowedTarget
  (test doesn't exercise full pipeline)
- Delete TestRecoveryAfterSystemRestart (doesn't test actual restart)
- Delete TestRecoveryWithMultipleTargets (duplicates TestSingleDeniedLogGroup)

LOW priority fixes:
- Fix TestQueueHaltResume to avoid race condition
- Replace stringPtr/int64Ptr helpers with aws.String()/aws.Int64()
---
 .../outputs/cloudwatchlogs/cloudwatchlogs.go  |  32 ++--
 .../internal/pusher/poison_pill_test.go       |  20 +-
 .../cloudwatchlogs/internal/pusher/queue.go   |   5 +-
 .../internal/pusher/queue_test.go             |   9 +-
 .../internal/pusher/retryheap.go              |  54 ++++--
 .../internal/pusher/retryheap_expiry_test.go  |   3 +-
 .../pusher/retryheap_recovery_test.go         | 177 +-----------------
 .../internal/pusher/retryheap_test.go         |  12 +-
 .../cloudwatchlogs/internal/pusher/sender.go  |   9 +-
 .../internal/pusher/state_callback_test.go    |   6 +-
 10 files changed, 86 insertions(+), 241 deletions(-)

diff --git a/plugins/outputs/cloudwatchlogs/cloudwatchlogs.go b/plugins/outputs/cloudwatchlogs/cloudwatchlogs.go
index 6283b90f3bd..854c5b515f8 100644
--- a/plugins/outputs/cloudwatchlogs/cloudwatchlogs.go
+++ b/plugins/outputs/cloudwatchlogs/cloudwatchlogs.go
@@ -37,8 +37,6 @@ const (
 	LogEntryField     = "value"
 
 	defaultFlushTimeout = 5 * time.Second
-
-	maxRetryTimeout = 14*24*time.Hour + 10*time.Minute
 )
 
 var (
@@ -89,16 +87,12 @@ func (c *CloudWatchLogs) Connect() error {
 }
 
 func (c *CloudWatchLogs) Close() error {
-	// Stop components in specific order to prevent race conditions:
-	// 1. RetryHeap - stop accepting new batches first
-	// 2. Pushers - stop all active pushers (queues/senders)
-	// 3. Wait for pushers to complete
-	// 4. RetryHeapProcessor - stop retry processing and wait for WorkerPool usage to complete
-	// 5. WorkerPool - finally stop the worker threads
-
-	if c.retryHeap != nil {
-		c.retryHeap.Stop()
-	}
+	// Shutdown order:
+	// 1. Stop all pushers (queues stop accepting new events, final send)
+	// 2. Wait for pushers to complete (in-flight sends finish, failed batches pushed to heap)
+	// 3. Stop RetryHeap (no more pushes accepted after this point)
+	// 4. Stop RetryHeapProcessor (flush remaining ready batches, stop goroutine)
+	// 5. Stop WorkerPool (drain worker threads)
 
 	c.cwDests.Range(func(_, value interface{}) bool {
 		if d, ok := value.(*cwDest); ok {
@@ -109,20 +103,16 @@ func (c *CloudWatchLogs) Close() error {
 
 	c.pusherWaitGroup.Wait()
 
-	if c.retryHeapProcessor != nil {
-		c.retryHeapProcessor.Stop()
-	}
-
-	if c.workerPool != nil {
-		c.workerPool.Stop()
+	if c.retryHeap != nil {
+		c.retryHeap.Stop()
 	}
 
 	if c.retryHeapProcessor != nil {
 		c.retryHeapProcessor.Stop()
 	}
 
-	if c.retryHeap != nil {
-		c.retryHeap.Stop()
+	if c.workerPool != nil {
+		c.workerPool.Stop()
 	}
 
 	return nil
@@ -178,7 +168,7 @@ func (c *CloudWatchLogs) getDest(t pusher.Target, logSrc logs.LogSrc) *cwDest {
 
 			retryHeapProcessorRetryer := retryer.NewLogThrottleRetryer(c.Log)
 			retryHeapProcessorClient := c.createClient(retryHeapProcessorRetryer)
-			c.retryHeapProcessor = pusher.NewRetryHeapProcessor(c.retryHeap, c.workerPool, retryHeapProcessorClient, c.targetManager, c.Log, maxRetryTimeout, retryHeapProcessorRetryer)
+			c.retryHeapProcessor = pusher.NewRetryHeapProcessor(c.retryHeap, c.workerPool, retryHeapProcessorClient, c.targetManager, c.Log, retryHeapProcessorRetryer)
 			c.retryHeapProcessor.Start()
 		}
 		c.targetManager = pusher.NewTargetManager(c.Log, client)
diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/poison_pill_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/poison_pill_test.go
index 3500173e0af..5a322c7caac 100644
--- a/plugins/outputs/cloudwatchlogs/internal/pusher/poison_pill_test.go
+++ b/plugins/outputs/cloudwatchlogs/internal/pusher/poison_pill_test.go
@@ -9,6 +9,7 @@ import (
 	"testing"
 	"time"
 
+	"github.com/aws/aws-sdk-go/aws"
 	"github.com/influxdata/telegraf/testutil"
 	"github.com/stretchr/testify/assert"
 	"github.com/stretchr/testify/mock"
@@ -17,10 +18,13 @@ import (
 	"github.com/aws/amazon-cloudwatch-agent/sdk/service/cloudwatchlogs"
 )
 
-// TestPoisonPillScenario validates that when 10 denied + 1 allowed log groups
+// TestRetryHeapProcessorDoesNotStarveAllowedTarget validates that when 10 denied + 1 allowed log groups
 // share a worker pool with concurrency=2, the allowed log group continues
 // publishing without being starved by failed retries.
-func TestPoisonPillScenario(t *testing.T) {
+// Note: This test pushes batches directly to the heap and bypasses the full
+// queue → sender → retryHeap → processor pipeline. It validates RetryHeapProcessor
+// behavior, not the end-to-end circuit breaker flow.
+func TestRetryHeapProcessorDoesNotStarveAllowedTarget(t *testing.T) {
 	heap := NewRetryHeap(&testutil.Logger{})
 	defer heap.Stop()
 
@@ -32,7 +36,7 @@ func TestPoisonPillScenario(t *testing.T) {
 	mockTargetManager.On("EnsureTargetExists", mock.Anything).Return(nil)
 
 	accessDeniedErr := &cloudwatchlogs.AccessDeniedException{
-		Message_: stringPtr("User is not authorized to perform: logs:PutLogEvents with an explicit deny"),
+		Message_: aws.String("User is not authorized to perform: logs:PutLogEvents with an explicit deny"),
 	}
 
 	// Track successful PutLogEvents calls for the allowed log group
@@ -54,7 +58,7 @@ func TestPoisonPillScenario(t *testing.T) {
 		deniedGroupAttemptCount.Add(1)
 	})
 
-	processor := NewRetryHeapProcessor(heap, workerPool, mockService, mockTargetManager, &testutil.Logger{}, 100*time.Millisecond, retryer.NewLogThrottleRetryer(&testutil.Logger{}))
+	processor := NewRetryHeapProcessor(heap, workerPool, mockService, mockTargetManager, &testutil.Logger{}, retryer.NewLogThrottleRetryer(&testutil.Logger{}))
 
 	// Targets
 	allowedTarget := Target{Group: "log-stream-ple-access-granted", Stream: "i-test"}
@@ -170,7 +174,7 @@ func TestSingleDeniedLogGroup(t *testing.T) {
 	mockTargetManager.On("EnsureTargetExists", mock.Anything).Return(nil)
 
 	accessDeniedErr := &cloudwatchlogs.AccessDeniedException{
-		Message_: stringPtr("Access denied"),
+		Message_: aws.String("Access denied"),
 	}
 
 	var allowedGroupSuccessCount atomic.Int32
@@ -185,7 +189,7 @@ func TestSingleDeniedLogGroup(t *testing.T) {
 		return *input.LogGroupName == "aws-restricted-log-group-name-log-stream-ple-access-denied"
 	})).Return((*cloudwatchlogs.PutLogEventsOutput)(nil), accessDeniedErr)
 
-	processor := NewRetryHeapProcessor(heap, workerPool, mockService, mockTargetManager, &testutil.Logger{}, time.Hour, retryer.NewLogThrottleRetryer(&testutil.Logger{}))
+	processor := NewRetryHeapProcessor(heap, workerPool, mockService, mockTargetManager, &testutil.Logger{}, retryer.NewLogThrottleRetryer(&testutil.Logger{}))
 
 	// Create batches
 	allowedTarget := Target{Group: "log-stream-ple-access-granted", Stream: "i-test"}
@@ -217,8 +221,8 @@ func createBatch(target Target, eventCount int) *logEventBatch {
 	now := time.Now().Unix() * 1000
 	for i := 0; i < eventCount; i++ {
 		batch.events[i] = &cloudwatchlogs.InputLogEvent{
-			Message:   stringPtr("test message"),
-			Timestamp: int64Ptr(now + int64(i)),
+			Message:   aws.String("test message"),
+			Timestamp: aws.Int64(now + int64(i)),
 		}
 	}
 	return batch
diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/queue.go b/plugins/outputs/cloudwatchlogs/internal/pusher/queue.go
index 86f1bd6e4c2..8899554df93 100644
--- a/plugins/outputs/cloudwatchlogs/internal/pusher/queue.go
+++ b/plugins/outputs/cloudwatchlogs/internal/pusher/queue.go
@@ -184,7 +184,10 @@ func (q *queue) send() {
 		q.batch.addDoneCallback(q.onSuccessCallback(q.batch.bufferedSize))
 		q.batch.addFailCallback(q.halt)
 
-		// Wait if halted (circuit breaker)
+		// In synchronous mode (no retryHeap), halt() is never called because
+		// sender only calls batch.fail() when retryHeap != nil. So waitIfHalted
+		// is a no-op. The lock acquisition is negligible overhead (~20ns) on
+		// the uncontended path.
 		q.waitIfHalted()
 
 		q.sender.Send(q.batch)
diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/queue_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/queue_test.go
index 293218e185a..f2bd145fc0c 100644
--- a/plugins/outputs/cloudwatchlogs/internal/pusher/queue_test.go
+++ b/plugins/outputs/cloudwatchlogs/internal/pusher/queue_test.go
@@ -797,7 +797,7 @@ func TestQueueHaltResume(t *testing.T) {
 	mockSender.On("Send", mock.Anything).Run(func(args mock.Arguments) {
 		sendCount.Add(1)
 		batch := args.Get(0).(*logEventBatch)
-		// Simulate failure on first call, success on second
+		// Simulate failure on first call, success on subsequent calls
 		if sendCount.Load() == 1 {
 			batch.fail() // This should halt the queue
 		} else {
@@ -821,9 +821,6 @@ func TestQueueHaltResume(t *testing.T) {
 	assert.True(t, queueImpl.halted, "Queue should be halted after failure")
 	queueImpl.haltMu.Unlock()
 
-	// Add second event - should be queued but not sent due to halt
-	q.AddEvent(newStubLogEvent("second message", time.Now()))
-
 	// Verify only one send happened (queue is halted)
 	assert.Equal(t, int32(1), sendCount.Load(), "Should have only one send due to halt")
 
@@ -835,8 +832,8 @@ func TestQueueHaltResume(t *testing.T) {
 	assert.False(t, queueImpl.halted, "Queue should be resumed after success")
 	queueImpl.haltMu.Unlock()
 
-	// Add third event - should trigger send since queue is resumed
-	q.AddEvent(newStubLogEvent("third message", time.Now()))
+	// Add second event - should trigger send since queue is resumed
+	q.AddEvent(newStubLogEvent("second message", time.Now()))
 
 	// Wait for the second send to complete
 	time.Sleep(50 * time.Millisecond)
diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap.go b/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap.go
index 258c2795e23..a4c708ad6fe 100644
--- a/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap.go
+++ b/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap.go
@@ -120,31 +120,30 @@ func (rh *retryHeap) Stop() {
 
 // RetryHeapProcessor manages the retry heap and moves ready batches back to sender queue
 type RetryHeapProcessor struct {
-	retryHeap        RetryHeap
-	senderPool       Sender
-	retryer          *retryer.LogThrottleRetryer
-	stopCh           chan struct{}
-	logger           telegraf.Logger
-	stopped          bool
-	maxRetryDuration time.Duration
-	wg               sync.WaitGroup
+	retryHeap  RetryHeap
+	senderPool Sender
+	retryer    *retryer.LogThrottleRetryer
+	stopCh     chan struct{}
+	logger     telegraf.Logger
+	stopped    bool
+	stopMu     sync.Mutex
+	wg         sync.WaitGroup
 }
 
 // NewRetryHeapProcessor creates a new retry heap processor
-func NewRetryHeapProcessor(retryHeap RetryHeap, workerPool WorkerPool, service cloudWatchLogsService, targetManager TargetManager, logger telegraf.Logger, maxRetryDuration time.Duration, retryer *retryer.LogThrottleRetryer) *RetryHeapProcessor {
+func NewRetryHeapProcessor(retryHeap RetryHeap, workerPool WorkerPool, service cloudWatchLogsService, targetManager TargetManager, logger telegraf.Logger, retryer *retryer.LogThrottleRetryer) *RetryHeapProcessor {
 	// Create processor's own sender and senderPool
 	// Pass retryHeap so failed batches go back to RetryHeap instead of blocking on sync retry
 	sender := newSender(logger, service, targetManager, retryHeap)
 	senderPool := newSenderPool(workerPool, sender)
 
 	return &RetryHeapProcessor{
-		retryHeap:        retryHeap,
-		senderPool:       senderPool,
-		retryer:          retryer,
-		stopCh:           make(chan struct{}),
-		logger:           logger,
-		stopped:          false,
-		maxRetryDuration: maxRetryDuration,
+		retryHeap:  retryHeap,
+		senderPool: senderPool,
+		retryer:    retryer,
+		stopCh:     make(chan struct{}),
+		logger:     logger,
+		stopped:    false,
 	}
 }
 
@@ -156,18 +155,24 @@ func (p *RetryHeapProcessor) Start() {
 
 // Stop stops the retry heap processor
 func (p *RetryHeapProcessor) Stop() {
+	p.stopMu.Lock()
+	defer p.stopMu.Unlock()
+
 	if p.stopped {
 		return
 	}
 
-	// Process any remaining batches before stopping
-	p.processReadyMessages()
+	// Flush remaining ready batches before marking as stopped
+	p.flushReadyBatches()
+
+	p.stopped = true
 
-	p.retryer.Stop()
+	if p.retryer != nil {
+		p.retryer.Stop()
+	}
 	p.senderPool.Stop()
 	close(p.stopCh)
 	p.wg.Wait()
-	p.stopped = true
 }
 
 // processLoop runs the main processing loop
@@ -188,10 +193,19 @@ func (p *RetryHeapProcessor) processLoop() {
 
 // processReadyMessages checks the heap for ready batches and moves them back to sender queue
 func (p *RetryHeapProcessor) processReadyMessages() {
+	p.stopMu.Lock()
 	if p.stopped {
+		p.stopMu.Unlock()
 		return
 	}
+	p.stopMu.Unlock()
+
+	p.flushReadyBatches()
+}
 
+// flushReadyBatches pops ready batches from the heap and sends them.
+// Called by both processReadyMessages and Stop.
+func (p *RetryHeapProcessor) flushReadyBatches() {
 	readyBatches := p.retryHeap.PopReady()
 
 	for _, batch := range readyBatches {
diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_expiry_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_expiry_test.go
index cdbbd56838f..64bfd588e98 100644
--- a/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_expiry_test.go
+++ b/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_expiry_test.go
@@ -43,9 +43,8 @@ func TestRetryHeapProcessorExpiredBatchShouldResume(t *testing.T) {
 	retryHeap := NewRetryHeap(logger)
 	workerPool := NewWorkerPool(5)
 	tm := NewTargetManager(logger, mockService)
-	maxRetryDuration := 50 * time.Millisecond // Normally 14 days
 
-	retryHeapProcessor := NewRetryHeapProcessor(retryHeap, workerPool, mockService, tm, logger, maxRetryDuration, nil)
+	retryHeapProcessor := NewRetryHeapProcessor(retryHeap, workerPool, mockService, tm, logger, nil)
 	retryHeapProcessor.Start()
 
 	defer retryHeap.Stop()
diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_recovery_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_recovery_test.go
index 747272d1bed..7dfe1020c0d 100644
--- a/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_recovery_test.go
+++ b/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_recovery_test.go
@@ -4,11 +4,11 @@
 package pusher
 
 import (
-	"errors"
 	"sync"
 	"testing"
 	"time"
 
+	"github.com/aws/aws-sdk-go/aws"
 	"github.com/influxdata/telegraf/testutil"
 	"github.com/stretchr/testify/assert"
 	"github.com/stretchr/testify/mock"
@@ -30,7 +30,7 @@ func TestRecoveryWhenPermissionGrantedDuringRetry(t *testing.T) {
 	// Mock service that initially returns AccessDenied, then succeeds
 	mockService := &mockLogsService{}
 	accessDeniedErr := &cloudwatchlogs.AccessDeniedException{
-		Message_: stringPtr("Access denied"),
+		Message_: aws.String("Access denied"),
 	}
 
 	// First call fails with AccessDenied
@@ -41,13 +41,13 @@ func TestRecoveryWhenPermissionGrantedDuringRetry(t *testing.T) {
 	mockTargetManager := &mockTargetManager{}
 	mockTargetManager.On("EnsureTargetExists", mock.Anything).Return(nil)
 
-	processor := NewRetryHeapProcessor(heap, workerPool, mockService, mockTargetManager, &testutil.Logger{}, time.Hour, retryer.NewLogThrottleRetryer(&testutil.Logger{}))
+	processor := NewRetryHeapProcessor(heap, workerPool, mockService, mockTargetManager, &testutil.Logger{}, retryer.NewLogThrottleRetryer(&testutil.Logger{}))
 
 	// Create batch and track circuit breaker state
 	target := Target{Group: "group", Stream: "stream"}
 	batch := newLogEventBatch(target, nil)
 	batch.events = []*cloudwatchlogs.InputLogEvent{
-		{Message: stringPtr("test message"), Timestamp: int64Ptr(time.Now().Unix() * 1000)},
+		{Message: aws.String("test message"), Timestamp: aws.Int64(time.Now().Unix() * 1000)},
 	}
 
 	var haltCalled, resumeCalled bool
@@ -108,172 +108,3 @@ func TestRecoveryWhenPermissionGrantedDuringRetry(t *testing.T) {
 	// Verify both PutLogEvents calls were made
 	mockService.AssertExpectations(t)
 }
-
-// TestRecoveryAfterSystemRestart validates that when the system restarts with
-// retry ongoing, it resumes correctly by loading state and continuing retries.
-func TestRecoveryAfterSystemRestart(t *testing.T) {
-	heap := NewRetryHeap(&testutil.Logger{})
-	defer heap.Stop()
-
-	workerPool := NewWorkerPool(2)
-	defer workerPool.Stop()
-
-	mockService := &mockLogsService{}
-	mockTargetManager := &mockTargetManager{}
-	mockTargetManager.On("EnsureTargetExists", mock.Anything).Return(nil)
-
-	// Simulate system restart scenario:
-	// 1. Initial failure puts batch in retry state
-	// 2. System "restarts" (new processor instance)
-	// 3. Batch is reloaded with retry metadata intact
-	// 4. Retry succeeds
-
-	target := Target{Group: "group", Stream: "stream"}
-	batch := newLogEventBatch(target, nil)
-	batch.events = []*cloudwatchlogs.InputLogEvent{
-		{Message: stringPtr("test message"), Timestamp: int64Ptr(time.Now().Unix() * 1000)},
-	}
-
-	// Simulate batch that was in retry state before restart
-	batch.retryCountShort = 2
-	batch.startTime = time.Now().Add(-5 * time.Minute)
-	batch.nextRetryTime = time.Now().Add(-1 * time.Second) // Ready for retry
-	batch.lastError = errors.New("previous error before restart")
-
-	var resumeCalled bool
-	var mu sync.Mutex
-
-	batch.addDoneCallback(func() {
-		mu.Lock()
-		resumeCalled = true
-		mu.Unlock()
-	})
-
-	// Mock successful retry after restart
-	mockService.On("PutLogEvents", mock.Anything).Return(&cloudwatchlogs.PutLogEventsOutput{}, nil).Once()
-
-	// Create new processor (simulating restart)
-	processor := NewRetryHeapProcessor(heap, workerPool, mockService, mockTargetManager, &testutil.Logger{}, time.Hour, retryer.NewLogThrottleRetryer(&testutil.Logger{}))
-
-	// Push batch with existing retry metadata
-	err := heap.Push(batch)
-	assert.NoError(t, err)
-
-	// Process should succeed
-	processor.processReadyMessages()
-
-	// Wait for async processing to complete
-	time.Sleep(100 * time.Millisecond)
-
-	// Verify circuit breaker resumed
-	mu.Lock()
-	assert.True(t, resumeCalled, "Circuit breaker should resume after successful retry post-restart")
-	mu.Unlock()
-
-	// Heap should be empty
-	assert.Equal(t, 0, heap.Size(), "Heap should be empty after successful retry")
-
-	// Verify retry metadata was preserved
-	assert.Equal(t, 2, batch.retryCountShort, "Retry count should be preserved across restart")
-	assert.False(t, batch.startTime.IsZero(), "Start time should be preserved across restart")
-
-	mockService.AssertExpectations(t)
-}
-
-// TestRecoveryWithMultipleTargets validates that when one target has permission
-// issues, other healthy targets continue publishing successfully.
-func TestRecoveryWithMultipleTargets(t *testing.T) {
-	heap := NewRetryHeap(&testutil.Logger{})
-	defer heap.Stop()
-
-	workerPool := NewWorkerPool(2)
-	defer workerPool.Stop()
-
-	mockService := &mockLogsService{}
-	mockTargetManager := &mockTargetManager{}
-	mockTargetManager.On("EnsureTargetExists", mock.Anything).Return(nil)
-
-	processor := NewRetryHeapProcessor(heap, workerPool, mockService, mockTargetManager, &testutil.Logger{}, time.Hour, retryer.NewLogThrottleRetryer(&testutil.Logger{}))
-
-	// Create two targets
-	target1 := Target{Group: "group1", Stream: "stream1"}
-	target2 := Target{Group: "group2", Stream: "stream2"}
-
-	batch1 := newLogEventBatch(target1, nil)
-	batch1.events = []*cloudwatchlogs.InputLogEvent{
-		{Message: stringPtr("message1"), Timestamp: int64Ptr(time.Now().Unix() * 1000)},
-	}
-	batch1.nextRetryTime = time.Now().Add(-1 * time.Second)
-
-	batch2 := newLogEventBatch(target2, nil)
-	batch2.events = []*cloudwatchlogs.InputLogEvent{
-		{Message: stringPtr("message2"), Timestamp: int64Ptr(time.Now().Unix() * 1000)},
-	}
-	batch2.nextRetryTime = time.Now().Add(-1 * time.Second)
-
-	var halt1Called, resume1Called, resume2Called bool
-	var mu sync.Mutex
-
-	// Target 1 fails with AccessDenied
-	batch1.addFailCallback(func() {
-		mu.Lock()
-		halt1Called = true
-		mu.Unlock()
-	})
-	batch1.addDoneCallback(func() {
-		mu.Lock()
-		resume1Called = true
-		mu.Unlock()
-	})
-
-	// Target 2 succeeds
-	batch2.addDoneCallback(func() {
-		mu.Lock()
-		resume2Called = true
-		mu.Unlock()
-	})
-
-	// Mock responses: target1 fails, target2 succeeds
-	accessDeniedErr := &cloudwatchlogs.AccessDeniedException{
-		Message_: stringPtr("Access denied"),
-	}
-	mockService.On("PutLogEvents", mock.MatchedBy(func(req *cloudwatchlogs.PutLogEventsInput) bool {
-		return *req.LogGroupName == "group1"
-	})).Return((*cloudwatchlogs.PutLogEventsOutput)(nil), accessDeniedErr).Once()
-
-	mockService.On("PutLogEvents", mock.MatchedBy(func(req *cloudwatchlogs.PutLogEventsInput) bool {
-		return *req.LogGroupName == "group2"
-	})).Return(&cloudwatchlogs.PutLogEventsOutput{}, nil).Once()
-
-	// Push both batches
-	err := heap.Push(batch1)
-	assert.NoError(t, err)
-	err = heap.Push(batch2)
-	assert.NoError(t, err)
-
-	// Process both batches
-	processor.processReadyMessages()
-
-	// Wait for async processing to complete
-	time.Sleep(100 * time.Millisecond)
-
-	// Verify target1 circuit breaker halted, target2 succeeded
-	mu.Lock()
-	assert.True(t, halt1Called, "Target1 circuit breaker should halt")
-	assert.False(t, resume1Called, "Target1 circuit breaker should not resume")
-	assert.True(t, resume2Called, "Target2 should succeed and resume")
-	mu.Unlock()
-
-	// Target1 should be back in heap, target2 should be done
-	assert.Equal(t, 1, heap.Size(), "Only failed target should remain in heap")
-
-	mockService.AssertExpectations(t)
-}
-
-func stringPtr(s string) *string {
-	return &s
-}
-
-func int64Ptr(i int64) *int64 {
-	return &i
-}
diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_test.go
index d86888856bf..d79e388e071 100644
--- a/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_test.go
+++ b/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_test.go
@@ -90,7 +90,7 @@ func TestRetryHeapProcessor(t *testing.T) {
 	mockService := &mockLogsService{}
 	mockTargetManager := &mockTargetManager{}
 
-	processor := NewRetryHeapProcessor(heap, workerPool, mockService, mockTargetManager, &testutil.Logger{}, time.Hour, retryer.NewLogThrottleRetryer(&testutil.Logger{}))
+	processor := NewRetryHeapProcessor(heap, workerPool, mockService, mockTargetManager, &testutil.Logger{}, retryer.NewLogThrottleRetryer(&testutil.Logger{}))
 	defer processor.Stop()
 
 	// Test start/stop
@@ -108,7 +108,7 @@ func TestRetryHeapProcessorExpiredBatch(t *testing.T) {
 	mockService := &mockLogsService{}
 	mockTargetManager := &mockTargetManager{}
 
-	processor := NewRetryHeapProcessor(heap, workerPool, mockService, mockTargetManager, &testutil.Logger{}, 1*time.Millisecond, retryer.NewLogThrottleRetryer(&testutil.Logger{}))
+	processor := NewRetryHeapProcessor(heap, workerPool, mockService, mockTargetManager, &testutil.Logger{}, retryer.NewLogThrottleRetryer(&testutil.Logger{}))
 
 	target := Target{Group: "group", Stream: "stream"}
 	batch := newLogEventBatch(target, nil)
@@ -138,7 +138,7 @@ func TestRetryHeapProcessorSendsBatch(t *testing.T) {
 	mockTargetManager := &mockTargetManager{}
 	mockTargetManager.On("EnsureTargetExists", mock.Anything).Return(nil)
 
-	processor := NewRetryHeapProcessor(heap, workerPool, mockService, mockTargetManager, &testutil.Logger{}, time.Hour, retryer.NewLogThrottleRetryer(&testutil.Logger{}))
+	processor := NewRetryHeapProcessor(heap, workerPool, mockService, mockTargetManager, &testutil.Logger{}, retryer.NewLogThrottleRetryer(&testutil.Logger{}))
 
 	target := Target{Group: "group", Stream: "stream"}
 	batch := newLogEventBatch(target, nil)
@@ -210,7 +210,7 @@ func TestRetryHeapProcessorNoReadyBatches(t *testing.T) {
 	mockService := &mockLogsService{}
 	mockTargetManager := &mockTargetManager{}
 
-	processor := NewRetryHeapProcessor(heap, workerPool, mockService, mockTargetManager, &testutil.Logger{}, time.Hour, retryer.NewLogThrottleRetryer(&testutil.Logger{}))
+	processor := NewRetryHeapProcessor(heap, workerPool, mockService, mockTargetManager, &testutil.Logger{}, retryer.NewLogThrottleRetryer(&testutil.Logger{}))
 
 	// Process with empty heap - should not panic
 	processor.processReadyMessages()
@@ -232,7 +232,7 @@ func TestRetryHeapProcessorFailedBatchGoesBackToHeap(t *testing.T) {
 	mockTargetManager := &mockTargetManager{}
 	mockTargetManager.On("InitTarget", mock.Anything).Return(nil)
 
-	processor := NewRetryHeapProcessor(heap, workerPool, mockService, mockTargetManager, &testutil.Logger{}, time.Hour, retryer.NewLogThrottleRetryer(&testutil.Logger{}))
+	processor := NewRetryHeapProcessor(heap, workerPool, mockService, mockTargetManager, &testutil.Logger{}, retryer.NewLogThrottleRetryer(&testutil.Logger{}))
 
 	processor.Start()
 	defer processor.Stop()
@@ -284,7 +284,7 @@ func TestRetryHeapProcessorStoppedProcessReadyMessages(t *testing.T) {
 	mockService := &mockLogsService{}
 	mockTargetManager := &mockTargetManager{}
 
-	processor := NewRetryHeapProcessor(heap, workerPool, mockService, mockTargetManager, &testutil.Logger{}, time.Hour, retryer.NewLogThrottleRetryer(&testutil.Logger{}))
+	processor := NewRetryHeapProcessor(heap, workerPool, mockService, mockTargetManager, &testutil.Logger{}, retryer.NewLogThrottleRetryer(&testutil.Logger{}))
 
 	// Add a ready batch to the heap
 	target := Target{Group: "group", Stream: "stream"}
diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/sender.go b/plugins/outputs/cloudwatchlogs/internal/pusher/sender.go
index 6a34be1e43e..902bb166f7c 100644
--- a/plugins/outputs/cloudwatchlogs/internal/pusher/sender.go
+++ b/plugins/outputs/cloudwatchlogs/internal/pusher/sender.go
@@ -121,7 +121,14 @@ func (s *sender) Send(batch *logEventBatch) {
 		// If RetryHeap available, push to RetryHeap and return
 		// Otherwise, continue with existing busy-wait retry behavior
 		if s.retryHeap != nil {
-			s.retryHeap.Push(batch)
+			if err := s.retryHeap.Push(batch); err != nil {
+				// Heap is stopped (shutdown in progress). Persist file offsets
+				// so these events aren't re-read on restart, then notify the
+				// circuit breaker so the queue isn't permanently halted.
+				s.logger.Warnf("RetryHeap stopped, dropping batch for %v/%v: %v", batch.Group, batch.Stream, err)
+				batch.done()
+				return
+			}
 			batch.fail()
 			return
 		}
diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/state_callback_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/state_callback_test.go
index 8190e3a40b5..5450e52e808 100644
--- a/plugins/outputs/cloudwatchlogs/internal/pusher/state_callback_test.go
+++ b/plugins/outputs/cloudwatchlogs/internal/pusher/state_callback_test.go
@@ -72,7 +72,7 @@ func TestRetryHeapSuccessCallsStateCallback(t *testing.T) {
 	defer retryHeap.Stop()
 	defer workerPool.Stop()
 
-	processor := NewRetryHeapProcessor(retryHeap, workerPool, service, tm, logger, time.Hour, retryer.NewLogThrottleRetryer(logger))
+	processor := NewRetryHeapProcessor(retryHeap, workerPool, service, tm, logger, retryer.NewLogThrottleRetryer(logger))
 
 	batch := newStatefulBatch(target, queue)
 	batch.nextRetryTime = time.Now().Add(-1 * time.Second)
@@ -119,7 +119,7 @@ func TestRetryHeapExpiryCallsStateCallback(t *testing.T) {
 	defer retryHeap.Stop()
 	defer workerPool.Stop()
 
-	processor := NewRetryHeapProcessor(retryHeap, workerPool, service, tm, logger, 50*time.Millisecond, nil)
+	processor := NewRetryHeapProcessor(retryHeap, workerPool, service, tm, logger, nil)
 
 	batch := newStatefulBatch(target, queue)
 	batch.initializeStartTime()
@@ -166,7 +166,7 @@ func TestShutdownDoesNotCallStateCallback(t *testing.T) {
 	}
 	tm := NewTargetManager(logger, service)
 
-	processor := NewRetryHeapProcessor(retryHeap, workerPool, service, tm, logger, time.Hour, nil)
+	processor := NewRetryHeapProcessor(retryHeap, workerPool, service, tm, logger, nil)
 	processor.Start()
 
 	// Push a batch with a future retry time so it won't be processed before Stop