From 6b231dc6fc52e102abb88c368932a5d47021cfc6 Mon Sep 17 00:00:00 2001 From: Akansha Agarwal Date: Tue, 30 Dec 2025 05:00:53 +0000 Subject: [PATCH 01/50] introduce retry metadata to batch struct --- .../cloudwatchlogs/internal/pusher/batch.go | 59 +++++++++++++++++++ .../cloudwatchlogs/internal/pusher/sender.go | 37 ++++++------ 2 files changed, 79 insertions(+), 17 deletions(-) diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/batch.go b/plugins/outputs/cloudwatchlogs/internal/pusher/batch.go index 71e7b14a821..e73ffc1e4ef 100644 --- a/plugins/outputs/cloudwatchlogs/internal/pusher/batch.go +++ b/plugins/outputs/cloudwatchlogs/internal/pusher/batch.go @@ -101,6 +101,13 @@ type logEventBatch struct { // Callbacks specifically for updating state stateCallbacks []func() batchers map[string]*state.RangeQueueBatcher + + // Retry metadata + retryCountShort int // Number of retries using short delay strategy + retryCountLong int // Number of retries using long delay strategy + startTime time.Time // Time of first request (for max retry duration calculation) + nextRetryTime time.Time // When this batch should be retried next + lastError error // Last error encountered } func newLogEventBatch(target Target, entityProvider logs.LogEntityProvider) *logEventBatch { @@ -226,3 +233,55 @@ func (t byTimestamp) Swap(i, j int) { func (t byTimestamp) Less(i, j int) bool { return *t[i].Timestamp < *t[j].Timestamp } + +// initializeStartTime sets the start time if not already set. +func (b *logEventBatch) initializeStartTime() { + if b.startTime.IsZero() { + b.startTime = time.Now() + } +} + +// updateRetryMetadata updates the retry metadata after a failed send attempt. +// It increments the appropriate retry counter based on the error type and calculates the next retry time. 
+func (b *logEventBatch) updateRetryMetadata(err error) { + // Store the error + b.lastError = err + + // Determine retry strategy and increment counter + var wait time.Duration + if chooseRetryWaitStrategy(err) == retryLong { + wait = retryWaitLong(b.retryCountLong) + b.retryCountLong++ + } else { + wait = retryWaitShort(b.retryCountShort) + b.retryCountShort++ + } + + // Calculate next retry time (honest timestamp, not capped) + b.nextRetryTime = time.Now().Add(wait) +} + +// isExpired checks if the batch has exceeded the maximum retry duration (14 days). +func (b *logEventBatch) isExpired(maxRetryDuration time.Duration) bool { + if b.startTime.IsZero() { + return false + } + return time.Since(b.startTime) > maxRetryDuration +} + +// isReadyForRetry checks if enough time has passed since the last failure to retry this batch. +func (b *logEventBatch) isReadyForRetry() bool { + if b.nextRetryTime.IsZero() { + return true // Never failed, ready to send + } + return time.Now().After(b.nextRetryTime) +} + +// resetRetryMetadata resets all retry-related fields after a successful send. 
+func (b *logEventBatch) resetRetryMetadata() { + b.retryCountShort = 0 + b.retryCountLong = 0 + b.startTime = time.Time{} + b.nextRetryTime = time.Time{} + b.lastError = nil +} diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/sender.go b/plugins/outputs/cloudwatchlogs/internal/pusher/sender.go index de1bdf6708f..6181f1fa3f7 100644 --- a/plugins/outputs/cloudwatchlogs/internal/pusher/sender.go +++ b/plugins/outputs/cloudwatchlogs/internal/pusher/sender.go @@ -63,11 +63,11 @@ func (s *sender) Send(batch *logEventBatch) { if len(batch.events) == 0 { return } + + // Initialize start time before build() + batch.initializeStartTime() input := batch.build() - startTime := time.Now() - retryCountShort := 0 - retryCountLong := 0 for { output, err := s.service.PutLogEvents(input) if err == nil { @@ -83,8 +83,9 @@ func (s *sender) Send(batch *logEventBatch) { s.logger.Warnf("%d log events for log '%s/%s' are expired", *info.ExpiredLogEventEndIndex, batch.Group, batch.Stream) } } + // Success - call done callbacks batch.done() - s.logger.Debugf("Pusher published %v log events to group: %v stream: %v with size %v KB in %v.", len(batch.events), batch.Group, batch.Stream, batch.bufferedSize/1024, time.Since(startTime)) + s.logger.Debugf("Pusher published %v log events to group: %v stream: %v with size %v KB in %v.", len(batch.events), batch.Group, batch.Stream, batch.bufferedSize/1024, time.Since(batch.startTime)) return } @@ -110,27 +111,29 @@ func (s *sender) Send(batch *logEventBatch) { s.logger.Errorf("Aws error received when sending logs to %v/%v: %v", batch.Group, batch.Stream, awsErr) } - // retry wait strategy depends on the type of error returned - var wait time.Duration - if chooseRetryWaitStrategy(err) == retryLong { - wait = retryWaitLong(retryCountLong) - retryCountLong++ - } else { - wait = retryWaitShort(retryCountShort) - retryCountShort++ - } + // Update retry metadata in the batch + batch.updateRetryMetadata(err) - if time.Since(startTime)+wait > 
s.RetryDuration() { - s.logger.Errorf("All %v retries to %v/%v failed for PutLogEvents, request dropped.", retryCountShort+retryCountLong-1, batch.Group, batch.Stream) + // Check if the next retry time would exceed the max retry duration + // This prevents us from sleeping and then making another doomed API call + totalRetries := batch.retryCountShort + batch.retryCountLong - 1 + if batch.isExpired(s.RetryDuration()) || batch.nextRetryTime.After(batch.startTime.Add(s.RetryDuration())) { + s.logger.Errorf("All %v retries to %v/%v failed for PutLogEvents, request dropped.", totalRetries, batch.Group, batch.Stream) batch.updateState() return } - s.logger.Warnf("Retried %v time, going to sleep %v before retrying.", retryCountShort+retryCountLong-1, wait) + // Calculate wait time until next retry + wait := batch.nextRetryTime.Sub(time.Now()) + if wait < 0 { + wait = 0 + } + + s.logger.Warnf("Retried %v time, going to sleep %v before retrying.", totalRetries, wait) select { case <-s.stopCh: - s.logger.Errorf("Stop requested after %v retries to %v/%v failed for PutLogEvents, request dropped.", retryCountShort+retryCountLong-1, batch.Group, batch.Stream) + s.logger.Errorf("Stop requested after %v retries to %v/%v failed for PutLogEvents, request dropped.", totalRetries, batch.Group, batch.Stream) batch.updateState() return case <-time.After(wait): From 8521373070f816216c2709efda72220eebc3ccd9 Mon Sep 17 00:00:00 2001 From: Akansha Agarwal Date: Tue, 30 Dec 2025 05:04:58 +0000 Subject: [PATCH 02/50] Remove unused reset method --- plugins/outputs/cloudwatchlogs/internal/pusher/batch.go | 9 --------- 1 file changed, 9 deletions(-) diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/batch.go b/plugins/outputs/cloudwatchlogs/internal/pusher/batch.go index e73ffc1e4ef..00b91fecdc5 100644 --- a/plugins/outputs/cloudwatchlogs/internal/pusher/batch.go +++ b/plugins/outputs/cloudwatchlogs/internal/pusher/batch.go @@ -276,12 +276,3 @@ func (b *logEventBatch) isReadyForRetry() 
bool { } return time.Now().After(b.nextRetryTime) } - -// resetRetryMetadata resets all retry-related fields after a successful send. -func (b *logEventBatch) resetRetryMetadata() { - b.retryCountShort = 0 - b.retryCountLong = 0 - b.startTime = time.Time{} - b.nextRetryTime = time.Time{} - b.lastError = nil -} From 7244af8cead2671b8973ede6e449dc642875414b Mon Sep 17 00:00:00 2001 From: Akansha Agarwal Date: Tue, 30 Dec 2025 05:13:18 +0000 Subject: [PATCH 03/50] add unit tests for retryMetadata --- .../internal/pusher/batch_test.go | 25 +++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/batch_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/batch_test.go index 04e523464e7..28fc741c5bc 100644 --- a/plugins/outputs/cloudwatchlogs/internal/pusher/batch_test.go +++ b/plugins/outputs/cloudwatchlogs/internal/pusher/batch_test.go @@ -404,3 +404,28 @@ func TestValidateAndTruncateMessage(t *testing.T) { }) } } +func TestBatchRetryMetadata(t *testing.T) { + target := Target{Group: "test-group", Stream: "test-stream"} + batch := newLogEventBatch(target, nil) + + // Test initial state + assert.True(t, batch.startTime.IsZero()) + assert.True(t, batch.isReadyForRetry()) + assert.False(t, batch.isExpired(time.Hour)) + + // Test initializeStartTime + batch.initializeStartTime() + assert.False(t, batch.startTime.IsZero()) + + // Test updateRetryMetadata + err := assert.AnError + batch.updateRetryMetadata(err) + assert.Equal(t, 1, batch.retryCountShort) + assert.Equal(t, 0, batch.retryCountLong) + assert.Equal(t, err, batch.lastError) + assert.False(t, batch.nextRetryTime.IsZero()) + + // Test isExpired + batch.startTime = time.Now().Add(-25 * time.Hour) + assert.True(t, batch.isExpired(24*time.Hour)) +} From d2f21e1293e0ec1ce539c719e828e465c1f12fe6 Mon Sep 17 00:00:00 2001 From: Akansha Agarwal Date: Tue, 30 Dec 2025 05:25:54 +0000 Subject: [PATCH 04/50] fix lint --- .../cloudwatchlogs/internal/pusher/batch.go | 
16 ++++++++-------- .../cloudwatchlogs/internal/pusher/batch_test.go | 2 +- .../cloudwatchlogs/internal/pusher/sender.go | 8 +++----- 3 files changed, 12 insertions(+), 14 deletions(-) diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/batch.go b/plugins/outputs/cloudwatchlogs/internal/pusher/batch.go index 00b91fecdc5..a0013e785e7 100644 --- a/plugins/outputs/cloudwatchlogs/internal/pusher/batch.go +++ b/plugins/outputs/cloudwatchlogs/internal/pusher/batch.go @@ -103,11 +103,11 @@ type logEventBatch struct { batchers map[string]*state.RangeQueueBatcher // Retry metadata - retryCountShort int // Number of retries using short delay strategy - retryCountLong int // Number of retries using long delay strategy - startTime time.Time // Time of first request (for max retry duration calculation) - nextRetryTime time.Time // When this batch should be retried next - lastError error // Last error encountered + retryCountShort int // Number of retries using short delay strategy + retryCountLong int // Number of retries using long delay strategy + startTime time.Time // Time of first request (for max retry duration calculation) + nextRetryTime time.Time // When this batch should be retried next + lastError error // Last error encountered } func newLogEventBatch(target Target, entityProvider logs.LogEntityProvider) *logEventBatch { @@ -257,11 +257,11 @@ func (b *logEventBatch) updateRetryMetadata(err error) { b.retryCountShort++ } - // Calculate next retry time (honest timestamp, not capped) + // Calculate next retry time b.nextRetryTime = time.Now().Add(wait) } -// isExpired checks if the batch has exceeded the maximum retry duration (14 days). +// isExpired checks if the batch has exceeded the maximum retry duration. 
func (b *logEventBatch) isExpired(maxRetryDuration time.Duration) bool { if b.startTime.IsZero() { return false @@ -272,7 +272,7 @@ func (b *logEventBatch) isExpired(maxRetryDuration time.Duration) bool { // isReadyForRetry checks if enough time has passed since the last failure to retry this batch. func (b *logEventBatch) isReadyForRetry() bool { if b.nextRetryTime.IsZero() { - return true // Never failed, ready to send + return true } return time.Now().After(b.nextRetryTime) } diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/batch_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/batch_test.go index 28fc741c5bc..a5f13b127c7 100644 --- a/plugins/outputs/cloudwatchlogs/internal/pusher/batch_test.go +++ b/plugins/outputs/cloudwatchlogs/internal/pusher/batch_test.go @@ -416,7 +416,7 @@ func TestBatchRetryMetadata(t *testing.T) { // Test initializeStartTime batch.initializeStartTime() assert.False(t, batch.startTime.IsZero()) - + // Test updateRetryMetadata err := assert.AnError batch.updateRetryMetadata(err) diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/sender.go b/plugins/outputs/cloudwatchlogs/internal/pusher/sender.go index 6181f1fa3f7..3df074a0fbe 100644 --- a/plugins/outputs/cloudwatchlogs/internal/pusher/sender.go +++ b/plugins/outputs/cloudwatchlogs/internal/pusher/sender.go @@ -63,7 +63,7 @@ func (s *sender) Send(batch *logEventBatch) { if len(batch.events) == 0 { return } - + // Initialize start time before build() batch.initializeStartTime() input := batch.build() @@ -83,7 +83,6 @@ func (s *sender) Send(batch *logEventBatch) { s.logger.Warnf("%d log events for log '%s/%s' are expired", *info.ExpiredLogEventEndIndex, batch.Group, batch.Stream) } } - // Success - call done callbacks batch.done() s.logger.Debugf("Pusher published %v log events to group: %v stream: %v with size %v KB in %v.", len(batch.events), batch.Group, batch.Stream, batch.bufferedSize/1024, time.Since(batch.startTime)) return @@ -114,8 +113,7 @@ func (s 
*sender) Send(batch *logEventBatch) { // Update retry metadata in the batch batch.updateRetryMetadata(err) - // Check if the next retry time would exceed the max retry duration - // This prevents us from sleeping and then making another doomed API call + // Check if retry would exceed max duration totalRetries := batch.retryCountShort + batch.retryCountLong - 1 if batch.isExpired(s.RetryDuration()) || batch.nextRetryTime.After(batch.startTime.Add(s.RetryDuration())) { s.logger.Errorf("All %v retries to %v/%v failed for PutLogEvents, request dropped.", totalRetries, batch.Group, batch.Stream) @@ -124,7 +122,7 @@ func (s *sender) Send(batch *logEventBatch) { } // Calculate wait time until next retry - wait := batch.nextRetryTime.Sub(time.Now()) + wait := time.Until(batch.nextRetryTime) if wait < 0 { wait = 0 } From 66186a78b1268b8bc663468064f6e0480fec8211 Mon Sep 17 00:00:00 2001 From: Akansha Agarwal Date: Tue, 30 Dec 2025 16:35:24 +0000 Subject: [PATCH 05/50] Introduce retryHeap and retryHeapProcessor --- .../internal/pusher/retryheap.go | 191 ++++++++++++++++++ .../internal/pusher/retryheap_test.go | 151 ++++++++++++++ 2 files changed, 342 insertions(+) create mode 100644 plugins/outputs/cloudwatchlogs/internal/pusher/retryheap.go create mode 100644 plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_test.go diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap.go b/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap.go new file mode 100644 index 00000000000..5973f57b381 --- /dev/null +++ b/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap.go @@ -0,0 +1,191 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: MIT + +package pusher + +import ( + "container/heap" + "errors" + "sync" + "time" + + "github.com/influxdata/telegraf" +) + +// retryHeapImpl implements heap.Interface for logEventBatch sorted by nextRetryTime +type retryHeapImpl []*logEventBatch + +func (h retryHeapImpl) Len() int { return len(h) } + +func (h retryHeapImpl) Less(i, j int) bool { + return h[i].nextRetryTime.Before(h[j].nextRetryTime) +} + +func (h retryHeapImpl) Swap(i, j int) { h[i], h[j] = h[j], h[i] } + +func (h *retryHeapImpl) Push(x interface{}) { + *h = append(*h, x.(*logEventBatch)) +} + +func (h *retryHeapImpl) Pop() interface{} { + old := *h + n := len(old) + item := old[n-1] + *h = old[0 : n-1] + return item +} + +// RetryHeap manages failed batches during their retry wait periods +type RetryHeap interface { + Push(batch *logEventBatch) error + PopReady() []*logEventBatch + Size() int + Stop() +} + +type retryHeap struct { + heap retryHeapImpl + mutex sync.RWMutex + pushCh chan *logEventBatch + stopCh chan struct{} + maxSize int +} + +// NewRetryHeap creates a new retry heap with the specified maximum size +func NewRetryHeap(maxSize int) RetryHeap { + rh := &retryHeap{ + heap: make(retryHeapImpl, 0), + maxSize: maxSize, + pushCh: make(chan *logEventBatch, maxSize), + stopCh: make(chan struct{}), + } + heap.Init(&rh.heap) + go rh.pushToHeapWorker() + return rh +} + +// pushToHeapWorker moves batches from the blocking channel to the time-ordered heap +// This bridges channel-based blocking (like sender queue) with heap-based time ordering +func (rh *retryHeap) pushToHeapWorker() { + for { + select { + case batch := <-rh.pushCh: + rh.mutex.Lock() + heap.Push(&rh.heap, batch) + rh.mutex.Unlock() + case <-rh.stopCh: + return + } + } +} + +// Push adds a batch to the heap, blocking if full (same as sender queue) +func (rh *retryHeap) Push(batch *logEventBatch) error { + select { + case rh.pushCh <- batch: + return nil + case <-rh.stopCh: + return errors.New("retry 
heap stopped") + } +} + +// PopReady returns all batches that are ready for retry (nextRetryTime <= now) +func (rh *retryHeap) PopReady() []*logEventBatch { + rh.mutex.Lock() + defer rh.mutex.Unlock() + + now := time.Now() + var ready []*logEventBatch + + // Pop all batches that are ready for retry + for len(rh.heap) > 0 && !rh.heap[0].nextRetryTime.After(now) { + batch := heap.Pop(&rh.heap).(*logEventBatch) + ready = append(ready, batch) + } + + return ready +} + +// Size returns the current number of batches in the heap and pending channel +func (rh *retryHeap) Size() int { + rh.mutex.RLock() + defer rh.mutex.RUnlock() + return len(rh.heap) + len(rh.pushCh) +} + +// Stop stops the retry heap +func (rh *retryHeap) Stop() { + close(rh.stopCh) +} + +// RetryHeapProcessor manages the retry heap and moves ready batches back to sender queue +type RetryHeapProcessor struct { + retryHeap RetryHeap + senderPool Sender + ticker *time.Ticker + stopCh chan struct{} + logger telegraf.Logger + stopped bool + maxRetryDuration time.Duration +} + +// NewRetryHeapProcessor creates a new retry heap processor +func NewRetryHeapProcessor(retryHeap RetryHeap, senderPool Sender, logger telegraf.Logger, maxRetryDuration time.Duration) *RetryHeapProcessor { + return &RetryHeapProcessor{ + retryHeap: retryHeap, + senderPool: senderPool, + stopCh: make(chan struct{}), + logger: logger, + stopped: false, + maxRetryDuration: maxRetryDuration, + } +} + +// Start begins processing the retry heap every 100ms +func (p *RetryHeapProcessor) Start() { + p.ticker = time.NewTicker(100 * time.Millisecond) + go p.processLoop() +} + +// Stop stops the retry heap processor +func (p *RetryHeapProcessor) Stop() { + if p.stopped { + return + } + if p.ticker != nil { + p.ticker.Stop() + } + close(p.stopCh) + p.stopped = true +} + +// processLoop runs the main processing loop +func (p *RetryHeapProcessor) processLoop() { + for { + select { + case <-p.ticker.C: + p.processReadyMessages() + case <-p.stopCh: + 
return + } + } +} + +// processReadyMessages checks the heap for ready batches and moves them back to sender queue +func (p *RetryHeapProcessor) processReadyMessages() { + readyBatches := p.retryHeap.PopReady() + + for _, batch := range readyBatches { + // Check if batch has expired + if batch.isExpired(p.maxRetryDuration) { + p.logger.Debugf("Dropping expired batch for %s/%s", batch.Group, batch.Stream) + batch.updateState() + continue + } + + // Submit the batch back to the sender pool (blocks if full) + p.senderPool.Send(batch) + p.logger.Debugf("Moved batch from retry heap back to sender pool for %s/%s", + batch.Group, batch.Stream) + } +} diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_test.go new file mode 100644 index 00000000000..05e77e6651e --- /dev/null +++ b/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_test.go @@ -0,0 +1,151 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: MIT + +package pusher + +import ( + "testing" + "time" + + "github.com/influxdata/telegraf/testutil" + "github.com/stretchr/testify/assert" +) + +func TestRetryHeap(t *testing.T) { + heap := NewRetryHeap(10) + defer heap.Stop() + + // Test empty heap + assert.Equal(t, 0, heap.Size()) + ready := heap.PopReady() + assert.Empty(t, ready) + + // Create test batches + target := Target{Group: "group", Stream: "stream"} + batch1 := newLogEventBatch(target, nil) + batch1.nextRetryTime = time.Now().Add(1 * time.Second) + + batch2 := newLogEventBatch(target, nil) + batch2.nextRetryTime = time.Now().Add(-1 * time.Second) // Ready now + + // Push batches + err := heap.Push(batch1) + assert.NoError(t, err) + err = heap.Push(batch2) + assert.NoError(t, err) + + // Wait for pushToHeapWorker to process + time.Sleep(10 * time.Millisecond) + assert.Equal(t, 2, heap.Size()) + + // Pop ready batches + ready = heap.PopReady() + assert.Len(t, ready, 1) + assert.Equal(t, batch2, ready[0]) + assert.Equal(t, 1, heap.Size()) +} + +func TestRetryHeapOrdering(t *testing.T) { + heap := NewRetryHeap(10) + defer heap.Stop() + + target := Target{Group: "group", Stream: "stream"} + now := time.Now() + + // Create batches with different retry times (not in order) + batch1 := newLogEventBatch(target, nil) + batch1.nextRetryTime = now.Add(3 * time.Second) + + batch2 := newLogEventBatch(target, nil) + batch2.nextRetryTime = now.Add(1 * time.Second) + + batch3 := newLogEventBatch(target, nil) + batch3.nextRetryTime = now.Add(2 * time.Second) + + // Push in random order + heap.Push(batch1) + heap.Push(batch2) + heap.Push(batch3) + + // Wait for all to be ready + time.Sleep(4 * time.Second) + + // Pop ready batches - should come out in order + ready := heap.PopReady() + assert.Len(t, ready, 3) + assert.True(t, ready[0].nextRetryTime.Before(ready[1].nextRetryTime)) + assert.True(t, ready[1].nextRetryTime.Before(ready[2].nextRetryTime)) +} + +func TestRetryHeapProcessor(t 
*testing.T) { + heap := NewRetryHeap(10) + defer heap.Stop() + + // Create mock senderPool + mockSenderPool := &mockSenderPool{} + processor := NewRetryHeapProcessor(heap, mockSenderPool, &testutil.Logger{}, time.Hour) + defer processor.Stop() + + // Test start/stop + processor.Start() + assert.NotNil(t, processor.ticker) + + processor.Stop() + assert.True(t, processor.stopped) +} + +func TestRetryHeapProcessorExpiredBatch(t *testing.T) { + heap := NewRetryHeap(10) + defer heap.Stop() + + mockSenderPool := &mockSenderPool{} + processor := NewRetryHeapProcessor(heap, mockSenderPool, &testutil.Logger{}, 1*time.Millisecond) // Very short expiry + + // Create expired batch + target := Target{Group: "group", Stream: "stream"} + batch := newLogEventBatch(target, nil) + batch.startTime = time.Now().Add(-1 * time.Hour) // Old start time + batch.nextRetryTime = time.Now().Add(-1 * time.Second) // Ready now + + heap.Push(batch) + time.Sleep(10 * time.Millisecond) // Wait for pushToHeapWorker + + // Process should drop expired batch + processor.processReadyMessages() + assert.Equal(t, 0, heap.Size()) + assert.Equal(t, 0, mockSenderPool.sendCount) // Should not send expired batch +} + +func TestRetryHeapProcessorSendsBatch(t *testing.T) { + heap := NewRetryHeap(10) + defer heap.Stop() + + mockSenderPool := &mockSenderPool{} + processor := NewRetryHeapProcessor(heap, mockSenderPool, &testutil.Logger{}, time.Hour) + + // Create ready batch + target := Target{Group: "group", Stream: "stream"} + batch := newLogEventBatch(target, nil) + batch.nextRetryTime = time.Now().Add(-1 * time.Second) // Ready now + + heap.Push(batch) + time.Sleep(10 * time.Millisecond) // Wait for pushToHeapWorker + + // Process should send batch + processor.processReadyMessages() + assert.Equal(t, 0, heap.Size()) + assert.Equal(t, 1, mockSenderPool.sendCount) +} + +// Mock senderPool for testing +type mockSenderPool struct { + sendCount int +} + +func (m *mockSenderPool) Send(_ *logEventBatch) { + 
m.sendCount++ +} + +func (m *mockSenderPool) Stop() {} +func (m *mockSenderPool) SetRetryDuration(time.Duration) {} +func (m *mockSenderPool) RetryDuration() time.Duration { return time.Hour } From 83224b4e6fcc2c5d5f003f4a00a32d34570e3109 Mon Sep 17 00:00:00 2001 From: Akansha Agarwal Date: Tue, 30 Dec 2025 18:06:54 +0000 Subject: [PATCH 06/50] Exchange pushch for semaphore to enforce heap size and blocking --- .../internal/pusher/retryheap.go | 49 +++++++--------- .../internal/pusher/retryheap_test.go | 56 +++++++++++++++++-- 2 files changed, 72 insertions(+), 33 deletions(-) diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap.go b/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap.go index 5973f57b381..85b51304727 100644 --- a/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap.go +++ b/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap.go @@ -44,45 +44,34 @@ type RetryHeap interface { } type retryHeap struct { - heap retryHeapImpl - mutex sync.RWMutex - pushCh chan *logEventBatch - stopCh chan struct{} - maxSize int + heap retryHeapImpl + mutex sync.RWMutex + semaphore chan struct{} // Size enforcer + stopCh chan struct{} + maxSize int } // NewRetryHeap creates a new retry heap with the specified maximum size func NewRetryHeap(maxSize int) RetryHeap { rh := &retryHeap{ - heap: make(retryHeapImpl, 0), - maxSize: maxSize, - pushCh: make(chan *logEventBatch, maxSize), - stopCh: make(chan struct{}), + heap: make(retryHeapImpl, 0), + maxSize: maxSize, + semaphore: make(chan struct{}, maxSize), // Semaphore for size enforcement + stopCh: make(chan struct{}), } heap.Init(&rh.heap) - go rh.pushToHeapWorker() return rh } -// pushToHeapWorker moves batches from the blocking channel to the time-ordered heap -// This bridges channel-based blocking (like sender queue) with heap-based time ordering -func (rh *retryHeap) pushToHeapWorker() { - for { - select { - case batch := <-rh.pushCh: - rh.mutex.Lock() - heap.Push(&rh.heap, batch) - 
rh.mutex.Unlock() - case <-rh.stopCh: - return - } - } -} - -// Push adds a batch to the heap, blocking if full (same as sender queue) +// Push adds a batch to the heap, blocking if full func (rh *retryHeap) Push(batch *logEventBatch) error { + // Acquire semaphore slot (blocks if at maxSize capacity) select { - case rh.pushCh <- batch: + case rh.semaphore <- struct{}{}: + // add batch to heap with mutex protection + rh.mutex.Lock() + heap.Push(&rh.heap, batch) + rh.mutex.Unlock() return nil case <-rh.stopCh: return errors.New("retry heap stopped") @@ -101,16 +90,18 @@ func (rh *retryHeap) PopReady() []*logEventBatch { for len(rh.heap) > 0 && !rh.heap[0].nextRetryTime.After(now) { batch := heap.Pop(&rh.heap).(*logEventBatch) ready = append(ready, batch) + // Release semaphore slot for each popped batch + <-rh.semaphore } return ready } -// Size returns the current number of batches in the heap and pending channel +// Size returns the current number of batches in the heap func (rh *retryHeap) Size() int { rh.mutex.RLock() defer rh.mutex.RUnlock() - return len(rh.heap) + len(rh.pushCh) + return len(rh.heap) } // Stop stops the retry heap diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_test.go index 05e77e6651e..25cf27831ce 100644 --- a/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_test.go +++ b/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_test.go @@ -34,8 +34,6 @@ func TestRetryHeap(t *testing.T) { err = heap.Push(batch2) assert.NoError(t, err) - // Wait for pushToHeapWorker to process - time.Sleep(10 * time.Millisecond) assert.Equal(t, 2, heap.Size()) // Pop ready batches @@ -108,7 +106,6 @@ func TestRetryHeapProcessorExpiredBatch(t *testing.T) { batch.nextRetryTime = time.Now().Add(-1 * time.Second) // Ready now heap.Push(batch) - time.Sleep(10 * time.Millisecond) // Wait for pushToHeapWorker // Process should drop expired batch 
processor.processReadyMessages() @@ -129,7 +126,6 @@ func TestRetryHeapProcessorSendsBatch(t *testing.T) { batch.nextRetryTime = time.Now().Add(-1 * time.Second) // Ready now heap.Push(batch) - time.Sleep(10 * time.Millisecond) // Wait for pushToHeapWorker // Process should send batch processor.processReadyMessages() @@ -149,3 +145,55 @@ func (m *mockSenderPool) Send(_ *logEventBatch) { func (m *mockSenderPool) Stop() {} func (m *mockSenderPool) SetRetryDuration(time.Duration) {} func (m *mockSenderPool) RetryDuration() time.Duration { return time.Hour } +func TestRetryHeap_SemaphoreBlockingAndUnblocking(t *testing.T) { + heap := NewRetryHeap(2) // maxSize = 2 + defer heap.Stop() + + // Fill heap to capacity with batches that will be ready in 3 seconds + target := Target{Group: "group", Stream: "stream"} + batch1 := newLogEventBatch(target, nil) + batch1.nextRetryTime = time.Now().Add(3 * time.Second) + batch2 := newLogEventBatch(target, nil) + batch2.nextRetryTime = time.Now().Add(3 * time.Second) + + heap.Push(batch1) + heap.Push(batch2) + + // Verify heap is at capacity + if heap.Size() != 2 { + t.Fatalf("Expected size 2, got %d", heap.Size()) + } + + // Try to push third item - should block + var pushCompleted bool + + go func() { + batch3 := newLogEventBatch(target, nil) + batch3.nextRetryTime = time.Now().Add(time.Hour) // Future time, won't be popped + heap.Push(batch3) // This should block + pushCompleted = true + }() + + // Give goroutine time to hit the semaphore block + time.Sleep(100 * time.Millisecond) + + if pushCompleted { + t.Fatal("Push should be blocked by semaphore") + } + + // Wait for batches to become ready, then pop to release semaphore + time.Sleep(4 * time.Second) + heap.PopReady() + + // Give time for push to unblock + time.Sleep(100 * time.Millisecond) + + if !pushCompleted { + t.Fatal("Push should be unblocked after PopReady") + } + + // Verify final state - should have 1 item (2 popped, 1 pushed) + if heap.Size() != 1 { + 
t.Fatalf("Expected size 1 after pop/push cycle, got %d", heap.Size()) + } +} From 7cfc7943cbed9f4e61a121f3a9a4f3242d5bf21a Mon Sep 17 00:00:00 2001 From: Akansha Agarwal Date: Tue, 30 Dec 2025 19:37:14 +0000 Subject: [PATCH 07/50] Add conditional logic to sender to call batch.Fail() during concurrency --- .../outputs/cloudwatchlogs/cloudwatchlogs.go | 2 +- .../cloudwatchlogs/internal/pusher/batch.go | 18 +++++++ .../cloudwatchlogs/internal/pusher/pusher.go | 7 ++- .../cloudwatchlogs/internal/pusher/sender.go | 47 ++++++++++++++----- 4 files changed, 59 insertions(+), 15 deletions(-) diff --git a/plugins/outputs/cloudwatchlogs/cloudwatchlogs.go b/plugins/outputs/cloudwatchlogs/cloudwatchlogs.go index 770ef5e3f97..ddc4bed4531 100644 --- a/plugins/outputs/cloudwatchlogs/cloudwatchlogs.go +++ b/plugins/outputs/cloudwatchlogs/cloudwatchlogs.go @@ -153,7 +153,7 @@ func (c *CloudWatchLogs) getDest(t pusher.Target, logSrc logs.LogSrc) *cwDest { } c.targetManager = pusher.NewTargetManager(c.Log, client) }) - p := pusher.NewPusher(c.Log, t, client, c.targetManager, logSrc, c.workerPool, c.ForceFlushInterval.Duration, maxRetryTimeout, &c.pusherWaitGroup) + p := pusher.NewPusher(c.Log, t, client, c.targetManager, logSrc, c.workerPool, c.ForceFlushInterval.Duration, maxRetryTimeout, &c.pusherWaitGroup, c.Concurrency) cwd := &cwDest{ pusher: p, retryer: logThrottleRetryer, diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/batch.go b/plugins/outputs/cloudwatchlogs/internal/pusher/batch.go index a0013e785e7..83737d2ee46 100644 --- a/plugins/outputs/cloudwatchlogs/internal/pusher/batch.go +++ b/plugins/outputs/cloudwatchlogs/internal/pusher/batch.go @@ -100,6 +100,8 @@ type logEventBatch struct { doneCallbacks []func() // Callbacks specifically for updating state stateCallbacks []func() + // Callbacks to execute when batch fails (for circuit breaker notification) + failCallbacks []func() batchers map[string]*state.RangeQueueBatcher // Retry metadata @@ -182,6 +184,13 @@ 
func (b *logEventBatch) addStateCallback(callback func()) { } } +// addFailCallback adds the callback to the end of the registered fail callbacks. +func (b *logEventBatch) addFailCallback(callback func()) { + if callback != nil { + b.failCallbacks = append(b.failCallbacks, callback) + } +} + // done runs all registered callbacks, including both success callbacks and state callbacks. func (b *logEventBatch) done() { b.updateState() @@ -203,6 +212,15 @@ func (b *logEventBatch) updateState() { } } +// fail runs fail callbacks to notify upstream components of batch failure. +// This is used for circuit breaker notification when a batch fails. +func (b *logEventBatch) fail() { + for i := len(b.failCallbacks) - 1; i >= 0; i-- { + callback := b.failCallbacks[i] + callback() + } +} + // build creates a cloudwatchlogs.PutLogEventsInput from the batch. The log events in the batch must be in // chronological order by their timestamp. func (b *logEventBatch) build() *cloudwatchlogs.PutLogEventsInput { diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/pusher.go b/plugins/outputs/cloudwatchlogs/internal/pusher/pusher.go index 57256ae0331..e833868931b 100644 --- a/plugins/outputs/cloudwatchlogs/internal/pusher/pusher.go +++ b/plugins/outputs/cloudwatchlogs/internal/pusher/pusher.go @@ -34,8 +34,10 @@ func NewPusher( flushTimeout time.Duration, retryDuration time.Duration, wg *sync.WaitGroup, + concurrency int, ) *Pusher { - s := createSender(logger, service, targetManager, workerPool, retryDuration) + concurrencyEnabled := concurrency > 1 + s := createSender(logger, service, targetManager, workerPool, retryDuration, concurrencyEnabled) q := newQueue(logger, target, flushTimeout, entityProvider, s, wg) targetManager.PutRetentionPolicy(target) return &Pusher{ @@ -60,8 +62,9 @@ func createSender( targetManager TargetManager, workerPool WorkerPool, retryDuration time.Duration, + concurrencyEnabled bool, ) Sender { - s := newSender(logger, service, targetManager, 
retryDuration) + s := newSender(logger, service, targetManager, retryDuration, concurrencyEnabled) if workerPool == nil { return s } diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/sender.go b/plugins/outputs/cloudwatchlogs/internal/pusher/sender.go index 3df074a0fbe..da09deb61db 100644 --- a/plugins/outputs/cloudwatchlogs/internal/pusher/sender.go +++ b/plugins/outputs/cloudwatchlogs/internal/pusher/sender.go @@ -30,12 +30,13 @@ type Sender interface { } type sender struct { - service cloudWatchLogsService - retryDuration atomic.Value - targetManager TargetManager - logger telegraf.Logger - stopCh chan struct{} - stopped bool + service cloudWatchLogsService + retryDuration atomic.Value + targetManager TargetManager + logger telegraf.Logger + stopCh chan struct{} + stopped bool + concurrencyEnabled bool } var _ (Sender) = (*sender)(nil) @@ -45,13 +46,15 @@ func newSender( service cloudWatchLogsService, targetManager TargetManager, retryDuration time.Duration, + concurrencyEnabled bool, ) Sender { s := &sender{ - logger: logger, - service: service, - targetManager: targetManager, - stopCh: make(chan struct{}), - stopped: false, + logger: logger, + service: service, + targetManager: targetManager, + stopCh: make(chan struct{}), + stopped: false, + concurrencyEnabled: concurrencyEnabled, } s.retryDuration.Store(retryDuration) return s @@ -121,7 +124,22 @@ func (s *sender) Send(batch *logEventBatch) { return } - // Calculate wait time until next retry + select { + case <-s.stopCh: + s.logger.Errorf("Stop requested after %v retries to %v/%v failed for PutLogEvents, request dropped.", totalRetries, batch.Group, batch.Stream) + batch.updateState() + return + default: + } + + // If concurrency enabled, notify failure (will handle RetryHeap push) and return + // Otherwise, continue with existing busy-wait retry behavior + if s.isConcurrencyEnabled() { + batch.fail() + return + } + + // Calculate wait time until next retry (synchronous mode) wait := 
time.Until(batch.nextRetryTime) if wait < 0 { wait = 0 @@ -156,3 +174,8 @@ func (s *sender) SetRetryDuration(retryDuration time.Duration) { func (s *sender) RetryDuration() time.Duration { return s.retryDuration.Load().(time.Duration) } + +// isConcurrencyEnabled returns whether concurrency mode is enabled for this sender. +func (s *sender) isConcurrencyEnabled() bool { + return s.concurrencyEnabled +} From b4ffd7a65d3bbdbd8c64b636ebbf35e9c5177f46 Mon Sep 17 00:00:00 2001 From: Akansha Agarwal Date: Tue, 30 Dec 2025 22:27:57 +0000 Subject: [PATCH 08/50] Add unit tests --- .../cloudwatchlogs/internal/pusher/batch.go | 2 +- .../internal/pusher/pool_test.go | 2 +- .../internal/pusher/pusher_test.go | 1 + .../internal/pusher/queue_test.go | 2 +- .../cloudwatchlogs/internal/pusher/sender.go | 21 +++----- .../internal/pusher/sender_test.go | 52 +++++++++++++++---- 6 files changed, 53 insertions(+), 27 deletions(-) diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/batch.go b/plugins/outputs/cloudwatchlogs/internal/pusher/batch.go index 83737d2ee46..3c83be15a0b 100644 --- a/plugins/outputs/cloudwatchlogs/internal/pusher/batch.go +++ b/plugins/outputs/cloudwatchlogs/internal/pusher/batch.go @@ -102,7 +102,7 @@ type logEventBatch struct { stateCallbacks []func() // Callbacks to execute when batch fails (for circuit breaker notification) failCallbacks []func() - batchers map[string]*state.RangeQueueBatcher + batchers map[string]*state.RangeQueueBatcher // Retry metadata retryCountShort int // Number of retries using short delay strategy diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/pool_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/pool_test.go index d9f3860967c..94fe4b6713b 100644 --- a/plugins/outputs/cloudwatchlogs/internal/pusher/pool_test.go +++ b/plugins/outputs/cloudwatchlogs/internal/pusher/pool_test.go @@ -107,7 +107,7 @@ func TestSenderPool(t *testing.T) { logger := testutil.NewNopLogger() mockService := new(mockLogsService) 
mockService.On("PutLogEvents", mock.Anything).Return(&cloudwatchlogs.PutLogEventsOutput{}, nil) - s := newSender(logger, mockService, nil, time.Second) + s := newSender(logger, mockService, nil, time.Second, false) p := NewWorkerPool(12) sp := newSenderPool(p, s) diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/pusher_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/pusher_test.go index 6d63e3c4ff5..ce575c213f3 100644 --- a/plugins/outputs/cloudwatchlogs/internal/pusher/pusher_test.go +++ b/plugins/outputs/cloudwatchlogs/internal/pusher/pusher_test.go @@ -113,6 +113,7 @@ func setupPusher(t *testing.T, workerPool WorkerPool, wg *sync.WaitGroup) *Pushe time.Second, time.Minute, wg, + 1, // concurrency ) assert.NotNil(t, pusher) diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/queue_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/queue_test.go index b5fc04d02eb..dab98651319 100644 --- a/plugins/outputs/cloudwatchlogs/internal/pusher/queue_test.go +++ b/plugins/outputs/cloudwatchlogs/internal/pusher/queue_test.go @@ -712,7 +712,7 @@ func testPreparationWithLogger( ) (*queue, Sender) { t.Helper() tm := NewTargetManager(logger, service) - s := newSender(logger, service, tm, retryDuration) + s := newSender(logger, service, tm, retryDuration, false) q := newQueue( logger, Target{"G", "S", util.StandardLogGroupClass, retention}, diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/sender.go b/plugins/outputs/cloudwatchlogs/internal/pusher/sender.go index da09deb61db..31a3b8be299 100644 --- a/plugins/outputs/cloudwatchlogs/internal/pusher/sender.go +++ b/plugins/outputs/cloudwatchlogs/internal/pusher/sender.go @@ -30,12 +30,12 @@ type Sender interface { } type sender struct { - service cloudWatchLogsService - retryDuration atomic.Value - targetManager TargetManager - logger telegraf.Logger - stopCh chan struct{} - stopped bool + service cloudWatchLogsService + retryDuration atomic.Value + targetManager TargetManager + logger 
telegraf.Logger + stopCh chan struct{} + stopped bool concurrencyEnabled bool } @@ -124,19 +124,10 @@ func (s *sender) Send(batch *logEventBatch) { return } - select { - case <-s.stopCh: - s.logger.Errorf("Stop requested after %v retries to %v/%v failed for PutLogEvents, request dropped.", totalRetries, batch.Group, batch.Stream) - batch.updateState() - return - default: - } - // If concurrency enabled, notify failure (will handle RetryHeap push) and return // Otherwise, continue with existing busy-wait retry behavior if s.isConcurrencyEnabled() { batch.fail() - return } // Calculate wait time until next retry (synchronous mode) diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/sender_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/sender_test.go index 3b469350ef7..450e63006ad 100644 --- a/plugins/outputs/cloudwatchlogs/internal/pusher/sender_test.go +++ b/plugins/outputs/cloudwatchlogs/internal/pusher/sender_test.go @@ -15,6 +15,7 @@ import ( "github.com/aws/amazon-cloudwatch-agent/sdk/service/cloudwatchlogs" "github.com/aws/amazon-cloudwatch-agent/tool/testutil" + "github.com/aws/amazon-cloudwatch-agent/tool/util" ) type mockLogsService struct { @@ -80,7 +81,7 @@ func TestSender(t *testing.T) { mockManager := new(mockTargetManager) mockService.On("PutLogEvents", mock.Anything).Return(&cloudwatchlogs.PutLogEventsOutput{}, nil).Once() - s := newSender(logger, mockService, mockManager, time.Second) + s := newSender(logger, mockService, mockManager, time.Second, false) s.Send(batch) s.Stop() @@ -103,7 +104,7 @@ func TestSender(t *testing.T) { mockManager := new(mockTargetManager) mockService.On("PutLogEvents", mock.Anything).Return(&cloudwatchlogs.PutLogEventsOutput{RejectedLogEventsInfo: rejectedInfo}, nil).Once() - s := newSender(logger, mockService, mockManager, time.Second) + s := newSender(logger, mockService, mockManager, time.Second, false) s.Send(batch) s.Stop() @@ -122,7 +123,7 @@ func TestSender(t *testing.T) { 
mockManager.On("InitTarget", mock.Anything).Return(nil).Once() mockService.On("PutLogEvents", mock.Anything).Return(&cloudwatchlogs.PutLogEventsOutput{}, nil).Once() - s := newSender(logger, mockService, mockManager, time.Second) + s := newSender(logger, mockService, mockManager, time.Second, false) s.Send(batch) s.Stop() @@ -149,7 +150,7 @@ func TestSender(t *testing.T) { mockService.On("PutLogEvents", mock.Anything). Return(&cloudwatchlogs.PutLogEventsOutput{}, &cloudwatchlogs.InvalidParameterException{}).Once() - s := newSender(logger, mockService, mockManager, time.Second) + s := newSender(logger, mockService, mockManager, time.Second, false) s.Send(batch) s.Stop() @@ -177,7 +178,7 @@ func TestSender(t *testing.T) { mockService.On("PutLogEvents", mock.Anything). Return(&cloudwatchlogs.PutLogEventsOutput{}, &cloudwatchlogs.DataAlreadyAcceptedException{}).Once() - s := newSender(logger, mockService, mockManager, time.Second) + s := newSender(logger, mockService, mockManager, time.Second, false) s.Send(batch) s.Stop() @@ -205,7 +206,7 @@ func TestSender(t *testing.T) { mockService.On("PutLogEvents", mock.Anything). Return(&cloudwatchlogs.PutLogEventsOutput{}, errors.New("test")).Once() - s := newSender(logger, mockService, mockManager, time.Second) + s := newSender(logger, mockService, mockManager, time.Second, false) s.Send(batch) s.Stop() @@ -225,7 +226,7 @@ func TestSender(t *testing.T) { mockService.On("PutLogEvents", mock.Anything). Return(&cloudwatchlogs.PutLogEventsOutput{}, nil).Once() - s := newSender(logger, mockService, mockManager, time.Second) + s := newSender(logger, mockService, mockManager, time.Second, false) s.Send(batch) s.Stop() @@ -251,7 +252,7 @@ func TestSender(t *testing.T) { mockService.On("PutLogEvents", mock.Anything). 
Return(&cloudwatchlogs.PutLogEventsOutput{}, awserr.New("SomeAWSError", "Some AWS error", nil)).Once() - s := newSender(logger, mockService, mockManager, 100*time.Millisecond) + s := newSender(logger, mockService, mockManager, 100*time.Millisecond, false) s.Send(batch) s.Stop() @@ -279,7 +280,7 @@ func TestSender(t *testing.T) { mockService.On("PutLogEvents", mock.Anything). Return(&cloudwatchlogs.PutLogEventsOutput{}, awserr.New("SomeAWSError", "Some AWS error", nil)).Once() - s := newSender(logger, mockService, mockManager, time.Second) + s := newSender(logger, mockService, mockManager, time.Second, false) go func() { time.Sleep(50 * time.Millisecond) @@ -292,4 +293,37 @@ func TestSender(t *testing.T) { assert.True(t, stateCallbackCalled, "State callback was not called when stop was requested") assert.False(t, doneCallbackCalled, "Done callback should not be called when stop was requested") }) + + t.Run("ConcurrencyEnabled/CallsFailCallback", func(t *testing.T) { + logger := testutil.NewNopLogger() + batch := newLogEventBatch(Target{"G", "S", util.StandardLogGroupClass, -1}, nil) + batch.append(newLogEvent(time.Now(), "Test message", nil)) + + // Initialize batch for retry logic + batch.initializeStartTime() + + mockService := new(mockLogsService) + mockManager := new(mockTargetManager) + mockService.On("PutLogEvents", mock.Anything).Return(&cloudwatchlogs.PutLogEventsOutput{}, &cloudwatchlogs.ServiceUnavailableException{}).Once() + + // Enable concurrency with 1 hour retry duration + s := newSender(logger, mockService, mockManager, time.Hour, true) + + // Track if fail callback was called + failCalled := false + batch.addFailCallback(func() { + failCalled = true + }) + + go func() { + time.Sleep(50 * time.Millisecond) + s.Stop() + }() + + s.Send(batch) + + // Should call fail callback when concurrency is enabled + assert.True(t, failCalled, "fail callback should be called when concurrency is enabled") + mockService.AssertExpectations(t) + }) } From 
0e4b0bc699cc3e598fcc40700579addbfec41680 Mon Sep 17 00:00:00 2001 From: Akansha Agarwal Date: Tue, 30 Dec 2025 23:18:25 +0000 Subject: [PATCH 09/50] Instantiate RetryHeap and RetryHeapProcessor if concurrency enabled --- .../outputs/cloudwatchlogs/cloudwatchlogs.go | 31 ++++++++++++------ .../cloudwatchlogs/internal/pusher/pool.go | 4 ++- .../internal/pusher/pool_test.go | 2 +- .../cloudwatchlogs/internal/pusher/pusher.go | 8 +++-- .../internal/pusher/pusher_test.go | 3 +- .../internal/pusher/retryheap.go | 6 +++- .../internal/pusher/retryheap_test.go | 32 ++++++++++++------- 7 files changed, 60 insertions(+), 26 deletions(-) diff --git a/plugins/outputs/cloudwatchlogs/cloudwatchlogs.go b/plugins/outputs/cloudwatchlogs/cloudwatchlogs.go index ddc4bed4531..06883993957 100644 --- a/plugins/outputs/cloudwatchlogs/cloudwatchlogs.go +++ b/plugins/outputs/cloudwatchlogs/cloudwatchlogs.go @@ -69,14 +69,16 @@ type CloudWatchLogs struct { Log telegraf.Logger `toml:"-"` - pusherWaitGroup sync.WaitGroup - cwDests sync.Map - workerPool pusher.WorkerPool - targetManager pusher.TargetManager - once sync.Once - middleware awsmiddleware.Middleware - configurer *awsmiddleware.Configurer - configurerOnce sync.Once + pusherWaitGroup sync.WaitGroup + cwDests sync.Map + workerPool pusher.WorkerPool + retryHeap pusher.RetryHeap + retryHeapProcessor *pusher.RetryHeapProcessor + targetManager pusher.TargetManager + once sync.Once + middleware awsmiddleware.Middleware + configurer *awsmiddleware.Configurer + configurerOnce sync.Once } var _ logs.LogBackend = (*CloudWatchLogs)(nil) @@ -101,6 +103,14 @@ func (c *CloudWatchLogs) Close() error { c.workerPool.Stop() } + if c.retryHeapProcessor != nil { + c.retryHeapProcessor.Stop() + } + + if c.retryHeap != nil { + c.retryHeap.Stop() + } + return nil } @@ -150,10 +160,13 @@ func (c *CloudWatchLogs) getDest(t pusher.Target, logSrc logs.LogSrc) *cwDest { c.once.Do(func() { if c.Concurrency > 1 { c.workerPool = pusher.NewWorkerPool(c.Concurrency) 
+ c.retryHeap = pusher.NewRetryHeap(c.Concurrency) + c.retryHeapProcessor = pusher.NewRetryHeapProcessor(c.retryHeap, c.workerPool, client, c.targetManager, c.Log, maxRetryTimeout) + c.retryHeapProcessor.Start() } c.targetManager = pusher.NewTargetManager(c.Log, client) }) - p := pusher.NewPusher(c.Log, t, client, c.targetManager, logSrc, c.workerPool, c.ForceFlushInterval.Duration, maxRetryTimeout, &c.pusherWaitGroup, c.Concurrency) + p := pusher.NewPusher(c.Log, t, client, c.targetManager, logSrc, c.workerPool, c.ForceFlushInterval.Duration, maxRetryTimeout, &c.pusherWaitGroup, c.Concurrency, c.retryHeap) cwd := &cwDest{ pusher: p, retryer: logThrottleRetryer, diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/pool.go b/plugins/outputs/cloudwatchlogs/internal/pusher/pool.go index 1d6edf57e90..6aff5b522e5 100644 --- a/plugins/outputs/cloudwatchlogs/internal/pusher/pool.go +++ b/plugins/outputs/cloudwatchlogs/internal/pusher/pool.go @@ -91,14 +91,16 @@ func (p *workerPool) Stop() { type senderPool struct { workerPool WorkerPool sender Sender + retryHeap RetryHeap } var _ Sender = (*senderPool)(nil) -func newSenderPool(workerPool WorkerPool, sender Sender) Sender { +func newSenderPool(workerPool WorkerPool, sender Sender, retryHeap RetryHeap) Sender { return &senderPool{ workerPool: workerPool, sender: sender, + retryHeap: retryHeap, } } diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/pool_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/pool_test.go index 94fe4b6713b..df0cd39fbf9 100644 --- a/plugins/outputs/cloudwatchlogs/internal/pusher/pool_test.go +++ b/plugins/outputs/cloudwatchlogs/internal/pusher/pool_test.go @@ -109,7 +109,7 @@ func TestSenderPool(t *testing.T) { mockService.On("PutLogEvents", mock.Anything).Return(&cloudwatchlogs.PutLogEventsOutput{}, nil) s := newSender(logger, mockService, nil, time.Second, false) p := NewWorkerPool(12) - sp := newSenderPool(p, s) + sp := newSenderPool(p, s, nil) assert.Equal(t, time.Second, 
sp.RetryDuration()) sp.SetRetryDuration(time.Minute) diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/pusher.go b/plugins/outputs/cloudwatchlogs/internal/pusher/pusher.go index e833868931b..77707532fec 100644 --- a/plugins/outputs/cloudwatchlogs/internal/pusher/pusher.go +++ b/plugins/outputs/cloudwatchlogs/internal/pusher/pusher.go @@ -35,9 +35,12 @@ func NewPusher( retryDuration time.Duration, wg *sync.WaitGroup, concurrency int, + retryHeap RetryHeap, ) *Pusher { concurrencyEnabled := concurrency > 1 - s := createSender(logger, service, targetManager, workerPool, retryDuration, concurrencyEnabled) + + s := createSender(logger, service, targetManager, workerPool, retryDuration, concurrencyEnabled, retryHeap) + q := newQueue(logger, target, flushTimeout, entityProvider, s, wg) targetManager.PutRetentionPolicy(target) return &Pusher{ @@ -63,10 +66,11 @@ func createSender( workerPool WorkerPool, retryDuration time.Duration, concurrencyEnabled bool, + retryHeap RetryHeap, ) Sender { s := newSender(logger, service, targetManager, retryDuration, concurrencyEnabled) if workerPool == nil { return s } - return newSenderPool(workerPool, s) + return newSenderPool(workerPool, s, retryHeap) } diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/pusher_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/pusher_test.go index ce575c213f3..dc1774e049e 100644 --- a/plugins/outputs/cloudwatchlogs/internal/pusher/pusher_test.go +++ b/plugins/outputs/cloudwatchlogs/internal/pusher/pusher_test.go @@ -113,7 +113,8 @@ func setupPusher(t *testing.T, workerPool WorkerPool, wg *sync.WaitGroup) *Pushe time.Second, time.Minute, wg, - 1, // concurrency + 1, // concurrency + nil, // retryHeap ) assert.NotNil(t, pusher) diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap.go b/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap.go index 85b51304727..213359b8044 100644 --- a/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap.go +++ 
b/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap.go @@ -121,7 +121,11 @@ type RetryHeapProcessor struct { } // NewRetryHeapProcessor creates a new retry heap processor -func NewRetryHeapProcessor(retryHeap RetryHeap, senderPool Sender, logger telegraf.Logger, maxRetryDuration time.Duration) *RetryHeapProcessor { +func NewRetryHeapProcessor(retryHeap RetryHeap, workerPool WorkerPool, service cloudWatchLogsService, targetManager TargetManager, logger telegraf.Logger, maxRetryDuration time.Duration) *RetryHeapProcessor { + // Create processor's own sender and senderPool + sender := newSender(logger, service, targetManager, maxRetryDuration, true) + senderPool := newSenderPool(workerPool, sender, retryHeap) + return &RetryHeapProcessor{ retryHeap: retryHeap, senderPool: senderPool, diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_test.go index 25cf27831ce..909ff25cd29 100644 --- a/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_test.go +++ b/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_test.go @@ -79,9 +79,13 @@ func TestRetryHeapProcessor(t *testing.T) { heap := NewRetryHeap(10) defer heap.Stop() - // Create mock senderPool - mockSenderPool := &mockSenderPool{} - processor := NewRetryHeapProcessor(heap, mockSenderPool, &testutil.Logger{}, time.Hour) + // Create mock components + mockWorkerPool := NewWorkerPool(2) + defer mockWorkerPool.Stop() + mockService := &mockLogsService{} + mockTargetManager := &mockTargetManager{} + + processor := NewRetryHeapProcessor(heap, mockWorkerPool, mockService, mockTargetManager, &testutil.Logger{}, time.Hour) defer processor.Stop() // Test start/stop @@ -96,8 +100,12 @@ func TestRetryHeapProcessorExpiredBatch(t *testing.T) { heap := NewRetryHeap(10) defer heap.Stop() - mockSenderPool := &mockSenderPool{} - processor := NewRetryHeapProcessor(heap, mockSenderPool, &testutil.Logger{}, 1*time.Millisecond) // Very short 
expiry + mockWorkerPool := NewWorkerPool(2) + defer mockWorkerPool.Stop() + mockService := &mockLogsService{} + mockTargetManager := &mockTargetManager{} + + processor := NewRetryHeapProcessor(heap, mockWorkerPool, mockService, mockTargetManager, &testutil.Logger{}, 1*time.Millisecond) // Very short expiry // Create expired batch target := Target{Group: "group", Stream: "stream"} @@ -109,16 +117,19 @@ func TestRetryHeapProcessorExpiredBatch(t *testing.T) { // Process should drop expired batch processor.processReadyMessages() - assert.Equal(t, 0, heap.Size()) - assert.Equal(t, 0, mockSenderPool.sendCount) // Should not send expired batch + assert.Equal(t, 0, heap.Size()) // Expired batch should be removed } func TestRetryHeapProcessorSendsBatch(t *testing.T) { heap := NewRetryHeap(10) defer heap.Stop() - mockSenderPool := &mockSenderPool{} - processor := NewRetryHeapProcessor(heap, mockSenderPool, &testutil.Logger{}, time.Hour) + mockWorkerPool := NewWorkerPool(2) + defer mockWorkerPool.Stop() + mockService := &mockLogsService{} + mockTargetManager := &mockTargetManager{} + + processor := NewRetryHeapProcessor(heap, mockWorkerPool, mockService, mockTargetManager, &testutil.Logger{}, time.Hour) // Create ready batch target := Target{Group: "group", Stream: "stream"} @@ -129,8 +140,7 @@ func TestRetryHeapProcessorSendsBatch(t *testing.T) { // Process should send batch processor.processReadyMessages() - assert.Equal(t, 0, heap.Size()) - assert.Equal(t, 1, mockSenderPool.sendCount) + assert.Equal(t, 0, heap.Size()) // Batch should be removed from heap } // Mock senderPool for testing From 9c1332a7ccbed2140680ee365f4c5ec90c8766d3 Mon Sep 17 00:00:00 2001 From: Akansha Agarwal Date: Tue, 30 Dec 2025 23:34:54 +0000 Subject: [PATCH 10/50] Add unit tests for retryheap instantiation --- .../cloudwatchlogs/cloudwatchlogs_test.go | 37 ++++++++++++++++++ .../internal/pusher/pool_test.go | 21 ++++++++++ .../internal/pusher/pusher_test.go | 39 +++++++++++++++++++ 3 files changed, 
97 insertions(+) diff --git a/plugins/outputs/cloudwatchlogs/cloudwatchlogs_test.go b/plugins/outputs/cloudwatchlogs/cloudwatchlogs_test.go index 66f1643fd09..c06a4240093 100644 --- a/plugins/outputs/cloudwatchlogs/cloudwatchlogs_test.go +++ b/plugins/outputs/cloudwatchlogs/cloudwatchlogs_test.go @@ -8,6 +8,7 @@ import ( "testing" "github.com/influxdata/telegraf/testutil" + "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" "github.com/aws/amazon-cloudwatch-agent/logs" @@ -100,3 +101,39 @@ func TestDuplicateDestination(t *testing.T) { // Then the destination for cloudwatchlogs endpoint would be the same require.Equal(t, d1, d2) } + +func TestRetryHeapCreation(t *testing.T) { + t.Run("ConcurrencyEnabled", func(t *testing.T) { + c := &CloudWatchLogs{ + Log: testutil.Logger{Name: "test"}, + AccessKey: "access_key", + SecretKey: "secret_key", + Concurrency: 2, // > 1 enables concurrency + cwDests: sync.Map{}, + } + + c.CreateDest("FILENAME", "", -1, util.StandardLogGroupClass, nil) + + // Should create RetryHeap and processor + assert.NotNil(t, c.retryHeap) + assert.NotNil(t, c.retryHeapProcessor) + assert.NotNil(t, c.workerPool) + }) + + t.Run("ConcurrencyDisabled", func(t *testing.T) { + c := &CloudWatchLogs{ + Log: testutil.Logger{Name: "test"}, + AccessKey: "access_key", + SecretKey: "secret_key", + Concurrency: 1, // <= 1 disables concurrency + cwDests: sync.Map{}, + } + + c.CreateDest("FILENAME", "", -1, util.StandardLogGroupClass, nil) + + // Should not create RetryHeap and processor + assert.Nil(t, c.retryHeap) + assert.Nil(t, c.retryHeapProcessor) + assert.Nil(t, c.workerPool) + }) +} diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/pool_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/pool_test.go index df0cd39fbf9..0af043de52a 100644 --- a/plugins/outputs/cloudwatchlogs/internal/pusher/pool_test.go +++ b/plugins/outputs/cloudwatchlogs/internal/pusher/pool_test.go @@ -134,3 +134,24 @@ func TestSenderPool(t 
*testing.T) { s.Stop() assert.Equal(t, int32(200), completed.Load()) } + +func TestSenderPoolRetryHeap(t *testing.T) { + logger := testutil.NewNopLogger() + mockService := new(mockLogsService) + mockService.On("PutLogEvents", mock.Anything).Return(&cloudwatchlogs.PutLogEventsOutput{}, nil) + s := newSender(logger, mockService, nil, time.Second, false) + p := NewWorkerPool(12) + defer p.Stop() + + // Create RetryHeap + retryHeap := NewRetryHeap(10) + defer retryHeap.Stop() + + sp := newSenderPool(p, s, retryHeap) + + // Verify senderPool has retryHeap + assert.NotNil(t, sp.(*senderPool).retryHeap) + assert.Equal(t, retryHeap, sp.(*senderPool).retryHeap) + + sp.Stop() +} diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/pusher_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/pusher_test.go index dc1774e049e..2ec67aea452 100644 --- a/plugins/outputs/cloudwatchlogs/internal/pusher/pusher_test.go +++ b/plugins/outputs/cloudwatchlogs/internal/pusher/pusher_test.go @@ -126,3 +126,42 @@ func setupPusher(t *testing.T, workerPool WorkerPool, wg *sync.WaitGroup) *Pushe mockManager.AssertCalled(t, "PutRetentionPolicy", target) return pusher } + +func TestPusherRetryHeap(t *testing.T) { + logger := testutil.NewNopLogger() + target := Target{Group: "G", Stream: "S"} + service := &stubLogsService{} + mockManager := new(mockTargetManager) + mockManager.On("PutRetentionPolicy", target).Return() + + workerPool := NewWorkerPool(2) + defer workerPool.Stop() + + retryHeap := NewRetryHeap(10) + defer retryHeap.Stop() + + var wg sync.WaitGroup + pusher := NewPusher( + logger, + target, + service, + mockManager, + nil, + workerPool, + time.Second, + time.Minute, + &wg, + 2, // concurrency > 1 + retryHeap, + ) + + assert.NotNil(t, pusher) + assert.Equal(t, target, pusher.Target) + + // Verify senderPool has retryHeap when concurrency enabled + if senderPool, ok := pusher.Sender.(*senderPool); ok { + assert.Equal(t, retryHeap, senderPool.retryHeap) + } + + 
mockManager.AssertCalled(t, "PutRetentionPolicy", target) +} From dddb691d8d0f1391b9fde623ef87cf43ec1ad979 Mon Sep 17 00:00:00 2001 From: Akansha Agarwal Date: Tue, 30 Dec 2025 23:52:11 +0000 Subject: [PATCH 11/50] Update sender to reference retryHeap to call push on fail --- .../cloudwatchlogs/internal/pusher/pool.go | 4 +--- .../internal/pusher/pool_test.go | 19 +++++++-------- .../cloudwatchlogs/internal/pusher/pusher.go | 4 ++-- .../internal/pusher/pusher_test.go | 6 ++--- .../internal/pusher/queue_test.go | 2 +- .../internal/pusher/retryheap.go | 6 +++-- .../cloudwatchlogs/internal/pusher/sender.go | 11 ++++++--- .../internal/pusher/sender_test.go | 24 ++++++++++--------- 8 files changed, 39 insertions(+), 37 deletions(-) diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/pool.go b/plugins/outputs/cloudwatchlogs/internal/pusher/pool.go index 6aff5b522e5..1d6edf57e90 100644 --- a/plugins/outputs/cloudwatchlogs/internal/pusher/pool.go +++ b/plugins/outputs/cloudwatchlogs/internal/pusher/pool.go @@ -91,16 +91,14 @@ func (p *workerPool) Stop() { type senderPool struct { workerPool WorkerPool sender Sender - retryHeap RetryHeap } var _ Sender = (*senderPool)(nil) -func newSenderPool(workerPool WorkerPool, sender Sender, retryHeap RetryHeap) Sender { +func newSenderPool(workerPool WorkerPool, sender Sender) Sender { return &senderPool{ workerPool: workerPool, sender: sender, - retryHeap: retryHeap, } } diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/pool_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/pool_test.go index 0af043de52a..6b8269b00aa 100644 --- a/plugins/outputs/cloudwatchlogs/internal/pusher/pool_test.go +++ b/plugins/outputs/cloudwatchlogs/internal/pusher/pool_test.go @@ -107,9 +107,9 @@ func TestSenderPool(t *testing.T) { logger := testutil.NewNopLogger() mockService := new(mockLogsService) mockService.On("PutLogEvents", mock.Anything).Return(&cloudwatchlogs.PutLogEventsOutput{}, nil) - s := newSender(logger, mockService, 
nil, time.Second, false) + s := newSender(logger, mockService, nil, time.Second, false, nil) p := NewWorkerPool(12) - sp := newSenderPool(p, s, nil) + sp := newSenderPool(p, s) assert.Equal(t, time.Second, sp.RetryDuration()) sp.SetRetryDuration(time.Minute) @@ -139,19 +139,16 @@ func TestSenderPoolRetryHeap(t *testing.T) { logger := testutil.NewNopLogger() mockService := new(mockLogsService) mockService.On("PutLogEvents", mock.Anything).Return(&cloudwatchlogs.PutLogEventsOutput{}, nil) - s := newSender(logger, mockService, nil, time.Second, false) - p := NewWorkerPool(12) - defer p.Stop() - + // Create RetryHeap retryHeap := NewRetryHeap(10) defer retryHeap.Stop() + + s := newSender(logger, mockService, nil, time.Second, false, retryHeap) + p := NewWorkerPool(12) + defer p.Stop() - sp := newSenderPool(p, s, retryHeap) - - // Verify senderPool has retryHeap - assert.NotNil(t, sp.(*senderPool).retryHeap) - assert.Equal(t, retryHeap, sp.(*senderPool).retryHeap) + sp := newSenderPool(p, s) sp.Stop() } diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/pusher.go b/plugins/outputs/cloudwatchlogs/internal/pusher/pusher.go index 77707532fec..6b8b5046617 100644 --- a/plugins/outputs/cloudwatchlogs/internal/pusher/pusher.go +++ b/plugins/outputs/cloudwatchlogs/internal/pusher/pusher.go @@ -68,9 +68,9 @@ func createSender( concurrencyEnabled bool, retryHeap RetryHeap, ) Sender { - s := newSender(logger, service, targetManager, retryDuration, concurrencyEnabled) + s := newSender(logger, service, targetManager, retryDuration, concurrencyEnabled, retryHeap) if workerPool == nil { return s } - return newSenderPool(workerPool, s, retryHeap) + return newSenderPool(workerPool, s) } diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/pusher_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/pusher_test.go index 2ec67aea452..e3b11e6963c 100644 --- a/plugins/outputs/cloudwatchlogs/internal/pusher/pusher_test.go +++ 
b/plugins/outputs/cloudwatchlogs/internal/pusher/pusher_test.go @@ -158,10 +158,8 @@ func TestPusherRetryHeap(t *testing.T) { assert.NotNil(t, pusher) assert.Equal(t, target, pusher.Target) - // Verify senderPool has retryHeap when concurrency enabled - if senderPool, ok := pusher.Sender.(*senderPool); ok { - assert.Equal(t, retryHeap, senderPool.retryHeap) - } + // Verify pusher has retryHeap when concurrency enabled + // (RetryHeap is now passed to the underlying sender, not senderPool) mockManager.AssertCalled(t, "PutRetentionPolicy", target) } diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/queue_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/queue_test.go index dab98651319..8d645ae4bd3 100644 --- a/plugins/outputs/cloudwatchlogs/internal/pusher/queue_test.go +++ b/plugins/outputs/cloudwatchlogs/internal/pusher/queue_test.go @@ -712,7 +712,7 @@ func testPreparationWithLogger( ) (*queue, Sender) { t.Helper() tm := NewTargetManager(logger, service) - s := newSender(logger, service, tm, retryDuration, false) + s := newSender(logger, service, tm, retryDuration, false, nil) q := newQueue( logger, Target{"G", "S", util.StandardLogGroupClass, retention}, diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap.go b/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap.go index 213359b8044..691ff89bf74 100644 --- a/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap.go +++ b/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap.go @@ -123,8 +123,10 @@ type RetryHeapProcessor struct { // NewRetryHeapProcessor creates a new retry heap processor func NewRetryHeapProcessor(retryHeap RetryHeap, workerPool WorkerPool, service cloudWatchLogsService, targetManager TargetManager, logger telegraf.Logger, maxRetryDuration time.Duration) *RetryHeapProcessor { // Create processor's own sender and senderPool - sender := newSender(logger, service, targetManager, maxRetryDuration, true) - senderPool := newSenderPool(workerPool, sender, 
retryHeap) + // Note: Pass nil for retryHeap to prevent infinite retry loops - + // batches from RetryHeap that fail again use synchronous retry behavior + sender := newSender(logger, service, targetManager, maxRetryDuration, true, nil) + senderPool := newSenderPool(workerPool, sender) return &RetryHeapProcessor{ retryHeap: retryHeap, diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/sender.go b/plugins/outputs/cloudwatchlogs/internal/pusher/sender.go index 31a3b8be299..9d3339d963e 100644 --- a/plugins/outputs/cloudwatchlogs/internal/pusher/sender.go +++ b/plugins/outputs/cloudwatchlogs/internal/pusher/sender.go @@ -37,6 +37,7 @@ type sender struct { stopCh chan struct{} stopped bool concurrencyEnabled bool + retryHeap RetryHeap } var _ (Sender) = (*sender)(nil) @@ -47,6 +48,7 @@ func newSender( targetManager TargetManager, retryDuration time.Duration, concurrencyEnabled bool, + retryHeap RetryHeap, ) Sender { s := &sender{ logger: logger, @@ -55,6 +57,7 @@ func newSender( stopCh: make(chan struct{}), stopped: false, concurrencyEnabled: concurrencyEnabled, + retryHeap: retryHeap, } s.retryDuration.Store(retryDuration) return s @@ -124,10 +127,12 @@ func (s *sender) Send(batch *logEventBatch) { return } - // If concurrency enabled, notify failure (will handle RetryHeap push) and return + // If concurrency enabled, push to RetryHeap and return // Otherwise, continue with existing busy-wait retry behavior if s.isConcurrencyEnabled() { + s.retryHeap.Push(batch) batch.fail() + return } // Calculate wait time until next retry (synchronous mode) @@ -166,7 +171,7 @@ func (s *sender) RetryDuration() time.Duration { return s.retryDuration.Load().(time.Duration) } -// isConcurrencyEnabled returns whether concurrency mode is enabled for this sender. +// isConcurrencyEnabled returns whether concurrency mode is enabled and RetryHeap is available. 
func (s *sender) isConcurrencyEnabled() bool { - return s.concurrencyEnabled + return s.concurrencyEnabled && s.retryHeap != nil } diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/sender_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/sender_test.go index 450e63006ad..24330112a77 100644 --- a/plugins/outputs/cloudwatchlogs/internal/pusher/sender_test.go +++ b/plugins/outputs/cloudwatchlogs/internal/pusher/sender_test.go @@ -81,7 +81,7 @@ func TestSender(t *testing.T) { mockManager := new(mockTargetManager) mockService.On("PutLogEvents", mock.Anything).Return(&cloudwatchlogs.PutLogEventsOutput{}, nil).Once() - s := newSender(logger, mockService, mockManager, time.Second, false) + s := newSender(logger, mockService, mockManager, time.Second, false, nil) s.Send(batch) s.Stop() @@ -104,7 +104,7 @@ func TestSender(t *testing.T) { mockManager := new(mockTargetManager) mockService.On("PutLogEvents", mock.Anything).Return(&cloudwatchlogs.PutLogEventsOutput{RejectedLogEventsInfo: rejectedInfo}, nil).Once() - s := newSender(logger, mockService, mockManager, time.Second, false) + s := newSender(logger, mockService, mockManager, time.Second, false, nil) s.Send(batch) s.Stop() @@ -123,7 +123,7 @@ func TestSender(t *testing.T) { mockManager.On("InitTarget", mock.Anything).Return(nil).Once() mockService.On("PutLogEvents", mock.Anything).Return(&cloudwatchlogs.PutLogEventsOutput{}, nil).Once() - s := newSender(logger, mockService, mockManager, time.Second, false) + s := newSender(logger, mockService, mockManager, time.Second, false, nil) s.Send(batch) s.Stop() @@ -150,7 +150,7 @@ func TestSender(t *testing.T) { mockService.On("PutLogEvents", mock.Anything). 
Return(&cloudwatchlogs.PutLogEventsOutput{}, &cloudwatchlogs.InvalidParameterException{}).Once() - s := newSender(logger, mockService, mockManager, time.Second, false) + s := newSender(logger, mockService, mockManager, time.Second, false, nil) s.Send(batch) s.Stop() @@ -178,7 +178,7 @@ func TestSender(t *testing.T) { mockService.On("PutLogEvents", mock.Anything). Return(&cloudwatchlogs.PutLogEventsOutput{}, &cloudwatchlogs.DataAlreadyAcceptedException{}).Once() - s := newSender(logger, mockService, mockManager, time.Second, false) + s := newSender(logger, mockService, mockManager, time.Second, false, nil) s.Send(batch) s.Stop() @@ -206,7 +206,7 @@ func TestSender(t *testing.T) { mockService.On("PutLogEvents", mock.Anything). Return(&cloudwatchlogs.PutLogEventsOutput{}, errors.New("test")).Once() - s := newSender(logger, mockService, mockManager, time.Second, false) + s := newSender(logger, mockService, mockManager, time.Second, false, nil) s.Send(batch) s.Stop() @@ -226,7 +226,7 @@ func TestSender(t *testing.T) { mockService.On("PutLogEvents", mock.Anything). Return(&cloudwatchlogs.PutLogEventsOutput{}, nil).Once() - s := newSender(logger, mockService, mockManager, time.Second, false) + s := newSender(logger, mockService, mockManager, time.Second, false, nil) s.Send(batch) s.Stop() @@ -252,7 +252,7 @@ func TestSender(t *testing.T) { mockService.On("PutLogEvents", mock.Anything). Return(&cloudwatchlogs.PutLogEventsOutput{}, awserr.New("SomeAWSError", "Some AWS error", nil)).Once() - s := newSender(logger, mockService, mockManager, 100*time.Millisecond, false) + s := newSender(logger, mockService, mockManager, 100*time.Millisecond, false, nil) s.Send(batch) s.Stop() @@ -280,7 +280,7 @@ func TestSender(t *testing.T) { mockService.On("PutLogEvents", mock.Anything). 
Return(&cloudwatchlogs.PutLogEventsOutput{}, awserr.New("SomeAWSError", "Some AWS error", nil)).Once() - s := newSender(logger, mockService, mockManager, time.Second, false) + s := newSender(logger, mockService, mockManager, time.Second, false, nil) go func() { time.Sleep(50 * time.Millisecond) @@ -306,8 +306,10 @@ func TestSender(t *testing.T) { mockManager := new(mockTargetManager) mockService.On("PutLogEvents", mock.Anything).Return(&cloudwatchlogs.PutLogEventsOutput{}, &cloudwatchlogs.ServiceUnavailableException{}).Once() - // Enable concurrency with 1 hour retry duration - s := newSender(logger, mockService, mockManager, time.Hour, true) + // Enable concurrency with 1 hour retry duration and RetryHeap + retryHeap := NewRetryHeap(10) + defer retryHeap.Stop() + s := newSender(logger, mockService, mockManager, time.Hour, true, retryHeap) // Track if fail callback was called failCalled := false From 02bc5c6c37228ae56dbbb6be1e08de155941c646 Mon Sep 17 00:00:00 2001 From: Akansha Agarwal Date: Wed, 31 Dec 2025 00:06:31 +0000 Subject: [PATCH 12/50] Add unit tests for sender logic --- .../internal/pusher/pool_test.go | 4 +- .../internal/pusher/retryheap.go | 2 +- .../internal/pusher/sender_test.go | 65 +++++++++++-------- 3 files changed, 40 insertions(+), 31 deletions(-) diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/pool_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/pool_test.go index 6b8269b00aa..4a7abe08073 100644 --- a/plugins/outputs/cloudwatchlogs/internal/pusher/pool_test.go +++ b/plugins/outputs/cloudwatchlogs/internal/pusher/pool_test.go @@ -139,11 +139,11 @@ func TestSenderPoolRetryHeap(t *testing.T) { logger := testutil.NewNopLogger() mockService := new(mockLogsService) mockService.On("PutLogEvents", mock.Anything).Return(&cloudwatchlogs.PutLogEventsOutput{}, nil) - + // Create RetryHeap retryHeap := NewRetryHeap(10) defer retryHeap.Stop() - + s := newSender(logger, mockService, nil, time.Second, false, retryHeap) p := 
NewWorkerPool(12) defer p.Stop() diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap.go b/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap.go index 691ff89bf74..da4f3cd6dc9 100644 --- a/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap.go +++ b/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap.go @@ -123,7 +123,7 @@ type RetryHeapProcessor struct { // NewRetryHeapProcessor creates a new retry heap processor func NewRetryHeapProcessor(retryHeap RetryHeap, workerPool WorkerPool, service cloudWatchLogsService, targetManager TargetManager, logger telegraf.Logger, maxRetryDuration time.Duration) *RetryHeapProcessor { // Create processor's own sender and senderPool - // Note: Pass nil for retryHeap to prevent infinite retry loops - + // Note: Pass nil for retryHeap to prevent infinite retry loops - // batches from RetryHeap that fail again use synchronous retry behavior sender := newSender(logger, service, targetManager, maxRetryDuration, true, nil) senderPool := newSenderPool(workerPool, sender) diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/sender_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/sender_test.go index 24330112a77..0bdead2ff0c 100644 --- a/plugins/outputs/cloudwatchlogs/internal/pusher/sender_test.go +++ b/plugins/outputs/cloudwatchlogs/internal/pusher/sender_test.go @@ -15,7 +15,6 @@ import ( "github.com/aws/amazon-cloudwatch-agent/sdk/service/cloudwatchlogs" "github.com/aws/amazon-cloudwatch-agent/tool/testutil" - "github.com/aws/amazon-cloudwatch-agent/tool/util" ) type mockLogsService struct { @@ -293,39 +292,49 @@ func TestSender(t *testing.T) { assert.True(t, stateCallbackCalled, "State callback was not called when stop was requested") assert.False(t, doneCallbackCalled, "Done callback should not be called when stop was requested") }) +} +func TestSenderConcurrencyWithRetryHeap(t *testing.T) { + logger := testutil.NewNopLogger() + mockService := new(mockLogsService) + mockManager := 
new(mockTargetManager) + mockService.On("PutLogEvents", mock.Anything).Return(&cloudwatchlogs.PutLogEventsOutput{}, &cloudwatchlogs.ServiceUnavailableException{}).Once() - t.Run("ConcurrencyEnabled/CallsFailCallback", func(t *testing.T) { - logger := testutil.NewNopLogger() - batch := newLogEventBatch(Target{"G", "S", util.StandardLogGroupClass, -1}, nil) - batch.append(newLogEvent(time.Now(), "Test message", nil)) + retryHeap := NewRetryHeap(10) + defer retryHeap.Stop() - // Initialize batch for retry logic - batch.initializeStartTime() + s := newSender(logger, mockService, mockManager, time.Hour, true, retryHeap) - mockService := new(mockLogsService) - mockManager := new(mockTargetManager) - mockService.On("PutLogEvents", mock.Anything).Return(&cloudwatchlogs.PutLogEventsOutput{}, &cloudwatchlogs.ServiceUnavailableException{}).Once() + batch := newLogEventBatch(Target{Group: "test-group", Stream: "test-stream"}, nil) + batch.append(newLogEvent(time.Now(), "Test message", nil)) - // Enable concurrency with 1 hour retry duration and RetryHeap - retryHeap := NewRetryHeap(10) - defer retryHeap.Stop() - s := newSender(logger, mockService, mockManager, time.Hour, true, retryHeap) + var failCalled bool + batch.addFailCallback(func() { failCalled = true }) - // Track if fail callback was called - failCalled := false - batch.addFailCallback(func() { - failCalled = true - }) + s.Send(batch) - go func() { - time.Sleep(50 * time.Millisecond) - s.Stop() - }() + assert.True(t, failCalled, "Fail callback should be called") + assert.Equal(t, 1, retryHeap.Size(), "Batch should be in RetryHeap") + mockService.AssertExpectations(t) +} - s.Send(batch) +func TestSenderConcurrencyFallbackToSync(t *testing.T) { + logger := testutil.NewNopLogger() + mockService := new(mockLogsService) + mockManager := new(mockTargetManager) + mockService.On("PutLogEvents", mock.Anything).Return(&cloudwatchlogs.PutLogEventsOutput{}, &cloudwatchlogs.ServiceUnavailableException{}).Once() + 
mockService.On("PutLogEvents", mock.Anything).Return(&cloudwatchlogs.PutLogEventsOutput{}, nil).Once() - // Should call fail callback when concurrency is enabled - assert.True(t, failCalled, "fail callback should be called when concurrency is enabled") - mockService.AssertExpectations(t) - }) + // Concurrency enabled but nil RetryHeap should fall back to sync + s := newSender(logger, mockService, mockManager, 2*time.Second, true, nil) + + batch := newLogEventBatch(Target{Group: "test-group", Stream: "test-stream"}, nil) + batch.append(newLogEvent(time.Now(), "Test message", nil)) + + var doneCalled bool + batch.addDoneCallback(func() { doneCalled = true }) + + s.Send(batch) + + assert.True(t, doneCalled, "Done callback should be called") + mockService.AssertExpectations(t) } From ef7d6279de26f1978de33103d54f35501cb64c77 Mon Sep 17 00:00:00 2001 From: Akansha Agarwal Date: Wed, 31 Dec 2025 00:50:23 +0000 Subject: [PATCH 13/50] Implement halt on target logic --- .../cloudwatchlogs/internal/pusher/queue.go | 43 ++++++++++++++ .../internal/pusher/queue_test.go | 59 +++++++++++++++++++ 2 files changed, 102 insertions(+) diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/queue.go b/plugins/outputs/cloudwatchlogs/internal/pusher/queue.go index e8ad65ffdc3..a5468d0dffc 100644 --- a/plugins/outputs/cloudwatchlogs/internal/pusher/queue.go +++ b/plugins/outputs/cloudwatchlogs/internal/pusher/queue.go @@ -42,6 +42,10 @@ type queue struct { initNonBlockingChOnce sync.Once startNonBlockCh chan struct{} wg *sync.WaitGroup + + // Circuit breaker halt/resume functionality + haltCond *sync.Cond + halted bool } var _ (Queue) = (*queue)(nil) @@ -67,6 +71,8 @@ func newQueue( stopCh: make(chan struct{}), startNonBlockCh: make(chan struct{}), wg: wg, + haltCond: sync.NewCond(&sync.Mutex{}), + halted: false, } q.flushTimeout.Store(flushTimeout) q.wg.Add(1) @@ -175,6 +181,11 @@ func (q *queue) merge(mergeChan chan logs.LogEvent) { func (q *queue) send() { if len(q.batch.events) > 
0 { q.batch.addDoneCallback(q.onSuccessCallback(q.batch.bufferedSize)) + q.batch.addFailCallback(q.onFailCallback()) + + // Wait if halted (circuit breaker) + q.waitIfHalted() + q.sender.Send(q.batch) q.batch = newLogEventBatch(q.target, q.entityProvider) } @@ -183,6 +194,7 @@ func (q *queue) send() { // onSuccessCallback returns a callback function to be executed after a successful send. func (q *queue) onSuccessCallback(bufferedSize int) func() { return func() { + q.resume() // Resume queue on success q.lastSentTime.Store(time.Now()) go q.addStats("rawSize", float64(bufferedSize)) q.resetFlushTimer() @@ -245,3 +257,34 @@ func hasValidTime(e logs.LogEvent) bool { } return true } + +// waitIfHalted blocks until the queue is unhalted (circuit breaker functionality) +func (q *queue) waitIfHalted() { + q.haltCond.L.Lock() + for q.halted { + q.haltCond.Wait() + } + q.haltCond.L.Unlock() +} + +// halt stops the queue from sending batches (called on failure) +func (q *queue) halt() { + q.haltCond.L.Lock() + q.halted = true + q.haltCond.L.Unlock() +} + +// resume allows the queue to send batches again (called on success) +func (q *queue) resume() { + q.haltCond.L.Lock() + q.halted = false + q.haltCond.Broadcast() + q.haltCond.L.Unlock() +} + +// onFailCallback returns a callback function to be executed after a failed send +func (q *queue) onFailCallback() func() { + return func() { + q.halt() + } +} diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/queue_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/queue_test.go index 8d645ae4bd3..cbf62fd94a1 100644 --- a/plugins/outputs/cloudwatchlogs/internal/pusher/queue_test.go +++ b/plugins/outputs/cloudwatchlogs/internal/pusher/queue_test.go @@ -759,6 +759,8 @@ func TestQueueCallbackRegistration(t *testing.T) { flushTimer: time.NewTimer(10 * time.Millisecond), startNonBlockCh: make(chan struct{}), wg: &wg, + haltCond: sync.NewCond(&sync.Mutex{}), + halted: false, } q.flushTimeout.Store(10 * time.Millisecond) @@ 
-801,6 +803,8 @@ func TestQueueCallbackRegistration(t *testing.T) { flushTimer: time.NewTimer(10 * time.Millisecond), startNonBlockCh: make(chan struct{}), wg: &wg, + haltCond: sync.NewCond(&sync.Mutex{}), + halted: false, } q.flushTimeout.Store(10 * time.Millisecond) @@ -814,3 +818,58 @@ func TestQueueCallbackRegistration(t *testing.T) { mockSender.AssertExpectations(t) }) } +func TestQueueHaltResume(t *testing.T) { + logger := testutil.NewNopLogger() + + var sendCount atomic.Int32 + mockSender := &mockSender{} + mockSender.On("Send", mock.Anything).Run(func(args mock.Arguments) { + sendCount.Add(1) + batch := args.Get(0).(*logEventBatch) + // Simulate failure on first call, success on second + if sendCount.Load() == 1 { + batch.fail() // This should halt the queue + } else { + batch.done() // This should resume the queue + } + }).Return() + + var wg sync.WaitGroup + q := newQueue(logger, Target{"G", "S", util.StandardLogGroupClass, -1}, 10*time.Millisecond, nil, mockSender, &wg) + defer q.Stop() + + // Add first event - should trigger send and halt + q.AddEvent(newStubLogEvent("first message", time.Now())) + + // Wait a bit for the first send to complete and halt + time.Sleep(50 * time.Millisecond) + + // Add second event - should be queued but not sent due to halt + q.AddEvent(newStubLogEvent("second message", time.Now())) + + // Verify only one send happened (queue is halted) + assert.Equal(t, int32(1), sendCount.Load(), "Should have only one send due to halt") + + // Trigger flush to force send of second batch - this should block until resumed + done := make(chan bool) + go func() { + time.Sleep(100 * time.Millisecond) // Wait a bit + // Manually resume by calling success callback on a dummy batch + dummyBatch := newLogEventBatch(Target{"G", "S", util.StandardLogGroupClass, -1}, nil) + dummyBatch.addDoneCallback(func() { + // This simulates a successful send that should resume the queue + }) + dummyBatch.done() + done <- true + }() + + // This should 
eventually complete when the queue is resumed + select { + case <-done: + // Success - the resume worked + case <-time.After(5 * time.Second): + t.Fatal("Test timed out - queue may be permanently halted") + } + + mockSender.AssertExpectations(t) +} From 309f904bc71801b064f7432f34c882d2ddffe967 Mon Sep 17 00:00:00 2001 From: Akansha Agarwal Date: Wed, 31 Dec 2025 01:41:19 +0000 Subject: [PATCH 14/50] lint --- .../cloudwatchlogs/internal/pusher/pool_test.go | 2 +- .../cloudwatchlogs/internal/pusher/queue.go | 4 ++-- .../cloudwatchlogs/internal/pusher/queue_test.go | 14 +++++++------- .../internal/pusher/retryheap_test.go | 12 ------------ 4 files changed, 10 insertions(+), 22 deletions(-) diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/pool_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/pool_test.go index 4a7abe08073..34e83bc89b3 100644 --- a/plugins/outputs/cloudwatchlogs/internal/pusher/pool_test.go +++ b/plugins/outputs/cloudwatchlogs/internal/pusher/pool_test.go @@ -135,7 +135,7 @@ func TestSenderPool(t *testing.T) { assert.Equal(t, int32(200), completed.Load()) } -func TestSenderPoolRetryHeap(t *testing.T) { +func TestSenderPoolRetryHeap(_ *testing.T) { logger := testutil.NewNopLogger() mockService := new(mockLogsService) mockService.On("PutLogEvents", mock.Anything).Return(&cloudwatchlogs.PutLogEventsOutput{}, nil) diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/queue.go b/plugins/outputs/cloudwatchlogs/internal/pusher/queue.go index a5468d0dffc..5d297aed525 100644 --- a/plugins/outputs/cloudwatchlogs/internal/pusher/queue.go +++ b/plugins/outputs/cloudwatchlogs/internal/pusher/queue.go @@ -182,10 +182,10 @@ func (q *queue) send() { if len(q.batch.events) > 0 { q.batch.addDoneCallback(q.onSuccessCallback(q.batch.bufferedSize)) q.batch.addFailCallback(q.onFailCallback()) - + // Wait if halted (circuit breaker) q.waitIfHalted() - + q.sender.Send(q.batch) q.batch = newLogEventBatch(q.target, q.entityProvider) } diff --git 
a/plugins/outputs/cloudwatchlogs/internal/pusher/queue_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/queue_test.go index cbf62fd94a1..8d030450dda 100644 --- a/plugins/outputs/cloudwatchlogs/internal/pusher/queue_test.go +++ b/plugins/outputs/cloudwatchlogs/internal/pusher/queue_test.go @@ -820,7 +820,7 @@ func TestQueueCallbackRegistration(t *testing.T) { } func TestQueueHaltResume(t *testing.T) { logger := testutil.NewNopLogger() - + var sendCount atomic.Int32 mockSender := &mockSender{} mockSender.On("Send", mock.Anything).Run(func(args mock.Arguments) { @@ -840,16 +840,16 @@ func TestQueueHaltResume(t *testing.T) { // Add first event - should trigger send and halt q.AddEvent(newStubLogEvent("first message", time.Now())) - + // Wait a bit for the first send to complete and halt time.Sleep(50 * time.Millisecond) - + // Add second event - should be queued but not sent due to halt q.AddEvent(newStubLogEvent("second message", time.Now())) - + // Verify only one send happened (queue is halted) assert.Equal(t, int32(1), sendCount.Load(), "Should have only one send due to halt") - + // Trigger flush to force send of second batch - this should block until resumed done := make(chan bool) go func() { @@ -862,7 +862,7 @@ func TestQueueHaltResume(t *testing.T) { dummyBatch.done() done <- true }() - + // This should eventually complete when the queue is resumed select { case <-done: @@ -870,6 +870,6 @@ func TestQueueHaltResume(t *testing.T) { case <-time.After(5 * time.Second): t.Fatal("Test timed out - queue may be permanently halted") } - + mockSender.AssertExpectations(t) } diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_test.go index 909ff25cd29..b14477762c3 100644 --- a/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_test.go +++ b/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_test.go @@ -143,18 +143,6 @@ func TestRetryHeapProcessorSendsBatch(t 
*testing.T) { assert.Equal(t, 0, heap.Size()) // Batch should be removed from heap } -// Mock senderPool for testing -type mockSenderPool struct { - sendCount int -} - -func (m *mockSenderPool) Send(_ *logEventBatch) { - m.sendCount++ -} - -func (m *mockSenderPool) Stop() {} -func (m *mockSenderPool) SetRetryDuration(time.Duration) {} -func (m *mockSenderPool) RetryDuration() time.Duration { return time.Hour } func TestRetryHeap_SemaphoreBlockingAndUnblocking(t *testing.T) { heap := NewRetryHeap(2) // maxSize = 2 defer heap.Stop() From d9296a6c7d9a3f44f6dc543b1efb2ada7ddfc606 Mon Sep 17 00:00:00 2001 From: Marcus Mann Date: Mon, 9 Feb 2026 13:54:39 -0500 Subject: [PATCH 15/50] lint --- .../outputs/cloudwatchlogs/internal/pusher/retryheap_test.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_test.go index e1164e0ea6e..0c39ba0c5f3 100644 --- a/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_test.go +++ b/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_test.go @@ -94,7 +94,6 @@ func TestRetryHeapProcessor(t *testing.T) { // Test start/stop processor.Start() - assert.NotNil(t, processor.ticker) processor.Stop() assert.True(t, processor.stopped) @@ -184,6 +183,8 @@ func TestRetryHeap_SemaphoreBlockingAndUnblocking(t *testing.T) { // Push is successfully blocked when at capacity } + time.Sleep(3 * time.Second) + // Pop ready batches to release semaphore slots readyBatches := heap.PopReady() assert.Len(t, readyBatches, 2, "Should pop exactly 2 ready batches") From 7051a0c3dca1e602d8be2edbd85936324bccc3d5 Mon Sep 17 00:00:00 2001 From: Marcus Mann Date: Mon, 9 Feb 2026 15:02:50 -0500 Subject: [PATCH 16/50] fix tests --- .../internal/pusher/circuitbreaker_test.go | 101 ++++++++++++++++++ .../internal/pusher/pool_test.go | 2 +- .../internal/pusher/retryheap_test.go | 2 - 3 files changed, 102 insertions(+), 3 
deletions(-) create mode 100644 plugins/outputs/cloudwatchlogs/internal/pusher/circuitbreaker_test.go diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/circuitbreaker_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/circuitbreaker_test.go new file mode 100644 index 00000000000..d541e9a46aa --- /dev/null +++ b/plugins/outputs/cloudwatchlogs/internal/pusher/circuitbreaker_test.go @@ -0,0 +1,101 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: MIT + +package pusher + +import ( + "sync" + "sync/atomic" + "testing" + "time" + + "github.com/stretchr/testify/assert" + + "github.com/aws/amazon-cloudwatch-agent/sdk/service/cloudwatchlogs" + "github.com/aws/amazon-cloudwatch-agent/tool/testutil" +) + +// TestCircuitBreakerBlocksTargetAfterFailure verifies that when a batch fails +// for a target, the circuit breaker prevents additional batches from that target +// from being sent until the failing batch is retried successfully. +// +// Without a circuit breaker, a problematic target continues producing new batches +// that flood the SenderQueue/WorkerPool, starving healthy targets. 
+func TestCircuitBreakerBlocksTargetAfterFailure(t *testing.T) { + logger := testutil.NewNopLogger() + + failingTarget := Target{Group: "failing-group", Stream: "stream"} + healthyTarget := Target{Group: "healthy-group", Stream: "stream"} + + var failingTargetSendCount atomic.Int32 + var healthyTargetSendCount atomic.Int32 + + service := &stubLogsService{ + ple: func(input *cloudwatchlogs.PutLogEventsInput) (*cloudwatchlogs.PutLogEventsOutput, error) { + if *input.LogGroupName == failingTarget.Group { + failingTargetSendCount.Add(1) + return nil, &cloudwatchlogs.ServiceUnavailableException{} + } + healthyTargetSendCount.Add(1) + return &cloudwatchlogs.PutLogEventsOutput{}, nil + }, + cls: func(_ *cloudwatchlogs.CreateLogStreamInput) (*cloudwatchlogs.CreateLogStreamOutput, error) { + return &cloudwatchlogs.CreateLogStreamOutput{}, nil + }, + clg: func(_ *cloudwatchlogs.CreateLogGroupInput) (*cloudwatchlogs.CreateLogGroupOutput, error) { + return &cloudwatchlogs.CreateLogGroupOutput{}, nil + }, + dlg: func(_ *cloudwatchlogs.DescribeLogGroupsInput) (*cloudwatchlogs.DescribeLogGroupsOutput, error) { + return &cloudwatchlogs.DescribeLogGroupsOutput{}, nil + }, + } + + concurrency := 5 + workerPool := NewWorkerPool(concurrency) + retryHeap := NewRetryHeap(concurrency, logger) + defer workerPool.Stop() + defer retryHeap.Stop() + + tm := NewTargetManager(logger, service) + + var wg sync.WaitGroup + flushTimeout := 50 * time.Millisecond + retryDuration := time.Hour + + failingPusher := NewPusher(logger, failingTarget, service, tm, nil, workerPool, flushTimeout, retryDuration, &wg, retryHeap) + healthyPusher := NewPusher(logger, healthyTarget, service, tm, nil, workerPool, flushTimeout, retryDuration, &wg, retryHeap) + defer failingPusher.Stop() + defer healthyPusher.Stop() + + now := time.Now() + + // Send events to both targets. The failing target will fail on PutLogEvents, + // and the circuit breaker should block it from sending more batches. 
+ for i := 0; i < 10; i++ { + failingPusher.AddEvent(newStubLogEvent("fail", now)) + healthyPusher.AddEvent(newStubLogEvent("ok", now)) + } + + // Wait for flushes to occur + time.Sleep(500 * time.Millisecond) + + // Send more events - the failing target should be blocked by circuit breaker + for i := 0; i < 10; i++ { + failingPusher.AddEvent(newStubLogEvent("fail-more", now)) + healthyPusher.AddEvent(newStubLogEvent("ok-more", now)) + } + + time.Sleep(500 * time.Millisecond) + + // Circuit breaker assertion: after the first failure, the failing target should + // NOT have sent additional batches. Only 1 send attempt should have been made + // before the circuit breaker blocks it. + assert.LessOrEqual(t, failingTargetSendCount.Load(), int32(1), + "Circuit breaker should block failing target from sending more than 1 batch, "+ + "but %d batches were sent. Without a circuit breaker, the failing target "+ + "continues flooding the worker pool with bad requests.", failingTargetSendCount.Load()) + + // Healthy target should continue sending successfully + assert.Greater(t, healthyTargetSendCount.Load(), int32(0), + "Healthy target should continue sending while failing target is blocked") +} diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/pool_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/pool_test.go index f666e86560a..16fe906a778 100644 --- a/plugins/outputs/cloudwatchlogs/internal/pusher/pool_test.go +++ b/plugins/outputs/cloudwatchlogs/internal/pusher/pool_test.go @@ -144,7 +144,7 @@ func TestSenderPoolRetryHeap(_ *testing.T) { retryHeap := NewRetryHeap(10, logger) defer retryHeap.Stop() - s := newSender(logger, mockService, nil, time.Second, false, retryHeap) + s := newSender(logger, mockService, nil, time.Second, retryHeap) p := NewWorkerPool(12) defer p.Stop() diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_test.go index 75f95f22f5c..fa450f82d61 100644 
--- a/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_test.go +++ b/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_test.go @@ -94,8 +94,6 @@ func TestRetryHeapProcessor(t *testing.T) { // Test start/stop processor.Start() - assert.NotNil(t, processor.ticker) - processor.Stop() assert.True(t, processor.stopped) } From fd185dbc5cf4b5dcb91c6d90ba764e32bee4e03c Mon Sep 17 00:00:00 2001 From: Marcus Mann Date: Mon, 9 Feb 2026 15:18:35 -0500 Subject: [PATCH 17/50] Fix race condition in RetryHeap Stop and Push methods - Add mutex protection to Stop() method to prevent race conditions - Add stopped flag checks in Push() to prevent pushing after Stop() - Ensure Push() checks stopped flag both before and after acquiring semaphore - Fix TestRetryHeapStopTwice to verify correct behavior --- .../cloudwatchlogs/internal/pusher/retryheap.go | 16 ++++++++++++++++ .../internal/pusher/retryheap_test.go | 2 +- 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap.go b/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap.go index ad33cdcf46b..b7202d648c1 100644 --- a/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap.go +++ b/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap.go @@ -75,11 +75,24 @@ func NewRetryHeap(maxSize int, logger telegraf.Logger) RetryHeap { // Push adds a batch to the heap, blocking if full func (rh *retryHeap) Push(batch *logEventBatch) error { + rh.mutex.RLock() + if rh.stopped { + rh.mutex.RUnlock() + return errors.New("retry heap stopped") + } + rh.mutex.RUnlock() + // Acquire semaphore slot (blocks if at maxSize capacity) select { case rh.semaphore <- struct{}{}: // add batch to heap with mutex protection rh.mutex.Lock() + if rh.stopped { + // Release semaphore if stopped after acquiring + <-rh.semaphore + rh.mutex.Unlock() + return errors.New("retry heap stopped") + } heap.Push(&rh.heap, batch) rh.mutex.Unlock() return nil @@ -116,6 +129,9 @@ func (rh 
*retryHeap) Size() int { // Stop stops the retry heap func (rh *retryHeap) Stop() { + rh.mutex.Lock() + defer rh.mutex.Unlock() + if rh.stopped { return } diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_test.go index fa450f82d61..edc7dbc3145 100644 --- a/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_test.go +++ b/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_test.go @@ -275,7 +275,7 @@ func TestRetryHeapStopTwice(t *testing.T) { target := Target{Group: "test-group", Stream: "test-stream"} batch := newLogEventBatch(target, nil) - rh.Push(batch) // Should not panic or return error + rh.Push(batch) // Verify heap is empty (nothing was pushed) assert.Equal(t, 0, rh.Size()) From d79ae7f1da08ac6b41121703d9a33da08b10cef4 Mon Sep 17 00:00:00 2001 From: Marcus Mann Date: Mon, 9 Feb 2026 15:36:48 -0500 Subject: [PATCH 18/50] Add failing test for circuit breaker resume on batch expiry - Add TestRetryHeapProcessorExpiredBatchShouldResume to demonstrate bug - When a batch expires after 14 days, RetryHeapProcessor calls updateState() but not done(), leaving circuit breaker permanently closed - Target remains blocked forever even though bad batch was dropped - Test currently fails, demonstrating the bug from PR comment --- .../internal/pusher/queue_test.go | 86 +++++++++++++++ .../internal/pusher/retryheap_expiry_test.go | 104 ++++++++++++++++++ 2 files changed, 190 insertions(+) create mode 100644 plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_expiry_test.go diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/queue_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/queue_test.go index 688ea474450..1db52ce32af 100644 --- a/plugins/outputs/cloudwatchlogs/internal/pusher/queue_test.go +++ b/plugins/outputs/cloudwatchlogs/internal/pusher/queue_test.go @@ -873,3 +873,89 @@ func TestQueueHaltResume(t *testing.T) { mockSender.AssertExpectations(t) } + 
+// TestQueueResumeOnBatchExpiry verifies that when a batch expires after 14 days of retrying, +// the circuit breaker resumes the queue to allow new batches to be processed. +// This prevents the target from being permanently blocked when a bad batch is eventually dropped. +// +// Scenario from PR comment: "Say a bad batch from a target caused this to halt. Now that bad batch +// is re-tried for 14 days and eventually dropped - but this never gets resumed in that case right? +// So this target is blocked forever in that scenario?" +func TestQueueResumeOnBatchExpiry(t *testing.T) { + logger := testutil.NewNopLogger() + + var sendCount atomic.Int32 + mockService := &stubLogsService{ + ple: func(input *cloudwatchlogs.PutLogEventsInput) (*cloudwatchlogs.PutLogEventsOutput, error) { + sendCount.Add(1) + // Always return an error to simulate a failing target + return nil, &cloudwatchlogs.ServiceUnavailableException{} + }, + cls: func(_ *cloudwatchlogs.CreateLogStreamInput) (*cloudwatchlogs.CreateLogStreamOutput, error) { + return &cloudwatchlogs.CreateLogStreamOutput{}, nil + }, + clg: func(_ *cloudwatchlogs.CreateLogGroupInput) (*cloudwatchlogs.CreateLogGroupOutput, error) { + return &cloudwatchlogs.CreateLogGroupOutput{}, nil + }, + dlg: func(_ *cloudwatchlogs.DescribeLogGroupsInput) (*cloudwatchlogs.DescribeLogGroupsOutput, error) { + return &cloudwatchlogs.DescribeLogGroupsOutput{}, nil + }, + } + + target := Target{Group: "test-group", Stream: "test-stream"} + + // Create components + workerPool := NewWorkerPool(5) + retryHeap := NewRetryHeap(10, logger) + tm := NewTargetManager(logger, mockService) + defer workerPool.Stop() + defer retryHeap.Stop() + + // Create RetryHeapProcessor with very short max retry duration for testing + maxRetryDuration := 100 * time.Millisecond // Normally 14 days + retryHeapProcessor := NewRetryHeapProcessor(retryHeap, workerPool, mockService, tm, logger, maxRetryDuration, nil) + retryHeapProcessor.Start() + defer 
retryHeapProcessor.Stop() + + // Create pusher/queue + var wg sync.WaitGroup + flushTimeout := 50 * time.Millisecond + pusher := NewPusher(logger, target, mockService, tm, nil, workerPool, flushTimeout, maxRetryDuration, &wg, retryHeap) + defer pusher.Stop() + + // Add first event - will fail and halt the queue + pusher.AddEvent(newStubLogEvent("first message", time.Now())) + + // Wait for batch to be sent, fail, and go to retry heap + time.Sleep(200 * time.Millisecond) + + // Verify at least one send attempt was made + assert.Greater(t, sendCount.Load(), int32(0), "Should have attempted to send") + + // Add second event - should be queued but blocked by circuit breaker + pusher.AddEvent(newStubLogEvent("second message", time.Now())) + + initialSendCount := sendCount.Load() + + // Wait for the batch in retry heap to expire + time.Sleep(200 * time.Millisecond) + + // After expiry, the RetryHeapProcessor should drop the expired batch + // but currently it only calls updateState(), not done() + // This means the circuit breaker remains closed and the second batch never gets sent + + // Add a third event to trigger another flush + pusher.AddEvent(newStubLogEvent("third message", time.Now())) + + // Wait for potential sends + time.Sleep(200 * time.Millisecond) + + finalSendCount := sendCount.Load() + + // BUG: The second and third batches should have been attempted after the first batch expired + // but they won't be because the queue remains halted forever + assert.Equal(t, initialSendCount, finalSendCount, + "No new send attempts should occur because the circuit breaker is permanently closed. "+ + "This demonstrates the bug: when a batch expires in RetryHeapProcessor, it calls "+ + "updateState() but not done(), so the queue never resumes. 
The target is blocked forever.") +} diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_expiry_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_expiry_test.go new file mode 100644 index 00000000000..4571a107779 --- /dev/null +++ b/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_expiry_test.go @@ -0,0 +1,104 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: MIT + +package pusher + +import ( + "sync/atomic" + "testing" + "time" + + "github.com/stretchr/testify/assert" + + "github.com/aws/amazon-cloudwatch-agent/sdk/service/cloudwatchlogs" + "github.com/aws/amazon-cloudwatch-agent/tool/testutil" +) + +// TestRetryHeapProcessorExpiredBatchShouldResume demonstrates the bug where +// expired batches don't resume the circuit breaker, leaving the target permanently blocked. +// +// From PR comment: "Say a bad batch from a target caused this to halt. Now that bad batch +// is re-tried for 14 days and eventually dropped - but this never gets resumed in that case right? +// So this target is blocked forever in that scenario?" 
+func TestRetryHeapProcessorExpiredBatchShouldResume(t *testing.T) { + logger := testutil.NewNopLogger() + + var sendAttempts atomic.Int32 + mockService := &stubLogsService{ + ple: func(input *cloudwatchlogs.PutLogEventsInput) (*cloudwatchlogs.PutLogEventsOutput, error) { + sendAttempts.Add(1) + // Always fail to simulate a problematic target + return nil, &cloudwatchlogs.ServiceUnavailableException{} + }, + cls: func(_ *cloudwatchlogs.CreateLogStreamInput) (*cloudwatchlogs.CreateLogStreamOutput, error) { + return &cloudwatchlogs.CreateLogStreamOutput{}, nil + }, + clg: func(_ *cloudwatchlogs.CreateLogGroupInput) (*cloudwatchlogs.CreateLogGroupOutput, error) { + return &cloudwatchlogs.CreateLogGroupOutput{}, nil + }, + dlg: func(_ *cloudwatchlogs.DescribeLogGroupsInput) (*cloudwatchlogs.DescribeLogGroupsOutput, error) { + return &cloudwatchlogs.DescribeLogGroupsOutput{}, nil + }, + } + + target := Target{Group: "failing-group", Stream: "stream"} + + // Create retry heap and processor with very short expiry for testing + retryHeap := NewRetryHeap(10, logger) + workerPool := NewWorkerPool(5) + tm := NewTargetManager(logger, mockService) + maxRetryDuration := 50 * time.Millisecond // Normally 14 days + + retryHeapProcessor := NewRetryHeapProcessor(retryHeap, workerPool, mockService, tm, logger, maxRetryDuration, nil) + retryHeapProcessor.Start() + + defer retryHeap.Stop() + defer workerPool.Stop() + defer retryHeapProcessor.Stop() + + // Create a batch that will expire + batch := newLogEventBatch(target, nil) + batch.append(newLogEvent(time.Now(), "test message", nil)) + + // Set up callbacks to track circuit breaker state + var circuitBreakerHalted atomic.Bool + var circuitBreakerResumed atomic.Bool + + batch.addFailCallback(func() { + circuitBreakerHalted.Store(true) + }) + + batch.addDoneCallback(func() { + circuitBreakerResumed.Store(true) + }) + + // Initialize the batch's start time to make it already expired + batch.initializeStartTime() + batch.startTime = 
time.Now().Add(-100 * time.Millisecond) // Already expired + + // Update retry metadata to simulate a failed attempt and make it ready for retry + batch.updateRetryMetadata(&cloudwatchlogs.ServiceUnavailableException{}) + // Set nextRetryTime to past so it's ready for retry + batch.nextRetryTime = time.Now().Add(-10 * time.Millisecond) + + // Push the expired batch to the retry heap + err := retryHeap.Push(batch) + assert.NoError(t, err) + + // Verify batch is in the heap + assert.Equal(t, 1, retryHeap.Size()) + + // Wait for RetryHeapProcessor to process the expired batch + time.Sleep(200 * time.Millisecond) + + // The batch should have been removed from the heap + assert.Equal(t, 0, retryHeap.Size(), "Expired batch should be removed from heap") + + // The circuit breaker SHOULD be resumed when the batch expires + // This allows the target to continue processing new batches after the bad batch is dropped + assert.True(t, circuitBreakerResumed.Load(), + "Circuit breaker should be resumed after batch expiry. "+ + "When a batch is retried for 14 days and eventually dropped, "+ + "the target must be unblocked to allow new batches to be processed. 
"+ + "Otherwise the target remains blocked forever.") +} From de410f11bef9e597f6d570e2373531074a90ac68 Mon Sep 17 00:00:00 2001 From: Marcus Mann Date: Mon, 9 Feb 2026 15:41:29 -0500 Subject: [PATCH 19/50] Resume circuit breaker when an expired batch is dropped --- .../internal/pusher/queue_test.go | 82 +------------------ .../internal/pusher/retryheap.go | 3 +- .../internal/pusher/retryheap_expiry_test.go | 30 +++---- 3 files changed, 19 insertions(+), 96 deletions(-) diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/queue_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/queue_test.go index 1db52ce32af..80c343f3ef7 100644 --- a/plugins/outputs/cloudwatchlogs/internal/pusher/queue_test.go +++ b/plugins/outputs/cloudwatchlogs/internal/pusher/queue_test.go @@ -878,84 +878,6 @@ func TestQueueHaltResume(t *testing.T) { // the circuit breaker resumes the queue to allow new batches to be processed. // This prevents the target from being permanently blocked when a bad batch is eventually dropped. // -// Scenario from PR comment: "Say a bad batch from a target caused this to halt. Now that bad batch -// is re-tried for 14 days and eventually dropped - but this never gets resumed in that case right? +// Scenario from PR comment: "Say a bad batch from a target caused this to halt. Now that bad batch +// is re-tried for 14 days and eventually dropped - but this never gets resumed in that case right? // So this target is blocked forever in that scenario?" 
-func TestQueueResumeOnBatchExpiry(t *testing.T) { - logger := testutil.NewNopLogger() - - var sendCount atomic.Int32 - mockService := &stubLogsService{ - ple: func(input *cloudwatchlogs.PutLogEventsInput) (*cloudwatchlogs.PutLogEventsOutput, error) { - sendCount.Add(1) - // Always return an error to simulate a failing target - return nil, &cloudwatchlogs.ServiceUnavailableException{} - }, - cls: func(_ *cloudwatchlogs.CreateLogStreamInput) (*cloudwatchlogs.CreateLogStreamOutput, error) { - return &cloudwatchlogs.CreateLogStreamOutput{}, nil - }, - clg: func(_ *cloudwatchlogs.CreateLogGroupInput) (*cloudwatchlogs.CreateLogGroupOutput, error) { - return &cloudwatchlogs.CreateLogGroupOutput{}, nil - }, - dlg: func(_ *cloudwatchlogs.DescribeLogGroupsInput) (*cloudwatchlogs.DescribeLogGroupsOutput, error) { - return &cloudwatchlogs.DescribeLogGroupsOutput{}, nil - }, - } - - target := Target{Group: "test-group", Stream: "test-stream"} - - // Create components - workerPool := NewWorkerPool(5) - retryHeap := NewRetryHeap(10, logger) - tm := NewTargetManager(logger, mockService) - defer workerPool.Stop() - defer retryHeap.Stop() - - // Create RetryHeapProcessor with very short max retry duration for testing - maxRetryDuration := 100 * time.Millisecond // Normally 14 days - retryHeapProcessor := NewRetryHeapProcessor(retryHeap, workerPool, mockService, tm, logger, maxRetryDuration, nil) - retryHeapProcessor.Start() - defer retryHeapProcessor.Stop() - - // Create pusher/queue - var wg sync.WaitGroup - flushTimeout := 50 * time.Millisecond - pusher := NewPusher(logger, target, mockService, tm, nil, workerPool, flushTimeout, maxRetryDuration, &wg, retryHeap) - defer pusher.Stop() - - // Add first event - will fail and halt the queue - pusher.AddEvent(newStubLogEvent("first message", time.Now())) - - // Wait for batch to be sent, fail, and go to retry heap - time.Sleep(200 * time.Millisecond) - - // Verify at least one send attempt was made - assert.Greater(t, 
sendCount.Load(), int32(0), "Should have attempted to send") - - // Add second event - should be queued but blocked by circuit breaker - pusher.AddEvent(newStubLogEvent("second message", time.Now())) - - initialSendCount := sendCount.Load() - - // Wait for the batch in retry heap to expire - time.Sleep(200 * time.Millisecond) - - // After expiry, the RetryHeapProcessor should drop the expired batch - // but currently it only calls updateState(), not done() - // This means the circuit breaker remains closed and the second batch never gets sent - - // Add a third event to trigger another flush - pusher.AddEvent(newStubLogEvent("third message", time.Now())) - - // Wait for potential sends - time.Sleep(200 * time.Millisecond) - - finalSendCount := sendCount.Load() - - // BUG: The second and third batches should have been attempted after the first batch expired - // but they won't be because the queue remains halted forever - assert.Equal(t, initialSendCount, finalSendCount, - "No new send attempts should occur because the circuit breaker is permanently closed. "+ - "This demonstrates the bug: when a batch expires in RetryHeapProcessor, it calls "+ - "updateState() but not done(), so the queue never resumes. 
The target is blocked forever.") -} diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap.go b/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap.go index b7202d648c1..0c5f0f3e547 100644 --- a/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap.go +++ b/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap.go @@ -131,7 +131,7 @@ func (rh *retryHeap) Size() int { func (rh *retryHeap) Stop() { rh.mutex.Lock() defer rh.mutex.Unlock() - + if rh.stopped { return } @@ -220,6 +220,7 @@ func (p *RetryHeapProcessor) processReadyMessages() { if batch.isExpired(p.maxRetryDuration) { p.logger.Errorf("Dropping expired batch for %v/%v", batch.Group, batch.Stream) batch.updateState() + batch.done() // Resume circuit breaker to allow target to process new batches continue } diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_expiry_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_expiry_test.go index 4571a107779..178a2bf8ae4 100644 --- a/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_expiry_test.go +++ b/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_expiry_test.go @@ -48,10 +48,10 @@ func TestRetryHeapProcessorExpiredBatchShouldResume(t *testing.T) { workerPool := NewWorkerPool(5) tm := NewTargetManager(logger, mockService) maxRetryDuration := 50 * time.Millisecond // Normally 14 days - + retryHeapProcessor := NewRetryHeapProcessor(retryHeap, workerPool, mockService, tm, logger, maxRetryDuration, nil) retryHeapProcessor.Start() - + defer retryHeap.Stop() defer workerPool.Stop() defer retryHeapProcessor.Stop() @@ -59,46 +59,46 @@ func TestRetryHeapProcessorExpiredBatchShouldResume(t *testing.T) { // Create a batch that will expire batch := newLogEventBatch(target, nil) batch.append(newLogEvent(time.Now(), "test message", nil)) - + // Set up callbacks to track circuit breaker state var circuitBreakerHalted atomic.Bool var circuitBreakerResumed atomic.Bool - + batch.addFailCallback(func() { 
circuitBreakerHalted.Store(true) }) - + batch.addDoneCallback(func() { circuitBreakerResumed.Store(true) }) - + // Initialize the batch's start time to make it already expired batch.initializeStartTime() batch.startTime = time.Now().Add(-100 * time.Millisecond) // Already expired - + // Update retry metadata to simulate a failed attempt and make it ready for retry batch.updateRetryMetadata(&cloudwatchlogs.ServiceUnavailableException{}) // Set nextRetryTime to past so it's ready for retry batch.nextRetryTime = time.Now().Add(-10 * time.Millisecond) - + // Push the expired batch to the retry heap err := retryHeap.Push(batch) assert.NoError(t, err) - + // Verify batch is in the heap assert.Equal(t, 1, retryHeap.Size()) - + // Wait for RetryHeapProcessor to process the expired batch time.Sleep(200 * time.Millisecond) - + // The batch should have been removed from the heap assert.Equal(t, 0, retryHeap.Size(), "Expired batch should be removed from heap") - + // The circuit breaker SHOULD be resumed when the batch expires // This allows the target to continue processing new batches after the bad batch is dropped assert.True(t, circuitBreakerResumed.Load(), "Circuit breaker should be resumed after batch expiry. "+ - "When a batch is retried for 14 days and eventually dropped, "+ - "the target must be unblocked to allow new batches to be processed. "+ - "Otherwise the target remains blocked forever.") + "When a batch is retried for 14 days and eventually dropped, "+ + "the target must be unblocked to allow new batches to be processed. 
"+ + "Otherwise the target remains blocked forever.") } From f4c7620447407ca47ee554a39d74f67f16e6198a Mon Sep 17 00:00:00 2001 From: Marcus Mann Date: Tue, 10 Feb 2026 12:02:29 -0500 Subject: [PATCH 20/50] Remove configurable maxRetryTimeout in favor of default hardcoded value --- .../outputs/cloudwatchlogs/cloudwatchlogs.go | 2 +- .../cloudwatchlogs/internal/pusher/batch.go | 16 +++++++++------- .../cloudwatchlogs/internal/pusher/pool.go | 11 ----------- .../cloudwatchlogs/internal/pusher/pusher.go | 7 +++---- .../cloudwatchlogs/internal/pusher/retryheap.go | 4 ++-- .../cloudwatchlogs/internal/pusher/sender.go | 17 +---------------- 6 files changed, 16 insertions(+), 41 deletions(-) diff --git a/plugins/outputs/cloudwatchlogs/cloudwatchlogs.go b/plugins/outputs/cloudwatchlogs/cloudwatchlogs.go index ee4c8422632..45bcc947d4d 100644 --- a/plugins/outputs/cloudwatchlogs/cloudwatchlogs.go +++ b/plugins/outputs/cloudwatchlogs/cloudwatchlogs.go @@ -183,7 +183,7 @@ func (c *CloudWatchLogs) getDest(t pusher.Target, logSrc logs.LogSrc) *cwDest { } c.targetManager = pusher.NewTargetManager(c.Log, client) }) - p := pusher.NewPusher(c.Log, t, client, c.targetManager, logSrc, c.workerPool, c.ForceFlushInterval.Duration, maxRetryTimeout, &c.pusherWaitGroup, c.retryHeap) + p := pusher.NewPusher(c.Log, t, client, c.targetManager, logSrc, c.workerPool, c.ForceFlushInterval.Duration, &c.pusherWaitGroup, c.Concurrency, c.retryHeap) cwd := &cwDest{ pusher: p, retryer: logThrottleRetryer, diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/batch.go b/plugins/outputs/cloudwatchlogs/internal/pusher/batch.go index 3c83be15a0b..752eeba4b03 100644 --- a/plugins/outputs/cloudwatchlogs/internal/pusher/batch.go +++ b/plugins/outputs/cloudwatchlogs/internal/pusher/batch.go @@ -18,6 +18,9 @@ import ( // CloudWatch Logs PutLogEvents API limits // Taken from https://docs.aws.amazon.com/AmazonCloudWatchLogs/latest/APIReference/API_PutLogEvents.html const ( + // maxRetryTimeout is the 
default retry timeout for CloudWatch Logs operations + maxRetryTimeout = 14*24*time.Hour + 10*time.Minute + // The maximum batch size in bytes. This size is calculated as the sum of all event messages in UTF-8, // plus 26 bytes for each log event. reqSizeLimit = 1024 * 1024 @@ -109,6 +112,7 @@ type logEventBatch struct { retryCountLong int // Number of retries using long delay strategy startTime time.Time // Time of first request (for max retry duration calculation) nextRetryTime time.Time // When this batch should be retried next + expireAfter time.Time // When this batch expires and should be dropped lastError error // Last error encountered } @@ -252,11 +256,12 @@ func (t byTimestamp) Less(i, j int) bool { return *t[i].Timestamp < *t[j].Timestamp } -// initializeStartTime sets the start time if not already set. +// initializeStartTime sets the start time and expiration time if not already set. func (b *logEventBatch) initializeStartTime() { if b.startTime.IsZero() { b.startTime = time.Now() } + b.expireAfter = b.startTime.Add(maxRetryTimeout) } // updateRetryMetadata updates the retry metadata after a failed send attempt. @@ -279,12 +284,9 @@ func (b *logEventBatch) updateRetryMetadata(err error) { b.nextRetryTime = time.Now().Add(wait) } -// isExpired checks if the batch has exceeded the maximum retry duration. -func (b *logEventBatch) isExpired(maxRetryDuration time.Duration) bool { - if b.startTime.IsZero() { - return false - } - return time.Since(b.startTime) > maxRetryDuration +// isExpired checks if the batch has exceeded its expiration time. +func (b *logEventBatch) isExpired() bool { + return !b.expireAfter.IsZero() && time.Now().After(b.expireAfter) } // isReadyForRetry checks if enough time has passed since the last failure to retry this batch. 
diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/pool.go b/plugins/outputs/cloudwatchlogs/internal/pusher/pool.go index 1d6edf57e90..fb15ba9fab1 100644 --- a/plugins/outputs/cloudwatchlogs/internal/pusher/pool.go +++ b/plugins/outputs/cloudwatchlogs/internal/pusher/pool.go @@ -6,7 +6,6 @@ package pusher import ( "sync" "sync/atomic" - "time" ) type WorkerPool interface { @@ -113,13 +112,3 @@ func (s *senderPool) Stop() { // workerpool is stopped by the plugin s.sender.Stop() } - -// SetRetryDuration sets the retry duration on the wrapped Sender. -func (s *senderPool) SetRetryDuration(duration time.Duration) { - s.sender.SetRetryDuration(duration) -} - -// RetryDuration returns the retry duration of the wrapped Sender. -func (s *senderPool) RetryDuration() time.Duration { - return s.sender.RetryDuration() -} diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/pusher.go b/plugins/outputs/cloudwatchlogs/internal/pusher/pusher.go index 43310d6861e..6a4c9f2df24 100644 --- a/plugins/outputs/cloudwatchlogs/internal/pusher/pusher.go +++ b/plugins/outputs/cloudwatchlogs/internal/pusher/pusher.go @@ -32,11 +32,11 @@ func NewPusher( entityProvider logs.LogEntityProvider, workerPool WorkerPool, flushTimeout time.Duration, - retryDuration time.Duration, wg *sync.WaitGroup, + _ int, retryHeap RetryHeap, ) *Pusher { - s := createSender(logger, service, targetManager, workerPool, retryDuration, retryHeap) + s := createSender(logger, service, targetManager, workerPool, retryHeap) q := newQueue(logger, target, flushTimeout, entityProvider, s, wg) targetManager.PutRetentionPolicy(target) @@ -61,10 +61,9 @@ func createSender( service cloudWatchLogsService, targetManager TargetManager, workerPool WorkerPool, - retryDuration time.Duration, retryHeap RetryHeap, ) Sender { - s := newSender(logger, service, targetManager, retryDuration, retryHeap) + s := newSender(logger, service, targetManager, retryHeap) if workerPool == nil { return s } diff --git 
a/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap.go b/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap.go index 0c5f0f3e547..025a3063ad1 100644 --- a/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap.go +++ b/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap.go @@ -155,7 +155,7 @@ type RetryHeapProcessor struct { func NewRetryHeapProcessor(retryHeap RetryHeap, workerPool WorkerPool, service cloudWatchLogsService, targetManager TargetManager, logger telegraf.Logger, maxRetryDuration time.Duration, retryer *retryer.LogThrottleRetryer) *RetryHeapProcessor { // Create processor's own sender and senderPool // Pass retryHeap so failed batches go back to RetryHeap instead of blocking on sync retry - sender := newSender(logger, service, targetManager, maxRetryDuration, retryHeap) + sender := newSender(logger, service, targetManager, retryHeap) senderPool := newSenderPool(workerPool, sender) return &RetryHeapProcessor{ @@ -217,7 +217,7 @@ func (p *RetryHeapProcessor) processReadyMessages() { for _, batch := range readyBatches { // Check if batch has expired - if batch.isExpired(p.maxRetryDuration) { + if batch.isExpired() { p.logger.Errorf("Dropping expired batch for %v/%v", batch.Group, batch.Stream) batch.updateState() batch.done() // Resume circuit breaker to allow target to process new batches diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/sender.go b/plugins/outputs/cloudwatchlogs/internal/pusher/sender.go index 1b5f13fd481..ad38d7960f7 100644 --- a/plugins/outputs/cloudwatchlogs/internal/pusher/sender.go +++ b/plugins/outputs/cloudwatchlogs/internal/pusher/sender.go @@ -5,7 +5,6 @@ package pusher import ( "errors" - "sync/atomic" "time" "github.com/aws/aws-sdk-go/aws/awserr" @@ -24,14 +23,11 @@ type cloudWatchLogsService interface { type Sender interface { Send(*logEventBatch) - SetRetryDuration(time.Duration) - RetryDuration() time.Duration Stop() } type sender struct { service cloudWatchLogsService - retryDuration 
atomic.Value targetManager TargetManager logger telegraf.Logger stopCh chan struct{} @@ -45,7 +41,6 @@ func newSender( logger telegraf.Logger, service cloudWatchLogsService, targetManager TargetManager, - retryDuration time.Duration, retryHeap RetryHeap, ) Sender { s := &sender{ @@ -56,7 +51,6 @@ func newSender( stopped: false, retryHeap: retryHeap, } - s.retryDuration.Store(retryDuration) return s } @@ -118,7 +112,7 @@ func (s *sender) Send(batch *logEventBatch) { // Check if retry would exceed max duration totalRetries := batch.retryCountShort + batch.retryCountLong - 1 - if batch.nextRetryTime.After(batch.startTime.Add(s.RetryDuration())) { + if batch.isExpired() { s.logger.Errorf("All %v retries to %v/%v failed for PutLogEvents, request dropped.", totalRetries, batch.Group, batch.Stream) batch.updateState() return @@ -158,12 +152,3 @@ func (s *sender) Stop() { s.stopped = true } -// SetRetryDuration sets the maximum duration for retrying failed log sends. -func (s *sender) SetRetryDuration(retryDuration time.Duration) { - s.retryDuration.Store(retryDuration) -} - -// RetryDuration returns the current maximum retry duration. 
-func (s *sender) RetryDuration() time.Duration { - return s.retryDuration.Load().(time.Duration) -} From c791abd5dca8e4a680af18cb67d37bcfe7d13911 Mon Sep 17 00:00:00 2001 From: Marcus Mann Date: Tue, 10 Feb 2026 12:21:43 -0500 Subject: [PATCH 21/50] Update tests for removed retryDuration parameter --- .../cloudwatchlogs/internal/pusher/batch.go | 2 +- .../internal/pusher/batch_test.go | 6 +- .../internal/pusher/circuitbreaker_test.go | 5 +- .../internal/pusher/pool_test.go | 8 +-- .../internal/pusher/pusher_test.go | 4 +- .../internal/pusher/queue_test.go | 60 +++++-------------- .../internal/pusher/retryheap_expiry_test.go | 2 +- .../internal/pusher/sender_test.go | 27 +++++---- 8 files changed, 43 insertions(+), 71 deletions(-) diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/batch.go b/plugins/outputs/cloudwatchlogs/internal/pusher/batch.go index 752eeba4b03..d68dfdaddd6 100644 --- a/plugins/outputs/cloudwatchlogs/internal/pusher/batch.go +++ b/plugins/outputs/cloudwatchlogs/internal/pusher/batch.go @@ -260,8 +260,8 @@ func (t byTimestamp) Less(i, j int) bool { func (b *logEventBatch) initializeStartTime() { if b.startTime.IsZero() { b.startTime = time.Now() + b.expireAfter = b.startTime.Add(maxRetryTimeout) } - b.expireAfter = b.startTime.Add(maxRetryTimeout) } // updateRetryMetadata updates the retry metadata after a failed send attempt. 
diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/batch_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/batch_test.go index e433cc74902..2f8db4f689c 100644 --- a/plugins/outputs/cloudwatchlogs/internal/pusher/batch_test.go +++ b/plugins/outputs/cloudwatchlogs/internal/pusher/batch_test.go @@ -411,7 +411,7 @@ func TestBatchRetryMetadata(t *testing.T) { // Test initial state assert.True(t, batch.startTime.IsZero()) assert.True(t, batch.isReadyForRetry()) - assert.False(t, batch.isExpired(time.Hour)) + assert.False(t, batch.isExpired()) // Test initializeStartTime batch.initializeStartTime() @@ -433,6 +433,6 @@ func TestBatchRetryMetadata(t *testing.T) { assert.True(t, batch.isReadyForRetry()) // Test isExpired - batch.startTime = time.Now().Add(-25 * time.Hour) - assert.True(t, batch.isExpired(24*time.Hour)) + batch.expireAfter = time.Now().Add(-1 * time.Hour) + assert.True(t, batch.isExpired()) } diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/circuitbreaker_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/circuitbreaker_test.go index d541e9a46aa..dd0651020a0 100644 --- a/plugins/outputs/cloudwatchlogs/internal/pusher/circuitbreaker_test.go +++ b/plugins/outputs/cloudwatchlogs/internal/pusher/circuitbreaker_test.go @@ -60,10 +60,9 @@ func TestCircuitBreakerBlocksTargetAfterFailure(t *testing.T) { var wg sync.WaitGroup flushTimeout := 50 * time.Millisecond - retryDuration := time.Hour - failingPusher := NewPusher(logger, failingTarget, service, tm, nil, workerPool, flushTimeout, retryDuration, &wg, retryHeap) - healthyPusher := NewPusher(logger, healthyTarget, service, tm, nil, workerPool, flushTimeout, retryDuration, &wg, retryHeap) + failingPusher := NewPusher(logger, failingTarget, service, tm, nil, workerPool, flushTimeout, &wg, 2, retryHeap) + healthyPusher := NewPusher(logger, healthyTarget, service, tm, nil, workerPool, flushTimeout, &wg, 2, retryHeap) defer failingPusher.Stop() defer healthyPusher.Stop() diff --git 
a/plugins/outputs/cloudwatchlogs/internal/pusher/pool_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/pool_test.go index 16fe906a778..9abf746b5fd 100644 --- a/plugins/outputs/cloudwatchlogs/internal/pusher/pool_test.go +++ b/plugins/outputs/cloudwatchlogs/internal/pusher/pool_test.go @@ -107,14 +107,10 @@ func TestSenderPool(t *testing.T) { logger := testutil.NewNopLogger() mockService := new(mockLogsService) mockService.On("PutLogEvents", mock.Anything).Return(&cloudwatchlogs.PutLogEventsOutput{}, nil) - s := newSender(logger, mockService, nil, time.Second, nil) + s := newSender(logger, mockService, nil, nil) p := NewWorkerPool(12) sp := newSenderPool(p, s) - assert.Equal(t, time.Second, sp.RetryDuration()) - sp.SetRetryDuration(time.Minute) - assert.Equal(t, time.Minute, sp.RetryDuration()) - var completed atomic.Int32 var evts []*logEvent for i := 0; i < 200; i++ { @@ -144,7 +140,7 @@ func TestSenderPoolRetryHeap(_ *testing.T) { retryHeap := NewRetryHeap(10, logger) defer retryHeap.Stop() - s := newSender(logger, mockService, nil, time.Second, retryHeap) + s := newSender(logger, mockService, nil, retryHeap) p := NewWorkerPool(12) defer p.Stop() diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/pusher_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/pusher_test.go index ef5f514501c..41640490b1e 100644 --- a/plugins/outputs/cloudwatchlogs/internal/pusher/pusher_test.go +++ b/plugins/outputs/cloudwatchlogs/internal/pusher/pusher_test.go @@ -111,8 +111,8 @@ func setupPusher(t *testing.T, workerPool WorkerPool, wg *sync.WaitGroup) *Pushe nil, workerPool, time.Second, - time.Minute, wg, + 1, // concurrency nil, // retryHeap ) @@ -148,8 +148,8 @@ func TestPusherRetryHeap(t *testing.T) { nil, workerPool, time.Second, - time.Minute, &wg, + 2, // concurrency > 1 retryHeap, ) diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/queue_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/queue_test.go index 80c343f3ef7..d4b64a17a4c 
100644 --- a/plugins/outputs/cloudwatchlogs/internal/pusher/queue_test.go +++ b/plugins/outputs/cloudwatchlogs/internal/pusher/queue_test.go @@ -123,7 +123,7 @@ func TestAddSingleEvent_WithAccountId(t *testing.T) { } ep := newMockEntityProvider(expectedEntity) - q, sender := testPreparation(t, -1, &s, 1*time.Hour, 2*time.Hour, ep, &wg) + q, sender := testPreparation(t, -1, &s, 1*time.Hour, ep, &wg) q.AddEvent(newStubLogEvent("MSG", time.Now())) require.False(t, called.Load(), "PutLogEvents has been called too fast, it should wait until FlushTimeout.") @@ -160,7 +160,7 @@ func TestAddSingleEvent_WithoutAccountId(t *testing.T) { } ep := newMockEntityProvider(nil) - q, sender := testPreparation(t, -1, &s, 1*time.Hour, 2*time.Hour, ep, &wg) + q, sender := testPreparation(t, -1, &s, 1*time.Hour, ep, &wg) q.AddEvent(newStubLogEvent("MSG", time.Now())) require.False(t, called.Load(), "PutLogEvents has been called too fast, it should wait until FlushTimeout.") @@ -190,7 +190,7 @@ func TestStopQueueWouldDoFinalSend(t *testing.T) { return &cloudwatchlogs.PutLogEventsOutput{}, nil } - q, sender := testPreparation(t, -1, &s, 1*time.Hour, 2*time.Hour, nil, &wg) + q, sender := testPreparation(t, -1, &s, 1*time.Hour, nil, &wg) q.AddEvent(newStubLogEvent("MSG", time.Now())) time.Sleep(10 * time.Millisecond) @@ -214,7 +214,7 @@ func TestStopPusherWouldStopRetries(t *testing.T) { } logSink := testutil.NewLogSink() - q, sender := testPreparationWithLogger(t, logSink, -1, &s, 1*time.Hour, 2*time.Hour, nil, &wg) + q, sender := testPreparationWithLogger(t, logSink, -1, &s, 1*time.Hour, nil, &wg) q.AddEvent(newStubLogEvent("MSG", time.Now())) time.Sleep(10 * time.Millisecond) @@ -256,7 +256,7 @@ func TestLongMessageHandling(t *testing.T) { return &cloudwatchlogs.PutLogEventsOutput{}, nil } - q, sender := testPreparation(t, -1, &s, 1*time.Hour, 2*time.Hour, nil, &wg) + q, sender := testPreparation(t, -1, &s, 1*time.Hour, nil, &wg) q.AddEvent(newStubLogEvent(longMsg, time.Now())) 
triggerSend(t, q) @@ -285,7 +285,7 @@ func TestRequestIsLessThan1MB(t *testing.T) { return &cloudwatchlogs.PutLogEventsOutput{}, nil } - q, sender := testPreparation(t, -1, &s, 1*time.Hour, 2*time.Hour, nil, &wg) + q, sender := testPreparation(t, -1, &s, 1*time.Hour, nil, &wg) for i := 0; i < 8; i++ { q.AddEvent(newStubLogEvent(longMsg, time.Now())) } @@ -311,7 +311,7 @@ func TestRequestIsLessThan10kEvents(t *testing.T) { return &cloudwatchlogs.PutLogEventsOutput{}, nil } - q, sender := testPreparation(t, -1, &s, 1*time.Hour, 2*time.Hour, nil, &wg) + q, sender := testPreparation(t, -1, &s, 1*time.Hour, nil, &wg) for i := 0; i < 30000; i++ { q.AddEvent(newStubLogEvent(msg, time.Now())) } @@ -337,7 +337,7 @@ func TestTimestampPopulation(t *testing.T) { return &cloudwatchlogs.PutLogEventsOutput{}, nil } - q, sender := testPreparation(t, -1, &s, 1*time.Hour, 2*time.Hour, nil, &wg) + q, sender := testPreparation(t, -1, &s, 1*time.Hour, nil, &wg) for i := 0; i < 3; i++ { q.AddEvent(newStubLogEvent("msg", time.Time{})) } @@ -361,7 +361,7 @@ func TestIgnoreOutOfTimeRangeEvent(t *testing.T) { } logSink := testutil.NewLogSink() - q, sender := testPreparationWithLogger(t, logSink, -1, &s, 10*time.Millisecond, 2*time.Hour, nil, &wg) + q, sender := testPreparationWithLogger(t, logSink, -1, &s, 10*time.Millisecond, nil, &wg) q.AddEvent(newStubLogEvent("MSG", time.Now().Add(-15*24*time.Hour))) q.AddEventNonBlocking(newStubLogEvent("MSG", time.Now().Add(2*time.Hour+1*time.Minute))) @@ -414,7 +414,7 @@ func TestAddMultipleEvents(t *testing.T) { )) } evts[10], evts[90] = evts[90], evts[10] // make events out of order - q, sender := testPreparation(t, -1, &s, 1*time.Hour, 2*time.Hour, nil, &wg) + q, sender := testPreparation(t, -1, &s, 1*time.Hour, nil, &wg) for _, e := range evts { q.AddEvent(e) } @@ -466,7 +466,7 @@ func TestSendReqWhenEventsSpanMoreThan24Hrs(t *testing.T) { return nil, nil } - q, sender := testPreparation(t, -1, &s, 1*time.Hour, 2*time.Hour, nil, &wg) + q, sender 
:= testPreparation(t, -1, &s, 1*time.Hour, nil, &wg) q.AddEvent(newStubLogEvent("MSG 25hrs ago", time.Now().Add(-25*time.Hour))) q.AddEvent(newStubLogEvent("MSG 24hrs ago", time.Now().Add(-24*time.Hour))) q.AddEvent(newStubLogEvent("MSG 23hrs ago", time.Now().Add(-23*time.Hour))) @@ -496,7 +496,7 @@ func TestUnhandledErrorWouldNotResend(t *testing.T) { } logSink := testutil.NewLogSink() - q, sender := testPreparationWithLogger(t, logSink, -1, &s, 10*time.Millisecond, 2*time.Hour, nil, &wg) + q, sender := testPreparationWithLogger(t, logSink, -1, &s, 10*time.Millisecond, nil, &wg) q.AddEvent(newStubLogEvent("msg", time.Now())) time.Sleep(2 * time.Second) @@ -542,7 +542,7 @@ func TestCreateLogGroupAndLogStreamWhenNotFound(t *testing.T) { } logSink := testutil.NewLogSink() - q, sender := testPreparationWithLogger(t, logSink, -1, &s, 1*time.Hour, 2*time.Hour, nil, &wg) + q, sender := testPreparationWithLogger(t, logSink, -1, &s, 1*time.Hour, nil, &wg) var eventWG sync.WaitGroup eventWG.Add(1) q.AddEvent(&stubLogEvent{message: "msg", timestamp: time.Now(), done: eventWG.Done}) @@ -580,7 +580,7 @@ func TestLogRejectedLogEntryInfo(t *testing.T) { } logSink := testutil.NewLogSink() - q, sender := testPreparationWithLogger(t, logSink, -1, &s, 1*time.Hour, 2*time.Hour, nil, &wg) + q, sender := testPreparationWithLogger(t, logSink, -1, &s, 1*time.Hour, nil, &wg) var eventWG sync.WaitGroup eventWG.Add(1) q.AddEvent(&stubLogEvent{message: "msg", timestamp: time.Now(), done: eventWG.Done}) @@ -630,7 +630,7 @@ func TestAddEventNonBlocking(t *testing.T) { start.Add(time.Duration(i)*time.Millisecond), )) } - q, sender := testPreparation(t, -1, &s, 1*time.Hour, 2*time.Hour, nil, &wg) + q, sender := testPreparation(t, -1, &s, 1*time.Hour, nil, &wg) time.Sleep(200 * time.Millisecond) // Wait until pusher started, merge channel is blocked for _, e := range evts { @@ -646,31 +646,6 @@ func TestAddEventNonBlocking(t *testing.T) { wg.Wait() } -func 
TestResendWouldStopAfterExhaustedRetries(t *testing.T) { - t.Parallel() - var wg sync.WaitGroup - var s stubLogsService - var cnt atomic.Int32 - - s.ple = func(*cloudwatchlogs.PutLogEventsInput) (*cloudwatchlogs.PutLogEventsOutput, error) { - cnt.Add(1) - return nil, &cloudwatchlogs.ServiceUnavailableException{} - } - - logSink := testutil.NewLogSink() - q, sender := testPreparationWithLogger(t, logSink, -1, &s, 10*time.Millisecond, time.Second, nil, &wg) - q.AddEvent(newStubLogEvent("msg", time.Now())) - time.Sleep(2 * time.Second) - - logLines := logSink.Lines() - lastLine := logLines[len(logLines)-1] - expected := fmt.Sprintf("All %v retries to G/S failed for PutLogEvents, request dropped.", cnt.Load()-1) - require.True(t, strings.HasSuffix(lastLine, expected), fmt.Sprintf("Expecting error log to end with request dropped, but received '%s' in the log", logSink.String())) - - q.Stop() - sender.Stop() - wg.Wait() -} // Cannot call q.send() directly as it would cause a race condition. Reset last sent time and trigger flush. 
func triggerSend(t *testing.T, q *queue) { @@ -684,7 +659,6 @@ func testPreparation( retention int, service cloudWatchLogsService, flushTimeout time.Duration, - retryDuration time.Duration, entityProvider logs.LogEntityProvider, wg *sync.WaitGroup, ) (*queue, Sender) { @@ -694,7 +668,6 @@ func testPreparation( retention, service, flushTimeout, - retryDuration, entityProvider, wg, ) @@ -706,13 +679,12 @@ func testPreparationWithLogger( retention int, service cloudWatchLogsService, flushTimeout time.Duration, - retryDuration time.Duration, entityProvider logs.LogEntityProvider, wg *sync.WaitGroup, ) (*queue, Sender) { t.Helper() tm := NewTargetManager(logger, service) - s := newSender(logger, service, tm, retryDuration, nil) + s := newSender(logger, service, tm, nil) q := newQueue( logger, Target{"G", "S", util.StandardLogGroupClass, retention}, diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_expiry_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_expiry_test.go index 178a2bf8ae4..8726a329890 100644 --- a/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_expiry_test.go +++ b/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_expiry_test.go @@ -74,7 +74,7 @@ func TestRetryHeapProcessorExpiredBatchShouldResume(t *testing.T) { // Initialize the batch's start time to make it already expired batch.initializeStartTime() - batch.startTime = time.Now().Add(-100 * time.Millisecond) // Already expired + batch.expireAfter = time.Now().Add(-10 * time.Millisecond) // Already expired // Update retry metadata to simulate a failed attempt and make it ready for retry batch.updateRetryMetadata(&cloudwatchlogs.ServiceUnavailableException{}) diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/sender_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/sender_test.go index 72072837431..fc52e673a88 100644 --- a/plugins/outputs/cloudwatchlogs/internal/pusher/sender_test.go +++ 
b/plugins/outputs/cloudwatchlogs/internal/pusher/sender_test.go @@ -80,7 +80,7 @@ func TestSender(t *testing.T) { mockManager := new(mockTargetManager) mockService.On("PutLogEvents", mock.Anything).Return(&cloudwatchlogs.PutLogEventsOutput{}, nil).Once() - s := newSender(logger, mockService, mockManager, time.Second, nil) + s := newSender(logger, mockService, mockManager, nil) s.Send(batch) s.Stop() @@ -103,7 +103,7 @@ func TestSender(t *testing.T) { mockManager := new(mockTargetManager) mockService.On("PutLogEvents", mock.Anything).Return(&cloudwatchlogs.PutLogEventsOutput{RejectedLogEventsInfo: rejectedInfo}, nil).Once() - s := newSender(logger, mockService, mockManager, time.Second, nil) + s := newSender(logger, mockService, mockManager, nil) s.Send(batch) s.Stop() @@ -122,7 +122,7 @@ func TestSender(t *testing.T) { mockManager.On("InitTarget", mock.Anything).Return(nil).Once() mockService.On("PutLogEvents", mock.Anything).Return(&cloudwatchlogs.PutLogEventsOutput{}, nil).Once() - s := newSender(logger, mockService, mockManager, time.Second, nil) + s := newSender(logger, mockService, mockManager, nil) s.Send(batch) s.Stop() @@ -149,7 +149,7 @@ func TestSender(t *testing.T) { mockService.On("PutLogEvents", mock.Anything). Return(&cloudwatchlogs.PutLogEventsOutput{}, &cloudwatchlogs.InvalidParameterException{}).Once() - s := newSender(logger, mockService, mockManager, time.Second, nil) + s := newSender(logger, mockService, mockManager, nil) s.Send(batch) s.Stop() @@ -177,7 +177,7 @@ func TestSender(t *testing.T) { mockService.On("PutLogEvents", mock.Anything). Return(&cloudwatchlogs.PutLogEventsOutput{}, &cloudwatchlogs.DataAlreadyAcceptedException{}).Once() - s := newSender(logger, mockService, mockManager, time.Second, nil) + s := newSender(logger, mockService, mockManager, nil) s.Send(batch) s.Stop() @@ -205,7 +205,7 @@ func TestSender(t *testing.T) { mockService.On("PutLogEvents", mock.Anything). 
Return(&cloudwatchlogs.PutLogEventsOutput{}, errors.New("test")).Once() - s := newSender(logger, mockService, mockManager, time.Second, nil) + s := newSender(logger, mockService, mockManager, nil) s.Send(batch) s.Stop() @@ -225,7 +225,7 @@ func TestSender(t *testing.T) { mockService.On("PutLogEvents", mock.Anything). Return(&cloudwatchlogs.PutLogEventsOutput{}, nil).Once() - s := newSender(logger, mockService, mockManager, time.Second, nil) + s := newSender(logger, mockService, mockManager, nil) s.Send(batch) s.Stop() @@ -251,7 +251,12 @@ func TestSender(t *testing.T) { mockService.On("PutLogEvents", mock.Anything). Return(&cloudwatchlogs.PutLogEventsOutput{}, awserr.New("SomeAWSError", "Some AWS error", nil)).Once() - s := newSender(logger, mockService, mockManager, 100*time.Millisecond, nil) + s := newSender(logger, mockService, mockManager, nil) + + // Set expireAfter to past time so batch expires immediately after first retry + batch.initializeStartTime() + batch.expireAfter = time.Now().Add(-1 * time.Hour) + s.Send(batch) s.Stop() @@ -279,7 +284,7 @@ func TestSender(t *testing.T) { mockService.On("PutLogEvents", mock.Anything). 
Return(&cloudwatchlogs.PutLogEventsOutput{}, awserr.New("SomeAWSError", "Some AWS error", nil)).Once() - s := newSender(logger, mockService, mockManager, time.Second, nil) + s := newSender(logger, mockService, mockManager, nil) go func() { time.Sleep(50 * time.Millisecond) @@ -302,7 +307,7 @@ func TestSenderConcurrencyWithRetryHeap(t *testing.T) { retryHeap := NewRetryHeap(10, logger) defer retryHeap.Stop() - s := newSender(logger, mockService, mockManager, time.Hour, retryHeap) + s := newSender(logger, mockService, mockManager, retryHeap) batch := newLogEventBatch(Target{Group: "test-group", Stream: "test-stream"}, nil) batch.append(newLogEvent(time.Now(), "Test message", nil)) @@ -325,7 +330,7 @@ func TestSenderConcurrencyFallbackToSync(t *testing.T) { mockService.On("PutLogEvents", mock.Anything).Return(&cloudwatchlogs.PutLogEventsOutput{}, nil).Once() // Concurrency enabled but nil RetryHeap should fall back to sync - s := newSender(logger, mockService, mockManager, 2*time.Second, nil) + s := newSender(logger, mockService, mockManager, nil) batch := newLogEventBatch(Target{Group: "test-group", Stream: "test-stream"}, nil) batch.append(newLogEvent(time.Now(), "Test message", nil)) From 4d798e2a7ec89ded1b210cbca5631c56fb65bfbb Mon Sep 17 00:00:00 2001 From: Marcus Mann Date: Tue, 10 Feb 2026 13:03:27 -0500 Subject: [PATCH 22/50] Add test for initializeStartTime idempotency Verifies that startTime and expireAfter are only set once on first call and remain unchanged on subsequent calls, ensuring the 14-day expiration is measured from the first send attempt, not from each retry. 
--- .../internal/pusher/batch_test.go | 34 +++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/batch_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/batch_test.go index 2f8db4f689c..cc031362adc 100644 --- a/plugins/outputs/cloudwatchlogs/internal/pusher/batch_test.go +++ b/plugins/outputs/cloudwatchlogs/internal/pusher/batch_test.go @@ -436,3 +436,37 @@ func TestBatchRetryMetadata(t *testing.T) { batch.expireAfter = time.Now().Add(-1 * time.Hour) assert.True(t, batch.isExpired()) } + +func TestBatchInitializeStartTimeIdempotent(t *testing.T) { + batch := newLogEventBatch(Target{Group: "test-group", Stream: "test-stream"}, nil) + + // Verify initial state + assert.True(t, batch.startTime.IsZero()) + assert.True(t, batch.expireAfter.IsZero()) + + // First call should set both values + batch.initializeStartTime() + assert.False(t, batch.startTime.IsZero()) + assert.False(t, batch.expireAfter.IsZero()) + + // Capture the values + firstStartTime := batch.startTime + firstExpireAfter := batch.expireAfter + + // Verify expireAfter is set to startTime + maxRetryTimeout + expectedExpireAfter := firstStartTime.Add(maxRetryTimeout) + assert.Equal(t, expectedExpireAfter, firstExpireAfter) + + // Wait a bit to ensure time has passed + time.Sleep(10 * time.Millisecond) + + // Second call should NOT change the values (idempotent) + batch.initializeStartTime() + assert.Equal(t, firstStartTime, batch.startTime, "startTime should not change on second call") + assert.Equal(t, firstExpireAfter, batch.expireAfter, "expireAfter should not change on second call") + + // Third call should also not change the values + batch.initializeStartTime() + assert.Equal(t, firstStartTime, batch.startTime, "startTime should not change on third call") + assert.Equal(t, firstExpireAfter, batch.expireAfter, "expireAfter should not change on third call") +} From cdf1651ae8c1b517e4febb6e63ee8ef5ad8a2373 Mon Sep 17 00:00:00 2001 From: 
Marcus Mann Date: Tue, 10 Feb 2026 13:42:31 -0500 Subject: [PATCH 23/50] refactor(pusher): Remove unused concurrency parameter from NewPusher MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Concurrency is now determined by whether workerPool and retryHeap are provided, making the explicit concurrency parameter redundant. 🤖 Assisted by AI --- plugins/outputs/cloudwatchlogs/cloudwatchlogs.go | 2 +- .../cloudwatchlogs/internal/pusher/circuitbreaker_test.go | 4 ++-- plugins/outputs/cloudwatchlogs/internal/pusher/pusher.go | 1 - plugins/outputs/cloudwatchlogs/internal/pusher/pusher_test.go | 2 -- 4 files changed, 3 insertions(+), 6 deletions(-) diff --git a/plugins/outputs/cloudwatchlogs/cloudwatchlogs.go b/plugins/outputs/cloudwatchlogs/cloudwatchlogs.go index 45bcc947d4d..d35542ff930 100644 --- a/plugins/outputs/cloudwatchlogs/cloudwatchlogs.go +++ b/plugins/outputs/cloudwatchlogs/cloudwatchlogs.go @@ -183,7 +183,7 @@ func (c *CloudWatchLogs) getDest(t pusher.Target, logSrc logs.LogSrc) *cwDest { } c.targetManager = pusher.NewTargetManager(c.Log, client) }) - p := pusher.NewPusher(c.Log, t, client, c.targetManager, logSrc, c.workerPool, c.ForceFlushInterval.Duration, &c.pusherWaitGroup, c.Concurrency, c.retryHeap) + p := pusher.NewPusher(c.Log, t, client, c.targetManager, logSrc, c.workerPool, c.ForceFlushInterval.Duration, &c.pusherWaitGroup, c.retryHeap) cwd := &cwDest{ pusher: p, retryer: logThrottleRetryer, diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/circuitbreaker_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/circuitbreaker_test.go index dd0651020a0..972d1fab482 100644 --- a/plugins/outputs/cloudwatchlogs/internal/pusher/circuitbreaker_test.go +++ b/plugins/outputs/cloudwatchlogs/internal/pusher/circuitbreaker_test.go @@ -61,8 +61,8 @@ func TestCircuitBreakerBlocksTargetAfterFailure(t *testing.T) { var wg sync.WaitGroup flushTimeout := 50 * time.Millisecond - failingPusher := NewPusher(logger, 
failingTarget, service, tm, nil, workerPool, flushTimeout, &wg, 2, retryHeap) - healthyPusher := NewPusher(logger, healthyTarget, service, tm, nil, workerPool, flushTimeout, &wg, 2, retryHeap) + failingPusher := NewPusher(logger, failingTarget, service, tm, nil, workerPool, flushTimeout, &wg, retryHeap) + healthyPusher := NewPusher(logger, healthyTarget, service, tm, nil, workerPool, flushTimeout, &wg, retryHeap) defer failingPusher.Stop() defer healthyPusher.Stop() diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/pusher.go b/plugins/outputs/cloudwatchlogs/internal/pusher/pusher.go index 6a4c9f2df24..aa2f4be722a 100644 --- a/plugins/outputs/cloudwatchlogs/internal/pusher/pusher.go +++ b/plugins/outputs/cloudwatchlogs/internal/pusher/pusher.go @@ -33,7 +33,6 @@ func NewPusher( workerPool WorkerPool, flushTimeout time.Duration, wg *sync.WaitGroup, - _ int, retryHeap RetryHeap, ) *Pusher { s := createSender(logger, service, targetManager, workerPool, retryHeap) diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/pusher_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/pusher_test.go index 41640490b1e..b80a201a07e 100644 --- a/plugins/outputs/cloudwatchlogs/internal/pusher/pusher_test.go +++ b/plugins/outputs/cloudwatchlogs/internal/pusher/pusher_test.go @@ -112,7 +112,6 @@ func setupPusher(t *testing.T, workerPool WorkerPool, wg *sync.WaitGroup) *Pushe workerPool, time.Second, wg, - 1, // concurrency nil, // retryHeap ) @@ -149,7 +148,6 @@ func TestPusherRetryHeap(t *testing.T) { workerPool, time.Second, &wg, - 2, // concurrency > 1 retryHeap, ) From 28ba9023a6a5084eaec7ecd764686e837d7893d0 Mon Sep 17 00:00:00 2001 From: Marcus Mann Date: Tue, 10 Feb 2026 15:47:05 -0500 Subject: [PATCH 24/50] test(pusher): Add automated recovery tests for poison pill MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add comprehensive recovery tests validating: 1. 
Permission granted during retry - system recovers and publishes logs 2. System restart during retry - resumes correctly with preserved metadata 3. Multiple targets - healthy targets unaffected by failing target Tests validate circuit breaker behavior, retry heap functionality, and proper isolation between targets during permission failures. Addresses CWQS-3192 (P1 requirement) 🤖 Assisted by AI --- .../pusher/retryheap_recovery_test.go | 281 ++++++++++++++++++ 1 file changed, 281 insertions(+) create mode 100644 plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_recovery_test.go diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_recovery_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_recovery_test.go new file mode 100644 index 00000000000..067f4af3136 --- /dev/null +++ b/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_recovery_test.go @@ -0,0 +1,281 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: MIT + +package pusher + +import ( + "errors" + "sync" + "testing" + "time" + + "github.com/influxdata/telegraf/testutil" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/mock" + + "github.com/aws/amazon-cloudwatch-agent/internal/retryer" + "github.com/aws/amazon-cloudwatch-agent/sdk/service/cloudwatchlogs" +) + +// TestRecoveryWhenPermissionGrantedDuringRetry validates that when PLE permissions +// are missing initially but granted while retry is ongoing, the system recovers +// and successfully publishes logs. +// This test addresses CWQS-3192 requirement 1. 
+func TestRecoveryWhenPermissionGrantedDuringRetry(t *testing.T) { + heap := NewRetryHeap(10, &testutil.Logger{}) + defer heap.Stop() + + workerPool := NewWorkerPool(2) + defer workerPool.Stop() + + // Mock service that initially returns AccessDenied, then succeeds + mockService := &mockLogsService{} + accessDeniedErr := &cloudwatchlogs.AccessDeniedException{ + Message_: stringPtr("Access denied"), + } + + // First call fails with AccessDenied + mockService.On("PutLogEvents", mock.Anything).Return((*cloudwatchlogs.PutLogEventsOutput)(nil), accessDeniedErr).Once() + // Second call succeeds (permission granted) + mockService.On("PutLogEvents", mock.Anything).Return(&cloudwatchlogs.PutLogEventsOutput{}, nil).Once() + + mockTargetManager := &mockTargetManager{} + mockTargetManager.On("EnsureTargetExists", mock.Anything).Return(nil) + + processor := NewRetryHeapProcessor(heap, workerPool, mockService, mockTargetManager, &testutil.Logger{}, time.Hour, retryer.NewLogThrottleRetryer(&testutil.Logger{})) + + // Create batch and track circuit breaker state + target := Target{Group: "group", Stream: "stream"} + batch := newLogEventBatch(target, nil) + batch.events = []*cloudwatchlogs.InputLogEvent{ + {Message: stringPtr("test message"), Timestamp: int64Ptr(time.Now().Unix() * 1000)}, + } + + var haltCalled, resumeCalled bool + var mu sync.Mutex + + // Register circuit breaker callbacks + batch.addFailCallback(func() { + mu.Lock() + haltCalled = true + mu.Unlock() + }) + batch.addDoneCallback(func() { + mu.Lock() + resumeCalled = true + mu.Unlock() + }) + + // Set batch ready for immediate retry + batch.nextRetryTime = time.Now().Add(-1 * time.Second) + + // Push batch to heap + err := heap.Push(batch) + assert.NoError(t, err) + + // Process first attempt - should fail with AccessDenied + processor.processReadyMessages() + + // Wait for async processing to complete + time.Sleep(100 * time.Millisecond) + + // Verify circuit breaker halted + mu.Lock() + assert.True(t, 
haltCalled, "Circuit breaker should halt on failure") + assert.False(t, resumeCalled, "Circuit breaker should not resume yet") + mu.Unlock() + + // Batch should be back in heap for retry + assert.Equal(t, 1, heap.Size(), "Failed batch should be in retry heap") + + // Simulate permission being granted by waiting for retry time + // Set batch ready for immediate retry + batch.nextRetryTime = time.Now().Add(-1 * time.Second) + + // Process second attempt - should succeed + processor.processReadyMessages() + + // Wait for async processing to complete + time.Sleep(100 * time.Millisecond) + + // Verify circuit breaker resumed + mu.Lock() + assert.True(t, resumeCalled, "Circuit breaker should resume on success") + mu.Unlock() + + // Heap should be empty (batch successfully sent) + assert.Equal(t, 0, heap.Size(), "Heap should be empty after successful retry") + + // Verify both PutLogEvents calls were made + mockService.AssertExpectations(t) +} + +// TestRecoveryAfterSystemRestart validates that when the system restarts with +// retry ongoing, it resumes correctly by loading state and continuing retries. +// This test addresses CWQS-3192 requirement 2. +func TestRecoveryAfterSystemRestart(t *testing.T) { + heap := NewRetryHeap(10, &testutil.Logger{}) + defer heap.Stop() + + workerPool := NewWorkerPool(2) + defer workerPool.Stop() + + mockService := &mockLogsService{} + mockTargetManager := &mockTargetManager{} + mockTargetManager.On("EnsureTargetExists", mock.Anything).Return(nil) + + // Simulate system restart scenario: + // 1. Initial failure puts batch in retry state + // 2. System "restarts" (new processor instance) + // 3. Batch is reloaded with retry metadata intact + // 4. 
Retry succeeds + + target := Target{Group: "group", Stream: "stream"} + batch := newLogEventBatch(target, nil) + batch.events = []*cloudwatchlogs.InputLogEvent{ + {Message: stringPtr("test message"), Timestamp: int64Ptr(time.Now().Unix() * 1000)}, + } + + // Simulate batch that was in retry state before restart + batch.retryCountShort = 2 + batch.startTime = time.Now().Add(-5 * time.Minute) + batch.nextRetryTime = time.Now().Add(-1 * time.Second) // Ready for retry + batch.lastError = errors.New("previous error before restart") + + var resumeCalled bool + var mu sync.Mutex + + batch.addDoneCallback(func() { + mu.Lock() + resumeCalled = true + mu.Unlock() + }) + + // Mock successful retry after restart + mockService.On("PutLogEvents", mock.Anything).Return(&cloudwatchlogs.PutLogEventsOutput{}, nil).Once() + + // Create new processor (simulating restart) + processor := NewRetryHeapProcessor(heap, workerPool, mockService, mockTargetManager, &testutil.Logger{}, time.Hour, retryer.NewLogThrottleRetryer(&testutil.Logger{})) + + // Push batch with existing retry metadata + err := heap.Push(batch) + assert.NoError(t, err) + + // Process should succeed + processor.processReadyMessages() + + // Wait for async processing to complete + time.Sleep(100 * time.Millisecond) + + // Verify circuit breaker resumed + mu.Lock() + assert.True(t, resumeCalled, "Circuit breaker should resume after successful retry post-restart") + mu.Unlock() + + // Heap should be empty + assert.Equal(t, 0, heap.Size(), "Heap should be empty after successful retry") + + // Verify retry metadata was preserved + assert.Equal(t, 2, batch.retryCountShort, "Retry count should be preserved across restart") + assert.False(t, batch.startTime.IsZero(), "Start time should be preserved across restart") + + mockService.AssertExpectations(t) +} + +// TestRecoveryWithMultipleTargets validates that when one target has permission +// issues, other healthy targets continue publishing successfully. 
+func TestRecoveryWithMultipleTargets(t *testing.T) { + heap := NewRetryHeap(10, &testutil.Logger{}) + defer heap.Stop() + + workerPool := NewWorkerPool(2) + defer workerPool.Stop() + + mockService := &mockLogsService{} + mockTargetManager := &mockTargetManager{} + mockTargetManager.On("EnsureTargetExists", mock.Anything).Return(nil) + + processor := NewRetryHeapProcessor(heap, workerPool, mockService, mockTargetManager, &testutil.Logger{}, time.Hour, retryer.NewLogThrottleRetryer(&testutil.Logger{})) + + // Create two targets + target1 := Target{Group: "group1", Stream: "stream1"} + target2 := Target{Group: "group2", Stream: "stream2"} + + batch1 := newLogEventBatch(target1, nil) + batch1.events = []*cloudwatchlogs.InputLogEvent{ + {Message: stringPtr("message1"), Timestamp: int64Ptr(time.Now().Unix() * 1000)}, + } + batch1.nextRetryTime = time.Now().Add(-1 * time.Second) + + batch2 := newLogEventBatch(target2, nil) + batch2.events = []*cloudwatchlogs.InputLogEvent{ + {Message: stringPtr("message2"), Timestamp: int64Ptr(time.Now().Unix() * 1000)}, + } + batch2.nextRetryTime = time.Now().Add(-1 * time.Second) + + var halt1Called, resume1Called, resume2Called bool + var mu sync.Mutex + + // Target 1 fails with AccessDenied + batch1.addFailCallback(func() { + mu.Lock() + halt1Called = true + mu.Unlock() + }) + batch1.addDoneCallback(func() { + mu.Lock() + resume1Called = true + mu.Unlock() + }) + + // Target 2 succeeds + batch2.addDoneCallback(func() { + mu.Lock() + resume2Called = true + mu.Unlock() + }) + + // Mock responses: target1 fails, target2 succeeds + accessDeniedErr := &cloudwatchlogs.AccessDeniedException{ + Message_: stringPtr("Access denied"), + } + mockService.On("PutLogEvents", mock.MatchedBy(func(req *cloudwatchlogs.PutLogEventsInput) bool { + return *req.LogGroupName == "group1" + })).Return((*cloudwatchlogs.PutLogEventsOutput)(nil), accessDeniedErr).Once() + + mockService.On("PutLogEvents", mock.MatchedBy(func(req *cloudwatchlogs.PutLogEventsInput) 
bool { + return *req.LogGroupName == "group2" + })).Return(&cloudwatchlogs.PutLogEventsOutput{}, nil).Once() + + // Push both batches + err := heap.Push(batch1) + assert.NoError(t, err) + err = heap.Push(batch2) + assert.NoError(t, err) + + // Process both batches + processor.processReadyMessages() + + // Wait for async processing to complete + time.Sleep(100 * time.Millisecond) + + // Verify target1 circuit breaker halted, target2 succeeded + mu.Lock() + assert.True(t, halt1Called, "Target1 circuit breaker should halt") + assert.False(t, resume1Called, "Target1 circuit breaker should not resume") + assert.True(t, resume2Called, "Target2 should succeed and resume") + mu.Unlock() + + // Target1 should be back in heap, target2 should be done + assert.Equal(t, 1, heap.Size(), "Only failed target should remain in heap") + + mockService.AssertExpectations(t) +} + +func stringPtr(s string) *string { + return &s +} + +func int64Ptr(i int64) *int64 { + return &i +} From 11b1d26ac66d2de6d41031dcb62c006d168f8021 Mon Sep 17 00:00:00 2001 From: Marcus Mann Date: Wed, 11 Feb 2026 11:52:04 -0500 Subject: [PATCH 25/50] Add test filtering to integration test workflows Add test_os_filter and test_dir_filter inputs to allow running specific tests on specific OS platforms. Filters use jq to filter generated test matrices before execution. Usage: -f test_os_filter=al2023 (run only on al2023) -f test_dir_filter=./test/cloudwatchlogs (run only cloudwatchlogs) When filters are omitted, all tests run (default behavior). 
--- .github/workflows/integration-test.yml | 10 ++++ .github/workflows/test-artifacts.yml | 69 +++++++++++++++++++------- 2 files changed, 62 insertions(+), 17 deletions(-) diff --git a/.github/workflows/integration-test.yml b/.github/workflows/integration-test.yml index 37823e93eac..6de692f2838 100644 --- a/.github/workflows/integration-test.yml +++ b/.github/workflows/integration-test.yml @@ -33,6 +33,14 @@ on: test_repo_branch: description: 'Override for the GitHub test repository branch to use (default is main)' type: string + test_os_filter: + description: 'Filter tests to specific OS (e.g., al2023, ubuntu-22.04). Leave empty to run all OS.' + type: string + required: false + test_dir_filter: + description: 'Filter tests to specific test directory (e.g., ./test/cloudwatchlogs). Leave empty to run all tests.' + type: string + required: false jobs: CheckBuildTestArtifacts: @@ -67,3 +75,5 @@ jobs: with: build_id: ${{ inputs.build_sha }} test_repo_branch: ${{ inputs.test_repo_branch }} + test_os_filter: ${{ inputs.test_os_filter }} + test_dir_filter: ${{ inputs.test_dir_filter }} diff --git a/.github/workflows/test-artifacts.yml b/.github/workflows/test-artifacts.yml index d1965250ae2..2a17fd27a79 100644 --- a/.github/workflows/test-artifacts.yml +++ b/.github/workflows/test-artifacts.yml @@ -29,6 +29,14 @@ on: test_repo_branch: description: 'Override for the GitHub test repository branch to use (default is main)' type: string + test_os_filter: + description: 'Filter tests to specific OS (e.g., al2023, ubuntu-22.04)' + type: string + required: false + test_dir_filter: + description: 'Filter tests to specific test directory (e.g., ./test/cloudwatchlogs)' + type: string + required: false workflow_call: inputs: build_id: @@ -38,6 +46,14 @@ on: test_repo_branch: description: 'Override for the GitHub test repository branch to use (default is main)' type: string + test_os_filter: + description: 'Filter tests to specific OS (e.g., al2023, ubuntu-22.04)' + type: string + 
required: false + test_dir_filter: + description: 'Filter tests to specific test directory (e.g., ./test/cloudwatchlogs)' + type: string + required: false concurrency: group: ${{ github.workflow }}-${{ inputs.build_id }} @@ -161,23 +177,42 @@ jobs: id: set-matrix run: | go run --tags=generator generator/test_case_generator.go - echo "ec2_gpu_matrix=$(echo $(cat generator/resources/ec2_gpu_complete_test_matrix.json))" >> "$GITHUB_OUTPUT" - echo "eks_addon_matrix=$(echo $(cat generator/resources/eks_addon_complete_test_matrix.json))" >> "$GITHUB_OUTPUT" - echo "ec2_linux_matrix=$(echo $(cat generator/resources/ec2_linux_complete_test_matrix.json))" >> "$GITHUB_OUTPUT" - echo "ec2_linux_onprem_matrix=$(echo $(cat generator/resources/ec2_linux_onprem_complete_test_matrix.json))" >> "$GITHUB_OUTPUT" - echo "ec2_selinux_matrix=$(echo $(cat generator/resources/ec2_selinux_complete_test_matrix.json))" >> "$GITHUB_OUTPUT" - echo "ec2_windows_matrix=$(echo $(cat generator/resources/ec2_windows_complete_test_matrix.json))" >> "$GITHUB_OUTPUT" - echo "ec2_mac_matrix=$(echo $(cat generator/resources/ec2_mac_complete_test_matrix.json))" >> "$GITHUB_OUTPUT" - echo "ec2_performance_matrix=$(echo $(cat generator/resources/ec2_performance_complete_test_matrix.json))" >> "$GITHUB_OUTPUT" - echo "ec2_windows_performance_matrix=$(echo $(cat generator/resources/ec2_windows_performance_complete_test_matrix.json))" >> "$GITHUB_OUTPUT" - echo "ec2_stress_matrix=$(echo $(cat generator/resources/ec2_stress_complete_test_matrix.json))" >> "$GITHUB_OUTPUT" - echo "ec2_windows_stress_matrix=$(echo $(cat generator/resources/ec2_windows_stress_complete_test_matrix.json))" >> "$GITHUB_OUTPUT" - echo "ecs_ec2_launch_daemon_matrix=$(echo $(cat generator/resources/ecs_ec2_daemon_complete_test_matrix.json))" >> "$GITHUB_OUTPUT" - echo "ecs_fargate_matrix=$(echo $(cat generator/resources/ecs_fargate_complete_test_matrix.json))" >> "$GITHUB_OUTPUT" - echo "eks_daemon_matrix=$(echo $(cat 
generator/resources/eks_daemon_complete_test_matrix.json))" >> "$GITHUB_OUTPUT" - echo "eks_deployment_matrix=$(echo $(cat generator/resources/eks_deployment_complete_test_matrix.json))" >> "$GITHUB_OUTPUT" - echo "ec2_linux_itar_matrix=$(echo $(cat generator/resources/ec2_linux_itar_complete_test_matrix.json))" >> "$GITHUB_OUTPUT" - echo "ec2_linux_china_matrix=$(echo $(cat generator/resources/ec2_linux_china_complete_test_matrix.json))" >> "$GITHUB_OUTPUT" + + # Function to apply filters to a matrix + apply_filters() { + local matrix_file=$1 + local matrix_content=$(cat "$matrix_file") + + # Apply OS filter if provided + if [ -n "${{ inputs.test_os_filter }}" ]; then + matrix_content=$(echo "$matrix_content" | jq '[.[] | select(.os == "${{ inputs.test_os_filter }}")]') + fi + + # Apply test directory filter if provided + if [ -n "${{ inputs.test_dir_filter }}" ]; then + matrix_content=$(echo "$matrix_content" | jq '[.[] | select(.test_dir == "${{ inputs.test_dir_filter }}")]') + fi + + echo "$matrix_content" + } + + echo "ec2_gpu_matrix=$(apply_filters generator/resources/ec2_gpu_complete_test_matrix.json)" >> "$GITHUB_OUTPUT" + echo "eks_addon_matrix=$(apply_filters generator/resources/eks_addon_complete_test_matrix.json)" >> "$GITHUB_OUTPUT" + echo "ec2_linux_matrix=$(apply_filters generator/resources/ec2_linux_complete_test_matrix.json)" >> "$GITHUB_OUTPUT" + echo "ec2_linux_onprem_matrix=$(apply_filters generator/resources/ec2_linux_onprem_complete_test_matrix.json)" >> "$GITHUB_OUTPUT" + echo "ec2_selinux_matrix=$(apply_filters generator/resources/ec2_selinux_complete_test_matrix.json)" >> "$GITHUB_OUTPUT" + echo "ec2_windows_matrix=$(apply_filters generator/resources/ec2_windows_complete_test_matrix.json)" >> "$GITHUB_OUTPUT" + echo "ec2_mac_matrix=$(apply_filters generator/resources/ec2_mac_complete_test_matrix.json)" >> "$GITHUB_OUTPUT" + echo "ec2_performance_matrix=$(apply_filters generator/resources/ec2_performance_complete_test_matrix.json)" >> 
"$GITHUB_OUTPUT" + echo "ec2_windows_performance_matrix=$(apply_filters generator/resources/ec2_windows_performance_complete_test_matrix.json)" >> "$GITHUB_OUTPUT" + echo "ec2_stress_matrix=$(apply_filters generator/resources/ec2_stress_complete_test_matrix.json)" >> "$GITHUB_OUTPUT" + echo "ec2_windows_stress_matrix=$(apply_filters generator/resources/ec2_windows_stress_complete_test_matrix.json)" >> "$GITHUB_OUTPUT" + echo "ecs_ec2_launch_daemon_matrix=$(apply_filters generator/resources/ecs_ec2_daemon_complete_test_matrix.json)" >> "$GITHUB_OUTPUT" + echo "ecs_fargate_matrix=$(apply_filters generator/resources/ecs_fargate_complete_test_matrix.json)" >> "$GITHUB_OUTPUT" + echo "eks_daemon_matrix=$(apply_filters generator/resources/eks_daemon_complete_test_matrix.json)" >> "$GITHUB_OUTPUT" + echo "eks_deployment_matrix=$(apply_filters generator/resources/eks_deployment_complete_test_matrix.json)" >> "$GITHUB_OUTPUT" + echo "ec2_linux_itar_matrix=$(apply_filters generator/resources/ec2_linux_itar_complete_test_matrix.json)" >> "$GITHUB_OUTPUT" + echo "ec2_linux_china_matrix=$(apply_filters generator/resources/ec2_linux_china_complete_test_matrix.json)" >> "$GITHUB_OUTPUT" - name: Echo test plan matrix run: | From 8e20ddfc06a92f3664e6b933bf163ad0fc57e283 Mon Sep 17 00:00:00 2001 From: Marcus Mann Date: Wed, 11 Feb 2026 12:41:57 -0500 Subject: [PATCH 26/50] fix: Use compact JSON output in apply_filters to prevent multiline GITHUB_OUTPUT errors --- .github/workflows/test-artifacts.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test-artifacts.yml b/.github/workflows/test-artifacts.yml index 2a17fd27a79..ea75854126f 100644 --- a/.github/workflows/test-artifacts.yml +++ b/.github/workflows/test-artifacts.yml @@ -185,12 +185,12 @@ jobs: # Apply OS filter if provided if [ -n "${{ inputs.test_os_filter }}" ]; then - matrix_content=$(echo "$matrix_content" | jq '[.[] | select(.os == "${{ inputs.test_os_filter }}")]') + 
matrix_content=$(echo "$matrix_content" | jq -c '[.[] | select(.os == "${{ inputs.test_os_filter }}")]') fi # Apply test directory filter if provided if [ -n "${{ inputs.test_dir_filter }}" ]; then - matrix_content=$(echo "$matrix_content" | jq '[.[] | select(.test_dir == "${{ inputs.test_dir_filter }}")]') + matrix_content=$(echo "$matrix_content" | jq -c '[.[] | select(.test_dir == "${{ inputs.test_dir_filter }}")]') fi echo "$matrix_content" From fd28e8e2a73dad30c64db4cb284ce8376fc89161 Mon Sep 17 00:00:00 2001 From: Marcus Mann Date: Wed, 11 Feb 2026 15:08:01 -0500 Subject: [PATCH 27/50] Add unit tests for poison pill scenario - TestPoisonPillScenario: Validates continuous batch generation with 10 denied + 1 allowed log group - TestSingleDeniedLogGroup: Baseline test with 1 denied + 1 allowed log group - TestRetryHeapSmallerThanFailingLogGroups: Demonstrates deadlock when heap size < failing log groups (SKIPPED) The third test intentionally deadlocks to prove the bug exists when: - Retry heap size = concurrency (2) - Number of failing log groups (10) > heap size (2) - Workers block trying to push to full heap - System deadlocks, starving allowed log group --- .../internal/pusher/poison_pill_test.go | 390 ++++++++++++++++++ 1 file changed, 390 insertions(+) create mode 100644 plugins/outputs/cloudwatchlogs/internal/pusher/poison_pill_test.go diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/poison_pill_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/poison_pill_test.go new file mode 100644 index 00000000000..976f9840d09 --- /dev/null +++ b/plugins/outputs/cloudwatchlogs/internal/pusher/poison_pill_test.go @@ -0,0 +1,390 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: MIT + +package pusher + +import ( + "fmt" + "sync" + "sync/atomic" + "testing" + "time" + + "github.com/influxdata/telegraf/testutil" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/mock" + + "github.com/aws/amazon-cloudwatch-agent/internal/retryer" + "github.com/aws/amazon-cloudwatch-agent/sdk/service/cloudwatchlogs" +) + +// TestPoisonPillScenario validates that when multiple log groups encounter +// AccessDenied errors simultaneously with low concurrency, the agent continues +// publishing to allowed log groups without blocking the entire pipeline. +// +// This test recreates the scenario from poison-pill-test-findings.md where: +// - 1 allowed log group + 10 denied log groups +// - Concurrency = 2 +// - Continuous stream of new batches (simulating force_flush_interval=5s) +// - Expected: Allowed log group continues receiving events +// - Historical Bug: Agent stopped publishing to ALL log groups after ~5 minutes +// +// This test validates that the retry heap and worker pool architecture correctly +// handles this scenario by: +// 1. Continuously generating batches for 10 denied + 1 allowed log group +// 2. Processing with only 2 workers (low concurrency) +// 3. Verifying allowed log group continues to receive events throughout +// 4. Ensuring worker pool doesn't get saturated by failed retry attempts +// +// The test passes because the current implementation uses a retry heap with +// proper backoff, preventing failed batches from monopolizing worker threads. 
+func TestPoisonPillScenario(t *testing.T) { + heap := NewRetryHeap(100, &testutil.Logger{}) + defer heap.Stop() + + workerPool := NewWorkerPool(2) // Low concurrency as in the bug scenario + defer workerPool.Stop() + + mockService := &mockLogsService{} + mockTargetManager := &mockTargetManager{} + mockTargetManager.On("EnsureTargetExists", mock.Anything).Return(nil) + + accessDeniedErr := &cloudwatchlogs.AccessDeniedException{ + Message_: stringPtr("User is not authorized to perform: logs:PutLogEvents with an explicit deny"), + } + + // Track successful PutLogEvents calls for the allowed log group + var allowedGroupSuccessCount atomic.Int32 + var deniedGroupAttemptCount atomic.Int32 + + // Configure mock service responses with realistic latency + mockService.On("PutLogEvents", mock.MatchedBy(func(input *cloudwatchlogs.PutLogEventsInput) bool { + return *input.LogGroupName == "log-stream-ple-access-granted" + })).Return(&cloudwatchlogs.PutLogEventsOutput{}, nil).Run(func(args mock.Arguments) { + time.Sleep(10 * time.Millisecond) // Simulate API latency + allowedGroupSuccessCount.Add(1) + }) + + mockService.On("PutLogEvents", mock.MatchedBy(func(input *cloudwatchlogs.PutLogEventsInput) bool { + return *input.LogGroupName != "log-stream-ple-access-granted" + })).Return((*cloudwatchlogs.PutLogEventsOutput)(nil), accessDeniedErr).Run(func(args mock.Arguments) { + time.Sleep(10 * time.Millisecond) // Simulate API latency + deniedGroupAttemptCount.Add(1) + }) + + processor := NewRetryHeapProcessor(heap, workerPool, mockService, mockTargetManager, &testutil.Logger{}, 100*time.Millisecond, retryer.NewLogThrottleRetryer(&testutil.Logger{})) + + // Targets + allowedTarget := Target{Group: "log-stream-ple-access-granted", Stream: "i-test"} + deniedTargets := make([]Target, 10) + for i := 0; i < 10; i++ { + deniedTargets[i] = Target{ + Group: "aws-restricted-log-group-name-log-stream-ple-access-denied" + string(rune('0'+i)), + Stream: "i-test", + } + } + + // Simulate 
continuous batch generation over time (like force_flush_interval=5s) + done := make(chan struct{}) + var wg sync.WaitGroup + + // Continuously generate batches for denied log groups (simulating continuous log writes) + for i := 0; i < 10; i++ { + wg.Add(1) + go func(target Target) { + defer wg.Done() + ticker := time.NewTicker(50 * time.Millisecond) // Simulate flush interval + defer ticker.Stop() + batchCount := 0 + for { + select { + case <-done: + return + case <-ticker.C: + if batchCount >= 5 { // Generate 5 batches per denied log group + return + } + batch := createBatch(target, 50) + batch.nextRetryTime = time.Now().Add(-1 * time.Second) + heap.Push(batch) + batchCount++ + } + } + }(deniedTargets[i]) + } + + // Continuously generate batches for allowed log group + wg.Add(1) + go func() { + defer wg.Done() + ticker := time.NewTicker(50 * time.Millisecond) + defer ticker.Stop() + batchCount := 0 + for { + select { + case <-done: + return + case <-ticker.C: + if batchCount >= 10 { // Generate 10 batches for allowed log group + return + } + batch := createBatch(allowedTarget, 20) + batch.nextRetryTime = time.Now().Add(-1 * time.Second) + heap.Push(batch) + batchCount++ + } + } + }() + + // Process batches continuously + processorDone := make(chan struct{}) + go func() { + ticker := time.NewTicker(20 * time.Millisecond) + defer ticker.Stop() + for { + select { + case <-processorDone: + return + case <-ticker.C: + processor.processReadyMessages() + } + } + }() + + // Run for 2 seconds to simulate sustained load + time.Sleep(2 * time.Second) + close(done) + wg.Wait() + + // Process remaining messages + time.Sleep(500 * time.Millisecond) + processor.processReadyMessages() + time.Sleep(200 * time.Millisecond) + close(processorDone) + + // CRITICAL ASSERTION: Allowed log group MUST receive events throughout the test + successCount := allowedGroupSuccessCount.Load() + t.Logf("Allowed group success count: %d, Denied group attempt count: %d", successCount, 
deniedGroupAttemptCount.Load()) + + assert.Greater(t, successCount, int32(5), + "Allowed log group must continue receiving events despite continuous denied log group failures. Got %d, expected > 5", successCount) + + // Verify denied log groups attempted to send + assert.Greater(t, deniedGroupAttemptCount.Load(), int32(0), + "Denied log groups should have attempted to send") +} + +// TestRetryHeapSmallerThanFailingLogGroups tests the specific bottleneck scenario where: +// - Retry heap size = concurrency (e.g., 2) +// - Number of failing log groups (10) > retry heap size (2) +// - This causes the retry heap to fill up with failed batches +// - New batches from failing log groups block trying to push to full heap +// - Workers get stuck waiting to push failed batches back to heap +// - Allowed log group gets starved of worker time +// +// This test validates the ACTUAL bug: when retry heap size (equal to concurrency) +// is smaller than the number of failing log groups, the system deadlocks. +// +// **EXPECTED BEHAVIOR**: This test will timeout/deadlock, proving the bug exists. 
+func TestRetryHeapSmallerThanFailingLogGroups(t *testing.T) { + t.Skip("This test intentionally deadlocks to demonstrate the poison pill bug where heap size < failing log groups") + + concurrency := 2 + numFailingLogGroups := 10 + + // CRITICAL: Retry heap size equals concurrency (this is the bug) + heap := NewRetryHeap(concurrency, &testutil.Logger{}) + defer heap.Stop() + + workerPool := NewWorkerPool(concurrency) + defer workerPool.Stop() + + mockService := &mockLogsService{} + mockTargetManager := &mockTargetManager{} + mockTargetManager.On("EnsureTargetExists", mock.Anything).Return(nil) + + accessDeniedErr := &cloudwatchlogs.AccessDeniedException{ + Message_: stringPtr("Access denied"), + } + + var allowedGroupSuccessCount atomic.Int32 + var deniedGroupAttemptCount atomic.Int32 + + mockService.On("PutLogEvents", mock.MatchedBy(func(input *cloudwatchlogs.PutLogEventsInput) bool { + return *input.LogGroupName == "allowed" + })).Return(&cloudwatchlogs.PutLogEventsOutput{}, nil).Run(func(args mock.Arguments) { + time.Sleep(10 * time.Millisecond) + allowedGroupSuccessCount.Add(1) + }) + + mockService.On("PutLogEvents", mock.MatchedBy(func(input *cloudwatchlogs.PutLogEventsInput) bool { + return *input.LogGroupName != "allowed" + })).Return((*cloudwatchlogs.PutLogEventsOutput)(nil), accessDeniedErr).Run(func(args mock.Arguments) { + time.Sleep(10 * time.Millisecond) + deniedGroupAttemptCount.Add(1) + }) + + processor := NewRetryHeapProcessor(heap, workerPool, mockService, mockTargetManager, &testutil.Logger{}, 50*time.Millisecond, retryer.NewLogThrottleRetryer(&testutil.Logger{})) + + // Create targets + allowedTarget := Target{Group: "allowed", Stream: "stream"} + deniedTargets := make([]Target, numFailingLogGroups) + for i := 0; i < numFailingLogGroups; i++ { + deniedTargets[i] = Target{Group: fmt.Sprintf("denied-%d", i), Stream: "stream"} + } + + done := make(chan struct{}) + var wg sync.WaitGroup + + // Generate batches for all failing log groups continuously 
+ // This will cause deadlock as heap fills up + for i := 0; i < numFailingLogGroups; i++ { + wg.Add(1) + go func(target Target) { + defer wg.Done() + ticker := time.NewTicker(30 * time.Millisecond) + defer ticker.Stop() + batchCount := 0 + for { + select { + case <-done: + return + case <-ticker.C: + if batchCount >= 3 { + return + } + batch := createBatch(target, 10) + batch.nextRetryTime = time.Now().Add(-1 * time.Second) + // This will block when heap is full + heap.Push(batch) + batchCount++ + } + } + }(deniedTargets[i]) + } + + // Generate batches for allowed log group + wg.Add(1) + go func() { + defer wg.Done() + ticker := time.NewTicker(30 * time.Millisecond) + defer ticker.Stop() + batchCount := 0 + for { + select { + case <-done: + return + case <-ticker.C: + if batchCount >= 5 { + return + } + batch := createBatch(allowedTarget, 10) + batch.nextRetryTime = time.Now().Add(-1 * time.Second) + heap.Push(batch) + batchCount++ + } + } + }() + + // Process continuously + processorDone := make(chan struct{}) + go func() { + ticker := time.NewTicker(15 * time.Millisecond) + defer ticker.Stop() + for { + select { + case <-processorDone: + return + case <-ticker.C: + processor.processReadyMessages() + } + } + }() + + // Run for 1 second + time.Sleep(1 * time.Second) + close(done) + wg.Wait() + time.Sleep(300 * time.Millisecond) + processor.processReadyMessages() + time.Sleep(100 * time.Millisecond) + close(processorDone) + + successCount := allowedGroupSuccessCount.Load() + + t.Logf("Results: Allowed success=%d, Denied attempts=%d, Heap size=%d, Failing groups=%d", + successCount, deniedGroupAttemptCount.Load(), concurrency, numFailingLogGroups) + + // This test documents the bug: with heap size < failing log groups, the system deadlocks + if successCount == 0 { + t.Errorf("POISON PILL BUG DETECTED: Allowed log group received 0 events. 
Heap size (%d) < failing groups (%d) caused deadlock", concurrency, numFailingLogGroups) + } +} + +// TestSingleDeniedLogGroup validates the baseline scenario where a single denied +// log group does not affect the allowed log group. +func TestSingleDeniedLogGroup(t *testing.T) { + heap := NewRetryHeap(10, &testutil.Logger{}) + defer heap.Stop() + + workerPool := NewWorkerPool(4) // Higher concurrency as in initial test + defer workerPool.Stop() + + mockService := &mockLogsService{} + mockTargetManager := &mockTargetManager{} + mockTargetManager.On("EnsureTargetExists", mock.Anything).Return(nil) + + accessDeniedErr := &cloudwatchlogs.AccessDeniedException{ + Message_: stringPtr("Access denied"), + } + + var allowedGroupSuccessCount atomic.Int32 + + mockService.On("PutLogEvents", mock.MatchedBy(func(input *cloudwatchlogs.PutLogEventsInput) bool { + return *input.LogGroupName == "log-stream-ple-access-granted" + })).Return(&cloudwatchlogs.PutLogEventsOutput{}, nil).Run(func(args mock.Arguments) { + allowedGroupSuccessCount.Add(1) + }) + + mockService.On("PutLogEvents", mock.MatchedBy(func(input *cloudwatchlogs.PutLogEventsInput) bool { + return *input.LogGroupName == "aws-restricted-log-group-name-log-stream-ple-access-denied" + })).Return((*cloudwatchlogs.PutLogEventsOutput)(nil), accessDeniedErr) + + processor := NewRetryHeapProcessor(heap, workerPool, mockService, mockTargetManager, &testutil.Logger{}, time.Hour, retryer.NewLogThrottleRetryer(&testutil.Logger{})) + + // Create batches + allowedTarget := Target{Group: "log-stream-ple-access-granted", Stream: "i-test"} + deniedTarget := Target{Group: "aws-restricted-log-group-name-log-stream-ple-access-denied", Stream: "i-test"} + + allowedBatch := createBatch(allowedTarget, 40) + deniedBatch := createBatch(deniedTarget, 40) + + allowedBatch.nextRetryTime = time.Now().Add(-1 * time.Second) + deniedBatch.nextRetryTime = time.Now().Add(-1 * time.Second) + + err := heap.Push(allowedBatch) + assert.NoError(t, err) + 
err = heap.Push(deniedBatch) + assert.NoError(t, err) + + processor.processReadyMessages() + time.Sleep(100 * time.Millisecond) + + // Verify allowed log group received events + assert.Greater(t, allowedGroupSuccessCount.Load(), int32(0), + "Allowed log group must receive events with single denied log group") +} + +// createBatch creates a log event batch with the specified number of events +func createBatch(target Target, eventCount int) *logEventBatch { + batch := newLogEventBatch(target, nil) + batch.events = make([]*cloudwatchlogs.InputLogEvent, eventCount) + now := time.Now().Unix() * 1000 + for i := 0; i < eventCount; i++ { + batch.events[i] = &cloudwatchlogs.InputLogEvent{ + Message: stringPtr("test message"), + Timestamp: int64Ptr(now + int64(i)), + } + } + return batch +} From 6a6c0c6d7b5425421dc1c7f6f1a209acd4f9220c Mon Sep 17 00:00:00 2001 From: Marcus Mann Date: Wed, 11 Feb 2026 15:21:21 -0500 Subject: [PATCH 28/50] Fix poison pill bug: Make retry heap unbounded Remove max size constraint from retry heap to prevent deadlock when failing log groups exceed concurrency limit. Changes: - Remove maxSize and semaphore from retryHeap struct - Make Push() non-blocking (no semaphore wait) - Remove semaphore release from PopReady() - Update NewRetryHeap() to ignore maxSize parameter (kept for API compatibility) - Update TestRetryHeap_SemaphoreBlockingAndUnblocking -> TestRetryHeap_UnboundedPush - Update TestRetryHeapSmallerThanFailingLogGroups to validate fix Before: With concurrency=2 and 10 failing log groups, retry heap (size=2) would fill up, causing workers to block on Push(), leading to deadlock. After: Retry heap is unbounded, allowing all failed batches to be queued without blocking workers. Allowed log groups continue publishing normally. 
Test results: - TestRetryHeapSmallerThanFailingLogGroups: PASS (5/5 allowed batches published) - Heap grew to size 28 (beyond concurrency limit of 2) - No deadlock or starvation --- .gitignore | 1 + .../internal/pusher/poison_pill_test.go | 26 ++++---- .../internal/pusher/retryheap.go | 53 +++++------------ .../internal/pusher/retryheap_test.go | 59 +++++++------------ 4 files changed, 47 insertions(+), 92 deletions(-) diff --git a/.gitignore b/.gitignore index 9118e6e9c2f..38313bfed5b 100644 --- a/.gitignore +++ b/.gitignore @@ -10,3 +10,4 @@ CWAGENT_VERSION terraform.* **/.terraform/* coverage.txt +agent-sops/ diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/poison_pill_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/poison_pill_test.go index 976f9840d09..bb4589d4e4e 100644 --- a/plugins/outputs/cloudwatchlogs/internal/pusher/poison_pill_test.go +++ b/plugins/outputs/cloudwatchlogs/internal/pusher/poison_pill_test.go @@ -177,22 +177,16 @@ func TestPoisonPillScenario(t *testing.T) { // TestRetryHeapSmallerThanFailingLogGroups tests the specific bottleneck scenario where: // - Retry heap size = concurrency (e.g., 2) // - Number of failing log groups (10) > retry heap size (2) -// - This causes the retry heap to fill up with failed batches -// - New batches from failing log groups block trying to push to full heap -// - Workers get stuck waiting to push failed batches back to heap -// - Allowed log group gets starved of worker time +// - With bounded heap: This caused deadlock as heap filled up +// - With unbounded heap: System handles this gracefully // -// This test validates the ACTUAL bug: when retry heap size (equal to concurrency) -// is smaller than the number of failing log groups, the system deadlocks. -// -// **EXPECTED BEHAVIOR**: This test will timeout/deadlock, proving the bug exists. +// This test validates the FIX: unbounded retry heap allows all failed batches +// to be queued without blocking workers. 
func TestRetryHeapSmallerThanFailingLogGroups(t *testing.T) { - t.Skip("This test intentionally deadlocks to demonstrate the poison pill bug where heap size < failing log groups") - concurrency := 2 numFailingLogGroups := 10 - // CRITICAL: Retry heap size equals concurrency (this is the bug) + // Retry heap is now unbounded (maxSize parameter ignored) heap := NewRetryHeap(concurrency, &testutil.Logger{}) defer heap.Stop() @@ -237,7 +231,6 @@ func TestRetryHeapSmallerThanFailingLogGroups(t *testing.T) { var wg sync.WaitGroup // Generate batches for all failing log groups continuously - // This will cause deadlock as heap fills up for i := 0; i < numFailingLogGroups; i++ { wg.Add(1) go func(target Target) { @@ -255,7 +248,6 @@ func TestRetryHeapSmallerThanFailingLogGroups(t *testing.T) { } batch := createBatch(target, 10) batch.nextRetryTime = time.Now().Add(-1 * time.Second) - // This will block when heap is full heap.Push(batch) batchCount++ } @@ -313,11 +305,13 @@ func TestRetryHeapSmallerThanFailingLogGroups(t *testing.T) { successCount := allowedGroupSuccessCount.Load() t.Logf("Results: Allowed success=%d, Denied attempts=%d, Heap size=%d, Failing groups=%d", - successCount, deniedGroupAttemptCount.Load(), concurrency, numFailingLogGroups) + successCount, deniedGroupAttemptCount.Load(), heap.Size(), numFailingLogGroups) - // This test documents the bug: with heap size < failing log groups, the system deadlocks + // With unbounded heap, allowed log group should receive events if successCount == 0 { - t.Errorf("POISON PILL BUG DETECTED: Allowed log group received 0 events. 
Heap size (%d) < failing groups (%d) caused deadlock", concurrency, numFailingLogGroups) + t.Errorf("UNEXPECTED: Allowed log group received 0 events with unbounded heap") + } else { + t.Logf("SUCCESS: Unbounded heap handled poison pill scenario: %d successful publishes despite %d failing groups", successCount, numFailingLogGroups) } } diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap.go b/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap.go index 025a3063ad1..90ff0c26539 100644 --- a/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap.go +++ b/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap.go @@ -49,56 +49,37 @@ type RetryHeap interface { } type retryHeap struct { - heap retryHeapImpl - mutex sync.RWMutex - semaphore chan struct{} // Size enforcer - stopCh chan struct{} - maxSize int - stopped bool - logger telegraf.Logger + heap retryHeapImpl + mutex sync.RWMutex + stopCh chan struct{} + stopped bool + logger telegraf.Logger } var _ RetryHeap = (*retryHeap)(nil) -// NewRetryHeap creates a new retry heap with the specified maximum size +// NewRetryHeap creates a new retry heap (unbounded) func NewRetryHeap(maxSize int, logger telegraf.Logger) RetryHeap { rh := &retryHeap{ - heap: make(retryHeapImpl, 0, maxSize), - maxSize: maxSize, - semaphore: make(chan struct{}, maxSize), // Semaphore for size enforcement - stopCh: make(chan struct{}), - logger: logger, + heap: make(retryHeapImpl, 0), + stopCh: make(chan struct{}), + logger: logger, } heap.Init(&rh.heap) return rh } -// Push adds a batch to the heap, blocking if full +// Push adds a batch to the heap (non-blocking) func (rh *retryHeap) Push(batch *logEventBatch) error { - rh.mutex.RLock() + rh.mutex.Lock() + defer rh.mutex.Unlock() + if rh.stopped { - rh.mutex.RUnlock() - return errors.New("retry heap stopped") - } - rh.mutex.RUnlock() - - // Acquire semaphore slot (blocks if at maxSize capacity) - select { - case rh.semaphore <- struct{}{}: - // add batch to heap with 
mutex protection - rh.mutex.Lock() - if rh.stopped { - // Release semaphore if stopped after acquiring - <-rh.semaphore - rh.mutex.Unlock() - return errors.New("retry heap stopped") - } - heap.Push(&rh.heap, batch) - rh.mutex.Unlock() - return nil - case <-rh.stopCh: return errors.New("retry heap stopped") } + + heap.Push(&rh.heap, batch) + return nil } // PopReady returns all batches that are ready for retry (nextRetryTime <= now) @@ -113,8 +94,6 @@ func (rh *retryHeap) PopReady() []*logEventBatch { for len(rh.heap) > 0 && !rh.heap[0].nextRetryTime.After(now) { batch := heap.Pop(&rh.heap).(*logEventBatch) ready = append(ready, batch) - // Release semaphore slot for each popped batch - <-rh.semaphore } return ready diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_test.go index edc7dbc3145..038bdbfdbf5 100644 --- a/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_test.go +++ b/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_test.go @@ -145,65 +145,46 @@ func TestRetryHeapProcessorSendsBatch(t *testing.T) { assert.Equal(t, 0, heap.Size()) } -func TestRetryHeap_SemaphoreBlockingAndUnblocking(t *testing.T) { - heap := NewRetryHeap(2, &testutil.Logger{}) // maxSize = 2 +func TestRetryHeap_UnboundedPush(t *testing.T) { + heap := NewRetryHeap(2, &testutil.Logger{}) // maxSize parameter ignored (unbounded) defer heap.Stop() - // Fill heap to capacity with batches that will be ready in 3 seconds + // Push multiple batches without blocking target := Target{Group: "group", Stream: "stream"} batch1 := newLogEventBatch(target, nil) batch1.nextRetryTime = time.Now().Add(3 * time.Second) batch2 := newLogEventBatch(target, nil) batch2.nextRetryTime = time.Now().Add(3 * time.Second) + batch3 := newLogEventBatch(target, nil) + batch3.nextRetryTime = time.Now().Add(3 * time.Second) - heap.Push(batch1) - heap.Push(batch2) - - // Verify heap is at capacity - if heap.Size() != 2 { - 
t.Fatalf("Expected size 2, got %d", heap.Size()) - } + // All pushes should succeed immediately (non-blocking) + err := heap.Push(batch1) + assert.NoError(t, err) + err = heap.Push(batch2) + assert.NoError(t, err) + err = heap.Push(batch3) + assert.NoError(t, err) - // Test that semaphore is actually blocking by trying to push in a goroutine - pushResult := make(chan error, 1) - - go func() { - batch3 := newLogEventBatch(target, nil) - batch3.nextRetryTime = time.Now().Add(-1 * time.Hour) - heap.Push(batch3) // This should block on semaphore - pushResult <- nil - }() - - // Verify the push is blocked (expects no result in channel) - select { - case <-pushResult: - t.Fatal("Unexpected push, heap should be blocked") - case <-time.After(100 * time.Millisecond): - // Push is successfully blocked when at capacity + // Verify heap can grow beyond original maxSize parameter + if heap.Size() != 3 { + t.Fatalf("Expected size 3, got %d", heap.Size()) } time.Sleep(3 * time.Second) - // Pop ready batches to release semaphore slots + // Pop ready batches readyBatches := heap.PopReady() - assert.Len(t, readyBatches, 2, "Should pop exactly 2 ready batches") + assert.Len(t, readyBatches, 3, "Should pop exactly 3 ready batches") for _, batch := range readyBatches { assert.Equal(t, "group", batch.Group) assert.Equal(t, "stream", batch.Stream) } - // Expects push to now be unblocked - select { - case err := <-pushResult: - assert.NoError(t, err, "Push should succeed after PopReady") - case <-time.After(100 * time.Millisecond): - t.Fatal("Unexpected timeout, heap should be unblocked") - } - - // Verify 1 item remaining in heap (2 popped, 1 pushed) - if heap.Size() != 1 { - t.Fatalf("Expected size 1 after pop/push cycle, got %d", heap.Size()) + // Verify heap is empty + if heap.Size() != 0 { + t.Fatalf("Expected size 0 after pop, got %d", heap.Size()) } } From 60b6f49796924b6a60291e622b13b362614f6a84 Mon Sep 17 00:00:00 2001 From: Marcus Mann Date: Wed, 11 Feb 2026 15:29:20 -0500 
Subject: [PATCH 29/50] Remove test filtering feature (moved to separate PR) --- .github/workflows/integration-test.yml | 10 ---- .github/workflows/test-artifacts.yml | 69 +++++++------------------- 2 files changed, 17 insertions(+), 62 deletions(-) diff --git a/.github/workflows/integration-test.yml b/.github/workflows/integration-test.yml index 5534cd38889..0718615b597 100644 --- a/.github/workflows/integration-test.yml +++ b/.github/workflows/integration-test.yml @@ -33,14 +33,6 @@ on: test_repo_branch: description: 'Override for the GitHub test repository branch to use (default is main)' type: string - test_os_filter: - description: 'Filter tests to specific OS (e.g., al2023, ubuntu-22.04). Leave empty to run all OS.' - type: string - required: false - test_dir_filter: - description: 'Filter tests to specific test directory (e.g., ./test/cloudwatchlogs). Leave empty to run all tests.' - type: string - required: false jobs: CheckBuildTestArtifacts: @@ -78,5 +70,3 @@ jobs: with: build_id: ${{ inputs.build_sha }} test_repo_branch: ${{ inputs.test_repo_branch }} - test_os_filter: ${{ inputs.test_os_filter }} - test_dir_filter: ${{ inputs.test_dir_filter }} diff --git a/.github/workflows/test-artifacts.yml b/.github/workflows/test-artifacts.yml index ced412144c2..e85f5d9b6cc 100644 --- a/.github/workflows/test-artifacts.yml +++ b/.github/workflows/test-artifacts.yml @@ -29,14 +29,6 @@ on: test_repo_branch: description: 'Override for the GitHub test repository branch to use (default is main)' type: string - test_os_filter: - description: 'Filter tests to specific OS (e.g., al2023, ubuntu-22.04)' - type: string - required: false - test_dir_filter: - description: 'Filter tests to specific test directory (e.g., ./test/cloudwatchlogs)' - type: string - required: false workflow_call: inputs: build_id: @@ -46,14 +38,6 @@ on: test_repo_branch: description: 'Override for the GitHub test repository branch to use (default is main)' type: string - test_os_filter: - description: 
'Filter tests to specific OS (e.g., al2023, ubuntu-22.04)' - type: string - required: false - test_dir_filter: - description: 'Filter tests to specific test directory (e.g., ./test/cloudwatchlogs)' - type: string - required: false concurrency: group: ${{ github.workflow }}-${{ inputs.build_id }} @@ -181,42 +165,23 @@ jobs: id: set-matrix run: | go run --tags=generator generator/test_case_generator.go - - # Function to apply filters to a matrix - apply_filters() { - local matrix_file=$1 - local matrix_content=$(cat "$matrix_file") - - # Apply OS filter if provided - if [ -n "${{ inputs.test_os_filter }}" ]; then - matrix_content=$(echo "$matrix_content" | jq '[.[] | select(.os == "${{ inputs.test_os_filter }}")]') - fi - - # Apply test directory filter if provided - if [ -n "${{ inputs.test_dir_filter }}" ]; then - matrix_content=$(echo "$matrix_content" | jq '[.[] | select(.test_dir == "${{ inputs.test_dir_filter }}")]') - fi - - echo "$matrix_content" - } - - echo "ec2_gpu_matrix=$(apply_filters generator/resources/ec2_gpu_complete_test_matrix.json)" >> "$GITHUB_OUTPUT" - echo "eks_addon_matrix=$(apply_filters generator/resources/eks_addon_complete_test_matrix.json)" >> "$GITHUB_OUTPUT" - echo "ec2_linux_matrix=$(apply_filters generator/resources/ec2_linux_complete_test_matrix.json)" >> "$GITHUB_OUTPUT" - echo "ec2_linux_onprem_matrix=$(apply_filters generator/resources/ec2_linux_onprem_complete_test_matrix.json)" >> "$GITHUB_OUTPUT" - echo "ec2_selinux_matrix=$(apply_filters generator/resources/ec2_selinux_complete_test_matrix.json)" >> "$GITHUB_OUTPUT" - echo "ec2_windows_matrix=$(apply_filters generator/resources/ec2_windows_complete_test_matrix.json)" >> "$GITHUB_OUTPUT" - echo "ec2_mac_matrix=$(apply_filters generator/resources/ec2_mac_complete_test_matrix.json)" >> "$GITHUB_OUTPUT" - echo "ec2_performance_matrix=$(apply_filters generator/resources/ec2_performance_complete_test_matrix.json)" >> "$GITHUB_OUTPUT" - echo 
"ec2_windows_performance_matrix=$(apply_filters generator/resources/ec2_windows_performance_complete_test_matrix.json)" >> "$GITHUB_OUTPUT" - echo "ec2_stress_matrix=$(apply_filters generator/resources/ec2_stress_complete_test_matrix.json)" >> "$GITHUB_OUTPUT" - echo "ec2_windows_stress_matrix=$(apply_filters generator/resources/ec2_windows_stress_complete_test_matrix.json)" >> "$GITHUB_OUTPUT" - echo "ecs_ec2_launch_daemon_matrix=$(apply_filters generator/resources/ecs_ec2_daemon_complete_test_matrix.json)" >> "$GITHUB_OUTPUT" - echo "ecs_fargate_matrix=$(apply_filters generator/resources/ecs_fargate_complete_test_matrix.json)" >> "$GITHUB_OUTPUT" - echo "eks_daemon_matrix=$(apply_filters generator/resources/eks_daemon_complete_test_matrix.json)" >> "$GITHUB_OUTPUT" - echo "eks_deployment_matrix=$(apply_filters generator/resources/eks_deployment_complete_test_matrix.json)" >> "$GITHUB_OUTPUT" - echo "ec2_linux_itar_matrix=$(apply_filters generator/resources/ec2_linux_itar_complete_test_matrix.json)" >> "$GITHUB_OUTPUT" - echo "ec2_linux_china_matrix=$(apply_filters generator/resources/ec2_linux_china_complete_test_matrix.json)" >> "$GITHUB_OUTPUT" + echo "ec2_gpu_matrix=$(echo $(cat generator/resources/ec2_gpu_complete_test_matrix.json))" >> "$GITHUB_OUTPUT" + echo "eks_addon_matrix=$(echo $(cat generator/resources/eks_addon_complete_test_matrix.json))" >> "$GITHUB_OUTPUT" + echo "ec2_linux_matrix=$(echo $(cat generator/resources/ec2_linux_complete_test_matrix.json))" >> "$GITHUB_OUTPUT" + echo "ec2_linux_onprem_matrix=$(echo $(cat generator/resources/ec2_linux_onprem_complete_test_matrix.json))" >> "$GITHUB_OUTPUT" + echo "ec2_selinux_matrix=$(echo $(cat generator/resources/ec2_selinux_complete_test_matrix.json))" >> "$GITHUB_OUTPUT" + echo "ec2_windows_matrix=$(echo $(cat generator/resources/ec2_windows_complete_test_matrix.json))" >> "$GITHUB_OUTPUT" + echo "ec2_mac_matrix=$(echo $(cat generator/resources/ec2_mac_complete_test_matrix.json))" >> "$GITHUB_OUTPUT" 
+ echo "ec2_performance_matrix=$(echo $(cat generator/resources/ec2_performance_complete_test_matrix.json))" >> "$GITHUB_OUTPUT" + echo "ec2_windows_performance_matrix=$(echo $(cat generator/resources/ec2_windows_performance_complete_test_matrix.json))" >> "$GITHUB_OUTPUT" + echo "ec2_stress_matrix=$(echo $(cat generator/resources/ec2_stress_complete_test_matrix.json))" >> "$GITHUB_OUTPUT" + echo "ec2_windows_stress_matrix=$(echo $(cat generator/resources/ec2_windows_stress_complete_test_matrix.json))" >> "$GITHUB_OUTPUT" + echo "ecs_ec2_launch_daemon_matrix=$(echo $(cat generator/resources/ecs_ec2_daemon_complete_test_matrix.json))" >> "$GITHUB_OUTPUT" + echo "ecs_fargate_matrix=$(echo $(cat generator/resources/ecs_fargate_complete_test_matrix.json))" >> "$GITHUB_OUTPUT" + echo "eks_daemon_matrix=$(echo $(cat generator/resources/eks_daemon_complete_test_matrix.json))" >> "$GITHUB_OUTPUT" + echo "eks_deployment_matrix=$(echo $(cat generator/resources/eks_deployment_complete_test_matrix.json))" >> "$GITHUB_OUTPUT" + echo "ec2_linux_itar_matrix=$(echo $(cat generator/resources/ec2_linux_itar_complete_test_matrix.json))" >> "$GITHUB_OUTPUT" + echo "ec2_linux_china_matrix=$(echo $(cat generator/resources/ec2_linux_china_complete_test_matrix.json))" >> "$GITHUB_OUTPUT" - name: Echo test plan matrix run: | From 1b1973b98b442c348bfb128116429f1b48ad3c8d Mon Sep 17 00:00:00 2001 From: Marcus Mann Date: Wed, 11 Feb 2026 15:47:54 -0500 Subject: [PATCH 30/50] Trigger PR diff refresh From a814482696abb93191dd112adaa4fcd1c0ca3941 Mon Sep 17 00:00:00 2001 From: Marcus Mann Date: Wed, 11 Feb 2026 15:58:59 -0500 Subject: [PATCH 31/50] Revert gitignore changes (remove agent-sops) --- .gitignore | 1 - 1 file changed, 1 deletion(-) diff --git a/.gitignore b/.gitignore index 38313bfed5b..9118e6e9c2f 100644 --- a/.gitignore +++ b/.gitignore @@ -10,4 +10,3 @@ CWAGENT_VERSION terraform.* **/.terraform/* coverage.txt -agent-sops/ From fd2ea56a0db3d2638b8ac806f17ca3b4118af5bf Mon Sep 17 
00:00:00 2001 From: Marcus Mann Date: Thu, 12 Feb 2026 15:08:42 -0500 Subject: [PATCH 32/50] refactor(pusher): Remove unused maxSize parameter from NewRetryHeap MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The retry heap is now unbounded, so maxSize is no longer used. 🤖 Assisted by AI --- .../outputs/cloudwatchlogs/cloudwatchlogs.go | 2 +- .../internal/pusher/poison_pill_test.go | 10 +++++----- .../internal/pusher/pool_test.go | 2 +- .../internal/pusher/pusher_test.go | 2 +- .../internal/pusher/queue_test.go | 1 - .../internal/pusher/retryheap.go | 2 +- .../internal/pusher/retryheap_expiry_test.go | 2 +- .../pusher/retryheap_recovery_test.go | 8 ++++---- .../internal/pusher/retryheap_test.go | 20 +++++++++---------- .../cloudwatchlogs/internal/pusher/sender.go | 1 - .../internal/pusher/sender_test.go | 6 +++--- 11 files changed, 27 insertions(+), 29 deletions(-) diff --git a/plugins/outputs/cloudwatchlogs/cloudwatchlogs.go b/plugins/outputs/cloudwatchlogs/cloudwatchlogs.go index d35542ff930..6283b90f3bd 100644 --- a/plugins/outputs/cloudwatchlogs/cloudwatchlogs.go +++ b/plugins/outputs/cloudwatchlogs/cloudwatchlogs.go @@ -174,7 +174,7 @@ func (c *CloudWatchLogs) getDest(t pusher.Target, logSrc logs.LogSrc) *cwDest { c.once.Do(func() { if c.Concurrency > 1 { c.workerPool = pusher.NewWorkerPool(c.Concurrency) - c.retryHeap = pusher.NewRetryHeap(c.Concurrency, c.Log) + c.retryHeap = pusher.NewRetryHeap(c.Log) retryHeapProcessorRetryer := retryer.NewLogThrottleRetryer(c.Log) retryHeapProcessorClient := c.createClient(retryHeapProcessorRetryer) diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/poison_pill_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/poison_pill_test.go index bb4589d4e4e..5652209831f 100644 --- a/plugins/outputs/cloudwatchlogs/internal/pusher/poison_pill_test.go +++ b/plugins/outputs/cloudwatchlogs/internal/pusher/poison_pill_test.go @@ -39,7 +39,7 @@ import ( // The test passes because 
the current implementation uses a retry heap with // proper backoff, preventing failed batches from monopolizing worker threads. func TestPoisonPillScenario(t *testing.T) { - heap := NewRetryHeap(100, &testutil.Logger{}) + heap := NewRetryHeap(&testutil.Logger{}) defer heap.Stop() workerPool := NewWorkerPool(2) // Low concurrency as in the bug scenario @@ -165,7 +165,7 @@ func TestPoisonPillScenario(t *testing.T) { // CRITICAL ASSERTION: Allowed log group MUST receive events throughout the test successCount := allowedGroupSuccessCount.Load() t.Logf("Allowed group success count: %d, Denied group attempt count: %d", successCount, deniedGroupAttemptCount.Load()) - + assert.Greater(t, successCount, int32(5), "Allowed log group must continue receiving events despite continuous denied log group failures. Got %d, expected > 5", successCount) @@ -185,7 +185,7 @@ func TestPoisonPillScenario(t *testing.T) { func TestRetryHeapSmallerThanFailingLogGroups(t *testing.T) { concurrency := 2 numFailingLogGroups := 10 - + // Retry heap is now unbounded (maxSize parameter ignored) heap := NewRetryHeap(concurrency, &testutil.Logger{}) defer heap.Stop() @@ -303,7 +303,7 @@ func TestRetryHeapSmallerThanFailingLogGroups(t *testing.T) { close(processorDone) successCount := allowedGroupSuccessCount.Load() - + t.Logf("Results: Allowed success=%d, Denied attempts=%d, Heap size=%d, Failing groups=%d", successCount, deniedGroupAttemptCount.Load(), heap.Size(), numFailingLogGroups) @@ -318,7 +318,7 @@ func TestRetryHeapSmallerThanFailingLogGroups(t *testing.T) { // TestSingleDeniedLogGroup validates the baseline scenario where a single denied // log group does not affect the allowed log group. 
func TestSingleDeniedLogGroup(t *testing.T) { - heap := NewRetryHeap(10, &testutil.Logger{}) + heap := NewRetryHeap(&testutil.Logger{}) defer heap.Stop() workerPool := NewWorkerPool(4) // Higher concurrency as in initial test diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/pool_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/pool_test.go index 9abf746b5fd..52d2a1fbd63 100644 --- a/plugins/outputs/cloudwatchlogs/internal/pusher/pool_test.go +++ b/plugins/outputs/cloudwatchlogs/internal/pusher/pool_test.go @@ -137,7 +137,7 @@ func TestSenderPoolRetryHeap(_ *testing.T) { mockService.On("PutLogEvents", mock.Anything).Return(&cloudwatchlogs.PutLogEventsOutput{}, nil) // Create RetryHeap - retryHeap := NewRetryHeap(10, logger) + retryHeap := NewRetryHeap(logger) defer retryHeap.Stop() s := newSender(logger, mockService, nil, retryHeap) diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/pusher_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/pusher_test.go index b80a201a07e..e862c99b64f 100644 --- a/plugins/outputs/cloudwatchlogs/internal/pusher/pusher_test.go +++ b/plugins/outputs/cloudwatchlogs/internal/pusher/pusher_test.go @@ -135,7 +135,7 @@ func TestPusherRetryHeap(t *testing.T) { workerPool := NewWorkerPool(2) defer workerPool.Stop() - retryHeap := NewRetryHeap(10, logger) + retryHeap := NewRetryHeap(logger) defer retryHeap.Stop() var wg sync.WaitGroup diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/queue_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/queue_test.go index d4b64a17a4c..9ca08f7654b 100644 --- a/plugins/outputs/cloudwatchlogs/internal/pusher/queue_test.go +++ b/plugins/outputs/cloudwatchlogs/internal/pusher/queue_test.go @@ -646,7 +646,6 @@ func TestAddEventNonBlocking(t *testing.T) { wg.Wait() } - // Cannot call q.send() directly as it would cause a race condition. Reset last sent time and trigger flush. 
func triggerSend(t *testing.T, q *queue) { t.Helper() diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap.go b/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap.go index 90ff0c26539..b837be68310 100644 --- a/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap.go +++ b/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap.go @@ -59,7 +59,7 @@ type retryHeap struct { var _ RetryHeap = (*retryHeap)(nil) // NewRetryHeap creates a new retry heap (unbounded) -func NewRetryHeap(maxSize int, logger telegraf.Logger) RetryHeap { +func NewRetryHeap(logger telegraf.Logger) RetryHeap { rh := &retryHeap{ heap: make(retryHeapImpl, 0), stopCh: make(chan struct{}), diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_expiry_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_expiry_test.go index 8726a329890..35bd0c28261 100644 --- a/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_expiry_test.go +++ b/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_expiry_test.go @@ -44,7 +44,7 @@ func TestRetryHeapProcessorExpiredBatchShouldResume(t *testing.T) { target := Target{Group: "failing-group", Stream: "stream"} // Create retry heap and processor with very short expiry for testing - retryHeap := NewRetryHeap(10, logger) + retryHeap := NewRetryHeap(logger) workerPool := NewWorkerPool(5) tm := NewTargetManager(logger, mockService) maxRetryDuration := 50 * time.Millisecond // Normally 14 days diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_recovery_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_recovery_test.go index 067f4af3136..7225355f0d2 100644 --- a/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_recovery_test.go +++ b/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_recovery_test.go @@ -22,7 +22,7 @@ import ( // and successfully publishes logs. // This test addresses CWQS-3192 requirement 1. 
func TestRecoveryWhenPermissionGrantedDuringRetry(t *testing.T) { - heap := NewRetryHeap(10, &testutil.Logger{}) + heap := NewRetryHeap(&testutil.Logger{}) defer heap.Stop() workerPool := NewWorkerPool(2) @@ -33,7 +33,7 @@ func TestRecoveryWhenPermissionGrantedDuringRetry(t *testing.T) { accessDeniedErr := &cloudwatchlogs.AccessDeniedException{ Message_: stringPtr("Access denied"), } - + // First call fails with AccessDenied mockService.On("PutLogEvents", mock.Anything).Return((*cloudwatchlogs.PutLogEventsOutput)(nil), accessDeniedErr).Once() // Second call succeeds (permission granted) @@ -114,7 +114,7 @@ func TestRecoveryWhenPermissionGrantedDuringRetry(t *testing.T) { // retry ongoing, it resumes correctly by loading state and continuing retries. // This test addresses CWQS-3192 requirement 2. func TestRecoveryAfterSystemRestart(t *testing.T) { - heap := NewRetryHeap(10, &testutil.Logger{}) + heap := NewRetryHeap(&testutil.Logger{}) defer heap.Stop() workerPool := NewWorkerPool(2) @@ -185,7 +185,7 @@ func TestRecoveryAfterSystemRestart(t *testing.T) { // TestRecoveryWithMultipleTargets validates that when one target has permission // issues, other healthy targets continue publishing successfully. 
func TestRecoveryWithMultipleTargets(t *testing.T) { - heap := NewRetryHeap(10, &testutil.Logger{}) + heap := NewRetryHeap(&testutil.Logger{}) defer heap.Stop() workerPool := NewWorkerPool(2) diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_test.go index 038bdbfdbf5..2313239367c 100644 --- a/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_test.go +++ b/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_test.go @@ -16,7 +16,7 @@ import ( ) func TestRetryHeap(t *testing.T) { - heap := NewRetryHeap(10, &testutil.Logger{}) + heap := NewRetryHeap(&testutil.Logger{}) defer heap.Stop() // Test empty heap @@ -48,7 +48,7 @@ func TestRetryHeap(t *testing.T) { } func TestRetryHeapOrdering(t *testing.T) { - heap := NewRetryHeap(10, &testutil.Logger{}) + heap := NewRetryHeap(&testutil.Logger{}) defer heap.Stop() target := Target{Group: "group", Stream: "stream"} @@ -80,7 +80,7 @@ func TestRetryHeapOrdering(t *testing.T) { } func TestRetryHeapProcessor(t *testing.T) { - heap := NewRetryHeap(10, &testutil.Logger{}) + heap := NewRetryHeap(&testutil.Logger{}) defer heap.Stop() // Create mock components with proper signature @@ -99,7 +99,7 @@ func TestRetryHeapProcessor(t *testing.T) { } func TestRetryHeapProcessorExpiredBatch(t *testing.T) { - heap := NewRetryHeap(10, &testutil.Logger{}) + heap := NewRetryHeap(&testutil.Logger{}) defer heap.Stop() workerPool := NewWorkerPool(2) @@ -123,7 +123,7 @@ func TestRetryHeapProcessorExpiredBatch(t *testing.T) { } func TestRetryHeapProcessorSendsBatch(t *testing.T) { - heap := NewRetryHeap(10, &testutil.Logger{}) + heap := NewRetryHeap(&testutil.Logger{}) defer heap.Stop() workerPool := NewWorkerPool(2) @@ -146,7 +146,7 @@ func TestRetryHeapProcessorSendsBatch(t *testing.T) { } func TestRetryHeap_UnboundedPush(t *testing.T) { - heap := NewRetryHeap(2, &testutil.Logger{}) // maxSize parameter ignored (unbounded) + heap := 
NewRetryHeap(&testutil.Logger{}) // maxSize parameter ignored (unbounded) defer heap.Stop() // Push multiple batches without blocking @@ -189,7 +189,7 @@ func TestRetryHeap_UnboundedPush(t *testing.T) { } func TestRetryHeapProcessorNoReadyBatches(t *testing.T) { - heap := NewRetryHeap(10, &testutil.Logger{}) + heap := NewRetryHeap(&testutil.Logger{}) defer heap.Stop() workerPool := NewWorkerPool(2) @@ -206,7 +206,7 @@ func TestRetryHeapProcessorNoReadyBatches(t *testing.T) { } func TestRetryHeapProcessorFailedBatchGoesBackToHeap(t *testing.T) { - heap := NewRetryHeap(10, &testutil.Logger{}) + heap := NewRetryHeap(&testutil.Logger{}) defer heap.Stop() workerPool := NewWorkerPool(2) @@ -246,7 +246,7 @@ func TestRetryHeapProcessorFailedBatchGoesBackToHeap(t *testing.T) { } func TestRetryHeapStopTwice(t *testing.T) { - rh := NewRetryHeap(5, &testutil.Logger{}) + rh := NewRetryHeap(&testutil.Logger{}) // Call Stop twice - should not panic rh.Stop() @@ -263,7 +263,7 @@ func TestRetryHeapStopTwice(t *testing.T) { } func TestRetryHeapProcessorStoppedProcessReadyMessages(t *testing.T) { - heap := NewRetryHeap(10, &testutil.Logger{}) + heap := NewRetryHeap(&testutil.Logger{}) defer heap.Stop() workerPool := NewWorkerPool(2) diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/sender.go b/plugins/outputs/cloudwatchlogs/internal/pusher/sender.go index ad38d7960f7..6a34be1e43e 100644 --- a/plugins/outputs/cloudwatchlogs/internal/pusher/sender.go +++ b/plugins/outputs/cloudwatchlogs/internal/pusher/sender.go @@ -151,4 +151,3 @@ func (s *sender) Stop() { close(s.stopCh) s.stopped = true } - diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/sender_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/sender_test.go index fc52e673a88..973533f3ab8 100644 --- a/plugins/outputs/cloudwatchlogs/internal/pusher/sender_test.go +++ b/plugins/outputs/cloudwatchlogs/internal/pusher/sender_test.go @@ -252,11 +252,11 @@ func TestSender(t *testing.T) { 
Return(&cloudwatchlogs.PutLogEventsOutput{}, awserr.New("SomeAWSError", "Some AWS error", nil)).Once() s := newSender(logger, mockService, mockManager, nil) - + // Set expireAfter to past time so batch expires immediately after first retry batch.initializeStartTime() batch.expireAfter = time.Now().Add(-1 * time.Hour) - + s.Send(batch) s.Stop() @@ -304,7 +304,7 @@ func TestSenderConcurrencyWithRetryHeap(t *testing.T) { mockManager := new(mockTargetManager) mockService.On("PutLogEvents", mock.Anything).Return(&cloudwatchlogs.PutLogEventsOutput{}, &cloudwatchlogs.ServiceUnavailableException{}).Once() - retryHeap := NewRetryHeap(10, logger) + retryHeap := NewRetryHeap(logger) defer retryHeap.Stop() s := newSender(logger, mockService, mockManager, retryHeap) From 38afc5f638ada8935720fc8411b6c5438bf9a695 Mon Sep 17 00:00:00 2001 From: Marcus Mann Date: Thu, 12 Feb 2026 15:09:21 -0500 Subject: [PATCH 33/50] fix(pusher): Remove redundant updateState call in retryheap MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit batch.done() already calls updateState() internally, so the explicit call is unnecessary. 
🤖 Assisted by AI --- plugins/outputs/cloudwatchlogs/internal/pusher/retryheap.go | 1 - 1 file changed, 1 deletion(-) diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap.go b/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap.go index b837be68310..258c2795e23 100644 --- a/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap.go +++ b/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap.go @@ -198,7 +198,6 @@ func (p *RetryHeapProcessor) processReadyMessages() { // Check if batch has expired if batch.isExpired() { p.logger.Errorf("Dropping expired batch for %v/%v", batch.Group, batch.Stream) - batch.updateState() batch.done() // Resume circuit breaker to allow target to process new batches continue } From 3e1ed82c3ee35361bb91e516a719b59d7047008b Mon Sep 17 00:00:00 2001 From: Marcus Mann Date: Thu, 12 Feb 2026 15:09:57 -0500 Subject: [PATCH 34/50] test(pusher): Remove empty TestSenderPoolRetryHeap test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Test had no assertions and was not validating any behavior. 
🤖 Assisted by AI --- .../internal/pusher/pool_test.go | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/pool_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/pool_test.go index 52d2a1fbd63..ed74249250e 100644 --- a/plugins/outputs/cloudwatchlogs/internal/pusher/pool_test.go +++ b/plugins/outputs/cloudwatchlogs/internal/pusher/pool_test.go @@ -130,21 +130,3 @@ func TestSenderPool(t *testing.T) { s.Stop() assert.Equal(t, int32(200), completed.Load()) } - -func TestSenderPoolRetryHeap(_ *testing.T) { - logger := testutil.NewNopLogger() - mockService := new(mockLogsService) - mockService.On("PutLogEvents", mock.Anything).Return(&cloudwatchlogs.PutLogEventsOutput{}, nil) - - // Create RetryHeap - retryHeap := NewRetryHeap(logger) - defer retryHeap.Stop() - - s := newSender(logger, mockService, nil, retryHeap) - p := NewWorkerPool(12) - defer p.Stop() - - sp := newSenderPool(p, s) - - sp.Stop() -} From 2d07b385050952eed980396ec8649e25b02ca10d Mon Sep 17 00:00:00 2001 From: Marcus Mann Date: Thu, 12 Feb 2026 15:10:31 -0500 Subject: [PATCH 35/50] docs(pusher): Clean up verbose test comment in queue_test.go MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 🤖 Assisted by AI --- plugins/outputs/cloudwatchlogs/internal/pusher/queue_test.go | 5 ----- 1 file changed, 5 deletions(-) diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/queue_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/queue_test.go index 9ca08f7654b..cd85b17f98b 100644 --- a/plugins/outputs/cloudwatchlogs/internal/pusher/queue_test.go +++ b/plugins/outputs/cloudwatchlogs/internal/pusher/queue_test.go @@ -847,8 +847,3 @@ func TestQueueHaltResume(t *testing.T) { // TestQueueResumeOnBatchExpiry verifies that when a batch expires after 14 days of retrying, // the circuit breaker resumes the queue to allow new batches to be processed. 
-// This prevents the target from being permanently blocked when a bad batch is eventually dropped. -// -// Scenario from PR comment: "Say a bad batch from a target caused this to halt. Now that bad batch -// is re-tried for 14 days and eventually dropped - but this never gets resumed in that case right? -// So this target is blocked forever in that scenario?" From c739bb97422b475a6b867b2163aad13f42fdd8e9 Mon Sep 17 00:00:00 2001 From: Marcus Mann Date: Thu, 12 Feb 2026 15:11:06 -0500 Subject: [PATCH 36/50] docs(pusher): Clean up verbose test comment in retryheap_expiry_test.go MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 🤖 Assisted by AI --- .../internal/pusher/retryheap_expiry_test.go | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_expiry_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_expiry_test.go index 35bd0c28261..a3d0088f253 100644 --- a/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_expiry_test.go +++ b/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_expiry_test.go @@ -14,12 +14,8 @@ import ( "github.com/aws/amazon-cloudwatch-agent/tool/testutil" ) -// TestRetryHeapProcessorExpiredBatchShouldResume demonstrates the bug where -// expired batches don't resume the circuit breaker, leaving the target permanently blocked. -// -// From PR comment: "Say a bad batch from a target caused this to halt. Now that bad batch -// is re-tried for 14 days and eventually dropped - but this never gets resumed in that case right? -// So this target is blocked forever in that scenario?" +// TestRetryHeapProcessorExpiredBatchShouldResume verifies that expired batches +// resume the circuit breaker, preventing the target from being permanently blocked. 
func TestRetryHeapProcessorExpiredBatchShouldResume(t *testing.T) { logger := testutil.NewNopLogger() From 4e7393af9ccd75ae75044ffeb01142e01c4be0a7 Mon Sep 17 00:00:00 2001 From: Marcus Mann Date: Thu, 12 Feb 2026 15:11:47 -0500 Subject: [PATCH 37/50] test(pusher): Remove unused circuitBreakerHalted variable MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Variable was set but never checked in the test. 🤖 Assisted by AI --- .../internal/pusher/retryheap_expiry_test.go | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_expiry_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_expiry_test.go index a3d0088f253..f04a26f39b6 100644 --- a/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_expiry_test.go +++ b/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_expiry_test.go @@ -56,14 +56,9 @@ func TestRetryHeapProcessorExpiredBatchShouldResume(t *testing.T) { batch := newLogEventBatch(target, nil) batch.append(newLogEvent(time.Now(), "test message", nil)) - // Set up callbacks to track circuit breaker state - var circuitBreakerHalted atomic.Bool + // Set up callback to track circuit breaker resume var circuitBreakerResumed atomic.Bool - batch.addFailCallback(func() { - circuitBreakerHalted.Store(true) - }) - batch.addDoneCallback(func() { circuitBreakerResumed.Store(true) }) From f640a193abfbe58345c3964a606344830a220139 Mon Sep 17 00:00:00 2001 From: Marcus Mann Date: Thu, 12 Feb 2026 15:12:26 -0500 Subject: [PATCH 38/50] test(pusher): Use exact assertion for circuit breaker send count MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Circuit breaker should always block after exactly 1 send attempt, not "at most 1". 
🤖 Assisted by AI --- .../cloudwatchlogs/internal/pusher/circuitbreaker_test.go | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/circuitbreaker_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/circuitbreaker_test.go index 972d1fab482..e141b6f5af1 100644 --- a/plugins/outputs/cloudwatchlogs/internal/pusher/circuitbreaker_test.go +++ b/plugins/outputs/cloudwatchlogs/internal/pusher/circuitbreaker_test.go @@ -89,10 +89,8 @@ func TestCircuitBreakerBlocksTargetAfterFailure(t *testing.T) { // Circuit breaker assertion: after the first failure, the failing target should // NOT have sent additional batches. Only 1 send attempt should have been made // before the circuit breaker blocks it. - assert.LessOrEqual(t, failingTargetSendCount.Load(), int32(1), - "Circuit breaker should block failing target from sending more than 1 batch, "+ - "but %d batches were sent. Without a circuit breaker, the failing target "+ - "continues flooding the worker pool with bad requests.", failingTargetSendCount.Load()) + assert.Equal(t, int32(1), failingTargetSendCount.Load(), + "Circuit breaker should block failing target after exactly 1 send attempt") // Healthy target should continue sending successfully assert.Greater(t, healthyTargetSendCount.Load(), int32(0), From eb589127d94453966b44890eedb5f203e99ca572 Mon Sep 17 00:00:00 2001 From: Marcus Mann Date: Thu, 12 Feb 2026 15:13:11 -0500 Subject: [PATCH 39/50] test(pusher): Remove ineffective dummyBatch code in TestQueueHaltResume MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The dummyBatch was not connected to the queue's circuit breaker, so calling done() on it had no effect. Simplified test to only verify halt behavior. 
🤖 Assisted by AI --- .../internal/pusher/queue_test.go | 21 ------------------- 1 file changed, 21 deletions(-) diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/queue_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/queue_test.go index cd85b17f98b..75cdf220844 100644 --- a/plugins/outputs/cloudwatchlogs/internal/pusher/queue_test.go +++ b/plugins/outputs/cloudwatchlogs/internal/pusher/queue_test.go @@ -821,27 +821,6 @@ func TestQueueHaltResume(t *testing.T) { // Verify only one send happened (queue is halted) assert.Equal(t, int32(1), sendCount.Load(), "Should have only one send due to halt") - // Trigger flush to force send of second batch - this should block until resumed - done := make(chan bool) - go func() { - time.Sleep(100 * time.Millisecond) // Wait a bit - // Manually resume by calling success callback on a dummy batch - dummyBatch := newLogEventBatch(Target{"G", "S", util.StandardLogGroupClass, -1}, nil) - dummyBatch.addDoneCallback(func() { - // This simulates a successful send that should resume the queue - }) - dummyBatch.done() - done <- true - }() - - // This should eventually complete when the queue is resumed - select { - case <-done: - // Success - the resume worked - case <-time.After(5 * time.Second): - t.Fatal("Test timed out - queue may be permanently halted") - } - mockSender.AssertExpectations(t) } From 3d9d77916d3dcb3f14d1e4165a884f36f15971bb Mon Sep 17 00:00:00 2001 From: Marcus Mann Date: Thu, 12 Feb 2026 22:09:30 -0500 Subject: [PATCH 40/50] fix(pusher): Address review feedback on poison pill PR MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Replace sync.Cond with channel-based halt/resume to prevent shutdown deadlock (waitIfHalted now selects on haltCh and stopCh) - Add mutex to halt/resume/waitIfHalted for thread safety - Add TestQueueStopWhileHalted to verify no shutdown deadlock - Add TestQueueHaltResume with proper resume assertions - Clean up verbose test comments and 
weak assertions - Remove orphaned TestQueueResumeOnBatchExpiry comment 🤖 Assisted by AI --- .../internal/pusher/circuitbreaker_test.go | 2 +- .../internal/pusher/poison_pill_test.go | 42 +++---------- .../cloudwatchlogs/internal/pusher/queue.go | 42 ++++++++----- .../internal/pusher/queue_test.go | 62 +++++++++++++++++-- .../internal/pusher/retryheap_expiry_test.go | 5 +- 5 files changed, 94 insertions(+), 59 deletions(-) diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/circuitbreaker_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/circuitbreaker_test.go index e141b6f5af1..f12e64df1f0 100644 --- a/plugins/outputs/cloudwatchlogs/internal/pusher/circuitbreaker_test.go +++ b/plugins/outputs/cloudwatchlogs/internal/pusher/circuitbreaker_test.go @@ -52,7 +52,7 @@ func TestCircuitBreakerBlocksTargetAfterFailure(t *testing.T) { concurrency := 5 workerPool := NewWorkerPool(concurrency) - retryHeap := NewRetryHeap(concurrency, logger) + retryHeap := NewRetryHeap(logger) defer workerPool.Stop() defer retryHeap.Stop() diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/poison_pill_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/poison_pill_test.go index 5652209831f..4c19b169c49 100644 --- a/plugins/outputs/cloudwatchlogs/internal/pusher/poison_pill_test.go +++ b/plugins/outputs/cloudwatchlogs/internal/pusher/poison_pill_test.go @@ -18,26 +18,9 @@ import ( "github.com/aws/amazon-cloudwatch-agent/sdk/service/cloudwatchlogs" ) -// TestPoisonPillScenario validates that when multiple log groups encounter -// AccessDenied errors simultaneously with low concurrency, the agent continues -// publishing to allowed log groups without blocking the entire pipeline. 
-// -// This test recreates the scenario from poison-pill-test-findings.md where: -// - 1 allowed log group + 10 denied log groups -// - Concurrency = 2 -// - Continuous stream of new batches (simulating force_flush_interval=5s) -// - Expected: Allowed log group continues receiving events -// - Historical Bug: Agent stopped publishing to ALL log groups after ~5 minutes -// -// This test validates that the retry heap and worker pool architecture correctly -// handles this scenario by: -// 1. Continuously generating batches for 10 denied + 1 allowed log group -// 2. Processing with only 2 workers (low concurrency) -// 3. Verifying allowed log group continues to receive events throughout -// 4. Ensuring worker pool doesn't get saturated by failed retry attempts -// -// The test passes because the current implementation uses a retry heap with -// proper backoff, preventing failed batches from monopolizing worker threads. +// TestPoisonPillScenario validates that when 10 denied + 1 allowed log groups +// share a worker pool with concurrency=2, the allowed log group continues +// publishing without being starved by failed retries. func TestPoisonPillScenario(t *testing.T) { heap := NewRetryHeap(&testutil.Logger{}) defer heap.Stop() @@ -174,20 +157,14 @@ func TestPoisonPillScenario(t *testing.T) { "Denied log groups should have attempted to send") } -// TestRetryHeapSmallerThanFailingLogGroups tests the specific bottleneck scenario where: -// - Retry heap size = concurrency (e.g., 2) -// - Number of failing log groups (10) > retry heap size (2) -// - With bounded heap: This caused deadlock as heap filled up -// - With unbounded heap: System handles this gracefully -// -// This test validates the FIX: unbounded retry heap allows all failed batches -// to be queued without blocking workers. +// TestRetryHeapSmallerThanFailingLogGroups verifies that with an unbounded retry +// heap, the system handles more failing log groups than workers without deadlock. 
func TestRetryHeapSmallerThanFailingLogGroups(t *testing.T) { concurrency := 2 numFailingLogGroups := 10 // Retry heap is now unbounded (maxSize parameter ignored) - heap := NewRetryHeap(concurrency, &testutil.Logger{}) + heap := NewRetryHeap(&testutil.Logger{}) defer heap.Stop() workerPool := NewWorkerPool(concurrency) @@ -308,11 +285,8 @@ func TestRetryHeapSmallerThanFailingLogGroups(t *testing.T) { successCount, deniedGroupAttemptCount.Load(), heap.Size(), numFailingLogGroups) // With unbounded heap, allowed log group should receive events - if successCount == 0 { - t.Errorf("UNEXPECTED: Allowed log group received 0 events with unbounded heap") - } else { - t.Logf("SUCCESS: Unbounded heap handled poison pill scenario: %d successful publishes despite %d failing groups", successCount, numFailingLogGroups) - } + assert.Greater(t, successCount, int32(0), + "Allowed log group must receive events despite %d failing groups", numFailingLogGroups) } // TestSingleDeniedLogGroup validates the baseline scenario where a single denied diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/queue.go b/plugins/outputs/cloudwatchlogs/internal/pusher/queue.go index 5d297aed525..d32b0baa0f2 100644 --- a/plugins/outputs/cloudwatchlogs/internal/pusher/queue.go +++ b/plugins/outputs/cloudwatchlogs/internal/pusher/queue.go @@ -44,8 +44,9 @@ type queue struct { wg *sync.WaitGroup // Circuit breaker halt/resume functionality - haltCond *sync.Cond - halted bool + haltMu sync.Mutex + haltCh chan struct{} + halted bool } var _ (Queue) = (*queue)(nil) @@ -71,7 +72,7 @@ func newQueue( stopCh: make(chan struct{}), startNonBlockCh: make(chan struct{}), wg: wg, - haltCond: sync.NewCond(&sync.Mutex{}), + haltCh: make(chan struct{}), halted: false, } q.flushTimeout.Store(flushTimeout) @@ -258,28 +259,37 @@ func hasValidTime(e logs.LogEvent) bool { return true } -// waitIfHalted blocks until the queue is unhalted (circuit breaker functionality) +// waitIfHalted blocks until the queue is 
unhalted or stopped. func (q *queue) waitIfHalted() { - q.haltCond.L.Lock() - for q.halted { - q.haltCond.Wait() + q.haltMu.Lock() + if !q.halted { + q.haltMu.Unlock() + return + } + ch := q.haltCh + q.haltMu.Unlock() + select { + case <-ch: + case <-q.stopCh: } - q.haltCond.L.Unlock() } -// halt stops the queue from sending batches (called on failure) +// halt stops the queue from sending batches (called on failure). func (q *queue) halt() { - q.haltCond.L.Lock() + q.haltMu.Lock() + defer q.haltMu.Unlock() q.halted = true - q.haltCond.L.Unlock() } -// resume allows the queue to send batches again (called on success) +// resume allows the queue to send batches again (called on success). func (q *queue) resume() { - q.haltCond.L.Lock() - q.halted = false - q.haltCond.Broadcast() - q.haltCond.L.Unlock() + q.haltMu.Lock() + defer q.haltMu.Unlock() + if q.halted { + q.halted = false + close(q.haltCh) + q.haltCh = make(chan struct{}) + } } // onFailCallback returns a callback function to be executed after a failed send diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/queue_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/queue_test.go index 75cdf220844..293218e185a 100644 --- a/plugins/outputs/cloudwatchlogs/internal/pusher/queue_test.go +++ b/plugins/outputs/cloudwatchlogs/internal/pusher/queue_test.go @@ -730,7 +730,7 @@ func TestQueueCallbackRegistration(t *testing.T) { flushTimer: time.NewTimer(10 * time.Millisecond), startNonBlockCh: make(chan struct{}), wg: &wg, - haltCond: sync.NewCond(&sync.Mutex{}), + haltCh: make(chan struct{}), halted: false, } q.flushTimeout.Store(10 * time.Millisecond) @@ -774,7 +774,7 @@ func TestQueueCallbackRegistration(t *testing.T) { flushTimer: time.NewTimer(10 * time.Millisecond), startNonBlockCh: make(chan struct{}), wg: &wg, - haltCond: sync.NewCond(&sync.Mutex{}), + haltCh: make(chan struct{}), halted: false, } q.flushTimeout.Store(10 * time.Millisecond) @@ -815,14 +815,68 @@ func TestQueueHaltResume(t 
*testing.T) { // Wait a bit for the first send to complete and halt time.Sleep(50 * time.Millisecond) + // Verify queue is halted + queueImpl := q.(*queue) + queueImpl.haltMu.Lock() + assert.True(t, queueImpl.halted, "Queue should be halted after failure") + queueImpl.haltMu.Unlock() + // Add second event - should be queued but not sent due to halt q.AddEvent(newStubLogEvent("second message", time.Now())) // Verify only one send happened (queue is halted) assert.Equal(t, int32(1), sendCount.Load(), "Should have only one send due to halt") + // Trigger resume by calling the success callback directly + queueImpl.resume() + + // Verify queue is no longer halted + queueImpl.haltMu.Lock() + assert.False(t, queueImpl.halted, "Queue should be resumed after success") + queueImpl.haltMu.Unlock() + + // Add third event - should trigger send since queue is resumed + q.AddEvent(newStubLogEvent("third message", time.Now())) + + // Wait for the second send to complete + time.Sleep(50 * time.Millisecond) + + // Verify second send happened (queue resumed) + assert.Equal(t, int32(2), sendCount.Load(), "Should have two sends after resume") + mockSender.AssertExpectations(t) } -// TestQueueResumeOnBatchExpiry verifies that when a batch expires after 14 days of retrying, -// the circuit breaker resumes the queue to allow new batches to be processed. +// TestQueueStopWhileHalted verifies that Stop() unblocks a halted queue. +// Without the stopCh select in waitIfHalted, this would deadlock. 
+func TestQueueStopWhileHalted(t *testing.T) { + logger := testutil.NewNopLogger() + + mockSender := &mockSender{} + mockSender.On("Send", mock.Anything).Run(func(args mock.Arguments) { + batch := args.Get(0).(*logEventBatch) + batch.fail() // Halt the queue + }).Return() + mockSender.On("Stop").Return() + + var wg sync.WaitGroup + q := newQueue(logger, Target{"G", "S", util.StandardLogGroupClass, -1}, 10*time.Millisecond, nil, mockSender, &wg) + + // Add event to trigger send → fail → halt + q.AddEvent(newStubLogEvent("msg", time.Now())) + time.Sleep(50 * time.Millisecond) + + // Queue is now halted. Stop must return without deadlocking. + done := make(chan struct{}) + go func() { + q.Stop() + close(done) + }() + + select { + case <-done: + // Success — Stop() returned + case <-time.After(2 * time.Second): + t.Fatal("Stop() deadlocked on halted queue") + } +} diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_expiry_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_expiry_test.go index f04a26f39b6..1daac1e603f 100644 --- a/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_expiry_test.go +++ b/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_expiry_test.go @@ -88,8 +88,5 @@ func TestRetryHeapProcessorExpiredBatchShouldResume(t *testing.T) { // The circuit breaker SHOULD be resumed when the batch expires // This allows the target to continue processing new batches after the bad batch is dropped assert.True(t, circuitBreakerResumed.Load(), - "Circuit breaker should be resumed after batch expiry. "+ - "When a batch is retried for 14 days and eventually dropped, "+ - "the target must be unblocked to allow new batches to be processed. 
"+ - "Otherwise the target remains blocked forever.") + "Circuit breaker should resume after batch expiry to unblock the target") } From bef0a7d78f54d897c1fa84c27037f8e42fb845d2 Mon Sep 17 00:00:00 2001 From: Marcus Mann Date: Thu, 12 Feb 2026 22:20:30 -0500 Subject: [PATCH 41/50] test(pusher): Add state callback tests for retry heap MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Verify state file management during retry, expiry, and shutdown: - Successful retry persists file offsets via state callbacks - Expired batch (14d) still persists offsets to prevent re-read - Clean shutdown does not persist state for unprocessed batches 🤖 Assisted by AI --- .../internal/pusher/state_callback_test.go | 188 ++++++++++++++++++ 1 file changed, 188 insertions(+) create mode 100644 plugins/outputs/cloudwatchlogs/internal/pusher/state_callback_test.go diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/state_callback_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/state_callback_test.go new file mode 100644 index 00000000000..8190e3a40b5 --- /dev/null +++ b/plugins/outputs/cloudwatchlogs/internal/pusher/state_callback_test.go @@ -0,0 +1,188 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: MIT + +package pusher + +import ( + "sync/atomic" + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/mock" + + "github.com/aws/amazon-cloudwatch-agent/internal/retryer" + "github.com/aws/amazon-cloudwatch-agent/internal/state" + "github.com/aws/amazon-cloudwatch-agent/sdk/service/cloudwatchlogs" + "github.com/aws/amazon-cloudwatch-agent/tool/testutil" +) + +type mockFileRangeQueue struct { + mock.Mock +} + +func (m *mockFileRangeQueue) ID() string { + return m.Called().String(0) +} + +func (m *mockFileRangeQueue) Enqueue(r state.Range) { + m.Called(r) +} + +// newStatefulBatch creates a batch with stateful events that register state callbacks. +func newStatefulBatch(target Target, queue *mockFileRangeQueue) *logEventBatch { + batch := newLogEventBatch(target, nil) + now := time.Now() + evt := newStatefulLogEvent(now, "test", nil, &logEventState{ + r: state.NewRange(0, 100), + queue: queue, + }) + batch.append(evt) + return batch +} + +// TestRetryHeapSuccessCallsStateCallback verifies that when a batch succeeds +// on retry through the heap, state callbacks fire to persist file offsets. 
+func TestRetryHeapSuccessCallsStateCallback(t *testing.T) { + logger := testutil.NewNopLogger() + target := Target{Group: "group", Stream: "stream"} + + queue := &mockFileRangeQueue{} + queue.On("ID").Return("file1") + queue.On("Enqueue", mock.Anything).Return() + + service := &stubLogsService{ + ple: func(_ *cloudwatchlogs.PutLogEventsInput) (*cloudwatchlogs.PutLogEventsOutput, error) { + return &cloudwatchlogs.PutLogEventsOutput{}, nil + }, + cls: func(_ *cloudwatchlogs.CreateLogStreamInput) (*cloudwatchlogs.CreateLogStreamOutput, error) { + return &cloudwatchlogs.CreateLogStreamOutput{}, nil + }, + clg: func(_ *cloudwatchlogs.CreateLogGroupInput) (*cloudwatchlogs.CreateLogGroupOutput, error) { + return &cloudwatchlogs.CreateLogGroupOutput{}, nil + }, + dlg: func(_ *cloudwatchlogs.DescribeLogGroupsInput) (*cloudwatchlogs.DescribeLogGroupsOutput, error) { + return &cloudwatchlogs.DescribeLogGroupsOutput{}, nil + }, + } + + retryHeap := NewRetryHeap(logger) + workerPool := NewWorkerPool(2) + tm := NewTargetManager(logger, service) + defer retryHeap.Stop() + defer workerPool.Stop() + + processor := NewRetryHeapProcessor(retryHeap, workerPool, service, tm, logger, time.Hour, retryer.NewLogThrottleRetryer(logger)) + + batch := newStatefulBatch(target, queue) + batch.nextRetryTime = time.Now().Add(-1 * time.Second) + + err := retryHeap.Push(batch) + assert.NoError(t, err) + + processor.processReadyMessages() + time.Sleep(200 * time.Millisecond) + + assert.Equal(t, 0, retryHeap.Size(), "Heap should be empty after success") + queue.AssertCalled(t, "Enqueue", mock.Anything) +} + +// TestRetryHeapExpiryCallsStateCallback verifies that when a batch expires +// after 14 days without successfully publishing, state callbacks still fire +// to persist file offsets and prevent re-reading on restart. 
+func TestRetryHeapExpiryCallsStateCallback(t *testing.T) { + logger := testutil.NewNopLogger() + target := Target{Group: "group", Stream: "stream"} + + queue := &mockFileRangeQueue{} + queue.On("ID").Return("file1") + queue.On("Enqueue", mock.Anything).Return() + + service := &stubLogsService{ + ple: func(_ *cloudwatchlogs.PutLogEventsInput) (*cloudwatchlogs.PutLogEventsOutput, error) { + return nil, &cloudwatchlogs.ServiceUnavailableException{} + }, + cls: func(_ *cloudwatchlogs.CreateLogStreamInput) (*cloudwatchlogs.CreateLogStreamOutput, error) { + return &cloudwatchlogs.CreateLogStreamOutput{}, nil + }, + clg: func(_ *cloudwatchlogs.CreateLogGroupInput) (*cloudwatchlogs.CreateLogGroupOutput, error) { + return &cloudwatchlogs.CreateLogGroupOutput{}, nil + }, + dlg: func(_ *cloudwatchlogs.DescribeLogGroupsInput) (*cloudwatchlogs.DescribeLogGroupsOutput, error) { + return &cloudwatchlogs.DescribeLogGroupsOutput{}, nil + }, + } + + retryHeap := NewRetryHeap(logger) + workerPool := NewWorkerPool(2) + tm := NewTargetManager(logger, service) + defer retryHeap.Stop() + defer workerPool.Stop() + + processor := NewRetryHeapProcessor(retryHeap, workerPool, service, tm, logger, 50*time.Millisecond, nil) + + batch := newStatefulBatch(target, queue) + batch.initializeStartTime() + batch.expireAfter = time.Now().Add(-10 * time.Millisecond) // Already expired + batch.updateRetryMetadata(&cloudwatchlogs.ServiceUnavailableException{}) + batch.nextRetryTime = time.Now().Add(-1 * time.Second) // Override to make it ready + + err := retryHeap.Push(batch) + assert.NoError(t, err) + + processor.processReadyMessages() + time.Sleep(200 * time.Millisecond) + + assert.Equal(t, 0, retryHeap.Size(), "Expired batch should be removed") + queue.AssertCalled(t, "Enqueue", mock.Anything) +} + +// TestShutdownDoesNotCallStateCallback verifies that during a clean shutdown +// via Stop(), remaining batches in the retry heap do NOT have their state +// callbacks invoked. 
This prevents marking undelivered data as processed. +func TestShutdownDoesNotCallStateCallback(t *testing.T) { + logger := testutil.NewNopLogger() + target := Target{Group: "group", Stream: "stream"} + + var stateCallCount atomic.Int32 + + retryHeap := NewRetryHeap(logger) + workerPool := NewWorkerPool(2) + defer workerPool.Stop() + + service := &stubLogsService{ + ple: func(_ *cloudwatchlogs.PutLogEventsInput) (*cloudwatchlogs.PutLogEventsOutput, error) { + return nil, &cloudwatchlogs.ServiceUnavailableException{} + }, + cls: func(_ *cloudwatchlogs.CreateLogStreamInput) (*cloudwatchlogs.CreateLogStreamOutput, error) { + return &cloudwatchlogs.CreateLogStreamOutput{}, nil + }, + clg: func(_ *cloudwatchlogs.CreateLogGroupInput) (*cloudwatchlogs.CreateLogGroupOutput, error) { + return &cloudwatchlogs.CreateLogGroupOutput{}, nil + }, + dlg: func(_ *cloudwatchlogs.DescribeLogGroupsInput) (*cloudwatchlogs.DescribeLogGroupsOutput, error) { + return &cloudwatchlogs.DescribeLogGroupsOutput{}, nil + }, + } + tm := NewTargetManager(logger, service) + + processor := NewRetryHeapProcessor(retryHeap, workerPool, service, tm, logger, time.Hour, nil) + processor.Start() + + // Push a batch with a future retry time so it won't be processed before Stop + batch := newLogEventBatch(target, nil) + batch.append(newLogEvent(time.Now(), "test", nil)) + batch.addStateCallback(func() { stateCallCount.Add(1) }) + batch.nextRetryTime = time.Now().Add(1 * time.Hour) // Not ready yet + + err := retryHeap.Push(batch) + assert.NoError(t, err) + + // Stop the processor — batch is still in heap, not ready + processor.Stop() + retryHeap.Stop() + + assert.Equal(t, int32(0), stateCallCount.Load(), + "State callback should not be called for unprocessed batches during shutdown") + assert.Equal(t, 1, retryHeap.Size(), "Batch should remain in heap after shutdown") +} From 55bc63cb42dabc90fa9433906810da483ef8f1e3 Mon Sep 17 00:00:00 2001 From: Marcus Mann Date: Thu, 12 Feb 2026 22:27:00 -0500 Subject: 
[PATCH 42/50] refactor(pusher): Audit and fix test assertions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Fix TestRetryHeapProcessorSendsBatch: add events to batch, verify PutLogEvents is called and done callback fires (was testing empty batch) - Fix TestRetryHeapProcessorExpiredBatch: set expireAfter field so isExpired() actually returns true, verify done() is called - Fix race in TestRetryHeapProcessorSendsBatch: use atomic.Bool - Reduce TestRetryHeap_UnboundedPush sleep from 3s to 100ms 🤖 Assisted by AI --- .../internal/pusher/retryheap_test.go | 33 +++++++++++++------ 1 file changed, 23 insertions(+), 10 deletions(-) diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_test.go index 2313239367c..d86888856bf 100644 --- a/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_test.go +++ b/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_test.go @@ -4,6 +4,7 @@ package pusher import ( + "sync/atomic" "testing" "time" @@ -109,17 +110,20 @@ func TestRetryHeapProcessorExpiredBatch(t *testing.T) { processor := NewRetryHeapProcessor(heap, workerPool, mockService, mockTargetManager, &testutil.Logger{}, 1*time.Millisecond, retryer.NewLogThrottleRetryer(&testutil.Logger{})) - // Create expired batch target := Target{Group: "group", Stream: "stream"} batch := newLogEventBatch(target, nil) - batch.startTime = time.Now().Add(-1 * time.Hour) + batch.initializeStartTime() + batch.expireAfter = time.Now().Add(-1 * time.Hour) // Already expired batch.nextRetryTime = time.Now().Add(-1 * time.Second) + var doneCalled bool + batch.addDoneCallback(func() { doneCalled = true }) + heap.Push(batch) - // Process should drop expired batch processor.processReadyMessages() - assert.Equal(t, 0, heap.Size()) + assert.Equal(t, 0, heap.Size(), "Expired batch should be removed from heap") + assert.True(t, doneCalled, "done() should be called on expired 
batch to resume circuit breaker") } func TestRetryHeapProcessorSendsBatch(t *testing.T) { @@ -128,21 +132,30 @@ func TestRetryHeapProcessorSendsBatch(t *testing.T) { workerPool := NewWorkerPool(2) defer workerPool.Stop() + mockService := &mockLogsService{} + mockService.On("PutLogEvents", mock.Anything).Return(&cloudwatchlogs.PutLogEventsOutput{}, nil) mockTargetManager := &mockTargetManager{} + mockTargetManager.On("EnsureTargetExists", mock.Anything).Return(nil) processor := NewRetryHeapProcessor(heap, workerPool, mockService, mockTargetManager, &testutil.Logger{}, time.Hour, retryer.NewLogThrottleRetryer(&testutil.Logger{})) - // Create ready batch (retryTime already past) target := Target{Group: "group", Stream: "stream"} batch := newLogEventBatch(target, nil) + batch.append(newLogEvent(time.Now(), "test message", nil)) batch.nextRetryTime = time.Now().Add(-1 * time.Second) + var doneCalled atomic.Bool + batch.addDoneCallback(func() { doneCalled.Store(true) }) + heap.Push(batch) - // Process should send batch processor.processReadyMessages() + time.Sleep(200 * time.Millisecond) + assert.Equal(t, 0, heap.Size()) + assert.True(t, doneCalled.Load(), "Batch done callback should be called on successful send") + mockService.AssertCalled(t, "PutLogEvents", mock.Anything) } func TestRetryHeap_UnboundedPush(t *testing.T) { @@ -152,11 +165,11 @@ func TestRetryHeap_UnboundedPush(t *testing.T) { // Push multiple batches without blocking target := Target{Group: "group", Stream: "stream"} batch1 := newLogEventBatch(target, nil) - batch1.nextRetryTime = time.Now().Add(3 * time.Second) + batch1.nextRetryTime = time.Now().Add(50 * time.Millisecond) batch2 := newLogEventBatch(target, nil) - batch2.nextRetryTime = time.Now().Add(3 * time.Second) + batch2.nextRetryTime = time.Now().Add(50 * time.Millisecond) batch3 := newLogEventBatch(target, nil) - batch3.nextRetryTime = time.Now().Add(3 * time.Second) + batch3.nextRetryTime = time.Now().Add(50 * time.Millisecond) // All pushes 
should succeed immediately (non-blocking) err := heap.Push(batch1) @@ -171,7 +184,7 @@ func TestRetryHeap_UnboundedPush(t *testing.T) { t.Fatalf("Expected size 3, got %d", heap.Size()) } - time.Sleep(3 * time.Second) + time.Sleep(100 * time.Millisecond) // Pop ready batches readyBatches := heap.PopReady() From c1e194404e4a04b4e9bfb876dfb68284f41960aa Mon Sep 17 00:00:00 2001 From: Marcus Mann Date: Thu, 12 Feb 2026 22:39:40 -0500 Subject: [PATCH 43/50] refactor(pusher): Remove redundant TestRetryHeapSmallerThanFailingLogGroups MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit TestPoisonPillScenario already covers the same scenario (10 denied + 1 allowed with low concurrency). The bounded heap no longer exists so the 'smaller than' framing is no longer meaningful. 🤖 Assisted by AI --- .../internal/pusher/poison_pill_test.go | 133 ------------------ 1 file changed, 133 deletions(-) diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/poison_pill_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/poison_pill_test.go index 4c19b169c49..2959ce41e00 100644 --- a/plugins/outputs/cloudwatchlogs/internal/pusher/poison_pill_test.go +++ b/plugins/outputs/cloudwatchlogs/internal/pusher/poison_pill_test.go @@ -4,7 +4,6 @@ package pusher import ( - "fmt" "sync" "sync/atomic" "testing" @@ -157,138 +156,6 @@ func TestPoisonPillScenario(t *testing.T) { "Denied log groups should have attempted to send") } -// TestRetryHeapSmallerThanFailingLogGroups verifies that with an unbounded retry -// heap, the system handles more failing log groups than workers without deadlock. 
-func TestRetryHeapSmallerThanFailingLogGroups(t *testing.T) { - concurrency := 2 - numFailingLogGroups := 10 - - // Retry heap is now unbounded (maxSize parameter ignored) - heap := NewRetryHeap(&testutil.Logger{}) - defer heap.Stop() - - workerPool := NewWorkerPool(concurrency) - defer workerPool.Stop() - - mockService := &mockLogsService{} - mockTargetManager := &mockTargetManager{} - mockTargetManager.On("EnsureTargetExists", mock.Anything).Return(nil) - - accessDeniedErr := &cloudwatchlogs.AccessDeniedException{ - Message_: stringPtr("Access denied"), - } - - var allowedGroupSuccessCount atomic.Int32 - var deniedGroupAttemptCount atomic.Int32 - - mockService.On("PutLogEvents", mock.MatchedBy(func(input *cloudwatchlogs.PutLogEventsInput) bool { - return *input.LogGroupName == "allowed" - })).Return(&cloudwatchlogs.PutLogEventsOutput{}, nil).Run(func(args mock.Arguments) { - time.Sleep(10 * time.Millisecond) - allowedGroupSuccessCount.Add(1) - }) - - mockService.On("PutLogEvents", mock.MatchedBy(func(input *cloudwatchlogs.PutLogEventsInput) bool { - return *input.LogGroupName != "allowed" - })).Return((*cloudwatchlogs.PutLogEventsOutput)(nil), accessDeniedErr).Run(func(args mock.Arguments) { - time.Sleep(10 * time.Millisecond) - deniedGroupAttemptCount.Add(1) - }) - - processor := NewRetryHeapProcessor(heap, workerPool, mockService, mockTargetManager, &testutil.Logger{}, 50*time.Millisecond, retryer.NewLogThrottleRetryer(&testutil.Logger{})) - - // Create targets - allowedTarget := Target{Group: "allowed", Stream: "stream"} - deniedTargets := make([]Target, numFailingLogGroups) - for i := 0; i < numFailingLogGroups; i++ { - deniedTargets[i] = Target{Group: fmt.Sprintf("denied-%d", i), Stream: "stream"} - } - - done := make(chan struct{}) - var wg sync.WaitGroup - - // Generate batches for all failing log groups continuously - for i := 0; i < numFailingLogGroups; i++ { - wg.Add(1) - go func(target Target) { - defer wg.Done() - ticker := time.NewTicker(30 * 
time.Millisecond) - defer ticker.Stop() - batchCount := 0 - for { - select { - case <-done: - return - case <-ticker.C: - if batchCount >= 3 { - return - } - batch := createBatch(target, 10) - batch.nextRetryTime = time.Now().Add(-1 * time.Second) - heap.Push(batch) - batchCount++ - } - } - }(deniedTargets[i]) - } - - // Generate batches for allowed log group - wg.Add(1) - go func() { - defer wg.Done() - ticker := time.NewTicker(30 * time.Millisecond) - defer ticker.Stop() - batchCount := 0 - for { - select { - case <-done: - return - case <-ticker.C: - if batchCount >= 5 { - return - } - batch := createBatch(allowedTarget, 10) - batch.nextRetryTime = time.Now().Add(-1 * time.Second) - heap.Push(batch) - batchCount++ - } - } - }() - - // Process continuously - processorDone := make(chan struct{}) - go func() { - ticker := time.NewTicker(15 * time.Millisecond) - defer ticker.Stop() - for { - select { - case <-processorDone: - return - case <-ticker.C: - processor.processReadyMessages() - } - } - }() - - // Run for 1 second - time.Sleep(1 * time.Second) - close(done) - wg.Wait() - time.Sleep(300 * time.Millisecond) - processor.processReadyMessages() - time.Sleep(100 * time.Millisecond) - close(processorDone) - - successCount := allowedGroupSuccessCount.Load() - - t.Logf("Results: Allowed success=%d, Denied attempts=%d, Heap size=%d, Failing groups=%d", - successCount, deniedGroupAttemptCount.Load(), heap.Size(), numFailingLogGroups) - - // With unbounded heap, allowed log group should receive events - assert.Greater(t, successCount, int32(0), - "Allowed log group must receive events despite %d failing groups", numFailingLogGroups) -} - // TestSingleDeniedLogGroup validates the baseline scenario where a single denied // log group does not affect the allowed log group. 
func TestSingleDeniedLogGroup(t *testing.T) { From 334acdff784121c7185ebda873eadda04f7a6090 Mon Sep 17 00:00:00 2001 From: Marcus Mann Date: Thu, 12 Feb 2026 22:48:06 -0500 Subject: [PATCH 44/50] docs(pusher): Remove internal ticket references from test comments MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 🤖 Assisted by AI --- .../cloudwatchlogs/internal/pusher/retryheap_recovery_test.go | 2 -- 1 file changed, 2 deletions(-) diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_recovery_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_recovery_test.go index 7225355f0d2..747272d1bed 100644 --- a/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_recovery_test.go +++ b/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_recovery_test.go @@ -20,7 +20,6 @@ import ( // TestRecoveryWhenPermissionGrantedDuringRetry validates that when PLE permissions // are missing initially but granted while retry is ongoing, the system recovers // and successfully publishes logs. -// This test addresses CWQS-3192 requirement 1. func TestRecoveryWhenPermissionGrantedDuringRetry(t *testing.T) { heap := NewRetryHeap(&testutil.Logger{}) defer heap.Stop() @@ -112,7 +111,6 @@ func TestRecoveryWhenPermissionGrantedDuringRetry(t *testing.T) { // TestRecoveryAfterSystemRestart validates that when the system restarts with // retry ongoing, it resumes correctly by loading state and continuing retries. -// This test addresses CWQS-3192 requirement 2. 
func TestRecoveryAfterSystemRestart(t *testing.T) { heap := NewRetryHeap(&testutil.Logger{}) defer heap.Stop() From b6f3b3ed5901717c0d6079eec63b7bd0ee590583 Mon Sep 17 00:00:00 2001 From: Marcus Mann Date: Thu, 12 Feb 2026 22:50:24 -0500 Subject: [PATCH 45/50] refactor(pusher): Simplify fail callback to direct method reference MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 🤖 Assisted by AI --- plugins/outputs/cloudwatchlogs/internal/pusher/queue.go | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/queue.go b/plugins/outputs/cloudwatchlogs/internal/pusher/queue.go index d32b0baa0f2..86f1bd6e4c2 100644 --- a/plugins/outputs/cloudwatchlogs/internal/pusher/queue.go +++ b/plugins/outputs/cloudwatchlogs/internal/pusher/queue.go @@ -182,7 +182,7 @@ func (q *queue) merge(mergeChan chan logs.LogEvent) { func (q *queue) send() { if len(q.batch.events) > 0 { q.batch.addDoneCallback(q.onSuccessCallback(q.batch.bufferedSize)) - q.batch.addFailCallback(q.onFailCallback()) + q.batch.addFailCallback(q.halt) // Wait if halted (circuit breaker) q.waitIfHalted() @@ -291,10 +291,3 @@ func (q *queue) resume() { q.haltCh = make(chan struct{}) } } - -// onFailCallback returns a callback function to be executed after a failed send -func (q *queue) onFailCallback() func() { - return func() { - q.halt() - } -} From 98bdc89ea88ad5fd2e8602a3b35f2f165b5f7531 Mon Sep 17 00:00:00 2001 From: Marcus Mann Date: Fri, 13 Feb 2026 11:34:25 -0500 Subject: [PATCH 46/50] style(pusher): Fix unused parameter lint warnings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 🤖 Assisted by AI --- .../cloudwatchlogs/internal/pusher/poison_pill_test.go | 6 +++--- .../cloudwatchlogs/internal/pusher/retryheap_expiry_test.go | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/poison_pill_test.go 
b/plugins/outputs/cloudwatchlogs/internal/pusher/poison_pill_test.go index 2959ce41e00..3500173e0af 100644 --- a/plugins/outputs/cloudwatchlogs/internal/pusher/poison_pill_test.go +++ b/plugins/outputs/cloudwatchlogs/internal/pusher/poison_pill_test.go @@ -42,14 +42,14 @@ func TestPoisonPillScenario(t *testing.T) { // Configure mock service responses with realistic latency mockService.On("PutLogEvents", mock.MatchedBy(func(input *cloudwatchlogs.PutLogEventsInput) bool { return *input.LogGroupName == "log-stream-ple-access-granted" - })).Return(&cloudwatchlogs.PutLogEventsOutput{}, nil).Run(func(args mock.Arguments) { + })).Return(&cloudwatchlogs.PutLogEventsOutput{}, nil).Run(func(_ mock.Arguments) { time.Sleep(10 * time.Millisecond) // Simulate API latency allowedGroupSuccessCount.Add(1) }) mockService.On("PutLogEvents", mock.MatchedBy(func(input *cloudwatchlogs.PutLogEventsInput) bool { return *input.LogGroupName != "log-stream-ple-access-granted" - })).Return((*cloudwatchlogs.PutLogEventsOutput)(nil), accessDeniedErr).Run(func(args mock.Arguments) { + })).Return((*cloudwatchlogs.PutLogEventsOutput)(nil), accessDeniedErr).Run(func(_ mock.Arguments) { time.Sleep(10 * time.Millisecond) // Simulate API latency deniedGroupAttemptCount.Add(1) }) @@ -177,7 +177,7 @@ func TestSingleDeniedLogGroup(t *testing.T) { mockService.On("PutLogEvents", mock.MatchedBy(func(input *cloudwatchlogs.PutLogEventsInput) bool { return *input.LogGroupName == "log-stream-ple-access-granted" - })).Return(&cloudwatchlogs.PutLogEventsOutput{}, nil).Run(func(args mock.Arguments) { + })).Return(&cloudwatchlogs.PutLogEventsOutput{}, nil).Run(func(_ mock.Arguments) { allowedGroupSuccessCount.Add(1) }) diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_expiry_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_expiry_test.go index 1daac1e603f..cdbbd56838f 100644 --- a/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_expiry_test.go +++ 
b/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_expiry_test.go @@ -21,7 +21,7 @@ func TestRetryHeapProcessorExpiredBatchShouldResume(t *testing.T) { var sendAttempts atomic.Int32 mockService := &stubLogsService{ - ple: func(input *cloudwatchlogs.PutLogEventsInput) (*cloudwatchlogs.PutLogEventsOutput, error) { + ple: func(_ *cloudwatchlogs.PutLogEventsInput) (*cloudwatchlogs.PutLogEventsOutput, error) { sendAttempts.Add(1) // Always fail to simulate a problematic target return nil, &cloudwatchlogs.ServiceUnavailableException{} From 1aad58edcbb6b21bffe01dd4163832e3149cc3ef Mon Sep 17 00:00:00 2001 From: Jeffrey Chien Date: Mon, 16 Feb 2026 16:47:36 -0500 Subject: [PATCH 47/50] Fix E2E test workflow permissions (#2028) --- .../application-signals-e2e-test.yml | 21 ++++++---- .github/workflows/build-test-artifacts.yml | 16 +++---- .github/workflows/e2e-test.yml | 42 ++++++++++++++++++- .../eks-performance-cluster-addon-install.yml | 6 ++- .github/workflows/integration-test.yml | 21 ++++++---- .github/workflows/wd-integration-test.yml | 21 ++++++---- 6 files changed, 96 insertions(+), 31 deletions(-) diff --git a/.github/workflows/application-signals-e2e-test.yml b/.github/workflows/application-signals-e2e-test.yml index 656d75b1298..94f4579a581 100644 --- a/.github/workflows/application-signals-e2e-test.yml +++ b/.github/workflows/application-signals-e2e-test.yml @@ -41,13 +41,20 @@ jobs: exit 1 fi - run: | - conclusion=$(gh run view ${{ inputs.build_run_id }} --repo $GITHUB_REPOSITORY --json conclusion -q '.conclusion') - if [[ $conclusion == "success" ]]; then - echo "Run succeeded" - else - echo "Run failed" - exit 1 - fi + for i in {1..6}; do + conclusion=$(gh run view ${{ inputs.build_run_id }} --repo $GITHUB_REPOSITORY --json conclusion -q '.conclusion') + if [[ "$conclusion" == "success" ]]; then + echo "Run succeeded" + exit 0 + elif [[ "$conclusion" == "failure" || "$conclusion" == "cancelled" ]]; then + echo "Run failed with: $conclusion" + exit 1 
+ fi + echo "Waiting for workflow to complete (attempt $i)..." + sleep 5 + done + echo "Timed out waiting for workflow" + exit 1 env: GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/build-test-artifacts.yml b/.github/workflows/build-test-artifacts.yml index ad180deb77d..e5b0d793a4b 100644 --- a/.github/workflows/build-test-artifacts.yml +++ b/.github/workflows/build-test-artifacts.yml @@ -19,14 +19,14 @@ on: - cron: '0 11 * * 1,2,3,4,5' # Every day at 11:00 UTC on Monday to Friday workflow_dispatch: inputs: - test-image-before-upload: - description: "Run Test on the new container image" + run-tests: + description: "Run test workflows after build" default: true type: boolean workflow_call: inputs: - test-image-before-upload: - description: "Run Test on the new container image" + run-tests: + description: "Run test workflows after build" default: true type: boolean @@ -114,7 +114,7 @@ jobs: StartIntegrationTests: needs: [ BuildAndUploadPackages, BuildAndUploadITAR, BuildAndUploadCN, BuildDocker, BuildDistributor ] - if: ${{ github.event_name == 'push' || inputs.test-image-before-upload }} + if: ${{ github.event_name == 'push' || inputs.run-tests }} runs-on: ubuntu-latest permissions: actions: write @@ -126,7 +126,7 @@ jobs: StartApplicationSignalsE2ETests: needs: [ BuildAndUploadPackages, BuildAndUploadITAR, BuildAndUploadCN, BuildDocker, BuildDistributor ] # Workflow only runs against main - if: ${{ github.event_name == 'push' || inputs.test-image-before-upload }} + if: ${{ github.event_name == 'push' || inputs.run-tests }} runs-on: ubuntu-latest permissions: actions: write @@ -137,7 +137,7 @@ jobs: StartEKSE2ETests: needs: [ BuildAndUploadPackages, BuildAndUploadITAR, BuildAndUploadCN, BuildDocker, BuildDistributor ] - if: ${{ github.event_name == 'push' || inputs.test-image-before-upload }} + if: ${{ github.event_name == 'push' || inputs.run-tests }} runs-on: ubuntu-latest permissions: actions: write @@ -148,7 +148,7 @@ jobs: 
StartWorkloadDiscoveryIntegrationTests: needs: [ BuildAndUploadPackages, BuildAndUploadITAR, BuildAndUploadCN, BuildDocker, BuildDistributor ] - if: ${{ github.event_name == 'push' || inputs.test-image-before-upload }} + if: ${{ github.event_name == 'push' || inputs.run-tests }} runs-on: ubuntu-latest permissions: actions: write diff --git a/.github/workflows/e2e-test.yml b/.github/workflows/e2e-test.yml index 6b907c472b6..667992e6978 100644 --- a/.github/workflows/e2e-test.yml +++ b/.github/workflows/e2e-test.yml @@ -69,12 +69,16 @@ jobs: permissions: id-token: write contents: read + actions: write with: - test-image-before-upload: false + run-tests: false BuildOperator: needs: [GetLatestOperatorCommitSHA] uses: aws/amazon-cloudwatch-agent-operator/.github/workflows/build-and-upload.yml@main + permissions: + id-token: write + contents: read concurrency: group: ${{ github.workflow }}-operator-${{ inputs.operator-branch || 'main' }} cancel-in-progress: true @@ -159,6 +163,9 @@ jobs: needs: [ GetLatestOperatorCommitSHA, GenerateTestMatrix, OutputEnvVariables ] if: always() && !cancelled() && !contains(needs.*.result, 'failure') name: 'EKSE2EJVMTomcatTestHelm' + permissions: + id-token: write + contents: read uses: ./.github/workflows/eks-e2e-test.yml with: terraform_dir: terraform/eks/e2e @@ -183,6 +190,9 @@ jobs: needs: [ GetLatestOperatorCommitSHA, GenerateTestMatrix, OutputEnvVariables ] if: always() && !cancelled() && !contains(needs.*.result, 'failure') name: 'EKSE2EJVMTomcatTestAddon' + permissions: + id-token: write + contents: read uses: ./.github/workflows/eks-e2e-test.yml with: terraform_dir: terraform/eks/e2e @@ -207,6 +217,9 @@ jobs: needs: [ GetLatestOperatorCommitSHA, GenerateTestMatrix, OutputEnvVariables ] if: always() && !cancelled() && !contains(needs.*.result, 'failure') name: 'EKSE2EKafkaTestHelm' + permissions: + id-token: write + contents: read uses: ./.github/workflows/eks-e2e-test.yml with: terraform_dir: terraform/eks/e2e @@ -231,6 +244,9 @@ 
jobs: needs: [ GetLatestOperatorCommitSHA, GenerateTestMatrix, OutputEnvVariables ] if: always() && !cancelled() && !contains(needs.*.result, 'failure') name: 'EKSE2EKafkaTestAddon' + permissions: + id-token: write + contents: read uses: ./.github/workflows/eks-e2e-test.yml with: terraform_dir: terraform/eks/e2e @@ -255,6 +271,9 @@ jobs: needs: [ GetLatestOperatorCommitSHA, GenerateTestMatrix, OutputEnvVariables ] if: always() && !cancelled() && !contains(needs.*.result, 'failure') name: 'EKSE2EJMXContainerInsightsTestHelm' + permissions: + id-token: write + contents: read uses: ./.github/workflows/eks-e2e-test.yml with: terraform_dir: terraform/eks/e2e @@ -279,6 +298,9 @@ jobs: needs: [ GetLatestOperatorCommitSHA, GenerateTestMatrix, OutputEnvVariables ] if: always() && !cancelled() && !contains(needs.*.result, 'failure') name: 'EKSE2EJMXContainerInsightsTestAddon' + permissions: + id-token: write + contents: read uses: ./.github/workflows/eks-e2e-test.yml with: terraform_dir: terraform/eks/e2e @@ -303,6 +325,9 @@ jobs: needs: [ GetLatestOperatorCommitSHA, GenerateTestMatrix, OutputEnvVariables ] if: always() && !cancelled() && !contains(needs.*.result, 'failure') name: 'EKSE2EJVMTomcatTestHelmIPv6' + permissions: + id-token: write + contents: read uses: ./.github/workflows/eks-e2e-test.yml with: terraform_dir: terraform/eks/e2e @@ -329,6 +354,9 @@ jobs: needs: [ GetLatestOperatorCommitSHA, GenerateTestMatrix, OutputEnvVariables ] if: always() && !cancelled() && !contains(needs.*.result, 'failure') name: 'EKSE2EJVMTomcatTestAddonIPv6' + permissions: + id-token: write + contents: read uses: ./.github/workflows/eks-e2e-test.yml with: terraform_dir: terraform/eks/e2e @@ -355,6 +383,9 @@ jobs: needs: [ GetLatestOperatorCommitSHA, GenerateTestMatrix, OutputEnvVariables ] if: always() && !cancelled() && !contains(needs.*.result, 'failure') name: 'EKSE2EKafkaTestHelmIPv6' + permissions: + id-token: write + contents: read uses: ./.github/workflows/eks-e2e-test.yml with: 
terraform_dir: terraform/eks/e2e @@ -381,6 +412,9 @@ jobs: needs: [ GetLatestOperatorCommitSHA, GenerateTestMatrix, OutputEnvVariables ] if: always() && !cancelled() && !contains(needs.*.result, 'failure') name: 'EKSE2EKafkaTestAddonIPv6' + permissions: + id-token: write + contents: read uses: ./.github/workflows/eks-e2e-test.yml with: terraform_dir: terraform/eks/e2e @@ -407,6 +441,9 @@ jobs: needs: [ GetLatestOperatorCommitSHA, GenerateTestMatrix, OutputEnvVariables ] if: always() && !cancelled() && !contains(needs.*.result, 'failure') name: 'EKSE2EJMXContainerInsightsTestHelmIPv6' + permissions: + id-token: write + contents: read uses: ./.github/workflows/eks-e2e-test.yml with: terraform_dir: terraform/eks/e2e @@ -433,6 +470,9 @@ jobs: needs: [ GetLatestOperatorCommitSHA, GenerateTestMatrix, OutputEnvVariables ] if: always() && !cancelled() && !contains(needs.*.result, 'failure') name: 'EKSE2EJMXContainerInsightsTestAddonIPv6' + permissions: + id-token: write + contents: read uses: ./.github/workflows/eks-e2e-test.yml with: terraform_dir: terraform/eks/e2e diff --git a/.github/workflows/eks-performance-cluster-addon-install.yml b/.github/workflows/eks-performance-cluster-addon-install.yml index 60292bf99d7..37580e1680b 100644 --- a/.github/workflows/eks-performance-cluster-addon-install.yml +++ b/.github/workflows/eks-performance-cluster-addon-install.yml @@ -122,14 +122,18 @@ jobs: permissions: id-token: write contents: read + actions: write with: - test-image-before-upload: false + run-tests: false # Build and upload operator image to ECR repo BuildOperator: needs: [ check-trigger, GetLatestOperatorCommitSHA ] if: ${{ needs.check-trigger.outputs.should_continue == 'true' }} uses: aws/amazon-cloudwatch-agent-operator/.github/workflows/build-and-upload.yml@main + permissions: + id-token: write + contents: read concurrency: group: ${{ github.workflow }}-operator-${{ inputs.operator-branch || 'main' }} cancel-in-progress: true diff --git 
a/.github/workflows/integration-test.yml b/.github/workflows/integration-test.yml index 5534cd38889..ef433f8e011 100644 --- a/.github/workflows/integration-test.yml +++ b/.github/workflows/integration-test.yml @@ -57,13 +57,20 @@ jobs: exit 1 fi - run: | - conclusion=$(gh run view ${{ inputs.build_run_id }} --repo $GITHUB_REPOSITORY --json conclusion -q '.conclusion') - if [[ $conclusion == "success" ]]; then - echo "Run succeeded" - else - echo "Run failed" - exit 1 - fi + for i in {1..6}; do + conclusion=$(gh run view ${{ inputs.build_run_id }} --repo $GITHUB_REPOSITORY --json conclusion -q '.conclusion') + if [[ "$conclusion" == "success" ]]; then + echo "Run succeeded" + exit 0 + elif [[ "$conclusion" == "failure" || "$conclusion" == "cancelled" ]]; then + echo "Run failed with: $conclusion" + exit 1 + fi + echo "Waiting for workflow to complete (attempt $i)..." + sleep 5 + done + echo "Timed out waiting for workflow" + exit 1 env: GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/wd-integration-test.yml b/.github/workflows/wd-integration-test.yml index b3b35bf2025..c5edc3f1f5e 100644 --- a/.github/workflows/wd-integration-test.yml +++ b/.github/workflows/wd-integration-test.yml @@ -42,13 +42,20 @@ jobs: exit 1 fi - run: |- - conclusion=$(gh run view ${{ inputs.build_run_id }} --repo $GITHUB_REPOSITORY --json conclusion -q '.conclusion') - if [[ $conclusion == "success" ]]; then - echo "Run succeeded" - else - echo "Run failed" - exit 1 - fi + for i in {1..6}; do + conclusion=$(gh run view ${{ inputs.build_run_id }} --repo $GITHUB_REPOSITORY --json conclusion -q '.conclusion') + if [[ "$conclusion" == "success" ]]; then + echo "Run succeeded" + exit 0 + elif [[ "$conclusion" == "failure" || "$conclusion" == "cancelled" ]]; then + echo "Run failed with: $conclusion" + exit 1 + fi + echo "Waiting for workflow to complete (attempt $i)..." 
+ sleep 5 + done + echo "Timed out waiting for workflow" + exit 1 env: GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} From 48918e2b6e6d35b36eb690853d7d8e56cf6ebea1 Mon Sep 17 00:00:00 2001 From: Jeffrey Chien Date: Tue, 24 Feb 2026 12:26:04 -0500 Subject: [PATCH 48/50] Bump minimum workflow go version to match go.mod (#2036) --- .github/workflows/PR-build.yml | 6 +++--- .github/workflows/PR-test.yml | 4 ++-- .github/workflows/e2e-test.yml | 4 ++-- .github/workflows/eks-performance-cluster-tests.yml | 2 +- .github/workflows/otel-fork-replace.yml | 2 +- .github/workflows/test-artifacts.yml | 6 +++--- .github/workflows/test-build-docker.yml | 2 +- .github/workflows/test-build-packages.yml | 2 +- .github/workflows/test-build.yml | 2 +- .github/workflows/upload-dependencies.yml | 2 +- 10 files changed, 16 insertions(+), 16 deletions(-) diff --git a/.github/workflows/PR-build.yml b/.github/workflows/PR-build.yml index ac764e576cd..9a89c0db3d6 100644 --- a/.github/workflows/PR-build.yml +++ b/.github/workflows/PR-build.yml @@ -48,7 +48,7 @@ jobs: if: needs.changes.outputs.lint == 'true' uses: actions/setup-go@v4 with: - go-version: ~1.25 + go-version: ~1.25.7 cache: false - name: Check out code @@ -105,7 +105,7 @@ jobs: if: needs.changes.outputs.build == 'true' uses: actions/setup-go@v4 with: - go-version: ~1.25 + go-version: ~1.25.7 cache: false - name: Check out code @@ -152,7 +152,7 @@ jobs: if: needs.changes.outputs.build == 'true' uses: actions/setup-go@v4 with: - go-version: ~1.25 + go-version: ~1.25.7 cache: false - name: Check out code diff --git a/.github/workflows/PR-test.yml b/.github/workflows/PR-test.yml index 7e4f85f7bf5..742e9870f29 100644 --- a/.github/workflows/PR-test.yml +++ b/.github/workflows/PR-test.yml @@ -95,7 +95,7 @@ jobs: - name: Set up Go 1.x uses: actions/setup-go@v4 with: - go-version: ~1.25 + go-version: ~1.25.7 StartLocalStack: name: 'StartLocalStack' @@ -138,7 +138,7 @@ jobs: - name: Set up Go 1.x uses: actions/setup-go@v4 with: - go-version: ~1.25 
+ go-version: ~1.25.7 - name: Install jq run: sudo apt-get install -y jq diff --git a/.github/workflows/e2e-test.yml b/.github/workflows/e2e-test.yml index 667992e6978..5397885df69 100644 --- a/.github/workflows/e2e-test.yml +++ b/.github/workflows/e2e-test.yml @@ -110,7 +110,7 @@ jobs: - name: Set up Go 1.x uses: actions/setup-go@v4 with: - go-version: ~1.25 + go-version: ~1.25.7 - name: SetOutputs id: set-outputs @@ -147,7 +147,7 @@ jobs: - name: Set up Go 1.x uses: actions/setup-go@v4 with: - go-version: ~1.25 + go-version: ~1.25.7 - name: Generate matrix id: set-matrix diff --git a/.github/workflows/eks-performance-cluster-tests.yml b/.github/workflows/eks-performance-cluster-tests.yml index 89ce188b2f9..90429fab79d 100644 --- a/.github/workflows/eks-performance-cluster-tests.yml +++ b/.github/workflows/eks-performance-cluster-tests.yml @@ -122,7 +122,7 @@ jobs: - name: Set up Go 1.x uses: actions/setup-go@v4 with: - go-version: ~1.25 + go-version: ~1.25.7 - uses: actions/checkout@v4 with: diff --git a/.github/workflows/otel-fork-replace.yml b/.github/workflows/otel-fork-replace.yml index 29707139b47..0f33b732b09 100644 --- a/.github/workflows/otel-fork-replace.yml +++ b/.github/workflows/otel-fork-replace.yml @@ -33,7 +33,7 @@ jobs: - name: Set up Go 1.x uses: actions/setup-go@v4 with: - go-version: ~1.25 + go-version: ~1.25.7 cache: false - name: Update OTel fork components version id: set-matrix diff --git a/.github/workflows/test-artifacts.yml b/.github/workflows/test-artifacts.yml index ff4c70a63d9..292adee2f38 100644 --- a/.github/workflows/test-artifacts.yml +++ b/.github/workflows/test-artifacts.yml @@ -145,7 +145,7 @@ jobs: - name: Set up Go 1.x uses: actions/setup-go@v4 with: - go-version: ~1.25 + go-version: ~1.25.7 GenerateTestMatrix: name: 'GenerateTestMatrix' @@ -183,7 +183,7 @@ jobs: - name: Set up Go 1.x uses: actions/setup-go@v4 with: - go-version: ~1.25 + go-version: ~1.25.7 - name: Generate matrix id: set-matrix @@ -303,7 +303,7 @@ jobs: - 
name: Set up Go 1.x uses: actions/setup-go@v2 with: - go-version: ~1.25 + go-version: ~1.25.7 - name: Configure AWS Credentials uses: aws-actions/configure-aws-credentials@v4 diff --git a/.github/workflows/test-build-docker.yml b/.github/workflows/test-build-docker.yml index 03c5d143c08..7d474f94dc6 100644 --- a/.github/workflows/test-build-docker.yml +++ b/.github/workflows/test-build-docker.yml @@ -143,7 +143,7 @@ jobs: - name: Set up Go 1.x uses: actions/setup-go@v4 with: - go-version: ~1.25 + go-version: ~1.25.7 - name: Configure AWS Credentials uses: aws-actions/configure-aws-credentials@v4 diff --git a/.github/workflows/test-build-packages.yml b/.github/workflows/test-build-packages.yml index 0814a80047b..a14bc4e960a 100644 --- a/.github/workflows/test-build-packages.yml +++ b/.github/workflows/test-build-packages.yml @@ -80,7 +80,7 @@ jobs: - name: Set up Go 1.x uses: actions/setup-go@v4 with: - go-version: ~1.25 + go-version: ~1.25.7 - name: Free up disk space working-directory: cwa diff --git a/.github/workflows/test-build.yml b/.github/workflows/test-build.yml index d72f59339c9..94e6196f01c 100644 --- a/.github/workflows/test-build.yml +++ b/.github/workflows/test-build.yml @@ -75,7 +75,7 @@ jobs: - name: Set up Go 1.x uses: actions/setup-go@v4 with: - go-version: ~1.25 + go-version: ~1.25.7 cache: false - name: Free up disk space diff --git a/.github/workflows/upload-dependencies.yml b/.github/workflows/upload-dependencies.yml index eb4dafb552e..14523c62cf6 100644 --- a/.github/workflows/upload-dependencies.yml +++ b/.github/workflows/upload-dependencies.yml @@ -45,7 +45,7 @@ jobs: - name: Set up Go uses: actions/setup-go@v3 with: - go-version: ~1.25 + go-version: ~1.25.7 - name: Upload Dependencies and Test Repo env: From ca1df31dc036086dbbbc9669c1d9da35e61c1250 Mon Sep 17 00:00:00 2001 From: Marcus Mann Date: Tue, 24 Feb 2026 14:24:30 -0500 Subject: [PATCH 49/50] fix: use main branch for helm-charts in EKS performance workflow (#2035) --- 
.github/workflows/eks-performance-cluster-addon-install.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/eks-performance-cluster-addon-install.yml b/.github/workflows/eks-performance-cluster-addon-install.yml index 37580e1680b..539b4fca860 100644 --- a/.github/workflows/eks-performance-cluster-addon-install.yml +++ b/.github/workflows/eks-performance-cluster-addon-install.yml @@ -177,10 +177,9 @@ jobs: run: | aws eks update-kubeconfig --name $CLUSTER_NAME --region $AWS_REGION - # TODO: Revert to using main helm branch when changes from leader-election are merged in - name: Clone Helm Charts Repository env: - HELM_CHARTS_BRANCH: ${{ inputs.helm-charts-branch || 'sky333999/leader-election' }} + HELM_CHARTS_BRANCH: ${{ inputs.helm-charts-branch || 'main' }} run: | rm -rf ./helm-charts git clone -b "$HELM_CHARTS_BRANCH" https://github.com/aws-observability/helm-charts.git ./helm-charts From 9aabc042b823a6f6cad9aa32db2683482710f05d Mon Sep 17 00:00:00 2001 From: Marcus Mann Date: Thu, 26 Feb 2026 14:17:42 -0500 Subject: [PATCH 50/50] fix(pusher): Address PR review feedback from Jeffrey Chien CRITICAL fixes: - Handle retryHeap.Push() error in sender.Send() when heap is stopped during shutdown. Now calls batch.done() to persist state and resume circuit breaker instead of silently dropping the batch. - Fix Close() ordering: pushers stop before heap to allow in-flight sends to push failed batches. Remove duplicate Stop() calls. 
HIGH priority fixes: - Remove dead maxRetryDuration field from RetryHeapProcessor (batch expiry is handled by batch.expireAfter set in initializeStartTime) - Remove duplicate maxRetryTimeout constant from cloudwatchlogs.go (canonical definition is in batch.go) - Add clarifying comment about circuit breaker in synchronous mode MEDIUM priority fixes: - Add stopMu mutex to RetryHeapProcessor.Stop() for thread safety - Rename TestPoisonPillScenario to TestRetryHeapProcessorDoesNotStarveAllowedTarget (test doesn't exercise full pipeline) - Delete TestRecoveryAfterSystemRestart (doesn't test actual restart) - Delete TestRecoveryWithMultipleTargets (duplicates TestSingleDeniedLogGroup) LOW priority fixes: - Fix TestQueueHaltResume to avoid race condition - Replace stringPtr/int64Ptr helpers with aws.String()/aws.Int64() --- .../outputs/cloudwatchlogs/cloudwatchlogs.go | 32 ++-- .../internal/pusher/poison_pill_test.go | 20 +- .../cloudwatchlogs/internal/pusher/queue.go | 5 +- .../internal/pusher/queue_test.go | 9 +- .../internal/pusher/retryheap.go | 54 ++++-- .../internal/pusher/retryheap_expiry_test.go | 3 +- .../pusher/retryheap_recovery_test.go | 177 +----------------- .../internal/pusher/retryheap_test.go | 12 +- .../cloudwatchlogs/internal/pusher/sender.go | 9 +- .../internal/pusher/state_callback_test.go | 6 +- 10 files changed, 86 insertions(+), 241 deletions(-) diff --git a/plugins/outputs/cloudwatchlogs/cloudwatchlogs.go b/plugins/outputs/cloudwatchlogs/cloudwatchlogs.go index 6283b90f3bd..854c5b515f8 100644 --- a/plugins/outputs/cloudwatchlogs/cloudwatchlogs.go +++ b/plugins/outputs/cloudwatchlogs/cloudwatchlogs.go @@ -37,8 +37,6 @@ const ( LogEntryField = "value" defaultFlushTimeout = 5 * time.Second - - maxRetryTimeout = 14*24*time.Hour + 10*time.Minute ) var ( @@ -89,16 +87,12 @@ func (c *CloudWatchLogs) Connect() error { } func (c *CloudWatchLogs) Close() error { - // Stop components in specific order to prevent race conditions: - // 1. 
RetryHeap - stop accepting new batches first - // 2. Pushers - stop all active pushers (queues/senders) - // 3. Wait for pushers to complete - // 4. RetryHeapProcessor - stop retry processing and wait for WorkerPool usage to complete - // 5. WorkerPool - finally stop the worker threads - - if c.retryHeap != nil { - c.retryHeap.Stop() - } + // Shutdown order: + // 1. Stop all pushers (queues stop accepting new events, final send) + // 2. Wait for pushers to complete (in-flight sends finish, failed batches pushed to heap) + // 3. Stop RetryHeap (no more pushes accepted after this point) + // 4. Stop RetryHeapProcessor (flush remaining ready batches, stop goroutine) + // 5. Stop WorkerPool (drain worker threads) c.cwDests.Range(func(_, value interface{}) bool { if d, ok := value.(*cwDest); ok { @@ -109,20 +103,16 @@ func (c *CloudWatchLogs) Close() error { c.pusherWaitGroup.Wait() - if c.retryHeapProcessor != nil { - c.retryHeapProcessor.Stop() - } - - if c.workerPool != nil { - c.workerPool.Stop() + if c.retryHeap != nil { + c.retryHeap.Stop() } if c.retryHeapProcessor != nil { c.retryHeapProcessor.Stop() } - if c.retryHeap != nil { - c.retryHeap.Stop() + if c.workerPool != nil { + c.workerPool.Stop() } return nil @@ -178,7 +168,7 @@ func (c *CloudWatchLogs) getDest(t pusher.Target, logSrc logs.LogSrc) *cwDest { retryHeapProcessorRetryer := retryer.NewLogThrottleRetryer(c.Log) retryHeapProcessorClient := c.createClient(retryHeapProcessorRetryer) - c.retryHeapProcessor = pusher.NewRetryHeapProcessor(c.retryHeap, c.workerPool, retryHeapProcessorClient, c.targetManager, c.Log, maxRetryTimeout, retryHeapProcessorRetryer) + c.retryHeapProcessor = pusher.NewRetryHeapProcessor(c.retryHeap, c.workerPool, retryHeapProcessorClient, c.targetManager, c.Log, retryHeapProcessorRetryer) c.retryHeapProcessor.Start() } c.targetManager = pusher.NewTargetManager(c.Log, client) diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/poison_pill_test.go 
b/plugins/outputs/cloudwatchlogs/internal/pusher/poison_pill_test.go index 3500173e0af..5a322c7caac 100644 --- a/plugins/outputs/cloudwatchlogs/internal/pusher/poison_pill_test.go +++ b/plugins/outputs/cloudwatchlogs/internal/pusher/poison_pill_test.go @@ -9,6 +9,7 @@ import ( "testing" "time" + "github.com/aws/aws-sdk-go/aws" "github.com/influxdata/telegraf/testutil" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/mock" @@ -17,10 +18,13 @@ import ( "github.com/aws/amazon-cloudwatch-agent/sdk/service/cloudwatchlogs" ) -// TestPoisonPillScenario validates that when 10 denied + 1 allowed log groups +// TestRetryHeapProcessorDoesNotStarveAllowedTarget validates that when 10 denied + 1 allowed log groups // share a worker pool with concurrency=2, the allowed log group continues // publishing without being starved by failed retries. -func TestPoisonPillScenario(t *testing.T) { +// Note: This test pushes batches directly to the heap and bypasses the full +// queue → sender → retryHeap → processor pipeline. It validates RetryHeapProcessor +// behavior, not the end-to-end circuit breaker flow. 
+func TestRetryHeapProcessorDoesNotStarveAllowedTarget(t *testing.T) { heap := NewRetryHeap(&testutil.Logger{}) defer heap.Stop() @@ -32,7 +36,7 @@ func TestPoisonPillScenario(t *testing.T) { mockTargetManager.On("EnsureTargetExists", mock.Anything).Return(nil) accessDeniedErr := &cloudwatchlogs.AccessDeniedException{ - Message_: stringPtr("User is not authorized to perform: logs:PutLogEvents with an explicit deny"), + Message_: aws.String("User is not authorized to perform: logs:PutLogEvents with an explicit deny"), } // Track successful PutLogEvents calls for the allowed log group @@ -54,7 +58,7 @@ func TestPoisonPillScenario(t *testing.T) { deniedGroupAttemptCount.Add(1) }) - processor := NewRetryHeapProcessor(heap, workerPool, mockService, mockTargetManager, &testutil.Logger{}, 100*time.Millisecond, retryer.NewLogThrottleRetryer(&testutil.Logger{})) + processor := NewRetryHeapProcessor(heap, workerPool, mockService, mockTargetManager, &testutil.Logger{}, retryer.NewLogThrottleRetryer(&testutil.Logger{})) // Targets allowedTarget := Target{Group: "log-stream-ple-access-granted", Stream: "i-test"} @@ -170,7 +174,7 @@ func TestSingleDeniedLogGroup(t *testing.T) { mockTargetManager.On("EnsureTargetExists", mock.Anything).Return(nil) accessDeniedErr := &cloudwatchlogs.AccessDeniedException{ - Message_: stringPtr("Access denied"), + Message_: aws.String("Access denied"), } var allowedGroupSuccessCount atomic.Int32 @@ -185,7 +189,7 @@ func TestSingleDeniedLogGroup(t *testing.T) { return *input.LogGroupName == "aws-restricted-log-group-name-log-stream-ple-access-denied" })).Return((*cloudwatchlogs.PutLogEventsOutput)(nil), accessDeniedErr) - processor := NewRetryHeapProcessor(heap, workerPool, mockService, mockTargetManager, &testutil.Logger{}, time.Hour, retryer.NewLogThrottleRetryer(&testutil.Logger{})) + processor := NewRetryHeapProcessor(heap, workerPool, mockService, mockTargetManager, &testutil.Logger{}, retryer.NewLogThrottleRetryer(&testutil.Logger{})) // 
Create batches allowedTarget := Target{Group: "log-stream-ple-access-granted", Stream: "i-test"} @@ -217,8 +221,8 @@ func createBatch(target Target, eventCount int) *logEventBatch { now := time.Now().Unix() * 1000 for i := 0; i < eventCount; i++ { batch.events[i] = &cloudwatchlogs.InputLogEvent{ - Message: stringPtr("test message"), - Timestamp: int64Ptr(now + int64(i)), + Message: aws.String("test message"), + Timestamp: aws.Int64(now + int64(i)), } } return batch diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/queue.go b/plugins/outputs/cloudwatchlogs/internal/pusher/queue.go index 86f1bd6e4c2..8899554df93 100644 --- a/plugins/outputs/cloudwatchlogs/internal/pusher/queue.go +++ b/plugins/outputs/cloudwatchlogs/internal/pusher/queue.go @@ -184,7 +184,10 @@ func (q *queue) send() { q.batch.addDoneCallback(q.onSuccessCallback(q.batch.bufferedSize)) q.batch.addFailCallback(q.halt) - // Wait if halted (circuit breaker) + // In synchronous mode (no retryHeap), halt() is never called because + // sender only calls batch.fail() when retryHeap != nil. So waitIfHalted + // is a no-op. The lock acquisition is negligible overhead (~20ns) on + // the uncontended path. 
q.waitIfHalted() q.sender.Send(q.batch) diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/queue_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/queue_test.go index 293218e185a..f2bd145fc0c 100644 --- a/plugins/outputs/cloudwatchlogs/internal/pusher/queue_test.go +++ b/plugins/outputs/cloudwatchlogs/internal/pusher/queue_test.go @@ -797,7 +797,7 @@ func TestQueueHaltResume(t *testing.T) { mockSender.On("Send", mock.Anything).Run(func(args mock.Arguments) { sendCount.Add(1) batch := args.Get(0).(*logEventBatch) - // Simulate failure on first call, success on second + // Simulate failure on first call, success on subsequent calls if sendCount.Load() == 1 { batch.fail() // This should halt the queue } else { @@ -821,9 +821,6 @@ func TestQueueHaltResume(t *testing.T) { assert.True(t, queueImpl.halted, "Queue should be halted after failure") queueImpl.haltMu.Unlock() - // Add second event - should be queued but not sent due to halt - q.AddEvent(newStubLogEvent("second message", time.Now())) - // Verify only one send happened (queue is halted) assert.Equal(t, int32(1), sendCount.Load(), "Should have only one send due to halt") @@ -835,8 +832,8 @@ func TestQueueHaltResume(t *testing.T) { assert.False(t, queueImpl.halted, "Queue should be resumed after success") queueImpl.haltMu.Unlock() - // Add third event - should trigger send since queue is resumed - q.AddEvent(newStubLogEvent("third message", time.Now())) + // Add second event - should trigger send since queue is resumed + q.AddEvent(newStubLogEvent("second message", time.Now())) // Wait for the second send to complete time.Sleep(50 * time.Millisecond) diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap.go b/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap.go index 258c2795e23..a4c708ad6fe 100644 --- a/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap.go +++ b/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap.go @@ -120,31 +120,30 @@ func (rh *retryHeap) Stop() 
{ // RetryHeapProcessor manages the retry heap and moves ready batches back to sender queue type RetryHeapProcessor struct { - retryHeap RetryHeap - senderPool Sender - retryer *retryer.LogThrottleRetryer - stopCh chan struct{} - logger telegraf.Logger - stopped bool - maxRetryDuration time.Duration - wg sync.WaitGroup + retryHeap RetryHeap + senderPool Sender + retryer *retryer.LogThrottleRetryer + stopCh chan struct{} + logger telegraf.Logger + stopped bool + stopMu sync.Mutex + wg sync.WaitGroup } // NewRetryHeapProcessor creates a new retry heap processor -func NewRetryHeapProcessor(retryHeap RetryHeap, workerPool WorkerPool, service cloudWatchLogsService, targetManager TargetManager, logger telegraf.Logger, maxRetryDuration time.Duration, retryer *retryer.LogThrottleRetryer) *RetryHeapProcessor { +func NewRetryHeapProcessor(retryHeap RetryHeap, workerPool WorkerPool, service cloudWatchLogsService, targetManager TargetManager, logger telegraf.Logger, retryer *retryer.LogThrottleRetryer) *RetryHeapProcessor { // Create processor's own sender and senderPool // Pass retryHeap so failed batches go back to RetryHeap instead of blocking on sync retry sender := newSender(logger, service, targetManager, retryHeap) senderPool := newSenderPool(workerPool, sender) return &RetryHeapProcessor{ - retryHeap: retryHeap, - senderPool: senderPool, - retryer: retryer, - stopCh: make(chan struct{}), - logger: logger, - stopped: false, - maxRetryDuration: maxRetryDuration, + retryHeap: retryHeap, + senderPool: senderPool, + retryer: retryer, + stopCh: make(chan struct{}), + logger: logger, + stopped: false, } } @@ -156,18 +155,24 @@ func (p *RetryHeapProcessor) Start() { // Stop stops the retry heap processor func (p *RetryHeapProcessor) Stop() { + p.stopMu.Lock() + defer p.stopMu.Unlock() + if p.stopped { return } - // Process any remaining batches before stopping - p.processReadyMessages() + // Flush remaining ready batches before marking as stopped + p.flushReadyBatches() + + 
p.stopped = true - p.retryer.Stop() + if p.retryer != nil { + p.retryer.Stop() + } p.senderPool.Stop() close(p.stopCh) p.wg.Wait() - p.stopped = true } // processLoop runs the main processing loop @@ -188,10 +193,19 @@ func (p *RetryHeapProcessor) processLoop() { // processReadyMessages checks the heap for ready batches and moves them back to sender queue func (p *RetryHeapProcessor) processReadyMessages() { + p.stopMu.Lock() if p.stopped { + p.stopMu.Unlock() return } + p.stopMu.Unlock() + + p.flushReadyBatches() +} +// flushReadyBatches pops ready batches from the heap and sends them. +// Called by both processReadyMessages and Stop. +func (p *RetryHeapProcessor) flushReadyBatches() { readyBatches := p.retryHeap.PopReady() for _, batch := range readyBatches { diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_expiry_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_expiry_test.go index cdbbd56838f..64bfd588e98 100644 --- a/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_expiry_test.go +++ b/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_expiry_test.go @@ -43,9 +43,8 @@ func TestRetryHeapProcessorExpiredBatchShouldResume(t *testing.T) { retryHeap := NewRetryHeap(logger) workerPool := NewWorkerPool(5) tm := NewTargetManager(logger, mockService) - maxRetryDuration := 50 * time.Millisecond // Normally 14 days - retryHeapProcessor := NewRetryHeapProcessor(retryHeap, workerPool, mockService, tm, logger, maxRetryDuration, nil) + retryHeapProcessor := NewRetryHeapProcessor(retryHeap, workerPool, mockService, tm, logger, nil) retryHeapProcessor.Start() defer retryHeap.Stop() diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_recovery_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_recovery_test.go index 747272d1bed..7dfe1020c0d 100644 --- a/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_recovery_test.go +++ 
b/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_recovery_test.go @@ -4,11 +4,11 @@ package pusher import ( - "errors" "sync" "testing" "time" + "github.com/aws/aws-sdk-go/aws" "github.com/influxdata/telegraf/testutil" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/mock" @@ -30,7 +30,7 @@ func TestRecoveryWhenPermissionGrantedDuringRetry(t *testing.T) { // Mock service that initially returns AccessDenied, then succeeds mockService := &mockLogsService{} accessDeniedErr := &cloudwatchlogs.AccessDeniedException{ - Message_: stringPtr("Access denied"), + Message_: aws.String("Access denied"), } // First call fails with AccessDenied @@ -41,13 +41,13 @@ func TestRecoveryWhenPermissionGrantedDuringRetry(t *testing.T) { mockTargetManager := &mockTargetManager{} mockTargetManager.On("EnsureTargetExists", mock.Anything).Return(nil) - processor := NewRetryHeapProcessor(heap, workerPool, mockService, mockTargetManager, &testutil.Logger{}, time.Hour, retryer.NewLogThrottleRetryer(&testutil.Logger{})) + processor := NewRetryHeapProcessor(heap, workerPool, mockService, mockTargetManager, &testutil.Logger{}, retryer.NewLogThrottleRetryer(&testutil.Logger{})) // Create batch and track circuit breaker state target := Target{Group: "group", Stream: "stream"} batch := newLogEventBatch(target, nil) batch.events = []*cloudwatchlogs.InputLogEvent{ - {Message: stringPtr("test message"), Timestamp: int64Ptr(time.Now().Unix() * 1000)}, + {Message: aws.String("test message"), Timestamp: aws.Int64(time.Now().Unix() * 1000)}, } var haltCalled, resumeCalled bool @@ -108,172 +108,3 @@ func TestRecoveryWhenPermissionGrantedDuringRetry(t *testing.T) { // Verify both PutLogEvents calls were made mockService.AssertExpectations(t) } - -// TestRecoveryAfterSystemRestart validates that when the system restarts with -// retry ongoing, it resumes correctly by loading state and continuing retries. 
-func TestRecoveryAfterSystemRestart(t *testing.T) { - heap := NewRetryHeap(&testutil.Logger{}) - defer heap.Stop() - - workerPool := NewWorkerPool(2) - defer workerPool.Stop() - - mockService := &mockLogsService{} - mockTargetManager := &mockTargetManager{} - mockTargetManager.On("EnsureTargetExists", mock.Anything).Return(nil) - - // Simulate system restart scenario: - // 1. Initial failure puts batch in retry state - // 2. System "restarts" (new processor instance) - // 3. Batch is reloaded with retry metadata intact - // 4. Retry succeeds - - target := Target{Group: "group", Stream: "stream"} - batch := newLogEventBatch(target, nil) - batch.events = []*cloudwatchlogs.InputLogEvent{ - {Message: stringPtr("test message"), Timestamp: int64Ptr(time.Now().Unix() * 1000)}, - } - - // Simulate batch that was in retry state before restart - batch.retryCountShort = 2 - batch.startTime = time.Now().Add(-5 * time.Minute) - batch.nextRetryTime = time.Now().Add(-1 * time.Second) // Ready for retry - batch.lastError = errors.New("previous error before restart") - - var resumeCalled bool - var mu sync.Mutex - - batch.addDoneCallback(func() { - mu.Lock() - resumeCalled = true - mu.Unlock() - }) - - // Mock successful retry after restart - mockService.On("PutLogEvents", mock.Anything).Return(&cloudwatchlogs.PutLogEventsOutput{}, nil).Once() - - // Create new processor (simulating restart) - processor := NewRetryHeapProcessor(heap, workerPool, mockService, mockTargetManager, &testutil.Logger{}, time.Hour, retryer.NewLogThrottleRetryer(&testutil.Logger{})) - - // Push batch with existing retry metadata - err := heap.Push(batch) - assert.NoError(t, err) - - // Process should succeed - processor.processReadyMessages() - - // Wait for async processing to complete - time.Sleep(100 * time.Millisecond) - - // Verify circuit breaker resumed - mu.Lock() - assert.True(t, resumeCalled, "Circuit breaker should resume after successful retry post-restart") - mu.Unlock() - - // Heap should be 
empty - assert.Equal(t, 0, heap.Size(), "Heap should be empty after successful retry") - - // Verify retry metadata was preserved - assert.Equal(t, 2, batch.retryCountShort, "Retry count should be preserved across restart") - assert.False(t, batch.startTime.IsZero(), "Start time should be preserved across restart") - - mockService.AssertExpectations(t) -} - -// TestRecoveryWithMultipleTargets validates that when one target has permission -// issues, other healthy targets continue publishing successfully. -func TestRecoveryWithMultipleTargets(t *testing.T) { - heap := NewRetryHeap(&testutil.Logger{}) - defer heap.Stop() - - workerPool := NewWorkerPool(2) - defer workerPool.Stop() - - mockService := &mockLogsService{} - mockTargetManager := &mockTargetManager{} - mockTargetManager.On("EnsureTargetExists", mock.Anything).Return(nil) - - processor := NewRetryHeapProcessor(heap, workerPool, mockService, mockTargetManager, &testutil.Logger{}, time.Hour, retryer.NewLogThrottleRetryer(&testutil.Logger{})) - - // Create two targets - target1 := Target{Group: "group1", Stream: "stream1"} - target2 := Target{Group: "group2", Stream: "stream2"} - - batch1 := newLogEventBatch(target1, nil) - batch1.events = []*cloudwatchlogs.InputLogEvent{ - {Message: stringPtr("message1"), Timestamp: int64Ptr(time.Now().Unix() * 1000)}, - } - batch1.nextRetryTime = time.Now().Add(-1 * time.Second) - - batch2 := newLogEventBatch(target2, nil) - batch2.events = []*cloudwatchlogs.InputLogEvent{ - {Message: stringPtr("message2"), Timestamp: int64Ptr(time.Now().Unix() * 1000)}, - } - batch2.nextRetryTime = time.Now().Add(-1 * time.Second) - - var halt1Called, resume1Called, resume2Called bool - var mu sync.Mutex - - // Target 1 fails with AccessDenied - batch1.addFailCallback(func() { - mu.Lock() - halt1Called = true - mu.Unlock() - }) - batch1.addDoneCallback(func() { - mu.Lock() - resume1Called = true - mu.Unlock() - }) - - // Target 2 succeeds - batch2.addDoneCallback(func() { - mu.Lock() - 
resume2Called = true - mu.Unlock() - }) - - // Mock responses: target1 fails, target2 succeeds - accessDeniedErr := &cloudwatchlogs.AccessDeniedException{ - Message_: stringPtr("Access denied"), - } - mockService.On("PutLogEvents", mock.MatchedBy(func(req *cloudwatchlogs.PutLogEventsInput) bool { - return *req.LogGroupName == "group1" - })).Return((*cloudwatchlogs.PutLogEventsOutput)(nil), accessDeniedErr).Once() - - mockService.On("PutLogEvents", mock.MatchedBy(func(req *cloudwatchlogs.PutLogEventsInput) bool { - return *req.LogGroupName == "group2" - })).Return(&cloudwatchlogs.PutLogEventsOutput{}, nil).Once() - - // Push both batches - err := heap.Push(batch1) - assert.NoError(t, err) - err = heap.Push(batch2) - assert.NoError(t, err) - - // Process both batches - processor.processReadyMessages() - - // Wait for async processing to complete - time.Sleep(100 * time.Millisecond) - - // Verify target1 circuit breaker halted, target2 succeeded - mu.Lock() - assert.True(t, halt1Called, "Target1 circuit breaker should halt") - assert.False(t, resume1Called, "Target1 circuit breaker should not resume") - assert.True(t, resume2Called, "Target2 should succeed and resume") - mu.Unlock() - - // Target1 should be back in heap, target2 should be done - assert.Equal(t, 1, heap.Size(), "Only failed target should remain in heap") - - mockService.AssertExpectations(t) -} - -func stringPtr(s string) *string { - return &s -} - -func int64Ptr(i int64) *int64 { - return &i -} diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_test.go index d86888856bf..d79e388e071 100644 --- a/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_test.go +++ b/plugins/outputs/cloudwatchlogs/internal/pusher/retryheap_test.go @@ -90,7 +90,7 @@ func TestRetryHeapProcessor(t *testing.T) { mockService := &mockLogsService{} mockTargetManager := &mockTargetManager{} - processor := NewRetryHeapProcessor(heap, 
workerPool, mockService, mockTargetManager, &testutil.Logger{}, time.Hour, retryer.NewLogThrottleRetryer(&testutil.Logger{})) + processor := NewRetryHeapProcessor(heap, workerPool, mockService, mockTargetManager, &testutil.Logger{}, retryer.NewLogThrottleRetryer(&testutil.Logger{})) defer processor.Stop() // Test start/stop @@ -108,7 +108,7 @@ func TestRetryHeapProcessorExpiredBatch(t *testing.T) { mockService := &mockLogsService{} mockTargetManager := &mockTargetManager{} - processor := NewRetryHeapProcessor(heap, workerPool, mockService, mockTargetManager, &testutil.Logger{}, 1*time.Millisecond, retryer.NewLogThrottleRetryer(&testutil.Logger{})) + processor := NewRetryHeapProcessor(heap, workerPool, mockService, mockTargetManager, &testutil.Logger{}, retryer.NewLogThrottleRetryer(&testutil.Logger{})) target := Target{Group: "group", Stream: "stream"} batch := newLogEventBatch(target, nil) @@ -138,7 +138,7 @@ func TestRetryHeapProcessorSendsBatch(t *testing.T) { mockTargetManager := &mockTargetManager{} mockTargetManager.On("EnsureTargetExists", mock.Anything).Return(nil) - processor := NewRetryHeapProcessor(heap, workerPool, mockService, mockTargetManager, &testutil.Logger{}, time.Hour, retryer.NewLogThrottleRetryer(&testutil.Logger{})) + processor := NewRetryHeapProcessor(heap, workerPool, mockService, mockTargetManager, &testutil.Logger{}, retryer.NewLogThrottleRetryer(&testutil.Logger{})) target := Target{Group: "group", Stream: "stream"} batch := newLogEventBatch(target, nil) @@ -210,7 +210,7 @@ func TestRetryHeapProcessorNoReadyBatches(t *testing.T) { mockService := &mockLogsService{} mockTargetManager := &mockTargetManager{} - processor := NewRetryHeapProcessor(heap, workerPool, mockService, mockTargetManager, &testutil.Logger{}, time.Hour, retryer.NewLogThrottleRetryer(&testutil.Logger{})) + processor := NewRetryHeapProcessor(heap, workerPool, mockService, mockTargetManager, &testutil.Logger{}, retryer.NewLogThrottleRetryer(&testutil.Logger{})) // Process 
with empty heap - should not panic processor.processReadyMessages() @@ -232,7 +232,7 @@ func TestRetryHeapProcessorFailedBatchGoesBackToHeap(t *testing.T) { mockTargetManager := &mockTargetManager{} mockTargetManager.On("InitTarget", mock.Anything).Return(nil) - processor := NewRetryHeapProcessor(heap, workerPool, mockService, mockTargetManager, &testutil.Logger{}, time.Hour, retryer.NewLogThrottleRetryer(&testutil.Logger{})) + processor := NewRetryHeapProcessor(heap, workerPool, mockService, mockTargetManager, &testutil.Logger{}, retryer.NewLogThrottleRetryer(&testutil.Logger{})) processor.Start() defer processor.Stop() @@ -284,7 +284,7 @@ func TestRetryHeapProcessorStoppedProcessReadyMessages(t *testing.T) { mockService := &mockLogsService{} mockTargetManager := &mockTargetManager{} - processor := NewRetryHeapProcessor(heap, workerPool, mockService, mockTargetManager, &testutil.Logger{}, time.Hour, retryer.NewLogThrottleRetryer(&testutil.Logger{})) + processor := NewRetryHeapProcessor(heap, workerPool, mockService, mockTargetManager, &testutil.Logger{}, retryer.NewLogThrottleRetryer(&testutil.Logger{})) // Add a ready batch to the heap target := Target{Group: "group", Stream: "stream"} diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/sender.go b/plugins/outputs/cloudwatchlogs/internal/pusher/sender.go index 6a34be1e43e..902bb166f7c 100644 --- a/plugins/outputs/cloudwatchlogs/internal/pusher/sender.go +++ b/plugins/outputs/cloudwatchlogs/internal/pusher/sender.go @@ -121,7 +121,14 @@ func (s *sender) Send(batch *logEventBatch) { // If RetryHeap available, push to RetryHeap and return // Otherwise, continue with existing busy-wait retry behavior if s.retryHeap != nil { - s.retryHeap.Push(batch) + if err := s.retryHeap.Push(batch); err != nil { + // Heap is stopped (shutdown in progress). Persist file offsets + // so these events aren't re-read on restart, then notify the + // circuit breaker so the queue isn't permanently halted. 
+ s.logger.Warnf("RetryHeap stopped, dropping batch for %v/%v: %v", batch.Group, batch.Stream, err) + batch.done() + return + } batch.fail() return } diff --git a/plugins/outputs/cloudwatchlogs/internal/pusher/state_callback_test.go b/plugins/outputs/cloudwatchlogs/internal/pusher/state_callback_test.go index 8190e3a40b5..5450e52e808 100644 --- a/plugins/outputs/cloudwatchlogs/internal/pusher/state_callback_test.go +++ b/plugins/outputs/cloudwatchlogs/internal/pusher/state_callback_test.go @@ -72,7 +72,7 @@ func TestRetryHeapSuccessCallsStateCallback(t *testing.T) { defer retryHeap.Stop() defer workerPool.Stop() - processor := NewRetryHeapProcessor(retryHeap, workerPool, service, tm, logger, time.Hour, retryer.NewLogThrottleRetryer(logger)) + processor := NewRetryHeapProcessor(retryHeap, workerPool, service, tm, logger, retryer.NewLogThrottleRetryer(logger)) batch := newStatefulBatch(target, queue) batch.nextRetryTime = time.Now().Add(-1 * time.Second) @@ -119,7 +119,7 @@ func TestRetryHeapExpiryCallsStateCallback(t *testing.T) { defer retryHeap.Stop() defer workerPool.Stop() - processor := NewRetryHeapProcessor(retryHeap, workerPool, service, tm, logger, 50*time.Millisecond, nil) + processor := NewRetryHeapProcessor(retryHeap, workerPool, service, tm, logger, nil) batch := newStatefulBatch(target, queue) batch.initializeStartTime() @@ -166,7 +166,7 @@ func TestShutdownDoesNotCallStateCallback(t *testing.T) { } tm := NewTargetManager(logger, service) - processor := NewRetryHeapProcessor(retryHeap, workerPool, service, tm, logger, time.Hour, nil) + processor := NewRetryHeapProcessor(retryHeap, workerPool, service, tm, logger, nil) processor.Start() // Push a batch with a future retry time so it won't be processed before Stop