From c3e3fe086c4be3b97190f6fdd5b095825860fea2 Mon Sep 17 00:00:00 2001
From: ehila <ehila@redhat.com>
Date: Wed, 6 May 2026 15:34:09 -0400
Subject: [PATCH 1/3] monitortests: allow etcd CO blips during TNF jobs on
 two-node upgrades

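Teach legacy CVO monitor tests to treat cluster-etcd-operator condition
reasons shaped as tnf-*_JobRunning as expected on DualReplica and
HighlyAvailableArbiter topologies: Available=False during upgrade, and
Progressing=True while machine-config is progressing. On the same
topologies, also allow openshift-apiserver Progressing=True with reason
OperatorConfig_NewGeneration while machine-config is progressing.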

Add the isTNFJobClusterOperatorReason helper and unit tests so that we
only match CEO tnf-*_JobRunning reasons, not unrelated etcd failures.

Co-authored-by: Cursor Composer <noreply@cursor.com>
Signed-off-by: ehila <ehila@redhat.com>
---
 .../legacycvomonitortests/monitortest.go      |  2 +-
 .../legacycvomonitortests/operators.go        | 33 ++++++++++++++++++-
 .../legacycvomonitortests/operators_test.go   | 23 +++++++++++++
 3 files changed, 56 insertions(+), 2 deletions(-)
diff --git a/pkg/monitortests/clusterversionoperator/legacycvomonitortests/monitortest.go b/pkg/monitortests/clusterversionoperator/legacycvomonitortests/monitortest.go
index 5e0fad828ca0..63d17ff98c0a 100644
--- a/pkg/monitortests/clusterversionoperator/legacycvomonitortests/monitortest.go
+++ b/pkg/monitortests/clusterversionoperator/legacycvomonitortests/monitortest.go
@@ -96,7 +96,7 @@ func (w *legacyMonitorTests) EvaluateTestsFromConstructedIntervals(ctx context.C
 		if err != nil || level == unknownUpgradeLevel {
 			return nil, fmt.Errorf("failed to determine upgrade level: %w", err)
 		}
-		junits = append(junits, testUpgradeOperatorProgressingStateTransitions(finalIntervals, level == patchUpgradeLevel)...)
+		junits = append(junits, testUpgradeOperatorProgressingStateTransitions(finalIntervals, level == patchUpgradeLevel, w.adminRESTConfig)...)
 	} else {
 		junits = append(junits, testStableSystemOperatorStateTransitions(finalIntervals, w.adminRESTConfig)...)
 	}
diff --git a/pkg/monitortests/clusterversionoperator/legacycvomonitortests/operators.go b/pkg/monitortests/clusterversionoperator/legacycvomonitortests/operators.go
index 54a5bb2ae696..5a49831171dd 100644
--- a/pkg/monitortests/clusterversionoperator/legacycvomonitortests/operators.go
+++ b/pkg/monitortests/clusterversionoperator/legacycvomonitortests/operators.go
@@ -162,6 +162,14 @@ func getControlPlaneTopology(clientConfig *rest.Config) (configv1.TopologyMode,
 	return *topo, nil
 }
 
+// isTNFJobClusterOperatorReason reports whether reason starts with "tnf-" and ends with
+// "_JobRunning". The cluster-etcd-operator maps active two-node fencing (TNF) batch Jobs in
+// openshift-etcd into etcd's ClusterOperator condition reasons shaped like tnf-<workflow>_JobRunning;
+// some Jobs carry a per-job hash suffix, e.g. tnf-auth-job-master-0-64736551_JobRunning.
+func isTNFJobClusterOperatorReason(reason string) bool {
+	return strings.HasPrefix(reason, "tnf-") && strings.HasSuffix(reason, "_JobRunning")
+}
+
 // isInUpgradeWindow determines if the given eventInterval falls within an upgrade window.
 // UpgradeStart and UpgradeRollback events start upgrade windows and can end and already started upgrade window.
 // UpgradeComplete and UpgradeFailed events end upgrade windows; if there was not an already started upgrade window,
@@ -290,6 +298,11 @@ func testUpgradeOperatorStateTransitions(events monitorapi.Intervals, clientConf
 					strings.Contains(condition.Message, `Waiting for Deployment`) {
 					return "csi snapshot controller is allowed to have Available=False due to CSI webhook test on two node"
 				}
+			case "etcd":
+				if condition.Type == configv1.OperatorAvailable && condition.Status == configv1.ConditionFalse &&
+					isTNFJobClusterOperatorReason(condition.Reason) {
+					return "clusteroperator/etcd may report Available=False while a TNF batch Job is running on dual-replica topology (CEO JobRunning condition reasons)"
+				}
 			}
 		}
 
@@ -600,11 +613,21 @@ func testOperatorStateTransitions(events monitorapi.Intervals, conditionTypes []
 	return ret
 }
 
-func testUpgradeOperatorProgressingStateTransitions(events monitorapi.Intervals, isPatchLevelUpgrade bool) []*junitapi.JUnitTestCase {
+func testUpgradeOperatorProgressingStateTransitions(events monitorapi.Intervals, isPatchLevelUpgrade bool, clientConfig *rest.Config) []*junitapi.JUnitTestCase {
 	var ret []*junitapi.JUnitTestCase
 	upgradeWindows := getUpgradeWindows(events)
 	multiUpgrades := platformidentification.UpgradeNumberDuringCollection(events, time.Time{}, time.Time{}) > 1
 
+	isTwoNode := false
+	if clientConfig != nil {
+		topology, err := getControlPlaneTopology(clientConfig)
+		if err != nil {
+			logrus.Warnf("Error checking for ControlPlaneTopology configuration for MCO co-progressing monitor (unable to apply two-node TNF exceptions): %v", err)
+		} else {
+			isTwoNode = topology == configv1.HighlyAvailableArbiterMode || topology == configv1.DualReplicaTopologyMode
+		}
+	}
+
 	var machineConfigProgressingStart time.Time
 	var eventsInUpgradeWindows monitorapi.Intervals
 
@@ -711,6 +734,10 @@ func testUpgradeOperatorProgressingStateTransitions(events monitorapi.Intervals,
 
 	except = func(co string, reason string) string {
 		switch co {
+		case "etcd":
+			if isTwoNode && isTNFJobClusterOperatorReason(reason) {
+				return "clusteroperator/etcd may report Progressing=True while a TNF batch Job is running during DualReplica topology upgrades (CEO JobRunning condition reasons)"
+			}
 		case "console":
 			if reason == "SyncLoopRefresh_InProgress" {
 				return "https://issues.redhat.com/browse/OCPBUGS-64688"
@@ -755,6 +782,10 @@ func testUpgradeOperatorProgressingStateTransitions(events monitorapi.Intervals,
 			if reason == "" {
 				return "https://issues.redhat.com/browse/OCPBUGS-63672"
 			}
+		case "openshift-apiserver":
+			if isTwoNode && reason == "OperatorConfig_NewGeneration" {
+				return "openshift-apiserver operator may reconcile openshiftapiserveroperatorconfigs (OperatorConfig_NewGeneration) during DualReplica upgrades while machine-config is progressing"
+			}
 		}
 		return ""
 	}
diff --git a/pkg/monitortests/clusterversionoperator/legacycvomonitortests/operators_test.go b/pkg/monitortests/clusterversionoperator/legacycvomonitortests/operators_test.go
index 017b8e966d6c..c6185d4a1751 100644
--- a/pkg/monitortests/clusterversionoperator/legacycvomonitortests/operators_test.go
+++ b/pkg/monitortests/clusterversionoperator/legacycvomonitortests/operators_test.go
@@ -370,3 +370,26 @@ func Test_patchUpgradeWithConfigClient(t *testing.T) {
 		})
 	}
 }
+
+func TestIsTNFJobClusterOperatorReason(t *testing.T) {
+	tests := []struct {
+		reason string
+		want   bool
+	}{
+		{"tnf-setup-job_JobRunning", true},
+		{"tnf-fencing-job_JobRunning", true},
+		{"tnf-auth-job-master-0-64736551_JobRunning", true},
+		{"tnf-update-setup-job-master-1-abc12345_JobRunning", true},
+		{"tnf-after-setup-job-master-0-deadbeef_JobRunning", true},
+		{"EtcdMembersProgressing", false},
+		{"NodeInstaller_InstallerPodRunning", false},
+		{"tnf-setup-job_JobComplete", false},
+		{"setup-job_JobRunning", false},
+		{"", false},
+	}
+	for _, tt := range tests {
+		t.Run(tt.reason, func(t *testing.T) {
+			assert.Equal(t, tt.want, isTNFJobClusterOperatorReason(tt.reason))
+		})
+	}
+}

From 5894664727fa84efc819f007bccf824bcc9e193d Mon Sep 17 00:00:00 2001
From: ehila <ehila@redhat.com>
Date: Thu, 7 May 2026 01:19:29 -0400
Subject: [PATCH 2/3] monitortests: broaden dual-replica CVO exceptions for TNF
 upgrade follow-ups

Allow etcd Progressing=True while machine-config is progressing when CEO
reports EtcdMembers_MembersNotStarted (member still joining during
fencing/replacement), in addition to the existing tnf-*_JobRunning reasons.

On dual-replica, tolerate openshift-apiserver Available=False blips with
reason APIServices_PreconditionNotReady during upgrade, and Progressing=True
for authentication with reason APIServerDeployment_NewGeneration while
machine-config is progressing, as operators roll the control plane
alongside machine-config. The matching openshift-apiserver exception for
OperatorConfig_NewGeneration was already added in the previous patch.

Co-authored-by: Cursor Composer <noreply@cursor.com>
Signed-off-by: ehila <ehila@redhat.com>
---
 .../legacycvomonitortests/operators.go        | 25 ++++++++++++++-----
 1 file changed, 19 insertions(+), 6 deletions(-)

diff --git a/pkg/monitortests/clusterversionoperator/legacycvomonitortests/operators.go b/pkg/monitortests/clusterversionoperator/legacycvomonitortests/operators.go
index 5a49831171dd..76fb4839408e 100644
--- a/pkg/monitortests/clusterversionoperator/legacycvomonitortests/operators.go
+++ b/pkg/monitortests/clusterversionoperator/legacycvomonitortests/operators.go
@@ -379,13 +379,17 @@ func testUpgradeOperatorStateTransitions(events monitorapi.Intervals, clientConf
 				return "https://issues.redhat.com/browse/OCPBUGS-62517"
 			}
 		case "openshift-apiserver":
-			if condition.Type == configv1.OperatorAvailable && condition.Status == configv1.ConditionFalse &&
-				(condition.Reason == "APIServerDeployment_NoDeployment" ||
+			if condition.Type == configv1.OperatorAvailable && condition.Status == configv1.ConditionFalse {
+				if isTwoNode && condition.Reason == "APIServices_PreconditionNotReady" {
+					return "openshift-apiserver may briefly report Available=False with APIServices_PreconditionNotReady during dual-replica upgrade or fencing when aggregated API preconditions lag behind member recovery"
+				}
+				if condition.Reason == "APIServerDeployment_NoDeployment" ||
 					condition.Reason == "APIServerDeployment_NoPod" ||
 					condition.Reason == "APIServerDeployment_PreconditionNotFulfilled" ||
 					condition.Reason == "APIServerDeployment_UnavailablePod" ||
-					condition.Reason == "APIServices_Error") {
-				return "https://issues.redhat.com/browse/OCPBUGS-23746"
+					condition.Reason == "APIServices_Error" {
+					return "https://issues.redhat.com/browse/OCPBUGS-23746"
+				}
 			}
 		case "operator-lifecycle-manager-packageserver":
 			if condition.Type == configv1.OperatorAvailable && condition.Status == configv1.ConditionFalse && condition.Reason == "ClusterServiceVersionNotSucceeded" {
@@ -734,9 +738,18 @@ func testUpgradeOperatorProgressingStateTransitions(events monitorapi.Intervals,
 
 	except = func(co string, reason string) string {
 		switch co {
+		case "authentication":
+			if isTwoNode && reason == "APIServerDeployment_NewGeneration" {
+				return "authentication operator may roll oauth-apiserver (APIServerDeployment_NewGeneration) during DualReplica upgrades while machine-config is progressing"
+			}
 		case "etcd":
-			if isTwoNode && isTNFJobClusterOperatorReason(reason) {
-				return "clusteroperator/etcd may report Progressing=True while a TNF batch Job is running during DualReplica topology upgrades (CEO JobRunning condition reasons)"
+			if isTwoNode {
+				if reason == "EtcdMembers_MembersNotStarted" {
+					return "clusteroperator/etcd may report Progressing=True while an etcd member is still joining (EtcdMembers_MembersNotStarted) during DualReplica fencing or replacement"
+				}
+				if isTNFJobClusterOperatorReason(reason) {
+					return "clusteroperator/etcd may report Progressing=True while a TNF batch Job is running during DualReplica topology upgrades (CEO JobRunning condition reasons)"
+				}
 			}
 		case "console":
 			if reason == "SyncLoopRefresh_InProgress" {

From 64fa46e80a2b8ffae3af4863edc9822268495ea8 Mon Sep 17 00:00:00 2001
From: ehila <ehila@redhat.com>
Date: Tue, 12 May 2026 23:43:19 -0400
Subject: [PATCH 3/3] monitortests: dual-replica etcd NodeInstaller and samples
 API flake exceptions

Allow clusteroperator/etcd Progressing=True with reason NodeInstaller while MCO
is progressing on DualReplica (static pod revision rollout overlapping MCO).

During upgrade on DualReplica, tolerate openshift-samples Available=False with
SampleUpsertsPending and Degraded=True with APIServerServiceUnavailableError
when template writes hit transient apiserver errors.

Co-authored-by: Cursor Composer <noreply@cursor.com>
Signed-off-by: ehila <ehila@redhat.com>
---
 .../legacycvomonitortests/operators.go             | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/pkg/monitortests/clusterversionoperator/legacycvomonitortests/operators.go b/pkg/monitortests/clusterversionoperator/legacycvomonitortests/operators.go
index 76fb4839408e..5f88269c55f2 100644
--- a/pkg/monitortests/clusterversionoperator/legacycvomonitortests/operators.go
+++ b/pkg/monitortests/clusterversionoperator/legacycvomonitortests/operators.go
@@ -391,6 +391,17 @@ func testUpgradeOperatorStateTransitions(events monitorapi.Intervals, clientConf
 					return "https://issues.redhat.com/browse/OCPBUGS-23746"
 				}
 			}
+		case "openshift-samples":
+			if isTwoNode {
+				if condition.Type == configv1.OperatorAvailable && condition.Status == configv1.ConditionFalse &&
+					condition.Reason == "SampleUpsertsPending" {
+					return "openshift-samples may report Available=False with SampleUpsertsPending when sample CR writes hit transient apiserver errors during DualReplica disruptive upgrades"
+				}
+				if condition.Type == configv1.OperatorDegraded && condition.Status == configv1.ConditionTrue &&
+					condition.Reason == "APIServerServiceUnavailableError" {
+					return "openshift-samples may report Degraded with APIServerServiceUnavailableError when the API server is briefly unavailable during DualReplica upgrades"
+				}
+			}
 		case "operator-lifecycle-manager-packageserver":
 			if condition.Type == configv1.OperatorAvailable && condition.Status == configv1.ConditionFalse && condition.Reason == "ClusterServiceVersionNotSucceeded" {
 				return "https://issues.redhat.com/browse/OCPBUGS-23744"
@@ -744,6 +755,9 @@ func testUpgradeOperatorProgressingStateTransitions(events monitorapi.Intervals,
 			}
 		case "etcd":
 			if isTwoNode {
+				if reason == "NodeInstaller" {
+					return "clusteroperator/etcd may report Progressing=True while etcd static pods roll to a new revision (NodeInstaller) during DualReplica upgrades while machine-config is progressing"
+				}
 				if reason == "EtcdMembers_MembersNotStarted" {
 					return "clusteroperator/etcd may report Progressing=True while an etcd member is still joining (EtcdMembers_MembersNotStarted) during DualReplica fencing or replacement"
 				}