From c3e3fe086c4be3b97190f6fdd5b095825860fea2 Mon Sep 17 00:00:00 2001 From: ehila Date: Wed, 6 May 2026 15:34:09 -0400 Subject: [PATCH 1/3] monitortests: allow etcd CO blips during TNF jobs on two-node upgrades Teach legacy CVO monitor tests to treat cluster-etcd-operator condition reasons shaped as tnf-*_JobRunning as expected on DualReplica and HighlyAvailableArbiter topologies: Available=False during upgrade, and Progressing=True while machine-config is progressing. Add isTNFJobClusterOperatorReason helper and unit tests so we only match CEO job-running surfaces, not unrelated etcd failures. Co-authored-by: Cursor Composer Signed-off-by: ehila --- .../legacycvomonitortests/monitortest.go | 2 +- .../legacycvomonitortests/operators.go | 33 ++++++++++++++++++- .../legacycvomonitortests/operators_test.go | 23 +++++++++++++ 3 files changed, 56 insertions(+), 2 deletions(-) diff --git a/pkg/monitortests/clusterversionoperator/legacycvomonitortests/monitortest.go b/pkg/monitortests/clusterversionoperator/legacycvomonitortests/monitortest.go index 5e0fad828ca0..63d17ff98c0a 100644 --- a/pkg/monitortests/clusterversionoperator/legacycvomonitortests/monitortest.go +++ b/pkg/monitortests/clusterversionoperator/legacycvomonitortests/monitortest.go @@ -96,7 +96,7 @@ func (w *legacyMonitorTests) EvaluateTestsFromConstructedIntervals(ctx context.C if err != nil || level == unknownUpgradeLevel { return nil, fmt.Errorf("failed to determine upgrade level: %w", err) } - junits = append(junits, testUpgradeOperatorProgressingStateTransitions(finalIntervals, level == patchUpgradeLevel)...) + junits = append(junits, testUpgradeOperatorProgressingStateTransitions(finalIntervals, level == patchUpgradeLevel, w.adminRESTConfig)...) } else { junits = append(junits, testStableSystemOperatorStateTransitions(finalIntervals, w.adminRESTConfig)...) 
} diff --git a/pkg/monitortests/clusterversionoperator/legacycvomonitortests/operators.go b/pkg/monitortests/clusterversionoperator/legacycvomonitortests/operators.go index 54a5bb2ae696..5a49831171dd 100644 --- a/pkg/monitortests/clusterversionoperator/legacycvomonitortests/operators.go +++ b/pkg/monitortests/clusterversionoperator/legacycvomonitortests/operators.go @@ -162,6 +162,14 @@ func getControlPlaneTopology(clientConfig *rest.Config) (configv1.TopologyMode, return *topo, nil } +// isTNFJobClusterOperatorReason matches ClusterOperator condition Reason values emitted while +// two-node fencing (TNF) batch Jobs run in openshift-etcd. The cluster-etcd-operator maps +// active Job state into etcd's ClusterOperator with reasons shaped like +// tnf-*_JobRunning (including a per-job hash suffix on some Jobs, e.g. tnf-auth-job-master-0-64736551_JobRunning). +func isTNFJobClusterOperatorReason(reason string) bool { + return strings.HasPrefix(reason, "tnf-") && strings.HasSuffix(reason, "_JobRunning") +} + // isInUpgradeWindow determines if the given eventInterval falls within an upgrade window. // UpgradeStart and UpgradeRollback events start upgrade windows and can end and already started upgrade window. 
// UpgradeComplete and UpgradeFailed events end upgrade windows; if there was not an already started upgrade window, @@ -290,6 +298,11 @@ func testUpgradeOperatorStateTransitions(events monitorapi.Intervals, clientConf strings.Contains(condition.Message, `Waiting for Deployment`) { return "csi snapshot controller is allowed to have Available=False due to CSI webhook test on two node" } + case "etcd": + if condition.Type == configv1.OperatorAvailable && condition.Status == configv1.ConditionFalse && + isTNFJobClusterOperatorReason(condition.Reason) { + return "clusteroperator/etcd may report Available=False while a TNF batch Job is running on dual-replica topology (CEO JobRunning condition reasons)" + } } } @@ -600,11 +613,21 @@ func testOperatorStateTransitions(events monitorapi.Intervals, conditionTypes [] return ret } -func testUpgradeOperatorProgressingStateTransitions(events monitorapi.Intervals, isPatchLevelUpgrade bool) []*junitapi.JUnitTestCase { +func testUpgradeOperatorProgressingStateTransitions(events monitorapi.Intervals, isPatchLevelUpgrade bool, clientConfig *rest.Config) []*junitapi.JUnitTestCase { var ret []*junitapi.JUnitTestCase upgradeWindows := getUpgradeWindows(events) multiUpgrades := platformidentification.UpgradeNumberDuringCollection(events, time.Time{}, time.Time{}) > 1 + isTwoNode := false + if clientConfig != nil { + topology, err := getControlPlaneTopology(clientConfig) + if err != nil { + logrus.Warnf("Error checking for ControlPlaneTopology configuration for MCO co-progressing monitor (unable to apply two-node TNF exceptions): %v", err) + } else { + isTwoNode = topology == configv1.HighlyAvailableArbiterMode || topology == configv1.DualReplicaTopologyMode + } + } + var machineConfigProgressingStart time.Time var eventsInUpgradeWindows monitorapi.Intervals @@ -711,6 +734,10 @@ func testUpgradeOperatorProgressingStateTransitions(events monitorapi.Intervals, except = func(co string, reason string) string { switch co { + case "etcd": + if 
isTwoNode && isTNFJobClusterOperatorReason(reason) { + return "clusteroperator/etcd may report Progressing=True while a TNF batch Job is running during DualReplica topology upgrades (CEO JobRunning condition reasons)" + } case "console": if reason == "SyncLoopRefresh_InProgress" { return "https://issues.redhat.com/browse/OCPBUGS-64688" @@ -755,6 +782,10 @@ func testUpgradeOperatorProgressingStateTransitions(events monitorapi.Intervals, if reason == "" { return "https://issues.redhat.com/browse/OCPBUGS-63672" } + case "openshift-apiserver": + if isTwoNode && reason == "OperatorConfig_NewGeneration" { + return "openshift-apiserver operator may reconcile openshiftapiserveroperatorconfigs (OperatorConfig_NewGeneration) during DualReplica upgrades while machine-config is progressing" + } } return "" } diff --git a/pkg/monitortests/clusterversionoperator/legacycvomonitortests/operators_test.go b/pkg/monitortests/clusterversionoperator/legacycvomonitortests/operators_test.go index 017b8e966d6c..c6185d4a1751 100644 --- a/pkg/monitortests/clusterversionoperator/legacycvomonitortests/operators_test.go +++ b/pkg/monitortests/clusterversionoperator/legacycvomonitortests/operators_test.go @@ -370,3 +370,26 @@ func Test_patchUpgradeWithConfigClient(t *testing.T) { }) } } + +func TestIsTNFJobClusterOperatorReason(t *testing.T) { + tests := []struct { + reason string + want bool + }{ + {"tnf-setup-job_JobRunning", true}, + {"tnf-fencing-job_JobRunning", true}, + {"tnf-auth-job-master-0-64736551_JobRunning", true}, + {"tnf-update-setup-job-master-1-abc12345_JobRunning", true}, + {"tnf-after-setup-job-master-0-deadbeef_JobRunning", true}, + {"EtcdMembersProgressing", false}, + {"NodeInstaller_InstallerPodRunning", false}, + {"tnf-setup-job_JobComplete", false}, + {"setup-job_JobRunning", false}, + {"", false}, + } + for _, tt := range tests { + t.Run(tt.reason, func(t *testing.T) { + assert.Equal(t, tt.want, isTNFJobClusterOperatorReason(tt.reason)) + }) + } +} From 
5894664727fa84efc819f007bccf824bcc9e193d Mon Sep 17 00:00:00 2001 From: ehila Date: Thu, 7 May 2026 01:19:29 -0400 Subject: [PATCH 2/3] monitortests: broaden dual-replica CVO exceptions for TNF upgrade follow-ups Allow etcd Progressing while MCO runs when CEO reports EtcdMembers_MembersNotStarted (member still joining during fencing/replacement), in addition to existing tnf-*_JobRunning. On dual-replica, tolerate openshift-apiserver Available blips for APIServices_PreconditionNotReady during upgrade, and MCO-time Progressing for OperatorConfig_NewGeneration (openshift-apiserver) and APIServerDeployment_NewGeneration (authentication) as operators roll the control plane alongside machine-config. Co-authored-by: Cursor Composer Signed-off-by: ehila --- .../legacycvomonitortests/operators.go | 25 ++++++++++++++----- 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/pkg/monitortests/clusterversionoperator/legacycvomonitortests/operators.go b/pkg/monitortests/clusterversionoperator/legacycvomonitortests/operators.go index 5a49831171dd..76fb4839408e 100644 --- a/pkg/monitortests/clusterversionoperator/legacycvomonitortests/operators.go +++ b/pkg/monitortests/clusterversionoperator/legacycvomonitortests/operators.go @@ -379,13 +379,17 @@ func testUpgradeOperatorStateTransitions(events monitorapi.Intervals, clientConf return "https://issues.redhat.com/browse/OCPBUGS-62517" } case "openshift-apiserver": - if condition.Type == configv1.OperatorAvailable && condition.Status == configv1.ConditionFalse && - (condition.Reason == "APIServerDeployment_NoDeployment" || + if condition.Type == configv1.OperatorAvailable && condition.Status == configv1.ConditionFalse { + if isTwoNode && condition.Reason == "APIServices_PreconditionNotReady" { + return "openshift-apiserver may briefly report Available=False with APIServices_PreconditionNotReady during dual-replica upgrade or fencing when aggregated API preconditions lag behind member recovery" + } + if condition.Reason == 
"APIServerDeployment_NoDeployment" || condition.Reason == "APIServerDeployment_NoPod" || condition.Reason == "APIServerDeployment_PreconditionNotFulfilled" || condition.Reason == "APIServerDeployment_UnavailablePod" || - condition.Reason == "APIServices_Error") { - return "https://issues.redhat.com/browse/OCPBUGS-23746" + condition.Reason == "APIServices_Error" { + return "https://issues.redhat.com/browse/OCPBUGS-23746" + } } case "operator-lifecycle-manager-packageserver": if condition.Type == configv1.OperatorAvailable && condition.Status == configv1.ConditionFalse && condition.Reason == "ClusterServiceVersionNotSucceeded" { @@ -734,9 +738,18 @@ func testUpgradeOperatorProgressingStateTransitions(events monitorapi.Intervals, except = func(co string, reason string) string { switch co { + case "authentication": + if isTwoNode && reason == "APIServerDeployment_NewGeneration" { + return "authentication operator may roll oauth-apiserver (APIServerDeployment_NewGeneration) during DualReplica upgrades while machine-config is progressing" + } case "etcd": - if isTwoNode && isTNFJobClusterOperatorReason(reason) { - return "clusteroperator/etcd may report Progressing=True while a TNF batch Job is running during DualReplica topology upgrades (CEO JobRunning condition reasons)" + if isTwoNode { + if reason == "EtcdMembers_MembersNotStarted" { + return "clusteroperator/etcd may report Progressing=True while an etcd member is still joining (EtcdMembers_MembersNotStarted) during DualReplica fencing or replacement" + } + if isTNFJobClusterOperatorReason(reason) { + return "clusteroperator/etcd may report Progressing=True while a TNF batch Job is running during DualReplica topology upgrades (CEO JobRunning condition reasons)" + } } case "console": if reason == "SyncLoopRefresh_InProgress" { From 64fa46e80a2b8ffae3af4863edc9822268495ea8 Mon Sep 17 00:00:00 2001 From: ehila Date: Tue, 12 May 2026 23:43:19 -0400 Subject: [PATCH 3/3] monitortests: dual-replica etcd NodeInstaller and 
samples API flake exceptions Allow clusteroperator/etcd Progressing=True with reason NodeInstaller while MCO is progressing on DualReplica (static pod revision rollout overlapping MCO). During upgrade on DualReplica, tolerate openshift-samples Available=False with SampleUpsertsPending and Degraded=True with APIServerServiceUnavailableError when template writes hit transient apiserver errors. Co-authored-by: Cursor Composer Signed-off-by: ehila --- .../legacycvomonitortests/operators.go | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/pkg/monitortests/clusterversionoperator/legacycvomonitortests/operators.go b/pkg/monitortests/clusterversionoperator/legacycvomonitortests/operators.go index 76fb4839408e..5f88269c55f2 100644 --- a/pkg/monitortests/clusterversionoperator/legacycvomonitortests/operators.go +++ b/pkg/monitortests/clusterversionoperator/legacycvomonitortests/operators.go @@ -391,6 +391,17 @@ func testUpgradeOperatorStateTransitions(events monitorapi.Intervals, clientConf return "https://issues.redhat.com/browse/OCPBUGS-23746" } } + case "openshift-samples": + if isTwoNode { + if condition.Type == configv1.OperatorAvailable && condition.Status == configv1.ConditionFalse && + condition.Reason == "SampleUpsertsPending" { + return "openshift-samples may report Available=False with SampleUpsertsPending when sample CR writes hit transient apiserver errors during DualReplica disruptive upgrades" + } + if condition.Type == configv1.OperatorDegraded && condition.Status == configv1.ConditionTrue && + condition.Reason == "APIServerServiceUnavailableError" { + return "openshift-samples may report Degraded with APIServerServiceUnavailableError when the API server is briefly unavailable during DualReplica upgrades" + } + } case "operator-lifecycle-manager-packageserver": if condition.Type == configv1.OperatorAvailable && condition.Status == configv1.ConditionFalse && condition.Reason == "ClusterServiceVersionNotSucceeded" { return 
"https://issues.redhat.com/browse/OCPBUGS-23744" @@ -744,6 +755,9 @@ func testUpgradeOperatorProgressingStateTransitions(events monitorapi.Intervals, } case "etcd": if isTwoNode { + if reason == "NodeInstaller" { + return "clusteroperator/etcd may report Progressing=True while etcd static pods roll to a new revision (NodeInstaller) during DualReplica upgrades while machine-config is progressing" + } if reason == "EtcdMembers_MembersNotStarted" { return "clusteroperator/etcd may report Progressing=True while an etcd member is still joining (EtcdMembers_MembersNotStarted) during DualReplica fencing or replacement" }