From 0451c2be15e5ce81d79954c9afcc7ecc5ce5456d Mon Sep 17 00:00:00 2001 From: Britania Rodriguez Reyes <145056127+britaniar@users.noreply.github.com> Date: Mon, 23 Feb 2026 10:17:41 -0800 Subject: [PATCH 1/9] fix: fix flaky enveloped e2e (#455) add logging and minor changes to help debug and e2e Signed-off-by: Britania Rodriguez Reyes --- pkg/controllers/workgenerator/envelope.go | 8 +++++++- pkg/controllers/workgenerator/envelope_test.go | 12 ++++++++++++ 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/pkg/controllers/workgenerator/envelope.go b/pkg/controllers/workgenerator/envelope.go index a7d2f5a8e..05518f98c 100644 --- a/pkg/controllers/workgenerator/envelope.go +++ b/pkg/controllers/workgenerator/envelope.go @@ -62,10 +62,10 @@ func (r *Reconciler) createOrUpdateEnvelopeCRWorkObj( fleetv1beta1.PlacementTrackingLabel: binding.GetLabels()[fleetv1beta1.PlacementTrackingLabel], fleetv1beta1.EnvelopeTypeLabel: envelopeReader.GetEnvelopeType(), fleetv1beta1.EnvelopeNameLabel: envelopeReader.GetName(), - fleetv1beta1.EnvelopeNamespaceLabel: envelopeReader.GetNamespace(), } // Add ParentNamespaceLabel if the binding is namespaced if binding.GetNamespace() != "" { + labelMatcher[fleetv1beta1.EnvelopeNamespaceLabel] = envelopeReader.GetNamespace() labelMatcher[fleetv1beta1.ParentNamespaceLabel] = binding.GetNamespace() } workList := &fleetv1beta1.WorkList{} @@ -87,6 +87,12 @@ func (r *Reconciler) createOrUpdateEnvelopeCRWorkObj( "resourceBinding", klog.KObj(binding), "resourceSnapshot", klog.KObj(resourceSnapshot), "envelope", envelopeReader.GetEnvelopeObjRef()) + // Log the work object names to help debug. 
+ workNames := make([]string, len(workList.Items)) + for i := range workList.Items { + workNames[i] = workList.Items[i].Name + } + klog.ErrorS(wrappedErr, "Duplicate work objects found", "works", workNames) return nil, controller.NewUnexpectedBehaviorError(wrappedErr) case len(workList.Items) == 1: klog.V(2).InfoS("Found existing work object for the envelope; updating it", diff --git a/pkg/controllers/workgenerator/envelope_test.go b/pkg/controllers/workgenerator/envelope_test.go index 310553777..c8b112a53 100644 --- a/pkg/controllers/workgenerator/envelope_test.go +++ b/pkg/controllers/workgenerator/envelope_test.go @@ -483,6 +483,18 @@ func TestCreateOrUpdateEnvelopeCRWorkObj(t *testing.T) { want: nil, wantErr: true, }, + { + name: "two existing works should result in error", + envelopeReader: resourceEnvelope, + resourceOverrideSnapshotHash: "new-resource-hash", + clusterResourceOverrideSnapshotHash: "new-cluster-resource-hash", + existingObjects: func() []client.Object { + existingWork1 := existingWork.DeepCopy() + existingWork1.Name = "test-work-1" + return []client.Object{existingWork, existingWork1} + }(), + wantErr: true, + }, } for _, tt := range tests { From 83e9752b0ca1f4a4474a8e0f50c4225caad949ff Mon Sep 17 00:00:00 2001 From: Ryan Zhang Date: Tue, 24 Feb 2026 13:48:53 -0800 Subject: [PATCH 2/9] feat: Lint helm (#458) add helm lint to CI Signed-off-by: Ryan Zhang --- .github/workflows/code-lint.yml | 39 ++++++++++++++++++++++++--------- 1 file changed, 29 insertions(+), 10 deletions(-) diff --git a/.github/workflows/code-lint.yml b/.github/workflows/code-lint.yml index 8bf3c0614..deeced9ec 100644 --- a/.github/workflows/code-lint.yml +++ b/.github/workflows/code-lint.yml @@ -14,10 +14,9 @@ on: env: # Common versions - GO_VERSION: '1.24.13' + GO_VERSION: "1.24.13" jobs: - detect-noop: runs-on: ubuntu-latest outputs: @@ -58,13 +57,33 @@ jobs: contents: read steps: - - name: Set up Go ${{ env.GO_VERSION }} - uses: actions/setup-go@v6 - with: - go-version: 
${{ env.GO_VERSION }} + - name: Set up Go ${{ env.GO_VERSION }} + uses: actions/setup-go@v6 + with: + go-version: ${{ env.GO_VERSION }} - - name: Check out code into the Go module directory - uses: actions/checkout@v6.0.2 + - name: Check out code into the Go module directory + uses: actions/checkout@v6.0.2 + + - name: golangci-lint + run: make lint + + helm-lint: + name: "Helm Lint" + runs-on: ubuntu-latest + needs: detect-noop + if: needs.detect-noop.outputs.noop != 'true' + + steps: + - name: Check out code + uses: actions/checkout@v6.0.2 + + - name: Set up Helm + uses: azure/setup-helm@v4 + with: + version: v3.17.0 - - name: golangci-lint - run: make lint + - name: Lint Helm charts + run: | + helm lint charts/hub-agent + helm lint charts/member-agent From eade0c467875e30940d1ac08a2d8d48e36100d7c Mon Sep 17 00:00:00 2001 From: Britania Rodriguez Reyes <145056127+britaniar@users.noreply.github.com> Date: Tue, 24 Feb 2026 18:13:56 -0800 Subject: [PATCH 3/9] feat: add stage level metrics for update run (#437) --- pkg/controllers/updaterun/controller.go | 37 +-- .../updaterun/controller_integration_test.go | 113 ++++++++++ pkg/controllers/updaterun/execution.go | 16 +- .../updaterun/execution_integration_test.go | 210 ++++++++++++++++-- pkg/controllers/updaterun/metrics.go | 102 +++++++++ .../updaterun/stop_integration_test.go | 54 +++-- pkg/metrics/hub/metrics.go | 18 ++ 7 files changed, 476 insertions(+), 74 deletions(-) create mode 100644 pkg/controllers/updaterun/metrics.go diff --git a/pkg/controllers/updaterun/controller.go b/pkg/controllers/updaterun/controller.go index befdc3831..e1c77f455 100644 --- a/pkg/controllers/updaterun/controller.go +++ b/pkg/controllers/updaterun/controller.go @@ -23,7 +23,6 @@ import ( "fmt" "time" - "github.com/prometheus/client_golang/prometheus" "k8s.io/apimachinery/pkg/api/meta" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" @@ -40,7 +39,6 @@ import ( "sigs.k8s.io/controller-runtime/pkg/reconcile" 
placementv1beta1 "github.com/kubefleet-dev/kubefleet/apis/placement/v1beta1" - hubmetrics "github.com/kubefleet-dev/kubefleet/pkg/metrics/hub" "github.com/kubefleet-dev/kubefleet/pkg/utils" "github.com/kubefleet-dev/kubefleet/pkg/utils/condition" "github.com/kubefleet-dev/kubefleet/pkg/utils/controller" @@ -237,8 +235,8 @@ func (r *Reconciler) handleDelete(ctx context.Context, updateRun placementv1beta } klog.V(2).InfoS("Deleted all approvalRequests associated with the updateRun", "updateRun", runObjRef) - // Delete the update run status metric. - hubmetrics.FleetUpdateRunStatusLastTimestampSeconds.DeletePartialMatch(prometheus.Labels{"namespace": updateRun.GetNamespace(), "name": updateRun.GetName()}) + // Delete the update run metrics. + deleteUpdateRunMetrics(updateRun) controllerutil.RemoveFinalizer(updateRun, placementv1beta1.UpdateRunFinalizer) if err := r.Client.Update(ctx, updateRun); err != nil { @@ -485,37 +483,6 @@ func handleApprovalRequestDelete(obj client.Object, q workqueue.TypedRateLimitin }) } -// emitUpdateRunStatusMetric emits the update run status metric based on status conditions in the updateRun. 
-func emitUpdateRunStatusMetric(updateRun placementv1beta1.UpdateRunObj) { - generation := updateRun.GetGeneration() - state := updateRun.GetUpdateRunSpec().State - - updateRunStatus := updateRun.GetUpdateRunStatus() - succeedCond := meta.FindStatusCondition(updateRunStatus.Conditions, string(placementv1beta1.StagedUpdateRunConditionSucceeded)) - if succeedCond != nil && succeedCond.ObservedGeneration == generation { - hubmetrics.FleetUpdateRunStatusLastTimestampSeconds.WithLabelValues(updateRun.GetNamespace(), updateRun.GetName(), string(state), - string(placementv1beta1.StagedUpdateRunConditionSucceeded), string(succeedCond.Status), succeedCond.Reason).SetToCurrentTime() - return - } - - progressingCond := meta.FindStatusCondition(updateRunStatus.Conditions, string(placementv1beta1.StagedUpdateRunConditionProgressing)) - if progressingCond != nil && progressingCond.ObservedGeneration == generation { - hubmetrics.FleetUpdateRunStatusLastTimestampSeconds.WithLabelValues(updateRun.GetNamespace(), updateRun.GetName(), string(state), - string(placementv1beta1.StagedUpdateRunConditionProgressing), string(progressingCond.Status), progressingCond.Reason).SetToCurrentTime() - return - } - - initializedCond := meta.FindStatusCondition(updateRunStatus.Conditions, string(placementv1beta1.StagedUpdateRunConditionInitialized)) - if initializedCond != nil && initializedCond.ObservedGeneration == generation { - hubmetrics.FleetUpdateRunStatusLastTimestampSeconds.WithLabelValues(updateRun.GetNamespace(), updateRun.GetName(), string(state), - string(placementv1beta1.StagedUpdateRunConditionInitialized), string(initializedCond.Status), initializedCond.Reason).SetToCurrentTime() - return - } - - // We should rarely reach here, it can only happen when updating updateRun status fails. 
- klog.V(2).InfoS("There's no valid status condition on updateRun, status updating failed possibly", "updateRun", klog.KObj(updateRun)) -} - func removeWaitTimeFromUpdateRunStatus(updateRun placementv1beta1.UpdateRunObj) { // Remove waitTime from the updateRun status for BeforeStageTask and AfterStageTask for type Approval. updateRunStatus := updateRun.GetUpdateRunStatus() diff --git a/pkg/controllers/updaterun/controller_integration_test.go b/pkg/controllers/updaterun/controller_integration_test.go index eb5db68a1..0d0efc63f 100644 --- a/pkg/controllers/updaterun/controller_integration_test.go +++ b/pkg/controllers/updaterun/controller_integration_test.go @@ -234,6 +234,8 @@ var _ = Describe("Test the clusterStagedUpdateRun controller", func() { func resetUpdateRunMetrics() { hubmetrics.FleetUpdateRunStatusLastTimestampSeconds.Reset() + hubmetrics.FleetUpdateRunStageClusterUpdatingDurationSeconds.Reset() + hubmetrics.FleetUpdateRunApprovalRequestLatencySeconds.Reset() } // validateUpdateRunMetricsEmitted validates the update run status metrics are emitted and are emitted in the correct order. @@ -258,6 +260,117 @@ func validateUpdateRunMetricsEmitted(wantMetrics ...*prometheusclientmodel.Metri }, timeout, interval).Should(Succeed(), "failed to validate the update run status metrics") } +// validateUpdateRunApprovalStageTaskMetric validates the update run approval stage task metric by checking labels and count. 
+func validateUpdateRunApprovalStageTaskMetric(wantMetrics ...*prometheusclientmodel.Metric) { + Eventually(func() error { + metricFamilies, err := ctrlmetrics.Registry.Gather() + if err != nil { + return fmt.Errorf("failed to gather metrics: %w", err) + } + var gotMetrics []*prometheusclientmodel.Metric + for _, mf := range metricFamilies { + if mf.GetName() == "fleet_workload_update_run_approval_request_latency_seconds" { + gotMetrics = mf.GetMetric() + break + } + } + + if len(gotMetrics) != len(wantMetrics) { + return fmt.Errorf("metric count mismatch: got %d, want %d", len(gotMetrics), len(wantMetrics)) + } + + for i, m := range gotMetrics { + // Compare labels by extracting values (avoids protobuf unexported field issues). + gotLabels := make(map[string]string) + for _, l := range m.GetLabel() { + gotLabels[l.GetName()] = l.GetValue() + } + wantLabels := make(map[string]string) + for _, l := range wantMetrics[i].GetLabel() { + wantLabels[l.GetName()] = l.GetValue() + } + if diff := cmp.Diff(gotLabels, wantLabels); diff != "" { + return fmt.Errorf("metric labels mismatch (-got, +want):\n%s", diff) + } + if m.GetHistogram().GetSampleCount() != wantMetrics[i].GetHistogram().GetSampleCount() { + return fmt.Errorf("metric sample count mismatch: got %d, want %d", m.GetHistogram().GetSampleCount(), wantMetrics[i].GetHistogram().GetSampleCount()) + } + } + return nil + }, timeout, interval).Should(Succeed(), "failed to validate the update run approval stage task metrics") +} + +// validateUpdateRunStageMetricsEmitted validates the update run stage metrics by checking labels and count. 
+func validateUpdateRunStageMetricsEmitted(wantMetrics ...*prometheusclientmodel.Metric) { + Eventually(func() error { + metricFamilies, err := ctrlmetrics.Registry.Gather() + if err != nil { + return fmt.Errorf("failed to gather metrics: %w", err) + } + var gotMetrics []*prometheusclientmodel.Metric + for _, mf := range metricFamilies { + if mf.GetName() == "fleet_workload_update_run_stage_cluster_updating_duration_seconds" { + gotMetrics = mf.GetMetric() + break + } + } + + if len(gotMetrics) != len(wantMetrics) { + return fmt.Errorf("metric count mismatch: got %d, want %d", len(gotMetrics), len(wantMetrics)) + } + + for i, m := range gotMetrics { + // Compare labels by extracting values (avoids protobuf unexported field issues). + gotLabels := make(map[string]string) + for _, l := range m.GetLabel() { + gotLabels[l.GetName()] = l.GetValue() + } + wantLabels := make(map[string]string) + for _, l := range wantMetrics[i].GetLabel() { + wantLabels[l.GetName()] = l.GetValue() + } + if diff := cmp.Diff(gotLabels, wantLabels); diff != "" { + return fmt.Errorf("metric labels mismatch (-got, +want):\n%s", diff) + } + if m.GetHistogram().GetSampleCount() != wantMetrics[i].GetHistogram().GetSampleCount() { + return fmt.Errorf("metric sample count mismatch: got %d, want %d", m.GetHistogram().GetSampleCount(), wantMetrics[i].GetHistogram().GetSampleCount()) + } + } + return nil + }, timeout, interval).Should(Succeed(), "failed to validate the update run stage metrics") +} + +func generateStageClusterUpdatingMetric( + updateRun *placementv1beta1.ClusterStagedUpdateRun, +) *prometheusclientmodel.Metric { + return &prometheusclientmodel.Metric{ + Label: []*prometheusclientmodel.LabelPair{ + {Name: ptr.To("namespace"), Value: &updateRun.Namespace}, + {Name: ptr.To("name"), Value: &updateRun.Name}, + }, + Histogram: &prometheusclientmodel.Histogram{ + SampleCount: ptr.To(uint64(len(updateRun.Status.StagesStatus))), + }, + } +} + +func generateApprovalStageTaskMetric( + updateRun 
*placementv1beta1.ClusterStagedUpdateRun, + stageTask string, + stageTaskCount uint64, +) *prometheusclientmodel.Metric { + return &prometheusclientmodel.Metric{ + Label: []*prometheusclientmodel.LabelPair{ + {Name: ptr.To("namespace"), Value: &updateRun.Namespace}, + {Name: ptr.To("name"), Value: &updateRun.Name}, + {Name: ptr.To("taskType"), Value: ptr.To(stageTask)}, + }, + Histogram: &prometheusclientmodel.Histogram{ + SampleCount: ptr.To(stageTaskCount), + }, + } +} + // generateMetricsLabels generates the labels for the update run status metrics. // We pass the state explicitly instead of using updateRun.Spec.State because the metric // should reflect the state at the time the condition occurred, which may be different from diff --git a/pkg/controllers/updaterun/execution.go b/pkg/controllers/updaterun/execution.go index bdb37ac62..09044149e 100644 --- a/pkg/controllers/updaterun/execution.go +++ b/pkg/controllers/updaterun/execution.go @@ -275,6 +275,14 @@ func (r *Reconciler) executeUpdatingStage( } if finishedClusterCount == len(updatingStageStatus.Clusters) { + // Only record the metric once when transitioning from clusters updating to waiting/succeeded. + // Record only when the stage reason is still "Started", meaning clusters just finished and we haven't yet + // transitioned to waiting for after-stage tasks. On subsequent reconciles, the reason will be "Waiting", + // "Succeeded", or "Stopped" (if the update run was stopped), so the metric won't be recorded again. 
+ progressingCond := meta.FindStatusCondition(updatingStageStatus.Conditions, string(placementv1beta1.StageUpdatingConditionProgressing)) + if progressingCond != nil && progressingCond.Reason == condition.StageUpdatingStartedReason { + recordStageClusterUpdatingDuration(updatingStageStatus, updateRun) + } return r.handleStageCompletion(ctx, updatingStageIndex, updateRun, updatingStageStatus) } @@ -477,7 +485,7 @@ func (r *Reconciler) handleStageApprovalTask( // Approved state should not change once the approval is accepted. klog.V(2).InfoS("The approval request has been approval-accepted, ignoring changing back to unapproved", "approvalRequestTask", requestRef, "stage", updatingStage.Name, "updateRun", updateRunRef) } - markStageTaskRequestApproved(stageTaskStatus, updateRun.GetGeneration()) + markStageTaskRequestApproved(stageTaskStatus, updateRun, stageTaskType) } else { // retriable error klog.ErrorS(err, "Failed to create the approval request", "approvalRequest", requestRef, "stage", updatingStage.Name, "updateRun", updateRunRef) @@ -825,14 +833,16 @@ func markStageTaskRequestCreated(stageTaskStatus *placementv1beta1.StageTaskStat } // markStageTaskRequestApproved marks the Approval for the before or after stage task as Approved in memory. 
-func markStageTaskRequestApproved(stageTaskStatus *placementv1beta1.StageTaskStatus, generation int64) { +func markStageTaskRequestApproved(stageTaskStatus *placementv1beta1.StageTaskStatus, updateRun placementv1beta1.UpdateRunObj, taskType string) { meta.SetStatusCondition(&stageTaskStatus.Conditions, metav1.Condition{ Type: string(placementv1beta1.StageTaskConditionApprovalRequestApproved), Status: metav1.ConditionTrue, - ObservedGeneration: generation, + ObservedGeneration: updateRun.GetGeneration(), Reason: condition.StageTaskApprovalRequestApprovedReason, Message: "ApprovalRequest object is approved", }) + + recordApprovalRequestLatency(stageTaskStatus, updateRun, taskType) } // markAfterStageWaitTimeElapsed marks the TimeWait after stage task as TimeElapsed in memory. diff --git a/pkg/controllers/updaterun/execution_integration_test.go b/pkg/controllers/updaterun/execution_integration_test.go index 219c22fdb..3600e4268 100644 --- a/pkg/controllers/updaterun/execution_integration_test.go +++ b/pkg/controllers/updaterun/execution_integration_test.go @@ -148,7 +148,7 @@ var _ = Describe("UpdateRun execution tests - double stages", func() { Expect(k8sClient.Delete(ctx, clusterResourceOverride)).Should(SatisfyAny(Succeed(), utils.NotFoundMatcher{})) clusterResourceOverride = nil - By("Checking update run status metrics are removed") + By("Checking update run metrics are removed") // No metrics are emitted as all are removed after updateRun is deleted. validateUpdateRunMetricsEmitted() resetUpdateRunMetrics() @@ -214,6 +214,7 @@ var _ = Describe("UpdateRun execution tests - double stages", func() { // Approval task has been approved. 
wantStatus.StagesStatus[0].BeforeStageTaskStatus[0].Conditions = append(wantStatus.StagesStatus[0].BeforeStageTaskStatus[0].Conditions, generateTrueCondition(updateRun, placementv1beta1.StageTaskConditionApprovalRequestApproved)) + }) It("Should mark the 1st cluster in the 1st stage as succeeded after marking the binding available", func() { @@ -236,8 +237,9 @@ var _ = Describe("UpdateRun execution tests - double stages", func() { By("Validating the 1st stage has startTime set") Expect(updateRun.Status.StagesStatus[0].StartTime).ShouldNot(BeNil()) - By("Checking update run status metrics are emitted") + By("Checking update run metrics are emitted") validateUpdateRunMetricsEmitted(generateWaitingMetric(placementv1beta1.StateRun, updateRun), generateProgressingMetric(placementv1beta1.StateRun, updateRun)) + validateUpdateRunApprovalStageTaskMetric(generateApprovalStageTaskMetric(updateRun, placementv1beta1.BeforeStageTaskLabelValue, 1)) }) It("Should mark the 2nd cluster in the 1st stage as succeeded after marking the binding available", func() { @@ -254,8 +256,9 @@ var _ = Describe("UpdateRun execution tests - double stages", func() { wantStatus.StagesStatus[0].Clusters[2].Conditions = append(wantStatus.StagesStatus[0].Clusters[2].Conditions, generateTrueCondition(updateRun, placementv1beta1.ClusterUpdatingConditionStarted)) validateClusterStagedUpdateRunStatus(ctx, updateRun, wantStatus, "") - By("Checking update run status metrics are emitted") + By("Checking update run metrics are emitted") validateUpdateRunMetricsEmitted(generateWaitingMetric(placementv1beta1.StateRun, updateRun), generateProgressingMetric(placementv1beta1.StateRun, updateRun)) + validateUpdateRunApprovalStageTaskMetric(generateApprovalStageTaskMetric(updateRun, placementv1beta1.BeforeStageTaskLabelValue, 1)) }) It("Should mark the 3rd cluster in the 1st stage as succeeded after marking the binding available", func() { @@ -272,8 +275,9 @@ var _ = Describe("UpdateRun execution tests - double 
stages", func() { wantStatus.StagesStatus[0].Clusters[3].Conditions = append(wantStatus.StagesStatus[0].Clusters[3].Conditions, generateTrueCondition(updateRun, placementv1beta1.ClusterUpdatingConditionStarted)) validateClusterStagedUpdateRunStatus(ctx, updateRun, wantStatus, "") - By("Checking update run status metrics are emitted") + By("Checking update run metrics are emitted") validateUpdateRunMetricsEmitted(generateWaitingMetric(placementv1beta1.StateRun, updateRun), generateProgressingMetric(placementv1beta1.StateRun, updateRun)) + validateUpdateRunApprovalStageTaskMetric(generateApprovalStageTaskMetric(updateRun, placementv1beta1.BeforeStageTaskLabelValue, 1)) }) It("Should mark the 4th cluster in the 1st stage as succeeded after marking the binding available", func() { @@ -290,8 +294,9 @@ var _ = Describe("UpdateRun execution tests - double stages", func() { wantStatus.StagesStatus[0].Clusters[4].Conditions = append(wantStatus.StagesStatus[0].Clusters[4].Conditions, generateTrueCondition(updateRun, placementv1beta1.ClusterUpdatingConditionStarted)) validateClusterStagedUpdateRunStatus(ctx, updateRun, wantStatus, "") - By("Checking update run status metrics are emitted") + By("Checking update run metrics are emitted") validateUpdateRunMetricsEmitted(generateWaitingMetric(placementv1beta1.StateRun, updateRun), generateProgressingMetric(placementv1beta1.StateRun, updateRun)) + validateUpdateRunApprovalStageTaskMetric(generateApprovalStageTaskMetric(updateRun, placementv1beta1.BeforeStageTaskLabelValue, 1)) }) It("Should mark the 5th cluster in the 1st stage as succeeded after marking the binding available", func() { @@ -312,8 +317,9 @@ var _ = Describe("UpdateRun execution tests - double stages", func() { meta.SetStatusCondition(&wantStatus.Conditions, generateFalseCondition(updateRun, placementv1beta1.StagedUpdateRunConditionProgressing)) validateClusterStagedUpdateRunStatus(ctx, updateRun, wantStatus, "") - By("Checking update run status metrics are 
emitted") + By("Checking update run metrics are emitted") validateUpdateRunMetricsEmitted(generateProgressingMetric(placementv1beta1.StateRun, updateRun), generateWaitingMetric(placementv1beta1.StateRun, updateRun)) + validateUpdateRunApprovalStageTaskMetric(generateApprovalStageTaskMetric(updateRun, placementv1beta1.BeforeStageTaskLabelValue, 1)) }) It("Should complete the 1st stage after wait time passed and approval request approved and move on to the 2nd stage", func() { @@ -368,8 +374,9 @@ var _ = Describe("UpdateRun execution tests - double stages", func() { approvalCreateTime := meta.FindStatusCondition(updateRun.Status.StagesStatus[0].AfterStageTaskStatus[1].Conditions, string(placementv1beta1.StageTaskConditionApprovalRequestCreated)).LastTransitionTime.Time Expect(approvalCreateTime.Before(waitEndTime)).Should(BeTrue()) - By("Checking update run status metrics are emitted") + By("Checking update run metrics are emitted") validateUpdateRunMetricsEmitted(generateProgressingMetric(placementv1beta1.StateRun, updateRun), generateWaitingMetric(placementv1beta1.StateRun, updateRun)) + validateUpdateRunApprovalStageTaskMetric(generateApprovalStageTaskMetric(updateRun, placementv1beta1.AfterStageTaskLabelValue, 1), generateApprovalStageTaskMetric(updateRun, placementv1beta1.BeforeStageTaskLabelValue, 1)) }) It("Should create approval request before 2nd stage", func() { @@ -391,8 +398,9 @@ var _ = Describe("UpdateRun execution tests - double stages", func() { } validateApprovalRequestCreated(wantApprovalRequest) - By("Checking update run status metrics are emitted") + By("Checking update run metrics are emitted") validateUpdateRunMetricsEmitted(generateProgressingMetric(placementv1beta1.StateRun, updateRun), generateWaitingMetric(placementv1beta1.StateRun, updateRun)) + validateUpdateRunApprovalStageTaskMetric(generateApprovalStageTaskMetric(updateRun, placementv1beta1.AfterStageTaskLabelValue, 1), generateApprovalStageTaskMetric(updateRun, 
placementv1beta1.BeforeStageTaskLabelValue, 1)) }) It("Should not start rolling out 2nd stage", func() { @@ -446,8 +454,9 @@ var _ = Describe("UpdateRun execution tests - double stages", func() { By("Validating the 2nd stage has startTime set") Expect(updateRun.Status.StagesStatus[0].StartTime).ShouldNot(BeNil()) - By("Checking update run status metrics are emitted") + By("Checking update run metrics are emitted") validateUpdateRunMetricsEmitted(generateWaitingMetric(placementv1beta1.StateRun, updateRun), generateProgressingMetric(placementv1beta1.StateRun, updateRun)) + validateUpdateRunApprovalStageTaskMetric(generateApprovalStageTaskMetric(updateRun, placementv1beta1.AfterStageTaskLabelValue, 1), generateApprovalStageTaskMetric(updateRun, placementv1beta1.BeforeStageTaskLabelValue, 2)) }) It("Should mark the 2nd cluster in the 2nd stage as succeeded after marking the binding available", func() { @@ -464,8 +473,9 @@ var _ = Describe("UpdateRun execution tests - double stages", func() { wantStatus.StagesStatus[1].Clusters[2].Conditions = append(wantStatus.StagesStatus[1].Clusters[2].Conditions, generateTrueCondition(updateRun, placementv1beta1.ClusterUpdatingConditionStarted)) validateClusterStagedUpdateRunStatus(ctx, updateRun, wantStatus, "") - By("Checking update run status metrics are emitted") + By("Checking update run metrics are emitted") validateUpdateRunMetricsEmitted(generateWaitingMetric(placementv1beta1.StateRun, updateRun), generateProgressingMetric(placementv1beta1.StateRun, updateRun)) + validateUpdateRunApprovalStageTaskMetric(generateApprovalStageTaskMetric(updateRun, placementv1beta1.AfterStageTaskLabelValue, 1), generateApprovalStageTaskMetric(updateRun, placementv1beta1.BeforeStageTaskLabelValue, 2)) }) It("Should mark the 3rd cluster in the 2nd stage as succeeded after marking the binding available", func() { @@ -482,8 +492,9 @@ var _ = Describe("UpdateRun execution tests - double stages", func() { 
wantStatus.StagesStatus[1].Clusters[3].Conditions = append(wantStatus.StagesStatus[1].Clusters[3].Conditions, generateTrueCondition(updateRun, placementv1beta1.ClusterUpdatingConditionStarted)) validateClusterStagedUpdateRunStatus(ctx, updateRun, wantStatus, "") - By("Checking update run status metrics are emitted") + By("Checking update run metrics are emitted") validateUpdateRunMetricsEmitted(generateWaitingMetric(placementv1beta1.StateRun, updateRun), generateProgressingMetric(placementv1beta1.StateRun, updateRun)) + validateUpdateRunApprovalStageTaskMetric(generateApprovalStageTaskMetric(updateRun, placementv1beta1.AfterStageTaskLabelValue, 1), generateApprovalStageTaskMetric(updateRun, placementv1beta1.BeforeStageTaskLabelValue, 2)) }) It("Should mark the 4th cluster in the 2nd stage as succeeded after marking the binding available", func() { @@ -500,8 +511,9 @@ var _ = Describe("UpdateRun execution tests - double stages", func() { wantStatus.StagesStatus[1].Clusters[4].Conditions = append(wantStatus.StagesStatus[1].Clusters[4].Conditions, generateTrueCondition(updateRun, placementv1beta1.ClusterUpdatingConditionStarted)) validateClusterStagedUpdateRunStatus(ctx, updateRun, wantStatus, "") - By("Checking update run status metrics are emitted") + By("Checking update run metrics are emitted") validateUpdateRunMetricsEmitted(generateWaitingMetric(placementv1beta1.StateRun, updateRun), generateProgressingMetric(placementv1beta1.StateRun, updateRun)) + validateUpdateRunApprovalStageTaskMetric(generateApprovalStageTaskMetric(updateRun, placementv1beta1.AfterStageTaskLabelValue, 1), generateApprovalStageTaskMetric(updateRun, placementv1beta1.BeforeStageTaskLabelValue, 2)) }) It("Should mark the 5th cluster in the 2nd stage as succeeded after marking the binding available", func() { @@ -521,8 +533,9 @@ var _ = Describe("UpdateRun execution tests - double stages", func() { meta.SetStatusCondition(&wantStatus.Conditions, generateFalseCondition(updateRun, 
placementv1beta1.StagedUpdateRunConditionProgressing)) validateClusterStagedUpdateRunStatus(ctx, updateRun, wantStatus, "") - By("Checking update run status metrics are emitted") + By("Checking update run metrics are emitted") validateUpdateRunMetricsEmitted(generateProgressingMetric(placementv1beta1.StateRun, updateRun), generateWaitingMetric(placementv1beta1.StateRun, updateRun)) + validateUpdateRunApprovalStageTaskMetric(generateApprovalStageTaskMetric(updateRun, placementv1beta1.AfterStageTaskLabelValue, 1), generateApprovalStageTaskMetric(updateRun, placementv1beta1.BeforeStageTaskLabelValue, 2)) }) It("Should complete the 2nd stage after both after stage tasks are completed and move on to the delete stage", func() { @@ -584,8 +597,9 @@ var _ = Describe("UpdateRun execution tests - double stages", func() { return condition.IsConditionStatusTrue(meta.FindStatusCondition(approvalRequest.Status.Conditions, string(placementv1beta1.ApprovalRequestConditionApprovalAccepted)), approvalRequest.Generation), nil }, timeout, interval).Should(BeTrue(), "failed to validate the approvalRequest approval accepted") - By("Checking update run status metrics are emitted") + By("Checking update run metrics are emitted") validateUpdateRunMetricsEmitted(generateWaitingMetric(placementv1beta1.StateRun, updateRun), generateProgressingMetric(placementv1beta1.StateRun, updateRun)) + validateUpdateRunApprovalStageTaskMetric(generateApprovalStageTaskMetric(updateRun, placementv1beta1.AfterStageTaskLabelValue, 2), generateApprovalStageTaskMetric(updateRun, placementv1beta1.BeforeStageTaskLabelValue, 2)) }) It("Should delete all the clusterResourceBindings in the delete stage and complete the update run", func() { @@ -616,8 +630,10 @@ var _ = Describe("UpdateRun execution tests - double stages", func() { wantStatus.Conditions = append(wantStatus.Conditions, generateTrueCondition(updateRun, placementv1beta1.StagedUpdateRunConditionSucceeded)) validateClusterStagedUpdateRunStatus(ctx, 
updateRun, wantStatus, "") - By("Checking update run status metrics are emitted") + By("Checking update run metrics are emitted") validateUpdateRunMetricsEmitted(generateWaitingMetric(placementv1beta1.StateRun, updateRun), generateProgressingMetric(placementv1beta1.StateRun, updateRun), generateSucceededMetric(placementv1beta1.StateRun, updateRun)) + validateUpdateRunApprovalStageTaskMetric(generateApprovalStageTaskMetric(updateRun, placementv1beta1.AfterStageTaskLabelValue, 2), generateApprovalStageTaskMetric(updateRun, placementv1beta1.BeforeStageTaskLabelValue, 2)) + validateUpdateRunStageMetricsEmitted(generateStageClusterUpdatingMetric(updateRun)) }) }) @@ -803,7 +819,7 @@ var _ = Describe("UpdateRun execution tests - single stage", func() { resourceSnapshot = nil }) - Context("Cluster staged update run should update clusters one by one - no after stage task", Ordered, func() { + Context("Cluster staged update run should update clusters one by one - no before or after stage task", Ordered, func() { BeforeAll(func() { By("Creating a new clusterStagedUpdateRun") Expect(k8sClient.Create(ctx, updateRun)).To(Succeed()) @@ -881,8 +897,155 @@ var _ = Describe("UpdateRun execution tests - single stage", func() { By("Validating the 1st stage has endTime set") Expect(updateRun.Status.StagesStatus[0].EndTime).ShouldNot(BeNil()) - By("Checking update run status metrics are emitted") + By("Checking update run metrics are emitted") validateUpdateRunMetricsEmitted(generateProgressingMetric(placementv1beta1.StateRun, updateRun), generateSucceededMetric(placementv1beta1.StateRun, updateRun)) + validateUpdateRunStageMetricsEmitted(generateStageClusterUpdatingMetric(updateRun)) + }) + }) + + Context("Cluster staged update run should update clusters one by one - single approval before-stage task", Ordered, func() { + var wantApprovalRequest *placementv1beta1.ClusterApprovalRequest + + BeforeAll(func() { + By("Creating a strategy with single stage and approval after stage task") + 
updateStrategy.Spec.Stages[0].BeforeStageTasks = []placementv1beta1.StageTask{ + { + Type: placementv1beta1.StageTaskTypeApproval, + }, + } + Expect(k8sClient.Update(ctx, updateStrategy)).To(Succeed()) + + By("Creating a new clusterStagedUpdateRun") + Expect(k8sClient.Create(ctx, updateRun)).To(Succeed()) + + By("Validating the initialization succeeded and the execution started") + initialized := generateSucceededInitializationStatusForSmallClusters(crp, updateRun, testResourceSnapshotIndex, policySnapshot, updateStrategy, 0) + wantStatus = generateExecutionNotStartedStatus(updateRun, initialized) + validateClusterStagedUpdateRunStatus(ctx, updateRun, wantStatus, "") + + By("Validating the beforeStage approvalRequest has been created") + wantApprovalRequest = &placementv1beta1.ClusterApprovalRequest{ + ObjectMeta: metav1.ObjectMeta{ + Name: updateRun.Status.StagesStatus[0].BeforeStageTaskStatus[0].ApprovalRequestName, + Labels: map[string]string{ + placementv1beta1.TargetUpdatingStageNameLabel: updateRun.Status.StagesStatus[0].StageName, + placementv1beta1.TargetUpdateRunLabel: updateRun.Name, + placementv1beta1.TaskTypeLabel: placementv1beta1.BeforeStageTaskLabelValue, + placementv1beta1.IsLatestUpdateRunApprovalLabel: "true", + }, + }, + Spec: placementv1beta1.ApprovalRequestSpec{ + TargetUpdateRun: updateRun.Name, + TargetStage: updateRun.Status.StagesStatus[0].StageName, + }, + } + validateApprovalRequestCreated(wantApprovalRequest) + + By("Checking update run status metrics are emitted") + validateUpdateRunMetricsEmitted(generateWaitingMetric(placementv1beta1.StateRun, updateRun)) + }) + + It("Should not start rolling out 1st stage", func() { + By("Validating the 1st clusterResourceBinding is not updated to Bound") + binding := resourceBindings[0] + validateNotBoundBindingState(ctx, binding) + + By("Validating the 1st stage does not have startTime set") + Expect(updateRun.Status.StagesStatus[0].StartTime).Should(BeNil()) + + By("Checking update run status 
metrics are emitted") + validateUpdateRunMetricsEmitted(generateWaitingMetric(placementv1beta1.StateRun, updateRun)) + }) + + It("Should accept the approval request and start to rollout 1st stage", func() { + By("Approving the approvalRequest") + approveClusterApprovalRequest(ctx, wantApprovalRequest.Name) + + By("Validating the approvalRequest has ApprovalAccepted status") + Eventually(func() (bool, error) { + var approvalRequest placementv1beta1.ClusterApprovalRequest + if err := k8sClient.Get(ctx, types.NamespacedName{Name: wantApprovalRequest.Name}, &approvalRequest); err != nil { + return false, err + } + return condition.IsConditionStatusTrue(meta.FindStatusCondition(approvalRequest.Status.Conditions, string(placementv1beta1.ApprovalRequestConditionApprovalAccepted)), approvalRequest.Generation), nil + }, timeout, interval).Should(BeTrue(), "failed to validate the approvalRequest approval accepted") + // Approval task has been approved. + wantStatus.StagesStatus[0].BeforeStageTaskStatus[0].Conditions = append(wantStatus.StagesStatus[0].BeforeStageTaskStatus[0].Conditions, + generateTrueCondition(updateRun, placementv1beta1.StageTaskConditionApprovalRequestApproved)) + wantStatus = generateExecutionStartedStatus(updateRun, wantStatus) + }) + + It("Should mark the 1st cluster in the 1st stage as succeeded after marking the binding available", func() { + By("Validating the 1st clusterResourceBinding is updated to Bound") + binding := resourceBindings[0] // cluster-0 + validateBindingState(ctx, binding, resourceSnapshot.Name, updateRun, 0) + + By("Updating the 1st clusterResourceBinding to Available") + meta.SetStatusCondition(&binding.Status.Conditions, generateTrueCondition(binding, placementv1beta1.ResourceBindingAvailable)) + Expect(k8sClient.Status().Update(ctx, binding)).Should(Succeed(), "failed to update the binding status") + + By("Validating the 1st cluster has succeeded and 2nd cluster has started") + wantStatus.StagesStatus[0].Clusters[0].Conditions = 
append(wantStatus.StagesStatus[0].Clusters[0].Conditions, generateTrueCondition(updateRun, placementv1beta1.ClusterUpdatingConditionSucceeded)) + wantStatus.StagesStatus[0].Clusters[1].Conditions = append(wantStatus.StagesStatus[0].Clusters[1].Conditions, generateTrueCondition(updateRun, placementv1beta1.ClusterUpdatingConditionStarted)) + validateClusterStagedUpdateRunStatus(ctx, updateRun, wantStatus, "") + + By("Validating the 1st stage has startTime set") + Expect(updateRun.Status.StagesStatus[0].StartTime).ShouldNot(BeNil()) + + By("Checking update run status metrics are emitted") + validateUpdateRunMetricsEmitted(generateWaitingMetric(placementv1beta1.StateRun, updateRun), generateProgressingMetric(placementv1beta1.StateRun, updateRun)) + }) + + It("Should mark the 2nd cluster in the 1st stage as succeeded after marking the binding available", func() { + By("Validating the 2nd clusterResourceBinding is updated to Bound") + binding := resourceBindings[1] // cluster-1 + validateBindingState(ctx, binding, resourceSnapshot.Name, updateRun, 0) + + By("Updating the 2nd clusterResourceBinding to Available") + meta.SetStatusCondition(&binding.Status.Conditions, generateTrueCondition(binding, placementv1beta1.ResourceBindingAvailable)) + Expect(k8sClient.Status().Update(ctx, binding)).Should(Succeed(), "failed to update the binding status") + + By("Validating the 2nd cluster has succeeded and 3rd cluster has started") + wantStatus.StagesStatus[0].Clusters[1].Conditions = append(wantStatus.StagesStatus[0].Clusters[1].Conditions, generateTrueCondition(updateRun, placementv1beta1.ClusterUpdatingConditionSucceeded)) + wantStatus.StagesStatus[0].Clusters[2].Conditions = append(wantStatus.StagesStatus[0].Clusters[2].Conditions, generateTrueCondition(updateRun, placementv1beta1.ClusterUpdatingConditionStarted)) + validateClusterStagedUpdateRunStatus(ctx, updateRun, wantStatus, "") + + By("Checking update run status metrics are emitted") + 
validateUpdateRunMetricsEmitted(generateWaitingMetric(placementv1beta1.StateRun, updateRun), generateProgressingMetric(placementv1beta1.StateRun, updateRun)) + }) + + It("Should complete the 1st stage and complete the updateRun after the 3rd cluster succeeds", func() { + By("Validating the 3rd clusterResourceBinding is updated to Bound") + binding := resourceBindings[2] // cluster-2 + validateBindingState(ctx, binding, resourceSnapshot.Name, updateRun, 0) + + By("Updating the 3rd clusterResourceBinding to Available") + meta.SetStatusCondition(&binding.Status.Conditions, generateTrueCondition(binding, placementv1beta1.ResourceBindingAvailable)) + Expect(k8sClient.Status().Update(ctx, binding)).Should(Succeed(), "failed to update the binding status") + + By("Validating the 3rd cluster has succeeded") + wantStatus.StagesStatus[0].Clusters[2].Conditions = append(wantStatus.StagesStatus[0].Clusters[2].Conditions, generateTrueCondition(updateRun, placementv1beta1.ClusterUpdatingConditionSucceeded)) + meta.SetStatusCondition(&wantStatus.StagesStatus[0].Conditions, generateFalseProgressingCondition(updateRun, placementv1beta1.StageUpdatingConditionProgressing, condition.StageUpdatingSucceededReason)) + + By("Validating the 1st stage has completed and the updateRun has completed") + // 1st stage completed. + wantStatus.StagesStatus[0].Conditions[0] = generateFalseProgressingCondition(updateRun, placementv1beta1.StageUpdatingConditionProgressing, condition.StageUpdatingSucceededReason) + wantStatus.StagesStatus[0].Conditions = append(wantStatus.StagesStatus[0].Conditions, generateTrueCondition(updateRun, placementv1beta1.StageUpdatingConditionSucceeded)) + // Mark the deletion stage progressing condition as false with succeeded reason and add succeeded condition. 
+ wantStatus.DeletionStageStatus.Conditions = append(wantStatus.DeletionStageStatus.Conditions, generateFalseProgressingCondition(updateRun, placementv1beta1.StageUpdatingConditionProgressing, condition.StageUpdatingSucceededReason)) + wantStatus.DeletionStageStatus.Conditions = append(wantStatus.DeletionStageStatus.Conditions, generateTrueCondition(updateRun, placementv1beta1.StageUpdatingConditionSucceeded)) + // Mark updateRun progressing condition as false with succeeded reason and add succeeded condition. + meta.SetStatusCondition(&wantStatus.Conditions, generateFalseProgressingCondition(updateRun, placementv1beta1.StagedUpdateRunConditionProgressing, condition.UpdateRunSucceededReason)) + wantStatus.Conditions = append(wantStatus.Conditions, generateTrueCondition(updateRun, placementv1beta1.StagedUpdateRunConditionSucceeded)) + validateClusterStagedUpdateRunStatus(ctx, updateRun, wantStatus, "") + + By("Validating the 1st stage has endTime set") + Expect(updateRun.Status.StagesStatus[0].EndTime).ShouldNot(BeNil()) + + By("Checking update run metrics are emitted") + validateUpdateRunMetricsEmitted(generateWaitingMetric(placementv1beta1.StateRun, updateRun), generateProgressingMetric(placementv1beta1.StateRun, updateRun), generateSucceededMetric(placementv1beta1.StateRun, updateRun)) + validateUpdateRunStageMetricsEmitted(generateStageClusterUpdatingMetric(updateRun)) + validateUpdateRunApprovalStageTaskMetric(generateApprovalStageTaskMetric(updateRun, placementv1beta1.BeforeStageTaskLabelValue, 1)) }) }) @@ -993,8 +1156,9 @@ var _ = Describe("UpdateRun execution tests - single stage", func() { Expect(waitStartTime.Add(updateStrategy.Spec.Stages[0].AfterStageTasks[0].WaitTime.Duration).After(waitEndTime)).Should(BeFalse(), fmt.Sprintf("waitEndTime %v did not pass waitStartTime %v long enough, want at least %v", waitEndTime, waitStartTime, updateStrategy.Spec.Stages[0].AfterStageTasks[0].WaitTime.Duration)) - By("Checking update run status metrics are emitted") 
+ By("Checking update run metrics are emitted") validateUpdateRunMetricsEmitted(generateWaitingMetric(placementv1beta1.StateRun, updateRun), generateProgressingMetric(placementv1beta1.StateRun, updateRun), generateSucceededMetric(placementv1beta1.StateRun, updateRun)) + validateUpdateRunStageMetricsEmitted(generateStageClusterUpdatingMetric(updateRun)) }) }) @@ -1129,8 +1293,10 @@ var _ = Describe("UpdateRun execution tests - single stage", func() { return condition.IsConditionStatusTrue(meta.FindStatusCondition(approvalRequest.Status.Conditions, string(placementv1beta1.ApprovalRequestConditionApprovalAccepted)), approvalRequest.Generation), nil }, timeout, interval).Should(BeTrue(), "failed to validate the approvalRequest approval accepted") - By("Checking update run status metrics are emitted") + By("Checking update run metrics are emitted") validateUpdateRunMetricsEmitted(generateWaitingMetric(placementv1beta1.StateRun, updateRun), generateProgressingMetric(placementv1beta1.StateRun, updateRun), generateSucceededMetric(placementv1beta1.StateRun, updateRun)) + validateUpdateRunStageMetricsEmitted(generateStageClusterUpdatingMetric(updateRun)) + validateUpdateRunApprovalStageTaskMetric(generateApprovalStageTaskMetric(updateRun, placementv1beta1.AfterStageTaskLabelValue, 1)) }) }) @@ -1216,8 +1382,9 @@ var _ = Describe("UpdateRun execution tests - single stage", func() { By("Validating the 1st stage has endTime set") Expect(updateRun.Status.StagesStatus[0].EndTime).ShouldNot(BeNil()) - By("Checking update run status metrics are emitted") + By("Checking update run metrics are emitted") validateUpdateRunMetricsEmitted(generateProgressingMetric(placementv1beta1.StateRun, updateRun), generateSucceededMetric(placementv1beta1.StateRun, updateRun)) + validateUpdateRunStageMetricsEmitted(generateStageClusterUpdatingMetric(updateRun)) }) }) @@ -1633,8 +1800,9 @@ var _ = Describe("UpdateRun execution tests - single stage", func() { By("Validating the 1st stage has endTime 
set") Expect(updateRun.Status.StagesStatus[0].EndTime).ShouldNot(BeNil()) - By("Checking update run status metrics are emitted") + By("Checking update run metrics are emitted") validateUpdateRunMetricsEmitted(generateInitializationSucceededMetric(placementv1beta1.StateInitialize, updateRun), generateProgressingMetric(placementv1beta1.StateRun, updateRun), generateSucceededMetric(placementv1beta1.StateRun, updateRun)) + validateUpdateRunStageMetricsEmitted(generateStageClusterUpdatingMetric(updateRun)) }) }) }) diff --git a/pkg/controllers/updaterun/metrics.go b/pkg/controllers/updaterun/metrics.go new file mode 100644 index 000000000..5fb6a9694 --- /dev/null +++ b/pkg/controllers/updaterun/metrics.go @@ -0,0 +1,102 @@ +/* +Copyright 2025 The KubeFleet Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package updaterun + +import ( + "time" + + "github.com/prometheus/client_golang/prometheus" + "k8s.io/apimachinery/pkg/api/meta" + "k8s.io/klog/v2" + + placementv1beta1 "github.com/kubefleet-dev/kubefleet/apis/placement/v1beta1" + hubmetrics "github.com/kubefleet-dev/kubefleet/pkg/metrics/hub" + "github.com/kubefleet-dev/kubefleet/pkg/utils/condition" +) + +// deleteUpdateRunMetrics deletes the metrics related to the update run when the update run is deleted. 
+func deleteUpdateRunMetrics(updateRun placementv1beta1.UpdateRunObj) { + hubmetrics.FleetUpdateRunStatusLastTimestampSeconds.DeletePartialMatch(prometheus.Labels{"namespace": updateRun.GetNamespace(), "name": updateRun.GetName()}) + hubmetrics.FleetUpdateRunStageClusterUpdatingDurationSeconds.DeletePartialMatch(prometheus.Labels{"namespace": updateRun.GetNamespace(), "name": updateRun.GetName()}) + hubmetrics.FleetUpdateRunApprovalRequestLatencySeconds.DeletePartialMatch(prometheus.Labels{"namespace": updateRun.GetNamespace(), "name": updateRun.GetName()}) +} + +// emitUpdateRunStatusMetric emits the update run status metric based on status conditions in the updateRun. +func emitUpdateRunStatusMetric(updateRun placementv1beta1.UpdateRunObj) { + generation := updateRun.GetGeneration() + state := updateRun.GetUpdateRunSpec().State + + updateRunStatus := updateRun.GetUpdateRunStatus() + succeedCond := meta.FindStatusCondition(updateRunStatus.Conditions, string(placementv1beta1.StagedUpdateRunConditionSucceeded)) + if succeedCond != nil && succeedCond.ObservedGeneration == generation { + hubmetrics.FleetUpdateRunStatusLastTimestampSeconds.WithLabelValues(updateRun.GetNamespace(), updateRun.GetName(), string(state), + string(placementv1beta1.StagedUpdateRunConditionSucceeded), string(succeedCond.Status), succeedCond.Reason).SetToCurrentTime() + return + } + + progressingCond := meta.FindStatusCondition(updateRunStatus.Conditions, string(placementv1beta1.StagedUpdateRunConditionProgressing)) + if progressingCond != nil && progressingCond.ObservedGeneration == generation { + hubmetrics.FleetUpdateRunStatusLastTimestampSeconds.WithLabelValues(updateRun.GetNamespace(), updateRun.GetName(), string(state), + string(placementv1beta1.StagedUpdateRunConditionProgressing), string(progressingCond.Status), progressingCond.Reason).SetToCurrentTime() + return + } + + initializedCond := meta.FindStatusCondition(updateRunStatus.Conditions, 
string(placementv1beta1.StagedUpdateRunConditionInitialized)) + if initializedCond != nil && initializedCond.ObservedGeneration == generation { + hubmetrics.FleetUpdateRunStatusLastTimestampSeconds.WithLabelValues(updateRun.GetNamespace(), updateRun.GetName(), string(state), + string(placementv1beta1.StagedUpdateRunConditionInitialized), string(initializedCond.Status), initializedCond.Reason).SetToCurrentTime() + return + } + + // We should rarely reach here, it can only happen when updating updateRun status fails. + klog.V(2).InfoS("There's no valid status condition on updateRun, status updating failed possibly", "updateRun", klog.KObj(updateRun)) +} + +// recordApprovalRequestLatency records the time from approval request creation to user approval. +func recordApprovalRequestLatency( + stageTaskStatus *placementv1beta1.StageTaskStatus, + updateRun placementv1beta1.UpdateRunObj, + taskType string, +) { + approvalCreatedCond := meta.FindStatusCondition(stageTaskStatus.Conditions, string(placementv1beta1.StageTaskConditionApprovalRequestCreated)) + approvalApprovedCond := meta.FindStatusCondition(stageTaskStatus.Conditions, string(placementv1beta1.StageTaskConditionApprovalRequestApproved)) + + // Only record latency when both approval request created and approved conditions are true, + // and their observed generation is the same as the update run generation to ensure the recorded latency is accurate. 
+ if !condition.IsConditionStatusTrue(approvalCreatedCond, updateRun.GetGeneration()) || !condition.IsConditionStatusTrue(approvalApprovedCond, updateRun.GetGeneration()) { + return + } + + latencySeconds := approvalApprovedCond.LastTransitionTime.Sub(approvalCreatedCond.LastTransitionTime.Time).Seconds() + hubmetrics.FleetUpdateRunApprovalRequestLatencySeconds.WithLabelValues( + updateRun.GetNamespace(), + updateRun.GetName(), + taskType, + ).Observe(latencySeconds) +} + +// recordStageClusterUpdatingDuration records the time from stage start to when all clusters finish updating. +func recordStageClusterUpdatingDuration(stageStatus *placementv1beta1.StageUpdatingStatus, updateRun placementv1beta1.UpdateRunObj) { + if stageStatus.StartTime == nil { + return + } + durationSeconds := time.Since(stageStatus.StartTime.Time).Seconds() + hubmetrics.FleetUpdateRunStageClusterUpdatingDurationSeconds.WithLabelValues( + updateRun.GetNamespace(), + updateRun.GetName(), + ).Observe(durationSeconds) +} diff --git a/pkg/controllers/updaterun/stop_integration_test.go b/pkg/controllers/updaterun/stop_integration_test.go index c6e926ed5..0a7ea8af2 100644 --- a/pkg/controllers/updaterun/stop_integration_test.go +++ b/pkg/controllers/updaterun/stop_integration_test.go @@ -243,8 +243,9 @@ var _ = Describe("UpdateRun stop tests", func() { generateTrueCondition(updateRun, placementv1beta1.StageTaskConditionApprovalRequestApproved)) validateClusterStagedUpdateRunStatus(ctx, updateRun, wantStatus, "") - By("Checking update run status metrics are emitted") + By("Checking update run metrics are emitted") validateUpdateRunMetricsEmitted(generateWaitingMetric(placementv1beta1.StateRun, updateRun), generateStoppedMetric(placementv1beta1.StateStop, updateRun), generateProgressingMetric(placementv1beta1.StateRun, updateRun)) + validateUpdateRunApprovalStageTaskMetric(generateApprovalStageTaskMetric(updateRun, placementv1beta1.BeforeStageTaskLabelValue, 1)) }) It("Should mark the 1st cluster in 
the 1st stage as succeeded after marking the binding available", func() { @@ -264,8 +265,9 @@ var _ = Describe("UpdateRun stop tests", func() { By("Validating the 1st stage has startTime set") Expect(updateRun.Status.StagesStatus[0].StartTime).ShouldNot(BeNil()) - By("Checking update run status metrics are emitted") + By("Checking update run metrics are emitted") validateUpdateRunMetricsEmitted(generateWaitingMetric(placementv1beta1.StateRun, updateRun), generateStoppedMetric(placementv1beta1.StateStop, updateRun), generateProgressingMetric(placementv1beta1.StateRun, updateRun)) + validateUpdateRunApprovalStageTaskMetric(generateApprovalStageTaskMetric(updateRun, placementv1beta1.BeforeStageTaskLabelValue, 1)) }) It("Should be stopping in the middle of cluster updating when update run state is Stop", func() { @@ -282,16 +284,18 @@ var _ = Describe("UpdateRun stop tests", func() { meta.SetStatusCondition(&wantStatus.Conditions, generateProgressingUnknownConditionWithReason(updateRun, condition.UpdateRunStoppingReason)) validateClusterStagedUpdateRunStatus(ctx, updateRun, wantStatus, "") - By("Checking update run status metrics are emitted") + By("Checking update run metrics are emitted") validateUpdateRunMetricsEmitted(generateWaitingMetric(placementv1beta1.StateRun, updateRun), generateStoppedMetric(placementv1beta1.StateStop, updateRun), generateProgressingMetric(placementv1beta1.StateRun, updateRun), generateStoppingMetric(placementv1beta1.StateStop, updateRun)) + validateUpdateRunApprovalStageTaskMetric(generateApprovalStageTaskMetric(updateRun, placementv1beta1.BeforeStageTaskLabelValue, 1)) }) It("Should wait for cluster to finish updating so update run should still be stopping", func() { By("Validating the 2nd cluster has NOT succeeded and the update run is still stopping") validateClusterStagedUpdateRunStatusConsistently(ctx, updateRun, wantStatus, "") - By("Checking update run status metrics are emitted") + By("Checking update run metrics are emitted") 
validateUpdateRunMetricsEmitted(generateWaitingMetric(placementv1beta1.StateRun, updateRun), generateStoppedMetric(placementv1beta1.StateStop, updateRun), generateProgressingMetric(placementv1beta1.StateRun, updateRun), generateStoppingMetric(placementv1beta1.StateStop, updateRun)) + validateUpdateRunApprovalStageTaskMetric(generateApprovalStageTaskMetric(updateRun, placementv1beta1.BeforeStageTaskLabelValue, 1)) }) It("Should have completely stopped after the in-progress cluster has finished updating", func() { @@ -312,8 +316,9 @@ var _ = Describe("UpdateRun stop tests", func() { meta.SetStatusCondition(&wantStatus.Conditions, generateFalseProgressingCondition(updateRun, placementv1beta1.StagedUpdateRunConditionProgressing, condition.UpdateRunStoppedReason)) validateClusterStagedUpdateRunStatus(ctx, updateRun, wantStatus, "") - By("Checking update run status metrics are emitted") + By("Checking update run metrics are emitted") validateUpdateRunMetricsEmitted(generateWaitingMetric(placementv1beta1.StateRun, updateRun), generateProgressingMetric(placementv1beta1.StateRun, updateRun), generateStoppingMetric(placementv1beta1.StateStop, updateRun), generateStoppedMetric(placementv1beta1.StateStop, updateRun)) + validateUpdateRunApprovalStageTaskMetric(generateApprovalStageTaskMetric(updateRun, placementv1beta1.BeforeStageTaskLabelValue, 1)) By("Validating update run is in stopped state") validateClusterStagedUpdateRunStatusConsistently(ctx, updateRun, wantStatus, "") @@ -338,8 +343,9 @@ var _ = Describe("UpdateRun stop tests", func() { meta.SetStatusCondition(&wantStatus.Conditions, generateTrueCondition(updateRun, placementv1beta1.StagedUpdateRunConditionProgressing)) validateClusterStagedUpdateRunStatus(ctx, updateRun, wantStatus, "") - By("Checking update run status metrics are emitted") + By("Checking update run metrics are emitted") validateUpdateRunMetricsEmitted(generateWaitingMetric(placementv1beta1.StateRun, updateRun), 
generateStoppingMetric(placementv1beta1.StateStop, updateRun), generateStoppedMetric(placementv1beta1.StateStop, updateRun), generateProgressingMetric(placementv1beta1.StateRun, updateRun)) + validateUpdateRunApprovalStageTaskMetric(generateApprovalStageTaskMetric(updateRun, placementv1beta1.BeforeStageTaskLabelValue, 1)) }) It("Should mark the 3rd cluster in the 1st stage as succeeded after marking the binding available", func() { @@ -361,8 +367,10 @@ var _ = Describe("UpdateRun stop tests", func() { meta.SetStatusCondition(&wantStatus.Conditions, generateFalseCondition(updateRun, placementv1beta1.StagedUpdateRunConditionProgressing)) validateClusterStagedUpdateRunStatus(ctx, updateRun, wantStatus, "") - By("Checking update run status metrics are emitted") + By("Checking update run metrics are emitted") validateUpdateRunMetricsEmitted(generateStoppingMetric(placementv1beta1.StateStop, updateRun), generateStoppedMetric(placementv1beta1.StateStop, updateRun), generateProgressingMetric(placementv1beta1.StateRun, updateRun), generateWaitingMetric(placementv1beta1.StateRun, updateRun)) + validateUpdateRunApprovalStageTaskMetric(generateApprovalStageTaskMetric(updateRun, placementv1beta1.BeforeStageTaskLabelValue, 1)) + validateUpdateRunStageMetricsEmitted(generateStageClusterUpdatingMetric(updateRun)) }) It("Should have approval request created for 1st stage AfterStageTask", func() { @@ -384,8 +392,10 @@ var _ = Describe("UpdateRun stop tests", func() { } validateApprovalRequestCreated(wantApprovalRequest) - By("Checking update run status metrics are emitted") + By("Checking update run metrics are emitted") validateUpdateRunMetricsEmitted(generateStoppingMetric(placementv1beta1.StateStop, updateRun), generateStoppedMetric(placementv1beta1.StateStop, updateRun), generateProgressingMetric(placementv1beta1.StateRun, updateRun), generateWaitingMetric(placementv1beta1.StateRun, updateRun)) + validateUpdateRunApprovalStageTaskMetric(generateApprovalStageTaskMetric(updateRun, 
placementv1beta1.BeforeStageTaskLabelValue, 1)) + validateUpdateRunStageMetricsEmitted(generateStageClusterUpdatingMetric(updateRun)) }) It("Should stop the update run in AfterStageTask for 1st stage when state is Stop", func() { @@ -401,8 +411,10 @@ var _ = Describe("UpdateRun stop tests", func() { meta.SetStatusCondition(&wantStatus.Conditions, generateFalseConditionWithReason(updateRun, placementv1beta1.StagedUpdateRunConditionProgressing, condition.UpdateRunStoppedReason)) validateClusterStagedUpdateRunStatus(ctx, updateRun, wantStatus, "") - By("Checking update run status metrics are emitted") + By("Checking update run metrics are emitted") validateUpdateRunMetricsEmitted(generateStoppingMetric(placementv1beta1.StateStop, updateRun), generateProgressingMetric(placementv1beta1.StateRun, updateRun), generateWaitingMetric(placementv1beta1.StateRun, updateRun), generateStoppedMetric(placementv1beta1.StateStop, updateRun)) + validateUpdateRunApprovalStageTaskMetric(generateApprovalStageTaskMetric(updateRun, placementv1beta1.BeforeStageTaskLabelValue, 1)) + validateUpdateRunStageMetricsEmitted(generateStageClusterUpdatingMetric(updateRun)) }) It("Should not continue to delete stage after approval when still stopped", func() { @@ -427,8 +439,10 @@ var _ = Describe("UpdateRun stop tests", func() { By("Validating update run is stopped") validateClusterStagedUpdateRunStatusConsistently(ctx, updateRun, wantStatus, "") - By("Checking update run status metrics are emitted") + By("Checking update run metrics are emitted") validateUpdateRunMetricsEmitted(generateStoppingMetric(placementv1beta1.StateStop, updateRun), generateProgressingMetric(placementv1beta1.StateRun, updateRun), generateWaitingMetric(placementv1beta1.StateRun, updateRun), generateStoppedMetric(placementv1beta1.StateStop, updateRun)) + validateUpdateRunApprovalStageTaskMetric(generateApprovalStageTaskMetric(updateRun, placementv1beta1.BeforeStageTaskLabelValue, 1)) + 
validateUpdateRunStageMetricsEmitted(generateStageClusterUpdatingMetric(updateRun)) }) It("Should complete the 1st stage once it starts running again when wait time passed and approval request approved then move on to the Delete stage", func() { @@ -476,8 +490,10 @@ var _ = Describe("UpdateRun stop tests", func() { approvalCreateTime := meta.FindStatusCondition(updateRun.Status.StagesStatus[0].AfterStageTaskStatus[0].Conditions, string(placementv1beta1.StageTaskConditionApprovalRequestCreated)).LastTransitionTime.Time Expect(approvalCreateTime.Before(waitEndTime)).Should(BeTrue()) - By("Checking update run status metrics are emitted") + By("Checking update run metrics are emitted") validateUpdateRunMetricsEmitted(generateStoppingMetric(placementv1beta1.StateStop, updateRun), generateWaitingMetric(placementv1beta1.StateRun, updateRun), generateStoppedMetric(placementv1beta1.StateStop, updateRun), generateProgressingMetric(placementv1beta1.StateRun, updateRun)) + validateUpdateRunApprovalStageTaskMetric(generateApprovalStageTaskMetric(updateRun, placementv1beta1.AfterStageTaskLabelValue, 1), generateApprovalStageTaskMetric(updateRun, placementv1beta1.BeforeStageTaskLabelValue, 1)) + validateUpdateRunStageMetricsEmitted(generateStageClusterUpdatingMetric(updateRun)) }) It("Should stop the update run in deletion stage when state is Stop", func() { @@ -493,8 +509,10 @@ var _ = Describe("UpdateRun stop tests", func() { meta.SetStatusCondition(&wantStatus.Conditions, generateProgressingUnknownConditionWithReason(updateRun, condition.UpdateRunStoppingReason)) validateClusterStagedUpdateRunStatus(ctx, updateRun, wantStatus, "") - By("Checking update run status metrics are emitted") + By("Checking update run metrics are emitted") validateUpdateRunMetricsEmitted(generateWaitingMetric(placementv1beta1.StateRun, updateRun), generateStoppedMetric(placementv1beta1.StateStop, updateRun), generateProgressingMetric(placementv1beta1.StateRun, updateRun), 
generateStoppingMetric(placementv1beta1.StateStop, updateRun)) + validateUpdateRunApprovalStageTaskMetric(generateApprovalStageTaskMetric(updateRun, placementv1beta1.AfterStageTaskLabelValue, 1), generateApprovalStageTaskMetric(updateRun, placementv1beta1.BeforeStageTaskLabelValue, 1)) + validateUpdateRunStageMetricsEmitted(generateStageClusterUpdatingMetric(updateRun)) }) It("Should not complete deletion stage when in progress clusters still deleting while stopped", func() { @@ -513,8 +531,10 @@ var _ = Describe("UpdateRun stop tests", func() { By("Validating update run is stopping") validateClusterStagedUpdateRunStatusConsistently(ctx, updateRun, wantStatus, "") - By("Checking update run status metrics are emitted") + By("Checking update run metrics are emitted") validateUpdateRunMetricsEmitted(generateWaitingMetric(placementv1beta1.StateRun, updateRun), generateStoppedMetric(placementv1beta1.StateStop, updateRun), generateProgressingMetric(placementv1beta1.StateRun, updateRun), generateStoppingMetric(placementv1beta1.StateStop, updateRun)) + validateUpdateRunApprovalStageTaskMetric(generateApprovalStageTaskMetric(updateRun, placementv1beta1.AfterStageTaskLabelValue, 1), generateApprovalStageTaskMetric(updateRun, placementv1beta1.BeforeStageTaskLabelValue, 1)) + validateUpdateRunStageMetricsEmitted(generateStageClusterUpdatingMetric(updateRun)) }) It("Should stop completely after in-progress deletion is done when state is Stop", func() { @@ -556,8 +576,10 @@ var _ = Describe("UpdateRun stop tests", func() { meta.SetStatusCondition(&wantStatus.Conditions, generateFalseConditionWithReason(updateRun, placementv1beta1.StagedUpdateRunConditionProgressing, condition.UpdateRunStoppedReason)) validateClusterStagedUpdateRunStatus(ctx, updateRun, wantStatus, "") - By("Checking update run status metrics are emitted") + By("Checking update run metrics are emitted") validateUpdateRunMetricsEmitted(generateWaitingMetric(placementv1beta1.StateRun, updateRun), 
generateProgressingMetric(placementv1beta1.StateRun, updateRun), generateStoppingMetric(placementv1beta1.StateStop, updateRun), generateStoppedMetric(placementv1beta1.StateStop, updateRun)) + validateUpdateRunApprovalStageTaskMetric(generateApprovalStageTaskMetric(updateRun, placementv1beta1.AfterStageTaskLabelValue, 1), generateApprovalStageTaskMetric(updateRun, placementv1beta1.BeforeStageTaskLabelValue, 1)) + validateUpdateRunStageMetricsEmitted(generateStageClusterUpdatingMetric(updateRun)) }) It("Should complete delete stage and complete the update run when state is Run", func() { @@ -597,8 +619,10 @@ var _ = Describe("UpdateRun stop tests", func() { wantStatus.Conditions = append(wantStatus.Conditions, generateTrueCondition(updateRun, placementv1beta1.StagedUpdateRunConditionSucceeded)) validateClusterStagedUpdateRunStatus(ctx, updateRun, wantStatus, "") - By("Checking update run status metrics are emitted") + By("Checking update run metrics are emitted") validateUpdateRunMetricsEmitted(generateWaitingMetric(placementv1beta1.StateRun, updateRun), generateProgressingMetric(placementv1beta1.StateRun, updateRun), generateStoppingMetric(placementv1beta1.StateStop, updateRun), generateStoppedMetric(placementv1beta1.StateStop, updateRun), generateSucceededMetric(placementv1beta1.StateRun, updateRun)) + validateUpdateRunApprovalStageTaskMetric(generateApprovalStageTaskMetric(updateRun, placementv1beta1.AfterStageTaskLabelValue, 1), generateApprovalStageTaskMetric(updateRun, placementv1beta1.BeforeStageTaskLabelValue, 1)) + validateUpdateRunStageMetricsEmitted(generateStageClusterUpdatingMetric(updateRun)) }) }) }) diff --git a/pkg/metrics/hub/metrics.go b/pkg/metrics/hub/metrics.go index ade482161..e2be20782 100644 --- a/pkg/metrics/hub/metrics.go +++ b/pkg/metrics/hub/metrics.go @@ -42,6 +42,22 @@ var ( Name: "fleet_workload_update_run_status_last_timestamp_seconds", Help: "Last update timestamp of update run status in seconds", }, []string{"namespace", "name", 
"state", "condition", "status", "reason"}) + + // FleetUpdateRunApprovalRequestLatencySeconds tracks how long users take to approve approval requests. + FleetUpdateRunApprovalRequestLatencySeconds = prometheus.NewHistogramVec(prometheus.HistogramOpts{ + Name: "fleet_workload_update_run_approval_request_latency_seconds", + Help: "The latency from approval request creation to user approval in seconds", + // Buckets: 1min, 5min, 15min, 30min, 1hr, 2hr, 6hr, 12hr, 24hr + Buckets: []float64{60, 300, 900, 1800, 3600, 7200, 21600, 43200, 86400}, + }, []string{"namespace", "name", "taskType"}) + + // FleetUpdateRunStageClusterUpdatingDurationSeconds tracks the duration of each stage of an update run, excluding the execution time of stage tasks. + FleetUpdateRunStageClusterUpdatingDurationSeconds = prometheus.NewHistogramVec(prometheus.HistogramOpts{ + Name: "fleet_workload_update_run_stage_cluster_updating_duration_seconds", + Help: "The duration of the stage of an update run in seconds without stage tasks execution time", + // Buckets: 15s, 30s, 1min, 2min, 5min, 10min, 30min, 1hr + Buckets: []float64{15, 30, 60, 120, 300, 600, 1800, 3600}, + }, []string{"namespace", "name"}) ) // The scheduler related metrics. 
@@ -74,6 +90,8 @@ func init() { FleetPlacementStatusLastTimeStampSeconds, FleetEvictionStatus, FleetUpdateRunStatusLastTimestampSeconds, + FleetUpdateRunApprovalRequestLatencySeconds, + FleetUpdateRunStageClusterUpdatingDurationSeconds, SchedulingCycleDurationMilliseconds, SchedulerActiveWorkers, ) From 232e27ed24b3a38df2a2b6a09c89d67e197546a9 Mon Sep 17 00:00:00 2001 From: Ryan Zhang Date: Wed, 25 Feb 2026 10:11:05 -0800 Subject: [PATCH 4/9] feat: create new resourceSnapshot if the snapshotIndex is empty (#462) --- cmd/hubagent/workload/setup.go | 14 +- pkg/controllers/placement/suite_test.go | 2 +- pkg/controllers/updaterun/controller.go | 6 + .../updaterun/controller_integration_test.go | 8 +- pkg/controllers/updaterun/execution_test.go | 4 +- pkg/controllers/updaterun/initialization.go | 73 ++++++---- .../initialization_integration_test.go | 130 ++++++++++++++---- pkg/controllers/updaterun/suite_test.go | 34 ++++- pkg/controllers/updaterun/validation.go | 2 +- .../controller/resource_snapshot_resolver.go | 8 +- .../resource_snapshot_resolver_test.go | 15 +- test/e2e/cluster_staged_updaterun_test.go | 18 +-- test/e2e/staged_updaterun_test.go | 14 +- 13 files changed, 240 insertions(+), 88 deletions(-) diff --git a/cmd/hubagent/workload/setup.go b/cmd/hubagent/workload/setup.go index 3d8450ee1..ec96f8b7b 100644 --- a/cmd/hubagent/workload/setup.go +++ b/cmd/hubagent/workload/setup.go @@ -172,7 +172,7 @@ func SetupControllers(ctx context.Context, wg *sync.WaitGroup, mgr ctrl.Manager, Scheme: mgr.GetScheme(), UncachedReader: mgr.GetAPIReader(), ResourceSelectorResolver: resourceSelectorResolver, - ResourceSnapshotResolver: *resourceSnapshotResolver, + ResourceSnapshotResolver: resourceSnapshotResolver, } rateLimiter := options.DefaultControllerRateLimiter(opts.RateLimiterOpts) @@ -310,8 +310,10 @@ func SetupControllers(ctx context.Context, wg *sync.WaitGroup, mgr ctrl.Manager, } klog.Info("Setting up clusterStagedUpdateRun controller") if err = 
(&updaterun.Reconciler{ - Client: mgr.GetClient(), - InformerManager: dynamicInformerManager, + Client: mgr.GetClient(), + InformerManager: dynamicInformerManager, + ResourceSelectorResolver: resourceSelectorResolver, + ResourceSnapshotResolver: resourceSnapshotResolver, }).SetupWithManagerForClusterStagedUpdateRun(mgr); err != nil { klog.ErrorS(err, "Unable to set up clusterStagedUpdateRun controller") return err @@ -326,8 +328,10 @@ func SetupControllers(ctx context.Context, wg *sync.WaitGroup, mgr ctrl.Manager, } klog.Info("Setting up stagedUpdateRun controller") if err = (&updaterun.Reconciler{ - Client: mgr.GetClient(), - InformerManager: dynamicInformerManager, + Client: mgr.GetClient(), + InformerManager: dynamicInformerManager, + ResourceSelectorResolver: resourceSelectorResolver, + ResourceSnapshotResolver: resourceSnapshotResolver, }).SetupWithManagerForStagedUpdateRun(mgr); err != nil { klog.ErrorS(err, "Unable to set up stagedUpdateRun controller") return err diff --git a/pkg/controllers/placement/suite_test.go b/pkg/controllers/placement/suite_test.go index d238bb0c7..922b4e9d0 100644 --- a/pkg/controllers/placement/suite_test.go +++ b/pkg/controllers/placement/suite_test.go @@ -126,7 +126,7 @@ var _ = BeforeSuite(func() { UncachedReader: mgr.GetAPIReader(), Recorder: mgr.GetEventRecorderFor(controllerName), ResourceSelectorResolver: resourceSelectorResolver, - ResourceSnapshotResolver: *resourceSnapshotResolver, + ResourceSnapshotResolver: resourceSnapshotResolver, } opts := options.RateLimitOptions{ RateLimiterBaseDelay: 5 * time.Millisecond, diff --git a/pkg/controllers/updaterun/controller.go b/pkg/controllers/updaterun/controller.go index e1c77f455..78cccec16 100644 --- a/pkg/controllers/updaterun/controller.go +++ b/pkg/controllers/updaterun/controller.go @@ -60,6 +60,12 @@ type Reconciler struct { recorder record.EventRecorder // the informer contains the cache for all the resources we need to check the resource scope. 
InformerManager informer.Manager + + // ResourceSelectorResolver selects resources for placement. + ResourceSelectorResolver controller.ResourceSelectorResolver + + // ResourceSnapshotResolver gets or creates resource snapshots. + ResourceSnapshotResolver controller.ResourceSnapshotResolver } func (r *Reconciler) Reconcile(ctx context.Context, req runtime.Request) (runtime.Result, error) { diff --git a/pkg/controllers/updaterun/controller_integration_test.go b/pkg/controllers/updaterun/controller_integration_test.go index 0d0efc63f..afdf47e39 100644 --- a/pkg/controllers/updaterun/controller_integration_test.go +++ b/pkg/controllers/updaterun/controller_integration_test.go @@ -504,7 +504,7 @@ func generateTestClusterResourcePlacement() *placementv1beta1.ClusterResourcePla Group: "", Version: "v1", Kind: "Namespace", - Name: "test-namespace", + Name: testNamespaceName, }, }, Policy: &placementv1beta1.PlacementPolicy{ @@ -734,9 +734,9 @@ func generateTestClusterResourceSnapshot() *placementv1beta1.ClusterResourceSnap Kind: "Namespace", }, ObjectMeta: metav1.ObjectMeta{ - Name: "test-namespace", + Name: testNamespaceName, Labels: map[string]string{ - "fleet.azure.com/name": "test-namespace", + "fleet.azure.com/name": testNamespaceName, }, }, }) @@ -780,7 +780,7 @@ func generateTestClusterResourceOverride() *placementv1beta1.ClusterResourceOver Group: "", Version: "v1", Kind: "Namespace", - Name: "test-namespace", + Name: testNamespaceName, }, }, Policy: &placementv1beta1.OverridePolicy{ diff --git a/pkg/controllers/updaterun/execution_test.go b/pkg/controllers/updaterun/execution_test.go index f4dd799aa..6e132f332 100644 --- a/pkg/controllers/updaterun/execution_test.go +++ b/pkg/controllers/updaterun/execution_test.go @@ -378,7 +378,7 @@ func TestBuildApprovalRequestObject(t *testing.T) { name: "should create namespaced ApprovalRequest when namespace is provided", namespacedName: types.NamespacedName{ Name: fmt.Sprintf(placementv1beta1.AfterStageApprovalTaskNameFmt, 
"test-update-run", "test-stage"), - Namespace: "test-namespace", + Namespace: testNamespaceName, }, stageName: "test-stage", updateRunName: "test-update-run", @@ -386,7 +386,7 @@ func TestBuildApprovalRequestObject(t *testing.T) { want: &placementv1beta1.ApprovalRequest{ ObjectMeta: metav1.ObjectMeta{ Name: fmt.Sprintf(placementv1beta1.AfterStageApprovalTaskNameFmt, "test-update-run", "test-stage"), - Namespace: "test-namespace", + Namespace: testNamespaceName, Labels: map[string]string{ placementv1beta1.TargetUpdatingStageNameLabel: "test-stage", placementv1beta1.TargetUpdateRunLabel: "test-update-run", diff --git a/pkg/controllers/updaterun/initialization.go b/pkg/controllers/updaterun/initialization.go index e45f419bf..a977c7de2 100644 --- a/pkg/controllers/updaterun/initialization.go +++ b/pkg/controllers/updaterun/initialization.go @@ -47,7 +47,7 @@ func (r *Reconciler) initialize( updateRun placementv1beta1.UpdateRunObj, ) ([]placementv1beta1.BindingObj, []placementv1beta1.BindingObj, error) { // Validate the Placement object referenced by the UpdateRun. - placementNamespacedName, err := r.validatePlacement(ctx, updateRun) + placement, placementNamespacedName, err := r.validatePlacement(ctx, updateRun) if err != nil { return nil, nil, err } @@ -67,7 +67,7 @@ func (r *Reconciler) initialize( return nil, nil, err } // Record the override snapshots associated with each cluster. - if err := r.recordOverrideSnapshots(ctx, placementNamespacedName, updateRun); err != nil { + if err := r.recordOverrideSnapshots(ctx, placement, updateRun); err != nil { return nil, nil, err } @@ -75,7 +75,7 @@ func (r *Reconciler) initialize( } // validatePlacement validates the Placement object referenced by the UpdateRun. 
-func (r *Reconciler) validatePlacement(ctx context.Context, updateRun placementv1beta1.UpdateRunObj) (types.NamespacedName, error) { +func (r *Reconciler) validatePlacement(ctx context.Context, updateRun placementv1beta1.UpdateRunObj) (placementv1beta1.PlacementObj, types.NamespacedName, error) { updateRunRef := klog.KObj(updateRun) placementName := updateRun.GetUpdateRunSpec().PlacementName @@ -91,10 +91,10 @@ func (r *Reconciler) validatePlacement(ctx context.Context, updateRun placementv if apierrors.IsNotFound(err) { placementNotFoundErr := controller.NewUserError(fmt.Errorf("parent placement not found")) klog.ErrorS(err, "Failed to get placement", "placement", placementKey, "updateRun", updateRunRef) - return types.NamespacedName{}, fmt.Errorf("%w: %s", errValidationFailed, placementNotFoundErr.Error()) + return nil, types.NamespacedName{}, fmt.Errorf("%w: %s", errValidationFailed, placementNotFoundErr.Error()) } klog.ErrorS(err, "Failed to get placement", "placement", placementKey, "updateRun", updateRunRef) - return types.NamespacedName{}, controller.NewAPIServerError(true, err) + return nil, types.NamespacedName{}, controller.NewAPIServerError(true, err) } // fill out all the default values for placement, mutation webhook is not setup for resource placement. 
@@ -106,13 +106,13 @@ func (r *Reconciler) validatePlacement(ctx context.Context, updateRun placementv if placementSpec.Strategy.Type != placementv1beta1.ExternalRolloutStrategyType { klog.V(2).InfoS("The placement does not have an external rollout strategy", "placement", placementKey, "updateRun", updateRunRef) wrongRolloutTypeErr := controller.NewUserError(errors.New("parent placement does not have an external rollout strategy, current strategy: " + string(placementSpec.Strategy.Type))) - return types.NamespacedName{}, fmt.Errorf("%w: %s", errValidationFailed, wrongRolloutTypeErr.Error()) + return nil, types.NamespacedName{}, fmt.Errorf("%w: %s", errValidationFailed, wrongRolloutTypeErr.Error()) } updateRunStatus := updateRun.GetUpdateRunStatus() updateRunStatus.ApplyStrategy = placementSpec.Strategy.ApplyStrategy - return placementKey, nil + return placement, placementKey, nil } // determinePolicySnapshot retrieves the latest policy snapshot associated with the Placement, @@ -499,11 +499,12 @@ func validateAfterStageTask(tasks []placementv1beta1.StageTask) error { } // recordOverrideSnapshots finds all the override snapshots that are associated with each cluster and record them in the UpdateRun status. 
-func (r *Reconciler) recordOverrideSnapshots(ctx context.Context, placementKey types.NamespacedName, updateRun placementv1beta1.UpdateRunObj) error { +func (r *Reconciler) recordOverrideSnapshots(ctx context.Context, placement placementv1beta1.PlacementObj, updateRun placementv1beta1.UpdateRunObj) error { updateRunRef := klog.KObj(updateRun) updateRunSpec := updateRun.GetUpdateRunSpec() + placementKey := types.NamespacedName{Name: placement.GetName(), Namespace: placement.GetNamespace()} - resourceSnapshotObjs, err := r.getResourceSnapshotObjs(ctx, placementKey, updateRun) + resourceSnapshotObjs, err := r.getResourceSnapshotObjs(ctx, placement, updateRun) if err != nil { return err } @@ -560,10 +561,12 @@ func (r *Reconciler) recordOverrideSnapshots(ctx context.Context, placementKey t } // getResourceSnapshotObjs retrieves the list of resource snapshot objects from the specified ResourceSnapshotIndex. -// If ResourceSnapshotIndex is unspecified, it returns the list of latest resource snapshots. -func (r *Reconciler) getResourceSnapshotObjs(ctx context.Context, placementKey types.NamespacedName, updateRun placementv1beta1.UpdateRunObj) ([]placementv1beta1.ResourceSnapshotObj, error) { +// If ResourceSnapshotIndex is unspecified, it takes a new snapshot using SelectResourcesForPlacement and +// GetOrCreateResourceSnapshot, similar to the placement controller but without waiting for snapshot creation intervals. 
+func (r *Reconciler) getResourceSnapshotObjs(ctx context.Context, placement placementv1beta1.PlacementObj, updateRun placementv1beta1.UpdateRunObj) ([]placementv1beta1.ResourceSnapshotObj, error) { updateRunRef := klog.KObj(updateRun) updateRunSpec := updateRun.GetUpdateRunSpec() + placementKey := types.NamespacedName{Name: placement.GetName(), Namespace: placement.GetNamespace()} var resourceSnapshotObjs []placementv1beta1.ResourceSnapshotObj if updateRunSpec.ResourceSnapshotIndex != "" { snapshotIndex, err := strconv.Atoi(updateRunSpec.ResourceSnapshotIndex) @@ -592,23 +595,45 @@ func (r *Reconciler) getResourceSnapshotObjs(ctx context.Context, placementKey t return resourceSnapshotObjs, nil } - klog.V(2).InfoS("No resource snapshot index specified, fetching latest resource snapshots", "placement", placementKey, "updateRun", updateRunRef) - latestResourceSnapshots, err := controller.ListLatestResourceSnapshots(ctx, r.Client, placementKey) + klog.V(2).InfoS("No resource snapshot index specified, creating a new resource snapshot", "placement", placementKey, "updateRun", updateRunRef) + + // Select the resources using the placement's resource selectors. + envelopeObjCount, selectedResources, _, err := r.ResourceSelectorResolver.SelectResourcesForPlacement(placement) if err != nil { - klog.ErrorS(err, "Failed to list the latest resourceSnapshots associated with the placement", - "placement", placementKey, "updateRun", updateRunRef) - // list err can be retried. - return nil, controller.NewAPIServerError(true, err) + klog.ErrorS(err, "Failed to select resources for placement", "placement", placementKey, "updateRun", updateRunRef) + if errors.Is(err, controller.ErrUserError) { + return nil, fmt.Errorf("%w: %s", errValidationFailed, err.Error()) + } + return nil, err } - resourceSnapshotObjs = latestResourceSnapshots.GetResourceSnapshotObjs() - if len(resourceSnapshotObjs) == 0 { - err := fmt.Errorf("no latest resourceSnapshots found for placement `%s`. 
This might be a transient state, need retry", placementKey) - klog.ErrorS(err, "No latest resourceSnapshots found for placement. This might be transient, need retry", "placement", placementKey, "updateRun", updateRunRef) - // retryable error. - return resourceSnapshotObjs, err + // Determine the revision history limit. + revisionLimit := int32(defaulter.DefaultRevisionHistoryLimitValue) + placementSpec := placement.GetPlacementSpec() + if placementSpec.RevisionHistoryLimit != nil && *placementSpec.RevisionHistoryLimit > 0 { + revisionLimit = *placementSpec.RevisionHistoryLimit } - return resourceSnapshotObjs, nil + + // Create or get the resource snapshot. Unlike the placement controller, we do not wait for snapshot creation intervals. + _, latestResourceSnapshot, err := r.ResourceSnapshotResolver.GetOrCreateResourceSnapshot(ctx, placement, envelopeObjCount, + &placementv1beta1.ResourceSnapshotSpec{SelectedResources: selectedResources}, int(revisionLimit)) + if err != nil { + klog.ErrorS(err, "Failed to get or create resource snapshot", "placement", placementKey, "updateRun", updateRunRef) + return nil, err + } + + if latestResourceSnapshot == nil { + err := fmt.Errorf("no resource snapshot created for placement `%s`", placementKey) + klog.ErrorS(err, "Failed to create resource snapshot", "placement", placementKey, "updateRun", updateRunRef) + return nil, err + } + + // Return the master snapshot directly rather than listing from the cache, because + // GetOrCreateResourceSnapshot writes to the API server and the controller-runtime + // cache may not have the newly created snapshot yet. + klog.V(2).InfoS("Created/fetched resource snapshot for updateRun", + "placement", placementKey, "resourceSnapshot", klog.KObj(latestResourceSnapshot), "updateRun", updateRunRef) + return []placementv1beta1.ResourceSnapshotObj{latestResourceSnapshot}, nil } // recordInitializationSucceeded records the successful initialization condition in the UpdateRun status. 
diff --git a/pkg/controllers/updaterun/initialization_integration_test.go b/pkg/controllers/updaterun/initialization_integration_test.go index ae0c980e9..41ff96c09 100644 --- a/pkg/controllers/updaterun/initialization_integration_test.go +++ b/pkg/controllers/updaterun/initialization_integration_test.go @@ -30,10 +30,12 @@ import ( "k8s.io/apimachinery/pkg/api/meta" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" + "sigs.k8s.io/controller-runtime/pkg/client" clusterv1beta1 "github.com/kubefleet-dev/kubefleet/apis/cluster/v1beta1" placementv1beta1 "github.com/kubefleet-dev/kubefleet/apis/placement/v1beta1" "github.com/kubefleet-dev/kubefleet/pkg/utils" + "github.com/kubefleet-dev/kubefleet/pkg/utils/condition" ) var ( @@ -82,6 +84,7 @@ var _ = Describe("Updaterun initialization tests", func() { // Set smaller wait time for testing stageUpdatingWaitTime = time.Second * 3 clusterUpdatingWaitTime = time.Second * 2 + }) AfterEach(func() { @@ -757,6 +760,11 @@ var _ = Describe("Updaterun initialization tests", func() { Expect(k8sClient.Create(ctx, updateStrategy)).To(Succeed()) }) + AfterEach(func() { + By("Cleaning up auto-created resource snapshots") + cleanupAutoCreatedResourceSnapshots(ctx) + }) + It("Should fail to initialize if the specified resource snapshot index is invalid - not integer", func() { By("Creating a new clusterStagedUpdateRun with invalid resource snapshot index") updateRun.Spec.ResourceSnapshotIndex = "invalid-index" @@ -792,29 +800,31 @@ var _ = Describe("Updaterun initialization tests", func() { validateUpdateRunMetricsEmitted(generateInitializationFailedMetric(placementv1beta1.StateInitialize, updateRun)) }) - It("Should NOT fail to initialize if the specified resource snapshot is not found when no resource index specified - no resourceSnapshots at all", func() { + It("Should create a new resource snapshot and succeed initialization when no resource index specified - no resourceSnapshots at all", func() { 
By("Creating a new clusterStagedUpdateRun without specifying resourceSnapshotIndex") updateRun.Spec.ResourceSnapshotIndex = "" Expect(k8sClient.Create(ctx, updateRun)).To(Succeed()) - By("Validating the initialization did not fail due to resourceSnapshot not found (retryable error)") - // Populate the cache first. + By("Validating the initialization succeeded with a newly created resource snapshot") Eventually(func() error { - if err := k8sClient.Get(ctx, updateRunNamespacedName, updateRun); err != nil { - return err - } - return nil - }, timeout, interval).Should(Succeed(), "failed to get the updateRun") - Consistently(func() error { if err := k8sClient.Get(ctx, updateRunNamespacedName, updateRun); err != nil { return err } initCond := meta.FindStatusCondition(updateRun.Status.Conditions, string(placementv1beta1.StagedUpdateRunConditionInitialized)) - if initCond != nil { - return fmt.Errorf("got initialization condition: %v, want nil", initCond) + if initCond == nil { + return fmt.Errorf("initialization condition not found yet") + } + if initCond.Status != metav1.ConditionTrue { + return fmt.Errorf("initialization condition status = %v, want True, message = %s", initCond.Status, initCond.Message) + } + if updateRun.Status.ResourceSnapshotIndexUsed != "0" { + return fmt.Errorf("resourceSnapshotIndexUsed is not set correctly, got %s, want `0`", updateRun.Status.ResourceSnapshotIndexUsed) } return nil - }, duration, interval).Should(Succeed(), "the initialization should keep retrying, not failed") + }, timeout, interval).Should(Succeed(), "failed to validate initialization succeeded with auto-created snapshot") + + By("Checking update run status metrics are emitted") + validateUpdateRunMetricsEmitted(generateInitializationSucceededMetric(placementv1beta1.StateInitialize, updateRun)) }) It("Should fail to initialize if the specified resource snapshot is not found - no CRP label found", func() { @@ -862,8 +872,8 @@ var _ = Describe("Updaterun initialization tests", 
func() { validateUpdateRunMetricsEmitted(generateInitializationFailedMetric(placementv1beta1.StateInitialize, updateRun)) }) - It("Should select latest resource snapshot in the status when no resource index defined", func() { - By("Creating a new resource snapshot") + It("Should create a new resource snapshot when no resource index defined, previous snapshots hash does not match", func() { + By("Creating a pre-existing resource snapshot with index 0") Expect(k8sClient.Create(ctx, resourceSnapshot)).To(Succeed()) By("Creating a new cluster resource override") @@ -873,13 +883,41 @@ var _ = Describe("Updaterun initialization tests", func() { updateRun.Spec.ResourceSnapshotIndex = "" Expect(k8sClient.Create(ctx, updateRun)).To(Succeed()) - By("Validating the clusterStagedUpdateRun stats") - initialized := generateSucceededInitializationStatus(crp, updateRun, testResourceSnapshotIndex, policySnapshot, updateStrategy, clusterResourceOverride) - validateClusterStagedUpdateRunStatus(ctx, updateRun, initialized, "") - Expect(updateRun.Status.ResourceSnapshotIndexUsed).To(Equal(testResourceSnapshotIndex), "resource snapshot index used mismatch in the updateRun status") + By("Validating the initialization succeeded with a newly created resource snapshot") + Eventually(func() error { + if err := k8sClient.Get(ctx, updateRunNamespacedName, updateRun); err != nil { + return err + } + initCond := meta.FindStatusCondition(updateRun.Status.Conditions, string(placementv1beta1.StagedUpdateRunConditionInitialized)) + if initCond == nil { + return fmt.Errorf("initialization condition not found yet") + } + if !condition.IsConditionStatusTrue(initCond, updateRun.GetGeneration()) { + return fmt.Errorf("initialization condition status = %v, want True, message = %s", initCond.Status, initCond.Message) + } + if updateRun.Status.ResourceSnapshotIndexUsed != "1" { + return fmt.Errorf("resourceSnapshotIndexUsed is not set correctly, got %s, want `1`", updateRun.Status.ResourceSnapshotIndexUsed) 
+ } + return nil + }, timeout, interval).Should(Succeed(), "failed to validate initialization succeeded with auto-created snapshot") By("Validating the clusterStagedUpdateRun initialized consistently") - validateClusterStagedUpdateRunStatusConsistently(ctx, updateRun, initialized, "") + Consistently(func() error { + if err := k8sClient.Get(ctx, updateRunNamespacedName, updateRun); err != nil { + return err + } + initCond := meta.FindStatusCondition(updateRun.Status.Conditions, string(placementv1beta1.StagedUpdateRunConditionInitialized)) + if !condition.IsConditionStatusTrue(initCond, updateRun.GetGeneration()) { + return fmt.Errorf("initialization condition changed unexpectedly") + } + if updateRun.Status.ResourceSnapshotIndexUsed != "1" { + return fmt.Errorf("resourceSnapshotIndexUsed is not set correctly, got %s, want `1`", updateRun.Status.ResourceSnapshotIndexUsed) + } + return nil + }, duration, interval).Should(Succeed(), "initialization should remain successful") + + By("Cleaning up auto-created resource snapshots") + cleanupAutoCreatedResourceSnapshots(ctx) By("Checking update run status metrics are emitted") validateUpdateRunMetricsEmitted(generateInitializationSucceededMetric(placementv1beta1.StateInitialize, updateRun)) @@ -906,18 +944,18 @@ var _ = Describe("Updaterun initialization tests", func() { validateUpdateRunMetricsEmitted(generateInitializationSucceededMetric(placementv1beta1.StateInitialize, updateRun)) }) - It("Should pick latest master resource snapshot if multiple snapshots", func() { - By("Creating a new resource snapshot") + It("Should create a new resource snapshot at next index even when multiple pre-existing snapshots exist and no index specified", func() { + By("Creating a pre-existing resource snapshot at index 0") resourceSnapshot.Labels[placementv1beta1.IsLatestSnapshotLabel] = "false" Expect(k8sClient.Create(ctx, resourceSnapshot)).To(Succeed()) - By("Creating a another new resource snapshot") + By("Creating a another 
pre-existing resource snapshot at index 1") resourceSnapshot2.Name = testCRPName + "-1-snapshot" resourceSnapshot2.Labels[placementv1beta1.IsLatestSnapshotLabel] = "false" resourceSnapshot2.Labels[placementv1beta1.ResourceIndexLabel] = "1" Expect(k8sClient.Create(ctx, resourceSnapshot2)).To(Succeed()) - By("Creating a latest master resource snapshot") + By("Creating a latest pre-existing resource snapshot at index 2") resourceSnapshot3.Name = testCRPName + "-2-snapshot" resourceSnapshot3.Labels[placementv1beta1.ResourceIndexLabel] = "2" Expect(k8sClient.Create(ctx, resourceSnapshot3)).To(Succeed()) @@ -929,12 +967,41 @@ var _ = Describe("Updaterun initialization tests", func() { updateRun.Spec.ResourceSnapshotIndex = "" Expect(k8sClient.Create(ctx, updateRun)).To(Succeed()) - By("Validating the clusterStagedUpdateRun status") - initialized := generateSucceededInitializationStatus(crp, updateRun, "2", policySnapshot, updateStrategy, clusterResourceOverride) - validateClusterStagedUpdateRunStatus(ctx, updateRun, initialized, "") + By("Validating the initialization succeeded and a new resource snapshot was created") + Eventually(func() error { + if err := k8sClient.Get(ctx, updateRunNamespacedName, updateRun); err != nil { + return err + } + initCond := meta.FindStatusCondition(updateRun.Status.Conditions, string(placementv1beta1.StagedUpdateRunConditionInitialized)) + if initCond == nil { + return fmt.Errorf("initialization condition not found yet") + } + if !condition.IsConditionStatusTrue(initCond, updateRun.GetGeneration()) { + return fmt.Errorf("initialization condition status = %v, want True, message = %s", initCond.Status, initCond.Message) + } + if updateRun.Status.ResourceSnapshotIndexUsed != "3" { + return fmt.Errorf("resourceSnapshotIndexUsed is not set correctly, got %s, want `3`", updateRun.Status.ResourceSnapshotIndexUsed) + } + return nil + }, timeout, interval).Should(Succeed(), "failed to validate initialization succeeded with auto-created snapshot") 
By("Validating the clusterStagedUpdateRun initialized consistently") - validateClusterStagedUpdateRunStatusConsistently(ctx, updateRun, initialized, "") + Consistently(func() error { + if err := k8sClient.Get(ctx, updateRunNamespacedName, updateRun); err != nil { + return err + } + initCond := meta.FindStatusCondition(updateRun.Status.Conditions, string(placementv1beta1.StagedUpdateRunConditionInitialized)) + if !condition.IsConditionStatusTrue(initCond, updateRun.GetGeneration()) { + return fmt.Errorf("initialization condition changed unexpectedly") + } + if updateRun.Status.ResourceSnapshotIndexUsed != "3" { + return fmt.Errorf("resourceSnapshotIndexUsed is not set correctly, got %s, want `3`", updateRun.Status.ResourceSnapshotIndexUsed) + } + return nil + }, duration, interval).Should(Succeed(), "initialization should remain successful") + + By("Cleaning up auto-created resource snapshots") + cleanupAutoCreatedResourceSnapshots(ctx) By("Checking update run status metrics are emitted") validateUpdateRunMetricsEmitted(generateInitializationSucceededMetric(placementv1beta1.StateInitialize, updateRun)) @@ -1123,3 +1190,12 @@ func generateExecutionNotStartedStatus( generateTrueCondition(updateRun, placementv1beta1.StageTaskConditionApprovalRequestCreated)) return status } + +// cleanupAutoCreatedResourceSnapshots deletes all ClusterResourceSnapshots associated with the test CRP. +// This is needed because auto-created snapshots from GetOrCreateResourceSnapshot are not tracked +// by the test's AfterEach cleanup, which only deletes the pre-created test snapshots. 
+func cleanupAutoCreatedResourceSnapshots(ctx context.Context) { + Expect(k8sClient.DeleteAllOf(ctx, &placementv1beta1.ClusterResourceSnapshot{}, + client.MatchingLabels{placementv1beta1.PlacementTrackingLabel: testCRPName}, + )).Should(Succeed()) +} diff --git a/pkg/controllers/updaterun/suite_test.go b/pkg/controllers/updaterun/suite_test.go index 9e5284969..22c16aa8b 100644 --- a/pkg/controllers/updaterun/suite_test.go +++ b/pkg/controllers/updaterun/suite_test.go @@ -25,6 +25,8 @@ import ( . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/client-go/dynamic" "k8s.io/client-go/kubernetes/scheme" "k8s.io/client-go/rest" @@ -41,6 +43,7 @@ import ( placementv1alpha1 "github.com/kubefleet-dev/kubefleet/apis/placement/v1alpha1" placementv1beta1 "github.com/kubefleet-dev/kubefleet/apis/placement/v1beta1" "github.com/kubefleet-dev/kubefleet/pkg/utils" + controller "github.com/kubefleet-dev/kubefleet/pkg/utils/controller" "github.com/kubefleet-dev/kubefleet/pkg/utils/informer" ) @@ -53,6 +56,10 @@ var ( cancel context.CancelFunc ) +const ( + testNamespaceName = "test-namespace" +) + func TestAPIs(t *testing.T) { RegisterFailHandler(Fail) @@ -63,8 +70,11 @@ var _ = BeforeSuite(func() { ctx, cancel = context.WithCancel(context.TODO()) var err error - By("Setup klog") - klog.SetLogger(zap.New(zap.WriteTo(GinkgoWriter), zap.UseDevMode(true))) + By("Setup klog and controller runtime logger") + logger := zap.New(zap.WriteTo(GinkgoWriter), zap.UseDevMode(true)) + klog.SetLogger(logger) + ctrl.SetLogger(logger) + fs := flag.NewFlagSet("klog", flag.ContinueOnError) klog.InitFlags(fs) Expect(fs.Parse([]string{"--v", "5", "-add_dir_header", "true"})).Should(Succeed()) @@ -111,12 +121,28 @@ var _ = BeforeSuite(func() { }, nil) // Setup our main reconciler. 
+ resourceSelectorResolver := controller.ResourceSelectorResolver{ + RestMapper: mgr.GetRESTMapper(), + InformerManager: dynamicInformerManager, + ResourceConfig: utils.NewResourceConfig(false), + SkippedNamespaces: map[string]bool{}, + } err = (&Reconciler{ - Client: k8sClient, - InformerManager: dynamicInformerManager, + Client: k8sClient, + InformerManager: dynamicInformerManager, + ResourceSelectorResolver: resourceSelectorResolver, + ResourceSnapshotResolver: controller.NewResourceSnapshotResolver(mgr.GetClient(), mgr.GetScheme()), }).SetupWithManagerForClusterStagedUpdateRun(mgr) Expect(err).Should(Succeed()) + // create the namespace for testing + testNS := &corev1.Namespace{ + ObjectMeta: metav1.ObjectMeta{ + Name: testNamespaceName, + }, + } + Expect(k8sClient.Create(ctx, testNS)).Should(Succeed()) + go func() { defer GinkgoRecover() err = mgr.Start(ctx) diff --git a/pkg/controllers/updaterun/validation.go b/pkg/controllers/updaterun/validation.go index fdf0c5391..99098f8a0 100644 --- a/pkg/controllers/updaterun/validation.go +++ b/pkg/controllers/updaterun/validation.go @@ -46,7 +46,7 @@ func (r *Reconciler) validate( klog.V(2).InfoS("Start to validate the updateRun", "updateRun", updateRunRef) // Validate the Placement object referenced by the UpdateRun. 
- placementNamespacedName, err := r.validatePlacement(ctx, updateRunCopy) + _, placementNamespacedName, err := r.validatePlacement(ctx, updateRunCopy) if err != nil { return -1, nil, nil, err } diff --git a/pkg/utils/controller/resource_snapshot_resolver.go b/pkg/utils/controller/resource_snapshot_resolver.go index b0d55d726..a8460fa75 100644 --- a/pkg/utils/controller/resource_snapshot_resolver.go +++ b/pkg/utils/controller/resource_snapshot_resolver.go @@ -56,8 +56,8 @@ type ResourceSnapshotResolver struct { } // NewResourceSnapshotResolver creates a new ResourceSnapshotResolver with the universal fields -func NewResourceSnapshotResolver(client client.Client, scheme *runtime.Scheme) *ResourceSnapshotResolver { - return &ResourceSnapshotResolver{ +func NewResourceSnapshotResolver(client client.Client, scheme *runtime.Scheme) ResourceSnapshotResolver { + return ResourceSnapshotResolver{ Client: client, Scheme: scheme, } @@ -343,7 +343,9 @@ func (r *ResourceSnapshotResolver) ensureLatestResourceSnapshot(ctx context.Cont // shouldCreateNewResourceSnapshotNow checks whether it is ready to create the new resource snapshot to avoid too frequent creation // based on the configured resourceSnapshotCreationMinimumInterval and resourceChangesCollectionDuration. func (r *ResourceSnapshotResolver) shouldCreateNewResourceSnapshotNow(ctx context.Context, latestResourceSnapshot fleetv1beta1.ResourceSnapshotObj) (ctrl.Result, error) { - if r.Config != nil && r.Config.ResourceSnapshotCreationMinimumInterval <= 0 && r.Config.ResourceChangesCollectionDuration <= 0 { + // If Config is nil (no restrictions) or both intervals are non-positive (effectively disabled), + // there is no delay needed — create immediately. 
+ if r.Config == nil || (r.Config.ResourceSnapshotCreationMinimumInterval <= 0 && r.Config.ResourceChangesCollectionDuration <= 0) { return ctrl.Result{}, nil } diff --git a/pkg/utils/controller/resource_snapshot_resolver_test.go b/pkg/utils/controller/resource_snapshot_resolver_test.go index 2fc3c71c6..22670ad81 100644 --- a/pkg/utils/controller/resource_snapshot_resolver_test.go +++ b/pkg/utils/controller/resource_snapshot_resolver_test.go @@ -3938,6 +3938,7 @@ func TestShouldCreateNewResourceSnapshotNow(t *testing.T) { cases := []struct { name string + nilConfig bool creationInterval time.Duration collectionDuration time.Duration creationTime time.Time @@ -3945,6 +3946,14 @@ func TestShouldCreateNewResourceSnapshotNow(t *testing.T) { wantAnnoation bool wantRequeue ctrl.Result }{ + { + // Config == nil means no timing restrictions; should always create immediately. + name: "Config is nil", + nilConfig: true, + // Even with a very recent creation time, nil Config must not delay. + creationTime: time.Now(), + wantRequeue: ctrl.Result{RequeueAfter: 0, Requeue: false}, + }, { name: "ResourceSnapshotCreationMinimumInterval and ResourceChangesCollectionDuration are 0", creationInterval: 0, @@ -4028,9 +4037,9 @@ func TestShouldCreateNewResourceSnapshotNow(t *testing.T) { Build() resolver := NewResourceSnapshotResolver(client, nil) - resolver.Config = NewResourceSnapshotConfig(tc.creationInterval, // Fast creation - tc.collectionDuration, // Longer collection - ) + if !tc.nilConfig { + resolver.Config = NewResourceSnapshotConfig(tc.creationInterval, tc.collectionDuration) + } ctx := context.Background() if err := client.Get(ctx, types.NamespacedName{Name: snapshot.Name}, snapshot); err != nil { diff --git a/test/e2e/cluster_staged_updaterun_test.go b/test/e2e/cluster_staged_updaterun_test.go index 25ac8e486..a661ba502 100644 --- a/test/e2e/cluster_staged_updaterun_test.go +++ b/test/e2e/cluster_staged_updaterun_test.go @@ -67,7 +67,7 @@ var _ = Describe("test CRP rollout 
with staged update run", func() { crpName := fmt.Sprintf(crpNameTemplate, GinkgoParallelProcess()) strategyName := fmt.Sprintf(clusterStagedUpdateRunStrategyNameTemplate, GinkgoParallelProcess()) - Context("Test resource rollout with staged update run with latest resource snapshot when not specified", Ordered, func() { + Context("Test resource rollout with staged update run with auto-created resource snapshot when not specified", Ordered, func() { updateRunNames := []string{} var strategy *placementv1beta1.ClusterStagedUpdateStrategy var oldConfigMap, newConfigMap corev1.ConfigMap @@ -133,9 +133,9 @@ var _ = Describe("test CRP rollout with staged update run", func() { Eventually(crpStatusUpdatedActual, eventuallyDuration, eventuallyInterval).Should(Succeed(), "Failed to update CRP %s status as expected", crpName) }) - It("Should create a cluster staged update run successfully", func() { - By("Create a cluster staged update run without specifying resource snapshot index") - createClusterStagedUpdateRunSucceedWithNoResourceSnapshotIndex(updateRunNames[0], crpName, strategyName) + It("Should create a cluster staged update run successfully with auto-created resource snapshot", func() { + By("Create a cluster staged update run without specifying resource snapshot index, triggering auto-creation") + createClusterStagedUpdateRunWithAutoCreatedSnapshot(updateRunNames[0], crpName, strategyName) }) It("Should rollout resources to member-cluster-2 only and complete stage canary", func() { @@ -198,9 +198,9 @@ var _ = Describe("test CRP rollout with staged update run", func() { }, eventuallyDuration, eventuallyInterval).Should(Succeed(), "Failed get the new latest resourcensnapshot") }) - It("Should create a new cluster staged update run successfully", func() { - By("Create a new cluster staged update run without specifying resource snapshot index") - createClusterStagedUpdateRunSucceedWithNoResourceSnapshotIndex(updateRunNames[1], crpName, strategyName) + It("Should create a 
new cluster staged update run successfully and use the auto-created snapshot for updated resources", func() { + By("Create a new cluster staged update run without specifying resource snapshot index, triggering auto-creation for updated resources") + createClusterStagedUpdateRunWithAutoCreatedSnapshot(updateRunNames[1], crpName, strategyName) }) It("Should rollout resources to member-cluster-2 only and complete stage canary", func() { @@ -2063,7 +2063,9 @@ func createClusterStagedUpdateRunSucceed(updateRunName, crpName, resourceSnapsho Expect(hubClient.Create(ctx, updateRun)).To(Succeed(), "Failed to create ClusterStagedUpdateRun %s", updateRunName) } -func createClusterStagedUpdateRunSucceedWithNoResourceSnapshotIndex(updateRunName, crpName, strategyName string) { +// createClusterStagedUpdateRunWithAutoCreatedSnapshot creates a ClusterStagedUpdateRun without specifying a +// ResourceSnapshotIndex, triggering the controller to auto-create or reuse an existing resource snapshot. +func createClusterStagedUpdateRunWithAutoCreatedSnapshot(updateRunName, crpName, strategyName string) { updateRun := &placementv1beta1.ClusterStagedUpdateRun{ ObjectMeta: metav1.ObjectMeta{ Name: updateRunName, diff --git a/test/e2e/staged_updaterun_test.go b/test/e2e/staged_updaterun_test.go index b6f80b928..a893d257f 100644 --- a/test/e2e/staged_updaterun_test.go +++ b/test/e2e/staged_updaterun_test.go @@ -60,7 +60,7 @@ var _ = Describe("test RP rollout with staged update run", Label("resourceplacem ensureCRPAndRelatedResourcesDeleted(crpName, allMemberClusters) }) - Context("Test resource rollout with staged update run with latest resource snapshot when not specified", Ordered, func() { + Context("Test resource rollout with staged update run with auto-created resource snapshot when not specified", Ordered, func() { updateRunNames := []string{} var strategy *placementv1beta1.StagedUpdateStrategy var oldConfigMap, newConfigMap corev1.ConfigMap @@ -124,8 +124,8 @@ var _ = Describe("test RP 
rollout with staged update run", Label("resourceplacem Eventually(rpStatusUpdatedActual, eventuallyDuration, eventuallyInterval).Should(Succeed(), "Failed to update RP %s/%s status as expected", testNamespace, rpName) }) - It("Should create a staged update run successfully", func() { - createStagedUpdateRunSucceedWithNoResourceSnapshotIndex(updateRunNames[0], testNamespace, rpName, strategyName) + It("Should create a staged update run successfully with auto-created resource snapshot", func() { + createStagedUpdateRunWithAutoCreatedSnapshot(updateRunNames[0], testNamespace, rpName, strategyName) }) It("Should rollout resources to member-cluster-2 only and complete stage canary", func() { @@ -188,8 +188,8 @@ var _ = Describe("test RP rollout with staged update run", Label("resourceplacem }, eventuallyDuration, eventuallyInterval).Should(Succeed(), "Failed get the new latest resourcensnapshot") }) - It("Should create a new staged update run successfully", func() { - createStagedUpdateRunSucceedWithNoResourceSnapshotIndex(updateRunNames[1], testNamespace, rpName, strategyName) + It("Should create a new staged update run successfully and use the auto-created snapshot for updated resources", func() { + createStagedUpdateRunWithAutoCreatedSnapshot(updateRunNames[1], testNamespace, rpName, strategyName) }) It("Should rollout resources to member-cluster-2 only and complete stage canary", func() { @@ -1658,7 +1658,9 @@ func createStagedUpdateRunSucceed(updateRunName, namespace, rpName, resourceSnap Expect(hubClient.Create(ctx, updateRun)).To(Succeed(), "Failed to create StagedUpdateRun %s", updateRunName) } -func createStagedUpdateRunSucceedWithNoResourceSnapshotIndex(updateRunName, namespace, rpName, strategyName string) { +// createStagedUpdateRunWithAutoCreatedSnapshot creates a StagedUpdateRun without specifying a +// ResourceSnapshotIndex, triggering the controller to auto-create or reuse an existing resource snapshot. 
+func createStagedUpdateRunWithAutoCreatedSnapshot(updateRunName, namespace, rpName, strategyName string) { updateRun := &placementv1beta1.StagedUpdateRun{ ObjectMeta: metav1.ObjectMeta{ Name: updateRunName, From 06653771663f8a1b102e08a2ed83c0d6a4cbd4f8 Mon Sep 17 00:00:00 2001 From: Britania Rodriguez Reyes <145056127+britaniar@users.noreply.github.com> Date: Wed, 25 Feb 2026 10:54:13 -0800 Subject: [PATCH 5/9] fix: more fixes for enveloped objects flaky e2e (#460) --- .../workgenerator/controller_integration_test.go | 7 ++++--- pkg/controllers/workgenerator/envelope.go | 16 +++++++++++++--- pkg/controllers/workgenerator/envelope_test.go | 5 ++--- 3 files changed, 19 insertions(+), 9 deletions(-) diff --git a/pkg/controllers/workgenerator/controller_integration_test.go b/pkg/controllers/workgenerator/controller_integration_test.go index e388dd05f..5f9590d68 100644 --- a/pkg/controllers/workgenerator/controller_integration_test.go +++ b/pkg/controllers/workgenerator/controller_integration_test.go @@ -901,7 +901,6 @@ var _ = Describe("Test Work Generator Controller for clusterResourcePlacement", placementv1beta1.ParentResourceSnapshotIndexLabel: "1", placementv1beta1.EnvelopeTypeLabel: string(placementv1beta1.ClusterResourceEnvelopeType), placementv1beta1.EnvelopeNameLabel: envelopedResourceName, - placementv1beta1.EnvelopeNamespaceLabel: envelopedResourceNameSpace, }, Annotations: map[string]string{ placementv1beta1.ParentResourceSnapshotNameAnnotation: binding.Spec.ResourceSnapshotName, @@ -969,7 +968,6 @@ var _ = Describe("Test Work Generator Controller for clusterResourcePlacement", placementv1beta1.PlacementTrackingLabel: testCRPName, placementv1beta1.EnvelopeTypeLabel: string(placementv1beta1.ClusterResourceEnvelopeType), placementv1beta1.EnvelopeNameLabel: envelopedResourceName, - placementv1beta1.EnvelopeNamespaceLabel: envelopedResourceNameSpace, } if err := k8sClient.List(ctx, &workList, envelopWorkLabelMatcher); err != nil { return err @@ -4164,7 +4162,10 
@@ func fetchEnvelopedWork(workList *placementv1beta1.WorkList, binding *placementv placementv1beta1.PlacementTrackingLabel: testCRPName, placementv1beta1.EnvelopeTypeLabel: envelopeType, placementv1beta1.EnvelopeNameLabel: envelopeName, - placementv1beta1.EnvelopeNamespaceLabel: envelopeNamespace, + } + + if envelopeType == string(placementv1beta1.ResourceEnvelopeType) { + envelopWorkLabelMatcher[placementv1beta1.EnvelopeNamespaceLabel] = envelopeNamespace } if err := k8sClient.List(ctx, workList, envelopWorkLabelMatcher); err != nil { return err diff --git a/pkg/controllers/workgenerator/envelope.go b/pkg/controllers/workgenerator/envelope.go index 05518f98c..24cff56cd 100644 --- a/pkg/controllers/workgenerator/envelope.go +++ b/pkg/controllers/workgenerator/envelope.go @@ -65,11 +65,17 @@ func (r *Reconciler) createOrUpdateEnvelopeCRWorkObj( } // Add ParentNamespaceLabel if the binding is namespaced if binding.GetNamespace() != "" { - labelMatcher[fleetv1beta1.EnvelopeNamespaceLabel] = envelopeReader.GetNamespace() labelMatcher[fleetv1beta1.ParentNamespaceLabel] = binding.GetNamespace() } + + // Add EnvelopeNamespaceLabel if the envelope type is ResourceEnvelope. 
+ if envelopeReader.GetEnvelopeType() == string(fleetv1beta1.ResourceEnvelopeType) { + labelMatcher[fleetv1beta1.EnvelopeNamespaceLabel] = envelopeReader.GetNamespace() + } + + namespaceMatcher := client.InNamespace(fmt.Sprintf(utils.NamespaceNameFormat, binding.GetBindingSpec().TargetCluster)) workList := &fleetv1beta1.WorkList{} - if err = r.Client.List(ctx, workList, labelMatcher); err != nil { + if err = r.Client.List(ctx, workList, labelMatcher, namespaceMatcher); err != nil { klog.ErrorS(err, "Failed to list work objects when finding the work object for an envelope", "resourceBinding", klog.KObj(binding), "resourceSnapshot", klog.KObj(resourceSnapshot), @@ -212,13 +218,17 @@ func buildNewWorkForEnvelopeCR( fleetv1beta1.ParentResourceSnapshotIndexLabel: resourceSnapshot.GetLabels()[fleetv1beta1.ResourceIndexLabel], fleetv1beta1.EnvelopeTypeLabel: envelopeReader.GetEnvelopeType(), fleetv1beta1.EnvelopeNameLabel: envelopeReader.GetName(), - fleetv1beta1.EnvelopeNamespaceLabel: envelopeReader.GetNamespace(), } // Add ParentNamespaceLabel if the binding is namespaced if resourceBinding.GetNamespace() != "" { labels[fleetv1beta1.ParentNamespaceLabel] = resourceBinding.GetNamespace() } + // Add EnvelopeNamespaceLabel if the envelope type is ResourceEnvelope. 
+ if envelopeReader.GetEnvelopeType() == string(fleetv1beta1.ResourceEnvelopeType) { + labels[fleetv1beta1.EnvelopeNamespaceLabel] = envelopeReader.GetNamespace() + } + return &fleetv1beta1.Work{ ObjectMeta: metav1.ObjectMeta{ Name: workName, diff --git a/pkg/controllers/workgenerator/envelope_test.go b/pkg/controllers/workgenerator/envelope_test.go index c8b112a53..f262766f3 100644 --- a/pkg/controllers/workgenerator/envelope_test.go +++ b/pkg/controllers/workgenerator/envelope_test.go @@ -307,7 +307,7 @@ func TestCreateOrUpdateEnvelopeCRWorkObj(t *testing.T) { existingWork := &fleetv1beta1.Work{ ObjectMeta: metav1.ObjectMeta{ Name: workNamePrefix, - Namespace: "test-app", + Namespace: "fleet-member-test-cluster-1", Labels: map[string]string{ fleetv1beta1.ParentBindingLabel: resourceBinding.Name, fleetv1beta1.PlacementTrackingLabel: resourceBinding.Labels[fleetv1beta1.PlacementTrackingLabel], @@ -388,7 +388,6 @@ func TestCreateOrUpdateEnvelopeCRWorkObj(t *testing.T) { fleetv1beta1.ParentResourceSnapshotIndexLabel: resourceSnapshot.Labels[fleetv1beta1.ResourceIndexLabel], fleetv1beta1.EnvelopeTypeLabel: string(fleetv1beta1.ClusterResourceEnvelopeType), fleetv1beta1.EnvelopeNameLabel: clusterResourceEnvelope.Name, - fleetv1beta1.EnvelopeNamespaceLabel: "", }, Annotations: map[string]string{ fleetv1beta1.ParentResourceSnapshotNameAnnotation: resourceBinding.Spec.ResourceSnapshotName, @@ -416,7 +415,7 @@ func TestCreateOrUpdateEnvelopeCRWorkObj(t *testing.T) { existingObjects: []client.Object{existingWork}, want: &fleetv1beta1.Work{ ObjectMeta: metav1.ObjectMeta{ - Namespace: "test-app", //copy from the existing work + Namespace: "fleet-member-test-cluster-1", //copy from the existing work Labels: map[string]string{ fleetv1beta1.ParentBindingLabel: resourceBinding.Name, fleetv1beta1.PlacementTrackingLabel: resourceBinding.Labels[fleetv1beta1.PlacementTrackingLabel], From 8c995991185372d6b1bb3485e8842c92f8d1faba Mon Sep 17 00:00:00 2001 From: Wei Weng Date: Thu, 26 Feb 
2026 09:58:40 -0500 Subject: [PATCH 6/9] feat: add namespace affinity scheduler plugin (not wired yet) (#463) --- apis/cluster/v1beta1/membercluster_types.go | 8 + apis/cluster/v1beta1/zz_generated.deepcopy.go | 7 + ...er.kubernetes-fleet.io_memberclusters.yaml | 10 + .../v1beta1/membercluster_controller.go | 2 + ...mbercluster_controller_integration_test.go | 19 ++ .../v1beta1/membercluster_controller_test.go | 8 + .../plugins/namespaceaffinity/filtering.go | 86 +++++ .../namespaceaffinity/filtering_test.go | 279 ++++++++++++++++ .../plugins/namespaceaffinity/plugin.go | 85 +++++ .../plugins/namespaceaffinity/plugin_test.go | 50 +++ .../controller_integration_test.go | 51 +++ .../watchers/membercluster/watcher.go | 11 + .../namespaceaffinity_integration_test.go | 310 ++++++++++++++++++ test/scheduler/utils_test.go | 5 + 14 files changed, 931 insertions(+) create mode 100644 pkg/scheduler/framework/plugins/namespaceaffinity/filtering.go create mode 100644 pkg/scheduler/framework/plugins/namespaceaffinity/filtering_test.go create mode 100644 pkg/scheduler/framework/plugins/namespaceaffinity/plugin.go create mode 100644 pkg/scheduler/framework/plugins/namespaceaffinity/plugin_test.go create mode 100644 test/scheduler/namespaceaffinity_integration_test.go diff --git a/apis/cluster/v1beta1/membercluster_types.go b/apis/cluster/v1beta1/membercluster_types.go index 3680c681a..ca53d2d4f 100644 --- a/apis/cluster/v1beta1/membercluster_types.go +++ b/apis/cluster/v1beta1/membercluster_types.go @@ -141,6 +141,14 @@ type MemberClusterStatus struct { // +optional ResourceUsage ResourceUsage `json:"resourceUsage,omitempty"` + // Namespaces is a map of namespace names to their associated work names for namespaces + // that are managed by Fleet (i.e., have AppliedWork owner references when created). + // The key is the namespace name and the value is the work name from the AppliedWork owner reference. 
+ // If the namespace does not have an AppliedWork owner reference, the value will be an empty string. + // This field is copied from the corresponding InternalMemberCluster object. + // +optional + Namespaces map[string]string `json:"namespaces,omitempty"` + // AgentStatus is an array of current observed status, each corresponding to one member agent running in the member cluster. // +optional AgentStatus []AgentStatus `json:"agentStatus,omitempty"` diff --git a/apis/cluster/v1beta1/zz_generated.deepcopy.go b/apis/cluster/v1beta1/zz_generated.deepcopy.go index a5ea6533d..cec52aa39 100644 --- a/apis/cluster/v1beta1/zz_generated.deepcopy.go +++ b/apis/cluster/v1beta1/zz_generated.deepcopy.go @@ -285,6 +285,13 @@ func (in *MemberClusterStatus) DeepCopyInto(out *MemberClusterStatus) { } } in.ResourceUsage.DeepCopyInto(&out.ResourceUsage) + if in.Namespaces != nil { + in, out := &in.Namespaces, &out.Namespaces + *out = make(map[string]string, len(*in)) + for key, val := range *in { + (*out)[key] = val + } + } if in.AgentStatus != nil { in, out := &in.AgentStatus, &out.AgentStatus *out = make([]AgentStatus, len(*in)) diff --git a/config/crd/bases/cluster.kubernetes-fleet.io_memberclusters.yaml b/config/crd/bases/cluster.kubernetes-fleet.io_memberclusters.yaml index fdc6093e7..cc28cb137 100644 --- a/config/crd/bases/cluster.kubernetes-fleet.io_memberclusters.yaml +++ b/config/crd/bases/cluster.kubernetes-fleet.io_memberclusters.yaml @@ -659,6 +659,16 @@ spec: x-kubernetes-list-map-keys: - type x-kubernetes-list-type: map + namespaces: + additionalProperties: + type: string + description: |- + Namespaces is a map of namespace names to their associated work names for namespaces + that are managed by Fleet (i.e., have AppliedWork owner references when created). + The key is the namespace name and the value is the work name from the AppliedWork owner reference. + If the namespace does not have an AppliedWork owner reference, the value will be an empty string. 
+ This field is copied from the corresponding InternalMemberCluster object. + type: object properties: additionalProperties: description: PropertyValue is the value of a cluster property. diff --git a/pkg/controllers/membercluster/v1beta1/membercluster_controller.go b/pkg/controllers/membercluster/v1beta1/membercluster_controller.go index 35c61c972..0f05051fb 100644 --- a/pkg/controllers/membercluster/v1beta1/membercluster_controller.go +++ b/pkg/controllers/membercluster/v1beta1/membercluster_controller.go @@ -538,6 +538,8 @@ func (r *Reconciler) syncInternalMemberClusterStatus(imc *clusterv1beta1.Interna r.aggregateJoinedCondition(mc) // Copy resource usages. mc.Status.ResourceUsage = imc.Status.ResourceUsage + // Copy namespaces. + mc.Status.Namespaces = imc.Status.Namespaces // Copy additional conditions. for idx := range imc.Status.Conditions { cond := imc.Status.Conditions[idx] diff --git a/pkg/controllers/membercluster/v1beta1/membercluster_controller_integration_test.go b/pkg/controllers/membercluster/v1beta1/membercluster_controller_integration_test.go index 34a9c8df2..c90f4d9b2 100644 --- a/pkg/controllers/membercluster/v1beta1/membercluster_controller_integration_test.go +++ b/pkg/controllers/membercluster/v1beta1/membercluster_controller_integration_test.go @@ -139,6 +139,13 @@ var _ = Describe("Test MemberCluster Controller", func() { } Expect(cmp.Diff(mc.Status.ResourceUsage, wantResourceUsage, cmpopts.IgnoreTypes(time.Time{}))).To(BeEmpty()) + // Compare the namespaces. + wantNamespaces := map[string]string{ + "test-namespace-1": "work-1", + "test-namespace-2": "", + } + Expect(cmp.Diff(mc.Status.Namespaces, wantNamespaces)).To(BeEmpty()) + // Compare the property provider conditions. 
wantConditions := []metav1.Condition{ buildCondition(propertyProviderConditionType1, propertyProviderConditionStatus1, propertyProviderConditionReason1, propertyProviderConditionMessage1, mc.GetGeneration()), @@ -342,6 +349,7 @@ var _ = Describe("Test MemberCluster Controller", func() { }, Properties: imc.Status.Properties, ResourceUsage: imc.Status.ResourceUsage, + Namespaces: imc.Status.Namespaces, AgentStatus: imc.Status.AgentStatus, } Expect(cmp.Diff(wantMC, mc.Status, ignoreOption)).Should(BeEmpty()) @@ -373,6 +381,7 @@ var _ = Describe("Test MemberCluster Controller", func() { }, Properties: imc.Status.Properties, ResourceUsage: imc.Status.ResourceUsage, + Namespaces: imc.Status.Namespaces, AgentStatus: imc.Status.AgentStatus, } Expect(cmp.Diff(wantMC, mc.Status, ignoreOption)).Should(BeEmpty()) @@ -409,6 +418,7 @@ var _ = Describe("Test MemberCluster Controller", func() { }, Properties: imc.Status.Properties, ResourceUsage: imc.Status.ResourceUsage, + Namespaces: imc.Status.Namespaces, AgentStatus: imc.Status.AgentStatus, } Expect(cmp.Diff(wantMC, mc.Status, ignoreOption)).Should(BeEmpty()) @@ -445,6 +455,7 @@ var _ = Describe("Test MemberCluster Controller", func() { }, Properties: imc.Status.Properties, ResourceUsage: imc.Status.ResourceUsage, + Namespaces: imc.Status.Namespaces, AgentStatus: imc.Status.AgentStatus, } Expect(cmp.Diff(wantMC, mc.Status, ignoreOption)).Should(BeEmpty()) @@ -500,6 +511,7 @@ var _ = Describe("Test MemberCluster Controller", func() { }, Properties: imc.Status.Properties, ResourceUsage: imc.Status.ResourceUsage, + Namespaces: imc.Status.Namespaces, AgentStatus: imc.Status.AgentStatus, } options := cmpopts.IgnoreFields(metav1.Condition{}, "LastTransitionTime", "ObservedGeneration", "Message") @@ -530,6 +542,7 @@ var _ = Describe("Test MemberCluster Controller", func() { }, Properties: imc.Status.Properties, ResourceUsage: imc.Status.ResourceUsage, + Namespaces: imc.Status.Namespaces, AgentStatus: imc.Status.AgentStatus, } // 
ignore the ObservedGeneration here cause controller won't update the ReadyToJoin condition. @@ -623,6 +636,7 @@ var _ = Describe("Test MemberCluster Controller", func() { }, Properties: imc.Status.Properties, ResourceUsage: imc.Status.ResourceUsage, + Namespaces: imc.Status.Namespaces, AgentStatus: imc.Status.AgentStatus, } Expect(cmp.Diff(wantMC, mc.Status, ignoreOption)).Should(BeEmpty()) @@ -750,5 +764,10 @@ func checkIfMemberClusterResourcesExistsAndUpdateAgentStatusToTrue(ctx context.C // Add conditions reported by the property provider. meta.SetStatusCondition(&imc.Status.Conditions, buildCondition(propertyProviderConditionType1, propertyProviderConditionStatus1, propertyProviderConditionReason1, propertyProviderConditionMessage1, imc.GetGeneration())) meta.SetStatusCondition(&imc.Status.Conditions, buildCondition(propertyProviderConditionType2, propertyProviderConditionStatus2, propertyProviderConditionReason2, propertyProviderConditionMessage2, imc.GetGeneration())) + // Update the namespaces map. 
+ imc.Status.Namespaces = map[string]string{ + "test-namespace-1": "work-1", + "test-namespace-2": "", + } Expect(k8sClient.Status().Update(ctx, &imc)).Should(Succeed()) } diff --git a/pkg/controllers/membercluster/v1beta1/membercluster_controller_test.go b/pkg/controllers/membercluster/v1beta1/membercluster_controller_test.go index 274b9ccf0..7a6f1b386 100644 --- a/pkg/controllers/membercluster/v1beta1/membercluster_controller_test.go +++ b/pkg/controllers/membercluster/v1beta1/membercluster_controller_test.go @@ -806,6 +806,10 @@ func TestSyncInternalMemberClusterStatus(t *testing.T) { }, ObservationTime: now, }, + Namespaces: map[string]string{ + "test-namespace-1": "work-1", + "test-namespace-2": "", + }, AgentStatus: []clusterv1beta1.AgentStatus{ { Type: clusterv1beta1.MemberAgent, @@ -886,6 +890,10 @@ func TestSyncInternalMemberClusterStatus(t *testing.T) { }, ObservationTime: now, }, + Namespaces: map[string]string{ + "test-namespace-1": "work-1", + "test-namespace-2": "", + }, AgentStatus: []clusterv1beta1.AgentStatus{ { Type: clusterv1beta1.MemberAgent, diff --git a/pkg/scheduler/framework/plugins/namespaceaffinity/filtering.go b/pkg/scheduler/framework/plugins/namespaceaffinity/filtering.go new file mode 100644 index 000000000..d613332ba --- /dev/null +++ b/pkg/scheduler/framework/plugins/namespaceaffinity/filtering.go @@ -0,0 +1,86 @@ +/* +Copyright 2025 The KubeFleet Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package namespaceaffinity + +import ( + "context" + + "k8s.io/apimachinery/pkg/api/meta" + + clusterv1beta1 "github.com/kubefleet-dev/kubefleet/apis/cluster/v1beta1" + placementv1beta1 "github.com/kubefleet-dev/kubefleet/apis/placement/v1beta1" + "github.com/kubefleet-dev/kubefleet/pkg/propertyprovider" + "github.com/kubefleet-dev/kubefleet/pkg/scheduler/framework" +) + +// PreFilter allows the plugin to connect to the PreFilter extension point in the scheduling framework. +func (p *Plugin) PreFilter( + _ context.Context, + _ framework.CycleStatePluginReadWriter, + ps placementv1beta1.PolicySnapshotObj, +) (status *framework.Status) { + // Check if this is a namespace-scoped policy snapshot (ResourcePlacement). + // ClusterResourcePlacement uses ClusterSchedulingPolicySnapshot which doesn't have a namespace, + // so we only need to filter for ResourcePlacement (SchedulingPolicySnapshot). + nsName := ps.GetNamespace() + if nsName == "" { + // This is a cluster-scoped policy (ClusterResourcePlacement). + // Skip namespace affinity filtering. + return framework.NewNonErrorStatus(framework.Skip, p.Name(), "cluster-scoped placement does not require namespace affinity filtering") + } + + // For namespace-scoped placements, we need to ensure the target namespace exists on clusters. + return nil +} + +// Filter allows the plugin to connect to the Filter extension point in the scheduling framework. +func (p *Plugin) Filter( + _ context.Context, + _ framework.CycleStatePluginReadWriter, + ps placementv1beta1.PolicySnapshotObj, + cluster *clusterv1beta1.MemberCluster, +) (status *framework.Status) { + // Get the target namespace for this ResourcePlacement. + nsName := ps.GetNamespace() + + // Check if namespace collection is enabled for this cluster. + // The condition can have three states: + // 1. Missing: namespace collection is not enabled (backward compatibility - skip filtering) + // 2. True: namespace collection is working normally + // 3. 
False: namespace collection is enabled but degraded (limit reached - still use the data) + cond := meta.FindStatusCondition(cluster.Status.Conditions, propertyprovider.NamespaceCollectionSucceededCondType) + if cond == nil { + // Namespace collection is not enabled, skip filtering for backward compatibility. + return nil + } + + // Check if the cluster has namespace information available. + if cluster.Status.Namespaces == nil { + // Namespace collection is enabled but no data is available. + // This is unexpected, so we mark the cluster as unschedulable. + return framework.NewNonErrorStatus(framework.ClusterUnschedulable, p.Name(), "cluster has no namespace information available") + } + + // Check if the target namespace exists on the cluster. + if _, exists := cluster.Status.Namespaces[nsName]; !exists { + // The namespace does not exist on this cluster. + return framework.NewNonErrorStatus(framework.ClusterUnschedulable, p.Name(), "target namespace does not exist on cluster") + } + + // The namespace exists on the cluster; mark it as eligible for resource placement. + return nil +} diff --git a/pkg/scheduler/framework/plugins/namespaceaffinity/filtering_test.go b/pkg/scheduler/framework/plugins/namespaceaffinity/filtering_test.go new file mode 100644 index 000000000..d9d6c60f0 --- /dev/null +++ b/pkg/scheduler/framework/plugins/namespaceaffinity/filtering_test.go @@ -0,0 +1,279 @@ +/* +Copyright 2025 The KubeFleet Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package namespaceaffinity + +import ( + "context" + "testing" + + "github.com/google/go-cmp/cmp" + "github.com/google/go-cmp/cmp/cmpopts" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + + clusterv1beta1 "github.com/kubefleet-dev/kubefleet/apis/cluster/v1beta1" + placementv1beta1 "github.com/kubefleet-dev/kubefleet/apis/placement/v1beta1" + "github.com/kubefleet-dev/kubefleet/pkg/propertyprovider" + "github.com/kubefleet-dev/kubefleet/pkg/scheduler/framework" +) + +const ( + clusterName1 = "cluster-1" + pluginName = "NamespaceAffinity" +) + +var ( + ignoreStatusErrorField = cmpopts.IgnoreFields(framework.Status{}, "err") +) + +// TestPreFilter tests the PreFilter extension point of the plugin. +func TestPreFilter(t *testing.T) { + testCases := []struct { + name string + ps placementv1beta1.PolicySnapshotObj + wantStatus *framework.Status + }{ + { + name: "cluster-scoped placement", + ps: &placementv1beta1.ClusterSchedulingPolicySnapshot{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-policy", + }, + }, + wantStatus: framework.NewNonErrorStatus(framework.Skip, pluginName, "cluster-scoped placement does not require namespace affinity filtering"), + }, + { + name: "namespace-scoped placement", + ps: &placementv1beta1.SchedulingPolicySnapshot{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-policy", + Namespace: "test-namespace", + }, + }, + wantStatus: nil, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + p := New() + ctx := context.Background() + state := framework.NewCycleState(nil, nil, nil) + status := p.PreFilter(ctx, state, tc.ps) + + if diff := cmp.Diff( + status, tc.wantStatus, + cmp.AllowUnexported(framework.Status{}), + ignoreStatusErrorField, + ); diff != "" { + t.Errorf("PreFilter() unexpected status (-got, +want):\n%s", diff) + } + }) + } +} + +// TestFilter tests the Filter extension point of the plugin. 
+func TestFilter(t *testing.T) { + testCases := []struct { + name string + ps *placementv1beta1.SchedulingPolicySnapshot + cluster *clusterv1beta1.MemberCluster + wantStatus *framework.Status + }{ + { + name: "namespace collection not enabled - should pass", + ps: &placementv1beta1.SchedulingPolicySnapshot{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: "test-namespace", + }, + }, + cluster: &clusterv1beta1.MemberCluster{ + ObjectMeta: metav1.ObjectMeta{ + Name: clusterName1, + }, + Status: clusterv1beta1.MemberClusterStatus{ + Namespaces: nil, + Conditions: []metav1.Condition{}, + }, + }, + wantStatus: nil, + }, + { + name: "namespace collection condition false (degraded) but no namespace info - should filter", + ps: &placementv1beta1.SchedulingPolicySnapshot{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: "test-namespace", + }, + }, + cluster: &clusterv1beta1.MemberCluster{ + ObjectMeta: metav1.ObjectMeta{ + Name: clusterName1, + }, + Status: clusterv1beta1.MemberClusterStatus{ + Namespaces: nil, + Conditions: []metav1.Condition{ + { + Type: propertyprovider.NamespaceCollectionSucceededCondType, + Status: metav1.ConditionFalse, + }, + }, + }, + }, + wantStatus: framework.NewNonErrorStatus(framework.ClusterUnschedulable, pluginName, "cluster has no namespace information available"), + }, + { + name: "namespace collection condition false (degraded) with namespace exists - should pass", + ps: &placementv1beta1.SchedulingPolicySnapshot{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: "test-namespace", + }, + }, + cluster: &clusterv1beta1.MemberCluster{ + ObjectMeta: metav1.ObjectMeta{ + Name: clusterName1, + }, + Status: clusterv1beta1.MemberClusterStatus{ + Namespaces: map[string]string{ + "test-namespace": "work-1", + }, + Conditions: []metav1.Condition{ + { + Type: propertyprovider.NamespaceCollectionSucceededCondType, + Status: metav1.ConditionFalse, + }, + }, + }, + }, + wantStatus: nil, + }, + { + name: "namespace collection enabled but no namespace info - should 
filter", + ps: &placementv1beta1.SchedulingPolicySnapshot{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: "test-namespace", + }, + }, + cluster: &clusterv1beta1.MemberCluster{ + ObjectMeta: metav1.ObjectMeta{ + Name: clusterName1, + }, + Status: clusterv1beta1.MemberClusterStatus{ + Namespaces: nil, + Conditions: []metav1.Condition{ + { + Type: propertyprovider.NamespaceCollectionSucceededCondType, + Status: metav1.ConditionTrue, + }, + }, + }, + }, + wantStatus: framework.NewNonErrorStatus(framework.ClusterUnschedulable, pluginName, "cluster has no namespace information available"), + }, + { + name: "namespace collection enabled, namespace missing - should filter", + ps: &placementv1beta1.SchedulingPolicySnapshot{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: "test-namespace", + }, + }, + cluster: &clusterv1beta1.MemberCluster{ + ObjectMeta: metav1.ObjectMeta{ + Name: clusterName1, + }, + Status: clusterv1beta1.MemberClusterStatus{ + Namespaces: map[string]string{ + "other-namespace": "work-1", + }, + Conditions: []metav1.Condition{ + { + Type: propertyprovider.NamespaceCollectionSucceededCondType, + Status: metav1.ConditionTrue, + }, + }, + }, + }, + wantStatus: framework.NewNonErrorStatus(framework.ClusterUnschedulable, pluginName, "target namespace does not exist on cluster"), + }, + { + name: "namespace collection enabled, namespace exists - should pass", + ps: &placementv1beta1.SchedulingPolicySnapshot{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: "test-namespace", + }, + }, + cluster: &clusterv1beta1.MemberCluster{ + ObjectMeta: metav1.ObjectMeta{ + Name: clusterName1, + }, + Status: clusterv1beta1.MemberClusterStatus{ + Namespaces: map[string]string{ + "test-namespace": "work-1", + "other-namespace": "work-2", + }, + Conditions: []metav1.Condition{ + { + Type: propertyprovider.NamespaceCollectionSucceededCondType, + Status: metav1.ConditionTrue, + }, + }, + }, + }, + wantStatus: nil, + }, + { + name: "namespace collection enabled, namespace exists with 
empty work name - should pass", + ps: &placementv1beta1.SchedulingPolicySnapshot{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: "test-namespace", + }, + }, + cluster: &clusterv1beta1.MemberCluster{ + ObjectMeta: metav1.ObjectMeta{ + Name: clusterName1, + }, + Status: clusterv1beta1.MemberClusterStatus{ + Namespaces: map[string]string{ + "test-namespace": "", + }, + Conditions: []metav1.Condition{ + { + Type: propertyprovider.NamespaceCollectionSucceededCondType, + Status: metav1.ConditionTrue, + }, + }, + }, + }, + wantStatus: nil, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + p := New() + ctx := context.Background() + state := framework.NewCycleState(nil, nil, nil) + status := p.Filter(ctx, state, tc.ps, tc.cluster) + + if diff := cmp.Diff( + status, tc.wantStatus, + cmp.AllowUnexported(framework.Status{}), + ignoreStatusErrorField, + ); diff != "" { + t.Errorf("Filter() unexpected status (-got, +want):\n%s", diff) + } + }) + } +} diff --git a/pkg/scheduler/framework/plugins/namespaceaffinity/plugin.go b/pkg/scheduler/framework/plugins/namespaceaffinity/plugin.go new file mode 100644 index 000000000..4262fe1a0 --- /dev/null +++ b/pkg/scheduler/framework/plugins/namespaceaffinity/plugin.go @@ -0,0 +1,85 @@ +/* +Copyright 2025 The KubeFleet Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// Package namespaceaffinity features a scheduler plugin that filters clusters based on namespace availability. 
+// This plugin ensures that ResourcePlacements are only scheduled to clusters where the target namespace exists. +package namespaceaffinity + +import ( + "github.com/kubefleet-dev/kubefleet/pkg/scheduler/framework" +) + +// Plugin is the scheduler plugin that filters clusters based on namespace availability for ResourcePlacements. +type Plugin struct { + // The name of the plugin. + name string + + // The framework handle. + handle framework.Handle +} + +var ( + // Verify that Plugin can connect to relevant extension points at compile time. + // + // This plugin leverages the following extension points: + // * PreFilter + // * Filter + // + // Note that successful connection to any of the extension points implies that the + // plugin already implements the Plugin interface. + _ framework.PreFilterPlugin = &Plugin{} + _ framework.FilterPlugin = &Plugin{} +) + +type namespaceAffinityPluginOptions struct { + // The name of the plugin. + name string +} + +type Option func(*namespaceAffinityPluginOptions) + +var defaultPluginOptions = namespaceAffinityPluginOptions{ + name: "NamespaceAffinity", +} + +// WithName sets the name of the plugin. +func WithName(name string) Option { + return func(o *namespaceAffinityPluginOptions) { + o.name = name + } +} + +// New returns a new Plugin. +func New(opts ...Option) Plugin { + options := defaultPluginOptions + for _, opt := range opts { + opt(&options) + } + + return Plugin{ + name: options.name, + } +} + +// Name returns the name of the plugin. +func (p *Plugin) Name() string { + return p.name +} + +// SetUpWithFramework sets up this plugin with a scheduler framework. 
+func (p *Plugin) SetUpWithFramework(handle framework.Handle) { + p.handle = handle +} diff --git a/pkg/scheduler/framework/plugins/namespaceaffinity/plugin_test.go b/pkg/scheduler/framework/plugins/namespaceaffinity/plugin_test.go new file mode 100644 index 000000000..f33a8ac66 --- /dev/null +++ b/pkg/scheduler/framework/plugins/namespaceaffinity/plugin_test.go @@ -0,0 +1,50 @@ +/* +Copyright 2025 The KubeFleet Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package namespaceaffinity + +import ( + "testing" +) + +// TestNew tests the New function. +func TestNew(t *testing.T) { + tests := []struct { + name string + opts []Option + wantName string + }{ + { + name: "default options", + opts: nil, + wantName: "NamespaceAffinity", + }, + { + name: "custom name", + opts: []Option{WithName("CustomNamespaceAffinity")}, + wantName: "CustomNamespaceAffinity", + }, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + p := New(tc.opts...) 
+ if got := p.Name(); got != tc.wantName { + t.Errorf("New() name = %v, want %v", got, tc.wantName) + } + }) + } +} diff --git a/pkg/scheduler/watchers/membercluster/controller_integration_test.go b/pkg/scheduler/watchers/membercluster/controller_integration_test.go index 2019ef1c4..ec3aa76dd 100644 --- a/pkg/scheduler/watchers/membercluster/controller_integration_test.go +++ b/pkg/scheduler/watchers/membercluster/controller_integration_test.go @@ -372,6 +372,57 @@ var _ = Describe("scheduler member cluster source controller", Serial, Ordered, }) }) + Context("ready cluster has a namespace collection change", func() { + BeforeAll(func() { + Consistently(noKeyEnqueuedActual, consistentlyDuration, consistentlyInterval).Should(Succeed(), "Workqueue is not empty") + + // Retrieve the cluster. + memberCluster := &clusterv1beta1.MemberCluster{} + Expect(hubClient.Get(ctx, types.NamespacedName{Name: clusterName1}, memberCluster)).To(Succeed(), "Failed to get member cluster") + + // Update the namespace collection. 
+ memberCluster.Status.Namespaces = map[string]string{ + "namespace-1": "work-1", + "namespace-2": "work-2", + } + Expect(hubClient.Status().Update(ctx, memberCluster)).Should(Succeed(), "Failed to update member cluster namespace collection") + }) + + It("should enqueue CRPs (case 1a)", func() { + Eventually(qualifiedKeysEnqueuedActual, eventuallyDuration, eventuallyInterval).Should(Succeed(), "Keys are not enqueued as expected") + Consistently(qualifiedKeysEnqueuedActual, consistentlyDuration, consistentlyInterval).Should(Succeed(), "Keys are not enqueued as expected") + }) + + It("can empty the key collector", func() { + keyCollector.Reset() + Eventually(noKeyEnqueuedActual, eventuallyDuration, eventuallyInterval).Should(Succeed(), "Workqueue is not empty") + Consistently(noKeyEnqueuedActual, consistentlyDuration, consistentlyInterval).Should(Succeed(), "Workqueue is not empty") + }) + + It("can update the namespace collection", func() { + // Retrieve the cluster. + memberCluster := &clusterv1beta1.MemberCluster{} + Expect(hubClient.Get(ctx, types.NamespacedName{Name: clusterName1}, memberCluster)).To(Succeed(), "Failed to get member cluster") + + // Update the namespace collection by adding a new namespace. 
+ memberCluster.Status.Namespaces = map[string]string{ + "namespace-1": "work-1", + "namespace-2": "work-2", + "namespace-3": "work-3", + } + Expect(hubClient.Status().Update(ctx, memberCluster)).Should(Succeed(), "Failed to update member cluster namespace collection") + }) + + It("should enqueue CRPs (case 1a)", func() { + Eventually(qualifiedKeysEnqueuedActual, eventuallyDuration, eventuallyInterval).Should(Succeed(), "Keys are not enqueued as expected") + Consistently(qualifiedKeysEnqueuedActual, consistentlyDuration, consistentlyInterval).Should(Succeed(), "Keys are not enqueued as expected") + }) + + AfterAll(func() { + keyCollector.Reset() + }) + }) + Context("ready cluster is out of sync", func() { BeforeAll(func() { Consistently(noKeyEnqueuedActual, consistentlyDuration, consistentlyInterval).Should(Succeed(), "Workqueue is not empty") diff --git a/pkg/scheduler/watchers/membercluster/watcher.go b/pkg/scheduler/watchers/membercluster/watcher.go index c504b496e..8a47d55c1 100644 --- a/pkg/scheduler/watchers/membercluster/watcher.go +++ b/pkg/scheduler/watchers/membercluster/watcher.go @@ -253,6 +253,17 @@ func (r *Reconciler) SetupWithManager(mgr ctrl.Manager) error { } } + // Capture namespace collection changes. + // + // This is important for the namespace affinity plugin to trigger rescheduling + // when namespaces are added or removed from clusters. + oldNamespaces := oldCluster.Status.Namespaces + newNamespaces := newCluster.Status.Namespaces + if !equality.Semantic.DeepEqual(oldNamespaces, newNamespaces) { + klog.V(2).InfoS("A member cluster namespace collection change has been detected", "memberCluster", clusterKObj) + return true + } + // Capture resource usage changes. 
oldCapacity := oldCluster.Status.ResourceUsage.Capacity newCapacity := newCluster.Status.ResourceUsage.Capacity diff --git a/test/scheduler/namespaceaffinity_integration_test.go b/test/scheduler/namespaceaffinity_integration_test.go new file mode 100644 index 000000000..9922f2bb3 --- /dev/null +++ b/test/scheduler/namespaceaffinity_integration_test.go @@ -0,0 +1,310 @@ +/* +Copyright 2025 The KubeFleet Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package tests + +import ( + "fmt" + + . "github.com/onsi/ginkgo/v2" + . 
"github.com/onsi/gomega" + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/meta" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + + clusterv1beta1 "github.com/kubefleet-dev/kubefleet/apis/cluster/v1beta1" + "github.com/kubefleet-dev/kubefleet/pkg/propertyprovider" +) + +var _ = Describe("scheduling ResourcePlacements with namespace affinity", func() { + Context("PickAll policy, namespace exists on some clusters", Serial, Ordered, func() { + rpName := fmt.Sprintf(rpNameTemplate, GinkgoParallelProcess()) + testNamespace := "test-namespace-affinity" + policySnapshotName := fmt.Sprintf(policySnapshotNameTemplate, rpName, 0) + + // Clusters with namespace collection enabled and namespace exists + clustersWithNamespace := []string{memberCluster1EastProd, memberCluster2EastProd} + // Clusters with namespace collection enabled but namespace does NOT exist + clustersWithoutNamespace := []string{memberCluster4CentralProd, memberCluster5CentralProd} + // Clusters without namespace collection enabled (should be included - backward compatibility) + clustersNoCollection := []string{memberCluster3EastCanary, memberCluster6WestProd, memberCluster7WestCanary} + + BeforeAll(func() { + // Create the test namespace + ns := &corev1.Namespace{ + ObjectMeta: metav1.ObjectMeta{ + Name: testNamespace, + }, + } + Expect(hubClient.Create(ctx, ns)).Should(Succeed(), "Failed to create test namespace") + + // Ensure that no bindings have been created so far. 
+ noBindingsCreatedActual := noBindingsCreatedForPlacementActual(types.NamespacedName{Name: rpName, Namespace: testNamespace}) + Consistently(noBindingsCreatedActual, consistentlyDuration, consistentlyInterval).Should(Succeed(), "Some bindings have been created unexpectedly") + + // Set up namespace collection status on clusters + // Clusters 1 and 2: namespace collection enabled, namespace exists + for _, clusterName := range clustersWithNamespace { + setNamespaceCollectionOnCluster(clusterName, true, map[string]string{ + testNamespace: "work-1", + }) + } + + // Clusters 4 and 5: namespace collection enabled, namespace does NOT exist + for _, clusterName := range clustersWithoutNamespace { + setNamespaceCollectionOnCluster(clusterName, true, map[string]string{ + "other-namespace": "work-2", + }) + } + + // Clusters 3, 6, and 7: namespace collection NOT enabled (backward compatibility test) + for _, clusterName := range clustersNoCollection { + setNamespaceCollectionOnCluster(clusterName, false, nil) + } + + // Create the ResourcePlacement and its associated policy snapshot. + createPickAllRPWithPolicySnapshot(testNamespace, rpName, policySnapshotName, nil) + }) + + It("should add scheduler cleanup finalizer to the RP", func() { + finalizerAddedActual := placementSchedulerFinalizerAddedActual(types.NamespacedName{Name: rpName, Namespace: testNamespace}) + Eventually(finalizerAddedActual, eventuallyDuration, eventuallyInterval).Should(Succeed(), "Failed to add scheduler cleanup finalizer to RP") + }) + + It("should create bindings only for clusters with the namespace (or no collection enabled)", func() { + // Should schedule to: clusters 1, 2 (have namespace), and 3, 6, 7 (no collection enabled) + expectedClusters := append(clustersWithNamespace, clustersNoCollection...)
+ scheduledBindingsCreatedActual := scheduledBindingsCreatedOrUpdatedForClustersActual( + expectedClusters, + zeroScoreByCluster, + types.NamespacedName{Name: rpName, Namespace: testNamespace}, + policySnapshotName, + ) + Eventually(scheduledBindingsCreatedActual, eventuallyDuration, eventuallyInterval).Should(Succeed(), "Failed to create the expected set of bindings") + Consistently(scheduledBindingsCreatedActual, consistentlyDuration, consistentlyInterval).Should(Succeed(), "Failed to create the expected set of bindings") + }) + + It("should not create bindings for clusters without the namespace", func() { + noBindingsCreatedActual := noBindingsCreatedForClustersActual(clustersWithoutNamespace, types.NamespacedName{Name: rpName, Namespace: testNamespace}) + Eventually(noBindingsCreatedActual, eventuallyDuration, eventuallyInterval).Should(Succeed(), "Bindings created for clusters without namespace") + Consistently(noBindingsCreatedActual, consistentlyDuration, consistentlyInterval).Should(Succeed(), "Bindings created for clusters without namespace") + }) + + It("should report status correctly", func() { + expectedClusters := append(clustersWithNamespace, clustersNoCollection...) + filteredClusters := append(clustersWithoutNamespace, memberCluster8UnhealthyEastProd, memberCluster9LeftCentralProd) + statusUpdatedActual := pickAllPolicySnapshotStatusUpdatedActual( + expectedClusters, + filteredClusters, + types.NamespacedName{Name: policySnapshotName, Namespace: testNamespace}, + ) + Eventually(statusUpdatedActual, eventuallyDuration, eventuallyInterval).Should(Succeed(), "Failed to report correct policy snapshot status") + Consistently(statusUpdatedActual, consistentlyDuration, consistentlyInterval).Should(Succeed(), "Failed to report correct policy snapshot status") + }) + + AfterAll(func() { + // Clean up namespace collection status + for _, clusterName := range append(clustersWithNamespace, append(clustersWithoutNamespace, clustersNoCollection...)...) 
{ + clearNamespaceCollectionOnCluster(clusterName) + } + + // Delete the ResourcePlacement. + ensurePlacementAndAllRelatedResourcesDeletion(types.NamespacedName{Name: rpName, Namespace: testNamespace}) + + // Delete the test namespace + ns := &corev1.Namespace{ + ObjectMeta: metav1.ObjectMeta{ + Name: testNamespace, + }, + } + _ = hubClient.Delete(ctx, ns) + }) + }) + + Context("PickAll policy, namespace added after scheduling", Serial, Ordered, func() { + rpName := fmt.Sprintf(rpNameTemplate, GinkgoParallelProcess()) + testNamespace := "test-namespace-dynamic" + policySnapshotName := fmt.Sprintf(policySnapshotNameTemplate, rpName, 0) + + targetCluster := memberCluster3EastCanary + + BeforeAll(func() { + // Create the test namespace + ns := &corev1.Namespace{ + ObjectMeta: metav1.ObjectMeta{ + Name: testNamespace, + }, + } + Expect(hubClient.Create(ctx, ns)).Should(Succeed(), "Failed to create test namespace") + + // Ensure that no bindings have been created so far. + noBindingsCreatedActual := noBindingsCreatedForPlacementActual(types.NamespacedName{Name: rpName, Namespace: testNamespace}) + Consistently(noBindingsCreatedActual, consistentlyDuration, consistentlyInterval).Should(Succeed(), "Some bindings have been created unexpectedly") + + // Initially, cluster does NOT have the namespace + setNamespaceCollectionOnCluster(targetCluster, true, map[string]string{ + "other-namespace": "work-1", + }) + + // Create the ResourcePlacement and its associated policy snapshot. 
+ createPickAllRPWithPolicySnapshot(testNamespace, rpName, policySnapshotName, nil) + }) + + It("should add scheduler cleanup finalizer to the RP", func() { + finalizerAddedActual := placementSchedulerFinalizerAddedActual(types.NamespacedName{Name: rpName, Namespace: testNamespace}) + Eventually(finalizerAddedActual, eventuallyDuration, eventuallyInterval).Should(Succeed(), "Failed to add scheduler cleanup finalizer to RP") + }) + + It("should not create binding initially when namespace is missing", func() { + noBindingsCreatedActual := noBindingsCreatedForClustersActual([]string{targetCluster}, types.NamespacedName{Name: rpName, Namespace: testNamespace}) + Eventually(noBindingsCreatedActual, eventuallyDuration, eventuallyInterval).Should(Succeed(), "Binding created despite missing namespace") + Consistently(noBindingsCreatedActual, consistentlyDuration, consistentlyInterval).Should(Succeed(), "Binding created despite missing namespace") + }) + + It("can add the namespace to the cluster", func() { + // Update cluster to have the namespace + setNamespaceCollectionOnCluster(targetCluster, true, map[string]string{ + testNamespace: "work-new", + "other-namespace": "work-1", + }) + }) + + It("should create binding after namespace is added", func() { + scheduledBindingsCreatedActual := scheduledBindingsCreatedOrUpdatedForClustersActual( + []string{targetCluster}, + zeroScoreByCluster, + types.NamespacedName{Name: rpName, Namespace: testNamespace}, + policySnapshotName, + ) + Eventually(scheduledBindingsCreatedActual, eventuallyDuration, eventuallyInterval).Should(Succeed(), "Failed to create binding after namespace added") + Consistently(scheduledBindingsCreatedActual, consistentlyDuration, consistentlyInterval).Should(Succeed(), "Failed to create binding after namespace added") + }) + + AfterAll(func() { + // Clean up namespace collection status + clearNamespaceCollectionOnCluster(targetCluster) + + // Delete the ResourcePlacement. 
+ ensurePlacementAndAllRelatedResourcesDeletion(types.NamespacedName{Name: rpName, Namespace: testNamespace}) + + // Delete the test namespace + ns := &corev1.Namespace{ + ObjectMeta: metav1.ObjectMeta{ + Name: testNamespace, + }, + } + _ = hubClient.Delete(ctx, ns) + }) + }) + + Context("ClusterResourcePlacement should not be affected by namespace affinity", Serial, Ordered, func() { + crpName := fmt.Sprintf(crpNameTemplate, GinkgoParallelProcess()) + policySnapshotName := fmt.Sprintf(policySnapshotNameTemplate, crpName, 1) + + // All healthy clusters should be selected regardless of namespace collection + expectedClusters := healthyClusters + + BeforeAll(func() { + // Ensure that no bindings have been created so far. + noBindingsCreatedActual := noBindingsCreatedForPlacementActual(types.NamespacedName{Name: crpName}) + Consistently(noBindingsCreatedActual, consistentlyDuration, consistentlyInterval).Should(Succeed(), "Some bindings have been created unexpectedly") + + // Set namespace collection on some clusters - should NOT affect CRP + setNamespaceCollectionOnCluster(memberCluster1EastProd, true, map[string]string{}) + setNamespaceCollectionOnCluster(memberCluster2EastProd, true, map[string]string{ + "some-namespace": "work-1", + }) + + // Create the CRP (cluster-scoped) and its associated policy snapshot. 
+ createNilSchedulingPolicyCRPWithPolicySnapshot(crpName, policySnapshotName, nil) + }) + + It("should add scheduler cleanup finalizer to the CRP", func() { + finalizerAddedActual := placementSchedulerFinalizerAddedActual(types.NamespacedName{Name: crpName}) + Eventually(finalizerAddedActual, eventuallyDuration, eventuallyInterval).Should(Succeed(), "Failed to add scheduler cleanup finalizer to CRP") + }) + + It("should create bindings for all healthy clusters", func() { + // CRP should schedule to all healthy clusters, namespace affinity should be skipped + scheduledBindingsCreatedActual := scheduledBindingsCreatedOrUpdatedForClustersActual( + expectedClusters, + zeroScoreByCluster, + types.NamespacedName{Name: crpName}, + policySnapshotName, + ) + Eventually(scheduledBindingsCreatedActual, eventuallyDuration, eventuallyInterval).Should(Succeed(), "Failed to create the expected set of bindings") + Consistently(scheduledBindingsCreatedActual, consistentlyDuration, consistentlyInterval).Should(Succeed(), "Failed to create the expected set of bindings") + }) + + AfterAll(func() { + // Clean up namespace collection status + clearNamespaceCollectionOnCluster(memberCluster1EastProd) + clearNamespaceCollectionOnCluster(memberCluster2EastProd) + + // Delete the CRP. + ensurePlacementAndAllRelatedResourcesDeletion(types.NamespacedName{Name: crpName}) + }) + }) +}) + +// setNamespaceCollectionOnCluster sets the namespace collection status on a member cluster. 
+func setNamespaceCollectionOnCluster(clusterName string, enabled bool, namespaces map[string]string) { + Eventually(func() error { + mc := &clusterv1beta1.MemberCluster{} + if err := hubClient.Get(ctx, types.NamespacedName{Name: clusterName}, mc); err != nil { + return err + } + + // Update namespaces map + mc.Status.Namespaces = namespaces + + // Update condition based on whether namespace collection is enabled + if enabled { + // Add/update the NamespaceCollectionSucceeded condition to True + meta.SetStatusCondition(&mc.Status.Conditions, metav1.Condition{ + Type: propertyprovider.NamespaceCollectionSucceededCondType, + Status: metav1.ConditionTrue, + Reason: propertyprovider.NamespaceCollectionSucceededReason, + Message: propertyprovider.NamespaceCollectionSucceededMsg, + }) + } else { + // Remove the condition entirely (namespace collection not enabled) + // This is different from ConditionFalse which means degraded/limit reached + meta.RemoveStatusCondition(&mc.Status.Conditions, propertyprovider.NamespaceCollectionSucceededCondType) + } + + return hubClient.Status().Update(ctx, mc) + }, eventuallyDuration, eventuallyInterval).Should(Succeed(), "Failed to set namespace collection on cluster %s", clusterName) +} + +// clearNamespaceCollectionOnCluster removes namespace collection status from a member cluster. 
+func clearNamespaceCollectionOnCluster(clusterName string) { + Eventually(func() error { + mc := &clusterv1beta1.MemberCluster{} + if err := hubClient.Get(ctx, types.NamespacedName{Name: clusterName}, mc); err != nil { + return err + } + + // Clear namespaces map + mc.Status.Namespaces = nil + + // Remove the NamespaceCollectionSucceeded condition + meta.RemoveStatusCondition(&mc.Status.Conditions, propertyprovider.NamespaceCollectionSucceededCondType) + + return hubClient.Status().Update(ctx, mc) + }, eventuallyDuration, eventuallyInterval).Should(Succeed(), "Failed to clear namespace collection on cluster %s", clusterName) +} diff --git a/test/scheduler/utils_test.go b/test/scheduler/utils_test.go index 318ea0f15..1748ae513 100644 --- a/test/scheduler/utils_test.go +++ b/test/scheduler/utils_test.go @@ -45,6 +45,7 @@ import ( "github.com/kubefleet-dev/kubefleet/pkg/scheduler/framework" "github.com/kubefleet-dev/kubefleet/pkg/scheduler/framework/plugins/clusteraffinity" "github.com/kubefleet-dev/kubefleet/pkg/scheduler/framework/plugins/clustereligibility" + "github.com/kubefleet-dev/kubefleet/pkg/scheduler/framework/plugins/namespaceaffinity" "github.com/kubefleet-dev/kubefleet/pkg/scheduler/framework/plugins/sameplacementaffinity" "github.com/kubefleet-dev/kubefleet/pkg/scheduler/framework/plugins/tainttoleration" "github.com/kubefleet-dev/kubefleet/pkg/scheduler/framework/plugins/topologyspreadconstraints" @@ -137,6 +138,7 @@ func buildSchedulerFramework(ctrlMgr manager.Manager, clusterEligibilityChecker taintTolerationPlugin := tainttoleration.New() samePlacementAffinityPlugin := sameplacementaffinity.New() topologyspreadconstraintsPlugin := topologyspreadconstraints.New() + namespaceAffinityPlugin := namespaceaffinity.New() profile. // Register cluster affinity plugin. WithPreFilterPlugin(&clusterAffinityPlugin). @@ -150,6 +152,9 @@ func buildSchedulerFramework(ctrlMgr manager.Manager, clusterEligibilityChecker // Register same placement affinity plugin. 
WithFilterPlugin(&samePlacementAffinityPlugin). WithScorePlugin(&samePlacementAffinityPlugin). + // Register namespace affinity plugin. + WithPreFilterPlugin(&namespaceAffinityPlugin). + WithFilterPlugin(&namespaceAffinityPlugin). // Register topology spread constraints plugin. WithPostBatchPlugin(&topologyspreadconstraintsPlugin). WithPreFilterPlugin(&topologyspreadconstraintsPlugin). From 4c4f848df6375d91290a123484269e5c376f11bd Mon Sep 17 00:00:00 2001 From: michaelawyu Date: Sat, 28 Feb 2026 00:16:28 +0800 Subject: [PATCH 7/9] feat: refactor hub agent flag handling logic + add per-flag validation logic (#459) --- cmd/hubagent/main.go | 34 +- cmd/hubagent/options/clustermgmt.go | 111 ++++++ cmd/hubagent/options/ctrlmanager.go | 178 +++++++++ cmd/hubagent/options/featureflags.go | 116 ++++++ cmd/hubagent/options/leaderelection.go | 98 +++++ cmd/hubagent/options/options.go | 191 ++-------- .../options/placementctrlsetratelimit.go | 184 +++++++++ cmd/hubagent/options/placementmgmt.go | 352 ++++++++++++++++++ cmd/hubagent/options/ratelimit.go | 69 ---- cmd/hubagent/options/validation.go | 50 ++- cmd/hubagent/options/validation_test.go | 98 ++--- cmd/hubagent/options/webhooks.go | 133 +++++++ cmd/hubagent/workload/setup.go | 54 +-- go.mod | 1 - pkg/utils/apiresources.go | 2 +- pkg/webhook/webhook.go | 12 +- pkg/webhook/webhook_test.go | 40 +- 17 files changed, 1345 insertions(+), 378 deletions(-) create mode 100644 cmd/hubagent/options/clustermgmt.go create mode 100644 cmd/hubagent/options/ctrlmanager.go create mode 100644 cmd/hubagent/options/featureflags.go create mode 100644 cmd/hubagent/options/leaderelection.go create mode 100644 cmd/hubagent/options/placementctrlsetratelimit.go create mode 100644 cmd/hubagent/options/placementmgmt.go delete mode 100644 cmd/hubagent/options/ratelimit.go create mode 100644 cmd/hubagent/options/webhooks.go diff --git a/cmd/hubagent/main.go b/cmd/hubagent/main.go index 2e67bd2b7..dfe7299ce 100644 --- a/cmd/hubagent/main.go +++ 
b/cmd/hubagent/main.go @@ -102,29 +102,31 @@ func main() { ctrl.SetLogger(zap.New(zap.UseDevMode(true))) config := ctrl.GetConfigOrDie() - config.QPS, config.Burst = float32(opts.HubQPS), opts.HubBurst + config.QPS, config.Burst = float32(opts.CtrlMgrOpts.HubQPS), opts.CtrlMgrOpts.HubBurst mgrOpts := ctrl.Options{ Scheme: scheme, Cache: cache.Options{ - SyncPeriod: &opts.ResyncPeriod.Duration, + SyncPeriod: &opts.CtrlMgrOpts.ResyncPeriod.Duration, DefaultTransform: cache.TransformStripManagedFields(), }, - LeaderElection: opts.LeaderElection.LeaderElect, - LeaderElectionID: opts.LeaderElection.ResourceName, - LeaderElectionNamespace: opts.LeaderElection.ResourceNamespace, - LeaderElectionResourceLock: opts.LeaderElection.ResourceLock, - HealthProbeBindAddress: opts.HealthProbeAddress, + LeaderElection: opts.LeaderElectionOpts.LeaderElect, + LeaderElectionID: "136224848560.hub.fleet.azure.com", + LeaderElectionNamespace: opts.LeaderElectionOpts.ResourceNamespace, + LeaseDuration: &opts.LeaderElectionOpts.LeaseDuration.Duration, + RenewDeadline: &opts.LeaderElectionOpts.RenewDeadline.Duration, + RetryPeriod: &opts.LeaderElectionOpts.RetryPeriod.Duration, + HealthProbeBindAddress: opts.CtrlMgrOpts.HealthProbeBindAddress, Metrics: metricsserver.Options{ - BindAddress: opts.MetricsBindAddress, + BindAddress: opts.CtrlMgrOpts.MetricsBindAddress, }, WebhookServer: ctrlwebhook.NewServer(ctrlwebhook.Options{ Port: FleetWebhookPort, CertDir: webhook.FleetWebhookCertDir, }), } - if opts.EnablePprof { - mgrOpts.PprofBindAddress = fmt.Sprintf(":%d", opts.PprofPort) + if opts.CtrlMgrOpts.EnablePprof { + mgrOpts.PprofBindAddress = fmt.Sprintf(":%d", opts.CtrlMgrOpts.PprofPort) } mgr, err := ctrl.NewManager(config, mgrOpts) if err != nil { @@ -133,13 +135,13 @@ func main() { } klog.V(2).InfoS("starting hubagent") - if opts.EnableV1Beta1APIs { + if opts.FeatureFlags.EnableV1Beta1APIs { klog.Info("Setting up memberCluster v1beta1 controller") if err = (&mcv1beta1.Reconciler{ Client: 
mgr.GetClient(), - NetworkingAgentsEnabled: opts.NetworkingAgentsEnabled, - MaxConcurrentReconciles: int(math.Ceil(float64(opts.MaxFleetSizeSupported) / 100)), //one member cluster reconciler routine per 100 member clusters - ForceDeleteWaitTime: opts.ForceDeleteWaitTime.Duration, + NetworkingAgentsEnabled: opts.ClusterMgmtOpts.NetworkingAgentsEnabled, + MaxConcurrentReconciles: int(math.Ceil(float64(opts.PlacementMgmtOpts.MaxFleetSize) / 100)), //one member cluster reconciler routine per 100 member clusters + ForceDeleteWaitTime: opts.ClusterMgmtOpts.ForceDeleteWaitTime.Duration, }).SetupWithManager(mgr, "membercluster-controller"); err != nil { klog.ErrorS(err, "unable to create v1beta1 controller", "controller", "MemberCluster") exitWithErrorFunc() @@ -155,7 +157,7 @@ func main() { exitWithErrorFunc() } - if opts.EnableWebhook { + if opts.WebhookOpts.EnableWebhooks { // Generate webhook configuration with certificates webhookConfig, err := webhook.NewWebhookConfigFromOptions(mgr, opts, FleetWebhookPort) if err != nil { @@ -172,7 +174,7 @@ func main() { // When using cert-manager, add a readiness check to ensure CA bundles are injected before marking ready. // This prevents the pod from accepting traffic before cert-manager has populated the webhook CA bundles, // which would cause webhook calls to fail. - if opts.UseCertManager { + if opts.WebhookOpts.UseCertManager { if err := mgr.AddReadyzCheck("cert-manager-ca-injection", func(req *http.Request) error { return webhookConfig.CheckCAInjection(req.Context()) }); err != nil { diff --git a/cmd/hubagent/options/clustermgmt.go b/cmd/hubagent/options/clustermgmt.go new file mode 100644 index 000000000..307340488 --- /dev/null +++ b/cmd/hubagent/options/clustermgmt.go @@ -0,0 +1,111 @@ +/* +Copyright 2025 The KubeFleet Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package options + +import ( + "flag" + "fmt" + "time" + + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +// ClusterManagementOptions is a set of options the KubeFleet hub agent exposes for +// managing member clusters. +type ClusterManagementOptions struct { + // Expect that Fleet networking agents have been installed in the fleet or not. If set to true, + // the hub agent will start to expect heartbeats from the networking agents on the member cluster related + // resources. + NetworkingAgentsEnabled bool + + // The duration the KubeFleet hub agent will wait for new heartbeats before marking a member cluster as unhealthy. + UnhealthyThreshold metav1.Duration + + // The duration the KubeFleet hub agent will wait before force-deleting a member cluster resource after it has been + // marked for deletion. + ForceDeleteWaitTime metav1.Duration +} + +// AddFlags adds flags for ClusterManagementOptions to the specified FlagSet. +func (o *ClusterManagementOptions) AddFlags(flags *flag.FlagSet) { + flags.BoolVar( + &o.NetworkingAgentsEnabled, + "networking-agents-enabled", + false, + "Expect that Fleet networking agents have been installed in the fleet or not. If set to true, the hub agent will start to expect heartbeats from the networking agents on the member cluster related resources.", + ) + + flags.Var( + newClusterUnhealthyThresholdValueWithValidation(60*time.Second, &o.UnhealthyThreshold), + "cluster-unhealthy-threshold", + "The duration the KubeFleet hub agent will wait for new heartbeats before marking a member cluster as unhealthy. Defaults to 60 seconds. 
Must be a duration in the range [30s, 1h].", + ) + + flags.Var( + newForceDeleteWaitTimeValueWithValidation(15*time.Minute, &o.ForceDeleteWaitTime), + "force-delete-wait-time", + "The duration the KubeFleet hub agent will wait before force-deleting a member cluster resource after it has been marked for deletion. Defaults to 15 minutes. Must be a duration in the range [30s, 1h].", + ) +} + +// A list of flag variables that allow pluggable validation logic when parsing the input args. + +type ClusterUnhealthyThresholdValueWithValidation metav1.Duration + +func (v *ClusterUnhealthyThresholdValueWithValidation) String() string { + return v.Duration.String() +} + +func (v *ClusterUnhealthyThresholdValueWithValidation) Set(s string) error { + duration, err := time.ParseDuration(s) + if err != nil { + return fmt.Errorf("failed to parse duration: %w", err) + } + if duration < 30*time.Second || duration > time.Hour { + return fmt.Errorf("duration must be in the range [30s, 1h]") + } + v.Duration = duration + return nil +} + +func newClusterUnhealthyThresholdValueWithValidation(defaultVal time.Duration, p *metav1.Duration) *ClusterUnhealthyThresholdValueWithValidation { + p.Duration = defaultVal + return (*ClusterUnhealthyThresholdValueWithValidation)(p) +} + +type ForceDeleteWaitTimeValueWithValidation metav1.Duration + +func (v *ForceDeleteWaitTimeValueWithValidation) String() string { + return v.Duration.String() +} + +func (v *ForceDeleteWaitTimeValueWithValidation) Set(s string) error { + duration, err := time.ParseDuration(s) + if err != nil { + return fmt.Errorf("failed to parse duration: %w", err) + } + if duration < 30*time.Second || duration > time.Hour { + return fmt.Errorf("duration must be in the range [30s, 1h]") + } + v.Duration = duration + return nil +} + +func newForceDeleteWaitTimeValueWithValidation(defaultVal time.Duration, p *metav1.Duration) *ForceDeleteWaitTimeValueWithValidation { + p.Duration = defaultVal + return 
(*ForceDeleteWaitTimeValueWithValidation)(p) +} diff --git a/cmd/hubagent/options/ctrlmanager.go b/cmd/hubagent/options/ctrlmanager.go new file mode 100644 index 000000000..23a83da0f --- /dev/null +++ b/cmd/hubagent/options/ctrlmanager.go @@ -0,0 +1,178 @@ +/* +Copyright 2026 The KubeFleet Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package options + +import ( + "flag" + "fmt" + "strconv" + "time" + + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +// ControllerManagerOptions is a set of options the KubeFleet hub agent exposes for +// controlling the controller manager behavior. +type ControllerManagerOptions struct { + // The TCP address that the hub agent controller manager would bind to + // serve health probes. It can be set to "0" or the empty value to disable the health probe server. + HealthProbeBindAddress string + + // The TCP address that the hub agent controller manager should bind to serve prometheus metrics. + // It can be set to "0" or the empty value to disable the metrics server. + MetricsBindAddress string + + // Enable the pprof server for profiling the hub agent controller manager or not. + EnablePprof bool + + // The port in use by the pprof server for profiling the hub agent controller manager. + PprofPort int + + // The QPS limit set to the rate limiter of the Kubernetes client in use by the controller manager + // and all of its managed controller, for client-side throttling purposes. 
+ HubQPS float64 + + // The burst limit set to the rate limiter of the Kubernetes client in use by the controller manager + // and all of its managed controller, for client-side throttling purposes. + HubBurst int + + // The duration for the informers in the controller manager to resync. + ResyncPeriod metav1.Duration +} + +// AddFlags adds flags for ControllerManagerOptions to the specified FlagSet. +func (o *ControllerManagerOptions) AddFlags(flags *flag.FlagSet) { + // This input is sent to the controller manager for validation; no further check here. + flags.StringVar( + &o.HealthProbeBindAddress, + "health-probe-bind-address", + ":8081", + "The address (and port) on which the controller manager serves health probe requests. Set to '0' or empty value to disable the health probe server. Defaults to ':8081'.") + + // This input is sent to the controller manager for validation; no further check here. + flags.StringVar( + &o.MetricsBindAddress, + "metrics-bind-address", + ":8080", + "The address (and port) on which the controller manager serves prometheus metrics. Set to '0' or empty value to disable the metrics server. Defaults to ':8080'.") + + flags.BoolVar( + &o.EnablePprof, + "enable-pprof", + false, + "Enable the pprof server for profiling the hub agent controller manager or not.", + ) + + // This input is sent to the controller manager for validation; no further check here. + flags.IntVar( + &o.PprofPort, + "pprof-port", + 6065, + "The port that the agent will listen for serving pprof profiling requests. This option only applies if the pprof server is enabled. Defaults to 6065.", + ) + + flags.Var(newHubQPSValueWithValidation(250.0, &o.HubQPS), "hub-api-qps", "The QPS limit set to the rate limiter of the Kubernetes client in use by the controller manager and all of its managed controller, for client-side throttling purposes. Defaults to 250. 
Use a positive float64 value in the range [10.0, 10000.0], or set a negative value to disable client-side throttling.")
+
+	flags.Var(newHubBurstValueWithValidation(1000, &o.HubBurst), "hub-api-burst", "The burst limit set to the rate limiter of the Kubernetes client in use by the controller manager and all of its managed controller, for client-side throttling purposes. Defaults to 1000. Must be a positive value in the range [10, 20000], and it should be no less than the QPS limit.")
+
+	flags.Var(newResyncPeriodValueWithValidation(6*time.Hour, &o.ResyncPeriod), "resync-period", "The duration for the informers in the controller manager to resync. Defaults to 6 hours. Must be a duration in the range [1h, 12h].")
+}
+
+// A list of flag variables that allow pluggable validation logic when parsing the input args.
+
+type HubQPSValueWithValidation float64
+
+func (v *HubQPSValueWithValidation) String() string {
+	return fmt.Sprintf("%f", *v)
+}
+
+func (v *HubQPSValueWithValidation) Set(s string) error {
+	// Some validation is also performed on the controller manager side and the client-go side. Just
+	// to be on the safer side we also impose some limits here.
+	qps, err := strconv.ParseFloat(s, 64)
+	if err != nil {
+		return fmt.Errorf("failed to parse float64 value: %w", err)
+	}
+
+	if qps < 0.0 {
+		// Disable client-side throttling. 
+ *v = -1.0 + return nil + } + + if qps < 10.0 || qps > 10000.0 { + return fmt.Errorf("QPS limit is set to an invalid value (%f), must be a value in the range [10.0, 10000.0]", qps) + } + *v = HubQPSValueWithValidation(qps) + return nil +} + +func newHubQPSValueWithValidation(defaultVal float64, p *float64) *HubQPSValueWithValidation { + *p = defaultVal + return (*HubQPSValueWithValidation)(p) +} + +type HubBurstValueWithValidation int + +func (v *HubBurstValueWithValidation) String() string { + return fmt.Sprintf("%d", *v) +} + +func (v *HubBurstValueWithValidation) Set(s string) error { + // Some validation is also performed on the controller manager side and the client-go side. Just + // to be on the safer side we also impose some limits here. + burst, err := strconv.Atoi(s) + if err != nil { + return fmt.Errorf("failed to parse int value: %w", err) + } + + if burst < 10 || burst > 20000 { + return fmt.Errorf("burst limit is set to an invalid value (%d), must be a value in the range [10, 20000]", burst) + } + *v = HubBurstValueWithValidation(burst) + return nil +} + +func newHubBurstValueWithValidation(defaultVal int, p *int) *HubBurstValueWithValidation { + *p = defaultVal + return (*HubBurstValueWithValidation)(p) +} + +type ResyncPeriodValueWithValidation metav1.Duration + +func (v *ResyncPeriodValueWithValidation) String() string { + return v.Duration.String() +} + +func (v *ResyncPeriodValueWithValidation) Set(s string) error { + // Some validation is also performed on the controller manager side. Just + // to be on the safer side we also impose some limits here. 
+ dur, err := time.ParseDuration(s) + if err != nil { + return fmt.Errorf("failed to parse duration value: %w", err) + } + if dur < time.Hour || dur > 12*time.Hour { + return fmt.Errorf("resync period is set to an invalid value (%s), must be a value in the range [1h, 12h]", s) + } + v.Duration = dur + return nil +} + +func newResyncPeriodValueWithValidation(defaultVal time.Duration, p *metav1.Duration) *ResyncPeriodValueWithValidation { + p.Duration = defaultVal + return (*ResyncPeriodValueWithValidation)(p) +} diff --git a/cmd/hubagent/options/featureflags.go b/cmd/hubagent/options/featureflags.go new file mode 100644 index 000000000..00b0d6e6b --- /dev/null +++ b/cmd/hubagent/options/featureflags.go @@ -0,0 +1,116 @@ +/* +Copyright 2025 The KubeFleet Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package options + +import ( + "flag" + "fmt" + "strconv" +) + +// FeatureFlags is a set of feature flags the KubeFleet hub agent exposes. +type FeatureFlags struct { + // Enable the hub agent to watch the KubeFleet v1beta1 APIs or not. This flag is kept only for + // compatibility reasons; it has no effect at this moment, as KubeFleet v1alpha1 APIs have + // been removed and the v1beta1 APIs are the storage version in use. + EnableV1Beta1APIs bool + + // Enable the ClusterInventory API support in the KubeFleet hub agent or not. 
+ // + // ClusterInventory APIs are a set of Kubernetes Multi-Cluster SIG standard APIs for discovering + // currently registered member clusters in a multi-cluster management platform. + EnableClusterInventoryAPIs bool + + // Enable the StagedUpdateRun API support in the KubeFleet hub agent or not. + // + // StagedUpdateRun APIs are a set of KubeFleet APIs for progressively updating resource placements. + EnableStagedUpdateRunAPIs bool + + // Enable the Eviction API support in the KubeFleet hub agent or not. + // + // Eviction APIs are a set of KubeFleet APIs for evicting resource placements from member clusters + // with minimal disruptions. + EnableEvictionAPIs bool + + // Enable the ResourcePlacement API support in the KubeFleet hub agent or not. + // + // ResourcePlacement APIs are a set of KubeFleet APIs for processing namespace scoped resource placements. + // This flag does not concern the cluster-scoped placement APIs (`ClusterResourcePlacement` and its related APIs). + EnableResourcePlacementAPIs bool +} + +// AddFlags adds flags for FeatureFlags to the specified FlagSet. 
+func (o *FeatureFlags) AddFlags(flags *flag.FlagSet) { + flags.Var( + newEnableV1Beta1APIsValueWithValidation(true, &o.EnableV1Beta1APIs), + "enable-v1beta1-apis", + "Enable the hub agent to watch the KubeFleet v1beta1 APIs or not.", + ) + + flags.BoolVar( + &o.EnableClusterInventoryAPIs, + "enable-cluster-inventory-apis", + true, + "Enable the ClusterInventory API support in the KubeFleet hub agent or not.", + ) + + flags.BoolVar( + &o.EnableStagedUpdateRunAPIs, + "enable-staged-update-run-apis", + true, + "Enable the StagedUpdateRun API support in the KubeFleet hub agent or not.", + ) + + flags.BoolVar( + &o.EnableEvictionAPIs, + "enable-eviction-apis", + true, + "Enable the Eviction API support in the KubeFleet hub agent or not.", + ) + + flags.BoolVar( + &o.EnableResourcePlacementAPIs, + "enable-resource-placement", + true, + "Enable the ResourcePlacement API support (for namespace-scoped placements) in the KubeFleet hub agent or not.", + ) +} + +// A list of flag variables that allow pluggable validation logic when parsing the input args. 
+ +type EnableV1Beta1APIsValueWithValidation bool + +func (v *EnableV1Beta1APIsValueWithValidation) String() string { + return fmt.Sprintf("%t", *v) +} + +func (v *EnableV1Beta1APIsValueWithValidation) Set(s string) error { + enabled, err := strconv.ParseBool(s) + if err != nil { + return fmt.Errorf("failed to parse bool value: %w", err) + } + if !enabled { + return fmt.Errorf("the KubeFleet v1beta1 APIs are the storage version and must be enabled") + } + *v = EnableV1Beta1APIsValueWithValidation(enabled) + return nil +} + +func newEnableV1Beta1APIsValueWithValidation(defaultVal bool, p *bool) *EnableV1Beta1APIsValueWithValidation { + *p = defaultVal + return (*EnableV1Beta1APIsValueWithValidation)(p) +} diff --git a/cmd/hubagent/options/leaderelection.go b/cmd/hubagent/options/leaderelection.go new file mode 100644 index 000000000..a16237582 --- /dev/null +++ b/cmd/hubagent/options/leaderelection.go @@ -0,0 +1,98 @@ +/* +Copyright 2026 The KubeFleet Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package options + +import ( + "flag" + "time" + + "github.com/kubefleet-dev/kubefleet/pkg/utils" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +// LeaderElectionOptions is a set of options the KubeFleet hub agent exposes for controlling +// the leader election behaviors. +// +// Only a subset of leader election options supported by the controller manager is added here, +// for simplicity reasons. +type LeaderElectionOptions struct { + // Enable leader election or not. 
This helps ensure that there is only one active + // hub agent controller manager when multiple instances of the hub agent are running. + LeaderElect bool + + // The duration of a leader election lease. This is the period where a non-leader candidate + // will wait after observing a leadership renewal before attempting to acquire leadership of the + // current leader. And it is also effectively the maximum duration that a leader can be stopped + // before it is replaced by another candidate. The option only applies if leader election is enabled. + LeaseDuration metav1.Duration + + // The interval between attempts by the acting master to renew a leadership slot + // before it stops leading. This must be less than or equal to the lease duration. + // The option only applies if leader election is enabled. + RenewDeadline metav1.Duration + + // The duration the clients should wait between attempting acquisition and renewal of a + // leadership. The option only applies if leader election is enabled. + RetryPeriod metav1.Duration + + // The namespace of the resource object that will be used to lock during leader election cycles. + // This option only applies if leader election is enabled. + ResourceNamespace string +} + +// AddFlags adds flags for LeaderElectionOptions to the specified FlagSet. +func (o *LeaderElectionOptions) AddFlags(flags *flag.FlagSet) { + flags.BoolVar( + &o.LeaderElect, + "leader-elect", + // Note: this should be overridden to true even in cases where the hub agent deployment runs with only + // one replica, as this can help ensure system correctness during rolling updates. + false, + "Enable a leader election client to gain leadership before the hub agent controller manager starts to run or not.") + + // This input is sent to the controller manager for validation; no further check here. + flags.DurationVar( + &o.LeaseDuration.Duration, + "leader-lease-duration", + 15*time.Second, + "The duration of a leader election lease. 
This is the period where a non-leader candidate will wait after observing a leadership renewal before attempting to acquire leadership of the current leader. And it is also effectively the maximum duration that a leader can be stopped before it is replaced by another candidate. The option only applies if leader election is enabled.", + ) + + // This input is sent to the controller manager for validation; no further check here. + flags.DurationVar( + &o.RenewDeadline.Duration, + "leader-renew-deadline", + 10*time.Second, + "The interval between attempts by the acting master to renew a leadership slot before it stops leading. This must be less than or equal to the lease duration. The option only applies if leader election is enabled", + ) + + // This input is sent to the controller manager for validation; no further check here. + flags.DurationVar( + &o.RetryPeriod.Duration, + "leader-retry-period", + 2*time.Second, + "The duration the clients should wait between attempting acquisition and renewal of a leadership. The option only applies if leader election is enabled", + ) + + // This input is sent to the controller manager for validation; no further check here. + flags.StringVar( + &o.ResourceNamespace, + "leader-election-namespace", + utils.FleetSystemNamespace, + "The namespace of the resource object that will be used to lock during leader election cycles. The option only applies if leader election is enabled.", + ) +} diff --git a/cmd/hubagent/options/options.go b/cmd/hubagent/options/options.go index 7568ae7cc..d75404172 100644 --- a/cmd/hubagent/options/options.go +++ b/cmd/hubagent/options/options.go @@ -18,179 +18,38 @@ package options import ( "flag" - "time" - - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/client-go/tools/leaderelection/resourcelock" - componentbaseconfig "k8s.io/component-base/config" - - "github.com/kubefleet-dev/kubefleet/pkg/utils" ) -// Options contains everything necessary to create and run controller-manager. 
+// Options is the options to use for running the KubeFleet hub agent. type Options struct { - // Controllers is the list of controllers to enable or disable - // '*' means "all enabled by default controllers" - // 'foo' means "enable 'foo'" - // '-foo' means "disable 'foo'" - // first item for a particular name wins - Controllers []string - // LeaderElection defines the configuration of leader election client. - LeaderElection componentbaseconfig.LeaderElectionConfiguration - // HealthProbeAddress is the TCP address that the is used to serve the heath probes from k8s - HealthProbeAddress string - // MetricsBindAddress is the TCP address that the controller should bind to - // for serving prometheus metrics. - // It can be set to "0" to disable the metrics serving. - // Defaults to ":8080". - MetricsBindAddress string - // EnableWebhook indicates if we will run a webhook - EnableWebhook bool - // Webhook service name - WebhookServiceName string - // EnableGuardRail indicates if we will enable fleet guard rail webhook configurations. - EnableGuardRail bool - // WhiteListedUsers indicates the list of user who are allowed to modify fleet resources - WhiteListedUsers string - // Sets the connection type for the webhook. - WebhookClientConnectionType string - // NetworkingAgentsEnabled indicates if we enable network agents - NetworkingAgentsEnabled bool - // ClusterUnhealthyThreshold is the duration of failure for the cluster to be considered unhealthy. - ClusterUnhealthyThreshold metav1.Duration - // WorkPendingGracePeriod represents the grace period after a work is created/updated. - // We consider a work failed if a work's last applied condition doesn't change after period. - WorkPendingGracePeriod metav1.Duration - // SkippedPropagatingAPIs and AllowedPropagatingAPIs options are used to control the propagation of resources. - // If none of them are set, the default skippedPropagatingAPIs list will be used. 
- // SkippedPropagatingAPIs indicates semicolon separated resources that should be skipped for propagating. - SkippedPropagatingAPIs string - // AllowedPropagatingAPIs indicates semicolon separated resources that should be allowed for propagating. - // This is mutually exclusive with SkippedPropagatingAPIs. - AllowedPropagatingAPIs string - // SkippedPropagatingNamespaces is a list of namespaces that will be skipped for propagating. - SkippedPropagatingNamespaces string - // HubQPS is the QPS to use while talking with hub-apiserver. Default is 20.0. - HubQPS float64 - // HubBurst is the burst to allow while talking with hub-apiserver. Default is 100. - HubBurst int - // ResyncPeriod is the base frequency the informers are resynced. Defaults is 5 minutes. - ResyncPeriod metav1.Duration - // MaxConcurrentClusterPlacement is the number of cluster placement that are allowed to run concurrently. - MaxConcurrentClusterPlacement int - // ConcurrentResourceChangeSyncs is the number of resource change reconcilers that are allowed to sync concurrently. - ConcurrentResourceChangeSyncs int - // MaxFleetSizeSupported is the max number of member clusters this fleet supports. - // We will set the max concurrency of related reconcilers (membercluster, rollout,workgenerator) - // according to this value. - MaxFleetSizeSupported int - // RateLimiterOpts is the ratelimit parameters for the work queue - RateLimiterOpts RateLimitOptions - // EnableV1Alpha1APIs enables the agents to watch the v1alpha1 CRs. - // TODO(weiweng): remove this field soon. Only kept for backward compatibility. - EnableV1Alpha1APIs bool - // EnableV1Beta1APIs enables the agents to watch the v1beta1 CRs. - EnableV1Beta1APIs bool - // EnableClusterInventoryAPIs enables the agents to watch the cluster inventory CRs. - EnableClusterInventoryAPIs bool - // ForceDeleteWaitTime is the duration the hub agent waits before force deleting a member cluster. 
- ForceDeleteWaitTime metav1.Duration - // EnableStagedUpdateRunAPIs enables the agents to watch the clusterStagedUpdateRun CRs. - EnableStagedUpdateRunAPIs bool - // EnableEvictionAPIs enables to agents to watch the eviction and placement disruption budget CRs. - EnableEvictionAPIs bool - // EnableResourcePlacement enables the agents to watch the ResourcePlacement APIs. - EnableResourcePlacement bool - // EnablePprof enables the pprof profiling. - EnablePprof bool - // PprofPort is the port for pprof profiling. - PprofPort int - // DenyModifyMemberClusterLabels indicates if the member cluster labels cannot be modified by groups (excluding system:masters) - DenyModifyMemberClusterLabels bool - // EnableWorkload enables workload resources (pods and replicasets) to be created in the hub cluster. - // When set to true, the pod and replicaset validating webhooks are disabled. - EnableWorkload bool - // UseCertManager indicates whether to use cert-manager for webhook certificate management. - // When enabled, webhook certificates are managed by cert-manager instead of self-signed generation. - UseCertManager bool - // ResourceSnapshotCreationMinimumInterval is the minimum interval at which resource snapshots could be created. - // Whether the resource snapshot is created or not depends on the both ResourceSnapshotCreationMinimumInterval and ResourceChangesCollectionDuration. - ResourceSnapshotCreationMinimumInterval time.Duration - // ResourceChangesCollectionDuration is the duration for collecting resource changes into one snapshot. - ResourceChangesCollectionDuration time.Duration + // Leader election related options. + LeaderElectionOpts LeaderElectionOptions + + // Options that concern the setup of the controller manager instance in use by the KubeFleet hub agent. + CtrlMgrOpts ControllerManagerOptions + + // KubeFleet webhook related options. + WebhookOpts WebhookOptions + + // Feature flags that control the enabling of certain features in the hub agent. 
+ FeatureFlags FeatureFlags + + // Options that fine-tune how KubeFleet hub agent manages member clusters in the fleet. + ClusterMgmtOpts ClusterManagementOptions + + // Options that fine-tune how KubeFleet hub agent manages resources placements in the fleet. + PlacementMgmtOpts PlacementManagementOptions } -// NewOptions builds an empty options. func NewOptions() *Options { - return &Options{ - LeaderElection: componentbaseconfig.LeaderElectionConfiguration{ - LeaderElect: true, - ResourceLock: resourcelock.LeasesResourceLock, - ResourceNamespace: utils.FleetSystemNamespace, - ResourceName: "136224848560.hub.fleet.azure.com", - }, - MaxConcurrentClusterPlacement: 10, - ConcurrentResourceChangeSyncs: 1, - MaxFleetSizeSupported: 100, - EnableV1Alpha1APIs: false, - EnableClusterInventoryAPIs: true, - EnableStagedUpdateRunAPIs: true, - EnableResourcePlacement: true, - EnablePprof: false, - PprofPort: 6065, - ResourceSnapshotCreationMinimumInterval: 30 * time.Second, - ResourceChangesCollectionDuration: 15 * time.Second, - } + return &Options{} } -// AddFlags adds flags to the specified FlagSet. func (o *Options) AddFlags(flags *flag.FlagSet) { - flags.StringVar(&o.HealthProbeAddress, "health-probe-bind-address", ":8081", - "The IP address on which to listen for the --secure-port port.") - flags.StringVar(&o.MetricsBindAddress, "metrics-bind-address", ":8080", "The TCP address that the controller should bind to for serving prometheus metrics(e.g. 127.0.0.1:8088, :8088)") - flags.BoolVar(&o.LeaderElection.LeaderElect, "leader-elect", false, "Start a leader election client and gain leadership before executing the main loop. 
Enable this when running replicated components for high availability.") - flags.DurationVar(&o.LeaderElection.LeaseDuration.Duration, "leader-lease-duration", 15*time.Second, "This is effectively the maximum duration that a leader can be stopped before someone else will replace it.") - flag.StringVar(&o.LeaderElection.ResourceNamespace, "leader-election-namespace", utils.FleetSystemNamespace, "The namespace in which the leader election resource will be created.") - flag.BoolVar(&o.EnableWebhook, "enable-webhook", true, "If set, the fleet webhook is enabled.") - // set a default value 'fleetwebhook' for webhook service name for backward compatibility. The service name was hard coded to 'fleetwebhook' in the past. - flag.StringVar(&o.WebhookServiceName, "webhook-service-name", "fleetwebhook", "Fleet webhook service name.") - flag.BoolVar(&o.EnableGuardRail, "enable-guard-rail", false, "If set, the fleet guard rail webhook configurations are enabled.") - flag.StringVar(&o.WhiteListedUsers, "whitelisted-users", "", "If set, white listed users can modify fleet related resources.") - flag.StringVar(&o.WebhookClientConnectionType, "webhook-client-connection-type", "url", "Sets the connection type used by the webhook client. Only URL or Service is valid.") - flag.BoolVar(&o.NetworkingAgentsEnabled, "networking-agents-enabled", false, "Whether the networking agents are enabled or not.") - flags.DurationVar(&o.ClusterUnhealthyThreshold.Duration, "cluster-unhealthy-threshold", 60*time.Second, "The duration for a member cluster to be in a degraded state before considered unhealthy.") - flags.DurationVar(&o.WorkPendingGracePeriod.Duration, "work-pending-grace-period", 15*time.Second, - "Specifies the grace period of allowing a manifest to be pending before marking it as failed.") - flags.StringVar(&o.AllowedPropagatingAPIs, "allowed-propagating-apis", "", "Semicolon separated resources that should be allowed for propagation. 
Supported formats are:\n"+ - " for allowing resources with a specific API group(e.g. networking.k8s.io),\n"+ - "/ for allowing resources with a specific API version(e.g. networking.k8s.io/v1beta1),\n"+ - "//, for allowing one or more specific resources (e.g. networking.k8s.io/v1beta1/Ingress,IngressClass) where the Kinds are case-insensitive.") - flags.StringVar(&o.SkippedPropagatingAPIs, "skipped-propagating-apis", "", "Semicolon separated resources that should be skipped from propagating in addition to the default skip list(cluster.fleet.io;policy.fleet.io;work.fleet.io). Supported formats are:\n"+ - " for skip resources with a specific API group(e.g. networking.k8s.io),\n"+ - "/ for skip resources with a specific API version(e.g. networking.k8s.io/v1beta1),\n"+ - "//, for skip one or more specific resource(e.g. networking.k8s.io/v1beta1/Ingress,IngressClass) where the kinds are case-insensitive.") - flags.StringVar(&o.SkippedPropagatingNamespaces, "skipped-propagating-namespaces", "", - "Comma-separated namespaces that should be skipped from propagating in addition to the default skipped namespaces(fleet-system, namespaces prefixed by kube- and fleet-work-).") - flags.Float64Var(&o.HubQPS, "hub-api-qps", 250, "QPS to use while talking with fleet-apiserver. Doesn't cover events and node heartbeat apis which rate limiting is controlled by a different set of flags.") - flags.IntVar(&o.HubBurst, "hub-api-burst", 1000, "Burst to use while talking with fleet-apiserver. 
Doesn't cover events and node heartbeat apis which rate limiting is controlled by a different set of flags.") - flags.DurationVar(&o.ResyncPeriod.Duration, "resync-period", 6*time.Hour, "Base frequency the informers are resynced.") - flags.IntVar(&o.MaxConcurrentClusterPlacement, "max-concurrent-cluster-placement", 100, "The max number of concurrent cluster placement to run concurrently.") - flags.IntVar(&o.ConcurrentResourceChangeSyncs, "concurrent-resource-change-syncs", 20, "The number of resourceChange reconcilers that are allowed to run concurrently.") - flags.IntVar(&o.MaxFleetSizeSupported, "max-fleet-size", 100, "The max number of member clusters supported in this fleet") - flags.BoolVar(&o.EnableV1Alpha1APIs, "enable-v1alpha1-apis", false, "If set, the agents will watch for the v1alpha1 APIs.") - flags.BoolVar(&o.EnableV1Beta1APIs, "enable-v1beta1-apis", true, "If set, the agents will watch for the v1beta1 APIs.") - flags.BoolVar(&o.EnableClusterInventoryAPIs, "enable-cluster-inventory-apis", true, "If set, the agents will watch for the ClusterInventory APIs.") - flags.DurationVar(&o.ForceDeleteWaitTime.Duration, "force-delete-wait-time", 15*time.Minute, "The duration the hub agent waits before force deleting a member cluster.") - flags.BoolVar(&o.EnableStagedUpdateRunAPIs, "enable-staged-update-run-apis", true, "If set, the agents will watch for the ClusterStagedUpdateRun APIs.") - flags.BoolVar(&o.EnableEvictionAPIs, "enable-eviction-apis", true, "If set, the agents will watch for the Eviction and PlacementDisruptionBudget APIs.") - flags.BoolVar(&o.EnableResourcePlacement, "enable-resource-placement", true, "If set, the agents will watch for the ResourcePlacement APIs.") - flags.BoolVar(&o.EnablePprof, "enable-pprof", false, "If set, the pprof profiling is enabled.") - flags.IntVar(&o.PprofPort, "pprof-port", 6065, "The port for pprof profiling.") - flags.BoolVar(&o.DenyModifyMemberClusterLabels, "deny-modify-member-cluster-labels", false, "If set, 
users not in the system:masters cannot modify member cluster labels.") - flags.BoolVar(&o.EnableWorkload, "enable-workload", false, "If set, workloads (pods and replicasets) can be created in the hub cluster. This disables the pod and replicaset validating webhooks.") - flags.BoolVar(&o.UseCertManager, "use-cert-manager", false, "If set, cert-manager will be used for webhook certificate management instead of self-signed certificates.") - flags.DurationVar(&o.ResourceSnapshotCreationMinimumInterval, "resource-snapshot-creation-minimum-interval", 30*time.Second, "The minimum interval at which resource snapshots could be created.") - flags.DurationVar(&o.ResourceChangesCollectionDuration, "resource-changes-collection-duration", 15*time.Second, - "The duration for collecting resource changes into one snapshot. The default is 15 seconds, which means that the controller will collect resource changes for 15 seconds before creating a resource snapshot.") - o.RateLimiterOpts.AddFlags(flags) + o.LeaderElectionOpts.AddFlags(flags) + o.CtrlMgrOpts.AddFlags(flags) + o.WebhookOpts.AddFlags(flags) + o.FeatureFlags.AddFlags(flags) + o.ClusterMgmtOpts.AddFlags(flags) + o.PlacementMgmtOpts.AddFlags(flags) } diff --git a/cmd/hubagent/options/placementctrlsetratelimit.go b/cmd/hubagent/options/placementctrlsetratelimit.go new file mode 100644 index 000000000..4bef73d95 --- /dev/null +++ b/cmd/hubagent/options/placementctrlsetratelimit.go @@ -0,0 +1,184 @@ +/* +Copyright 2025 The KubeFleet Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +*/ + +package options + +import ( + "flag" + "fmt" + "strconv" + "time" + + "golang.org/x/time/rate" + "k8s.io/client-go/util/workqueue" +) + +// RateLimitOptions are options for rate limiter. +type RateLimitOptions struct { + // RateLimiterBaseDelay is the base delay for ItemExponentialFailureRateLimiter. + RateLimiterBaseDelay time.Duration + + // RateLimiterMaxDelay is the max delay for ItemExponentialFailureRateLimiter. + RateLimiterMaxDelay time.Duration + + // RateLimiterQPS is the qps for BucketRateLimiter + RateLimiterQPS int + + // RateLimiterBucketSize is the bucket size for BucketRateLimiter + RateLimiterBucketSize int +} + +// AddFlags adds flags to the specified FlagSet. +func (o *RateLimitOptions) AddFlags(fs *flag.FlagSet) { + fs.Var( + newRateLimiterBaseDelayValueWithValidation(5*time.Millisecond, &o.RateLimiterBaseDelay), + "rate-limiter-base-delay", + "The base delay for the placement controller set rate limiter. Default to 5ms. Must be a value between [1ms, 200ms].", + ) + + fs.Var( + newRateLimiterMaxDelayValueWithValidation(60*time.Second, &o.RateLimiterMaxDelay), + "rate-limiter-max-delay", + "The max delay for the placement controller set rate limiter. Default to 60s. Must be a value in the range [1s, 5m] and the value must be greater than the base delay.", + ) + + fs.Var( + newRateLimiterQPSValueWithValidation(10, &o.RateLimiterQPS), + "rate-limiter-qps", + "The QPS for the placement controller set rate limiter. Default to 10. Must be a positive integer in the range [1, 1000].", + ) + + fs.Var( + newRateLimiterBucketSizeValueWithValidation(100, &o.RateLimiterBucketSize), + "rate-limiter-bucket-size", + "The bucket size for the placement controller set rate limiter. Default to 100. 
Must be a positive integer in the range [1, 10000] and the value must be greater than or equal to the QPS.",
+	)
+}
+
+// DefaultControllerRateLimiter provide a default rate limiter for controller, and users can tune it by corresponding flags.
+func DefaultControllerRateLimiter(opts RateLimitOptions) workqueue.TypedRateLimiter[any] {
+	// set defaults
+	if opts.RateLimiterBaseDelay <= 0 {
+		opts.RateLimiterBaseDelay = 5 * time.Millisecond
+	}
+	if opts.RateLimiterMaxDelay <= 0 {
+		opts.RateLimiterMaxDelay = 60 * time.Second
+	}
+	if opts.RateLimiterQPS <= 0 {
+		opts.RateLimiterQPS = 10
+	}
+	if opts.RateLimiterBucketSize <= 0 {
+		opts.RateLimiterBucketSize = 100
+	}
+	return workqueue.NewTypedMaxOfRateLimiter(
+		workqueue.NewTypedItemExponentialFailureRateLimiter[any](opts.RateLimiterBaseDelay, opts.RateLimiterMaxDelay),
+		&workqueue.TypedBucketRateLimiter[any]{Limiter: rate.NewLimiter(rate.Limit(opts.RateLimiterQPS), opts.RateLimiterBucketSize)},
+	)
+}
+
+// A list of flag variables that allow pluggable validation logic when parsing the input args. 
+ +type RateLimiterBaseDelayValueWithValidation time.Duration + +func (v *RateLimiterBaseDelayValueWithValidation) String() string { + return time.Duration(*v).String() +} + +func (v *RateLimiterBaseDelayValueWithValidation) Set(s string) error { + duration, err := time.ParseDuration(s) + if err != nil { + return fmt.Errorf("failed to parse time duration: %w", err) + } + if duration < time.Millisecond || duration > 200*time.Millisecond { + return fmt.Errorf("the base delay must be a value between [1ms, 200ms]") + } + *v = RateLimiterBaseDelayValueWithValidation(duration) + return nil +} + +func newRateLimiterBaseDelayValueWithValidation(defaultVal time.Duration, p *time.Duration) *RateLimiterBaseDelayValueWithValidation { + *p = defaultVal + return (*RateLimiterBaseDelayValueWithValidation)(p) +} + +type RateLimiterMaxDelayValueWithValidation time.Duration + +func (v *RateLimiterMaxDelayValueWithValidation) String() string { + return time.Duration(*v).String() +} + +func (v *RateLimiterMaxDelayValueWithValidation) Set(s string) error { + duration, err := time.ParseDuration(s) + if err != nil { + return fmt.Errorf("failed to parse time duration: %w", err) + } + if duration < time.Second || duration > time.Minute*5 { + return fmt.Errorf("the max delay must be a value between [1s, 5m]") + } + *v = RateLimiterMaxDelayValueWithValidation(duration) + return nil +} + +func newRateLimiterMaxDelayValueWithValidation(defaultVal time.Duration, p *time.Duration) *RateLimiterMaxDelayValueWithValidation { + *p = defaultVal + return (*RateLimiterMaxDelayValueWithValidation)(p) +} + +type RateLimiterQPSValueWithValidation int + +func (v *RateLimiterQPSValueWithValidation) String() string { + return fmt.Sprintf("%d", *v) +} + +func (v *RateLimiterQPSValueWithValidation) Set(s string) error { + qps, err := strconv.Atoi(s) + if err != nil { + return fmt.Errorf("failed to parse integer: %w", err) + } + if qps < 1 || qps > 1000 { + return fmt.Errorf("the QPS must be a positive integer 
in the range [1, 1000]") + } + *v = RateLimiterQPSValueWithValidation(qps) + return nil +} + +func newRateLimiterQPSValueWithValidation(defaultVal int, p *int) *RateLimiterQPSValueWithValidation { + *p = defaultVal + return (*RateLimiterQPSValueWithValidation)(p) +} + +type RateLimiterBucketSizeValueWithValidation int + +func (v *RateLimiterBucketSizeValueWithValidation) String() string { + return fmt.Sprintf("%d", *v) +} + +func (v *RateLimiterBucketSizeValueWithValidation) Set(s string) error { + bucketSize, err := strconv.Atoi(s) + if err != nil { + return fmt.Errorf("failed to parse integer: %w", err) + } + if bucketSize < 1 || bucketSize > 10000 { + return fmt.Errorf("the bucket size must be a positive integer in the range [1, 10000]") + } + *v = RateLimiterBucketSizeValueWithValidation(bucketSize) + return nil +} + +func newRateLimiterBucketSizeValueWithValidation(defaultVal int, bucketSize *int) *RateLimiterBucketSizeValueWithValidation { + *bucketSize = defaultVal + return (*RateLimiterBucketSizeValueWithValidation)(bucketSize) +} diff --git a/cmd/hubagent/options/placementmgmt.go b/cmd/hubagent/options/placementmgmt.go new file mode 100644 index 000000000..9df806ee9 --- /dev/null +++ b/cmd/hubagent/options/placementmgmt.go @@ -0,0 +1,352 @@ +/* +Copyright 2025 The KubeFleet Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package options + +import ( + "flag" + "fmt" + "strconv" + "time" + + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/klog/v2" + + "github.com/kubefleet-dev/kubefleet/pkg/utils" +) + +// PlacementManagementOptions is a set of options the KubeFleet hub agent exposes for +// managing resource placements. +type PlacementManagementOptions struct { + // The period the KubeFleet hub agent will wait before marking a Work object as failed. + // + // This option is no longer in use and is only kept for compatibility reasons. + WorkPendingGracePeriod metav1.Duration + + // A list of APIs that are block-listed for resource placement. Any resources under such APIs will be ignored + // by the KubeFleet hub agent and will not be selected for resource placement. + // + // The list is a collection of GVKs separated by semicolons. A GVK can be of the format GROUP, + // GROUP/VERSION, or GROUP/VERSION/KINDS, where KINDS is a comma separated array of Kind values. If you would + // like to skip specific versions and/or kinds in the core API group, use the format VERSION, or + // VERSION/KINDS instead. Below are some examples: + // + // * networking.k8s.io: skip all resources in the networking.k8s.io API group for placement; + // * networking.k8s.io/v1beta1: skip all resources in the networking.k8s.io/v1beta1 group version for placement; + // * networking.k8s.io/v1beta1/Ingress,IngressClass: skip the Ingress and IngressClass resources + // in the networking.k8s.io/v1beta1 group version for placement; + // * v1beta1: skip all resources of version v1beta1 in the core API group for placement; + // * v1/ConfigMap: skip ConfigMap resources of version v1 in the core API group for placement; + // * networking.k8s.io/v1beta1/Ingress; v1beta1: skip the Ingress resource in the networking.k8s.io/v1beta1 group version + // and all resources of version v1beta1 in the core API group for placement. + // + // This option is mutually exclusive with the AllowedPropagatingAPIs option. 
KubeFleet comes with a built-in + // block list of APIs for resource placement that covers most KubeFleet APIs and a select few of critical Kubernetes + // system APIs; see the source code for more information. + SkippedPropagatingAPIs string + // A list of APIs that are allow-listed for resource placement. If specified, only resources under such APIs + // will be selected for resource placement by the KubeFleet hub agent. + // + // The list is a collection of GVKs separated by semicolons. A GVK can be of the format GROUP, + // GROUP/VERSION, or GROUP/VERSION/KINDS, where KINDS is a comma separated array of Kind values. If you would + // like to skip specific versions and/or kinds in the core API group, use the format VERSION, or + // VERSION/KINDS instead. Below are some examples: + // + // * networking.k8s.io: allow all resources in the networking.k8s.io API group for placement only; + // * networking.k8s.io/v1beta1: allow all resources in the networking.k8s.io/v1beta1 group version for placement only; + // * networking.k8s.io/v1beta1/Ingress,IngressClass: allow the Ingress and IngressClass resources + // in the networking.k8s.io/v1beta1 group version for placement only; + // * v1beta1: allow all resources of version v1beta1 in the core API group for placement only; + // * v1/ConfigMap: allow ConfigMap resources of version v1 in the core API group for placement only; + // * networking.k8s.io/v1beta1/Ingress; v1beta1: only allow the Ingress resource in the networking.k8s.io/v1beta1 + // group version and all resources of version v1beta1 in the core API group for placement. + // + // This option is mutually exclusive with the SkippedPropagatingAPIs option. + AllowedPropagatingAPIs string + + // A list of namespace names that are block-listed for resource placement. The KubeFleet hub agent + // will ignore the namespaces and any resources within them when selecting resources for placement. 
+ // + // This list is a collection of names separated by commas, such as `internals,monitoring`. KubeFleet + // also blocks a number of reserved namespace names for placement by default; such namespaces include + // those that are prefixed with `kube-`, and `fleet-system`. + SkippedPropagatingNamespaces string + + // The number of concurrent workers that help process resource changes for the placement APIs. + ConcurrentResourceChangeSyncs int + + // The expected maximum number of member clusters in the fleet. This is used specifically for the purpose + // of setting the number of concurrent workers for several key placement related controllers; setting the value + // higher increases the concurrency of such controllers. KubeFleet will not enforce this limit on the actual + // number of member clusters in the fleet. + MaxFleetSize int + + // The expected maximum number of placements that are allowed to run concurrently. This is used specifically for + // the purpose of setting the number of concurrent workers for several key placement related controllers; setting + // the value higher increases the concurrency of such controllers. + MaxConcurrentClusterPlacement int + + // The rate limiting options for work queues in use by several placement related controllers. + PlacementControllerWorkQueueRateLimiterOpts RateLimitOptions + + // The minimum interval between resource snapshot creations. + // + // KubeFleet will collect resource changes periodically (as controlled by the ResourceChangesCollectionDuration parameter); + // if new changes are found, KubeFleet will build a new resource snapshot if there has not been any + // new snapshot built within the ResourceSnapshotCreationMinimumInterval. + ResourceSnapshotCreationMinimumInterval time.Duration + + // The interval between resource change collection attempts. 
+ // + // KubeFleet will collect resource changes periodically (as controlled by the ResourceChangesCollectionDuration parameter); + // if new changes are found, KubeFleet will build a new resource snapshot if there has not been any + // new snapshot built within the ResourceSnapshotCreationMinimumInterval. + ResourceChangesCollectionDuration time.Duration +} + +// AddFlags adds flags for PlacementManagementOptions to the specified FlagSet. +func (o *PlacementManagementOptions) AddFlags(flags *flag.FlagSet) { + flags.Var( + newWorkPendingGracePeriodValueWithValidation(15*time.Second, &o.WorkPendingGracePeriod), + "work-pending-grace-period", + "The period the KubeFleet hub agent will wait before marking a Work object as failed. This option is no longer in use and is only kept for compatibility reasons.", + ) + + flags.Var( + newSkippedPropagatingAPIsValueWithValidation("", &o.SkippedPropagatingAPIs), + "skipped-propagating-apis", + "A list of APIs that are block-listed for resource placement. Any resources under such APIs will be ignored by the KubeFleet hub agent and will not be selected for resource placement. The list is a collection of GVKs separated by semicolons. A GVK can be of the format GROUP, GROUP/VERSION, or GROUP/VERSION/KINDS, where KINDS is a comma separated array of Kind values. If you would like to skip specific versions and/or kinds in the core API group, use the format VERSION, or VERSION/KINDS instead. For example, `networking.k8s.io/v1beta1/Ingress,IngressClass; v1/ConfigMap`. This option is mutually exclusive with the AllowedPropagatingAPIs option. KubeFleet comes with a built-in block list of APIs for resource placement that covers most KubeFleet APIs and a select few of critical Kubernetes system APIs; see the source code for more information.", + ) + + flags.Var( + newAllowedPropagatingAPIsValueWithValidation("", &o.AllowedPropagatingAPIs), + "allowed-propagating-apis", + "A list of APIs that are allow-listed for resource placement. 
If specified, only resources under such APIs will be selected for resource placement by the KubeFleet hub agent. The list is a collection of GVKs separated by semicolons. A GVK can be of the format GROUP, GROUP/VERSION, or GROUP/VERSION/KINDS, where KINDS is a comma separated array of Kind values. If you would like to allow specific versions and/or kinds in the core API group, use the format VERSION, or VERSION/KINDS instead. For example, `networking.k8s.io/v1beta1/Ingress,IngressClass; v1/ConfigMap`. This option is mutually exclusive with the SkippedPropagatingAPIs option.",
+	)
+
+	flags.StringVar(
+		&o.SkippedPropagatingNamespaces,
+		"skipped-propagating-namespaces",
+		"",
+		"A list of comma-separated namespace names that are block-listed for resource placement. The KubeFleet hub agent will ignore the namespaces and any resources within them when selecting resources for placement.",
+	)
+
+	flags.Var(
+		newConcurrentResourceChangeSyncsValueWithValidation(20, &o.ConcurrentResourceChangeSyncs),
+		"concurrent-resource-change-syncs",
+		"The number of concurrent workers that help process resource changes for the placement APIs. Default is 20. Must be a positive integer value in the range [1, 100].",
+	)
+
+	flags.Var(
+		newMaxFleetSizeValueWithValidation(100, &o.MaxFleetSize),
+		"max-fleet-size",
+		"The expected maximum number of member clusters in the fleet. This is used specifically for setting the number of concurrent workers for several key placement related controllers. Default is 100. Must be a positive integer value in the range [30, 200].",
+	)
+
+	flags.Var(
+		newMaxConcurrentClusterPlacementValueWithValidation(100, &o.MaxConcurrentClusterPlacement),
+		"max-concurrent-cluster-placement",
+		"The expected maximum number of placements that are allowed to run concurrently. This is used specifically for setting the number of concurrent workers for several key placement related controllers. Default is 100. 
Must be a positive integer value in the range [10, 200].", + ) + + o.PlacementControllerWorkQueueRateLimiterOpts.AddFlags(flags) + + flags.Var( + newResourceSnapshotCreationMinimumIntervalValueWithValidation(30*time.Second, &o.ResourceSnapshotCreationMinimumInterval), + "resource-snapshot-creation-minimum-interval", + "The minimum interval between resource snapshot creations. Default is 30 seconds. Must be a duration in the range [0s, 5m].", + ) + + flags.Var( + newResourceChangesCollectionDurationValueWithValidation(15*time.Second, &o.ResourceChangesCollectionDuration), + "resource-changes-collection-duration", + "The interval between resource change collection attempts. Default is 15 seconds. Must be a duration in the range [0s, 1m].", + ) +} + +// A list of flag variables that allow pluggable validation logic when parsing the input args. + +type WorkPendingGracePeriodValueWithValidation metav1.Duration + +func (v *WorkPendingGracePeriodValueWithValidation) String() string { + return v.Duration.String() +} + +func (v *WorkPendingGracePeriodValueWithValidation) Set(s string) error { + klog.Warningf("the work-pending-grace-period option is no longer in use and is only kept for compatibility reasons, it has no effect on the system behavior and should not be set") + v.Duration = 15 * time.Second + return nil +} + +func newWorkPendingGracePeriodValueWithValidation(defaultVal time.Duration, p *metav1.Duration) *WorkPendingGracePeriodValueWithValidation { + p.Duration = defaultVal + return (*WorkPendingGracePeriodValueWithValidation)(p) +} + +type SkippedPropagatingAPIsValueWithValidation string + +func (v *SkippedPropagatingAPIsValueWithValidation) String() string { + return string(*v) +} + +func (v *SkippedPropagatingAPIsValueWithValidation) Set(s string) error { + rc := utils.NewResourceConfig(false) + if err := rc.Parse(s); err != nil { + return fmt.Errorf("invalid list of skipped for propagation APIs: %w", err) + } + *v = 
SkippedPropagatingAPIsValueWithValidation(s) + return nil +} + +func newSkippedPropagatingAPIsValueWithValidation(defaultVal string, p *string) *SkippedPropagatingAPIsValueWithValidation { + *p = defaultVal + return (*SkippedPropagatingAPIsValueWithValidation)(p) +} + +type AllowedPropagatingAPIsValueWithValidation string + +func (v *AllowedPropagatingAPIsValueWithValidation) String() string { + return string(*v) +} + +func (v *AllowedPropagatingAPIsValueWithValidation) Set(s string) error { + rc := utils.NewResourceConfig(true) + if err := rc.Parse(s); err != nil { + return fmt.Errorf("invalid list of allowed for propagation APIs: %w", err) + } + *v = AllowedPropagatingAPIsValueWithValidation(s) + return nil +} + +func newAllowedPropagatingAPIsValueWithValidation(defaultVal string, p *string) *AllowedPropagatingAPIsValueWithValidation { + *p = defaultVal + return (*AllowedPropagatingAPIsValueWithValidation)(p) +} + +type ConcurrentResourceChangeSyncsValueWithValidation int + +func (v *ConcurrentResourceChangeSyncsValueWithValidation) String() string { + return fmt.Sprintf("%d", *v) +} + +func (v *ConcurrentResourceChangeSyncsValueWithValidation) Set(s string) error { + n, err := strconv.Atoi(s) + if err != nil { + return fmt.Errorf("failed to parse int value: %w", err) + } + if n < 1 || n > 100 { + return fmt.Errorf("number of concurrent resource change syncs must be in the range [1, 100]") + } + *v = ConcurrentResourceChangeSyncsValueWithValidation(n) + return nil +} + +func newConcurrentResourceChangeSyncsValueWithValidation(defaultVal int, p *int) *ConcurrentResourceChangeSyncsValueWithValidation { + *p = defaultVal + return (*ConcurrentResourceChangeSyncsValueWithValidation)(p) +} + +type MaxFleetSizeValueWithValidation int + +func (v *MaxFleetSizeValueWithValidation) String() string { + return fmt.Sprintf("%d", *v) +} + +func (v *MaxFleetSizeValueWithValidation) Set(s string) error { + n, err := strconv.Atoi(s) + if err != nil { + return fmt.Errorf("failed to 
parse int value: %w", err) + } + if n < 30 || n > 200 { + return fmt.Errorf("number of max fleet size must be in the range [30, 200]") + } + *v = MaxFleetSizeValueWithValidation(n) + return nil +} + +func newMaxFleetSizeValueWithValidation(defaultVal int, p *int) *MaxFleetSizeValueWithValidation { + *p = defaultVal + return (*MaxFleetSizeValueWithValidation)(p) +} + +type MaxConcurrentClusterPlacementValueWithValidation int + +func (v *MaxConcurrentClusterPlacementValueWithValidation) String() string { + return fmt.Sprintf("%d", *v) +} + +func (v *MaxConcurrentClusterPlacementValueWithValidation) Set(s string) error { + n, err := strconv.Atoi(s) + if err != nil { + return fmt.Errorf("failed to parse int value: %w", err) + } + if n < 10 || n > 200 { + return fmt.Errorf("number of max concurrent cluster placements must be in the range [10, 200]") + } + *v = MaxConcurrentClusterPlacementValueWithValidation(n) + return nil +} + +func newMaxConcurrentClusterPlacementValueWithValidation(defaultVal int, p *int) *MaxConcurrentClusterPlacementValueWithValidation { + *p = defaultVal + return (*MaxConcurrentClusterPlacementValueWithValidation)(p) +} + +type ResourceSnapshotCreationMinimumIntervalValueWithValidation time.Duration + +func (v *ResourceSnapshotCreationMinimumIntervalValueWithValidation) String() string { + return time.Duration(*v).String() +} + +func (v *ResourceSnapshotCreationMinimumIntervalValueWithValidation) Set(s string) error { + duration, err := time.ParseDuration(s) + if err != nil { + return fmt.Errorf("failed to parse duration: %w", err) + } + if duration < 0 || duration > 5*time.Minute { + return fmt.Errorf("duration must be in the range [0s, 5m]") + } + *v = ResourceSnapshotCreationMinimumIntervalValueWithValidation(duration) + return nil +} + +func newResourceSnapshotCreationMinimumIntervalValueWithValidation(defaultVal time.Duration, p *time.Duration) *ResourceSnapshotCreationMinimumIntervalValueWithValidation { + *p = defaultVal + return 
(*ResourceSnapshotCreationMinimumIntervalValueWithValidation)(p) +} + +type ResourceChangesCollectionDurationValueWithValidation time.Duration + +func (v *ResourceChangesCollectionDurationValueWithValidation) String() string { + return time.Duration(*v).String() +} + +func (v *ResourceChangesCollectionDurationValueWithValidation) Set(s string) error { + duration, err := time.ParseDuration(s) + if err != nil { + return fmt.Errorf("failed to parse duration: %w", err) + } + if duration < 0 || duration > time.Minute { + return fmt.Errorf("duration must be in the range [0s, 1m]") + } + *v = ResourceChangesCollectionDurationValueWithValidation(duration) + return nil +} + +func newResourceChangesCollectionDurationValueWithValidation(defaultVal time.Duration, p *time.Duration) *ResourceChangesCollectionDurationValueWithValidation { + *p = defaultVal + return (*ResourceChangesCollectionDurationValueWithValidation)(p) +} diff --git a/cmd/hubagent/options/ratelimit.go b/cmd/hubagent/options/ratelimit.go deleted file mode 100644 index edd477f3e..000000000 --- a/cmd/hubagent/options/ratelimit.go +++ /dev/null @@ -1,69 +0,0 @@ -/* -Copyright 2025 The KubeFleet Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package options - -import ( - "flag" - "time" - - "golang.org/x/time/rate" - "k8s.io/client-go/util/workqueue" -) - -// RateLimitOptions are options for rate limiter. -type RateLimitOptions struct { - // RateLimiterBaseDelay is the base delay for ItemExponentialFailureRateLimiter. 
- RateLimiterBaseDelay time.Duration - - // RateLimiterMaxDelay is the max delay for ItemExponentialFailureRateLimiter. - RateLimiterMaxDelay time.Duration - - // RateLimiterQPS is the qps for BucketRateLimiter - RateLimiterQPS int - - // RateLimiterBucketSize is the bucket size for BucketRateLimiter - RateLimiterBucketSize int -} - -// AddFlags adds flags to the specified FlagSet. -func (o *RateLimitOptions) AddFlags(fs *flag.FlagSet) { - fs.DurationVar(&o.RateLimiterBaseDelay, "rate-limiter-base-delay", 5*time.Millisecond, "The base delay for rate limiter.") - fs.DurationVar(&o.RateLimiterMaxDelay, "rate-limiter-max-delay", 60*time.Second, "The max delay for rate limiter.") - fs.IntVar(&o.RateLimiterQPS, "rate-limiter-qps", 10, "The QPS for rate limier.") - fs.IntVar(&o.RateLimiterBucketSize, "rate-limiter-bucket-size", 100, "The bucket size for rate limier.") -} - -// DefaultControllerRateLimiter provide a default rate limiter for controller, and users can tune it by corresponding flags. 
-func DefaultControllerRateLimiter(opts RateLimitOptions) workqueue.TypedRateLimiter[any] { - // set defaults - if opts.RateLimiterBaseDelay <= 0 { - opts.RateLimiterBaseDelay = 5 * time.Millisecond - } - if opts.RateLimiterMaxDelay <= 0 { - opts.RateLimiterMaxDelay = 60 * time.Second - } - if opts.RateLimiterQPS <= 0 { - opts.RateLimiterQPS = 10 - } - if opts.RateLimiterBucketSize <= 0 { - opts.RateLimiterBucketSize = 100 - } - return workqueue.NewTypedMaxOfRateLimiter[any]( - workqueue.NewTypedItemExponentialFailureRateLimiter[any](opts.RateLimiterBaseDelay, opts.RateLimiterMaxDelay), - &workqueue.TypedBucketRateLimiter[any]{Limiter: rate.NewLimiter(rate.Limit(opts.RateLimiterQPS), opts.RateLimiterBucketSize)}, - ) -} diff --git a/cmd/hubagent/options/validation.go b/cmd/hubagent/options/validation.go index 9e12ab049..6431dfcce 100644 --- a/cmd/hubagent/options/validation.go +++ b/cmd/hubagent/options/validation.go @@ -18,51 +18,47 @@ package options import ( "k8s.io/apimachinery/pkg/util/validation/field" - - "github.com/kubefleet-dev/kubefleet/pkg/utils" ) -// TODO: Clean up the validations we don't need and add the ones we need - // Validate checks Options and return a slice of found errs. +// +// Note: the logic here concerns primarily cross-option validation; for single-option validation, +// consider adding the logic directly as part of the flag parsing function, for clarity reasons. func (o *Options) Validate() field.ErrorList { errs := field.ErrorList{} newPath := field.NewPath("Options") - if o.AllowedPropagatingAPIs != "" && o.SkippedPropagatingAPIs != "" { - errs = append(errs, field.Invalid(newPath.Child("AllowedPropagatingAPIs"), o.AllowedPropagatingAPIs, "AllowedPropagatingAPIs and SkippedPropagatingAPIs are mutually exclusive")) + // Cross-field validation for controller manager options. 
+ if float64(o.CtrlMgrOpts.HubBurst) < float64(o.CtrlMgrOpts.HubQPS) { + errs = append(errs, field.Invalid(newPath.Child("HubBurst"), o.CtrlMgrOpts.HubBurst, "The burst limit for client-side throttling must be greater than or equal to its QPS limit")) } - resourceConfig := utils.NewResourceConfig(o.AllowedPropagatingAPIs != "") - if err := resourceConfig.Parse(o.SkippedPropagatingAPIs); err != nil { - errs = append(errs, field.Invalid(newPath.Child("SkippedPropagatingAPIs"), o.SkippedPropagatingAPIs, "Invalid API string")) - } - if err := resourceConfig.Parse(o.AllowedPropagatingAPIs); err != nil { - errs = append(errs, field.Invalid(newPath.Child("AllowedPropagatingAPIs"), o.AllowedPropagatingAPIs, "Invalid API string")) - } + // Cross-field validation for webhook options. - if o.ClusterUnhealthyThreshold.Duration <= 0 { - errs = append(errs, field.Invalid(newPath.Child("ClusterUnhealthyThreshold"), o.ClusterUnhealthyThreshold, "Must be greater than 0")) - } - if o.WorkPendingGracePeriod.Duration <= 0 { - errs = append(errs, field.Invalid(newPath.Child("WorkPendingGracePeriod"), o.WorkPendingGracePeriod, "Must be greater than 0")) + // Note: this validation logic is a bit weird in the sense that the system accepts + // either a URL-based connection or a service-based connection for webhook calls, + // but here the logic enforces that a service name must be provided. The way we handle + // URLs is also problematic as the code will always format a service-targeted URL using + // the input. We keep this logic for now for compatibility reasons. 
+ if o.WebhookOpts.EnableWebhooks && o.WebhookOpts.ServiceName == "" { + errs = append(errs, field.Invalid(newPath.Child("WebhookServiceName"), o.WebhookOpts.ServiceName, "A webhook service name is required when webhooks are enabled")) } - if o.EnableWebhook && o.WebhookServiceName == "" { - errs = append(errs, field.Invalid(newPath.Child("WebhookServiceName"), o.WebhookServiceName, "Webhook service name is required when webhook is enabled")) + if o.WebhookOpts.UseCertManager && !o.WebhookOpts.EnableWorkload { + errs = append(errs, field.Invalid(newPath.Child("UseCertManager"), o.WebhookOpts.UseCertManager, "If cert manager is used for securing webhook connections, the EnableWorkload option must be set to true, so that cert manager pods can run in the hub cluster.")) } - if o.UseCertManager && !o.EnableWorkload { - errs = append(errs, field.Invalid(newPath.Child("UseCertManager"), o.UseCertManager, "UseCertManager requires EnableWorkload to be true (when EnableWorkload is false, a validating webhook blocks pod creation except for certain system pods; cert-manager controller pods must be allowed to run in the hub cluster)")) + if o.PlacementMgmtOpts.AllowedPropagatingAPIs != "" && o.PlacementMgmtOpts.SkippedPropagatingAPIs != "" { + errs = append(errs, field.Invalid(newPath.Child("AllowedPropagatingAPIs"), o.PlacementMgmtOpts.AllowedPropagatingAPIs, "AllowedPropagatingAPIs and SkippedPropagatingAPIs options are mutually exclusive")) } - connectionType := o.WebhookClientConnectionType - if _, err := parseWebhookClientConnectionString(connectionType); err != nil { - errs = append(errs, field.Invalid(newPath.Child("WebhookClientConnectionType"), o.WebhookClientConnectionType, err.Error())) + // Cross-field validation for placement management options. 
+	if o.PlacementMgmtOpts.PlacementControllerWorkQueueRateLimiterOpts.RateLimiterBaseDelay >= o.PlacementMgmtOpts.PlacementControllerWorkQueueRateLimiterOpts.RateLimiterMaxDelay {
+		errs = append(errs, field.Invalid(newPath.Child("PlacementControllerWorkQueueRateLimiterOpts").Child("RateLimiterBaseDelay"), o.PlacementMgmtOpts.PlacementControllerWorkQueueRateLimiterOpts.RateLimiterBaseDelay, "the base delay for the placement controller set rate limiter must be less than its max delay"))
 	}
-	if !o.EnableV1Alpha1APIs && !o.EnableV1Beta1APIs {
-		errs = append(errs, field.Required(newPath.Child("EnableV1Alpha1APIs"), "Either EnableV1Alpha1APIs or EnableV1Beta1APIs is required"))
+	if o.PlacementMgmtOpts.PlacementControllerWorkQueueRateLimiterOpts.RateLimiterQPS > o.PlacementMgmtOpts.PlacementControllerWorkQueueRateLimiterOpts.RateLimiterBucketSize {
+		errs = append(errs, field.Invalid(newPath.Child("PlacementControllerWorkQueueRateLimiterOpts").Child("RateLimiterQPS"), o.PlacementMgmtOpts.PlacementControllerWorkQueueRateLimiterOpts.RateLimiterQPS, "the QPS for the placement controller set rate limiter must be less than or equal to its bucket size"))
 	}
 
 	return errs
diff --git a/cmd/hubagent/options/validation_test.go b/cmd/hubagent/options/validation_test.go
index 4711ff277..2e222acb7 100644
--- a/cmd/hubagent/options/validation_test.go
+++ b/cmd/hubagent/options/validation_test.go
@@ -23,7 +23,6 @@ import (
 	"github.com/google/go-cmp/cmp"
 	"github.com/onsi/gomega"
-	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/util/validation/field"
 )
 
@@ -37,11 +36,23 @@ type ModifyOptions func(option *Options)
 
 // newTestOptions creates an Options with default parameters. 
func newTestOptions(modifyOptions ModifyOptions) Options { option := Options{ - SkippedPropagatingAPIs: "fleet.azure.com;multicluster.x-k8s.io", - WorkPendingGracePeriod: metav1.Duration{Duration: 10 * time.Second}, - ClusterUnhealthyThreshold: metav1.Duration{Duration: 60 * time.Second}, - WebhookClientConnectionType: "url", - EnableV1Alpha1APIs: true, + CtrlMgrOpts: ControllerManagerOptions{ + HubQPS: 250, + HubBurst: 1000, + }, + WebhookOpts: WebhookOptions{ + ClientConnectionType: "url", + ServiceName: testWebhookServiceName, + }, + PlacementMgmtOpts: PlacementManagementOptions{ + SkippedPropagatingAPIs: "fleet.azure.com;multicluster.x-k8s.io", + PlacementControllerWorkQueueRateLimiterOpts: RateLimitOptions{ + RateLimiterBaseDelay: 5 * time.Millisecond, + RateLimiterMaxDelay: 60 * time.Second, + RateLimiterQPS: 10, + RateLimiterBucketSize: 100, + }, + }, } if modifyOptions != nil { @@ -60,68 +71,57 @@ func TestValidateControllerManagerConfiguration(t *testing.T) { opt: newTestOptions(nil), want: field.ErrorList{}, }, - "invalid SkippedPropagatingAPIs": { - opt: newTestOptions(func(options *Options) { - options.SkippedPropagatingAPIs = "a/b/c/d?" 
- }), - want: field.ErrorList{field.Invalid(newPath.Child("SkippedPropagatingAPIs"), "a/b/c/d?", "Invalid API string")}, - }, - "invalid ClusterUnhealthyThreshold": { - opt: newTestOptions(func(options *Options) { - options.ClusterUnhealthyThreshold.Duration = -40 * time.Second - }), - want: field.ErrorList{field.Invalid(newPath.Child("ClusterUnhealthyThreshold"), metav1.Duration{Duration: -40 * time.Second}, "Must be greater than 0")}, - }, - "invalid WorkPendingGracePeriod": { + "invalid HubBurst less than HubQPS": { opt: newTestOptions(func(options *Options) { - options.WorkPendingGracePeriod.Duration = -40 * time.Second + options.CtrlMgrOpts.HubQPS = 100 + options.CtrlMgrOpts.HubBurst = 50 }), - want: field.ErrorList{field.Invalid(newPath.Child("WorkPendingGracePeriod"), metav1.Duration{Duration: -40 * time.Second}, "Must be greater than 0")}, + want: field.ErrorList{field.Invalid(newPath.Child("HubBurst"), 50, "The burst limit for client-side throttling must be greater than or equal to its QPS limit")}, }, - "invalid EnableV1Alpha1APIs": { + "WebhookServiceName is empty": { opt: newTestOptions(func(option *Options) { - option.EnableV1Alpha1APIs = false + option.WebhookOpts.EnableWebhooks = true + option.WebhookOpts.ServiceName = "" }), - want: field.ErrorList{field.Required(newPath.Child("EnableV1Alpha1APIs"), "Either EnableV1Alpha1APIs or EnableV1Beta1APIs is required")}, + want: field.ErrorList{field.Invalid(newPath.Child("WebhookServiceName"), "", "A webhook service name is required when webhooks are enabled")}, }, - "invalid WebhookClientConnectionType": { + "UseCertManager without EnableWorkload": { opt: newTestOptions(func(option *Options) { - option.WebhookClientConnectionType = "invalid" + option.WebhookOpts.EnableWebhooks = true + option.WebhookOpts.ServiceName = testWebhookServiceName + option.WebhookOpts.UseCertManager = true + option.WebhookOpts.EnableWorkload = false }), - want: 
field.ErrorList{field.Invalid(newPath.Child("WebhookClientConnectionType"), "invalid", `must be "service" or "url"`)}, + want: field.ErrorList{field.Invalid(newPath.Child("UseCertManager"), true, "If cert manager is used for securing webhook connections, the EnableWorkload option must be set to true, so that cert manager pods can run in the hub cluster.")}, }, - "WebhookServiceName is empty": { + "UseCertManager with EnableWebhook and EnableWorkload": { opt: newTestOptions(func(option *Options) { - option.EnableWebhook = true - option.WebhookServiceName = "" + option.WebhookOpts.EnableWebhooks = true + option.WebhookOpts.ServiceName = testWebhookServiceName + option.WebhookOpts.UseCertManager = true + option.WebhookOpts.EnableWorkload = true }), - want: field.ErrorList{field.Invalid(newPath.Child("WebhookServiceName"), "", "Webhook service name is required when webhook is enabled")}, + want: field.ErrorList{}, }, - "UseCertManager with EnableWebhook": { + "mutually exclusive allowed/skipped propagating APIs": { opt: newTestOptions(func(option *Options) { - option.EnableWebhook = true - option.WebhookServiceName = testWebhookServiceName - option.UseCertManager = true + option.PlacementMgmtOpts.AllowedPropagatingAPIs = "apps/v1/Deployment" }), - want: field.ErrorList{field.Invalid(newPath.Child("UseCertManager"), true, "UseCertManager requires EnableWorkload to be true (when EnableWorkload is false, a validating webhook blocks pod creation except for certain system pods; cert-manager controller pods must be allowed to run in the hub cluster)")}, + want: field.ErrorList{field.Invalid(newPath.Child("AllowedPropagatingAPIs"), "apps/v1/Deployment", "AllowedPropagatingAPIs and SkippedPropagatingAPIs options are mutually exclusive")}, }, - "UseCertManager without EnableWorkload": { + "rate limiter base delay must be less than max delay": { opt: newTestOptions(func(option *Options) { - option.EnableWebhook = true - option.WebhookServiceName = testWebhookServiceName - 
option.UseCertManager = true - option.EnableWorkload = false + option.PlacementMgmtOpts.PlacementControllerWorkQueueRateLimiterOpts.RateLimiterBaseDelay = 60 * time.Second + option.PlacementMgmtOpts.PlacementControllerWorkQueueRateLimiterOpts.RateLimiterMaxDelay = 60 * time.Second }), - want: field.ErrorList{field.Invalid(newPath.Child("UseCertManager"), true, "UseCertManager requires EnableWorkload to be true (when EnableWorkload is false, a validating webhook blocks pod creation except for certain system pods; cert-manager controller pods must be allowed to run in the hub cluster)")}, + want: field.ErrorList{field.Invalid(newPath.Child("PlacementControllerWorkQueueRateLimiterOpts").Child("RateLimiterBaseDelay"), 60*time.Second, "the base delay for the placement controller set rate limiter must be less than its max delay")}, }, - "UseCertManager with EnableWebhook and EnableWorkload": { + "rate limiter qps must be less than bucket size": { opt: newTestOptions(func(option *Options) { - option.EnableWebhook = true - option.WebhookServiceName = testWebhookServiceName - option.UseCertManager = true - option.EnableWorkload = true + option.PlacementMgmtOpts.PlacementControllerWorkQueueRateLimiterOpts.RateLimiterQPS = 100 + option.PlacementMgmtOpts.PlacementControllerWorkQueueRateLimiterOpts.RateLimiterBucketSize = 10 }), - want: field.ErrorList{}, + want: field.ErrorList{field.Invalid(newPath.Child("PlacementControllerWorkQueueRateLimiterOpts").Child("RateLimiterQPS"), 100, "the QPS for the placement controller set rate limiter must be less than its bucket size")}, }, } @@ -142,5 +142,5 @@ func TestAddFlags(t *testing.T) { flags := flag.NewFlagSet("deny-modify-member-cluster-labels", flag.ExitOnError) opts.AddFlags(flags) - g.Expect(opts.DenyModifyMemberClusterLabels).To(gomega.BeFalse(), "deny-modify-member-cluster-labels should be false by default") + g.Expect(opts.WebhookOpts.GuardRailDenyModifyMemberClusterLabels).To(gomega.BeFalse(), 
"deny-modify-member-cluster-labels should be false by default") } diff --git a/cmd/hubagent/options/webhooks.go b/cmd/hubagent/options/webhooks.go new file mode 100644 index 000000000..efa23aa6c --- /dev/null +++ b/cmd/hubagent/options/webhooks.go @@ -0,0 +1,133 @@ +/* +Copyright 2025 The KubeFleet Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package options + +import ( + "flag" + "fmt" +) + +// WebhookOptions is a set of options the KubeFleet hub agent exposes for +// controlling webhook behavior. +type WebhookOptions struct { + // Enable the KubeFleet webhooks or not. + EnableWebhooks bool + + // The connection type used by the webhook client. Valid values are `url` and `service`. + // NOTE: at this moment this setting seems to be superficial, as even with the value set to `url`, + // the system would just compose a Kubernetes service URL based on the provided service name. + // This option only applies if webhooks are enabled. + ClientConnectionType string + + // The Kubernetes service name for hosting the webhooks. This option applies only if + // webhooks are enabled. + ServiceName string + + // Enable the KubeFleet guard rail webhook or not. The guard rail webhook helps guard against + // inadvertent modifications to Fleet resources. This option only applies if webhooks are enabled. + EnableGuardRail bool + + // A list of comma-separated usernames who are whitelisted in the guard rail webhook and + // thus allowed to modify KubeFleet resources. 
This option only applies if the guard rail + // webhook is enabled. + GuardRailWhitelistedUsers string + + // Set the guard rail webhook to block users (with certain exceptions) from modifying the labels + // on the MemberCluster resources. This option only applies if the guard rail webhook is enabled. + GuardRailDenyModifyMemberClusterLabels bool + + // Enable workload resources (pods and replicaSets) to be created in the hub cluster or not. + // If set to false, the KubeFleet pod and replicaset validating webhooks, which block the creation + // of pods and replicaSets outside KubeFleet reserved namespaces for most users, will be disabled. + // This option only applies if webhooks are enabled. + EnableWorkload bool + + // Use the cert-manager project for managing KubeFleet webhook server certificates or not. + // If set to false, the system will use self-signed certificates. + // This option only applies if webhooks are enabled. + UseCertManager bool +} + +// AddFlags adds flags for WebhookOptions to the specified FlagSet. +func (o *WebhookOptions) AddFlags(flags *flag.FlagSet) { + flags.BoolVar( + &o.EnableWebhooks, + "enable-webhook", + true, + "Enable the KubeFleet webhooks or not.", + ) + + flags.Func( + "webhook-client-connection-type", + "The connection type used by the webhook client. Valid values are `url` and `service`. Defaults to `url`. This option only applies if webhooks are enabled.", + func(s string) error { + if len(s) == 0 { + o.ClientConnectionType = "url" + return nil + } + + parsedStr, err := parseWebhookClientConnectionString(s) + if err != nil { + return fmt.Errorf("invalid webhook client connection type: %w", err) + } + o.ClientConnectionType = string(parsedStr) + return nil + }, + ) + + flags.StringVar( + &o.ServiceName, + "webhook-service-name", + "fleetwebhook", + "The Kubernetes service name for hosting the webhooks. 
This option only applies if webhooks are enabled.", + ) + + flags.BoolVar( + &o.EnableGuardRail, + "enable-guard-rail", + false, + "Enable the KubeFleet guard rail webhook or not. The guard rail webhook helps guard against inadvertent modifications to Fleet resources. This option only applies if webhooks are enabled.", + ) + + flags.StringVar( + &o.GuardRailWhitelistedUsers, + "whitelisted-users", + "", + "A list of comma-separated usernames who are whitelisted in the guard rail webhook and thus allowed to modify KubeFleet resources. This option only applies if the guard rail webhook is enabled.", + ) + + flags.BoolVar( + &o.GuardRailDenyModifyMemberClusterLabels, + "deny-modify-member-cluster-labels", + false, + "Set the guard rail webhook to block users (with certain exceptions) from modifying the labels on the MemberCluster resources. This option only applies if the guard rail webhook is enabled.", + ) + + flags.BoolVar( + &o.EnableWorkload, + "enable-workload", + false, + "Enable workload resources (pods and replicaSets) to be created in the hub cluster or not. If set to false, the KubeFleet pod and replicaset validating webhooks, which blocks the creation of pods and replicaSets outside KubeFleet reserved namespaces for most users, will be disabled. This option only applies if webhooks are enabled.", + ) + + flags.BoolVar( + &o.UseCertManager, + "use-cert-manager", + false, + "Use the cert-manager project for managing KubeFleet webhook server certificates or not. If set to false, the system will use self-signed certificates. If set to true, the EnableWorkload option must be set to true as well. 
This option only applies if webhooks are enabled.", + ) +} diff --git a/cmd/hubagent/workload/setup.go b/cmd/hubagent/workload/setup.go index ec96f8b7b..b603978cc 100644 --- a/cmd/hubagent/workload/setup.go +++ b/cmd/hubagent/workload/setup.go @@ -130,12 +130,12 @@ func SetupControllers(ctx context.Context, wg *sync.WaitGroup, mgr ctrl.Manager, discoverClient := discovery.NewDiscoveryClientForConfigOrDie(config) // AllowedPropagatingAPIs and SkippedPropagatingAPIs are mutually exclusive. // If none of them are set, the resourceConfig by default stores a list of skipped propagation APIs. - resourceConfig := utils.NewResourceConfig(opts.AllowedPropagatingAPIs != "") - if err = resourceConfig.Parse(opts.AllowedPropagatingAPIs); err != nil { + resourceConfig := utils.NewResourceConfig(opts.PlacementMgmtOpts.AllowedPropagatingAPIs != "") + if err = resourceConfig.Parse(opts.PlacementMgmtOpts.AllowedPropagatingAPIs); err != nil { // The program will never go here because the parameters have been checked. 
return err } - if err = resourceConfig.Parse(opts.SkippedPropagatingAPIs); err != nil { + if err = resourceConfig.Parse(opts.PlacementMgmtOpts.SkippedPropagatingAPIs); err != nil { // The program will never go here because the parameters have been checked return err } @@ -143,7 +143,7 @@ func SetupControllers(ctx context.Context, wg *sync.WaitGroup, mgr ctrl.Manager, // setup namespaces we skip propagation skippedNamespaces := make(map[string]bool) skippedNamespaces["default"] = true - optionalSkipNS := strings.Split(opts.SkippedPropagatingNamespaces, ";") + optionalSkipNS := strings.Split(opts.PlacementMgmtOpts.SkippedPropagatingNamespaces, ";") for _, ns := range optionalSkipNS { if len(ns) > 0 { klog.InfoS("user specified a namespace to skip", "namespace", ns) @@ -152,7 +152,7 @@ func SetupControllers(ctx context.Context, wg *sync.WaitGroup, mgr ctrl.Manager, } // the manager for all the dynamically created informers - dynamicInformerManager := informer.NewInformerManager(dynamicClient, opts.ResyncPeriod.Duration, ctx.Done()) + dynamicInformerManager := informer.NewInformerManager(dynamicClient, opts.CtrlMgrOpts.ResyncPeriod.Duration, ctx.Done()) validator.ResourceInformer = dynamicInformerManager // webhook needs this to check resource scope validator.RestMapper = mgr.GetRESTMapper() // webhook needs this to validate GVK of resource selector @@ -162,10 +162,10 @@ func SetupControllers(ctx context.Context, wg *sync.WaitGroup, mgr ctrl.Manager, InformerManager: dynamicInformerManager, ResourceConfig: resourceConfig, SkippedNamespaces: skippedNamespaces, - EnableWorkload: opts.EnableWorkload, + EnableWorkload: opts.WebhookOpts.EnableWorkload, } resourceSnapshotResolver := controller.NewResourceSnapshotResolver(mgr.GetClient(), mgr.GetScheme()) - resourceSnapshotResolver.Config = controller.NewResourceSnapshotConfig(opts.ResourceSnapshotCreationMinimumInterval, opts.ResourceChangesCollectionDuration) + resourceSnapshotResolver.Config = 
controller.NewResourceSnapshotConfig(opts.PlacementMgmtOpts.ResourceSnapshotCreationMinimumInterval, opts.PlacementMgmtOpts.ResourceChangesCollectionDuration) pc := &placement.Reconciler{ Client: mgr.GetClient(), Recorder: mgr.GetEventRecorderFor(placementControllerName), @@ -175,11 +175,11 @@ func SetupControllers(ctx context.Context, wg *sync.WaitGroup, mgr ctrl.Manager, ResourceSnapshotResolver: resourceSnapshotResolver, } - rateLimiter := options.DefaultControllerRateLimiter(opts.RateLimiterOpts) + rateLimiter := options.DefaultControllerRateLimiter(opts.PlacementMgmtOpts.PlacementControllerWorkQueueRateLimiterOpts) var clusterResourcePlacementControllerV1Beta1 controller.Controller var resourcePlacementController controller.Controller - if opts.EnableV1Beta1APIs { + if opts.FeatureFlags.EnableV1Beta1APIs { for _, gvk := range v1Beta1RequiredGVKs { if err = utils.CheckCRDInstalled(discoverClient, gvk); err != nil { klog.ErrorS(err, "unable to find the required CRD", "GVK", gvk) @@ -223,7 +223,7 @@ func SetupControllers(ctx context.Context, wg *sync.WaitGroup, mgr ctrl.Manager, return err } - if opts.EnableResourcePlacement { + if opts.FeatureFlags.EnableResourcePlacementAPIs { for _, gvk := range rpRequiredGVKs { if err = utils.CheckCRDInstalled(discoverClient, gvk); err != nil { klog.ErrorS(err, "unable to find the required CRD", "GVK", gvk) @@ -264,18 +264,18 @@ func SetupControllers(ctx context.Context, wg *sync.WaitGroup, mgr ctrl.Manager, if err := (&rollout.Reconciler{ Client: mgr.GetClient(), UncachedReader: mgr.GetAPIReader(), - MaxConcurrentReconciles: int(math.Ceil(float64(opts.MaxFleetSizeSupported)/30) * math.Ceil(float64(opts.MaxConcurrentClusterPlacement)/10)), + MaxConcurrentReconciles: int(math.Ceil(float64(opts.PlacementMgmtOpts.MaxFleetSize)/30) * math.Ceil(float64(opts.PlacementMgmtOpts.MaxConcurrentClusterPlacement)/10)), InformerManager: dynamicInformerManager, }).SetupWithManagerForClusterResourcePlacement(mgr); err != nil { 
klog.ErrorS(err, "Unable to set up rollout controller for clusterResourcePlacement") return err } - if opts.EnableResourcePlacement { + if opts.FeatureFlags.EnableResourcePlacementAPIs { if err := (&rollout.Reconciler{ Client: mgr.GetClient(), UncachedReader: mgr.GetAPIReader(), - MaxConcurrentReconciles: int(math.Ceil(float64(opts.MaxFleetSizeSupported)/30) * math.Ceil(float64(opts.MaxConcurrentClusterPlacement)/10)), + MaxConcurrentReconciles: int(math.Ceil(float64(opts.PlacementMgmtOpts.MaxFleetSize)/30) * math.Ceil(float64(opts.PlacementMgmtOpts.MaxConcurrentClusterPlacement)/10)), InformerManager: dynamicInformerManager, }).SetupWithManagerForResourcePlacement(mgr); err != nil { klog.ErrorS(err, "Unable to set up rollout controller for resourcePlacement") @@ -283,7 +283,7 @@ func SetupControllers(ctx context.Context, wg *sync.WaitGroup, mgr ctrl.Manager, } } - if opts.EnableEvictionAPIs { + if opts.FeatureFlags.EnableEvictionAPIs { for _, gvk := range evictionGVKs { if err = utils.CheckCRDInstalled(discoverClient, gvk); err != nil { klog.ErrorS(err, "Unable to find the required CRD", "GVK", gvk) @@ -301,7 +301,7 @@ func SetupControllers(ctx context.Context, wg *sync.WaitGroup, mgr ctrl.Manager, } // Set up a controller to do staged update run, rolling out resources to clusters in a stage by stage manner. 
- if opts.EnableStagedUpdateRunAPIs { + if opts.FeatureFlags.EnableStagedUpdateRunAPIs { for _, gvk := range clusterStagedUpdateRunGVKs { if err = utils.CheckCRDInstalled(discoverClient, gvk); err != nil { klog.ErrorS(err, "Unable to find the required CRD", "GVK", gvk) @@ -319,7 +319,7 @@ func SetupControllers(ctx context.Context, wg *sync.WaitGroup, mgr ctrl.Manager, return err } - if opts.EnableResourcePlacement { + if opts.FeatureFlags.EnableResourcePlacementAPIs { for _, gvk := range stagedUpdateRunGVKs { if err = utils.CheckCRDInstalled(discoverClient, gvk); err != nil { klog.ErrorS(err, "Unable to find the required CRD", "GVK", gvk) @@ -343,17 +343,17 @@ func SetupControllers(ctx context.Context, wg *sync.WaitGroup, mgr ctrl.Manager, klog.Info("Setting up work generator") if err := (&workgenerator.Reconciler{ Client: mgr.GetClient(), - MaxConcurrentReconciles: int(math.Ceil(float64(opts.MaxFleetSizeSupported)/10) * math.Ceil(float64(opts.MaxConcurrentClusterPlacement)/10)), + MaxConcurrentReconciles: int(math.Ceil(float64(opts.PlacementMgmtOpts.MaxFleetSize)/10) * math.Ceil(float64(opts.PlacementMgmtOpts.MaxConcurrentClusterPlacement)/10)), InformerManager: dynamicInformerManager, }).SetupWithManagerForClusterResourceBinding(mgr); err != nil { klog.ErrorS(err, "Unable to set up work generator for clusterResourceBinding") return err } - if opts.EnableResourcePlacement { + if opts.FeatureFlags.EnableResourcePlacementAPIs { if err := (&workgenerator.Reconciler{ Client: mgr.GetClient(), - MaxConcurrentReconciles: int(math.Ceil(float64(opts.MaxFleetSizeSupported)/10) * math.Ceil(float64(opts.MaxConcurrentClusterPlacement)/10)), + MaxConcurrentReconciles: int(math.Ceil(float64(opts.PlacementMgmtOpts.MaxFleetSize)/10) * math.Ceil(float64(opts.PlacementMgmtOpts.MaxConcurrentClusterPlacement)/10)), InformerManager: dynamicInformerManager, }).SetupWithManagerForResourceBinding(mgr); err != nil { klog.ErrorS(err, "Unable to set up work generator for resourceBinding") @@ 
-370,7 +370,7 @@ func SetupControllers(ctx context.Context, wg *sync.WaitGroup, mgr ctrl.Manager, ) // we use one scheduler for every 10 concurrent placement defaultScheduler := scheduler.NewScheduler("DefaultScheduler", defaultFramework, defaultSchedulingQueue, mgr, - int(math.Ceil(float64(opts.MaxFleetSizeSupported)/50)*math.Ceil(float64(opts.MaxConcurrentClusterPlacement)/10))) + int(math.Ceil(float64(opts.PlacementMgmtOpts.MaxFleetSize)/50)*math.Ceil(float64(opts.PlacementMgmtOpts.MaxConcurrentClusterPlacement)/10))) klog.Info("Starting the scheduler") // Scheduler must run in a separate goroutine as Run() is a blocking call. wg.Add(1) @@ -411,7 +411,7 @@ func SetupControllers(ctx context.Context, wg *sync.WaitGroup, mgr ctrl.Manager, return err } - if opts.EnableResourcePlacement { + if opts.FeatureFlags.EnableResourcePlacementAPIs { klog.Info("Setting up the resourcePlacement watcher for scheduler") if err := (&schedulerplacementwatcher.Reconciler{ Client: mgr.GetClient(), @@ -445,7 +445,7 @@ func SetupControllers(ctx context.Context, wg *sync.WaitGroup, mgr ctrl.Manager, Client: mgr.GetClient(), SchedulerWorkQueue: defaultSchedulingQueue, ClusterEligibilityChecker: clustereligibilitychecker.New(), - EnableResourcePlacement: opts.EnableResourcePlacement, + EnableResourcePlacement: opts.FeatureFlags.EnableResourcePlacementAPIs, }).SetupWithManager(mgr); err != nil { klog.ErrorS(err, "Unable to set up memberCluster watcher for scheduler") return err @@ -473,7 +473,7 @@ func SetupControllers(ctx context.Context, wg *sync.WaitGroup, mgr ctrl.Manager, } // Verify cluster inventory CRD installation status. 
- if opts.EnableClusterInventoryAPIs { + if opts.FeatureFlags.EnableClusterInventoryAPIs { for _, gvk := range clusterInventoryGVKs { if err = utils.CheckCRDInstalled(discoverClient, gvk); err != nil { klog.ErrorS(err, "unable to find the required CRD", "GVK", gvk) @@ -484,7 +484,7 @@ func SetupControllers(ctx context.Context, wg *sync.WaitGroup, mgr ctrl.Manager, if err = (&clusterprofile.Reconciler{ Client: mgr.GetClient(), ClusterProfileNamespace: utils.FleetSystemNamespace, - ClusterUnhealthyThreshold: opts.ClusterUnhealthyThreshold.Duration, + ClusterUnhealthyThreshold: opts.ClusterMgmtOpts.UnhealthyThreshold.Duration, }).SetupWithManager(mgr); err != nil { klog.ErrorS(err, "unable to set up ClusterProfile controller") return err @@ -530,9 +530,9 @@ func SetupControllers(ctx context.Context, wg *sync.WaitGroup, mgr ctrl.Manager, InformerManager: dynamicInformerManager, ResourceConfig: resourceConfig, SkippedNamespaces: skippedNamespaces, - ConcurrentPlacementWorker: int(math.Ceil(float64(opts.MaxConcurrentClusterPlacement) / 10)), - ConcurrentResourceChangeWorker: opts.ConcurrentResourceChangeSyncs, - EnableWorkload: opts.EnableWorkload, + ConcurrentPlacementWorker: int(math.Ceil(float64(opts.PlacementMgmtOpts.MaxConcurrentClusterPlacement) / 10)), + ConcurrentResourceChangeWorker: opts.PlacementMgmtOpts.ConcurrentResourceChangeSyncs, + EnableWorkload: opts.WebhookOpts.EnableWorkload, } if err := mgr.Add(resourceChangeDetector); err != nil { diff --git a/go.mod b/go.mod index b46f8dc38..3c82f9f17 100644 --- a/go.mod +++ b/go.mod @@ -28,7 +28,6 @@ require ( k8s.io/apiextensions-apiserver v0.34.1 k8s.io/apimachinery v0.34.1 k8s.io/client-go v0.34.1 - k8s.io/component-base v0.34.1 k8s.io/component-helpers v0.32.3 k8s.io/klog/v2 v2.130.1 k8s.io/kubectl v0.32.3 diff --git a/pkg/utils/apiresources.go b/pkg/utils/apiresources.go index a767eed72..d14703ab2 100644 --- a/pkg/utils/apiresources.go +++ b/pkg/utils/apiresources.go @@ -238,7 +238,7 @@ func (r 
*ResourceConfig) Parse(c string) error { tokens := strings.Split(c, apiGroupSepToken) for _, token := range tokens { if err := r.parseSingle(token); err != nil { - return fmt.Errorf("parse --avoid-selecting-apis %w", err) + return fmt.Errorf("failed to parse token: %w", err) } } diff --git a/pkg/webhook/webhook.go b/pkg/webhook/webhook.go index 5f518d692..3eca6a06b 100644 --- a/pkg/webhook/webhook.go +++ b/pkg/webhook/webhook.go @@ -226,13 +226,13 @@ func NewWebhookConfig(mgr manager.Manager, webhookServiceName string, port int32 // String-to-enum conversions (e.g., WebhookClientConnectionType) are performed without // additional validation, as validation happens at the Options level. func NewWebhookConfigFromOptions(mgr manager.Manager, opts *options.Options, webhookPort int32) (*Config, error) { - webhookClientConnectionType := options.WebhookClientConnectionType(opts.WebhookClientConnectionType) - whiteListedUsers := strings.Split(opts.WhiteListedUsers, ",") + webhookClientConnectionType := options.WebhookClientConnectionType(opts.WebhookOpts.ClientConnectionType) + whiteListedUsers := strings.Split(opts.WebhookOpts.GuardRailWhitelistedUsers, ",") - return NewWebhookConfig(mgr, opts.WebhookServiceName, webhookPort, - &webhookClientConnectionType, FleetWebhookCertDir, opts.EnableGuardRail, - opts.DenyModifyMemberClusterLabels, opts.EnableWorkload, opts.UseCertManager, - FleetWebhookCertName, whiteListedUsers, opts.NetworkingAgentsEnabled) + return NewWebhookConfig(mgr, opts.WebhookOpts.ServiceName, webhookPort, + &webhookClientConnectionType, FleetWebhookCertDir, opts.WebhookOpts.EnableGuardRail, + opts.WebhookOpts.GuardRailDenyModifyMemberClusterLabels, opts.WebhookOpts.EnableWorkload, opts.WebhookOpts.UseCertManager, + FleetWebhookCertName, whiteListedUsers, opts.ClusterMgmtOpts.NetworkingAgentsEnabled) } func (w *Config) Start(ctx context.Context) error { diff --git a/pkg/webhook/webhook_test.go b/pkg/webhook/webhook_test.go index 1456fd74f..d7d168b23 100644 
--- a/pkg/webhook/webhook_test.go +++ b/pkg/webhook/webhook_test.go @@ -238,14 +238,18 @@ func TestNewWebhookConfigFromOptions(t *testing.T) { }{ "valid options with cert-manager": { opts: &options.Options{ - WebhookServiceName: "test-webhook", - WebhookClientConnectionType: "service", - EnableGuardRail: true, - DenyModifyMemberClusterLabels: true, - EnableWorkload: true, - UseCertManager: true, - WhiteListedUsers: "user1,user2,user3", - NetworkingAgentsEnabled: true, + WebhookOpts: options.WebhookOptions{ + ServiceName: "test-webhook", + ClientConnectionType: "service", + EnableGuardRail: true, + GuardRailDenyModifyMemberClusterLabels: true, + EnableWorkload: true, + UseCertManager: true, + GuardRailWhitelistedUsers: "user1,user2,user3", + }, + ClusterMgmtOpts: options.ClusterManagementOptions{ + NetworkingAgentsEnabled: true, + }, }, wantErr: false, wantConfig: &Config{ @@ -263,14 +267,18 @@ func TestNewWebhookConfigFromOptions(t *testing.T) { }, "valid options without cert-manager": { opts: &options.Options{ - WebhookServiceName: "test-webhook", - WebhookClientConnectionType: "url", - EnableGuardRail: false, - DenyModifyMemberClusterLabels: false, - EnableWorkload: false, - UseCertManager: false, - WhiteListedUsers: "admin", - NetworkingAgentsEnabled: false, + WebhookOpts: options.WebhookOptions{ + ServiceName: "test-webhook", + ClientConnectionType: "url", + EnableGuardRail: false, + GuardRailDenyModifyMemberClusterLabels: false, + EnableWorkload: false, + UseCertManager: false, + GuardRailWhitelistedUsers: "admin", + }, + ClusterMgmtOpts: options.ClusterManagementOptions{ + NetworkingAgentsEnabled: false, + }, }, wantErr: false, wantConfig: &Config{ From 8ac3743b7c8c12a9c38ed374fa4990a13b0ff1ab Mon Sep 17 00:00:00 2001 From: michaelawyu Date: Wed, 4 Mar 2026 00:01:10 +0800 Subject: [PATCH 8/9] test: add UTs to the hub agent CLI flag setup (#467) --- cmd/hubagent/options/options_test.go | 797 +++++++++++++++++++++++++++ 1 file changed, 797 insertions(+) create 
mode 100644 cmd/hubagent/options/options_test.go diff --git a/cmd/hubagent/options/options_test.go b/cmd/hubagent/options/options_test.go new file mode 100644 index 000000000..ca84d5208 --- /dev/null +++ b/cmd/hubagent/options/options_test.go @@ -0,0 +1,797 @@ +/* +Copyright 2025 The KubeFleet Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package options + +import ( + "flag" + "strings" + "testing" + "time" + + "github.com/google/go-cmp/cmp" + "github.com/kubefleet-dev/kubefleet/pkg/utils" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +// TestLeaderElectionOpts tests the parsing and validation logic of the leader election options defined in LeaderElectionOptions. 
+func TestLeaderElectionOpts(t *testing.T) { + testCases := []struct { + name string + flagSetName string + args []string + wantLeaderElectionOpts LeaderElectionOptions + wantErred bool + wantErrMsgSubStr string + }{ + { + name: "all default", + flagSetName: "allDefault", + args: []string{}, + wantLeaderElectionOpts: LeaderElectionOptions{ + LeaderElect: false, + LeaseDuration: metav1.Duration{Duration: 15 * time.Second}, + RenewDeadline: metav1.Duration{Duration: 10 * time.Second}, + RetryPeriod: metav1.Duration{Duration: 2 * time.Second}, + ResourceNamespace: utils.FleetSystemNamespace, + }, + }, + { + name: "all specified", + flagSetName: "allSpecified", + args: []string{ + "--leader-elect=true", + "--leader-lease-duration=30s", + "--leader-renew-deadline=20s", + "--leader-retry-period=5s", + "--leader-election-namespace=test-namespace", + }, + wantLeaderElectionOpts: LeaderElectionOptions{ + LeaderElect: true, + LeaseDuration: metav1.Duration{Duration: 30 * time.Second}, + RenewDeadline: metav1.Duration{Duration: 20 * time.Second}, + RetryPeriod: metav1.Duration{Duration: 5 * time.Second}, + ResourceNamespace: "test-namespace", + }, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + flags := flag.NewFlagSet(tc.flagSetName, flag.ContinueOnError) + leaderElectionOpts := LeaderElectionOptions{} + leaderElectionOpts.AddFlags(flags) + + err := flags.Parse(tc.args) + if tc.wantErred { + if err == nil { + t.Fatalf("flag Parse() = nil, want erred") + } + + if !strings.Contains(err.Error(), tc.wantErrMsgSubStr) { + t.Fatalf("flag Parse() error = %v, want error msg with sub-string %s", err, tc.wantErrMsgSubStr) + } + return + } + + if err != nil { + t.Fatalf("flag Parse() = %v, want nil", err) + } + + if diff := cmp.Diff(leaderElectionOpts, tc.wantLeaderElectionOpts); diff != "" { + t.Errorf("leader election options diff (-got, +want):\n%s", diff) + } + }) + } +} + +// TestControllerManagerOptions tests the parsing and validation logic of 
the controller manager options defined in ControllerManagerOptions. +func TestControllerManagerOptions(t *testing.T) { + testCases := []struct { + name string + flagSetName string + args []string + wantCtrlMgrOpts ControllerManagerOptions + wantErred bool + wantErrMsgSubStr string + }{ + { + name: "all default", + flagSetName: "allDefault", + args: []string{}, + wantCtrlMgrOpts: ControllerManagerOptions{ + HealthProbeBindAddress: ":8081", + MetricsBindAddress: ":8080", + EnablePprof: false, + PprofPort: 6065, + HubQPS: 250.0, + HubBurst: 1000, + ResyncPeriod: metav1.Duration{Duration: 6 * time.Hour}, + }, + }, + { + name: "all specified", + flagSetName: "allSpecified", + args: []string{ + "--health-probe-bind-address=:18081", + "--metrics-bind-address=:18080", + "--enable-pprof=true", + "--pprof-port=16065", + "--hub-api-qps=500", + "--hub-api-burst=1500", + "--resync-period=2h", + }, + wantCtrlMgrOpts: ControllerManagerOptions{ + HealthProbeBindAddress: ":18081", + MetricsBindAddress: ":18080", + EnablePprof: true, + PprofPort: 16065, + HubQPS: 500, + HubBurst: 1500, + ResyncPeriod: metav1.Duration{Duration: 2 * time.Hour}, + }, + }, + { + name: "negative hub client QPS value", + flagSetName: "qpsNegative", + args: []string{"--hub-api-qps=-5"}, + wantCtrlMgrOpts: ControllerManagerOptions{ + HealthProbeBindAddress: ":8081", + MetricsBindAddress: ":8080", + EnablePprof: false, + PprofPort: 6065, + HubQPS: -1, + HubBurst: 1000, + ResyncPeriod: metav1.Duration{Duration: 6 * time.Hour}, + }, + }, + { + name: "hub client QPS parse error", + flagSetName: "qpsParseError", + args: []string{"--hub-api-qps=abc"}, + wantErred: true, + wantErrMsgSubStr: "failed to parse float64 value", + }, + { + name: "hub client QPS out of range (too small)", + flagSetName: "qpsOutOfRangeTooSmall", + args: []string{"--hub-api-qps=9.9"}, + wantErred: true, + wantErrMsgSubStr: "QPS limit is set to an invalid value", + }, + { + name: "hub client QPS out of range (too large)", + flagSetName: 
"qpsOutOfRangeTooLarge", + args: []string{"--hub-api-qps=10000.1"}, + wantErred: true, + wantErrMsgSubStr: "QPS limit is set to an invalid value", + }, + { + name: "hub client burst parse error", + flagSetName: "burstParseError", + args: []string{"--hub-api-burst=abc"}, + wantErred: true, + wantErrMsgSubStr: "failed to parse int value", + }, + { + name: "hub client burst out of range (too small)", + flagSetName: "burstOutOfRangeTooSmall", + args: []string{"--hub-api-burst=9"}, + wantErred: true, + wantErrMsgSubStr: "burst limit is set to an invalid value", + }, + { + name: "hub client burst out of range (too large)", + flagSetName: "burstOutOfRangeTooLarge", + args: []string{"--hub-api-burst=20001"}, + wantErred: true, + wantErrMsgSubStr: "burst limit is set to an invalid value", + }, + { + name: "resync period parse error", + flagSetName: "resyncParseError", + args: []string{"--resync-period=abc"}, + wantErred: true, + wantErrMsgSubStr: "failed to parse duration value", + }, + { + name: "resync period out of range (too small)", + flagSetName: "resyncOutOfRangeTooSmall", + args: []string{"--resync-period=59m"}, + wantErred: true, + wantErrMsgSubStr: "resync period is set to an invalid value", + }, + { + name: "resync period out of range (too large)", + flagSetName: "resyncOutOfRangeTooLarge", + args: []string{"--resync-period=13h"}, + wantErred: true, + wantErrMsgSubStr: "resync period is set to an invalid value", + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + flags := flag.NewFlagSet(tc.flagSetName, flag.ContinueOnError) + ctrlMgrOpts := ControllerManagerOptions{} + ctrlMgrOpts.AddFlags(flags) + + err := flags.Parse(tc.args) + if tc.wantErred { + if err == nil { + t.Fatalf("flag Parse() = nil, want erred") + } + + if !strings.Contains(err.Error(), tc.wantErrMsgSubStr) { + t.Fatalf("flag Parse() error = %v, want error msg with sub-string %s", err, tc.wantErrMsgSubStr) + } + return + } + + if err != nil { + t.Fatalf("flag Parse() 
= %v, want nil", err) + } + + if diff := cmp.Diff(ctrlMgrOpts, tc.wantCtrlMgrOpts); diff != "" { + t.Errorf("controller manager options diff (-got, +want):\n%s", diff) + } + }) + } +} + +// TestFeatureFlags tests the parsing and validation logic of the feature flags defined in FeatureFlags. +func TestFeatureFlags(t *testing.T) { + testCases := []struct { + name string + flagSetName string + args []string + wantFeatureFlags FeatureFlags + wantErred bool + wantErrMsgSubStr string + }{ + { + name: "all default", + flagSetName: "allDefault", + args: []string{}, + wantFeatureFlags: FeatureFlags{ + EnableV1Beta1APIs: true, + EnableClusterInventoryAPIs: true, + EnableStagedUpdateRunAPIs: true, + EnableEvictionAPIs: true, + EnableResourcePlacementAPIs: true, + }, + }, + { + name: "all specified", + flagSetName: "allSpecified", + args: []string{ + "--enable-v1beta1-apis=true", + "--enable-cluster-inventory-apis=false", + "--enable-staged-update-run-apis=false", + "--enable-eviction-apis=false", + "--enable-resource-placement=false", + }, + wantFeatureFlags: FeatureFlags{ + EnableV1Beta1APIs: true, + EnableClusterInventoryAPIs: false, + EnableStagedUpdateRunAPIs: false, + EnableEvictionAPIs: false, + EnableResourcePlacementAPIs: false, + }, + }, + { + name: "enable v1beta1 API option parse error", + flagSetName: "enableV1Beta1ParseError", + args: []string{"--enable-v1beta1-apis=abc"}, + wantErred: true, + wantErrMsgSubStr: "failed to parse bool value", + }, + { + name: "enable v1beta1 API validation error", + flagSetName: "enableV1Beta1ValidationError", + args: []string{"--enable-v1beta1-apis=false"}, + wantErred: true, + wantErrMsgSubStr: "KubeFleet v1beta1 APIs are the storage version and must be enabled", + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + flags := flag.NewFlagSet(tc.flagSetName, flag.ContinueOnError) + featureFlags := FeatureFlags{} + featureFlags.AddFlags(flags) + + err := flags.Parse(tc.args) + if tc.wantErred { + if err 
== nil { + t.Fatalf("flag Parse() = nil, want erred") + } + + if !strings.Contains(err.Error(), tc.wantErrMsgSubStr) { + t.Fatalf("flag Parse() error = %v, want error msg with sub-string %s", err, tc.wantErrMsgSubStr) + } + return + } + + if err != nil { + t.Fatalf("flag Parse() = %v, want nil", err) + } + + if diff := cmp.Diff(featureFlags, tc.wantFeatureFlags); diff != "" { + t.Errorf("feature flags diff (-got, +want):\n%s", diff) + } + }) + } +} + +// TestClusterManagementOptions tests the parsing and validation logic of the cluster management options defined in ClusterManagementOptions. +func TestClusterManagementOptions(t *testing.T) { + testCases := []struct { + name string + flagSetName string + args []string + wantClusterMgmtOpts ClusterManagementOptions + wantErred bool + wantErrMsgSubStr string + }{ + { + name: "all default", + flagSetName: "allDefault", + args: []string{}, + wantClusterMgmtOpts: ClusterManagementOptions{ + NetworkingAgentsEnabled: false, + UnhealthyThreshold: metav1.Duration{Duration: 60 * time.Second}, + ForceDeleteWaitTime: metav1.Duration{Duration: 15 * time.Minute}, + }, + }, + { + name: "all specified", + flagSetName: "allSpecified", + args: []string{ + "--networking-agents-enabled=true", + "--cluster-unhealthy-threshold=45s", + "--force-delete-wait-time=10m", + }, + wantClusterMgmtOpts: ClusterManagementOptions{ + NetworkingAgentsEnabled: true, + UnhealthyThreshold: metav1.Duration{Duration: 45 * time.Second}, + ForceDeleteWaitTime: metav1.Duration{Duration: 10 * time.Minute}, + }, + }, + { + name: "cluster unhealthy threshold parse error", + flagSetName: "clusterUnhealthyThresholdParseError", + args: []string{"--cluster-unhealthy-threshold=abc"}, + wantErred: true, + wantErrMsgSubStr: "failed to parse duration", + }, + { + name: "cluster unhealthy threshold out of range (too small)", + flagSetName: "clusterUnhealthyThresholdOutOfRangeTooSmall", + args: []string{"--cluster-unhealthy-threshold=29s"}, + wantErred: true, + 
wantErrMsgSubStr: "duration must be in the range [30s, 1h]", + }, + { + name: "cluster unhealthy threshold out of range (too large)", + flagSetName: "clusterUnhealthyThresholdOutOfRangeTooLarge", + args: []string{"--cluster-unhealthy-threshold=1h1s"}, + wantErred: true, + wantErrMsgSubStr: "duration must be in the range [30s, 1h]", + }, + { + name: "force delete wait time parse error", + flagSetName: "forceDeleteWaitTimeParseError", + args: []string{"--force-delete-wait-time=abc"}, + wantErred: true, + wantErrMsgSubStr: "failed to parse duration", + }, + { + name: "force delete wait time out of range (too small)", + flagSetName: "forceDeleteWaitTimeOutOfRangeTooSmall", + args: []string{"--force-delete-wait-time=20s"}, + wantErred: true, + wantErrMsgSubStr: "duration must be in the range [30s, 1h]", + }, + { + name: "force delete wait time out of range (too large)", + flagSetName: "forceDeleteWaitTimeOutOfRangeTooLarge", + args: []string{"--force-delete-wait-time=1h1s"}, + wantErred: true, + wantErrMsgSubStr: "duration must be in the range [30s, 1h]", + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + flags := flag.NewFlagSet(tc.flagSetName, flag.ContinueOnError) + clusterMgmtOpts := ClusterManagementOptions{} + clusterMgmtOpts.AddFlags(flags) + + err := flags.Parse(tc.args) + if tc.wantErred { + if err == nil { + t.Fatalf("flag Parse() = nil, want erred") + } + + if !strings.Contains(err.Error(), tc.wantErrMsgSubStr) { + t.Fatalf("flag Parse() error = %v, want error msg with sub-string %s", err, tc.wantErrMsgSubStr) + } + return + } + + if err != nil { + t.Fatalf("flag Parse() = %v, want nil", err) + } + + if diff := cmp.Diff(clusterMgmtOpts, tc.wantClusterMgmtOpts); diff != "" { + t.Errorf("cluster management options diff (-got, +want):\n%s", diff) + } + }) + } +} + +// TestPlacementManagementOptions tests the parsing and validation logic of the placement management options defined in PlacementManagementOptions. 
+func TestPlacementManagementOptions(t *testing.T) { + testCases := []struct { + name string + flagSetName string + args []string + wantPlacementMgmtOpts PlacementManagementOptions + wantErred bool + wantErrMsgSubStr string + }{ + { + name: "all default", + flagSetName: "allDefault", + args: []string{}, + wantPlacementMgmtOpts: PlacementManagementOptions{ + WorkPendingGracePeriod: metav1.Duration{Duration: 15 * time.Second}, + SkippedPropagatingAPIs: "", + AllowedPropagatingAPIs: "", + SkippedPropagatingNamespaces: "", + ConcurrentResourceChangeSyncs: 20, + MaxFleetSize: 100, + MaxConcurrentClusterPlacement: 100, + PlacementControllerWorkQueueRateLimiterOpts: RateLimitOptions{ + RateLimiterBaseDelay: 5 * time.Millisecond, + RateLimiterMaxDelay: 60 * time.Second, + RateLimiterQPS: 10, + RateLimiterBucketSize: 100, + }, + ResourceSnapshotCreationMinimumInterval: 30 * time.Second, + ResourceChangesCollectionDuration: 15 * time.Second, + }, + }, + { + name: "all specified", + flagSetName: "allSpecified", + args: []string{ + "--work-pending-grace-period=30s", + "--skipped-propagating-apis=apps/v1/Deployment", + "--allowed-propagating-apis=batch/v1/Job", + "--skipped-propagating-namespaces=ns1,ns2", + "--concurrent-resource-change-syncs=30", + "--max-fleet-size=150", + "--max-concurrent-cluster-placement=120", + "--resource-snapshot-creation-minimum-interval=45s", + "--resource-changes-collection-duration=20s", + }, + wantPlacementMgmtOpts: PlacementManagementOptions{ + WorkPendingGracePeriod: metav1.Duration{Duration: 15 * time.Second}, + SkippedPropagatingAPIs: "apps/v1/Deployment", + AllowedPropagatingAPIs: "batch/v1/Job", + SkippedPropagatingNamespaces: "ns1,ns2", + ConcurrentResourceChangeSyncs: 30, + MaxFleetSize: 150, + MaxConcurrentClusterPlacement: 120, + PlacementControllerWorkQueueRateLimiterOpts: RateLimitOptions{ + RateLimiterBaseDelay: 5 * time.Millisecond, + RateLimiterMaxDelay: 60 * time.Second, + RateLimiterQPS: 10, + RateLimiterBucketSize: 100, + }, + 
ResourceSnapshotCreationMinimumInterval: 45 * time.Second, + ResourceChangesCollectionDuration: 20 * time.Second, + }, + }, + { + name: "skipped propagating APIs parse error", + flagSetName: "skippedPropagatingAPIsParseError", + args: []string{"--skipped-propagating-apis=a/b/c/d"}, + wantErred: true, + wantErrMsgSubStr: "invalid list of skipped for propagation APIs", + }, + { + name: "allowed propagating APIs parse error", + flagSetName: "allowedPropagatingAPIsParseError", + args: []string{"--allowed-propagating-apis=a/b/c/d"}, + wantErred: true, + wantErrMsgSubStr: "invalid list of allowed for propagation APIs", + }, + { + name: "concurrent resource change syncs parse error", + flagSetName: "concurrentResourceChangeSyncsParseError", + args: []string{"--concurrent-resource-change-syncs=abc"}, + wantErred: true, + wantErrMsgSubStr: "failed to parse int value", + }, + { + name: "concurrent resource change syncs out of range (too small)", + flagSetName: "concurrentResourceChangeSyncsOutOfRangeTooSmall", + args: []string{"--concurrent-resource-change-syncs=0"}, + wantErred: true, + wantErrMsgSubStr: "number of concurrent resource change syncs must be in the range [1, 100]", + }, + { + name: "concurrent resource change syncs out of range (too large)", + flagSetName: "concurrentResourceChangeSyncsOutOfRangeTooLarge", + args: []string{"--concurrent-resource-change-syncs=101"}, + wantErred: true, + wantErrMsgSubStr: "number of concurrent resource change syncs must be in the range [1, 100]", + }, + { + name: "max fleet size parse error", + flagSetName: "maxFleetSizeParseError", + args: []string{"--max-fleet-size=abc"}, + wantErred: true, + wantErrMsgSubStr: "failed to parse int value", + }, + { + name: "max fleet size out of range (too small)", + flagSetName: "maxFleetSizeOutOfRangeTooSmall", + args: []string{"--max-fleet-size=29"}, + wantErred: true, + wantErrMsgSubStr: "number of max fleet size must be in the range [30, 200]", + }, + { + name: "max fleet size out of range 
(too large)", + flagSetName: "maxFleetSizeOutOfRangeTooLarge", + args: []string{"--max-fleet-size=201"}, + wantErred: true, + wantErrMsgSubStr: "number of max fleet size must be in the range [30, 200]", + }, + { + name: "max concurrent cluster placement parse error", + flagSetName: "maxConcurrentClusterPlacementParseError", + args: []string{"--max-concurrent-cluster-placement=abc"}, + wantErred: true, + wantErrMsgSubStr: "failed to parse int value", + }, + { + name: "max concurrent cluster placement out of range (too small)", + flagSetName: "maxConcurrentClusterPlacementOutOfRangeTooSmall", + args: []string{"--max-concurrent-cluster-placement=9"}, + wantErred: true, + wantErrMsgSubStr: "number of max concurrent cluster placements must be in the range [10, 200]", + }, + { + name: "max concurrent cluster placement out of range (too large)", + flagSetName: "maxConcurrentClusterPlacementOutOfRangeTooLarge", + args: []string{"--max-concurrent-cluster-placement=201"}, + wantErred: true, + wantErrMsgSubStr: "number of max concurrent cluster placements must be in the range [10, 200]", + }, + { + name: "resource snapshot creation minimum interval parse error", + flagSetName: "resourceSnapshotCreationMinimumIntervalParseError", + args: []string{"--resource-snapshot-creation-minimum-interval=abc"}, + wantErred: true, + wantErrMsgSubStr: "failed to parse duration", + }, + { + name: "resource snapshot creation minimum interval out of range (too large)", + flagSetName: "resourceSnapshotCreationMinimumIntervalOutOfRangeTooLarge", + args: []string{"--resource-snapshot-creation-minimum-interval=6m"}, + wantErred: true, + wantErrMsgSubStr: "duration must be in the range [0s, 5m]", + }, + { + name: "resource changes collection duration parse error", + flagSetName: "resourceChangesCollectionDurationParseError", + args: []string{"--resource-changes-collection-duration=abc"}, + wantErred: true, + wantErrMsgSubStr: "failed to parse duration", + }, + { + name: "resource changes collection 
duration out of range (too large)", + flagSetName: "resourceChangesCollectionDurationOutOfRangeTooLarge", + args: []string{"--resource-changes-collection-duration=61s"}, + wantErred: true, + wantErrMsgSubStr: "duration must be in the range [0s, 1m]", + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + flags := flag.NewFlagSet(tc.flagSetName, flag.ContinueOnError) + placementMgmtOpts := PlacementManagementOptions{} + placementMgmtOpts.AddFlags(flags) + + err := flags.Parse(tc.args) + if tc.wantErred { + if err == nil { + t.Fatalf("flag Parse() = nil, want erred") + } + + if !strings.Contains(err.Error(), tc.wantErrMsgSubStr) { + t.Fatalf("flag Parse() error = %v, want error msg with sub-string %s", err, tc.wantErrMsgSubStr) + } + return + } + + if err != nil { + t.Fatalf("flag Parse() = %v, want nil", err) + } + + if diff := cmp.Diff(placementMgmtOpts, tc.wantPlacementMgmtOpts); diff != "" { + t.Errorf("placement management options diff (-got, +want):\n%s", diff) + } + }) + } +} + +// TestRateLimitOptions tests the parsing and validation logic of the rate limit options defined in RateLimitOptions. 
+func TestRateLimitOptions(t *testing.T) { + testCases := []struct { + name string + flagSetName string + args []string + wantRateLimitOpts RateLimitOptions + wantErred bool + wantErrMsgSubStr string + }{ + { + name: "all default", + flagSetName: "allDefault", + args: []string{}, + wantRateLimitOpts: RateLimitOptions{ + RateLimiterBaseDelay: 5 * time.Millisecond, + RateLimiterMaxDelay: 60 * time.Second, + RateLimiterQPS: 10, + RateLimiterBucketSize: 100, + }, + }, + { + name: "all specified", + flagSetName: "allSpecified", + args: []string{ + "--rate-limiter-base-delay=10ms", + "--rate-limiter-max-delay=2s", + "--rate-limiter-qps=20", + "--rate-limiter-bucket-size=200", + }, + wantRateLimitOpts: RateLimitOptions{ + RateLimiterBaseDelay: 10 * time.Millisecond, + RateLimiterMaxDelay: 2 * time.Second, + RateLimiterQPS: 20, + RateLimiterBucketSize: 200, + }, + }, + { + name: "rate limiter base delay parse error", + flagSetName: "rateLimiterBaseDelayParseError", + args: []string{"--rate-limiter-base-delay=abc"}, + wantErred: true, + wantErrMsgSubStr: "failed to parse time duration", + }, + { + name: "rate limiter base delay out of range (too small)", + flagSetName: "rateLimiterBaseDelayOutOfRangeTooSmall", + args: []string{"--rate-limiter-base-delay=500us"}, + wantErred: true, + wantErrMsgSubStr: "the base delay must be a value between [1ms, 200ms]", + }, + { + name: "rate limiter base delay out of range (too large)", + flagSetName: "rateLimiterBaseDelayOutOfRangeTooLarge", + args: []string{"--rate-limiter-base-delay=201ms"}, + wantErred: true, + wantErrMsgSubStr: "the base delay must be a value between [1ms, 200ms]", + }, + { + name: "rate limiter max delay parse error", + flagSetName: "rateLimiterMaxDelayParseError", + args: []string{"--rate-limiter-max-delay=abc"}, + wantErred: true, + wantErrMsgSubStr: "failed to parse time duration", + }, + { + name: "rate limiter max delay out of range (too small)", + flagSetName: "rateLimiterMaxDelayOutOfRangeTooSmall", + args: 
[]string{"--rate-limiter-max-delay=500ms"}, + wantErred: true, + wantErrMsgSubStr: "the max delay must be a value between [1s, 5m]", + }, + { + name: "rate limiter max delay out of range (too large)", + flagSetName: "rateLimiterMaxDelayOutOfRangeTooLarge", + args: []string{"--rate-limiter-max-delay=6m"}, + wantErred: true, + wantErrMsgSubStr: "the max delay must be a value between [1s, 5m]", + }, + { + name: "rate limiter QPS parse error", + flagSetName: "rateLimiterQPSParseError", + args: []string{"--rate-limiter-qps=abc"}, + wantErred: true, + wantErrMsgSubStr: "failed to parse integer", + }, + { + name: "rate limiter QPS out of range (too small)", + flagSetName: "rateLimiterQPSOutOfRangeTooSmall", + args: []string{"--rate-limiter-qps=0"}, + wantErred: true, + wantErrMsgSubStr: "the QPS must be a positive integer in the range [1, 1000]", + }, + { + name: "rate limiter QPS out of range (too large)", + flagSetName: "rateLimiterQPSOutOfRangeTooLarge", + args: []string{"--rate-limiter-qps=1001"}, + wantErred: true, + wantErrMsgSubStr: "the QPS must be a positive integer in the range [1, 1000]", + }, + { + name: "rate limiter bucket size parse error", + flagSetName: "rateLimiterBucketSizeParseError", + args: []string{"--rate-limiter-bucket-size=abc"}, + wantErred: true, + wantErrMsgSubStr: "failed to parse integer", + }, + { + name: "rate limiter bucket size out of range (too small)", + flagSetName: "rateLimiterBucketSizeOutOfRangeTooSmall", + args: []string{"--rate-limiter-bucket-size=0"}, + wantErred: true, + wantErrMsgSubStr: "the bucket size must be a positive integer in the range [1, 10000]", + }, + { + name: "rate limiter bucket size out of range (too large)", + flagSetName: "rateLimiterBucketSizeOutOfRangeTooLarge", + args: []string{"--rate-limiter-bucket-size=10001"}, + wantErred: true, + wantErrMsgSubStr: "the bucket size must be a positive integer in the range [1, 10000]", + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + 
flags := flag.NewFlagSet(tc.flagSetName, flag.ContinueOnError) + rateLimitOpts := RateLimitOptions{} + rateLimitOpts.AddFlags(flags) + + err := flags.Parse(tc.args) + if tc.wantErred { + if err == nil { + t.Fatalf("flag Parse() = nil, want erred") + } + + if !strings.Contains(err.Error(), tc.wantErrMsgSubStr) { + t.Fatalf("flag Parse() error = %v, want error msg with sub-string %s", err, tc.wantErrMsgSubStr) + } + return + } + + if err != nil { + t.Fatalf("flag Parse() = %v, want nil", err) + } + + if diff := cmp.Diff(rateLimitOpts, tc.wantRateLimitOpts); diff != "" { + t.Errorf("rate limit options diff (-got, +want):\n%s", diff) + } + }) + } +} From 43438facff1a0b723109e2b2f24ae57aea2c2ece Mon Sep 17 00:00:00 2001 From: Britania Rodriguez Reyes <145056127+britaniar@users.noreply.github.com> Date: Tue, 3 Mar 2026 09:42:35 -0800 Subject: [PATCH 9/9] feat: placement controller does not create resource snapshot when External rollout strategy (#465) --- pkg/controllers/placement/controller.go | 79 +++++-- .../placement/controller_integration_test.go | 12 +- pkg/controllers/placement/controller_test.go | 209 ++++++++++++++++++ test/e2e/cluster_staged_updaterun_test.go | 151 ++++++++----- test/e2e/staged_updaterun_test.go | 126 +++++++---- 5 files changed, 459 insertions(+), 118 deletions(-) diff --git a/pkg/controllers/placement/controller.go b/pkg/controllers/placement/controller.go index 7001ccf9a..5dcd05f22 100644 --- a/pkg/controllers/placement/controller.go +++ b/pkg/controllers/placement/controller.go @@ -213,31 +213,12 @@ func (r *Reconciler) handleUpdate(ctx context.Context, placementObj fleetv1beta1 return ctrl.Result{}, err } - createResourceSnapshotRes, latestResourceSnapshot, err := r.ResourceSnapshotResolver.GetOrCreateResourceSnapshot(ctx, placementObj, envelopeObjCount, - &fleetv1beta1.ResourceSnapshotSpec{SelectedResources: selectedResources}, int(revisionLimit)) + createResourceSnapshotRes, latestResourceSnapshot, selectedResourceIDs, err := 
r.handleResourceSnapshotByStrategy( + ctx, placementObj, envelopeObjCount, selectedResources, selectedResourceIDs, int(revisionLimit)) if err != nil { return ctrl.Result{}, err } - // We don't requeue the request here immediately so that placement can keep tracking the rollout status. - if createResourceSnapshotRes.RequeueAfter > 0 { - latestResourceSnapshotKObj := klog.KObj(latestResourceSnapshot) - // We cannot create the resource snapshot immediately because of the resource snapshot creation interval. - // Rebuild the seletedResourceIDs using the latestResourceSnapshot. - latestResourceSnapshotIndex, err := labels.ExtractResourceIndexFromResourceSnapshot(latestResourceSnapshot) - if err != nil { - klog.ErrorS(err, "Failed to extract the resource index from the resourceSnapshot", "placement", placementKObj, "resourceSnapshot", latestResourceSnapshotKObj) - return ctrl.Result{}, controller.NewUnexpectedBehaviorError(err) - } - placementKey := controller.GetObjectKeyFromNamespaceName(placementObj.GetNamespace(), placementObj.GetName()) - selectedResourceIDs, err = controller.CollectResourceIdentifiersUsingMasterResourceSnapshot(ctx, r.Client, placementKey, latestResourceSnapshot, strconv.Itoa(latestResourceSnapshotIndex)) - if err != nil { - klog.ErrorS(err, "Failed to collect resource identifiers from the resourceSnapshot", "placement", placementKObj, "resourceSnapshot", latestResourceSnapshotKObj) - return ctrl.Result{}, err - } - klog.V(2).InfoS("Fetched the selected resources from the lastestResourceSnapshot", "placement", placementKObj, "resourceSnapshot", latestResourceSnapshotKObj, "generation", placementObj.GetGeneration()) - } - // isScheduleFullfilled is to indicate whether we need to requeue the placement request to track the rollout status. 
isScheduleFullfilled, err := r.setPlacementStatus(ctx, placementObj, selectedResourceIDs, latestSchedulingPolicySnapshot, latestResourceSnapshot) if err != nil { @@ -315,6 +296,55 @@ func (r *Reconciler) handleUpdate(ctx context.Context, placementObj fleetv1beta1 return ctrl.Result{RequeueAfter: controllerResyncPeriod}, nil } +// handleResourceSnapshotByStrategy handles resource snapshot resolution based on rollout strategy. +// For External rollout strategy, it only fetches the existing snapshot (can be nil). +// For other strategies, it creates or gets a resource snapshot and may update selectedResourceIDs if requeue is needed. +func (r *Reconciler) handleResourceSnapshotByStrategy( + ctx context.Context, + placementObj fleetv1beta1.PlacementObj, + envelopeObjCount int, + selectedResources []fleetv1beta1.ResourceContent, + selectedResourceIDs []fleetv1beta1.ResourceIdentifier, + revisionHistoryLimit int, +) (ctrl.Result, fleetv1beta1.ResourceSnapshotObj, []fleetv1beta1.ResourceIdentifier, error) { + placementKObj := klog.KObj(placementObj) + placementSpec := placementObj.GetPlacementSpec() + + // For External rollout strategy, the placement controller should not create new resource snapshots. + // The external controller (e.g., UpdateRun controller) is responsible for creating them. + if placementSpec.Strategy.Type == fleetv1beta1.ExternalRolloutStrategyType { + // latestResourceSnapshot is nil for External strategy - the external controller will create it. 
+ klog.V(2).InfoS("Using external rollout strategy, skipping resource snapshot creation", "placement", placementKObj) + return ctrl.Result{}, nil, selectedResourceIDs, nil + } + + createResourceSnapshotRes, latestResourceSnapshot, err := r.ResourceSnapshotResolver.GetOrCreateResourceSnapshot(ctx, placementObj, envelopeObjCount, + &fleetv1beta1.ResourceSnapshotSpec{SelectedResources: selectedResources}, revisionHistoryLimit) + if err != nil { + return ctrl.Result{}, nil, selectedResourceIDs, err + } + + // We don't requeue the request here immediately so that placement can keep tracking the rollout status. + if createResourceSnapshotRes.RequeueAfter > 0 { + latestResourceSnapshotKObj := klog.KObj(latestResourceSnapshot) + // We cannot create the resource snapshot immediately because of the resource snapshot creation interval. + // Rebuild the selectedResourceIDs using the latestResourceSnapshot. + latestResourceSnapshotIndex, err := labels.ExtractResourceIndexFromResourceSnapshot(latestResourceSnapshot) + if err != nil { + klog.ErrorS(err, "Failed to extract the resource index from the resourceSnapshot", "placement", placementKObj, "resourceSnapshot", latestResourceSnapshotKObj) + return ctrl.Result{}, nil, selectedResourceIDs, controller.NewUnexpectedBehaviorError(err) + } + placementKey := controller.GetObjectKeyFromNamespaceName(placementObj.GetNamespace(), placementObj.GetName()) + selectedResourceIDs, err = controller.CollectResourceIdentifiersUsingMasterResourceSnapshot(ctx, r.Client, placementKey, latestResourceSnapshot, strconv.Itoa(latestResourceSnapshotIndex)) + if err != nil { + klog.ErrorS(err, "Failed to collect resource identifiers from the resourceSnapshot", "placement", placementKObj, "resourceSnapshot", latestResourceSnapshotKObj) + return ctrl.Result{}, nil, selectedResourceIDs, err + } + klog.V(2).InfoS("Fetched the selected resources from the latestResourceSnapshot", "placement", placementKObj, "resourceSnapshot", latestResourceSnapshotKObj, 
"generation", placementObj.GetGeneration()) + } + return createResourceSnapshotRes, latestResourceSnapshot, selectedResourceIDs, nil +} + func (r *Reconciler) getOrCreateSchedulingPolicySnapshot(ctx context.Context, placementObj fleetv1beta1.PlacementObj, revisionHistoryLimit int) (fleetv1beta1.PolicySnapshotObj, error) { placementKObj := klog.KObj(placementObj) placementSpec := placementObj.GetPlacementSpec() @@ -571,7 +601,12 @@ func (r *Reconciler) setPlacementStatus( scheduledCondition := buildScheduledCondition(placementObj, latestSchedulingPolicySnapshot) placementObj.SetConditions(scheduledCondition) // set ObservedResourceIndex from the latest resource snapshot's resource index label, before we set Synchronized, Applied conditions. - placementStatus.ObservedResourceIndex = latestResourceSnapshot.GetLabels()[fleetv1beta1.ResourceIndexLabel] + // For External rollout strategy, latestResourceSnapshot can be nil if no snapshot has been created yet by the external controller. + if latestResourceSnapshot != nil { + placementStatus.ObservedResourceIndex = latestResourceSnapshot.GetLabels()[fleetv1beta1.ResourceIndexLabel] + } else { + placementStatus.ObservedResourceIndex = "" + } // When scheduledCondition is unknown, appliedCondition should be unknown too. // Note: If the scheduledCondition is failed, it means the placement requirement cannot be satisfied fully. 
For example, diff --git a/pkg/controllers/placement/controller_integration_test.go b/pkg/controllers/placement/controller_integration_test.go index 9d304df04..1206a4789 100644 --- a/pkg/controllers/placement/controller_integration_test.go +++ b/pkg/controllers/placement/controller_integration_test.go @@ -2029,8 +2029,14 @@ var _ = Describe("Test ClusterResourcePlacement Controller", func() { By("Check clusterSchedulingPolicySnapshot") gotPolicySnapshot = checkClusterSchedulingPolicySnapshot() - By("Check clusterResourceSnapshot") - gotResourceSnapshot = checkClusterResourceSnapshot() + By("Check that no clusterResourceSnapshot is created (External rollout strategy)") + // For External rollout strategy, the placement controller should NOT create resource snapshots. + // The external controller is responsible for creating them. + resourceSnapshotList := &placementv1beta1.ClusterResourceSnapshotList{} + Consistently(func() int { + Expect(k8sClient.List(ctx, resourceSnapshotList, client.MatchingLabels{placementv1beta1.PlacementTrackingLabel: crp.Name})).Should(Succeed()) + return len(resourceSnapshotList.Items) + }, consistentlyTimeout, interval).Should(Equal(0), "Resource snapshot should not be created for External rollout strategy") By("Validate CRP status") wantCRP := &placementv1beta1.ClusterResourcePlacement{ @@ -2040,7 +2046,7 @@ var _ = Describe("Test ClusterResourcePlacement Controller", func() { }, Spec: crp.Spec, Status: placementv1beta1.PlacementStatus{ - ObservedResourceIndex: "0", + ObservedResourceIndex: "", Conditions: []metav1.Condition{ { Status: metav1.ConditionUnknown, diff --git a/pkg/controllers/placement/controller_test.go b/pkg/controllers/placement/controller_test.go index 50b3c0aae..2d1e37ff3 100644 --- a/pkg/controllers/placement/controller_test.go +++ b/pkg/controllers/placement/controller_test.go @@ -23,6 +23,7 @@ import ( "fmt" "strconv" "testing" + "time" "github.com/google/go-cmp/cmp" "github.com/google/go-cmp/cmp/cmpopts" @@ -2390,3 
+2391,211 @@ func TestDetermineRolloutStateForPlacementWithExternalRolloutStrategy(t *testing }) } } + +func TestHandleResourceSnapshotByStrategy(t *testing.T) { + tests := []struct { + name string + crp *fleetv1beta1.ClusterResourcePlacement + existingSnapshots []client.Object + selectedResources []fleetv1beta1.ResourceContent + selectedResourceIDs []fleetv1beta1.ResourceIdentifier + snapshotResolverConfig *controller.ResourceSnapshotConfig // optional Config for the resolver + wantSnapshot bool + wantSnapshotName string + wantSelectedResourceIDs []fleetv1beta1.ResourceIdentifier + wantRequeueAfter bool // true if we expect RequeueAfter > 0 + wantErr bool + }{ + { + name: "External rollout strategy with no existing snapshot", + crp: &fleetv1beta1.ClusterResourcePlacement{ + ObjectMeta: metav1.ObjectMeta{ + Name: testCRPName, + Generation: 1, + }, + Spec: fleetv1beta1.PlacementSpec{ + ResourceSelectors: []fleetv1beta1.ResourceSelectorTerm{ + { + Group: corev1.GroupName, + Version: "v1", + Kind: "Namespace", + }, + }, + Strategy: fleetv1beta1.RolloutStrategy{ + Type: fleetv1beta1.ExternalRolloutStrategyType, + }, + }, + }, + existingSnapshots: []client.Object{}, + selectedResources: []fleetv1beta1.ResourceContent{}, + selectedResourceIDs: []fleetv1beta1.ResourceIdentifier{{Kind: "Namespace", Name: "test"}}, + wantSnapshot: false, + wantSnapshotName: "", + wantSelectedResourceIDs: []fleetv1beta1.ResourceIdentifier{{Kind: "Namespace", Name: "test"}}, + wantRequeueAfter: false, + wantErr: false, + }, + { + name: "RollingUpdate strategy creates new snapshot when none exists", + crp: &fleetv1beta1.ClusterResourcePlacement{ + ObjectMeta: metav1.ObjectMeta{ + Name: testCRPName, + Generation: 1, + }, + Spec: fleetv1beta1.PlacementSpec{ + ResourceSelectors: []fleetv1beta1.ResourceSelectorTerm{ + { + Group: corev1.GroupName, + Version: "v1", + Kind: "Namespace", + }, + }, + Strategy: fleetv1beta1.RolloutStrategy{ + Type: fleetv1beta1.RollingUpdateRolloutStrategyType, + }, + 
}, + }, + existingSnapshots: []client.Object{}, + selectedResources: []fleetv1beta1.ResourceContent{ + { + RawExtension: runtime.RawExtension{ + Raw: []byte(`{"apiVersion":"v1","kind":"Namespace","metadata":{"name":"test-ns"}}`), + }, + }, + }, + selectedResourceIDs: []fleetv1beta1.ResourceIdentifier{{Kind: "Namespace", Name: "test-ns"}}, + wantSnapshot: true, + wantSnapshotName: fmt.Sprintf(fleetv1beta1.ResourceSnapshotNameFmt, testCRPName, 0), + wantSelectedResourceIDs: []fleetv1beta1.ResourceIdentifier{{Kind: "Namespace", Name: "test-ns"}}, + wantRequeueAfter: false, + wantErr: false, + }, + { + name: "RollingUpdate strategy with different hash triggers requeue when interval configured", + crp: &fleetv1beta1.ClusterResourcePlacement{ + ObjectMeta: metav1.ObjectMeta{ + Name: testCRPName, + Generation: 1, + }, + Spec: fleetv1beta1.PlacementSpec{ + ResourceSelectors: []fleetv1beta1.ResourceSelectorTerm{ + { + Group: corev1.GroupName, + Version: "v1", + Kind: "Namespace", + }, + }, + Strategy: fleetv1beta1.RolloutStrategy{ + Type: fleetv1beta1.RollingUpdateRolloutStrategyType, + }, + }, + }, + existingSnapshots: []client.Object{ + &fleetv1beta1.ClusterResourceSnapshot{ + ObjectMeta: metav1.ObjectMeta{ + Name: fmt.Sprintf(fleetv1beta1.ResourceSnapshotNameFmt, testCRPName, 0), + CreationTimestamp: metav1.Now(), + Labels: map[string]string{ + fleetv1beta1.PlacementTrackingLabel: testCRPName, + fleetv1beta1.IsLatestSnapshotLabel: strconv.FormatBool(true), + fleetv1beta1.ResourceIndexLabel: "0", + }, + Annotations: map[string]string{ + fleetv1beta1.ResourceGroupHashAnnotation: "old-hash-different-from-new", + fleetv1beta1.NumberOfResourceSnapshotsAnnotation: "1", + }, + }, + Spec: fleetv1beta1.ResourceSnapshotSpec{ + SelectedResources: []fleetv1beta1.ResourceContent{ + { + RawExtension: runtime.RawExtension{ + Raw: []byte(`{"apiVersion":"v1","kind":"Namespace","metadata":{"name":"old-ns"}}`), + }, + }, + }, + }, + }, + }, + selectedResources: 
[]fleetv1beta1.ResourceContent{ + { + RawExtension: runtime.RawExtension{ + Raw: []byte(`{"apiVersion":"v1","kind":"Namespace","metadata":{"name":"new-ns"}}`), + }, + }, + }, + selectedResourceIDs: []fleetv1beta1.ResourceIdentifier{{Kind: "Namespace", Name: "new-ns"}}, + snapshotResolverConfig: controller.NewResourceSnapshotConfig(15*time.Second, 10*time.Second), + wantSnapshot: true, + wantSnapshotName: fmt.Sprintf(fleetv1beta1.ResourceSnapshotNameFmt, testCRPName, 0), + // When requeue is triggered, selectedResourceIDs are rebuilt from the existing snapshot + wantSelectedResourceIDs: []fleetv1beta1.ResourceIdentifier{ + { + Group: "", + Version: "v1", + Kind: "Namespace", + Name: "old-ns", + }, + }, + wantRequeueAfter: true, + wantErr: false, + }, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + scheme := serviceScheme(t) + fakeClient := fake.NewClientBuilder(). + WithScheme(scheme). + WithObjects(tc.existingSnapshots...). + Build() + resolver := controller.NewResourceSnapshotResolver(fakeClient, scheme) + if tc.snapshotResolverConfig != nil { + resolver.Config = tc.snapshotResolverConfig + } + r := Reconciler{ + Client: fakeClient, + Scheme: scheme, + ResourceSnapshotResolver: resolver, + } + + gotResult, gotSnapshot, gotSelectedResourceIDs, gotErr := r.handleResourceSnapshotByStrategy( + context.Background(), tc.crp, 0, tc.selectedResources, tc.selectedResourceIDs, 10) + + if (gotErr != nil) != tc.wantErr { + t.Errorf("handleResourceSnapshotByStrategy() error = %v, wantErr %v", gotErr, tc.wantErr) + return + } + + if tc.wantSnapshot { + if gotSnapshot == nil { + t.Errorf("handleResourceSnapshotByStrategy() gotSnapshot = nil, want non-nil") + return + } + if gotSnapshot.GetName() != tc.wantSnapshotName { + t.Errorf("handleResourceSnapshotByStrategy() gotSnapshot.Name = %v, want %v", gotSnapshot.GetName(), tc.wantSnapshotName) + } + } else { + if gotSnapshot != nil { + t.Errorf("handleResourceSnapshotByStrategy() gotSnapshot = %v, want 
nil", gotSnapshot.GetName()) + } + } + + if diff := cmp.Diff(tc.wantSelectedResourceIDs, gotSelectedResourceIDs); diff != "" { + t.Errorf("handleResourceSnapshotByStrategy() selectedResourceIDs mismatch (-want, +got):\n%s", diff) + } + + // Verify RequeueAfter behavior + gotRequeueAfter := gotResult.RequeueAfter > 0 + if gotRequeueAfter != tc.wantRequeueAfter { + t.Errorf("handleResourceSnapshotByStrategy() gotResult.RequeueAfter > 0 = %v, want %v", gotRequeueAfter, tc.wantRequeueAfter) + } + + // For External strategy, we always expect no requeue + if tc.crp.Spec.Strategy.Type == fleetv1beta1.ExternalRolloutStrategyType { + if gotResult.RequeueAfter != 0 { + t.Errorf("handleResourceSnapshotByStrategy() gotResult.RequeueAfter = %v, want 0 for External strategy", gotResult.RequeueAfter) + } + } + }) + } +} diff --git a/test/e2e/cluster_staged_updaterun_test.go b/test/e2e/cluster_staged_updaterun_test.go index a661ba502..eed3ef16e 100644 --- a/test/e2e/cluster_staged_updaterun_test.go +++ b/test/e2e/cluster_staged_updaterun_test.go @@ -120,8 +120,8 @@ var _ = Describe("test CRP rollout with staged update run", func() { It("Should not rollout any resources to member clusters as there's no update run yet", checkIfRemovedWorkResourcesFromAllMemberClustersConsistently) - It("Should have the latest resource snapshot", func() { - validateLatestClusterResourceSnapshot(crpName, resourceSnapshotIndex1st) + It("Should NOT create the latest resource snapshot yet", func() { + validateNoClusterResourceSnapshot(crpName) }) It("Should successfully schedule the crp", func() { @@ -135,7 +135,11 @@ var _ = Describe("test CRP rollout with staged update run", func() { It("Should create a cluster staged update run successfully with auto-created resource snapshot", func() { By("Create a cluster staged update run without specifying resource snapshot index, triggering auto-creation") - createClusterStagedUpdateRunWithAutoCreatedSnapshot(updateRunNames[0], crpName, strategyName) + 
createClusterStagedUpdateRunWithAutoCreatedSnapshot(updateRunNames[0], crpName, strategyName, placementv1beta1.StateRun) + }) + + It("Should have the latest resource snapshot", func() { + validateLatestClusterResourceSnapshot(crpName, resourceSnapshotIndex1st) }) It("Should rollout resources to member-cluster-2 only and complete stage canary", func() { @@ -182,6 +186,11 @@ var _ = Describe("test CRP rollout with staged update run", func() { Consistently(crpStatusUpdatedActual, consistentlyDuration, consistentlyInterval).Should(Succeed(), "Failed to keep CRP %s status as expected", crpName) }) + It("Should create a new cluster staged update run successfully and use the auto-created snapshot for updated resources", func() { + By("Create a new cluster staged update run without specifying resource snapshot index, triggering auto-creation for updated resources") + createClusterStagedUpdateRunWithAutoCreatedSnapshot(updateRunNames[1], crpName, strategyName, placementv1beta1.StateRun) + }) + It("Should create a new latest resource snapshot", func() { crsList := &placementv1beta1.ClusterResourceSnapshotList{} Eventually(func() error { @@ -198,11 +207,6 @@ var _ = Describe("test CRP rollout with staged update run", func() { }, eventuallyDuration, eventuallyInterval).Should(Succeed(), "Failed get the new latest resourcensnapshot") }) - It("Should create a new cluster staged update run successfully and use the auto-created snapshot for updated resources", func() { - By("Create a new cluster staged update run without specifying resource snapshot index, triggering auto-creation for updated resources") - createClusterStagedUpdateRunWithAutoCreatedSnapshot(updateRunNames[1], crpName, strategyName) - }) - It("Should rollout resources to member-cluster-2 only and complete stage canary", func() { By("Verify that the new configmap is updated on member-cluster-2") configMapActual := configMapPlacedOnClusterActual(allMemberClusters[1], &newConfigMap) @@ -301,8 +305,8 @@ var _ = 
Describe("test CRP rollout with staged update run", func() { It("Should not rollout any resources to member clusters as there's no update run yet", checkIfRemovedWorkResourcesFromAllMemberClustersConsistently) - It("Should have the latest resource snapshot", func() { - validateLatestClusterResourceSnapshot(crpName, resourceSnapshotIndex1st) + It("Should NOT create the latest resource snapshot yet", func() { + validateNoClusterResourceSnapshot(crpName) }) It("Should successfully schedule the crp", func() { @@ -315,7 +319,11 @@ var _ = Describe("test CRP rollout with staged update run", func() { }) It("Should create a cluster staged update run successfully", func() { - createClusterStagedUpdateRunSucceed(updateRunNames[0], crpName, resourceSnapshotIndex1st, strategyName, placementv1beta1.StateRun) + createClusterStagedUpdateRunWithAutoCreatedSnapshot(updateRunNames[0], crpName, strategyName, placementv1beta1.StateRun) + }) + + It("Should have the latest resource snapshot", func() { + validateLatestClusterResourceSnapshot(crpName, resourceSnapshotIndex1st) }) It("Should rollout resources to member-cluster-2 only and complete stage canary", func() { @@ -362,6 +370,10 @@ var _ = Describe("test CRP rollout with staged update run", func() { Consistently(crpStatusUpdatedActual, consistentlyDuration, consistentlyInterval).Should(Succeed(), "Failed to keep CRP %s status as expected", crpName) }) + It("Should create a new cluster staged update run successfully", func() { + createClusterStagedUpdateRunWithAutoCreatedSnapshot(updateRunNames[1], crpName, strategyName, placementv1beta1.StateRun) + }) + It("Should create a new latest resource snapshot", func() { crsList := &placementv1beta1.ClusterResourceSnapshotList{} Eventually(func() error { @@ -378,10 +390,6 @@ var _ = Describe("test CRP rollout with staged update run", func() { }, eventuallyDuration, eventuallyInterval).Should(Succeed(), "Failed get the new latest resourcensnapshot") }) - It("Should create a new cluster 
staged update run successfully", func() { - createClusterStagedUpdateRunSucceed(updateRunNames[1], crpName, resourceSnapshotIndex2nd, strategyName, placementv1beta1.StateRun) - }) - It("Should rollout resources to member-cluster-2 only and complete stage canary", func() { By("Verify that the new configmap is updated on member-cluster-2") configMapActual := configMapPlacedOnClusterActual(allMemberClusters[1], &newConfigMap) @@ -526,8 +534,8 @@ var _ = Describe("test CRP rollout with staged update run", func() { It("Should not rollout any resources to member clusters as there's no update run yet", checkIfRemovedWorkResourcesFromAllMemberClustersConsistently) - It("Should have the latest resource snapshot", func() { - validateLatestClusterResourceSnapshot(crpName, resourceSnapshotIndex1st) + It("Should NOT create the latest resource snapshot yet", func() { + validateNoClusterResourceSnapshot(crpName) }) It("Should successfully schedule the crp", func() { @@ -540,7 +548,11 @@ var _ = Describe("test CRP rollout with staged update run", func() { }) It("Should create a cluster staged update run successfully", func() { - createClusterStagedUpdateRunSucceed(updateRunNames[0], crpName, resourceSnapshotIndex1st, strategyName, placementv1beta1.StateRun) + createClusterStagedUpdateRunWithAutoCreatedSnapshot(updateRunNames[0], crpName, strategyName, placementv1beta1.StateRun) + }) + + It("Should have the latest resource snapshot", func() { + validateLatestClusterResourceSnapshot(crpName, resourceSnapshotIndex1st) }) It("Should rollout resources to member-cluster-2 only and complete stage canary", func() { @@ -730,8 +742,8 @@ var _ = Describe("test CRP rollout with staged update run", func() { It("Should not rollout any resources to member clusters as there's no update run yet", checkIfRemovedWorkResourcesFromAllMemberClustersConsistently) - It("Should have the latest resource snapshot", func() { - validateLatestClusterResourceSnapshot(crpName, resourceSnapshotIndex1st) + 
It("Should NOT create the latest resource snapshot yet", func() { + validateNoClusterResourceSnapshot(crpName) }) It("Should successfully schedule the crp", func() { @@ -744,7 +756,11 @@ var _ = Describe("test CRP rollout with staged update run", func() { }) It("Should create a cluster staged update run successfully", func() { - createClusterStagedUpdateRunSucceed(updateRunNames[0], crpName, resourceSnapshotIndex1st, strategyName, placementv1beta1.StateRun) + createClusterStagedUpdateRunWithAutoCreatedSnapshot(updateRunNames[0], crpName, strategyName, placementv1beta1.StateRun) + }) + + It("Should have the latest resource snapshot", func() { + validateLatestClusterResourceSnapshot(crpName, resourceSnapshotIndex1st) }) It("Should not rollout any resources to member clusters and complete stage canary", func() { @@ -1013,8 +1029,8 @@ var _ = Describe("test CRP rollout with staged update run", func() { It("Should not rollout any resources to member clusters as there's no update run yet", checkIfRemovedWorkResourcesFromAllMemberClustersConsistently) - It("Should have the latest resource snapshot", func() { - validateLatestClusterResourceSnapshot(crpName, resourceSnapshotIndex1st) + It("Should NOT create the latest resource snapshot yet", func() { + validateNoClusterResourceSnapshot(crpName) }) It("Should successfully schedule the crp", func() { @@ -1027,7 +1043,11 @@ var _ = Describe("test CRP rollout with staged update run", func() { }) It("Should create a cluster staged update run successfully", func() { - createClusterStagedUpdateRunSucceed(updateRunName, crpName, resourceSnapshotIndex1st, strategyName, placementv1beta1.StateRun) + createClusterStagedUpdateRunWithAutoCreatedSnapshot(updateRunName, crpName, strategyName, placementv1beta1.StateRun) + }) + + It("Should have the latest resource snapshot", func() { + validateLatestClusterResourceSnapshot(crpName, resourceSnapshotIndex1st) }) It("Should rollout resources to member-cluster-2 only and complete stage canary", 
func() { @@ -1122,8 +1142,8 @@ var _ = Describe("test CRP rollout with staged update run", func() { It("Should not rollout any resources to member clusters as there's no update run yet", checkIfRemovedWorkResourcesFromAllMemberClustersConsistently) - It("Should have the latest resource snapshot", func() { - validateLatestClusterResourceSnapshot(crpName, resourceSnapshotIndex1st) + It("Should NOT create the latest resource snapshot yet", func() { + validateNoClusterResourceSnapshot(crpName) }) It("Should successfully schedule the crp", func() { @@ -1136,7 +1156,11 @@ var _ = Describe("test CRP rollout with staged update run", func() { }) It("Should create a cluster staged update run successfully", func() { - createClusterStagedUpdateRunSucceed(updateRunName, crpName, resourceSnapshotIndex1st, strategyName, placementv1beta1.StateRun) + createClusterStagedUpdateRunWithAutoCreatedSnapshot(updateRunName, crpName, strategyName, placementv1beta1.StateRun) + }) + + It("Should have the latest resource snapshot", func() { + validateLatestClusterResourceSnapshot(crpName, resourceSnapshotIndex1st) }) It("Should report diff for member-cluster-2 only and completes stage canary", func() { @@ -1243,16 +1267,16 @@ var _ = Describe("test CRP rollout with staged update run", func() { } }) - It("Should have the new resource snapshot but CRP status should remain completed with old snapshot", func() { - validateLatestClusterResourceSnapshot(crpName, resourceSnapshotIndex2nd) + It("Should NOT have a new resource snapshot and CRP status should remain completed with old snapshot", func() { + validateLatestClusterResourceSnapshot(crpName, resourceSnapshotIndex1st) // CRP status should still show completed with old snapshot crpStatusUpdatedActual := crpStatusUpdatedActual(workResourceIdentifiers(), allMemberClusterNames, nil, resourceSnapshotIndex1st) Consistently(crpStatusUpdatedActual, consistentlyDuration, consistentlyInterval).Should(Succeed(), "Failed to keep CRP %s status as expected", 
crpName) }) - It("Create a staged update run with new resourceSnapshotIndex and verify rollout happens", func() { - createClusterStagedUpdateRunSucceed(updateRunName, crpName, resourceSnapshotIndex2nd, strategyName, placementv1beta1.StateRun) + It("Create a staged update run with auto-created snapshot and verify rollout happens", func() { + createClusterStagedUpdateRunWithAutoCreatedSnapshot(updateRunName, crpName, strategyName, placementv1beta1.StateRun) // Verify rollout to canary cluster first By("Verify that the new configmap is updated on member-cluster-2 during canary stage") @@ -1324,9 +1348,10 @@ var _ = Describe("test CRP rollout with staged update run", func() { }) It("Should create a staged update run and verify cluster approval request is created", func() { - validateLatestClusterResourceSnapshot(crpName, resourceSnapshotIndex1st) + validateNoClusterResourceSnapshot(crpName) validateLatestClusterSchedulingPolicySnapshot(crpName, policySnapshotIndex1st, 3) - createClusterStagedUpdateRunSucceed(updateRunName, crpName, resourceSnapshotIndex1st, strategyName, placementv1beta1.StateRun) + createClusterStagedUpdateRunWithAutoCreatedSnapshot(updateRunName, crpName, strategyName, placementv1beta1.StateRun) + validateLatestClusterResourceSnapshot(crpName, resourceSnapshotIndex1st) // Verify that cluster approval request is created for canary stage. 
Eventually(func() error { @@ -1424,8 +1449,8 @@ var _ = Describe("test CRP rollout with staged update run", func() { It("Should not rollout any resources to member clusters as there's no update run yet", checkIfRemovedWorkResourcesFromAllMemberClustersConsistently) - It("Should have the latest resource snapshot", func() { - validateLatestClusterResourceSnapshot(crpName, resourceSnapshotIndex1st) + It("Should NOT create the latest resource snapshot yet", func() { + validateNoClusterResourceSnapshot(crpName) }) It("Should successfully schedule the crp", func() { @@ -1438,7 +1463,11 @@ var _ = Describe("test CRP rollout with staged update run", func() { }) It("Should create a cluster staged update run successfully", func() { - createClusterStagedUpdateRunSucceed(updateRunName, crpName, resourceSnapshotIndex1st, strategyName, placementv1beta1.StateRun) + createClusterStagedUpdateRunWithAutoCreatedSnapshot(updateRunName, crpName, strategyName, placementv1beta1.StateRun) + }) + + It("Should have the latest resource snapshot", func() { + validateLatestClusterResourceSnapshot(crpName, resourceSnapshotIndex1st) }) It("Should complete the cluster staged update run with all 3 clusters updated in parallel", func() { @@ -1514,8 +1543,8 @@ var _ = Describe("test CRP rollout with staged update run", func() { It("Should not rollout any resources to member clusters as there's no update run yet", checkIfRemovedWorkResourcesFromAllMemberClustersConsistently) - It("Should have the latest resource snapshot", func() { - validateLatestClusterResourceSnapshot(crpName, resourceSnapshotIndex1st) + It("Should NOT create the latest resource snapshot yet", func() { + validateNoClusterResourceSnapshot(crpName) }) It("Should successfully schedule the crp", func() { @@ -1528,7 +1557,11 @@ var _ = Describe("test CRP rollout with staged update run", func() { }) It("Should create a cluster staged update run successfully", func() { - createClusterStagedUpdateRunSucceed(updateRunName, crpName, 
resourceSnapshotIndex1st, strategyName, placementv1beta1.StateRun) + createClusterStagedUpdateRunWithAutoCreatedSnapshot(updateRunName, crpName, strategyName, placementv1beta1.StateRun) + }) + + It("Should have the latest resource snapshot", func() { + validateLatestClusterResourceSnapshot(crpName, resourceSnapshotIndex1st) }) It("Should complete the cluster staged update run with all 3 clusters", func() { @@ -1595,8 +1628,8 @@ var _ = Describe("test CRP rollout with staged update run", func() { It("Should not rollout any resources to member clusters as there's no update run yet", checkIfRemovedWorkResourcesFromAllMemberClustersConsistently) - It("Should have the latest resource snapshot", func() { - validateLatestClusterResourceSnapshot(crpName, resourceSnapshotIndex1st) + It("Should NOT create the latest resource snapshot yet", func() { + validateNoClusterResourceSnapshot(crpName) }) It("Should successfully schedule the crp", func() { @@ -1610,7 +1643,11 @@ var _ = Describe("test CRP rollout with staged update run", func() { It("Should create a cluster staged update run successfully", func() { By("Creating Cluster Staged Update Run in state Initialize") - createClusterStagedUpdateRunSucceed(updateRunNames[0], crpName, resourceSnapshotIndex1st, strategyName, placementv1beta1.StateInitialize) + createClusterStagedUpdateRunWithAutoCreatedSnapshot(updateRunNames[0], crpName, strategyName, placementv1beta1.StateInitialize) + }) + + It("Should have the latest resource snapshot", func() { + validateLatestClusterResourceSnapshot(crpName, resourceSnapshotIndex1st) }) It("Should not start rollout as the update run is in Initialize state", func() { @@ -1749,12 +1786,14 @@ var _ = Describe("Test member cluster join and leave flow with updateRun", Label checkIfRemovedWorkResourcesFromAllMemberClustersConsistently() - By("Validating created resource snapshot and policy snapshot") - validateLatestClusterResourceSnapshot(crpName, resourceSnapshotIndex1st) + By("Validating 
created policy snapshot and no resource snapshot") validateLatestClusterSchedulingPolicySnapshot(crpName, policySnapshotIndex1st, 3) By("Creating the first staged update run") - createClusterStagedUpdateRunSucceed(updateRunNames[0], crpName, resourceSnapshotIndex1st, strategyName, placementv1beta1.StateRun) + createClusterStagedUpdateRunWithAutoCreatedSnapshot(updateRunNames[0], crpName, strategyName, placementv1beta1.StateRun) + + By("validating created resource snapshot") + validateLatestClusterResourceSnapshot(crpName, resourceSnapshotIndex1st) By("Validating staged update run has succeeded") csurSucceededActual := testutilsupdaterun.ClusterStagedUpdateRunStatusSucceededActual(ctx, hubClient, updateRunNames[0], resourceSnapshotIndex1st, policySnapshotIndex1st, 3, defaultApplyStrategy, &strategy.Spec, [][]string{{allMemberClusterNames[0], allMemberClusterNames[1], allMemberClusterNames[2]}}, nil, nil, nil, true) @@ -1890,13 +1929,13 @@ var _ = Describe("Test member cluster join and leave flow with updateRun", Label updateConfigMapSucceed(&newConfigMap) }) - It("Should have the latest resource snapshot with updated resources", func() { - validateLatestClusterResourceSnapshot(crpName, resourceSnapshotIndex2nd) - }) - It("Should reschedule to member cluster 1 and create a new cluster staged update run successfully", func() { validateLatestClusterSchedulingPolicySnapshot(crpName, policySnapshotIndex1st, 3) - createClusterStagedUpdateRunSucceed(updateRunNames[1], crpName, resourceSnapshotIndex2nd, strategyName, placementv1beta1.StateRun) + createClusterStagedUpdateRunWithAutoCreatedSnapshot(updateRunNames[1], crpName, strategyName, placementv1beta1.StateRun) + }) + + It("Should have the latest resource snapshot with updated resources", func() { + validateLatestClusterResourceSnapshot(crpName, resourceSnapshotIndex2nd) }) It("Should complete the staged update run, complete CRP, and rollout updated resources to all member clusters", func() { @@ -2048,6 +2087,14 @@ func 
validateLatestClusterResourceSnapshot(crpName, wantResourceSnapshotIndex st }, eventuallyDuration, eventuallyInterval).Should(Equal(wantResourceSnapshotIndex), "Resource snapshot index does not match") } +func validateNoClusterResourceSnapshot(crpName string) { + Consistently(func() int { + crList := &placementv1beta1.ClusterResourceSnapshotList{} + Expect(hubClient.List(ctx, crList, client.MatchingLabels{placementv1beta1.PlacementTrackingLabel: crpName})).Should(Succeed()) + return len(crList.Items) + }, consistentlyDuration, consistentlyInterval).Should(Equal(0), "Resource snapshot should not be created for External rollout strategy") +} + func createClusterStagedUpdateRunSucceed(updateRunName, crpName, resourceSnapshotIndex, strategyName string, state placementv1beta1.State) { updateRun := &placementv1beta1.ClusterStagedUpdateRun{ ObjectMeta: metav1.ObjectMeta{ @@ -2065,13 +2112,13 @@ func createClusterStagedUpdateRunSucceed(updateRunName, crpName, resourceSnapsho // createClusterStagedUpdateRunWithAutoCreatedSnapshot creates a ClusterStagedUpdateRun without specifying a // ResourceSnapshotIndex, triggering the controller to auto-create or reuse an existing resource snapshot. 
-func createClusterStagedUpdateRunWithAutoCreatedSnapshot(updateRunName, crpName, strategyName string) { +func createClusterStagedUpdateRunWithAutoCreatedSnapshot(updateRunName, crpName, strategyName string, state placementv1beta1.State) { updateRun := &placementv1beta1.ClusterStagedUpdateRun{ ObjectMeta: metav1.ObjectMeta{ Name: updateRunName, }, Spec: placementv1beta1.UpdateRunSpec{ - State: placementv1beta1.StateRun, + State: state, PlacementName: crpName, StagedUpdateStrategyName: strategyName, }, diff --git a/test/e2e/staged_updaterun_test.go b/test/e2e/staged_updaterun_test.go index a893d257f..3b8b52484 100644 --- a/test/e2e/staged_updaterun_test.go +++ b/test/e2e/staged_updaterun_test.go @@ -111,8 +111,8 @@ var _ = Describe("test RP rollout with staged update run", Label("resourceplacem It("Should not rollout any resources to member clusters as there's no update run yet", checkIfRemovedConfigMapFromAllMemberClustersConsistently) - It("Should have the latest resource snapshot", func() { - validateLatestResourceSnapshot(rpName, testNamespace, resourceSnapshotIndex1st) + It("Should NOT create the latest resource snapshot yet", func() { + validateNoResourceSnapshot(rpName, testNamespace) }) It("Should successfully schedule the rp", func() { @@ -125,7 +125,11 @@ var _ = Describe("test RP rollout with staged update run", Label("resourceplacem }) It("Should create a staged update run successfully with auto-created resource snapshot", func() { - createStagedUpdateRunWithAutoCreatedSnapshot(updateRunNames[0], testNamespace, rpName, strategyName) + createStagedUpdateRunWithAutoCreatedSnapshot(updateRunNames[0], testNamespace, rpName, strategyName, placementv1beta1.StateRun) + }) + + It("Should have the latest resource snapshot", func() { + validateLatestResourceSnapshot(rpName, testNamespace, resourceSnapshotIndex1st) }) It("Should rollout resources to member-cluster-2 only and complete stage canary", func() { @@ -172,6 +176,10 @@ var _ = Describe("test RP rollout 
with staged update run", Label("resourceplacem Consistently(rpStatusUpdatedActual, consistentlyDuration, consistentlyInterval).Should(Succeed(), "Failed to keep RP %s/%s status as expected", testNamespace, rpName) }) + It("Should create a new staged update run successfully and use the auto-created snapshot for updated resources", func() { + createStagedUpdateRunWithAutoCreatedSnapshot(updateRunNames[1], testNamespace, rpName, strategyName, placementv1beta1.StateRun) + }) + It("Should create a new latest resource snapshot", func() { rsList := &placementv1beta1.ResourceSnapshotList{} Eventually(func() error { @@ -188,10 +196,6 @@ var _ = Describe("test RP rollout with staged update run", Label("resourceplacem }, eventuallyDuration, eventuallyInterval).Should(Succeed(), "Failed get the new latest resourcensnapshot") }) - It("Should create a new staged update run successfully and use the auto-created snapshot for updated resources", func() { - createStagedUpdateRunWithAutoCreatedSnapshot(updateRunNames[1], testNamespace, rpName, strategyName) - }) - It("Should rollout resources to member-cluster-2 only and complete stage canary", func() { By("Verify that the new configmap is updated on member-cluster-2") configMapActual := configMapPlacedOnClusterActual(allMemberClusters[1], &newConfigMap) @@ -287,8 +291,8 @@ var _ = Describe("test RP rollout with staged update run", Label("resourceplacem It("Should not rollout any resources to member clusters as there's no update run yet", checkIfRemovedConfigMapFromAllMemberClustersConsistently) - It("Should have the latest resource snapshot", func() { - validateLatestResourceSnapshot(rpName, testNamespace, resourceSnapshotIndex1st) + It("Should NOT create the latest resource snapshot yet", func() { + validateNoResourceSnapshot(rpName, testNamespace) }) It("Should successfully schedule the rp", func() { @@ -301,7 +305,11 @@ var _ = Describe("test RP rollout with staged update run", Label("resourceplacem }) It("Should create a staged 
update run successfully", func() { - createStagedUpdateRunSucceed(updateRunNames[0], testNamespace, rpName, resourceSnapshotIndex1st, strategyName, placementv1beta1.StateRun) + createStagedUpdateRunWithAutoCreatedSnapshot(updateRunNames[0], testNamespace, rpName, strategyName, placementv1beta1.StateRun) + }) + + It("Should have the latest resource snapshot", func() { + validateLatestResourceSnapshot(rpName, testNamespace, resourceSnapshotIndex1st) }) It("Should rollout resources to member-cluster-2 only and complete stage canary", func() { @@ -348,6 +356,10 @@ var _ = Describe("test RP rollout with staged update run", Label("resourceplacem Consistently(rpStatusUpdatedActual, consistentlyDuration, consistentlyInterval).Should(Succeed(), "Failed to keep RP %s/%s status as expected", testNamespace, rpName) }) + It("Should create a new staged update run successfully", func() { + createStagedUpdateRunWithAutoCreatedSnapshot(updateRunNames[1], testNamespace, rpName, strategyName, placementv1beta1.StateRun) + }) + It("Should create a new latest resource snapshot", func() { rsList := &placementv1beta1.ResourceSnapshotList{} Eventually(func() error { @@ -364,10 +376,6 @@ var _ = Describe("test RP rollout with staged update run", Label("resourceplacem }, eventuallyDuration, eventuallyInterval).Should(Succeed(), "Failed get the new latest resourcensnapshot") }) - It("Should create a new staged update run successfully", func() { - createStagedUpdateRunSucceed(updateRunNames[1], testNamespace, rpName, resourceSnapshotIndex2nd, strategyName, placementv1beta1.StateRun) - }) - It("Should rollout resources to member-cluster-2 only and complete stage canary", func() { By("Verify that the new configmap is updated on member-cluster-2") configMapActual := configMapPlacedOnClusterActual(allMemberClusters[1], &newConfigMap) @@ -510,8 +518,8 @@ var _ = Describe("test RP rollout with staged update run", Label("resourceplacem It("Should not rollout any resources to member clusters as 
there's no update run yet", checkIfRemovedConfigMapFromAllMemberClustersConsistently) - It("Should have the latest resource snapshot", func() { - validateLatestResourceSnapshot(rpName, testNamespace, resourceSnapshotIndex1st) + It("Should NOT create the latest resource snapshot yet", func() { + validateNoResourceSnapshot(rpName, testNamespace) }) It("Should successfully schedule the rp", func() { @@ -524,7 +532,11 @@ var _ = Describe("test RP rollout with staged update run", Label("resourceplacem }) It("Should create a staged update run successfully", func() { - createStagedUpdateRunSucceed(updateRunNames[0], testNamespace, rpName, resourceSnapshotIndex1st, strategyName, placementv1beta1.StateRun) + createStagedUpdateRunWithAutoCreatedSnapshot(updateRunNames[0], testNamespace, rpName, strategyName, placementv1beta1.StateRun) + }) + + It("Should have the latest resource snapshot", func() { + validateLatestResourceSnapshot(rpName, testNamespace, resourceSnapshotIndex1st) }) It("Should rollout resources to member-cluster-2 only and complete stage canary", func() { @@ -712,8 +724,8 @@ var _ = Describe("test RP rollout with staged update run", Label("resourceplacem It("Should not rollout any resources to member clusters as there's no update run yet", checkIfRemovedConfigMapFromAllMemberClustersConsistently) - It("Should have the latest resource snapshot", func() { - validateLatestResourceSnapshot(rpName, testNamespace, resourceSnapshotIndex1st) + It("Should NOT create the latest resource snapshot yet", func() { + validateNoResourceSnapshot(rpName, testNamespace) }) It("Should successfully schedule the rp", func() { @@ -726,7 +738,11 @@ var _ = Describe("test RP rollout with staged update run", Label("resourceplacem }) It("Should create a namespaced staged update run successfully", func() { - createStagedUpdateRunSucceed(updateRunNames[0], testNamespace, rpName, resourceSnapshotIndex1st, strategyName, placementv1beta1.StateRun) + 
createStagedUpdateRunWithAutoCreatedSnapshot(updateRunNames[0], testNamespace, rpName, strategyName, placementv1beta1.StateRun) + }) + + It("Should have the latest resource snapshot", func() { + validateLatestResourceSnapshot(rpName, testNamespace, resourceSnapshotIndex1st) }) It("Should not rollout any resources to member clusters and complete stage canary", func() { @@ -967,8 +983,8 @@ var _ = Describe("test RP rollout with staged update run", Label("resourceplacem It("Should not rollout any resources to member clusters as there's no update run yet", checkIfRemovedConfigMapFromAllMemberClustersConsistently) - It("Should have the latest resource snapshot", func() { - validateLatestResourceSnapshot(rpName, testNamespace, resourceSnapshotIndex1st) + It("Should NOT create the latest resource snapshot yet", func() { + validateNoResourceSnapshot(rpName, testNamespace) }) It("Should successfully schedule the rp", func() { @@ -981,7 +997,11 @@ var _ = Describe("test RP rollout with staged update run", Label("resourceplacem }) It("Should create a staged update run successfully", func() { - createStagedUpdateRunSucceed(updateRunName, testNamespace, rpName, resourceSnapshotIndex1st, strategyName, placementv1beta1.StateRun) + createStagedUpdateRunWithAutoCreatedSnapshot(updateRunName, testNamespace, rpName, strategyName, placementv1beta1.StateRun) + }) + + It("Should have the latest resource snapshot", func() { + validateLatestResourceSnapshot(rpName, testNamespace, resourceSnapshotIndex1st) }) It("Should rollout resources to member-cluster-2 only and complete stage canary", func() { @@ -1071,8 +1091,8 @@ var _ = Describe("test RP rollout with staged update run", Label("resourceplacem It("Should not rollout any resources to member clusters as there's no update run yet", checkIfRemovedConfigMapFromAllMemberClustersConsistently) - It("Should have the latest resource snapshot", func() { - validateLatestResourceSnapshot(rpName, testNamespace, resourceSnapshotIndex1st) + 
It("Should NOT create the latest resource snapshot yet", func() { + validateNoResourceSnapshot(rpName, testNamespace) }) It("Should successfully schedule the rp", func() { @@ -1085,7 +1105,11 @@ var _ = Describe("test RP rollout with staged update run", Label("resourceplacem }) It("Should create a staged update run successfully", func() { - createStagedUpdateRunSucceed(updateRunName, testNamespace, rpName, resourceSnapshotIndex1st, strategyName, placementv1beta1.StateRun) + createStagedUpdateRunWithAutoCreatedSnapshot(updateRunName, testNamespace, rpName, strategyName, placementv1beta1.StateRun) + }) + + It("Should have the latest resource snapshot", func() { + validateLatestResourceSnapshot(rpName, testNamespace, resourceSnapshotIndex1st) }) It("Should report diff for member-cluster-2 only and completes stage canary", func() { @@ -1190,16 +1214,16 @@ var _ = Describe("test RP rollout with staged update run", Label("resourceplacem } }) - It("Should have the new resource snapshot but RP status should remain completed with old snapshot", func() { - validateLatestResourceSnapshot(rpName, testNamespace, resourceSnapshotIndex2nd) + It("Should NOT have a new resource snapshot and RP status should remain completed with old snapshot", func() { + validateLatestResourceSnapshot(rpName, testNamespace, resourceSnapshotIndex1st) // RP status should still show completed with old snapshot. 
rpStatusUpdatedActual := rpStatusUpdatedActual(appConfigMapIdentifiers(), allMemberClusterNames, nil, resourceSnapshotIndex1st) Consistently(rpStatusUpdatedActual, consistentlyDuration, consistentlyInterval).Should(Succeed(), "Failed to keep RP %s/%s status as expected", testNamespace, rpName) }) - It("Create a staged update run with new resourceSnapshotIndex and verify rollout happens", func() { - createStagedUpdateRunSucceed(updateRunName, testNamespace, rpName, resourceSnapshotIndex2nd, strategyName, placementv1beta1.StateRun) + It("Create a staged update run with auto-created snapshot and verify rollout happens", func() { + createStagedUpdateRunWithAutoCreatedSnapshot(updateRunName, testNamespace, rpName, strategyName, placementv1beta1.StateRun) // Verify rollout to canary cluster first. By("Verify that the new configmap is updated on member-cluster-2 during canary stage") @@ -1286,8 +1310,8 @@ var _ = Describe("test RP rollout with staged update run", Label("resourceplacem It("Should not rollout any resources to member clusters as there's no update run yet", checkIfRemovedConfigMapFromAllMemberClustersConsistently) - It("Should have the latest resource snapshot", func() { - validateLatestResourceSnapshot(rpName, testNamespace, resourceSnapshotIndex1st) + It("Should NOT create the latest resource snapshot yet", func() { + validateNoResourceSnapshot(rpName, testNamespace) }) It("Should successfully schedule the rp", func() { @@ -1300,7 +1324,11 @@ var _ = Describe("test RP rollout with staged update run", Label("resourceplacem }) It("Should create a staged update run successfully", func() { - createStagedUpdateRunSucceed(updateRunName, testNamespace, rpName, resourceSnapshotIndex1st, strategyName, placementv1beta1.StateRun) + createStagedUpdateRunWithAutoCreatedSnapshot(updateRunName, testNamespace, rpName, strategyName, placementv1beta1.StateRun) + }) + + It("Should have the latest resource snapshot", func() { + validateLatestResourceSnapshot(rpName, 
testNamespace, resourceSnapshotIndex1st) }) It("Should complete the staged update run with all 3 clusters updated in parallel", func() { @@ -1375,8 +1403,8 @@ var _ = Describe("test RP rollout with staged update run", Label("resourceplacem It("Should not rollout any resources to member clusters as there's no update run yet", checkIfRemovedConfigMapFromAllMemberClustersConsistently) - It("Should have the latest resource snapshot", func() { - validateLatestResourceSnapshot(rpName, testNamespace, resourceSnapshotIndex1st) + It("Should NOT create the latest resource snapshot yet", func() { + validateNoResourceSnapshot(rpName, testNamespace) }) It("Should successfully schedule the rp", func() { @@ -1389,7 +1417,11 @@ var _ = Describe("test RP rollout with staged update run", Label("resourceplacem }) It("Should create a staged update run successfully", func() { - createStagedUpdateRunSucceed(updateRunName, testNamespace, rpName, resourceSnapshotIndex1st, strategyName, placementv1beta1.StateRun) + createStagedUpdateRunWithAutoCreatedSnapshot(updateRunName, testNamespace, rpName, strategyName, placementv1beta1.StateRun) + }) + + It("Should have the latest resource snapshot", func() { + validateLatestResourceSnapshot(rpName, testNamespace, resourceSnapshotIndex1st) }) It("Should complete the staged update run with all 3 clusters", func() { @@ -1454,8 +1486,8 @@ var _ = Describe("test RP rollout with staged update run", Label("resourceplacem It("Should not rollout any resources to member clusters as there's no update run yet", checkIfRemovedConfigMapFromAllMemberClustersConsistently) - It("Should have the latest resource snapshot", func() { - validateLatestResourceSnapshot(rpName, testNamespace, resourceSnapshotIndex1st) + It("Should NOT create the latest resource snapshot yet", func() { + validateNoResourceSnapshot(rpName, testNamespace) }) It("Should successfully schedule the rp", func() { @@ -1469,7 +1501,11 @@ var _ = Describe("test RP rollout with staged update run", 
Label("resourceplacem It("Should create a staged update run successfully", func() { By("Creating staged update run in Initialize state") - createStagedUpdateRunSucceed(updateRunNames[0], testNamespace, rpName, resourceSnapshotIndex1st, strategyName, placementv1beta1.StateInitialize) + createStagedUpdateRunWithAutoCreatedSnapshot(updateRunNames[0], testNamespace, rpName, strategyName, placementv1beta1.StateInitialize) + }) + + It("Should have the latest resource snapshot", func() { + validateLatestResourceSnapshot(rpName, testNamespace, resourceSnapshotIndex1st) }) It("Should not start rollout as the update run is in Initialize state", func() { @@ -1642,6 +1678,14 @@ func validateLatestResourceSnapshot(rpName, namespace, wantResourceSnapshotIndex }, eventuallyDuration, eventuallyInterval).Should(Equal(wantResourceSnapshotIndex), "Resource snapshot index does not match") } +func validateNoResourceSnapshot(rpName, namespace string) { + Eventually(func() int { + rsList := &placementv1beta1.ResourceSnapshotList{} + Expect(hubClient.List(ctx, rsList, client.InNamespace(namespace), client.MatchingLabels{placementv1beta1.PlacementTrackingLabel: rpName})).Should(Succeed()) + return len(rsList.Items) + }, eventuallyDuration, eventuallyInterval).Should(Equal(0), "Resource snapshot should not be created for External rollout strategy") +} + func createStagedUpdateRunSucceed(updateRunName, namespace, rpName, resourceSnapshotIndex, strategyName string, state placementv1beta1.State) { updateRun := &placementv1beta1.StagedUpdateRun{ ObjectMeta: metav1.ObjectMeta{ @@ -1660,14 +1704,14 @@ func createStagedUpdateRunSucceed(updateRunName, namespace, rpName, resourceSnap // createStagedUpdateRunWithAutoCreatedSnapshot creates a StagedUpdateRun without specifying a // ResourceSnapshotIndex, triggering the controller to auto-create or reuse an existing resource snapshot. 
-func createStagedUpdateRunWithAutoCreatedSnapshot(updateRunName, namespace, rpName, strategyName string) { +func createStagedUpdateRunWithAutoCreatedSnapshot(updateRunName, namespace, rpName, strategyName string, state placementv1beta1.State) { updateRun := &placementv1beta1.StagedUpdateRun{ ObjectMeta: metav1.ObjectMeta{ Name: updateRunName, Namespace: namespace, }, Spec: placementv1beta1.UpdateRunSpec{ - State: placementv1beta1.StateRun, + State: state, PlacementName: rpName, StagedUpdateStrategyName: strategyName, },