From 248f28307dd5b02982ff4a37fd924d55936de4b9 Mon Sep 17 00:00:00 2001 From: Gianluca Mardente Date: Mon, 16 Feb 2026 21:48:40 +0100 Subject: [PATCH] (opt) Add HealthCheckError and Configurable Retry Logic This PR introduces a specialized HealthCheckError type to differentiate between clusterSUmmary deployment failures and functional health check failures. It also adds a configurable flag to control the retry interval when these health checks fail. Introduced a new CLI flag `--health-error-retry-time` (default: 90 seconds). This allows the controller to back off specifically on health check failures without impacting the standard reconciliation requeue logic for other errors. When a health check fails, the reconciler now uses the dedicated healthErrorRetryTime duration. --- Makefile | 2 +- cmd/main.go | 7 +++ controllers/clustersummary_controller.go | 21 ++++++++- controllers/clustersummary_deployer.go | 54 ++++++++++++++++-------- controllers/delete_checks.go | 9 ++-- go.mod | 6 +-- go.sum | 16 +++---- lib/clusterops/validate_health.go | 21 ++++++++- 8 files changed, 99 insertions(+), 37 deletions(-) diff --git a/Makefile b/Makefile index 233e6516..e5aef886 100644 --- a/Makefile +++ b/Makefile @@ -73,7 +73,7 @@ KUBECTL := $(TOOLS_BIN_DIR)/kubectl GOVULNCHECK_VERSION := "v1.1.4" GOLANGCI_LINT_VERSION := "v2.8.0" -CLUSTERCTL_VERSION := v1.12.2 +CLUSTERCTL_VERSION := v1.12.3 KUSTOMIZE_VER := v5.8.0 KUSTOMIZE_BIN := kustomize diff --git a/cmd/main.go b/cmd/main.go index 79e4100d..c012cd81 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -88,6 +88,7 @@ var ( webhookPort int syncPeriod time.Duration conflictRetryTime time.Duration + healthErrorRetryTime time.Duration version string healthAddr string profilerAddress string @@ -287,6 +288,11 @@ func initFlags(fs *pflag.FlagSet) { fmt.Sprintf("The minimum interval at which watched ClusterProfile with conflicts are retried. Defaul: %d seconds", defaultConflictRetryTime)) + const defaultHealthErrorRetryTime = 60 + fs.DurationVar(&healthErrorRetryTime, "health-error-retry-time", defaultHealthErrorRetryTime*time.Second, + fmt.Sprintf("The minimum interval at which health check failures are retried. Default: %d seconds", + defaultHealthErrorRetryTime)) + // AutoDeployDependencies enables automatic deployment of prerequisite profiles. // // Profile instances can specify dependencies on other profiles using the @@ -518,6 +524,7 @@ func getClusterSummaryReconciler(ctx context.Context, mgr manager.Manager) *cont PolicyMux: sync.Mutex{}, ConcurrentReconciles: concurrentReconciles, ConflictRetryTime: conflictRetryTime, + HealthErrorRetryTime: healthErrorRetryTime, Logger: ctrl.Log.WithName("clustersummaryreconciler"), } } diff --git a/controllers/clustersummary_controller.go b/controllers/clustersummary_controller.go index 6f2ad348..e26d61eb 100644 --- a/controllers/clustersummary_controller.go +++ b/controllers/clustersummary_controller.go @@ -113,8 +113,9 @@ type ClusterSummaryReconciler struct { ReferenceMap map[corev1.ObjectReference]*libsveltosset.Set // key: Referenced object; value: set of all ClusterSummaries referencing the resource ClusterMap map[corev1.ObjectReference]*libsveltosset.Set // key: Sveltos/Cluster; value: set of all ClusterSummaries for that Cluster - ConflictRetryTime time.Duration - ctrl controller.Controller + ConflictRetryTime time.Duration + HealthErrorRetryTime time.Duration + ctrl controller.Controller eventRecorder events.EventRecorder @@ -463,6 +464,17 @@ func (r *ClusterSummaryReconciler) proceedDeployingClusterSummary(ctx context.Co return reconcile.Result{Requeue: true, RequeueAfter: r.ConflictRetryTime}, nil } + var healthCheckError *clusterops.HealthCheckError + ok = errors.As(err, &healthCheckError) + if ok { + logger.V(logs.LogInfo).Info("failed to deploy health check not passing", + "feature", healthCheckError.FeatureID, + "checkName", healthCheckError.CheckName, + "reason", healthCheckError.InternalErr.Error(), + "requeueAfter", r.HealthErrorRetryTime.String()) + return reconcile.Result{Requeue: true, RequeueAfter: r.HealthErrorRetryTime}, nil + } + requeueAfter := normalRequeueAfter maxFailures := r.getMaxConsecutiveFailures(clusterSummaryScope) @@ -1764,6 +1776,11 @@ func (r *ClusterSummaryReconciler) processUndeployError(clusterSummaryScope *sco return reconcile.Result{Requeue: true, RequeueAfter: deleteHandOverRequeueAfter}, nil } + var healthCheckError *clusterops.HealthCheckError + if errors.As(undeployError, &healthCheckError) { + return reconcile.Result{Requeue: true, RequeueAfter: r.HealthErrorRetryTime}, nil + } + return reconcile.Result{Requeue: true, RequeueAfter: deleteRequeueAfter}, nil } diff --git a/controllers/clustersummary_deployer.go b/controllers/clustersummary_deployer.go index a813571b..377aec43 100644 --- a/controllers/clustersummary_deployer.go +++ b/controllers/clustersummary_deployer.go @@ -30,6 +30,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client" configv1beta1 "github.com/projectsveltos/addon-controller/api/v1beta1" + "github.com/projectsveltos/addon-controller/lib/clusterops" "github.com/projectsveltos/addon-controller/pkg/scope" libsveltosv1beta1 "github.com/projectsveltos/libsveltos/api/v1beta1" "github.com/projectsveltos/libsveltos/lib/clusterproxy" @@ -170,24 +171,9 @@ func (r *ClusterSummaryReconciler) proceedDeployingFeature(ctx context.Context, r.updateFeatureStatus(clusterSummaryScope, f.id, deployerStatus, currentHash, deployerError, logger) if deployerError != nil { - // Check if error is a NonRetriableError type - var nonRetriableError *configv1beta1.NonRetriableError - if errors.As(deployerError, &nonRetriableError) { - nonRetriableStatus := libsveltosv1beta1.FeatureStatusFailedNonRetriable - r.updateFeatureStatus(clusterSummaryScope, f.id, &nonRetriableStatus, currentHash, deployerError, logger) - return nil - } - var templateError *configv1beta1.TemplateInstantiationError - if errors.As(deployerError, &templateError) { - nonRetriableStatus := libsveltosv1beta1.FeatureStatusFailedNonRetriable - r.updateFeatureStatus(clusterSummaryScope, f.id, &nonRetriableStatus, currentHash, deployerError, logger) - return nil - } - if r.maxNumberOfConsecutiveFailureReached(clusterSummaryScope, f, logger) { - nonRetriableStatus := libsveltosv1beta1.FeatureStatusFailedNonRetriable - resultError := errors.New("the maximum number of consecutive errors has been reached") - r.updateFeatureStatus(clusterSummaryScope, f.id, &nonRetriableStatus, currentHash, resultError, logger) - return nil + shouldReturn, err := r.handleDeployerError(deployerError, clusterSummaryScope, f, currentHash, logger) + if shouldReturn { + return err } } if *deployerStatus == libsveltosv1beta1.FeatureStatusProvisioning { @@ -224,6 +210,38 @@ func (r *ClusterSummaryReconciler) proceedDeployingFeature(ctx context.Context, return fmt.Errorf("request is queued") } +func (r *ClusterSummaryReconciler) handleDeployerError(deployerError error, clusterSummaryScope *scope.ClusterSummaryScope, + f feature, currentHash []byte, logger logr.Logger) (bool, error) { + + // Check if error is a NonRetriableError type + var nonRetriableError *configv1beta1.NonRetriableError + if errors.As(deployerError, &nonRetriableError) { + nonRetriableStatus := libsveltosv1beta1.FeatureStatusFailedNonRetriable + r.updateFeatureStatus(clusterSummaryScope, f.id, &nonRetriableStatus, currentHash, deployerError, logger) + return true, nil + } + var templateError *configv1beta1.TemplateInstantiationError + if errors.As(deployerError, &templateError) { + nonRetriableStatus := libsveltosv1beta1.FeatureStatusFailedNonRetriable + r.updateFeatureStatus(clusterSummaryScope, f.id, &nonRetriableStatus, currentHash, deployerError, logger) + return true, nil + } + var healthCheckError *clusterops.HealthCheckError + if errors.As(deployerError, &healthCheckError) { + retriableStatus := libsveltosv1beta1.FeatureStatusFailed + r.updateFeatureStatus(clusterSummaryScope, f.id, &retriableStatus, currentHash, deployerError, logger) + return true, healthCheckError + } + if r.maxNumberOfConsecutiveFailureReached(clusterSummaryScope, f, logger) { + nonRetriableStatus := libsveltosv1beta1.FeatureStatusFailedNonRetriable + resultError := errors.New("the maximum number of consecutive errors has been reached") + r.updateFeatureStatus(clusterSummaryScope, f.id, &nonRetriableStatus, currentHash, resultError, logger) + return true, nil + } + + return false, deployerError +} + func (r *ClusterSummaryReconciler) proceedDeployingFeatureInPullMode(ctx context.Context, clusterSummaryScope *scope.ClusterSummaryScope, f feature, isConfigSame bool, currentHash []byte, logger logr.Logger) error { diff --git a/controllers/delete_checks.go b/controllers/delete_checks.go index 41fecfad..6440a99a 100644 --- a/controllers/delete_checks.go +++ b/controllers/delete_checks.go @@ -66,16 +66,17 @@ func validateDeleteChecks(ctx context.Context, clusterSummary *configv1beta1.Clu adminNamespace, adminName := getClusterSummaryAdmin(clusterSummary) cacheMgr := clustercache.GetManager() - remoteRestConfig, err := cacheMgr.GetKubernetesRestConfig(ctx, getManagementClusterClient(), clusterSummary.Spec.ClusterNamespace, - clusterSummary.Spec.ClusterName, adminNamespace, adminName, clusterSummary.Spec.ClusterType, logger) + remoteRestConfig, err := cacheMgr.GetKubernetesRestConfig(ctx, getManagementClusterClient(), + clusterSummary.Spec.ClusterNamespace, clusterSummary.Spec.ClusterName, adminNamespace, + adminName, clusterSummary.Spec.ClusterType, logger) if err != nil { logger.V(logs.LogDebug).Error(err, "failed to get cluster rest.Config") return err } logger.V(logs.LogDebug).Info("validate delete checks") - err = clusterops.ValidateHealthPolicies(ctx, remoteRestConfig, clusterSummary.Spec.ClusterProfileSpec.PreDeleteChecks, - featureID, true, logger) + err = clusterops.ValidateHealthPolicies(ctx, remoteRestConfig, + clusterSummary.Spec.ClusterProfileSpec.PreDeleteChecks, featureID, true, logger) if err != nil { logger.V(logs.LogDebug).Error(err, "delete check failed") return err diff --git a/go.mod b/go.mod index 563f2a77..4a6a9578 100644 --- a/go.mod +++ b/go.mod @@ -11,14 +11,14 @@ require ( github.com/fluxcd/pkg/apis/meta v1.25.0 github.com/fluxcd/pkg/http/fetch v0.22.0 github.com/fluxcd/pkg/tar v0.17.0 - github.com/fluxcd/source-controller/api v1.7.4 + github.com/fluxcd/source-controller/api v1.8.0 github.com/gdexlab/go-render v1.0.1 github.com/go-logr/logr v1.4.3 github.com/hexops/gotextdiff v1.0.3 github.com/onsi/ginkgo/v2 v2.28.1 github.com/onsi/gomega v1.39.1 github.com/pkg/errors v0.9.1 - github.com/projectsveltos/libsveltos v1.5.0 + github.com/projectsveltos/libsveltos v1.5.1 github.com/prometheus/client_golang v1.23.2 github.com/robfig/cron v1.2.0 github.com/spf13/pflag v1.0.10 @@ -33,7 +33,7 @@ require ( k8s.io/component-base v0.35.1 k8s.io/klog/v2 v2.130.1 k8s.io/utils v0.0.0-20260210185600-b8788abfbbc2 - sigs.k8s.io/cluster-api v1.12.2 + sigs.k8s.io/cluster-api v1.12.3 sigs.k8s.io/controller-runtime v0.23.1 sigs.k8s.io/kustomize/api v0.21.1 sigs.k8s.io/kustomize/kyaml v0.21.1 diff --git a/go.sum b/go.sum index ff1e56f6..2cc08b9a 100644 --- a/go.sum +++ b/go.sum @@ -50,8 +50,8 @@ github.com/containerd/platforms v0.2.1 h1:zvwtM3rz2YHPQsF2CHYM8+KtB5dvhISiXh5ZpS github.com/containerd/platforms v0.2.1/go.mod h1:XHCb+2/hzowdiut9rkudds9bE5yJ7npe7dG/wG+uFPw= github.com/coredns/caddy v1.1.1 h1:2eYKZT7i6yxIfGP3qLJoJ7HAsDJqYB+X68g4NYjSrE0= github.com/coredns/caddy v1.1.1/go.mod h1:A6ntJQlAWuQfFlsd9hvigKbo2WS0VUs2l1e2F+BawD4= -github.com/coredns/corefile-migration v1.0.29 h1:g4cPYMXXDDs9uLE2gFYrJaPBuUAR07eEMGyh9JBE13w= -github.com/coredns/corefile-migration v1.0.29/go.mod h1:56DPqONc3njpVPsdilEnfijCwNGC3/kTJLl7i7SPavY= +github.com/coredns/corefile-migration v1.0.30 h1:ljZNPGgna+4yKv81gfkvkgLEWdtz0NjBR1glaiPI140= +github.com/coredns/corefile-migration v1.0.30/go.mod h1:56DPqONc3njpVPsdilEnfijCwNGC3/kTJLl7i7SPavY= github.com/coreos/go-systemd v0.0.0-20191104093116-d3cd4ed1dbcf h1:iW4rZ826su+pqaw19uhpSCzhj44qo35pNgKFGqzDKkU= github.com/coreos/go-systemd/v22 v22.5.0 h1:RrqgGjYQKalulkV8NGVIfkXQf6YYmOyiJKk8iXXhfZs= github.com/coreos/go-systemd/v22 v22.5.0/go.mod h1:Y58oyj3AT4RCenI/lSvhwexgC+NSVTIJ3seZv2GcEnc= @@ -104,8 +104,8 @@ github.com/fluxcd/pkg/tar v0.17.0 h1:uNxbFXy8ly8C7fJ8D7w3rjTNJFrb4Hp1aY/30XkfvxY github.com/fluxcd/pkg/tar v0.17.0/go.mod h1:b1xyIRYDD0ket4SV5u0UXYv+ZdN/O/HmIO5jZQdHQls= github.com/fluxcd/pkg/testserver v0.13.0 h1:xEpBcEYtD7bwvZ+i0ZmChxKkDo/wfQEV3xmnzVybSSg= github.com/fluxcd/pkg/testserver v0.13.0/go.mod h1:akRYv3FLQUsme15na9ihECRG6hBuqni4XEY9W8kzs8E= -github.com/fluxcd/source-controller/api v1.7.4 h1:+EOVnRA9LmLxOx7J273l7IOEU39m+Slt/nQGBy69ygs= -github.com/fluxcd/source-controller/api v1.7.4/go.mod h1:ruf49LEgZRBfcP+eshl2n9SX1MfHayCcViAIGnZcaDY= +github.com/fluxcd/source-controller/api v1.8.0 h1:ndrYmcv6ZMcdQHFSUkOrFVDO7h16SfDBSw/DOqf/LPo= +github.com/fluxcd/source-controller/api v1.8.0/go.mod h1:1O7+sMbqc1+3tPvjmtgFz+bASTl794Y9SxpebHDDSGA= github.com/foxcpp/go-mockdns v1.2.0 h1:omK3OrHRD1IWJz1FuFBCFquhXslXoF17OvBS6JPzZF0= github.com/foxcpp/go-mockdns v1.2.0/go.mod h1:IhLeSFGed3mJIAXPH2aiRQB+kqz7oqu8ld2qVbOu7Wk= github.com/frankban/quicktest v1.14.6 h1:7Xjx+VpznH+oBnejlPUj8oUpdxnVs4f8XU8WnHkI4W8= @@ -279,8 +279,8 @@ github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRI github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/poy/onpar v1.1.2 h1:QaNrNiZx0+Nar5dLgTVp5mXkyoVFIbepjyEoGSnhbAY= github.com/poy/onpar v1.1.2/go.mod h1:6X8FLNoxyr9kkmnlqpK6LSoiOtrO6MICtWwEuWkLjzg= -github.com/projectsveltos/libsveltos v1.5.0 h1:XUxdDMYJAujLntep9u1mgKy4ArqhsE7nZhIbZEnC0ZM= -github.com/projectsveltos/libsveltos v1.5.0/go.mod h1:AM2uJ4uasH6PnK8aMeLpc1p8WW7J+uzBA1Nx2g0r5Ug= +github.com/projectsveltos/libsveltos v1.5.1 h1:PDTl+JoT5uVREldz8aXITzdEj3YRo9EXkoyrB0NNeUA= +github.com/projectsveltos/libsveltos v1.5.1/go.mod h1:AM2uJ4uasH6PnK8aMeLpc1p8WW7J+uzBA1Nx2g0r5Ug= github.com/projectsveltos/lua-utils/glua-json v0.0.0-20251212200258-2b3cdcb7c0f5 h1:khnc+994UszxZYu69J+R5FKiLA/Nk1JQj0EYAkwTWz0= github.com/projectsveltos/lua-utils/glua-json v0.0.0-20251212200258-2b3cdcb7c0f5/go.mod h1:yVL8KQFa9tmcxgwl9nwIMtKgtmIVC1zaFRSCfOwYvPY= github.com/projectsveltos/lua-utils/glua-runes v0.0.0-20251212200258-2b3cdcb7c0f5 h1:YbsebwRwTRhV8QacvEAdFqxcxHdeu7JTVtsBovbkgos= @@ -521,8 +521,8 @@ oras.land/oras-go/v2 v2.6.0 h1:X4ELRsiGkrbeox69+9tzTu492FMUu7zJQW6eJU+I2oc= oras.land/oras-go/v2 v2.6.0/go.mod h1:magiQDfG6H1O9APp+rOsvCPcW1GD2MM7vgnKY0Y+u1o= sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.31.2 h1:jpcvIRr3GLoUoEKRkHKSmGjxb6lWwrBlJsXc+eUYQHM= sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.31.2/go.mod h1:Ve9uj1L+deCXFrPOk1LpFXqTg7LCFzFso6PA48q/XZw= -sigs.k8s.io/cluster-api v1.12.2 h1:+b+M2IygfvFZJq7bsaloNakimMEVNf81zkGR1IiuxXs= -sigs.k8s.io/cluster-api v1.12.2/go.mod h1:2XuF/dmN3c/1VITb6DB44N5+Ecvsvd5KOWqrY9Q53nU= +sigs.k8s.io/cluster-api v1.12.3 h1:cuOl3fWXhlXFuQcyIH4C8i3ns8rLhtcnK+x00MVdKBs= +sigs.k8s.io/cluster-api v1.12.3/go.mod h1:EAiTJtf/8M5eBetPwumi6t8DJJ55Ln6Fkvh2OAa7PD4= sigs.k8s.io/controller-runtime v0.23.1 h1:TjJSM80Nf43Mg21+RCy3J70aj/W6KyvDtOlpKf+PupE= sigs.k8s.io/controller-runtime v0.23.1/go.mod h1:B6COOxKptp+YaUT5q4l6LqUJTRpizbgf9KSRNdQGns0= sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730 h1:IpInykpT6ceI+QxKBbEflcR5EXP7sU1kvOlxwZh5txg= diff --git a/lib/clusterops/validate_health.go b/lib/clusterops/validate_health.go index ab9c89bf..a85fcbfd 100644 --- a/lib/clusterops/validate_health.go +++ b/lib/clusterops/validate_health.go @@ -43,6 +43,21 @@ type healthStatus struct { Message string `json:"message"` } +type HealthCheckError struct { + FeatureID libsveltosv1beta1.FeatureID + CheckName string + InternalErr error +} + +func (e *HealthCheckError) Error() string { + return fmt.Sprintf("health check '%s' for feature %s failed: %v", + e.CheckName, e.FeatureID, e.InternalErr) +} + +func (e *HealthCheckError) Unwrap() error { + return e.InternalErr +} + // ValidateHealthPolicies runs all validateDeployment checks registered for the feature (Helm/Kustomize/Resources) func ValidateHealthPolicies(ctx context.Context, remoteConfig *rest.Config, validateHealths []libsveltosv1beta1.ValidateHealth, featureID libsveltosv1beta1.FeatureID, isDelete bool, logger logr.Logger) error { @@ -61,7 +76,11 @@ func ValidateHealthPolicies(ctx context.Context, remoteConfig *rest.Config, vali if err := validateHealthPolicy(ctx, remoteConfig, check, isDelete, logger); err != nil { logger.V(logs.LogInfo).Info(fmt.Sprintf("failed to validate check: %s", err)) - return err + return &HealthCheckError{ + FeatureID: featureID, + CheckName: check.Name, + InternalErr: err, + } } }