Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ KUBECTL := $(TOOLS_BIN_DIR)/kubectl

GOVULNCHECK_VERSION := "v1.1.4"
GOLANGCI_LINT_VERSION := "v2.8.0"
CLUSTERCTL_VERSION := v1.12.2
CLUSTERCTL_VERSION := v1.12.3

KUSTOMIZE_VER := v5.8.0
KUSTOMIZE_BIN := kustomize
Expand Down
7 changes: 7 additions & 0 deletions cmd/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,7 @@ var (
webhookPort int
syncPeriod time.Duration
conflictRetryTime time.Duration
healthErrorRetryTime time.Duration
version string
healthAddr string
profilerAddress string
Expand Down Expand Up @@ -287,6 +288,11 @@ func initFlags(fs *pflag.FlagSet) {
fmt.Sprintf("The minimum interval at which watched ClusterProfile with conflicts are retried. Defaul: %d seconds",
defaultConflictRetryTime))

const defaultHealthErrorRetryTime = 60
fs.DurationVar(&healthErrorRetryTime, "health-error-retry-time", defaultHealthErrorRetryTime*time.Second,
fmt.Sprintf("The minimum interval at which health check failures are retried. Default: %d seconds",
defaultHealthErrorRetryTime))

// AutoDeployDependencies enables automatic deployment of prerequisite profiles.
//
// Profile instances can specify dependencies on other profiles using the
Expand Down Expand Up @@ -518,6 +524,7 @@ func getClusterSummaryReconciler(ctx context.Context, mgr manager.Manager) *cont
PolicyMux: sync.Mutex{},
ConcurrentReconciles: concurrentReconciles,
ConflictRetryTime: conflictRetryTime,
HealthErrorRetryTime: healthErrorRetryTime,
Logger: ctrl.Log.WithName("clustersummaryreconciler"),
}
}
Expand Down
21 changes: 19 additions & 2 deletions controllers/clustersummary_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -113,8 +113,9 @@ type ClusterSummaryReconciler struct {
ReferenceMap map[corev1.ObjectReference]*libsveltosset.Set // key: Referenced object; value: set of all ClusterSummaries referencing the resource
ClusterMap map[corev1.ObjectReference]*libsveltosset.Set // key: Sveltos/Cluster; value: set of all ClusterSummaries for that Cluster

ConflictRetryTime time.Duration
ctrl controller.Controller
ConflictRetryTime time.Duration
HealthErrorRetryTime time.Duration
ctrl controller.Controller

eventRecorder events.EventRecorder

Expand Down Expand Up @@ -463,6 +464,17 @@ func (r *ClusterSummaryReconciler) proceedDeployingClusterSummary(ctx context.Co
return reconcile.Result{Requeue: true, RequeueAfter: r.ConflictRetryTime}, nil
}

var healthCheckError *clusterops.HealthCheckError
ok = errors.As(err, &healthCheckError)
if ok {
logger.V(logs.LogInfo).Info("failed to deploy health check not passing",
"feature", healthCheckError.FeatureID,
"checkName", healthCheckError.CheckName,
"reason", healthCheckError.InternalErr.Error(),
"requeueAfter", r.HealthErrorRetryTime.String())
return reconcile.Result{Requeue: true, RequeueAfter: r.HealthErrorRetryTime}, nil
}

requeueAfter := normalRequeueAfter
maxFailures := r.getMaxConsecutiveFailures(clusterSummaryScope)

Expand Down Expand Up @@ -1764,6 +1776,11 @@ func (r *ClusterSummaryReconciler) processUndeployError(clusterSummaryScope *sco
return reconcile.Result{Requeue: true, RequeueAfter: deleteHandOverRequeueAfter}, nil
}

var healthCheckError *clusterops.HealthCheckError
if errors.As(undeployError, &healthCheckError) {
return reconcile.Result{Requeue: true, RequeueAfter: r.HealthErrorRetryTime}, nil
}

return reconcile.Result{Requeue: true, RequeueAfter: deleteRequeueAfter}, nil
}

Expand Down
54 changes: 36 additions & 18 deletions controllers/clustersummary_deployer.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ import (
"sigs.k8s.io/controller-runtime/pkg/client"

configv1beta1 "github.com/projectsveltos/addon-controller/api/v1beta1"
"github.com/projectsveltos/addon-controller/lib/clusterops"
"github.com/projectsveltos/addon-controller/pkg/scope"
libsveltosv1beta1 "github.com/projectsveltos/libsveltos/api/v1beta1"
"github.com/projectsveltos/libsveltos/lib/clusterproxy"
Expand Down Expand Up @@ -170,24 +171,9 @@ func (r *ClusterSummaryReconciler) proceedDeployingFeature(ctx context.Context,

r.updateFeatureStatus(clusterSummaryScope, f.id, deployerStatus, currentHash, deployerError, logger)
if deployerError != nil {
// Check if error is a NonRetriableError type
var nonRetriableError *configv1beta1.NonRetriableError
if errors.As(deployerError, &nonRetriableError) {
nonRetriableStatus := libsveltosv1beta1.FeatureStatusFailedNonRetriable
r.updateFeatureStatus(clusterSummaryScope, f.id, &nonRetriableStatus, currentHash, deployerError, logger)
return nil
}
var templateError *configv1beta1.TemplateInstantiationError
if errors.As(deployerError, &templateError) {
nonRetriableStatus := libsveltosv1beta1.FeatureStatusFailedNonRetriable
r.updateFeatureStatus(clusterSummaryScope, f.id, &nonRetriableStatus, currentHash, deployerError, logger)
return nil
}
if r.maxNumberOfConsecutiveFailureReached(clusterSummaryScope, f, logger) {
nonRetriableStatus := libsveltosv1beta1.FeatureStatusFailedNonRetriable
resultError := errors.New("the maximum number of consecutive errors has been reached")
r.updateFeatureStatus(clusterSummaryScope, f.id, &nonRetriableStatus, currentHash, resultError, logger)
return nil
shouldReturn, err := r.handleDeployerError(deployerError, clusterSummaryScope, f, currentHash, logger)
if shouldReturn {
return err
}
}
if *deployerStatus == libsveltosv1beta1.FeatureStatusProvisioning {
Expand Down Expand Up @@ -224,6 +210,38 @@ func (r *ClusterSummaryReconciler) proceedDeployingFeature(ctx context.Context,
return fmt.Errorf("request is queued")
}

func (r *ClusterSummaryReconciler) handleDeployerError(deployerError error, clusterSummaryScope *scope.ClusterSummaryScope,
f feature, currentHash []byte, logger logr.Logger) (bool, error) {

// Check if error is a NonRetriableError type
var nonRetriableError *configv1beta1.NonRetriableError
if errors.As(deployerError, &nonRetriableError) {
nonRetriableStatus := libsveltosv1beta1.FeatureStatusFailedNonRetriable
r.updateFeatureStatus(clusterSummaryScope, f.id, &nonRetriableStatus, currentHash, deployerError, logger)
return true, nil
}
var templateError *configv1beta1.TemplateInstantiationError
if errors.As(deployerError, &templateError) {
nonRetriableStatus := libsveltosv1beta1.FeatureStatusFailedNonRetriable
r.updateFeatureStatus(clusterSummaryScope, f.id, &nonRetriableStatus, currentHash, deployerError, logger)
return true, nil
}
var healthCheckError *clusterops.HealthCheckError
if errors.As(deployerError, &healthCheckError) {
retriableStatus := libsveltosv1beta1.FeatureStatusFailed
r.updateFeatureStatus(clusterSummaryScope, f.id, &retriableStatus, currentHash, deployerError, logger)
return true, healthCheckError
}
if r.maxNumberOfConsecutiveFailureReached(clusterSummaryScope, f, logger) {
nonRetriableStatus := libsveltosv1beta1.FeatureStatusFailedNonRetriable
resultError := errors.New("the maximum number of consecutive errors has been reached")
r.updateFeatureStatus(clusterSummaryScope, f.id, &nonRetriableStatus, currentHash, resultError, logger)
return true, nil
}

return false, deployerError
}

func (r *ClusterSummaryReconciler) proceedDeployingFeatureInPullMode(ctx context.Context,
clusterSummaryScope *scope.ClusterSummaryScope, f feature, isConfigSame bool, currentHash []byte,
logger logr.Logger) error {
Expand Down
9 changes: 5 additions & 4 deletions controllers/delete_checks.go
Original file line number Diff line number Diff line change
Expand Up @@ -66,16 +66,17 @@ func validateDeleteChecks(ctx context.Context, clusterSummary *configv1beta1.Clu

adminNamespace, adminName := getClusterSummaryAdmin(clusterSummary)
cacheMgr := clustercache.GetManager()
remoteRestConfig, err := cacheMgr.GetKubernetesRestConfig(ctx, getManagementClusterClient(), clusterSummary.Spec.ClusterNamespace,
clusterSummary.Spec.ClusterName, adminNamespace, adminName, clusterSummary.Spec.ClusterType, logger)
remoteRestConfig, err := cacheMgr.GetKubernetesRestConfig(ctx, getManagementClusterClient(),
clusterSummary.Spec.ClusterNamespace, clusterSummary.Spec.ClusterName, adminNamespace,
adminName, clusterSummary.Spec.ClusterType, logger)
if err != nil {
logger.V(logs.LogDebug).Error(err, "failed to get cluster rest.Config")
return err
}

logger.V(logs.LogDebug).Info("validate delete checks")
err = clusterops.ValidateHealthPolicies(ctx, remoteRestConfig, clusterSummary.Spec.ClusterProfileSpec.PreDeleteChecks,
featureID, true, logger)
err = clusterops.ValidateHealthPolicies(ctx, remoteRestConfig,
clusterSummary.Spec.ClusterProfileSpec.PreDeleteChecks, featureID, true, logger)
if err != nil {
logger.V(logs.LogDebug).Error(err, "delete check failed")
return err
Expand Down
6 changes: 3 additions & 3 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -11,14 +11,14 @@ require (
github.com/fluxcd/pkg/apis/meta v1.25.0
github.com/fluxcd/pkg/http/fetch v0.22.0
github.com/fluxcd/pkg/tar v0.17.0
github.com/fluxcd/source-controller/api v1.7.4
github.com/fluxcd/source-controller/api v1.8.0
github.com/gdexlab/go-render v1.0.1
github.com/go-logr/logr v1.4.3
github.com/hexops/gotextdiff v1.0.3
github.com/onsi/ginkgo/v2 v2.28.1
github.com/onsi/gomega v1.39.1
github.com/pkg/errors v0.9.1
github.com/projectsveltos/libsveltos v1.5.0
github.com/projectsveltos/libsveltos v1.5.1
github.com/prometheus/client_golang v1.23.2
github.com/robfig/cron v1.2.0
github.com/spf13/pflag v1.0.10
Expand All @@ -33,7 +33,7 @@ require (
k8s.io/component-base v0.35.1
k8s.io/klog/v2 v2.130.1
k8s.io/utils v0.0.0-20260210185600-b8788abfbbc2
sigs.k8s.io/cluster-api v1.12.2
sigs.k8s.io/cluster-api v1.12.3
sigs.k8s.io/controller-runtime v0.23.1
sigs.k8s.io/kustomize/api v0.21.1
sigs.k8s.io/kustomize/kyaml v0.21.1
Expand Down
16 changes: 8 additions & 8 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -50,8 +50,8 @@ github.com/containerd/platforms v0.2.1 h1:zvwtM3rz2YHPQsF2CHYM8+KtB5dvhISiXh5ZpS
github.com/containerd/platforms v0.2.1/go.mod h1:XHCb+2/hzowdiut9rkudds9bE5yJ7npe7dG/wG+uFPw=
github.com/coredns/caddy v1.1.1 h1:2eYKZT7i6yxIfGP3qLJoJ7HAsDJqYB+X68g4NYjSrE0=
github.com/coredns/caddy v1.1.1/go.mod h1:A6ntJQlAWuQfFlsd9hvigKbo2WS0VUs2l1e2F+BawD4=
github.com/coredns/corefile-migration v1.0.29 h1:g4cPYMXXDDs9uLE2gFYrJaPBuUAR07eEMGyh9JBE13w=
github.com/coredns/corefile-migration v1.0.29/go.mod h1:56DPqONc3njpVPsdilEnfijCwNGC3/kTJLl7i7SPavY=
github.com/coredns/corefile-migration v1.0.30 h1:ljZNPGgna+4yKv81gfkvkgLEWdtz0NjBR1glaiPI140=
github.com/coredns/corefile-migration v1.0.30/go.mod h1:56DPqONc3njpVPsdilEnfijCwNGC3/kTJLl7i7SPavY=
github.com/coreos/go-systemd v0.0.0-20191104093116-d3cd4ed1dbcf h1:iW4rZ826su+pqaw19uhpSCzhj44qo35pNgKFGqzDKkU=
github.com/coreos/go-systemd/v22 v22.5.0 h1:RrqgGjYQKalulkV8NGVIfkXQf6YYmOyiJKk8iXXhfZs=
github.com/coreos/go-systemd/v22 v22.5.0/go.mod h1:Y58oyj3AT4RCenI/lSvhwexgC+NSVTIJ3seZv2GcEnc=
Expand Down Expand Up @@ -104,8 +104,8 @@ github.com/fluxcd/pkg/tar v0.17.0 h1:uNxbFXy8ly8C7fJ8D7w3rjTNJFrb4Hp1aY/30XkfvxY
github.com/fluxcd/pkg/tar v0.17.0/go.mod h1:b1xyIRYDD0ket4SV5u0UXYv+ZdN/O/HmIO5jZQdHQls=
github.com/fluxcd/pkg/testserver v0.13.0 h1:xEpBcEYtD7bwvZ+i0ZmChxKkDo/wfQEV3xmnzVybSSg=
github.com/fluxcd/pkg/testserver v0.13.0/go.mod h1:akRYv3FLQUsme15na9ihECRG6hBuqni4XEY9W8kzs8E=
github.com/fluxcd/source-controller/api v1.7.4 h1:+EOVnRA9LmLxOx7J273l7IOEU39m+Slt/nQGBy69ygs=
github.com/fluxcd/source-controller/api v1.7.4/go.mod h1:ruf49LEgZRBfcP+eshl2n9SX1MfHayCcViAIGnZcaDY=
github.com/fluxcd/source-controller/api v1.8.0 h1:ndrYmcv6ZMcdQHFSUkOrFVDO7h16SfDBSw/DOqf/LPo=
github.com/fluxcd/source-controller/api v1.8.0/go.mod h1:1O7+sMbqc1+3tPvjmtgFz+bASTl794Y9SxpebHDDSGA=
github.com/foxcpp/go-mockdns v1.2.0 h1:omK3OrHRD1IWJz1FuFBCFquhXslXoF17OvBS6JPzZF0=
github.com/foxcpp/go-mockdns v1.2.0/go.mod h1:IhLeSFGed3mJIAXPH2aiRQB+kqz7oqu8ld2qVbOu7Wk=
github.com/frankban/quicktest v1.14.6 h1:7Xjx+VpznH+oBnejlPUj8oUpdxnVs4f8XU8WnHkI4W8=
Expand Down Expand Up @@ -279,8 +279,8 @@ github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRI
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/poy/onpar v1.1.2 h1:QaNrNiZx0+Nar5dLgTVp5mXkyoVFIbepjyEoGSnhbAY=
github.com/poy/onpar v1.1.2/go.mod h1:6X8FLNoxyr9kkmnlqpK6LSoiOtrO6MICtWwEuWkLjzg=
github.com/projectsveltos/libsveltos v1.5.0 h1:XUxdDMYJAujLntep9u1mgKy4ArqhsE7nZhIbZEnC0ZM=
github.com/projectsveltos/libsveltos v1.5.0/go.mod h1:AM2uJ4uasH6PnK8aMeLpc1p8WW7J+uzBA1Nx2g0r5Ug=
github.com/projectsveltos/libsveltos v1.5.1 h1:PDTl+JoT5uVREldz8aXITzdEj3YRo9EXkoyrB0NNeUA=
github.com/projectsveltos/libsveltos v1.5.1/go.mod h1:AM2uJ4uasH6PnK8aMeLpc1p8WW7J+uzBA1Nx2g0r5Ug=
github.com/projectsveltos/lua-utils/glua-json v0.0.0-20251212200258-2b3cdcb7c0f5 h1:khnc+994UszxZYu69J+R5FKiLA/Nk1JQj0EYAkwTWz0=
github.com/projectsveltos/lua-utils/glua-json v0.0.0-20251212200258-2b3cdcb7c0f5/go.mod h1:yVL8KQFa9tmcxgwl9nwIMtKgtmIVC1zaFRSCfOwYvPY=
github.com/projectsveltos/lua-utils/glua-runes v0.0.0-20251212200258-2b3cdcb7c0f5 h1:YbsebwRwTRhV8QacvEAdFqxcxHdeu7JTVtsBovbkgos=
Expand Down Expand Up @@ -521,8 +521,8 @@ oras.land/oras-go/v2 v2.6.0 h1:X4ELRsiGkrbeox69+9tzTu492FMUu7zJQW6eJU+I2oc=
oras.land/oras-go/v2 v2.6.0/go.mod h1:magiQDfG6H1O9APp+rOsvCPcW1GD2MM7vgnKY0Y+u1o=
sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.31.2 h1:jpcvIRr3GLoUoEKRkHKSmGjxb6lWwrBlJsXc+eUYQHM=
sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.31.2/go.mod h1:Ve9uj1L+deCXFrPOk1LpFXqTg7LCFzFso6PA48q/XZw=
sigs.k8s.io/cluster-api v1.12.2 h1:+b+M2IygfvFZJq7bsaloNakimMEVNf81zkGR1IiuxXs=
sigs.k8s.io/cluster-api v1.12.2/go.mod h1:2XuF/dmN3c/1VITb6DB44N5+Ecvsvd5KOWqrY9Q53nU=
sigs.k8s.io/cluster-api v1.12.3 h1:cuOl3fWXhlXFuQcyIH4C8i3ns8rLhtcnK+x00MVdKBs=
sigs.k8s.io/cluster-api v1.12.3/go.mod h1:EAiTJtf/8M5eBetPwumi6t8DJJ55Ln6Fkvh2OAa7PD4=
sigs.k8s.io/controller-runtime v0.23.1 h1:TjJSM80Nf43Mg21+RCy3J70aj/W6KyvDtOlpKf+PupE=
sigs.k8s.io/controller-runtime v0.23.1/go.mod h1:B6COOxKptp+YaUT5q4l6LqUJTRpizbgf9KSRNdQGns0=
sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730 h1:IpInykpT6ceI+QxKBbEflcR5EXP7sU1kvOlxwZh5txg=
Expand Down
21 changes: 20 additions & 1 deletion lib/clusterops/validate_health.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,21 @@ type healthStatus struct {
Message string `json:"message"`
}

type HealthCheckError struct {
FeatureID libsveltosv1beta1.FeatureID
CheckName string
InternalErr error
}

func (e *HealthCheckError) Error() string {
return fmt.Sprintf("health check '%s' for feature %s failed: %v",
e.CheckName, e.FeatureID, e.InternalErr)
}

func (e *HealthCheckError) Unwrap() error {
return e.InternalErr
}

// ValidateHealthPolicies runs all validateDeployment checks registered for the feature (Helm/Kustomize/Resources)
func ValidateHealthPolicies(ctx context.Context, remoteConfig *rest.Config, validateHealths []libsveltosv1beta1.ValidateHealth,
featureID libsveltosv1beta1.FeatureID, isDelete bool, logger logr.Logger) error {
Expand All @@ -61,7 +76,11 @@ func ValidateHealthPolicies(ctx context.Context, remoteConfig *rest.Config, vali

if err := validateHealthPolicy(ctx, remoteConfig, check, isDelete, logger); err != nil {
logger.V(logs.LogInfo).Info(fmt.Sprintf("failed to validate check: %s", err))
return err
return &HealthCheckError{
FeatureID: featureID,
CheckName: check.Name,
InternalErr: err,
}
}
}

Expand Down