diff --git a/docs/autoremediation/auto-remediation.md b/docs/autoremediation/auto-remediation.md index a4bdc6069..2d0399235 100644 --- a/docs/autoremediation/auto-remediation.md +++ b/docs/autoremediation/auto-remediation.md @@ -68,7 +68,13 @@ The GPU Operator Helm installation includes the following Argo Workflows compone The GPU Operator installs Argo Workflows v3.6.5, using a [customized installation YAML](https://github.com/argoproj/argo-workflows/releases/download/v3.6.5/install.yaml) tailored for auto-remediation requirements. This customization excludes components not needed for remediation, such as the Argo workflow server. For more information about Argo Workflows concepts, refer to the [official documentation](https://argo-workflows.readthedocs.io/en/release-3.6/workflow-concepts/). -> **Note:** By default, auto-remediation components (workflow controller and CRDs) are installed during Helm deployment. To disable the installation of these components, use the following Helm flag: +> **Note:** By default, the GPU Operator installs auto-remediation components (the workflow controller and CRDs) during Helm deployment. 
If Argo Workflows is already present in the cluster, you can skip only the CRD installation by setting: +> +> ```bash +> --set remediation.installCRDs=false +> ``` +> +> To disable the auto node remediation feature entirely, use: > > ```bash > --set remediation.enabled=false diff --git a/internal/controllers/remediation_handler.go b/internal/controllers/remediation_handler.go index f9922d46b..b01e61dc0 100644 --- a/internal/controllers/remediation_handler.go +++ b/internal/controllers/remediation_handler.go @@ -73,7 +73,7 @@ const ( AmdGpuRemediationFailed = "amd-gpu-remediation-failed" DefaultUtilityImage = "docker.io/rocm/gpu-operator-utils:latest" // DefaultRecoveryPolicyWindowSize - defines the time window size for recovery policy - DefaultRecoveryPolicyWindowSize = "15m" + DefaultRecoveryPolicyWindowSize = "300m" // DefaultRecoveryPolicyMaxRunsPerWindow - defines the max allowed runs per window for recovery policy // If a specific node condition is hit more than this number of times within the window size, no new remediation workflows will be scheduled DefaultRecoveryPolicyMaxRunsPerWindow = 3 @@ -1543,7 +1543,7 @@ func (h *remediationMgrHelper) isRecoveryPolicyViolated(ctx context.Context, nod logger.Info(fmt.Sprintf("Recent recovery count for node %s and condition %s: %d", nodeName, mapping.NodeCondition, recentRecoveryCount)) logger.Info(fmt.Sprintf("Max allowed runs per window for node %s and condition %s: %d", nodeName, mapping.NodeCondition, maxAllowedRuns)) - return recentRecoveryCount > maxAllowedRuns + return recentRecoveryCount >= maxAllowedRuns } func (h *remediationMgrHelper) isNodeLabelledForForceResume(ctx context.Context, nodeObj *v1.Node) bool {