diff --git a/docs/playbook-reference/actions/remediation.rst b/docs/playbook-reference/actions/remediation.rst
index 6f89a2c0f..abeee9198 100644
--- a/docs/playbook-reference/actions/remediation.rst
+++ b/docs/playbook-reference/actions/remediation.rst
@@ -10,6 +10,8 @@ Robusta includes actions that modify Kubernetes resources in your cluster. See a
 
 .. robusta-action:: playbooks.robusta_playbooks.pod_actions.delete_pod
 
+.. robusta-action:: playbooks.robusta_playbooks.pod_actions.delete_alert_pod on_prometheus_alert
+
 .. robusta-action:: playbooks.robusta_playbooks.job_actions.delete_job on_job_failure
 
 .. robusta-action:: playbooks.robusta_playbooks.autoscaler.alert_on_hpa_reached_limit on_horizontalpodautoscaler_update
diff --git a/playbooks/robusta_playbooks/pod_actions.py b/playbooks/robusta_playbooks/pod_actions.py
index 421346d71..85df8dfc2 100644
--- a/playbooks/robusta_playbooks/pod_actions.py
+++ b/playbooks/robusta_playbooks/pod_actions.py
@@ -1,4 +1,15 @@
-from robusta.api import ActionException, ErrorCodes, PodEvent, action
+import logging
+from typing import Optional
+
+from robusta.api import (
+    ActionException,
+    ActionParams,
+    ErrorCodes,
+    PodEvent,
+    PrometheusKubernetesAlert,
+    RateLimiter,
+    action,
+)
 
 
 @action
@@ -10,3 +21,59 @@ def delete_pod(event: PodEvent):
         raise ActionException(ErrorCodes.RESOURCE_NOT_FOUND, "Failed to get the pod for deletion")
 
     event.get_pod().delete()
+
+
+class DeleteAlertPodParams(ActionParams):
+    """
+    :var rate_limit: Optional rate limit period, in seconds. Leave unset to disable rate limiting.
+    :var rate_limit_field: Name of the alert label whose value scopes the rate limit. Required whenever rate_limit is set.
+    """
+
+    rate_limit: Optional[int] = None
+    rate_limit_field: Optional[str] = None
+
+
+def _deletion_allowed(event: PrometheusKubernetesAlert, params: DeleteAlertPodParams) -> bool:
+    """
+    Decide whether the configured rate limit permits deleting the alert's pod.
+
+    Returns True when no rate limit is configured or when the alert lacks the configured label;
+    otherwise the verdict comes from RateLimiter.mark_and_test. Raises ActionException when
+    rate_limit is set without rate_limit_field.
+    """
+    if params.rate_limit is None:
+        return True
+
+    if not params.rate_limit_field:
+        raise ActionException(
+            ErrorCodes.ILLEGAL_ACTION_PARAMS,
+            "rate_limit_field must be set when rate_limit is configured",
+        )
+
+    field_value = event.alert.labels.get(params.rate_limit_field)
+    if field_value is None:
+        # No label value means no key can be built, so the deletion goes ahead unthrottled.
+        logging.warning(
+            f"delete_alert_pod: alert missing label '{params.rate_limit_field}'; skipping rate limit check"
+        )
+        return True
+
+    key = f"{params.rate_limit_field}:{field_value}"
+    if RateLimiter.mark_and_test("delete_alert_pod", key, params.rate_limit):
+        return True
+
+    logging.info(f"delete_alert_pod rate limited for {key}; skipping deletion")
+    return False
+
+
+@action
+def delete_alert_pod(event: PrometheusKubernetesAlert, params: DeleteAlertPodParams):
+    """
+    Delete the pod that a Prometheus alert refers to.
+
+    Deletion can be throttled by an optional rate limit keyed on the value of an alert label.
+    """
+    target_pod = event.get_pod()
+    if not target_pod:
+        raise ActionException(ErrorCodes.RESOURCE_NOT_FOUND, "Failed to get the pod for deletion")
+
+    if _deletion_allowed(event, params):
+        target_pod.delete()