diff --git a/common/src/components/TerminalOutput.jsx b/common/src/components/TerminalOutput.jsx
index fd64b281..20b7dca8 100644
--- a/common/src/components/TerminalOutput.jsx
+++ b/common/src/components/TerminalOutput.jsx
@@ -35,8 +35,19 @@ function ansi2HTML(str, command) {
export default function Ansi({ command, children }) {
+ let txt = ""
+ try {
- let txt = `[92m❯[0m [37m${command}[0m
` + children.props.children.replaceAll("\n", "
")
+
+ if (children?.props?.children) {
+ txt = `[92m❯[0m [37m${command}[0m
` + children.props.children.replaceAll("\n", "
")
+ } else {
+ txt = `[92m❯[0m [37m${command}[0m`
+ }
+ } catch (e) {
+ console.error(e)
+ txt = ""
+ }
let html = new Convert().toHtml(txt, command)
return (
diff --git a/mission-control/blog/state-based-alerting/canary.yaml b/mission-control/blog/state-based-alerting/canary.yaml
new file mode 100644
index 00000000..bbc884cd
--- /dev/null
+++ b/mission-control/blog/state-based-alerting/canary.yaml
@@ -0,0 +1,16 @@
+apiVersion: canaries.flanksource.com/v1
+kind: Canary
+metadata:
+ name: kubernetes-checks
+ annotations:
+ trace: "true"
+spec:
+ schedule: "@every 5m"
+ kubernetes:
+ - name: pods
+ kind: Pod
+ namespaceSelector:
+ name: default
+ resource:
+ name: test-pod
+ healthy: true # use the is-healthy library to check the health of pods
diff --git a/mission-control/blog/state-based-alerting/index.md b/mission-control/blog/state-based-alerting/index.md
new file mode 100644
index 00000000..88e65bf3
--- /dev/null
+++ b/mission-control/blog/state-based-alerting/index.md
@@ -0,0 +1,500 @@
+# State-Based Alerting: Understanding Why Kubernetes Deployments Fail
+
+
+## Application vs Infrastructure
+
+Applications and infrastructure normally have very different failure scenarios. Application errors are normally due to bugs (that produce exceptions) or performance-related problems. When there is a problem it becomes immediately obvious - a page fails to load or starts timing out. Infrastructure health is more often related to configuration errors, drift, permissions, and unhealthy dependencies; problems can lie latent and be influenced by drift and dependencies.
+
+Common application health methodologies include **USE** (**u**tilization, **s**aturation, **e**rrors) and **RED** (**r**equests, **e**rrors, **d**uration), which primarily use metrics (and log/trace-derived metrics) with thresholds that define known health states. It is fairly straightforward to define healthy, unhealthy, and warning states. These methodologies struggle with unknown states, for example, when no traffic is arriving there is no way to know if there are any errors. Synthetic testing helps to surface such problems by creating artificial transactions.
+
+## Metric (Thresholds and Anomalies)
+
+
+## State Based
+
+## Synthetic Testing
+
+
+Infrastructure errors tend to be more state-oriented.
+
+## Alerting Types and Examples
+
+There are various types of alerting methods, and choosing the right one can be challenging.
+
+| Alerting Type | Example(s) | Use Cases |
+| :---- | :---- | :---- |
+| **Metrics (Threshold)** | \- CPU \> 90% for 5 minutes.<br/>\- Available disk space \< 10GB. | Best for USE (**u**tilization, **s**aturation, **e**rrors) and known errors. |
+| **Anomaly Detection** | \- Website traffic drops 50% compared to the same time last week.<br/>\- Login attempts spike far beyond the normal range. | Useful for detecting unusual patterns and behavior that deviate from historical norms. Suitable for security monitoring and business metrics. |
+| **Log-Based** | \- More than 10 "HTTP 500" errors in web server logs within 1 minute.<br/>\- Any log containing `OutOfMemoryError`. | Ideal for error detection, security events, and application-specific issues that are captured in logs. Good for detailed troubleshooting context. |
+| **State Based** | \- Kubernetes Node condition `Ready` \= False for 10 minutes.<br/>\- Pod status is `CrashLoopBackOff`.<br/>\- Deployment condition `Progressing = False` with reason: `ProgressDeadlineExceeded`. | Suitable for infrastructure and platform monitoring where resources have defined states. Good for Kubernetes and cloud resource health monitoring. |
+| **Synthetic** | \- Simulated user login journey fails from an external testing location.<br/>\- Critical API endpoint response time exceeds 2 seconds from an external check.<br/>\- Website homepage fails to load correctly from an external probe. | Best for end-to-end monitoring and user experience validation. Ideal for critical business flows and external service dependency checks. |
+
+
+This article compares metric-based and state-based alerts for platform teams managing infrastructure and deployments.
+
+Traditional monitoring relies on metrics. Tools like Prometheus collect numerical data and trigger alerts when values cross thresholds. For example, a `PodCrashLooping` rule in Prometheus might fire when the container restart count increases.
+
+State-based alerting takes a different approach. It monitors the actual state that objects report about themselves (in Kubernetes, this includes conditions and status fields), which often contain human-readable explanations. These states often follow RAG (Red-Amber-Green) patterns, making them intuitive to understand.
+
+## Metrics
+
+Using the example of a crashing pod, kubelet does not expose a native metric for this purpose. You need to use `kube-state-metrics`, which exposes the number of container restarts under `kube_pod_container_status_restarts_total`. You can then write a Prometheus Rule (alert):
+
+```yaml
+alert: KubernetesPodCrashLooping
+expr: increase(kube_pod_container_status_restarts_total[1m]) > 3
+for: 2m
+labels:
+ severity: warning
+annotations:
+ summary: Kubernetes pod crash looping (instance {{ $labels.instance }})
+ description: "Pod {{ $labels.namespace }}/{{ $labels.pod }} is crash looping"
+```
+
+This rule produces an alert similar to the following:
+
+
+
+There are some drawbacks with this approach:
+
+* **Limited Details** - The alert tells you **_what_** happened, but not **_why_**.
+* **Limited Context** - You know the name of the pod and namespace, but not much else. If you want to restrict alerts to only pods labelled `env: production`, `kube-state-metrics` needs to be updated to whitelist the label.
+* **Cardinality Challenges** - Whitelisting is required, as without it, you risk a cardinality explosion. Ingesting large amounts of metrics can be expensive and inefficient.
+* **Configuration Overhead** - Each failure scenario requires configuration, first with the extraction of metrics and then by creating and fine-tuning alerts.
+
+These challenges are due to how TSDBs handle textual vs numerical data - the details and context you need are all in the text.
+
+
+## State-Based Alerting
+
+
+The first step to "configuration-less" alerts is some standardization on what it means for something to be unhealthy. This is still an unsolved problem outside of Pod Probes. Kubernetes has taken some early steps with Conditions, an interface for reporting the state of a resource as either `healthy` or `unhealthy`.
+
+If you run the following command to get the YAML definition of a Pod:
+
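+For example, assuming the Pod name shown in the output below:
+
+```bash
+# Print the full YAML definition of the Pod, including its status and conditions
+kubectl get pod nginx-744c4cb859-5p5hk -o yaml
+```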
+
+
+[36mapiVersion[0m:[32m v1[0m
+[32m[0m[36mkind[0m:[32m Pod[0m
+[32m[0m[36mmetadata[0m:[36m[0m
+[36m creationTimestamp[0m:[32m "2025-03-26T10:17:16Z"[0m[36m[0m
+[36m generateName[0m:[32m nginx-744c4cb859-[0m
+[32m [0m[36mlabels[0m:[36m[0m
+[36m app[0m:[32m nginx[0m
+[32m [0m[36mpod-template-hash[0m:[32m 744c4cb859[0m
+[32m [0m[36mname[0m:[32m nginx-744c4cb859-5p5hk[0m
+[32m [0m[36mnamespace[0m:[32m default[0m
+[32m [0m[36mownerReferences[0m:
+ -[36m apiVersion[0m:[32m apps/v1[0m
+[32m [0m[36mblockOwnerDeletion[0m:[95m true[0m
+[95m [0m[36mcontroller[0m:[95m true[0m
+[95m [0m[36mkind[0m:[32m ReplicaSet[0m
+[32m [0m[36mname[0m:[32m nginx-744c4cb859[0m
+[32m [0m[36muid[0m:[32m ae5213f5-988b-4d00-9101-bf6779dc17e4[0m
+[32m [0m[36mresourceVersion[0m:[32m "471187955"[0m[36m[0m
+[36m uid[0m:[32m 138dd15f-2433-479c-9ef9-8fe5034dfb9c[0m
+[32m[0m[36mspec[0m:[36m[0m
+[36m containers[0m:
+ -[36m image[0m:[32m nginx:invalid[0m
+[32m [0m[36mimagePullPolicy[0m:[32m Always[0m
+[32m [0m[36mname[0m:[32m nginx[0m
+[32m [0m[36mresources[0m: {}[36m[0m
+[36m terminationMessagePath[0m:[32m /dev/termination-log[0m
+[32m [0m[36mterminationMessagePolicy[0m:[32m File[0m
+[32m [0m[36mvolumeMounts[0m:
+ -[36m mountPath[0m:[32m /var/run/secrets/kubernetes.io/serviceaccount[0m
+[32m [0m[36mname[0m:[32m kube-api-access-9pnfj[0m
+[32m [0m[36mreadOnly[0m:[95m true[0m
+[95m [0m[36mdnsPolicy[0m:[32m ClusterFirst[0m
+[32m [0m[36menableServiceLinks[0m:[95m true[0m
+[95m [0m[36mimagePullSecrets[0m:
+ -[36m name[0m:[32m dockerhub[0m
+[32m [0m[36mnodeName[0m:[32m ip-10-0-5-138.eu-west-1.compute.internal[0m
+[32m [0m[36mpreemptionPolicy[0m:[32m PreemptLowerPriority[0m
+[32m [0m[36mpriority[0m:[95m 0[0m
+[95m [0m[36mrestartPolicy[0m:[32m Always[0m
+[32m [0m[36mschedulerName[0m:[32m default-scheduler[0m
+[32m [0m[36msecurityContext[0m: {}[36m[0m
+[36m serviceAccount[0m:[32m default[0m
+[32m [0m[36mserviceAccountName[0m:[32m default[0m
+[32m [0m[36mterminationGracePeriodSeconds[0m:[95m 30[0m
+[95m [0m[36mtolerations[0m:
+ -[36m effect[0m:[32m NoExecute[0m
+[32m [0m[36mkey[0m:[32m node.kubernetes.io/not-ready[0m
+[32m [0m[36moperator[0m:[32m Exists[0m
+[32m [0m[36mtolerationSeconds[0m:[95m 300[0m
+[95m [0m-[36m effect[0m:[32m NoExecute[0m
+[32m [0m[36mkey[0m:[32m node.kubernetes.io/unreachable[0m
+[32m [0m[36moperator[0m:[32m Exists[0m
+[32m [0m[36mtolerationSeconds[0m:[95m 300[0m
+[95m [0m[36mvolumes[0m:
+ -[36m name[0m:[32m kube-api-access-9pnfj[0m
+[32m [0m[36mprojected[0m:[36m[0m
+[36m defaultMode[0m:[95m 420[0m
+[95m [0m[36msources[0m:
+ -[36m serviceAccountToken[0m:[36m[0m
+[36m expirationSeconds[0m:[95m 3607[0m
+[95m [0m[36mpath[0m:[32m token[0m
+[32m [0m-[36m configMap[0m:[36m[0m
+[36m items[0m:
+ -[36m key[0m:[32m ca.crt[0m
+[32m [0m[36mpath[0m:[32m ca.crt[0m
+[32m [0m[36mname[0m:[32m kube-root-ca.crt[0m
+[32m [0m-[36m downwardAPI[0m:[36m[0m
+[36m items[0m:
+ -[36m fieldRef[0m:[36m[0m
+[36m apiVersion[0m:[32m v1[0m
+[32m [0m[36mfieldPath[0m:[32m metadata.namespace[0m
+[32m [0m[36mpath[0m:[32m namespace[0m
+[32m[0m[36mstatus[0m:[36m[0m
+[36m conditions[0m:
+ -[36m lastProbeTime[0m: null
+ [36mlastTransitionTime[0m:[32m "2025-03-26T10:17:18Z"[0m[36m[0m
+[36m status[0m:[32m "True"[0m[36m[0m
+[36m type[0m:[32m PodReadyToStartContainers[0m
+[32m [0m-[36m lastProbeTime[0m: null
+ [36mlastTransitionTime[0m:[32m "2025-03-26T10:17:16Z"[0m[36m[0m
+[36m status[0m:[32m "True"[0m[36m[0m
+[36m type[0m:[32m Initialized[0m
+[32m [0m-[36m lastProbeTime[0m: null
+ [36mlastTransitionTime[0m:[32m "2025-03-26T10:17:16Z"[0m[36m[0m
+[36m message[0m:[32m 'containers with unready status: [nginx]'[0m[36m[0m
+[36m reason[0m:[32m ContainersNotReady[0m
+[32m [0m[36mstatus[0m:[32m "False"[0m[36m[0m
+[36m type[0m:[32m Ready[0m
+[32m [0m-[36m lastProbeTime[0m: null
+ [36mlastTransitionTime[0m:[32m "2025-03-26T10:17:16Z"[0m[36m[0m
+[36m message[0m:[32m 'containers with unready status: [nginx]'[0m[36m[0m
+[36m reason[0m:[32m ContainersNotReady[0m
+[32m [0m[36mstatus[0m:[32m "False"[0m[36m[0m
+[36m type[0m:[32m ContainersReady[0m
+[32m [0m-[36m lastProbeTime[0m: null
+ [36mlastTransitionTime[0m:[32m "2025-03-26T10:17:16Z"[0m[36m[0m
+[36m status[0m:[32m "True"[0m[36m[0m
+[36m type[0m:[32m PodScheduled[0m
+[32m [0m[36mcontainerStatuses[0m:
+ -[36m image[0m:[32m nginx:invalid[0m
+[32m [0m[36mimageID[0m:[32m ""[0m[36m[0m
+[36m lastState[0m: {}[36m[0m
+[36m name[0m:[32m nginx[0m
+[32m [0m[36mready[0m:[95m false[0m
+[95m [0m[36mrestartCount[0m:[95m 0[0m
+[95m [0m[36mstarted[0m:[95m false[0m
+[95m [0m[36mstate[0m:[36m[0m
+[36m waiting[0m:[36m[0m
+[36m message[0m:[32m Back-off pulling image "nginx:invalid"[0m
+[32m [0m[36mreason[0m:[32m ImagePullBackOff[0m
+[32m [0m[36mhostIP[0m:[32m 10.0.5.138[0m
+[32m [0m[36mhostIPs[0m:
+ -[36m ip[0m:[32m 10.0.5.138[0m
+[32m [0m[36mphase[0m:[32m Pending[0m
+[32m [0m[36mpodIP[0m:[32m 10.0.5.78[0m
+[32m [0m[36mpodIPs[0m:
+ -[36m ip[0m:[32m 10.0.5.78[0m
+[32m [0m[36mqosClass[0m:[32m BestEffort[0m
+[32m [0m[36mstartTime[0m:[32m "2025-03-26T10:17:16Z"[0m
+
+
+
+
+While standards exist for exposing metrics, there's no equivalent standard for exposing the thresholds or conditions that trigger alerts. This leads to fragmentation and complexity in monitoring setups.
+
+[is-healthy](https://github.com/flanksource/is-healthy) is a tool designed to assess and report the health status of Kubernetes and other cloud resources (such as AWS) without the limitations of metric-based approaches.
+
+You can use `is-healthy` to check the status of a resource. For example, to check a pod and output its health status as YAML:
+
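+The exact invocation depends on how `is-healthy` is built and installed; as a hypothetical sketch, assuming a CLI build that reads a manifest from stdin:
+
+```bash
+# Hypothetical usage: pipe the live Pod manifest into an is-healthy CLI build
+kubectl get pod test-pod -o yaml | is-healthy
+```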
+
+
+```yaml
+ready: false
+health: unhealthy
+status: ImagePullBackOff
+message: Back-off pulling image "nginx:invalid"
+lastUpdated: "2025-03-26T10:17:18Z"
+```
+
+
+This example output shows:
+* **ready**: Whether the resource has finished reconciling or provisioning. Note: `ready` indicates if the resource's desired state matches its actual state, which is different from its health. A pod in a failure state can be `ready` if its state is stable (not changing).
+* **health**: One of `healthy`, `unhealthy`, `warning`, `unknown`. This indicates the overall health assessment.
+* **status**: A text description of the state of the resource, for example, `Running` or `ImagePullBackOff`.
+* **message**: A reason providing more detail for the current status.
+* **lastUpdated**: The timestamp when the resource was last updated or reconciled.
+
+This example isn't that useful on its own, as it needs to run continuously. [canary-checker](https://canarychecker.io/) is a Kubernetes health-check platform with support for 30+ check types. The [`kubernetes`](https://canarychecker.io/reference/kubernetes) check uses the `is-healthy` library:
+
+```yaml title=kubernetes.yaml file=./canary.yaml
+```
+
+This can be run locally:
+
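+A sketch using the canary-checker CLI, assuming the binary is installed and the spec above is saved as `canary.yaml`:
+
+```bash
+# Run the checks defined in canary.yaml once against the current kube context
+canary-checker run canary.yaml
+```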
+
+
+
+
+## Step-by-Step Guide to State-Based Alerting for Deployments
+
+### Understanding Deployment States
+
+Kubernetes Deployments have a `status` field that contains critical information about rollout progress. Examine what a healthy Deployment status looks like.
+
+Open your terminal and create a simple Nginx deployment, waiting for it to become ready:
+
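+One way to do this, assuming a deployment named `nginx` that uses the stock `nginx` image:
+
+```bash
+# Create the deployment, then wait for the rollout to complete
+kubectl create deployment nginx --image=nginx
+kubectl rollout status deployment/nginx
+```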
+
+deployment.apps/nginx created
+
+
+Retrieve the status; for a healthy object it looks similar to this:
+
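+A sketch of one way to print only the status block, assuming `yq` is installed:
+
+```bash
+# Show the status of the deployment as YAML
+kubectl get deployment nginx -o yaml | yq '.status'
+```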
+
+[36mavailableReplicas[0m:[95m 1[0m
+[95m[0m[36mconditions[0m:
+ -[36m lastTransitionTime[0m:[32m "2025-03-26T10:11:23Z"[0m[36m[0m
+[36m lastUpdateTime[0m:[32m "2025-03-26T10:11:23Z"[0m[36m[0m
+[36m message[0m:[32m Deployment has minimum availability.[0m
+[32m [0m[36mreason[0m:[32m MinimumReplicasAvailable[0m
+[32m [0m[36mstatus[0m:[32m "True"[0m[36m[0m
+[36m type[0m:[32m Available[0m
+[32m [0m-[36m lastTransitionTime[0m:[32m "2025-03-26T10:11:18Z"[0m[36m[0m
+[36m lastUpdateTime[0m:[32m "2025-03-26T10:11:23Z"[0m[36m[0m
+[36m message[0m:[32m ReplicaSet "nginx-7584b6f84c" has successfully progressed.[0m
+[32m [0m[36mreason[0m:[32m NewReplicaSetAvailable[0m
+[32m [0m[36mstatus[0m:[32m "True"[0m[36m[0m
+[36m type[0m:[32m Progressing[0m
+[32m[0m[36mobservedGeneration[0m:[95m 1[0m
+[95m[0m[36mreadyReplicas[0m:[95m 1[0m
+[95m[0m[36mreplicas[0m:[95m 1[0m
+[95m[0m[36mupdatedReplicas[0m:[95m 1[0m
+
+
+Simulating a failure:
+
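+One way to simulate a failed rollout, assuming the invalid image tag shown in the Pod spec earlier:
+
+```bash
+# Point the nginx container at an image tag that does not exist
+kubectl set image deployment/nginx nginx=nginx:invalid
+```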
+
+deployment.apps/nginx image updated
+
+And then checking on the status:
+
+[36mavailableReplicas[0m:[95m 1[0m
+[95m[0m[36mconditions[0m:
+ -[36m lastTransitionTime[0m:[32m "2025-03-26T10:11:23Z"[0m[36m[0m
+[36m lastUpdateTime[0m:[32m "2025-03-26T10:11:23Z"[0m[36m[0m
+[36m message[0m:[32m Deployment has minimum availability.[0m
+[32m [0m[36mreason[0m:[32m MinimumReplicasAvailable[0m
+[32m [0m[36mstatus[0m:[32m "True"[0m[36m[0m
+[36m type[0m:[32m Available[0m
+[32m [0m-[36m lastTransitionTime[0m:[32m "2025-03-26T10:11:18Z"[0m[36m[0m
+[36m lastUpdateTime[0m:[32m "2025-03-26T10:17:16Z"[0m[36m[0m
+[36m message[0m:[32m ReplicaSet "nginx-744c4cb859" is progressing.[0m
+[32m [0m[36mreason[0m:[32m ReplicaSetUpdated[0m
+[32m [0m[36mstatus[0m:[32m "True"[0m[36m[0m
+[36m type[0m:[32m Progressing[0m
+[32m[0m[36mobservedGeneration[0m:[95m 2[0m
+[95m[0m[36mreadyReplicas[0m:[95m 1[0m
+[95m[0m[36mreplicas[0m:[95m 2[0m
+[95m[0m[36munavailableReplicas[0m:[95m 1[0m
+[95m[0m[36mupdatedReplicas[0m:[95m 1[0m
+
+
+
+
+### Setting Up State-Based Alerting with Mission Control
+
+Mission Control can monitor these states and alert when they indicate problems. Let's create a check to monitor deployment rollout status.
+
+Create a new file named `deployment-state-check.yaml`:
+
+```yaml
+apiVersion: canaries.flanksource.com/v1
+kind: Canary
+metadata:
+ name: deployment-state-check
+spec:
+ interval: 30
+ kubernetes:
+ - name: check-deployment-rollout
+ description: "Monitor deployment rollout state"
+ resource:
+ apiVersion: apps/v1
+ kind: Deployment
+ name: nginx-deployment
+ namespace: default
+ results:
+ - name: Available
+ selector: $.status.conditions[?(@.type=="Available")].status
+ condition: Equal
+ error: "False"
+ - name: Progressing
+ selector: $.status.conditions[?(@.type=="Progressing")].status
+ condition: Equal
+ error: "False"
+ - name: ProgressingReason
+ selector: $.status.conditions[?(@.type=="Progressing")].reason
+ condition: Equal
+ error: "ProgressDeadlineExceeded"
+ - name: ErrorMessage
+ selector: $.status.conditions[?(@.type=="Progressing")].message
+ display: true
+```
+This Canary check:
+1. Runs every 30 seconds (`interval: 30`).
+2. Targets the `Deployment` named `nginx-deployment` in the `default` namespace.
+3. Defines results based on JSONPath selectors applied to the Deployment's status:
+ - Checks if the `Available` condition status is `False`.
+ - Checks if the `Progressing` condition status is `False`.
+ - Checks if the `Progressing` condition reason is `ProgressDeadlineExceeded`.
+ - Captures the `Progressing` condition message for display (`display: true`).
+ An alert is triggered if any condition marked with `error:` is met.
+
+Use `kubectl` to apply the Canary resource definition to your cluster:
+
+```bash
+kubectl apply -f deployment-state-check.yaml
+```
+
+This command registers your state-based check in Mission Control, which will now monitor your Deployment's state.
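+To confirm the check is registered, a sketch assuming the Canary CRD from canary-checker is installed in the cluster:
+
+```bash
+# List the canary and its most recent check result
+kubectl get canary deployment-state-check
+```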
+
+### Simulating a Failed Deployment
+
+Create a problematic Deployment to see state-based alerting in action.
+
+Create a file named `failing-deployment.yaml`. This YAML defines a Deployment named `failing-deployment`:
+
+```yaml
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+ name: failing-deployment
+ namespace: default
+spec:
+ replicas: 3
+ selector:
+ matchLabels:
+ app: failing-app
+ strategy:
+ rollingUpdate:
+ maxSurge: 1
+ maxUnavailable: 1
+ type: RollingUpdate
+ template:
+ metadata:
+ labels:
+ app: failing-app
+ spec:
+ containers:
+ - name: container
+ image: nginx:latest
+ resources:
+ limits:
+ memory: "10Mi" # Intentionally too small
+ requests:
+ memory: "10Mi"
+ ports:
+ - containerPort: 80
+```
+This Deployment requests 3 replicas but sets a very low memory limit (`10Mi`), which is likely to cause Pods to be terminated with Out Of Memory (OOM) errors.
+
+Use `kubectl` to apply the failing Deployment definition to your cluster:
+
+```bash
+kubectl apply -f failing-deployment.yaml
+```
+
+This command creates the Deployment, which will likely fail because the Pods require more memory than the specified limit.
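+To watch the failure unfold, using the label selector from the manifest above:
+
+```bash
+# Watch the pods; containers killed for exceeding the memory limit show OOMKilled and rising restart counts
+kubectl get pods -l app=failing-app --watch
+```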
+
+### Comparing Alerts: State-Based vs. Prometheus
+
+Now, compare how different monitoring approaches handle this failure.
+
+#### Prometheus Alert (Metric-Based)
+
+With Prometheus, a common alert rule for deployment issues checks for generation mismatches:
+
+```yaml title=KubernetesDeploymentGenerationMismatch
+ - alert: KubernetesDeploymentGenerationMismatch
+ expr: kube_deployment_status_observed_generation != kube_deployment_metadata_generation
+ for: 10m
+ labels:
+ severity: critical
+ annotations:
+ summary: Kubernetes Deployment generation mismatch (instance {{ $labels.instance }})
+ description: "Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has failed but has not been rolled back.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+```
+
+This alert fires when there is a mismatch between the observed and expected generation numbers of a Kubernetes Deployment. The generation number increments whenever the Deployment spec changes. A mismatch indicates that the latest configuration change has not been successfully rolled out by the controller. While useful, it doesn't explain *why* the rollout failed. See [KubernetesDeploymentGenerationMismatch](https://samber.github.io/awesome-prometheus-alerts/rules#rule-kubernetes-1-24) for more details on this type of alert.
+
+
+#### Mission Control State-Based Alert
+
+Use `kubectl` and `jq` to inspect the `conditions` within the status of the `failing-deployment`:
+
+```bash
+kubectl get deployment failing-deployment -o json | jq '.status.conditions'
+```
+
+The output might resemble the following:
+
+```json
+[
+ {
+ "lastTransitionTime": "2023-06-15T15:10:23Z",
+ "lastUpdateTime": "2023-06-15T15:10:23Z",
+ "message": "Deployment does not have minimum availability.",
+ "reason": "MinimumReplicasUnavailable",
+ "status": "False",
+ "type": "Available"
+ },
+ {
+ "lastTransitionTime": "2023-06-15T15:15:45Z",
+ "lastUpdateTime": "2023-06-15T15:15:45Z",
+ "message": "ReplicaSet \"failing-deployment-75d55d96c\" has timed out progressing. 0/3 replicas available. Pods are being killed due to memory limit exceeded.",
+ "reason": "ProgressDeadlineExceeded",
+ "status": "False",
+ "type": "Progressing"
+ }
+]
+```
+This output shows two conditions:
+1. `Available` is `False` because the deployment does not have the minimum required replicas ready (`MinimumReplicasUnavailable`).
+2. `Progressing` is `False` because the rollout timed out (`ProgressDeadlineExceeded`). The message provides specific details about the failure, potentially including reasons like OOM killing if the system surfaces that information here.
+
+Mission Control captures this state and provides an alert with the error message from the `Progressing` condition (e.g., "ReplicaSet ... has timed out progressing..."). This points more directly to the root cause or the symptom reported by Kubernetes.
+
+## Common Pitfalls of State-Based Alerting
+
+### When State-Based Alerting Works Best (and When It Doesn't)
+
+State-based alerting excels when:
+- Resources self-report meaningful status
+- Problems have descriptive error messages
+- You need context for troubleshooting
+
+It's less effective when:
+- Resources don't update status fields
+- You need to alert on trends over time
+- Complex conditions require correlation between multiple resources
+
+### Avoiding Alert Storms
+
+State changes can trigger multiple alerts. To avoid this:
+
+- Group related states into single alerts
+- Add debounce periods for flapping states
+- Use a severity hierarchy based on states
+
+### Combining with Metric-Based Monitoring
+
+The best approach is often a combination:
+- Use state-based alerts for detailed diagnostics
+- Use metric-based alerts for performance issues and trends
+- Create correlation between the two for complete visibility
+
+## Conclusion
+
+State-based alerting transforms monitoring from "something is wrong" to "this is why it's wrong." By capturing the actual state of resources rather than only metrics, Mission Control helps DevOps teams troubleshoot faster and understand failures better.
+
+The ability to extract human-readable error messages directly from Kubernetes resources provides context that metrics alone cannot. As systems become more complex, this context becomes critical for effective incident management.
+
+For Kubernetes operators, combining state-based alerting with traditional metrics creates a complete view of your system's health and gives you the power to resolve issues faster.
+
+- [KubernetesDeploymentGenerationMismatch](https://samber.github.io/awesome-prometheus-alerts/rules#rule-kubernetes-1-24)
diff --git a/mission-control/blog/state-based-alerting/index.mdx b/mission-control/blog/state-based-alerting/index.mdx
new file mode 100644
index 00000000..bf6eb3e0
--- /dev/null
+++ b/mission-control/blog/state-based-alerting/index.mdx
@@ -0,0 +1,502 @@
+# State-Based Alerting: Understanding Why Kubernetes Deployments Fail
+
+
+## Application vs Infrastructure
+
+Applications and infrastructure normally have very different failure scenarios. Application errors are normally due to bugs (that produce exceptions) or performance-related problems. When there is a problem it becomes immediately obvious - a page fails to load or starts timing out. Infrastructure health is more often related to configuration errors, drift, permissions, and unhealthy dependencies; problems can lie latent and be influenced by drift and dependencies.
+
+Common application health methodologies include USE (**u**tilization, **s**aturation, **e**rrors) and RED (**r**equests, **e**rrors, **d**uration), which primarily use metrics (and log/trace-derived metrics) with thresholds that define known health states. It is fairly straightforward to define healthy, unhealthy, and warning states. These methodologies struggle with unknown states, for example, when no traffic is arriving there is no way to know if there are any errors. Synthetic testing helps to surface such problems by creating artificial transactions.
+
+## Metric (Thresholds and Anomalies)
+
+
+## State Based
+
+State-based alerting takes a different approach. It monitors the actual state that objects report about themselves (in Kubernetes, this includes `conditions` and `status` fields), which often contain human-readable explanations. These states often follow RAG (Red-Amber-Green) patterns, making them intuitive to understand.
+
+## Synthetic Testing
+
+
+Infrastructure errors tend to be more state-oriented.
+
+## Alerting Types and Examples
+
+There are various types of alerting methods, and choosing the right one can be challenging.
+
+| Alerting Type | Example(s) | Use Cases |
+| :---- | :---- | :---- |
+| **Metrics (Threshold)** | \- CPU \> 90% for 5 minutes.<br/>\- Available disk space \< 10GB. | Best for USE (**u**tilization, **s**aturation, **e**rrors) and known errors. |
+| **Anomaly Detection** | \- Website traffic drops 50% compared to the same time last week.<br/>\- Login attempts spike far beyond the normal range. | Useful for detecting unusual patterns and behavior that deviate from historical norms. Suitable for security monitoring and business metrics. |
+| **Log-Based** | \- More than 10 "HTTP 500" errors in web server logs within 1 minute.<br/>\- Any log containing `OutOfMemoryError`. | Ideal for error detection, security events, and application-specific issues that are captured in logs. Good for detailed troubleshooting context. |
+| **State Based** | \- Kubernetes Node condition `Ready` \= False for 10 minutes.<br/>\- Pod status is `CrashLoopBackOff`.<br/>\- Deployment condition `Progressing = False` with reason: `ProgressDeadlineExceeded`. | Suitable for infrastructure and platform monitoring where resources have defined states. Good for Kubernetes and cloud resource health monitoring. |
+| **Synthetic** | \- Simulated user login journey fails from an external testing location.<br/>\- Critical API endpoint response time exceeds 2 seconds from an external check.<br/>\- Website homepage fails to load correctly from an external probe. | Best for end-to-end monitoring and user experience validation. Ideal for critical business flows and external service dependency checks. |
+
+
+This article compares metric-based and state-based alerts for platform teams managing infrastructure and deployments.
+
+Traditional monitoring relies on metrics. Tools like Prometheus collect numerical data and trigger alerts when values cross thresholds. For example, a `PodCrashLooping` rule in Prometheus might fire when the container restart count increases.
+
+
+
+## Metrics
+
+Using the example of a crashing pod, kubelet does not expose a native metric for this purpose. You need to use `kube-state-metrics`, which exposes the number of container restarts under `kube_pod_container_status_restarts_total`. You can then write a Prometheus Rule (alert):
+
+```yaml
+alert: KubernetesPodCrashLooping
+expr: increase(kube_pod_container_status_restarts_total[1m]) > 3
+for: 2m
+labels:
+ severity: warning
+annotations:
+ summary: Kubernetes pod crash looping (instance {{ $labels.instance }})
+ description: "Pod {{ $labels.namespace }}/{{ $labels.pod }} is crash looping"
+```
+
+This rule produces an alert similar to the following:
+
+
+
+There are some drawbacks with this approach:
+
+* **Limited Details** - The alert tells you **_what_** happened, but not **_why_**.
+* **Limited Context** - You know the name of the pod and namespace, but not much else. If you want to restrict alerts to only pods labelled `env: production`, `kube-state-metrics` needs to be updated to whitelist the label.
+* **Cardinality Challenges** - Whitelisting is required, as without it, you risk a cardinality explosion. Ingesting large amounts of metrics can be expensive and inefficient.
+* **Configuration Overhead** - Each failure scenario requires configuration, first with the extraction of metrics and then by creating and fine-tuning alerts.
+
+These challenges are due to how TSDBs handle textual vs numerical data - the details and context you need are all in the text.
+
+
+## State-Based Alerting
+
+
+The first step to "configuration-less" alerts is some standardization on what it means for something to be unhealthy. This is still an unsolved problem outside of Pod Probes. Kubernetes has taken some early steps with Conditions, an interface for reporting the state of a resource as either `healthy` or `unhealthy`.
+
+If you run the following command to get the YAML definition of a Pod:
+
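+For example, assuming the Pod name shown in the output below:
+
+```bash
+# Print the full YAML definition of the Pod, including its status and conditions
+kubectl get pod nginx-744c4cb859-5p5hk -o yaml
+```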
+
+
+[36mapiVersion[0m:[32m v1[0m
+[32m[0m[36mkind[0m:[32m Pod[0m
+[32m[0m[36mmetadata[0m:[36m[0m
+[36m creationTimestamp[0m:[32m "2025-03-26T10:17:16Z"[0m[36m[0m
+[36m generateName[0m:[32m nginx-744c4cb859-[0m
+[32m [0m[36mlabels[0m:[36m[0m
+[36m app[0m:[32m nginx[0m
+[32m [0m[36mpod-template-hash[0m:[32m 744c4cb859[0m
+[32m [0m[36mname[0m:[32m nginx-744c4cb859-5p5hk[0m
+[32m [0m[36mnamespace[0m:[32m default[0m
+[32m [0m[36mownerReferences[0m:
+ -[36m apiVersion[0m:[32m apps/v1[0m
+[32m [0m[36mblockOwnerDeletion[0m:[95m true[0m
+[95m [0m[36mcontroller[0m:[95m true[0m
+[95m [0m[36mkind[0m:[32m ReplicaSet[0m
+[32m [0m[36mname[0m:[32m nginx-744c4cb859[0m
+[32m [0m[36muid[0m:[32m ae5213f5-988b-4d00-9101-bf6779dc17e4[0m
+[32m [0m[36mresourceVersion[0m:[32m "471187955"[0m[36m[0m
+[36m uid[0m:[32m 138dd15f-2433-479c-9ef9-8fe5034dfb9c[0m
+[32m[0m[36mspec[0m:[36m[0m
+[36m containers[0m:
+ -[36m image[0m:[32m nginx:invalid[0m
+[32m [0m[36mimagePullPolicy[0m:[32m Always[0m
+[32m [0m[36mname[0m:[32m nginx[0m
+[32m [0m[36mresources[0m: {}[36m[0m
+[36m terminationMessagePath[0m:[32m /dev/termination-log[0m
+[32m [0m[36mterminationMessagePolicy[0m:[32m File[0m
+[32m [0m[36mvolumeMounts[0m:
+ -[36m mountPath[0m:[32m /var/run/secrets/kubernetes.io/serviceaccount[0m
+[32m [0m[36mname[0m:[32m kube-api-access-9pnfj[0m
+[32m [0m[36mreadOnly[0m:[95m true[0m
+[95m [0m[36mdnsPolicy[0m:[32m ClusterFirst[0m
+[32m [0m[36menableServiceLinks[0m:[95m true[0m
+[95m [0m[36mimagePullSecrets[0m:
+ -[36m name[0m:[32m dockerhub[0m
+[32m [0m[36mnodeName[0m:[32m ip-10-0-5-138.eu-west-1.compute.internal[0m
+[32m [0m[36mpreemptionPolicy[0m:[32m PreemptLowerPriority[0m
+[32m [0m[36mpriority[0m:[95m 0[0m
+[95m [0m[36mrestartPolicy[0m:[32m Always[0m
+[32m [0m[36mschedulerName[0m:[32m default-scheduler[0m
+[32m [0m[36msecurityContext[0m: {}[36m[0m
+[36m serviceAccount[0m:[32m default[0m
+[32m [0m[36mserviceAccountName[0m:[32m default[0m
+[32m [0m[36mterminationGracePeriodSeconds[0m:[95m 30[0m
+[95m [0m[36mtolerations[0m:
+ -[36m effect[0m:[32m NoExecute[0m
+[32m [0m[36mkey[0m:[32m node.kubernetes.io/not-ready[0m
+[32m [0m[36moperator[0m:[32m Exists[0m
+[32m [0m[36mtolerationSeconds[0m:[95m 300[0m
+[95m [0m-[36m effect[0m:[32m NoExecute[0m
+[32m [0m[36mkey[0m:[32m node.kubernetes.io/unreachable[0m
+[32m [0m[36moperator[0m:[32m Exists[0m
+[32m [0m[36mtolerationSeconds[0m:[95m 300[0m
+[95m [0m[36mvolumes[0m:
+ -[36m name[0m:[32m kube-api-access-9pnfj[0m
+[32m [0m[36mprojected[0m:[36m[0m
+[36m defaultMode[0m:[95m 420[0m
+[95m [0m[36msources[0m:
+ -[36m serviceAccountToken[0m:[36m[0m
+[36m expirationSeconds[0m:[95m 3607[0m
+[95m [0m[36mpath[0m:[32m token[0m
+[32m [0m-[36m configMap[0m:[36m[0m
+[36m items[0m:
+ -[36m key[0m:[32m ca.crt[0m
+[32m [0m[36mpath[0m:[32m ca.crt[0m
+[32m [0m[36mname[0m:[32m kube-root-ca.crt[0m
+[32m [0m-[36m downwardAPI[0m:[36m[0m
+[36m items[0m:
+ -[36m fieldRef[0m:[36m[0m
+[36m apiVersion[0m:[32m v1[0m
+[32m [0m[36mfieldPath[0m:[32m metadata.namespace[0m
+[32m [0m[36mpath[0m:[32m namespace[0m
+[32m[0m[36mstatus[0m:[36m[0m
+[36m conditions[0m:
+ -[36m lastProbeTime[0m: null
+ [36mlastTransitionTime[0m:[32m "2025-03-26T10:17:18Z"[0m[36m[0m
+[36m status[0m:[32m "True"[0m[36m[0m
+[36m type[0m:[32m PodReadyToStartContainers[0m
+[32m [0m-[36m lastProbeTime[0m: null
+ [36mlastTransitionTime[0m:[32m "2025-03-26T10:17:16Z"[0m[36m[0m
+[36m status[0m:[32m "True"[0m[36m[0m
+[36m type[0m:[32m Initialized[0m
+[32m [0m-[36m lastProbeTime[0m: null
+ [36mlastTransitionTime[0m:[32m "2025-03-26T10:17:16Z"[0m[36m[0m
+[36m message[0m:[32m 'containers with unready status: [nginx]'[0m[36m[0m
+[36m reason[0m:[32m ContainersNotReady[0m
+[32m [0m[36mstatus[0m:[32m "False"[0m[36m[0m
+[36m type[0m:[32m Ready[0m
+[32m [0m-[36m lastProbeTime[0m: null
+ [36mlastTransitionTime[0m:[32m "2025-03-26T10:17:16Z"[0m[36m[0m
+[36m message[0m:[32m 'containers with unready status: [nginx]'[0m[36m[0m
+[36m reason[0m:[32m ContainersNotReady[0m
+[32m [0m[36mstatus[0m:[32m "False"[0m[36m[0m
+[36m type[0m:[32m ContainersReady[0m
+[32m [0m-[36m lastProbeTime[0m: null
+ [36mlastTransitionTime[0m:[32m "2025-03-26T10:17:16Z"[0m[36m[0m
+[36m status[0m:[32m "True"[0m[36m[0m
+[36m type[0m:[32m PodScheduled[0m
+[32m [0m[36mcontainerStatuses[0m:
+ -[36m image[0m:[32m nginx:invalid[0m
+[32m [0m[36mimageID[0m:[32m ""[0m[36m[0m
+[36m lastState[0m: {}[36m[0m
+[36m name[0m:[32m nginx[0m
+[32m [0m[36mready[0m:[95m false[0m
+[95m [0m[36mrestartCount[0m:[95m 0[0m
+[95m [0m[36mstarted[0m:[95m false[0m
+[95m [0m[36mstate[0m:[36m[0m
+[36m waiting[0m:[36m[0m
+[36m message[0m:[32m Back-off pulling image "nginx:invalid"[0m
+[32m [0m[36mreason[0m:[32m ImagePullBackOff[0m
+[32m [0m[36mhostIP[0m:[32m 10.0.5.138[0m
+[32m [0m[36mhostIPs[0m:
+ -[36m ip[0m:[32m 10.0.5.138[0m
+[32m [0m[36mphase[0m:[32m Pending[0m
+[32m [0m[36mpodIP[0m:[32m 10.0.5.78[0m
+[32m [0m[36mpodIPs[0m:
+ -[36m ip[0m:[32m 10.0.5.78[0m
+[32m [0m[36mqosClass[0m:[32m BestEffort[0m
+[32m [0m[36mstartTime[0m:[32m "2025-03-26T10:17:16Z"[0m
+
+
+
+
+While standards exist for exposing metrics, there's no equivalent standard for exposing the thresholds or conditions that trigger alerts. This leads to fragmentation and complexity in monitoring setups.
+
+[is-healthy](https://github.com/flanksource/is-healthy) is a tool designed to assess and report the health status of Kubernetes and other cloud resources (such as AWS) without the limitations of metric-based approaches.
+
+You can use `is-healthy` to check the status of a resource. For example, to check a pod and output its health status as YAML:
+
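+The exact invocation depends on how `is-healthy` is built and installed; as a hypothetical sketch, assuming a CLI build that reads a manifest from stdin:
+
+```bash
+# Hypothetical usage: pipe the live Pod manifest into an is-healthy CLI build
+kubectl get pod test-pod -o yaml | is-healthy
+```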
+
+
+```yaml
+ready: false
+health: unhealthy
+status: ImagePullBackOff
+message: Back-off pulling image "nginx:invalid"
+lastUpdated: "2025-03-26T10:17:18Z"
+```
+
+
+This example output shows:
+* **ready**: Whether the resource has finished reconciling or provisioning. Note: `ready` indicates if the resource's desired state matches its actual state, which is different from its health. A pod in a failure state can be `ready` if its state is stable (not changing).
+* **health**: One of `healthy`, `unhealthy`, `warning`, `unknown`. This indicates the overall health assessment.
+* **status**: A text description of the state of the resource, for example, `Running` or `ImagePullBackOff`.
+* **message**: A reason providing more detail for the current status.
+* **lastUpdated**: The timestamp when the resource was last updated or reconciled.
+
+This example isn't that useful on its own, as it needs to run continuously. [canary-checker](https://canarychecker.io/) is a Kubernetes health-check platform with support for 30+ check types. The [`kubernetes`](https://canarychecker.io/reference/kubernetes) check uses the `is-healthy` library:
+
+```yaml title=kubernetes.yaml file=./canary.yaml
+```
+
+This can be run locally:
+
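+A sketch using the canary-checker CLI, assuming the binary is installed and the spec above is saved as `canary.yaml`:
+
+```bash
+# Run the checks defined in canary.yaml once against the current kube context
+canary-checker run canary.yaml
+```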
+
+
+
+
+## Step-by-Step Guide to State-Based Alerting for Deployments
+
+### Understanding Deployment States
+
+Kubernetes Deployments have a `status` field that contains critical information about rollout progress. Examine what a healthy Deployment status looks like.
+
+Open your terminal and create a simple Nginx deployment, waiting for it to become ready:
+
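+One way to do this, assuming a deployment named `nginx` that uses the stock `nginx` image:
+
+```bash
+# Create the deployment, then wait for the rollout to complete
+kubectl create deployment nginx --image=nginx
+kubectl rollout status deployment/nginx
+```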
+
+deployment.apps/nginx created
+
+
+Retrieve the status; for a healthy object it looks similar to this:
+
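+A sketch of one way to print only the status block, assuming `yq` is installed:
+
+```bash
+# Show the status of the deployment as YAML
+kubectl get deployment nginx -o yaml | yq '.status'
+```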
+
+[36mavailableReplicas[0m:[95m 1[0m
+[95m[0m[36mconditions[0m:
+ -[36m lastTransitionTime[0m:[32m "2025-03-26T10:11:23Z"[0m[36m[0m
+[36m lastUpdateTime[0m:[32m "2025-03-26T10:11:23Z"[0m[36m[0m
+[36m message[0m:[32m Deployment has minimum availability.[0m
+[32m [0m[36mreason[0m:[32m MinimumReplicasAvailable[0m
+[32m [0m[36mstatus[0m:[32m "True"[0m[36m[0m
+[36m type[0m:[32m Available[0m
+[32m [0m-[36m lastTransitionTime[0m:[32m "2025-03-26T10:11:18Z"[0m[36m[0m
+[36m lastUpdateTime[0m:[32m "2025-03-26T10:11:23Z"[0m[36m[0m
+[36m message[0m:[32m ReplicaSet "nginx-7584b6f84c" has successfully progressed.[0m
+[32m [0m[36mreason[0m:[32m NewReplicaSetAvailable[0m
+[32m [0m[36mstatus[0m:[32m "True"[0m[36m[0m
+[36m type[0m:[32m Progressing[0m
+[32m[0m[36mobservedGeneration[0m:[95m 1[0m
+[95m[0m[36mreadyReplicas[0m:[95m 1[0m
+[95m[0m[36mreplicas[0m:[95m 1[0m
+[95m[0m[36mupdatedReplicas[0m:[95m 1[0m
+
+
+Simulating a failure:
+
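+One way to simulate a failed rollout, assuming the invalid image tag shown in the Pod spec earlier:
+
+```bash
+# Point the nginx container at an image tag that does not exist
+kubectl set image deployment/nginx nginx=nginx:invalid
+```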
+
+deployment.apps/nginx image updated
+
+And then checking on the status:
+
+[36mavailableReplicas[0m:[95m 1[0m
+[95m[0m[36mconditions[0m:
+ -[36m lastTransitionTime[0m:[32m "2025-03-26T10:11:23Z"[0m[36m[0m
+[36m lastUpdateTime[0m:[32m "2025-03-26T10:11:23Z"[0m[36m[0m
+[36m message[0m:[32m Deployment has minimum availability.[0m
+[32m [0m[36mreason[0m:[32m MinimumReplicasAvailable[0m
+[32m [0m[36mstatus[0m:[32m "True"[0m[36m[0m
+[36m type[0m:[32m Available[0m
+[32m [0m-[36m lastTransitionTime[0m:[32m "2025-03-26T10:11:18Z"[0m[36m[0m
+[36m lastUpdateTime[0m:[32m "2025-03-26T10:17:16Z"[0m[36m[0m
+[36m message[0m:[32m ReplicaSet "nginx-744c4cb859" is progressing.[0m
+[32m [0m[36mreason[0m:[32m ReplicaSetUpdated[0m
+[32m [0m[36mstatus[0m:[32m "True"[0m[36m[0m
+[36m type[0m:[32m Progressing[0m
+[32m[0m[36mobservedGeneration[0m:[95m 2[0m
+[95m[0m[36mreadyReplicas[0m:[95m 1[0m
+[95m[0m[36mreplicas[0m:[95m 2[0m
+[95m[0m[36munavailableReplicas[0m:[95m 1[0m
+[95m[0m[36mupdatedReplicas[0m:[95m 1[0m
+
+
+
+
+### Setting Up State-Based Alerting with Mission Control
+
+Mission Control can monitor these states and alert when they indicate problems. Let's create a check to monitor deployment rollout status.
+
+Create a new file named `deployment-state-check.yaml`:
+
+```yaml
+apiVersion: canaries.flanksource.com/v1
+kind: Canary
+metadata:
+ name: deployment-state-check
+spec:
+ interval: 30
+ kubernetes:
+ - name: check-deployment-rollout
+ description: "Monitor deployment rollout state"
+ resource:
+ apiVersion: apps/v1
+ kind: Deployment
+ name: nginx-deployment
+ namespace: default
+ results:
+ - name: Available
+ selector: $.status.conditions[?(@.type=="Available")].status
+ condition: Equal
+ error: "False"
+ - name: Progressing
+ selector: $.status.conditions[?(@.type=="Progressing")].status
+ condition: Equal
+ error: "False"
+ - name: ProgressingReason
+ selector: $.status.conditions[?(@.type=="Progressing")].reason
+ condition: Equal
+ error: "ProgressDeadlineExceeded"
+ - name: ErrorMessage
+ selector: $.status.conditions[?(@.type=="Progressing")].message
+ display: true
+```
+This Canary check:
+1. Runs every 30 seconds (`interval: 30`).
+2. Targets the `Deployment` named `nginx-deployment` in the `default` namespace.
+3. Defines results based on JSONPath selectors applied to the Deployment's status:
+ - Checks if the `Available` condition status is `False`.
+ - Checks if the `Progressing` condition status is `False`.
+ - Checks if the `Progressing` condition reason is `ProgressDeadlineExceeded`.
+ - Captures the `Progressing` condition message for display (`display: true`).
+An alert is triggered if any condition marked with `error:` is met.
+
+Use `kubectl` to apply the Canary resource definition to your cluster:
+
+```bash
+kubectl apply -f deployment-state-check.yaml
+```
+
+This command registers your state-based check in Mission Control, which will now monitor your Deployment's state.
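+To confirm the check is registered, a sketch assuming the Canary CRD from canary-checker is installed in the cluster:
+
+```bash
+# List the canary and its most recent check result
+kubectl get canary deployment-state-check
+```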
+
+### Simulating a Failed Deployment
+
+Create a problematic Deployment to see state-based alerting in action.
+
+Create a file named `failing-deployment.yaml`. This YAML defines a Deployment named `failing-deployment`:
+
+```yaml
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+ name: failing-deployment
+ namespace: default
+spec:
+ replicas: 3
+ selector:
+ matchLabels:
+ app: failing-app
+ strategy:
+ rollingUpdate:
+ maxSurge: 1
+ maxUnavailable: 1
+ type: RollingUpdate
+ template:
+ metadata:
+ labels:
+ app: failing-app
+ spec:
+ containers:
+ - name: container
+ image: nginx:latest
+ resources:
+ limits:
+ memory: "10Mi" # Intentionally too small
+ requests:
+ memory: "10Mi"
+ ports:
+ - containerPort: 80
+```
+This Deployment requests 3 replicas but sets a very low memory limit (`10Mi`), which is likely to cause Pods to be terminated with Out Of Memory (OOM) errors.
+
+Use `kubectl` to apply the failing Deployment definition to your cluster:
+
+```bash
+kubectl apply -f failing-deployment.yaml
+```
+
+This command creates the Deployment, which will likely fail because the Pods require more memory than the specified limit.
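+To watch the failure unfold, using the label selector from the manifest above:
+
+```bash
+# Watch the pods; containers killed for exceeding the memory limit show OOMKilled and rising restart counts
+kubectl get pods -l app=failing-app --watch
+```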
+
+### Comparing Alerts: State-Based vs. Prometheus
+
+Now, compare how different monitoring approaches handle this failure.
+
+#### Prometheus Alert (Metric-Based)
+
+With Prometheus, a common alert rule for deployment issues checks for generation mismatches:
+
+```yaml title=KubernetesDeploymentGenerationMismatch
+ - alert: KubernetesDeploymentGenerationMismatch
+ expr: kube_deployment_status_observed_generation != kube_deployment_metadata_generation
+ for: 10m
+ labels:
+ severity: critical
+ annotations:
+ summary: Kubernetes Deployment generation mismatch (instance {{ $labels.instance }})
+ description: "Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has failed but has not been rolled back.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+```
+
+This alert fires when there is a mismatch between the observed and expected generation numbers of a Kubernetes Deployment. The generation number increments whenever the Deployment spec changes. A mismatch indicates that the latest configuration change has not been successfully rolled out by the controller. While useful, it doesn't explain *why* the rollout failed. See [KubernetesDeploymentGenerationMismatch](https://samber.github.io/awesome-prometheus-alerts/rules#rule-kubernetes-1-24) for more details on this type of alert.
+
+
+#### Mission Control State-Based Alert
+
+Use `kubectl` and `jq` to inspect the `conditions` within the status of the `failing-deployment`:
+
+```bash
+kubectl get deployment failing-deployment -o json | jq '.status.conditions'
+```
+
+The output might resemble the following:
+
+```json
+[
+ {
+ "lastTransitionTime": "2023-06-15T15:10:23Z",
+ "lastUpdateTime": "2023-06-15T15:10:23Z",
+ "message": "Deployment does not have minimum availability.",
+ "reason": "MinimumReplicasUnavailable",
+ "status": "False",
+ "type": "Available"
+ },
+ {
+ "lastTransitionTime": "2023-06-15T15:15:45Z",
+ "lastUpdateTime": "2023-06-15T15:15:45Z",
+ "message": "ReplicaSet \"failing-deployment-75d55d96c\" has timed out progressing. 0/3 replicas available. Pods are being killed due to memory limit exceeded.",
+ "reason": "ProgressDeadlineExceeded",
+ "status": "False",
+ "type": "Progressing"
+ }
+]
+```
+This output shows two conditions:
+1. `Available` is `False` because the deployment does not have the minimum required replicas ready (`MinimumReplicasUnavailable`).
+2. `Progressing` is `False` because the rollout timed out (`ProgressDeadlineExceeded`). The message provides specific details about the failure, potentially including reasons like OOM killing if the system surfaces that information here.
+
+Mission Control captures this state and provides an alert with the error message from the `Progressing` condition (e.g., "ReplicaSet ... has timed out progressing..."). This points more directly to the root cause or the symptom reported by Kubernetes.
+
+## Common Pitfalls of State-Based Alerting
+
+### When State-Based Alerting Works Best (and When It Doesn't)
+
+State-based alerting excels when:
+- Resources self-report meaningful status
+- Problems have descriptive error messages
+- You need context for troubleshooting
+
+It's less effective when:
+- Resources don't update status fields
+- You need to alert on trends over time
+- Complex conditions require correlation between multiple resources
+
+### Avoiding Alert Storms
+
+State changes can trigger multiple alerts. To avoid this:
+
+- Group related states into single alerts
+- Add debounce periods for flapping states
+- Use a severity hierarchy based on states
+
+### Combining with Metric-Based Monitoring
+
+The best approach is often a combination:
+- Use state-based alerts for detailed diagnostics
+- Use metric-based alerts for performance issues and trends
+- Create correlation between the two for complete visibility
+
+## Conclusion
+
+State-based alerting transforms monitoring from "something is wrong" to "this is why it's wrong." By capturing the actual state of resources rather than only metrics, Mission Control helps DevOps teams troubleshoot faster and understand failures better.
+
+The ability to extract human-readable error messages directly from Kubernetes resources provides context that metrics alone cannot. As systems become more complex, this context becomes critical for effective incident management.
+
+For Kubernetes operators, combining state-based alerting with traditional metrics creates a complete view of your system's health and gives you the power to resolve issues faster.
+
+- [KubernetesDeploymentGenerationMismatch](https://samber.github.io/awesome-prometheus-alerts/rules#rule-kubernetes-1-24)
diff --git a/mission-control/blog/state-based-alerting/nginx.yaml b/mission-control/blog/state-based-alerting/nginx.yaml
new file mode 100644
index 00000000..2445e68f
--- /dev/null
+++ b/mission-control/blog/state-based-alerting/nginx.yaml
@@ -0,0 +1,8 @@
+apiVersion: v1
+kind: Pod
+metadata:
+ name: test-pod
+spec:
+ containers:
+ - image: nginx:invalid
+ name: nginx
diff --git a/mission-control/static/img/PodCrashLooping.png b/mission-control/static/img/PodCrashLooping.png
new file mode 100644
index 00000000..c59590ec
Binary files /dev/null and b/mission-control/static/img/PodCrashLooping.png differ
diff --git a/prompts/blog.md b/prompts/blog.md
new file mode 100644
index 00000000..b817d0aa
--- /dev/null
+++ b/prompts/blog.md
@@ -0,0 +1,55 @@
+Write a 1250-1500 word introductory blog post for a DevOps / Platform Engineering focused product or feature.
+
+The tone should be developer-friendly, technical and informative without sounding salesy or promotional.
+
+The writing should have no fluff, use short punchy sentences, avoid buzzwords and speak like a senior engineer would.
+
+The purpose of this copy is to generate interest in a new approach to a feature, educate DevOps engineers, increase awareness, and reduce friction to trial.
+
+Speak directly to platform engineers, SREs and DevOps leads and address their pain points of change and tool fatigue.
+
+Avoid generic language. Favor clarity over cleverness. Highlight real-world outcomes and developer-first thinking.
+
+Include working code examples / snippets and links where appropriate.
+
+Format the output in raw markdown suitable for copy and pasting into vscode.
+
+Write a blog post on the Flanksource Mission Control approach to AIOps, primarily building a real-time, continuously updated mirror of the state of cloud resources that can be queried rapidly, plus an advanced graph that builds relationships between resources e.g. CloudFormation -> Auto Scaling Group -> EC2 instance, and then layers on soft relationships like EBS volumes, subnets, IAM policies. For Kubernetes it understands Flux and GitOps, being able to build a graph of a Flux Kustomization object creating a HelmRelease CRD, which then creates a deployment -> replicaset -> pod, and then layering on relationships like services, nodes, PVC, ingress, etc.
+
+State-based alerting (i.e. when resources self-report failure) and traditional alerts from APM tools trigger playbooks that can then proactively collect information in a distributed fashion from agents deployed closest to the data. The graph, changes to the graph resources, events, and proactive playbooks are then fed into the model, which can then recommend further playbooks to execute.
+
+This is an advantage as access to systems is pushed down to agents that can use secrets like pod identity and service accounts to collect data; new agent actions are easy to create with YAML-based playbooks.
+
+Write a blog post on the benefits of GitOps and the challenges of adoption - especially with mixed-maturity teams (some prefer working in git and others like clickops) - Highlight the Mission Control approach to GitOps (tracking resources and building a graph of how they map to git repositories and sources), which enables "editing" Kubernetes objects with the changes being submitted back to git. The benefits include a
+
+Contrast metrics vs state-driven alerting, starting with concepts such as RED and USE and how they are more appropriate for monitoring transactions and steady-state workloads but fall short for platform engineering tasks such as monitoring the rollout of a new application or checking if a cluster is healthy after an upgrade. Then use examples of Prometheus and the canary-checker kubernetes check (https://flanksource.com/docs/guide/canary-checker/reference/kubernetes), which uses the underlying https://github.com/flanksource/is-healthy library.
+
+Highlight the drawbacks of the canary-checker approach, which is poll-based and does not scale very well, and demonstrate how https://github.com/flanksource/config-db takes this one step further by using a state-driven approach that watches for changes to cloud resources, and then fires events when the state becomes unhealthy.
+
+
+This is more scalable and can be used to monitor the health of a cluster or application in real time.
+
+Optionally, include {optional elements} (e.g. a strong CTA, technical example, code snippet, customer proof, comparison table).
+
+
+Act as a technical blog writer targeting DevOps and platform engineers working with Kubernetes, GitOps, Helm and Flux. When editing and rewriting content, follow these instructions strictly:
+
+1. Use the following outline for the blog:
+ * Introduction - introduce the topic of the blog with a pain point or teaser
+ * Background - Describe the context and initial challenges.
+ * Step by step guide
+  * Common Pitfalls - Highlight common mistakes and how to avoid them and add use-cases that are not a good fit
+  * Conclusion - Offer final thoughts and potential future implications.
+2. Write at a Grade 10 level
+3. Use clear, concise simple language, even when explaining complex topics.
+4. Bias toward short sentences.
+5. Mix and match lists and paragraphs
+6. Do not use any salesy or marketing terms. Do not use adverbs.
+7. Use MDX formatting
+8. Precede every command with an explanation of what the command does. After the command, provide additional details about the command, such as what the arguments do and why your reader is using them.
+9. Explicitly tell the user to create or open each file you’ll have them use.
+10. Like commands, always introduce a file or script by describing its general purpose, then explain any changes that the reader will be making in the file. Without these explanations, readers won’t be able to customize, update, or troubleshoot issues in the long run.
+11. If you’re asking the reader to write code, follow the same approach for commands: introduce the code block with a high-level explanation of what it does. Then show the code, and then call out any important details.
+12. Do not use the term "this document", when referring to the system or product being documented always use "Mission Control"
+13. Ensure all examples and use cases are relevant
+
diff --git a/prompts/style.md b/prompts/style.md
new file mode 100644
index 00000000..e95a2c4c
--- /dev/null
+++ b/prompts/style.md
@@ -0,0 +1,56 @@
+# Writing Style Guidelines
+
+Follow these strict rules:
+
+1. Avoid adverbs and complex language
+
+## Formatting
+- Format all output using MDX (markdown)
+- Format code and examples using this example:
+
+ ```yaml title=some-title.yaml
+ ```
+- Do not remove any "```" or "//highlight-next-line" text
+- Follow standard markdown rules provided by markdownlint
+
+
+## Verb Tense
+- Use present tense verbs instead of future tense.
+- Say "this happens" rather than "this will happen."
+- Avoid words like "will," "shall," "won't," "shan't," and contractions with "'ll."
+
+## Voice
+- Do not use first person (I, me, my, mine, we, our).
+- Avoid phrases like "I'd," "I'll," "I'm," and "I've."
+- Use passive voice sparingly. Active voice is generally clearer.
+
+## Inclusive Language
+- Use considerate language that respects all readers.
+- Use "they" as a singular pronoun instead of "he/she" or "s/he."
+- Avoid terms that might be insensitive:
+ - Replace "sane" with "correct," "adequate," "sensible," etc.
+ - Replace "crazy/insane" with "extremely," "very," "illogical," etc.
+ - Replace "dummy" with "placeholder" or "test"
+ - Replace "hang" with "freeze" or "become unresponsive"
+ - Avoid phrases like "fall on deaf ears" or "blind spot"
+
+## Tone
+- Don't assume success with statements like "congratulations," "that's it," or "you did it."
+- Avoid condescending terms like "obvious," "obviously," "simple," "simply," "easy," "easily," "of course," "clearly," or "everyone knows."
+- Don't add "-ly" to ordinal numbers (avoid "firstly," "secondly," etc.)
+
+## Clarity and Brevity
+- Use simple words instead of complex ones.
+- Avoid foreign phrases like "i.e.," "viz.," or "ergo."
+- Eliminate wordiness and redundant phrases:
+ - "in order to" → "to"
+ - "due to the fact that" → "because"
+ - "at this point in time" → "now"
+ - "in the event that" → "if"
+- Remove phrases that don't add clarity:
+ - "note that"
+ - "it is important to note"
+ - "keep in mind"
+ - "as you can see"
+
+Remember that clear, direct language helps readers understand your content more easily.
diff --git a/styles/Flanksource/Acronyms.yml b/styles/Flanksource/Acronyms.yml
index 150db768..2c85cb77 100644
--- a/styles/Flanksource/Acronyms.yml
+++ b/styles/Flanksource/Acronyms.yml
@@ -19,25 +19,24 @@ exceptions:
- CRUD
- CSS
- CSV
- - RDS
- - SQS
- - Subnet
- - IAMRole
- - ECSTask
- - ECSCluster
- - EBSVolume
- - EBSVolume
- - DNSZone
+ - DEBUG
- DHCP
- - ECS
- DNS
- - EKS
- - IAM
- - IRSA
- - VPC
- - DEBUG
+ - DNSZone
- DOM
- DPI
+ - EBS
+ - EBSVolume
+ - ECR
+ - ECS
+ - ECSCluster
+ - ECSService
+ - ECSTask
+ - EFS
+ - EKS
+ - EKSCluster
- FAQ
- GCC
- GDB
@@ -48,7 +47,11 @@ exceptions:
- HTML
- HTTP
- HTTPS
+ - IAM
+ - IAMRole
+ - IAMUser
- IDE
+ - IRSA
- JAR
- JPG
- JSON
@@ -65,14 +68,21 @@ exceptions:
- POST
- RAM
- RBAC
+ - RDS
+ - RDSInstance
- REPL
- RSA
- SCM
- SCSS
- SDK
+ - SNS
+ - SNSTopic
- SQL
+ - SQS
+ - SQSQueue
- SSH
- SSL
+ - Subnet
- SVG
- TBD
- TCP
@@ -82,6 +92,7 @@ exceptions:
- USB
- UTF
- UUID
+ - VPC
- XML
- XSS
- YAML
diff --git a/styles/Flanksource/CaseSensitiveSpellingSuggestions.yml b/styles/Flanksource/CaseSensitiveSpellingSuggestions.yml
index 61391c4e..f6995126 100644
--- a/styles/Flanksource/CaseSensitiveSpellingSuggestions.yml
+++ b/styles/Flanksource/CaseSensitiveSpellingSuggestions.yml
@@ -5,10 +5,4 @@ level: error
action:
name: replace
swap:
- "[Ii]ngress.NGINX": Ingress-NGINX
- ".ngress.[Nn]ginx": Ingress-NGINX
- "[Nn]ginx [Ii]ngress [Cc]ontroller": NGINX Ingress Controller
- "NGINX ingress [Cc]ontroller": NGINX Ingress Controller
- "NGINX [Ii]ngress controller": NGINX Ingress Controller
- "(?