From 8a5d38cc2c37b9c67eab79e9c48c3414aa9a0620 Mon Sep 17 00:00:00 2001
From: Matt Wise <768067+diranged@users.noreply.github.com>
Date: Mon, 27 May 2024 18:27:43 -0700
Subject: [PATCH] chore(prometheus-alerts): allow opting out of the validity
 selector alerts (#307)

There are some cases where we know the selector is fine (i.e., `.*`) and
yet there will not ALWAYS be resources that match that selector - an
example would be any `Job` that is created as part of a release process.
Jobs may come and go, and there may be periods of time when a `Job` does
not exist, so the `kube_job_info{}` metric won't exist. In these cases,
we need an escape hatch that lets the critical alarms exist without
alerting on-call engineers just because a `Job` is _missing_.

Co-authored-by: Matt Wise
---
 charts/prometheus-alerts/Chart.yaml           |  2 +-
 charts/prometheus-alerts/README.md            | 14 +++++++-------
 .../templates/containers-prometheusrule.yaml  | 16 ++++++++++++++--
 charts/prometheus-alerts/values.yaml          |  6 ++++++
 4 files changed, 28 insertions(+), 10 deletions(-)

diff --git a/charts/prometheus-alerts/Chart.yaml b/charts/prometheus-alerts/Chart.yaml
index 6fa52bb4..0e807225 100644
--- a/charts/prometheus-alerts/Chart.yaml
+++ b/charts/prometheus-alerts/Chart.yaml
@@ -2,7 +2,7 @@ apiVersion: v2
 name: prometheus-alerts
 description: Helm Chart that provisions a series of common Prometheus Alerts
 type: application
-version: 1.7.3
+version: 1.7.4
 appVersion: 0.0.1
 maintainers:
   - name: diranged
diff --git a/charts/prometheus-alerts/README.md b/charts/prometheus-alerts/README.md
index 340f6047..995ecda7 100644
--- a/charts/prometheus-alerts/README.md
+++ b/charts/prometheus-alerts/README.md
@@ -2,7 +2,7 @@
 
 Helm Chart that provisions a series of common Prometheus Alerts
 
-![Version: 1.7.3](https://img.shields.io/badge/Version-1.7.3-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) ![AppVersion: 0.0.1](https://img.shields.io/badge/AppVersion-0.0.1-informational?style=flat-square)
+![Version: 1.7.4](https://img.shields.io/badge/Version-1.7.4-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) ![AppVersion: 0.0.1](https://img.shields.io/badge/AppVersion-0.0.1-informational?style=flat-square)
 
 [deployments]: https://kubernetes.io/docs/concepts/workloads/controllers/deployment/
 [hpa]: https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/
@@ -101,7 +101,7 @@ This behavior can be tuned via the `defaults.podNameSelector`,
 | alertManager.repeatInterval | string | `"1h"` | How long to wait before sending a notification again if it has already been sent successfully for an alert. (Usually ~3h or more). |
 | chart_name | string | `"prometheus-rules"` | |
 | chart_source | string | `"https://github.com/Nextdoor/k8s-charts"` | |
-| containerRules.daemonsets.DaemonsetSelectorValidity | object | `{"for":"1h","labels":{},"severity":"warning"}` | Does a basic lookup using the defined selectors to see if we can see any info for a given selector. This is the "watcher for the watcher". If we get alerted by this, we likely have a bad selector and our alerts are not going to ever fire. |
+| containerRules.daemonsets.DaemonsetSelectorValidity | object | `{"enabled":true,"for":"1h","labels":{},"severity":"warning"}` | Does a basic lookup using the defined selectors to see if we can see any info for a given selector. This is the "watcher for the watcher". If we get alerted by this, we likely have a bad selector and our alerts are not going to ever fire. |
 | containerRules.daemonsets.KubeDaemonSetMisScheduled.for | string | `"15m"` | |
 | containerRules.daemonsets.KubeDaemonSetMisScheduled.labels | object | `{}` | |
 | containerRules.daemonsets.KubeDaemonSetMisScheduled.severity | string | `"warning"` | |
@@ -112,11 +112,11 @@ This behavior can be tuned via the `defaults.podNameSelector`,
 | containerRules.daemonsets.KubeDaemonSetRolloutStuck.labels | object | `{}` | |
 | containerRules.daemonsets.KubeDaemonSetRolloutStuck.severity | string | `"warning"` | |
 | containerRules.daemonsets.enabled | bool | `true` | Enables the DaemonSet resource rules |
-| containerRules.deployments.DeploymentSelectorValidity | object | `{"for":"1h","labels":{},"severity":"warning"}` | Does a basic lookup using the defined selectors to see if we can see any info for a given selector. This is the "watcher for the watcher". If we get alerted by this, we likely have a bad selector and our alerts are not going to ever fire. |
+| containerRules.deployments.DeploymentSelectorValidity | object | `{"enabled":true,"for":"1h","labels":{},"severity":"warning"}` | Does a basic lookup using the defined selectors to see if we can see any info for a given selector. This is the "watcher for the watcher". If we get alerted by this, we likely have a bad selector and our alerts are not going to ever fire. |
 | containerRules.deployments.KubeDeploymentGenerationMismatch | object | `{"for":"15m","labels":{},"severity":"warning"}` | Deployment generation mismatch due to possible roll-back |
 | containerRules.deployments.enabled | bool | `true` | Enables the Deployment resource rules |
 | containerRules.enabled | bool | `true` | Whether or not to enable the container rules template |
-| containerRules.hpas.HpaSelectorValidity | object | `{"for":"1h","labels":{},"severity":"warning"}` | Does a basic lookup using the defined selectors to see if we can see any info for a given selector. This is the "watcher for the watcher". If we get alerted by this, we likely have a bad selector and our alerts are not going to ever fire. |
+| containerRules.hpas.HpaSelectorValidity | object | `{"enabled":true,"for":"1h","labels":{},"severity":"warning"}` | Does a basic lookup using the defined selectors to see if we can see any info for a given selector. This is the "watcher for the watcher". If we get alerted by this, we likely have a bad selector and our alerts are not going to ever fire. |
 | containerRules.hpas.KubeHpaMaxedOut.for | string | `"15m"` | |
 | containerRules.hpas.KubeHpaMaxedOut.labels | object | `{}` | |
 | containerRules.hpas.KubeHpaMaxedOut.severity | string | `"warning"` | |
@@ -124,7 +124,7 @@ This behavior can be tuned via the `defaults.podNameSelector`,
 | containerRules.hpas.KubeHpaReplicasMismatch.labels | object | `{}` | |
 | containerRules.hpas.KubeHpaReplicasMismatch.severity | string | `"warning"` | |
 | containerRules.hpas.enabled | bool | `true` | Enables the HorizontalPodAutoscaler resource rules |
-| containerRules.jobs.JobSelectorValidity | object | `{"for":"1h","labels":{},"severity":"warning"}` | Does a basic lookup using the defined selectors to see if we can see any info for a given selector. This is the "watcher for the watcher". If we get alerted by this, we likely have a bad selector and our alerts are not going to ever fire. |
+| containerRules.jobs.JobSelectorValidity | object | `{"enabled":true,"for":"1h","labels":{},"severity":"warning"}` | Does a basic lookup using the defined selectors to see if we can see any info for a given selector. This is the "watcher for the watcher". If we get alerted by this, we likely have a bad selector and our alerts are not going to ever fire. |
 | containerRules.jobs.KubeJobCompletion.for | string | `"12h"` | |
 | containerRules.jobs.KubeJobCompletion.labels | object | `{}` | |
 | containerRules.jobs.KubeJobCompletion.severity | string | `"warning"` | |
@@ -140,7 +140,7 @@ This behavior can be tuned via the `defaults.podNameSelector`,
 | containerRules.pods.PodContainerTerminated | object | `{"for":"1m","labels":{},"over":"10m","reasons":["ContainerCannotRun","DeadlineExceeded"],"severity":"warning","threshold":0}` | Monitors Pods for Containers that are terminated either for unexpected reasons like ContainerCannotRun. If that number breaches the $threshold (1) for $for (1m), then it will alert. |
 | containerRules.pods.PodCrashLoopBackOff | object | `{"for":"10m","labels":{},"severity":"warning"}` | Pod is in a CrashLoopBackOff state and is not becoming healthy. |
 | containerRules.pods.PodNotReady | object | `{"for":"15m","labels":{},"severity":"warning"}` | Pod has been in a non-ready state for more than a specific threshold |
-| containerRules.pods.PodSelectorValidity | object | `{"for":"1h","labels":{},"severity":"warning"}` | Does a basic lookup using the defined selectors to see if we can see any info for a given selector. This is the "watcher for the watcher". If we get alerted by this, we likely have a bad selector and our alerts are not going to ever fire. |
+| containerRules.pods.PodSelectorValidity | object | `{"enabled":true,"for":"1h","labels":{},"severity":"warning"}` | Does a basic lookup using the defined selectors to see if we can see any info for a given selector. This is the "watcher for the watcher". If we get alerted by this, we likely have a bad selector and our alerts are not going to ever fire. |
 | containerRules.pods.enabled | bool | `true` | Enables the Pod resource rules |
 | containerRules.statefulsets.KubeStatefulSetGenerationMismatch.for | string | `"15m"` | |
 | containerRules.statefulsets.KubeStatefulSetGenerationMismatch.labels | object | `{}` | |
@@ -151,7 +151,7 @@ This behavior can be tuned via the `defaults.podNameSelector`,
 | containerRules.statefulsets.KubeStatefulSetUpdateNotRolledOut.for | string | `"15m"` | |
 | containerRules.statefulsets.KubeStatefulSetUpdateNotRolledOut.labels | object | `{}` | |
 | containerRules.statefulsets.KubeStatefulSetUpdateNotRolledOut.severity | string | `"warning"` | |
-| containerRules.statefulsets.StatefulsetSelectorValidity | object | `{"for":"1h","labels":{},"severity":"warning"}` | Does a basic lookup using the defined selectors to see if we can see any info for a given selector. This is the "watcher for the watcher". If we get alerted by this, we likely have a bad selector and our alerts are not going to ever fire. |
+| containerRules.statefulsets.StatefulsetSelectorValidity | object | `{"enabled":true,"for":"1h","labels":{},"severity":"warning"}` | Does a basic lookup using the defined selectors to see if we can see any info for a given selector. This is the "watcher for the watcher". If we get alerted by this, we likely have a bad selector and our alerts are not going to ever fire. |
 | containerRules.statefulsets.enabled | bool | `true` | Enables the StatefulSet resource rules |
 | defaults.additionalRuleLabels | `map` | `{}` | Additional custom labels attached to every PrometheusRule |
 | defaults.daemonsetNameSelector | `string` | `".*"` | Pattern used to scope down the DaemonSet alerts to pods that are part of this general application. Set to `None` if you want to disable this selector and apply the rules to all the DaemonSets in the namespace. This string is run through the `tpl` function. |
diff --git a/charts/prometheus-alerts/templates/containers-prometheusrule.yaml b/charts/prometheus-alerts/templates/containers-prometheusrule.yaml
index df3dd786..0888a883 100644
--- a/charts/prometheus-alerts/templates/containers-prometheusrule.yaml
+++ b/charts/prometheus-alerts/templates/containers-prometheusrule.yaml
@@ -219,6 +219,7 @@ spec:
       {{- end }}

       {{ with .PodSelectorValidity -}}
+      {{- if .enabled }}
       - alert: PodSelectorValidity
         annotations:
           summary: PodSelector for prometheus-alerts is invalid
@@ -249,6 +250,7 @@ spec:
           {{ toYaml . | nindent 8 }}
           {{- end }}
         {{- end }}
+      {{- end }}
       {{- end }}
       {{- end }}

@@ -281,7 +283,8 @@ spec:
       {{- end }}
       {{- end }}

-      {{ with .DeploymentSelectorValidity -}}
+      {{- with .DeploymentSelectorValidity -}}
+      {{- if .enabled }}
       - alert: DeploymentSelectorValidity
         annotations:
           summary: DeploymentSelector for prometheus-alerts is invalid
@@ -311,6 +314,7 @@ spec:
           {{ toYaml . | nindent 8 }}
           {{- end }}
         {{- end }}
+      {{- end }}
       {{- end }}
       {{- end }}

@@ -410,7 +414,8 @@ spec:
       {{- end }}
       {{- end }}

-      {{ with .StatefulsetSelectorValidity -}}
+      {{- with .StatefulsetSelectorValidity -}}
+      {{- if .enabled }}
       - alert: StatefulsetSelectorValidity
         annotations:
           summary: StatefulsetSelector for prometheus-alerts is invalid
@@ -440,6 +445,7 @@ spec:
           {{ toYaml . | nindent 8 }}
           {{- end }}
         {{- end }}
+      {{- end }}
       {{- end }}
       {{- end }}

@@ -537,6 +543,7 @@ spec:
       {{- end }}

       {{ with .DaemonsetSelectorValidity -}}
+      {{- if .enabled }}
       - alert: DaemonsetSelectorValidity
         annotations:
           summary: DaemonsetSelector for prometheus-alerts is invalid
@@ -566,6 +573,7 @@ spec:
           {{ toYaml . | nindent 8 }}
           {{- end }}
         {{- end }}
+      {{- end }}
       {{- end }}
       {{- end }}

@@ -619,6 +627,7 @@ spec:
       {{- end }}

       {{ with .JobSelectorValidity -}}
+      {{- if .enabled }}
       - alert: JobSelectorValidity
         annotations:
           summary: JobSelector for prometheus-alerts is invalid
@@ -648,6 +657,7 @@ spec:
           {{ toYaml . | nindent 8 }}
           {{- end }}
         {{- end }}
+      {{- end }}
       {{- end }}
       {{- end }}

@@ -717,6 +727,7 @@ spec:
       {{- end }}

       {{ with .HpaSelectorValidity -}}
+      {{- if .enabled }}
       - alert: HpaSelectorValidity
         annotations:
           summary: HpaSelector for prometheus-alerts is invalid
@@ -746,6 +757,7 @@ spec:
           {{ toYaml . | nindent 8 }}
           {{- end }}
         {{- end }}
+      {{- end }}
       {{- end }}
       {{- end }}

diff --git a/charts/prometheus-alerts/values.yaml b/charts/prometheus-alerts/values.yaml
index df7ef993..cf2d3754 100644
--- a/charts/prometheus-alerts/values.yaml
+++ b/charts/prometheus-alerts/values.yaml
@@ -138,6 +138,7 @@ containerRules:
     # alerted by this, we likely have a bad selector and our alerts are not going
     # to ever fire.
     PodSelectorValidity:
+      enabled: true
       severity: warning
       for: 1h
       labels: {}
@@ -202,6 +203,7 @@ containerRules:
     # alerted by this, we likely have a bad selector and our alerts are not going
     # to ever fire.
     DeploymentSelectorValidity:
+      enabled: true
       severity: warning
       for: 1h
       labels: {}
@@ -221,6 +223,7 @@ containerRules:
     # alerted by this, we likely have a bad selector and our alerts are not going
     # to ever fire.
     StatefulsetSelectorValidity:
+      enabled: true
       severity: warning
       for: 1h
       labels: {}
@@ -252,6 +255,7 @@ containerRules:
     # alerted by this, we likely have a bad selector and our alerts are not going
     # to ever fire.
     DaemonsetSelectorValidity:
+      enabled: true
       severity: warning
       for: 1h
       labels: {}
@@ -283,6 +287,7 @@ containerRules:
     # alerted by this, we likely have a bad selector and our alerts are not going
     # to ever fire.
     JobSelectorValidity:
+      enabled: true
       severity: warning
       for: 1h
       labels: {}
@@ -308,6 +313,7 @@ containerRules:
     # alerted by this, we likely have a bad selector and our alerts are not going
     # to ever fire.
     HpaSelectorValidity:
+      enabled: true
       severity: warning
       for: 1h
       labels: {}
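
For downstream users of the chart: a minimal values override (illustrative only, not part of this patch) that uses the new escape hatch to opt a release out of the Job selector-validity alert would look something like this:

# Hypothetical values override: keep the other validity alerts, but stop
# paging on-call when no Job currently matches the (known-good) selector.
containerRules:
  jobs:
    JobSelectorValidity:
      enabled: false

The same `enabled: false` switch is added to the Pod, Deployment, StatefulSet, DaemonSet and HPA variants of the SelectorValidity rules, and all of them default to `enabled: true` so existing installs keep their current behavior.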