From b3de8f498dcf76bf01f8de1592c370e92cc515c8 Mon Sep 17 00:00:00 2001 From: viktoryathegreat Date: Mon, 24 Jul 2023 19:26:22 +0400 Subject: [PATCH] feat(DMVP-2469): Added a feature for having complex alert expressions. --- README.md | 40 ++++--------------- modules/alerts/README.md | 8 +++- modules/alerts/main.tf | 10 +++-- modules/alerts/tests/expressions/0-setup.tf | 15 +++++++ modules/alerts/tests/expressions/1-example.tf | 26 ++++++++++++ modules/alerts/tests/expressions/2-assert.tf | 9 +++++ modules/alerts/tests/expressions/README.md | 33 +++++++++++++++ modules/alerts/variables.tf | 5 ++- 8 files changed, 107 insertions(+), 39 deletions(-) create mode 100644 modules/alerts/tests/expressions/0-setup.tf create mode 100644 modules/alerts/tests/expressions/1-example.tf create mode 100644 modules/alerts/tests/expressions/2-assert.tf create mode 100644 modules/alerts/tests/expressions/README.md diff --git a/README.md b/README.md index dc00bd3..ba1deff 100644 --- a/README.md +++ b/README.md @@ -3,20 +3,10 @@ This module is created to manage OnPremise Grafana stack with Terraform. At this moment we support managing - Grafana Alerts with `alerts` submodule - Grafana Contact Points with `contact-points` submodule +- Grafana Notification Policies with `notifications` submodule More parts are coming soon. -## Tips -1. Alert conditions are formed based on $B blocks and `equation`, `threshold` parameters users pass to the module. -`equation` parameter can only get these values: -- `lt` corresponds to `<` -- `gt` corresponds to `>` -- `e` corresponds to `=` -- `lte` corresponds to `<=` -- `gte` corresponds to `>=` -And `threshold` parameter is the number value against which B blocks are compared in the math expression. -2. We pass `null` value to `filters` variable. It's needed when we use such Prometheus metrics which don't get any filters when querying. 
- ## Example for Alert Rules ``` module "grafana_alerts" { @@ -37,28 +27,14 @@ module "grafana_alerts" { threshold = 1 }, { - name = "App_2 has 0 available replicas" - folder_name = "Replica Count" + name = "Nginx Expressions" + folder_name = "Nginx Expressions Group" datasource = "prometheus" - metric_name = "kube_deployment_status_replicas_available" - filters = { - deployment = "app-2-microservice" - } - function = "last" - equation = "lt" - threshold = 1 - }, - { - name = "Insufficient nodes in cluster" - summary = "Cluster is using fewer nodes than the required count" - folder_name = "Node Autoscaling" - datasource = "prometheus" - filters = null - metric_name = "sum(kube_node_info)" + expr = "sum(rate(nginx_ingress_controller_requests{status=~'5..'}[1m])) by (ingress,cluster) / sum(rate(nginx_ingress_controller_requests[1m]))by (ingress) * 100 > 5" function = "mean" - equation = "lte" - threshold = 2 - } + equation = "gt" + threshold = 2 + }, ] } ``` @@ -185,7 +161,7 @@ module "grafana_alerts" { ``` ## Usage -Check `modules/alerts/tests`, `modules/contact-points/test` and `modules/notifications/test` folders to see more examples. +Check `modules/alerts/tests`, `modules/contact-points/tests` and `modules/notifications/tests` folders to see more examples. ## Requirements diff --git a/modules/alerts/README.md b/modules/alerts/README.md index 7ceb296..af816c4 100644 --- a/modules/alerts/README.md +++ b/modules/alerts/README.md @@ -1,7 +1,11 @@ ## Usage To enable some of these alerts for your applications, you just need to replace `App_1`, `App_2` and `App_3` with the actual names of your applications. You can refer to the Prometheus metrics to identify the available filters that can be used for each application. Additionally, modify the values in the conditions to reflect the real cases of your applications. These adjustments will ensure that the alerts accurately monitor your specific applications and their scaling needs. 
-## Tips +## Alert Expressions +Alert expressions are formed based on `metric_name`, `metric_function`, `metric_interval`, and `filters` parameters. They form alert expressions like: `kube_deployment_status_replicas_available{deployment=\"nginx\"}`, `rate(kube_pod_container_status_restarts_total{container=\"nginx\"}[5m])`, but sometimes we need to have more complex queries like this one: `sum(rate(nginx_ingress_controller_requests{status=~'5..'}[1m])) by (ingress,cluster) / sum(rate(nginx_ingress_controller_requests[1m])) by (ingress) * 100 > 5`. +When you want to create simple queries, use the parameters listed above. And when you need to create complex queries, don't pass those parameters; instead, pass the query string to the `expr` variable. Check the `tests/expressions` folder for an example with complex queries. + +## Conditions and Thresholds Alert conditions are formed based on $B blocks and `equation`, `threshold` parameters users pass to the module. `equation` parameter can only get these values: - `lt` corresponds to `<` @@ -45,7 +49,7 @@ No modules. | Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| | [alert\_interval\_seconds](#input\_alert\_interval\_seconds) | The interval, in seconds, at which all rules in the group are evaluated. If a group contains many rules, the rules are evaluated sequentially. | `number` | `10` | no | -| [alert\_rules](#input\_alert\_rules) | This varibale describes alert folders, groups and rules. |
list(object({
name = string # The name of the alert rule
summary = optional(string, "") # Rule annotation as a summary
priority = optional(string, "P2") # Rule priority level: P2 is for non-critical alerts, P1 will be set for critical alerts
folder_name = optional(string, "Main Alerts") # Grafana folder name in which the rule will be created
datasource = string # Name of the datasource used for the alert
metric_name = string # Prometheus metric name which queries the data for the alert
metric_function = optional(string, "") # Prometheus function used with metric for queries, like rate, sum etc.
metric_interval = optional(string, "") # The time interval with using functions like rate
filters = optional(any, {}) # Filters object to identify each service for alerting
function = optional(string, "mean") # One of Reduce functions which will be used in B block for alerting
equation = string # The equation in the math expression which compares B blocks value with a number and generates an alert if needed. Possible values: gt, lt, gte, lte, e
threshold = number # The value against which B blocks are compared in the math expression
}))
| `[]` | no | +| [alert\_rules](#input\_alert\_rules) | This variable describes alert folders, groups and rules. |
list(object({
name = string # The name of the alert rule
no_data_state = optional(string, "NoData") # Describes what state to enter when the rule's query returns No Data
exec_err_state = optional(string, "Error") # Describes what state to enter when the rule's query is invalid and the rule cannot be executed
summary = optional(string, "") # Rule annotation as a summary
priority = optional(string, "P2") # Rule priority level: P2 is for non-critical alerts, P1 will be set for critical alerts
folder_name = optional(string, "Main Alerts") # Grafana folder name in which the rule will be created
datasource = string # Name of the datasource used for the alert
expr = optional(string, null) # Full expression for the alert
metric_name = optional(string, "") # Prometheus metric name which queries the data for the alert
metric_function = optional(string, "") # Prometheus function used with metric for queries, like rate, sum etc.
metric_interval = optional(string, "") # The time interval with using functions like rate
filters = optional(any, {}) # Filters object to identify each service for alerting
function = optional(string, "mean") # One of Reduce functions which will be used in B block for alerting
equation = string # The equation in the math expression which compares B blocks value with a number and generates an alert if needed. Possible values: gt, lt, gte, lte, e
threshold = number # The value against which B blocks are compared in the math expression
}))
| `[]` | no | ## Outputs diff --git a/modules/alerts/main.tf b/modules/alerts/main.tf index 6211586..90e3fad 100644 --- a/modules/alerts/main.tf +++ b/modules/alerts/main.tf @@ -1,6 +1,8 @@ locals { folders = toset(distinct([for rule in var.alert_rules : rule.folder_name])) - alerts = { for member in local.folders : member => [for rule in var.alert_rules : rule if rule.folder_name == member] } + alerts = { for member in local.folders : member => [for rule in var.alert_rules : merge(rule, { + expr : coalesce(rule.expr, "${rule.metric_function}(${rule.metric_name}${(rule.filters != null && length(rule.filters) > 0) ? format("{%s}", replace(join(", ", [for k, v in rule.filters : "${k}=\"${v}\""]), "\"", "\\\"")) : ""}${rule.metric_interval})") + }) if rule.folder_name == member] } comparison_operators = { gte : ">=", gt : ">", @@ -28,8 +30,8 @@ resource "grafana_rule_group" "alert_rule" { name = rule.value["name"] for = "0" condition = "C" - no_data_state = "NoData" - exec_err_state = "Error" + no_data_state = lookup(rule.value, "no_data_state", "NoData") + exec_err_state = lookup(rule.value, "exec_err_state", "Error") annotations = { "Managed By" = "Terraform" "Summary" = lookup(rule.value, "summary", rule.value.name) @@ -49,7 +51,7 @@ resource "grafana_rule_group" "alert_rule" { model = < 0) ? 
format("{%s}", replace(join(", ", [for k, v in rule.value.filters : "${k}=\"${v}\""]), "\"", "\\\"")) : ""}${rule.value.metric_interval})", + "expr": "${rule.value.expr}", "hide": false, "intervalMs": "1000", "legendFormat": "__auto", diff --git a/modules/alerts/tests/expressions/0-setup.tf b/modules/alerts/tests/expressions/0-setup.tf new file mode 100644 index 0000000..0cc3f74 --- /dev/null +++ b/modules/alerts/tests/expressions/0-setup.tf @@ -0,0 +1,15 @@ +terraform { + required_providers { + test = { + source = "terraform.io/builtin/test" + } + grafana = { + source = "grafana/grafana" + } + } +} + +provider "grafana" { + url = "https://grafana.example.com/" + auth = "glsa_xxxxxxxxxxxxxx" +} diff --git a/modules/alerts/tests/expressions/1-example.tf b/modules/alerts/tests/expressions/1-example.tf new file mode 100644 index 0000000..ef35cb9 --- /dev/null +++ b/modules/alerts/tests/expressions/1-example.tf @@ -0,0 +1,26 @@ +module "this" { + source = "../../" + + alert_rules = [ + { + name = "Nginx Expressions" + folder_name = "Nginx Expressions Group" + datasource = "prometheus" + expr = "sum(rate(nginx_ingress_controller_requests{status=~'5..'}[1m])) by (ingress,cluster) / sum(rate(nginx_ingress_controller_requests[1m]))by (ingress) * 100 > 5" + function = "mean" + equation = "gt" + threshold = 2 + }, + { + name = "Nginx Expressions" + folder_name = "Nginx Expressions Group" + no_data_state = "OK" + exec_err_state = "OK" + datasource = "prometheus" + expr = "(sum(rate(nginx_ingress_controller_requests{status=~'4..'}[1m])) by (ingress) / sum(rate(nginx_ingress_controller_requests[1m])) by (ingress)) * 100 > 5" + function = "mean" + equation = "gt" + threshold = 2 + } + ] +} diff --git a/modules/alerts/tests/expressions/2-assert.tf b/modules/alerts/tests/expressions/2-assert.tf new file mode 100644 index 0000000..302130e --- /dev/null +++ b/modules/alerts/tests/expressions/2-assert.tf @@ -0,0 +1,9 @@ +resource "test_assertions" "dummy" { + component = 
"grafana-modules-alerts" + + equal "scheme" { + description = "As module does not have any output and data just make sure the case runs. Probably can be thrown away." + got = "all good" + want = "all good" + } +} diff --git a/modules/alerts/tests/expressions/README.md b/modules/alerts/tests/expressions/README.md new file mode 100644 index 0000000..0e67cd3 --- /dev/null +++ b/modules/alerts/tests/expressions/README.md @@ -0,0 +1,33 @@ +# expressions + + +## Requirements + +No requirements. + +## Providers + +| Name | Version | +|------|---------| +| [test](#provider\_test) | n/a | + +## Modules + +| Name | Source | Version | +|------|--------|---------| +| [this](#module\_this) | ../../ | n/a | + +## Resources + +| Name | Type | +|------|------| +| test_assertions.dummy | resource | + +## Inputs + +No inputs. + +## Outputs + +No outputs. + diff --git a/modules/alerts/variables.tf b/modules/alerts/variables.tf index 2f6ff64..b72bf51 100644 --- a/modules/alerts/variables.tf +++ b/modules/alerts/variables.tf @@ -7,11 +7,14 @@ variable "alert_interval_seconds" { variable "alert_rules" { type = list(object({ name = string # The name of the alert rule + no_data_state = optional(string, "NoData") # Describes what state to enter when the rule's query returns No Data + exec_err_state = optional(string, "Error") # Describes what state to enter when the rule's query is invalid and the rule cannot be executed summary = optional(string, "") # Rule annotation as a summary priority = optional(string, "P2") # Rule priority level: P2 is for non-critical alerts, P1 will be set for critical alerts folder_name = optional(string, "Main Alerts") # Grafana folder name in which the rule will be created datasource = string # Name of the datasource used for the alert - metric_name = string # Prometheus metric name which queries the data for the alert + expr = optional(string, null) # Full expression for the alert + metric_name = optional(string, "") # Prometheus metric name which queries the 
data for the alert metric_function = optional(string, "") # Prometheus function used with metric for queries, like rate, sum etc. metric_interval = optional(string, "") # The time interval with using functions like rate filters = optional(any, {}) # Filters object to identify each service for alerting