From caba62bd6448ffa6ace4408b7ccf8b2997579f43 Mon Sep 17 00:00:00 2001 From: Gabriel Cocenza Date: Wed, 27 Nov 2024 17:07:18 -0300 Subject: [PATCH 1/2] Fix rejection ratio alert OpenSearch does not have a thread pool named "bulk". However, there is a thread pool named "write" that is used by both bulk requests and single-document writes. This PR adds the necessary logic to use the OpenSearch data coming from the exporter and, because the expression is big, splits it into smaller expressions using recording rules. For the original rule that inspired this alert, see this [repo](https://github.com/lukas-vlcek/prometheus-elasticsearch-rules/blob/master/logging_elasticsearch.rules.yaml) Fix: #503 --- .../prometheus/prometheus_alerts.yaml | 21 +++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/src/alert_rules/prometheus/prometheus_alerts.yaml b/src/alert_rules/prometheus/prometheus_alerts.yaml index 600441cfd..695022654 100644 --- a/src/alert_rules/prometheus/prometheus_alerts.yaml +++ b/src/alert_rules/prometheus/prometheus_alerts.yaml @@ -2,6 +2,19 @@ - "name": "opensearch.alerts" "rules": + # Write requests rates + # ===================== + - record: write:rejected_requests:rate2m + expr: rate(opensearch_threadpool_threads_count{name="write", type="rejected"}[2m]) + + - record: write:completed_requests:rate2m + expr: rate(opensearch_threadpool_threads_count{name="write", type="completed"}[2m]) + + # If there are no write requests at all then we can get 0/0, which is NaN. This does not affect the + # OpenSearchWriteRequestsRejectionJumps alert + - record: write:reject_ratio:rate2m + expr: sum by (cluster, instance, node) (write:rejected_requests:rate2m) / on (cluster, instance, node) (write:completed_requests:rate2m) + - "alert": "OpenSearchScrapeFailed" "annotations": "message": "Scrape on {{ $labels.juju_unit }} failed. Ensure that the OpenSearch systemd service is healthy and that the unit is part of the cluster."
@@ -32,12 +45,12 @@ "labels": "severity": "warning" - - "alert": "OpenSearchBulkRequestsRejectionJumps" + - "alert": "OpenSearchWriteRequestsRejectionJumps" "annotations": - "message": "High Bulk Rejection Ratio at {{ $labels.node }} node in {{ $labels.cluster }} cluster. This node may not be keeping up with the indexing speed." - "summary": "High Bulk Rejection Ratio - {{ $value }}%" + "message": "High Write Rejection Ratio at {{ $labels.node }} node in {{ $labels.cluster }} cluster. This node may not be keeping up with the indexing speed." + "summary": "High Write Rejection Ratio - {{ $value }}%" "expr": | - round( bulk:reject_ratio:rate2m * 100, 0.001 ) > 5 + round( write:reject_ratio:rate2m * 100, 0.001 ) > 5 "for": "10m" "labels": "severity": "warning" From 433be1e396a04baae93a5904d42bbe5dccbc20a2 Mon Sep 17 00:00:00 2001 From: Gabriel Cocenza Date: Fri, 29 Nov 2024 10:32:09 -0300 Subject: [PATCH 2/2] Fix denominator to include all write operations on the write thread pool --- src/alert_rules/prometheus/prometheus_alerts.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/alert_rules/prometheus/prometheus_alerts.yaml b/src/alert_rules/prometheus/prometheus_alerts.yaml index 695022654..b51ead8af 100644 --- a/src/alert_rules/prometheus/prometheus_alerts.yaml +++ b/src/alert_rules/prometheus/prometheus_alerts.yaml @@ -5,15 +5,15 @@ # Write requests rates # ===================== - record: write:rejected_requests:rate2m - expr: rate(opensearch_threadpool_threads_count{name="write", type="rejected"}[2m]) + expr: sum by (cluster, instance, node) (rate(opensearch_threadpool_threads_count{name="write", type="rejected"}[2m])) - - record: write:completed_requests:rate2m - expr: rate(opensearch_threadpool_threads_count{name="write", type="completed"}[2m]) + - record: write:total_requests:rate2m + expr: sum by (cluster, instance, node) (rate(opensearch_threadpool_threads_count{name="write"}[2m])) # If there are no write requests at all then we can get 0/0,
which is NaN. This does not affect the # OpenSearchWriteRequestsRejectionJumps alert - record: write:reject_ratio:rate2m - expr: sum by (cluster, instance, node) (write:rejected_requests:rate2m) / on (cluster, instance, node) (write:completed_requests:rate2m) + expr: write:rejected_requests:rate2m / write:total_requests:rate2m - "alert": "OpenSearchScrapeFailed" "annotations":