-
Notifications
You must be signed in to change notification settings - Fork 102
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Krkn lib prometheus client + kube_burner references removed
Signed-off-by: Tullio Sebastiani <[email protected]>
- Loading branch information
1 parent
93f1f19
commit f2d7f88
Showing
19 changed files
with
154 additions
and
275 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,90 @@ | ||
# etcd | ||
|
||
- expr: avg_over_time(histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[2m]))[10m:]) > 0.01 | ||
description: 10 minutes avg. 99th etcd fsync latency on {{$labels.pod}} higher than 10ms. {{$value}}s | ||
severity: warning | ||
|
||
- expr: avg_over_time(histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[2m]))[10m:]) > 1 | ||
description: 10 minutes avg. 99th etcd fsync latency on {{$labels.pod}} higher than 1s. {{$value}}s | ||
severity: error | ||
|
||
- expr: avg_over_time(histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[2m]))[10m:]) > 0.007 | ||
description: 10 minutes avg. 99th etcd commit latency on {{$labels.pod}} higher than 7ms. {{$value}}s | ||
severity: warning | ||
|
||
- expr: rate(etcd_server_leader_changes_seen_total[2m]) > 0 | ||
description: etcd leader changes observed | ||
severity: warning | ||
|
||
- expr: (last_over_time(etcd_mvcc_db_total_size_in_bytes[5m]) / last_over_time(etcd_server_quota_backend_bytes[5m]))*100 > 95 | ||
description: etcd cluster database is running full. | ||
severity: critical | ||
|
||
- expr: (last_over_time(etcd_mvcc_db_total_size_in_use_in_bytes[5m]) / last_over_time(etcd_mvcc_db_total_size_in_bytes[5m])) < 0.5 | ||
description: etcd database size in use is less than 50% of the actual allocated storage. | ||
severity: warning | ||
|
||
- expr: rate(etcd_server_proposals_failed_total{job=~".*etcd.*"}[15m]) > 5 | ||
description: etcd cluster has high number of proposal failures. | ||
severity: warning | ||
|
||
- expr: histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket{job=~".*etcd.*"}[5m])) > 0.15 | ||
description: etcd cluster member communication is slow. | ||
severity: warning | ||
|
||
- expr: histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job=~".*etcd.*", grpc_method!="Defragment", grpc_type="unary"}[5m])) without(grpc_type)) > 0.15 | ||
description: etcd grpc requests are slow. | ||
severity: critical | ||
|
||
- expr: 100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code=~"Unknown|FailedPrecondition|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded"}[5m])) without (grpc_type, grpc_code) / sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) without (grpc_type, grpc_code) > 5 | ||
description: etcd cluster has high number of failed grpc requests. | ||
severity: critical | ||
|
||
- expr: etcd_server_has_leader{job=~".*etcd.*"} == 0 | ||
description: etcd cluster has no leader. | ||
severity: warning | ||
|
||
- expr: sum(up{job=~".*etcd.*"} == bool 1) without (instance) < ((count(up{job=~".*etcd.*"}) without (instance) + 1) / 2) | ||
description: etcd cluster has insufficient number of members. | ||
severity: warning | ||
|
||
- expr: max without (endpoint) ( sum without (instance) (up{job=~".*etcd.*"} == bool 0) or count without (To) ( sum without (instance) (rate(etcd_network_peer_sent_failures_total{job=~".*etcd.*"}[120s])) > 0.01 )) > 0 | ||
description: etcd cluster members are down. | ||
severity: warning | ||
|
||
# API server | ||
- expr: avg_over_time(histogram_quantile(0.99, sum(irate(apiserver_request_duration_seconds_bucket{apiserver="kube-apiserver", verb=~"POST|PUT|DELETE|PATCH", subresource!~"log|exec|portforward|attach|proxy"}[2m])) by (le, resource, verb))[10m:]) > 1 | ||
description: 10 minutes avg. 99th mutating API call latency for {{$labels.verb}}/{{$labels.resource}} higher than 1 second. {{$value}}s | ||
severity: error | ||
|
||
- expr: avg_over_time(histogram_quantile(0.99, sum(irate(apiserver_request_duration_seconds_bucket{apiserver="kube-apiserver", verb=~"LIST|GET", subresource!~"log|exec|portforward|attach|proxy", scope="resource"}[2m])) by (le, resource, verb, scope))[5m:]) > 1 | ||
description: 5 minutes avg. 99th read-only API call latency for {{$labels.verb}}/{{$labels.resource}} in scope {{$labels.scope}} higher than 1 second. {{$value}}s | ||
severity: error | ||
|
||
- expr: avg_over_time(histogram_quantile(0.99, sum(irate(apiserver_request_duration_seconds_bucket{apiserver="kube-apiserver", verb=~"LIST|GET", subresource!~"log|exec|portforward|attach|proxy", scope="namespace"}[2m])) by (le, resource, verb, scope))[5m:]) > 5 | ||
description: 5 minutes avg. 99th read-only API call latency for {{$labels.verb}}/{{$labels.resource}} in scope {{$labels.scope}} higher than 5 seconds. {{$value}}s | ||
severity: error | ||
|
||
- expr: avg_over_time(histogram_quantile(0.99, sum(irate(apiserver_request_duration_seconds_bucket{apiserver="kube-apiserver", verb=~"LIST|GET", subresource!~"log|exec|portforward|attach|proxy", scope="cluster"}[2m])) by (le, resource, verb, scope))[5m:]) > 30 | ||
description: 5 minutes avg. 99th read-only API call latency for {{$labels.verb}}/{{$labels.resource}} in scope {{$labels.scope}} higher than 30 seconds. {{$value}}s | ||
severity: error | ||
|
||
# Control plane pods | ||
|
||
- expr: up{job=~"crio|kubelet"} == 0 | ||
description: "{{$labels.node}}/{{$labels.job}} down" | ||
severity: warning | ||
|
||
- expr: up{job="ovnkube-node"} == 0 | ||
description: "{{$labels.instance}}/{{$labels.pod}} {{$labels.job}} down" | ||
severity: warning | ||
|
||
# Service sync latency | ||
- expr: histogram_quantile(0.99, sum(rate(kubeproxy_network_programming_duration_seconds_bucket[2m])) by (le)) > 10 | ||
description: 99th Kubeproxy network programming latency higher than 10 seconds. {{$value}}s | ||
severity: warning | ||
|
||
# Prometheus alerts | ||
- expr: ALERTS{severity="critical", alertstate="firing"} > 0 | ||
description: Critical prometheus alert. {{$labels.alertname}} | ||
severity: warning |
File renamed without changes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.