From f2d7f88cb8439d9cda7dc5a4d5b6066e3b8a844d Mon Sep 17 00:00:00 2001 From: Tullio Sebastiani Date: Tue, 9 Jan 2024 16:28:49 +0100 Subject: [PATCH] Krkn lib prometheus client + kube_burner references removed Signed-off-by: Tullio Sebastiani --- CI/config/common_test_config.yaml | 4 +- README.md | 2 +- config/alerts.yaml | 90 ++++++++++++++++++ config/{alerts => alerts_openshift.yaml} | 0 config/config.yaml | 7 +- config/config_kind.yaml | 4 +- config/config_kubernetes.yaml | 4 +- config/config_performance.yaml | 4 +- config/kube_burner.yaml | 15 --- docs/SLOs_validation.md | 7 +- docs/metrics.md | 24 +---- kraken/kube_burner/__init__.py | 0 kraken/kube_burner/client.py | 116 ----------------------- kraken/prometheus/__init__.py | 1 + kraken/prometheus/client.py | 61 ++++-------- requirements.txt | 3 +- run_kraken.py | 69 ++++---------- scenarios/arcaflow/cpu-hog/config.yaml | 6 +- scenarios/arcaflow/cpu-hog/input.yaml | 12 +-- 19 files changed, 154 insertions(+), 275 deletions(-) create mode 100644 config/alerts.yaml rename config/{alerts => alerts_openshift.yaml} (100%) delete mode 100644 config/kube_burner.yaml delete mode 100644 kraken/kube_burner/__init__.py delete mode 100644 kraken/kube_burner/client.py diff --git a/CI/config/common_test_config.yaml b/CI/config/common_test_config.yaml index c36a6d59..c5e758eb 100644 --- a/CI/config/common_test_config.yaml +++ b/CI/config/common_test_config.yaml @@ -15,15 +15,13 @@ cerberus: performance_monitoring: deploy_dashboards: False # Install a mutable grafana and load the performance dashboards. Enable this only when running on OpenShift. repo: "https://github.com/cloud-bulldozer/performance-dashboards.git" - kube_burner_binary_url: "https://github.com/cloud-bulldozer/kube-burner/releases/download/v0.9.1/kube-burner-0.9.1-Linux-x86_64.tar.gz" capture_metrics: False - config_path: config/kube_burner.yaml # Define the Elasticsearch url and index name in this config. metrics_profile_path: config/metrics-aggregated.yaml prometheus_url: # The prometheus url/route is automatically obtained in case of OpenShift, please set it when the distribution is Kubernetes. prometheus_bearer_token: # The bearer token is automatically obtained in case of OpenShift, please set it when the distribution is Kubernetes. This is needed to authenticate with prometheus. uuid: # uuid for the run is generated by default if not set. enable_alerts: False # Runs the queries specified in the alert profile and displays the info or exits 1 when severity=error. - alert_profile: config/alerts # Path to alert profile with the prometheus queries. + alert_profile: config/alerts.yaml # Path to alert profile with the prometheus queries. tunings: wait_duration: 6 # Duration to wait between each chaos scenario. diff --git a/README.md b/README.md index 0887b0b1..354a82bf 100644 --- a/README.md +++ b/README.md @@ -80,7 +80,7 @@ Scenario type | Kubernetes | OpenShift It is important to make sure to check if the targeted component recovered from the chaos injection and also if the Kubernetes/OpenShift cluster is healthy as failures in one component can have an adverse impact on other components. Kraken does this by: - Having built in checks for pod and node based scenarios to ensure the expected number of replicas and nodes are up. It also supports running custom scripts with the checks. - Leveraging [Cerberus](https://github.com/openshift-scale/cerberus) to monitor the cluster under test and consuming the aggregated go/no-go signal to determine pass/fail post chaos. 
It is highly recommended to turn on the Cerberus health check feature available in Kraken. Instructions on installing and setting up Cerberus can be found [here](https://github.com/openshift-scale/cerberus#installation) or can be installed from Kraken using the [instructions](https://github.com/redhat-chaos/krkn#setting-up-infrastructure-dependencies). Once Cerberus is up and running, set cerberus_enabled to True and cerberus_url to the url where Cerberus publishes go/no-go signal in the Kraken config file. Cerberus can monitor [application routes](https://github.com/redhat-chaos/cerberus/blob/main/docs/config.md#watch-routes) during the chaos and fails the run if it encounters downtime as it is a potential downtime in a customers, or users environment as well. It is especially important during the control plane chaos scenarios including the API server, Etcd, Ingress etc. It can be enabled by setting `check_applicaton_routes: True` in the [Kraken config](https://github.com/redhat-chaos/krkn/blob/main/config/config.yaml) provided application routes are being monitored in the [cerberus config](https://github.com/redhat-chaos/krkn/blob/main/config/cerberus.yaml). -- Leveraging [kube-burner](docs/alerts.md) alerting feature to fail the runs in case of critical alerts. +- Leveraging built-in alert collection feature to fail the runs in case of critical alerts. ### Signaling In CI runs or any external job it is useful to stop Kraken once a certain test or state gets reached. We created a way to signal to kraken to pause the chaos or stop it completely using a signal posted to a port of your choice. diff --git a/config/alerts.yaml b/config/alerts.yaml new file mode 100644 index 00000000..7dfc912d --- /dev/null +++ b/config/alerts.yaml @@ -0,0 +1,90 @@ +# etcd + +- expr: avg_over_time(histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[2m]))[10m:]) > 0.01 + description: 10 minutes avg. 99th etcd fsync latency on {{$labels.pod}} higher than 10ms. {{$value}}s + severity: warning + +- expr: avg_over_time(histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[2m]))[10m:]) > 1 + description: 10 minutes avg. 99th etcd fsync latency on {{$labels.pod}} higher than 1s. {{$value}}s + severity: error + +- expr: avg_over_time(histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[2m]))[10m:]) > 0.007 + description: 10 minutes avg. 99th etcd commit latency on {{$labels.pod}} higher than 30ms. {{$value}}s + severity: warning + +- expr: rate(etcd_server_leader_changes_seen_total[2m]) > 0 + description: etcd leader changes observed + severity: warning + +- expr: (last_over_time(etcd_mvcc_db_total_size_in_bytes[5m]) / last_over_time(etcd_server_quota_backend_bytes[5m]))*100 > 95 + description: etcd cluster database is running full. + severity: critical + +- expr: (last_over_time(etcd_mvcc_db_total_size_in_use_in_bytes[5m]) / last_over_time(etcd_mvcc_db_total_size_in_bytes[5m])) < 0.5 + description: etcd database size in use is less than 50% of the actual allocated storage. + severity: warning + +- expr: rate(etcd_server_proposals_failed_total{job=~".*etcd.*"}[15m]) > 5 + description: etcd cluster has high number of proposal failures. + severity: warning + +- expr: histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket{job=~".*etcd.*"}[5m])) > 0.15 + description: etcd cluster member communication is slow. 
+ severity: warning + +- expr: histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job=~".*etcd.*", grpc_method!="Defragment", grpc_type="unary"}[5m])) without(grpc_type)) > 0.15 + description: etcd grpc requests are slow. + severity: critical + +- expr: 100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code=~"Unknown|FailedPrecondition|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded"}[5m])) without (grpc_type, grpc_code) / sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) without (grpc_type, grpc_code) > 5 + description: etcd cluster has high number of failed grpc requests. + severity: critical + +- expr: etcd_server_has_leader{job=~".*etcd.*"} == 0 + description: etcd cluster has no leader. + severity: warning + +- expr: sum(up{job=~".*etcd.*"} == bool 1) without (instance) < ((count(up{job=~".*etcd.*"}) without (instance) + 1) / 2) + description: etcd cluster has insufficient number of members. + severity: warning + +- expr: max without (endpoint) ( sum without (instance) (up{job=~".*etcd.*"} == bool 0) or count without (To) ( sum without (instance) (rate(etcd_network_peer_sent_failures_total{job=~".*etcd.*"}[120s])) > 0.01 )) > 0 + description: etcd cluster members are down. + severity: warning + +# API server +- expr: avg_over_time(histogram_quantile(0.99, sum(irate(apiserver_request_duration_seconds_bucket{apiserver="kube-apiserver", verb=~"POST|PUT|DELETE|PATCH", subresource!~"log|exec|portforward|attach|proxy"}[2m])) by (le, resource, verb))[10m:]) > 1 + description: 10 minutes avg. 99th mutating API call latency for {{$labels.verb}}/{{$labels.resource}} higher than 1 second. {{$value}}s + severity: error + +- expr: avg_over_time(histogram_quantile(0.99, sum(irate(apiserver_request_duration_seconds_bucket{apiserver="kube-apiserver", verb=~"LIST|GET", subresource!~"log|exec|portforward|attach|proxy", scope="resource"}[2m])) by (le, resource, verb, scope))[5m:]) > 1 + description: 5 minutes avg. 99th read-only API call latency for {{$labels.verb}}/{{$labels.resource}} in scope {{$labels.scope}} higher than 1 second. {{$value}}s + severity: error + +- expr: avg_over_time(histogram_quantile(0.99, sum(irate(apiserver_request_duration_seconds_bucket{apiserver="kube-apiserver", verb=~"LIST|GET", subresource!~"log|exec|portforward|attach|proxy", scope="namespace"}[2m])) by (le, resource, verb, scope))[5m:]) > 5 + description: 5 minutes avg. 99th read-only API call latency for {{$labels.verb}}/{{$labels.resource}} in scope {{$labels.scope}} higher than 5 seconds. {{$value}}s + severity: error + +- expr: avg_over_time(histogram_quantile(0.99, sum(irate(apiserver_request_duration_seconds_bucket{apiserver="kube-apiserver", verb=~"LIST|GET", subresource!~"log|exec|portforward|attach|proxy", scope="cluster"}[2m])) by (le, resource, verb, scope))[5m:]) > 30 + description: 5 minutes avg. 99th read-only API call latency for {{$labels.verb}}/{{$labels.resource}} in scope {{$labels.scope}} higher than 30 seconds. {{$value}}s + severity: error + +# Control plane pods + +- expr: up{job=~"crio|kubelet"} == 0 + description: "{{$labels.node}}/{{$labels.job}} down" + severity: warning + +- expr: up{job="ovnkube-node"} == 0 + description: "{{$labels.instance}}/{{$labels.pod}} {{$labels.job}} down" + severity: warning + +# Service sync latency +- expr: histogram_quantile(0.99, sum(rate(kubeproxy_network_programming_duration_seconds_bucket[2m])) by (le)) > 10 + description: 99th Kubeproxy network programming latency higher than 10 seconds. 
{{$value}}s + severity: warning + +# Prometheus alerts +- expr: ALERTS{severity="critical", alertstate="firing"} > 0 + description: Critical prometheus alert. {{$labels.alertname}} + severity: warning diff --git a/config/alerts b/config/alerts_openshift.yaml similarity index 100% rename from config/alerts rename to config/alerts_openshift.yaml diff --git a/config/config.yaml b/config/config.yaml index e26779f8..970fbb5d 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -51,15 +51,13 @@ cerberus: performance_monitoring: deploy_dashboards: False # Install a mutable grafana and load the performance dashboards. Enable this only when running on OpenShift repo: "https://github.com/cloud-bulldozer/performance-dashboards.git" - kube_burner_binary_url: "https://github.com/cloud-bulldozer/kube-burner/releases/download/v1.7.0/kube-burner-1.7.0-Linux-x86_64.tar.gz" capture_metrics: False - config_path: config/kube_burner.yaml # Define the Elasticsearch url and index name in this config metrics_profile_path: config/metrics-aggregated.yaml prometheus_url: # The prometheus url/route is automatically obtained in case of OpenShift, please set it when the distribution is Kubernetes. prometheus_bearer_token: # The bearer token is automatically obtained in case of OpenShift, please set it when the distribution is Kubernetes. This is needed to authenticate with prometheus. uuid: # uuid for the run is generated by default if not set enable_alerts: False # Runs the queries specified in the alert profile and displays the info or exits 1 when severity=error - alert_profile: config/alerts # Path or URL to alert profile with the prometheus queries + alert_profile: config/alerts.yaml # Path or URL to alert profile with the prometheus queries check_critical_alerts: False # When enabled will check prometheus for critical alerts firing post chaos tunings: wait_duration: 60 # Duration to wait between each chaos scenario @@ -90,3 +88,6 @@ telemetry: oc_cli_path: /usr/bin/oc # optional, if not specified will be search in $PATH events_backup: True # enables/disables cluster events collection + + + diff --git a/config/config_kind.yaml b/config/config_kind.yaml index 88645dc5..2d247cd3 100644 --- a/config/config_kind.yaml +++ b/config/config_kind.yaml @@ -20,15 +20,13 @@ cerberus: performance_monitoring: deploy_dashboards: False # Install a mutable grafana and load the performance dashboards. Enable this only when running on OpenShift repo: "https://github.com/cloud-bulldozer/performance-dashboards.git" - kube_burner_binary_url: "https://github.com/cloud-bulldozer/kube-burner/releases/download/v0.9.1/kube-burner-0.9.1-Linux-x86_64.tar.gz" capture_metrics: False - config_path: config/kube_burner.yaml # Define the Elasticsearch url and index name in this config metrics_profile_path: config/metrics-aggregated.yaml prometheus_url: # The prometheus url/route is automatically obtained in case of OpenShift, please set it when the distribution is Kubernetes. prometheus_bearer_token: # The bearer token is automatically obtained in case of OpenShift, please set it when the distribution is Kubernetes. This is needed to authenticate with prometheus. 
uuid: # uuid for the run is generated by default if not set enable_alerts: False # Runs the queries specified in the alert profile and displays the info or exits 1 when severity=error - alert_profile: config/alerts # Path to alert profile with the prometheus queries + alert_profile: config/alerts.yaml # Path to alert profile with the prometheus queries tunings: wait_duration: 60 # Duration to wait between each chaos scenario diff --git a/config/config_kubernetes.yaml b/config/config_kubernetes.yaml index efb1d72e..55884bfc 100644 --- a/config/config_kubernetes.yaml +++ b/config/config_kubernetes.yaml @@ -19,15 +19,13 @@ cerberus: performance_monitoring: deploy_dashboards: False # Install a mutable grafana and load the performance dashboards. Enable this only when running on OpenShift repo: "https://github.com/cloud-bulldozer/performance-dashboards.git" - kube_burner_binary_url: "https://github.com/cloud-bulldozer/kube-burner/releases/download/v0.9.1/kube-burner-0.9.1-Linux-x86_64.tar.gz" capture_metrics: False - config_path: config/kube_burner.yaml # Define the Elasticsearch url and index name in this config metrics_profile_path: config/metrics-aggregated.yaml prometheus_url: # The prometheus url/route is automatically obtained in case of OpenShift, please set it when the distribution is Kubernetes. prometheus_bearer_token: # The bearer token is automatically obtained in case of OpenShift, please set it when the distribution is Kubernetes. This is needed to authenticate with prometheus. uuid: # uuid for the run is generated by default if not set enable_alerts: False # Runs the queries specified in the alert profile and displays the info or exits 1 when severity=error - alert_profile: config/alerts # Path to alert profile with the prometheus queries + alert_profile: config/alerts.yaml # Path to alert profile with the prometheus queries check_critical_alerts: False # When enabled will check prometheus for critical alerts firing post chaos after soak time for the cluster to settle down tunings: wait_duration: 60 # Duration to wait between each chaos scenario diff --git a/config/config_performance.yaml b/config/config_performance.yaml index 5143a3ea..368b8415 100644 --- a/config/config_performance.yaml +++ b/config/config_performance.yaml @@ -41,15 +41,13 @@ cerberus: performance_monitoring: deploy_dashboards: True # Install a mutable grafana and load the performance dashboards. Enable this only when running on OpenShift repo: "https://github.com/cloud-bulldozer/performance-dashboards.git" - kube_burner_binary_url: "https://github.com/cloud-bulldozer/kube-burner/releases/download/v0.9.1/kube-burner-0.9.1-Linux-x86_64.tar.gz" capture_metrics: True - config_path: config/kube_burner.yaml # Define the Elasticsearch url and index name in this config metrics_profile_path: config/metrics-aggregated.yaml prometheus_url: # The prometheus url/route is automatically obtained in case of OpenShift, please set it when the distribution is Kubernetes. prometheus_bearer_token: # The bearer token is automatically obtained in case of OpenShift, please set it when the distribution is Kubernetes. This is needed to authenticate with prometheus. 
uuid: # uuid for the run is generated by default if not set enable_alerts: True # Runs the queries specified in the alert profile and displays the info or exits 1 when severity=error - alert_profile: config/alerts # Path to alert profile with the prometheus queries + alert_profile: config/alerts.yaml # Path to alert profile with the prometheus queries tunings: wait_duration: 60 # Duration to wait between each chaos scenario diff --git a/config/kube_burner.yaml b/config/kube_burner.yaml deleted file mode 100644 index dbea38d9..00000000 --- a/config/kube_burner.yaml +++ /dev/null @@ -1,15 +0,0 @@ ---- - -global: - writeToFile: true - metricsDirectory: collected-metrics - measurements: - - name: podLatency - esIndex: kraken - - indexerConfig: - enabled: true - esServers: [http://0.0.0.0:9200] # Please change this to the respective Elasticsearch in use if you haven't run the podman-compose command to setup the infrastructure containers - insecureSkipVerify: true - defaultIndex: kraken - type: elastic diff --git a/docs/SLOs_validation.md b/docs/SLOs_validation.md index 50b88be3..09905931 100644 --- a/docs/SLOs_validation.md +++ b/docs/SLOs_validation.md @@ -11,19 +11,18 @@ performance_monitoring: ``` ### Validation and alerting based on the queries defined by the user during chaos -Takes PromQL queries as input and modifies the return code of the run to determine pass/fail. It's especially useful in case of automated runs in CI where user won't be able to monitor the system. It uses [Kube-burner](https://kube-burner.readthedocs.io/en/latest/) under the hood. This feature can be enabled in the [config](https://github.com/redhat-chaos/krkn/blob/main/config/config.yaml) by setting the following: +Takes PromQL queries as input and modifies the return code of the run to determine pass/fail. It's especially useful in case of automated runs in CI where user won't be able to monitor the system. This feature can be enabled in the [config](https://github.com/redhat-chaos/krkn/blob/main/config/config.yaml) by setting the following: ``` performance_monitoring: - kube_burner_binary_url: "https://github.com/cloud-bulldozer/kube-burner/releases/download/v0.9.1/kube-burner-0.9.1-Linux-x86_64.tar.gz" prometheus_url: # The prometheus url/route is automatically obtained in case of OpenShift, please set it when the distribution is Kubernetes. prometheus_bearer_token: # The bearer token is automatically obtained in case of OpenShift, please set it when the distribution is Kubernetes. This is needed to authenticate with prometheus. enable_alerts: True # Runs the queries specified in the alert profile and displays the info or exits 1 when severity=error. - alert_profile: config/alerts # Path to alert profile with the prometheus queries. + alert_profile: config/alerts.yaml # Path to alert profile with the prometheus queries. ``` #### Alert profile -A couple of [alert profiles](https://github.com/redhat-chaos/krkn/tree/main/config) [alerts](https://github.com/redhat-chaos/krkn/blob/main/config/alerts) are shipped by default and can be tweaked to add more queries to alert on. User can provide a URL or path to the file in the [config](https://github.com/redhat-chaos/krkn/blob/main/config/config.yaml). The following are a few alerts examples: +A couple of [alert profiles](https://github.com/redhat-chaos/krkn/tree/main/config) [alerts](https://github.com/redhat-chaos/krkn/blob/main/config/alerts.yaml) are shipped by default and can be tweaked to add more queries to alert on. 
User can provide a URL or path to the file in the [config](https://github.com/redhat-chaos/krkn/blob/main/config/config.yaml). The following are a few alerts examples: ``` - expr: avg_over_time(histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[2m]))[5m:]) > 0.01 diff --git a/docs/metrics.md b/docs/metrics.md index bcb8360d..663e07c2 100644 --- a/docs/metrics.md +++ b/docs/metrics.md @@ -1,14 +1,12 @@ ## Scraping and storing metrics for the run -There are cases where the state of the cluster and metrics on the cluster during the chaos test run need to be stored long term to review after the cluster is terminated, for example CI and automation test runs. To help with this, Kraken supports capturing metrics for the duration of the scenarios defined in the config and indexes them into Elasticsearch. The indexed metrics can be visualized with the help of Grafana. +There are cases where the state of the cluster and metrics on the cluster during the chaos test run need to be stored long term to review after the cluster is terminated, for example CI and automation test runs. To help with this, Kraken supports capturing metrics for the duration of the scenarios defined in the config. -It uses [Kube-burner](https://github.com/cloud-bulldozer/kube-burner) under the hood. The metrics to capture need to be defined in a metrics profile which Kraken consumes to query prometheus ( installed by default in OpenShift ) with the start and end timestamp of the run. Each run has a unique identifier ( uuid ) and all the metrics/documents in Elasticsearch will be associated with it. The uuid is generated automatically if not set in the config. This feature can be enabled in the [config](https://github.com/redhat-chaos/krkn/blob/main/config/config.yaml) by setting the following: +The metrics to capture need to be defined in a metrics profile which Kraken consumes to query prometheus with the start and end timestamp of the run. Each run has a unique identifier ( uuid ). The uuid is generated automatically if not set in the config. This feature can be enabled in the [config](https://github.com/redhat-chaos/krkn/blob/main/config/config.yaml) by setting the following: ``` performance_monitoring: - kube_burner_binary_url: "https://github.com/cloud-bulldozer/kube-burner/releases/download/v0.9.1/kube-burner-0.9.1-Linux-x86_64.tar.gz" capture_metrics: True - config_path: config/kube_burner.yaml # Define the Elasticsearch url and index name in this config. metrics_profile_path: config/metrics-aggregated.yaml prometheus_url: # The prometheus url/route is automatically obtained in case of OpenShift, please set it when the distribution is Kubernetes. prometheus_bearer_token: # The bearer token is automatically obtained in case of OpenShift, please set it when the distribution is Kubernetes. This is needed to authenticate with prometheus. 
@@ -31,21 +29,3 @@ metrics: metricName: APIInflightRequests ``` -### Indexing -Define the Elasticsearch and index to store the metrics/documents in the kube_burner config: - -``` -global: - writeToFile: true - metricsDirectory: collected-metrics - measurements: - - name: podLatency - esIndex: kube-burner - - indexerConfig: - enabled: true - esServers: [https://elastic.example.com:9200] - insecureSkipVerify: true - defaultIndex: kraken - type: elastic -``` diff --git a/kraken/kube_burner/__init__.py b/kraken/kube_burner/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/kraken/kube_burner/client.py b/kraken/kube_burner/client.py deleted file mode 100644 index 2529a34a..00000000 --- a/kraken/kube_burner/client.py +++ /dev/null @@ -1,116 +0,0 @@ -import subprocess -import logging -import urllib.request -import shutil -import sys -import requests -import tempfile -import kraken.prometheus.client as prometheus -from urllib.parse import urlparse - - -def setup(url): - """ - Downloads and unpacks kube-burner binary - """ - - filename = "kube_burner.tar" - try: - logging.info("Fetching kube-burner binary") - urllib.request.urlretrieve(url, filename) - except Exception as e: - logging.error("Failed to download kube-burner binary located at %s" % url, e) - sys.exit(1) - try: - logging.info("Unpacking kube-burner tar ball") - shutil.unpack_archive(filename) - except Exception as e: - logging.error("Failed to unpack the kube-burner binary tarball: %s" % e) - sys.exit(1) - - -def scrape_metrics( - distribution, uuid, prometheus_url, prometheus_bearer_token, start_time, end_time, config_path, metrics_profile -): - """ - Scrapes metrics defined in the profile from Prometheus and indexes them into Elasticsearch - """ - - if not prometheus_url: - if distribution == "openshift": - logging.info("Looks like prometheus_url is not defined, trying to use the default instance on the cluster") - prometheus_url, prometheus_bearer_token = prometheus.instance( - distribution, prometheus_url, prometheus_bearer_token - ) - else: - logging.error("Looks like prometheus url is not defined, exiting") - sys.exit(1) - command = ( - "./kube-burner index --uuid " - + str(uuid) - + " -u " - + str(prometheus_url) - + " -t " - + str(prometheus_bearer_token) - + " -m " - + str(metrics_profile) - + " --start " - + str(start_time) - + " --end " - + str(end_time) - + " -c " - + str(config_path) - ) - try: - logging.info("Running kube-burner to capture the metrics: %s" % command) - logging.info("UUID for the run: %s" % uuid) - subprocess.run(command, shell=True, universal_newlines=True) - except Exception as e: - logging.error("Failed to run kube-burner, error: %s" % (e)) - sys.exit(1) - - -def alerts(distribution, prometheus_url, prometheus_bearer_token, start_time, end_time, alert_profile): - """ - Scrapes metrics defined in the profile from Prometheus and alerts based on the severity defined - """ - - is_url = urlparse(alert_profile) - if is_url.scheme and is_url.netloc: - response = requests.get(alert_profile) - temp_alerts = tempfile.NamedTemporaryFile() - temp_alerts.write(response.content) - temp_alerts.flush() - alert_profile = temp_alerts.name - - if not prometheus_url: - if distribution == "openshift": - logging.info("Looks like prometheus_url is not defined, trying to use the default instance on the cluster") - prometheus_url, prometheus_bearer_token = prometheus.instance( - distribution, prometheus_url, prometheus_bearer_token - ) - else: - logging.error("Looks like prometheus url is not defined, 
exiting") - sys.exit(1) - command = ( - "./kube-burner check-alerts " - + " -u " - + str(prometheus_url) - + " -t " - + str(prometheus_bearer_token) - + " -a " - + str(alert_profile) - + " --start " - + str(start_time) - + " --end " - + str(end_time) - ) - try: - logging.info("Running kube-burner to capture the metrics: %s" % command) - output = subprocess.run(command, shell=True, universal_newlines=True) - if output.returncode != 0: - logging.error("command exited with a non-zero rc, please check the logs for errors or critical alerts") - sys.exit(output.returncode) - except Exception as e: - logging.error("Failed to run kube-burner, error: %s" % (e)) - sys.exit(1) diff --git a/kraken/prometheus/__init__.py b/kraken/prometheus/__init__.py index e69de29b..4c51a321 100644 --- a/kraken/prometheus/__init__.py +++ b/kraken/prometheus/__init__.py @@ -0,0 +1 @@ +from .client import * \ No newline at end of file diff --git a/kraken/prometheus/client.py b/kraken/prometheus/client.py index df8eb780..5ebab349 100644 --- a/kraken/prometheus/client.py +++ b/kraken/prometheus/client.py @@ -1,49 +1,30 @@ +import datetime +import os.path import urllib3 import logging -import prometheus_api_client import sys -import kraken.invoke.command as runcommand +import yaml +from krkn_lib.prometheus.krkn_prometheus import KrknPrometheus urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) +def alerts(prom_cli: KrknPrometheus, start_time, end_time, alert_profile): -# Initialize the client -def initialize_prom_client(distribution, prometheus_url, prometheus_bearer_token): - global prom_cli - prometheus_url, prometheus_bearer_token = instance(distribution, prometheus_url, prometheus_bearer_token) - if prometheus_url and prometheus_bearer_token: - bearer = "Bearer " + prometheus_bearer_token - headers = {"Authorization": bearer} - try: - prom_cli = prometheus_api_client.PrometheusConnect(url=prometheus_url, headers=headers, disable_ssl=True) - except Exception as e: - logging.error("Not able to initialize the client %s" % e) - sys.exit(1) - else: - prom_cli = None - + if alert_profile is None or os.path.exists(alert_profile) is False: + logging.error(f"{alert_profile} alert profile does not exist") + sys.exit(1) -# Process custom prometheus query -def process_prom_query(query): - if prom_cli: - try: - return prom_cli.custom_query(query=query, params=None) - except Exception as e: - logging.error("Failed to get the metrics: %s" % e) + with open(alert_profile) as profile: + profile_yaml = yaml.safe_load(profile) + if not isinstance(profile_yaml, list): + logging.error(f"{alert_profile} wrong file format, alert profile must be " + f"a valid yaml file containing a list of items with 3 properties: " + f"expr, description, severity" ) sys.exit(1) - else: - logging.info("Skipping the prometheus query as the prometheus client couldn't " "be initialized\n") -# Get prometheus details -def instance(distribution, prometheus_url, prometheus_bearer_token): - if distribution == "openshift" and not prometheus_url: - url = runcommand.invoke( - r"""oc get routes -n openshift-monitoring -o=jsonpath='{.items[?(@.metadata.name=="prometheus-k8s")].spec.host}'""" # noqa - ) - prometheus_url = "https://" + url - if distribution == "openshift" and not prometheus_bearer_token: - prometheus_bearer_token = runcommand.invoke( - "oc create token -n openshift-monitoring prometheus-k8s --duration=12h " - "|| oc -n openshift-monitoring sa get-token prometheus-k8s " - "|| oc sa new-token -n openshift-monitoring prometheus-k8s" - ) - 
return prometheus_url, prometheus_bearer_token + for alert in profile_yaml: + if list(alert.keys()).sort() != ["expr", "description", "severity"].sort(): + logging.error(f"wrong alert {alert}, skipping") + + prom_cli.process_alert(alert, + datetime.datetime.fromtimestamp(start_time), + datetime.datetime.fromtimestamp(end_time)) \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index a06d863d..7cc9eea5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -19,14 +19,13 @@ ibm_cloud_sdk_core ibm_vpc itsdangerous==2.0.1 jinja2==3.0.3 -krkn-lib>=1.4.5 +krkn-lib >= 1.4.5 kubernetes lxml >= 4.3.0 oauth2client>=4.1.3 openshift-client paramiko podman-compose -prometheus_api_client pyVmomi >= 6.7 pyfiglet pytest diff --git a/run_kraken.py b/run_kraken.py index b62aed2e..afcb05db 100644 --- a/run_kraken.py +++ b/run_kraken.py @@ -1,4 +1,5 @@ #!/usr/bin/env python +import datetime import json import os import sys @@ -8,6 +9,7 @@ import pyfiglet import uuid import time +from krkn_lib.prometheus.krkn_prometheus import KrknPrometheus import kraken.time_actions.common_time_functions as time_actions import kraken.performance_dashboards.setup as performance_dashboards import kraken.pod_scenarios.setup as pod_scenarios @@ -15,16 +17,14 @@ import kraken.shut_down.common_shut_down_func as shut_down import kraken.node_actions.run as nodeaction import kraken.managedcluster_scenarios.run as managedcluster_scenarios -import kraken.kube_burner.client as kube_burner import kraken.zone_outage.actions as zone_outages import kraken.application_outage.actions as application_outage import kraken.pvc.pvc_scenario as pvc_scenario import kraken.network_chaos.actions as network_chaos import kraken.arcaflow_plugin as arcaflow_plugin +import kraken.prometheus as prometheus_plugin import server as server -import kraken.prometheus.client as promcli from kraken import plugins - from krkn_lib.k8s import KrknKubernetes from krkn_lib.ocp import KrknOpenshift from krkn_lib.telemetry.k8s import KrknTelemetryKubernetes @@ -33,11 +33,7 @@ from krkn_lib.utils import SafeLogger from krkn_lib.utils.functions import get_yaml_item_value -KUBE_BURNER_URL = ( - "https://github.com/cloud-bulldozer/kube-burner/" - "releases/download/v{version}/kube-burner-{version}-Linux-x86_64.tar.gz" -) -KUBE_BURNER_VERSION = "1.7.0" + # Main function @@ -84,21 +80,7 @@ def main(cfg): config["performance_monitoring"], "repo", "https://github.com/cloud-bulldozer/performance-dashboards.git" ) - capture_metrics = get_yaml_item_value( - config["performance_monitoring"], "capture_metrics", False - ) - kube_burner_url = get_yaml_item_value( - config["performance_monitoring"], "kube_burner_binary_url", - KUBE_BURNER_URL.format(version=KUBE_BURNER_VERSION), - ) - config_path = get_yaml_item_value( - config["performance_monitoring"], "config_path", - "config/kube_burner.yaml" - ) - metrics_profile = get_yaml_item_value( - config["performance_monitoring"], "metrics_profile_path", - "config/metrics-aggregated.yaml" - ) + prometheus_url = config["performance_monitoring"].get("prometheus_url") prometheus_bearer_token = config["performance_monitoring"].get( "prometheus_bearer_token" @@ -147,9 +129,6 @@ def main(cfg): except: kubecli.initialize_clients(None) - # KrknTelemetry init - telemetry_k8s = KrknTelemetryKubernetes(safe_logger, kubecli) - telemetry_ocp = KrknTelemetryOpenshift(safe_logger, ocpcli) # find node kraken might be running on @@ -179,11 +158,20 @@ def main(cfg): cv = "" if config["kraken"]["distribution"] == "openshift": cv = 
ocpcli.get_clusterversion_string() + if prometheus_url is None: + connection_data = ocpcli.get_prometheus_api_connection_data() + prometheus_url = connection_data.endpoint + prometheus_bearer_token = connection_data.token if cv != "": logging.info(cv) else: logging.info("Cluster version CRD not detected, skipping") + # KrknTelemetry init + telemetry_k8s = KrknTelemetryKubernetes(safe_logger, kubecli) + telemetry_ocp = KrknTelemetryOpenshift(safe_logger, ocpcli) + prometheus = KrknPrometheus(prometheus_url, prometheus_bearer_token) + logging.info("Server URL: %s" % kubecli.get_host()) # Deploy performance dashboards @@ -351,9 +339,10 @@ def main(cfg): # Check for critical alerts when enabled if check_critical_alerts: logging.info("Checking for critical alerts firing post choas") - promcli.initialize_prom_client(distribution, prometheus_url, prometheus_bearer_token) + + ##PROM query = r"""ALERTS{severity="critical"}""" - critical_alerts = promcli.process_prom_query(query) + critical_alerts = prometheus.process_prom_query_in_range(query, datetime.datetime.fromtimestamp(start_time)) critical_alerts_count = len(critical_alerts) if critical_alerts_count > 0: logging.error("Critical alerts are firing: %s", critical_alerts) @@ -401,33 +390,13 @@ def main(cfg): else: logging.info("telemetry collection disabled, skipping.") - # Capture the end time - - - # Capture metrics for the run - if capture_metrics: - logging.info("Capturing metrics") - kube_burner.setup(kube_burner_url) - kube_burner.scrape_metrics( - distribution, - run_uuid, - prometheus_url, - prometheus_bearer_token, - start_time, - end_time, - config_path, - metrics_profile, - ) # Check for the alerts specified if enable_alerts: logging.info("Alerts checking is enabled") - kube_burner.setup(kube_burner_url) if alert_profile: - kube_burner.alerts( - distribution, - prometheus_url, - prometheus_bearer_token, + prometheus_plugin.alerts( + prometheus, start_time, end_time, alert_profile, diff --git a/scenarios/arcaflow/cpu-hog/config.yaml b/scenarios/arcaflow/cpu-hog/config.yaml index a03beb4c..e6bcce96 100644 --- a/scenarios/arcaflow/cpu-hog/config.yaml +++ b/scenarios/arcaflow/cpu-hog/config.yaml @@ -1,6 +1,10 @@ --- deployer: - connection: {} + connection: + cacert: '' + cert: '' + host: https://api.tsebasti-lab.aws.rhperfscale.org:6443 + key: '' type: kubernetes log: level: debug diff --git a/scenarios/arcaflow/cpu-hog/input.yaml b/scenarios/arcaflow/cpu-hog/input.yaml index 3bcbece9..2e359172 100644 --- a/scenarios/arcaflow/cpu-hog/input.yaml +++ b/scenarios/arcaflow/cpu-hog/input.yaml @@ -2,13 +2,7 @@ input_list: - cpu_count: 1 cpu_load_percentage: 80 cpu_method: all - duration: 30s - node_selector: {} - # node selector example - # node_selector: - # kubernetes.io/hostname: master - kubeconfig: "" + duration: 1s + kubeconfig: '' namespace: default - -# duplicate this section to run simultaneous stressors in the same run - + node_selector: {}
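
A quick way to read the refactor above: alert evaluation now goes through krkn-lib's `KrknPrometheus` client instead of shelling out to the kube-burner binary. The sketch below is illustrative only and relies on the signatures visible in this patch (`KrknPrometheus(url, token)` and `kraken.prometheus.alerts(prom_cli, start_time, end_time, alert_profile)`); the endpoint and token values are placeholders.

```
# Sketch of the new alert flow, assuming the signatures shown in this patch.
import time

from krkn_lib.prometheus.krkn_prometheus import KrknPrometheus

import kraken.prometheus as prometheus_plugin

# Placeholder connection details: on OpenShift run_kraken.py resolves these via
# ocpcli.get_prometheus_api_connection_data() (endpoint/token); on plain
# Kubernetes they must be set under performance_monitoring in config.yaml.
prometheus_url = "https://prometheus.example.com"   # hypothetical endpoint
prometheus_bearer_token = "<bearer-token>"          # hypothetical token

end_time = int(time.time())
start_time = end_time - 600  # e.g. the chaos run started 10 minutes ago

prom_cli = KrknPrometheus(prometheus_url, prometheus_bearer_token)

# Evaluates every expr in config/alerts.yaml over the run window and reports
# each description/severity through KrknPrometheus.process_alert().
prometheus_plugin.alerts(prom_cli, start_time, end_time, "config/alerts.yaml")
```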
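
The post-chaos critical-alert check in run_kraken.py follows the same pattern: the old `prometheus_api_client` query is replaced by `KrknPrometheus.process_prom_query_in_range()`, evaluated from the chaos start time. Below is a hedged sketch of that step, factored into a helper for readability; the helper name and exit code are illustrative and not part of the patch.

```
import datetime
import logging
import sys

from krkn_lib.prometheus.krkn_prometheus import KrknPrometheus


def check_critical_alerts(prometheus: KrknPrometheus, start_time: float) -> None:
    # Same PromQL query run_kraken.py uses after this patch.
    query = r"""ALERTS{severity="critical"}"""
    critical_alerts = prometheus.process_prom_query_in_range(
        query, datetime.datetime.fromtimestamp(start_time)
    )
    if len(critical_alerts) > 0:
        logging.error("Critical alerts are firing: %s", critical_alerts)
        sys.exit(2)  # illustrative exit code; run_kraken.py decides the actual rc
    logging.info("No critical alerts fired during the run")
```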
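
One caveat worth flagging in the new `kraken/prometheus/client.py`: `list(alert.keys()).sort()` returns `None` (Python's `list.sort()` sorts in place), so the key check as written never rejects a malformed profile entry. A minimal sketch of a check that does what the error message promises, assuming the same `expr`/`description`/`severity` schema used by `config/alerts.yaml`:

```
REQUIRED_KEYS = {"expr", "description", "severity"}


def is_valid_alert(alert) -> bool:
    # Compare key sets instead of the None returned by list.sort().
    return isinstance(alert, dict) and set(alert.keys()) == REQUIRED_KEYS
```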