From f2d7f88cb8439d9cda7dc5a4d5b6066e3b8a844d Mon Sep 17 00:00:00 2001 From: Tullio Sebastiani Date: Tue, 9 Jan 2024 16:28:49 +0100 Subject: [PATCH] Krkn lib prometheus client + kube_burner references removed Signed-off-by: Tullio Sebastiani --- CI/config/common_test_config.yaml | 4 +- README.md | 2 +- config/alerts.yaml | 90 ++++++++++++++++++ config/{alerts => alerts_openshift.yaml} | 0 config/config.yaml | 7 +- config/config_kind.yaml | 4 +- config/config_kubernetes.yaml | 4 +- config/config_performance.yaml | 4 +- config/kube_burner.yaml | 15 --- docs/SLOs_validation.md | 7 +- docs/metrics.md | 24 +---- kraken/kube_burner/__init__.py | 0 kraken/kube_burner/client.py | 116 ----------------------- kraken/prometheus/__init__.py | 1 + kraken/prometheus/client.py | 61 ++++-------- requirements.txt | 3 +- run_kraken.py | 69 ++++---------- scenarios/arcaflow/cpu-hog/config.yaml | 6 +- scenarios/arcaflow/cpu-hog/input.yaml | 12 +-- 19 files changed, 154 insertions(+), 275 deletions(-) create mode 100644 config/alerts.yaml rename config/{alerts => alerts_openshift.yaml} (100%) delete mode 100644 config/kube_burner.yaml delete mode 100644 kraken/kube_burner/__init__.py delete mode 100644 kraken/kube_burner/client.py diff --git a/CI/config/common_test_config.yaml b/CI/config/common_test_config.yaml index c36a6d59..c5e758eb 100644 --- a/CI/config/common_test_config.yaml +++ b/CI/config/common_test_config.yaml @@ -15,15 +15,13 @@ cerberus: performance_monitoring: deploy_dashboards: False # Install a mutable grafana and load the performance dashboards. Enable this only when running on OpenShift. repo: "https://github.com/cloud-bulldozer/performance-dashboards.git" - kube_burner_binary_url: "https://github.com/cloud-bulldozer/kube-burner/releases/download/v0.9.1/kube-burner-0.9.1-Linux-x86_64.tar.gz" capture_metrics: False - config_path: config/kube_burner.yaml # Define the Elasticsearch url and index name in this config. metrics_profile_path: config/metrics-aggregated.yaml prometheus_url: # The prometheus url/route is automatically obtained in case of OpenShift, please set it when the distribution is Kubernetes. prometheus_bearer_token: # The bearer token is automatically obtained in case of OpenShift, please set it when the distribution is Kubernetes. This is needed to authenticate with prometheus. uuid: # uuid for the run is generated by default if not set. enable_alerts: False # Runs the queries specified in the alert profile and displays the info or exits 1 when severity=error. - alert_profile: config/alerts # Path to alert profile with the prometheus queries. + alert_profile: config/alerts.yaml # Path to alert profile with the prometheus queries. tunings: wait_duration: 6 # Duration to wait between each chaos scenario. diff --git a/README.md b/README.md index 0887b0b1..354a82bf 100644 --- a/README.md +++ b/README.md @@ -80,7 +80,7 @@ Scenario type | Kubernetes | OpenShift It is important to make sure to check if the targeted component recovered from the chaos injection and also if the Kubernetes/OpenShift cluster is healthy as failures in one component can have an adverse impact on other components. Kraken does this by: - Having built in checks for pod and node based scenarios to ensure the expected number of replicas and nodes are up. It also supports running custom scripts with the checks. - Leveraging [Cerberus](https://github.com/openshift-scale/cerberus) to monitor the cluster under test and consuming the aggregated go/no-go signal to determine pass/fail post chaos. 
It is highly recommended to turn on the Cerberus health check feature available in Kraken. Instructions on installing and setting up Cerberus can be found [here](https://github.com/openshift-scale/cerberus#installation) or can be installed from Kraken using the [instructions](https://github.com/redhat-chaos/krkn#setting-up-infrastructure-dependencies). Once Cerberus is up and running, set cerberus_enabled to True and cerberus_url to the url where Cerberus publishes go/no-go signal in the Kraken config file. Cerberus can monitor [application routes](https://github.com/redhat-chaos/cerberus/blob/main/docs/config.md#watch-routes) during the chaos and fails the run if it encounters downtime as it is a potential downtime in a customers, or users environment as well. It is especially important during the control plane chaos scenarios including the API server, Etcd, Ingress etc. It can be enabled by setting `check_applicaton_routes: True` in the [Kraken config](https://github.com/redhat-chaos/krkn/blob/main/config/config.yaml) provided application routes are being monitored in the [cerberus config](https://github.com/redhat-chaos/krkn/blob/main/config/cerberus.yaml). -- Leveraging [kube-burner](docs/alerts.md) alerting feature to fail the runs in case of critical alerts. +- Leveraging built-in alert collection feature to fail the runs in case of critical alerts. ### Signaling In CI runs or any external job it is useful to stop Kraken once a certain test or state gets reached. We created a way to signal to kraken to pause the chaos or stop it completely using a signal posted to a port of your choice. diff --git a/config/alerts.yaml b/config/alerts.yaml new file mode 100644 index 00000000..7dfc912d --- /dev/null +++ b/config/alerts.yaml @@ -0,0 +1,90 @@ +# etcd + +- expr: avg_over_time(histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[2m]))[10m:]) > 0.01 + description: 10 minutes avg. 99th etcd fsync latency on {{$labels.pod}} higher than 10ms. {{$value}}s + severity: warning + +- expr: avg_over_time(histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[2m]))[10m:]) > 1 + description: 10 minutes avg. 99th etcd fsync latency on {{$labels.pod}} higher than 1s. {{$value}}s + severity: error + +- expr: avg_over_time(histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[2m]))[10m:]) > 0.007 + description: 10 minutes avg. 99th etcd commit latency on {{$labels.pod}} higher than 30ms. {{$value}}s + severity: warning + +- expr: rate(etcd_server_leader_changes_seen_total[2m]) > 0 + description: etcd leader changes observed + severity: warning + +- expr: (last_over_time(etcd_mvcc_db_total_size_in_bytes[5m]) / last_over_time(etcd_server_quota_backend_bytes[5m]))*100 > 95 + description: etcd cluster database is running full. + severity: critical + +- expr: (last_over_time(etcd_mvcc_db_total_size_in_use_in_bytes[5m]) / last_over_time(etcd_mvcc_db_total_size_in_bytes[5m])) < 0.5 + description: etcd database size in use is less than 50% of the actual allocated storage. + severity: warning + +- expr: rate(etcd_server_proposals_failed_total{job=~".*etcd.*"}[15m]) > 5 + description: etcd cluster has high number of proposal failures. + severity: warning + +- expr: histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket{job=~".*etcd.*"}[5m])) > 0.15 + description: etcd cluster member communication is slow. 
+ severity: warning + +- expr: histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job=~".*etcd.*", grpc_method!="Defragment", grpc_type="unary"}[5m])) without(grpc_type)) > 0.15 + description: etcd grpc requests are slow. + severity: critical + +- expr: 100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code=~"Unknown|FailedPrecondition|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded"}[5m])) without (grpc_type, grpc_code) / sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) without (grpc_type, grpc_code) > 5 + description: etcd cluster has high number of failed grpc requests. + severity: critical + +- expr: etcd_server_has_leader{job=~".*etcd.*"} == 0 + description: etcd cluster has no leader. + severity: warning + +- expr: sum(up{job=~".*etcd.*"} == bool 1) without (instance) < ((count(up{job=~".*etcd.*"}) without (instance) + 1) / 2) + description: etcd cluster has insufficient number of members. + severity: warning + +- expr: max without (endpoint) ( sum without (instance) (up{job=~".*etcd.*"} == bool 0) or count without (To) ( sum without (instance) (rate(etcd_network_peer_sent_failures_total{job=~".*etcd.*"}[120s])) > 0.01 )) > 0 + description: etcd cluster members are down. + severity: warning + +# API server +- expr: avg_over_time(histogram_quantile(0.99, sum(irate(apiserver_request_duration_seconds_bucket{apiserver="kube-apiserver", verb=~"POST|PUT|DELETE|PATCH", subresource!~"log|exec|portforward|attach|proxy"}[2m])) by (le, resource, verb))[10m:]) > 1 + description: 10 minutes avg. 99th mutating API call latency for {{$labels.verb}}/{{$labels.resource}} higher than 1 second. {{$value}}s + severity: error + +- expr: avg_over_time(histogram_quantile(0.99, sum(irate(apiserver_request_duration_seconds_bucket{apiserver="kube-apiserver", verb=~"LIST|GET", subresource!~"log|exec|portforward|attach|proxy", scope="resource"}[2m])) by (le, resource, verb, scope))[5m:]) > 1 + description: 5 minutes avg. 99th read-only API call latency for {{$labels.verb}}/{{$labels.resource}} in scope {{$labels.scope}} higher than 1 second. {{$value}}s + severity: error + +- expr: avg_over_time(histogram_quantile(0.99, sum(irate(apiserver_request_duration_seconds_bucket{apiserver="kube-apiserver", verb=~"LIST|GET", subresource!~"log|exec|portforward|attach|proxy", scope="namespace"}[2m])) by (le, resource, verb, scope))[5m:]) > 5 + description: 5 minutes avg. 99th read-only API call latency for {{$labels.verb}}/{{$labels.resource}} in scope {{$labels.scope}} higher than 5 seconds. {{$value}}s + severity: error + +- expr: avg_over_time(histogram_quantile(0.99, sum(irate(apiserver_request_duration_seconds_bucket{apiserver="kube-apiserver", verb=~"LIST|GET", subresource!~"log|exec|portforward|attach|proxy", scope="cluster"}[2m])) by (le, resource, verb, scope))[5m:]) > 30 + description: 5 minutes avg. 99th read-only API call latency for {{$labels.verb}}/{{$labels.resource}} in scope {{$labels.scope}} higher than 30 seconds. {{$value}}s + severity: error + +# Control plane pods + +- expr: up{job=~"crio|kubelet"} == 0 + description: "{{$labels.node}}/{{$labels.job}} down" + severity: warning + +- expr: up{job="ovnkube-node"} == 0 + description: "{{$labels.instance}}/{{$labels.pod}} {{$labels.job}} down" + severity: warning + +# Service sync latency +- expr: histogram_quantile(0.99, sum(rate(kubeproxy_network_programming_duration_seconds_bucket[2m])) by (le)) > 10 + description: 99th Kubeproxy network programming latency higher than 10 seconds. 
{{$value}}s + severity: warning + +# Prometheus alerts +- expr: ALERTS{severity="critical", alertstate="firing"} > 0 + description: Critical prometheus alert. {{$labels.alertname}} + severity: warning diff --git a/config/alerts b/config/alerts_openshift.yaml similarity index 100% rename from config/alerts rename to config/alerts_openshift.yaml diff --git a/config/config.yaml b/config/config.yaml index e26779f8..970fbb5d 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -51,15 +51,13 @@ cerberus: performance_monitoring: deploy_dashboards: False # Install a mutable grafana and load the performance dashboards. Enable this only when running on OpenShift repo: "https://github.com/cloud-bulldozer/performance-dashboards.git" - kube_burner_binary_url: "https://github.com/cloud-bulldozer/kube-burner/releases/download/v1.7.0/kube-burner-1.7.0-Linux-x86_64.tar.gz" capture_metrics: False - config_path: config/kube_burner.yaml # Define the Elasticsearch url and index name in this config metrics_profile_path: config/metrics-aggregated.yaml prometheus_url: # The prometheus url/route is automatically obtained in case of OpenShift, please set it when the distribution is Kubernetes. prometheus_bearer_token: # The bearer token is automatically obtained in case of OpenShift, please set it when the distribution is Kubernetes. This is needed to authenticate with prometheus. uuid: # uuid for the run is generated by default if not set enable_alerts: False # Runs the queries specified in the alert profile and displays the info or exits 1 when severity=error - alert_profile: config/alerts # Path or URL to alert profile with the prometheus queries + alert_profile: config/alerts.yaml # Path or URL to alert profile with the prometheus queries check_critical_alerts: False # When enabled will check prometheus for critical alerts firing post chaos tunings: wait_duration: 60 # Duration to wait between each chaos scenario @@ -90,3 +88,6 @@ telemetry: oc_cli_path: /usr/bin/oc # optional, if not specified will be search in $PATH events_backup: True # enables/disables cluster events collection + + + diff --git a/config/config_kind.yaml b/config/config_kind.yaml index 88645dc5..2d247cd3 100644 --- a/config/config_kind.yaml +++ b/config/config_kind.yaml @@ -20,15 +20,13 @@ cerberus: performance_monitoring: deploy_dashboards: False # Install a mutable grafana and load the performance dashboards. Enable this only when running on OpenShift repo: "https://github.com/cloud-bulldozer/performance-dashboards.git" - kube_burner_binary_url: "https://github.com/cloud-bulldozer/kube-burner/releases/download/v0.9.1/kube-burner-0.9.1-Linux-x86_64.tar.gz" capture_metrics: False - config_path: config/kube_burner.yaml # Define the Elasticsearch url and index name in this config metrics_profile_path: config/metrics-aggregated.yaml prometheus_url: # The prometheus url/route is automatically obtained in case of OpenShift, please set it when the distribution is Kubernetes. prometheus_bearer_token: # The bearer token is automatically obtained in case of OpenShift, please set it when the distribution is Kubernetes. This is needed to authenticate with prometheus. 
uuid: # uuid for the run is generated by default if not set enable_alerts: False # Runs the queries specified in the alert profile and displays the info or exits 1 when severity=error - alert_profile: config/alerts # Path to alert profile with the prometheus queries + alert_profile: config/alerts.yaml # Path to alert profile with the prometheus queries tunings: wait_duration: 60 # Duration to wait between each chaos scenario diff --git a/config/config_kubernetes.yaml b/config/config_kubernetes.yaml index efb1d72e..55884bfc 100644 --- a/config/config_kubernetes.yaml +++ b/config/config_kubernetes.yaml @@ -19,15 +19,13 @@ cerberus: performance_monitoring: deploy_dashboards: False # Install a mutable grafana and load the performance dashboards. Enable this only when running on OpenShift repo: "https://github.com/cloud-bulldozer/performance-dashboards.git" - kube_burner_binary_url: "https://github.com/cloud-bulldozer/kube-burner/releases/download/v0.9.1/kube-burner-0.9.1-Linux-x86_64.tar.gz" capture_metrics: False - config_path: config/kube_burner.yaml # Define the Elasticsearch url and index name in this config metrics_profile_path: config/metrics-aggregated.yaml prometheus_url: # The prometheus url/route is automatically obtained in case of OpenShift, please set it when the distribution is Kubernetes. prometheus_bearer_token: # The bearer token is automatically obtained in case of OpenShift, please set it when the distribution is Kubernetes. This is needed to authenticate with prometheus. uuid: # uuid for the run is generated by default if not set enable_alerts: False # Runs the queries specified in the alert profile and displays the info or exits 1 when severity=error - alert_profile: config/alerts # Path to alert profile with the prometheus queries + alert_profile: config/alerts.yaml # Path to alert profile with the prometheus queries check_critical_alerts: False # When enabled will check prometheus for critical alerts firing post chaos after soak time for the cluster to settle down tunings: wait_duration: 60 # Duration to wait between each chaos scenario diff --git a/config/config_performance.yaml b/config/config_performance.yaml index 5143a3ea..368b8415 100644 --- a/config/config_performance.yaml +++ b/config/config_performance.yaml @@ -41,15 +41,13 @@ cerberus: performance_monitoring: deploy_dashboards: True # Install a mutable grafana and load the performance dashboards. Enable this only when running on OpenShift repo: "https://github.com/cloud-bulldozer/performance-dashboards.git" - kube_burner_binary_url: "https://github.com/cloud-bulldozer/kube-burner/releases/download/v0.9.1/kube-burner-0.9.1-Linux-x86_64.tar.gz" capture_metrics: True - config_path: config/kube_burner.yaml # Define the Elasticsearch url and index name in this config metrics_profile_path: config/metrics-aggregated.yaml prometheus_url: # The prometheus url/route is automatically obtained in case of OpenShift, please set it when the distribution is Kubernetes. prometheus_bearer_token: # The bearer token is automatically obtained in case of OpenShift, please set it when the distribution is Kubernetes. This is needed to authenticate with prometheus. 
uuid: # uuid for the run is generated by default if not set enable_alerts: True # Runs the queries specified in the alert profile and displays the info or exits 1 when severity=error - alert_profile: config/alerts # Path to alert profile with the prometheus queries + alert_profile: config/alerts.yaml # Path to alert profile with the prometheus queries tunings: wait_duration: 60 # Duration to wait between each chaos scenario diff --git a/config/kube_burner.yaml b/config/kube_burner.yaml deleted file mode 100644 index dbea38d9..00000000 --- a/config/kube_burner.yaml +++ /dev/null @@ -1,15 +0,0 @@ ---- - -global: - writeToFile: true - metricsDirectory: collected-metrics - measurements: - - name: podLatency - esIndex: kraken - - indexerConfig: - enabled: true - esServers: [http://0.0.0.0:9200] # Please change this to the respective Elasticsearch in use if you haven't run the podman-compose command to setup the infrastructure containers - insecureSkipVerify: true - defaultIndex: kraken - type: elastic diff --git a/docs/SLOs_validation.md b/docs/SLOs_validation.md index 50b88be3..09905931 100644 --- a/docs/SLOs_validation.md +++ b/docs/SLOs_validation.md @@ -11,19 +11,18 @@ performance_monitoring: ``` ### Validation and alerting based on the queries defined by the user during chaos -Takes PromQL queries as input and modifies the return code of the run to determine pass/fail. It's especially useful in case of automated runs in CI where user won't be able to monitor the system. It uses [Kube-burner](https://kube-burner.readthedocs.io/en/latest/) under the hood. This feature can be enabled in the [config](https://github.com/redhat-chaos/krkn/blob/main/config/config.yaml) by setting the following: +Takes PromQL queries as input and modifies the return code of the run to determine pass/fail. It's especially useful in case of automated runs in CI where user won't be able to monitor the system. This feature can be enabled in the [config](https://github.com/redhat-chaos/krkn/blob/main/config/config.yaml) by setting the following: ``` performance_monitoring: - kube_burner_binary_url: "https://github.com/cloud-bulldozer/kube-burner/releases/download/v0.9.1/kube-burner-0.9.1-Linux-x86_64.tar.gz" prometheus_url: # The prometheus url/route is automatically obtained in case of OpenShift, please set it when the distribution is Kubernetes. prometheus_bearer_token: # The bearer token is automatically obtained in case of OpenShift, please set it when the distribution is Kubernetes. This is needed to authenticate with prometheus. enable_alerts: True # Runs the queries specified in the alert profile and displays the info or exits 1 when severity=error. - alert_profile: config/alerts # Path to alert profile with the prometheus queries. + alert_profile: config/alerts.yaml # Path to alert profile with the prometheus queries. ``` #### Alert profile -A couple of [alert profiles](https://github.com/redhat-chaos/krkn/tree/main/config) [alerts](https://github.com/redhat-chaos/krkn/blob/main/config/alerts) are shipped by default and can be tweaked to add more queries to alert on. User can provide a URL or path to the file in the [config](https://github.com/redhat-chaos/krkn/blob/main/config/config.yaml). The following are a few alerts examples: +A couple of [alert profiles](https://github.com/redhat-chaos/krkn/tree/main/config) [alerts](https://github.com/redhat-chaos/krkn/blob/main/config/alerts.yaml) are shipped by default and can be tweaked to add more queries to alert on. 
User can provide a URL or path to the file in the [config](https://github.com/redhat-chaos/krkn/blob/main/config/config.yaml). The following are a few alerts examples: ``` - expr: avg_over_time(histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[2m]))[5m:]) > 0.01 diff --git a/docs/metrics.md b/docs/metrics.md index bcb8360d..663e07c2 100644 --- a/docs/metrics.md +++ b/docs/metrics.md @@ -1,14 +1,12 @@ ## Scraping and storing metrics for the run -There are cases where the state of the cluster and metrics on the cluster during the chaos test run need to be stored long term to review after the cluster is terminated, for example CI and automation test runs. To help with this, Kraken supports capturing metrics for the duration of the scenarios defined in the config and indexes them into Elasticsearch. The indexed metrics can be visualized with the help of Grafana. +There are cases where the state of the cluster and metrics on the cluster during the chaos test run need to be stored long term to review after the cluster is terminated, for example CI and automation test runs. To help with this, Kraken supports capturing metrics for the duration of the scenarios defined in the config. -It uses [Kube-burner](https://github.com/cloud-bulldozer/kube-burner) under the hood. The metrics to capture need to be defined in a metrics profile which Kraken consumes to query prometheus ( installed by default in OpenShift ) with the start and end timestamp of the run. Each run has a unique identifier ( uuid ) and all the metrics/documents in Elasticsearch will be associated with it. The uuid is generated automatically if not set in the config. This feature can be enabled in the [config](https://github.com/redhat-chaos/krkn/blob/main/config/config.yaml) by setting the following: +The metrics to capture need to be defined in a metrics profile which Kraken consumes to query prometheus with the start and end timestamp of the run. Each run has a unique identifier ( uuid ). The uuid is generated automatically if not set in the config. This feature can be enabled in the [config](https://github.com/redhat-chaos/krkn/blob/main/config/config.yaml) by setting the following: ``` performance_monitoring: - kube_burner_binary_url: "https://github.com/cloud-bulldozer/kube-burner/releases/download/v0.9.1/kube-burner-0.9.1-Linux-x86_64.tar.gz" capture_metrics: True - config_path: config/kube_burner.yaml # Define the Elasticsearch url and index name in this config. metrics_profile_path: config/metrics-aggregated.yaml prometheus_url: # The prometheus url/route is automatically obtained in case of OpenShift, please set it when the distribution is Kubernetes. prometheus_bearer_token: # The bearer token is automatically obtained in case of OpenShift, please set it when the distribution is Kubernetes. This is needed to authenticate with prometheus. 
@@ -31,21 +29,3 @@ metrics: metricName: APIInflightRequests ``` -### Indexing -Define the Elasticsearch and index to store the metrics/documents in the kube_burner config: - -``` -global: - writeToFile: true - metricsDirectory: collected-metrics - measurements: - - name: podLatency - esIndex: kube-burner - - indexerConfig: - enabled: true - esServers: [https://elastic.example.com:9200] - insecureSkipVerify: true - defaultIndex: kraken - type: elastic -``` diff --git a/kraken/kube_burner/__init__.py b/kraken/kube_burner/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/kraken/kube_burner/client.py b/kraken/kube_burner/client.py deleted file mode 100644 index 2529a34a..00000000 --- a/kraken/kube_burner/client.py +++ /dev/null @@ -1,116 +0,0 @@ -import subprocess -import logging -import urllib.request -import shutil -import sys -import requests -import tempfile -import kraken.prometheus.client as prometheus -from urllib.parse import urlparse - - -def setup(url): - """ - Downloads and unpacks kube-burner binary - """ - - filename = "kube_burner.tar" - try: - logging.info("Fetching kube-burner binary") - urllib.request.urlretrieve(url, filename) - except Exception as e: - logging.error("Failed to download kube-burner binary located at %s" % url, e) - sys.exit(1) - try: - logging.info("Unpacking kube-burner tar ball") - shutil.unpack_archive(filename) - except Exception as e: - logging.error("Failed to unpack the kube-burner binary tarball: %s" % e) - sys.exit(1) - - -def scrape_metrics( - distribution, uuid, prometheus_url, prometheus_bearer_token, start_time, end_time, config_path, metrics_profile -): - """ - Scrapes metrics defined in the profile from Prometheus and indexes them into Elasticsearch - """ - - if not prometheus_url: - if distribution == "openshift": - logging.info("Looks like prometheus_url is not defined, trying to use the default instance on the cluster") - prometheus_url, prometheus_bearer_token = prometheus.instance( - distribution, prometheus_url, prometheus_bearer_token - ) - else: - logging.error("Looks like prometheus url is not defined, exiting") - sys.exit(1) - command = ( - "./kube-burner index --uuid " - + str(uuid) - + " -u " - + str(prometheus_url) - + " -t " - + str(prometheus_bearer_token) - + " -m " - + str(metrics_profile) - + " --start " - + str(start_time) - + " --end " - + str(end_time) - + " -c " - + str(config_path) - ) - try: - logging.info("Running kube-burner to capture the metrics: %s" % command) - logging.info("UUID for the run: %s" % uuid) - subprocess.run(command, shell=True, universal_newlines=True) - except Exception as e: - logging.error("Failed to run kube-burner, error: %s" % (e)) - sys.exit(1) - - -def alerts(distribution, prometheus_url, prometheus_bearer_token, start_time, end_time, alert_profile): - """ - Scrapes metrics defined in the profile from Prometheus and alerts based on the severity defined - """ - - is_url = urlparse(alert_profile) - if is_url.scheme and is_url.netloc: - response = requests.get(alert_profile) - temp_alerts = tempfile.NamedTemporaryFile() - temp_alerts.write(response.content) - temp_alerts.flush() - alert_profile = temp_alerts.name - - if not prometheus_url: - if distribution == "openshift": - logging.info("Looks like prometheus_url is not defined, trying to use the default instance on the cluster") - prometheus_url, prometheus_bearer_token = prometheus.instance( - distribution, prometheus_url, prometheus_bearer_token - ) - else: - logging.error("Looks like prometheus url is not defined, 
exiting") - sys.exit(1) - command = ( - "./kube-burner check-alerts " - + " -u " - + str(prometheus_url) - + " -t " - + str(prometheus_bearer_token) - + " -a " - + str(alert_profile) - + " --start " - + str(start_time) - + " --end " - + str(end_time) - ) - try: - logging.info("Running kube-burner to capture the metrics: %s" % command) - output = subprocess.run(command, shell=True, universal_newlines=True) - if output.returncode != 0: - logging.error("command exited with a non-zero rc, please check the logs for errors or critical alerts") - sys.exit(output.returncode) - except Exception as e: - logging.error("Failed to run kube-burner, error: %s" % (e)) - sys.exit(1) diff --git a/kraken/prometheus/__init__.py b/kraken/prometheus/__init__.py index e69de29b..4c51a321 100644 --- a/kraken/prometheus/__init__.py +++ b/kraken/prometheus/__init__.py @@ -0,0 +1 @@ +from .client import * \ No newline at end of file diff --git a/kraken/prometheus/client.py b/kraken/prometheus/client.py index df8eb780..5ebab349 100644 --- a/kraken/prometheus/client.py +++ b/kraken/prometheus/client.py @@ -1,49 +1,30 @@ +import datetime +import os.path import urllib3 import logging -import prometheus_api_client import sys -import kraken.invoke.command as runcommand +import yaml +from krkn_lib.prometheus.krkn_prometheus import KrknPrometheus urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) +def alerts(prom_cli: KrknPrometheus, start_time, end_time, alert_profile): -# Initialize the client -def initialize_prom_client(distribution, prometheus_url, prometheus_bearer_token): - global prom_cli - prometheus_url, prometheus_bearer_token = instance(distribution, prometheus_url, prometheus_bearer_token) - if prometheus_url and prometheus_bearer_token: - bearer = "Bearer " + prometheus_bearer_token - headers = {"Authorization": bearer} - try: - prom_cli = prometheus_api_client.PrometheusConnect(url=prometheus_url, headers=headers, disable_ssl=True) - except Exception as e: - logging.error("Not able to initialize the client %s" % e) - sys.exit(1) - else: - prom_cli = None - + if alert_profile is None or os.path.exists(alert_profile) is False: + logging.error(f"{alert_profile} alert profile does not exist") + sys.exit(1) -# Process custom prometheus query -def process_prom_query(query): - if prom_cli: - try: - return prom_cli.custom_query(query=query, params=None) - except Exception as e: - logging.error("Failed to get the metrics: %s" % e) + with open(alert_profile) as profile: + profile_yaml = yaml.safe_load(profile) + if not isinstance(profile_yaml, list): + logging.error(f"{alert_profile} wrong file format, alert profile must be " + f"a valid yaml file containing a list of items with 3 properties: " + f"expr, description, severity" ) sys.exit(1) - else: - logging.info("Skipping the prometheus query as the prometheus client couldn't " "be initialized\n") -# Get prometheus details -def instance(distribution, prometheus_url, prometheus_bearer_token): - if distribution == "openshift" and not prometheus_url: - url = runcommand.invoke( - r"""oc get routes -n openshift-monitoring -o=jsonpath='{.items[?(@.metadata.name=="prometheus-k8s")].spec.host}'""" # noqa - ) - prometheus_url = "https://" + url - if distribution == "openshift" and not prometheus_bearer_token: - prometheus_bearer_token = runcommand.invoke( - "oc create token -n openshift-monitoring prometheus-k8s --duration=12h " - "|| oc -n openshift-monitoring sa get-token prometheus-k8s " - "|| oc sa new-token -n openshift-monitoring prometheus-k8s" - ) - 
return prometheus_url, prometheus_bearer_token + for alert in profile_yaml: + if list(alert.keys()).sort() != ["expr", "description", "severity"].sort(): + logging.error(f"wrong alert {alert}, skipping") + + prom_cli.process_alert(alert, + datetime.datetime.fromtimestamp(start_time), + datetime.datetime.fromtimestamp(end_time)) \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index a06d863d..7cc9eea5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -19,14 +19,13 @@ ibm_cloud_sdk_core ibm_vpc itsdangerous==2.0.1 jinja2==3.0.3 -krkn-lib>=1.4.5 +krkn-lib >= 1.4.5 kubernetes lxml >= 4.3.0 oauth2client>=4.1.3 openshift-client paramiko podman-compose -prometheus_api_client pyVmomi >= 6.7 pyfiglet pytest diff --git a/run_kraken.py b/run_kraken.py index b62aed2e..afcb05db 100644 --- a/run_kraken.py +++ b/run_kraken.py @@ -1,4 +1,5 @@ #!/usr/bin/env python +import datetime import json import os import sys @@ -8,6 +9,7 @@ import pyfiglet import uuid import time +from krkn_lib.prometheus.krkn_prometheus import KrknPrometheus import kraken.time_actions.common_time_functions as time_actions import kraken.performance_dashboards.setup as performance_dashboards import kraken.pod_scenarios.setup as pod_scenarios @@ -15,16 +17,14 @@ import kraken.shut_down.common_shut_down_func as shut_down import kraken.node_actions.run as nodeaction import kraken.managedcluster_scenarios.run as managedcluster_scenarios -import kraken.kube_burner.client as kube_burner import kraken.zone_outage.actions as zone_outages import kraken.application_outage.actions as application_outage import kraken.pvc.pvc_scenario as pvc_scenario import kraken.network_chaos.actions as network_chaos import kraken.arcaflow_plugin as arcaflow_plugin +import kraken.prometheus as prometheus_plugin import server as server -import kraken.prometheus.client as promcli from kraken import plugins - from krkn_lib.k8s import KrknKubernetes from krkn_lib.ocp import KrknOpenshift from krkn_lib.telemetry.k8s import KrknTelemetryKubernetes @@ -33,11 +33,7 @@ from krkn_lib.utils import SafeLogger from krkn_lib.utils.functions import get_yaml_item_value -KUBE_BURNER_URL = ( - "https://github.com/cloud-bulldozer/kube-burner/" - "releases/download/v{version}/kube-burner-{version}-Linux-x86_64.tar.gz" -) -KUBE_BURNER_VERSION = "1.7.0" + # Main function @@ -84,21 +80,7 @@ def main(cfg): config["performance_monitoring"], "repo", "https://github.com/cloud-bulldozer/performance-dashboards.git" ) - capture_metrics = get_yaml_item_value( - config["performance_monitoring"], "capture_metrics", False - ) - kube_burner_url = get_yaml_item_value( - config["performance_monitoring"], "kube_burner_binary_url", - KUBE_BURNER_URL.format(version=KUBE_BURNER_VERSION), - ) - config_path = get_yaml_item_value( - config["performance_monitoring"], "config_path", - "config/kube_burner.yaml" - ) - metrics_profile = get_yaml_item_value( - config["performance_monitoring"], "metrics_profile_path", - "config/metrics-aggregated.yaml" - ) + prometheus_url = config["performance_monitoring"].get("prometheus_url") prometheus_bearer_token = config["performance_monitoring"].get( "prometheus_bearer_token" @@ -147,9 +129,6 @@ def main(cfg): except: kubecli.initialize_clients(None) - # KrknTelemetry init - telemetry_k8s = KrknTelemetryKubernetes(safe_logger, kubecli) - telemetry_ocp = KrknTelemetryOpenshift(safe_logger, ocpcli) # find node kraken might be running on @@ -179,11 +158,20 @@ def main(cfg): cv = "" if config["kraken"]["distribution"] == "openshift": cv = 
ocpcli.get_clusterversion_string() + if prometheus_url is None: + connection_data = ocpcli.get_prometheus_api_connection_data() + prometheus_url = connection_data.endpoint + prometheus_bearer_token = connection_data.token if cv != "": logging.info(cv) else: logging.info("Cluster version CRD not detected, skipping") + # KrknTelemetry init + telemetry_k8s = KrknTelemetryKubernetes(safe_logger, kubecli) + telemetry_ocp = KrknTelemetryOpenshift(safe_logger, ocpcli) + prometheus = KrknPrometheus(prometheus_url, prometheus_bearer_token) + logging.info("Server URL: %s" % kubecli.get_host()) # Deploy performance dashboards @@ -351,9 +339,10 @@ def main(cfg): # Check for critical alerts when enabled if check_critical_alerts: logging.info("Checking for critical alerts firing post choas") - promcli.initialize_prom_client(distribution, prometheus_url, prometheus_bearer_token) + + ##PROM query = r"""ALERTS{severity="critical"}""" - critical_alerts = promcli.process_prom_query(query) + critical_alerts = prometheus.process_prom_query_in_range(query, datetime.datetime.fromtimestamp(start_time)) critical_alerts_count = len(critical_alerts) if critical_alerts_count > 0: logging.error("Critical alerts are firing: %s", critical_alerts) @@ -401,33 +390,13 @@ def main(cfg): else: logging.info("telemetry collection disabled, skipping.") - # Capture the end time - - - # Capture metrics for the run - if capture_metrics: - logging.info("Capturing metrics") - kube_burner.setup(kube_burner_url) - kube_burner.scrape_metrics( - distribution, - run_uuid, - prometheus_url, - prometheus_bearer_token, - start_time, - end_time, - config_path, - metrics_profile, - ) # Check for the alerts specified if enable_alerts: logging.info("Alerts checking is enabled") - kube_burner.setup(kube_burner_url) if alert_profile: - kube_burner.alerts( - distribution, - prometheus_url, - prometheus_bearer_token, + prometheus_plugin.alerts( + prometheus, start_time, end_time, alert_profile, diff --git a/scenarios/arcaflow/cpu-hog/config.yaml b/scenarios/arcaflow/cpu-hog/config.yaml index a03beb4c..e6bcce96 100644 --- a/scenarios/arcaflow/cpu-hog/config.yaml +++ b/scenarios/arcaflow/cpu-hog/config.yaml @@ -1,6 +1,10 @@ --- deployer: - connection: {} + connection: + cacert: '' + cert: '' + host: https://api.tsebasti-lab.aws.rhperfscale.org:6443 + key: '' type: kubernetes log: level: debug diff --git a/scenarios/arcaflow/cpu-hog/input.yaml b/scenarios/arcaflow/cpu-hog/input.yaml index 3bcbece9..2e359172 100644 --- a/scenarios/arcaflow/cpu-hog/input.yaml +++ b/scenarios/arcaflow/cpu-hog/input.yaml @@ -2,13 +2,7 @@ input_list: - cpu_count: 1 cpu_load_percentage: 80 cpu_method: all - duration: 30s - node_selector: {} - # node selector example - # node_selector: - # kubernetes.io/hostname: master - kubeconfig: "" + duration: 1s + kubeconfig: '' namespace: default - -# duplicate this section to run simultaneous stressors in the same run - + node_selector: {}
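
A quick way to read the refactor above: alert evaluation now goes through krkn-lib's `KrknPrometheus` client instead of shelling out to the kube-burner binary. The sketch below is illustrative only and relies on the signatures visible in this patch (`KrknPrometheus(url, token)` and `kraken.prometheus.alerts(prom_cli, start_time, end_time, alert_profile)`); the endpoint and token values are placeholders.

```
# Sketch of the new alert flow, assuming the signatures shown in this patch.
import time

from krkn_lib.prometheus.krkn_prometheus import KrknPrometheus

import kraken.prometheus as prometheus_plugin

# Placeholder connection details: on OpenShift run_kraken.py resolves these via
# ocpcli.get_prometheus_api_connection_data() (endpoint/token); on plain
# Kubernetes they must be set under performance_monitoring in config.yaml.
prometheus_url = "https://prometheus.example.com"   # hypothetical endpoint
prometheus_bearer_token = "<bearer-token>"          # hypothetical token

end_time = int(time.time())
start_time = end_time - 600  # e.g. the chaos run started 10 minutes ago

prom_cli = KrknPrometheus(prometheus_url, prometheus_bearer_token)

# Evaluates every expr in config/alerts.yaml over the run window and reports
# each description/severity through KrknPrometheus.process_alert().
prometheus_plugin.alerts(prom_cli, start_time, end_time, "config/alerts.yaml")
```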
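
The post-chaos critical-alert check in run_kraken.py follows the same pattern: the old `prometheus_api_client` query is replaced by `KrknPrometheus.process_prom_query_in_range()`, evaluated from the chaos start time. Below is a hedged sketch of that step, factored into a helper for readability; the helper name and exit code are illustrative and not part of the patch.

```
import datetime
import logging
import sys

from krkn_lib.prometheus.krkn_prometheus import KrknPrometheus


def check_critical_alerts(prometheus: KrknPrometheus, start_time: float) -> None:
    # Same PromQL query run_kraken.py uses after this patch.
    query = r"""ALERTS{severity="critical"}"""
    critical_alerts = prometheus.process_prom_query_in_range(
        query, datetime.datetime.fromtimestamp(start_time)
    )
    if len(critical_alerts) > 0:
        logging.error("Critical alerts are firing: %s", critical_alerts)
        sys.exit(2)  # illustrative exit code; run_kraken.py decides the actual rc
    logging.info("No critical alerts fired during the run")
```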
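
One caveat worth flagging in the new `kraken/prometheus/client.py`: `list(alert.keys()).sort()` returns `None` (Python's `list.sort()` sorts in place), so the key check as written never rejects a malformed profile entry. A minimal sketch of a check that does what the error message promises, assuming the same `expr`/`description`/`severity` schema used by `config/alerts.yaml`:

```
REQUIRED_KEYS = {"expr", "description", "severity"}


def is_valid_alert(alert) -> bool:
    # Compare key sets instead of the None returned by list.sort().
    return isinstance(alert, dict) and set(alert.keys()) == REQUIRED_KEYS
```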