From e3948ed518f79ccc5d6fdd4830dc06a21a3bc3ad Mon Sep 17 00:00:00 2001 From: Tullio Sebastiani Date: Fri, 9 Jun 2023 18:15:53 +0200 Subject: [PATCH 01/23] adapted config.yaml to the new feature --- config/config.yaml | 74 ++++++++++++++++++++++++---------------------- 1 file changed, 39 insertions(+), 35 deletions(-) diff --git a/config/config.yaml b/config/config.yaml index 4f0f779f..6680e137 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -7,41 +7,40 @@ kraken: signal_address: 0.0.0.0 # Signal listening address port: 8081 # Signal port chaos_scenarios: # List of policies/chaos scenarios to load - - arcaflow_scenarios: - - scenarios/arcaflow/cpu-hog/input.yaml - - scenarios/arcaflow/memory-hog/input.yaml - - container_scenarios: # List of chaos pod scenarios to load - - - scenarios/openshift/container_etcd.yml - - plugin_scenarios: - - scenarios/openshift/etcd.yml - - scenarios/openshift/regex_openshift_pod_kill.yml - - scenarios/openshift/vmware_node_scenarios.yml - - scenarios/openshift/ibmcloud_node_scenarios.yml - - scenarios/openshift/network_chaos_ingress.yml - - scenarios/openshift/pod_network_outage.yml - - scenarios/openshift/pod_network_shaping.yml - - node_scenarios: # List of chaos node scenarios to load - - scenarios/openshift/node_scenarios_example.yml - - plugin_scenarios: - - scenarios/openshift/openshift-apiserver.yml - - scenarios/openshift/openshift-kube-apiserver.yml - - time_scenarios: # List of chaos time scenarios to load - - scenarios/openshift/time_scenarios_example.yml - - cluster_shut_down_scenarios: - - - scenarios/openshift/cluster_shut_down_scenario.yml - - scenarios/openshift/post_action_shut_down.py - - namespace_scenarios: - - - scenarios/openshift/regex_namespace.yaml - - - scenarios/openshift/ingress_namespace.yaml - - scenarios/openshift/post_action_namespace.py - - zone_outages: - - scenarios/openshift/zone_outage.yaml - - application_outages: - - scenarios/openshift/app_outage.yaml - - pvc_scenarios: - - 
scenarios/openshift/pvc_scenario.yaml - - network_chaos: - - scenarios/openshift/network_chaos.yaml + - arcaflow_scenarios: + - scenarios/arcaflow/cpu-hog/input.yaml + - scenarios/arcaflow/memory-hog/input.yaml + - container_scenarios: # List of chaos pod scenarios to load + - - scenarios/openshift/container_etcd.yml + - plugin_scenarios: + - scenarios/openshift/etcd.yml + - scenarios/openshift/regex_openshift_pod_kill.yml + - scenarios/openshift/vmware_node_scenarios.yml + - scenarios/openshift/ibmcloud_node_scenarios.yml + - scenarios/openshift/network_chaos_ingress.yml + - scenarios/openshift/pod_network_outage.yml + - node_scenarios: # List of chaos node scenarios to load + - scenarios/openshift/node_scenarios_example.yml + - plugin_scenarios: + - scenarios/openshift/openshift-apiserver.yml + - scenarios/openshift/openshift-kube-apiserver.yml + - time_scenarios: # List of chaos time scenarios to load + - scenarios/openshift/time_scenarios_example.yml + - cluster_shut_down_scenarios: + - - scenarios/openshift/cluster_shut_down_scenario.yml + - scenarios/openshift/post_action_shut_down.py + - namespace_scenarios: + - - scenarios/openshift/regex_namespace.yaml + - - scenarios/openshift/ingress_namespace.yaml + - scenarios/openshift/post_action_namespace.py + - zone_outages: + - scenarios/openshift/zone_outage.yaml + - application_outages: + - scenarios/openshift/app_outage.yaml + - pvc_scenarios: + - scenarios/openshift/pvc_scenario.yaml + - network_chaos: + - scenarios/openshift/network_chaos.yaml cerberus: cerberus_enabled: False # Enable it when cerberus is previously installed @@ -65,3 +64,8 @@ tunings: wait_duration: 60 # Duration to wait between each chaos scenario iterations: 1 # Number of times to execute the scenarios daemon_mode: False # Iterations are set to infinity which means that the kraken will cause chaos forever +telemetry: + enabled: True + api_url: https://ulnmf9xv7j.execute-api.us-west-2.amazonaws.com/production + username: username + 
password: password \ No newline at end of file From d351ec5575f32867b85fefa6b139b0db3c5dc9c1 Mon Sep 17 00:00:00 2001 From: Tullio Sebastiani Date: Fri, 9 Jun 2023 18:16:19 +0200 Subject: [PATCH 02/23] temporarly pointing requirement.txt to the lib feature branch --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 5ec8629b..237a4eca 100644 --- a/requirements.txt +++ b/requirements.txt @@ -37,4 +37,4 @@ prometheus_api_client ibm_cloud_sdk_core ibm_vpc pytest -krkn-lib-kubernetes > 0.1.1 +krkn-lib-kubernetes@git+https://github.com/redhat-chaos/krkn-lib-kubernetes.git@krkn_telemetry \ No newline at end of file From 048ccad37d7b7aa82e114313119a28b7736df3db Mon Sep 17 00:00:00 2001 From: Tullio Sebastiani Date: Mon, 12 Jun 2023 09:30:24 +0200 Subject: [PATCH 03/23] run_kraken.py + arcaflow scenarios refactoring typo --- kraken/arcaflow_plugin/arcaflow_plugin.py | 27 ++++++++++++++++------- run_kraken.py | 12 +++++----- 2 files changed, 26 insertions(+), 13 deletions(-) diff --git a/kraken/arcaflow_plugin/arcaflow_plugin.py b/kraken/arcaflow_plugin/arcaflow_plugin.py index 5c8ee707..52c33baf 100644 --- a/kraken/arcaflow_plugin/arcaflow_plugin.py +++ b/kraken/arcaflow_plugin/arcaflow_plugin.py @@ -1,3 +1,5 @@ +import time + import arcaflow import os import yaml @@ -6,22 +8,31 @@ from pathlib import Path from typing import List from .context_auth import ContextAuth +from krkn_lib_kubernetes import ScenarioTelemetry, KrknTelemetry -def run(scenarios_list: List[str], kubeconfig_path: str): +def run(scenarios_list: List[str], kubeconfig_path: str, telemetry: KrknTelemetry) -> (list[str], list[ScenarioTelemetry]): + scenario_telemetries: list[ScenarioTelemetry] = [] + failed_post_scenarios = [] for scenario in scenarios_list: + scenario_telemetry = ScenarioTelemetry() + scenario_telemetry.scenario = scenario + scenario_telemetry.startTimeStamp = time.time() + 
telemetry.set_parameters_base64(scenario_telemetry,scenario) engine_args = build_args(scenario) - run_workflow(engine_args, kubeconfig_path) + status_code = run_workflow(engine_args, kubeconfig_path) + scenario_telemetry.endTimeStamp = time.time() + scenario_telemetry.exitStatus = status_code + scenario_telemetries.append(scenario_telemetry) + if status_code != 0: + failed_post_scenarios.append(scenario) + return failed_post_scenarios, scenario_telemetries -def run_workflow(engine_args: arcaflow.EngineArgs, kubeconfig_path: str): +def run_workflow(engine_args: arcaflow.EngineArgs, kubeconfig_path: str) -> int: set_arca_kubeconfig(engine_args, kubeconfig_path) exit_status = arcaflow.run(engine_args) - if exit_status != 0: - logging.error( - f"failed to run arcaflow scenario {engine_args.input}" - ) - sys.exit(exit_status) + return exit_status def build_args(input_file: str) -> arcaflow.EngineArgs: diff --git a/run_kraken.py b/run_kraken.py index f48dbd17..2a5218ab 100644 --- a/run_kraken.py +++ b/run_kraken.py @@ -25,7 +25,7 @@ import server as server import kraken.prometheus.client as promcli from kraken import plugins -from krkn_lib_kubernetes import KrknLibKubernetes +from krkn_lib_kubernetes import KrknLibKubernetes, KrknTelemetry, ChaosRunTelemetry KUBE_BURNER_URL = ( "https://github.com/cloud-bulldozer/kube-burner/" @@ -171,7 +171,7 @@ def main(cfg): # Capture the start time start_time = int(time.time()) litmus_installed = False - + chaos_telemetry = ChaosRunTelemetry() # Loop to run the chaos starts here while int(iteration) < iterations and run_signal != "STOP": # Inject chaos scenarios specified in the config @@ -203,9 +203,10 @@ def main(cfg): ) sys.exit(1) elif scenario_type == "arcaflow_scenarios": - failed_post_scenarios = arcaflow_plugin.run( - scenarios_list, kubeconfig_path + failed_post_scenarios, scenario_telemetries = arcaflow_plugin.run( + scenarios_list, kubeconfig_path, telemetry ) + chaos_telemetry.scenarios.extend(scenario_telemetries) elif 
scenario_type == "plugin_scenarios": failed_post_scenarios = plugins.run( @@ -352,7 +353,8 @@ def main(cfg): iteration += 1 logging.info("") - + # send telemetry + telemetry.send_telemetry(config["telemetry"], chaos_telemetry) # Capture the end time end_time = int(time.time()) From 77e81a3994abc3c8e26e7746dd70a8b306c6a56a Mon Sep 17 00:00:00 2001 From: Tullio Sebastiani Date: Wed, 14 Jun 2023 11:41:10 +0200 Subject: [PATCH 04/23] plugin scenario --- kraken/plugins/__init__.py | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/kraken/plugins/__init__.py b/kraken/plugins/__init__.py index 9c1b7768..7fb28dfb 100644 --- a/kraken/plugins/__init__.py +++ b/kraken/plugins/__init__.py @@ -13,6 +13,7 @@ from kraken.plugins.network.ingress_shaping import network_chaos from kraken.plugins.pod_network_outage.pod_network_outage_plugin import pod_outage from kraken.plugins.pod_network_outage.pod_network_outage_plugin import pod_egress_shaping +from krkn_lib_kubernetes import ScenarioTelemetry, KrknTelemetry @dataclasses.dataclass @@ -225,16 +226,26 @@ def json_schema(self): ) -def run(scenarios: List[str], kubeconfig_path: str, kraken_config: str, failed_post_scenarios: List[str], wait_duration: int) -> List[str]: +def run(scenarios: List[str], kubeconfig_path: str, kraken_config: str, failed_post_scenarios: List[str], wait_duration: int, telemetry: KrknTelemetry) -> (List[str], list[ScenarioTelemetry]): + scenario_telemetries: list[ScenarioTelemetry] = [] for scenario in scenarios: + scenario_telemetry = ScenarioTelemetry() + scenario_telemetry.scenario = scenario + scenario_telemetry.startTimeStamp = time.time() + telemetry.set_parameters_base64(scenario_telemetry, scenario) logging.info('scenario '+ str(scenario)) try: PLUGINS.run(scenario, kubeconfig_path, kraken_config) except Exception as e: + scenario_telemetry.exitStatus = 1 failed_post_scenarios.append(scenario) - logging.error("Error while running {}: {}".format(scenario, e)) - 
return failed_post_scenarios - logging.info("Waiting for the specified duration: %s" % (wait_duration)) - time.sleep(wait_duration) - - return failed_post_scenarios + telemetry.log_exception(scenario) + else: + scenario_telemetry.exitStatus = 0 + logging.info("Waiting for the specified duration: %s" % (wait_duration)) + scenario_telemetries.append(scenario_telemetry) + time.sleep(wait_duration) + scenario_telemetries.append(scenario_telemetry) + scenario_telemetry.endTimeStamp = time.time() + + return failed_post_scenarios, scenario_telemetries From 87fa6d1501063484fc0256840f684dd28578f158 Mon Sep 17 00:00:00 2001 From: Tullio Sebastiani Date: Wed, 14 Jun 2023 11:48:14 +0200 Subject: [PATCH 05/23] node scenarios return failed scenarios --- kraken/node_actions/aws_node_scenarios.py | 48 ++++++++++++++----- kraken/node_actions/az_node_scenarios.py | 32 +++++++++---- kraken/node_actions/gcp_node_scenarios.py | 36 ++++++++++---- .../node_actions/openstack_node_scenarios.py | 36 ++++++++++---- kraken/node_actions/run.py | 34 +++++++++---- 5 files changed, 140 insertions(+), 46 deletions(-) diff --git a/kraken/node_actions/aws_node_scenarios.py b/kraken/node_actions/aws_node_scenarios.py index ded183e5..d53ff5a3 100644 --- a/kraken/node_actions/aws_node_scenarios.py +++ b/kraken/node_actions/aws_node_scenarios.py @@ -27,7 +27,9 @@ def start_instances(self, instance_id): logging.error( "Failed to start node instance %s. Encountered following " "exception: %s." % (instance_id, e) ) - sys.exit(1) + # removed_exit + # sys.exit(1) + raise RuntimeError() # Stop the node instance def stop_instances(self, instance_id): @@ -36,7 +38,9 @@ def stop_instances(self, instance_id): logging.info("EC2 instance: " + str(instance_id) + " stopped") except Exception as e: logging.error("Failed to stop node instance %s. Encountered following " "exception: %s." 
% (instance_id, e)) - sys.exit(1) + # removed_exit + # sys.exit(1) + raise RuntimeError() # Terminate the node instance def terminate_instances(self, instance_id): @@ -47,7 +51,9 @@ def terminate_instances(self, instance_id): logging.error( "Failed to terminate node instance %s. Encountered following " "exception: %s." % (instance_id, e) ) - sys.exit(1) + # removed_exit + # sys.exit(1) + raise RuntimeError() # Reboot the node instance def reboot_instances(self, instance_id): @@ -58,7 +64,9 @@ def reboot_instances(self, instance_id): logging.error( "Failed to reboot node instance %s. Encountered following " "exception: %s." % (instance_id, e) ) - sys.exit(1) + # removed_exit + # sys.exit(1) + raise RuntimeError() # Below functions poll EC2.Client.describe_instances() every 15 seconds # until a successful state is reached. An error is returned after 40 failed checks @@ -102,7 +110,9 @@ def create_default_network_acl(self, vpc_id): "Failed to create the default network_acl: %s" "Make sure you have aws cli configured on the host and set for the region of your vpc/subnet" % (e) ) - sys.exit(1) + # removed_exit + # sys.exit(1) + raise RuntimeError() return acl_id # Replace network acl association @@ -114,7 +124,9 @@ def replace_network_acl_association(self, association_id, acl_id): new_association_id = status["NewAssociationId"] except Exception as e: logging.error("Failed to replace network acl association: %s" % (e)) - sys.exit(1) + # removed_exit + # sys.exit(1) + raise RuntimeError() return new_association_id # Describe network acl @@ -131,7 +143,9 @@ def describe_network_acls(self, vpc_id, subnet_id): "Failed to describe network acl: %s." 
"Make sure you have aws cli configured on the host and set for the region of your vpc/subnet" % (e) ) - sys.exit(1) + # removed_exit + # sys.exit(1) + raise RuntimeError() associations = response["NetworkAcls"][0]["Associations"] # grab the current network_acl in use original_acl_id = response["NetworkAcls"][0]["Associations"][0]["NetworkAclId"] @@ -148,7 +162,9 @@ def delete_network_acl(self, acl_id): "Make sure you have aws cli configured on the host and set for the region of your vpc/subnet" % (acl_id, e) ) - sys.exit(1) + # removed_exit + # sys.exit(1) + raise RuntimeError() # krkn_lib_kubernetes class aws_node_scenarios(abstract_node_scenarios): @@ -173,7 +189,9 @@ def node_start_scenario(self, instance_kill_count, node, timeout): "Failed to start node instance. Encountered following " "exception: %s. Test Failed" % (e) ) logging.error("node_start_scenario injection failed!") - sys.exit(1) + # removed_exit + # sys.exit(1) + raise RuntimeError() # Node scenario to stop the node def node_stop_scenario(self, instance_kill_count, node, timeout): @@ -189,7 +207,9 @@ def node_stop_scenario(self, instance_kill_count, node, timeout): except Exception as e: logging.error("Failed to stop node instance. Encountered following exception: %s. " "Test Failed" % (e)) logging.error("node_stop_scenario injection failed!") - sys.exit(1) + # removed_exit + # sys.exit(1) + raise RuntimeError() # Node scenario to terminate the node def node_termination_scenario(self, instance_kill_count, node, timeout): @@ -213,7 +233,9 @@ def node_termination_scenario(self, instance_kill_count, node, timeout): "Failed to terminate node instance. Encountered following exception:" " %s. 
Test Failed" % (e) ) logging.error("node_termination_scenario injection failed!") - sys.exit(1) + # removed_exit + # sys.exit(1) + raise RuntimeError() # Node scenario to reboot the node def node_reboot_scenario(self, instance_kill_count, node, timeout): @@ -232,4 +254,6 @@ def node_reboot_scenario(self, instance_kill_count, node, timeout): "Failed to reboot node instance. Encountered following exception:" " %s. Test Failed" % (e) ) logging.error("node_reboot_scenario injection failed!") - sys.exit(1) + # removed_exit + # sys.exit(1) + raise RuntimeError() diff --git a/kraken/node_actions/az_node_scenarios.py b/kraken/node_actions/az_node_scenarios.py index 5a093254..adffb79d 100644 --- a/kraken/node_actions/az_node_scenarios.py +++ b/kraken/node_actions/az_node_scenarios.py @@ -39,7 +39,9 @@ def start_instances(self, group_name, vm_name): logging.info("vm name " + str(vm_name) + " started") except Exception as e: logging.error("Failed to start node instance %s. Encountered following " "exception: %s." % (vm_name, e)) - sys.exit(1) + # removed_exit + # sys.exit(1) + raise RuntimeError() # Stop the node instance def stop_instances(self, group_name, vm_name): @@ -48,7 +50,9 @@ def stop_instances(self, group_name, vm_name): logging.info("vm name " + str(vm_name) + " stopped") except Exception as e: logging.error("Failed to stop node instance %s. Encountered following " "exception: %s." % (vm_name, e)) - sys.exit(1) + # removed_exit + # sys.exit(1) + raise RuntimeError() # Terminate the node instance def terminate_instances(self, group_name, vm_name): @@ -59,7 +63,9 @@ def terminate_instances(self, group_name, vm_name): logging.error( "Failed to terminate node instance %s. Encountered following " "exception: %s." 
% (vm_name, e) ) - sys.exit(1) + # removed_exit + # sys.exit(1) + raise RuntimeError() # Reboot the node instance def reboot_instances(self, group_name, vm_name): @@ -68,7 +74,9 @@ def reboot_instances(self, group_name, vm_name): logging.info("vm name " + str(vm_name) + " rebooted") except Exception as e: logging.error("Failed to reboot node instance %s. Encountered following " "exception: %s." % (vm_name, e)) - sys.exit(1) + # removed_exit + # sys.exit(1) + raise RuntimeError() def get_vm_status(self, resource_group, vm_name): statuses = self.compute_client.virtual_machines.instance_view(resource_group, vm_name).statuses @@ -145,7 +153,9 @@ def node_start_scenario(self, instance_kill_count, node, timeout): "Failed to start node instance. Encountered following " "exception: %s. Test Failed" % (e) ) logging.error("node_start_scenario injection failed!") - sys.exit(1) + # removed_exit + # sys.exit(1) + raise RuntimeError() # Node scenario to stop the node def node_stop_scenario(self, instance_kill_count, node, timeout): @@ -161,7 +171,9 @@ def node_stop_scenario(self, instance_kill_count, node, timeout): except Exception as e: logging.error("Failed to stop node instance. Encountered following exception: %s. " "Test Failed" % e) logging.error("node_stop_scenario injection failed!") - sys.exit(1) + # removed_exit + # sys.exit(1) + raise RuntimeError() # Node scenario to terminate the node def node_termination_scenario(self, instance_kill_count, node, timeout): @@ -185,7 +197,9 @@ def node_termination_scenario(self, instance_kill_count, node, timeout): "Failed to terminate node instance. Encountered following exception:" " %s. 
Test Failed" % (e) ) logging.error("node_termination_scenario injection failed!") - sys.exit(1) + # removed_exit + # sys.exit(1) + raise RuntimeError() # Node scenario to reboot the node def node_reboot_scenario(self, instance_kill_count, node, timeout): @@ -204,4 +218,6 @@ def node_reboot_scenario(self, instance_kill_count, node, timeout): "Failed to reboot node instance. Encountered following exception:" " %s. Test Failed" % (e) ) logging.error("node_reboot_scenario injection failed!") - sys.exit(1) + # removed_exit + # sys.exit(1) + raise RuntimeError() \ No newline at end of file diff --git a/kraken/node_actions/gcp_node_scenarios.py b/kraken/node_actions/gcp_node_scenarios.py index db7a11d7..64711a2a 100644 --- a/kraken/node_actions/gcp_node_scenarios.py +++ b/kraken/node_actions/gcp_node_scenarios.py @@ -45,7 +45,9 @@ def start_instances(self, zone, instance_id): logging.error( "Failed to start node instance %s. Encountered following " "exception: %s." % (instance_id, e) ) - sys.exit(1) + # removed_exit + # sys.exit(1) + raise RuntimeError() # Stop the node instance def stop_instances(self, zone, instance_id): @@ -54,7 +56,9 @@ def stop_instances(self, zone, instance_id): logging.info("vm name " + str(instance_id) + " stopped") except Exception as e: logging.error("Failed to stop node instance %s. Encountered following " "exception: %s." % (instance_id, e)) - sys.exit(1) + # removed_exit + # sys.exit(1) + raise RuntimeError() # Start the node instance def suspend_instances(self, zone, instance_id): @@ -65,7 +69,9 @@ def suspend_instances(self, zone, instance_id): logging.error( "Failed to suspend node instance %s. Encountered following " "exception: %s." % (instance_id, e) ) - sys.exit(1) + # removed_exit + # sys.exit(1) + raise RuntimeError() # Terminate the node instance def terminate_instances(self, zone, instance_id): @@ -76,7 +82,9 @@ def terminate_instances(self, zone, instance_id): logging.error( "Failed to start node instance %s. 
Encountered following " "exception: %s." % (instance_id, e) ) - sys.exit(1) + # removed_exit + # sys.exit(1) + raise RuntimeError() # Reboot the node instance def reboot_instances(self, zone, instance_id): @@ -87,7 +95,9 @@ def reboot_instances(self, zone, instance_id): logging.error( "Failed to start node instance %s. Encountered following " "exception: %s." % (instance_id, e) ) - sys.exit(1) + # removed_exit + # sys.exit(1) + raise RuntimeError() # Get instance status def get_instance_status(self, zone, instance_id, expected_status, timeout): @@ -156,7 +166,9 @@ def node_start_scenario(self, instance_kill_count, node, timeout): "Failed to start node instance. Encountered following " "exception: %s. Test Failed" % (e) ) logging.error("node_start_scenario injection failed!") - sys.exit(1) + # removed_exit + # sys.exit(1) + raise RuntimeError() # Node scenario to stop the node def node_stop_scenario(self, instance_kill_count, node, timeout): @@ -173,7 +185,9 @@ def node_stop_scenario(self, instance_kill_count, node, timeout): except Exception as e: logging.error("Failed to stop node instance. Encountered following exception: %s. " "Test Failed" % (e)) logging.error("node_stop_scenario injection failed!") - sys.exit(1) + # removed_exit + # sys.exit(1) + raise RuntimeError() # Node scenario to terminate the node def node_termination_scenario(self, instance_kill_count, node, timeout): @@ -197,7 +211,9 @@ def node_termination_scenario(self, instance_kill_count, node, timeout): "Failed to terminate node instance. Encountered following exception:" " %s. Test Failed" % e ) logging.error("node_termination_scenario injection failed!") - sys.exit(1) + # removed_exit + # sys.exit(1) + raise RuntimeError() # Node scenario to reboot the node def node_reboot_scenario(self, instance_kill_count, node, timeout): @@ -215,4 +231,6 @@ def node_reboot_scenario(self, instance_kill_count, node, timeout): "Failed to reboot node instance. Encountered following exception:" " %s. 
Test Failed" % (e) ) logging.error("node_reboot_scenario injection failed!") - sys.exit(1) + # removed_exit + # sys.exit(1) + raise RuntimeError() diff --git a/kraken/node_actions/openstack_node_scenarios.py b/kraken/node_actions/openstack_node_scenarios.py index 9e8bcdde..bae44e90 100644 --- a/kraken/node_actions/openstack_node_scenarios.py +++ b/kraken/node_actions/openstack_node_scenarios.py @@ -24,7 +24,9 @@ def start_instances(self, node): logging.info("Instance: " + str(node) + " started") except Exception as e: logging.error("Failed to start node instance %s. Encountered following " "exception: %s." % (node, e)) - sys.exit(1) + # removed_exit + # sys.exit(1) + raise RuntimeError() # Stop the node instance def stop_instances(self, node): @@ -33,7 +35,9 @@ def stop_instances(self, node): logging.info("Instance: " + str(node) + " stopped") except Exception as e: logging.error("Failed to stop node instance %s. Encountered following " "exception: %s." % (node, e)) - sys.exit(1) + # removed_exit + # sys.exit(1) + raise RuntimeError() # Reboot the node instance def reboot_instances(self, node): @@ -42,7 +46,9 @@ def reboot_instances(self, node): logging.info("Instance: " + str(node) + " rebooted") except Exception as e: logging.error("Failed to reboot node instance %s. Encountered following " "exception: %s." % (node, e)) - sys.exit(1) + # removed_exit + # sys.exit(1) + raise RuntimeError() # Wait until the node instance is running def wait_until_running(self, node, timeout): @@ -109,7 +115,9 @@ def node_start_scenario(self, instance_kill_count, node, timeout): "Failed to start node instance. Encountered following " "exception: %s. 
Test Failed" % (e) ) logging.error("node_start_scenario injection failed!") - sys.exit(1) + # removed_exit + # sys.exit(1) + raise RuntimeError() # Node scenario to stop the node def node_stop_scenario(self, instance_kill_count, node, timeout): @@ -125,7 +133,9 @@ def node_stop_scenario(self, instance_kill_count, node, timeout): except Exception as e: logging.error("Failed to stop node instance. Encountered following exception: %s. " "Test Failed" % (e)) logging.error("node_stop_scenario injection failed!") - sys.exit(1) + # removed_exit + # sys.exit(1) + raise RuntimeError() # Node scenario to reboot the node def node_reboot_scenario(self, instance_kill_count, node, timeout): @@ -144,7 +154,9 @@ def node_reboot_scenario(self, instance_kill_count, node, timeout): "Failed to reboot node instance. Encountered following exception:" " %s. Test Failed" % (e) ) logging.error("node_reboot_scenario injection failed!") - sys.exit(1) + # removed_exit + # sys.exit(1) + raise RuntimeError() # Node scenario to start the node def helper_node_start_scenario(self, instance_kill_count, node_ip, timeout): @@ -162,7 +174,9 @@ def helper_node_start_scenario(self, instance_kill_count, node_ip, timeout): "Failed to start node instance. Encountered following " "exception: %s. Test Failed" % (e) ) logging.error("helper_node_start_scenario injection failed!") - sys.exit(1) + # removed_exit + # sys.exit(1) + raise RuntimeError() # Node scenario to stop the node def helper_node_stop_scenario(self, instance_kill_count, node_ip, timeout): @@ -177,7 +191,9 @@ def helper_node_stop_scenario(self, instance_kill_count, node_ip, timeout): except Exception as e: logging.error("Failed to stop node instance. Encountered following exception: %s. 
" "Test Failed" % (e)) logging.error("helper_node_stop_scenario injection failed!") - sys.exit(1) + # removed_exit + # sys.exit(1) + raise RuntimeError() def helper_node_service_status(self, node_ip, service, ssh_private_key, timeout): try: @@ -188,4 +204,6 @@ def helper_node_service_status(self, node_ip, service, ssh_private_key, timeout) except Exception as e: logging.error("Failed to check service status. Encountered following exception:" " %s. Test Failed" % (e)) logging.error("helper_node_service_status injection failed!") - sys.exit(1) + # removed_exit + # sys.exit(1) + raise RuntimeError() diff --git a/kraken/node_actions/run.py b/kraken/node_actions/run.py index b37660e7..c7e0bf5c 100644 --- a/kraken/node_actions/run.py +++ b/kraken/node_actions/run.py @@ -13,7 +13,7 @@ from kraken.node_actions.docker_node_scenarios import docker_node_scenarios import kraken.node_actions.common_node_functions as common_node_functions import kraken.cerberus.setup as cerberus - +from krkn_lib_kubernetes import ScenarioTelemetry, KrknTelemetry node_general = False @@ -53,8 +53,14 @@ def get_node_scenario_object(node_scenario, kubecli: krkn_lib_kubernetes.KrknLib # Run defined scenarios # krkn_lib_kubernetes -def run(scenarios_list, config, wait_duration, kubecli: krkn_lib_kubernetes.KrknLibKubernetes): +def run(scenarios_list, config, wait_duration, kubecli: krkn_lib_kubernetes.KrknLibKubernetes, telemetry: KrknTelemetry) -> (list[str], list[ScenarioTelemetry]): + scenario_telemetries: list[ScenarioTelemetry] = [] + failed_scenarios = [] for node_scenario_config in scenarios_list: + scenario_telemetry = ScenarioTelemetry() + scenario_telemetry.scenario = node_scenario_config + scenario_telemetry.startTimeStamp = time.time() + telemetry.set_parameters_base64(scenario_telemetry, node_scenario_config) with open(node_scenario_config, "r") as f: node_scenario_config = yaml.full_load(f) for node_scenario in node_scenario_config["node_scenarios"]: @@ -62,12 +68,24 @@ def 
run(scenarios_list, config, wait_duration, kubecli: krkn_lib_kubernetes.Krkn if node_scenario["actions"]: for action in node_scenario["actions"]: start_time = int(time.time()) - inject_node_scenario(action, node_scenario, node_scenario_object, kubecli) - logging.info("Waiting for the specified duration: %s" % (wait_duration)) - time.sleep(wait_duration) - end_time = int(time.time()) - cerberus.get_status(config, start_time, end_time) - logging.info("") + try: + inject_node_scenario(action, node_scenario, node_scenario_object, kubecli) + logging.info("Waiting for the specified duration: %s" % (wait_duration)) + time.sleep(wait_duration) + end_time = int(time.time()) + cerberus.get_status(config, start_time, end_time) + logging.info("") + except (RuntimeError, Exception) as e: + scenario_telemetry.exitStatus = 1 + failed_scenarios.append(node_scenario_config) + telemetry.log_exception(node_scenario_config) + else: + scenario_telemetry.exitStatus = 0 + + scenario_telemetry.endTimeStamp = time.time() + scenario_telemetries.append(scenario_telemetry) + + return failed_scenarios, scenario_telemetries # Inject the specified node scenario From 4bf5afb21de9b7bda7fafacf78c9df68f743a175 Mon Sep 17 00:00:00 2001 From: Tullio Sebastiani Date: Wed, 14 Jun 2023 11:56:05 +0200 Subject: [PATCH 06/23] container scenarios fix --- kraken/pod_scenarios/setup.py | 80 ++++++++++++++++++++++++----------- 1 file changed, 56 insertions(+), 24 deletions(-) diff --git a/kraken/pod_scenarios/setup.py b/kraken/pod_scenarios/setup.py index 2eeb4db5..6071c327 100644 --- a/kraken/pod_scenarios/setup.py +++ b/kraken/pod_scenarios/setup.py @@ -10,7 +10,7 @@ import yaml import sys import random - +from krkn_lib_kubernetes import ScenarioTelemetry, KrknTelemetry # Run pod based scenarios def run(kubeconfig_path, scenarios_list, config, failed_post_scenarios, wait_duration): @@ -67,8 +67,22 @@ def run(kubeconfig_path, scenarios_list, config, failed_post_scenarios, wait_dur return failed_post_scenarios 
# krkn_lib_kubernetes -def container_run(kubeconfig_path, scenarios_list, config, failed_post_scenarios, wait_duration, kubecli: krkn_lib_kubernetes.KrknLibKubernetes): +def container_run(kubeconfig_path, + scenarios_list, + config, + failed_post_scenarios, + wait_duration, + kubecli: krkn_lib_kubernetes.KrknLibKubernetes, + telemetry: KrknTelemetry) -> (list[str], list[ScenarioTelemetry]): + + failed_scenarios = [] + scenario_telemetries: list[ScenarioTelemetry] = [] + for container_scenario_config in scenarios_list: + scenario_telemetry = ScenarioTelemetry() + scenario_telemetry.scenario = container_scenario_config + scenario_telemetry.startTimeStamp = time.time() + telemetry.set_parameters_base64(scenario_telemetry, container_scenario_config[0]) if len(container_scenario_config) > 1: pre_action_output = post_actions.run(kubeconfig_path, container_scenario_config[1]) else: @@ -78,30 +92,41 @@ def container_run(kubeconfig_path, scenarios_list, config, failed_post_scenarios for cont_scenario in cont_scenario_config["scenarios"]: # capture start time start_time = int(time.time()) - killed_containers = container_killing_in_pod(cont_scenario, kubecli) - - if len(container_scenario_config) > 1: - try: + try: + killed_containers = container_killing_in_pod(cont_scenario, kubecli) + if len(container_scenario_config) > 1: failed_post_scenarios = post_actions.check_recovery( - kubeconfig_path, container_scenario_config, failed_post_scenarios, pre_action_output + kubeconfig_path, + container_scenario_config, + failed_post_scenarios, + pre_action_output + ) + else: + failed_post_scenarios = check_failed_containers( + killed_containers, cont_scenario.get("retry_wait", 120), kubecli ) - except Exception as e: - logging.error("Failed to run post action checks: %s" % e) - sys.exit(1) - else: - failed_post_scenarios = check_failed_containers( - killed_containers, cont_scenario.get("retry_wait", 120), kubecli - ) - logging.info("Waiting for the specified duration: %s" % 
(wait_duration)) - time.sleep(wait_duration) + logging.info("Waiting for the specified duration: %s" % (wait_duration)) + time.sleep(wait_duration) - # capture end time - end_time = int(time.time()) + # capture end time + end_time = int(time.time()) + + # publish cerberus status + cerberus.publish_kraken_status(config, failed_post_scenarios, start_time, end_time) + except (RuntimeError, Exception): + failed_scenarios.append(container_scenario_config[0]) + telemetry.log_exception(container_scenario_config[0]) + scenario_telemetry.exitStatus = 1 + # removed_exit + # sys.exit(1) + else: + scenario_telemetry.exitStatus = 0 + scenario_telemetry.endTimeStamp = time.time() + scenario_telemetries.append(scenario_telemetry) + + return failed_scenarios, scenario_telemetries - # publish cerberus status - cerberus.publish_kraken_status(config, failed_post_scenarios, start_time, end_time) - logging.info("") def container_killing_in_pod(cont_scenario, kubecli: krkn_lib_kubernetes.KrknLibKubernetes): @@ -114,7 +139,9 @@ def container_killing_in_pod(cont_scenario, kubecli: krkn_lib_kubernetes.KrknLib kill_count = cont_scenario.get("count", 1) if type(pod_names) != list: logging.error("Please make sure your pod_names are in a list format") - sys.exit(1) + # removed_exit + # sys.exit(1) + raise RuntimeError() if len(pod_names) == 0: if namespace == "*": # returns double array of pod name and namespace @@ -126,7 +153,9 @@ def container_killing_in_pod(cont_scenario, kubecli: krkn_lib_kubernetes.KrknLib if namespace == "*": logging.error("You must specify the namespace to kill a container in a specific pod") logging.error("Scenario " + scenario_name + " failed") - sys.exit(1) + # removed_exit + # sys.exit(1) + raise RuntimeError() pods = pod_names # get container and pod name container_pod_list = [] @@ -147,7 +176,9 @@ def container_killing_in_pod(cont_scenario, kubecli: krkn_lib_kubernetes.KrknLib if len(container_pod_list) == 0: logging.error("Trying to kill more containers than were 
found, try lowering kill count") logging.error("Scenario " + scenario_name + " failed") - sys.exit(1) + # removed_exit + # sys.exit(1) + raise RuntimeError() selected_container_pod = container_pod_list[random.randint(0, len(container_pod_list) - 1)] for c_name in selected_container_pod[2]: if container_name != "": @@ -178,6 +209,7 @@ def retry_container_killing(kill_action, podname, namespace, container_name, kub time.sleep(2) continue else: + logging.warning(response) continue From 83a0f8248abb1d8948809df2a24b76ca0bb439bd Mon Sep 17 00:00:00 2001 From: Tullio Sebastiani Date: Wed, 14 Jun 2023 11:58:34 +0200 Subject: [PATCH 07/23] time scenarios --- kraken/time_actions/common_time_functions.py | 74 +++++++++++++------- 1 file changed, 50 insertions(+), 24 deletions(-) diff --git a/kraken/time_actions/common_time_functions.py b/kraken/time_actions/common_time_functions.py index b96f0c53..5ace3c2c 100644 --- a/kraken/time_actions/common_time_functions.py +++ b/kraken/time_actions/common_time_functions.py @@ -8,6 +8,7 @@ import krkn_lib_kubernetes from ..cerberus import setup as cerberus from ..invoke import command as runcommand +from krkn_lib_kubernetes import ScenarioTelemetry, KrknTelemetry # krkn_lib_kubernetes def pod_exec(pod_name, command, namespace, container_name, kubecli: krkn_lib_kubernetes.KrknLibKubernetes): @@ -93,7 +94,9 @@ def skew_time(scenario, kubecli: krkn_lib_kubernetes.KrknLibKubernetes): for name in scenario["object_name"]: if "namespace" not in scenario.keys(): logging.error("Need to set namespace when using pod name") - sys.exit(1) + # removed_exit + # sys.exit(1) + raise RuntimeError() pod_names.append([name, scenario["namespace"]]) elif "namespace" in scenario.keys() and scenario["namespace"]: if "label_selector" not in scenario.keys(): @@ -127,7 +130,9 @@ def skew_time(scenario, kubecli: krkn_lib_kubernetes.KrknLibKubernetes): "Cannot find pods matching the namespace/label_selector, " "please check" ) - sys.exit(1) + # removed_exit + # 
sys.exit(1) + raise RuntimeError() pod_counter = 0 for pod in pod_names: if len(pod) > 1: @@ -152,7 +157,9 @@ def skew_time(scenario, kubecli: krkn_lib_kubernetes.KrknLibKubernetes): "in pod %s in namespace %s" % (selected_container_name, pod[0], pod[1]) ) - sys.exit(1) + # removed_exit + # sys.exit(1) + raise RuntimeError() pod_names[pod_counter].append(selected_container_name) else: selected_container_name = get_container_name( @@ -178,7 +185,9 @@ def skew_time(scenario, kubecli: krkn_lib_kubernetes.KrknLibKubernetes): scenario["namespace"] ) ) - sys.exit(1) + # removed_exit + # sys.exit(1) + raise RuntimeError() pod_names[pod_counter].append(selected_container_name) logging.info("Reset date/time on pod " + str(pod[0])) pod_counter += 1 @@ -299,24 +308,41 @@ def check_date_time(object_type, names, kubecli: krkn_lib_kubernetes.KrknLibKube # krkn_lib_kubernetes -def run(scenarios_list, config, wait_duration, kubecli: krkn_lib_kubernetes.KrknLibKubernetes): +def run(scenarios_list, config, wait_duration, kubecli: krkn_lib_kubernetes.KrknLibKubernetes, telemetry: KrknTelemetry) -> (list[str], list[ScenarioTelemetry]): + failed_scenarios = [] + scenario_telemetries: list[ScenarioTelemetry] = [] for time_scenario_config in scenarios_list: - with open(time_scenario_config, "r") as f: - scenario_config = yaml.full_load(f) - for time_scenario in scenario_config["time_scenarios"]: - start_time = int(time.time()) - object_type, object_names = skew_time(time_scenario, kubecli) - not_reset = check_date_time(object_type, object_names, kubecli) - if len(not_reset) > 0: - logging.info("Object times were not reset") - logging.info( - "Waiting for the specified duration: %s" % (wait_duration) - ) - time.sleep(wait_duration) - end_time = int(time.time()) - cerberus.publish_kraken_status( - config, - not_reset, - start_time, - end_time - ) + scenario_telemetry = ScenarioTelemetry() + scenario_telemetry.scenario = time_scenario_config + scenario_telemetry.startTimeStamp = time.time() 
+ telemetry.set_parameters_base64(scenario_telemetry, time_scenario_config) + try: + with open(time_scenario_config, "r") as f: + scenario_config = yaml.full_load(f) + for time_scenario in scenario_config["time_scenarios"]: + start_time = int(time.time()) + object_type, object_names = skew_time(time_scenario, kubecli) + not_reset = check_date_time(object_type, object_names, kubecli) + if len(not_reset) > 0: + logging.info("Object times were not reset") + logging.info( + "Waiting for the specified duration: %s" % (wait_duration) + ) + time.sleep(wait_duration) + end_time = int(time.time()) + cerberus.publish_kraken_status( + config, + not_reset, + start_time, + end_time + ) + except (RuntimeError, Exception): + scenario_telemetry.exitStatus = 1 + telemetry.log_exception(time_scenario_config) + failed_scenarios.append(time_scenario_config) + else: + scenario_telemetry.exitStatus = 0 + scenario_telemetry.endTimeStamp = time.time() + scenario_telemetries.append(scenario_telemetry) + + return failed_scenarios, scenario_telemetries From 68036695a7c909c9180e737c2f0e4d9f0058234f Mon Sep 17 00:00:00 2001 From: Tullio Sebastiani Date: Wed, 14 Jun 2023 11:59:01 +0200 Subject: [PATCH 08/23] cluster shutdown scenarios --- kraken/shut_down/common_shut_down_func.py | 61 ++++++++++++++++------- 1 file changed, 42 insertions(+), 19 deletions(-) diff --git a/kraken/shut_down/common_shut_down_func.py b/kraken/shut_down/common_shut_down_func.py index ed0ac51c..e498b819 100644 --- a/kraken/shut_down/common_shut_down_func.py +++ b/kraken/shut_down/common_shut_down_func.py @@ -1,5 +1,5 @@ #!/usr/bin/env python - +import os import sys import yaml import logging @@ -13,7 +13,7 @@ from ..node_actions.openstack_node_scenarios import OPENSTACKCLOUD from ..node_actions.az_node_scenarios import Azure from ..node_actions.gcp_node_scenarios import GCP - +from krkn_lib_kubernetes import ScenarioTelemetry, KrknTelemetry def multiprocess_nodes(cloud_object_function, nodes): try: @@ -59,7 +59,9 @@ 
def cluster_shut_down(shut_down_config, kubecli: krkn_lib_kubernetes.KrknLibKube "Cloud type %s is not currently supported for cluster shut down" % cloud_type ) - sys.exit(1) + # removed_exit + # sys.exit(1) + raise RuntimeError() nodes = kubecli.list_nodes() node_id = [] @@ -128,9 +130,16 @@ def cluster_shut_down(shut_down_config, kubecli: krkn_lib_kubernetes.KrknLibKube # krkn_lib_kubernetes -def run(scenarios_list, config, wait_duration, kubecli: krkn_lib_kubernetes.KrknLibKubernetes): +def run(scenarios_list, config, wait_duration, kubecli: krkn_lib_kubernetes.KrknLibKubernetes, telemetry: KrknTelemetry) -> (list[str], list[ScenarioTelemetry]): failed_post_scenarios = [] + failed_scenarios = [] + scenario_telemetries: list[ScenarioTelemetry] = [] + for shut_down_config in scenarios_list: + scenario_telemetry = ScenarioTelemetry() + scenario_telemetry.scenario = shut_down_config + scenario_telemetry.startTimeStamp = time.time() + telemetry.set_parameters_base64(scenario_telemetry, shut_down_config[0]) if len(shut_down_config) > 1: pre_action_output = post_actions.run("", shut_down_config[1]) else: @@ -140,18 +149,32 @@ def run(scenarios_list, config, wait_duration, kubecli: krkn_lib_kubernetes.Krkn shut_down_config_scenario = \ shut_down_config_yaml["cluster_shut_down_scenario"] start_time = int(time.time()) - cluster_shut_down(shut_down_config_scenario, kubecli) - logging.info( - "Waiting for the specified duration: %s" % (wait_duration) - ) - time.sleep(wait_duration) - failed_post_scenarios = post_actions.check_recovery( - "", shut_down_config, failed_post_scenarios, pre_action_output - ) - end_time = int(time.time()) - cerberus.publish_kraken_status( - config, - failed_post_scenarios, - start_time, - end_time - ) + try: + cluster_shut_down(shut_down_config_scenario, kubecli) + logging.info( + "Waiting for the specified duration: %s" % (wait_duration) + ) + time.sleep(wait_duration) + failed_post_scenarios = post_actions.check_recovery( + "", 
shut_down_config, failed_post_scenarios, pre_action_output + ) + end_time = int(time.time()) + cerberus.publish_kraken_status( + config, + failed_post_scenarios, + start_time, + end_time + ) + + except (RuntimeError, Exception): + telemetry.log_exception(shut_down_config[0]) + failed_scenarios.append(shut_down_config[0]) + scenario_telemetry.exitStatus = 1 + else: + scenario_telemetry.exitStatus = 0 + + scenario_telemetry.endTimeStamp = time.time() + scenario_telemetries.append(scenario_telemetry) + + return failed_scenarios, scenario_telemetries + From afb678cf84b701fa5266182bd87bef0aad364d0a Mon Sep 17 00:00:00 2001 From: Tullio Sebastiani Date: Wed, 14 Jun 2023 16:36:59 +0200 Subject: [PATCH 09/23] namespace scenarios --- .../common_namespace_functions.py | 148 ++++++++++-------- 1 file changed, 86 insertions(+), 62 deletions(-) diff --git a/kraken/namespace_actions/common_namespace_functions.py b/kraken/namespace_actions/common_namespace_functions.py index 1ebbbe9b..7d7212d6 100644 --- a/kraken/namespace_actions/common_namespace_functions.py +++ b/kraken/namespace_actions/common_namespace_functions.py @@ -6,7 +6,7 @@ import kraken.post_actions.actions as post_actions import yaml import sys - +from krkn_lib_kubernetes import ScenarioTelemetry, KrknTelemetry # krkn_lib_kubernetes def run( @@ -15,72 +15,96 @@ def run( wait_duration, failed_post_scenarios, kubeconfig_path, - kubecli: krkn_lib_kubernetes.KrknLibKubernetes -): - + kubecli: krkn_lib_kubernetes.KrknLibKubernetes, + telemetry: KrknTelemetry +) -> (list[str], list[ScenarioTelemetry]): + scenario_telemetries: list[ScenarioTelemetry] = [] + failed_scenarios = [] for scenario_config in scenarios_list: - if len(scenario_config) > 1: - pre_action_output = post_actions.run(kubeconfig_path, scenario_config[1]) - else: - pre_action_output = "" - with open(scenario_config[0], "r") as f: - scenario_config_yaml = yaml.full_load(f) - for scenario in scenario_config_yaml["scenarios"]: - scenario_namespace = 
scenario.get("namespace", "") - scenario_label = scenario.get("label_selector", "") - if scenario_namespace is not None and scenario_namespace.strip() != "": - if scenario_label is not None and scenario_label.strip() != "": - logging.error("You can only have namespace or label set in your namespace scenario") - logging.error( - "Current scenario config has namespace '%s' and label selector '%s'" - % (scenario_namespace, scenario_label) - ) - logging.error( - "Please set either namespace to blank ('') or label_selector to blank ('') to continue" - ) - sys.exit(1) - delete_count = scenario.get("delete_count", 1) - run_count = scenario.get("runs", 1) - run_sleep = scenario.get("sleep", 10) - wait_time = scenario.get("wait_time", 30) - killed_namespaces = [] - start_time = int(time.time()) - for i in range(run_count): - namespaces = kubecli.check_namespaces([scenario_namespace], scenario_label) - for j in range(delete_count): - if len(namespaces) == 0: + scenario_telemetry = ScenarioTelemetry() + scenario_telemetry.scenario = scenario_config[0] + scenario_telemetry.startTimeStamp = time.time() + telemetry.set_parameters_base64(scenario_telemetry, scenario_config[0]) + try: + if len(scenario_config) > 1: + pre_action_output = post_actions.run(kubeconfig_path, scenario_config[1]) + else: + pre_action_output = "" + with open(scenario_config[0], "r") as f: + scenario_config_yaml = yaml.full_load(f) + for scenario in scenario_config_yaml["scenarios"]: + scenario_namespace = scenario.get("namespace", "") + scenario_label = scenario.get("label_selector", "") + if scenario_namespace is not None and scenario_namespace.strip() != "": + if scenario_label is not None and scenario_label.strip() != "": + logging.error("You can only have namespace or label set in your namespace scenario") logging.error( - "Couldn't delete %s namespaces, not enough namespaces matching %s with label %s" - % (str(run_count), scenario_namespace, str(scenario_label)) + "Current scenario config has 
namespace '%s' and label selector '%s'" + % (scenario_namespace, scenario_label) ) - sys.exit(1) - selected_namespace = namespaces[random.randint(0, len(namespaces) - 1)] - killed_namespaces.append(selected_namespace) - try: - kubecli.delete_namespace(selected_namespace) - logging.info("Delete on namespace %s was successful" % str(selected_namespace)) - except Exception as e: - logging.info("Delete on namespace %s was unsuccessful" % str(selected_namespace)) - logging.info("Namespace action error: " + str(e)) - sys.exit(1) - namespaces.remove(selected_namespace) - logging.info("Waiting %s seconds between namespace deletions" % str(run_sleep)) - time.sleep(run_sleep) - - logging.info("Waiting for the specified duration: %s" % wait_duration) - time.sleep(wait_duration) - if len(scenario_config) > 1: - try: - failed_post_scenarios = post_actions.check_recovery( - kubeconfig_path, scenario_config, failed_post_scenarios, pre_action_output + logging.error( + "Please set either namespace to blank ('') or label_selector to blank ('') to continue" + ) + # removed_exit + # sys.exit(1) + raise RuntimeError() + delete_count = scenario.get("delete_count", 1) + run_count = scenario.get("runs", 1) + run_sleep = scenario.get("sleep", 10) + wait_time = scenario.get("wait_time", 30) + killed_namespaces = [] + start_time = int(time.time()) + for i in range(run_count): + namespaces = kubecli.check_namespaces([scenario_namespace], scenario_label) + for j in range(delete_count): + if len(namespaces) == 0: + logging.error( + "Couldn't delete %s namespaces, not enough namespaces matching %s with label %s" + % (str(run_count), scenario_namespace, str(scenario_label)) ) + # removed_exit + # sys.exit(1) + raise RuntimeError() + selected_namespace = namespaces[random.randint(0, len(namespaces) - 1)] + killed_namespaces.append(selected_namespace) + try: + kubecli.delete_namespace(selected_namespace) + logging.info("Delete on namespace %s was successful" % str(selected_namespace)) except 
Exception as e: - logging.error("Failed to run post action checks: %s" % e) - sys.exit(1) - else: - failed_post_scenarios = check_active_namespace(killed_namespaces, wait_time, kubecli) - end_time = int(time.time()) - cerberus.publish_kraken_status(config, failed_post_scenarios, start_time, end_time) + logging.info("Delete on namespace %s was unsuccessful" % str(selected_namespace)) + logging.info("Namespace action error: " + str(e)) + # removed_exit + # sys.exit(1) + raise RuntimeError() + namespaces.remove(selected_namespace) + logging.info("Waiting %s seconds between namespace deletions" % str(run_sleep)) + time.sleep(run_sleep) + + logging.info("Waiting for the specified duration: %s" % wait_duration) + time.sleep(wait_duration) + if len(scenario_config) > 1: + try: + failed_post_scenarios = post_actions.check_recovery( + kubeconfig_path, scenario_config, failed_post_scenarios, pre_action_output + ) + except Exception as e: + logging.error("Failed to run post action checks: %s" % e) + # removed_exit + # sys.exit(1) + raise RuntimeError() + else: + failed_post_scenarios = check_active_namespace(killed_namespaces, wait_time, kubecli) + end_time = int(time.time()) + cerberus.publish_kraken_status(config, failed_post_scenarios, start_time, end_time) + except (Exception, RuntimeError): + scenario_telemetry.exitStatus = 1 + failed_scenarios.append(scenario_config[0]) + telemetry.log_exception(scenario_config[0]) + else: + scenario_telemetry.exitStatus = 0 + scenario_telemetry.endTimeStamp = time.time() + scenario_telemetries.append(scenario_telemetry) + return failed_scenarios, scenario_telemetries # krkn_lib_kubernetes def check_active_namespace(killed_namespaces, wait_time, kubecli: krkn_lib_kubernetes.KrknLibKubernetes): From 1de6de8c2f81768096512b6cf0babf9a6ce74333 Mon Sep 17 00:00:00 2001 From: Tullio Sebastiani Date: Wed, 14 Jun 2023 16:37:13 +0200 Subject: [PATCH 10/23] zone outage scenarios --- kraken/zone_outage/actions.py | 169 
+++++++++++++++++++--------------- 1 file changed, 94 insertions(+), 75 deletions(-) diff --git a/kraken/zone_outage/actions.py b/kraken/zone_outage/actions.py index 64f4c6e4..3c2e3e01 100644 --- a/kraken/zone_outage/actions.py +++ b/kraken/zone_outage/actions.py @@ -1,100 +1,119 @@ import yaml -import sys import logging import time from ..node_actions.aws_node_scenarios import AWS from ..cerberus import setup as cerberus +from krkn_lib_kubernetes import ScenarioTelemetry, KrknTelemetry - -def run(scenarios_list, config, wait_duration): +def run(scenarios_list, config, wait_duration, telemetry: KrknTelemetry) -> (list[str], list[ScenarioTelemetry]) : """ filters the subnet of interest and applies the network acl to create zone outage """ failed_post_scenarios = "" + scenario_telemetries: list[ScenarioTelemetry] = [] + failed_scenarios = [] + for zone_outage_config in scenarios_list: - if len(zone_outage_config) > 1: - with open(zone_outage_config, "r") as f: - zone_outage_config_yaml = yaml.full_load(f) - scenario_config = zone_outage_config_yaml["zone_outage"] - vpc_id = scenario_config["vpc_id"] - subnet_ids = scenario_config["subnet_id"] - duration = scenario_config["duration"] - cloud_type = scenario_config["cloud_type"] - ids = {} - acl_ids_created = [] + scenario_telemetry = ScenarioTelemetry() + scenario_telemetry.scenario = zone_outage_config[0] + scenario_telemetry.startTimeStamp = time.time() + telemetry.set_parameters_base64(scenario_telemetry, zone_outage_config[0]) + try: + if len(zone_outage_config) > 1: + with open(zone_outage_config, "r") as f: + zone_outage_config_yaml = yaml.full_load(f) + scenario_config = zone_outage_config_yaml["zone_outage"] + vpc_id = scenario_config["vpc_id"] + subnet_ids = scenario_config["subnet_id"] + duration = scenario_config["duration"] + cloud_type = scenario_config["cloud_type"] + ids = {} + acl_ids_created = [] - if cloud_type.lower() == "aws": - cloud_object = AWS() - else: - logging.error( - "Cloud type %s is not 
currently supported for " - "zone outage scenarios" - % cloud_type - ) - sys.exit(1) + if cloud_type.lower() == "aws": + cloud_object = AWS() + else: + logging.error( + "Cloud type %s is not currently supported for " + "zone outage scenarios" + % cloud_type + ) + # removed_exit + # sys.exit(1) + raise RuntimeError() - start_time = int(time.time()) + start_time = int(time.time()) - for subnet_id in subnet_ids: - logging.info("Targeting subnet_id") - network_association_ids = [] - associations, original_acl_id = \ - cloud_object.describe_network_acls(vpc_id, subnet_id) - for entry in associations: - if entry["SubnetId"] == subnet_id: - network_association_ids.append( - entry["NetworkAclAssociationId"] + for subnet_id in subnet_ids: + logging.info("Targeting subnet_id") + network_association_ids = [] + associations, original_acl_id = \ + cloud_object.describe_network_acls(vpc_id, subnet_id) + for entry in associations: + if entry["SubnetId"] == subnet_id: + network_association_ids.append( + entry["NetworkAclAssociationId"] + ) + logging.info( + "Network association ids associated with " + "the subnet %s: %s" + % (subnet_id, network_association_ids) + ) + acl_id = cloud_object.create_default_network_acl(vpc_id) + new_association_id = \ + cloud_object.replace_network_acl_association( + network_association_ids[0], acl_id ) + + # capture the orginal_acl_id, created_acl_id and + # new association_id to use during the recovery + ids[new_association_id] = original_acl_id + acl_ids_created.append(acl_id) + + # wait for the specified duration logging.info( - "Network association ids associated with " - "the subnet %s: %s" - % (subnet_id, network_association_ids) + "Waiting for the specified duration " + "in the config: %s" % (duration) ) - acl_id = cloud_object.create_default_network_acl(vpc_id) - new_association_id = \ + time.sleep(duration) + + # replace the applied acl with the previous acl in use + for new_association_id, original_acl_id in ids.items(): 
cloud_object.replace_network_acl_association( - network_association_ids[0], acl_id + new_association_id, + original_acl_id ) + logging.info( + "Wating for 60 seconds to make sure " + "the changes are in place" + ) + time.sleep(60) - # capture the orginal_acl_id, created_acl_id and - # new association_id to use during the recovery - ids[new_association_id] = original_acl_id - acl_ids_created.append(acl_id) - - # wait for the specified duration - logging.info( - "Waiting for the specified duration " - "in the config: %s" % (duration) - ) - time.sleep(duration) + # delete the network acl created for the run + for acl_id in acl_ids_created: + cloud_object.delete_network_acl(acl_id) - # replace the applied acl with the previous acl in use - for new_association_id, original_acl_id in ids.items(): - cloud_object.replace_network_acl_association( - new_association_id, - original_acl_id + logging.info( + "End of scenario. " + "Waiting for the specified duration: %s" % (wait_duration) ) - logging.info( - "Wating for 60 seconds to make sure " - "the changes are in place" - ) - time.sleep(60) - - # delete the network acl created for the run - for acl_id in acl_ids_created: - cloud_object.delete_network_acl(acl_id) + time.sleep(wait_duration) - logging.info( - "End of scenario. 
" - "Waiting for the specified duration: %s" % (wait_duration) - ) - time.sleep(wait_duration) + end_time = int(time.time()) + cerberus.publish_kraken_status( + config, + failed_post_scenarios, + start_time, + end_time + ) + except (RuntimeError, Exception): + scenario_telemetry.exitStatus = 1 + failed_scenarios.append(zone_outage_config[0]) + telemetry.log_exception(zone_outage_config[0]) + else: + scenario_telemetry.exitStatus = 0 + scenario_telemetry.endTimeStamp = time.time() + scenario_telemetries.append(scenario_telemetry) + return failed_scenarios, scenario_telemetries - end_time = int(time.time()) - cerberus.publish_kraken_status( - config, - failed_post_scenarios, - start_time, - end_time - ) From 6ea49f14ce7d3ad0d92252862c182cba5a3fd83d Mon Sep 17 00:00:00 2001 From: Tullio Sebastiani Date: Tue, 27 Jun 2023 11:08:57 +0200 Subject: [PATCH 11/23] app outage scenarios --- kraken/application_outage/actions.py | 89 +++++++++++++++++----------- 1 file changed, 53 insertions(+), 36 deletions(-) diff --git a/kraken/application_outage/actions.py b/kraken/application_outage/actions.py index 6c82f8d8..f8d32085 100644 --- a/kraken/application_outage/actions.py +++ b/kraken/application_outage/actions.py @@ -4,25 +4,32 @@ import kraken.cerberus.setup as cerberus from jinja2 import Template import kraken.invoke.command as runcommand - +from krkn_lib_kubernetes import ScenarioTelemetry, KrknTelemetry # Reads the scenario config, applies and deletes a network policy to # block the traffic for the specified duration -def run(scenarios_list, config, wait_duration): +def run(scenarios_list, config, wait_duration, telemetry: KrknTelemetry) -> (list[str], list[ScenarioTelemetry]): failed_post_scenarios = "" + scenario_telemetries: list[ScenarioTelemetry] = [] + failed_scenarios = [] for app_outage_config in scenarios_list: + scenario_telemetry = ScenarioTelemetry() + scenario_telemetry.scenario = app_outage_config[0] + scenario_telemetry.startTimeStamp = time.time() + 
telemetry.set_parameters_base64(scenario_telemetry, app_outage_config[0]) if len(app_outage_config) > 1: - with open(app_outage_config, "r") as f: - app_outage_config_yaml = yaml.full_load(f) - scenario_config = app_outage_config_yaml["application_outage"] - pod_selector = scenario_config.get("pod_selector", "{}") - traffic_type = scenario_config.get("block", "[Ingress, Egress]") - namespace = scenario_config.get("namespace", "") - duration = scenario_config.get("duration", 60) + try: + with open(app_outage_config, "r") as f: + app_outage_config_yaml = yaml.full_load(f) + scenario_config = app_outage_config_yaml["application_outage"] + pod_selector = scenario_config.get("pod_selector", "{}") + traffic_type = scenario_config.get("block", "[Ingress, Egress]") + namespace = scenario_config.get("namespace", "") + duration = scenario_config.get("duration", 60) - start_time = int(time.time()) + start_time = int(time.time()) - network_policy_template = """--- + network_policy_template = """--- apiVersion: networking.k8s.io/v1 kind: NetworkPolicy metadata: @@ -31,28 +38,38 @@ def run(scenarios_list, config, wait_duration): podSelector: matchLabels: {{ pod_selector }} policyTypes: {{ traffic_type }} - """ - t = Template(network_policy_template) - rendered_spec = t.render(pod_selector=pod_selector, traffic_type=traffic_type) - # Write the rendered template to a file - with open("kraken_network_policy.yaml", "w") as f: - f.write(rendered_spec) - # Block the traffic by creating network policy - logging.info("Creating the network policy") - runcommand.invoke( - "kubectl create -f %s -n %s --validate=false" % ("kraken_network_policy.yaml", namespace) - ) - - # wait for the specified duration - logging.info("Waiting for the specified duration in the config: %s" % (duration)) - time.sleep(duration) - - # unblock the traffic by deleting the network policy - logging.info("Deleting the network policy") - runcommand.invoke("kubectl delete -f %s -n %s" % ("kraken_network_policy.yaml", 
namespace)) - - logging.info("End of scenario. Waiting for the specified duration: %s" % (wait_duration)) - time.sleep(wait_duration) - - end_time = int(time.time()) - cerberus.publish_kraken_status(config, failed_post_scenarios, start_time, end_time) + """ + t = Template(network_policy_template) + rendered_spec = t.render(pod_selector=pod_selector, traffic_type=traffic_type) + # Write the rendered template to a file + with open("kraken_network_policy.yaml", "w") as f: + f.write(rendered_spec) + # Block the traffic by creating network policy + logging.info("Creating the network policy") + runcommand.invoke( + "kubectl create -f %s -n %s --validate=false" % ("kraken_network_policy.yaml", namespace) + ) + + # wait for the specified duration + logging.info("Waiting for the specified duration in the config: %s" % (duration)) + time.sleep(duration) + + # unblock the traffic by deleting the network policy + logging.info("Deleting the network policy") + runcommand.invoke("kubectl delete -f %s -n %s" % ("kraken_network_policy.yaml", namespace)) + + logging.info("End of scenario. 
Waiting for the specified duration: %s" % (wait_duration)) + time.sleep(wait_duration) + + end_time = int(time.time()) + cerberus.publish_kraken_status(config, failed_post_scenarios, start_time, end_time) + except Exception as e : + scenario_telemetry.exitStatus = 1 + failed_scenarios.append(app_outage_config[0]) + telemetry.log_exception(app_outage_config[0]) + else: + scenario_telemetry.exitStatus = 0 + scenario_telemetry.endTimeStamp = time.time() + scenario_telemetries.append(scenario_telemetry) + return failed_scenarios, scenario_telemetries + From 6c7f168a81b81ec19ef5d24a665954bc8a60ec3e Mon Sep 17 00:00:00 2001 From: Tullio Sebastiani Date: Mon, 10 Jul 2023 10:15:46 +0200 Subject: [PATCH 12/23] pvc scenarios --- kraken/pvc/pvc_scenario.py | 357 ++++++++++++++++++++++--------------- 1 file changed, 214 insertions(+), 143 deletions(-) diff --git a/kraken/pvc/pvc_scenario.py b/kraken/pvc/pvc_scenario.py index 95119716..aa9e297a 100644 --- a/kraken/pvc/pvc_scenario.py +++ b/kraken/pvc/pvc_scenario.py @@ -7,150 +7,233 @@ import yaml from ..cerberus import setup as cerberus - +from krkn_lib_kubernetes import ScenarioTelemetry, KrknTelemetry # krkn_lib_kubernetes -def run(scenarios_list, config, kubecli: krkn_lib_kubernetes.KrknLibKubernetes): +def run(scenarios_list, config, kubecli: krkn_lib_kubernetes.KrknLibKubernetes, telemetry: KrknTelemetry) -> (list[str], list[ScenarioTelemetry]): """ Reads the scenario config and creates a temp file to fill up the PVC """ failed_post_scenarios = "" + scenario_telemetries: list[ScenarioTelemetry] = [] + failed_scenarios = [] for app_config in scenarios_list: - if len(app_config) > 1: - with open(app_config, "r") as f: - config_yaml = yaml.full_load(f) - scenario_config = config_yaml["pvc_scenario"] - pvc_name = scenario_config.get("pvc_name", "") - pod_name = scenario_config.get("pod_name", "") - namespace = scenario_config.get("namespace", "") - target_fill_percentage = scenario_config.get( - "fill_percentage", "50" - ) - 
duration = scenario_config.get("duration", 60) - - logging.info( - "Input params:\n" - "pvc_name: '%s'\n" - "pod_name: '%s'\n" - "namespace: '%s'\n" - "target_fill_percentage: '%s%%'\nduration: '%ss'" - % ( - str(pvc_name), - str(pod_name), - str(namespace), - str(target_fill_percentage), - str(duration) + scenario_telemetry = ScenarioTelemetry() + scenario_telemetry.scenario = app_config[0] + scenario_telemetry.startTimeStamp = time.time() + telemetry.set_parameters_base64(scenario_telemetry, app_config[0]) + try: + if len(app_config) > 1: + with open(app_config, "r") as f: + config_yaml = yaml.full_load(f) + scenario_config = config_yaml["pvc_scenario"] + pvc_name = scenario_config.get("pvc_name", "") + pod_name = scenario_config.get("pod_name", "") + namespace = scenario_config.get("namespace", "") + target_fill_percentage = scenario_config.get( + "fill_percentage", "50" ) - ) + duration = scenario_config.get("duration", 60) - # Check input params - if namespace is None: - logging.error( - "You must specify the namespace where the PVC is" - ) - sys.exit(1) - if pvc_name is None and pod_name is None: - logging.error( - "You must specify the pvc_name or the pod_name" - ) - sys.exit(1) - if pvc_name and pod_name: logging.info( - "pod_name will be ignored, pod_name used will be " - "a retrieved from the pod used in the pvc_name" + "Input params:\n" + "pvc_name: '%s'\n" + "pod_name: '%s'\n" + "namespace: '%s'\n" + "target_fill_percentage: '%s%%'\nduration: '%ss'" + % ( + str(pvc_name), + str(pod_name), + str(namespace), + str(target_fill_percentage), + str(duration) + ) ) - # Get pod name - if pvc_name: - if pod_name: + # Check input params + if namespace is None: + logging.error( + "You must specify the namespace where the PVC is" + ) + #sys.exit(1) + raise RuntimeError() + if pvc_name is None and pod_name is None: + logging.error( + "You must specify the pvc_name or the pod_name" + ) + # sys.exit(1) + raise RuntimeError() + if pvc_name and pod_name: logging.info( - 
"pod_name '%s' will be overridden with one of " - "the pods mounted in the PVC" % (str(pod_name)) + "pod_name will be ignored, pod_name used will be " + "a retrieved from the pod used in the pvc_name" + ) + + # Get pod name + if pvc_name: + if pod_name: + logging.info( + "pod_name '%s' will be overridden with one of " + "the pods mounted in the PVC" % (str(pod_name)) + ) + pvc = kubecli.get_pvc_info(pvc_name, namespace) + try: + # random generator not used for + # security/cryptographic purposes. + pod_name = random.choice(pvc.podNames) # nosec + logging.info("Pod name: %s" % pod_name) + except Exception: + logging.error( + "Pod associated with %s PVC, on namespace %s, " + "not found" % (str(pvc_name), str(namespace)) + ) + # sys.exit(1) + raise RuntimeError() + + # Get volume name + pod = kubecli.get_pod_info(name=pod_name, namespace=namespace) + + if pod is None: + logging.error( + "Exiting as pod '%s' doesn't exist " + "in namespace '%s'" % ( + str(pod_name), + str(namespace) + ) ) - pvc = kubecli.get_pvc_info(pvc_name, namespace) - try: - # random generator not used for - # security/cryptographic purposes. 
- pod_name = random.choice(pvc.podNames) # nosec - logging.info("Pod name: %s" % pod_name) - except Exception: + # sys.exit(1) + raise RuntimeError() + + for volume in pod.volumes: + if volume.pvcName is not None: + volume_name = volume.name + pvc_name = volume.pvcName + pvc = kubecli.get_pvc_info(pvc_name, namespace) + break + if 'pvc' not in locals(): logging.error( - "Pod associated with %s PVC, on namespace %s, " - "not found" % (str(pvc_name), str(namespace)) + "Pod '%s' in namespace '%s' does not use a pvc" % ( + str(pod_name), + str(namespace) + ) ) - sys.exit(1) + # sys.exit(1) + raise RuntimeError() + logging.info("Volume name: %s" % volume_name) + logging.info("PVC name: %s" % pvc_name) - # Get volume name - pod = kubecli.get_pod_info(name=pod_name, namespace=namespace) + # Get container name and mount path + for container in pod.containers: + for vol in container.volumeMounts: + if vol.name == volume_name: + mount_path = vol.mountPath + container_name = container.name + break + logging.info("Container path: %s" % container_name) + logging.info("Mount path: %s" % mount_path) - if pod is None: - logging.error( - "Exiting as pod '%s' doesn't exist " - "in namespace '%s'" % ( - str(pod_name), - str(namespace) + # Get PVC capacity and used bytes + command = "df %s -B 1024 | sed 1d" % (str(mount_path)) + command_output = ( + kubecli.exec_cmd_in_pod( + command, + pod_name, + namespace, + container_name, + "sh" ) + ).split() + pvc_used_kb = int(command_output[2]) + pvc_capacity_kb = pvc_used_kb + int(command_output[3]) + logging.info("PVC used: %s KB" % pvc_used_kb) + logging.info("PVC capacity: %s KB" % pvc_capacity_kb) + + # Check valid fill percentage + current_fill_percentage = pvc_used_kb / pvc_capacity_kb + if not ( + current_fill_percentage * 100 + < float(target_fill_percentage) + <= 99 + ): + logging.error( + "Target fill percentage (%.2f%%) is lower than " + "current fill percentage (%.2f%%) " + "or higher than 99%%" % ( + target_fill_percentage, + 
current_fill_percentage * 100 + ) + ) + # sys.exit(1) + raise RuntimeError() + + # Calculate file size + file_size_kb = int( + ( + float( + target_fill_percentage / 100 + ) * float(pvc_capacity_kb) + ) - float(pvc_used_kb) ) - sys.exit(1) + logging.debug("File size: %s KB" % file_size_kb) - for volume in pod.volumes: - if volume.pvcName is not None: - volume_name = volume.name - pvc_name = volume.pvcName - pvc = kubecli.get_pvc_info(pvc_name, namespace) - break - if 'pvc' not in locals(): - logging.error( - "Pod '%s' in namespace '%s' does not use a pvc" % ( + file_name = "kraken.tmp" + logging.info( + "Creating %s file, %s KB size, in pod %s at %s (ns %s)" + % ( + str(file_name), + str(file_size_kb), str(pod_name), + str(mount_path), str(namespace) ) ) - sys.exit(1) - logging.info("Volume name: %s" % volume_name) - logging.info("PVC name: %s" % pvc_name) - # Get container name and mount path - for container in pod.containers: - for vol in container.volumeMounts: - if vol.name == volume_name: - mount_path = vol.mountPath - container_name = container.name - break - logging.info("Container path: %s" % container_name) - logging.info("Mount path: %s" % mount_path) - - # Get PVC capacity and used bytes - command = "df %s -B 1024 | sed 1d" % (str(mount_path)) - command_output = ( + start_time = int(time.time()) + # Create temp file in the PVC + full_path = "%s/%s" % (str(mount_path), str(file_name)) + command = "fallocate -l $((%s*1024)) %s" % ( + str(file_size_kb), + str(full_path) + ) + logging.debug( + "Create temp file in the PVC command:\n %s" % command + ) kubecli.exec_cmd_in_pod( command, pod_name, namespace, container_name, ) - ).split() - pvc_used_kb = int(command_output[2]) - pvc_capacity_kb = pvc_used_kb + int(command_output[3]) - logging.info("PVC used: %s KB" % pvc_used_kb) - logging.info("PVC capacity: %s KB" % pvc_capacity_kb) - # Check valid fill percentage - current_fill_percentage = pvc_used_kb / pvc_capacity_kb - if not ( - current_fill_percentage * 
100 - < float(target_fill_percentage) - <= 99 - ): - logging.error( - "Target fill percentage (%.2f%%) is lower than " - "current fill percentage (%.2f%%) " - "or higher than 99%%" % ( - target_fill_percentage, - current_fill_percentage * 100 - ) + # Check if file is created + command = "ls -lh %s" % (str(mount_path)) + logging.debug("Check file is created command:\n %s" % command) + response = kubecli.exec_cmd_in_pod( + command, pod_name, namespace, container_name, "sh" ) - sys.exit(1) + logging.info("\n" + str(response)) + if str(file_name).lower() in str(response).lower(): + logging.info( + "%s file successfully created" % (str(full_path)) + ) + else: + logging.error( + "Failed to create tmp file with %s size" % ( + str(file_size_kb) + ) + ) + remove_temp_file( + file_name, + full_path, + pod_name, + namespace, + container_name, + mount_path, + file_size_kb, + kubecli + ) + # sys.exit(1) + raise RuntimeError() # Calculate file size file_size_kb = int( @@ -197,14 +280,13 @@ def run(scenarios_list, config, kubecli: krkn_lib_kubernetes.KrknLibKubernetes): logging.info("\n" + str(response)) if str(file_name).lower() in str(response).lower(): logging.info( - "%s file successfully created" % (str(full_path)) - ) - else: - logging.error( - "Failed to create tmp file with %s size" % ( - str(file_size_kb) + "Waiting for the specified duration in the config: %ss" % ( + duration ) ) + time.sleep(duration) + logging.info("Finish waiting") + remove_temp_file( file_name, full_path, @@ -215,35 +297,24 @@ def run(scenarios_list, config, kubecli: krkn_lib_kubernetes.KrknLibKubernetes): file_size_kb, kubecli ) - sys.exit(1) - # Wait for the specified duration - logging.info( - "Waiting for the specified duration in the config: %ss" % ( - duration + end_time = int(time.time()) + cerberus.publish_kraken_status( + config, + failed_post_scenarios, + start_time, + end_time ) - ) - time.sleep(duration) - logging.info("Finish waiting") + except (RuntimeError, Exception): + 
scenario_telemetry.exitStatus = 1 + failed_scenarios.append(app_config[0]) + telemetry.log_exception(app_config[0]) + else: + scenario_telemetry.exitStatus = 0 + + return failed_scenarios, scenario_telemetries - remove_temp_file( - file_name, - full_path, - pod_name, - namespace, - container_name, - mount_path, - file_size_kb, - kubecli - ) - end_time = int(time.time()) - cerberus.publish_kraken_status( - config, - failed_post_scenarios, - start_time, - end_time - ) # krkn_lib_kubernetes @@ -275,7 +346,7 @@ def remove_temp_file( logging.error( "Failed to delete tmp file with %s size" % (str(file_size_kb)) ) - sys.exit(1) + raise RuntimeError() def toKbytes(value): @@ -284,7 +355,7 @@ def toKbytes(value): "PVC capacity %s does not match expression " "regexp '^[0-9]+[K|M|G|T]i$'" ) - sys.exit(1) + raise RuntimeError() unit = {"K": 0, "M": 1, "G": 2, "T": 3} base = 1024 if ("i" in value) else 1000 exp = unit[value[-2:-1]] From 51e3dbebdd4fca2a4120a37c9b73191143c4ae6e Mon Sep 17 00:00:00 2001 From: Tullio Sebastiani Date: Mon, 10 Jul 2023 10:24:35 +0200 Subject: [PATCH 13/23] network chaos scenarios --- kraken/network_chaos/actions.py | 158 ++++++++++++++++++-------------- 1 file changed, 87 insertions(+), 71 deletions(-) diff --git a/kraken/network_chaos/actions.py b/kraken/network_chaos/actions.py index b06ef764..9909e1bb 100644 --- a/kraken/network_chaos/actions.py +++ b/kraken/network_chaos/actions.py @@ -8,88 +8,103 @@ from jinja2 import Environment, FileSystemLoader import kraken.cerberus.setup as cerberus import kraken.node_actions.common_node_functions as common_node_functions - +from krkn_lib_kubernetes import ScenarioTelemetry, KrknTelemetry # krkn_lib_kubernetes # Reads the scenario config and introduces traffic variations in Node's host network interface. 
-def run(scenarios_list, config, wait_duration, kubecli: krkn_lib_kubernetes.KrknLibKubernetes): +def run(scenarios_list, config, wait_duration, kubecli: krkn_lib_kubernetes.KrknLibKubernetes, telemetry: KrknTelemetry) -> (list[str], list[ScenarioTelemetry]): failed_post_scenarios = "" logging.info("Runing the Network Chaos tests") + failed_post_scenarios = "" + scenario_telemetries: list[ScenarioTelemetry] = [] + failed_scenarios = [] for net_config in scenarios_list: - with open(net_config, "r") as file: - param_lst = ["latency", "loss", "bandwidth"] - test_config = yaml.safe_load(file) - test_dict = test_config["network_chaos"] - test_duration = int(test_dict.get("duration", 300)) - test_interface = test_dict.get("interfaces", []) - test_node = test_dict.get("node_name", "") - test_node_label = test_dict.get("label_selector", "node-role.kubernetes.io/master") - test_execution = test_dict.get("execution", "serial") - test_instance_count = test_dict.get("instance_count", 1) - test_egress = test_dict.get("egress", {"bandwidth": "100mbit"}) - if test_node: - node_name_list = test_node.split(",") - else: - node_name_list = [test_node] - nodelst = [] - for single_node_name in node_name_list: - nodelst.extend(common_node_functions.get_node(single_node_name, test_node_label, test_instance_count, kubecli)) - file_loader = FileSystemLoader(os.path.abspath(os.path.dirname(__file__))) - env = Environment(loader=file_loader, autoescape=True) - pod_template = env.get_template("pod.j2") - test_interface = verify_interface(test_interface, nodelst, pod_template, kubecli) - joblst = [] - egress_lst = [i for i in param_lst if i in test_egress] - chaos_config = { - "network_chaos": { - "duration": test_duration, - "interfaces": test_interface, - "node_name": ",".join(nodelst), - "execution": test_execution, - "instance_count": test_instance_count, - "egress": test_egress, + scenario_telemetry = ScenarioTelemetry() + scenario_telemetry.scenario = net_config + 
scenario_telemetry.startTimeStamp = time.time() + telemetry.set_parameters_base64(scenario_telemetry, net_config) + try: + with open(net_config, "r") as file: + param_lst = ["latency", "loss", "bandwidth"] + test_config = yaml.safe_load(file) + test_dict = test_config["network_chaos"] + test_duration = int(test_dict.get("duration", 300)) + test_interface = test_dict.get("interfaces", []) + test_node = test_dict.get("node_name", "") + test_node_label = test_dict.get("label_selector", "node-role.kubernetes.io/master") + test_execution = test_dict.get("execution", "serial") + test_instance_count = test_dict.get("instance_count", 1) + test_egress = test_dict.get("egress", {"bandwidth": "100mbit"}) + if test_node: + node_name_list = test_node.split(",") + else: + node_name_list = [test_node] + nodelst = [] + for single_node_name in node_name_list: + nodelst.extend(common_node_functions.get_node(single_node_name, test_node_label, test_instance_count, kubecli)) + file_loader = FileSystemLoader(os.path.abspath(os.path.dirname(__file__))) + env = Environment(loader=file_loader, autoescape=True) + pod_template = env.get_template("pod.j2") + test_interface = verify_interface(test_interface, nodelst, pod_template, kubecli) + joblst = [] + egress_lst = [i for i in param_lst if i in test_egress] + chaos_config = { + "network_chaos": { + "duration": test_duration, + "interfaces": test_interface, + "node_name": ",".join(nodelst), + "execution": test_execution, + "instance_count": test_instance_count, + "egress": test_egress, + } } - } - logging.info("Executing network chaos with config \n %s" % yaml.dump(chaos_config)) - job_template = env.get_template("job.j2") - try: - for i in egress_lst: - for node in nodelst: - exec_cmd = get_egress_cmd( - test_execution, test_interface, i, test_dict["egress"], duration=test_duration - ) - logging.info("Executing %s on node %s" % (exec_cmd, node)) - job_body = yaml.safe_load( - job_template.render(jobname=i + str(hash(node))[:5], 
nodename=node, cmd=exec_cmd) - ) - joblst.append(job_body["metadata"]["name"]) - api_response = kubecli.create_job(job_body) - if api_response is None: - raise Exception("Error creating job") - if test_execution == "serial": - logging.info("Waiting for serial job to finish") + logging.info("Executing network chaos with config \n %s" % yaml.dump(chaos_config)) + job_template = env.get_template("job.j2") + try: + for i in egress_lst: + for node in nodelst: + exec_cmd = get_egress_cmd( + test_execution, test_interface, i, test_dict["egress"], duration=test_duration + ) + logging.info("Executing %s on node %s" % (exec_cmd, node)) + job_body = yaml.safe_load( + job_template.render(jobname=i + str(hash(node))[:5], nodename=node, cmd=exec_cmd) + ) + joblst.append(job_body["metadata"]["name"]) + api_response = kubecli.create_job(job_body) + if api_response is None: + raise Exception("Error creating job") + if test_execution == "serial": + logging.info("Waiting for serial job to finish") + start_time = int(time.time()) + wait_for_job(joblst[:], kubecli, test_duration + 300) + logging.info("Waiting for wait_duration %s" % wait_duration) + time.sleep(wait_duration) + end_time = int(time.time()) + cerberus.publish_kraken_status(config, failed_post_scenarios, start_time, end_time) + if test_execution == "parallel": + break + if test_execution == "parallel": + logging.info("Waiting for parallel job to finish") start_time = int(time.time()) wait_for_job(joblst[:], kubecli, test_duration + 300) logging.info("Waiting for wait_duration %s" % wait_duration) time.sleep(wait_duration) end_time = int(time.time()) cerberus.publish_kraken_status(config, failed_post_scenarios, start_time, end_time) - if test_execution == "parallel": - break - if test_execution == "parallel": - logging.info("Waiting for parallel job to finish") - start_time = int(time.time()) - wait_for_job(joblst[:], kubecli, test_duration + 300) - logging.info("Waiting for wait_duration %s" % wait_duration) - 
time.sleep(wait_duration) - end_time = int(time.time()) - cerberus.publish_kraken_status(config, failed_post_scenarios, start_time, end_time) - except Exception as e: - logging.error("Network Chaos exiting due to Exception %s" % e) - sys.exit(1) - finally: - logging.info("Deleting jobs") - delete_job(joblst[:], kubecli) + except Exception as e: + logging.error("Network Chaos exiting due to Exception %s" % e) + raise RuntimeError() + finally: + logging.info("Deleting jobs") + delete_job(joblst[:], kubecli) + except (RuntimeError, Exception): + scenario_telemetry.exitStatus = 1 + failed_scenarios.append(net_config) + telemetry.log_exception(net_config) + else: + scenario_telemetry.exitStatus = 0 + return failed_scenarios, scenario_telemetries # krkn_lib_kubernetes @@ -110,7 +125,8 @@ def verify_interface(test_interface, nodelst, template, kubecli: krkn_lib_kubern for interface in test_interface: if interface not in interface_lst: logging.error("Interface %s not found in node %s interface list %s" % (interface, nodelst[pod_index], interface_lst)) - sys.exit(1) + #sys.exit(1) + raise RuntimeError() return test_interface finally: logging.info("Deleteing pod to query interface on node") @@ -158,7 +174,7 @@ def delete_job(joblst, kubecli: krkn_lib_kubernetes.KrknLibKubernetes): logging.error(pod_log) except Exception: logging.warning("Exception in getting job status") - api_response = kubecli.delete_job(name=jobname, namespace="default") + kubecli.delete_job(name=jobname, namespace="default") def get_egress_cmd(execution, test_interface, mod, vallst, duration=30): From ae2edb32cd87546eef7c42ed95cb3ca2c0cd7f39 Mon Sep 17 00:00:00 2001 From: Tullio Sebastiani Date: Wed, 19 Jul 2023 11:29:28 +0200 Subject: [PATCH 14/23] run_kraken.py adaptation to telemetry --- run_kraken.py | 44 +++++++++++++++++++++++++++----------------- 1 file changed, 27 insertions(+), 17 deletions(-) diff --git a/run_kraken.py b/run_kraken.py index 2a5218ab..39ef3806 100644 --- a/run_kraken.py +++ 
b/run_kraken.py @@ -209,31 +209,35 @@ def main(cfg): chaos_telemetry.scenarios.extend(scenario_telemetries) elif scenario_type == "plugin_scenarios": - failed_post_scenarios = plugins.run( + failed_post_scenarios, scenario_telemetries = plugins.run( scenarios_list, kubeconfig_path, kraken_config, failed_post_scenarios, wait_duration, + telemetry ) + chaos_telemetry.scenarios.extend(scenario_telemetries) # krkn_lib_kubernetes elif scenario_type == "container_scenarios": logging.info("Running container scenarios") - failed_post_scenarios = pod_scenarios.container_run( + failed_post_scenarios, scenario_telemetries = pod_scenarios.container_run( kubeconfig_path, scenarios_list, config, failed_post_scenarios, wait_duration, - kubecli + kubecli, + telemetry ) + chaos_telemetry.scenarios.extend(scenario_telemetries) # Inject node chaos scenarios specified in the config # krkn_lib_kubernetes elif scenario_type == "node_scenarios": logging.info("Running node scenarios") - nodeaction.run(scenarios_list, config, wait_duration, kubecli) - + failed_post_scenarios, scenario_telemetries = nodeaction.run(scenarios_list, config, wait_duration, kubecli, telemetry) + chaos_telemetry.scenarios.extend(scenario_telemetries) # Inject managedcluster chaos scenarios specified in the config # krkn_lib_kubernetes elif scenario_type == "managedcluster_scenarios": @@ -248,7 +252,8 @@ def main(cfg): elif scenario_type == "time_scenarios": if distribution == "openshift": logging.info("Running time skew scenarios") - time_actions.run(scenarios_list, config, wait_duration, kubecli) + failed_post_scenarios, scenario_telemetries = time_actions.run(scenarios_list, config, wait_duration, kubecli, telemetry) + chaos_telemetry.scenarios.extend(scenario_telemetries) else: logging.error( "Litmus scenarios are currently " @@ -298,44 +303,48 @@ def main(cfg): # Inject cluster shutdown scenarios # krkn_lib_kubernetes elif scenario_type == "cluster_shut_down_scenarios": - shut_down.run(scenarios_list, config, 
wait_duration, kubecli) + failed_post_scenarios, scenario_telemetries = shut_down.run(scenarios_list, config, wait_duration, kubecli, telemetry) + chaos_telemetry.scenarios.extend(scenario_telemetries) # Inject namespace chaos scenarios # krkn_lib_kubernetes elif scenario_type == "namespace_scenarios": logging.info("Running namespace scenarios") - namespace_actions.run( + failed_post_scenarios, scenario_telemetries = namespace_actions.run( scenarios_list, config, wait_duration, failed_post_scenarios, kubeconfig_path, - kubecli + kubecli, + telemetry ) + chaos_telemetry.scenarios.extend(scenario_telemetries) # Inject zone failures elif scenario_type == "zone_outages": logging.info("Inject zone outages") - zone_outages.run(scenarios_list, config, wait_duration) - + failed_post_scenarios, scenario_telemetries = zone_outages.run(scenarios_list, config, wait_duration, telemetry) + chaos_telemetry.scenarios.extend(scenario_telemetries) # Application outages elif scenario_type == "application_outages": logging.info("Injecting application outage") - application_outage.run( - scenarios_list, config, wait_duration - ) + failed_post_scenarios, scenario_telemetries = application_outage.run( + scenarios_list, config, wait_duration, telemetry) + chaos_telemetry.scenarios.extend(scenario_telemetries) # PVC scenarios # krkn_lib_kubernetes elif scenario_type == "pvc_scenarios": logging.info("Running PVC scenario") - pvc_scenario.run(scenarios_list, config, kubecli) + failed_post_scenarios, scenario_telemetries = pvc_scenario.run(scenarios_list, config, kubecli, telemetry) + chaos_telemetry.scenarios.extend(scenario_telemetries) # Network scenarios # krkn_lib_kubernetes elif scenario_type == "network_chaos": logging.info("Running Network Chaos") - network_chaos.run(scenarios_list, config, wait_duration, kubecli) + failed_post_scenarios, scenario_telemetries = network_chaos.run(scenarios_list, config, wait_duration, kubecli, telemetry) # Check for critical alerts when enabled if 
check_critical_alerts: @@ -354,7 +363,8 @@ def main(cfg): iteration += 1 logging.info("") # send telemetry - telemetry.send_telemetry(config["telemetry"], chaos_telemetry) + + telemetry.send_telemetry(config["telemetry"],str(uuid.uuid1()), chaos_telemetry, kubecli) # Capture the end time end_time = int(time.time()) From ea175fe817e9ffb70655e7d0dec9cd5f9372f878 Mon Sep 17 00:00:00 2001 From: Tullio Sebastiani Date: Wed, 2 Aug 2023 12:22:56 +0200 Subject: [PATCH 15/23] prometheus telemetry upload + config.yaml some fixes typos and logs max retries in config telemetry id with run_uuid safe_logger --- config/config.yaml | 7 ++++++- run_kraken.py | 43 ++++++++++++++++++++++++++++++++----------- 2 files changed, 38 insertions(+), 12 deletions(-) diff --git a/config/config.yaml b/config/config.yaml index 6680e137..0317a72f 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -68,4 +68,9 @@ telemetry: enabled: True api_url: https://ulnmf9xv7j.execute-api.us-west-2.amazonaws.com/production username: username - password: password \ No newline at end of file + password: password + prometheus_backup: True + full_prometheus_backup: False + backup_threads: 5 + archive_path: /tmp + max_retries: 0 \ No newline at end of file diff --git a/run_kraken.py b/run_kraken.py index 39ef3806..9c6ee4c1 100644 --- a/run_kraken.py +++ b/run_kraken.py @@ -25,7 +25,7 @@ import server as server import kraken.prometheus.client as promcli from kraken import plugins -from krkn_lib_kubernetes import KrknLibKubernetes, KrknTelemetry, ChaosRunTelemetry +from krkn_lib_kubernetes import KrknLibKubernetes, KrknTelemetry, ChaosRunTelemetry, SafeLogger KUBE_BURNER_URL = ( "https://github.com/cloud-bulldozer/kube-burner/" @@ -98,13 +98,32 @@ def main(cfg): ) sys.exit(1) logging.info("Initializing client to talk to the Kubernetes cluster") + + # Generate uuid for the run + if run_uuid: + logging.info( + "Using the uuid defined by the user for the run: %s" % run_uuid + ) + else: + run_uuid = 
str(uuid.uuid4()) + logging.info("Generated a uuid for the run: %s" % run_uuid) + + # request_id for telemetry is generated once here and used everywhere + telemetry_request_id = f"{int(time.time())}-{run_uuid}" + telemetry_log_file = f'{config["telemetry"]["archive_path"]}/{telemetry_request_id}.log' + safe_logger = SafeLogger(filename=telemetry_log_file) + try: kubeconfig_path os.environ["KUBECONFIG"] = str(kubeconfig_path) + # krkn-lib-kubernetes init kubecli = KrknLibKubernetes(kubeconfig_path=kubeconfig_path) except: kubecli.initialize_clients(None) + # KrknTelemetry init + telemetry = KrknTelemetry(safe_logger, kubecli) + # find node kraken might be running on kubecli.find_kraken_node() @@ -141,14 +160,7 @@ def main(cfg): if deploy_performance_dashboards: performance_dashboards.setup(dashboard_repo, distribution) - # Generate uuid for the run - if run_uuid: - logging.info( - "Using the uuid defined by the user for the run: %s" % run_uuid - ) - else: - run_uuid = str(uuid.uuid4()) - logging.info("Generated a uuid for the run: %s" % run_uuid) + # Initialize the start iteration to 0 iteration = 0 @@ -172,6 +184,7 @@ def main(cfg): start_time = int(time.time()) litmus_installed = False chaos_telemetry = ChaosRunTelemetry() + chaos_telemetry.run_uuid = run_uuid # Loop to run the chaos starts here while int(iteration) < iterations and run_signal != "STOP": # Inject chaos scenarios specified in the config @@ -362,9 +375,17 @@ def main(cfg): iteration += 1 logging.info("") - # send telemetry - telemetry.send_telemetry(config["telemetry"],str(uuid.uuid1()), chaos_telemetry, kubecli) + # telemetry + logging.info(f"telemetry data will be stored on s3 bucket folder: {telemetry_request_id}") + logging.info(f"telemetry upload log: {safe_logger.log_file_name}") + + telemetry.send_telemetry(config["telemetry"], telemetry_request_id, chaos_telemetry) + safe_logger.info("archives download started:") + prometheus_archive_files = 
telemetry.get_ocp_prometheus_data(config["telemetry"], telemetry_request_id) + safe_logger.info("archives upload started:") + telemetry.put_ocp_prometheus_data(config["telemetry"], prometheus_archive_files, telemetry_request_id) + # Capture the end time end_time = int(time.time()) From 5c13597bd11363c7804e3f37e2c875fbe7797403 Mon Sep 17 00:00:00 2001 From: Tullio Sebastiani Date: Wed, 2 Aug 2023 12:51:40 +0200 Subject: [PATCH 16/23] catch send_telemetry exception --- run_kraken.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/run_kraken.py b/run_kraken.py index 9c6ee4c1..ee69b08d 100644 --- a/run_kraken.py +++ b/run_kraken.py @@ -380,11 +380,14 @@ def main(cfg): logging.info(f"telemetry data will be stored on s3 bucket folder: {telemetry_request_id}") logging.info(f"telemetry upload log: {safe_logger.log_file_name}") - telemetry.send_telemetry(config["telemetry"], telemetry_request_id, chaos_telemetry) - safe_logger.info("archives download started:") - prometheus_archive_files = telemetry.get_ocp_prometheus_data(config["telemetry"], telemetry_request_id) - safe_logger.info("archives upload started:") - telemetry.put_ocp_prometheus_data(config["telemetry"], prometheus_archive_files, telemetry_request_id) + try: + telemetry.send_telemetry(config["telemetry"], telemetry_request_id, chaos_telemetry) + safe_logger.info("archives download started:") + prometheus_archive_files = telemetry.get_ocp_prometheus_data(config["telemetry"], telemetry_request_id) + safe_logger.info("archives upload started:") + telemetry.put_ocp_prometheus_data(config["telemetry"], prometheus_archive_files, telemetry_request_id) + except Exception as e: + logging.error(f"failed to send telemetry data: {str(e)}") # Capture the end time end_time = int(time.time()) From a3dbf933e6790747a874263abc09ed0026288bd8 Mon Sep 17 00:00:00 2001 From: Tullio Sebastiani Date: Wed, 2 Aug 2023 15:47:55 +0200 Subject: [PATCH 17/23] scenario collection bug fixes --- 
kraken/network_chaos/actions.py | 1 + kraken/plugins/__init__.py | 1 - kraken/pvc/pvc_scenario.py | 1 + 3 files changed, 2 insertions(+), 1 deletion(-) diff --git a/kraken/network_chaos/actions.py b/kraken/network_chaos/actions.py index 9909e1bb..0b209c72 100644 --- a/kraken/network_chaos/actions.py +++ b/kraken/network_chaos/actions.py @@ -104,6 +104,7 @@ def run(scenarios_list, config, wait_duration, kubecli: krkn_lib_kubernetes.Krkn telemetry.log_exception(net_config) else: scenario_telemetry.exitStatus = 0 + scenario_telemetries.append(scenario_telemetry) return failed_scenarios, scenario_telemetries diff --git a/kraken/plugins/__init__.py b/kraken/plugins/__init__.py index 7fb28dfb..aff7beb0 100644 --- a/kraken/plugins/__init__.py +++ b/kraken/plugins/__init__.py @@ -243,7 +243,6 @@ def run(scenarios: List[str], kubeconfig_path: str, kraken_config: str, failed_p else: scenario_telemetry.exitStatus = 0 logging.info("Waiting for the specified duration: %s" % (wait_duration)) - scenario_telemetries.append(scenario_telemetry) time.sleep(wait_duration) scenario_telemetries.append(scenario_telemetry) scenario_telemetry.endTimeStamp = time.time() diff --git a/kraken/pvc/pvc_scenario.py b/kraken/pvc/pvc_scenario.py index aa9e297a..ca055009 100644 --- a/kraken/pvc/pvc_scenario.py +++ b/kraken/pvc/pvc_scenario.py @@ -311,6 +311,7 @@ def run(scenarios_list, config, kubecli: krkn_lib_kubernetes.KrknLibKubernetes, telemetry.log_exception(app_config[0]) else: scenario_telemetry.exitStatus = 0 + scenario_telemetries.append(scenario_telemetry) return failed_scenarios, scenario_telemetries From f63fe8d811d9f863e4b84ddbb872788e51cbfec7 Mon Sep 17 00:00:00 2001 From: Tullio Sebastiani Date: Thu, 3 Aug 2023 16:54:25 +0200 Subject: [PATCH 18/23] telemetry enabled check --- run_kraken.py | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/run_kraken.py b/run_kraken.py index ee69b08d..23a2a710 100644 --- a/run_kraken.py +++ b/run_kraken.py @@ 
-377,17 +377,19 @@ def main(cfg): logging.info("") # telemetry - logging.info(f"telemetry data will be stored on s3 bucket folder: {telemetry_request_id}") - logging.info(f"telemetry upload log: {safe_logger.log_file_name}") - - try: - telemetry.send_telemetry(config["telemetry"], telemetry_request_id, chaos_telemetry) - safe_logger.info("archives download started:") - prometheus_archive_files = telemetry.get_ocp_prometheus_data(config["telemetry"], telemetry_request_id) - safe_logger.info("archives upload started:") - telemetry.put_ocp_prometheus_data(config["telemetry"], prometheus_archive_files, telemetry_request_id) - except Exception as e: - logging.error(f"failed to send telemetry data: {str(e)}") + if config["telemetry"]["enabled"]: + logging.info(f"telemetry data will be stored on s3 bucket folder: {telemetry_request_id}") + logging.info(f"telemetry upload log: {safe_logger.log_file_name}") + try: + telemetry.send_telemetry(config["telemetry"], telemetry_request_id, chaos_telemetry) + safe_logger.info("archives download started:") + prometheus_archive_files = telemetry.get_ocp_prometheus_data(config["telemetry"], telemetry_request_id) + safe_logger.info("archives upload started:") + telemetry.put_ocp_prometheus_data(config["telemetry"], prometheus_archive_files, telemetry_request_id) + except Exception as e: + logging.error(f"failed to send telemetry data: {str(e)}") + else: + logging.info("telemetry collection disabled, skipping.") # Capture the end time end_time = int(time.time()) From 805593590dcd2e1ed14c415dbf3057d8b80088a2 Mon Sep 17 00:00:00 2001 From: Tullio Sebastiani Date: Thu, 3 Aug 2023 17:19:55 +0200 Subject: [PATCH 19/23] telemetry run tag --- config/config.yaml | 3 ++- run_kraken.py | 2 ++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/config/config.yaml b/config/config.yaml index 0317a72f..4fbfb509 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -73,4 +73,5 @@ telemetry: full_prometheus_backup: False backup_threads: 
5 archive_path: /tmp - max_retries: 0 \ No newline at end of file + max_retries: 0 + run_tag: chaos \ No newline at end of file diff --git a/run_kraken.py b/run_kraken.py index 23a2a710..816f9b09 100644 --- a/run_kraken.py +++ b/run_kraken.py @@ -110,6 +110,8 @@ def main(cfg): # request_id for telemetry is generated once here and used everywhere telemetry_request_id = f"{int(time.time())}-{run_uuid}" + if config["telemetry"].get("run_tag"): + telemetry_request_id = f"{telemetry_request_id}-{config['telemetry']['run_tag']}" telemetry_log_file = f'{config["telemetry"]["archive_path"]}/{telemetry_request_id}.log' safe_logger = SafeLogger(filename=telemetry_log_file) From dff994a65822441f50f1df6c4fe68c10e8ade2d0 Mon Sep 17 00:00:00 2001 From: Tullio Sebastiani Date: Fri, 4 Aug 2023 17:02:44 +0200 Subject: [PATCH 20/23] requirements pointing to main + archive_size --- config/config.yaml | 3 ++- requirements.txt | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/config/config.yaml b/config/config.yaml index 4fbfb509..7f91ac98 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -74,4 +74,5 @@ telemetry: backup_threads: 5 archive_path: /tmp max_retries: 0 - run_tag: chaos \ No newline at end of file + run_tag: chaos + archive_size: 10000 #in KiloBytes \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 237a4eca..5e766acd 100644 --- a/requirements.txt +++ b/requirements.txt @@ -37,4 +37,4 @@ prometheus_api_client ibm_cloud_sdk_core ibm_vpc pytest -krkn-lib-kubernetes@git+https://github.com/redhat-chaos/krkn-lib-kubernetes.git@krkn_telemetry \ No newline at end of file +krkn-lib-kubernetes@git+https://github.com/redhat-chaos/krkn-lib-kubernetes.git@main \ No newline at end of file From 4059898fda6cc8f5f96d8ab245fec8d4471a6283 Mon Sep 17 00:00:00 2001 From: Tullio Sebastiani Date: Thu, 10 Aug 2023 17:10:02 +0200 Subject: [PATCH 21/23] requirements.txt and config.yaml update --- config/config.yaml | 27 
++++++++++++++++----------- requirements.txt | 2 +- 2 files changed, 17 insertions(+), 12 deletions(-) diff --git a/config/config.yaml b/config/config.yaml index 7f91ac98..156d23e0 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -65,14 +65,19 @@ tunings: iterations: 1 # Number of times to execute the scenarios daemon_mode: False # Iterations are set to infinity which means that the kraken will cause chaos forever telemetry: - enabled: True - api_url: https://ulnmf9xv7j.execute-api.us-west-2.amazonaws.com/production - username: username - password: password - prometheus_backup: True - full_prometheus_backup: False - backup_threads: 5 - archive_path: /tmp - max_retries: 0 - run_tag: chaos - archive_size: 10000 #in KiloBytes \ No newline at end of file + enabled: False # enable/disables the telemetry collection feature + api_url: https://ulnmf9xv7j.execute-api.us-west-2.amazonaws.com/production #telemetry service endpoint + username: username # telemetry service username + password: password # telemetry service password + prometheus_backup: True # enables/disables prometheus data collection + full_prometheus_backup: False # if is set to False only the /prometheus/wal folder will be downloaded. + backup_threads: 5 # number of telemetry download/upload threads + archive_path: /tmp # local path where the archive files will be temporarly stored + max_retries: 0 # maximum number of upload retries (if 0 will retry forever) + run_tag: '' # if set, this will be appended to the run folder in the bucket (useful to group the runs) + archive_size: 10000 # the size of the prometheus data archive size in KB. The lower the size of archive is + # the higher the number of archive files will be produced and uploaded (and processed by backup_threads + # simultaneously). 
+ # For unstable/slow connection is better to keep this value low + # increasing the number of backup_threads, in this way, on upload failure, the retry will happen only on the + # failed chunk without affecting the whole upload. \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 5e766acd..223d6ca0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -37,4 +37,4 @@ prometheus_api_client ibm_cloud_sdk_core ibm_vpc pytest -krkn-lib-kubernetes@git+https://github.com/redhat-chaos/krkn-lib-kubernetes.git@main \ No newline at end of file +krkn-lib-kubernetes >= 0.1.3 \ No newline at end of file From 3fbb887dff6c07118014886a6c112bac14327a2d Mon Sep 17 00:00:00 2001 From: Tullio Sebastiani Date: Thu, 10 Aug 2023 18:15:07 +0200 Subject: [PATCH 22/23] added telemetry config to common config --- CI/config/common_test_config.yaml | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/CI/config/common_test_config.yaml b/CI/config/common_test_config.yaml index f77435e4..c36a6d59 100644 --- a/CI/config/common_test_config.yaml +++ b/CI/config/common_test_config.yaml @@ -29,3 +29,15 @@ tunings: wait_duration: 6 # Duration to wait between each chaos scenario. iterations: 1 # Number of times to execute the scenarios. daemon_mode: False # Iterations are set to infinity which means that the kraken will cause chaos forever. +telemetry: + enabled: False # enable/disables the telemetry collection feature + api_url: https://ulnmf9xv7j.execute-api.us-west-2.amazonaws.com/production #telemetry service endpoint + username: username # telemetry service username + password: password # telemetry service password + prometheus_backup: True # enables/disables prometheus data collection + full_prometheus_backup: False # if is set to False only the /prometheus/wal folder will be downloaded. 
+ backup_threads: 5 # number of telemetry download/upload threads + archive_path: /tmp # local path where the archive files will be temporarily stored + max_retries: 0 # maximum number of upload retries (if 0 will retry forever) + run_tag: '' # if set, this will be appended to the run folder in the bucket (useful to group the runs) + archive_size: 10000 # the size of the prometheus data archive in KB. The lower the size of the archive is From a4f44eef096343e3ee70439a8f95bf74b5f3b693 Mon Sep 17 00:00:00 2001 From: Tullio Sebastiani Date: Thu, 10 Aug 2023 19:11:08 +0200 Subject: [PATCH 23/23] fixed scenario array elements for telemetry --- config/config.yaml | 32 ---------------------------- kraken/application_outage/actions.py | 8 +++---- kraken/pod_scenarios/setup.py | 2 +- kraken/pvc/pvc_scenario.py | 8 +++---- kraken/zone_outage/actions.py | 8 +++---- 5 files changed, 13 insertions(+), 45 deletions(-) diff --git a/config/config.yaml b/config/config.yaml index 156d23e0..d2d3ee9a 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -7,40 +7,8 @@ kraken: signal_address: 0.0.0.0 # Signal listening address port: 8081 # Signal port chaos_scenarios: # List of policies/chaos scenarios to load - - arcaflow_scenarios: - - scenarios/arcaflow/cpu-hog/input.yaml - - scenarios/arcaflow/memory-hog/input.yaml - - container_scenarios: # List of chaos pod scenarios to load - - - scenarios/openshift/container_etcd.yml - - plugin_scenarios: - - scenarios/openshift/etcd.yml - - scenarios/openshift/regex_openshift_pod_kill.yml - - scenarios/openshift/vmware_node_scenarios.yml - - scenarios/openshift/ibmcloud_node_scenarios.yml - - scenarios/openshift/network_chaos_ingress.yml - - scenarios/openshift/pod_network_outage.yml - - node_scenarios: # List of chaos node scenarios to load - - scenarios/openshift/node_scenarios_example.yml - - plugin_scenarios: - - scenarios/openshift/openshift-apiserver.yml - - scenarios/openshift/openshift-kube-apiserver.yml - - time_scenarios: # List 
of chaos time scenarios to load - - scenarios/openshift/time_scenarios_example.yml - - cluster_shut_down_scenarios: - - - scenarios/openshift/cluster_shut_down_scenario.yml - - scenarios/openshift/post_action_shut_down.py - - namespace_scenarios: - - - scenarios/openshift/regex_namespace.yaml - - - scenarios/openshift/ingress_namespace.yaml - - scenarios/openshift/post_action_namespace.py - - zone_outages: - - scenarios/openshift/zone_outage.yaml - application_outages: - scenarios/openshift/app_outage.yaml - - pvc_scenarios: - - scenarios/openshift/pvc_scenario.yaml - - network_chaos: - - scenarios/openshift/network_chaos.yaml cerberus: cerberus_enabled: False # Enable it when cerberus is previously installed diff --git a/kraken/application_outage/actions.py b/kraken/application_outage/actions.py index f8d32085..4d2ae0c6 100644 --- a/kraken/application_outage/actions.py +++ b/kraken/application_outage/actions.py @@ -14,9 +14,9 @@ def run(scenarios_list, config, wait_duration, telemetry: KrknTelemetry) -> (lis failed_scenarios = [] for app_outage_config in scenarios_list: scenario_telemetry = ScenarioTelemetry() - scenario_telemetry.scenario = app_outage_config[0] + scenario_telemetry.scenario = app_outage_config scenario_telemetry.startTimeStamp = time.time() - telemetry.set_parameters_base64(scenario_telemetry, app_outage_config[0]) + telemetry.set_parameters_base64(scenario_telemetry, app_outage_config) if len(app_outage_config) > 1: try: with open(app_outage_config, "r") as f: @@ -65,8 +65,8 @@ def run(scenarios_list, config, wait_duration, telemetry: KrknTelemetry) -> (lis cerberus.publish_kraken_status(config, failed_post_scenarios, start_time, end_time) except Exception as e : scenario_telemetry.exitStatus = 1 - failed_scenarios.append(app_outage_config[0]) - telemetry.log_exception(app_outage_config[0]) + failed_scenarios.append(app_outage_config) + telemetry.log_exception(app_outage_config) else: scenario_telemetry.exitStatus = 0 
scenario_telemetry.endTimeStamp = time.time() diff --git a/kraken/pod_scenarios/setup.py b/kraken/pod_scenarios/setup.py index 6071c327..2c02a5fd 100644 --- a/kraken/pod_scenarios/setup.py +++ b/kraken/pod_scenarios/setup.py @@ -80,7 +80,7 @@ def container_run(kubeconfig_path, for container_scenario_config in scenarios_list: scenario_telemetry = ScenarioTelemetry() - scenario_telemetry.scenario = container_scenario_config + scenario_telemetry.scenario = container_scenario_config[0] scenario_telemetry.startTimeStamp = time.time() telemetry.set_parameters_base64(scenario_telemetry, container_scenario_config[0]) if len(container_scenario_config) > 1: diff --git a/kraken/pvc/pvc_scenario.py b/kraken/pvc/pvc_scenario.py index ca055009..b9d1433a 100644 --- a/kraken/pvc/pvc_scenario.py +++ b/kraken/pvc/pvc_scenario.py @@ -19,9 +19,9 @@ def run(scenarios_list, config, kubecli: krkn_lib_kubernetes.KrknLibKubernetes, failed_scenarios = [] for app_config in scenarios_list: scenario_telemetry = ScenarioTelemetry() - scenario_telemetry.scenario = app_config[0] + scenario_telemetry.scenario = app_config scenario_telemetry.startTimeStamp = time.time() - telemetry.set_parameters_base64(scenario_telemetry, app_config[0]) + telemetry.set_parameters_base64(scenario_telemetry, app_config) try: if len(app_config) > 1: with open(app_config, "r") as f: @@ -307,8 +307,8 @@ def run(scenarios_list, config, kubecli: krkn_lib_kubernetes.KrknLibKubernetes, ) except (RuntimeError, Exception): scenario_telemetry.exitStatus = 1 - failed_scenarios.append(app_config[0]) - telemetry.log_exception(app_config[0]) + failed_scenarios.append(app_config) + telemetry.log_exception(app_config) else: scenario_telemetry.exitStatus = 0 scenario_telemetries.append(scenario_telemetry) diff --git a/kraken/zone_outage/actions.py b/kraken/zone_outage/actions.py index 3c2e3e01..96690068 100644 --- a/kraken/zone_outage/actions.py +++ b/kraken/zone_outage/actions.py @@ -16,9 +16,9 @@ def run(scenarios_list, config, 
wait_duration, telemetry: KrknTelemetry) -> (lis for zone_outage_config in scenarios_list: scenario_telemetry = ScenarioTelemetry() - scenario_telemetry.scenario = zone_outage_config[0] + scenario_telemetry.scenario = zone_outage_config scenario_telemetry.startTimeStamp = time.time() - telemetry.set_parameters_base64(scenario_telemetry, zone_outage_config[0]) + telemetry.set_parameters_base64(scenario_telemetry, zone_outage_config) try: if len(zone_outage_config) > 1: with open(zone_outage_config, "r") as f: @@ -109,8 +109,8 @@ def run(scenarios_list, config, wait_duration, telemetry: KrknTelemetry) -> (lis ) except (RuntimeError, Exception): scenario_telemetry.exitStatus = 1 - failed_scenarios.append(zone_outage_config[0]) - telemetry.log_exception(zone_outage_config[0]) + failed_scenarios.append(zone_outage_config) + telemetry.log_exception(zone_outage_config) else: scenario_telemetry.exitStatus = 0 scenario_telemetry.endTimeStamp = time.time()