From e3948ed518f79ccc5d6fdd4830dc06a21a3bc3ad Mon Sep 17 00:00:00 2001 From: Tullio Sebastiani Date: Fri, 9 Jun 2023 18:15:53 +0200 Subject: [PATCH 01/23] adapted config.yaml to the new feature --- config/config.yaml | 74 ++++++++++++++++++++++++---------------------- 1 file changed, 39 insertions(+), 35 deletions(-) diff --git a/config/config.yaml b/config/config.yaml index 4f0f779f..6680e137 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -7,41 +7,40 @@ kraken: signal_address: 0.0.0.0 # Signal listening address port: 8081 # Signal port chaos_scenarios: # List of policies/chaos scenarios to load - - arcaflow_scenarios: - - scenarios/arcaflow/cpu-hog/input.yaml - - scenarios/arcaflow/memory-hog/input.yaml - - container_scenarios: # List of chaos pod scenarios to load - - - scenarios/openshift/container_etcd.yml - - plugin_scenarios: - - scenarios/openshift/etcd.yml - - scenarios/openshift/regex_openshift_pod_kill.yml - - scenarios/openshift/vmware_node_scenarios.yml - - scenarios/openshift/ibmcloud_node_scenarios.yml - - scenarios/openshift/network_chaos_ingress.yml - - scenarios/openshift/pod_network_outage.yml - - scenarios/openshift/pod_network_shaping.yml - - node_scenarios: # List of chaos node scenarios to load - - scenarios/openshift/node_scenarios_example.yml - - plugin_scenarios: - - scenarios/openshift/openshift-apiserver.yml - - scenarios/openshift/openshift-kube-apiserver.yml - - time_scenarios: # List of chaos time scenarios to load - - scenarios/openshift/time_scenarios_example.yml - - cluster_shut_down_scenarios: - - - scenarios/openshift/cluster_shut_down_scenario.yml - - scenarios/openshift/post_action_shut_down.py - - namespace_scenarios: - - - scenarios/openshift/regex_namespace.yaml - - - scenarios/openshift/ingress_namespace.yaml - - scenarios/openshift/post_action_namespace.py - - zone_outages: - - scenarios/openshift/zone_outage.yaml - - application_outages: - - scenarios/openshift/app_outage.yaml - - pvc_scenarios: - - 
scenarios/openshift/pvc_scenario.yaml - - network_chaos: - - scenarios/openshift/network_chaos.yaml + - arcaflow_scenarios: + - scenarios/arcaflow/cpu-hog/input.yaml + - scenarios/arcaflow/memory-hog/input.yaml + - container_scenarios: # List of chaos pod scenarios to load + - - scenarios/openshift/container_etcd.yml + - plugin_scenarios: + - scenarios/openshift/etcd.yml + - scenarios/openshift/regex_openshift_pod_kill.yml + - scenarios/openshift/vmware_node_scenarios.yml + - scenarios/openshift/ibmcloud_node_scenarios.yml + - scenarios/openshift/network_chaos_ingress.yml + - scenarios/openshift/pod_network_outage.yml + - node_scenarios: # List of chaos node scenarios to load + - scenarios/openshift/node_scenarios_example.yml + - plugin_scenarios: + - scenarios/openshift/openshift-apiserver.yml + - scenarios/openshift/openshift-kube-apiserver.yml + - time_scenarios: # List of chaos time scenarios to load + - scenarios/openshift/time_scenarios_example.yml + - cluster_shut_down_scenarios: + - - scenarios/openshift/cluster_shut_down_scenario.yml + - scenarios/openshift/post_action_shut_down.py + - namespace_scenarios: + - - scenarios/openshift/regex_namespace.yaml + - - scenarios/openshift/ingress_namespace.yaml + - scenarios/openshift/post_action_namespace.py + - zone_outages: + - scenarios/openshift/zone_outage.yaml + - application_outages: + - scenarios/openshift/app_outage.yaml + - pvc_scenarios: + - scenarios/openshift/pvc_scenario.yaml + - network_chaos: + - scenarios/openshift/network_chaos.yaml cerberus: cerberus_enabled: False # Enable it when cerberus is previously installed @@ -65,3 +64,8 @@ tunings: wait_duration: 60 # Duration to wait between each chaos scenario iterations: 1 # Number of times to execute the scenarios daemon_mode: False # Iterations are set to infinity which means that the kraken will cause chaos forever +telemetry: + enabled: True + api_url: https://ulnmf9xv7j.execute-api.us-west-2.amazonaws.com/production + username: username + 
password: password \ No newline at end of file From d351ec5575f32867b85fefa6b139b0db3c5dc9c1 Mon Sep 17 00:00:00 2001 From: Tullio Sebastiani Date: Fri, 9 Jun 2023 18:16:19 +0200 Subject: [PATCH 02/23] temporarly pointing requirement.txt to the lib feature branch --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 5ec8629b..237a4eca 100644 --- a/requirements.txt +++ b/requirements.txt @@ -37,4 +37,4 @@ prometheus_api_client ibm_cloud_sdk_core ibm_vpc pytest -krkn-lib-kubernetes > 0.1.1 +krkn-lib-kubernetes@git+https://github.com/redhat-chaos/krkn-lib-kubernetes.git@krkn_telemetry \ No newline at end of file From 048ccad37d7b7aa82e114313119a28b7736df3db Mon Sep 17 00:00:00 2001 From: Tullio Sebastiani Date: Mon, 12 Jun 2023 09:30:24 +0200 Subject: [PATCH 03/23] run_kraken.py + arcaflow scenarios refactoring typo --- kraken/arcaflow_plugin/arcaflow_plugin.py | 27 ++++++++++++++++------- run_kraken.py | 12 +++++----- 2 files changed, 26 insertions(+), 13 deletions(-) diff --git a/kraken/arcaflow_plugin/arcaflow_plugin.py b/kraken/arcaflow_plugin/arcaflow_plugin.py index 5c8ee707..52c33baf 100644 --- a/kraken/arcaflow_plugin/arcaflow_plugin.py +++ b/kraken/arcaflow_plugin/arcaflow_plugin.py @@ -1,3 +1,5 @@ +import time + import arcaflow import os import yaml @@ -6,22 +8,31 @@ from pathlib import Path from typing import List from .context_auth import ContextAuth +from krkn_lib_kubernetes import ScenarioTelemetry, KrknTelemetry -def run(scenarios_list: List[str], kubeconfig_path: str): +def run(scenarios_list: List[str], kubeconfig_path: str, telemetry: KrknTelemetry) -> (list[str], list[ScenarioTelemetry]): + scenario_telemetries: list[ScenarioTelemetry] = [] + failed_post_scenarios = [] for scenario in scenarios_list: + scenario_telemetry = ScenarioTelemetry() + scenario_telemetry.scenario = scenario + scenario_telemetry.startTimeStamp = time.time() + 
telemetry.set_parameters_base64(scenario_telemetry,scenario) engine_args = build_args(scenario) - run_workflow(engine_args, kubeconfig_path) + status_code = run_workflow(engine_args, kubeconfig_path) + scenario_telemetry.endTimeStamp = time.time() + scenario_telemetry.exitStatus = status_code + scenario_telemetries.append(scenario_telemetry) + if status_code != 0: + failed_post_scenarios.append(scenario) + return failed_post_scenarios, scenario_telemetries -def run_workflow(engine_args: arcaflow.EngineArgs, kubeconfig_path: str): +def run_workflow(engine_args: arcaflow.EngineArgs, kubeconfig_path: str) -> int: set_arca_kubeconfig(engine_args, kubeconfig_path) exit_status = arcaflow.run(engine_args) - if exit_status != 0: - logging.error( - f"failed to run arcaflow scenario {engine_args.input}" - ) - sys.exit(exit_status) + return exit_status def build_args(input_file: str) -> arcaflow.EngineArgs: diff --git a/run_kraken.py b/run_kraken.py index f48dbd17..2a5218ab 100644 --- a/run_kraken.py +++ b/run_kraken.py @@ -25,7 +25,7 @@ import server as server import kraken.prometheus.client as promcli from kraken import plugins -from krkn_lib_kubernetes import KrknLibKubernetes +from krkn_lib_kubernetes import KrknLibKubernetes, KrknTelemetry, ChaosRunTelemetry KUBE_BURNER_URL = ( "https://github.com/cloud-bulldozer/kube-burner/" @@ -171,7 +171,7 @@ def main(cfg): # Capture the start time start_time = int(time.time()) litmus_installed = False - + chaos_telemetry = ChaosRunTelemetry() # Loop to run the chaos starts here while int(iteration) < iterations and run_signal != "STOP": # Inject chaos scenarios specified in the config @@ -203,9 +203,10 @@ def main(cfg): ) sys.exit(1) elif scenario_type == "arcaflow_scenarios": - failed_post_scenarios = arcaflow_plugin.run( - scenarios_list, kubeconfig_path + failed_post_scenarios, scenario_telemetries = arcaflow_plugin.run( + scenarios_list, kubeconfig_path, telemetry ) + chaos_telemetry.scenarios.extend(scenario_telemetries) elif 
scenario_type == "plugin_scenarios": failed_post_scenarios = plugins.run( @@ -352,7 +353,8 @@ def main(cfg): iteration += 1 logging.info("") - + # send telemetry + telemetry.send_telemetry(config["telemetry"], chaos_telemetry) # Capture the end time end_time = int(time.time()) From 77e81a3994abc3c8e26e7746dd70a8b306c6a56a Mon Sep 17 00:00:00 2001 From: Tullio Sebastiani Date: Wed, 14 Jun 2023 11:41:10 +0200 Subject: [PATCH 04/23] plugin scenario --- kraken/plugins/__init__.py | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/kraken/plugins/__init__.py b/kraken/plugins/__init__.py index 9c1b7768..7fb28dfb 100644 --- a/kraken/plugins/__init__.py +++ b/kraken/plugins/__init__.py @@ -13,6 +13,7 @@ from kraken.plugins.network.ingress_shaping import network_chaos from kraken.plugins.pod_network_outage.pod_network_outage_plugin import pod_outage from kraken.plugins.pod_network_outage.pod_network_outage_plugin import pod_egress_shaping +from krkn_lib_kubernetes import ScenarioTelemetry, KrknTelemetry @dataclasses.dataclass @@ -225,16 +226,26 @@ def json_schema(self): ) -def run(scenarios: List[str], kubeconfig_path: str, kraken_config: str, failed_post_scenarios: List[str], wait_duration: int) -> List[str]: +def run(scenarios: List[str], kubeconfig_path: str, kraken_config: str, failed_post_scenarios: List[str], wait_duration: int, telemetry: KrknTelemetry) -> (List[str], list[ScenarioTelemetry]): + scenario_telemetries: list[ScenarioTelemetry] = [] for scenario in scenarios: + scenario_telemetry = ScenarioTelemetry() + scenario_telemetry.scenario = scenario + scenario_telemetry.startTimeStamp = time.time() + telemetry.set_parameters_base64(scenario_telemetry, scenario) logging.info('scenario '+ str(scenario)) try: PLUGINS.run(scenario, kubeconfig_path, kraken_config) except Exception as e: + scenario_telemetry.exitStatus = 1 failed_post_scenarios.append(scenario) - logging.error("Error while running {}: {}".format(scenario, e)) - 
return failed_post_scenarios - logging.info("Waiting for the specified duration: %s" % (wait_duration)) - time.sleep(wait_duration) - - return failed_post_scenarios + telemetry.log_exception(scenario) + else: + scenario_telemetry.exitStatus = 0 + logging.info("Waiting for the specified duration: %s" % (wait_duration)) + scenario_telemetries.append(scenario_telemetry) + time.sleep(wait_duration) + scenario_telemetries.append(scenario_telemetry) + scenario_telemetry.endTimeStamp = time.time() + + return failed_post_scenarios, scenario_telemetries From 87fa6d1501063484fc0256840f684dd28578f158 Mon Sep 17 00:00:00 2001 From: Tullio Sebastiani Date: Wed, 14 Jun 2023 11:48:14 +0200 Subject: [PATCH 05/23] node scenarios return failed scenarios --- kraken/node_actions/aws_node_scenarios.py | 48 ++++++++++++++----- kraken/node_actions/az_node_scenarios.py | 32 +++++++++---- kraken/node_actions/gcp_node_scenarios.py | 36 ++++++++++---- .../node_actions/openstack_node_scenarios.py | 36 ++++++++++---- kraken/node_actions/run.py | 34 +++++++++---- 5 files changed, 140 insertions(+), 46 deletions(-) diff --git a/kraken/node_actions/aws_node_scenarios.py b/kraken/node_actions/aws_node_scenarios.py index ded183e5..d53ff5a3 100644 --- a/kraken/node_actions/aws_node_scenarios.py +++ b/kraken/node_actions/aws_node_scenarios.py @@ -27,7 +27,9 @@ def start_instances(self, instance_id): logging.error( "Failed to start node instance %s. Encountered following " "exception: %s." % (instance_id, e) ) - sys.exit(1) + # removed_exit + # sys.exit(1) + raise RuntimeError() # Stop the node instance def stop_instances(self, instance_id): @@ -36,7 +38,9 @@ def stop_instances(self, instance_id): logging.info("EC2 instance: " + str(instance_id) + " stopped") except Exception as e: logging.error("Failed to stop node instance %s. Encountered following " "exception: %s." 
% (instance_id, e)) - sys.exit(1) + # removed_exit + # sys.exit(1) + raise RuntimeError() # Terminate the node instance def terminate_instances(self, instance_id): @@ -47,7 +51,9 @@ def terminate_instances(self, instance_id): logging.error( "Failed to terminate node instance %s. Encountered following " "exception: %s." % (instance_id, e) ) - sys.exit(1) + # removed_exit + # sys.exit(1) + raise RuntimeError() # Reboot the node instance def reboot_instances(self, instance_id): @@ -58,7 +64,9 @@ def reboot_instances(self, instance_id): logging.error( "Failed to reboot node instance %s. Encountered following " "exception: %s." % (instance_id, e) ) - sys.exit(1) + # removed_exit + # sys.exit(1) + raise RuntimeError() # Below functions poll EC2.Client.describe_instances() every 15 seconds # until a successful state is reached. An error is returned after 40 failed checks @@ -102,7 +110,9 @@ def create_default_network_acl(self, vpc_id): "Failed to create the default network_acl: %s" "Make sure you have aws cli configured on the host and set for the region of your vpc/subnet" % (e) ) - sys.exit(1) + # removed_exit + # sys.exit(1) + raise RuntimeError() return acl_id # Replace network acl association @@ -114,7 +124,9 @@ def replace_network_acl_association(self, association_id, acl_id): new_association_id = status["NewAssociationId"] except Exception as e: logging.error("Failed to replace network acl association: %s" % (e)) - sys.exit(1) + # removed_exit + # sys.exit(1) + raise RuntimeError() return new_association_id # Describe network acl @@ -131,7 +143,9 @@ def describe_network_acls(self, vpc_id, subnet_id): "Failed to describe network acl: %s." 
"Make sure you have aws cli configured on the host and set for the region of your vpc/subnet" % (e) ) - sys.exit(1) + # removed_exit + # sys.exit(1) + raise RuntimeError() associations = response["NetworkAcls"][0]["Associations"] # grab the current network_acl in use original_acl_id = response["NetworkAcls"][0]["Associations"][0]["NetworkAclId"] @@ -148,7 +162,9 @@ def delete_network_acl(self, acl_id): "Make sure you have aws cli configured on the host and set for the region of your vpc/subnet" % (acl_id, e) ) - sys.exit(1) + # removed_exit + # sys.exit(1) + raise RuntimeError() # krkn_lib_kubernetes class aws_node_scenarios(abstract_node_scenarios): @@ -173,7 +189,9 @@ def node_start_scenario(self, instance_kill_count, node, timeout): "Failed to start node instance. Encountered following " "exception: %s. Test Failed" % (e) ) logging.error("node_start_scenario injection failed!") - sys.exit(1) + # removed_exit + # sys.exit(1) + raise RuntimeError() # Node scenario to stop the node def node_stop_scenario(self, instance_kill_count, node, timeout): @@ -189,7 +207,9 @@ def node_stop_scenario(self, instance_kill_count, node, timeout): except Exception as e: logging.error("Failed to stop node instance. Encountered following exception: %s. " "Test Failed" % (e)) logging.error("node_stop_scenario injection failed!") - sys.exit(1) + # removed_exit + # sys.exit(1) + raise RuntimeError() # Node scenario to terminate the node def node_termination_scenario(self, instance_kill_count, node, timeout): @@ -213,7 +233,9 @@ def node_termination_scenario(self, instance_kill_count, node, timeout): "Failed to terminate node instance. Encountered following exception:" " %s. 
Test Failed" % (e) ) logging.error("node_termination_scenario injection failed!") - sys.exit(1) + # removed_exit + # sys.exit(1) + raise RuntimeError() # Node scenario to reboot the node def node_reboot_scenario(self, instance_kill_count, node, timeout): @@ -232,4 +254,6 @@ def node_reboot_scenario(self, instance_kill_count, node, timeout): "Failed to reboot node instance. Encountered following exception:" " %s. Test Failed" % (e) ) logging.error("node_reboot_scenario injection failed!") - sys.exit(1) + # removed_exit + # sys.exit(1) + raise RuntimeError() diff --git a/kraken/node_actions/az_node_scenarios.py b/kraken/node_actions/az_node_scenarios.py index 5a093254..adffb79d 100644 --- a/kraken/node_actions/az_node_scenarios.py +++ b/kraken/node_actions/az_node_scenarios.py @@ -39,7 +39,9 @@ def start_instances(self, group_name, vm_name): logging.info("vm name " + str(vm_name) + " started") except Exception as e: logging.error("Failed to start node instance %s. Encountered following " "exception: %s." % (vm_name, e)) - sys.exit(1) + # removed_exit + # sys.exit(1) + raise RuntimeError() # Stop the node instance def stop_instances(self, group_name, vm_name): @@ -48,7 +50,9 @@ def stop_instances(self, group_name, vm_name): logging.info("vm name " + str(vm_name) + " stopped") except Exception as e: logging.error("Failed to stop node instance %s. Encountered following " "exception: %s." % (vm_name, e)) - sys.exit(1) + # removed_exit + # sys.exit(1) + raise RuntimeError() # Terminate the node instance def terminate_instances(self, group_name, vm_name): @@ -59,7 +63,9 @@ def terminate_instances(self, group_name, vm_name): logging.error( "Failed to terminate node instance %s. Encountered following " "exception: %s." 
% (vm_name, e) ) - sys.exit(1) + # removed_exit + # sys.exit(1) + raise RuntimeError() # Reboot the node instance def reboot_instances(self, group_name, vm_name): @@ -68,7 +74,9 @@ def reboot_instances(self, group_name, vm_name): logging.info("vm name " + str(vm_name) + " rebooted") except Exception as e: logging.error("Failed to reboot node instance %s. Encountered following " "exception: %s." % (vm_name, e)) - sys.exit(1) + # removed_exit + # sys.exit(1) + raise RuntimeError() def get_vm_status(self, resource_group, vm_name): statuses = self.compute_client.virtual_machines.instance_view(resource_group, vm_name).statuses @@ -145,7 +153,9 @@ def node_start_scenario(self, instance_kill_count, node, timeout): "Failed to start node instance. Encountered following " "exception: %s. Test Failed" % (e) ) logging.error("node_start_scenario injection failed!") - sys.exit(1) + # removed_exit + # sys.exit(1) + raise RuntimeError() # Node scenario to stop the node def node_stop_scenario(self, instance_kill_count, node, timeout): @@ -161,7 +171,9 @@ def node_stop_scenario(self, instance_kill_count, node, timeout): except Exception as e: logging.error("Failed to stop node instance. Encountered following exception: %s. " "Test Failed" % e) logging.error("node_stop_scenario injection failed!") - sys.exit(1) + # removed_exit + # sys.exit(1) + raise RuntimeError() # Node scenario to terminate the node def node_termination_scenario(self, instance_kill_count, node, timeout): @@ -185,7 +197,9 @@ def node_termination_scenario(self, instance_kill_count, node, timeout): "Failed to terminate node instance. Encountered following exception:" " %s. 
Test Failed" % (e) ) logging.error("node_termination_scenario injection failed!") - sys.exit(1) + # removed_exit + # sys.exit(1) + raise RuntimeError() # Node scenario to reboot the node def node_reboot_scenario(self, instance_kill_count, node, timeout): @@ -204,4 +218,6 @@ def node_reboot_scenario(self, instance_kill_count, node, timeout): "Failed to reboot node instance. Encountered following exception:" " %s. Test Failed" % (e) ) logging.error("node_reboot_scenario injection failed!") - sys.exit(1) + # removed_exit + # sys.exit(1) + raise RuntimeError() \ No newline at end of file diff --git a/kraken/node_actions/gcp_node_scenarios.py b/kraken/node_actions/gcp_node_scenarios.py index db7a11d7..64711a2a 100644 --- a/kraken/node_actions/gcp_node_scenarios.py +++ b/kraken/node_actions/gcp_node_scenarios.py @@ -45,7 +45,9 @@ def start_instances(self, zone, instance_id): logging.error( "Failed to start node instance %s. Encountered following " "exception: %s." % (instance_id, e) ) - sys.exit(1) + # removed_exit + # sys.exit(1) + raise RuntimeError() # Stop the node instance def stop_instances(self, zone, instance_id): @@ -54,7 +56,9 @@ def stop_instances(self, zone, instance_id): logging.info("vm name " + str(instance_id) + " stopped") except Exception as e: logging.error("Failed to stop node instance %s. Encountered following " "exception: %s." % (instance_id, e)) - sys.exit(1) + # removed_exit + # sys.exit(1) + raise RuntimeError() # Start the node instance def suspend_instances(self, zone, instance_id): @@ -65,7 +69,9 @@ def suspend_instances(self, zone, instance_id): logging.error( "Failed to suspend node instance %s. Encountered following " "exception: %s." % (instance_id, e) ) - sys.exit(1) + # removed_exit + # sys.exit(1) + raise RuntimeError() # Terminate the node instance def terminate_instances(self, zone, instance_id): @@ -76,7 +82,9 @@ def terminate_instances(self, zone, instance_id): logging.error( "Failed to start node instance %s. 
Encountered following " "exception: %s." % (instance_id, e) ) - sys.exit(1) + # removed_exit + # sys.exit(1) + raise RuntimeError() # Reboot the node instance def reboot_instances(self, zone, instance_id): @@ -87,7 +95,9 @@ def reboot_instances(self, zone, instance_id): logging.error( "Failed to start node instance %s. Encountered following " "exception: %s." % (instance_id, e) ) - sys.exit(1) + # removed_exit + # sys.exit(1) + raise RuntimeError() # Get instance status def get_instance_status(self, zone, instance_id, expected_status, timeout): @@ -156,7 +166,9 @@ def node_start_scenario(self, instance_kill_count, node, timeout): "Failed to start node instance. Encountered following " "exception: %s. Test Failed" % (e) ) logging.error("node_start_scenario injection failed!") - sys.exit(1) + # removed_exit + # sys.exit(1) + raise RuntimeError() # Node scenario to stop the node def node_stop_scenario(self, instance_kill_count, node, timeout): @@ -173,7 +185,9 @@ def node_stop_scenario(self, instance_kill_count, node, timeout): except Exception as e: logging.error("Failed to stop node instance. Encountered following exception: %s. " "Test Failed" % (e)) logging.error("node_stop_scenario injection failed!") - sys.exit(1) + # removed_exit + # sys.exit(1) + raise RuntimeError() # Node scenario to terminate the node def node_termination_scenario(self, instance_kill_count, node, timeout): @@ -197,7 +211,9 @@ def node_termination_scenario(self, instance_kill_count, node, timeout): "Failed to terminate node instance. Encountered following exception:" " %s. Test Failed" % e ) logging.error("node_termination_scenario injection failed!") - sys.exit(1) + # removed_exit + # sys.exit(1) + raise RuntimeError() # Node scenario to reboot the node def node_reboot_scenario(self, instance_kill_count, node, timeout): @@ -215,4 +231,6 @@ def node_reboot_scenario(self, instance_kill_count, node, timeout): "Failed to reboot node instance. Encountered following exception:" " %s. 
Test Failed" % (e) ) logging.error("node_reboot_scenario injection failed!") - sys.exit(1) + # removed_exit + # sys.exit(1) + raise RuntimeError() diff --git a/kraken/node_actions/openstack_node_scenarios.py b/kraken/node_actions/openstack_node_scenarios.py index 9e8bcdde..bae44e90 100644 --- a/kraken/node_actions/openstack_node_scenarios.py +++ b/kraken/node_actions/openstack_node_scenarios.py @@ -24,7 +24,9 @@ def start_instances(self, node): logging.info("Instance: " + str(node) + " started") except Exception as e: logging.error("Failed to start node instance %s. Encountered following " "exception: %s." % (node, e)) - sys.exit(1) + # removed_exit + # sys.exit(1) + raise RuntimeError() # Stop the node instance def stop_instances(self, node): @@ -33,7 +35,9 @@ def stop_instances(self, node): logging.info("Instance: " + str(node) + " stopped") except Exception as e: logging.error("Failed to stop node instance %s. Encountered following " "exception: %s." % (node, e)) - sys.exit(1) + # removed_exit + # sys.exit(1) + raise RuntimeError() # Reboot the node instance def reboot_instances(self, node): @@ -42,7 +46,9 @@ def reboot_instances(self, node): logging.info("Instance: " + str(node) + " rebooted") except Exception as e: logging.error("Failed to reboot node instance %s. Encountered following " "exception: %s." % (node, e)) - sys.exit(1) + # removed_exit + # sys.exit(1) + raise RuntimeError() # Wait until the node instance is running def wait_until_running(self, node, timeout): @@ -109,7 +115,9 @@ def node_start_scenario(self, instance_kill_count, node, timeout): "Failed to start node instance. Encountered following " "exception: %s. 
Test Failed" % (e) ) logging.error("node_start_scenario injection failed!") - sys.exit(1) + # removed_exit + # sys.exit(1) + raise RuntimeError() # Node scenario to stop the node def node_stop_scenario(self, instance_kill_count, node, timeout): @@ -125,7 +133,9 @@ def node_stop_scenario(self, instance_kill_count, node, timeout): except Exception as e: logging.error("Failed to stop node instance. Encountered following exception: %s. " "Test Failed" % (e)) logging.error("node_stop_scenario injection failed!") - sys.exit(1) + # removed_exit + # sys.exit(1) + raise RuntimeError() # Node scenario to reboot the node def node_reboot_scenario(self, instance_kill_count, node, timeout): @@ -144,7 +154,9 @@ def node_reboot_scenario(self, instance_kill_count, node, timeout): "Failed to reboot node instance. Encountered following exception:" " %s. Test Failed" % (e) ) logging.error("node_reboot_scenario injection failed!") - sys.exit(1) + # removed_exit + # sys.exit(1) + raise RuntimeError() # Node scenario to start the node def helper_node_start_scenario(self, instance_kill_count, node_ip, timeout): @@ -162,7 +174,9 @@ def helper_node_start_scenario(self, instance_kill_count, node_ip, timeout): "Failed to start node instance. Encountered following " "exception: %s. Test Failed" % (e) ) logging.error("helper_node_start_scenario injection failed!") - sys.exit(1) + # removed_exit + # sys.exit(1) + raise RuntimeError() # Node scenario to stop the node def helper_node_stop_scenario(self, instance_kill_count, node_ip, timeout): @@ -177,7 +191,9 @@ def helper_node_stop_scenario(self, instance_kill_count, node_ip, timeout): except Exception as e: logging.error("Failed to stop node instance. Encountered following exception: %s. 
" "Test Failed" % (e)) logging.error("helper_node_stop_scenario injection failed!") - sys.exit(1) + # removed_exit + # sys.exit(1) + raise RuntimeError() def helper_node_service_status(self, node_ip, service, ssh_private_key, timeout): try: @@ -188,4 +204,6 @@ def helper_node_service_status(self, node_ip, service, ssh_private_key, timeout) except Exception as e: logging.error("Failed to check service status. Encountered following exception:" " %s. Test Failed" % (e)) logging.error("helper_node_service_status injection failed!") - sys.exit(1) + # removed_exit + # sys.exit(1) + raise RuntimeError() diff --git a/kraken/node_actions/run.py b/kraken/node_actions/run.py index b37660e7..c7e0bf5c 100644 --- a/kraken/node_actions/run.py +++ b/kraken/node_actions/run.py @@ -13,7 +13,7 @@ from kraken.node_actions.docker_node_scenarios import docker_node_scenarios import kraken.node_actions.common_node_functions as common_node_functions import kraken.cerberus.setup as cerberus - +from krkn_lib_kubernetes import ScenarioTelemetry, KrknTelemetry node_general = False @@ -53,8 +53,14 @@ def get_node_scenario_object(node_scenario, kubecli: krkn_lib_kubernetes.KrknLib # Run defined scenarios # krkn_lib_kubernetes -def run(scenarios_list, config, wait_duration, kubecli: krkn_lib_kubernetes.KrknLibKubernetes): +def run(scenarios_list, config, wait_duration, kubecli: krkn_lib_kubernetes.KrknLibKubernetes, telemetry: KrknTelemetry) -> (list[str], list[ScenarioTelemetry]): + scenario_telemetries: list[ScenarioTelemetry] = [] + failed_scenarios = [] for node_scenario_config in scenarios_list: + scenario_telemetry = ScenarioTelemetry() + scenario_telemetry.scenario = node_scenario_config + scenario_telemetry.startTimeStamp = time.time() + telemetry.set_parameters_base64(scenario_telemetry, node_scenario_config) with open(node_scenario_config, "r") as f: node_scenario_config = yaml.full_load(f) for node_scenario in node_scenario_config["node_scenarios"]: @@ -62,12 +68,24 @@ def 
run(scenarios_list, config, wait_duration, kubecli: krkn_lib_kubernetes.Krkn if node_scenario["actions"]: for action in node_scenario["actions"]: start_time = int(time.time()) - inject_node_scenario(action, node_scenario, node_scenario_object, kubecli) - logging.info("Waiting for the specified duration: %s" % (wait_duration)) - time.sleep(wait_duration) - end_time = int(time.time()) - cerberus.get_status(config, start_time, end_time) - logging.info("") + try: + inject_node_scenario(action, node_scenario, node_scenario_object, kubecli) + logging.info("Waiting for the specified duration: %s" % (wait_duration)) + time.sleep(wait_duration) + end_time = int(time.time()) + cerberus.get_status(config, start_time, end_time) + logging.info("") + except (RuntimeError, Exception) as e: + scenario_telemetry.exitStatus = 1 + failed_scenarios.append(node_scenario_config) + telemetry.log_exception(node_scenario_config) + else: + scenario_telemetry.exitStatus = 0 + + scenario_telemetry.endTimeStamp = time.time() + scenario_telemetries.append(scenario_telemetry) + + return failed_scenarios, scenario_telemetries # Inject the specified node scenario From 4bf5afb21de9b7bda7fafacf78c9df68f743a175 Mon Sep 17 00:00:00 2001 From: Tullio Sebastiani Date: Wed, 14 Jun 2023 11:56:05 +0200 Subject: [PATCH 06/23] container scenarios fix --- kraken/pod_scenarios/setup.py | 80 ++++++++++++++++++++++++----------- 1 file changed, 56 insertions(+), 24 deletions(-) diff --git a/kraken/pod_scenarios/setup.py b/kraken/pod_scenarios/setup.py index 2eeb4db5..6071c327 100644 --- a/kraken/pod_scenarios/setup.py +++ b/kraken/pod_scenarios/setup.py @@ -10,7 +10,7 @@ import yaml import sys import random - +from krkn_lib_kubernetes import ScenarioTelemetry, KrknTelemetry # Run pod based scenarios def run(kubeconfig_path, scenarios_list, config, failed_post_scenarios, wait_duration): @@ -67,8 +67,22 @@ def run(kubeconfig_path, scenarios_list, config, failed_post_scenarios, wait_dur return failed_post_scenarios 
# krkn_lib_kubernetes -def container_run(kubeconfig_path, scenarios_list, config, failed_post_scenarios, wait_duration, kubecli: krkn_lib_kubernetes.KrknLibKubernetes): +def container_run(kubeconfig_path, + scenarios_list, + config, + failed_post_scenarios, + wait_duration, + kubecli: krkn_lib_kubernetes.KrknLibKubernetes, + telemetry: KrknTelemetry) -> (list[str], list[ScenarioTelemetry]): + + failed_scenarios = [] + scenario_telemetries: list[ScenarioTelemetry] = [] + for container_scenario_config in scenarios_list: + scenario_telemetry = ScenarioTelemetry() + scenario_telemetry.scenario = container_scenario_config + scenario_telemetry.startTimeStamp = time.time() + telemetry.set_parameters_base64(scenario_telemetry, container_scenario_config[0]) if len(container_scenario_config) > 1: pre_action_output = post_actions.run(kubeconfig_path, container_scenario_config[1]) else: @@ -78,30 +92,41 @@ def container_run(kubeconfig_path, scenarios_list, config, failed_post_scenarios for cont_scenario in cont_scenario_config["scenarios"]: # capture start time start_time = int(time.time()) - killed_containers = container_killing_in_pod(cont_scenario, kubecli) - - if len(container_scenario_config) > 1: - try: + try: + killed_containers = container_killing_in_pod(cont_scenario, kubecli) + if len(container_scenario_config) > 1: failed_post_scenarios = post_actions.check_recovery( - kubeconfig_path, container_scenario_config, failed_post_scenarios, pre_action_output + kubeconfig_path, + container_scenario_config, + failed_post_scenarios, + pre_action_output + ) + else: + failed_post_scenarios = check_failed_containers( + killed_containers, cont_scenario.get("retry_wait", 120), kubecli ) - except Exception as e: - logging.error("Failed to run post action checks: %s" % e) - sys.exit(1) - else: - failed_post_scenarios = check_failed_containers( - killed_containers, cont_scenario.get("retry_wait", 120), kubecli - ) - logging.info("Waiting for the specified duration: %s" % 
(wait_duration)) - time.sleep(wait_duration) + logging.info("Waiting for the specified duration: %s" % (wait_duration)) + time.sleep(wait_duration) - # capture end time - end_time = int(time.time()) + # capture end time + end_time = int(time.time()) + + # publish cerberus status + cerberus.publish_kraken_status(config, failed_post_scenarios, start_time, end_time) + except (RuntimeError, Exception): + failed_scenarios.append(container_scenario_config[0]) + telemetry.log_exception(container_scenario_config[0]) + scenario_telemetry.exitStatus = 1 + # removed_exit + # sys.exit(1) + else: + scenario_telemetry.exitStatus = 0 + scenario_telemetry.endTimeStamp = time.time() + scenario_telemetries.append(scenario_telemetry) + + return failed_scenarios, scenario_telemetries - # publish cerberus status - cerberus.publish_kraken_status(config, failed_post_scenarios, start_time, end_time) - logging.info("") def container_killing_in_pod(cont_scenario, kubecli: krkn_lib_kubernetes.KrknLibKubernetes): @@ -114,7 +139,9 @@ def container_killing_in_pod(cont_scenario, kubecli: krkn_lib_kubernetes.KrknLib kill_count = cont_scenario.get("count", 1) if type(pod_names) != list: logging.error("Please make sure your pod_names are in a list format") - sys.exit(1) + # removed_exit + # sys.exit(1) + raise RuntimeError() if len(pod_names) == 0: if namespace == "*": # returns double array of pod name and namespace @@ -126,7 +153,9 @@ def container_killing_in_pod(cont_scenario, kubecli: krkn_lib_kubernetes.KrknLib if namespace == "*": logging.error("You must specify the namespace to kill a container in a specific pod") logging.error("Scenario " + scenario_name + " failed") - sys.exit(1) + # removed_exit + # sys.exit(1) + raise RuntimeError() pods = pod_names # get container and pod name container_pod_list = [] @@ -147,7 +176,9 @@ def container_killing_in_pod(cont_scenario, kubecli: krkn_lib_kubernetes.KrknLib if len(container_pod_list) == 0: logging.error("Trying to kill more containers than were 
found, try lowering kill count") logging.error("Scenario " + scenario_name + " failed") - sys.exit(1) + # removed_exit + # sys.exit(1) + raise RuntimeError() selected_container_pod = container_pod_list[random.randint(0, len(container_pod_list) - 1)] for c_name in selected_container_pod[2]: if container_name != "": @@ -178,6 +209,7 @@ def retry_container_killing(kill_action, podname, namespace, container_name, kub time.sleep(2) continue else: + logging.warning(response) continue From 83a0f8248abb1d8948809df2a24b76ca0bb439bd Mon Sep 17 00:00:00 2001 From: Tullio Sebastiani Date: Wed, 14 Jun 2023 11:58:34 +0200 Subject: [PATCH 07/23] time scenarios --- kraken/time_actions/common_time_functions.py | 74 +++++++++++++------- 1 file changed, 50 insertions(+), 24 deletions(-) diff --git a/kraken/time_actions/common_time_functions.py b/kraken/time_actions/common_time_functions.py index b96f0c53..5ace3c2c 100644 --- a/kraken/time_actions/common_time_functions.py +++ b/kraken/time_actions/common_time_functions.py @@ -8,6 +8,7 @@ import krkn_lib_kubernetes from ..cerberus import setup as cerberus from ..invoke import command as runcommand +from krkn_lib_kubernetes import ScenarioTelemetry, KrknTelemetry # krkn_lib_kubernetes def pod_exec(pod_name, command, namespace, container_name, kubecli: krkn_lib_kubernetes.KrknLibKubernetes): @@ -93,7 +94,9 @@ def skew_time(scenario, kubecli: krkn_lib_kubernetes.KrknLibKubernetes): for name in scenario["object_name"]: if "namespace" not in scenario.keys(): logging.error("Need to set namespace when using pod name") - sys.exit(1) + # removed_exit + # sys.exit(1) + raise RuntimeError() pod_names.append([name, scenario["namespace"]]) elif "namespace" in scenario.keys() and scenario["namespace"]: if "label_selector" not in scenario.keys(): @@ -127,7 +130,9 @@ def skew_time(scenario, kubecli: krkn_lib_kubernetes.KrknLibKubernetes): "Cannot find pods matching the namespace/label_selector, " "please check" ) - sys.exit(1) + # removed_exit + # 
sys.exit(1) + raise RuntimeError() pod_counter = 0 for pod in pod_names: if len(pod) > 1: @@ -152,7 +157,9 @@ def skew_time(scenario, kubecli: krkn_lib_kubernetes.KrknLibKubernetes): "in pod %s in namespace %s" % (selected_container_name, pod[0], pod[1]) ) - sys.exit(1) + # removed_exit + # sys.exit(1) + raise RuntimeError() pod_names[pod_counter].append(selected_container_name) else: selected_container_name = get_container_name( @@ -178,7 +185,9 @@ def skew_time(scenario, kubecli: krkn_lib_kubernetes.KrknLibKubernetes): scenario["namespace"] ) ) - sys.exit(1) + # removed_exit + # sys.exit(1) + raise RuntimeError() pod_names[pod_counter].append(selected_container_name) logging.info("Reset date/time on pod " + str(pod[0])) pod_counter += 1 @@ -299,24 +308,41 @@ def check_date_time(object_type, names, kubecli: krkn_lib_kubernetes.KrknLibKube # krkn_lib_kubernetes -def run(scenarios_list, config, wait_duration, kubecli: krkn_lib_kubernetes.KrknLibKubernetes): +def run(scenarios_list, config, wait_duration, kubecli: krkn_lib_kubernetes.KrknLibKubernetes, telemetry: KrknTelemetry) -> (list[str], list[ScenarioTelemetry]): + failed_scenarios = [] + scenario_telemetries: list[ScenarioTelemetry] = [] for time_scenario_config in scenarios_list: - with open(time_scenario_config, "r") as f: - scenario_config = yaml.full_load(f) - for time_scenario in scenario_config["time_scenarios"]: - start_time = int(time.time()) - object_type, object_names = skew_time(time_scenario, kubecli) - not_reset = check_date_time(object_type, object_names, kubecli) - if len(not_reset) > 0: - logging.info("Object times were not reset") - logging.info( - "Waiting for the specified duration: %s" % (wait_duration) - ) - time.sleep(wait_duration) - end_time = int(time.time()) - cerberus.publish_kraken_status( - config, - not_reset, - start_time, - end_time - ) + scenario_telemetry = ScenarioTelemetry() + scenario_telemetry.scenario = time_scenario_config + scenario_telemetry.startTimeStamp = time.time() 
+ telemetry.set_parameters_base64(scenario_telemetry, time_scenario_config) + try: + with open(time_scenario_config, "r") as f: + scenario_config = yaml.full_load(f) + for time_scenario in scenario_config["time_scenarios"]: + start_time = int(time.time()) + object_type, object_names = skew_time(time_scenario, kubecli) + not_reset = check_date_time(object_type, object_names, kubecli) + if len(not_reset) > 0: + logging.info("Object times were not reset") + logging.info( + "Waiting for the specified duration: %s" % (wait_duration) + ) + time.sleep(wait_duration) + end_time = int(time.time()) + cerberus.publish_kraken_status( + config, + not_reset, + start_time, + end_time + ) + except (RuntimeError, Exception): + scenario_telemetry.exitStatus = 1 + telemetry.log_exception(time_scenario_config) + failed_scenarios.append(time_scenario_config) + else: + scenario_telemetry.exitStatus = 0 + scenario_telemetry.endTimeStamp = time.time() + scenario_telemetries.append(scenario_telemetry) + + return failed_scenarios, scenario_telemetries From 68036695a7c909c9180e737c2f0e4d9f0058234f Mon Sep 17 00:00:00 2001 From: Tullio Sebastiani Date: Wed, 14 Jun 2023 11:59:01 +0200 Subject: [PATCH 08/23] cluster shutdown scenarios --- kraken/shut_down/common_shut_down_func.py | 61 ++++++++++++++++------- 1 file changed, 42 insertions(+), 19 deletions(-) diff --git a/kraken/shut_down/common_shut_down_func.py b/kraken/shut_down/common_shut_down_func.py index ed0ac51c..e498b819 100644 --- a/kraken/shut_down/common_shut_down_func.py +++ b/kraken/shut_down/common_shut_down_func.py @@ -1,5 +1,5 @@ #!/usr/bin/env python - +import os import sys import yaml import logging @@ -13,7 +13,7 @@ from ..node_actions.openstack_node_scenarios import OPENSTACKCLOUD from ..node_actions.az_node_scenarios import Azure from ..node_actions.gcp_node_scenarios import GCP - +from krkn_lib_kubernetes import ScenarioTelemetry, KrknTelemetry def multiprocess_nodes(cloud_object_function, nodes): try: @@ -59,7 +59,9 @@ 
def cluster_shut_down(shut_down_config, kubecli: krkn_lib_kubernetes.KrknLibKube "Cloud type %s is not currently supported for cluster shut down" % cloud_type ) - sys.exit(1) + # removed_exit + # sys.exit(1) + raise RuntimeError() nodes = kubecli.list_nodes() node_id = [] @@ -128,9 +130,16 @@ def cluster_shut_down(shut_down_config, kubecli: krkn_lib_kubernetes.KrknLibKube # krkn_lib_kubernetes -def run(scenarios_list, config, wait_duration, kubecli: krkn_lib_kubernetes.KrknLibKubernetes): +def run(scenarios_list, config, wait_duration, kubecli: krkn_lib_kubernetes.KrknLibKubernetes, telemetry: KrknTelemetry) -> (list[str], list[ScenarioTelemetry]): failed_post_scenarios = [] + failed_scenarios = [] + scenario_telemetries: list[ScenarioTelemetry] = [] + for shut_down_config in scenarios_list: + scenario_telemetry = ScenarioTelemetry() + scenario_telemetry.scenario = shut_down_config + scenario_telemetry.startTimeStamp = time.time() + telemetry.set_parameters_base64(scenario_telemetry, shut_down_config[0]) if len(shut_down_config) > 1: pre_action_output = post_actions.run("", shut_down_config[1]) else: @@ -140,18 +149,32 @@ def run(scenarios_list, config, wait_duration, kubecli: krkn_lib_kubernetes.Krkn shut_down_config_scenario = \ shut_down_config_yaml["cluster_shut_down_scenario"] start_time = int(time.time()) - cluster_shut_down(shut_down_config_scenario, kubecli) - logging.info( - "Waiting for the specified duration: %s" % (wait_duration) - ) - time.sleep(wait_duration) - failed_post_scenarios = post_actions.check_recovery( - "", shut_down_config, failed_post_scenarios, pre_action_output - ) - end_time = int(time.time()) - cerberus.publish_kraken_status( - config, - failed_post_scenarios, - start_time, - end_time - ) + try: + cluster_shut_down(shut_down_config_scenario, kubecli) + logging.info( + "Waiting for the specified duration: %s" % (wait_duration) + ) + time.sleep(wait_duration) + failed_post_scenarios = post_actions.check_recovery( + "", 
shut_down_config, failed_post_scenarios, pre_action_output + ) + end_time = int(time.time()) + cerberus.publish_kraken_status( + config, + failed_post_scenarios, + start_time, + end_time + ) + + except (RuntimeError, Exception): + telemetry.log_exception(shut_down_config[0]) + failed_scenarios.append(shut_down_config[0]) + scenario_telemetry.exitStatus = 1 + else: + scenario_telemetry.exitStatus = 0 + + scenario_telemetry.endTimeStamp = time.time() + scenario_telemetries.append(scenario_telemetry) + + return failed_scenarios, scenario_telemetries + From afb678cf84b701fa5266182bd87bef0aad364d0a Mon Sep 17 00:00:00 2001 From: Tullio Sebastiani Date: Wed, 14 Jun 2023 16:36:59 +0200 Subject: [PATCH 09/23] namespace scenarios --- .../common_namespace_functions.py | 148 ++++++++++-------- 1 file changed, 86 insertions(+), 62 deletions(-) diff --git a/kraken/namespace_actions/common_namespace_functions.py b/kraken/namespace_actions/common_namespace_functions.py index 1ebbbe9b..7d7212d6 100644 --- a/kraken/namespace_actions/common_namespace_functions.py +++ b/kraken/namespace_actions/common_namespace_functions.py @@ -6,7 +6,7 @@ import kraken.post_actions.actions as post_actions import yaml import sys - +from krkn_lib_kubernetes import ScenarioTelemetry, KrknTelemetry # krkn_lib_kubernetes def run( @@ -15,72 +15,96 @@ def run( wait_duration, failed_post_scenarios, kubeconfig_path, - kubecli: krkn_lib_kubernetes.KrknLibKubernetes -): - + kubecli: krkn_lib_kubernetes.KrknLibKubernetes, + telemetry: KrknTelemetry +) -> (list[str], list[ScenarioTelemetry]): + scenario_telemetries: list[ScenarioTelemetry] = [] + failed_scenarios = [] for scenario_config in scenarios_list: - if len(scenario_config) > 1: - pre_action_output = post_actions.run(kubeconfig_path, scenario_config[1]) - else: - pre_action_output = "" - with open(scenario_config[0], "r") as f: - scenario_config_yaml = yaml.full_load(f) - for scenario in scenario_config_yaml["scenarios"]: - scenario_namespace = 
scenario.get("namespace", "") - scenario_label = scenario.get("label_selector", "") - if scenario_namespace is not None and scenario_namespace.strip() != "": - if scenario_label is not None and scenario_label.strip() != "": - logging.error("You can only have namespace or label set in your namespace scenario") - logging.error( - "Current scenario config has namespace '%s' and label selector '%s'" - % (scenario_namespace, scenario_label) - ) - logging.error( - "Please set either namespace to blank ('') or label_selector to blank ('') to continue" - ) - sys.exit(1) - delete_count = scenario.get("delete_count", 1) - run_count = scenario.get("runs", 1) - run_sleep = scenario.get("sleep", 10) - wait_time = scenario.get("wait_time", 30) - killed_namespaces = [] - start_time = int(time.time()) - for i in range(run_count): - namespaces = kubecli.check_namespaces([scenario_namespace], scenario_label) - for j in range(delete_count): - if len(namespaces) == 0: + scenario_telemetry = ScenarioTelemetry() + scenario_telemetry.scenario = scenario_config[0] + scenario_telemetry.startTimeStamp = time.time() + telemetry.set_parameters_base64(scenario_telemetry, scenario_config[0]) + try: + if len(scenario_config) > 1: + pre_action_output = post_actions.run(kubeconfig_path, scenario_config[1]) + else: + pre_action_output = "" + with open(scenario_config[0], "r") as f: + scenario_config_yaml = yaml.full_load(f) + for scenario in scenario_config_yaml["scenarios"]: + scenario_namespace = scenario.get("namespace", "") + scenario_label = scenario.get("label_selector", "") + if scenario_namespace is not None and scenario_namespace.strip() != "": + if scenario_label is not None and scenario_label.strip() != "": + logging.error("You can only have namespace or label set in your namespace scenario") logging.error( - "Couldn't delete %s namespaces, not enough namespaces matching %s with label %s" - % (str(run_count), scenario_namespace, str(scenario_label)) + "Current scenario config has 
namespace '%s' and label selector '%s'" + % (scenario_namespace, scenario_label) ) - sys.exit(1) - selected_namespace = namespaces[random.randint(0, len(namespaces) - 1)] - killed_namespaces.append(selected_namespace) - try: - kubecli.delete_namespace(selected_namespace) - logging.info("Delete on namespace %s was successful" % str(selected_namespace)) - except Exception as e: - logging.info("Delete on namespace %s was unsuccessful" % str(selected_namespace)) - logging.info("Namespace action error: " + str(e)) - sys.exit(1) - namespaces.remove(selected_namespace) - logging.info("Waiting %s seconds between namespace deletions" % str(run_sleep)) - time.sleep(run_sleep) - - logging.info("Waiting for the specified duration: %s" % wait_duration) - time.sleep(wait_duration) - if len(scenario_config) > 1: - try: - failed_post_scenarios = post_actions.check_recovery( - kubeconfig_path, scenario_config, failed_post_scenarios, pre_action_output + logging.error( + "Please set either namespace to blank ('') or label_selector to blank ('') to continue" + ) + # removed_exit + # sys.exit(1) + raise RuntimeError() + delete_count = scenario.get("delete_count", 1) + run_count = scenario.get("runs", 1) + run_sleep = scenario.get("sleep", 10) + wait_time = scenario.get("wait_time", 30) + killed_namespaces = [] + start_time = int(time.time()) + for i in range(run_count): + namespaces = kubecli.check_namespaces([scenario_namespace], scenario_label) + for j in range(delete_count): + if len(namespaces) == 0: + logging.error( + "Couldn't delete %s namespaces, not enough namespaces matching %s with label %s" + % (str(run_count), scenario_namespace, str(scenario_label)) ) + # removed_exit + # sys.exit(1) + raise RuntimeError() + selected_namespace = namespaces[random.randint(0, len(namespaces) - 1)] + killed_namespaces.append(selected_namespace) + try: + kubecli.delete_namespace(selected_namespace) + logging.info("Delete on namespace %s was successful" % str(selected_namespace)) except 
Exception as e: - logging.error("Failed to run post action checks: %s" % e) - sys.exit(1) - else: - failed_post_scenarios = check_active_namespace(killed_namespaces, wait_time, kubecli) - end_time = int(time.time()) - cerberus.publish_kraken_status(config, failed_post_scenarios, start_time, end_time) + logging.info("Delete on namespace %s was unsuccessful" % str(selected_namespace)) + logging.info("Namespace action error: " + str(e)) + # removed_exit + # sys.exit(1) + raise RuntimeError() + namespaces.remove(selected_namespace) + logging.info("Waiting %s seconds between namespace deletions" % str(run_sleep)) + time.sleep(run_sleep) + + logging.info("Waiting for the specified duration: %s" % wait_duration) + time.sleep(wait_duration) + if len(scenario_config) > 1: + try: + failed_post_scenarios = post_actions.check_recovery( + kubeconfig_path, scenario_config, failed_post_scenarios, pre_action_output + ) + except Exception as e: + logging.error("Failed to run post action checks: %s" % e) + # removed_exit + # sys.exit(1) + raise RuntimeError() + else: + failed_post_scenarios = check_active_namespace(killed_namespaces, wait_time, kubecli) + end_time = int(time.time()) + cerberus.publish_kraken_status(config, failed_post_scenarios, start_time, end_time) + except (Exception, RuntimeError): + scenario_telemetry.exitStatus = 1 + failed_scenarios.append(scenario_config[0]) + telemetry.log_exception(scenario_config[0]) + else: + scenario_telemetry.exitStatus = 0 + scenario_telemetry.endTimeStamp = time.time() + scenario_telemetries.append(scenario_telemetry) + return failed_scenarios, scenario_telemetries # krkn_lib_kubernetes def check_active_namespace(killed_namespaces, wait_time, kubecli: krkn_lib_kubernetes.KrknLibKubernetes): From 1de6de8c2f81768096512b6cf0babf9a6ce74333 Mon Sep 17 00:00:00 2001 From: Tullio Sebastiani Date: Wed, 14 Jun 2023 16:37:13 +0200 Subject: [PATCH 10/23] zone outage scenarios --- kraken/zone_outage/actions.py | 169 
+++++++++++++++++++--------------- 1 file changed, 94 insertions(+), 75 deletions(-) diff --git a/kraken/zone_outage/actions.py b/kraken/zone_outage/actions.py index 64f4c6e4..3c2e3e01 100644 --- a/kraken/zone_outage/actions.py +++ b/kraken/zone_outage/actions.py @@ -1,100 +1,119 @@ import yaml -import sys import logging import time from ..node_actions.aws_node_scenarios import AWS from ..cerberus import setup as cerberus +from krkn_lib_kubernetes import ScenarioTelemetry, KrknTelemetry - -def run(scenarios_list, config, wait_duration): +def run(scenarios_list, config, wait_duration, telemetry: KrknTelemetry) -> (list[str], list[ScenarioTelemetry]) : """ filters the subnet of interest and applies the network acl to create zone outage """ failed_post_scenarios = "" + scenario_telemetries: list[ScenarioTelemetry] = [] + failed_scenarios = [] + for zone_outage_config in scenarios_list: - if len(zone_outage_config) > 1: - with open(zone_outage_config, "r") as f: - zone_outage_config_yaml = yaml.full_load(f) - scenario_config = zone_outage_config_yaml["zone_outage"] - vpc_id = scenario_config["vpc_id"] - subnet_ids = scenario_config["subnet_id"] - duration = scenario_config["duration"] - cloud_type = scenario_config["cloud_type"] - ids = {} - acl_ids_created = [] + scenario_telemetry = ScenarioTelemetry() + scenario_telemetry.scenario = zone_outage_config[0] + scenario_telemetry.startTimeStamp = time.time() + telemetry.set_parameters_base64(scenario_telemetry, zone_outage_config[0]) + try: + if len(zone_outage_config) > 1: + with open(zone_outage_config, "r") as f: + zone_outage_config_yaml = yaml.full_load(f) + scenario_config = zone_outage_config_yaml["zone_outage"] + vpc_id = scenario_config["vpc_id"] + subnet_ids = scenario_config["subnet_id"] + duration = scenario_config["duration"] + cloud_type = scenario_config["cloud_type"] + ids = {} + acl_ids_created = [] - if cloud_type.lower() == "aws": - cloud_object = AWS() - else: - logging.error( - "Cloud type %s is not 
currently supported for " - "zone outage scenarios" - % cloud_type - ) - sys.exit(1) + if cloud_type.lower() == "aws": + cloud_object = AWS() + else: + logging.error( + "Cloud type %s is not currently supported for " + "zone outage scenarios" + % cloud_type + ) + # removed_exit + # sys.exit(1) + raise RuntimeError() - start_time = int(time.time()) + start_time = int(time.time()) - for subnet_id in subnet_ids: - logging.info("Targeting subnet_id") - network_association_ids = [] - associations, original_acl_id = \ - cloud_object.describe_network_acls(vpc_id, subnet_id) - for entry in associations: - if entry["SubnetId"] == subnet_id: - network_association_ids.append( - entry["NetworkAclAssociationId"] + for subnet_id in subnet_ids: + logging.info("Targeting subnet_id") + network_association_ids = [] + associations, original_acl_id = \ + cloud_object.describe_network_acls(vpc_id, subnet_id) + for entry in associations: + if entry["SubnetId"] == subnet_id: + network_association_ids.append( + entry["NetworkAclAssociationId"] + ) + logging.info( + "Network association ids associated with " + "the subnet %s: %s" + % (subnet_id, network_association_ids) + ) + acl_id = cloud_object.create_default_network_acl(vpc_id) + new_association_id = \ + cloud_object.replace_network_acl_association( + network_association_ids[0], acl_id ) + + # capture the orginal_acl_id, created_acl_id and + # new association_id to use during the recovery + ids[new_association_id] = original_acl_id + acl_ids_created.append(acl_id) + + # wait for the specified duration logging.info( - "Network association ids associated with " - "the subnet %s: %s" - % (subnet_id, network_association_ids) + "Waiting for the specified duration " + "in the config: %s" % (duration) ) - acl_id = cloud_object.create_default_network_acl(vpc_id) - new_association_id = \ + time.sleep(duration) + + # replace the applied acl with the previous acl in use + for new_association_id, original_acl_id in ids.items(): 
cloud_object.replace_network_acl_association( - network_association_ids[0], acl_id + new_association_id, + original_acl_id ) + logging.info( + "Wating for 60 seconds to make sure " + "the changes are in place" + ) + time.sleep(60) - # capture the orginal_acl_id, created_acl_id and - # new association_id to use during the recovery - ids[new_association_id] = original_acl_id - acl_ids_created.append(acl_id) - - # wait for the specified duration - logging.info( - "Waiting for the specified duration " - "in the config: %s" % (duration) - ) - time.sleep(duration) + # delete the network acl created for the run + for acl_id in acl_ids_created: + cloud_object.delete_network_acl(acl_id) - # replace the applied acl with the previous acl in use - for new_association_id, original_acl_id in ids.items(): - cloud_object.replace_network_acl_association( - new_association_id, - original_acl_id + logging.info( + "End of scenario. " + "Waiting for the specified duration: %s" % (wait_duration) ) - logging.info( - "Wating for 60 seconds to make sure " - "the changes are in place" - ) - time.sleep(60) - - # delete the network acl created for the run - for acl_id in acl_ids_created: - cloud_object.delete_network_acl(acl_id) + time.sleep(wait_duration) - logging.info( - "End of scenario. 
" - "Waiting for the specified duration: %s" % (wait_duration) - ) - time.sleep(wait_duration) + end_time = int(time.time()) + cerberus.publish_kraken_status( + config, + failed_post_scenarios, + start_time, + end_time + ) + except (RuntimeError, Exception): + scenario_telemetry.exitStatus = 1 + failed_scenarios.append(zone_outage_config[0]) + telemetry.log_exception(zone_outage_config[0]) + else: + scenario_telemetry.exitStatus = 0 + scenario_telemetry.endTimeStamp = time.time() + scenario_telemetries.append(scenario_telemetry) + return failed_scenarios, scenario_telemetries - end_time = int(time.time()) - cerberus.publish_kraken_status( - config, - failed_post_scenarios, - start_time, - end_time - ) From 6ea49f14ce7d3ad0d92252862c182cba5a3fd83d Mon Sep 17 00:00:00 2001 From: Tullio Sebastiani Date: Tue, 27 Jun 2023 11:08:57 +0200 Subject: [PATCH 11/23] app outage scenarios --- kraken/application_outage/actions.py | 89 +++++++++++++++++----------- 1 file changed, 53 insertions(+), 36 deletions(-) diff --git a/kraken/application_outage/actions.py b/kraken/application_outage/actions.py index 6c82f8d8..f8d32085 100644 --- a/kraken/application_outage/actions.py +++ b/kraken/application_outage/actions.py @@ -4,25 +4,32 @@ import kraken.cerberus.setup as cerberus from jinja2 import Template import kraken.invoke.command as runcommand - +from krkn_lib_kubernetes import ScenarioTelemetry, KrknTelemetry # Reads the scenario config, applies and deletes a network policy to # block the traffic for the specified duration -def run(scenarios_list, config, wait_duration): +def run(scenarios_list, config, wait_duration, telemetry: KrknTelemetry) -> (list[str], list[ScenarioTelemetry]): failed_post_scenarios = "" + scenario_telemetries: list[ScenarioTelemetry] = [] + failed_scenarios = [] for app_outage_config in scenarios_list: + scenario_telemetry = ScenarioTelemetry() + scenario_telemetry.scenario = app_outage_config[0] + scenario_telemetry.startTimeStamp = time.time() + 
telemetry.set_parameters_base64(scenario_telemetry, app_outage_config[0]) if len(app_outage_config) > 1: - with open(app_outage_config, "r") as f: - app_outage_config_yaml = yaml.full_load(f) - scenario_config = app_outage_config_yaml["application_outage"] - pod_selector = scenario_config.get("pod_selector", "{}") - traffic_type = scenario_config.get("block", "[Ingress, Egress]") - namespace = scenario_config.get("namespace", "") - duration = scenario_config.get("duration", 60) + try: + with open(app_outage_config, "r") as f: + app_outage_config_yaml = yaml.full_load(f) + scenario_config = app_outage_config_yaml["application_outage"] + pod_selector = scenario_config.get("pod_selector", "{}") + traffic_type = scenario_config.get("block", "[Ingress, Egress]") + namespace = scenario_config.get("namespace", "") + duration = scenario_config.get("duration", 60) - start_time = int(time.time()) + start_time = int(time.time()) - network_policy_template = """--- + network_policy_template = """--- apiVersion: networking.k8s.io/v1 kind: NetworkPolicy metadata: @@ -31,28 +38,38 @@ def run(scenarios_list, config, wait_duration): podSelector: matchLabels: {{ pod_selector }} policyTypes: {{ traffic_type }} - """ - t = Template(network_policy_template) - rendered_spec = t.render(pod_selector=pod_selector, traffic_type=traffic_type) - # Write the rendered template to a file - with open("kraken_network_policy.yaml", "w") as f: - f.write(rendered_spec) - # Block the traffic by creating network policy - logging.info("Creating the network policy") - runcommand.invoke( - "kubectl create -f %s -n %s --validate=false" % ("kraken_network_policy.yaml", namespace) - ) - - # wait for the specified duration - logging.info("Waiting for the specified duration in the config: %s" % (duration)) - time.sleep(duration) - - # unblock the traffic by deleting the network policy - logging.info("Deleting the network policy") - runcommand.invoke("kubectl delete -f %s -n %s" % ("kraken_network_policy.yaml", 
namespace)) - - logging.info("End of scenario. Waiting for the specified duration: %s" % (wait_duration)) - time.sleep(wait_duration) - - end_time = int(time.time()) - cerberus.publish_kraken_status(config, failed_post_scenarios, start_time, end_time) + """ + t = Template(network_policy_template) + rendered_spec = t.render(pod_selector=pod_selector, traffic_type=traffic_type) + # Write the rendered template to a file + with open("kraken_network_policy.yaml", "w") as f: + f.write(rendered_spec) + # Block the traffic by creating network policy + logging.info("Creating the network policy") + runcommand.invoke( + "kubectl create -f %s -n %s --validate=false" % ("kraken_network_policy.yaml", namespace) + ) + + # wait for the specified duration + logging.info("Waiting for the specified duration in the config: %s" % (duration)) + time.sleep(duration) + + # unblock the traffic by deleting the network policy + logging.info("Deleting the network policy") + runcommand.invoke("kubectl delete -f %s -n %s" % ("kraken_network_policy.yaml", namespace)) + + logging.info("End of scenario. 
Waiting for the specified duration: %s" % (wait_duration)) + time.sleep(wait_duration) + + end_time = int(time.time()) + cerberus.publish_kraken_status(config, failed_post_scenarios, start_time, end_time) + except Exception as e : + scenario_telemetry.exitStatus = 1 + failed_scenarios.append(app_outage_config[0]) + telemetry.log_exception(app_outage_config[0]) + else: + scenario_telemetry.exitStatus = 0 + scenario_telemetry.endTimeStamp = time.time() + scenario_telemetries.append(scenario_telemetry) + return failed_scenarios, scenario_telemetries + From 6c7f168a81b81ec19ef5d24a665954bc8a60ec3e Mon Sep 17 00:00:00 2001 From: Tullio Sebastiani Date: Mon, 10 Jul 2023 10:15:46 +0200 Subject: [PATCH 12/23] pvc scenarios --- kraken/pvc/pvc_scenario.py | 357 ++++++++++++++++++++++--------------- 1 file changed, 214 insertions(+), 143 deletions(-) diff --git a/kraken/pvc/pvc_scenario.py b/kraken/pvc/pvc_scenario.py index 95119716..aa9e297a 100644 --- a/kraken/pvc/pvc_scenario.py +++ b/kraken/pvc/pvc_scenario.py @@ -7,150 +7,233 @@ import yaml from ..cerberus import setup as cerberus - +from krkn_lib_kubernetes import ScenarioTelemetry, KrknTelemetry # krkn_lib_kubernetes -def run(scenarios_list, config, kubecli: krkn_lib_kubernetes.KrknLibKubernetes): +def run(scenarios_list, config, kubecli: krkn_lib_kubernetes.KrknLibKubernetes, telemetry: KrknTelemetry) -> (list[str], list[ScenarioTelemetry]): """ Reads the scenario config and creates a temp file to fill up the PVC """ failed_post_scenarios = "" + scenario_telemetries: list[ScenarioTelemetry] = [] + failed_scenarios = [] for app_config in scenarios_list: - if len(app_config) > 1: - with open(app_config, "r") as f: - config_yaml = yaml.full_load(f) - scenario_config = config_yaml["pvc_scenario"] - pvc_name = scenario_config.get("pvc_name", "") - pod_name = scenario_config.get("pod_name", "") - namespace = scenario_config.get("namespace", "") - target_fill_percentage = scenario_config.get( - "fill_percentage", "50" - ) - 
duration = scenario_config.get("duration", 60) - - logging.info( - "Input params:\n" - "pvc_name: '%s'\n" - "pod_name: '%s'\n" - "namespace: '%s'\n" - "target_fill_percentage: '%s%%'\nduration: '%ss'" - % ( - str(pvc_name), - str(pod_name), - str(namespace), - str(target_fill_percentage), - str(duration) + scenario_telemetry = ScenarioTelemetry() + scenario_telemetry.scenario = app_config[0] + scenario_telemetry.startTimeStamp = time.time() + telemetry.set_parameters_base64(scenario_telemetry, app_config[0]) + try: + if len(app_config) > 1: + with open(app_config, "r") as f: + config_yaml = yaml.full_load(f) + scenario_config = config_yaml["pvc_scenario"] + pvc_name = scenario_config.get("pvc_name", "") + pod_name = scenario_config.get("pod_name", "") + namespace = scenario_config.get("namespace", "") + target_fill_percentage = scenario_config.get( + "fill_percentage", "50" ) - ) + duration = scenario_config.get("duration", 60) - # Check input params - if namespace is None: - logging.error( - "You must specify the namespace where the PVC is" - ) - sys.exit(1) - if pvc_name is None and pod_name is None: - logging.error( - "You must specify the pvc_name or the pod_name" - ) - sys.exit(1) - if pvc_name and pod_name: logging.info( - "pod_name will be ignored, pod_name used will be " - "a retrieved from the pod used in the pvc_name" + "Input params:\n" + "pvc_name: '%s'\n" + "pod_name: '%s'\n" + "namespace: '%s'\n" + "target_fill_percentage: '%s%%'\nduration: '%ss'" + % ( + str(pvc_name), + str(pod_name), + str(namespace), + str(target_fill_percentage), + str(duration) + ) ) - # Get pod name - if pvc_name: - if pod_name: + # Check input params + if namespace is None: + logging.error( + "You must specify the namespace where the PVC is" + ) + #sys.exit(1) + raise RuntimeError() + if pvc_name is None and pod_name is None: + logging.error( + "You must specify the pvc_name or the pod_name" + ) + # sys.exit(1) + raise RuntimeError() + if pvc_name and pod_name: logging.info( - 
"pod_name '%s' will be overridden with one of " - "the pods mounted in the PVC" % (str(pod_name)) + "pod_name will be ignored, pod_name used will be " + "a retrieved from the pod used in the pvc_name" + ) + + # Get pod name + if pvc_name: + if pod_name: + logging.info( + "pod_name '%s' will be overridden with one of " + "the pods mounted in the PVC" % (str(pod_name)) + ) + pvc = kubecli.get_pvc_info(pvc_name, namespace) + try: + # random generator not used for + # security/cryptographic purposes. + pod_name = random.choice(pvc.podNames) # nosec + logging.info("Pod name: %s" % pod_name) + except Exception: + logging.error( + "Pod associated with %s PVC, on namespace %s, " + "not found" % (str(pvc_name), str(namespace)) + ) + # sys.exit(1) + raise RuntimeError() + + # Get volume name + pod = kubecli.get_pod_info(name=pod_name, namespace=namespace) + + if pod is None: + logging.error( + "Exiting as pod '%s' doesn't exist " + "in namespace '%s'" % ( + str(pod_name), + str(namespace) + ) ) - pvc = kubecli.get_pvc_info(pvc_name, namespace) - try: - # random generator not used for - # security/cryptographic purposes. 
- pod_name = random.choice(pvc.podNames) # nosec - logging.info("Pod name: %s" % pod_name) - except Exception: + # sys.exit(1) + raise RuntimeError() + + for volume in pod.volumes: + if volume.pvcName is not None: + volume_name = volume.name + pvc_name = volume.pvcName + pvc = kubecli.get_pvc_info(pvc_name, namespace) + break + if 'pvc' not in locals(): logging.error( - "Pod associated with %s PVC, on namespace %s, " - "not found" % (str(pvc_name), str(namespace)) + "Pod '%s' in namespace '%s' does not use a pvc" % ( + str(pod_name), + str(namespace) + ) ) - sys.exit(1) + # sys.exit(1) + raise RuntimeError() + logging.info("Volume name: %s" % volume_name) + logging.info("PVC name: %s" % pvc_name) - # Get volume name - pod = kubecli.get_pod_info(name=pod_name, namespace=namespace) + # Get container name and mount path + for container in pod.containers: + for vol in container.volumeMounts: + if vol.name == volume_name: + mount_path = vol.mountPath + container_name = container.name + break + logging.info("Container path: %s" % container_name) + logging.info("Mount path: %s" % mount_path) - if pod is None: - logging.error( - "Exiting as pod '%s' doesn't exist " - "in namespace '%s'" % ( - str(pod_name), - str(namespace) + # Get PVC capacity and used bytes + command = "df %s -B 1024 | sed 1d" % (str(mount_path)) + command_output = ( + kubecli.exec_cmd_in_pod( + command, + pod_name, + namespace, + container_name, + "sh" ) + ).split() + pvc_used_kb = int(command_output[2]) + pvc_capacity_kb = pvc_used_kb + int(command_output[3]) + logging.info("PVC used: %s KB" % pvc_used_kb) + logging.info("PVC capacity: %s KB" % pvc_capacity_kb) + + # Check valid fill percentage + current_fill_percentage = pvc_used_kb / pvc_capacity_kb + if not ( + current_fill_percentage * 100 + < float(target_fill_percentage) + <= 99 + ): + logging.error( + "Target fill percentage (%.2f%%) is lower than " + "current fill percentage (%.2f%%) " + "or higher than 99%%" % ( + target_fill_percentage, + 
current_fill_percentage * 100 + ) + ) + # sys.exit(1) + raise RuntimeError() + + # Calculate file size + file_size_kb = int( + ( + float( + target_fill_percentage / 100 + ) * float(pvc_capacity_kb) + ) - float(pvc_used_kb) ) - sys.exit(1) + logging.debug("File size: %s KB" % file_size_kb) - for volume in pod.volumes: - if volume.pvcName is not None: - volume_name = volume.name - pvc_name = volume.pvcName - pvc = kubecli.get_pvc_info(pvc_name, namespace) - break - if 'pvc' not in locals(): - logging.error( - "Pod '%s' in namespace '%s' does not use a pvc" % ( + file_name = "kraken.tmp" + logging.info( + "Creating %s file, %s KB size, in pod %s at %s (ns %s)" + % ( + str(file_name), + str(file_size_kb), str(pod_name), + str(mount_path), str(namespace) ) ) - sys.exit(1) - logging.info("Volume name: %s" % volume_name) - logging.info("PVC name: %s" % pvc_name) - # Get container name and mount path - for container in pod.containers: - for vol in container.volumeMounts: - if vol.name == volume_name: - mount_path = vol.mountPath - container_name = container.name - break - logging.info("Container path: %s" % container_name) - logging.info("Mount path: %s" % mount_path) - - # Get PVC capacity and used bytes - command = "df %s -B 1024 | sed 1d" % (str(mount_path)) - command_output = ( + start_time = int(time.time()) + # Create temp file in the PVC + full_path = "%s/%s" % (str(mount_path), str(file_name)) + command = "fallocate -l $((%s*1024)) %s" % ( + str(file_size_kb), + str(full_path) + ) + logging.debug( + "Create temp file in the PVC command:\n %s" % command + ) kubecli.exec_cmd_in_pod( command, pod_name, namespace, container_name, ) - ).split() - pvc_used_kb = int(command_output[2]) - pvc_capacity_kb = pvc_used_kb + int(command_output[3]) - logging.info("PVC used: %s KB" % pvc_used_kb) - logging.info("PVC capacity: %s KB" % pvc_capacity_kb) - # Check valid fill percentage - current_fill_percentage = pvc_used_kb / pvc_capacity_kb - if not ( - current_fill_percentage * 
100 - < float(target_fill_percentage) - <= 99 - ): - logging.error( - "Target fill percentage (%.2f%%) is lower than " - "current fill percentage (%.2f%%) " - "or higher than 99%%" % ( - target_fill_percentage, - current_fill_percentage * 100 - ) + # Check if file is created + command = "ls -lh %s" % (str(mount_path)) + logging.debug("Check file is created command:\n %s" % command) + response = kubecli.exec_cmd_in_pod( + command, pod_name, namespace, container_name, "sh" ) - sys.exit(1) + logging.info("\n" + str(response)) + if str(file_name).lower() in str(response).lower(): + logging.info( + "%s file successfully created" % (str(full_path)) + ) + else: + logging.error( + "Failed to create tmp file with %s size" % ( + str(file_size_kb) + ) + ) + remove_temp_file( + file_name, + full_path, + pod_name, + namespace, + container_name, + mount_path, + file_size_kb, + kubecli + ) + # sys.exit(1) + raise RuntimeError() # Calculate file size file_size_kb = int( @@ -197,14 +280,13 @@ def run(scenarios_list, config, kubecli: krkn_lib_kubernetes.KrknLibKubernetes): logging.info("\n" + str(response)) if str(file_name).lower() in str(response).lower(): logging.info( - "%s file successfully created" % (str(full_path)) - ) - else: - logging.error( - "Failed to create tmp file with %s size" % ( - str(file_size_kb) + "Waiting for the specified duration in the config: %ss" % ( + duration ) ) + time.sleep(duration) + logging.info("Finish waiting") + remove_temp_file( file_name, full_path, @@ -215,35 +297,24 @@ def run(scenarios_list, config, kubecli: krkn_lib_kubernetes.KrknLibKubernetes): file_size_kb, kubecli ) - sys.exit(1) - # Wait for the specified duration - logging.info( - "Waiting for the specified duration in the config: %ss" % ( - duration + end_time = int(time.time()) + cerberus.publish_kraken_status( + config, + failed_post_scenarios, + start_time, + end_time ) - ) - time.sleep(duration) - logging.info("Finish waiting") + except (RuntimeError, Exception): + 
scenario_telemetry.exitStatus = 1 + failed_scenarios.append(app_config[0]) + telemetry.log_exception(app_config[0]) + else: + scenario_telemetry.exitStatus = 0 + + return failed_scenarios, scenario_telemetries - remove_temp_file( - file_name, - full_path, - pod_name, - namespace, - container_name, - mount_path, - file_size_kb, - kubecli - ) - end_time = int(time.time()) - cerberus.publish_kraken_status( - config, - failed_post_scenarios, - start_time, - end_time - ) # krkn_lib_kubernetes @@ -275,7 +346,7 @@ def remove_temp_file( logging.error( "Failed to delete tmp file with %s size" % (str(file_size_kb)) ) - sys.exit(1) + raise RuntimeError() def toKbytes(value): @@ -284,7 +355,7 @@ def toKbytes(value): "PVC capacity %s does not match expression " "regexp '^[0-9]+[K|M|G|T]i$'" ) - sys.exit(1) + raise RuntimeError() unit = {"K": 0, "M": 1, "G": 2, "T": 3} base = 1024 if ("i" in value) else 1000 exp = unit[value[-2:-1]] From 51e3dbebdd4fca2a4120a37c9b73191143c4ae6e Mon Sep 17 00:00:00 2001 From: Tullio Sebastiani Date: Mon, 10 Jul 2023 10:24:35 +0200 Subject: [PATCH 13/23] network chaos scenarios --- kraken/network_chaos/actions.py | 158 ++++++++++++++++++-------------- 1 file changed, 87 insertions(+), 71 deletions(-) diff --git a/kraken/network_chaos/actions.py b/kraken/network_chaos/actions.py index b06ef764..9909e1bb 100644 --- a/kraken/network_chaos/actions.py +++ b/kraken/network_chaos/actions.py @@ -8,88 +8,103 @@ from jinja2 import Environment, FileSystemLoader import kraken.cerberus.setup as cerberus import kraken.node_actions.common_node_functions as common_node_functions - +from krkn_lib_kubernetes import ScenarioTelemetry, KrknTelemetry # krkn_lib_kubernetes # Reads the scenario config and introduces traffic variations in Node's host network interface. 
-def run(scenarios_list, config, wait_duration, kubecli: krkn_lib_kubernetes.KrknLibKubernetes): +def run(scenarios_list, config, wait_duration, kubecli: krkn_lib_kubernetes.KrknLibKubernetes, telemetry: KrknTelemetry) -> (list[str], list[ScenarioTelemetry]): failed_post_scenarios = "" logging.info("Runing the Network Chaos tests") + failed_post_scenarios = "" + scenario_telemetries: list[ScenarioTelemetry] = [] + failed_scenarios = [] for net_config in scenarios_list: - with open(net_config, "r") as file: - param_lst = ["latency", "loss", "bandwidth"] - test_config = yaml.safe_load(file) - test_dict = test_config["network_chaos"] - test_duration = int(test_dict.get("duration", 300)) - test_interface = test_dict.get("interfaces", []) - test_node = test_dict.get("node_name", "") - test_node_label = test_dict.get("label_selector", "node-role.kubernetes.io/master") - test_execution = test_dict.get("execution", "serial") - test_instance_count = test_dict.get("instance_count", 1) - test_egress = test_dict.get("egress", {"bandwidth": "100mbit"}) - if test_node: - node_name_list = test_node.split(",") - else: - node_name_list = [test_node] - nodelst = [] - for single_node_name in node_name_list: - nodelst.extend(common_node_functions.get_node(single_node_name, test_node_label, test_instance_count, kubecli)) - file_loader = FileSystemLoader(os.path.abspath(os.path.dirname(__file__))) - env = Environment(loader=file_loader, autoescape=True) - pod_template = env.get_template("pod.j2") - test_interface = verify_interface(test_interface, nodelst, pod_template, kubecli) - joblst = [] - egress_lst = [i for i in param_lst if i in test_egress] - chaos_config = { - "network_chaos": { - "duration": test_duration, - "interfaces": test_interface, - "node_name": ",".join(nodelst), - "execution": test_execution, - "instance_count": test_instance_count, - "egress": test_egress, + scenario_telemetry = ScenarioTelemetry() + scenario_telemetry.scenario = net_config + 
scenario_telemetry.startTimeStamp = time.time() + telemetry.set_parameters_base64(scenario_telemetry, net_config) + try: + with open(net_config, "r") as file: + param_lst = ["latency", "loss", "bandwidth"] + test_config = yaml.safe_load(file) + test_dict = test_config["network_chaos"] + test_duration = int(test_dict.get("duration", 300)) + test_interface = test_dict.get("interfaces", []) + test_node = test_dict.get("node_name", "") + test_node_label = test_dict.get("label_selector", "node-role.kubernetes.io/master") + test_execution = test_dict.get("execution", "serial") + test_instance_count = test_dict.get("instance_count", 1) + test_egress = test_dict.get("egress", {"bandwidth": "100mbit"}) + if test_node: + node_name_list = test_node.split(",") + else: + node_name_list = [test_node] + nodelst = [] + for single_node_name in node_name_list: + nodelst.extend(common_node_functions.get_node(single_node_name, test_node_label, test_instance_count, kubecli)) + file_loader = FileSystemLoader(os.path.abspath(os.path.dirname(__file__))) + env = Environment(loader=file_loader, autoescape=True) + pod_template = env.get_template("pod.j2") + test_interface = verify_interface(test_interface, nodelst, pod_template, kubecli) + joblst = [] + egress_lst = [i for i in param_lst if i in test_egress] + chaos_config = { + "network_chaos": { + "duration": test_duration, + "interfaces": test_interface, + "node_name": ",".join(nodelst), + "execution": test_execution, + "instance_count": test_instance_count, + "egress": test_egress, + } } - } - logging.info("Executing network chaos with config \n %s" % yaml.dump(chaos_config)) - job_template = env.get_template("job.j2") - try: - for i in egress_lst: - for node in nodelst: - exec_cmd = get_egress_cmd( - test_execution, test_interface, i, test_dict["egress"], duration=test_duration - ) - logging.info("Executing %s on node %s" % (exec_cmd, node)) - job_body = yaml.safe_load( - job_template.render(jobname=i + str(hash(node))[:5], 
nodename=node, cmd=exec_cmd) - ) - joblst.append(job_body["metadata"]["name"]) - api_response = kubecli.create_job(job_body) - if api_response is None: - raise Exception("Error creating job") - if test_execution == "serial": - logging.info("Waiting for serial job to finish") + logging.info("Executing network chaos with config \n %s" % yaml.dump(chaos_config)) + job_template = env.get_template("job.j2") + try: + for i in egress_lst: + for node in nodelst: + exec_cmd = get_egress_cmd( + test_execution, test_interface, i, test_dict["egress"], duration=test_duration + ) + logging.info("Executing %s on node %s" % (exec_cmd, node)) + job_body = yaml.safe_load( + job_template.render(jobname=i + str(hash(node))[:5], nodename=node, cmd=exec_cmd) + ) + joblst.append(job_body["metadata"]["name"]) + api_response = kubecli.create_job(job_body) + if api_response is None: + raise Exception("Error creating job") + if test_execution == "serial": + logging.info("Waiting for serial job to finish") + start_time = int(time.time()) + wait_for_job(joblst[:], kubecli, test_duration + 300) + logging.info("Waiting for wait_duration %s" % wait_duration) + time.sleep(wait_duration) + end_time = int(time.time()) + cerberus.publish_kraken_status(config, failed_post_scenarios, start_time, end_time) + if test_execution == "parallel": + break + if test_execution == "parallel": + logging.info("Waiting for parallel job to finish") start_time = int(time.time()) wait_for_job(joblst[:], kubecli, test_duration + 300) logging.info("Waiting for wait_duration %s" % wait_duration) time.sleep(wait_duration) end_time = int(time.time()) cerberus.publish_kraken_status(config, failed_post_scenarios, start_time, end_time) - if test_execution == "parallel": - break - if test_execution == "parallel": - logging.info("Waiting for parallel job to finish") - start_time = int(time.time()) - wait_for_job(joblst[:], kubecli, test_duration + 300) - logging.info("Waiting for wait_duration %s" % wait_duration) - 
time.sleep(wait_duration) - end_time = int(time.time()) - cerberus.publish_kraken_status(config, failed_post_scenarios, start_time, end_time) - except Exception as e: - logging.error("Network Chaos exiting due to Exception %s" % e) - sys.exit(1) - finally: - logging.info("Deleting jobs") - delete_job(joblst[:], kubecli) + except Exception as e: + logging.error("Network Chaos exiting due to Exception %s" % e) + raise RuntimeError() + finally: + logging.info("Deleting jobs") + delete_job(joblst[:], kubecli) + except (RuntimeError, Exception): + scenario_telemetry.exitStatus = 1 + failed_scenarios.append(net_config) + telemetry.log_exception(net_config) + else: + scenario_telemetry.exitStatus = 0 + return failed_scenarios, scenario_telemetries # krkn_lib_kubernetes @@ -110,7 +125,8 @@ def verify_interface(test_interface, nodelst, template, kubecli: krkn_lib_kubern for interface in test_interface: if interface not in interface_lst: logging.error("Interface %s not found in node %s interface list %s" % (interface, nodelst[pod_index], interface_lst)) - sys.exit(1) + #sys.exit(1) + raise RuntimeError() return test_interface finally: logging.info("Deleteing pod to query interface on node") @@ -158,7 +174,7 @@ def delete_job(joblst, kubecli: krkn_lib_kubernetes.KrknLibKubernetes): logging.error(pod_log) except Exception: logging.warning("Exception in getting job status") - api_response = kubecli.delete_job(name=jobname, namespace="default") + kubecli.delete_job(name=jobname, namespace="default") def get_egress_cmd(execution, test_interface, mod, vallst, duration=30): From ae2edb32cd87546eef7c42ed95cb3ca2c0cd7f39 Mon Sep 17 00:00:00 2001 From: Tullio Sebastiani Date: Wed, 19 Jul 2023 11:29:28 +0200 Subject: [PATCH 14/23] run_kraken.py adaptation to telemetry --- run_kraken.py | 44 +++++++++++++++++++++++++++----------------- 1 file changed, 27 insertions(+), 17 deletions(-) diff --git a/run_kraken.py b/run_kraken.py index 2a5218ab..39ef3806 100644 --- a/run_kraken.py +++ 
b/run_kraken.py @@ -209,31 +209,35 @@ def main(cfg): chaos_telemetry.scenarios.extend(scenario_telemetries) elif scenario_type == "plugin_scenarios": - failed_post_scenarios = plugins.run( + failed_post_scenarios, scenario_telemetries = plugins.run( scenarios_list, kubeconfig_path, kraken_config, failed_post_scenarios, wait_duration, + telemetry ) + chaos_telemetry.scenarios.extend(scenario_telemetries) # krkn_lib_kubernetes elif scenario_type == "container_scenarios": logging.info("Running container scenarios") - failed_post_scenarios = pod_scenarios.container_run( + failed_post_scenarios, scenario_telemetries = pod_scenarios.container_run( kubeconfig_path, scenarios_list, config, failed_post_scenarios, wait_duration, - kubecli + kubecli, + telemetry ) + chaos_telemetry.scenarios.extend(scenario_telemetries) # Inject node chaos scenarios specified in the config # krkn_lib_kubernetes elif scenario_type == "node_scenarios": logging.info("Running node scenarios") - nodeaction.run(scenarios_list, config, wait_duration, kubecli) - + failed_post_scenarios, scenario_telemetries = nodeaction.run(scenarios_list, config, wait_duration, kubecli, telemetry) + chaos_telemetry.scenarios.extend(scenario_telemetries) # Inject managedcluster chaos scenarios specified in the config # krkn_lib_kubernetes elif scenario_type == "managedcluster_scenarios": @@ -248,7 +252,8 @@ def main(cfg): elif scenario_type == "time_scenarios": if distribution == "openshift": logging.info("Running time skew scenarios") - time_actions.run(scenarios_list, config, wait_duration, kubecli) + failed_post_scenarios, scenario_telemetries = time_actions.run(scenarios_list, config, wait_duration, kubecli, telemetry) + chaos_telemetry.scenarios.extend(scenario_telemetries) else: logging.error( "Litmus scenarios are currently " @@ -298,44 +303,48 @@ def main(cfg): # Inject cluster shutdown scenarios # krkn_lib_kubernetes elif scenario_type == "cluster_shut_down_scenarios": - shut_down.run(scenarios_list, config, 
wait_duration, kubecli) + failed_post_scenarios, scenario_telemetries = shut_down.run(scenarios_list, config, wait_duration, kubecli, telemetry) + chaos_telemetry.scenarios.extend(scenario_telemetries) # Inject namespace chaos scenarios # krkn_lib_kubernetes elif scenario_type == "namespace_scenarios": logging.info("Running namespace scenarios") - namespace_actions.run( + failed_post_scenarios, scenario_telemetries = namespace_actions.run( scenarios_list, config, wait_duration, failed_post_scenarios, kubeconfig_path, - kubecli + kubecli, + telemetry ) + chaos_telemetry.scenarios.extend(scenario_telemetries) # Inject zone failures elif scenario_type == "zone_outages": logging.info("Inject zone outages") - zone_outages.run(scenarios_list, config, wait_duration) - + failed_post_scenarios, scenario_telemetries = zone_outages.run(scenarios_list, config, wait_duration, telemetry) + chaos_telemetry.scenarios.extend(scenario_telemetries) # Application outages elif scenario_type == "application_outages": logging.info("Injecting application outage") - application_outage.run( - scenarios_list, config, wait_duration - ) + failed_post_scenarios, scenario_telemetries = application_outage.run( + scenarios_list, config, wait_duration, telemetry) + chaos_telemetry.scenarios.extend(scenario_telemetries) # PVC scenarios # krkn_lib_kubernetes elif scenario_type == "pvc_scenarios": logging.info("Running PVC scenario") - pvc_scenario.run(scenarios_list, config, kubecli) + failed_post_scenarios, scenario_telemetries = pvc_scenario.run(scenarios_list, config, kubecli, telemetry) + chaos_telemetry.scenarios.extend(scenario_telemetries) # Network scenarios # krkn_lib_kubernetes elif scenario_type == "network_chaos": logging.info("Running Network Chaos") - network_chaos.run(scenarios_list, config, wait_duration, kubecli) + failed_post_scenarios, scenario_telemetries = network_chaos.run(scenarios_list, config, wait_duration, kubecli, telemetry) # Check for critical alerts when enabled if 
check_critical_alerts: @@ -354,7 +363,8 @@ def main(cfg): iteration += 1 logging.info("") # send telemetry - telemetry.send_telemetry(config["telemetry"], chaos_telemetry) + + telemetry.send_telemetry(config["telemetry"],str(uuid.uuid1()), chaos_telemetry, kubecli) # Capture the end time end_time = int(time.time()) From ea175fe817e9ffb70655e7d0dec9cd5f9372f878 Mon Sep 17 00:00:00 2001 From: Tullio Sebastiani Date: Wed, 2 Aug 2023 12:22:56 +0200 Subject: [PATCH 15/23] prometheus telemetry upload + config.yaml some fixes typos and logs max retries in config telemetry id with run_uuid safe_logger --- config/config.yaml | 7 ++++++- run_kraken.py | 43 ++++++++++++++++++++++++++++++++----------- 2 files changed, 38 insertions(+), 12 deletions(-) diff --git a/config/config.yaml b/config/config.yaml index 6680e137..0317a72f 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -68,4 +68,9 @@ telemetry: enabled: True api_url: https://ulnmf9xv7j.execute-api.us-west-2.amazonaws.com/production username: username - password: password \ No newline at end of file + password: password + prometheus_backup: True + full_prometheus_backup: False + backup_threads: 5 + archive_path: /tmp + max_retries: 0 \ No newline at end of file diff --git a/run_kraken.py b/run_kraken.py index 39ef3806..9c6ee4c1 100644 --- a/run_kraken.py +++ b/run_kraken.py @@ -25,7 +25,7 @@ import server as server import kraken.prometheus.client as promcli from kraken import plugins -from krkn_lib_kubernetes import KrknLibKubernetes, KrknTelemetry, ChaosRunTelemetry +from krkn_lib_kubernetes import KrknLibKubernetes, KrknTelemetry, ChaosRunTelemetry, SafeLogger KUBE_BURNER_URL = ( "https://github.com/cloud-bulldozer/kube-burner/" @@ -98,13 +98,32 @@ def main(cfg): ) sys.exit(1) logging.info("Initializing client to talk to the Kubernetes cluster") + + # Generate uuid for the run + if run_uuid: + logging.info( + "Using the uuid defined by the user for the run: %s" % run_uuid + ) + else: + run_uuid = 
str(uuid.uuid4()) + logging.info("Generated a uuid for the run: %s" % run_uuid) + + # request_id for telemetry is generated once here and used everywhere + telemetry_request_id = f"{int(time.time())}-{run_uuid}" + telemetry_log_file = f'{config["telemetry"]["archive_path"]}/{telemetry_request_id}.log' + safe_logger = SafeLogger(filename=telemetry_log_file) + try: kubeconfig_path os.environ["KUBECONFIG"] = str(kubeconfig_path) + # krkn-lib-kubernetes init kubecli = KrknLibKubernetes(kubeconfig_path=kubeconfig_path) except: kubecli.initialize_clients(None) + # KrknTelemetry init + telemetry = KrknTelemetry(safe_logger, kubecli) + # find node kraken might be running on kubecli.find_kraken_node() @@ -141,14 +160,7 @@ def main(cfg): if deploy_performance_dashboards: performance_dashboards.setup(dashboard_repo, distribution) - # Generate uuid for the run - if run_uuid: - logging.info( - "Using the uuid defined by the user for the run: %s" % run_uuid - ) - else: - run_uuid = str(uuid.uuid4()) - logging.info("Generated a uuid for the run: %s" % run_uuid) + # Initialize the start iteration to 0 iteration = 0 @@ -172,6 +184,7 @@ def main(cfg): start_time = int(time.time()) litmus_installed = False chaos_telemetry = ChaosRunTelemetry() + chaos_telemetry.run_uuid = run_uuid # Loop to run the chaos starts here while int(iteration) < iterations and run_signal != "STOP": # Inject chaos scenarios specified in the config @@ -362,9 +375,17 @@ def main(cfg): iteration += 1 logging.info("") - # send telemetry - telemetry.send_telemetry(config["telemetry"],str(uuid.uuid1()), chaos_telemetry, kubecli) + # telemetry + logging.info(f"telemetry data will be stored on s3 bucket folder: {telemetry_request_id}") + logging.info(f"telemetry upload log: {safe_logger.log_file_name}") + + telemetry.send_telemetry(config["telemetry"], telemetry_request_id, chaos_telemetry) + safe_logger.info("archives download started:") + prometheus_archive_files = 
telemetry.get_ocp_prometheus_data(config["telemetry"], telemetry_request_id) + safe_logger.info("archives upload started:") + telemetry.put_ocp_prometheus_data(config["telemetry"], prometheus_archive_files, telemetry_request_id) + # Capture the end time end_time = int(time.time()) From 5c13597bd11363c7804e3f37e2c875fbe7797403 Mon Sep 17 00:00:00 2001 From: Tullio Sebastiani Date: Wed, 2 Aug 2023 12:51:40 +0200 Subject: [PATCH 16/23] catch send_telemetry exception --- run_kraken.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/run_kraken.py b/run_kraken.py index 9c6ee4c1..ee69b08d 100644 --- a/run_kraken.py +++ b/run_kraken.py @@ -380,11 +380,14 @@ def main(cfg): logging.info(f"telemetry data will be stored on s3 bucket folder: {telemetry_request_id}") logging.info(f"telemetry upload log: {safe_logger.log_file_name}") - telemetry.send_telemetry(config["telemetry"], telemetry_request_id, chaos_telemetry) - safe_logger.info("archives download started:") - prometheus_archive_files = telemetry.get_ocp_prometheus_data(config["telemetry"], telemetry_request_id) - safe_logger.info("archives upload started:") - telemetry.put_ocp_prometheus_data(config["telemetry"], prometheus_archive_files, telemetry_request_id) + try: + telemetry.send_telemetry(config["telemetry"], telemetry_request_id, chaos_telemetry) + safe_logger.info("archives download started:") + prometheus_archive_files = telemetry.get_ocp_prometheus_data(config["telemetry"], telemetry_request_id) + safe_logger.info("archives upload started:") + telemetry.put_ocp_prometheus_data(config["telemetry"], prometheus_archive_files, telemetry_request_id) + except Exception as e: + logging.error(f"failed to send telemetry data: {str(e)}") # Capture the end time end_time = int(time.time()) From a3dbf933e6790747a874263abc09ed0026288bd8 Mon Sep 17 00:00:00 2001 From: Tullio Sebastiani Date: Wed, 2 Aug 2023 15:47:55 +0200 Subject: [PATCH 17/23] scenario collection bug fixes --- 
kraken/network_chaos/actions.py | 1 + kraken/plugins/__init__.py | 1 - kraken/pvc/pvc_scenario.py | 1 + 3 files changed, 2 insertions(+), 1 deletion(-) diff --git a/kraken/network_chaos/actions.py b/kraken/network_chaos/actions.py index 9909e1bb..0b209c72 100644 --- a/kraken/network_chaos/actions.py +++ b/kraken/network_chaos/actions.py @@ -104,6 +104,7 @@ def run(scenarios_list, config, wait_duration, kubecli: krkn_lib_kubernetes.Krkn telemetry.log_exception(net_config) else: scenario_telemetry.exitStatus = 0 + scenario_telemetries.append(scenario_telemetry) return failed_scenarios, scenario_telemetries diff --git a/kraken/plugins/__init__.py b/kraken/plugins/__init__.py index 7fb28dfb..aff7beb0 100644 --- a/kraken/plugins/__init__.py +++ b/kraken/plugins/__init__.py @@ -243,7 +243,6 @@ def run(scenarios: List[str], kubeconfig_path: str, kraken_config: str, failed_p else: scenario_telemetry.exitStatus = 0 logging.info("Waiting for the specified duration: %s" % (wait_duration)) - scenario_telemetries.append(scenario_telemetry) time.sleep(wait_duration) scenario_telemetries.append(scenario_telemetry) scenario_telemetry.endTimeStamp = time.time() diff --git a/kraken/pvc/pvc_scenario.py b/kraken/pvc/pvc_scenario.py index aa9e297a..ca055009 100644 --- a/kraken/pvc/pvc_scenario.py +++ b/kraken/pvc/pvc_scenario.py @@ -311,6 +311,7 @@ def run(scenarios_list, config, kubecli: krkn_lib_kubernetes.KrknLibKubernetes, telemetry.log_exception(app_config[0]) else: scenario_telemetry.exitStatus = 0 + scenario_telemetries.append(scenario_telemetry) return failed_scenarios, scenario_telemetries From f63fe8d811d9f863e4b84ddbb872788e51cbfec7 Mon Sep 17 00:00:00 2001 From: Tullio Sebastiani Date: Thu, 3 Aug 2023 16:54:25 +0200 Subject: [PATCH 18/23] telemetry enabled check --- run_kraken.py | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/run_kraken.py b/run_kraken.py index ee69b08d..23a2a710 100644 --- a/run_kraken.py +++ b/run_kraken.py @@ 
-377,17 +377,19 @@ def main(cfg): logging.info("") # telemetry - logging.info(f"telemetry data will be stored on s3 bucket folder: {telemetry_request_id}") - logging.info(f"telemetry upload log: {safe_logger.log_file_name}") - - try: - telemetry.send_telemetry(config["telemetry"], telemetry_request_id, chaos_telemetry) - safe_logger.info("archives download started:") - prometheus_archive_files = telemetry.get_ocp_prometheus_data(config["telemetry"], telemetry_request_id) - safe_logger.info("archives upload started:") - telemetry.put_ocp_prometheus_data(config["telemetry"], prometheus_archive_files, telemetry_request_id) - except Exception as e: - logging.error(f"failed to send telemetry data: {str(e)}") + if config["telemetry"]["enabled"]: + logging.info(f"telemetry data will be stored on s3 bucket folder: {telemetry_request_id}") + logging.info(f"telemetry upload log: {safe_logger.log_file_name}") + try: + telemetry.send_telemetry(config["telemetry"], telemetry_request_id, chaos_telemetry) + safe_logger.info("archives download started:") + prometheus_archive_files = telemetry.get_ocp_prometheus_data(config["telemetry"], telemetry_request_id) + safe_logger.info("archives upload started:") + telemetry.put_ocp_prometheus_data(config["telemetry"], prometheus_archive_files, telemetry_request_id) + except Exception as e: + logging.error(f"failed to send telemetry data: {str(e)}") + else: + logging.info("telemetry collection disabled, skipping.") # Capture the end time end_time = int(time.time()) From 805593590dcd2e1ed14c415dbf3057d8b80088a2 Mon Sep 17 00:00:00 2001 From: Tullio Sebastiani Date: Thu, 3 Aug 2023 17:19:55 +0200 Subject: [PATCH 19/23] telemetry run tag --- config/config.yaml | 3 ++- run_kraken.py | 2 ++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/config/config.yaml b/config/config.yaml index 0317a72f..4fbfb509 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -73,4 +73,5 @@ telemetry: full_prometheus_backup: False backup_threads: 
5 archive_path: /tmp - max_retries: 0 \ No newline at end of file + max_retries: 0 + run_tag: chaos \ No newline at end of file diff --git a/run_kraken.py b/run_kraken.py index 23a2a710..816f9b09 100644 --- a/run_kraken.py +++ b/run_kraken.py @@ -110,6 +110,8 @@ def main(cfg): # request_id for telemetry is generated once here and used everywhere telemetry_request_id = f"{int(time.time())}-{run_uuid}" + if config["telemetry"].get("run_tag"): + telemetry_request_id = f"{telemetry_request_id}-{config['telemetry']['run_tag']}" telemetry_log_file = f'{config["telemetry"]["archive_path"]}/{telemetry_request_id}.log' safe_logger = SafeLogger(filename=telemetry_log_file) From dff994a65822441f50f1df6c4fe68c10e8ade2d0 Mon Sep 17 00:00:00 2001 From: Tullio Sebastiani Date: Fri, 4 Aug 2023 17:02:44 +0200 Subject: [PATCH 20/23] requirements pointing to main + archive_size --- config/config.yaml | 3 ++- requirements.txt | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/config/config.yaml b/config/config.yaml index 4fbfb509..7f91ac98 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -74,4 +74,5 @@ telemetry: backup_threads: 5 archive_path: /tmp max_retries: 0 - run_tag: chaos \ No newline at end of file + run_tag: chaos + archive_size: 10000 #in KiloBytes \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 237a4eca..5e766acd 100644 --- a/requirements.txt +++ b/requirements.txt @@ -37,4 +37,4 @@ prometheus_api_client ibm_cloud_sdk_core ibm_vpc pytest -krkn-lib-kubernetes@git+https://github.com/redhat-chaos/krkn-lib-kubernetes.git@krkn_telemetry \ No newline at end of file +krkn-lib-kubernetes@git+https://github.com/redhat-chaos/krkn-lib-kubernetes.git@main \ No newline at end of file From 4059898fda6cc8f5f96d8ab245fec8d4471a6283 Mon Sep 17 00:00:00 2001 From: Tullio Sebastiani Date: Thu, 10 Aug 2023 17:10:02 +0200 Subject: [PATCH 21/23] requirements.txt and config.yaml update --- config/config.yaml | 27 
++++++++++++++++----------- requirements.txt | 2 +- 2 files changed, 17 insertions(+), 12 deletions(-) diff --git a/config/config.yaml b/config/config.yaml index 7f91ac98..156d23e0 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -65,14 +65,19 @@ tunings: iterations: 1 # Number of times to execute the scenarios daemon_mode: False # Iterations are set to infinity which means that the kraken will cause chaos forever telemetry: - enabled: True - api_url: https://ulnmf9xv7j.execute-api.us-west-2.amazonaws.com/production - username: username - password: password - prometheus_backup: True - full_prometheus_backup: False - backup_threads: 5 - archive_path: /tmp - max_retries: 0 - run_tag: chaos - archive_size: 10000 #in KiloBytes \ No newline at end of file + enabled: False # enable/disables the telemetry collection feature + api_url: https://ulnmf9xv7j.execute-api.us-west-2.amazonaws.com/production #telemetry service endpoint + username: username # telemetry service username + password: password # telemetry service password + prometheus_backup: True # enables/disables prometheus data collection + full_prometheus_backup: False # if is set to False only the /prometheus/wal folder will be downloaded. + backup_threads: 5 # number of telemetry download/upload threads + archive_path: /tmp # local path where the archive files will be temporarly stored + max_retries: 0 # maximum number of upload retries (if 0 will retry forever) + run_tag: '' # if set, this will be appended to the run folder in the bucket (useful to group the runs) + archive_size: 10000 # the size of the prometheus data archive size in KB. The lower the size of archive is + # the higher the number of archive files will be produced and uploaded (and processed by backup_threads + # simultaneously). 
+ # For unstable/slow connection is better to keep this value low + # increasing the number of backup_threads, in this way, on upload failure, the retry will happen only on the + # failed chunk without affecting the whole upload. \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 5e766acd..223d6ca0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -37,4 +37,4 @@ prometheus_api_client ibm_cloud_sdk_core ibm_vpc pytest -krkn-lib-kubernetes@git+https://github.com/redhat-chaos/krkn-lib-kubernetes.git@main \ No newline at end of file +krkn-lib-kubernetes >= 0.1.3 \ No newline at end of file From 3fbb887dff6c07118014886a6c112bac14327a2d Mon Sep 17 00:00:00 2001 From: Tullio Sebastiani Date: Thu, 10 Aug 2023 18:15:07 +0200 Subject: [PATCH 22/23] added telemetry config to common config --- CI/config/common_test_config.yaml | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/CI/config/common_test_config.yaml b/CI/config/common_test_config.yaml index f77435e4..c36a6d59 100644 --- a/CI/config/common_test_config.yaml +++ b/CI/config/common_test_config.yaml @@ -29,3 +29,15 @@ tunings: wait_duration: 6 # Duration to wait between each chaos scenario. iterations: 1 # Number of times to execute the scenarios. daemon_mode: False # Iterations are set to infinity which means that the kraken will cause chaos forever. +telemetry: + enabled: False # enable/disables the telemetry collection feature + api_url: https://ulnmf9xv7j.execute-api.us-west-2.amazonaws.com/production #telemetry service endpoint + username: username # telemetry service username + password: password # telemetry service password + prometheus_backup: True # enables/disables prometheus data collection + full_prometheus_backup: False # if is set to False only the /prometheus/wal folder will be downloaded. 
+ backup_threads: 5 # number of telemetry download/upload threads + archive_path: /tmp # local path where the archive files will be temporarily stored + max_retries: 0 # maximum number of upload retries (if 0 will retry forever) + run_tag: '' # if set, this will be appended to the run folder in the bucket (useful to group the runs) + archive_size: 10000 # the size of the prometheus data archive in KB. The lower the size of the archive is From a4f44eef096343e3ee70439a8f95bf74b5f3b693 Mon Sep 17 00:00:00 2001 From: Tullio Sebastiani Date: Thu, 10 Aug 2023 19:11:08 +0200 Subject: [PATCH 23/23] fixed scenario array elements for telemetry --- config/config.yaml | 32 ---------------------------- kraken/application_outage/actions.py | 8 +++---- kraken/pod_scenarios/setup.py | 2 +- kraken/pvc/pvc_scenario.py | 8 +++---- kraken/zone_outage/actions.py | 8 +++---- 5 files changed, 13 insertions(+), 45 deletions(-) diff --git a/config/config.yaml b/config/config.yaml index 156d23e0..d2d3ee9a 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -7,40 +7,8 @@ kraken: signal_address: 0.0.0.0 # Signal listening address port: 8081 # Signal port chaos_scenarios: # List of policies/chaos scenarios to load - - arcaflow_scenarios: - - scenarios/arcaflow/cpu-hog/input.yaml - - scenarios/arcaflow/memory-hog/input.yaml - - container_scenarios: # List of chaos pod scenarios to load - - - scenarios/openshift/container_etcd.yml - - plugin_scenarios: - - scenarios/openshift/etcd.yml - - scenarios/openshift/regex_openshift_pod_kill.yml - - scenarios/openshift/vmware_node_scenarios.yml - - scenarios/openshift/ibmcloud_node_scenarios.yml - - scenarios/openshift/network_chaos_ingress.yml - - scenarios/openshift/pod_network_outage.yml - - node_scenarios: # List of chaos node scenarios to load - - scenarios/openshift/node_scenarios_example.yml - - plugin_scenarios: - - scenarios/openshift/openshift-apiserver.yml - - scenarios/openshift/openshift-kube-apiserver.yml - - time_scenarios: # List 
of chaos time scenarios to load - - scenarios/openshift/time_scenarios_example.yml - - cluster_shut_down_scenarios: - - - scenarios/openshift/cluster_shut_down_scenario.yml - - scenarios/openshift/post_action_shut_down.py - - namespace_scenarios: - - - scenarios/openshift/regex_namespace.yaml - - - scenarios/openshift/ingress_namespace.yaml - - scenarios/openshift/post_action_namespace.py - - zone_outages: - - scenarios/openshift/zone_outage.yaml - application_outages: - scenarios/openshift/app_outage.yaml - - pvc_scenarios: - - scenarios/openshift/pvc_scenario.yaml - - network_chaos: - - scenarios/openshift/network_chaos.yaml cerberus: cerberus_enabled: False # Enable it when cerberus is previously installed diff --git a/kraken/application_outage/actions.py b/kraken/application_outage/actions.py index f8d32085..4d2ae0c6 100644 --- a/kraken/application_outage/actions.py +++ b/kraken/application_outage/actions.py @@ -14,9 +14,9 @@ def run(scenarios_list, config, wait_duration, telemetry: KrknTelemetry) -> (lis failed_scenarios = [] for app_outage_config in scenarios_list: scenario_telemetry = ScenarioTelemetry() - scenario_telemetry.scenario = app_outage_config[0] + scenario_telemetry.scenario = app_outage_config scenario_telemetry.startTimeStamp = time.time() - telemetry.set_parameters_base64(scenario_telemetry, app_outage_config[0]) + telemetry.set_parameters_base64(scenario_telemetry, app_outage_config) if len(app_outage_config) > 1: try: with open(app_outage_config, "r") as f: @@ -65,8 +65,8 @@ def run(scenarios_list, config, wait_duration, telemetry: KrknTelemetry) -> (lis cerberus.publish_kraken_status(config, failed_post_scenarios, start_time, end_time) except Exception as e : scenario_telemetry.exitStatus = 1 - failed_scenarios.append(app_outage_config[0]) - telemetry.log_exception(app_outage_config[0]) + failed_scenarios.append(app_outage_config) + telemetry.log_exception(app_outage_config) else: scenario_telemetry.exitStatus = 0 
scenario_telemetry.endTimeStamp = time.time() diff --git a/kraken/pod_scenarios/setup.py b/kraken/pod_scenarios/setup.py index 6071c327..2c02a5fd 100644 --- a/kraken/pod_scenarios/setup.py +++ b/kraken/pod_scenarios/setup.py @@ -80,7 +80,7 @@ def container_run(kubeconfig_path, for container_scenario_config in scenarios_list: scenario_telemetry = ScenarioTelemetry() - scenario_telemetry.scenario = container_scenario_config + scenario_telemetry.scenario = container_scenario_config[0] scenario_telemetry.startTimeStamp = time.time() telemetry.set_parameters_base64(scenario_telemetry, container_scenario_config[0]) if len(container_scenario_config) > 1: diff --git a/kraken/pvc/pvc_scenario.py b/kraken/pvc/pvc_scenario.py index ca055009..b9d1433a 100644 --- a/kraken/pvc/pvc_scenario.py +++ b/kraken/pvc/pvc_scenario.py @@ -19,9 +19,9 @@ def run(scenarios_list, config, kubecli: krkn_lib_kubernetes.KrknLibKubernetes, failed_scenarios = [] for app_config in scenarios_list: scenario_telemetry = ScenarioTelemetry() - scenario_telemetry.scenario = app_config[0] + scenario_telemetry.scenario = app_config scenario_telemetry.startTimeStamp = time.time() - telemetry.set_parameters_base64(scenario_telemetry, app_config[0]) + telemetry.set_parameters_base64(scenario_telemetry, app_config) try: if len(app_config) > 1: with open(app_config, "r") as f: @@ -307,8 +307,8 @@ def run(scenarios_list, config, kubecli: krkn_lib_kubernetes.KrknLibKubernetes, ) except (RuntimeError, Exception): scenario_telemetry.exitStatus = 1 - failed_scenarios.append(app_config[0]) - telemetry.log_exception(app_config[0]) + failed_scenarios.append(app_config) + telemetry.log_exception(app_config) else: scenario_telemetry.exitStatus = 0 scenario_telemetries.append(scenario_telemetry) diff --git a/kraken/zone_outage/actions.py b/kraken/zone_outage/actions.py index 3c2e3e01..96690068 100644 --- a/kraken/zone_outage/actions.py +++ b/kraken/zone_outage/actions.py @@ -16,9 +16,9 @@ def run(scenarios_list, config, 
wait_duration, telemetry: KrknTelemetry) -> (lis for zone_outage_config in scenarios_list: scenario_telemetry = ScenarioTelemetry() - scenario_telemetry.scenario = zone_outage_config[0] + scenario_telemetry.scenario = zone_outage_config scenario_telemetry.startTimeStamp = time.time() - telemetry.set_parameters_base64(scenario_telemetry, zone_outage_config[0]) + telemetry.set_parameters_base64(scenario_telemetry, zone_outage_config) try: if len(zone_outage_config) > 1: with open(zone_outage_config, "r") as f: @@ -109,8 +109,8 @@ def run(scenarios_list, config, wait_duration, telemetry: KrknTelemetry) -> (lis ) except (RuntimeError, Exception): scenario_telemetry.exitStatus = 1 - failed_scenarios.append(zone_outage_config[0]) - telemetry.log_exception(zone_outage_config[0]) + failed_scenarios.append(zone_outage_config) + telemetry.log_exception(zone_outage_config) else: scenario_telemetry.exitStatus = 0 scenario_telemetry.endTimeStamp = time.time()