Krkn telemetry integration #435

Merged · 23 commits · Aug 10, 2023
12 changes: 12 additions & 0 deletions CI/config/common_test_config.yaml
@@ -29,3 +29,15 @@ tunings:
    wait_duration: 6 # Duration to wait between each chaos scenario.
    iterations: 1 # Number of times to execute the scenarios.
    daemon_mode: False # If set to True, iterations are set to infinity, which means that Kraken will cause chaos forever.
telemetry:
    enabled: False # enables/disables the telemetry collection feature
    api_url: https://ulnmf9xv7j.execute-api.us-west-2.amazonaws.com/production # telemetry service endpoint
    username: username # telemetry service username
    password: password # telemetry service password
    prometheus_backup: True # enables/disables prometheus data collection
    full_prometheus_backup: False # if set to False, only the /prometheus/wal folder will be downloaded
    backup_threads: 5 # number of telemetry download/upload threads
    archive_path: /tmp # local path where the archive files will be temporarily stored
    max_retries: 0 # maximum number of upload retries (if 0, it will retry forever)
    run_tag: '' # if set, this will be appended to the run folder in the bucket (useful to group the runs)
    archive_size: 10000 # the size of the prometheus data archive in KB. The lower the archive size,
                        # the higher the number of archive files produced and uploaded (and processed
                        # simultaneously by backup_threads)
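As a quick sanity check on these defaults (the data volume here is a hypothetical example, not a measured figure): with roughly 500 MB of Prometheus data and archive_size: 10000 (about 10 MB per archive), on the order of 50 archive files would be produced, which the 5 backup_threads would upload in parallel batches of five.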
54 changes: 19 additions & 35 deletions config/config.yaml
@@ -7,41 +7,8 @@ kraken:
    signal_address: 0.0.0.0 # Signal listening address
    port: 8081 # Signal port
    chaos_scenarios: # List of policies/chaos scenarios to load
        - arcaflow_scenarios:
            - scenarios/arcaflow/cpu-hog/input.yaml
            - scenarios/arcaflow/memory-hog/input.yaml
        - container_scenarios: # List of chaos pod scenarios to load
            - - scenarios/openshift/container_etcd.yml
        - plugin_scenarios:
            - scenarios/openshift/etcd.yml
            - scenarios/openshift/regex_openshift_pod_kill.yml
            - scenarios/openshift/vmware_node_scenarios.yml
            - scenarios/openshift/ibmcloud_node_scenarios.yml
            - scenarios/openshift/network_chaos_ingress.yml
            - scenarios/openshift/pod_network_outage.yml
            - scenarios/openshift/pod_network_shaping.yml
        - node_scenarios: # List of chaos node scenarios to load
            - scenarios/openshift/node_scenarios_example.yml
        - plugin_scenarios:
            - scenarios/openshift/openshift-apiserver.yml
            - scenarios/openshift/openshift-kube-apiserver.yml
        - time_scenarios: # List of chaos time scenarios to load
            - scenarios/openshift/time_scenarios_example.yml
        - cluster_shut_down_scenarios:
            - - scenarios/openshift/cluster_shut_down_scenario.yml
              - scenarios/openshift/post_action_shut_down.py
        - namespace_scenarios:
            - - scenarios/openshift/regex_namespace.yaml
            - - scenarios/openshift/ingress_namespace.yaml
              - scenarios/openshift/post_action_namespace.py
        - zone_outages:
            - scenarios/openshift/zone_outage.yaml
        - application_outages:
            - scenarios/openshift/app_outage.yaml
        - pvc_scenarios:
            - scenarios/openshift/pvc_scenario.yaml
        - network_chaos:
            - scenarios/openshift/network_chaos.yaml
        - application_outages:
            - scenarios/openshift/app_outage.yaml

cerberus:
    cerberus_enabled: False # Enable it only when Cerberus is already installed
@@ -65,3 +32,20 @@ tunings:
    wait_duration: 60 # Duration to wait between each chaos scenario
    iterations: 1 # Number of times to execute the scenarios
    daemon_mode: False # If set to True, iterations are set to infinity, which means that Kraken will cause chaos forever
telemetry:
    enabled: False # enables/disables the telemetry collection feature
    api_url: https://ulnmf9xv7j.execute-api.us-west-2.amazonaws.com/production # telemetry service endpoint
    username: username # telemetry service username
    password: password # telemetry service password
    prometheus_backup: True # enables/disables prometheus data collection
    full_prometheus_backup: False # if set to False, only the /prometheus/wal folder will be downloaded
    backup_threads: 5 # number of telemetry download/upload threads
    archive_path: /tmp # local path where the archive files will be temporarily stored
    max_retries: 0 # maximum number of upload retries (if 0, it will retry forever)
    run_tag: '' # if set, this will be appended to the run folder in the bucket (useful to group the runs)
    archive_size: 10000 # the size of the prometheus data archive in KB. The lower the archive size,
                        # the higher the number of archive files produced and uploaded (and processed
                        # simultaneously by backup_threads).
                        # For unstable/slow connections it is better to keep this value low and increase
                        # backup_threads; this way, on upload failure, only the failed chunk is retried
                        # rather than the whole upload.
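To make the interplay between archive_size, backup_threads, and max_retries concrete, here is a minimal sketch of the chunked-backup scheme these comments describe. It is illustrative only, not the krkn-lib implementation: the helper names (archive_in_chunks, upload, backup) and the OSError-based failure signaling are assumptions.

```python
import concurrent.futures
import os
import tarfile
import time


def archive_in_chunks(src_dir: str, dst_dir: str, chunk_kb: int) -> list[str]:
    """Pack the files under src_dir into tar archives of roughly chunk_kb KB each."""
    chunks: list[str] = []
    batch: list[str] = []
    batch_kb = 0
    for root, _, files in os.walk(src_dir):
        for name in files:
            path = os.path.join(root, name)
            batch.append(path)
            batch_kb += max(os.path.getsize(path) // 1024, 1)
            if batch_kb >= chunk_kb:
                chunks.append(write_chunk(dst_dir, len(chunks), batch))
                batch, batch_kb = [], 0
    if batch:
        chunks.append(write_chunk(dst_dir, len(chunks), batch))
    return chunks


def write_chunk(dst_dir: str, index: int, paths: list[str]) -> str:
    archive = os.path.join(dst_dir, f"prometheus-{index:04d}.tar")
    with tarfile.open(archive, "w") as tar:
        for path in paths:
            tar.add(path)
    return archive


def upload(archive: str) -> None:
    # Placeholder for the real transfer (e.g. a PUT to a presigned URL);
    # assume it raises OSError on a failed attempt.
    pass


def upload_with_retries(archive: str, max_retries: int) -> None:
    attempt = 0
    while True:
        try:
            upload(archive)
            return
        except OSError:
            attempt += 1
            if max_retries != 0 and attempt >= max_retries:
                raise  # give up on this chunk only
            time.sleep(2)  # crude backoff before retrying the failed chunk


def backup(src_dir: str, tmp_dir: str, cfg: dict) -> None:
    chunks = archive_in_chunks(src_dir, tmp_dir, cfg["archive_size"])
    with concurrent.futures.ThreadPoolExecutor(cfg["backup_threads"]) as pool:
        futures = [pool.submit(upload_with_retries, c, cfg["max_retries"]) for c in chunks]
        for future in futures:
            future.result()  # re-raise if a chunk exhausted its retries
```

Under this scheme a multi-gigabyte WAL would be split into many small archives, each retried independently, which is exactly why the comment recommends a low archive_size over slow links.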
89 changes: 53 additions & 36 deletions kraken/application_outage/actions.py
@@ -4,25 +4,32 @@
import kraken.cerberus.setup as cerberus
from jinja2 import Template
import kraken.invoke.command as runcommand

from krkn_lib_kubernetes import ScenarioTelemetry, KrknTelemetry

# Reads the scenario config, applies and deletes a network policy to
# block the traffic for the specified duration
def run(scenarios_list, config, wait_duration):
def run(scenarios_list, config, wait_duration, telemetry: KrknTelemetry) -> (list[str], list[ScenarioTelemetry]):
    failed_post_scenarios = ""
    scenario_telemetries: list[ScenarioTelemetry] = []
    failed_scenarios = []
    for app_outage_config in scenarios_list:
        scenario_telemetry = ScenarioTelemetry()
        scenario_telemetry.scenario = app_outage_config
        scenario_telemetry.startTimeStamp = time.time()
        telemetry.set_parameters_base64(scenario_telemetry, app_outage_config)
        if len(app_outage_config) > 1:
            with open(app_outage_config, "r") as f:
                app_outage_config_yaml = yaml.full_load(f)
                scenario_config = app_outage_config_yaml["application_outage"]
                pod_selector = scenario_config.get("pod_selector", "{}")
                traffic_type = scenario_config.get("block", "[Ingress, Egress]")
                namespace = scenario_config.get("namespace", "")
                duration = scenario_config.get("duration", 60)
            try:
                with open(app_outage_config, "r") as f:
                    app_outage_config_yaml = yaml.full_load(f)
                    scenario_config = app_outage_config_yaml["application_outage"]
                    pod_selector = scenario_config.get("pod_selector", "{}")
                    traffic_type = scenario_config.get("block", "[Ingress, Egress]")
                    namespace = scenario_config.get("namespace", "")
                    duration = scenario_config.get("duration", 60)

            start_time = int(time.time())
                start_time = int(time.time())

            network_policy_template = """---
                network_policy_template = """---
apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
metadata:
@@ -31,28 +38,38 @@ def run(scenarios_list, config, wait_duration):
  podSelector:
    matchLabels: {{ pod_selector }}
  policyTypes: {{ traffic_type }}
"""
            t = Template(network_policy_template)
            rendered_spec = t.render(pod_selector=pod_selector, traffic_type=traffic_type)
            # Write the rendered template to a file
            with open("kraken_network_policy.yaml", "w") as f:
                f.write(rendered_spec)
            # Block the traffic by creating network policy
            logging.info("Creating the network policy")
            runcommand.invoke(
                "kubectl create -f %s -n %s --validate=false" % ("kraken_network_policy.yaml", namespace)
            )

            # wait for the specified duration
            logging.info("Waiting for the specified duration in the config: %s" % (duration))
            time.sleep(duration)

            # unblock the traffic by deleting the network policy
            logging.info("Deleting the network policy")
            runcommand.invoke("kubectl delete -f %s -n %s" % ("kraken_network_policy.yaml", namespace))

            logging.info("End of scenario. Waiting for the specified duration: %s" % (wait_duration))
            time.sleep(wait_duration)

            end_time = int(time.time())
            cerberus.publish_kraken_status(config, failed_post_scenarios, start_time, end_time)
"""
                t = Template(network_policy_template)
                rendered_spec = t.render(pod_selector=pod_selector, traffic_type=traffic_type)
                # Write the rendered template to a file
                with open("kraken_network_policy.yaml", "w") as f:
                    f.write(rendered_spec)
                # Block the traffic by creating network policy
                logging.info("Creating the network policy")
                runcommand.invoke(
                    "kubectl create -f %s -n %s --validate=false" % ("kraken_network_policy.yaml", namespace)
                )

                # wait for the specified duration
                logging.info("Waiting for the specified duration in the config: %s" % (duration))
                time.sleep(duration)

                # unblock the traffic by deleting the network policy
                logging.info("Deleting the network policy")
                runcommand.invoke("kubectl delete -f %s -n %s" % ("kraken_network_policy.yaml", namespace))

                logging.info("End of scenario. Waiting for the specified duration: %s" % (wait_duration))
                time.sleep(wait_duration)

                end_time = int(time.time())
                cerberus.publish_kraken_status(config, failed_post_scenarios, start_time, end_time)
            except Exception as e:
                scenario_telemetry.exitStatus = 1
                failed_scenarios.append(app_outage_config)
                telemetry.log_exception(app_outage_config)
            else:
                scenario_telemetry.exitStatus = 0
        scenario_telemetry.endTimeStamp = time.time()
        scenario_telemetries.append(scenario_telemetry)
    return failed_scenarios, scenario_telemetries
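The lifecycle introduced here (stamp the start time, attach the base64-encoded scenario parameters, run the scenario inside try/except, record the exit status and end time, and append to the list) is the same pattern the other scenario runners in this PR adopt. Here is a minimal, self-contained sketch of that flow; the ScenarioTelemetry class below is a stand-in with assumed fields, not the krkn_lib_kubernetes class:

```python
import time


class ScenarioTelemetry:
    """Stand-in for krkn_lib_kubernetes.ScenarioTelemetry (fields assumed)."""
    def __init__(self):
        self.scenario = ""
        self.startTimeStamp = 0.0
        self.endTimeStamp = 0.0
        self.exitStatus = 0


def run_scenarios(scenarios, execute):
    """execute(scenario) raises on failure; returns (failed, telemetries)."""
    failed, telemetries = [], []
    for scenario in scenarios:
        t = ScenarioTelemetry()
        t.scenario = scenario
        t.startTimeStamp = time.time()
        try:
            execute(scenario)
        except Exception:
            t.exitStatus = 1          # non-zero marks the scenario as failed
            failed.append(scenario)
        else:
            t.exitStatus = 0
        t.endTimeStamp = time.time()  # stamped on success and failure alike
        telemetries.append(t)
    return failed, telemetries


# usage: fail the second scenario to exercise both paths
def execute(s):
    if "bad" in s:
        raise RuntimeError("boom")

print(run_scenarios(["ok.yml", "bad.yml"], execute)[0])  # ['bad.yml']
```

Note that the end timestamp is recorded on both paths, so a failed scenario still produces a complete telemetry record.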

27 changes: 19 additions & 8 deletions kraken/arcaflow_plugin/arcaflow_plugin.py
@@ -1,3 +1,5 @@
import time

import arcaflow
import os
import yaml
@@ -6,22 +8,31 @@
from pathlib import Path
from typing import List
from .context_auth import ContextAuth
from krkn_lib_kubernetes import ScenarioTelemetry, KrknTelemetry


def run(scenarios_list: List[str], kubeconfig_path: str):
def run(scenarios_list: List[str], kubeconfig_path: str, telemetry: KrknTelemetry) -> (list[str], list[ScenarioTelemetry]):
    scenario_telemetries: list[ScenarioTelemetry] = []
    failed_post_scenarios = []
    for scenario in scenarios_list:
        scenario_telemetry = ScenarioTelemetry()
        scenario_telemetry.scenario = scenario
        scenario_telemetry.startTimeStamp = time.time()
        telemetry.set_parameters_base64(scenario_telemetry, scenario)
        engine_args = build_args(scenario)
        run_workflow(engine_args, kubeconfig_path)
        status_code = run_workflow(engine_args, kubeconfig_path)
        scenario_telemetry.endTimeStamp = time.time()
        scenario_telemetry.exitStatus = status_code
        scenario_telemetries.append(scenario_telemetry)
        if status_code != 0:
            failed_post_scenarios.append(scenario)
    return failed_post_scenarios, scenario_telemetries


def run_workflow(engine_args: arcaflow.EngineArgs, kubeconfig_path: str):
def run_workflow(engine_args: arcaflow.EngineArgs, kubeconfig_path: str) -> int:
    set_arca_kubeconfig(engine_args, kubeconfig_path)
    exit_status = arcaflow.run(engine_args)
    if exit_status != 0:
        logging.error(
            f"failed to run arcaflow scenario {engine_args.input}"
        )
        sys.exit(exit_status)
    return exit_status


def build_args(input_file: str) -> arcaflow.EngineArgs:
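A side note on the run_workflow change above: the old version called sys.exit on a failed workflow, which aborted the whole chaos run before later scenarios (or telemetry collection) could execute; returning the exit status instead lets the caller record the failure and move on. A small self-contained illustration with stand-in functions (not the arcaflow API):

```python
import logging
import sys


def run_workflow_old(succeeded: bool) -> None:
    if not succeeded:
        logging.error("workflow failed")
        sys.exit(1)  # kills the process: remaining scenarios never run


def run_workflow_new(succeeded: bool) -> int:
    if not succeeded:
        logging.error("workflow failed")
    return 0 if succeeded else 1


failed = []
for name, succeeded in [("cpu-hog", False), ("memory-hog", True)]:
    if run_workflow_new(succeeded) != 0:
        failed.append(name)  # failure recorded, loop keeps going
print(failed)  # ['cpu-hog']
```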