Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add alert rules to metacontroller-operator based on the KF093 spec #124

Merged
merged 3 commits into from
Aug 20, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions requirements-integration.in
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,6 @@ pytest-operator
pyyaml
requests
tenacity
# Require >=0.4.0 because the reusable test infrastructure is only available from that version onward.
# This lower bound prevents pip-compile from resolving to an earlier version.
charmed-kubeflow-chisme>=0.4.0
69 changes: 65 additions & 4 deletions requirements-integration.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,16 @@ aiohttp==3.8.5
# via -r requirements-integration.in
aiosignal==1.3.1
# via aiohttp
anyio==4.4.0
# via httpx
asttokens==2.4.0
# via stack-data
async-timeout==4.0.3
# via aiohttp
attrs==23.1.0
# via aiohttp
# via
# aiohttp
# jsonschema
backcall==0.2.0
# via ipython
bcrypt==4.0.1
Expand All @@ -22,12 +26,16 @@ cachetools==5.3.1
# via google-auth
certifi==2023.7.22
# via
# httpcore
# httpx
# kubernetes
# requests
cffi==1.15.1
# via
# cryptography
# pynacl
charmed-kubeflow-chisme==0.4.3
# via -r requirements-integration.in
charset-normalizer==3.2.0
# via
# aiohttp
Expand All @@ -38,8 +46,12 @@ decorator==5.1.1
# via
# ipdb
# ipython
deepdiff==6.2.1
# via charmed-kubeflow-chisme
exceptiongroup==1.1.3
# via pytest
# via
# anyio
# pytest
executing==1.2.0
# via stack-data
frozenlist==1.4.0
Expand All @@ -48,12 +60,22 @@ frozenlist==1.4.0
# aiosignal
google-auth==2.22.0
# via kubernetes
h11==0.14.0
# via httpcore
httpcore==1.0.5
# via httpx
httpx==0.27.0
# via lightkube
hvac==1.2.1
# via juju
idna==3.4
# via
# anyio
# httpx
# requests
# yarl
importlib-resources==6.4.3
# via jsonschema
iniconfig==2.0.0
# via pytest
ipdb==0.13.13
Expand All @@ -65,13 +87,21 @@ jedi==0.19.0
jinja2==3.1.2
# via
# -r requirements-integration.in
# charmed-kubeflow-chisme
# pytest-operator
jsonschema==4.17.3
# via serialized-data-interface
juju==3.2.2
# via
# -r requirements-integration.in
# charmed-kubeflow-chisme
# pytest-operator
kubernetes==27.2.0
# via juju
lightkube==0.15.3
# via charmed-kubeflow-chisme
lightkube-models==1.30.0.8
# via lightkube
macaroonbakery==1.3.1
# via juju
markupsafe==2.1.3
Expand All @@ -88,6 +118,12 @@ oauthlib==3.2.2
# via
# kubernetes
# requests-oauthlib
ops==2.15.0
# via
# charmed-kubeflow-chisme
# serialized-data-interface
ordered-set==4.1.0
# via deepdiff
packaging==23.1
# via pytest
paramiko==2.12.0
Expand All @@ -98,6 +134,8 @@ pexpect==4.8.0
# via ipython
pickleshare==0.7.5
# via ipython
pkgutil-resolve-name==1.3.10
# via jsonschema
pluggy==1.3.0
# via pytest
prompt-toolkit==3.0.39
Expand Down Expand Up @@ -132,6 +170,8 @@ pyrfc3339==1.1
# via
# juju
# macaroonbakery
pyrsistent==0.20.0
# via jsonschema
pytest==7.4.2
# via
# -r requirements-integration.in
Expand All @@ -150,18 +190,28 @@ pyyaml==6.0.1
# -r requirements-integration.in
# juju
# kubernetes
# lightkube
# ops
# pytest-operator
# serialized-data-interface
requests==2.31.0
# via
# -r requirements-integration.in
# hvac
# kubernetes
# macaroonbakery
# requests-oauthlib
# serialized-data-interface
requests-oauthlib==1.3.1
# via kubernetes
rsa==4.9
# via google-auth
ruamel-yaml==0.18.6
# via charmed-kubeflow-chisme
ruamel-yaml-clib==0.2.8
# via ruamel-yaml
serialized-data-interface==0.7.0
# via charmed-kubeflow-chisme
six==1.16.0
# via
# asttokens
Expand All @@ -171,10 +221,16 @@ six==1.16.0
# paramiko
# pymacaroons
# python-dateutil
sniffio==1.3.1
# via
# anyio
# httpx
stack-data==0.6.2
# via ipython
tenacity==8.2.3
# via -r requirements-integration.in
# via
# -r requirements-integration.in
# charmed-kubeflow-chisme
tomli==2.0.1
# via
# ipdb
Expand All @@ -187,6 +243,7 @@ traitlets==5.9.0
# matplotlib-inline
typing-extensions==4.7.1
# via
# anyio
# ipython
# typing-inspect
typing-inspect==0.9.0
Expand All @@ -199,8 +256,12 @@ urllib3==1.26.16
wcwidth==0.2.6
# via prompt-toolkit
websocket-client==1.6.3
# via kubernetes
# via
# kubernetes
# ops
websockets==8.1
# via juju
yarl==1.9.2
# via aiohttp
zipp==3.20.0
# via importlib-resources
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# Prometheus alert rules for the metacontroller-operator charm.
# - KubeflowServiceDown (critical): `up` has been below 1 for 5 minutes.
# - KubeflowServiceIsNotStable (warning): the 10-minute average of `up` is below 0.5;
#   `for: 0m` means the alert fires without an additional pending period.
groups:
- name: KubeflowMetacontrollerOperatorServices
rules:
- alert: KubeflowServiceDown
expr: up{} < 1
for: 5m
labels:
severity: critical
annotations:
summary: "{{ $labels.juju_charm }} service is Down ({{ $labels.juju_model }}/{{ $labels.juju_unit }})"
description: |
One or more targets of {{ $labels.juju_charm }} charm are down on unit {{ $labels.juju_model }}/{{ $labels.juju_unit }}.
LABELS = {{ $labels }}

- alert: KubeflowServiceIsNotStable
expr: avg_over_time(up{}[10m]) < 0.5
for: 0m
labels:
severity: warning
annotations:
summary: "{{ $labels.juju_charm }} service is not stable ({{ $labels.juju_model }}/{{ $labels.juju_unit }})"
description: |
{{ $labels.juju_charm }} unit {{ $labels.juju_model }}/{{ $labels.juju_unit }} has been unreachable at least 50% of the time over the last 10 minutes.
LABELS = {{ $labels }}
10 changes: 0 additions & 10 deletions src/prometheus_alert_rules/unit_unavailable.rule

This file was deleted.

106 changes: 24 additions & 82 deletions tests/integration/test_charm.py
Original file line number Diff line number Diff line change
@@ -1,26 +1,23 @@
# Copyright 2021 Canonical Ltd.
# See LICENSE file for licensing details.

import json
import logging
from pathlib import Path

import pytest
import requests
import tenacity
import yaml
from charmed_kubeflow_chisme.testing import (
assert_alert_rules,
assert_metrics_endpoint,
deploy_and_assert_grafana_agent,
get_alert_rules,
)
from pytest_operator.plugin import OpsTest

# Module-level logger for this test module.
logger = logging.getLogger(__name__)

# Charm metadata, parsed from metadata.yaml relative to the current working directory.
METADATA = yaml.safe_load(Path("./metadata.yaml").read_text())
# Application name used to look up the charm under test in the model.
APP_NAME = "metacontroller-operator"
# Charm names and channels for the Prometheus/Grafana stack deployed by the integration test.
PROMETHEUS = "prometheus-k8s"
PROMETHEUS_CHANNEL = "latest/stable"
GRAFANA = "grafana-k8s"
GRAFANA_CHANNEL = "latest/stable"
PROMETHEUS_SCRAPE = "prometheus-scrape-config-k8s"
PROMETHEUS_SCRAPE_CHANNEL = "latest/stable"


@pytest.mark.abort_on_fail
Expand Down Expand Up @@ -54,83 +51,28 @@ async def test_build_and_deploy_with_trust(ops_test: OpsTest):
), f"Application {app_name}.Unit {i_unit}.workload_status != active"
assert ops_test.model.applications[APP_NAME].units[0].workload_status == "active"

# Deploying grafana-agent-k8s and add all relations
await deploy_and_assert_grafana_agent(
ops_test.model, APP_NAME, metrics=True, dashboard=True, logging=False
)

async def test_prometheus_grafana_integration(ops_test: OpsTest):
"""Deploy prometheus, grafana and required relations, then test the metrics."""
scrape_config = {"scrape_interval": "30s"}

# Deploy and relate prometheus
await ops_test.juju(
"deploy",
PROMETHEUS,
"--channel",
PROMETHEUS_CHANNEL,
"--trust",
check=True,
)
await ops_test.juju(
"deploy",
GRAFANA,
"--channel",
GRAFANA_CHANNEL,
"--trust",
check=True,
)
await ops_test.model.deploy(
PROMETHEUS_SCRAPE,
channel=PROMETHEUS_SCRAPE_CHANNEL,
config=scrape_config,
)
async def test_metrics_enpoint(ops_test):
    """Check that the charm's metrics endpoint is published and reachable.

    Delegates to ``assert_metrics_endpoint``, which collects the metrics
    endpoints from the relation data bag, verifies that they are available
    from the grafana-agent-k8s charm, and compares them with the port and
    path supplied here.
    """
    charm_app = ops_test.model.applications[APP_NAME]
    await assert_metrics_endpoint(charm_app, metrics_path="/metrics", metrics_port=9999)

await ops_test.model.add_relation(APP_NAME, PROMETHEUS_SCRAPE)
await ops_test.model.add_relation(
f"{PROMETHEUS}:grafana-dashboard", f"{GRAFANA}:grafana-dashboard"
)
await ops_test.model.add_relation(
f"{APP_NAME}:grafana-dashboard", f"{GRAFANA}:grafana-dashboard"
)
await ops_test.model.add_relation(
f"{PROMETHEUS}:metrics-endpoint",
f"{PROMETHEUS_SCRAPE}:metrics-endpoint",
)

await ops_test.model.wait_for_idle(status="active", timeout=60 * 20)

status = await ops_test.model.get_status()
prometheus_unit_ip = status["applications"][PROMETHEUS]["units"][f"{PROMETHEUS}/0"]["address"]
logger.info(f"Prometheus available at http://{prometheus_unit_ip}:9090")

for attempt in retry_for_5_attempts:
logger.info(
f"Testing prometheus deployment (attempt " f"{attempt.retry_state.attempt_number})"
)
with attempt:
r = requests.get(
f"http://{prometheus_unit_ip}:9090/api/v1/query?"
f'query=up{{juju_application="{APP_NAME}"}}'
)
response = json.loads(r.content.decode("utf-8"))
response_status = response["status"]
logger.info(f"Response status is {response_status}")
assert response_status == "success"

response_metric = response["data"]["result"][0]["metric"]
assert response_metric["juju_application"] == APP_NAME
assert response_metric["juju_model"] == ops_test.model_name

# Assert the unit is available by checking the query result
# The data is presented as a list [1707357912.349, '1'], where the
# first value is a timestamp and the second value is the state of the unit
# 1 means available, 0 means unavailable
assert response["data"]["result"][0]["value"][1] == "1"


# Retry helper: stop after 5 attempts or 30 seconds, whichever comes first,
# waiting with exponential backoff bounded between 1 and 10 seconds between tries.
# reraise=True surfaces the last underlying exception instead of tenacity's RetryError.
retry_for_5_attempts = tenacity.Retrying(
stop=(tenacity.stop_after_attempt(5) | tenacity.stop_after_delay(30)),
wait=tenacity.wait_exponential(multiplier=1, min=1, max=10),
reraise=True,
)
async def test_alert_rules(ops_test):
    """Verify the charm's alert rules against those in the relation data bag."""
    charm_app = ops_test.model.applications[APP_NAME]
    expected_rules = get_alert_rules()
    logger.info("found alert_rules: %s", expected_rules)
    await assert_alert_rules(charm_app, expected_rules)


# TODO: Add test for charm removal
Expand Down
Loading