Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement COS integration #306

Merged
merged 11 commits into from
Oct 30, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
842 changes: 842 additions & 0 deletions lib/charms/grafana_agent/v0/cos_agent.py

Large diffs are not rendered by default.

4 changes: 4 additions & 0 deletions metadata.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,12 @@ provides:
container-runtime:
interface: container-runtime
scope: container
cos-agent:
interface: cos_agent
kube-control:
interface: kube-control
tokens:
interface: tokens
requires:
certificates:
interface: tls-certificates
Expand Down
2 changes: 2 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,10 @@ charm-lib-interface-container-runtime @ git+https://github.com/charmed-kubernete
charm-lib-interface-external-cloud-provider @ git+https://github.com/charmed-kubernetes/charm-lib-interface-external-cloud-provider
charm-lib-interface-kube-dns @ git+https://github.com/charmed-kubernetes/charm-lib-interface-kube-dns
charm-lib-interface-kubernetes-cni @ git+https://github.com/charmed-kubernetes/charm-lib-interface-kubernetes-cni
charm-lib-interface-tokens @ git+https://github.com/charmed-kubernetes/charm-lib-interface-tokens
charm-lib-kubernetes-snaps @ git+https://github.com/charmed-kubernetes/charm-lib-kubernetes-snaps
charm-lib-reconciler @ git+https://github.com/charmed-kubernetes/charm-lib-reconciler
cosl == 0.0.7
gunicorn >= 20.0.0,<21.0.0
jinja2
loadbalancer_interface
Expand Down
48 changes: 48 additions & 0 deletions scripts/update_alert_rules.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
# Copyright 2023 Canonical, Ltd.
"""Sync AlertManager rules from upstream repository."""

import os
import shutil
from urllib.error import URLError
from urllib.request import urlopen

# Configuration constants
# kube-prometheus release tag the rules are synced from.
VERSION = "v0.13.0"
SOURCE = (
    f"https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/{VERSION}/manifests"
)
# Upstream PrometheusRule manifests mirrored into this charm.
FILES = [
    "kubePrometheus-prometheusRule.yaml",
    "kubeStateMetrics-prometheusRule.yaml",
    "kubernetesControlPlane-prometheusRule.yaml",
]
# Charm-local directory the fetched alert rules are written to.
DIR = "src/prometheus_alert_rules"


def fetch_file(source_url):
    """Download *source_url* and return its decoded, stripped body.

    Returns None (after printing a diagnostic) when the URL cannot be fetched.
    """
    try:
        with urlopen(source_url) as response:
            body = response.read()
    except URLError as err:
        print(f"Failed to fetch {source_url}. Error: {err}")
        return None
    return body.decode().strip()


def main():
    """Refresh the local alert-rule directory from the upstream manifests."""
    # Start from a clean slate so rules removed upstream do not linger.
    shutil.rmtree(DIR, ignore_errors=True)
    os.makedirs(DIR, exist_ok=True)

    for file in FILES:
        # Build the URL explicitly: os.path.join uses the platform path
        # separator and would produce backslashes in the URL on Windows.
        source_url = f"{SOURCE}/{file}"
        data = fetch_file(source_url)

        # Only write data if it's successfully fetched
        if data:
            print(f"Saving Rule {file}")
            with open(os.path.join(DIR, file), "w") as f:
                f.write(data)


if __name__ == "__main__":
    main()
88 changes: 88 additions & 0 deletions scripts/update_grafana_dashboards.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
# Copyright 2023 Canonical, Ltd.
"""Sync Grafana dashboards from upstream repository.

Dashboard changes:
- Remove built-in $prometheus datasource (COS adds the datasource automatically)
"""

import json
import os
import shutil
from urllib.request import urlopen

import yaml

# kube-prometheus release tag the dashboards are synced from.
VERSION = "v0.13.0"
SOURCE_URL = f"https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/{VERSION}/manifests/grafana-dashboardDefinitions.yaml"
# Dashboard files (ConfigMap data keys) to extract from the upstream manifest;
# any dashboards not listed here are ignored.
DASHBOARDS = {
    "apiserver.json",
    "cluster-total.json",
    "controller-manager.json",
    "k8s-resources-cluster.json",
    "k8s-resources-multicluster.json",
    "k8s-resources-namespace.json",
    "k8s-resources-node.json",
    "k8s-resources-pod.json",
    "k8s-resources-workload.json",
    "k8s-resources-workloads-namespace.json",
    "kubelet.json",
    "namespace-by-pod.json",
    "namespace-by-workload.json",
    "persistentvolumesusage.json",
    "pod-total.json",
    "proxy.json",
    "scheduler.json",
    "workload-total.json",
}
# Charm-local directory the prepared dashboards are written to.
TARGET_DIR = "src/grafana_dashboards"


def fetch_dashboards(source_url):
    """Download the upstream dashboard-definitions manifest and YAML-parse it."""
    print(f"Fetching dashboard data from {source_url}")
    with urlopen(source_url) as request:
        raw = request.read()
    return yaml.safe_load(raw)


def process_dashboards_data(data):
    """Yield (filename, parsed dashboard dict) pairs for tracked dashboards.

    Dashboards whose name is not in DASHBOARDS are skipped.
    """
    for config_map in data["items"]:
        for name, raw in config_map["data"].items():
            if name in DASHBOARDS:
                yield name, json.loads(raw)


def prepare_dashboard(json_value):
    """Prepare dashboard data for COS integration."""
    # Drop the built-in Prometheus datasource template variable, if present;
    # COS adds its own datasource automatically.
    variables = json_value.get("templating", {}).get("list", [])
    builtin = next(
        (
            var
            for var in variables
            if var.get("name") == "datasource" and var.get("type") == "datasource"
        ),
        None,
    )
    if builtin is not None:
        variables.remove(builtin)

    # Serialize and point every reference at the COS-provided datasource.
    serialized = json.dumps(json_value, indent=4)
    return serialized.replace("$datasource", "$prometheusds")


def save_dashboard_to_file(name, data):
    """Persist a serialized dashboard under TARGET_DIR as *name*."""
    destination = os.path.join(TARGET_DIR, name)
    print(f"Saving dashboard '{name}' to {destination}")
    with open(destination, "w") as dashboard_file:
        dashboard_file.write(data)


def main():
    """Sync, prepare, and save the tracked Grafana dashboards."""
    # Start clean; makedirs (vs mkdir) also tolerates a missing parent
    # directory, matching scripts/update_alert_rules.py.
    shutil.rmtree(TARGET_DIR, ignore_errors=True)
    os.makedirs(TARGET_DIR, exist_ok=True)

    data = fetch_dashboards(SOURCE_URL)

    for name, dashboard_data in process_dashboards_data(data):
        dashboard = prepare_dashboard(dashboard_data)
        save_dashboard_to_file(name, dashboard)


if __name__ == "__main__":
    main()
56 changes: 52 additions & 4 deletions src/charm.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
# Copyright 2023 Canonical
# See LICENSE file for licensing details.

"""Charm."""
"""Charmed Machine Operator for Kubernetes Control Plane."""

import logging
import os
Expand All @@ -15,23 +15,29 @@
import yaml
from cdk_addons import CdkAddons
from charms import kubernetes_snaps
from charms.grafana_agent.v0.cos_agent import COSAgentProvider
from charms.interface_container_runtime import ContainerRuntimeProvides
from charms.interface_external_cloud_provider import ExternalCloudProvider
from charms.interface_kube_dns import KubeDnsRequires
from charms.interface_kubernetes_cni import KubernetesCniProvides
from charms.interface_tokens import TokensProvider
from charms.kubernetes_libs.v0.etcd import EtcdReactiveRequires
from charms.reconciler import Reconciler
from cos_integration import COSIntegration
from k8s_api_endpoints import K8sApiEndpoints
from kubectl import kubectl
from loadbalancer_interface import LBProvider
from ops import BlockedStatus, WaitingStatus
from ops.interface_kube_control import KubeControlProvides
from ops.interface_tls_certificates import CertificatesRequires

log = logging.getLogger(__name__)

OBSERVABILITY_ROLE = "system:cos"


class KubernetesControlPlaneCharm(ops.CharmBase):
"""Charm."""
"""Charmed Operator for Kubernetes Control Plane."""

def __init__(self, *args):
super().__init__(*args)
Expand All @@ -41,6 +47,16 @@ def __init__(self, *args):
self, endpoint="cni", default_cni=self.model.config["default-cni"]
)
self.container_runtime = ContainerRuntimeProvides(self, endpoint="container-runtime")
self.cos_integration = COSIntegration(self)
self.cos_agent = COSAgentProvider(
self,
relation_name="cos-agent",
scrape_configs=self.cos_integration.get_metrics_endpoints,
refresh_events=[
self.on.peer_relation_changed,
self.on.upgrade_charm,
],
)
self.etcd = EtcdReactiveRequires(self)
self.k8s_api_endpoints = K8sApiEndpoints(self)
self.kube_control = KubeControlProvides(self, endpoint="kube-control")
Expand All @@ -49,6 +65,7 @@ def __init__(self, *args):
self.lb_internal = LBProvider(self, "loadbalancer-internal")
self.external_cloud_provider = ExternalCloudProvider(self, "external-cloud-provider")
self.reconciler = Reconciler(self, self.reconcile)
self.tokens = TokensProvider(self, endpoint="tokens")

def api_dependencies_ready(self):
common_name = kubernetes_snaps.get_public_address()
Expand Down Expand Up @@ -213,9 +230,8 @@ def configure_scheduler(self):

def create_kubeconfigs(self):
ca = self.certificates.ca
fqdn = self.external_cloud_provider.name == "aws"
local_server = self.k8s_api_endpoints.local()
node_name = kubernetes_snaps.get_node_name(fqdn)
node_name = self.get_node_name()
public_server = self.k8s_api_endpoints.external()

if not os.path.exists("/root/.kube/config"):
Expand Down Expand Up @@ -296,6 +312,32 @@ def create_kubeconfigs(self):
user="kube-proxy",
)

def configure_observability(self):
"""Apply observability configurations to the cluster."""
# Apply Clusterrole and Clusterrole binding for COS observability
if self.unit.is_leader():
kubectl("apply", "-f", "templates/observability.yaml")
# Issue a token for metrics scraping
node_name = self.get_node_name()
cos_user = f"system:cos:{node_name}"
auth_webhook.create_token(
uid=self.model.unit.name, username=cos_user, groups=[OBSERVABILITY_ROLE]
)

def generate_tokens(self):
"""Generate and send tokens for units that request them."""
if not self.unit.is_leader():
return

self.tokens.remove_stale_tokens()

for request in self.tokens.token_requests:
tokens = {
user: auth_webhook.create_token(uid=request.unit, username=user, groups=[group])
for user, group in request.requests.items()
}
self.tokens.send_token(request, tokens)

def get_cluster_name(self):
peer_relation = self.model.get_relation("peer")
cluster_name = peer_relation.data[self.app].get("cluster-name")
Expand Down Expand Up @@ -327,6 +369,10 @@ def get_dns_domain(self):
def get_dns_port(self):
return self.kube_dns.port or 53

def get_node_name(self) -> str:
fqdn = self.external_cloud_provider.name == "aws"
return kubernetes_snaps.get_node_name(fqdn)

def reconcile(self, event):
"""Reconcile state change events."""
kubernetes_snaps.install(channel=self.model.config["channel"], control_plane=True)
Expand All @@ -349,6 +395,8 @@ def reconcile(self, event):
self.configure_kubelet()
self.configure_kube_proxy()
self.configure_kube_control()
self.generate_tokens()
self.configure_observability()

def request_certificates(self):
"""Request client and server certificates."""
Expand Down
99 changes: 99 additions & 0 deletions src/cos_integration.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
import logging
from dataclasses import dataclass
from subprocess import CalledProcessError

import auth_webhook
from ops import CharmBase
from tenacity import RetryError

log = logging.getLogger(__name__)

OBSERVABILITY_ROLE = "system:cos"


@dataclass
class JobConfig:
    """Configuration for a single Prometheus scrape job.

    Attributes:
        name (str): Scrape-job name, matching the monitored Kubernetes
            component (e.g., 'kube-apiserver').
        metrics_path (str): Endpoint path exposing the metrics
            (e.g., '/metrics').
        scheme (str): Endpoint scheme ('http' or 'https').
        target (str): Component network address as 'hostname:port'
            (e.g., 'localhost:6443').
    """

    name: str
    metrics_path: str
    scheme: str
    target: str


class COSIntegration:
    """Utility class that handles the integration with COS for Charmed Kubernetes.

    This class provides methods to retrieve and configure Prometheus metrics scraping endpoints
    based on the Kubernetes components running within the cluster.

    Attributes:
        charm (CharmBase): Reference to the base charm instance.
    """

    def __init__(self, charm: "CharmBase") -> None:
        self.charm = charm

    def _create_scrape_job(self, config: "JobConfig", node_name: str, token: str) -> dict:
        """Build a single Prometheus scrape-job dict for one component.

        Args:
            config: Description of the component endpoint to scrape.
            node_name: Kubernetes node name, attached as a static label.
            token: Bearer token used to authorize the scrape.
        """
        return {
            "tls_config": {"insecure_skip_verify": True},
            "authorization": {"credentials": token},
            "job_name": config.name,
            "metrics_path": config.metrics_path,
            "scheme": config.scheme,
            "static_configs": [
                {
                    "targets": [config.target],
                    "labels": {"node": node_name},
                }
            ],
            "relabel_configs": [
                {"target_label": "metrics_path", "replacement": config.metrics_path},
                {"target_label": "job", "replacement": config.name},
            ],
        }

    def get_metrics_endpoints(self) -> list:
        """Return the metrics endpoints for K8s components.

        Returns an empty list when the observability token is not yet
        available or cannot be retrieved.
        """
        log.info("Building Prometheus scraping jobs.")

        try:
            node_name = self.charm.get_node_name()
            cos_user = f"system:cos:{node_name}"
            token = auth_webhook.get_token(cos_user)
        except (CalledProcessError, RetryError):
            log.error("Failed to retrieve observability token.")
            return []

        if not token:
            log.info("COS Token not yet available")
            return []

        kubernetes_jobs = [
            JobConfig("kube-proxy", "/metrics", "http", "localhost:10249"),
            JobConfig("kube-apiserver", "/metrics", "https", "localhost:6443"),
            JobConfig("kube-controller-manager", "/metrics", "https", "localhost:10257"),
        ]
        kubelet_paths = [
            "/metrics",
            "/metrics/resource",
            "/metrics/cadvisor",
            "/metrics/probes",
        ]

        kubelet_jobs = [
            JobConfig(f"kubelet-{path.split('/')[-1]}", path, "https", "localhost:10250")
            for path in kubelet_paths
        ]

        # Fix: previously called a non-existent ``create_scrape_job`` with only
        # the job argument, which raised AttributeError and never forwarded the
        # node name and token computed above.
        return [
            self._create_scrape_job(job, node_name, token)
            for job in kubernetes_jobs + kubelet_jobs
        ]
Loading