From d0eae3f542c91cd3f13a2495f8f0d3229c8205bc Mon Sep 17 00:00:00 2001 From: Ivan Chvets Date: Tue, 29 Nov 2022 13:21:02 -0500 Subject: [PATCH] Bug: MLFlow Seldon secrets is incorrectly formatted. https://github.com/canonical/bundle-kubeflow/issues/429 Summary of changes: - Updated libs. - Updated ops (removed pinning). --- .../grafana_k8s/v0/grafana_dashboard.py | 432 +++++++++++-- .../observability_libs/v0/juju_topology.py | 49 +- .../prometheus_k8s/v0/prometheus_scrape.py | 602 ++++++++++-------- charms/mlflow-server/requirements.txt | 2 +- charms/mlflow-server/tests/unit/__init__.py | 10 + 5 files changed, 761 insertions(+), 334 deletions(-) create mode 100644 charms/mlflow-server/tests/unit/__init__.py diff --git a/charms/mlflow-server/lib/charms/grafana_k8s/v0/grafana_dashboard.py b/charms/mlflow-server/lib/charms/grafana_k8s/v0/grafana_dashboard.py index 4a2c739a..44f4d14b 100644 --- a/charms/mlflow-server/lib/charms/grafana_k8s/v0/grafana_dashboard.py +++ b/charms/mlflow-server/lib/charms/grafana_k8s/v0/grafana_dashboard.py @@ -176,15 +176,20 @@ def __init__(self, *args): """ import base64 +import hashlib import json import logging import lzma import os +import platform import re +import subprocess +import tempfile import uuid from pathlib import Path -from typing import Any, Dict, List, Optional, Union +from typing import Any, Dict, List, Optional, Tuple, Union +import yaml from ops.charm import ( CharmBase, HookEvent, @@ -213,7 +218,7 @@ def __init__(self, *args): # Increment this PATCH version before using `charmcraft publish-lib` or reset # to 0 if you are raising the major API version -LIBPATCH = 13 +LIBPATCH = 17 logger = logging.getLogger(__name__) @@ -222,7 +227,7 @@ def __init__(self, *args): DEFAULT_PEER_NAME = "grafana" RELATION_INTERFACE_NAME = "grafana_dashboard" -TEMPLATE_DROPDOWNS = [ +TOPOLOGY_TEMPLATE_DROPDOWNS = [ # type: ignore { "allValue": None, "datasource": "${prometheusds}", @@ -323,6 +328,9 @@ def __init__(self, *args): "type": "query", "useTags": False, }, +] + +DATASOURCE_TEMPLATE_DROPDOWNS = [ # type: ignore { "description": None, "error": None, @@ -338,6 +346,21 @@ def __init__(self, *args): "skipUrlSync": False, "type": "datasource", }, + { + "description": None, + "error": None, + "hide": 0, + "includeAll": False, + "label": None, + "multi": False, + "name": "lokids", + "options": [], + "query": "loki", + "refresh": 1, + "regex": "", + "skipUrlSync": False, + "type": "datasource", + }, ] REACTIVE_CONVERTER = { # type: ignore @@ -531,7 +554,7 @@ def _decode_dashboard_content(encoded_content: str) -> str: return lzma.decompress(base64.b64decode(encoded_content.encode("utf-8"))).decode() -def _convert_dashboard_fields(content: str) -> str: +def _convert_dashboard_fields(content: str, inject_dropdowns: bool = True) -> str: """Make sure values are present for Juju topology. Inserts Juju topology variables and selectors into the template, as well as @@ -541,6 +564,12 @@ def _convert_dashboard_fields(content: str) -> str: datasources = {} existing_templates = False + template_dropdowns = ( + TOPOLOGY_TEMPLATE_DROPDOWNS + DATASOURCE_TEMPLATE_DROPDOWNS # type: ignore + if inject_dropdowns + else DATASOURCE_TEMPLATE_DROPDOWNS + ) + # If the dashboard has __inputs, get the names to replace them. 
These are stripped # from reactive dashboards in GrafanaDashboardAggregator, but charm authors in # newer charms may import them directly from the marketplace @@ -552,7 +581,7 @@ def _convert_dashboard_fields(content: str) -> str: # If no existing template variables exist, just insert our own if "templating" not in dict_content: - dict_content["templating"] = {"list": [d for d in TEMPLATE_DROPDOWNS]} + dict_content["templating"] = {"list": [d for d in template_dropdowns]} # type: ignore else: # Otherwise, set a flag so we can go back later existing_templates = True @@ -563,12 +592,11 @@ def _convert_dashboard_fields(content: str) -> str: datasources[template_value["name"]] = template_value["query"].lower() # Put our own variables in the template - for d in TEMPLATE_DROPDOWNS: + for d in template_dropdowns: # type: ignore if d not in dict_content["templating"]["list"]: dict_content["templating"]["list"].insert(0, d) dict_content = _replace_template_fields(dict_content, datasources, existing_templates) - return json.dumps(dict_content) @@ -577,8 +605,7 @@ def _replace_template_fields( # noqa: C901 ) -> dict: """Make templated fields get cleaned up afterwards. - If existing datasource variables are present, try to substitute them, otherwise - assume they are all for Prometheus and put the prometheus variable there. + If existing datasource variables are present, try to substitute them. """ replacements = {"loki": "${lokids}", "prometheus": "${prometheusds}"} used_replacements = [] @@ -588,7 +615,7 @@ def _replace_template_fields( # noqa: C901 if datasources or not existing_templates: panels = dict_content["panels"] - # Go through all of the panels. If they have a datasource set, AND it's one + # Go through all the panels. If they have a datasource set, AND it's one # that we can convert to ${lokids} or ${prometheusds}, by stripping off the # ${} templating and comparing the name to the list we built, replace it, # otherwise, leave it alone. @@ -598,14 +625,14 @@ def _replace_template_fields( # noqa: C901 if "datasource" not in panel or not panel.get("datasource", ""): continue if not existing_templates: - panel["datasource"] = "${prometheusds}" + if "loki" in panel.get("datasource"): + panel["datasource"] = "${lokids}" + else: + panel["datasource"] = "${prometheusds}" else: if panel["datasource"].lower() in replacements.values(): # Already a known template variable continue - if not panel["datasource"]: - # Don't worry about null values - continue # Strip out variable characters and maybe braces ds = re.sub(r"(\$|\{|\})", "", panel["datasource"]) replacement = replacements.get(datasources[ds], "") @@ -628,6 +655,171 @@ def _replace_template_fields( # noqa: C901 return dict_content +def _inject_labels(content: str, topology: dict, transformer: "CosTool") -> str: + """Inject Juju topology into panel expressions via CosTool. + + A dashboard will have a structure approximating: + { + "__inputs": [], + "templating": { + "list": [ + { + "name": "prometheusds", + "type": "prometheus" + } + ] + }, + "panels": [ + { + "foo": "bar", + "targets": [ + { + "some": "field", + "expr": "up{job="foo"}" + }, + { + "some_other": "field", + "expr": "sum(http_requests_total{instance="$foo"}[5m])} + } + ], + "datasource": "${someds}" + } + ] + } + + `templating` is used elsewhere in this library, but the structure is not rigid. It is + not guaranteed that a panel will actually have any targets (it could be a "spacer" with + no datasource, hence no expression). It could have only one target. 
It could have multiple
+    targets. It could have multiple targets of which only one has an `expr` to evaluate. We need
+    to try to handle all of these concisely.
+
+    `cos-tool` (`github.com/canonical/cos-tool` as a Go module in general)
+    does not know "Grafana-isms", such as using `[$_variable]` to modify the query from the user
+    interface, so we add placeholders (as `5y`, since it must parse, but a dashboard looking for
+    five years for a panel query would be unusual).
+
+    Args:
+        content: dashboard content as a string
+        topology: a dict containing topology values
+        transformer: a 'CosTool' instance
+    Returns:
+        dashboard content with replaced values.
+    """
+    dict_content = json.loads(content)
+
+    if "panels" not in dict_content.keys():
+        return json.dumps(dict_content)
+
+    # Go through all the panels and inject topology labels
+    # Panels may have more than one 'target' where the expressions live, so that must be
+    # accounted for. Additionally, `promql-transform` does not necessarily gracefully handle
+    # expressions with range queries including variables. Exclude these.
+    #
+    # It is not a certainty that the `datasource` field will necessarily reflect the type, so
+    # operate on all fields.
+    panels = dict_content["panels"]
+    topology_with_prefix = {"juju_{}".format(k): v for k, v in topology.items()}
+
+    # We need to use an index so we can insert the changed element back later
+    for panel_idx, panel in enumerate(panels):
+        if type(panel) is not dict:
+            continue
+
+        # Use the index to insert it back in the same location
+        panels[panel_idx] = _modify_panel(panel, topology_with_prefix, transformer)
+
+    return json.dumps(dict_content)
+
+
+def _modify_panel(panel: dict, topology: dict, transformer: "CosTool") -> dict:
+    """Inject Juju topology into panel expressions via CosTool.
+
+    Args:
+        panel: a dashboard panel as a dict
+        topology: a dict containing topology values
+        transformer: a 'CosTool' instance
+    Returns:
+        the panel with injected values
+    """
+    if "targets" not in panel.keys():
+        return panel
+
+    # Pre-compile a regular expression to grab values from inside of []
+    range_re = re.compile(r"\[(?P<value>.*?)\]")
+    # Do the same for any offsets
+    offset_re = re.compile(r"offset\s+(?P<value>-?\s*[$\w]+)")
+
+    known_datasources = {"${prometheusds}": "promql", "${lokids}": "logql"}
+
+    targets = panel["targets"]
+
+    # We need to use an index so we can insert the changed element back later
+    for idx, target in enumerate(targets):
+        # If there's no expression, we don't need to do anything
+        if "expr" not in target.keys():
+            continue
+
+        if "datasource" not in panel.keys():
+            continue
+        elif panel["datasource"] not in known_datasources:
+            continue
+        querytype = known_datasources[panel["datasource"]]
+        expr = target["expr"]
+
+        # Capture all values inside `[]` into a list which we'll iterate over later to
+        # put them back in-order. Then apply the regex again and replace everything with
+        # `[5y]` so promql/parser will take it.
+        #
+        # Then do it again for offsets
+        range_values = [m.group("value") for m in range_re.finditer(expr)]
+        expr = range_re.sub(r"[5y]", expr)
+
+        offset_values = [m.group("value") for m in offset_re.finditer(expr)]
+        expr = offset_re.sub(r"offset 5y", expr)
+        # Retrieve the new expression (which may be unchanged if there were no label
+        # matchers in the expression, or if it was unable to be parsed, like logql.
It's
+        # virtually impossible to tell from any datasource "name" in a panel what the
+        # actual type is without re-implementing a complete dashboard parser, but no
+        # harm will come from passing invalid promql -- we'll just get the original back.
+        #
+        replacement = transformer.inject_label_matchers(expr, topology, querytype)
+
+        if replacement == target["expr"]:
+            # promql-transform caught an error. Move on
+            continue
+
+        # Go back and substitute values in [] which were pulled out
+        # Enumerate with an index... again. The same regex is ok, since it will still match
+        # `[(.*?)]`, which includes `[5y]`, our placeholder
+        for i, match in enumerate(range_re.finditer(replacement)):
+            # Replace one-by-one, starting from the left. We build the string back with
+            # `str.replace(string_to_replace, replacement_value, count)`. Limit the count
+            # to one, since we are going through one-by-one through the list we saved earlier
+            # in `range_values`.
+            replacement = replacement.replace(
+                "[{}]".format(match.group("value")),
+                "[{}]".format(range_values[i]),
+                1,
+            )
+
+        for i, match in enumerate(offset_re.finditer(replacement)):
+            # Replace one-by-one, starting from the left. We build the string back with
+            # `str.replace(string_to_replace, replacement_value, count)`. Limit the count
+            # to one, since we are going through one-by-one through the list we saved earlier
+            # in `offset_values`.
+            replacement = replacement.replace(
+                "offset {}".format(match.group("value")),
+                "offset {}".format(offset_values[i]),
+                1,
+            )
+
+        # Use the index to insert it back in the same location
+        targets[idx]["expr"] = replacement
+
+    panel["targets"] = targets
+    return panel
+
+
 def _type_convert_stored(obj):
     """Convert Stored* to their appropriate types, recursively."""
     if isinstance(obj, StoredList):
@@ -732,7 +924,7 @@ def __init__(
         If you would like to use relation name other than `grafana-dashboard`,
         you will need to specify the relation name via the `relation_name`
         argument when instantiating the :class:`GrafanaDashboardProvider` object.
-        However, it is strongly advised to keep the the default relation name,
+        However, it is strongly advised to keep the default relation name,
         so that people deploying your charm will have a consistent experience
         with all other charms that provide Grafana dashboards.
@@ -792,13 +984,15 @@ def __init__(
             self._on_grafana_dashboard_relation_changed,
         )

-    def add_dashboard(self, content: str) -> None:
+    def add_dashboard(self, content: str, inject_dropdowns: bool = True) -> None:
         """Add a dashboard to the relation managed by this :class:`GrafanaDashboardProvider`.

         Args:
             content: a string representing a Jinja template. Currently, no
                 global variables are added to the Jinja template evaluation
                 context.
+            inject_dropdowns: a :bool: indicating whether topology dropdowns should be
+                added to the dashboard
         """
         # Update of storage must be done irrespective of leadership, so
         # that the stored state is there when this unit becomes leader.
@@ -809,7 +1003,11 @@ def add_dashboard(self, content: str) -> None:
         # Use as id the first chars of the encoded dashboard, so that
         # it is predictable across units.
id = "prog:{}".format(encoded_dashboard[-24:-16]) - stored_dashboard_templates[id] = self._content_to_dashboard_object(encoded_dashboard) + + stored_dashboard_templates[id] = self._content_to_dashboard_object( + encoded_dashboard, inject_dropdowns + ) + stored_dashboard_templates[id]["dashboard_alt_uid"] = self._generate_alt_uid(id) if self._charm.unit.is_leader(): for dashboard_relation in self._charm.model.relations[self._relation_name]: @@ -836,7 +1034,9 @@ def update_dashboards(self) -> None: for dashboard_relation in self._charm.model.relations[self._relation_name]: self._upset_dashboards_on_relation(dashboard_relation) - def _update_all_dashboards_from_dir(self, _: Optional[HookEvent] = None) -> None: + def _update_all_dashboards_from_dir( + self, _: Optional[HookEvent] = None, inject_dropdowns: bool = True + ) -> None: """Scans the built-in dashboards and updates relations with changes.""" # Update of storage must be done irrespective of leadership, so # that the stored state is there when this unit becomes leader. @@ -852,15 +1052,16 @@ def _update_all_dashboards_from_dir(self, _: Optional[HookEvent] = None) -> None # Path.glob uses fnmatch on the backend, which is pretty limited, so use a # custom function for the filter - def _is_dashbaord(p: Path) -> bool: - return p.is_file and p.name.endswith((".json", ".json.tmpl", ".tmpl")) + def _is_dashboard(p: Path) -> bool: + return p.is_file() and p.name.endswith((".json", ".json.tmpl", ".tmpl")) - for path in filter(_is_dashbaord, Path(self._dashboards_path).glob("*")): + for path in filter(_is_dashboard, Path(self._dashboards_path).glob("*")): # path = Path(path) id = "file:{}".format(path.stem) stored_dashboard_templates[id] = self._content_to_dashboard_object( - _encode_dashboard_content(path.read_bytes()) + _encode_dashboard_content(path.read_bytes()), inject_dropdowns ) + stored_dashboard_templates[id]["dashboard_alt_uid"] = self._generate_alt_uid(id) self._stored.dashboard_templates = stored_dashboard_templates @@ -868,14 +1069,28 @@ def _is_dashbaord(p: Path) -> bool: for dashboard_relation in self._charm.model.relations[self._relation_name]: self._upset_dashboards_on_relation(dashboard_relation) - def _reinitialize_dashboard_data(self) -> None: + def _generate_alt_uid(self, key: str) -> str: + """Generate alternative uid for dashboards. + + Args: + key: A string used (along with charm.meta.name) to build the hash uid. + + Returns: A hash string. + """ + raw_dashboard_alt_uid = "{}-{}".format(self._charm.meta.name, key) + return hashlib.shake_256(raw_dashboard_alt_uid.encode("utf-8")).hexdigest(8) + + def _reinitialize_dashboard_data(self, inject_dropdowns: bool = True) -> None: """Triggers a reload of dashboard outside of an eventing workflow. + Args: + inject_dropdowns: a :bool: used to indicate whether topology dropdowns should be added + This will destroy any existing relation data. 
""" try: _resolve_dir_against_charm_path(self._charm, self._dashboards_path) - self._update_all_dashboards_from_dir() + self._update_all_dashboards_from_dir(inject_dropdowns=inject_dropdowns) except InvalidDirectoryPathError as e: logger.warning( @@ -890,7 +1105,7 @@ def _reinitialize_dashboard_data(self) -> None: del stored_dashboard_templates[dashboard_id] self._stored.dashboard_templates = stored_dashboard_templates - # With all of the file-based dashboards cleared out, force a refresh + # With all the file-based dashboards cleared out, force a refresh # of relation data if self._charm.unit.is_leader(): for dashboard_relation in self._charm.model.relations[self._relation_name]: @@ -936,11 +1151,12 @@ def _upset_dashboards_on_relation(self, relation: Relation) -> None: relation.data[self._charm.app]["dashboards"] = json.dumps(stored_data) - def _content_to_dashboard_object(self, content: str) -> Dict: + def _content_to_dashboard_object(self, content: str, inject_dropdowns: bool = True) -> Dict: return { "charm": self._charm.meta.name, "content": content, "juju_topology": self._juju_topology, + "inject_dropdowns": inject_dropdowns, } # This is not actually used in the dashboards, but is present to provide a secondary @@ -1010,6 +1226,7 @@ def __init__( super().__init__(charm, relation_name) self._charm = charm self._relation_name = relation_name + self._tranformer = CosTool(self._charm) self._stored.set_default(dashboards=dict()) @@ -1074,7 +1291,6 @@ def update_dashboards(self, relation: Optional[Relation] = None) -> None: updated. If not specified, all relations managed by this :class:`GrafanaDashboardConsumer` will be updated. """ - changes = False if self._charm.unit.is_leader(): relations = ( [relation] if relation else self._charm.model.relations[self._relation_name] @@ -1083,9 +1299,6 @@ def update_dashboards(self, relation: Optional[Relation] = None) -> None: for relation in relations: self._render_dashboards_and_signal_changed(relation) - if changes: - self.on.dashboards_changed.emit() - def _on_grafana_dashboard_relation_broken(self, event: RelationBrokenEvent) -> None: """Update job config when providers depart. 
@@ -1129,8 +1342,8 @@ def _render_dashboards_and_signal_changed(self, relation: Relation) -> bool: # # Import only if a charmed operator uses the consumer, we don't impose these # dependencies on the client - from jinja2 import Template # type: ignore - from jinja2.exceptions import TemplateSyntaxError # type: ignore + from jinja2 import Template + from jinja2.exceptions import TemplateSyntaxError # The dashboards are WAY too big since this ultimately calls out to Juju to # set the relation data, and it overflows the maximum argument length for @@ -1147,10 +1360,18 @@ def _render_dashboards_and_signal_changed(self, relation: Relation) -> bool: # decoded_content = None content = None error = None + topology = template.get("juju_topology", {}) try: decoded_content = _decode_dashboard_content(template["content"]) + inject_dropdowns = template.get("inject_dropdowns", True) content = Template(decoded_content).render() - content = _encode_dashboard_content(_convert_dashboard_fields(content)) + content = self._manage_dashboard_uid(content, template) + content = _convert_dashboard_fields(content, inject_dropdowns) + + if topology: + content = _inject_labels(content, topology, self._tranformer) + + content = _encode_dashboard_content(content) except lzma.LZMAError as e: error = str(e) relation_has_invalid_dashboards = True @@ -1221,6 +1442,15 @@ def _render_dashboards_and_signal_changed(self, relation: Relation) -> bool: # self.set_peer_data("dashboards", stored_dashboards) return True + def _manage_dashboard_uid(self, dashboard: str, template: dict) -> str: + """Add an uid to the dashboard if it is not present.""" + dashboard = json.loads(dashboard) + + if not dashboard.get("uid", None) and "dashboard_alt_uid" in template: + dashboard["uid"] = template["dashboard_alt_uid"] + + return json.dumps(dashboard) + def _remove_all_dashboards_for_relation(self, relation: Relation) -> None: """If an errored dashboard is in stored data, remove it and trigger a deletion.""" if self._get_stored_dashboards(relation.id): @@ -1267,11 +1497,11 @@ def _set_default_data(self) -> None: def set_peer_data(self, key: str, data: Any) -> None: """Put information into the peer data bucket instead of `StoredState`.""" - self._charm.peers.data[self._charm.app][key] = json.dumps(data) # type: ignore + self._charm.peers.data[self._charm.app][key] = json.dumps(data) # type: ignore[attr-defined] def get_peer_data(self, key: str) -> Any: """Retrieve information from the peer data bucket instead of `StoredState`.""" - data = self._charm.peers.data[self._charm.app].get(key, "") # type: ignore + data = self._charm.peers.data[self._charm.app].get(key, "") # type: ignore[attr-defined] return json.loads(data) if data else {} @@ -1521,10 +1751,10 @@ def _maybe_get_builtin_dashboards(self, event: RelationEvent) -> Dict: if dashboards_path: - def _is_dashbaord(p: Path) -> bool: - return p.is_file and p.name.endswith((".json", ".json.tmpl", ".tmpl")) + def _is_dashboard(p: Path) -> bool: + return p.is_file() and p.name.endswith((".json", ".json.tmpl", ".tmpl")) - for path in filter(_is_dashbaord, Path(dashboards_path).glob("*")): + for path in filter(_is_dashboard, Path(dashboards_path).glob("*")): # path = Path(path) if event.app.name in path.name: id = "file:{}".format(path.stem) @@ -1539,6 +1769,7 @@ def _content_to_dashboard_object(self, content: str, event: RelationEvent) -> Di "charm": event.app.name, "content": content, "juju_topology": self._juju_topology(event), + "inject_dropdowns": True, } # This is not actually used in the 
dashboards, but is present to provide a secondary @@ -1551,3 +1782,128 @@ def _juju_topology(self, event: RelationEvent) -> Dict: "application": event.app.name, "unit": event.unit.name, } + + +class CosTool: + """Uses cos-tool to inject label matchers into alert rule expressions and validate rules.""" + + _path = None + _disabled = False + + def __init__(self, charm): + self._charm = charm + + @property + def path(self): + """Lazy lookup of the path of cos-tool.""" + if self._disabled: + return None + if not self._path: + self._path = self._get_tool_path() + if not self._path: + logger.debug("Skipping injection of juju topology as label matchers") + self._disabled = True + return self._path + + def apply_label_matchers(self, rules: dict, type: str) -> dict: + """Will apply label matchers to the expression of all alerts in all supplied groups.""" + if not self.path: + return rules + for group in rules["groups"]: + rules_in_group = group.get("rules", []) + for rule in rules_in_group: + topology = {} + # if the user for some reason has provided juju_unit, we'll need to honor it + # in most cases, however, this will be empty + for label in [ + "juju_model", + "juju_model_uuid", + "juju_application", + "juju_charm", + "juju_unit", + ]: + if label in rule["labels"]: + topology[label] = rule["labels"][label] + + rule["expr"] = self.inject_label_matchers(rule["expr"], topology, type) + return rules + + def validate_alert_rules(self, rules: dict) -> Tuple[bool, str]: + """Will validate correctness of alert rules, returning a boolean and any errors.""" + if not self.path: + logger.debug("`cos-tool` unavailable. Not validating alert correctness.") + return True, "" + + with tempfile.TemporaryDirectory() as tmpdir: + rule_path = Path(tmpdir + "/validate_rule.yaml") + + # Smash "our" rules format into what upstream actually uses, which is more like: + # + # groups: + # - name: foo + # rules: + # - alert: SomeAlert + # expr: up + # - alert: OtherAlert + # expr: up + transformed_rules = {"groups": []} # type: ignore + for rule in rules["groups"]: + transformed = {"name": str(uuid.uuid4()), "rules": [rule]} + transformed_rules["groups"].append(transformed) + + rule_path.write_text(yaml.dump(transformed_rules)) + + args = [str(self.path), "validate", str(rule_path)] + # noinspection PyBroadException + try: + self._exec(args) + return True, "" + except subprocess.CalledProcessError as e: + logger.debug("Validating the rules failed: %s", e.output) + return False, ", ".join([line for line in e.output if "error validating" in line]) + + def inject_label_matchers(self, expression: str, topology: dict, type: str) -> str: + """Add label matchers to an expression.""" + if not topology: + return expression + if not self.path: + logger.debug("`cos-tool` unavailable. 
Leaving expression unchanged: %s", expression) + return expression + args = [str(self.path), "--format", type, "transform"] + + variable_topology = {k: "${}".format(k) for k in topology.keys()} + args.extend( + [ + "--label-matcher={}={}".format(key, value) + for key, value in variable_topology.items() + ] + ) + + # Pass a leading "--" so expressions with a negation or subtraction aren't interpreted as + # flags + args.extend(["--", "{}".format(expression)]) + # noinspection PyBroadException + try: + return self._exec(args) + except subprocess.CalledProcessError as e: + logger.debug('Applying the expression failed: "%s", falling back to the original', e) + return expression + + def _get_tool_path(self) -> Optional[Path]: + arch = platform.machine() + arch = "amd64" if arch == "x86_64" else arch + res = "cos-tool-{}".format(arch) + try: + path = Path(res).resolve() + path.chmod(0o777) + return path + except NotImplementedError: + logger.debug("System lacks support for chmod") + except FileNotFoundError: + logger.debug('Could not locate cos-tool at: "{}"'.format(res)) + return None + + def _exec(self, cmd) -> str: + result = subprocess.run(cmd, check=True, stdout=subprocess.PIPE) + output = result.stdout.decode("utf-8").strip() + return output diff --git a/charms/mlflow-server/lib/charms/observability_libs/v0/juju_topology.py b/charms/mlflow-server/lib/charms/observability_libs/v0/juju_topology.py index c985b1e7..e68e93ff 100644 --- a/charms/mlflow-server/lib/charms/observability_libs/v0/juju_topology.py +++ b/charms/mlflow-server/lib/charms/observability_libs/v0/juju_topology.py @@ -67,16 +67,15 @@ ``` """ - -import re from collections import OrderedDict from typing import Dict, List, Optional +from uuid import UUID # The unique Charmhub library identifier, never change it LIBID = "bced1658f20f49d28b88f61f83c2d232" LIBAPI = 0 -LIBPATCH = 2 +LIBPATCH = 4 class InvalidUUIDError(Exception): @@ -95,8 +94,8 @@ def __init__( model: str, model_uuid: str, application: str, - unit: str = None, - charm_name: str = None, + unit: Optional[str] = None, + charm_name: Optional[str] = None, ): """Build a JujuTopology object. @@ -126,29 +125,18 @@ def __init__( self._unit = unit def is_valid_uuid(self, uuid): - """Validate the supplied UUID against the Juju Model UUID pattern.""" - # TODO: - # Harness is harcoding an UUID that is v1 not v4: f2c1b2a6-e006-11eb-ba80-0242ac130004 - # See: https://github.com/canonical/operator/issues/779 - # - # >>> uuid.UUID("f2c1b2a6-e006-11eb-ba80-0242ac130004").version - # 1 - # - # we changed the validation of the 3ed UUID block: 4[a-f0-9]{3} -> [a-f0-9]{4} - # See: https://github.com/canonical/operator/blob/main/ops/testing.py#L1094 - # - # Juju in fact generates a UUID v4: https://github.com/juju/utils/blob/master/uuid.go#L62 - # but does not validate it is actually v4: - # See: - # - https://github.com/juju/utils/blob/master/uuid.go#L22 - # - https://github.com/juju/schema/blob/master/strings.go#L79 - # - # Once Harness fixes this, we should remove this comment and refactor the regex or - # the entire method using the uuid module to validate UUIDs - regex = re.compile( - "^[a-f0-9]{8}-?[a-f0-9]{4}-?[a-f0-9]{4}-?[89ab][a-f0-9]{3}-?[a-f0-9]{12}$" - ) - return bool(regex.match(uuid)) + """Validate the supplied UUID against the Juju Model UUID pattern. + + Args: + uuid: string that needs to be checked if it is valid v4 UUID. + + Returns: + True if parameter is a valid v4 UUID, False otherwise. 
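The new implementation follows below. One note on why it compares the round-tripped string rather than merely constructing the UUID: `UUID(value, version=4)` does not raise for a v1 input, it silently rewrites the version bits, so only the equality check rejects the v1 UUID that Harness used to hardcode (mentioned in the removed comments above). A self-contained sketch of that behaviour:

```python
from uuid import UUID, uuid4

def is_valid_v4(value: str) -> bool:
    # UUID(value, version=4) forces the version/variant bits instead of
    # raising, so the round-trip comparison is what detects a non-v4 input.
    try:
        return str(UUID(value, version=4)) == value
    except (ValueError, TypeError):
        return False

assert is_valid_v4(str(uuid4()))
# The v1 UUID previously hardcoded by Harness round-trips differently:
assert not is_valid_v4("f2c1b2a6-e006-11eb-ba80-0242ac130004")
```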
+ """ + try: + return str(UUID(uuid, version=4)) == uuid + except (ValueError, TypeError): + return False @classmethod def from_charm(cls, charm): @@ -193,7 +181,10 @@ def from_dict(cls, data: dict): ) def as_dict( - self, *, remapped_keys: Dict[str, str] = None, excluded_keys: List[str] = None + self, + *, + remapped_keys: Optional[Dict[str, str]] = None, + excluded_keys: Optional[List[str]] = None, ) -> OrderedDict: """Format the topology information into an ordered dict. diff --git a/charms/mlflow-server/lib/charms/prometheus_k8s/v0/prometheus_scrape.py b/charms/mlflow-server/lib/charms/prometheus_k8s/v0/prometheus_scrape.py index 85e922a9..92ac4506 100644 --- a/charms/mlflow-server/lib/charms/prometheus_k8s/v0/prometheus_scrape.py +++ b/charms/mlflow-server/lib/charms/prometheus_k8s/v0/prometheus_scrape.py @@ -1,6 +1,8 @@ # Copyright 2021 Canonical Ltd. # See LICENSE file for licensing details. -"""## Overview. +"""Source code can be found on GitHub at canonical/observability-libs/lib/charms/observability_libs. + +## Overview This document explains how to integrate with the Prometheus charm for the purpose of providing a metrics endpoint to Prometheus. It @@ -11,6 +13,13 @@ shared between Prometheus charms and any other charm that intends to provide a scrape target for Prometheus. +## Dependencies + +Using this library requires you to fetch the juju_topology library from +[observability-libs](https://charmhub.io/observability-libs/libraries/juju_topology). + +`charmcraft fetch-lib charms.observability_libs.v0.juju_topology` + ## Provider Library Usage This Prometheus charm interacts with its scrape targets using its @@ -246,7 +255,11 @@ def _on_scrape_targets_changed(self, event): - a single rule format, which is a simplified subset of the official format, comprising a single alert rule per file, using the same YAML fields. -The file name must have the `.rule` extension. +The file name must have one of the following extensions: +- `.rule` +- `.rules` +- `.yml` +- `.yaml` An example of the contents of such a file in the custom single rule format is shown below. 
@@ -322,14 +335,16 @@ def _on_scrape_targets_changed(self, event): import socket import subprocess import tempfile -import uuid +from collections import defaultdict from pathlib import Path -from typing import Dict, List, Optional, Tuple, Union +from typing import Callable, Dict, List, Optional, Tuple, Union +from urllib.parse import urlparse import yaml from charms.observability_libs.v0.juju_topology import JujuTopology from ops.charm import CharmBase, RelationRole from ops.framework import BoundEvent, EventBase, EventSource, Object, ObjectEvents +from ops.model import Relation # The unique Charmhub library identifier, never change it LIBID = "bc84295fef5f4049878f07b131968ee2" @@ -339,7 +354,7 @@ def _on_scrape_targets_changed(self, event): # Increment this PATCH version before using `charmcraft publish-lib` or reset # to 0 if you are raising the major API version -LIBPATCH = 21 +LIBPATCH = 25 logger = logging.getLogger(__name__) @@ -356,7 +371,10 @@ def _on_scrape_targets_changed(self, event): "sample_limit", "label_limit", "label_name_length_limit", - "label_value_lenght_limit", + "label_value_length_limit", + "scheme", + "basic_auth", + "tls_config", } DEFAULT_JOB = { "metrics_path": "/metrics", @@ -370,6 +388,216 @@ def _on_scrape_targets_changed(self, event): DEFAULT_ALERT_RULES_RELATIVE_PATH = "./src/prometheus_alert_rules" +class PrometheusConfig: + """A namespace for utility functions for manipulating the prometheus config dict.""" + + # relabel instance labels so that instance identifiers are globally unique + # stable over unit recreation + topology_relabel_config = { + "source_labels": ["juju_model", "juju_model_uuid", "juju_application"], + "separator": "_", + "target_label": "instance", + "regex": "(.*)", + } + + topology_relabel_config_wildcard = { + "source_labels": ["juju_model", "juju_model_uuid", "juju_application", "juju_unit"], + "separator": "_", + "target_label": "instance", + "regex": "(.*)", + } + + @staticmethod + def sanitize_scrape_config(job: dict) -> dict: + """Restrict permissible scrape configuration options. + + If job is empty then a default job is returned. The + default job is + + ``` + { + "metrics_path": "/metrics", + "static_configs": [{"targets": ["*:80"]}], + } + ``` + + Args: + job: a dict containing a single Prometheus job + specification. + + Returns: + a dictionary containing a sanitized job specification. + """ + sanitized_job = DEFAULT_JOB.copy() + sanitized_job.update({key: value for key, value in job.items() if key in ALLOWED_KEYS}) + return sanitized_job + + @staticmethod + def sanitize_scrape_configs(scrape_configs: List[dict]) -> List[dict]: + """A vectorized version of `sanitize_scrape_config`.""" + return [PrometheusConfig.sanitize_scrape_config(job) for job in scrape_configs] + + @staticmethod + def prefix_job_names(scrape_configs: List[dict], prefix: str) -> List[dict]: + """Adds the given prefix to all the job names in the given scrape_configs list.""" + modified_scrape_configs = [] + for scrape_config in scrape_configs: + job_name = scrape_config.get("job_name") + modified = scrape_config.copy() + modified["job_name"] = prefix + "_" + job_name if job_name else prefix + modified_scrape_configs.append(modified) + + return modified_scrape_configs + + @staticmethod + def expand_wildcard_targets_into_individual_jobs( + scrape_jobs: List[dict], + hosts: Dict[str, Tuple[str, str]], + topology: JujuTopology = None, + ) -> List[dict]: + """Extract wildcard hosts from the given scrape_configs list into separate jobs. 
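Before the argument descriptions below, a quick illustration of the fan-out; the job, unit names, and addresses are invented, and the import path assumes a charm that fetched this library:

```python
from charms.prometheus_k8s.v0.prometheus_scrape import PrometheusConfig

# One wildcard job and two hypothetical units, mapped to (address, path):
job = {"job_name": "demo", "static_configs": [{"targets": ["*:9090"]}]}
hosts = {
    "mlflow-server/0": ("10.1.0.7", ""),
    "mlflow-server/1": ("10.1.0.8", ""),
}

expanded = PrometheusConfig.expand_wildcard_targets_into_individual_jobs([job], hosts)
# Yields one job per unit: "demo-0" scraping 10.1.0.7:9090 and "demo-1"
# scraping 10.1.0.8:9090, each with metrics_path "/metrics".
```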
+ + Args: + scrape_jobs: list of scrape jobs. + hosts: a dictionary mapping host names to host address for + all units of the relation for which this job configuration + must be constructed. + topology: optional arg for adding topology labels to scrape targets. + """ + # hosts = self._relation_hosts(relation) + + modified_scrape_jobs = [] + for job in scrape_jobs: + static_configs = job.get("static_configs") + if not static_configs: + continue + + # When a single unit specified more than one wildcard target, then they are expanded + # into a static_config per target + non_wildcard_static_configs = [] + + for static_config in static_configs: + targets = static_config.get("targets") + if not targets: + continue + + # All non-wildcard targets remain in the same static_config + non_wildcard_targets = [] + + # All wildcard targets are extracted to a job per unit. If multiple wildcard + # targets are specified, they remain in the same static_config (per unit). + wildcard_targets = [] + + for target in targets: + match = re.compile(r"\*(?:(:\d+))?").match(target) + if match: + # This is a wildcard target. + # Need to expand into separate jobs and remove it from this job here + wildcard_targets.append(target) + else: + # This is not a wildcard target. Copy it over into its own static_config. + non_wildcard_targets.append(target) + + # All non-wildcard targets remain in the same static_config + if non_wildcard_targets: + non_wildcard_static_config = static_config.copy() + non_wildcard_static_config["targets"] = non_wildcard_targets + + if topology: + # When non-wildcard targets (aka fully qualified hostnames) are specified, + # there is no reliable way to determine the name (Juju topology unit name) + # for such a target. Therefore labeling with Juju topology, excluding the + # unit name. + non_wildcard_static_config["labels"] = { + **non_wildcard_static_config.get("labels", {}), + **topology.label_matcher_dict, + } + + non_wildcard_static_configs.append(non_wildcard_static_config) + + # Extract wildcard targets into individual jobs + if wildcard_targets: + for unit_name, (unit_hostname, unit_path) in hosts.items(): + modified_job = job.copy() + modified_job["static_configs"] = [static_config.copy()] + modified_static_config = modified_job["static_configs"][0] + modified_static_config["targets"] = [ + target.replace("*", unit_hostname) for target in wildcard_targets + ] + + unit_num = unit_name.split("/")[-1] + job_name = modified_job.get("job_name", "unnamed-job") + "-" + unit_num + modified_job["job_name"] = job_name + modified_job["metrics_path"] = unit_path + ( + job.get("metrics_path") or "/metrics" + ) + + if topology: + # Add topology labels + modified_static_config["labels"] = { + **modified_static_config.get("labels", {}), + **topology.label_matcher_dict, + **{"juju_unit": unit_name}, + } + + # Instance relabeling for topology should be last in order. + modified_job["relabel_configs"] = modified_job.get( + "relabel_configs", [] + ) + [PrometheusConfig.topology_relabel_config_wildcard] + + modified_scrape_jobs.append(modified_job) + + if non_wildcard_static_configs: + modified_job = job.copy() + modified_job["static_configs"] = non_wildcard_static_configs + modified_job["metrics_path"] = modified_job.get("metrics_path") or "/metrics" + + if topology: + # Instance relabeling for topology should be last in order. 
+ modified_job["relabel_configs"] = modified_job.get("relabel_configs", []) + [ + PrometheusConfig.topology_relabel_config + ] + + modified_scrape_jobs.append(modified_job) + + return modified_scrape_jobs + + @staticmethod + def render_alertmanager_static_configs(alertmanagers: List[str]): + """Render the alertmanager static_configs section from a list of URLs. + + Each target must be in the hostname:port format, and prefixes are specified in a separate + key. Therefore, with ingress in place, would need to extract the path into the + `path_prefix` key, which is higher up in the config hierarchy. + + https://prometheus.io/docs/prometheus/latest/configuration/configuration/#alertmanager_config + + Args: + alertmanagers: List of alertmanager URLs. + + Returns: + A dict representation for the static_configs section. + """ + # Make sure it's a valid url so urlparse could parse it. + scheme = re.compile(r"^https?://") + sanitized = [am if scheme.search(am) else "http://" + am for am in alertmanagers] + + # Create a mapping from paths to netlocs + # Group alertmanager targets into a dictionary of lists: + # {path: [netloc1, netloc2]} + paths = defaultdict(list) # type: Dict[str, List[str]] + for parsed in map(urlparse, sanitized): + path = parsed.path or "/" + paths[path].append(parsed.netloc) + + return { + "alertmanagers": [ + {"path_prefix": path_prefix, "static_configs": [{"targets": netlocs}]} + for path_prefix, netlocs in paths.items() + ] + } + + class RelationNotFoundError(Exception): """Raised if there is no relation with the given name is found.""" @@ -506,31 +734,6 @@ def _validate_relation_by_interface_and_direction( raise Exception("Unexpected RelationDirection: {}".format(expected_relation_role)) -def _sanitize_scrape_configuration(job) -> dict: - """Restrict permissible scrape configuration options. - - If job is empty then a default job is returned. The - default job is - - ``` - { - "metrics_path": "/metrics", - "static_configs": [{"targets": ["*:80"]}], - } - ``` - - Args: - job: a dict containing a single Prometheus job - specification. - - Returns: - a dictionary containing a sanitized job specification. 
- """ - sanitized_job = DEFAULT_JOB.copy() - sanitized_job.update({key: value for key, value in job.items() if key in ALLOWED_KEYS}) - return sanitized_job - - class InvalidAlertRulePathError(Exception): """Raised if the alert rules folder cannot be found or is otherwise invalid.""" @@ -639,6 +842,12 @@ def _from_file(self, root_path: Path, file_path: Path) -> List[dict]: logger.error("Failed to read alert rules from %s: %s", file_path.name, e) return [] + if not rule_file: + logger.warning("Empty rules file: %s", file_path.name) + return [] + if not isinstance(rule_file, dict): + logger.error("Invalid rules file (must be a dict): %s", file_path.name) + return [] if _is_official_alert_rule_format(rule_file): alert_groups = rule_file["groups"] elif _is_single_alert_rule_format(rule_file): @@ -734,7 +943,9 @@ def _from_dir(self, dir_path: Path, recursive: bool) -> List[dict]: alert_groups = [] # type: List[dict] # Gather all alerts into a list of groups - for file_path in self._multi_suffix_glob(dir_path, [".rule", ".rules"], recursive): + for file_path in self._multi_suffix_glob( + dir_path, [".rule", ".rules", ".yml", ".yaml"], recursive + ): alert_groups_from_file = self._from_file(dir_path, file_path) if alert_groups_from_file: logger.debug("Reading alert rule from %s", file_path) @@ -920,7 +1131,7 @@ def alerts(self) -> dict: for topology_identifier, alert_rule_groups in self.metrics_consumer.alerts().items(): filename = "juju_" + topology_identifier + ".rules" path = os.path.join(PROMETHEUS_RULES_DIR, filename) - rules = yaml.dump(alert_rule_groups) + rules = yaml.safe_dump(alert_rule_groups) container.push(path, rules, make_dirs=True) ``` @@ -937,7 +1148,6 @@ def alerts(self) -> dict: if not alert_rules: continue - identifier = None try: scrape_metadata = json.loads(relation.data[relation.app]["scrape_metadata"]) identifier = JujuTopology.from_dict(scrape_metadata).identifier @@ -1036,34 +1246,22 @@ def _static_scrape_config(self, relation) -> list: if not scrape_metadata: return scrape_jobs - job_name_prefix = "juju_{}_prometheus_scrape".format( - JujuTopology.from_dict(scrape_metadata).identifier - ) - hosts = self._relation_hosts(relation) + topology = JujuTopology.from_dict(scrape_metadata) - labeled_job_configs = [] - for job in scrape_jobs: - config = self._labeled_static_job_config( - _sanitize_scrape_configuration(job), - job_name_prefix, - hosts, - scrape_metadata, - ) - labeled_job_configs.append(config) + job_name_prefix = "juju_{}_prometheus_scrape".format(topology.identifier) + scrape_jobs = PrometheusConfig.prefix_job_names(scrape_jobs, job_name_prefix) + scrape_jobs = PrometheusConfig.sanitize_scrape_configs(scrape_jobs) - return labeled_job_configs + hosts = self._relation_hosts(relation) - def _relation_hosts(self, relation) -> dict: - """Fetch unit names and address of all metrics provider units for a single relation. + scrape_jobs = PrometheusConfig.expand_wildcard_targets_into_individual_jobs( + scrape_jobs, hosts, topology + ) - Args: - relation: An `ops.model.Relation` object for which the unit name to - address mapping is required. + return scrape_jobs - Returns: - A dictionary that maps unit names to unit addresses for - the specified relation. 
- """ + def _relation_hosts(self, relation: Relation) -> Dict[str, Tuple[str, str]]: + """Returns a mapping from unit names to (address, path) tuples, for the given relation.""" hosts = {} for unit in relation.units: # TODO deprecate and remove unit.name @@ -1072,91 +1270,12 @@ def _relation_hosts(self, relation) -> dict: unit_address = relation.data[unit].get( "prometheus_scrape_unit_address" ) or relation.data[unit].get("prometheus_scrape_host") + unit_path = relation.data[unit].get("prometheus_scrape_unit_path", "") if unit_name and unit_address: - hosts.update({unit_name: unit_address}) + hosts.update({unit_name: (unit_address, unit_path)}) return hosts - def _labeled_static_job_config(self, job, job_name_prefix, hosts, scrape_metadata) -> dict: - """Construct labeled job configuration for a single job. - - Args: - - job: a dictionary representing the job configuration as obtained from - `MetricsEndpointProvider` over relation data. - job_name_prefix: a string that may either be used as the - job name if the job has no associated name or used as a prefix for - the job if it does have a job name. - hosts: a dictionary mapping host names to host address for - all units of the relation for which this job configuration - must be constructed. - scrape_metadata: scrape configuration metadata obtained - from `MetricsEndpointProvider` from the same relation for - which this job configuration is being constructed. - - Returns: - A dictionary representing a Prometheus job configuration - for a single job. - """ - name = job.get("job_name") - job_name = "{}_{}".format(job_name_prefix, name) if name else job_name_prefix - - labeled_job = job.copy() - labeled_job["job_name"] = job_name - - static_configs = job.get("static_configs") - labeled_job["static_configs"] = [] - - # relabel instance labels so that instance identifiers are globally unique - # stable over unit recreation - instance_relabel_config = { - "source_labels": ["juju_model", "juju_model_uuid", "juju_application"], - "separator": "_", - "target_label": "instance", - "regex": "(.*)", - } - - # label all static configs in the Prometheus job - # labeling inserts Juju topology information and - # sets a relable config for instance labels - for static_config in static_configs: - labels = static_config.get("labels", {}) if static_configs else {} - all_targets = static_config.get("targets", []) - - # split all targets into those which will have unit labels - # and those which will not - ports = [] - unitless_targets = [] - for target in all_targets: - host, port = self._target_parts(target) - if host.strip() == "*": - ports.append(port.strip()) - else: - unitless_targets.append(target) - - # label scrape targets that do not have unit labels - if unitless_targets: - unitless_config = self._labeled_unitless_config( - unitless_targets, labels, scrape_metadata - ) - labeled_job["static_configs"].append(unitless_config) - - # label scrape targets that do have unit labels - for host_name, host_address in hosts.items(): - static_config = self._labeled_unit_config( - host_name, host_address, ports, labels, scrape_metadata - ) - labeled_job["static_configs"].append(static_config) - if "juju_unit" not in instance_relabel_config["source_labels"]: - instance_relabel_config["source_labels"].append("juju_unit") # type: ignore - - # ensure topology relabeling of instance label is last in order of relabelings - relabel_configs = job.get("relabel_configs", []) - relabel_configs.append(instance_relabel_config) - labeled_job["relabel_configs"] = 
relabel_configs - - return labeled_job - def _target_parts(self, target) -> list: """Extract host and port from a wildcard target. @@ -1177,92 +1296,11 @@ def _target_parts(self, target) -> list: return parts - def _set_juju_labels(self, labels, scrape_metadata) -> dict: - """Create a copy of metric labels with Juju topology information. - - Args: - labels: a dictionary containing Prometheus metric labels. - scrape_metadata: scrape related metadata provided by - `MetricsEndpointProvider`. - - Returns: - a copy of the `labels` dictionary augmented with Juju - topology information with the exception of unit name. - """ - juju_labels = labels.copy() # deep copy not needed - juju_labels.update(JujuTopology.from_dict(scrape_metadata).label_matcher_dict) - - return juju_labels - - def _labeled_unitless_config(self, targets, labels, scrape_metadata) -> dict: - """Static scrape configuration for fully qualified host addresses. - - Fully qualified hosts are those scrape targets for which the - address are specified by the `MetricsEndpointProvider` as part - of the scrape job specification set in application relation data. - The address specified need not belong to any unit of the - `MetricsEndpointProvider` charm. As a result there is no reliable - way to determine the name (Juju topology unit name) for such a - target. - - Args: - targets: a list of addresses of fully qualified hosts. - labels: labels specified by `MetricsEndpointProvider` clients - which are associated with `targets`. - scrape_metadata: scrape related metadata provided by `MetricsEndpointProvider`. - - Returns: - A dictionary containing the static scrape configuration - for a list of fully qualified hosts. - """ - juju_labels = self._set_juju_labels(labels, scrape_metadata) - unitless_config = {"targets": targets, "labels": juju_labels} - return unitless_config - - def _labeled_unit_config( - self, unit_name, host_address, ports, labels, scrape_metadata - ) -> dict: - """Static scrape configuration for a wildcard host. - - Wildcard hosts are those scrape targets whose name (Juju unit - name) and address (unit IP address) is set into unit relation - data by the `MetricsEndpointProvider` charm, which sets this - data for ALL its units. - - Args: - unit_name: a string representing the unit name of the wildcard host. - host_address: a string representing the address of the wildcard host. - ports: list of ports on which this wildcard host exposes its metrics. - labels: a dictionary of labels provided by - `MetricsEndpointProvider` intended to be associated with - this wildcard host. - scrape_metadata: scrape related metadata provided by `MetricsEndpointProvider`. - - Returns: - A dictionary containing the static scrape configuration - for a single wildcard host. - """ - juju_labels = self._set_juju_labels(labels, scrape_metadata) - - juju_labels["juju_unit"] = unit_name - - static_config = {"labels": juju_labels} - - if ports: - targets = [] - for port in ports: - targets.append("{}:{}".format(host_address, port)) - static_config["targets"] = targets # type: ignore - else: - static_config["targets"] = [host_address] # type: ignore - - return static_config - def _dedupe_job_names(jobs: List[dict]): """Deduplicate a list of dicts by appending a hash to the value of the 'job_name' key. - Additionally fully dedeuplicate any identical jobs. + Additionally, fully de-duplicate any identical jobs. 
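A minimal sketch of that idea, assuming nothing beyond the docstring (this is not the library's exact code): identical jobs collapse to one, and jobs that merely share a `job_name` get a content hash appended:

```python
import hashlib
import json

def dedupe_job_names(jobs):
    # Collapse fully identical jobs by keying on a canonical JSON form.
    unique = list({json.dumps(job, sort_keys=True): job for job in jobs}.values())
    by_name = {}
    for job in unique:
        by_name.setdefault(job.get("job_name", ""), []).append(job)
    deduped = []
    for name, group in by_name.items():
        for job in group:
            if len(group) > 1:
                digest = hashlib.sha256(
                    json.dumps(job, sort_keys=True).encode("utf-8")
                ).hexdigest()[:8]
                job = dict(job, job_name="{}_{}".format(name, digest))
            deduped.append(job)
    return deduped
```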
Args: jobs: A list of prometheus scrape jobs @@ -1345,6 +1383,8 @@ def __init__( jobs=None, alert_rules_path: str = DEFAULT_ALERT_RULES_RELATIVE_PATH, refresh_event: Optional[Union[BoundEvent, List[BoundEvent]]] = None, + external_url: str = "", + lookaside_jobs_callable: Callable = None, ): """Construct a metrics provider for a Prometheus charm. @@ -1430,7 +1470,7 @@ def __init__( Args: charm: a `CharmBase` object that manages this - `MetricsEndpointProvider` object. Typically this is + `MetricsEndpointProvider` object. Typically, this is `self` in the instantiating class. relation_name: an optional string name of the relation between `charm` and the Prometheus charmed service. The default is "metrics-endpoint". @@ -1449,6 +1489,13 @@ def __init__( The alert rules are automatically updated on charm upgrade. refresh_event: an optional bound event or list of bound events which will be observed to re-set scrape job data (IP address and others) + external_url: an optional argument that represents an external url that + can be generated by an Ingress or a Proxy. + lookaside_jobs_callable: an optional `Callable` which should be invoked + when the job configuration is built as a secondary mapping. The callable + should return a `List[Dict]` which is syntactically identical to the + `jobs` parameter, but can be updated out of step initialization of + this library without disrupting the 'global' job spec. Raises: RelationNotFoundError: If there is no relation in the charm's metadata.yaml @@ -1481,13 +1528,21 @@ def __init__( self._relation_name = relation_name # sanitize job configurations to the supported subset of parameters jobs = [] if jobs is None else jobs - self._jobs = [_sanitize_scrape_configuration(job) for job in jobs] + self._jobs = PrometheusConfig.sanitize_scrape_configs(jobs) + + if external_url: + external_url = ( + external_url if urlparse(external_url).scheme else ("http://" + external_url) + ) + self.external_url = external_url + self._lookaside_jobs = lookaside_jobs_callable events = self._charm.on[self._relation_name] - self.framework.observe(events.relation_joined, self._set_scrape_job_spec) self.framework.observe(events.relation_changed, self._on_relation_changed) if not refresh_event: + # FIXME remove once podspec charms are verified. + # `self.set_scrape_job_spec()` is called every re-init so this should not be needed. if len(self._charm.meta.containers) == 1: if "kubernetes" in self._charm.meta.series: # This is a podspec charm @@ -1510,12 +1565,18 @@ def __init__( refresh_event = [refresh_event] for ev in refresh_event: - self.framework.observe(ev, self._set_unit_ip) - - self.framework.observe(self._charm.on.upgrade_charm, self._set_scrape_job_spec) - - # If there is no leader during relation_joined we will still need to set alert rules. - self.framework.observe(self._charm.on.leader_elected, self._set_scrape_job_spec) + self.framework.observe(ev, self.set_scrape_job_spec) + + # Update relation data every reinit. If instead we used event hooks then observing only + # relation-joined would not be sufficient: + # - Would need to observe leader-elected, in case there was no leader during + # relation-joined. + # - If later related to an ingress provider, then would need to register and wait for + # update-status interval to elapse before changes would apply. 
+        #   - The ingress-ready custom event is currently emitted prematurely and cannot be relied
+        #     upon: https://github.com/canonical/traefik-k8s-operator/issues/78
+        # NOTE We may still end up waiting for update-status before changes are applied.
+        self.set_scrape_job_spec()

     def _on_relation_changed(self, event):
         """Check for alert rule messages in the relation data before moving on."""
@@ -1531,18 +1592,21 @@ def _on_relation_changed(self, event):
             else:
                 self.on.alert_rule_status_changed.emit(valid=valid, errors=errors)

-        self._set_scrape_job_spec(event)
+    def update_scrape_job_spec(self, jobs):
+        """Update scrape job specification."""
+        self._jobs = PrometheusConfig.sanitize_scrape_configs(jobs)
+        self.set_scrape_job_spec()

-    def _set_scrape_job_spec(self, event):
+    def set_scrape_job_spec(self, _=None):
         """Ensure scrape target information is made available to prometheus.

         When a metrics provider charm is related to a prometheus charm, the
         metrics provider sets specification and metadata related to its own
         scrape configuration. This information is set using Juju application
-        data. In addition each of the consumer units also sets its own
+        data. In addition, each of the consumer units also sets its own
         host address in Juju unit relation data.
         """
-        self._set_unit_ip(event)
+        self._set_unit_ip()

         if not self._charm.unit.is_leader():
             return
@@ -1562,22 +1626,34 @@ def _set_scrape_job_spec(self, event):
             # that is written to the filesystem.
             relation.data[self._charm.app]["alert_rules"] = json.dumps(alert_rules_as_dict)

-    def _set_unit_ip(self, _):
+    def _set_unit_ip(self, _=None):
         """Set unit host address.

         Each time a metrics provider charm container is restarted it updates its own
         host address in the unit relation data for the prometheus charm.

-        The only argument specified is an event and it ignored. this is for expediency
+        The only argument specified is an event, and it is ignored. This is for expediency
         to be able to use this method as an event handler, although no access to the
         event is actually needed.
         """
         for relation in self._charm.model.relations[self._relation_name]:
             unit_ip = str(self._charm.model.get_binding(relation).network.bind_address)
-            relation.data[self._charm.unit]["prometheus_scrape_unit_address"] = (
-                unit_ip if self._is_valid_unit_address(unit_ip) else socket.getfqdn()
-            )
+            # TODO store entire url in relation data, instead of only select url parts.
+
+            if self.external_url:
+                parsed = urlparse(self.external_url)
+                unit_address = parsed.hostname
+                path = parsed.path
+            elif self._is_valid_unit_address(unit_ip):
+                unit_address = unit_ip
+                path = ""
+            else:
+                unit_address = socket.getfqdn()
+                path = ""
+
+            relation.data[self._charm.unit]["prometheus_scrape_unit_address"] = unit_address
+            relation.data[self._charm.unit]["prometheus_scrape_unit_path"] = path
             relation.data[self._charm.unit]["prometheus_scrape_unit_name"] = str(
                 self._charm.model.unit.name
             )
@@ -1606,7 +1682,11 @@ def _scrape_jobs(self) -> list:
             A list of dictionaries, where each dictionary
             specifies a single scrape job for Prometheus.
         """
-        return self._jobs if self._jobs else [DEFAULT_JOB]
+        jobs = self._jobs if self._jobs else [DEFAULT_JOB]
+        if callable(self._lookaside_jobs):
+            return jobs + PrometheusConfig.sanitize_scrape_configs(self._lookaside_jobs())
+        else:
+            return jobs

     @property
     def _scrape_metadata(self) -> dict:
@@ -1634,7 +1714,7 @@ class PrometheusRulesProvider(Object):
         relation_name: Name of the relation in `metadata.yaml` that has the
             `prometheus_scrape` interface.
dir_path: Root directory for the collection of rule files. - recursive: Whether or not to scan for rule files recursively. + recursive: Whether to scan for rule files recursively. """ def __init__( @@ -1696,7 +1776,7 @@ class MetricsEndpointAggregator(Object): `MetricsEndpointAggregator` collects scrape target information from one or more related charms and forwards this to a `MetricsEndpointConsumer` - charm, which may be in a different Juju model. However it is + charm, which may be in a different Juju model. However, it is essential that `MetricsEndpointAggregator` itself resides in the same model as its scrape targets, as this is currently the only way to ensure in Juju that the `MetricsEndpointAggregator` will be able to @@ -1765,7 +1845,7 @@ class MetricsEndpointAggregator(Object): information, just like `MetricsEndpointProvider` and `MetricsEndpointConsumer` do. - By default `MetricsEndpointAggregator` ensures that Prometheus + By default, `MetricsEndpointAggregator` ensures that Prometheus "instance" labels refer to Juju topology. This ensures that instance labels are stable over unit recreation. While it is not advisable to change this option, if required it can be done by @@ -1778,7 +1858,7 @@ def __init__(self, charm, relation_names, relabel_instance=True): Args: charm: a `CharmBase` object that manages this - `MetricsEndpointAggregator` object. Typically this is + `MetricsEndpointAggregator` object. Typically, this is `self` in the instantiating class. relation_names: a dictionary with three keys. The value of the "scrape_target" and "alert_rules" keys are @@ -1843,7 +1923,7 @@ def _set_target_job_data(self, targets: dict, app_name: str, **kwargs) -> None: When there is any change in relation data with any scrape target, the Prometheus scrape job, for that specific target is updated. Additionally, if this method is called manually, do the - sameself. + same. Args: targets: a `dict` containing target information @@ -1985,7 +2065,7 @@ def _get_targets(self, relation) -> dict: Scrape target information is returned for each unit in the relation. This information contains the unit name, network - hostname (or address) for that unit, and port on which an + hostname (or address) for that unit, and port on which a metrics endpoint is exposed in that unit. Args: @@ -2142,7 +2222,7 @@ def _relabel_configs(self) -> list: labels are stable across unit recreation. Returns: - a list of Prometheus relabling configurations. Each item in + a list of Prometheus relabeling configurations. Each item in this list is one relabel configuration. 
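Concretely, the relabel configuration joins the topology source labels with `_` into the `instance` label; a small sketch with invented label values:

```python
# What the topology relabel config computes (label values are invented):
source_labels = ["juju_model", "juju_model_uuid", "juju_application", "juju_unit"]
labels = {
    "juju_model": "kubeflow",
    "juju_model_uuid": "f47ac10b-58cc-4372-a567-0e02b2c3d479",
    "juju_application": "mlflow-server",
    "juju_unit": "mlflow-server/0",
}
instance = "_".join(labels[name] for name in source_labels)
# The result is stable across unit recreation, unlike a pod IP:
# kubeflow_f47ac10b-58cc-4372-a567-0e02b2c3d479_mlflow-server_mlflow-server/0
```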
""" return ( @@ -2216,22 +2296,7 @@ def validate_alert_rules(self, rules: dict) -> Tuple[bool, str]: with tempfile.TemporaryDirectory() as tmpdir: rule_path = Path(tmpdir + "/validate_rule.yaml") - - # Smash "our" rules format into what upstream actually uses, which is more like: - # - # groups: - # - name: foo - # rules: - # - alert: SomeAlert - # expr: up - # - alert: OtherAlert - # expr: up - transformed_rules = {"groups": []} # type: ignore - for rule in rules["groups"]: - transformed = {"name": str(uuid.uuid4()), "rules": [rule]} - transformed_rules["groups"].append(transformed) - - rule_path.write_text(yaml.dump(transformed_rules)) + rule_path.write_text(yaml.dump(rules)) args = [str(self.path), "validate", str(rule_path)] # noinspection PyBroadException @@ -2240,7 +2305,13 @@ def validate_alert_rules(self, rules: dict) -> Tuple[bool, str]: return True, "" except subprocess.CalledProcessError as e: logger.debug("Validating the rules failed: %s", e.output) - return False, ", ".join([line for line in e.output if "error validating" in line]) + return False, ", ".join( + [ + line + for line in e.output.decode("utf8").splitlines() + if "error validating" in line + ] + ) def inject_label_matchers(self, expression, topology) -> str: """Add label matchers to an expression.""" @@ -2277,6 +2348,5 @@ def _get_tool_path(self) -> Optional[Path]: return None def _exec(self, cmd) -> str: - result = subprocess.run(cmd, check=True, stdout=subprocess.PIPE) - output = result.stdout.decode("utf-8").strip() - return output + result = subprocess.run(cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + return result.stdout.decode("utf-8").strip() diff --git a/charms/mlflow-server/requirements.txt b/charms/mlflow-server/requirements.txt index c78b0b5b..2a27e272 100644 --- a/charms/mlflow-server/requirements.txt +++ b/charms/mlflow-server/requirements.txt @@ -1,5 +1,5 @@ boto3 -ops==1.3.0 +ops oci-image==1.0.0 ops-lib-mysql serialized-data-interface<0.4 diff --git a/charms/mlflow-server/tests/unit/__init__.py b/charms/mlflow-server/tests/unit/__init__.py new file mode 100644 index 00000000..95355c3e --- /dev/null +++ b/charms/mlflow-server/tests/unit/__init__.py @@ -0,0 +1,10 @@ +# +# Initialize unit tests +# + +"""Setup test environment for unit tests.""" + +import ops.testing + +# enable simulation of container networking +ops.testing.SIMULATE_CAN_CONNECT = True