From ac1f189726efa1112c84a145865b508e562e173c Mon Sep 17 00:00:00 2001 From: Dragomir Penev <6687393+dragomirp@users.noreply.github.com> Date: Sat, 26 Aug 2023 14:28:13 +0300 Subject: [PATCH] [DPE-2038] Basic alerts (#211) * Basic alert rules * Bump libs --- .../data_platform_libs/v0/data_interfaces.py | 148 +++++++----------- lib/charms/data_platform_libs/v0/upgrade.py | 30 ++-- src/loki_alert_rules/.gitkeep | 0 .../postgresql_rules.yaml | 22 +++ 4 files changed, 90 insertions(+), 110 deletions(-) create mode 100644 src/loki_alert_rules/.gitkeep create mode 100644 src/prometheus_alert_rules/postgresql_rules.yaml diff --git a/lib/charms/data_platform_libs/v0/data_interfaces.py b/lib/charms/data_platform_libs/v0/data_interfaces.py index 74db75dbe1..d894130e2b 100644 --- a/lib/charms/data_platform_libs/v0/data_interfaces.py +++ b/lib/charms/data_platform_libs/v0/data_interfaces.py @@ -316,7 +316,7 @@ def _on_topic_requested(self, event: TopicRequestedEvent): # Increment this PATCH version before using `charmcraft publish-lib` or reset # to 0 if you are raising the major API version -LIBPATCH = 16 +LIBPATCH = 17 PYDEPS = ["ops>=2.0.0"] @@ -365,11 +365,11 @@ def diff(event: RelationChangedEvent, bucket: Union[Unit, Application]) -> Diff: return Diff(added, changed, deleted) -# Base DataProvides and DataRequires +# Base DataRelation -class DataProvides(Object, ABC): - """Base provides-side of the data products relation.""" +class DataRelation(Object, ABC): + """Base relation data mainpulation class.""" def __init__(self, charm: CharmBase, relation_name: str) -> None: super().__init__(charm, relation_name) @@ -379,23 +379,11 @@ def __init__(self, charm: CharmBase, relation_name: str) -> None: self.relation_name = relation_name self.framework.observe( charm.on[relation_name].relation_changed, - self._on_relation_changed, + self._on_relation_changed_event, ) - def _diff(self, event: RelationChangedEvent) -> Diff: - """Retrieves the diff of the data in the relation changed databag. - - Args: - event: relation changed event. - - Returns: - a Diff instance containing the added, deleted and changed - keys from the event relation databag. - """ - return diff(event, self.local_app) - @abstractmethod - def _on_relation_changed(self, event: RelationChangedEvent) -> None: + def _on_relation_changed_event(self, event: RelationChangedEvent) -> None: """Event emitted when the relation data has changed.""" raise NotImplementedError @@ -404,10 +392,11 @@ def fetch_relation_data(self) -> dict: This function can be used to retrieve data from a relation in the charm code when outside an event callback. + Function cannot be used in `*-relation-broken` events and will raise an exception. Returns: a dict of the values stored in the relation data bag - for all relation instances (indexed by the relation id). + for all relation instances (indexed by the relation ID). """ data = {} for relation in self.relations: @@ -430,13 +419,49 @@ def _update_relation_data(self, relation_id: int, data: dict) -> None: that should be updated in the relation. """ if self.local_unit.is_leader(): - if relation := self.charm.model.get_relation(self.relation_name, relation_id): + relation = self.charm.model.get_relation(self.relation_name, relation_id) + if relation: relation.data[self.local_app].update(data) + @staticmethod + def _is_relation_active(relation: Relation): + """Whether the relation is active based on contained data.""" + try: + _ = repr(relation.data) + return True + except (RuntimeError, ModelError): + return False + @property def relations(self) -> List[Relation]: """The list of Relation instances associated with this relation_name.""" - return list(self.charm.model.relations[self.relation_name]) + return [ + relation + for relation in self.charm.model.relations[self.relation_name] + if self._is_relation_active(relation) + ] + + +# Base DataProvides and DataRequires + + +class DataProvides(DataRelation): + """Base provides-side of the data products relation.""" + + def __init__(self, charm: CharmBase, relation_name: str) -> None: + super().__init__(charm, relation_name) + + def _diff(self, event: RelationChangedEvent) -> Diff: + """Retrieves the diff of the data in the relation changed databag. + + Args: + event: relation changed event. + + Returns: + a Diff instance containing the added, deleted and changed + keys from the event relation databag. + """ + return diff(event, self.local_app) def set_credentials(self, relation_id: int, username: str, password: str) -> None: """Set credentials. @@ -476,7 +501,7 @@ def set_tls_ca(self, relation_id: int, tls_ca: str) -> None: self._update_relation_data(relation_id, {"tls-ca": tls_ca}) -class DataRequires(Object, ABC): +class DataRequires(DataRelation): """Requires-side of the relation.""" def __init__( @@ -487,62 +512,16 @@ def __init__( ): """Manager of base client relations.""" super().__init__(charm, relation_name) - self.charm = charm self.extra_user_roles = extra_user_roles - self.local_app = self.charm.model.app - self.local_unit = self.charm.unit - self.relation_name = relation_name self.framework.observe( self.charm.on[relation_name].relation_created, self._on_relation_created_event ) - self.framework.observe( - self.charm.on[relation_name].relation_changed, self._on_relation_changed_event - ) @abstractmethod def _on_relation_created_event(self, event: RelationCreatedEvent) -> None: """Event emitted when the relation is created.""" raise NotImplementedError - @abstractmethod - def _on_relation_changed_event(self, event: RelationChangedEvent) -> None: - raise NotImplementedError - - def fetch_relation_data(self) -> dict: - """Retrieves data from relation. - - This function can be used to retrieve data from a relation - in the charm code when outside an event callback. - Function cannot be used in `*-relation-broken` events and will raise an exception. - - Returns: - a dict of the values stored in the relation data bag - for all relation instances (indexed by the relation ID). - """ - data = {} - for relation in self.relations: - data[relation.id] = ( - {key: value for key, value in relation.data[relation.app].items() if key != "data"} - if relation.app - else {} - ) - return data - - def _update_relation_data(self, relation_id: int, data: dict) -> None: - """Updates a set of key-value pairs in the relation. - - This function writes in the application data bag, therefore, - only the leader unit can call it. - - Args: - relation_id: the identifier for a particular relation. - data: dict containing the key-value pairs - that should be updated in the relation. - """ - if self.local_unit.is_leader(): - relation = self.charm.model.get_relation(self.relation_name, relation_id) - relation.data[self.local_app].update(data) - def _diff(self, event: RelationChangedEvent) -> Diff: """Retrieves the diff of the data in the relation changed databag. @@ -555,23 +534,6 @@ def _diff(self, event: RelationChangedEvent) -> Diff: """ return diff(event, self.local_unit) - @property - def relations(self) -> List[Relation]: - """The list of Relation instances associated with this relation_name.""" - return [ - relation - for relation in self.charm.model.relations[self.relation_name] - if self._is_relation_active(relation) - ] - - @staticmethod - def _is_relation_active(relation: Relation): - try: - _ = repr(relation.data) - return True - except (RuntimeError, ModelError): - return False - @staticmethod def _is_resource_created_for_relation(relation: Relation) -> bool: if not relation.app: @@ -797,7 +759,7 @@ class DatabaseProvides(DataProvides): def __init__(self, charm: CharmBase, relation_name: str) -> None: super().__init__(charm, relation_name) - def _on_relation_changed(self, event: RelationChangedEvent) -> None: + def _on_relation_changed_event(self, event: RelationChangedEvent) -> None: """Event emitted when the relation has changed.""" # Only the leader should handle this event. if not self.local_unit.is_leader(): @@ -938,11 +900,8 @@ def _assign_relation_alias(self, relation_id: int) -> None: # Return if an alias was already assigned to this relation # (like when there are more than one unit joining the relation). - if ( - self.charm.model.get_relation(self.relation_name, relation_id) - .data[self.local_unit] - .get("alias") - ): + relation = self.charm.model.get_relation(self.relation_name, relation_id) + if relation and relation.data[self.local_unit].get("alias"): return # Retrieve the available aliases (the ones that weren't assigned to any relation). @@ -955,7 +914,8 @@ def _assign_relation_alias(self, relation_id: int) -> None: # Set the alias in the unit relation databag of the specific relation. relation = self.charm.model.get_relation(self.relation_name, relation_id) - relation.data[self.local_unit].update({"alias": available_aliases[0]}) + if relation: + relation.data[self.local_unit].update({"alias": available_aliases[0]}) def _emit_aliased_event(self, event: RelationChangedEvent, event_name: str) -> None: """Emit an aliased event to a particular relation if it has an alias. @@ -1197,7 +1157,7 @@ class KafkaProvides(DataProvides): def __init__(self, charm: CharmBase, relation_name: str) -> None: super().__init__(charm, relation_name) - def _on_relation_changed(self, event: RelationChangedEvent) -> None: + def _on_relation_changed_event(self, event: RelationChangedEvent) -> None: """Event emitted when the relation has changed.""" # Only the leader should handle this event. if not self.local_unit.is_leader(): @@ -1377,7 +1337,7 @@ class OpenSearchProvides(DataProvides): def __init__(self, charm: CharmBase, relation_name: str) -> None: super().__init__(charm, relation_name) - def _on_relation_changed(self, event: RelationChangedEvent) -> None: + def _on_relation_changed_event(self, event: RelationChangedEvent) -> None: """Event emitted when the relation has changed.""" # Only the leader should handle this event. if not self.local_unit.is_leader(): diff --git a/lib/charms/data_platform_libs/v0/upgrade.py b/lib/charms/data_platform_libs/v0/upgrade.py index 4d528d05b5..9f22eea630 100644 --- a/lib/charms/data_platform_libs/v0/upgrade.py +++ b/lib/charms/data_platform_libs/v0/upgrade.py @@ -284,7 +284,7 @@ def restart(self, event) -> None: # Increment this PATCH version before using `charmcraft publish-lib` or reset # to 0 if you are raising the major API version -LIBPATCH = 8 +LIBPATCH = 11 PYDEPS = ["pydantic>=1.10,<2"] @@ -323,27 +323,26 @@ def verify_caret_requirements(version: str, requirement: str) -> bool: sem_version = build_complete_sem_ver(version) sem_requirement = build_complete_sem_ver(requirement) - # caret uses first non-zero character, not enough to just count '. - max_version_index = requirement.count(".") - for i, semver in enumerate(sem_requirement): - if semver != 0: - max_version_index = i - break + # caret uses first non-zero character, not enough to just count '.' + if sem_requirement[0] == 0: + max_version_index = requirement.count(".") + for i, semver in enumerate(sem_requirement): + if semver != 0: + max_version_index = i + break + else: + max_version_index = 0 for i in range(3): # version higher than first non-zero - if (i < max_version_index) and (sem_version[i] > sem_requirement[i]): + if (i <= max_version_index) and (sem_version[i] != sem_requirement[i]): return False # version either higher or lower than first non-zero - if (i == max_version_index) and (sem_version[i] != sem_requirement[i]): + if (i > max_version_index) and (sem_version[i] < sem_requirement[i]): return False - # valid - if (i > max_version_index) and (sem_version[i] > sem_requirement[i]): - return True - - return False + return True def verify_tilde_requirements(version: str, requirement: str) -> bool: @@ -818,7 +817,7 @@ def idle(self) -> Optional[bool]: Returns: True if all application units in idle state. Otherwise False """ - return self.cluster_state == "idle" + return set(self.unit_states) == {"idle"} @abstractmethod def pre_upgrade_check(self) -> None: @@ -1118,7 +1117,6 @@ def on_upgrade_changed(self, event: EventBase) -> None: return logger.debug("Did not find upgrade-stack or completed cluster state, deferring...") - event.defer() return # upgrade ongoing, set status for waiting units diff --git a/src/loki_alert_rules/.gitkeep b/src/loki_alert_rules/.gitkeep new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/prometheus_alert_rules/postgresql_rules.yaml b/src/prometheus_alert_rules/postgresql_rules.yaml new file mode 100644 index 0000000000..9b3f98c564 --- /dev/null +++ b/src/prometheus_alert_rules/postgresql_rules.yaml @@ -0,0 +1,22 @@ +groups: + - name: PostgresqlExporterK8s + + rules: + # Based on https://samber.github.io/awesome-prometheus-alerts/rules#rule-postgresql-1-1 + - alert: PostgresqlDown + expr: pg_up == 0 + for: 0m + labels: + severity: critical + annotations: + summary: Postgresql down (instance {{ $labels.instance }}) + description: "Postgresql instance is down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + # Based on https://samber.github.io/awesome-prometheus-alerts/rules#rule-postgresql-1-2 + - alert: PostgresqlRestarted + expr: time() - pg_postmaster_start_time_seconds < 60 + for: 0m + labels: + severity: critical + annotations: + summary: Postgresql restarted (instance {{ $labels.instance }}) + description: "Postgresql restarted\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"