Skip to content

Commit

Permalink
[DPE-2038] Basic alerts (#211)
Browse files Browse the repository at this point in the history
* Basic alert rules

* Bump libs
  • Loading branch information
dragomirp authored Aug 26, 2023
1 parent 19f7da8 commit ac1f189
Show file tree
Hide file tree
Showing 4 changed files with 90 additions and 110 deletions.
148 changes: 54 additions & 94 deletions lib/charms/data_platform_libs/v0/data_interfaces.py
Original file line number Diff line number Diff line change
Expand Up @@ -316,7 +316,7 @@ def _on_topic_requested(self, event: TopicRequestedEvent):

# Increment this PATCH version before using `charmcraft publish-lib` or reset
# to 0 if you are raising the major API version
LIBPATCH = 16
LIBPATCH = 17

PYDEPS = ["ops>=2.0.0"]

Expand Down Expand Up @@ -365,11 +365,11 @@ def diff(event: RelationChangedEvent, bucket: Union[Unit, Application]) -> Diff:
return Diff(added, changed, deleted)


# Base DataProvides and DataRequires
# Base DataRelation


class DataProvides(Object, ABC):
"""Base provides-side of the data products relation."""
class DataRelation(Object, ABC):
"""Base relation data mainpulation class."""

def __init__(self, charm: CharmBase, relation_name: str) -> None:
super().__init__(charm, relation_name)
Expand All @@ -379,23 +379,11 @@ def __init__(self, charm: CharmBase, relation_name: str) -> None:
self.relation_name = relation_name
self.framework.observe(
charm.on[relation_name].relation_changed,
self._on_relation_changed,
self._on_relation_changed_event,
)

def _diff(self, event: RelationChangedEvent) -> Diff:
"""Retrieves the diff of the data in the relation changed databag.
Args:
event: relation changed event.
Returns:
a Diff instance containing the added, deleted and changed
keys from the event relation databag.
"""
return diff(event, self.local_app)

@abstractmethod
def _on_relation_changed(self, event: RelationChangedEvent) -> None:
def _on_relation_changed_event(self, event: RelationChangedEvent) -> None:
"""Event emitted when the relation data has changed."""
raise NotImplementedError

Expand All @@ -404,10 +392,11 @@ def fetch_relation_data(self) -> dict:
This function can be used to retrieve data from a relation
in the charm code when outside an event callback.
Function cannot be used in `*-relation-broken` events and will raise an exception.
Returns:
a dict of the values stored in the relation data bag
for all relation instances (indexed by the relation id).
for all relation instances (indexed by the relation ID).
"""
data = {}
for relation in self.relations:
Expand All @@ -430,13 +419,49 @@ def _update_relation_data(self, relation_id: int, data: dict) -> None:
that should be updated in the relation.
"""
if self.local_unit.is_leader():
if relation := self.charm.model.get_relation(self.relation_name, relation_id):
relation = self.charm.model.get_relation(self.relation_name, relation_id)
if relation:
relation.data[self.local_app].update(data)

@staticmethod
def _is_relation_active(relation: Relation):
"""Whether the relation is active based on contained data."""
try:
_ = repr(relation.data)
return True
except (RuntimeError, ModelError):
return False

@property
def relations(self) -> List[Relation]:
"""The list of Relation instances associated with this relation_name."""
return list(self.charm.model.relations[self.relation_name])
return [
relation
for relation in self.charm.model.relations[self.relation_name]
if self._is_relation_active(relation)
]


# Base DataProvides and DataRequires


class DataProvides(DataRelation):
"""Base provides-side of the data products relation."""

def __init__(self, charm: CharmBase, relation_name: str) -> None:
super().__init__(charm, relation_name)

def _diff(self, event: RelationChangedEvent) -> Diff:
"""Retrieves the diff of the data in the relation changed databag.
Args:
event: relation changed event.
Returns:
a Diff instance containing the added, deleted and changed
keys from the event relation databag.
"""
return diff(event, self.local_app)

def set_credentials(self, relation_id: int, username: str, password: str) -> None:
"""Set credentials.
Expand Down Expand Up @@ -476,7 +501,7 @@ def set_tls_ca(self, relation_id: int, tls_ca: str) -> None:
self._update_relation_data(relation_id, {"tls-ca": tls_ca})


class DataRequires(Object, ABC):
class DataRequires(DataRelation):
"""Requires-side of the relation."""

def __init__(
Expand All @@ -487,62 +512,16 @@ def __init__(
):
"""Manager of base client relations."""
super().__init__(charm, relation_name)
self.charm = charm
self.extra_user_roles = extra_user_roles
self.local_app = self.charm.model.app
self.local_unit = self.charm.unit
self.relation_name = relation_name
self.framework.observe(
self.charm.on[relation_name].relation_created, self._on_relation_created_event
)
self.framework.observe(
self.charm.on[relation_name].relation_changed, self._on_relation_changed_event
)

@abstractmethod
def _on_relation_created_event(self, event: RelationCreatedEvent) -> None:
"""Event emitted when the relation is created."""
raise NotImplementedError

@abstractmethod
def _on_relation_changed_event(self, event: RelationChangedEvent) -> None:
raise NotImplementedError

def fetch_relation_data(self) -> dict:
"""Retrieves data from relation.
This function can be used to retrieve data from a relation
in the charm code when outside an event callback.
Function cannot be used in `*-relation-broken` events and will raise an exception.
Returns:
a dict of the values stored in the relation data bag
for all relation instances (indexed by the relation ID).
"""
data = {}
for relation in self.relations:
data[relation.id] = (
{key: value for key, value in relation.data[relation.app].items() if key != "data"}
if relation.app
else {}
)
return data

def _update_relation_data(self, relation_id: int, data: dict) -> None:
"""Updates a set of key-value pairs in the relation.
This function writes in the application data bag, therefore,
only the leader unit can call it.
Args:
relation_id: the identifier for a particular relation.
data: dict containing the key-value pairs
that should be updated in the relation.
"""
if self.local_unit.is_leader():
relation = self.charm.model.get_relation(self.relation_name, relation_id)
relation.data[self.local_app].update(data)

def _diff(self, event: RelationChangedEvent) -> Diff:
"""Retrieves the diff of the data in the relation changed databag.
Expand All @@ -555,23 +534,6 @@ def _diff(self, event: RelationChangedEvent) -> Diff:
"""
return diff(event, self.local_unit)

@property
def relations(self) -> List[Relation]:
"""The list of Relation instances associated with this relation_name."""
return [
relation
for relation in self.charm.model.relations[self.relation_name]
if self._is_relation_active(relation)
]

@staticmethod
def _is_relation_active(relation: Relation):
try:
_ = repr(relation.data)
return True
except (RuntimeError, ModelError):
return False

@staticmethod
def _is_resource_created_for_relation(relation: Relation) -> bool:
if not relation.app:
Expand Down Expand Up @@ -797,7 +759,7 @@ class DatabaseProvides(DataProvides):
def __init__(self, charm: CharmBase, relation_name: str) -> None:
super().__init__(charm, relation_name)

def _on_relation_changed(self, event: RelationChangedEvent) -> None:
def _on_relation_changed_event(self, event: RelationChangedEvent) -> None:
"""Event emitted when the relation has changed."""
# Only the leader should handle this event.
if not self.local_unit.is_leader():
Expand Down Expand Up @@ -938,11 +900,8 @@ def _assign_relation_alias(self, relation_id: int) -> None:

# Return if an alias was already assigned to this relation
# (like when there are more than one unit joining the relation).
if (
self.charm.model.get_relation(self.relation_name, relation_id)
.data[self.local_unit]
.get("alias")
):
relation = self.charm.model.get_relation(self.relation_name, relation_id)
if relation and relation.data[self.local_unit].get("alias"):
return

# Retrieve the available aliases (the ones that weren't assigned to any relation).
Expand All @@ -955,7 +914,8 @@ def _assign_relation_alias(self, relation_id: int) -> None:

# Set the alias in the unit relation databag of the specific relation.
relation = self.charm.model.get_relation(self.relation_name, relation_id)
relation.data[self.local_unit].update({"alias": available_aliases[0]})
if relation:
relation.data[self.local_unit].update({"alias": available_aliases[0]})

def _emit_aliased_event(self, event: RelationChangedEvent, event_name: str) -> None:
"""Emit an aliased event to a particular relation if it has an alias.
Expand Down Expand Up @@ -1197,7 +1157,7 @@ class KafkaProvides(DataProvides):
def __init__(self, charm: CharmBase, relation_name: str) -> None:
super().__init__(charm, relation_name)

def _on_relation_changed(self, event: RelationChangedEvent) -> None:
def _on_relation_changed_event(self, event: RelationChangedEvent) -> None:
"""Event emitted when the relation has changed."""
# Only the leader should handle this event.
if not self.local_unit.is_leader():
Expand Down Expand Up @@ -1377,7 +1337,7 @@ class OpenSearchProvides(DataProvides):
def __init__(self, charm: CharmBase, relation_name: str) -> None:
super().__init__(charm, relation_name)

def _on_relation_changed(self, event: RelationChangedEvent) -> None:
def _on_relation_changed_event(self, event: RelationChangedEvent) -> None:
"""Event emitted when the relation has changed."""
# Only the leader should handle this event.
if not self.local_unit.is_leader():
Expand Down
30 changes: 14 additions & 16 deletions lib/charms/data_platform_libs/v0/upgrade.py
Original file line number Diff line number Diff line change
Expand Up @@ -284,7 +284,7 @@ def restart(self, event) -> None:

# Increment this PATCH version before using `charmcraft publish-lib` or reset
# to 0 if you are raising the major API version
LIBPATCH = 8
LIBPATCH = 11

PYDEPS = ["pydantic>=1.10,<2"]

Expand Down Expand Up @@ -323,27 +323,26 @@ def verify_caret_requirements(version: str, requirement: str) -> bool:
sem_version = build_complete_sem_ver(version)
sem_requirement = build_complete_sem_ver(requirement)

# caret uses first non-zero character, not enough to just count '.
max_version_index = requirement.count(".")
for i, semver in enumerate(sem_requirement):
if semver != 0:
max_version_index = i
break
# caret uses first non-zero character, not enough to just count '.'
if sem_requirement[0] == 0:
max_version_index = requirement.count(".")
for i, semver in enumerate(sem_requirement):
if semver != 0:
max_version_index = i
break
else:
max_version_index = 0

for i in range(3):
# version higher than first non-zero
if (i < max_version_index) and (sem_version[i] > sem_requirement[i]):
if (i <= max_version_index) and (sem_version[i] != sem_requirement[i]):
return False

# version either higher or lower than first non-zero
if (i == max_version_index) and (sem_version[i] != sem_requirement[i]):
if (i > max_version_index) and (sem_version[i] < sem_requirement[i]):
return False

# valid
if (i > max_version_index) and (sem_version[i] > sem_requirement[i]):
return True

return False
return True


def verify_tilde_requirements(version: str, requirement: str) -> bool:
Expand Down Expand Up @@ -818,7 +817,7 @@ def idle(self) -> Optional[bool]:
Returns:
True if all application units in idle state. Otherwise False
"""
return self.cluster_state == "idle"
return set(self.unit_states) == {"idle"}

@abstractmethod
def pre_upgrade_check(self) -> None:
Expand Down Expand Up @@ -1118,7 +1117,6 @@ def on_upgrade_changed(self, event: EventBase) -> None:
return

logger.debug("Did not find upgrade-stack or completed cluster state, deferring...")
event.defer()
return

# upgrade ongoing, set status for waiting units
Expand Down
Empty file added src/loki_alert_rules/.gitkeep
Empty file.
22 changes: 22 additions & 0 deletions src/prometheus_alert_rules/postgresql_rules.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
groups:
- name: PostgresqlExporterK8s

rules:
# Based on https://samber.github.io/awesome-prometheus-alerts/rules#rule-postgresql-1-1
- alert: PostgresqlDown
expr: pg_up == 0
for: 0m
labels:
severity: critical
annotations:
summary: Postgresql down (instance {{ $labels.instance }})
description: "Postgresql instance is down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Based on https://samber.github.io/awesome-prometheus-alerts/rules#rule-postgresql-1-2
- alert: PostgresqlRestarted
expr: time() - pg_postmaster_start_time_seconds < 60
for: 0m
labels:
severity: critical
annotations:
summary: Postgresql restarted (instance {{ $labels.instance }})
description: "Postgresql restarted\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

0 comments on commit ac1f189

Please sign in to comment.