From 0358c376bfdc0937884424bd059bae203e8ec8ed Mon Sep 17 00:00:00 2001 From: Dragomir Penev <6687393+dragomirp@users.noreply.github.com> Date: Sat, 5 Aug 2023 12:43:44 +0300 Subject: [PATCH] More resilient topology observer (#200) --- .github/workflows/ci.yaml | 6 +++--- src/charm.py | 3 +++ src/cluster_topology_observer.py | 14 +++++++++----- tests/unit/test_charm.py | 5 +++++ 4 files changed, 20 insertions(+), 8 deletions(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 8218dfefab..8e2769600a 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -86,9 +86,9 @@ jobs: agent-versions: - "2.9.44" # renovate: latest juju 2 - "3.1.5" # renovate: latest juju 3 - include: - - tox-environments: db-admin-relation-integration - agent-versions: "2.9.43" # renovate: latest juju 2 + # include: + # - tox-environments: db-admin-relation-integration + # agent-versions: "2.9.43" # renovate: latest juju 2 name: ${{ matrix.tox-environments }} | ${{ matrix.agent-versions }} needs: - lib-check diff --git a/src/charm.py b/src/charm.py index 2383029ece..2497e3c987 100755 --- a/src/charm.py +++ b/src/charm.py @@ -1154,6 +1154,9 @@ def _on_update_status(self, _) -> None: self._set_primary_status_message() + # Restart topology observer if it is gone + self._observer.start_observer() + def _handle_processes_failures(self) -> bool: """Handle Patroni and PostgreSQL OS processes failures. diff --git a/src/cluster_topology_observer.py b/src/cluster_topology_observer.py index bbc44af4e4..6d529bbdb8 100644 --- a/src/cluster_topology_observer.py +++ b/src/cluster_topology_observer.py @@ -54,12 +54,16 @@ def __init__(self, charm: CharmBase): def start_observer(self): """Start the cluster topology observer running in a new process.""" - if ( - not isinstance(self._charm.unit.status, ActiveStatus) - or self._charm._peers is None - or "observer-pid" in self._charm._peers.data[self._charm.unit] - ): + if not isinstance(self._charm.unit.status, ActiveStatus) or self._charm._peers is None: return + if "observer-pid" in self._charm._peers.data[self._charm.unit]: + # Double check that the PID exists + pid = int(self._charm._peers.data[self._charm.unit]["observer-pid"]) + try: + os.kill(pid, 0) + return + except OSError: + pass logging.info("Starting cluster topology observer process") diff --git a/tests/unit/test_charm.py b/tests/unit/test_charm.py index c9c4c1cfca..1c82c977de 100644 --- a/tests/unit/test_charm.py +++ b/tests/unit/test_charm.py @@ -560,6 +560,7 @@ def test_on_set_password( ) @patch_network_get(private_address="1.1.1.1") + @patch("charm.ClusterTopologyObserver.start_observer") @patch("charm.PostgresqlOperatorCharm._set_primary_status_message") @patch("charm.Patroni.restart_patroni") @patch("charm.Patroni.is_member_isolated") @@ -583,6 +584,7 @@ def test_on_update_status( _is_member_isolated, _restart_patroni, _set_primary_status_message, + _start_observer, ): # Test before the cluster is initialised. self.charm.on.update_status.emit() @@ -625,8 +627,10 @@ def test_on_update_status( ) self.charm.on.update_status.emit() _restart_patroni.assert_called_once() + _start_observer.assert_called_once() @patch_network_get(private_address="1.1.1.1") + @patch("charm.ClusterTopologyObserver.start_observer") @patch("charm.PostgresqlOperatorCharm._set_primary_status_message") @patch("charm.PostgresqlOperatorCharm._handle_workload_failures") @patch("charm.PostgresqlOperatorCharm._update_relation_endpoints") @@ -652,6 +656,7 @@ def test_on_update_status_after_restore_operation( _update_relation_endpoints, _handle_workload_failures, _set_primary_status_message, + _, ): # Test when the restore operation fails. with self.harness.hooks_disabled():