Skip to content

Commit

Permalink
[DPE-3559] Stabilise restore cluster test (#351)
Browse files Browse the repository at this point in the history
* Improve unit status about no connection to primary

Signed-off-by: Marcelo Henrique Neppel <[email protected]>

* Update other users' passwords in the second cluster

Signed-off-by: Marcelo Henrique Neppel <[email protected]>

* Add unit tests

Signed-off-by: Marcelo Henrique Neppel <[email protected]>

---------

Signed-off-by: Marcelo Henrique Neppel <[email protected]>
  • Loading branch information
marceloneppel authored Feb 17, 2024
1 parent 64b65b4 commit f843b52
Show file tree
Hide file tree
Showing 3 changed files with 235 additions and 23 deletions.
14 changes: 6 additions & 8 deletions src/charm.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@

logger = logging.getLogger(__name__)

NO_PRIMARY_MESSAGE = "no primary in the cluster"
PRIMARY_NOT_REACHABLE_MESSAGE = "waiting for primary to be reachable from this unit"
EXTENSIONS_DEPENDENCY_MESSAGE = "Unsatisfied plugin dependencies. Please check the logs"

Scopes = Literal[APP_SCOPE, UNIT_SCOPE]
Expand Down Expand Up @@ -387,7 +387,7 @@ def _on_peer_relation_departed(self, event: RelationDepartedEvent) -> None:
if self.primary_endpoint:
self._update_relation_endpoints()
else:
self.unit.status = BlockedStatus(NO_PRIMARY_MESSAGE)
self.unit.status = WaitingStatus(PRIMARY_NOT_REACHABLE_MESSAGE)
return

def _on_pgdata_storage_detaching(self, _) -> None:
Expand Down Expand Up @@ -513,10 +513,10 @@ def _update_new_unit_status(self) -> None:
# a failed switchover, so wait until the primary is elected.
if self.primary_endpoint:
self._update_relation_endpoints()
if not self.is_blocked or self.unit.status.message == NO_PRIMARY_MESSAGE:
if not self.is_blocked:
self.unit.status = ActiveStatus()
else:
self.unit.status = BlockedStatus(NO_PRIMARY_MESSAGE)
self.unit.status = WaitingStatus(PRIMARY_NOT_REACHABLE_MESSAGE)

def _reconfigure_cluster(self, event: HookEvent):
"""Reconfigure the cluster by adding and removing members IPs to it.
Expand Down Expand Up @@ -765,9 +765,7 @@ def _on_cluster_topology_change(self, _):
logger.info("Cluster topology changed")
if self.primary_endpoint:
self._update_relation_endpoints()
if self.is_blocked and self.unit.status.message == NO_PRIMARY_MESSAGE:
if self.primary_endpoint:
self.unit.status = ActiveStatus()
self.unit.status = ActiveStatus()

def _on_install(self, event: InstallEvent) -> None:
"""Install prerequisites for the application."""
Expand Down Expand Up @@ -837,7 +835,7 @@ def _on_leader_elected(self, event: LeaderElectedEvent) -> None:
if self.primary_endpoint:
self._update_relation_endpoints()
else:
self.unit.status = BlockedStatus(NO_PRIMARY_MESSAGE)
self.unit.status = WaitingStatus(PRIMARY_NOT_REACHABLE_MESSAGE)

def _on_config_changed(self, _) -> None:
"""Handle configuration changes, like enabling plugins."""
Expand Down
7 changes: 4 additions & 3 deletions tests/integration/ha_tests/test_restore_cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,9 +62,10 @@ async def test_build_and_deploy(ops_test: OpsTest) -> None:
primary = await get_primary(
ops_test, ops_test.model.applications[FIRST_APPLICATION].units[0].name
)
password = await get_password(ops_test, primary)
second_primary = ops_test.model.applications[SECOND_APPLICATION].units[0].name
await set_password(ops_test, second_primary, password=password)
for user in ["monitoring", "operator", "replication", "rewind"]:
password = await get_password(ops_test, primary, user)
second_primary = ops_test.model.applications[SECOND_APPLICATION].units[0].name
await set_password(ops_test, second_primary, user, password)
await ops_test.model.destroy_unit(second_primary)


Expand Down
237 changes: 225 additions & 12 deletions tests/unit/test_charm.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import platform
import subprocess
import unittest
from unittest.mock import MagicMock, Mock, PropertyMock, mock_open, patch
from unittest.mock import MagicMock, Mock, PropertyMock, call, mock_open, patch

import pytest
from charms.operator_libs_linux.v2 import snap
Expand All @@ -14,6 +14,7 @@
PostgreSQLEnableDisableExtensionError,
PostgreSQLUpdateUserPasswordError,
)
from ops import Unit
from ops.framework import EventBase
from ops.model import (
ActiveStatus,
Expand All @@ -26,7 +27,11 @@
from parameterized import parameterized
from tenacity import RetryError

from charm import EXTENSIONS_DEPENDENCY_MESSAGE, NO_PRIMARY_MESSAGE, PostgresqlOperatorCharm
from charm import (
EXTENSIONS_DEPENDENCY_MESSAGE,
PRIMARY_NOT_REACHABLE_MESSAGE,
PostgresqlOperatorCharm,
)
from cluster import RemoveRaftMemberFailedError
from constants import PEER, POSTGRESQL_SNAP_NAME, SECRET_INTERNAL_LABEL, SNAP_PACKAGES
from tests.helpers import patch_network_get
Expand Down Expand Up @@ -208,12 +213,12 @@ def test_on_leader_elected(
_update_relation_endpoints.assert_called_once()
self.assertFalse(isinstance(self.harness.model.unit.status, BlockedStatus))

# Check for a BlockedStatus when there is no primary endpoint.
# Check for a WaitingStatus when the primary is not reachable yet.
_primary_endpoint.return_value = None
self.harness.set_leader(False)
self.harness.set_leader()
_update_relation_endpoints.assert_called_once() # Assert it was not called again.
self.assertTrue(isinstance(self.harness.model.unit.status, BlockedStatus))
self.assertTrue(isinstance(self.harness.model.unit.status, WaitingStatus))

def test_is_cluster_initialised(self):
# Test when the cluster was not initialised yet.
Expand Down Expand Up @@ -1270,15 +1275,14 @@ def test_on_cluster_topology_change(self, _primary_endpoint, _update_relation_en
def test_on_cluster_topology_change_keep_blocked(
self, _update_relation_endpoints, _primary_endpoint
):
self.harness.model.unit.status = BlockedStatus(NO_PRIMARY_MESSAGE)
self.harness.model.unit.status = WaitingStatus(PRIMARY_NOT_REACHABLE_MESSAGE)

self.charm._on_cluster_topology_change(Mock())

_update_relation_endpoints.assert_not_called()
self.assertEqual(_primary_endpoint.call_count, 2)
_primary_endpoint.assert_called_with()
self.assertTrue(isinstance(self.harness.model.unit.status, BlockedStatus))
self.assertEqual(self.harness.model.unit.status.message, NO_PRIMARY_MESSAGE)
_primary_endpoint.assert_called_once_with()
self.assertTrue(isinstance(self.harness.model.unit.status, WaitingStatus))
self.assertEqual(self.harness.model.unit.status.message, PRIMARY_NOT_REACHABLE_MESSAGE)

@patch(
"charm.PostgresqlOperatorCharm.primary_endpoint",
Expand All @@ -1289,13 +1293,12 @@ def test_on_cluster_topology_change_keep_blocked(
def test_on_cluster_topology_change_clear_blocked(
self, _update_relation_endpoints, _primary_endpoint
):
self.harness.model.unit.status = BlockedStatus(NO_PRIMARY_MESSAGE)
self.harness.model.unit.status = WaitingStatus(PRIMARY_NOT_REACHABLE_MESSAGE)

self.charm._on_cluster_topology_change(Mock())

_update_relation_endpoints.assert_called_once_with()
self.assertEqual(_primary_endpoint.call_count, 2)
_primary_endpoint.assert_called_with()
_primary_endpoint.assert_called_once_with()
self.assertTrue(isinstance(self.harness.model.unit.status, ActiveStatus))

@patch_network_get(private_address="1.1.1.1")
Expand Down Expand Up @@ -1936,3 +1939,213 @@ def test_migration_from_single_secret(self, scope, is_leader, _, __):
assert SECRET_INTERNAL_LABEL not in self.harness.get_relation_data(
self.rel_id, getattr(self.charm, scope).name
)

# NOTE: stacked ``@patch`` decorators are applied bottom-up, so the mock
# arguments of the test arrive in the reverse order of the decorators below.
@patch("charm.PostgresqlOperatorCharm._update_relation_endpoints")
@patch("charm.PostgresqlOperatorCharm.primary_endpoint", new_callable=PropertyMock)
@patch("charm.PostgresqlOperatorCharm.update_config")
@patch("charm.PostgresqlOperatorCharm._remove_from_members_ips")
@patch("charm.Patroni.are_all_members_ready")
@patch("charm.PostgresqlOperatorCharm._get_ips_to_remove")
@patch("charm.PostgresqlOperatorCharm._updated_synchronous_node_count")
@patch("charm.Patroni.remove_raft_member")
@patch("charm.PostgresqlOperatorCharm._unit_ip")
@patch("charm.Patroni.get_member_ip")
def test_on_peer_relation_departed(
    self,
    _get_member_ip,
    _unit_ip,
    _remove_raft_member,
    _updated_synchronous_node_count,
    _get_ips_to_remove,
    _are_all_members_ready,
    _remove_from_members_ips,
    _update_config,
    _primary_endpoint,
    _update_relation_endpoints,
):
    """Walk ``_on_peer_relation_departed`` through its successive exit points.

    The scenarios below are cumulative: they share one ``event`` mock and the
    same patched collaborators, and each scenario changes only what is needed
    to let the handler get one step further than the previous one. Mocks are
    reset selectively between scenarios, so the order of the statements
    matters.
    """
    # Test when the current unit is the departing unit.
    self.charm.unit.status = ActiveStatus()
    event = Mock()
    event.departing_unit = self.harness.charm.unit
    self.charm._on_peer_relation_departed(event)
    _remove_raft_member.assert_not_called()
    event.defer.assert_not_called()
    _updated_synchronous_node_count.assert_not_called()
    _get_ips_to_remove.assert_not_called()
    _remove_from_members_ips.assert_not_called()
    _update_config.assert_not_called()
    _update_relation_endpoints.assert_not_called()
    self.assertIsInstance(self.charm.unit.status, ActiveStatus)

    # Test when the current unit is not the departing unit, but removing
    # the member from the raft cluster fails.
    _remove_raft_member.side_effect = RemoveRaftMemberFailedError
    # Build a second unit of the same application ("<app>/1") to act as the
    # departing peer from here on.
    event.departing_unit = Unit(
        f"{self.charm.app.name}/1", None, self.harness.charm.app._backend, {}
    )
    mock_ip_address = "1.1.1.1"
    _get_member_ip.return_value = mock_ip_address
    self.charm._on_peer_relation_departed(event)
    _remove_raft_member.assert_called_once_with(mock_ip_address)
    event.defer.assert_called_once()
    _updated_synchronous_node_count.assert_not_called()
    _get_ips_to_remove.assert_not_called()
    _remove_from_members_ips.assert_not_called()
    _update_config.assert_not_called()
    _update_relation_endpoints.assert_not_called()
    self.assertIsInstance(self.charm.unit.status, ActiveStatus)

    # Test when the member is successfully removed from the raft cluster,
    # but the unit is not the leader.
    _remove_raft_member.reset_mock()
    event.defer.reset_mock()
    # reset_mock() keeps side_effect, so the injected failure is cleared
    # explicitly.
    _remove_raft_member.side_effect = None
    self.charm._on_peer_relation_departed(event)
    _remove_raft_member.assert_called_once_with(mock_ip_address)
    event.defer.assert_not_called()
    _updated_synchronous_node_count.assert_not_called()
    _get_ips_to_remove.assert_not_called()
    _remove_from_members_ips.assert_not_called()
    _update_config.assert_not_called()
    _update_relation_endpoints.assert_not_called()
    self.assertIsInstance(self.charm.unit.status, ActiveStatus)

    # Test when the unit is the leader, but the cluster hasn't initialized yet,
    # or it was unable to set synchronous_node_count.
    # (event.defer needs no reset here: the previous scenario asserted that
    # it had not been called.)
    _remove_raft_member.reset_mock()
    # Become leader with hooks disabled so that only the handler under test
    # runs.
    with self.harness.hooks_disabled():
        self.harness.set_leader()
    self.charm._on_peer_relation_departed(event)
    _remove_raft_member.assert_called_once_with(mock_ip_address)
    event.defer.assert_called_once()
    _updated_synchronous_node_count.assert_not_called()
    _get_ips_to_remove.assert_not_called()
    _remove_from_members_ips.assert_not_called()
    _update_config.assert_not_called()
    _update_relation_endpoints.assert_not_called()
    self.assertIsInstance(self.charm.unit.status, ActiveStatus)

    # Second half of the scenario above: the cluster is now flagged as
    # initialised in the application relation data, but updating the
    # synchronous node count reports failure.
    _remove_raft_member.reset_mock()
    event.defer.reset_mock()
    _updated_synchronous_node_count.return_value = False
    with self.harness.hooks_disabled():
        self.harness.update_relation_data(
            self.rel_id, self.charm.app.name, {"cluster_initialised": "True"}
        )
    self.charm._on_peer_relation_departed(event)
    _remove_raft_member.assert_called_once_with(mock_ip_address)
    event.defer.assert_called_once()
    _updated_synchronous_node_count.assert_called_once_with(1)
    _get_ips_to_remove.assert_not_called()
    _remove_from_members_ips.assert_not_called()
    _update_config.assert_not_called()
    _update_relation_endpoints.assert_not_called()
    self.assertIsInstance(self.charm.unit.status, ActiveStatus)

    # Test when there are more units in the cluster.
    _remove_raft_member.reset_mock()
    event.defer.reset_mock()
    _updated_synchronous_node_count.reset_mock()
    # Adding one more peer unit raises the expected node count argument
    # from 1 to 2.
    self.harness.add_relation_unit(self.rel_id, f"{self.charm.app.name}/2")
    self.charm._on_peer_relation_departed(event)
    _remove_raft_member.assert_called_once_with(mock_ip_address)
    event.defer.assert_called_once()
    _updated_synchronous_node_count.assert_called_once_with(2)
    _get_ips_to_remove.assert_not_called()
    _remove_from_members_ips.assert_not_called()
    _update_config.assert_not_called()
    _update_relation_endpoints.assert_not_called()
    self.assertIsInstance(self.charm.unit.status, ActiveStatus)

    # Test when the cluster is initialised, and it could set synchronous_node_count,
    # but there are no IPs to be removed from the members list.
    _remove_raft_member.reset_mock()
    event.defer.reset_mock()
    _updated_synchronous_node_count.reset_mock()
    _updated_synchronous_node_count.return_value = True
    self.charm._on_peer_relation_departed(event)
    _remove_raft_member.assert_called_once_with(mock_ip_address)
    event.defer.assert_not_called()
    _updated_synchronous_node_count.assert_called_once_with(2)
    _get_ips_to_remove.assert_called_once()
    _remove_from_members_ips.assert_not_called()
    _update_config.assert_not_called()
    _update_relation_endpoints.assert_not_called()
    self.assertIsInstance(self.charm.unit.status, ActiveStatus)

    # Test when there are IPs to be removed from the members list, but not all
    # the members are ready yet.
    # (event.defer needs no reset here: the previous scenario asserted that
    # it had not been called.)
    _remove_raft_member.reset_mock()
    _updated_synchronous_node_count.reset_mock()
    _get_ips_to_remove.reset_mock()
    ips_to_remove = ["2.2.2.2", "3.3.3.3"]
    _get_ips_to_remove.return_value = ips_to_remove
    _are_all_members_ready.return_value = False
    self.charm._on_peer_relation_departed(event)
    _remove_raft_member.assert_called_once_with(mock_ip_address)
    event.defer.assert_called_once()
    _updated_synchronous_node_count.assert_called_once_with(2)
    _get_ips_to_remove.assert_called_once()
    _remove_from_members_ips.assert_not_called()
    _update_config.assert_not_called()
    _update_relation_endpoints.assert_not_called()
    self.assertIsInstance(self.charm.unit.status, ActiveStatus)

    # Test when all members are ready.
    _remove_raft_member.reset_mock()
    event.defer.reset_mock()
    _updated_synchronous_node_count.reset_mock()
    _get_ips_to_remove.reset_mock()
    _are_all_members_ready.return_value = True
    self.charm._on_peer_relation_departed(event)
    _remove_raft_member.assert_called_once_with(mock_ip_address)
    event.defer.assert_not_called()
    _updated_synchronous_node_count.assert_called_once_with(2)
    _get_ips_to_remove.assert_called_once()
    # Both IPs are removed, in order; the config update and the relation
    # endpoints update are each expected once per removed IP.
    _remove_from_members_ips.assert_has_calls([call(ips_to_remove[0]), call(ips_to_remove[1])])
    self.assertEqual(_update_config.call_count, 2)
    self.assertEqual(_update_relation_endpoints.call_count, 2)
    self.assertIsInstance(self.charm.unit.status, ActiveStatus)

    # Test when the primary is not reachable yet.
    _remove_raft_member.reset_mock()
    event.defer.reset_mock()
    _updated_synchronous_node_count.reset_mock()
    _get_ips_to_remove.reset_mock()
    _remove_from_members_ips.reset_mock()
    _update_config.reset_mock()
    _update_relation_endpoints.reset_mock()
    _primary_endpoint.return_value = None
    self.charm._on_peer_relation_departed(event)
    _remove_raft_member.assert_called_once_with(mock_ip_address)
    event.defer.assert_not_called()
    _updated_synchronous_node_count.assert_called_once_with(2)
    _get_ips_to_remove.assert_called_once()
    # _get_ips_to_remove still returns two IPs, yet only one removal and one
    # config update are expected: presumably the handler returns as soon as
    # it finds no primary endpoint (confirm against the handler).
    _remove_from_members_ips.assert_called_once()
    _update_config.assert_called_once()
    _update_relation_endpoints.assert_not_called()
    self.assertIsInstance(self.charm.unit.status, WaitingStatus)

@patch("charm.PostgresqlOperatorCharm._update_relation_endpoints")
@patch("charm.PostgresqlOperatorCharm.primary_endpoint", new_callable=PropertyMock)
def test_update_new_unit_status(self, _primary_endpoint, _update_relation_endpoints):
    """Check the status chosen by ``_update_new_unit_status`` in three situations.

    Covered, in order: primary reachable while the unit is blocked, primary
    reachable while the unit is not blocked, and primary not reachable.
    """
    charm = self.charm
    unit = charm.unit

    # Primary reachable + blocked unit: endpoints are refreshed, but the
    # blocked status must be left in place.
    _primary_endpoint.return_value = "endpoint"
    unit.status = BlockedStatus("fake blocked status")
    charm._update_new_unit_status()
    _update_relation_endpoints.assert_called_once()
    self.assertIsInstance(unit.status, BlockedStatus)

    # Primary reachable + non-blocked unit: endpoints are refreshed and the
    # unit becomes active.
    _update_relation_endpoints.reset_mock()
    unit.status = WaitingStatus()
    charm._update_new_unit_status()
    _update_relation_endpoints.assert_called_once()
    self.assertIsInstance(unit.status, ActiveStatus)

    # Primary not reachable: nothing is refreshed and the unit waits.
    _update_relation_endpoints.reset_mock()
    _primary_endpoint.return_value = None
    charm._update_new_unit_status()
    _update_relation_endpoints.assert_not_called()
    self.assertIsInstance(unit.status, WaitingStatus)

0 comments on commit f843b52

Please sign in to comment.