Make minion reconnect on changing master IP (bsc#1228182)
* Minions check DNS when re-connecting to a master

Check for a changed DNS record any time a minion gets disconnected from
its master. See GitHub issues #63654 and #61482.
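
For illustration, a standalone sketch (not Salt's actual implementation; the real check happens during master re-evaluation in eval_master) of the re-resolve-on-disconnect idea:

```python
# Standalone sketch of the idea only, not Salt code: when the connection
# drops, re-resolve the master's hostname and compare it to the cached IP.
import socket

def master_dns_changed(hostname: str, cached_ip: str) -> bool:
    """Return True if the master's DNS record no longer matches cached_ip."""
    try:
        return socket.gethostbyname(hostname) != cached_ip
    except socket.gaierror:
        # Resolution failed; keep retrying against the cached address.
        return False
```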

* Regression tests for DNS-defined masters

Add tests to validate that we check for changed DNS records any time
we're disconnected from the currently connected master.

* Update docs for master dns changes

Update docs to describe using master_alive_interval to detect master IP
changes via DNS.

* Remove comment which is no longer true

* Make minion reconnect on changing master IP

with the zeromq transport

* Don't create the alive schedule job if master_alive_interval is not set

* Skip the tests if running as a non-root user

* Skip if unable to set additional IP address

* Set master_tries to -1 for minions

* Fix the tests

---------

Co-authored-by: Daniel A. Wozniak <[email protected]>
vzhestkov and dwoz committed Nov 26, 2024
1 parent b9865ba commit eb6c67e
Showing 13 changed files with 422 additions and 120 deletions.
5 changes: 2 additions & 3 deletions conf/minion
@@ -271,9 +271,8 @@
#ping_interval: 0

# To auto recover minions if master changes IP address (DDNS)
# auth_tries: 10
# auth_safemode: True
# ping_interval: 2
# master_alive_interval: 10
# master_tries: -1
#
# Minions won't know master is missing until a ping fails. After the ping fail,
# the minion will attempt authentication and likely fails out and cause a restart.
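For reference, a minimal minion configuration using the options above might look like the following sketch (hostname and interval are illustrative):

```yaml
# Illustrative values only; the master must be addressed by DNS name for
# re-resolution to help after an IP change.
master: salt.example.com
master_alive_interval: 10   # seconds between master-alive checks
master_tries: -1            # retry authentication indefinitely
```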
4 changes: 3 additions & 1 deletion doc/ref/configuration/minion.rst
@@ -291,7 +291,9 @@ Default: ``0``

Configures how often, in seconds, the minion will verify that the current
master is alive and responding. The minion will try to establish a connection
to the next master in the list if it finds the existing one is dead.
to the next master in the list if it finds the existing one is dead. This
setting can also be used to detect master DNS record changes when a minion has
been disconnected.

.. code-block:: yaml
2 changes: 0 additions & 2 deletions salt/channel/client.py
@@ -385,8 +385,6 @@ def connect(self):
# else take the relayed publish_port master reports
else:
publish_port = self.auth.creds["publish_port"]
# TODO: The zeromq transport does not use connect_callback and
# disconnect_callback.
yield self.transport.connect(
publish_port, self.connect_callback, self.disconnect_callback
)
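The deleted TODO is obsolete because, as of this commit, the zeromq transport honors both callbacks: connect_callback(True) fires once the publish socket connects, and the monitor invokes disconnect_callback() on EVENT_DISCONNECTED (see salt/transport/zeromq.py below). A hypothetical pair of callbacks matching that contract:

```python
# Hypothetical callbacks for illustration; the real ones are methods on
# the minion's publish channel, not free functions.
def connect_callback(result):
    # The transport passes True once the publish socket has connected.
    print("publish channel connected:", result)

def disconnect_callback():
    # Invoked by the zmq monitor on EVENT_DISCONNECTED.
    print("publish channel lost; minion will re-evaluate its master")
```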
4 changes: 2 additions & 2 deletions salt/config/__init__.py
@@ -75,7 +75,7 @@
else:
_DFLT_IPC_MODE = "ipc"
_DFLT_FQDNS_GRAINS = False
_MASTER_TRIES = 1
_MASTER_TRIES = -1
_MASTER_USER = salt.utils.user.get_user()


@@ -1272,7 +1272,7 @@ def _gather_buffer_space():
"username": None,
"password": None,
"zmq_filtering": False,
"zmq_monitor": False,
"zmq_monitor": True,
"cache_sreqs": True,
"cmd_safe": True,
"sudo_user": "",
190 changes: 83 additions & 107 deletions salt/minion.py
@@ -2737,10 +2737,64 @@ def handle_event(self, package):
# we are not connected anymore
self.connected = False
log.info("Connection to master %s lost", self.opts["master"])
if self.opts["transport"] != "tcp":
self.schedule.delete_job(name=master_event(type="alive"))

log.info("Trying to tune in to next master from master-list")

if hasattr(self, "pub_channel"):
self.pub_channel.on_recv(None)
if hasattr(self.pub_channel, "auth"):
self.pub_channel.auth.invalidate()
if hasattr(self.pub_channel, "close"):
self.pub_channel.close()
if hasattr(self, "req_channel") and self.req_channel:
self.req_channel.close()
self.req_channel = None

# if eval_master finds a new master for us, self.connected
# will be True again on successful master authentication
try:
master, self.pub_channel = yield self.eval_master(
opts=self.opts,
failed=True,
failback=tag.startswith(master_event(type="failback")),
)
except SaltClientError:
pass

if self.connected:
self.opts["master"] = master

# re-init the subsystems to work with the new master
log.info(
"Re-initialising subsystems for new master %s",
self.opts["master"],
)

self.req_channel = salt.channel.client.AsyncReqChannel.factory(
self.opts, io_loop=self.io_loop
)

if self.opts["master_type"] != "failover":
# modify the scheduled job to fire on reconnect
if self.opts["transport"] != "tcp":
# put the current schedule into the new loaders
self.opts["schedule"] = self.schedule.option("schedule")
(
self.functions,
self.returners,
self.function_errors,
self.executors,
) = self._load_modules()
# make the schedule to use the new 'functions' loader
self.schedule.functions = self.functions
self.pub_channel.on_recv(self._handle_payload)
self._fire_master_minion_start()
log.info("Minion is ready to receive requests!")

# update scheduled job to run with the new master addr
if (
self.opts["transport"] != "tcp"
and self.opts["master_alive_interval"] > 0
):
schedule = {
"function": "status.master",
"seconds": self.opts["master_alive_interval"],
@@ -2749,116 +2803,35 @@ def handle_event(self, package):
"return_job": False,
"kwargs": {
"master": self.opts["master"],
"connected": False,
"connected": True,
},
}
self.schedule.modify_job(
name=master_event(type="alive", master=self.opts["master"]),
schedule=schedule,
)
else:
# delete the scheduled job to don't interfere with the failover process
if self.opts["transport"] != "tcp":
self.schedule.delete_job(name=master_event(type="alive"))

log.info("Trying to tune in to next master from master-list")

if hasattr(self, "pub_channel"):
self.pub_channel.on_recv(None)
if hasattr(self.pub_channel, "auth"):
self.pub_channel.auth.invalidate()
if hasattr(self.pub_channel, "close"):
self.pub_channel.close()
del self.pub_channel

# if eval_master finds a new master for us, self.connected
# will be True again on successful master authentication
try:
master, self.pub_channel = yield self.eval_master(
opts=self.opts,
failed=True,
failback=tag.startswith(master_event(type="failback")),
)
except SaltClientError:
pass

if self.connected:
self.opts["master"] = master

# re-init the subsystems to work with the new master
log.info(
"Re-initialising subsystems for new master %s",
self.opts["master"],
)

self.req_channel = (
salt.transport.client.AsyncReqChannel.factory(
self.opts, io_loop=self.io_loop
)
)

# put the current schedule into the new loaders
self.opts["schedule"] = self.schedule.option("schedule")
(
self.functions,
self.returners,
self.function_errors,
self.executors,
) = self._load_modules()
# make the schedule to use the new 'functions' loader
self.schedule.functions = self.functions
self.pub_channel.on_recv(self._handle_payload)
self._fire_master_minion_start()
log.info("Minion is ready to receive requests!")

# update scheduled job to run with the new master addr
if self.opts["transport"] != "tcp":
schedule = {
"function": "status.master",
"seconds": self.opts["master_alive_interval"],
"jid_include": True,
"maxrunning": 1,
"return_job": False,
"kwargs": {
"master": self.opts["master"],
"connected": True,
},
}
self.schedule.modify_job(
name=master_event(
type="alive", master=self.opts["master"]
),
schedule=schedule,
)

if (
self.opts["master_failback"]
and "master_list" in self.opts
):
if self.opts["master"] != self.opts["master_list"][0]:
schedule = {
"function": "status.ping_master",
"seconds": self.opts[
"master_failback_interval"
],
"jid_include": True,
"maxrunning": 1,
"return_job": False,
"kwargs": {
"master": self.opts["master_list"][0]
},
}
self.schedule.modify_job(
name=master_event(type="failback"),
schedule=schedule,
)
else:
self.schedule.delete_job(
name=master_event(type="failback"), persist=True
)
else:
self.restart = True
self.io_loop.stop()
if self.opts["master_failback"] and "master_list" in self.opts:
if self.opts["master"] != self.opts["master_list"][0]:
schedule = {
"function": "status.ping_master",
"seconds": self.opts["master_failback_interval"],
"jid_include": True,
"maxrunning": 1,
"return_job": False,
"kwargs": {"master": self.opts["master_list"][0]},
}
self.schedule.modify_job(
name=master_event(type="failback"),
schedule=schedule,
)
else:
self.schedule.delete_job(
name=master_event(type="failback"), persist=True
)
else:
self.restart = True
self.io_loop.stop()

elif tag.startswith(master_event(type="connected")):
# handle this event only once. otherwise it will pollute the log
@@ -2870,7 +2843,10 @@ def handle_event(self, package):
self.connected = True
# modify the __master_alive job to only fire,
# if the connection is lost again
if self.opts["transport"] != "tcp":
if (
self.opts["transport"] != "tcp"
and self.opts["master_alive_interval"] > 0
):
schedule = {
"function": "status.master",
"seconds": self.opts["master_alive_interval"],
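Condensing the scheduling logic above into one place, a hedged sketch (assuming a Salt-style Schedule object with a modify_job method) of the guard the commit applies before touching the alive job:

```python
# Condensed sketch of the guard added above; not a drop-in replacement
# for the inline logic in Minion.handle_event.
def update_alive_job(opts, schedule, job_name, connected):
    # Alive checks only apply to non-tcp transports with a positive interval.
    if opts["transport"] == "tcp" or opts["master_alive_interval"] <= 0:
        return
    schedule.modify_job(
        name=job_name,
        schedule={
            "function": "status.master",
            "seconds": opts["master_alive_interval"],
            "jid_include": True,
            "maxrunning": 1,
            "return_job": False,
            "kwargs": {"master": opts["master"], "connected": connected},
        },
    )
```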
17 changes: 16 additions & 1 deletion salt/transport/zeromq.py
@@ -1,6 +1,7 @@
"""
Zeromq transport classes
"""

import errno
import hashlib
import logging
@@ -211,6 +212,12 @@ def connect(self, publish_port, connect_callback=None, disconnect_callback=None)
self.master_pub,
)
log.debug("%r connecting to %s", self, self.master_pub)
if (
hasattr(self, "_monitor")
and self._monitor is not None
and disconnect_callback is not None
):
self._monitor.disconnect_callback = disconnect_callback
self._socket.connect(self.master_pub)
connect_callback(True)

@@ -680,13 +687,21 @@ def monitor_callback(self, msg):
log.debug("ZeroMQ event: %s", evt)
if evt["event"] == zmq.EVENT_MONITOR_STOPPED:
self.stop()
elif evt["event"] == zmq.EVENT_DISCONNECTED:
if (
hasattr(self, "disconnect_callback")
and self.disconnect_callback is not None
):
self.disconnect_callback()

def stop(self):
if self._socket is None:
return
self._socket.disable_monitor()
self._socket = None
self._monitor_socket = None
if self._monitor_socket is not None:
self._monitor_socket.close()
self._monitor_socket = None
if self._monitor_stream is not None:
self._monitor_stream.close()
self._monitor_stream = None
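The monitor hook above builds on pyzmq's socket monitoring, which is also why the zmq_monitor default flips to True in salt/config/__init__.py. A self-contained pyzmq sketch of the mechanism, independent of Salt (address and port are illustrative):

```python
# Self-contained pyzmq example, not Salt code: watch a socket's monitor
# channel and react to disconnect events.
import zmq
from zmq.utils.monitor import recv_monitor_message

ctx = zmq.Context.instance()
sock = ctx.socket(zmq.SUB)
sock.setsockopt(zmq.SUBSCRIBE, b"")
monitor = sock.get_monitor_socket()  # PAIR socket emitting lifecycle events

sock.connect("tcp://127.0.0.1:4505")  # 4505 is the conventional publish port
while monitor.poll(5000):  # wait up to 5s per event
    evt = recv_monitor_message(monitor)
    if evt["event"] == zmq.EVENT_DISCONNECTED:
        # This is where Salt's monitor_callback would call
        # disconnect_callback() to trigger master re-evaluation.
        print("disconnected from master")
        break
```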
Empty file.