Make minion reconnect on changing master IP (bsc#1228182)
* Minions check DNS when re-connecting to a master

Check for a changed DNS record any time a minion gets disconnected from
its master. See GitHub issues #63654 and #61482.
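
For illustration, a standalone sketch (not Salt's actual implementation; the real check happens during master re-evaluation in eval_master) of the re-resolve-on-disconnect idea:

```python
# Standalone sketch of the idea only, not Salt code: when the connection
# drops, re-resolve the master's hostname and compare it to the cached IP.
import socket

def master_dns_changed(hostname: str, cached_ip: str) -> bool:
    """Return True if the master's DNS record no longer matches cached_ip."""
    try:
        return socket.gethostbyname(hostname) != cached_ip
    except socket.gaierror:
        # Resolution failed; keep retrying against the cached address.
        return False
```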

* Regression tests for DNS-defined masters

Add tests to validate that we check for changed DNS records any time
we're disconnected from the currently connected master.

* Update docs for master dns changes

Update docs to describe using master_alive_interval to detect master IP
changes via DNS.

* Remove comment which is no longer true

* Make minion reconnect on changing master IP

with the zeromq transport

* Don't create the alive schedule job if master_alive_interval is not set

* Skip the tests if running as a non-root user

* Skip if unable to set additional IP address

* Set master_tries to -1 for minions

* Fix the tests

---------

Co-authored-by: Daniel A. Wozniak <[email protected]>
vzhestkov and dwoz committed Nov 26, 2024
1 parent b9865ba commit eb6c67e
Showing 13 changed files with 422 additions and 120 deletions.
5 changes: 2 additions & 3 deletions conf/minion
@@ -271,9 +271,8 @@
#ping_interval: 0

# To auto recover minions if master changes IP address (DDNS)
# auth_tries: 10
# auth_safemode: True
# ping_interval: 2
# master_alive_interval: 10
# master_tries: -1
#
# Minions won't know master is missing until a ping fails. After the ping fail,
# the minion will attempt authentication and likely fails out and cause a restart.
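For reference, a minimal minion configuration using the options above might look like the following sketch (hostname and interval are illustrative):

```yaml
# Illustrative values only; the master must be addressed by DNS name for
# re-resolution to help after an IP change.
master: salt.example.com
master_alive_interval: 10   # seconds between master-alive checks
master_tries: -1            # retry authentication indefinitely
```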
4 changes: 3 additions & 1 deletion doc/ref/configuration/minion.rst
@@ -291,7 +291,9 @@ Default: ``0``

Configures how often, in seconds, the minion will verify that the current
master is alive and responding. The minion will try to establish a connection
to the next master in the list if it finds the existing one is dead.
to the next master in the list if it finds the existing one is dead. This
setting can also be used to detect master DNS record changes when a minion has
been disconnected.

.. code-block:: yaml
2 changes: 0 additions & 2 deletions salt/channel/client.py
@@ -385,8 +385,6 @@ def connect(self):
# else take the relayed publish_port master reports
else:
publish_port = self.auth.creds["publish_port"]
# TODO: The zeromq transport does not use connect_callback and
# disconnect_callback.
yield self.transport.connect(
publish_port, self.connect_callback, self.disconnect_callback
)
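The deleted TODO is obsolete because, as of this commit, the zeromq transport honors both callbacks: connect_callback(True) fires once the publish socket connects, and the monitor invokes disconnect_callback() on EVENT_DISCONNECTED (see salt/transport/zeromq.py below). A hypothetical pair of callbacks matching that contract:

```python
# Hypothetical callbacks for illustration; the real ones are methods on
# the minion's publish channel, not free functions.
def connect_callback(result):
    # The transport passes True once the publish socket has connected.
    print("publish channel connected:", result)

def disconnect_callback():
    # Invoked by the zmq monitor on EVENT_DISCONNECTED.
    print("publish channel lost; minion will re-evaluate its master")
```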
4 changes: 2 additions & 2 deletions salt/config/__init__.py
@@ -75,7 +75,7 @@
else:
_DFLT_IPC_MODE = "ipc"
_DFLT_FQDNS_GRAINS = False
_MASTER_TRIES = 1
_MASTER_TRIES = -1
_MASTER_USER = salt.utils.user.get_user()


@@ -1272,7 +1272,7 @@ def _gather_buffer_space():
"username": None,
"password": None,
"zmq_filtering": False,
"zmq_monitor": False,
"zmq_monitor": True,
"cache_sreqs": True,
"cmd_safe": True,
"sudo_user": "",
190 changes: 83 additions & 107 deletions salt/minion.py
@@ -2737,10 +2737,64 @@ def handle_event(self, package):
# we are not connected anymore
self.connected = False
log.info("Connection to master %s lost", self.opts["master"])
if self.opts["transport"] != "tcp":
self.schedule.delete_job(name=master_event(type="alive"))

log.info("Trying to tune in to next master from master-list")

if hasattr(self, "pub_channel"):
self.pub_channel.on_recv(None)
if hasattr(self.pub_channel, "auth"):
self.pub_channel.auth.invalidate()
if hasattr(self.pub_channel, "close"):
self.pub_channel.close()
if hasattr(self, "req_channel") and self.req_channel:
self.req_channel.close()
self.req_channel = None

# if eval_master finds a new master for us, self.connected
# will be True again on successful master authentication
try:
master, self.pub_channel = yield self.eval_master(
opts=self.opts,
failed=True,
failback=tag.startswith(master_event(type="failback")),
)
except SaltClientError:
pass

if self.connected:
self.opts["master"] = master

# re-init the subsystems to work with the new master
log.info(
"Re-initialising subsystems for new master %s",
self.opts["master"],
)

self.req_channel = salt.channel.client.AsyncReqChannel.factory(
self.opts, io_loop=self.io_loop
)

if self.opts["master_type"] != "failover":
# modify the scheduled job to fire on reconnect
if self.opts["transport"] != "tcp":
# put the current schedule into the new loaders
self.opts["schedule"] = self.schedule.option("schedule")
(
self.functions,
self.returners,
self.function_errors,
self.executors,
) = self._load_modules()
# make the schedule to use the new 'functions' loader
self.schedule.functions = self.functions
self.pub_channel.on_recv(self._handle_payload)
self._fire_master_minion_start()
log.info("Minion is ready to receive requests!")

# update scheduled job to run with the new master addr
if (
self.opts["transport"] != "tcp"
and self.opts["master_alive_interval"] > 0
):
schedule = {
"function": "status.master",
"seconds": self.opts["master_alive_interval"],
@@ -2749,116 +2803,35 @@ def handle_event(self, package):
"return_job": False,
"kwargs": {
"master": self.opts["master"],
"connected": False,
"connected": True,
},
}
self.schedule.modify_job(
name=master_event(type="alive", master=self.opts["master"]),
schedule=schedule,
)
else:
# delete the scheduled job to don't interfere with the failover process
if self.opts["transport"] != "tcp":
self.schedule.delete_job(name=master_event(type="alive"))

log.info("Trying to tune in to next master from master-list")

if hasattr(self, "pub_channel"):
self.pub_channel.on_recv(None)
if hasattr(self.pub_channel, "auth"):
self.pub_channel.auth.invalidate()
if hasattr(self.pub_channel, "close"):
self.pub_channel.close()
del self.pub_channel

# if eval_master finds a new master for us, self.connected
# will be True again on successful master authentication
try:
master, self.pub_channel = yield self.eval_master(
opts=self.opts,
failed=True,
failback=tag.startswith(master_event(type="failback")),
)
except SaltClientError:
pass

if self.connected:
self.opts["master"] = master

# re-init the subsystems to work with the new master
log.info(
"Re-initialising subsystems for new master %s",
self.opts["master"],
)

self.req_channel = (
salt.transport.client.AsyncReqChannel.factory(
self.opts, io_loop=self.io_loop
)
)

# put the current schedule into the new loaders
self.opts["schedule"] = self.schedule.option("schedule")
(
self.functions,
self.returners,
self.function_errors,
self.executors,
) = self._load_modules()
# make the schedule to use the new 'functions' loader
self.schedule.functions = self.functions
self.pub_channel.on_recv(self._handle_payload)
self._fire_master_minion_start()
log.info("Minion is ready to receive requests!")

# update scheduled job to run with the new master addr
if self.opts["transport"] != "tcp":
schedule = {
"function": "status.master",
"seconds": self.opts["master_alive_interval"],
"jid_include": True,
"maxrunning": 1,
"return_job": False,
"kwargs": {
"master": self.opts["master"],
"connected": True,
},
}
self.schedule.modify_job(
name=master_event(
type="alive", master=self.opts["master"]
),
schedule=schedule,
)

if (
self.opts["master_failback"]
and "master_list" in self.opts
):
if self.opts["master"] != self.opts["master_list"][0]:
schedule = {
"function": "status.ping_master",
"seconds": self.opts[
"master_failback_interval"
],
"jid_include": True,
"maxrunning": 1,
"return_job": False,
"kwargs": {
"master": self.opts["master_list"][0]
},
}
self.schedule.modify_job(
name=master_event(type="failback"),
schedule=schedule,
)
else:
self.schedule.delete_job(
name=master_event(type="failback"), persist=True
)
else:
self.restart = True
self.io_loop.stop()
if self.opts["master_failback"] and "master_list" in self.opts:
if self.opts["master"] != self.opts["master_list"][0]:
schedule = {
"function": "status.ping_master",
"seconds": self.opts["master_failback_interval"],
"jid_include": True,
"maxrunning": 1,
"return_job": False,
"kwargs": {"master": self.opts["master_list"][0]},
}
self.schedule.modify_job(
name=master_event(type="failback"),
schedule=schedule,
)
else:
self.schedule.delete_job(
name=master_event(type="failback"), persist=True
)
else:
self.restart = True
self.io_loop.stop()

elif tag.startswith(master_event(type="connected")):
# handle this event only once. otherwise it will pollute the log
@@ -2870,7 +2843,10 @@ def handle_event(self, package):
self.connected = True
# modify the __master_alive job to only fire,
# if the connection is lost again
if self.opts["transport"] != "tcp":
if (
self.opts["transport"] != "tcp"
and self.opts["master_alive_interval"] > 0
):
schedule = {
"function": "status.master",
"seconds": self.opts["master_alive_interval"],
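Condensing the scheduling logic above into one place, a hedged sketch (assuming a Salt-style Schedule object with a modify_job method) of the guard the commit applies before touching the alive job:

```python
# Condensed sketch of the guard added above; not a drop-in replacement
# for the inline logic in Minion.handle_event.
def update_alive_job(opts, schedule, job_name, connected):
    # Alive checks only apply to non-tcp transports with a positive interval.
    if opts["transport"] == "tcp" or opts["master_alive_interval"] <= 0:
        return
    schedule.modify_job(
        name=job_name,
        schedule={
            "function": "status.master",
            "seconds": opts["master_alive_interval"],
            "jid_include": True,
            "maxrunning": 1,
            "return_job": False,
            "kwargs": {"master": opts["master"], "connected": connected},
        },
    )
```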
17 changes: 16 additions & 1 deletion salt/transport/zeromq.py
@@ -1,6 +1,7 @@
"""
Zeromq transport classes
"""

import errno
import hashlib
import logging
@@ -211,6 +212,12 @@ def connect(self, publish_port, connect_callback=None, disconnect_callback=None)
self.master_pub,
)
log.debug("%r connecting to %s", self, self.master_pub)
if (
hasattr(self, "_monitor")
and self._monitor is not None
and disconnect_callback is not None
):
self._monitor.disconnect_callback = disconnect_callback
self._socket.connect(self.master_pub)
connect_callback(True)

@@ -680,13 +687,21 @@ def monitor_callback(self, msg):
log.debug("ZeroMQ event: %s", evt)
if evt["event"] == zmq.EVENT_MONITOR_STOPPED:
self.stop()
elif evt["event"] == zmq.EVENT_DISCONNECTED:
if (
hasattr(self, "disconnect_callback")
and self.disconnect_callback is not None
):
self.disconnect_callback()

def stop(self):
if self._socket is None:
return
self._socket.disable_monitor()
self._socket = None
self._monitor_socket = None
if self._monitor_socket is not None:
self._monitor_socket.close()
self._monitor_socket = None
if self._monitor_stream is not None:
self._monitor_stream.close()
self._monitor_stream = None
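The monitor hook above builds on pyzmq's socket monitoring, which is also why the zmq_monitor default flips to True in salt/config/__init__.py. A self-contained pyzmq sketch of the mechanism, independent of Salt (address and port are illustrative):

```python
# Self-contained pyzmq example, not Salt code: watch a socket's monitor
# channel and react to disconnect events.
import zmq
from zmq.utils.monitor import recv_monitor_message

ctx = zmq.Context.instance()
sock = ctx.socket(zmq.SUB)
sock.setsockopt(zmq.SUBSCRIBE, b"")
monitor = sock.get_monitor_socket()  # PAIR socket emitting lifecycle events

sock.connect("tcp://127.0.0.1:4505")  # 4505 is the conventional publish port
while monitor.poll(5000):  # wait up to 5s per event
    evt = recv_monitor_message(monitor)
    if evt["event"] == zmq.EVENT_DISCONNECTED:
        # This is where Salt's monitor_callback would call
        # disconnect_callback() to trigger master re-evaluation.
        print("disconnected from master")
        break
```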
Empty file.