From 31e29c1b536951eb89cd96ebeb95620dac4b8963 Mon Sep 17 00:00:00 2001
From: Vincent Liu
Date: Fri, 2 Aug 2024 17:49:45 +0100
Subject: [PATCH] CA-396635: Wait for corosync to update its info

Sometimes it takes the underlying cluster stack (corosync) some time to
return a consistent view of the quorum. For example, it may be that the
membership information correctly reflects the new members after a
membership change, while the quorum field is still out of date. Add a
delay here to make sure that the information from corosync represents a
consistent snapshot of the current cluster state.

Signed-off-by: Vincent Liu
---
 ocaml/xapi/xapi_clustering.ml | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/ocaml/xapi/xapi_clustering.ml b/ocaml/xapi/xapi_clustering.ml
index 545674e92e0..249efa74da1 100644
--- a/ocaml/xapi/xapi_clustering.ml
+++ b/ocaml/xapi/xapi_clustering.ml
@@ -545,6 +545,13 @@ module Watcher = struct
      is an update. *)
   let cluster_change_interval = Mtime.Span.(5 * min)
 
+  (* CA-396635: Sometimes it takes the underlying cluster stack (corosync) some time
+     to return a consistent view of the quorum. For example, it may be that the membership
+     information correctly reflects the new members after a membership change, while the
+     quorum field is still out of date. Add a delay here to make sure that the information
+     from corosync represents a consistent snapshot of the current cluster state. *)
+  let stabilising_period = Mtime.Span.(5 * s)
+
   let cluster_stack_watcher : bool Atomic.t = Atomic.make false
 
   (* we handle unclean hosts join and leave in the watcher, i.e. hosts joining and leaving
@@ -558,10 +565,12 @@ module Watcher = struct
           "cluster change watcher call"
           (Clock.Timer.span_to_s cluster_change_interval)
       in
-      let find_cluster_and_update updates =
+      let find_cluster_and_update ?(wait = false) updates =
         match find_cluster_host ~__context ~host with
         | Some ch ->
             let cluster = Db.Cluster_host.get_cluster ~__context ~self:ch in
+            if wait then
+              Thread.delay (Clock.Timer.span_to_s stabilising_period) ;
             on_corosync_update ~__context ~cluster updates
         | None ->
             ()
@@ -569,7 +578,7 @@ module Watcher = struct
       match Idl.IdM.run @@ Cluster_client.IDL.T.get m with
       | Ok updates ->
           (* Received updates from corosync-notifyd *)
-          find_cluster_and_update updates
+          find_cluster_and_update ~wait:true updates
       | Error (InternalError "UPDATES.Timeout") ->
           (* UPDATES.get timed out, this is normal. *)
           (* CA-395789: We send a query to xapi-clusterd to fetch the latest state