diff --git a/ocaml/xapi/xapi_clustering.ml b/ocaml/xapi/xapi_clustering.ml index 545674e92e0..249efa74da1 100644 --- a/ocaml/xapi/xapi_clustering.ml +++ b/ocaml/xapi/xapi_clustering.ml @@ -545,6 +545,13 @@ module Watcher = struct is an update. *) let cluster_change_interval = Mtime.Span.(5 * min) + (* CA-396635: Sometimes it takes the underlying cluster stack (corosync) some time + to return a consistent view of the quorum. For example, it may be that the membership + information correctly reflects the new members after a membership change, while the + quorum field is still out of date. Add a delay here to make sure that the information + from corosync represents a consistent snapshot of the current cluster state. *) + let stabilising_period = Mtime.Span.(5 * s) + let cluster_stack_watcher : bool Atomic.t = Atomic.make false (* we handle unclean hosts join and leave in the watcher, i.e. hosts joining and leaving @@ -558,10 +565,12 @@ module Watcher = struct "cluster change watcher call" (Clock.Timer.span_to_s cluster_change_interval) in - let find_cluster_and_update updates = + let find_cluster_and_update ?(wait = false) updates = match find_cluster_host ~__context ~host with | Some ch -> let cluster = Db.Cluster_host.get_cluster ~__context ~self:ch in + if wait then + Thread.delay (Clock.Timer.span_to_s stabilising_period) ; on_corosync_update ~__context ~cluster updates | None -> () @@ -569,7 +578,7 @@ module Watcher = struct match Idl.IdM.run @@ Cluster_client.IDL.T.get m with | Ok updates -> (* Received updates from corosync-notifyd *) - find_cluster_and_update updates + find_cluster_and_update ~wait:true updates | Error (InternalError "UPDATES.Timeout") -> (* UPDATES.get timed out, this is normal. *) (* CA-395789: We send a query to xapi-clusterd to fetch the latest state