From 9291f21c0d5b8ab7068b4eb800489403107e7eba Mon Sep 17 00:00:00 2001 From: Ming Lu Date: Tue, 18 Jun 2024 13:53:22 +0800 Subject: [PATCH] CA-394343: After clock jump the xapi assumed the host is HOST_OFFLINE Prior to this commit, the xapi on the coordinator host records the 'Unix.gettimeofday' as the timestamps of the heartbeat with other pool supporter hosts. When the system clock is updated with a huge jump forward, the timestamps would be far back into the past. This would cause the xapi assumes that the hosts are offline as long time no heartbeats. In this commit, the timestamps are changed to get from a monotonic clock. In this way, the system clock changes will not impact the heartbeats' timestamps any more. Additionally, Host_metrics.last_updated is only set when the object is created. It's useless in check_host_liveness at all. Signed-off-by: Ming Lu --- ocaml/xapi/db_gc.ml | 56 +++++++++++++++------------------------- ocaml/xapi/xapi_globs.ml | 6 +++-- ocaml/xapi/xapi_ha.ml | 2 +- 3 files changed, 26 insertions(+), 38 deletions(-) diff --git a/ocaml/xapi/db_gc.ml b/ocaml/xapi/db_gc.ml index a0442314448..2efe11b89ee 100644 --- a/ocaml/xapi/db_gc.ml +++ b/ocaml/xapi/db_gc.ml @@ -30,7 +30,8 @@ let use_host_heartbeat_for_liveness = ref true let use_host_heartbeat_for_liveness_m = Mutex.create () -let host_heartbeat_table : (API.ref_host, float) Hashtbl.t = Hashtbl.create 16 +let host_heartbeat_table : (API.ref_host, Clock.Timer.t) Hashtbl.t = + Hashtbl.create 16 let host_skew_table : (API.ref_host, float) Hashtbl.t = Hashtbl.create 16 @@ -77,45 +78,24 @@ let detect_clock_skew ~__context host skew = (* Master compares the database with the in-memory host heartbeat table and sets the live flag accordingly. Called with the use_host_heartbeat_for_liveness_m and use_host_heartbeat_for_liveness is true (ie non-HA mode) *) let check_host_liveness ~__context = - (* Check for rolling upgrade mode - if so, use host metrics for liveness else use hashtbl *) - let rum = - try Helpers.rolling_upgrade_in_progress ~__context with _ -> false - in (* CA-16351: when performing the initial GC pass on first boot there won't be a localhost *) let localhost = try Helpers.get_localhost ~__context with _ -> Ref.null in - (* Look for "true->false" transition on Host_metrics.live *) let check_host host = if host <> localhost then try let hmetric = Db.Host.get_metrics ~__context ~self:host in let live = Db.Host_metrics.get_live ~__context ~self:hmetric in - (* See if the host is using the new HB mechanism, if so we'll use that *) - let new_heartbeat_time = + let timer = with_lock host_table_m (fun () -> - Option.value - (Hashtbl.find_opt host_heartbeat_table host) - ~default:Clock.Date.(epoch |> to_unix_time) + match Hashtbl.find_opt host_heartbeat_table host with + | Some x -> + x + | None -> + Clock.Timer.start + ~duration:!Xapi_globs.host_assumed_dead_interval ) in - let old_heartbeat_time = - if - rum - && Xapi_version.platform_version () - <> Helpers.version_string_of ~__context (Helpers.LocalObject host) - then ( - debug - "Host %s considering using metrics last update time as heartbeat" - (Ref.string_of host) ; - Date.to_float - (Db.Host_metrics.get_last_updated ~__context ~self:hmetric) - ) else - 0.0 - in - (* Use whichever value is the most recent to determine host liveness *) - let host_time = max old_heartbeat_time new_heartbeat_time in - let now = Unix.gettimeofday () in - (* we can now compare 'host_time' with 'now' *) - if now -. host_time < !Xapi_globs.host_assumed_dead_interval then + if not (Clock.Timer.has_expired timer) then (* From the heartbeat PoV the host looks alive. We try to (i) minimise database sets; and (ii) avoid toggling the host back to live if it has been marked as shutting_down. *) with_lock Xapi_globs.hosts_which_are_shutting_down_m (fun () -> @@ -131,10 +111,14 @@ let check_host_liveness ~__context = ) ) else if live then ( + let host_name_label = Db.Host.get_name_label ~__context ~self:host in + let host_uuid = Db.Host.get_uuid ~__context ~self:host in + let elapsed = Clock.Timer.elapsed timer in debug - "Assuming host is offline since the heartbeat/metrics haven't been \ - updated for %.2f seconds; setting live to false" - (now -. host_time) ; + "Assuming host '%s' (%s) is offline since the heartbeat hasn't \ + been updated for %s seconds; setting live to false" + host_name_label host_uuid + (Clock.Timer.span_to_s elapsed |> string_of_float) ; Db.Host_metrics.set_live ~__context ~self:hmetric ~value:false ; Xapi_host_helpers.update_allowed_operations ~__context ~self:host ) ; @@ -252,9 +236,10 @@ let tickle_heartbeat ~__context host stuff = let reason = Xapi_hooks.reason__clean_shutdown in if use_host_heartbeat_for_liveness then Xapi_host_helpers.mark_host_as_dead ~__context ~host ~reason - ) else + ) else ( + Hashtbl.replace host_heartbeat_table host + (Clock.Timer.start ~duration:!Xapi_globs.host_assumed_dead_interval) ; let now = Unix.gettimeofday () in - Hashtbl.replace host_heartbeat_table host now ; (* compute the clock skew for later analysis *) if List.mem_assoc _time stuff then try @@ -262,6 +247,7 @@ let tickle_heartbeat ~__context host stuff = let skew = abs_float (now -. slave) in Hashtbl.replace host_skew_table host skew with _ -> () + ) ) ; [] diff --git a/ocaml/xapi/xapi_globs.ml b/ocaml/xapi/xapi_globs.ml index ad4f35e37ed..9993b27acdd 100644 --- a/ocaml/xapi/xapi_globs.ml +++ b/ocaml/xapi/xapi_globs.ml @@ -707,7 +707,7 @@ let snapshot_with_quiesce_timeout = ref 600. let host_heartbeat_interval = ref 30. (* If we haven't heard a heartbeat from a host for this interval then the host is assumed dead *) -let host_assumed_dead_interval = ref 600.0 +let host_assumed_dead_interval = ref Mtime.Span.(10 * min) (* If a session has a last_active older than this we delete it *) let inactive_session_timeout = ref 86400. (* 24 hrs in seconds *) @@ -1070,7 +1070,9 @@ let xapi_globs_spec = ; ("wait_memory_target_timeout", Float wait_memory_target_timeout) ; ("snapshot_with_quiesce_timeout", Float snapshot_with_quiesce_timeout) ; ("host_heartbeat_interval", Float host_heartbeat_interval) - ; ("host_assumed_dead_interval", Float host_assumed_dead_interval) + ; ( "host_assumed_dead_interval" + , LongDurationFromSeconds host_assumed_dead_interval + ) ; ("fuse_time", Float Constants.fuse_time) ; ("db_restore_fuse_time", Float Constants.db_restore_fuse_time) ; ("inactive_session_timeout", Float inactive_session_timeout) diff --git a/ocaml/xapi/xapi_ha.ml b/ocaml/xapi/xapi_ha.ml index 9937fea6f28..578788f8c9c 100644 --- a/ocaml/xapi/xapi_ha.ml +++ b/ocaml/xapi/xapi_ha.ml @@ -837,7 +837,7 @@ module Monitor = struct (ExnHelper.string_of_exn e) ; Thread.delay !Xapi_globs.ha_monitor_interval done ; - debug "Re-enabling old Host_metrics.live heartbeat" ; + debug "Re-enabling host heartbeat" ; with_lock Db_gc.use_host_heartbeat_for_liveness_m (fun () -> Db_gc.use_host_heartbeat_for_liveness := true ) ;