diff --git a/ocaml/xapi/db_gc.ml b/ocaml/xapi/db_gc.ml index a0442314448..2efe11b89ee 100644 --- a/ocaml/xapi/db_gc.ml +++ b/ocaml/xapi/db_gc.ml @@ -30,7 +30,8 @@ let use_host_heartbeat_for_liveness = ref true let use_host_heartbeat_for_liveness_m = Mutex.create () -let host_heartbeat_table : (API.ref_host, float) Hashtbl.t = Hashtbl.create 16 +let host_heartbeat_table : (API.ref_host, Clock.Timer.t) Hashtbl.t = + Hashtbl.create 16 let host_skew_table : (API.ref_host, float) Hashtbl.t = Hashtbl.create 16 @@ -77,45 +78,24 @@ let detect_clock_skew ~__context host skew = (* Master compares the database with the in-memory host heartbeat table and sets the live flag accordingly. Called with the use_host_heartbeat_for_liveness_m and use_host_heartbeat_for_liveness is true (ie non-HA mode) *) let check_host_liveness ~__context = - (* Check for rolling upgrade mode - if so, use host metrics for liveness else use hashtbl *) - let rum = - try Helpers.rolling_upgrade_in_progress ~__context with _ -> false - in (* CA-16351: when performing the initial GC pass on first boot there won't be a localhost *) let localhost = try Helpers.get_localhost ~__context with _ -> Ref.null in - (* Look for "true->false" transition on Host_metrics.live *) let check_host host = if host <> localhost then try let hmetric = Db.Host.get_metrics ~__context ~self:host in let live = Db.Host_metrics.get_live ~__context ~self:hmetric in - (* See if the host is using the new HB mechanism, if so we'll use that *) - let new_heartbeat_time = + let timer = with_lock host_table_m (fun () -> - Option.value - (Hashtbl.find_opt host_heartbeat_table host) - ~default:Clock.Date.(epoch |> to_unix_time) + match Hashtbl.find_opt host_heartbeat_table host with + | Some x -> + x + | None -> + Clock.Timer.start + ~duration:!Xapi_globs.host_assumed_dead_interval ) in - let old_heartbeat_time = - if - rum - && Xapi_version.platform_version () - <> Helpers.version_string_of ~__context (Helpers.LocalObject host) - then ( - debug - "Host %s considering using metrics last update time as heartbeat" - (Ref.string_of host) ; - Date.to_float - (Db.Host_metrics.get_last_updated ~__context ~self:hmetric) - ) else - 0.0 - in - (* Use whichever value is the most recent to determine host liveness *) - let host_time = max old_heartbeat_time new_heartbeat_time in - let now = Unix.gettimeofday () in - (* we can now compare 'host_time' with 'now' *) - if now -. host_time < !Xapi_globs.host_assumed_dead_interval then + if not (Clock.Timer.has_expired timer) then (* From the heartbeat PoV the host looks alive. We try to (i) minimise database sets; and (ii) avoid toggling the host back to live if it has been marked as shutting_down. *) with_lock Xapi_globs.hosts_which_are_shutting_down_m (fun () -> @@ -131,10 +111,14 @@ let check_host_liveness ~__context = ) ) else if live then ( + let host_name_label = Db.Host.get_name_label ~__context ~self:host in + let host_uuid = Db.Host.get_uuid ~__context ~self:host in + let elapsed = Clock.Timer.elapsed timer in debug - "Assuming host is offline since the heartbeat/metrics haven't been \ - updated for %.2f seconds; setting live to false" - (now -. host_time) ; + "Assuming host '%s' (%s) is offline since the heartbeat hasn't \ + been updated for %s seconds; setting live to false" + host_name_label host_uuid + (Clock.Timer.span_to_s elapsed |> string_of_float) ; Db.Host_metrics.set_live ~__context ~self:hmetric ~value:false ; Xapi_host_helpers.update_allowed_operations ~__context ~self:host ) ; @@ -252,9 +236,10 @@ let tickle_heartbeat ~__context host stuff = let reason = Xapi_hooks.reason__clean_shutdown in if use_host_heartbeat_for_liveness then Xapi_host_helpers.mark_host_as_dead ~__context ~host ~reason - ) else + ) else ( + Hashtbl.replace host_heartbeat_table host + (Clock.Timer.start ~duration:!Xapi_globs.host_assumed_dead_interval) ; let now = Unix.gettimeofday () in - Hashtbl.replace host_heartbeat_table host now ; (* compute the clock skew for later analysis *) if List.mem_assoc _time stuff then try @@ -262,6 +247,7 @@ let tickle_heartbeat ~__context host stuff = let skew = abs_float (now -. slave) in Hashtbl.replace host_skew_table host skew with _ -> () + ) ) ; [] diff --git a/ocaml/xapi/xapi_globs.ml b/ocaml/xapi/xapi_globs.ml index 8b899e6d054..f52dd8a2709 100644 --- a/ocaml/xapi/xapi_globs.ml +++ b/ocaml/xapi/xapi_globs.ml @@ -707,7 +707,7 @@ let snapshot_with_quiesce_timeout = ref 600. let host_heartbeat_interval = ref 30. (* If we haven't heard a heartbeat from a host for this interval then the host is assumed dead *) -let host_assumed_dead_interval = ref 600.0 +let host_assumed_dead_interval = ref Mtime.Span.(10 * min) (* If a session has a last_active older than this we delete it *) let inactive_session_timeout = ref 86400. (* 24 hrs in seconds *) @@ -1074,7 +1074,9 @@ let xapi_globs_spec = ; ("wait_memory_target_timeout", Float wait_memory_target_timeout) ; ("snapshot_with_quiesce_timeout", Float snapshot_with_quiesce_timeout) ; ("host_heartbeat_interval", Float host_heartbeat_interval) - ; ("host_assumed_dead_interval", Float host_assumed_dead_interval) + ; ( "host_assumed_dead_interval" + , LongDurationFromSeconds host_assumed_dead_interval + ) ; ("fuse_time", Float Constants.fuse_time) ; ("db_restore_fuse_time", Float Constants.db_restore_fuse_time) ; ("inactive_session_timeout", Float inactive_session_timeout) diff --git a/ocaml/xapi/xapi_ha.ml b/ocaml/xapi/xapi_ha.ml index 9937fea6f28..578788f8c9c 100644 --- a/ocaml/xapi/xapi_ha.ml +++ b/ocaml/xapi/xapi_ha.ml @@ -837,7 +837,7 @@ module Monitor = struct (ExnHelper.string_of_exn e) ; Thread.delay !Xapi_globs.ha_monitor_interval done ; - debug "Re-enabling old Host_metrics.live heartbeat" ; + debug "Re-enabling host heartbeat" ; with_lock Db_gc.use_host_heartbeat_for_liveness_m (fun () -> Db_gc.use_host_heartbeat_for_liveness := true ) ;