Skip to content

Commit

Permalink
Merge pull request #5700 from minglumlu/private/mingl/CA-394343
Browse files Browse the repository at this point in the history
  • Loading branch information
psafont authored Jul 23, 2024
2 parents b411adc + 9291f21 commit 090e846
Show file tree
Hide file tree
Showing 3 changed files with 26 additions and 38 deletions.
56 changes: 21 additions & 35 deletions ocaml/xapi/db_gc.ml
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,8 @@ let use_host_heartbeat_for_liveness = ref true

let use_host_heartbeat_for_liveness_m = Mutex.create ()

let host_heartbeat_table : (API.ref_host, float) Hashtbl.t = Hashtbl.create 16
let host_heartbeat_table : (API.ref_host, Clock.Timer.t) Hashtbl.t =
Hashtbl.create 16

let host_skew_table : (API.ref_host, float) Hashtbl.t = Hashtbl.create 16

Expand Down Expand Up @@ -77,45 +78,24 @@ let detect_clock_skew ~__context host skew =
(* Master compares the database with the in-memory host heartbeat table and sets the live flag accordingly.
Called with the use_host_heartbeat_for_liveness_m and use_host_heartbeat_for_liveness is true (ie non-HA mode) *)
let check_host_liveness ~__context =
(* Check for rolling upgrade mode - if so, use host metrics for liveness else use hashtbl *)
let rum =
try Helpers.rolling_upgrade_in_progress ~__context with _ -> false
in
(* CA-16351: when performing the initial GC pass on first boot there won't be a localhost *)
let localhost = try Helpers.get_localhost ~__context with _ -> Ref.null in
(* Look for "true->false" transition on Host_metrics.live *)
let check_host host =
if host <> localhost then
try
let hmetric = Db.Host.get_metrics ~__context ~self:host in
let live = Db.Host_metrics.get_live ~__context ~self:hmetric in
(* See if the host is using the new HB mechanism, if so we'll use that *)
let new_heartbeat_time =
let timer =
with_lock host_table_m (fun () ->
Option.value
(Hashtbl.find_opt host_heartbeat_table host)
~default:Clock.Date.(epoch |> to_unix_time)
match Hashtbl.find_opt host_heartbeat_table host with
| Some x ->
x
| None ->
Clock.Timer.start
~duration:!Xapi_globs.host_assumed_dead_interval
)
in
let old_heartbeat_time =
if
rum
&& Xapi_version.platform_version ()
<> Helpers.version_string_of ~__context (Helpers.LocalObject host)
then (
debug
"Host %s considering using metrics last update time as heartbeat"
(Ref.string_of host) ;
Date.to_float
(Db.Host_metrics.get_last_updated ~__context ~self:hmetric)
) else
0.0
in
(* Use whichever value is the most recent to determine host liveness *)
let host_time = max old_heartbeat_time new_heartbeat_time in
let now = Unix.gettimeofday () in
(* we can now compare 'host_time' with 'now' *)
if now -. host_time < !Xapi_globs.host_assumed_dead_interval then
if not (Clock.Timer.has_expired timer) then
(* From the heartbeat PoV the host looks alive. We try to (i) minimise database sets; and (ii)
avoid toggling the host back to live if it has been marked as shutting_down. *)
with_lock Xapi_globs.hosts_which_are_shutting_down_m (fun () ->
Expand All @@ -131,10 +111,14 @@ let check_host_liveness ~__context =
)
)
else if live then (
let host_name_label = Db.Host.get_name_label ~__context ~self:host in
let host_uuid = Db.Host.get_uuid ~__context ~self:host in
let elapsed = Clock.Timer.elapsed timer in
debug
"Assuming host is offline since the heartbeat/metrics haven't been \
updated for %.2f seconds; setting live to false"
(now -. host_time) ;
"Assuming host '%s' (%s) is offline since the heartbeat hasn't \
been updated for %s seconds; setting live to false"
host_name_label host_uuid
(Clock.Timer.span_to_s elapsed |> string_of_float) ;
Db.Host_metrics.set_live ~__context ~self:hmetric ~value:false ;
Xapi_host_helpers.update_allowed_operations ~__context ~self:host
) ;
Expand Down Expand Up @@ -252,16 +236,18 @@ let tickle_heartbeat ~__context host stuff =
let reason = Xapi_hooks.reason__clean_shutdown in
if use_host_heartbeat_for_liveness then
Xapi_host_helpers.mark_host_as_dead ~__context ~host ~reason
) else
) else (
Hashtbl.replace host_heartbeat_table host
(Clock.Timer.start ~duration:!Xapi_globs.host_assumed_dead_interval) ;
let now = Unix.gettimeofday () in
Hashtbl.replace host_heartbeat_table host now ;
(* compute the clock skew for later analysis *)
if List.mem_assoc _time stuff then
try
let slave = float_of_string (List.assoc _time stuff) in
let skew = abs_float (now -. slave) in
Hashtbl.replace host_skew_table host skew
with _ -> ()
)
) ;
[]

Expand Down
6 changes: 4 additions & 2 deletions ocaml/xapi/xapi_globs.ml
Original file line number Diff line number Diff line change
Expand Up @@ -707,7 +707,7 @@ let snapshot_with_quiesce_timeout = ref 600.
let host_heartbeat_interval = ref 30.

(* If we haven't heard a heartbeat from a host for this interval then the host is assumed dead *)
let host_assumed_dead_interval = ref 600.0
let host_assumed_dead_interval = ref Mtime.Span.(10 * min)

(* If a session has a last_active older than this we delete it *)
let inactive_session_timeout = ref 86400. (* 24 hrs in seconds *)
Expand Down Expand Up @@ -1074,7 +1074,9 @@ let xapi_globs_spec =
; ("wait_memory_target_timeout", Float wait_memory_target_timeout)
; ("snapshot_with_quiesce_timeout", Float snapshot_with_quiesce_timeout)
; ("host_heartbeat_interval", Float host_heartbeat_interval)
; ("host_assumed_dead_interval", Float host_assumed_dead_interval)
; ( "host_assumed_dead_interval"
, LongDurationFromSeconds host_assumed_dead_interval
)
; ("fuse_time", Float Constants.fuse_time)
; ("db_restore_fuse_time", Float Constants.db_restore_fuse_time)
; ("inactive_session_timeout", Float inactive_session_timeout)
Expand Down
2 changes: 1 addition & 1 deletion ocaml/xapi/xapi_ha.ml
Original file line number Diff line number Diff line change
Expand Up @@ -837,7 +837,7 @@ module Monitor = struct
(ExnHelper.string_of_exn e) ;
Thread.delay !Xapi_globs.ha_monitor_interval
done ;
debug "Re-enabling old Host_metrics.live heartbeat" ;
debug "Re-enabling host heartbeat" ;
with_lock Db_gc.use_host_heartbeat_for_liveness_m (fun () ->
Db_gc.use_host_heartbeat_for_liveness := true
) ;
Expand Down

0 comments on commit 090e846

Please sign in to comment.