Skip to content

Commit

Permalink
CP-49635: Add FIST point for corosync upgrade
Browse files Browse the repository at this point in the history
This forces a failure on a host that is trying to perform a corosync
upgrade. There are ways to recover: if the failure happens early, before
the cluster is created in the DB, then recreating the cluster ought to fix
the problem. This early failure is what happens when the corosync upgrade
fails on the coordinator.

If the failure happens after the cluster is created on a pool member,
then a `pool-resync` should help retry this upgrade.

Hopefully this can simulate some of the failure paths, but is by no
means exhaustive. Other more complicated failures are not easily
recoverable and therefore not simulated for now.

Signed-off-by: Vincent Liu <[email protected]>
  • Loading branch information
Vincent-lau committed May 21, 2024
1 parent 772b6cb commit 43e710b
Show file tree
Hide file tree
Showing 2 changed files with 19 additions and 13 deletions.
29 changes: 16 additions & 13 deletions ocaml/xapi/xapi_clustering.ml
Original file line number Diff line number Diff line change
Expand Up @@ -328,19 +328,22 @@ let rpc ~__context =

(** [maybe_switch_cluster_stack_version ~__context ~self ~cluster_stack]
    calls [Cluster_client.LocalClient.switch_cluster_stack] with
    [cluster_stack], but only when
    [Xapi_cluster_helpers.corosync3_enabled ~__context] is true; there is no
    [else] branch, so nothing is done otherwise. [self] is only used in the
    log messages (via [Ref.string_of]); presumably a cluster_host reference.
    An [Error] result from the call is logged and passed to [handle_error].

    NOTE(review): what follows is a flattened diff hunk, not a single
    compilable body. The pre-commit body and the post-commit body appear back
    to back because the +/- markers and indentation were lost; the split
    below is inferred from the "16 additions & 13 deletions" header. *)
let maybe_switch_cluster_stack_version ~__context ~self ~cluster_stack =
if Xapi_cluster_helpers.corosync3_enabled ~__context then
(* ---- removed by this commit (13 lines): switch attempted unconditionally ---- *)
let dbg = Context.string_of_task_and_tracing __context in
let result =
Cluster_client.LocalClient.switch_cluster_stack (rpc ~__context) dbg
cluster_stack
in
match Idl.IdM.run @@ Cluster_client.IDL.T.get result with
| Ok () ->
debug "cluster stack switching was successful for cluster_host: %s"
(Ref.string_of self)
| Error error ->
warn "Error encountered when switching cluster stack cluster_host %s"
(Ref.string_of self) ;
handle_error error
(* ---- added by this commit (16 lines): same logic behind a FIST check ---- *)
(* If the [fail_corosync_upgrade] FIST point fires, skip the call entirely
   and report a simulated [InternalError] through the usual error path. *)
if Xapi_fist.fail_corosync_upgrade () then
handle_error (InternalError "simulated corosync upgrade failure")
else
(* Normal path: identical to the removed body above. *)
let dbg = Context.string_of_task_and_tracing __context in
let result =
Cluster_client.LocalClient.switch_cluster_stack (rpc ~__context) dbg
cluster_stack
in
match Idl.IdM.run @@ Cluster_client.IDL.T.get result with
| Ok () ->
debug "cluster stack switching was successful for cluster_host: %s"
(Ref.string_of self)
| Error error ->
warn "Error encountered when switching cluster stack cluster_host %s"
(Ref.string_of self) ;
handle_error error

let assert_cluster_host_quorate ~__context ~self =
(* With the latest kernel GFS2 would hang on mount if clustering is not working yet,
Expand Down
3 changes: 3 additions & 0 deletions ocaml/xapi/xapi_fist.ml
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,9 @@ let reconfigure_host () = fistpoint "reconfigure_host"
(** Allow starting up a corosync2 cluster.
    Evaluates [fistpoint "allow_corosync2"]; [fistpoint] is defined outside
    this view, presumably returning [true] when the FIST point is active. *)
let allow_corosync2 () = fistpoint "allow_corosync2"

(** Make the current node fail the corosync upgrade.
    Checked by [maybe_switch_cluster_stack_version] in [xapi_clustering.ml]:
    when this evaluates to [true], the cluster stack switch is not attempted
    and a simulated [InternalError] is passed to [handle_error] instead. *)
let fail_corosync_upgrade () = fistpoint "fail_corosync_upgrade"

(** Raise MTC_EXIT_CAN_NOT_ACCESS_STATEFILE.
    NOTE(review): this function only evaluates
    [fistpoint "ha_cannot_access_statefile"]; the raise itself presumably
    happens at the call site — confirm against the callers. *)
let ha_cannot_access_statefile () = fistpoint "ha_cannot_access_statefile"

Expand Down

0 comments on commit 43e710b

Please sign in to comment.