Skip to content

Commit

Permalink
CA-380580: cross-pool migration - no CPU checks for halted VMs, move …
Browse files Browse the repository at this point in the history
…CPU check to the target host (#6175)

Rebased the work from 2023 merged in #5111 and #5132, that caused issues
and was partially fixed in #5148, but was completely reverted in #5147.
I've integrated the fix from #5148 and additionally the fix suggested by
@minglumlu in CA-380715 that was not merged at the time due to time
constraints.

This series passed the tests that were originally failing: sxm-unres
(Job ID 4177739), vGPUSXMM60CrossPool (4177750), and also passed the
Ring3 BST+BVT (209341). I can run more migration tests if needed - I've
heard @Vincent-lau has requested for these to be separated into its own
suite instead of being only in Core and Distribution regression tests.
  • Loading branch information
last-genius authored Dec 13, 2024
2 parents 9aa2baa + b0f93fc commit 88534a0
Show file tree
Hide file tree
Showing 10 changed files with 150 additions and 39 deletions.
38 changes: 27 additions & 11 deletions ocaml/xapi/cpuid_helpers.ml
Original file line number Diff line number Diff line change
Expand Up @@ -47,9 +47,9 @@ let threads_per_core = Map_check.(field "threads_per_core" int)

let vendor = Map_check.(field "vendor" string)

let get_flags_for_vm ~__context vm cpu_info =
let get_flags_for_vm ~__context domain_type cpu_info =
let features_field =
match Helpers.domain_type ~__context ~self:vm with
match domain_type with
| `hvm | `pv_in_pvh | `pvh ->
features_hvm
| `pv ->
Expand Down Expand Up @@ -79,36 +79,51 @@ let next_boot_cpu_features ~__context ~vm =
Map_check.getf features_field_boot pool_cpu_info
|> Xenops_interface.CPU_policy.to_string

let get_host_cpu_info ~__context ~vm:_ ~host ?remote () =
let get_host_cpu_info ~__context ~host ?remote () =
match remote with
| None ->
Db.Host.get_cpu_info ~__context ~self:host
| Some (rpc, session_id) ->
Client.Client.Host.get_cpu_info ~rpc ~session_id ~self:host

let get_host_compatibility_info ~__context ~vm ~host ?remote () =
get_host_cpu_info ~__context ~vm ~host ?remote ()
|> get_flags_for_vm ~__context vm
let get_host_compatibility_info ~__context ~domain_type ~host ?remote () =
get_host_cpu_info ~__context ~host ?remote ()
|> get_flags_for_vm ~__context domain_type

(* Compare the CPU on which the given VM was last booted to the CPU of the given host. *)
let assert_vm_is_compatible ~__context ~vm ~host ?remote () =
let assert_vm_is_compatible ~__context ~vm ~host =
let vm_ref, vm_rec, domain_type =
match vm with
| `db self ->
( self
, Db.VM.get_record ~__context ~self
, Helpers.domain_type ~__context ~self
)
| `import (vm_rec, dt) ->
(* Ref.null, because the VM to be imported does not yet have a ref *)
(Ref.null, vm_rec, Helpers.check_domain_type dt)
in
let fail msg =
raise
(Api_errors.Server_error
( Api_errors.vm_incompatible_with_this_host
, [Ref.string_of vm; Ref.string_of host; msg]
, [Ref.string_of vm_ref; Ref.string_of host; msg]
)
)
in
if Db.VM.get_power_state ~__context ~self:vm <> `Halted then
if vm_rec.API.vM_power_state <> `Halted then (
let host_uuid = Db.Host.get_uuid ~__context ~self:host in
debug "Checking CPU compatibility of %s VM %s with host %s"
(Record_util.domain_type_to_string domain_type)
vm_rec.API.vM_uuid host_uuid ;
let open Xapi_xenops_queue in
let module Xenopsd = (val make_client (default_xenopsd ()) : XENOPS) in
let dbg = Context.string_of_task __context in
try
let host_cpu_vendor, host_cpu_features =
get_host_compatibility_info ~__context ~vm ~host ?remote ()
get_host_compatibility_info ~__context ~domain_type ~host ()
in
let vm_cpu_info = Db.VM.get_last_boot_CPU_flags ~__context ~self:vm in
let vm_cpu_info = vm_rec.API.vM_last_boot_CPU_flags in
if List.mem_assoc cpu_info_vendor_key vm_cpu_info then (
(* Check the VM was last booted on a CPU with the same vendor as this host's CPU. *)
let vm_cpu_vendor = List.assoc cpu_info_vendor_key vm_cpu_info in
Expand Down Expand Up @@ -141,3 +156,4 @@ let assert_vm_is_compatible ~__context ~vm ~host ?remote () =
fail
"Host does not have new leveling feature keys - not comparing VM's \
flags"
)
8 changes: 4 additions & 4 deletions ocaml/xapi/cpuid_helpers.mli
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,12 @@ val next_boot_cpu_features : __context:Context.t -> vm:[`VM] API.Ref.t -> string

val assert_vm_is_compatible :
__context:Context.t
-> vm:[`VM] API.Ref.t
-> vm:[`db of [`VM] API.Ref.t | `import of API.vM_t * API.domain_type]
-> host:[`host] API.Ref.t
-> ?remote:(Rpc.call -> Rpc.response Client.Id.t) * [< `session] Ref.t
-> unit
-> unit
(** Checks whether the CPU vendor and features used by the VM are compatible
with the given host. The VM can be one that is currently in the DB, or a record
coming from a metadata import as used for cross-pool migration. *)

val vendor : string Map_check.field

Expand All @@ -42,7 +43,6 @@ val features_hvm_host : [`host] Xenops_interface.CPU_policy.t Map_check.field

val get_host_cpu_info :
__context:Context.t
-> vm:[`VM] API.Ref.t
-> host:[`host] API.Ref.t
-> ?remote:(Rpc.call -> Rpc.response Client.Id.t) * [< `session] Ref.t
-> unit
Expand Down
19 changes: 17 additions & 2 deletions ocaml/xapi/export.ml
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,8 @@ let rec update_table ~__context ~include_snapshots ~preserve_power_state
&& Db.is_valid_ref __context vdi
then
add_vdi vdi ;
(* Add also the guest metrics *)
(* Add also the metrics and guest metrics *)
add vm.API.vM_metrics ;
add vm.API.vM_guest_metrics ;
(* Add the hosts links *)
add vm.API.vM_resident_on ;
Expand Down Expand Up @@ -264,7 +265,7 @@ let make_vm ?(with_snapshot_metadata = false) ~preserve_power_state table
; API.vM_resident_on= lookup table (Ref.string_of vm.API.vM_resident_on)
; API.vM_affinity= lookup table (Ref.string_of vm.API.vM_affinity)
; API.vM_consoles= []
; API.vM_metrics= Ref.null
; API.vM_metrics= lookup table (Ref.string_of vm.API.vM_metrics)
; API.vM_guest_metrics= lookup table (Ref.string_of vm.API.vM_guest_metrics)
; API.vM_protection_policy= Ref.null
; API.vM_bios_strings= vm.API.vM_bios_strings
Expand All @@ -277,6 +278,15 @@ let make_vm ?(with_snapshot_metadata = false) ~preserve_power_state table
; snapshot= API.rpc_of_vM_t vm
}

(** Convert a VM metrics reference to an obj *)
let make_vmm table __context self =
let vmm = Db.VM_metrics.get_record ~__context ~self in
{
cls= Datamodel_common._vm_metrics
; id= Ref.string_of (lookup table (Ref.string_of self))
; snapshot= API.rpc_of_vM_metrics_t vmm
}

(** Convert a guest-metrics reference to an obj *)
let make_gm table __context self =
let gm = Db.VM_guest_metrics.get_record ~__context ~self in
Expand Down Expand Up @@ -506,6 +516,10 @@ let make_all ~with_snapshot_metadata ~preserve_power_state table __context =
(make_vm ~with_snapshot_metadata ~preserve_power_state table __context)
(filter table (Db.VM.get_all ~__context))
in
let vmms =
List.map (make_vmm table __context)
(filter table (Db.VM_metrics.get_all ~__context))
in
let gms =
List.map (make_gm table __context)
(filter table (Db.VM_guest_metrics.get_all ~__context))
Expand Down Expand Up @@ -566,6 +580,7 @@ let make_all ~with_snapshot_metadata ~preserve_power_state table __context =
[
hosts
; vms
; vmms
; gms
; vbds
; vifs
Expand Down
86 changes: 81 additions & 5 deletions ocaml/xapi/import.ml
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ type metadata_options = {
* - If the migration is for real, we will expect the VM export code on the source host to have mapped the VDI locations onto their
* mirrored counterparts which are present on this host. *)
live: bool
; check_cpu: bool
; (* An optional src VDI -> destination VDI rewrite list *)
vdi_map: (string * string) list
}
Expand All @@ -75,6 +76,13 @@ type config = {
let is_live config =
match config.import_type with Metadata_import {live; _} -> live | _ -> false

let needs_cpu_check config =
match config.import_type with
| Metadata_import {check_cpu; _} ->
check_cpu
| _ ->
false

(** List of (datamodel classname * Reference in export * Reference in database) *)
type table = (string * string * string) list

Expand Down Expand Up @@ -253,8 +261,8 @@ let assert_can_restore_backup ~__context rpc session_id (x : header) =
import_vms

let assert_can_live_import __context vm_record =
let host = Helpers.get_localhost ~__context in
let assert_memory_available () =
let host = Helpers.get_localhost ~__context in
let host_mem_available =
Memory_check.host_compute_free_memory_with_maximum_compression ~__context
~host None
Expand Down Expand Up @@ -416,7 +424,7 @@ module VM : HandlerTools = struct
| Skip
| Clean_import of API.vM_t

let precheck __context config _rpc _session_id _state x =
let precheck __context config _rpc _session_id state x =
let vm_record = get_vm_record x.snapshot in
let is_default_template =
vm_record.API.vM_is_default_template
Expand Down Expand Up @@ -511,6 +519,28 @@ module VM : HandlerTools = struct
| Replace (_, vm_record) | Clean_import vm_record ->
if is_live config then
assert_can_live_import __context vm_record ;
( if needs_cpu_check config then
let vmm_record =
find_in_export
(Ref.string_of vm_record.API.vM_metrics)
state.export
|> API.vM_metrics_t_of_rpc
in
let host = Helpers.get_localhost ~__context in
(* Don't check CPUID on 'unspecified' snapshots *)
match
( vm_record.API.vM_is_a_snapshot
, vmm_record.API.vM_metrics_current_domain_type
)
with
| true, `unspecified ->
(* A snapshot which was taken from a shutdown VM. *)
()
| _, dt ->
Cpuid_helpers.assert_vm_is_compatible ~__context
~vm:(`import (vm_record, dt))
~host
) ;
import_action
| _ ->
import_action
Expand Down Expand Up @@ -730,6 +760,15 @@ module VM : HandlerTools = struct
then
Db.VM.set_suspend_SR ~__context ~self:vm ~value:Ref.null ;
Db.VM.set_parent ~__context ~self:vm ~value:vm_record.API.vM_parent ;
( try
let vmm = lookup vm_record.API.vM_metrics state.table in
(* We have VM_metrics in the imported metadata, so use it, and destroy
the record created by VM.create_from_record above. *)
let replaced_vmm = Db.VM.get_metrics ~__context ~self:vm in
Db.VM.set_metrics ~__context ~self:vm ~value:vmm ;
Db.VM_metrics.destroy ~__context ~self:replaced_vmm
with _ -> ()
) ;
( try
let gm = lookup vm_record.API.vM_guest_metrics state.table in
Db.VM.set_guest_metrics ~__context ~self:vm ~value:gm
Expand Down Expand Up @@ -805,6 +844,40 @@ module GuestMetrics : HandlerTools = struct
state.table <- (x.cls, x.id, Ref.string_of gm) :: state.table
end

(** Create the VM metrics *)
module Metrics : HandlerTools = struct
type precheck_t = OK

let precheck __context _config _rpc _session_id _state _x = OK

let handle_dry_run __context _config _rpc _session_id state x _precheck_result
=
let dummy_gm = Ref.make () in
state.table <- (x.cls, x.id, Ref.string_of dummy_gm) :: state.table

let handle __context _config _rpc _session_id state x _precheck_result =
let vmm_record = API.vM_metrics_t_of_rpc x.snapshot in
let vmm = Ref.make () in
Db.VM_metrics.create ~__context ~ref:vmm
~uuid:(Uuidx.to_string (Uuidx.make ()))
~memory_actual:vmm_record.API.vM_metrics_memory_actual
~vCPUs_number:vmm_record.API.vM_metrics_VCPUs_number
~vCPUs_utilisation:vmm_record.API.vM_metrics_VCPUs_utilisation
~vCPUs_CPU:vmm_record.API.vM_metrics_VCPUs_CPU
~vCPUs_params:vmm_record.API.vM_metrics_VCPUs_params
~vCPUs_flags:vmm_record.API.vM_metrics_VCPUs_flags
~state:vmm_record.API.vM_metrics_state
~start_time:vmm_record.API.vM_metrics_start_time
~install_time:vmm_record.API.vM_metrics_install_time
~last_updated:vmm_record.API.vM_metrics_last_updated
~other_config:vmm_record.API.vM_metrics_other_config
~hvm:vmm_record.API.vM_metrics_hvm
~nested_virt:vmm_record.API.vM_metrics_nested_virt
~nomigrate:vmm_record.API.vM_metrics_nomigrate
~current_domain_type:vmm_record.API.vM_metrics_current_domain_type ;
state.table <- (x.cls, x.id, Ref.string_of vmm) :: state.table
end

(** If we're restoring VM metadata only then lookup the SR by uuid. If we can't find
the SR then we will still try to match VDIs later (except CDROMs) *)
module SR : HandlerTools = struct
Expand Down Expand Up @@ -1910,6 +1983,7 @@ module HostHandler = MakeHandler (Host)
module SRHandler = MakeHandler (SR)
module VDIHandler = MakeHandler (VDI)
module GuestMetricsHandler = MakeHandler (GuestMetrics)
module MetricsHandler = MakeHandler (Metrics)
module VMHandler = MakeHandler (VM)
module NetworkHandler = MakeHandler (Net)
module GPUGroupHandler = MakeHandler (GPUGroup)
Expand All @@ -1928,6 +2002,7 @@ let handlers =
; (Datamodel_common._sr, SRHandler.handle)
; (Datamodel_common._vdi, VDIHandler.handle)
; (Datamodel_common._vm_guest_metrics, GuestMetricsHandler.handle)
; (Datamodel_common._vm_metrics, MetricsHandler.handle)
; (Datamodel_common._vm, VMHandler.handle)
; (Datamodel_common._network, NetworkHandler.handle)
; (Datamodel_common._gpu_group, GPUGroupHandler.handle)
Expand Down Expand Up @@ -2265,13 +2340,14 @@ let metadata_handler (req : Request.t) s _ =
let force = find_query_flag req.Request.query "force" in
let dry_run = find_query_flag req.Request.query "dry_run" in
let live = find_query_flag req.Request.query "live" in
let check_cpu = find_query_flag req.Request.query "check_cpu" in
let vdi_map = read_map_params "vdi" req.Request.query in
info
"VM.import_metadata: force = %b; full_restore = %b dry_run = %b; \
live = %b; vdi_map = [ %s ]"
force full_restore dry_run live
live = %b; check_cpu = %b; vdi_map = [ %s ]"
force full_restore dry_run live check_cpu
(String.concat "; " (List.map (fun (a, b) -> a ^ "=" ^ b) vdi_map)) ;
let metadata_options = {dry_run; live; vdi_map} in
let metadata_options = {dry_run; live; vdi_map; check_cpu} in
let config =
{import_type= Metadata_import metadata_options; full_restore; force}
in
Expand Down
4 changes: 3 additions & 1 deletion ocaml/xapi/importexport.ml
Original file line number Diff line number Diff line change
Expand Up @@ -249,6 +249,7 @@ type vm_export_import = {
; dry_run: bool
; live: bool
; send_snapshots: bool
; check_cpu: bool
}

(* Copy VM metadata to a remote pool *)
Expand All @@ -269,11 +270,12 @@ let remote_metadata_export_import ~__context ~rpc ~session_id ~remote_address
match which with
| `All ->
[]
| `Only {live; dry_run; send_snapshots; _} ->
| `Only {live; dry_run; send_snapshots; check_cpu; _} ->
[
Printf.sprintf "live=%b" live
; Printf.sprintf "dry_run=%b" dry_run
; Printf.sprintf "export_snapshots=%b" send_snapshots
; Printf.sprintf "check_cpu=%b" check_cpu
]
in
let params = Printf.sprintf "restore=%b" restore :: params in
Expand Down
2 changes: 1 addition & 1 deletion ocaml/xapi/message_forwarding.ml
Original file line number Diff line number Diff line change
Expand Up @@ -2473,7 +2473,7 @@ functor
try bool_of_string (List.assoc "force" options) with _ -> false
in
if not force then
Cpuid_helpers.assert_vm_is_compatible ~__context ~vm ~host () ;
Cpuid_helpers.assert_vm_is_compatible ~__context ~vm:(`db vm) ~host ;
let source_host = Db.VM.get_resident_on ~__context ~self:vm in
with_vm_operation ~__context ~self:vm ~doc:"VM.pool_migrate"
~op:`pool_migrate ~strict:(not force) (fun () ->
Expand Down
1 change: 1 addition & 0 deletions ocaml/xapi/xapi_dr.ml
Original file line number Diff line number Diff line change
Expand Up @@ -279,6 +279,7 @@ let recover_vms ~__context ~vms ~session_to ~force =
{
Import.dry_run= false
; Import.live= false
; check_cpu= false
; vdi_map= [] (* we expect the VDI metadata to be present *)
}
in
Expand Down
2 changes: 1 addition & 1 deletion ocaml/xapi/xapi_vm.ml
Original file line number Diff line number Diff line change
Expand Up @@ -566,7 +566,7 @@ let resume ~__context ~vm ~start_paused ~force =
) ;
let host = Helpers.get_localhost ~__context in
if not force then
Cpuid_helpers.assert_vm_is_compatible ~__context ~vm ~host () ;
Cpuid_helpers.assert_vm_is_compatible ~__context ~vm:(`db vm) ~host ;
(* Update CPU feature set, which will be passed to xenopsd *)
Xapi_xenops.resume ~__context ~self:vm ~start_paused ~force

Expand Down
4 changes: 2 additions & 2 deletions ocaml/xapi/xapi_vm_helpers.ml
Original file line number Diff line number Diff line change
Expand Up @@ -628,7 +628,7 @@ let assert_matches_control_domain_affinity ~__context ~self ~host =
let assert_enough_pcpus ~__context ~self ~host ?remote () =
let vcpus = Db.VM.get_VCPUs_max ~__context ~self in
let pcpus =
Cpuid_helpers.get_host_cpu_info ~__context ~vm:self ~host ?remote ()
Cpuid_helpers.get_host_cpu_info ~__context ~host ?remote ()
|> Map_check.getf Cpuid_helpers.cpu_count
|> Int64.of_int
in
Expand Down Expand Up @@ -699,7 +699,7 @@ let assert_can_boot_here ~__context ~self ~host ~snapshot ~do_cpuid_check
assert_hardware_platform_support ~__context ~vm:self
~host:(Helpers.LocalObject host) ;
if do_cpuid_check then
Cpuid_helpers.assert_vm_is_compatible ~__context ~vm:self ~host () ;
Cpuid_helpers.assert_vm_is_compatible ~__context ~vm:(`db self) ~host ;
if do_sr_check then
assert_can_see_SRs ~__context ~self ~host ;
assert_can_see_networks ~__context ~self ~host ;
Expand Down
Loading

0 comments on commit 88534a0

Please sign in to comment.