Skip to content

Commit

Permalink
CA-392163 on start failure, clear a VM's resource allocations
Browse files Browse the repository at this point in the history
As part of a start, resources are allocated for a VM in "scheduled_to.."
fields. These need to be cleared if the start fails. It turned out that
this was incomplete for PCI slots and those were leaking. This patch
tries to be more systematical about it.

Signed-off-by: Christian Lindig <[email protected]>
  • Loading branch information
Christian Lindig authored and lindig committed May 8, 2024
1 parent bdbf705 commit d0879c7
Showing 1 changed file with 33 additions and 17 deletions.
50 changes: 33 additions & 17 deletions ocaml/xapi/message_forwarding.ml
Original file line number Diff line number Diff line change
Expand Up @@ -1346,18 +1346,8 @@ functor
else
local_fn ~__context

(* Clear scheduled_to_be_resident_on for a VM and all its vGPUs. *)
let clear_scheduled_to_be_resident_on ~__context ~vm =
Db.VM.set_scheduled_to_be_resident_on ~__context ~self:vm
~value:Ref.null ;
List.iter
(fun vgpu ->
Db.VGPU.set_scheduled_to_be_resident_on ~__context ~self:vgpu
~value:Ref.null
)
(Db.VM.get_VGPUs ~__context ~self:vm)

let clear_reserved_netsriov_vfs_on ~__context ~vm =
let clear_vif_reservations ~__context ~vm =
debug "%s VM=%s" __FUNCTION__ (Ref.string_of vm) ;
Db.VM.get_VIFs ~__context ~self:vm
|> List.iter (fun vif ->
let vf = Db.VIF.get_reserved_pci ~__context ~self:vif in
Expand All @@ -1367,6 +1357,32 @@ functor
~value:Ref.null
)

let clear_reservations ~__context ~vm =
debug "%s VM=%s" __FUNCTION__ (Ref.string_of vm) ;
(* host *)
Db.VM.set_scheduled_to_be_resident_on ~__context ~self:vm
~value:Ref.null ;
(* vgpu *)
Db.VM.get_VGPUs ~__context ~self:vm
|> List.iter (fun vgpu ->
Db.VGPU.set_scheduled_to_be_resident_on ~__context ~self:vgpu
~value:Ref.null
) ;
(* pcis *)
Db.PCI.get_refs_where ~__context
~expr:
(Eq (Field "scheduled_to_be_attached_to", Literal (Ref.string_of vm))
)
|> List.iter (function
| pci when pci <> Ref.null ->
debug "%s: clearing reservation of PCI %s for VM %s"
__FUNCTION__ (Ref.string_of pci) (Ref.string_of vm) ;
Db.PCI.set_scheduled_to_be_attached_to ~__context ~self:pci
~value:Ref.null
| _ ->
()
)

(* Notes on memory checking/reservation logic:
When computing the hosts free memory we consider all VMs resident_on (ie running
and consuming resources NOW) and scheduled_to_be_resident_on (ie those which are
Expand Down Expand Up @@ -1399,8 +1415,8 @@ functor
(Helpers.will_have_qemu ~__context ~self:vm) ;
Xapi_network_sriov_helpers.reserve_sriov_vfs ~__context ~host ~vm
with e ->
clear_scheduled_to_be_resident_on ~__context ~vm ;
clear_reserved_netsriov_vfs_on ~__context ~vm ;
clear_vif_reservations ~__context ~vm ;
clear_reservations ~__context ~vm ;
raise e

(* For start/start_on/resume/resume_on/migrate *)
Expand Down Expand Up @@ -1469,7 +1485,7 @@ functor
?host_op () ;
(* In certain cases, VM might have been destroyed as a consequence of operation *)
if Db.is_valid_ref __context vm then
clear_scheduled_to_be_resident_on ~__context ~vm
clear_reservations ~__context ~vm
)
)

Expand All @@ -1488,7 +1504,7 @@ functor
finally f (fun () ->
Helpers.with_global_lock (fun () ->
finally_clear_host_operation ~__context ~host ?host_op () ;
clear_scheduled_to_be_resident_on ~__context ~vm
clear_reservations ~__context ~vm
)
)

Expand Down Expand Up @@ -2544,7 +2560,7 @@ functor
Helpers.with_global_lock (fun () ->
finally_clear_host_operation ~__context ~host
~host_op:`vm_migrate () ;
clear_scheduled_to_be_resident_on ~__context ~vm
clear_reservations ~__context ~vm
)
in
finally
Expand Down

0 comments on commit d0879c7

Please sign in to comment.