From 71c39605b3a2dd6eddeba42a5b482b4fc1b3a4e8 Mon Sep 17 00:00:00 2001 From: Pau Ruiz Safont Date: Mon, 8 Jul 2024 14:14:52 +0100 Subject: [PATCH 1/5] CA-395174: Try to unarchive VM's metrics when they aren't running Non-running VMs' metrics are stored in the coordinator. When the coordinator is asked about the metrics try to unarchive them instead of failing while trying to fetch the coordinator's IP address. This needs to force the HTTP method of the query to be POST Also returns a Service Unavailable when the host is marked as Broken. Signed-off-by: Pau Ruiz Safont --- ocaml/libs/http-lib/http.ml | 7 +++++++ ocaml/libs/http-lib/http.mli | 2 ++ ocaml/xapi/rrdd_proxy.ml | 30 +++++++++++++++++++++--------- 3 files changed, 30 insertions(+), 9 deletions(-) diff --git a/ocaml/libs/http-lib/http.ml b/ocaml/libs/http-lib/http.ml index c6ff41be709..09dc4a66ed4 100644 --- a/ocaml/libs/http-lib/http.ml +++ b/ocaml/libs/http-lib/http.ml @@ -94,6 +94,13 @@ let http_501_method_not_implemented ?(version = "1.0") () = ; "Cache-Control: no-cache, no-store" ] +let http_503_service_unavailable ?(version = "1.0") () = + [ + Printf.sprintf "HTTP/%s 503 Service Unavailable" version + ; "Connection: close" + ; "Cache-Control: no-cache, no-store" + ] + module Hdr = struct let task_id = "task-id" diff --git a/ocaml/libs/http-lib/http.mli b/ocaml/libs/http-lib/http.mli index 687c4d2f8c7..384367e2463 100644 --- a/ocaml/libs/http-lib/http.mli +++ b/ocaml/libs/http-lib/http.mli @@ -189,6 +189,8 @@ val http_500_internal_server_error : ?version:string -> unit -> string list val http_501_method_not_implemented : ?version:string -> unit -> string list +val http_503_service_unavailable : ?version:string -> unit -> string list + module Hdr : sig val task_id : string (** Header used for task id *) diff --git a/ocaml/xapi/rrdd_proxy.ml b/ocaml/xapi/rrdd_proxy.ml index 68b04862f73..fdea2a40373 100644 --- a/ocaml/xapi/rrdd_proxy.ml +++ b/ocaml/xapi/rrdd_proxy.ml @@ -75,17 +75,19 @@ let get_vm_rrd_forwarder (req : Http.Request.t) (s : Unix.file_descr) _ = Http_svr.headers s (Http.http_302_redirect url) in let unarchive () = - let req = {req with uri= Constants.rrd_unarchive_uri} in + let req = {req with m= Post; uri= Constants.rrd_unarchive_uri} in ignore (Xapi_services.hand_over_connection req s !Rrd_interface.forwarded_path ) in + let unavailable () = + Http_svr.headers s (Http.http_503_service_unavailable ()) + in (* List of conditions involved. *) let is_unarchive_request = List.mem_assoc Constants.rrd_unarchive query in - let is_master = Pool_role.is_master () in let is_owner_online owner = Db.is_valid_ref __context owner in let is_xapi_initialising = List.mem_assoc "dbsync" query in (* The logic. *) @@ -97,15 +99,25 @@ let get_vm_rrd_forwarder (req : Http.Request.t) (s : Unix.file_descr) _ = let owner = Db.VM.get_resident_on ~__context ~self:vm_ref in let owner_uuid = Db.Host.get_uuid ~__context ~self:owner in let is_owner_localhost = owner_uuid = localhost_uuid in - if is_owner_localhost then - if is_master then + let owner_is_available = + is_owner_online owner && not is_xapi_initialising + in + match + (Pool_role.get_role (), is_owner_localhost, owner_is_available) + with + | (Master | Slave _), false, true -> + (* VM is running elsewhere *) + read_at_owner owner + | Master, true, _ | Master, false, false -> + (* VM running on node, or not running at all. *) unarchive () - else + | Slave _, true, _ | Slave _, _, false -> + (* Coordinator knows best *) unarchive_at_master () - else if is_owner_online owner && not is_xapi_initialising then - read_at_owner owner - else - unarchive_at_master () + | Broken, _, _ -> + info "%s: host is broken, VM's metrics are not available" + __FUNCTION__ ; + unavailable () ) (* Forward the request for host RRD data to the RRDD HTTP handler. If the host From 7fe19554f3dcfa99b4e72015a7c62974a4a19424 Mon Sep 17 00:00:00 2001 From: Pau Ruiz Safont Date: Mon, 8 Jul 2024 14:29:40 +0100 Subject: [PATCH 2/5] rrdd_proxy: Change *_at to specify the IP address Forces users to use an address, instead of being implicit, this avoid the underlying cause for the issue fixed in the previous commit: it allowed a coordinator to call Pool_role.get_master_address, which always fails. Signed-off-by: Pau Ruiz Safont --- ocaml/xapi/rrdd_proxy.ml | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/ocaml/xapi/rrdd_proxy.ml b/ocaml/xapi/rrdd_proxy.ml index fdea2a40373..3ca7687f361 100644 --- a/ocaml/xapi/rrdd_proxy.ml +++ b/ocaml/xapi/rrdd_proxy.ml @@ -68,8 +68,7 @@ let get_vm_rrd_forwarder (req : Http.Request.t) (s : Unix.file_descr) _ = let url = make_url ~address ~req in Http_svr.headers s (Http.http_302_redirect url) in - let unarchive_at_master () = - let address = Pool_role.get_master_address () in + let unarchive_at address = let query = (Constants.rrd_unarchive, "") :: query in let url = make_url_from_query ~address ~uri:req.uri ~query in Http_svr.headers s (Http.http_302_redirect url) @@ -111,9 +110,9 @@ let get_vm_rrd_forwarder (req : Http.Request.t) (s : Unix.file_descr) _ = | Master, true, _ | Master, false, false -> (* VM running on node, or not running at all. *) unarchive () - | Slave _, true, _ | Slave _, _, false -> + | Slave coordinator, true, _ | Slave coordinator, _, false -> (* Coordinator knows best *) - unarchive_at_master () + unarchive_at coordinator | Broken, _, _ -> info "%s: host is broken, VM's metrics are not available" __FUNCTION__ ; From 6bb7702454f291db6815235c9695f41b4d6b1acf Mon Sep 17 00:00:00 2001 From: Pau Ruiz Safont Date: Mon, 8 Jul 2024 14:54:14 +0100 Subject: [PATCH 3/5] rrdd_proxy: Use Option to encode where VMs might be available at This makes the selection of the action obvious, previously the two booleans made it hazy to understand the decision, and was part of the error why the coordinator tried to get the coordinator address from the pool_role file (and failed badly) Signed-off-by: Pau Ruiz Safont --- ocaml/xapi/rrdd_proxy.ml | 50 ++++++++++++++++++++++------------------ 1 file changed, 27 insertions(+), 23 deletions(-) diff --git a/ocaml/xapi/rrdd_proxy.ml b/ocaml/xapi/rrdd_proxy.ml index 3ca7687f361..bec5ef0f84b 100644 --- a/ocaml/xapi/rrdd_proxy.ml +++ b/ocaml/xapi/rrdd_proxy.ml @@ -63,8 +63,7 @@ let get_vm_rrd_forwarder (req : Http.Request.t) (s : Unix.file_descr) _ = Xapi_http.with_context ~dummy:true "Get VM RRD." req s (fun __context -> let open Http.Request in (* List of possible actions. *) - let read_at_owner owner = - let address = Db.Host.get_address ~__context ~self:owner in + let read_at address = let url = make_url ~address ~req in Http_svr.headers s (Http.http_302_redirect url) in @@ -87,33 +86,38 @@ let get_vm_rrd_forwarder (req : Http.Request.t) (s : Unix.file_descr) _ = let is_unarchive_request = List.mem_assoc Constants.rrd_unarchive query in - let is_owner_online owner = Db.is_valid_ref __context owner in - let is_xapi_initialising = List.mem_assoc "dbsync" query in + let metrics_at () = + let ( let* ) = Option.bind in + let owner_of vm = + let owner = Db.VM.get_resident_on ~__context ~self:vm in + let is_xapi_initialising = List.mem_assoc "dbsync" query in + let is_available = not is_xapi_initialising in + if Db.is_valid_ref __context owner && is_available then + Some owner + else + None + in + let* owner = owner_of (Db.VM.get_by_uuid ~__context ~uuid:vm_uuid) in + let owner_uuid = Db.Host.get_uuid ~__context ~self:owner in + if owner_uuid = Helpers.get_localhost_uuid () then + (* VM is local but metrics aren't available *) + None + else + let address = Db.Host.get_address ~__context ~self:owner in + Some address + in (* The logic. *) if is_unarchive_request then unarchive () else - let localhost_uuid = Helpers.get_localhost_uuid () in - let vm_ref = Db.VM.get_by_uuid ~__context ~uuid:vm_uuid in - let owner = Db.VM.get_resident_on ~__context ~self:vm_ref in - let owner_uuid = Db.Host.get_uuid ~__context ~self:owner in - let is_owner_localhost = owner_uuid = localhost_uuid in - let owner_is_available = - is_owner_online owner && not is_xapi_initialising - in - match - (Pool_role.get_role (), is_owner_localhost, owner_is_available) - with - | (Master | Slave _), false, true -> - (* VM is running elsewhere *) - read_at_owner owner - | Master, true, _ | Master, false, false -> - (* VM running on node, or not running at all. *) + match (Pool_role.get_role (), metrics_at ()) with + | (Master | Slave _), Some owner -> + read_at owner + | Master, None -> unarchive () - | Slave coordinator, true, _ | Slave coordinator, _, false -> - (* Coordinator knows best *) + | Slave coordinator, None -> unarchive_at coordinator - | Broken, _, _ -> + | Broken, _ -> info "%s: host is broken, VM's metrics are not available" __FUNCTION__ ; unavailable () From 110c1121f1e5faa0baba1028457819688b9a290e Mon Sep 17 00:00:00 2001 From: Pau Ruiz Safont Date: Thu, 18 Jul 2024 08:47:06 +0100 Subject: [PATCH 4/5] http-lib: avoid double-queries to the radix tree Signed-off-by: Pau Ruiz Safont --- ocaml/libs/http-lib/http_svr.ml | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/ocaml/libs/http-lib/http_svr.ml b/ocaml/libs/http-lib/http_svr.ml index d8718bd68a6..26ad35f712f 100644 --- a/ocaml/libs/http-lib/http_svr.ml +++ b/ocaml/libs/http-lib/http_svr.ml @@ -41,6 +41,8 @@ open D module E = Debug.Make (struct let name = "http_internal_errors" end) +let ( let* ) = Option.bind + type uri_path = string module Stats = struct @@ -296,10 +298,7 @@ module Server = struct let add_handler x ty uri handler = let existing = - if MethodMap.mem ty x.handlers then - MethodMap.find ty x.handlers - else - Radix_tree.empty + Option.value (MethodMap.find_opt ty x.handlers) ~default:Radix_tree.empty in x.handlers <- MethodMap.add ty @@ -307,11 +306,9 @@ module Server = struct x.handlers let find_stats x m uri = - if not (MethodMap.mem m x.handlers) then - None - else - let rt = MethodMap.find m x.handlers in - Option.map (fun te -> te.TE.stats) (Radix_tree.longest_prefix uri rt) + let* rt = MethodMap.find_opt m x.handlers in + let* te = Radix_tree.longest_prefix uri rt in + Some te.TE.stats let all_stats x = let open Radix_tree in From 3658806b80ec5f17032fd3b242e98560cc28a5c4 Mon Sep 17 00:00:00 2001 From: Pau Ruiz Safont Date: Fri, 19 Jul 2024 13:06:16 +0100 Subject: [PATCH 5/5] rrdd_proxy: Return 400 on bad vm request Currently a List.assoc is used, which raises an unhandled exception. Signed-off-by: Pau Ruiz Safont --- ocaml/xapi/rrdd_proxy.ml | 130 +++++++++++++++++++-------------------- 1 file changed, 63 insertions(+), 67 deletions(-) diff --git a/ocaml/xapi/rrdd_proxy.ml b/ocaml/xapi/rrdd_proxy.ml index bec5ef0f84b..a824f77f23a 100644 --- a/ocaml/xapi/rrdd_proxy.ml +++ b/ocaml/xapi/rrdd_proxy.ml @@ -51,76 +51,72 @@ let get_vm_rrd_forwarder (req : Http.Request.t) (s : Unix.file_descr) _ = debug "put_rrd_forwarder: start" ; let query = req.Http.Request.query in req.Http.Request.close <- true ; - let vm_uuid = List.assoc "uuid" query in - if (not (List.mem_assoc "ref" query)) && not (List.mem_assoc "uuid" query) - then - fail_req_with s "get_vm_rrd: missing the 'uuid' parameter" - Http.http_400_badrequest - else if Rrdd.has_vm_rrd vm_uuid then - ignore - (Xapi_services.hand_over_connection req s !Rrd_interface.forwarded_path) - else - Xapi_http.with_context ~dummy:true "Get VM RRD." req s (fun __context -> - let open Http.Request in - (* List of possible actions. *) - let read_at address = - let url = make_url ~address ~req in - Http_svr.headers s (Http.http_302_redirect url) - in - let unarchive_at address = - let query = (Constants.rrd_unarchive, "") :: query in - let url = make_url_from_query ~address ~uri:req.uri ~query in - Http_svr.headers s (Http.http_302_redirect url) - in - let unarchive () = - let req = {req with m= Post; uri= Constants.rrd_unarchive_uri} in - ignore - (Xapi_services.hand_over_connection req s - !Rrd_interface.forwarded_path - ) - in - let unavailable () = - Http_svr.headers s (Http.http_503_service_unavailable ()) - in - (* List of conditions involved. *) - let is_unarchive_request = - List.mem_assoc Constants.rrd_unarchive query - in - let metrics_at () = - let ( let* ) = Option.bind in - let owner_of vm = - let owner = Db.VM.get_resident_on ~__context ~self:vm in - let is_xapi_initialising = List.mem_assoc "dbsync" query in - let is_available = not is_xapi_initialising in - if Db.is_valid_ref __context owner && is_available then - Some owner - else - None - in - let* owner = owner_of (Db.VM.get_by_uuid ~__context ~uuid:vm_uuid) in - let owner_uuid = Db.Host.get_uuid ~__context ~self:owner in - if owner_uuid = Helpers.get_localhost_uuid () then - (* VM is local but metrics aren't available *) - None + match List.assoc_opt "uuid" query with + | None -> + fail_req_with s "get_vm_rrd: missing the 'uuid' parameter" + Http.http_400_badrequest + | Some vm_uuid when Rrdd.has_vm_rrd vm_uuid -> + ignore + (Xapi_services.hand_over_connection req s !Rrd_interface.forwarded_path) + | Some vm_uuid -> ( + Xapi_http.with_context ~dummy:true "Get VM RRD." req s @@ fun __context -> + (* List of possible actions. *) + let read_at address = + let url = make_url ~address ~req in + Http_svr.headers s (Http.http_302_redirect url) + in + let unarchive_at address = + let query = (Constants.rrd_unarchive, "") :: query in + let url = make_url_from_query ~address ~uri:req.uri ~query in + Http_svr.headers s (Http.http_302_redirect url) + in + let unarchive () = + let req = {req with m= Post; uri= Constants.rrd_unarchive_uri} in + ignore + (Xapi_services.hand_over_connection req s + !Rrd_interface.forwarded_path + ) + in + let unavailable () = + Http_svr.headers s (Http.http_503_service_unavailable ()) + in + (* List of conditions involved. *) + let is_unarchive_request = List.mem_assoc Constants.rrd_unarchive query in + let metrics_at () = + let ( let* ) = Option.bind in + let owner_of vm = + let owner = Db.VM.get_resident_on ~__context ~self:vm in + let is_xapi_initialising = List.mem_assoc "dbsync" query in + let is_available = not is_xapi_initialising in + if Db.is_valid_ref __context owner && is_available then + Some owner else - let address = Db.Host.get_address ~__context ~self:owner in - Some address + None in - (* The logic. *) - if is_unarchive_request then - unarchive () + let* owner = owner_of (Db.VM.get_by_uuid ~__context ~uuid:vm_uuid) in + let owner_uuid = Db.Host.get_uuid ~__context ~self:owner in + if owner_uuid = Helpers.get_localhost_uuid () then + (* VM is local but metrics aren't available *) + None else - match (Pool_role.get_role (), metrics_at ()) with - | (Master | Slave _), Some owner -> - read_at owner - | Master, None -> - unarchive () - | Slave coordinator, None -> - unarchive_at coordinator - | Broken, _ -> - info "%s: host is broken, VM's metrics are not available" - __FUNCTION__ ; - unavailable () + let address = Db.Host.get_address ~__context ~self:owner in + Some address + in + (* The logic. *) + if is_unarchive_request then + unarchive () + else + match (Pool_role.get_role (), metrics_at ()) with + | (Master | Slave _), Some owner -> + read_at owner + | Master, None -> + unarchive () + | Slave coordinator, None -> + unarchive_at coordinator + | Broken, _ -> + info "%s: host is broken, VM's metrics are not available" + __FUNCTION__ ; + unavailable () ) (* Forward the request for host RRD data to the RRDD HTTP handler. If the host