Skip to content

Commit

Permalink
IH-615: move metrics collection out of RRDD (#6016)
Browse files Browse the repository at this point in the history
Moves two metrics collectors out of RRDD into separate plugins:
* CPU-related metrics
* Network-related metrics

Three more collectors remain in `xcp_rrdd.ml` because
memory-, HA-, and SR-cache-related metrics collection is tightly
integrated with RRDD itself, exposing calls through RRDD bindings for
XAPI to use. To convert these into plugins (and to drop the
collection-related code in `xcp_rrdd.ml` and stop linking to Xenctrl),
these calls need to be made through the standard message-switch mechanism
instead. This is left for a future PR.

Also:
* netdev metrics generation is tightly integrated with networkd, which
serializes the metrics into a file that the plugin reads from. This is not
terribly efficient, but I don't see a way to disentangle it from networkd,
so it's kept this way.

Needs to be merged together with a corresponding spec repo PR; I will
open it after XenRT testing completes (currently running BVT/BST). This
also needs to be tested manually.
  • Loading branch information
robhoes authored Oct 1, 2024
2 parents 9e1ea63 + aa631b9 commit 6e16163
Show file tree
Hide file tree
Showing 31 changed files with 722 additions and 404 deletions.
4 changes: 2 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,7 @@ quality-gate:

install-scripts:
$(MAKE) -C scripts install

install-python3:
$(MAKE) -C python3 install

Expand Down Expand Up @@ -164,7 +164,7 @@ install-dune1:
dune install $(DUNE_IU_PACKAGES1)

DUNE_IU_PACKAGES2=-j $(JOBS) --destdir=$(DESTDIR) --prefix=$(OPTDIR) --libdir=$(LIBDIR) --mandir=$(MANDIR) --libexecdir=$(OPTDIR)/libexec --datadir=$(DOCDIR) xapi xe

install-dune2:
dune install $(DUNE_IU_PACKAGES2)

Expand Down
3 changes: 3 additions & 0 deletions doc/content/toolstack/high-level/daemons.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,9 @@ xcp-rrdd
- xcp-rrdd-iostat
- xcp-rrdd-squeezed
- xcp-rrdd-xenpm
- xcp-rrdd-dcmi
- xcp-rrdd-netdev
- xcp-rrdd-cpu

xcp-networkd
: a host network manager which takes care of configuring interfaces, bridges
Expand Down
4 changes: 0 additions & 4 deletions ocaml/xcp-rrdd/bin/rrdd/rrdd_common.ml

This file was deleted.

351 changes: 0 additions & 351 deletions ocaml/xcp-rrdd/bin/rrdd/xcp_rrdd.ml
Original file line number Diff line number Diff line change
Expand Up @@ -200,352 +200,6 @@ end

module Watcher = Watch.WatchXenstore (Meminfo)

(*****************************************************)
(* cpu related code *)
(*****************************************************)

(** Mask with every bit set except bit 63 (numerically [Int64.max_int]).
    AND-ing a counter with it clears the top bit. *)
let xen_flag_complement = Int64.lognot (Int64.shift_left 1L 63)

(** [dss_vcpus xc doms] builds the CPU datasources of the VMs in [doms].

    [doms] is a list of [(dom, uuid, domid)] triples. For each entry the
    result contains, all tagged [Rrd.VM uuid]:
    - one [cpu<i>] datasource per vCPU index from 0 to [dom.max_vcpu_id];
    - the six [runstate_*] datasources and [cpu_usage], which are per-domain.

    All values are counters divided by 1.0e9 and declared as [Rrd.Derive].
    Failures are handled per domain and never propagate: see the two
    [try ... with _] expressions below for what is dropped in each case. *)
let dss_vcpus xc doms =
List.fold_left
(fun dss (dom, uuid, domid) ->
let maxcpus = dom.Xenctrl.max_vcpu_id + 1 in
(* [cpus i acc] prepends the datasources of vCPUs [i .. maxcpus - 1] to
   [acc], querying Xen once per vCPU. *)
let rec cpus i dss =
if i >= maxcpus then
dss
else
let vcpuinfo = Xenctrl.domain_get_vcpuinfo xc domid i in
(* Workaround for Xen leaking the flag XEN_RUNSTATE_UPDATE; using a
mask of its complement ~(1 << 63) *)
let cpu_time =
Int64.(
to_float @@ logand vcpuinfo.Xenctrl.cputime xen_flag_complement
)
in
(* Convert from nanoseconds to seconds *)
let cpu_time = cpu_time /. 1.0e9 in
let cputime_rrd =
( Rrd.VM uuid
, Ds.ds_make ~name:(Printf.sprintf "cpu%d" i) ~units:"(fraction)"
~description:(Printf.sprintf "CPU%d usage" i)
~value:(Rrd.VT_Float cpu_time) ~ty:Rrd.Derive ~default:true
~min:0.0 ~max:1.0 ()
)
in
cpus (i + 1) (cputime_rrd :: dss)
in
(* Runstate info is per-domain rather than per-vcpu *)
let dss =
let dom_cpu_time =
Int64.(to_float @@ logand dom.Xenctrl.cpu_time xen_flag_complement)
in
(* Domain CPU time is averaged over the online vCPUs.
   NOTE(review): this is a float division, so nr_online_vcpus = 0 would
   yield a non-finite value rather than raise - confirm whether that can
   happen for the domains passed in. *)
let dom_cpu_time =
dom_cpu_time /. (1.0e9 *. float_of_int dom.Xenctrl.nr_online_vcpus)
in
(* The whole chain below is inside one [try]: if fetching the runstate
   info (or anything else here) raises, none of the per-domain
   datasources are added for this domain, including cpu_usage. *)
try
let ri = Xenctrl.domain_get_runstate_info xc domid in
( Rrd.VM uuid
, Ds.ds_make ~name:"runstate_fullrun" ~units:"(fraction)"
~value:(Rrd.VT_Float (Int64.to_float ri.Xenctrl.time0 /. 1.0e9))
~description:"Fraction of time that all VCPUs are running"
~ty:Rrd.Derive ~default:false ~min:0.0 ()
)
:: ( Rrd.VM uuid
, Ds.ds_make ~name:"runstate_full_contention" ~units:"(fraction)"
~value:(Rrd.VT_Float (Int64.to_float ri.Xenctrl.time1 /. 1.0e9))
~description:
"Fraction of time that all VCPUs are runnable (i.e., \
waiting for CPU)"
~ty:Rrd.Derive ~default:false ~min:0.0 ()
)
:: ( Rrd.VM uuid
, Ds.ds_make ~name:"runstate_concurrency_hazard"
~units:"(fraction)"
~value:(Rrd.VT_Float (Int64.to_float ri.Xenctrl.time2 /. 1.0e9))
~description:
"Fraction of time that some VCPUs are running and some are \
runnable"
~ty:Rrd.Derive ~default:false ~min:0.0 ()
)
:: ( Rrd.VM uuid
, Ds.ds_make ~name:"runstate_blocked" ~units:"(fraction)"
~value:(Rrd.VT_Float (Int64.to_float ri.Xenctrl.time3 /. 1.0e9))
~description:
"Fraction of time that all VCPUs are blocked or offline"
~ty:Rrd.Derive ~default:false ~min:0.0 ()
)
:: ( Rrd.VM uuid
, Ds.ds_make ~name:"runstate_partial_run" ~units:"(fraction)"
~value:(Rrd.VT_Float (Int64.to_float ri.Xenctrl.time4 /. 1.0e9))
~description:
"Fraction of time that some VCPUs are running, and some are \
blocked"
~ty:Rrd.Derive ~default:false ~min:0.0 ()
)
:: ( Rrd.VM uuid
, Ds.ds_make ~name:"runstate_partial_contention"
~units:"(fraction)"
~value:(Rrd.VT_Float (Int64.to_float ri.Xenctrl.time5 /. 1.0e9))
~description:
"Fraction of time that some VCPUs are runnable and some are \
blocked"
~ty:Rrd.Derive ~default:false ~min:0.0 ()
)
:: ( Rrd.VM uuid
, Ds.ds_make
~name:(Printf.sprintf "cpu_usage")
~units:"(fraction)"
~description:(Printf.sprintf "Domain CPU usage")
~value:(Rrd.VT_Float dom_cpu_time) ~ty:Rrd.Derive ~default:true
~min:0.0 ~max:1.0 ()
)
:: dss
with _ -> dss
in
(* If any per-vCPU query raises, every cpu<i> datasource collected for this
   domain so far is discarded; the fallback [dss] is the list built just
   above, so the per-domain datasources are kept. *)
try cpus 0 dss with _ -> dss
)
[] doms

(** Cache whose length remembers the number of physical CPUs once it has been
    detected; an empty array means the count has not been determined yet. *)
let physcpus = ref [||]

(** [dss_pcpus xc] returns the host-level physical CPU datasources: [cpu_avg]
    first, followed by one [cpu<i>] entry per value returned by
    [Xenctrl.pcpu_info], highest index first.

    The CPU count is taken from the length of [!physcpus]; when that is zero
    it is read from [Xenctrl.physinfo] and [physcpus] is resized accordingly.
    Every datasource is a [Rrd.Derive] counter scaled by 1.0e9 and registered
    with the transform [1.0 -. x]. *)
let dss_pcpus xc =
  let cpu_count =
    match Array.length !physcpus with
    | 0 ->
        let detected = (Xenctrl.physinfo xc).Xenctrl.nr_cpus in
        physcpus := (if detected > 0 then Array.make detected 0L else [||]) ;
        detected
    | cached ->
        cached
  in
  let samples = Xenctrl.pcpu_info xc cpu_count in
  let invert x = 1.0 -. x in
  let ds_of_cpu idx sample =
    ( Rrd.Host
    , Ds.ds_make ~name:(Printf.sprintf "cpu%d" idx) ~units:"(fraction)"
        ~description:("Physical cpu usage for cpu " ^ string_of_int idx)
        ~value:(Rrd.VT_Float (Int64.to_float sample /. 1.0e9))
        ~min:0.0 ~max:1.0 ~ty:Rrd.Derive ~default:true ~transform:invert ()
    )
  in
  (* Reversed so that the highest CPU index comes first, as callers have
     always received it. *)
  let per_cpu = samples |> Array.mapi ds_of_cpu |> Array.to_list |> List.rev in
  let total = Array.fold_left Int64.add 0L samples in
  let mean = Int64.to_float total /. float_of_int (Array.length samples) in
  let average =
    ( Rrd.Host
    , Ds.ds_make ~name:"cpu_avg" ~units:"(fraction)"
        ~description:"Average physical cpu usage"
        ~value:(Rrd.VT_Float (mean /. 1.0e9))
        ~min:0.0 ~max:1.0 ~ty:Rrd.Derive ~default:true ~transform:invert ()
    )
  in
  average :: per_cpu

(** [dss_loadavg ()] returns a single host-level gauge named [loadavg] whose
    value is whatever [Rrdd_common.loadavg ()] reports at call time. *)
let dss_loadavg () =
  let value = Rrd.VT_Float (Rrdd_common.loadavg ()) in
  let loadavg_ds =
    Ds.ds_make ~name:"loadavg" ~units:"(fraction)"
      ~description:"Domain0 loadavg" ~value ~ty:Rrd.Gauge ~default:true ()
  in
  [(Rrd.Host, loadavg_ds)]

(** [count_power_state_running_domains domains] is the number of entries in
    [domains] (triples whose first component is the domain info record) that
    are not flagged as paused. *)
let count_power_state_running_domains domains =
  domains
  |> List.filter (fun (dom, _, _) -> not dom.Xenctrl.paused)
  |> List.length

(** [dss_hostload xc domains] returns three host-level gauges:
    - [hostload]: number of vCPUs that are online and not blocked, divided by
      the physical CPU count reported by [Xenctrl.physinfo];
    - [running_vcpus]: that same vCPU count as an integer;
    - [running_domains]: result of [count_power_state_running_domains].

    [domains] is a list of [(dom, uuid, domid)] triples. Every vCPU index from
    [dom.max_vcpu_id] down to 0 is queried; exceptions raised by those queries
    are not caught here. *)
let dss_hostload xc domains =
  let pcpus = (Xenctrl.physinfo xc).Xenctrl.nr_cpus in
  (* 1 when the given vCPU is online and not blocked, 0 otherwise *)
  let vcpu_load domid vcpu =
    let info = Xenctrl.domain_get_vcpuinfo xc domid vcpu in
    if info.Xenctrl.online && not info.Xenctrl.blocked then 1 else 0
  in
  let domain_load (dom, _, domid) =
    let busy = ref 0 in
    for vcpu = dom.Xenctrl.max_vcpu_id downto 0 do
      busy := !busy + vcpu_load domid vcpu
    done ;
    !busy
  in
  let load =
    List.fold_left (fun total domain -> domain_load domain + total) 0 domains
  in
  let running_domains = count_power_state_running_domains domains in
  let load_per_cpu = float_of_int load /. float_of_int pcpus in
  (* All three datasources share owner, type, lower bound and default flag *)
  let host_gauge ~name ~units ~description value =
    ( Rrd.Host
    , Ds.ds_make ~name ~units ~description ~value ~min:0.0 ~ty:Rrd.Gauge
        ~default:true ()
    )
  in
  [
    host_gauge ~name:"hostload" ~units:"(fraction)"
      ~description:
        ("Host load per physical cpu, where load refers to "
        ^ "the number of vCPU(s) in running or runnable status."
        )
      (Rrd.VT_Float load_per_cpu)
  ; host_gauge ~name:"running_vcpus" ~units:"count"
      ~description:"The total number of running vCPUs per host"
      (Rrd.VT_Int64 (Int64.of_int load))
  ; host_gauge ~name:"running_domains" ~units:"count"
      ~description:"The total number of running domains per host"
      (Rrd.VT_Int64 (Int64.of_int running_domains))
  ]

(*****************************************************)
(* network related code *)
(*****************************************************)

(** [dss_netdev doms] builds the network datasources from the statistics
    returned by [Network_stats.read_stats ()], a list of [(dev, stat)] pairs.

    - A device whose name does not start with [vif] yields four [Rrd.Host]
      datasources, [pif_<dev>_rx], [pif_<dev>_tx], [pif_<dev>_rx_errors] and
      [pif_<dev>_tx_errors]; its byte counters are added to the aggregates.
    - A device named [vif<domid>.<n>] yields four [Rrd.VM uuid] datasources
      named [vif_<n>_*], where [uuid] is looked up in [doms] (a list of
      [(dom, uuid, domid)] triples). These devices do not contribute to the
      aggregates.

    The result starts with [pif_aggr_rx] and [pif_aggr_tx], followed by the
    per-device datasources. All of them are [Rrd.Derive] counters. *)
let dss_netdev doms =
(* Returns the uuid paired with [domid] in [domains]; raises [Failure] when
   there is no such entry. *)
let uuid_of_domid domains domid =
let _, uuid, _ =
try List.find (fun (_, _, domid') -> domid = domid') domains
with Not_found ->
failwith
(Printf.sprintf "Failed to find uuid corresponding to domid: %d" domid)
in
uuid
in
let open Network_stats in
let stats = Network_stats.read_stats () in
(* Accumulator: datasources so far, total rx bytes, total tx bytes *)
let dss, sum_rx, sum_tx =
List.fold_left
(fun (dss, sum_rx, sum_tx) (dev, stat) ->
if not Astring.String.(is_prefix ~affix:"vif" dev) then
let pif_name = "pif_" ^ dev in
( ( Rrd.Host
, Ds.ds_make ~name:(pif_name ^ "_rx")
~description:
("Bytes per second received on physical interface " ^ dev)
~units:"B/s" ~value:(Rrd.VT_Int64 stat.rx_bytes) ~ty:Rrd.Derive
~min:0.0 ~default:true ()
)
:: ( Rrd.Host
, Ds.ds_make ~name:(pif_name ^ "_tx")
~description:
("Bytes per second sent on physical interface " ^ dev)
~units:"B/s" ~value:(Rrd.VT_Int64 stat.tx_bytes)
~ty:Rrd.Derive ~min:0.0 ~default:true ()
)
:: ( Rrd.Host
, Ds.ds_make ~name:(pif_name ^ "_rx_errors")
~description:
("Receive errors per second on physical interface " ^ dev)
~units:"err/s" ~value:(Rrd.VT_Int64 stat.rx_errors)
~ty:Rrd.Derive ~min:0.0 ~default:false ()
)
:: ( Rrd.Host
, Ds.ds_make ~name:(pif_name ^ "_tx_errors")
~description:
("Transmit errors per second on physical interface " ^ dev)
~units:"err/s" ~value:(Rrd.VT_Int64 stat.tx_errors)
~ty:Rrd.Derive ~min:0.0 ~default:false ()
)
:: dss
, Int64.add stat.rx_bytes sum_rx
, Int64.add stat.tx_bytes sum_tx
)
else
(* Best effort: if the device name does not parse or the domid is not in
   [doms], the exception is swallowed and this device is skipped. *)
( ( try
let d1, d2 =
Scanf.sscanf dev "vif%d.%d" (fun d1 d2 -> (d1, d2))
in
let vif_name = Printf.sprintf "vif_%d" d2 in
(* Note: rx and tx are the wrong way round because from dom0 we
see the vms backwards *)
let uuid = uuid_of_domid doms d1 in
( Rrd.VM uuid
, Ds.ds_make ~name:(vif_name ^ "_tx") ~units:"B/s"
~description:
("Bytes per second transmitted on virtual interface \
number '"
^ string_of_int d2
^ "'"
)
~value:(Rrd.VT_Int64 stat.rx_bytes) ~ty:Rrd.Derive ~min:0.0
~default:true ()
)
:: ( Rrd.VM uuid
, Ds.ds_make ~name:(vif_name ^ "_rx") ~units:"B/s"
~description:
("Bytes per second received on virtual interface \
number '"
^ string_of_int d2
^ "'"
)
~value:(Rrd.VT_Int64 stat.tx_bytes) ~ty:Rrd.Derive
~min:0.0 ~default:true ()
)
:: ( Rrd.VM uuid
, Ds.ds_make ~name:(vif_name ^ "_rx_errors") ~units:"err/s"
~description:
("Receive errors per second on virtual interface \
number '"
^ string_of_int d2
^ "'"
)
~value:(Rrd.VT_Int64 stat.tx_errors) ~ty:Rrd.Derive
~min:0.0 ~default:false ()
)
:: ( Rrd.VM uuid
, Ds.ds_make ~name:(vif_name ^ "_tx_errors") ~units:"err/s"
~description:
("Transmit errors per second on virtual interface \
number '"
^ string_of_int d2
^ "'"
)
~value:(Rrd.VT_Int64 stat.rx_errors) ~ty:Rrd.Derive
~min:0.0 ~default:false ()
)
:: dss
with _ -> dss
)
, sum_rx
, sum_tx
)
)
([], 0L, 0L) stats
in
(* Aggregates over the non-vif devices come first in the result *)
[
( Rrd.Host
, Ds.ds_make ~name:"pif_aggr_rx"
~description:"Bytes per second received on all physical interfaces"
~units:"B/s" ~value:(Rrd.VT_Int64 sum_rx) ~ty:Rrd.Derive ~min:0.0
~default:true ()
)
; ( Rrd.Host
, Ds.ds_make ~name:"pif_aggr_tx"
~description:"Bytes per second sent on all physical interfaces"
~units:"B/s" ~value:(Rrd.VT_Int64 sum_tx) ~ty:Rrd.Derive ~min:0.0
~default:true ()
)
]
@ dss

(*****************************************************)
(* memory stats *)
(*****************************************************)
Expand Down Expand Up @@ -830,11 +484,6 @@ let dom0_stat_generators =
("ha", fun _ _ _ -> Rrdd_ha_stats.all ())
; ("mem_host", fun xc _ _ -> dss_mem_host xc)
; ("mem_vms", fun _ _ domains -> dss_mem_vms domains)
; ("pcpus", fun xc _ _ -> dss_pcpus xc)
; ("vcpus", fun xc _ domains -> dss_vcpus xc domains)
; ("loadavg", fun _ _ _ -> dss_loadavg ())
; ("hostload", fun xc _ domains -> dss_hostload xc domains)
; ("netdev", fun _ _ domains -> dss_netdev domains)
; ("cache", fun _ timestamp _ -> dss_cache timestamp)
]

Expand Down
Loading

0 comments on commit 6e16163

Please sign in to comment.