Skip to content

Commit

Permalink
plugin-rapl: Detect common errors and give useful hints to the user.
Browse files Browse the repository at this point in the history
  • Loading branch information
TheElectronWill committed Apr 12, 2024
1 parent 8702751 commit 02e3701
Show file tree
Hide file tree
Showing 4 changed files with 108 additions and 23 deletions.
1 change: 1 addition & 0 deletions plugin-rapl/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ edition = "2021"
[dependencies]
alumet = { version = "0.2.0", path = "../alumet" }
anyhow = "1.0.79"
indoc = "2.0.5"
log = "0.4.20"
perf-event-open-sys = "4.0.0"
regex = "1.10.3"
38 changes: 36 additions & 2 deletions plugin-rapl/src/consistency.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,33 @@ pub struct SafeSubset {
pub is_whole: bool,
}

impl SafeSubset {
    /// Builds a subset from the perf_events interface alone.
    ///
    /// The domain list is derived from the given events, ordered by the textual
    /// representation of each domain, with duplicates removed. No power zone is
    /// recorded, and `is_whole` is set to `true`.
    pub fn from_perf_only(perf_events: Vec<PowerEvent>) -> Self {
        let domains = Self::unique_sorted_domains(perf_events.iter().map(|e| e.domain).collect());
        Self {
            domains,
            perf_events,
            power_zones: Vec::new(),
            is_whole: true,
        }
    }

    /// Builds a subset from the powercap interface alone.
    ///
    /// Only the flattened list of zones (`power_zones.flat`) is kept. The domain
    /// list is derived from those zones, ordered by the textual representation of
    /// each domain, with duplicates removed. No perf event is recorded, and
    /// `is_whole` is set to `true`.
    pub fn from_powercap_only(power_zones: PowerZoneHierarchy) -> Self {
        let power_zones = power_zones.flat;
        let domains = Self::unique_sorted_domains(power_zones.iter().map(|z| z.domain).collect());
        Self {
            domains,
            perf_events: Vec::new(),
            power_zones,
            is_whole: true,
        }
    }

    /// Sorts the domains by their string form and removes the duplicates.
    ///
    /// `sort_by_cached_key` computes each key once instead of allocating a new
    /// `String` for every comparison; like `sort_by_key`, it is a stable sort,
    /// so the resulting order is the same.
    fn unique_sorted_domains(mut domains: Vec<RaplDomainType>) -> Vec<RaplDomainType> {
        domains.sort_by_cached_key(|d| d.to_string());
        // Equal keys are adjacent after sorting, so removing consecutive duplicates is enough.
        domains.dedup_by_key(|d| d.to_string());
        domains
    }
}

/// Checks the consistency of the RAPL domains reported by the different interfaces of the Linux kernel,
/// and returns the list of domains that are available everywhere ("safe subset").
pub fn check_domains_consistency(perf_events: Vec<PowerEvent>, power_zones: PowerZoneHierarchy) -> SafeSubset {
Expand Down Expand Up @@ -62,8 +89,15 @@ pub fn check_domains_consistency(perf_events: Vec<PowerEvent>, power_zones: Powe
domains_subset.push(d);
}
}
let perf_events_subset = perf_events.into_iter().filter(|e| domains_subset.contains(&e.domain)).collect();
let power_zones_subset = power_zones.flat.into_iter().filter(|z| domains_subset.contains(&z.domain)).collect();
let perf_events_subset = perf_events
.into_iter()
.filter(|e| domains_subset.contains(&e.domain))
.collect();
let power_zones_subset = power_zones
.flat
.into_iter()
.filter(|z| domains_subset.contains(&z.domain))
.collect();
SafeSubset {
domains: domains_subset,
perf_events: perf_events_subset,
Expand Down
90 changes: 70 additions & 20 deletions plugin-rapl/src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,11 @@
use alumet::{pipeline::Source, plugin::rust::AlumetPlugin, units::Unit};
use indoc::indoc;

use crate::{consistency::check_domains_consistency, perf_event::PerfEventProbe, powercap::PowercapProbe};
use crate::{
consistency::{check_domains_consistency, SafeSubset},
perf_event::PerfEventProbe,
powercap::PowercapProbe,
};

mod consistency;
mod cpus;
Expand All @@ -18,46 +23,91 @@ impl AlumetPlugin for RaplPlugin {
/// Returns the version string of the RAPL plugin.
fn version() -> &'static str {
    const PLUGIN_VERSION: &str = "0.1.0";
    PLUGIN_VERSION
}

/// Creates the plugin instance. The configuration table is not read.
fn init(_config: &mut alumet::config::ConfigTable) -> anyhow::Result<Box<Self>> {
    let plugin = Box::new(RaplPlugin);
    Ok(plugin)
}

fn start(&mut self, alumet: &mut alumet::plugin::AlumetStart) -> anyhow::Result<()> {
// get cpu info, accessible perf events and power zones
// Get cpu info.
let all_cpus = cpus::online_cpus()?;
let socket_cpus = cpus::cpus_to_monitor()?;
let perf_events = perf_event::all_power_events()?;
let power_zones = powercap::all_power_zones()?;

let n_sockets = socket_cpus.len();
let n_cpu_cores = all_cpus.len();
log::debug!("{n_sockets}/{n_cpu_cores} monitorable CPU (cores) found: {socket_cpus:?}");

let available_domains = check_domains_consistency(perf_events, power_zones);
let subset_indicator = if available_domains.is_whole { "" } else { "(\"safe\" subset)" };
log::info!("Available RAPL domains {subset_indicator}: {}", consistency::mkstring(&available_domains.domains, ", "));
// Discover RAPL domains available in perf_events and powercap.
let perf_events = perf_event::all_power_events()?;
let (available_domains, subset_indicator) = match powercap::all_power_zones() {
Ok(power_zones) => {
let domains = check_domains_consistency(perf_events, power_zones);
let subset_indicator = if domains.is_whole { "" } else { " (\"safe\" subset)" };
(domains, subset_indicator)
}
Err(e) => {
log::warn!("The consistency of the RAPL domains reported by the different interfaces of the Linux kernel cannot be checked (this is useful to work around bugs in some kernel versions on some machines): {e}");
let domains = SafeSubset::from_perf_only(perf_events);
let subset_indicator = " (unchecked consistency)";
(domains, subset_indicator)
}
};
log::info!(
"Available RAPL domains{subset_indicator}: {}",
consistency::mkstring(&available_domains.domains, ", ")
);

// create the probe
let metric = alumet.create_metric::<f64>("rapl_consumed_energy", Unit::Joule, "Energy consumed since the previous measurement, as reported by RAPL.")?;
// Create the probe.
let metric = alumet.create_metric::<f64>(
"rapl_consumed_energy",
Unit::Joule,
"Energy consumed since the previous measurement, as reported by RAPL.",
)?;
let mut events_on_cpus = Vec::new();
for event in &available_domains.perf_events {
for cpu in &socket_cpus {
events_on_cpus.push((event, cpu));
}
}
log::debug!("Events to read: {events_on_cpus:?}");
let source: Box<dyn Source> = match PerfEventProbe::new(metric, &events_on_cpus) {
Ok(perf_event_probe) => Box::new(perf_event_probe),
Err(_) => {
let source: anyhow::Result<Box<dyn Source>> = match PerfEventProbe::new(metric, &events_on_cpus) {
Ok(perf_event_probe) => Ok(Box::new(perf_event_probe)),
Err(e) => {
// perf_events failed, log an error and try powercap instead
log::error!("I could not use perf_events to read RAPL energy counters.");
// TODO print some hints about permissions, setcap, sysctl -w perf_event_paranoid
// TODO print how to configure alumet to disable this error.
Box::new(PowercapProbe::new(metric, &available_domains.power_zones)?)
},
log::warn!("I could not use perf_events to read RAPL energy counters: {e}");
let msg = indoc! {"
I will fallback to the powercap sysfs, but perf_events is more efficient (see https://hal.science/hal-04420527).
This warning is probably caused by insufficient privileges.
To fix this, you have 3 possibilities:
1. Grant the CAP_PERFMON (CAP_SYS_ADMIN on Linux < 5.8) capability to the agent binary.
sudo setcap cap_perfmon=ep $(readlink -f path/to/alumet-agent)
2. Change a kernel setting to allow every process to read the perf_events.
sudo sysctl -w kernel.perf_event_paranoid=0
3. Run the agent as root (not recommanded).
"};
log::warn!("{msg}");
// TODO add an option to disable perf_events and always use sysfs.

match PowercapProbe::new(metric, &available_domains.power_zones) {
Ok(powercap_probe) => Ok(Box::new(powercap_probe)),
Err(e) => {
let msg = indoc! {"
I could not use the powercap sysfs to read RAPL energy counters.
This is probably caused by insufficient privileges.
Please check that you have read access to everything in '/sys/devices/virtual/powercap/intel-rapl'.
A solution could be:
sudo chmod a+r -R /sys/devices/virtual/powercap/intel-rapl
"};
log::error!("{msg}");
Err(e)
}
}
}
};
alumet.add_source(source);
alumet.add_source(source?);
Ok(())
}

Expand Down
2 changes: 1 addition & 1 deletion plugin-rapl/src/powercap.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ const POWERCAP_RAPL_PATH: &str = "/sys/devices/virtual/powercap/intel-rapl";
const POWER_ZONE_PREFIX: &str = "intel-rapl";
const POWERCAP_ENERGY_UNIT: f64 = 0.000_001; // 1 microJoules

const PERMISSION_ADVICE: &str = "Try to set kernel.perf_event_paranoid to 0 or -1, and/or to adjust file permissions.";
const PERMISSION_ADVICE: &str = "Try to adjust file permissions.";

/// Hierarchy of power zones
pub struct PowerZoneHierarchy {
Expand Down

0 comments on commit 02e3701

Please sign in to comment.