Skip to content

Commit

Permalink
feat: memory limit supports metric available
Browse files Browse the repository at this point in the history
  • Loading branch information
yuanchaoa authored and rvql committed Oct 24, 2024
1 parent 0386bb7 commit db5deb2
Show file tree
Hide file tree
Showing 12 changed files with 238 additions and 88 deletions.
46 changes: 34 additions & 12 deletions agent/src/config/config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ use public::{
bitmap::Bitmap,
consts::NPB_DEFAULT_PORT,
proto::{
agent::{self, SocketType, SystemLoadMetric},
agent::{self, SocketType, SysMemoryMetric, SystemLoadMetric},
common,
trident::{self, KubernetesClusterIdRequest, TapMode},
},
Expand Down Expand Up @@ -1843,10 +1843,26 @@ impl Default for Alerts {
}
}

fn to_sys_memory_metric<'de, D>(deserializer: D) -> Result<agent::SysMemoryMetric, D::Error>
where
D: Deserializer<'de>,
{
match String::deserialize(deserializer)?.as_str() {
"free" => Ok(agent::SysMemoryMetric::Free),
"available" => Ok(agent::SysMemoryMetric::Available),
other => Err(de::Error::invalid_value(
Unexpected::Str(other),
&"[free|available]",
)),
}
}

#[derive(Clone, Copy, Default, Debug, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct SysFreeMemoryPercentage {
pub struct SysMemoryPercentage {
pub trigger_threshold: u32,
#[serde(deserialize_with = "to_sys_memory_metric")]
pub metric: agent::SysMemoryMetric,
}

fn to_system_load_metric<'de, D>(deserializer: D) -> Result<agent::SystemLoadMetric, D::Error>
Expand All @@ -1870,14 +1886,14 @@ pub struct RelativeSysLoad {
pub trigger_threshold: f32,
pub recovery_threshold: f32,
#[serde(deserialize_with = "to_system_load_metric")]
pub system_load_circuit_breaker_metric: agent::SystemLoadMetric,
pub metric: agent::SystemLoadMetric,
}

/// Field-wise equality for [`RelativeSysLoad`].
///
/// Two values are equal only when the trigger threshold, the recovery threshold
/// and the load metric all match. The comparisons are joined with `&&`: joining
/// them with `||` would report "equal" as soon as any single field matched, so a
/// change to just one setting would go unnoticed by callers comparing old and
/// new configuration.
impl PartialEq for RelativeSysLoad {
    fn eq(&self, other: &Self) -> bool {
        self.trigger_threshold == other.trigger_threshold
            && self.recovery_threshold == other.recovery_threshold
            && self.metric == other.metric
    }
}
// Marker impl: declares the `PartialEq` relation above to be a full equivalence.
// NOTE(review): the threshold fields are `f32`; a NaN threshold would compare unequal to
// itself and break the reflexivity that `Eq` promises — confirm NaN can never reach this struct.
impl Eq for RelativeSysLoad {}
Expand All @@ -1887,7 +1903,7 @@ impl Default for RelativeSysLoad {
RelativeSysLoad {
trigger_threshold: 1.0,
recovery_threshold: 0.9,
system_load_circuit_breaker_metric: agent::SystemLoadMetric::Load15,
metric: agent::SystemLoadMetric::Load15,
}
}
}
Expand All @@ -1912,7 +1928,7 @@ impl Default for TxThroughput {
#[derive(Clone, Copy, Default, Debug, Deserialize, PartialEq, Eq)]
#[serde(default)]
pub struct CircuitBreakers {
pub sys_free_memory_percentage: SysFreeMemoryPercentage,
pub sys_memory_percentage: SysMemoryPercentage,
pub relative_sys_load: RelativeSysLoad,
pub tx_throughput: TxThroughput,
}
Expand Down Expand Up @@ -2369,13 +2385,15 @@ impl From<&RuntimeConfig> for UserConfig {
check_core_file_disabled: rc.yaml_config.check_core_file_disabled,
},
circuit_breakers: CircuitBreakers {
sys_free_memory_percentage: SysFreeMemoryPercentage {
trigger_threshold: rc.sys_free_memory_limit,
sys_memory_percentage: SysMemoryPercentage {
trigger_threshold: rc.sys_memory_limit,
metric: SysMemoryMetric::from_str_name(rc.sys_memory_metric.as_str_name())
.unwrap_or(SysMemoryMetric::Free),
},
relative_sys_load: RelativeSysLoad {
trigger_threshold: rc.system_load_circuit_breaker_threshold,
recovery_threshold: rc.system_load_circuit_breaker_recover,
system_load_circuit_breaker_metric: SystemLoadMetric::from_str_name(
metric: SystemLoadMetric::from_str_name(
rc.system_load_circuit_breaker_metric.as_str_name(),
)
.unwrap_or(SystemLoadMetric::Load15),
Expand Down Expand Up @@ -4538,7 +4556,9 @@ pub struct RuntimeConfig {
pub kubernetes_api_enabled: bool,
#[serde(deserialize_with = "bool_from_int")]
pub ntp_enabled: bool,
pub sys_free_memory_limit: u32,
pub sys_memory_limit: u32,
#[serde(skip)]
pub sys_memory_metric: trident::SysMemoryMetric,
pub log_file_size: u32,
#[serde(deserialize_with = "bool_from_int")]
pub external_agent_http_proxy_enabled: bool,
Expand Down Expand Up @@ -4644,7 +4664,8 @@ impl RuntimeConfig {
l4_performance_enabled: true,
kubernetes_api_enabled: false,
ntp_enabled: false,
sys_free_memory_limit: 0,
sys_memory_limit: 0,
sys_memory_metric: trident::SysMemoryMetric::Free,
log_file_size: 1000,
external_agent_http_proxy_enabled: false,
external_agent_http_proxy_port: 38086,
Expand Down Expand Up @@ -4902,7 +4923,8 @@ impl TryFrom<trident::Config> for RuntimeConfig {
l4_performance_enabled: conf.l4_performance_enabled(),
kubernetes_api_enabled: conf.kubernetes_api_enabled(),
ntp_enabled: conf.ntp_enabled(),
sys_free_memory_limit: conf.sys_free_memory_limit(),
sys_memory_limit: conf.sys_free_memory_limit(),
sys_memory_metric: conf.sys_free_memory_metric(),
log_file_size: conf.log_file_size(),
external_agent_http_proxy_enabled: conf.external_agent_http_proxy_enabled(),
external_agent_http_proxy_port: conf.external_agent_http_proxy_port() as u16,
Expand Down
50 changes: 30 additions & 20 deletions agent/src/config/handler.rs
Original file line number Diff line number Diff line change
Expand Up @@ -197,7 +197,8 @@ pub struct EnvironmentConfig {
pub max_millicpus: u32,
pub process_threshold: u32,
pub thread_threshold: u32,
pub sys_free_memory_limit: u32,
pub sys_memory_limit: u32,
pub sys_memory_metric: agent::SysMemoryMetric,
pub log_file_size: u32,
pub capture_mode: PacketCaptureType,
pub guard_interval: Duration,
Expand Down Expand Up @@ -1567,11 +1568,12 @@ impl TryFrom<(Config, UserConfig, DynamicConfig)> for ModuleConfig {
max_millicpus: conf.global.limits.max_millicpus,
process_threshold: conf.global.alerts.process_threshold,
thread_threshold: conf.global.alerts.thread_threshold,
sys_free_memory_limit: conf
sys_memory_limit: conf
.global
.circuit_breakers
.sys_free_memory_percentage
.sys_memory_percentage
.trigger_threshold,
sys_memory_metric: conf.global.circuit_breakers.sys_memory_percentage.metric,
log_file_size: conf.global.limits.max_local_log_file_size,
capture_mode: conf.inputs.cbpf.common.capture_mode,
guard_interval: conf.global.tunning.resource_monitoring_interval,
Expand All @@ -1589,7 +1591,7 @@ impl TryFrom<(Config, UserConfig, DynamicConfig)> for ModuleConfig {
.global
.circuit_breakers
.relative_sys_load
.system_load_circuit_breaker_metric,
.metric,
},
synchronizer: SynchronizerConfig {
sync_interval: conf.global.communication.proactive_request_interval,
Expand Down Expand Up @@ -3499,28 +3501,31 @@ impl ConfigHandler {
relative_sys_load.recovery_threshold, new_relative_sys_load.recovery_threshold);
relative_sys_load.recovery_threshold = new_relative_sys_load.recovery_threshold;
}
if relative_sys_load.system_load_circuit_breaker_metric
!= new_relative_sys_load.system_load_circuit_breaker_metric
{
info!("Update global.circuit_breakers.relative_sys_load.system_load_circuit_breaker_metric from {:?} to {:?}.",
relative_sys_load.system_load_circuit_breaker_metric, new_relative_sys_load.system_load_circuit_breaker_metric);
relative_sys_load.system_load_circuit_breaker_metric =
new_relative_sys_load.system_load_circuit_breaker_metric;
if relative_sys_load.metric != new_relative_sys_load.metric {
info!(
"Update global.circuit_breakers.relative_sys_load.metric from {:?} to {:?}.",
relative_sys_load.metric, new_relative_sys_load.metric
);
relative_sys_load.metric = new_relative_sys_load.metric;
}
if relative_sys_load.trigger_threshold != new_relative_sys_load.trigger_threshold {
info!("Update global.circuit_breakers.relative_sys_load.trigger_threshold from {:?} to {:?}.",
relative_sys_load.trigger_threshold, new_relative_sys_load.trigger_threshold);
relative_sys_load.trigger_threshold = new_relative_sys_load.trigger_threshold;
}
let sys_free_memory_percentage = &mut circuit_breakers.sys_free_memory_percentage;
let new_sys_free_memory_percentage = &mut new_circuit_breakers.sys_free_memory_percentage;
if sys_free_memory_percentage.trigger_threshold
!= new_sys_free_memory_percentage.trigger_threshold
{
info!("Update global.circuit_breakers.sys_free_memory_percentage.trigger_threshold from {:?} to {:?}.",
sys_free_memory_percentage.trigger_threshold, new_sys_free_memory_percentage.trigger_threshold);
sys_free_memory_percentage.trigger_threshold =
new_sys_free_memory_percentage.trigger_threshold;
let sys_memory_percentage = &mut circuit_breakers.sys_memory_percentage;
let new_sys_memory_percentage = &mut new_circuit_breakers.sys_memory_percentage;
if sys_memory_percentage.trigger_threshold != new_sys_memory_percentage.trigger_threshold {
info!("Update global.circuit_breakers.sys_memory_percentage.trigger_threshold from {:?} to {:?}.",
sys_memory_percentage.trigger_threshold, new_sys_memory_percentage.trigger_threshold);
sys_memory_percentage.trigger_threshold = new_sys_memory_percentage.trigger_threshold;
}
if sys_memory_percentage.metric != new_sys_memory_percentage.metric {
info!(
"Update global.circuit_breakers.sys_memory_percentage.metric from {:?} to {:?}.",
sys_memory_percentage.metric, new_sys_memory_percentage.metric
);
sys_memory_percentage.metric = new_sys_memory_percentage.metric;
}
let tx_throughput = &mut circuit_breakers.tx_throughput;
let new_tx_throughput = &mut new_circuit_breakers.tx_throughput;
Expand Down Expand Up @@ -4813,6 +4818,11 @@ impl ConfigHandler {
return vec![];
}

candidate_config.environment = new_config.environment;
candidate_config.log = new_config.log;
candidate_config.port_config = new_config.port_config;
candidate_config.pcap = new_config.pcap;

// avoid first config changed to restart dispatcher
if components.is_some() && restart_dispatcher && candidate_config.dispatcher.enabled {
callbacks.push(Self::set_restart_dispatcher);
Expand Down
32 changes: 24 additions & 8 deletions agent/src/monitor.rs
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ use crate::config::handler::EnvironmentAccess;
use crate::{
error::{Error, Result},
utils::{
process::{get_current_sys_free_memory_percentage, get_file_and_size_sum},
process::{get_current_sys_memory_percentage, get_file_and_size_sum},
stats::{
self, Collector, Countable, Counter, CounterType, CounterValue, RefCountable,
StatsOption,
Expand Down Expand Up @@ -211,24 +211,40 @@ impl RefCountable for SysStatusBroker {
}

let mut metrics = vec![];
let current_sys_free_memory_percentage = get_current_sys_free_memory_percentage();
let (current_sys_free_memory_percentage, current_sys_available_memory_percentage) =
get_current_sys_memory_percentage();
metrics.push((
"sys_free_memory",
CounterType::Gauged,
CounterValue::Unsigned(current_sys_free_memory_percentage as u64),
));
metrics.push((
"sys_available_memory",
CounterType::Gauged,
CounterValue::Unsigned(current_sys_available_memory_percentage as u64),
));

let sys_free_memory_limit = self.config.load().sys_free_memory_limit as f64;
let sys_free_memory_limit_ratio = if sys_free_memory_limit > 0.0 {
current_sys_free_memory_percentage as f64 / sys_free_memory_limit
} else {
0.0 // If sys_free_memory_limit is set to 0, it means that there is no need to check if the system's free memory is too low. In this case, 0.0 will be directly returned, indicating that there will be no low system free memory alert.
};
let sys_memory_limit = self.config.load().sys_memory_limit as f64;

let (sys_free_memory_limit_ratio, sys_available_memory_limit_ratio) =
if sys_memory_limit > 0.0 {
(
current_sys_free_memory_percentage as f64 / sys_memory_limit,
current_sys_available_memory_percentage as f64 / sys_memory_limit,
)
} else {
(0.0, 0.0) // If sys_memory_limit is set to 0, it means that there is no need to check if the system's free/available memory is too low. In this case, 0.0 will be directly returned, indicating that there will be no low system free/available memory alert.
};
metrics.push((
"sys_free_memory_limit_ratio",
CounterType::Gauged,
CounterValue::Float(sys_free_memory_limit_ratio),
));
metrics.push((
"sys_available_memory_limit_ratio",
CounterType::Gauged,
CounterValue::Float(sys_available_memory_limit_ratio),
));

match get_file_and_size_sum(&self.log_dir) {
Ok(file_and_size_sum) => {
Expand Down
54 changes: 31 additions & 23 deletions agent/src/utils/guard.rs
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ use log::{debug, error, info, warn};
use sysinfo::{get_current_pid, Pid, ProcessExt, ProcessRefreshKind, System, SystemExt};

use super::process::{
get_current_sys_free_memory_percentage, get_file_and_size_sum, get_memory_rss, get_thread_num,
get_current_sys_memory_percentage, get_file_and_size_sum, get_memory_rss, get_thread_num,
FileAndSizeSum,
};
use crate::common::{
Expand All @@ -45,7 +45,7 @@ use crate::exception::ExceptionHandler;
use crate::rpc::get_timestamp;
use crate::utils::{cgroups::is_kernel_available_for_cgroups, environment::running_in_container};

use public::proto::agent::{Exception, PacketCaptureType, SystemLoadMetric};
use public::proto::agent::{Exception, PacketCaptureType, SysMemoryMetric, SystemLoadMetric};

struct SystemLoadGuard {
system: Arc<Mutex<System>>,
Expand Down Expand Up @@ -267,43 +267,51 @@ impl Guard {
(cpu_limit / 10) as f32 > cpu_usage // The cpu_usage is in percentage, and the unit of cpu_limit is milli-cores. Divide cpu_limit by 10 to align the units
}

fn check_sys_free_memory(
sys_free_memory_limit: f64,
under_sys_free_memory_limit: &mut bool,
fn check_sys_memory(
sys_memory_limit: f64,
sys_memory_metric: SysMemoryMetric,
under_sys_memory_limit: &mut bool,
last_exceeded: &mut Duration,
exception_handler: &ExceptionHandler,
) {
let current_sys_free_memory_percentage = get_current_sys_free_memory_percentage() as f64;
let (current_sys_free_memory_percentage, current_sys_available_memory_percentage) =
get_current_sys_memory_percentage();
debug!(
"current_sys_free_memory_percentage: {}, sys_free_memory_limit: {}",
current_sys_free_memory_percentage, sys_free_memory_limit
"current_sys_memory_percentage: [ free: {}, available: {} ], sys_memory_metric: {:?} sys_memory_limit: {}",
current_sys_free_memory_percentage, current_sys_available_memory_percentage, sys_memory_metric, sys_memory_limit
);
if sys_free_memory_limit != 0.0 {
if current_sys_free_memory_percentage < sys_free_memory_limit * 0.7 {
let current_memory_percentage = if sys_memory_metric == SysMemoryMetric::Free {
current_sys_free_memory_percentage as f64
} else {
current_sys_available_memory_percentage as f64
};

if sys_memory_limit != 0.0 {
if current_memory_percentage < sys_memory_limit * 0.7 {
*last_exceeded = get_timestamp(0);
exception_handler.set(Exception::FreeMemExceeded);
*under_sys_free_memory_limit = true;
*under_sys_memory_limit = true;
error!(
"current system free memory percentage is less than the 70% of sys_free_memory_limit, current system free memory percentage={}%, sys_free_memory_limit={}%, deepflow-agent restart...",
current_sys_free_memory_percentage, sys_free_memory_limit
"current system {:?} memory percentage is less than the 70% of sys_memory_limit, current system memory percentage={}%, sys_memory_limit={}%, deepflow-agent restart...",
sys_memory_metric, current_memory_percentage, sys_memory_limit
);
crate::utils::notify_exit(-1);
} else if current_sys_free_memory_percentage < sys_free_memory_limit {
} else if current_memory_percentage < sys_memory_limit {
*last_exceeded = get_timestamp(0);
exception_handler.set(Exception::FreeMemExceeded);
*under_sys_free_memory_limit = true;
*under_sys_memory_limit = true;
error!(
"current system free memory percentage is less than sys_free_memory_limit, current system free memory percentage={}%, sys_free_memory_limit={}%, set the agent to disabled",
current_sys_free_memory_percentage, sys_free_memory_limit
"current system {:?} memory percentage is less than sys_memory_limit, current system memory percentage={}%, sys_memory_limit={}%, set the agent to disabled",
sys_memory_metric, current_memory_percentage, sys_memory_limit
);
} else if current_sys_free_memory_percentage >= sys_free_memory_limit * 1.1 {
} else if current_memory_percentage >= sys_memory_limit * 1.1 {
let now = get_timestamp(0);
if *under_sys_free_memory_limit && now > *last_exceeded + CONTINUOUS_SAFETY_TIME {
if *under_sys_memory_limit && now > *last_exceeded + CONTINUOUS_SAFETY_TIME {
exception_handler.clear(Exception::FreeMemExceeded);
*under_sys_free_memory_limit = false;
*under_sys_memory_limit = false;
info!(
"current system free memory percentage: {}% remains above sys_free_memory_limit: {} * 110%, set the agent to enabled.",
current_sys_free_memory_percentage, sys_free_memory_limit
"current system {:?} memory percentage: {}% remains above sys_memory_limit: {} * 110%, set the agent to enabled.",
sys_memory_metric, current_memory_percentage, sys_memory_limit
);
}
}
Expand Down Expand Up @@ -453,7 +461,7 @@ impl Guard {
}
}

Self::check_sys_free_memory(config.sys_free_memory_limit as f64, &mut under_sys_free_memory_limit, &mut last_exceeded, &exception_handler);
Self::check_sys_memory(config.sys_memory_limit as f64, config.sys_memory_metric, &mut under_sys_free_memory_limit, &mut last_exceeded, &exception_handler);

match get_thread_num() {
Ok(thread_num) => {
Expand Down
Loading

0 comments on commit db5deb2

Please sign in to comment.