From 8bbadd4d4430440dbb72b6757d56b68161c1d815 Mon Sep 17 00:00:00 2001 From: Raj Nishtala Date: Mon, 14 Aug 2023 12:32:32 -0400 Subject: [PATCH 1/3] feat(prometheus): Removing prometheus recording rules --- .changelog/3211.changed.txt | 1 + deploy/helm/sumologic/README.md | 1 - deploy/helm/sumologic/values.yaml | 221 +----------------- .../values-prometheus.yaml | 192 --------------- tests/integration/internal/constants.go | 69 +++--- 5 files changed, 39 insertions(+), 445 deletions(-) create mode 100644 .changelog/3211.changed.txt diff --git a/.changelog/3211.changed.txt b/.changelog/3211.changed.txt new file mode 100644 index 0000000000..815b5fafb9 --- /dev/null +++ b/.changelog/3211.changed.txt @@ -0,0 +1 @@ +feat(prometheus): Removing prometheus recording rules \ No newline at end of file diff --git a/deploy/helm/sumologic/README.md b/deploy/helm/sumologic/README.md index 43adbbe590..a921914e6e 100644 --- a/deploy/helm/sumologic/README.md +++ b/deploy/helm/sumologic/README.md @@ -205,7 +205,6 @@ The following table lists the configurable parameters of the Sumo Logic chart an | `kube-prometheus-stack.prometheus-node-exporter.nodeSelector` | Node selector for prometheus node exporter. [See docs/best-practices.md for more information.](/docs/best-practices.md) | `{}` | | `kube-prometheus-stack.kube-state-metrics.nodeSelector` | Node selector for kube-state-metrics. [See docs/best-practices.md for more information.](/docs/best-practices.md) | `{}` | | `kube-prometheus-stack.kube-state-metrics.image.tag` | Tag for kube-state-metrics Docker image. | `v2.7.0` | -| `kube-prometheus-stack.additionalPrometheusRulesMap` | Custom recording or alerting rules to be deployed into the cluster | See [values.yaml] | | `kube-prometheus-stack.commonLabels` | Labels to apply to all Kube Prometheus Stack resources | `{}` | | `kube-prometheus-stack.coreDns.serviceMonitor.interval` | Core DNS metrics scrape interval. If not set, the Prometheus default scrape interval is used. 
| `Nil` | | `kube-prometheus-stack.coreDns.serviceMonitor.metricRelabelings` | Core DNS MetricRelabelConfigs | See [values.yaml] | diff --git a/deploy/helm/sumologic/values.yaml b/deploy/helm/sumologic/values.yaml index 509481b906..0c3e2ed5c7 100644 --- a/deploy/helm/sumologic/values.yaml +++ b/deploy/helm/sumologic/values.yaml @@ -757,198 +757,6 @@ kube-prometheus-stack: prometheusOperator: false windows: false - ## k8s pre-1.14 prometheus recording rules - additionalPrometheusRulesMap: - pre-1.14-node-rules: - groups: - - name: node-pre-1.14.rules - rules: - - expr: sum(min(kube_pod_info) by (node)) - record: ":kube_pod_info_node_count:" - - expr: 1 - avg(rate(node_cpu_seconds_total{job="node-exporter",mode="idle"}[1m])) - record: :node_cpu_utilisation:avg1m - - expr: |- - 1 - avg by (node) ( - rate(node_cpu_seconds_total{job="node-exporter",mode="idle"}[1m]) - * on (namespace, pod) group_left(node) - node_namespace_pod:kube_pod_info:) - record: node:node_cpu_utilisation:avg1m - - expr: |- - 1 - - sum( - node_memory_MemFree_bytes{job="node-exporter"} + - node_memory_Cached_bytes{job="node-exporter"} + - node_memory_Buffers_bytes{job="node-exporter"} - ) - / - sum(node_memory_MemTotal_bytes{job="node-exporter"}) - record: ":node_memory_utilisation:" - - expr: |- - sum by (node) ( - ( - node_memory_MemFree_bytes{job="node-exporter"} + - node_memory_Cached_bytes{job="node-exporter"} + - node_memory_Buffers_bytes{job="node-exporter"} - ) - * on (namespace, pod) group_left(node) - node_namespace_pod:kube_pod_info: - ) - record: node:node_memory_bytes_available:sum - - expr: |- - (node:node_memory_bytes_total:sum - node:node_memory_bytes_available:sum) - / - node:node_memory_bytes_total:sum - record: node:node_memory_utilisation:ratio - - expr: |- - 1 - - sum by (node) ( - ( - node_memory_MemFree_bytes{job="node-exporter"} + - node_memory_Cached_bytes{job="node-exporter"} + - node_memory_Buffers_bytes{job="node-exporter"} - ) - * on (namespace, pod) group_left(node) - node_namespace_pod:kube_pod_info: - ) - / - sum by (node) ( - node_memory_MemTotal_bytes{job="node-exporter"} - * on (namespace, pod) group_left(node) - node_namespace_pod:kube_pod_info: - ) - record: "node:node_memory_utilisation:" - - expr: 1 - (node:node_memory_bytes_available:sum / node:node_memory_bytes_total:sum) - record: "node:node_memory_utilisation_2:" - - expr: |- - max by (instance, namespace, pod, device) ((node_filesystem_size_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"} - - node_filesystem_avail_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"}) - / node_filesystem_size_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"}) - record: "node:node_filesystem_usage:" - - expr: |- - sum by (node) ( - node_memory_MemTotal_bytes{job="node-exporter"} - * on (namespace, pod) group_left(node) - node_namespace_pod:kube_pod_info: - ) - record: node:node_memory_bytes_total:sum - - expr: |- - sum(irate(node_network_receive_bytes_total{job="node-exporter",device!~"veth.+"}[1m])) + - sum(irate(node_network_transmit_bytes_total{job="node-exporter",device!~"veth.+"}[1m])) - record: :node_net_utilisation:sum_irate - - expr: |- - sum by (node) ( - (irate(node_network_receive_bytes_total{job="node-exporter",device!~"veth.+"}[1m]) + - irate(node_network_transmit_bytes_total{job="node-exporter",device!~"veth.+"}[1m])) - * on (namespace, pod) group_left(node) - node_namespace_pod:kube_pod_info: - ) - record: node:node_net_utilisation:sum_irate - - expr: |- - sum(irate(node_network_receive_drop_total{job="node-exporter",device!~"veth.+"}[1m])) + - 
sum(irate(node_network_transmit_drop_total{job="node-exporter",device!~"veth.+"}[1m])) - record: :node_net_saturation:sum_irate - - expr: |- - sum by (node) ( - (irate(node_network_receive_drop_total{job="node-exporter",device!~"veth.+"}[1m]) + - irate(node_network_transmit_drop_total{job="node-exporter",device!~"veth.+"}[1m])) - * on (namespace, pod) group_left(node) - node_namespace_pod:kube_pod_info: - ) - record: node:node_net_saturation:sum_irate - - expr: |- - sum(node_load1{job="node-exporter"}) - / - sum(node:node_num_cpu:sum) - record: ":node_cpu_saturation_load1:" - - expr: avg(irate(node_disk_io_time_weighted_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m])) - record: :node_disk_saturation:avg_irate - - expr: |- - avg by (node) ( - irate(node_disk_io_time_weighted_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m]) - * on (namespace, pod) group_left(node) - node_namespace_pod:kube_pod_info: - ) - record: node:node_disk_saturation:avg_irate - - expr: avg(irate(node_disk_io_time_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m])) - record: :node_disk_utilisation:avg_irate - - expr: |- - avg by (node) ( - irate(node_disk_io_time_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m]) - * on (namespace, pod) group_left(node) - node_namespace_pod:kube_pod_info: - ) - record: node:node_disk_utilisation:avg_irate - - expr: |- - 1e3 * sum( - (rate(node_vmstat_pgpgin{job="node-exporter"}[1m]) - + rate(node_vmstat_pgpgout{job="node-exporter"}[1m])) - ) - record: :node_memory_swap_io_bytes:sum_rate - - expr: |- - 1e3 * sum by (node) ( - (rate(node_vmstat_pgpgin{job="node-exporter"}[1m]) - + rate(node_vmstat_pgpgout{job="node-exporter"}[1m])) - * on (namespace, pod) group_left(node) - node_namespace_pod:kube_pod_info: - ) - record: node:node_memory_swap_io_bytes:sum_rate - - expr: |- - node:node_cpu_utilisation:avg1m - * - node:node_num_cpu:sum - / - scalar(sum(node:node_num_cpu:sum)) - record: node:cluster_cpu_utilisation:ratio - - expr: |- - (node:node_memory_bytes_total:sum - node:node_memory_bytes_available:sum) - / - scalar(sum(node:node_memory_bytes_total:sum)) - record: node:cluster_memory_utilisation:ratio - - expr: |- - sum by (node) ( - node_load1{job="node-exporter"} - * on (namespace, pod) group_left(node) - node_namespace_pod:kube_pod_info: - ) - / - node:node_num_cpu:sum - record: "node:node_cpu_saturation_load1:" - - expr: |- - max by (instance, namespace, pod, device) ( - node_filesystem_avail_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"} - / - node_filesystem_size_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"} - ) - record: "node:node_filesystem_avail:" - - expr: |- - max( - max( - kube_pod_info{job="kube-state-metrics", host_ip!=""} - ) by (node, host_ip) - * on (host_ip) group_right (node) - label_replace( - ( - max(node_filesystem_files{job="node-exporter", mountpoint="/"}) - by (instance) - ), "host_ip", "$1", "instance", "(.*):.*" - ) - ) by (node) - record: "node:node_inodes_total:" - - expr: |- - max( - max( - kube_pod_info{job="kube-state-metrics", host_ip!=""} - ) by (node, host_ip) - * on (host_ip) group_right (node) - label_replace( - ( - max(node_filesystem_files_free{job="node-exporter", mountpoint="/"}) - by (instance) - ), "host_ip", "$1", "instance", "(.*):.*" - ) - ) by (node) - record: "node:node_inodes_free:" - ## NOTE changing the serviceMonitor scrape interval to be >1m can result in metrics from recording ## rules to be missing and 
empty panels in Sumo Logic Kubernetes apps. kubeApiServer: @@ -1416,40 +1224,13 @@ kube-prometheus-stack: sourceLabels: [job, __name__] ## prometheus operator rules ## :kube_pod_info_node_count: - ## :node_cpu_saturation_load1: - ## :node_cpu_utilisation:avg1m - ## :node_disk_saturation:avg_irate - ## :node_disk_utilisation:avg_irate - ## :node_memory_swap_io_bytes:sum_rate - ## :node_memory_utilisation: - ## :node_net_saturation:sum_irate - ## :node_net_utilisation:sum_irate ## cluster_quantile:apiserver_request_duration_seconds:histogram_quantile ## cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile ## cluster_quantile:scheduler_framework_extension_point_duration_seconds:histogram_quantile ## cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile ## cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile - ## instance:node_filesystem_usage:sum # no rules definition found - ## instance:node_network_receive_bytes:rate:sum - ## node:cluster_cpu_utilisation:ratio - ## node:cluster_memory_utilisation:ratio - ## node:node_cpu_saturation_load1: - ## node:node_cpu_utilisation:avg1m - ## node:node_disk_saturation:avg_irate - ## node:node_disk_utilisation:avg_irate - ## node:node_filesystem_avail: - ## node:node_filesystem_usage: ## node:node_inodes_free: ## node:node_inodes_total: - ## node:node_memory_bytes_total:sum - ## node:node_memory_swap_io_bytes:sum_rate - ## node:node_memory_utilisation: - ## node:node_memory_utilisation:ratio - ## node:node_memory_utilisation_2: - ## node:node_net_saturation:sum_irate - ## node:node_net_utilisation:sum_irate - ## node:node_num_cpu:sum - ## node_namespace_pod:kube_pod_info: - url: http://$(METADATA_METRICS_SVC).$(NAMESPACE).svc.cluster.local.:9888/prometheus.metrics.operator.rule remoteTimeout: 5s writeRelabelConfigs: @@ -1457,7 +1238,7 @@ kube-prometheus-stack: regex: ^true$ sourceLabels: [_sumo_forward_] - action: keep - regex: "cluster_quantile:apiserver_request_duration_seconds:histogram_quantile|instance:node_filesystem_usage:sum|instance:node_network_receive_bytes:rate:sum|cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile|cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile|cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile|cluster_quantile:scheduler_framework_extension_point_duration_seconds:histogram_quantile|node_namespace_pod:kube_pod_info:|:kube_pod_info_node_count:|node:node_num_cpu:sum|:node_cpu_utilisation:avg1m|node:node_cpu_utilisation:avg1m|node:cluster_cpu_utilisation:ratio|:node_cpu_saturation_load1:|node:node_cpu_saturation_load1:|:node_memory_utilisation:|node:node_memory_bytes_total:sum|node:node_memory_utilisation:ratio|node:cluster_memory_utilisation:ratio|:node_memory_swap_io_bytes:sum_rate|node:node_memory_utilisation:|node:node_memory_utilisation_2:|node:node_memory_swap_io_bytes:sum_rate|:node_disk_utilisation:avg_irate|node:node_disk_utilisation:avg_irate|:node_disk_saturation:avg_irate|node:node_disk_saturation:avg_irate|node:node_filesystem_usage:|node:node_filesystem_avail:|:node_net_utilisation:sum_irate|node:node_net_utilisation:sum_irate|:node_net_saturation:sum_irate|node:node_net_saturation:sum_irate|node:node_inodes_total:|node:node_inodes_free:" + regex: 
"cluster_quantile:apiserver_request_duration_seconds:histogram_quantile|cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile|cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile|cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile|cluster_quantile:scheduler_framework_extension_point_duration_seconds:histogram_quantile|:kube_pod_info_node_count:|node:node_inodes_total:|node:node_inodes_free:" sourceLabels: [__name__] ## Nginx ingress controller metrics ## rel: https://docs.nginx.com/nginx-ingress-controller/logging-and-monitoring/prometheus/#available-metrics diff --git a/examples/kube_prometheus_stack/values-prometheus.yaml b/examples/kube_prometheus_stack/values-prometheus.yaml index de25b04941..f26ac0661d 100644 --- a/examples/kube_prometheus_stack/values-prometheus.yaml +++ b/examples/kube_prometheus_stack/values-prometheus.yaml @@ -16,198 +16,6 @@ ## Labels to apply to all kube-prometheus-stack resources commonLabels: {} -## k8s pre-1.14 prometheus recording rules -additionalPrometheusRulesMap: - pre-1.14-node-rules: - groups: - - name: node-pre-1.14.rules - rules: - - expr: sum(min(kube_pod_info) by (node)) - record: ":kube_pod_info_node_count:" - - expr: 1 - avg(rate(node_cpu_seconds_total{job="node-exporter",mode="idle"}[1m])) - record: :node_cpu_utilisation:avg1m - - expr: |- - 1 - avg by (node) ( - rate(node_cpu_seconds_total{job="node-exporter",mode="idle"}[1m]) - * on (namespace, pod) group_left(node) - node_namespace_pod:kube_pod_info:) - record: node:node_cpu_utilisation:avg1m - - expr: |- - 1 - - sum( - node_memory_MemFree_bytes{job="node-exporter"} + - node_memory_Cached_bytes{job="node-exporter"} + - node_memory_Buffers_bytes{job="node-exporter"} - ) - / - sum(node_memory_MemTotal_bytes{job="node-exporter"}) - record: ":node_memory_utilisation:" - - expr: |- - sum by (node) ( - ( - node_memory_MemFree_bytes{job="node-exporter"} + - node_memory_Cached_bytes{job="node-exporter"} + - node_memory_Buffers_bytes{job="node-exporter"} - ) - * on (namespace, pod) group_left(node) - node_namespace_pod:kube_pod_info: - ) - record: node:node_memory_bytes_available:sum - - expr: |- - (node:node_memory_bytes_total:sum - node:node_memory_bytes_available:sum) - / - node:node_memory_bytes_total:sum - record: node:node_memory_utilisation:ratio - - expr: |- - 1 - - sum by (node) ( - ( - node_memory_MemFree_bytes{job="node-exporter"} + - node_memory_Cached_bytes{job="node-exporter"} + - node_memory_Buffers_bytes{job="node-exporter"} - ) - * on (namespace, pod) group_left(node) - node_namespace_pod:kube_pod_info: - ) - / - sum by (node) ( - node_memory_MemTotal_bytes{job="node-exporter"} - * on (namespace, pod) group_left(node) - node_namespace_pod:kube_pod_info: - ) - record: "node:node_memory_utilisation:" - - expr: 1 - (node:node_memory_bytes_available:sum / node:node_memory_bytes_total:sum) - record: "node:node_memory_utilisation_2:" - - expr: |- - max by (instance, namespace, pod, device) ((node_filesystem_size_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"} - - node_filesystem_avail_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"}) - / node_filesystem_size_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"}) - record: "node:node_filesystem_usage:" - - expr: |- - sum by (node) ( - node_memory_MemTotal_bytes{job="node-exporter"} - * on (namespace, pod) group_left(node) - node_namespace_pod:kube_pod_info: - ) - record: node:node_memory_bytes_total:sum - - expr: |- - 
sum(irate(node_network_receive_bytes_total{job="node-exporter",device!~"veth.+"}[1m])) + - sum(irate(node_network_transmit_bytes_total{job="node-exporter",device!~"veth.+"}[1m])) - record: :node_net_utilisation:sum_irate - - expr: |- - sum by (node) ( - (irate(node_network_receive_bytes_total{job="node-exporter",device!~"veth.+"}[1m]) + - irate(node_network_transmit_bytes_total{job="node-exporter",device!~"veth.+"}[1m])) - * on (namespace, pod) group_left(node) - node_namespace_pod:kube_pod_info: - ) - record: node:node_net_utilisation:sum_irate - - expr: |- - sum(irate(node_network_receive_drop_total{job="node-exporter",device!~"veth.+"}[1m])) + - sum(irate(node_network_transmit_drop_total{job="node-exporter",device!~"veth.+"}[1m])) - record: :node_net_saturation:sum_irate - - expr: |- - sum by (node) ( - (irate(node_network_receive_drop_total{job="node-exporter",device!~"veth.+"}[1m]) + - irate(node_network_transmit_drop_total{job="node-exporter",device!~"veth.+"}[1m])) - * on (namespace, pod) group_left(node) - node_namespace_pod:kube_pod_info: - ) - record: node:node_net_saturation:sum_irate - - expr: |- - sum(node_load1{job="node-exporter"}) - / - sum(node:node_num_cpu:sum) - record: ":node_cpu_saturation_load1:" - - expr: avg(irate(node_disk_io_time_weighted_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m])) - record: :node_disk_saturation:avg_irate - - expr: |- - avg by (node) ( - irate(node_disk_io_time_weighted_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m]) - * on (namespace, pod) group_left(node) - node_namespace_pod:kube_pod_info: - ) - record: node:node_disk_saturation:avg_irate - - expr: avg(irate(node_disk_io_time_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m])) - record: :node_disk_utilisation:avg_irate - - expr: |- - avg by (node) ( - irate(node_disk_io_time_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m]) - * on (namespace, pod) group_left(node) - node_namespace_pod:kube_pod_info: - ) - record: node:node_disk_utilisation:avg_irate - - expr: |- - 1e3 * sum( - (rate(node_vmstat_pgpgin{job="node-exporter"}[1m]) - + rate(node_vmstat_pgpgout{job="node-exporter"}[1m])) - ) - record: :node_memory_swap_io_bytes:sum_rate - - expr: |- - 1e3 * sum by (node) ( - (rate(node_vmstat_pgpgin{job="node-exporter"}[1m]) - + rate(node_vmstat_pgpgout{job="node-exporter"}[1m])) - * on (namespace, pod) group_left(node) - node_namespace_pod:kube_pod_info: - ) - record: node:node_memory_swap_io_bytes:sum_rate - - expr: |- - node:node_cpu_utilisation:avg1m - * - node:node_num_cpu:sum - / - scalar(sum(node:node_num_cpu:sum)) - record: node:cluster_cpu_utilisation:ratio - - expr: |- - (node:node_memory_bytes_total:sum - node:node_memory_bytes_available:sum) - / - scalar(sum(node:node_memory_bytes_total:sum)) - record: node:cluster_memory_utilisation:ratio - - expr: |- - sum by (node) ( - node_load1{job="node-exporter"} - * on (namespace, pod) group_left(node) - node_namespace_pod:kube_pod_info: - ) - / - node:node_num_cpu:sum - record: "node:node_cpu_saturation_load1:" - - expr: |- - max by (instance, namespace, pod, device) ( - node_filesystem_avail_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"} - / - node_filesystem_size_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"} - ) - record: "node:node_filesystem_avail:" - - expr: |- - max( - max( - kube_pod_info{job="kube-state-metrics", host_ip!=""} - ) by (node, host_ip) - * on (host_ip) group_right (node) - label_replace( - ( 
- max(node_filesystem_files{job="node-exporter", mountpoint="/"}) - by (instance) - ), "host_ip", "$1", "instance", "(.*):.*" - ) - ) by (node) - record: "node:node_inodes_total:" - - expr: |- - max( - max( - kube_pod_info{job="kube-state-metrics", host_ip!=""} - ) by (node, host_ip) - * on (host_ip) group_right (node) - label_replace( - ( - max(node_filesystem_files_free{job="node-exporter", mountpoint="/"}) - by (instance) - ), "host_ip", "$1", "instance", "(.*):.*" - ) - ) by (node) - record: "node:node_inodes_free:" - ## NOTE changing the serviceMonitor scrape interval to be >1m can result in metrics from recording ## rules to be missing and empty panels in Sumo Logic Kubernetes apps. kubeApiServer: diff --git a/tests/integration/internal/constants.go b/tests/integration/internal/constants.go index 5ac79f0eba..80715f1056 100644 --- a/tests/integration/internal/constants.go +++ b/tests/integration/internal/constants.go @@ -284,38 +284,44 @@ var ( "prometheus_remote_storage_shards_min", "prometheus_remote_storage_string_interner_zero_reference_releases_total", } - RecordingRuleMetrics = []string{ - ":kube_pod_info_node_count:", - "node:node_num_cpu:sum", - "node_namespace_pod:kube_pod_info:", - "node:node_cpu_utilisation:avg1m", - ":node_memory_utilisation:", - // "node:node_memory_bytes_available:sum", // not present, depends on other recording rules that don't exist - "node:node_memory_utilisation:ratio", - "node:node_memory_utilisation:", - "node:node_memory_utilisation_2:", - "node:node_filesystem_usage:", - "node:node_memory_bytes_total:sum", - ":node_net_utilisation:sum_irate", - "node:node_net_utilisation:sum_irate", - ":node_net_saturation:sum_irate", - "node:node_net_saturation:sum_irate", - ":node_cpu_utilisation:avg1m", - ":node_cpu_saturation_load1:", - ":node_disk_saturation:avg_irate", - "node:node_disk_saturation:avg_irate", - ":node_disk_utilisation:avg_irate", - "node:node_disk_utilisation:avg_irate", - ":node_memory_swap_io_bytes:sum_rate", - "node:node_memory_swap_io_bytes:sum_rate", - "node:cluster_cpu_utilisation:ratio", - "node:cluster_memory_utilisation:ratio", - "node:node_cpu_saturation_load1:", - "node:node_filesystem_avail:", - // "node:node_inodes_total:", // looks like we're not collecting node_filesystem_files which this requires - // "node:node_inodes_free:", // looks like we're not collecting node_filesystem_files_free which this requires - "instance:node_network_receive_bytes:rate:sum", + FluentBitMetrics = []string{ + "fluentbit_build_info", + "fluentbit_filter_add_records_total", + "fluentbit_filter_bytes_total", + "fluentbit_filter_drop_records_total", + "fluentbit_filter_records_total", + "fluentbit_input_bytes_total", + "fluentbit_input_files_closed_total", + "fluentbit_input_files_opened_total", + "fluentbit_input_files_rotated_total", + "fluentbit_input_records_total", + "fluentbit_output_dropped_records_total", + "fluentbit_output_errors_total", + "fluentbit_output_proc_bytes_total", + "fluentbit_output_proc_records_total", + "fluentbit_output_retried_records_total", + "fluentbit_output_retries_failed_total", + "fluentbit_output_retries_total", + "fluentbit_uptime", + } + FluentDMetrics = []string{ + "fluentd_output_status_buffer_available_space_ratio", + "fluentd_output_status_buffer_queue_length", + "fluentd_output_status_buffer_stage_byte_size", + "fluentd_output_status_buffer_stage_length", + "fluentd_output_status_buffer_total_bytes", + "fluentd_output_status_emit_count", + "fluentd_output_status_emit_records", + 
"fluentd_output_status_flush_time_count", + "fluentd_output_status_num_errors", + "fluentd_output_status_queue_byte_size", + "fluentd_output_status_retry_count", + "fluentd_output_status_retry_wait", + "fluentd_output_status_rollback_count", + "fluentd_output_status_slow_flush_count", + "fluentd_output_status_write_count", } + OtherMetrics = []string{ "up", } @@ -379,7 +385,6 @@ var ( CAdvisorMetrics, NodeExporterMetrics, PrometheusMetrics, - RecordingRuleMetrics, OtherMetrics, } DefaultExpectedNginxAnnotatedMetricsGroups = [][]string{ From d6c4554e572f227d23390f8db8069c067ae77d68 Mon Sep 17 00:00:00 2001 From: Raj Nishtala Date: Wed, 16 Aug 2023 16:41:26 -0400 Subject: [PATCH 2/3] Drop the default recording rules --- deploy/helm/sumologic/values.yaml | 25 ++++--------------------- tests/integration/internal/constants.go | 1 + 2 files changed, 5 insertions(+), 21 deletions(-) diff --git a/deploy/helm/sumologic/values.yaml b/deploy/helm/sumologic/values.yaml index 0c3e2ed5c7..16fc8afbae 100644 --- a/deploy/helm/sumologic/values.yaml +++ b/deploy/helm/sumologic/values.yaml @@ -741,7 +741,7 @@ kube-prometheus-stack: kubelet: false kubeProxy: false kubePrometheusGeneral: false - kubePrometheusNodeRecording: true + kubePrometheusNodeRecording: false kubernetesApps: false kubernetesResources: false kubernetesStorage: false @@ -750,7 +750,7 @@ kube-prometheus-stack: kubeSchedulerRecording: false kubeStateMetrics: false network: false - node: true + node: false nodeExporterAlerting: false nodeExporterRecording: false prometheus: false @@ -1220,26 +1220,9 @@ kube-prometheus-stack: remoteTimeout: 5s writeRelabelConfigs: - action: keep - regex: node-exporter;(?:node_load1|node_load5|node_load15|node_cpu_seconds_total) + regex: node-exporter;(?:node_load1|node_load5|node_load15|node_cpu_seconds_total|node_disk_io_time_weighted_seconds_total|node_disk_io_time_seconds_total|node_vmstat_pgpgin|node_vmstat_pgpgout|node_memory_MemFree_bytes|node_memory_Cached_bytes|node_memory_Buffers_bytes|node_memory_MemTotal_bytes|node_network_receive_drop_total|node_network_transmit_drop_total|node_network_receive_bytes_total|node_network_transmit_bytes_total|node_filesystem_avail_bytes|node_filesystem_size_bytes) sourceLabels: [job, __name__] - ## prometheus operator rules - ## :kube_pod_info_node_count: - ## cluster_quantile:apiserver_request_duration_seconds:histogram_quantile - ## cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile - ## cluster_quantile:scheduler_framework_extension_point_duration_seconds:histogram_quantile - ## cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile - ## cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile - ## node:node_inodes_free: - ## node:node_inodes_total: - - url: http://$(METADATA_METRICS_SVC).$(NAMESPACE).svc.cluster.local.:9888/prometheus.metrics.operator.rule - remoteTimeout: 5s - writeRelabelConfigs: - - action: drop - regex: ^true$ - sourceLabels: [_sumo_forward_] - - action: keep - regex: "cluster_quantile:apiserver_request_duration_seconds:histogram_quantile|cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile|cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile|cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile|cluster_quantile:scheduler_framework_extension_point_duration_seconds:histogram_quantile|:kube_pod_info_node_count:|node:node_inodes_total:|node:node_inodes_free:" - sourceLabels: [__name__] + ## Nginx ingress 
controller metrics ## rel: https://docs.nginx.com/nginx-ingress-controller/logging-and-monitoring/prometheus/#available-metrics ## nginx_ingress_controller_ingress_resources_total diff --git a/tests/integration/internal/constants.go b/tests/integration/internal/constants.go index 80715f1056..74029ad90a 100644 --- a/tests/integration/internal/constants.go +++ b/tests/integration/internal/constants.go @@ -386,6 +386,7 @@ var ( NodeExporterMetrics, PrometheusMetrics, OtherMetrics, + AdditionalNodeExporterMetrics, } DefaultExpectedNginxAnnotatedMetricsGroups = [][]string{ NginxMetrics, From 56debcdd5d13d49a364233b65ba8e08287b00500 Mon Sep 17 00:00:00 2001 From: Raj Nishtala Date: Thu, 17 Aug 2023 16:51:48 -0400 Subject: [PATCH 3/3] feat(prometheus): test fixes and doc changes after dropping recording rule metrics --- docs/v4-migration-doc.md | 62 ++++++++++++++++++- .../additional_endpoints.output.yaml | 1 - .../metadata_metrics_otc/basic.output.yaml | 3 +- .../metadata_metrics_otc/custom.output.yaml | 3 +- 4 files changed, 63 insertions(+), 6 deletions(-) diff --git a/docs/v4-migration-doc.md b/docs/v4-migration-doc.md index a65c5f4606..07497730b7 100644 --- a/docs/v4-migration-doc.md +++ b/docs/v4-migration-doc.md @@ -31,6 +31,11 @@ format. Please check [Solution Overview][solution-overview] and see below for de [solution-overview]: /docs/README.md#solution-overview +### Drop Prometheus recording rule metrics + +OpenTelemetry can't collect Prometheus recording rule metrics. The new version therefore stops collecting recording rule metrics and updates +will be made to the Kubernetes App to remove these metrics. + ## How to upgrade ### Requirements @@ -95,4 +100,59 @@ require additional action. ## Full list of changes -:construction: +- Drop Prometheus recording rule metrics + + OpenTelemetry can't collect Prometheus recording rule metrics. 
The new version therefore stops collecting the following recording rule + metrics + + - kube_pod_info_node_count + - node_cpu_saturation_load1 + - node_cpu_utilisation:avg1m + - node_disk_saturation:avg_irate + - node_disk_utilisation:avg_irate + - node_memory_swap_io_bytes:sum_rate + - node_memory_utilisation + - node_net_saturation:sum_irate + - node_net_utilisation:sum_irate + - cluster_quantile:apiserver_request_duration_seconds:histogram_quantile + - cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile + - cluster_quantile:scheduler_framework_extension_point_duration_seconds:histogram_quantile + - cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile + - cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile + - instance:node_network_receive_bytes:rate:sum + - node:cluster_cpu_utilisation:ratio + - node:cluster_memory_utilisation:ratio + - node:node_cpu_saturation_load1 + - node:node_cpu_utilisation:avg1m + - node:node_disk_saturation:avg_irate + - node:node_disk_utilisation:avg_irate + - node:node_filesystem_avail + - node:node_filesystem_usage + - node:node_inodes_free + - node:node_inodes_total + - node:node_memory_bytes_total:sum + - node:node_memory_swap_io_bytes:sum_rate + - node:node_memory_utilisation + - node:node_memory_utilisation:ratio + - node:node_memory_utilisation_2 + - node:node_net_saturation:sum_irate + - node:node_net_utilisation:sum_irate + - node:node_num_cpu:sum + - node_namespace_pod:kube_pod_info + + Instead, the following new node metrics are now collected + + - node_disk_io_time_weighted_seconds_total + - node_disk_io_time_seconds_total + - node_vmstat_pgpgin + - node_vmstat_pgpgout + - node_memory_MemFree_bytes + - node_memory_Cached_bytes + - node_memory_Buffers_bytes + - node_memory_MemTotal_bytes + - node_network_receive_drop_total + - node_network_transmit_drop_total + - node_network_receive_bytes_total + - node_network_transmit_bytes_total + - node_filesystem_avail_bytes + - node_filesystem_size_bytes diff --git a/tests/helm/testdata/goldenfile/metadata_metrics_otc/additional_endpoints.output.yaml b/tests/helm/testdata/goldenfile/metadata_metrics_otc/additional_endpoints.output.yaml index d835fca4fe..761ce8ed83 100644 --- a/tests/helm/testdata/goldenfile/metadata_metrics_otc/additional_endpoints.output.yaml +++ b/tests/helm/testdata/goldenfile/metadata_metrics_otc/additional_endpoints.output.yaml @@ -305,7 +305,6 @@ data: "/prometheus.metrics.applications.varnish", "/prometheus.metrics.custom", "/prometheus.metrics.node", - "/prometheus.metrics.operator.rule", "/prometheus.metrics.others" ] service: diff --git a/tests/helm/testdata/goldenfile/metadata_metrics_otc/basic.output.yaml b/tests/helm/testdata/goldenfile/metadata_metrics_otc/basic.output.yaml index 388a7323c0..2abcc4709c 100644 --- a/tests/helm/testdata/goldenfile/metadata_metrics_otc/basic.output.yaml +++ b/tests/helm/testdata/goldenfile/metadata_metrics_otc/basic.output.yaml @@ -303,8 +303,7 @@ data: "/prometheus.metrics.applications.squidproxy", "/prometheus.metrics.applications.tomcat", "/prometheus.metrics.applications.varnish", - "/prometheus.metrics.node", - "/prometheus.metrics.operator.rule" + "/prometheus.metrics.node" ] service: extensions: diff --git a/tests/helm/testdata/goldenfile/metadata_metrics_otc/custom.output.yaml b/tests/helm/testdata/goldenfile/metadata_metrics_otc/custom.output.yaml index 69c63e68f8..2394db5d5e 100644 --- a/tests/helm/testdata/goldenfile/metadata_metrics_otc/custom.output.yaml +++ 
b/tests/helm/testdata/goldenfile/metadata_metrics_otc/custom.output.yaml @@ -184,8 +184,7 @@ data: "/prometheus.metrics.applications.squidproxy", "/prometheus.metrics.applications.tomcat", "/prometheus.metrics.applications.varnish", - "/prometheus.metrics.node", - "/prometheus.metrics.operator.rule" + "/prometheus.metrics.node" ] service: extensions:
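
Note on restoring dropped rules: if dashboards or queries still depend on any of the removed recording rules, they can be re-added on the user side through the upstream kube-prometheus-stack value `additionalPrometheusRulesMap`; this patch only drops the chart's default rule map (and its README row), not the upstream setting. Below is a minimal sketch of such an override, reusing two of the rule definitions deleted above; the `restored-node-rules` key and the group name are placeholders, not names used by the chart.

```yaml
# User-side values override (sketch, not part of this patch): re-adds two of the
# removed pre-1.14 recording rules via the upstream kube-prometheus-stack value
# `additionalPrometheusRulesMap`. Rule bodies are copied from the defaults that
# this patch deletes; the map key and group name below are arbitrary placeholders.
kube-prometheus-stack:
  additionalPrometheusRulesMap:
    restored-node-rules:
      groups:
        - name: restored-node.rules
          rules:
            # Number of nodes, derived from kube_pod_info
            - expr: sum(min(kube_pod_info) by (node))
              record: ":kube_pod_info_node_count:"
            # Cluster-wide CPU utilisation averaged over 1m
            - expr: 1 - avg(rate(node_cpu_seconds_total{job="node-exporter",mode="idle"}[1m]))
              record: ":node_cpu_utilisation:avg1m"
```

Keep in mind that this change also removes the dedicated `/prometheus.metrics.operator.rule` remote write, so any re-added recorded series would additionally need to be matched by one of the remaining `writeRelabelConfigs` (or a user-defined remote write) to reach Sumo Logic. Alternatively, equivalents of several dropped rules can be computed at query time from the raw node-exporter metrics the chart now forwards (for example, `node_cpu_seconds_total`, `node_memory_MemTotal_bytes`, and `node_filesystem_size_bytes`).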