From 20e9428336d8cb6fcfe97bf556700daff98b53c4 Mon Sep 17 00:00:00 2001 From: Thibault Mange <22740367+thibaultmg@users.noreply.github.com> Date: Thu, 19 Dec 2024 10:28:36 +0100 Subject: [PATCH] fix deprecated metrics Signed-off-by: Thibault Mange <22740367+thibaultmg@users.noreply.github.com> --- .../dash-acm-nexus-clusters-overview.yaml | 1910 ----------------- .../grafana/dash-k8s-networking-cluster.yaml | 15 +- .../acm/dash-acm-nexus-clusters-overview.yaml | 2 +- .../base/grafana/nexus/acm/dash-k8s-etcd.yaml | 2 +- .../grafana/nexus/acm/prometheus-rule.yaml | 31 +- .../base/grafana/nexus/acm/scrape-config.yaml | 2 +- .../hcp/dash-acm-nexus-hcp-overview.yaml | 855 ++++++++ 7 files changed, 890 insertions(+), 1927 deletions(-) delete mode 100644 operators/multiclusterobservability/manifests/base/grafana/dash-acm-nexus-clusters-overview.yaml create mode 100644 operators/multiclusterobservability/manifests/base/grafana/nexus/hcp/dash-acm-nexus-hcp-overview.yaml diff --git a/operators/multiclusterobservability/manifests/base/grafana/dash-acm-nexus-clusters-overview.yaml b/operators/multiclusterobservability/manifests/base/grafana/dash-acm-nexus-clusters-overview.yaml deleted file mode 100644 index 670c60875..000000000 --- a/operators/multiclusterobservability/manifests/base/grafana/dash-acm-nexus-clusters-overview.yaml +++ /dev/null @@ -1,1910 +0,0 @@ -apiVersion: v1 -data: - acm-clusters-overview.json: |- - { - "annotations": { - "list": [ - { - "builtIn": 1, - "datasource": "-- Grafana --", - "enable": true, - "hide": true, - "iconColor": "rgba(0, 211, 255, 1)", - "name": "Annotations & Alerts", - "target": { - "limit": 100, - "matchAny": false, - "tags": [], - "type": "dashboard" - }, - "type": "dashboard" - } - ] - }, - "editable": true, - "gnetId": null, - "graphTooltip": 0, - "id": 1, - "iteration": 1682528664304, - "links": [ - { - "asDropdown": true, - "icon": "external link", - "includeVars": false, - "keepTime": false, - "tags": [], - "targetBlank": true, - 
"title": "All Dashboards", - "tooltip": "", - "type": "dashboards", - "url": "" - } - ], - "panels": [ - { - "collapsed": false, - "datasource": "$datasource", - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 0 - }, - "id": 138, - "panels": [], - "title": "Control Plane Health", - "type": "row" - }, - { - "datasource": "$datasource", - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "custom": { - "align": null, - "displayMode": "auto", - "filterable": false - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - } - }, - "overrides": [ - { - "matcher": { - "id": "byName", - "options": "Value #A" - }, - "properties": [ - { - "id": "displayName", - "value": "Max latency (99th percentile)" - }, - { - "id": "unit", - "value": "s" - }, - { - "id": "custom.displayMode", - "value": "color-background" - }, - { - "id": "custom.align", - "value": "left" - }, - { - "id": "thresholds", - "value": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "orange", - "value": 1 - }, - { - "color": "red", - "value": 2 - } - ] - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Value #B" - }, - "properties": [ - { - "id": "displayName", - "value": "API Errors [1h]" - }, - { - "id": "unit", - "value": "short" - }, - { - "id": "custom.displayMode", - "value": "color-text" - }, - { - "id": "custom.align", - "value": "left" - }, - { - "id": "thresholds", - "value": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "orange", - "value": 1 - }, - { - "color": "red", - "value": 2 - } - ] - } - }, - { - "id": "noValue", - "value": "0" - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "api_up" - }, - "properties": [ - { - "id": "displayName", - "value": "API servers up" - }, - { - "id": "unit", - "value": "percentunit" - }, - { - "id": 
"custom.displayMode", - "value": "color-text" - }, - { - "id": "custom.align", - "value": "left" - }, - { - "id": "thresholds", - "value": { - "mode": "absolute", - "steps": [ - { - "color": "red", - "value": null - }, - { - "color": "orange", - "value": 0 - }, - { - "color": "green", - "value": 1 - } - ] - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "cluster" - }, - "properties": [ - { - "id": "displayName", - "value": "Cluster" - }, - { - "id": "unit", - "value": "short" - }, - { - "id": "decimals", - "value": 2 - }, - { - "id": "links", - "value": [ - { - "targetBlank": false, - "title": "Drill down to cluster", - "url": "/d/09ec8aa1e996d6ffcd6817bbaff4db1b/kubernetes-api-server?${__url_time_range}&var-cluster=${__data.fields.cluster}&var-instance=All" - } - ] - }, - { - "id": "custom.align", - "value": "left" - } - ] - } - ] - }, - "gridPos": { - "h": 7, - "w": 12, - "x": 0, - "y": 1 - }, - "id": 146, - "interval": "4m", - "options": { - "showHeader": true - }, - "pluginVersion": "8.5.20", - "targets": [ - { - "exemplar": true, - "expr": "topk(50, max(apiserver_request_duration_seconds:histogram_quantile_99{cluster=~\"$cluster\",clusterType!=\"ocp3\"}) by (cluster))\n* on(cluster) group_left(api_up) count_values without() (\"api_up\", (sum(up{cluster=~\"$cluster\",job=\"apiserver\",clusterType!=\"ocp3\"} == 1) by (cluster) / count(up{cluster=~\"$cluster\",job=\"apiserver\",clusterType!=\"ocp3\"}) by (cluster)))", - "format": "table", - "instant": true, - "interval": "", - "legendFormat": "", - "refId": "A" - }, - { - "exemplar": true, - "expr": "sum by (cluster)(sum:apiserver_request_total:1h{cluster=~\"$cluster\",code=~\"5..\",clusterType!=\"ocp3\"})", - "format": "table", - "hide": false, - "instant": true, - "interval": "", - "legendFormat": "", - "refId": "B" - } - ], - "title": "Top 50 Max Latency API Server", - "transformations": [ - { - "id": "labelsToFields", - "options": {} - }, - { - "id": "organize", - "options": { - 
"excludeByName": { - "Time": true - }, - "indexByName": { - "Time": 0, - "Value #A": 2, - "Value #B": 4, - "api_up": 3, - "cluster": 1 - }, - "renameByName": {} - } - } - ], - "type": "table" - }, - { - "datasource": null, - "description": "Leader election changes per cluster over the time range selected for dashboard.", - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "custom": { - "align": null, - "displayMode": "auto", - "filterable": false - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [] - } - }, - "overrides": [ - { - "matcher": { - "id": "byName", - "options": "cluster" - }, - "properties": [ - { - "id": "displayName", - "value": "Cluster" - }, - { - "id": "links", - "value": [ - { - "title": "Drill down to cluster", - "url": "/d/N8BxQ2jMz/kubernetes-etcd-cluster?${__url_time_range}&var-cluster=${__data.fields.cluster}" - } - ] - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Value" - }, - "properties": [ - { - "id": "displayName", - "value": "Leader Election Changes" - }, - { - "id": "custom.align", - "value": "left" - }, - { - "id": "thresholds", - "value": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "yellow", - "value": 1 - }, - { - "color": "red", - "value": 2 - } - ] - } - }, - { - "id": "custom.displayMode", - "value": "color-text" - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "db_size" - }, - "properties": [ - { - "id": "displayName", - "value": "DB Size" - }, - { - "id": "unit", - "value": "bytes" - }, - { - "id": "custom.align", - "value": "left" - }, - { - "id": "decimals", - "value": 2 - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "has_leader" - }, - "properties": [ - { - "id": "displayName", - "value": "Has a Leader" - }, - { - "id": "mappings", - "value": [ - { - "options": { - "0": { - "text": "No" - }, - "1": { - "text": "Yes" - } - }, - "type": "value" - } - ] - }, - { - "id": 
"custom.align", - "value": "left" - } - ] - } - ] - }, - "gridPos": { - "h": 7, - "w": 12, - "x": 12, - "y": 1 - }, - "id": 150, - "interval": "1m", - "options": { - "frameIndex": 2, - "showHeader": true, - "sortBy": [] - }, - "pluginVersion": "8.5.20", - "targets": [ - { - "exemplar": true, - "expr": "sum(changes(etcd_server_leader_changes_seen_total{cluster=~\"$cluster\",job=\"etcd\"}[$__range])) by (cluster)\n* on(cluster) group_left(db_size) count_values without() (\"db_size\", max(etcd_debugging_mvcc_db_total_size_in_bytes{cluster=~\"$cluster\",job=\"etcd\"}) by (cluster))\n* on(cluster) group_left(has_leader) count_values without() (\"has_leader\", max(etcd_server_has_leader{cluster=~\"$cluster\",job=\"etcd\"}) by (cluster))", - "format": "table", - "instant": true, - "interval": "", - "legendFormat": "", - "refId": "A" - } - ], - "title": "etcd", - "transformations": [ - { - "id": "filterFieldsByName", - "options": { - "include": { - "names": [ - "cluster", - "db_size", - "has_leader", - "Value" - ] - } - } - }, - { - "id": "organize", - "options": { - "excludeByName": {}, - "indexByName": { - "Value": 2, - "cluster": 0, - "db_size": 3, - "has_leader": 1 - }, - "renameByName": {} - } - } - ], - "type": "table" - }, - { - "collapsed": false, - "datasource": "$datasource", - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 8 - }, - "id": 140, - "panels": [], - "title": "Optimization", - "type": "row" - }, - { - "datasource": "$datasource", - "description": "Highlights % differences between CPU requests commitments vs utilization. 
When this difference is large ( >20%), it means that resources are reserved but unused.", - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "custom": { - "align": null, - "displayMode": "auto", - "filterable": false - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - } - }, - "overrides": [ - { - "matcher": { - "id": "byName", - "options": "Value" - }, - "properties": [ - { - "id": "displayName", - "value": "Overestimation" - }, - { - "id": "unit", - "value": "percentunit" - }, - { - "id": "decimals", - "value": 2 - }, - { - "id": "custom.displayMode", - "value": "color-background" - }, - { - "id": "custom.align", - "value": "left" - }, - { - "id": "thresholds", - "value": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 0.2 - } - ] - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "cpu_requested" - }, - "properties": [ - { - "id": "displayName", - "value": "Requested" - }, - { - "id": "unit", - "value": "percentunit" - }, - { - "id": "decimals", - "value": 2 - }, - { - "id": "custom.align", - "value": "left" - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "cpu_utilized" - }, - "properties": [ - { - "id": "displayName", - "value": "Utilized" - }, - { - "id": "unit", - "value": "percentunit" - }, - { - "id": "decimals", - "value": 2 - }, - { - "id": "custom.align", - "value": "left" - }, - { - "id": "thresholds", - "value": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - } - ] - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "cluster" - }, - "properties": [ - { - "id": "unit", - "value": "short" - }, - { - "id": "decimals", - "value": 2 - }, - { - "id": "links", - "value": [ - { - "targetBlank": false, - "title": "Drill down to cluster", - "url": 
"/d/8Qvi3edMz/acm-resource-optimization-cluster?${__url_time_range}&var-cluster=${__data.fields.cluster}" - } - ] - }, - { - "id": "custom.align", - "value": "left" - }, - { - "id": "displayName", - "value": "Cluster" - } - ] - } - ] - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 9 - }, - "id": 151, - "interval": "5m", - "options": { - "showHeader": true, - "sortBy": [] - }, - "pluginVersion": "8.5.20", - "targets": [ - { - "exemplar": true, - "expr": "topk(50, sum by (cluster) (cluster:cpu_requested:ratio)- sum by (cluster) (cluster:node_cpu:ratio{cluster=~\"$cluster\",clusterType!=\"ocp3\"}))\n* on(cluster) group_left(cpu_requested) count_values without() (\"cpu_requested\", cluster:cpu_requested:ratio)\n* on(cluster) group_left(cpu_utilized) count_values without() (\"cpu_utilized\", cluster:node_cpu:ratio{cluster=~\"$cluster\",clusterType!=\"ocp3\"})", - "instant": true, - "interval": "", - "legendFormat": "", - "refId": "A" - } - ], - "title": "Top 50 CPU Overestimation Clusters", - "transformations": [ - { - "id": "labelsToFields", - "options": {} - }, - { - "id": "organize", - "options": { - "excludeByName": { - "Time": true - }, - "indexByName": { - "Time": 0, - "Value": 2, - "cluster": 1, - "cpu_requested": 3, - "cpu_utilized": 4 - }, - "renameByName": {} - } - } - ], - "type": "table" - }, - { - "datasource": "$datasource", - "description": "Highlights % differences between Memory requests commitments vs utilization. 
When this difference is large ( >20%), it means that resources are reserved but unused.", - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "custom": { - "align": null, - "displayMode": "auto", - "filterable": false - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - } - }, - "overrides": [ - { - "matcher": { - "id": "byName", - "options": "cluster" - }, - "properties": [ - { - "id": "unit", - "value": "short" - }, - { - "id": "decimals", - "value": 2 - }, - { - "id": "links", - "value": [ - { - "targetBlank": false, - "title": "Drill down to cluster", - "url": "/d/8Qvi3edMz/acm-resource-optimization-cluster?${__url_time_range}&var-cluster=${__data.fields.cluster}" - } - ] - }, - { - "id": "custom.align", - "value": "left" - }, - { - "id": "displayName", - "value": "Cluster" - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Value" - }, - "properties": [ - { - "id": "displayName", - "value": "Overestimation" - }, - { - "id": "unit", - "value": "percentunit" - }, - { - "id": "decimals", - "value": 2 - }, - { - "id": "custom.displayMode", - "value": "color-background" - }, - { - "id": "custom.align", - "value": "left" - }, - { - "id": "thresholds", - "value": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 0.2 - } - ] - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "memory_requested" - }, - "properties": [ - { - "id": "displayName", - "value": "Requested" - }, - { - "id": "unit", - "value": "percentunit" - }, - { - "id": "decimals", - "value": 2 - }, - { - "id": "custom.align", - "value": "left" - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "memory_utilized" - }, - "properties": [ - { - "id": "displayName", - "value": "Utilized" - }, - { - "id": "unit", - "value": "percentunit" - }, - { - "id": "decimals", - 
"value": 2 - }, - { - "id": "custom.align", - "value": "left" - }, - { - "id": "thresholds", - "value": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - } - ] - } - } - ] - } - ] - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 9 - }, - "id": 153, - "interval": "5m", - "options": { - "showHeader": true, - "sortBy": [] - }, - "pluginVersion": "8.5.20", - "targets": [ - { - "exemplar": true, - "expr": "topk(50, cluster:memory_requested:ratio{cluster=~\"$cluster\"} - ignoring(usage) cluster:memory_utilized:ratio{cluster=~\"$cluster\"})\n* on(cluster) group_left(memory_requested) count_values without() (\"memory_requested\", cluster:memory_requested:ratio)\n* on(cluster) group_left(memory_utilized) count_values without() (\"memory_utilized\", cluster:memory_utilized:ratio)", - "instant": true, - "interval": "", - "legendFormat": "", - "refId": "A" - } - ], - "title": "Top 50 Memory Overestimation Clusters", - "transformations": [ - { - "id": "labelsToFields", - "options": {} - }, - { - "id": "organize", - "options": { - "excludeByName": { - "Time": true - }, - "indexByName": { - "Time": 0, - "Value": 2, - "cluster": 1, - "memory_requested": 3, - "memory_utilized": 4 - }, - "renameByName": {} - } - } - ], - "type": "table" - }, - { - "collapsed": false, - "datasource": "$datasource", - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 17 - }, - "id": 34, - "panels": [], - "repeat": null, - "title": "Capacity / Utilization", - "type": "row" - }, - { - "datasource": "$datasource", - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "custom": { - "align": null, - "displayMode": "auto", - "filterable": false - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - } - }, - "overrides": [ - { - "matcher": { - "id": "byName", - "options": "cluster" - }, - "properties": [ - { - "id": "displayName", - 
"value": "Cluster" - }, - { - "id": "links", - "value": [ - { - "targetBlank": false, - "title": "Drill down to cluster", - "url": "/d/8Qvi3edMz/acm-resource-optimization-cluster?${__url_time_range}&var-cluster=${__data.fields.cluster}" - } - ] - }, - { - "id": "custom.align", - "value": "left" - }, - { - "id": "unit", - "value": "short" - }, - { - "id": "decimals", - "value": 2 - }, - { - "id": "custom.align", - "value": "left" - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "machine_cpu_cores_sum" - }, - "properties": [ - { - "id": "displayName", - "value": "Total Cores" - }, - { - "id": "unit", - "value": "short" - }, - { - "id": "custom.align", - "value": "left" - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "node_allocatable_cpu_cores_sum" - }, - "properties": [ - { - "id": "displayName", - "value": "Allocatable Cores" - }, - { - "id": "unit", - "value": "short" - }, - { - "id": "custom.align", - "value": "left" - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "cpu_requested" - }, - "properties": [ - { - "id": "displayName", - "value": "Requested" - }, - { - "id": "unit", - "value": "percentunit" - }, - { - "id": "decimals", - "value": 2 - }, - { - "id": "custom.align", - "value": "left" - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Value" - }, - "properties": [ - { - "id": "displayName", - "value": "Utilized" - }, - { - "id": "unit", - "value": "percentunit" - }, - { - "id": "decimals", - "value": 2 - }, - { - "id": "custom.align", - "value": "left" - } - ] - } - ] - }, - "gridPos": { - "h": 7, - "w": 12, - "x": 0, - "y": 18 - }, - "id": 47, - "interval": "5m", - "options": { - "showHeader": true - }, - "pluginVersion": "8.5.20", - "targets": [ - { - "exemplar": true, - "expr": "topk(50, cluster:node_cpu:ratio{cluster=~\"$cluster\"})\n* on(cluster) group_left(machine_cpu_cores_sum) count_values without() (\"machine_cpu_cores_sum\", cluster:cpu_cores:sum)\n* on(cluster) 
group_left(node_allocatable_cpu_cores_sum) count_values without() (\"node_allocatable_cpu_cores_sum\", cluster:cpu_allocatable:sum)\n* on(cluster) group_left(cpu_requested) count_values without() (\"cpu_requested\", cluster:cpu_requested:ratio)", - "instant": true, - "interval": "", - "legendFormat": "", - "refId": "A" - } - ], - "title": "Top 50 CPU Utilized Clusters", - "transformations": [ - { - "id": "labelsToFields", - "options": {} - }, - { - "id": "organize", - "options": { - "excludeByName": { - "Time": true, - "usage": true - }, - "indexByName": { - "Time": 0, - "Value": 5, - "cluster": 1, - "cpu_requested": 4, - "machine_cpu_cores_sum": 2, - "node_allocatable_cpu_cores_sum": 3 - }, - "renameByName": {} - } - } - ], - "type": "table" - }, - { - "aliasColors": {}, - "bars": true, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fieldConfig": { - "defaults": { - "links": [] - }, - "overrides": [] - }, - "fill": 1, - "fillGradient": 0, - "gridPos": { - "h": 7, - "w": 12, - "x": 12, - "y": 18 - }, - "hiddenSeries": false, - "id": 64, - "interval": "4m", - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": false, - "linewidth": 1, - "nullPointMode": "null as zero", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "8.5.20", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "topk(5, cluster:node_cpu:ratio{cluster=~\"$cluster\",clusterType!=\"ocp3\"})", - "hide": false, - "interval": "", - "intervalFactor": 1, - "legendFormat": "{{cluster}}", - "refId": "B" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Top 5 Utilized Clusters (% CPU usage)", - "tooltip": { - "shared": false, - "sort": 0, - "value_type": "individual" - }, - 
"type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "percentunit", - "label": null, - "logBase": 1, - "max": "1", - "min": "0", - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ], - "yaxis": { - "align": false, - "alignLevel": 1 - } - }, - { - "datasource": "$datasource", - "description": "", - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "custom": { - "align": null, - "displayMode": "auto", - "filterable": false - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - } - }, - "overrides": [ - { - "matcher": { - "id": "byName", - "options": "cluster" - }, - "properties": [ - { - "id": "displayName", - "value": "Cluster" - }, - { - "id": "links", - "value": [ - { - "targetBlank": false, - "title": "Drill down to cluster", - "url": "/d/8Qvi3edMz/acm-resource-optimization-cluster?${__url_time_range}&var-cluster=${__data.fields.cluster}" - } - ] - }, - { - "id": "custom.align", - "value": "left" - }, - { - "id": "unit", - "value": "short" - }, - { - "id": "decimals", - "value": 2 - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "machine_memory_sum" - }, - "properties": [ - { - "id": "displayName", - "value": "Available Memory" - }, - { - "id": "unit", - "value": "bytes" - }, - { - "id": "decimals", - "value": 2 - }, - { - "id": "custom.align", - "value": "left" - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "machine_memory_requested" - }, - "properties": [ - { - "id": "displayName", - "value": "Requested" - }, - { - "id": "unit", - "value": "percentunit" - }, - { - "id": "decimals", - "value": 2 - }, - { - "id": "custom.align", - "value": "left" - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Value" - }, - 
"properties": [ - { - "id": "displayName", - "value": "Utilized" - }, - { - "id": "unit", - "value": "percentunit" - }, - { - "id": "decimals", - "value": 2 - }, - { - "id": "custom.align", - "value": "left" - } - ] - } - ] - }, - "gridPos": { - "h": 7, - "w": 12, - "x": 0, - "y": 25 - }, - "id": 60, - "interval": "5m", - "options": { - "showHeader": true - }, - "pluginVersion": "8.5.20", - "targets": [ - { - "exemplar": true, - "expr": "topk(50, cluster:memory_utilized:ratio{cluster=~\"$cluster\"})\n* on(cluster) group_left(machine_memory_sum) count_values without() (\"machine_memory_sum\", cluster:machine_memory:sum)\n* on(cluster) group_left(machine_memory_requested) count_values without() (\"machine_memory_requested\", cluster:memory_requested:ratio)", - "instant": true, - "interval": "", - "legendFormat": "", - "refId": "A" - } - ], - "title": "Top 50 Memory Utilized Clusters", - "transformations": [ - { - "id": "labelsToFields", - "options": {} - }, - { - "id": "organize", - "options": { - "excludeByName": { - "Time": true, - "usage": true - }, - "indexByName": { - "Time": 0, - "Value": 4, - "cluster": 1, - "machine_memory_requested": 3, - "machine_memory_sum": 2 - }, - "renameByName": {} - } - } - ], - "type": "table" - }, - { - "aliasColors": {}, - "bars": true, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fieldConfig": { - "defaults": { - "links": [] - }, - "overrides": [] - }, - "fill": 1, - "fillGradient": 0, - "gridPos": { - "h": 7, - "w": 12, - "x": 12, - "y": 25 - }, - "hiddenSeries": false, - "id": 65, - "interval": "4m", - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": false, - "linewidth": 1, - "nullPointMode": "null as zero", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "8.5.20", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - {} - ], - 
"spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "topk(5, (1 - sum(:node_memory_MemAvailable_bytes:sum) by (cluster) / sum(kube_node_status_allocatable{cluster=~\"$cluster\",resource=\"memory\"}) by (cluster)))", - "format": "time_series", - "instant": false, - "interval": "", - "intervalFactor": 1, - "legendFormat": "{{cluster}}", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Top 5 Utilized Clusters (% Memory usage)", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "decimals": null, - "format": "percentunit", - "label": null, - "logBase": 1, - "max": "1", - "min": "0", - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "datasource": null, - "description": "", - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "custom": { - "align": null, - "displayMode": "auto", - "filterable": false - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - } - }, - "overrides": [ - { - "matcher": { - "id": "byName", - "options": "Value" - }, - "properties": [ - { - "id": "displayName", - "value": "Current Bandwidth Received" - }, - { - "id": "unit", - "value": "Bps" - }, - { - "id": "decimals", - "value": 2 - }, - { - "id": "custom.align", - "value": "left" - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "node_transmit" - }, - "properties": [ - { - "id": "displayName", - "value": "Current Bandwidth Transmitted" - }, - { - "id": "unit", - "value": "Bps" - }, - { - "id": "decimals", - "value": 2 - }, - { - 
"id": "custom.align", - "value": "left" - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "cluster" - }, - "properties": [ - { - "id": "displayName", - "value": "Cluster" - }, - { - "id": "links", - "value": [ - { - "title": "Drill down to cluster", - "url": "/d/ff635a025bcfea7bc3dd4f508990a3e9/kubernetes-networking-cluster?${__url_time_range}&var-cluster=${__data.fields.cluster}" - } - ] - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "node_transmit_drop" - }, - "properties": [ - { - "id": "displayName", - "value": "Rate of Transmitted Packets Dropped" - }, - { - "id": "unit", - "value": "pps" - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "node_receive_drop" - }, - "properties": [ - { - "id": "displayName", - "value": "Rate of Received Packets Dropped" - }, - { - "id": "unit", - "value": "pps" - } - ] - } - ] - }, - "gridPos": { - "h": 8, - "w": 24, - "x": 0, - "y": 32 - }, - "id": 148, - "options": { - "showHeader": true, - "sortBy": [ - { - "desc": true, - "displayName": "Current Bandwidth Received" - } - ] - }, - "pluginVersion": "8.5.20", - "targets": [ - { - "exemplar": true, - "expr": "sum(instance:node_network_receive_bytes_excluding_lo:rate1m{cluster=~\"$cluster\",job=\"node-exporter\",clusterType!=\"ocp3\"}) by (cluster)\n* on(cluster) group_left(node_transmit) count_values without() (\"node_transmit\", sum(instance:node_network_transmit_bytes_excluding_lo:rate1m{cluster=~\"$cluster\",job=\"node-exporter\"}) by (cluster))\n* on(cluster) group_left(node_receive_drop) count_values without() (\"node_receive_drop\", sum(instance:node_network_receive_drop_excluding_lo:rate1m{cluster=~\"$cluster\",job=\"node-exporter\"}) by (cluster))\n* on(cluster) group_left(node_transmit_drop) count_values without() (\"node_transmit_drop\", sum(instance:node_network_transmit_drop_excluding_lo:rate1m{cluster=~\"$cluster\",job=\"node-exporter\"}) by (cluster))", - "format": "table", - "instant": true, - "interval": "", - 
"legendFormat": "", - "refId": "A" - } - ], - "title": "Bandwidth Utilization", - "transformations": [ - { - "id": "filterFieldsByName", - "options": { - "include": { - "names": [ - "cluster", - "node_receive_drop", - "node_transmit", - "node_transmit_drop", - "Value" - ] - } - } - }, - { - "id": "sortBy", - "options": { - "fields": {}, - "sort": [ - { - "field": "Value #A" - } - ] - } - }, - { - "id": "organize", - "options": { - "excludeByName": {}, - "indexByName": { - "Value": 1, - "cluster": 0, - "node_receive_drop": 3, - "node_transmit": 2, - "node_transmit_drop": 4 - }, - "renameByName": {} - } - } - ], - "type": "table" - } - ], - "refresh": "5m", - "schemaVersion": 30, - "style": "light", - "tags": [ - "ACM" - ], - "templating": { - "list": [ - { - "current": { - "selected": false, - "text": "Observatorium", - "value": "Observatorium" - }, - "description": null, - "error": null, - "hide": 2, - "includeAll": false, - "label": null, - "multi": false, - "name": "datasource", - "options": [], - "query": "prometheus", - "refresh": 2, - "regex": "", - "skipUrlSync": false, - "type": "datasource" - }, - { - "allValue": null, - "current": { - "selected": false, - "text": "name", - "value": "name" - }, - "datasource": null, - "definition": "label_values(acm_label_names, label_name)", - "description": null, - "error": null, - "hide": 0, - "includeAll": false, - "label": "Label", - "multi": false, - "name": "acm_label_names", - "options": [], - "query": { - "query": "label_values(acm_label_names, label_name)", - "refId": "StandardVariableQuery" - }, - "refresh": 1, - "regex": "", - "skipUrlSync": false, - "sort": 1, - "type": "query" - }, - { - "allValue": null, - "current": { - "selected": false, - "text": "All", - "value": "$__all" - }, - "datasource": null, - "definition": "label_values(acm_managed_cluster_labels, $acm_label_names)", - "description": null, - "error": null, - "hide": 0, - "includeAll": true, - "label": "Value", - "multi": true, - "name": "value", - 
"options": [], - "query": { - "query": "label_values(acm_managed_cluster_labels, $acm_label_names)", - "refId": "StandardVariableQuery" - }, - "refresh": 1, - "regex": "", - "skipUrlSync": false, - "sort": 0, - "type": "query" - }, - { - "allValue": null, - "current": { - "selected": true, - "text": [ - "All" - ], - "value": [ - "$__all" - ] - }, - "datasource": null, - "definition": "label_values(acm_managed_cluster_labels{$acm_label_names=~\"$value\"}, name)", - "description": null, - "error": null, - "hide": 2, - "includeAll": true, - "label": "Cluster", - "multi": true, - "name": "cluster", - "options": [], - "query": { - "query": "label_values(acm_managed_cluster_labels{$acm_label_names=~\"$value\"}, name)", - "refId": "StandardVariableQuery" - }, - "refresh": 1, - "regex": "", - "skipUrlSync": false, - "sort": 0, - "type": "query" - } - ] - }, - "time": { - "from": "now-3h", - "to": "now" - }, - "timepicker": { - "refresh_intervals": [ - "1m", - "5m", - "10m", - "15m", - "30m", - "1h", - "2h", - "1d" - ] - }, - "timezone": "browser", - "title": "ACM - Clusters Overview", - "uid": "89eaec849a6e4837a619fb0540c22b13", - "version": 1 - } -kind: ConfigMap -metadata: - name: grafana-dashboard-acm-clusters-overview-nexus - namespace: open-cluster-management-observability - annotations: - observability.open-cluster-management.io/dashboard-folder: "Nexus" diff --git a/operators/multiclusterobservability/manifests/base/grafana/dash-k8s-networking-cluster.yaml b/operators/multiclusterobservability/manifests/base/grafana/dash-k8s-networking-cluster.yaml index c9ed27c46..e23f32465 100644 --- a/operators/multiclusterobservability/manifests/base/grafana/dash-k8s-networking-cluster.yaml +++ b/operators/multiclusterobservability/manifests/base/grafana/dash-k8s-networking-cluster.yaml @@ -1155,12 +1155,19 @@ data: }, "timezone": "browser", "title": "Kubernetes / Networking / Cluster", - "uid": "ff635a025bcfea7bc3dd4f508990a3e9", + "uid": "efd9761bb5dd42a3800fdd6577a8430c", 
"version": 1 } kind: ConfigMap metadata: - name: grafana-dashboard-k8s-networking-cluster + name: grafana-dashboard-k8s-networking-cluster-nexus namespace: open-cluster-management-observability - labels: - general-folder: 'true' + annotations: + observability.open-cluster-management.io/dashboard-folder: Nexus + ownerReferences: + - apiVersion: observability.open-cluster-management.io/v1beta2 + kind: MultiClusterObservability + name: observability + uid: 577e70b7-fe3d-44a0-a8ef-427bd7c073e5 + controller: true + blockOwnerDeletion: true diff --git a/operators/multiclusterobservability/manifests/base/grafana/nexus/acm/dash-acm-nexus-clusters-overview.yaml b/operators/multiclusterobservability/manifests/base/grafana/nexus/acm/dash-acm-nexus-clusters-overview.yaml index 8478ef21f..af54a4590 100644 --- a/operators/multiclusterobservability/manifests/base/grafana/nexus/acm/dash-acm-nexus-clusters-overview.yaml +++ b/operators/multiclusterobservability/manifests/base/grafana/nexus/acm/dash-acm-nexus-clusters-overview.yaml @@ -469,7 +469,7 @@ data: "targets": [ { "exemplar": true, - "expr": "sum(changes(etcd_server_leader_changes_seen_total{cluster=~\"$cluster\",job=\"etcd\"}[$__range])) by (cluster)\n* on(cluster) group_left(db_size) count_values without() (\"db_size\", max(etcd_debugging_mvcc_db_total_size_in_bytes{cluster=~\"$cluster\",job=\"etcd\"}) by (cluster))\n* on(cluster) group_left(has_leader) count_values without() (\"has_leader\", max(etcd_server_has_leader{cluster=~\"$cluster\",job=\"etcd\"}) by (cluster))", + "expr": "sum(changes(etcd_server_leader_changes_seen_total{cluster=~\"$cluster\",job=\"etcd\"}[$__range])) by (cluster)\n* on(cluster) group_left(db_size) count_values without() (\"db_size\", max(etcd_mvcc_db_total_size_in_bytes{cluster=~\"$cluster\",job=\"etcd\"}) by (cluster))\n* on(cluster) group_left(has_leader) count_values without() (\"has_leader\", max(etcd_server_has_leader{cluster=~\"$cluster\",job=\"etcd\"}) by (cluster))", "format": "table", 
"instant": true, "interval": "", diff --git a/operators/multiclusterobservability/manifests/base/grafana/nexus/acm/dash-k8s-etcd.yaml b/operators/multiclusterobservability/manifests/base/grafana/nexus/acm/dash-k8s-etcd.yaml index 67f4a9adc..57c09882d 100644 --- a/operators/multiclusterobservability/manifests/base/grafana/nexus/acm/dash-k8s-etcd.yaml +++ b/operators/multiclusterobservability/manifests/base/grafana/nexus/acm/dash-k8s-etcd.yaml @@ -367,7 +367,7 @@ data: "steppedLine": false, "targets": [ { - "expr": "etcd_debugging_mvcc_db_total_size_in_bytes{cluster=\"$cluster\",job=\"etcd\"}", + "expr": "etcd_mvcc_db_total_size_in_bytes{cluster=\"$cluster\",job=\"etcd\"}", "hide": false, "interval": "", "intervalFactor": 2, diff --git a/operators/multiclusterobservability/manifests/base/grafana/nexus/acm/prometheus-rule.yaml b/operators/multiclusterobservability/manifests/base/grafana/nexus/acm/prometheus-rule.yaml index a3286da9b..6f5bb841a 100644 --- a/operators/multiclusterobservability/manifests/base/grafana/nexus/acm/prometheus-rule.yaml +++ b/operators/multiclusterobservability/manifests/base/grafana/nexus/acm/prometheus-rule.yaml @@ -21,11 +21,11 @@ spec: - expr: sum(grpc_server_started_total{job="etcd",grpc_service="etcdserverpb.Watch",grpc_type="bidi_stream"}) - sum(grpc_server_handled_total{job="etcd",grpc_service="etcdserverpb.Watch",grpc_type="bidi_stream"}) record: active_streams_watch:grpc_server_handled_total:sum - - expr: (histogram_quantile(0.99,sum(rate(apiserver_request_latencies_bucket{job="apiserver", - verb!="WATCH"}[5m])) by (le)))/1000000 + - expr: (histogram_quantile(0.99,sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver", + verb!="WATCH"}[5m])) by (le))) record: apiserver_request_duration_seconds:histogram_quantile_99 - - expr: (histogram_quantile(0.99,sum(rate(apiserver_request_latencies_bucket{job="apiserver", - verb!="WATCH"}[5m])) by (le, verb, instance)))/1000000 + - expr: 
(histogram_quantile(0.99,sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver", + verb!="WATCH"}[5m])) by (le, verb, instance))) record: apiserver_request_duration_seconds:histogram_quantile_99:instance - expr: sum(kube_node_status_allocatable{resource="cpu"}) record: cluster:cpu_allocatable:sum @@ -37,13 +37,13 @@ spec: record: cluster:memory_allocatable:sum - expr: 1 - sum(:node_memory_MemAvailable_bytes:sum) / cluster:memory_allocatable:sum record: cluster:memory_utilized:ratio - - expr: sum(sum(sum(kube_pod_container_resource_requests_cpu_cores) by (pod,namespace,container) + - expr: sum(sum(sum(kube_pod_container_resource_requests{resource="cpu",unit="core"}) by (pod,namespace,container) * on(pod,namespace) group_left(phase) max(kube_pod_status_phase{phase=~"Running|Pending|Unknown"} >0) by (pod,namespace,phase)) by (pod,namespace,phase)) record: cluster:kube_pod_container_resource_requests:cpu:sum - expr: sum(cluster:kube_pod_container_resource_requests:cpu:sum) by (cluster) / sum(kube_node_status_allocatable{resource="cpu"}) by (cluster) record: cluster:cpu_requested:ratio - - expr: sum(sum(sum(kube_pod_container_resource_requests_memory_bytes) by (pod,namespace,container) + - expr: sum(sum(sum(kube_pod_container_resource_requests{resource="memory",unit="byte"}) by (pod,namespace,container) * on(pod,namespace) group_left(phase) max(kube_pod_status_phase{phase=~"Running|Pending|Unknown"} >0) by (pod,namespace,phase)) by (pod,namespace,phase)) record: cluster:kube_pod_container_resource_requests:memory:sum @@ -176,12 +176,23 @@ spec: expr: sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{container!=""}) by (namespace) or sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{container!=""}) by (namespace) - expr: sum(rate(grpc_server_handled_total{job="etcd",grpc_type="unary",grpc_code!="OK"}[5m])) record: rpc_rate:grpc_server_handled_total:sum_rate - - expr: 
sum(increase(apiserver_request_latencies_bucket{job="apiserver",service="kubernetes",le="1",verb=~"POST|PUT|DELETE|PATCH"}[1m])) - / sum(increase(apiserver_request_latencies_count{job="apiserver",service="kubernetes",verb=~"POST|PUT|DELETE|PATCH"}[1m])) + - expr: sum(increase(apiserver_request_duration_seconds_bucket{job="apiserver",service="kubernetes",le="1",verb=~"POST|PUT|DELETE|PATCH"}[1m])) + / sum(increase(apiserver_request_duration_seconds_count{job="apiserver",service="kubernetes",verb=~"POST|PUT|DELETE|PATCH"}[1m])) record: sli:apiserver_request_duration_seconds:trend:1m - expr: sli:apiserver_request_duration_seconds:trend:1m >= bool 0.9900 record: sli:apiserver_request_duration_seconds:bin:trend:1m # Really needed? - - expr: sum(rate(apiserver_request_count{job="apiserver"}[1h])) by (code, instance) + - expr: sum(rate(apiserver_request_total{job="apiserver"}[1h])) by (code, instance) record: sum:apiserver_request_total:1h - - expr: sum(rate(apiserver_request_count{job="apiserver"}[5m])) by (code, instance) + - expr: sum(rate(apiserver_request_total{job="apiserver"}[5m])) by (code, instance) record: sum:apiserver_request_total:5m + - record: kube_pod_container_resource_limits:sum + expr: sum(kube_pod_container_resource_limits) by (resource, namespace) + - record: kube_pod_container_resource_requests:sum + expr: sum(kube_pod_container_resource_requests{container!=""}) by (resource, namespace) + - expr: |- + sum by (cluster, namespace, pod, container) ( + rate(container_cpu_usage_seconds_total{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}[5m]) + ) * on (cluster, namespace, pod) group_left(node) topk by (cluster, namespace, pod) ( + 1, max by(cluster, namespace, pod, node) (kube_pod_info{node!=""}) + ) + record: node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate \ No newline at end of file diff --git a/operators/multiclusterobservability/manifests/base/grafana/nexus/acm/scrape-config.yaml
b/operators/multiclusterobservability/manifests/base/grafana/nexus/acm/scrape-config.yaml index fcebc3c46..a7e167557 100644 --- a/operators/multiclusterobservability/manifests/base/grafana/nexus/acm/scrape-config.yaml +++ b/operators/multiclusterobservability/manifests/base/grafana/nexus/acm/scrape-config.yaml @@ -35,7 +35,7 @@ spec: - '{__name__="container_memory_rss:sum",container!=""}' - '{__name__="container_memory_swap",container!=""}' - '{__name__="container_memory_working_set_bytes",container!=""}' - - '{__name__="etcd_debugging_mvcc_db_total_size_in_bytes"}' + - '{__name__="etcd_mvcc_db_total_size_in_bytes"}' - '{__name__="etcd_disk_backend_commit_duration_seconds_bucket"}' - '{__name__="etcd_disk_wal_fsync_duration_seconds_bucket"}' - '{__name__="etcd_network_client_grpc_received_bytes_total"}' diff --git a/operators/multiclusterobservability/manifests/base/grafana/nexus/hcp/dash-acm-nexus-hcp-overview.yaml b/operators/multiclusterobservability/manifests/base/grafana/nexus/hcp/dash-acm-nexus-hcp-overview.yaml new file mode 100644 index 000000000..284574ba6 --- /dev/null +++ b/operators/multiclusterobservability/manifests/base/grafana/nexus/hcp/dash-acm-nexus-hcp-overview.yaml @@ -0,0 +1,855 @@ +apiVersion: v1 +data: + acm-hcp-overview.json: |- + { + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "target": { + "limit": 100, + "matchAny": false, + "tags": [], + "type": "dashboard" + }, + "type": "dashboard" + } + ] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "id": 33, + "iteration": 1707320583517, + "links": [], + "panels": [ + { + "datasource": "$datasource", + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 20, + "title": "Estimated capacity based on HCP resource requests", + "type": "row" + }, + { + "datasource": "$datasource", + "description": "", + "gridPos": { + "h": 12, 
+ "w": 12, + "x": 0, + "y": 1 + }, + "id": 18, + "options": { + "content": "## Request-based resource limit\n\nTo understand the request-based resource limit, consider the total request value of a hosted control plane. To calculate that value, add the request values of all highly available hosted control plane pods across the namespace. The estimates are calculated based on the following resource request samples:\n\n* 78 pods\n* Five vCPU requests for each highly available hosted control plane\n* 18 GiB memory requests for each highly available hosted control plane", + "mode": "markdown" + }, + "pluginVersion": "8.5.20", + "title": "Resource Request-based Limit Estimation", + "type": "text" + }, + { + "datasource": "$datasource", + "description": "These are the worker nodes that can run hosted control planes.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "align": "auto", + "displayMode": "auto" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 1 + }, + "id": 26, + "options": { + "showHeader": true + }, + "pluginVersion": "8.5.20", + "targets": [ + { + "exemplar": true, + "expr": "mce_hs_addon_worker_node_resource_capacities_gauge", + "format": "table", + "instant": true, + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "title": "Worker Nodes Capacities", + "transformations": [ + { + "id": "organize", + "options": { + "excludeByName": { + "Time": true, + "Value": true, + "__name__": true, + "cluster": true, + "clusterID": true, + "container": true, + "endpoint": true, + "instance": true, + "job": true, + "namespace": true, + "pod": true, + "receive": true, + "service": true, + "tenant_id": true + }, + "indexByName": { + "Time": 0, + "Value": 17, + "__name__": 1, + "cluster": 2, + "clusterID": 3, +
"container": 4, + "cpu": 6, + "endpoint": 8, + "instance": 9, + "job": 10, + "maxPods": 11, + "memory": 7, + "namespace": 12, + "node": 5, + "pod": 13, + "receive": 14, + "service": 15, + "tenant_id": 16 + }, + "renameByName": { + "cpu": "CPU", + "maxPods": "Pod Limit", + "memory": "Memory (GiB)", + "node": "Worker Node" + } + } + } + ], + "type": "table" + }, + { + "datasource": "$datasource", + "description": "This panel displays the current number of unavailable/failing and available hosted control planes. Based on the hosted control plane resource requirements, it also displays the estimated maximum number of hosted control planes that can be hosted in this cluster.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Currently Unavailable" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "orange", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "est. 
Max" + }, + "properties": [ + { + "id": "color", + "value": { + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 5, + "w": 12, + "x": 12, + "y": 8 + }, + "id": 12, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showUnfilled": true, + "text": {} + }, + "pluginVersion": "8.5.20", + "targets": [ + { + "exemplar": true, + "expr": "sum(mce_hs_addon_hosted_control_planes_status_gauge{ready=\"false\"})", + "format": "time_series", + "hide": false, + "instant": true, + "interval": "", + "legendFormat": "Currently Unavailable", + "refId": "B" + }, + { + "exemplar": true, + "expr": "sum(mce_hs_addon_hosted_control_planes_status_gauge{ready=\"true\"})", + "format": "time_series", + "hide": false, + "instant": true, + "interval": "", + "legendFormat": "Currently Available", + "refId": "A" + }, + { + "exemplar": true, + "expr": "mce_hs_addon_request_based_hcp_capacity_gauge", + "hide": false, + "instant": true, + "interval": "", + "legendFormat": "est. Max", + "refId": "C" + } + ], + "title": "Number of HCPs", + "transformations": [], + "type": "bargauge" + }, + { + "collapsed": false, + "datasource": "$datasource", + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 13 + }, + "id": 11, + "panels": [], + "title": "Estimated capacity based on API server query (QPS)", + "type": "row" + }, + { + "datasource": "$datasource", + "gridPos": { + "h": 14, + "w": 12, + "x": 0, + "y": 14 + }, + "id": 24, + "options": { + "content": "## Load-based limit\n\nRequest-based sizing provides a maximum number of hosted control planes that can run based on the minimum request totals for the `Burstable` class, which meet the average resource usage. For sizing guidance that is tuned to higher levels of hosted cluster load, the load-based approach demonstrates resource usage at increasing API rates. 
The load-based approach builds in resource capacity for each hosted control plane to handle higher API load points.\n\nResource utilization is measured as the workload increased to the total namespace count. This data provides an estimation factor to increase the compute resource capacity based on the expected API load. Exact utilization rates can vary based on the type and pace of the cluster workload. \n\n| **Hosted control plane resource utilization scaling** | **vCPUs** | **Memory (GiB)** |\n| --- | --- | --- |\n| Default requests | 5 | 18 |\n| Usage when idle | 2.9 | 11.1 |\n| Incremental usage per 1000 increase in API rate | 9.0 | 2.5 |\n\nBy using these examples, you can factor in a load-based limit that is based on the expected rate of stress on the API, which is measured as the aggregated QPS across the 3 hosted API servers. For general sizing purposes, consider a 1000 QPS API rate to be a medium hosted cluster load and a 2000 QPS API to be a heavy hosted cluster load.", + "mode": "markdown" + }, + "pluginVersion": "8.5.20", + "title": "Load-based Limit Estimation", + "type": "text" + }, + { + "datasource": "$datasource", + "description": "These API server loads are used for estimating the maximum number of hosted control planes that can be hosted. For example, the est. Max. 
(low QPS) in the panel below is the estimated maximum number of hosted control planes that can be hosted when all hosted control planes put low load on the API server.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "align": "center", + "displayMode": "auto" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 12, + "y": 14 + }, + "id": 28, + "options": { + "showHeader": true + }, + "pluginVersion": "8.5.20", + "targets": [ + { + "exemplar": true, + "expr": "mce_hs_addon_qps_gauge", + "format": "table", + "instant": true, + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "title": "QPS Settings", + "transformations": [ + { + "id": "organize", + "options": { + "excludeByName": { + "Time": true, + "__name__": true, + "cluster": true, + "clusterID": true, + "container": true, + "endpoint": true, + "instance": true, + "job": true, + "namespace": true, + "pod": true, + "receive": true, + "service": true, + "tenant_id": true + }, + "indexByName": {}, + "renameByName": { + "Value": "Query Rate (QPS)", + "rate": "Load on API Server" + } + } + }, + { + "id": "sortBy", + "options": { + "fields": {}, + "sort": [ + { + "desc": false, + "field": "Query Rate (QPS)" + } + ] + } + } + ], + "type": "table" + }, + { + "datasource": "$datasource", + "description": "This panel displays the current number of unavailable/failing and available hosted control planes.
Based on various loads, it also displays the estimated maximum number of hosted control planes that can be hosted in this cluster.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Currently Unavailable" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "orange", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "est. Max. (low QPS)" + }, + "properties": [ + { + "id": "color", + "value": { + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "est. Max. (medium QPS)" + }, + "properties": [ + { + "id": "color", + "value": { + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "est. Max. (high QPS)" + }, + "properties": [ + { + "id": "color", + "value": { + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "est. Max. 
(avg QPS)" + }, + "properties": [ + { + "id": "color", + "value": { + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 20 + }, + "id": 6, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showUnfilled": true, + "text": {} + }, + "pluginVersion": "8.5.20", + "targets": [ + { + "exemplar": true, + "expr": "sum(mce_hs_addon_hosted_control_planes_status_gauge{ready=\"false\"})", + "format": "time_series", + "instant": true, + "interval": "", + "legendFormat": "Currently Unavailable", + "refId": "A" + }, + { + "exemplar": true, + "expr": "sum(mce_hs_addon_hosted_control_planes_status_gauge{ready=\"true\"})", + "hide": false, + "instant": true, + "interval": "", + "legendFormat": "Currently Available", + "refId": "B" + }, + { + "exemplar": true, + "expr": "mce_hs_addon_qps_based_hcp_capacity_gauge{qps_rate=\"low\"}", + "hide": false, + "instant": true, + "interval": "", + "legendFormat": "est. Max. (low QPS)", + "refId": "C" + }, + { + "exemplar": true, + "expr": "mce_hs_addon_qps_based_hcp_capacity_gauge{qps_rate=\"medium\"}", + "hide": false, + "instant": true, + "interval": "", + "legendFormat": "est. Max. (medium QPS)", + "refId": "D" + }, + { + "exemplar": true, + "expr": "mce_hs_addon_qps_based_hcp_capacity_gauge{qps_rate=\"high\"}", + "hide": false, + "instant": true, + "interval": "", + "legendFormat": "est. Max. (high QPS)", + "refId": "E" + }, + { + "exemplar": true, + "expr": "mce_hs_addon_qps_based_hcp_capacity_gauge{qps_rate=\"average\"}", + "hide": false, + "interval": "", + "instant": true, + "legendFormat": "est. Max. 
(avg QPS)", + "refId": "F" + } + ], + "title": "Number of HCPs ", + "type": "bargauge" + }, + { + "collapsed": false, + "datasource": "$datasource", + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 28 + }, + "id": 22, + "panels": [], + "title": "Hosted Control Planes List", + "type": "row" + }, + { + "datasource": "$datasource", + "description": "This is the list of all hosted control planes in this cluster. Click on the hosted control plane name to see its resource utilization.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "align": "auto", + "displayMode": "auto" + }, + "mappings": [ + { + "options": { + "false": { + "color": "orange", + "index": 1, + "text": "Not ready" + }, + "true": { + "index": 0, + "text": "Ready" + } + }, + "type": "value" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "#EAB839", + "value": "" + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "hcp_name" + }, + "properties": [ + { + "id": "links", + "value": [ + { + "title": "", + "url": "d/ZGfrZUtIz/acm-resources-hosted-control-plane?${__url_time_range}&var-hcp_ns=${__data.fields.hcp_namespace}" + } + ] + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "ready" + }, + "properties": [ + { + "id": "color", + "value": { + "mode": "continuous-GrYlRd" + } + }, + { + "id": "custom.displayMode", + "value": "color-text" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "HCP name" + }, + "properties": [ + { + "id": "custom.width", + "value": null + } + ] + } + ] + }, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 29 + }, + "id": 16, + "options": { + "showHeader": true, + "sortBy": [] + }, + "pluginVersion": "8.5.20", + "targets": [ + { + "exemplar": true, + "expr": "mce_hs_addon_hosted_control_planes_status_gauge", + "format": "table", + "instant": true, + "interval": "", + "legendFormat": "", + "refId": "A" + } 
+ ], + "title": "Hosted Control Plane List", + "transformations": [ + { + "id": "organize", + "options": { + "excludeByName": { + "Time": true, + "Value": true, + "Value #A": true, + "__name__": true, + "cluster": true, + "clusterID": true, + "container": true, + "endpoint": true, + "instance": true, + "job": true, + "namespace": true, + "pod": true, + "receive": true, + "service": true, + "tenant_id": true + }, + "indexByName": {}, + "renameByName": { + "hcp_name": "HCP name", + "hcp_namespace": "HCP namespace", + "ready": "Status", + "version": "Version" + } + } + }, + { + "id": "filterByValue", + "options": { + "filters": [ + { + "config": { + "id": "greater", + "options": { + "value": 0 + } + }, + "fieldName": "HCP name" + }, + { + "config": { + "id": "equal", + "options": { + "value": "" + } + }, + "fieldName": "HCP name" + } + ], + "match": "any", + "type": "exclude" + } + } + ], + "type": "table" + } + ], + "refresh": "1m", + "schemaVersion": 30, + "style": "dark", + "tags": ["ACM"], + "templating": { + "list": [ + { + "current": { + "selected": false, + "text": "Observatorium", + "value": "Observatorium" + }, + "description": null, + "error": null, + "hide": 2, + "includeAll": false, + "label": null, + "multi": false, + "name": "datasource", + "options": [], + "query": "prometheus", + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "type": "datasource" + }, + { + "allValue": null, + "current": { + "isNone": true, + "selected": false, + "text": "None", + "value": "" + }, + "datasource": "$datasource", + "definition": "mce_hs_addon_total_hosted_control_planes_gauge", + "description": null, + "error": null, + "hide": 2, + "includeAll": false, + "label": null, + "multi": false, + "name": "num_hcps", + "options": [], + "query": { + "query": "mce_hs_addon_total_hosted_control_planes_gauge", + "refId": "StandardVariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + } + ] + }, + "time": { + "from": "now-3h",
+ "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ] + }, + "timezone": "browser", + "title": "ACM - Hosted Control Planes Overview", + "uid": "0a749f7cde834f5685194bdb00e9548a", + "version": 1 + } +kind: ConfigMap +metadata: + annotations: + observability.open-cluster-management.io/dashboard-folder: Nexus + name: grafana-dashboard-acm-nexus-hcp-overview + namespace: open-cluster-management-observability + ownerReferences: + - apiVersion: observability.open-cluster-management.io/v1beta2 + kind: MultiClusterObservability + name: observability + uid: 577e70b7-fe3d-44a0-a8ef-427bd7c073e5 + controller: true + blockOwnerDeletion: true \ No newline at end of file