From ae5fa1239e63a7a684996a78d1a06fb7993df9f9 Mon Sep 17 00:00:00 2001 From: Brandon Johnson Date: Wed, 11 Jan 2023 10:31:46 -0500 Subject: [PATCH 1/4] fix incorrect desc for load metrics --- collector/collector.go | 40 +++++++++---------- collector/query.go | 2 +- mixin/.lint | 12 ++++++ mixin/dashboards/snowflake-overview.libsonnet | 10 ++--- 4 files changed, 38 insertions(+), 26 deletions(-) diff --git a/collector/collector.go b/collector/collector.go index 0d831af..be69904 100644 --- a/collector/collector.go +++ b/collector/collector.go @@ -66,10 +66,10 @@ type Collector struct { logins *prometheus.Desc successfulLogins *prometheus.Desc failedLogins *prometheus.Desc - warehouseExecutedQueries *prometheus.Desc - warehouseOverloadedQueueSize *prometheus.Desc - warehouseProvisioningQueueSize *prometheus.Desc - warehouseBlockedQueries *prometheus.Desc + warehouseExecutedQueryLoad *prometheus.Desc + warehouseOverloadedQueueLoad *prometheus.Desc + warehouseProvisioningQueueLoad *prometheus.Desc + warehouseBlockedQueryLoad *prometheus.Desc autoClusteringCredits *prometheus.Desc autoClusteringBytes *prometheus.Desc autoClusteringRows *prometheus.Desc @@ -161,27 +161,27 @@ func NewCollector(logger log.Logger, c *Config) *Collector { []string{labelClientType, labelClientVersion}, nil, ), - warehouseExecutedQueries: prometheus.NewDesc( + warehouseExecutedQueryLoad: prometheus.NewDesc( prometheus.BuildFQName(namespace, "warehouse", "executed_queries"), - "Average number of queries executed.", + "Average query load for queries executed over the last hour.", []string{labelName, labelID}, nil, ), - warehouseOverloadedQueueSize: prometheus.NewDesc( + warehouseOverloadedQueueLoad: prometheus.NewDesc( prometheus.BuildFQName(namespace, "warehouse", "overloaded_queue_size"), - "Average number of queries queued because the warehouse was overloaded.", + "Average load value for queries queued because the warehouse was being overloaded over the last hour.", []string{labelName, labelID}, nil, ), - warehouseProvisioningQueueSize: prometheus.NewDesc( + warehouseProvisioningQueueLoad: prometheus.NewDesc( prometheus.BuildFQName(namespace, "warehouse", "provisioning_queue_size"), - "Average number of queries queued because the warehouse was being provisioned.", + "Average load value for queries queued because the warehouse was being provisioned over the last hour.", []string{labelName, labelID}, nil, ), - warehouseBlockedQueries: prometheus.NewDesc( + warehouseBlockedQueryLoad: prometheus.NewDesc( prometheus.BuildFQName(namespace, "warehouse", "blocked_queries"), - "Average number of queries blocked by a transaction lock.", + "Average load value for queries blocked by a transaction lock over the last hour.", []string{labelName, labelID}, nil, ), @@ -264,10 +264,10 @@ func (c *Collector) Describe(descs chan<- *prometheus.Desc) { descs <- c.logins descs <- c.successfulLogins descs <- c.failedLogins - descs <- c.warehouseExecutedQueries - descs <- c.warehouseOverloadedQueueSize - descs <- c.warehouseProvisioningQueueSize - descs <- c.warehouseBlockedQueries + descs <- c.warehouseExecutedQueryLoad + descs <- c.warehouseOverloadedQueueLoad + descs <- c.warehouseProvisioningQueueLoad + descs <- c.warehouseBlockedQueryLoad descs <- c.autoClusteringCredits descs <- c.autoClusteringBytes descs <- c.autoClusteringRows @@ -470,10 +470,10 @@ func (c *Collector) collectWarehouseLoadMetrics(db *sql.DB, metrics chan<- prome return fmt.Errorf("failed to scan row: %w", err) } - metrics <- prometheus.MustNewConstMetric(c.warehouseExecutedQueries, prometheus.GaugeValue, avgRunning, warehouseName, warehouseID) - metrics <- prometheus.MustNewConstMetric(c.warehouseOverloadedQueueSize, prometheus.GaugeValue, avgQueued, warehouseName, warehouseID) - metrics <- prometheus.MustNewConstMetric(c.warehouseProvisioningQueueSize, prometheus.GaugeValue, avgQueuedProvisioning, warehouseName, warehouseID) - metrics <- prometheus.MustNewConstMetric(c.warehouseBlockedQueries, prometheus.GaugeValue, avgBlocked, warehouseName, warehouseID) + metrics <- prometheus.MustNewConstMetric(c.warehouseExecutedQueryLoad, prometheus.GaugeValue, avgRunning, warehouseName, warehouseID) + metrics <- prometheus.MustNewConstMetric(c.warehouseOverloadedQueueLoad, prometheus.GaugeValue, avgQueued, warehouseName, warehouseID) + metrics <- prometheus.MustNewConstMetric(c.warehouseProvisioningQueueLoad, prometheus.GaugeValue, avgQueuedProvisioning, warehouseName, warehouseID) + metrics <- prometheus.MustNewConstMetric(c.warehouseBlockedQueryLoad, prometheus.GaugeValue, avgBlocked, warehouseName, warehouseID) } return rows.Err() diff --git a/collector/query.go b/collector/query.go index 966e1ee..05fc53e 100644 --- a/collector/query.go +++ b/collector/query.go @@ -47,7 +47,7 @@ const ( // https://docs.snowflake.com/en/sql-reference/account-usage/warehouse_load_history.html warehouseLoadMetricQuery = `SELECT WAREHOUSE_NAME, WAREHOUSE_ID, avg(AVG_RUNNING), avg(AVG_QUEUED_LOAD), avg(AVG_QUEUED_PROVISIONING), avg(AVG_BLOCKED) FROM ACCOUNT_USAGE.WAREHOUSE_LOAD_HISTORY - WHERE START_TIME >= dateadd(hour, -24, current_timestamp()) + WHERE START_TIME >= dateadd(hour, -1, current_timestamp()) GROUP BY WAREHOUSE_NAME, WAREHOUSE_ID;` // https://docs.snowflake.com/en/sql-reference/account-usage/automatic_clustering_history.html diff --git a/mixin/.lint b/mixin/.lint index 1fcab6c..4c4451f 100644 --- a/mixin/.lint +++ b/mixin/.lint @@ -7,3 +7,15 @@ exclusions: reason: "Based on new convention we are using variable names prometheus_datasource and loki_datasource where as linter expects 'datasource'" template-instance-rule: reason: "Based on new convention we are using variable names prometheus_datasource and loki_datasource where as linter expects 'datasource'" + panel-units-rule: + reason: "Custom units are used for better user experience in some panels" + entries: + - panel: "Warehouse activity" + - panel: "Login attempts" + - panel: "Autoclustering credits used" + - panel: "Top 5 service compute credits used" + - panel: "Top 5 service cloud service credits used" + - panel: "Top 5 warehouse compute credits used" + - panel: "Top 5 warehouse cloud services credits used" + - panel: "Top 5 database autoclustering credits used" + - panel: "Top 5 table autoclustering credits used" diff --git a/mixin/dashboards/snowflake-overview.libsonnet b/mixin/dashboards/snowflake-overview.libsonnet index 095fc02..d89bf1f 100644 --- a/mixin/dashboards/snowflake-overview.libsonnet +++ b/mixin/dashboards/snowflake-overview.libsonnet @@ -18,22 +18,22 @@ local warehouseActivityPanel = { prometheus.target( 'snowflake_warehouse_executed_queries{instance=~"$instance", job=~"$job", name=~"$warehouse"}', datasource=promDatasource, - legendFormat='{{instance}} - {{name}} - Executed queries' + legendFormat='{{instance}} - {{name}} - Executed query load' ), prometheus.target( 'snowflake_warehouse_overloaded_queue_size{instance=~"$instance", job=~"$job", name=~"$warehouse"}', datasource=promDatasource, - legendFormat='{{instance}} - {{name}} - Overloaded queue size' + legendFormat='{{instance}} - {{name}} - Overloaded queue load' ), prometheus.target( 'snowflake_warehouse_provisioning_queue_size{instance=~"$instance", job=~"$job", name=~"$warehouse"}', datasource=promDatasource, - legendFormat='{{instance}} - {{name}} - Provisioning queue size' + legendFormat='{{instance}} - {{name}} - Provisioning queue load' ), prometheus.target( 'snowflake_warehouse_blocked_queries{instance=~"$instance", job=~"$job", name=~"$warehouse"}', datasource=promDatasource, - legendFormat='{{instance}} - {{name}} - Blocked queries' + legendFormat='{{instance}} - {{name}} - Blocked query load' ), ], type: 'timeseries', @@ -78,7 +78,7 @@ local warehouseActivityPanel = { }, }, mappings: [], - unit: 'queries / hr', + unit: 'load', }, overrides: [], }, From fc24622362729c8eb439870bb90620483ef25f8b Mon Sep 17 00:00:00 2001 From: Brandon Johnson Date: Wed, 11 Jan 2023 10:34:47 -0500 Subject: [PATCH 2/4] back to 24 hour interval because of 3 hour latency --- collector/collector.go | 8 ++++---- collector/query.go | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/collector/collector.go b/collector/collector.go index be69904..334d27e 100644 --- a/collector/collector.go +++ b/collector/collector.go @@ -163,25 +163,25 @@ func NewCollector(logger log.Logger, c *Config) *Collector { ), warehouseExecutedQueryLoad: prometheus.NewDesc( prometheus.BuildFQName(namespace, "warehouse", "executed_queries"), - "Average query load for queries executed over the last hour.", + "Average query load for queries executed over the last 24 hours.", []string{labelName, labelID}, nil, ), warehouseOverloadedQueueLoad: prometheus.NewDesc( prometheus.BuildFQName(namespace, "warehouse", "overloaded_queue_size"), - "Average load value for queries queued because the warehouse was being overloaded over the last hour.", + "Average load value for queries queued because the warehouse was being overloaded over the last 24 hours.", []string{labelName, labelID}, nil, ), warehouseProvisioningQueueLoad: prometheus.NewDesc( prometheus.BuildFQName(namespace, "warehouse", "provisioning_queue_size"), - "Average load value for queries queued because the warehouse was being provisioned over the last hour.", + "Average load value for queries queued because the warehouse was being provisioned over the last 24 hours.", []string{labelName, labelID}, nil, ), warehouseBlockedQueryLoad: prometheus.NewDesc( prometheus.BuildFQName(namespace, "warehouse", "blocked_queries"), - "Average load value for queries blocked by a transaction lock over the last hour.", + "Average load value for queries blocked by a transaction lock over the last 24 hours.", []string{labelName, labelID}, nil, ), diff --git a/collector/query.go b/collector/query.go index 05fc53e..966e1ee 100644 --- a/collector/query.go +++ b/collector/query.go @@ -47,7 +47,7 @@ const ( // https://docs.snowflake.com/en/sql-reference/account-usage/warehouse_load_history.html warehouseLoadMetricQuery = `SELECT WAREHOUSE_NAME, WAREHOUSE_ID, avg(AVG_RUNNING), avg(AVG_QUEUED_LOAD), avg(AVG_QUEUED_PROVISIONING), avg(AVG_BLOCKED) FROM ACCOUNT_USAGE.WAREHOUSE_LOAD_HISTORY - WHERE START_TIME >= dateadd(hour, -1, current_timestamp()) + WHERE START_TIME >= dateadd(hour, -24, current_timestamp()) GROUP BY WAREHOUSE_NAME, WAREHOUSE_ID;` // https://docs.snowflake.com/en/sql-reference/account-usage/automatic_clustering_history.html From 557a497a6cf1093e6e45e4ee8cf2664011fc17af Mon Sep 17 00:00:00 2001 From: Brandon Johnson Date: Wed, 11 Jan 2023 10:55:51 -0500 Subject: [PATCH 3/4] use percent as unit --- mixin/dashboards/snowflake-overview.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mixin/dashboards/snowflake-overview.libsonnet b/mixin/dashboards/snowflake-overview.libsonnet index d89bf1f..5777add 100644 --- a/mixin/dashboards/snowflake-overview.libsonnet +++ b/mixin/dashboards/snowflake-overview.libsonnet @@ -78,7 +78,7 @@ local warehouseActivityPanel = { }, }, mappings: [], - unit: 'load', + unit: 'percentunit', }, overrides: [], }, From 8aa49dc98265786d96b3e8c7d28aefd94df48903 Mon Sep 17 00:00:00 2001 From: Brandon Johnson Date: Wed, 11 Jan 2023 10:56:20 -0500 Subject: [PATCH 4/4] remove panel from linting --- mixin/.lint | 1 - 1 file changed, 1 deletion(-) diff --git a/mixin/.lint b/mixin/.lint index 4c4451f..60f4a3a 100644 --- a/mixin/.lint +++ b/mixin/.lint @@ -10,7 +10,6 @@ exclusions: panel-units-rule: reason: "Custom units are used for better user experience in some panels" entries: - - panel: "Warehouse activity" - panel: "Login attempts" - panel: "Autoclustering credits used" - panel: "Top 5 service compute credits used"