From 8bac1d2c5e285564184dd40054a7dd4dc57db2a2 Mon Sep 17 00:00:00 2001 From: Igor Peshansky Date: Mon, 17 Jun 2024 15:41:52 -0400 Subject: [PATCH 01/38] Better assertions in tests. --- receiver/dcgmreceiver/client_gpu_test.go | 2 +- receiver/dcgmreceiver/scraper_gpu_test.go | 27 ++++++++++++----------- 2 files changed, 15 insertions(+), 14 deletions(-) diff --git a/receiver/dcgmreceiver/client_gpu_test.go b/receiver/dcgmreceiver/client_gpu_test.go index 76ad622a7..479c34502 100644 --- a/receiver/dcgmreceiver/client_gpu_test.go +++ b/receiver/dcgmreceiver/client_gpu_test.go @@ -212,7 +212,7 @@ func TestCollectGpuProfilingMetrics(t *testing.T) { for _, gpuIndex := range client.deviceIndices { for _, metric := range expectedMetrics { - assert.Equal(t, seenMetric[fmt.Sprintf("gpu{%d}.metric{%s}", gpuIndex, metric)], true) + assert.True(t, seenMetric[fmt.Sprintf("gpu{%d}.metric{%s}", gpuIndex, metric)], fmt.Sprintf("%s on gpu %d", metric, gpuIndex)) } } client.cleanup() diff --git a/receiver/dcgmreceiver/scraper_gpu_test.go b/receiver/dcgmreceiver/scraper_gpu_test.go index e0a3584f2..4ca6976a5 100644 --- a/receiver/dcgmreceiver/scraper_gpu_test.go +++ b/receiver/dcgmreceiver/scraper_gpu_test.go @@ -70,12 +70,12 @@ func TestScrapeWithDelayedDcgmService(t *testing.T) { metrics, err := scraper.scrape(context.Background()) assert.NoError(t, err) // If failed to init DCGM, should have no error - assert.Equal(t, metrics.MetricCount(), 0) + assert.Equal(t, 0, metrics.MetricCount()) // Scrape again with DCGM not available metrics, err = scraper.scrape(context.Background()) assert.NoError(t, err) - assert.Equal(t, metrics.MetricCount(), 0) + assert.Equal(t, 0, metrics.MetricCount()) // Simulate DCGM becomes available dcgmInit = realDcgmInit @@ -131,7 +131,7 @@ func TestScrapeWithEmptyMetricsConfig(t *testing.T) { metrics, err := scraper.scrape(context.Background()) assert.NoError(t, err) - assert.Equal(t, metrics.MetricCount(), 0) + assert.Equal(t, 0, metrics.MetricCount()) } 
func TestScrapeOnPollingError(t *testing.T) { @@ -153,7 +153,7 @@ func TestScrapeOnPollingError(t *testing.T) { metrics, err := scraper.scrape(context.Background()) assert.Error(t, err) - assert.Equal(t, metrics.MetricCount(), 0) + assert.Equal(t, 0, metrics.MetricCount()) } func TestScrapeOnProfilingPaused(t *testing.T) { @@ -176,13 +176,14 @@ func TestScrapeOnProfilingPaused(t *testing.T) { metrics, err := scraper.scrape(context.Background()) assert.NoError(t, err) - require.Equal(t, metrics.MetricCount(), 2) expectedMetrics := []string{ "dcgm.gpu.utilization", "dcgm.gpu.memory.bytes_used", } + require.Equal(t, len(expectedMetrics), metrics.MetricCount()) + ms := metrics.ResourceMetrics().At(0).ScopeMetrics().At(0).Metrics() metricWasSeen := make(map[string]bool) for i := 0; i < ms.Len(); i++ { @@ -190,7 +191,7 @@ func TestScrapeOnProfilingPaused(t *testing.T) { } for _, metric := range expectedMetrics { - assert.Equal(t, metricWasSeen[metric], true) + assert.True(t, metricWasSeen[metric], metric) } } @@ -243,9 +244,9 @@ func validateScraperResult(t *testing.T, metrics pmetric.Metrics, expectedMetric m := ms.At(i) dps := m.Gauge().DataPoints() for j := 0; j < dps.Len(); j++ { - assert.Regexp(t, ".*gpu_number:.*", dps.At(j).Attributes().AsRaw()) - assert.Regexp(t, ".*model:.*", dps.At(j).Attributes().AsRaw()) - assert.Regexp(t, ".*uuid:.*", dps.At(j).Attributes().AsRaw()) + assert.Contains(t, dps.At(j).Attributes().AsRaw(), "gpu_number") + assert.Contains(t, dps.At(j).Attributes().AsRaw(), "model") + assert.Contains(t, dps.At(j).Attributes().AsRaw(), "uuid") } assert.LessOrEqual(t, expectedMetrics[m.Name()], dps.Len()) @@ -254,20 +255,20 @@ func validateScraperResult(t *testing.T, metrics pmetric.Metrics, expectedMetric case "dcgm.gpu.utilization": case "dcgm.gpu.memory.bytes_used": for j := 0; j < dps.Len(); j++ { - assert.Regexp(t, ".*memory_state:.*", dps.At(j).Attributes().AsRaw()) + assert.Contains(t, dps.At(j).Attributes().AsRaw(), "memory_state") } case 
"dcgm.gpu.profiling.sm_utilization": case "dcgm.gpu.profiling.sm_occupancy": case "dcgm.gpu.profiling.dram_utilization": case "dcgm.gpu.profiling.pipe_utilization": for j := 0; j < dps.Len(); j++ { - assert.Regexp(t, ".*pipe:.*", dps.At(j).Attributes().AsRaw()) + assert.Contains(t, dps.At(j).Attributes().AsRaw(), "pipe") } case "dcgm.gpu.profiling.pcie_traffic_rate": fallthrough case "dcgm.gpu.profiling.nvlink_traffic_rate": for j := 0; j < dps.Len(); j++ { - assert.Regexp(t, ".*direction:.*", dps.At(j).Attributes().AsRaw()) + assert.Contains(t, dps.At(j).Attributes().AsRaw(), "direction") } default: t.Errorf("Unexpected metric %s", m.Name()) @@ -277,6 +278,6 @@ func validateScraperResult(t *testing.T, metrics pmetric.Metrics, expectedMetric } for metric := range expectedMetrics { - assert.Equal(t, metricWasSeen[metric], true) + assert.True(t, metricWasSeen[metric], metric) } } From 00976dd1cbc2f71092e422fe0ac53b66ff53ca1a Mon Sep 17 00:00:00 2001 From: Igor Peshansky Date: Wed, 19 Jun 2024 19:54:23 -0400 Subject: [PATCH 02/38] Remove support for K80 (EOL). 
--- receiver/dcgmreceiver/testdata/Tesla_K80.yaml | 17 ----------------- 1 file changed, 17 deletions(-) delete mode 100644 receiver/dcgmreceiver/testdata/Tesla_K80.yaml diff --git a/receiver/dcgmreceiver/testdata/Tesla_K80.yaml b/receiver/dcgmreceiver/testdata/Tesla_K80.yaml deleted file mode 100644 index 1ddf5ea1f..000000000 --- a/receiver/dcgmreceiver/testdata/Tesla_K80.yaml +++ /dev/null @@ -1,17 +0,0 @@ -model: Tesla K80 -supported_fields: - - DCGM_FI_DEV_GPU_UTIL - - DCGM_FI_DEV_FB_USED - - DCGM_FI_DEV_FB_FREE -unsupported_fields: - - DCGM_FI_PROF_SM_ACTIVE - - DCGM_FI_PROF_SM_OCCUPANCY - - DCGM_FI_PROF_PIPE_TENSOR_ACTIVE - - DCGM_FI_PROF_PIPE_FP64_ACTIVE - - DCGM_FI_PROF_PIPE_FP32_ACTIVE - - DCGM_FI_PROF_PIPE_FP16_ACTIVE - - DCGM_FI_PROF_DRAM_ACTIVE - - DCGM_FI_PROF_PCIE_TX_BYTES - - DCGM_FI_PROF_PCIE_RX_BYTES - - DCGM_FI_PROF_NVLINK_TX_BYTES - - DCGM_FI_PROF_NVLINK_RX_BYTES From afd9512129b1fb1e0ecaa55bd8ebfc1876588177 Mon Sep 17 00:00:00 2001 From: Igor Peshansky Date: Wed, 19 Jun 2024 21:42:34 -0400 Subject: [PATCH 03/38] Fix supported field filtering to only consider profiling fields. 
--- receiver/dcgmreceiver/client.go | 38 +++++++++++++----------- receiver/dcgmreceiver/client_gpu_test.go | 8 ++--- 2 files changed, 24 insertions(+), 22 deletions(-) diff --git a/receiver/dcgmreceiver/client.go b/receiver/dcgmreceiver/client.go index 8f4baef2a..a6a4354b5 100644 --- a/receiver/dcgmreceiver/client.go +++ b/receiver/dcgmreceiver/client.go @@ -29,6 +29,8 @@ import ( const maxWarningsForFailedDeviceMetricQuery = 5 +const dcgmProfilingFieldsStart = dcgm.Short(1000) + var ErrDcgmInitialization = errors.New("error initializing DCGM") type dcgmClient struct { @@ -66,13 +68,13 @@ func newClient(config *Config, logger *zap.Logger) (*dcgmClient, error) { UUIDs := make([]string, 0) enabledFieldGroup := dcgm.FieldHandle{} requestedFieldIDs := discoverRequestedFieldIDs(config) - supportedFieldIDs, err := getAllSupportedFields() + supportedProfilingFieldIDs, err := getSupportedProfilingFields() if err != nil { // If there is error querying the supported fields at all, let the // receiver collect basic metrics: (GPU utilization, used/free memory). logger.Sugar().Warnf("Error querying supported profiling fields on '%w'. GPU profiling metrics will not be collected.", err) } - enabledFields, unavailableFields := filterSupportedFields(requestedFieldIDs, supportedFieldIDs) + enabledFields, unavailableFields := filterSupportedFields(requestedFieldIDs, supportedProfilingFieldIDs) for _, f := range unavailableFields { logger.Sugar().Warnf("Field '%s' is not supported. 
Metric '%s' will not be collected", dcgmIDToName[f], dcgmNameToMetricName[dcgmIDToName[f]]) } @@ -199,16 +201,10 @@ func discoverRequestedFieldIDs(config *Config) []dcgm.Short { return requestedFieldIDs } -// getAllSupportedFields calls the DCGM query function to find out all the -// fields that are supported by the current GPUs -func getAllSupportedFields() ([]dcgm.Short, error) { - // Fields like `DCGM_FI_DEV_*` are not profiling fields, and they are always - // supported on all devices - supported := []dcgm.Short{ - dcgm.DCGM_FI["DCGM_FI_DEV_GPU_UTIL"], - dcgm.DCGM_FI["DCGM_FI_DEV_FB_USED"], - dcgm.DCGM_FI["DCGM_FI_DEV_FB_FREE"], - } +// getSupportedProfilingFields calls the DCGM query function to find out all +// profiling fields that are supported by the current GPUs +func getSupportedProfilingFields() ([]dcgm.Short, error) { + supported := []dcgm.Short{} // GetSupportedMetricGroups currently does not support passing the actual // group handle; here we pass 0 to query supported fields for group 0, which // is the default DCGM group that is **supposed** to include all GPUs of the @@ -236,21 +232,27 @@ func getAllSupportedFields() ([]dcgm.Short, error) { } // filterSupportedFields takes the user requested fields and device supported -// fields, and filter to return those that are requested & supported to be the -// enabledFields and requested but not supported as unavailableFields -func filterSupportedFields(requestedFields []dcgm.Short, supportedFields []dcgm.Short) ([]dcgm.Short, []dcgm.Short) { +// profiling fields, and filters to return those that are requested & supported +// to be the enabledFields and requested but not supported as unavailableFields +func filterSupportedFields(requestedFields []dcgm.Short, supportedProfilingFields []dcgm.Short) ([]dcgm.Short, []dcgm.Short) { var enabledFields []dcgm.Short var unavailableFields []dcgm.Short for _, ef := range requestedFields { support := false - for _, sf := range supportedFields { + if ef < 
dcgmProfilingFieldsStart { + // Fields like `DCGM_FI_DEV_*` are not profiling + // fields, and they are always supported on all devices + support = true + } + for _, sf := range supportedProfilingFields { if sf == ef { - enabledFields = append(enabledFields, ef) support = true break } } - if !support { + if support { + enabledFields = append(enabledFields, ef) + } else { unavailableFields = append(unavailableFields, ef) } } diff --git a/receiver/dcgmreceiver/client_gpu_test.go b/receiver/dcgmreceiver/client_gpu_test.go index 479c34502..17812ece4 100644 --- a/receiver/dcgmreceiver/client_gpu_test.go +++ b/receiver/dcgmreceiver/client_gpu_test.go @@ -48,8 +48,8 @@ type modelSupportedFields struct { UnsupportedFields []string `yaml:"unsupported_fields"` } -// TestSupportedFieldsWithGolden test getAllSupportedFields() against the golden -// files for the current GPU model +// TestSupportedFieldsWithGolden test getSupportedProfilingFields() against the +// golden files for the current GPU model func TestSupportedFieldsWithGolden(t *testing.T) { config := createDefaultConfig().(*Config) client, err := newClient(config, zaptest.NewLogger(t)) @@ -58,9 +58,9 @@ func TestSupportedFieldsWithGolden(t *testing.T) { assert.NotEmpty(t, client.devicesModelName) gpuModel := client.getDeviceModelName(0) allFields := discoverRequestedFieldIDs(config) - supportedFields, err := getAllSupportedFields() + supportedProfilingFields, err := getSupportedProfilingFields() require.Nil(t, err) - enabledFields, unavailableFields := filterSupportedFields(allFields, supportedFields) + enabledFields, unavailableFields := filterSupportedFields(allFields, supportedProfilingFields) dcgmIDToNameMap := make(map[dcgm.Short]string, len(dcgm.DCGM_FI)) for fieldName, fieldID := range dcgm.DCGM_FI { From e78b8295f428e2c75d9871716d30478e09c01e1d Mon Sep 17 00:00:00 2001 From: Igor Peshansky Date: Fri, 21 Jun 2024 19:13:53 -0400 Subject: [PATCH 04/38] Enabled option reorder. 
--- receiver/dcgmreceiver/metadata.yaml | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/receiver/dcgmreceiver/metadata.yaml b/receiver/dcgmreceiver/metadata.yaml index 60999eff8..47f927f6a 100644 --- a/receiver/dcgmreceiver/metadata.yaml +++ b/receiver/dcgmreceiver/metadata.yaml @@ -35,65 +35,65 @@ attributes: metrics: dcgm.gpu.utilization: - enabled: true description: Fraction of time the GPU was not idle. unit: 1 gauge: value_type: double attributes: [model, gpu_number, uuid] + enabled: true dcgm.gpu.memory.bytes_used: - enabled: true description: Current number of GPU memory bytes used by state. Summing the values of all states yields the total GPU memory space. unit: By gauge: value_type: int attributes: [model, gpu_number, uuid, memory_state] + enabled: true dcgm.gpu.profiling.sm_utilization: - enabled: true description: Fraction of time at least one warp was active on a multiprocessor, averaged over all multiprocessors. unit: 1 gauge: value_type: double attributes: [model, gpu_number, uuid] + enabled: true dcgm.gpu.profiling.sm_occupancy: - enabled: true description: Fraction of resident warps on a multiprocessor relative to the maximum number supported, averaged over time and all multiprocessors. unit: 1 gauge: value_type: double attributes: [model, gpu_number, uuid] + enabled: true dcgm.gpu.profiling.pipe_utilization: - enabled: true description: Fraction of cycles the corresponding GPU pipe was active, averaged over time and all multiprocessors. unit: 1 gauge: value_type: double attributes: [model, gpu_number, uuid, pipe] + enabled: true dcgm.gpu.profiling.dram_utilization: - enabled: true description: Fraction of cycles data was being sent or received from GPU memory. 
unit: 1 gauge: value_type: double attributes: [model, gpu_number, uuid] + enabled: true dcgm.gpu.profiling.pcie_traffic_rate: - enabled: true description: The average rate of bytes sent from the GPU over the PCIe bus over the sample period, including both protocol headers and data payloads. unit: By/s gauge: value_type: int attributes: [model, gpu_number, uuid, direction] + enabled: true dcgm.gpu.profiling.nvlink_traffic_rate: - enabled: true description: The average rate of bytes received from the GPU over NVLink over the sample period, not including protocol headers. unit: By/s gauge: value_type: int attributes: [model, gpu_number, uuid, direction] + enabled: true From c33595caafaf3e815b80bf17929fbd29b31955ba Mon Sep 17 00:00:00 2001 From: Igor Peshansky Date: Fri, 24 May 2024 16:42:07 -0400 Subject: [PATCH 05/38] Turn the GPU device metric attributes into resource attributes. --- receiver/dcgmreceiver/client.go | 9 +- receiver/dcgmreceiver/client_gpu_test.go | 95 +++++++-------- receiver/dcgmreceiver/documentation.md | 48 +------- .../internal/metadata/generated_config.go | 54 ++++++++- .../metadata/generated_config_test.go | 63 +++++++++- .../internal/metadata/generated_metrics.go | 114 ++++++++++-------- .../metadata/generated_metrics_test.go | 111 ++++------------- .../internal/metadata/generated_resource.go | 50 ++++++++ .../metadata/generated_resource_test.go | 52 ++++++++ .../internal/metadata/testdata/config.yaml | 42 +++++++ receiver/dcgmreceiver/metadata.yaml | 30 ++--- receiver/dcgmreceiver/scraper.go | 79 ++++++------ receiver/dcgmreceiver/scraper_gpu_test.go | 17 +-- 13 files changed, 470 insertions(+), 294 deletions(-) create mode 100644 receiver/dcgmreceiver/internal/metadata/generated_resource.go create mode 100644 receiver/dcgmreceiver/internal/metadata/generated_resource_test.go diff --git a/receiver/dcgmreceiver/client.go b/receiver/dcgmreceiver/client.go index a6a4354b5..15cd5e864 100644 --- a/receiver/dcgmreceiver/client.go +++ 
b/receiver/dcgmreceiver/client.go @@ -46,7 +46,6 @@ type dcgmClient struct { type dcgmMetric struct { timestamp int64 - gpuIndex uint name string value [4096]byte } @@ -304,13 +303,13 @@ func (client *dcgmClient) getDeviceUUID(gpuIndex uint) string { return client.devicesUUID[gpuIndex] } -func (client *dcgmClient) collectDeviceMetrics() ([]dcgmMetric, error) { +func (client *dcgmClient) collectDeviceMetrics() (map[uint][]dcgmMetric, error) { var err scrapererror.ScrapeErrors - gpuMetrics := make([]dcgmMetric, 0, len(client.enabledFieldIDs)*len(client.deviceIndices)) + gpuMetrics := make(map[uint][]dcgmMetric) for _, gpuIndex := range client.deviceIndices { fieldValues, pollErr := dcgmGetLatestValuesForFields(gpuIndex, client.enabledFieldIDs) if pollErr == nil { - gpuMetrics = client.appendMetric(gpuMetrics, gpuIndex, fieldValues) + gpuMetrics[gpuIndex] = client.appendMetric(gpuMetrics[gpuIndex], gpuIndex, fieldValues) client.logger.Debugf("Successful poll of DCGM daemon for GPU %d", gpuIndex) } else { msg := fmt.Sprintf("Unable to poll DCGM daemon for GPU %d on %s", gpuIndex, pollErr) @@ -337,7 +336,7 @@ func (client *dcgmClient) appendMetric(gpuMetrics []dcgmMetric, gpuIndex uint, f case dcgm.DCGM_FT_INT64: client.logger.Debugf("Discovered (ts %d gpu %d) %s = %d (i64)", fieldValue.Ts, gpuIndex, metricName, fieldValue.Int64()) } - gpuMetrics = append(gpuMetrics, dcgmMetric{fieldValue.Ts, gpuIndex, metricName, fieldValue.Value}) + gpuMetrics = append(gpuMetrics, dcgmMetric{fieldValue.Ts, metricName, fieldValue.Value}) } return gpuMetrics diff --git a/receiver/dcgmreceiver/client_gpu_test.go b/receiver/dcgmreceiver/client_gpu_test.go index 17812ece4..ade5050aa 100644 --- a/receiver/dcgmreceiver/client_gpu_test.go +++ b/receiver/dcgmreceiver/client_gpu_test.go @@ -156,58 +156,59 @@ func TestCollectGpuProfilingMetrics(t *testing.T) { expectedMetrics := LoadExpectedMetrics(t, client.devicesModelName[0]) var maxCollectionInterval = 60 * time.Second before := 
time.Now().UnixMicro() - maxCollectionInterval.Microseconds() - metrics, err := client.collectDeviceMetrics() + deviceMetrics, err := client.collectDeviceMetrics() after := time.Now().UnixMicro() assert.Nil(t, err) seenMetric := make(map[string]bool) - for _, metric := range metrics { - assert.GreaterOrEqual(t, metric.gpuIndex, uint(0)) - assert.LessOrEqual(t, metric.gpuIndex, uint(32)) - - switch metric.name { - case "dcgm.gpu.profiling.tensor_utilization": - fallthrough - case "dcgm.gpu.profiling.dram_utilization": - fallthrough - case "dcgm.gpu.profiling.fp64_utilization": - fallthrough - case "dcgm.gpu.profiling.fp32_utilization": - fallthrough - case "dcgm.gpu.profiling.fp16_utilization": - fallthrough - case "dcgm.gpu.profiling.sm_occupancy": - fallthrough - case "dcgm.gpu.profiling.sm_utilization": - assert.GreaterOrEqual(t, metric.asFloat64(), float64(0.0)) - assert.LessOrEqual(t, metric.asFloat64(), float64(1.0)) - case "dcgm.gpu.utilization": - assert.GreaterOrEqual(t, metric.asInt64(), int64(0)) - assert.LessOrEqual(t, metric.asInt64(), int64(100)) - case "dcgm.gpu.memory.bytes_free": - fallthrough - case "dcgm.gpu.memory.bytes_used": - // arbitrary max of 10 TiB - assert.GreaterOrEqual(t, metric.asInt64(), int64(0)) - assert.LessOrEqual(t, metric.asInt64(), int64(10485760)) - case "dcgm.gpu.profiling.pcie_sent_bytes": - fallthrough - case "dcgm.gpu.profiling.pcie_received_bytes": - fallthrough - case "dcgm.gpu.profiling.nvlink_sent_bytes": - fallthrough - case "dcgm.gpu.profiling.nvlink_received_bytes": - // arbitrary max of 10 TiB/sec - assert.GreaterOrEqual(t, metric.asInt64(), int64(0)) - assert.LessOrEqual(t, metric.asInt64(), int64(10995116277760)) - default: - t.Errorf("Unexpected metric '%s'", metric.name) + assert.GreaterOrEqual(t, len(deviceMetrics), 0) + assert.LessOrEqual(t, len(deviceMetrics), 32) + for gpuIndex, metrics := range deviceMetrics { + for _, metric := range metrics { + switch metric.name { + case 
"dcgm.gpu.profiling.tensor_utilization": + fallthrough + case "dcgm.gpu.profiling.dram_utilization": + fallthrough + case "dcgm.gpu.profiling.fp64_utilization": + fallthrough + case "dcgm.gpu.profiling.fp32_utilization": + fallthrough + case "dcgm.gpu.profiling.fp16_utilization": + fallthrough + case "dcgm.gpu.profiling.sm_occupancy": + fallthrough + case "dcgm.gpu.profiling.sm_utilization": + assert.GreaterOrEqual(t, metric.asFloat64(), float64(0.0)) + assert.LessOrEqual(t, metric.asFloat64(), float64(1.0)) + case "dcgm.gpu.utilization": + assert.GreaterOrEqual(t, metric.asInt64(), int64(0)) + assert.LessOrEqual(t, metric.asInt64(), int64(100)) + case "dcgm.gpu.memory.bytes_free": + fallthrough + case "dcgm.gpu.memory.bytes_used": + // arbitrary max of 10 TiB + assert.GreaterOrEqual(t, metric.asInt64(), int64(0)) + assert.LessOrEqual(t, metric.asInt64(), int64(10485760)) + case "dcgm.gpu.profiling.pcie_sent_bytes": + fallthrough + case "dcgm.gpu.profiling.pcie_received_bytes": + fallthrough + case "dcgm.gpu.profiling.nvlink_sent_bytes": + fallthrough + case "dcgm.gpu.profiling.nvlink_received_bytes": + // arbitrary max of 10 TiB/sec + assert.GreaterOrEqual(t, metric.asInt64(), int64(0)) + assert.LessOrEqual(t, metric.asInt64(), int64(10995116277760)) + default: + t.Errorf("Unexpected metric '%s'", metric.name) + } + + assert.GreaterOrEqual(t, metric.timestamp, before) + assert.LessOrEqual(t, metric.timestamp, after) + + seenMetric[fmt.Sprintf("gpu{%d}.metric{%s}", gpuIndex, metric.name)] = true } - - assert.GreaterOrEqual(t, metric.timestamp, before) - assert.LessOrEqual(t, metric.timestamp, after) - - seenMetric[fmt.Sprintf("gpu{%d}.metric{%s}", metric.gpuIndex, metric.name)] = true } for _, gpuIndex := range client.deviceIndices { diff --git a/receiver/dcgmreceiver/documentation.md b/receiver/dcgmreceiver/documentation.md index 5151ea5fc..5a7f947d8 100644 --- a/receiver/dcgmreceiver/documentation.md +++ b/receiver/dcgmreceiver/documentation.md @@ -24,9 +24,6 @@ 
Current number of GPU memory bytes used by state. Summing the values of all stat | Name | Description | Values | | ---- | ----------- | ------ | -| model | GPU model | Any Str | -| gpu_number | GPU index starting at 0. | Any Str | -| uuid | GPU universally unique identifier | Any Str | | memory_state | GPU memory used or free | Str: ``used``, ``free`` | ### dcgm.gpu.profiling.dram_utilization @@ -37,14 +34,6 @@ Fraction of cycles data was being sent or received from GPU memory. | ---- | ----------- | ---------- | | 1 | Gauge | Double | -#### Attributes - -| Name | Description | Values | -| ---- | ----------- | ------ | -| model | GPU model | Any Str | -| gpu_number | GPU index starting at 0. | Any Str | -| uuid | GPU universally unique identifier | Any Str | - ### dcgm.gpu.profiling.nvlink_traffic_rate The average rate of bytes received from the GPU over NVLink over the sample period, not including protocol headers. @@ -57,9 +46,6 @@ The average rate of bytes received from the GPU over NVLink over the sample peri | Name | Description | Values | | ---- | ----------- | ------ | -| model | GPU model | Any Str | -| gpu_number | GPU index starting at 0. | Any Str | -| uuid | GPU universally unique identifier | Any Str | | direction | Direction of the link traffic, one of [tx, rx]. | Str: ``tx``, ``rx`` | ### dcgm.gpu.profiling.pcie_traffic_rate @@ -74,9 +60,6 @@ The average rate of bytes sent from the GPU over the PCIe bus over the sample pe | Name | Description | Values | | ---- | ----------- | ------ | -| model | GPU model | Any Str | -| gpu_number | GPU index starting at 0. | Any Str | -| uuid | GPU universally unique identifier | Any Str | | direction | Direction of the link traffic, one of [tx, rx]. 
| Str: ``tx``, ``rx`` | ### dcgm.gpu.profiling.pipe_utilization @@ -91,9 +74,6 @@ Fraction of cycles the corresponding GPU pipe was active, averaged over time and | Name | Description | Values | | ---- | ----------- | ------ | -| model | GPU model | Any Str | -| gpu_number | GPU index starting at 0. | Any Str | -| uuid | GPU universally unique identifier | Any Str | | pipe | GPU pipe in use, one of [tensor, fp64, fp32, fp16]. | Str: ``tensor``, ``fp64``, ``fp32``, ``fp16`` | ### dcgm.gpu.profiling.sm_occupancy @@ -104,14 +84,6 @@ Fraction of resident warps on a multiprocessor relative to the maximum number su | ---- | ----------- | ---------- | | 1 | Gauge | Double | -#### Attributes - -| Name | Description | Values | -| ---- | ----------- | ------ | -| model | GPU model | Any Str | -| gpu_number | GPU index starting at 0. | Any Str | -| uuid | GPU universally unique identifier | Any Str | - ### dcgm.gpu.profiling.sm_utilization Fraction of time at least one warp was active on a multiprocessor, averaged over all multiprocessors. @@ -120,14 +92,6 @@ Fraction of time at least one warp was active on a multiprocessor, averaged over | ---- | ----------- | ---------- | | 1 | Gauge | Double | -#### Attributes - -| Name | Description | Values | -| ---- | ----------- | ------ | -| model | GPU model | Any Str | -| gpu_number | GPU index starting at 0. | Any Str | -| uuid | GPU universally unique identifier | Any Str | - ### dcgm.gpu.utilization Fraction of time the GPU was not idle. @@ -136,10 +100,10 @@ Fraction of time the GPU was not idle. | ---- | ----------- | ---------- | | 1 | Gauge | Double | -#### Attributes +## Resource Attributes -| Name | Description | Values | -| ---- | ----------- | ------ | -| model | GPU model | Any Str | -| gpu_number | GPU index starting at 0. | Any Str | -| uuid | GPU universally unique identifier | Any Str | +| Name | Description | Values | Enabled | +| ---- | ----------- | ------ | ------- | +| gpu.model | GPU model name. 
| Any Str | true | +| gpu.number | GPU index starting at 0. | Any Str | true | +| gpu.uuid | GPU universally unique identifier. | Any Str | true | diff --git a/receiver/dcgmreceiver/internal/metadata/generated_config.go b/receiver/dcgmreceiver/internal/metadata/generated_config.go index ec7383f79..46c17ee8e 100644 --- a/receiver/dcgmreceiver/internal/metadata/generated_config.go +++ b/receiver/dcgmreceiver/internal/metadata/generated_config.go @@ -4,6 +4,7 @@ package metadata import ( "go.opentelemetry.io/collector/confmap" + "go.opentelemetry.io/collector/filter" ) // MetricConfig provides common config for a particular metric. @@ -66,13 +67,62 @@ func DefaultMetricsConfig() MetricsConfig { } } +// ResourceAttributeConfig provides common config for a particular resource attribute. +type ResourceAttributeConfig struct { + Enabled bool `mapstructure:"enabled"` + // Experimental: MetricsInclude defines a list of filters for attribute values. + // If the list is not empty, only metrics with matching resource attribute values will be emitted. + MetricsInclude []filter.Config `mapstructure:"metrics_include"` + // Experimental: MetricsExclude defines a list of filters for attribute values. + // If the list is not empty, metrics with matching resource attribute values will not be emitted. + // MetricsInclude has higher priority than MetricsExclude. + MetricsExclude []filter.Config `mapstructure:"metrics_exclude"` + + enabledSetByUser bool +} + +func (rac *ResourceAttributeConfig) Unmarshal(parser *confmap.Conf) error { + if parser == nil { + return nil + } + err := parser.Unmarshal(rac) + if err != nil { + return err + } + rac.enabledSetByUser = parser.IsSet("enabled") + return nil +} + +// ResourceAttributesConfig provides config for dcgm resource attributes. 
+type ResourceAttributesConfig struct { + GpuModel ResourceAttributeConfig `mapstructure:"gpu.model"` + GpuNumber ResourceAttributeConfig `mapstructure:"gpu.number"` + GpuUUID ResourceAttributeConfig `mapstructure:"gpu.uuid"` +} + +func DefaultResourceAttributesConfig() ResourceAttributesConfig { + return ResourceAttributesConfig{ + GpuModel: ResourceAttributeConfig{ + Enabled: true, + }, + GpuNumber: ResourceAttributeConfig{ + Enabled: true, + }, + GpuUUID: ResourceAttributeConfig{ + Enabled: true, + }, + } +} + // MetricsBuilderConfig is a configuration for dcgm metrics builder. type MetricsBuilderConfig struct { - Metrics MetricsConfig `mapstructure:"metrics"` + Metrics MetricsConfig `mapstructure:"metrics"` + ResourceAttributes ResourceAttributesConfig `mapstructure:"resource_attributes"` } func DefaultMetricsBuilderConfig() MetricsBuilderConfig { return MetricsBuilderConfig{ - Metrics: DefaultMetricsConfig(), + Metrics: DefaultMetricsConfig(), + ResourceAttributes: DefaultResourceAttributesConfig(), } } diff --git a/receiver/dcgmreceiver/internal/metadata/generated_config_test.go b/receiver/dcgmreceiver/internal/metadata/generated_config_test.go index 61c444bbb..4ca3fcd32 100644 --- a/receiver/dcgmreceiver/internal/metadata/generated_config_test.go +++ b/receiver/dcgmreceiver/internal/metadata/generated_config_test.go @@ -9,6 +9,7 @@ import ( "github.com/google/go-cmp/cmp" "github.com/google/go-cmp/cmp/cmpopts" "github.com/stretchr/testify/require" + "go.opentelemetry.io/collector/component" "go.opentelemetry.io/collector/confmap/confmaptest" ) @@ -34,6 +35,11 @@ func TestMetricsBuilderConfig(t *testing.T) { DcgmGpuProfilingSmUtilization: MetricConfig{Enabled: true}, DcgmGpuUtilization: MetricConfig{Enabled: true}, }, + ResourceAttributes: ResourceAttributesConfig{ + GpuModel: ResourceAttributeConfig{Enabled: true}, + GpuNumber: ResourceAttributeConfig{Enabled: true}, + GpuUUID: ResourceAttributeConfig{Enabled: true}, + }, }, }, { @@ -49,13 +55,18 @@ func 
TestMetricsBuilderConfig(t *testing.T) { DcgmGpuProfilingSmUtilization: MetricConfig{Enabled: false}, DcgmGpuUtilization: MetricConfig{Enabled: false}, }, + ResourceAttributes: ResourceAttributesConfig{ + GpuModel: ResourceAttributeConfig{Enabled: false}, + GpuNumber: ResourceAttributeConfig{Enabled: false}, + GpuUUID: ResourceAttributeConfig{Enabled: false}, + }, }, }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { cfg := loadMetricsBuilderConfig(t, tt.name) - if diff := cmp.Diff(tt.want, cfg, cmpopts.IgnoreUnexported(MetricConfig{})); diff != "" { + if diff := cmp.Diff(tt.want, cfg, cmpopts.IgnoreUnexported(MetricConfig{}, ResourceAttributeConfig{})); diff != "" { t.Errorf("Config mismatch (-expected +actual):\n%s", diff) } }) @@ -68,6 +79,54 @@ func loadMetricsBuilderConfig(t *testing.T, name string) MetricsBuilderConfig { sub, err := cm.Sub(name) require.NoError(t, err) cfg := DefaultMetricsBuilderConfig() - require.NoError(t, sub.Unmarshal(&cfg)) + require.NoError(t, component.UnmarshalConfig(sub, &cfg)) + return cfg +} + +func TestResourceAttributesConfig(t *testing.T) { + tests := []struct { + name string + want ResourceAttributesConfig + }{ + { + name: "default", + want: DefaultResourceAttributesConfig(), + }, + { + name: "all_set", + want: ResourceAttributesConfig{ + GpuModel: ResourceAttributeConfig{Enabled: true}, + GpuNumber: ResourceAttributeConfig{Enabled: true}, + GpuUUID: ResourceAttributeConfig{Enabled: true}, + }, + }, + { + name: "none_set", + want: ResourceAttributesConfig{ + GpuModel: ResourceAttributeConfig{Enabled: false}, + GpuNumber: ResourceAttributeConfig{Enabled: false}, + GpuUUID: ResourceAttributeConfig{Enabled: false}, + }, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + cfg := loadResourceAttributesConfig(t, tt.name) + if diff := cmp.Diff(tt.want, cfg, cmpopts.IgnoreUnexported(ResourceAttributeConfig{})); diff != "" { + t.Errorf("Config mismatch (-expected +actual):\n%s", diff) + } + }) 
+ } +} + +func loadResourceAttributesConfig(t *testing.T, name string) ResourceAttributesConfig { + cm, err := confmaptest.LoadConf(filepath.Join("testdata", "config.yaml")) + require.NoError(t, err) + sub, err := cm.Sub(name) + require.NoError(t, err) + sub, err = sub.Sub("resource_attributes") + require.NoError(t, err) + cfg := DefaultResourceAttributesConfig() + require.NoError(t, component.UnmarshalConfig(sub, &cfg)) return cfg } diff --git a/receiver/dcgmreceiver/internal/metadata/generated_metrics.go b/receiver/dcgmreceiver/internal/metadata/generated_metrics.go index 6d681ee43..85a76fdc7 100644 --- a/receiver/dcgmreceiver/internal/metadata/generated_metrics.go +++ b/receiver/dcgmreceiver/internal/metadata/generated_metrics.go @@ -6,6 +6,7 @@ import ( "time" "go.opentelemetry.io/collector/component" + "go.opentelemetry.io/collector/filter" "go.opentelemetry.io/collector/pdata/pcommon" "go.opentelemetry.io/collector/pdata/pmetric" "go.opentelemetry.io/collector/receiver" @@ -112,7 +113,7 @@ func (m *metricDcgmGpuMemoryBytesUsed) init() { m.data.Gauge().DataPoints().EnsureCapacity(m.capacity) } -func (m *metricDcgmGpuMemoryBytesUsed) recordDataPoint(start pcommon.Timestamp, ts pcommon.Timestamp, val int64, modelAttributeValue string, gpuNumberAttributeValue string, uuidAttributeValue string, memoryStateAttributeValue string) { +func (m *metricDcgmGpuMemoryBytesUsed) recordDataPoint(start pcommon.Timestamp, ts pcommon.Timestamp, val int64, memoryStateAttributeValue string) { if !m.config.Enabled { return } @@ -120,9 +121,6 @@ func (m *metricDcgmGpuMemoryBytesUsed) recordDataPoint(start pcommon.Timestamp, dp.SetStartTimestamp(start) dp.SetTimestamp(ts) dp.SetIntValue(val) - dp.Attributes().PutStr("model", modelAttributeValue) - dp.Attributes().PutStr("gpu_number", gpuNumberAttributeValue) - dp.Attributes().PutStr("uuid", uuidAttributeValue) dp.Attributes().PutStr("memory_state", memoryStateAttributeValue) } @@ -163,10 +161,9 @@ func (m 
*metricDcgmGpuProfilingDramUtilization) init() { m.data.SetDescription("Fraction of cycles data was being sent or received from GPU memory.") m.data.SetUnit("1") m.data.SetEmptyGauge() - m.data.Gauge().DataPoints().EnsureCapacity(m.capacity) } -func (m *metricDcgmGpuProfilingDramUtilization) recordDataPoint(start pcommon.Timestamp, ts pcommon.Timestamp, val float64, modelAttributeValue string, gpuNumberAttributeValue string, uuidAttributeValue string) { +func (m *metricDcgmGpuProfilingDramUtilization) recordDataPoint(start pcommon.Timestamp, ts pcommon.Timestamp, val float64) { if !m.config.Enabled { return } @@ -174,9 +171,6 @@ func (m *metricDcgmGpuProfilingDramUtilization) recordDataPoint(start pcommon.Ti dp.SetStartTimestamp(start) dp.SetTimestamp(ts) dp.SetDoubleValue(val) - dp.Attributes().PutStr("model", modelAttributeValue) - dp.Attributes().PutStr("gpu_number", gpuNumberAttributeValue) - dp.Attributes().PutStr("uuid", uuidAttributeValue) } // updateCapacity saves max length of data point slices that will be used for the slice capacity. @@ -219,7 +213,7 @@ func (m *metricDcgmGpuProfilingNvlinkTrafficRate) init() { m.data.Gauge().DataPoints().EnsureCapacity(m.capacity) } -func (m *metricDcgmGpuProfilingNvlinkTrafficRate) recordDataPoint(start pcommon.Timestamp, ts pcommon.Timestamp, val int64, modelAttributeValue string, gpuNumberAttributeValue string, uuidAttributeValue string, directionAttributeValue string) { +func (m *metricDcgmGpuProfilingNvlinkTrafficRate) recordDataPoint(start pcommon.Timestamp, ts pcommon.Timestamp, val int64, directionAttributeValue string) { if !m.config.Enabled { return } @@ -227,9 +221,6 @@ func (m *metricDcgmGpuProfilingNvlinkTrafficRate) recordDataPoint(start pcommon. 
dp.SetStartTimestamp(start) dp.SetTimestamp(ts) dp.SetIntValue(val) - dp.Attributes().PutStr("model", modelAttributeValue) - dp.Attributes().PutStr("gpu_number", gpuNumberAttributeValue) - dp.Attributes().PutStr("uuid", uuidAttributeValue) dp.Attributes().PutStr("direction", directionAttributeValue) } @@ -273,7 +264,7 @@ func (m *metricDcgmGpuProfilingPcieTrafficRate) init() { m.data.Gauge().DataPoints().EnsureCapacity(m.capacity) } -func (m *metricDcgmGpuProfilingPcieTrafficRate) recordDataPoint(start pcommon.Timestamp, ts pcommon.Timestamp, val int64, modelAttributeValue string, gpuNumberAttributeValue string, uuidAttributeValue string, directionAttributeValue string) { +func (m *metricDcgmGpuProfilingPcieTrafficRate) recordDataPoint(start pcommon.Timestamp, ts pcommon.Timestamp, val int64, directionAttributeValue string) { if !m.config.Enabled { return } @@ -281,9 +272,6 @@ func (m *metricDcgmGpuProfilingPcieTrafficRate) recordDataPoint(start pcommon.Ti dp.SetStartTimestamp(start) dp.SetTimestamp(ts) dp.SetIntValue(val) - dp.Attributes().PutStr("model", modelAttributeValue) - dp.Attributes().PutStr("gpu_number", gpuNumberAttributeValue) - dp.Attributes().PutStr("uuid", uuidAttributeValue) dp.Attributes().PutStr("direction", directionAttributeValue) } @@ -327,7 +315,7 @@ func (m *metricDcgmGpuProfilingPipeUtilization) init() { m.data.Gauge().DataPoints().EnsureCapacity(m.capacity) } -func (m *metricDcgmGpuProfilingPipeUtilization) recordDataPoint(start pcommon.Timestamp, ts pcommon.Timestamp, val float64, modelAttributeValue string, gpuNumberAttributeValue string, uuidAttributeValue string, pipeAttributeValue string) { +func (m *metricDcgmGpuProfilingPipeUtilization) recordDataPoint(start pcommon.Timestamp, ts pcommon.Timestamp, val float64, pipeAttributeValue string) { if !m.config.Enabled { return } @@ -335,9 +323,6 @@ func (m *metricDcgmGpuProfilingPipeUtilization) recordDataPoint(start pcommon.Ti dp.SetStartTimestamp(start) dp.SetTimestamp(ts) 
dp.SetDoubleValue(val) - dp.Attributes().PutStr("model", modelAttributeValue) - dp.Attributes().PutStr("gpu_number", gpuNumberAttributeValue) - dp.Attributes().PutStr("uuid", uuidAttributeValue) dp.Attributes().PutStr("pipe", pipeAttributeValue) } @@ -378,10 +363,9 @@ func (m *metricDcgmGpuProfilingSmOccupancy) init() { m.data.SetDescription("Fraction of resident warps on a multiprocessor relative to the maximum number supported, averaged over time and all multiprocessors.") m.data.SetUnit("1") m.data.SetEmptyGauge() - m.data.Gauge().DataPoints().EnsureCapacity(m.capacity) } -func (m *metricDcgmGpuProfilingSmOccupancy) recordDataPoint(start pcommon.Timestamp, ts pcommon.Timestamp, val float64, modelAttributeValue string, gpuNumberAttributeValue string, uuidAttributeValue string) { +func (m *metricDcgmGpuProfilingSmOccupancy) recordDataPoint(start pcommon.Timestamp, ts pcommon.Timestamp, val float64) { if !m.config.Enabled { return } @@ -389,9 +373,6 @@ func (m *metricDcgmGpuProfilingSmOccupancy) recordDataPoint(start pcommon.Timest dp.SetStartTimestamp(start) dp.SetTimestamp(ts) dp.SetDoubleValue(val) - dp.Attributes().PutStr("model", modelAttributeValue) - dp.Attributes().PutStr("gpu_number", gpuNumberAttributeValue) - dp.Attributes().PutStr("uuid", uuidAttributeValue) } // updateCapacity saves max length of data point slices that will be used for the slice capacity. 
@@ -431,10 +412,9 @@ func (m *metricDcgmGpuProfilingSmUtilization) init() { m.data.SetDescription("Fraction of time at least one warp was active on a multiprocessor, averaged over all multiprocessors.") m.data.SetUnit("1") m.data.SetEmptyGauge() - m.data.Gauge().DataPoints().EnsureCapacity(m.capacity) } -func (m *metricDcgmGpuProfilingSmUtilization) recordDataPoint(start pcommon.Timestamp, ts pcommon.Timestamp, val float64, modelAttributeValue string, gpuNumberAttributeValue string, uuidAttributeValue string) { +func (m *metricDcgmGpuProfilingSmUtilization) recordDataPoint(start pcommon.Timestamp, ts pcommon.Timestamp, val float64) { if !m.config.Enabled { return } @@ -442,9 +422,6 @@ func (m *metricDcgmGpuProfilingSmUtilization) recordDataPoint(start pcommon.Time dp.SetStartTimestamp(start) dp.SetTimestamp(ts) dp.SetDoubleValue(val) - dp.Attributes().PutStr("model", modelAttributeValue) - dp.Attributes().PutStr("gpu_number", gpuNumberAttributeValue) - dp.Attributes().PutStr("uuid", uuidAttributeValue) } // updateCapacity saves max length of data point slices that will be used for the slice capacity. 
@@ -484,10 +461,9 @@ func (m *metricDcgmGpuUtilization) init() { m.data.SetDescription("Fraction of time the GPU was not idle.") m.data.SetUnit("1") m.data.SetEmptyGauge() - m.data.Gauge().DataPoints().EnsureCapacity(m.capacity) } -func (m *metricDcgmGpuUtilization) recordDataPoint(start pcommon.Timestamp, ts pcommon.Timestamp, val float64, modelAttributeValue string, gpuNumberAttributeValue string, uuidAttributeValue string) { +func (m *metricDcgmGpuUtilization) recordDataPoint(start pcommon.Timestamp, ts pcommon.Timestamp, val float64) { if !m.config.Enabled { return } @@ -495,9 +471,6 @@ func (m *metricDcgmGpuUtilization) recordDataPoint(start pcommon.Timestamp, ts p dp.SetStartTimestamp(start) dp.SetTimestamp(ts) dp.SetDoubleValue(val) - dp.Attributes().PutStr("model", modelAttributeValue) - dp.Attributes().PutStr("gpu_number", gpuNumberAttributeValue) - dp.Attributes().PutStr("uuid", uuidAttributeValue) } // updateCapacity saves max length of data point slices that will be used for the slice capacity. @@ -533,6 +506,8 @@ type MetricsBuilder struct { metricsCapacity int // maximum observed number of metrics per resource. metricsBuffer pmetric.Metrics // accumulates metrics data before emitting. buildInfo component.BuildInfo // contains version information. 
+ resourceAttributeIncludeFilter map[string]filter.Filter + resourceAttributeExcludeFilter map[string]filter.Filter metricDcgmGpuMemoryBytesUsed metricDcgmGpuMemoryBytesUsed metricDcgmGpuProfilingDramUtilization metricDcgmGpuProfilingDramUtilization metricDcgmGpuProfilingNvlinkTrafficRate metricDcgmGpuProfilingNvlinkTrafficRate @@ -567,6 +542,26 @@ func NewMetricsBuilder(mbc MetricsBuilderConfig, settings receiver.CreateSetting metricDcgmGpuProfilingSmOccupancy: newMetricDcgmGpuProfilingSmOccupancy(mbc.Metrics.DcgmGpuProfilingSmOccupancy), metricDcgmGpuProfilingSmUtilization: newMetricDcgmGpuProfilingSmUtilization(mbc.Metrics.DcgmGpuProfilingSmUtilization), metricDcgmGpuUtilization: newMetricDcgmGpuUtilization(mbc.Metrics.DcgmGpuUtilization), + resourceAttributeIncludeFilter: make(map[string]filter.Filter), + resourceAttributeExcludeFilter: make(map[string]filter.Filter), + } + if mbc.ResourceAttributes.GpuModel.MetricsInclude != nil { + mb.resourceAttributeIncludeFilter["gpu.model"] = filter.CreateFilter(mbc.ResourceAttributes.GpuModel.MetricsInclude) + } + if mbc.ResourceAttributes.GpuModel.MetricsExclude != nil { + mb.resourceAttributeExcludeFilter["gpu.model"] = filter.CreateFilter(mbc.ResourceAttributes.GpuModel.MetricsExclude) + } + if mbc.ResourceAttributes.GpuNumber.MetricsInclude != nil { + mb.resourceAttributeIncludeFilter["gpu.number"] = filter.CreateFilter(mbc.ResourceAttributes.GpuNumber.MetricsInclude) + } + if mbc.ResourceAttributes.GpuNumber.MetricsExclude != nil { + mb.resourceAttributeExcludeFilter["gpu.number"] = filter.CreateFilter(mbc.ResourceAttributes.GpuNumber.MetricsExclude) + } + if mbc.ResourceAttributes.GpuUUID.MetricsInclude != nil { + mb.resourceAttributeIncludeFilter["gpu.uuid"] = filter.CreateFilter(mbc.ResourceAttributes.GpuUUID.MetricsInclude) + } + if mbc.ResourceAttributes.GpuUUID.MetricsExclude != nil { + mb.resourceAttributeExcludeFilter["gpu.uuid"] = filter.CreateFilter(mbc.ResourceAttributes.GpuUUID.MetricsExclude) } for _, 
op := range options { @@ -575,6 +570,11 @@ func NewMetricsBuilder(mbc MetricsBuilderConfig, settings receiver.CreateSetting return mb } +// NewResourceBuilder returns a new resource builder that should be used to build a resource associated with for the emitted metrics. +func (mb *MetricsBuilder) NewResourceBuilder() *ResourceBuilder { + return NewResourceBuilder(mb.config.ResourceAttributes) +} + // updateCapacity updates max length of metrics and resource attributes that will be used for the slice capacity. func (mb *MetricsBuilder) updateCapacity(rm pmetric.ResourceMetrics) { if mb.metricsCapacity < rm.ScopeMetrics().At(0).Metrics().Len() { @@ -636,6 +636,16 @@ func (mb *MetricsBuilder) EmitForResource(rmo ...ResourceMetricsOption) { for _, op := range rmo { op(rm) } + for attr, filter := range mb.resourceAttributeIncludeFilter { + if val, ok := rm.Resource().Attributes().Get(attr); ok && !filter.Matches(val.AsString()) { + return + } + } + for attr, filter := range mb.resourceAttributeExcludeFilter { + if val, ok := rm.Resource().Attributes().Get(attr); ok && filter.Matches(val.AsString()) { + return + } + } if ils.Metrics().Len() > 0 { mb.updateCapacity(rm) @@ -654,43 +664,43 @@ func (mb *MetricsBuilder) Emit(rmo ...ResourceMetricsOption) pmetric.Metrics { } // RecordDcgmGpuMemoryBytesUsedDataPoint adds a data point to dcgm.gpu.memory.bytes_used metric. 
-func (mb *MetricsBuilder) RecordDcgmGpuMemoryBytesUsedDataPoint(ts pcommon.Timestamp, val int64, modelAttributeValue string, gpuNumberAttributeValue string, uuidAttributeValue string, memoryStateAttributeValue AttributeMemoryState) { - mb.metricDcgmGpuMemoryBytesUsed.recordDataPoint(mb.startTime, ts, val, modelAttributeValue, gpuNumberAttributeValue, uuidAttributeValue, memoryStateAttributeValue.String()) +func (mb *MetricsBuilder) RecordDcgmGpuMemoryBytesUsedDataPoint(ts pcommon.Timestamp, val int64, memoryStateAttributeValue AttributeMemoryState) { + mb.metricDcgmGpuMemoryBytesUsed.recordDataPoint(mb.startTime, ts, val, memoryStateAttributeValue.String()) } // RecordDcgmGpuProfilingDramUtilizationDataPoint adds a data point to dcgm.gpu.profiling.dram_utilization metric. -func (mb *MetricsBuilder) RecordDcgmGpuProfilingDramUtilizationDataPoint(ts pcommon.Timestamp, val float64, modelAttributeValue string, gpuNumberAttributeValue string, uuidAttributeValue string) { - mb.metricDcgmGpuProfilingDramUtilization.recordDataPoint(mb.startTime, ts, val, modelAttributeValue, gpuNumberAttributeValue, uuidAttributeValue) +func (mb *MetricsBuilder) RecordDcgmGpuProfilingDramUtilizationDataPoint(ts pcommon.Timestamp, val float64) { + mb.metricDcgmGpuProfilingDramUtilization.recordDataPoint(mb.startTime, ts, val) } // RecordDcgmGpuProfilingNvlinkTrafficRateDataPoint adds a data point to dcgm.gpu.profiling.nvlink_traffic_rate metric. 
-func (mb *MetricsBuilder) RecordDcgmGpuProfilingNvlinkTrafficRateDataPoint(ts pcommon.Timestamp, val int64, modelAttributeValue string, gpuNumberAttributeValue string, uuidAttributeValue string, directionAttributeValue AttributeDirection) { - mb.metricDcgmGpuProfilingNvlinkTrafficRate.recordDataPoint(mb.startTime, ts, val, modelAttributeValue, gpuNumberAttributeValue, uuidAttributeValue, directionAttributeValue.String()) +func (mb *MetricsBuilder) RecordDcgmGpuProfilingNvlinkTrafficRateDataPoint(ts pcommon.Timestamp, val int64, directionAttributeValue AttributeDirection) { + mb.metricDcgmGpuProfilingNvlinkTrafficRate.recordDataPoint(mb.startTime, ts, val, directionAttributeValue.String()) } // RecordDcgmGpuProfilingPcieTrafficRateDataPoint adds a data point to dcgm.gpu.profiling.pcie_traffic_rate metric. -func (mb *MetricsBuilder) RecordDcgmGpuProfilingPcieTrafficRateDataPoint(ts pcommon.Timestamp, val int64, modelAttributeValue string, gpuNumberAttributeValue string, uuidAttributeValue string, directionAttributeValue AttributeDirection) { - mb.metricDcgmGpuProfilingPcieTrafficRate.recordDataPoint(mb.startTime, ts, val, modelAttributeValue, gpuNumberAttributeValue, uuidAttributeValue, directionAttributeValue.String()) +func (mb *MetricsBuilder) RecordDcgmGpuProfilingPcieTrafficRateDataPoint(ts pcommon.Timestamp, val int64, directionAttributeValue AttributeDirection) { + mb.metricDcgmGpuProfilingPcieTrafficRate.recordDataPoint(mb.startTime, ts, val, directionAttributeValue.String()) } // RecordDcgmGpuProfilingPipeUtilizationDataPoint adds a data point to dcgm.gpu.profiling.pipe_utilization metric. 
-func (mb *MetricsBuilder) RecordDcgmGpuProfilingPipeUtilizationDataPoint(ts pcommon.Timestamp, val float64, modelAttributeValue string, gpuNumberAttributeValue string, uuidAttributeValue string, pipeAttributeValue AttributePipe) { - mb.metricDcgmGpuProfilingPipeUtilization.recordDataPoint(mb.startTime, ts, val, modelAttributeValue, gpuNumberAttributeValue, uuidAttributeValue, pipeAttributeValue.String()) +func (mb *MetricsBuilder) RecordDcgmGpuProfilingPipeUtilizationDataPoint(ts pcommon.Timestamp, val float64, pipeAttributeValue AttributePipe) { + mb.metricDcgmGpuProfilingPipeUtilization.recordDataPoint(mb.startTime, ts, val, pipeAttributeValue.String()) } // RecordDcgmGpuProfilingSmOccupancyDataPoint adds a data point to dcgm.gpu.profiling.sm_occupancy metric. -func (mb *MetricsBuilder) RecordDcgmGpuProfilingSmOccupancyDataPoint(ts pcommon.Timestamp, val float64, modelAttributeValue string, gpuNumberAttributeValue string, uuidAttributeValue string) { - mb.metricDcgmGpuProfilingSmOccupancy.recordDataPoint(mb.startTime, ts, val, modelAttributeValue, gpuNumberAttributeValue, uuidAttributeValue) +func (mb *MetricsBuilder) RecordDcgmGpuProfilingSmOccupancyDataPoint(ts pcommon.Timestamp, val float64) { + mb.metricDcgmGpuProfilingSmOccupancy.recordDataPoint(mb.startTime, ts, val) } // RecordDcgmGpuProfilingSmUtilizationDataPoint adds a data point to dcgm.gpu.profiling.sm_utilization metric. 
-func (mb *MetricsBuilder) RecordDcgmGpuProfilingSmUtilizationDataPoint(ts pcommon.Timestamp, val float64, modelAttributeValue string, gpuNumberAttributeValue string, uuidAttributeValue string) { - mb.metricDcgmGpuProfilingSmUtilization.recordDataPoint(mb.startTime, ts, val, modelAttributeValue, gpuNumberAttributeValue, uuidAttributeValue) +func (mb *MetricsBuilder) RecordDcgmGpuProfilingSmUtilizationDataPoint(ts pcommon.Timestamp, val float64) { + mb.metricDcgmGpuProfilingSmUtilization.recordDataPoint(mb.startTime, ts, val) } // RecordDcgmGpuUtilizationDataPoint adds a data point to dcgm.gpu.utilization metric. -func (mb *MetricsBuilder) RecordDcgmGpuUtilizationDataPoint(ts pcommon.Timestamp, val float64, modelAttributeValue string, gpuNumberAttributeValue string, uuidAttributeValue string) { - mb.metricDcgmGpuUtilization.recordDataPoint(mb.startTime, ts, val, modelAttributeValue, gpuNumberAttributeValue, uuidAttributeValue) +func (mb *MetricsBuilder) RecordDcgmGpuUtilizationDataPoint(ts pcommon.Timestamp, val float64) { + mb.metricDcgmGpuUtilization.recordDataPoint(mb.startTime, ts, val) } // Reset resets metrics builder to its initial state. 
It should be used when external metrics source is restarted, diff --git a/receiver/dcgmreceiver/internal/metadata/generated_metrics_test.go b/receiver/dcgmreceiver/internal/metadata/generated_metrics_test.go index 0b68d49bd..9de3b9b1d 100644 --- a/receiver/dcgmreceiver/internal/metadata/generated_metrics_test.go +++ b/receiver/dcgmreceiver/internal/metadata/generated_metrics_test.go @@ -42,6 +42,15 @@ func TestMetricsBuilder(t *testing.T) { resAttrsSet: testDataSetNone, expectEmpty: true, }, + { + name: "filter_set_include", + resAttrsSet: testDataSetAll, + }, + { + name: "filter_set_exclude", + resAttrsSet: testDataSetAll, + expectEmpty: true, + }, } for _, test := range tests { t.Run(test.name, func(t *testing.T) { @@ -61,37 +70,41 @@ func TestMetricsBuilder(t *testing.T) { defaultMetricsCount++ allMetricsCount++ - mb.RecordDcgmGpuMemoryBytesUsedDataPoint(ts, 1, "model-val", "gpu_number-val", "uuid-val", AttributeMemoryStateUsed) + mb.RecordDcgmGpuMemoryBytesUsedDataPoint(ts, 1, AttributeMemoryStateUsed) defaultMetricsCount++ allMetricsCount++ - mb.RecordDcgmGpuProfilingDramUtilizationDataPoint(ts, 1, "model-val", "gpu_number-val", "uuid-val") + mb.RecordDcgmGpuProfilingDramUtilizationDataPoint(ts, 1) defaultMetricsCount++ allMetricsCount++ - mb.RecordDcgmGpuProfilingNvlinkTrafficRateDataPoint(ts, 1, "model-val", "gpu_number-val", "uuid-val", AttributeDirectionTx) + mb.RecordDcgmGpuProfilingNvlinkTrafficRateDataPoint(ts, 1, AttributeDirectionTx) defaultMetricsCount++ allMetricsCount++ - mb.RecordDcgmGpuProfilingPcieTrafficRateDataPoint(ts, 1, "model-val", "gpu_number-val", "uuid-val", AttributeDirectionTx) + mb.RecordDcgmGpuProfilingPcieTrafficRateDataPoint(ts, 1, AttributeDirectionTx) defaultMetricsCount++ allMetricsCount++ - mb.RecordDcgmGpuProfilingPipeUtilizationDataPoint(ts, 1, "model-val", "gpu_number-val", "uuid-val", AttributePipeTensor) + mb.RecordDcgmGpuProfilingPipeUtilizationDataPoint(ts, 1, AttributePipeTensor) defaultMetricsCount++ allMetricsCount++ 
- mb.RecordDcgmGpuProfilingSmOccupancyDataPoint(ts, 1, "model-val", "gpu_number-val", "uuid-val") + mb.RecordDcgmGpuProfilingSmOccupancyDataPoint(ts, 1) defaultMetricsCount++ allMetricsCount++ - mb.RecordDcgmGpuProfilingSmUtilizationDataPoint(ts, 1, "model-val", "gpu_number-val", "uuid-val") + mb.RecordDcgmGpuProfilingSmUtilizationDataPoint(ts, 1) defaultMetricsCount++ allMetricsCount++ - mb.RecordDcgmGpuUtilizationDataPoint(ts, 1, "model-val", "gpu_number-val", "uuid-val") + mb.RecordDcgmGpuUtilizationDataPoint(ts, 1) - res := pcommon.NewResource() + rb := mb.NewResourceBuilder() + rb.SetGpuModel("gpu.model-val") + rb.SetGpuNumber("gpu.number-val") + rb.SetGpuUUID("gpu.uuid-val") + res := rb.Emit() metrics := mb.Emit(WithResource(res)) if test.expectEmpty { @@ -125,16 +138,7 @@ func TestMetricsBuilder(t *testing.T) { assert.Equal(t, ts, dp.Timestamp()) assert.Equal(t, pmetric.NumberDataPointValueTypeInt, dp.ValueType()) assert.Equal(t, int64(1), dp.IntValue()) - attrVal, ok := dp.Attributes().Get("model") - assert.True(t, ok) - assert.EqualValues(t, "model-val", attrVal.Str()) - attrVal, ok = dp.Attributes().Get("gpu_number") - assert.True(t, ok) - assert.EqualValues(t, "gpu_number-val", attrVal.Str()) - attrVal, ok = dp.Attributes().Get("uuid") - assert.True(t, ok) - assert.EqualValues(t, "uuid-val", attrVal.Str()) - attrVal, ok = dp.Attributes().Get("memory_state") + attrVal, ok := dp.Attributes().Get("memory_state") assert.True(t, ok) assert.EqualValues(t, "used", attrVal.Str()) case "dcgm.gpu.profiling.dram_utilization": @@ -149,15 +153,6 @@ func TestMetricsBuilder(t *testing.T) { assert.Equal(t, ts, dp.Timestamp()) assert.Equal(t, pmetric.NumberDataPointValueTypeDouble, dp.ValueType()) assert.Equal(t, float64(1), dp.DoubleValue()) - attrVal, ok := dp.Attributes().Get("model") - assert.True(t, ok) - assert.EqualValues(t, "model-val", attrVal.Str()) - attrVal, ok = dp.Attributes().Get("gpu_number") - assert.True(t, ok) - assert.EqualValues(t, "gpu_number-val", 
attrVal.Str()) - attrVal, ok = dp.Attributes().Get("uuid") - assert.True(t, ok) - assert.EqualValues(t, "uuid-val", attrVal.Str()) case "dcgm.gpu.profiling.nvlink_traffic_rate": assert.False(t, validatedMetrics["dcgm.gpu.profiling.nvlink_traffic_rate"], "Found a duplicate in the metrics slice: dcgm.gpu.profiling.nvlink_traffic_rate") validatedMetrics["dcgm.gpu.profiling.nvlink_traffic_rate"] = true @@ -170,16 +165,7 @@ func TestMetricsBuilder(t *testing.T) { assert.Equal(t, ts, dp.Timestamp()) assert.Equal(t, pmetric.NumberDataPointValueTypeInt, dp.ValueType()) assert.Equal(t, int64(1), dp.IntValue()) - attrVal, ok := dp.Attributes().Get("model") - assert.True(t, ok) - assert.EqualValues(t, "model-val", attrVal.Str()) - attrVal, ok = dp.Attributes().Get("gpu_number") - assert.True(t, ok) - assert.EqualValues(t, "gpu_number-val", attrVal.Str()) - attrVal, ok = dp.Attributes().Get("uuid") - assert.True(t, ok) - assert.EqualValues(t, "uuid-val", attrVal.Str()) - attrVal, ok = dp.Attributes().Get("direction") + attrVal, ok := dp.Attributes().Get("direction") assert.True(t, ok) assert.EqualValues(t, "tx", attrVal.Str()) case "dcgm.gpu.profiling.pcie_traffic_rate": @@ -194,16 +180,7 @@ func TestMetricsBuilder(t *testing.T) { assert.Equal(t, ts, dp.Timestamp()) assert.Equal(t, pmetric.NumberDataPointValueTypeInt, dp.ValueType()) assert.Equal(t, int64(1), dp.IntValue()) - attrVal, ok := dp.Attributes().Get("model") - assert.True(t, ok) - assert.EqualValues(t, "model-val", attrVal.Str()) - attrVal, ok = dp.Attributes().Get("gpu_number") - assert.True(t, ok) - assert.EqualValues(t, "gpu_number-val", attrVal.Str()) - attrVal, ok = dp.Attributes().Get("uuid") - assert.True(t, ok) - assert.EqualValues(t, "uuid-val", attrVal.Str()) - attrVal, ok = dp.Attributes().Get("direction") + attrVal, ok := dp.Attributes().Get("direction") assert.True(t, ok) assert.EqualValues(t, "tx", attrVal.Str()) case "dcgm.gpu.profiling.pipe_utilization": @@ -218,16 +195,7 @@ func TestMetricsBuilder(t 
*testing.T) { assert.Equal(t, ts, dp.Timestamp()) assert.Equal(t, pmetric.NumberDataPointValueTypeDouble, dp.ValueType()) assert.Equal(t, float64(1), dp.DoubleValue()) - attrVal, ok := dp.Attributes().Get("model") - assert.True(t, ok) - assert.EqualValues(t, "model-val", attrVal.Str()) - attrVal, ok = dp.Attributes().Get("gpu_number") - assert.True(t, ok) - assert.EqualValues(t, "gpu_number-val", attrVal.Str()) - attrVal, ok = dp.Attributes().Get("uuid") - assert.True(t, ok) - assert.EqualValues(t, "uuid-val", attrVal.Str()) - attrVal, ok = dp.Attributes().Get("pipe") + attrVal, ok := dp.Attributes().Get("pipe") assert.True(t, ok) assert.EqualValues(t, "tensor", attrVal.Str()) case "dcgm.gpu.profiling.sm_occupancy": @@ -242,15 +210,6 @@ func TestMetricsBuilder(t *testing.T) { assert.Equal(t, ts, dp.Timestamp()) assert.Equal(t, pmetric.NumberDataPointValueTypeDouble, dp.ValueType()) assert.Equal(t, float64(1), dp.DoubleValue()) - attrVal, ok := dp.Attributes().Get("model") - assert.True(t, ok) - assert.EqualValues(t, "model-val", attrVal.Str()) - attrVal, ok = dp.Attributes().Get("gpu_number") - assert.True(t, ok) - assert.EqualValues(t, "gpu_number-val", attrVal.Str()) - attrVal, ok = dp.Attributes().Get("uuid") - assert.True(t, ok) - assert.EqualValues(t, "uuid-val", attrVal.Str()) case "dcgm.gpu.profiling.sm_utilization": assert.False(t, validatedMetrics["dcgm.gpu.profiling.sm_utilization"], "Found a duplicate in the metrics slice: dcgm.gpu.profiling.sm_utilization") validatedMetrics["dcgm.gpu.profiling.sm_utilization"] = true @@ -263,15 +222,6 @@ func TestMetricsBuilder(t *testing.T) { assert.Equal(t, ts, dp.Timestamp()) assert.Equal(t, pmetric.NumberDataPointValueTypeDouble, dp.ValueType()) assert.Equal(t, float64(1), dp.DoubleValue()) - attrVal, ok := dp.Attributes().Get("model") - assert.True(t, ok) - assert.EqualValues(t, "model-val", attrVal.Str()) - attrVal, ok = dp.Attributes().Get("gpu_number") - assert.True(t, ok) - assert.EqualValues(t, 
"gpu_number-val", attrVal.Str()) - attrVal, ok = dp.Attributes().Get("uuid") - assert.True(t, ok) - assert.EqualValues(t, "uuid-val", attrVal.Str()) case "dcgm.gpu.utilization": assert.False(t, validatedMetrics["dcgm.gpu.utilization"], "Found a duplicate in the metrics slice: dcgm.gpu.utilization") validatedMetrics["dcgm.gpu.utilization"] = true @@ -284,15 +234,6 @@ func TestMetricsBuilder(t *testing.T) { assert.Equal(t, ts, dp.Timestamp()) assert.Equal(t, pmetric.NumberDataPointValueTypeDouble, dp.ValueType()) assert.Equal(t, float64(1), dp.DoubleValue()) - attrVal, ok := dp.Attributes().Get("model") - assert.True(t, ok) - assert.EqualValues(t, "model-val", attrVal.Str()) - attrVal, ok = dp.Attributes().Get("gpu_number") - assert.True(t, ok) - assert.EqualValues(t, "gpu_number-val", attrVal.Str()) - attrVal, ok = dp.Attributes().Get("uuid") - assert.True(t, ok) - assert.EqualValues(t, "uuid-val", attrVal.Str()) } } }) diff --git a/receiver/dcgmreceiver/internal/metadata/generated_resource.go b/receiver/dcgmreceiver/internal/metadata/generated_resource.go new file mode 100644 index 000000000..3b9be9a4d --- /dev/null +++ b/receiver/dcgmreceiver/internal/metadata/generated_resource.go @@ -0,0 +1,50 @@ +// Code generated by mdatagen. DO NOT EDIT. + +package metadata + +import ( + "go.opentelemetry.io/collector/pdata/pcommon" +) + +// ResourceBuilder is a helper struct to build resources predefined in metadata.yaml. +// The ResourceBuilder is not thread-safe and must not to be used in multiple goroutines. +type ResourceBuilder struct { + config ResourceAttributesConfig + res pcommon.Resource +} + +// NewResourceBuilder creates a new ResourceBuilder. This method should be called on the start of the application. +func NewResourceBuilder(rac ResourceAttributesConfig) *ResourceBuilder { + return &ResourceBuilder{ + config: rac, + res: pcommon.NewResource(), + } +} + +// SetGpuModel sets provided value as "gpu.model" attribute. 
+func (rb *ResourceBuilder) SetGpuModel(val string) { + if rb.config.GpuModel.Enabled { + rb.res.Attributes().PutStr("gpu.model", val) + } +} + +// SetGpuNumber sets provided value as "gpu.number" attribute. +func (rb *ResourceBuilder) SetGpuNumber(val string) { + if rb.config.GpuNumber.Enabled { + rb.res.Attributes().PutStr("gpu.number", val) + } +} + +// SetGpuUUID sets provided value as "gpu.uuid" attribute. +func (rb *ResourceBuilder) SetGpuUUID(val string) { + if rb.config.GpuUUID.Enabled { + rb.res.Attributes().PutStr("gpu.uuid", val) + } +} + +// Emit returns the built resource and resets the internal builder state. +func (rb *ResourceBuilder) Emit() pcommon.Resource { + r := rb.res + rb.res = pcommon.NewResource() + return r +} diff --git a/receiver/dcgmreceiver/internal/metadata/generated_resource_test.go b/receiver/dcgmreceiver/internal/metadata/generated_resource_test.go new file mode 100644 index 000000000..eeaa832fc --- /dev/null +++ b/receiver/dcgmreceiver/internal/metadata/generated_resource_test.go @@ -0,0 +1,52 @@ +// Code generated by mdatagen. DO NOT EDIT. 
+ +package metadata + +import ( + "testing" + + "github.com/stretchr/testify/assert" +) + +func TestResourceBuilder(t *testing.T) { + for _, test := range []string{"default", "all_set", "none_set"} { + t.Run(test, func(t *testing.T) { + cfg := loadResourceAttributesConfig(t, test) + rb := NewResourceBuilder(cfg) + rb.SetGpuModel("gpu.model-val") + rb.SetGpuNumber("gpu.number-val") + rb.SetGpuUUID("gpu.uuid-val") + + res := rb.Emit() + assert.Equal(t, 0, rb.Emit().Attributes().Len()) // Second call should return empty Resource + + switch test { + case "default": + assert.Equal(t, 3, res.Attributes().Len()) + case "all_set": + assert.Equal(t, 3, res.Attributes().Len()) + case "none_set": + assert.Equal(t, 0, res.Attributes().Len()) + return + default: + assert.Failf(t, "unexpected test case: %s", test) + } + + val, ok := res.Attributes().Get("gpu.model") + assert.True(t, ok) + if ok { + assert.EqualValues(t, "gpu.model-val", val.Str()) + } + val, ok = res.Attributes().Get("gpu.number") + assert.True(t, ok) + if ok { + assert.EqualValues(t, "gpu.number-val", val.Str()) + } + val, ok = res.Attributes().Get("gpu.uuid") + assert.True(t, ok) + if ok { + assert.EqualValues(t, "gpu.uuid-val", val.Str()) + } + }) + } +} diff --git a/receiver/dcgmreceiver/internal/metadata/testdata/config.yaml b/receiver/dcgmreceiver/internal/metadata/testdata/config.yaml index 20fbd34ce..2b5c665af 100644 --- a/receiver/dcgmreceiver/internal/metadata/testdata/config.yaml +++ b/receiver/dcgmreceiver/internal/metadata/testdata/config.yaml @@ -17,6 +17,13 @@ all_set: enabled: true dcgm.gpu.utilization: enabled: true + resource_attributes: + gpu.model: + enabled: true + gpu.number: + enabled: true + gpu.uuid: + enabled: true none_set: metrics: dcgm.gpu.memory.bytes_used: @@ -35,3 +42,38 @@ none_set: enabled: false dcgm.gpu.utilization: enabled: false + resource_attributes: + gpu.model: + enabled: false + gpu.number: + enabled: false + gpu.uuid: + enabled: false +filter_set_include: + 
resource_attributes: + gpu.model: + enabled: true + metrics_include: + - regexp: ".*" + gpu.number: + enabled: true + metrics_include: + - regexp: ".*" + gpu.uuid: + enabled: true + metrics_include: + - regexp: ".*" +filter_set_exclude: + resource_attributes: + gpu.model: + enabled: true + metrics_exclude: + - strict: "gpu.model-val" + gpu.number: + enabled: true + metrics_exclude: + - strict: "gpu.number-val" + gpu.uuid: + enabled: true + metrics_exclude: + - strict: "gpu.uuid-val" diff --git a/receiver/dcgmreceiver/metadata.yaml b/receiver/dcgmreceiver/metadata.yaml index 47f927f6a..d226fbad6 100644 --- a/receiver/dcgmreceiver/metadata.yaml +++ b/receiver/dcgmreceiver/metadata.yaml @@ -5,19 +5,23 @@ status: beta: [metrics] -attributes: - model: +resource_attributes: + gpu.number: type: string - description: GPU model + description: GPU index starting at 0. + enabled: true - uuid: + gpu.uuid: type: string - description: GPU universally unique identifier + description: GPU universally unique identifier. + enabled: true - gpu_number: + gpu.model: type: string - description: GPU index starting at 0. + description: GPU model name. 
+ enabled: true +attributes: memory_state: type: string description: GPU memory used or free @@ -39,7 +43,6 @@ metrics: unit: 1 gauge: value_type: double - attributes: [model, gpu_number, uuid] enabled: true dcgm.gpu.memory.bytes_used: @@ -47,7 +50,7 @@ metrics: unit: By gauge: value_type: int - attributes: [model, gpu_number, uuid, memory_state] + attributes: [memory_state] enabled: true dcgm.gpu.profiling.sm_utilization: @@ -55,7 +58,6 @@ metrics: unit: 1 gauge: value_type: double - attributes: [model, gpu_number, uuid] enabled: true dcgm.gpu.profiling.sm_occupancy: @@ -63,7 +65,6 @@ metrics: unit: 1 gauge: value_type: double - attributes: [model, gpu_number, uuid] enabled: true dcgm.gpu.profiling.pipe_utilization: @@ -71,7 +72,7 @@ metrics: unit: 1 gauge: value_type: double - attributes: [model, gpu_number, uuid, pipe] + attributes: [pipe] enabled: true dcgm.gpu.profiling.dram_utilization: @@ -79,7 +80,6 @@ metrics: unit: 1 gauge: value_type: double - attributes: [model, gpu_number, uuid] enabled: true dcgm.gpu.profiling.pcie_traffic_rate: @@ -87,7 +87,7 @@ metrics: unit: By/s gauge: value_type: int - attributes: [model, gpu_number, uuid, direction] + attributes: [direction] enabled: true dcgm.gpu.profiling.nvlink_traffic_rate: @@ -95,5 +95,5 @@ metrics: unit: By/s gauge: value_type: int - attributes: [model, gpu_number, uuid, direction] + attributes: [direction] enabled: true diff --git a/receiver/dcgmreceiver/scraper.go b/receiver/dcgmreceiver/scraper.go index 2768e50d9..f2061fd9c 100644 --- a/receiver/dcgmreceiver/scraper.go +++ b/receiver/dcgmreceiver/scraper.go @@ -90,44 +90,49 @@ func (s *dcgmScraper) scrape(_ context.Context) (pmetric.Metrics, error) { deviceMetrics, err := s.client.collectDeviceMetrics() now := pcommon.NewTimestampFromTime(time.Now()) - for _, metric := range deviceMetrics { - model := s.client.getDeviceModelName(metric.gpuIndex) - UUID := s.client.getDeviceUUID(metric.gpuIndex) - gpuIndex := fmt.Sprintf("%d", metric.gpuIndex) - switch 
metric.name { - case "dcgm.gpu.utilization": - gpuUtil := float64(metric.asInt64()) / 100.0 /* normalize */ - s.mb.RecordDcgmGpuUtilizationDataPoint(now, gpuUtil, model, gpuIndex, UUID) - case "dcgm.gpu.memory.bytes_used": - bytesUsed := 1e6 * metric.asInt64() /* MB to B */ - s.mb.RecordDcgmGpuMemoryBytesUsedDataPoint(now, bytesUsed, model, gpuIndex, UUID, metadata.AttributeMemoryStateUsed) - case "dcgm.gpu.memory.bytes_free": - bytesFree := 1e6 * metric.asInt64() /* MB to B */ - s.mb.RecordDcgmGpuMemoryBytesUsedDataPoint(now, bytesFree, model, gpuIndex, UUID, metadata.AttributeMemoryStateFree) - case "dcgm.gpu.profiling.sm_utilization": - s.mb.RecordDcgmGpuProfilingSmUtilizationDataPoint(now, metric.asFloat64(), model, gpuIndex, UUID) - case "dcgm.gpu.profiling.sm_occupancy": - s.mb.RecordDcgmGpuProfilingSmOccupancyDataPoint(now, metric.asFloat64(), model, gpuIndex, UUID) - case "dcgm.gpu.profiling.tensor_utilization": - s.mb.RecordDcgmGpuProfilingPipeUtilizationDataPoint(now, metric.asFloat64(), model, gpuIndex, UUID, metadata.AttributePipeTensor) - case "dcgm.gpu.profiling.fp64_utilization": - s.mb.RecordDcgmGpuProfilingPipeUtilizationDataPoint(now, metric.asFloat64(), model, gpuIndex, UUID, metadata.AttributePipeFp64) - case "dcgm.gpu.profiling.fp32_utilization": - s.mb.RecordDcgmGpuProfilingPipeUtilizationDataPoint(now, metric.asFloat64(), model, gpuIndex, UUID, metadata.AttributePipeFp32) - case "dcgm.gpu.profiling.fp16_utilization": - s.mb.RecordDcgmGpuProfilingPipeUtilizationDataPoint(now, metric.asFloat64(), model, gpuIndex, UUID, metadata.AttributePipeFp16) - case "dcgm.gpu.profiling.dram_utilization": - s.mb.RecordDcgmGpuProfilingDramUtilizationDataPoint(now, metric.asFloat64(), model, gpuIndex, UUID) - case "dcgm.gpu.profiling.pcie_sent_bytes": - /* DCGM already returns these as bytes/sec despite the name */ - s.mb.RecordDcgmGpuProfilingPcieTrafficRateDataPoint(now, metric.asInt64(), model, gpuIndex, UUID, metadata.AttributeDirectionTx) - case 
"dcgm.gpu.profiling.pcie_received_bytes": - s.mb.RecordDcgmGpuProfilingPcieTrafficRateDataPoint(now, metric.asInt64(), model, gpuIndex, UUID, metadata.AttributeDirectionRx) - case "dcgm.gpu.profiling.nvlink_sent_bytes": - s.mb.RecordDcgmGpuProfilingNvlinkTrafficRateDataPoint(now, metric.asInt64(), model, gpuIndex, UUID, metadata.AttributeDirectionTx) - case "dcgm.gpu.profiling.nvlink_received_bytes": - s.mb.RecordDcgmGpuProfilingNvlinkTrafficRateDataPoint(now, metric.asInt64(), model, gpuIndex, UUID, metadata.AttributeDirectionRx) + for gpuIndex, metrics := range deviceMetrics { + rb := s.mb.NewResourceBuilder() + rb.SetGpuNumber(fmt.Sprintf("%d", gpuIndex)) + rb.SetGpuUUID(s.client.getDeviceUUID(gpuIndex)) + rb.SetGpuModel(s.client.getDeviceModelName(gpuIndex)) + gpuResource := rb.Emit() + for _, metric := range metrics { + switch metric.name { + case "dcgm.gpu.utilization": + gpuUtil := float64(metric.asInt64()) / 100.0 /* normalize */ + s.mb.RecordDcgmGpuUtilizationDataPoint(now, gpuUtil) + case "dcgm.gpu.memory.bytes_used": + bytesUsed := 1e6 * metric.asInt64() /* MB to B */ + s.mb.RecordDcgmGpuMemoryBytesUsedDataPoint(now, bytesUsed, metadata.AttributeMemoryStateUsed) + case "dcgm.gpu.memory.bytes_free": + bytesFree := 1e6 * metric.asInt64() /* MB to B */ + s.mb.RecordDcgmGpuMemoryBytesUsedDataPoint(now, bytesFree, metadata.AttributeMemoryStateFree) + case "dcgm.gpu.profiling.sm_utilization": + s.mb.RecordDcgmGpuProfilingSmUtilizationDataPoint(now, metric.asFloat64()) + case "dcgm.gpu.profiling.sm_occupancy": + s.mb.RecordDcgmGpuProfilingSmOccupancyDataPoint(now, metric.asFloat64()) + case "dcgm.gpu.profiling.tensor_utilization": + s.mb.RecordDcgmGpuProfilingPipeUtilizationDataPoint(now, metric.asFloat64(), metadata.AttributePipeTensor) + case "dcgm.gpu.profiling.fp64_utilization": + s.mb.RecordDcgmGpuProfilingPipeUtilizationDataPoint(now, metric.asFloat64(), metadata.AttributePipeFp64) + case "dcgm.gpu.profiling.fp32_utilization": + 
s.mb.RecordDcgmGpuProfilingPipeUtilizationDataPoint(now, metric.asFloat64(), metadata.AttributePipeFp32) + case "dcgm.gpu.profiling.fp16_utilization": + s.mb.RecordDcgmGpuProfilingPipeUtilizationDataPoint(now, metric.asFloat64(), metadata.AttributePipeFp16) + case "dcgm.gpu.profiling.dram_utilization": + s.mb.RecordDcgmGpuProfilingDramUtilizationDataPoint(now, metric.asFloat64()) + case "dcgm.gpu.profiling.pcie_sent_bytes": + /* DCGM already returns these as bytes/sec despite the name */ + s.mb.RecordDcgmGpuProfilingPcieTrafficRateDataPoint(now, metric.asInt64(), metadata.AttributeDirectionTx) + case "dcgm.gpu.profiling.pcie_received_bytes": + s.mb.RecordDcgmGpuProfilingPcieTrafficRateDataPoint(now, metric.asInt64(), metadata.AttributeDirectionRx) + case "dcgm.gpu.profiling.nvlink_sent_bytes": + s.mb.RecordDcgmGpuProfilingNvlinkTrafficRateDataPoint(now, metric.asInt64(), metadata.AttributeDirectionTx) + case "dcgm.gpu.profiling.nvlink_received_bytes": + s.mb.RecordDcgmGpuProfilingNvlinkTrafficRateDataPoint(now, metric.asInt64(), metadata.AttributeDirectionRx) + } } + s.mb.EmitForResource(metadata.WithResource(gpuResource)) } return s.mb.Emit(), err diff --git a/receiver/dcgmreceiver/scraper_gpu_test.go b/receiver/dcgmreceiver/scraper_gpu_test.go index 4ca6976a5..31018e41b 100644 --- a/receiver/dcgmreceiver/scraper_gpu_test.go +++ b/receiver/dcgmreceiver/scraper_gpu_test.go @@ -182,9 +182,12 @@ func TestScrapeOnProfilingPaused(t *testing.T) { "dcgm.gpu.memory.bytes_used", } - require.Equal(t, len(expectedMetrics), metrics.MetricCount()) + ilms := metrics.ResourceMetrics().At(0).ScopeMetrics() + require.Equal(t, 1, ilms.Len()) + + ms := ilms.At(0).Metrics() + require.Equal(t, len(expectedMetrics), ms.Len()) - ms := metrics.ResourceMetrics().At(0).ScopeMetrics().At(0).Metrics() metricWasSeen := make(map[string]bool) for i := 0; i < ms.Len(); i++ { metricWasSeen[ms.At(i).Name()] = true @@ -236,6 +239,11 @@ func validateScraperResult(t *testing.T, metrics 
pmetric.Metrics, expectedMetric assert.LessOrEqual(t, len(expectedMetrics), metrics.MetricCount()) assert.LessOrEqual(t, expectedDataPointCount, metrics.DataPointCount()) + r := metrics.ResourceMetrics().At(0).Resource() + assert.Contains(t, r.Attributes().AsRaw(), "gpu.number") + assert.Contains(t, r.Attributes().AsRaw(), "gpu.uuid") + assert.Contains(t, r.Attributes().AsRaw(), "gpu.model") + ilms := metrics.ResourceMetrics().At(0).ScopeMetrics() require.Equal(t, 1, ilms.Len()) @@ -243,11 +251,6 @@ func validateScraperResult(t *testing.T, metrics pmetric.Metrics, expectedMetric for i := 0; i < ms.Len(); i++ { m := ms.At(i) dps := m.Gauge().DataPoints() - for j := 0; j < dps.Len(); j++ { - assert.Contains(t, dps.At(j).Attributes().AsRaw(), "gpu_number") - assert.Contains(t, dps.At(j).Attributes().AsRaw(), "model") - assert.Contains(t, dps.At(j).Attributes().AsRaw(), "uuid") - } assert.LessOrEqual(t, expectedMetrics[m.Name()], dps.Len()) From 89c39197e8e2e74508c150cbb876cd10122504a0 Mon Sep 17 00:00:00 2001 From: Igor Peshansky Date: Wed, 29 May 2024 14:42:51 -0400 Subject: [PATCH 06/38] Get rid of dcgmNameToMetricName and use DCGM field names directly. --- receiver/dcgmreceiver/client.go | 22 ++++----- receiver/dcgmreceiver/client_gpu_test.go | 57 +++++++---------------- receiver/dcgmreceiver/factory_gpu_on.go | 28 ++--------- receiver/dcgmreceiver/scraper.go | 28 +++++------ receiver/dcgmreceiver/scraper_gpu_test.go | 28 +++++------ 5 files changed, 62 insertions(+), 101 deletions(-) diff --git a/receiver/dcgmreceiver/client.go b/receiver/dcgmreceiver/client.go index 15cd5e864..6e3f3e6d5 100644 --- a/receiver/dcgmreceiver/client.go +++ b/receiver/dcgmreceiver/client.go @@ -75,7 +75,7 @@ func newClient(config *Config, logger *zap.Logger) (*dcgmClient, error) { } enabledFields, unavailableFields := filterSupportedFields(requestedFieldIDs, supportedProfilingFieldIDs) for _, f := range unavailableFields { - logger.Sugar().Warnf("Field '%s' is not supported. 
Metric '%s' will not be collected", dcgmIDToName[f], dcgmNameToMetricName[dcgmIDToName[f]]) + logger.Sugar().Warnf("Field '%s' is not supported. Metric '%s' will not be collected", dcgmIDToName[f], dcgmIDToName[f]) } if len(enabledFields) != 0 { deviceIndices, names, UUIDs, err = discoverDevices(logger) @@ -323,34 +323,34 @@ func (client *dcgmClient) collectDeviceMetrics() (map[uint][]dcgmMetric, error) func (client *dcgmClient) appendMetric(gpuMetrics []dcgmMetric, gpuIndex uint, fieldValues []dcgm.FieldValue_v1) []dcgmMetric { for _, fieldValue := range fieldValues { - metricName := dcgmNameToMetricName[dcgmIDToName[dcgm.Short(fieldValue.FieldId)]] + dcgmName := dcgmIDToName[dcgm.Short(fieldValue.FieldId)] if !isValidValue(fieldValue) { - msg := fmt.Sprintf("Received invalid value (ts %d gpu %d) %s", fieldValue.Ts, gpuIndex, metricName) - client.issueWarningForFailedQueryUptoThreshold(gpuIndex, metricName, msg) + msg := fmt.Sprintf("Received invalid value (ts %d gpu %d) %s", fieldValue.Ts, gpuIndex, dcgmName) + client.issueWarningForFailedQueryUptoThreshold(gpuIndex, dcgmName, msg) continue } switch fieldValue.FieldType { case dcgm.DCGM_FT_DOUBLE: - client.logger.Debugf("Discovered (ts %d gpu %d) %s = %.3f (f64)", fieldValue.Ts, gpuIndex, metricName, fieldValue.Float64()) + client.logger.Debugf("Discovered (ts %d gpu %d) %s = %.3f (f64)", fieldValue.Ts, gpuIndex, dcgmName, fieldValue.Float64()) case dcgm.DCGM_FT_INT64: - client.logger.Debugf("Discovered (ts %d gpu %d) %s = %d (i64)", fieldValue.Ts, gpuIndex, metricName, fieldValue.Int64()) + client.logger.Debugf("Discovered (ts %d gpu %d) %s = %d (i64)", fieldValue.Ts, gpuIndex, dcgmName, fieldValue.Int64()) } - gpuMetrics = append(gpuMetrics, dcgmMetric{fieldValue.Ts, metricName, fieldValue.Value}) + gpuMetrics = append(gpuMetrics, dcgmMetric{fieldValue.Ts, dcgmName, fieldValue.Value}) } return gpuMetrics } -func (client *dcgmClient) issueWarningForFailedQueryUptoThreshold(deviceIdx uint, metricName string, 
reason string) { - deviceMetric := fmt.Sprintf("device%d.%s", deviceIdx, metricName) +func (client *dcgmClient) issueWarningForFailedQueryUptoThreshold(deviceIdx uint, dcgmName string, reason string) { + deviceMetric := fmt.Sprintf("device%d.%s", deviceIdx, dcgmName) client.deviceMetricToFailedQueryCount[deviceMetric]++ failedCount := client.deviceMetricToFailedQueryCount[deviceMetric] if failedCount <= maxWarningsForFailedDeviceMetricQuery { - client.logger.Warnf("Unable to query '%s' for Nvidia device %d on '%s'", metricName, deviceIdx, reason) + client.logger.Warnf("Unable to query '%s' for Nvidia device %d on '%s'", dcgmName, deviceIdx, reason) if failedCount == maxWarningsForFailedDeviceMetricQuery { - client.logger.Warnf("Surpressing further device query warnings for '%s' for Nvidia device %d", metricName, deviceIdx) + client.logger.Warnf("Surpressing further device query warnings for '%s' for Nvidia device %d", dcgmName, deviceIdx) } } } diff --git a/receiver/dcgmreceiver/client_gpu_test.go b/receiver/dcgmreceiver/client_gpu_test.go index ade5050aa..b451eaf58 100644 --- a/receiver/dcgmreceiver/client_gpu_test.go +++ b/receiver/dcgmreceiver/client_gpu_test.go @@ -28,7 +28,6 @@ import ( "testing" "time" - "github.com/NVIDIA/go-dcgm/pkg/dcgm" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" "go.uber.org/zap/zaptest" @@ -62,17 +61,13 @@ func TestSupportedFieldsWithGolden(t *testing.T) { require.Nil(t, err) enabledFields, unavailableFields := filterSupportedFields(allFields, supportedProfilingFields) - dcgmIDToNameMap := make(map[dcgm.Short]string, len(dcgm.DCGM_FI)) - for fieldName, fieldID := range dcgm.DCGM_FI { - dcgmIDToNameMap[fieldID] = fieldName - } var enabledFieldsString []string var unavailableFieldsString []string for _, f := range enabledFields { - enabledFieldsString = append(enabledFieldsString, dcgmIDToNameMap[f]) + enabledFieldsString = append(enabledFieldsString, dcgmIDToName[f]) } for _, f := range unavailableFields { 
- unavailableFieldsString = append(unavailableFieldsString, dcgmIDToNameMap[f]) + unavailableFieldsString = append(unavailableFieldsString, dcgmIDToName[f]) } m := modelSupportedFields{ Model: gpuModel, @@ -83,7 +78,7 @@ func TestSupportedFieldsWithGolden(t *testing.T) { if err != nil { t.Fatal(err) } - assert.Equal(t, len(dcgmNameToMetricName), len(client.enabledFieldIDs)+len(unavailableFieldsString)) + assert.Equal(t, len(allFields), len(client.enabledFieldIDs)+len(unavailableFieldsString)) goldenPath := getModelGoldenFilePath(t, gpuModel) golden.Assert(t, string(actual), goldenPath) client.cleanup() @@ -93,22 +88,6 @@ func TestSupportedFieldsWithGolden(t *testing.T) { // file, given a GPU model string func LoadExpectedMetrics(t *testing.T, model string) []string { t.Helper() - dcgmNameToMetricNameMap := map[string]string{ - "DCGM_FI_DEV_GPU_UTIL": "dcgm.gpu.utilization", - "DCGM_FI_DEV_FB_USED": "dcgm.gpu.memory.bytes_used", - "DCGM_FI_DEV_FB_FREE": "dcgm.gpu.memory.bytes_free", - "DCGM_FI_PROF_SM_ACTIVE": "dcgm.gpu.profiling.sm_utilization", - "DCGM_FI_PROF_SM_OCCUPANCY": "dcgm.gpu.profiling.sm_occupancy", - "DCGM_FI_PROF_PIPE_TENSOR_ACTIVE": "dcgm.gpu.profiling.tensor_utilization", - "DCGM_FI_PROF_DRAM_ACTIVE": "dcgm.gpu.profiling.dram_utilization", - "DCGM_FI_PROF_PIPE_FP64_ACTIVE": "dcgm.gpu.profiling.fp64_utilization", - "DCGM_FI_PROF_PIPE_FP32_ACTIVE": "dcgm.gpu.profiling.fp32_utilization", - "DCGM_FI_PROF_PIPE_FP16_ACTIVE": "dcgm.gpu.profiling.fp16_utilization", - "DCGM_FI_PROF_PCIE_TX_BYTES": "dcgm.gpu.profiling.pcie_sent_bytes", - "DCGM_FI_PROF_PCIE_RX_BYTES": "dcgm.gpu.profiling.pcie_received_bytes", - "DCGM_FI_PROF_NVLINK_TX_BYTES": "dcgm.gpu.profiling.nvlink_sent_bytes", - "DCGM_FI_PROF_NVLINK_RX_BYTES": "dcgm.gpu.profiling.nvlink_received_bytes", - } goldenPath := getModelGoldenFilePath(t, model) goldenFile, err := ioutil.ReadFile(goldenPath) if err != nil { @@ -121,7 +100,7 @@ func LoadExpectedMetrics(t *testing.T, model string) []string { } var 
expectedMetrics []string for _, supported := range m.SupportedFields { - expectedMetrics = append(expectedMetrics, dcgmNameToMetricNameMap[supported]) + expectedMetrics = append(expectedMetrics, supported) } return expectedMetrics } @@ -166,37 +145,37 @@ func TestCollectGpuProfilingMetrics(t *testing.T) { for gpuIndex, metrics := range deviceMetrics { for _, metric := range metrics { switch metric.name { - case "dcgm.gpu.profiling.tensor_utilization": + case "DCGM_FI_PROF_PIPE_TENSOR_ACTIVE": fallthrough - case "dcgm.gpu.profiling.dram_utilization": + case "DCGM_FI_PROF_DRAM_ACTIVE": fallthrough - case "dcgm.gpu.profiling.fp64_utilization": + case "DCGM_FI_PROF_PIPE_FP64_ACTIVE": fallthrough - case "dcgm.gpu.profiling.fp32_utilization": + case "DCGM_FI_PROF_PIPE_FP32_ACTIVE": fallthrough - case "dcgm.gpu.profiling.fp16_utilization": + case "DCGM_FI_PROF_PIPE_FP16_ACTIVE": fallthrough - case "dcgm.gpu.profiling.sm_occupancy": + case "DCGM_FI_PROF_SM_OCCUPANCY": fallthrough - case "dcgm.gpu.profiling.sm_utilization": + case "DCGM_FI_PROF_SM_ACTIVE": assert.GreaterOrEqual(t, metric.asFloat64(), float64(0.0)) assert.LessOrEqual(t, metric.asFloat64(), float64(1.0)) - case "dcgm.gpu.utilization": + case "DCGM_FI_DEV_GPU_UTIL": assert.GreaterOrEqual(t, metric.asInt64(), int64(0)) assert.LessOrEqual(t, metric.asInt64(), int64(100)) - case "dcgm.gpu.memory.bytes_free": + case "DCGM_FI_DEV_FB_FREE": fallthrough - case "dcgm.gpu.memory.bytes_used": + case "DCGM_FI_DEV_FB_USED": // arbitrary max of 10 TiB assert.GreaterOrEqual(t, metric.asInt64(), int64(0)) assert.LessOrEqual(t, metric.asInt64(), int64(10485760)) - case "dcgm.gpu.profiling.pcie_sent_bytes": + case "DCGM_FI_PROF_PCIE_TX_BYTES": fallthrough - case "dcgm.gpu.profiling.pcie_received_bytes": + case "DCGM_FI_PROF_PCIE_RX_BYTES": fallthrough - case "dcgm.gpu.profiling.nvlink_sent_bytes": + case "DCGM_FI_PROF_NVLINK_TX_BYTES": fallthrough - case "dcgm.gpu.profiling.nvlink_received_bytes": + case 
"DCGM_FI_PROF_NVLINK_RX_BYTES": // arbitrary max of 10 TiB/sec assert.GreaterOrEqual(t, metric.asInt64(), int64(0)) assert.LessOrEqual(t, metric.asInt64(), int64(10995116277760)) diff --git a/receiver/dcgmreceiver/factory_gpu_on.go b/receiver/dcgmreceiver/factory_gpu_on.go index 08295aa12..b3114e334 100644 --- a/receiver/dcgmreceiver/factory_gpu_on.go +++ b/receiver/dcgmreceiver/factory_gpu_on.go @@ -21,6 +21,7 @@ import ( "context" "fmt" "math/rand" + "strings" "time" "github.com/NVIDIA/go-dcgm/pkg/dcgm" @@ -33,37 +34,18 @@ import ( ) var dcgmIDToName map[dcgm.Short]string -var dcgmNameToMetricName map[string]string var metricNameToDcgmName map[string]string +var dcgmNameToMetricName map[string]string var randSource = rand.New(rand.NewSource(time.Now().UnixMicro())) func init() { dcgmIDToName = make(map[dcgm.Short]string, len(dcgm.DCGM_FI)) for fieldName, fieldID := range dcgm.DCGM_FI { + if strings.HasPrefix(fieldName, "DCGM_FT_") { + continue + } dcgmIDToName[fieldID] = fieldName } - - dcgmNameToMetricName = map[string]string{ - "DCGM_FI_DEV_GPU_UTIL": "dcgm.gpu.utilization", - "DCGM_FI_DEV_FB_USED": "dcgm.gpu.memory.bytes_used", - "DCGM_FI_DEV_FB_FREE": "dcgm.gpu.memory.bytes_free", - "DCGM_FI_PROF_SM_ACTIVE": "dcgm.gpu.profiling.sm_utilization", - "DCGM_FI_PROF_SM_OCCUPANCY": "dcgm.gpu.profiling.sm_occupancy", - "DCGM_FI_PROF_PIPE_TENSOR_ACTIVE": "dcgm.gpu.profiling.tensor_utilization", - "DCGM_FI_PROF_DRAM_ACTIVE": "dcgm.gpu.profiling.dram_utilization", - "DCGM_FI_PROF_PIPE_FP64_ACTIVE": "dcgm.gpu.profiling.fp64_utilization", - "DCGM_FI_PROF_PIPE_FP32_ACTIVE": "dcgm.gpu.profiling.fp32_utilization", - "DCGM_FI_PROF_PIPE_FP16_ACTIVE": "dcgm.gpu.profiling.fp16_utilization", - "DCGM_FI_PROF_PCIE_TX_BYTES": "dcgm.gpu.profiling.pcie_sent_bytes", - "DCGM_FI_PROF_PCIE_RX_BYTES": "dcgm.gpu.profiling.pcie_received_bytes", - "DCGM_FI_PROF_NVLINK_TX_BYTES": "dcgm.gpu.profiling.nvlink_sent_bytes", - "DCGM_FI_PROF_NVLINK_RX_BYTES": 
"dcgm.gpu.profiling.nvlink_received_bytes", - } - - metricNameToDcgmName = make(map[string]string, len(dcgmNameToMetricName)) - for dcgmName, metricName := range dcgmNameToMetricName { - metricNameToDcgmName[metricName] = dcgmName - } } func createMetricsReceiver( diff --git a/receiver/dcgmreceiver/scraper.go b/receiver/dcgmreceiver/scraper.go index f2061fd9c..e8db2904b 100644 --- a/receiver/dcgmreceiver/scraper.go +++ b/receiver/dcgmreceiver/scraper.go @@ -98,37 +98,37 @@ func (s *dcgmScraper) scrape(_ context.Context) (pmetric.Metrics, error) { gpuResource := rb.Emit() for _, metric := range metrics { switch metric.name { - case "dcgm.gpu.utilization": + case "DCGM_FI_DEV_GPU_UTIL": gpuUtil := float64(metric.asInt64()) / 100.0 /* normalize */ s.mb.RecordDcgmGpuUtilizationDataPoint(now, gpuUtil) - case "dcgm.gpu.memory.bytes_used": + case "DCGM_FI_DEV_FB_USED": bytesUsed := 1e6 * metric.asInt64() /* MB to B */ s.mb.RecordDcgmGpuMemoryBytesUsedDataPoint(now, bytesUsed, metadata.AttributeMemoryStateUsed) - case "dcgm.gpu.memory.bytes_free": + case "DCGM_FI_DEV_FB_FREE": bytesFree := 1e6 * metric.asInt64() /* MB to B */ s.mb.RecordDcgmGpuMemoryBytesUsedDataPoint(now, bytesFree, metadata.AttributeMemoryStateFree) - case "dcgm.gpu.profiling.sm_utilization": + case "DCGM_FI_PROF_SM_ACTIVE": s.mb.RecordDcgmGpuProfilingSmUtilizationDataPoint(now, metric.asFloat64()) - case "dcgm.gpu.profiling.sm_occupancy": + case "DCGM_FI_PROF_SM_OCCUPANCY": s.mb.RecordDcgmGpuProfilingSmOccupancyDataPoint(now, metric.asFloat64()) - case "dcgm.gpu.profiling.tensor_utilization": + case "DCGM_FI_PROF_PIPE_TENSOR_ACTIVE": s.mb.RecordDcgmGpuProfilingPipeUtilizationDataPoint(now, metric.asFloat64(), metadata.AttributePipeTensor) - case "dcgm.gpu.profiling.fp64_utilization": + case "DCGM_FI_PROF_PIPE_FP64_ACTIVE": s.mb.RecordDcgmGpuProfilingPipeUtilizationDataPoint(now, metric.asFloat64(), metadata.AttributePipeFp64) - case "dcgm.gpu.profiling.fp32_utilization": + case 
"DCGM_FI_PROF_PIPE_FP32_ACTIVE": s.mb.RecordDcgmGpuProfilingPipeUtilizationDataPoint(now, metric.asFloat64(), metadata.AttributePipeFp32) - case "dcgm.gpu.profiling.fp16_utilization": + case "DCGM_FI_PROF_PIPE_FP16_ACTIVE": s.mb.RecordDcgmGpuProfilingPipeUtilizationDataPoint(now, metric.asFloat64(), metadata.AttributePipeFp16) - case "dcgm.gpu.profiling.dram_utilization": + case "DCGM_FI_PROF_DRAM_ACTIVE": s.mb.RecordDcgmGpuProfilingDramUtilizationDataPoint(now, metric.asFloat64()) - case "dcgm.gpu.profiling.pcie_sent_bytes": + case "DCGM_FI_PROF_PCIE_TX_BYTES": /* DCGM already returns these as bytes/sec despite the name */ s.mb.RecordDcgmGpuProfilingPcieTrafficRateDataPoint(now, metric.asInt64(), metadata.AttributeDirectionTx) - case "dcgm.gpu.profiling.pcie_received_bytes": + case "DCGM_FI_PROF_PCIE_RX_BYTES": s.mb.RecordDcgmGpuProfilingPcieTrafficRateDataPoint(now, metric.asInt64(), metadata.AttributeDirectionRx) - case "dcgm.gpu.profiling.nvlink_sent_bytes": + case "DCGM_FI_PROF_NVLINK_TX_BYTES": s.mb.RecordDcgmGpuProfilingNvlinkTrafficRateDataPoint(now, metric.asInt64(), metadata.AttributeDirectionTx) - case "dcgm.gpu.profiling.nvlink_received_bytes": + case "DCGM_FI_PROF_NVLINK_RX_BYTES": s.mb.RecordDcgmGpuProfilingNvlinkTrafficRateDataPoint(now, metric.asInt64(), metadata.AttributeDirectionRx) } } diff --git a/receiver/dcgmreceiver/scraper_gpu_test.go b/receiver/dcgmreceiver/scraper_gpu_test.go index 31018e41b..2cd5845b7 100644 --- a/receiver/dcgmreceiver/scraper_gpu_test.go +++ b/receiver/dcgmreceiver/scraper_gpu_test.go @@ -205,20 +205,20 @@ func loadExpectedScraperMetrics(t *testing.T, model string) map[string]int { t.Helper() expectedMetrics := make(map[string]int) receiverMetricNameToScraperMetricName := map[string]string{ - "dcgm.gpu.utilization": "dcgm.gpu.utilization", - "dcgm.gpu.memory.bytes_used": "dcgm.gpu.memory.bytes_used", - "dcgm.gpu.memory.bytes_free": "dcgm.gpu.memory.bytes_used", - "dcgm.gpu.profiling.sm_utilization": 
"dcgm.gpu.profiling.sm_utilization", - "dcgm.gpu.profiling.sm_occupancy": "dcgm.gpu.profiling.sm_occupancy", - "dcgm.gpu.profiling.dram_utilization": "dcgm.gpu.profiling.dram_utilization", - "dcgm.gpu.profiling.tensor_utilization": "dcgm.gpu.profiling.pipe_utilization", - "dcgm.gpu.profiling.fp64_utilization": "dcgm.gpu.profiling.pipe_utilization", - "dcgm.gpu.profiling.fp32_utilization": "dcgm.gpu.profiling.pipe_utilization", - "dcgm.gpu.profiling.fp16_utilization": "dcgm.gpu.profiling.pipe_utilization", - "dcgm.gpu.profiling.pcie_sent_bytes": "dcgm.gpu.profiling.pcie_traffic_rate", - "dcgm.gpu.profiling.pcie_received_bytes": "dcgm.gpu.profiling.pcie_traffic_rate", - "dcgm.gpu.profiling.nvlink_sent_bytes": "dcgm.gpu.profiling.nvlink_traffic_rate", - "dcgm.gpu.profiling.nvlink_received_bytes": "dcgm.gpu.profiling.nvlink_traffic_rate", + "DCGM_FI_DEV_GPU_UTIL": "dcgm.gpu.utilization", + "DCGM_FI_DEV_FB_USED": "dcgm.gpu.memory.bytes_used", + "DCGM_FI_DEV_FB_FREE": "dcgm.gpu.memory.bytes_used", + "DCGM_FI_PROF_SM_ACTIVE": "dcgm.gpu.profiling.sm_utilization", + "DCGM_FI_PROF_SM_OCCUPANCY": "dcgm.gpu.profiling.sm_occupancy", + "DCGM_FI_PROF_DRAM_ACTIVE": "dcgm.gpu.profiling.dram_utilization", + "DCGM_FI_PROF_PIPE_TENSOR_ACTIVE": "dcgm.gpu.profiling.pipe_utilization", + "DCGM_FI_PROF_PIPE_FP64_ACTIVE": "dcgm.gpu.profiling.pipe_utilization", + "DCGM_FI_PROF_PIPE_FP32_ACTIVE": "dcgm.gpu.profiling.pipe_utilization", + "DCGM_FI_PROF_PIPE_FP16_ACTIVE": "dcgm.gpu.profiling.pipe_utilization", + "DCGM_FI_PROF_PCIE_TX_BYTES": "dcgm.gpu.profiling.pcie_traffic_rate", + "DCGM_FI_PROF_PCIE_RX_BYTES": "dcgm.gpu.profiling.pcie_traffic_rate", + "DCGM_FI_PROF_NVLINK_TX_BYTES": "dcgm.gpu.profiling.nvlink_traffic_rate", + "DCGM_FI_PROF_NVLINK_RX_BYTES": "dcgm.gpu.profiling.nvlink_traffic_rate", } expectedReceiverMetrics := LoadExpectedMetrics(t, model) for _, em := range expectedReceiverMetrics { From 70d1caf749a751892f223a044a9893b0c2d1b63b Mon Sep 17 00:00:00 2001 From: Igor Peshansky 
Date: Thu, 20 Jun 2024 15:06:53 -0400 Subject: [PATCH 07/38] More precise value errors. --- receiver/dcgmreceiver/client.go | 4 ++-- receiver/dcgmreceiver/util.go | 39 ++++++++++++++++++++------------- 2 files changed, 26 insertions(+), 17 deletions(-) diff --git a/receiver/dcgmreceiver/client.go b/receiver/dcgmreceiver/client.go index 6e3f3e6d5..b10283194 100644 --- a/receiver/dcgmreceiver/client.go +++ b/receiver/dcgmreceiver/client.go @@ -324,8 +324,8 @@ func (client *dcgmClient) collectDeviceMetrics() (map[uint][]dcgmMetric, error) func (client *dcgmClient) appendMetric(gpuMetrics []dcgmMetric, gpuIndex uint, fieldValues []dcgm.FieldValue_v1) []dcgmMetric { for _, fieldValue := range fieldValues { dcgmName := dcgmIDToName[dcgm.Short(fieldValue.FieldId)] - if !isValidValue(fieldValue) { - msg := fmt.Sprintf("Received invalid value (ts %d gpu %d) %s", fieldValue.Ts, gpuIndex, dcgmName) + if err := isValidValue(fieldValue); err != nil { + msg := fmt.Sprintf("Received invalid value (ts %d gpu %d) %s: %v", fieldValue.Ts, gpuIndex, dcgmName, err) client.issueWarningForFailedQueryUptoThreshold(gpuIndex, dcgmName, msg) continue } diff --git a/receiver/dcgmreceiver/util.go b/receiver/dcgmreceiver/util.go index 0ee795a4d..9961b0105 100644 --- a/receiver/dcgmreceiver/util.go +++ b/receiver/dcgmreceiver/util.go @@ -18,11 +18,20 @@ package dcgmreceiver import ( + "fmt" "unsafe" "github.com/NVIDIA/go-dcgm/pkg/dcgm" ) +var ( + blankValueError = fmt.Errorf("unspecified blank value") + dataNotFoundError = fmt.Errorf("data not found") + notSupportedError = fmt.Errorf("field not supported") + permissionDeniedError = fmt.Errorf("no permission to fetch value") + unexpectedTypeError = fmt.Errorf("unexpected data type") +) + func (m *dcgmMetric) setFloat64(val float64) { *(*float64)(unsafe.Pointer(&m.value[0])) = val } @@ -39,44 +48,44 @@ func (m *dcgmMetric) asInt64() int64 { return *(*int64)(unsafe.Pointer(&m.value[0])) } -func isValidValue(fieldValue dcgm.FieldValue_v1) bool { 
+func isValidValue(fieldValue dcgm.FieldValue_v1) error { switch fieldValue.FieldType { case dcgm.DCGM_FT_DOUBLE: switch v := fieldValue.Float64(); v { case dcgm.DCGM_FT_FP64_BLANK: - return false + return blankValueError case dcgm.DCGM_FT_FP64_NOT_FOUND: - return false + return dataNotFoundError case dcgm.DCGM_FT_FP64_NOT_SUPPORTED: - return false + return notSupportedError case dcgm.DCGM_FT_FP64_NOT_PERMISSIONED: - return false + return permissionDeniedError } case dcgm.DCGM_FT_INT64: switch v := fieldValue.Int64(); v { case dcgm.DCGM_FT_INT32_BLANK: - return false + return blankValueError case dcgm.DCGM_FT_INT32_NOT_FOUND: - return false + return dataNotFoundError case dcgm.DCGM_FT_INT32_NOT_SUPPORTED: - return false + return notSupportedError case dcgm.DCGM_FT_INT32_NOT_PERMISSIONED: - return false + return permissionDeniedError case dcgm.DCGM_FT_INT64_BLANK: - return false + return blankValueError case dcgm.DCGM_FT_INT64_NOT_FOUND: - return false + return dataNotFoundError case dcgm.DCGM_FT_INT64_NOT_SUPPORTED: - return false + return notSupportedError case dcgm.DCGM_FT_INT64_NOT_PERMISSIONED: - return false + return permissionDeniedError } // dcgm.DCGM_FT_STRING also exists but we don't expect it default: - return false + return unexpectedTypeError } - return true + return nil } From 6c3748cfa5ab9b1ec99b9f0ad7a460d2f294373a Mon Sep 17 00:00:00 2001 From: Igor Peshansky Date: Wed, 29 May 2024 01:09:59 -0400 Subject: [PATCH 08/38] Add most metrics from the doc. 
--- receiver/dcgmreceiver/documentation.md | 172 ++- .../internal/metadata/generated_config.go | 64 + .../metadata/generated_config_test.go | 32 + .../internal/metadata/generated_metrics.go | 1294 +++++++++++++++-- .../metadata/generated_metrics_test.go | 289 ++++ .../internal/metadata/testdata/config.yaml | 64 + receiver/dcgmreceiver/metadata.yaml | 150 +- 7 files changed, 1924 insertions(+), 141 deletions(-) diff --git a/receiver/dcgmreceiver/documentation.md b/receiver/dcgmreceiver/documentation.md index 5a7f947d8..5e17a9c1d 100644 --- a/receiver/dcgmreceiver/documentation.md +++ b/receiver/dcgmreceiver/documentation.md @@ -24,7 +24,7 @@ Current number of GPU memory bytes used by state. Summing the values of all stat | Name | Description | Values | | ---- | ----------- | ------ | -| memory_state | GPU memory used or free | Str: ``used``, ``free`` | +| memory_state | GPU memory state, one of [free, used, reserved]. | Str: ``used``, ``free``, ``reserved`` | ### dcgm.gpu.profiling.dram_utilization @@ -100,6 +100,176 @@ Fraction of time the GPU was not idle. | ---- | ----------- | ---------- | | 1 | Gauge | Double | +### gpu.dcgm.clock.frequency + +Multiprocessor clock frequency. + +| Unit | Metric Type | Value Type | +| ---- | ----------- | ---------- | +| Hz | Gauge | Double | + +### gpu.dcgm.clock.throttle_duration.time + +Clock throttle total duration. + +| Unit | Metric Type | Value Type | Aggregation Temporality | Monotonic | +| ---- | ----------- | ---------- | ----------------------- | --------- | +| s | Sum | Double | Cumulative | true | + +#### Attributes + +| Name | Description | Values | +| ---- | ----------- | ------ | +| violation | Reason for throttling, one of [power, thermal, sync_boost, board_limit, low_util, reliability, app_clock, base_clock]. | Str: ``power``, ``thermal``, ``sync_boost``, ``board_limit``, ``low_util``, ``reliability``, ``app_clock``, ``base_clock`` | + +### gpu.dcgm.codec.decoder.utilization + +Decoder utilization. 
+ +| Unit | Metric Type | Value Type | +| ---- | ----------- | ---------- | +| 1 | Gauge | Double | + +### gpu.dcgm.codec.encoder.utilization + +Encoder utilization. + +| Unit | Metric Type | Value Type | +| ---- | ----------- | ---------- | +| 1 | Gauge | Double | + +### gpu.dcgm.ecc_errors + +Data corruption errors. + +| Unit | Metric Type | Value Type | Aggregation Temporality | Monotonic | +| ---- | ----------- | ---------- | ----------------------- | --------- | +| 1 | Sum | Int | Cumulative | true | + +#### Attributes + +| Name | Description | Values | +| ---- | ----------- | ------ | +| error_type | The type of error, one of [sbe, dbe]. | Str: ``sbe``, ``dbe`` | + +### gpu.dcgm.energy_consumption + +Total energy consumption for the GPU in J since the driver was last reloaded. + +| Unit | Metric Type | Value Type | Aggregation Temporality | Monotonic | +| ---- | ----------- | ---------- | ----------------------- | --------- | +| J | Sum | Double | Cumulative | true | + +### gpu.dcgm.memory.bandwidth_utilization + +Fraction of cycles data was being sent or received from GPU memory. + +| Unit | Metric Type | Value Type | +| ---- | ----------- | ---------- | +| 1 | Gauge | Double | + +### gpu.dcgm.memory.bytes_used + +Current number of GPU memory bytes used by state. Summing the values of all states yields the total GPU memory space. + +| Unit | Metric Type | Value Type | +| ---- | ----------- | ---------- | +| By | Gauge | Int | + +#### Attributes + +| Name | Description | Values | +| ---- | ----------- | ------ | +| memory_state | GPU memory state, one of [free, used, reserved]. | Str: ``used``, ``free``, ``reserved`` | + +### gpu.dcgm.nvlink.traffic + +The number of bytes sent over NVLink, not including protocol headers. 
+ +| Unit | Metric Type | Value Type | Aggregation Temporality | Monotonic | +| ---- | ----------- | ---------- | ----------------------- | --------- | +| By | Sum | Int | Delta | true | + +#### Attributes + +| Name | Description | Values | +| ---- | ----------- | ------ | +| direction | Direction of the link traffic, one of [tx, rx]. | Str: ``tx``, ``rx`` | + +### gpu.dcgm.pcie.traffic + +The number of bytes sent over the PCIe bus, including both protocol headers and data payloads. + +| Unit | Metric Type | Value Type | Aggregation Temporality | Monotonic | +| ---- | ----------- | ---------- | ----------------------- | --------- | +| By | Sum | Int | Delta | true | + +#### Attributes + +| Name | Description | Values | +| ---- | ----------- | ------ | +| direction | Direction of the link traffic, one of [tx, rx]. | Str: ``tx``, ``rx`` | + +### gpu.dcgm.pipe.utilization + +Fraction of cycles the corresponding GPU pipe was active, averaged over time and all multiprocessors. + +| Unit | Metric Type | Value Type | +| ---- | ----------- | ---------- | +| 1 | Gauge | Double | + +#### Attributes + +| Name | Description | Values | +| ---- | ----------- | ------ | +| pipe | GPU pipe in use, one of [tensor, fp64, fp32, fp16]. | Str: ``tensor``, ``fp64``, ``fp32``, ``fp16`` | + +### gpu.dcgm.sm.occupancy + +Fraction of the number of warps resident on a multiprocessor, averaged over all multiprocessors. + +| Unit | Metric Type | Value Type | +| ---- | ----------- | ---------- | +| 1 | Gauge | Double | + +### gpu.dcgm.sm.utilization + +Fraction of time at least one warp was active on a multiprocessor, averaged over all multiprocessors. + +| Unit | Metric Type | Value Type | +| ---- | ----------- | ---------- | +| 1 | Gauge | Double | + +### gpu.dcgm.temperature + +Current temperature readings for the device, in ˚C. 
+ +| Unit | Metric Type | Value Type | +| ---- | ----------- | ---------- | +| Cel | Gauge | Double | + +### gpu.dcgm.utilization + +Ratio of time the graphics engine is active. + +| Unit | Metric Type | Value Type | +| ---- | ----------- | ---------- | +| 1 | Gauge | Double | + +### gpu.dcgm.xid_errors + +XID errors. + +| Unit | Metric Type | Value Type | Aggregation Temporality | Monotonic | +| ---- | ----------- | ---------- | ----------------------- | --------- | +| 1 | Sum | Int | Cumulative | true | + +#### Attributes + +| Name | Description | Values | +| ---- | ----------- | ------ | +| xid | The XID code for the error, 1..143. | Any Int | + ## Resource Attributes | Name | Description | Values | Enabled | diff --git a/receiver/dcgmreceiver/internal/metadata/generated_config.go b/receiver/dcgmreceiver/internal/metadata/generated_config.go index 46c17ee8e..2d6f5708b 100644 --- a/receiver/dcgmreceiver/internal/metadata/generated_config.go +++ b/receiver/dcgmreceiver/internal/metadata/generated_config.go @@ -36,6 +36,22 @@ type MetricsConfig struct { DcgmGpuProfilingSmOccupancy MetricConfig `mapstructure:"dcgm.gpu.profiling.sm_occupancy"` DcgmGpuProfilingSmUtilization MetricConfig `mapstructure:"dcgm.gpu.profiling.sm_utilization"` DcgmGpuUtilization MetricConfig `mapstructure:"dcgm.gpu.utilization"` + GpuDcgmClockFrequency MetricConfig `mapstructure:"gpu.dcgm.clock.frequency"` + GpuDcgmClockThrottleDurationTime MetricConfig `mapstructure:"gpu.dcgm.clock.throttle_duration.time"` + GpuDcgmCodecDecoderUtilization MetricConfig `mapstructure:"gpu.dcgm.codec.decoder.utilization"` + GpuDcgmCodecEncoderUtilization MetricConfig `mapstructure:"gpu.dcgm.codec.encoder.utilization"` + GpuDcgmEccErrors MetricConfig `mapstructure:"gpu.dcgm.ecc_errors"` + GpuDcgmEnergyConsumption MetricConfig `mapstructure:"gpu.dcgm.energy_consumption"` + GpuDcgmMemoryBandwidthUtilization MetricConfig `mapstructure:"gpu.dcgm.memory.bandwidth_utilization"` + GpuDcgmMemoryBytesUsed MetricConfig 
`mapstructure:"gpu.dcgm.memory.bytes_used"` + GpuDcgmNvlinkTraffic MetricConfig `mapstructure:"gpu.dcgm.nvlink.traffic"` + GpuDcgmPcieTraffic MetricConfig `mapstructure:"gpu.dcgm.pcie.traffic"` + GpuDcgmPipeUtilization MetricConfig `mapstructure:"gpu.dcgm.pipe.utilization"` + GpuDcgmSmOccupancy MetricConfig `mapstructure:"gpu.dcgm.sm.occupancy"` + GpuDcgmSmUtilization MetricConfig `mapstructure:"gpu.dcgm.sm.utilization"` + GpuDcgmTemperature MetricConfig `mapstructure:"gpu.dcgm.temperature"` + GpuDcgmUtilization MetricConfig `mapstructure:"gpu.dcgm.utilization"` + GpuDcgmXidErrors MetricConfig `mapstructure:"gpu.dcgm.xid_errors"` } func DefaultMetricsConfig() MetricsConfig { @@ -64,6 +80,54 @@ func DefaultMetricsConfig() MetricsConfig { DcgmGpuUtilization: MetricConfig{ Enabled: true, }, + GpuDcgmClockFrequency: MetricConfig{ + Enabled: true, + }, + GpuDcgmClockThrottleDurationTime: MetricConfig{ + Enabled: true, + }, + GpuDcgmCodecDecoderUtilization: MetricConfig{ + Enabled: true, + }, + GpuDcgmCodecEncoderUtilization: MetricConfig{ + Enabled: true, + }, + GpuDcgmEccErrors: MetricConfig{ + Enabled: true, + }, + GpuDcgmEnergyConsumption: MetricConfig{ + Enabled: true, + }, + GpuDcgmMemoryBandwidthUtilization: MetricConfig{ + Enabled: true, + }, + GpuDcgmMemoryBytesUsed: MetricConfig{ + Enabled: true, + }, + GpuDcgmNvlinkTraffic: MetricConfig{ + Enabled: true, + }, + GpuDcgmPcieTraffic: MetricConfig{ + Enabled: true, + }, + GpuDcgmPipeUtilization: MetricConfig{ + Enabled: true, + }, + GpuDcgmSmOccupancy: MetricConfig{ + Enabled: true, + }, + GpuDcgmSmUtilization: MetricConfig{ + Enabled: true, + }, + GpuDcgmTemperature: MetricConfig{ + Enabled: true, + }, + GpuDcgmUtilization: MetricConfig{ + Enabled: true, + }, + GpuDcgmXidErrors: MetricConfig{ + Enabled: true, + }, } } diff --git a/receiver/dcgmreceiver/internal/metadata/generated_config_test.go b/receiver/dcgmreceiver/internal/metadata/generated_config_test.go index 4ca3fcd32..d9336c8dd 100644 --- 
a/receiver/dcgmreceiver/internal/metadata/generated_config_test.go +++ b/receiver/dcgmreceiver/internal/metadata/generated_config_test.go @@ -34,6 +34,22 @@ func TestMetricsBuilderConfig(t *testing.T) { DcgmGpuProfilingSmOccupancy: MetricConfig{Enabled: true}, DcgmGpuProfilingSmUtilization: MetricConfig{Enabled: true}, DcgmGpuUtilization: MetricConfig{Enabled: true}, + GpuDcgmClockFrequency: MetricConfig{Enabled: true}, + GpuDcgmClockThrottleDurationTime: MetricConfig{Enabled: true}, + GpuDcgmCodecDecoderUtilization: MetricConfig{Enabled: true}, + GpuDcgmCodecEncoderUtilization: MetricConfig{Enabled: true}, + GpuDcgmEccErrors: MetricConfig{Enabled: true}, + GpuDcgmEnergyConsumption: MetricConfig{Enabled: true}, + GpuDcgmMemoryBandwidthUtilization: MetricConfig{Enabled: true}, + GpuDcgmMemoryBytesUsed: MetricConfig{Enabled: true}, + GpuDcgmNvlinkTraffic: MetricConfig{Enabled: true}, + GpuDcgmPcieTraffic: MetricConfig{Enabled: true}, + GpuDcgmPipeUtilization: MetricConfig{Enabled: true}, + GpuDcgmSmOccupancy: MetricConfig{Enabled: true}, + GpuDcgmSmUtilization: MetricConfig{Enabled: true}, + GpuDcgmTemperature: MetricConfig{Enabled: true}, + GpuDcgmUtilization: MetricConfig{Enabled: true}, + GpuDcgmXidErrors: MetricConfig{Enabled: true}, }, ResourceAttributes: ResourceAttributesConfig{ GpuModel: ResourceAttributeConfig{Enabled: true}, @@ -54,6 +70,22 @@ func TestMetricsBuilderConfig(t *testing.T) { DcgmGpuProfilingSmOccupancy: MetricConfig{Enabled: false}, DcgmGpuProfilingSmUtilization: MetricConfig{Enabled: false}, DcgmGpuUtilization: MetricConfig{Enabled: false}, + GpuDcgmClockFrequency: MetricConfig{Enabled: false}, + GpuDcgmClockThrottleDurationTime: MetricConfig{Enabled: false}, + GpuDcgmCodecDecoderUtilization: MetricConfig{Enabled: false}, + GpuDcgmCodecEncoderUtilization: MetricConfig{Enabled: false}, + GpuDcgmEccErrors: MetricConfig{Enabled: false}, + GpuDcgmEnergyConsumption: MetricConfig{Enabled: false}, + GpuDcgmMemoryBandwidthUtilization: 
MetricConfig{Enabled: false}, + GpuDcgmMemoryBytesUsed: MetricConfig{Enabled: false}, + GpuDcgmNvlinkTraffic: MetricConfig{Enabled: false}, + GpuDcgmPcieTraffic: MetricConfig{Enabled: false}, + GpuDcgmPipeUtilization: MetricConfig{Enabled: false}, + GpuDcgmSmOccupancy: MetricConfig{Enabled: false}, + GpuDcgmSmUtilization: MetricConfig{Enabled: false}, + GpuDcgmTemperature: MetricConfig{Enabled: false}, + GpuDcgmUtilization: MetricConfig{Enabled: false}, + GpuDcgmXidErrors: MetricConfig{Enabled: false}, }, ResourceAttributes: ResourceAttributesConfig{ GpuModel: ResourceAttributeConfig{Enabled: false}, diff --git a/receiver/dcgmreceiver/internal/metadata/generated_metrics.go b/receiver/dcgmreceiver/internal/metadata/generated_metrics.go index 85a76fdc7..0e54765c5 100644 --- a/receiver/dcgmreceiver/internal/metadata/generated_metrics.go +++ b/receiver/dcgmreceiver/internal/metadata/generated_metrics.go @@ -38,6 +38,32 @@ var MapAttributeDirection = map[string]AttributeDirection{ "rx": AttributeDirectionRx, } +// AttributeErrorType specifies the a value error_type attribute. +type AttributeErrorType int + +const ( + _ AttributeErrorType = iota + AttributeErrorTypeSbe + AttributeErrorTypeDbe +) + +// String returns the string representation of the AttributeErrorType. +func (av AttributeErrorType) String() string { + switch av { + case AttributeErrorTypeSbe: + return "sbe" + case AttributeErrorTypeDbe: + return "dbe" + } + return "" +} + +// MapAttributeErrorType is a helper map of string to AttributeErrorType attribute value. +var MapAttributeErrorType = map[string]AttributeErrorType{ + "sbe": AttributeErrorTypeSbe, + "dbe": AttributeErrorTypeDbe, +} + // AttributeMemoryState specifies the a value memory_state attribute. type AttributeMemoryState int @@ -45,75 +71,882 @@ const ( _ AttributeMemoryState = iota AttributeMemoryStateUsed AttributeMemoryStateFree + AttributeMemoryStateReserved +) + +// String returns the string representation of the AttributeMemoryState. 
+func (av AttributeMemoryState) String() string { + switch av { + case AttributeMemoryStateUsed: + return "used" + case AttributeMemoryStateFree: + return "free" + case AttributeMemoryStateReserved: + return "reserved" + } + return "" +} + +// MapAttributeMemoryState is a helper map of string to AttributeMemoryState attribute value. +var MapAttributeMemoryState = map[string]AttributeMemoryState{ + "used": AttributeMemoryStateUsed, + "free": AttributeMemoryStateFree, + "reserved": AttributeMemoryStateReserved, +} + +// AttributePipe specifies the a value pipe attribute. +type AttributePipe int + +const ( + _ AttributePipe = iota + AttributePipeTensor + AttributePipeFp64 + AttributePipeFp32 + AttributePipeFp16 +) + +// String returns the string representation of the AttributePipe. +func (av AttributePipe) String() string { + switch av { + case AttributePipeTensor: + return "tensor" + case AttributePipeFp64: + return "fp64" + case AttributePipeFp32: + return "fp32" + case AttributePipeFp16: + return "fp16" + } + return "" +} + +// MapAttributePipe is a helper map of string to AttributePipe attribute value. +var MapAttributePipe = map[string]AttributePipe{ + "tensor": AttributePipeTensor, + "fp64": AttributePipeFp64, + "fp32": AttributePipeFp32, + "fp16": AttributePipeFp16, +} + +// AttributeViolation specifies the a value violation attribute. +type AttributeViolation int + +const ( + _ AttributeViolation = iota + AttributeViolationPower + AttributeViolationThermal + AttributeViolationSyncBoost + AttributeViolationBoardLimit + AttributeViolationLowUtil + AttributeViolationReliability + AttributeViolationAppClock + AttributeViolationBaseClock ) -// String returns the string representation of the AttributeMemoryState. -func (av AttributeMemoryState) String() string { - switch av { - case AttributeMemoryStateUsed: - return "used" - case AttributeMemoryStateFree: - return "free" +// String returns the string representation of the AttributeViolation. 
+func (av AttributeViolation) String() string { + switch av { + case AttributeViolationPower: + return "power" + case AttributeViolationThermal: + return "thermal" + case AttributeViolationSyncBoost: + return "sync_boost" + case AttributeViolationBoardLimit: + return "board_limit" + case AttributeViolationLowUtil: + return "low_util" + case AttributeViolationReliability: + return "reliability" + case AttributeViolationAppClock: + return "app_clock" + case AttributeViolationBaseClock: + return "base_clock" + } + return "" +} + +// MapAttributeViolation is a helper map of string to AttributeViolation attribute value. +var MapAttributeViolation = map[string]AttributeViolation{ + "power": AttributeViolationPower, + "thermal": AttributeViolationThermal, + "sync_boost": AttributeViolationSyncBoost, + "board_limit": AttributeViolationBoardLimit, + "low_util": AttributeViolationLowUtil, + "reliability": AttributeViolationReliability, + "app_clock": AttributeViolationAppClock, + "base_clock": AttributeViolationBaseClock, +} + +type metricDcgmGpuMemoryBytesUsed struct { + data pmetric.Metric // data buffer for generated metric. + config MetricConfig // metric config provided by user. + capacity int // max observed number of data points added to the metric. +} + +// init fills dcgm.gpu.memory.bytes_used metric with initial data. +func (m *metricDcgmGpuMemoryBytesUsed) init() { + m.data.SetName("dcgm.gpu.memory.bytes_used") + m.data.SetDescription("Current number of GPU memory bytes used by state. 
Summing the values of all states yields the total GPU memory space.") + m.data.SetUnit("By") + m.data.SetEmptyGauge() + m.data.Gauge().DataPoints().EnsureCapacity(m.capacity) +} + +func (m *metricDcgmGpuMemoryBytesUsed) recordDataPoint(start pcommon.Timestamp, ts pcommon.Timestamp, val int64, memoryStateAttributeValue string) { + if !m.config.Enabled { + return + } + dp := m.data.Gauge().DataPoints().AppendEmpty() + dp.SetStartTimestamp(start) + dp.SetTimestamp(ts) + dp.SetIntValue(val) + dp.Attributes().PutStr("memory_state", memoryStateAttributeValue) +} + +// updateCapacity saves max length of data point slices that will be used for the slice capacity. +func (m *metricDcgmGpuMemoryBytesUsed) updateCapacity() { + if m.data.Gauge().DataPoints().Len() > m.capacity { + m.capacity = m.data.Gauge().DataPoints().Len() + } +} + +// emit appends recorded metric data to a metrics slice and prepares it for recording another set of data points. +func (m *metricDcgmGpuMemoryBytesUsed) emit(metrics pmetric.MetricSlice) { + if m.config.Enabled && m.data.Gauge().DataPoints().Len() > 0 { + m.updateCapacity() + m.data.MoveTo(metrics.AppendEmpty()) + m.init() + } +} + +func newMetricDcgmGpuMemoryBytesUsed(cfg MetricConfig) metricDcgmGpuMemoryBytesUsed { + m := metricDcgmGpuMemoryBytesUsed{config: cfg} + if cfg.Enabled { + m.data = pmetric.NewMetric() + m.init() + } + return m +} + +type metricDcgmGpuProfilingDramUtilization struct { + data pmetric.Metric // data buffer for generated metric. + config MetricConfig // metric config provided by user. + capacity int // max observed number of data points added to the metric. +} + +// init fills dcgm.gpu.profiling.dram_utilization metric with initial data. 
+func (m *metricDcgmGpuProfilingDramUtilization) init() { + m.data.SetName("dcgm.gpu.profiling.dram_utilization") + m.data.SetDescription("Fraction of cycles data was being sent or received from GPU memory.") + m.data.SetUnit("1") + m.data.SetEmptyGauge() +} + +func (m *metricDcgmGpuProfilingDramUtilization) recordDataPoint(start pcommon.Timestamp, ts pcommon.Timestamp, val float64) { + if !m.config.Enabled { + return + } + dp := m.data.Gauge().DataPoints().AppendEmpty() + dp.SetStartTimestamp(start) + dp.SetTimestamp(ts) + dp.SetDoubleValue(val) +} + +// updateCapacity saves max length of data point slices that will be used for the slice capacity. +func (m *metricDcgmGpuProfilingDramUtilization) updateCapacity() { + if m.data.Gauge().DataPoints().Len() > m.capacity { + m.capacity = m.data.Gauge().DataPoints().Len() + } +} + +// emit appends recorded metric data to a metrics slice and prepares it for recording another set of data points. +func (m *metricDcgmGpuProfilingDramUtilization) emit(metrics pmetric.MetricSlice) { + if m.config.Enabled && m.data.Gauge().DataPoints().Len() > 0 { + m.updateCapacity() + m.data.MoveTo(metrics.AppendEmpty()) + m.init() + } +} + +func newMetricDcgmGpuProfilingDramUtilization(cfg MetricConfig) metricDcgmGpuProfilingDramUtilization { + m := metricDcgmGpuProfilingDramUtilization{config: cfg} + if cfg.Enabled { + m.data = pmetric.NewMetric() + m.init() + } + return m +} + +type metricDcgmGpuProfilingNvlinkTrafficRate struct { + data pmetric.Metric // data buffer for generated metric. + config MetricConfig // metric config provided by user. + capacity int // max observed number of data points added to the metric. +} + +// init fills dcgm.gpu.profiling.nvlink_traffic_rate metric with initial data. 
+func (m *metricDcgmGpuProfilingNvlinkTrafficRate) init() { + m.data.SetName("dcgm.gpu.profiling.nvlink_traffic_rate") + m.data.SetDescription("The average rate of bytes received from the GPU over NVLink over the sample period, not including protocol headers.") + m.data.SetUnit("By/s") + m.data.SetEmptyGauge() + m.data.Gauge().DataPoints().EnsureCapacity(m.capacity) +} + +func (m *metricDcgmGpuProfilingNvlinkTrafficRate) recordDataPoint(start pcommon.Timestamp, ts pcommon.Timestamp, val int64, directionAttributeValue string) { + if !m.config.Enabled { + return + } + dp := m.data.Gauge().DataPoints().AppendEmpty() + dp.SetStartTimestamp(start) + dp.SetTimestamp(ts) + dp.SetIntValue(val) + dp.Attributes().PutStr("direction", directionAttributeValue) +} + +// updateCapacity saves max length of data point slices that will be used for the slice capacity. +func (m *metricDcgmGpuProfilingNvlinkTrafficRate) updateCapacity() { + if m.data.Gauge().DataPoints().Len() > m.capacity { + m.capacity = m.data.Gauge().DataPoints().Len() + } +} + +// emit appends recorded metric data to a metrics slice and prepares it for recording another set of data points. +func (m *metricDcgmGpuProfilingNvlinkTrafficRate) emit(metrics pmetric.MetricSlice) { + if m.config.Enabled && m.data.Gauge().DataPoints().Len() > 0 { + m.updateCapacity() + m.data.MoveTo(metrics.AppendEmpty()) + m.init() + } +} + +func newMetricDcgmGpuProfilingNvlinkTrafficRate(cfg MetricConfig) metricDcgmGpuProfilingNvlinkTrafficRate { + m := metricDcgmGpuProfilingNvlinkTrafficRate{config: cfg} + if cfg.Enabled { + m.data = pmetric.NewMetric() + m.init() + } + return m +} + +type metricDcgmGpuProfilingPcieTrafficRate struct { + data pmetric.Metric // data buffer for generated metric. + config MetricConfig // metric config provided by user. + capacity int // max observed number of data points added to the metric. +} + +// init fills dcgm.gpu.profiling.pcie_traffic_rate metric with initial data. 
+func (m *metricDcgmGpuProfilingPcieTrafficRate) init() { + m.data.SetName("dcgm.gpu.profiling.pcie_traffic_rate") + m.data.SetDescription("The average rate of bytes sent from the GPU over the PCIe bus over the sample period, including both protocol headers and data payloads.") + m.data.SetUnit("By/s") + m.data.SetEmptyGauge() + m.data.Gauge().DataPoints().EnsureCapacity(m.capacity) +} + +func (m *metricDcgmGpuProfilingPcieTrafficRate) recordDataPoint(start pcommon.Timestamp, ts pcommon.Timestamp, val int64, directionAttributeValue string) { + if !m.config.Enabled { + return + } + dp := m.data.Gauge().DataPoints().AppendEmpty() + dp.SetStartTimestamp(start) + dp.SetTimestamp(ts) + dp.SetIntValue(val) + dp.Attributes().PutStr("direction", directionAttributeValue) +} + +// updateCapacity saves max length of data point slices that will be used for the slice capacity. +func (m *metricDcgmGpuProfilingPcieTrafficRate) updateCapacity() { + if m.data.Gauge().DataPoints().Len() > m.capacity { + m.capacity = m.data.Gauge().DataPoints().Len() + } +} + +// emit appends recorded metric data to a metrics slice and prepares it for recording another set of data points. +func (m *metricDcgmGpuProfilingPcieTrafficRate) emit(metrics pmetric.MetricSlice) { + if m.config.Enabled && m.data.Gauge().DataPoints().Len() > 0 { + m.updateCapacity() + m.data.MoveTo(metrics.AppendEmpty()) + m.init() + } +} + +func newMetricDcgmGpuProfilingPcieTrafficRate(cfg MetricConfig) metricDcgmGpuProfilingPcieTrafficRate { + m := metricDcgmGpuProfilingPcieTrafficRate{config: cfg} + if cfg.Enabled { + m.data = pmetric.NewMetric() + m.init() + } + return m +} + +type metricDcgmGpuProfilingPipeUtilization struct { + data pmetric.Metric // data buffer for generated metric. + config MetricConfig // metric config provided by user. + capacity int // max observed number of data points added to the metric. +} + +// init fills dcgm.gpu.profiling.pipe_utilization metric with initial data. 
+func (m *metricDcgmGpuProfilingPipeUtilization) init() { + m.data.SetName("dcgm.gpu.profiling.pipe_utilization") + m.data.SetDescription("Fraction of cycles the corresponding GPU pipe was active, averaged over time and all multiprocessors.") + m.data.SetUnit("1") + m.data.SetEmptyGauge() + m.data.Gauge().DataPoints().EnsureCapacity(m.capacity) +} + +func (m *metricDcgmGpuProfilingPipeUtilization) recordDataPoint(start pcommon.Timestamp, ts pcommon.Timestamp, val float64, pipeAttributeValue string) { + if !m.config.Enabled { + return + } + dp := m.data.Gauge().DataPoints().AppendEmpty() + dp.SetStartTimestamp(start) + dp.SetTimestamp(ts) + dp.SetDoubleValue(val) + dp.Attributes().PutStr("pipe", pipeAttributeValue) +} + +// updateCapacity saves max length of data point slices that will be used for the slice capacity. +func (m *metricDcgmGpuProfilingPipeUtilization) updateCapacity() { + if m.data.Gauge().DataPoints().Len() > m.capacity { + m.capacity = m.data.Gauge().DataPoints().Len() + } +} + +// emit appends recorded metric data to a metrics slice and prepares it for recording another set of data points. +func (m *metricDcgmGpuProfilingPipeUtilization) emit(metrics pmetric.MetricSlice) { + if m.config.Enabled && m.data.Gauge().DataPoints().Len() > 0 { + m.updateCapacity() + m.data.MoveTo(metrics.AppendEmpty()) + m.init() + } +} + +func newMetricDcgmGpuProfilingPipeUtilization(cfg MetricConfig) metricDcgmGpuProfilingPipeUtilization { + m := metricDcgmGpuProfilingPipeUtilization{config: cfg} + if cfg.Enabled { + m.data = pmetric.NewMetric() + m.init() + } + return m +} + +type metricDcgmGpuProfilingSmOccupancy struct { + data pmetric.Metric // data buffer for generated metric. + config MetricConfig // metric config provided by user. + capacity int // max observed number of data points added to the metric. +} + +// init fills dcgm.gpu.profiling.sm_occupancy metric with initial data. 
+func (m *metricDcgmGpuProfilingSmOccupancy) init() { + m.data.SetName("dcgm.gpu.profiling.sm_occupancy") + m.data.SetDescription("Fraction of resident warps on a multiprocessor relative to the maximum number supported, averaged over time and all multiprocessors.") + m.data.SetUnit("1") + m.data.SetEmptyGauge() +} + +func (m *metricDcgmGpuProfilingSmOccupancy) recordDataPoint(start pcommon.Timestamp, ts pcommon.Timestamp, val float64) { + if !m.config.Enabled { + return + } + dp := m.data.Gauge().DataPoints().AppendEmpty() + dp.SetStartTimestamp(start) + dp.SetTimestamp(ts) + dp.SetDoubleValue(val) +} + +// updateCapacity saves max length of data point slices that will be used for the slice capacity. +func (m *metricDcgmGpuProfilingSmOccupancy) updateCapacity() { + if m.data.Gauge().DataPoints().Len() > m.capacity { + m.capacity = m.data.Gauge().DataPoints().Len() + } +} + +// emit appends recorded metric data to a metrics slice and prepares it for recording another set of data points. +func (m *metricDcgmGpuProfilingSmOccupancy) emit(metrics pmetric.MetricSlice) { + if m.config.Enabled && m.data.Gauge().DataPoints().Len() > 0 { + m.updateCapacity() + m.data.MoveTo(metrics.AppendEmpty()) + m.init() + } +} + +func newMetricDcgmGpuProfilingSmOccupancy(cfg MetricConfig) metricDcgmGpuProfilingSmOccupancy { + m := metricDcgmGpuProfilingSmOccupancy{config: cfg} + if cfg.Enabled { + m.data = pmetric.NewMetric() + m.init() + } + return m +} + +type metricDcgmGpuProfilingSmUtilization struct { + data pmetric.Metric // data buffer for generated metric. + config MetricConfig // metric config provided by user. + capacity int // max observed number of data points added to the metric. +} + +// init fills dcgm.gpu.profiling.sm_utilization metric with initial data. 
+func (m *metricDcgmGpuProfilingSmUtilization) init() { + m.data.SetName("dcgm.gpu.profiling.sm_utilization") + m.data.SetDescription("Fraction of time at least one warp was active on a multiprocessor, averaged over all multiprocessors.") + m.data.SetUnit("1") + m.data.SetEmptyGauge() +} + +func (m *metricDcgmGpuProfilingSmUtilization) recordDataPoint(start pcommon.Timestamp, ts pcommon.Timestamp, val float64) { + if !m.config.Enabled { + return + } + dp := m.data.Gauge().DataPoints().AppendEmpty() + dp.SetStartTimestamp(start) + dp.SetTimestamp(ts) + dp.SetDoubleValue(val) +} + +// updateCapacity saves max length of data point slices that will be used for the slice capacity. +func (m *metricDcgmGpuProfilingSmUtilization) updateCapacity() { + if m.data.Gauge().DataPoints().Len() > m.capacity { + m.capacity = m.data.Gauge().DataPoints().Len() + } +} + +// emit appends recorded metric data to a metrics slice and prepares it for recording another set of data points. +func (m *metricDcgmGpuProfilingSmUtilization) emit(metrics pmetric.MetricSlice) { + if m.config.Enabled && m.data.Gauge().DataPoints().Len() > 0 { + m.updateCapacity() + m.data.MoveTo(metrics.AppendEmpty()) + m.init() + } +} + +func newMetricDcgmGpuProfilingSmUtilization(cfg MetricConfig) metricDcgmGpuProfilingSmUtilization { + m := metricDcgmGpuProfilingSmUtilization{config: cfg} + if cfg.Enabled { + m.data = pmetric.NewMetric() + m.init() + } + return m +} + +type metricDcgmGpuUtilization struct { + data pmetric.Metric // data buffer for generated metric. + config MetricConfig // metric config provided by user. + capacity int // max observed number of data points added to the metric. +} + +// init fills dcgm.gpu.utilization metric with initial data. 
+func (m *metricDcgmGpuUtilization) init() { + m.data.SetName("dcgm.gpu.utilization") + m.data.SetDescription("Fraction of time the GPU was not idle.") + m.data.SetUnit("1") + m.data.SetEmptyGauge() +} + +func (m *metricDcgmGpuUtilization) recordDataPoint(start pcommon.Timestamp, ts pcommon.Timestamp, val float64) { + if !m.config.Enabled { + return + } + dp := m.data.Gauge().DataPoints().AppendEmpty() + dp.SetStartTimestamp(start) + dp.SetTimestamp(ts) + dp.SetDoubleValue(val) +} + +// updateCapacity saves max length of data point slices that will be used for the slice capacity. +func (m *metricDcgmGpuUtilization) updateCapacity() { + if m.data.Gauge().DataPoints().Len() > m.capacity { + m.capacity = m.data.Gauge().DataPoints().Len() + } +} + +// emit appends recorded metric data to a metrics slice and prepares it for recording another set of data points. +func (m *metricDcgmGpuUtilization) emit(metrics pmetric.MetricSlice) { + if m.config.Enabled && m.data.Gauge().DataPoints().Len() > 0 { + m.updateCapacity() + m.data.MoveTo(metrics.AppendEmpty()) + m.init() + } +} + +func newMetricDcgmGpuUtilization(cfg MetricConfig) metricDcgmGpuUtilization { + m := metricDcgmGpuUtilization{config: cfg} + if cfg.Enabled { + m.data = pmetric.NewMetric() + m.init() + } + return m +} + +type metricGpuDcgmClockFrequency struct { + data pmetric.Metric // data buffer for generated metric. + config MetricConfig // metric config provided by user. + capacity int // max observed number of data points added to the metric. +} + +// init fills gpu.dcgm.clock.frequency metric with initial data. 
+func (m *metricGpuDcgmClockFrequency) init() { + m.data.SetName("gpu.dcgm.clock.frequency") + m.data.SetDescription("Multiprocessor clock frequency.") + m.data.SetUnit("Hz") + m.data.SetEmptyGauge() +} + +func (m *metricGpuDcgmClockFrequency) recordDataPoint(start pcommon.Timestamp, ts pcommon.Timestamp, val float64) { + if !m.config.Enabled { + return + } + dp := m.data.Gauge().DataPoints().AppendEmpty() + dp.SetStartTimestamp(start) + dp.SetTimestamp(ts) + dp.SetDoubleValue(val) +} + +// updateCapacity saves max length of data point slices that will be used for the slice capacity. +func (m *metricGpuDcgmClockFrequency) updateCapacity() { + if m.data.Gauge().DataPoints().Len() > m.capacity { + m.capacity = m.data.Gauge().DataPoints().Len() + } +} + +// emit appends recorded metric data to a metrics slice and prepares it for recording another set of data points. +func (m *metricGpuDcgmClockFrequency) emit(metrics pmetric.MetricSlice) { + if m.config.Enabled && m.data.Gauge().DataPoints().Len() > 0 { + m.updateCapacity() + m.data.MoveTo(metrics.AppendEmpty()) + m.init() + } +} + +func newMetricGpuDcgmClockFrequency(cfg MetricConfig) metricGpuDcgmClockFrequency { + m := metricGpuDcgmClockFrequency{config: cfg} + if cfg.Enabled { + m.data = pmetric.NewMetric() + m.init() + } + return m +} + +type metricGpuDcgmClockThrottleDurationTime struct { + data pmetric.Metric // data buffer for generated metric. + config MetricConfig // metric config provided by user. + capacity int // max observed number of data points added to the metric. +} + +// init fills gpu.dcgm.clock.throttle_duration.time metric with initial data. 
+func (m *metricGpuDcgmClockThrottleDurationTime) init() { + m.data.SetName("gpu.dcgm.clock.throttle_duration.time") + m.data.SetDescription("Clock throttle total duration.") + m.data.SetUnit("s") + m.data.SetEmptySum() + m.data.Sum().SetIsMonotonic(true) + m.data.Sum().SetAggregationTemporality(pmetric.AggregationTemporalityCumulative) + m.data.Sum().DataPoints().EnsureCapacity(m.capacity) +} + +func (m *metricGpuDcgmClockThrottleDurationTime) recordDataPoint(start pcommon.Timestamp, ts pcommon.Timestamp, val float64, violationAttributeValue string) { + if !m.config.Enabled { + return + } + dp := m.data.Sum().DataPoints().AppendEmpty() + dp.SetStartTimestamp(start) + dp.SetTimestamp(ts) + dp.SetDoubleValue(val) + dp.Attributes().PutStr("violation", violationAttributeValue) +} + +// updateCapacity saves max length of data point slices that will be used for the slice capacity. +func (m *metricGpuDcgmClockThrottleDurationTime) updateCapacity() { + if m.data.Sum().DataPoints().Len() > m.capacity { + m.capacity = m.data.Sum().DataPoints().Len() + } +} + +// emit appends recorded metric data to a metrics slice and prepares it for recording another set of data points. +func (m *metricGpuDcgmClockThrottleDurationTime) emit(metrics pmetric.MetricSlice) { + if m.config.Enabled && m.data.Sum().DataPoints().Len() > 0 { + m.updateCapacity() + m.data.MoveTo(metrics.AppendEmpty()) + m.init() + } +} + +func newMetricGpuDcgmClockThrottleDurationTime(cfg MetricConfig) metricGpuDcgmClockThrottleDurationTime { + m := metricGpuDcgmClockThrottleDurationTime{config: cfg} + if cfg.Enabled { + m.data = pmetric.NewMetric() + m.init() + } + return m +} + +type metricGpuDcgmCodecDecoderUtilization struct { + data pmetric.Metric // data buffer for generated metric. + config MetricConfig // metric config provided by user. + capacity int // max observed number of data points added to the metric. +} + +// init fills gpu.dcgm.codec.decoder.utilization metric with initial data. 
+func (m *metricGpuDcgmCodecDecoderUtilization) init() { + m.data.SetName("gpu.dcgm.codec.decoder.utilization") + m.data.SetDescription("Decoder utilization.") + m.data.SetUnit("1") + m.data.SetEmptyGauge() +} + +func (m *metricGpuDcgmCodecDecoderUtilization) recordDataPoint(start pcommon.Timestamp, ts pcommon.Timestamp, val float64) { + if !m.config.Enabled { + return + } + dp := m.data.Gauge().DataPoints().AppendEmpty() + dp.SetStartTimestamp(start) + dp.SetTimestamp(ts) + dp.SetDoubleValue(val) +} + +// updateCapacity saves max length of data point slices that will be used for the slice capacity. +func (m *metricGpuDcgmCodecDecoderUtilization) updateCapacity() { + if m.data.Gauge().DataPoints().Len() > m.capacity { + m.capacity = m.data.Gauge().DataPoints().Len() + } +} + +// emit appends recorded metric data to a metrics slice and prepares it for recording another set of data points. +func (m *metricGpuDcgmCodecDecoderUtilization) emit(metrics pmetric.MetricSlice) { + if m.config.Enabled && m.data.Gauge().DataPoints().Len() > 0 { + m.updateCapacity() + m.data.MoveTo(metrics.AppendEmpty()) + m.init() + } +} + +func newMetricGpuDcgmCodecDecoderUtilization(cfg MetricConfig) metricGpuDcgmCodecDecoderUtilization { + m := metricGpuDcgmCodecDecoderUtilization{config: cfg} + if cfg.Enabled { + m.data = pmetric.NewMetric() + m.init() + } + return m +} + +type metricGpuDcgmCodecEncoderUtilization struct { + data pmetric.Metric // data buffer for generated metric. + config MetricConfig // metric config provided by user. + capacity int // max observed number of data points added to the metric. +} + +// init fills gpu.dcgm.codec.encoder.utilization metric with initial data. 
+func (m *metricGpuDcgmCodecEncoderUtilization) init() { + m.data.SetName("gpu.dcgm.codec.encoder.utilization") + m.data.SetDescription("Encoder utilization.") + m.data.SetUnit("1") + m.data.SetEmptyGauge() +} + +func (m *metricGpuDcgmCodecEncoderUtilization) recordDataPoint(start pcommon.Timestamp, ts pcommon.Timestamp, val float64) { + if !m.config.Enabled { + return + } + dp := m.data.Gauge().DataPoints().AppendEmpty() + dp.SetStartTimestamp(start) + dp.SetTimestamp(ts) + dp.SetDoubleValue(val) +} + +// updateCapacity saves max length of data point slices that will be used for the slice capacity. +func (m *metricGpuDcgmCodecEncoderUtilization) updateCapacity() { + if m.data.Gauge().DataPoints().Len() > m.capacity { + m.capacity = m.data.Gauge().DataPoints().Len() + } +} + +// emit appends recorded metric data to a metrics slice and prepares it for recording another set of data points. +func (m *metricGpuDcgmCodecEncoderUtilization) emit(metrics pmetric.MetricSlice) { + if m.config.Enabled && m.data.Gauge().DataPoints().Len() > 0 { + m.updateCapacity() + m.data.MoveTo(metrics.AppendEmpty()) + m.init() + } +} + +func newMetricGpuDcgmCodecEncoderUtilization(cfg MetricConfig) metricGpuDcgmCodecEncoderUtilization { + m := metricGpuDcgmCodecEncoderUtilization{config: cfg} + if cfg.Enabled { + m.data = pmetric.NewMetric() + m.init() + } + return m +} + +type metricGpuDcgmEccErrors struct { + data pmetric.Metric // data buffer for generated metric. + config MetricConfig // metric config provided by user. + capacity int // max observed number of data points added to the metric. +} + +// init fills gpu.dcgm.ecc_errors metric with initial data. 
+func (m *metricGpuDcgmEccErrors) init() { + m.data.SetName("gpu.dcgm.ecc_errors") + m.data.SetDescription("Data corruption errors.") + m.data.SetUnit("1") + m.data.SetEmptySum() + m.data.Sum().SetIsMonotonic(true) + m.data.Sum().SetAggregationTemporality(pmetric.AggregationTemporalityCumulative) + m.data.Sum().DataPoints().EnsureCapacity(m.capacity) +} + +func (m *metricGpuDcgmEccErrors) recordDataPoint(start pcommon.Timestamp, ts pcommon.Timestamp, val int64, errorTypeAttributeValue string) { + if !m.config.Enabled { + return + } + dp := m.data.Sum().DataPoints().AppendEmpty() + dp.SetStartTimestamp(start) + dp.SetTimestamp(ts) + dp.SetIntValue(val) + dp.Attributes().PutStr("error_type", errorTypeAttributeValue) +} + +// updateCapacity saves max length of data point slices that will be used for the slice capacity. +func (m *metricGpuDcgmEccErrors) updateCapacity() { + if m.data.Sum().DataPoints().Len() > m.capacity { + m.capacity = m.data.Sum().DataPoints().Len() + } +} + +// emit appends recorded metric data to a metrics slice and prepares it for recording another set of data points. +func (m *metricGpuDcgmEccErrors) emit(metrics pmetric.MetricSlice) { + if m.config.Enabled && m.data.Sum().DataPoints().Len() > 0 { + m.updateCapacity() + m.data.MoveTo(metrics.AppendEmpty()) + m.init() + } +} + +func newMetricGpuDcgmEccErrors(cfg MetricConfig) metricGpuDcgmEccErrors { + m := metricGpuDcgmEccErrors{config: cfg} + if cfg.Enabled { + m.data = pmetric.NewMetric() + m.init() + } + return m +} + +type metricGpuDcgmEnergyConsumption struct { + data pmetric.Metric // data buffer for generated metric. + config MetricConfig // metric config provided by user. + capacity int // max observed number of data points added to the metric. +} + +// init fills gpu.dcgm.energy_consumption metric with initial data. 
+func (m *metricGpuDcgmEnergyConsumption) init() { + m.data.SetName("gpu.dcgm.energy_consumption") + m.data.SetDescription("Total energy consumption for the GPU in J since the driver was last reloaded.") + m.data.SetUnit("J") + m.data.SetEmptySum() + m.data.Sum().SetIsMonotonic(true) + m.data.Sum().SetAggregationTemporality(pmetric.AggregationTemporalityCumulative) +} + +func (m *metricGpuDcgmEnergyConsumption) recordDataPoint(start pcommon.Timestamp, ts pcommon.Timestamp, val float64) { + if !m.config.Enabled { + return + } + dp := m.data.Sum().DataPoints().AppendEmpty() + dp.SetStartTimestamp(start) + dp.SetTimestamp(ts) + dp.SetDoubleValue(val) +} + +// updateCapacity saves max length of data point slices that will be used for the slice capacity. +func (m *metricGpuDcgmEnergyConsumption) updateCapacity() { + if m.data.Sum().DataPoints().Len() > m.capacity { + m.capacity = m.data.Sum().DataPoints().Len() + } +} + +// emit appends recorded metric data to a metrics slice and prepares it for recording another set of data points. +func (m *metricGpuDcgmEnergyConsumption) emit(metrics pmetric.MetricSlice) { + if m.config.Enabled && m.data.Sum().DataPoints().Len() > 0 { + m.updateCapacity() + m.data.MoveTo(metrics.AppendEmpty()) + m.init() } - return "" } -// MapAttributeMemoryState is a helper map of string to AttributeMemoryState attribute value. -var MapAttributeMemoryState = map[string]AttributeMemoryState{ - "used": AttributeMemoryStateUsed, - "free": AttributeMemoryStateFree, +func newMetricGpuDcgmEnergyConsumption(cfg MetricConfig) metricGpuDcgmEnergyConsumption { + m := metricGpuDcgmEnergyConsumption{config: cfg} + if cfg.Enabled { + m.data = pmetric.NewMetric() + m.init() + } + return m } -// AttributePipe specifies the a value pipe attribute. -type AttributePipe int +type metricGpuDcgmMemoryBandwidthUtilization struct { + data pmetric.Metric // data buffer for generated metric. + config MetricConfig // metric config provided by user. 
+ capacity int // max observed number of data points added to the metric. +} -const ( - _ AttributePipe = iota - AttributePipeTensor - AttributePipeFp64 - AttributePipeFp32 - AttributePipeFp16 -) +// init fills gpu.dcgm.memory.bandwidth_utilization metric with initial data. +func (m *metricGpuDcgmMemoryBandwidthUtilization) init() { + m.data.SetName("gpu.dcgm.memory.bandwidth_utilization") + m.data.SetDescription("Fraction of cycles data was being sent or received from GPU memory.") + m.data.SetUnit("1") + m.data.SetEmptyGauge() +} -// String returns the string representation of the AttributePipe. -func (av AttributePipe) String() string { - switch av { - case AttributePipeTensor: - return "tensor" - case AttributePipeFp64: - return "fp64" - case AttributePipeFp32: - return "fp32" - case AttributePipeFp16: - return "fp16" +func (m *metricGpuDcgmMemoryBandwidthUtilization) recordDataPoint(start pcommon.Timestamp, ts pcommon.Timestamp, val float64) { + if !m.config.Enabled { + return } - return "" + dp := m.data.Gauge().DataPoints().AppendEmpty() + dp.SetStartTimestamp(start) + dp.SetTimestamp(ts) + dp.SetDoubleValue(val) } -// MapAttributePipe is a helper map of string to AttributePipe attribute value. -var MapAttributePipe = map[string]AttributePipe{ - "tensor": AttributePipeTensor, - "fp64": AttributePipeFp64, - "fp32": AttributePipeFp32, - "fp16": AttributePipeFp16, +// updateCapacity saves max length of data point slices that will be used for the slice capacity. +func (m *metricGpuDcgmMemoryBandwidthUtilization) updateCapacity() { + if m.data.Gauge().DataPoints().Len() > m.capacity { + m.capacity = m.data.Gauge().DataPoints().Len() + } } -type metricDcgmGpuMemoryBytesUsed struct { +// emit appends recorded metric data to a metrics slice and prepares it for recording another set of data points. 
+func (m *metricGpuDcgmMemoryBandwidthUtilization) emit(metrics pmetric.MetricSlice) { + if m.config.Enabled && m.data.Gauge().DataPoints().Len() > 0 { + m.updateCapacity() + m.data.MoveTo(metrics.AppendEmpty()) + m.init() + } +} + +func newMetricGpuDcgmMemoryBandwidthUtilization(cfg MetricConfig) metricGpuDcgmMemoryBandwidthUtilization { + m := metricGpuDcgmMemoryBandwidthUtilization{config: cfg} + if cfg.Enabled { + m.data = pmetric.NewMetric() + m.init() + } + return m +} + +type metricGpuDcgmMemoryBytesUsed struct { data pmetric.Metric // data buffer for generated metric. config MetricConfig // metric config provided by user. capacity int // max observed number of data points added to the metric. } -// init fills dcgm.gpu.memory.bytes_used metric with initial data. -func (m *metricDcgmGpuMemoryBytesUsed) init() { - m.data.SetName("dcgm.gpu.memory.bytes_used") +// init fills gpu.dcgm.memory.bytes_used metric with initial data. +func (m *metricGpuDcgmMemoryBytesUsed) init() { + m.data.SetName("gpu.dcgm.memory.bytes_used") m.data.SetDescription("Current number of GPU memory bytes used by state. Summing the values of all states yields the total GPU memory space.") m.data.SetUnit("By") m.data.SetEmptyGauge() m.data.Gauge().DataPoints().EnsureCapacity(m.capacity) } -func (m *metricDcgmGpuMemoryBytesUsed) recordDataPoint(start pcommon.Timestamp, ts pcommon.Timestamp, val int64, memoryStateAttributeValue string) { +func (m *metricGpuDcgmMemoryBytesUsed) recordDataPoint(start pcommon.Timestamp, ts pcommon.Timestamp, val int64, memoryStateAttributeValue string) { if !m.config.Enabled { return } @@ -125,14 +958,14 @@ func (m *metricDcgmGpuMemoryBytesUsed) recordDataPoint(start pcommon.Timestamp, } // updateCapacity saves max length of data point slices that will be used for the slice capacity. 
-func (m *metricDcgmGpuMemoryBytesUsed) updateCapacity() { +func (m *metricGpuDcgmMemoryBytesUsed) updateCapacity() { if m.data.Gauge().DataPoints().Len() > m.capacity { m.capacity = m.data.Gauge().DataPoints().Len() } } // emit appends recorded metric data to a metrics slice and prepares it for recording another set of data points. -func (m *metricDcgmGpuMemoryBytesUsed) emit(metrics pmetric.MetricSlice) { +func (m *metricGpuDcgmMemoryBytesUsed) emit(metrics pmetric.MetricSlice) { if m.config.Enabled && m.data.Gauge().DataPoints().Len() > 0 { m.updateCapacity() m.data.MoveTo(metrics.AppendEmpty()) @@ -140,8 +973,8 @@ func (m *metricDcgmGpuMemoryBytesUsed) emit(metrics pmetric.MetricSlice) { } } -func newMetricDcgmGpuMemoryBytesUsed(cfg MetricConfig) metricDcgmGpuMemoryBytesUsed { - m := metricDcgmGpuMemoryBytesUsed{config: cfg} +func newMetricGpuDcgmMemoryBytesUsed(cfg MetricConfig) metricGpuDcgmMemoryBytesUsed { + m := metricGpuDcgmMemoryBytesUsed{config: cfg} if cfg.Enabled { m.data = pmetric.NewMetric() m.init() @@ -149,48 +982,52 @@ func newMetricDcgmGpuMemoryBytesUsed(cfg MetricConfig) metricDcgmGpuMemoryBytesU return m } -type metricDcgmGpuProfilingDramUtilization struct { +type metricGpuDcgmNvlinkTraffic struct { data pmetric.Metric // data buffer for generated metric. config MetricConfig // metric config provided by user. capacity int // max observed number of data points added to the metric. } -// init fills dcgm.gpu.profiling.dram_utilization metric with initial data. -func (m *metricDcgmGpuProfilingDramUtilization) init() { - m.data.SetName("dcgm.gpu.profiling.dram_utilization") - m.data.SetDescription("Fraction of cycles data was being sent or received from GPU memory.") - m.data.SetUnit("1") - m.data.SetEmptyGauge() +// init fills gpu.dcgm.nvlink.traffic metric with initial data. 
+func (m *metricGpuDcgmNvlinkTraffic) init() { + m.data.SetName("gpu.dcgm.nvlink.traffic") + m.data.SetDescription("The number of bytes sent over NVLink, not including protocol headers.") + m.data.SetUnit("By") + m.data.SetEmptySum() + m.data.Sum().SetIsMonotonic(true) + m.data.Sum().SetAggregationTemporality(pmetric.AggregationTemporalityDelta) + m.data.Sum().DataPoints().EnsureCapacity(m.capacity) } -func (m *metricDcgmGpuProfilingDramUtilization) recordDataPoint(start pcommon.Timestamp, ts pcommon.Timestamp, val float64) { +func (m *metricGpuDcgmNvlinkTraffic) recordDataPoint(start pcommon.Timestamp, ts pcommon.Timestamp, val int64, directionAttributeValue string) { if !m.config.Enabled { return } - dp := m.data.Gauge().DataPoints().AppendEmpty() + dp := m.data.Sum().DataPoints().AppendEmpty() dp.SetStartTimestamp(start) dp.SetTimestamp(ts) - dp.SetDoubleValue(val) + dp.SetIntValue(val) + dp.Attributes().PutStr("direction", directionAttributeValue) } // updateCapacity saves max length of data point slices that will be used for the slice capacity. -func (m *metricDcgmGpuProfilingDramUtilization) updateCapacity() { - if m.data.Gauge().DataPoints().Len() > m.capacity { - m.capacity = m.data.Gauge().DataPoints().Len() +func (m *metricGpuDcgmNvlinkTraffic) updateCapacity() { + if m.data.Sum().DataPoints().Len() > m.capacity { + m.capacity = m.data.Sum().DataPoints().Len() } } // emit appends recorded metric data to a metrics slice and prepares it for recording another set of data points. 
-func (m *metricDcgmGpuProfilingDramUtilization) emit(metrics pmetric.MetricSlice) { - if m.config.Enabled && m.data.Gauge().DataPoints().Len() > 0 { +func (m *metricGpuDcgmNvlinkTraffic) emit(metrics pmetric.MetricSlice) { + if m.config.Enabled && m.data.Sum().DataPoints().Len() > 0 { m.updateCapacity() m.data.MoveTo(metrics.AppendEmpty()) m.init() } } -func newMetricDcgmGpuProfilingDramUtilization(cfg MetricConfig) metricDcgmGpuProfilingDramUtilization { - m := metricDcgmGpuProfilingDramUtilization{config: cfg} +func newMetricGpuDcgmNvlinkTraffic(cfg MetricConfig) metricGpuDcgmNvlinkTraffic { + m := metricGpuDcgmNvlinkTraffic{config: cfg} if cfg.Enabled { m.data = pmetric.NewMetric() m.init() @@ -198,26 +1035,28 @@ func newMetricDcgmGpuProfilingDramUtilization(cfg MetricConfig) metricDcgmGpuPro return m } -type metricDcgmGpuProfilingNvlinkTrafficRate struct { +type metricGpuDcgmPcieTraffic struct { data pmetric.Metric // data buffer for generated metric. config MetricConfig // metric config provided by user. capacity int // max observed number of data points added to the metric. } -// init fills dcgm.gpu.profiling.nvlink_traffic_rate metric with initial data. -func (m *metricDcgmGpuProfilingNvlinkTrafficRate) init() { - m.data.SetName("dcgm.gpu.profiling.nvlink_traffic_rate") - m.data.SetDescription("The average rate of bytes received from the GPU over NVLink over the sample period, not including protocol headers.") - m.data.SetUnit("By/s") - m.data.SetEmptyGauge() - m.data.Gauge().DataPoints().EnsureCapacity(m.capacity) +// init fills gpu.dcgm.pcie.traffic metric with initial data. 
+func (m *metricGpuDcgmPcieTraffic) init() { + m.data.SetName("gpu.dcgm.pcie.traffic") + m.data.SetDescription("The number of bytes sent over the PCIe bus, including both protocol headers and data payloads.") + m.data.SetUnit("By") + m.data.SetEmptySum() + m.data.Sum().SetIsMonotonic(true) + m.data.Sum().SetAggregationTemporality(pmetric.AggregationTemporalityDelta) + m.data.Sum().DataPoints().EnsureCapacity(m.capacity) } -func (m *metricDcgmGpuProfilingNvlinkTrafficRate) recordDataPoint(start pcommon.Timestamp, ts pcommon.Timestamp, val int64, directionAttributeValue string) { +func (m *metricGpuDcgmPcieTraffic) recordDataPoint(start pcommon.Timestamp, ts pcommon.Timestamp, val int64, directionAttributeValue string) { if !m.config.Enabled { return } - dp := m.data.Gauge().DataPoints().AppendEmpty() + dp := m.data.Sum().DataPoints().AppendEmpty() dp.SetStartTimestamp(start) dp.SetTimestamp(ts) dp.SetIntValue(val) @@ -225,23 +1064,23 @@ func (m *metricDcgmGpuProfilingNvlinkTrafficRate) recordDataPoint(start pcommon. } // updateCapacity saves max length of data point slices that will be used for the slice capacity. -func (m *metricDcgmGpuProfilingNvlinkTrafficRate) updateCapacity() { - if m.data.Gauge().DataPoints().Len() > m.capacity { - m.capacity = m.data.Gauge().DataPoints().Len() +func (m *metricGpuDcgmPcieTraffic) updateCapacity() { + if m.data.Sum().DataPoints().Len() > m.capacity { + m.capacity = m.data.Sum().DataPoints().Len() } } // emit appends recorded metric data to a metrics slice and prepares it for recording another set of data points. 
-func (m *metricDcgmGpuProfilingNvlinkTrafficRate) emit(metrics pmetric.MetricSlice) { - if m.config.Enabled && m.data.Gauge().DataPoints().Len() > 0 { +func (m *metricGpuDcgmPcieTraffic) emit(metrics pmetric.MetricSlice) { + if m.config.Enabled && m.data.Sum().DataPoints().Len() > 0 { m.updateCapacity() m.data.MoveTo(metrics.AppendEmpty()) m.init() } } -func newMetricDcgmGpuProfilingNvlinkTrafficRate(cfg MetricConfig) metricDcgmGpuProfilingNvlinkTrafficRate { - m := metricDcgmGpuProfilingNvlinkTrafficRate{config: cfg} +func newMetricGpuDcgmPcieTraffic(cfg MetricConfig) metricGpuDcgmPcieTraffic { + m := metricGpuDcgmPcieTraffic{config: cfg} if cfg.Enabled { m.data = pmetric.NewMetric() m.init() @@ -249,41 +1088,41 @@ func newMetricDcgmGpuProfilingNvlinkTrafficRate(cfg MetricConfig) metricDcgmGpuP return m } -type metricDcgmGpuProfilingPcieTrafficRate struct { +type metricGpuDcgmPipeUtilization struct { data pmetric.Metric // data buffer for generated metric. config MetricConfig // metric config provided by user. capacity int // max observed number of data points added to the metric. } -// init fills dcgm.gpu.profiling.pcie_traffic_rate metric with initial data. -func (m *metricDcgmGpuProfilingPcieTrafficRate) init() { - m.data.SetName("dcgm.gpu.profiling.pcie_traffic_rate") - m.data.SetDescription("The average rate of bytes sent from the GPU over the PCIe bus over the sample period, including both protocol headers and data payloads.") - m.data.SetUnit("By/s") +// init fills gpu.dcgm.pipe.utilization metric with initial data. 
+func (m *metricGpuDcgmPipeUtilization) init() { + m.data.SetName("gpu.dcgm.pipe.utilization") + m.data.SetDescription("Fraction of cycles the corresponding GPU pipe was active, averaged over time and all multiprocessors.") + m.data.SetUnit("1") m.data.SetEmptyGauge() m.data.Gauge().DataPoints().EnsureCapacity(m.capacity) } -func (m *metricDcgmGpuProfilingPcieTrafficRate) recordDataPoint(start pcommon.Timestamp, ts pcommon.Timestamp, val int64, directionAttributeValue string) { +func (m *metricGpuDcgmPipeUtilization) recordDataPoint(start pcommon.Timestamp, ts pcommon.Timestamp, val float64, pipeAttributeValue string) { if !m.config.Enabled { return } dp := m.data.Gauge().DataPoints().AppendEmpty() dp.SetStartTimestamp(start) dp.SetTimestamp(ts) - dp.SetIntValue(val) - dp.Attributes().PutStr("direction", directionAttributeValue) + dp.SetDoubleValue(val) + dp.Attributes().PutStr("pipe", pipeAttributeValue) } // updateCapacity saves max length of data point slices that will be used for the slice capacity. -func (m *metricDcgmGpuProfilingPcieTrafficRate) updateCapacity() { +func (m *metricGpuDcgmPipeUtilization) updateCapacity() { if m.data.Gauge().DataPoints().Len() > m.capacity { m.capacity = m.data.Gauge().DataPoints().Len() } } // emit appends recorded metric data to a metrics slice and prepares it for recording another set of data points. 
-func (m *metricDcgmGpuProfilingPcieTrafficRate) emit(metrics pmetric.MetricSlice) { +func (m *metricGpuDcgmPipeUtilization) emit(metrics pmetric.MetricSlice) { if m.config.Enabled && m.data.Gauge().DataPoints().Len() > 0 { m.updateCapacity() m.data.MoveTo(metrics.AppendEmpty()) @@ -291,8 +1130,8 @@ func (m *metricDcgmGpuProfilingPcieTrafficRate) emit(metrics pmetric.MetricSlice } } -func newMetricDcgmGpuProfilingPcieTrafficRate(cfg MetricConfig) metricDcgmGpuProfilingPcieTrafficRate { - m := metricDcgmGpuProfilingPcieTrafficRate{config: cfg} +func newMetricGpuDcgmPipeUtilization(cfg MetricConfig) metricGpuDcgmPipeUtilization { + m := metricGpuDcgmPipeUtilization{config: cfg} if cfg.Enabled { m.data = pmetric.NewMetric() m.init() @@ -300,22 +1139,21 @@ func newMetricDcgmGpuProfilingPcieTrafficRate(cfg MetricConfig) metricDcgmGpuPro return m } -type metricDcgmGpuProfilingPipeUtilization struct { +type metricGpuDcgmSmOccupancy struct { data pmetric.Metric // data buffer for generated metric. config MetricConfig // metric config provided by user. capacity int // max observed number of data points added to the metric. } -// init fills dcgm.gpu.profiling.pipe_utilization metric with initial data. -func (m *metricDcgmGpuProfilingPipeUtilization) init() { - m.data.SetName("dcgm.gpu.profiling.pipe_utilization") - m.data.SetDescription("Fraction of cycles the corresponding GPU pipe was active, averaged over time and all multiprocessors.") +// init fills gpu.dcgm.sm.occupancy metric with initial data. 
+func (m *metricGpuDcgmSmOccupancy) init() { + m.data.SetName("gpu.dcgm.sm.occupancy") + m.data.SetDescription("Fraction of the number of warps resident on a multiprocessor, averaged over all multiprocessors.") m.data.SetUnit("1") m.data.SetEmptyGauge() - m.data.Gauge().DataPoints().EnsureCapacity(m.capacity) } -func (m *metricDcgmGpuProfilingPipeUtilization) recordDataPoint(start pcommon.Timestamp, ts pcommon.Timestamp, val float64, pipeAttributeValue string) { +func (m *metricGpuDcgmSmOccupancy) recordDataPoint(start pcommon.Timestamp, ts pcommon.Timestamp, val float64) { if !m.config.Enabled { return } @@ -323,18 +1161,17 @@ func (m *metricDcgmGpuProfilingPipeUtilization) recordDataPoint(start pcommon.Ti dp.SetStartTimestamp(start) dp.SetTimestamp(ts) dp.SetDoubleValue(val) - dp.Attributes().PutStr("pipe", pipeAttributeValue) } // updateCapacity saves max length of data point slices that will be used for the slice capacity. -func (m *metricDcgmGpuProfilingPipeUtilization) updateCapacity() { +func (m *metricGpuDcgmSmOccupancy) updateCapacity() { if m.data.Gauge().DataPoints().Len() > m.capacity { m.capacity = m.data.Gauge().DataPoints().Len() } } // emit appends recorded metric data to a metrics slice and prepares it for recording another set of data points. 
-func (m *metricDcgmGpuProfilingPipeUtilization) emit(metrics pmetric.MetricSlice) { +func (m *metricGpuDcgmSmOccupancy) emit(metrics pmetric.MetricSlice) { if m.config.Enabled && m.data.Gauge().DataPoints().Len() > 0 { m.updateCapacity() m.data.MoveTo(metrics.AppendEmpty()) @@ -342,8 +1179,8 @@ func (m *metricDcgmGpuProfilingPipeUtilization) emit(metrics pmetric.MetricSlice } } -func newMetricDcgmGpuProfilingPipeUtilization(cfg MetricConfig) metricDcgmGpuProfilingPipeUtilization { - m := metricDcgmGpuProfilingPipeUtilization{config: cfg} +func newMetricGpuDcgmSmOccupancy(cfg MetricConfig) metricGpuDcgmSmOccupancy { + m := metricGpuDcgmSmOccupancy{config: cfg} if cfg.Enabled { m.data = pmetric.NewMetric() m.init() @@ -351,21 +1188,21 @@ func newMetricDcgmGpuProfilingPipeUtilization(cfg MetricConfig) metricDcgmGpuPro return m } -type metricDcgmGpuProfilingSmOccupancy struct { +type metricGpuDcgmSmUtilization struct { data pmetric.Metric // data buffer for generated metric. config MetricConfig // metric config provided by user. capacity int // max observed number of data points added to the metric. } -// init fills dcgm.gpu.profiling.sm_occupancy metric with initial data. -func (m *metricDcgmGpuProfilingSmOccupancy) init() { - m.data.SetName("dcgm.gpu.profiling.sm_occupancy") - m.data.SetDescription("Fraction of resident warps on a multiprocessor relative to the maximum number supported, averaged over time and all multiprocessors.") +// init fills gpu.dcgm.sm.utilization metric with initial data. 
+func (m *metricGpuDcgmSmUtilization) init() { + m.data.SetName("gpu.dcgm.sm.utilization") + m.data.SetDescription("Fraction of time at least one warp was active on a multiprocessor, averaged over all multiprocessors.") m.data.SetUnit("1") m.data.SetEmptyGauge() } -func (m *metricDcgmGpuProfilingSmOccupancy) recordDataPoint(start pcommon.Timestamp, ts pcommon.Timestamp, val float64) { +func (m *metricGpuDcgmSmUtilization) recordDataPoint(start pcommon.Timestamp, ts pcommon.Timestamp, val float64) { if !m.config.Enabled { return } @@ -376,14 +1213,14 @@ func (m *metricDcgmGpuProfilingSmOccupancy) recordDataPoint(start pcommon.Timest } // updateCapacity saves max length of data point slices that will be used for the slice capacity. -func (m *metricDcgmGpuProfilingSmOccupancy) updateCapacity() { +func (m *metricGpuDcgmSmUtilization) updateCapacity() { if m.data.Gauge().DataPoints().Len() > m.capacity { m.capacity = m.data.Gauge().DataPoints().Len() } } // emit appends recorded metric data to a metrics slice and prepares it for recording another set of data points. 
-func (m *metricDcgmGpuProfilingSmOccupancy) emit(metrics pmetric.MetricSlice) { +func (m *metricGpuDcgmSmUtilization) emit(metrics pmetric.MetricSlice) { if m.config.Enabled && m.data.Gauge().DataPoints().Len() > 0 { m.updateCapacity() m.data.MoveTo(metrics.AppendEmpty()) @@ -391,8 +1228,8 @@ func (m *metricDcgmGpuProfilingSmOccupancy) emit(metrics pmetric.MetricSlice) { } } -func newMetricDcgmGpuProfilingSmOccupancy(cfg MetricConfig) metricDcgmGpuProfilingSmOccupancy { - m := metricDcgmGpuProfilingSmOccupancy{config: cfg} +func newMetricGpuDcgmSmUtilization(cfg MetricConfig) metricGpuDcgmSmUtilization { + m := metricGpuDcgmSmUtilization{config: cfg} if cfg.Enabled { m.data = pmetric.NewMetric() m.init() @@ -400,21 +1237,21 @@ func newMetricDcgmGpuProfilingSmOccupancy(cfg MetricConfig) metricDcgmGpuProfili return m } -type metricDcgmGpuProfilingSmUtilization struct { +type metricGpuDcgmTemperature struct { data pmetric.Metric // data buffer for generated metric. config MetricConfig // metric config provided by user. capacity int // max observed number of data points added to the metric. } -// init fills dcgm.gpu.profiling.sm_utilization metric with initial data. -func (m *metricDcgmGpuProfilingSmUtilization) init() { - m.data.SetName("dcgm.gpu.profiling.sm_utilization") - m.data.SetDescription("Fraction of time at least one warp was active on a multiprocessor, averaged over all multiprocessors.") - m.data.SetUnit("1") +// init fills gpu.dcgm.temperature metric with initial data. 
+func (m *metricGpuDcgmTemperature) init() { + m.data.SetName("gpu.dcgm.temperature") + m.data.SetDescription("Current temperature readings for the device, in ˚C.") + m.data.SetUnit("Cel") m.data.SetEmptyGauge() } -func (m *metricDcgmGpuProfilingSmUtilization) recordDataPoint(start pcommon.Timestamp, ts pcommon.Timestamp, val float64) { +func (m *metricGpuDcgmTemperature) recordDataPoint(start pcommon.Timestamp, ts pcommon.Timestamp, val float64) { if !m.config.Enabled { return } @@ -425,14 +1262,14 @@ func (m *metricDcgmGpuProfilingSmUtilization) recordDataPoint(start pcommon.Time } // updateCapacity saves max length of data point slices that will be used for the slice capacity. -func (m *metricDcgmGpuProfilingSmUtilization) updateCapacity() { +func (m *metricGpuDcgmTemperature) updateCapacity() { if m.data.Gauge().DataPoints().Len() > m.capacity { m.capacity = m.data.Gauge().DataPoints().Len() } } // emit appends recorded metric data to a metrics slice and prepares it for recording another set of data points. -func (m *metricDcgmGpuProfilingSmUtilization) emit(metrics pmetric.MetricSlice) { +func (m *metricGpuDcgmTemperature) emit(metrics pmetric.MetricSlice) { if m.config.Enabled && m.data.Gauge().DataPoints().Len() > 0 { m.updateCapacity() m.data.MoveTo(metrics.AppendEmpty()) @@ -440,8 +1277,8 @@ func (m *metricDcgmGpuProfilingSmUtilization) emit(metrics pmetric.MetricSlice) } } -func newMetricDcgmGpuProfilingSmUtilization(cfg MetricConfig) metricDcgmGpuProfilingSmUtilization { - m := metricDcgmGpuProfilingSmUtilization{config: cfg} +func newMetricGpuDcgmTemperature(cfg MetricConfig) metricGpuDcgmTemperature { + m := metricGpuDcgmTemperature{config: cfg} if cfg.Enabled { m.data = pmetric.NewMetric() m.init() @@ -449,21 +1286,21 @@ func newMetricDcgmGpuProfilingSmUtilization(cfg MetricConfig) metricDcgmGpuProfi return m } -type metricDcgmGpuUtilization struct { +type metricGpuDcgmUtilization struct { data pmetric.Metric // data buffer for generated metric. 
config MetricConfig // metric config provided by user. capacity int // max observed number of data points added to the metric. } -// init fills dcgm.gpu.utilization metric with initial data. -func (m *metricDcgmGpuUtilization) init() { - m.data.SetName("dcgm.gpu.utilization") - m.data.SetDescription("Fraction of time the GPU was not idle.") +// init fills gpu.dcgm.utilization metric with initial data. +func (m *metricGpuDcgmUtilization) init() { + m.data.SetName("gpu.dcgm.utilization") + m.data.SetDescription("Ratio of time the graphics engine is active.") m.data.SetUnit("1") m.data.SetEmptyGauge() } -func (m *metricDcgmGpuUtilization) recordDataPoint(start pcommon.Timestamp, ts pcommon.Timestamp, val float64) { +func (m *metricGpuDcgmUtilization) recordDataPoint(start pcommon.Timestamp, ts pcommon.Timestamp, val float64) { if !m.config.Enabled { return } @@ -474,14 +1311,14 @@ func (m *metricDcgmGpuUtilization) recordDataPoint(start pcommon.Timestamp, ts p } // updateCapacity saves max length of data point slices that will be used for the slice capacity. -func (m *metricDcgmGpuUtilization) updateCapacity() { +func (m *metricGpuDcgmUtilization) updateCapacity() { if m.data.Gauge().DataPoints().Len() > m.capacity { m.capacity = m.data.Gauge().DataPoints().Len() } } // emit appends recorded metric data to a metrics slice and prepares it for recording another set of data points. 
-func (m *metricDcgmGpuUtilization) emit(metrics pmetric.MetricSlice) { +func (m *metricGpuDcgmUtilization) emit(metrics pmetric.MetricSlice) { if m.config.Enabled && m.data.Gauge().DataPoints().Len() > 0 { m.updateCapacity() m.data.MoveTo(metrics.AppendEmpty()) @@ -489,8 +1326,61 @@ func (m *metricDcgmGpuUtilization) emit(metrics pmetric.MetricSlice) { } } -func newMetricDcgmGpuUtilization(cfg MetricConfig) metricDcgmGpuUtilization { - m := metricDcgmGpuUtilization{config: cfg} +func newMetricGpuDcgmUtilization(cfg MetricConfig) metricGpuDcgmUtilization { + m := metricGpuDcgmUtilization{config: cfg} + if cfg.Enabled { + m.data = pmetric.NewMetric() + m.init() + } + return m +} + +type metricGpuDcgmXidErrors struct { + data pmetric.Metric // data buffer for generated metric. + config MetricConfig // metric config provided by user. + capacity int // max observed number of data points added to the metric. +} + +// init fills gpu.dcgm.xid_errors metric with initial data. +func (m *metricGpuDcgmXidErrors) init() { + m.data.SetName("gpu.dcgm.xid_errors") + m.data.SetDescription("XID errors.") + m.data.SetUnit("1") + m.data.SetEmptySum() + m.data.Sum().SetIsMonotonic(true) + m.data.Sum().SetAggregationTemporality(pmetric.AggregationTemporalityCumulative) + m.data.Sum().DataPoints().EnsureCapacity(m.capacity) +} + +func (m *metricGpuDcgmXidErrors) recordDataPoint(start pcommon.Timestamp, ts pcommon.Timestamp, val int64, xidAttributeValue int64) { + if !m.config.Enabled { + return + } + dp := m.data.Sum().DataPoints().AppendEmpty() + dp.SetStartTimestamp(start) + dp.SetTimestamp(ts) + dp.SetIntValue(val) + dp.Attributes().PutInt("xid", xidAttributeValue) +} + +// updateCapacity saves max length of data point slices that will be used for the slice capacity. 
+func (m *metricGpuDcgmXidErrors) updateCapacity() { + if m.data.Sum().DataPoints().Len() > m.capacity { + m.capacity = m.data.Sum().DataPoints().Len() + } +} + +// emit appends recorded metric data to a metrics slice and prepares it for recording another set of data points. +func (m *metricGpuDcgmXidErrors) emit(metrics pmetric.MetricSlice) { + if m.config.Enabled && m.data.Sum().DataPoints().Len() > 0 { + m.updateCapacity() + m.data.MoveTo(metrics.AppendEmpty()) + m.init() + } +} + +func newMetricGpuDcgmXidErrors(cfg MetricConfig) metricGpuDcgmXidErrors { + m := metricGpuDcgmXidErrors{config: cfg} if cfg.Enabled { m.data = pmetric.NewMetric() m.init() @@ -516,6 +1406,22 @@ type MetricsBuilder struct { metricDcgmGpuProfilingSmOccupancy metricDcgmGpuProfilingSmOccupancy metricDcgmGpuProfilingSmUtilization metricDcgmGpuProfilingSmUtilization metricDcgmGpuUtilization metricDcgmGpuUtilization + metricGpuDcgmClockFrequency metricGpuDcgmClockFrequency + metricGpuDcgmClockThrottleDurationTime metricGpuDcgmClockThrottleDurationTime + metricGpuDcgmCodecDecoderUtilization metricGpuDcgmCodecDecoderUtilization + metricGpuDcgmCodecEncoderUtilization metricGpuDcgmCodecEncoderUtilization + metricGpuDcgmEccErrors metricGpuDcgmEccErrors + metricGpuDcgmEnergyConsumption metricGpuDcgmEnergyConsumption + metricGpuDcgmMemoryBandwidthUtilization metricGpuDcgmMemoryBandwidthUtilization + metricGpuDcgmMemoryBytesUsed metricGpuDcgmMemoryBytesUsed + metricGpuDcgmNvlinkTraffic metricGpuDcgmNvlinkTraffic + metricGpuDcgmPcieTraffic metricGpuDcgmPcieTraffic + metricGpuDcgmPipeUtilization metricGpuDcgmPipeUtilization + metricGpuDcgmSmOccupancy metricGpuDcgmSmOccupancy + metricGpuDcgmSmUtilization metricGpuDcgmSmUtilization + metricGpuDcgmTemperature metricGpuDcgmTemperature + metricGpuDcgmUtilization metricGpuDcgmUtilization + metricGpuDcgmXidErrors metricGpuDcgmXidErrors } // metricBuilderOption applies changes to default metrics builder. 
@@ -542,6 +1448,22 @@ func NewMetricsBuilder(mbc MetricsBuilderConfig, settings receiver.CreateSetting metricDcgmGpuProfilingSmOccupancy: newMetricDcgmGpuProfilingSmOccupancy(mbc.Metrics.DcgmGpuProfilingSmOccupancy), metricDcgmGpuProfilingSmUtilization: newMetricDcgmGpuProfilingSmUtilization(mbc.Metrics.DcgmGpuProfilingSmUtilization), metricDcgmGpuUtilization: newMetricDcgmGpuUtilization(mbc.Metrics.DcgmGpuUtilization), + metricGpuDcgmClockFrequency: newMetricGpuDcgmClockFrequency(mbc.Metrics.GpuDcgmClockFrequency), + metricGpuDcgmClockThrottleDurationTime: newMetricGpuDcgmClockThrottleDurationTime(mbc.Metrics.GpuDcgmClockThrottleDurationTime), + metricGpuDcgmCodecDecoderUtilization: newMetricGpuDcgmCodecDecoderUtilization(mbc.Metrics.GpuDcgmCodecDecoderUtilization), + metricGpuDcgmCodecEncoderUtilization: newMetricGpuDcgmCodecEncoderUtilization(mbc.Metrics.GpuDcgmCodecEncoderUtilization), + metricGpuDcgmEccErrors: newMetricGpuDcgmEccErrors(mbc.Metrics.GpuDcgmEccErrors), + metricGpuDcgmEnergyConsumption: newMetricGpuDcgmEnergyConsumption(mbc.Metrics.GpuDcgmEnergyConsumption), + metricGpuDcgmMemoryBandwidthUtilization: newMetricGpuDcgmMemoryBandwidthUtilization(mbc.Metrics.GpuDcgmMemoryBandwidthUtilization), + metricGpuDcgmMemoryBytesUsed: newMetricGpuDcgmMemoryBytesUsed(mbc.Metrics.GpuDcgmMemoryBytesUsed), + metricGpuDcgmNvlinkTraffic: newMetricGpuDcgmNvlinkTraffic(mbc.Metrics.GpuDcgmNvlinkTraffic), + metricGpuDcgmPcieTraffic: newMetricGpuDcgmPcieTraffic(mbc.Metrics.GpuDcgmPcieTraffic), + metricGpuDcgmPipeUtilization: newMetricGpuDcgmPipeUtilization(mbc.Metrics.GpuDcgmPipeUtilization), + metricGpuDcgmSmOccupancy: newMetricGpuDcgmSmOccupancy(mbc.Metrics.GpuDcgmSmOccupancy), + metricGpuDcgmSmUtilization: newMetricGpuDcgmSmUtilization(mbc.Metrics.GpuDcgmSmUtilization), + metricGpuDcgmTemperature: newMetricGpuDcgmTemperature(mbc.Metrics.GpuDcgmTemperature), + metricGpuDcgmUtilization: newMetricGpuDcgmUtilization(mbc.Metrics.GpuDcgmUtilization), + 
metricGpuDcgmXidErrors: newMetricGpuDcgmXidErrors(mbc.Metrics.GpuDcgmXidErrors), resourceAttributeIncludeFilter: make(map[string]filter.Filter), resourceAttributeExcludeFilter: make(map[string]filter.Filter), } @@ -632,6 +1554,22 @@ func (mb *MetricsBuilder) EmitForResource(rmo ...ResourceMetricsOption) { mb.metricDcgmGpuProfilingSmOccupancy.emit(ils.Metrics()) mb.metricDcgmGpuProfilingSmUtilization.emit(ils.Metrics()) mb.metricDcgmGpuUtilization.emit(ils.Metrics()) + mb.metricGpuDcgmClockFrequency.emit(ils.Metrics()) + mb.metricGpuDcgmClockThrottleDurationTime.emit(ils.Metrics()) + mb.metricGpuDcgmCodecDecoderUtilization.emit(ils.Metrics()) + mb.metricGpuDcgmCodecEncoderUtilization.emit(ils.Metrics()) + mb.metricGpuDcgmEccErrors.emit(ils.Metrics()) + mb.metricGpuDcgmEnergyConsumption.emit(ils.Metrics()) + mb.metricGpuDcgmMemoryBandwidthUtilization.emit(ils.Metrics()) + mb.metricGpuDcgmMemoryBytesUsed.emit(ils.Metrics()) + mb.metricGpuDcgmNvlinkTraffic.emit(ils.Metrics()) + mb.metricGpuDcgmPcieTraffic.emit(ils.Metrics()) + mb.metricGpuDcgmPipeUtilization.emit(ils.Metrics()) + mb.metricGpuDcgmSmOccupancy.emit(ils.Metrics()) + mb.metricGpuDcgmSmUtilization.emit(ils.Metrics()) + mb.metricGpuDcgmTemperature.emit(ils.Metrics()) + mb.metricGpuDcgmUtilization.emit(ils.Metrics()) + mb.metricGpuDcgmXidErrors.emit(ils.Metrics()) for _, op := range rmo { op(rm) @@ -703,6 +1641,86 @@ func (mb *MetricsBuilder) RecordDcgmGpuUtilizationDataPoint(ts pcommon.Timestamp mb.metricDcgmGpuUtilization.recordDataPoint(mb.startTime, ts, val) } +// RecordGpuDcgmClockFrequencyDataPoint adds a data point to gpu.dcgm.clock.frequency metric. +func (mb *MetricsBuilder) RecordGpuDcgmClockFrequencyDataPoint(ts pcommon.Timestamp, val float64) { + mb.metricGpuDcgmClockFrequency.recordDataPoint(mb.startTime, ts, val) +} + +// RecordGpuDcgmClockThrottleDurationTimeDataPoint adds a data point to gpu.dcgm.clock.throttle_duration.time metric. 
+func (mb *MetricsBuilder) RecordGpuDcgmClockThrottleDurationTimeDataPoint(ts pcommon.Timestamp, val float64, violationAttributeValue AttributeViolation) { + mb.metricGpuDcgmClockThrottleDurationTime.recordDataPoint(mb.startTime, ts, val, violationAttributeValue.String()) +} + +// RecordGpuDcgmCodecDecoderUtilizationDataPoint adds a data point to gpu.dcgm.codec.decoder.utilization metric. +func (mb *MetricsBuilder) RecordGpuDcgmCodecDecoderUtilizationDataPoint(ts pcommon.Timestamp, val float64) { + mb.metricGpuDcgmCodecDecoderUtilization.recordDataPoint(mb.startTime, ts, val) +} + +// RecordGpuDcgmCodecEncoderUtilizationDataPoint adds a data point to gpu.dcgm.codec.encoder.utilization metric. +func (mb *MetricsBuilder) RecordGpuDcgmCodecEncoderUtilizationDataPoint(ts pcommon.Timestamp, val float64) { + mb.metricGpuDcgmCodecEncoderUtilization.recordDataPoint(mb.startTime, ts, val) +} + +// RecordGpuDcgmEccErrorsDataPoint adds a data point to gpu.dcgm.ecc_errors metric. +func (mb *MetricsBuilder) RecordGpuDcgmEccErrorsDataPoint(ts pcommon.Timestamp, val int64, errorTypeAttributeValue AttributeErrorType) { + mb.metricGpuDcgmEccErrors.recordDataPoint(mb.startTime, ts, val, errorTypeAttributeValue.String()) +} + +// RecordGpuDcgmEnergyConsumptionDataPoint adds a data point to gpu.dcgm.energy_consumption metric. +func (mb *MetricsBuilder) RecordGpuDcgmEnergyConsumptionDataPoint(ts pcommon.Timestamp, val float64) { + mb.metricGpuDcgmEnergyConsumption.recordDataPoint(mb.startTime, ts, val) +} + +// RecordGpuDcgmMemoryBandwidthUtilizationDataPoint adds a data point to gpu.dcgm.memory.bandwidth_utilization metric. +func (mb *MetricsBuilder) RecordGpuDcgmMemoryBandwidthUtilizationDataPoint(ts pcommon.Timestamp, val float64) { + mb.metricGpuDcgmMemoryBandwidthUtilization.recordDataPoint(mb.startTime, ts, val) +} + +// RecordGpuDcgmMemoryBytesUsedDataPoint adds a data point to gpu.dcgm.memory.bytes_used metric. 
+func (mb *MetricsBuilder) RecordGpuDcgmMemoryBytesUsedDataPoint(ts pcommon.Timestamp, val int64, memoryStateAttributeValue AttributeMemoryState) { + mb.metricGpuDcgmMemoryBytesUsed.recordDataPoint(mb.startTime, ts, val, memoryStateAttributeValue.String()) +} + +// RecordGpuDcgmNvlinkTrafficDataPoint adds a data point to gpu.dcgm.nvlink.traffic metric. +func (mb *MetricsBuilder) RecordGpuDcgmNvlinkTrafficDataPoint(ts pcommon.Timestamp, val int64, directionAttributeValue AttributeDirection) { + mb.metricGpuDcgmNvlinkTraffic.recordDataPoint(mb.startTime, ts, val, directionAttributeValue.String()) +} + +// RecordGpuDcgmPcieTrafficDataPoint adds a data point to gpu.dcgm.pcie.traffic metric. +func (mb *MetricsBuilder) RecordGpuDcgmPcieTrafficDataPoint(ts pcommon.Timestamp, val int64, directionAttributeValue AttributeDirection) { + mb.metricGpuDcgmPcieTraffic.recordDataPoint(mb.startTime, ts, val, directionAttributeValue.String()) +} + +// RecordGpuDcgmPipeUtilizationDataPoint adds a data point to gpu.dcgm.pipe.utilization metric. +func (mb *MetricsBuilder) RecordGpuDcgmPipeUtilizationDataPoint(ts pcommon.Timestamp, val float64, pipeAttributeValue AttributePipe) { + mb.metricGpuDcgmPipeUtilization.recordDataPoint(mb.startTime, ts, val, pipeAttributeValue.String()) +} + +// RecordGpuDcgmSmOccupancyDataPoint adds a data point to gpu.dcgm.sm.occupancy metric. +func (mb *MetricsBuilder) RecordGpuDcgmSmOccupancyDataPoint(ts pcommon.Timestamp, val float64) { + mb.metricGpuDcgmSmOccupancy.recordDataPoint(mb.startTime, ts, val) +} + +// RecordGpuDcgmSmUtilizationDataPoint adds a data point to gpu.dcgm.sm.utilization metric. +func (mb *MetricsBuilder) RecordGpuDcgmSmUtilizationDataPoint(ts pcommon.Timestamp, val float64) { + mb.metricGpuDcgmSmUtilization.recordDataPoint(mb.startTime, ts, val) +} + +// RecordGpuDcgmTemperatureDataPoint adds a data point to gpu.dcgm.temperature metric. 
+func (mb *MetricsBuilder) RecordGpuDcgmTemperatureDataPoint(ts pcommon.Timestamp, val float64) { + mb.metricGpuDcgmTemperature.recordDataPoint(mb.startTime, ts, val) +} + +// RecordGpuDcgmUtilizationDataPoint adds a data point to gpu.dcgm.utilization metric. +func (mb *MetricsBuilder) RecordGpuDcgmUtilizationDataPoint(ts pcommon.Timestamp, val float64) { + mb.metricGpuDcgmUtilization.recordDataPoint(mb.startTime, ts, val) +} + +// RecordGpuDcgmXidErrorsDataPoint adds a data point to gpu.dcgm.xid_errors metric. +func (mb *MetricsBuilder) RecordGpuDcgmXidErrorsDataPoint(ts pcommon.Timestamp, val int64, xidAttributeValue int64) { + mb.metricGpuDcgmXidErrors.recordDataPoint(mb.startTime, ts, val, xidAttributeValue) +} + // Reset resets metrics builder to its initial state. It should be used when external metrics source is restarted, // and metrics builder should update its startTime and reset it's internal state accordingly. func (mb *MetricsBuilder) Reset(options ...metricBuilderOption) { diff --git a/receiver/dcgmreceiver/internal/metadata/generated_metrics_test.go b/receiver/dcgmreceiver/internal/metadata/generated_metrics_test.go index 9de3b9b1d..36680c0ac 100644 --- a/receiver/dcgmreceiver/internal/metadata/generated_metrics_test.go +++ b/receiver/dcgmreceiver/internal/metadata/generated_metrics_test.go @@ -100,6 +100,70 @@ func TestMetricsBuilder(t *testing.T) { allMetricsCount++ mb.RecordDcgmGpuUtilizationDataPoint(ts, 1) + defaultMetricsCount++ + allMetricsCount++ + mb.RecordGpuDcgmClockFrequencyDataPoint(ts, 1) + + defaultMetricsCount++ + allMetricsCount++ + mb.RecordGpuDcgmClockThrottleDurationTimeDataPoint(ts, 1, AttributeViolationPower) + + defaultMetricsCount++ + allMetricsCount++ + mb.RecordGpuDcgmCodecDecoderUtilizationDataPoint(ts, 1) + + defaultMetricsCount++ + allMetricsCount++ + mb.RecordGpuDcgmCodecEncoderUtilizationDataPoint(ts, 1) + + defaultMetricsCount++ + allMetricsCount++ + mb.RecordGpuDcgmEccErrorsDataPoint(ts, 1, AttributeErrorTypeSbe) + + 
defaultMetricsCount++ + allMetricsCount++ + mb.RecordGpuDcgmEnergyConsumptionDataPoint(ts, 1) + + defaultMetricsCount++ + allMetricsCount++ + mb.RecordGpuDcgmMemoryBandwidthUtilizationDataPoint(ts, 1) + + defaultMetricsCount++ + allMetricsCount++ + mb.RecordGpuDcgmMemoryBytesUsedDataPoint(ts, 1, AttributeMemoryStateUsed) + + defaultMetricsCount++ + allMetricsCount++ + mb.RecordGpuDcgmNvlinkTrafficDataPoint(ts, 1, AttributeDirectionTx) + + defaultMetricsCount++ + allMetricsCount++ + mb.RecordGpuDcgmPcieTrafficDataPoint(ts, 1, AttributeDirectionTx) + + defaultMetricsCount++ + allMetricsCount++ + mb.RecordGpuDcgmPipeUtilizationDataPoint(ts, 1, AttributePipeTensor) + + defaultMetricsCount++ + allMetricsCount++ + mb.RecordGpuDcgmSmOccupancyDataPoint(ts, 1) + + defaultMetricsCount++ + allMetricsCount++ + mb.RecordGpuDcgmSmUtilizationDataPoint(ts, 1) + + defaultMetricsCount++ + allMetricsCount++ + mb.RecordGpuDcgmTemperatureDataPoint(ts, 1) + + defaultMetricsCount++ + allMetricsCount++ + mb.RecordGpuDcgmUtilizationDataPoint(ts, 1) + + defaultMetricsCount++ + allMetricsCount++ + mb.RecordGpuDcgmXidErrorsDataPoint(ts, 1, 3) + rb := mb.NewResourceBuilder() rb.SetGpuModel("gpu.model-val") rb.SetGpuNumber("gpu.number-val") @@ -234,6 +298,231 @@ func TestMetricsBuilder(t *testing.T) { assert.Equal(t, ts, dp.Timestamp()) assert.Equal(t, pmetric.NumberDataPointValueTypeDouble, dp.ValueType()) assert.Equal(t, float64(1), dp.DoubleValue()) + case "gpu.dcgm.clock.frequency": + assert.False(t, validatedMetrics["gpu.dcgm.clock.frequency"], "Found a duplicate in the metrics slice: gpu.dcgm.clock.frequency") + validatedMetrics["gpu.dcgm.clock.frequency"] = true + assert.Equal(t, pmetric.MetricTypeGauge, ms.At(i).Type()) + assert.Equal(t, 1, ms.At(i).Gauge().DataPoints().Len()) + assert.Equal(t, "Multiprocessor clock frequency.", ms.At(i).Description()) + assert.Equal(t, "Hz", ms.At(i).Unit()) + dp := ms.At(i).Gauge().DataPoints().At(0) + assert.Equal(t, start, dp.StartTimestamp()) + 
assert.Equal(t, ts, dp.Timestamp()) + assert.Equal(t, pmetric.NumberDataPointValueTypeDouble, dp.ValueType()) + assert.Equal(t, float64(1), dp.DoubleValue()) + case "gpu.dcgm.clock.throttle_duration.time": + assert.False(t, validatedMetrics["gpu.dcgm.clock.throttle_duration.time"], "Found a duplicate in the metrics slice: gpu.dcgm.clock.throttle_duration.time") + validatedMetrics["gpu.dcgm.clock.throttle_duration.time"] = true + assert.Equal(t, pmetric.MetricTypeSum, ms.At(i).Type()) + assert.Equal(t, 1, ms.At(i).Sum().DataPoints().Len()) + assert.Equal(t, "Clock throttle total duration.", ms.At(i).Description()) + assert.Equal(t, "s", ms.At(i).Unit()) + assert.Equal(t, true, ms.At(i).Sum().IsMonotonic()) + assert.Equal(t, pmetric.AggregationTemporalityCumulative, ms.At(i).Sum().AggregationTemporality()) + dp := ms.At(i).Sum().DataPoints().At(0) + assert.Equal(t, start, dp.StartTimestamp()) + assert.Equal(t, ts, dp.Timestamp()) + assert.Equal(t, pmetric.NumberDataPointValueTypeDouble, dp.ValueType()) + assert.Equal(t, float64(1), dp.DoubleValue()) + attrVal, ok := dp.Attributes().Get("violation") + assert.True(t, ok) + assert.EqualValues(t, "power", attrVal.Str()) + case "gpu.dcgm.codec.decoder.utilization": + assert.False(t, validatedMetrics["gpu.dcgm.codec.decoder.utilization"], "Found a duplicate in the metrics slice: gpu.dcgm.codec.decoder.utilization") + validatedMetrics["gpu.dcgm.codec.decoder.utilization"] = true + assert.Equal(t, pmetric.MetricTypeGauge, ms.At(i).Type()) + assert.Equal(t, 1, ms.At(i).Gauge().DataPoints().Len()) + assert.Equal(t, "Decoder utilization.", ms.At(i).Description()) + assert.Equal(t, "1", ms.At(i).Unit()) + dp := ms.At(i).Gauge().DataPoints().At(0) + assert.Equal(t, start, dp.StartTimestamp()) + assert.Equal(t, ts, dp.Timestamp()) + assert.Equal(t, pmetric.NumberDataPointValueTypeDouble, dp.ValueType()) + assert.Equal(t, float64(1), dp.DoubleValue()) + case "gpu.dcgm.codec.encoder.utilization": + assert.False(t, 
validatedMetrics["gpu.dcgm.codec.encoder.utilization"], "Found a duplicate in the metrics slice: gpu.dcgm.codec.encoder.utilization") + validatedMetrics["gpu.dcgm.codec.encoder.utilization"] = true + assert.Equal(t, pmetric.MetricTypeGauge, ms.At(i).Type()) + assert.Equal(t, 1, ms.At(i).Gauge().DataPoints().Len()) + assert.Equal(t, "Encoder utilization.", ms.At(i).Description()) + assert.Equal(t, "1", ms.At(i).Unit()) + dp := ms.At(i).Gauge().DataPoints().At(0) + assert.Equal(t, start, dp.StartTimestamp()) + assert.Equal(t, ts, dp.Timestamp()) + assert.Equal(t, pmetric.NumberDataPointValueTypeDouble, dp.ValueType()) + assert.Equal(t, float64(1), dp.DoubleValue()) + case "gpu.dcgm.ecc_errors": + assert.False(t, validatedMetrics["gpu.dcgm.ecc_errors"], "Found a duplicate in the metrics slice: gpu.dcgm.ecc_errors") + validatedMetrics["gpu.dcgm.ecc_errors"] = true + assert.Equal(t, pmetric.MetricTypeSum, ms.At(i).Type()) + assert.Equal(t, 1, ms.At(i).Sum().DataPoints().Len()) + assert.Equal(t, "Data corruption errors.", ms.At(i).Description()) + assert.Equal(t, "1", ms.At(i).Unit()) + assert.Equal(t, true, ms.At(i).Sum().IsMonotonic()) + assert.Equal(t, pmetric.AggregationTemporalityCumulative, ms.At(i).Sum().AggregationTemporality()) + dp := ms.At(i).Sum().DataPoints().At(0) + assert.Equal(t, start, dp.StartTimestamp()) + assert.Equal(t, ts, dp.Timestamp()) + assert.Equal(t, pmetric.NumberDataPointValueTypeInt, dp.ValueType()) + assert.Equal(t, int64(1), dp.IntValue()) + attrVal, ok := dp.Attributes().Get("error_type") + assert.True(t, ok) + assert.EqualValues(t, "sbe", attrVal.Str()) + case "gpu.dcgm.energy_consumption": + assert.False(t, validatedMetrics["gpu.dcgm.energy_consumption"], "Found a duplicate in the metrics slice: gpu.dcgm.energy_consumption") + validatedMetrics["gpu.dcgm.energy_consumption"] = true + assert.Equal(t, pmetric.MetricTypeSum, ms.At(i).Type()) + assert.Equal(t, 1, ms.At(i).Sum().DataPoints().Len()) + assert.Equal(t, "Total energy consumption 
for the GPU in J since the driver was last reloaded.", ms.At(i).Description()) + assert.Equal(t, "J", ms.At(i).Unit()) + assert.Equal(t, true, ms.At(i).Sum().IsMonotonic()) + assert.Equal(t, pmetric.AggregationTemporalityCumulative, ms.At(i).Sum().AggregationTemporality()) + dp := ms.At(i).Sum().DataPoints().At(0) + assert.Equal(t, start, dp.StartTimestamp()) + assert.Equal(t, ts, dp.Timestamp()) + assert.Equal(t, pmetric.NumberDataPointValueTypeDouble, dp.ValueType()) + assert.Equal(t, float64(1), dp.DoubleValue()) + case "gpu.dcgm.memory.bandwidth_utilization": + assert.False(t, validatedMetrics["gpu.dcgm.memory.bandwidth_utilization"], "Found a duplicate in the metrics slice: gpu.dcgm.memory.bandwidth_utilization") + validatedMetrics["gpu.dcgm.memory.bandwidth_utilization"] = true + assert.Equal(t, pmetric.MetricTypeGauge, ms.At(i).Type()) + assert.Equal(t, 1, ms.At(i).Gauge().DataPoints().Len()) + assert.Equal(t, "Fraction of cycles data was being sent or received from GPU memory.", ms.At(i).Description()) + assert.Equal(t, "1", ms.At(i).Unit()) + dp := ms.At(i).Gauge().DataPoints().At(0) + assert.Equal(t, start, dp.StartTimestamp()) + assert.Equal(t, ts, dp.Timestamp()) + assert.Equal(t, pmetric.NumberDataPointValueTypeDouble, dp.ValueType()) + assert.Equal(t, float64(1), dp.DoubleValue()) + case "gpu.dcgm.memory.bytes_used": + assert.False(t, validatedMetrics["gpu.dcgm.memory.bytes_used"], "Found a duplicate in the metrics slice: gpu.dcgm.memory.bytes_used") + validatedMetrics["gpu.dcgm.memory.bytes_used"] = true + assert.Equal(t, pmetric.MetricTypeGauge, ms.At(i).Type()) + assert.Equal(t, 1, ms.At(i).Gauge().DataPoints().Len()) + assert.Equal(t, "Current number of GPU memory bytes used by state. 
Summing the values of all states yields the total GPU memory space.", ms.At(i).Description()) + assert.Equal(t, "By", ms.At(i).Unit()) + dp := ms.At(i).Gauge().DataPoints().At(0) + assert.Equal(t, start, dp.StartTimestamp()) + assert.Equal(t, ts, dp.Timestamp()) + assert.Equal(t, pmetric.NumberDataPointValueTypeInt, dp.ValueType()) + assert.Equal(t, int64(1), dp.IntValue()) + attrVal, ok := dp.Attributes().Get("memory_state") + assert.True(t, ok) + assert.EqualValues(t, "used", attrVal.Str()) + case "gpu.dcgm.nvlink.traffic": + assert.False(t, validatedMetrics["gpu.dcgm.nvlink.traffic"], "Found a duplicate in the metrics slice: gpu.dcgm.nvlink.traffic") + validatedMetrics["gpu.dcgm.nvlink.traffic"] = true + assert.Equal(t, pmetric.MetricTypeSum, ms.At(i).Type()) + assert.Equal(t, 1, ms.At(i).Sum().DataPoints().Len()) + assert.Equal(t, "The number of bytes sent over NVLink, not including protocol headers.", ms.At(i).Description()) + assert.Equal(t, "By", ms.At(i).Unit()) + assert.Equal(t, true, ms.At(i).Sum().IsMonotonic()) + assert.Equal(t, pmetric.AggregationTemporalityDelta, ms.At(i).Sum().AggregationTemporality()) + dp := ms.At(i).Sum().DataPoints().At(0) + assert.Equal(t, start, dp.StartTimestamp()) + assert.Equal(t, ts, dp.Timestamp()) + assert.Equal(t, pmetric.NumberDataPointValueTypeInt, dp.ValueType()) + assert.Equal(t, int64(1), dp.IntValue()) + attrVal, ok := dp.Attributes().Get("direction") + assert.True(t, ok) + assert.EqualValues(t, "tx", attrVal.Str()) + case "gpu.dcgm.pcie.traffic": + assert.False(t, validatedMetrics["gpu.dcgm.pcie.traffic"], "Found a duplicate in the metrics slice: gpu.dcgm.pcie.traffic") + validatedMetrics["gpu.dcgm.pcie.traffic"] = true + assert.Equal(t, pmetric.MetricTypeSum, ms.At(i).Type()) + assert.Equal(t, 1, ms.At(i).Sum().DataPoints().Len()) + assert.Equal(t, "The number of bytes sent over the PCIe bus, including both protocol headers and data payloads.", ms.At(i).Description()) + assert.Equal(t, "By", ms.At(i).Unit()) + 
assert.Equal(t, true, ms.At(i).Sum().IsMonotonic()) + assert.Equal(t, pmetric.AggregationTemporalityDelta, ms.At(i).Sum().AggregationTemporality()) + dp := ms.At(i).Sum().DataPoints().At(0) + assert.Equal(t, start, dp.StartTimestamp()) + assert.Equal(t, ts, dp.Timestamp()) + assert.Equal(t, pmetric.NumberDataPointValueTypeInt, dp.ValueType()) + assert.Equal(t, int64(1), dp.IntValue()) + attrVal, ok := dp.Attributes().Get("direction") + assert.True(t, ok) + assert.EqualValues(t, "tx", attrVal.Str()) + case "gpu.dcgm.pipe.utilization": + assert.False(t, validatedMetrics["gpu.dcgm.pipe.utilization"], "Found a duplicate in the metrics slice: gpu.dcgm.pipe.utilization") + validatedMetrics["gpu.dcgm.pipe.utilization"] = true + assert.Equal(t, pmetric.MetricTypeGauge, ms.At(i).Type()) + assert.Equal(t, 1, ms.At(i).Gauge().DataPoints().Len()) + assert.Equal(t, "Fraction of cycles the corresponding GPU pipe was active, averaged over time and all multiprocessors.", ms.At(i).Description()) + assert.Equal(t, "1", ms.At(i).Unit()) + dp := ms.At(i).Gauge().DataPoints().At(0) + assert.Equal(t, start, dp.StartTimestamp()) + assert.Equal(t, ts, dp.Timestamp()) + assert.Equal(t, pmetric.NumberDataPointValueTypeDouble, dp.ValueType()) + assert.Equal(t, float64(1), dp.DoubleValue()) + attrVal, ok := dp.Attributes().Get("pipe") + assert.True(t, ok) + assert.EqualValues(t, "tensor", attrVal.Str()) + case "gpu.dcgm.sm.occupancy": + assert.False(t, validatedMetrics["gpu.dcgm.sm.occupancy"], "Found a duplicate in the metrics slice: gpu.dcgm.sm.occupancy") + validatedMetrics["gpu.dcgm.sm.occupancy"] = true + assert.Equal(t, pmetric.MetricTypeGauge, ms.At(i).Type()) + assert.Equal(t, 1, ms.At(i).Gauge().DataPoints().Len()) + assert.Equal(t, "Fraction of the number of warps resident on a multiprocessor, averaged over all multiprocessors.", ms.At(i).Description()) + assert.Equal(t, "1", ms.At(i).Unit()) + dp := ms.At(i).Gauge().DataPoints().At(0) + assert.Equal(t, start, dp.StartTimestamp()) + 
assert.Equal(t, ts, dp.Timestamp()) + assert.Equal(t, pmetric.NumberDataPointValueTypeDouble, dp.ValueType()) + assert.Equal(t, float64(1), dp.DoubleValue()) + case "gpu.dcgm.sm.utilization": + assert.False(t, validatedMetrics["gpu.dcgm.sm.utilization"], "Found a duplicate in the metrics slice: gpu.dcgm.sm.utilization") + validatedMetrics["gpu.dcgm.sm.utilization"] = true + assert.Equal(t, pmetric.MetricTypeGauge, ms.At(i).Type()) + assert.Equal(t, 1, ms.At(i).Gauge().DataPoints().Len()) + assert.Equal(t, "Fraction of time at least one warp was active on a multiprocessor, averaged over all multiprocessors.", ms.At(i).Description()) + assert.Equal(t, "1", ms.At(i).Unit()) + dp := ms.At(i).Gauge().DataPoints().At(0) + assert.Equal(t, start, dp.StartTimestamp()) + assert.Equal(t, ts, dp.Timestamp()) + assert.Equal(t, pmetric.NumberDataPointValueTypeDouble, dp.ValueType()) + assert.Equal(t, float64(1), dp.DoubleValue()) + case "gpu.dcgm.temperature": + assert.False(t, validatedMetrics["gpu.dcgm.temperature"], "Found a duplicate in the metrics slice: gpu.dcgm.temperature") + validatedMetrics["gpu.dcgm.temperature"] = true + assert.Equal(t, pmetric.MetricTypeGauge, ms.At(i).Type()) + assert.Equal(t, 1, ms.At(i).Gauge().DataPoints().Len()) + assert.Equal(t, "Current temperature readings for the device, in ˚C.", ms.At(i).Description()) + assert.Equal(t, "Cel", ms.At(i).Unit()) + dp := ms.At(i).Gauge().DataPoints().At(0) + assert.Equal(t, start, dp.StartTimestamp()) + assert.Equal(t, ts, dp.Timestamp()) + assert.Equal(t, pmetric.NumberDataPointValueTypeDouble, dp.ValueType()) + assert.Equal(t, float64(1), dp.DoubleValue()) + case "gpu.dcgm.utilization": + assert.False(t, validatedMetrics["gpu.dcgm.utilization"], "Found a duplicate in the metrics slice: gpu.dcgm.utilization") + validatedMetrics["gpu.dcgm.utilization"] = true + assert.Equal(t, pmetric.MetricTypeGauge, ms.At(i).Type()) + assert.Equal(t, 1, ms.At(i).Gauge().DataPoints().Len()) + assert.Equal(t, "Ratio of time 
the graphics engine is active.", ms.At(i).Description()) + assert.Equal(t, "1", ms.At(i).Unit()) + dp := ms.At(i).Gauge().DataPoints().At(0) + assert.Equal(t, start, dp.StartTimestamp()) + assert.Equal(t, ts, dp.Timestamp()) + assert.Equal(t, pmetric.NumberDataPointValueTypeDouble, dp.ValueType()) + assert.Equal(t, float64(1), dp.DoubleValue()) + case "gpu.dcgm.xid_errors": + assert.False(t, validatedMetrics["gpu.dcgm.xid_errors"], "Found a duplicate in the metrics slice: gpu.dcgm.xid_errors") + validatedMetrics["gpu.dcgm.xid_errors"] = true + assert.Equal(t, pmetric.MetricTypeSum, ms.At(i).Type()) + assert.Equal(t, 1, ms.At(i).Sum().DataPoints().Len()) + assert.Equal(t, "XID errors.", ms.At(i).Description()) + assert.Equal(t, "1", ms.At(i).Unit()) + assert.Equal(t, true, ms.At(i).Sum().IsMonotonic()) + assert.Equal(t, pmetric.AggregationTemporalityCumulative, ms.At(i).Sum().AggregationTemporality()) + dp := ms.At(i).Sum().DataPoints().At(0) + assert.Equal(t, start, dp.StartTimestamp()) + assert.Equal(t, ts, dp.Timestamp()) + assert.Equal(t, pmetric.NumberDataPointValueTypeInt, dp.ValueType()) + assert.Equal(t, int64(1), dp.IntValue()) + attrVal, ok := dp.Attributes().Get("xid") + assert.True(t, ok) + assert.EqualValues(t, 3, attrVal.Int()) } } }) diff --git a/receiver/dcgmreceiver/internal/metadata/testdata/config.yaml b/receiver/dcgmreceiver/internal/metadata/testdata/config.yaml index 2b5c665af..b18d5284b 100644 --- a/receiver/dcgmreceiver/internal/metadata/testdata/config.yaml +++ b/receiver/dcgmreceiver/internal/metadata/testdata/config.yaml @@ -17,6 +17,38 @@ all_set: enabled: true dcgm.gpu.utilization: enabled: true + gpu.dcgm.clock.frequency: + enabled: true + gpu.dcgm.clock.throttle_duration.time: + enabled: true + gpu.dcgm.codec.decoder.utilization: + enabled: true + gpu.dcgm.codec.encoder.utilization: + enabled: true + gpu.dcgm.ecc_errors: + enabled: true + gpu.dcgm.energy_consumption: + enabled: true + gpu.dcgm.memory.bandwidth_utilization: + enabled: 
true + gpu.dcgm.memory.bytes_used: + enabled: true + gpu.dcgm.nvlink.traffic: + enabled: true + gpu.dcgm.pcie.traffic: + enabled: true + gpu.dcgm.pipe.utilization: + enabled: true + gpu.dcgm.sm.occupancy: + enabled: true + gpu.dcgm.sm.utilization: + enabled: true + gpu.dcgm.temperature: + enabled: true + gpu.dcgm.utilization: + enabled: true + gpu.dcgm.xid_errors: + enabled: true resource_attributes: gpu.model: enabled: true @@ -42,6 +74,38 @@ none_set: enabled: false dcgm.gpu.utilization: enabled: false + gpu.dcgm.clock.frequency: + enabled: false + gpu.dcgm.clock.throttle_duration.time: + enabled: false + gpu.dcgm.codec.decoder.utilization: + enabled: false + gpu.dcgm.codec.encoder.utilization: + enabled: false + gpu.dcgm.ecc_errors: + enabled: false + gpu.dcgm.energy_consumption: + enabled: false + gpu.dcgm.memory.bandwidth_utilization: + enabled: false + gpu.dcgm.memory.bytes_used: + enabled: false + gpu.dcgm.nvlink.traffic: + enabled: false + gpu.dcgm.pcie.traffic: + enabled: false + gpu.dcgm.pipe.utilization: + enabled: false + gpu.dcgm.sm.occupancy: + enabled: false + gpu.dcgm.sm.utilization: + enabled: false + gpu.dcgm.temperature: + enabled: false + gpu.dcgm.utilization: + enabled: false + gpu.dcgm.xid_errors: + enabled: false resource_attributes: gpu.model: enabled: false diff --git a/receiver/dcgmreceiver/metadata.yaml b/receiver/dcgmreceiver/metadata.yaml index d226fbad6..15c91783e 100644 --- a/receiver/dcgmreceiver/metadata.yaml +++ b/receiver/dcgmreceiver/metadata.yaml @@ -24,8 +24,8 @@ resource_attributes: attributes: memory_state: type: string - description: GPU memory used or free - enum: [used, free] + description: GPU memory state, one of [free, used, reserved]. + enum: [used, free, reserved] pipe: type: string @@ -37,7 +37,153 @@ attributes: description: Direction of the link traffic, one of [tx, rx]. 
enum: [tx, rx] + violation: + type: string + description: Reason for throttling, one of [power, thermal, sync_boost, board_limit, low_util, reliability, app_clock, base_clock]. + enum: [power, thermal, sync_boost, board_limit, low_util, reliability, app_clock, base_clock] + + error_type: + type: string + description: The type of error, one of [sbe, dbe]. + enum: [sbe, dbe] + + xid: + type: int + description: The XID code for the error, 1..143. + metrics: + gpu.dcgm.utilization: + enabled: true + description: Ratio of time the graphics engine is active. + unit: 1 + gauge: + value_type: double + + gpu.dcgm.sm.utilization: + enabled: true + description: Fraction of time at least one warp was active on a multiprocessor, averaged over all multiprocessors. + unit: 1 + gauge: + value_type: double + + gpu.dcgm.sm.occupancy: + enabled: true + description: Fraction of the number of warps resident on a multiprocessor, averaged over all multiprocessors. + unit: 1 + gauge: + value_type: double + + gpu.dcgm.pipe.utilization: + enabled: true + description: Fraction of cycles the corresponding GPU pipe was active, averaged over time and all multiprocessors. + unit: 1 + gauge: + value_type: double + attributes: [pipe] + + gpu.dcgm.codec.encoder.utilization: + enabled: true + description: Encoder utilization. + unit: 1 + gauge: + value_type: double + + gpu.dcgm.codec.decoder.utilization: + enabled: true + description: Decoder utilization. + unit: 1 + gauge: + value_type: double + + gpu.dcgm.memory.bytes_used: + enabled: true + description: Current number of GPU memory bytes used by state. Summing the values of all states yields the total GPU memory space. + unit: By + gauge: + value_type: int + attributes: [memory_state] + + gpu.dcgm.memory.bandwidth_utilization: + enabled: true + description: Fraction of cycles data was being sent or received from GPU memory. 
+ unit: 1 + gauge: + value_type: double + + gpu.dcgm.pcie.traffic: + enabled: true + description: The number of bytes sent over the PCIe bus, including both protocol headers and data payloads. + unit: By + sum: + value_type: int + aggregation_temporality: delta + monotonic: true + attributes: [direction] + + gpu.dcgm.nvlink.traffic: + enabled: true + description: The number of bytes sent over NVLink, not including protocol headers. + unit: By + sum: + value_type: int + aggregation_temporality: delta + monotonic: true + attributes: [direction] + + gpu.dcgm.energy_consumption: + enabled: true + description: Total energy consumption for the GPU in J since the driver was last reloaded. + unit: J + sum: + value_type: double + aggregation_temporality: cumulative + monotonic: true + + gpu.dcgm.temperature: + enabled: true + description: Current temperature readings for the device, in ˚C. + unit: Cel + gauge: + value_type: double + + gpu.dcgm.clock.frequency: + enabled: true + description: Multiprocessor clock frequency. + unit: Hz + gauge: + value_type: double + + gpu.dcgm.clock.throttle_duration.time: + enabled: true + description: Clock throttle total duration. + unit: s + sum: + value_type: double + aggregation_temporality: cumulative + monotonic: true + attributes: [violation] + + gpu.dcgm.ecc_errors: + enabled: true + description: Data corruption errors. + unit: 1 + sum: + value_type: int + aggregation_temporality: cumulative + monotonic: true + attributes: [error_type] + + gpu.dcgm.xid_errors: + enabled: true + description: XID errors. + unit: 1 + sum: + value_type: int + aggregation_temporality: cumulative + monotonic: true + attributes: [xid] + +#--- dcgm.gpu.utilization: description: Fraction of time the GPU was not idle. unit: 1 From 356066137051fec61b61df33df70449a79048311 Mon Sep 17 00:00:00 2001 From: Igor Peshansky Date: Fri, 21 Jun 2024 18:23:15 -0400 Subject: [PATCH 09/38] Turn gpu.dcgm.sm.occupancy off by default. 
--- receiver/dcgmreceiver/documentation.md | 26 ++++++++++----- .../internal/metadata/generated_config.go | 2 +- .../metadata/generated_metrics_test.go | 1 - receiver/dcgmreceiver/metadata.yaml | 32 +++++++++---------- .../testdata/NVIDIA_A100-SXM4-40GB.yaml | 1 - receiver/dcgmreceiver/testdata/NVIDIA_L4.yaml | 1 - .../testdata/Tesla_P100-PCIE-16GB.yaml | 1 - receiver/dcgmreceiver/testdata/Tesla_P4.yaml | 1 - receiver/dcgmreceiver/testdata/Tesla_T4.yaml | 1 - .../testdata/Tesla_V100-SXM2-16GB.yaml | 1 - 10 files changed, 35 insertions(+), 32 deletions(-) diff --git a/receiver/dcgmreceiver/documentation.md b/receiver/dcgmreceiver/documentation.md index 5e17a9c1d..d7bb01b7f 100644 --- a/receiver/dcgmreceiver/documentation.md +++ b/receiver/dcgmreceiver/documentation.md @@ -224,14 +224,6 @@ Fraction of cycles the corresponding GPU pipe was active, averaged over time and | ---- | ----------- | ------ | | pipe | GPU pipe in use, one of [tensor, fp64, fp32, fp16]. | Str: ``tensor``, ``fp64``, ``fp32``, ``fp16`` | -### gpu.dcgm.sm.occupancy - -Fraction of the number of warps resident on a multiprocessor, averaged over all multiprocessors. - -| Unit | Metric Type | Value Type | -| ---- | ----------- | ---------- | -| 1 | Gauge | Double | - ### gpu.dcgm.sm.utilization Fraction of time at least one warp was active on a multiprocessor, averaged over all multiprocessors. @@ -270,6 +262,24 @@ XID errors. | ---- | ----------- | ------ | | xid | The XID code for the error, 1..143. | Any Int | +## Optional Metrics + +The following metrics are not emitted by default. Each of them can be enabled by applying the following configuration: + +```yaml +metrics: + : + enabled: true +``` + +### gpu.dcgm.sm.occupancy + +Fraction of the number of warps resident on a multiprocessor, averaged over all multiprocessors. 
+ +| Unit | Metric Type | Value Type | +| ---- | ----------- | ---------- | +| 1 | Gauge | Double | + ## Resource Attributes | Name | Description | Values | Enabled | diff --git a/receiver/dcgmreceiver/internal/metadata/generated_config.go b/receiver/dcgmreceiver/internal/metadata/generated_config.go index 2d6f5708b..17902b040 100644 --- a/receiver/dcgmreceiver/internal/metadata/generated_config.go +++ b/receiver/dcgmreceiver/internal/metadata/generated_config.go @@ -114,7 +114,7 @@ func DefaultMetricsConfig() MetricsConfig { Enabled: true, }, GpuDcgmSmOccupancy: MetricConfig{ - Enabled: true, + Enabled: false, }, GpuDcgmSmUtilization: MetricConfig{ Enabled: true, diff --git a/receiver/dcgmreceiver/internal/metadata/generated_metrics_test.go b/receiver/dcgmreceiver/internal/metadata/generated_metrics_test.go index 36680c0ac..d56d05412 100644 --- a/receiver/dcgmreceiver/internal/metadata/generated_metrics_test.go +++ b/receiver/dcgmreceiver/internal/metadata/generated_metrics_test.go @@ -144,7 +144,6 @@ func TestMetricsBuilder(t *testing.T) { allMetricsCount++ mb.RecordGpuDcgmPipeUtilizationDataPoint(ts, 1, AttributePipeTensor) - defaultMetricsCount++ allMetricsCount++ mb.RecordGpuDcgmSmOccupancyDataPoint(ts, 1) diff --git a/receiver/dcgmreceiver/metadata.yaml b/receiver/dcgmreceiver/metadata.yaml index 15c91783e..45783be65 100644 --- a/receiver/dcgmreceiver/metadata.yaml +++ b/receiver/dcgmreceiver/metadata.yaml @@ -53,65 +53,64 @@ attributes: metrics: gpu.dcgm.utilization: - enabled: true description: Ratio of time the graphics engine is active. unit: 1 gauge: value_type: double + enabled: true gpu.dcgm.sm.utilization: - enabled: true description: Fraction of time at least one warp was active on a multiprocessor, averaged over all multiprocessors. unit: 1 gauge: value_type: double + enabled: true gpu.dcgm.sm.occupancy: - enabled: true description: Fraction of the number of warps resident on a multiprocessor, averaged over all multiprocessors. 
unit: 1 gauge: value_type: double + enabled: false gpu.dcgm.pipe.utilization: - enabled: true description: Fraction of cycles the corresponding GPU pipe was active, averaged over time and all multiprocessors. unit: 1 gauge: value_type: double attributes: [pipe] + enabled: true gpu.dcgm.codec.encoder.utilization: - enabled: true description: Encoder utilization. unit: 1 gauge: value_type: double + enabled: true gpu.dcgm.codec.decoder.utilization: - enabled: true description: Decoder utilization. unit: 1 gauge: value_type: double + enabled: true gpu.dcgm.memory.bytes_used: - enabled: true description: Current number of GPU memory bytes used by state. Summing the values of all states yields the total GPU memory space. unit: By gauge: value_type: int attributes: [memory_state] + enabled: true gpu.dcgm.memory.bandwidth_utilization: - enabled: true description: Fraction of cycles data was being sent or received from GPU memory. unit: 1 gauge: value_type: double + enabled: true gpu.dcgm.pcie.traffic: - enabled: true description: The number of bytes sent over the PCIe bus, including both protocol headers and data payloads. unit: By sum: @@ -119,9 +118,9 @@ metrics: aggregation_temporality: delta monotonic: true attributes: [direction] + enabled: true gpu.dcgm.nvlink.traffic: - enabled: true description: The number of bytes sent over NVLink, not including protocol headers. unit: By sum: @@ -129,32 +128,32 @@ metrics: aggregation_temporality: delta monotonic: true attributes: [direction] + enabled: true gpu.dcgm.energy_consumption: - enabled: true description: Total energy consumption for the GPU in J since the driver was last reloaded. unit: J sum: value_type: double aggregation_temporality: cumulative monotonic: true + enabled: true gpu.dcgm.temperature: - enabled: true description: Current temperature readings for the device, in ˚C. unit: Cel gauge: value_type: double + enabled: true gpu.dcgm.clock.frequency: - enabled: true description: Multiprocessor clock frequency. 
unit: Hz gauge: value_type: double + enabled: true gpu.dcgm.clock.throttle_duration.time: - enabled: true description: Clock throttle total duration. unit: s sum: @@ -162,9 +161,9 @@ metrics: aggregation_temporality: cumulative monotonic: true attributes: [violation] + enabled: true gpu.dcgm.ecc_errors: - enabled: true description: Data corruption errors. unit: 1 sum: @@ -172,9 +171,9 @@ metrics: aggregation_temporality: cumulative monotonic: true attributes: [error_type] + enabled: true gpu.dcgm.xid_errors: - enabled: true description: XID errors. unit: 1 sum: @@ -182,6 +181,7 @@ metrics: aggregation_temporality: cumulative monotonic: true attributes: [xid] + enabled: true #--- dcgm.gpu.utilization: diff --git a/receiver/dcgmreceiver/testdata/NVIDIA_A100-SXM4-40GB.yaml b/receiver/dcgmreceiver/testdata/NVIDIA_A100-SXM4-40GB.yaml index 230ab0c17..46631337b 100644 --- a/receiver/dcgmreceiver/testdata/NVIDIA_A100-SXM4-40GB.yaml +++ b/receiver/dcgmreceiver/testdata/NVIDIA_A100-SXM4-40GB.yaml @@ -4,7 +4,6 @@ supported_fields: - DCGM_FI_DEV_FB_USED - DCGM_FI_DEV_FB_FREE - DCGM_FI_PROF_SM_ACTIVE - - DCGM_FI_PROF_SM_OCCUPANCY - DCGM_FI_PROF_PIPE_TENSOR_ACTIVE - DCGM_FI_PROF_PIPE_FP64_ACTIVE - DCGM_FI_PROF_PIPE_FP32_ACTIVE diff --git a/receiver/dcgmreceiver/testdata/NVIDIA_L4.yaml b/receiver/dcgmreceiver/testdata/NVIDIA_L4.yaml index ff81429c2..2a7083e80 100644 --- a/receiver/dcgmreceiver/testdata/NVIDIA_L4.yaml +++ b/receiver/dcgmreceiver/testdata/NVIDIA_L4.yaml @@ -4,7 +4,6 @@ supported_fields: - DCGM_FI_DEV_FB_USED - DCGM_FI_DEV_FB_FREE - DCGM_FI_PROF_SM_ACTIVE - - DCGM_FI_PROF_SM_OCCUPANCY - DCGM_FI_PROF_PIPE_TENSOR_ACTIVE - DCGM_FI_PROF_PIPE_FP32_ACTIVE - DCGM_FI_PROF_PIPE_FP16_ACTIVE diff --git a/receiver/dcgmreceiver/testdata/Tesla_P100-PCIE-16GB.yaml b/receiver/dcgmreceiver/testdata/Tesla_P100-PCIE-16GB.yaml index 729a6f39c..296efe4d2 100644 --- a/receiver/dcgmreceiver/testdata/Tesla_P100-PCIE-16GB.yaml +++ b/receiver/dcgmreceiver/testdata/Tesla_P100-PCIE-16GB.yaml 
@@ -5,7 +5,6 @@ supported_fields: - DCGM_FI_DEV_FB_FREE unsupported_fields: - DCGM_FI_PROF_SM_ACTIVE - - DCGM_FI_PROF_SM_OCCUPANCY - DCGM_FI_PROF_PIPE_TENSOR_ACTIVE - DCGM_FI_PROF_PIPE_FP64_ACTIVE - DCGM_FI_PROF_PIPE_FP32_ACTIVE diff --git a/receiver/dcgmreceiver/testdata/Tesla_P4.yaml b/receiver/dcgmreceiver/testdata/Tesla_P4.yaml index 9b115f49a..d9f715dde 100644 --- a/receiver/dcgmreceiver/testdata/Tesla_P4.yaml +++ b/receiver/dcgmreceiver/testdata/Tesla_P4.yaml @@ -5,7 +5,6 @@ supported_fields: - DCGM_FI_DEV_FB_FREE unsupported_fields: - DCGM_FI_PROF_SM_ACTIVE - - DCGM_FI_PROF_SM_OCCUPANCY - DCGM_FI_PROF_PIPE_TENSOR_ACTIVE - DCGM_FI_PROF_PIPE_FP64_ACTIVE - DCGM_FI_PROF_PIPE_FP32_ACTIVE diff --git a/receiver/dcgmreceiver/testdata/Tesla_T4.yaml b/receiver/dcgmreceiver/testdata/Tesla_T4.yaml index 37a066b37..29978f7e5 100644 --- a/receiver/dcgmreceiver/testdata/Tesla_T4.yaml +++ b/receiver/dcgmreceiver/testdata/Tesla_T4.yaml @@ -4,7 +4,6 @@ supported_fields: - DCGM_FI_DEV_FB_USED - DCGM_FI_DEV_FB_FREE - DCGM_FI_PROF_SM_ACTIVE - - DCGM_FI_PROF_SM_OCCUPANCY - DCGM_FI_PROF_PIPE_TENSOR_ACTIVE - DCGM_FI_PROF_PIPE_FP64_ACTIVE - DCGM_FI_PROF_PIPE_FP32_ACTIVE diff --git a/receiver/dcgmreceiver/testdata/Tesla_V100-SXM2-16GB.yaml b/receiver/dcgmreceiver/testdata/Tesla_V100-SXM2-16GB.yaml index aec19e80c..151e5a3ed 100644 --- a/receiver/dcgmreceiver/testdata/Tesla_V100-SXM2-16GB.yaml +++ b/receiver/dcgmreceiver/testdata/Tesla_V100-SXM2-16GB.yaml @@ -4,7 +4,6 @@ supported_fields: - DCGM_FI_DEV_FB_USED - DCGM_FI_DEV_FB_FREE - DCGM_FI_PROF_SM_ACTIVE - - DCGM_FI_PROF_SM_OCCUPANCY - DCGM_FI_PROF_PIPE_TENSOR_ACTIVE - DCGM_FI_PROF_PIPE_FP64_ACTIVE - DCGM_FI_PROF_PIPE_FP32_ACTIVE From 24a21cbe6a990c4b41e25c8dd340ed62613ac408 Mon Sep 17 00:00:00 2001 From: Igor Peshansky Date: Wed, 29 May 2024 22:19:35 -0400 Subject: [PATCH 10/38] Ingest new metrics instead. 
--- receiver/dcgmreceiver/client.go | 60 ++++-- receiver/dcgmreceiver/client_gpu_test.go | 55 +++++- receiver/dcgmreceiver/scraper.go | 102 ++++++++-- receiver/dcgmreceiver/scraper_gpu_test.go | 183 ++++++++++++++---- .../testdata/NVIDIA_A100-SXM4-40GB.yaml | 23 ++- receiver/dcgmreceiver/testdata/NVIDIA_L4.yaml | 23 ++- .../testdata/Tesla_P100-PCIE-16GB.yaml | 21 +- receiver/dcgmreceiver/testdata/Tesla_P4.yaml | 21 +- receiver/dcgmreceiver/testdata/Tesla_T4.yaml | 23 ++- .../testdata/Tesla_V100-SXM2-16GB.yaml | 23 ++- 10 files changed, 449 insertions(+), 85 deletions(-) diff --git a/receiver/dcgmreceiver/client.go b/receiver/dcgmreceiver/client.go index b10283194..d141a76c3 100644 --- a/receiver/dcgmreceiver/client.go +++ b/receiver/dcgmreceiver/client.go @@ -166,36 +166,72 @@ func createDeviceGroup(logger *zap.Logger, deviceIndices []uint) (dcgm.GroupHand func discoverRequestedFieldIDs(config *Config) []dcgm.Short { requestedFieldIDs := []dcgm.Short{} - if config.Metrics.DcgmGpuUtilization.Enabled { - requestedFieldIDs = append(requestedFieldIDs, dcgm.DCGM_FI["DCGM_FI_DEV_GPU_UTIL"]) + if config.Metrics.GpuDcgmUtilization.Enabled { + requestedFieldIDs = append(requestedFieldIDs, dcgm.DCGM_FI["DCGM_FI_PROF_GR_ENGINE_ACTIVE"]) + requestedFieldIDs = append(requestedFieldIDs, dcgm.DCGM_FI["DCGM_FI_DEV_GPU_UTIL"]) // fallback } - if config.Metrics.DcgmGpuMemoryBytesUsed.Enabled { - requestedFieldIDs = append(requestedFieldIDs, dcgm.DCGM_FI["DCGM_FI_DEV_FB_USED"]) - requestedFieldIDs = append(requestedFieldIDs, dcgm.DCGM_FI["DCGM_FI_DEV_FB_FREE"]) - } - if config.Metrics.DcgmGpuProfilingSmUtilization.Enabled { + if config.Metrics.GpuDcgmSmUtilization.Enabled { requestedFieldIDs = append(requestedFieldIDs, dcgm.DCGM_FI["DCGM_FI_PROF_SM_ACTIVE"]) } - if config.Metrics.DcgmGpuProfilingSmOccupancy.Enabled { + if config.Metrics.GpuDcgmSmOccupancy.Enabled { requestedFieldIDs = append(requestedFieldIDs, dcgm.DCGM_FI["DCGM_FI_PROF_SM_OCCUPANCY"]) } - if 
config.Metrics.DcgmGpuProfilingPipeUtilization.Enabled { + if config.Metrics.GpuDcgmPipeUtilization.Enabled { requestedFieldIDs = append(requestedFieldIDs, dcgm.DCGM_FI["DCGM_FI_PROF_PIPE_TENSOR_ACTIVE"]) requestedFieldIDs = append(requestedFieldIDs, dcgm.DCGM_FI["DCGM_FI_PROF_PIPE_FP64_ACTIVE"]) requestedFieldIDs = append(requestedFieldIDs, dcgm.DCGM_FI["DCGM_FI_PROF_PIPE_FP32_ACTIVE"]) requestedFieldIDs = append(requestedFieldIDs, dcgm.DCGM_FI["DCGM_FI_PROF_PIPE_FP16_ACTIVE"]) } - if config.Metrics.DcgmGpuProfilingDramUtilization.Enabled { + if config.Metrics.GpuDcgmCodecEncoderUtilization.Enabled { + requestedFieldIDs = append(requestedFieldIDs, dcgm.DCGM_FI["DCGM_FI_DEV_ENC_UTIL"]) + } + if config.Metrics.GpuDcgmCodecDecoderUtilization.Enabled { + requestedFieldIDs = append(requestedFieldIDs, dcgm.DCGM_FI["DCGM_FI_DEV_DEC_UTIL"]) + } + if config.Metrics.GpuDcgmMemoryBytesUsed.Enabled { + requestedFieldIDs = append(requestedFieldIDs, dcgm.DCGM_FI["DCGM_FI_DEV_FB_FREE"]) + requestedFieldIDs = append(requestedFieldIDs, dcgm.DCGM_FI["DCGM_FI_DEV_FB_USED"]) + requestedFieldIDs = append(requestedFieldIDs, dcgm.DCGM_FI["DCGM_FI_DEV_FB_RESERVED"]) + } + if config.Metrics.GpuDcgmMemoryBandwidthUtilization.Enabled { requestedFieldIDs = append(requestedFieldIDs, dcgm.DCGM_FI["DCGM_FI_PROF_DRAM_ACTIVE"]) + requestedFieldIDs = append(requestedFieldIDs, dcgm.DCGM_FI["DCGM_FI_DEV_MEM_COPY_UTIL"]) // fallback } - if config.Metrics.DcgmGpuProfilingPcieTrafficRate.Enabled { + if config.Metrics.GpuDcgmPcieTraffic.Enabled { requestedFieldIDs = append(requestedFieldIDs, dcgm.DCGM_FI["DCGM_FI_PROF_PCIE_TX_BYTES"]) requestedFieldIDs = append(requestedFieldIDs, dcgm.DCGM_FI["DCGM_FI_PROF_PCIE_RX_BYTES"]) } - if config.Metrics.DcgmGpuProfilingNvlinkTrafficRate.Enabled { + if config.Metrics.GpuDcgmNvlinkTraffic.Enabled { requestedFieldIDs = append(requestedFieldIDs, dcgm.DCGM_FI["DCGM_FI_PROF_NVLINK_TX_BYTES"]) requestedFieldIDs = append(requestedFieldIDs, 
dcgm.DCGM_FI["DCGM_FI_PROF_NVLINK_RX_BYTES"]) } + if config.Metrics.GpuDcgmEnergyConsumption.Enabled { + requestedFieldIDs = append(requestedFieldIDs, dcgm.DCGM_FI["DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION"]) + requestedFieldIDs = append(requestedFieldIDs, dcgm.DCGM_FI["DCGM_FI_DEV_POWER_USAGE"]) // fallback + } + if config.Metrics.GpuDcgmTemperature.Enabled { + requestedFieldIDs = append(requestedFieldIDs, dcgm.DCGM_FI["DCGM_FI_DEV_GPU_TEMP"]) + } + if config.Metrics.GpuDcgmClockFrequency.Enabled { + requestedFieldIDs = append(requestedFieldIDs, dcgm.DCGM_FI["DCGM_FI_DEV_SM_CLOCK"]) + } + if config.Metrics.GpuDcgmClockThrottleDurationTime.Enabled { + requestedFieldIDs = append(requestedFieldIDs, dcgm.DCGM_FI["DCGM_FI_DEV_POWER_VIOLATION"]) + requestedFieldIDs = append(requestedFieldIDs, dcgm.DCGM_FI["DCGM_FI_DEV_THERMAL_VIOLATION"]) + requestedFieldIDs = append(requestedFieldIDs, dcgm.DCGM_FI["DCGM_FI_DEV_SYNC_BOOST_VIOLATION"]) + requestedFieldIDs = append(requestedFieldIDs, dcgm.DCGM_FI["DCGM_FI_DEV_BOARD_LIMIT_VIOLATION"]) + requestedFieldIDs = append(requestedFieldIDs, dcgm.DCGM_FI["DCGM_FI_DEV_LOW_UTIL_VIOLATION"]) + requestedFieldIDs = append(requestedFieldIDs, dcgm.DCGM_FI["DCGM_FI_DEV_RELIABILITY_VIOLATION"]) + requestedFieldIDs = append(requestedFieldIDs, dcgm.DCGM_FI["DCGM_FI_DEV_TOTAL_APP_CLOCKS_VIOLATION"]) + requestedFieldIDs = append(requestedFieldIDs, dcgm.DCGM_FI["DCGM_FI_DEV_TOTAL_BASE_CLOCKS_VIOLATION"]) + } + if config.Metrics.GpuDcgmEccErrors.Enabled { + requestedFieldIDs = append(requestedFieldIDs, dcgm.DCGM_FI["DCGM_FI_DEV_ECC_SBE_VOL_TOTAL"]) + requestedFieldIDs = append(requestedFieldIDs, dcgm.DCGM_FI["DCGM_FI_DEV_ECC_DBE_VOL_TOTAL"]) + } + if config.Metrics.GpuDcgmXidErrors.Enabled { + //requestedFieldIDs = append(requestedFieldIDs, dcgm.DCGM_FI[""]) + } return requestedFieldIDs } diff --git a/receiver/dcgmreceiver/client_gpu_test.go b/receiver/dcgmreceiver/client_gpu_test.go index b451eaf58..3da83501c 100644 --- 
a/receiver/dcgmreceiver/client_gpu_test.go +++ b/receiver/dcgmreceiver/client_gpu_test.go @@ -145,9 +145,13 @@ func TestCollectGpuProfilingMetrics(t *testing.T) { for gpuIndex, metrics := range deviceMetrics { for _, metric := range metrics { switch metric.name { - case "DCGM_FI_PROF_PIPE_TENSOR_ACTIVE": + case "DCGM_FI_PROF_GR_ENGINE_ACTIVE": fallthrough - case "DCGM_FI_PROF_DRAM_ACTIVE": + case "DCGM_FI_PROF_SM_ACTIVE": + fallthrough + case "DCGM_FI_PROF_SM_OCCUPANCY": + fallthrough + case "DCGM_FI_PROF_PIPE_TENSOR_ACTIVE": fallthrough case "DCGM_FI_PROF_PIPE_FP64_ACTIVE": fallthrough @@ -155,17 +159,23 @@ func TestCollectGpuProfilingMetrics(t *testing.T) { fallthrough case "DCGM_FI_PROF_PIPE_FP16_ACTIVE": fallthrough - case "DCGM_FI_PROF_SM_OCCUPANCY": - fallthrough - case "DCGM_FI_PROF_SM_ACTIVE": + case "DCGM_FI_PROF_DRAM_ACTIVE": assert.GreaterOrEqual(t, metric.asFloat64(), float64(0.0)) assert.LessOrEqual(t, metric.asFloat64(), float64(1.0)) case "DCGM_FI_DEV_GPU_UTIL": + fallthrough + case "DCGM_FI_DEV_MEM_COPY_UTIL": + fallthrough + case "DCGM_FI_DEV_ENC_UTIL": + fallthrough + case "DCGM_FI_DEV_DEC_UTIL": assert.GreaterOrEqual(t, metric.asInt64(), int64(0)) assert.LessOrEqual(t, metric.asInt64(), int64(100)) case "DCGM_FI_DEV_FB_FREE": fallthrough case "DCGM_FI_DEV_FB_USED": + fallthrough + case "DCGM_FI_DEV_FB_RESERVED": // arbitrary max of 10 TiB assert.GreaterOrEqual(t, metric.asInt64(), int64(0)) assert.LessOrEqual(t, metric.asInt64(), int64(10485760)) @@ -179,6 +189,41 @@ func TestCollectGpuProfilingMetrics(t *testing.T) { // arbitrary max of 10 TiB/sec assert.GreaterOrEqual(t, metric.asInt64(), int64(0)) assert.LessOrEqual(t, metric.asInt64(), int64(10995116277760)) + case "DCGM_FI_DEV_BOARD_LIMIT_VIOLATION": + fallthrough + case "DCGM_FI_DEV_LOW_UTIL_VIOLATION": + fallthrough + case "DCGM_FI_DEV_POWER_VIOLATION": + fallthrough + case "DCGM_FI_DEV_RELIABILITY_VIOLATION": + fallthrough + case "DCGM_FI_DEV_SYNC_BOOST_VIOLATION": + fallthrough + case 
"DCGM_FI_DEV_THERMAL_VIOLATION": + fallthrough + case "DCGM_FI_DEV_TOTAL_APP_CLOCKS_VIOLATION": + fallthrough + case "DCGM_FI_DEV_TOTAL_BASE_CLOCKS_VIOLATION": + assert.GreaterOrEqual(t, metric.asInt64(), int64(0)) + assert.LessOrEqual(t, metric.asInt64(), time.Now().UnixMicro()) + case "DCGM_FI_DEV_ECC_DBE_VOL_TOTAL": + fallthrough + case "DCGM_FI_DEV_ECC_SBE_VOL_TOTAL": + // arbitrary max of 100000000 errors + assert.GreaterOrEqual(t, metric.asInt64(), int64(0)) + assert.LessOrEqual(t, metric.asInt64(), int64(100000000)) + case "DCGM_FI_DEV_GPU_TEMP": + // arbitrary max of 100000 °C + assert.GreaterOrEqual(t, metric.asFloat64(), float64(0.0)) + assert.LessOrEqual(t, metric.asFloat64(), float64(100000.0)) + case "DCGM_FI_DEV_SM_CLOCK": + // arbitrary max of 100000 MHz + assert.GreaterOrEqual(t, metric.asFloat64(), float64(0.0)) + assert.LessOrEqual(t, metric.asFloat64(), float64(100000.0)) + case "DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION": + // TODO + case "DCGM_FI_DEV_POWER_USAGE": + // TODO default: t.Errorf("Unexpected metric '%s'", metric.name) } diff --git a/receiver/dcgmreceiver/scraper.go b/receiver/dcgmreceiver/scraper.go index e8db2904b..3b1c7c9e5 100644 --- a/receiver/dcgmreceiver/scraper.go +++ b/receiver/dcgmreceiver/scraper.go @@ -98,40 +98,100 @@ func (s *dcgmScraper) scrape(_ context.Context) (pmetric.Metrics, error) { gpuResource := rb.Emit() for _, metric := range metrics { switch metric.name { - case "DCGM_FI_DEV_GPU_UTIL": - gpuUtil := float64(metric.asInt64()) / 100.0 /* normalize */ - s.mb.RecordDcgmGpuUtilizationDataPoint(now, gpuUtil) - case "DCGM_FI_DEV_FB_USED": - bytesUsed := 1e6 * metric.asInt64() /* MB to B */ - s.mb.RecordDcgmGpuMemoryBytesUsedDataPoint(now, bytesUsed, metadata.AttributeMemoryStateUsed) - case "DCGM_FI_DEV_FB_FREE": - bytesFree := 1e6 * metric.asInt64() /* MB to B */ - s.mb.RecordDcgmGpuMemoryBytesUsedDataPoint(now, bytesFree, metadata.AttributeMemoryStateFree) + case "DCGM_FI_PROF_GR_ENGINE_ACTIVE": + 
s.mb.RecordGpuDcgmUtilizationDataPoint(now, metric.asFloat64()) + // TODO: fallback + //case "DCGM_FI_DEV_GPU_UTIL": + // gpuUtil := float64(metric.asInt64()) / 100.0 /* normalize */ + // s.mb.RecordGpuDcgmUtilizationDataPoint(now, gpuUtil) case "DCGM_FI_PROF_SM_ACTIVE": - s.mb.RecordDcgmGpuProfilingSmUtilizationDataPoint(now, metric.asFloat64()) + s.mb.RecordGpuDcgmSmUtilizationDataPoint(now, metric.asFloat64()) case "DCGM_FI_PROF_SM_OCCUPANCY": - s.mb.RecordDcgmGpuProfilingSmOccupancyDataPoint(now, metric.asFloat64()) + s.mb.RecordGpuDcgmSmOccupancyDataPoint(now, metric.asFloat64()) case "DCGM_FI_PROF_PIPE_TENSOR_ACTIVE": - s.mb.RecordDcgmGpuProfilingPipeUtilizationDataPoint(now, metric.asFloat64(), metadata.AttributePipeTensor) + s.mb.RecordGpuDcgmPipeUtilizationDataPoint(now, metric.asFloat64(), metadata.AttributePipeTensor) case "DCGM_FI_PROF_PIPE_FP64_ACTIVE": - s.mb.RecordDcgmGpuProfilingPipeUtilizationDataPoint(now, metric.asFloat64(), metadata.AttributePipeFp64) + s.mb.RecordGpuDcgmPipeUtilizationDataPoint(now, metric.asFloat64(), metadata.AttributePipeFp64) case "DCGM_FI_PROF_PIPE_FP32_ACTIVE": - s.mb.RecordDcgmGpuProfilingPipeUtilizationDataPoint(now, metric.asFloat64(), metadata.AttributePipeFp32) + s.mb.RecordGpuDcgmPipeUtilizationDataPoint(now, metric.asFloat64(), metadata.AttributePipeFp32) case "DCGM_FI_PROF_PIPE_FP16_ACTIVE": - s.mb.RecordDcgmGpuProfilingPipeUtilizationDataPoint(now, metric.asFloat64(), metadata.AttributePipeFp16) + s.mb.RecordGpuDcgmPipeUtilizationDataPoint(now, metric.asFloat64(), metadata.AttributePipeFp16) + case "DCGM_FI_DEV_ENC_UTIL": + encUtil := float64(metric.asInt64()) / 100.0 /* normalize */ + s.mb.RecordGpuDcgmCodecEncoderUtilizationDataPoint(now, encUtil) + case "DCGM_FI_DEV_DEC_UTIL": + decUtil := float64(metric.asInt64()) / 100.0 /* normalize */ + s.mb.RecordGpuDcgmCodecDecoderUtilizationDataPoint(now, decUtil) + case "DCGM_FI_DEV_FB_FREE": + bytesFree := 1e6 * metric.asInt64() /* MBy to By */ + 
s.mb.RecordGpuDcgmMemoryBytesUsedDataPoint(now, bytesFree, metadata.AttributeMemoryStateFree) + case "DCGM_FI_DEV_FB_USED": + bytesUsed := 1e6 * metric.asInt64() /* MBy to By */ + s.mb.RecordGpuDcgmMemoryBytesUsedDataPoint(now, bytesUsed, metadata.AttributeMemoryStateUsed) + case "DCGM_FI_DEV_FB_RESERVED": + bytesFree := 1e6 * metric.asInt64() /* MBy to By */ + s.mb.RecordGpuDcgmMemoryBytesUsedDataPoint(now, bytesFree, metadata.AttributeMemoryStateReserved) case "DCGM_FI_PROF_DRAM_ACTIVE": - s.mb.RecordDcgmGpuProfilingDramUtilizationDataPoint(now, metric.asFloat64()) + s.mb.RecordGpuDcgmMemoryBandwidthUtilizationDataPoint(now, metric.asFloat64()) + // TODO: fallback + //case "DCGM_FI_DEV_MEM_COPY_UTIL": + // memCopyUtil := float64(metric.asInt64()) / 100.0 /* normalize */ + // s.mb.RecordGpuDcgmMemoryBandwidthUtilizationDataPoint(now, memCopyUtil) case "DCGM_FI_PROF_PCIE_TX_BYTES": - /* DCGM already returns these as bytes/sec despite the name */ - s.mb.RecordDcgmGpuProfilingPcieTrafficRateDataPoint(now, metric.asInt64(), metadata.AttributeDirectionTx) + pcieTx := int64(float64(metric.asInt64()) * (s.config.CollectionInterval.Seconds())) /* rate to delta */ + s.mb.RecordGpuDcgmPcieTrafficDataPoint(now, pcieTx, metadata.AttributeDirectionTx) case "DCGM_FI_PROF_PCIE_RX_BYTES": - s.mb.RecordDcgmGpuProfilingPcieTrafficRateDataPoint(now, metric.asInt64(), metadata.AttributeDirectionRx) + pcieRx := int64(float64(metric.asInt64()) * (s.config.CollectionInterval.Seconds())) /* rate to delta */ + s.mb.RecordGpuDcgmPcieTrafficDataPoint(now, pcieRx, metadata.AttributeDirectionRx) case "DCGM_FI_PROF_NVLINK_TX_BYTES": - s.mb.RecordDcgmGpuProfilingNvlinkTrafficRateDataPoint(now, metric.asInt64(), metadata.AttributeDirectionTx) + nvlinkTx := int64(float64(metric.asInt64()) * (s.config.CollectionInterval.Seconds())) /* rate to delta */ + s.mb.RecordGpuDcgmNvlinkTrafficDataPoint(now, nvlinkTx, metadata.AttributeDirectionTx) case "DCGM_FI_PROF_NVLINK_RX_BYTES": - 
s.mb.RecordDcgmGpuProfilingNvlinkTrafficRateDataPoint(now, metric.asInt64(), metadata.AttributeDirectionRx) + nvlinkRx := int64(float64(metric.asInt64()) * (s.config.CollectionInterval.Seconds())) /* rate to delta */ + s.mb.RecordGpuDcgmNvlinkTrafficDataPoint(now, nvlinkRx, metadata.AttributeDirectionRx) + case "DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION": + s.mb.RecordGpuDcgmEnergyConsumptionDataPoint(now, metric.asFloat64()) + // TODO: fallback + //case "DCGM_FI_DEV_POWER_USAGE": + // powerUsage := metric.asFloat64() * (s.config.CollectionInterval.Seconds()) /* rate to delta */ // TODO: cumulative + // s.mb.RecordGpuDcgmEnergyConsumptionDataPoint(now, powerUsage) + case "DCGM_FI_DEV_GPU_TEMP": + s.mb.RecordGpuDcgmTemperatureDataPoint(now, metric.asFloat64()) + case "DCGM_FI_DEV_SM_CLOCK": + clockFreq := 1e6 * metric.asFloat64() /* MHz to Hz */ + s.mb.RecordGpuDcgmClockFrequencyDataPoint(now, clockFreq) + case "DCGM_FI_DEV_POWER_VIOLATION": + violationTime := float64(metric.asInt64()) / 1e6 /* us to s */ + s.mb.RecordGpuDcgmClockThrottleDurationTimeDataPoint(now, violationTime, metadata.AttributeViolationPower) + case "DCGM_FI_DEV_THERMAL_VIOLATION": + violationTime := float64(metric.asInt64()) / 1e6 /* us to s */ + s.mb.RecordGpuDcgmClockThrottleDurationTimeDataPoint(now, violationTime, metadata.AttributeViolationThermal) + case "DCGM_FI_DEV_SYNC_BOOST_VIOLATION": + violationTime := float64(metric.asInt64()) / 1e6 /* us to s */ + s.mb.RecordGpuDcgmClockThrottleDurationTimeDataPoint(now, violationTime, metadata.AttributeViolationSyncBoost) + case "DCGM_FI_DEV_BOARD_LIMIT_VIOLATION": + violationTime := float64(metric.asInt64()) / 1e6 /* us to s */ + s.mb.RecordGpuDcgmClockThrottleDurationTimeDataPoint(now, violationTime, metadata.AttributeViolationBoardLimit) + case "DCGM_FI_DEV_LOW_UTIL_VIOLATION": + violationTime := float64(metric.asInt64()) / 1e6 /* us to s */ + s.mb.RecordGpuDcgmClockThrottleDurationTimeDataPoint(now, violationTime, 
metadata.AttributeViolationLowUtil) + case "DCGM_FI_DEV_RELIABILITY_VIOLATION": + violationTime := float64(metric.asInt64()) / 1e6 /* us to s */ + s.mb.RecordGpuDcgmClockThrottleDurationTimeDataPoint(now, violationTime, metadata.AttributeViolationReliability) + case "DCGM_FI_DEV_TOTAL_APP_CLOCKS_VIOLATION": + violationTime := float64(metric.asInt64()) / 1e6 /* us to s */ + s.mb.RecordGpuDcgmClockThrottleDurationTimeDataPoint(now, violationTime, metadata.AttributeViolationAppClock) + case "DCGM_FI_DEV_TOTAL_BASE_CLOCKS_VIOLATION": + violationTime := float64(metric.asInt64()) / 1e6 /* us to s */ + s.mb.RecordGpuDcgmClockThrottleDurationTimeDataPoint(now, violationTime, metadata.AttributeViolationBaseClock) + case "DCGM_FI_DEV_ECC_SBE_VOL_TOTAL": + s.mb.RecordGpuDcgmEccErrorsDataPoint(now, metric.asInt64(), metadata.AttributeErrorTypeSbe) + case "DCGM_FI_DEV_ECC_DBE_VOL_TOTAL": + s.mb.RecordGpuDcgmEccErrorsDataPoint(now, metric.asInt64(), metadata.AttributeErrorTypeDbe) } } + // TODO: XID errors. 
+ //s.mb.RecordGpuDcgmXidErrorsDataPoint(now, metric.asInt64(), xid) s.mb.EmitForResource(metadata.WithResource(gpuResource)) } diff --git a/receiver/dcgmreceiver/scraper_gpu_test.go b/receiver/dcgmreceiver/scraper_gpu_test.go index 2cd5845b7..0fd2b0ffc 100644 --- a/receiver/dcgmreceiver/scraper_gpu_test.go +++ b/receiver/dcgmreceiver/scraper_gpu_test.go @@ -96,28 +96,52 @@ func TestScrapeWithEmptyMetricsConfig(t *testing.T) { Endpoint: defaultEndpoint, }, Metrics: metadata.MetricsConfig{ - DcgmGpuMemoryBytesUsed: metadata.MetricConfig{ + GpuDcgmClockFrequency: metadata.MetricConfig{ Enabled: false, }, - DcgmGpuProfilingDramUtilization: metadata.MetricConfig{ + GpuDcgmClockThrottleDurationTime: metadata.MetricConfig{ Enabled: false, }, - DcgmGpuProfilingNvlinkTrafficRate: metadata.MetricConfig{ + GpuDcgmCodecDecoderUtilization: metadata.MetricConfig{ Enabled: false, }, - DcgmGpuProfilingPcieTrafficRate: metadata.MetricConfig{ + GpuDcgmCodecEncoderUtilization: metadata.MetricConfig{ Enabled: false, }, - DcgmGpuProfilingPipeUtilization: metadata.MetricConfig{ + GpuDcgmEccErrors: metadata.MetricConfig{ Enabled: false, }, - DcgmGpuProfilingSmOccupancy: metadata.MetricConfig{ + GpuDcgmEnergyConsumption: metadata.MetricConfig{ Enabled: false, }, - DcgmGpuProfilingSmUtilization: metadata.MetricConfig{ + GpuDcgmMemoryBandwidthUtilization: metadata.MetricConfig{ Enabled: false, }, - DcgmGpuUtilization: metadata.MetricConfig{ + GpuDcgmMemoryBytesUsed: metadata.MetricConfig{ + Enabled: false, + }, + GpuDcgmNvlinkTraffic: metadata.MetricConfig{ + Enabled: false, + }, + GpuDcgmPcieTraffic: metadata.MetricConfig{ + Enabled: false, + }, + GpuDcgmPipeUtilization: metadata.MetricConfig{ + Enabled: false, + }, + GpuDcgmSmOccupancy: metadata.MetricConfig{ + Enabled: false, + }, + GpuDcgmSmUtilization: metadata.MetricConfig{ + Enabled: false, + }, + GpuDcgmTemperature: metadata.MetricConfig{ + Enabled: false, + }, + GpuDcgmUtilization: metadata.MetricConfig{ + Enabled: false, + }, + 
GpuDcgmXidErrors: metadata.MetricConfig{ Enabled: false, }, }, @@ -178,15 +202,23 @@ func TestScrapeOnProfilingPaused(t *testing.T) { assert.NoError(t, err) expectedMetrics := []string{ - "dcgm.gpu.utilization", - "dcgm.gpu.memory.bytes_used", + //TODO "gpu.dcgm.utilization", + "gpu.dcgm.codec.decoder.utilization", + "gpu.dcgm.codec.encoder.utilization", + "gpu.dcgm.memory.bytes_used", + //TODO "gpu.dcgm.memory.bandwidth_utilization", + //TODO "gpu.dcgm.energy_consumption", + "gpu.dcgm.temperature", + "gpu.dcgm.clock.frequency", + "gpu.dcgm.clock.throttle_duration.time", + "gpu.dcgm.ecc_errors", } ilms := metrics.ResourceMetrics().At(0).ScopeMetrics() require.Equal(t, 1, ilms.Len()) ms := ilms.At(0).Metrics() - require.Equal(t, len(expectedMetrics), ms.Len()) + require.LessOrEqual(t, len(expectedMetrics), ms.Len()) metricWasSeen := make(map[string]bool) for i := 0; i < ms.Len(); i++ { @@ -205,24 +237,47 @@ func loadExpectedScraperMetrics(t *testing.T, model string) map[string]int { t.Helper() expectedMetrics := make(map[string]int) receiverMetricNameToScraperMetricName := map[string]string{ - "DCGM_FI_DEV_GPU_UTIL": "dcgm.gpu.utilization", - "DCGM_FI_DEV_FB_USED": "dcgm.gpu.memory.bytes_used", - "DCGM_FI_DEV_FB_FREE": "dcgm.gpu.memory.bytes_used", - "DCGM_FI_PROF_SM_ACTIVE": "dcgm.gpu.profiling.sm_utilization", - "DCGM_FI_PROF_SM_OCCUPANCY": "dcgm.gpu.profiling.sm_occupancy", - "DCGM_FI_PROF_DRAM_ACTIVE": "dcgm.gpu.profiling.dram_utilization", - "DCGM_FI_PROF_PIPE_TENSOR_ACTIVE": "dcgm.gpu.profiling.pipe_utilization", - "DCGM_FI_PROF_PIPE_FP64_ACTIVE": "dcgm.gpu.profiling.pipe_utilization", - "DCGM_FI_PROF_PIPE_FP32_ACTIVE": "dcgm.gpu.profiling.pipe_utilization", - "DCGM_FI_PROF_PIPE_FP16_ACTIVE": "dcgm.gpu.profiling.pipe_utilization", - "DCGM_FI_PROF_PCIE_TX_BYTES": "dcgm.gpu.profiling.pcie_traffic_rate", - "DCGM_FI_PROF_PCIE_RX_BYTES": "dcgm.gpu.profiling.pcie_traffic_rate", - "DCGM_FI_PROF_NVLINK_TX_BYTES": "dcgm.gpu.profiling.nvlink_traffic_rate", - 
"DCGM_FI_PROF_NVLINK_RX_BYTES": "dcgm.gpu.profiling.nvlink_traffic_rate", + "DCGM_FI_PROF_GR_ENGINE_ACTIVE": "gpu.dcgm.utilization", + //"DCGM_FI_DEV_GPU_UTIL": "gpu.dcgm.utilization", + "DCGM_FI_PROF_SM_ACTIVE": "gpu.dcgm.sm.utilization", + "DCGM_FI_PROF_SM_OCCUPANCY": "gpu.dcgm.sm.occupancy", + "DCGM_FI_PROF_PIPE_TENSOR_ACTIVE": "gpu.dcgm.pipe.utilization", + "DCGM_FI_PROF_PIPE_FP64_ACTIVE": "gpu.dcgm.pipe.utilization", + "DCGM_FI_PROF_PIPE_FP32_ACTIVE": "gpu.dcgm.pipe.utilization", + "DCGM_FI_PROF_PIPE_FP16_ACTIVE": "gpu.dcgm.pipe.utilization", + "DCGM_FI_DEV_ENC_UTIL": "gpu.dcgm.codec.encoder.utilization", + "DCGM_FI_DEV_DEC_UTIL": "gpu.dcgm.codec.decoder.utilization", + "DCGM_FI_DEV_FB_FREE": "gpu.dcgm.memory.bytes_used", + "DCGM_FI_DEV_FB_USED": "gpu.dcgm.memory.bytes_used", + "DCGM_FI_DEV_FB_RESERVED": "gpu.dcgm.memory.bytes_used", + "DCGM_FI_PROF_DRAM_ACTIVE": "gpu.dcgm.memory.bandwidth_utilization", + //"DCGM_FI_DEV_MEM_COPY_UTIL": "gpu.dcgm.memory.bandwidth_utilization", + "DCGM_FI_PROF_PCIE_TX_BYTES": "gpu.dcgm.pcie.traffic", + "DCGM_FI_PROF_PCIE_RX_BYTES": "gpu.dcgm.pcie.traffic", + "DCGM_FI_PROF_NVLINK_TX_BYTES": "gpu.dcgm.nvlink.traffic", + "DCGM_FI_PROF_NVLINK_RX_BYTES": "gpu.dcgm.nvlink.traffic", + "DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION": "gpu.dcgm.energy_consumption", + //"DCGM_FI_DEV_POWER_USAGE": "gpu.dcgm.energy_consumption", + "DCGM_FI_DEV_GPU_TEMP": "gpu.dcgm.temperature", + "DCGM_FI_DEV_SM_CLOCK": "gpu.dcgm.clock.frequency", + "DCGM_FI_DEV_POWER_VIOLATION": "gpu.dcgm.clock.throttle_duration.time", + "DCGM_FI_DEV_THERMAL_VIOLATION": "gpu.dcgm.clock.throttle_duration.time", + "DCGM_FI_DEV_SYNC_BOOST_VIOLATION": "gpu.dcgm.clock.throttle_duration.time", + "DCGM_FI_DEV_BOARD_LIMIT_VIOLATION": "gpu.dcgm.clock.throttle_duration.time", + "DCGM_FI_DEV_LOW_UTIL_VIOLATION": "gpu.dcgm.clock.throttle_duration.time", + "DCGM_FI_DEV_RELIABILITY_VIOLATION": "gpu.dcgm.clock.throttle_duration.time", + "DCGM_FI_DEV_TOTAL_APP_CLOCKS_VIOLATION": 
"gpu.dcgm.clock.throttle_duration.time", + "DCGM_FI_DEV_TOTAL_BASE_CLOCKS_VIOLATION": "gpu.dcgm.clock.throttle_duration.time", + "DCGM_FI_DEV_ECC_SBE_VOL_TOTAL": "gpu.dcgm.ecc_errors", + "DCGM_FI_DEV_ECC_DBE_VOL_TOTAL": "gpu.dcgm.ecc_errors", } expectedReceiverMetrics := LoadExpectedMetrics(t, model) for _, em := range expectedReceiverMetrics { - expectedMetrics[receiverMetricNameToScraperMetricName[em]] += 1 + scraperMetric := receiverMetricNameToScraperMetricName[em] + if scraperMetric != "" { + expectedMetrics[scraperMetric] += 1 + } + // TODO: fallbacks. } return expectedMetrics } @@ -250,29 +305,83 @@ func validateScraperResult(t *testing.T, metrics pmetric.Metrics, expectedMetric ms := ilms.At(0).Metrics() for i := 0; i < ms.Len(); i++ { m := ms.At(i) - dps := m.Gauge().DataPoints() + var dps pmetric.NumberDataPointSlice + switch m.Name() { + case "gpu.dcgm.utilization": + fallthrough + case "gpu.dcgm.sm.utilization": + fallthrough + case "gpu.dcgm.sm.occupancy": + fallthrough + case "gpu.dcgm.pipe.utilization": + fallthrough + case "gpu.dcgm.codec.encoder.utilization": + fallthrough + case "gpu.dcgm.codec.decoder.utilization": + fallthrough + case "gpu.dcgm.memory.bytes_used": + fallthrough + case "gpu.dcgm.memory.bandwidth_utilization": + fallthrough + case "gpu.dcgm.temperature": + fallthrough + case "gpu.dcgm.clock.frequency": + dps = m.Gauge().DataPoints() + case "gpu.dcgm.energy_consumption": + fallthrough + case "gpu.dcgm.clock.throttle_duration.time": + fallthrough + case "gpu.dcgm.pcie.traffic": + fallthrough + case "gpu.dcgm.nvlink.traffic": + fallthrough + case "gpu.dcgm.ecc_errors": + fallthrough + case "gpu.dcgm.xid_errors": + dps = m.Sum().DataPoints() + default: + t.Errorf("Unexpected metric %s", m.Name()) + } assert.LessOrEqual(t, expectedMetrics[m.Name()], dps.Len()) switch m.Name() { - case "dcgm.gpu.utilization": - case "dcgm.gpu.memory.bytes_used": + case "gpu.dcgm.utilization": + case "gpu.dcgm.sm.utilization": + case 
"gpu.dcgm.sm.occupancy": + case "gpu.dcgm.pipe.utilization": for j := 0; j < dps.Len(); j++ { - assert.Contains(t, dps.At(j).Attributes().AsRaw(), "memory_state") + assert.Contains(t, dps.At(j).Attributes().AsRaw(), "pipe") } - case "dcgm.gpu.profiling.sm_utilization": - case "dcgm.gpu.profiling.sm_occupancy": - case "dcgm.gpu.profiling.dram_utilization": - case "dcgm.gpu.profiling.pipe_utilization": + case "gpu.dcgm.codec.encoder.utilization": + case "gpu.dcgm.codec.decoder.utilization": + case "gpu.dcgm.memory.bytes_used": for j := 0; j < dps.Len(); j++ { - assert.Contains(t, dps.At(j).Attributes().AsRaw(), "pipe") + assert.Contains(t, dps.At(j).Attributes().AsRaw(), "memory_state") } - case "dcgm.gpu.profiling.pcie_traffic_rate": + case "gpu.dcgm.memory.bandwidth_utilization": + case "gpu.dcgm.pcie.traffic": fallthrough - case "dcgm.gpu.profiling.nvlink_traffic_rate": + case "gpu.dcgm.nvlink.traffic": for j := 0; j < dps.Len(); j++ { assert.Contains(t, dps.At(j).Attributes().AsRaw(), "direction") } + case "gpu.dcgm.energy_consumption": + case "gpu.dcgm.temperature": + case "gpu.dcgm.clock.frequency": + case "gpu.dcgm.clock.throttle_duration.time": + for j := 0; j < dps.Len(); j++ { + assert.Contains(t, dps.At(j).Attributes().AsRaw(), "violation") + } + case "gpu.dcgm.ecc_errors": + for j := 0; j < dps.Len(); j++ { + assert.Contains(t, dps.At(j).Attributes().AsRaw(), "error_type") + } + // TODO + //case "gpu.dcgm.xid_errors": + // for j := 0; j < dps.Len(); j++ { + // assert.Contains(t, dps.At(j).Attributes().AsRaw(), "xid") + // } default: t.Errorf("Unexpected metric %s", m.Name()) } diff --git a/receiver/dcgmreceiver/testdata/NVIDIA_A100-SXM4-40GB.yaml b/receiver/dcgmreceiver/testdata/NVIDIA_A100-SXM4-40GB.yaml index 46631337b..71585345e 100644 --- a/receiver/dcgmreceiver/testdata/NVIDIA_A100-SXM4-40GB.yaml +++ b/receiver/dcgmreceiver/testdata/NVIDIA_A100-SXM4-40GB.yaml @@ -1,16 +1,35 @@ model: NVIDIA A100-SXM4-40GB supported_fields: + - 
DCGM_FI_PROF_GR_ENGINE_ACTIVE - DCGM_FI_DEV_GPU_UTIL - - DCGM_FI_DEV_FB_USED - - DCGM_FI_DEV_FB_FREE - DCGM_FI_PROF_SM_ACTIVE - DCGM_FI_PROF_PIPE_TENSOR_ACTIVE - DCGM_FI_PROF_PIPE_FP64_ACTIVE - DCGM_FI_PROF_PIPE_FP32_ACTIVE - DCGM_FI_PROF_PIPE_FP16_ACTIVE + - DCGM_FI_DEV_ENC_UTIL + - DCGM_FI_DEV_DEC_UTIL + - DCGM_FI_DEV_FB_FREE + - DCGM_FI_DEV_FB_USED + - DCGM_FI_DEV_FB_RESERVED - DCGM_FI_PROF_DRAM_ACTIVE + - DCGM_FI_DEV_MEM_COPY_UTIL - DCGM_FI_PROF_PCIE_TX_BYTES - DCGM_FI_PROF_PCIE_RX_BYTES - DCGM_FI_PROF_NVLINK_TX_BYTES - DCGM_FI_PROF_NVLINK_RX_BYTES + - DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION + - DCGM_FI_DEV_POWER_USAGE + - DCGM_FI_DEV_GPU_TEMP + - DCGM_FI_DEV_SM_CLOCK + - DCGM_FI_DEV_POWER_VIOLATION + - DCGM_FI_DEV_THERMAL_VIOLATION + - DCGM_FI_DEV_SYNC_BOOST_VIOLATION + - DCGM_FI_DEV_BOARD_LIMIT_VIOLATION + - DCGM_FI_DEV_LOW_UTIL_VIOLATION + - DCGM_FI_DEV_RELIABILITY_VIOLATION + - DCGM_FI_DEV_TOTAL_APP_CLOCKS_VIOLATION + - DCGM_FI_DEV_TOTAL_BASE_CLOCKS_VIOLATION + - DCGM_FI_DEV_ECC_SBE_VOL_TOTAL + - DCGM_FI_DEV_ECC_DBE_VOL_TOTAL unsupported_fields: [] diff --git a/receiver/dcgmreceiver/testdata/NVIDIA_L4.yaml b/receiver/dcgmreceiver/testdata/NVIDIA_L4.yaml index 2a7083e80..faf59ac8a 100644 --- a/receiver/dcgmreceiver/testdata/NVIDIA_L4.yaml +++ b/receiver/dcgmreceiver/testdata/NVIDIA_L4.yaml @@ -1,16 +1,35 @@ model: NVIDIA L4 supported_fields: + - DCGM_FI_PROF_GR_ENGINE_ACTIVE - DCGM_FI_DEV_GPU_UTIL - - DCGM_FI_DEV_FB_USED - - DCGM_FI_DEV_FB_FREE - DCGM_FI_PROF_SM_ACTIVE - DCGM_FI_PROF_PIPE_TENSOR_ACTIVE - DCGM_FI_PROF_PIPE_FP32_ACTIVE - DCGM_FI_PROF_PIPE_FP16_ACTIVE + - DCGM_FI_DEV_ENC_UTIL + - DCGM_FI_DEV_DEC_UTIL + - DCGM_FI_DEV_FB_FREE + - DCGM_FI_DEV_FB_USED + - DCGM_FI_DEV_FB_RESERVED - DCGM_FI_PROF_DRAM_ACTIVE + - DCGM_FI_DEV_MEM_COPY_UTIL - DCGM_FI_PROF_PCIE_TX_BYTES - DCGM_FI_PROF_PCIE_RX_BYTES - DCGM_FI_PROF_NVLINK_TX_BYTES - DCGM_FI_PROF_NVLINK_RX_BYTES + - DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION + - DCGM_FI_DEV_POWER_USAGE + - DCGM_FI_DEV_GPU_TEMP + - 
DCGM_FI_DEV_SM_CLOCK + - DCGM_FI_DEV_POWER_VIOLATION + - DCGM_FI_DEV_THERMAL_VIOLATION + - DCGM_FI_DEV_SYNC_BOOST_VIOLATION + - DCGM_FI_DEV_BOARD_LIMIT_VIOLATION + - DCGM_FI_DEV_LOW_UTIL_VIOLATION + - DCGM_FI_DEV_RELIABILITY_VIOLATION + - DCGM_FI_DEV_TOTAL_APP_CLOCKS_VIOLATION + - DCGM_FI_DEV_TOTAL_BASE_CLOCKS_VIOLATION + - DCGM_FI_DEV_ECC_SBE_VOL_TOTAL + - DCGM_FI_DEV_ECC_DBE_VOL_TOTAL unsupported_fields: - DCGM_FI_PROF_PIPE_FP64_ACTIVE diff --git a/receiver/dcgmreceiver/testdata/Tesla_P100-PCIE-16GB.yaml b/receiver/dcgmreceiver/testdata/Tesla_P100-PCIE-16GB.yaml index 296efe4d2..b7c168027 100644 --- a/receiver/dcgmreceiver/testdata/Tesla_P100-PCIE-16GB.yaml +++ b/receiver/dcgmreceiver/testdata/Tesla_P100-PCIE-16GB.yaml @@ -1,9 +1,28 @@ model: Tesla P100-PCIE-16GB supported_fields: - DCGM_FI_DEV_GPU_UTIL - - DCGM_FI_DEV_FB_USED + - DCGM_FI_DEV_ENC_UTIL + - DCGM_FI_DEV_DEC_UTIL - DCGM_FI_DEV_FB_FREE + - DCGM_FI_DEV_FB_USED + - DCGM_FI_DEV_FB_RESERVED + - DCGM_FI_DEV_MEM_COPY_UTIL + - DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION + - DCGM_FI_DEV_POWER_USAGE + - DCGM_FI_DEV_GPU_TEMP + - DCGM_FI_DEV_SM_CLOCK + - DCGM_FI_DEV_POWER_VIOLATION + - DCGM_FI_DEV_THERMAL_VIOLATION + - DCGM_FI_DEV_SYNC_BOOST_VIOLATION + - DCGM_FI_DEV_BOARD_LIMIT_VIOLATION + - DCGM_FI_DEV_LOW_UTIL_VIOLATION + - DCGM_FI_DEV_RELIABILITY_VIOLATION + - DCGM_FI_DEV_TOTAL_APP_CLOCKS_VIOLATION + - DCGM_FI_DEV_TOTAL_BASE_CLOCKS_VIOLATION + - DCGM_FI_DEV_ECC_SBE_VOL_TOTAL + - DCGM_FI_DEV_ECC_DBE_VOL_TOTAL unsupported_fields: + - DCGM_FI_PROF_GR_ENGINE_ACTIVE - DCGM_FI_PROF_SM_ACTIVE - DCGM_FI_PROF_PIPE_TENSOR_ACTIVE - DCGM_FI_PROF_PIPE_FP64_ACTIVE diff --git a/receiver/dcgmreceiver/testdata/Tesla_P4.yaml b/receiver/dcgmreceiver/testdata/Tesla_P4.yaml index d9f715dde..251ec04fa 100644 --- a/receiver/dcgmreceiver/testdata/Tesla_P4.yaml +++ b/receiver/dcgmreceiver/testdata/Tesla_P4.yaml @@ -1,9 +1,28 @@ model: Tesla P4 supported_fields: - DCGM_FI_DEV_GPU_UTIL - - DCGM_FI_DEV_FB_USED + - DCGM_FI_DEV_ENC_UTIL + - 
DCGM_FI_DEV_DEC_UTIL - DCGM_FI_DEV_FB_FREE + - DCGM_FI_DEV_FB_USED + - DCGM_FI_DEV_FB_RESERVED + - DCGM_FI_DEV_MEM_COPY_UTIL + - DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION + - DCGM_FI_DEV_POWER_USAGE + - DCGM_FI_DEV_GPU_TEMP + - DCGM_FI_DEV_SM_CLOCK + - DCGM_FI_DEV_POWER_VIOLATION + - DCGM_FI_DEV_THERMAL_VIOLATION + - DCGM_FI_DEV_SYNC_BOOST_VIOLATION + - DCGM_FI_DEV_BOARD_LIMIT_VIOLATION + - DCGM_FI_DEV_LOW_UTIL_VIOLATION + - DCGM_FI_DEV_RELIABILITY_VIOLATION + - DCGM_FI_DEV_TOTAL_APP_CLOCKS_VIOLATION + - DCGM_FI_DEV_TOTAL_BASE_CLOCKS_VIOLATION + - DCGM_FI_DEV_ECC_SBE_VOL_TOTAL + - DCGM_FI_DEV_ECC_DBE_VOL_TOTAL unsupported_fields: + - DCGM_FI_PROF_GR_ENGINE_ACTIVE - DCGM_FI_PROF_SM_ACTIVE - DCGM_FI_PROF_PIPE_TENSOR_ACTIVE - DCGM_FI_PROF_PIPE_FP64_ACTIVE diff --git a/receiver/dcgmreceiver/testdata/Tesla_T4.yaml b/receiver/dcgmreceiver/testdata/Tesla_T4.yaml index 29978f7e5..3ab8dba88 100644 --- a/receiver/dcgmreceiver/testdata/Tesla_T4.yaml +++ b/receiver/dcgmreceiver/testdata/Tesla_T4.yaml @@ -1,16 +1,35 @@ model: Tesla T4 supported_fields: + - DCGM_FI_PROF_GR_ENGINE_ACTIVE - DCGM_FI_DEV_GPU_UTIL - - DCGM_FI_DEV_FB_USED - - DCGM_FI_DEV_FB_FREE - DCGM_FI_PROF_SM_ACTIVE - DCGM_FI_PROF_PIPE_TENSOR_ACTIVE - DCGM_FI_PROF_PIPE_FP64_ACTIVE - DCGM_FI_PROF_PIPE_FP32_ACTIVE - DCGM_FI_PROF_PIPE_FP16_ACTIVE + - DCGM_FI_DEV_ENC_UTIL + - DCGM_FI_DEV_DEC_UTIL + - DCGM_FI_DEV_FB_FREE + - DCGM_FI_DEV_FB_USED + - DCGM_FI_DEV_FB_RESERVED - DCGM_FI_PROF_DRAM_ACTIVE + - DCGM_FI_DEV_MEM_COPY_UTIL - DCGM_FI_PROF_PCIE_TX_BYTES - DCGM_FI_PROF_PCIE_RX_BYTES - DCGM_FI_PROF_NVLINK_TX_BYTES - DCGM_FI_PROF_NVLINK_RX_BYTES + - DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION + - DCGM_FI_DEV_POWER_USAGE + - DCGM_FI_DEV_GPU_TEMP + - DCGM_FI_DEV_SM_CLOCK + - DCGM_FI_DEV_POWER_VIOLATION + - DCGM_FI_DEV_THERMAL_VIOLATION + - DCGM_FI_DEV_SYNC_BOOST_VIOLATION + - DCGM_FI_DEV_BOARD_LIMIT_VIOLATION + - DCGM_FI_DEV_LOW_UTIL_VIOLATION + - DCGM_FI_DEV_RELIABILITY_VIOLATION + - DCGM_FI_DEV_TOTAL_APP_CLOCKS_VIOLATION + - 
DCGM_FI_DEV_TOTAL_BASE_CLOCKS_VIOLATION + - DCGM_FI_DEV_ECC_SBE_VOL_TOTAL + - DCGM_FI_DEV_ECC_DBE_VOL_TOTAL unsupported_fields: [] diff --git a/receiver/dcgmreceiver/testdata/Tesla_V100-SXM2-16GB.yaml b/receiver/dcgmreceiver/testdata/Tesla_V100-SXM2-16GB.yaml index 151e5a3ed..ef5321980 100644 --- a/receiver/dcgmreceiver/testdata/Tesla_V100-SXM2-16GB.yaml +++ b/receiver/dcgmreceiver/testdata/Tesla_V100-SXM2-16GB.yaml @@ -1,16 +1,35 @@ model: Tesla V100-SXM2-16GB supported_fields: + - DCGM_FI_PROF_GR_ENGINE_ACTIVE - DCGM_FI_DEV_GPU_UTIL - - DCGM_FI_DEV_FB_USED - - DCGM_FI_DEV_FB_FREE - DCGM_FI_PROF_SM_ACTIVE - DCGM_FI_PROF_PIPE_TENSOR_ACTIVE - DCGM_FI_PROF_PIPE_FP64_ACTIVE - DCGM_FI_PROF_PIPE_FP32_ACTIVE - DCGM_FI_PROF_PIPE_FP16_ACTIVE + - DCGM_FI_DEV_ENC_UTIL + - DCGM_FI_DEV_DEC_UTIL + - DCGM_FI_DEV_FB_FREE + - DCGM_FI_DEV_FB_USED + - DCGM_FI_DEV_FB_RESERVED - DCGM_FI_PROF_DRAM_ACTIVE + - DCGM_FI_DEV_MEM_COPY_UTIL - DCGM_FI_PROF_PCIE_TX_BYTES - DCGM_FI_PROF_PCIE_RX_BYTES - DCGM_FI_PROF_NVLINK_TX_BYTES - DCGM_FI_PROF_NVLINK_RX_BYTES + - DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION + - DCGM_FI_DEV_POWER_USAGE + - DCGM_FI_DEV_GPU_TEMP + - DCGM_FI_DEV_SM_CLOCK + - DCGM_FI_DEV_POWER_VIOLATION + - DCGM_FI_DEV_THERMAL_VIOLATION + - DCGM_FI_DEV_SYNC_BOOST_VIOLATION + - DCGM_FI_DEV_BOARD_LIMIT_VIOLATION + - DCGM_FI_DEV_LOW_UTIL_VIOLATION + - DCGM_FI_DEV_RELIABILITY_VIOLATION + - DCGM_FI_DEV_TOTAL_APP_CLOCKS_VIOLATION + - DCGM_FI_DEV_TOTAL_BASE_CLOCKS_VIOLATION + - DCGM_FI_DEV_ECC_SBE_VOL_TOTAL + - DCGM_FI_DEV_ECC_DBE_VOL_TOTAL unsupported_fields: [] From 5c288689a4858da687936adefd0d0f150d1b04b4 Mon Sep 17 00:00:00 2001 From: Igor Peshansky Date: Wed, 19 Jun 2024 14:17:36 -0400 Subject: [PATCH 11/38] Add test data for H100. 
--- .../testdata/NVIDIA_H100_80GB_HBM3.yaml | 35 +++++++++++++++++++ 1 file changed, 35 insertions(+) create mode 100644 receiver/dcgmreceiver/testdata/NVIDIA_H100_80GB_HBM3.yaml diff --git a/receiver/dcgmreceiver/testdata/NVIDIA_H100_80GB_HBM3.yaml b/receiver/dcgmreceiver/testdata/NVIDIA_H100_80GB_HBM3.yaml new file mode 100644 index 000000000..8874e9331 --- /dev/null +++ b/receiver/dcgmreceiver/testdata/NVIDIA_H100_80GB_HBM3.yaml @@ -0,0 +1,35 @@ +model: NVIDIA H100 80GB HBM3 +supported_fields: + - DCGM_FI_PROF_GR_ENGINE_ACTIVE + - DCGM_FI_DEV_GPU_UTIL + - DCGM_FI_PROF_SM_ACTIVE + - DCGM_FI_PROF_PIPE_TENSOR_ACTIVE + - DCGM_FI_PROF_PIPE_FP64_ACTIVE + - DCGM_FI_PROF_PIPE_FP32_ACTIVE + - DCGM_FI_PROF_PIPE_FP16_ACTIVE + - DCGM_FI_DEV_ENC_UTIL + - DCGM_FI_DEV_DEC_UTIL + - DCGM_FI_DEV_FB_FREE + - DCGM_FI_DEV_FB_USED + - DCGM_FI_DEV_FB_RESERVED + - DCGM_FI_PROF_DRAM_ACTIVE + - DCGM_FI_DEV_MEM_COPY_UTIL + - DCGM_FI_PROF_PCIE_TX_BYTES + - DCGM_FI_PROF_PCIE_RX_BYTES + - DCGM_FI_PROF_NVLINK_TX_BYTES + - DCGM_FI_PROF_NVLINK_RX_BYTES + - DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION + - DCGM_FI_DEV_POWER_USAGE + - DCGM_FI_DEV_GPU_TEMP + - DCGM_FI_DEV_SM_CLOCK + - DCGM_FI_DEV_POWER_VIOLATION + - DCGM_FI_DEV_THERMAL_VIOLATION + - DCGM_FI_DEV_SYNC_BOOST_VIOLATION + - DCGM_FI_DEV_BOARD_LIMIT_VIOLATION + - DCGM_FI_DEV_LOW_UTIL_VIOLATION + - DCGM_FI_DEV_RELIABILITY_VIOLATION + - DCGM_FI_DEV_TOTAL_APP_CLOCKS_VIOLATION + - DCGM_FI_DEV_TOTAL_BASE_CLOCKS_VIOLATION + - DCGM_FI_DEV_ECC_SBE_VOL_TOTAL + - DCGM_FI_DEV_ECC_DBE_VOL_TOTAL +unsupported_fields: [] From 80430df0eaa14b8dafe891c0fda34078ed9d5aad Mon Sep 17 00:00:00 2001 From: Igor Peshansky Date: Wed, 29 May 2024 22:19:53 -0400 Subject: [PATCH 12/38] Remove old metrics. 
--- receiver/dcgmreceiver/documentation.md | 88 ---- .../internal/metadata/generated_config.go | 32 -- .../metadata/generated_config_test.go | 16 - .../internal/metadata/generated_metrics.go | 464 ------------------ .../metadata/generated_metrics_test.go | 140 ------ .../internal/metadata/testdata/config.yaml | 32 -- receiver/dcgmreceiver/metadata.yaml | 61 --- 7 files changed, 833 deletions(-) diff --git a/receiver/dcgmreceiver/documentation.md b/receiver/dcgmreceiver/documentation.md index d7bb01b7f..83483d856 100644 --- a/receiver/dcgmreceiver/documentation.md +++ b/receiver/dcgmreceiver/documentation.md @@ -12,94 +12,6 @@ metrics: enabled: false ``` -### dcgm.gpu.memory.bytes_used - -Current number of GPU memory bytes used by state. Summing the values of all states yields the total GPU memory space. - -| Unit | Metric Type | Value Type | -| ---- | ----------- | ---------- | -| By | Gauge | Int | - -#### Attributes - -| Name | Description | Values | -| ---- | ----------- | ------ | -| memory_state | GPU memory state, one of [free, used, reserved]. | Str: ``used``, ``free``, ``reserved`` | - -### dcgm.gpu.profiling.dram_utilization - -Fraction of cycles data was being sent or received from GPU memory. - -| Unit | Metric Type | Value Type | -| ---- | ----------- | ---------- | -| 1 | Gauge | Double | - -### dcgm.gpu.profiling.nvlink_traffic_rate - -The average rate of bytes received from the GPU over NVLink over the sample period, not including protocol headers. - -| Unit | Metric Type | Value Type | -| ---- | ----------- | ---------- | -| By/s | Gauge | Int | - -#### Attributes - -| Name | Description | Values | -| ---- | ----------- | ------ | -| direction | Direction of the link traffic, one of [tx, rx]. | Str: ``tx``, ``rx`` | - -### dcgm.gpu.profiling.pcie_traffic_rate - -The average rate of bytes sent from the GPU over the PCIe bus over the sample period, including both protocol headers and data payloads. 
- -| Unit | Metric Type | Value Type | -| ---- | ----------- | ---------- | -| By/s | Gauge | Int | - -#### Attributes - -| Name | Description | Values | -| ---- | ----------- | ------ | -| direction | Direction of the link traffic, one of [tx, rx]. | Str: ``tx``, ``rx`` | - -### dcgm.gpu.profiling.pipe_utilization - -Fraction of cycles the corresponding GPU pipe was active, averaged over time and all multiprocessors. - -| Unit | Metric Type | Value Type | -| ---- | ----------- | ---------- | -| 1 | Gauge | Double | - -#### Attributes - -| Name | Description | Values | -| ---- | ----------- | ------ | -| pipe | GPU pipe in use, one of [tensor, fp64, fp32, fp16]. | Str: ``tensor``, ``fp64``, ``fp32``, ``fp16`` | - -### dcgm.gpu.profiling.sm_occupancy - -Fraction of resident warps on a multiprocessor relative to the maximum number supported, averaged over time and all multiprocessors. - -| Unit | Metric Type | Value Type | -| ---- | ----------- | ---------- | -| 1 | Gauge | Double | - -### dcgm.gpu.profiling.sm_utilization - -Fraction of time at least one warp was active on a multiprocessor, averaged over all multiprocessors. - -| Unit | Metric Type | Value Type | -| ---- | ----------- | ---------- | -| 1 | Gauge | Double | - -### dcgm.gpu.utilization - -Fraction of time the GPU was not idle. - -| Unit | Metric Type | Value Type | -| ---- | ----------- | ---------- | -| 1 | Gauge | Double | - ### gpu.dcgm.clock.frequency Multiprocessor clock frequency. diff --git a/receiver/dcgmreceiver/internal/metadata/generated_config.go b/receiver/dcgmreceiver/internal/metadata/generated_config.go index 17902b040..f1724c008 100644 --- a/receiver/dcgmreceiver/internal/metadata/generated_config.go +++ b/receiver/dcgmreceiver/internal/metadata/generated_config.go @@ -28,14 +28,6 @@ func (ms *MetricConfig) Unmarshal(parser *confmap.Conf) error { // MetricsConfig provides config for dcgm metrics. 
type MetricsConfig struct { - DcgmGpuMemoryBytesUsed MetricConfig `mapstructure:"dcgm.gpu.memory.bytes_used"` - DcgmGpuProfilingDramUtilization MetricConfig `mapstructure:"dcgm.gpu.profiling.dram_utilization"` - DcgmGpuProfilingNvlinkTrafficRate MetricConfig `mapstructure:"dcgm.gpu.profiling.nvlink_traffic_rate"` - DcgmGpuProfilingPcieTrafficRate MetricConfig `mapstructure:"dcgm.gpu.profiling.pcie_traffic_rate"` - DcgmGpuProfilingPipeUtilization MetricConfig `mapstructure:"dcgm.gpu.profiling.pipe_utilization"` - DcgmGpuProfilingSmOccupancy MetricConfig `mapstructure:"dcgm.gpu.profiling.sm_occupancy"` - DcgmGpuProfilingSmUtilization MetricConfig `mapstructure:"dcgm.gpu.profiling.sm_utilization"` - DcgmGpuUtilization MetricConfig `mapstructure:"dcgm.gpu.utilization"` GpuDcgmClockFrequency MetricConfig `mapstructure:"gpu.dcgm.clock.frequency"` GpuDcgmClockThrottleDurationTime MetricConfig `mapstructure:"gpu.dcgm.clock.throttle_duration.time"` GpuDcgmCodecDecoderUtilization MetricConfig `mapstructure:"gpu.dcgm.codec.decoder.utilization"` @@ -56,30 +48,6 @@ type MetricsConfig struct { func DefaultMetricsConfig() MetricsConfig { return MetricsConfig{ - DcgmGpuMemoryBytesUsed: MetricConfig{ - Enabled: true, - }, - DcgmGpuProfilingDramUtilization: MetricConfig{ - Enabled: true, - }, - DcgmGpuProfilingNvlinkTrafficRate: MetricConfig{ - Enabled: true, - }, - DcgmGpuProfilingPcieTrafficRate: MetricConfig{ - Enabled: true, - }, - DcgmGpuProfilingPipeUtilization: MetricConfig{ - Enabled: true, - }, - DcgmGpuProfilingSmOccupancy: MetricConfig{ - Enabled: true, - }, - DcgmGpuProfilingSmUtilization: MetricConfig{ - Enabled: true, - }, - DcgmGpuUtilization: MetricConfig{ - Enabled: true, - }, GpuDcgmClockFrequency: MetricConfig{ Enabled: true, }, diff --git a/receiver/dcgmreceiver/internal/metadata/generated_config_test.go b/receiver/dcgmreceiver/internal/metadata/generated_config_test.go index d9336c8dd..631be1e52 100644 --- 
a/receiver/dcgmreceiver/internal/metadata/generated_config_test.go +++ b/receiver/dcgmreceiver/internal/metadata/generated_config_test.go @@ -26,14 +26,6 @@ func TestMetricsBuilderConfig(t *testing.T) { name: "all_set", want: MetricsBuilderConfig{ Metrics: MetricsConfig{ - DcgmGpuMemoryBytesUsed: MetricConfig{Enabled: true}, - DcgmGpuProfilingDramUtilization: MetricConfig{Enabled: true}, - DcgmGpuProfilingNvlinkTrafficRate: MetricConfig{Enabled: true}, - DcgmGpuProfilingPcieTrafficRate: MetricConfig{Enabled: true}, - DcgmGpuProfilingPipeUtilization: MetricConfig{Enabled: true}, - DcgmGpuProfilingSmOccupancy: MetricConfig{Enabled: true}, - DcgmGpuProfilingSmUtilization: MetricConfig{Enabled: true}, - DcgmGpuUtilization: MetricConfig{Enabled: true}, GpuDcgmClockFrequency: MetricConfig{Enabled: true}, GpuDcgmClockThrottleDurationTime: MetricConfig{Enabled: true}, GpuDcgmCodecDecoderUtilization: MetricConfig{Enabled: true}, @@ -62,14 +54,6 @@ func TestMetricsBuilderConfig(t *testing.T) { name: "none_set", want: MetricsBuilderConfig{ Metrics: MetricsConfig{ - DcgmGpuMemoryBytesUsed: MetricConfig{Enabled: false}, - DcgmGpuProfilingDramUtilization: MetricConfig{Enabled: false}, - DcgmGpuProfilingNvlinkTrafficRate: MetricConfig{Enabled: false}, - DcgmGpuProfilingPcieTrafficRate: MetricConfig{Enabled: false}, - DcgmGpuProfilingPipeUtilization: MetricConfig{Enabled: false}, - DcgmGpuProfilingSmOccupancy: MetricConfig{Enabled: false}, - DcgmGpuProfilingSmUtilization: MetricConfig{Enabled: false}, - DcgmGpuUtilization: MetricConfig{Enabled: false}, GpuDcgmClockFrequency: MetricConfig{Enabled: false}, GpuDcgmClockThrottleDurationTime: MetricConfig{Enabled: false}, GpuDcgmCodecDecoderUtilization: MetricConfig{Enabled: false}, diff --git a/receiver/dcgmreceiver/internal/metadata/generated_metrics.go b/receiver/dcgmreceiver/internal/metadata/generated_metrics.go index 0e54765c5..f709357f6 100644 --- a/receiver/dcgmreceiver/internal/metadata/generated_metrics.go +++ 
b/receiver/dcgmreceiver/internal/metadata/generated_metrics.go @@ -178,406 +178,6 @@ var MapAttributeViolation = map[string]AttributeViolation{ "base_clock": AttributeViolationBaseClock, } -type metricDcgmGpuMemoryBytesUsed struct { - data pmetric.Metric // data buffer for generated metric. - config MetricConfig // metric config provided by user. - capacity int // max observed number of data points added to the metric. -} - -// init fills dcgm.gpu.memory.bytes_used metric with initial data. -func (m *metricDcgmGpuMemoryBytesUsed) init() { - m.data.SetName("dcgm.gpu.memory.bytes_used") - m.data.SetDescription("Current number of GPU memory bytes used by state. Summing the values of all states yields the total GPU memory space.") - m.data.SetUnit("By") - m.data.SetEmptyGauge() - m.data.Gauge().DataPoints().EnsureCapacity(m.capacity) -} - -func (m *metricDcgmGpuMemoryBytesUsed) recordDataPoint(start pcommon.Timestamp, ts pcommon.Timestamp, val int64, memoryStateAttributeValue string) { - if !m.config.Enabled { - return - } - dp := m.data.Gauge().DataPoints().AppendEmpty() - dp.SetStartTimestamp(start) - dp.SetTimestamp(ts) - dp.SetIntValue(val) - dp.Attributes().PutStr("memory_state", memoryStateAttributeValue) -} - -// updateCapacity saves max length of data point slices that will be used for the slice capacity. -func (m *metricDcgmGpuMemoryBytesUsed) updateCapacity() { - if m.data.Gauge().DataPoints().Len() > m.capacity { - m.capacity = m.data.Gauge().DataPoints().Len() - } -} - -// emit appends recorded metric data to a metrics slice and prepares it for recording another set of data points. 
-func (m *metricDcgmGpuMemoryBytesUsed) emit(metrics pmetric.MetricSlice) { - if m.config.Enabled && m.data.Gauge().DataPoints().Len() > 0 { - m.updateCapacity() - m.data.MoveTo(metrics.AppendEmpty()) - m.init() - } -} - -func newMetricDcgmGpuMemoryBytesUsed(cfg MetricConfig) metricDcgmGpuMemoryBytesUsed { - m := metricDcgmGpuMemoryBytesUsed{config: cfg} - if cfg.Enabled { - m.data = pmetric.NewMetric() - m.init() - } - return m -} - -type metricDcgmGpuProfilingDramUtilization struct { - data pmetric.Metric // data buffer for generated metric. - config MetricConfig // metric config provided by user. - capacity int // max observed number of data points added to the metric. -} - -// init fills dcgm.gpu.profiling.dram_utilization metric with initial data. -func (m *metricDcgmGpuProfilingDramUtilization) init() { - m.data.SetName("dcgm.gpu.profiling.dram_utilization") - m.data.SetDescription("Fraction of cycles data was being sent or received from GPU memory.") - m.data.SetUnit("1") - m.data.SetEmptyGauge() -} - -func (m *metricDcgmGpuProfilingDramUtilization) recordDataPoint(start pcommon.Timestamp, ts pcommon.Timestamp, val float64) { - if !m.config.Enabled { - return - } - dp := m.data.Gauge().DataPoints().AppendEmpty() - dp.SetStartTimestamp(start) - dp.SetTimestamp(ts) - dp.SetDoubleValue(val) -} - -// updateCapacity saves max length of data point slices that will be used for the slice capacity. -func (m *metricDcgmGpuProfilingDramUtilization) updateCapacity() { - if m.data.Gauge().DataPoints().Len() > m.capacity { - m.capacity = m.data.Gauge().DataPoints().Len() - } -} - -// emit appends recorded metric data to a metrics slice and prepares it for recording another set of data points. 
-func (m *metricDcgmGpuProfilingDramUtilization) emit(metrics pmetric.MetricSlice) { - if m.config.Enabled && m.data.Gauge().DataPoints().Len() > 0 { - m.updateCapacity() - m.data.MoveTo(metrics.AppendEmpty()) - m.init() - } -} - -func newMetricDcgmGpuProfilingDramUtilization(cfg MetricConfig) metricDcgmGpuProfilingDramUtilization { - m := metricDcgmGpuProfilingDramUtilization{config: cfg} - if cfg.Enabled { - m.data = pmetric.NewMetric() - m.init() - } - return m -} - -type metricDcgmGpuProfilingNvlinkTrafficRate struct { - data pmetric.Metric // data buffer for generated metric. - config MetricConfig // metric config provided by user. - capacity int // max observed number of data points added to the metric. -} - -// init fills dcgm.gpu.profiling.nvlink_traffic_rate metric with initial data. -func (m *metricDcgmGpuProfilingNvlinkTrafficRate) init() { - m.data.SetName("dcgm.gpu.profiling.nvlink_traffic_rate") - m.data.SetDescription("The average rate of bytes received from the GPU over NVLink over the sample period, not including protocol headers.") - m.data.SetUnit("By/s") - m.data.SetEmptyGauge() - m.data.Gauge().DataPoints().EnsureCapacity(m.capacity) -} - -func (m *metricDcgmGpuProfilingNvlinkTrafficRate) recordDataPoint(start pcommon.Timestamp, ts pcommon.Timestamp, val int64, directionAttributeValue string) { - if !m.config.Enabled { - return - } - dp := m.data.Gauge().DataPoints().AppendEmpty() - dp.SetStartTimestamp(start) - dp.SetTimestamp(ts) - dp.SetIntValue(val) - dp.Attributes().PutStr("direction", directionAttributeValue) -} - -// updateCapacity saves max length of data point slices that will be used for the slice capacity. -func (m *metricDcgmGpuProfilingNvlinkTrafficRate) updateCapacity() { - if m.data.Gauge().DataPoints().Len() > m.capacity { - m.capacity = m.data.Gauge().DataPoints().Len() - } -} - -// emit appends recorded metric data to a metrics slice and prepares it for recording another set of data points. 
-func (m *metricDcgmGpuProfilingNvlinkTrafficRate) emit(metrics pmetric.MetricSlice) { - if m.config.Enabled && m.data.Gauge().DataPoints().Len() > 0 { - m.updateCapacity() - m.data.MoveTo(metrics.AppendEmpty()) - m.init() - } -} - -func newMetricDcgmGpuProfilingNvlinkTrafficRate(cfg MetricConfig) metricDcgmGpuProfilingNvlinkTrafficRate { - m := metricDcgmGpuProfilingNvlinkTrafficRate{config: cfg} - if cfg.Enabled { - m.data = pmetric.NewMetric() - m.init() - } - return m -} - -type metricDcgmGpuProfilingPcieTrafficRate struct { - data pmetric.Metric // data buffer for generated metric. - config MetricConfig // metric config provided by user. - capacity int // max observed number of data points added to the metric. -} - -// init fills dcgm.gpu.profiling.pcie_traffic_rate metric with initial data. -func (m *metricDcgmGpuProfilingPcieTrafficRate) init() { - m.data.SetName("dcgm.gpu.profiling.pcie_traffic_rate") - m.data.SetDescription("The average rate of bytes sent from the GPU over the PCIe bus over the sample period, including both protocol headers and data payloads.") - m.data.SetUnit("By/s") - m.data.SetEmptyGauge() - m.data.Gauge().DataPoints().EnsureCapacity(m.capacity) -} - -func (m *metricDcgmGpuProfilingPcieTrafficRate) recordDataPoint(start pcommon.Timestamp, ts pcommon.Timestamp, val int64, directionAttributeValue string) { - if !m.config.Enabled { - return - } - dp := m.data.Gauge().DataPoints().AppendEmpty() - dp.SetStartTimestamp(start) - dp.SetTimestamp(ts) - dp.SetIntValue(val) - dp.Attributes().PutStr("direction", directionAttributeValue) -} - -// updateCapacity saves max length of data point slices that will be used for the slice capacity. -func (m *metricDcgmGpuProfilingPcieTrafficRate) updateCapacity() { - if m.data.Gauge().DataPoints().Len() > m.capacity { - m.capacity = m.data.Gauge().DataPoints().Len() - } -} - -// emit appends recorded metric data to a metrics slice and prepares it for recording another set of data points. 
-func (m *metricDcgmGpuProfilingPcieTrafficRate) emit(metrics pmetric.MetricSlice) { - if m.config.Enabled && m.data.Gauge().DataPoints().Len() > 0 { - m.updateCapacity() - m.data.MoveTo(metrics.AppendEmpty()) - m.init() - } -} - -func newMetricDcgmGpuProfilingPcieTrafficRate(cfg MetricConfig) metricDcgmGpuProfilingPcieTrafficRate { - m := metricDcgmGpuProfilingPcieTrafficRate{config: cfg} - if cfg.Enabled { - m.data = pmetric.NewMetric() - m.init() - } - return m -} - -type metricDcgmGpuProfilingPipeUtilization struct { - data pmetric.Metric // data buffer for generated metric. - config MetricConfig // metric config provided by user. - capacity int // max observed number of data points added to the metric. -} - -// init fills dcgm.gpu.profiling.pipe_utilization metric with initial data. -func (m *metricDcgmGpuProfilingPipeUtilization) init() { - m.data.SetName("dcgm.gpu.profiling.pipe_utilization") - m.data.SetDescription("Fraction of cycles the corresponding GPU pipe was active, averaged over time and all multiprocessors.") - m.data.SetUnit("1") - m.data.SetEmptyGauge() - m.data.Gauge().DataPoints().EnsureCapacity(m.capacity) -} - -func (m *metricDcgmGpuProfilingPipeUtilization) recordDataPoint(start pcommon.Timestamp, ts pcommon.Timestamp, val float64, pipeAttributeValue string) { - if !m.config.Enabled { - return - } - dp := m.data.Gauge().DataPoints().AppendEmpty() - dp.SetStartTimestamp(start) - dp.SetTimestamp(ts) - dp.SetDoubleValue(val) - dp.Attributes().PutStr("pipe", pipeAttributeValue) -} - -// updateCapacity saves max length of data point slices that will be used for the slice capacity. -func (m *metricDcgmGpuProfilingPipeUtilization) updateCapacity() { - if m.data.Gauge().DataPoints().Len() > m.capacity { - m.capacity = m.data.Gauge().DataPoints().Len() - } -} - -// emit appends recorded metric data to a metrics slice and prepares it for recording another set of data points. 
-func (m *metricDcgmGpuProfilingPipeUtilization) emit(metrics pmetric.MetricSlice) { - if m.config.Enabled && m.data.Gauge().DataPoints().Len() > 0 { - m.updateCapacity() - m.data.MoveTo(metrics.AppendEmpty()) - m.init() - } -} - -func newMetricDcgmGpuProfilingPipeUtilization(cfg MetricConfig) metricDcgmGpuProfilingPipeUtilization { - m := metricDcgmGpuProfilingPipeUtilization{config: cfg} - if cfg.Enabled { - m.data = pmetric.NewMetric() - m.init() - } - return m -} - -type metricDcgmGpuProfilingSmOccupancy struct { - data pmetric.Metric // data buffer for generated metric. - config MetricConfig // metric config provided by user. - capacity int // max observed number of data points added to the metric. -} - -// init fills dcgm.gpu.profiling.sm_occupancy metric with initial data. -func (m *metricDcgmGpuProfilingSmOccupancy) init() { - m.data.SetName("dcgm.gpu.profiling.sm_occupancy") - m.data.SetDescription("Fraction of resident warps on a multiprocessor relative to the maximum number supported, averaged over time and all multiprocessors.") - m.data.SetUnit("1") - m.data.SetEmptyGauge() -} - -func (m *metricDcgmGpuProfilingSmOccupancy) recordDataPoint(start pcommon.Timestamp, ts pcommon.Timestamp, val float64) { - if !m.config.Enabled { - return - } - dp := m.data.Gauge().DataPoints().AppendEmpty() - dp.SetStartTimestamp(start) - dp.SetTimestamp(ts) - dp.SetDoubleValue(val) -} - -// updateCapacity saves max length of data point slices that will be used for the slice capacity. -func (m *metricDcgmGpuProfilingSmOccupancy) updateCapacity() { - if m.data.Gauge().DataPoints().Len() > m.capacity { - m.capacity = m.data.Gauge().DataPoints().Len() - } -} - -// emit appends recorded metric data to a metrics slice and prepares it for recording another set of data points. 
-func (m *metricDcgmGpuProfilingSmOccupancy) emit(metrics pmetric.MetricSlice) { - if m.config.Enabled && m.data.Gauge().DataPoints().Len() > 0 { - m.updateCapacity() - m.data.MoveTo(metrics.AppendEmpty()) - m.init() - } -} - -func newMetricDcgmGpuProfilingSmOccupancy(cfg MetricConfig) metricDcgmGpuProfilingSmOccupancy { - m := metricDcgmGpuProfilingSmOccupancy{config: cfg} - if cfg.Enabled { - m.data = pmetric.NewMetric() - m.init() - } - return m -} - -type metricDcgmGpuProfilingSmUtilization struct { - data pmetric.Metric // data buffer for generated metric. - config MetricConfig // metric config provided by user. - capacity int // max observed number of data points added to the metric. -} - -// init fills dcgm.gpu.profiling.sm_utilization metric with initial data. -func (m *metricDcgmGpuProfilingSmUtilization) init() { - m.data.SetName("dcgm.gpu.profiling.sm_utilization") - m.data.SetDescription("Fraction of time at least one warp was active on a multiprocessor, averaged over all multiprocessors.") - m.data.SetUnit("1") - m.data.SetEmptyGauge() -} - -func (m *metricDcgmGpuProfilingSmUtilization) recordDataPoint(start pcommon.Timestamp, ts pcommon.Timestamp, val float64) { - if !m.config.Enabled { - return - } - dp := m.data.Gauge().DataPoints().AppendEmpty() - dp.SetStartTimestamp(start) - dp.SetTimestamp(ts) - dp.SetDoubleValue(val) -} - -// updateCapacity saves max length of data point slices that will be used for the slice capacity. -func (m *metricDcgmGpuProfilingSmUtilization) updateCapacity() { - if m.data.Gauge().DataPoints().Len() > m.capacity { - m.capacity = m.data.Gauge().DataPoints().Len() - } -} - -// emit appends recorded metric data to a metrics slice and prepares it for recording another set of data points. 
-func (m *metricDcgmGpuProfilingSmUtilization) emit(metrics pmetric.MetricSlice) { - if m.config.Enabled && m.data.Gauge().DataPoints().Len() > 0 { - m.updateCapacity() - m.data.MoveTo(metrics.AppendEmpty()) - m.init() - } -} - -func newMetricDcgmGpuProfilingSmUtilization(cfg MetricConfig) metricDcgmGpuProfilingSmUtilization { - m := metricDcgmGpuProfilingSmUtilization{config: cfg} - if cfg.Enabled { - m.data = pmetric.NewMetric() - m.init() - } - return m -} - -type metricDcgmGpuUtilization struct { - data pmetric.Metric // data buffer for generated metric. - config MetricConfig // metric config provided by user. - capacity int // max observed number of data points added to the metric. -} - -// init fills dcgm.gpu.utilization metric with initial data. -func (m *metricDcgmGpuUtilization) init() { - m.data.SetName("dcgm.gpu.utilization") - m.data.SetDescription("Fraction of time the GPU was not idle.") - m.data.SetUnit("1") - m.data.SetEmptyGauge() -} - -func (m *metricDcgmGpuUtilization) recordDataPoint(start pcommon.Timestamp, ts pcommon.Timestamp, val float64) { - if !m.config.Enabled { - return - } - dp := m.data.Gauge().DataPoints().AppendEmpty() - dp.SetStartTimestamp(start) - dp.SetTimestamp(ts) - dp.SetDoubleValue(val) -} - -// updateCapacity saves max length of data point slices that will be used for the slice capacity. -func (m *metricDcgmGpuUtilization) updateCapacity() { - if m.data.Gauge().DataPoints().Len() > m.capacity { - m.capacity = m.data.Gauge().DataPoints().Len() - } -} - -// emit appends recorded metric data to a metrics slice and prepares it for recording another set of data points. 
-func (m *metricDcgmGpuUtilization) emit(metrics pmetric.MetricSlice) { - if m.config.Enabled && m.data.Gauge().DataPoints().Len() > 0 { - m.updateCapacity() - m.data.MoveTo(metrics.AppendEmpty()) - m.init() - } -} - -func newMetricDcgmGpuUtilization(cfg MetricConfig) metricDcgmGpuUtilization { - m := metricDcgmGpuUtilization{config: cfg} - if cfg.Enabled { - m.data = pmetric.NewMetric() - m.init() - } - return m -} - type metricGpuDcgmClockFrequency struct { data pmetric.Metric // data buffer for generated metric. config MetricConfig // metric config provided by user. @@ -1398,14 +998,6 @@ type MetricsBuilder struct { buildInfo component.BuildInfo // contains version information. resourceAttributeIncludeFilter map[string]filter.Filter resourceAttributeExcludeFilter map[string]filter.Filter - metricDcgmGpuMemoryBytesUsed metricDcgmGpuMemoryBytesUsed - metricDcgmGpuProfilingDramUtilization metricDcgmGpuProfilingDramUtilization - metricDcgmGpuProfilingNvlinkTrafficRate metricDcgmGpuProfilingNvlinkTrafficRate - metricDcgmGpuProfilingPcieTrafficRate metricDcgmGpuProfilingPcieTrafficRate - metricDcgmGpuProfilingPipeUtilization metricDcgmGpuProfilingPipeUtilization - metricDcgmGpuProfilingSmOccupancy metricDcgmGpuProfilingSmOccupancy - metricDcgmGpuProfilingSmUtilization metricDcgmGpuProfilingSmUtilization - metricDcgmGpuUtilization metricDcgmGpuUtilization metricGpuDcgmClockFrequency metricGpuDcgmClockFrequency metricGpuDcgmClockThrottleDurationTime metricGpuDcgmClockThrottleDurationTime metricGpuDcgmCodecDecoderUtilization metricGpuDcgmCodecDecoderUtilization @@ -1440,14 +1032,6 @@ func NewMetricsBuilder(mbc MetricsBuilderConfig, settings receiver.CreateSetting startTime: pcommon.NewTimestampFromTime(time.Now()), metricsBuffer: pmetric.NewMetrics(), buildInfo: settings.BuildInfo, - metricDcgmGpuMemoryBytesUsed: newMetricDcgmGpuMemoryBytesUsed(mbc.Metrics.DcgmGpuMemoryBytesUsed), - metricDcgmGpuProfilingDramUtilization: 
newMetricDcgmGpuProfilingDramUtilization(mbc.Metrics.DcgmGpuProfilingDramUtilization), - metricDcgmGpuProfilingNvlinkTrafficRate: newMetricDcgmGpuProfilingNvlinkTrafficRate(mbc.Metrics.DcgmGpuProfilingNvlinkTrafficRate), - metricDcgmGpuProfilingPcieTrafficRate: newMetricDcgmGpuProfilingPcieTrafficRate(mbc.Metrics.DcgmGpuProfilingPcieTrafficRate), - metricDcgmGpuProfilingPipeUtilization: newMetricDcgmGpuProfilingPipeUtilization(mbc.Metrics.DcgmGpuProfilingPipeUtilization), - metricDcgmGpuProfilingSmOccupancy: newMetricDcgmGpuProfilingSmOccupancy(mbc.Metrics.DcgmGpuProfilingSmOccupancy), - metricDcgmGpuProfilingSmUtilization: newMetricDcgmGpuProfilingSmUtilization(mbc.Metrics.DcgmGpuProfilingSmUtilization), - metricDcgmGpuUtilization: newMetricDcgmGpuUtilization(mbc.Metrics.DcgmGpuUtilization), metricGpuDcgmClockFrequency: newMetricGpuDcgmClockFrequency(mbc.Metrics.GpuDcgmClockFrequency), metricGpuDcgmClockThrottleDurationTime: newMetricGpuDcgmClockThrottleDurationTime(mbc.Metrics.GpuDcgmClockThrottleDurationTime), metricGpuDcgmCodecDecoderUtilization: newMetricGpuDcgmCodecDecoderUtilization(mbc.Metrics.GpuDcgmCodecDecoderUtilization), @@ -1546,14 +1130,6 @@ func (mb *MetricsBuilder) EmitForResource(rmo ...ResourceMetricsOption) { ils.Scope().SetName("github.com/GoogleCloudPlatform/opentelemetry-operations-collector/receiver/dcgmreceiver") ils.Scope().SetVersion(mb.buildInfo.Version) ils.Metrics().EnsureCapacity(mb.metricsCapacity) - mb.metricDcgmGpuMemoryBytesUsed.emit(ils.Metrics()) - mb.metricDcgmGpuProfilingDramUtilization.emit(ils.Metrics()) - mb.metricDcgmGpuProfilingNvlinkTrafficRate.emit(ils.Metrics()) - mb.metricDcgmGpuProfilingPcieTrafficRate.emit(ils.Metrics()) - mb.metricDcgmGpuProfilingPipeUtilization.emit(ils.Metrics()) - mb.metricDcgmGpuProfilingSmOccupancy.emit(ils.Metrics()) - mb.metricDcgmGpuProfilingSmUtilization.emit(ils.Metrics()) - mb.metricDcgmGpuUtilization.emit(ils.Metrics()) mb.metricGpuDcgmClockFrequency.emit(ils.Metrics()) 
mb.metricGpuDcgmClockThrottleDurationTime.emit(ils.Metrics()) mb.metricGpuDcgmCodecDecoderUtilization.emit(ils.Metrics()) @@ -1601,46 +1177,6 @@ func (mb *MetricsBuilder) Emit(rmo ...ResourceMetricsOption) pmetric.Metrics { return metrics } -// RecordDcgmGpuMemoryBytesUsedDataPoint adds a data point to dcgm.gpu.memory.bytes_used metric. -func (mb *MetricsBuilder) RecordDcgmGpuMemoryBytesUsedDataPoint(ts pcommon.Timestamp, val int64, memoryStateAttributeValue AttributeMemoryState) { - mb.metricDcgmGpuMemoryBytesUsed.recordDataPoint(mb.startTime, ts, val, memoryStateAttributeValue.String()) -} - -// RecordDcgmGpuProfilingDramUtilizationDataPoint adds a data point to dcgm.gpu.profiling.dram_utilization metric. -func (mb *MetricsBuilder) RecordDcgmGpuProfilingDramUtilizationDataPoint(ts pcommon.Timestamp, val float64) { - mb.metricDcgmGpuProfilingDramUtilization.recordDataPoint(mb.startTime, ts, val) -} - -// RecordDcgmGpuProfilingNvlinkTrafficRateDataPoint adds a data point to dcgm.gpu.profiling.nvlink_traffic_rate metric. -func (mb *MetricsBuilder) RecordDcgmGpuProfilingNvlinkTrafficRateDataPoint(ts pcommon.Timestamp, val int64, directionAttributeValue AttributeDirection) { - mb.metricDcgmGpuProfilingNvlinkTrafficRate.recordDataPoint(mb.startTime, ts, val, directionAttributeValue.String()) -} - -// RecordDcgmGpuProfilingPcieTrafficRateDataPoint adds a data point to dcgm.gpu.profiling.pcie_traffic_rate metric. -func (mb *MetricsBuilder) RecordDcgmGpuProfilingPcieTrafficRateDataPoint(ts pcommon.Timestamp, val int64, directionAttributeValue AttributeDirection) { - mb.metricDcgmGpuProfilingPcieTrafficRate.recordDataPoint(mb.startTime, ts, val, directionAttributeValue.String()) -} - -// RecordDcgmGpuProfilingPipeUtilizationDataPoint adds a data point to dcgm.gpu.profiling.pipe_utilization metric. 
-func (mb *MetricsBuilder) RecordDcgmGpuProfilingPipeUtilizationDataPoint(ts pcommon.Timestamp, val float64, pipeAttributeValue AttributePipe) { - mb.metricDcgmGpuProfilingPipeUtilization.recordDataPoint(mb.startTime, ts, val, pipeAttributeValue.String()) -} - -// RecordDcgmGpuProfilingSmOccupancyDataPoint adds a data point to dcgm.gpu.profiling.sm_occupancy metric. -func (mb *MetricsBuilder) RecordDcgmGpuProfilingSmOccupancyDataPoint(ts pcommon.Timestamp, val float64) { - mb.metricDcgmGpuProfilingSmOccupancy.recordDataPoint(mb.startTime, ts, val) -} - -// RecordDcgmGpuProfilingSmUtilizationDataPoint adds a data point to dcgm.gpu.profiling.sm_utilization metric. -func (mb *MetricsBuilder) RecordDcgmGpuProfilingSmUtilizationDataPoint(ts pcommon.Timestamp, val float64) { - mb.metricDcgmGpuProfilingSmUtilization.recordDataPoint(mb.startTime, ts, val) -} - -// RecordDcgmGpuUtilizationDataPoint adds a data point to dcgm.gpu.utilization metric. -func (mb *MetricsBuilder) RecordDcgmGpuUtilizationDataPoint(ts pcommon.Timestamp, val float64) { - mb.metricDcgmGpuUtilization.recordDataPoint(mb.startTime, ts, val) -} - // RecordGpuDcgmClockFrequencyDataPoint adds a data point to gpu.dcgm.clock.frequency metric. 
func (mb *MetricsBuilder) RecordGpuDcgmClockFrequencyDataPoint(ts pcommon.Timestamp, val float64) { mb.metricGpuDcgmClockFrequency.recordDataPoint(mb.startTime, ts, val) diff --git a/receiver/dcgmreceiver/internal/metadata/generated_metrics_test.go b/receiver/dcgmreceiver/internal/metadata/generated_metrics_test.go index d56d05412..802d27c9a 100644 --- a/receiver/dcgmreceiver/internal/metadata/generated_metrics_test.go +++ b/receiver/dcgmreceiver/internal/metadata/generated_metrics_test.go @@ -68,38 +68,6 @@ func TestMetricsBuilder(t *testing.T) { defaultMetricsCount := 0 allMetricsCount := 0 - defaultMetricsCount++ - allMetricsCount++ - mb.RecordDcgmGpuMemoryBytesUsedDataPoint(ts, 1, AttributeMemoryStateUsed) - - defaultMetricsCount++ - allMetricsCount++ - mb.RecordDcgmGpuProfilingDramUtilizationDataPoint(ts, 1) - - defaultMetricsCount++ - allMetricsCount++ - mb.RecordDcgmGpuProfilingNvlinkTrafficRateDataPoint(ts, 1, AttributeDirectionTx) - - defaultMetricsCount++ - allMetricsCount++ - mb.RecordDcgmGpuProfilingPcieTrafficRateDataPoint(ts, 1, AttributeDirectionTx) - - defaultMetricsCount++ - allMetricsCount++ - mb.RecordDcgmGpuProfilingPipeUtilizationDataPoint(ts, 1, AttributePipeTensor) - - defaultMetricsCount++ - allMetricsCount++ - mb.RecordDcgmGpuProfilingSmOccupancyDataPoint(ts, 1) - - defaultMetricsCount++ - allMetricsCount++ - mb.RecordDcgmGpuProfilingSmUtilizationDataPoint(ts, 1) - - defaultMetricsCount++ - allMetricsCount++ - mb.RecordDcgmGpuUtilizationDataPoint(ts, 1) - defaultMetricsCount++ allMetricsCount++ mb.RecordGpuDcgmClockFrequencyDataPoint(ts, 1) @@ -189,114 +157,6 @@ func TestMetricsBuilder(t *testing.T) { validatedMetrics := make(map[string]bool) for i := 0; i < ms.Len(); i++ { switch ms.At(i).Name() { - case "dcgm.gpu.memory.bytes_used": - assert.False(t, validatedMetrics["dcgm.gpu.memory.bytes_used"], "Found a duplicate in the metrics slice: dcgm.gpu.memory.bytes_used") - validatedMetrics["dcgm.gpu.memory.bytes_used"] = true - assert.Equal(t, 
pmetric.MetricTypeGauge, ms.At(i).Type()) - assert.Equal(t, 1, ms.At(i).Gauge().DataPoints().Len()) - assert.Equal(t, "Current number of GPU memory bytes used by state. Summing the values of all states yields the total GPU memory space.", ms.At(i).Description()) - assert.Equal(t, "By", ms.At(i).Unit()) - dp := ms.At(i).Gauge().DataPoints().At(0) - assert.Equal(t, start, dp.StartTimestamp()) - assert.Equal(t, ts, dp.Timestamp()) - assert.Equal(t, pmetric.NumberDataPointValueTypeInt, dp.ValueType()) - assert.Equal(t, int64(1), dp.IntValue()) - attrVal, ok := dp.Attributes().Get("memory_state") - assert.True(t, ok) - assert.EqualValues(t, "used", attrVal.Str()) - case "dcgm.gpu.profiling.dram_utilization": - assert.False(t, validatedMetrics["dcgm.gpu.profiling.dram_utilization"], "Found a duplicate in the metrics slice: dcgm.gpu.profiling.dram_utilization") - validatedMetrics["dcgm.gpu.profiling.dram_utilization"] = true - assert.Equal(t, pmetric.MetricTypeGauge, ms.At(i).Type()) - assert.Equal(t, 1, ms.At(i).Gauge().DataPoints().Len()) - assert.Equal(t, "Fraction of cycles data was being sent or received from GPU memory.", ms.At(i).Description()) - assert.Equal(t, "1", ms.At(i).Unit()) - dp := ms.At(i).Gauge().DataPoints().At(0) - assert.Equal(t, start, dp.StartTimestamp()) - assert.Equal(t, ts, dp.Timestamp()) - assert.Equal(t, pmetric.NumberDataPointValueTypeDouble, dp.ValueType()) - assert.Equal(t, float64(1), dp.DoubleValue()) - case "dcgm.gpu.profiling.nvlink_traffic_rate": - assert.False(t, validatedMetrics["dcgm.gpu.profiling.nvlink_traffic_rate"], "Found a duplicate in the metrics slice: dcgm.gpu.profiling.nvlink_traffic_rate") - validatedMetrics["dcgm.gpu.profiling.nvlink_traffic_rate"] = true - assert.Equal(t, pmetric.MetricTypeGauge, ms.At(i).Type()) - assert.Equal(t, 1, ms.At(i).Gauge().DataPoints().Len()) - assert.Equal(t, "The average rate of bytes received from the GPU over NVLink over the sample period, not including protocol headers.", 
ms.At(i).Description()) - assert.Equal(t, "By/s", ms.At(i).Unit()) - dp := ms.At(i).Gauge().DataPoints().At(0) - assert.Equal(t, start, dp.StartTimestamp()) - assert.Equal(t, ts, dp.Timestamp()) - assert.Equal(t, pmetric.NumberDataPointValueTypeInt, dp.ValueType()) - assert.Equal(t, int64(1), dp.IntValue()) - attrVal, ok := dp.Attributes().Get("direction") - assert.True(t, ok) - assert.EqualValues(t, "tx", attrVal.Str()) - case "dcgm.gpu.profiling.pcie_traffic_rate": - assert.False(t, validatedMetrics["dcgm.gpu.profiling.pcie_traffic_rate"], "Found a duplicate in the metrics slice: dcgm.gpu.profiling.pcie_traffic_rate") - validatedMetrics["dcgm.gpu.profiling.pcie_traffic_rate"] = true - assert.Equal(t, pmetric.MetricTypeGauge, ms.At(i).Type()) - assert.Equal(t, 1, ms.At(i).Gauge().DataPoints().Len()) - assert.Equal(t, "The average rate of bytes sent from the GPU over the PCIe bus over the sample period, including both protocol headers and data payloads.", ms.At(i).Description()) - assert.Equal(t, "By/s", ms.At(i).Unit()) - dp := ms.At(i).Gauge().DataPoints().At(0) - assert.Equal(t, start, dp.StartTimestamp()) - assert.Equal(t, ts, dp.Timestamp()) - assert.Equal(t, pmetric.NumberDataPointValueTypeInt, dp.ValueType()) - assert.Equal(t, int64(1), dp.IntValue()) - attrVal, ok := dp.Attributes().Get("direction") - assert.True(t, ok) - assert.EqualValues(t, "tx", attrVal.Str()) - case "dcgm.gpu.profiling.pipe_utilization": - assert.False(t, validatedMetrics["dcgm.gpu.profiling.pipe_utilization"], "Found a duplicate in the metrics slice: dcgm.gpu.profiling.pipe_utilization") - validatedMetrics["dcgm.gpu.profiling.pipe_utilization"] = true - assert.Equal(t, pmetric.MetricTypeGauge, ms.At(i).Type()) - assert.Equal(t, 1, ms.At(i).Gauge().DataPoints().Len()) - assert.Equal(t, "Fraction of cycles the corresponding GPU pipe was active, averaged over time and all multiprocessors.", ms.At(i).Description()) - assert.Equal(t, "1", ms.At(i).Unit()) - dp := 
ms.At(i).Gauge().DataPoints().At(0) - assert.Equal(t, start, dp.StartTimestamp()) - assert.Equal(t, ts, dp.Timestamp()) - assert.Equal(t, pmetric.NumberDataPointValueTypeDouble, dp.ValueType()) - assert.Equal(t, float64(1), dp.DoubleValue()) - attrVal, ok := dp.Attributes().Get("pipe") - assert.True(t, ok) - assert.EqualValues(t, "tensor", attrVal.Str()) - case "dcgm.gpu.profiling.sm_occupancy": - assert.False(t, validatedMetrics["dcgm.gpu.profiling.sm_occupancy"], "Found a duplicate in the metrics slice: dcgm.gpu.profiling.sm_occupancy") - validatedMetrics["dcgm.gpu.profiling.sm_occupancy"] = true - assert.Equal(t, pmetric.MetricTypeGauge, ms.At(i).Type()) - assert.Equal(t, 1, ms.At(i).Gauge().DataPoints().Len()) - assert.Equal(t, "Fraction of resident warps on a multiprocessor relative to the maximum number supported, averaged over time and all multiprocessors.", ms.At(i).Description()) - assert.Equal(t, "1", ms.At(i).Unit()) - dp := ms.At(i).Gauge().DataPoints().At(0) - assert.Equal(t, start, dp.StartTimestamp()) - assert.Equal(t, ts, dp.Timestamp()) - assert.Equal(t, pmetric.NumberDataPointValueTypeDouble, dp.ValueType()) - assert.Equal(t, float64(1), dp.DoubleValue()) - case "dcgm.gpu.profiling.sm_utilization": - assert.False(t, validatedMetrics["dcgm.gpu.profiling.sm_utilization"], "Found a duplicate in the metrics slice: dcgm.gpu.profiling.sm_utilization") - validatedMetrics["dcgm.gpu.profiling.sm_utilization"] = true - assert.Equal(t, pmetric.MetricTypeGauge, ms.At(i).Type()) - assert.Equal(t, 1, ms.At(i).Gauge().DataPoints().Len()) - assert.Equal(t, "Fraction of time at least one warp was active on a multiprocessor, averaged over all multiprocessors.", ms.At(i).Description()) - assert.Equal(t, "1", ms.At(i).Unit()) - dp := ms.At(i).Gauge().DataPoints().At(0) - assert.Equal(t, start, dp.StartTimestamp()) - assert.Equal(t, ts, dp.Timestamp()) - assert.Equal(t, pmetric.NumberDataPointValueTypeDouble, dp.ValueType()) - assert.Equal(t, float64(1), 
dp.DoubleValue()) - case "dcgm.gpu.utilization": - assert.False(t, validatedMetrics["dcgm.gpu.utilization"], "Found a duplicate in the metrics slice: dcgm.gpu.utilization") - validatedMetrics["dcgm.gpu.utilization"] = true - assert.Equal(t, pmetric.MetricTypeGauge, ms.At(i).Type()) - assert.Equal(t, 1, ms.At(i).Gauge().DataPoints().Len()) - assert.Equal(t, "Fraction of time the GPU was not idle.", ms.At(i).Description()) - assert.Equal(t, "1", ms.At(i).Unit()) - dp := ms.At(i).Gauge().DataPoints().At(0) - assert.Equal(t, start, dp.StartTimestamp()) - assert.Equal(t, ts, dp.Timestamp()) - assert.Equal(t, pmetric.NumberDataPointValueTypeDouble, dp.ValueType()) - assert.Equal(t, float64(1), dp.DoubleValue()) case "gpu.dcgm.clock.frequency": assert.False(t, validatedMetrics["gpu.dcgm.clock.frequency"], "Found a duplicate in the metrics slice: gpu.dcgm.clock.frequency") validatedMetrics["gpu.dcgm.clock.frequency"] = true diff --git a/receiver/dcgmreceiver/internal/metadata/testdata/config.yaml b/receiver/dcgmreceiver/internal/metadata/testdata/config.yaml index b18d5284b..114d2a1b9 100644 --- a/receiver/dcgmreceiver/internal/metadata/testdata/config.yaml +++ b/receiver/dcgmreceiver/internal/metadata/testdata/config.yaml @@ -1,22 +1,6 @@ default: all_set: metrics: - dcgm.gpu.memory.bytes_used: - enabled: true - dcgm.gpu.profiling.dram_utilization: - enabled: true - dcgm.gpu.profiling.nvlink_traffic_rate: - enabled: true - dcgm.gpu.profiling.pcie_traffic_rate: - enabled: true - dcgm.gpu.profiling.pipe_utilization: - enabled: true - dcgm.gpu.profiling.sm_occupancy: - enabled: true - dcgm.gpu.profiling.sm_utilization: - enabled: true - dcgm.gpu.utilization: - enabled: true gpu.dcgm.clock.frequency: enabled: true gpu.dcgm.clock.throttle_duration.time: @@ -58,22 +42,6 @@ all_set: enabled: true none_set: metrics: - dcgm.gpu.memory.bytes_used: - enabled: false - dcgm.gpu.profiling.dram_utilization: - enabled: false - dcgm.gpu.profiling.nvlink_traffic_rate: - enabled: false - 
dcgm.gpu.profiling.pcie_traffic_rate: - enabled: false - dcgm.gpu.profiling.pipe_utilization: - enabled: false - dcgm.gpu.profiling.sm_occupancy: - enabled: false - dcgm.gpu.profiling.sm_utilization: - enabled: false - dcgm.gpu.utilization: - enabled: false gpu.dcgm.clock.frequency: enabled: false gpu.dcgm.clock.throttle_duration.time: diff --git a/receiver/dcgmreceiver/metadata.yaml b/receiver/dcgmreceiver/metadata.yaml index 45783be65..46c907b40 100644 --- a/receiver/dcgmreceiver/metadata.yaml +++ b/receiver/dcgmreceiver/metadata.yaml @@ -182,64 +182,3 @@ metrics: monotonic: true attributes: [xid] enabled: true - -#--- - dcgm.gpu.utilization: - description: Fraction of time the GPU was not idle. - unit: 1 - gauge: - value_type: double - enabled: true - - dcgm.gpu.memory.bytes_used: - description: Current number of GPU memory bytes used by state. Summing the values of all states yields the total GPU memory space. - unit: By - gauge: - value_type: int - attributes: [memory_state] - enabled: true - - dcgm.gpu.profiling.sm_utilization: - description: Fraction of time at least one warp was active on a multiprocessor, averaged over all multiprocessors. - unit: 1 - gauge: - value_type: double - enabled: true - - dcgm.gpu.profiling.sm_occupancy: - description: Fraction of resident warps on a multiprocessor relative to the maximum number supported, averaged over time and all multiprocessors. - unit: 1 - gauge: - value_type: double - enabled: true - - dcgm.gpu.profiling.pipe_utilization: - description: Fraction of cycles the corresponding GPU pipe was active, averaged over time and all multiprocessors. - unit: 1 - gauge: - value_type: double - attributes: [pipe] - enabled: true - - dcgm.gpu.profiling.dram_utilization: - description: Fraction of cycles data was being sent or received from GPU memory. 
- unit: 1 - gauge: - value_type: double - enabled: true - - dcgm.gpu.profiling.pcie_traffic_rate: - description: The average rate of bytes sent from the GPU over the PCIe bus over the sample period, including both protocol headers and data payloads. - unit: By/s - gauge: - value_type: int - attributes: [direction] - enabled: true - - dcgm.gpu.profiling.nvlink_traffic_rate: - description: The average rate of bytes received from the GPU over NVLink over the sample period, not including protocol headers. - unit: By/s - gauge: - value_type: int - attributes: [direction] - enabled: true From ef78211a4cc5196bb2a5ef884196741f47761d3e Mon Sep 17 00:00:00 2001 From: Igor Peshansky Date: Thu, 20 Jun 2024 19:17:03 -0400 Subject: [PATCH 13/38] Check for supported non-profiling fields by validating polled values. --- receiver/dcgmreceiver/client.go | 131 +++++++++++++++--- receiver/dcgmreceiver/client_gpu_test.go | 9 +- .../testdata/Tesla_P100-PCIE-16GB.yaml | 12 +- receiver/dcgmreceiver/testdata/Tesla_P4.yaml | 10 +- 4 files changed, 128 insertions(+), 34 deletions(-) diff --git a/receiver/dcgmreceiver/client.go b/receiver/dcgmreceiver/client.go index d141a76c3..2a9626d4f 100644 --- a/receiver/dcgmreceiver/client.go +++ b/receiver/dcgmreceiver/client.go @@ -67,13 +67,19 @@ func newClient(config *Config, logger *zap.Logger) (*dcgmClient, error) { UUIDs := make([]string, 0) enabledFieldGroup := dcgm.FieldHandle{} requestedFieldIDs := discoverRequestedFieldIDs(config) + supportedRegularFieldIDs, err := getSupportedRegularFields(requestedFieldIDs, logger) + if err != nil { + // TODO: If there is error querying the supported fields at all, let the + // receiver collect no metrics. + logger.Sugar().Warnf("Error querying supported regular fields on '%w'. 
Regular GPU metrics will not be collected.", err) + } supportedProfilingFieldIDs, err := getSupportedProfilingFields() if err != nil { // If there is error querying the supported fields at all, let the // receiver collect basic metrics: (GPU utilization, used/free memory). logger.Sugar().Warnf("Error querying supported profiling fields on '%w'. GPU profiling metrics will not be collected.", err) } - enabledFields, unavailableFields := filterSupportedFields(requestedFieldIDs, supportedProfilingFieldIDs) + enabledFields, unavailableFields := filterSupportedFields(requestedFieldIDs, supportedRegularFieldIDs, supportedProfilingFieldIDs) for _, f := range unavailableFields { logger.Sugar().Warnf("Field '%s' is not supported. Metric '%s' will not be collected", dcgmIDToName[f], dcgmIDToName[f]) } @@ -269,15 +275,16 @@ func getSupportedProfilingFields() ([]dcgm.Short, error) { // filterSupportedFields takes the user requested fields and device supported // profiling fields, and filters to return those that are requested & supported // to be the enabledFields and requested but not supported as unavailableFields -func filterSupportedFields(requestedFields []dcgm.Short, supportedProfilingFields []dcgm.Short) ([]dcgm.Short, []dcgm.Short) { +func filterSupportedFields(requestedFields []dcgm.Short, supportedRegularFields []dcgm.Short, supportedProfilingFields []dcgm.Short) ([]dcgm.Short, []dcgm.Short) { var enabledFields []dcgm.Short var unavailableFields []dcgm.Short for _, ef := range requestedFields { support := false - if ef < dcgmProfilingFieldsStart { - // Fields like `DCGM_FI_DEV_*` are not profiling - // fields, and they are always supported on all devices - support = true + for _, sf := range supportedRegularFields { + if sf == ef { + support = true + break + } } for _, sf := range supportedProfilingFields { if sf == ef { @@ -294,33 +301,117 @@ func filterSupportedFields(requestedFields []dcgm.Short, supportedProfilingField return enabledFields, unavailableFields } 
-func setWatchesOnEnabledFields(config *Config, logger *zap.Logger, deviceGroup dcgm.GroupHandle, enabledFieldIDs []dcgm.Short) (dcgm.FieldHandle, error) { +func getSupportedRegularFields(requestedFields []dcgm.Short, logger *zap.Logger) ([]dcgm.Short, error) { + var regularFields []dcgm.Short + for _, ef := range requestedFields { + if ef < dcgmProfilingFieldsStart { + // For fields like `DCGM_FI_DEV_*`, which are not + // profiling fields, try to actually retrieve the values + // all devices + regularFields = append(regularFields, ef) + } + } + if len(regularFields) == 0 { + return nil, nil + } + deviceIndices, err := dcgm.GetSupportedDevices() + if err != nil { + return nil, fmt.Errorf("Unable to discover supported GPUs on %w", err) + } + deviceGroupName := "google-cloud-ops-agent-initial-watch-group" + deviceGroup, err := dcgm.NewDefaultGroup(deviceGroupName) + defer dcgm.DestroyGroup(deviceGroup) + if err != nil { + return nil, fmt.Errorf("Unable to create DCGM GPU default group on %w", err) + } + testFieldGroup, err := setWatchesOnFields(logger, deviceGroup, regularFields, dcgmWatchParams{ + fieldGroupName: "google-cloud-ops-agent-initial-discovery", + updateFreqUs: 3600000000, // call UpdateAllFields manually + maxKeepTime: 600, + maxKeepSamples: 1, + }) + if err != nil { + return nil, fmt.Errorf("Unable to set field watches on %w", err) + } + err = dcgm.UpdateAllFields() + if err != nil { + return nil, fmt.Errorf("Unable to update fields on %w", err) + } + defer dcgm.FieldGroupDestroy(testFieldGroup) + found := make(map[dcgm.Short]bool) + for _, gpuIndex := range deviceIndices { + fieldValues, pollErr := dcgm.GetLatestValuesForFields(gpuIndex, regularFields) + if pollErr != nil { + continue + } + for _, fieldValue := range fieldValues { + dcgmName := dcgmIDToName[dcgm.Short(fieldValue.FieldId)] + if err := isValidValue(fieldValue); err != nil { + logger.Sugar().Warnf("Received invalid value (ts %d gpu %d) %s: %v", fieldValue.Ts, gpuIndex, dcgmName, err) + 
continue + } + switch fieldValue.FieldType { + case dcgm.DCGM_FT_DOUBLE: + logger.Sugar().Debugf("Discovered (ts %d gpu %d) %s = %.3f (f64)", fieldValue.Ts, gpuIndex, dcgmName, fieldValue.Float64()) + case dcgm.DCGM_FT_INT64: + logger.Sugar().Debugf("Discovered (ts %d gpu %d) %s = %d (i64)", fieldValue.Ts, gpuIndex, dcgmName, fieldValue.Int64()) + } + found[dcgm.Short(fieldValue.FieldId)] = true + } + } + // TODO: dcgmUnwatchFields is not available. + supported := make([]dcgm.Short, len(found)) + for fieldId, _ := range found { + supported = append(supported, fieldId) + } + return supported, nil +} + +// Internal-only +type dcgmWatchParams struct { + fieldGroupName string + updateFreqUs int64 + maxKeepTime float64 + maxKeepSamples int32 +} + +// Internal-only +func setWatchesOnFields(logger *zap.Logger, deviceGroup dcgm.GroupHandle, fieldIDs []dcgm.Short, params dcgmWatchParams) (dcgm.FieldHandle, error) { var err error - // Note: Add random suffix to avoid conflict amongnst any parallel collectors - fieldGroupName := fmt.Sprintf("google-cloud-ops-agent-metrics-%d", randSource.Intn(10000)) - enabledFieldGroup, err := dcgm.FieldGroupCreate(fieldGroupName, enabledFieldIDs) + fieldGroup, err := dcgm.FieldGroupCreate(params.fieldGroupName, fieldIDs) if err != nil { - return dcgm.FieldHandle{}, fmt.Errorf("Unable to create DCGM field group '%s'", fieldGroupName) + return dcgm.FieldHandle{}, fmt.Errorf("Unable to create DCGM field group '%s'", params.fieldGroupName) } - msg := fmt.Sprintf("Created DCGM field group '%s' with field ids: ", fieldGroupName) - for _, fieldID := range enabledFieldIDs { + msg := fmt.Sprintf("Created DCGM field group '%s' with field ids: ", params.fieldGroupName) + for _, fieldID := range fieldIDs { msg += fmt.Sprintf("%d ", fieldID) } logger.Sugar().Info(msg) // Note: DCGM retained samples = Max(maxKeepSamples, maxKeepTime/updateFreq) - dcgmUpdateFreq := int64(config.CollectionInterval / time.Microsecond) - dcgmMaxKeepTime := 600.0 /* 10 min */ 
- dcgmMaxKeepSamples := int32(15) - err = dcgm.WatchFieldsWithGroupEx(enabledFieldGroup, deviceGroup, dcgmUpdateFreq, dcgmMaxKeepTime, dcgmMaxKeepSamples) + dcgmUpdateFreq := params.updateFreqUs + dcgmMaxKeepTime := params.maxKeepTime + dcgmMaxKeepSamples := params.maxKeepSamples + err = dcgm.WatchFieldsWithGroupEx(fieldGroup, deviceGroup, dcgmUpdateFreq, dcgmMaxKeepTime, dcgmMaxKeepSamples) if err != nil { - return dcgm.FieldHandle{}, fmt.Errorf("Setting watches for DCGM field group '%s' failed on %w", fieldGroupName, err) + return dcgm.FieldHandle{}, fmt.Errorf("Setting watches for DCGM field group '%s' failed on %w", params.fieldGroupName, err) } - logger.Sugar().Infof("Setting watches for DCGM field group '%s' succeeded", fieldGroupName) + logger.Sugar().Infof("Setting watches for DCGM field group '%s' succeeded", params.fieldGroupName) + + return fieldGroup, nil +} - return enabledFieldGroup, nil +func setWatchesOnEnabledFields(config *Config, logger *zap.Logger, deviceGroup dcgm.GroupHandle, enabledFieldIDs []dcgm.Short) (dcgm.FieldHandle, error) { + return setWatchesOnFields(logger, deviceGroup, enabledFieldIDs, dcgmWatchParams{ + // Note: Add random suffix to avoid conflict amongnst any parallel collectors + fieldGroupName: fmt.Sprintf("google-cloud-ops-agent-metrics-%d", randSource.Intn(10000)), + // Note: DCGM retained samples = Max(maxKeepSamples, maxKeepTime/updateFreq) + updateFreqUs: int64(config.CollectionInterval / time.Microsecond), + maxKeepTime: 600.0, /* 10 min */ + maxKeepSamples: int32(15), + }) } func (client *dcgmClient) cleanup() { diff --git a/receiver/dcgmreceiver/client_gpu_test.go b/receiver/dcgmreceiver/client_gpu_test.go index 3da83501c..123c55298 100644 --- a/receiver/dcgmreceiver/client_gpu_test.go +++ b/receiver/dcgmreceiver/client_gpu_test.go @@ -47,8 +47,9 @@ type modelSupportedFields struct { UnsupportedFields []string `yaml:"unsupported_fields"` } -// TestSupportedFieldsWithGolden test getSupportedProfilingFields() against the 
-// golden files for the current GPU model +// TestSupportedFieldsWithGolden tests getSupportedRegularFields() and +// getSupportedProfilingFields() against the golden files for the current GPU +// model func TestSupportedFieldsWithGolden(t *testing.T) { config := createDefaultConfig().(*Config) client, err := newClient(config, zaptest.NewLogger(t)) @@ -57,9 +58,11 @@ func TestSupportedFieldsWithGolden(t *testing.T) { assert.NotEmpty(t, client.devicesModelName) gpuModel := client.getDeviceModelName(0) allFields := discoverRequestedFieldIDs(config) + supportedRegularFields, err := getSupportedRegularFields(allFields, zaptest.NewLogger(t)) + require.Nil(t, err) supportedProfilingFields, err := getSupportedProfilingFields() require.Nil(t, err) - enabledFields, unavailableFields := filterSupportedFields(allFields, supportedProfilingFields) + enabledFields, unavailableFields := filterSupportedFields(allFields, supportedRegularFields, supportedProfilingFields) var enabledFieldsString []string var unavailableFieldsString []string diff --git a/receiver/dcgmreceiver/testdata/Tesla_P100-PCIE-16GB.yaml b/receiver/dcgmreceiver/testdata/Tesla_P100-PCIE-16GB.yaml index b7c168027..c1df656c8 100644 --- a/receiver/dcgmreceiver/testdata/Tesla_P100-PCIE-16GB.yaml +++ b/receiver/dcgmreceiver/testdata/Tesla_P100-PCIE-16GB.yaml @@ -7,18 +7,12 @@ supported_fields: - DCGM_FI_DEV_FB_USED - DCGM_FI_DEV_FB_RESERVED - DCGM_FI_DEV_MEM_COPY_UTIL - - DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION - DCGM_FI_DEV_POWER_USAGE - DCGM_FI_DEV_GPU_TEMP - DCGM_FI_DEV_SM_CLOCK - DCGM_FI_DEV_POWER_VIOLATION - DCGM_FI_DEV_THERMAL_VIOLATION - DCGM_FI_DEV_SYNC_BOOST_VIOLATION - - DCGM_FI_DEV_BOARD_LIMIT_VIOLATION - - DCGM_FI_DEV_LOW_UTIL_VIOLATION - - DCGM_FI_DEV_RELIABILITY_VIOLATION - - DCGM_FI_DEV_TOTAL_APP_CLOCKS_VIOLATION - - DCGM_FI_DEV_TOTAL_BASE_CLOCKS_VIOLATION - DCGM_FI_DEV_ECC_SBE_VOL_TOTAL - DCGM_FI_DEV_ECC_DBE_VOL_TOTAL unsupported_fields: @@ -33,3 +27,9 @@ unsupported_fields: - DCGM_FI_PROF_PCIE_RX_BYTES 
- DCGM_FI_PROF_NVLINK_TX_BYTES - DCGM_FI_PROF_NVLINK_RX_BYTES + - DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION + - DCGM_FI_DEV_BOARD_LIMIT_VIOLATION + - DCGM_FI_DEV_LOW_UTIL_VIOLATION + - DCGM_FI_DEV_RELIABILITY_VIOLATION + - DCGM_FI_DEV_TOTAL_APP_CLOCKS_VIOLATION + - DCGM_FI_DEV_TOTAL_BASE_CLOCKS_VIOLATION diff --git a/receiver/dcgmreceiver/testdata/Tesla_P4.yaml b/receiver/dcgmreceiver/testdata/Tesla_P4.yaml index 251ec04fa..aea5cf2dc 100644 --- a/receiver/dcgmreceiver/testdata/Tesla_P4.yaml +++ b/receiver/dcgmreceiver/testdata/Tesla_P4.yaml @@ -14,11 +14,6 @@ supported_fields: - DCGM_FI_DEV_POWER_VIOLATION - DCGM_FI_DEV_THERMAL_VIOLATION - DCGM_FI_DEV_SYNC_BOOST_VIOLATION - - DCGM_FI_DEV_BOARD_LIMIT_VIOLATION - - DCGM_FI_DEV_LOW_UTIL_VIOLATION - - DCGM_FI_DEV_RELIABILITY_VIOLATION - - DCGM_FI_DEV_TOTAL_APP_CLOCKS_VIOLATION - - DCGM_FI_DEV_TOTAL_BASE_CLOCKS_VIOLATION - DCGM_FI_DEV_ECC_SBE_VOL_TOTAL - DCGM_FI_DEV_ECC_DBE_VOL_TOTAL unsupported_fields: @@ -33,3 +28,8 @@ unsupported_fields: - DCGM_FI_PROF_PCIE_RX_BYTES - DCGM_FI_PROF_NVLINK_TX_BYTES - DCGM_FI_PROF_NVLINK_RX_BYTES + - DCGM_FI_DEV_BOARD_LIMIT_VIOLATION + - DCGM_FI_DEV_LOW_UTIL_VIOLATION + - DCGM_FI_DEV_RELIABILITY_VIOLATION + - DCGM_FI_DEV_TOTAL_APP_CLOCKS_VIOLATION + - DCGM_FI_DEV_TOTAL_BASE_CLOCKS_VIOLATION From c7e015fd4e6fbd40423d4edba9f6d6b44e014ca1 Mon Sep 17 00:00:00 2001 From: Igor Peshansky Date: Thu, 13 Jun 2024 12:48:38 -0400 Subject: [PATCH 14/38] Pull in deltatorate processor. 
--- go.mod | 1 + go.sum | 2 ++ service/components.go | 2 ++ 3 files changed, 5 insertions(+) diff --git a/go.mod b/go.mod index 839af2c7f..67fd8397f 100644 --- a/go.mod +++ b/go.mod @@ -9,6 +9,7 @@ require ( github.com/open-telemetry/opentelemetry-collector-contrib/exporter/googlecloudexporter v0.102.0 github.com/open-telemetry/opentelemetry-collector-contrib/exporter/googlemanagedprometheusexporter v0.102.0 github.com/open-telemetry/opentelemetry-collector-contrib/pkg/pdatautil v0.102.0 + github.com/open-telemetry/opentelemetry-collector-contrib/processor/deltatorateprocessor v0.102.0 github.com/open-telemetry/opentelemetry-collector-contrib/processor/filterprocessor v0.102.0 github.com/open-telemetry/opentelemetry-collector-contrib/processor/groupbyattrsprocessor v0.102.0 github.com/open-telemetry/opentelemetry-collector-contrib/processor/metricstransformprocessor v0.102.0 diff --git a/go.sum b/go.sum index fbda53497..05e3e68c5 100644 --- a/go.sum +++ b/go.sum @@ -747,6 +747,8 @@ github.com/open-telemetry/opentelemetry-collector-contrib/pkg/translator/prometh github.com/open-telemetry/opentelemetry-collector-contrib/pkg/translator/prometheusremotewrite v0.102.0/go.mod h1:+Vlutd4t2XluxHYbIAfZiz3z5uWbsbiIUpipV5AnLtk= github.com/open-telemetry/opentelemetry-collector-contrib/pkg/winperfcounters v0.102.0 h1:adfJy3Sev2MaD6+plcmsSecpzy8h4MJT7eXEuif/2Ew= github.com/open-telemetry/opentelemetry-collector-contrib/pkg/winperfcounters v0.102.0/go.mod h1:FJmA939yem9GSEbqjCK6CXVbPfNPFKhvKnn+nWNpWio= +github.com/open-telemetry/opentelemetry-collector-contrib/processor/deltatorateprocessor v0.102.0 h1:mj3t9/FAQZjcZJA2kjgbpz2fSK9yD/pYpmqKEWpHJ1A= +github.com/open-telemetry/opentelemetry-collector-contrib/processor/deltatorateprocessor v0.102.0/go.mod h1:IIIjEblgrNISbDY7GPMMto9kEVIf0n9IeJoVru89kfY= github.com/open-telemetry/opentelemetry-collector-contrib/processor/filterprocessor v0.102.0 h1:DaEYlVCn58GtkyYVK0IT/ZMjRFJ+BfmR0p9I0Eq42aQ= 
github.com/open-telemetry/opentelemetry-collector-contrib/processor/filterprocessor v0.102.0/go.mod h1:u9x08rUCWdgI8Nle5XOMTCmxd0K26KTZvMMA5H8Xjyg= github.com/open-telemetry/opentelemetry-collector-contrib/processor/groupbyattrsprocessor v0.102.0 h1:huh7V8uqMakQGdnbOqTSZihfoDeOIbNHfFt62HMsk5k= diff --git a/service/components.go b/service/components.go index e6b46bbc2..8dbb8cc9e 100644 --- a/service/components.go +++ b/service/components.go @@ -18,6 +18,7 @@ import ( "github.com/open-telemetry/opentelemetry-collector-contrib/exporter/fileexporter" "github.com/open-telemetry/opentelemetry-collector-contrib/exporter/googlecloudexporter" "github.com/open-telemetry/opentelemetry-collector-contrib/exporter/googlemanagedprometheusexporter" + "github.com/open-telemetry/opentelemetry-collector-contrib/processor/deltatorateprocessor" "github.com/open-telemetry/opentelemetry-collector-contrib/processor/filterprocessor" "github.com/open-telemetry/opentelemetry-collector-contrib/processor/groupbyattrsprocessor" "github.com/open-telemetry/opentelemetry-collector-contrib/processor/metricstransformprocessor" @@ -141,6 +142,7 @@ func components() (otelcol.Factories, error) { processors := []processor.Factory{ agentmetricsprocessor.NewFactory(), casttosumprocessor.NewFactory(), + deltatorateprocessor.NewFactory(), filterprocessor.NewFactory(), normalizesumsprocessor.NewFactory(), metricstransformprocessor.NewFactory(), From af0f4d9801101335525a3a810ab5741d464990fb Mon Sep 17 00:00:00 2001 From: Igor Peshansky Date: Mon, 24 Jun 2024 14:17:34 -0400 Subject: [PATCH 15/38] Update attribute names; rename {pcie|nvlink}.traffic to {pcie|nvlink}.io. 
--- receiver/dcgmreceiver/client.go | 4 +- receiver/dcgmreceiver/documentation.md | 18 +- .../internal/metadata/generated_config.go | 8 +- .../metadata/generated_config_test.go | 8 +- .../internal/metadata/generated_metrics.go | 308 +++++++++--------- .../metadata/generated_metrics_test.go | 46 +-- .../internal/metadata/testdata/config.yaml | 8 +- receiver/dcgmreceiver/metadata.yaml | 32 +- receiver/dcgmreceiver/scraper.go | 42 +-- receiver/dcgmreceiver/scraper_gpu_test.go | 32 +- 10 files changed, 253 insertions(+), 253 deletions(-) diff --git a/receiver/dcgmreceiver/client.go b/receiver/dcgmreceiver/client.go index 2a9626d4f..99431cf5f 100644 --- a/receiver/dcgmreceiver/client.go +++ b/receiver/dcgmreceiver/client.go @@ -203,11 +203,11 @@ func discoverRequestedFieldIDs(config *Config) []dcgm.Short { requestedFieldIDs = append(requestedFieldIDs, dcgm.DCGM_FI["DCGM_FI_PROF_DRAM_ACTIVE"]) requestedFieldIDs = append(requestedFieldIDs, dcgm.DCGM_FI["DCGM_FI_DEV_MEM_COPY_UTIL"]) // fallback } - if config.Metrics.GpuDcgmPcieTraffic.Enabled { + if config.Metrics.GpuDcgmPcieIo.Enabled { requestedFieldIDs = append(requestedFieldIDs, dcgm.DCGM_FI["DCGM_FI_PROF_PCIE_TX_BYTES"]) requestedFieldIDs = append(requestedFieldIDs, dcgm.DCGM_FI["DCGM_FI_PROF_PCIE_RX_BYTES"]) } - if config.Metrics.GpuDcgmNvlinkTraffic.Enabled { + if config.Metrics.GpuDcgmNvlinkIo.Enabled { requestedFieldIDs = append(requestedFieldIDs, dcgm.DCGM_FI["DCGM_FI_PROF_NVLINK_TX_BYTES"]) requestedFieldIDs = append(requestedFieldIDs, dcgm.DCGM_FI["DCGM_FI_PROF_NVLINK_RX_BYTES"]) } diff --git a/receiver/dcgmreceiver/documentation.md b/receiver/dcgmreceiver/documentation.md index 83483d856..9ac286be8 100644 --- a/receiver/dcgmreceiver/documentation.md +++ b/receiver/dcgmreceiver/documentation.md @@ -32,7 +32,7 @@ Clock throttle total duration. 
| Name | Description | Values | | ---- | ----------- | ------ | -| violation | Reason for throttling, one of [power, thermal, sync_boost, board_limit, low_util, reliability, app_clock, base_clock]. | Str: ``power``, ``thermal``, ``sync_boost``, ``board_limit``, ``low_util``, ``reliability``, ``app_clock``, ``base_clock`` | +| gpu.clock.violation | Reason for throttling, one of [power, thermal, sync_boost, board_limit, low_util, reliability, app_clock, base_clock]. | Str: ``power``, ``thermal``, ``sync_boost``, ``board_limit``, ``low_util``, ``reliability``, ``app_clock``, ``base_clock`` | ### gpu.dcgm.codec.decoder.utilization @@ -62,7 +62,7 @@ Data corruption errors. | Name | Description | Values | | ---- | ----------- | ------ | -| error_type | The type of error, one of [sbe, dbe]. | Str: ``sbe``, ``dbe`` | +| gpu.error.type | The type of error, one of [sbe, dbe]. | Str: ``sbe``, ``dbe`` | ### gpu.dcgm.energy_consumption @@ -92,9 +92,9 @@ Current number of GPU memory bytes used by state. Summing the values of all stat | Name | Description | Values | | ---- | ----------- | ------ | -| memory_state | GPU memory state, one of [free, used, reserved]. | Str: ``used``, ``free``, ``reserved`` | +| gpu.memory.state | GPU memory state, one of [free, used, reserved]. | Str: ``used``, ``free``, ``reserved`` | -### gpu.dcgm.nvlink.traffic +### gpu.dcgm.nvlink.io The number of bytes sent over NVLink, not including protocol headers. @@ -106,9 +106,9 @@ The number of bytes sent over NVLink, not including protocol headers. | Name | Description | Values | | ---- | ----------- | ------ | -| direction | Direction of the link traffic, one of [tx, rx]. | Str: ``tx``, ``rx`` | +| network.io.direction | Direction of the link traffic, one of [tx, rx]. | Str: ``transmit``, ``receive`` | -### gpu.dcgm.pcie.traffic +### gpu.dcgm.pcie.io The number of bytes sent over the PCIe bus, including both protocol headers and data payloads. 
@@ -120,7 +120,7 @@ The number of bytes sent over the PCIe bus, including both protocol headers and | Name | Description | Values | | ---- | ----------- | ------ | -| direction | Direction of the link traffic, one of [tx, rx]. | Str: ``tx``, ``rx`` | +| network.io.direction | Direction of the link traffic, one of [tx, rx]. | Str: ``transmit``, ``receive`` | ### gpu.dcgm.pipe.utilization @@ -134,7 +134,7 @@ Fraction of cycles the corresponding GPU pipe was active, averaged over time and | Name | Description | Values | | ---- | ----------- | ------ | -| pipe | GPU pipe in use, one of [tensor, fp64, fp32, fp16]. | Str: ``tensor``, ``fp64``, ``fp32``, ``fp16`` | +| gpu.pipe | GPU pipe in use, one of [tensor, fp64, fp32, fp16]. | Str: ``tensor``, ``fp64``, ``fp32``, ``fp16`` | ### gpu.dcgm.sm.utilization @@ -172,7 +172,7 @@ XID errors. | Name | Description | Values | | ---- | ----------- | ------ | -| xid | The XID code for the error, 1..143. | Any Int | +| gpu.error.xid | The XID code for the error, 1..143. 
| Any Int | ## Optional Metrics diff --git a/receiver/dcgmreceiver/internal/metadata/generated_config.go b/receiver/dcgmreceiver/internal/metadata/generated_config.go index f1724c008..c41448340 100644 --- a/receiver/dcgmreceiver/internal/metadata/generated_config.go +++ b/receiver/dcgmreceiver/internal/metadata/generated_config.go @@ -36,8 +36,8 @@ type MetricsConfig struct { GpuDcgmEnergyConsumption MetricConfig `mapstructure:"gpu.dcgm.energy_consumption"` GpuDcgmMemoryBandwidthUtilization MetricConfig `mapstructure:"gpu.dcgm.memory.bandwidth_utilization"` GpuDcgmMemoryBytesUsed MetricConfig `mapstructure:"gpu.dcgm.memory.bytes_used"` - GpuDcgmNvlinkTraffic MetricConfig `mapstructure:"gpu.dcgm.nvlink.traffic"` - GpuDcgmPcieTraffic MetricConfig `mapstructure:"gpu.dcgm.pcie.traffic"` + GpuDcgmNvlinkIo MetricConfig `mapstructure:"gpu.dcgm.nvlink.io"` + GpuDcgmPcieIo MetricConfig `mapstructure:"gpu.dcgm.pcie.io"` GpuDcgmPipeUtilization MetricConfig `mapstructure:"gpu.dcgm.pipe.utilization"` GpuDcgmSmOccupancy MetricConfig `mapstructure:"gpu.dcgm.sm.occupancy"` GpuDcgmSmUtilization MetricConfig `mapstructure:"gpu.dcgm.sm.utilization"` @@ -72,10 +72,10 @@ func DefaultMetricsConfig() MetricsConfig { GpuDcgmMemoryBytesUsed: MetricConfig{ Enabled: true, }, - GpuDcgmNvlinkTraffic: MetricConfig{ + GpuDcgmNvlinkIo: MetricConfig{ Enabled: true, }, - GpuDcgmPcieTraffic: MetricConfig{ + GpuDcgmPcieIo: MetricConfig{ Enabled: true, }, GpuDcgmPipeUtilization: MetricConfig{ diff --git a/receiver/dcgmreceiver/internal/metadata/generated_config_test.go b/receiver/dcgmreceiver/internal/metadata/generated_config_test.go index 631be1e52..ca1405d53 100644 --- a/receiver/dcgmreceiver/internal/metadata/generated_config_test.go +++ b/receiver/dcgmreceiver/internal/metadata/generated_config_test.go @@ -34,8 +34,8 @@ func TestMetricsBuilderConfig(t *testing.T) { GpuDcgmEnergyConsumption: MetricConfig{Enabled: true}, GpuDcgmMemoryBandwidthUtilization: MetricConfig{Enabled: true}, 
GpuDcgmMemoryBytesUsed: MetricConfig{Enabled: true}, - GpuDcgmNvlinkTraffic: MetricConfig{Enabled: true}, - GpuDcgmPcieTraffic: MetricConfig{Enabled: true}, + GpuDcgmNvlinkIo: MetricConfig{Enabled: true}, + GpuDcgmPcieIo: MetricConfig{Enabled: true}, GpuDcgmPipeUtilization: MetricConfig{Enabled: true}, GpuDcgmSmOccupancy: MetricConfig{Enabled: true}, GpuDcgmSmUtilization: MetricConfig{Enabled: true}, @@ -62,8 +62,8 @@ func TestMetricsBuilderConfig(t *testing.T) { GpuDcgmEnergyConsumption: MetricConfig{Enabled: false}, GpuDcgmMemoryBandwidthUtilization: MetricConfig{Enabled: false}, GpuDcgmMemoryBytesUsed: MetricConfig{Enabled: false}, - GpuDcgmNvlinkTraffic: MetricConfig{Enabled: false}, - GpuDcgmPcieTraffic: MetricConfig{Enabled: false}, + GpuDcgmNvlinkIo: MetricConfig{Enabled: false}, + GpuDcgmPcieIo: MetricConfig{Enabled: false}, GpuDcgmPipeUtilization: MetricConfig{Enabled: false}, GpuDcgmSmOccupancy: MetricConfig{Enabled: false}, GpuDcgmSmUtilization: MetricConfig{Enabled: false}, diff --git a/receiver/dcgmreceiver/internal/metadata/generated_metrics.go b/receiver/dcgmreceiver/internal/metadata/generated_metrics.go index f709357f6..0940036d2 100644 --- a/receiver/dcgmreceiver/internal/metadata/generated_metrics.go +++ b/receiver/dcgmreceiver/internal/metadata/generated_metrics.go @@ -12,170 +12,170 @@ import ( "go.opentelemetry.io/collector/receiver" ) -// AttributeDirection specifies the a value direction attribute. -type AttributeDirection int +// AttributeGpuClockViolation specifies the a value gpu.clock.violation attribute. 
+type AttributeGpuClockViolation int const ( - _ AttributeDirection = iota - AttributeDirectionTx - AttributeDirectionRx + _ AttributeGpuClockViolation = iota + AttributeGpuClockViolationPower + AttributeGpuClockViolationThermal + AttributeGpuClockViolationSyncBoost + AttributeGpuClockViolationBoardLimit + AttributeGpuClockViolationLowUtil + AttributeGpuClockViolationReliability + AttributeGpuClockViolationAppClock + AttributeGpuClockViolationBaseClock ) -// String returns the string representation of the AttributeDirection. -func (av AttributeDirection) String() string { +// String returns the string representation of the AttributeGpuClockViolation. +func (av AttributeGpuClockViolation) String() string { switch av { - case AttributeDirectionTx: - return "tx" - case AttributeDirectionRx: - return "rx" + case AttributeGpuClockViolationPower: + return "power" + case AttributeGpuClockViolationThermal: + return "thermal" + case AttributeGpuClockViolationSyncBoost: + return "sync_boost" + case AttributeGpuClockViolationBoardLimit: + return "board_limit" + case AttributeGpuClockViolationLowUtil: + return "low_util" + case AttributeGpuClockViolationReliability: + return "reliability" + case AttributeGpuClockViolationAppClock: + return "app_clock" + case AttributeGpuClockViolationBaseClock: + return "base_clock" } return "" } -// MapAttributeDirection is a helper map of string to AttributeDirection attribute value. -var MapAttributeDirection = map[string]AttributeDirection{ - "tx": AttributeDirectionTx, - "rx": AttributeDirectionRx, +// MapAttributeGpuClockViolation is a helper map of string to AttributeGpuClockViolation attribute value. 
+var MapAttributeGpuClockViolation = map[string]AttributeGpuClockViolation{ + "power": AttributeGpuClockViolationPower, + "thermal": AttributeGpuClockViolationThermal, + "sync_boost": AttributeGpuClockViolationSyncBoost, + "board_limit": AttributeGpuClockViolationBoardLimit, + "low_util": AttributeGpuClockViolationLowUtil, + "reliability": AttributeGpuClockViolationReliability, + "app_clock": AttributeGpuClockViolationAppClock, + "base_clock": AttributeGpuClockViolationBaseClock, } -// AttributeErrorType specifies the a value error_type attribute. -type AttributeErrorType int +// AttributeGpuErrorType specifies the a value gpu.error.type attribute. +type AttributeGpuErrorType int const ( - _ AttributeErrorType = iota - AttributeErrorTypeSbe - AttributeErrorTypeDbe + _ AttributeGpuErrorType = iota + AttributeGpuErrorTypeSbe + AttributeGpuErrorTypeDbe ) -// String returns the string representation of the AttributeErrorType. -func (av AttributeErrorType) String() string { +// String returns the string representation of the AttributeGpuErrorType. +func (av AttributeGpuErrorType) String() string { switch av { - case AttributeErrorTypeSbe: + case AttributeGpuErrorTypeSbe: return "sbe" - case AttributeErrorTypeDbe: + case AttributeGpuErrorTypeDbe: return "dbe" } return "" } -// MapAttributeErrorType is a helper map of string to AttributeErrorType attribute value. -var MapAttributeErrorType = map[string]AttributeErrorType{ - "sbe": AttributeErrorTypeSbe, - "dbe": AttributeErrorTypeDbe, +// MapAttributeGpuErrorType is a helper map of string to AttributeGpuErrorType attribute value. +var MapAttributeGpuErrorType = map[string]AttributeGpuErrorType{ + "sbe": AttributeGpuErrorTypeSbe, + "dbe": AttributeGpuErrorTypeDbe, } -// AttributeMemoryState specifies the a value memory_state attribute. -type AttributeMemoryState int +// AttributeGpuMemoryState specifies the a value gpu.memory.state attribute. 
+type AttributeGpuMemoryState int const ( - _ AttributeMemoryState = iota - AttributeMemoryStateUsed - AttributeMemoryStateFree - AttributeMemoryStateReserved + _ AttributeGpuMemoryState = iota + AttributeGpuMemoryStateUsed + AttributeGpuMemoryStateFree + AttributeGpuMemoryStateReserved ) -// String returns the string representation of the AttributeMemoryState. -func (av AttributeMemoryState) String() string { +// String returns the string representation of the AttributeGpuMemoryState. +func (av AttributeGpuMemoryState) String() string { switch av { - case AttributeMemoryStateUsed: + case AttributeGpuMemoryStateUsed: return "used" - case AttributeMemoryStateFree: + case AttributeGpuMemoryStateFree: return "free" - case AttributeMemoryStateReserved: + case AttributeGpuMemoryStateReserved: return "reserved" } return "" } -// MapAttributeMemoryState is a helper map of string to AttributeMemoryState attribute value. -var MapAttributeMemoryState = map[string]AttributeMemoryState{ - "used": AttributeMemoryStateUsed, - "free": AttributeMemoryStateFree, - "reserved": AttributeMemoryStateReserved, +// MapAttributeGpuMemoryState is a helper map of string to AttributeGpuMemoryState attribute value. +var MapAttributeGpuMemoryState = map[string]AttributeGpuMemoryState{ + "used": AttributeGpuMemoryStateUsed, + "free": AttributeGpuMemoryStateFree, + "reserved": AttributeGpuMemoryStateReserved, } -// AttributePipe specifies the a value pipe attribute. -type AttributePipe int +// AttributeGpuPipe specifies the a value gpu.pipe attribute. +type AttributeGpuPipe int const ( - _ AttributePipe = iota - AttributePipeTensor - AttributePipeFp64 - AttributePipeFp32 - AttributePipeFp16 + _ AttributeGpuPipe = iota + AttributeGpuPipeTensor + AttributeGpuPipeFp64 + AttributeGpuPipeFp32 + AttributeGpuPipeFp16 ) -// String returns the string representation of the AttributePipe. -func (av AttributePipe) String() string { +// String returns the string representation of the AttributeGpuPipe. 
+func (av AttributeGpuPipe) String() string { switch av { - case AttributePipeTensor: + case AttributeGpuPipeTensor: return "tensor" - case AttributePipeFp64: + case AttributeGpuPipeFp64: return "fp64" - case AttributePipeFp32: + case AttributeGpuPipeFp32: return "fp32" - case AttributePipeFp16: + case AttributeGpuPipeFp16: return "fp16" } return "" } -// MapAttributePipe is a helper map of string to AttributePipe attribute value. -var MapAttributePipe = map[string]AttributePipe{ - "tensor": AttributePipeTensor, - "fp64": AttributePipeFp64, - "fp32": AttributePipeFp32, - "fp16": AttributePipeFp16, +// MapAttributeGpuPipe is a helper map of string to AttributeGpuPipe attribute value. +var MapAttributeGpuPipe = map[string]AttributeGpuPipe{ + "tensor": AttributeGpuPipeTensor, + "fp64": AttributeGpuPipeFp64, + "fp32": AttributeGpuPipeFp32, + "fp16": AttributeGpuPipeFp16, } -// AttributeViolation specifies the a value violation attribute. -type AttributeViolation int +// AttributeNetworkIoDirection specifies the a value network.io.direction attribute. +type AttributeNetworkIoDirection int const ( - _ AttributeViolation = iota - AttributeViolationPower - AttributeViolationThermal - AttributeViolationSyncBoost - AttributeViolationBoardLimit - AttributeViolationLowUtil - AttributeViolationReliability - AttributeViolationAppClock - AttributeViolationBaseClock + _ AttributeNetworkIoDirection = iota + AttributeNetworkIoDirectionTransmit + AttributeNetworkIoDirectionReceive ) -// String returns the string representation of the AttributeViolation. -func (av AttributeViolation) String() string { +// String returns the string representation of the AttributeNetworkIoDirection. 
+func (av AttributeNetworkIoDirection) String() string { switch av { - case AttributeViolationPower: - return "power" - case AttributeViolationThermal: - return "thermal" - case AttributeViolationSyncBoost: - return "sync_boost" - case AttributeViolationBoardLimit: - return "board_limit" - case AttributeViolationLowUtil: - return "low_util" - case AttributeViolationReliability: - return "reliability" - case AttributeViolationAppClock: - return "app_clock" - case AttributeViolationBaseClock: - return "base_clock" + case AttributeNetworkIoDirectionTransmit: + return "transmit" + case AttributeNetworkIoDirectionReceive: + return "receive" } return "" } -// MapAttributeViolation is a helper map of string to AttributeViolation attribute value. -var MapAttributeViolation = map[string]AttributeViolation{ - "power": AttributeViolationPower, - "thermal": AttributeViolationThermal, - "sync_boost": AttributeViolationSyncBoost, - "board_limit": AttributeViolationBoardLimit, - "low_util": AttributeViolationLowUtil, - "reliability": AttributeViolationReliability, - "app_clock": AttributeViolationAppClock, - "base_clock": AttributeViolationBaseClock, +// MapAttributeNetworkIoDirection is a helper map of string to AttributeNetworkIoDirection attribute value. 
+var MapAttributeNetworkIoDirection = map[string]AttributeNetworkIoDirection{ + "transmit": AttributeNetworkIoDirectionTransmit, + "receive": AttributeNetworkIoDirectionReceive, } type metricGpuDcgmClockFrequency struct { @@ -244,7 +244,7 @@ func (m *metricGpuDcgmClockThrottleDurationTime) init() { m.data.Sum().DataPoints().EnsureCapacity(m.capacity) } -func (m *metricGpuDcgmClockThrottleDurationTime) recordDataPoint(start pcommon.Timestamp, ts pcommon.Timestamp, val float64, violationAttributeValue string) { +func (m *metricGpuDcgmClockThrottleDurationTime) recordDataPoint(start pcommon.Timestamp, ts pcommon.Timestamp, val float64, gpuClockViolationAttributeValue string) { if !m.config.Enabled { return } @@ -252,7 +252,7 @@ func (m *metricGpuDcgmClockThrottleDurationTime) recordDataPoint(start pcommon.T dp.SetStartTimestamp(start) dp.SetTimestamp(ts) dp.SetDoubleValue(val) - dp.Attributes().PutStr("violation", violationAttributeValue) + dp.Attributes().PutStr("gpu.clock.violation", gpuClockViolationAttributeValue) } // updateCapacity saves max length of data point slices that will be used for the slice capacity. @@ -395,7 +395,7 @@ func (m *metricGpuDcgmEccErrors) init() { m.data.Sum().DataPoints().EnsureCapacity(m.capacity) } -func (m *metricGpuDcgmEccErrors) recordDataPoint(start pcommon.Timestamp, ts pcommon.Timestamp, val int64, errorTypeAttributeValue string) { +func (m *metricGpuDcgmEccErrors) recordDataPoint(start pcommon.Timestamp, ts pcommon.Timestamp, val int64, gpuErrorTypeAttributeValue string) { if !m.config.Enabled { return } @@ -403,7 +403,7 @@ func (m *metricGpuDcgmEccErrors) recordDataPoint(start pcommon.Timestamp, ts pco dp.SetStartTimestamp(start) dp.SetTimestamp(ts) dp.SetIntValue(val) - dp.Attributes().PutStr("error_type", errorTypeAttributeValue) + dp.Attributes().PutStr("gpu.error.type", gpuErrorTypeAttributeValue) } // updateCapacity saves max length of data point slices that will be used for the slice capacity. 
@@ -546,7 +546,7 @@ func (m *metricGpuDcgmMemoryBytesUsed) init() { m.data.Gauge().DataPoints().EnsureCapacity(m.capacity) } -func (m *metricGpuDcgmMemoryBytesUsed) recordDataPoint(start pcommon.Timestamp, ts pcommon.Timestamp, val int64, memoryStateAttributeValue string) { +func (m *metricGpuDcgmMemoryBytesUsed) recordDataPoint(start pcommon.Timestamp, ts pcommon.Timestamp, val int64, gpuMemoryStateAttributeValue string) { if !m.config.Enabled { return } @@ -554,7 +554,7 @@ func (m *metricGpuDcgmMemoryBytesUsed) recordDataPoint(start pcommon.Timestamp, dp.SetStartTimestamp(start) dp.SetTimestamp(ts) dp.SetIntValue(val) - dp.Attributes().PutStr("memory_state", memoryStateAttributeValue) + dp.Attributes().PutStr("gpu.memory.state", gpuMemoryStateAttributeValue) } // updateCapacity saves max length of data point slices that will be used for the slice capacity. @@ -582,15 +582,15 @@ func newMetricGpuDcgmMemoryBytesUsed(cfg MetricConfig) metricGpuDcgmMemoryBytesU return m } -type metricGpuDcgmNvlinkTraffic struct { +type metricGpuDcgmNvlinkIo struct { data pmetric.Metric // data buffer for generated metric. config MetricConfig // metric config provided by user. capacity int // max observed number of data points added to the metric. } -// init fills gpu.dcgm.nvlink.traffic metric with initial data. -func (m *metricGpuDcgmNvlinkTraffic) init() { - m.data.SetName("gpu.dcgm.nvlink.traffic") +// init fills gpu.dcgm.nvlink.io metric with initial data. 
+func (m *metricGpuDcgmNvlinkIo) init() { + m.data.SetName("gpu.dcgm.nvlink.io") m.data.SetDescription("The number of bytes sent over NVLink, not including protocol headers.") m.data.SetUnit("By") m.data.SetEmptySum() @@ -599,7 +599,7 @@ func (m *metricGpuDcgmNvlinkTraffic) init() { m.data.Sum().DataPoints().EnsureCapacity(m.capacity) } -func (m *metricGpuDcgmNvlinkTraffic) recordDataPoint(start pcommon.Timestamp, ts pcommon.Timestamp, val int64, directionAttributeValue string) { +func (m *metricGpuDcgmNvlinkIo) recordDataPoint(start pcommon.Timestamp, ts pcommon.Timestamp, val int64, networkIoDirectionAttributeValue string) { if !m.config.Enabled { return } @@ -607,18 +607,18 @@ func (m *metricGpuDcgmNvlinkTraffic) recordDataPoint(start pcommon.Timestamp, ts dp.SetStartTimestamp(start) dp.SetTimestamp(ts) dp.SetIntValue(val) - dp.Attributes().PutStr("direction", directionAttributeValue) + dp.Attributes().PutStr("network.io.direction", networkIoDirectionAttributeValue) } // updateCapacity saves max length of data point slices that will be used for the slice capacity. -func (m *metricGpuDcgmNvlinkTraffic) updateCapacity() { +func (m *metricGpuDcgmNvlinkIo) updateCapacity() { if m.data.Sum().DataPoints().Len() > m.capacity { m.capacity = m.data.Sum().DataPoints().Len() } } // emit appends recorded metric data to a metrics slice and prepares it for recording another set of data points. 
-func (m *metricGpuDcgmNvlinkTraffic) emit(metrics pmetric.MetricSlice) { +func (m *metricGpuDcgmNvlinkIo) emit(metrics pmetric.MetricSlice) { if m.config.Enabled && m.data.Sum().DataPoints().Len() > 0 { m.updateCapacity() m.data.MoveTo(metrics.AppendEmpty()) @@ -626,8 +626,8 @@ func (m *metricGpuDcgmNvlinkTraffic) emit(metrics pmetric.MetricSlice) { } } -func newMetricGpuDcgmNvlinkTraffic(cfg MetricConfig) metricGpuDcgmNvlinkTraffic { - m := metricGpuDcgmNvlinkTraffic{config: cfg} +func newMetricGpuDcgmNvlinkIo(cfg MetricConfig) metricGpuDcgmNvlinkIo { + m := metricGpuDcgmNvlinkIo{config: cfg} if cfg.Enabled { m.data = pmetric.NewMetric() m.init() @@ -635,15 +635,15 @@ func newMetricGpuDcgmNvlinkTraffic(cfg MetricConfig) metricGpuDcgmNvlinkTraffic return m } -type metricGpuDcgmPcieTraffic struct { +type metricGpuDcgmPcieIo struct { data pmetric.Metric // data buffer for generated metric. config MetricConfig // metric config provided by user. capacity int // max observed number of data points added to the metric. } -// init fills gpu.dcgm.pcie.traffic metric with initial data. -func (m *metricGpuDcgmPcieTraffic) init() { - m.data.SetName("gpu.dcgm.pcie.traffic") +// init fills gpu.dcgm.pcie.io metric with initial data. 
+func (m *metricGpuDcgmPcieIo) init() { + m.data.SetName("gpu.dcgm.pcie.io") m.data.SetDescription("The number of bytes sent over the PCIe bus, including both protocol headers and data payloads.") m.data.SetUnit("By") m.data.SetEmptySum() @@ -652,7 +652,7 @@ func (m *metricGpuDcgmPcieTraffic) init() { m.data.Sum().DataPoints().EnsureCapacity(m.capacity) } -func (m *metricGpuDcgmPcieTraffic) recordDataPoint(start pcommon.Timestamp, ts pcommon.Timestamp, val int64, directionAttributeValue string) { +func (m *metricGpuDcgmPcieIo) recordDataPoint(start pcommon.Timestamp, ts pcommon.Timestamp, val int64, networkIoDirectionAttributeValue string) { if !m.config.Enabled { return } @@ -660,18 +660,18 @@ func (m *metricGpuDcgmPcieTraffic) recordDataPoint(start pcommon.Timestamp, ts p dp.SetStartTimestamp(start) dp.SetTimestamp(ts) dp.SetIntValue(val) - dp.Attributes().PutStr("direction", directionAttributeValue) + dp.Attributes().PutStr("network.io.direction", networkIoDirectionAttributeValue) } // updateCapacity saves max length of data point slices that will be used for the slice capacity. -func (m *metricGpuDcgmPcieTraffic) updateCapacity() { +func (m *metricGpuDcgmPcieIo) updateCapacity() { if m.data.Sum().DataPoints().Len() > m.capacity { m.capacity = m.data.Sum().DataPoints().Len() } } // emit appends recorded metric data to a metrics slice and prepares it for recording another set of data points. 
-func (m *metricGpuDcgmPcieTraffic) emit(metrics pmetric.MetricSlice) { +func (m *metricGpuDcgmPcieIo) emit(metrics pmetric.MetricSlice) { if m.config.Enabled && m.data.Sum().DataPoints().Len() > 0 { m.updateCapacity() m.data.MoveTo(metrics.AppendEmpty()) @@ -679,8 +679,8 @@ func (m *metricGpuDcgmPcieTraffic) emit(metrics pmetric.MetricSlice) { } } -func newMetricGpuDcgmPcieTraffic(cfg MetricConfig) metricGpuDcgmPcieTraffic { - m := metricGpuDcgmPcieTraffic{config: cfg} +func newMetricGpuDcgmPcieIo(cfg MetricConfig) metricGpuDcgmPcieIo { + m := metricGpuDcgmPcieIo{config: cfg} if cfg.Enabled { m.data = pmetric.NewMetric() m.init() @@ -703,7 +703,7 @@ func (m *metricGpuDcgmPipeUtilization) init() { m.data.Gauge().DataPoints().EnsureCapacity(m.capacity) } -func (m *metricGpuDcgmPipeUtilization) recordDataPoint(start pcommon.Timestamp, ts pcommon.Timestamp, val float64, pipeAttributeValue string) { +func (m *metricGpuDcgmPipeUtilization) recordDataPoint(start pcommon.Timestamp, ts pcommon.Timestamp, val float64, gpuPipeAttributeValue string) { if !m.config.Enabled { return } @@ -711,7 +711,7 @@ func (m *metricGpuDcgmPipeUtilization) recordDataPoint(start pcommon.Timestamp, dp.SetStartTimestamp(start) dp.SetTimestamp(ts) dp.SetDoubleValue(val) - dp.Attributes().PutStr("pipe", pipeAttributeValue) + dp.Attributes().PutStr("gpu.pipe", gpuPipeAttributeValue) } // updateCapacity saves max length of data point slices that will be used for the slice capacity. 
@@ -952,7 +952,7 @@ func (m *metricGpuDcgmXidErrors) init() { m.data.Sum().DataPoints().EnsureCapacity(m.capacity) } -func (m *metricGpuDcgmXidErrors) recordDataPoint(start pcommon.Timestamp, ts pcommon.Timestamp, val int64, xidAttributeValue int64) { +func (m *metricGpuDcgmXidErrors) recordDataPoint(start pcommon.Timestamp, ts pcommon.Timestamp, val int64, gpuErrorXidAttributeValue int64) { if !m.config.Enabled { return } @@ -960,7 +960,7 @@ func (m *metricGpuDcgmXidErrors) recordDataPoint(start pcommon.Timestamp, ts pco dp.SetStartTimestamp(start) dp.SetTimestamp(ts) dp.SetIntValue(val) - dp.Attributes().PutInt("xid", xidAttributeValue) + dp.Attributes().PutInt("gpu.error.xid", gpuErrorXidAttributeValue) } // updateCapacity saves max length of data point slices that will be used for the slice capacity. @@ -1006,8 +1006,8 @@ type MetricsBuilder struct { metricGpuDcgmEnergyConsumption metricGpuDcgmEnergyConsumption metricGpuDcgmMemoryBandwidthUtilization metricGpuDcgmMemoryBandwidthUtilization metricGpuDcgmMemoryBytesUsed metricGpuDcgmMemoryBytesUsed - metricGpuDcgmNvlinkTraffic metricGpuDcgmNvlinkTraffic - metricGpuDcgmPcieTraffic metricGpuDcgmPcieTraffic + metricGpuDcgmNvlinkIo metricGpuDcgmNvlinkIo + metricGpuDcgmPcieIo metricGpuDcgmPcieIo metricGpuDcgmPipeUtilization metricGpuDcgmPipeUtilization metricGpuDcgmSmOccupancy metricGpuDcgmSmOccupancy metricGpuDcgmSmUtilization metricGpuDcgmSmUtilization @@ -1040,8 +1040,8 @@ func NewMetricsBuilder(mbc MetricsBuilderConfig, settings receiver.CreateSetting metricGpuDcgmEnergyConsumption: newMetricGpuDcgmEnergyConsumption(mbc.Metrics.GpuDcgmEnergyConsumption), metricGpuDcgmMemoryBandwidthUtilization: newMetricGpuDcgmMemoryBandwidthUtilization(mbc.Metrics.GpuDcgmMemoryBandwidthUtilization), metricGpuDcgmMemoryBytesUsed: newMetricGpuDcgmMemoryBytesUsed(mbc.Metrics.GpuDcgmMemoryBytesUsed), - metricGpuDcgmNvlinkTraffic: newMetricGpuDcgmNvlinkTraffic(mbc.Metrics.GpuDcgmNvlinkTraffic), - metricGpuDcgmPcieTraffic: 
newMetricGpuDcgmPcieTraffic(mbc.Metrics.GpuDcgmPcieTraffic), + metricGpuDcgmNvlinkIo: newMetricGpuDcgmNvlinkIo(mbc.Metrics.GpuDcgmNvlinkIo), + metricGpuDcgmPcieIo: newMetricGpuDcgmPcieIo(mbc.Metrics.GpuDcgmPcieIo), metricGpuDcgmPipeUtilization: newMetricGpuDcgmPipeUtilization(mbc.Metrics.GpuDcgmPipeUtilization), metricGpuDcgmSmOccupancy: newMetricGpuDcgmSmOccupancy(mbc.Metrics.GpuDcgmSmOccupancy), metricGpuDcgmSmUtilization: newMetricGpuDcgmSmUtilization(mbc.Metrics.GpuDcgmSmUtilization), @@ -1138,8 +1138,8 @@ func (mb *MetricsBuilder) EmitForResource(rmo ...ResourceMetricsOption) { mb.metricGpuDcgmEnergyConsumption.emit(ils.Metrics()) mb.metricGpuDcgmMemoryBandwidthUtilization.emit(ils.Metrics()) mb.metricGpuDcgmMemoryBytesUsed.emit(ils.Metrics()) - mb.metricGpuDcgmNvlinkTraffic.emit(ils.Metrics()) - mb.metricGpuDcgmPcieTraffic.emit(ils.Metrics()) + mb.metricGpuDcgmNvlinkIo.emit(ils.Metrics()) + mb.metricGpuDcgmPcieIo.emit(ils.Metrics()) mb.metricGpuDcgmPipeUtilization.emit(ils.Metrics()) mb.metricGpuDcgmSmOccupancy.emit(ils.Metrics()) mb.metricGpuDcgmSmUtilization.emit(ils.Metrics()) @@ -1183,8 +1183,8 @@ func (mb *MetricsBuilder) RecordGpuDcgmClockFrequencyDataPoint(ts pcommon.Timest } // RecordGpuDcgmClockThrottleDurationTimeDataPoint adds a data point to gpu.dcgm.clock.throttle_duration.time metric. 
-func (mb *MetricsBuilder) RecordGpuDcgmClockThrottleDurationTimeDataPoint(ts pcommon.Timestamp, val float64, violationAttributeValue AttributeViolation) { - mb.metricGpuDcgmClockThrottleDurationTime.recordDataPoint(mb.startTime, ts, val, violationAttributeValue.String()) +func (mb *MetricsBuilder) RecordGpuDcgmClockThrottleDurationTimeDataPoint(ts pcommon.Timestamp, val float64, gpuClockViolationAttributeValue AttributeGpuClockViolation) { + mb.metricGpuDcgmClockThrottleDurationTime.recordDataPoint(mb.startTime, ts, val, gpuClockViolationAttributeValue.String()) } // RecordGpuDcgmCodecDecoderUtilizationDataPoint adds a data point to gpu.dcgm.codec.decoder.utilization metric. @@ -1198,8 +1198,8 @@ func (mb *MetricsBuilder) RecordGpuDcgmCodecEncoderUtilizationDataPoint(ts pcomm } // RecordGpuDcgmEccErrorsDataPoint adds a data point to gpu.dcgm.ecc_errors metric. -func (mb *MetricsBuilder) RecordGpuDcgmEccErrorsDataPoint(ts pcommon.Timestamp, val int64, errorTypeAttributeValue AttributeErrorType) { - mb.metricGpuDcgmEccErrors.recordDataPoint(mb.startTime, ts, val, errorTypeAttributeValue.String()) +func (mb *MetricsBuilder) RecordGpuDcgmEccErrorsDataPoint(ts pcommon.Timestamp, val int64, gpuErrorTypeAttributeValue AttributeGpuErrorType) { + mb.metricGpuDcgmEccErrors.recordDataPoint(mb.startTime, ts, val, gpuErrorTypeAttributeValue.String()) } // RecordGpuDcgmEnergyConsumptionDataPoint adds a data point to gpu.dcgm.energy_consumption metric. @@ -1213,23 +1213,23 @@ func (mb *MetricsBuilder) RecordGpuDcgmMemoryBandwidthUtilizationDataPoint(ts pc } // RecordGpuDcgmMemoryBytesUsedDataPoint adds a data point to gpu.dcgm.memory.bytes_used metric. 
-func (mb *MetricsBuilder) RecordGpuDcgmMemoryBytesUsedDataPoint(ts pcommon.Timestamp, val int64, memoryStateAttributeValue AttributeMemoryState) { - mb.metricGpuDcgmMemoryBytesUsed.recordDataPoint(mb.startTime, ts, val, memoryStateAttributeValue.String()) +func (mb *MetricsBuilder) RecordGpuDcgmMemoryBytesUsedDataPoint(ts pcommon.Timestamp, val int64, gpuMemoryStateAttributeValue AttributeGpuMemoryState) { + mb.metricGpuDcgmMemoryBytesUsed.recordDataPoint(mb.startTime, ts, val, gpuMemoryStateAttributeValue.String()) } -// RecordGpuDcgmNvlinkTrafficDataPoint adds a data point to gpu.dcgm.nvlink.traffic metric. -func (mb *MetricsBuilder) RecordGpuDcgmNvlinkTrafficDataPoint(ts pcommon.Timestamp, val int64, directionAttributeValue AttributeDirection) { - mb.metricGpuDcgmNvlinkTraffic.recordDataPoint(mb.startTime, ts, val, directionAttributeValue.String()) +// RecordGpuDcgmNvlinkIoDataPoint adds a data point to gpu.dcgm.nvlink.io metric. +func (mb *MetricsBuilder) RecordGpuDcgmNvlinkIoDataPoint(ts pcommon.Timestamp, val int64, networkIoDirectionAttributeValue AttributeNetworkIoDirection) { + mb.metricGpuDcgmNvlinkIo.recordDataPoint(mb.startTime, ts, val, networkIoDirectionAttributeValue.String()) } -// RecordGpuDcgmPcieTrafficDataPoint adds a data point to gpu.dcgm.pcie.traffic metric. -func (mb *MetricsBuilder) RecordGpuDcgmPcieTrafficDataPoint(ts pcommon.Timestamp, val int64, directionAttributeValue AttributeDirection) { - mb.metricGpuDcgmPcieTraffic.recordDataPoint(mb.startTime, ts, val, directionAttributeValue.String()) +// RecordGpuDcgmPcieIoDataPoint adds a data point to gpu.dcgm.pcie.io metric. +func (mb *MetricsBuilder) RecordGpuDcgmPcieIoDataPoint(ts pcommon.Timestamp, val int64, networkIoDirectionAttributeValue AttributeNetworkIoDirection) { + mb.metricGpuDcgmPcieIo.recordDataPoint(mb.startTime, ts, val, networkIoDirectionAttributeValue.String()) } // RecordGpuDcgmPipeUtilizationDataPoint adds a data point to gpu.dcgm.pipe.utilization metric. 
-func (mb *MetricsBuilder) RecordGpuDcgmPipeUtilizationDataPoint(ts pcommon.Timestamp, val float64, pipeAttributeValue AttributePipe) { - mb.metricGpuDcgmPipeUtilization.recordDataPoint(mb.startTime, ts, val, pipeAttributeValue.String()) +func (mb *MetricsBuilder) RecordGpuDcgmPipeUtilizationDataPoint(ts pcommon.Timestamp, val float64, gpuPipeAttributeValue AttributeGpuPipe) { + mb.metricGpuDcgmPipeUtilization.recordDataPoint(mb.startTime, ts, val, gpuPipeAttributeValue.String()) } // RecordGpuDcgmSmOccupancyDataPoint adds a data point to gpu.dcgm.sm.occupancy metric. @@ -1253,8 +1253,8 @@ func (mb *MetricsBuilder) RecordGpuDcgmUtilizationDataPoint(ts pcommon.Timestamp } // RecordGpuDcgmXidErrorsDataPoint adds a data point to gpu.dcgm.xid_errors metric. -func (mb *MetricsBuilder) RecordGpuDcgmXidErrorsDataPoint(ts pcommon.Timestamp, val int64, xidAttributeValue int64) { - mb.metricGpuDcgmXidErrors.recordDataPoint(mb.startTime, ts, val, xidAttributeValue) +func (mb *MetricsBuilder) RecordGpuDcgmXidErrorsDataPoint(ts pcommon.Timestamp, val int64, gpuErrorXidAttributeValue int64) { + mb.metricGpuDcgmXidErrors.recordDataPoint(mb.startTime, ts, val, gpuErrorXidAttributeValue) } // Reset resets metrics builder to its initial state. 
It should be used when external metrics source is restarted, diff --git a/receiver/dcgmreceiver/internal/metadata/generated_metrics_test.go b/receiver/dcgmreceiver/internal/metadata/generated_metrics_test.go index 802d27c9a..d9437a1b6 100644 --- a/receiver/dcgmreceiver/internal/metadata/generated_metrics_test.go +++ b/receiver/dcgmreceiver/internal/metadata/generated_metrics_test.go @@ -74,7 +74,7 @@ func TestMetricsBuilder(t *testing.T) { defaultMetricsCount++ allMetricsCount++ - mb.RecordGpuDcgmClockThrottleDurationTimeDataPoint(ts, 1, AttributeViolationPower) + mb.RecordGpuDcgmClockThrottleDurationTimeDataPoint(ts, 1, AttributeGpuClockViolationPower) defaultMetricsCount++ allMetricsCount++ @@ -86,7 +86,7 @@ func TestMetricsBuilder(t *testing.T) { defaultMetricsCount++ allMetricsCount++ - mb.RecordGpuDcgmEccErrorsDataPoint(ts, 1, AttributeErrorTypeSbe) + mb.RecordGpuDcgmEccErrorsDataPoint(ts, 1, AttributeGpuErrorTypeSbe) defaultMetricsCount++ allMetricsCount++ @@ -98,19 +98,19 @@ func TestMetricsBuilder(t *testing.T) { defaultMetricsCount++ allMetricsCount++ - mb.RecordGpuDcgmMemoryBytesUsedDataPoint(ts, 1, AttributeMemoryStateUsed) + mb.RecordGpuDcgmMemoryBytesUsedDataPoint(ts, 1, AttributeGpuMemoryStateUsed) defaultMetricsCount++ allMetricsCount++ - mb.RecordGpuDcgmNvlinkTrafficDataPoint(ts, 1, AttributeDirectionTx) + mb.RecordGpuDcgmNvlinkIoDataPoint(ts, 1, AttributeNetworkIoDirectionTransmit) defaultMetricsCount++ allMetricsCount++ - mb.RecordGpuDcgmPcieTrafficDataPoint(ts, 1, AttributeDirectionTx) + mb.RecordGpuDcgmPcieIoDataPoint(ts, 1, AttributeNetworkIoDirectionTransmit) defaultMetricsCount++ allMetricsCount++ - mb.RecordGpuDcgmPipeUtilizationDataPoint(ts, 1, AttributePipeTensor) + mb.RecordGpuDcgmPipeUtilizationDataPoint(ts, 1, AttributeGpuPipeTensor) allMetricsCount++ mb.RecordGpuDcgmSmOccupancyDataPoint(ts, 1) @@ -129,7 +129,7 @@ func TestMetricsBuilder(t *testing.T) { defaultMetricsCount++ allMetricsCount++ - mb.RecordGpuDcgmXidErrorsDataPoint(ts, 1, 
3) + mb.RecordGpuDcgmXidErrorsDataPoint(ts, 1, 13) rb := mb.NewResourceBuilder() rb.SetGpuModel("gpu.model-val") @@ -183,7 +183,7 @@ func TestMetricsBuilder(t *testing.T) { assert.Equal(t, ts, dp.Timestamp()) assert.Equal(t, pmetric.NumberDataPointValueTypeDouble, dp.ValueType()) assert.Equal(t, float64(1), dp.DoubleValue()) - attrVal, ok := dp.Attributes().Get("violation") + attrVal, ok := dp.Attributes().Get("gpu.clock.violation") assert.True(t, ok) assert.EqualValues(t, "power", attrVal.Str()) case "gpu.dcgm.codec.decoder.utilization": @@ -224,7 +224,7 @@ func TestMetricsBuilder(t *testing.T) { assert.Equal(t, ts, dp.Timestamp()) assert.Equal(t, pmetric.NumberDataPointValueTypeInt, dp.ValueType()) assert.Equal(t, int64(1), dp.IntValue()) - attrVal, ok := dp.Attributes().Get("error_type") + attrVal, ok := dp.Attributes().Get("gpu.error.type") assert.True(t, ok) assert.EqualValues(t, "sbe", attrVal.Str()) case "gpu.dcgm.energy_consumption": @@ -265,12 +265,12 @@ func TestMetricsBuilder(t *testing.T) { assert.Equal(t, ts, dp.Timestamp()) assert.Equal(t, pmetric.NumberDataPointValueTypeInt, dp.ValueType()) assert.Equal(t, int64(1), dp.IntValue()) - attrVal, ok := dp.Attributes().Get("memory_state") + attrVal, ok := dp.Attributes().Get("gpu.memory.state") assert.True(t, ok) assert.EqualValues(t, "used", attrVal.Str()) - case "gpu.dcgm.nvlink.traffic": - assert.False(t, validatedMetrics["gpu.dcgm.nvlink.traffic"], "Found a duplicate in the metrics slice: gpu.dcgm.nvlink.traffic") - validatedMetrics["gpu.dcgm.nvlink.traffic"] = true + case "gpu.dcgm.nvlink.io": + assert.False(t, validatedMetrics["gpu.dcgm.nvlink.io"], "Found a duplicate in the metrics slice: gpu.dcgm.nvlink.io") + validatedMetrics["gpu.dcgm.nvlink.io"] = true assert.Equal(t, pmetric.MetricTypeSum, ms.At(i).Type()) assert.Equal(t, 1, ms.At(i).Sum().DataPoints().Len()) assert.Equal(t, "The number of bytes sent over NVLink, not including protocol headers.", ms.At(i).Description()) @@ -282,12 +282,12 @@ 
func TestMetricsBuilder(t *testing.T) { assert.Equal(t, ts, dp.Timestamp()) assert.Equal(t, pmetric.NumberDataPointValueTypeInt, dp.ValueType()) assert.Equal(t, int64(1), dp.IntValue()) - attrVal, ok := dp.Attributes().Get("direction") + attrVal, ok := dp.Attributes().Get("network.io.direction") assert.True(t, ok) - assert.EqualValues(t, "tx", attrVal.Str()) - case "gpu.dcgm.pcie.traffic": - assert.False(t, validatedMetrics["gpu.dcgm.pcie.traffic"], "Found a duplicate in the metrics slice: gpu.dcgm.pcie.traffic") - validatedMetrics["gpu.dcgm.pcie.traffic"] = true + assert.EqualValues(t, "transmit", attrVal.Str()) + case "gpu.dcgm.pcie.io": + assert.False(t, validatedMetrics["gpu.dcgm.pcie.io"], "Found a duplicate in the metrics slice: gpu.dcgm.pcie.io") + validatedMetrics["gpu.dcgm.pcie.io"] = true assert.Equal(t, pmetric.MetricTypeSum, ms.At(i).Type()) assert.Equal(t, 1, ms.At(i).Sum().DataPoints().Len()) assert.Equal(t, "The number of bytes sent over the PCIe bus, including both protocol headers and data payloads.", ms.At(i).Description()) @@ -299,9 +299,9 @@ func TestMetricsBuilder(t *testing.T) { assert.Equal(t, ts, dp.Timestamp()) assert.Equal(t, pmetric.NumberDataPointValueTypeInt, dp.ValueType()) assert.Equal(t, int64(1), dp.IntValue()) - attrVal, ok := dp.Attributes().Get("direction") + attrVal, ok := dp.Attributes().Get("network.io.direction") assert.True(t, ok) - assert.EqualValues(t, "tx", attrVal.Str()) + assert.EqualValues(t, "transmit", attrVal.Str()) case "gpu.dcgm.pipe.utilization": assert.False(t, validatedMetrics["gpu.dcgm.pipe.utilization"], "Found a duplicate in the metrics slice: gpu.dcgm.pipe.utilization") validatedMetrics["gpu.dcgm.pipe.utilization"] = true @@ -314,7 +314,7 @@ func TestMetricsBuilder(t *testing.T) { assert.Equal(t, ts, dp.Timestamp()) assert.Equal(t, pmetric.NumberDataPointValueTypeDouble, dp.ValueType()) assert.Equal(t, float64(1), dp.DoubleValue()) - attrVal, ok := dp.Attributes().Get("pipe") + attrVal, ok := 
dp.Attributes().Get("gpu.pipe") assert.True(t, ok) assert.EqualValues(t, "tensor", attrVal.Str()) case "gpu.dcgm.sm.occupancy": @@ -379,9 +379,9 @@ func TestMetricsBuilder(t *testing.T) { assert.Equal(t, ts, dp.Timestamp()) assert.Equal(t, pmetric.NumberDataPointValueTypeInt, dp.ValueType()) assert.Equal(t, int64(1), dp.IntValue()) - attrVal, ok := dp.Attributes().Get("xid") + attrVal, ok := dp.Attributes().Get("gpu.error.xid") assert.True(t, ok) - assert.EqualValues(t, 3, attrVal.Int()) + assert.EqualValues(t, 13, attrVal.Int()) } } }) diff --git a/receiver/dcgmreceiver/internal/metadata/testdata/config.yaml b/receiver/dcgmreceiver/internal/metadata/testdata/config.yaml index 114d2a1b9..2047c57a8 100644 --- a/receiver/dcgmreceiver/internal/metadata/testdata/config.yaml +++ b/receiver/dcgmreceiver/internal/metadata/testdata/config.yaml @@ -17,9 +17,9 @@ all_set: enabled: true gpu.dcgm.memory.bytes_used: enabled: true - gpu.dcgm.nvlink.traffic: + gpu.dcgm.nvlink.io: enabled: true - gpu.dcgm.pcie.traffic: + gpu.dcgm.pcie.io: enabled: true gpu.dcgm.pipe.utilization: enabled: true @@ -58,9 +58,9 @@ none_set: enabled: false gpu.dcgm.memory.bytes_used: enabled: false - gpu.dcgm.nvlink.traffic: + gpu.dcgm.nvlink.io: enabled: false - gpu.dcgm.pcie.traffic: + gpu.dcgm.pcie.io: enabled: false gpu.dcgm.pipe.utilization: enabled: false diff --git a/receiver/dcgmreceiver/metadata.yaml b/receiver/dcgmreceiver/metadata.yaml index 46c907b40..b0de6a042 100644 --- a/receiver/dcgmreceiver/metadata.yaml +++ b/receiver/dcgmreceiver/metadata.yaml @@ -22,32 +22,32 @@ resource_attributes: enabled: true attributes: - memory_state: + gpu.memory.state: type: string description: GPU memory state, one of [free, used, reserved]. enum: [used, free, reserved] - pipe: + gpu.pipe: type: string description: GPU pipe in use, one of [tensor, fp64, fp32, fp16]. enum: [tensor, fp64, fp32, fp16] - direction: + network.io.direction: type: string - description: Direction of the link traffic, one of [tx, rx]. + description: Direction of the link traffic, one of [transmit, receive]. 
- enum: [tx, rx] + enum: [transmit, receive] - violation: + gpu.clock.violation: type: string description: Reason for throttling, one of [power, thermal, sync_boost, board_limit, low_util, reliability, app_clock, base_clock]. enum: [power, thermal, sync_boost, board_limit, low_util, reliability, app_clock, base_clock] - error_type: + gpu.error.type: type: string description: The type of error, one of [sbe, dbe]. enum: [sbe, dbe] - xid: + gpu.error.xid: type: int description: The XID code for the error, 1..143. @@ -78,7 +78,7 @@ metrics: unit: 1 gauge: value_type: double - attributes: [pipe] + attributes: [gpu.pipe] enabled: true gpu.dcgm.codec.encoder.utilization: @@ -100,7 +100,7 @@ metrics: unit: By gauge: value_type: int - attributes: [memory_state] + attributes: [gpu.memory.state] enabled: true gpu.dcgm.memory.bandwidth_utilization: @@ -110,24 +110,24 @@ metrics: value_type: double enabled: true - gpu.dcgm.pcie.traffic: + gpu.dcgm.pcie.io: description: The number of bytes sent over the PCIe bus, including both protocol headers and data payloads. unit: By sum: value_type: int aggregation_temporality: delta monotonic: true - attributes: [direction] + attributes: [network.io.direction] enabled: true - gpu.dcgm.nvlink.traffic: + gpu.dcgm.nvlink.io: description: The number of bytes sent over NVLink, not including protocol headers. 
unit: By sum: value_type: int aggregation_temporality: delta monotonic: true - attributes: [direction] + attributes: [network.io.direction] enabled: true gpu.dcgm.energy_consumption: @@ -160,7 +160,7 @@ metrics: value_type: double aggregation_temporality: cumulative monotonic: true - attributes: [violation] + attributes: [gpu.clock.violation] enabled: true gpu.dcgm.ecc_errors: @@ -170,7 +170,7 @@ metrics: value_type: int aggregation_temporality: cumulative monotonic: true - attributes: [error_type] + attributes: [gpu.error.type] enabled: true gpu.dcgm.xid_errors: @@ -180,5 +180,5 @@ metrics: value_type: int aggregation_temporality: cumulative monotonic: true - attributes: [xid] + attributes: [gpu.error.xid] enabled: true diff --git a/receiver/dcgmreceiver/scraper.go b/receiver/dcgmreceiver/scraper.go index 3b1c7c9e5..de008b372 100644 --- a/receiver/dcgmreceiver/scraper.go +++ b/receiver/dcgmreceiver/scraper.go @@ -109,13 +109,13 @@ func (s *dcgmScraper) scrape(_ context.Context) (pmetric.Metrics, error) { case "DCGM_FI_PROF_SM_OCCUPANCY": s.mb.RecordGpuDcgmSmOccupancyDataPoint(now, metric.asFloat64()) case "DCGM_FI_PROF_PIPE_TENSOR_ACTIVE": - s.mb.RecordGpuDcgmPipeUtilizationDataPoint(now, metric.asFloat64(), metadata.AttributePipeTensor) + s.mb.RecordGpuDcgmPipeUtilizationDataPoint(now, metric.asFloat64(), metadata.AttributeGpuPipeTensor) case "DCGM_FI_PROF_PIPE_FP64_ACTIVE": - s.mb.RecordGpuDcgmPipeUtilizationDataPoint(now, metric.asFloat64(), metadata.AttributePipeFp64) + s.mb.RecordGpuDcgmPipeUtilizationDataPoint(now, metric.asFloat64(), metadata.AttributeGpuPipeFp64) case "DCGM_FI_PROF_PIPE_FP32_ACTIVE": - s.mb.RecordGpuDcgmPipeUtilizationDataPoint(now, metric.asFloat64(), metadata.AttributePipeFp32) + s.mb.RecordGpuDcgmPipeUtilizationDataPoint(now, metric.asFloat64(), metadata.AttributeGpuPipeFp32) case "DCGM_FI_PROF_PIPE_FP16_ACTIVE": - s.mb.RecordGpuDcgmPipeUtilizationDataPoint(now, metric.asFloat64(), metadata.AttributePipeFp16) + 
s.mb.RecordGpuDcgmPipeUtilizationDataPoint(now, metric.asFloat64(), metadata.AttributeGpuPipeFp16) case "DCGM_FI_DEV_ENC_UTIL": encUtil := float64(metric.asInt64()) / 100.0 /* normalize */ s.mb.RecordGpuDcgmCodecEncoderUtilizationDataPoint(now, encUtil) @@ -124,13 +124,13 @@ func (s *dcgmScraper) scrape(_ context.Context) (pmetric.Metrics, error) { s.mb.RecordGpuDcgmCodecDecoderUtilizationDataPoint(now, decUtil) case "DCGM_FI_DEV_FB_FREE": bytesFree := 1e6 * metric.asInt64() /* MBy to By */ - s.mb.RecordGpuDcgmMemoryBytesUsedDataPoint(now, bytesFree, metadata.AttributeMemoryStateFree) + s.mb.RecordGpuDcgmMemoryBytesUsedDataPoint(now, bytesFree, metadata.AttributeGpuMemoryStateFree) case "DCGM_FI_DEV_FB_USED": bytesUsed := 1e6 * metric.asInt64() /* MBy to By */ - s.mb.RecordGpuDcgmMemoryBytesUsedDataPoint(now, bytesUsed, metadata.AttributeMemoryStateUsed) + s.mb.RecordGpuDcgmMemoryBytesUsedDataPoint(now, bytesUsed, metadata.AttributeGpuMemoryStateUsed) case "DCGM_FI_DEV_FB_RESERVED": bytesFree := 1e6 * metric.asInt64() /* MBy to By */ - s.mb.RecordGpuDcgmMemoryBytesUsedDataPoint(now, bytesFree, metadata.AttributeMemoryStateReserved) + s.mb.RecordGpuDcgmMemoryBytesUsedDataPoint(now, bytesFree, metadata.AttributeGpuMemoryStateReserved) case "DCGM_FI_PROF_DRAM_ACTIVE": s.mb.RecordGpuDcgmMemoryBandwidthUtilizationDataPoint(now, metric.asFloat64()) // TODO: fallback @@ -139,16 +139,16 @@ func (s *dcgmScraper) scrape(_ context.Context) (pmetric.Metrics, error) { // s.mb.RecordGpuDcgmMemoryBandwidthUtilizationDataPoint(now, memCopyUtil) case "DCGM_FI_PROF_PCIE_TX_BYTES": pcieTx := int64(float64(metric.asInt64()) * (s.config.CollectionInterval.Seconds())) /* rate to delta */ - s.mb.RecordGpuDcgmPcieTrafficDataPoint(now, pcieTx, metadata.AttributeDirectionTx) + s.mb.RecordGpuDcgmPcieIoDataPoint(now, pcieTx, metadata.AttributeNetworkIoDirectionTransmit) case "DCGM_FI_PROF_PCIE_RX_BYTES": pcieRx := int64(float64(metric.asInt64()) * (s.config.CollectionInterval.Seconds())) /* 
rate to delta */ - s.mb.RecordGpuDcgmPcieTrafficDataPoint(now, pcieRx, metadata.AttributeDirectionRx) + s.mb.RecordGpuDcgmPcieIoDataPoint(now, pcieRx, metadata.AttributeNetworkIoDirectionReceive) case "DCGM_FI_PROF_NVLINK_TX_BYTES": nvlinkTx := int64(float64(metric.asInt64()) * (s.config.CollectionInterval.Seconds())) /* rate to delta */ - s.mb.RecordGpuDcgmNvlinkTrafficDataPoint(now, nvlinkTx, metadata.AttributeDirectionTx) + s.mb.RecordGpuDcgmNvlinkIoDataPoint(now, nvlinkTx, metadata.AttributeNetworkIoDirectionTransmit) case "DCGM_FI_PROF_NVLINK_RX_BYTES": nvlinkRx := int64(float64(metric.asInt64()) * (s.config.CollectionInterval.Seconds())) /* rate to delta */ - s.mb.RecordGpuDcgmNvlinkTrafficDataPoint(now, nvlinkRx, metadata.AttributeDirectionRx) + s.mb.RecordGpuDcgmNvlinkIoDataPoint(now, nvlinkRx, metadata.AttributeNetworkIoDirectionReceive) case "DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION": s.mb.RecordGpuDcgmEnergyConsumptionDataPoint(now, metric.asFloat64()) // TODO: fallback @@ -162,32 +162,32 @@ func (s *dcgmScraper) scrape(_ context.Context) (pmetric.Metrics, error) { s.mb.RecordGpuDcgmClockFrequencyDataPoint(now, clockFreq) case "DCGM_FI_DEV_POWER_VIOLATION": violationTime := float64(metric.asInt64()) / 1e6 /* us to s */ - s.mb.RecordGpuDcgmClockThrottleDurationTimeDataPoint(now, violationTime, metadata.AttributeViolationPower) + s.mb.RecordGpuDcgmClockThrottleDurationTimeDataPoint(now, violationTime, metadata.AttributeGpuClockViolationPower) case "DCGM_FI_DEV_THERMAL_VIOLATION": violationTime := float64(metric.asInt64()) / 1e6 /* us to s */ - s.mb.RecordGpuDcgmClockThrottleDurationTimeDataPoint(now, violationTime, metadata.AttributeViolationThermal) + s.mb.RecordGpuDcgmClockThrottleDurationTimeDataPoint(now, violationTime, metadata.AttributeGpuClockViolationThermal) case "DCGM_FI_DEV_SYNC_BOOST_VIOLATION": violationTime := float64(metric.asInt64()) / 1e6 /* us to s */ - s.mb.RecordGpuDcgmClockThrottleDurationTimeDataPoint(now, violationTime, 
metadata.AttributeViolationSyncBoost) + s.mb.RecordGpuDcgmClockThrottleDurationTimeDataPoint(now, violationTime, metadata.AttributeGpuClockViolationSyncBoost) case "DCGM_FI_DEV_BOARD_LIMIT_VIOLATION": violationTime := float64(metric.asInt64()) / 1e6 /* us to s */ - s.mb.RecordGpuDcgmClockThrottleDurationTimeDataPoint(now, violationTime, metadata.AttributeViolationBoardLimit) + s.mb.RecordGpuDcgmClockThrottleDurationTimeDataPoint(now, violationTime, metadata.AttributeGpuClockViolationBoardLimit) case "DCGM_FI_DEV_LOW_UTIL_VIOLATION": violationTime := float64(metric.asInt64()) / 1e6 /* us to s */ - s.mb.RecordGpuDcgmClockThrottleDurationTimeDataPoint(now, violationTime, metadata.AttributeViolationLowUtil) + s.mb.RecordGpuDcgmClockThrottleDurationTimeDataPoint(now, violationTime, metadata.AttributeGpuClockViolationLowUtil) case "DCGM_FI_DEV_RELIABILITY_VIOLATION": violationTime := float64(metric.asInt64()) / 1e6 /* us to s */ - s.mb.RecordGpuDcgmClockThrottleDurationTimeDataPoint(now, violationTime, metadata.AttributeViolationReliability) + s.mb.RecordGpuDcgmClockThrottleDurationTimeDataPoint(now, violationTime, metadata.AttributeGpuClockViolationReliability) case "DCGM_FI_DEV_TOTAL_APP_CLOCKS_VIOLATION": violationTime := float64(metric.asInt64()) / 1e6 /* us to s */ - s.mb.RecordGpuDcgmClockThrottleDurationTimeDataPoint(now, violationTime, metadata.AttributeViolationAppClock) + s.mb.RecordGpuDcgmClockThrottleDurationTimeDataPoint(now, violationTime, metadata.AttributeGpuClockViolationAppClock) case "DCGM_FI_DEV_TOTAL_BASE_CLOCKS_VIOLATION": violationTime := float64(metric.asInt64()) / 1e6 /* us to s */ - s.mb.RecordGpuDcgmClockThrottleDurationTimeDataPoint(now, violationTime, metadata.AttributeViolationBaseClock) + s.mb.RecordGpuDcgmClockThrottleDurationTimeDataPoint(now, violationTime, metadata.AttributeGpuClockViolationBaseClock) case "DCGM_FI_DEV_ECC_SBE_VOL_TOTAL": - s.mb.RecordGpuDcgmEccErrorsDataPoint(now, metric.asInt64(), metadata.AttributeErrorTypeSbe) + 
s.mb.RecordGpuDcgmEccErrorsDataPoint(now, metric.asInt64(), metadata.AttributeGpuErrorTypeSbe) case "DCGM_FI_DEV_ECC_DBE_VOL_TOTAL": - s.mb.RecordGpuDcgmEccErrorsDataPoint(now, metric.asInt64(), metadata.AttributeErrorTypeDbe) + s.mb.RecordGpuDcgmEccErrorsDataPoint(now, metric.asInt64(), metadata.AttributeGpuErrorTypeDbe) } } // TODO: XID errors. diff --git a/receiver/dcgmreceiver/scraper_gpu_test.go b/receiver/dcgmreceiver/scraper_gpu_test.go index 0fd2b0ffc..333e5e5a2 100644 --- a/receiver/dcgmreceiver/scraper_gpu_test.go +++ b/receiver/dcgmreceiver/scraper_gpu_test.go @@ -120,10 +120,10 @@ func TestScrapeWithEmptyMetricsConfig(t *testing.T) { GpuDcgmMemoryBytesUsed: metadata.MetricConfig{ Enabled: false, }, - GpuDcgmNvlinkTraffic: metadata.MetricConfig{ + GpuDcgmNvlinkIo: metadata.MetricConfig{ Enabled: false, }, - GpuDcgmPcieTraffic: metadata.MetricConfig{ + GpuDcgmPcieIo: metadata.MetricConfig{ Enabled: false, }, GpuDcgmPipeUtilization: metadata.MetricConfig{ @@ -252,10 +252,10 @@ func loadExpectedScraperMetrics(t *testing.T, model string) map[string]int { "DCGM_FI_DEV_FB_RESERVED": "gpu.dcgm.memory.bytes_used", "DCGM_FI_PROF_DRAM_ACTIVE": "gpu.dcgm.memory.bandwidth_utilization", //"DCGM_FI_DEV_MEM_COPY_UTIL": "gpu.dcgm.memory.bandwidth_utilization", - "DCGM_FI_PROF_PCIE_TX_BYTES": "gpu.dcgm.pcie.traffic", - "DCGM_FI_PROF_PCIE_RX_BYTES": "gpu.dcgm.pcie.traffic", - "DCGM_FI_PROF_NVLINK_TX_BYTES": "gpu.dcgm.nvlink.traffic", - "DCGM_FI_PROF_NVLINK_RX_BYTES": "gpu.dcgm.nvlink.traffic", + "DCGM_FI_PROF_PCIE_TX_BYTES": "gpu.dcgm.pcie.io", + "DCGM_FI_PROF_PCIE_RX_BYTES": "gpu.dcgm.pcie.io", + "DCGM_FI_PROF_NVLINK_TX_BYTES": "gpu.dcgm.nvlink.io", + "DCGM_FI_PROF_NVLINK_RX_BYTES": "gpu.dcgm.nvlink.io", "DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION": "gpu.dcgm.energy_consumption", //"DCGM_FI_DEV_POWER_USAGE": "gpu.dcgm.energy_consumption", "DCGM_FI_DEV_GPU_TEMP": "gpu.dcgm.temperature", @@ -332,9 +332,9 @@ func validateScraperResult(t *testing.T, metrics pmetric.Metrics, 
expectedMetric fallthrough case "gpu.dcgm.clock.throttle_duration.time": fallthrough - case "gpu.dcgm.pcie.traffic": + case "gpu.dcgm.pcie.io": fallthrough - case "gpu.dcgm.nvlink.traffic": + case "gpu.dcgm.nvlink.io": fallthrough case "gpu.dcgm.ecc_errors": fallthrough @@ -351,36 +351,36 @@ func validateScraperResult(t *testing.T, metrics pmetric.Metrics, expectedMetric case "gpu.dcgm.sm.occupancy": case "gpu.dcgm.pipe.utilization": for j := 0; j < dps.Len(); j++ { - assert.Contains(t, dps.At(j).Attributes().AsRaw(), "pipe") + assert.Contains(t, dps.At(j).Attributes().AsRaw(), "gpu.pipe") } case "gpu.dcgm.codec.encoder.utilization": case "gpu.dcgm.codec.decoder.utilization": case "gpu.dcgm.memory.bytes_used": for j := 0; j < dps.Len(); j++ { - assert.Contains(t, dps.At(j).Attributes().AsRaw(), "memory_state") + assert.Contains(t, dps.At(j).Attributes().AsRaw(), "gpu.memory.state") } case "gpu.dcgm.memory.bandwidth_utilization": - case "gpu.dcgm.pcie.traffic": + case "gpu.dcgm.pcie.io": fallthrough - case "gpu.dcgm.nvlink.traffic": + case "gpu.dcgm.nvlink.io": for j := 0; j < dps.Len(); j++ { - assert.Contains(t, dps.At(j).Attributes().AsRaw(), "direction") + assert.Contains(t, dps.At(j).Attributes().AsRaw(), "network.io.direction") } case "gpu.dcgm.energy_consumption": case "gpu.dcgm.temperature": case "gpu.dcgm.clock.frequency": case "gpu.dcgm.clock.throttle_duration.time": for j := 0; j < dps.Len(); j++ { - assert.Contains(t, dps.At(j).Attributes().AsRaw(), "violation") + assert.Contains(t, dps.At(j).Attributes().AsRaw(), "gpu.clock.violation") } case "gpu.dcgm.ecc_errors": for j := 0; j < dps.Len(); j++ { - assert.Contains(t, dps.At(j).Attributes().AsRaw(), "error_type") + assert.Contains(t, dps.At(j).Attributes().AsRaw(), "gpu.error.type") } // TODO //case "gpu.dcgm.xid_errors": // for j := 0; j < dps.Len(); j++ { - // assert.Contains(t, dps.At(j).Attributes().AsRaw(), "xid") + // assert.Contains(t, dps.At(j).Attributes().AsRaw(), "gpu.error.xid") // } 
default: t.Errorf("Unexpected metric %s", m.Name()) From 20f3b8d3098840e13531a0ce7275880e42981c66 Mon Sep 17 00:00:00 2001 From: Igor Peshansky Date: Tue, 25 Jun 2024 14:49:32 -0400 Subject: [PATCH 16/38] Implement fallbacks. --- receiver/dcgmreceiver/scraper.go | 228 +++++++++++++--------- receiver/dcgmreceiver/scraper_gpu_test.go | 10 +- 2 files changed, 139 insertions(+), 99 deletions(-) diff --git a/receiver/dcgmreceiver/scraper.go b/receiver/dcgmreceiver/scraper.go index de008b372..293e3e3c5 100644 --- a/receiver/dcgmreceiver/scraper.go +++ b/receiver/dcgmreceiver/scraper.go @@ -18,9 +18,11 @@ package dcgmreceiver import ( + "cmp" "context" "errors" "fmt" + "slices" "time" "go.opentelemetry.io/collector/component" @@ -36,6 +38,8 @@ type dcgmScraper struct { settings receiver.CreateSettings client *dcgmClient mb *metadata.MetricsBuilder + // Aggregate cumulative values from power usage rate. + energyConsumptionFallback float64 } func newDcgmScraper(config *Config, settings receiver.CreateSettings) *dcgmScraper { @@ -70,6 +74,7 @@ func (s *dcgmScraper) start(_ context.Context, _ component.Host) error { mbConfig.Metrics = s.config.Metrics s.mb = metadata.NewMetricsBuilder( mbConfig, s.settings, metadata.WithStartTime(startTime)) + s.energyConsumptionFallback = 0.0 return nil } @@ -90,105 +95,140 @@ func (s *dcgmScraper) scrape(_ context.Context) (pmetric.Metrics, error) { deviceMetrics, err := s.client.collectDeviceMetrics() now := pcommon.NewTimestampFromTime(time.Now()) - for gpuIndex, metrics := range deviceMetrics { + for gpuIndex, gpuMetrics := range deviceMetrics { + metricsByName := make(map[string][]dcgmMetric) + for _, metric := range gpuMetrics { + metricsByName[metric.name] = append(metricsByName[metric.name], metric) + } + metrics := make(map[string]dcgmMetric) + for name, points := range metricsByName { + slices.SortStableFunc(points, func(a, b dcgmMetric) int { + return cmp.Compare(a.timestamp, b.timestamp) + }) + metrics[name] = 
points[len(points)-1] + } rb := s.mb.NewResourceBuilder() rb.SetGpuNumber(fmt.Sprintf("%d", gpuIndex)) rb.SetGpuUUID(s.client.getDeviceUUID(gpuIndex)) rb.SetGpuModel(s.client.getDeviceModelName(gpuIndex)) gpuResource := rb.Emit() - for _, metric := range metrics { - switch metric.name { - case "DCGM_FI_PROF_GR_ENGINE_ACTIVE": - s.mb.RecordGpuDcgmUtilizationDataPoint(now, metric.asFloat64()) - // TODO: fallback - //case "DCGM_FI_DEV_GPU_UTIL": - // gpuUtil := float64(metric.asInt64()) / 100.0 /* normalize */ - // s.mb.RecordGpuDcgmUtilizationDataPoint(now, gpuUtil) - case "DCGM_FI_PROF_SM_ACTIVE": - s.mb.RecordGpuDcgmSmUtilizationDataPoint(now, metric.asFloat64()) - case "DCGM_FI_PROF_SM_OCCUPANCY": - s.mb.RecordGpuDcgmSmOccupancyDataPoint(now, metric.asFloat64()) - case "DCGM_FI_PROF_PIPE_TENSOR_ACTIVE": - s.mb.RecordGpuDcgmPipeUtilizationDataPoint(now, metric.asFloat64(), metadata.AttributeGpuPipeTensor) - case "DCGM_FI_PROF_PIPE_FP64_ACTIVE": - s.mb.RecordGpuDcgmPipeUtilizationDataPoint(now, metric.asFloat64(), metadata.AttributeGpuPipeFp64) - case "DCGM_FI_PROF_PIPE_FP32_ACTIVE": - s.mb.RecordGpuDcgmPipeUtilizationDataPoint(now, metric.asFloat64(), metadata.AttributeGpuPipeFp32) - case "DCGM_FI_PROF_PIPE_FP16_ACTIVE": - s.mb.RecordGpuDcgmPipeUtilizationDataPoint(now, metric.asFloat64(), metadata.AttributeGpuPipeFp16) - case "DCGM_FI_DEV_ENC_UTIL": - encUtil := float64(metric.asInt64()) / 100.0 /* normalize */ - s.mb.RecordGpuDcgmCodecEncoderUtilizationDataPoint(now, encUtil) - case "DCGM_FI_DEV_DEC_UTIL": - decUtil := float64(metric.asInt64()) / 100.0 /* normalize */ - s.mb.RecordGpuDcgmCodecDecoderUtilizationDataPoint(now, decUtil) - case "DCGM_FI_DEV_FB_FREE": - bytesFree := 1e6 * metric.asInt64() /* MBy to By */ - s.mb.RecordGpuDcgmMemoryBytesUsedDataPoint(now, bytesFree, metadata.AttributeGpuMemoryStateFree) - case "DCGM_FI_DEV_FB_USED": - bytesUsed := 1e6 * metric.asInt64() /* MBy to By */ - s.mb.RecordGpuDcgmMemoryBytesUsedDataPoint(now, bytesUsed, 
metadata.AttributeGpuMemoryStateUsed) - case "DCGM_FI_DEV_FB_RESERVED": - bytesFree := 1e6 * metric.asInt64() /* MBy to By */ - s.mb.RecordGpuDcgmMemoryBytesUsedDataPoint(now, bytesFree, metadata.AttributeGpuMemoryStateReserved) - case "DCGM_FI_PROF_DRAM_ACTIVE": - s.mb.RecordGpuDcgmMemoryBandwidthUtilizationDataPoint(now, metric.asFloat64()) - // TODO: fallback - //case "DCGM_FI_DEV_MEM_COPY_UTIL": - // memCopyUtil := float64(metric.asInt64()) / 100.0 /* normalize */ - // s.mb.RecordGpuDcgmMemoryBandwidthUtilizationDataPoint(now, memCopyUtil) - case "DCGM_FI_PROF_PCIE_TX_BYTES": - pcieTx := int64(float64(metric.asInt64()) * (s.config.CollectionInterval.Seconds())) /* rate to delta */ - s.mb.RecordGpuDcgmPcieIoDataPoint(now, pcieTx, metadata.AttributeNetworkIoDirectionTransmit) - case "DCGM_FI_PROF_PCIE_RX_BYTES": - pcieRx := int64(float64(metric.asInt64()) * (s.config.CollectionInterval.Seconds())) /* rate to delta */ - s.mb.RecordGpuDcgmPcieIoDataPoint(now, pcieRx, metadata.AttributeNetworkIoDirectionReceive) - case "DCGM_FI_PROF_NVLINK_TX_BYTES": - nvlinkTx := int64(float64(metric.asInt64()) * (s.config.CollectionInterval.Seconds())) /* rate to delta */ - s.mb.RecordGpuDcgmNvlinkIoDataPoint(now, nvlinkTx, metadata.AttributeNetworkIoDirectionTransmit) - case "DCGM_FI_PROF_NVLINK_RX_BYTES": - nvlinkRx := int64(float64(metric.asInt64()) * (s.config.CollectionInterval.Seconds())) /* rate to delta */ - s.mb.RecordGpuDcgmNvlinkIoDataPoint(now, nvlinkRx, metadata.AttributeNetworkIoDirectionReceive) - case "DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION": - s.mb.RecordGpuDcgmEnergyConsumptionDataPoint(now, metric.asFloat64()) - // TODO: fallback - //case "DCGM_FI_DEV_POWER_USAGE": - // powerUsage := metric.asFloat64() * (s.config.CollectionInterval.Seconds()) /* rate to delta */ // TODO: cumulative - // s.mb.RecordGpuDcgmEnergyConsumptionDataPoint(now, powerUsage) - case "DCGM_FI_DEV_GPU_TEMP": - s.mb.RecordGpuDcgmTemperatureDataPoint(now, metric.asFloat64()) - case 
"DCGM_FI_DEV_SM_CLOCK": - clockFreq := 1e6 * metric.asFloat64() /* MHz to Hz */ - s.mb.RecordGpuDcgmClockFrequencyDataPoint(now, clockFreq) - case "DCGM_FI_DEV_POWER_VIOLATION": - violationTime := float64(metric.asInt64()) / 1e6 /* us to s */ - s.mb.RecordGpuDcgmClockThrottleDurationTimeDataPoint(now, violationTime, metadata.AttributeGpuClockViolationPower) - case "DCGM_FI_DEV_THERMAL_VIOLATION": - violationTime := float64(metric.asInt64()) / 1e6 /* us to s */ - s.mb.RecordGpuDcgmClockThrottleDurationTimeDataPoint(now, violationTime, metadata.AttributeGpuClockViolationThermal) - case "DCGM_FI_DEV_SYNC_BOOST_VIOLATION": - violationTime := float64(metric.asInt64()) / 1e6 /* us to s */ - s.mb.RecordGpuDcgmClockThrottleDurationTimeDataPoint(now, violationTime, metadata.AttributeGpuClockViolationSyncBoost) - case "DCGM_FI_DEV_BOARD_LIMIT_VIOLATION": - violationTime := float64(metric.asInt64()) / 1e6 /* us to s */ - s.mb.RecordGpuDcgmClockThrottleDurationTimeDataPoint(now, violationTime, metadata.AttributeGpuClockViolationBoardLimit) - case "DCGM_FI_DEV_LOW_UTIL_VIOLATION": - violationTime := float64(metric.asInt64()) / 1e6 /* us to s */ - s.mb.RecordGpuDcgmClockThrottleDurationTimeDataPoint(now, violationTime, metadata.AttributeGpuClockViolationLowUtil) - case "DCGM_FI_DEV_RELIABILITY_VIOLATION": - violationTime := float64(metric.asInt64()) / 1e6 /* us to s */ - s.mb.RecordGpuDcgmClockThrottleDurationTimeDataPoint(now, violationTime, metadata.AttributeGpuClockViolationReliability) - case "DCGM_FI_DEV_TOTAL_APP_CLOCKS_VIOLATION": - violationTime := float64(metric.asInt64()) / 1e6 /* us to s */ - s.mb.RecordGpuDcgmClockThrottleDurationTimeDataPoint(now, violationTime, metadata.AttributeGpuClockViolationAppClock) - case "DCGM_FI_DEV_TOTAL_BASE_CLOCKS_VIOLATION": - violationTime := float64(metric.asInt64()) / 1e6 /* us to s */ - s.mb.RecordGpuDcgmClockThrottleDurationTimeDataPoint(now, violationTime, metadata.AttributeGpuClockViolationBaseClock) - case 
"DCGM_FI_DEV_ECC_SBE_VOL_TOTAL": - s.mb.RecordGpuDcgmEccErrorsDataPoint(now, metric.asInt64(), metadata.AttributeGpuErrorTypeSbe) - case "DCGM_FI_DEV_ECC_DBE_VOL_TOTAL": - s.mb.RecordGpuDcgmEccErrorsDataPoint(now, metric.asInt64(), metadata.AttributeGpuErrorTypeDbe) - } + if metric, ok := metrics["DCGM_FI_PROF_GR_ENGINE_ACTIVE"]; ok { + s.mb.RecordGpuDcgmUtilizationDataPoint(now, metric.asFloat64()) + } else if metric, ok := metrics["DCGM_FI_DEV_GPU_UTIL"]; ok { // fallback + gpuUtil := float64(metric.asInt64()) / 100.0 /* normalize */ + s.mb.RecordGpuDcgmUtilizationDataPoint(now, gpuUtil) + } + if metric, ok := metrics["DCGM_FI_PROF_SM_ACTIVE"]; ok { + s.mb.RecordGpuDcgmSmUtilizationDataPoint(now, metric.asFloat64()) + } + if metric, ok := metrics["DCGM_FI_PROF_SM_OCCUPANCY"]; ok { + s.mb.RecordGpuDcgmSmOccupancyDataPoint(now, metric.asFloat64()) + } + if metric, ok := metrics["DCGM_FI_PROF_PIPE_TENSOR_ACTIVE"]; ok { + s.mb.RecordGpuDcgmPipeUtilizationDataPoint(now, metric.asFloat64(), metadata.AttributeGpuPipeTensor) + } + if metric, ok := metrics["DCGM_FI_PROF_PIPE_FP64_ACTIVE"]; ok { + s.mb.RecordGpuDcgmPipeUtilizationDataPoint(now, metric.asFloat64(), metadata.AttributeGpuPipeFp64) + } + if metric, ok := metrics["DCGM_FI_PROF_PIPE_FP32_ACTIVE"]; ok { + s.mb.RecordGpuDcgmPipeUtilizationDataPoint(now, metric.asFloat64(), metadata.AttributeGpuPipeFp32) + } + if metric, ok := metrics["DCGM_FI_PROF_PIPE_FP16_ACTIVE"]; ok { + s.mb.RecordGpuDcgmPipeUtilizationDataPoint(now, metric.asFloat64(), metadata.AttributeGpuPipeFp16) + } + if metric, ok := metrics["DCGM_FI_DEV_ENC_UTIL"]; ok { + encUtil := float64(metric.asInt64()) / 100.0 /* normalize */ + s.mb.RecordGpuDcgmCodecEncoderUtilizationDataPoint(now, encUtil) + } + if metric, ok := metrics["DCGM_FI_DEV_DEC_UTIL"]; ok { + decUtil := float64(metric.asInt64()) / 100.0 /* normalize */ + s.mb.RecordGpuDcgmCodecDecoderUtilizationDataPoint(now, decUtil) + } + if metric, ok := metrics["DCGM_FI_DEV_FB_FREE"]; ok { + 
bytesFree := 1e6 * metric.asInt64() /* MBy to By */ + s.mb.RecordGpuDcgmMemoryBytesUsedDataPoint(now, bytesFree, metadata.AttributeGpuMemoryStateFree) + } + if metric, ok := metrics["DCGM_FI_DEV_FB_USED"]; ok { + bytesUsed := 1e6 * metric.asInt64() /* MBy to By */ + s.mb.RecordGpuDcgmMemoryBytesUsedDataPoint(now, bytesUsed, metadata.AttributeGpuMemoryStateUsed) + } + if metric, ok := metrics["DCGM_FI_DEV_FB_RESERVED"]; ok { + bytesFree := 1e6 * metric.asInt64() /* MBy to By */ + s.mb.RecordGpuDcgmMemoryBytesUsedDataPoint(now, bytesFree, metadata.AttributeGpuMemoryStateReserved) + } + if metric, ok := metrics["DCGM_FI_PROF_DRAM_ACTIVE"]; ok { + s.mb.RecordGpuDcgmMemoryBandwidthUtilizationDataPoint(now, metric.asFloat64()) + } else if metric, ok := metrics["DCGM_FI_DEV_MEM_COPY_UTIL"]; ok { // fallback + memCopyUtil := float64(metric.asInt64()) / 100.0 /* normalize */ + s.mb.RecordGpuDcgmMemoryBandwidthUtilizationDataPoint(now, memCopyUtil) + } + if metric, ok := metrics["DCGM_FI_PROF_PCIE_TX_BYTES"]; ok { + pcieTx := int64(float64(metric.asInt64()) * (s.config.CollectionInterval.Seconds())) /* rate to delta */ + s.mb.RecordGpuDcgmPcieIoDataPoint(now, pcieTx, metadata.AttributeNetworkIoDirectionTransmit) + } + if metric, ok := metrics["DCGM_FI_PROF_PCIE_RX_BYTES"]; ok { + pcieRx := int64(float64(metric.asInt64()) * (s.config.CollectionInterval.Seconds())) /* rate to delta */ + s.mb.RecordGpuDcgmPcieIoDataPoint(now, pcieRx, metadata.AttributeNetworkIoDirectionReceive) + } + if metric, ok := metrics["DCGM_FI_PROF_NVLINK_TX_BYTES"]; ok { + nvlinkTx := int64(float64(metric.asInt64()) * (s.config.CollectionInterval.Seconds())) /* rate to delta */ + s.mb.RecordGpuDcgmNvlinkIoDataPoint(now, nvlinkTx, metadata.AttributeNetworkIoDirectionTransmit) + } + if metric, ok := metrics["DCGM_FI_PROF_NVLINK_RX_BYTES"]; ok { + nvlinkRx := int64(float64(metric.asInt64()) * (s.config.CollectionInterval.Seconds())) /* rate to delta */ + s.mb.RecordGpuDcgmNvlinkIoDataPoint(now, nvlinkRx, 
metadata.AttributeNetworkIoDirectionReceive) + } + if metric, ok := metrics["DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION"]; ok { + s.mb.RecordGpuDcgmEnergyConsumptionDataPoint(now, metric.asFloat64()) + } else if metric, ok := metrics["DCGM_FI_DEV_POWER_USAGE"]; ok { // fallback + powerUsage := metric.asFloat64() * (s.config.CollectionInterval.Seconds()) /* rate to delta */ + s.energyConsumptionFallback += powerUsage /* delta to cumulative */ + s.mb.RecordGpuDcgmEnergyConsumptionDataPoint(now, s.energyConsumptionFallback) + } + if metric, ok := metrics["DCGM_FI_DEV_GPU_TEMP"]; ok { + s.mb.RecordGpuDcgmTemperatureDataPoint(now, metric.asFloat64()) + } + if metric, ok := metrics["DCGM_FI_DEV_SM_CLOCK"]; ok { + clockFreq := 1e6 * metric.asFloat64() /* MHz to Hz */ + s.mb.RecordGpuDcgmClockFrequencyDataPoint(now, clockFreq) + } + if metric, ok := metrics["DCGM_FI_DEV_POWER_VIOLATION"]; ok { + violationTime := float64(metric.asInt64()) / 1e6 /* us to s */ + s.mb.RecordGpuDcgmClockThrottleDurationTimeDataPoint(now, violationTime, metadata.AttributeGpuClockViolationPower) + } + if metric, ok := metrics["DCGM_FI_DEV_THERMAL_VIOLATION"]; ok { + violationTime := float64(metric.asInt64()) / 1e6 /* us to s */ + s.mb.RecordGpuDcgmClockThrottleDurationTimeDataPoint(now, violationTime, metadata.AttributeGpuClockViolationThermal) + } + if metric, ok := metrics["DCGM_FI_DEV_SYNC_BOOST_VIOLATION"]; ok { + violationTime := float64(metric.asInt64()) / 1e6 /* us to s */ + s.mb.RecordGpuDcgmClockThrottleDurationTimeDataPoint(now, violationTime, metadata.AttributeGpuClockViolationSyncBoost) + } + if metric, ok := metrics["DCGM_FI_DEV_BOARD_LIMIT_VIOLATION"]; ok { + violationTime := float64(metric.asInt64()) / 1e6 /* us to s */ + s.mb.RecordGpuDcgmClockThrottleDurationTimeDataPoint(now, violationTime, metadata.AttributeGpuClockViolationBoardLimit) + } + if metric, ok := metrics["DCGM_FI_DEV_LOW_UTIL_VIOLATION"]; ok { + violationTime := float64(metric.asInt64()) / 1e6 /* us to s */ + 
s.mb.RecordGpuDcgmClockThrottleDurationTimeDataPoint(now, violationTime, metadata.AttributeGpuClockViolationLowUtil) + } + if metric, ok := metrics["DCGM_FI_DEV_RELIABILITY_VIOLATION"]; ok { + violationTime := float64(metric.asInt64()) / 1e6 /* us to s */ + s.mb.RecordGpuDcgmClockThrottleDurationTimeDataPoint(now, violationTime, metadata.AttributeGpuClockViolationReliability) + } + if metric, ok := metrics["DCGM_FI_DEV_TOTAL_APP_CLOCKS_VIOLATION"]; ok { + violationTime := float64(metric.asInt64()) / 1e6 /* us to s */ + s.mb.RecordGpuDcgmClockThrottleDurationTimeDataPoint(now, violationTime, metadata.AttributeGpuClockViolationAppClock) + } + if metric, ok := metrics["DCGM_FI_DEV_TOTAL_BASE_CLOCKS_VIOLATION"]; ok { + violationTime := float64(metric.asInt64()) / 1e6 /* us to s */ + s.mb.RecordGpuDcgmClockThrottleDurationTimeDataPoint(now, violationTime, metadata.AttributeGpuClockViolationBaseClock) + } + if metric, ok := metrics["DCGM_FI_DEV_ECC_SBE_VOL_TOTAL"]; ok { + s.mb.RecordGpuDcgmEccErrorsDataPoint(now, metric.asInt64(), metadata.AttributeGpuErrorTypeSbe) + } + if metric, ok := metrics["DCGM_FI_DEV_ECC_DBE_VOL_TOTAL"]; ok { + s.mb.RecordGpuDcgmEccErrorsDataPoint(now, metric.asInt64(), metadata.AttributeGpuErrorTypeDbe) } // TODO: XID errors. 
//s.mb.RecordGpuDcgmXidErrorsDataPoint(now, metric.asInt64(), xid) diff --git a/receiver/dcgmreceiver/scraper_gpu_test.go b/receiver/dcgmreceiver/scraper_gpu_test.go index 333e5e5a2..2c7890e13 100644 --- a/receiver/dcgmreceiver/scraper_gpu_test.go +++ b/receiver/dcgmreceiver/scraper_gpu_test.go @@ -202,12 +202,12 @@ func TestScrapeOnProfilingPaused(t *testing.T) { assert.NoError(t, err) expectedMetrics := []string{ - //TODO "gpu.dcgm.utilization", + "gpu.dcgm.utilization", "gpu.dcgm.codec.decoder.utilization", "gpu.dcgm.codec.encoder.utilization", "gpu.dcgm.memory.bytes_used", - //TODO "gpu.dcgm.memory.bandwidth_utilization", - //TODO "gpu.dcgm.energy_consumption", + "gpu.dcgm.memory.bandwidth_utilization", + "gpu.dcgm.energy_consumption", "gpu.dcgm.temperature", "gpu.dcgm.clock.frequency", "gpu.dcgm.clock.throttle_duration.time", @@ -218,8 +218,6 @@ func TestScrapeOnProfilingPaused(t *testing.T) { require.Equal(t, 1, ilms.Len()) ms := ilms.At(0).Metrics() - require.LessOrEqual(t, len(expectedMetrics), ms.Len()) - metricWasSeen := make(map[string]bool) for i := 0; i < ms.Len(); i++ { metricWasSeen[ms.At(i).Name()] = true @@ -227,7 +225,9 @@ func TestScrapeOnProfilingPaused(t *testing.T) { for _, metric := range expectedMetrics { assert.True(t, metricWasSeen[metric], metric) + delete(metricWasSeen, metric) } + assert.Equal(t, len(expectedMetrics), ms.Len(), fmt.Sprintf("%v", metricWasSeen)) } // loadExpectedScraperMetrics calls LoadExpectedMetrics to read the supported From 2ab8868d76863bd90593a93fef84de575c2f146b Mon Sep 17 00:00:00 2001 From: Igor Peshansky Date: Thu, 27 Jun 2024 19:50:49 -0400 Subject: [PATCH 17/38] Skip test gracefully when pausing profiling not supported. 
--- receiver/dcgmreceiver/scraper_gpu_test.go | 12 ++++- .../testprofilepause/test_profile_pause.go | 44 +++++++++++++++---- 2 files changed, 46 insertions(+), 10 deletions(-) diff --git a/receiver/dcgmreceiver/scraper_gpu_test.go b/receiver/dcgmreceiver/scraper_gpu_test.go index 2c7890e13..f2bb545d7 100644 --- a/receiver/dcgmreceiver/scraper_gpu_test.go +++ b/receiver/dcgmreceiver/scraper_gpu_test.go @@ -19,6 +19,7 @@ package dcgmreceiver import ( "context" + "errors" "fmt" "testing" "time" @@ -191,10 +192,17 @@ func TestScrapeOnProfilingPaused(t *testing.T) { require.NotNil(t, scraper) defer func() { testprofilepause.ResumeProfilingMetrics() }() - testprofilepause.PauseProfilingMetrics() + err := testprofilepause.PauseProfilingMetrics() + if err != nil { + if errors.Is(err, testprofilepause.FeatureNotSupportedError) { + t.Skipf("Pausing profiling not supported") + } else { + t.Errorf("Pausing profiling failed with error %v", err) + } + } time.Sleep(20 * time.Millisecond) - err := scraper.start(context.Background(), componenttest.NewNopHost()) + err = scraper.start(context.Background(), componenttest.NewNopHost()) require.NoError(t, err) metrics, err := scraper.scrape(context.Background()) diff --git a/receiver/dcgmreceiver/testprofilepause/test_profile_pause.go b/receiver/dcgmreceiver/testprofilepause/test_profile_pause.go index 15a329095..1621cc3ed 100644 --- a/receiver/dcgmreceiver/testprofilepause/test_profile_pause.go +++ b/receiver/dcgmreceiver/testprofilepause/test_profile_pause.go @@ -22,9 +22,10 @@ package testprofilepause /* #include typedef uintptr_t dcgmHandle_t; -typedef enum dcgmReturn_enum { DCGM_ST_OK = 0 } dcgmReturn_t; +typedef enum dcgmReturn_enum { DCGM_ST_OK = 0, DCGM_ST_NOT_SUPPORTED = -6 } dcgmReturn_t; dcgmReturn_t dcgmProfPause(dcgmHandle_t pDcgmHandle); dcgmReturn_t dcgmProfResume(dcgmHandle_t pDcgmHandle); +const char *errorString(dcgmReturn_t result); */ import "C" import ( @@ -39,17 +40,44 @@ type dcgmHandle struct{ handle 
C.dcgmHandle_t } //go:linkname handle github.com/NVIDIA/go-dcgm/pkg/dcgm.handle var handle dcgmHandle -func PauseProfilingMetrics() { +var errorMap = map[C.dcgmReturn_t]error{ + C.DCGM_ST_OK: nil, +} + +func errorString(result C.dcgmReturn_t) error { + if err, ok := errorMap[result]; ok { + return err + } + msg := C.GoString(C.errorString(result)) + err := fmt.Errorf("%v", msg) + errorMap[result] = err + return err +} + +var FeatureNotSupportedError error +var initErrors = func() { + if FeatureNotSupportedError == nil { + FeatureNotSupportedError = errorString(C.DCGM_ST_NOT_SUPPORTED) + } +} + +func PauseProfilingMetrics() error { + initErrors() result := C.dcgmProfPause(handle.handle) - if result != 0 { - fmt.Printf("CUDA version %d", dcgm.DCGM_FI_CUDA_DRIVER_VERSION) - fmt.Printf("Failed to pause profiling (result %d)\n", result) + err := errorString(result) + if err != nil { + fmt.Printf("CUDA version %d\n", dcgm.DCGM_FI_CUDA_DRIVER_VERSION) + fmt.Printf("Failed to pause profiling (%v)\n", err) } + return err } -func ResumeProfilingMetrics() { +func ResumeProfilingMetrics() error { + initErrors() result := C.dcgmProfResume(handle.handle) - if result != 0 { - fmt.Printf("Failed to resume profiling (result %d)\n", result) + err := errorString(result) + if err != nil { + fmt.Printf("Failed to resume profiling (%v)\n", err) } + return err } From 4cc8a8b03908e18e1a920e404c9b276944aeaa79 Mon Sep 17 00:00:00 2001 From: Igor Peshansky Date: Thu, 27 Jun 2024 20:37:12 -0400 Subject: [PATCH 18/38] Don't fail client tests on blank values. 
--- receiver/dcgmreceiver/client.go | 38 +++++++++++++++++++++++--------- receiver/dcgmreceiver/config.go | 2 ++ receiver/dcgmreceiver/factory.go | 4 +++- 3 files changed, 33 insertions(+), 11 deletions(-) diff --git a/receiver/dcgmreceiver/client.go b/receiver/dcgmreceiver/client.go index 99431cf5f..c86dec412 100644 --- a/receiver/dcgmreceiver/client.go +++ b/receiver/dcgmreceiver/client.go @@ -42,6 +42,9 @@ type dcgmClient struct { devicesModelName []string devicesUUID []string deviceMetricToFailedQueryCount map[string]uint64 + collectionInterval time.Duration + retryBlankValues bool + maxRetries int } type dcgmMetric struct { @@ -106,6 +109,9 @@ func newClient(config *Config, logger *zap.Logger) (*dcgmClient, error) { devicesModelName: names, devicesUUID: UUIDs, deviceMetricToFailedQueryCount: make(map[string]uint64), + collectionInterval: config.CollectionInterval, + retryBlankValues: config.retryBlankValues, + maxRetries: config.maxRetries, }, nil } @@ -434,26 +440,38 @@ func (client *dcgmClient) collectDeviceMetrics() (map[uint][]dcgmMetric, error) var err scrapererror.ScrapeErrors gpuMetrics := make(map[uint][]dcgmMetric) for _, gpuIndex := range client.deviceIndices { - fieldValues, pollErr := dcgmGetLatestValuesForFields(gpuIndex, client.enabledFieldIDs) - if pollErr == nil { - gpuMetrics[gpuIndex] = client.appendMetric(gpuMetrics[gpuIndex], gpuIndex, fieldValues) - client.logger.Debugf("Successful poll of DCGM daemon for GPU %d", gpuIndex) - } else { - msg := fmt.Sprintf("Unable to poll DCGM daemon for GPU %d on %s", gpuIndex, pollErr) - client.issueWarningForFailedQueryUptoThreshold(gpuIndex, "all-profiling-metrics", msg) - err.AddPartial(1, fmt.Errorf("%s", msg)) + retry := true + for i := 0; retry && i < client.maxRetries; i++ { + fieldValues, pollErr := dcgmGetLatestValuesForFields(gpuIndex, client.enabledFieldIDs) + if pollErr == nil { + gpuMetrics[gpuIndex], retry = client.appendMetric(gpuMetrics[gpuIndex], gpuIndex, fieldValues) + if retry { + 
client.logger.Warnf("Retrying poll of DCGM daemon for GPU %d; attempt %d", gpuIndex, i+1) + time.Sleep(client.collectionInterval) + continue + } + client.logger.Debugf("Successful poll of DCGM daemon for GPU %d", gpuIndex) + } else { + msg := fmt.Sprintf("Unable to poll DCGM daemon for GPU %d on %s", gpuIndex, pollErr) + client.issueWarningForFailedQueryUptoThreshold(gpuIndex, "all-profiling-metrics", msg) + err.AddPartial(1, fmt.Errorf("%s", msg)) + } } } return gpuMetrics, err.Combine() } -func (client *dcgmClient) appendMetric(gpuMetrics []dcgmMetric, gpuIndex uint, fieldValues []dcgm.FieldValue_v1) []dcgmMetric { +func (client *dcgmClient) appendMetric(gpuMetrics []dcgmMetric, gpuIndex uint, fieldValues []dcgm.FieldValue_v1) (result []dcgmMetric, retry bool) { + retry = false for _, fieldValue := range fieldValues { dcgmName := dcgmIDToName[dcgm.Short(fieldValue.FieldId)] if err := isValidValue(fieldValue); err != nil { msg := fmt.Sprintf("Received invalid value (ts %d gpu %d) %s: %v", fieldValue.Ts, gpuIndex, dcgmName, err) client.issueWarningForFailedQueryUptoThreshold(gpuIndex, dcgmName, msg) + if client.retryBlankValues && errors.Is(err, blankValueError) { + retry = true + } continue } @@ -466,7 +484,7 @@ func (client *dcgmClient) appendMetric(gpuMetrics []dcgmMetric, gpuIndex uint, f gpuMetrics = append(gpuMetrics, dcgmMetric{fieldValue.Ts, dcgmName, fieldValue.Value}) } - return gpuMetrics + return gpuMetrics, retry } func (client *dcgmClient) issueWarningForFailedQueryUptoThreshold(deviceIdx uint, dcgmName string, reason string) { diff --git a/receiver/dcgmreceiver/config.go b/receiver/dcgmreceiver/config.go index 4d6be25b2..96ce31cf8 100644 --- a/receiver/dcgmreceiver/config.go +++ b/receiver/dcgmreceiver/config.go @@ -30,4 +30,6 @@ type Config struct { scraperhelper.ControllerConfig `mapstructure:",squash"` confignet.TCPAddrConfig `mapstructure:",squash"` Metrics metadata.MetricsConfig `mapstructure:"metrics"` + retryBlankValues bool + maxRetries int } 
diff --git a/receiver/dcgmreceiver/factory.go b/receiver/dcgmreceiver/factory.go index 9682eb46a..49b057cd1 100644 --- a/receiver/dcgmreceiver/factory.go +++ b/receiver/dcgmreceiver/factory.go @@ -39,6 +39,8 @@ func createDefaultConfig() component.Config { TCPAddrConfig: confignet.TCPAddrConfig{ Endpoint: defaultEndpoint, }, - Metrics: metadata.DefaultMetricsConfig(), + Metrics: metadata.DefaultMetricsConfig(), + retryBlankValues: true, + maxRetries: 5, } } From d0ad1ad8438c58a79e19e4c9eaac0d652eee5043 Mon Sep 17 00:00:00 2001 From: Igor Peshansky Date: Wed, 17 Jul 2024 17:56:07 -0400 Subject: [PATCH 19/38] Fix supported field error handling. Minor comment fixes. --- receiver/dcgmreceiver/client.go | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/receiver/dcgmreceiver/client.go b/receiver/dcgmreceiver/client.go index c86dec412..6aab72142 100644 --- a/receiver/dcgmreceiver/client.go +++ b/receiver/dcgmreceiver/client.go @@ -72,9 +72,7 @@ func newClient(config *Config, logger *zap.Logger) (*dcgmClient, error) { requestedFieldIDs := discoverRequestedFieldIDs(config) supportedRegularFieldIDs, err := getSupportedRegularFields(requestedFieldIDs, logger) if err != nil { - // TODO: If there is error querying the supported fields at all, let the - // receiver collect no metrics. - logger.Sugar().Warnf("Error querying supported regular fields on '%w'. 
Regular GPU metrics will not be collected.", err) + return nil, fmt.Errorf("Error querying supported regular fields: %w", err) } supportedProfilingFieldIDs, err := getSupportedProfilingFields() if err != nil { @@ -313,7 +311,7 @@ func getSupportedRegularFields(requestedFields []dcgm.Short, logger *zap.Logger) if ef < dcgmProfilingFieldsStart { // For fields like `DCGM_FI_DEV_*`, which are not // profiling fields, try to actually retrieve the values - // all devices + // from all devices regularFields = append(regularFields, ef) } } From d19b09e949cdef72caa503fee000b35b62ed2753 Mon Sep 17 00:00:00 2001 From: Igor Peshansky Date: Thu, 18 Jul 2024 18:21:10 -0400 Subject: [PATCH 20/38] Fix lint errors. --- receiver/dcgmreceiver/client.go | 13 ++++----- receiver/dcgmreceiver/client_test.go | 2 +- receiver/dcgmreceiver/factory_gpu_on.go | 2 -- receiver/dcgmreceiver/scraper.go | 2 +- receiver/dcgmreceiver/util.go | 36 ++++++++++++------------- 5 files changed, 27 insertions(+), 28 deletions(-) diff --git a/receiver/dcgmreceiver/client.go b/receiver/dcgmreceiver/client.go index 6aab72142..f27d85e23 100644 --- a/receiver/dcgmreceiver/client.go +++ b/receiver/dcgmreceiver/client.go @@ -240,7 +240,8 @@ func discoverRequestedFieldIDs(config *Config) []dcgm.Short { requestedFieldIDs = append(requestedFieldIDs, dcgm.DCGM_FI["DCGM_FI_DEV_ECC_DBE_VOL_TOTAL"]) } if config.Metrics.GpuDcgmXidErrors.Enabled { - //requestedFieldIDs = append(requestedFieldIDs, dcgm.DCGM_FI[""]) + // requestedFieldIDs = append(requestedFieldIDs, dcgm.DCGM_FI[""]) + func() {}() // no-op } return requestedFieldIDs @@ -324,10 +325,10 @@ func getSupportedRegularFields(requestedFields []dcgm.Short, logger *zap.Logger) } deviceGroupName := "google-cloud-ops-agent-initial-watch-group" deviceGroup, err := dcgm.NewDefaultGroup(deviceGroupName) - defer dcgm.DestroyGroup(deviceGroup) if err != nil { return nil, fmt.Errorf("Unable to create DCGM GPU default group on %w", err) } + defer func() { _ = 
dcgm.DestroyGroup(deviceGroup) }() testFieldGroup, err := setWatchesOnFields(logger, deviceGroup, regularFields, dcgmWatchParams{ fieldGroupName: "google-cloud-ops-agent-initial-discovery", updateFreqUs: 3600000000, // call UpdateAllFields manually @@ -337,11 +338,11 @@ func getSupportedRegularFields(requestedFields []dcgm.Short, logger *zap.Logger) if err != nil { return nil, fmt.Errorf("Unable to set field watches on %w", err) } + defer func() { _ = dcgm.FieldGroupDestroy(testFieldGroup) }() err = dcgm.UpdateAllFields() if err != nil { return nil, fmt.Errorf("Unable to update fields on %w", err) } - defer dcgm.FieldGroupDestroy(testFieldGroup) found := make(map[dcgm.Short]bool) for _, gpuIndex := range deviceIndices { fieldValues, pollErr := dcgm.GetLatestValuesForFields(gpuIndex, regularFields) @@ -365,8 +366,8 @@ func getSupportedRegularFields(requestedFields []dcgm.Short, logger *zap.Logger) } // TODO: dcgmUnwatchFields is not available. supported := make([]dcgm.Short, len(found)) - for fieldId, _ := range found { - supported = append(supported, fieldId) + for fieldID := range found { + supported = append(supported, fieldID) } return supported, nil } @@ -467,7 +468,7 @@ func (client *dcgmClient) appendMetric(gpuMetrics []dcgmMetric, gpuIndex uint, f if err := isValidValue(fieldValue); err != nil { msg := fmt.Sprintf("Received invalid value (ts %d gpu %d) %s: %v", fieldValue.Ts, gpuIndex, dcgmName, err) client.issueWarningForFailedQueryUptoThreshold(gpuIndex, dcgmName, msg) - if client.retryBlankValues && errors.Is(err, blankValueError) { + if client.retryBlankValues && errors.Is(err, errBlankValue) { retry = true } continue diff --git a/receiver/dcgmreceiver/client_test.go b/receiver/dcgmreceiver/client_test.go index 6d390aa60..b113ed70f 100644 --- a/receiver/dcgmreceiver/client_test.go +++ b/receiver/dcgmreceiver/client_test.go @@ -32,7 +32,7 @@ import ( func TestNewDcgmClientOnInitializationError(t *testing.T) { realDcgmInit := dcgmInit defer func() { 
dcgmInit = realDcgmInit }() - dcgmInit = func(args ...string) (func(), error) { + dcgmInit = func(...string) (func(), error) { return nil, fmt.Errorf("No DCGM client library *OR* No DCGM connection") } diff --git a/receiver/dcgmreceiver/factory_gpu_on.go b/receiver/dcgmreceiver/factory_gpu_on.go index b3114e334..9601940d0 100644 --- a/receiver/dcgmreceiver/factory_gpu_on.go +++ b/receiver/dcgmreceiver/factory_gpu_on.go @@ -34,8 +34,6 @@ import ( ) var dcgmIDToName map[dcgm.Short]string -var metricNameToDcgmName map[string]string -var dcgmNameToMetricName map[string]string var randSource = rand.New(rand.NewSource(time.Now().UnixMicro())) func init() { diff --git a/receiver/dcgmreceiver/scraper.go b/receiver/dcgmreceiver/scraper.go index 293e3e3c5..d154db7df 100644 --- a/receiver/dcgmreceiver/scraper.go +++ b/receiver/dcgmreceiver/scraper.go @@ -231,7 +231,7 @@ func (s *dcgmScraper) scrape(_ context.Context) (pmetric.Metrics, error) { s.mb.RecordGpuDcgmEccErrorsDataPoint(now, metric.asInt64(), metadata.AttributeGpuErrorTypeDbe) } // TODO: XID errors. 
- //s.mb.RecordGpuDcgmXidErrorsDataPoint(now, metric.asInt64(), xid) + // s.mb.RecordGpuDcgmXidErrorsDataPoint(now, metric.asInt64(), xid) s.mb.EmitForResource(metadata.WithResource(gpuResource)) } diff --git a/receiver/dcgmreceiver/util.go b/receiver/dcgmreceiver/util.go index 9961b0105..2c3457fd4 100644 --- a/receiver/dcgmreceiver/util.go +++ b/receiver/dcgmreceiver/util.go @@ -25,11 +25,11 @@ import ( ) var ( - blankValueError = fmt.Errorf("unspecified blank value") - dataNotFoundError = fmt.Errorf("data not found") - notSupportedError = fmt.Errorf("field not supported") - permissionDeniedError = fmt.Errorf("no permission to fetch value") - unexpectedTypeError = fmt.Errorf("unexpected data type") + errBlankValue = fmt.Errorf("unspecified blank value") + errDataNotFound = fmt.Errorf("data not found") + errNotSupported = fmt.Errorf("field not supported") + errPermissionDenied = fmt.Errorf("no permission to fetch value") + errUnexpectedType = fmt.Errorf("unexpected data type") ) func (m *dcgmMetric) setFloat64(val float64) { @@ -53,38 +53,38 @@ func isValidValue(fieldValue dcgm.FieldValue_v1) error { case dcgm.DCGM_FT_DOUBLE: switch v := fieldValue.Float64(); v { case dcgm.DCGM_FT_FP64_BLANK: - return blankValueError + return errBlankValue case dcgm.DCGM_FT_FP64_NOT_FOUND: - return dataNotFoundError + return errDataNotFound case dcgm.DCGM_FT_FP64_NOT_SUPPORTED: - return notSupportedError + return errNotSupported case dcgm.DCGM_FT_FP64_NOT_PERMISSIONED: - return permissionDeniedError + return errPermissionDenied } case dcgm.DCGM_FT_INT64: switch v := fieldValue.Int64(); v { case dcgm.DCGM_FT_INT32_BLANK: - return blankValueError + return errBlankValue case dcgm.DCGM_FT_INT32_NOT_FOUND: - return dataNotFoundError + return errDataNotFound case dcgm.DCGM_FT_INT32_NOT_SUPPORTED: - return notSupportedError + return errNotSupported case dcgm.DCGM_FT_INT32_NOT_PERMISSIONED: - return permissionDeniedError + return errPermissionDenied case dcgm.DCGM_FT_INT64_BLANK: - return 
blankValueError + return errBlankValue case dcgm.DCGM_FT_INT64_NOT_FOUND: - return dataNotFoundError + return errDataNotFound case dcgm.DCGM_FT_INT64_NOT_SUPPORTED: - return notSupportedError + return errNotSupported case dcgm.DCGM_FT_INT64_NOT_PERMISSIONED: - return permissionDeniedError + return errPermissionDenied } // dcgm.DCGM_FT_STRING also exists but we don't expect it default: - return unexpectedTypeError + return errUnexpectedType } return nil From 59d2a4a7513bbf398c8d65703a0fbeeb57db6567 Mon Sep 17 00:00:00 2001 From: Igor Peshansky Date: Tue, 23 Jul 2024 02:14:29 -0400 Subject: [PATCH 21/38] Fix description of the network.io.direction label. --- receiver/dcgmreceiver/metadata.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/receiver/dcgmreceiver/metadata.yaml b/receiver/dcgmreceiver/metadata.yaml index b0de6a042..2f4047c70 100644 --- a/receiver/dcgmreceiver/metadata.yaml +++ b/receiver/dcgmreceiver/metadata.yaml @@ -34,7 +34,7 @@ attributes: network.io.direction: type: string - description: Direction of the link traffic, one of [tx, rx]. + description: Direction of the link traffic, one of [transmit, receive]. enum: [transmit, receive] gpu.clock.violation: From 97d26891064b362b6651aebe48117b51c382d640 Mon Sep 17 00:00:00 2001 From: Igor Peshansky Date: Thu, 25 Jul 2024 12:52:26 -0400 Subject: [PATCH 22/38] Avoid panics when tests run with no GPU. --- receiver/dcgmreceiver/client_gpu_test.go | 2 +- receiver/dcgmreceiver/scraper_gpu_test.go | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/receiver/dcgmreceiver/client_gpu_test.go b/receiver/dcgmreceiver/client_gpu_test.go index 123c55298..42d9b7b98 100644 --- a/receiver/dcgmreceiver/client_gpu_test.go +++ b/receiver/dcgmreceiver/client_gpu_test.go @@ -55,7 +55,7 @@ func TestSupportedFieldsWithGolden(t *testing.T) { client, err := newClient(config, zaptest.NewLogger(t)) require.Nil(t, err, "cannot initialize DCGM. 
Install and run DCGM before running tests.") - assert.NotEmpty(t, client.devicesModelName) + require.NotEmpty(t, client.devicesModelName) gpuModel := client.getDeviceModelName(0) allFields := discoverRequestedFieldIDs(config) supportedRegularFields, err := getSupportedRegularFields(allFields, zaptest.NewLogger(t)) diff --git a/receiver/dcgmreceiver/scraper_gpu_test.go b/receiver/dcgmreceiver/scraper_gpu_test.go index f2bb545d7..55f6c1aca 100644 --- a/receiver/dcgmreceiver/scraper_gpu_test.go +++ b/receiver/dcgmreceiver/scraper_gpu_test.go @@ -49,6 +49,10 @@ func TestScrapeWithGpuPresent(t *testing.T) { require.NoError(t, err) metrics, err := scraper.scrape(context.Background()) + assert.NoError(t, err) + + require.NotNil(t, scraper.client) + require.NotEmpty(t, scraper.client.devicesModelName) expectedMetrics := loadExpectedScraperMetrics(t, scraper.client.getDeviceModelName(0)) validateScraperResult(t, metrics, expectedMetrics) } @@ -82,6 +86,9 @@ func TestScrapeWithDelayedDcgmService(t *testing.T) { dcgmInit = realDcgmInit metrics, err = scraper.scrape(context.Background()) assert.NoError(t, err) + + require.NotNil(t, scraper.client) + require.NotEmpty(t, scraper.client.devicesModelName) expectedMetrics := loadExpectedScraperMetrics(t, scraper.client.getDeviceModelName(0)) validateScraperResult(t, metrics, expectedMetrics) } From d47313e6c1a319c08b1441348c45ce6d74c19d09 Mon Sep 17 00:00:00 2001 From: Igor Peshansky Date: Thu, 25 Jul 2024 13:33:30 -0400 Subject: [PATCH 23/38] Review feedback. * Enable field group cleanup. * Better name for bytesReserved. * Disable gpu.dcgm.xid_errors. 
--- receiver/dcgmreceiver/client.go | 6 ++-- receiver/dcgmreceiver/documentation.md | 28 +++++++++---------- .../internal/metadata/generated_config.go | 2 +- .../metadata/generated_metrics_test.go | 1 - receiver/dcgmreceiver/metadata.yaml | 2 +- receiver/dcgmreceiver/scraper.go | 4 +-- 6 files changed, 22 insertions(+), 21 deletions(-) diff --git a/receiver/dcgmreceiver/client.go b/receiver/dcgmreceiver/client.go index f27d85e23..9ef6f8762 100644 --- a/receiver/dcgmreceiver/client.go +++ b/receiver/dcgmreceiver/client.go @@ -95,6 +95,7 @@ func newClient(config *Config, logger *zap.Logger) (*dcgmClient, error) { } enabledFieldGroup, err = setWatchesOnEnabledFields(config, logger, deviceGroup, enabledFields) if err != nil { + _ = dcgm.FieldGroupDestroy(enabledFieldGroup) return nil, fmt.Errorf("Unable to set field watches on %w", err) } } @@ -335,10 +336,10 @@ func getSupportedRegularFields(requestedFields []dcgm.Short, logger *zap.Logger) maxKeepTime: 600, maxKeepSamples: 1, }) + defer func() { _ = dcgm.FieldGroupDestroy(testFieldGroup) }() if err != nil { return nil, fmt.Errorf("Unable to set field watches on %w", err) } - defer func() { _ = dcgm.FieldGroupDestroy(testFieldGroup) }() err = dcgm.UpdateAllFields() if err != nil { return nil, fmt.Errorf("Unable to update fields on %w", err) @@ -401,7 +402,7 @@ func setWatchesOnFields(logger *zap.Logger, deviceGroup dcgm.GroupHandle, fieldI dcgmMaxKeepSamples := params.maxKeepSamples err = dcgm.WatchFieldsWithGroupEx(fieldGroup, deviceGroup, dcgmUpdateFreq, dcgmMaxKeepTime, dcgmMaxKeepSamples) if err != nil { - return dcgm.FieldHandle{}, fmt.Errorf("Setting watches for DCGM field group '%s' failed on %w", params.fieldGroupName, err) + return fieldGroup, fmt.Errorf("Setting watches for DCGM field group '%s' failed on %w", params.fieldGroupName, err) } logger.Sugar().Infof("Setting watches for DCGM field group '%s' succeeded", params.fieldGroupName) @@ -420,6 +421,7 @@ func setWatchesOnEnabledFields(config *Config, 
logger *zap.Logger, deviceGroup d } func (client *dcgmClient) cleanup() { + _ = dcgm.FieldGroupDestroy(client.enabledFieldGroup) if client.handleCleanup != nil { client.handleCleanup() } diff --git a/receiver/dcgmreceiver/documentation.md b/receiver/dcgmreceiver/documentation.md index 9ac286be8..64932e5da 100644 --- a/receiver/dcgmreceiver/documentation.md +++ b/receiver/dcgmreceiver/documentation.md @@ -160,20 +160,6 @@ Ratio of time the graphics engine is active. | ---- | ----------- | ---------- | | 1 | Gauge | Double | -### gpu.dcgm.xid_errors - -XID errors. - -| Unit | Metric Type | Value Type | Aggregation Temporality | Monotonic | -| ---- | ----------- | ---------- | ----------------------- | --------- | -| 1 | Sum | Int | Cumulative | true | - -#### Attributes - -| Name | Description | Values | -| ---- | ----------- | ------ | -| gpu.error.xid | The XID code for the error, 1..143. | Any Int | - ## Optional Metrics The following metrics are not emitted by default. Each of them can be enabled by applying the following configuration: @@ -192,6 +178,20 @@ Fraction of the number of warps resident on a multiprocessor, averaged over all | ---- | ----------- | ---------- | | 1 | Gauge | Double | +### gpu.dcgm.xid_errors + +XID errors. + +| Unit | Metric Type | Value Type | Aggregation Temporality | Monotonic | +| ---- | ----------- | ---------- | ----------------------- | --------- | +| 1 | Sum | Int | Cumulative | true | + +#### Attributes + +| Name | Description | Values | +| ---- | ----------- | ------ | +| gpu.error.xid | The XID code for the error, 1..143. 
| Any Int | + ## Resource Attributes | Name | Description | Values | Enabled | diff --git a/receiver/dcgmreceiver/internal/metadata/generated_config.go b/receiver/dcgmreceiver/internal/metadata/generated_config.go index c41448340..bb5070e70 100644 --- a/receiver/dcgmreceiver/internal/metadata/generated_config.go +++ b/receiver/dcgmreceiver/internal/metadata/generated_config.go @@ -94,7 +94,7 @@ func DefaultMetricsConfig() MetricsConfig { Enabled: true, }, GpuDcgmXidErrors: MetricConfig{ - Enabled: true, + Enabled: false, }, } } diff --git a/receiver/dcgmreceiver/internal/metadata/generated_metrics_test.go b/receiver/dcgmreceiver/internal/metadata/generated_metrics_test.go index d9437a1b6..243ecab2d 100644 --- a/receiver/dcgmreceiver/internal/metadata/generated_metrics_test.go +++ b/receiver/dcgmreceiver/internal/metadata/generated_metrics_test.go @@ -127,7 +127,6 @@ func TestMetricsBuilder(t *testing.T) { allMetricsCount++ mb.RecordGpuDcgmUtilizationDataPoint(ts, 1) - defaultMetricsCount++ allMetricsCount++ mb.RecordGpuDcgmXidErrorsDataPoint(ts, 1, 13) diff --git a/receiver/dcgmreceiver/metadata.yaml b/receiver/dcgmreceiver/metadata.yaml index 2f4047c70..4b3b67e32 100644 --- a/receiver/dcgmreceiver/metadata.yaml +++ b/receiver/dcgmreceiver/metadata.yaml @@ -181,4 +181,4 @@ metrics: aggregation_temporality: cumulative monotonic: true attributes: [gpu.error.xid] - enabled: true + enabled: false diff --git a/receiver/dcgmreceiver/scraper.go b/receiver/dcgmreceiver/scraper.go index d154db7df..54a57e885 100644 --- a/receiver/dcgmreceiver/scraper.go +++ b/receiver/dcgmreceiver/scraper.go @@ -153,8 +153,8 @@ func (s *dcgmScraper) scrape(_ context.Context) (pmetric.Metrics, error) { s.mb.RecordGpuDcgmMemoryBytesUsedDataPoint(now, bytesUsed, metadata.AttributeGpuMemoryStateUsed) } if metric, ok := metrics["DCGM_FI_DEV_FB_RESERVED"]; ok { - bytesFree := 1e6 * metric.asInt64() /* MBy to By */ - s.mb.RecordGpuDcgmMemoryBytesUsedDataPoint(now, bytesFree, 
metadata.AttributeGpuMemoryStateReserved) + bytesReserved := 1e6 * metric.asInt64() /* MBy to By */ + s.mb.RecordGpuDcgmMemoryBytesUsedDataPoint(now, bytesReserved, metadata.AttributeGpuMemoryStateReserved) } if metric, ok := metrics["DCGM_FI_PROF_DRAM_ACTIVE"]; ok { s.mb.RecordGpuDcgmMemoryBandwidthUtilizationDataPoint(now, metric.asFloat64()) From 2ad3400095fb3356af753178a132f35754dc9c69 Mon Sep 17 00:00:00 2001 From: Igor Peshansky Date: Sat, 27 Jul 2024 21:40:34 -0400 Subject: [PATCH 24/38] Need to aggregate per device. --- receiver/dcgmreceiver/scraper.go | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/receiver/dcgmreceiver/scraper.go b/receiver/dcgmreceiver/scraper.go index 54a57e885..13730715b 100644 --- a/receiver/dcgmreceiver/scraper.go +++ b/receiver/dcgmreceiver/scraper.go @@ -39,7 +39,7 @@ type dcgmScraper struct { client *dcgmClient mb *metadata.MetricsBuilder // Aggregate cumulative values from power usage rate. - energyConsumptionFallback float64 + energyConsumptionFallback map[uint]float64 } func newDcgmScraper(config *Config, settings receiver.CreateSettings) *dcgmScraper { @@ -74,7 +74,7 @@ func (s *dcgmScraper) start(_ context.Context, _ component.Host) error { mbConfig.Metrics = s.config.Metrics s.mb = metadata.NewMetricsBuilder( mbConfig, s.settings, metadata.WithStartTime(startTime)) - s.energyConsumptionFallback = 0.0 + s.energyConsumptionFallback = make(map[uint]float64) return nil } @@ -182,8 +182,8 @@ func (s *dcgmScraper) scrape(_ context.Context) (pmetric.Metrics, error) { s.mb.RecordGpuDcgmEnergyConsumptionDataPoint(now, metric.asFloat64()) } else if metric, ok := metrics["DCGM_FI_DEV_POWER_USAGE"]; ok { // fallback powerUsage := metric.asFloat64() * (s.config.CollectionInterval.Seconds()) /* rate to delta */ - s.energyConsumptionFallback += powerUsage /* delta to cumulative */ - s.mb.RecordGpuDcgmEnergyConsumptionDataPoint(now, s.energyConsumptionFallback) + s.energyConsumptionFallback[gpuIndex] += powerUsage 
/* delta to cumulative */ + s.mb.RecordGpuDcgmEnergyConsumptionDataPoint(now, s.energyConsumptionFallback[gpuIndex]) } if metric, ok := metrics["DCGM_FI_DEV_GPU_TEMP"]; ok { s.mb.RecordGpuDcgmTemperatureDataPoint(now, metric.asFloat64()) From e9f33002a237650097648aa0e2ae52ec5300056d Mon Sep 17 00:00:00 2001 From: Igor Peshansky Date: Thu, 25 Jul 2024 13:07:49 -0400 Subject: [PATCH 25/38] Make gpu.dcgm.pcie.io and gpu.dcgm.nvlink.io cumulative. --- receiver/dcgmreceiver/documentation.md | 8 ++--- .../internal/metadata/generated_metrics.go | 4 +-- .../metadata/generated_metrics_test.go | 4 +-- receiver/dcgmreceiver/metadata.yaml | 4 +-- receiver/dcgmreceiver/scraper.go | 32 +++++++++++++------ 5 files changed, 33 insertions(+), 19 deletions(-) diff --git a/receiver/dcgmreceiver/documentation.md b/receiver/dcgmreceiver/documentation.md index 64932e5da..122ded718 100644 --- a/receiver/dcgmreceiver/documentation.md +++ b/receiver/dcgmreceiver/documentation.md @@ -100,13 +100,13 @@ The number of bytes sent over NVLink, not including protocol headers. | Unit | Metric Type | Value Type | Aggregation Temporality | Monotonic | | ---- | ----------- | ---------- | ----------------------- | --------- | -| By | Sum | Int | Delta | true | +| By | Sum | Int | Cumulative | true | #### Attributes | Name | Description | Values | | ---- | ----------- | ------ | -| network.io.direction | Direction of the link traffic, one of [tx, rx]. | Str: ``transmit``, ``receive`` | +| network.io.direction | Direction of the link traffic, one of [transmit, receive]. 
| Str: ``transmit``, ``receive`` | ### gpu.dcgm.pcie.io @@ -114,13 +114,13 @@ The number of bytes sent over the PCIe bus, including both protocol headers and | Unit | Metric Type | Value Type | Aggregation Temporality | Monotonic | | ---- | ----------- | ---------- | ----------------------- | --------- | -| By | Sum | Int | Delta | true | +| By | Sum | Int | Cumulative | true | #### Attributes | Name | Description | Values | | ---- | ----------- | ------ | -| network.io.direction | Direction of the link traffic, one of [tx, rx]. | Str: ``transmit``, ``receive`` | +| network.io.direction | Direction of the link traffic, one of [transmit, receive]. | Str: ``transmit``, ``receive`` | ### gpu.dcgm.pipe.utilization diff --git a/receiver/dcgmreceiver/internal/metadata/generated_metrics.go b/receiver/dcgmreceiver/internal/metadata/generated_metrics.go index 0940036d2..435157c38 100644 --- a/receiver/dcgmreceiver/internal/metadata/generated_metrics.go +++ b/receiver/dcgmreceiver/internal/metadata/generated_metrics.go @@ -595,7 +595,7 @@ func (m *metricGpuDcgmNvlinkIo) init() { m.data.SetUnit("By") m.data.SetEmptySum() m.data.Sum().SetIsMonotonic(true) - m.data.Sum().SetAggregationTemporality(pmetric.AggregationTemporalityDelta) + m.data.Sum().SetAggregationTemporality(pmetric.AggregationTemporalityCumulative) m.data.Sum().DataPoints().EnsureCapacity(m.capacity) } @@ -648,7 +648,7 @@ func (m *metricGpuDcgmPcieIo) init() { m.data.SetUnit("By") m.data.SetEmptySum() m.data.Sum().SetIsMonotonic(true) - m.data.Sum().SetAggregationTemporality(pmetric.AggregationTemporalityDelta) + m.data.Sum().SetAggregationTemporality(pmetric.AggregationTemporalityCumulative) m.data.Sum().DataPoints().EnsureCapacity(m.capacity) } diff --git a/receiver/dcgmreceiver/internal/metadata/generated_metrics_test.go b/receiver/dcgmreceiver/internal/metadata/generated_metrics_test.go index 243ecab2d..e4ba17cd5 100644 --- a/receiver/dcgmreceiver/internal/metadata/generated_metrics_test.go +++ 
b/receiver/dcgmreceiver/internal/metadata/generated_metrics_test.go @@ -275,7 +275,7 @@ func TestMetricsBuilder(t *testing.T) { assert.Equal(t, "The number of bytes sent over NVLink, not including protocol headers.", ms.At(i).Description()) assert.Equal(t, "By", ms.At(i).Unit()) assert.Equal(t, true, ms.At(i).Sum().IsMonotonic()) - assert.Equal(t, pmetric.AggregationTemporalityDelta, ms.At(i).Sum().AggregationTemporality()) + assert.Equal(t, pmetric.AggregationTemporalityCumulative, ms.At(i).Sum().AggregationTemporality()) dp := ms.At(i).Sum().DataPoints().At(0) assert.Equal(t, start, dp.StartTimestamp()) assert.Equal(t, ts, dp.Timestamp()) @@ -292,7 +292,7 @@ func TestMetricsBuilder(t *testing.T) { assert.Equal(t, "The number of bytes sent over the PCIe bus, including both protocol headers and data payloads.", ms.At(i).Description()) assert.Equal(t, "By", ms.At(i).Unit()) assert.Equal(t, true, ms.At(i).Sum().IsMonotonic()) - assert.Equal(t, pmetric.AggregationTemporalityDelta, ms.At(i).Sum().AggregationTemporality()) + assert.Equal(t, pmetric.AggregationTemporalityCumulative, ms.At(i).Sum().AggregationTemporality()) dp := ms.At(i).Sum().DataPoints().At(0) assert.Equal(t, start, dp.StartTimestamp()) assert.Equal(t, ts, dp.Timestamp()) diff --git a/receiver/dcgmreceiver/metadata.yaml b/receiver/dcgmreceiver/metadata.yaml index 4b3b67e32..6201aeb00 100644 --- a/receiver/dcgmreceiver/metadata.yaml +++ b/receiver/dcgmreceiver/metadata.yaml @@ -115,7 +115,7 @@ metrics: unit: By sum: value_type: int - aggregation_temporality: delta + aggregation_temporality: cumulative monotonic: true attributes: [network.io.direction] enabled: true @@ -125,7 +125,7 @@ metrics: unit: By sum: value_type: int - aggregation_temporality: delta + aggregation_temporality: cumulative monotonic: true attributes: [network.io.direction] enabled: true diff --git a/receiver/dcgmreceiver/scraper.go b/receiver/dcgmreceiver/scraper.go index 13730715b..3e6e17a35 100644 --- 
a/receiver/dcgmreceiver/scraper.go +++ b/receiver/dcgmreceiver/scraper.go @@ -38,8 +38,14 @@ type dcgmScraper struct { settings receiver.CreateSettings client *dcgmClient mb *metadata.MetricsBuilder - // Aggregate cumulative values from power usage rate. - energyConsumptionFallback map[uint]float64 + // Aggregate cumulative values. + aggregates struct { + energyConsumptionFallback map[uint]float64 // ...from power usage rate. + pcieTxTotal map[uint]int64 // ...from pcie tx. + pcieRxTotal map[uint]int64 // ...from pcie rx. + nvlinkTxTotal map[uint]int64 // ...from nvlink tx. + nvlinkRxTotal map[uint]int64 // ...from nvlink rx. + } } func newDcgmScraper(config *Config, settings receiver.CreateSettings) *dcgmScraper { @@ -74,7 +80,11 @@ func (s *dcgmScraper) start(_ context.Context, _ component.Host) error { mbConfig.Metrics = s.config.Metrics s.mb = metadata.NewMetricsBuilder( mbConfig, s.settings, metadata.WithStartTime(startTime)) - s.energyConsumptionFallback = make(map[uint]float64) + s.aggregates.energyConsumptionFallback = make(map[uint]float64) + s.aggregates.pcieTxTotal = make(map[uint]int64) + s.aggregates.pcieRxTotal = make(map[uint]int64) + s.aggregates.nvlinkTxTotal = make(map[uint]int64) + s.aggregates.nvlinkRxTotal = make(map[uint]int64) return nil } @@ -164,26 +174,30 @@ func (s *dcgmScraper) scrape(_ context.Context) (pmetric.Metrics, error) { } if metric, ok := metrics["DCGM_FI_PROF_PCIE_TX_BYTES"]; ok { pcieTx := int64(float64(metric.asInt64()) * (s.config.CollectionInterval.Seconds())) /* rate to delta */ - s.mb.RecordGpuDcgmPcieIoDataPoint(now, pcieTx, metadata.AttributeNetworkIoDirectionTransmit) + s.aggregates.pcieTxTotal[gpuIndex] += pcieTx /* delta to cumulative */ + s.mb.RecordGpuDcgmPcieIoDataPoint(now, s.aggregates.pcieTxTotal[gpuIndex], metadata.AttributeNetworkIoDirectionTransmit) } if metric, ok := metrics["DCGM_FI_PROF_PCIE_RX_BYTES"]; ok { pcieRx := int64(float64(metric.asInt64()) * (s.config.CollectionInterval.Seconds())) /* rate to 
delta */ - s.mb.RecordGpuDcgmPcieIoDataPoint(now, pcieRx, metadata.AttributeNetworkIoDirectionReceive) + s.aggregates.pcieRxTotal[gpuIndex] += pcieRx /* delta to cumulative */ + s.mb.RecordGpuDcgmPcieIoDataPoint(now, s.aggregates.pcieRxTotal[gpuIndex], metadata.AttributeNetworkIoDirectionReceive) } if metric, ok := metrics["DCGM_FI_PROF_NVLINK_TX_BYTES"]; ok { nvlinkTx := int64(float64(metric.asInt64()) * (s.config.CollectionInterval.Seconds())) /* rate to delta */ - s.mb.RecordGpuDcgmNvlinkIoDataPoint(now, nvlinkTx, metadata.AttributeNetworkIoDirectionTransmit) + s.aggregates.nvlinkTxTotal[gpuIndex] += nvlinkTx /* delta to cumulative */ + s.mb.RecordGpuDcgmNvlinkIoDataPoint(now, s.aggregates.nvlinkTxTotal[gpuIndex], metadata.AttributeNetworkIoDirectionTransmit) } if metric, ok := metrics["DCGM_FI_PROF_NVLINK_RX_BYTES"]; ok { nvlinkRx := int64(float64(metric.asInt64()) * (s.config.CollectionInterval.Seconds())) /* rate to delta */ - s.mb.RecordGpuDcgmNvlinkIoDataPoint(now, nvlinkRx, metadata.AttributeNetworkIoDirectionReceive) + s.aggregates.nvlinkRxTotal[gpuIndex] += nvlinkRx /* delta to cumulative */ + s.mb.RecordGpuDcgmNvlinkIoDataPoint(now, s.aggregates.nvlinkRxTotal[gpuIndex], metadata.AttributeNetworkIoDirectionReceive) } if metric, ok := metrics["DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION"]; ok { s.mb.RecordGpuDcgmEnergyConsumptionDataPoint(now, metric.asFloat64()) } else if metric, ok := metrics["DCGM_FI_DEV_POWER_USAGE"]; ok { // fallback powerUsage := metric.asFloat64() * (s.config.CollectionInterval.Seconds()) /* rate to delta */ - s.energyConsumptionFallback[gpuIndex] += powerUsage /* delta to cumulative */ - s.mb.RecordGpuDcgmEnergyConsumptionDataPoint(now, s.energyConsumptionFallback[gpuIndex]) + s.aggregates.energyConsumptionFallback[gpuIndex] += powerUsage /* delta to cumulative */ + s.mb.RecordGpuDcgmEnergyConsumptionDataPoint(now, s.aggregates.energyConsumptionFallback[gpuIndex]) } if metric, ok := metrics["DCGM_FI_DEV_GPU_TEMP"]; ok { 
s.mb.RecordGpuDcgmTemperatureDataPoint(now, metric.asFloat64()) From adb4c2e88d1be9af3d17c4f8a8aef664014ed989 Mon Sep 17 00:00:00 2001 From: Igor Peshansky Date: Fri, 26 Jul 2024 00:00:03 -0400 Subject: [PATCH 26/38] Pull in cumulativetodelta processor. --- go.mod | 1 + go.sum | 2 ++ 2 files changed, 3 insertions(+) diff --git a/go.mod b/go.mod index 67fd8397f..b9eb18387 100644 --- a/go.mod +++ b/go.mod @@ -9,6 +9,7 @@ require ( github.com/open-telemetry/opentelemetry-collector-contrib/exporter/googlecloudexporter v0.102.0 github.com/open-telemetry/opentelemetry-collector-contrib/exporter/googlemanagedprometheusexporter v0.102.0 github.com/open-telemetry/opentelemetry-collector-contrib/pkg/pdatautil v0.102.0 + github.com/open-telemetry/opentelemetry-collector-contrib/processor/cumulativetodeltaprocessor v0.102.0 github.com/open-telemetry/opentelemetry-collector-contrib/processor/deltatorateprocessor v0.102.0 github.com/open-telemetry/opentelemetry-collector-contrib/processor/filterprocessor v0.102.0 github.com/open-telemetry/opentelemetry-collector-contrib/processor/groupbyattrsprocessor v0.102.0 diff --git a/go.sum b/go.sum index 05e3e68c5..2a5e13ef8 100644 --- a/go.sum +++ b/go.sum @@ -747,6 +747,8 @@ github.com/open-telemetry/opentelemetry-collector-contrib/pkg/translator/prometh github.com/open-telemetry/opentelemetry-collector-contrib/pkg/translator/prometheusremotewrite v0.102.0/go.mod h1:+Vlutd4t2XluxHYbIAfZiz3z5uWbsbiIUpipV5AnLtk= github.com/open-telemetry/opentelemetry-collector-contrib/pkg/winperfcounters v0.102.0 h1:adfJy3Sev2MaD6+plcmsSecpzy8h4MJT7eXEuif/2Ew= github.com/open-telemetry/opentelemetry-collector-contrib/pkg/winperfcounters v0.102.0/go.mod h1:FJmA939yem9GSEbqjCK6CXVbPfNPFKhvKnn+nWNpWio= +github.com/open-telemetry/opentelemetry-collector-contrib/processor/cumulativetodeltaprocessor v0.102.0 h1:q4VV17TxeMm0FOeyFXAO4gSRf2ZLtKTh0/l5goxhRsY= +github.com/open-telemetry/opentelemetry-collector-contrib/processor/cumulativetodeltaprocessor 
v0.102.0/go.mod h1:FlP/8TVT768TAh5kpvVX3AQ5/UXJWBuSSCFhO3fE+E0= github.com/open-telemetry/opentelemetry-collector-contrib/processor/deltatorateprocessor v0.102.0 h1:mj3t9/FAQZjcZJA2kjgbpz2fSK9yD/pYpmqKEWpHJ1A= github.com/open-telemetry/opentelemetry-collector-contrib/processor/deltatorateprocessor v0.102.0/go.mod h1:IIIjEblgrNISbDY7GPMMto9kEVIf0n9IeJoVru89kfY= github.com/open-telemetry/opentelemetry-collector-contrib/processor/filterprocessor v0.102.0 h1:DaEYlVCn58GtkyYVK0IT/ZMjRFJ+BfmR0p9I0Eq42aQ= From 85bd141818002df6cc5919823c4080f20f30996a Mon Sep 17 00:00:00 2001 From: Igor Peshansky Date: Thu, 25 Jul 2024 23:07:09 -0400 Subject: [PATCH 27/38] Decouple the client from the receiver config. --- receiver/dcgmreceiver/client.go | 111 ++++++----------------- receiver/dcgmreceiver/client_gpu_test.go | 21 ++++- receiver/dcgmreceiver/client_test.go | 2 +- receiver/dcgmreceiver/config.go | 2 - receiver/dcgmreceiver/factory.go | 4 +- receiver/dcgmreceiver/scraper.go | 82 ++++++++++++++++- 6 files changed, 126 insertions(+), 96 deletions(-) diff --git a/receiver/dcgmreceiver/client.go b/receiver/dcgmreceiver/client.go index 9ef6f8762..4ea472d31 100644 --- a/receiver/dcgmreceiver/client.go +++ b/receiver/dcgmreceiver/client.go @@ -33,6 +33,14 @@ const dcgmProfilingFieldsStart = dcgm.Short(1000) var ErrDcgmInitialization = errors.New("error initializing DCGM") +type dcgmClientSettings struct { + endpoint string + pollingInterval time.Duration + retryBlankValues bool + maxRetries int + fields []string +} + type dcgmClient struct { logger *zap.SugaredLogger handleCleanup func() @@ -42,7 +50,7 @@ type dcgmClient struct { devicesModelName []string devicesUUID []string deviceMetricToFailedQueryCount map[string]uint64 - collectionInterval time.Duration + pollingInterval time.Duration retryBlankValues bool maxRetries int } @@ -60,8 +68,8 @@ var dcgmInit = func(args ...string) (func(), error) { var dcgmGetLatestValuesForFields = dcgm.GetLatestValuesForFields -func newClient(config 
*Config, logger *zap.Logger) (*dcgmClient, error) { - dcgmCleanup, err := initializeDcgm(config, logger) +func newClient(settings *dcgmClientSettings, logger *zap.Logger) (*dcgmClient, error) { + dcgmCleanup, err := initializeDcgm(settings.endpoint, logger) if err != nil { return nil, errors.Join(ErrDcgmInitialization, err) } @@ -69,7 +77,7 @@ func newClient(config *Config, logger *zap.Logger) (*dcgmClient, error) { names := make([]string, 0) UUIDs := make([]string, 0) enabledFieldGroup := dcgm.FieldHandle{} - requestedFieldIDs := discoverRequestedFieldIDs(config) + requestedFieldIDs := toFieldIDs(settings.fields) supportedRegularFieldIDs, err := getSupportedRegularFields(requestedFieldIDs, logger) if err != nil { return nil, fmt.Errorf("Error querying supported regular fields: %w", err) @@ -93,7 +101,7 @@ func newClient(config *Config, logger *zap.Logger) (*dcgmClient, error) { if err != nil { return nil, err } - enabledFieldGroup, err = setWatchesOnEnabledFields(config, logger, deviceGroup, enabledFields) + enabledFieldGroup, err = setWatchesOnEnabledFields(settings.pollingInterval, logger, deviceGroup, enabledFields) if err != nil { _ = dcgm.FieldGroupDestroy(enabledFieldGroup) return nil, fmt.Errorf("Unable to set field watches on %w", err) @@ -108,26 +116,26 @@ func newClient(config *Config, logger *zap.Logger) (*dcgmClient, error) { devicesModelName: names, devicesUUID: UUIDs, deviceMetricToFailedQueryCount: make(map[string]uint64), - collectionInterval: config.CollectionInterval, - retryBlankValues: config.retryBlankValues, - maxRetries: config.maxRetries, + pollingInterval: settings.pollingInterval, + retryBlankValues: settings.retryBlankValues, + maxRetries: settings.maxRetries, }, nil } // initializeDcgm tries to initialize a DCGM connection; returns a cleanup func // only if the connection is initialized successfully without error -func initializeDcgm(config *Config, logger *zap.Logger) (func(), error) { +func initializeDcgm(endpoint string, logger 
*zap.Logger) (func(), error) { isSocket := "0" - dcgmCleanup, err := dcgmInit(config.TCPAddrConfig.Endpoint, isSocket) + dcgmCleanup, err := dcgmInit(endpoint, isSocket) if err != nil { - msg := fmt.Sprintf("Unable to connect to DCGM daemon at %s on %v; Is the DCGM daemon running?", config.TCPAddrConfig.Endpoint, err) + msg := fmt.Sprintf("Unable to connect to DCGM daemon at %s on %v; Is the DCGM daemon running?", endpoint, err) logger.Sugar().Warn(msg) if dcgmCleanup != nil { dcgmCleanup() } return nil, fmt.Errorf("%s", msg) } - logger.Sugar().Infof("Connected to DCGM daemon at %s", config.TCPAddrConfig.Endpoint) + logger.Sugar().Infof("Connected to DCGM daemon at %s", endpoint) return dcgmCleanup, nil } @@ -175,76 +183,11 @@ func createDeviceGroup(logger *zap.Logger, deviceIndices []uint) (dcgm.GroupHand return deviceGroup, nil } -func discoverRequestedFieldIDs(config *Config) []dcgm.Short { - requestedFieldIDs := []dcgm.Short{} - if config.Metrics.GpuDcgmUtilization.Enabled { - requestedFieldIDs = append(requestedFieldIDs, dcgm.DCGM_FI["DCGM_FI_PROF_GR_ENGINE_ACTIVE"]) - requestedFieldIDs = append(requestedFieldIDs, dcgm.DCGM_FI["DCGM_FI_DEV_GPU_UTIL"]) // fallback - } - if config.Metrics.GpuDcgmSmUtilization.Enabled { - requestedFieldIDs = append(requestedFieldIDs, dcgm.DCGM_FI["DCGM_FI_PROF_SM_ACTIVE"]) - } - if config.Metrics.GpuDcgmSmOccupancy.Enabled { - requestedFieldIDs = append(requestedFieldIDs, dcgm.DCGM_FI["DCGM_FI_PROF_SM_OCCUPANCY"]) - } - if config.Metrics.GpuDcgmPipeUtilization.Enabled { - requestedFieldIDs = append(requestedFieldIDs, dcgm.DCGM_FI["DCGM_FI_PROF_PIPE_TENSOR_ACTIVE"]) - requestedFieldIDs = append(requestedFieldIDs, dcgm.DCGM_FI["DCGM_FI_PROF_PIPE_FP64_ACTIVE"]) - requestedFieldIDs = append(requestedFieldIDs, dcgm.DCGM_FI["DCGM_FI_PROF_PIPE_FP32_ACTIVE"]) - requestedFieldIDs = append(requestedFieldIDs, dcgm.DCGM_FI["DCGM_FI_PROF_PIPE_FP16_ACTIVE"]) - } - if config.Metrics.GpuDcgmCodecEncoderUtilization.Enabled { - requestedFieldIDs = 
append(requestedFieldIDs, dcgm.DCGM_FI["DCGM_FI_DEV_ENC_UTIL"]) - } - if config.Metrics.GpuDcgmCodecDecoderUtilization.Enabled { - requestedFieldIDs = append(requestedFieldIDs, dcgm.DCGM_FI["DCGM_FI_DEV_DEC_UTIL"]) - } - if config.Metrics.GpuDcgmMemoryBytesUsed.Enabled { - requestedFieldIDs = append(requestedFieldIDs, dcgm.DCGM_FI["DCGM_FI_DEV_FB_FREE"]) - requestedFieldIDs = append(requestedFieldIDs, dcgm.DCGM_FI["DCGM_FI_DEV_FB_USED"]) - requestedFieldIDs = append(requestedFieldIDs, dcgm.DCGM_FI["DCGM_FI_DEV_FB_RESERVED"]) - } - if config.Metrics.GpuDcgmMemoryBandwidthUtilization.Enabled { - requestedFieldIDs = append(requestedFieldIDs, dcgm.DCGM_FI["DCGM_FI_PROF_DRAM_ACTIVE"]) - requestedFieldIDs = append(requestedFieldIDs, dcgm.DCGM_FI["DCGM_FI_DEV_MEM_COPY_UTIL"]) // fallback +func toFieldIDs(fields []string) []dcgm.Short { + requestedFieldIDs := make([]dcgm.Short, len(fields)) + for i, f := range fields { + requestedFieldIDs[i] = dcgm.DCGM_FI[f] } - if config.Metrics.GpuDcgmPcieIo.Enabled { - requestedFieldIDs = append(requestedFieldIDs, dcgm.DCGM_FI["DCGM_FI_PROF_PCIE_TX_BYTES"]) - requestedFieldIDs = append(requestedFieldIDs, dcgm.DCGM_FI["DCGM_FI_PROF_PCIE_RX_BYTES"]) - } - if config.Metrics.GpuDcgmNvlinkIo.Enabled { - requestedFieldIDs = append(requestedFieldIDs, dcgm.DCGM_FI["DCGM_FI_PROF_NVLINK_TX_BYTES"]) - requestedFieldIDs = append(requestedFieldIDs, dcgm.DCGM_FI["DCGM_FI_PROF_NVLINK_RX_BYTES"]) - } - if config.Metrics.GpuDcgmEnergyConsumption.Enabled { - requestedFieldIDs = append(requestedFieldIDs, dcgm.DCGM_FI["DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION"]) - requestedFieldIDs = append(requestedFieldIDs, dcgm.DCGM_FI["DCGM_FI_DEV_POWER_USAGE"]) // fallback - } - if config.Metrics.GpuDcgmTemperature.Enabled { - requestedFieldIDs = append(requestedFieldIDs, dcgm.DCGM_FI["DCGM_FI_DEV_GPU_TEMP"]) - } - if config.Metrics.GpuDcgmClockFrequency.Enabled { - requestedFieldIDs = append(requestedFieldIDs, dcgm.DCGM_FI["DCGM_FI_DEV_SM_CLOCK"]) - } - if 
config.Metrics.GpuDcgmClockThrottleDurationTime.Enabled { - requestedFieldIDs = append(requestedFieldIDs, dcgm.DCGM_FI["DCGM_FI_DEV_POWER_VIOLATION"]) - requestedFieldIDs = append(requestedFieldIDs, dcgm.DCGM_FI["DCGM_FI_DEV_THERMAL_VIOLATION"]) - requestedFieldIDs = append(requestedFieldIDs, dcgm.DCGM_FI["DCGM_FI_DEV_SYNC_BOOST_VIOLATION"]) - requestedFieldIDs = append(requestedFieldIDs, dcgm.DCGM_FI["DCGM_FI_DEV_BOARD_LIMIT_VIOLATION"]) - requestedFieldIDs = append(requestedFieldIDs, dcgm.DCGM_FI["DCGM_FI_DEV_LOW_UTIL_VIOLATION"]) - requestedFieldIDs = append(requestedFieldIDs, dcgm.DCGM_FI["DCGM_FI_DEV_RELIABILITY_VIOLATION"]) - requestedFieldIDs = append(requestedFieldIDs, dcgm.DCGM_FI["DCGM_FI_DEV_TOTAL_APP_CLOCKS_VIOLATION"]) - requestedFieldIDs = append(requestedFieldIDs, dcgm.DCGM_FI["DCGM_FI_DEV_TOTAL_BASE_CLOCKS_VIOLATION"]) - } - if config.Metrics.GpuDcgmEccErrors.Enabled { - requestedFieldIDs = append(requestedFieldIDs, dcgm.DCGM_FI["DCGM_FI_DEV_ECC_SBE_VOL_TOTAL"]) - requestedFieldIDs = append(requestedFieldIDs, dcgm.DCGM_FI["DCGM_FI_DEV_ECC_DBE_VOL_TOTAL"]) - } - if config.Metrics.GpuDcgmXidErrors.Enabled { - // requestedFieldIDs = append(requestedFieldIDs, dcgm.DCGM_FI[""]) - func() {}() // no-op - } - return requestedFieldIDs } @@ -409,12 +352,12 @@ func setWatchesOnFields(logger *zap.Logger, deviceGroup dcgm.GroupHandle, fieldI return fieldGroup, nil } -func setWatchesOnEnabledFields(config *Config, logger *zap.Logger, deviceGroup dcgm.GroupHandle, enabledFieldIDs []dcgm.Short) (dcgm.FieldHandle, error) { +func setWatchesOnEnabledFields(pollingInterval time.Duration, logger *zap.Logger, deviceGroup dcgm.GroupHandle, enabledFieldIDs []dcgm.Short) (dcgm.FieldHandle, error) { return setWatchesOnFields(logger, deviceGroup, enabledFieldIDs, dcgmWatchParams{ // Note: Add random suffix to avoid conflict amongnst any parallel collectors fieldGroupName: fmt.Sprintf("google-cloud-ops-agent-metrics-%d", randSource.Intn(10000)), // Note: DCGM retained samples 
= Max(maxKeepSamples, maxKeepTime/updateFreq) - updateFreqUs: int64(config.CollectionInterval / time.Microsecond), + updateFreqUs: int64(pollingInterval / time.Microsecond), maxKeepTime: 600.0, /* 10 min */ maxKeepSamples: int32(15), }) @@ -448,7 +391,7 @@ func (client *dcgmClient) collectDeviceMetrics() (map[uint][]dcgmMetric, error) gpuMetrics[gpuIndex], retry = client.appendMetric(gpuMetrics[gpuIndex], gpuIndex, fieldValues) if retry { client.logger.Warnf("Retrying poll of DCGM daemon for GPU %d; attempt %d", gpuIndex, i+1) - time.Sleep(client.collectionInterval) + time.Sleep(client.pollingInterval) continue } client.logger.Debugf("Successful poll of DCGM daemon for GPU %d", gpuIndex) diff --git a/receiver/dcgmreceiver/client_gpu_test.go b/receiver/dcgmreceiver/client_gpu_test.go index 42d9b7b98..a224e2c3f 100644 --- a/receiver/dcgmreceiver/client_gpu_test.go +++ b/receiver/dcgmreceiver/client_gpu_test.go @@ -47,17 +47,28 @@ type modelSupportedFields struct { UnsupportedFields []string `yaml:"unsupported_fields"` } +func defaultClientSettings() *dcgmClientSettings { + requestedFields := discoverRequestedFields(createDefaultConfig().(*Config)) + return &dcgmClientSettings{ + endpoint: defaultEndpoint, + pollingInterval: 10 * time.Second, + retryBlankValues: true, + maxRetries: 5, + fields: requestedFields, + } +} + // TestSupportedFieldsWithGolden tests getSupportedRegularFields() and // getSupportedProfilingFields() against the golden files for the current GPU // model func TestSupportedFieldsWithGolden(t *testing.T) { - config := createDefaultConfig().(*Config) - client, err := newClient(config, zaptest.NewLogger(t)) + clientSettings := defaultClientSettings() + client, err := newClient(clientSettings, zaptest.NewLogger(t)) require.Nil(t, err, "cannot initialize DCGM. 
Install and run DCGM before running tests.") require.NotEmpty(t, client.devicesModelName) gpuModel := client.getDeviceModelName(0) - allFields := discoverRequestedFieldIDs(config) + allFields := toFieldIDs(clientSettings.fields) supportedRegularFields, err := getSupportedRegularFields(allFields, zaptest.NewLogger(t)) require.Nil(t, err) supportedProfilingFields, err := getSupportedProfilingFields() @@ -119,7 +130,7 @@ func getModelGoldenFilePath(t *testing.T, model string) string { } func TestNewDcgmClientWithGpuPresent(t *testing.T) { - client, err := newClient(createDefaultConfig().(*Config), zaptest.NewLogger(t)) + client, err := newClient(defaultClientSettings(), zaptest.NewLogger(t)) require.Nil(t, err, "cannot initialize DCGM. Install and run DCGM before running tests.") assert.NotNil(t, client) @@ -133,7 +144,7 @@ func TestNewDcgmClientWithGpuPresent(t *testing.T) { } func TestCollectGpuProfilingMetrics(t *testing.T) { - client, err := newClient(createDefaultConfig().(*Config), zaptest.NewLogger(t)) + client, err := newClient(defaultClientSettings(), zaptest.NewLogger(t)) require.Nil(t, err, "cannot initialize DCGM. 
Install and run DCGM before running tests.") expectedMetrics := LoadExpectedMetrics(t, client.devicesModelName[0]) var maxCollectionInterval = 60 * time.Second diff --git a/receiver/dcgmreceiver/client_test.go b/receiver/dcgmreceiver/client_test.go index b113ed70f..010929a09 100644 --- a/receiver/dcgmreceiver/client_test.go +++ b/receiver/dcgmreceiver/client_test.go @@ -44,7 +44,7 @@ func TestNewDcgmClientOnInitializationError(t *testing.T) { return nil }))) - client, err := newClient(createDefaultConfig().(*Config), logger) + client, err := newClient(&dcgmClientSettings{endpoint: defaultEndpoint}, logger) assert.Equal(t, seenDcgmConnectionWarning, true) assert.True(t, errors.Is(err, ErrDcgmInitialization)) assert.Regexp(t, ".*Unable to connect.*", err) diff --git a/receiver/dcgmreceiver/config.go b/receiver/dcgmreceiver/config.go index 96ce31cf8..4d6be25b2 100644 --- a/receiver/dcgmreceiver/config.go +++ b/receiver/dcgmreceiver/config.go @@ -30,6 +30,4 @@ type Config struct { scraperhelper.ControllerConfig `mapstructure:",squash"` confignet.TCPAddrConfig `mapstructure:",squash"` Metrics metadata.MetricsConfig `mapstructure:"metrics"` - retryBlankValues bool - maxRetries int } diff --git a/receiver/dcgmreceiver/factory.go b/receiver/dcgmreceiver/factory.go index 49b057cd1..9682eb46a 100644 --- a/receiver/dcgmreceiver/factory.go +++ b/receiver/dcgmreceiver/factory.go @@ -39,8 +39,6 @@ func createDefaultConfig() component.Config { TCPAddrConfig: confignet.TCPAddrConfig{ Endpoint: defaultEndpoint, }, - Metrics: metadata.DefaultMetricsConfig(), - retryBlankValues: true, - maxRetries: 5, + Metrics: metadata.DefaultMetricsConfig(), } } diff --git a/receiver/dcgmreceiver/scraper.go b/receiver/dcgmreceiver/scraper.go index 3e6e17a35..a8f892f5f 100644 --- a/receiver/dcgmreceiver/scraper.go +++ b/receiver/dcgmreceiver/scraper.go @@ -60,7 +60,14 @@ func (s *dcgmScraper) initClient() error { if s.client != nil { return nil } - client, err := newClient(s.config, 
s.settings.Logger) + clientSettings := &dcgmClientSettings{ + endpoint: s.config.TCPAddrConfig.Endpoint, + pollingInterval: s.config.CollectionInterval, + fields: discoverRequestedFields(s.config), + retryBlankValues: true, + maxRetries: 5, + } + client, err := newClient(clientSettings, s.settings.Logger) if err != nil { s.settings.Logger.Sugar().Warn(err) if errors.Is(err, ErrDcgmInitialization) { @@ -96,6 +103,79 @@ func (s *dcgmScraper) stop(_ context.Context) error { return nil } +func discoverRequestedFields(config *Config) []string { + requestedFields := []string{} + if config.Metrics.GpuDcgmUtilization.Enabled { + requestedFields = append(requestedFields, "DCGM_FI_PROF_GR_ENGINE_ACTIVE") + requestedFields = append(requestedFields, "DCGM_FI_DEV_GPU_UTIL") // fallback + } + if config.Metrics.GpuDcgmSmUtilization.Enabled { + requestedFields = append(requestedFields, "DCGM_FI_PROF_SM_ACTIVE") + } + if config.Metrics.GpuDcgmSmOccupancy.Enabled { + requestedFields = append(requestedFields, "DCGM_FI_PROF_SM_OCCUPANCY") + } + if config.Metrics.GpuDcgmPipeUtilization.Enabled { + requestedFields = append(requestedFields, "DCGM_FI_PROF_PIPE_TENSOR_ACTIVE") + requestedFields = append(requestedFields, "DCGM_FI_PROF_PIPE_FP64_ACTIVE") + requestedFields = append(requestedFields, "DCGM_FI_PROF_PIPE_FP32_ACTIVE") + requestedFields = append(requestedFields, "DCGM_FI_PROF_PIPE_FP16_ACTIVE") + } + if config.Metrics.GpuDcgmCodecEncoderUtilization.Enabled { + requestedFields = append(requestedFields, "DCGM_FI_DEV_ENC_UTIL") + } + if config.Metrics.GpuDcgmCodecDecoderUtilization.Enabled { + requestedFields = append(requestedFields, "DCGM_FI_DEV_DEC_UTIL") + } + if config.Metrics.GpuDcgmMemoryBytesUsed.Enabled { + requestedFields = append(requestedFields, "DCGM_FI_DEV_FB_FREE") + requestedFields = append(requestedFields, "DCGM_FI_DEV_FB_USED") + requestedFields = append(requestedFields, "DCGM_FI_DEV_FB_RESERVED") + } + if config.Metrics.GpuDcgmMemoryBandwidthUtilization.Enabled { + 
requestedFields = append(requestedFields, "DCGM_FI_PROF_DRAM_ACTIVE") + requestedFields = append(requestedFields, "DCGM_FI_DEV_MEM_COPY_UTIL") // fallback + } + if config.Metrics.GpuDcgmPcieIo.Enabled { + requestedFields = append(requestedFields, "DCGM_FI_PROF_PCIE_TX_BYTES") + requestedFields = append(requestedFields, "DCGM_FI_PROF_PCIE_RX_BYTES") + } + if config.Metrics.GpuDcgmNvlinkIo.Enabled { + requestedFields = append(requestedFields, "DCGM_FI_PROF_NVLINK_TX_BYTES") + requestedFields = append(requestedFields, "DCGM_FI_PROF_NVLINK_RX_BYTES") + } + if config.Metrics.GpuDcgmEnergyConsumption.Enabled { + requestedFields = append(requestedFields, "DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION") + requestedFields = append(requestedFields, "DCGM_FI_DEV_POWER_USAGE") // fallback + } + if config.Metrics.GpuDcgmTemperature.Enabled { + requestedFields = append(requestedFields, "DCGM_FI_DEV_GPU_TEMP") + } + if config.Metrics.GpuDcgmClockFrequency.Enabled { + requestedFields = append(requestedFields, "DCGM_FI_DEV_SM_CLOCK") + } + if config.Metrics.GpuDcgmClockThrottleDurationTime.Enabled { + requestedFields = append(requestedFields, "DCGM_FI_DEV_POWER_VIOLATION") + requestedFields = append(requestedFields, "DCGM_FI_DEV_THERMAL_VIOLATION") + requestedFields = append(requestedFields, "DCGM_FI_DEV_SYNC_BOOST_VIOLATION") + requestedFields = append(requestedFields, "DCGM_FI_DEV_BOARD_LIMIT_VIOLATION") + requestedFields = append(requestedFields, "DCGM_FI_DEV_LOW_UTIL_VIOLATION") + requestedFields = append(requestedFields, "DCGM_FI_DEV_RELIABILITY_VIOLATION") + requestedFields = append(requestedFields, "DCGM_FI_DEV_TOTAL_APP_CLOCKS_VIOLATION") + requestedFields = append(requestedFields, "DCGM_FI_DEV_TOTAL_BASE_CLOCKS_VIOLATION") + } + if config.Metrics.GpuDcgmEccErrors.Enabled { + requestedFields = append(requestedFields, "DCGM_FI_DEV_ECC_SBE_VOL_TOTAL") + requestedFields = append(requestedFields, "DCGM_FI_DEV_ECC_DBE_VOL_TOTAL") + } + if config.Metrics.GpuDcgmXidErrors.Enabled { + // 
requestedFields = append(requestedFields, "") + func() {}() // no-op + } + + return requestedFields +} + func (s *dcgmScraper) scrape(_ context.Context) (pmetric.Metrics, error) { err := s.initClient() if err != nil || s.client == nil { From b82fa9b8a21fa6542dc25baa087657fc43fdab11 Mon Sep 17 00:00:00 2001 From: Igor Peshansky Date: Thu, 25 Jul 2024 23:49:29 -0400 Subject: [PATCH 28/38] Store the typed value directly in dcgmMetric; avoid unsafe. --- receiver/dcgmreceiver/client.go | 19 ++++++--- receiver/dcgmreceiver/client_gpu_test.go | 51 ++++++++++++++++-------- receiver/dcgmreceiver/util.go | 13 +----- receiver/dcgmreceiver/util_test.go | 40 ------------------- 4 files changed, 50 insertions(+), 73 deletions(-) delete mode 100644 receiver/dcgmreceiver/util_test.go diff --git a/receiver/dcgmreceiver/client.go b/receiver/dcgmreceiver/client.go index 4ea472d31..796dc04d8 100644 --- a/receiver/dcgmreceiver/client.go +++ b/receiver/dcgmreceiver/client.go @@ -58,7 +58,7 @@ type dcgmClient struct { type dcgmMetric struct { timestamp int64 name string - value [4096]byte + value interface{} } // Can't pass argument dcgm.mode because it is unexported @@ -388,7 +388,7 @@ func (client *dcgmClient) collectDeviceMetrics() (map[uint][]dcgmMetric, error) for i := 0; retry && i < client.maxRetries; i++ { fieldValues, pollErr := dcgmGetLatestValuesForFields(gpuIndex, client.enabledFieldIDs) if pollErr == nil { - gpuMetrics[gpuIndex], retry = client.appendMetric(gpuMetrics[gpuIndex], gpuIndex, fieldValues) + gpuMetrics[gpuIndex], retry = client.appendMetrics(gpuMetrics[gpuIndex], gpuIndex, fieldValues) if retry { client.logger.Warnf("Retrying poll of DCGM daemon for GPU %d; attempt %d", gpuIndex, i+1) time.Sleep(client.pollingInterval) @@ -406,7 +406,7 @@ func (client *dcgmClient) collectDeviceMetrics() (map[uint][]dcgmMetric, error) return gpuMetrics, err.Combine() } -func (client *dcgmClient) appendMetric(gpuMetrics []dcgmMetric, gpuIndex uint, fieldValues []dcgm.FieldValue_v1) 
(result []dcgmMetric, retry bool) { +func (client *dcgmClient) appendMetrics(gpuMetrics []dcgmMetric, gpuIndex uint, fieldValues []dcgm.FieldValue_v1) (result []dcgmMetric, retry bool) { retry = false for _, fieldValue := range fieldValues { dcgmName := dcgmIDToName[dcgm.Short(fieldValue.FieldId)] @@ -419,13 +419,20 @@ func (client *dcgmClient) appendMetric(gpuMetrics []dcgmMetric, gpuIndex uint, f continue } + var metricValue interface{} switch fieldValue.FieldType { case dcgm.DCGM_FT_DOUBLE: - client.logger.Debugf("Discovered (ts %d gpu %d) %s = %.3f (f64)", fieldValue.Ts, gpuIndex, dcgmName, fieldValue.Float64()) + value := fieldValue.Float64() + client.logger.Debugf("Discovered (ts %d gpu %d) %s = %.3f (f64)", fieldValue.Ts, gpuIndex, dcgmName, value) + metricValue = value case dcgm.DCGM_FT_INT64: - client.logger.Debugf("Discovered (ts %d gpu %d) %s = %d (i64)", fieldValue.Ts, gpuIndex, dcgmName, fieldValue.Int64()) + value := fieldValue.Int64() + client.logger.Debugf("Discovered (ts %d gpu %d) %s = %d (i64)", fieldValue.Ts, gpuIndex, dcgmName, value) + metricValue = value + default: + metricValue = fieldValue.Value } - gpuMetrics = append(gpuMetrics, dcgmMetric{fieldValue.Ts, dcgmName, fieldValue.Value}) + gpuMetrics = append(gpuMetrics, dcgmMetric{fieldValue.Ts, dcgmName, metricValue}) } return gpuMetrics, retry diff --git a/receiver/dcgmreceiver/client_gpu_test.go b/receiver/dcgmreceiver/client_gpu_test.go index a224e2c3f..c53a619c7 100644 --- a/receiver/dcgmreceiver/client_gpu_test.go +++ b/receiver/dcgmreceiver/client_gpu_test.go @@ -153,6 +153,17 @@ func TestCollectGpuProfilingMetrics(t *testing.T) { after := time.Now().UnixMicro() assert.Nil(t, err) + asFloat64 := func(metric dcgmMetric) float64 { + require.IsTypef(t, float64(0), metric.value, "Unexpected metric type: %T", metric.value) + value, _ := metric.value.(float64) + return value + } + asInt64 := func(metric dcgmMetric) int64 { + require.IsTypef(t, int64(0), metric.value, "Unexpected metric type: 
%T", metric.value) + value, _ := metric.value.(int64) + return value + } + seenMetric := make(map[string]bool) assert.GreaterOrEqual(t, len(deviceMetrics), 0) assert.LessOrEqual(t, len(deviceMetrics), 32) @@ -174,8 +185,9 @@ func TestCollectGpuProfilingMetrics(t *testing.T) { case "DCGM_FI_PROF_PIPE_FP16_ACTIVE": fallthrough case "DCGM_FI_PROF_DRAM_ACTIVE": - assert.GreaterOrEqual(t, metric.asFloat64(), float64(0.0)) - assert.LessOrEqual(t, metric.asFloat64(), float64(1.0)) + value := asFloat64(metric) + assert.GreaterOrEqual(t, value, float64(0.0)) + assert.LessOrEqual(t, value, float64(1.0)) case "DCGM_FI_DEV_GPU_UTIL": fallthrough case "DCGM_FI_DEV_MEM_COPY_UTIL": @@ -183,16 +195,18 @@ func TestCollectGpuProfilingMetrics(t *testing.T) { case "DCGM_FI_DEV_ENC_UTIL": fallthrough case "DCGM_FI_DEV_DEC_UTIL": - assert.GreaterOrEqual(t, metric.asInt64(), int64(0)) - assert.LessOrEqual(t, metric.asInt64(), int64(100)) + value := asInt64(metric) + assert.GreaterOrEqual(t, value, int64(0)) + assert.LessOrEqual(t, value, int64(100)) case "DCGM_FI_DEV_FB_FREE": fallthrough case "DCGM_FI_DEV_FB_USED": fallthrough case "DCGM_FI_DEV_FB_RESERVED": // arbitrary max of 10 TiB - assert.GreaterOrEqual(t, metric.asInt64(), int64(0)) - assert.LessOrEqual(t, metric.asInt64(), int64(10485760)) + value := asInt64(metric) + assert.GreaterOrEqual(t, value, int64(0)) + assert.LessOrEqual(t, value, int64(10485760)) case "DCGM_FI_PROF_PCIE_TX_BYTES": fallthrough case "DCGM_FI_PROF_PCIE_RX_BYTES": @@ -201,8 +215,9 @@ func TestCollectGpuProfilingMetrics(t *testing.T) { fallthrough case "DCGM_FI_PROF_NVLINK_RX_BYTES": // arbitrary max of 10 TiB/sec - assert.GreaterOrEqual(t, metric.asInt64(), int64(0)) - assert.LessOrEqual(t, metric.asInt64(), int64(10995116277760)) + value := asInt64(metric) + assert.GreaterOrEqual(t, value, int64(0)) + assert.LessOrEqual(t, value, int64(10995116277760)) case "DCGM_FI_DEV_BOARD_LIMIT_VIOLATION": fallthrough case "DCGM_FI_DEV_LOW_UTIL_VIOLATION": @@ -218,22 
+233,26 @@ func TestCollectGpuProfilingMetrics(t *testing.T) { case "DCGM_FI_DEV_TOTAL_APP_CLOCKS_VIOLATION": fallthrough case "DCGM_FI_DEV_TOTAL_BASE_CLOCKS_VIOLATION": - assert.GreaterOrEqual(t, metric.asInt64(), int64(0)) - assert.LessOrEqual(t, metric.asInt64(), time.Now().UnixMicro()) + value := asInt64(metric) + assert.GreaterOrEqual(t, value, int64(0)) + assert.LessOrEqual(t, value, time.Now().UnixMicro()) case "DCGM_FI_DEV_ECC_DBE_VOL_TOTAL": fallthrough case "DCGM_FI_DEV_ECC_SBE_VOL_TOTAL": // arbitrary max of 100000000 errors - assert.GreaterOrEqual(t, metric.asInt64(), int64(0)) - assert.LessOrEqual(t, metric.asInt64(), int64(100000000)) + value := asInt64(metric) + assert.GreaterOrEqual(t, value, int64(0)) + assert.LessOrEqual(t, value, int64(100000000)) case "DCGM_FI_DEV_GPU_TEMP": // arbitrary max of 100000 °C - assert.GreaterOrEqual(t, metric.asFloat64(), float64(0.0)) - assert.LessOrEqual(t, metric.asFloat64(), float64(100000.0)) + value := asFloat64(metric) + assert.GreaterOrEqual(t, value, float64(0.0)) + assert.LessOrEqual(t, value, float64(100000.0)) case "DCGM_FI_DEV_SM_CLOCK": // arbitrary max of 100000 MHz - assert.GreaterOrEqual(t, metric.asFloat64(), float64(0.0)) - assert.LessOrEqual(t, metric.asFloat64(), float64(100000.0)) + value := asFloat64(metric) + assert.GreaterOrEqual(t, value, float64(0.0)) + assert.LessOrEqual(t, value, float64(100000.0)) case "DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION": // TODO case "DCGM_FI_DEV_POWER_USAGE": diff --git a/receiver/dcgmreceiver/util.go b/receiver/dcgmreceiver/util.go index 2c3457fd4..1f6de1233 100644 --- a/receiver/dcgmreceiver/util.go +++ b/receiver/dcgmreceiver/util.go @@ -19,7 +19,6 @@ package dcgmreceiver import ( "fmt" - "unsafe" "github.com/NVIDIA/go-dcgm/pkg/dcgm" ) @@ -32,20 +31,12 @@ var ( errUnexpectedType = fmt.Errorf("unexpected data type") ) -func (m *dcgmMetric) setFloat64(val float64) { - *(*float64)(unsafe.Pointer(&m.value[0])) = val -} - func (m *dcgmMetric) asFloat64() float64 { - 
return *(*float64)(unsafe.Pointer(&m.value[0])) -} - -func (m *dcgmMetric) setInt64(val int64) { - *(*int64)(unsafe.Pointer(&m.value[0])) = val + return m.value.(float64) } func (m *dcgmMetric) asInt64() int64 { - return *(*int64)(unsafe.Pointer(&m.value[0])) + return m.value.(int64) } func isValidValue(fieldValue dcgm.FieldValue_v1) error { diff --git a/receiver/dcgmreceiver/util_test.go b/receiver/dcgmreceiver/util_test.go deleted file mode 100644 index daeace14d..000000000 --- a/receiver/dcgmreceiver/util_test.go +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright 2023 Google LLC -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -//go:build gpu -// +build gpu - -package dcgmreceiver - -import ( - "testing" - - "github.com/stretchr/testify/require" -) - -func TestDcgmMetricSetFloat64(t *testing.T) { - var metric dcgmMetric - metric.setFloat64(23.0) - require.Equal(t, metric.asFloat64(), 23.0) - metric.setFloat64(43.0) - require.Equal(t, metric.asFloat64(), 43.0) -} - -func TestDcgmMetricSetInt64(t *testing.T) { - var metric dcgmMetric - metric.setInt64(23) - require.Equal(t, metric.asInt64(), int64(23)) - metric.setInt64(43) - require.Equal(t, metric.asInt64(), int64(43)) -} From ccfd16e5b172a7aca11615277d04baebf19a18d1 Mon Sep 17 00:00:00 2001 From: Igor Peshansky Date: Fri, 26 Jul 2024 04:02:45 -0400 Subject: [PATCH 29/38] Oops, temperature and clock frequency and energy consumption are int64. 
--- receiver/dcgmreceiver/client_gpu_test.go | 16 ++++++++++------ receiver/dcgmreceiver/scraper.go | 6 +++--- 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/receiver/dcgmreceiver/client_gpu_test.go b/receiver/dcgmreceiver/client_gpu_test.go index c53a619c7..e6df9924b 100644 --- a/receiver/dcgmreceiver/client_gpu_test.go +++ b/receiver/dcgmreceiver/client_gpu_test.go @@ -245,17 +245,21 @@ func TestCollectGpuProfilingMetrics(t *testing.T) { assert.LessOrEqual(t, value, int64(100000000)) case "DCGM_FI_DEV_GPU_TEMP": // arbitrary max of 100000 °C - value := asFloat64(metric) - assert.GreaterOrEqual(t, value, float64(0.0)) - assert.LessOrEqual(t, value, float64(100000.0)) + value := asInt64(metric) + assert.GreaterOrEqual(t, value, int64(0)) + assert.LessOrEqual(t, value, int64(100000)) case "DCGM_FI_DEV_SM_CLOCK": // arbitrary max of 100000 MHz - value := asFloat64(metric) - assert.GreaterOrEqual(t, value, float64(0.0)) - assert.LessOrEqual(t, value, float64(100000.0)) + value := asInt64(metric) + assert.GreaterOrEqual(t, value, int64(0)) + assert.LessOrEqual(t, value, int64(100000)) case "DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION": + value := asInt64(metric) + assert.GreaterOrEqual(t, value, int64(0)) // TODO case "DCGM_FI_DEV_POWER_USAGE": + value := asFloat64(metric) + assert.GreaterOrEqual(t, value, float64(0.0)) // TODO default: t.Errorf("Unexpected metric '%s'", metric.name) diff --git a/receiver/dcgmreceiver/scraper.go b/receiver/dcgmreceiver/scraper.go index a8f892f5f..47cafea85 100644 --- a/receiver/dcgmreceiver/scraper.go +++ b/receiver/dcgmreceiver/scraper.go @@ -273,17 +273,17 @@ func (s *dcgmScraper) scrape(_ context.Context) (pmetric.Metrics, error) { s.mb.RecordGpuDcgmNvlinkIoDataPoint(now, s.aggregates.nvlinkRxTotal[gpuIndex], metadata.AttributeNetworkIoDirectionReceive) } if metric, ok := metrics["DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION"]; ok { - s.mb.RecordGpuDcgmEnergyConsumptionDataPoint(now, metric.asFloat64()) + 
s.mb.RecordGpuDcgmEnergyConsumptionDataPoint(now, float64(metric.asInt64())) } else if metric, ok := metrics["DCGM_FI_DEV_POWER_USAGE"]; ok { // fallback powerUsage := metric.asFloat64() * (s.config.CollectionInterval.Seconds()) /* rate to delta */ s.aggregates.energyConsumptionFallback[gpuIndex] += powerUsage /* delta to cumulative */ s.mb.RecordGpuDcgmEnergyConsumptionDataPoint(now, s.aggregates.energyConsumptionFallback[gpuIndex]) } if metric, ok := metrics["DCGM_FI_DEV_GPU_TEMP"]; ok { - s.mb.RecordGpuDcgmTemperatureDataPoint(now, metric.asFloat64()) + s.mb.RecordGpuDcgmTemperatureDataPoint(now, float64(metric.asInt64())) } if metric, ok := metrics["DCGM_FI_DEV_SM_CLOCK"]; ok { - clockFreq := 1e6 * metric.asFloat64() /* MHz to Hz */ + clockFreq := 1e6 * float64(metric.asInt64()) /* MHz to Hz */ s.mb.RecordGpuDcgmClockFrequencyDataPoint(now, clockFreq) } if metric, ok := metrics["DCGM_FI_DEV_POWER_VIOLATION"]; ok { From e7ffb3ed1978f67147693231ecf0bef1e22fbd81 Mon Sep 17 00:00:00 2001 From: Igor Peshansky Date: Fri, 26 Jul 2024 15:32:25 -0400 Subject: [PATCH 30/38] Scale energy consumption properly. 
--- receiver/dcgmreceiver/scraper.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/receiver/dcgmreceiver/scraper.go b/receiver/dcgmreceiver/scraper.go index 47cafea85..a063fdeb4 100644 --- a/receiver/dcgmreceiver/scraper.go +++ b/receiver/dcgmreceiver/scraper.go @@ -273,7 +273,8 @@ func (s *dcgmScraper) scrape(_ context.Context) (pmetric.Metrics, error) { s.mb.RecordGpuDcgmNvlinkIoDataPoint(now, s.aggregates.nvlinkRxTotal[gpuIndex], metadata.AttributeNetworkIoDirectionReceive) } if metric, ok := metrics["DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION"]; ok { - s.mb.RecordGpuDcgmEnergyConsumptionDataPoint(now, float64(metric.asInt64())) + energyUsed := float64(metric.asInt64()) / 1e3 /* mJ to J */ + s.mb.RecordGpuDcgmEnergyConsumptionDataPoint(now, energyUsed) } else if metric, ok := metrics["DCGM_FI_DEV_POWER_USAGE"]; ok { // fallback powerUsage := metric.asFloat64() * (s.config.CollectionInterval.Seconds()) /* rate to delta */ s.aggregates.energyConsumptionFallback[gpuIndex] += powerUsage /* delta to cumulative */ From e4a0026fdfb006b89c2ce27ed8f708df8be8c6a0 Mon Sep 17 00:00:00 2001 From: Igor Peshansky Date: Fri, 26 Jul 2024 04:22:13 -0400 Subject: [PATCH 31/38] More debug logging. 
--- receiver/dcgmreceiver/client.go | 2 ++ receiver/dcgmreceiver/scraper.go | 7 +++++++ 2 files changed, 9 insertions(+) diff --git a/receiver/dcgmreceiver/client.go b/receiver/dcgmreceiver/client.go index 796dc04d8..a806643ea 100644 --- a/receiver/dcgmreceiver/client.go +++ b/receiver/dcgmreceiver/client.go @@ -384,9 +384,11 @@ func (client *dcgmClient) collectDeviceMetrics() (map[uint][]dcgmMetric, error) var err scrapererror.ScrapeErrors gpuMetrics := make(map[uint][]dcgmMetric) for _, gpuIndex := range client.deviceIndices { + client.logger.Debugf("Polling DCGM daemon for GPU %d", gpuIndex) retry := true for i := 0; retry && i < client.maxRetries; i++ { fieldValues, pollErr := dcgmGetLatestValuesForFields(gpuIndex, client.enabledFieldIDs) + client.logger.Debugf("Got %d field values", len(fieldValues)) if pollErr == nil { gpuMetrics[gpuIndex], retry = client.appendMetrics(gpuMetrics[gpuIndex], gpuIndex, fieldValues) if retry { diff --git a/receiver/dcgmreceiver/scraper.go b/receiver/dcgmreceiver/scraper.go index a063fdeb4..7929a4f53 100644 --- a/receiver/dcgmreceiver/scraper.go +++ b/receiver/dcgmreceiver/scraper.go @@ -182,7 +182,13 @@ func (s *dcgmScraper) scrape(_ context.Context) (pmetric.Metrics, error) { return s.mb.Emit(), err } + s.settings.Logger.Sugar().Debug("Client created, collecting metrics") deviceMetrics, err := s.client.collectDeviceMetrics() + if err != nil { + s.settings.Logger.Sugar().Warnf("Metrics not collected; err=%v", err) + return s.mb.Emit(), err + } + s.settings.Logger.Sugar().Debugf("Metrics collected: %d", len(deviceMetrics)) now := pcommon.NewTimestampFromTime(time.Now()) for gpuIndex, gpuMetrics := range deviceMetrics { @@ -190,6 +196,7 @@ func (s *dcgmScraper) scrape(_ context.Context) (pmetric.Metrics, error) { for _, metric := range gpuMetrics { metricsByName[metric.name] = append(metricsByName[metric.name], metric) } + s.settings.Logger.Sugar().Debugf("Got %d unique metrics: %v", len(metricsByName), metricsByName) metrics := 
make(map[string]dcgmMetric) for name, points := range metricsByName { slices.SortStableFunc(points, func(a, b dcgmMetric) int { From e673697f8eb6c944eec12b712c283ded158e595e Mon Sep 17 00:00:00 2001 From: Igor Peshansky Date: Fri, 26 Jul 2024 19:53:21 -0400 Subject: [PATCH 32/38] Implement a rateIntegrator struct. --- receiver/dcgmreceiver/scraper.go | 56 +++++++++++--------- receiver/dcgmreceiver/util.go | 57 ++++++++++++++++++++ receiver/dcgmreceiver/util_test.go | 84 ++++++++++++++++++++++++++++++ 3 files changed, 172 insertions(+), 25 deletions(-) create mode 100644 receiver/dcgmreceiver/util_test.go diff --git a/receiver/dcgmreceiver/scraper.go b/receiver/dcgmreceiver/scraper.go index 7929a4f53..36232fa8d 100644 --- a/receiver/dcgmreceiver/scraper.go +++ b/receiver/dcgmreceiver/scraper.go @@ -40,11 +40,11 @@ type dcgmScraper struct { mb *metadata.MetricsBuilder // Aggregate cumulative values. aggregates struct { - energyConsumptionFallback map[uint]float64 // ...from power usage rate. - pcieTxTotal map[uint]int64 // ...from pcie tx. - pcieRxTotal map[uint]int64 // ...from pcie rx. - nvlinkTxTotal map[uint]int64 // ...from nvlink tx. - nvlinkRxTotal map[uint]int64 // ...from nvlink rx. + energyConsumptionFallback *defaultMap[uint, *rateIntegrator[float64]] // ...from power usage rate. + pcieTxTotal *defaultMap[uint, *rateIntegrator[int64]] // ...from pcie tx. + pcieRxTotal *defaultMap[uint, *rateIntegrator[int64]] // ...from pcie rx. + nvlinkTxTotal *defaultMap[uint, *rateIntegrator[int64]] // ...from nvlink tx. + nvlinkRxTotal *defaultMap[uint, *rateIntegrator[int64]] // ...from nvlink rx. 
} } @@ -81,17 +81,23 @@ func (s *dcgmScraper) initClient() error { return nil } +func newRateIntegrator[V int64 | float64]() *rateIntegrator[V] { + ri := new(rateIntegrator[V]) + ri.Reset() + return ri +} + func (s *dcgmScraper) start(_ context.Context, _ component.Host) error { startTime := pcommon.NewTimestampFromTime(time.Now()) mbConfig := metadata.DefaultMetricsBuilderConfig() mbConfig.Metrics = s.config.Metrics s.mb = metadata.NewMetricsBuilder( mbConfig, s.settings, metadata.WithStartTime(startTime)) - s.aggregates.energyConsumptionFallback = make(map[uint]float64) - s.aggregates.pcieTxTotal = make(map[uint]int64) - s.aggregates.pcieRxTotal = make(map[uint]int64) - s.aggregates.nvlinkTxTotal = make(map[uint]int64) - s.aggregates.nvlinkRxTotal = make(map[uint]int64) + s.aggregates.energyConsumptionFallback = newDefaultMap[uint](newRateIntegrator[float64]) + s.aggregates.pcieTxTotal = newDefaultMap[uint](newRateIntegrator[int64]) + s.aggregates.pcieRxTotal = newDefaultMap[uint](newRateIntegrator[int64]) + s.aggregates.nvlinkTxTotal = newDefaultMap[uint](newRateIntegrator[int64]) + s.aggregates.nvlinkRxTotal = newDefaultMap[uint](newRateIntegrator[int64]) return nil } @@ -260,32 +266,32 @@ func (s *dcgmScraper) scrape(_ context.Context) (pmetric.Metrics, error) { s.mb.RecordGpuDcgmMemoryBandwidthUtilizationDataPoint(now, memCopyUtil) } if metric, ok := metrics["DCGM_FI_PROF_PCIE_TX_BYTES"]; ok { - pcieTx := int64(float64(metric.asInt64()) * (s.config.CollectionInterval.Seconds())) /* rate to delta */ - s.aggregates.pcieTxTotal[gpuIndex] += pcieTx /* delta to cumulative */ - s.mb.RecordGpuDcgmPcieIoDataPoint(now, s.aggregates.pcieTxTotal[gpuIndex], metadata.AttributeNetworkIoDirectionTransmit) + s.aggregates.pcieTxTotal.Get(gpuIndex).Update(metric.timestamp, metric.asInt64()) + _, pcieTx := s.aggregates.pcieTxTotal.Get(gpuIndex).Value() + s.mb.RecordGpuDcgmPcieIoDataPoint(now, pcieTx, metadata.AttributeNetworkIoDirectionTransmit) } if metric, ok := 
metrics["DCGM_FI_PROF_PCIE_RX_BYTES"]; ok { - pcieRx := int64(float64(metric.asInt64()) * (s.config.CollectionInterval.Seconds())) /* rate to delta */ - s.aggregates.pcieRxTotal[gpuIndex] += pcieRx /* delta to cumulative */ - s.mb.RecordGpuDcgmPcieIoDataPoint(now, s.aggregates.pcieRxTotal[gpuIndex], metadata.AttributeNetworkIoDirectionReceive) + s.aggregates.pcieRxTotal.Get(gpuIndex).Update(metric.timestamp, metric.asInt64()) + _, pcieRx := s.aggregates.pcieRxTotal.Get(gpuIndex).Value() + s.mb.RecordGpuDcgmPcieIoDataPoint(now, pcieRx, metadata.AttributeNetworkIoDirectionReceive) } if metric, ok := metrics["DCGM_FI_PROF_NVLINK_TX_BYTES"]; ok { - nvlinkTx := int64(float64(metric.asInt64()) * (s.config.CollectionInterval.Seconds())) /* rate to delta */ - s.aggregates.nvlinkTxTotal[gpuIndex] += nvlinkTx /* delta to cumulative */ - s.mb.RecordGpuDcgmNvlinkIoDataPoint(now, s.aggregates.nvlinkTxTotal[gpuIndex], metadata.AttributeNetworkIoDirectionTransmit) + s.aggregates.nvlinkTxTotal.Get(gpuIndex).Update(metric.timestamp, metric.asInt64()) + _, nvlinkTx := s.aggregates.nvlinkTxTotal.Get(gpuIndex).Value() + s.mb.RecordGpuDcgmNvlinkIoDataPoint(now, nvlinkTx, metadata.AttributeNetworkIoDirectionTransmit) } if metric, ok := metrics["DCGM_FI_PROF_NVLINK_RX_BYTES"]; ok { - nvlinkRx := int64(float64(metric.asInt64()) * (s.config.CollectionInterval.Seconds())) /* rate to delta */ - s.aggregates.nvlinkRxTotal[gpuIndex] += nvlinkRx /* delta to cumulative */ - s.mb.RecordGpuDcgmNvlinkIoDataPoint(now, s.aggregates.nvlinkRxTotal[gpuIndex], metadata.AttributeNetworkIoDirectionReceive) + s.aggregates.nvlinkRxTotal.Get(gpuIndex).Update(metric.timestamp, metric.asInt64()) + _, nvlinkRx := s.aggregates.nvlinkRxTotal.Get(gpuIndex).Value() + s.mb.RecordGpuDcgmNvlinkIoDataPoint(now, nvlinkRx, metadata.AttributeNetworkIoDirectionReceive) } if metric, ok := metrics["DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION"]; ok { energyUsed := float64(metric.asInt64()) / 1e3 /* mJ to J */ 
s.mb.RecordGpuDcgmEnergyConsumptionDataPoint(now, energyUsed) } else if metric, ok := metrics["DCGM_FI_DEV_POWER_USAGE"]; ok { // fallback - powerUsage := metric.asFloat64() * (s.config.CollectionInterval.Seconds()) /* rate to delta */ - s.aggregates.energyConsumptionFallback[gpuIndex] += powerUsage /* delta to cumulative */ - s.mb.RecordGpuDcgmEnergyConsumptionDataPoint(now, s.aggregates.energyConsumptionFallback[gpuIndex]) + s.aggregates.energyConsumptionFallback.Get(gpuIndex).Update(metric.timestamp, metric.asFloat64()) + _, energyUsed := s.aggregates.energyConsumptionFallback.Get(gpuIndex).Value() + s.mb.RecordGpuDcgmEnergyConsumptionDataPoint(now, energyUsed) } if metric, ok := metrics["DCGM_FI_DEV_GPU_TEMP"]; ok { s.mb.RecordGpuDcgmTemperatureDataPoint(now, float64(metric.asInt64())) diff --git a/receiver/dcgmreceiver/util.go b/receiver/dcgmreceiver/util.go index 1f6de1233..03079d707 100644 --- a/receiver/dcgmreceiver/util.go +++ b/receiver/dcgmreceiver/util.go @@ -19,10 +19,67 @@ package dcgmreceiver import ( "fmt" + "time" "github.com/NVIDIA/go-dcgm/pkg/dcgm" ) +var nowUnixMicro = func() int64 { return time.Now().UnixNano() / 1e3 } + +// rateIntegrator converts timestamped values that represent rates into +// cumulative values. It assumes the rate stays constant since the last +// timestamp. +type rateIntegrator[V int64 | float64] struct { + lastTimestamp int64 + aggregatedRateUs V // the integration of the rate over microsecond timestamps. +} + +func (ri *rateIntegrator[V]) Reset() { + ri.lastTimestamp = nowUnixMicro() + ri.aggregatedRateUs = V(0) +} + +func (ri *rateIntegrator[V]) Update(ts int64, v V) { + // Drop stale points. + if ts <= ri.lastTimestamp { + return + } + // v is the rate per second, and timestamps are in microseconds, so the + // delta will be 1e6 times the actual increment. 
+ ri.aggregatedRateUs += v * V(ts-ri.lastTimestamp) + ri.lastTimestamp = ts +} + +func (ri *rateIntegrator[V]) Value() (int64, V) { + return ri.lastTimestamp, ri.aggregatedRateUs / V(1e6) +} + +type defaultMap[K comparable, V any] struct { + m map[K]V + f func() V +} + +func newDefaultMap[K comparable, V any](f func() V) *defaultMap[K, V] { + return &defaultMap[K, V]{ + m: make(map[K]V), + f: f, + } +} + +func (m *defaultMap[K, V]) Get(k K) V { + if v, ok := m.m[k]; ok { + return v + } + v := m.f() + m.m[k] = v + return v +} + +func (m *defaultMap[K, V]) TryGet(k K) (V, bool) { + v, ok := m.m[k] + return v, ok +} + var ( errBlankValue = fmt.Errorf("unspecified blank value") errDataNotFound = fmt.Errorf("data not found") diff --git a/receiver/dcgmreceiver/util_test.go b/receiver/dcgmreceiver/util_test.go new file mode 100644 index 000000000..40510f42a --- /dev/null +++ b/receiver/dcgmreceiver/util_test.go @@ -0,0 +1,84 @@ +// Copyright 2024 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +//go:build gpu +// +build gpu + +package dcgmreceiver + +import ( + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func testRateIntegrator[V int64 | float64](t *testing.T) { + origNowUnixMicro := nowUnixMicro + nowUnixMicro = func() int64 { return 10 } + defer func() { nowUnixMicro = origNowUnixMicro }() + + type P struct { + ts int64 + v V + } + p := func(ts int64, v V) P { return P{ts, v} } + + var ri rateIntegrator[V] + + ri.Reset() + require.Equal(t, P{10, 0}, p(ri.Value())) + // Ensure updates affect aggregated values. + ri.Update(15, 1e6) + assert.Equal(t, P{15, 5}, p(ri.Value())) + // Ensure stale points are ignored. + ri.Update(12, 1e8) + assert.Equal(t, P{15, 5}, p(ri.Value())) + ri.Update(15, 1.e8) + assert.Equal(t, P{15, 5}, p(ri.Value())) + // Ensure updates affect aggregated values. + ri.Update(20, 2.e6) + assert.Equal(t, P{20, 15}, p(ri.Value())) + // Ensure zero rates don't change the aggregated value. + ri.Update(25, 0) + assert.Equal(t, P{25, 15}, p(ri.Value())) + + // Ensure the value is cleared on reset. + ri.Reset() + assert.Equal(t, P{10, 0}, p(ri.Value())) +} + +func TestRateIntegratorInt64(t *testing.T) { + testRateIntegrator[int64](t) +} + +func TestRateIntegratorFloat64(t *testing.T) { + testRateIntegrator[float64](t) +} + +func TestDefaultMap(t *testing.T) { + called := false + m := newDefaultMap[int, int64](func() int64 { + called = true + return 8 + }) + _, ok := m.TryGet(3) + assert.False(t, ok) + assert.False(t, called) + v := m.Get(3) + assert.True(t, called) + assert.Equal(t, int64(8), v) + _, ok = m.TryGet(3) + assert.True(t, ok) +} From b779893b24eda0f2833a37167c3f046356554a79 Mon Sep 17 00:00:00 2001 From: Igor Peshansky Date: Sat, 27 Jul 2024 14:06:21 -0400 Subject: [PATCH 33/38] Implement a cumulativeTracker struct. Rearrange aggregates in scraper. 
--- receiver/dcgmreceiver/scraper.go | 122 +++++++++++++++++++++-------- receiver/dcgmreceiver/util.go | 35 +++++++++ receiver/dcgmreceiver/util_test.go | 50 ++++++++++++ 3 files changed, 176 insertions(+), 31 deletions(-) diff --git a/receiver/dcgmreceiver/scraper.go b/receiver/dcgmreceiver/scraper.go index 36232fa8d..3fb6a3d2d 100644 --- a/receiver/dcgmreceiver/scraper.go +++ b/receiver/dcgmreceiver/scraper.go @@ -40,11 +40,32 @@ type dcgmScraper struct { mb *metadata.MetricsBuilder // Aggregate cumulative values. aggregates struct { - energyConsumptionFallback *defaultMap[uint, *rateIntegrator[float64]] // ...from power usage rate. - pcieTxTotal *defaultMap[uint, *rateIntegrator[int64]] // ...from pcie tx. - pcieRxTotal *defaultMap[uint, *rateIntegrator[int64]] // ...from pcie rx. - nvlinkTxTotal *defaultMap[uint, *rateIntegrator[int64]] // ...from nvlink tx. - nvlinkRxTotal *defaultMap[uint, *rateIntegrator[int64]] // ...from nvlink rx. + energyConsumption struct { + total *defaultMap[uint, *cumulativeTracker[int64]] + fallback *defaultMap[uint, *rateIntegrator[float64]] // ...from power usage rate. + } + pcieTotal struct { + tx *defaultMap[uint, *rateIntegrator[int64]] // ...from pcie tx. + rx *defaultMap[uint, *rateIntegrator[int64]] // ...from pcie rx. + } + nvlinkTotal struct { + tx *defaultMap[uint, *rateIntegrator[int64]] // ...from nvlink tx. + rx *defaultMap[uint, *rateIntegrator[int64]] // ...from nvlink rx. 
+ } + throttleDuration struct { + powerViolation *defaultMap[uint, *cumulativeTracker[int64]] + thermalViolation *defaultMap[uint, *cumulativeTracker[int64]] + syncBoostViolation *defaultMap[uint, *cumulativeTracker[int64]] + boardLimitViolation *defaultMap[uint, *cumulativeTracker[int64]] + lowUtilViolation *defaultMap[uint, *cumulativeTracker[int64]] + reliabilityViolation *defaultMap[uint, *cumulativeTracker[int64]] + totalAppClocksViolation *defaultMap[uint, *cumulativeTracker[int64]] + totalBaseClocksViolation *defaultMap[uint, *cumulativeTracker[int64]] + } + eccTotal struct { + sbe *defaultMap[uint, *cumulativeTracker[int64]] + dbe *defaultMap[uint, *cumulativeTracker[int64]] + } } } @@ -87,17 +108,34 @@ func newRateIntegrator[V int64 | float64]() *rateIntegrator[V] { return ri } +func newCumulativeTracker[V int64 | float64]() *cumulativeTracker[V] { + ct := new(cumulativeTracker[V]) + ct.Reset() + return ct +} + func (s *dcgmScraper) start(_ context.Context, _ component.Host) error { startTime := pcommon.NewTimestampFromTime(time.Now()) mbConfig := metadata.DefaultMetricsBuilderConfig() mbConfig.Metrics = s.config.Metrics s.mb = metadata.NewMetricsBuilder( mbConfig, s.settings, metadata.WithStartTime(startTime)) - s.aggregates.energyConsumptionFallback = newDefaultMap[uint](newRateIntegrator[float64]) - s.aggregates.pcieTxTotal = newDefaultMap[uint](newRateIntegrator[int64]) - s.aggregates.pcieRxTotal = newDefaultMap[uint](newRateIntegrator[int64]) - s.aggregates.nvlinkTxTotal = newDefaultMap[uint](newRateIntegrator[int64]) - s.aggregates.nvlinkRxTotal = newDefaultMap[uint](newRateIntegrator[int64]) + s.aggregates.energyConsumption.total = newDefaultMap[uint](newCumulativeTracker[int64]) + s.aggregates.energyConsumption.fallback = newDefaultMap[uint](newRateIntegrator[float64]) + s.aggregates.pcieTotal.tx = newDefaultMap[uint](newRateIntegrator[int64]) + s.aggregates.pcieTotal.rx = newDefaultMap[uint](newRateIntegrator[int64]) + s.aggregates.nvlinkTotal.tx 
= newDefaultMap[uint](newRateIntegrator[int64]) + s.aggregates.nvlinkTotal.rx = newDefaultMap[uint](newRateIntegrator[int64]) + s.aggregates.throttleDuration.powerViolation = newDefaultMap[uint](newCumulativeTracker[int64]) + s.aggregates.throttleDuration.thermalViolation = newDefaultMap[uint](newCumulativeTracker[int64]) + s.aggregates.throttleDuration.syncBoostViolation = newDefaultMap[uint](newCumulativeTracker[int64]) + s.aggregates.throttleDuration.boardLimitViolation = newDefaultMap[uint](newCumulativeTracker[int64]) + s.aggregates.throttleDuration.lowUtilViolation = newDefaultMap[uint](newCumulativeTracker[int64]) + s.aggregates.throttleDuration.reliabilityViolation = newDefaultMap[uint](newCumulativeTracker[int64]) + s.aggregates.throttleDuration.totalAppClocksViolation = newDefaultMap[uint](newCumulativeTracker[int64]) + s.aggregates.throttleDuration.totalBaseClocksViolation = newDefaultMap[uint](newCumulativeTracker[int64]) + s.aggregates.eccTotal.sbe = newDefaultMap[uint](newCumulativeTracker[int64]) + s.aggregates.eccTotal.dbe = newDefaultMap[uint](newCumulativeTracker[int64]) return nil } @@ -266,31 +304,33 @@ func (s *dcgmScraper) scrape(_ context.Context) (pmetric.Metrics, error) { s.mb.RecordGpuDcgmMemoryBandwidthUtilizationDataPoint(now, memCopyUtil) } if metric, ok := metrics["DCGM_FI_PROF_PCIE_TX_BYTES"]; ok { - s.aggregates.pcieTxTotal.Get(gpuIndex).Update(metric.timestamp, metric.asInt64()) - _, pcieTx := s.aggregates.pcieTxTotal.Get(gpuIndex).Value() + s.aggregates.pcieTotal.tx.Get(gpuIndex).Update(metric.timestamp, metric.asInt64()) + _, pcieTx := s.aggregates.pcieTotal.tx.Get(gpuIndex).Value() s.mb.RecordGpuDcgmPcieIoDataPoint(now, pcieTx, metadata.AttributeNetworkIoDirectionTransmit) } if metric, ok := metrics["DCGM_FI_PROF_PCIE_RX_BYTES"]; ok { - s.aggregates.pcieRxTotal.Get(gpuIndex).Update(metric.timestamp, metric.asInt64()) - _, pcieRx := s.aggregates.pcieRxTotal.Get(gpuIndex).Value() + 
s.aggregates.pcieTotal.rx.Get(gpuIndex).Update(metric.timestamp, metric.asInt64()) + _, pcieRx := s.aggregates.pcieTotal.rx.Get(gpuIndex).Value() s.mb.RecordGpuDcgmPcieIoDataPoint(now, pcieRx, metadata.AttributeNetworkIoDirectionReceive) } if metric, ok := metrics["DCGM_FI_PROF_NVLINK_TX_BYTES"]; ok { - s.aggregates.nvlinkTxTotal.Get(gpuIndex).Update(metric.timestamp, metric.asInt64()) - _, nvlinkTx := s.aggregates.nvlinkTxTotal.Get(gpuIndex).Value() + s.aggregates.nvlinkTotal.tx.Get(gpuIndex).Update(metric.timestamp, metric.asInt64()) + _, nvlinkTx := s.aggregates.nvlinkTotal.tx.Get(gpuIndex).Value() s.mb.RecordGpuDcgmNvlinkIoDataPoint(now, nvlinkTx, metadata.AttributeNetworkIoDirectionTransmit) } if metric, ok := metrics["DCGM_FI_PROF_NVLINK_RX_BYTES"]; ok { - s.aggregates.nvlinkRxTotal.Get(gpuIndex).Update(metric.timestamp, metric.asInt64()) - _, nvlinkRx := s.aggregates.nvlinkRxTotal.Get(gpuIndex).Value() + s.aggregates.nvlinkTotal.rx.Get(gpuIndex).Update(metric.timestamp, metric.asInt64()) + _, nvlinkRx := s.aggregates.nvlinkTotal.rx.Get(gpuIndex).Value() s.mb.RecordGpuDcgmNvlinkIoDataPoint(now, nvlinkRx, metadata.AttributeNetworkIoDirectionReceive) } if metric, ok := metrics["DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION"]; ok { - energyUsed := float64(metric.asInt64()) / 1e3 /* mJ to J */ + s.aggregates.energyConsumption.total.Get(gpuIndex).Update(metric.timestamp, metric.asInt64()) + _, value := s.aggregates.energyConsumption.total.Get(gpuIndex).Value() + energyUsed := float64(value) / 1e3 /* mJ to J */ s.mb.RecordGpuDcgmEnergyConsumptionDataPoint(now, energyUsed) } else if metric, ok := metrics["DCGM_FI_DEV_POWER_USAGE"]; ok { // fallback - s.aggregates.energyConsumptionFallback.Get(gpuIndex).Update(metric.timestamp, metric.asFloat64()) - _, energyUsed := s.aggregates.energyConsumptionFallback.Get(gpuIndex).Value() + s.aggregates.energyConsumption.fallback.Get(gpuIndex).Update(metric.timestamp, metric.asFloat64()) + _, energyUsed := 
s.aggregates.energyConsumption.fallback.Get(gpuIndex).Value() s.mb.RecordGpuDcgmEnergyConsumptionDataPoint(now, energyUsed) } if metric, ok := metrics["DCGM_FI_DEV_GPU_TEMP"]; ok { @@ -301,42 +341,62 @@ func (s *dcgmScraper) scrape(_ context.Context) (pmetric.Metrics, error) { s.mb.RecordGpuDcgmClockFrequencyDataPoint(now, clockFreq) } if metric, ok := metrics["DCGM_FI_DEV_POWER_VIOLATION"]; ok { - violationTime := float64(metric.asInt64()) / 1e6 /* us to s */ + s.aggregates.throttleDuration.powerViolation.Get(gpuIndex).Update(metric.timestamp, metric.asInt64()) + _, value := s.aggregates.throttleDuration.powerViolation.Get(gpuIndex).Value() + violationTime := float64(value) / 1e6 /* us to s */ s.mb.RecordGpuDcgmClockThrottleDurationTimeDataPoint(now, violationTime, metadata.AttributeGpuClockViolationPower) } if metric, ok := metrics["DCGM_FI_DEV_THERMAL_VIOLATION"]; ok { - violationTime := float64(metric.asInt64()) / 1e6 /* us to s */ + s.aggregates.throttleDuration.thermalViolation.Get(gpuIndex).Update(metric.timestamp, metric.asInt64()) + _, value := s.aggregates.throttleDuration.thermalViolation.Get(gpuIndex).Value() + violationTime := float64(value) / 1e6 /* us to s */ s.mb.RecordGpuDcgmClockThrottleDurationTimeDataPoint(now, violationTime, metadata.AttributeGpuClockViolationThermal) } if metric, ok := metrics["DCGM_FI_DEV_SYNC_BOOST_VIOLATION"]; ok { - violationTime := float64(metric.asInt64()) / 1e6 /* us to s */ + s.aggregates.throttleDuration.syncBoostViolation.Get(gpuIndex).Update(metric.timestamp, metric.asInt64()) + _, value := s.aggregates.throttleDuration.syncBoostViolation.Get(gpuIndex).Value() + violationTime := float64(value) / 1e6 /* us to s */ s.mb.RecordGpuDcgmClockThrottleDurationTimeDataPoint(now, violationTime, metadata.AttributeGpuClockViolationSyncBoost) } if metric, ok := metrics["DCGM_FI_DEV_BOARD_LIMIT_VIOLATION"]; ok { - violationTime := float64(metric.asInt64()) / 1e6 /* us to s */ + 
s.aggregates.throttleDuration.boardLimitViolation.Get(gpuIndex).Update(metric.timestamp, metric.asInt64()) + _, value := s.aggregates.throttleDuration.boardLimitViolation.Get(gpuIndex).Value() + violationTime := float64(value) / 1e6 /* us to s */ s.mb.RecordGpuDcgmClockThrottleDurationTimeDataPoint(now, violationTime, metadata.AttributeGpuClockViolationBoardLimit) } if metric, ok := metrics["DCGM_FI_DEV_LOW_UTIL_VIOLATION"]; ok { - violationTime := float64(metric.asInt64()) / 1e6 /* us to s */ + s.aggregates.throttleDuration.lowUtilViolation.Get(gpuIndex).Update(metric.timestamp, metric.asInt64()) + _, value := s.aggregates.throttleDuration.lowUtilViolation.Get(gpuIndex).Value() + violationTime := float64(value) / 1e6 /* us to s */ s.mb.RecordGpuDcgmClockThrottleDurationTimeDataPoint(now, violationTime, metadata.AttributeGpuClockViolationLowUtil) } if metric, ok := metrics["DCGM_FI_DEV_RELIABILITY_VIOLATION"]; ok { - violationTime := float64(metric.asInt64()) / 1e6 /* us to s */ + s.aggregates.throttleDuration.reliabilityViolation.Get(gpuIndex).Update(metric.timestamp, metric.asInt64()) + _, value := s.aggregates.throttleDuration.reliabilityViolation.Get(gpuIndex).Value() + violationTime := float64(value) / 1e6 /* us to s */ s.mb.RecordGpuDcgmClockThrottleDurationTimeDataPoint(now, violationTime, metadata.AttributeGpuClockViolationReliability) } if metric, ok := metrics["DCGM_FI_DEV_TOTAL_APP_CLOCKS_VIOLATION"]; ok { - violationTime := float64(metric.asInt64()) / 1e6 /* us to s */ + s.aggregates.throttleDuration.totalAppClocksViolation.Get(gpuIndex).Update(metric.timestamp, metric.asInt64()) + _, value := s.aggregates.throttleDuration.totalAppClocksViolation.Get(gpuIndex).Value() + violationTime := float64(value) / 1e6 /* us to s */ s.mb.RecordGpuDcgmClockThrottleDurationTimeDataPoint(now, violationTime, metadata.AttributeGpuClockViolationAppClock) } if metric, ok := metrics["DCGM_FI_DEV_TOTAL_BASE_CLOCKS_VIOLATION"]; ok { - violationTime := 
float64(metric.asInt64()) / 1e6 /* us to s */ + s.aggregates.throttleDuration.totalBaseClocksViolation.Get(gpuIndex).Update(metric.timestamp, metric.asInt64()) + _, value := s.aggregates.throttleDuration.totalBaseClocksViolation.Get(gpuIndex).Value() + violationTime := float64(value) / 1e6 /* us to s */ s.mb.RecordGpuDcgmClockThrottleDurationTimeDataPoint(now, violationTime, metadata.AttributeGpuClockViolationBaseClock) } if metric, ok := metrics["DCGM_FI_DEV_ECC_SBE_VOL_TOTAL"]; ok { - s.mb.RecordGpuDcgmEccErrorsDataPoint(now, metric.asInt64(), metadata.AttributeGpuErrorTypeSbe) + s.aggregates.eccTotal.sbe.Get(gpuIndex).Update(metric.timestamp, metric.asInt64()) + _, sbeErrors := s.aggregates.eccTotal.sbe.Get(gpuIndex).Value() + s.mb.RecordGpuDcgmEccErrorsDataPoint(now, sbeErrors, metadata.AttributeGpuErrorTypeSbe) } if metric, ok := metrics["DCGM_FI_DEV_ECC_DBE_VOL_TOTAL"]; ok { - s.mb.RecordGpuDcgmEccErrorsDataPoint(now, metric.asInt64(), metadata.AttributeGpuErrorTypeDbe) + s.aggregates.eccTotal.dbe.Get(gpuIndex).Update(metric.timestamp, metric.asInt64()) + _, dbeErrors := s.aggregates.eccTotal.dbe.Get(gpuIndex).Value() + s.mb.RecordGpuDcgmEccErrorsDataPoint(now, dbeErrors, metadata.AttributeGpuErrorTypeDbe) } // TODO: XID errors. // s.mb.RecordGpuDcgmXidErrorsDataPoint(now, metric.asInt64(), xid) diff --git a/receiver/dcgmreceiver/util.go b/receiver/dcgmreceiver/util.go index 03079d707..c3cf19883 100644 --- a/receiver/dcgmreceiver/util.go +++ b/receiver/dcgmreceiver/util.go @@ -80,6 +80,41 @@ func (m *defaultMap[K, V]) TryGet(k K) (V, bool) { return v, ok } +// cumulativeTracker records cumulative values since last reset. +type cumulativeTracker[V int64 | float64] struct { + baseTimestamp int64 + baseline V // the value seen at baseTimestamp. + lastTimestamp int64 + lastValue V // the value seen at lastTimestamp. 
+} + +func (i *cumulativeTracker[V]) Reset() { + i.baseTimestamp = 0 + i.lastTimestamp = nowUnixMicro() + i.baseline = V(0) + i.lastValue = V(0) +} + +func (i *cumulativeTracker[V]) Update(ts int64, v V) { + // On first update, record the value as the baseline. + if i.baseTimestamp == 0 { + i.baseTimestamp, i.baseline = ts, v + } + // Drop stale points. + if ts <= i.lastTimestamp { + return + } + i.lastTimestamp, i.lastValue = ts, v +} + +func (i *cumulativeTracker[V]) Value() (int64, V) { + return i.lastTimestamp, i.lastValue - i.baseline +} + +func (i *cumulativeTracker[V]) Baseline() (int64, V) { + return i.baseTimestamp, i.baseline +} + var ( errBlankValue = fmt.Errorf("unspecified blank value") errDataNotFound = fmt.Errorf("data not found") diff --git a/receiver/dcgmreceiver/util_test.go b/receiver/dcgmreceiver/util_test.go index 40510f42a..3b35b9646 100644 --- a/receiver/dcgmreceiver/util_test.go +++ b/receiver/dcgmreceiver/util_test.go @@ -67,6 +67,56 @@ func TestRateIntegratorFloat64(t *testing.T) { testRateIntegrator[float64](t) } +func testCumulativeTracker[V int64 | float64](t *testing.T) { + origNowUnixMicro := nowUnixMicro + nowUnixMicro = func() int64 { return 10 } + defer func() { nowUnixMicro = origNowUnixMicro }() + + type P struct { + ts int64 + v V + } + p := func(ts int64, v V) P { return P{ts, v} } + + var ct cumulativeTracker[V] + + ct.Reset() + require.Equal(t, P{0, 0}, p(ct.Baseline())) + require.Equal(t, P{10, 0}, p(ct.Value())) + // Ensure first updates sets the baseline. + ct.Update(15, 50) + require.Equal(t, P{15, 50}, p(ct.Baseline())) + assert.Equal(t, P{15, 0}, p(ct.Value())) + // Ensure updates affect values, but not the baseline. + ct.Update(20, 80) + assert.Equal(t, P{15, 50}, p(ct.Baseline())) + assert.Equal(t, P{20, 30}, p(ct.Value())) + // Ensure stale points are ignored. 
+ ct.Update(18, 1e8) + assert.Equal(t, P{20, 30}, p(ct.Value())) + ct.Update(20, 1e8) + assert.Equal(t, P{20, 30}, p(ct.Value())) + // Ensure updates affect values. + ct.Update(25, 100) + assert.Equal(t, P{25, 50}, p(ct.Value())) + // Ensure same inputs don't affect values. + ct.Update(30, 100) + assert.Equal(t, P{30, 50}, p(ct.Value())) + + // Ensure the value and baseline are cleared on reset. + ct.Reset() + assert.Equal(t, P{0, 0}, p(ct.Baseline())) + assert.Equal(t, P{10, 0}, p(ct.Value())) +} + +func TestCumulativeTrackerInt64(t *testing.T) { + testCumulativeTracker[int64](t) +} + +func TestCumulativeTrackerFloat64(t *testing.T) { + testCumulativeTracker[float64](t) +} + func TestDefaultMap(t *testing.T) { called := false m := newDefaultMap[int, int64](func() int64 { From d8362b33364ea6612abaff7915c428a01158ec1c Mon Sep 17 00:00:00 2001 From: Igor Peshansky Date: Wed, 14 Aug 2024 18:09:33 -0400 Subject: [PATCH 34/38] Really pull in the cumulativetodelta processor. A follow-on to adb4c2e88d1be9af3d17c4f8a8aef664014ed989. 
--- service/components.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/service/components.go b/service/components.go index 8dbb8cc9e..60b84f5c0 100644 --- a/service/components.go +++ b/service/components.go @@ -18,6 +18,7 @@ import ( "github.com/open-telemetry/opentelemetry-collector-contrib/exporter/fileexporter" "github.com/open-telemetry/opentelemetry-collector-contrib/exporter/googlecloudexporter" "github.com/open-telemetry/opentelemetry-collector-contrib/exporter/googlemanagedprometheusexporter" + "github.com/open-telemetry/opentelemetry-collector-contrib/processor/cumulativetodeltaprocessor" "github.com/open-telemetry/opentelemetry-collector-contrib/processor/deltatorateprocessor" "github.com/open-telemetry/opentelemetry-collector-contrib/processor/filterprocessor" "github.com/open-telemetry/opentelemetry-collector-contrib/processor/groupbyattrsprocessor" @@ -142,6 +143,7 @@ func components() (otelcol.Factories, error) { processors := []processor.Factory{ agentmetricsprocessor.NewFactory(), casttosumprocessor.NewFactory(), + cumulativetodeltaprocessor.NewFactory(), deltatorateprocessor.NewFactory(), filterprocessor.NewFactory(), normalizesumsprocessor.NewFactory(), From dd39c32e02d79a9cf86759c76a3ef3ece06dcdd7 Mon Sep 17 00:00:00 2001 From: Quentin Smith Date: Wed, 11 Sep 2024 15:41:29 -0400 Subject: [PATCH 35/38] Collect metrics asynchronously (#223) Metrics are now scraped independently of collector polling cycles, allowing higher resolution and better integrations. 
Co-authored-by: Igor Peshansky --- receiver/dcgmreceiver/client.go | 311 +++++------- receiver/dcgmreceiver/client_gpu_test.go | 181 ++++--- receiver/dcgmreceiver/component_test.go | 13 +- .../dcgmreceiver/generated_package_test.go | 3 +- receiver/dcgmreceiver/scraper.go | 460 ++++++++---------- receiver/dcgmreceiver/scraper_gpu_test.go | 139 ++++-- .../testdata/NVIDIA_A100-SXM4-40GB.yaml | 46 +- .../testdata/NVIDIA_H100_80GB_HBM3.yaml | 46 +- receiver/dcgmreceiver/testdata/NVIDIA_L4.yaml | 44 +- .../testdata/Tesla_P100-PCIE-16GB.yaml | 40 +- receiver/dcgmreceiver/testdata/Tesla_P4.yaml | 40 +- receiver/dcgmreceiver/testdata/Tesla_T4.yaml | 46 +- .../testdata/Tesla_V100-SXM2-16GB.yaml | 46 +- .../testprofilepause/test_profile_pause.go | 18 +- receiver/dcgmreceiver/util.go | 156 +++--- receiver/dcgmreceiver/util_test.go | 155 +++--- 16 files changed, 874 insertions(+), 870 deletions(-) diff --git a/receiver/dcgmreceiver/client.go b/receiver/dcgmreceiver/client.go index a806643ea..85b01915f 100644 --- a/receiver/dcgmreceiver/client.go +++ b/receiver/dcgmreceiver/client.go @@ -20,10 +20,10 @@ package dcgmreceiver import ( "errors" "fmt" + "math" "time" "github.com/NVIDIA/go-dcgm/pkg/dcgm" - "go.opentelemetry.io/collector/receiver/scrapererror" "go.uber.org/zap" ) @@ -41,63 +41,61 @@ type dcgmClientSettings struct { fields []string } +type deviceMetrics struct { + ModelName string + UUID string + Metrics MetricsMap +} + type dcgmClient struct { - logger *zap.SugaredLogger - handleCleanup func() - enabledFieldIDs []dcgm.Short - enabledFieldGroup dcgm.FieldHandle - deviceIndices []uint - devicesModelName []string - devicesUUID []string - deviceMetricToFailedQueryCount map[string]uint64 + logger *zap.SugaredLogger + handleCleanup func() + enabledFieldIDs []dcgm.Short + enabledFieldGroup dcgm.FieldHandle + deviceGroup dcgm.GroupHandle + + devices map[uint]deviceMetrics + lastSuccessfulPoll time.Time + + deviceMetricToFailedQueryCount map[string]int pollingInterval 
time.Duration retryBlankValues bool maxRetries int } -type dcgmMetric struct { - timestamp int64 - name string - value interface{} -} - // Can't pass argument dcgm.mode because it is unexported var dcgmInit = func(args ...string) (func(), error) { return dcgm.Init(dcgm.Standalone, args...) } -var dcgmGetLatestValuesForFields = dcgm.GetLatestValuesForFields +var dcgmGetValuesSince = dcgm.GetValuesSince func newClient(settings *dcgmClientSettings, logger *zap.Logger) (*dcgmClient, error) { dcgmCleanup, err := initializeDcgm(settings.endpoint, logger) if err != nil { return nil, errors.Join(ErrDcgmInitialization, err) } - deviceIndices := make([]uint, 0) - names := make([]string, 0) - UUIDs := make([]string, 0) enabledFieldGroup := dcgm.FieldHandle{} requestedFieldIDs := toFieldIDs(settings.fields) - supportedRegularFieldIDs, err := getSupportedRegularFields(requestedFieldIDs, logger) - if err != nil { - return nil, fmt.Errorf("Error querying supported regular fields: %w", err) - } supportedProfilingFieldIDs, err := getSupportedProfilingFields() if err != nil { // If there is error querying the supported fields at all, let the // receiver collect basic metrics: (GPU utilization, used/free memory). logger.Sugar().Warnf("Error querying supported profiling fields on '%w'. GPU profiling metrics will not be collected.", err) } - enabledFields, unavailableFields := filterSupportedFields(requestedFieldIDs, supportedRegularFieldIDs, supportedProfilingFieldIDs) + enabledFields, unavailableFields := filterSupportedFields(requestedFieldIDs, supportedProfilingFieldIDs) for _, f := range unavailableFields { logger.Sugar().Warnf("Field '%s' is not supported. 
Metric '%s' will not be collected", dcgmIDToName[f], dcgmIDToName[f]) } + var deviceGroup dcgm.GroupHandle if len(enabledFields) != 0 { - deviceIndices, names, UUIDs, err = discoverDevices(logger) + supportedDeviceIndices, err := dcgm.GetSupportedDevices() if err != nil { - return nil, err + return nil, fmt.Errorf("Unable to discover supported GPUs on %w", err) } - deviceGroup, err := createDeviceGroup(logger, deviceIndices) + logger.Sugar().Infof("Discovered %d supported GPU devices", len(supportedDeviceIndices)) + + deviceGroup, err = createDeviceGroup(logger, supportedDeviceIndices) if err != nil { return nil, err } @@ -112,10 +110,10 @@ func newClient(settings *dcgmClientSettings, logger *zap.Logger) (*dcgmClient, e handleCleanup: dcgmCleanup, enabledFieldIDs: enabledFields, enabledFieldGroup: enabledFieldGroup, - deviceIndices: deviceIndices, - devicesModelName: names, - devicesUUID: UUIDs, - deviceMetricToFailedQueryCount: make(map[string]uint64), + deviceGroup: deviceGroup, + devices: map[uint]deviceMetrics{}, + lastSuccessfulPoll: time.Now(), + deviceMetricToFailedQueryCount: make(map[string]int), pollingInterval: settings.pollingInterval, retryBlankValues: settings.retryBlankValues, maxRetries: settings.maxRetries, @@ -139,30 +137,20 @@ func initializeDcgm(endpoint string, logger *zap.Logger) (func(), error) { return dcgmCleanup, nil } -func discoverDevices(logger *zap.Logger) ([]uint, []string, []string, error) { - supportedDeviceIndices, err := dcgm.GetSupportedDevices() +func newDeviceMetrics(logger *zap.SugaredLogger, gpuIndex uint) (deviceMetrics, error) { + deviceInfo, err := dcgm.GetDeviceInfo(gpuIndex) if err != nil { - return nil, nil, nil, fmt.Errorf("Unable to discover supported GPUs on %w", err) + logger.Warnf("Unable to query device info for NVIDIA device %d on '%w'", gpuIndex, err) + return deviceMetrics{}, err } - logger.Sugar().Infof("Discovered %d supported GPU devices", len(supportedDeviceIndices)) - - devices := make([]uint, 0, 
len(supportedDeviceIndices)) - names := make([]string, 0, len(supportedDeviceIndices)) - UUIDs := make([]string, 0, len(supportedDeviceIndices)) - for _, gpuIndex := range supportedDeviceIndices { - deviceInfo, err := dcgm.GetDeviceInfo(gpuIndex) - if err != nil { - logger.Sugar().Warnf("Unable to query device info for NVIDIA device %d on '%w'", gpuIndex, err) - continue - } - devices = append(devices, gpuIndex) - names = append(names, deviceInfo.Identifiers.Model) - UUIDs = append(UUIDs, deviceInfo.UUID) - logger.Sugar().Infof("Discovered NVIDIA device %s with UUID %s", names[gpuIndex], UUIDs[gpuIndex]) + device := deviceMetrics{ + ModelName: deviceInfo.Identifiers.Model, + UUID: deviceInfo.UUID, + Metrics: MetricsMap{}, } - - return devices, names, UUIDs, nil + logger.Infof("Discovered NVIDIA device %s with UUID %s (DCGM GPU ID %d)", device.ModelName, device.UUID, gpuIndex) + return device, nil } func createDeviceGroup(logger *zap.Logger, deviceIndices []uint) (dcgm.GroupHandle, error) { @@ -224,17 +212,13 @@ func getSupportedProfilingFields() ([]dcgm.Short, error) { // filterSupportedFields takes the user requested fields and device supported // profiling fields, and filters to return those that are requested & supported // to be the enabledFields and requested but not supported as unavailableFields -func filterSupportedFields(requestedFields []dcgm.Short, supportedRegularFields []dcgm.Short, supportedProfilingFields []dcgm.Short) ([]dcgm.Short, []dcgm.Short) { +func filterSupportedFields(requestedFields []dcgm.Short, supportedProfilingFields []dcgm.Short) ([]dcgm.Short, []dcgm.Short) { var enabledFields []dcgm.Short var unavailableFields []dcgm.Short for _, ef := range requestedFields { - support := false - for _, sf := range supportedRegularFields { - if sf == ef { - support = true - break - } - } + // For fields like `DCGM_FI_DEV_*`, which are not + // profiling fields, assume they are always present. 
+ support := ef < dcgmProfilingFieldsStart for _, sf := range supportedProfilingFields { if sf == ef { support = true @@ -250,72 +234,6 @@ func filterSupportedFields(requestedFields []dcgm.Short, supportedRegularFields return enabledFields, unavailableFields } -func getSupportedRegularFields(requestedFields []dcgm.Short, logger *zap.Logger) ([]dcgm.Short, error) { - var regularFields []dcgm.Short - for _, ef := range requestedFields { - if ef < dcgmProfilingFieldsStart { - // For fields like `DCGM_FI_DEV_*`, which are not - // profiling fields, try to actually retrieve the values - // from all devices - regularFields = append(regularFields, ef) - } - } - if len(regularFields) == 0 { - return nil, nil - } - deviceIndices, err := dcgm.GetSupportedDevices() - if err != nil { - return nil, fmt.Errorf("Unable to discover supported GPUs on %w", err) - } - deviceGroupName := "google-cloud-ops-agent-initial-watch-group" - deviceGroup, err := dcgm.NewDefaultGroup(deviceGroupName) - if err != nil { - return nil, fmt.Errorf("Unable to create DCGM GPU default group on %w", err) - } - defer func() { _ = dcgm.DestroyGroup(deviceGroup) }() - testFieldGroup, err := setWatchesOnFields(logger, deviceGroup, regularFields, dcgmWatchParams{ - fieldGroupName: "google-cloud-ops-agent-initial-discovery", - updateFreqUs: 3600000000, // call UpdateAllFields manually - maxKeepTime: 600, - maxKeepSamples: 1, - }) - defer func() { _ = dcgm.FieldGroupDestroy(testFieldGroup) }() - if err != nil { - return nil, fmt.Errorf("Unable to set field watches on %w", err) - } - err = dcgm.UpdateAllFields() - if err != nil { - return nil, fmt.Errorf("Unable to update fields on %w", err) - } - found := make(map[dcgm.Short]bool) - for _, gpuIndex := range deviceIndices { - fieldValues, pollErr := dcgm.GetLatestValuesForFields(gpuIndex, regularFields) - if pollErr != nil { - continue - } - for _, fieldValue := range fieldValues { - dcgmName := dcgmIDToName[dcgm.Short(fieldValue.FieldId)] - if err := 
isValidValue(fieldValue); err != nil { - logger.Sugar().Warnf("Received invalid value (ts %d gpu %d) %s: %v", fieldValue.Ts, gpuIndex, dcgmName, err) - continue - } - switch fieldValue.FieldType { - case dcgm.DCGM_FT_DOUBLE: - logger.Sugar().Debugf("Discovered (ts %d gpu %d) %s = %.3f (f64)", fieldValue.Ts, gpuIndex, dcgmName, fieldValue.Float64()) - case dcgm.DCGM_FT_INT64: - logger.Sugar().Debugf("Discovered (ts %d gpu %d) %s = %d (i64)", fieldValue.Ts, gpuIndex, dcgmName, fieldValue.Int64()) - } - found[dcgm.Short(fieldValue.FieldId)] = true - } - } - // TODO: dcgmUnwatchFields is not available. - supported := make([]dcgm.Short, len(found)) - for fieldID := range found { - supported = append(supported, fieldID) - } - return supported, nil -} - // Internal-only type dcgmWatchParams struct { fieldGroupName string @@ -352,6 +270,8 @@ func setWatchesOnFields(logger *zap.Logger, deviceGroup dcgm.GroupHandle, fieldI return fieldGroup, nil } +const maxKeepSamples = 100 // TODO: Is this enough? 
+ func setWatchesOnEnabledFields(pollingInterval time.Duration, logger *zap.Logger, deviceGroup dcgm.GroupHandle, enabledFieldIDs []dcgm.Short) (dcgm.FieldHandle, error) { return setWatchesOnFields(logger, deviceGroup, enabledFieldIDs, dcgmWatchParams{ // Note: Add random suffix to avoid conflict amongnst any parallel collectors @@ -359,12 +279,13 @@ func setWatchesOnEnabledFields(pollingInterval time.Duration, logger *zap.Logger // Note: DCGM retained samples = Max(maxKeepSamples, maxKeepTime/updateFreq) updateFreqUs: int64(pollingInterval / time.Microsecond), maxKeepTime: 600.0, /* 10 min */ - maxKeepSamples: int32(15), + maxKeepSamples: maxKeepSamples, }) } func (client *dcgmClient) cleanup() { _ = dcgm.FieldGroupDestroy(client.enabledFieldGroup) + _ = dcgm.DestroyGroup(client.deviceGroup) if client.handleCleanup != nil { client.handleCleanup() } @@ -372,83 +293,91 @@ func (client *dcgmClient) cleanup() { client.logger.Info("Shutdown DCGM") } -func (client *dcgmClient) getDeviceModelName(gpuIndex uint) string { - return client.devicesModelName[gpuIndex] -} - -func (client *dcgmClient) getDeviceUUID(gpuIndex uint) string { - return client.devicesUUID[gpuIndex] -} - -func (client *dcgmClient) collectDeviceMetrics() (map[uint][]dcgmMetric, error) { - var err scrapererror.ScrapeErrors - gpuMetrics := make(map[uint][]dcgmMetric) - for _, gpuIndex := range client.deviceIndices { - client.logger.Debugf("Polling DCGM daemon for GPU %d", gpuIndex) - retry := true - for i := 0; retry && i < client.maxRetries; i++ { - fieldValues, pollErr := dcgmGetLatestValuesForFields(gpuIndex, client.enabledFieldIDs) - client.logger.Debugf("Got %d field values", len(fieldValues)) - if pollErr == nil { - gpuMetrics[gpuIndex], retry = client.appendMetrics(gpuMetrics[gpuIndex], gpuIndex, fieldValues) - if retry { - client.logger.Warnf("Retrying poll of DCGM daemon for GPU %d; attempt %d", gpuIndex, i+1) - time.Sleep(client.pollingInterval) - continue - } - client.logger.Debugf("Successful 
poll of DCGM daemon for GPU %d", gpuIndex) - } else { - msg := fmt.Sprintf("Unable to poll DCGM daemon for GPU %d on %s", gpuIndex, pollErr) - client.issueWarningForFailedQueryUptoThreshold(gpuIndex, "all-profiling-metrics", msg) - err.AddPartial(1, fmt.Errorf("%s", msg)) - } - } +// collect will poll dcgm for any new metrics, updating client.devices as appropriate +// It returns the estimated polling interval. +func (client *dcgmClient) collect() (time.Duration, error) { + client.logger.Debugf("Polling DCGM daemon for field values") + if len(client.enabledFieldIDs) == 0 { + // Make sure we don't try to scrape without a device group (since we don't construct one when there are no enabled fields). + return 0, nil } - - return gpuMetrics, err.Combine() -} - -func (client *dcgmClient) appendMetrics(gpuMetrics []dcgmMetric, gpuIndex uint, fieldValues []dcgm.FieldValue_v1) (result []dcgmMetric, retry bool) { - retry = false + fieldValues, pollTime, err := dcgmGetValuesSince(client.deviceGroup, client.enabledFieldGroup, client.lastSuccessfulPoll) + if err != nil { + msg := fmt.Sprintf("Unable to poll DCGM daemon for metrics: %s", err) + client.issueWarningForFailedQueryUptoThreshold("all-profiling-metrics", maxWarningsForFailedDeviceMetricQuery, msg) + return 0, err + } + client.logger.Debugf("Got %d field values over %s", len(fieldValues), pollTime.Sub(client.lastSuccessfulPoll)) + client.lastSuccessfulPoll = pollTime + oldestTs := int64(math.MaxInt64) + newestTs := int64(0) for _, fieldValue := range fieldValues { + if fieldValue.EntityGroupId != dcgm.FE_GPU { + continue + } + gpuIndex := fieldValue.EntityId + if _, ok := client.devices[gpuIndex]; !ok { + device, err := newDeviceMetrics(client.logger, gpuIndex) + if err != nil { + continue + } + client.devices[gpuIndex] = device + } + device := client.devices[gpuIndex] dcgmName := dcgmIDToName[dcgm.Short(fieldValue.FieldId)] - if err := isValidValue(fieldValue); err != nil { + if err := isValidValue(fieldValue); err == 
errBlankValue { + // Blank values are expected at startup. + continue + } else if err == errNotSupported { + client.issueWarningForFailedQueryUptoThreshold(dcgmName, 1, fmt.Sprintf("Field '%s' is not supported. Metric '%s' will not be collected", dcgmName, dcgmName)) + continue + } else if err != nil { msg := fmt.Sprintf("Received invalid value (ts %d gpu %d) %s: %v", fieldValue.Ts, gpuIndex, dcgmName, err) - client.issueWarningForFailedQueryUptoThreshold(gpuIndex, dcgmName, msg) - if client.retryBlankValues && errors.Is(err, errBlankValue) { - retry = true - } + client.issueWarningForFailedQueryUptoThreshold(fmt.Sprintf("device%d.%s", gpuIndex, dcgmName), maxWarningsForFailedDeviceMetricQuery, msg) continue } - - var metricValue interface{} - switch fieldValue.FieldType { - case dcgm.DCGM_FT_DOUBLE: - value := fieldValue.Float64() - client.logger.Debugf("Discovered (ts %d gpu %d) %s = %.3f (f64)", fieldValue.Ts, gpuIndex, dcgmName, value) - metricValue = value - case dcgm.DCGM_FT_INT64: - value := fieldValue.Int64() - client.logger.Debugf("Discovered (ts %d gpu %d) %s = %d (i64)", fieldValue.Ts, gpuIndex, dcgmName, value) - metricValue = value - default: - metricValue = fieldValue.Value + if fieldValue.Ts < oldestTs { + oldestTs = fieldValue.Ts + } + if fieldValue.Ts > newestTs { + newestTs = fieldValue.Ts } - gpuMetrics = append(gpuMetrics, dcgmMetric{fieldValue.Ts, dcgmName, metricValue}) + if _, ok := device.Metrics[dcgmName]; !ok { + device.Metrics[dcgmName] = &metricStats{} + } + device.Metrics[dcgmName].Update(fieldValue) } + duration := time.Duration(newestTs-oldestTs) * time.Microsecond + client.logger.Debugf("Successful poll of DCGM daemon returned %v of data", duration) + // If we did a partial poll, there should be more room in the buffer. 
+ duration = max(duration, client.pollingInterval*maxKeepSamples) + return duration, nil +} - return gpuMetrics, retry +// getDeviceMetrics returns a deep copy of client.devices +func (client *dcgmClient) getDeviceMetrics() map[uint]deviceMetrics { + out := map[uint]deviceMetrics{} + for gpuIndex, device := range client.devices { + new := MetricsMap{} + for key, value := range device.Metrics { + newValue := *value + new[key] = &newValue + } + // device is already a copy here + device.Metrics = new + out[gpuIndex] = device + } + return out } -func (client *dcgmClient) issueWarningForFailedQueryUptoThreshold(deviceIdx uint, dcgmName string, reason string) { - deviceMetric := fmt.Sprintf("device%d.%s", deviceIdx, dcgmName) - client.deviceMetricToFailedQueryCount[deviceMetric]++ +func (client *dcgmClient) issueWarningForFailedQueryUptoThreshold(dcgmName string, limit int, reason string) { + client.deviceMetricToFailedQueryCount[dcgmName]++ - failedCount := client.deviceMetricToFailedQueryCount[deviceMetric] - if failedCount <= maxWarningsForFailedDeviceMetricQuery { - client.logger.Warnf("Unable to query '%s' for Nvidia device %d on '%s'", dcgmName, deviceIdx, reason) - if failedCount == maxWarningsForFailedDeviceMetricQuery { - client.logger.Warnf("Surpressing further device query warnings for '%s' for Nvidia device %d", dcgmName, deviceIdx) + failedCount := client.deviceMetricToFailedQueryCount[dcgmName] + if failedCount <= limit { + client.logger.Warnf("%s", reason) + if limit > 1 && failedCount == limit { + client.logger.Warnf("Surpressing further device query warnings for '%s'", dcgmName) } } } diff --git a/receiver/dcgmreceiver/client_gpu_test.go b/receiver/dcgmreceiver/client_gpu_test.go index e6df9924b..24d79bec3 100644 --- a/receiver/dcgmreceiver/client_gpu_test.go +++ b/receiver/dcgmreceiver/client_gpu_test.go @@ -24,10 +24,13 @@ import ( "io/ioutil" "os" "path" + "slices" + "sort" "strings" "testing" "time" + "github.com/NVIDIA/go-dcgm/pkg/dcgm" 
"github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" "go.uber.org/zap/zaptest" @@ -51,56 +54,62 @@ func defaultClientSettings() *dcgmClientSettings { requestedFields := discoverRequestedFields(createDefaultConfig().(*Config)) return &dcgmClientSettings{ endpoint: defaultEndpoint, - pollingInterval: 10 * time.Second, + pollingInterval: 1 * time.Second, retryBlankValues: true, maxRetries: 5, fields: requestedFields, } } -// TestSupportedFieldsWithGolden tests getSupportedRegularFields() and +// TestSupportedProfilingFieldsWithGolden tests getSupportedRegularFields() and // getSupportedProfilingFields() against the golden files for the current GPU // model -func TestSupportedFieldsWithGolden(t *testing.T) { +func TestSupportedProfilingFieldsWithGolden(t *testing.T) { clientSettings := defaultClientSettings() client, err := newClient(clientSettings, zaptest.NewLogger(t)) require.Nil(t, err, "cannot initialize DCGM. Install and run DCGM before running tests.") + defer client.cleanup() - require.NotEmpty(t, client.devicesModelName) - gpuModel := client.getDeviceModelName(0) allFields := toFieldIDs(clientSettings.fields) - supportedRegularFields, err := getSupportedRegularFields(allFields, zaptest.NewLogger(t)) - require.Nil(t, err) supportedProfilingFields, err := getSupportedProfilingFields() require.Nil(t, err) - enabledFields, unavailableFields := filterSupportedFields(allFields, supportedRegularFields, supportedProfilingFields) + enabledFields, unavailableFields := filterSupportedFields(allFields, supportedProfilingFields) var enabledFieldsString []string var unavailableFieldsString []string for _, f := range enabledFields { - enabledFieldsString = append(enabledFieldsString, dcgmIDToName[f]) + name := dcgmIDToName[f] + if !strings.HasPrefix(name, "DCGM_FI_DEV_") { + enabledFieldsString = append(enabledFieldsString, name) + } } for _, f := range unavailableFields { - unavailableFieldsString = append(unavailableFieldsString, dcgmIDToName[f]) - } 
- m := modelSupportedFields{ - Model: gpuModel, - SupportedFields: enabledFieldsString, - UnsupportedFields: unavailableFieldsString, - } - actual, err := yaml.Marshal(&m) - if err != nil { - t.Fatal(err) + name := dcgmIDToName[f] + if !strings.HasPrefix(name, "DCGM_FI_DEV_") { + unavailableFieldsString = append(unavailableFieldsString, name) + } } - assert.Equal(t, len(allFields), len(client.enabledFieldIDs)+len(unavailableFieldsString)) - goldenPath := getModelGoldenFilePath(t, gpuModel) - golden.Assert(t, string(actual), goldenPath) - client.cleanup() + sort.Strings(enabledFieldsString) + sort.Strings(unavailableFieldsString) + _, err = client.collect() + require.Nil(t, err) + require.NotEmpty(t, client.devices) + gpuModel := client.devices[0].ModelName + + want := LoadExpectedMetrics(t, gpuModel) + want.SupportedFields = slices.DeleteFunc(want.SupportedFields, func(name string) bool { + return strings.HasPrefix(name, "DCGM_FI_DEV_") + }) + want.UnsupportedFields = slices.DeleteFunc(want.UnsupportedFields, func(name string) bool { + return strings.HasPrefix(name, "DCGM_FI_DEV_") + }) + assert.ElementsMatch(t, enabledFieldsString, want.SupportedFields, "supported profiling fields") + assert.ElementsMatch(t, unavailableFieldsString, want.UnsupportedFields) } // LoadExpectedMetrics read the supported metrics of a GPU model from the golden // file, given a GPU model string -func LoadExpectedMetrics(t *testing.T, model string) []string { +func LoadExpectedMetrics(t *testing.T, model string) modelSupportedFields { t.Helper() goldenPath := getModelGoldenFilePath(t, model) goldenFile, err := ioutil.ReadFile(goldenPath) @@ -112,11 +121,7 @@ func LoadExpectedMetrics(t *testing.T, model string) []string { if err != nil { t.Fatal(err) } - var expectedMetrics []string - for _, supported := range m.SupportedFields { - expectedMetrics = append(expectedMetrics, supported) - } - return expectedMetrics + return m } // getModelGoldenFilePath returns golden file path given a GPU 
model string @@ -135,41 +140,56 @@ func TestNewDcgmClientWithGpuPresent(t *testing.T) { assert.NotNil(t, client) assert.NotNil(t, client.handleCleanup) - assert.Greater(t, len(client.deviceIndices), 0) - for gpuIndex := range client.deviceIndices { - assert.Greater(t, len(client.devicesModelName[gpuIndex]), 0) - assert.Greater(t, len(client.devicesUUID[gpuIndex]), 0) - } client.cleanup() } func TestCollectGpuProfilingMetrics(t *testing.T) { - client, err := newClient(defaultClientSettings(), zaptest.NewLogger(t)) + clientSettings := defaultClientSettings() + client, err := newClient(clientSettings, zaptest.NewLogger(t)) + defer client.cleanup() require.Nil(t, err, "cannot initialize DCGM. Install and run DCGM before running tests.") - expectedMetrics := LoadExpectedMetrics(t, client.devicesModelName[0]) var maxCollectionInterval = 60 * time.Second - before := time.Now().UnixMicro() - maxCollectionInterval.Microseconds() - deviceMetrics, err := client.collectDeviceMetrics() - after := time.Now().UnixMicro() - assert.Nil(t, err) + var before, after int64 + for { + before = time.Now().UnixMicro() - maxCollectionInterval.Microseconds() + duration, err := client.collect() + after = time.Now().UnixMicro() + assert.Greater(t, duration, time.Duration(0)) + assert.Nil(t, err) + var metricCount int + for _, device := range client.devices { + for _, metric := range device.Metrics { + if metric.lastFieldValue != nil { + metricCount++ + } + } + } + if metricCount > 0 { + break + } + time.Sleep(client.pollingInterval) + } + deviceMetrics := client.devices - asFloat64 := func(metric dcgmMetric) float64 { - require.IsTypef(t, float64(0), metric.value, "Unexpected metric type: %T", metric.value) - value, _ := metric.value.(float64) + lastFloat64 := func(metric *metricStats) float64 { + assert.Equal(t, dcgm.DCGM_FT_DOUBLE, metric.lastFieldValue.FieldType, "Unexpected metric type: %+v", metric.lastFieldValue) + value, ok := asFloat64(*metric.lastFieldValue) + require.True(t, ok, 
"Unexpected metric type: %+v", metric.lastFieldValue) return value } - asInt64 := func(metric dcgmMetric) int64 { - require.IsTypef(t, int64(0), metric.value, "Unexpected metric type: %T", metric.value) - value, _ := metric.value.(int64) + lastInt64 := func(metric *metricStats) int64 { + assert.Equal(t, dcgm.DCGM_FT_INT64, metric.lastFieldValue.FieldType, "Unexpected metric type: %+v", metric.lastFieldValue) + value, ok := asInt64(*metric.lastFieldValue) + require.True(t, ok, "Unexpected metric type: %+v", metric.lastFieldValue) return value } - seenMetric := make(map[string]bool) + seenMetric := make(map[string]int) assert.GreaterOrEqual(t, len(deviceMetrics), 0) assert.LessOrEqual(t, len(deviceMetrics), 32) - for gpuIndex, metrics := range deviceMetrics { - for _, metric := range metrics { - switch metric.name { + for _, device := range deviceMetrics { + for name, metric := range device.Metrics { + switch name { case "DCGM_FI_PROF_GR_ENGINE_ACTIVE": fallthrough case "DCGM_FI_PROF_SM_ACTIVE": @@ -185,7 +205,7 @@ func TestCollectGpuProfilingMetrics(t *testing.T) { case "DCGM_FI_PROF_PIPE_FP16_ACTIVE": fallthrough case "DCGM_FI_PROF_DRAM_ACTIVE": - value := asFloat64(metric) + value := lastFloat64(metric) assert.GreaterOrEqual(t, value, float64(0.0)) assert.LessOrEqual(t, value, float64(1.0)) case "DCGM_FI_DEV_GPU_UTIL": @@ -195,7 +215,7 @@ func TestCollectGpuProfilingMetrics(t *testing.T) { case "DCGM_FI_DEV_ENC_UTIL": fallthrough case "DCGM_FI_DEV_DEC_UTIL": - value := asInt64(metric) + value := lastInt64(metric) assert.GreaterOrEqual(t, value, int64(0)) assert.LessOrEqual(t, value, int64(100)) case "DCGM_FI_DEV_FB_FREE": @@ -204,7 +224,7 @@ func TestCollectGpuProfilingMetrics(t *testing.T) { fallthrough case "DCGM_FI_DEV_FB_RESERVED": // arbitrary max of 10 TiB - value := asInt64(metric) + value := lastInt64(metric) assert.GreaterOrEqual(t, value, int64(0)) assert.LessOrEqual(t, value, int64(10485760)) case "DCGM_FI_PROF_PCIE_TX_BYTES": @@ -215,7 +235,7 @@ func 
TestCollectGpuProfilingMetrics(t *testing.T) { fallthrough case "DCGM_FI_PROF_NVLINK_RX_BYTES": // arbitrary max of 10 TiB/sec - value := asInt64(metric) + value := lastInt64(metric) assert.GreaterOrEqual(t, value, int64(0)) assert.LessOrEqual(t, value, int64(10995116277760)) case "DCGM_FI_DEV_BOARD_LIMIT_VIOLATION": @@ -233,49 +253,72 @@ func TestCollectGpuProfilingMetrics(t *testing.T) { case "DCGM_FI_DEV_TOTAL_APP_CLOCKS_VIOLATION": fallthrough case "DCGM_FI_DEV_TOTAL_BASE_CLOCKS_VIOLATION": - value := asInt64(metric) + value := lastInt64(metric) assert.GreaterOrEqual(t, value, int64(0)) - assert.LessOrEqual(t, value, time.Now().UnixMicro()) + assert.LessOrEqual(t, value, time.Now().UnixNano(), name) case "DCGM_FI_DEV_ECC_DBE_VOL_TOTAL": fallthrough case "DCGM_FI_DEV_ECC_SBE_VOL_TOTAL": // arbitrary max of 100000000 errors - value := asInt64(metric) + value := lastInt64(metric) assert.GreaterOrEqual(t, value, int64(0)) assert.LessOrEqual(t, value, int64(100000000)) case "DCGM_FI_DEV_GPU_TEMP": // arbitrary max of 100000 °C - value := asInt64(metric) + value := lastInt64(metric) assert.GreaterOrEqual(t, value, int64(0)) assert.LessOrEqual(t, value, int64(100000)) case "DCGM_FI_DEV_SM_CLOCK": // arbitrary max of 100000 MHz - value := asInt64(metric) + value := lastInt64(metric) assert.GreaterOrEqual(t, value, int64(0)) assert.LessOrEqual(t, value, int64(100000)) case "DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION": - value := asInt64(metric) + value := lastInt64(metric) assert.GreaterOrEqual(t, value, int64(0)) // TODO case "DCGM_FI_DEV_POWER_USAGE": - value := asFloat64(metric) + value := lastFloat64(metric) assert.GreaterOrEqual(t, value, float64(0.0)) // TODO default: - t.Errorf("Unexpected metric '%s'", metric.name) + t.Errorf("Unexpected metric '%s'", name) } - assert.GreaterOrEqual(t, metric.timestamp, before) - assert.LessOrEqual(t, metric.timestamp, after) + assert.GreaterOrEqual(t, metric.lastFieldValue.Ts, before) + assert.LessOrEqual(t, metric.lastFieldValue.Ts, 
after) - seenMetric[fmt.Sprintf("gpu{%d}.metric{%s}", gpuIndex, metric.name)] = true + seenMetric[name]++ } } - for _, gpuIndex := range client.deviceIndices { - for _, metric := range expectedMetrics { - assert.True(t, seenMetric[fmt.Sprintf("gpu{%d}.metric{%s}", gpuIndex, metric)], fmt.Sprintf("%s on gpu %d", metric, gpuIndex)) + for name, count := range seenMetric { + assert.Equalf(t, count, len(deviceMetrics), "metric %q found on an unexpected number of GPUs", name) + } + + allFields := clientSettings.fields + + var enabledFieldsString []string + var unavailableFieldsString []string + for _, f := range allFields { + if seenMetric[f] > 0 { + enabledFieldsString = append(enabledFieldsString, f) + } else { + unavailableFieldsString = append(unavailableFieldsString, f) } } - client.cleanup() + sort.Strings(enabledFieldsString) + sort.Strings(unavailableFieldsString) + gpuModel := client.devices[0].ModelName + m := modelSupportedFields{ + Model: gpuModel, + SupportedFields: enabledFieldsString, + UnsupportedFields: unavailableFieldsString, + } + actual, err := yaml.Marshal(&m) + if err != nil { + t.Fatal(err) + } + goldenPath := getModelGoldenFilePath(t, gpuModel) + golden.Assert(t, string(actual), goldenPath) } diff --git a/receiver/dcgmreceiver/component_test.go b/receiver/dcgmreceiver/component_test.go index 18ad1e214..795ef9e5f 100644 --- a/receiver/dcgmreceiver/component_test.go +++ b/receiver/dcgmreceiver/component_test.go @@ -35,6 +35,7 @@ import ( "go.opentelemetry.io/collector/consumer/consumertest" "go.opentelemetry.io/collector/receiver" "go.opentelemetry.io/collector/receiver/receivertest" + "go.uber.org/zap/zaptest" ) func TestComponentFactoryType(t *testing.T) { @@ -45,6 +46,12 @@ func TestComponentConfigStruct(t *testing.T) { require.NoError(t, componenttest.CheckConfigStruct(NewFactory().CreateDefaultConfig())) } +func newCreateSettings(t *testing.T) receiver.CreateSettings { + settings := receivertest.NewNopCreateSettings() + settings.Logger = 
zaptest.NewLogger(t) + return settings +} + func TestComponentLifecycle(t *testing.T) { factory := NewFactory() @@ -70,19 +77,19 @@ func TestComponentLifecycle(t *testing.T) { for _, test := range tests { t.Run(test.name+"-shutdown", func(t *testing.T) { - c, err := test.createFn(context.Background(), receivertest.NewNopCreateSettings(), cfg) + c, err := test.createFn(context.Background(), newCreateSettings(t), cfg) require.NoError(t, err) err = c.Shutdown(context.Background()) require.NoError(t, err) }) t.Run(test.name+"-lifecycle", func(t *testing.T) { - firstRcvr, err := test.createFn(context.Background(), receivertest.NewNopCreateSettings(), cfg) + firstRcvr, err := test.createFn(context.Background(), newCreateSettings(t), cfg) require.NoError(t, err) host := componenttest.NewNopHost() require.NoError(t, err) require.NoError(t, firstRcvr.Start(context.Background(), host)) require.NoError(t, firstRcvr.Shutdown(context.Background())) - secondRcvr, err := test.createFn(context.Background(), receivertest.NewNopCreateSettings(), cfg) + secondRcvr, err := test.createFn(context.Background(), newCreateSettings(t), cfg) require.NoError(t, err) require.NoError(t, secondRcvr.Start(context.Background(), host)) require.NoError(t, secondRcvr.Shutdown(context.Background())) diff --git a/receiver/dcgmreceiver/generated_package_test.go b/receiver/dcgmreceiver/generated_package_test.go index 90d299c5c..0da0bc8c9 100644 --- a/receiver/dcgmreceiver/generated_package_test.go +++ b/receiver/dcgmreceiver/generated_package_test.go @@ -3,8 +3,9 @@ package dcgmreceiver import ( - "go.uber.org/goleak" "testing" + + "go.uber.org/goleak" ) func TestMain(m *testing.M) { diff --git a/receiver/dcgmreceiver/scraper.go b/receiver/dcgmreceiver/scraper.go index 3fb6a3d2d..79f47b5f7 100644 --- a/receiver/dcgmreceiver/scraper.go +++ b/receiver/dcgmreceiver/scraper.go @@ -18,72 +18,43 @@ package dcgmreceiver import ( - "cmp" "context" "errors" "fmt" - "slices" "time" 
"go.opentelemetry.io/collector/component" "go.opentelemetry.io/collector/pdata/pcommon" "go.opentelemetry.io/collector/pdata/pmetric" "go.opentelemetry.io/collector/receiver" + "golang.org/x/sync/errgroup" "github.com/GoogleCloudPlatform/opentelemetry-operations-collector/receiver/dcgmreceiver/internal/metadata" ) type dcgmScraper struct { - config *Config - settings receiver.CreateSettings - client *dcgmClient - mb *metadata.MetricsBuilder - // Aggregate cumulative values. - aggregates struct { - energyConsumption struct { - total *defaultMap[uint, *cumulativeTracker[int64]] - fallback *defaultMap[uint, *rateIntegrator[float64]] // ...from power usage rate. - } - pcieTotal struct { - tx *defaultMap[uint, *rateIntegrator[int64]] // ...from pcie tx. - rx *defaultMap[uint, *rateIntegrator[int64]] // ...from pcie rx. - } - nvlinkTotal struct { - tx *defaultMap[uint, *rateIntegrator[int64]] // ...from nvlink tx. - rx *defaultMap[uint, *rateIntegrator[int64]] // ...from nvlink rx. - } - throttleDuration struct { - powerViolation *defaultMap[uint, *cumulativeTracker[int64]] - thermalViolation *defaultMap[uint, *cumulativeTracker[int64]] - syncBoostViolation *defaultMap[uint, *cumulativeTracker[int64]] - boardLimitViolation *defaultMap[uint, *cumulativeTracker[int64]] - lowUtilViolation *defaultMap[uint, *cumulativeTracker[int64]] - reliabilityViolation *defaultMap[uint, *cumulativeTracker[int64]] - totalAppClocksViolation *defaultMap[uint, *cumulativeTracker[int64]] - totalBaseClocksViolation *defaultMap[uint, *cumulativeTracker[int64]] - } - eccTotal struct { - sbe *defaultMap[uint, *cumulativeTracker[int64]] - dbe *defaultMap[uint, *cumulativeTracker[int64]] - } - } + config *Config + settings receiver.CreateSettings + initRetryDelay time.Duration + mb *metadata.MetricsBuilder + collectTriggerCh chan<- struct{} + metricsCh <-chan map[uint]deviceMetrics + cancel func() } func newDcgmScraper(config *Config, settings receiver.CreateSettings) *dcgmScraper { - return 
&dcgmScraper{config: config, settings: settings} + return &dcgmScraper{config: config, settings: settings, initRetryDelay: 10 * time.Second} } -// initClient will try to create a new dcgmClient if currently has no client; -// it will try to initialize the communication with the DCGM service; if +const scrapePollingInterval = 100 * time.Millisecond // TODO: Choose an appropriate value + +// initClient will try to initialize the communication with the DCGM service; if // success, create a client; only return errors if DCGM service is available but // failed to create client. -func (s *dcgmScraper) initClient() error { - if s.client != nil { - return nil - } +func (s *dcgmScraper) initClient() (*dcgmClient, error) { clientSettings := &dcgmClientSettings{ endpoint: s.config.TCPAddrConfig.Endpoint, - pollingInterval: s.config.CollectionInterval, + pollingInterval: scrapePollingInterval, fields: discoverRequestedFields(s.config), retryBlankValues: true, maxRetries: 5, @@ -94,55 +65,44 @@ func (s *dcgmScraper) initClient() error { if errors.Is(err, ErrDcgmInitialization) { // If cannot connect to DCGM, return no error and retry at next // collection time - return nil + return nil, nil } - return err + return nil, err } - s.client = client - return nil -} - -func newRateIntegrator[V int64 | float64]() *rateIntegrator[V] { - ri := new(rateIntegrator[V]) - ri.Reset() - return ri + return client, nil } -func newCumulativeTracker[V int64 | float64]() *cumulativeTracker[V] { - ct := new(cumulativeTracker[V]) - ct.Reset() - return ct -} - -func (s *dcgmScraper) start(_ context.Context, _ component.Host) error { +func (s *dcgmScraper) start(ctx context.Context, _ component.Host) error { startTime := pcommon.NewTimestampFromTime(time.Now()) mbConfig := metadata.DefaultMetricsBuilderConfig() mbConfig.Metrics = s.config.Metrics s.mb = metadata.NewMetricsBuilder( mbConfig, s.settings, metadata.WithStartTime(startTime)) - s.aggregates.energyConsumption.total = 
newDefaultMap[uint](newCumulativeTracker[int64]) - s.aggregates.energyConsumption.fallback = newDefaultMap[uint](newRateIntegrator[float64]) - s.aggregates.pcieTotal.tx = newDefaultMap[uint](newRateIntegrator[int64]) - s.aggregates.pcieTotal.rx = newDefaultMap[uint](newRateIntegrator[int64]) - s.aggregates.nvlinkTotal.tx = newDefaultMap[uint](newRateIntegrator[int64]) - s.aggregates.nvlinkTotal.rx = newDefaultMap[uint](newRateIntegrator[int64]) - s.aggregates.throttleDuration.powerViolation = newDefaultMap[uint](newCumulativeTracker[int64]) - s.aggregates.throttleDuration.thermalViolation = newDefaultMap[uint](newCumulativeTracker[int64]) - s.aggregates.throttleDuration.syncBoostViolation = newDefaultMap[uint](newCumulativeTracker[int64]) - s.aggregates.throttleDuration.boardLimitViolation = newDefaultMap[uint](newCumulativeTracker[int64]) - s.aggregates.throttleDuration.lowUtilViolation = newDefaultMap[uint](newCumulativeTracker[int64]) - s.aggregates.throttleDuration.reliabilityViolation = newDefaultMap[uint](newCumulativeTracker[int64]) - s.aggregates.throttleDuration.totalAppClocksViolation = newDefaultMap[uint](newCumulativeTracker[int64]) - s.aggregates.throttleDuration.totalBaseClocksViolation = newDefaultMap[uint](newCumulativeTracker[int64]) - s.aggregates.eccTotal.sbe = newDefaultMap[uint](newCumulativeTracker[int64]) - s.aggregates.eccTotal.dbe = newDefaultMap[uint](newCumulativeTracker[int64]) + + scrapeCtx, scrapeCancel := context.WithCancel(context.WithoutCancel(ctx)) + g, scrapeCtx := errgroup.WithContext(scrapeCtx) + + s.cancel = func() { + scrapeCancel() + g.Wait() + } + + metricsCh := make(chan map[uint]deviceMetrics) + collectTriggerCh := make(chan struct{}, 1) // Capacity of 1 makes this asynchronous + s.metricsCh = metricsCh + s.collectTriggerCh = collectTriggerCh + + g.Go(func() error { + return s.runConnectLoop(scrapeCtx, metricsCh, collectTriggerCh) + }) return nil } func (s *dcgmScraper) stop(_ context.Context) error { - if s.client != nil 
{ - s.client.cleanup() + if s.cancel != nil { + s.cancel() + s.cancel = nil } return nil } @@ -220,188 +180,194 @@ func discoverRequestedFields(config *Config) []string { return requestedFields } -func (s *dcgmScraper) scrape(_ context.Context) (pmetric.Metrics, error) { - err := s.initClient() - if err != nil || s.client == nil { - return s.mb.Emit(), err +func (s *dcgmScraper) runConnectLoop(ctx context.Context, metricsCh chan<- map[uint]deviceMetrics, collectTriggerCh <-chan struct{}) error { + defer close(metricsCh) + for { + client, _ := s.initClient() + // Ignore the error; it's logged in initClient. + if client != nil { + s.pollClient(ctx, client, metricsCh, collectTriggerCh) + } + select { + case <-ctx.Done(): + return ctx.Err() + case metricsCh <- map[uint]deviceMetrics{}: + // Un-hang any scrapers waiting for data, since we currently have no metrics to offer. + case <-time.After(s.initRetryDelay): + } } + return nil +} - s.settings.Logger.Sugar().Debug("Client created, collecting metrics") - deviceMetrics, err := s.client.collectDeviceMetrics() - if err != nil { - s.settings.Logger.Sugar().Warnf("Metrics not collected; err=%v", err) - return s.mb.Emit(), err +func (s *dcgmScraper) pollClient(ctx context.Context, client *dcgmClient, metricsCh chan<- map[uint]deviceMetrics, collectTriggerCh <-chan struct{}) { + defer client.cleanup() + for { + waitTime, err := client.collect() + // Ignore the error; it's logged in collect() + if err != nil { + waitTime = 10 * time.Second + } + // Try to poll at least twice per collection interval + waitTime = max( + 100*time.Millisecond, + min( + s.config.CollectionInterval, + waitTime, + )/2, + ) + s.settings.Logger.Sugar().Debugf("Waiting %s for the next collection", waitTime) + after := time.After(waitTime) + for after != nil { + deviceMetrics := client.getDeviceMetrics() + select { + case <-ctx.Done(): + return + case <-collectTriggerCh: + // Loop and trigger a collect() again. 
+ after = nil + case metricsCh <- deviceMetrics: + case <-after: + after = nil + } + } + } +} + +func (s *dcgmScraper) scrape(ctx context.Context) (pmetric.Metrics, error) { + var deviceMetrics map[uint]deviceMetrics + // Trigger a collection cycle to make sure we have fresh metrics. + // The select ensures that if there's already a request registered we don't block. + select { + case s.collectTriggerCh <- struct{}{}: + default: + } + // Now wait for metrics. + select { + case deviceMetrics = <-s.metricsCh: + case <-ctx.Done(): + return pmetric.NewMetrics(), ctx.Err() } s.settings.Logger.Sugar().Debugf("Metrics collected: %d", len(deviceMetrics)) now := pcommon.NewTimestampFromTime(time.Now()) - for gpuIndex, gpuMetrics := range deviceMetrics { - metricsByName := make(map[string][]dcgmMetric) - for _, metric := range gpuMetrics { - metricsByName[metric.name] = append(metricsByName[metric.name], metric) - } - s.settings.Logger.Sugar().Debugf("Got %d unique metrics: %v", len(metricsByName), metricsByName) - metrics := make(map[string]dcgmMetric) - for name, points := range metricsByName { - slices.SortStableFunc(points, func(a, b dcgmMetric) int { - return cmp.Compare(a.timestamp, b.timestamp) - }) - metrics[name] = points[len(points)-1] - } + for gpuIndex, gpu := range deviceMetrics { + s.settings.Logger.Sugar().Debugf("Got %d unique metrics: %v", len(gpu.Metrics), gpu.Metrics) rb := s.mb.NewResourceBuilder() rb.SetGpuNumber(fmt.Sprintf("%d", gpuIndex)) - rb.SetGpuUUID(s.client.getDeviceUUID(gpuIndex)) - rb.SetGpuModel(s.client.getDeviceModelName(gpuIndex)) + rb.SetGpuUUID(gpu.UUID) + rb.SetGpuModel(gpu.ModelName) gpuResource := rb.Emit() - if metric, ok := metrics["DCGM_FI_PROF_GR_ENGINE_ACTIVE"]; ok { - s.mb.RecordGpuDcgmUtilizationDataPoint(now, metric.asFloat64()) - } else if metric, ok := metrics["DCGM_FI_DEV_GPU_UTIL"]; ok { // fallback - gpuUtil := float64(metric.asInt64()) / 100.0 /* normalize */ - s.mb.RecordGpuDcgmUtilizationDataPoint(now, gpuUtil) - } - 
if metric, ok := metrics["DCGM_FI_PROF_SM_ACTIVE"]; ok { - s.mb.RecordGpuDcgmSmUtilizationDataPoint(now, metric.asFloat64()) - } - if metric, ok := metrics["DCGM_FI_PROF_SM_OCCUPANCY"]; ok { - s.mb.RecordGpuDcgmSmOccupancyDataPoint(now, metric.asFloat64()) - } - if metric, ok := metrics["DCGM_FI_PROF_PIPE_TENSOR_ACTIVE"]; ok { - s.mb.RecordGpuDcgmPipeUtilizationDataPoint(now, metric.asFloat64(), metadata.AttributeGpuPipeTensor) - } - if metric, ok := metrics["DCGM_FI_PROF_PIPE_FP64_ACTIVE"]; ok { - s.mb.RecordGpuDcgmPipeUtilizationDataPoint(now, metric.asFloat64(), metadata.AttributeGpuPipeFp64) - } - if metric, ok := metrics["DCGM_FI_PROF_PIPE_FP32_ACTIVE"]; ok { - s.mb.RecordGpuDcgmPipeUtilizationDataPoint(now, metric.asFloat64(), metadata.AttributeGpuPipeFp32) - } - if metric, ok := metrics["DCGM_FI_PROF_PIPE_FP16_ACTIVE"]; ok { - s.mb.RecordGpuDcgmPipeUtilizationDataPoint(now, metric.asFloat64(), metadata.AttributeGpuPipeFp16) - } - if metric, ok := metrics["DCGM_FI_DEV_ENC_UTIL"]; ok { - encUtil := float64(metric.asInt64()) / 100.0 /* normalize */ - s.mb.RecordGpuDcgmCodecEncoderUtilizationDataPoint(now, encUtil) - } - if metric, ok := metrics["DCGM_FI_DEV_DEC_UTIL"]; ok { - decUtil := float64(metric.asInt64()) / 100.0 /* normalize */ - s.mb.RecordGpuDcgmCodecDecoderUtilizationDataPoint(now, decUtil) - } - if metric, ok := metrics["DCGM_FI_DEV_FB_FREE"]; ok { - bytesFree := 1e6 * metric.asInt64() /* MBy to By */ - s.mb.RecordGpuDcgmMemoryBytesUsedDataPoint(now, bytesFree, metadata.AttributeGpuMemoryStateFree) - } - if metric, ok := metrics["DCGM_FI_DEV_FB_USED"]; ok { - bytesUsed := 1e6 * metric.asInt64() /* MBy to By */ - s.mb.RecordGpuDcgmMemoryBytesUsedDataPoint(now, bytesUsed, metadata.AttributeGpuMemoryStateUsed) - } - if metric, ok := metrics["DCGM_FI_DEV_FB_RESERVED"]; ok { - bytesReserved := 1e6 * metric.asInt64() /* MBy to By */ - s.mb.RecordGpuDcgmMemoryBytesUsedDataPoint(now, bytesReserved, metadata.AttributeGpuMemoryStateReserved) - } - if metric, 
ok := metrics["DCGM_FI_PROF_DRAM_ACTIVE"]; ok { - s.mb.RecordGpuDcgmMemoryBandwidthUtilizationDataPoint(now, metric.asFloat64()) - } else if metric, ok := metrics["DCGM_FI_DEV_MEM_COPY_UTIL"]; ok { // fallback - memCopyUtil := float64(metric.asInt64()) / 100.0 /* normalize */ - s.mb.RecordGpuDcgmMemoryBandwidthUtilizationDataPoint(now, memCopyUtil) - } - if metric, ok := metrics["DCGM_FI_PROF_PCIE_TX_BYTES"]; ok { - s.aggregates.pcieTotal.tx.Get(gpuIndex).Update(metric.timestamp, metric.asInt64()) - _, pcieTx := s.aggregates.pcieTotal.tx.Get(gpuIndex).Value() - s.mb.RecordGpuDcgmPcieIoDataPoint(now, pcieTx, metadata.AttributeNetworkIoDirectionTransmit) - } - if metric, ok := metrics["DCGM_FI_PROF_PCIE_RX_BYTES"]; ok { - s.aggregates.pcieTotal.rx.Get(gpuIndex).Update(metric.timestamp, metric.asInt64()) - _, pcieRx := s.aggregates.pcieTotal.rx.Get(gpuIndex).Value() - s.mb.RecordGpuDcgmPcieIoDataPoint(now, pcieRx, metadata.AttributeNetworkIoDirectionReceive) - } - if metric, ok := metrics["DCGM_FI_PROF_NVLINK_TX_BYTES"]; ok { - s.aggregates.nvlinkTotal.tx.Get(gpuIndex).Update(metric.timestamp, metric.asInt64()) - _, nvlinkTx := s.aggregates.nvlinkTotal.tx.Get(gpuIndex).Value() - s.mb.RecordGpuDcgmNvlinkIoDataPoint(now, nvlinkTx, metadata.AttributeNetworkIoDirectionTransmit) - } - if metric, ok := metrics["DCGM_FI_PROF_NVLINK_RX_BYTES"]; ok { - s.aggregates.nvlinkTotal.rx.Get(gpuIndex).Update(metric.timestamp, metric.asInt64()) - _, nvlinkRx := s.aggregates.nvlinkTotal.rx.Get(gpuIndex).Value() - s.mb.RecordGpuDcgmNvlinkIoDataPoint(now, nvlinkRx, metadata.AttributeNetworkIoDirectionReceive) - } - if metric, ok := metrics["DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION"]; ok { - s.aggregates.energyConsumption.total.Get(gpuIndex).Update(metric.timestamp, metric.asInt64()) - _, value := s.aggregates.energyConsumption.total.Get(gpuIndex).Value() - energyUsed := float64(value) / 1e3 /* mJ to J */ - s.mb.RecordGpuDcgmEnergyConsumptionDataPoint(now, energyUsed) - } else if metric, ok := 
metrics["DCGM_FI_DEV_POWER_USAGE"]; ok { // fallback - s.aggregates.energyConsumption.fallback.Get(gpuIndex).Update(metric.timestamp, metric.asFloat64()) - _, energyUsed := s.aggregates.energyConsumption.fallback.Get(gpuIndex).Value() - s.mb.RecordGpuDcgmEnergyConsumptionDataPoint(now, energyUsed) - } - if metric, ok := metrics["DCGM_FI_DEV_GPU_TEMP"]; ok { - s.mb.RecordGpuDcgmTemperatureDataPoint(now, float64(metric.asInt64())) - } - if metric, ok := metrics["DCGM_FI_DEV_SM_CLOCK"]; ok { - clockFreq := 1e6 * float64(metric.asInt64()) /* MHz to Hz */ - s.mb.RecordGpuDcgmClockFrequencyDataPoint(now, clockFreq) - } - if metric, ok := metrics["DCGM_FI_DEV_POWER_VIOLATION"]; ok { - s.aggregates.throttleDuration.powerViolation.Get(gpuIndex).Update(metric.timestamp, metric.asInt64()) - _, value := s.aggregates.throttleDuration.powerViolation.Get(gpuIndex).Value() - violationTime := float64(value) / 1e6 /* us to s */ - s.mb.RecordGpuDcgmClockThrottleDurationTimeDataPoint(now, violationTime, metadata.AttributeGpuClockViolationPower) - } - if metric, ok := metrics["DCGM_FI_DEV_THERMAL_VIOLATION"]; ok { - s.aggregates.throttleDuration.thermalViolation.Get(gpuIndex).Update(metric.timestamp, metric.asInt64()) - _, value := s.aggregates.throttleDuration.thermalViolation.Get(gpuIndex).Value() - violationTime := float64(value) / 1e6 /* us to s */ - s.mb.RecordGpuDcgmClockThrottleDurationTimeDataPoint(now, violationTime, metadata.AttributeGpuClockViolationThermal) - } - if metric, ok := metrics["DCGM_FI_DEV_SYNC_BOOST_VIOLATION"]; ok { - s.aggregates.throttleDuration.syncBoostViolation.Get(gpuIndex).Update(metric.timestamp, metric.asInt64()) - _, value := s.aggregates.throttleDuration.syncBoostViolation.Get(gpuIndex).Value() - violationTime := float64(value) / 1e6 /* us to s */ - s.mb.RecordGpuDcgmClockThrottleDurationTimeDataPoint(now, violationTime, metadata.AttributeGpuClockViolationSyncBoost) - } - if metric, ok := metrics["DCGM_FI_DEV_BOARD_LIMIT_VIOLATION"]; ok { - 
s.aggregates.throttleDuration.boardLimitViolation.Get(gpuIndex).Update(metric.timestamp, metric.asInt64()) - _, value := s.aggregates.throttleDuration.boardLimitViolation.Get(gpuIndex).Value() - violationTime := float64(value) / 1e6 /* us to s */ - s.mb.RecordGpuDcgmClockThrottleDurationTimeDataPoint(now, violationTime, metadata.AttributeGpuClockViolationBoardLimit) - } - if metric, ok := metrics["DCGM_FI_DEV_LOW_UTIL_VIOLATION"]; ok { - s.aggregates.throttleDuration.lowUtilViolation.Get(gpuIndex).Update(metric.timestamp, metric.asInt64()) - _, value := s.aggregates.throttleDuration.lowUtilViolation.Get(gpuIndex).Value() - violationTime := float64(value) / 1e6 /* us to s */ - s.mb.RecordGpuDcgmClockThrottleDurationTimeDataPoint(now, violationTime, metadata.AttributeGpuClockViolationLowUtil) - } - if metric, ok := metrics["DCGM_FI_DEV_RELIABILITY_VIOLATION"]; ok { - s.aggregates.throttleDuration.reliabilityViolation.Get(gpuIndex).Update(metric.timestamp, metric.asInt64()) - _, value := s.aggregates.throttleDuration.reliabilityViolation.Get(gpuIndex).Value() - violationTime := float64(value) / 1e6 /* us to s */ - s.mb.RecordGpuDcgmClockThrottleDurationTimeDataPoint(now, violationTime, metadata.AttributeGpuClockViolationReliability) - } - if metric, ok := metrics["DCGM_FI_DEV_TOTAL_APP_CLOCKS_VIOLATION"]; ok { - s.aggregates.throttleDuration.totalAppClocksViolation.Get(gpuIndex).Update(metric.timestamp, metric.asInt64()) - _, value := s.aggregates.throttleDuration.totalAppClocksViolation.Get(gpuIndex).Value() - violationTime := float64(value) / 1e6 /* us to s */ - s.mb.RecordGpuDcgmClockThrottleDurationTimeDataPoint(now, violationTime, metadata.AttributeGpuClockViolationAppClock) - } - if metric, ok := metrics["DCGM_FI_DEV_TOTAL_BASE_CLOCKS_VIOLATION"]; ok { - s.aggregates.throttleDuration.totalBaseClocksViolation.Get(gpuIndex).Update(metric.timestamp, metric.asInt64()) - _, value := s.aggregates.throttleDuration.totalBaseClocksViolation.Get(gpuIndex).Value() - 
violationTime := float64(value) / 1e6 /* us to s */ - s.mb.RecordGpuDcgmClockThrottleDurationTimeDataPoint(now, violationTime, metadata.AttributeGpuClockViolationBaseClock) - } - if metric, ok := metrics["DCGM_FI_DEV_ECC_SBE_VOL_TOTAL"]; ok { - s.aggregates.eccTotal.sbe.Get(gpuIndex).Update(metric.timestamp, metric.asInt64()) - _, sbeErrors := s.aggregates.eccTotal.sbe.Get(gpuIndex).Value() - s.mb.RecordGpuDcgmEccErrorsDataPoint(now, sbeErrors, metadata.AttributeGpuErrorTypeSbe) - } - if metric, ok := metrics["DCGM_FI_DEV_ECC_DBE_VOL_TOTAL"]; ok { - s.aggregates.eccTotal.dbe.Get(gpuIndex).Update(metric.timestamp, metric.asInt64()) - _, dbeErrors := s.aggregates.eccTotal.dbe.Get(gpuIndex).Value() - s.mb.RecordGpuDcgmEccErrorsDataPoint(now, dbeErrors, metadata.AttributeGpuErrorTypeDbe) + + v, ok := gpu.Metrics.LastFloat64("DCGM_FI_PROF_GR_ENGINE_ACTIVE") + if !ok { + v, ok = gpu.Metrics.LastFloat64("DCGM_FI_DEV_GPU_UTIL") + v /= 100.0 /* normalize */ + } + if ok { + s.mb.RecordGpuDcgmUtilizationDataPoint(now, v) + } + if v, ok := gpu.Metrics.LastFloat64("DCGM_FI_PROF_SM_ACTIVE"); ok { + s.mb.RecordGpuDcgmSmUtilizationDataPoint(now, v) + } + if v, ok := gpu.Metrics.LastFloat64("DCGM_FI_PROF_SM_OCCUPANCY"); ok { + s.mb.RecordGpuDcgmSmOccupancyDataPoint(now, v) + } + if v, ok := gpu.Metrics.LastFloat64("DCGM_FI_PROF_PIPE_TENSOR_ACTIVE"); ok { + s.mb.RecordGpuDcgmPipeUtilizationDataPoint(now, v, metadata.AttributeGpuPipeTensor) + } + if v, ok := gpu.Metrics.LastFloat64("DCGM_FI_PROF_PIPE_FP64_ACTIVE"); ok { + s.mb.RecordGpuDcgmPipeUtilizationDataPoint(now, v, metadata.AttributeGpuPipeFp64) + } + if v, ok := gpu.Metrics.LastFloat64("DCGM_FI_PROF_PIPE_FP32_ACTIVE"); ok { + s.mb.RecordGpuDcgmPipeUtilizationDataPoint(now, v, metadata.AttributeGpuPipeFp32) + } + if v, ok := gpu.Metrics.LastFloat64("DCGM_FI_PROF_PIPE_FP16_ACTIVE"); ok { + s.mb.RecordGpuDcgmPipeUtilizationDataPoint(now, v, metadata.AttributeGpuPipeFp16) + } + if v, ok := 
gpu.Metrics.LastFloat64("DCGM_FI_DEV_ENC_UTIL"); ok { + s.mb.RecordGpuDcgmCodecEncoderUtilizationDataPoint(now, v/100.0) /* normalize */ + } + if v, ok := gpu.Metrics.LastFloat64("DCGM_FI_DEV_DEC_UTIL"); ok { + s.mb.RecordGpuDcgmCodecDecoderUtilizationDataPoint(now, v/100.0) /* normalize */ + } + if v, ok := gpu.Metrics.LastInt64("DCGM_FI_DEV_FB_FREE"); ok { + s.mb.RecordGpuDcgmMemoryBytesUsedDataPoint(now, 1e6*v, metadata.AttributeGpuMemoryStateFree) /* MBy to By */ + } + if v, ok := gpu.Metrics.LastInt64("DCGM_FI_DEV_FB_USED"); ok { + s.mb.RecordGpuDcgmMemoryBytesUsedDataPoint(now, 1e6*v, metadata.AttributeGpuMemoryStateUsed) /* MBy to By */ + } + if v, ok := gpu.Metrics.LastInt64("DCGM_FI_DEV_FB_RESERVED"); ok { + s.mb.RecordGpuDcgmMemoryBytesUsedDataPoint(now, 1e6*v, metadata.AttributeGpuMemoryStateReserved) /* MBy to By */ + } + v, ok = gpu.Metrics.LastFloat64("DCGM_FI_PROF_DRAM_ACTIVE") + if !ok { // fallback + v, ok = gpu.Metrics.LastFloat64("DCGM_FI_DEV_MEM_COPY_UTIL") + v /= 100.0 /* normalize */ + } + if ok { + s.mb.RecordGpuDcgmMemoryBandwidthUtilizationDataPoint(now, v) + } + if v, ok := gpu.Metrics.IntegratedRate("DCGM_FI_PROF_PCIE_TX_BYTES"); ok { + s.mb.RecordGpuDcgmPcieIoDataPoint(now, v, metadata.AttributeNetworkIoDirectionTransmit) + } + if v, ok := gpu.Metrics.IntegratedRate("DCGM_FI_PROF_PCIE_RX_BYTES"); ok { + s.mb.RecordGpuDcgmPcieIoDataPoint(now, v, metadata.AttributeNetworkIoDirectionReceive) + } + if v, ok := gpu.Metrics.IntegratedRate("DCGM_FI_PROF_NVLINK_TX_BYTES"); ok { + s.mb.RecordGpuDcgmNvlinkIoDataPoint(now, v, metadata.AttributeNetworkIoDirectionTransmit) + } + if v, ok := gpu.Metrics.IntegratedRate("DCGM_FI_PROF_NVLINK_RX_BYTES"); ok { + s.mb.RecordGpuDcgmNvlinkIoDataPoint(now, v, metadata.AttributeNetworkIoDirectionReceive) + } + i, ok := gpu.Metrics.CumulativeTotal("DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION") + v = float64(i) / 1e3 /* mJ to J */ + if !ok { // fallback + i, ok = gpu.Metrics.IntegratedRate("DCGM_FI_DEV_POWER_USAGE") + v 
= float64(i) + } + if ok { + s.mb.RecordGpuDcgmEnergyConsumptionDataPoint(now, v) + } + if v, ok := gpu.Metrics.LastFloat64("DCGM_FI_DEV_GPU_TEMP"); ok { + s.mb.RecordGpuDcgmTemperatureDataPoint(now, v) + } + if v, ok := gpu.Metrics.LastFloat64("DCGM_FI_DEV_SM_CLOCK"); ok { + s.mb.RecordGpuDcgmClockFrequencyDataPoint(now, 1e6*v) /* MHz to Hz */ + } + if v, ok := gpu.Metrics.CumulativeTotal("DCGM_FI_DEV_POWER_VIOLATION"); ok { + s.mb.RecordGpuDcgmClockThrottleDurationTimeDataPoint(now, float64(v)/1e9, metadata.AttributeGpuClockViolationPower) /* ns to s */ + } + if v, ok := gpu.Metrics.CumulativeTotal("DCGM_FI_DEV_THERMAL_VIOLATION"); ok { + s.mb.RecordGpuDcgmClockThrottleDurationTimeDataPoint(now, float64(v)/1e9, metadata.AttributeGpuClockViolationThermal) /* ns to s */ + } + if v, ok := gpu.Metrics.CumulativeTotal("DCGM_FI_DEV_SYNC_BOOST_VIOLATION"); ok { + s.mb.RecordGpuDcgmClockThrottleDurationTimeDataPoint(now, float64(v)/1e9, metadata.AttributeGpuClockViolationSyncBoost) /* ns to s */ + } + if v, ok := gpu.Metrics.CumulativeTotal("DCGM_FI_DEV_BOARD_LIMIT_VIOLATION"); ok { + s.mb.RecordGpuDcgmClockThrottleDurationTimeDataPoint(now, float64(v)/1e9, metadata.AttributeGpuClockViolationBoardLimit) /* ns to s */ + } + if v, ok := gpu.Metrics.CumulativeTotal("DCGM_FI_DEV_LOW_UTIL_VIOLATION"); ok { + s.mb.RecordGpuDcgmClockThrottleDurationTimeDataPoint(now, float64(v)/1e9, metadata.AttributeGpuClockViolationLowUtil) /* ns to s */ + } + if v, ok := gpu.Metrics.CumulativeTotal("DCGM_FI_DEV_RELIABILITY_VIOLATION"); ok { + s.mb.RecordGpuDcgmClockThrottleDurationTimeDataPoint(now, float64(v)/1e9, metadata.AttributeGpuClockViolationReliability) /* ns to s */ + } + if v, ok := gpu.Metrics.CumulativeTotal("DCGM_FI_DEV_TOTAL_APP_CLOCKS_VIOLATION"); ok { + s.mb.RecordGpuDcgmClockThrottleDurationTimeDataPoint(now, float64(v)/1e9, metadata.AttributeGpuClockViolationAppClock) /* ns to s */ + } + if v, ok := gpu.Metrics.CumulativeTotal("DCGM_FI_DEV_TOTAL_BASE_CLOCKS_VIOLATION"); ok 
{ + s.mb.RecordGpuDcgmClockThrottleDurationTimeDataPoint(now, float64(v)/1e9, metadata.AttributeGpuClockViolationBaseClock) /* ns to s */ + } + if v, ok := gpu.Metrics.CumulativeTotal("DCGM_FI_DEV_ECC_SBE_VOL_TOTAL"); ok { + s.mb.RecordGpuDcgmEccErrorsDataPoint(now, v, metadata.AttributeGpuErrorTypeSbe) + } + if v, ok := gpu.Metrics.CumulativeTotal("DCGM_FI_DEV_ECC_DBE_VOL_TOTAL"); ok { + s.mb.RecordGpuDcgmEccErrorsDataPoint(now, v, metadata.AttributeGpuErrorTypeDbe) } // TODO: XID errors. // s.mb.RecordGpuDcgmXidErrorsDataPoint(now, metric.asInt64(), xid) s.mb.EmitForResource(metadata.WithResource(gpuResource)) } - return s.mb.Emit(), err + return s.mb.Emit(), nil } diff --git a/receiver/dcgmreceiver/scraper_gpu_test.go b/receiver/dcgmreceiver/scraper_gpu_test.go index 55f6c1aca..393bc1912 100644 --- a/receiver/dcgmreceiver/scraper_gpu_test.go +++ b/receiver/dcgmreceiver/scraper_gpu_test.go @@ -38,6 +38,19 @@ import ( "github.com/GoogleCloudPlatform/opentelemetry-operations-collector/receiver/dcgmreceiver/testprofilepause" ) +func collectScraperResult(t *testing.T, ctx context.Context, scraper *dcgmScraper) (pmetric.Metrics, error) { + for { + metrics, err := scraper.scrape(ctx) + assert.NoError(t, err) + if metrics.MetricCount() > 0 { + // We expect cumulative metrics to be missing on the first scrape. 
+ time.Sleep(scrapePollingInterval) + return scraper.scrape(ctx) + } + time.Sleep(scrapePollingInterval) + } +} + func TestScrapeWithGpuPresent(t *testing.T) { var settings receiver.CreateSettings settings.Logger = zaptest.NewLogger(t) @@ -48,20 +61,60 @@ func TestScrapeWithGpuPresent(t *testing.T) { err := scraper.start(context.Background(), componenttest.NewNopHost()) require.NoError(t, err) - metrics, err := scraper.scrape(context.Background()) + metrics, err := collectScraperResult(t, context.Background(), scraper) + assert.NoError(t, err) + + assert.NoError(t, scraper.stop(context.Background())) + + validateScraperResult(t, metrics) +} + +func TestScrapeCollectionInterval(t *testing.T) { + var settings receiver.CreateSettings + settings.Logger = zaptest.NewLogger(t) + + var fetchCount int + + realDcgmGetValuesSince := dcgmGetValuesSince + defer func() { dcgmGetValuesSince = realDcgmGetValuesSince }() + dcgmGetValuesSince = func(g dcgm.GroupHandle, f dcgm.FieldHandle, t time.Time) ([]dcgm.FieldValue_v2, time.Time, error) { + fetchCount++ + return realDcgmGetValuesSince(g, f, t) + } + + scraper := newDcgmScraper(createDefaultConfig().(*Config), settings) + require.NotNil(t, scraper) + + err := scraper.start(context.Background(), componenttest.NewNopHost()) + require.NoError(t, err) + + // We expect to scrape every maxKeepSamples * scrapePollingInterval / 2. + // Wait long enough that we expect three scrapes. 
+ const sleepTime = 3.5 * maxKeepSamples * scrapePollingInterval / 2 + + time.Sleep(sleepTime) + + metrics, err := collectScraperResult(t, context.Background(), scraper) assert.NoError(t, err) - require.NotNil(t, scraper.client) - require.NotEmpty(t, scraper.client.devicesModelName) - expectedMetrics := loadExpectedScraperMetrics(t, scraper.client.getDeviceModelName(0)) - validateScraperResult(t, metrics, expectedMetrics) + assert.NoError(t, scraper.stop(context.Background())) + + // We should have seen 1 initial scrape + 3 timed scrapes + 2 scrapes triggered by `collectScraperResult`. + assert.Less(t, fetchCount, 7, "too many fetches") + + validateScraperResult(t, metrics) } func TestScrapeWithDelayedDcgmService(t *testing.T) { realDcgmInit := dcgmInit defer func() { dcgmInit = realDcgmInit }() + failures := 2 dcgmInit = func(args ...string) (func(), error) { - return nil, fmt.Errorf("No DCGM client library *OR* No DCGM connection") + if failures > 0 { + failures-- + return nil, fmt.Errorf("No DCGM client library *OR* No DCGM connection") + } + return realDcgmInit(args...) 
} var settings receiver.CreateSettings @@ -70,27 +123,21 @@ func TestScrapeWithDelayedDcgmService(t *testing.T) { scraper := newDcgmScraper(createDefaultConfig().(*Config), settings) require.NotNil(t, scraper) + scraper.initRetryDelay = 0 // retry immediately + err := scraper.start(context.Background(), componenttest.NewNopHost()) require.NoError(t, err) - metrics, err := scraper.scrape(context.Background()) - assert.NoError(t, err) // If failed to init DCGM, should have no error - assert.Equal(t, 0, metrics.MetricCount()) - - // Scrape again with DCGM not available - metrics, err = scraper.scrape(context.Background()) + // Simulate DCGM becomes available after 3 attempts + // scrape should block until DCGM is available + metrics, err := collectScraperResult(t, context.Background(), scraper) assert.NoError(t, err) - assert.Equal(t, 0, metrics.MetricCount()) - // Simulate DCGM becomes available - dcgmInit = realDcgmInit - metrics, err = scraper.scrape(context.Background()) - assert.NoError(t, err) + assert.NoError(t, scraper.stop(context.Background())) + + assert.Equal(t, 0, failures) - require.NotNil(t, scraper.client) - require.NotEmpty(t, scraper.client.devicesModelName) - expectedMetrics := loadExpectedScraperMetrics(t, scraper.client.getDeviceModelName(0)) - validateScraperResult(t, metrics, expectedMetrics) + validateScraperResult(t, metrics) } func TestScrapeWithEmptyMetricsConfig(t *testing.T) { @@ -164,13 +211,15 @@ func TestScrapeWithEmptyMetricsConfig(t *testing.T) { metrics, err := scraper.scrape(context.Background()) assert.NoError(t, err) assert.Equal(t, 0, metrics.MetricCount()) + + assert.NoError(t, scraper.stop(context.Background())) } func TestScrapeOnPollingError(t *testing.T) { - realDcgmGetLatestValuesForFields := dcgmGetLatestValuesForFields - defer func() { dcgmGetLatestValuesForFields = realDcgmGetLatestValuesForFields }() - dcgmGetLatestValuesForFields = func(gpu uint, fields []dcgm.Short) ([]dcgm.FieldValue_v1, error) { - return nil, 
fmt.Errorf("DCGM polling error") + realDcgmGetValuesSince := dcgmGetValuesSince + defer func() { dcgmGetValuesSince = realDcgmGetValuesSince }() + dcgmGetValuesSince = func(_ dcgm.GroupHandle, _ dcgm.FieldHandle, _ time.Time) ([]dcgm.FieldValue_v2, time.Time, error) { + return nil, time.Time{}, fmt.Errorf("DCGM polling error") } var settings receiver.CreateSettings @@ -184,8 +233,10 @@ func TestScrapeOnPollingError(t *testing.T) { metrics, err := scraper.scrape(context.Background()) - assert.Error(t, err) + assert.NoError(t, err) assert.Equal(t, 0, metrics.MetricCount()) + + assert.NoError(t, scraper.stop(context.Background())) } func TestScrapeOnProfilingPaused(t *testing.T) { @@ -198,24 +249,24 @@ func TestScrapeOnProfilingPaused(t *testing.T) { scraper := newDcgmScraper(config, settings) require.NotNil(t, scraper) - defer func() { testprofilepause.ResumeProfilingMetrics() }() - err := testprofilepause.PauseProfilingMetrics() - if err != nil { - if errors.Is(err, testprofilepause.FeatureNotSupportedError) { - t.Skipf("Pausing profiling not supported") - } else { - t.Errorf("Pausing profiling failed with error %v", err) - } + defer testprofilepause.ResumeProfilingMetrics(config.TCPAddrConfig.Endpoint) + err := testprofilepause.PauseProfilingMetrics(config.TCPAddrConfig.Endpoint) + if errors.Is(err, testprofilepause.FeatureNotSupportedError) { + t.Skipf("Pausing profiling not supported") + } else if err != nil { + t.Fatalf("Pausing profiling failed with error %v", err) } time.Sleep(20 * time.Millisecond) err = scraper.start(context.Background(), componenttest.NewNopHost()) require.NoError(t, err) - metrics, err := scraper.scrape(context.Background()) + metrics, err := collectScraperResult(t, context.Background(), scraper) assert.NoError(t, err) + assert.NoError(t, scraper.stop(context.Background())) + expectedMetrics := []string{ "gpu.dcgm.utilization", "gpu.dcgm.codec.decoder.utilization", @@ -229,6 +280,8 @@ func TestScrapeOnProfilingPaused(t *testing.T) { 
"gpu.dcgm.ecc_errors", } + require.Greater(t, metrics.ResourceMetrics().Len(), 0) + ilms := metrics.ResourceMetrics().At(0).ScopeMetrics() require.Equal(t, 1, ilms.Len()) @@ -286,8 +339,8 @@ func loadExpectedScraperMetrics(t *testing.T, model string) map[string]int { "DCGM_FI_DEV_ECC_SBE_VOL_TOTAL": "gpu.dcgm.ecc_errors", "DCGM_FI_DEV_ECC_DBE_VOL_TOTAL": "gpu.dcgm.ecc_errors", } - expectedReceiverMetrics := LoadExpectedMetrics(t, model) - for _, em := range expectedReceiverMetrics { + supportedFields := LoadExpectedMetrics(t, model) + for _, em := range supportedFields.SupportedFields { scraperMetric := receiverMetricNameToScraperMetricName[em] if scraperMetric != "" { expectedMetrics[scraperMetric] += 1 @@ -297,8 +350,14 @@ func loadExpectedScraperMetrics(t *testing.T, model string) map[string]int { return expectedMetrics } -func validateScraperResult(t *testing.T, metrics pmetric.Metrics, expectedMetrics map[string]int) { +func validateScraperResult(t *testing.T, metrics pmetric.Metrics) { t.Helper() + rms := metrics.ResourceMetrics() + require.NotEmpty(t, rms.Len(), "missing ResourceMetrics") + modelValue, ok := rms.At(0).Resource().Attributes().Get("gpu.model") + require.True(t, ok, "missing gpu.model resource attribute") + expectedMetrics := loadExpectedScraperMetrics(t, modelValue.Str()) + metricWasSeen := make(map[string]bool) expectedDataPointCount := 0 for metric, expectedMetricDataPoints := range expectedMetrics { @@ -306,8 +365,8 @@ func validateScraperResult(t *testing.T, metrics pmetric.Metrics, expectedMetric expectedDataPointCount += expectedMetricDataPoints } - assert.LessOrEqual(t, len(expectedMetrics), metrics.MetricCount()) - assert.LessOrEqual(t, expectedDataPointCount, metrics.DataPointCount()) + assert.LessOrEqual(t, len(expectedMetrics), metrics.MetricCount(), "metric count") + assert.LessOrEqual(t, expectedDataPointCount, metrics.DataPointCount(), "data point count") r := metrics.ResourceMetrics().At(0).Resource() assert.Contains(t, 
r.Attributes().AsRaw(), "gpu.number") diff --git a/receiver/dcgmreceiver/testdata/NVIDIA_A100-SXM4-40GB.yaml b/receiver/dcgmreceiver/testdata/NVIDIA_A100-SXM4-40GB.yaml index 71585345e..30b24a858 100644 --- a/receiver/dcgmreceiver/testdata/NVIDIA_A100-SXM4-40GB.yaml +++ b/receiver/dcgmreceiver/testdata/NVIDIA_A100-SXM4-40GB.yaml @@ -1,35 +1,35 @@ model: NVIDIA A100-SXM4-40GB supported_fields: - - DCGM_FI_PROF_GR_ENGINE_ACTIVE - - DCGM_FI_DEV_GPU_UTIL - - DCGM_FI_PROF_SM_ACTIVE - - DCGM_FI_PROF_PIPE_TENSOR_ACTIVE - - DCGM_FI_PROF_PIPE_FP64_ACTIVE - - DCGM_FI_PROF_PIPE_FP32_ACTIVE - - DCGM_FI_PROF_PIPE_FP16_ACTIVE - - DCGM_FI_DEV_ENC_UTIL + - DCGM_FI_DEV_BOARD_LIMIT_VIOLATION - DCGM_FI_DEV_DEC_UTIL + - DCGM_FI_DEV_ECC_DBE_VOL_TOTAL + - DCGM_FI_DEV_ECC_SBE_VOL_TOTAL + - DCGM_FI_DEV_ENC_UTIL - DCGM_FI_DEV_FB_FREE - - DCGM_FI_DEV_FB_USED - DCGM_FI_DEV_FB_RESERVED - - DCGM_FI_PROF_DRAM_ACTIVE + - DCGM_FI_DEV_FB_USED + - DCGM_FI_DEV_GPU_TEMP + - DCGM_FI_DEV_GPU_UTIL + - DCGM_FI_DEV_LOW_UTIL_VIOLATION - DCGM_FI_DEV_MEM_COPY_UTIL - - DCGM_FI_PROF_PCIE_TX_BYTES - - DCGM_FI_PROF_PCIE_RX_BYTES - - DCGM_FI_PROF_NVLINK_TX_BYTES - - DCGM_FI_PROF_NVLINK_RX_BYTES - - DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION - DCGM_FI_DEV_POWER_USAGE - - DCGM_FI_DEV_GPU_TEMP - - DCGM_FI_DEV_SM_CLOCK - DCGM_FI_DEV_POWER_VIOLATION - - DCGM_FI_DEV_THERMAL_VIOLATION - - DCGM_FI_DEV_SYNC_BOOST_VIOLATION - - DCGM_FI_DEV_BOARD_LIMIT_VIOLATION - - DCGM_FI_DEV_LOW_UTIL_VIOLATION - DCGM_FI_DEV_RELIABILITY_VIOLATION + - DCGM_FI_DEV_SM_CLOCK + - DCGM_FI_DEV_SYNC_BOOST_VIOLATION + - DCGM_FI_DEV_THERMAL_VIOLATION - DCGM_FI_DEV_TOTAL_APP_CLOCKS_VIOLATION - DCGM_FI_DEV_TOTAL_BASE_CLOCKS_VIOLATION - - DCGM_FI_DEV_ECC_SBE_VOL_TOTAL - - DCGM_FI_DEV_ECC_DBE_VOL_TOTAL + - DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION + - DCGM_FI_PROF_DRAM_ACTIVE + - DCGM_FI_PROF_GR_ENGINE_ACTIVE + - DCGM_FI_PROF_NVLINK_RX_BYTES + - DCGM_FI_PROF_NVLINK_TX_BYTES + - DCGM_FI_PROF_PCIE_RX_BYTES + - DCGM_FI_PROF_PCIE_TX_BYTES + - 
DCGM_FI_PROF_PIPE_FP16_ACTIVE + - DCGM_FI_PROF_PIPE_FP32_ACTIVE + - DCGM_FI_PROF_PIPE_FP64_ACTIVE + - DCGM_FI_PROF_PIPE_TENSOR_ACTIVE + - DCGM_FI_PROF_SM_ACTIVE unsupported_fields: [] diff --git a/receiver/dcgmreceiver/testdata/NVIDIA_H100_80GB_HBM3.yaml b/receiver/dcgmreceiver/testdata/NVIDIA_H100_80GB_HBM3.yaml index 8874e9331..4c9dd91b4 100644 --- a/receiver/dcgmreceiver/testdata/NVIDIA_H100_80GB_HBM3.yaml +++ b/receiver/dcgmreceiver/testdata/NVIDIA_H100_80GB_HBM3.yaml @@ -1,35 +1,35 @@ model: NVIDIA H100 80GB HBM3 supported_fields: - - DCGM_FI_PROF_GR_ENGINE_ACTIVE - - DCGM_FI_DEV_GPU_UTIL - - DCGM_FI_PROF_SM_ACTIVE - - DCGM_FI_PROF_PIPE_TENSOR_ACTIVE - - DCGM_FI_PROF_PIPE_FP64_ACTIVE - - DCGM_FI_PROF_PIPE_FP32_ACTIVE - - DCGM_FI_PROF_PIPE_FP16_ACTIVE - - DCGM_FI_DEV_ENC_UTIL + - DCGM_FI_DEV_BOARD_LIMIT_VIOLATION - DCGM_FI_DEV_DEC_UTIL + - DCGM_FI_DEV_ECC_DBE_VOL_TOTAL + - DCGM_FI_DEV_ECC_SBE_VOL_TOTAL + - DCGM_FI_DEV_ENC_UTIL - DCGM_FI_DEV_FB_FREE - - DCGM_FI_DEV_FB_USED - DCGM_FI_DEV_FB_RESERVED - - DCGM_FI_PROF_DRAM_ACTIVE + - DCGM_FI_DEV_FB_USED + - DCGM_FI_DEV_GPU_TEMP + - DCGM_FI_DEV_GPU_UTIL + - DCGM_FI_DEV_LOW_UTIL_VIOLATION - DCGM_FI_DEV_MEM_COPY_UTIL - - DCGM_FI_PROF_PCIE_TX_BYTES - - DCGM_FI_PROF_PCIE_RX_BYTES - - DCGM_FI_PROF_NVLINK_TX_BYTES - - DCGM_FI_PROF_NVLINK_RX_BYTES - - DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION - DCGM_FI_DEV_POWER_USAGE - - DCGM_FI_DEV_GPU_TEMP - - DCGM_FI_DEV_SM_CLOCK - DCGM_FI_DEV_POWER_VIOLATION - - DCGM_FI_DEV_THERMAL_VIOLATION - - DCGM_FI_DEV_SYNC_BOOST_VIOLATION - - DCGM_FI_DEV_BOARD_LIMIT_VIOLATION - - DCGM_FI_DEV_LOW_UTIL_VIOLATION - DCGM_FI_DEV_RELIABILITY_VIOLATION + - DCGM_FI_DEV_SM_CLOCK + - DCGM_FI_DEV_SYNC_BOOST_VIOLATION + - DCGM_FI_DEV_THERMAL_VIOLATION - DCGM_FI_DEV_TOTAL_APP_CLOCKS_VIOLATION - DCGM_FI_DEV_TOTAL_BASE_CLOCKS_VIOLATION - - DCGM_FI_DEV_ECC_SBE_VOL_TOTAL - - DCGM_FI_DEV_ECC_DBE_VOL_TOTAL + - DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION + - DCGM_FI_PROF_DRAM_ACTIVE + - DCGM_FI_PROF_GR_ENGINE_ACTIVE + - 
DCGM_FI_PROF_NVLINK_RX_BYTES + - DCGM_FI_PROF_NVLINK_TX_BYTES + - DCGM_FI_PROF_PCIE_RX_BYTES + - DCGM_FI_PROF_PCIE_TX_BYTES + - DCGM_FI_PROF_PIPE_FP16_ACTIVE + - DCGM_FI_PROF_PIPE_FP32_ACTIVE + - DCGM_FI_PROF_PIPE_FP64_ACTIVE + - DCGM_FI_PROF_PIPE_TENSOR_ACTIVE + - DCGM_FI_PROF_SM_ACTIVE unsupported_fields: [] diff --git a/receiver/dcgmreceiver/testdata/NVIDIA_L4.yaml b/receiver/dcgmreceiver/testdata/NVIDIA_L4.yaml index faf59ac8a..16ba2008d 100644 --- a/receiver/dcgmreceiver/testdata/NVIDIA_L4.yaml +++ b/receiver/dcgmreceiver/testdata/NVIDIA_L4.yaml @@ -1,35 +1,35 @@ model: NVIDIA L4 supported_fields: - - DCGM_FI_PROF_GR_ENGINE_ACTIVE - - DCGM_FI_DEV_GPU_UTIL - - DCGM_FI_PROF_SM_ACTIVE - - DCGM_FI_PROF_PIPE_TENSOR_ACTIVE - - DCGM_FI_PROF_PIPE_FP32_ACTIVE - - DCGM_FI_PROF_PIPE_FP16_ACTIVE - - DCGM_FI_DEV_ENC_UTIL + - DCGM_FI_DEV_BOARD_LIMIT_VIOLATION - DCGM_FI_DEV_DEC_UTIL + - DCGM_FI_DEV_ECC_DBE_VOL_TOTAL + - DCGM_FI_DEV_ECC_SBE_VOL_TOTAL + - DCGM_FI_DEV_ENC_UTIL - DCGM_FI_DEV_FB_FREE - - DCGM_FI_DEV_FB_USED - DCGM_FI_DEV_FB_RESERVED - - DCGM_FI_PROF_DRAM_ACTIVE + - DCGM_FI_DEV_FB_USED + - DCGM_FI_DEV_GPU_TEMP + - DCGM_FI_DEV_GPU_UTIL + - DCGM_FI_DEV_LOW_UTIL_VIOLATION - DCGM_FI_DEV_MEM_COPY_UTIL - - DCGM_FI_PROF_PCIE_TX_BYTES - - DCGM_FI_PROF_PCIE_RX_BYTES - - DCGM_FI_PROF_NVLINK_TX_BYTES - - DCGM_FI_PROF_NVLINK_RX_BYTES - - DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION - DCGM_FI_DEV_POWER_USAGE - - DCGM_FI_DEV_GPU_TEMP - - DCGM_FI_DEV_SM_CLOCK - DCGM_FI_DEV_POWER_VIOLATION - - DCGM_FI_DEV_THERMAL_VIOLATION - - DCGM_FI_DEV_SYNC_BOOST_VIOLATION - - DCGM_FI_DEV_BOARD_LIMIT_VIOLATION - - DCGM_FI_DEV_LOW_UTIL_VIOLATION - DCGM_FI_DEV_RELIABILITY_VIOLATION + - DCGM_FI_DEV_SM_CLOCK + - DCGM_FI_DEV_SYNC_BOOST_VIOLATION + - DCGM_FI_DEV_THERMAL_VIOLATION - DCGM_FI_DEV_TOTAL_APP_CLOCKS_VIOLATION - DCGM_FI_DEV_TOTAL_BASE_CLOCKS_VIOLATION - - DCGM_FI_DEV_ECC_SBE_VOL_TOTAL - - DCGM_FI_DEV_ECC_DBE_VOL_TOTAL + - DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION + - DCGM_FI_PROF_DRAM_ACTIVE + - 
DCGM_FI_PROF_GR_ENGINE_ACTIVE + - DCGM_FI_PROF_NVLINK_RX_BYTES + - DCGM_FI_PROF_NVLINK_TX_BYTES + - DCGM_FI_PROF_PCIE_RX_BYTES + - DCGM_FI_PROF_PCIE_TX_BYTES + - DCGM_FI_PROF_PIPE_FP16_ACTIVE + - DCGM_FI_PROF_PIPE_FP32_ACTIVE + - DCGM_FI_PROF_PIPE_TENSOR_ACTIVE + - DCGM_FI_PROF_SM_ACTIVE unsupported_fields: - DCGM_FI_PROF_PIPE_FP64_ACTIVE diff --git a/receiver/dcgmreceiver/testdata/Tesla_P100-PCIE-16GB.yaml b/receiver/dcgmreceiver/testdata/Tesla_P100-PCIE-16GB.yaml index c1df656c8..f2986c873 100644 --- a/receiver/dcgmreceiver/testdata/Tesla_P100-PCIE-16GB.yaml +++ b/receiver/dcgmreceiver/testdata/Tesla_P100-PCIE-16GB.yaml @@ -1,35 +1,35 @@ model: Tesla P100-PCIE-16GB supported_fields: - - DCGM_FI_DEV_GPU_UTIL - - DCGM_FI_DEV_ENC_UTIL - DCGM_FI_DEV_DEC_UTIL + - DCGM_FI_DEV_ECC_DBE_VOL_TOTAL + - DCGM_FI_DEV_ECC_SBE_VOL_TOTAL + - DCGM_FI_DEV_ENC_UTIL - DCGM_FI_DEV_FB_FREE - - DCGM_FI_DEV_FB_USED - DCGM_FI_DEV_FB_RESERVED + - DCGM_FI_DEV_FB_USED + - DCGM_FI_DEV_GPU_TEMP + - DCGM_FI_DEV_GPU_UTIL - DCGM_FI_DEV_MEM_COPY_UTIL - DCGM_FI_DEV_POWER_USAGE - - DCGM_FI_DEV_GPU_TEMP - - DCGM_FI_DEV_SM_CLOCK - DCGM_FI_DEV_POWER_VIOLATION - - DCGM_FI_DEV_THERMAL_VIOLATION + - DCGM_FI_DEV_SM_CLOCK - DCGM_FI_DEV_SYNC_BOOST_VIOLATION - - DCGM_FI_DEV_ECC_SBE_VOL_TOTAL - - DCGM_FI_DEV_ECC_DBE_VOL_TOTAL + - DCGM_FI_DEV_THERMAL_VIOLATION unsupported_fields: - - DCGM_FI_PROF_GR_ENGINE_ACTIVE - - DCGM_FI_PROF_SM_ACTIVE - - DCGM_FI_PROF_PIPE_TENSOR_ACTIVE - - DCGM_FI_PROF_PIPE_FP64_ACTIVE - - DCGM_FI_PROF_PIPE_FP32_ACTIVE - - DCGM_FI_PROF_PIPE_FP16_ACTIVE - - DCGM_FI_PROF_DRAM_ACTIVE - - DCGM_FI_PROF_PCIE_TX_BYTES - - DCGM_FI_PROF_PCIE_RX_BYTES - - DCGM_FI_PROF_NVLINK_TX_BYTES - - DCGM_FI_PROF_NVLINK_RX_BYTES - - DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION - DCGM_FI_DEV_BOARD_LIMIT_VIOLATION - DCGM_FI_DEV_LOW_UTIL_VIOLATION - DCGM_FI_DEV_RELIABILITY_VIOLATION - DCGM_FI_DEV_TOTAL_APP_CLOCKS_VIOLATION - DCGM_FI_DEV_TOTAL_BASE_CLOCKS_VIOLATION + - DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION + - 
DCGM_FI_PROF_DRAM_ACTIVE + - DCGM_FI_PROF_GR_ENGINE_ACTIVE + - DCGM_FI_PROF_NVLINK_RX_BYTES + - DCGM_FI_PROF_NVLINK_TX_BYTES + - DCGM_FI_PROF_PCIE_RX_BYTES + - DCGM_FI_PROF_PCIE_TX_BYTES + - DCGM_FI_PROF_PIPE_FP16_ACTIVE + - DCGM_FI_PROF_PIPE_FP32_ACTIVE + - DCGM_FI_PROF_PIPE_FP64_ACTIVE + - DCGM_FI_PROF_PIPE_TENSOR_ACTIVE + - DCGM_FI_PROF_SM_ACTIVE diff --git a/receiver/dcgmreceiver/testdata/Tesla_P4.yaml b/receiver/dcgmreceiver/testdata/Tesla_P4.yaml index aea5cf2dc..052302234 100644 --- a/receiver/dcgmreceiver/testdata/Tesla_P4.yaml +++ b/receiver/dcgmreceiver/testdata/Tesla_P4.yaml @@ -1,35 +1,35 @@ model: Tesla P4 supported_fields: - - DCGM_FI_DEV_GPU_UTIL - - DCGM_FI_DEV_ENC_UTIL - DCGM_FI_DEV_DEC_UTIL + - DCGM_FI_DEV_ECC_DBE_VOL_TOTAL + - DCGM_FI_DEV_ECC_SBE_VOL_TOTAL + - DCGM_FI_DEV_ENC_UTIL - DCGM_FI_DEV_FB_FREE - - DCGM_FI_DEV_FB_USED - DCGM_FI_DEV_FB_RESERVED + - DCGM_FI_DEV_FB_USED + - DCGM_FI_DEV_GPU_TEMP + - DCGM_FI_DEV_GPU_UTIL - DCGM_FI_DEV_MEM_COPY_UTIL - - DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION - DCGM_FI_DEV_POWER_USAGE - - DCGM_FI_DEV_GPU_TEMP - - DCGM_FI_DEV_SM_CLOCK - DCGM_FI_DEV_POWER_VIOLATION - - DCGM_FI_DEV_THERMAL_VIOLATION + - DCGM_FI_DEV_SM_CLOCK - DCGM_FI_DEV_SYNC_BOOST_VIOLATION - - DCGM_FI_DEV_ECC_SBE_VOL_TOTAL - - DCGM_FI_DEV_ECC_DBE_VOL_TOTAL + - DCGM_FI_DEV_THERMAL_VIOLATION + - DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION unsupported_fields: - - DCGM_FI_PROF_GR_ENGINE_ACTIVE - - DCGM_FI_PROF_SM_ACTIVE - - DCGM_FI_PROF_PIPE_TENSOR_ACTIVE - - DCGM_FI_PROF_PIPE_FP64_ACTIVE - - DCGM_FI_PROF_PIPE_FP32_ACTIVE - - DCGM_FI_PROF_PIPE_FP16_ACTIVE - - DCGM_FI_PROF_DRAM_ACTIVE - - DCGM_FI_PROF_PCIE_TX_BYTES - - DCGM_FI_PROF_PCIE_RX_BYTES - - DCGM_FI_PROF_NVLINK_TX_BYTES - - DCGM_FI_PROF_NVLINK_RX_BYTES - DCGM_FI_DEV_BOARD_LIMIT_VIOLATION - DCGM_FI_DEV_LOW_UTIL_VIOLATION - DCGM_FI_DEV_RELIABILITY_VIOLATION - DCGM_FI_DEV_TOTAL_APP_CLOCKS_VIOLATION - DCGM_FI_DEV_TOTAL_BASE_CLOCKS_VIOLATION + - DCGM_FI_PROF_DRAM_ACTIVE + - DCGM_FI_PROF_GR_ENGINE_ACTIVE + - 
DCGM_FI_PROF_NVLINK_RX_BYTES + - DCGM_FI_PROF_NVLINK_TX_BYTES + - DCGM_FI_PROF_PCIE_RX_BYTES + - DCGM_FI_PROF_PCIE_TX_BYTES + - DCGM_FI_PROF_PIPE_FP16_ACTIVE + - DCGM_FI_PROF_PIPE_FP32_ACTIVE + - DCGM_FI_PROF_PIPE_FP64_ACTIVE + - DCGM_FI_PROF_PIPE_TENSOR_ACTIVE + - DCGM_FI_PROF_SM_ACTIVE diff --git a/receiver/dcgmreceiver/testdata/Tesla_T4.yaml b/receiver/dcgmreceiver/testdata/Tesla_T4.yaml index 3ab8dba88..e63ae2d89 100644 --- a/receiver/dcgmreceiver/testdata/Tesla_T4.yaml +++ b/receiver/dcgmreceiver/testdata/Tesla_T4.yaml @@ -1,35 +1,35 @@ model: Tesla T4 supported_fields: - - DCGM_FI_PROF_GR_ENGINE_ACTIVE - - DCGM_FI_DEV_GPU_UTIL - - DCGM_FI_PROF_SM_ACTIVE - - DCGM_FI_PROF_PIPE_TENSOR_ACTIVE - - DCGM_FI_PROF_PIPE_FP64_ACTIVE - - DCGM_FI_PROF_PIPE_FP32_ACTIVE - - DCGM_FI_PROF_PIPE_FP16_ACTIVE - - DCGM_FI_DEV_ENC_UTIL + - DCGM_FI_DEV_BOARD_LIMIT_VIOLATION - DCGM_FI_DEV_DEC_UTIL + - DCGM_FI_DEV_ECC_DBE_VOL_TOTAL + - DCGM_FI_DEV_ECC_SBE_VOL_TOTAL + - DCGM_FI_DEV_ENC_UTIL - DCGM_FI_DEV_FB_FREE - - DCGM_FI_DEV_FB_USED - DCGM_FI_DEV_FB_RESERVED - - DCGM_FI_PROF_DRAM_ACTIVE + - DCGM_FI_DEV_FB_USED + - DCGM_FI_DEV_GPU_TEMP + - DCGM_FI_DEV_GPU_UTIL + - DCGM_FI_DEV_LOW_UTIL_VIOLATION - DCGM_FI_DEV_MEM_COPY_UTIL - - DCGM_FI_PROF_PCIE_TX_BYTES - - DCGM_FI_PROF_PCIE_RX_BYTES - - DCGM_FI_PROF_NVLINK_TX_BYTES - - DCGM_FI_PROF_NVLINK_RX_BYTES - - DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION - DCGM_FI_DEV_POWER_USAGE - - DCGM_FI_DEV_GPU_TEMP - - DCGM_FI_DEV_SM_CLOCK - DCGM_FI_DEV_POWER_VIOLATION - - DCGM_FI_DEV_THERMAL_VIOLATION - - DCGM_FI_DEV_SYNC_BOOST_VIOLATION - - DCGM_FI_DEV_BOARD_LIMIT_VIOLATION - - DCGM_FI_DEV_LOW_UTIL_VIOLATION - DCGM_FI_DEV_RELIABILITY_VIOLATION + - DCGM_FI_DEV_SM_CLOCK + - DCGM_FI_DEV_SYNC_BOOST_VIOLATION + - DCGM_FI_DEV_THERMAL_VIOLATION - DCGM_FI_DEV_TOTAL_APP_CLOCKS_VIOLATION - DCGM_FI_DEV_TOTAL_BASE_CLOCKS_VIOLATION - - DCGM_FI_DEV_ECC_SBE_VOL_TOTAL - - DCGM_FI_DEV_ECC_DBE_VOL_TOTAL + - DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION + - DCGM_FI_PROF_DRAM_ACTIVE + - 
DCGM_FI_PROF_GR_ENGINE_ACTIVE + - DCGM_FI_PROF_NVLINK_RX_BYTES + - DCGM_FI_PROF_NVLINK_TX_BYTES + - DCGM_FI_PROF_PCIE_RX_BYTES + - DCGM_FI_PROF_PCIE_TX_BYTES + - DCGM_FI_PROF_PIPE_FP16_ACTIVE + - DCGM_FI_PROF_PIPE_FP32_ACTIVE + - DCGM_FI_PROF_PIPE_FP64_ACTIVE + - DCGM_FI_PROF_PIPE_TENSOR_ACTIVE + - DCGM_FI_PROF_SM_ACTIVE unsupported_fields: [] diff --git a/receiver/dcgmreceiver/testdata/Tesla_V100-SXM2-16GB.yaml b/receiver/dcgmreceiver/testdata/Tesla_V100-SXM2-16GB.yaml index ef5321980..903ed6130 100644 --- a/receiver/dcgmreceiver/testdata/Tesla_V100-SXM2-16GB.yaml +++ b/receiver/dcgmreceiver/testdata/Tesla_V100-SXM2-16GB.yaml @@ -1,35 +1,35 @@ model: Tesla V100-SXM2-16GB supported_fields: - - DCGM_FI_PROF_GR_ENGINE_ACTIVE - - DCGM_FI_DEV_GPU_UTIL - - DCGM_FI_PROF_SM_ACTIVE - - DCGM_FI_PROF_PIPE_TENSOR_ACTIVE - - DCGM_FI_PROF_PIPE_FP64_ACTIVE - - DCGM_FI_PROF_PIPE_FP32_ACTIVE - - DCGM_FI_PROF_PIPE_FP16_ACTIVE - - DCGM_FI_DEV_ENC_UTIL + - DCGM_FI_DEV_BOARD_LIMIT_VIOLATION - DCGM_FI_DEV_DEC_UTIL + - DCGM_FI_DEV_ECC_DBE_VOL_TOTAL + - DCGM_FI_DEV_ECC_SBE_VOL_TOTAL + - DCGM_FI_DEV_ENC_UTIL - DCGM_FI_DEV_FB_FREE - - DCGM_FI_DEV_FB_USED - DCGM_FI_DEV_FB_RESERVED - - DCGM_FI_PROF_DRAM_ACTIVE + - DCGM_FI_DEV_FB_USED + - DCGM_FI_DEV_GPU_TEMP + - DCGM_FI_DEV_GPU_UTIL + - DCGM_FI_DEV_LOW_UTIL_VIOLATION - DCGM_FI_DEV_MEM_COPY_UTIL - - DCGM_FI_PROF_PCIE_TX_BYTES - - DCGM_FI_PROF_PCIE_RX_BYTES - - DCGM_FI_PROF_NVLINK_TX_BYTES - - DCGM_FI_PROF_NVLINK_RX_BYTES - - DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION - DCGM_FI_DEV_POWER_USAGE - - DCGM_FI_DEV_GPU_TEMP - - DCGM_FI_DEV_SM_CLOCK - DCGM_FI_DEV_POWER_VIOLATION - - DCGM_FI_DEV_THERMAL_VIOLATION - - DCGM_FI_DEV_SYNC_BOOST_VIOLATION - - DCGM_FI_DEV_BOARD_LIMIT_VIOLATION - - DCGM_FI_DEV_LOW_UTIL_VIOLATION - DCGM_FI_DEV_RELIABILITY_VIOLATION + - DCGM_FI_DEV_SM_CLOCK + - DCGM_FI_DEV_SYNC_BOOST_VIOLATION + - DCGM_FI_DEV_THERMAL_VIOLATION - DCGM_FI_DEV_TOTAL_APP_CLOCKS_VIOLATION - DCGM_FI_DEV_TOTAL_BASE_CLOCKS_VIOLATION - - 
DCGM_FI_DEV_ECC_SBE_VOL_TOTAL - - DCGM_FI_DEV_ECC_DBE_VOL_TOTAL + - DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION + - DCGM_FI_PROF_DRAM_ACTIVE + - DCGM_FI_PROF_GR_ENGINE_ACTIVE + - DCGM_FI_PROF_NVLINK_RX_BYTES + - DCGM_FI_PROF_NVLINK_TX_BYTES + - DCGM_FI_PROF_PCIE_RX_BYTES + - DCGM_FI_PROF_PCIE_TX_BYTES + - DCGM_FI_PROF_PIPE_FP16_ACTIVE + - DCGM_FI_PROF_PIPE_FP32_ACTIVE + - DCGM_FI_PROF_PIPE_FP64_ACTIVE + - DCGM_FI_PROF_PIPE_TENSOR_ACTIVE + - DCGM_FI_PROF_SM_ACTIVE unsupported_fields: [] diff --git a/receiver/dcgmreceiver/testprofilepause/test_profile_pause.go b/receiver/dcgmreceiver/testprofilepause/test_profile_pause.go index 1621cc3ed..3700382e9 100644 --- a/receiver/dcgmreceiver/testprofilepause/test_profile_pause.go +++ b/receiver/dcgmreceiver/testprofilepause/test_profile_pause.go @@ -61,10 +61,15 @@ var initErrors = func() { } } -func PauseProfilingMetrics() error { +func PauseProfilingMetrics(endpoint string) error { initErrors() + cleanup, err := dcgm.Init(dcgm.Standalone, endpoint, "0") + if err != nil { + return err + } + defer cleanup() result := C.dcgmProfPause(handle.handle) - err := errorString(result) + err = errorString(result) if err != nil { fmt.Printf("CUDA version %d\n", dcgm.DCGM_FI_CUDA_DRIVER_VERSION) fmt.Printf("Failed to pause profiling (%v)\n", err) @@ -72,10 +77,15 @@ func PauseProfilingMetrics() error { return err } -func ResumeProfilingMetrics() error { +func ResumeProfilingMetrics(endpoint string) error { initErrors() + cleanup, err := dcgm.Init(dcgm.Standalone, endpoint, "0") + if err != nil { + return err + } + defer cleanup() result := C.dcgmProfResume(handle.handle) - err := errorString(result) + err = errorString(result) if err != nil { fmt.Printf("Failed to resume profiling (%v)\n", err) } diff --git a/receiver/dcgmreceiver/util.go b/receiver/dcgmreceiver/util.go index c3cf19883..b33317531 100644 --- a/receiver/dcgmreceiver/util.go +++ b/receiver/dcgmreceiver/util.go @@ -19,100 +19,98 @@ package dcgmreceiver import ( "fmt" - "time" 
"github.com/NVIDIA/go-dcgm/pkg/dcgm" ) -var nowUnixMicro = func() int64 { return time.Now().UnixNano() / 1e3 } - -// rateIntegrator converts timestamped values that represent rates into -// cumulative values. It assumes the rate stays constant since the last -// timestamp. -type rateIntegrator[V int64 | float64] struct { - lastTimestamp int64 - aggregatedRateUs V // the integration of the rate over microsecond timestamps. +// For each metric, we need to track: +type metricStats struct { + // Timestamp (µs) + // Last value (for gauge metrics), as int64 or double + lastFieldValue *dcgm.FieldValue_v2 + // Integrated rate (always int), as {unit-seconds,unit-microseconds} + // This is intended for metrics that have a per-second unit, such as By/s. + // The metric value is multiplied by the timestamp delta, producing us.By/s in integratedRateMicroseconds + // When that overflows past 1e6, the overflow is put in integratedRateSeconds, which is in units of s.By/s, or just By. + integratedRateSeconds int64 + integratedRateMicroseconds int64 + // Cumulative value (always int) + initialCumulativeValue int64 + cumulativeValue int64 +} + +func asInt64(fieldValue dcgm.FieldValue_v2) (int64, bool) { + // TODO: dcgm's Float64 and Int64 use undefined behavior + switch fieldValue.FieldType { + case dcgm.DCGM_FT_DOUBLE: + return int64(fieldValue.Float64()), true + case dcgm.DCGM_FT_INT64: + return fieldValue.Int64(), true + } + return 0, false } -func (ri *rateIntegrator[V]) Reset() { - ri.lastTimestamp = nowUnixMicro() - ri.aggregatedRateUs = V(0) +func asFloat64(fieldValue dcgm.FieldValue_v2) (float64, bool) { + switch fieldValue.FieldType { + case dcgm.DCGM_FT_DOUBLE: + return fieldValue.Float64(), true + case dcgm.DCGM_FT_INT64: + return float64(fieldValue.Int64()), true + } + return 0, false } -func (ri *rateIntegrator[V]) Update(ts int64, v V) { - // Drop stale points. 
- if ts <= ri.lastTimestamp { +func (m *metricStats) Update(fieldValue dcgm.FieldValue_v2) { + ts := fieldValue.Ts + intValue, intOk := asInt64(fieldValue) + if !intOk { return } - // v is the rate per second, and timestamps are in microseconds, so the - // delta will be 1e6 times the actual increment. - ri.aggregatedRateUs += v * V(ts-ri.lastTimestamp) - ri.lastTimestamp = ts -} + if m.lastFieldValue == nil { + m.initialCumulativeValue = intValue + } else { + if m.lastFieldValue.Ts >= ts { + return + } + m.cumulativeValue = intValue - m.initialCumulativeValue -func (ri *rateIntegrator[V]) Value() (int64, V) { - return ri.lastTimestamp, ri.aggregatedRateUs / V(1e6) + tsDelta := ts - m.lastFieldValue.Ts + if fieldValue.FieldType == dcgm.DCGM_FT_DOUBLE { + m.integratedRateMicroseconds += int64(float64(tsDelta) * fieldValue.Float64()) + } else { + m.integratedRateMicroseconds += tsDelta * intValue + } + m.integratedRateSeconds += m.integratedRateMicroseconds / 1000000 + m.integratedRateMicroseconds %= 1000000 + } + m.lastFieldValue = &fieldValue } -type defaultMap[K comparable, V any] struct { - m map[K]V - f func() V -} +type MetricsMap map[string]*metricStats -func newDefaultMap[K comparable, V any](f func() V) *defaultMap[K, V] { - return &defaultMap[K, V]{ - m: make(map[K]V), - f: f, +func (m MetricsMap) LastFloat64(name string) (float64, bool) { + if metric, ok := m[name]; ok && metric.lastFieldValue != nil { + return asFloat64(*metric.lastFieldValue) } + return 0, false } - -func (m *defaultMap[K, V]) Get(k K) V { - if v, ok := m.m[k]; ok { - return v +func (m MetricsMap) LastInt64(name string) (int64, bool) { + if metric, ok := m[name]; ok && metric.lastFieldValue != nil { + return asInt64(*metric.lastFieldValue) } - v := m.f() - m.m[k] = v - return v -} - -func (m *defaultMap[K, V]) TryGet(k K) (V, bool) { - v, ok := m.m[k] - return v, ok -} - -// cumulativeTracker records cumulative values since last reset. 
-type cumulativeTracker[V int64 | float64] struct { - baseTimestamp int64 - baseline V // the value seen at baseTimestamp. - lastTimestamp int64 - lastValue V // the value seen at lastTimestamp. -} - -func (i *cumulativeTracker[V]) Reset() { - i.baseTimestamp = 0 - i.lastTimestamp = nowUnixMicro() - i.baseline = V(0) - i.lastValue = V(0) + return 0, false } - -func (i *cumulativeTracker[V]) Update(ts int64, v V) { - // On first update, record the value as the baseline. - if i.baseTimestamp == 0 { - i.baseTimestamp, i.baseline = ts, v - } - // Drop stale points. - if ts <= i.lastTimestamp { - return +func (m MetricsMap) IntegratedRate(name string) (int64, bool) { + if metric, ok := m[name]; ok { + return metric.integratedRateSeconds, true } - i.lastTimestamp, i.lastValue = ts, v + return 0, false } - -func (i *cumulativeTracker[V]) Value() (int64, V) { - return i.lastTimestamp, i.lastValue - i.baseline -} - -func (i *cumulativeTracker[V]) Baseline() (int64, V) { - return i.baseTimestamp, i.baseline +func (m MetricsMap) CumulativeTotal(name string) (int64, bool) { + if metric, ok := m[name]; ok { + return metric.cumulativeValue, true + } + return 0, false } var ( @@ -123,15 +121,7 @@ var ( errUnexpectedType = fmt.Errorf("unexpected data type") ) -func (m *dcgmMetric) asFloat64() float64 { - return m.value.(float64) -} - -func (m *dcgmMetric) asInt64() int64 { - return m.value.(int64) -} - -func isValidValue(fieldValue dcgm.FieldValue_v1) error { +func isValidValue(fieldValue dcgm.FieldValue_v2) error { switch fieldValue.FieldType { case dcgm.DCGM_FT_DOUBLE: switch v := fieldValue.Float64(); v { diff --git a/receiver/dcgmreceiver/util_test.go b/receiver/dcgmreceiver/util_test.go index 3b35b9646..a9a206afc 100644 --- a/receiver/dcgmreceiver/util_test.go +++ b/receiver/dcgmreceiver/util_test.go @@ -18,117 +18,116 @@ package dcgmreceiver import ( + "bytes" + "encoding/binary" "testing" + "github.com/NVIDIA/go-dcgm/pkg/dcgm" "github.com/stretchr/testify/assert" 
"github.com/stretchr/testify/require" ) -func testRateIntegrator[V int64 | float64](t *testing.T) { - origNowUnixMicro := nowUnixMicro - nowUnixMicro = func() int64 { return 10 } - defer func() { nowUnixMicro = origNowUnixMicro }() +func fieldValue(t *testing.T, ts int64, fieldType uint, value any) dcgm.FieldValue_v2 { + buf := new(bytes.Buffer) + require.NoError(t, binary.Write(buf, binary.NativeEndian, value)) + var valueArr [4096]byte + copy(valueArr[:], buf.Bytes()) + return dcgm.FieldValue_v2{ + Ts: ts, + FieldType: fieldType, + Value: valueArr, + } +} + +func fieldValueInt64(t *testing.T, ts int64, value int64) dcgm.FieldValue_v2 { + return fieldValue(t, ts, dcgm.DCGM_FT_INT64, value) +} + +func fieldValueFloat64(t *testing.T, ts int64, value float64) dcgm.FieldValue_v2 { + return fieldValue(t, ts, dcgm.DCGM_FT_DOUBLE, value) +} + +func testMetricStatsRate[V int64 | float64](t *testing.T, fv func(*testing.T, int64, V) dcgm.FieldValue_v2) { + stats := &metricStats{} type P struct { ts int64 - v V + v int64 + } + p := func(stats *metricStats) P { + if stats.lastFieldValue == nil { + return P{0, stats.integratedRateSeconds} + } + return P{stats.lastFieldValue.Ts, stats.integratedRateSeconds} } - p := func(ts int64, v V) P { return P{ts, v} } - - var ri rateIntegrator[V] - ri.Reset() - require.Equal(t, P{10, 0}, p(ri.Value())) + stats.Update(fv(t, 10, 0)) + require.Equal(t, P{10, 0}, p(stats)) // Ensure updates affect aggregated values. - ri.Update(15, 1e6) - assert.Equal(t, P{15, 5}, p(ri.Value())) + stats.Update(fv(t, 15, 1e6)) + assert.Equal(t, P{15, 5}, p(stats)) // Ensure stale points are ignored. - ri.Update(12, 1e8) - assert.Equal(t, P{15, 5}, p(ri.Value())) - ri.Update(15, 1.e8) - assert.Equal(t, P{15, 5}, p(ri.Value())) + stats.Update(fv(t, 12, 1e8)) + assert.Equal(t, P{15, 5}, p(stats)) + stats.Update(fv(t, 15, 1.e8)) + assert.Equal(t, P{15, 5}, p(stats)) // Ensure updates affect aggregated values. 
- ri.Update(20, 2.e6) - assert.Equal(t, P{20, 15}, p(ri.Value())) + stats.Update(fv(t, 20, 2.e6)) + assert.Equal(t, P{20, 15}, p(stats)) // Ensure zero rates don't change the aggregated value. - ri.Update(25, 0) - assert.Equal(t, P{25, 15}, p(ri.Value())) - - // Ensure the value is cleared on reset. - ri.Reset() - assert.Equal(t, P{10, 0}, p(ri.Value())) + stats.Update(fv(t, 25, 0)) + assert.Equal(t, P{25, 15}, p(stats)) } -func TestRateIntegratorInt64(t *testing.T) { - testRateIntegrator[int64](t) +func TestMetricStatsRateInt64(t *testing.T) { + testMetricStatsRate[int64](t, fieldValueInt64) } -func TestRateIntegratorFloat64(t *testing.T) { - testRateIntegrator[float64](t) +func TestMetricStatsRateFloat64(t *testing.T) { + testMetricStatsRate[float64](t, fieldValueFloat64) } -func testCumulativeTracker[V int64 | float64](t *testing.T) { - origNowUnixMicro := nowUnixMicro - nowUnixMicro = func() int64 { return 10 } - defer func() { nowUnixMicro = origNowUnixMicro }() +func testMetricStatsCumulative[V int64 | float64](t *testing.T, fv func(*testing.T, int64, V) dcgm.FieldValue_v2) { + stats := &metricStats{} type P struct { ts int64 - v V + v int64 + } + p := func(stats *metricStats) P { + if stats.lastFieldValue == nil { + return P{0, stats.cumulativeValue} + } + return P{stats.lastFieldValue.Ts, stats.cumulativeValue} } - p := func(ts int64, v V) P { return P{ts, v} } - - var ct cumulativeTracker[V] - ct.Reset() - require.Equal(t, P{0, 0}, p(ct.Baseline())) - require.Equal(t, P{10, 0}, p(ct.Value())) + require.Equal(t, int64(0), stats.initialCumulativeValue) + require.Equal(t, P{0, 0}, p(stats)) // Ensure first updates sets the baseline. - ct.Update(15, 50) - require.Equal(t, P{15, 50}, p(ct.Baseline())) - assert.Equal(t, P{15, 0}, p(ct.Value())) + stats.Update(fv(t, 15, 50)) + require.Equal(t, int64(50), stats.initialCumulativeValue) + assert.Equal(t, P{15, 0}, p(stats)) // Ensure updates affect values, but not the baseline. 
- ct.Update(20, 80) - assert.Equal(t, P{15, 50}, p(ct.Baseline())) - assert.Equal(t, P{20, 30}, p(ct.Value())) + stats.Update(fv(t, 20, 80)) + assert.Equal(t, int64(50), stats.initialCumulativeValue) + assert.Equal(t, P{20, 30}, p(stats)) // Ensure stale points are ignored. - ct.Update(18, 1e8) - assert.Equal(t, P{20, 30}, p(ct.Value())) - ct.Update(20, 1e8) - assert.Equal(t, P{20, 30}, p(ct.Value())) + stats.Update(fv(t, 18, 1e8)) + assert.Equal(t, P{20, 30}, p(stats)) + stats.Update(fv(t, 20, 1e8)) + assert.Equal(t, P{20, 30}, p(stats)) // Ensure updates affect values. - ct.Update(25, 100) - assert.Equal(t, P{25, 50}, p(ct.Value())) + stats.Update(fv(t, 25, 100)) + assert.Equal(t, P{25, 50}, p(stats)) // Ensure same inputs don't affect values. - ct.Update(30, 100) - assert.Equal(t, P{30, 50}, p(ct.Value())) - - // Ensure the value and baseline are cleared on reset. - ct.Reset() - assert.Equal(t, P{0, 0}, p(ct.Baseline())) - assert.Equal(t, P{10, 0}, p(ct.Value())) -} - -func TestCumulativeTrackerInt64(t *testing.T) { - testCumulativeTracker[int64](t) + stats.Update(fv(t, 30, 100)) + assert.Equal(t, P{30, 50}, p(stats)) } -func TestCumulativeTrackerFloat64(t *testing.T) { - testCumulativeTracker[float64](t) +func TestMetricStatsCumulativeInt64(t *testing.T) { + testMetricStatsCumulative[int64](t, fieldValueInt64) } -func TestDefaultMap(t *testing.T) { - called := false - m := newDefaultMap[int, int64](func() int64 { - called = true - return 8 - }) - _, ok := m.TryGet(3) - assert.False(t, ok) - assert.False(t, called) - v := m.Get(3) - assert.True(t, called) - assert.Equal(t, int64(8), v) - _, ok = m.TryGet(3) - assert.True(t, ok) +func TestMetricStatsCumulativeFloat64(t *testing.T) { + testMetricStatsCumulative[float64](t, fieldValueFloat64) } From 5de8b1ce2ea1a6e0108c8ec9a09ee6a9dbe7de36 Mon Sep 17 00:00:00 2001 From: Igor Peshansky Date: Wed, 11 Sep 2024 17:17:33 -0400 Subject: [PATCH 36/38] Fix lint errors. 
--- .golangci.yaml | 1 + receiver/dcgmreceiver/client.go | 6 +++--- receiver/dcgmreceiver/scraper.go | 3 +-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.golangci.yaml b/.golangci.yaml index 349cc25c1..3c6fe5109 100644 --- a/.golangci.yaml +++ b/.golangci.yaml @@ -57,6 +57,7 @@ linters-settings: - (github.com/golangci/golangci-lint/pkg/logutils.Log).Fatalf disable: - fieldalignment + - shadow enable-all: true misspell: locale: US diff --git a/receiver/dcgmreceiver/client.go b/receiver/dcgmreceiver/client.go index 85b01915f..afed5427e 100644 --- a/receiver/dcgmreceiver/client.go +++ b/receiver/dcgmreceiver/client.go @@ -358,13 +358,13 @@ func (client *dcgmClient) collect() (time.Duration, error) { func (client *dcgmClient) getDeviceMetrics() map[uint]deviceMetrics { out := map[uint]deviceMetrics{} for gpuIndex, device := range client.devices { - new := MetricsMap{} + newMetrics := MetricsMap{} for key, value := range device.Metrics { newValue := *value - new[key] = &newValue + newMetrics[key] = &newValue } // device is already a copy here - device.Metrics = new + device.Metrics = newMetrics out[gpuIndex] = device } return out diff --git a/receiver/dcgmreceiver/scraper.go b/receiver/dcgmreceiver/scraper.go index 79f47b5f7..f84b17318 100644 --- a/receiver/dcgmreceiver/scraper.go +++ b/receiver/dcgmreceiver/scraper.go @@ -84,7 +84,7 @@ func (s *dcgmScraper) start(ctx context.Context, _ component.Host) error { s.cancel = func() { scrapeCancel() - g.Wait() + _ = g.Wait() // Ignore the error from a canceled context } metricsCh := make(chan map[uint]deviceMetrics) @@ -196,7 +196,6 @@ func (s *dcgmScraper) runConnectLoop(ctx context.Context, metricsCh chan<- map[u case <-time.After(s.initRetryDelay): } } - return nil } func (s *dcgmScraper) pollClient(ctx context.Context, client *dcgmClient, metricsCh chan<- map[uint]deviceMetrics, collectTriggerCh <-chan struct{}) { From ea0eba5d208c4a209eab7e287b3890a494547e71 Mon Sep 17 00:00:00 2001 From: Igor 
Peshansky Date: Thu, 12 Sep 2024 14:59:07 -0400 Subject: [PATCH 37/38] Fix data race in scraper_test. --- receiver/dcgmreceiver/scraper_test.go | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/receiver/dcgmreceiver/scraper_test.go b/receiver/dcgmreceiver/scraper_test.go index b4900aaa1..55b58f65f 100644 --- a/receiver/dcgmreceiver/scraper_test.go +++ b/receiver/dcgmreceiver/scraper_test.go @@ -20,6 +20,7 @@ package dcgmreceiver import ( "context" "strings" + "sync" "testing" "github.com/stretchr/testify/assert" @@ -33,10 +34,13 @@ import ( func TestScraperWithoutDcgm(t *testing.T) { var settings receiver.CreateSettings + var mu sync.Mutex seenDcgmNotInstalledWarning := false settings.Logger = zaptest.NewLogger(t, zaptest.WrapOptions(zap.Hooks(func(e zapcore.Entry) error { if e.Level == zap.WarnLevel && strings.Contains(e.Message, "Unable to connect to DCGM daemon at localhost:5555 on libdcgm.so not Found; Is the DCGM daemon running") { + mu.Lock() seenDcgmNotInstalledWarning = true + mu.Unlock() } return nil }))) @@ -48,13 +52,17 @@ func TestScraperWithoutDcgm(t *testing.T) { require.NoError(t, err) metrics, err := scraper.scrape(context.Background()) + mu.Lock() assert.Equal(t, true, seenDcgmNotInstalledWarning) + mu.Unlock() assert.NoError(t, err) // If failed to init DCGM, should have no error assert.Equal(t, 0, metrics.MetricCount()) // Scrape again with DCGM not available metrics, err = scraper.scrape(context.Background()) + mu.Lock() assert.Equal(t, true, seenDcgmNotInstalledWarning) + mu.Unlock() assert.NoError(t, err) assert.Equal(t, 0, metrics.MetricCount()) From ac52c0f3a5cda5450a3c5a854525a6d959a54427 Mon Sep 17 00:00:00 2001 From: Igor Peshansky Date: Fri, 13 Sep 2024 12:28:01 -0400 Subject: [PATCH 38/38] Cleanups. 
--- receiver/dcgmreceiver/client.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/receiver/dcgmreceiver/client.go b/receiver/dcgmreceiver/client.go index afed5427e..7986d4f5d 100644 --- a/receiver/dcgmreceiver/client.go +++ b/receiver/dcgmreceiver/client.go @@ -85,7 +85,7 @@ func newClient(settings *dcgmClientSettings, logger *zap.Logger) (*dcgmClient, e } enabledFields, unavailableFields := filterSupportedFields(requestedFieldIDs, supportedProfilingFieldIDs) for _, f := range unavailableFields { - logger.Sugar().Warnf("Field '%s' is not supported. Metric '%s' will not be collected", dcgmIDToName[f], dcgmIDToName[f]) + logger.Sugar().Warnf("Field '%s' is not supported", dcgmIDToName[f]) } var deviceGroup dcgm.GroupHandle if len(enabledFields) != 0 { @@ -329,7 +329,7 @@ func (client *dcgmClient) collect() (time.Duration, error) { // Blank values are expected at startup. continue } else if err == errNotSupported { - client.issueWarningForFailedQueryUptoThreshold(dcgmName, 1, fmt.Sprintf("Field '%s' is not supported. Metric '%s' will not be collected", dcgmName, dcgmName)) + client.issueWarningForFailedQueryUptoThreshold(dcgmName, 1, fmt.Sprintf("Field '%s' is not supported", dcgmName)) continue } else if err != nil { msg := fmt.Sprintf("Received invalid value (ts %d gpu %d) %s: %v", fieldValue.Ts, gpuIndex, dcgmName, err) @@ -375,7 +375,7 @@ func (client *dcgmClient) issueWarningForFailedQueryUptoThreshold(dcgmName strin failedCount := client.deviceMetricToFailedQueryCount[dcgmName] if failedCount <= limit { - client.logger.Warnf("%s", reason) + client.logger.Warn(reason) if limit > 1 && failedCount == limit { client.logger.Warnf("Surpressing further device query warnings for '%s'", dcgmName) }