diff --git a/pkg/roachprod/promhelperclient/client.go b/pkg/roachprod/promhelperclient/client.go index 139c62273812..3f0193f82961 100644 --- a/pkg/roachprod/promhelperclient/client.go +++ b/pkg/roachprod/promhelperclient/client.go @@ -77,7 +77,7 @@ func (c *PromClient) UpdatePrometheusTargets( ctx context.Context, promUrl, clusterName string, forceFetchCreds bool, - nodes map[int]string, + nodes map[int]*NodeInfo, insecure bool, l *logger.Logger, ) error { @@ -157,17 +157,29 @@ type CCParams struct { Labels map[string]string `yaml:"labels"` } +// NodeInfo contains the target and labels for the node +type NodeInfo struct { + Target string // Name of the node + CustomLabels map[string]string // Custom labels to be added to the cluster config +} + // createClusterConfigFile creates the cluster config file per node -func buildCreateRequest(nodes map[int]string, insecure bool) (io.Reader, error) { +func buildCreateRequest(nodes map[int]*NodeInfo, insecure bool) (io.Reader, error) { configs := make([]*CCParams, 0) for i, n := range nodes { params := &CCParams{ - Targets: []string{n}, + Targets: []string{n.Target}, Labels: map[string]string{ + // default labels "node": strconv.Itoa(i), "tenant": install.SystemInterfaceName, + "job": "cockroachdb", }, } + // custom labels - this can override the default labels if needed + for n, v := range n.CustomLabels { + params.Labels[n] = v + } configs = append(configs, params) } cb, err := yaml.Marshal(&configs) diff --git a/pkg/roachprod/promhelperclient/client_test.go b/pkg/roachprod/promhelperclient/client_test.go index e83c6c71c23e..748988c3bf8b 100644 --- a/pkg/roachprod/promhelperclient/client_test.go +++ b/pkg/roachprod/promhelperclient/client_test.go @@ -49,12 +49,16 @@ func TestUpdatePrometheusTargets(t *testing.T) { Body: io.NopCloser(strings.NewReader("failed")), }, nil } - err := c.UpdatePrometheusTargets(ctx, promUrl, "c1", false, map[int]string{1: "n1"}, true, l) + err := c.UpdatePrometheusTargets(ctx, promUrl, "c1", false, + map[int]*NodeInfo{1: {Target: "n1"}}, true, l) require.NotNil(t, err) require.Equal(t, "request failed with status 400 and error failed", err.Error()) }) t.Run("UpdatePrometheusTargets succeeds", func(t *testing.T) { - nodeInfos := map[int]string{1: "n1", 3: "n3"} + nodeInfos := map[int]*NodeInfo{1: {Target: "n1"}, 3: { + Target: "n3", + CustomLabels: map[string]string{"custom": "label"}, + }} c.httpPut = func(ctx context.Context, url string, h *http.Header, body io.Reader) ( resp *http.Response, err error) { require.Equal(t, getUrl(promUrl, "c1"), url) @@ -67,8 +71,12 @@ func TestUpdatePrometheusTargets(t *testing.T) { for _, c := range configs { nodeID, err := strconv.Atoi(c.Labels["node"]) require.NoError(t, err) - require.Equal(t, nodeInfos[nodeID], c.Targets[0]) + require.Equal(t, nodeInfos[nodeID].Target, c.Targets[0]) require.Equal(t, "system", c.Labels["tenant"]) + require.Equal(t, "cockroachdb", c.Labels["job"]) + for k, v := range nodeInfos[nodeID].CustomLabels { + require.Equal(t, v, c.Labels[k]) + } } return &http.Response{ StatusCode: 200, diff --git a/pkg/roachprod/roachprod.go b/pkg/roachprod/roachprod.go index 1a88a0fde6fa..cd5f3fe1f29a 100644 --- a/pkg/roachprod/roachprod.go +++ b/pkg/roachprod/roachprod.go @@ -794,7 +794,7 @@ func UpdateTargets( // updatePrometheusTargets updates the prometheus instance cluster config. Any error is logged and ignored. func updatePrometheusTargets(ctx context.Context, l *logger.Logger, c *install.SyncedCluster) { - nodeIPPorts := make(map[int]string) + nodeIPPorts := make(map[int]*promhelperclient.NodeInfo) nodeIPPortsMutex := syncutil.RWMutex{} var wg sync.WaitGroup for _, node := range c.Nodes { @@ -808,10 +808,10 @@ func updatePrometheusTargets(ctx context.Context, l *logger.Logger, c *install.S l.Errorf("error getting the port for node %d: %v", index, err) return } - nodeInfo := fmt.Sprintf("%s:%d", v.PublicIP, desc.Port) + nodeInfo := fmt.Sprintf("%s:%d", v.PrivateIP, desc.Port) nodeIPPortsMutex.Lock() // ensure atomicity in map update - nodeIPPorts[index] = nodeInfo + nodeIPPorts[index] = &promhelperclient.NodeInfo{Target: nodeInfo, CustomLabels: getLabels(v)} nodeIPPortsMutex.Unlock() }(int(node), c.VMs[node-1]) } @@ -826,6 +826,32 @@ func updatePrometheusTargets(ctx context.Context, l *logger.Logger, c *install.S } } +// regionRegEx is the regex to extract the region label from zone available as vm property +var regionRegEx = regexp.MustCompile("(^.+[0-9]+)(-[a-f]$)") + +// getLabels returns the labels to be populated in the target configuration in prometheus +func getLabels(v vm.VM) map[string]string { + labels := map[string]string{ + "cluster": v.Labels["cluster"], + "instance": v.Name, + "host_ip": v.PrivateIP, + "project": v.Project, + "zone": v.Zone, + } + match := regionRegEx.FindStringSubmatch(v.Zone) + if len(match) > 1 { + labels["region"] = match[1] + } + // the following labels are present if the test labels are added before the VM is started + if t, ok := v.Labels["test_name"]; ok { + labels["test_name"] = t + } + if t, ok := v.Labels["test_run_id"]; ok { + labels["test_run_id"] = t + } + return labels +} + // Monitor monitors the status of cockroach nodes in a cluster. func Monitor( ctx context.Context, l *logger.Logger, clusterName string, opts install.MonitorOpts,