Skip to content

Commit

Permalink
Refactor metric collection (#254)
Browse files Browse the repository at this point in the history
* fix: refactor metrics

* fix: lint

* fix: refactor metric collection
  • Loading branch information
Mryashbhardwaj authored Sep 24, 2024
1 parent 405eb34 commit d70c112
Show file tree
Hide file tree
Showing 15 changed files with 183 additions and 272 deletions.
45 changes: 21 additions & 24 deletions core/job/handler/v1beta1/job.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,25 +8,21 @@ import (
"time"

"github.com/goto/salt/log"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promauto"
"google.golang.org/protobuf/types/known/timestamppb"

"github.com/goto/optimus/core/job"
"github.com/goto/optimus/core/job/dto"
"github.com/goto/optimus/core/tenant"
"github.com/goto/optimus/internal/errors"
"github.com/goto/optimus/internal/models"
"github.com/goto/optimus/internal/telemetry"
"github.com/goto/optimus/internal/utils/filter"
"github.com/goto/optimus/internal/writer"
pb "github.com/goto/optimus/protos/gotocompany/optimus/core/v1beta1"
"github.com/goto/optimus/sdk/plugin"
)

const (
metricReplaceAllDuration = "job_replace_all_duration_seconds"
metricRefreshDuration = "job_refresh_duration_seconds"
)

type JobHandler struct {
l log.Logger
jobService JobService
Expand All @@ -43,6 +39,14 @@ func NewJobHandler(jobService JobService, changeLogService ChangeLogService, log
}
}

var jobReplaceAllDurationMetric = promauto.NewGaugeVec(prometheus.GaugeOpts{
Name: "job_replace_all_duration_seconds",
}, []string{"project", "namespace"})

var jobRefreshDurationMetric = promauto.NewGaugeVec(prometheus.GaugeOpts{
Name: "job_refresh_duration_seconds",
}, []string{"project"})

type JobModificationService interface {
Add(ctx context.Context, jobTenant tenant.Tenant, jobs []*job.Spec) ([]job.Name, error)
Update(ctx context.Context, jobTenant tenant.Tenant, jobs []*job.Spec) ([]job.Name, error)
Expand Down Expand Up @@ -222,12 +226,6 @@ func (jh *JobHandler) ChangeJobNamespace(ctx context.Context, changeRequest *pb.
return nil, errors.GRPCErr(err, errorMsg)
}

telemetry.NewCounter("job_namespace_migrations_total", map[string]string{
"project": jobSourceTenant.ProjectName().String(),
"namespace_source": jobSourceTenant.NamespaceName().String(),
"namespace_destination": jobNewTenant.NamespaceName().String(),
}).Inc()

return &pb.ChangeJobNamespaceResponse{}, nil
}

Expand Down Expand Up @@ -524,10 +522,11 @@ func (jh *JobHandler) ReplaceAllJobSpecifications(stream pb.JobSpecificationServ

processDuration := time.Since(startTime)
jh.l.Debug("finished replacing all job specifications for project [%s] namespace [%s], took %s", request.GetProjectName(), request.GetNamespaceName(), processDuration)
telemetry.NewGauge(metricReplaceAllDuration, map[string]string{
"project": jobTenant.ProjectName().String(),
"namespace": jobTenant.NamespaceName().String(),
}).Add(processDuration.Seconds())

jobReplaceAllDurationMetric.WithLabelValues(
jobTenant.ProjectName().String(),
jobTenant.NamespaceName().String(),
).Add(processDuration.Seconds())
}
if len(errNamespaces) > 0 {
errMessageSummary := strings.Join(errMessages, "\n")
Expand All @@ -543,9 +542,7 @@ func (jh *JobHandler) RefreshJobs(request *pb.RefreshJobsRequest, stream pb.JobS
startTime := time.Now()
defer func() {
processDuration := time.Since(startTime)
telemetry.NewGauge(metricRefreshDuration, map[string]string{
"project": request.ProjectName,
}).Add(processDuration.Seconds())
jobRefreshDurationMetric.WithLabelValues(request.ProjectName).Add(processDuration.Seconds())
jh.l.Debug("finished refreshing jobs for project [%s], took %s", request.GetProjectName(), processDuration)
}()

Expand Down Expand Up @@ -790,11 +787,11 @@ func (jh *JobHandler) JobInspect(ctx context.Context, req *pb.JobInspectRequest)
}

func raiseJobEventMetric(jobTenant tenant.Tenant, state string, metricValue int) {
telemetry.NewCounter(job.MetricJobEvent, map[string]string{
"project": jobTenant.ProjectName().String(),
"namespace": jobTenant.NamespaceName().String(),
"status": state,
}).Add(float64(metricValue))
job.EventMetric.WithLabelValues(
jobTenant.ProjectName().String(),
jobTenant.NamespaceName().String(),
state,
).Add(float64(metricValue))
}

func toValidateResultProto(result map[job.Name][]dto.ValidateResult) map[string]*pb.ValidateResponse_ResultList {
Expand Down
20 changes: 15 additions & 5 deletions core/job/job.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,9 @@ import (
"fmt"
"strings"

"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promauto"

"github.com/goto/optimus/core/resource"
"github.com/goto/optimus/core/tenant"
"github.com/goto/optimus/internal/errors"
Expand All @@ -19,7 +22,6 @@ const (
UpstreamStateResolved UpstreamState = "resolved"
UpstreamStateUnresolved UpstreamState = "unresolved"

MetricJobEvent = "job_events_total"
MetricJobEventStateAdded = "added"
MetricJobEventStateUpdated = "updated"
MetricJobEventStateDeleted = "deleted"
Expand All @@ -30,10 +32,6 @@ const (
MetricJobEventDisabled = "disabled"
MetricJobEventFoundDirty = "found_dirty"

MetricJobValidation = "job_validation"

MetricJobRefreshResourceDownstream = "refresh_resource_downstream_total"

UnspecifiedImpactChange UpdateImpact = "unspecified_impact"
JobInternalImpact UpdateImpact = "internal_impact"
JobBehaviourImpact UpdateImpact = "behaviour_impact"
Expand All @@ -43,6 +41,18 @@ const (
DeployStateFailed DeployState = "failed"
)

var EventMetric = promauto.NewCounterVec(prometheus.CounterOpts{
Name: "job_events_total",
}, []string{"project", "namespace", "status"})

var ValidationMetric = promauto.NewCounterVec(prometheus.CounterOpts{
Name: "job_validation",
}, []string{"project", "namespace", "stage", "success"})

var RefreshResourceDownstreamMetric = promauto.NewCounterVec(prometheus.CounterOpts{
Name: "refresh_resource_downstream_total",
}, []string{"project", "status"})

type Job struct {
tenant tenant.Tenant

Expand Down
34 changes: 15 additions & 19 deletions core/job/service/job_service.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@ import (
"github.com/goto/optimus/internal/errors"
"github.com/goto/optimus/internal/lib/tree"
"github.com/goto/optimus/internal/lib/window"
"github.com/goto/optimus/internal/telemetry"
"github.com/goto/optimus/internal/utils/filter"
"github.com/goto/optimus/internal/writer"
"github.com/goto/optimus/sdk/plugin"
Expand Down Expand Up @@ -812,11 +811,10 @@ func (j *JobService) RefreshResourceDownstream(ctx context.Context, resourceURNs
status = "failed"
}

counter := telemetry.NewCounter(job.MetricJobRefreshResourceDownstream, map[string]string{
"project": projectName.String(),
"status": status,
})
counter.Add(float64(len(jobNames)))
job.RefreshResourceDownstreamMetric.WithLabelValues(
projectName.String(),
status,
).Add(float64(len(jobNames)))
}

return me.ToErr()
Expand Down Expand Up @@ -1338,11 +1336,11 @@ func (*JobService) groupDownstreamPerProject(downstreams []*job.Downstream) map[
}

func raiseJobEventMetric(jobTenant tenant.Tenant, state string, metricValue int) {
telemetry.NewCounter(job.MetricJobEvent, map[string]string{
"project": jobTenant.ProjectName().String(),
"namespace": jobTenant.NamespaceName().String(),
"status": state,
}).Add(float64(metricValue))
job.EventMetric.WithLabelValues(
jobTenant.ProjectName().String(),
jobTenant.NamespaceName().String(),
state,
).Add(float64(metricValue))
}

func (j *JobService) identifyUpstreamURNs(ctx context.Context, tenantWithDetails *tenant.WithDetails, spec *job.Spec) ([]resource.URN, error) {
Expand Down Expand Up @@ -1941,14 +1939,12 @@ func (*JobService) validateWindow(tenantDetails *tenant.WithDetails, windowConfi
}

func registerJobValidationMetric(tnnt tenant.Tenant, stage dto.ValidateStage, success bool) {
counter := telemetry.NewCounter(job.MetricJobValidation, map[string]string{
"project": tnnt.ProjectName().String(),
"namespace": tnnt.NamespaceName().String(),
"stage": stage.String(),
"success": fmt.Sprintf("%t", success),
})

counter.Add(1)
job.ValidationMetric.WithLabelValues(
tnnt.ProjectName().String(),
tnnt.NamespaceName().String(),
stage.String(),
fmt.Sprintf("%t", success),
).Add(1)
}

func (j *JobService) GetDownstreamByResourceURN(ctx context.Context, tnnt tenant.Tenant, urn resource.URN) (job.DownstreamList, error) {
Expand Down
43 changes: 21 additions & 22 deletions core/resource/handler/v1beta1/resource.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,20 +8,24 @@ import (
"time"

"github.com/goto/salt/log"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promauto"
"google.golang.org/protobuf/types/known/structpb"

"github.com/goto/optimus/core/resource"
"github.com/goto/optimus/core/tenant"
"github.com/goto/optimus/internal/errors"
"github.com/goto/optimus/internal/telemetry"
"github.com/goto/optimus/internal/writer"
pb "github.com/goto/optimus/protos/gotocompany/optimus/core/v1beta1"
)

const (
metricResourceEvents = "resource_events_total"
metricResourcesUploadAllDuration = "resource_upload_all_duration_seconds_total"
)
var resourceEventsTotal = promauto.NewCounterVec(prometheus.CounterOpts{
Name: "resource_events_total",
}, []string{"project", "namespace", "datastore", "type", "status"})

var resourcesUploadAllDurationMetric = promauto.NewCounterVec(prometheus.CounterOpts{
Name: "resource_upload_all_duration_seconds_total",
}, []string{"operator_name", "event_type"})

type ResourceService interface {
Create(ctx context.Context, res *resource.Resource) error
Expand Down Expand Up @@ -129,10 +133,11 @@ func (rh ResourceHandler) DeployResourceSpecification(stream pb.ResourceService_
}

processDuration := time.Since(startTime)
telemetry.NewGauge(metricResourcesUploadAllDuration, map[string]string{
"project": tnnt.ProjectName().String(),
"namespace": tnnt.NamespaceName().String(),
}).Add(processDuration.Seconds())

resourcesUploadAllDurationMetric.WithLabelValues(
tnnt.ProjectName().String(),
tnnt.NamespaceName().String(),
).Add(processDuration.Seconds())
}

if len(errNamespaces) > 0 {
Expand Down Expand Up @@ -383,12 +388,6 @@ func (rh ResourceHandler) ChangeResourceNamespace(ctx context.Context, req *pb.C
return nil, errors.GRPCErr(err, "failed to update resource "+req.GetResourceName())
}

telemetry.NewCounter("resource_namespace_migrations_total", map[string]string{
"project": tnnt.ProjectName().String(),
"namespace_source": tnnt.NamespaceName().String(),
"namespace_destination": newTnnt.NamespaceName().String(),
}).Inc()

return &pb.ChangeResourceNamespaceResponse{}, nil
}

Expand Down Expand Up @@ -542,13 +541,13 @@ func toResourceProto(res *resource.Resource) (*pb.ResourceSpecification, error)
}

func raiseResourceDatastoreEventMetric(jobTenant tenant.Tenant, datastoreName, resourceKind, state string) {
telemetry.NewCounter(metricResourceEvents, map[string]string{
"project": jobTenant.ProjectName().String(),
"namespace": jobTenant.NamespaceName().String(),
"datastore": datastoreName,
"type": resourceKind,
"status": state,
}).Inc()
resourceEventsTotal.WithLabelValues(
jobTenant.ProjectName().String(),
jobTenant.NamespaceName().String(),
datastoreName,
resourceKind,
state,
).Inc()
}

func toChangelogProto(cl *resource.ChangeLog) *pb.ResourceChangelog {
Expand Down
20 changes: 12 additions & 8 deletions core/resource/service/backup_service.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,22 +6,26 @@ import (
"time"

"github.com/goto/salt/log"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promauto"

"github.com/goto/optimus/core/resource"
"github.com/goto/optimus/core/tenant"
"github.com/goto/optimus/internal/errors"
"github.com/goto/optimus/internal/telemetry"
)

const (
// recentBackupWindowMonths contains the window interval to consider for recent backups
recentBackupWindowMonths = -3

metricBackupRequest = "resource_backup_requests_total"
backupRequestStatusSuccess = "success"
backupRequestStatusFailed = "failed"
)

var backupRequestMetric = promauto.NewCounterVec(prometheus.CounterOpts{
Name: "resource_backup_requests_total",
}, []string{"project", "namespace", "resource", "status"})

type BackupRepository interface {
GetByID(ctx context.Context, id resource.BackupID) (*resource.Backup, error)
GetAll(ctx context.Context, tnnt tenant.Tenant, store resource.Store) ([]*resource.Backup, error)
Expand Down Expand Up @@ -142,10 +146,10 @@ func raiseBackupRequestMetrics(jobTenant tenant.Tenant, backupResult *resource.B
}

func raiseBackupRequestMetric(jobTenant tenant.Tenant, resourceName, state string) {
telemetry.NewCounter(metricBackupRequest, map[string]string{
"project": jobTenant.ProjectName().String(),
"namespace": jobTenant.NamespaceName().String(),
"resource": resourceName,
"status": state,
}).Inc()
backupRequestMetric.WithLabelValues(
jobTenant.ProjectName().String(),
jobTenant.NamespaceName().String(),
resourceName,
state,
).Inc()
}
17 changes: 11 additions & 6 deletions core/scheduler/service/events_service.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,19 +7,24 @@ import (
"strings"

"github.com/goto/salt/log"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promauto"

"github.com/goto/optimus/core/scheduler"
"github.com/goto/optimus/core/tenant"
"github.com/goto/optimus/internal/compiler"
"github.com/goto/optimus/internal/errors"
"github.com/goto/optimus/internal/telemetry"
)

const (
NotificationSchemeSlack = "slack"
NotificationSchemePagerDuty = "pagerduty"
)

var jobrunAlertsMetric = promauto.NewCounterVec(prometheus.CounterOpts{
Name: "jobrun_alerts_total",
}, []string{"project", "namespace", "type"})

type Notifier interface {
io.Closer
Notify(ctx context.Context, attr scheduler.NotifyAttrs) error
Expand Down Expand Up @@ -179,11 +184,11 @@ func (e *EventsService) Push(ctx context.Context, event *scheduler.Event) error
}
}
}
telemetry.NewCounter("jobrun_alerts_total", map[string]string{
"project": event.Tenant.ProjectName().String(),
"namespace": event.Tenant.NamespaceName().String(),
"type": event.Type.String(),
}).Inc()
jobrunAlertsMetric.WithLabelValues(
event.Tenant.ProjectName().String(),
event.Tenant.NamespaceName().String(),
event.Type.String(),
).Inc()
}
}
return multierror.ToErr()
Expand Down
Loading

0 comments on commit d70c112

Please sign in to comment.