Skip to content

Commit

Permalink
neonvm: metrics for failing reconcile (#920)
Browse files Browse the repository at this point in the history
The need for this appeared during the investigation of an elevated level
of reconciliation failures after neonvm-controllers were moved to an
another node group.

Adds a single metric `reconcile_duration_seconds` with an `outcome`
label.

\+ relevant log messages

Part of a #916 and #918. Easier to do together because of conflicts.

---------

Signed-off-by: Oleg Vasilev <[email protected]>
  • Loading branch information
Omrigan authored May 9, 2024
1 parent da5dbf2 commit 3b08db3
Showing 1 changed file with 44 additions and 0 deletions.
44 changes: 44 additions & 0 deletions neonvm/controllers/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,17 @@ package controllers
import (
"context"
"sync"
"time"

"github.com/prometheus/client_golang/prometheus"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/log"
"sigs.k8s.io/controller-runtime/pkg/metrics"
"sigs.k8s.io/controller-runtime/pkg/reconcile"

"k8s.io/apimachinery/pkg/api/errors"

"github.com/neondatabase/autoscaling/pkg/util"
)

Expand All @@ -19,8 +23,11 @@ type ReconcilerMetrics struct {
runnerCreationToVMRunningTime prometheus.Histogram
vmCreationToVMRunningTime prometheus.Histogram
vmRestartCounts prometheus.Counter
reconcileDuration prometheus.HistogramVec
}

const OutcomeLabel = "outcome"

func MakeReconcilerMetrics() ReconcilerMetrics {
// Copied bucket values from controller runtime latency metric. We can
// adjust them in the future if needed.
Expand Down Expand Up @@ -62,10 +69,32 @@ func MakeReconcilerMetrics() ReconcilerMetrics {
Help: "Total number of VM restarts across the cluster captured by VirtualMachine reconciler",
},
)),
reconcileDuration: *util.RegisterMetric(metrics.Registry, prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Name: "reconcile_duration_seconds",
Help: "Time duration of reconciles",
Buckets: buckets,
}, []string{OutcomeLabel},
)),
}
return m
}

type ReconcileOutcome string

const (
SuccessOutcome ReconcileOutcome = "success"
FailureOutcome ReconcileOutcome = "failure"
ConflictOutcome ReconcileOutcome = "conflict"
)

func (m ReconcilerMetrics) ObserveReconcileDuration(
outcome ReconcileOutcome,
duration time.Duration,
) {
m.reconcileDuration.WithLabelValues(string(outcome)).Observe(duration.Seconds())
}

type wrappedReconciler struct {
ControllerName string
Reconciler reconcile.Reconciler
Expand Down Expand Up @@ -112,7 +141,11 @@ func WithMetrics(reconciler reconcile.Reconciler, rm ReconcilerMetrics, cntrlNam
}

func (d *wrappedReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
log := log.FromContext(ctx)

now := time.Now()
res, err := d.Reconciler.Reconcile(ctx, req)
duration := time.Since(now)

// This part is executed sequentially since we acquire a mutex lock. It
// should be quite fast since a mutex lock/unlock + 2 memory writes takes less
Expand All @@ -121,11 +154,22 @@ func (d *wrappedReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct
// async approach.
d.lock.Lock()
defer d.lock.Unlock()
outcome := SuccessOutcome
if err != nil {
d.failing[req.NamespacedName] = struct{}{}
if errors.IsConflict(err) {
outcome = ConflictOutcome
} else {
outcome = FailureOutcome
}

log.Error(err, "Failed to reconcile VirtualMachine",
"duration", duration.String(), "outcome", outcome)
} else {
delete(d.failing, req.NamespacedName)
log.Info("Successful reconciliation", "duration", duration.String())
}
d.Metrics.ObserveReconcileDuration(outcome, duration)
d.Metrics.failing.WithLabelValues(d.ControllerName).Set(float64(len(d.failing)))

return res, err
Expand Down

0 comments on commit 3b08db3

Please sign in to comment.