feat: add counter to track alerts dropped outside of time_intervals (prometheus#3565)

* feat: add counter to track alerts dropped outside of time_intervals

Addresses: prometheus#3512

This adds a new counter metric `alertmanager_alerts_supressed_total`
that is incremented by `len(alerts)` whenever notifications are
suppressed because the route is outside of a time interval, i.e. inside
one of its `mute_time_intervals` or outside of its
`active_time_intervals`.
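
For orientation, here is a minimal, self-contained sketch of the pattern
described above. It is not the Alertmanager pipeline code itself; it uses the
renamed metric `alertmanager_notifications_suppressed_total`, the `reason`
label, and the `mute_time_interval` label value as they appear in the diff
below.

```go
package main

import (
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/testutil"
)

func main() {
	// Counter keyed by the reason a notification was suppressed.
	suppressed := prometheus.NewCounterVec(prometheus.CounterOpts{
		Namespace: "alertmanager",
		Name:      "notifications_suppressed_total",
		Help:      "The total number of notifications suppressed for being outside of active time intervals or within muted time intervals.",
	}, []string{"reason"})

	reg := prometheus.NewRegistry()
	reg.MustRegister(suppressed)

	// A stage that mutes three alerts increments the counter by len(muted).
	muted := 3
	suppressed.WithLabelValues("mute_time_interval").Add(float64(muted))

	// Prints 3.
	fmt.Println(testutil.ToFloat64(suppressed.WithLabelValues("mute_time_interval")))
}
```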

Signed-off-by: TJ Hoplock <[email protected]>

* test: add time interval suppression metric checks for notify

Signed-off-by: TJ Hoplock <[email protected]>

* test: fix failure message log values in notifier

Signed-off-by: TJ Hoplock <[email protected]>

* ref: address PR feedback for prometheus#3565

Signed-off-by: TJ Hoplock <[email protected]>

* fix: track suppressed notifications metric for inhibit/silence

Based on PR feedback:

https://github.com/prometheus/alertmanager/pull/3565/files#r1393068026

Signed-off-by: TJ Hoplock <[email protected]>

* fix: broken notifier tests

- Fixed the metric count check to compare the suppression metric
  against the difference between the stage's input and output
  notifications; the check was previously inverted.
- Stopped using `Reset()` to compare collection counts between the
  multiple stages executed in `TestMuteStageWithSilences()`. The intent
  was to start each stage from a clean metric collection, but the final
  stage, where all silences are lifted, creates no metric at all in the
  test, causing `prom_testutil.ToFloat64()` to panic. Switched to
  separate variables that track the count after each stage, taking
  prior counts into account (see the sketch below).
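
For context, a self-contained sketch of the `Reset()` problem and the
delta-tracking pattern the tests switched to. It uses plain `client_golang`
types; the variable names are illustrative, not the ones used in the test.

```go
package main

import (
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/testutil"
)

func main() {
	suppressed := prometheus.NewCounterVec(prometheus.CounterOpts{
		Name: "notifications_suppressed_total",
	}, []string{"reason"})

	// Stage one suppresses two notifications.
	suppressed.WithLabelValues("silence").Add(2)
	afterStageOne := testutil.ToFloat64(suppressed) // works: exactly one child series exists

	// Stage two suppresses three more; compare deltas instead of resetting.
	suppressed.WithLabelValues("silence").Add(3)
	afterStageTwo := testutil.ToFloat64(suppressed)
	fmt.Println(afterStageTwo - afterStageOne) // 3

	// Reset() removes every child series; if the next stage suppresses nothing,
	// no child is recreated and ToFloat64 panics because the vector collects
	// no metrics at all.
	suppressed.Reset()
	// testutil.ToFloat64(suppressed) // would panic here
}
```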

Signed-off-by: TJ Hoplock <[email protected]>

* rename metric and add constants

Signed-off-by: gotjosh <[email protected]>

---------

Signed-off-by: TJ Hoplock <[email protected]>
Signed-off-by: gotjosh <[email protected]>
Co-authored-by: gotjosh <[email protected]>
Signed-off-by: Gokhan Sari <[email protected]>
2 people authored and th0th committed Mar 23, 2024
1 parent 22efa9b commit dcddc12
Showing 2 changed files with 83 additions and 20 deletions.
58 changes: 43 additions & 15 deletions notify/notify.go
@@ -251,6 +251,7 @@ type Metrics struct {
numTotalFailedNotifications *prometheus.CounterVec
numNotificationRequestsTotal *prometheus.CounterVec
numNotificationRequestsFailedTotal *prometheus.CounterVec
numNotificationSuppressedTotal *prometheus.CounterVec
notificationLatencySeconds *prometheus.HistogramVec

ff featurecontrol.Flagger
@@ -284,6 +285,11 @@ func NewMetrics(r prometheus.Registerer, ff featurecontrol.Flagger) *Metrics {
Name: "notification_requests_failed_total",
Help: "The total number of failed notification requests.",
}, labels),
numNotificationSuppressedTotal: prometheus.NewCounterVec(prometheus.CounterOpts{
Namespace: "alertmanager",
Name: "notifications_suppressed_total",
Help: "The total number of notifications suppressed for being outside of active time intervals or within muted time intervals.",
}, []string{"reason"}),
notificationLatencySeconds: prometheus.NewHistogramVec(prometheus.HistogramOpts{
Namespace: "alertmanager",
Name: "notification_latency_seconds",
@@ -296,7 +302,7 @@ func NewMetrics(r prometheus.Registerer, ff featurecontrol.Flagger) *Metrics {
r.MustRegister(
m.numNotifications, m.numTotalFailedNotifications,
m.numNotificationRequestsTotal, m.numNotificationRequestsFailedTotal,
m.notificationLatencySeconds,
m.numNotificationSuppressedTotal, m.notificationLatencySeconds,
)

return m
@@ -381,10 +387,10 @@ func (pb *PipelineBuilder) New(
rs := make(RoutingStage, len(receivers))

ms := NewGossipSettleStage(peer)
is := NewMuteStage(inhibitor)
tas := NewTimeActiveStage(intervener)
tms := NewTimeMuteStage(intervener)
ss := NewMuteStage(silencer)
is := NewMuteStage(inhibitor, pb.metrics)
tas := NewTimeActiveStage(intervener, pb.metrics)
tms := NewTimeMuteStage(intervener, pb.metrics)
ss := NewMuteStage(silencer, pb.metrics)

for name := range receivers {
st := createReceiverStage(name, receivers[name], wait, notificationLog, pb.metrics)
@@ -507,14 +513,22 @@ func (n *GossipSettleStage) Exec(ctx context.Context, _ log.Logger, alerts ...*t
return ctx, alerts, nil
}

const (
suppressedReasonSilence = "silence"
suppressedReasonInhibition = "inhibition"
suppressedReasonMuteTimeInterval = "mute_time_interval"
suppressedReasonActiveTimeInterval = "active_time_interval"
)

// MuteStage filters alerts through a Muter.
type MuteStage struct {
muter types.Muter
muter types.Muter
metrics *Metrics
}

// NewMuteStage return a new MuteStage.
func NewMuteStage(m types.Muter) *MuteStage {
return &MuteStage{muter: m}
func NewMuteStage(m types.Muter, metrics *Metrics) *MuteStage {
return &MuteStage{muter: m, metrics: metrics}
}

// Exec implements the Stage interface.
@@ -535,7 +549,18 @@ func (n *MuteStage) Exec(ctx context.Context, logger log.Logger, alerts ...*type
}
if len(muted) > 0 {
level.Debug(logger).Log("msg", "Notifications will not be sent for muted alerts", "alerts", fmt.Sprintf("%v", muted))

var reason string
switch n.muter.(type) {
case *silence.Silencer:
reason = suppressedReasonSilence
case *inhibit.Inhibitor:
reason = suppressedReasonInhibition
default:
}
n.metrics.numNotificationSuppressedTotal.WithLabelValues(reason).Add(float64(len(muted)))
}

return ctx, filtered, nil
}

@@ -894,13 +919,14 @@ func (n SetNotifiesStage) Exec(ctx context.Context, l log.Logger, alerts ...*typ
}

type timeStage struct {
muter types.TimeMuter
muter types.TimeMuter
metrics *Metrics
}

type TimeMuteStage timeStage

func NewTimeMuteStage(m types.TimeMuter) *TimeMuteStage {
return &TimeMuteStage{m}
func NewTimeMuteStage(m types.TimeMuter, metrics *Metrics) *TimeMuteStage {
return &TimeMuteStage{m, metrics}
}

// Exec implements the stage interface for TimeMuteStage.
@@ -927,16 +953,17 @@ func (tms TimeMuteStage) Exec(ctx context.Context, l log.Logger, alerts ...*type

// If the current time is inside a mute time, all alerts are removed from the pipeline.
if muted {
level.Debug(l).Log("msg", "Notifications not sent, route is within mute time")
tms.metrics.numNotificationSuppressedTotal.WithLabelValues(suppressedReasonMuteTimeInterval).Add(float64(len(alerts)))
level.Debug(l).Log("msg", "Notifications not sent, route is within mute time", "alerts", len(alerts))
return ctx, nil, nil
}
return ctx, alerts, nil
}

type TimeActiveStage timeStage

func NewTimeActiveStage(m types.TimeMuter) *TimeActiveStage {
return &TimeActiveStage{m}
func NewTimeActiveStage(m types.TimeMuter, metrics *Metrics) *TimeActiveStage {
return &TimeActiveStage{m, metrics}
}

// Exec implements the stage interface for TimeActiveStage.
@@ -964,7 +991,8 @@ func (tas TimeActiveStage) Exec(ctx context.Context, l log.Logger, alerts ...*ty

// If the current time is not inside an active time, all alerts are removed from the pipeline
if !muted {
level.Debug(l).Log("msg", "Notifications not sent, route is not within active time")
tas.metrics.numNotificationSuppressedTotal.WithLabelValues(suppressedReasonActiveTimeInterval).Add(float64(len(alerts)))
level.Debug(l).Log("msg", "Notifications not sent, route is not within active time", "alerts", len(alerts))
return ctx, nil, nil
}

45 changes: 40 additions & 5 deletions notify/notify_test.go
@@ -666,7 +666,8 @@ func TestMuteStage(t *testing.T) {
return ok
})

stage := NewMuteStage(muter)
metrics := NewMetrics(prometheus.NewRegistry(), featurecontrol.NoopFlags{})
stage := NewMuteStage(muter, metrics)

in := []model.LabelSet{
{},
@@ -705,6 +706,10 @@ func TestMuteStage(t *testing.T) {
if !reflect.DeepEqual(got, out) {
t.Fatalf("Muting failed, expected: %v\ngot %v", out, got)
}
suppressed := int(prom_testutil.ToFloat64(metrics.numNotificationSuppressedTotal))
if (len(in) - len(got)) != suppressed {
t.Fatalf("Expected %d alerts counted in suppressed metric but got %d", (len(in) - len(got)), suppressed)
}
}

func TestMuteStageWithSilences(t *testing.T) {
@@ -720,9 +725,11 @@ func TestMuteStageWithSilences(t *testing.T) {
t.Fatal(err)
}

marker := types.NewMarker(prometheus.NewRegistry())
reg := prometheus.NewRegistry()
marker := types.NewMarker(reg)
silencer := silence.NewSilencer(silences, marker, log.NewNopLogger())
stage := NewMuteStage(silencer)
metrics := NewMetrics(reg, featurecontrol.NoopFlags{})
stage := NewMuteStage(silencer, metrics)

in := []model.LabelSet{
{},
@@ -765,6 +772,10 @@ func TestMuteStageWithSilences(t *testing.T) {
if !reflect.DeepEqual(got, out) {
t.Fatalf("Muting failed, expected: %v\ngot %v", out, got)
}
suppressedRoundOne := int(prom_testutil.ToFloat64(metrics.numNotificationSuppressedTotal))
if (len(in) - len(got)) != suppressedRoundOne {
t.Fatalf("Expected %d alerts counted in suppressed metric but got %d", (len(in) - len(got)), suppressedRoundOne)
}

// Do it again to exercise the version tracking of silences.
_, alerts, err = stage.Exec(context.Background(), log.NewNopLogger(), inAlerts...)
@@ -781,6 +792,11 @@ func TestMuteStageWithSilences(t *testing.T) {
t.Fatalf("Muting failed, expected: %v\ngot %v", out, got)
}

suppressedRoundTwo := int(prom_testutil.ToFloat64(metrics.numNotificationSuppressedTotal))
if (len(in) - len(got) + suppressedRoundOne) != suppressedRoundTwo {
t.Fatalf("Expected %d alerts counted in suppressed metric but got %d", (len(in) - len(got)), suppressedRoundTwo)
}

// Expire the silence and verify that no alerts are silenced now.
if err := silences.Expire(silID); err != nil {
t.Fatal(err)
@@ -798,6 +814,10 @@ func TestMuteStageWithSilences(t *testing.T) {
if !reflect.DeepEqual(got, in) {
t.Fatalf("Unmuting failed, expected: %v\ngot %v", in, got)
}
suppressedRoundThree := int(prom_testutil.ToFloat64(metrics.numNotificationSuppressedTotal))
if (len(in) - len(got) + suppressedRoundTwo) != suppressedRoundThree {
t.Fatalf("Expected %d alerts counted in suppressed metric but got %d", (len(in) - len(got)), suppressedRoundThree)
}
}

func TestTimeMuteStage(t *testing.T) {
@@ -874,7 +894,8 @@ func TestTimeMuteStage(t *testing.T) {
}
m := map[string][]timeinterval.TimeInterval{"test": intervals}
intervener := timeinterval.NewIntervener(m)
stage := NewTimeMuteStage(intervener)
metrics := NewMetrics(prometheus.NewRegistry(), featurecontrol.NoopFlags{})
stage := NewTimeMuteStage(intervener, metrics)

outAlerts := []*types.Alert{}
nonMuteCount := 0
@@ -908,6 +929,10 @@ func TestTimeMuteStage(t *testing.T) {
if len(outAlerts) != nonMuteCount {
t.Fatalf("Expected %d alerts after time mute stage but got %d", nonMuteCount, len(outAlerts))
}
suppressed := int(prom_testutil.ToFloat64(metrics.numNotificationSuppressedTotal))
if (len(cases) - nonMuteCount) != suppressed {
t.Fatalf("Expected %d alerts counted in suppressed metric but got %d", (len(cases) - nonMuteCount), suppressed)
}
}

func TestTimeActiveStage(t *testing.T) {
@@ -933,6 +958,11 @@ func TestTimeActiveStage(t *testing.T) {
labels: model.LabelSet{"mute": "me"},
shouldMute: true,
},
{
fireTime: "02 Dec 20 16:59 +0000",
labels: model.LabelSet{"mute": "me"},
shouldMute: true,
},
{
// Tuesday before 5pm
fireTime: "01 Dec 20 16:59 +0000",
@@ -959,7 +989,8 @@ func TestTimeActiveStage(t *testing.T) {
}
m := map[string][]timeinterval.TimeInterval{"test": intervals}
intervener := timeinterval.NewIntervener(m)
stage := NewTimeActiveStage(intervener)
metrics := NewMetrics(prometheus.NewRegistry(), featurecontrol.NoopFlags{})
stage := NewTimeActiveStage(intervener, metrics)

outAlerts := []*types.Alert{}
nonMuteCount := 0
@@ -993,6 +1024,10 @@ func TestTimeActiveStage(t *testing.T) {
if len(outAlerts) != nonMuteCount {
t.Fatalf("Expected %d alerts after time mute stage but got %d", nonMuteCount, len(outAlerts))
}
suppressed := int(prom_testutil.ToFloat64(metrics.numNotificationSuppressedTotal))
if (len(cases) - nonMuteCount) != suppressed {
t.Fatalf("Expected %d alerts counted in suppressed metric but got %d", (len(cases) - nonMuteCount), suppressed)
}
}

func BenchmarkHashAlert(b *testing.B) {
