Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: add counter to track alerts dropped outside of time_intervals #3565

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
58 changes: 43 additions & 15 deletions notify/notify.go
Original file line number Diff line number Diff line change
Expand Up @@ -251,6 +251,7 @@ type Metrics struct {
numTotalFailedNotifications *prometheus.CounterVec
numNotificationRequestsTotal *prometheus.CounterVec
numNotificationRequestsFailedTotal *prometheus.CounterVec
numNotificationSuppressedTotal *prometheus.CounterVec
notificationLatencySeconds *prometheus.HistogramVec

ff featurecontrol.Flagger
Expand Down Expand Up @@ -284,6 +285,11 @@ func NewMetrics(r prometheus.Registerer, ff featurecontrol.Flagger) *Metrics {
Name: "notification_requests_failed_total",
Help: "The total number of failed notification requests.",
}, labels),
numNotificationSuppressedTotal: prometheus.NewCounterVec(prometheus.CounterOpts{
Namespace: "alertmanager",
Name: "notifications_suppressed_total",
Help: "The total number of notifications suppressed for being outside of active time intervals or within muted time intervals.",
}, []string{"reason"}),
notificationLatencySeconds: prometheus.NewHistogramVec(prometheus.HistogramOpts{
Namespace: "alertmanager",
Name: "notification_latency_seconds",
Expand All @@ -296,7 +302,7 @@ func NewMetrics(r prometheus.Registerer, ff featurecontrol.Flagger) *Metrics {
r.MustRegister(
m.numNotifications, m.numTotalFailedNotifications,
m.numNotificationRequestsTotal, m.numNotificationRequestsFailedTotal,
m.notificationLatencySeconds,
m.numNotificationSuppressedTotal, m.notificationLatencySeconds,
)

return m
Expand Down Expand Up @@ -381,10 +387,10 @@ func (pb *PipelineBuilder) New(
rs := make(RoutingStage, len(receivers))

ms := NewGossipSettleStage(peer)
is := NewMuteStage(inhibitor)
tas := NewTimeActiveStage(intervener)
tms := NewTimeMuteStage(intervener)
ss := NewMuteStage(silencer)
is := NewMuteStage(inhibitor, pb.metrics)
tas := NewTimeActiveStage(intervener, pb.metrics)
tms := NewTimeMuteStage(intervener, pb.metrics)
ss := NewMuteStage(silencer, pb.metrics)

for name := range receivers {
st := createReceiverStage(name, receivers[name], wait, notificationLog, pb.metrics)
Expand Down Expand Up @@ -507,14 +513,22 @@ func (n *GossipSettleStage) Exec(ctx context.Context, _ log.Logger, alerts ...*t
return ctx, alerts, nil
}

// Values passed as the "reason" label of the numNotificationSuppressedTotal
// counter, one per mechanism that drops alerts from the pipeline.
const (
// Used by MuteStage when its muter is a *silence.Silencer.
suppressedReasonSilence = "silence"
// Used by MuteStage when its muter is an *inhibit.Inhibitor.
suppressedReasonInhibition = "inhibition"
// Used by TimeMuteStage when the route is within a mute time.
suppressedReasonMuteTimeInterval = "mute_time_interval"
// Used by TimeActiveStage when the route is not within an active time.
suppressedReasonActiveTimeInterval = "active_time_interval"
)

// MuteStage filters alerts through a Muter and counts the alerts it mutes
// in the notification metrics.
type MuteStage struct {
muter types.Muter
muter types.Muter
// metrics holds the numNotificationSuppressedTotal counter that Exec
// increments by the number of muted alerts.
metrics *Metrics
}

// NewMuteStage returns a new MuteStage.
func NewMuteStage(m types.Muter) *MuteStage {
return &MuteStage{muter: m}
func NewMuteStage(m types.Muter, metrics *Metrics) *MuteStage {
return &MuteStage{muter: m, metrics: metrics}
}

// Exec implements the Stage interface.
Expand All @@ -535,7 +549,18 @@ func (n *MuteStage) Exec(ctx context.Context, logger log.Logger, alerts ...*type
}
if len(muted) > 0 {
level.Debug(logger).Log("msg", "Notifications will not be sent for muted alerts", "alerts", fmt.Sprintf("%v", muted))

var reason string
switch n.muter.(type) {
case *silence.Silencer:
reason = suppressedReasonSilence
case *inhibit.Inhibitor:
reason = suppressedReasonInhibition
default:
}
n.metrics.numNotificationSuppressedTotal.WithLabelValues(reason).Add(float64(len(muted)))
}

return ctx, filtered, nil
}

Expand Down Expand Up @@ -877,13 +902,14 @@ func (n SetNotifiesStage) Exec(ctx context.Context, l log.Logger, alerts ...*typ
}

// timeStage is the underlying type of both TimeMuteStage and TimeActiveStage.
type timeStage struct {
muter types.TimeMuter
muter types.TimeMuter
// metrics holds the numNotificationSuppressedTotal counter that the
// time-based stages increment when they drop alerts.
metrics *Metrics
}

// TimeMuteStage is a pipeline stage that removes all alerts from the pipeline
// while the current time is inside a mute time of the route.
type TimeMuteStage timeStage

func NewTimeMuteStage(m types.TimeMuter) *TimeMuteStage {
return &TimeMuteStage{m}
func NewTimeMuteStage(m types.TimeMuter, metrics *Metrics) *TimeMuteStage {
return &TimeMuteStage{m, metrics}
}

// Exec implements the stage interface for TimeMuteStage.
Expand All @@ -910,16 +936,17 @@ func (tms TimeMuteStage) Exec(ctx context.Context, l log.Logger, alerts ...*type

// If the current time is inside a mute time, all alerts are removed from the pipeline.
if muted {
level.Debug(l).Log("msg", "Notifications not sent, route is within mute time")
tms.metrics.numNotificationSuppressedTotal.WithLabelValues(suppressedReasonMuteTimeInterval).Add(float64(len(alerts)))
level.Debug(l).Log("msg", "Notifications not sent, route is within mute time", "alerts", len(alerts))
return ctx, nil, nil
}
return ctx, alerts, nil
}

// TimeActiveStage is a pipeline stage that removes all alerts from the
// pipeline while the current time is not inside an active time of the route.
type TimeActiveStage timeStage

func NewTimeActiveStage(m types.TimeMuter) *TimeActiveStage {
return &TimeActiveStage{m}
func NewTimeActiveStage(m types.TimeMuter, metrics *Metrics) *TimeActiveStage {
return &TimeActiveStage{m, metrics}
}

// Exec implements the stage interface for TimeActiveStage.
Expand Down Expand Up @@ -947,7 +974,8 @@ func (tas TimeActiveStage) Exec(ctx context.Context, l log.Logger, alerts ...*ty

// If the current time is not inside an active time, all alerts are removed from the pipeline
if !muted {
level.Debug(l).Log("msg", "Notifications not sent, route is not within active time")
tas.metrics.numNotificationSuppressedTotal.WithLabelValues(suppressedReasonActiveTimeInterval).Add(float64(len(alerts)))
level.Debug(l).Log("msg", "Notifications not sent, route is not within active time", "alerts", len(alerts))
return ctx, nil, nil
}

Expand Down
45 changes: 40 additions & 5 deletions notify/notify_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -633,7 +633,8 @@ func TestMuteStage(t *testing.T) {
return ok
})

stage := NewMuteStage(muter)
metrics := NewMetrics(prometheus.NewRegistry(), featurecontrol.NoopFlags{})
stage := NewMuteStage(muter, metrics)

in := []model.LabelSet{
{},
Expand Down Expand Up @@ -672,6 +673,10 @@ func TestMuteStage(t *testing.T) {
if !reflect.DeepEqual(got, out) {
t.Fatalf("Muting failed, expected: %v\ngot %v", out, got)
}
suppressed := int(prom_testutil.ToFloat64(metrics.numNotificationSuppressedTotal))
if (len(in) - len(got)) != suppressed {
t.Fatalf("Expected %d alerts counted in suppressed metric but got %d", (len(in) - len(got)), suppressed)
}
}

func TestMuteStageWithSilences(t *testing.T) {
Expand All @@ -687,9 +692,11 @@ func TestMuteStageWithSilences(t *testing.T) {
t.Fatal(err)
}

marker := types.NewMarker(prometheus.NewRegistry())
reg := prometheus.NewRegistry()
marker := types.NewMarker(reg)
silencer := silence.NewSilencer(silences, marker, log.NewNopLogger())
stage := NewMuteStage(silencer)
metrics := NewMetrics(reg, featurecontrol.NoopFlags{})
stage := NewMuteStage(silencer, metrics)

in := []model.LabelSet{
{},
Expand Down Expand Up @@ -732,6 +739,10 @@ func TestMuteStageWithSilences(t *testing.T) {
if !reflect.DeepEqual(got, out) {
t.Fatalf("Muting failed, expected: %v\ngot %v", out, got)
}
suppressedRoundOne := int(prom_testutil.ToFloat64(metrics.numNotificationSuppressedTotal))
if (len(in) - len(got)) != suppressedRoundOne {
t.Fatalf("Expected %d alerts counted in suppressed metric but got %d", (len(in) - len(got)), suppressedRoundOne)
}

// Do it again to exercise the version tracking of silences.
_, alerts, err = stage.Exec(context.Background(), log.NewNopLogger(), inAlerts...)
Expand All @@ -748,6 +759,11 @@ func TestMuteStageWithSilences(t *testing.T) {
t.Fatalf("Muting failed, expected: %v\ngot %v", out, got)
}

suppressedRoundTwo := int(prom_testutil.ToFloat64(metrics.numNotificationSuppressedTotal))
if (len(in) - len(got) + suppressedRoundOne) != suppressedRoundTwo {
t.Fatalf("Expected %d alerts counted in suppressed metric but got %d", (len(in) - len(got)), suppressedRoundTwo)
}

// Expire the silence and verify that no alerts are silenced now.
if err := silences.Expire(silID); err != nil {
t.Fatal(err)
Expand All @@ -765,6 +781,10 @@ func TestMuteStageWithSilences(t *testing.T) {
if !reflect.DeepEqual(got, in) {
t.Fatalf("Unmuting failed, expected: %v\ngot %v", in, got)
}
suppressedRoundThree := int(prom_testutil.ToFloat64(metrics.numNotificationSuppressedTotal))
if (len(in) - len(got) + suppressedRoundTwo) != suppressedRoundThree {
t.Fatalf("Expected %d alerts counted in suppressed metric but got %d", (len(in) - len(got)), suppressedRoundThree)
}
}

func TestTimeMuteStage(t *testing.T) {
Expand Down Expand Up @@ -841,7 +861,8 @@ func TestTimeMuteStage(t *testing.T) {
}
m := map[string][]timeinterval.TimeInterval{"test": intervals}
intervener := timeinterval.NewIntervener(m)
stage := NewTimeMuteStage(intervener)
metrics := NewMetrics(prometheus.NewRegistry(), featurecontrol.NoopFlags{})
stage := NewTimeMuteStage(intervener, metrics)

outAlerts := []*types.Alert{}
nonMuteCount := 0
Expand Down Expand Up @@ -875,6 +896,10 @@ func TestTimeMuteStage(t *testing.T) {
if len(outAlerts) != nonMuteCount {
t.Fatalf("Expected %d alerts after time mute stage but got %d", nonMuteCount, len(outAlerts))
}
suppressed := int(prom_testutil.ToFloat64(metrics.numNotificationSuppressedTotal))
if (len(cases) - nonMuteCount) != suppressed {
t.Fatalf("Expected %d alerts counted in suppressed metric but got %d", (len(cases) - nonMuteCount), suppressed)
}
}

func TestTimeActiveStage(t *testing.T) {
Expand All @@ -900,6 +925,11 @@ func TestTimeActiveStage(t *testing.T) {
labels: model.LabelSet{"mute": "me"},
shouldMute: true,
},
{
fireTime: "02 Dec 20 16:59 +0000",
labels: model.LabelSet{"mute": "me"},
shouldMute: true,
},
{
// Tuesday before 5pm
fireTime: "01 Dec 20 16:59 +0000",
Expand All @@ -926,7 +956,8 @@ func TestTimeActiveStage(t *testing.T) {
}
m := map[string][]timeinterval.TimeInterval{"test": intervals}
intervener := timeinterval.NewIntervener(m)
stage := NewTimeActiveStage(intervener)
metrics := NewMetrics(prometheus.NewRegistry(), featurecontrol.NoopFlags{})
stage := NewTimeActiveStage(intervener, metrics)

outAlerts := []*types.Alert{}
nonMuteCount := 0
Expand Down Expand Up @@ -960,6 +991,10 @@ func TestTimeActiveStage(t *testing.T) {
if len(outAlerts) != nonMuteCount {
t.Fatalf("Expected %d alerts after time mute stage but got %d", nonMuteCount, len(outAlerts))
}
suppressed := int(prom_testutil.ToFloat64(metrics.numNotificationSuppressedTotal))
if (len(cases) - nonMuteCount) != suppressed {
t.Fatalf("Expected %d alerts counted in suppressed metric but got %d", (len(cases) - nonMuteCount), suppressed)
}
}

func BenchmarkHashAlert(b *testing.B) {
Expand Down
Loading