Skip to content

Commit

Permalink
Capture and expose notification delivery errors (#31)
Browse files Browse the repository at this point in the history
This PR makes it possible to store the last error for each receiver in case of notification delivery failure. These errors are exposed via the `/api/v2/receivers` endpoint.

Co-authored-by: gotjosh <[email protected]>
  • Loading branch information
santihernandezc and gotjosh authored Jan 19, 2023
1 parent f59460b commit ec19b0a
Show file tree
Hide file tree
Showing 12 changed files with 469 additions and 66 deletions.
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -20,3 +20,7 @@
!/.travis.yml
!/.promu.yml
!/api/v2/openapi.yaml

# Editor
.vscode
.DS_Store
5 changes: 3 additions & 2 deletions api/api.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ import (
"github.com/prometheus/alertmanager/cluster"
"github.com/prometheus/alertmanager/config"
"github.com/prometheus/alertmanager/dispatch"
"github.com/prometheus/alertmanager/notify"
"github.com/prometheus/alertmanager/provider"
"github.com/prometheus/alertmanager/silence"
"github.com/prometheus/alertmanager/types"
Expand Down Expand Up @@ -195,9 +196,9 @@ func (api *API) Register(r *route.Router, routePrefix string) *http.ServeMux {

// Update sets the config and resolve timeout of each API. APIv2 also needs
// the receivers and setAlertStatus to be updated.
func (api *API) Update(cfg *config.Config, setAlertStatus func(model.LabelSet)) {
func (api *API) Update(cfg *config.Config, receivers []*notify.Receiver, setAlertStatus func(model.LabelSet)) {
api.v1.Update(cfg)
api.v2.Update(cfg, setAlertStatus)
api.v2.Update(cfg, setAlertStatus, receivers)
}

func (api *API) limitHandler(h http.Handler) http.Handler {
Expand Down
47 changes: 40 additions & 7 deletions api/v2/api.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,6 @@ import (
"github.com/prometheus/common/version"
"github.com/rs/cors"

"github.com/prometheus/alertmanager/api/metrics"
open_api_models "github.com/prometheus/alertmanager/api/v2/models"
"github.com/prometheus/alertmanager/api/v2/restapi"
"github.com/prometheus/alertmanager/api/v2/restapi/operations"
Expand All @@ -41,9 +40,12 @@ import (
general_ops "github.com/prometheus/alertmanager/api/v2/restapi/operations/general"
receiver_ops "github.com/prometheus/alertmanager/api/v2/restapi/operations/receiver"
silence_ops "github.com/prometheus/alertmanager/api/v2/restapi/operations/silence"

"github.com/prometheus/alertmanager/api/metrics"
"github.com/prometheus/alertmanager/cluster"
"github.com/prometheus/alertmanager/config"
"github.com/prometheus/alertmanager/dispatch"
"github.com/prometheus/alertmanager/notify"
"github.com/prometheus/alertmanager/pkg/labels"
"github.com/prometheus/alertmanager/provider"
"github.com/prometheus/alertmanager/silence"
Expand Down Expand Up @@ -71,7 +73,8 @@ type API struct {
logger log.Logger
m *metrics.Alerts

Handler http.Handler
Handler http.Handler
receivers []*notify.Receiver
}

type (
Expand Down Expand Up @@ -140,13 +143,14 @@ func (api *API) requestLogger(req *http.Request) log.Logger {
}

// Update sets the API struct members that may change between reloads of alertmanager.
func (api *API) Update(cfg *config.Config, setAlertStatus setAlertStatusFn) {
func (api *API) Update(cfg *config.Config, setAlertStatus setAlertStatusFn, receivers []*notify.Receiver) {
api.mtx.Lock()
defer api.mtx.Unlock()

api.alertmanagerConfig = cfg
api.route = dispatch.NewRoute(cfg.Route, nil)
api.setAlertStatus = setAlertStatus
api.receivers = receivers
}

func (api *API) getStatusHandler(params general_ops.GetStatusParams) middleware.Responder {
Expand Down Expand Up @@ -207,11 +211,40 @@ func (api *API) getStatusHandler(params general_ops.GetStatusParams) middleware.

func (api *API) getReceiversHandler(params receiver_ops.GetReceiversParams) middleware.Responder {
api.mtx.RLock()
defer api.mtx.RUnlock()
configReceivers := api.receivers
api.mtx.RUnlock()

receivers := make([]*open_api_models.Receiver, 0, len(configReceivers))
for _, r := range configReceivers {
integrations := make([]*open_api_models.Integration, 0, len(r.Integrations()))

for _, integration := range r.Integrations() {
notify, duration, err := integration.GetReport()
iname := integration.String()
sendResolved := integration.SendResolved()
integrations = append(integrations, &open_api_models.Integration{
Name: &iname,
SendResolved: &sendResolved,
LastNotifyAttempt: strfmt.DateTime(notify.UTC()),
LastNotifyAttemptDuration: duration.String(),
LastNotifyAttemptError: func() string {
if err != nil {
return err.Error()
}
return ""
}(),
})
}

rName := r.Name()
active := r.Active()
model := &open_api_models.Receiver{
Name: &rName,
Active: &active,
Integrations: integrations,
}

receivers := make([]*open_api_models.Receiver, 0, len(api.alertmanagerConfig.Receivers))
for _, r := range api.alertmanagerConfig.Receivers {
receivers = append(receivers, &open_api_models.Receiver{Name: &r.Name})
receivers = append(receivers, model)
}

return receiver_ops.NewGetReceiversOK().WithPayload(receivers)
Expand Down
122 changes: 122 additions & 0 deletions api/v2/models/integration.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

51 changes: 51 additions & 0 deletions api/v2/models/receiver.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

28 changes: 28 additions & 0 deletions api/v2/openapi.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -506,8 +506,36 @@ definitions:
properties:
name:
type: string
active:
type: boolean
integrations:
type: array
items:
$ref: '#/definitions/integration'
required:
- name
- active
- integrations
integration:
type: object
properties:
name:
type: string
sendResolved:
type: boolean
lastNotifyAttempt:
description: A timestamp indicating the last attempt to deliver a notification regardless of the outcome.
type: string
format: date-time
lastNotifyAttemptDuration:
description: Duration of the last attempt to deliver a notification in human-readable format (e.g. `1s` or `15ms`).
type: string
lastNotifyAttemptError:
description: Error string for the last attempt to deliver a notification. Empty if the last attempt was successful.
type: string
required:
- name
- sendResolved
labelSet:
type: object
additionalProperties:
Expand Down
Loading

0 comments on commit ec19b0a

Please sign in to comment.