DEV-47164 - Add alerting observability in logs (#68)
We need more log data to be able to investigate alerting issues.
The main missing piece is the ability to correlate all the logs for a single evaluation - that requires the rule UID and eval time.

Also removed some debug logs that were excessive and changed log levels in some places.
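
To illustrate the pattern the changes below apply: a child logger is derived once with the rule UID, org ID, and evaluation time, and every log call made through it carries those fields, so a single search on rule_uid plus eval_time returns all the logs for one evaluation. A minimal sketch of the idea, using the standard library's log/slog in place of Grafana's internal logger (the rule values here are made up):

```go
package main

import (
	"log/slog"
	"os"
	"time"
)

func main() {
	base := slog.New(slog.NewJSONHandler(os.Stdout,
		&slog.HandlerOptions{Level: slog.LevelDebug}))

	// Attach the correlation fields once; every line emitted through this
	// child logger carries them automatically.
	evalLogger := base.With(
		"rule_uid", "adx1qv0ksl", // hypothetical rule UID
		"org_id", int64(1),
		"eval_time", time.Now().UTC().Format(time.RFC3339),
	)

	evalLogger.Info("Sending alerts to local notifier", "count", 3)
	evalLogger.Debug("Local notifier was not found")
	// Both lines share rule_uid and eval_time, so a single log query on
	// those two fields reconstructs the whole evaluation.
}
```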
yasmin-tr authored Jan 5, 2025
1 parent 0e5c976 commit f0130ac
Showing 4 changed files with 15 additions and 14 deletions.
9 changes: 5 additions & 4 deletions pkg/services/ngalert/api/alerting_logzio.go
@@ -76,19 +76,20 @@ func (srv *LogzioAlertingService) addQuerySourceHeader(c *contextmodel.ReqContex
}

func (srv *LogzioAlertingService) RouteSendAlertNotifications(c *contextmodel.ReqContext, sendNotificationsRequest apimodels.AlertSendNotificationsRequest) response.Response {
c.Logger.Info("Sending alerts to local notifier", "count", len(sendNotificationsRequest.Alerts.PostableAlerts))
logger := c.Logger.New(sendNotificationsRequest.AlertRuleKey.LogContext()...).FromContext(c.Req.Context())
logger.Info("Sending alerts to local notifier", "count", len(sendNotificationsRequest.Alerts.PostableAlerts))
n, err := srv.MultiOrgAlertmanager.AlertmanagerFor(sendNotificationsRequest.AlertRuleKey.OrgID)
if err == nil {
if err := n.PutAlerts(c.Req.Context(), sendNotificationsRequest.Alerts); err != nil {
c.Logger.Error("Failed to put alerts in the local notifier", "count", len(sendNotificationsRequest.Alerts.PostableAlerts), "error", err)
logger.Error("Failed to put alerts in the local notifier", "count", len(sendNotificationsRequest.Alerts.PostableAlerts), "error", err)
} else {
return response.Success("Put alerts was successful")
}
} else {
if errors.Is(err, notifier.ErrNoAlertmanagerForOrg) {
c.Logger.Debug("Local notifier was not found")
logger.Debug("Local notifier was not found")
} else {
c.Logger.Error("Local notifier is not available", "error", err)
logger.Error("Local notifier is not available", "error", err)
}
}

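
The change above derives logger once from the rule key and the request context instead of repeating identifying fields at every call site. A rough sketch of that shape, again with log/slog standing in for Grafana's logger and a hypothetical key type whose LogContext helper mirrors how AlertRuleKey.LogContext() is spread into the logger (the real helper may return additional fields):

```go
package main

import (
	"log/slog"
	"os"
)

// ruleKey is a stand-in for the alert rule key; LogContext returns
// alternating key/value pairs, matching how the diff spreads
// sendNotificationsRequest.AlertRuleKey.LogContext() into the logger.
type ruleKey struct {
	OrgID int64
	UID   string
}

func (k ruleKey) LogContext() []any {
	return []any{"rule_uid", k.UID, "org_id", k.OrgID}
}

func main() {
	base := slog.New(slog.NewTextHandler(os.Stdout, nil))
	key := ruleKey{OrgID: 1, UID: "adx1qv0ksl"} // made-up values

	// Derive the handler-scoped logger once ...
	logger := base.With(key.LogContext()...)

	// ... and every line below carries rule_uid and org_id for free.
	logger.Info("Sending alerts to local notifier", "count", 3)
	logger.Error("Failed to put alerts in the local notifier", "error", "connection refused")
}
```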
1 change: 1 addition & 0 deletions pkg/services/ngalert/ngalert.go
@@ -271,6 +271,7 @@ func (ng *AlertNG) init() error {

// LOGZ.IO GRAFANA CHANGE :: DEV-43744 Add logzio notification route
var alertsSender schedule.AlertsSender
ng.Log.Debug("Unified Alerting scheduled evaluation config", "scheduled_evaluation_enabled", ng.Cfg.UnifiedAlerting.ScheduledEvalEnabled)
if ng.Cfg.UnifiedAlerting.ScheduledEvalEnabled {
alertsSender = alertsRouter
} else {
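
The one-line addition above records the scheduled-evaluation flag at startup, so the logs show which evaluation path (internal scheduler vs. external evaluation) is active. A small sketch of the same idea under assumed names, with a local config struct standing in for ng.Cfg.UnifiedAlerting:

```go
package main

import (
	"log/slog"
	"os"
)

// unifiedAlertingCfg is a stand-in for the relevant part of the real config.
type unifiedAlertingCfg struct {
	ScheduledEvalEnabled bool
}

func main() {
	logger := slog.New(slog.NewTextHandler(os.Stdout,
		&slog.HandlerOptions{Level: slog.LevelDebug}))
	cfg := unifiedAlertingCfg{ScheduledEvalEnabled: false} // assumed value

	// Log the flag once at init so operators can tell from the logs which
	// path alert evaluation will take.
	logger.Debug("Unified Alerting scheduled evaluation config",
		"scheduled_evaluation_enabled", cfg.ScheduledEvalEnabled)

	if cfg.ScheduledEvalEnabled {
		logger.Info("using the internal scheduler for rule evaluation")
	} else {
		logger.Info("expecting external evaluation requests")
	}
}
```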
16 changes: 8 additions & 8 deletions pkg/services/ngalert/schedule/schedule.go
@@ -301,9 +301,8 @@ func (sch *schedule) processTick(ctx context.Context, dispatcherGroup *errgroup.
folderTitle: folderTitle,
}})
}
} else {
sch.log.Debug("Scheduled evaluation disabled, not adding alerts to run")
}
// LOGZ.IO GRAFANA CHANGE :: End

if _, isUpdated := updated[key]; isUpdated && !isReadyToRun {
// if we do not need to eval the rule, check whether the rule was just updated and if it was, notify the evaluation routine about that
@@ -363,7 +362,7 @@ func (sch *schedule) processTick(ctx context.Context, dispatcherGroup *errgroup.

// LOGZ.IO GRAFANA CHANGE :: DEV-43744 Add logzio external evaluation
func (sch *schedule) RunRuleEvaluation(ctx context.Context, evalReq ngmodels.ExternalAlertEvaluationRequest) error {
logger := sch.log.FromContext(ctx)
logger := sch.log.New(ctx, "eval_time", evalReq.EvalTime, "rule_title", evalReq.AlertRule.Title, "rule_uid", evalReq.AlertRule.UID, "org_id", evalReq.AlertRule.OrgID).FromContext(ctx)
alertKey := ngmodels.AlertRuleKey{
OrgID: evalReq.AlertRule.OrgID,
UID: evalReq.AlertRule.UID,
@@ -385,7 +384,7 @@ func (sch *schedule) RunRuleEvaluation(ctx context.Context, evalReq ngmodels.Ext
return fmt.Errorf("evaluation was not sent for alert key %s", alertKey)
}
if dropped != nil {
logger.Warn("RunRuleEvaluation: got dropped eval", "dropped", dropped, "rule_uid", alertKey.UID, "org_id", alertKey.OrgID)
logger.Warn("RunRuleEvaluation: got dropped eval", "dropped_eval_time", dropped.scheduledAt)
}
}
} else {
@@ -428,7 +427,7 @@ func (sch *schedule) ruleRoutine(grafanaCtx context.Context, key ngmodels.AlertR
}

evaluate := func(ctx context.Context, f fingerprint, attempt int64, e *evaluation, span trace.Span, retry bool) error {
logger := logger.New("version", e.rule.Version, "fingerprint", f, "attempt", attempt, "now", e.scheduledAt).FromContext(ctx)
logger := logger.New("version", e.rule.Version, "fingerprint", f, "attempt", attempt, "eval_time", e.scheduledAt).FromContext(ctx) // LOGZ.IO GRAFANA CHANGE :: DEV-47164: Add observability to alerting
start := sch.clock.Now()

// LOGZ.IO GRAFANA CHANGE :: DEV-43889 - Add headers for logzio datasources support
@@ -517,6 +516,7 @@ func (sch *schedule) ruleRoutine(grafanaCtx context.Context, key ngmodels.AlertR
attribute.Int64("alerts_to_send", int64(len(alerts.PostableAlerts))),
))
if len(alerts.PostableAlerts) > 0 {
logger.Info("Sending postable alerts", "alerts", len(alerts.PostableAlerts)) // LOGZ.IO GRAFANA CHANGE :: DEV-47164: Add observability to alerting
sch.alertsSender.Send(ctx, key, alerts)
}
sendDuration.Observe(sch.clock.Now().Sub(start).Seconds())
@@ -593,7 +593,7 @@ func (sch *schedule) ruleRoutine(grafanaCtx context.Context, key ngmodels.AlertR
if tracingCtx.Err() != nil {
span.SetStatus(codes.Error, "rule evaluation cancelled")
span.End()
logger.Error("Skip evaluation and updating the state because the context has been cancelled", "version", ctx.rule.Version, "fingerprint", f, "attempt", attempt, "now", ctx.scheduledAt)
logger.Error("Skip evaluation and updating the state because the context has been cancelled", "version", ctx.rule.Version, "fingerprint", f, "attempt", attempt, "eval_time", ctx.scheduledAt)
return
}

@@ -606,10 +606,10 @@ func (sch *schedule) ruleRoutine(grafanaCtx context.Context, key ngmodels.AlertR
return
}

logger.Error("Failed to evaluate rule", "version", ctx.rule.Version, "fingerprint", f, "attempt", attempt, "now", ctx.scheduledAt, "error", err)
logger.Error("Failed to evaluate rule", "version", ctx.rule.Version, "fingerprint", f, "attempt", attempt, "eval_time", ctx.scheduledAt, "error", err)
select {
case <-tracingCtx.Done():
logger.Error("Context has been cancelled while backing off", "version", ctx.rule.Version, "fingerprint", f, "attempt", attempt, "now", ctx.scheduledAt)
logger.Error("Context has been cancelled while backing off", "version", ctx.rule.Version, "fingerprint", f, "attempt", attempt, "eval_time", ctx.scheduledAt)
return
case <-time.After(retryDelay):
continue
3 changes: 1 addition & 2 deletions pkg/services/ngalert/sender/router_logzio.go
@@ -37,7 +37,6 @@ func NewLogzioAlertsRouter(alertsRouteUrl string) (*LogzioAlertsRouter, error) {

func (d *LogzioAlertsRouter) Send(ctx context.Context, key models.AlertRuleKey, alerts definitions.PostableAlerts) {
logger := d.logger.New(key.LogContext()...)
logger.Debug("Sending alerts on logzio sender")
if len(alerts.PostableAlerts) == 0 {
logger.Info("No alerts to notify about")
return
@@ -55,7 +54,7 @@ func (d *LogzioAlertsRouter) Send(ctx context.Context, key models.AlertRuleKey,
return
}

logger.Info("Sending alerts to external url", "url", d.url, "headers", headers, "payload", body)
logger.Debug("Sending alerts to external url", "url", d.url, "headers", headers, "payload", body)
err = sendAlert(logger, ctx, d.client, d.url, payload, headers)
if err != nil {
logger.Warn("Error from sending alerts to notify", "err", err)