From f0130ac5eae8cbe7c8e0733e6850a713d05948ee Mon Sep 17 00:00:00 2001
From: Yasmin T
Date: Sun, 5 Jan 2025 13:19:36 +0200
Subject: [PATCH] DEV-47164 - Add alerting observability in logs (#68)

We need more log data to be able to investigate alerting issues. The main
missing piece is the ability to correlate all the logs of a single
evaluation, which requires the rule UID and the evaluation time. Also
removed some excessive debug logs and adjusted log levels in a few places.
---
 pkg/services/ngalert/api/alerting_logzio.go   |  9 +++++----
 pkg/services/ngalert/ngalert.go               |  1 +
 pkg/services/ngalert/schedule/schedule.go     | 16 ++++++++--------
 pkg/services/ngalert/sender/router_logzio.go  |  3 +--
 4 files changed, 15 insertions(+), 14 deletions(-)

diff --git a/pkg/services/ngalert/api/alerting_logzio.go b/pkg/services/ngalert/api/alerting_logzio.go
index 33fb7155c0caf..a42982454f280 100644
--- a/pkg/services/ngalert/api/alerting_logzio.go
+++ b/pkg/services/ngalert/api/alerting_logzio.go
@@ -76,19 +76,20 @@ func (srv *LogzioAlertingService) addQuerySourceHeader(c *contextmodel.ReqContex
 }
 
 func (srv *LogzioAlertingService) RouteSendAlertNotifications(c *contextmodel.ReqContext, sendNotificationsRequest apimodels.AlertSendNotificationsRequest) response.Response {
-	c.Logger.Info("Sending alerts to local notifier", "count", len(sendNotificationsRequest.Alerts.PostableAlerts))
+	logger := c.Logger.New(sendNotificationsRequest.AlertRuleKey.LogContext()...).FromContext(c.Req.Context())
+	logger.Info("Sending alerts to local notifier", "count", len(sendNotificationsRequest.Alerts.PostableAlerts))
 	n, err := srv.MultiOrgAlertmanager.AlertmanagerFor(sendNotificationsRequest.AlertRuleKey.OrgID)
 	if err == nil {
 		if err := n.PutAlerts(c.Req.Context(), sendNotificationsRequest.Alerts); err != nil {
-			c.Logger.Error("Failed to put alerts in the local notifier", "count", len(sendNotificationsRequest.Alerts.PostableAlerts), "error", err)
+			logger.Error("Failed to put alerts in the local notifier", "count", len(sendNotificationsRequest.Alerts.PostableAlerts), "error", err)
 		} else {
 			return response.Success("Put alerts was successful")
 		}
 	} else {
 		if errors.Is(err, notifier.ErrNoAlertmanagerForOrg) {
-			c.Logger.Debug("Local notifier was not found")
+			logger.Debug("Local notifier was not found")
 		} else {
-			c.Logger.Error("Local notifier is not available", "error", err)
+			logger.Error("Local notifier is not available", "error", err)
 		}
 	}
 
diff --git a/pkg/services/ngalert/ngalert.go b/pkg/services/ngalert/ngalert.go
index 91f9b5ad36a2d..0748877b852d6 100644
--- a/pkg/services/ngalert/ngalert.go
+++ b/pkg/services/ngalert/ngalert.go
@@ -271,6 +271,7 @@ func (ng *AlertNG) init() error {
 
 	// LOGZ.IO GRAFANA CHANGE :: DEV-43744 Add logzio notification route
 	var alertsSender schedule.AlertsSender
+	ng.Log.Debug("Unified Alerting scheduled evaluation config", "scheduled_evaluation_enabled", ng.Cfg.UnifiedAlerting.ScheduledEvalEnabled)
 	if ng.Cfg.UnifiedAlerting.ScheduledEvalEnabled {
 		alertsSender = alertsRouter
 	} else {
diff --git a/pkg/services/ngalert/schedule/schedule.go b/pkg/services/ngalert/schedule/schedule.go
index 0df8715b8a0b8..94659b2cef778 100644
--- a/pkg/services/ngalert/schedule/schedule.go
+++ b/pkg/services/ngalert/schedule/schedule.go
@@ -301,9 +301,8 @@ func (sch *schedule) processTick(ctx context.Context, dispatcherGroup *errgroup.
 					folderTitle: folderTitle,
 				}})
 			}
-		} else {
-			sch.log.Debug("Scheduled evaluation disabled, not adding alerts to run")
 		}
+		// LOGZ.IO GRAFANA CHANGE :: End
 
 		if _, isUpdated := updated[key]; isUpdated && !isReadyToRun {
 			// if we do not need to eval the rule, check the whether rule was just updated and if it was, notify evaluation routine about that
@@ -363,7 +362,7 @@ func (sch *schedule) processTick(ctx context.Context, dispatcherGroup *errgroup.
 
 // LOGZ.IO GRAFANA CHANGE :: DEV-43744 Add logzio external evaluation
 func (sch *schedule) RunRuleEvaluation(ctx context.Context, evalReq ngmodels.ExternalAlertEvaluationRequest) error {
-	logger := sch.log.FromContext(ctx)
+	logger := sch.log.New("eval_time", evalReq.EvalTime, "rule_title", evalReq.AlertRule.Title, "rule_uid", evalReq.AlertRule.UID, "org_id", evalReq.AlertRule.OrgID).FromContext(ctx)
 	alertKey := ngmodels.AlertRuleKey{
 		OrgID: evalReq.AlertRule.OrgID,
 		UID:   evalReq.AlertRule.UID,
@@ -385,7 +384,7 @@ func (sch *schedule) RunRuleEvaluation(ctx context.Context, evalReq ngmodels.Ext
 				return fmt.Errorf("evaluation was not sent for alert key %s", alertKey)
 			}
 			if dropped != nil {
-				logger.Warn("RunRuleEvaluation: got dropped eval", "dropped", dropped, "rule_uid", alertKey.UID, "org_id", alertKey.OrgID)
+				logger.Warn("RunRuleEvaluation: got dropped eval", "dropped_eval_time", dropped.scheduledAt)
 			}
 		}
 	} else {
@@ -428,7 +427,7 @@ func (sch *schedule) ruleRoutine(grafanaCtx context.Context, key ngmodels.AlertR
 	}
 
 	evaluate := func(ctx context.Context, f fingerprint, attempt int64, e *evaluation, span trace.Span, retry bool) error {
-		logger := logger.New("version", e.rule.Version, "fingerprint", f, "attempt", attempt, "now", e.scheduledAt).FromContext(ctx)
+		logger := logger.New("version", e.rule.Version, "fingerprint", f, "attempt", attempt, "eval_time", e.scheduledAt).FromContext(ctx) // LOGZ.IO GRAFANA CHANGE :: DEV-47164: Add observability to alerting
 		start := sch.clock.Now()
 		// LOGZ.IO GRAFANA CHANGE :: DEV-43889 - Add headers for logzio datasources support
@@ -517,6 +516,7 @@ func (sch *schedule) ruleRoutine(grafanaCtx context.Context, key ngmodels.AlertR
 				attribute.Int64("alerts_to_send", int64(len(alerts.PostableAlerts))),
 			))
 			if len(alerts.PostableAlerts) > 0 {
+				logger.Info("Sending postable alerts", "alerts", len(alerts.PostableAlerts)) // LOGZ.IO GRAFANA CHANGE :: DEV-47164: Add observability to alerting
 				sch.alertsSender.Send(ctx, key, alerts)
 			}
 			sendDuration.Observe(sch.clock.Now().Sub(start).Seconds())
@@ -593,7 +593,7 @@ func (sch *schedule) ruleRoutine(grafanaCtx context.Context, key ngmodels.AlertR
 			if tracingCtx.Err() != nil {
 				span.SetStatus(codes.Error, "rule evaluation cancelled")
 				span.End()
-				logger.Error("Skip evaluation and updating the state because the context has been cancelled", "version", ctx.rule.Version, "fingerprint", f, "attempt", attempt, "now", ctx.scheduledAt)
+				logger.Error("Skip evaluation and updating the state because the context has been cancelled", "version", ctx.rule.Version, "fingerprint", f, "attempt", attempt, "eval_time", ctx.scheduledAt)
 				return
 			}
 
@@ -606,10 +606,10 @@ func (sch *schedule) ruleRoutine(grafanaCtx context.Context, key ngmodels.AlertR
 				return
 			}
 
-			logger.Error("Failed to evaluate rule", "version", ctx.rule.Version, "fingerprint", f, "attempt", attempt, "now", ctx.scheduledAt, "error", err)
+			logger.Error("Failed to evaluate rule", "version", ctx.rule.Version, "fingerprint", f, "attempt", attempt, "eval_time", ctx.scheduledAt, "error", err)
 			select {
 			case <-tracingCtx.Done():
logger.Error("Context has been cancelled while backing off", "version", ctx.rule.Version, "fingerprint", f, "attempt", attempt, "now", ctx.scheduledAt) + logger.Error("Context has been cancelled while backing off", "version", ctx.rule.Version, "fingerprint", f, "attempt", attempt, "eval_time", ctx.scheduledAt) return case <-time.After(retryDelay): continue diff --git a/pkg/services/ngalert/sender/router_logzio.go b/pkg/services/ngalert/sender/router_logzio.go index fc2eff2b61f68..96ef35d325e28 100644 --- a/pkg/services/ngalert/sender/router_logzio.go +++ b/pkg/services/ngalert/sender/router_logzio.go @@ -37,7 +37,6 @@ func NewLogzioAlertsRouter(alertsRouteUrl string) (*LogzioAlertsRouter, error) { func (d *LogzioAlertsRouter) Send(ctx context.Context, key models.AlertRuleKey, alerts definitions.PostableAlerts) { logger := d.logger.New(key.LogContext()...) - logger.Debug("Sending alerts on logzio sender") if len(alerts.PostableAlerts) == 0 { logger.Info("No alerts to notify about") return @@ -55,7 +54,7 @@ func (d *LogzioAlertsRouter) Send(ctx context.Context, key models.AlertRuleKey, return } - logger.Info("Sending alerts to external url", "url", d.url, "headers", headers, "payload", body) + logger.Debug("Sending alerts to external url", "url", d.url, "headers", headers, "payload", body) err = sendAlert(logger, ctx, d.client, d.url, payload, headers) if err != nil { logger.Warn("Error from sending alerts to notify", "err", err)