diff --git a/pkg/headtracker/head_tracker.go b/pkg/headtracker/head_tracker.go new file mode 100644 index 0000000000..e12f5c7eab --- /dev/null +++ b/pkg/headtracker/head_tracker.go @@ -0,0 +1,357 @@ +package headtracker + +import ( + "context" + "fmt" + "math/big" + "sync" + "time" + + "github.com/pkg/errors" + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/promauto" + htrktypes "github.com/smartcontractkit/chainlink-relay/pkg/headtracker/types" + "github.com/smartcontractkit/chainlink-relay/pkg/logger" + "github.com/smartcontractkit/chainlink-relay/pkg/types" + "github.com/smartcontractkit/chainlink-relay/pkg/utils" + "golang.org/x/exp/maps" +) + +var ( + promCurrentHead = promauto.NewGaugeVec(prometheus.GaugeOpts{ + Name: "head_tracker_current_head", + Help: "The highest seen head number", + }, []string{"evmChainID"}) + + promOldHead = promauto.NewCounterVec(prometheus.CounterOpts{ + Name: "head_tracker_very_old_head", + Help: "Counter is incremented every time we get a head that is much lower than the highest seen head ('much lower' is defined as a block that is EVM.FinalityDepth or greater below the highest seen head)", + }, []string{"evmChainID"}) +) + +// HeadsBufferSize - The buffer is used when heads sampling is disabled, to ensure the callback is run for every head +const HeadsBufferSize = 10 + +type HeadTracker[ + HTH htrktypes.Head[BLOCK_HASH, ID], + S types.Subscription, + ID types.ID, + BLOCK_HASH types.Hashable, +] struct { + log logger.Logger + headBroadcaster types.HeadBroadcaster[HTH, BLOCK_HASH] + headSaver types.HeadSaver[HTH, BLOCK_HASH] + mailMon *utils.MailboxMonitor + client htrktypes.Client[HTH, S, ID, BLOCK_HASH] + chainID ID + config htrktypes.Config + + backfillMB *utils.Mailbox[HTH] + broadcastMB *utils.Mailbox[HTH] + headListener types.HeadListener[HTH, BLOCK_HASH] + chStop utils.StopChan + wgDone sync.WaitGroup + utils.StartStopOnce + getNilHead func() HTH +} + +// NewHeadTracker instantiates a new HeadTracker using HeadSaver to persist new block numbers. +func NewHeadTracker[ + HTH htrktypes.Head[BLOCK_HASH, ID], + S types.Subscription, + ID types.ID, + BLOCK_HASH types.Hashable, +]( + lggr logger.Logger, + client htrktypes.Client[HTH, S, ID, BLOCK_HASH], + config htrktypes.Config, + headBroadcaster types.HeadBroadcaster[HTH, BLOCK_HASH], + headSaver types.HeadSaver[HTH, BLOCK_HASH], + mailMon *utils.MailboxMonitor, + getNilHead func() HTH, +) types.HeadTracker[HTH, BLOCK_HASH] { + chStop := make(chan struct{}) + lggr = logger.Named(lggr, "HeadTracker") + return &HeadTracker[HTH, S, ID, BLOCK_HASH]{ + headBroadcaster: headBroadcaster, + client: client, + chainID: client.ConfiguredChainID(), + config: config, + log: lggr, + backfillMB: utils.NewSingleMailbox[HTH](), + broadcastMB: utils.NewMailbox[HTH](HeadsBufferSize), + chStop: chStop, + headListener: NewHeadListener[HTH, S, ID, BLOCK_HASH](lggr, client, config, chStop), + headSaver: headSaver, + mailMon: mailMon, + getNilHead: getNilHead, + } +} + +// Start starts HeadTracker service. 
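The constructor wires up two mailboxes with deliberately different semantics: backfillMB is created with NewSingleMailbox, while broadcastMB is buffered to HeadsBufferSize so that, when head sampling is disabled, every head still reaches the broadcaster. A rough sketch of the intended difference, assuming the single-slot mailbox keeps only the newest delivery (the int values are purely illustrative):

	single := utils.NewSingleMailbox[int]()            // backfillMB: capacity 1, a newer delivery evicts the older one
	buffered := utils.NewMailbox[int](HeadsBufferSize) // broadcastMB: queues up to HeadsBufferSize items

	for i := 1; i <= 3; i++ {
		single.Deliver(i)
		buffered.Deliver(i)
	}

	latest, _ := single.Retrieve()   // expected: 3, only the newest backfill target survives
	queued, _ := buffered.Retrieve() // a queued head; the remaining deliveries stay buffered for the broadcast loop
	_, _ = latest, queued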
+func (ht *HeadTracker[HTH, S, ID, BLOCK_HASH]) Start(ctx context.Context) error { + return ht.StartOnce("HeadTracker", func() error { + ht.log.Debugw("Starting HeadTracker", "chainID", ht.chainID) + latestChain, err := ht.headSaver.Load(ctx) + if err != nil { + return err + } + if latestChain.IsValid() { + ht.log.Debugw( + fmt.Sprintf("HeadTracker: Tracking logs from last block %v with hash %s", htrktypes.FriendlyInt64(latestChain.BlockNumber()), latestChain.BlockHash()), + "blockNumber", latestChain.BlockNumber(), + "blockHash", latestChain.BlockHash(), + ) + } + + // NOTE: Always try to start the head tracker off with whatever the + // latest head is, without waiting for the subscription to send us one. + // + // In some cases the subscription will send us the most recent head + // anyway when we connect (but we should not rely on this because it is + // not specced). If it happens this is fine, and the head will be + // ignored as a duplicate. + initialHead, err := ht.getInitialHead(ctx) + if err != nil { + if errors.Is(err, ctx.Err()) { + return nil + } + ht.log.Errorw("Error getting initial head", "err", err) + } else if initialHead.IsValid() { + if err := ht.handleNewHead(ctx, initialHead); err != nil { + return errors.Wrap(err, "error handling initial head") + } + } else { + ht.log.Debug("Got nil initial head") + } + + ht.wgDone.Add(3) + go ht.headListener.ListenForNewHeads(ht.handleNewHead, ht.wgDone.Done) + go ht.backfillLoop() + go ht.broadcastLoop() + + ht.mailMon.Monitor(ht.broadcastMB, "HeadTracker", "Broadcast", ht.chainID.String()) + + return nil + }) +} + +// Close stops HeadTracker service. +func (ht *HeadTracker[HTH, S, ID, BLOCK_HASH]) Close() error { + return ht.StopOnce("HeadTracker", func() error { + close(ht.chStop) + ht.wgDone.Wait() + return ht.broadcastMB.Close() + }) +} + +func (ht *HeadTracker[HTH, S, ID, BLOCK_HASH]) Name() string { + return ht.log.Name() +} + +func (ht *HeadTracker[HTH, S, ID, BLOCK_HASH]) HealthReport() map[string]error { + report := map[string]error{ + ht.Name(): ht.StartStopOnce.Healthy(), + } + maps.Copy(report, ht.headListener.HealthReport()) + return report +} + +func (ht *HeadTracker[HTH, S, ID, BLOCK_HASH]) Backfill(ctx context.Context, headWithChain HTH, depth uint) (err error) { + if uint(headWithChain.ChainLength()) >= depth { + return nil + } + + baseHeight := headWithChain.BlockNumber() - int64(depth-1) + if baseHeight < 0 { + baseHeight = 0 + } + + return ht.backfill(ctx, headWithChain.EarliestHeadInChain(), baseHeight) +} + +func (ht *HeadTracker[HTH, S, ID, BLOCK_HASH]) LatestChain() HTH { + return ht.headSaver.LatestChain() +} + +func (ht *HeadTracker[HTH, S, ID, BLOCK_HASH]) getInitialHead(ctx context.Context) (HTH, error) { + head, err := ht.client.HeadByNumber(ctx, nil) + if err != nil { + return ht.getNilHead(), errors.Wrap(err, "failed to fetch initial head") + } + loggerFields := []interface{}{"head", head} + if head.IsValid() { + loggerFields = append(loggerFields, "blockNumber", head.BlockNumber(), "blockHash", head.BlockHash()) + } + ht.log.Debugw("Got initial head", loggerFields...) 
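Backfill returns immediately once the in-memory chain already spans the requested depth; otherwise it walks back from the earliest head in the chain down to baseHeight, clamped at zero. A worked example of that arithmetic, with purely illustrative numbers:

	func exampleBackfillBase() int64 {
		// Illustrative numbers only: the base-height clamp used by Backfill.
		const depth = uint(50)
		headNumber := int64(100)                  // block number of the latest head
		baseHeight := headNumber - int64(depth-1) // 100 - 49 = 51
		if baseHeight < 0 {
			baseHeight = 0
		}
		// Blocks 51..99 are then fetched (newest first) until the chain links up,
		// so head 100 plus its 49 ancestors gives a chain of depth heads.
		return baseHeight
	}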
+ return head, nil +} + +func (ht *HeadTracker[HTH, S, ID, BLOCK_HASH]) handleNewHead(ctx context.Context, head HTH) error { + prevHead := ht.headSaver.LatestChain() + + ht.log.Debugw(fmt.Sprintf("Received new head %v", htrktypes.FriendlyInt64(head.BlockNumber())), + "blockHeight", head.BlockNumber(), + "blockHash", head.BlockHash(), + "parentHeadHash", head.GetParentHash(), + ) + + err := ht.headSaver.Save(ctx, head) + if ctx.Err() != nil { + return nil + } else if err != nil { + return errors.Wrapf(err, "failed to save head: %#v", head) + } + + if !prevHead.IsValid() || head.BlockNumber() > prevHead.BlockNumber() { + promCurrentHead.WithLabelValues(ht.chainID.String()).Set(float64(head.BlockNumber())) + + headWithChain := ht.headSaver.Chain(head.BlockHash()) + if !headWithChain.IsValid() { + return errors.Errorf("HeadTracker#handleNewHighestHead headWithChain was unexpectedly nil") + } + ht.backfillMB.Deliver(headWithChain) + ht.broadcastMB.Deliver(headWithChain) + } else if head.BlockNumber() == prevHead.BlockNumber() { + if head.BlockHash() != prevHead.BlockHash() { + ht.log.Debugw("Got duplicate head", "blockNum", head.BlockNumber(), "head", head.BlockHash(), "prevHead", prevHead.BlockHash()) + } else { + ht.log.Debugw("Head already in the database", "head", head.BlockHash()) + } + } else { + ht.log.Debugw("Got out of order head", "blockNum", head.BlockNumber(), "head", head.BlockHash(), "prevHead", prevHead.BlockNumber()) + prevUnFinalizedHead := prevHead.BlockNumber() - int64(ht.config.FinalityDepth()) + if head.BlockNumber() < prevUnFinalizedHead { + promOldHead.WithLabelValues(ht.chainID.String()).Inc() + ht.log.Criticalf("Got very old block with number %d (highest seen was %d). This is a problem and either means a very deep re-org occurred, one of the RPC nodes has gotten far out of sync, or the chain went backwards in block numbers. 
This node may not function correctly without manual intervention.", head.BlockNumber(), prevHead.BlockNumber()) + ht.SvcErrBuffer.Append(errors.New("got very old block")) + } + } + return nil +} + +func (ht *HeadTracker[HTH, S, ID, BLOCK_HASH]) broadcastLoop() { + defer ht.wgDone.Done() + + samplingInterval := ht.config.HeadTrackerSamplingInterval() + if samplingInterval > 0 { + ht.log.Debugf("Head sampling is enabled - sampling interval is set to: %v", samplingInterval) + debounceHead := time.NewTicker(samplingInterval) + defer debounceHead.Stop() + for { + select { + case <-ht.chStop: + return + case <-debounceHead.C: + item := ht.broadcastMB.RetrieveLatestAndClear() + if !item.IsValid() { + continue + } + ht.headBroadcaster.BroadcastNewLongestChain(item) + } + } + } else { + ht.log.Info("Head sampling is disabled - callback will be called on every head") + for { + select { + case <-ht.chStop: + return + case <-ht.broadcastMB.Notify(): + for { + item, exists := ht.broadcastMB.Retrieve() + if !exists { + break + } + ht.headBroadcaster.BroadcastNewLongestChain(item) + } + } + } + } +} + +func (ht *HeadTracker[HTH, S, ID, BLOCK_HASH]) backfillLoop() { + defer ht.wgDone.Done() + + ctx, cancel := ht.chStop.NewCtx() + defer cancel() + + for { + select { + case <-ht.chStop: + return + case <-ht.backfillMB.Notify(): + for { + head, exists := ht.backfillMB.Retrieve() + if !exists { + break + } + { + err := ht.Backfill(ctx, head, uint(ht.config.FinalityDepth())) + if err != nil { + ht.log.Warnw("Unexpected error while backfilling heads", "err", err) + } else if ctx.Err() != nil { + break + } + } + } + } + } +} + +// backfill fetches all missing heads up until the base height +func (ht *HeadTracker[HTH, S, ID, BLOCK_HASH]) backfill(ctx context.Context, head types.Head[BLOCK_HASH], baseHeight int64) (err error) { + headNumberInt64 := head.BlockNumber() + if headNumberInt64 <= baseHeight { + return nil + } + mark := time.Now() + fetched := 0 + l := logger.With(ht.log, "blockNumber", headNumberInt64, + "n", headNumberInt64-baseHeight, + "fromBlockHeight", baseHeight, + "toBlockHeight", headNumberInt64-1) + l.Debug("Starting backfill") + defer func() { + if ctx.Err() != nil { + l.Warnw("Backfill context error", "err", ctx.Err()) + return + } + l.Debugw("Finished backfill", + "fetched", fetched, + "time", time.Since(mark), + "err", err) + }() + + for i := head.BlockNumber() - 1; i >= baseHeight; i-- { + // NOTE: Sequential requests here mean it's a potential performance bottleneck, be aware! 
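+ // Each pass of this loop walks one block further back: if the parent hash is
+ // already present in the saver we simply re-point `head` at it; otherwise block
+ // i (the current head's parent number) is fetched and persisted via
+ // fetchAndSaveHead, and the walk continues from the newly saved head.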
+ existingHead := ht.headSaver.Chain(head.GetParentHash()) + if existingHead.IsValid() { + head = existingHead + continue + } + head, err = ht.fetchAndSaveHead(ctx, i) + fetched++ + if ctx.Err() != nil { + ht.log.Debugw("context canceled, aborting backfill", "err", err, "ctx.Err", ctx.Err()) + break + } else if err != nil { + return errors.Wrap(err, "fetchAndSaveHead failed") + } + } + return +} + +func (ht *HeadTracker[HTH, S, ID, BLOCK_HASH]) fetchAndSaveHead(ctx context.Context, n int64) (HTH, error) { + ht.log.Debugw("Fetching head", "blockHeight", n) + head, err := ht.client.HeadByNumber(ctx, big.NewInt(n)) + if err != nil { + return ht.getNilHead(), err + } else if !head.IsValid() { + return ht.getNilHead(), errors.New("got nil head") + } + err = ht.headSaver.Save(ctx, head) + if err != nil { + return ht.getNilHead(), err + } + return head, nil +} diff --git a/pkg/headtracker/types/config.go b/pkg/headtracker/types/config.go index 520d79ca28..08d72be9ee 100644 --- a/pkg/headtracker/types/config.go +++ b/pkg/headtracker/types/config.go @@ -1,6 +1,9 @@ package types -import "time" +import ( + "fmt" + "time" +) type Config interface { BlockEmissionIdleWarningThreshold() time.Duration @@ -9,3 +12,7 @@ type Config interface { HeadTrackerMaxBufferSize() uint32 HeadTrackerSamplingInterval() time.Duration } + +func FriendlyInt64(n int64) string { + return fmt.Sprintf("#%[1]v (0x%[1]x)", n) +} diff --git a/pkg/logger/critical.go b/pkg/logger/critical.go new file mode 100644 index 0000000000..ed3b5849e6 --- /dev/null +++ b/pkg/logger/critical.go @@ -0,0 +1,7 @@ +package logger + +import "go.uber.org/zap" + +func (l *logger) Criticalf(format string, values ...interface{}) { + l.WithOptions(zap.AddCallerSkip(1)).DPanicf(format, values...) +} diff --git a/pkg/logger/logger.go b/pkg/logger/logger.go index ab8e66d36a..ee007113a9 100644 --- a/pkg/logger/logger.go +++ b/pkg/logger/logger.go @@ -27,6 +27,7 @@ type Logger interface { Errorf(format string, values ...interface{}) Panicf(format string, values ...interface{}) Fatalf(format string, values ...interface{}) + Criticalf(format string, values ...interface{}) Debugw(msg string, keysAndValues ...interface{}) Infow(msg string, keysAndValues ...interface{}) diff --git a/pkg/logger/logger_test.go b/pkg/logger/logger_test.go index 8b06f8c6c2..049e36de97 100644 --- a/pkg/logger/logger_test.go +++ b/pkg/logger/logger_test.go @@ -119,6 +119,10 @@ func (o *other) Named(name string) Logger { return &newLogger } +func (d *other) Criticalf(format string, values ...interface{}) { + d.WithOptions(zap.AddCallerSkip(1)).DPanicf(format, values...) +} + type different struct { *zap.SugaredLogger name string @@ -139,6 +143,10 @@ func (d *different) Named(name string) Logger { return &newLogger } +func (d *different) Criticalf(format string, values ...interface{}) { + d.WithOptions(zap.AddCallerSkip(1)).DPanicf(format, values...) +} + type mismatch struct { *zap.SugaredLogger name string @@ -152,6 +160,10 @@ func (m *mismatch) Name() string { return m.name } +func (d *mismatch) Criticalf(format string, values ...interface{}) { + d.WithOptions(zap.AddCallerSkip(1)).DPanicf(format, values...) 
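FriendlyInt64, added in pkg/headtracker/types/config.go above, renders a block number in both decimal and hexadecimal for the new log lines. A quick, self-contained illustration:

	package main

	import (
		"fmt"

		htrktypes "github.com/smartcontractkit/chainlink-relay/pkg/headtracker/types"
	)

	func main() {
		// "#%[1]v (0x%[1]x)" prints the same argument twice: decimal, then hex.
		fmt.Println(htrktypes.FriendlyInt64(1_000_000)) // #1000000 (0xf4240)
	}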
+} + type differentLogger interface { Name() string Named(string) Logger diff --git a/pkg/utils/error_buffer.go b/pkg/utils/error_buffer.go new file mode 100644 index 0000000000..9f4372188c --- /dev/null +++ b/pkg/utils/error_buffer.go @@ -0,0 +1,47 @@ +package utils + +import ( + "errors" + "sync" +) + +// ErrorBuffer uses joinedErrors interface to join multiple errors into a single error. +// This is useful to track the most recent N errors in a service and flush them as a single error. +type ErrorBuffer struct { + // buffer is a slice of errors + buffer []error + + // cap is the maximum number of errors that the buffer can hold. + // Exceeding the cap results in discarding the oldest error + cap int + + mu sync.RWMutex +} + +func (eb *ErrorBuffer) Flush() (err error) { + eb.mu.RLock() + defer eb.mu.RUnlock() + err = errors.Join(eb.buffer...) + eb.buffer = nil + return +} + +func (eb *ErrorBuffer) Append(incoming error) { + eb.mu.Lock() + defer eb.mu.Unlock() + + if len(eb.buffer) == eb.cap && eb.cap != 0 { + eb.buffer = append(eb.buffer[1:], incoming) + return + } + eb.buffer = append(eb.buffer, incoming) +} + +func (eb *ErrorBuffer) SetCap(cap int) { + eb.mu.Lock() + defer eb.mu.Unlock() + if len(eb.buffer) > cap { + eb.buffer = eb.buffer[len(eb.buffer)-cap:] + } + eb.cap = cap +} diff --git a/pkg/utils/mailbox_prom.go b/pkg/utils/mailbox_prom.go new file mode 100644 index 0000000000..30bb707a2b --- /dev/null +++ b/pkg/utils/mailbox_prom.go @@ -0,0 +1,91 @@ +package utils + +import ( + "context" + "strconv" + "strings" + "sync" + "time" + + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/promauto" +) + +var mailboxLoad = promauto.NewGaugeVec(prometheus.GaugeOpts{ + Name: "mailbox_load_percent", + Help: "Percent of mailbox capacity used", +}, + []string{"appID", "name", "capacity"}, +) + +const mailboxPromInterval = 5 * time.Second + +type MailboxMonitor struct { + StartStopOnce + appID string + + mailboxes sync.Map + stop func() + done chan struct{} +} + +func NewMailboxMonitor(appID string) *MailboxMonitor { + return &MailboxMonitor{appID: appID} +} + +func (m *MailboxMonitor) Name() string { return "MailboxMonitor" } + +func (m *MailboxMonitor) Start(context.Context) error { + return m.StartOnce("MailboxMonitor", func() error { + t := time.NewTicker(WithJitter(mailboxPromInterval)) + ctx, cancel := context.WithCancel(context.Background()) + m.stop = func() { + t.Stop() + cancel() + } + m.done = make(chan struct{}) + go m.monitorLoop(ctx, t.C) + return nil + }) +} + +func (m *MailboxMonitor) Close() error { + return m.StopOnce("MailboxMonitor", func() error { + m.stop() + <-m.done + return nil + }) +} + +func (m *MailboxMonitor) HealthReport() map[string]error { + return map[string]error{m.Name(): m.StartStopOnce.Healthy()} +} + +func (m *MailboxMonitor) monitorLoop(ctx context.Context, c <-chan time.Time) { + defer close(m.done) + for { + select { + case <-ctx.Done(): + return + case <-c: + m.mailboxes.Range(func(k, v any) bool { + name, mb := k.(string), v.(mailbox) + c, p := mb.load() + capacity := strconv.FormatUint(c, 10) + mailboxLoad.WithLabelValues(m.appID, name, capacity).Set(p) + return true + }) + } + } +} + +type mailbox interface { + load() (capacity uint64, percent float64) + onClose(func()) +} + +func (m *MailboxMonitor) Monitor(mb mailbox, name ...string) { + n := strings.Join(name, ".") + m.mailboxes.Store(n, mb) + mb.onClose(func() { m.mailboxes.Delete(n) }) +} diff --git a/pkg/utils/start_stop_once.go 
b/pkg/utils/start_stop_once.go index 68a4f75c85..ca0d89d54b 100644 --- a/pkg/utils/start_stop_once.go +++ b/pkg/utils/start_stop_once.go @@ -48,6 +48,9 @@ func (s startStopOnceState) String() string { type StartStopOnce struct { state atomic.Int32 sync.RWMutex // lock is held during startup/shutdown, RLock is held while executing functions dependent on a particular state + + // SvcErrBuffer is an ErrorBuffer that let service owners track critical errors happening in the service. + SvcErrBuffer ErrorBuffer } // StartOnce sets the state to Started diff --git a/pkg/utils/utils.go b/pkg/utils/utils.go index aed521c621..f30a4814da 100644 --- a/pkg/utils/utils.go +++ b/pkg/utils/utils.go @@ -5,6 +5,8 @@ import ( "math" mrand "math/rand" "time" + + "github.com/jpillora/backoff" ) // WithJitter adds +/- 10% to a duration @@ -21,6 +23,17 @@ func WithJitter(d time.Duration) time.Duration { return time.Duration(int(d) + jitter) } +// NewRedialBackoff is a standard backoff to use for redialling or reconnecting to +// unreachable network endpoints +func NewRedialBackoff() backoff.Backoff { + return backoff.Backoff{ + Min: 1 * time.Second, + Max: 15 * time.Second, + Jitter: true, + } + +} + // ContextFromChan creates a context that finishes when the provided channel // receives or is closed. // When channel closes, the ctx.Err() will always be context.Canceled
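The new ErrorBuffer backs SvcErrBuffer on StartStopOnce, letting a service such as HeadTracker record critical errors (for example the "got very old block" case above) and later flush them as a single joined error. A minimal sketch of the semantics, using illustrative error values:

	package main

	import (
		"errors"
		"fmt"

		"github.com/smartcontractkit/chainlink-relay/pkg/utils"
	)

	func main() {
		var eb utils.ErrorBuffer
		eb.SetCap(2) // keep at most the two most recent errors

		eb.Append(errors.New("first"))
		eb.Append(errors.New("second"))
		eb.Append(errors.New("third")) // cap reached: "first" is discarded

		err := eb.Flush() // errors.Join of "second" and "third"; the buffer is emptied
		fmt.Println(err)
	}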
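NewRedialBackoff wraps github.com/jpillora/backoff with the 1s–15s jittered range intended for reconnect loops. A hedged usage sketch; the redial call itself is a hypothetical placeholder:

	// Assumes backoff.Backoff's Duration/Reset methods from github.com/jpillora/backoff.
	b := utils.NewRedialBackoff()
	for {
		if err := redial(); err != nil { // redial is a hypothetical reconnect attempt
			time.Sleep(b.Duration()) // roughly 1s, 2s, 4s, ... capped at 15s, with jitter
			continue
		}
		b.Reset() // start from the minimum delay again after a successful connection
		break
	}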