Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[RIVER-1858] Node and corrupt stream metrics #1857

Open
wants to merge 21 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 7 commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
e178d8c
Early draft checkin.
clemire Dec 17, 2024
96c3c54
Merge branch 'main' into crystal/archiver-node-metrics
clemire Dec 18, 2024
7c0c573
Mark a stream as corrupt if unavailable for any reason after the grac…
clemire Dec 18, 2024
e8516e5
refactor isCorrupt to have a single path for determining the corrupti…
clemire Dec 18, 2024
deb7a83
fix comment.
clemire Dec 18, 2024
b71d8c8
Revert debug handler stub.
clemire Dec 18, 2024
0292e69
Merge branch 'main' into crystal/archiver-node-metrics
clemire Dec 18, 2024
b621c98
isBehind
clemire Dec 18, 2024
99b8f2f
Convert to strategy of considering a stream corrupt after a maximum n…
clemire Dec 18, 2024
b36b386
Move placement of clearing consecutive failures. Add some logging.
clemire Dec 18, 2024
8207bd1
Reset consecutive update failures count if the stream is up to date.
clemire Dec 18, 2024
6ce7274
Re-establish prod settings.
clemire Dec 18, 2024
91794cc
Do not remove streams from rotation when marking corrupt, allowing th…
clemire Dec 19, 2024
e3d27d7
Revert change.
clemire Dec 19, 2024
af61f32
Comment.
clemire Dec 19, 2024
fa4e5b7
Lint fix - remove unused struct.
clemire Dec 19, 2024
103dade
Merge branch 'main' into crystal/archiver-node-metrics
clemire Dec 19, 2024
4fe83ff
Merge branch 'main' into crystal/archiver-node-metrics
clemire Dec 19, 2024
520a428
Add a replicated test case to validate retry logic with unavailable n…
clemire Dec 21, 2024
cf088c7
Comment for the test case.
clemire Dec 21, 2024
ae458cf
Remove a stream from the task queue if it is marked corrupt from fail…
clemire Dec 21, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions core/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -446,6 +446,7 @@ type DebugEndpointsConfig struct {
Stacks bool
StacksMaxSizeKb int
TxPool bool
Archive bool

// Make storage statistics available via debug endpoints. This may involve running queries
// on the underlying database.
Expand Down
109 changes: 97 additions & 12 deletions core/node/rpc/archiver.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ package rpc

import (
"context"
"strings"
"sync"
"sync/atomic"
"time"
Expand All @@ -23,11 +24,23 @@ import (
"github.com/river-build/river/core/node/storage"
)

// nodeUpdateGracePeriod describes the maximum delay we would expect to see
// when a stream's miniblocks updates in the registry before we consider a
// node behind if it does not have the latest miniblock.
// Two blocks plus change
var nodeUpdateGracePeriod = 5 * time.Second
var (
	// nodeUpdateGracePeriod describes the maximum delay we would expect to see
	// between a stream's miniblocks updating in the registry and the hosting
	// node serving the latest miniblock. Past this delay we consider the node
	// behind and attempt to advance to the next peer to retrieve the miniblock.
	// Two blocks plus change.
	nodeUpdateGracePeriod = 5 * time.Second

	// staleStreamGracePeriod is the grace period we give a stream that cannot
	// be updated to the current contract state before we consider it corrupt.
	// By the time a stream is this out of date with the contract state, we should have
	// cycled through every node that hosts the replicated stream.
	staleStreamGracePeriod = 100 * time.Second

	// maxBlocksBehind bounds how far local storage may lag the contract state:
	// if the contract state continues to advance but we are unable to
	// download miniblocks past a certain point, consider the stream corrupt.
	maxBlocksBehind = 50
)

type contractState struct {
// Everything in the registry state is protected by this mutex.
Expand Down Expand Up @@ -62,7 +75,7 @@ type ArchiveStream struct {
registryState contractState
numBlocksInDb atomic.Int64 // -1 means not loaded

stale atomic.Bool
corrupt atomic.Bool

// Mutex is used so only one archive operation is performed at a time.
mu sync.Mutex
Expand Down Expand Up @@ -219,6 +232,30 @@ func (a *Archiver) setupStatisticsMetrics(factory infra.MetricsFactory) {
)
}

// getCorruptStreams iterates over all streams in the in-memory cache and
// collects the streams currently marked corrupt, keyed by stream id.
//
// Note: sync.Map.Range is safe for concurrent use, but it does not provide a
// consistent snapshot of the cache at any single point in time. For the
// purposes of generating a periodic report of corrupt streams, this is good
// enough.
func (a *Archiver) getCorruptStreams(ctx context.Context) map[StreamId]*ArchiveStream {
	corruptStreams := make(map[StreamId]*ArchiveStream)

	a.streams.Range(
		func(key, value any) bool {
			stream, ok := value.(*ArchiveStream)
			if !ok {
				// Should never happen; indicates the cache was populated with
				// an unexpected type somewhere else.
				dlog.FromCtx(ctx).
					Error("Unexpected value stored in stream cache (not an ArchiveStream)", "value", value)
				return true
			}
			if stream.corrupt.Load() {
				corruptStreams[stream.streamId] = stream
			}
			return true
		},
	)

	return corruptStreams
}

func (a *Archiver) addNewStream(
ctx context.Context,
streamId StreamId,
Expand All @@ -238,12 +275,11 @@ func (a *Archiver) addNewStream(
a.streamsExamined.Add(1)
}

// Consider this node behind for this stream and advance the node pointer
clemire marked this conversation as resolved.
Show resolved Hide resolved
func (a *Archiver) onNodeBehind(
stream *ArchiveStream,
nodeAddr common.Address,
) {
// Mark the stream as stale and advance the node pointer.
stream.stale.Store(true)
// We now consider this node behind for this stream. Let's advance it.
if a.nodeAdvances != nil {
nodeAddress := prometheus.Labels{"node_address": nodeAddr.String()}
Expand All @@ -253,6 +289,15 @@ func (a *Archiver) onNodeBehind(
stream.nodes.AdvanceStickyPeer(nodeAddr)
}

// isCorrupt determines whether we consider a stream corrupt. A stream is
// considered corrupt if either:
//   - the stream's contract state last advanced more than
//     staleStreamGracePeriod ago, or
//   - local storage has fallen more than maxBlocksBehind miniblocks behind
//     the contract state.
//
// By the time either condition triggers, we should have cycled through every
// replica that hosts the stream without being able to advance it locally.
//
// NOTE(review): callers invoke this only on fetch failures, but the first
// condition looks only at the age of the last contract update — a stream that
// is idle on-chain for longer than the grace period and then hits a transient
// fetch failure would be flagged immediately. Confirm this is intended.
//
// lastContractUpdate is the instant of the most recent contract-state update
// for the stream (not a duration since that update).
func isCorrupt(mbsInContract int64, mbsInDb int64, lastContractUpdate time.Time) bool {
	return time.Since(lastContractUpdate) > staleStreamGracePeriod ||
		mbsInContract-mbsInDb > int64(maxBlocksBehind)
}

// ArchiveStream attempts to add all new miniblocks seen, according to the registry contract,
// since the last time the stream was archived into storage. It creates a new stream for
// streams that have not yet been seen.
Expand Down Expand Up @@ -333,16 +378,36 @@ func (a *Archiver) ArchiveStream(ctx context.Context, stream *ArchiveStream) err
err,
"streamId",
stream.streamId,
"node",
nodeAddr.Hex(),
)
if a.nodeAdvances != nil {
a.nodeAdvances.With(prometheus.Labels{"node_address": nodeAddr.String()}).Inc()
}
stream.nodes.AdvanceStickyPeer(nodeAddr)

if isCorrupt(mbsInContract, mbsInDb, contractMbsUpdated) {
// Mark this stream as corrupt
stream.corrupt.Store(true)
} else {
// We remove the stream from the rotation when it passes the grace period for stale
// streams. Keeping it here gives us a chance to fetch the stream if a node is booting
// or otherwise unavailable for an intermittent period, or fetch from another node if
// only a subset of nodes are unavailable.
time.AfterFunc(5*time.Second, func() {
a.tasks <- stream.streamId
})
}

return err
}

if (err != nil && AsRiverError(err).Code == Err_NOT_FOUND) || resp.Msg == nil || len(resp.Msg.Miniblocks) == 0 {
if time.Since(contractMbsUpdated) > nodeUpdateGracePeriod {
if isCorrupt(mbsInContract, mbsInDb, contractMbsUpdated) {
stream.corrupt.Store(true)
// Do not re-insert this stream back into the task queue, it is now considered un-updatable.
return nil
} else if time.Since(contractMbsUpdated) > nodeUpdateGracePeriod {
a.onNodeBehind(
stream,
nodeAddr,
Expand Down Expand Up @@ -403,14 +468,34 @@ func (a *Archiver) ArchiveStream(ctx context.Context, stream *ArchiveStream) err
a.miniblocksProcessed.Add(uint64(len(serialized)))
}

// All blocks processed, mark stream as current
stream.stale.Store(false)

return nil
}

func (a *Archiver) emitPeriodicCorruptStreamReport(ctx context.Context) {
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

the debug endpoints will not be available until Kerem gets back in the office and sets it up in AWS, so for now let's periodically print a report to logs

ticker := time.NewTicker(15 * time.Minute)
defer ticker.Stop()

for {
select {
case <-ctx.Done():
return
case <-ticker.C:
corruptStreams := a.getCorruptStreams(ctx)

var builder strings.Builder
for _, as := range corruptStreams {
builder.WriteString(as.streamId.String())
builder.WriteString("\n")
}
dlog.FromCtx(ctx).
Info("Corrupt streams report", "total", len(corruptStreams), "streams", builder.String())
}
}
}

func (a *Archiver) Start(ctx context.Context, once bool, metrics infra.MetricsFactory, exitSignal chan<- error) {
defer a.startedWG.Done()
go a.emitPeriodicCorruptStreamReport(ctx)
err := a.startImpl(ctx, once, metrics)
if err != nil {
exitSignal <- err
Expand Down
Loading