From d3ebd759cde258de74952c3f18891010247209ff Mon Sep 17 00:00:00 2001 From: anonpenguin Date: Tue, 12 Aug 2025 22:43:23 +0300 Subject: [PATCH] Add exponential backoff for bootstrap peer reconnection Implements jitter and interval growth to improve robustness when connecting to bootstrap peers. Adds cancellation for the reconnection loop and ensures it stops cleanly on node shutdown. --- pkg/node/node.go | 151 +++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 140 insertions(+), 11 deletions(-) diff --git a/pkg/node/node.go b/pkg/node/node.go index c43b964..4d8d33c 100644 --- a/pkg/node/node.go +++ b/pkg/node/node.go @@ -4,6 +4,7 @@ import ( "context" "crypto/rand" "fmt" + mathrand "math/rand" "os" "path/filepath" "strings" @@ -39,6 +40,7 @@ type Node struct { // Peer discovery discoveryCancel context.CancelFunc + bootstrapCancel context.CancelFunc } // NewNode creates a new network node @@ -77,6 +79,68 @@ func (n *Node) startRQLite(ctx context.Context) error { return nil } +// hasBootstrapConnections checks if we're connected to any bootstrap peers +func (n *Node) hasBootstrapConnections() bool { + if n.host == nil || len(n.config.Discovery.BootstrapPeers) == 0 { + return false + } + + connectedPeers := n.host.Network().Peers() + if len(connectedPeers) == 0 { + return false + } + + // Parse bootstrap peer IDs + bootstrapPeerIDs := make(map[peer.ID]bool) + for _, bootstrapAddr := range n.config.Discovery.BootstrapPeers { + ma, err := multiaddr.NewMultiaddr(bootstrapAddr) + if err != nil { + continue + } + peerInfo, err := peer.AddrInfoFromP2pAddr(ma) + if err != nil { + continue + } + bootstrapPeerIDs[peerInfo.ID] = true + } + + // Check if any connected peer is a bootstrap peer + for _, peerID := range connectedPeers { + if bootstrapPeerIDs[peerID] { + return true + } + } + + return false +} + +// calculateNextBackoff calculates the next backoff interval with exponential growth +func calculateNextBackoff(current time.Duration) time.Duration { + // Multiply by 1.5 for gentler exponential growth + next := time.Duration(float64(current) * 1.5) + // Cap at 10 minutes + maxInterval := 10 * time.Minute + if next > maxInterval { + next = maxInterval + } + return next +} + +// addJitter adds random jitter to prevent thundering herd +func addJitter(interval time.Duration) time.Duration { + // Add ±20% jitter + jitterPercent := 0.2 + jitterRange := float64(interval) * jitterPercent + jitter := (mathrand.Float64() - 0.5) * 2 * jitterRange // -jitterRange to +jitterRange + + result := time.Duration(float64(interval) + jitter) + // Ensure we don't go below 1 second + if result < time.Second { + result = time.Second + } + return result +} + // connectToBootstrapPeer connects to a single bootstrap peer func (n *Node) connectToBootstrapPeer(ctx context.Context, addr string) error { ma, err := multiaddr.NewMultiaddr(addr) @@ -198,22 +262,82 @@ func (n *Node) startLibP2P() error { // Don't fail - continue without bootstrap connections } - // Background reconnect loop: keep trying to connect to bootstrap peers for a short window - // This helps when nodes are started slightly out-of-order in dev. + // Start exponential backoff reconnection for bootstrap peers if len(n.config.Discovery.BootstrapPeers) > 0 { + bootstrapCtx, cancel := context.WithCancel(context.Background()) + n.bootstrapCancel = cancel + go func() { - for i := 0; i < 12; i++ { // ~60s total - if n.host == nil { + interval := 5 * time.Second + consecutiveFailures := 0 + + n.logger.ComponentInfo(logging.ComponentNode, "Starting bootstrap peer reconnection with exponential backoff", + zap.Duration("initial_interval", interval), + zap.Duration("max_interval", 10*time.Minute)) + + for { + select { + case <-bootstrapCtx.Done(): + n.logger.ComponentDebug(logging.ComponentNode, "Bootstrap reconnection loop stopped") return + default: } - // If we already have peers, stop retrying - if len(n.host.Network().Peers()) > 0 { - return + + // Check if we need to attempt connection + if !n.hasBootstrapConnections() { + n.logger.ComponentDebug(logging.ComponentNode, "Attempting bootstrap peer connection", + zap.Duration("current_interval", interval), + zap.Int("consecutive_failures", consecutiveFailures)) + + if err := n.connectToBootstrapPeers(context.Background()); err != nil { + consecutiveFailures++ + // Calculate next backoff interval + jitteredInterval := addJitter(interval) + n.logger.ComponentDebug(logging.ComponentNode, "Bootstrap connection failed, backing off", + zap.Error(err), + zap.Duration("next_attempt_in", jitteredInterval), + zap.Int("consecutive_failures", consecutiveFailures)) + + // Sleep with jitter + select { + case <-bootstrapCtx.Done(): + return + case <-time.After(jitteredInterval): + } + + // Increase interval for next attempt + interval = calculateNextBackoff(interval) + + // Log interval increases occasionally to show progress + if consecutiveFailures%5 == 0 { + n.logger.ComponentInfo(logging.ComponentNode, "Bootstrap connection still failing", + zap.Int("consecutive_failures", consecutiveFailures), + zap.Duration("current_interval", interval)) + } + } else { + // Success! Reset interval and counters + if consecutiveFailures > 0 { + n.logger.ComponentInfo(logging.ComponentNode, "Successfully connected to bootstrap peers", + zap.Int("failures_overcome", consecutiveFailures)) + } + interval = 5 * time.Second + consecutiveFailures = 0 + + // Wait 30 seconds before checking connection again + select { + case <-bootstrapCtx.Done(): + return + case <-time.After(30 * time.Second): + } + } + } else { + // We have bootstrap connections, just wait and check periodically + select { + case <-bootstrapCtx.Done(): + return + case <-time.After(30 * time.Second): + } } - if err := n.connectToBootstrapPeers(context.Background()); err == nil { - n.logger.ComponentDebug(logging.ComponentNode, "Bootstrap reconnect attempt completed") - } - time.Sleep(5 * time.Second) } }() } @@ -473,6 +597,11 @@ func (n *Node) stopPeerDiscovery() { func (n *Node) Stop() error { n.logger.ComponentInfo(logging.ComponentNode, "Stopping network node") + // Stop bootstrap reconnection loop + if n.bootstrapCancel != nil { + n.bootstrapCancel() + } + // Stop peer discovery n.stopPeerDiscovery()