mirror of
https://github.com/DeBrosOfficial/network.git
synced 2025-10-06 10:19:07 +00:00
Add exponential backoff for bootstrap peer reconnection
Implements jitter and interval growth to improve robustness when connecting to bootstrap peers. Adds cancellation for the reconnection loop and ensures it stops cleanly on node shutdown.
This commit is contained in:
parent
135711bb97
commit
d3ebd759cd
151
pkg/node/node.go
151
pkg/node/node.go
@ -4,6 +4,7 @@ import (
|
|||||||
"context"
|
"context"
|
||||||
"crypto/rand"
|
"crypto/rand"
|
||||||
"fmt"
|
"fmt"
|
||||||
|
mathrand "math/rand"
|
||||||
"os"
|
"os"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
"strings"
|
"strings"
|
||||||
@ -39,6 +40,7 @@ type Node struct {
|
|||||||
|
|
||||||
// Peer discovery
|
// Peer discovery
|
||||||
discoveryCancel context.CancelFunc
|
discoveryCancel context.CancelFunc
|
||||||
|
bootstrapCancel context.CancelFunc
|
||||||
}
|
}
|
||||||
|
|
||||||
// NewNode creates a new network node
|
// NewNode creates a new network node
|
||||||
@ -77,6 +79,68 @@ func (n *Node) startRQLite(ctx context.Context) error {
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// hasBootstrapConnections checks if we're connected to any bootstrap peers
|
||||||
|
func (n *Node) hasBootstrapConnections() bool {
|
||||||
|
if n.host == nil || len(n.config.Discovery.BootstrapPeers) == 0 {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
connectedPeers := n.host.Network().Peers()
|
||||||
|
if len(connectedPeers) == 0 {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
// Parse bootstrap peer IDs
|
||||||
|
bootstrapPeerIDs := make(map[peer.ID]bool)
|
||||||
|
for _, bootstrapAddr := range n.config.Discovery.BootstrapPeers {
|
||||||
|
ma, err := multiaddr.NewMultiaddr(bootstrapAddr)
|
||||||
|
if err != nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
peerInfo, err := peer.AddrInfoFromP2pAddr(ma)
|
||||||
|
if err != nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
bootstrapPeerIDs[peerInfo.ID] = true
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check if any connected peer is a bootstrap peer
|
||||||
|
for _, peerID := range connectedPeers {
|
||||||
|
if bootstrapPeerIDs[peerID] {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
// calculateNextBackoff returns the interval to use after a failed attempt:
// the current interval grown by a factor of 1.5 and clamped so it never
// exceeds 10 minutes.
func calculateNextBackoff(current time.Duration) time.Duration {
	const ceiling = 10 * time.Minute

	// A 1.5x factor grows more gently than the usual doubling.
	grown := time.Duration(float64(current) * 1.5)
	if grown <= ceiling {
		return grown
	}
	return ceiling
}
|
||||||
|
|
||||||
|
// addJitter randomises interval by up to 20% in either direction so that
// many nodes retrying at once do not all fire at the same moment. The
// returned duration is never shorter than one second.
func addJitter(interval time.Duration) time.Duration {
	const spread = 0.2 // fraction of the interval used as the jitter half-width

	base := float64(interval)
	halfWidth := base * spread

	// Float64 yields [0, 1); recentre and scale it to [-halfWidth, +halfWidth).
	offset := (mathrand.Float64() - 0.5) * 2 * halfWidth

	jittered := time.Duration(base + offset)
	if jittered < time.Second {
		return time.Second
	}
	return jittered
}
|
||||||
|
|
||||||
// connectToBootstrapPeer connects to a single bootstrap peer
|
// connectToBootstrapPeer connects to a single bootstrap peer
|
||||||
func (n *Node) connectToBootstrapPeer(ctx context.Context, addr string) error {
|
func (n *Node) connectToBootstrapPeer(ctx context.Context, addr string) error {
|
||||||
ma, err := multiaddr.NewMultiaddr(addr)
|
ma, err := multiaddr.NewMultiaddr(addr)
|
||||||
@ -198,22 +262,82 @@ func (n *Node) startLibP2P() error {
|
|||||||
// Don't fail - continue without bootstrap connections
|
// Don't fail - continue without bootstrap connections
|
||||||
}
|
}
|
||||||
|
|
||||||
// Background reconnect loop: keep trying to connect to bootstrap peers for a short window
|
// Start exponential backoff reconnection for bootstrap peers
|
||||||
// This helps when nodes are started slightly out-of-order in dev.
|
|
||||||
if len(n.config.Discovery.BootstrapPeers) > 0 {
|
if len(n.config.Discovery.BootstrapPeers) > 0 {
|
||||||
|
bootstrapCtx, cancel := context.WithCancel(context.Background())
|
||||||
|
n.bootstrapCancel = cancel
|
||||||
|
|
||||||
go func() {
|
go func() {
|
||||||
for i := 0; i < 12; i++ { // ~60s total
|
interval := 5 * time.Second
|
||||||
if n.host == nil {
|
consecutiveFailures := 0
|
||||||
|
|
||||||
|
n.logger.ComponentInfo(logging.ComponentNode, "Starting bootstrap peer reconnection with exponential backoff",
|
||||||
|
zap.Duration("initial_interval", interval),
|
||||||
|
zap.Duration("max_interval", 10*time.Minute))
|
||||||
|
|
||||||
|
for {
|
||||||
|
select {
|
||||||
|
case <-bootstrapCtx.Done():
|
||||||
|
n.logger.ComponentDebug(logging.ComponentNode, "Bootstrap reconnection loop stopped")
|
||||||
return
|
return
|
||||||
|
default:
|
||||||
}
|
}
|
||||||
// If we already have peers, stop retrying
|
|
||||||
if len(n.host.Network().Peers()) > 0 {
|
// Check if we need to attempt connection
|
||||||
return
|
if !n.hasBootstrapConnections() {
|
||||||
|
n.logger.ComponentDebug(logging.ComponentNode, "Attempting bootstrap peer connection",
|
||||||
|
zap.Duration("current_interval", interval),
|
||||||
|
zap.Int("consecutive_failures", consecutiveFailures))
|
||||||
|
|
||||||
|
if err := n.connectToBootstrapPeers(context.Background()); err != nil {
|
||||||
|
consecutiveFailures++
|
||||||
|
// Calculate next backoff interval
|
||||||
|
jitteredInterval := addJitter(interval)
|
||||||
|
n.logger.ComponentDebug(logging.ComponentNode, "Bootstrap connection failed, backing off",
|
||||||
|
zap.Error(err),
|
||||||
|
zap.Duration("next_attempt_in", jitteredInterval),
|
||||||
|
zap.Int("consecutive_failures", consecutiveFailures))
|
||||||
|
|
||||||
|
// Sleep with jitter
|
||||||
|
select {
|
||||||
|
case <-bootstrapCtx.Done():
|
||||||
|
return
|
||||||
|
case <-time.After(jitteredInterval):
|
||||||
|
}
|
||||||
|
|
||||||
|
// Increase interval for next attempt
|
||||||
|
interval = calculateNextBackoff(interval)
|
||||||
|
|
||||||
|
// Log interval increases occasionally to show progress
|
||||||
|
if consecutiveFailures%5 == 0 {
|
||||||
|
n.logger.ComponentInfo(logging.ComponentNode, "Bootstrap connection still failing",
|
||||||
|
zap.Int("consecutive_failures", consecutiveFailures),
|
||||||
|
zap.Duration("current_interval", interval))
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// Success! Reset interval and counters
|
||||||
|
if consecutiveFailures > 0 {
|
||||||
|
n.logger.ComponentInfo(logging.ComponentNode, "Successfully connected to bootstrap peers",
|
||||||
|
zap.Int("failures_overcome", consecutiveFailures))
|
||||||
|
}
|
||||||
|
interval = 5 * time.Second
|
||||||
|
consecutiveFailures = 0
|
||||||
|
|
||||||
|
// Wait 30 seconds before checking connection again
|
||||||
|
select {
|
||||||
|
case <-bootstrapCtx.Done():
|
||||||
|
return
|
||||||
|
case <-time.After(30 * time.Second):
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// We have bootstrap connections, just wait and check periodically
|
||||||
|
select {
|
||||||
|
case <-bootstrapCtx.Done():
|
||||||
|
return
|
||||||
|
case <-time.After(30 * time.Second):
|
||||||
|
}
|
||||||
}
|
}
|
||||||
if err := n.connectToBootstrapPeers(context.Background()); err == nil {
|
|
||||||
n.logger.ComponentDebug(logging.ComponentNode, "Bootstrap reconnect attempt completed")
|
|
||||||
}
|
|
||||||
time.Sleep(5 * time.Second)
|
|
||||||
}
|
}
|
||||||
}()
|
}()
|
||||||
}
|
}
|
||||||
@ -473,6 +597,11 @@ func (n *Node) stopPeerDiscovery() {
|
|||||||
func (n *Node) Stop() error {
|
func (n *Node) Stop() error {
|
||||||
n.logger.ComponentInfo(logging.ComponentNode, "Stopping network node")
|
n.logger.ComponentInfo(logging.ComponentNode, "Stopping network node")
|
||||||
|
|
||||||
|
// Stop bootstrap reconnection loop
|
||||||
|
if n.bootstrapCancel != nil {
|
||||||
|
n.bootstrapCancel()
|
||||||
|
}
|
||||||
|
|
||||||
// Stop peer discovery
|
// Stop peer discovery
|
||||||
n.stopPeerDiscovery()
|
n.stopPeerDiscovery()
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user