feat: enhance production service initialization and logging

- Updated the `Phase2cInitializeServices` function to accept bootstrap peers and VPS IP, improving service configuration for non-bootstrap nodes.
- Refactored the `handleProdInstall` and `handleProdUpgrade` functions to ensure proper initialization of services with the new parameters.
- Improved logging to provide clearer feedback during service initialization and configuration, making troubleshooting easier.
anonpenguin23 committed on 2025-11-13 10:26:50 +02:00
parent 47ffe817b4
commit 358de8a8ad
11 changed files with 745 additions and 160 deletions
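For context, the updated initializer signature at the center of this change (shown in full in the production setup diff below) is:

func (ps *ProductionSetup) Phase2cInitializeServices(nodeType string, bootstrapPeers []string, vpsIP string) error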


@@ -13,6 +13,24 @@ The format is based on [Keep a Changelog][keepachangelog] and adheres to [Semant
 ### Deprecated
 ### Fixed
+## [0.69.11] - 2025-11-13
+### Added
+- Added a new comprehensive shell script (`scripts/test-cluster-health.sh`) for checking the health and replication status of RQLite, IPFS, and IPFS Cluster across production environments.
+### Changed
+- Improved RQLite cluster discovery logic to ensure `peers.json` is correctly generated and includes the local node, which is crucial for reliable cluster recovery.
+- Refactored logging across discovery and RQLite components for cleaner, more concise output, especially for routine operations.
+- Updated the installation and upgrade process to correctly configure IPFS Cluster bootstrap peers using the node's public IP, improving cluster formation reliability.
+### Deprecated
+### Removed
+### Fixed
+- Fixed an issue where RQLite recovery operations (like clearing Raft state) did not correctly force the regeneration of `peers.json`, preventing successful cluster rejoin.
+- Corrected the port calculation logic for IPFS Cluster to ensure the correct LibP2P listen port (9098) is used for bootstrap peer addressing.
 ## [0.69.10] - 2025-11-13
 ### Added

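The bootstrap peer addresses referenced in these changelog entries use the IPFS Cluster LibP2P listener format; for example (IP taken from the health-check script below, peer ID left as a placeholder):

/ip4/51.83.128.181/tcp/9098/p2p/<bootstrap-cluster-peer-id>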

@@ -19,7 +19,7 @@ test-e2e:
 .PHONY: build clean test run-node run-node2 run-node3 run-example deps tidy fmt vet lint clear-ports install-hooks kill
-VERSION := 0.69.10
+VERSION := 0.69.11
 COMMIT ?= $(shell git rev-parse --short HEAD 2>/dev/null || echo unknown)
 DATE ?= $(shell date -u +%Y-%m-%dT%H:%M:%SZ)
 LDFLAGS := -X 'main.version=$(VERSION)' -X 'main.commit=$(COMMIT)' -X 'main.date=$(DATE)'

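Assuming the build target forwards LDFLAGS to the Go toolchain (the build rule itself is outside this hunk), the bumped version string is what gets compiled into the binary, roughly:

go build -ldflags "$(LDFLAGS)" -o bin/<binary> ./cmd/<binary>

where the output path and main package are placeholders, not taken from this Makefile.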

@@ -264,7 +264,7 @@ func handleProdInstall(args []string) {
 // Phase 2c: Initialize services (after secrets are in place)
 fmt.Printf("\nPhase 2c: Initializing services...\n")
-if err := setup.Phase2cInitializeServices(nodeType); err != nil {
+if err := setup.Phase2cInitializeServices(nodeType, bootstrapPeers, *vpsIP); err != nil {
 fmt.Fprintf(os.Stderr, "❌ Service initialization failed: %v\n", err)
 os.Exit(1)
 }

@@ -447,13 +447,6 @@ func handleProdUpgrade(args []string) {
 nodeType = "bootstrap" // Default for upgrade if nothing exists
 }
-// Phase 2c: Ensure services are properly initialized (fixes existing repos)
-fmt.Printf("\nPhase 2c: Ensuring services are properly initialized...\n")
-if err := setup.Phase2cInitializeServices(nodeType); err != nil {
-fmt.Fprintf(os.Stderr, "❌ Service initialization failed: %v\n", err)
-os.Exit(1)
-}
 // Phase 3: Ensure secrets exist (preserves existing secrets)
 fmt.Printf("\n🔐 Phase 3: Ensuring secrets...\n")
 if err := setup.Phase3GenerateSecrets(nodeType == "bootstrap"); err != nil {

@@ -584,6 +577,14 @@ func handleProdUpgrade(args []string) {
 fmt.Printf(" - Bootstrap join address: %s\n", bootstrapJoin)
 }
+// Phase 2c: Ensure services are properly initialized (fixes existing repos)
+// Now that we have bootstrap peers and VPS IP, we can properly configure IPFS Cluster
+fmt.Printf("\nPhase 2c: Ensuring services are properly initialized...\n")
+if err := setup.Phase2cInitializeServices(nodeType, bootstrapPeers, vpsIP); err != nil {
+fmt.Fprintf(os.Stderr, "❌ Service initialization failed: %v\n", err)
+os.Exit(1)
+}
 if err := setup.Phase4GenerateConfigs(nodeType == "bootstrap", bootstrapPeers, vpsIP, enableHTTPS, domain, bootstrapJoin); err != nil {
 fmt.Fprintf(os.Stderr, "⚠️ Config generation warning: %v\n", err)
 fmt.Fprintf(os.Stderr, " Existing configs preserved\n")


@@ -140,19 +140,9 @@ func (d *Manager) handlePeerExchangeStream(s network.Stream) {
 }
 }
-// Log if addresses were filtered out
-if filteredCount > 0 {
-d.logger.Debug("Filtered out non-libp2p addresses",
-zap.String("peer_id", pid.String()[:8]+"..."),
-zap.Int("filtered_count", filteredCount),
-zap.Int("valid_count", len(filteredAddrs)))
-}
 // If no addresses remain after filtering, skip this peer
+// (Filtering is routine - no need to log every occurrence)
 if len(filteredAddrs) == 0 {
-d.logger.Debug("No valid addresses after filtering",
-zap.String("peer_id", pid.String()[:8]+"..."),
-zap.Int("original_count", len(addrs)))
 continue
 }

@@ -186,9 +176,7 @@ func (d *Manager) handlePeerExchangeStream(s network.Stream) {
 return
 }
-d.logger.Debug("Sent peer exchange response",
-zap.Int("peer_count", len(resp.Peers)),
-zap.Bool("has_rqlite_metadata", resp.RQLiteMetadata != nil))
+// Response sent - routine operation, no need to log
 }

 // Start begins periodic peer discovery

@@ -231,9 +219,6 @@ func (d *Manager) discoverPeers(ctx context.Context, config Config) {
 connectedPeers := d.host.Network().Peers()
 initialCount := len(connectedPeers)
-d.logger.Debug("Starting peer discovery",
-zap.Int("current_peers", initialCount))
 newConnections := 0

 // Strategy 1: Try to connect to peers learned from the host's peerstore

@@ -246,11 +231,12 @@
 finalPeerCount := len(d.host.Network().Peers())
+// Summary log: only log if there were changes or new connections
 if newConnections > 0 || finalPeerCount != initialCount {
-d.logger.Debug("Peer discovery completed",
-zap.Int("new_connections", newConnections),
-zap.Int("initial_peers", initialCount),
-zap.Int("final_peers", finalPeerCount))
+d.logger.Debug("Discovery summary",
+zap.Int("connected", finalPeerCount),
+zap.Int("new", newConnections),
+zap.Int("was", initialCount))
 }
 }

@@ -265,7 +251,10 @@ func (d *Manager) discoverViaPeerstore(ctx context.Context, maxConnections int)
 // Iterate over peerstore known peers
 peers := d.host.Peerstore().Peers()
-d.logger.Debug("Peerstore contains peers", zap.Int("count", len(peers)))
+// Only connect to peers on our standard LibP2P port to avoid cross-connecting
+// with IPFS/IPFS Cluster instances that use different ports
+const libp2pPort = 4001

 for _, pid := range peers {
 if connected >= maxConnections {

@@ -280,6 +269,24 @@
 continue
 }
+// Filter peers to only include those with addresses on our port (4001)
+// This prevents attempting to connect to IPFS (port 4101) or IPFS Cluster (port 9096)
+peerInfo := d.host.Peerstore().PeerInfo(pid)
+hasValidPort := false
+for _, addr := range peerInfo.Addrs {
+if port, err := addr.ValueForProtocol(multiaddr.P_TCP); err == nil {
+if portNum, err := strconv.Atoi(port); err == nil && portNum == libp2pPort {
+hasValidPort = true
+break
+}
+}
+}
+// Skip peers without valid port 4001 addresses
+if !hasValidPort {
+continue
+}
 // Try to connect
 if err := d.connectToPeer(ctx, pid); err == nil {
 connected++

@@ -302,8 +309,8 @@ func (d *Manager) discoverViaPeerExchange(ctx context.Context, maxConnections in
 return 0
 }
-d.logger.Debug("Starting peer exchange with connected peers",
-zap.Int("num_peers", len(connectedPeers)))
+exchangedPeers := 0
+metadataCollected := 0

 for _, peerID := range connectedPeers {
 if connected >= maxConnections {

@@ -316,9 +323,13 @@
 continue
 }
-d.logger.Debug("Received peer list from peer",
-zap.String("from_peer", peerID.String()[:8]+"..."),
-zap.Int("peer_count", len(peers)))
+exchangedPeers++
+// Check if we got RQLite metadata
+if val, err := d.host.Peerstore().Get(peerID, "rqlite_metadata"); err == nil {
+if _, ok := val.([]byte); ok {
+metadataCollected++
+}
+}

 // Try to connect to discovered peers
 for _, peerInfo := range peers {

@@ -365,9 +376,7 @@
 }
 if len(addrs) == 0 {
-d.logger.Debug("No valid libp2p addresses (port 4001) for peer",
-zap.String("peer_id", parsedID.String()[:8]+"..."),
-zap.Int("total_addresses", len(peerInfo.Addrs)))
+// Skip peers without valid addresses - no need to log every occurrence
 continue
 }

@@ -380,20 +389,29 @@
 if err := d.host.Connect(connectCtx, peerAddrInfo); err != nil {
 cancel()
-d.logger.Debug("Failed to connect to discovered peer",
-zap.String("peer_id", parsedID.String()[:8]+"..."),
+// Only log connection failures for debugging - errors are still useful
+d.logger.Debug("Connect failed",
+zap.String("peer", parsedID.String()[:8]+"..."),
 zap.Error(err))
 continue
 }
 cancel()
-d.logger.Info("Successfully connected to discovered peer",
-zap.String("peer_id", parsedID.String()[:8]+"..."),
-zap.String("discovered_from", peerID.String()[:8]+"..."))
+d.logger.Info("Connected",
+zap.String("peer", parsedID.String()[:8]+"..."),
+zap.String("from", peerID.String()[:8]+"..."))
 connected++
 }
 }
+// Summary log for peer exchange
+if exchangedPeers > 0 {
+d.logger.Debug("Exchange summary",
+zap.Int("exchanged_with", exchangedPeers),
+zap.Int("metadata_collected", metadataCollected),
+zap.Int("new_connections", connected))
+}

 return connected
 }

@@ -446,9 +464,10 @@ func (d *Manager) requestPeersFromPeer(ctx context.Context, peerID peer.ID, limi
 metadataJSON, err := json.Marshal(resp.RQLiteMetadata)
 if err == nil {
 _ = d.host.Peerstore().Put(peerID, "rqlite_metadata", metadataJSON)
-d.logger.Debug("Stored RQLite metadata from peer",
-zap.String("peer_id", peerID.String()[:8]+"..."),
-zap.String("node_id", resp.RQLiteMetadata.NodeID))
+// Only log when new metadata is stored (useful for debugging)
+d.logger.Debug("Metadata stored",
+zap.String("peer", peerID.String()[:8]+"..."),
+zap.String("node", resp.RQLiteMetadata.NodeID))
 }
 }

@@ -464,9 +483,6 @@ func (d *Manager) TriggerPeerExchange(ctx context.Context) int {
 return 0
 }
-d.logger.Info("Manually triggering peer exchange",
-zap.Int("connected_peers", len(connectedPeers)))
 metadataCollected := 0
 for _, peerID := range connectedPeers {
 // Request peer list from this peer (which includes their RQLite metadata)

@@ -480,9 +496,9 @@ func (d *Manager) TriggerPeerExchange(ctx context.Context) int {
 }
 }
-d.logger.Info("Peer exchange completed",
-zap.Int("peers_with_metadata", metadataCollected),
-zap.Int("total_peers", len(connectedPeers)))
+d.logger.Info("Exchange completed",
+zap.Int("peers", len(connectedPeers)),
+zap.Int("with_metadata", metadataCollected))
 return metadataCollected
 }

@@ -502,8 +518,7 @@ func (d *Manager) connectToPeer(ctx context.Context, peerID peer.ID) error {
 return err
 }
-d.logger.Debug("Successfully connected to peer",
-zap.String("peer_id", peerID.String()[:8]+"..."))
+// Connection success logged at higher level - no need for duplicate DEBUG log
 return nil
 }

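A minimal, self-contained sketch of the port filter added above, using the same go-multiaddr call (addresses are illustrative; 4101 and 9096 are the IPFS and IPFS Cluster ports named in the diff's comments):

package main

import (
	"fmt"
	"strconv"

	"github.com/multiformats/go-multiaddr"
)

// keepLibp2pPort reports whether an address advertises TCP port 4001,
// mirroring the hasValidPort check in discoverViaPeerstore.
func keepLibp2pPort(addr multiaddr.Multiaddr) bool {
	port, err := addr.ValueForProtocol(multiaddr.P_TCP)
	if err != nil {
		return false
	}
	n, err := strconv.Atoi(port)
	return err == nil && n == 4001
}

func main() {
	for _, s := range []string{
		"/ip4/51.83.128.181/tcp/4001", // node's own LibP2P listener - kept
		"/ip4/51.83.128.181/tcp/4101", // IPFS daemon - skipped
		"/ip4/51.83.128.181/tcp/9096", // IPFS Cluster - skipped
	} {
		a, err := multiaddr.NewMultiaddr(s)
		if err != nil {
			continue
		}
		fmt.Printf("%s keep=%v\n", s, keepLibp2pPort(a))
	}
}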

@@ -532,7 +532,8 @@ func (bi *BinaryInstaller) configureIPFSAddresses(ipfsRepoPath string, apiPort,
 // InitializeIPFSClusterConfig initializes IPFS Cluster configuration
 // This runs `ipfs-cluster-service init` to create the service.json configuration file.
 // For existing installations, it ensures the cluster secret is up to date.
-func (bi *BinaryInstaller) InitializeIPFSClusterConfig(nodeType, clusterPath, clusterSecret string, ipfsAPIPort int) error {
+// bootstrapClusterPeers should be in format: ["/ip4/<ip>/tcp/9098/p2p/<cluster-peer-id>"]
+func (bi *BinaryInstaller) InitializeIPFSClusterConfig(nodeType, clusterPath, clusterSecret string, ipfsAPIPort int, bootstrapClusterPeers []string) error {
 serviceJSONPath := filepath.Join(clusterPath, "service.json")
 configExists := false
 if _, err := os.Stat(serviceJSONPath); err == nil {

@@ -567,11 +568,11 @@ func (bi *BinaryInstaller) InitializeIPFSClusterConfig(nodeType, clusterPath, cl
 }
 }
-// Always update the cluster secret and IPFS port (for both new and existing configs)
+// Always update the cluster secret, IPFS port, and peer addresses (for both new and existing configs)
 // This ensures existing installations get the secret and port synchronized
 if clusterSecret != "" {
-fmt.Fprintf(bi.logWriter.(interface{ Write([]byte) (int, error) }), " Updating cluster secret and IPFS port...\n")
-if err := bi.updateClusterConfig(clusterPath, clusterSecret, ipfsAPIPort); err != nil {
+fmt.Fprintf(bi.logWriter.(interface{ Write([]byte) (int, error) }), " Updating cluster secret, IPFS port, and peer addresses...\n")
+if err := bi.updateClusterConfig(clusterPath, clusterSecret, ipfsAPIPort, bootstrapClusterPeers); err != nil {
 return fmt.Errorf("failed to update cluster config: %w", err)
 }
 }

@@ -582,8 +583,8 @@ func (bi *BinaryInstaller) InitializeIPFSClusterConfig(nodeType, clusterPath, cl
 return nil
 }
-// updateClusterConfig updates the secret and IPFS port in IPFS Cluster service.json
-func (bi *BinaryInstaller) updateClusterConfig(clusterPath, secret string, ipfsAPIPort int) error {
+// updateClusterConfig updates the secret, IPFS port, and peer addresses in IPFS Cluster service.json
+func (bi *BinaryInstaller) updateClusterConfig(clusterPath, secret string, ipfsAPIPort int, bootstrapClusterPeers []string) error {
 serviceJSONPath := filepath.Join(clusterPath, "service.json")

 // Read existing config

@@ -598,13 +599,22 @@ func (bi *BinaryInstaller) updateClusterConfig(clusterPath, secret string, ipfsA
 return fmt.Errorf("failed to parse service.json: %w", err)
 }
-// Update cluster secret
+// Update cluster secret and peer addresses
 if cluster, ok := config["cluster"].(map[string]interface{}); ok {
 cluster["secret"] = secret
+// Configure peer addresses for cluster discovery
+// This allows nodes to find and connect to each other
+if len(bootstrapClusterPeers) > 0 {
+cluster["peer_addresses"] = bootstrapClusterPeers
+}
 } else {
-config["cluster"] = map[string]interface{}{
+clusterConfig := map[string]interface{}{
 "secret": secret,
 }
+if len(bootstrapClusterPeers) > 0 {
+clusterConfig["peer_addresses"] = bootstrapClusterPeers
+}
+config["cluster"] = clusterConfig
 }

 // Update IPFS port in IPFS Proxy configuration

@@ -635,6 +645,35 @@ func (bi *BinaryInstaller) updateClusterConfig(clusterPath, secret string, ipfsA
 return nil
 }
+
+// GetClusterPeerMultiaddr reads the IPFS Cluster peer ID and returns its multiaddress
+// Returns format: /ip4/<ip>/tcp/9098/p2p/<cluster-peer-id>
+func (bi *BinaryInstaller) GetClusterPeerMultiaddr(clusterPath string, nodeIP string) (string, error) {
+identityPath := filepath.Join(clusterPath, "identity.json")
+
+// Read identity file
+data, err := os.ReadFile(identityPath)
+if err != nil {
+return "", fmt.Errorf("failed to read identity.json: %w", err)
+}
+
+// Parse JSON
+var identity map[string]interface{}
+if err := json.Unmarshal(data, &identity); err != nil {
+return "", fmt.Errorf("failed to parse identity.json: %w", err)
+}
+
+// Get peer ID
+peerID, ok := identity["id"].(string)
+if !ok || peerID == "" {
+return "", fmt.Errorf("peer ID not found in identity.json")
+}
+
+// Construct multiaddress: /ip4/<ip>/tcp/9098/p2p/<peer-id>
+// Port 9098 is the default cluster listen port
+multiaddr := fmt.Sprintf("/ip4/%s/tcp/9098/p2p/%s", nodeIP, peerID)
+return multiaddr, nil
+}

 // InitializeRQLiteDataDir initializes RQLite data directory
 func (bi *BinaryInstaller) InitializeRQLiteDataDir(nodeType, dataDir string) error {
 fmt.Fprintf(bi.logWriter.(interface{ Write([]byte) (int, error) }), " Initializing RQLite data dir for %s...\n", nodeType)

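For reference, after updateClusterConfig runs on a non-bootstrap node, the cluster section of service.json would contain roughly the following (values illustrative, other fields left untouched by this change):

"cluster": {
  "secret": "<64-character hex cluster secret>",
  "peer_addresses": [
    "/ip4/51.83.128.181/tcp/9098/p2p/<bootstrap-cluster-peer-id>"
  ],
  ...
}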

@@ -265,7 +265,7 @@ func (ps *ProductionSetup) Phase2bInstallBinaries() error {
 }

 // Phase2cInitializeServices initializes service repositories and configurations
-func (ps *ProductionSetup) Phase2cInitializeServices(nodeType string) error {
+func (ps *ProductionSetup) Phase2cInitializeServices(nodeType string, bootstrapPeers []string, vpsIP string) error {
 ps.logf("Phase 2c: Initializing services...")

 // Ensure node-specific directories exist

@@ -289,7 +289,34 @@ func (ps *ProductionSetup) Phase2cInitializeServices(nodeType string) error {
 if err != nil {
 return fmt.Errorf("failed to get cluster secret: %w", err)
 }
-if err := ps.binaryInstaller.InitializeIPFSClusterConfig(nodeType, clusterPath, clusterSecret, 4501); err != nil {
+
+// Get bootstrap cluster peer addresses for non-bootstrap nodes
+var bootstrapClusterPeers []string
+if nodeType != "bootstrap" && len(bootstrapPeers) > 0 {
+// Try to read bootstrap cluster peer ID and construct multiaddress
+bootstrapClusterPath := filepath.Join(ps.debrosDir, "data", "bootstrap", "ipfs-cluster")
+// Infer bootstrap IP from bootstrap peers
+bootstrapIP := inferBootstrapIP(bootstrapPeers, vpsIP)
+if bootstrapIP != "" {
+// Check if bootstrap cluster identity exists
+if _, err := os.Stat(filepath.Join(bootstrapClusterPath, "identity.json")); err == nil {
+// Bootstrap cluster is initialized, get its multiaddress
+if clusterMultiaddr, err := ps.binaryInstaller.GetClusterPeerMultiaddr(bootstrapClusterPath, bootstrapIP); err == nil {
+bootstrapClusterPeers = []string{clusterMultiaddr}
+ps.logf(" Configured IPFS Cluster to connect to bootstrap: %s", clusterMultiaddr)
+} else {
+ps.logf(" ⚠️ Could not read bootstrap cluster peer ID: %v", err)
+ps.logf(" ⚠️ IPFS Cluster will rely on mDNS discovery (may not work across internet)")
+}
+} else {
+ps.logf(" Bootstrap cluster not yet initialized, peer_addresses will be empty")
+ps.logf(" IPFS Cluster will rely on mDNS discovery (may not work across internet)")
+}
+}
+}
+
+if err := ps.binaryInstaller.InitializeIPFSClusterConfig(nodeType, clusterPath, clusterSecret, 4501, bootstrapClusterPeers); err != nil {
 return fmt.Errorf("failed to initialize IPFS Cluster: %w", err)
 }

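inferBootstrapIP itself is not part of this diff; purely as a hypothetical sketch, it could pull the ip4 component out of a bootstrap peer multiaddr while skipping the node's own address:

package main

import (
	"fmt"

	"github.com/multiformats/go-multiaddr"
)

// inferBootstrapIPSketch is a hypothetical stand-in for the real helper,
// which is not shown in this commit.
func inferBootstrapIPSketch(bootstrapPeers []string, vpsIP string) string {
	for _, p := range bootstrapPeers {
		addr, err := multiaddr.NewMultiaddr(p)
		if err != nil {
			continue // ignore unparsable entries
		}
		ip, err := addr.ValueForProtocol(multiaddr.P_IP4)
		if err != nil || ip == vpsIP {
			continue // no ip4 component, or it is our own address
		}
		return ip
	}
	return ""
}

func main() {
	peers := []string{"/ip4/51.83.128.181/tcp/4001"}
	fmt.Println(inferBootstrapIPSketch(peers, "57.128.223.92")) // prints 51.83.128.181
}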

@@ -161,9 +161,15 @@ func (cm *ClusterConfigManager) EnsureConfig() error {
 }

 // Calculate ports based on pattern
-proxyPort := clusterPort - 1
-pinSvcPort := clusterPort + 1
-clusterListenPort := clusterPort + 2
+// REST API: 9094
+// Proxy: 9094 - 1 = 9093 (NOT USED - keeping for reference)
+// PinSvc: 9094 + 1 = 9095
+// Proxy API: 9094 + 1 = 9095 (actual proxy port)
+// PinSvc API: 9094 + 3 = 9097
+// Cluster LibP2P: 9094 + 4 = 9098
+proxyPort := clusterPort + 1 // 9095 (IPFSProxy API)
+pinSvcPort := clusterPort + 3 // 9097 (PinSvc API)
+clusterListenPort := clusterPort + 4 // 9098 (Cluster LibP2P)

 // If config doesn't exist, initialize it with ipfs-cluster-service init
 // This ensures we have all required sections (datastore, informer, etc.)

@@ -246,8 +252,9 @@ func (cm *ClusterConfigManager) UpdateBootstrapPeers(bootstrapAPIURL string) (bo
 return false, fmt.Errorf("failed to parse bootstrap cluster API URL: %w", err)
 }
-// Bootstrap listens on clusterPort + 2 (same pattern)
-bootstrapClusterPort := clusterPort + 2
+// Bootstrap cluster LibP2P listens on clusterPort + 4
+// (REST API is 9094, LibP2P is 9098 = 9094 + 4)
+bootstrapClusterPort := clusterPort + 4

 // Determine IP protocol (ip4 or ip6) based on the host
 var ipProtocol string

@@ -266,17 +273,31 @@
 return false, fmt.Errorf("failed to load config: %w", err)
 }
-// Check if we already have the correct address configured
-if len(cfg.Cluster.PeerAddresses) > 0 && cfg.Cluster.PeerAddresses[0] == bootstrapPeerAddr {
-cm.logger.Debug("Bootstrap peer address already correct", zap.String("addr", bootstrapPeerAddr))
-return true, nil
+// CRITICAL: Always update peerstore file to ensure no stale addresses remain
+// Stale addresses (e.g., from old port configurations) cause LibP2P dial backoff,
+// preventing cluster peers from connecting even if the correct address is present.
+// We must clean and rewrite the peerstore on every update to avoid this.
+peerstorePath := filepath.Join(cm.clusterPath, "peerstore")
+
+// Check if peerstore needs updating (avoid unnecessary writes but always clean stale entries)
+needsUpdate := true
+if peerstoreData, err := os.ReadFile(peerstorePath); err == nil {
+// Only skip update if peerstore contains EXACTLY the correct address and nothing else
+existingAddrs := strings.Split(strings.TrimSpace(string(peerstoreData)), "\n")
+if len(existingAddrs) == 1 && strings.TrimSpace(existingAddrs[0]) == bootstrapPeerAddr {
+cm.logger.Debug("Bootstrap peer address already correct in peerstore", zap.String("addr", bootstrapPeerAddr))
+needsUpdate = false
+}
 }
-// Update peerstore file FIRST - this is what IPFS Cluster reads for bootstrapping
-// Peerstore is the source of truth, service.json is just for our tracking
-peerstorePath := filepath.Join(cm.clusterPath, "peerstore")
-if err := os.WriteFile(peerstorePath, []byte(bootstrapPeerAddr+"\n"), 0644); err != nil {
-return false, fmt.Errorf("failed to write peerstore: %w", err)
+
+if needsUpdate {
+// Write ONLY the correct bootstrap peer address, removing any stale entries
+if err := os.WriteFile(peerstorePath, []byte(bootstrapPeerAddr+"\n"), 0644); err != nil {
+return false, fmt.Errorf("failed to write peerstore: %w", err)
+}
+cm.logger.Info("Updated peerstore with bootstrap peer (cleaned stale entries)",
+zap.String("addr", bootstrapPeerAddr),
+zap.String("peerstore_path", peerstorePath))
 }

 // Then sync service.json from peerstore to keep them in sync

@@ -852,8 +873,9 @@ func parseClusterPorts(clusterAPIURL string) (clusterPort, restAPIPort int, err
 return 0, 0, fmt.Errorf("invalid port: %s", portStr)
 }
-// Cluster listen port is typically REST API port + 2
-clusterPort = restAPIPort + 2
+// clusterPort is used as the base port for calculations
+// The actual cluster LibP2P listen port is calculated as clusterPort + 4
+clusterPort = restAPIPort
 return clusterPort, restAPIPort, nil
 }

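A worked example of the corrected port layout, assuming the default REST API port of 9094 as the base:

package main

import "fmt"

func main() {
	clusterPort := 9094                  // REST API (base port)
	proxyPort := clusterPort + 1         // 9095 - IPFS proxy API
	pinSvcPort := clusterPort + 3        // 9097 - pinning service API
	clusterListenPort := clusterPort + 4 // 9098 - cluster LibP2P listener (used in bootstrap multiaddrs)
	fmt.Println(proxyPort, pinSvcPort, clusterListenPort)
}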

@@ -101,8 +101,10 @@ func getLevelColor(level zapcore.Level) string {
 // coloredConsoleEncoder creates a custom encoder with colors
 func coloredConsoleEncoder(enableColors bool) zapcore.Encoder {
 config := zap.NewDevelopmentEncoderConfig()
+// Ultra-short timestamp: HH:MM:SS (no milliseconds, no date, no timezone)
 config.EncodeTime = func(t time.Time, enc zapcore.PrimitiveArrayEncoder) {
-timeStr := t.Format("2006-01-02T15:04:05.000Z0700")
+timeStr := t.Format("15:04:05")
 if enableColors {
 enc.AppendString(fmt.Sprintf("%s%s%s", Dim, timeStr, Reset))
 } else {

@@ -110,21 +112,41 @@ func coloredConsoleEncoder(enableColors bool) zapcore.Encoder {
 }
 }
+// Single letter level: D, I, W, E
 config.EncodeLevel = func(level zapcore.Level, enc zapcore.PrimitiveArrayEncoder) {
-levelStr := strings.ToUpper(level.String())
+levelMap := map[zapcore.Level]string{
+zapcore.DebugLevel: "D",
+zapcore.InfoLevel: "I",
+zapcore.WarnLevel: "W",
+zapcore.ErrorLevel: "E",
+}
+levelStr := levelMap[level]
+if levelStr == "" {
+levelStr = "?"
+}
 if enableColors {
 color := getLevelColor(level)
-enc.AppendString(fmt.Sprintf("%s%s%-5s%s", color, Bold, levelStr, Reset))
+enc.AppendString(fmt.Sprintf("%s%s%s%s", color, Bold, levelStr, Reset))
 } else {
-enc.AppendString(fmt.Sprintf("%-5s", levelStr))
+enc.AppendString(levelStr)
 }
 }
+// Just filename, no line number for cleaner output
 config.EncodeCaller = func(caller zapcore.EntryCaller, enc zapcore.PrimitiveArrayEncoder) {
+file := caller.File
+// Extract just the filename from the path
+if idx := strings.LastIndex(file, "/"); idx >= 0 {
+file = file[idx+1:]
+}
+// Remove .go extension for even more compact format
+if strings.HasSuffix(file, ".go") {
+file = file[:len(file)-3]
+}
 if enableColors {
-enc.AppendString(fmt.Sprintf("%s%s%s", Dim, caller.TrimmedPath(), Reset))
+enc.AppendString(fmt.Sprintf("%s%s%s", Dim, file, Reset))
 } else {
-enc.AppendString(caller.TrimmedPath())
+enc.AppendString(file)
 }
 }

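Illustrative only - with these encoder changes, a console line that previously rendered along the lines of

2025-11-13T10:26:50.000+0200  DEBUG  discovery/manager.go:231  Starting peer discovery

now comes out roughly as

10:26:50 D manager Starting peer discovery

(the exact field order and separators depend on the rest of the zap development encoder configuration).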

@@ -166,8 +166,7 @@ func (c *ClusterDiscoveryService) collectPeerMetadata() []*discovery.RQLiteNodeM
 connectedPeers := c.host.Network().Peers()
 var metadata []*discovery.RQLiteNodeMetadata
-c.logger.Debug("Collecting peer metadata from LibP2P",
-zap.Int("connected_libp2p_peers", len(connectedPeers)))
+// Metadata collection is routine - no need to log every occurrence
 c.mu.RLock()
 currentRaftAddr := c.raftAddress

@@ -240,9 +239,6 @@ type membershipUpdateResult struct {
 func (c *ClusterDiscoveryService) updateClusterMembership() {
 metadata := c.collectPeerMetadata()
-c.logger.Debug("Collected peer metadata",
-zap.Int("metadata_count", len(metadata)))
 // Compute membership changes while holding lock
 c.mu.Lock()
 result := c.computeMembershipChangesLocked(metadata)

@@ -252,35 +248,30 @@ func (c *ClusterDiscoveryService) updateClusterMembership() {
 if result.changed {
 // Log state changes (peer added/removed) at Info level
 if len(result.added) > 0 || len(result.updated) > 0 {
-c.logger.Info("Cluster membership changed",
+c.logger.Info("Membership changed",
 zap.Int("added", len(result.added)),
 zap.Int("updated", len(result.updated)),
-zap.Strings("added_ids", result.added),
-zap.Strings("updated_ids", result.updated))
+zap.Strings("added", result.added),
+zap.Strings("updated", result.updated))
 }

 // Write peers.json without holding lock
 if err := c.writePeersJSONWithData(result.peersJSON); err != nil {
-c.logger.Error("CRITICAL: Failed to write peers.json",
+c.logger.Error("Failed to write peers.json",
 zap.Error(err),
 zap.String("data_dir", c.dataDir),
-zap.Int("peer_count", len(result.peersJSON)))
+zap.Int("peers", len(result.peersJSON)))
 } else {
 c.logger.Debug("peers.json updated",
-zap.Int("peer_count", len(result.peersJSON)))
+zap.Int("peers", len(result.peersJSON)))
 }

 // Update lastUpdate timestamp
 c.mu.Lock()
 c.lastUpdate = time.Now()
 c.mu.Unlock()
-} else {
-c.mu.RLock()
-totalPeers := len(c.knownPeers)
-c.mu.RUnlock()
-c.logger.Debug("No changes to cluster membership",
-zap.Int("total_peers", totalPeers))
 }
+// No changes - don't log (reduces noise)
 }

 // computeMembershipChangesLocked computes membership changes and returns snapshot data

@@ -305,10 +296,10 @@ func (c *ClusterDiscoveryService) computeMembershipChangesLocked(metadata []*dis
 } else {
 // New peer discovered
 added = append(added, meta.NodeID)
-c.logger.Info("Node added to cluster",
-zap.String("node_id", meta.NodeID),
-zap.String("raft_address", meta.RaftAddress),
-zap.String("node_type", meta.NodeType),
+c.logger.Info("Node added",
+zap.String("node", meta.NodeID),
+zap.String("raft", meta.RaftAddress),
+zap.String("type", meta.NodeType),
 zap.Uint64("log_index", meta.RaftLogIndex))
 }

@@ -354,11 +345,10 @@
 requiredRemotePeers := c.minClusterSize - 1
 if remotePeerCount < requiredRemotePeers {
-c.logger.Info("Skipping initial peers.json write - not enough remote peers discovered",
-zap.Int("remote_peers", remotePeerCount),
-zap.Int("required_remote_peers", requiredRemotePeers),
-zap.Int("min_cluster_size", c.minClusterSize),
-zap.String("action", "will write when enough peers are discovered"))
+c.logger.Info("Waiting for peers",
+zap.Int("have", remotePeerCount),
+zap.Int("need", requiredRemotePeers),
+zap.Int("min_size", c.minClusterSize))
 return membershipUpdateResult{
 changed: false,
 }

@@ -367,8 +357,7 @@
 // Additional safety check: don't write empty peers.json (would cause single-node cluster)
 if len(peers) == 0 && c.lastUpdate.IsZero() {
-c.logger.Info("Skipping peers.json write - no remote peers to include",
-zap.String("action", "will write when peers are discovered"))
+c.logger.Info("No remote peers - waiting")
 return membershipUpdateResult{
 changed: false,
 }

@@ -376,10 +365,10 @@
 // Log initial sync if this is the first time
 if c.lastUpdate.IsZero() {
-c.logger.Info("Initial cluster membership sync",
-zap.Int("total_peers", len(c.knownPeers)),
-zap.Int("remote_peers", remotePeerCount),
-zap.Int("peers_in_json", len(peers)))
+c.logger.Info("Initial sync",
+zap.Int("total", len(c.knownPeers)),
+zap.Int("remote", remotePeerCount),
+zap.Int("in_json", len(peers)))
 }

 return membershipUpdateResult{

@@ -408,8 +397,8 @@ func (c *ClusterDiscoveryService) removeInactivePeers() {
 if inactiveDuration > c.inactivityLimit {
 // Mark as inactive and remove
-c.logger.Warn("Node removed from cluster",
-zap.String("node_id", nodeID),
+c.logger.Warn("Node removed",
+zap.String("node", nodeID),
 zap.String("reason", "inactive"),
 zap.Duration("inactive_duration", inactiveDuration))

@@ -421,9 +410,9 @@
 // Regenerate peers.json if any peers were removed
 if len(removed) > 0 {
-c.logger.Info("Removed inactive nodes, regenerating peers.json",
-zap.Int("removed", len(removed)),
-zap.Strings("node_ids", removed))
+c.logger.Info("Removed inactive",
+zap.Int("count", len(removed)),
+zap.Strings("nodes", removed))
 if err := c.writePeersJSON(); err != nil {
 c.logger.Error("Failed to write peers.json after cleanup", zap.Error(err))

@@ -443,10 +432,11 @@ func (c *ClusterDiscoveryService) getPeersJSONUnlocked() []map[string]interface{
 peers := make([]map[string]interface{}, 0, len(c.knownPeers))
 for _, peer := range c.knownPeers {
-// Skip self - RQLite knows about itself, shouldn't be in peers.json
-if peer.NodeID == c.raftAddress {
-continue
-}
+// CRITICAL FIX: Include ALL peers (including self) in peers.json
+// When using bootstrap-expect with recovery, RQLite needs the complete
+// expected cluster configuration to properly form consensus.
+// The peers.json file is used by RQLite's recovery mechanism to know
+// what the full cluster membership should be, including the local node.
 peerEntry := map[string]interface{}{
 "id": peer.RaftAddress, // RQLite uses raft address as node ID
 "address": peer.RaftAddress,

@@ -482,11 +472,7 @@ func (c *ClusterDiscoveryService) writePeersJSONWithData(peers []map[string]inte
 // Get the RQLite raft directory
 rqliteDir := filepath.Join(dataDir, "rqlite", "raft")
-c.logger.Debug("Writing peers.json",
-zap.String("data_dir", c.dataDir),
-zap.String("expanded_path", dataDir),
-zap.String("raft_dir", rqliteDir),
-zap.Int("peer_count", len(peers)))
+// Writing peers.json - routine operation, no need to log details
 if err := os.MkdirAll(rqliteDir, 0755); err != nil {
 return fmt.Errorf("failed to create raft directory %s: %w", rqliteDir, err)

@@ -497,7 +483,7 @@
 // Backup existing peers.json if it exists
 if _, err := os.Stat(peersFile); err == nil {
-c.logger.Debug("Backing up existing peers.json", zap.String("backup_file", backupFile))
+// Backup existing peers.json if it exists - routine operation
 data, err := os.ReadFile(peersFile)
 if err == nil {
 _ = os.WriteFile(backupFile, data, 0644)

@@ -510,7 +496,7 @@
 return fmt.Errorf("failed to marshal peers.json: %w", err)
 }
-c.logger.Debug("Marshaled peers.json", zap.Int("data_size", len(data)))
+// Marshaled peers.json - routine operation
 // Write atomically using temp file + rename
 tempFile := peersFile + ".tmp"

@@ -530,9 +516,8 @@
 }
 c.logger.Info("peers.json written",
-zap.String("file", peersFile),
-zap.Int("node_count", len(peers)),
-zap.Strings("node_ids", nodeIDs))
+zap.Int("peers", len(peers)),
+zap.Strings("nodes", nodeIDs))
 return nil
 }

@@ -648,17 +633,58 @@ func (c *ClusterDiscoveryService) WaitForDiscoverySettling(ctx context.Context)
 // TriggerSync manually triggers a cluster membership sync
 func (c *ClusterDiscoveryService) TriggerSync() {
-c.logger.Info("Manually triggering cluster membership sync")
 // For bootstrap nodes, wait a bit for peer discovery to stabilize
 if c.nodeType == "bootstrap" {
-c.logger.Info("Bootstrap node: waiting for peer discovery to complete")
 time.Sleep(5 * time.Second)
 }
 c.updateClusterMembership()
 }
+
+// ForceWritePeersJSON forces writing peers.json regardless of membership changes
+// This is useful after clearing raft state when we need to recreate peers.json
+func (c *ClusterDiscoveryService) ForceWritePeersJSON() error {
+c.logger.Info("Force writing peers.json")
+
+// First, collect latest peer metadata to ensure we have current information
+metadata := c.collectPeerMetadata()
+
+// Update known peers with latest metadata (without writing file yet)
+c.mu.Lock()
+for _, meta := range metadata {
+c.knownPeers[meta.NodeID] = meta
+// Update health tracking for remote peers
+if meta.NodeID != c.raftAddress {
+if _, ok := c.peerHealth[meta.NodeID]; !ok {
+c.peerHealth[meta.NodeID] = &PeerHealth{
+LastSeen: time.Now(),
+LastSuccessful: time.Now(),
+Status: "active",
+}
+} else {
+c.peerHealth[meta.NodeID].LastSeen = time.Now()
+c.peerHealth[meta.NodeID].Status = "active"
+}
+}
+}
+peers := c.getPeersJSONUnlocked()
+c.mu.Unlock()
+
+// Now force write the file
+if err := c.writePeersJSONWithData(peers); err != nil {
+c.logger.Error("Failed to force write peers.json",
+zap.Error(err),
+zap.String("data_dir", c.dataDir),
+zap.Int("peers", len(peers)))
+return err
+}
+
+c.logger.Info("peers.json written",
+zap.Int("peers", len(peers)))
+return nil
+}

 // TriggerPeerExchange actively exchanges peer information with connected peers
 // This populates the peerstore with RQLite metadata from other nodes
 func (c *ClusterDiscoveryService) TriggerPeerExchange(ctx context.Context) error {

@@ -666,9 +692,8 @@ func (c *ClusterDiscoveryService) TriggerPeerExchange(ctx context.Context) error
 return fmt.Errorf("discovery manager not available")
 }
-c.logger.Info("Triggering peer exchange via discovery manager")
 collected := c.discoveryMgr.TriggerPeerExchange(ctx)
-c.logger.Info("Peer exchange completed", zap.Int("peers_with_metadata", collected))
+c.logger.Debug("Exchange completed", zap.Int("with_metadata", collected))
 return nil
 }

@@ -709,8 +734,8 @@ func (c *ClusterDiscoveryService) UpdateOwnMetadata() {
 return
 }
-c.logger.Debug("Updated own RQLite metadata",
-zap.String("node_id", metadata.NodeID),
+c.logger.Debug("Metadata updated",
+zap.String("node", metadata.NodeID),
 zap.Uint64("log_index", metadata.RaftLogIndex))
 }

@@ -740,9 +765,9 @@ func (c *ClusterDiscoveryService) StoreRemotePeerMetadata(peerID peer.ID, metada
 return fmt.Errorf("failed to store metadata: %w", err)
 }
-c.logger.Debug("Stored remote peer metadata",
-zap.String("peer_id", shortPeerID(peerID)),
-zap.String("node_id", metadata.NodeID))
+c.logger.Debug("Metadata stored",
+zap.String("peer", shortPeerID(peerID)),
+zap.String("node", metadata.NodeID))
 return nil
 }

@@ -758,9 +783,9 @@ func (c *ClusterDiscoveryService) adjustPeerAdvertisedAddresses(peerID peer.ID,
 changed, stale := rewriteAdvertisedAddresses(meta, ip, true)
 if changed {
-c.logger.Debug("Normalized peer advertised RQLite addresses",
-zap.String("peer_id", shortPeerID(peerID)),
-zap.String("raft_address", meta.RaftAddress),
+c.logger.Debug("Addresses normalized",
+zap.String("peer", shortPeerID(peerID)),
+zap.String("raft", meta.RaftAddress),
 zap.String("http_address", meta.HTTPAddress))
 }
 return changed, stale

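For reference, with the self-inclusion fix above a three-node cluster produces a peers.json along these lines (raft addresses are placeholders built from the health-check script's node IPs; RQLite's recovery format also accepts an optional non_voter flag, omitted here):

[
  {"id": "51.83.128.181:<raft-port>", "address": "51.83.128.181:<raft-port>"},
  {"id": "57.128.223.92:<raft-port>", "address": "57.128.223.92:<raft-port>"},
  {"id": "185.185.83.89:<raft-port>", "address": "185.185.83.89:<raft-port>"}
]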

@@ -34,11 +34,6 @@ type RQLiteManager struct {
 // waitForSQLAvailable waits until a simple query succeeds, indicating a leader is known and queries can be served.
 func (r *RQLiteManager) waitForSQLAvailable(ctx context.Context) error {
-if r.connection == nil {
-r.logger.Error("No rqlite connection")
-return errors.New("no rqlite connection")
-}
 ticker := time.NewTicker(1 * time.Second)
 defer ticker.Stop()

@@ -48,6 +43,16 @@ func (r *RQLiteManager) waitForSQLAvailable(ctx context.Context) error {
 case <-ctx.Done():
 return ctx.Err()
 case <-ticker.C:
+// Check for nil connection inside the loop to handle cases where
+// connection becomes nil during restart/recovery operations
+if r.connection == nil {
+attempts++
+if attempts%5 == 0 { // log every ~5s to reduce noise
+r.logger.Debug("Waiting for RQLite connection to be established")
+}
+continue
+}
 attempts++
 _, err := r.connection.QueryOne("SELECT 1")
 if err == nil {

@@ -417,6 +422,15 @@ func (r *RQLiteManager) establishLeadershipOrJoin(ctx context.Context, rqliteDat
 if err := r.clearRaftState(rqliteDataDir); err != nil {
 r.logger.Error("Failed to clear Raft state", zap.Error(err))
 } else {
+// Force write peers.json after clearing state
+if r.discoveryService != nil {
+r.logger.Info("Force writing peers.json after clearing state for configuration mismatch recovery")
+if err := r.discoveryService.ForceWritePeersJSON(); err != nil {
+r.logger.Error("Failed to force write peers.json", zap.Error(err))
+}
+// Update peersPath after force write
+peersPath = filepath.Join(rqliteDataDir, "raft", "peers.json")
+}
 // Restart RQLite with clean state
 r.logger.Info("Raft state cleared, restarting RQLite for clean rejoin")
 if recoveryErr := r.recoverCluster(ctx, peersPath); recoveryErr == nil {

@@ -1279,15 +1293,30 @@ func (r *RQLiteManager) recoverFromSplitBrain(ctx context.Context) error {
 return fmt.Errorf("failed to clear Raft state: %w", err)
 }
-// Step 5: Ensure peers.json exists with all discovered peers
-r.discoveryService.TriggerSync()
-time.Sleep(2 * time.Second)
+// Step 5: Refresh peer metadata and force write peers.json
+// We trigger peer exchange again to ensure we have the absolute latest metadata
+// after clearing state, then force write peers.json regardless of changes
+r.logger.Info("Refreshing peer metadata after clearing raft state")
+r.discoveryService.TriggerPeerExchange(ctx)
+time.Sleep(1 * time.Second) // Brief wait for peer exchange to complete
+
+r.logger.Info("Force writing peers.json with all discovered peers")
+// We use ForceWritePeersJSON instead of TriggerSync because TriggerSync
+// only writes if membership changed, but after clearing state we need
+// to write regardless of changes
+if err := r.discoveryService.ForceWritePeersJSON(); err != nil {
+return fmt.Errorf("failed to force write peers.json: %w", err)
+}
+
+// Verify peers.json was created
 peersPath := filepath.Join(rqliteDataDir, "raft", "peers.json")
 if _, err := os.Stat(peersPath); err != nil {
-return fmt.Errorf("peers.json not created: %w", err)
+return fmt.Errorf("peers.json not created after force write: %w", err)
 }
+r.logger.Info("peers.json verified after force write",
+zap.String("peers_path", peersPath))

 // Step 6: Restart RQLite to pick up new peers.json
 r.logger.Info("Restarting RQLite to apply new cluster configuration")
 if err := r.recoverCluster(ctx, peersPath); err != nil {

@@ -1435,6 +1464,14 @@ func (r *RQLiteManager) performPreStartClusterDiscovery(ctx context.Context, rql
 if err := r.clearRaftState(rqliteDataDir); err != nil {
 r.logger.Error("Failed to clear Raft state", zap.Error(err))
 // Continue anyway - rqlite might still be able to recover
+} else {
+// Force write peers.json after clearing stale state
+if r.discoveryService != nil {
+r.logger.Info("Force writing peers.json after clearing stale Raft state")
+if err := r.discoveryService.ForceWritePeersJSON(); err != nil {
+r.logger.Error("Failed to force write peers.json after clearing stale state", zap.Error(err))
+}
+}
 }
 }
} }

scripts/test-cluster-health.sh (new executable file, 379 lines)

#!/bin/bash
# Production Cluster Health Check Script
# Tests RQLite, IPFS, and IPFS Cluster connectivity and replication
# Note: We don't use 'set -e' here because we want to continue testing even if individual checks fail
# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color
# Node IPs - Update these if needed
BOOTSTRAP="${BOOTSTRAP:-51.83.128.181}"
NODE1="${NODE1:-57.128.223.92}"
NODE2="${NODE2:-185.185.83.89}"
ALL_NODES=($BOOTSTRAP $NODE1 $NODE2)
# Counters
PASSED=0
FAILED=0
WARNINGS=0
# Helper functions
print_header() {
echo ""
echo -e "${BLUE}========================================${NC}"
echo -e "${BLUE}$1${NC}"
echo -e "${BLUE}========================================${NC}"
}
print_test() {
echo -e "${YELLOW}$1${NC}"
}
print_pass() {
echo -e "${GREEN}$1${NC}"
PASSED=$((PASSED + 1))
}
print_fail() {
echo -e "${RED}$1${NC}"
FAILED=$((FAILED + 1))
}
print_warn() {
echo -e "${YELLOW}$1${NC}"
WARNINGS=$((WARNINGS + 1))
}
print_info() {
echo -e " $1"
}
# Test functions
test_rqlite_status() {
print_header "1. RQLITE CLUSTER STATUS"
local leader_found=false
local follower_count=0
local commit_indices=()
for i in "${!ALL_NODES[@]}"; do
local node="${ALL_NODES[$i]}"
print_test "Testing RQLite on $node"
if ! response=$(curl -s --max-time 5 http://$node:5001/status 2>/dev/null); then
print_fail "Cannot connect to RQLite on $node:5001"
continue
fi
local state=$(echo "$response" | jq -r '.store.raft.state // "unknown"')
local num_peers=$(echo "$response" | jq -r '.store.raft.num_peers // 0')
local commit_index=$(echo "$response" | jq -r '.store.raft.commit_index // 0')
local last_contact=$(echo "$response" | jq -r '.store.raft.last_contact // "N/A"')
local config=$(echo "$response" | jq -r '.store.raft.latest_configuration // "[]"')
local node_count=$(echo "$config" | grep -o "Address" | wc -l | tr -d ' ')
commit_indices+=($commit_index)
print_info "State: $state | Peers: $num_peers | Commit Index: $commit_index | Cluster Nodes: $node_count"
# Check state
if [ "$state" = "Leader" ]; then
leader_found=true
print_pass "Node $node is the Leader"
elif [ "$state" = "Follower" ]; then
follower_count=$((follower_count + 1))
# Check last contact
if [ "$last_contact" != "N/A" ] && [ "$last_contact" != "0" ]; then
print_pass "Node $node is a Follower (last contact: $last_contact)"
else
print_warn "Node $node is Follower but last_contact is $last_contact"
fi
else
print_fail "Node $node has unexpected state: $state"
fi
# Check peer count
if [ "$num_peers" = "2" ]; then
print_pass "Node $node has correct peer count: 2"
else
print_fail "Node $node has incorrect peer count: $num_peers (expected 2)"
fi
# Check cluster configuration
if [ "$node_count" = "3" ]; then
print_pass "Node $node sees all 3 cluster members"
else
print_fail "Node $node only sees $node_count cluster members (expected 3)"
fi
echo ""
done
# Check for exactly 1 leader
if [ "$leader_found" = true ] && [ "$follower_count" = "2" ]; then
print_pass "Cluster has 1 Leader and 2 Followers ✓"
else
print_fail "Invalid cluster state (Leader found: $leader_found, Followers: $follower_count)"
fi
# Check commit index sync
if [ ${#commit_indices[@]} -eq 3 ]; then
local first="${commit_indices[0]}"
local all_same=true
for idx in "${commit_indices[@]}"; do
if [ "$idx" != "$first" ]; then
all_same=false
break
fi
done
if [ "$all_same" = true ]; then
print_pass "All nodes have synced commit index: $first"
else
print_warn "Commit indices differ: ${commit_indices[*]} (might be normal if writes are happening)"
fi
fi
}
test_rqlite_replication() {
print_header "2. RQLITE REPLICATION TEST"
print_test "Creating test table and inserting data on leader ($BOOTSTRAP)"
# Create table
if ! response=$(curl -s --max-time 5 -XPOST "http://$BOOTSTRAP:5001/db/execute" \
-H "Content-Type: application/json" \
-d '[["CREATE TABLE IF NOT EXISTS test_cluster_health (id INTEGER PRIMARY KEY AUTOINCREMENT, timestamp TEXT, node TEXT, value TEXT)"]]' 2>/dev/null); then
print_fail "Failed to create table"
return
fi
if echo "$response" | jq -e '.results[0].error' >/dev/null 2>&1; then
local error=$(echo "$response" | jq -r '.results[0].error')
if [[ "$error" != "table test_cluster_health already exists" ]]; then
print_fail "Table creation error: $error"
return
fi
fi
print_pass "Table exists"
# Insert test data
local test_value="test_$(date +%s)"
if ! response=$(curl -s --max-time 5 -XPOST "http://$BOOTSTRAP:5001/db/execute" \
-H "Content-Type: application/json" \
-d "[
[\"INSERT INTO test_cluster_health (timestamp, node, value) VALUES (datetime('now'), 'bootstrap', '$test_value')\"]
]" 2>/dev/null); then
print_fail "Failed to insert data"
return
fi
if echo "$response" | jq -e '.results[0].error' >/dev/null 2>&1; then
local error=$(echo "$response" | jq -r '.results[0].error')
print_fail "Insert error: $error"
return
fi
print_pass "Data inserted: $test_value"
# Wait for replication
print_info "Waiting 2 seconds for replication..."
sleep 2
# Query from all nodes
for node in "${ALL_NODES[@]}"; do
print_test "Reading from $node"
if ! response=$(curl -s --max-time 5 -XPOST "http://$node:5001/db/query?level=weak" \
-H "Content-Type: application/json" \
-d "[\"SELECT * FROM test_cluster_health WHERE value = '$test_value' LIMIT 1\"]" 2>/dev/null); then
print_fail "Failed to query from $node"
continue
fi
if echo "$response" | jq -e '.results[0].error' >/dev/null 2>&1; then
local error=$(echo "$response" | jq -r '.results[0].error')
print_fail "Query error on $node: $error"
continue
fi
local row_count=$(echo "$response" | jq -r '.results[0].values | length // 0')
if [ "$row_count" = "1" ]; then
local retrieved_value=$(echo "$response" | jq -r '.results[0].values[0][3] // ""')
if [ "$retrieved_value" = "$test_value" ]; then
print_pass "Data replicated correctly to $node"
else
print_fail "Data mismatch on $node (got: $retrieved_value, expected: $test_value)"
fi
else
print_fail "Expected 1 row from $node, got $row_count"
fi
done
}
test_ipfs_status() {
print_header "3. IPFS DAEMON STATUS"
for node in "${ALL_NODES[@]}"; do
print_test "Testing IPFS on $node"
if ! response=$(curl -s --max-time 5 -X POST http://$node:4501/api/v0/id 2>/dev/null); then
print_fail "Cannot connect to IPFS on $node:4501"
continue
fi
local peer_id=$(echo "$response" | jq -r '.ID // "unknown"')
local addr_count=$(echo "$response" | jq -r '.Addresses | length // 0')
local agent=$(echo "$response" | jq -r '.AgentVersion // "unknown"')
if [ "$peer_id" != "unknown" ]; then
print_pass "IPFS running on $node (ID: ${peer_id:0:12}...)"
print_info "Agent: $agent | Addresses: $addr_count"
else
print_fail "IPFS not responding correctly on $node"
fi
done
}
test_ipfs_swarm() {
print_header "4. IPFS SWARM CONNECTIVITY"
for node in "${ALL_NODES[@]}"; do
print_test "Checking IPFS swarm peers on $node"
if ! response=$(curl -s --max-time 5 -X POST http://$node:4501/api/v0/swarm/peers 2>/dev/null); then
print_fail "Failed to get swarm peers from $node"
continue
fi
local peer_count=$(echo "$response" | jq -r '.Peers | length // 0')
if [ "$peer_count" = "2" ]; then
print_pass "Node $node connected to 2 IPFS peers"
elif [ "$peer_count" -gt "0" ]; then
print_warn "Node $node connected to $peer_count IPFS peers (expected 2)"
else
print_fail "Node $node has no IPFS swarm peers"
fi
done
}
test_ipfs_cluster_status() {
print_header "5. IPFS CLUSTER STATUS"
for node in "${ALL_NODES[@]}"; do
print_test "Testing IPFS Cluster on $node"
if ! response=$(curl -s --max-time 5 http://$node:9094/id 2>/dev/null); then
print_fail "Cannot connect to IPFS Cluster on $node:9094"
continue
fi
local cluster_id=$(echo "$response" | jq -r '.id // "unknown"')
local cluster_peers=$(echo "$response" | jq -r '.cluster_peers | length // 0')
local version=$(echo "$response" | jq -r '.version // "unknown"')
if [ "$cluster_id" != "unknown" ]; then
print_pass "IPFS Cluster running on $node (ID: ${cluster_id:0:12}...)"
print_info "Version: $version | Cluster Peers: $cluster_peers"
if [ "$cluster_peers" = "3" ]; then
print_pass "Node $node sees all 3 cluster peers"
else
print_warn "Node $node sees $cluster_peers cluster peers (expected 3)"
fi
else
print_fail "IPFS Cluster not responding correctly on $node"
fi
done
}
test_ipfs_cluster_pins() {
print_header "6. IPFS CLUSTER PIN CONSISTENCY"
local pin_counts=()
for node in "${ALL_NODES[@]}"; do
print_test "Checking pins on $node"
if ! response=$(curl -s --max-time 5 http://$node:9094/pins 2>/dev/null); then
print_fail "Failed to get pins from $node"
pin_counts+=(0)
continue
fi
local pin_count=$(echo "$response" | jq -r 'length // 0')
pin_counts+=($pin_count)
print_pass "Node $node has $pin_count pins"
done
# Check if all nodes have same pin count
if [ ${#pin_counts[@]} -eq 3 ]; then
local first="${pin_counts[0]}"
local all_same=true
for count in "${pin_counts[@]}"; do
if [ "$count" != "$first" ]; then
all_same=false
break
fi
done
if [ "$all_same" = true ]; then
print_pass "All nodes have consistent pin count: $first"
else
print_warn "Pin counts differ: ${pin_counts[*]} (might be syncing)"
fi
fi
}
print_summary() {
print_header "TEST SUMMARY"
echo ""
echo -e "${GREEN}Passed: $PASSED${NC}"
echo -e "${YELLOW}Warnings: $WARNINGS${NC}"
echo -e "${RED}Failed: $FAILED${NC}"
echo ""
if [ $FAILED -eq 0 ]; then
echo -e "${GREEN}🎉 All critical tests passed! Cluster is healthy.${NC}"
exit 0
elif [ $FAILED -le 2 ]; then
echo -e "${YELLOW}⚠️ Some tests failed. Review the output above.${NC}"
exit 1
else
echo -e "${RED}❌ Multiple failures detected. Cluster needs attention.${NC}"
exit 2
fi
}
# Main execution
main() {
echo ""
echo -e "${BLUE}╔════════════════════════════════════════════╗${NC}"
echo -e "${BLUE}║ DEBROS Production Cluster Health Check ║${NC}"
echo -e "${BLUE}╚════════════════════════════════════════════╝${NC}"
echo ""
echo "Testing cluster:"
echo " Bootstrap: $BOOTSTRAP"
echo " Node 1: $NODE1"
echo " Node 2: $NODE2"
test_rqlite_status
test_rqlite_replication
test_ipfs_status
test_ipfs_swarm
test_ipfs_cluster_status
test_ipfs_cluster_pins
print_summary
}
# Run main
main
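
The node IPs default to the production hosts listed at the top of the script but can be overridden through the environment, so the same checks can be pointed at another cluster, e.g.:

BOOTSTRAP=10.0.0.1 NODE1=10.0.0.2 NODE2=10.0.0.3 ./scripts/test-cluster-health.sh

The exit code is 0 when every check passes, 1 when one or two checks fail, and 2 when more than two fail.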