diff --git a/CHANGELOG.md b/CHANGELOG.md
index b6265c2..fbc66f7 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -13,6 +13,19 @@ The format is based on [Keep a Changelog][keepachangelog] and adheres to [Semant
 ### Deprecated
 
 ### Fixed
+## [0.63.3] - 2025-11-10
+
+### Added
+
+### Changed
+- Improved RQLite cluster stability by automatically clearing stale Raft state on startup if peers have a higher log index, allowing the node to join cleanly.
+
+### Deprecated
+
+### Removed
+
+### Fixed
+
 ## [0.63.2] - 2025-11-10
 
 ### Added
diff --git a/Makefile b/Makefile
index 133f095..5d725cb 100644
--- a/Makefile
+++ b/Makefile
@@ -21,7 +21,7 @@ test-e2e:
 
 .PHONY: build clean test run-node run-node2 run-node3 run-example deps tidy fmt vet lint clear-ports install-hooks kill
 
-VERSION := 0.63.2
+VERSION := 0.63.3
 COMMIT ?= $(shell git rev-parse --short HEAD 2>/dev/null || echo unknown)
 DATE ?= $(shell date -u +%Y-%m-%dT%H:%M:%SZ)
 LDFLAGS := -X 'main.version=$(VERSION)' -X 'main.commit=$(COMMIT)' -X 'main.date=$(DATE)'
diff --git a/pkg/rqlite/rqlite.go b/pkg/rqlite/rqlite.go
index 679d627..ba0880d 100644
--- a/pkg/rqlite/rqlite.go
+++ b/pkg/rqlite/rqlite.go
@@ -774,6 +774,74 @@ func (r *RQLiteManager) checkNeedsClusterRecovery(rqliteDataDir string) (bool, e
 	return false, nil
 }
 
+// hasExistingRaftState reports whether rqliteDataDir holds Raft state from a previous run.
+// It returns true if raft.db exists and is larger than 1KB, or if raft/peers.json exists.
+func (r *RQLiteManager) hasExistingRaftState(rqliteDataDir string) bool {
+	// Check for raft.db
+	raftLogPath := filepath.Join(rqliteDataDir, "raft.db")
+	if info, err := os.Stat(raftLogPath); err == nil {
+		// If raft.db exists and has meaningful content (> 1KB), we have state
+		if info.Size() > 1024 {
+			return true
+		}
+	}
+
+	// Check for peers.json
+	peersPath := filepath.Join(rqliteDataDir, "raft", "peers.json")
+	if _, err := os.Stat(peersPath); err == nil {
+		return true
+	}
+
+	return false
+}
+
+// clearRaftState removes raft.db and raft/peers.json from rqliteDataDir so the node can
+// join the cluster cleanly; db.sqlite is preserved. Files that are already absent are
+// not treated as errors. It returns the first removal error encountered, or nil when
+// every removal succeeded or had nothing to remove.
+func (r *RQLiteManager) clearRaftState(rqliteDataDir string) error {
+	r.logger.Warn("Clearing Raft state to allow clean cluster join",
+		zap.String("data_dir", rqliteDataDir))
+
+	// Remember the first failure so the caller learns the state was not fully cleared
+	var firstErr error
+
+	// Remove raft.db if it exists
+	raftLogPath := filepath.Join(rqliteDataDir, "raft.db")
+	if err := os.Remove(raftLogPath); err == nil {
+		r.logger.Info("Removed raft.db")
+	} else if !os.IsNotExist(err) {
+		r.logger.Warn("Failed to remove raft.db", zap.Error(err))
+		firstErr = err
+	}
+
+	// Remove peers.json if it exists
+	peersPath := filepath.Join(rqliteDataDir, "raft", "peers.json")
+	if err := os.Remove(peersPath); err == nil {
+		r.logger.Info("Removed peers.json")
+	} else if !os.IsNotExist(err) {
+		r.logger.Warn("Failed to remove peers.json", zap.Error(err))
+		if firstErr == nil {
+			firstErr = err
+		}
+	}
+
+	// Remove raft directory if it's empty
+	raftDir := filepath.Join(rqliteDataDir, "raft")
+	if entries, err := os.ReadDir(raftDir); err == nil && len(entries) == 0 {
+		if err := os.Remove(raftDir); err != nil {
+			r.logger.Debug("Failed to remove empty raft directory", zap.Error(err))
+		}
+	}
+
+	if firstErr != nil {
+		return firstErr
+	}
+
+	r.logger.Info("Raft state cleared successfully - node will join as fresh follower")
+	return nil
+}
+
 // performPreStartClusterDiscovery waits for peer discovery and builds a complete peers.json
 // before starting RQLite. This ensures all nodes use the same cluster membership for recovery.
 func (r *RQLiteManager) performPreStartClusterDiscovery(ctx context.Context, rqliteDataDir string) error {
@@ -834,6 +902,42 @@ func (r *RQLiteManager) performPreStartClusterDiscovery(ctx context.Context, rql
 		return nil
 	}
 
+	// AUTOMATIC RECOVERY: Check if we have stale Raft state that conflicts with cluster
+	// If we have existing state but peers have a higher log index than ours, clear our state to allow clean join
+	allPeers := r.discoveryService.GetAllPeers()
+	hasExistingState := r.hasExistingRaftState(rqliteDataDir)
+
+	if hasExistingState {
+		// Record our own advertised log index and the highest index among the other peers
+		var localIndex, maxPeerIndex uint64
+		for _, peer := range allPeers {
+			// Our own entry (matched by raft address) supplies the local index
+			if peer.NodeID == r.discoverConfig.RaftAdvAddress {
+				localIndex = peer.RaftLogIndex
+				continue
+			}
+			if peer.RaftLogIndex > maxPeerIndex {
+				maxPeerIndex = peer.RaftLogIndex
+			}
+		}
+
+		// Only clear when some peer is strictly ahead of us; a node that is already up to date
+		// keeps its Raft state. If our own entry is absent or reports 0, any peer with log
+		// history (> 0) still triggers the clear.
+		// NOTE(review): assumes our own peer entry carries this node's log index - confirm
+		if maxPeerIndex > localIndex {
+			r.logger.Warn("Detected stale Raft state - clearing to allow clean cluster join",
+				zap.Uint64("peer_max_log_index", maxPeerIndex),
+				zap.Uint64("local_log_index", localIndex),
+				zap.String("data_dir", rqliteDataDir))
+
+			if err := r.clearRaftState(rqliteDataDir); err != nil {
+				r.logger.Error("Failed to clear Raft state", zap.Error(err))
+				// Continue anyway - rqlite might still be able to recover
+			}
+		}
+	}
+
 	// Trigger final sync to ensure peers.json is up to date with latest discovered peers
 	r.logger.Info("Triggering final cluster membership sync to build complete peers.json")
 	r.discoveryService.TriggerSync()