feat: implement Raft state management for cluster recovery

- Added methods to check for existing Raft state and clear it if necessary, allowing for a clean join to the cluster.
- Enhanced automatic recovery logic to detect stale Raft state and clear it when peers have higher log indexes.
- Improved logging for Raft state operations to provide better visibility during cluster management.
This commit is contained in:
anonpenguin23 2025-11-10 08:51:33 +02:00
parent a72aebc1fe
commit 263fbbb8b4
No known key found for this signature in database
GPG Key ID: 1CBB1FE35AFBEE30
3 changed files with 101 additions and 1 deletions

View File

@ -13,6 +13,19 @@ The format is based on [Keep a Changelog][keepachangelog] and adheres to [Semant
### Deprecated ### Deprecated
### Fixed ### Fixed
## [0.63.3] - 2025-11-10
### Added

### Changed
- Improved RQLite cluster stability by automatically clearing stale Raft state on startup if peers have a higher log index, allowing the node to join cleanly.
### Deprecated
### Removed
### Fixed

## [0.63.2] - 2025-11-10 ## [0.63.2] - 2025-11-10
### Added ### Added

View File

@ -21,7 +21,7 @@ test-e2e:
.PHONY: build clean test run-node run-node2 run-node3 run-example deps tidy fmt vet lint clear-ports install-hooks kill .PHONY: build clean test run-node run-node2 run-node3 run-example deps tidy fmt vet lint clear-ports install-hooks kill
VERSION := 0.63.2 VERSION := 0.63.3
COMMIT ?= $(shell git rev-parse --short HEAD 2>/dev/null || echo unknown) COMMIT ?= $(shell git rev-parse --short HEAD 2>/dev/null || echo unknown)
DATE ?= $(shell date -u +%Y-%m-%dT%H:%M:%SZ) DATE ?= $(shell date -u +%Y-%m-%dT%H:%M:%SZ)
LDFLAGS := -X 'main.version=$(VERSION)' -X 'main.commit=$(COMMIT)' -X 'main.date=$(DATE)' LDFLAGS := -X 'main.version=$(VERSION)' -X 'main.commit=$(COMMIT)' -X 'main.date=$(DATE)'

View File

@ -774,6 +774,61 @@ func (r *RQLiteManager) checkNeedsClusterRecovery(rqliteDataDir string) (bool, e
return false, nil return false, nil
} }
// hasExistingRaftState reports whether rqliteDataDir already holds Raft state
// files. It returns true when raft.db is present and larger than 1 KiB, or when
// raft/peers.json is present at all. Any Stat failure is treated the same as
// the file being absent.
func (r *RQLiteManager) hasExistingRaftState(rqliteDataDir string) bool {
	// A raft.db of this size or smaller is not considered meaningful state.
	const minRaftDBSize = 1024

	dbInfo, dbErr := os.Stat(filepath.Join(rqliteDataDir, "raft.db"))
	if dbErr == nil && dbInfo.Size() > minRaftDBSize {
		return true
	}

	// For peers.json only existence matters; its size is not inspected.
	_, peersErr := os.Stat(filepath.Join(rqliteDataDir, "raft", "peers.json"))
	return peersErr == nil
}
// clearRaftState removes this node's Raft state files so it can join the
// cluster cleanly. It deletes raft.db and raft/peers.json under rqliteDataDir
// and then removes the raft directory if that leaves it empty. Nothing else in
// the data directory (for example db.sqlite) is touched.
//
// A file that is already missing is not an error. Any other removal failure is
// logged with the file it concerns, and the first such failure is returned so
// the caller can tell that the state was NOT fully cleared. The success message
// is only logged when every removal succeeded or was unnecessary.
func (r *RQLiteManager) clearRaftState(rqliteDataDir string) error {
	r.logger.Warn("Clearing Raft state to allow clean cluster join",
		zap.String("data_dir", rqliteDataDir))

	// firstErr remembers the first real failure; later steps still run so that
	// as much stale state as possible is removed.
	var firstErr error

	// Remove raft.db if it exists.
	raftLogPath := filepath.Join(rqliteDataDir, "raft.db")
	switch err := os.Remove(raftLogPath); {
	case err == nil:
		r.logger.Info("Removed raft.db")
	case !os.IsNotExist(err):
		r.logger.Warn("Failed to remove raft.db", zap.Error(err))
		firstErr = err
	}

	// Remove peers.json if it exists.
	peersPath := filepath.Join(rqliteDataDir, "raft", "peers.json")
	switch err := os.Remove(peersPath); {
	case err == nil:
		r.logger.Info("Removed peers.json")
	case !os.IsNotExist(err):
		r.logger.Warn("Failed to remove peers.json", zap.Error(err))
		if firstErr == nil {
			firstErr = err
		}
	}

	// Remove the raft directory only if it is now empty. This is best-effort
	// cleanup: a failure here leaves no Raft state behind, so it is not
	// reported to the caller.
	raftDir := filepath.Join(rqliteDataDir, "raft")
	if entries, err := os.ReadDir(raftDir); err == nil && len(entries) == 0 {
		if err := os.Remove(raftDir); err != nil {
			r.logger.Debug("Failed to remove empty raft directory", zap.Error(err))
		}
	}

	if firstErr != nil {
		// Do not claim success when stale state may still be on disk.
		return firstErr
	}

	r.logger.Info("Raft state cleared successfully - node will join as fresh follower")
	return nil
}
// performPreStartClusterDiscovery waits for peer discovery and builds a complete peers.json // performPreStartClusterDiscovery waits for peer discovery and builds a complete peers.json
// before starting RQLite. This ensures all nodes use the same cluster membership for recovery. // before starting RQLite. This ensures all nodes use the same cluster membership for recovery.
func (r *RQLiteManager) performPreStartClusterDiscovery(ctx context.Context, rqliteDataDir string) error { func (r *RQLiteManager) performPreStartClusterDiscovery(ctx context.Context, rqliteDataDir string) error {
@ -834,6 +889,38 @@ func (r *RQLiteManager) performPreStartClusterDiscovery(ctx context.Context, rql
return nil return nil
} }
// AUTOMATIC RECOVERY: Check if we have stale Raft state that conflicts with cluster
// If we have existing state but peers have higher log indexes, clear our state to allow clean join
allPeers := r.discoveryService.GetAllPeers()
hasExistingState := r.hasExistingRaftState(rqliteDataDir)
if hasExistingState {
// Find the highest log index among other peers (excluding ourselves)
maxPeerIndex := uint64(0)
for _, peer := range allPeers {
// Skip ourselves (compare by raft address)
if peer.NodeID == r.discoverConfig.RaftAdvAddress {
continue
}
if peer.RaftLogIndex > maxPeerIndex {
maxPeerIndex = peer.RaftLogIndex
}
}
// If peers have meaningful log history (> 0) and we have stale state, clear it
// This handles the case where we're starting with old state but the cluster has moved on
if maxPeerIndex > 0 {
r.logger.Warn("Detected stale Raft state - clearing to allow clean cluster join",
zap.Uint64("peer_max_log_index", maxPeerIndex),
zap.String("data_dir", rqliteDataDir))
if err := r.clearRaftState(rqliteDataDir); err != nil {
r.logger.Error("Failed to clear Raft state", zap.Error(err))
// Continue anyway - rqlite might still be able to recover
}
}
}
// Trigger final sync to ensure peers.json is up to date with latest discovered peers // Trigger final sync to ensure peers.json is up to date with latest discovered peers
r.logger.Info("Triggering final cluster membership sync to build complete peers.json") r.logger.Info("Triggering final cluster membership sync to build complete peers.json")
r.discoveryService.TriggerSync() r.discoveryService.TriggerSync()