feat: implement Raft state management for cluster recovery

- Added methods to check for existing Raft state and clear it when necessary, allowing the node to join the cluster cleanly.
- Enhanced automatic recovery logic to detect stale Raft state and clear it when peers have higher log indexes.
- Improved logging for Raft state operations to provide better visibility during cluster management.
anonpenguin23 2025-11-10 08:51:33 +02:00
parent a72aebc1fe
commit 263fbbb8b4
3 changed files with 101 additions and 1 deletion

CHANGELOG.md

@@ -13,6 +13,19 @@ The format is based on [Keep a Changelog][keepachangelog] and adheres to [Semant
### Deprecated
### Fixed
## [0.63.3] - 2025-11-10
### Added
### Changed
- Improved RQLite cluster stability by automatically clearing stale Raft state on startup if peers have a higher log index, allowing the node to join cleanly.
### Deprecated
### Removed
### Fixed
## [0.63.2] - 2025-11-10
### Added

Makefile

@@ -21,7 +21,7 @@ test-e2e:
.PHONY: build clean test run-node run-node2 run-node3 run-example deps tidy fmt vet lint clear-ports install-hooks kill
VERSION := 0.63.2
VERSION := 0.63.3
COMMIT ?= $(shell git rev-parse --short HEAD 2>/dev/null || echo unknown)
DATE ?= $(shell date -u +%Y-%m-%dT%H:%M:%SZ)
LDFLAGS := -X 'main.version=$(VERSION)' -X 'main.commit=$(COMMIT)' -X 'main.date=$(DATE)'
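
Those -X flags only take effect if the main package declares matching string variables. The declarations are not part of this diff; a minimal sketch of what they plausibly look like (the default values here are assumptions):

```go
// Sketch of the main-package variables the -X flags above target; the
// actual declarations live elsewhere in this repo. The defaults apply
// when the binary is built without the Makefile's LDFLAGS.
package main

import "fmt"

var (
	version = "dev"     // overridden by -X 'main.version=...'
	commit  = "unknown" // overridden by -X 'main.commit=...'
	date    = "unknown" // overridden by -X 'main.date=...'
)

func main() {
	fmt.Printf("version %s, commit %s, built %s\n", version, commit, date)
}
```

A build target outside this hunk would then wire them in, e.g. `go build -ldflags "$(LDFLAGS)"`.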

RQLite manager (Go source)

@@ -774,6 +774,61 @@ func (r *RQLiteManager) checkNeedsClusterRecovery(rqliteDataDir string) (bool, error) {
	return false, nil
}

// hasExistingRaftState checks if this node has any existing Raft state files.
// Returns true if raft.db exists with meaningful content (> 1KB), or if peers.json exists.
func (r *RQLiteManager) hasExistingRaftState(rqliteDataDir string) bool {
	// Check for raft.db
	raftLogPath := filepath.Join(rqliteDataDir, "raft.db")
	if info, err := os.Stat(raftLogPath); err == nil {
		// If raft.db exists and has meaningful content (> 1KB), we have state
		if info.Size() > 1024 {
			return true
		}
	}

	// Check for peers.json
	peersPath := filepath.Join(rqliteDataDir, "raft", "peers.json")
	if _, err := os.Stat(peersPath); err == nil {
		return true
	}

	return false
}
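
As a sanity check on that 1KB threshold, here is a sketch of a test for it. `hasRaftStateFiles` is a hypothetical free-function port of the method above (the real one hangs off RQLiteManager), so it can run against a throwaway directory:

```go
package main

import (
	"os"
	"path/filepath"
	"testing"
)

// hasRaftStateFiles is a hypothetical standalone port of hasExistingRaftState
// above, so the checks can be exercised without an RQLiteManager.
func hasRaftStateFiles(dataDir string) bool {
	if info, err := os.Stat(filepath.Join(dataDir, "raft.db")); err == nil && info.Size() > 1024 {
		return true
	}
	if _, err := os.Stat(filepath.Join(dataDir, "raft", "peers.json")); err == nil {
		return true
	}
	return false
}

func TestRaftStateDetection(t *testing.T) {
	dir := t.TempDir()

	// An empty data directory has no state.
	if hasRaftStateFiles(dir) {
		t.Fatal("expected no Raft state in an empty dir")
	}

	// A raft.db at or under 1KB is treated as empty pre-allocation, not state.
	if err := os.WriteFile(filepath.Join(dir, "raft.db"), make([]byte, 512), 0o644); err != nil {
		t.Fatal(err)
	}
	if hasRaftStateFiles(dir) {
		t.Fatal("expected sub-1KB raft.db to be ignored")
	}

	// Past the threshold, the node counts as having state.
	if err := os.WriteFile(filepath.Join(dir, "raft.db"), make([]byte, 2048), 0o644); err != nil {
		t.Fatal(err)
	}
	if !hasRaftStateFiles(dir) {
		t.Fatal("expected >1KB raft.db to count as state")
	}
}
```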

// clearRaftState safely removes Raft state files to allow a clean join.
// It removes raft.db and peers.json but preserves db.sqlite.
func (r *RQLiteManager) clearRaftState(rqliteDataDir string) error {
	r.logger.Warn("Clearing Raft state to allow clean cluster join",
		zap.String("data_dir", rqliteDataDir))

	// Remove raft.db if it exists
	raftLogPath := filepath.Join(rqliteDataDir, "raft.db")
	if err := os.Remove(raftLogPath); err != nil && !os.IsNotExist(err) {
		r.logger.Warn("Failed to remove raft.db", zap.Error(err))
	} else if err == nil {
		r.logger.Info("Removed raft.db")
	}

	// Remove peers.json if it exists
	peersPath := filepath.Join(rqliteDataDir, "raft", "peers.json")
	if err := os.Remove(peersPath); err != nil && !os.IsNotExist(err) {
		r.logger.Warn("Failed to remove peers.json", zap.Error(err))
	} else if err == nil {
		r.logger.Info("Removed peers.json")
	}

	// Remove the raft directory if it is now empty
	raftDir := filepath.Join(rqliteDataDir, "raft")
	if entries, err := os.ReadDir(raftDir); err == nil && len(entries) == 0 {
		if err := os.Remove(raftDir); err != nil {
			r.logger.Debug("Failed to remove empty raft directory", zap.Error(err))
		}
	}

	r.logger.Info("Raft state cleared successfully - node will join as fresh follower")
	return nil
}
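
The "preserves db.sqlite" contract is easy to verify in isolation. As before, `clearRaftStateFiles` below is a hypothetical, logger-free port of the method above, not the method itself:

```go
package main

import (
	"os"
	"path/filepath"
	"testing"
)

// clearRaftStateFiles is a hypothetical free-function port of clearRaftState
// above, with logging stripped, so it can run in a plain test.
func clearRaftStateFiles(dataDir string) error {
	// Remove Raft state, ignoring "not exist" errors as the method does.
	for _, p := range []string{
		filepath.Join(dataDir, "raft.db"),
		filepath.Join(dataDir, "raft", "peers.json"),
	} {
		if err := os.Remove(p); err != nil && !os.IsNotExist(err) {
			return err
		}
	}
	// Drop the raft directory only if it is now empty.
	raftDir := filepath.Join(dataDir, "raft")
	if entries, err := os.ReadDir(raftDir); err == nil && len(entries) == 0 {
		_ = os.Remove(raftDir)
	}
	return nil
}

func TestClearPreservesSQLite(t *testing.T) {
	dir := t.TempDir()
	if err := os.MkdirAll(filepath.Join(dir, "raft"), 0o755); err != nil {
		t.Fatal(err)
	}
	for _, f := range []string{"raft.db", "db.sqlite", filepath.Join("raft", "peers.json")} {
		if err := os.WriteFile(filepath.Join(dir, f), make([]byte, 2048), 0o644); err != nil {
			t.Fatal(err)
		}
	}

	if err := clearRaftStateFiles(dir); err != nil {
		t.Fatal(err)
	}

	// Raft state is gone...
	if _, err := os.Stat(filepath.Join(dir, "raft.db")); !os.IsNotExist(err) {
		t.Fatal("raft.db should be removed")
	}
	// ...but the SQLite database file survives.
	if _, err := os.Stat(filepath.Join(dir, "db.sqlite")); err != nil {
		t.Fatal("db.sqlite should be preserved")
	}
}
```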
// performPreStartClusterDiscovery waits for peer discovery and builds a complete peers.json
// before starting RQLite. This ensures all nodes use the same cluster membership for recovery.
func (r *RQLiteManager) performPreStartClusterDiscovery(ctx context.Context, rqliteDataDir string) error {
@@ -834,6 +889,38 @@ func (r *RQLiteManager) performPreStartClusterDiscovery(ctx context.Context, rqliteDataDir string) error {
		return nil
	}

	// AUTOMATIC RECOVERY: check whether we hold stale Raft state that conflicts with the cluster.
	// If we have existing state but peers report higher log indexes, clear our state to allow a clean join.
	allPeers := r.discoveryService.GetAllPeers()
	hasExistingState := r.hasExistingRaftState(rqliteDataDir)

	if hasExistingState {
		// Find the highest log index among other peers (excluding ourselves)
		maxPeerIndex := uint64(0)
		for _, peer := range allPeers {
			// Skip ourselves (compared by Raft advertise address)
			if peer.NodeID == r.discoverConfig.RaftAdvAddress {
				continue
			}
			if peer.RaftLogIndex > maxPeerIndex {
				maxPeerIndex = peer.RaftLogIndex
			}
		}

		// If peers have meaningful log history (> 0) and we have stale state, clear it.
		// This handles the case where we start with old state but the cluster has moved on.
		if maxPeerIndex > 0 {
			r.logger.Warn("Detected stale Raft state - clearing to allow clean cluster join",
				zap.Uint64("peer_max_log_index", maxPeerIndex),
				zap.String("data_dir", rqliteDataDir))
			if err := r.clearRaftState(rqliteDataDir); err != nil {
				r.logger.Error("Failed to clear Raft state", zap.Error(err))
				// Continue anyway - rqlite might still be able to recover
			}
		}
	}

	// Trigger a final sync so peers.json reflects the latest discovered peers
	r.logger.Info("Triggering final cluster membership sync to build complete peers.json")
	r.discoveryService.TriggerSync()
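
The diff does not show what that sync ultimately writes, but rqlite's cluster-recovery mechanism reads a peers.json with id, address, and non_voter fields, at the same path the helpers above check and clear. A hedged sketch of producing such a file (the type and example entries are invented, not the repo's actual sync code):

```go
package main

import (
	"encoding/json"
	"os"
	"path/filepath"
)

// peerEntry mirrors one element of rqlite's recovery peers.json.
type peerEntry struct {
	ID       string `json:"id"`        // Raft node ID
	Address  string `json:"address"`   // Raft address, host:port
	NonVoter bool   `json:"non_voter"` // true for read-only nodes
}

// writePeersJSON places the file at <dataDir>/raft/peers.json, matching
// the paths used by hasExistingRaftState and clearRaftState above.
func writePeersJSON(rqliteDataDir string, peers []peerEntry) error {
	data, err := json.MarshalIndent(peers, "", "  ")
	if err != nil {
		return err
	}
	dir := filepath.Join(rqliteDataDir, "raft")
	if err := os.MkdirAll(dir, 0o755); err != nil {
		return err
	}
	return os.WriteFile(filepath.Join(dir, "peers.json"), data, 0o644)
}

func main() {
	// Invented example: a three-node cluster's membership.
	peers := []peerEntry{
		{ID: "node-1", Address: "10.0.0.1:4002"},
		{ID: "node-2", Address: "10.0.0.2:4002"},
		{ID: "node-3", Address: "10.0.0.3:4002"},
	}
	if err := writePeersJSON(os.TempDir(), peers); err != nil {
		panic(err)
	}
}
```

Because every node builds the file from the same discovered membership, a node that wiped its stale state rejoins with a peers.json consistent with the rest of the cluster.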