mirror of https://github.com/DeBrosOfficial/network.git
synced 2025-12-11 09:18:50 +00:00
feat: implement Raft state management for cluster recovery
- Added methods to check for existing Raft state and clear it when necessary, allowing a clean join to the cluster.
- Enhanced the automatic recovery logic to detect stale Raft state and clear it when peers have higher log indexes.
- Improved logging for Raft state operations to provide better visibility during cluster management.
This commit is contained in:
parent a72aebc1fe
commit 263fbbb8b4
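In outline, the recovery path this commit adds composes as follows. This is a condensed sketch of the Go diff below, not the literal code; highestLogIndexAmongOtherPeers is a hypothetical stand-in for the inline peer scan, while hasExistingRaftState and clearRaftState are the real methods added here.

    // Condensed sketch of the pre-start recovery decision (see the diff below).
    // highestLogIndexAmongOtherPeers is a hypothetical stand-in for the inline
    // loop over discovered peers; the two methods are the real ones added here.
    if r.hasExistingRaftState(rqliteDataDir) {
        if maxPeerIndex := highestLogIndexAmongOtherPeers(); maxPeerIndex > 0 {
            // The cluster has real log history that conflicts with our stale
            // raft.db/peers.json: drop them and rejoin as a fresh follower.
            _ = r.clearRaftState(rqliteDataDir) // failures are logged, not fatal
        }
    }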
CHANGELOG.md: 13 additions
@@ -13,6 +13,19 @@ The format is based on [Keep a Changelog][keepachangelog] and adheres to [Semantic Versioning][semver].
 ### Deprecated
 
 ### Fixed
 
+## [0.63.3] - 2025-11-10
+
+### Added
+
+### Changed
+
+- Improved RQLite cluster stability by automatically clearing stale Raft state on startup if peers have a higher log index, allowing the node to join cleanly.
+
+### Deprecated
+
+### Removed
+
+### Fixed
+
 ## [0.63.2] - 2025-11-10
 
 ### Added
Makefile: 1 addition, 1 deletion

@@ -21,7 +21,7 @@ test-e2e:
 
 .PHONY: build clean test run-node run-node2 run-node3 run-example deps tidy fmt vet lint clear-ports install-hooks kill
 
-VERSION := 0.63.2
+VERSION := 0.63.3
 COMMIT ?= $(shell git rev-parse --short HEAD 2>/dev/null || echo unknown)
 DATE ?= $(shell date -u +%Y-%m-%dT%H:%M:%SZ)
 LDFLAGS := -X 'main.version=$(VERSION)' -X 'main.commit=$(COMMIT)' -X 'main.date=$(DATE)'
@@ -774,6 +774,61 @@ func (r *RQLiteManager) checkNeedsClusterRecovery(rqliteDataDir string) (bool, error) {
 	return false, nil
 }
 
+// hasExistingRaftState checks if this node has any existing Raft state files.
+// Returns true if raft.db exists and has content, or if peers.json exists.
+func (r *RQLiteManager) hasExistingRaftState(rqliteDataDir string) bool {
+	// Check for raft.db
+	raftLogPath := filepath.Join(rqliteDataDir, "raft.db")
+	if info, err := os.Stat(raftLogPath); err == nil {
+		// If raft.db exists and has meaningful content (> 1KB), we have state
+		if info.Size() > 1024 {
+			return true
+		}
+	}
+
+	// Check for peers.json
+	peersPath := filepath.Join(rqliteDataDir, "raft", "peers.json")
+	if _, err := os.Stat(peersPath); err == nil {
+		return true
+	}
+
+	return false
+}
+
+// clearRaftState safely removes Raft state files to allow a clean join.
+// This removes raft.db and peers.json but preserves db.sqlite.
+func (r *RQLiteManager) clearRaftState(rqliteDataDir string) error {
+	r.logger.Warn("Clearing Raft state to allow clean cluster join",
+		zap.String("data_dir", rqliteDataDir))
+
+	// Remove raft.db if it exists
+	raftLogPath := filepath.Join(rqliteDataDir, "raft.db")
+	if err := os.Remove(raftLogPath); err != nil && !os.IsNotExist(err) {
+		r.logger.Warn("Failed to remove raft.db", zap.Error(err))
+	} else if err == nil {
+		r.logger.Info("Removed raft.db")
+	}
+
+	// Remove peers.json if it exists
+	peersPath := filepath.Join(rqliteDataDir, "raft", "peers.json")
+	if err := os.Remove(peersPath); err != nil && !os.IsNotExist(err) {
+		r.logger.Warn("Failed to remove peers.json", zap.Error(err))
+	} else if err == nil {
+		r.logger.Info("Removed peers.json")
+	}
+
+	// Remove the raft directory if it is empty
+	raftDir := filepath.Join(rqliteDataDir, "raft")
+	if entries, err := os.ReadDir(raftDir); err == nil && len(entries) == 0 {
+		if err := os.Remove(raftDir); err != nil {
+			r.logger.Debug("Failed to remove empty raft directory", zap.Error(err))
+		}
+	}
+
+	r.logger.Info("Raft state cleared successfully - node will join as fresh follower")
+	return nil
+}
+
 // performPreStartClusterDiscovery waits for peer discovery and builds a complete peers.json
 // before starting RQLite. This ensures all nodes use the same cluster membership for recovery.
 func (r *RQLiteManager) performPreStartClusterDiscovery(ctx context.Context, rqliteDataDir string) error {
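To make the behavior of the two new helpers concrete, here is a minimal test sketch. It assumes RQLiteManager can be constructed with just its zap logger field (the real struct certainly has more fields) and exercises the contract documented above: a sub-1KB raft.db is ignored, peers.json alone counts as state, and db.sqlite survives a clear.

    package node // hypothetical package name for the sketch

    import (
    	"os"
    	"path/filepath"
    	"testing"

    	"go.uber.org/zap"
    )

    func TestRaftStateLifecycle(t *testing.T) {
    	dir := t.TempDir()
    	m := &RQLiteManager{logger: zap.NewNop()} // assumed field layout

    	// Empty data dir: no Raft state.
    	if m.hasExistingRaftState(dir) {
    		t.Fatal("expected no Raft state in empty dir")
    	}

    	// A raft.db under 1KB is treated as empty and ignored.
    	if err := os.WriteFile(filepath.Join(dir, "raft.db"), make([]byte, 512), 0o644); err != nil {
    		t.Fatal(err)
    	}
    	if m.hasExistingRaftState(dir) {
    		t.Fatal("sub-1KB raft.db should not count as state")
    	}

    	// peers.json alone counts as state.
    	if err := os.MkdirAll(filepath.Join(dir, "raft"), 0o755); err != nil {
    		t.Fatal(err)
    	}
    	if err := os.WriteFile(filepath.Join(dir, "raft", "peers.json"), []byte("[]"), 0o644); err != nil {
    		t.Fatal(err)
    	}
    	if !m.hasExistingRaftState(dir) {
    		t.Fatal("peers.json should count as state")
    	}

    	// clearRaftState removes raft.db and peers.json but preserves db.sqlite.
    	if err := os.WriteFile(filepath.Join(dir, "db.sqlite"), []byte("x"), 0o644); err != nil {
    		t.Fatal(err)
    	}
    	if err := m.clearRaftState(dir); err != nil {
    		t.Fatal(err)
    	}
    	if m.hasExistingRaftState(dir) {
    		t.Fatal("expected state to be cleared")
    	}
    	if _, err := os.Stat(filepath.Join(dir, "db.sqlite")); err != nil {
    		t.Fatal("db.sqlite should be preserved")
    	}
    }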
@@ -834,6 +889,38 @@ func (r *RQLiteManager) performPreStartClusterDiscovery(ctx context.Context, rqliteDataDir string) error {
 		return nil
 	}
 
+	// AUTOMATIC RECOVERY: Check if we have stale Raft state that conflicts with the cluster.
+	// If we have existing state but peers have higher log indexes, clear our state to allow a clean join.
+	allPeers := r.discoveryService.GetAllPeers()
+	hasExistingState := r.hasExistingRaftState(rqliteDataDir)
+
+	if hasExistingState {
+		// Find the highest log index among other peers (excluding ourselves)
+		maxPeerIndex := uint64(0)
+		for _, peer := range allPeers {
+			// Skip ourselves (compare by raft address)
+			if peer.NodeID == r.discoverConfig.RaftAdvAddress {
+				continue
+			}
+			if peer.RaftLogIndex > maxPeerIndex {
+				maxPeerIndex = peer.RaftLogIndex
+			}
+		}
+
+		// If peers have meaningful log history (> 0) and we have stale state, clear it.
+		// This handles the case where we're starting with old state but the cluster has moved on.
+		if maxPeerIndex > 0 {
+			r.logger.Warn("Detected stale Raft state - clearing to allow clean cluster join",
+				zap.Uint64("peer_max_log_index", maxPeerIndex),
+				zap.String("data_dir", rqliteDataDir))
+
+			if err := r.clearRaftState(rqliteDataDir); err != nil {
+				r.logger.Error("Failed to clear Raft state", zap.Error(err))
+				// Continue anyway - rqlite might still be able to recover
+			}
+		}
+	}
+
 	// Trigger final sync to ensure peers.json is up to date with latest discovered peers
 	r.logger.Info("Triggering final cluster membership sync to build complete peers.json")
 	r.discoveryService.TriggerSync()
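For concreteness, here is a runnable toy version of the peer scan in the hunk above. The peerInfo type mirrors only the two fields the loop actually reads (NodeID, RaftLogIndex); the real discovery type in this repo is assumed to be richer, and the addresses are made up.

    package main

    import "fmt"

    // peerInfo mirrors only the fields the recovery loop reads.
    type peerInfo struct {
    	NodeID       string
    	RaftLogIndex uint64
    }

    func main() {
    	self := "node-a:7001" // our own raft advertise address
    	peers := []peerInfo{
    		{NodeID: "node-a:7001", RaftLogIndex: 9999}, // ourselves: skipped
    		{NodeID: "node-b:7001", RaftLogIndex: 0},    // fresh node, no history
    		{NodeID: "node-c:7001", RaftLogIndex: 4215}, // established peer
    	}

    	maxPeerIndex := uint64(0)
    	for _, p := range peers {
    		if p.NodeID == self {
    			continue
    		}
    		if p.RaftLogIndex > maxPeerIndex {
    			maxPeerIndex = p.RaftLogIndex
    		}
    	}

    	// Prints 4215: the cluster has real history, so a node restarting
    	// with old raft.db/peers.json would clear them before starting rqlite.
    	fmt.Println("peer max log index:", maxPeerIndex)
    }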