network/pkg/rqlite/data_safety.go
anonpenguin23 239fb2084b
feat: enhance Raft log index retrieval and data directory management
- Improved the `getRaftLogIndex` method to accurately report the Raft log index by incorporating fallback logic to read persisted snapshot metadata when the RQLite status is unavailable or returns zero.
- Added a new method `getPersistedRaftLogIndex` to read the highest Raft log index from snapshot metadata files, ensuring accurate reporting even before RQLite starts.
- Centralized the data directory path resolution logic in `rqliteDataDirPath`, simplifying the codebase and enhancing maintainability.
2025-11-10 16:24:05 +02:00

195 lines
5.3 KiB
Go

package rqlite
import (
"encoding/json"
"fmt"
"io"
"net/http"
"os"
"path/filepath"
"time"
"go.uber.org/zap"
)
// getRaftLogIndex reports the best-known Raft log index for this node.
//
// The runtime view from getRQLiteStatus is preferred: the result is the
// largest of the last-log, applied and commit indexes it reports. When the
// status call fails, or when every runtime index is zero, the value from
// getPersistedRaftLogIndex (snapshot metadata on disk) is used instead.
// Returns 0 when neither source yields a non-zero index.
func (r *RQLiteManager) getRaftLogIndex() uint64 {
	status, statusErr := r.getRQLiteStatus()

	// Status endpoint unavailable: the on-disk snapshot metadata is the only
	// remaining source.
	if statusErr != nil {
		fromDisk := r.getPersistedRaftLogIndex()
		if fromDisk == 0 {
			r.logger.Debug("Failed to get Raft log index", zap.Error(statusErr))
			return 0
		}
		r.logger.Debug("Using persisted Raft log index before RQLite is reachable",
			zap.Uint64("persisted_index", fromDisk),
			zap.Error(statusErr))
		return fromDisk
	}

	// Pick the largest index the running instance reports.
	highest := status.Store.Raft.CommitIndex
	if applied := status.Store.Raft.AppliedIndex; applied > highest {
		highest = applied
	}
	if lastLog := status.Store.Raft.LastLogIndex; lastLog > highest {
		highest = lastLog
	}
	if highest != 0 {
		return highest
	}

	// The status call succeeded but every index was zero; consult the
	// persisted snapshot metadata before giving up.
	fromDisk := r.getPersistedRaftLogIndex()
	if fromDisk == 0 {
		return 0
	}
	r.logger.Debug("Using persisted Raft log index because runtime status reported zero",
		zap.Uint64("persisted_index", fromDisk))
	return fromDisk
}
// getPersistedRaftLogIndex scans the "rsnapshots" directory beneath the
// RQLite data directory and returns the largest "Index" value found in any
// sub-directory's meta.json file.
//
// The scan is best-effort: if the data directory cannot be resolved or the
// snapshots directory cannot be listed, the result is 0, and any entry that
// is not a directory, has no readable meta.json, or holds invalid JSON is
// skipped without reporting an error.
func (r *RQLiteManager) getPersistedRaftLogIndex() uint64 {
	dataDir, err := r.rqliteDataDirPath()
	if err != nil {
		return 0
	}

	snapshotRoot := filepath.Join(dataDir, "rsnapshots")
	dirEntries, err := os.ReadDir(snapshotRoot)
	if err != nil {
		return 0
	}

	// Only the Index field of the snapshot metadata is of interest here.
	type snapshotMeta struct {
		Index uint64 `json:"Index"`
	}

	var highest uint64
	for _, de := range dirEntries {
		// Plain files directly inside the snapshots directory are ignored.
		if !de.IsDir() {
			continue
		}

		data, readErr := os.ReadFile(filepath.Join(snapshotRoot, de.Name(), "meta.json"))
		if readErr != nil {
			continue
		}

		var m snapshotMeta
		if json.Unmarshal(data, &m) != nil {
			continue
		}

		if highest < m.Index {
			highest = m.Index
		}
	}
	return highest
}
// getRQLiteStatus performs an HTTP GET against /status on localhost at the
// configured RQLite port (5 second client timeout) and decodes the JSON
// response into an RQLiteStatus.
//
// An error is returned when the request fails, when the response status is
// not 200 OK (the error text carries the status code and response body), or
// when the body cannot be decoded.
func (r *RQLiteManager) getRQLiteStatus() (*RQLiteStatus, error) {
	httpClient := http.Client{Timeout: 5 * time.Second}
	endpoint := fmt.Sprintf("http://localhost:%d/status", r.config.RQLitePort)

	resp, err := httpClient.Get(endpoint)
	if err != nil {
		return nil, fmt.Errorf("failed to query status: %w", err)
	}
	defer resp.Body.Close()

	if resp.StatusCode == http.StatusOK {
		parsed := new(RQLiteStatus)
		if decodeErr := json.NewDecoder(resp.Body).Decode(parsed); decodeErr != nil {
			return nil, fmt.Errorf("failed to decode status: %w", decodeErr)
		}
		return parsed, nil
	}

	// Non-200: include whatever part of the body could be read in the error;
	// a read failure here is intentionally ignored.
	payload, _ := io.ReadAll(resp.Body)
	return nil, fmt.Errorf("status endpoint returned %d: %s", resp.StatusCode, string(payload))
}
// getRQLiteNodes performs an HTTP GET against /nodes?ver=2 on localhost at
// the configured RQLite port (5 second client timeout) and returns the
// decoded node list.
//
// Two response shapes are accepted: a JSON object with a top-level "nodes"
// field, and, failing that, the bare value decoded directly as RQLiteNodes.
// An error is returned when the request fails, the status is not 200 OK, the
// body cannot be read, or neither shape decodes.
func (r *RQLiteManager) getRQLiteNodes() (RQLiteNodes, error) {
	httpClient := http.Client{Timeout: 5 * time.Second}
	endpoint := fmt.Sprintf("http://localhost:%d/nodes?ver=2", r.config.RQLitePort)

	resp, err := httpClient.Get(endpoint)
	if err != nil {
		return nil, fmt.Errorf("failed to query nodes: %w", err)
	}
	defer resp.Body.Close()

	// The body is needed on both the error and the success path, so read it
	// once up front. A read failure only matters for a 200 response; for any
	// other status the (possibly partial) body is used purely as error text.
	payload, readErr := io.ReadAll(resp.Body)
	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("nodes endpoint returned %d: %s", resp.StatusCode, string(payload))
	}
	if readErr != nil {
		return nil, fmt.Errorf("failed to read nodes response: %w", readErr)
	}

	// Preferred shape: {"nodes": ...}. It is accepted only if it parses and
	// actually yields a non-nil node list.
	var envelope struct {
		Nodes RQLiteNodes `json:"nodes"`
	}
	if json.Unmarshal(payload, &envelope) == nil && envelope.Nodes != nil {
		return envelope.Nodes, nil
	}

	// Fallback shape: the payload itself is the node list.
	var legacy RQLiteNodes
	if decodeErr := json.Unmarshal(payload, &legacy); decodeErr != nil {
		return nil, fmt.Errorf("failed to decode nodes: %w", decodeErr)
	}
	return legacy, nil
}
// getRQLiteLeader returns the leader address reported by getRQLiteStatus.
// Any error from the status query is passed through unchanged; an empty
// leader address is reported as a "no leader found" error.
func (r *RQLiteManager) getRQLiteLeader() (string, error) {
	status, err := r.getRQLiteStatus()
	if err != nil {
		return "", err
	}

	if addr := status.Store.Raft.LeaderAddr; addr != "" {
		return addr, nil
	}
	return "", fmt.Errorf("no leader found")
}
// isNodeReachable reports whether an HTTP GET of http://<httpAddress>/status
// completes within 3 seconds and answers with 200 OK. Request failures and
// any other status code both yield false.
func (r *RQLiteManager) isNodeReachable(httpAddress string) bool {
	probe := http.Client{Timeout: 3 * time.Second}

	resp, err := probe.Get("http://" + httpAddress + "/status")
	if err != nil {
		return false
	}
	defer resp.Body.Close()

	// Only the status code matters; the body is never read.
	return resp.StatusCode == http.StatusOK
}