mirror of
https://github.com/DeBrosOfficial/orama.git
synced 2026-03-17 16:06:58 +00:00
245 lines
6.0 KiB
Go
245 lines
6.0 KiB
Go
package cluster
|
|
|
|
import (
|
|
"fmt"
|
|
"os"
|
|
)
|
|
|
|
// checkResult is one row of the cluster health report: a named check,
// its verdict, and a short explanation of that verdict.
type checkResult struct {
	// Name is the label of the check as shown in the report.
	Name string
	// Status is the verdict: "PASS", "FAIL", or "WARN".
	Status string
	// Detail is the free-form text printed next to the verdict.
	Detail string
}
|
|
|
|
// HandleHealth handles the "orama cluster health" command.
|
|
func HandleHealth(args []string) {
|
|
fmt.Printf("Cluster Health Check\n")
|
|
fmt.Printf("====================\n\n")
|
|
|
|
var results []checkResult
|
|
|
|
// Check 1: RQLite reachable
|
|
status, err := queryRQLiteStatus()
|
|
if err != nil {
|
|
results = append(results, checkResult{
|
|
Name: "RQLite reachable",
|
|
Status: "FAIL",
|
|
Detail: fmt.Sprintf("Cannot connect to RQLite: %v", err),
|
|
})
|
|
printHealthResults(results)
|
|
os.Exit(1)
|
|
return
|
|
}
|
|
results = append(results, checkResult{
|
|
Name: "RQLite reachable",
|
|
Status: "PASS",
|
|
Detail: fmt.Sprintf("HTTP API responding on %s", status.HTTP.Address),
|
|
})
|
|
|
|
// Check 2: Raft state is leader or follower (not candidate or shutdown)
|
|
raftState := status.Store.Raft.State
|
|
switch raftState {
|
|
case "Leader", "Follower":
|
|
results = append(results, checkResult{
|
|
Name: "Raft state healthy",
|
|
Status: "PASS",
|
|
Detail: fmt.Sprintf("Node is %s", raftState),
|
|
})
|
|
case "Candidate":
|
|
results = append(results, checkResult{
|
|
Name: "Raft state healthy",
|
|
Status: "WARN",
|
|
Detail: "Node is Candidate (election in progress)",
|
|
})
|
|
default:
|
|
results = append(results, checkResult{
|
|
Name: "Raft state healthy",
|
|
Status: "FAIL",
|
|
Detail: fmt.Sprintf("Node is in unexpected state: %s", raftState),
|
|
})
|
|
}
|
|
|
|
// Check 3: Leader exists
|
|
if status.Store.Raft.Leader != "" {
|
|
results = append(results, checkResult{
|
|
Name: "Leader exists",
|
|
Status: "PASS",
|
|
Detail: fmt.Sprintf("Leader: %s", status.Store.Raft.Leader),
|
|
})
|
|
} else {
|
|
results = append(results, checkResult{
|
|
Name: "Leader exists",
|
|
Status: "FAIL",
|
|
Detail: "No leader detected in Raft cluster",
|
|
})
|
|
}
|
|
|
|
// Check 4: Applied index is advancing (commit == applied means caught up)
|
|
if status.Store.Raft.AppliedIndex >= status.Store.Raft.CommitIndex {
|
|
results = append(results, checkResult{
|
|
Name: "Log replication",
|
|
Status: "PASS",
|
|
Detail: fmt.Sprintf("Applied index (%d) >= commit index (%d)",
|
|
status.Store.Raft.AppliedIndex, status.Store.Raft.CommitIndex),
|
|
})
|
|
} else {
|
|
lag := status.Store.Raft.CommitIndex - status.Store.Raft.AppliedIndex
|
|
severity := "WARN"
|
|
if lag > 1000 {
|
|
severity = "FAIL"
|
|
}
|
|
results = append(results, checkResult{
|
|
Name: "Log replication",
|
|
Status: severity,
|
|
Detail: fmt.Sprintf("Applied index (%d) behind commit index (%d) by %d entries",
|
|
status.Store.Raft.AppliedIndex, status.Store.Raft.CommitIndex, lag),
|
|
})
|
|
}
|
|
|
|
// Check 5: Query nodes to validate cluster membership
|
|
nodes, err := queryRQLiteNodes(true)
|
|
if err != nil {
|
|
results = append(results, checkResult{
|
|
Name: "Cluster nodes reachable",
|
|
Status: "FAIL",
|
|
Detail: fmt.Sprintf("Cannot query /nodes: %v", err),
|
|
})
|
|
} else {
|
|
totalNodes := len(nodes)
|
|
voters := 0
|
|
nonVoters := 0
|
|
reachable := 0
|
|
leaders := 0
|
|
|
|
for _, node := range nodes {
|
|
if node.Voter {
|
|
voters++
|
|
} else {
|
|
nonVoters++
|
|
}
|
|
if node.Reachable {
|
|
reachable++
|
|
}
|
|
if node.Leader {
|
|
leaders++
|
|
}
|
|
}
|
|
|
|
// Check 5a: Node count
|
|
results = append(results, checkResult{
|
|
Name: "Cluster membership",
|
|
Status: "PASS",
|
|
Detail: fmt.Sprintf("%d nodes (%d voters, %d non-voters)", totalNodes, voters, nonVoters),
|
|
})
|
|
|
|
// Check 5b: All nodes reachable
|
|
if reachable == totalNodes {
|
|
results = append(results, checkResult{
|
|
Name: "All nodes reachable",
|
|
Status: "PASS",
|
|
Detail: fmt.Sprintf("%d/%d nodes reachable", reachable, totalNodes),
|
|
})
|
|
} else {
|
|
unreachable := totalNodes - reachable
|
|
results = append(results, checkResult{
|
|
Name: "All nodes reachable",
|
|
Status: "WARN",
|
|
Detail: fmt.Sprintf("%d/%d nodes reachable (%d unreachable)", reachable, totalNodes, unreachable),
|
|
})
|
|
}
|
|
|
|
// Check 5c: Exactly one leader
|
|
if leaders == 1 {
|
|
results = append(results, checkResult{
|
|
Name: "Single leader",
|
|
Status: "PASS",
|
|
Detail: "Exactly 1 leader in cluster",
|
|
})
|
|
} else if leaders == 0 {
|
|
results = append(results, checkResult{
|
|
Name: "Single leader",
|
|
Status: "FAIL",
|
|
Detail: "No leader found among nodes",
|
|
})
|
|
} else {
|
|
results = append(results, checkResult{
|
|
Name: "Single leader",
|
|
Status: "FAIL",
|
|
Detail: fmt.Sprintf("Multiple leaders detected: %d (split-brain?)", leaders),
|
|
})
|
|
}
|
|
|
|
// Check 5d: Quorum check (majority of voters must be reachable)
|
|
quorum := (voters / 2) + 1
|
|
reachableVoters := 0
|
|
for _, node := range nodes {
|
|
if node.Voter && node.Reachable {
|
|
reachableVoters++
|
|
}
|
|
}
|
|
if reachableVoters >= quorum {
|
|
results = append(results, checkResult{
|
|
Name: "Quorum healthy",
|
|
Status: "PASS",
|
|
Detail: fmt.Sprintf("%d/%d voters reachable (quorum requires %d)", reachableVoters, voters, quorum),
|
|
})
|
|
} else {
|
|
results = append(results, checkResult{
|
|
Name: "Quorum healthy",
|
|
Status: "FAIL",
|
|
Detail: fmt.Sprintf("%d/%d voters reachable (quorum requires %d)", reachableVoters, voters, quorum),
|
|
})
|
|
}
|
|
}
|
|
|
|
printHealthResults(results)
|
|
|
|
// Exit with non-zero if any failures
|
|
for _, r := range results {
|
|
if r.Status == "FAIL" {
|
|
os.Exit(1)
|
|
}
|
|
}
|
|
}
|
|
|
|
// printHealthResults prints the health check results in a formatted table.
|
|
func printHealthResults(results []checkResult) {
|
|
// Find the longest check name for alignment
|
|
maxName := 0
|
|
for _, r := range results {
|
|
if len(r.Name) > maxName {
|
|
maxName = len(r.Name)
|
|
}
|
|
}
|
|
|
|
for _, r := range results {
|
|
indicator := " "
|
|
switch r.Status {
|
|
case "PASS":
|
|
indicator = "PASS"
|
|
case "FAIL":
|
|
indicator = "FAIL"
|
|
case "WARN":
|
|
indicator = "WARN"
|
|
}
|
|
|
|
fmt.Printf(" [%s] %-*s %s\n", indicator, maxName, r.Name, r.Detail)
|
|
}
|
|
fmt.Println()
|
|
|
|
// Summary
|
|
pass, fail, warn := 0, 0, 0
|
|
for _, r := range results {
|
|
switch r.Status {
|
|
case "PASS":
|
|
pass++
|
|
case "FAIL":
|
|
fail++
|
|
case "WARN":
|
|
warn++
|
|
}
|
|
}
|
|
fmt.Printf("Summary: %d passed, %d failed, %d warnings\n", pass, fail, warn)
|
|
}
|