mirror of
https://github.com/DeBrosOfficial/orama.git
synced 2026-03-17 13:16:58 +00:00
156 lines
5.1 KiB
Go
156 lines
5.1 KiB
Go
package checks
|
|
|
|
import (
	"fmt"
	"sort"

	"github.com/DeBrosOfficial/network/pkg/inspector"
)
|
|
|
|
func init() {
|
|
inspector.RegisterChecker("namespace", CheckNamespace)
|
|
}
|
|
|
|
// nsSub is the value this file passes as the third argument of every
// inspector.Pass, inspector.Warn and inspector.Fail call; presumably the
// subsystem label attached to each result (confirm against the inspector
// package).
const nsSub = "namespace"
|
|
|
|
// CheckNamespace runs all namespace-level health checks.
|
|
func CheckNamespace(data *inspector.ClusterData) []inspector.CheckResult {
|
|
var results []inspector.CheckResult
|
|
|
|
for _, nd := range data.Nodes {
|
|
if len(nd.Namespaces) == 0 {
|
|
continue
|
|
}
|
|
results = append(results, checkNamespacesPerNode(nd)...)
|
|
}
|
|
|
|
results = append(results, checkNamespacesCrossNode(data)...)
|
|
|
|
return results
|
|
}
|
|
|
|
func checkNamespacesPerNode(nd *inspector.NodeData) []inspector.CheckResult {
|
|
var r []inspector.CheckResult
|
|
node := nd.Node.Name()
|
|
|
|
for _, ns := range nd.Namespaces {
|
|
prefix := fmt.Sprintf("ns.%s", ns.Name)
|
|
|
|
// RQLite health
|
|
if ns.RQLiteUp {
|
|
r = append(r, inspector.Pass(prefix+".rqlite_up", fmt.Sprintf("Namespace %s RQLite responding", ns.Name), nsSub, node,
|
|
fmt.Sprintf("port_base=%d state=%s", ns.PortBase, ns.RQLiteState), inspector.Critical))
|
|
} else {
|
|
r = append(r, inspector.Fail(prefix+".rqlite_up", fmt.Sprintf("Namespace %s RQLite responding", ns.Name), nsSub, node,
|
|
fmt.Sprintf("port_base=%d not responding", ns.PortBase), inspector.Critical))
|
|
}
|
|
|
|
// RQLite Raft state
|
|
if ns.RQLiteUp {
|
|
switch ns.RQLiteState {
|
|
case "Leader", "Follower":
|
|
r = append(r, inspector.Pass(prefix+".rqlite_state", fmt.Sprintf("Namespace %s RQLite raft state valid", ns.Name), nsSub, node,
|
|
fmt.Sprintf("state=%s", ns.RQLiteState), inspector.Critical))
|
|
case "Candidate":
|
|
r = append(r, inspector.Warn(prefix+".rqlite_state", fmt.Sprintf("Namespace %s RQLite raft state valid", ns.Name), nsSub, node,
|
|
"state=Candidate (election in progress)", inspector.Critical))
|
|
default:
|
|
r = append(r, inspector.Fail(prefix+".rqlite_state", fmt.Sprintf("Namespace %s RQLite raft state valid", ns.Name), nsSub, node,
|
|
fmt.Sprintf("state=%s", ns.RQLiteState), inspector.Critical))
|
|
}
|
|
}
|
|
|
|
// RQLite readiness
|
|
if ns.RQLiteReady {
|
|
r = append(r, inspector.Pass(prefix+".rqlite_ready", fmt.Sprintf("Namespace %s RQLite ready", ns.Name), nsSub, node,
|
|
"/readyz OK", inspector.Critical))
|
|
} else if ns.RQLiteUp {
|
|
r = append(r, inspector.Fail(prefix+".rqlite_ready", fmt.Sprintf("Namespace %s RQLite ready", ns.Name), nsSub, node,
|
|
"/readyz failed", inspector.Critical))
|
|
}
|
|
|
|
// Olric health
|
|
if ns.OlricUp {
|
|
r = append(r, inspector.Pass(prefix+".olric_up", fmt.Sprintf("Namespace %s Olric port listening", ns.Name), nsSub, node,
|
|
"memberlist port bound", inspector.High))
|
|
} else {
|
|
r = append(r, inspector.Fail(prefix+".olric_up", fmt.Sprintf("Namespace %s Olric port listening", ns.Name), nsSub, node,
|
|
"memberlist port not bound", inspector.High))
|
|
}
|
|
|
|
// Gateway health
|
|
if ns.GatewayUp {
|
|
r = append(r, inspector.Pass(prefix+".gateway_up", fmt.Sprintf("Namespace %s Gateway responding", ns.Name), nsSub, node,
|
|
fmt.Sprintf("HTTP status=%d", ns.GatewayStatus), inspector.High))
|
|
} else {
|
|
r = append(r, inspector.Fail(prefix+".gateway_up", fmt.Sprintf("Namespace %s Gateway responding", ns.Name), nsSub, node,
|
|
fmt.Sprintf("HTTP status=%d", ns.GatewayStatus), inspector.High))
|
|
}
|
|
}
|
|
|
|
return r
|
|
}
|
|
|
|
func checkNamespacesCrossNode(data *inspector.ClusterData) []inspector.CheckResult {
|
|
var r []inspector.CheckResult
|
|
|
|
// Collect all namespace names across nodes
|
|
nsNodes := map[string]int{} // namespace name → count of nodes running it
|
|
nsHealthy := map[string]int{} // namespace name → count of nodes where all services are up
|
|
|
|
for _, nd := range data.Nodes {
|
|
for _, ns := range nd.Namespaces {
|
|
nsNodes[ns.Name]++
|
|
if ns.RQLiteUp && ns.OlricUp && ns.GatewayUp {
|
|
nsHealthy[ns.Name]++
|
|
}
|
|
}
|
|
}
|
|
|
|
for name, total := range nsNodes {
|
|
healthy := nsHealthy[name]
|
|
if healthy == total {
|
|
r = append(r, inspector.Pass(
|
|
fmt.Sprintf("ns.%s.all_healthy", name),
|
|
fmt.Sprintf("Namespace %s healthy on all nodes", name),
|
|
nsSub, "",
|
|
fmt.Sprintf("%d/%d nodes fully healthy", healthy, total),
|
|
inspector.Critical))
|
|
} else {
|
|
r = append(r, inspector.Fail(
|
|
fmt.Sprintf("ns.%s.all_healthy", name),
|
|
fmt.Sprintf("Namespace %s healthy on all nodes", name),
|
|
nsSub, "",
|
|
fmt.Sprintf("%d/%d nodes fully healthy", healthy, total),
|
|
inspector.Critical))
|
|
}
|
|
|
|
// Check namespace has quorum (>= N/2+1 RQLite instances)
|
|
rqliteUp := 0
|
|
for _, nd := range data.Nodes {
|
|
for _, ns := range nd.Namespaces {
|
|
if ns.Name == name && ns.RQLiteUp {
|
|
rqliteUp++
|
|
}
|
|
}
|
|
}
|
|
quorumNeeded := total/2 + 1
|
|
if rqliteUp >= quorumNeeded {
|
|
r = append(r, inspector.Pass(
|
|
fmt.Sprintf("ns.%s.quorum", name),
|
|
fmt.Sprintf("Namespace %s RQLite quorum", name),
|
|
nsSub, "",
|
|
fmt.Sprintf("rqlite_up=%d/%d quorum_needed=%d", rqliteUp, total, quorumNeeded),
|
|
inspector.Critical))
|
|
} else {
|
|
r = append(r, inspector.Fail(
|
|
fmt.Sprintf("ns.%s.quorum", name),
|
|
fmt.Sprintf("Namespace %s RQLite quorum", name),
|
|
nsSub, "",
|
|
fmt.Sprintf("rqlite_up=%d/%d quorum_needed=%d (QUORUM LOST)", rqliteUp, total, quorumNeeded),
|
|
inspector.Critical))
|
|
}
|
|
}
|
|
|
|
return r
|
|
}
|