mirror of
https://github.com/DeBrosOfficial/orama.git
synced 2026-03-17 23:06:58 +00:00
158 lines
5.3 KiB
Go
158 lines
5.3 KiB
Go
package checks
|
|
|
|
import (
|
|
"fmt"
|
|
|
|
"github.com/DeBrosOfficial/network/pkg/inspector"
|
|
)
|
|
|
|
func init() {
|
|
inspector.RegisterChecker("olric", CheckOlric)
|
|
}
|
|
|
|
// olricSub is the value passed as the third argument to every
// inspector.Pass/Warn/Fail call in this file (presumably the subsystem label).
const olricSub = "olric"
|
|
|
|
// CheckOlric runs all Olric health checks against cluster data.
|
|
func CheckOlric(data *inspector.ClusterData) []inspector.CheckResult {
|
|
var results []inspector.CheckResult
|
|
|
|
for _, nd := range data.Nodes {
|
|
if nd.Olric == nil {
|
|
continue
|
|
}
|
|
results = append(results, checkOlricPerNode(nd)...)
|
|
}
|
|
|
|
results = append(results, checkOlricCrossNode(data)...)
|
|
|
|
return results
|
|
}
|
|
|
|
func checkOlricPerNode(nd *inspector.NodeData) []inspector.CheckResult {
|
|
var r []inspector.CheckResult
|
|
ol := nd.Olric
|
|
node := nd.Node.Name()
|
|
|
|
// 2.1 Service active
|
|
if ol.ServiceActive {
|
|
r = append(r, inspector.Pass("olric.service_active", "Olric service active", olricSub, node,
|
|
"orama-olric is active", inspector.Critical))
|
|
} else {
|
|
r = append(r, inspector.Fail("olric.service_active", "Olric service active", olricSub, node,
|
|
"orama-olric is not active", inspector.Critical))
|
|
return r
|
|
}
|
|
|
|
// 2.7 Memberlist port accepting connections
|
|
if ol.MemberlistUp {
|
|
r = append(r, inspector.Pass("olric.memberlist_port", "Memberlist port 3322 listening", olricSub, node,
|
|
"TCP 3322 is bound", inspector.Critical))
|
|
} else {
|
|
r = append(r, inspector.Fail("olric.memberlist_port", "Memberlist port 3322 listening", olricSub, node,
|
|
"TCP 3322 is not listening", inspector.Critical))
|
|
}
|
|
|
|
// 2.3 Restart count
|
|
if ol.RestartCount == 0 {
|
|
r = append(r, inspector.Pass("olric.restarts", "Low restart count", olricSub, node,
|
|
"NRestarts=0", inspector.High))
|
|
} else if ol.RestartCount <= 3 {
|
|
r = append(r, inspector.Warn("olric.restarts", "Low restart count", olricSub, node,
|
|
fmt.Sprintf("NRestarts=%d", ol.RestartCount), inspector.High))
|
|
} else {
|
|
r = append(r, inspector.Fail("olric.restarts", "Low restart count", olricSub, node,
|
|
fmt.Sprintf("NRestarts=%d (crash-looping?)", ol.RestartCount), inspector.High))
|
|
}
|
|
|
|
// 2.4 Process memory
|
|
if ol.ProcessMemMB > 0 {
|
|
if ol.ProcessMemMB < 200 {
|
|
r = append(r, inspector.Pass("olric.memory", "Process memory healthy", olricSub, node,
|
|
fmt.Sprintf("RSS=%dMB", ol.ProcessMemMB), inspector.Medium))
|
|
} else if ol.ProcessMemMB < 500 {
|
|
r = append(r, inspector.Warn("olric.memory", "Process memory healthy", olricSub, node,
|
|
fmt.Sprintf("RSS=%dMB (elevated)", ol.ProcessMemMB), inspector.Medium))
|
|
} else {
|
|
r = append(r, inspector.Fail("olric.memory", "Process memory healthy", olricSub, node,
|
|
fmt.Sprintf("RSS=%dMB (high)", ol.ProcessMemMB), inspector.High))
|
|
}
|
|
}
|
|
|
|
// 2.9-2.11 Log analysis: suspects
|
|
if ol.LogSuspects == 0 {
|
|
r = append(r, inspector.Pass("olric.log_suspects", "No suspect/failed members in logs", olricSub, node,
|
|
"no suspect messages in last hour", inspector.Critical))
|
|
} else {
|
|
r = append(r, inspector.Fail("olric.log_suspects", "No suspect/failed members in logs", olricSub, node,
|
|
fmt.Sprintf("%d suspect/failed messages in last hour", ol.LogSuspects), inspector.Critical))
|
|
}
|
|
|
|
// 2.13 Flapping detection
|
|
if ol.LogFlapping < 5 {
|
|
r = append(r, inspector.Pass("olric.log_flapping", "No rapid join/leave cycles", olricSub, node,
|
|
fmt.Sprintf("join/leave events=%d in last hour", ol.LogFlapping), inspector.High))
|
|
} else {
|
|
r = append(r, inspector.Warn("olric.log_flapping", "No rapid join/leave cycles", olricSub, node,
|
|
fmt.Sprintf("join/leave events=%d in last hour (flapping?)", ol.LogFlapping), inspector.High))
|
|
}
|
|
|
|
// 2.39 Log error rate
|
|
if ol.LogErrors < 5 {
|
|
r = append(r, inspector.Pass("olric.log_errors", "Log error rate low", olricSub, node,
|
|
fmt.Sprintf("errors=%d in last hour", ol.LogErrors), inspector.High))
|
|
} else if ol.LogErrors < 20 {
|
|
r = append(r, inspector.Warn("olric.log_errors", "Log error rate low", olricSub, node,
|
|
fmt.Sprintf("errors=%d in last hour", ol.LogErrors), inspector.High))
|
|
} else {
|
|
r = append(r, inspector.Fail("olric.log_errors", "Log error rate low", olricSub, node,
|
|
fmt.Sprintf("errors=%d in last hour (high)", ol.LogErrors), inspector.High))
|
|
}
|
|
|
|
return r
|
|
}
|
|
|
|
func checkOlricCrossNode(data *inspector.ClusterData) []inspector.CheckResult {
|
|
var r []inspector.CheckResult
|
|
|
|
activeCount := 0
|
|
memberlistCount := 0
|
|
totalNodes := 0
|
|
|
|
for _, nd := range data.Nodes {
|
|
if nd.Olric == nil {
|
|
continue
|
|
}
|
|
totalNodes++
|
|
if nd.Olric.ServiceActive {
|
|
activeCount++
|
|
}
|
|
if nd.Olric.MemberlistUp {
|
|
memberlistCount++
|
|
}
|
|
}
|
|
|
|
if totalNodes < 2 {
|
|
return r
|
|
}
|
|
|
|
// All nodes have Olric running
|
|
if activeCount == totalNodes {
|
|
r = append(r, inspector.Pass("olric.all_active", "All nodes running Olric", olricSub, "",
|
|
fmt.Sprintf("%d/%d nodes active", activeCount, totalNodes), inspector.Critical))
|
|
} else {
|
|
r = append(r, inspector.Fail("olric.all_active", "All nodes running Olric", olricSub, "",
|
|
fmt.Sprintf("%d/%d nodes active", activeCount, totalNodes), inspector.Critical))
|
|
}
|
|
|
|
// All memberlist ports up
|
|
if memberlistCount == totalNodes {
|
|
r = append(r, inspector.Pass("olric.all_memberlist", "All memberlist ports listening", olricSub, "",
|
|
fmt.Sprintf("%d/%d nodes with memberlist", memberlistCount, totalNodes), inspector.High))
|
|
} else {
|
|
r = append(r, inspector.Warn("olric.all_memberlist", "All memberlist ports listening", olricSub, "",
|
|
fmt.Sprintf("%d/%d nodes with memberlist", memberlistCount, totalNodes), inspector.High))
|
|
}
|
|
|
|
return r
|
|
}
|