mirror of
https://github.com/DeBrosOfficial/orama.git
synced 2026-05-01 05:04:13 +00:00
feat(monitor): add vault health checks and reporting
- integrate vault into node alerts (service, responsive, status, restarts) - add vault report collection (systemd, logs, HTTP status) - update production CLI (clean, restart, stop, services) - add comprehensive unit tests for vault alerts
This commit is contained in:
parent
318eea33ae
commit
2017fcb432
@ -124,6 +124,7 @@ func DeriveAlerts(snap *ClusterSnapshot) []Alert {
|
||||
alerts = append(alerts, checkNodeNetwork(r, host)...)
|
||||
alerts = append(alerts, checkNodeOlric(r, host)...)
|
||||
alerts = append(alerts, checkNodeIPFS(r, host)...)
|
||||
alerts = append(alerts, checkNodeVault(r, host)...)
|
||||
alerts = append(alerts, checkNodeGateway(r, host)...)
|
||||
}
|
||||
|
||||
@ -866,6 +867,41 @@ func checkNodeIPFS(r *report.NodeReport, host string) []Alert {
|
||||
return alerts
|
||||
}
|
||||
|
||||
func checkNodeVault(r *report.NodeReport, host string) []Alert {
|
||||
if r.Vault == nil {
|
||||
return nil
|
||||
}
|
||||
var alerts []Alert
|
||||
|
||||
if !r.Vault.ServiceActive {
|
||||
alerts = append(alerts, Alert{AlertCritical, "vault", host, "Vault service not running"})
|
||||
return alerts
|
||||
}
|
||||
|
||||
if !r.Vault.Responsive {
|
||||
alerts = append(alerts, Alert{AlertWarning, "vault", host, "Vault not responding to health queries"})
|
||||
return alerts
|
||||
}
|
||||
|
||||
switch r.Vault.Status {
|
||||
case "unavailable":
|
||||
alerts = append(alerts, Alert{AlertCritical, "vault", host,
|
||||
fmt.Sprintf("Vault unavailable: %d/%d guardians healthy (need %d for reads)",
|
||||
r.Vault.Healthy, r.Vault.Guardians, r.Vault.Threshold)})
|
||||
case "degraded":
|
||||
alerts = append(alerts, Alert{AlertWarning, "vault", host,
|
||||
fmt.Sprintf("Vault degraded: %d/%d guardians healthy (need %d for writes)",
|
||||
r.Vault.Healthy, r.Vault.Guardians, r.Vault.WriteQuorum)})
|
||||
}
|
||||
|
||||
if r.Vault.RestartCount > 3 {
|
||||
alerts = append(alerts, Alert{AlertWarning, "vault", host,
|
||||
fmt.Sprintf("Vault restarted %d times", r.Vault.RestartCount)})
|
||||
}
|
||||
|
||||
return alerts
|
||||
}
|
||||
|
||||
func checkNodeGateway(r *report.NodeReport, host string) []Alert {
|
||||
if r.Gateway == nil {
|
||||
return nil
|
||||
|
||||
120
core/pkg/cli/monitor/alerts_vault_test.go
Normal file
120
core/pkg/cli/monitor/alerts_vault_test.go
Normal file
@ -0,0 +1,120 @@
|
||||
package monitor
|
||||
|
||||
import (
|
||||
"testing"
|
||||
|
||||
"github.com/DeBrosOfficial/network/pkg/cli/production/report"
|
||||
)
|
||||
|
||||
func TestCheckNodeVault_nil(t *testing.T) {
|
||||
r := &report.NodeReport{}
|
||||
alerts := checkNodeVault(r, "10.0.0.1")
|
||||
if len(alerts) != 0 {
|
||||
t.Errorf("expected 0 alerts for nil vault, got %d", len(alerts))
|
||||
}
|
||||
}
|
||||
|
||||
func TestCheckNodeVault_serviceInactive(t *testing.T) {
|
||||
r := &report.NodeReport{
|
||||
Vault: &report.VaultReport{ServiceActive: false},
|
||||
}
|
||||
alerts := checkNodeVault(r, "10.0.0.1")
|
||||
if len(alerts) != 1 {
|
||||
t.Fatalf("expected 1 alert, got %d", len(alerts))
|
||||
}
|
||||
if alerts[0].Severity != AlertCritical {
|
||||
t.Errorf("expected critical, got %s", alerts[0].Severity)
|
||||
}
|
||||
}
|
||||
|
||||
func TestCheckNodeVault_unresponsive(t *testing.T) {
|
||||
r := &report.NodeReport{
|
||||
Vault: &report.VaultReport{ServiceActive: true, Responsive: false},
|
||||
}
|
||||
alerts := checkNodeVault(r, "10.0.0.1")
|
||||
if len(alerts) != 1 {
|
||||
t.Fatalf("expected 1 alert, got %d", len(alerts))
|
||||
}
|
||||
if alerts[0].Severity != AlertWarning {
|
||||
t.Errorf("expected warning, got %s", alerts[0].Severity)
|
||||
}
|
||||
}
|
||||
|
||||
func TestCheckNodeVault_unavailable(t *testing.T) {
|
||||
r := &report.NodeReport{
|
||||
Vault: &report.VaultReport{
|
||||
ServiceActive: true,
|
||||
Responsive: true,
|
||||
Status: "unavailable",
|
||||
Guardians: 5,
|
||||
Healthy: 1,
|
||||
Threshold: 3,
|
||||
WriteQuorum: 4,
|
||||
},
|
||||
}
|
||||
alerts := checkNodeVault(r, "10.0.0.1")
|
||||
if len(alerts) != 1 {
|
||||
t.Fatalf("expected 1 alert, got %d", len(alerts))
|
||||
}
|
||||
if alerts[0].Severity != AlertCritical {
|
||||
t.Errorf("expected critical, got %s", alerts[0].Severity)
|
||||
}
|
||||
}
|
||||
|
||||
func TestCheckNodeVault_degraded(t *testing.T) {
|
||||
r := &report.NodeReport{
|
||||
Vault: &report.VaultReport{
|
||||
ServiceActive: true,
|
||||
Responsive: true,
|
||||
Status: "degraded",
|
||||
Guardians: 5,
|
||||
Healthy: 3,
|
||||
Threshold: 3,
|
||||
WriteQuorum: 4,
|
||||
},
|
||||
}
|
||||
alerts := checkNodeVault(r, "10.0.0.1")
|
||||
if len(alerts) != 1 {
|
||||
t.Fatalf("expected 1 alert, got %d", len(alerts))
|
||||
}
|
||||
if alerts[0].Severity != AlertWarning {
|
||||
t.Errorf("expected warning, got %s", alerts[0].Severity)
|
||||
}
|
||||
}
|
||||
|
||||
func TestCheckNodeVault_excessiveRestarts(t *testing.T) {
|
||||
r := &report.NodeReport{
|
||||
Vault: &report.VaultReport{
|
||||
ServiceActive: true,
|
||||
Responsive: true,
|
||||
Status: "healthy",
|
||||
RestartCount: 5,
|
||||
},
|
||||
}
|
||||
alerts := checkNodeVault(r, "10.0.0.1")
|
||||
if len(alerts) != 1 {
|
||||
t.Fatalf("expected 1 alert, got %d", len(alerts))
|
||||
}
|
||||
if alerts[0].Severity != AlertWarning {
|
||||
t.Errorf("expected warning, got %s", alerts[0].Severity)
|
||||
}
|
||||
}
|
||||
|
||||
func TestCheckNodeVault_healthy(t *testing.T) {
|
||||
r := &report.NodeReport{
|
||||
Vault: &report.VaultReport{
|
||||
ServiceActive: true,
|
||||
Responsive: true,
|
||||
Status: "healthy",
|
||||
Guardians: 5,
|
||||
Healthy: 5,
|
||||
Threshold: 3,
|
||||
WriteQuorum: 4,
|
||||
RestartCount: 0,
|
||||
},
|
||||
}
|
||||
alerts := checkNodeVault(r, "10.0.0.1")
|
||||
if len(alerts) != 0 {
|
||||
t.Errorf("expected 0 alerts for healthy vault, got %d", len(alerts))
|
||||
}
|
||||
}
|
||||
@ -133,7 +133,7 @@ func cleanNode(node inspector.Node, nuclear bool) error {
|
||||
%s
|
||||
|
||||
# Stop services
|
||||
for svc in caddy coredns orama-node orama-gateway orama-ipfs-cluster orama-ipfs orama-olric orama-anyone-relay orama-anyone-client; do
|
||||
for svc in caddy coredns orama-node orama-gateway orama-ipfs-cluster orama-ipfs orama-olric orama-vault orama-anyone-relay orama-anyone-client; do
|
||||
systemctl stop "$svc" 2>/dev/null
|
||||
systemctl disable "$svc" 2>/dev/null
|
||||
done
|
||||
|
||||
@ -53,6 +53,7 @@ func HandleRestartWithFlags(force bool) {
|
||||
{"orama-node"},
|
||||
{"orama-olric"},
|
||||
{"orama-ipfs-cluster", "orama-ipfs"},
|
||||
{"orama-vault"},
|
||||
{"orama-anyone-relay", "orama-anyone-client"},
|
||||
{"coredns", "caddy"},
|
||||
}
|
||||
|
||||
@ -55,8 +55,9 @@ func HandleStopWithFlags(force bool) {
|
||||
{"orama-node"}, // 1. Stop node (includes gateway + RQLite with leadership transfer)
|
||||
{"orama-olric"}, // 2. Stop cache
|
||||
{"orama-ipfs-cluster", "orama-ipfs"}, // 3. Stop storage
|
||||
{"orama-anyone-relay", "orama-anyone-client"}, // 4. Stop privacy relay
|
||||
{"coredns", "caddy"}, // 5. Stop DNS/TLS last
|
||||
{"orama-vault"}, // 4. Stop vault
|
||||
{"orama-anyone-relay", "orama-anyone-client"}, // 5. Stop privacy relay
|
||||
{"coredns", "caddy"}, // 6. Stop DNS/TLS last
|
||||
}
|
||||
|
||||
// Mask all services to immediately prevent Restart=always from reviving them.
|
||||
|
||||
@ -89,6 +89,7 @@ func collectProcesses() *ProcessReport {
|
||||
var managedServiceUnits = []string{
|
||||
"orama-node", "orama-olric",
|
||||
"orama-ipfs", "orama-ipfs-cluster",
|
||||
"orama-vault",
|
||||
"orama-anyone-relay", "orama-anyone-client",
|
||||
"coredns", "caddy", "rqlited",
|
||||
}
|
||||
|
||||
@ -71,6 +71,10 @@ func Handle(jsonFlag bool, version string) error {
|
||||
rpt.IPFS = collectIPFS()
|
||||
})
|
||||
|
||||
safeGo(&wg, "vault", func() {
|
||||
rpt.Vault = collectVault()
|
||||
})
|
||||
|
||||
safeGo(&wg, "gateway", func() {
|
||||
rpt.Gateway = collectGateway()
|
||||
})
|
||||
|
||||
@ -13,6 +13,7 @@ var coreServices = []string{
|
||||
"orama-olric",
|
||||
"orama-ipfs",
|
||||
"orama-ipfs-cluster",
|
||||
"orama-vault",
|
||||
"orama-anyone-relay",
|
||||
"orama-anyone-client",
|
||||
"coredns",
|
||||
|
||||
@ -17,6 +17,7 @@ type NodeReport struct {
|
||||
RQLite *RQLiteReport `json:"rqlite,omitempty"`
|
||||
Olric *OlricReport `json:"olric,omitempty"`
|
||||
IPFS *IPFSReport `json:"ipfs,omitempty"`
|
||||
Vault *VaultReport `json:"vault,omitempty"`
|
||||
Gateway *GatewayReport `json:"gateway,omitempty"`
|
||||
WireGuard *WireGuardReport `json:"wireguard,omitempty"`
|
||||
DNS *DNSReport `json:"dns,omitempty"`
|
||||
@ -150,6 +151,21 @@ type IPFSReport struct {
|
||||
BootstrapEmpty bool `json:"bootstrap_empty"`
|
||||
}
|
||||
|
||||
// --- Vault ---
|
||||
|
||||
// VaultReport captures the health of the local vault guardian service as
// observed by the report collector: systemd unit state, guardian quorum as
// reported over the gateway's HTTP endpoints, and recent operational
// counters.
type VaultReport struct {
	ServiceActive bool   `json:"service_active"`         // systemd unit reports "active"
	Responsive    bool   `json:"responsive"`             // HTTP status endpoint answered with parseable JSON
	Status        string `json:"status,omitempty"`       // "healthy", "degraded", "unavailable"
	Guardians     int    `json:"guardians,omitempty"`    // total guardians in the cluster
	Healthy       int    `json:"healthy,omitempty"`      // guardians currently healthy
	Threshold     int    `json:"threshold,omitempty"`    // healthy guardians required for reads
	WriteQuorum   int    `json:"write_quorum,omitempty"` // healthy guardians required for writes
	ProcessMemMB  int    `json:"process_mem_mb"`         // resident memory of the service, MiB (systemd MemoryCurrent)
	RestartCount  int    `json:"restart_count"`          // systemd NRestarts counter
	LogErrors     int    `json:"log_errors_1h"`          // error-matching journal lines in the last hour
}
|
||||
|
||||
// --- Gateway ---
|
||||
|
||||
type GatewayReport struct {
|
||||
|
||||
70
core/pkg/cli/production/report/vault.go
Normal file
70
core/pkg/cli/production/report/vault.go
Normal file
@ -0,0 +1,70 @@
|
||||
package report
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
func collectVault() *VaultReport {
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
|
||||
defer cancel()
|
||||
|
||||
r := &VaultReport{}
|
||||
|
||||
// 1. Service active
|
||||
if out, err := runCmd(ctx, "systemctl", "is-active", "orama-vault"); err == nil {
|
||||
r.ServiceActive = strings.TrimSpace(out) == "active"
|
||||
}
|
||||
|
||||
// 2. Restart count
|
||||
if out, err := runCmd(ctx, "systemctl", "show", "orama-vault", "--property=NRestarts"); err == nil {
|
||||
if parts := strings.SplitN(out, "=", 2); len(parts) == 2 {
|
||||
r.RestartCount, _ = strconv.Atoi(strings.TrimSpace(parts[1]))
|
||||
}
|
||||
}
|
||||
|
||||
// 3. Process memory
|
||||
if out, err := runCmd(ctx, "systemctl", "show", "orama-vault", "--property=MemoryCurrent"); err == nil {
|
||||
if parts := strings.SplitN(out, "=", 2); len(parts) == 2 {
|
||||
r.ProcessMemMB = parseMemoryMB(parts[1])
|
||||
}
|
||||
}
|
||||
|
||||
// 4. Log errors in last hour
|
||||
if out, err := runCmd(ctx, "bash", "-c",
|
||||
`journalctl -u orama-vault --no-pager -n 200 --since "1 hour ago" 2>/dev/null | grep -ciE "(error|ERR)" || echo 0`); err == nil {
|
||||
r.LogErrors, _ = strconv.Atoi(strings.TrimSpace(out))
|
||||
}
|
||||
|
||||
// 5. Query vault status via gateway (provides guardian health)
|
||||
if body, err := httpGet(ctx, "http://localhost:6001/v1/vault/status"); err == nil {
|
||||
var status struct {
|
||||
Guardians int `json:"guardians"`
|
||||
Healthy int `json:"healthy"`
|
||||
Threshold int `json:"threshold"`
|
||||
WriteQuorum int `json:"write_quorum"`
|
||||
}
|
||||
if json.Unmarshal(body, &status) == nil {
|
||||
r.Responsive = true
|
||||
r.Guardians = status.Guardians
|
||||
r.Healthy = status.Healthy
|
||||
r.Threshold = status.Threshold
|
||||
r.WriteQuorum = status.WriteQuorum
|
||||
}
|
||||
}
|
||||
|
||||
// 6. Query vault health status
|
||||
if body, err := httpGet(ctx, "http://localhost:6001/v1/vault/health"); err == nil {
|
||||
var health struct {
|
||||
Status string `json:"status"`
|
||||
}
|
||||
if json.Unmarshal(body, &health) == nil {
|
||||
r.Status = health.Status
|
||||
}
|
||||
}
|
||||
|
||||
return r
|
||||
}
|
||||
@ -17,6 +17,7 @@ func Handle() {
|
||||
"orama-ipfs-cluster",
|
||||
// Note: RQLite is managed by node process, not as separate service
|
||||
"orama-olric",
|
||||
"orama-vault",
|
||||
"orama-node",
|
||||
// Note: gateway is embedded in orama-node, no separate service
|
||||
}
|
||||
@ -26,6 +27,7 @@ func Handle() {
|
||||
"orama-ipfs": "IPFS Daemon",
|
||||
"orama-ipfs-cluster": "IPFS Cluster",
|
||||
"orama-olric": "Olric Cache Server",
|
||||
"orama-vault": "Vault Guardian",
|
||||
"orama-node": "Orama Node (includes RQLite + Gateway)",
|
||||
}
|
||||
|
||||
|
||||
@ -376,6 +376,7 @@ func (o *Orchestrator) stopServices() error {
|
||||
"orama-ipfs-cluster.service", // Depends on IPFS
|
||||
"orama-ipfs.service", // Base IPFS
|
||||
"orama-olric.service", // Independent
|
||||
"orama-vault.service", // Vault guardian
|
||||
"orama-anyone-client.service", // Client mode
|
||||
"orama-anyone-relay.service", // Relay mode
|
||||
}
|
||||
@ -683,6 +684,7 @@ func (o *Orchestrator) restartServices() error {
|
||||
"orama-olric", // Distributed cache
|
||||
"orama-ipfs", // IPFS daemon
|
||||
"orama-ipfs-cluster", // IPFS cluster
|
||||
"orama-vault", // Vault guardian
|
||||
"orama-gateway", // Gateway (legacy)
|
||||
"coredns", // DNS server
|
||||
"caddy", // Reverse proxy
|
||||
|
||||
@ -162,6 +162,7 @@ func GetProductionServices() []string {
|
||||
"orama-olric",
|
||||
"orama-ipfs-cluster",
|
||||
"orama-ipfs",
|
||||
"orama-vault",
|
||||
"orama-anyone-client",
|
||||
"orama-anyone-relay",
|
||||
}
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user