feat(monitor): add vault health checks and reporting

- integrate vault into node alerts (service, responsive, status, restarts)
- add vault report collection (systemd, logs, HTTP status)
- update production CLI (clean, restart, stop, services)
- add comprehensive unit tests for vault alerts
This commit is contained in:
anonpenguin23 2026-03-27 14:52:41 +02:00
parent 318eea33ae
commit 2017fcb432
13 changed files with 258 additions and 3 deletions

View File

@ -124,6 +124,7 @@ func DeriveAlerts(snap *ClusterSnapshot) []Alert {
alerts = append(alerts, checkNodeNetwork(r, host)...)
alerts = append(alerts, checkNodeOlric(r, host)...)
alerts = append(alerts, checkNodeIPFS(r, host)...)
alerts = append(alerts, checkNodeVault(r, host)...)
alerts = append(alerts, checkNodeGateway(r, host)...)
}
@ -866,6 +867,41 @@ func checkNodeIPFS(r *report.NodeReport, host string) []Alert {
return alerts
}
// checkNodeVault derives alerts from a node's vault report. Checks escalate
// in order: a stopped service or an unresponsive daemon short-circuits the
// rest, then guardian quorum status and excessive restarts are reported.
func checkNodeVault(r *report.NodeReport, host string) []Alert {
	v := r.Vault
	if v == nil {
		return nil
	}

	// A stopped service makes every other probe meaningless.
	if !v.ServiceActive {
		return []Alert{{AlertCritical, "vault", host, "Vault service not running"}}
	}

	// The daemon is up but not answering; quorum data would be stale.
	if !v.Responsive {
		return []Alert{{AlertWarning, "vault", host, "Vault not responding to health queries"}}
	}

	var out []Alert
	switch v.Status {
	case "unavailable":
		out = append(out, Alert{AlertCritical, "vault", host,
			fmt.Sprintf("Vault unavailable: %d/%d guardians healthy (need %d for reads)",
				v.Healthy, v.Guardians, v.Threshold)})
	case "degraded":
		out = append(out, Alert{AlertWarning, "vault", host,
			fmt.Sprintf("Vault degraded: %d/%d guardians healthy (need %d for writes)",
				v.Healthy, v.Guardians, v.WriteQuorum)})
	}

	// Frequent restarts suggest crash-looping even if currently healthy.
	if v.RestartCount > 3 {
		out = append(out, Alert{AlertWarning, "vault", host,
			fmt.Sprintf("Vault restarted %d times", v.RestartCount)})
	}
	return out
}
func checkNodeGateway(r *report.NodeReport, host string) []Alert {
if r.Gateway == nil {
return nil

View File

@ -0,0 +1,120 @@
package monitor
import (
"testing"
"github.com/DeBrosOfficial/network/pkg/cli/production/report"
)
func TestCheckNodeVault_nil(t *testing.T) {
r := &report.NodeReport{}
alerts := checkNodeVault(r, "10.0.0.1")
if len(alerts) != 0 {
t.Errorf("expected 0 alerts for nil vault, got %d", len(alerts))
}
}
// TestCheckNodeVault_serviceInactive verifies an inactive service is critical.
func TestCheckNodeVault_serviceInactive(t *testing.T) {
	node := &report.NodeReport{Vault: &report.VaultReport{ServiceActive: false}}
	got := checkNodeVault(node, "10.0.0.1")
	if len(got) != 1 {
		t.Fatalf("expected 1 alert, got %d", len(got))
	}
	if sev := got[0].Severity; sev != AlertCritical {
		t.Errorf("expected critical, got %s", sev)
	}
}
// TestCheckNodeVault_unresponsive verifies an unresponsive vault is a warning.
func TestCheckNodeVault_unresponsive(t *testing.T) {
	node := &report.NodeReport{
		Vault: &report.VaultReport{ServiceActive: true, Responsive: false},
	}
	got := checkNodeVault(node, "10.0.0.1")
	if len(got) != 1 {
		t.Fatalf("expected 1 alert, got %d", len(got))
	}
	if sev := got[0].Severity; sev != AlertWarning {
		t.Errorf("expected warning, got %s", sev)
	}
}
// TestCheckNodeVault_unavailable verifies loss of read quorum is critical.
func TestCheckNodeVault_unavailable(t *testing.T) {
	vault := &report.VaultReport{
		ServiceActive: true,
		Responsive:    true,
		Status:        "unavailable",
		Guardians:     5,
		Healthy:       1,
		Threshold:     3,
		WriteQuorum:   4,
	}
	got := checkNodeVault(&report.NodeReport{Vault: vault}, "10.0.0.1")
	if len(got) != 1 {
		t.Fatalf("expected 1 alert, got %d", len(got))
	}
	if sev := got[0].Severity; sev != AlertCritical {
		t.Errorf("expected critical, got %s", sev)
	}
}
// TestCheckNodeVault_degraded verifies loss of write quorum is a warning.
func TestCheckNodeVault_degraded(t *testing.T) {
	vault := &report.VaultReport{
		ServiceActive: true,
		Responsive:    true,
		Status:        "degraded",
		Guardians:     5,
		Healthy:       3,
		Threshold:     3,
		WriteQuorum:   4,
	}
	got := checkNodeVault(&report.NodeReport{Vault: vault}, "10.0.0.1")
	if len(got) != 1 {
		t.Fatalf("expected 1 alert, got %d", len(got))
	}
	if sev := got[0].Severity; sev != AlertWarning {
		t.Errorf("expected warning, got %s", sev)
	}
}
// TestCheckNodeVault_excessiveRestarts verifies a high restart count warns
// even when the vault is otherwise healthy.
func TestCheckNodeVault_excessiveRestarts(t *testing.T) {
	vault := &report.VaultReport{
		ServiceActive: true,
		Responsive:    true,
		Status:        "healthy",
		RestartCount:  5,
	}
	got := checkNodeVault(&report.NodeReport{Vault: vault}, "10.0.0.1")
	if len(got) != 1 {
		t.Fatalf("expected 1 alert, got %d", len(got))
	}
	if sev := got[0].Severity; sev != AlertWarning {
		t.Errorf("expected warning, got %s", sev)
	}
}
// TestCheckNodeVault_healthy verifies a fully healthy vault raises nothing.
func TestCheckNodeVault_healthy(t *testing.T) {
	vault := &report.VaultReport{
		ServiceActive: true,
		Responsive:    true,
		Status:        "healthy",
		Guardians:     5,
		Healthy:       5,
		Threshold:     3,
		WriteQuorum:   4,
		RestartCount:  0,
	}
	got := checkNodeVault(&report.NodeReport{Vault: vault}, "10.0.0.1")
	if n := len(got); n != 0 {
		t.Errorf("expected 0 alerts for healthy vault, got %d", n)
	}
}

View File

@ -133,7 +133,7 @@ func cleanNode(node inspector.Node, nuclear bool) error {
%s
# Stop services
for svc in caddy coredns orama-node orama-gateway orama-ipfs-cluster orama-ipfs orama-olric orama-anyone-relay orama-anyone-client; do
for svc in caddy coredns orama-node orama-gateway orama-ipfs-cluster orama-ipfs orama-olric orama-vault orama-anyone-relay orama-anyone-client; do
systemctl stop "$svc" 2>/dev/null
systemctl disable "$svc" 2>/dev/null
done

View File

@ -53,6 +53,7 @@ func HandleRestartWithFlags(force bool) {
{"orama-node"},
{"orama-olric"},
{"orama-ipfs-cluster", "orama-ipfs"},
{"orama-vault"},
{"orama-anyone-relay", "orama-anyone-client"},
{"coredns", "caddy"},
}

View File

@ -55,8 +55,9 @@ func HandleStopWithFlags(force bool) {
{"orama-node"}, // 1. Stop node (includes gateway + RQLite with leadership transfer)
{"orama-olric"}, // 2. Stop cache
{"orama-ipfs-cluster", "orama-ipfs"}, // 3. Stop storage
{"orama-anyone-relay", "orama-anyone-client"}, // 4. Stop privacy relay
{"coredns", "caddy"}, // 5. Stop DNS/TLS last
{"orama-vault"}, // 4. Stop vault
{"orama-anyone-relay", "orama-anyone-client"}, // 5. Stop privacy relay
{"coredns", "caddy"}, // 6. Stop DNS/TLS last
}
// Mask all services to immediately prevent Restart=always from reviving them.

View File

@ -89,6 +89,7 @@ func collectProcesses() *ProcessReport {
var managedServiceUnits = []string{
"orama-node", "orama-olric",
"orama-ipfs", "orama-ipfs-cluster",
"orama-vault",
"orama-anyone-relay", "orama-anyone-client",
"coredns", "caddy", "rqlited",
}

View File

@ -71,6 +71,10 @@ func Handle(jsonFlag bool, version string) error {
rpt.IPFS = collectIPFS()
})
safeGo(&wg, "vault", func() {
rpt.Vault = collectVault()
})
safeGo(&wg, "gateway", func() {
rpt.Gateway = collectGateway()
})

View File

@ -13,6 +13,7 @@ var coreServices = []string{
"orama-olric",
"orama-ipfs",
"orama-ipfs-cluster",
"orama-vault",
"orama-anyone-relay",
"orama-anyone-client",
"coredns",

View File

@ -17,6 +17,7 @@ type NodeReport struct {
RQLite *RQLiteReport `json:"rqlite,omitempty"`
Olric *OlricReport `json:"olric,omitempty"`
IPFS *IPFSReport `json:"ipfs,omitempty"`
Vault *VaultReport `json:"vault,omitempty"`
Gateway *GatewayReport `json:"gateway,omitempty"`
WireGuard *WireGuardReport `json:"wireguard,omitempty"`
DNS *DNSReport `json:"dns,omitempty"`
@ -150,6 +151,21 @@ type IPFSReport struct {
BootstrapEmpty bool `json:"bootstrap_empty"`
}
// --- Vault ---
type VaultReport struct {
ServiceActive bool `json:"service_active"`
Responsive bool `json:"responsive"`
Status string `json:"status,omitempty"` // "healthy", "degraded", "unavailable"
Guardians int `json:"guardians,omitempty"`
Healthy int `json:"healthy,omitempty"`
Threshold int `json:"threshold,omitempty"`
WriteQuorum int `json:"write_quorum,omitempty"`
ProcessMemMB int `json:"process_mem_mb"`
RestartCount int `json:"restart_count"`
LogErrors int `json:"log_errors_1h"`
}
// --- Gateway ---
type GatewayReport struct {

View File

@ -0,0 +1,70 @@
package report
import (
"context"
"encoding/json"
"strconv"
"strings"
"time"
)
// collectVault gathers a health snapshot of the local orama-vault service:
// systemd state (active, restart count, memory), recent log error volume, and
// guardian quorum data fetched from the gateway's vault HTTP endpoints.
//
// Every probe is best-effort: a failed command or HTTP call leaves the
// corresponding fields at their zero values rather than failing the report.
func collectVault() *VaultReport {
	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
	defer cancel()
	r := &VaultReport{}
	// 1. Service active
	if out, err := runCmd(ctx, "systemctl", "is-active", "orama-vault"); err == nil {
		r.ServiceActive = strings.TrimSpace(out) == "active"
	}
	// 2. Restart count (output shape: "NRestarts=<n>")
	if out, err := runCmd(ctx, "systemctl", "show", "orama-vault", "--property=NRestarts"); err == nil {
		if parts := strings.SplitN(out, "=", 2); len(parts) == 2 {
			r.RestartCount, _ = strconv.Atoi(strings.TrimSpace(parts[1]))
		}
	}
	// 3. Process memory (output shape: "MemoryCurrent=<bytes>")
	if out, err := runCmd(ctx, "systemctl", "show", "orama-vault", "--property=MemoryCurrent"); err == nil {
		if parts := strings.SplitN(out, "=", 2); len(parts) == 2 {
			r.ProcessMemMB = parseMemoryMB(parts[1])
		}
	}
	// 4. Log errors in last hour.
	// grep -c always prints a count (including "0") but exits non-zero when
	// nothing matched, which would make runCmd report an error; "|| true"
	// forces a zero exit status. The previous "|| echo 0" appended a second
	// line on the zero-match path (output "0\n0"), so Atoi failed and the
	// count only stayed correct by accident of the zero value.
	if out, err := runCmd(ctx, "bash", "-c",
		`journalctl -u orama-vault --no-pager -n 200 --since "1 hour ago" 2>/dev/null | grep -ciE "(error|ERR)" || true`); err == nil {
		r.LogErrors, _ = strconv.Atoi(strings.TrimSpace(out))
	}
	// 5. Query vault status via gateway (provides guardian health). A
	// successful decode also proves the vault endpoint is responsive.
	if body, err := httpGet(ctx, "http://localhost:6001/v1/vault/status"); err == nil {
		var status struct {
			Guardians   int `json:"guardians"`
			Healthy     int `json:"healthy"`
			Threshold   int `json:"threshold"`
			WriteQuorum int `json:"write_quorum"`
		}
		if json.Unmarshal(body, &status) == nil {
			r.Responsive = true
			r.Guardians = status.Guardians
			r.Healthy = status.Healthy
			r.Threshold = status.Threshold
			r.WriteQuorum = status.WriteQuorum
		}
	}
	// 6. Query vault health status ("healthy", "degraded", "unavailable")
	if body, err := httpGet(ctx, "http://localhost:6001/v1/vault/health"); err == nil {
		var health struct {
			Status string `json:"status"`
		}
		if json.Unmarshal(body, &health) == nil {
			r.Status = health.Status
		}
	}
	return r
}

View File

@ -17,6 +17,7 @@ func Handle() {
"orama-ipfs-cluster",
// Note: RQLite is managed by node process, not as separate service
"orama-olric",
"orama-vault",
"orama-node",
// Note: gateway is embedded in orama-node, no separate service
}
@ -26,6 +27,7 @@ func Handle() {
"orama-ipfs": "IPFS Daemon",
"orama-ipfs-cluster": "IPFS Cluster",
"orama-olric": "Olric Cache Server",
"orama-vault": "Vault Guardian",
"orama-node": "Orama Node (includes RQLite + Gateway)",
}

View File

@ -376,6 +376,7 @@ func (o *Orchestrator) stopServices() error {
"orama-ipfs-cluster.service", // Depends on IPFS
"orama-ipfs.service", // Base IPFS
"orama-olric.service", // Independent
"orama-vault.service", // Vault guardian
"orama-anyone-client.service", // Client mode
"orama-anyone-relay.service", // Relay mode
}
@ -683,6 +684,7 @@ func (o *Orchestrator) restartServices() error {
"orama-olric", // Distributed cache
"orama-ipfs", // IPFS daemon
"orama-ipfs-cluster", // IPFS cluster
"orama-vault", // Vault guardian
"orama-gateway", // Gateway (legacy)
"coredns", // DNS server
"caddy", // Reverse proxy

View File

@ -162,6 +162,7 @@ func GetProductionServices() []string {
"orama-olric",
"orama-ipfs-cluster",
"orama-ipfs",
"orama-vault",
"orama-anyone-client",
"orama-anyone-relay",
}