From 2017fcb43292cca09be8eed3d6122838cf9f6685 Mon Sep 17 00:00:00 2001 From: anonpenguin23 Date: Fri, 27 Mar 2026 14:52:41 +0200 Subject: [PATCH] feat(monitor): add vault health checks and reporting - integrate vault into node alerts (service, responsive, status, restarts) - add vault report collection (systemd, logs, HTTP status) - update production CLI (clean, restart, stop, services) - add comprehensive unit tests for vault alerts --- core/pkg/cli/monitor/alerts.go | 36 ++++++ core/pkg/cli/monitor/alerts_vault_test.go | 120 ++++++++++++++++++ core/pkg/cli/production/clean/clean.go | 2 +- core/pkg/cli/production/lifecycle/restart.go | 1 + core/pkg/cli/production/lifecycle/stop.go | 5 +- core/pkg/cli/production/report/processes.go | 1 + core/pkg/cli/production/report/report.go | 4 + core/pkg/cli/production/report/services.go | 1 + core/pkg/cli/production/report/types.go | 16 +++ core/pkg/cli/production/report/vault.go | 70 ++++++++++ core/pkg/cli/production/status/command.go | 2 + .../cli/production/upgrade/orchestrator.go | 2 + core/pkg/cli/utils/systemd.go | 1 + 13 files changed, 258 insertions(+), 3 deletions(-) create mode 100644 core/pkg/cli/monitor/alerts_vault_test.go create mode 100644 core/pkg/cli/production/report/vault.go diff --git a/core/pkg/cli/monitor/alerts.go b/core/pkg/cli/monitor/alerts.go index 49e1437..317b74b 100644 --- a/core/pkg/cli/monitor/alerts.go +++ b/core/pkg/cli/monitor/alerts.go @@ -124,6 +124,7 @@ func DeriveAlerts(snap *ClusterSnapshot) []Alert { alerts = append(alerts, checkNodeNetwork(r, host)...) alerts = append(alerts, checkNodeOlric(r, host)...) alerts = append(alerts, checkNodeIPFS(r, host)...) + alerts = append(alerts, checkNodeVault(r, host)...) alerts = append(alerts, checkNodeGateway(r, host)...) } @@ -866,6 +867,41 @@ func checkNodeIPFS(r *report.NodeReport, host string) []Alert { return alerts } +func checkNodeVault(r *report.NodeReport, host string) []Alert { + if r.Vault == nil { + return nil + } + var alerts []Alert + + if !r.Vault.ServiceActive { + alerts = append(alerts, Alert{AlertCritical, "vault", host, "Vault service not running"}) + return alerts + } + + if !r.Vault.Responsive { + alerts = append(alerts, Alert{AlertWarning, "vault", host, "Vault not responding to health queries"}) + return alerts + } + + switch r.Vault.Status { + case "unavailable": + alerts = append(alerts, Alert{AlertCritical, "vault", host, + fmt.Sprintf("Vault unavailable: %d/%d guardians healthy (need %d for reads)", + r.Vault.Healthy, r.Vault.Guardians, r.Vault.Threshold)}) + case "degraded": + alerts = append(alerts, Alert{AlertWarning, "vault", host, + fmt.Sprintf("Vault degraded: %d/%d guardians healthy (need %d for writes)", + r.Vault.Healthy, r.Vault.Guardians, r.Vault.WriteQuorum)}) + } + + if r.Vault.RestartCount > 3 { + alerts = append(alerts, Alert{AlertWarning, "vault", host, + fmt.Sprintf("Vault restarted %d times", r.Vault.RestartCount)}) + } + + return alerts +} + func checkNodeGateway(r *report.NodeReport, host string) []Alert { if r.Gateway == nil { return nil diff --git a/core/pkg/cli/monitor/alerts_vault_test.go b/core/pkg/cli/monitor/alerts_vault_test.go new file mode 100644 index 0000000..2ea302d --- /dev/null +++ b/core/pkg/cli/monitor/alerts_vault_test.go @@ -0,0 +1,120 @@ +package monitor + +import ( + "testing" + + "github.com/DeBrosOfficial/network/pkg/cli/production/report" +) + +func TestCheckNodeVault_nil(t *testing.T) { + r := &report.NodeReport{} + alerts := checkNodeVault(r, "10.0.0.1") + if len(alerts) != 0 { + t.Errorf("expected 0 alerts for nil vault, got %d", len(alerts)) + } +} + +func TestCheckNodeVault_serviceInactive(t *testing.T) { + r := &report.NodeReport{ + Vault: &report.VaultReport{ServiceActive: false}, + } + alerts := checkNodeVault(r, "10.0.0.1") + if len(alerts) != 1 { + t.Fatalf("expected 1 alert, got %d", len(alerts)) + } + if alerts[0].Severity != AlertCritical { + t.Errorf("expected critical, got %s", alerts[0].Severity) + } +} + +func TestCheckNodeVault_unresponsive(t *testing.T) { + r := &report.NodeReport{ + Vault: &report.VaultReport{ServiceActive: true, Responsive: false}, + } + alerts := checkNodeVault(r, "10.0.0.1") + if len(alerts) != 1 { + t.Fatalf("expected 1 alert, got %d", len(alerts)) + } + if alerts[0].Severity != AlertWarning { + t.Errorf("expected warning, got %s", alerts[0].Severity) + } +} + +func TestCheckNodeVault_unavailable(t *testing.T) { + r := &report.NodeReport{ + Vault: &report.VaultReport{ + ServiceActive: true, + Responsive: true, + Status: "unavailable", + Guardians: 5, + Healthy: 1, + Threshold: 3, + WriteQuorum: 4, + }, + } + alerts := checkNodeVault(r, "10.0.0.1") + if len(alerts) != 1 { + t.Fatalf("expected 1 alert, got %d", len(alerts)) + } + if alerts[0].Severity != AlertCritical { + t.Errorf("expected critical, got %s", alerts[0].Severity) + } +} + +func TestCheckNodeVault_degraded(t *testing.T) { + r := &report.NodeReport{ + Vault: &report.VaultReport{ + ServiceActive: true, + Responsive: true, + Status: "degraded", + Guardians: 5, + Healthy: 3, + Threshold: 3, + WriteQuorum: 4, + }, + } + alerts := checkNodeVault(r, "10.0.0.1") + if len(alerts) != 1 { + t.Fatalf("expected 1 alert, got %d", len(alerts)) + } + if alerts[0].Severity != AlertWarning { + t.Errorf("expected warning, got %s", alerts[0].Severity) + } +} + +func TestCheckNodeVault_excessiveRestarts(t *testing.T) { + r := &report.NodeReport{ + Vault: &report.VaultReport{ + ServiceActive: true, + Responsive: true, + Status: "healthy", + RestartCount: 5, + }, + } + alerts := checkNodeVault(r, "10.0.0.1") + if len(alerts) != 1 { + t.Fatalf("expected 1 alert, got %d", len(alerts)) + } + if alerts[0].Severity != AlertWarning { + t.Errorf("expected warning, got %s", alerts[0].Severity) + } +} + +func TestCheckNodeVault_healthy(t *testing.T) { + r := &report.NodeReport{ + Vault: &report.VaultReport{ + ServiceActive: true, + Responsive: true, + Status: "healthy", + Guardians: 5, + Healthy: 5, + Threshold: 3, + WriteQuorum: 4, + RestartCount: 0, + }, + } + alerts := checkNodeVault(r, "10.0.0.1") + if len(alerts) != 0 { + t.Errorf("expected 0 alerts for healthy vault, got %d", len(alerts)) + } +} diff --git a/core/pkg/cli/production/clean/clean.go b/core/pkg/cli/production/clean/clean.go index 547a9a3..f473683 100644 --- a/core/pkg/cli/production/clean/clean.go +++ b/core/pkg/cli/production/clean/clean.go @@ -133,7 +133,7 @@ func cleanNode(node inspector.Node, nuclear bool) error { %s # Stop services -for svc in caddy coredns orama-node orama-gateway orama-ipfs-cluster orama-ipfs orama-olric orama-anyone-relay orama-anyone-client; do +for svc in caddy coredns orama-node orama-gateway orama-ipfs-cluster orama-ipfs orama-olric orama-vault orama-anyone-relay orama-anyone-client; do systemctl stop "$svc" 2>/dev/null systemctl disable "$svc" 2>/dev/null done diff --git a/core/pkg/cli/production/lifecycle/restart.go b/core/pkg/cli/production/lifecycle/restart.go index 3560b1c..07e7e1d 100644 --- a/core/pkg/cli/production/lifecycle/restart.go +++ b/core/pkg/cli/production/lifecycle/restart.go @@ -53,6 +53,7 @@ func HandleRestartWithFlags(force bool) { {"orama-node"}, {"orama-olric"}, {"orama-ipfs-cluster", "orama-ipfs"}, + {"orama-vault"}, {"orama-anyone-relay", "orama-anyone-client"}, {"coredns", "caddy"}, } diff --git a/core/pkg/cli/production/lifecycle/stop.go b/core/pkg/cli/production/lifecycle/stop.go index 0e7f289..53433ad 100644 --- a/core/pkg/cli/production/lifecycle/stop.go +++ b/core/pkg/cli/production/lifecycle/stop.go @@ -55,8 +55,9 @@ func HandleStopWithFlags(force bool) { {"orama-node"}, // 1. Stop node (includes gateway + RQLite with leadership transfer) {"orama-olric"}, // 2. Stop cache {"orama-ipfs-cluster", "orama-ipfs"}, // 3. Stop storage - {"orama-anyone-relay", "orama-anyone-client"}, // 4. Stop privacy relay - {"coredns", "caddy"}, // 5. Stop DNS/TLS last + {"orama-vault"}, // 4. Stop vault + {"orama-anyone-relay", "orama-anyone-client"}, // 5. Stop privacy relay + {"coredns", "caddy"}, // 6. Stop DNS/TLS last } // Mask all services to immediately prevent Restart=always from reviving them. diff --git a/core/pkg/cli/production/report/processes.go b/core/pkg/cli/production/report/processes.go index bd5038d..1cc8243 100644 --- a/core/pkg/cli/production/report/processes.go +++ b/core/pkg/cli/production/report/processes.go @@ -89,6 +89,7 @@ func collectProcesses() *ProcessReport { var managedServiceUnits = []string{ "orama-node", "orama-olric", "orama-ipfs", "orama-ipfs-cluster", + "orama-vault", "orama-anyone-relay", "orama-anyone-client", "coredns", "caddy", "rqlited", } diff --git a/core/pkg/cli/production/report/report.go b/core/pkg/cli/production/report/report.go index 317a44b..2c72791 100644 --- a/core/pkg/cli/production/report/report.go +++ b/core/pkg/cli/production/report/report.go @@ -71,6 +71,10 @@ func Handle(jsonFlag bool, version string) error { rpt.IPFS = collectIPFS() }) + safeGo(&wg, "vault", func() { + rpt.Vault = collectVault() + }) + safeGo(&wg, "gateway", func() { rpt.Gateway = collectGateway() }) diff --git a/core/pkg/cli/production/report/services.go b/core/pkg/cli/production/report/services.go index 5138927..5939e28 100644 --- a/core/pkg/cli/production/report/services.go +++ b/core/pkg/cli/production/report/services.go @@ -13,6 +13,7 @@ var coreServices = []string{ "orama-olric", "orama-ipfs", "orama-ipfs-cluster", + "orama-vault", "orama-anyone-relay", "orama-anyone-client", "coredns", diff --git a/core/pkg/cli/production/report/types.go b/core/pkg/cli/production/report/types.go index 7607917..29f6df4 100644 --- a/core/pkg/cli/production/report/types.go +++ b/core/pkg/cli/production/report/types.go @@ -17,6 +17,7 @@ type NodeReport struct { RQLite *RQLiteReport `json:"rqlite,omitempty"` Olric *OlricReport `json:"olric,omitempty"` IPFS *IPFSReport `json:"ipfs,omitempty"` + Vault *VaultReport `json:"vault,omitempty"` Gateway *GatewayReport `json:"gateway,omitempty"` WireGuard *WireGuardReport `json:"wireguard,omitempty"` DNS *DNSReport `json:"dns,omitempty"` @@ -150,6 +151,21 @@ type IPFSReport struct { BootstrapEmpty bool `json:"bootstrap_empty"` } +// --- Vault --- + +type VaultReport struct { + ServiceActive bool `json:"service_active"` + Responsive bool `json:"responsive"` + Status string `json:"status,omitempty"` // "healthy", "degraded", "unavailable" + Guardians int `json:"guardians,omitempty"` + Healthy int `json:"healthy,omitempty"` + Threshold int `json:"threshold,omitempty"` + WriteQuorum int `json:"write_quorum,omitempty"` + ProcessMemMB int `json:"process_mem_mb"` + RestartCount int `json:"restart_count"` + LogErrors int `json:"log_errors_1h"` +} + // --- Gateway --- type GatewayReport struct { diff --git a/core/pkg/cli/production/report/vault.go b/core/pkg/cli/production/report/vault.go new file mode 100644 index 0000000..45e269f --- /dev/null +++ b/core/pkg/cli/production/report/vault.go @@ -0,0 +1,70 @@ +package report + +import ( + "context" + "encoding/json" + "strconv" + "strings" + "time" +) + +func collectVault() *VaultReport { + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + + r := &VaultReport{} + + // 1. Service active + if out, err := runCmd(ctx, "systemctl", "is-active", "orama-vault"); err == nil { + r.ServiceActive = strings.TrimSpace(out) == "active" + } + + // 2. Restart count + if out, err := runCmd(ctx, "systemctl", "show", "orama-vault", "--property=NRestarts"); err == nil { + if parts := strings.SplitN(out, "=", 2); len(parts) == 2 { + r.RestartCount, _ = strconv.Atoi(strings.TrimSpace(parts[1])) + } + } + + // 3. Process memory + if out, err := runCmd(ctx, "systemctl", "show", "orama-vault", "--property=MemoryCurrent"); err == nil { + if parts := strings.SplitN(out, "=", 2); len(parts) == 2 { + r.ProcessMemMB = parseMemoryMB(parts[1]) + } + } + + // 4. Log errors in last hour + if out, err := runCmd(ctx, "bash", "-c", + `journalctl -u orama-vault --no-pager -n 200 --since "1 hour ago" 2>/dev/null | grep -ciE "(error|ERR)" || echo 0`); err == nil { + r.LogErrors, _ = strconv.Atoi(strings.TrimSpace(out)) + } + + // 5. Query vault status via gateway (provides guardian health) + if body, err := httpGet(ctx, "http://localhost:6001/v1/vault/status"); err == nil { + var status struct { + Guardians int `json:"guardians"` + Healthy int `json:"healthy"` + Threshold int `json:"threshold"` + WriteQuorum int `json:"write_quorum"` + } + if json.Unmarshal(body, &status) == nil { + r.Responsive = true + r.Guardians = status.Guardians + r.Healthy = status.Healthy + r.Threshold = status.Threshold + r.WriteQuorum = status.WriteQuorum + } + } + + // 6. Query vault health status + if body, err := httpGet(ctx, "http://localhost:6001/v1/vault/health"); err == nil { + var health struct { + Status string `json:"status"` + } + if json.Unmarshal(body, &health) == nil { + r.Status = health.Status + } + } + + return r +} diff --git a/core/pkg/cli/production/status/command.go b/core/pkg/cli/production/status/command.go index 4120693..c7ea512 100644 --- a/core/pkg/cli/production/status/command.go +++ b/core/pkg/cli/production/status/command.go @@ -17,6 +17,7 @@ func Handle() { "orama-ipfs-cluster", // Note: RQLite is managed by node process, not as separate service "orama-olric", + "orama-vault", "orama-node", // Note: gateway is embedded in orama-node, no separate service } @@ -26,6 +27,7 @@ func Handle() { "orama-ipfs": "IPFS Daemon", "orama-ipfs-cluster": "IPFS Cluster", "orama-olric": "Olric Cache Server", + "orama-vault": "Vault Guardian", "orama-node": "Orama Node (includes RQLite + Gateway)", } diff --git a/core/pkg/cli/production/upgrade/orchestrator.go b/core/pkg/cli/production/upgrade/orchestrator.go index 8c20bdb..38f3319 100644 --- a/core/pkg/cli/production/upgrade/orchestrator.go +++ b/core/pkg/cli/production/upgrade/orchestrator.go @@ -376,6 +376,7 @@ func (o *Orchestrator) stopServices() error { "orama-ipfs-cluster.service", // Depends on IPFS "orama-ipfs.service", // Base IPFS "orama-olric.service", // Independent + "orama-vault.service", // Vault guardian "orama-anyone-client.service", // Client mode "orama-anyone-relay.service", // Relay mode } @@ -683,6 +684,7 @@ func (o *Orchestrator) restartServices() error { "orama-olric", // Distributed cache "orama-ipfs", // IPFS daemon "orama-ipfs-cluster", // IPFS cluster + "orama-vault", // Vault guardian "orama-gateway", // Gateway (legacy) "coredns", // DNS server "caddy", // Reverse proxy diff --git a/core/pkg/cli/utils/systemd.go b/core/pkg/cli/utils/systemd.go index b4a6ffb..2869f33 100644 --- a/core/pkg/cli/utils/systemd.go +++ b/core/pkg/cli/utils/systemd.go @@ -162,6 +162,7 @@ func GetProductionServices() []string { "orama-olric", "orama-ipfs-cluster", "orama-ipfs", + "orama-vault", "orama-anyone-client", "orama-anyone-relay", }