From f889c2e3584510a9c74b25af3882203182c20cf1 Mon Sep 17 00:00:00 2001 From: anonpenguin23 Date: Mon, 16 Feb 2026 11:47:18 +0200 Subject: [PATCH] Added some new alerts on monitoring --- docs/DEV_DEPLOY.md | 30 ++ docs/MONITORING.md | 275 ++++++++++++++++++ pkg/cli/monitor/alerts.go | 382 +++++++++++++++++++++++-- pkg/cli/production/report/processes.go | 38 ++- 4 files changed, 701 insertions(+), 24 deletions(-) create mode 100644 docs/MONITORING.md diff --git a/docs/DEV_DEPLOY.md b/docs/DEV_DEPLOY.md index e3b61e2..ec7a8bf 100644 --- a/docs/DEV_DEPLOY.md +++ b/docs/DEV_DEPLOY.md @@ -203,6 +203,36 @@ sudo orama node doctor **Note:** Always use `orama node stop` instead of manually running `systemctl stop`. The CLI ensures all related services (including CoreDNS and Caddy on nameserver nodes) are handled correctly. +#### `orama node report` + +Outputs comprehensive health data as JSON. Used by `orama monitor` over SSH: + +```bash +sudo orama node report --json +``` + +See [MONITORING.md](MONITORING.md) for full details. + +#### `orama monitor` + +Real-time cluster monitoring from your local machine: + +```bash +# Interactive TUI +orama monitor --env testnet + +# Cluster overview +orama monitor cluster --env testnet + +# Alerts only +orama monitor alerts --env testnet + +# Full JSON for LLM analysis +orama monitor report --env testnet +``` + +See [MONITORING.md](MONITORING.md) for all subcommands and flags. + ### Node Join Flow ```bash diff --git a/docs/MONITORING.md b/docs/MONITORING.md new file mode 100644 index 0000000..698f5cd --- /dev/null +++ b/docs/MONITORING.md @@ -0,0 +1,275 @@ +# Monitoring + +Real-time cluster health monitoring via SSH. The system has two parts: + +1. **`orama node report`** — Runs on each VPS node, collects all local health data, outputs JSON +2. 
**`orama monitor`** — Runs on your local machine, SSHes into nodes, aggregates results, displays via TUI or tables + +## Architecture + +``` +Developer Machine VPS Nodes (via SSH) +┌──────────────────┐ ┌────────────────────┐ +│ orama monitor │ ──SSH──────────>│ orama node report │ +│ (TUI / tables) │ <──JSON─────── │ (local collector) │ +│ │ └────────────────────┘ +│ CollectOnce() │ ──SSH──────────>│ orama node report │ +│ DeriveAlerts() │ <──JSON─────── │ (local collector) │ +│ Render() │ └────────────────────┘ +└──────────────────┘ +``` + +Each node runs `orama node report --json` locally (no SSH to other nodes), collecting data via `os/exec` and `net/http` to localhost services. The monitor SSHes into all nodes in parallel, collects reports, then runs cross-node analysis to detect cluster-wide issues. + +## Quick Start + +```bash +# Interactive TUI (auto-refreshes every 30s) +orama monitor --env testnet + +# Cluster overview table +orama monitor cluster --env testnet + +# Alerts only +orama monitor alerts --env testnet + +# Full JSON report (pipe to jq or feed to LLM) +orama monitor report --env testnet +``` + +## `orama monitor` — Local Orchestrator + +### Usage + +``` +orama monitor [subcommand] --env <environment> [flags] +``` + +Without a subcommand, launches the interactive TUI. 
+ +### Global Flags + +| Flag | Default | Description | +|------|---------|-------------| +| `--env` | *(required)* | Environment: `devnet`, `testnet`, `mainnet` | +| `--json` | `false` | Machine-readable JSON output (for one-shot subcommands) | +| `--node` | | Filter to a specific node host/IP | +| `--config` | `scripts/remote-nodes.conf` | Path to node configuration file | + +### Subcommands + +| Subcommand | Description | +|------------|-------------| +| `live` | Interactive TUI monitor (default when no subcommand) | +| `cluster` | Cluster overview: all nodes, roles, RQLite state, WG peers | +| `node` | Per-node health details (system, services, WG, DNS) | +| `service` | Service status matrix across all nodes | +| `mesh` | WireGuard mesh connectivity and peer details | +| `dns` | DNS health: CoreDNS, Caddy, TLS cert expiry, resolution | +| `namespaces` | Namespace health across nodes | +| `alerts` | Active alerts and warnings sorted by severity | +| `report` | Full JSON dump optimized for LLM consumption | + +### Examples + +```bash +# Cluster overview +orama monitor cluster --env testnet + +# Cluster overview as JSON +orama monitor cluster --env testnet --json + +# Alerts for all nodes +orama monitor alerts --env testnet + +# Single-node deep dive +orama monitor node --env testnet --node 51.195.109.238 + +# Services for one node +orama monitor service --env testnet --node 51.195.109.238 + +# WireGuard mesh details +orama monitor mesh --env testnet + +# DNS health +orama monitor dns --env testnet + +# Namespace health +orama monitor namespaces --env testnet + +# Full report for LLM analysis +orama monitor report --env testnet | jq . 
+ +# Single-node report +orama monitor report --env testnet --node 51.195.109.238 + +# Custom config file +orama monitor cluster --config /path/to/nodes.conf --env devnet +``` + +### Interactive TUI + +The `live` subcommand (default) launches a full-screen terminal UI: + +**Tabs:** Overview | Nodes | Services | WG Mesh | DNS | Namespaces | Alerts + +**Key Bindings:** + +| Key | Action | +|-----|--------| +| `Tab` / `Shift+Tab` | Switch tabs | +| `j` / `k` or `↑` / `↓` | Scroll content | +| `r` | Force refresh | +| `q` / `Ctrl+C` | Quit | + +The TUI auto-refreshes every 30 seconds. A spinner shows during data collection. Colors indicate health: green = healthy, red = critical, yellow = warning. + +### LLM Report Format + +`orama monitor report` outputs structured JSON designed for AI consumption: + +```json +{ + "meta": { + "environment": "testnet", + "collected_at": "2026-02-16T12:00:00Z", + "duration_seconds": 3.2, + "node_count": 3, + "healthy_count": 3 + }, + "summary": { + "rqlite_leader": "10.0.0.1", + "rqlite_voters": "3/3", + "rqlite_raft_term": 42, + "wg_mesh_status": "all connected", + "service_health": "all nominal", + "critical_alerts": 0, + "warning_alerts": 1, + "info_alerts": 0 + }, + "alerts": [...], + "nodes": [ + { + "host": "51.195.109.238", + "status": "healthy", + "collection_ms": 526, + "report": { ... } + } + ] +} +``` + +## `orama node report` — VPS-Side Collector + +Runs locally on a VPS node. Collects all system and service data in parallel and outputs a single JSON blob. Requires root privileges. 
+ +### Usage + +```bash +# On a VPS node +sudo orama node report --json +``` + +### What It Collects + +| Section | Data | +|---------|------| +| **system** | CPU count, load average, memory/disk/swap usage, OOM kills, kernel version, uptime, clock time | +| **services** | Systemd service states (active, restarts, memory, CPU, restart loop detection) for 10 core services | +| **rqlite** | Raft state, leader, term, applied/commit index, peers, strong read test, readyz, debug vars | +| **olric** | Service state, memberlist, member count, restarts, memory, log analysis | +| **ipfs** | Daemon/cluster state, swarm/cluster peers, repo size, versions, swarm key | +| **gateway** | HTTP health check, subsystem status | +| **wireguard** | Interface state, WG IP, peers, handshake ages, MTU, config permissions | +| **dns** | CoreDNS/Caddy state, port bindings, resolution tests, TLS cert expiry | +| **anyone** | Relay/client state, bootstrap progress, fingerprint | +| **network** | Internet reachability, TCP stats, retransmission rate, listening ports, UFW rules | +| **processes** | Zombie count, orphan orama processes, panic/fatal count in logs | +| **namespaces** | Per-namespace service probes (RQLite, Olric, Gateway) | + +### Performance + +All 12 collectors run in parallel with goroutines. Typical collection time is **< 1 second** per node. HTTP timeouts are 3 seconds, command timeouts are 4 seconds. + +### Output Schema + +```json +{ + "timestamp": "2026-02-16T12:00:00Z", + "hostname": "ns1", + "version": "0.107.0", + "collect_ms": 526, + "errors": [], + "system": { "cpu_count": 4, "load_avg_1": 0.1, "mem_total_mb": 7937, ... }, + "services": { "services": [...], "failed_units": [] }, + "rqlite": { "responsive": true, "raft_state": "Leader", "term": 42, ... }, + "olric": { "service_active": true, "memberlist_up": true, ... }, + "ipfs": { "daemon_active": true, "swarm_peers": 2, ... }, + "gateway": { "responsive": true, "http_status": 200, ... 
}, + "wireguard": { "interface_up": true, "wg_ip": "10.0.0.1", "peers": [...], ... }, + "dns": { "coredns_active": true, "caddy_active": true, "base_tls_days_left": 88, ... }, + "anyone": { "relay_active": true, "bootstrapped": true, ... }, + "network": { "internet_reachable": true, "ufw_active": true, ... }, + "processes": { "zombie_count": 0, "orphan_count": 0, "panic_count": 0, ... }, + "namespaces": [] +} +``` + +## Alert Detection + +Alerts are derived from per-node and cross-node analysis of all collected reports. Each alert has a severity level and identifies the affected subsystem and node. + +### Alert Severities + +| Severity | Examples | +|----------|----------| +| **critical** | SSH collection failed (node unreachable), no RQLite leader, split brain, RQLite unresponsive, WireGuard interface down, WG peer that has never completed a handshake, OOM kills, service failed, UFW inactive | +| **warning** | Strong read failed, memory > 90%, disk > 85%, stale WG handshake (> 3min), Raft term inconsistency, applied index lag > 100, restart loop detected, TLS cert < 14 days, DNS down, namespace gateway down, Anyone not bootstrapped, clock skew > 5s, binary version mismatch, internet unreachable, high TCP retransmission | +| **info** | Zombie processes, orphan orama processes, swap usage > 30% | + +### Cross-Node Checks + +These checks compare data across all nodes: + +- **RQLite Leader**: Exactly one leader exists (no split brain) +- **Leader Agreement**: All nodes agree on the same leader address +- **Raft Term Consistency**: Term values within 1 of each other +- **Applied Index Lag**: Followers within 100 entries of the leader +- **WireGuard Peer Symmetry**: Each node has N-1 peers +- **Clock Skew**: Node clocks within 5 seconds of each other +- **Binary Version**: All nodes running the same version + +### Per-Node Checks + +- **RQLite**: Responsive, ready, strong read, Raft state, FSM backlog, commit-applied gap, unreachable peers +- **WireGuard**: Interface up, handshake freshness +- **System**: Memory, disk, inodes, load, OOM kills, swap +- **Services**: Systemd 
state, restart loops +- **DNS**: CoreDNS/Caddy up, TLS cert expiry, SOA resolution +- **Anyone**: Bootstrap progress +- **Processes**: Zombies, orphans, panics in logs +- **Namespaces**: Gateway and RQLite per namespace +- **Network**: UFW, internet reachability, TCP retransmission + +## Monitor vs Inspector + +Both tools check cluster health, but they serve different purposes: + +| | `orama monitor` | `orama inspect` | +|---|---|---| +| **Data source** | `orama node report --json` (single SSH call per node) | 15+ SSH commands per node per subsystem | +| **Speed** | ~3-5s for full cluster | ~4-10s for full cluster | +| **Output** | TUI, tables, JSON | Tables, JSON | +| **Focus** | Real-time monitoring, alert detection | Deep diagnostic checks with pass/fail/warn | +| **AI support** | `report` subcommand for LLM input | `--ai` flag for inline analysis | +| **Use case** | "Is anything wrong right now?" | "What exactly is wrong and why?" | + +Use `monitor` for day-to-day health checks and the interactive TUI. Use `inspect` for deep diagnostics when something is already known to be broken. + +## Configuration + +Uses the same `scripts/remote-nodes.conf` as the inspector. See [INSPECTOR.md](INSPECTOR.md#configuration) for format details. + +## Prerequisites + +Nodes must have the `orama` CLI installed (via `orama node install` or `upload-source.sh`). The monitor runs `sudo orama node report --json` over SSH, so the binary must be at `/usr/local/bin/orama` on each node. diff --git a/pkg/cli/monitor/alerts.go b/pkg/cli/monitor/alerts.go index 64c4216..3f81e71 100644 --- a/pkg/cli/monitor/alerts.go +++ b/pkg/cli/monitor/alerts.go @@ -2,6 +2,7 @@ package monitor import ( "fmt" + "strings" "github.com/DeBrosOfficial/network/pkg/cli/production/report" ) @@ -44,23 +45,17 @@ func DeriveAlerts(snap *ClusterSnapshot) []Alert { return alerts } - // Cross-node: RQLite leader + // Cross-node checks alerts = append(alerts, checkRQLiteLeader(reports)...) 
- - // Cross-node: Raft term consistency + alerts = append(alerts, checkRQLiteQuorum(reports)...) alerts = append(alerts, checkRaftTermConsistency(reports)...) - - // Cross-node: Applied index lag alerts = append(alerts, checkAppliedIndexLag(reports)...) - - // Cross-node: WireGuard peer symmetry alerts = append(alerts, checkWGPeerSymmetry(reports)...) - - // Cross-node: Clock skew alerts = append(alerts, checkClockSkew(reports)...) - - // Cross-node: Binary version alerts = append(alerts, checkBinaryVersion(reports)...) + alerts = append(alerts, checkOlricMemberConsistency(reports)...) + alerts = append(alerts, checkIPFSSwarmConsistency(reports)...) + alerts = append(alerts, checkIPFSClusterConsistency(reports)...) // Per-node checks for _, r := range reports { @@ -74,6 +69,9 @@ func DeriveAlerts(snap *ClusterSnapshot) []Alert { alerts = append(alerts, checkNodeProcesses(r, host)...) alerts = append(alerts, checkNodeNamespaces(r, host)...) alerts = append(alerts, checkNodeNetwork(r, host)...) + alerts = append(alerts, checkNodeOlric(r, host)...) + alerts = append(alerts, checkNodeIPFS(r, host)...) + alerts = append(alerts, checkNodeGateway(r, host)...) 
} return alerts @@ -86,7 +84,9 @@ func nodeHost(r *report.NodeReport) string { return r.Hostname } -// --- Cross-node checks --- +// --------------------------------------------------------------------------- +// Cross-node checks +// --------------------------------------------------------------------------- func checkRQLiteLeader(reports []*report.NodeReport) []Alert { var alerts []Alert @@ -116,6 +116,51 @@ func checkRQLiteLeader(reports []*report.NodeReport) []Alert { return alerts } +func checkRQLiteQuorum(reports []*report.NodeReport) []Alert { + var voters, responsive int + for _, r := range reports { + if r.RQLite == nil { + continue + } + if r.RQLite.Responsive { + responsive++ + if r.RQLite.Voter { + voters++ + } + } + } + + if responsive == 0 { + return nil // no rqlite data at all + } + + // Total voters = responsive voters + unresponsive nodes that should be voters. + // For quorum calculation, use the total voter count (responsive + unreachable). + totalVoters := voters + for _, r := range reports { + if r.RQLite != nil && !r.RQLite.Responsive { + // Assume unresponsive nodes were voters (conservative estimate). 
+ totalVoters++ + } + } + + if totalVoters < 2 { + return nil // single-node cluster, no quorum concept + } + + quorum := totalVoters/2 + 1 + if voters < quorum { + return []Alert{{AlertCritical, "rqlite", "cluster", + fmt.Sprintf("Quorum lost: only %d/%d voters reachable (need %d)", voters, totalVoters, quorum)}} + } + if voters == quorum { + return []Alert{{AlertWarning, "rqlite", "cluster", + fmt.Sprintf("Quorum fragile: exactly %d/%d voters reachable (one more failure = quorum loss)", voters, totalVoters)}} + } + + return nil +} + func checkRaftTermConsistency(reports []*report.NodeReport) []Alert { var minTerm, maxTerm uint64 first := true @@ -126,7 +171,7 @@ func checkRaftTermConsistency(reports []*report.NodeReport) []Alert { if first { minTerm = r.RQLite.Term maxTerm = r.RQLite.Term - first = true + first = false } if r.RQLite.Term < minTerm { minTerm = r.RQLite.Term @@ -134,7 +179,6 @@ func checkRaftTermConsistency(reports []*report.NodeReport) []Alert { if r.RQLite.Term > maxTerm { maxTerm = r.RQLite.Term } - first = false } if maxTerm-minTerm > 1 { return []Alert{{AlertWarning, "rqlite", "cluster", @@ -166,10 +210,8 @@ func checkAppliedIndexLag(reports []*report.NodeReport) []Alert { } func checkWGPeerSymmetry(reports []*report.NodeReport) []Alert { - // Build map: wg_ip -> set of peer public keys type nodeInfo struct { host string - wgIP string peerKeys map[string]bool } var nodes []nodeInfo @@ -177,14 +219,13 @@ func checkWGPeerSymmetry(reports []*report.NodeReport) []Alert { if r.WireGuard == nil || !r.WireGuard.InterfaceUp { continue } - ni := nodeInfo{host: nodeHost(r), wgIP: r.WireGuard.WgIP, peerKeys: map[string]bool{}} + ni := nodeInfo{host: nodeHost(r), peerKeys: map[string]bool{}} for _, p := range r.WireGuard.Peers { ni.peerKeys[p.PublicKey] = true } nodes = append(nodes, ni) } - // For WG peer symmetry, we check peer counts match (N-1 peers expected) var alerts []Alert expectedPeers := len(nodes) - 1 for _, ni := range nodes { @@ -254,22 
+295,164 @@ func checkBinaryVersion(reports []*report.NodeReport) []Alert { return nil } -// --- Per-node checks --- +func checkOlricMemberConsistency(reports []*report.NodeReport) []Alert { + // Count nodes where Olric is active to determine expected member count. + activeCount := 0 + for _, r := range reports { + if r.Olric != nil && r.Olric.ServiceActive { + activeCount++ + } + } + if activeCount < 2 { + return nil + } + + var alerts []Alert + for _, r := range reports { + if r.Olric == nil || !r.Olric.ServiceActive || r.Olric.MemberCount == 0 { + continue + } + if r.Olric.MemberCount < activeCount { + alerts = append(alerts, Alert{AlertWarning, "olric", nodeHost(r), + fmt.Sprintf("Olric member count: %d (expected %d active nodes)", r.Olric.MemberCount, activeCount)}) + } + } + return alerts +} + +func checkIPFSSwarmConsistency(reports []*report.NodeReport) []Alert { + // Count IPFS-active nodes to determine expected peer count. + activeCount := 0 + for _, r := range reports { + if r.IPFS != nil && r.IPFS.DaemonActive { + activeCount++ + } + } + if activeCount < 2 { + return nil + } + + expectedPeers := activeCount - 1 + var alerts []Alert + for _, r := range reports { + if r.IPFS == nil || !r.IPFS.DaemonActive { + continue + } + if r.IPFS.SwarmPeerCount == 0 { + alerts = append(alerts, Alert{AlertCritical, "ipfs", nodeHost(r), + "IPFS node isolated: 0 swarm peers"}) + } else if r.IPFS.SwarmPeerCount < expectedPeers { + alerts = append(alerts, Alert{AlertWarning, "ipfs", nodeHost(r), + fmt.Sprintf("IPFS swarm peers: %d (expected %d)", r.IPFS.SwarmPeerCount, expectedPeers)}) + } + } + return alerts +} + +func checkIPFSClusterConsistency(reports []*report.NodeReport) []Alert { + activeCount := 0 + for _, r := range reports { + if r.IPFS != nil && r.IPFS.ClusterActive { + activeCount++ + } + } + if activeCount < 2 { + return nil + } + + var alerts []Alert + for _, r := range reports { + if r.IPFS == nil || !r.IPFS.ClusterActive { + continue + } + if 
r.IPFS.ClusterPeerCount < activeCount { + alerts = append(alerts, Alert{AlertWarning, "ipfs", nodeHost(r), + fmt.Sprintf("IPFS cluster peers: %d (expected %d)", r.IPFS.ClusterPeerCount, activeCount)}) + } + } + return alerts +} + +// --------------------------------------------------------------------------- +// Per-node checks +// --------------------------------------------------------------------------- func checkNodeRQLite(r *report.NodeReport, host string) []Alert { if r.RQLite == nil { return nil } var alerts []Alert + if !r.RQLite.Responsive { alerts = append(alerts, Alert{AlertCritical, "rqlite", host, "RQLite not responding"}) + return alerts // no point checking further } - if r.RQLite.Responsive && !r.RQLite.Ready { + + if !r.RQLite.Ready { alerts = append(alerts, Alert{AlertWarning, "rqlite", host, "RQLite not ready (/readyz failed)"}) } - if r.RQLite.Responsive && !r.RQLite.StrongRead { + if !r.RQLite.StrongRead { alerts = append(alerts, Alert{AlertWarning, "rqlite", host, "Strong read failed"}) } + + // Raft state anomalies + if r.RQLite.RaftState == "Candidate" { + alerts = append(alerts, Alert{AlertWarning, "rqlite", host, "RQLite in election (Candidate state)"}) + } + if r.RQLite.RaftState == "Shutdown" { + alerts = append(alerts, Alert{AlertCritical, "rqlite", host, "RQLite in Shutdown state"}) + } + + // FSM backlog + if r.RQLite.FsmPending > 10 { + alerts = append(alerts, Alert{AlertWarning, "rqlite", host, + fmt.Sprintf("RQLite FSM backlog: %d entries pending", r.RQLite.FsmPending)}) + } + + // Commit-applied gap (per-node, distinct from cross-node applied index lag) + if r.RQLite.Commit > 0 && r.RQLite.Applied > 0 && r.RQLite.Commit > r.RQLite.Applied { + gap := r.RQLite.Commit - r.RQLite.Applied + if gap > 100 { + alerts = append(alerts, Alert{AlertWarning, "rqlite", host, + fmt.Sprintf("RQLite commit-applied gap: %d (commit=%d, applied=%d)", gap, r.RQLite.Commit, r.RQLite.Applied)}) + } + } + + // Resource pressure + if r.RQLite.Goroutines > 
1000 { + alerts = append(alerts, Alert{AlertWarning, "rqlite", host, + fmt.Sprintf("RQLite goroutine count high: %d", r.RQLite.Goroutines)}) + } + if r.RQLite.HeapMB > 1000 { + alerts = append(alerts, Alert{AlertWarning, "rqlite", host, + fmt.Sprintf("RQLite heap memory high: %dMB", r.RQLite.HeapMB)}) + } + + // Cluster partition detection: check if this node reports other nodes as unreachable + for nodeAddr, info := range r.RQLite.Nodes { + if !info.Reachable { + alerts = append(alerts, Alert{AlertCritical, "rqlite", host, + fmt.Sprintf("RQLite reports node %s unreachable (cluster partition)", nodeAddr)}) + } + } + + // Debug vars + if dv := r.RQLite.DebugVars; dv != nil { + if dv.LeaderNotFound > 0 { + alerts = append(alerts, Alert{AlertWarning, "rqlite", host, + fmt.Sprintf("RQLite leader_not_found errors: %d", dv.LeaderNotFound)}) + } + if dv.SnapshotErrors > 0 { + alerts = append(alerts, Alert{AlertWarning, "rqlite", host, + fmt.Sprintf("RQLite snapshot errors: %d", dv.SnapshotErrors)}) + } + totalQueryErrors := dv.QueryErrors + dv.ExecuteErrors + if totalQueryErrors > 0 { + alerts = append(alerts, Alert{AlertInfo, "rqlite", host, + fmt.Sprintf("RQLite query/execute errors: %d", totalQueryErrors)}) + } + } + return alerts } @@ -327,6 +510,14 @@ func checkNodeSystem(r *report.NodeReport, host string) []Alert { fmt.Sprintf("High load: %.1f (%.1fx CPU count)", r.System.LoadAvg1, loadRatio)}) } } + // Inode exhaustion + if r.System.InodePct > 95 { + alerts = append(alerts, Alert{AlertCritical, "system", host, + fmt.Sprintf("Inode exhaustion imminent: %d%%", r.System.InodePct)}) + } else if r.System.InodePct > 90 { + alerts = append(alerts, Alert{AlertWarning, "system", host, + fmt.Sprintf("Inode usage at %d%%", r.System.InodePct)}) + } return alerts } @@ -377,6 +568,24 @@ func checkNodeDNS(r *report.NodeReport, host string) []Alert { if r.DNS.CoreDNSActive && !r.DNS.SOAResolves { alerts = append(alerts, Alert{AlertWarning, "dns", host, "SOA record not resolving"}) 
} + // Additional DNS checks (only when CoreDNS is running) + if r.DNS.CoreDNSActive { + if !r.DNS.WildcardResolves { + alerts = append(alerts, Alert{AlertWarning, "dns", host, "Wildcard DNS not resolving"}) + } + if !r.DNS.BaseAResolves { + alerts = append(alerts, Alert{AlertWarning, "dns", host, "Base domain A record not resolving"}) + } + if !r.DNS.NSResolves { + alerts = append(alerts, Alert{AlertWarning, "dns", host, "NS records not resolving"}) + } + if !r.DNS.Port53Bound { + alerts = append(alerts, Alert{AlertCritical, "dns", host, "CoreDNS active but port 53 not bound"}) + } + } + if r.DNS.CaddyActive && !r.DNS.Port443Bound { + alerts = append(alerts, Alert{AlertCritical, "dns", host, "Caddy active but port 443 not bound"}) + } return alerts } @@ -442,6 +651,136 @@ func checkNodeNetwork(r *report.NodeReport, host string) []Alert { alerts = append(alerts, Alert{AlertWarning, "network", host, fmt.Sprintf("High TCP retransmission rate: %.1f%%", r.Network.TCPRetransRate)}) } + + // Check for internal ports exposed in UFW rules. + // Ports 5001 (RQLite), 6001 (Gateway), 3320 (Olric), 4501 (IPFS API) should be internal only. + internalPorts := []string{"5001", "6001", "3320", "4501"} + for _, rule := range r.Network.UFWRules { + ruleLower := strings.ToLower(rule) + // Only flag ALLOW rules (not deny/reject). + if !strings.Contains(ruleLower, "allow") { + continue + } + for _, port := range internalPorts { + // Match rules like "5001 ALLOW Anywhere" or "5001/tcp ALLOW IN" + // but not rules restricted to 10.0.0.0/24 (WG subnet). 
+ if strings.Contains(rule, port) && !strings.Contains(rule, "10.0.0.") { + alerts = append(alerts, Alert{AlertCritical, "network", host, + fmt.Sprintf("Internal port %s exposed in UFW: %s", port, strings.TrimSpace(rule))}) + } + } + } + + return alerts +} + +func checkNodeOlric(r *report.NodeReport, host string) []Alert { + if r.Olric == nil { + return nil + } + var alerts []Alert + + if !r.Olric.ServiceActive { + alerts = append(alerts, Alert{AlertCritical, "olric", host, "Olric service down"}) + return alerts + } + if !r.Olric.MemberlistUp { + alerts = append(alerts, Alert{AlertCritical, "olric", host, "Olric memberlist port down"}) + } + if r.Olric.LogSuspects > 0 { + alerts = append(alerts, Alert{AlertWarning, "olric", host, + fmt.Sprintf("Olric member suspects: %d in last hour", r.Olric.LogSuspects)}) + } + if r.Olric.LogFlapping > 5 { + alerts = append(alerts, Alert{AlertWarning, "olric", host, + fmt.Sprintf("Olric members flapping: %d join/leave events in last hour", r.Olric.LogFlapping)}) + } + if r.Olric.LogErrors > 20 { + alerts = append(alerts, Alert{AlertWarning, "olric", host, + fmt.Sprintf("High Olric error rate: %d errors in last hour", r.Olric.LogErrors)}) + } + if r.Olric.RestartCount > 3 { + alerts = append(alerts, Alert{AlertWarning, "olric", host, + fmt.Sprintf("Olric excessive restarts: %d", r.Olric.RestartCount)}) + } + if r.Olric.ProcessMemMB > 500 { + alerts = append(alerts, Alert{AlertWarning, "olric", host, + fmt.Sprintf("Olric high memory: %dMB", r.Olric.ProcessMemMB)}) + } + + return alerts +} + +func checkNodeIPFS(r *report.NodeReport, host string) []Alert { + if r.IPFS == nil { + return nil + } + var alerts []Alert + + if !r.IPFS.DaemonActive { + alerts = append(alerts, Alert{AlertCritical, "ipfs", host, "IPFS daemon down"}) + } + if !r.IPFS.ClusterActive { + alerts = append(alerts, Alert{AlertCritical, "ipfs", host, "IPFS cluster down"}) + } + + // Only check these if daemon is running (otherwise data is meaningless). 
+ if r.IPFS.DaemonActive { + if r.IPFS.SwarmPeerCount == 0 { + alerts = append(alerts, Alert{AlertCritical, "ipfs", host, "IPFS isolated: no swarm peers"}) + } + if !r.IPFS.HasSwarmKey { + alerts = append(alerts, Alert{AlertCritical, "ipfs", host, + "IPFS swarm key missing (private network compromised)"}) + } + if !r.IPFS.BootstrapEmpty { + alerts = append(alerts, Alert{AlertWarning, "ipfs", host, + "IPFS bootstrap list not empty (should be empty for private swarm)"}) + } + } + + if r.IPFS.RepoUsePct > 95 { + alerts = append(alerts, Alert{AlertCritical, "ipfs", host, + fmt.Sprintf("IPFS repo nearly full: %d%%", r.IPFS.RepoUsePct)}) + } else if r.IPFS.RepoUsePct > 90 { + alerts = append(alerts, Alert{AlertWarning, "ipfs", host, + fmt.Sprintf("IPFS repo at %d%%", r.IPFS.RepoUsePct)}) + } + + if r.IPFS.ClusterErrors > 0 { + alerts = append(alerts, Alert{AlertWarning, "ipfs", host, + fmt.Sprintf("IPFS cluster peer errors: %d", r.IPFS.ClusterErrors)}) + } + + return alerts +} + +func checkNodeGateway(r *report.NodeReport, host string) []Alert { + if r.Gateway == nil { + return nil + } + var alerts []Alert + + if !r.Gateway.Responsive { + alerts = append(alerts, Alert{AlertCritical, "gateway", host, "Gateway not responding"}) + return alerts + } + + if r.Gateway.HTTPStatus != 200 { + alerts = append(alerts, Alert{AlertWarning, "gateway", host, + fmt.Sprintf("Gateway health check returned HTTP %d", r.Gateway.HTTPStatus)}) + } + + for name, sub := range r.Gateway.Subsystems { + if sub.Status != "ok" && sub.Status != "" { + msg := fmt.Sprintf("Gateway subsystem %s: status=%s", name, sub.Status) + if sub.Error != "" { + msg += fmt.Sprintf(" error=%s", sub.Error) + } + alerts = append(alerts, Alert{AlertWarning, "gateway", host, msg}) + } + } + return alerts } @@ -451,4 +790,3 @@ func truncateKey(key string) string { } return key } - diff --git a/pkg/cli/production/report/processes.go b/pkg/cli/production/report/processes.go index 6657112..b1d21d9 100644 --- 
a/pkg/cli/production/report/processes.go +++ b/pkg/cli/production/report/processes.go @@ -16,6 +16,10 @@ var oramaProcessNames = []string{ func collectProcesses() *ProcessReport { r := &ProcessReport{} + // Collect known systemd-managed PIDs to avoid false positive orphan detection. + // Processes with PPID=1 that are systemd-managed daemons are NOT orphans. + managedPIDs := collectManagedPIDs() + // Run ps once and reuse the output for both zombies and orphans. ctx, cancel := context.WithTimeout(context.Background(), 4*time.Second) defer cancel() @@ -50,8 +54,9 @@ func collectProcesses() *ProcessReport { r.Zombies = append(r.Zombies, proc) } - // Orphans: PPID == 1 and command contains an orama-related name. - if ppid == 1 && isOramaProcess(command) { + // Orphans: PPID == 1 and command is orama-related, + // but NOT a known systemd-managed service PID. + if ppid == 1 && isOramaProcess(command) && !managedPIDs[pid] { r.Orphans = append(r.Orphans, proc) } } @@ -77,6 +82,35 @@ func collectProcesses() *ProcessReport { return r } +// managedServiceUnits lists systemd units whose MainPID should be excluded from orphan detection. +var managedServiceUnits = []string{ + "orama-node", "orama-gateway", "orama-olric", + "orama-ipfs", "orama-ipfs-cluster", + "orama-anyone-relay", "orama-anyone-client", + "coredns", "caddy", "rqlited", +} + +// collectManagedPIDs queries systemd for the MainPID of each known service. +// Returns a set of PIDs that are legitimately managed by systemd (not orphans). 
+func collectManagedPIDs() map[int]bool { + pids := make(map[int]bool) + for _, unit := range managedServiceUnits { + ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second) + out, err := runCmd(ctx, "systemctl", "show", unit, "--property=MainPID") + cancel() + if err != nil { + continue + } + props := parseProperties(out) + if pidStr, ok := props["MainPID"]; ok { + if pid, err := strconv.Atoi(pidStr); err == nil && pid > 0 { + pids[pid] = true + } + } + } + return pids +} + // isOramaProcess checks if a command string contains any orama-related process name. func isOramaProcess(command string) bool { lower := strings.ToLower(command)