mirror of
https://github.com/DeBrosOfficial/orama.git
synced 2026-03-17 06:23:00 +00:00
Added some new alerts on monitoring
This commit is contained in:
parent
1e38fc2861
commit
f889c2e358
@ -203,6 +203,36 @@ sudo orama node doctor
|
|||||||
|
|
||||||
**Note:** Always use `orama node stop` instead of manually running `systemctl stop`. The CLI ensures all related services (including CoreDNS and Caddy on nameserver nodes) are handled correctly.
|
**Note:** Always use `orama node stop` instead of manually running `systemctl stop`. The CLI ensures all related services (including CoreDNS and Caddy on nameserver nodes) are handled correctly.
|
||||||
|
|
||||||
|
#### `orama node report`
|
||||||
|
|
||||||
|
Outputs comprehensive health data as JSON. Used by `orama monitor` over SSH:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
sudo orama node report --json
|
||||||
|
```
|
||||||
|
|
||||||
|
See [MONITORING.md](MONITORING.md) for full details.
|
||||||
|
|
||||||
|
#### `orama monitor`
|
||||||
|
|
||||||
|
Real-time cluster monitoring from your local machine:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Interactive TUI
|
||||||
|
orama monitor --env testnet
|
||||||
|
|
||||||
|
# Cluster overview
|
||||||
|
orama monitor cluster --env testnet
|
||||||
|
|
||||||
|
# Alerts only
|
||||||
|
orama monitor alerts --env testnet
|
||||||
|
|
||||||
|
# Full JSON for LLM analysis
|
||||||
|
orama monitor report --env testnet
|
||||||
|
```
|
||||||
|
|
||||||
|
See [MONITORING.md](MONITORING.md) for all subcommands and flags.
|
||||||
|
|
||||||
### Node Join Flow
|
### Node Join Flow
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
|
|||||||
275
docs/MONITORING.md
Normal file
275
docs/MONITORING.md
Normal file
@ -0,0 +1,275 @@
|
|||||||
|
# Monitoring
|
||||||
|
|
||||||
|
Real-time cluster health monitoring via SSH. The system has two parts:
|
||||||
|
|
||||||
|
1. **`orama node report`** — Runs on each VPS node, collects all local health data, outputs JSON
|
||||||
|
2. **`orama monitor`** — Runs on your local machine, SSHes into nodes, aggregates results, displays via TUI or tables
|
||||||
|
|
||||||
|
## Architecture
|
||||||
|
|
||||||
|
```
|
||||||
|
Developer Machine VPS Nodes (via SSH)
|
||||||
|
┌──────────────────┐ ┌────────────────────┐
|
||||||
|
│ orama monitor │ ──SSH──────────>│ orama node report │
|
||||||
|
│ (TUI / tables) │ <──JSON─────── │ (local collector) │
|
||||||
|
│ │ └────────────────────┘
|
||||||
|
│ CollectOnce() │ ──SSH──────────>│ orama node report │
|
||||||
|
│ DeriveAlerts() │ <──JSON─────── │ (local collector) │
|
||||||
|
│ Render() │ └────────────────────┘
|
||||||
|
└──────────────────┘
|
||||||
|
```
|
||||||
|
|
||||||
|
Each node runs `orama node report --json` locally (no SSH to other nodes), collecting data via `os/exec` and `net/http` to localhost services. The monitor SSHes into all nodes in parallel, collects reports, then runs cross-node analysis to detect cluster-wide issues.
|
||||||
|
|
||||||
|
## Quick Start
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Interactive TUI (auto-refreshes every 30s)
|
||||||
|
orama monitor --env testnet
|
||||||
|
|
||||||
|
# Cluster overview table
|
||||||
|
orama monitor cluster --env testnet
|
||||||
|
|
||||||
|
# Alerts only
|
||||||
|
orama monitor alerts --env testnet
|
||||||
|
|
||||||
|
# Full JSON report (pipe to jq or feed to LLM)
|
||||||
|
orama monitor report --env testnet
|
||||||
|
```
|
||||||
|
|
||||||
|
## `orama monitor` — Local Orchestrator
|
||||||
|
|
||||||
|
### Usage
|
||||||
|
|
||||||
|
```
|
||||||
|
orama monitor [subcommand] --env <environment> [flags]
|
||||||
|
```
|
||||||
|
|
||||||
|
Without a subcommand, launches the interactive TUI.
|
||||||
|
|
||||||
|
### Global Flags
|
||||||
|
|
||||||
|
| Flag | Default | Description |
|
||||||
|
|------|---------|-------------|
|
||||||
|
| `--env` | *(required)* | Environment: `devnet`, `testnet`, `mainnet` |
|
||||||
|
| `--json` | `false` | Machine-readable JSON output (for one-shot subcommands) |
|
||||||
|
| `--node` | | Filter to a specific node host/IP |
|
||||||
|
| `--config` | `scripts/remote-nodes.conf` | Path to node configuration file |
|
||||||
|
|
||||||
|
### Subcommands
|
||||||
|
|
||||||
|
| Subcommand | Description |
|
||||||
|
|------------|-------------|
|
||||||
|
| `live` | Interactive TUI monitor (default when no subcommand) |
|
||||||
|
| `cluster` | Cluster overview: all nodes, roles, RQLite state, WG peers |
|
||||||
|
| `node` | Per-node health details (system, services, WG, DNS) |
|
||||||
|
| `service` | Service status matrix across all nodes |
|
||||||
|
| `mesh` | WireGuard mesh connectivity and peer details |
|
||||||
|
| `dns` | DNS health: CoreDNS, Caddy, TLS cert expiry, resolution |
|
||||||
|
| `namespaces` | Namespace health across nodes |
|
||||||
|
| `alerts` | Active alerts and warnings sorted by severity |
|
||||||
|
| `report` | Full JSON dump optimized for LLM consumption |
|
||||||
|
|
||||||
|
### Examples
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Cluster overview
|
||||||
|
orama monitor cluster --env testnet
|
||||||
|
|
||||||
|
# Cluster overview as JSON
|
||||||
|
orama monitor cluster --env testnet --json
|
||||||
|
|
||||||
|
# Alerts for all nodes
|
||||||
|
orama monitor alerts --env testnet
|
||||||
|
|
||||||
|
# Single-node deep dive
|
||||||
|
orama monitor node --env testnet --node 51.195.109.238
|
||||||
|
|
||||||
|
# Services for one node
|
||||||
|
orama monitor service --env testnet --node 51.195.109.238
|
||||||
|
|
||||||
|
# WireGuard mesh details
|
||||||
|
orama monitor mesh --env testnet
|
||||||
|
|
||||||
|
# DNS health
|
||||||
|
orama monitor dns --env testnet
|
||||||
|
|
||||||
|
# Namespace health
|
||||||
|
orama monitor namespaces --env testnet
|
||||||
|
|
||||||
|
# Full report for LLM analysis
|
||||||
|
orama monitor report --env testnet | jq .
|
||||||
|
|
||||||
|
# Single-node report
|
||||||
|
orama monitor report --env testnet --node 51.195.109.238
|
||||||
|
|
||||||
|
# Custom config file
|
||||||
|
orama monitor cluster --config /path/to/nodes.conf --env devnet
|
||||||
|
```
|
||||||
|
|
||||||
|
### Interactive TUI
|
||||||
|
|
||||||
|
The `live` subcommand (default) launches a full-screen terminal UI:
|
||||||
|
|
||||||
|
**Tabs:** Overview | Nodes | Services | WG Mesh | DNS | Namespaces | Alerts
|
||||||
|
|
||||||
|
**Key Bindings:**
|
||||||
|
|
||||||
|
| Key | Action |
|
||||||
|
|-----|--------|
|
||||||
|
| `Tab` / `Shift+Tab` | Switch tabs |
|
||||||
|
| `j` / `k` or `↑` / `↓` | Scroll content |
|
||||||
|
| `r` | Force refresh |
|
||||||
|
| `q` / `Ctrl+C` | Quit |
|
||||||
|
|
||||||
|
The TUI auto-refreshes every 30 seconds. A spinner shows during data collection. Colors indicate health: green = healthy, red = critical, yellow = warning.
|
||||||
|
|
||||||
|
### LLM Report Format
|
||||||
|
|
||||||
|
`orama monitor report` outputs structured JSON designed for AI consumption:
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"meta": {
|
||||||
|
"environment": "testnet",
|
||||||
|
"collected_at": "2026-02-16T12:00:00Z",
|
||||||
|
"duration_seconds": 3.2,
|
||||||
|
"node_count": 3,
|
||||||
|
"healthy_count": 3
|
||||||
|
},
|
||||||
|
"summary": {
|
||||||
|
"rqlite_leader": "10.0.0.1",
|
||||||
|
"rqlite_voters": "3/3",
|
||||||
|
"rqlite_raft_term": 42,
|
||||||
|
"wg_mesh_status": "all connected",
|
||||||
|
"service_health": "all nominal",
|
||||||
|
"critical_alerts": 0,
|
||||||
|
"warning_alerts": 1,
|
||||||
|
"info_alerts": 0
|
||||||
|
},
|
||||||
|
"alerts": [...],
|
||||||
|
"nodes": [
|
||||||
|
{
|
||||||
|
"host": "51.195.109.238",
|
||||||
|
"status": "healthy",
|
||||||
|
"collection_ms": 526,
|
||||||
|
"report": { ... }
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
## `orama node report` — VPS-Side Collector
|
||||||
|
|
||||||
|
Runs locally on a VPS node. Collects all system and service data in parallel and outputs a single JSON blob. Requires root privileges.
|
||||||
|
|
||||||
|
### Usage
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# On a VPS node
|
||||||
|
sudo orama node report --json
|
||||||
|
```
|
||||||
|
|
||||||
|
### What It Collects
|
||||||
|
|
||||||
|
| Section | Data |
|
||||||
|
|---------|------|
|
||||||
|
| **system** | CPU count, load average, memory/disk/swap usage, OOM kills, kernel version, uptime, clock time |
|
||||||
|
| **services** | Systemd service states (active, restarts, memory, CPU, restart loop detection) for 10 core services |
|
||||||
|
| **rqlite** | Raft state, leader, term, applied/commit index, peers, strong read test, readyz, debug vars |
|
||||||
|
| **olric** | Service state, memberlist, member count, restarts, memory, log analysis |
|
||||||
|
| **ipfs** | Daemon/cluster state, swarm/cluster peers, repo size, versions, swarm key |
|
||||||
|
| **gateway** | HTTP health check, subsystem status |
|
||||||
|
| **wireguard** | Interface state, WG IP, peers, handshake ages, MTU, config permissions |
|
||||||
|
| **dns** | CoreDNS/Caddy state, port bindings, resolution tests, TLS cert expiry |
|
||||||
|
| **anyone** | Relay/client state, bootstrap progress, fingerprint |
|
||||||
|
| **network** | Internet reachability, TCP stats, retransmission rate, listening ports, UFW rules |
|
||||||
|
| **processes** | Zombie count, orphan orama processes, panic/fatal count in logs |
|
||||||
|
| **namespaces** | Per-namespace service probes (RQLite, Olric, Gateway) |
|
||||||
|
|
||||||
|
### Performance
|
||||||
|
|
||||||
|
All 12 collectors run in parallel as goroutines. Typical collection time is **< 1 second** per node. HTTP timeouts are 3 seconds; command timeouts are 4 seconds.
|
||||||
|
|
||||||
|
### Output Schema
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"timestamp": "2026-02-16T12:00:00Z",
|
||||||
|
"hostname": "ns1",
|
||||||
|
"version": "0.107.0",
|
||||||
|
"collect_ms": 526,
|
||||||
|
"errors": [],
|
||||||
|
"system": { "cpu_count": 4, "load_avg_1": 0.1, "mem_total_mb": 7937, ... },
|
||||||
|
"services": { "services": [...], "failed_units": [] },
|
||||||
|
"rqlite": { "responsive": true, "raft_state": "Leader", "term": 42, ... },
|
||||||
|
"olric": { "service_active": true, "memberlist_up": true, ... },
|
||||||
|
"ipfs": { "daemon_active": true, "swarm_peers": 2, ... },
|
||||||
|
"gateway": { "responsive": true, "http_status": 200, ... },
|
||||||
|
"wireguard": { "interface_up": true, "wg_ip": "10.0.0.1", "peers": [...], ... },
|
||||||
|
"dns": { "coredns_active": true, "caddy_active": true, "base_tls_days_left": 88, ... },
|
||||||
|
"anyone": { "relay_active": true, "bootstrapped": true, ... },
|
||||||
|
"network": { "internet_reachable": true, "ufw_active": true, ... },
|
||||||
|
"processes": { "zombie_count": 0, "orphan_count": 0, "panic_count": 0, ... },
|
||||||
|
"namespaces": []
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
## Alert Detection
|
||||||
|
|
||||||
|
Alerts are derived from cross-node analysis of all collected reports. Each alert has a severity level and identifies the affected subsystem and node.
|
||||||
|
|
||||||
|
### Alert Severities
|
||||||
|
|
||||||
|
| Severity | Examples |
|
||||||
|
|----------|----------|
|
||||||
|
| **critical** | SSH collection failed (node unreachable), no RQLite leader, split brain, RQLite unresponsive, WireGuard interface down, WG peer that has never completed a handshake, OOM kills, service failed, UFW inactive |
|
||||||
|
| **warning** | Strong read failed, memory > 90%, disk > 85%, stale WG handshake (> 3min), Raft term inconsistency, applied index lag > 100, restart loop detected, TLS cert < 14 days, DNS down, namespace gateway down, Anyone not bootstrapped, clock skew > 5s, binary version mismatch, internet unreachable, high TCP retransmission |
|
||||||
|
| **info** | Zombie processes, orphan orama processes, swap usage > 30% |
|
||||||
|
|
||||||
|
### Cross-Node Checks
|
||||||
|
|
||||||
|
These checks compare data across all nodes:
|
||||||
|
|
||||||
|
- **RQLite Leader**: Exactly one leader exists (no split brain)
|
||||||
|
- **Leader Agreement**: All nodes agree on the same leader address
|
||||||
|
- **Raft Term Consistency**: Term values within 1 of each other
|
||||||
|
- **Applied Index Lag**: Followers within 100 entries of the leader
|
||||||
|
- **WireGuard Peer Symmetry**: Each node has N-1 peers
|
||||||
|
- **Clock Skew**: Node clocks within 5 seconds of each other
|
||||||
|
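- **Binary Version**: All nodes running the same version
- **RQLite Quorum**: A majority of voters is reachable (warns when exactly at quorum, critical when below it)
- **Olric Membership**: Each active Olric node reports at least as many members as there are active Olric nodes
- **IPFS Swarm / Cluster Peers**: Each active IPFS node reports the expected number of swarm and cluster peers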
|
||||||
|
|
||||||
|
### Per-Node Checks
|
||||||
|
|
||||||
|
- **RQLite**: Responsive, ready, strong read
|
||||||
|
- **WireGuard**: Interface up, handshake freshness
|
||||||
|
- **System**: Memory, disk, load, OOM kills, swap
|
||||||
|
- **Services**: Systemd state, restart loops
|
||||||
|
- **DNS**: CoreDNS/Caddy up, TLS cert expiry, SOA resolution
|
||||||
|
- **Anyone**: Bootstrap progress
|
||||||
|
- **Processes**: Zombies, orphans, panics in logs
|
||||||
|
- **Namespaces**: Gateway and RQLite per namespace
|
||||||
|
- **Network**: UFW, internet reachability, TCP retransmission
|
||||||
|
|
||||||
|
## Monitor vs Inspector
|
||||||
|
|
||||||
|
Both tools check cluster health, but they serve different purposes:
|
||||||
|
|
||||||
|
| | `orama monitor` | `orama inspect` |
|
||||||
|
|---|---|---|
|
||||||
|
| **Data source** | `orama node report --json` (single SSH call per node) | 15+ SSH commands per node per subsystem |
|
||||||
|
| **Speed** | ~3-5s for full cluster | ~4-10s for full cluster |
|
||||||
|
| **Output** | TUI, tables, JSON | Tables, JSON |
|
||||||
|
| **Focus** | Real-time monitoring, alert detection | Deep diagnostic checks with pass/fail/warn |
|
||||||
|
| **AI support** | `report` subcommand for LLM input | `--ai` flag for inline analysis |
|
||||||
|
| **Use case** | "Is anything wrong right now?" | "What exactly is wrong and why?" |
|
||||||
|
|
||||||
|
Use `monitor` for day-to-day health checks and the interactive TUI. Use `inspect` for deep diagnostics when something is already known to be broken.
|
||||||
|
|
||||||
|
## Configuration
|
||||||
|
|
||||||
|
Uses the same `scripts/remote-nodes.conf` as the inspector. See [INSPECTOR.md](INSPECTOR.md#configuration) for format details.
|
||||||
|
|
||||||
|
## Prerequisites
|
||||||
|
|
||||||
|
Nodes must have the `orama` CLI installed (via `orama node install` or `upload-source.sh`). The monitor runs `sudo orama node report --json` over SSH, so the binary must be at `/usr/local/bin/orama` on each node.
|
||||||
@ -2,6 +2,7 @@ package monitor
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"fmt"
|
"fmt"
|
||||||
|
"strings"
|
||||||
|
|
||||||
"github.com/DeBrosOfficial/network/pkg/cli/production/report"
|
"github.com/DeBrosOfficial/network/pkg/cli/production/report"
|
||||||
)
|
)
|
||||||
@ -44,23 +45,17 @@ func DeriveAlerts(snap *ClusterSnapshot) []Alert {
|
|||||||
return alerts
|
return alerts
|
||||||
}
|
}
|
||||||
|
|
||||||
// Cross-node: RQLite leader
|
// Cross-node checks
|
||||||
alerts = append(alerts, checkRQLiteLeader(reports)...)
|
alerts = append(alerts, checkRQLiteLeader(reports)...)
|
||||||
|
alerts = append(alerts, checkRQLiteQuorum(reports)...)
|
||||||
// Cross-node: Raft term consistency
|
|
||||||
alerts = append(alerts, checkRaftTermConsistency(reports)...)
|
alerts = append(alerts, checkRaftTermConsistency(reports)...)
|
||||||
|
|
||||||
// Cross-node: Applied index lag
|
|
||||||
alerts = append(alerts, checkAppliedIndexLag(reports)...)
|
alerts = append(alerts, checkAppliedIndexLag(reports)...)
|
||||||
|
|
||||||
// Cross-node: WireGuard peer symmetry
|
|
||||||
alerts = append(alerts, checkWGPeerSymmetry(reports)...)
|
alerts = append(alerts, checkWGPeerSymmetry(reports)...)
|
||||||
|
|
||||||
// Cross-node: Clock skew
|
|
||||||
alerts = append(alerts, checkClockSkew(reports)...)
|
alerts = append(alerts, checkClockSkew(reports)...)
|
||||||
|
|
||||||
// Cross-node: Binary version
|
|
||||||
alerts = append(alerts, checkBinaryVersion(reports)...)
|
alerts = append(alerts, checkBinaryVersion(reports)...)
|
||||||
|
alerts = append(alerts, checkOlricMemberConsistency(reports)...)
|
||||||
|
alerts = append(alerts, checkIPFSSwarmConsistency(reports)...)
|
||||||
|
alerts = append(alerts, checkIPFSClusterConsistency(reports)...)
|
||||||
|
|
||||||
// Per-node checks
|
// Per-node checks
|
||||||
for _, r := range reports {
|
for _, r := range reports {
|
||||||
@ -74,6 +69,9 @@ func DeriveAlerts(snap *ClusterSnapshot) []Alert {
|
|||||||
alerts = append(alerts, checkNodeProcesses(r, host)...)
|
alerts = append(alerts, checkNodeProcesses(r, host)...)
|
||||||
alerts = append(alerts, checkNodeNamespaces(r, host)...)
|
alerts = append(alerts, checkNodeNamespaces(r, host)...)
|
||||||
alerts = append(alerts, checkNodeNetwork(r, host)...)
|
alerts = append(alerts, checkNodeNetwork(r, host)...)
|
||||||
|
alerts = append(alerts, checkNodeOlric(r, host)...)
|
||||||
|
alerts = append(alerts, checkNodeIPFS(r, host)...)
|
||||||
|
alerts = append(alerts, checkNodeGateway(r, host)...)
|
||||||
}
|
}
|
||||||
|
|
||||||
return alerts
|
return alerts
|
||||||
@ -86,7 +84,9 @@ func nodeHost(r *report.NodeReport) string {
|
|||||||
return r.Hostname
|
return r.Hostname
|
||||||
}
|
}
|
||||||
|
|
||||||
// --- Cross-node checks ---
|
// ---------------------------------------------------------------------------
|
||||||
|
// Cross-node checks
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
func checkRQLiteLeader(reports []*report.NodeReport) []Alert {
|
func checkRQLiteLeader(reports []*report.NodeReport) []Alert {
|
||||||
var alerts []Alert
|
var alerts []Alert
|
||||||
@ -116,6 +116,51 @@ func checkRQLiteLeader(reports []*report.NodeReport) []Alert {
|
|||||||
return alerts
|
return alerts
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func checkRQLiteQuorum(reports []*report.NodeReport) []Alert {
|
||||||
|
var voters, responsive int
|
||||||
|
for _, r := range reports {
|
||||||
|
if r.RQLite == nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if r.RQLite.Responsive {
|
||||||
|
responsive++
|
||||||
|
if r.RQLite.Voter {
|
||||||
|
voters++
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if responsive == 0 {
|
||||||
|
return nil // no rqlite data at all
|
||||||
|
}
|
||||||
|
|
||||||
|
// Total voters = responsive voters + unresponsive nodes that should be voters.
|
||||||
|
// For quorum calculation, use the total voter count (responsive + unreachable).
|
||||||
|
totalVoters := voters
|
||||||
|
for _, r := range reports {
|
||||||
|
if r.RQLite != nil && !r.RQLite.Responsive {
|
||||||
|
// Assume unresponsive nodes were voters (conservative estimate).
|
||||||
|
totalVoters++
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if totalVoters < 2 {
|
||||||
|
return nil // single-node cluster, no quorum concept
|
||||||
|
}
|
||||||
|
|
||||||
|
quorum := totalVoters/2 + 1
|
||||||
|
if voters < quorum {
|
||||||
|
return []Alert{{AlertCritical, "rqlite", "cluster",
|
||||||
|
fmt.Sprintf("Quorum lost: only %d/%d voters reachable (need %d)", voters, totalVoters, quorum)}}
|
||||||
|
}
|
||||||
|
if voters == quorum {
|
||||||
|
return []Alert{{AlertWarning, "rqlite", "cluster",
|
||||||
|
fmt.Sprintf("Quorum fragile: exactly %d/%d voters reachable (one more failure = quorum loss)", voters, totalVoters)}}
|
||||||
|
}
|
||||||
|
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
func checkRaftTermConsistency(reports []*report.NodeReport) []Alert {
|
func checkRaftTermConsistency(reports []*report.NodeReport) []Alert {
|
||||||
var minTerm, maxTerm uint64
|
var minTerm, maxTerm uint64
|
||||||
first := true
|
first := true
|
||||||
@ -126,7 +171,7 @@ func checkRaftTermConsistency(reports []*report.NodeReport) []Alert {
|
|||||||
if first {
|
if first {
|
||||||
minTerm = r.RQLite.Term
|
minTerm = r.RQLite.Term
|
||||||
maxTerm = r.RQLite.Term
|
maxTerm = r.RQLite.Term
|
||||||
first = true
|
first = false
|
||||||
}
|
}
|
||||||
if r.RQLite.Term < minTerm {
|
if r.RQLite.Term < minTerm {
|
||||||
minTerm = r.RQLite.Term
|
minTerm = r.RQLite.Term
|
||||||
@ -134,7 +179,6 @@ func checkRaftTermConsistency(reports []*report.NodeReport) []Alert {
|
|||||||
if r.RQLite.Term > maxTerm {
|
if r.RQLite.Term > maxTerm {
|
||||||
maxTerm = r.RQLite.Term
|
maxTerm = r.RQLite.Term
|
||||||
}
|
}
|
||||||
first = false
|
|
||||||
}
|
}
|
||||||
if maxTerm-minTerm > 1 {
|
if maxTerm-minTerm > 1 {
|
||||||
return []Alert{{AlertWarning, "rqlite", "cluster",
|
return []Alert{{AlertWarning, "rqlite", "cluster",
|
||||||
@ -166,10 +210,8 @@ func checkAppliedIndexLag(reports []*report.NodeReport) []Alert {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func checkWGPeerSymmetry(reports []*report.NodeReport) []Alert {
|
func checkWGPeerSymmetry(reports []*report.NodeReport) []Alert {
|
||||||
// Build map: wg_ip -> set of peer public keys
|
|
||||||
type nodeInfo struct {
|
type nodeInfo struct {
|
||||||
host string
|
host string
|
||||||
wgIP string
|
|
||||||
peerKeys map[string]bool
|
peerKeys map[string]bool
|
||||||
}
|
}
|
||||||
var nodes []nodeInfo
|
var nodes []nodeInfo
|
||||||
@ -177,14 +219,13 @@ func checkWGPeerSymmetry(reports []*report.NodeReport) []Alert {
|
|||||||
if r.WireGuard == nil || !r.WireGuard.InterfaceUp {
|
if r.WireGuard == nil || !r.WireGuard.InterfaceUp {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
ni := nodeInfo{host: nodeHost(r), wgIP: r.WireGuard.WgIP, peerKeys: map[string]bool{}}
|
ni := nodeInfo{host: nodeHost(r), peerKeys: map[string]bool{}}
|
||||||
for _, p := range r.WireGuard.Peers {
|
for _, p := range r.WireGuard.Peers {
|
||||||
ni.peerKeys[p.PublicKey] = true
|
ni.peerKeys[p.PublicKey] = true
|
||||||
}
|
}
|
||||||
nodes = append(nodes, ni)
|
nodes = append(nodes, ni)
|
||||||
}
|
}
|
||||||
|
|
||||||
// For WG peer symmetry, we check peer counts match (N-1 peers expected)
|
|
||||||
var alerts []Alert
|
var alerts []Alert
|
||||||
expectedPeers := len(nodes) - 1
|
expectedPeers := len(nodes) - 1
|
||||||
for _, ni := range nodes {
|
for _, ni := range nodes {
|
||||||
@ -254,22 +295,164 @@ func checkBinaryVersion(reports []*report.NodeReport) []Alert {
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// --- Per-node checks ---
|
func checkOlricMemberConsistency(reports []*report.NodeReport) []Alert {
|
||||||
|
// Count nodes where Olric is active to determine expected member count.
|
||||||
|
activeCount := 0
|
||||||
|
for _, r := range reports {
|
||||||
|
if r.Olric != nil && r.Olric.ServiceActive {
|
||||||
|
activeCount++
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if activeCount < 2 {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
var alerts []Alert
|
||||||
|
for _, r := range reports {
|
||||||
|
if r.Olric == nil || !r.Olric.ServiceActive || r.Olric.MemberCount == 0 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if r.Olric.MemberCount < activeCount {
|
||||||
|
alerts = append(alerts, Alert{AlertWarning, "olric", nodeHost(r),
|
||||||
|
fmt.Sprintf("Olric member count: %d (expected %d active nodes)", r.Olric.MemberCount, activeCount)})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return alerts
|
||||||
|
}
|
||||||
|
|
||||||
|
func checkIPFSSwarmConsistency(reports []*report.NodeReport) []Alert {
|
||||||
|
// Count IPFS-active nodes to determine expected peer count.
|
||||||
|
activeCount := 0
|
||||||
|
for _, r := range reports {
|
||||||
|
if r.IPFS != nil && r.IPFS.DaemonActive {
|
||||||
|
activeCount++
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if activeCount < 2 {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
expectedPeers := activeCount - 1
|
||||||
|
var alerts []Alert
|
||||||
|
for _, r := range reports {
|
||||||
|
if r.IPFS == nil || !r.IPFS.DaemonActive {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if r.IPFS.SwarmPeerCount == 0 {
|
||||||
|
alerts = append(alerts, Alert{AlertCritical, "ipfs", nodeHost(r),
|
||||||
|
"IPFS node isolated: 0 swarm peers"})
|
||||||
|
} else if r.IPFS.SwarmPeerCount < expectedPeers {
|
||||||
|
alerts = append(alerts, Alert{AlertWarning, "ipfs", nodeHost(r),
|
||||||
|
fmt.Sprintf("IPFS swarm peers: %d (expected %d)", r.IPFS.SwarmPeerCount, expectedPeers)})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return alerts
|
||||||
|
}
|
||||||
|
|
||||||
|
func checkIPFSClusterConsistency(reports []*report.NodeReport) []Alert {
|
||||||
|
activeCount := 0
|
||||||
|
for _, r := range reports {
|
||||||
|
if r.IPFS != nil && r.IPFS.ClusterActive {
|
||||||
|
activeCount++
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if activeCount < 2 {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
var alerts []Alert
|
||||||
|
for _, r := range reports {
|
||||||
|
if r.IPFS == nil || !r.IPFS.ClusterActive {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if r.IPFS.ClusterPeerCount < activeCount {
|
||||||
|
alerts = append(alerts, Alert{AlertWarning, "ipfs", nodeHost(r),
|
||||||
|
fmt.Sprintf("IPFS cluster peers: %d (expected %d)", r.IPFS.ClusterPeerCount, activeCount)})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return alerts
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Per-node checks
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
func checkNodeRQLite(r *report.NodeReport, host string) []Alert {
|
func checkNodeRQLite(r *report.NodeReport, host string) []Alert {
|
||||||
if r.RQLite == nil {
|
if r.RQLite == nil {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
var alerts []Alert
|
var alerts []Alert
|
||||||
|
|
||||||
if !r.RQLite.Responsive {
|
if !r.RQLite.Responsive {
|
||||||
alerts = append(alerts, Alert{AlertCritical, "rqlite", host, "RQLite not responding"})
|
alerts = append(alerts, Alert{AlertCritical, "rqlite", host, "RQLite not responding"})
|
||||||
|
return alerts // no point checking further
|
||||||
}
|
}
|
||||||
if r.RQLite.Responsive && !r.RQLite.Ready {
|
|
||||||
|
if !r.RQLite.Ready {
|
||||||
alerts = append(alerts, Alert{AlertWarning, "rqlite", host, "RQLite not ready (/readyz failed)"})
|
alerts = append(alerts, Alert{AlertWarning, "rqlite", host, "RQLite not ready (/readyz failed)"})
|
||||||
}
|
}
|
||||||
if r.RQLite.Responsive && !r.RQLite.StrongRead {
|
if !r.RQLite.StrongRead {
|
||||||
alerts = append(alerts, Alert{AlertWarning, "rqlite", host, "Strong read failed"})
|
alerts = append(alerts, Alert{AlertWarning, "rqlite", host, "Strong read failed"})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Raft state anomalies
|
||||||
|
if r.RQLite.RaftState == "Candidate" {
|
||||||
|
alerts = append(alerts, Alert{AlertWarning, "rqlite", host, "RQLite in election (Candidate state)"})
|
||||||
|
}
|
||||||
|
if r.RQLite.RaftState == "Shutdown" {
|
||||||
|
alerts = append(alerts, Alert{AlertCritical, "rqlite", host, "RQLite in Shutdown state"})
|
||||||
|
}
|
||||||
|
|
||||||
|
// FSM backlog
|
||||||
|
if r.RQLite.FsmPending > 10 {
|
||||||
|
alerts = append(alerts, Alert{AlertWarning, "rqlite", host,
|
||||||
|
fmt.Sprintf("RQLite FSM backlog: %d entries pending", r.RQLite.FsmPending)})
|
||||||
|
}
|
||||||
|
|
||||||
|
// Commit-applied gap (per-node, distinct from cross-node applied index lag)
|
||||||
|
if r.RQLite.Commit > 0 && r.RQLite.Applied > 0 && r.RQLite.Commit > r.RQLite.Applied {
|
||||||
|
gap := r.RQLite.Commit - r.RQLite.Applied
|
||||||
|
if gap > 100 {
|
||||||
|
alerts = append(alerts, Alert{AlertWarning, "rqlite", host,
|
||||||
|
fmt.Sprintf("RQLite commit-applied gap: %d (commit=%d, applied=%d)", gap, r.RQLite.Commit, r.RQLite.Applied)})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Resource pressure
|
||||||
|
if r.RQLite.Goroutines > 1000 {
|
||||||
|
alerts = append(alerts, Alert{AlertWarning, "rqlite", host,
|
||||||
|
fmt.Sprintf("RQLite goroutine count high: %d", r.RQLite.Goroutines)})
|
||||||
|
}
|
||||||
|
if r.RQLite.HeapMB > 1000 {
|
||||||
|
alerts = append(alerts, Alert{AlertWarning, "rqlite", host,
|
||||||
|
fmt.Sprintf("RQLite heap memory high: %dMB", r.RQLite.HeapMB)})
|
||||||
|
}
|
||||||
|
|
||||||
|
// Cluster partition detection: check if this node reports other nodes as unreachable
|
||||||
|
for nodeAddr, info := range r.RQLite.Nodes {
|
||||||
|
if !info.Reachable {
|
||||||
|
alerts = append(alerts, Alert{AlertCritical, "rqlite", host,
|
||||||
|
fmt.Sprintf("RQLite reports node %s unreachable (cluster partition)", nodeAddr)})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Debug vars
|
||||||
|
if dv := r.RQLite.DebugVars; dv != nil {
|
||||||
|
if dv.LeaderNotFound > 0 {
|
||||||
|
alerts = append(alerts, Alert{AlertWarning, "rqlite", host,
|
||||||
|
fmt.Sprintf("RQLite leader_not_found errors: %d", dv.LeaderNotFound)})
|
||||||
|
}
|
||||||
|
if dv.SnapshotErrors > 0 {
|
||||||
|
alerts = append(alerts, Alert{AlertWarning, "rqlite", host,
|
||||||
|
fmt.Sprintf("RQLite snapshot errors: %d", dv.SnapshotErrors)})
|
||||||
|
}
|
||||||
|
totalQueryErrors := dv.QueryErrors + dv.ExecuteErrors
|
||||||
|
if totalQueryErrors > 0 {
|
||||||
|
alerts = append(alerts, Alert{AlertInfo, "rqlite", host,
|
||||||
|
fmt.Sprintf("RQLite query/execute errors: %d", totalQueryErrors)})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
return alerts
|
return alerts
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -327,6 +510,14 @@ func checkNodeSystem(r *report.NodeReport, host string) []Alert {
|
|||||||
fmt.Sprintf("High load: %.1f (%.1fx CPU count)", r.System.LoadAvg1, loadRatio)})
|
fmt.Sprintf("High load: %.1f (%.1fx CPU count)", r.System.LoadAvg1, loadRatio)})
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
// Inode exhaustion
|
||||||
|
if r.System.InodePct > 95 {
|
||||||
|
alerts = append(alerts, Alert{AlertCritical, "system", host,
|
||||||
|
fmt.Sprintf("Inode exhaustion imminent: %d%%", r.System.InodePct)})
|
||||||
|
} else if r.System.InodePct > 90 {
|
||||||
|
alerts = append(alerts, Alert{AlertWarning, "system", host,
|
||||||
|
fmt.Sprintf("Inode usage at %d%%", r.System.InodePct)})
|
||||||
|
}
|
||||||
return alerts
|
return alerts
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -377,6 +568,24 @@ func checkNodeDNS(r *report.NodeReport, host string) []Alert {
|
|||||||
if r.DNS.CoreDNSActive && !r.DNS.SOAResolves {
|
if r.DNS.CoreDNSActive && !r.DNS.SOAResolves {
|
||||||
alerts = append(alerts, Alert{AlertWarning, "dns", host, "SOA record not resolving"})
|
alerts = append(alerts, Alert{AlertWarning, "dns", host, "SOA record not resolving"})
|
||||||
}
|
}
|
||||||
|
// Additional DNS checks (only when CoreDNS is running)
|
||||||
|
if r.DNS.CoreDNSActive {
|
||||||
|
if !r.DNS.WildcardResolves {
|
||||||
|
alerts = append(alerts, Alert{AlertWarning, "dns", host, "Wildcard DNS not resolving"})
|
||||||
|
}
|
||||||
|
if !r.DNS.BaseAResolves {
|
||||||
|
alerts = append(alerts, Alert{AlertWarning, "dns", host, "Base domain A record not resolving"})
|
||||||
|
}
|
||||||
|
if !r.DNS.NSResolves {
|
||||||
|
alerts = append(alerts, Alert{AlertWarning, "dns", host, "NS records not resolving"})
|
||||||
|
}
|
||||||
|
if !r.DNS.Port53Bound {
|
||||||
|
alerts = append(alerts, Alert{AlertCritical, "dns", host, "CoreDNS active but port 53 not bound"})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if r.DNS.CaddyActive && !r.DNS.Port443Bound {
|
||||||
|
alerts = append(alerts, Alert{AlertCritical, "dns", host, "Caddy active but port 443 not bound"})
|
||||||
|
}
|
||||||
return alerts
|
return alerts
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -442,6 +651,136 @@ func checkNodeNetwork(r *report.NodeReport, host string) []Alert {
|
|||||||
alerts = append(alerts, Alert{AlertWarning, "network", host,
|
alerts = append(alerts, Alert{AlertWarning, "network", host,
|
||||||
fmt.Sprintf("High TCP retransmission rate: %.1f%%", r.Network.TCPRetransRate)})
|
fmt.Sprintf("High TCP retransmission rate: %.1f%%", r.Network.TCPRetransRate)})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Check for internal ports exposed in UFW rules.
|
||||||
|
// Ports 5001 (RQLite), 6001 (Gateway), 3320 (Olric), 4501 (IPFS API) should be internal only.
|
||||||
|
internalPorts := []string{"5001", "6001", "3320", "4501"}
|
||||||
|
for _, rule := range r.Network.UFWRules {
|
||||||
|
ruleLower := strings.ToLower(rule)
|
||||||
|
// Only flag ALLOW rules (not deny/reject).
|
||||||
|
if !strings.Contains(ruleLower, "allow") {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
for _, port := range internalPorts {
|
||||||
|
// Match rules like "5001 ALLOW Anywhere" or "5001/tcp ALLOW IN"
|
||||||
|
// but not rules restricted to 10.0.0.0/24 (WG subnet).
|
||||||
|
if strings.Contains(rule, port) && !strings.Contains(rule, "10.0.0.") {
|
||||||
|
alerts = append(alerts, Alert{AlertCritical, "network", host,
|
||||||
|
fmt.Sprintf("Internal port %s exposed in UFW: %s", port, strings.TrimSpace(rule))})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return alerts
|
||||||
|
}
|
||||||
|
|
||||||
|
func checkNodeOlric(r *report.NodeReport, host string) []Alert {
|
||||||
|
if r.Olric == nil {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
var alerts []Alert
|
||||||
|
|
||||||
|
if !r.Olric.ServiceActive {
|
||||||
|
alerts = append(alerts, Alert{AlertCritical, "olric", host, "Olric service down"})
|
||||||
|
return alerts
|
||||||
|
}
|
||||||
|
if !r.Olric.MemberlistUp {
|
||||||
|
alerts = append(alerts, Alert{AlertCritical, "olric", host, "Olric memberlist port down"})
|
||||||
|
}
|
||||||
|
if r.Olric.LogSuspects > 0 {
|
||||||
|
alerts = append(alerts, Alert{AlertWarning, "olric", host,
|
||||||
|
fmt.Sprintf("Olric member suspects: %d in last hour", r.Olric.LogSuspects)})
|
||||||
|
}
|
||||||
|
if r.Olric.LogFlapping > 5 {
|
||||||
|
alerts = append(alerts, Alert{AlertWarning, "olric", host,
|
||||||
|
fmt.Sprintf("Olric members flapping: %d join/leave events in last hour", r.Olric.LogFlapping)})
|
||||||
|
}
|
||||||
|
if r.Olric.LogErrors > 20 {
|
||||||
|
alerts = append(alerts, Alert{AlertWarning, "olric", host,
|
||||||
|
fmt.Sprintf("High Olric error rate: %d errors in last hour", r.Olric.LogErrors)})
|
||||||
|
}
|
||||||
|
if r.Olric.RestartCount > 3 {
|
||||||
|
alerts = append(alerts, Alert{AlertWarning, "olric", host,
|
||||||
|
fmt.Sprintf("Olric excessive restarts: %d", r.Olric.RestartCount)})
|
||||||
|
}
|
||||||
|
if r.Olric.ProcessMemMB > 500 {
|
||||||
|
alerts = append(alerts, Alert{AlertWarning, "olric", host,
|
||||||
|
fmt.Sprintf("Olric high memory: %dMB", r.Olric.ProcessMemMB)})
|
||||||
|
}
|
||||||
|
|
||||||
|
return alerts
|
||||||
|
}
|
||||||
|
|
||||||
|
func checkNodeIPFS(r *report.NodeReport, host string) []Alert {
|
||||||
|
if r.IPFS == nil {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
var alerts []Alert
|
||||||
|
|
||||||
|
if !r.IPFS.DaemonActive {
|
||||||
|
alerts = append(alerts, Alert{AlertCritical, "ipfs", host, "IPFS daemon down"})
|
||||||
|
}
|
||||||
|
if !r.IPFS.ClusterActive {
|
||||||
|
alerts = append(alerts, Alert{AlertCritical, "ipfs", host, "IPFS cluster down"})
|
||||||
|
}
|
||||||
|
|
||||||
|
// Only check these if daemon is running (otherwise data is meaningless).
|
||||||
|
if r.IPFS.DaemonActive {
|
||||||
|
if r.IPFS.SwarmPeerCount == 0 {
|
||||||
|
alerts = append(alerts, Alert{AlertCritical, "ipfs", host, "IPFS isolated: no swarm peers"})
|
||||||
|
}
|
||||||
|
if !r.IPFS.HasSwarmKey {
|
||||||
|
alerts = append(alerts, Alert{AlertCritical, "ipfs", host,
|
||||||
|
"IPFS swarm key missing (private network compromised)"})
|
||||||
|
}
|
||||||
|
if !r.IPFS.BootstrapEmpty {
|
||||||
|
alerts = append(alerts, Alert{AlertWarning, "ipfs", host,
|
||||||
|
"IPFS bootstrap list not empty (should be empty for private swarm)"})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if r.IPFS.RepoUsePct > 95 {
|
||||||
|
alerts = append(alerts, Alert{AlertCritical, "ipfs", host,
|
||||||
|
fmt.Sprintf("IPFS repo nearly full: %d%%", r.IPFS.RepoUsePct)})
|
||||||
|
} else if r.IPFS.RepoUsePct > 90 {
|
||||||
|
alerts = append(alerts, Alert{AlertWarning, "ipfs", host,
|
||||||
|
fmt.Sprintf("IPFS repo at %d%%", r.IPFS.RepoUsePct)})
|
||||||
|
}
|
||||||
|
|
||||||
|
if r.IPFS.ClusterErrors > 0 {
|
||||||
|
alerts = append(alerts, Alert{AlertWarning, "ipfs", host,
|
||||||
|
fmt.Sprintf("IPFS cluster peer errors: %d", r.IPFS.ClusterErrors)})
|
||||||
|
}
|
||||||
|
|
||||||
|
return alerts
|
||||||
|
}
|
||||||
|
|
||||||
|
func checkNodeGateway(r *report.NodeReport, host string) []Alert {
|
||||||
|
if r.Gateway == nil {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
var alerts []Alert
|
||||||
|
|
||||||
|
if !r.Gateway.Responsive {
|
||||||
|
alerts = append(alerts, Alert{AlertCritical, "gateway", host, "Gateway not responding"})
|
||||||
|
return alerts
|
||||||
|
}
|
||||||
|
|
||||||
|
if r.Gateway.HTTPStatus != 200 {
|
||||||
|
alerts = append(alerts, Alert{AlertWarning, "gateway", host,
|
||||||
|
fmt.Sprintf("Gateway health check returned HTTP %d", r.Gateway.HTTPStatus)})
|
||||||
|
}
|
||||||
|
|
||||||
|
for name, sub := range r.Gateway.Subsystems {
|
||||||
|
if sub.Status != "ok" && sub.Status != "" {
|
||||||
|
msg := fmt.Sprintf("Gateway subsystem %s: status=%s", name, sub.Status)
|
||||||
|
if sub.Error != "" {
|
||||||
|
msg += fmt.Sprintf(" error=%s", sub.Error)
|
||||||
|
}
|
||||||
|
alerts = append(alerts, Alert{AlertWarning, "gateway", host, msg})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
return alerts
|
return alerts
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -451,4 +790,3 @@ func truncateKey(key string) string {
|
|||||||
}
|
}
|
||||||
return key
|
return key
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -16,6 +16,10 @@ var oramaProcessNames = []string{
|
|||||||
func collectProcesses() *ProcessReport {
|
func collectProcesses() *ProcessReport {
|
||||||
r := &ProcessReport{}
|
r := &ProcessReport{}
|
||||||
|
|
||||||
|
// Collect known systemd-managed PIDs to avoid false positive orphan detection.
|
||||||
|
// Processes with PPID=1 that are systemd-managed daemons are NOT orphans.
|
||||||
|
managedPIDs := collectManagedPIDs()
|
||||||
|
|
||||||
// Run ps once and reuse the output for both zombies and orphans.
|
// Run ps once and reuse the output for both zombies and orphans.
|
||||||
ctx, cancel := context.WithTimeout(context.Background(), 4*time.Second)
|
ctx, cancel := context.WithTimeout(context.Background(), 4*time.Second)
|
||||||
defer cancel()
|
defer cancel()
|
||||||
@ -50,8 +54,9 @@ func collectProcesses() *ProcessReport {
|
|||||||
r.Zombies = append(r.Zombies, proc)
|
r.Zombies = append(r.Zombies, proc)
|
||||||
}
|
}
|
||||||
|
|
||||||
// Orphans: PPID == 1 and command contains an orama-related name.
|
// Orphans: PPID == 1 and command is orama-related,
|
||||||
if ppid == 1 && isOramaProcess(command) {
|
// but NOT a known systemd-managed service PID.
|
||||||
|
if ppid == 1 && isOramaProcess(command) && !managedPIDs[pid] {
|
||||||
r.Orphans = append(r.Orphans, proc)
|
r.Orphans = append(r.Orphans, proc)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -77,6 +82,35 @@ func collectProcesses() *ProcessReport {
|
|||||||
return r
|
return r
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// managedServiceUnits names the systemd units whose MainPID is looked up by
// collectManagedPIDs so that those processes are not reported as orphans.
var managedServiceUnits = []string{
	// Orama services.
	"orama-node",
	"orama-gateway",
	"orama-olric",
	"orama-ipfs",
	"orama-ipfs-cluster",
	"orama-anyone-relay",
	"orama-anyone-client",

	// Supporting daemons.
	"coredns",
	"caddy",
	"rqlited",
}
|
||||||
|
|
||||||
|
// collectManagedPIDs queries systemd for the MainPID of each known service.
|
||||||
|
// Returns a set of PIDs that are legitimately managed by systemd (not orphans).
|
||||||
|
func collectManagedPIDs() map[int]bool {
|
||||||
|
pids := make(map[int]bool)
|
||||||
|
for _, unit := range managedServiceUnits {
|
||||||
|
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
|
||||||
|
out, err := runCmd(ctx, "systemctl", "show", unit, "--property=MainPID")
|
||||||
|
cancel()
|
||||||
|
if err != nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
props := parseProperties(out)
|
||||||
|
if pidStr, ok := props["MainPID"]; ok {
|
||||||
|
if pid, err := strconv.Atoi(pidStr); err == nil && pid > 0 {
|
||||||
|
pids[pid] = true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return pids
|
||||||
|
}
|
||||||
|
|
||||||
// isOramaProcess checks if a command string contains any orama-related process name.
|
// isOramaProcess checks if a command string contains any orama-related process name.
|
||||||
func isOramaProcess(command string) bool {
|
func isOramaProcess(command string) bool {
|
||||||
lower := strings.ToLower(command)
|
lower := strings.ToLower(command)
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user