diff --git a/README.md b/README.md index 61fa656..059a55d 100644 --- a/README.md +++ b/README.md @@ -130,11 +130,11 @@ orama deploy go --name myapp # Go binaries (must have /health e orama deploy nodejs --name myapp # Node.js apps (must have /health endpoint) # Manage deployments -orama deployments list # List all deployments -orama deployments get # Get deployment details -orama deployments logs --follow # View logs -orama deployments delete # Delete deployment -orama deployments rollback --version 1 # Rollback to version +orama app list # List all deployments +orama app get # Get deployment details +orama app logs --follow # View logs +orama app delete # Delete deployment +orama app rollback --version 1 # Rollback to version ``` ### SQLite Databases @@ -147,28 +147,12 @@ orama db backup # Backup to IPFS orama db backups # List backups ``` -### Network Status +### Environment Management ```bash -orama health # Cluster health check -orama peers # List connected peers -orama status # Network status -``` - -### RQLite Operations - -```bash -orama query "SELECT * FROM users" -orama query "CREATE TABLE users (id INTEGER PRIMARY KEY)" -orama transaction --file ops.json -``` - -### Pub/Sub - -```bash -orama pubsub publish -orama pubsub subscribe 30s -orama pubsub topics +orama env list # List available environments +orama env current # Show active environment +orama env use # Switch environment ``` ## Serverless Functions (WASM) @@ -267,14 +251,14 @@ Orama Network integrates with the [Anyone Protocol](https://anyone.io) for anony ```bash # Install as relay operator (earn rewards) -sudo orama install --vps-ip --domain \ +sudo orama node install --vps-ip --domain \ --anyone-relay \ --anyone-nickname "MyRelay" \ --anyone-contact "operator@email.com" \ --anyone-wallet "0x1234...abcd" # With exit relay (legal implications apply) -sudo orama install --vps-ip --domain \ +sudo orama node install --vps-ip --domain \ --anyone-relay \ --anyone-exit \ --anyone-nickname "MyExitRelay" 
\ @@ -282,7 +266,7 @@ sudo orama install --vps-ip --domain \ --anyone-wallet "0x1234...abcd" # Migrate existing Anyone installation -sudo orama install --vps-ip --domain \ +sudo orama node install --vps-ip --domain \ --anyone-relay \ --anyone-migrate \ --anyone-nickname "MyRelay" \ @@ -317,31 +301,34 @@ go install github.com/DeBrosOfficial/network/cmd/cli@latest **Setup (after installation):** ```bash -sudo orama install --interactive +sudo orama node install --interactive ``` ### Service Management ```bash # Status -orama status +sudo orama node status # Control services -sudo orama start -sudo orama stop -sudo orama restart +sudo orama node start +sudo orama node stop +sudo orama node restart + +# Diagnose issues +sudo orama node doctor # View logs -orama logs node --follow -orama logs gateway --follow -orama logs ipfs --follow +orama node logs node --follow +orama node logs gateway --follow +orama node logs ipfs --follow ``` ### Upgrade ```bash # Upgrade to latest version -sudo orama upgrade --interactive +sudo orama node upgrade --restart ``` ## Configuration @@ -397,9 +384,9 @@ rqlite -H localhost -p 5001 ```bash # Production reset (⚠️ DESTROYS DATA) -sudo orama uninstall +sudo orama node uninstall sudo rm -rf /opt/orama/.orama -sudo orama install +sudo orama node install ``` ## HTTP Gateway API diff --git a/cmd/cli/root.go b/cmd/cli/root.go index 021448a..11df00f 100644 --- a/cmd/cli/root.go +++ b/cmd/cli/root.go @@ -13,6 +13,7 @@ import ( deploycmd "github.com/DeBrosOfficial/network/pkg/cli/cmd/deploy" "github.com/DeBrosOfficial/network/pkg/cli/cmd/envcmd" "github.com/DeBrosOfficial/network/pkg/cli/cmd/inspectcmd" + "github.com/DeBrosOfficial/network/pkg/cli/cmd/monitorcmd" "github.com/DeBrosOfficial/network/pkg/cli/cmd/namespacecmd" "github.com/DeBrosOfficial/network/pkg/cli/cmd/node" ) @@ -75,6 +76,9 @@ and interacting with the Orama distributed network.`, // Inspect command rootCmd.AddCommand(inspectcmd.Cmd) + // Monitor command + 
rootCmd.AddCommand(monitorcmd.Cmd) + return rootCmd } diff --git a/pkg/cli/cmd/monitorcmd/monitor.go b/pkg/cli/cmd/monitorcmd/monitor.go new file mode 100644 index 0000000..f1a9495 --- /dev/null +++ b/pkg/cli/cmd/monitorcmd/monitor.go @@ -0,0 +1,200 @@ +package monitorcmd + +import ( + "context" + "os" + "time" + + "github.com/DeBrosOfficial/network/pkg/cli/monitor" + "github.com/DeBrosOfficial/network/pkg/cli/monitor/display" + "github.com/DeBrosOfficial/network/pkg/cli/monitor/tui" + "github.com/spf13/cobra" +) + +// Cmd is the root monitor command. +var Cmd = &cobra.Command{ + Use: "monitor", + Short: "Monitor cluster health from your local machine", + Long: `SSH into cluster nodes and display real-time health data. +Runs 'orama node report --json' on each node and aggregates results. + +Without a subcommand, launches the interactive TUI.`, + RunE: runLive, +} + +// Shared persistent flags. +var ( + flagEnv string + flagJSON bool + flagNode string + flagConfig string +) + +func init() { + Cmd.PersistentFlags().StringVar(&flagEnv, "env", "", "Environment: devnet, testnet, mainnet (required)") + Cmd.PersistentFlags().BoolVar(&flagJSON, "json", false, "Machine-readable JSON output") + Cmd.PersistentFlags().StringVar(&flagNode, "node", "", "Filter to specific node host/IP") + Cmd.PersistentFlags().StringVar(&flagConfig, "config", "scripts/remote-nodes.conf", "Path to remote-nodes.conf") + Cmd.MarkPersistentFlagRequired("env") + + Cmd.AddCommand(liveCmd) + Cmd.AddCommand(clusterCmd) + Cmd.AddCommand(nodeCmd) + Cmd.AddCommand(serviceCmd) + Cmd.AddCommand(meshCmd) + Cmd.AddCommand(dnsCmd) + Cmd.AddCommand(namespacesCmd) + Cmd.AddCommand(alertsCmd) + Cmd.AddCommand(reportCmd) +} + +// --------------------------------------------------------------------------- +// Subcommands +// --------------------------------------------------------------------------- + +var liveCmd = &cobra.Command{ + Use: "live", + Short: "Interactive TUI monitor", + RunE: runLive, +} + +var 
clusterCmd = &cobra.Command{ + Use: "cluster", + Short: "Cluster overview (one-shot)", + RunE: func(cmd *cobra.Command, args []string) error { + snap, err := collectSnapshot() + if err != nil { + return err + } + if flagJSON { + return display.ClusterJSON(snap, os.Stdout) + } + return display.ClusterTable(snap, os.Stdout) + }, +} + +var nodeCmd = &cobra.Command{ + Use: "node", + Short: "Per-node health details (one-shot)", + RunE: func(cmd *cobra.Command, args []string) error { + snap, err := collectSnapshot() + if err != nil { + return err + } + if flagJSON { + return display.NodeJSON(snap, os.Stdout) + } + return display.NodeTable(snap, os.Stdout) + }, +} + +var serviceCmd = &cobra.Command{ + Use: "service", + Short: "Service status across the cluster (one-shot)", + RunE: func(cmd *cobra.Command, args []string) error { + snap, err := collectSnapshot() + if err != nil { + return err + } + if flagJSON { + return display.ServiceJSON(snap, os.Stdout) + } + return display.ServiceTable(snap, os.Stdout) + }, +} + +var meshCmd = &cobra.Command{ + Use: "mesh", + Short: "Mesh connectivity status (one-shot)", + RunE: func(cmd *cobra.Command, args []string) error { + snap, err := collectSnapshot() + if err != nil { + return err + } + if flagJSON { + return display.MeshJSON(snap, os.Stdout) + } + return display.MeshTable(snap, os.Stdout) + }, +} + +var dnsCmd = &cobra.Command{ + Use: "dns", + Short: "DNS health overview (one-shot)", + RunE: func(cmd *cobra.Command, args []string) error { + snap, err := collectSnapshot() + if err != nil { + return err + } + if flagJSON { + return display.DNSJSON(snap, os.Stdout) + } + return display.DNSTable(snap, os.Stdout) + }, +} + +var namespacesCmd = &cobra.Command{ + Use: "namespaces", + Short: "Namespace usage summary (one-shot)", + RunE: func(cmd *cobra.Command, args []string) error { + snap, err := collectSnapshot() + if err != nil { + return err + } + if flagJSON { + return display.NamespacesJSON(snap, os.Stdout) + } + return 
display.NamespacesTable(snap, os.Stdout) + }, +} + +var alertsCmd = &cobra.Command{ + Use: "alerts", + Short: "Active alerts and warnings (one-shot)", + RunE: func(cmd *cobra.Command, args []string) error { + snap, err := collectSnapshot() + if err != nil { + return err + } + if flagJSON { + return display.AlertsJSON(snap, os.Stdout) + } + return display.AlertsTable(snap, os.Stdout) + }, +} + +var reportCmd = &cobra.Command{ + Use: "report", + Short: "Full cluster report (JSON)", + RunE: func(cmd *cobra.Command, args []string) error { + snap, err := collectSnapshot() + if err != nil { + return err + } + return display.FullReport(snap, os.Stdout) + }, +} + +// --------------------------------------------------------------------------- +// Helpers +// --------------------------------------------------------------------------- + +func collectSnapshot() (*monitor.ClusterSnapshot, error) { + cfg := newConfig() + return monitor.CollectOnce(context.Background(), cfg) +} + +func newConfig() monitor.CollectorConfig { + return monitor.CollectorConfig{ + ConfigPath: flagConfig, + Env: flagEnv, + NodeFilter: flagNode, + Timeout: 30 * time.Second, + } +} + +func runLive(cmd *cobra.Command, args []string) error { + cfg := newConfig() + return tui.Run(cfg) +} + diff --git a/pkg/cli/cmd/node/node.go b/pkg/cli/cmd/node/node.go index a17c19b..400f7fb 100644 --- a/pkg/cli/cmd/node/node.go +++ b/pkg/cli/cmd/node/node.go @@ -25,4 +25,5 @@ func init() { Cmd.AddCommand(inviteCmd) Cmd.AddCommand(migrateCmd) Cmd.AddCommand(doctorCmd) + Cmd.AddCommand(reportCmd) } diff --git a/pkg/cli/cmd/node/report.go b/pkg/cli/cmd/node/report.go new file mode 100644 index 0000000..ad25b7b --- /dev/null +++ b/pkg/cli/cmd/node/report.go @@ -0,0 +1,22 @@ +package node + +import ( + "github.com/DeBrosOfficial/network/pkg/cli/production/report" + "github.com/spf13/cobra" +) + +var reportCmd = &cobra.Command{ + Use: "report", + Short: "Output comprehensive node health data as JSON", + Long: `Collect all system 
and service data from this node and output +as a single JSON blob. Designed to be called by 'orama monitor' over SSH. +Requires root privileges for full data collection.`, + RunE: func(cmd *cobra.Command, args []string) error { + jsonFlag, _ := cmd.Flags().GetBool("json") + return report.Handle(jsonFlag, "") + }, +} + +func init() { + reportCmd.Flags().Bool("json", true, "Output as JSON (default)") +} diff --git a/pkg/cli/monitor/alerts.go b/pkg/cli/monitor/alerts.go new file mode 100644 index 0000000..64c4216 --- /dev/null +++ b/pkg/cli/monitor/alerts.go @@ -0,0 +1,454 @@ +package monitor + +import ( + "fmt" + + "github.com/DeBrosOfficial/network/pkg/cli/production/report" +) + +// AlertSeverity represents the severity of an alert. +type AlertSeverity string + +const ( + AlertCritical AlertSeverity = "critical" + AlertWarning AlertSeverity = "warning" + AlertInfo AlertSeverity = "info" +) + +// Alert represents a detected issue. +type Alert struct { + Severity AlertSeverity `json:"severity"` + Subsystem string `json:"subsystem"` + Node string `json:"node"` + Message string `json:"message"` +} + +// DeriveAlerts scans a ClusterSnapshot and produces alerts. +func DeriveAlerts(snap *ClusterSnapshot) []Alert { + var alerts []Alert + + // Collection failures + for _, cs := range snap.Nodes { + if cs.Error != nil { + alerts = append(alerts, Alert{ + Severity: AlertCritical, + Subsystem: "ssh", + Node: cs.Node.Host, + Message: fmt.Sprintf("Collection failed: %v", cs.Error), + }) + } + } + + reports := snap.Healthy() + if len(reports) == 0 { + return alerts + } + + // Cross-node: RQLite leader + alerts = append(alerts, checkRQLiteLeader(reports)...) + + // Cross-node: Raft term consistency + alerts = append(alerts, checkRaftTermConsistency(reports)...) + + // Cross-node: Applied index lag + alerts = append(alerts, checkAppliedIndexLag(reports)...) + + // Cross-node: WireGuard peer symmetry + alerts = append(alerts, checkWGPeerSymmetry(reports)...) 
+ + // Cross-node: Clock skew + alerts = append(alerts, checkClockSkew(reports)...) + + // Cross-node: Binary version + alerts = append(alerts, checkBinaryVersion(reports)...) + + // Per-node checks + for _, r := range reports { + host := nodeHost(r) + alerts = append(alerts, checkNodeRQLite(r, host)...) + alerts = append(alerts, checkNodeWireGuard(r, host)...) + alerts = append(alerts, checkNodeSystem(r, host)...) + alerts = append(alerts, checkNodeServices(r, host)...) + alerts = append(alerts, checkNodeDNS(r, host)...) + alerts = append(alerts, checkNodeAnyone(r, host)...) + alerts = append(alerts, checkNodeProcesses(r, host)...) + alerts = append(alerts, checkNodeNamespaces(r, host)...) + alerts = append(alerts, checkNodeNetwork(r, host)...) + } + + return alerts +} + +func nodeHost(r *report.NodeReport) string { + if r.PublicIP != "" { + return r.PublicIP + } + return r.Hostname +} + +// --- Cross-node checks --- + +func checkRQLiteLeader(reports []*report.NodeReport) []Alert { + var alerts []Alert + leaders := 0 + leaderAddrs := map[string]bool{} + for _, r := range reports { + if r.RQLite != nil && r.RQLite.RaftState == "Leader" { + leaders++ + } + if r.RQLite != nil && r.RQLite.LeaderAddr != "" { + leaderAddrs[r.RQLite.LeaderAddr] = true + } + } + + if leaders == 0 { + alerts = append(alerts, Alert{AlertCritical, "rqlite", "cluster", "No RQLite leader found"}) + } else if leaders > 1 { + alerts = append(alerts, Alert{AlertCritical, "rqlite", "cluster", + fmt.Sprintf("Split brain: %d leaders detected", leaders)}) + } + + if len(leaderAddrs) > 1 { + alerts = append(alerts, Alert{AlertWarning, "rqlite", "cluster", + fmt.Sprintf("Leader disagreement: nodes report %d different leader addresses", len(leaderAddrs))}) + } + + return alerts +} + +func checkRaftTermConsistency(reports []*report.NodeReport) []Alert { + var minTerm, maxTerm uint64 + first := true + for _, r := range reports { + if r.RQLite == nil || !r.RQLite.Responsive { + continue + } + if first { + 
minTerm = r.RQLite.Term + maxTerm = r.RQLite.Term + first = true + } + if r.RQLite.Term < minTerm { + minTerm = r.RQLite.Term + } + if r.RQLite.Term > maxTerm { + maxTerm = r.RQLite.Term + } + first = false + } + if maxTerm-minTerm > 1 { + return []Alert{{AlertWarning, "rqlite", "cluster", + fmt.Sprintf("Raft term inconsistency: min=%d, max=%d (delta=%d)", minTerm, maxTerm, maxTerm-minTerm)}} + } + return nil +} + +func checkAppliedIndexLag(reports []*report.NodeReport) []Alert { + var maxApplied uint64 + for _, r := range reports { + if r.RQLite != nil && r.RQLite.Applied > maxApplied { + maxApplied = r.RQLite.Applied + } + } + + var alerts []Alert + for _, r := range reports { + if r.RQLite == nil || !r.RQLite.Responsive { + continue + } + lag := maxApplied - r.RQLite.Applied + if lag > 100 { + alerts = append(alerts, Alert{AlertWarning, "rqlite", nodeHost(r), + fmt.Sprintf("Applied index lag: %d behind leader (local=%d, max=%d)", lag, r.RQLite.Applied, maxApplied)}) + } + } + return alerts +} + +func checkWGPeerSymmetry(reports []*report.NodeReport) []Alert { + // Build map: wg_ip -> set of peer public keys + type nodeInfo struct { + host string + wgIP string + peerKeys map[string]bool + } + var nodes []nodeInfo + for _, r := range reports { + if r.WireGuard == nil || !r.WireGuard.InterfaceUp { + continue + } + ni := nodeInfo{host: nodeHost(r), wgIP: r.WireGuard.WgIP, peerKeys: map[string]bool{}} + for _, p := range r.WireGuard.Peers { + ni.peerKeys[p.PublicKey] = true + } + nodes = append(nodes, ni) + } + + // For WG peer symmetry, we check peer counts match (N-1 peers expected) + var alerts []Alert + expectedPeers := len(nodes) - 1 + for _, ni := range nodes { + if len(ni.peerKeys) < expectedPeers { + alerts = append(alerts, Alert{AlertCritical, "wireguard", ni.host, + fmt.Sprintf("WG peer count mismatch: has %d peers, expected %d", len(ni.peerKeys), expectedPeers)}) + } + } + + return alerts +} + +func checkClockSkew(reports []*report.NodeReport) []Alert { + 
var times []struct { + host string + t int64 + } + for _, r := range reports { + if r.System != nil && r.System.TimeUnix > 0 { + times = append(times, struct { + host string + t int64 + }{nodeHost(r), r.System.TimeUnix}) + } + } + if len(times) < 2 { + return nil + } + + var minT, maxT int64 = times[0].t, times[0].t + var minHost, maxHost string = times[0].host, times[0].host + for _, t := range times[1:] { + if t.t < minT { + minT = t.t + minHost = t.host + } + if t.t > maxT { + maxT = t.t + maxHost = t.host + } + } + + delta := maxT - minT + if delta > 5 { + return []Alert{{AlertWarning, "system", "cluster", + fmt.Sprintf("Clock skew: %ds between %s and %s", delta, minHost, maxHost)}} + } + return nil +} + +func checkBinaryVersion(reports []*report.NodeReport) []Alert { + versions := map[string][]string{} // version -> list of hosts + for _, r := range reports { + v := r.Version + if v == "" { + v = "unknown" + } + versions[v] = append(versions[v], nodeHost(r)) + } + if len(versions) > 1 { + msg := "Binary version mismatch:" + for v, hosts := range versions { + msg += fmt.Sprintf(" %s=%v", v, hosts) + } + return []Alert{{AlertWarning, "system", "cluster", msg}} + } + return nil +} + +// --- Per-node checks --- + +func checkNodeRQLite(r *report.NodeReport, host string) []Alert { + if r.RQLite == nil { + return nil + } + var alerts []Alert + if !r.RQLite.Responsive { + alerts = append(alerts, Alert{AlertCritical, "rqlite", host, "RQLite not responding"}) + } + if r.RQLite.Responsive && !r.RQLite.Ready { + alerts = append(alerts, Alert{AlertWarning, "rqlite", host, "RQLite not ready (/readyz failed)"}) + } + if r.RQLite.Responsive && !r.RQLite.StrongRead { + alerts = append(alerts, Alert{AlertWarning, "rqlite", host, "Strong read failed"}) + } + return alerts +} + +func checkNodeWireGuard(r *report.NodeReport, host string) []Alert { + if r.WireGuard == nil { + return nil + } + var alerts []Alert + if !r.WireGuard.InterfaceUp { + alerts = append(alerts, 
Alert{AlertCritical, "wireguard", host, "WireGuard interface down"}) + return alerts + } + for _, p := range r.WireGuard.Peers { + if p.HandshakeAgeSec > 180 && p.LatestHandshake > 0 { + alerts = append(alerts, Alert{AlertWarning, "wireguard", host, + fmt.Sprintf("Stale WG handshake with peer %s: %ds ago", truncateKey(p.PublicKey), p.HandshakeAgeSec)}) + } + if p.LatestHandshake == 0 { + alerts = append(alerts, Alert{AlertCritical, "wireguard", host, + fmt.Sprintf("WG peer %s has never handshaked", truncateKey(p.PublicKey))}) + } + } + return alerts +} + +func checkNodeSystem(r *report.NodeReport, host string) []Alert { + if r.System == nil { + return nil + } + var alerts []Alert + if r.System.MemUsePct > 90 { + alerts = append(alerts, Alert{AlertWarning, "system", host, + fmt.Sprintf("Memory at %d%%", r.System.MemUsePct)}) + } + if r.System.DiskUsePct > 85 { + alerts = append(alerts, Alert{AlertWarning, "system", host, + fmt.Sprintf("Disk at %d%%", r.System.DiskUsePct)}) + } + if r.System.OOMKills > 0 { + alerts = append(alerts, Alert{AlertCritical, "system", host, + fmt.Sprintf("%d OOM kills detected", r.System.OOMKills)}) + } + if r.System.SwapUsedMB > 0 && r.System.SwapTotalMB > 0 { + pct := r.System.SwapUsedMB * 100 / r.System.SwapTotalMB + if pct > 30 { + alerts = append(alerts, Alert{AlertInfo, "system", host, + fmt.Sprintf("Swap usage at %d%%", pct)}) + } + } + // High load + if r.System.CPUCount > 0 { + loadRatio := r.System.LoadAvg1 / float64(r.System.CPUCount) + if loadRatio > 2.0 { + alerts = append(alerts, Alert{AlertWarning, "system", host, + fmt.Sprintf("High load: %.1f (%.1fx CPU count)", r.System.LoadAvg1, loadRatio)}) + } + } + return alerts +} + +func checkNodeServices(r *report.NodeReport, host string) []Alert { + if r.Services == nil { + return nil + } + var alerts []Alert + for _, svc := range r.Services.Services { + if svc.ActiveState == "failed" { + alerts = append(alerts, Alert{AlertCritical, "service", host, + fmt.Sprintf("Service %s is 
FAILED", svc.Name)}) + } else if svc.ActiveState != "active" && svc.ActiveState != "" && svc.ActiveState != "unknown" { + alerts = append(alerts, Alert{AlertWarning, "service", host, + fmt.Sprintf("Service %s is %s", svc.Name, svc.ActiveState)}) + } + if svc.RestartLoopRisk { + alerts = append(alerts, Alert{AlertCritical, "service", host, + fmt.Sprintf("Service %s restart loop: %d restarts, active for %ds", svc.Name, svc.NRestarts, svc.ActiveSinceSec)}) + } + } + for _, unit := range r.Services.FailedUnits { + alerts = append(alerts, Alert{AlertWarning, "service", host, + fmt.Sprintf("Failed systemd unit: %s", unit)}) + } + return alerts +} + +func checkNodeDNS(r *report.NodeReport, host string) []Alert { + if r.DNS == nil { + return nil + } + var alerts []Alert + if !r.DNS.CoreDNSActive { + alerts = append(alerts, Alert{AlertCritical, "dns", host, "CoreDNS is down"}) + } + if !r.DNS.CaddyActive { + alerts = append(alerts, Alert{AlertCritical, "dns", host, "Caddy is down"}) + } + if r.DNS.BaseTLSDaysLeft >= 0 && r.DNS.BaseTLSDaysLeft < 14 { + alerts = append(alerts, Alert{AlertWarning, "dns", host, + fmt.Sprintf("Base TLS cert expires in %d days", r.DNS.BaseTLSDaysLeft)}) + } + if r.DNS.WildTLSDaysLeft >= 0 && r.DNS.WildTLSDaysLeft < 14 { + alerts = append(alerts, Alert{AlertWarning, "dns", host, + fmt.Sprintf("Wildcard TLS cert expires in %d days", r.DNS.WildTLSDaysLeft)}) + } + if r.DNS.CoreDNSActive && !r.DNS.SOAResolves { + alerts = append(alerts, Alert{AlertWarning, "dns", host, "SOA record not resolving"}) + } + return alerts +} + +func checkNodeAnyone(r *report.NodeReport, host string) []Alert { + if r.Anyone == nil { + return nil + } + var alerts []Alert + if (r.Anyone.RelayActive || r.Anyone.ClientActive) && !r.Anyone.Bootstrapped { + alerts = append(alerts, Alert{AlertWarning, "anyone", host, + fmt.Sprintf("Anyone bootstrap at %d%%", r.Anyone.BootstrapPct)}) + } + return alerts +} + +func checkNodeProcesses(r *report.NodeReport, host string) []Alert { + 
if r.Processes == nil { + return nil + } + var alerts []Alert + if r.Processes.ZombieCount > 0 { + alerts = append(alerts, Alert{AlertInfo, "system", host, + fmt.Sprintf("%d zombie processes", r.Processes.ZombieCount)}) + } + if r.Processes.OrphanCount > 0 { + alerts = append(alerts, Alert{AlertInfo, "system", host, + fmt.Sprintf("%d orphan orama processes", r.Processes.OrphanCount)}) + } + if r.Processes.PanicCount > 0 { + alerts = append(alerts, Alert{AlertCritical, "system", host, + fmt.Sprintf("%d panic/fatal in orama-node logs (1h)", r.Processes.PanicCount)}) + } + return alerts +} + +func checkNodeNamespaces(r *report.NodeReport, host string) []Alert { + var alerts []Alert + for _, ns := range r.Namespaces { + if !ns.GatewayUp { + alerts = append(alerts, Alert{AlertWarning, "namespace", host, + fmt.Sprintf("Namespace %s gateway down", ns.Name)}) + } + if !ns.RQLiteUp { + alerts = append(alerts, Alert{AlertWarning, "namespace", host, + fmt.Sprintf("Namespace %s RQLite down", ns.Name)}) + } + } + return alerts +} + +func checkNodeNetwork(r *report.NodeReport, host string) []Alert { + if r.Network == nil { + return nil + } + var alerts []Alert + if !r.Network.UFWActive { + alerts = append(alerts, Alert{AlertCritical, "network", host, "UFW firewall is inactive"}) + } + if !r.Network.InternetReachable { + alerts = append(alerts, Alert{AlertWarning, "network", host, "Internet not reachable (ping 8.8.8.8 failed)"}) + } + if r.Network.TCPRetransRate > 5.0 { + alerts = append(alerts, Alert{AlertWarning, "network", host, + fmt.Sprintf("High TCP retransmission rate: %.1f%%", r.Network.TCPRetransRate)}) + } + return alerts +} + +func truncateKey(key string) string { + if len(key) > 8 { + return key[:8] + "..." 
+ } + return key +} + diff --git a/pkg/cli/monitor/collector.go b/pkg/cli/monitor/collector.go new file mode 100644 index 0000000..2adc726 --- /dev/null +++ b/pkg/cli/monitor/collector.go @@ -0,0 +1,115 @@ +package monitor + +import ( + "context" + "encoding/json" + "fmt" + "sync" + "time" + + "github.com/DeBrosOfficial/network/pkg/cli/production/report" + "github.com/DeBrosOfficial/network/pkg/inspector" +) + +// CollectorConfig holds configuration for the collection pipeline. +type CollectorConfig struct { + ConfigPath string + Env string + NodeFilter string + Timeout time.Duration +} + +// CollectOnce runs `sudo orama node report --json` on all matching nodes +// in parallel and returns a ClusterSnapshot. +func CollectOnce(ctx context.Context, cfg CollectorConfig) (*ClusterSnapshot, error) { + nodes, err := inspector.LoadNodes(cfg.ConfigPath) + if err != nil { + return nil, fmt.Errorf("load nodes: %w", err) + } + nodes = inspector.FilterByEnv(nodes, cfg.Env) + if cfg.NodeFilter != "" { + nodes = filterByHost(nodes, cfg.NodeFilter) + } + if len(nodes) == 0 { + return nil, fmt.Errorf("no nodes found for env %q", cfg.Env) + } + + timeout := cfg.Timeout + if timeout == 0 { + timeout = 30 * time.Second + } + + start := time.Now() + snap := &ClusterSnapshot{ + Environment: cfg.Env, + CollectedAt: start, + Nodes: make([]CollectionStatus, len(nodes)), + } + + var wg sync.WaitGroup + for i, node := range nodes { + wg.Add(1) + go func(idx int, n inspector.Node) { + defer wg.Done() + snap.Nodes[idx] = collectNodeReport(ctx, n, timeout) + }(i, node) + } + wg.Wait() + + snap.Duration = time.Since(start) + snap.Alerts = DeriveAlerts(snap) + + return snap, nil +} + +// collectNodeReport SSHes into a single node and parses the JSON report. 
// truncate returns s unchanged when it is at most maxLen bytes long;
// otherwise it returns a prefix of at most maxLen bytes followed by "...".
//
// The cut is moved back to the nearest rune boundary so that a multi-byte
// UTF-8 character is never split in half, and a negative maxLen is treated
// as zero instead of panicking.
func truncate(s string, maxLen int) string {
	if maxLen < 0 {
		maxLen = 0
	}
	if len(s) <= maxLen {
		return s
	}

	// Ranging over a string yields the byte offset of each rune start; keep
	// the largest offset that does not exceed maxLen.
	cut := 0
	for i := range s {
		if i > maxLen {
			break
		}
		cut = i
	}
	return s[:cut] + "..."
}
+func AlertsTable(snap *monitor.ClusterSnapshot, w io.Writer) error { + critCount, warnCount := countAlerts(snap.Alerts) + + fmt.Fprintf(w, "%s\n", styleBold.Render( + fmt.Sprintf("Alerts \u2014 %s (%d critical, %d warning)", + snap.Environment, critCount, warnCount))) + fmt.Fprintln(w, strings.Repeat("\u2550", 44)) + fmt.Fprintln(w) + + if len(snap.Alerts) == 0 { + fmt.Fprintln(w, styleGreen.Render(" No alerts")) + return nil + } + + // Sort by severity: critical first, then warning, then info + sorted := make([]monitor.Alert, len(snap.Alerts)) + copy(sorted, snap.Alerts) + sort.Slice(sorted, func(i, j int) bool { + return severityRank(sorted[i].Severity) < severityRank(sorted[j].Severity) + }) + + for _, a := range sorted { + tag := severityTag(a.Severity) + node := a.Node + if node == "" { + node = "cluster" + } + fmt.Fprintf(w, "%s %-18s %-12s %s\n", + tag, node, a.Subsystem, a.Message) + } + + return nil +} + +// AlertsJSON writes alerts as JSON. +func AlertsJSON(snap *monitor.ClusterSnapshot, w io.Writer) error { + return writeJSON(w, snap.Alerts) +} + +// severityRank returns a sort rank for severity (lower = higher priority). +func severityRank(s monitor.AlertSeverity) int { + switch s { + case monitor.AlertCritical: + return 0 + case monitor.AlertWarning: + return 1 + case monitor.AlertInfo: + return 2 + default: + return 3 + } +} diff --git a/pkg/cli/monitor/display/cluster.go b/pkg/cli/monitor/display/cluster.go new file mode 100644 index 0000000..53ee53f --- /dev/null +++ b/pkg/cli/monitor/display/cluster.go @@ -0,0 +1,204 @@ +package display + +import ( + "fmt" + "io" + "strings" + + "github.com/DeBrosOfficial/network/pkg/cli/monitor" +) + +// ClusterTable prints a cluster overview table to w. 
+func ClusterTable(snap *monitor.ClusterSnapshot, w io.Writer) error { + dur := snap.Duration.Seconds() + fmt.Fprintf(w, "%s\n", styleBold.Render( + fmt.Sprintf("Cluster Overview \u2014 %s (%d nodes, collected in %.1fs)", + snap.Environment, snap.TotalCount(), dur))) + fmt.Fprintln(w, strings.Repeat("\u2550", 60)) + fmt.Fprintln(w) + + // Header + fmt.Fprintf(w, "%-18s %-12s %-6s %-6s %-11s %-5s %s\n", + styleHeader.Render("NODE"), + styleHeader.Render("ROLE"), + styleHeader.Render("MEM"), + styleHeader.Render("DISK"), + styleHeader.Render("RQLITE"), + styleHeader.Render("WG"), + styleHeader.Render("SERVICES")) + fmt.Fprintln(w, separator(70)) + + // Healthy nodes + for _, cs := range snap.Nodes { + if cs.Error != nil { + continue + } + r := cs.Report + if r == nil { + continue + } + + host := cs.Node.Host + role := cs.Node.Role + + // Memory % + memStr := "--" + if r.System != nil { + memStr = fmt.Sprintf("%d%%", r.System.MemUsePct) + } + + // Disk % + diskStr := "--" + if r.System != nil { + diskStr = fmt.Sprintf("%d%%", r.System.DiskUsePct) + } + + // RQLite state + rqliteStr := "--" + if r.RQLite != nil && r.RQLite.Responsive { + rqliteStr = r.RQLite.RaftState + } else if r.RQLite != nil { + rqliteStr = styleRed.Render("DOWN") + } + + // WireGuard + wgStr := statusIcon(r.WireGuard != nil && r.WireGuard.InterfaceUp) + + // Services: active/total + svcStr := "--" + if r.Services != nil { + active := 0 + total := len(r.Services.Services) + for _, svc := range r.Services.Services { + if svc.ActiveState == "active" { + active++ + } + } + svcStr = fmt.Sprintf("%d/%d", active, total) + } + + fmt.Fprintf(w, "%-18s %-12s %-6s %-6s %-11s %-5s %s\n", + host, role, memStr, diskStr, rqliteStr, wgStr, svcStr) + } + + // Unreachable nodes + failed := snap.Failed() + if len(failed) > 0 { + fmt.Fprintln(w) + for _, cs := range failed { + fmt.Fprintf(w, "%-18s %-12s %s\n", + styleRed.Render(cs.Node.Host), + cs.Node.Role, + styleRed.Render("UNREACHABLE")) + } + } + + // Alerts 
summary + critCount, warnCount := countAlerts(snap.Alerts) + fmt.Fprintln(w) + fmt.Fprintf(w, "Alerts: %s critical, %s warning\n", + alertCountStr(critCount, monitor.AlertCritical), + alertCountStr(warnCount, monitor.AlertWarning)) + + for _, a := range snap.Alerts { + if a.Severity == monitor.AlertCritical || a.Severity == monitor.AlertWarning { + tag := severityTag(a.Severity) + fmt.Fprintf(w, " %s %s: %s\n", tag, a.Node, a.Message) + } + } + + return nil +} + +// ClusterJSON writes the cluster snapshot as JSON. +func ClusterJSON(snap *monitor.ClusterSnapshot, w io.Writer) error { + type clusterEntry struct { + Host string `json:"host"` + Role string `json:"role"` + MemPct int `json:"mem_pct"` + DiskPct int `json:"disk_pct"` + RQLite string `json:"rqlite_state"` + WGUp bool `json:"wg_up"` + Services string `json:"services"` + Status string `json:"status"` + Error string `json:"error,omitempty"` + } + + var entries []clusterEntry + for _, cs := range snap.Nodes { + e := clusterEntry{ + Host: cs.Node.Host, + Role: cs.Node.Role, + } + if cs.Error != nil { + e.Status = "unreachable" + e.Error = cs.Error.Error() + entries = append(entries, e) + continue + } + r := cs.Report + if r == nil { + e.Status = "unreachable" + entries = append(entries, e) + continue + } + e.Status = "ok" + if r.System != nil { + e.MemPct = r.System.MemUsePct + e.DiskPct = r.System.DiskUsePct + } + if r.RQLite != nil && r.RQLite.Responsive { + e.RQLite = r.RQLite.RaftState + } + e.WGUp = r.WireGuard != nil && r.WireGuard.InterfaceUp + if r.Services != nil { + active := 0 + total := len(r.Services.Services) + for _, svc := range r.Services.Services { + if svc.ActiveState == "active" { + active++ + } + } + e.Services = fmt.Sprintf("%d/%d", active, total) + } + entries = append(entries, e) + } + + return writeJSON(w, entries) +} + +// countAlerts returns the number of critical and warning alerts. 
+func countAlerts(alerts []monitor.Alert) (crit, warn int) { + for _, a := range alerts { + switch a.Severity { + case monitor.AlertCritical: + crit++ + case monitor.AlertWarning: + warn++ + } + } + return +} + +// severityTag returns a colored tag like [CRIT], [WARN], [INFO]. +func severityTag(s monitor.AlertSeverity) string { + switch s { + case monitor.AlertCritical: + return styleRed.Render("[CRIT]") + case monitor.AlertWarning: + return styleYellow.Render("[WARN]") + case monitor.AlertInfo: + return styleMuted.Render("[INFO]") + default: + return styleMuted.Render("[????]") + } +} + +// alertCountStr renders the count with appropriate color. +func alertCountStr(count int, sev monitor.AlertSeverity) string { + s := fmt.Sprintf("%d", count) + if count > 0 { + return severityColor(sev).Render(s) + } + return s +} diff --git a/pkg/cli/monitor/display/dns.go b/pkg/cli/monitor/display/dns.go new file mode 100644 index 0000000..b38b9d1 --- /dev/null +++ b/pkg/cli/monitor/display/dns.go @@ -0,0 +1,129 @@ +package display + +import ( + "fmt" + "io" + "strings" + + "github.com/DeBrosOfficial/network/pkg/cli/monitor" +) + +// DNSTable prints DNS status for nameserver nodes to w. 
+func DNSTable(snap *monitor.ClusterSnapshot, w io.Writer) error { + fmt.Fprintf(w, "%s\n", styleBold.Render( + fmt.Sprintf("DNS Status \u2014 %s", snap.Environment))) + fmt.Fprintln(w, strings.Repeat("\u2550", 22)) + fmt.Fprintln(w) + + // Header + fmt.Fprintf(w, "%-18s %-9s %-7s %-5s %-5s %-10s %-10s %s\n", + styleHeader.Render("NODE"), + styleHeader.Render("COREDNS"), + styleHeader.Render("CADDY"), + styleHeader.Render("SOA"), + styleHeader.Render("NS"), + styleHeader.Render("WILDCARD"), + styleHeader.Render("BASE TLS"), + styleHeader.Render("WILD TLS")) + fmt.Fprintln(w, separator(78)) + + found := false + for _, cs := range snap.Nodes { + // Only show nameserver nodes + if !cs.Node.IsNameserver() { + continue + } + found = true + + if cs.Error != nil || cs.Report == nil { + fmt.Fprintf(w, "%-18s %s\n", + styleRed.Render(cs.Node.Host), + styleRed.Render("UNREACHABLE")) + continue + } + + r := cs.Report + if r.DNS == nil { + fmt.Fprintf(w, "%-18s %s\n", + cs.Node.Host, + styleMuted.Render("no DNS data")) + continue + } + + dns := r.DNS + fmt.Fprintf(w, "%-18s %-9s %-7s %-5s %-5s %-10s %-10s %s\n", + cs.Node.Host, + statusIcon(dns.CoreDNSActive), + statusIcon(dns.CaddyActive), + statusIcon(dns.SOAResolves), + statusIcon(dns.NSResolves), + statusIcon(dns.WildcardResolves), + tlsDaysStr(dns.BaseTLSDaysLeft), + tlsDaysStr(dns.WildTLSDaysLeft)) + } + + if !found { + fmt.Fprintln(w, styleMuted.Render(" No nameserver nodes found")) + } + + return nil +} + +// DNSJSON writes DNS status as JSON. 
+func DNSJSON(snap *monitor.ClusterSnapshot, w io.Writer) error { + type dnsEntry struct { + Host string `json:"host"` + CoreDNSActive bool `json:"coredns_active"` + CaddyActive bool `json:"caddy_active"` + SOAResolves bool `json:"soa_resolves"` + NSResolves bool `json:"ns_resolves"` + WildcardResolves bool `json:"wildcard_resolves"` + BaseTLSDaysLeft int `json:"base_tls_days_left"` + WildTLSDaysLeft int `json:"wild_tls_days_left"` + Error string `json:"error,omitempty"` + } + + var entries []dnsEntry + for _, cs := range snap.Nodes { + if !cs.Node.IsNameserver() { + continue + } + e := dnsEntry{Host: cs.Node.Host} + if cs.Error != nil { + e.Error = cs.Error.Error() + entries = append(entries, e) + continue + } + if cs.Report == nil || cs.Report.DNS == nil { + entries = append(entries, e) + continue + } + dns := cs.Report.DNS + e.CoreDNSActive = dns.CoreDNSActive + e.CaddyActive = dns.CaddyActive + e.SOAResolves = dns.SOAResolves + e.NSResolves = dns.NSResolves + e.WildcardResolves = dns.WildcardResolves + e.BaseTLSDaysLeft = dns.BaseTLSDaysLeft + e.WildTLSDaysLeft = dns.WildTLSDaysLeft + entries = append(entries, e) + } + + return writeJSON(w, entries) +} + +// tlsDaysStr formats TLS days left with appropriate coloring. +func tlsDaysStr(days int) string { + if days < 0 { + return styleMuted.Render("--") + } + s := fmt.Sprintf("%d days", days) + switch { + case days < 7: + return styleRed.Render(s) + case days < 30: + return styleYellow.Render(s) + default: + return styleGreen.Render(s) + } +} diff --git a/pkg/cli/monitor/display/mesh.go b/pkg/cli/monitor/display/mesh.go new file mode 100644 index 0000000..c380d69 --- /dev/null +++ b/pkg/cli/monitor/display/mesh.go @@ -0,0 +1,194 @@ +package display + +import ( + "fmt" + "io" + "strings" + + "github.com/DeBrosOfficial/network/pkg/cli/monitor" +) + +// MeshTable prints WireGuard mesh status to w. 
+func MeshTable(snap *monitor.ClusterSnapshot, w io.Writer) error { + fmt.Fprintf(w, "%s\n", styleBold.Render( + fmt.Sprintf("WireGuard Mesh \u2014 %s", snap.Environment))) + fmt.Fprintln(w, strings.Repeat("\u2550", 28)) + fmt.Fprintln(w) + + // Header + fmt.Fprintf(w, "%-18s %-12s %-7s %-7s %s\n", + styleHeader.Render("NODE"), + styleHeader.Render("WG IP"), + styleHeader.Render("PORT"), + styleHeader.Render("PEERS"), + styleHeader.Render("STATUS")) + fmt.Fprintln(w, separator(54)) + + // Collect mesh info for peer details + type meshNode struct { + host string + wgIP string + port int + peers int + total int + healthy bool + } + var meshNodes []meshNode + + expectedPeers := snap.HealthyCount() - 1 + + for _, cs := range snap.Nodes { + if cs.Error != nil || cs.Report == nil { + continue + } + r := cs.Report + if r.WireGuard == nil { + fmt.Fprintf(w, "%-18s %s\n", cs.Node.Host, styleMuted.Render("no WireGuard")) + continue + } + + wg := r.WireGuard + peerCount := wg.PeerCount + allOK := wg.InterfaceUp + if allOK { + for _, p := range wg.Peers { + if p.LatestHandshake == 0 || p.HandshakeAgeSec > 180 { + allOK = false + break + } + } + } + + mn := meshNode{ + host: cs.Node.Host, + wgIP: wg.WgIP, + port: wg.ListenPort, + peers: peerCount, + total: expectedPeers, + healthy: allOK, + } + meshNodes = append(meshNodes, mn) + + peerStr := fmt.Sprintf("%d/%d", peerCount, expectedPeers) + statusStr := statusIcon(allOK) + if !wg.InterfaceUp { + statusStr = styleRed.Render("DOWN") + } + + fmt.Fprintf(w, "%-18s %-12s %-7d %-7s %s\n", + cs.Node.Host, wg.WgIP, wg.ListenPort, peerStr, statusStr) + } + + // Peer details + fmt.Fprintln(w) + fmt.Fprintln(w, styleBold.Render("Peer Details:")) + + for _, cs := range snap.Nodes { + if cs.Error != nil || cs.Report == nil || cs.Report.WireGuard == nil { + continue + } + wg := cs.Report.WireGuard + if !wg.InterfaceUp { + continue + } + localIP := wg.WgIP + for _, p := range wg.Peers { + hsAge := formatDuration(p.HandshakeAgeSec) + rx := 
formatBytes(p.TransferRx) + tx := formatBytes(p.TransferTx) + + peerIP := p.AllowedIPs + // Strip CIDR if present + if idx := strings.Index(peerIP, "/"); idx > 0 { + peerIP = peerIP[:idx] + } + + hsColor := styleGreen + if p.LatestHandshake == 0 { + hsAge = "never" + hsColor = styleRed + } else if p.HandshakeAgeSec > 180 { + hsColor = styleYellow + } + + fmt.Fprintf(w, " %s \u2194 %s: handshake %s, rx: %s, tx: %s\n", + localIP, peerIP, hsColor.Render(hsAge), rx, tx) + } + } + + return nil +} + +// MeshJSON writes the WireGuard mesh as JSON. +func MeshJSON(snap *monitor.ClusterSnapshot, w io.Writer) error { + type peerEntry struct { + AllowedIPs string `json:"allowed_ips"` + HandshakeAgeSec int64 `json:"handshake_age_sec"` + TransferRxBytes int64 `json:"transfer_rx_bytes"` + TransferTxBytes int64 `json:"transfer_tx_bytes"` + } + type meshEntry struct { + Host string `json:"host"` + WgIP string `json:"wg_ip"` + ListenPort int `json:"listen_port"` + PeerCount int `json:"peer_count"` + Up bool `json:"up"` + Peers []peerEntry `json:"peers,omitempty"` + } + + var entries []meshEntry + for _, cs := range snap.Nodes { + if cs.Error != nil || cs.Report == nil || cs.Report.WireGuard == nil { + continue + } + wg := cs.Report.WireGuard + e := meshEntry{ + Host: cs.Node.Host, + WgIP: wg.WgIP, + ListenPort: wg.ListenPort, + PeerCount: wg.PeerCount, + Up: wg.InterfaceUp, + } + for _, p := range wg.Peers { + e.Peers = append(e.Peers, peerEntry{ + AllowedIPs: p.AllowedIPs, + HandshakeAgeSec: p.HandshakeAgeSec, + TransferRxBytes: p.TransferRx, + TransferTxBytes: p.TransferTx, + }) + } + entries = append(entries, e) + } + + return writeJSON(w, entries) +} + +// formatDuration formats seconds into a human-readable string. +func formatDuration(sec int64) string { + if sec < 60 { + return fmt.Sprintf("%ds ago", sec) + } + if sec < 3600 { + return fmt.Sprintf("%dm ago", sec/60) + } + return fmt.Sprintf("%dh ago", sec/3600) +} + +// formatBytes formats bytes into a human-readable string. 
+func formatBytes(b int64) string { + const ( + kb = 1024 + mb = 1024 * kb + gb = 1024 * mb + ) + switch { + case b >= gb: + return fmt.Sprintf("%.1fGB", float64(b)/float64(gb)) + case b >= mb: + return fmt.Sprintf("%.1fMB", float64(b)/float64(mb)) + case b >= kb: + return fmt.Sprintf("%.1fKB", float64(b)/float64(kb)) + default: + return fmt.Sprintf("%dB", b) + } +} diff --git a/pkg/cli/monitor/display/namespaces.go b/pkg/cli/monitor/display/namespaces.go new file mode 100644 index 0000000..f097ce5 --- /dev/null +++ b/pkg/cli/monitor/display/namespaces.go @@ -0,0 +1,114 @@ +package display + +import ( + "fmt" + "io" + "sort" + "strings" + + "github.com/DeBrosOfficial/network/pkg/cli/monitor" +) + +// NamespacesTable prints per-namespace health across nodes to w. +func NamespacesTable(snap *monitor.ClusterSnapshot, w io.Writer) error { + fmt.Fprintf(w, "%s\n", styleBold.Render( + fmt.Sprintf("Namespace Health \u2014 %s", snap.Environment))) + fmt.Fprintln(w, strings.Repeat("\u2550", 28)) + fmt.Fprintln(w) + + // Collect all namespace entries across nodes + type nsRow struct { + namespace string + host string + rqlite string + olric string + gateway string + } + + var rows []nsRow + nsNames := map[string]bool{} + + for _, cs := range snap.Nodes { + if cs.Error != nil || cs.Report == nil { + continue + } + for _, ns := range cs.Report.Namespaces { + nsNames[ns.Name] = true + + rqliteStr := statusIcon(ns.RQLiteUp) + if ns.RQLiteUp && ns.RQLiteState != "" { + rqliteStr = ns.RQLiteState + } + + rows = append(rows, nsRow{ + namespace: ns.Name, + host: cs.Node.Host, + rqlite: rqliteStr, + olric: statusIcon(ns.OlricUp), + gateway: statusIcon(ns.GatewayUp), + }) + } + } + + if len(rows) == 0 { + fmt.Fprintln(w, styleMuted.Render(" No namespaces found")) + return nil + } + + // Sort by namespace name, then host + sort.Slice(rows, func(i, j int) bool { + if rows[i].namespace != rows[j].namespace { + return rows[i].namespace < rows[j].namespace + } + return rows[i].host < 
rows[j].host + }) + + // Header + fmt.Fprintf(w, "%-13s %-18s %-11s %-7s %s\n", + styleHeader.Render("NAMESPACE"), + styleHeader.Render("NODE"), + styleHeader.Render("RQLITE"), + styleHeader.Render("OLRIC"), + styleHeader.Render("GATEWAY")) + fmt.Fprintln(w, separator(58)) + + for _, r := range rows { + fmt.Fprintf(w, "%-13s %-18s %-11s %-7s %s\n", + r.namespace, r.host, r.rqlite, r.olric, r.gateway) + } + + return nil +} + +// NamespacesJSON writes namespace health as JSON. +func NamespacesJSON(snap *monitor.ClusterSnapshot, w io.Writer) error { + type nsEntry struct { + Namespace string `json:"namespace"` + Host string `json:"host"` + RQLiteUp bool `json:"rqlite_up"` + RQLiteState string `json:"rqlite_state,omitempty"` + OlricUp bool `json:"olric_up"` + GatewayUp bool `json:"gateway_up"` + GatewayStatus int `json:"gateway_status,omitempty"` + } + + var entries []nsEntry + for _, cs := range snap.Nodes { + if cs.Error != nil || cs.Report == nil { + continue + } + for _, ns := range cs.Report.Namespaces { + entries = append(entries, nsEntry{ + Namespace: ns.Name, + Host: cs.Node.Host, + RQLiteUp: ns.RQLiteUp, + RQLiteState: ns.RQLiteState, + OlricUp: ns.OlricUp, + GatewayUp: ns.GatewayUp, + GatewayStatus: ns.GatewayStatus, + }) + } + } + + return writeJSON(w, entries) +} diff --git a/pkg/cli/monitor/display/node.go b/pkg/cli/monitor/display/node.go new file mode 100644 index 0000000..ade3386 --- /dev/null +++ b/pkg/cli/monitor/display/node.go @@ -0,0 +1,167 @@ +package display + +import ( + "fmt" + "io" + + "github.com/DeBrosOfficial/network/pkg/cli/monitor" +) + +// NodeTable prints detailed per-node information to w. 
+func NodeTable(snap *monitor.ClusterSnapshot, w io.Writer) error { + for i, cs := range snap.Nodes { + if i > 0 { + fmt.Fprintln(w) + } + + host := cs.Node.Host + role := cs.Node.Role + + if cs.Error != nil { + fmt.Fprintf(w, "%s (%s)\n", styleRed.Render("Node: "+host), role) + fmt.Fprintf(w, " %s\n", styleRed.Render(fmt.Sprintf("UNREACHABLE: %v", cs.Error))) + continue + } + + r := cs.Report + if r == nil { + fmt.Fprintf(w, "%s (%s)\n", styleRed.Render("Node: "+host), role) + fmt.Fprintf(w, " %s\n", styleRed.Render("No report available")) + continue + } + + fmt.Fprintf(w, "%s\n", styleBold.Render(fmt.Sprintf("Node: %s (%s)", host, role))) + + // System + if r.System != nil { + sys := r.System + fmt.Fprintf(w, " System: CPU %d | Load %.2f | Mem %d%% (%d/%d MB) | Disk %d%%\n", + sys.CPUCount, sys.LoadAvg1, sys.MemUsePct, sys.MemUsedMB, sys.MemTotalMB, sys.DiskUsePct) + } else { + fmt.Fprintln(w, " System: "+styleMuted.Render("no data")) + } + + // RQLite + if r.RQLite != nil { + rq := r.RQLite + readyStr := styleRed.Render("Not Ready") + if rq.Ready { + readyStr = styleGreen.Render("Ready") + } + if rq.Responsive { + fmt.Fprintf(w, " RQLite: %s | Term %d | Applied %d | Peers %d | %s\n", + rq.RaftState, rq.Term, rq.Applied, rq.NumPeers, readyStr) + } else { + fmt.Fprintf(w, " RQLite: %s\n", styleRed.Render("NOT RESPONDING")) + } + } else { + fmt.Fprintln(w, " RQLite: "+styleMuted.Render("not configured")) + } + + // WireGuard + if r.WireGuard != nil { + wg := r.WireGuard + if wg.InterfaceUp { + // Check handshakes + hsOK := true + for _, p := range wg.Peers { + if p.LatestHandshake == 0 || p.HandshakeAgeSec > 180 { + hsOK = false + break + } + } + hsStr := statusIcon(hsOK) + fmt.Fprintf(w, " WireGuard: UP | %s | %d peers | handshakes %s\n", + wg.WgIP, wg.PeerCount, hsStr) + } else { + fmt.Fprintf(w, " WireGuard: %s\n", styleRed.Render("DOWN")) + } + } else { + fmt.Fprintln(w, " WireGuard: "+styleMuted.Render("not configured")) + } + + // Olric + if r.Olric != nil { 
+ ol := r.Olric + stateStr := styleRed.Render("inactive") + if ol.ServiceActive { + stateStr = styleGreen.Render("active") + } + fmt.Fprintf(w, " Olric: %s | %d members\n", stateStr, ol.MemberCount) + } else { + fmt.Fprintln(w, " Olric: "+styleMuted.Render("not configured")) + } + + // IPFS + if r.IPFS != nil { + ipfs := r.IPFS + daemonStr := styleRed.Render("inactive") + if ipfs.DaemonActive { + daemonStr = styleGreen.Render("active") + } + clusterStr := styleRed.Render("DOWN") + if ipfs.ClusterActive { + clusterStr = styleGreen.Render("OK") + } + fmt.Fprintf(w, " IPFS: %s | %d swarm peers | cluster %s\n", + daemonStr, ipfs.SwarmPeerCount, clusterStr) + } else { + fmt.Fprintln(w, " IPFS: "+styleMuted.Render("not configured")) + } + + // Anyone + if r.Anyone != nil { + an := r.Anyone + mode := an.Mode + if mode == "" { + if an.RelayActive { + mode = "relay" + } else if an.ClientActive { + mode = "client" + } else { + mode = "inactive" + } + } + bootStr := styleRed.Render("not bootstrapped") + if an.Bootstrapped { + bootStr = styleGreen.Render("bootstrapped") + } + fmt.Fprintf(w, " Anyone: %s | %s\n", mode, bootStr) + } else { + fmt.Fprintln(w, " Anyone: "+styleMuted.Render("not configured")) + } + } + + return nil +} + +// NodeJSON writes the node details as JSON. 
+func NodeJSON(snap *monitor.ClusterSnapshot, w io.Writer) error { + type nodeDetail struct { + Host string `json:"host"` + Role string `json:"role"` + Status string `json:"status"` + Error string `json:"error,omitempty"` + Report interface{} `json:"report,omitempty"` + } + + var entries []nodeDetail + for _, cs := range snap.Nodes { + e := nodeDetail{ + Host: cs.Node.Host, + Role: cs.Node.Role, + } + if cs.Error != nil { + e.Status = "unreachable" + e.Error = cs.Error.Error() + } else if cs.Report != nil { + e.Status = "ok" + e.Report = cs.Report + } else { + e.Status = "unknown" + } + entries = append(entries, e) + } + + return writeJSON(w, entries) +} diff --git a/pkg/cli/monitor/display/report.go b/pkg/cli/monitor/display/report.go new file mode 100644 index 0000000..6a82904 --- /dev/null +++ b/pkg/cli/monitor/display/report.go @@ -0,0 +1,182 @@ +package display + +import ( + "io" + "time" + + "github.com/DeBrosOfficial/network/pkg/cli/monitor" + "github.com/DeBrosOfficial/network/pkg/cli/production/report" +) + +type fullReport struct { + Meta struct { + Environment string `json:"environment"` + CollectedAt time.Time `json:"collected_at"` + DurationSec float64 `json:"duration_seconds"` + NodeCount int `json:"node_count"` + HealthyCount int `json:"healthy_count"` + FailedCount int `json:"failed_count"` + } `json:"meta"` + Summary struct { + RQLiteLeader string `json:"rqlite_leader"` + RQLiteQuorum string `json:"rqlite_quorum"` + WGMeshStatus string `json:"wg_mesh_status"` + ServiceHealth string `json:"service_health"` + CriticalAlerts int `json:"critical_alerts"` + WarningAlerts int `json:"warning_alerts"` + } `json:"summary"` + Alerts []monitor.Alert `json:"alerts"` + Nodes []nodeEntry `json:"nodes"` +} + +type nodeEntry struct { + Host string `json:"host"` + Role string `json:"role"` + Status string `json:"status"` // "ok", "unreachable", "degraded" + Report *report.NodeReport `json:"report,omitempty"` + Error string `json:"error,omitempty"` +} + +// 
FullReport outputs the LLM-optimized JSON report to w. +func FullReport(snap *monitor.ClusterSnapshot, w io.Writer) error { + fr := fullReport{} + + // Meta + fr.Meta.Environment = snap.Environment + fr.Meta.CollectedAt = snap.CollectedAt + fr.Meta.DurationSec = snap.Duration.Seconds() + fr.Meta.NodeCount = snap.TotalCount() + fr.Meta.HealthyCount = snap.HealthyCount() + fr.Meta.FailedCount = len(snap.Failed()) + + // Summary + fr.Summary.RQLiteLeader = findRQLiteLeader(snap) + fr.Summary.RQLiteQuorum = computeQuorumStatus(snap) + fr.Summary.WGMeshStatus = computeWGMeshStatus(snap) + fr.Summary.ServiceHealth = computeServiceHealth(snap) + + crit, warn := countAlerts(snap.Alerts) + fr.Summary.CriticalAlerts = crit + fr.Summary.WarningAlerts = warn + + // Alerts + fr.Alerts = snap.Alerts + + // Build set of hosts with critical alerts for "degraded" detection + criticalHosts := map[string]bool{} + for _, a := range snap.Alerts { + if a.Severity == monitor.AlertCritical && a.Node != "" && a.Node != "cluster" { + criticalHosts[a.Node] = true + } + } + + // Nodes + for _, cs := range snap.Nodes { + ne := nodeEntry{ + Host: cs.Node.Host, + Role: cs.Node.Role, + } + if cs.Error != nil { + ne.Status = "unreachable" + ne.Error = cs.Error.Error() + } else if cs.Report != nil { + if criticalHosts[cs.Node.Host] { + ne.Status = "degraded" + } else { + ne.Status = "ok" + } + ne.Report = cs.Report + } else { + ne.Status = "unreachable" + } + fr.Nodes = append(fr.Nodes, ne) + } + + return writeJSON(w, fr) +} + +// findRQLiteLeader returns the host of the RQLite leader, or "none". +func findRQLiteLeader(snap *monitor.ClusterSnapshot) string { + for _, cs := range snap.Nodes { + if cs.Report != nil && cs.Report.RQLite != nil && cs.Report.RQLite.RaftState == "Leader" { + return cs.Node.Host + } + } + return "none" +} + +// computeQuorumStatus returns "ok", "degraded", or "lost". 
+func computeQuorumStatus(snap *monitor.ClusterSnapshot) string { + total := 0 + responsive := 0 + for _, cs := range snap.Nodes { + if cs.Report != nil && cs.Report.RQLite != nil { + total++ + if cs.Report.RQLite.Responsive { + responsive++ + } + } + } + if total == 0 { + return "unknown" + } + quorum := (total / 2) + 1 + if responsive >= quorum { + return "ok" + } + if responsive > 0 { + return "degraded" + } + return "lost" +} + +// computeWGMeshStatus returns "ok", "degraded", or "down". +func computeWGMeshStatus(snap *monitor.ClusterSnapshot) string { + totalWG := 0 + upCount := 0 + for _, cs := range snap.Nodes { + if cs.Report != nil && cs.Report.WireGuard != nil { + totalWG++ + if cs.Report.WireGuard.InterfaceUp { + upCount++ + } + } + } + if totalWG == 0 { + return "unknown" + } + if upCount == totalWG { + return "ok" + } + if upCount > 0 { + return "degraded" + } + return "down" +} + +// computeServiceHealth returns "ok", "degraded", or "critical". +func computeServiceHealth(snap *monitor.ClusterSnapshot) string { + totalSvc := 0 + failedSvc := 0 + for _, cs := range snap.Nodes { + if cs.Report == nil || cs.Report.Services == nil { + continue + } + for _, svc := range cs.Report.Services.Services { + totalSvc++ + if svc.ActiveState == "failed" { + failedSvc++ + } + } + } + if totalSvc == 0 { + return "unknown" + } + if failedSvc == 0 { + return "ok" + } + if failedSvc < totalSvc/2 { + return "degraded" + } + return "critical" +} diff --git a/pkg/cli/monitor/display/service.go b/pkg/cli/monitor/display/service.go new file mode 100644 index 0000000..f5fc2c8 --- /dev/null +++ b/pkg/cli/monitor/display/service.go @@ -0,0 +1,131 @@ +package display + +import ( + "fmt" + "io" + "sort" + "strings" + + "github.com/DeBrosOfficial/network/pkg/cli/monitor" +) + +// ServiceTable prints a cross-node service status matrix to w. 
+func ServiceTable(snap *monitor.ClusterSnapshot, w io.Writer) error { + fmt.Fprintf(w, "%s\n", styleBold.Render( + fmt.Sprintf("Service Status Matrix \u2014 %s", snap.Environment))) + fmt.Fprintln(w, strings.Repeat("\u2550", 36)) + fmt.Fprintln(w) + + // Collect all service names and build per-host maps + type hostServices struct { + host string + shortIP string + services map[string]string // name -> active_state + } + + var hosts []hostServices + serviceSet := map[string]bool{} + + for _, cs := range snap.Nodes { + if cs.Error != nil || cs.Report == nil || cs.Report.Services == nil { + continue + } + hs := hostServices{ + host: cs.Node.Host, + shortIP: shortIP(cs.Node.Host), + services: make(map[string]string), + } + for _, svc := range cs.Report.Services.Services { + hs.services[svc.Name] = svc.ActiveState + serviceSet[svc.Name] = true + } + hosts = append(hosts, hs) + } + + // Sort service names + var svcNames []string + for name := range serviceSet { + svcNames = append(svcNames, name) + } + sort.Strings(svcNames) + + if len(hosts) == 0 || len(svcNames) == 0 { + fmt.Fprintln(w, styleMuted.Render(" No service data available")) + return nil + } + + // Header: SERVICE + each host short IP + hdr := fmt.Sprintf("%-22s", styleHeader.Render("SERVICE")) + for _, h := range hosts { + hdr += fmt.Sprintf("%-12s", styleHeader.Render(h.shortIP)) + } + fmt.Fprintln(w, hdr) + fmt.Fprintln(w, separator(22+12*len(hosts))) + + // Rows + for _, name := range svcNames { + row := fmt.Sprintf("%-22s", name) + for _, h := range hosts { + state, ok := h.services[name] + if !ok { + row += fmt.Sprintf("%-12s", styleMuted.Render("--")) + } else { + row += fmt.Sprintf("%-12s", colorServiceState(state)) + } + } + fmt.Fprintln(w, row) + } + + return nil +} + +// ServiceJSON writes the service matrix as JSON. 
+func ServiceJSON(snap *monitor.ClusterSnapshot, w io.Writer) error { + type svcEntry struct { + Host string `json:"host"` + Services map[string]string `json:"services"` + } + + var entries []svcEntry + for _, cs := range snap.Nodes { + if cs.Error != nil || cs.Report == nil || cs.Report.Services == nil { + continue + } + e := svcEntry{ + Host: cs.Node.Host, + Services: make(map[string]string), + } + for _, svc := range cs.Report.Services.Services { + e.Services[svc.Name] = svc.ActiveState + } + entries = append(entries, e) + } + + return writeJSON(w, entries) +} + +// shortIP truncates an IP to the first 3 octets for compact display. +func shortIP(ip string) string { + parts := strings.Split(ip, ".") + if len(parts) == 4 { + return parts[0] + "." + parts[1] + "." + parts[2] + } + if len(ip) > 12 { + return ip[:12] + } + return ip +} + +// colorServiceState renders a service state with appropriate color. +func colorServiceState(state string) string { + switch state { + case "active": + return styleGreen.Render("ACTIVE") + case "failed": + return styleRed.Render("FAILED") + case "inactive": + return styleMuted.Render("inactive") + default: + return styleYellow.Render(state) + } +} diff --git a/pkg/cli/monitor/display/table.go b/pkg/cli/monitor/display/table.go new file mode 100644 index 0000000..796c00f --- /dev/null +++ b/pkg/cli/monitor/display/table.go @@ -0,0 +1,53 @@ +package display + +import ( + "encoding/json" + "io" + "strings" + + "github.com/DeBrosOfficial/network/pkg/cli/monitor" + "github.com/charmbracelet/lipgloss" +) + +var ( + styleGreen = lipgloss.NewStyle().Foreground(lipgloss.Color("#00ff00")) + styleRed = lipgloss.NewStyle().Foreground(lipgloss.Color("#ff0000")) + styleYellow = lipgloss.NewStyle().Foreground(lipgloss.Color("#ffff00")) + styleMuted = lipgloss.NewStyle().Foreground(lipgloss.Color("#888888")) + styleBold = lipgloss.NewStyle().Bold(true) + styleHeader = lipgloss.NewStyle().Bold(true).Foreground(lipgloss.Color("#ffffff")) +) + +// 
statusIcon returns a green "OK" or red "!!" indicator. +func statusIcon(ok bool) string { + if ok { + return styleGreen.Render("OK") + } + return styleRed.Render("!!") +} + +// severityColor returns the lipgloss style for a given alert severity. +func severityColor(s monitor.AlertSeverity) lipgloss.Style { + switch s { + case monitor.AlertCritical: + return styleRed + case monitor.AlertWarning: + return styleYellow + case monitor.AlertInfo: + return styleMuted + default: + return styleMuted + } +} + +// separator returns a dashed line of the given width. +func separator(width int) string { + return strings.Repeat("\u2500", width) +} + +// writeJSON encodes v as indented JSON to w. +func writeJSON(w io.Writer, v interface{}) error { + enc := json.NewEncoder(w) + enc.SetIndent("", " ") + return enc.Encode(v) +} diff --git a/pkg/cli/monitor/snapshot.go b/pkg/cli/monitor/snapshot.go new file mode 100644 index 0000000..9338615 --- /dev/null +++ b/pkg/cli/monitor/snapshot.go @@ -0,0 +1,75 @@ +package monitor + +import ( + "time" + + "github.com/DeBrosOfficial/network/pkg/cli/production/report" + "github.com/DeBrosOfficial/network/pkg/inspector" +) + +// CollectionStatus tracks the SSH collection result for a single node. +type CollectionStatus struct { + Node inspector.Node + Report *report.NodeReport + Error error + Duration time.Duration + Retries int +} + +// ClusterSnapshot is the aggregated state of the entire cluster at a point in time. +type ClusterSnapshot struct { + Environment string + CollectedAt time.Time + Duration time.Duration + Nodes []CollectionStatus + Alerts []Alert +} + +// Healthy returns only nodes that reported successfully. +func (cs *ClusterSnapshot) Healthy() []*report.NodeReport { + var out []*report.NodeReport + for _, n := range cs.Nodes { + if n.Report != nil { + out = append(out, n.Report) + } + } + return out +} + +// Failed returns nodes where SSH or parsing failed. 
+func (cs *ClusterSnapshot) Failed() []CollectionStatus { + var out []CollectionStatus + for _, n := range cs.Nodes { + if n.Error != nil { + out = append(out, n) + } + } + return out +} + +// ByHost returns a map of host -> NodeReport for quick lookup. +func (cs *ClusterSnapshot) ByHost() map[string]*report.NodeReport { + m := make(map[string]*report.NodeReport, len(cs.Nodes)) + for _, n := range cs.Nodes { + if n.Report != nil { + m[n.Node.Host] = n.Report + } + } + return m +} + +// HealthyCount returns the number of nodes that reported successfully. +func (cs *ClusterSnapshot) HealthyCount() int { + count := 0 + for _, n := range cs.Nodes { + if n.Report != nil { + count++ + } + } + return count +} + +// TotalCount returns the total number of nodes attempted. +func (cs *ClusterSnapshot) TotalCount() int { + return len(cs.Nodes) +} diff --git a/pkg/cli/monitor/tui/alerts.go b/pkg/cli/monitor/tui/alerts.go new file mode 100644 index 0000000..0c73b56 --- /dev/null +++ b/pkg/cli/monitor/tui/alerts.go @@ -0,0 +1,88 @@ +package tui + +import ( + "fmt" + "sort" + "strings" + + "github.com/DeBrosOfficial/network/pkg/cli/monitor" +) + +// renderAlertsTab renders all alerts sorted by severity. +func renderAlertsTab(snap *monitor.ClusterSnapshot, width int) string { + if snap == nil { + return styleMuted.Render("Collecting cluster data...") + } + + if len(snap.Alerts) == 0 { + return styleHealthy.Render(" No alerts. 
All systems nominal.") + } + + var b strings.Builder + + critCount, warnCount, infoCount := countAlertsBySeverity(snap.Alerts) + b.WriteString(styleBold.Render("Alerts")) + b.WriteString(fmt.Sprintf(" %s %s %s\n", + styleCritical.Render(fmt.Sprintf("%d critical", critCount)), + styleWarning.Render(fmt.Sprintf("%d warning", warnCount)), + styleMuted.Render(fmt.Sprintf("%d info", infoCount)), + )) + b.WriteString(separator(width)) + b.WriteString("\n\n") + + // Sort: critical first, then warning, then info + sorted := make([]monitor.Alert, len(snap.Alerts)) + copy(sorted, snap.Alerts) + sort.Slice(sorted, func(i, j int) bool { + return severityRank(sorted[i].Severity) < severityRank(sorted[j].Severity) + }) + + // Group by severity + currentSev := monitor.AlertSeverity("") + for _, a := range sorted { + if a.Severity != currentSev { + currentSev = a.Severity + label := strings.ToUpper(string(a.Severity)) + b.WriteString(severityStyle(string(a.Severity)).Render(fmt.Sprintf(" ── %s ", label))) + b.WriteString("\n") + } + + sevTag := formatSeverityTag(a.Severity) + b.WriteString(fmt.Sprintf(" %s %-12s %-18s %s\n", + sevTag, + styleMuted.Render("["+a.Subsystem+"]"), + a.Node, + a.Message, + )) + } + + return b.String() +} + +// severityRank returns a sort rank (lower = more severe). +func severityRank(s monitor.AlertSeverity) int { + switch s { + case monitor.AlertCritical: + return 0 + case monitor.AlertWarning: + return 1 + case monitor.AlertInfo: + return 2 + default: + return 3 + } +} + +// formatSeverityTag returns a styled severity label. 
+func formatSeverityTag(s monitor.AlertSeverity) string { + switch s { + case monitor.AlertCritical: + return styleCritical.Render("CRIT") + case monitor.AlertWarning: + return styleWarning.Render("WARN") + case monitor.AlertInfo: + return styleMuted.Render("INFO") + default: + return styleMuted.Render("????") + } +} diff --git a/pkg/cli/monitor/tui/dns.go b/pkg/cli/monitor/tui/dns.go new file mode 100644 index 0000000..2603688 --- /dev/null +++ b/pkg/cli/monitor/tui/dns.go @@ -0,0 +1,109 @@ +package tui + +import ( + "fmt" + "strings" + + "github.com/DeBrosOfficial/network/pkg/cli/monitor" +) + +// renderDNSTab renders DNS status for nameserver nodes. +func renderDNSTab(snap *monitor.ClusterSnapshot, width int) string { + if snap == nil { + return styleMuted.Render("Collecting cluster data...") + } + + if snap.HealthyCount() == 0 { + return styleMuted.Render("No healthy nodes to display.") + } + + var b strings.Builder + + b.WriteString(styleBold.Render("DNS / Nameserver Status")) + b.WriteString("\n") + b.WriteString(separator(width)) + b.WriteString("\n\n") + + hasDNS := false + for _, cs := range snap.Nodes { + if cs.Report == nil || cs.Report.DNS == nil { + continue + } + hasDNS = true + r := cs.Report + dns := r.DNS + host := nodeHost(r) + role := cs.Node.Role + + b.WriteString(styleBold.Render(fmt.Sprintf(" %s", host))) + if role != "" { + b.WriteString(fmt.Sprintf(" (%s)", role)) + } + b.WriteString("\n") + + // Service status + b.WriteString(fmt.Sprintf(" CoreDNS: %s", statusStr(dns.CoreDNSActive))) + if dns.CoreDNSMemMB > 0 { + b.WriteString(fmt.Sprintf(" mem=%dMB", dns.CoreDNSMemMB)) + } + if dns.CoreDNSRestarts > 0 { + b.WriteString(fmt.Sprintf(" restarts=%s", styleWarning.Render(fmt.Sprintf("%d", dns.CoreDNSRestarts)))) + } + b.WriteString("\n") + + b.WriteString(fmt.Sprintf(" Caddy: %s\n", statusStr(dns.CaddyActive))) + + // Port bindings + b.WriteString(fmt.Sprintf(" Ports: 53=%s 80=%s 443=%s\n", + statusStr(dns.Port53Bound), + 
statusStr(dns.Port80Bound), + statusStr(dns.Port443Bound), + )) + + // DNS resolution checks + b.WriteString(fmt.Sprintf(" SOA: %s\n", statusStr(dns.SOAResolves))) + b.WriteString(fmt.Sprintf(" NS: %s", statusStr(dns.NSResolves))) + if dns.NSRecordCount > 0 { + b.WriteString(fmt.Sprintf(" (%d records)", dns.NSRecordCount)) + } + b.WriteString("\n") + b.WriteString(fmt.Sprintf(" Base A: %s\n", statusStr(dns.BaseAResolves))) + b.WriteString(fmt.Sprintf(" Wildcard: %s\n", statusStr(dns.WildcardResolves))) + b.WriteString(fmt.Sprintf(" Corefile: %s\n", statusStr(dns.CorefileExists))) + + // TLS certificates + baseTLS := renderTLSDays(dns.BaseTLSDaysLeft, "base") + wildTLS := renderTLSDays(dns.WildTLSDaysLeft, "wildcard") + b.WriteString(fmt.Sprintf(" TLS: %s %s\n", baseTLS, wildTLS)) + + // Log errors + if dns.LogErrors > 0 { + b.WriteString(fmt.Sprintf(" Log errors: %s (5m)\n", + styleWarning.Render(fmt.Sprintf("%d", dns.LogErrors)))) + } + + b.WriteString("\n") + } + + if !hasDNS { + return styleMuted.Render("No nameserver nodes found (no DNS data reported).") + } + + return b.String() +} + +// renderTLSDays formats TLS certificate expiry with color coding. 
+func renderTLSDays(days int, label string) string { + if days < 0 { + return styleMuted.Render(fmt.Sprintf("%s: n/a", label)) + } + s := fmt.Sprintf("%s: %dd", label, days) + switch { + case days < 7: + return styleCritical.Render(s) + case days < 14: + return styleWarning.Render(s) + default: + return styleHealthy.Render(s) + } +} diff --git a/pkg/cli/monitor/tui/keys.go b/pkg/cli/monitor/tui/keys.go new file mode 100644 index 0000000..970554e --- /dev/null +++ b/pkg/cli/monitor/tui/keys.go @@ -0,0 +1,21 @@ +package tui + +import "github.com/charmbracelet/bubbles/key" + +type keyMap struct { + Quit key.Binding + NextTab key.Binding + PrevTab key.Binding + Refresh key.Binding + ScrollUp key.Binding + ScrollDown key.Binding +} + +var keys = keyMap{ + Quit: key.NewBinding(key.WithKeys("q", "ctrl+c"), key.WithHelp("q", "quit")), + NextTab: key.NewBinding(key.WithKeys("tab", "l"), key.WithHelp("tab", "next tab")), + PrevTab: key.NewBinding(key.WithKeys("shift+tab", "h"), key.WithHelp("shift+tab", "prev tab")), + Refresh: key.NewBinding(key.WithKeys("r"), key.WithHelp("r", "refresh")), + ScrollUp: key.NewBinding(key.WithKeys("up", "k")), + ScrollDown: key.NewBinding(key.WithKeys("down", "j")), +} diff --git a/pkg/cli/monitor/tui/model.go b/pkg/cli/monitor/tui/model.go new file mode 100644 index 0000000..f4fbe0a --- /dev/null +++ b/pkg/cli/monitor/tui/model.go @@ -0,0 +1,226 @@ +package tui + +import ( + "context" + "fmt" + "time" + + "github.com/charmbracelet/bubbles/viewport" + tea "github.com/charmbracelet/bubbletea" + + "github.com/DeBrosOfficial/network/pkg/cli/monitor" +) + +const ( + tabOverview = iota + tabNodes + tabServices + tabMesh + tabDNS + tabNamespaces + tabAlerts + tabCount +) + +var tabNames = []string{"Overview", "Nodes", "Services", "WG Mesh", "DNS", "Namespaces", "Alerts"} + +// snapshotMsg carries the result of a background collection. 
// model is the root Bubbletea model for the Orama monitor TUI.
type model struct {
	// cfg is handed to doCollect for every collection run.
	cfg monitor.CollectorConfig
	// interval is the delay passed to tickCmd between refresh ticks.
	interval time.Duration
	// activeTab is the index of the tab being shown (tabOverview … tabAlerts).
	activeTab int
	// viewport holds the rendered content of the active tab and handles scrolling.
	viewport viewport.Model
	// width and height are taken from the most recent tea.WindowSizeMsg;
	// width stays 0 until one arrives, in which case updateContent falls back to 80.
	width  int
	height int
	// snapshot is the result of the last successful collection; nil until one succeeds.
	snapshot *monitor.ClusterSnapshot
	// loading is true while a collection is in flight; it suppresses overlapping collections.
	loading bool
	// lastError is the error from the most recent collection, cleared on success.
	lastError error
	// lastUpdate is when snapshot was last replaced.
	lastUpdate time.Time
	// quitting is set when the user quits so that View renders an empty frame.
	quitting bool
}
msg.err != nil { + m.lastError = msg.err + } else { + m.snapshot = msg.snap + m.lastError = nil + m.lastUpdate = time.Now() + } + m.updateContent() + return m, nil + + case tickMsg: + if !m.loading { + m.loading = true + cmds = append(cmds, doCollect(m.cfg)) + } + cmds = append(cmds, tickCmd(m.interval)) + return m, tea.Batch(cmds...) + } + + return m, nil +} + +func (m model) View() string { + if m.quitting { + return "" + } + + // Header + var header string + if m.snapshot != nil { + ago := time.Since(m.lastUpdate).Truncate(time.Second) + header = headerStyle.Render(fmt.Sprintf( + "Orama Monitor — %s — Last: %s (%s ago)", + m.snapshot.Environment, + m.lastUpdate.Format("15:04:05"), + ago, + )) + } else if m.loading { + header = headerStyle.Render("Orama Monitor — collecting...") + } else if m.lastError != nil { + header = headerStyle.Render(fmt.Sprintf("Orama Monitor — error: %v", m.lastError)) + } else { + header = headerStyle.Render("Orama Monitor") + } + + if m.loading && m.snapshot != nil { + header += styleMuted.Render(" (refreshing...)") + } + + // Tab bar + tabs := renderTabBar(m.activeTab, m.width) + + // Footer + footer := footerStyle.Render("tab: switch | j/k: scroll | r: refresh | q: quit") + + return header + "\n" + tabs + "\n" + m.viewport.View() + "\n" + footer +} + +// updateContent renders the active tab and sets it on the viewport. 
+func (m *model) updateContent() { + w := m.width + if w == 0 { + w = 80 + } + + var content string + switch m.activeTab { + case tabOverview: + content = renderOverview(m.snapshot, w) + case tabNodes: + content = renderNodes(m.snapshot, w) + case tabServices: + content = renderServicesTab(m.snapshot, w) + case tabMesh: + content = renderWGMesh(m.snapshot, w) + case tabDNS: + content = renderDNSTab(m.snapshot, w) + case tabNamespaces: + content = renderNamespacesTab(m.snapshot, w) + case tabAlerts: + content = renderAlertsTab(m.snapshot, w) + } + + m.viewport.SetContent(content) +} + +// doCollect returns a tea.Cmd that runs monitor.CollectOnce in a goroutine. +func doCollect(cfg monitor.CollectorConfig) tea.Cmd { + return func() tea.Msg { + ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second) + defer cancel() + snap, err := monitor.CollectOnce(ctx, cfg) + return snapshotMsg{snap: snap, err: err} + } +} + +// tickCmd returns a tea.Cmd that fires a tickMsg after the given interval. +func tickCmd(d time.Duration) tea.Cmd { + return tea.Tick(d, func(t time.Time) tea.Msg { + return tickMsg(t) + }) +} + +// Run starts the TUI program with the given collector config. +func Run(cfg monitor.CollectorConfig) error { + m := newModel(cfg, 30*time.Second) + p := tea.NewProgram(m, tea.WithAltScreen()) + _, err := p.Run() + return err +} diff --git a/pkg/cli/monitor/tui/namespaces.go b/pkg/cli/monitor/tui/namespaces.go new file mode 100644 index 0000000..9f722dc --- /dev/null +++ b/pkg/cli/monitor/tui/namespaces.go @@ -0,0 +1,158 @@ +package tui + +import ( + "fmt" + "sort" + "strings" + + "github.com/DeBrosOfficial/network/pkg/cli/monitor" +) + +// renderNamespacesTab renders per-namespace health across all nodes. 
+func renderNamespacesTab(snap *monitor.ClusterSnapshot, width int) string { + if snap == nil { + return styleMuted.Render("Collecting cluster data...") + } + + reports := snap.Healthy() + if len(reports) == 0 { + return styleMuted.Render("No healthy nodes to display.") + } + + var b strings.Builder + + b.WriteString(styleBold.Render("Namespace Health")) + b.WriteString("\n") + b.WriteString(separator(width)) + b.WriteString("\n\n") + + // Collect unique namespace names + nsSet := make(map[string]bool) + for _, r := range reports { + for _, ns := range r.Namespaces { + nsSet[ns.Name] = true + } + } + + nsNames := make([]string, 0, len(nsSet)) + for name := range nsSet { + nsNames = append(nsNames, name) + } + sort.Strings(nsNames) + + if len(nsNames) == 0 { + return styleMuted.Render("No namespaces found on any node.") + } + + // Header + header := fmt.Sprintf(" %-20s", headerStyle.Render("NAMESPACE")) + for _, r := range reports { + host := nodeHost(r) + if len(host) > 15 { + host = host[:15] + } + header += fmt.Sprintf(" %-17s", headerStyle.Render(host)) + } + b.WriteString(header) + b.WriteString("\n") + + // Build lookup: host -> ns name -> NamespaceReport + type nsKey struct { + host string + name string + } + nsMap := make(map[nsKey]nsStatus) + for _, r := range reports { + host := nodeHost(r) + for _, ns := range r.Namespaces { + nsMap[nsKey{host, ns.Name}] = nsStatus{ + gateway: ns.GatewayUp, + rqlite: ns.RQLiteUp, + rqliteState: ns.RQLiteState, + rqliteReady: ns.RQLiteReady, + olric: ns.OlricUp, + } + } + } + + // Rows + for _, nsName := range nsNames { + row := fmt.Sprintf(" %-20s", nsName) + for _, r := range reports { + host := nodeHost(r) + ns, ok := nsMap[nsKey{host, nsName}] + if !ok { + row += fmt.Sprintf(" %-17s", styleMuted.Render("-")) + continue + } + row += fmt.Sprintf(" %-17s", renderNsCell(ns)) + } + b.WriteString(row) + b.WriteString("\n") + } + + // Detailed per-namespace view + b.WriteString("\n") + 
b.WriteString(styleBold.Render("Namespace Details")) + b.WriteString("\n") + b.WriteString(separator(width)) + b.WriteString("\n") + + for _, nsName := range nsNames { + b.WriteString(fmt.Sprintf("\n %s\n", styleBold.Render(nsName))) + for _, r := range reports { + host := nodeHost(r) + for _, ns := range r.Namespaces { + if ns.Name != nsName { + continue + } + b.WriteString(fmt.Sprintf(" %-18s gw=%s rqlite=%s", + host, + statusStr(ns.GatewayUp), + statusStr(ns.RQLiteUp), + )) + if ns.RQLiteState != "" { + b.WriteString(fmt.Sprintf("(%s)", ns.RQLiteState)) + } + b.WriteString(fmt.Sprintf(" olric=%s", statusStr(ns.OlricUp))) + if ns.PortBase > 0 { + b.WriteString(fmt.Sprintf(" port=%d", ns.PortBase)) + } + b.WriteString("\n") + } + } + } + + return b.String() +} + +// nsStatus holds a namespace's health indicators for one node. +type nsStatus struct { + gateway bool + rqlite bool + rqliteState string + rqliteReady bool + olric bool +} + +// renderNsCell renders a compact cell for the namespace matrix. +func renderNsCell(ns nsStatus) string { + if ns.gateway && ns.rqlite && ns.olric { + return styleHealthy.Render("OK") + } + if !ns.gateway && !ns.rqlite { + return styleCritical.Render("DOWN") + } + // Partial + parts := []string{} + if !ns.gateway { + parts = append(parts, "gw") + } + if !ns.rqlite { + parts = append(parts, "rq") + } + if !ns.olric { + parts = append(parts, "ol") + } + return styleWarning.Render("!" + strings.Join(parts, ",")) +} diff --git a/pkg/cli/monitor/tui/nodes.go b/pkg/cli/monitor/tui/nodes.go new file mode 100644 index 0000000..bccc3bd --- /dev/null +++ b/pkg/cli/monitor/tui/nodes.go @@ -0,0 +1,147 @@ +package tui + +import ( + "fmt" + "strings" + "time" + + "github.com/DeBrosOfficial/network/pkg/cli/monitor" +) + +// renderNodes renders the Nodes tab with detailed per-node information. 
+func renderNodes(snap *monitor.ClusterSnapshot, width int) string { + if snap == nil { + return styleMuted.Render("Collecting cluster data...") + } + + var b strings.Builder + + for i, cs := range snap.Nodes { + if i > 0 { + b.WriteString("\n") + } + + host := cs.Node.Host + role := cs.Node.Role + if role == "" { + role = "node" + } + + if cs.Error != nil { + b.WriteString(styleBold.Render(fmt.Sprintf("Node: %s", host))) + b.WriteString(fmt.Sprintf(" (%s)", role)) + b.WriteString("\n") + b.WriteString(separator(width)) + b.WriteString("\n") + b.WriteString(fmt.Sprintf(" Status: %s\n", styleCritical.Render("UNREACHABLE"))) + b.WriteString(fmt.Sprintf(" Error: %s\n", styleCritical.Render(cs.Error.Error()))) + b.WriteString(fmt.Sprintf(" Took: %s\n", styleMuted.Render(cs.Duration.Truncate(time.Millisecond).String()))) + if cs.Retries > 0 { + b.WriteString(fmt.Sprintf(" Retries: %d\n", cs.Retries)) + } + continue + } + + r := cs.Report + if r == nil { + continue + } + + b.WriteString(styleBold.Render(fmt.Sprintf("Node: %s", host))) + b.WriteString(fmt.Sprintf(" (%s) ", role)) + b.WriteString(styleHealthy.Render("ONLINE")) + if r.Version != "" { + b.WriteString(fmt.Sprintf(" v%s", r.Version)) + } + b.WriteString("\n") + b.WriteString(separator(width)) + b.WriteString("\n") + + // System Resources + if r.System != nil { + sys := r.System + b.WriteString(styleBold.Render(" System")) + b.WriteString("\n") + b.WriteString(fmt.Sprintf(" CPU: %d cores, load %.1f / %.1f / %.1f\n", + sys.CPUCount, sys.LoadAvg1, sys.LoadAvg5, sys.LoadAvg15)) + b.WriteString(fmt.Sprintf(" Memory: %s (%d / %d MB, %d MB avail)\n", + colorPct(sys.MemUsePct), sys.MemUsedMB, sys.MemTotalMB, sys.MemAvailMB)) + b.WriteString(fmt.Sprintf(" Disk: %s (%s / %s, %s avail)\n", + colorPct(sys.DiskUsePct), sys.DiskUsedGB, sys.DiskTotalGB, sys.DiskAvailGB)) + if sys.SwapTotalMB > 0 { + b.WriteString(fmt.Sprintf(" Swap: %d / %d MB\n", sys.SwapUsedMB, sys.SwapTotalMB)) + } + b.WriteString(fmt.Sprintf(" Uptime: 
%s\n", sys.UptimeSince)) + if sys.OOMKills > 0 { + b.WriteString(fmt.Sprintf(" OOM: %s\n", styleCritical.Render(fmt.Sprintf("%d kills", sys.OOMKills)))) + } + } + + // Services + if r.Services != nil && len(r.Services.Services) > 0 { + b.WriteString(styleBold.Render(" Services")) + b.WriteString("\n") + for _, svc := range r.Services.Services { + stateStr := styleHealthy.Render(svc.ActiveState) + if svc.ActiveState == "failed" { + stateStr = styleCritical.Render("FAILED") + } else if svc.ActiveState != "active" { + stateStr = styleWarning.Render(svc.ActiveState) + } + extra := "" + if svc.MemoryCurrentMB > 0 { + extra += fmt.Sprintf(" mem=%dMB", svc.MemoryCurrentMB) + } + if svc.NRestarts > 0 { + extra += fmt.Sprintf(" restarts=%d", svc.NRestarts) + } + if svc.RestartLoopRisk { + extra += styleCritical.Render(" RESTART-LOOP") + } + b.WriteString(fmt.Sprintf(" %-28s %s%s\n", svc.Name, stateStr, extra)) + } + if len(r.Services.FailedUnits) > 0 { + b.WriteString(fmt.Sprintf(" Failed units: %s\n", + styleCritical.Render(strings.Join(r.Services.FailedUnits, ", ")))) + } + } + + // RQLite + if r.RQLite != nil { + rq := r.RQLite + b.WriteString(styleBold.Render(" RQLite")) + b.WriteString("\n") + b.WriteString(fmt.Sprintf(" Responsive: %s Ready: %s Strong Read: %s\n", + statusStr(rq.Responsive), statusStr(rq.Ready), statusStr(rq.StrongRead))) + if rq.Responsive { + b.WriteString(fmt.Sprintf(" Raft: %s Leader: %s Term: %d Applied: %d\n", + styleBold.Render(rq.RaftState), rq.LeaderAddr, rq.Term, rq.Applied)) + if rq.DBSize != "" { + b.WriteString(fmt.Sprintf(" DB size: %s Peers: %d Goroutines: %d Heap: %dMB\n", + rq.DBSize, rq.NumPeers, rq.Goroutines, rq.HeapMB)) + } + } + } + + // WireGuard + if r.WireGuard != nil { + wg := r.WireGuard + b.WriteString(styleBold.Render(" WireGuard")) + b.WriteString("\n") + b.WriteString(fmt.Sprintf(" Interface: %s IP: %s Peers: %d\n", + statusStr(wg.InterfaceUp), wg.WgIP, wg.PeerCount)) + } + + // Network + if r.Network != nil { + net := 
r.Network + b.WriteString(styleBold.Render(" Network")) + b.WriteString("\n") + b.WriteString(fmt.Sprintf(" Internet: %s UFW: %s TCP est: %d retrans: %.1f%%\n", + statusStr(net.InternetReachable), statusStr(net.UFWActive), + net.TCPEstablished, net.TCPRetransRate)) + } + } + + return b.String() +} diff --git a/pkg/cli/monitor/tui/overview.go b/pkg/cli/monitor/tui/overview.go new file mode 100644 index 0000000..cddce5a --- /dev/null +++ b/pkg/cli/monitor/tui/overview.go @@ -0,0 +1,183 @@ +package tui + +import ( + "fmt" + "strings" + + "github.com/DeBrosOfficial/network/pkg/cli/monitor" +) + +// renderOverview renders the Overview tab: cluster summary, node table, alert summary. +func renderOverview(snap *monitor.ClusterSnapshot, width int) string { + if snap == nil { + return styleMuted.Render("Collecting cluster data...") + } + + var b strings.Builder + + // -- Cluster Summary -- + b.WriteString(styleBold.Render("Cluster Summary")) + b.WriteString("\n") + b.WriteString(separator(width)) + b.WriteString("\n") + + healthy := snap.HealthyCount() + total := snap.TotalCount() + failed := total - healthy + + healthColor := styleHealthy + if failed > 0 { + healthColor = styleWarning + } + if healthy == 0 && total > 0 { + healthColor = styleCritical + } + + b.WriteString(fmt.Sprintf(" Environment: %s\n", styleBold.Render(snap.Environment))) + b.WriteString(fmt.Sprintf(" Nodes: %s / %d\n", healthColor.Render(fmt.Sprintf("%d healthy", healthy)), total)) + if failed > 0 { + b.WriteString(fmt.Sprintf(" Failed: %s\n", styleCritical.Render(fmt.Sprintf("%d", failed)))) + } + b.WriteString(fmt.Sprintf(" Collect time: %s\n", styleMuted.Render(snap.Duration.Truncate(1e6).String()))) + b.WriteString("\n") + + // -- Node Table -- + b.WriteString(styleBold.Render("Nodes")) + b.WriteString("\n") + b.WriteString(separator(width)) + b.WriteString("\n") + + // Header row + b.WriteString(fmt.Sprintf(" %-18s %-8s %-10s %-8s %-8s %-8s %-10s\n", + headerStyle.Render("HOST"), + 
headerStyle.Render("STATUS"), + headerStyle.Render("ROLE"), + headerStyle.Render("CPU"), + headerStyle.Render("MEM%"), + headerStyle.Render("DISK%"), + headerStyle.Render("RQLITE"), + )) + + for _, cs := range snap.Nodes { + if cs.Error != nil { + b.WriteString(fmt.Sprintf(" %-18s %s %s\n", + cs.Node.Host, + styleCritical.Render("FAIL"), + styleMuted.Render(truncateStr(cs.Error.Error(), 40)), + )) + continue + } + r := cs.Report + if r == nil { + continue + } + + host := r.PublicIP + if host == "" { + host = r.Hostname + } + + var status string + if cs.Error == nil && r != nil { + status = styleHealthy.Render("OK") + } else { + status = styleCritical.Render("FAIL") + } + + role := cs.Node.Role + if role == "" { + role = "node" + } + + cpuStr := "-" + memStr := "-" + diskStr := "-" + if r.System != nil { + cpuStr = fmt.Sprintf("%.1f", r.System.LoadAvg1) + memStr = colorPct(r.System.MemUsePct) + diskStr = colorPct(r.System.DiskUsePct) + } + + rqliteStr := "-" + if r.RQLite != nil { + if r.RQLite.Responsive { + rqliteStr = styleHealthy.Render(r.RQLite.RaftState) + } else { + rqliteStr = styleCritical.Render("DOWN") + } + } + + b.WriteString(fmt.Sprintf(" %-18s %-8s %-10s %-8s %-8s %-8s %-10s\n", + host, status, role, cpuStr, memStr, diskStr, rqliteStr)) + } + b.WriteString("\n") + + // -- Alert Summary -- + critCount, warnCount, infoCount := countAlertsBySeverity(snap.Alerts) + b.WriteString(styleBold.Render("Alerts")) + b.WriteString(fmt.Sprintf(" %s %s %s\n", + styleCritical.Render(fmt.Sprintf("%d critical", critCount)), + styleWarning.Render(fmt.Sprintf("%d warning", warnCount)), + styleMuted.Render(fmt.Sprintf("%d info", infoCount)), + )) + + if critCount > 0 { + b.WriteString("\n") + for _, a := range snap.Alerts { + if a.Severity == monitor.AlertCritical { + b.WriteString(fmt.Sprintf(" %s [%s] %s: %s\n", + styleCritical.Render("CRIT"), + a.Subsystem, + a.Node, + a.Message, + )) + } + } + } + + return b.String() +} + +// colorPct returns a percentage string 
// truncateStr shortens s to at most maxLen bytes and appends "..." when
// anything was cut off; strings that already fit are returned unchanged.
//
// The cut is moved back to the nearest rune boundary so a multi-byte UTF-8
// character is never split in half, and a negative maxLen is treated as zero
// rather than panicking in the slice expression.
func truncateStr(s string, maxLen int) string {
	if maxLen < 0 {
		maxLen = 0
	}
	if len(s) <= maxLen {
		return s
	}

	// Ranging over a string yields the byte offset at which each rune starts;
	// keep the largest offset that does not exceed maxLen.
	cut := 0
	for i := range s {
		if i > maxLen {
			break
		}
		cut = i
	}
	return s[:cut] + "..."
}
+func renderServicesTab(snap *monitor.ClusterSnapshot, width int) string { + if snap == nil { + return styleMuted.Render("Collecting cluster data...") + } + + reports := snap.Healthy() + if len(reports) == 0 { + return styleMuted.Render("No healthy nodes to display.") + } + + var b strings.Builder + + // Collect all unique service names across nodes + svcSet := make(map[string]bool) + for _, r := range reports { + if r.Services == nil { + continue + } + for _, svc := range r.Services.Services { + svcSet[svc.Name] = true + } + } + + svcNames := make([]string, 0, len(svcSet)) + for name := range svcSet { + svcNames = append(svcNames, name) + } + sort.Strings(svcNames) + + if len(svcNames) == 0 { + return styleMuted.Render("No services found on any node.") + } + + b.WriteString(styleBold.Render("Service Matrix")) + b.WriteString("\n") + b.WriteString(separator(width)) + b.WriteString("\n\n") + + // Header: service name + each node host + header := fmt.Sprintf(" %-28s", headerStyle.Render("SERVICE")) + for _, r := range reports { + host := nodeHost(r) + if len(host) > 15 { + host = host[:15] + } + header += fmt.Sprintf(" %-17s", headerStyle.Render(host)) + } + b.WriteString(header) + b.WriteString("\n") + + // Build a lookup: host -> service name -> ServiceInfo + type svcKey struct { + host string + name string + } + svcMap := make(map[svcKey]string) // status string + for _, r := range reports { + host := nodeHost(r) + if r.Services == nil { + continue + } + for _, svc := range r.Services.Services { + var st string + switch { + case svc.ActiveState == "active": + st = styleHealthy.Render("active") + case svc.ActiveState == "failed": + st = styleCritical.Render("FAILED") + case svc.ActiveState == "": + st = styleMuted.Render("n/a") + default: + st = styleWarning.Render(svc.ActiveState) + } + if svc.RestartLoopRisk { + st = styleCritical.Render("LOOP!") + } + svcMap[svcKey{host, svc.Name}] = st + } + } + + // Rows + for _, svcName := range svcNames { + row := 
fmt.Sprintf(" %-28s", svcName) + for _, r := range reports { + host := nodeHost(r) + st, ok := svcMap[svcKey{host, svcName}] + if !ok { + st = styleMuted.Render("-") + } + row += fmt.Sprintf(" %-17s", st) + } + b.WriteString(row) + b.WriteString("\n") + } + + // Failed units per node + hasFailedUnits := false + for _, r := range reports { + if r.Services != nil && len(r.Services.FailedUnits) > 0 { + hasFailedUnits = true + break + } + } + if hasFailedUnits { + b.WriteString("\n") + b.WriteString(styleBold.Render("Failed Systemd Units")) + b.WriteString("\n") + b.WriteString(separator(width)) + b.WriteString("\n") + for _, r := range reports { + if r.Services == nil || len(r.Services.FailedUnits) == 0 { + continue + } + b.WriteString(fmt.Sprintf(" %s: %s\n", + styleBold.Render(nodeHost(r)), + styleCritical.Render(strings.Join(r.Services.FailedUnits, ", ")), + )) + } + } + + return b.String() +} diff --git a/pkg/cli/monitor/tui/styles.go b/pkg/cli/monitor/tui/styles.go new file mode 100644 index 0000000..83479c3 --- /dev/null +++ b/pkg/cli/monitor/tui/styles.go @@ -0,0 +1,58 @@ +package tui + +import ( + "github.com/charmbracelet/lipgloss" + + "github.com/DeBrosOfficial/network/pkg/cli/production/report" +) + +var ( + colorGreen = lipgloss.Color("#00ff00") + colorRed = lipgloss.Color("#ff0000") + colorYellow = lipgloss.Color("#ffff00") + colorMuted = lipgloss.Color("#888888") + colorWhite = lipgloss.Color("#ffffff") + colorBg = lipgloss.Color("#1a1a2e") + + styleHealthy = lipgloss.NewStyle().Foreground(colorGreen) + styleWarning = lipgloss.NewStyle().Foreground(colorYellow) + styleCritical = lipgloss.NewStyle().Foreground(colorRed) + styleMuted = lipgloss.NewStyle().Foreground(colorMuted) + styleBold = lipgloss.NewStyle().Bold(true) + + activeTab = lipgloss.NewStyle().Bold(true).Foreground(colorWhite).Background(lipgloss.Color("#333333")).Padding(0, 1) + inactiveTab = lipgloss.NewStyle().Foreground(colorMuted).Padding(0, 1) + + headerStyle = 
lipgloss.NewStyle().Bold(true).Foreground(colorWhite) + footerStyle = lipgloss.NewStyle().Foreground(colorMuted) +) + +// statusStr returns a green "OK" when ok is true, red "DOWN" when false. +func statusStr(ok bool) string { + if ok { + return styleHealthy.Render("OK") + } + return styleCritical.Render("DOWN") +} + +// severityStyle returns the appropriate lipgloss style for an alert severity. +func severityStyle(s string) lipgloss.Style { + switch s { + case "critical": + return styleCritical + case "warning": + return styleWarning + case "info": + return styleMuted + default: + return styleMuted + } +} + +// nodeHost returns the best display host for a NodeReport. +func nodeHost(r *report.NodeReport) string { + if r.PublicIP != "" { + return r.PublicIP + } + return r.Hostname +} diff --git a/pkg/cli/monitor/tui/tabs.go b/pkg/cli/monitor/tui/tabs.go new file mode 100644 index 0000000..0e1557f --- /dev/null +++ b/pkg/cli/monitor/tui/tabs.go @@ -0,0 +1,47 @@ +package tui + +import "strings" + +// renderTabBar renders the tab bar with the active tab highlighted. +func renderTabBar(active int, width int) string { + var parts []string + for i, name := range tabNames { + if i == active { + parts = append(parts, activeTab.Render(name)) + } else { + parts = append(parts, inactiveTab.Render(name)) + } + } + + bar := strings.Join(parts, styleMuted.Render(" | ")) + + // Pad to full width if needed + if width > 0 { + rendered := stripAnsi(bar) + if len(rendered) < width { + bar += strings.Repeat(" ", width-len(rendered)) + } + } + + return bar +} + +// stripAnsi removes ANSI escape codes for length calculation. 
// stripAnsi removes ANSI escape sequences from s so the visible length of
// styled text can be measured.
//
// Three forms are recognized after an ESC byte:
//   - CSI ("ESC [ ..."): ends at the first byte in 0x40–0x7E, so finals such
//     as '~' or '@' terminate the sequence just like letters do.
//   - OSC ("ESC ] ..."): ends at BEL or at the string terminator "ESC \".
//   - anything else: optional intermediate bytes (0x20–0x2F) and one final byte.
//
// A sequence left unterminated at the end of s is dropped.
func stripAnsi(s string) string {
	var out strings.Builder
	out.Grow(len(s))

	for i := 0; i < len(s); i++ {
		if s[i] != '\x1b' {
			out.WriteByte(s[i])
			continue
		}

		// Each case leaves i on the last byte of the sequence (or at len(s));
		// the loop's own i++ then steps past it.
		i++
		switch {
		case i >= len(s):
			// Lone ESC at the end of the string.
		case s[i] == '[':
			for i++; i < len(s) && (s[i] < 0x40 || s[i] > 0x7e); i++ {
			}
		case s[i] == ']':
			for i++; i < len(s); i++ {
				if s[i] == '\a' {
					break
				}
				if s[i] == '\x1b' && i+1 < len(s) && s[i+1] == '\\' {
					i++
					break
				}
			}
		default:
			for i < len(s) && s[i] >= 0x20 && s[i] <= 0x2f {
				i++
			}
		}
	}
	return out.String()
}
b.WriteString(separator(width)) + b.WriteString("\n") + + for _, r := range reports { + if r.WireGuard == nil || len(r.WireGuard.Peers) == 0 { + continue + } + + b.WriteString("\n") + host := nodeHost(r) + peerCountStr := fmt.Sprintf("%d/%d peers", len(r.WireGuard.Peers), expectedPeers) + if len(r.WireGuard.Peers) < expectedPeers { + peerCountStr = styleCritical.Render(peerCountStr) + } else { + peerCountStr = styleHealthy.Render(peerCountStr) + } + b.WriteString(fmt.Sprintf(" %s %s\n", styleBold.Render(host), peerCountStr)) + + for _, p := range r.WireGuard.Peers { + b.WriteString(renderPeerLine(p)) + } + } + + return b.String() +} + +// renderPeerLine formats a single WG peer. +func renderPeerLine(p report.WGPeerInfo) string { + keyShort := p.PublicKey + if len(keyShort) > 12 { + keyShort = keyShort[:12] + "..." + } + + // Handshake status + var hsStr string + if p.LatestHandshake == 0 { + hsStr = styleCritical.Render("never") + } else if p.HandshakeAgeSec > 180 { + hsStr = styleWarning.Render(fmt.Sprintf("%ds ago", p.HandshakeAgeSec)) + } else { + hsStr = styleHealthy.Render(fmt.Sprintf("%ds ago", p.HandshakeAgeSec)) + } + + // Transfer + rx := formatBytes(p.TransferRx) + tx := formatBytes(p.TransferTx) + + return fmt.Sprintf(" key=%s endpoint=%-22s hs=%s rx=%s tx=%s ips=%s\n", + styleMuted.Render(keyShort), + p.Endpoint, + hsStr, + rx, tx, + p.AllowedIPs, + ) +} + +// formatBytes formats bytes into a human-readable string. 
// formatBytes formats a byte count as a short human-readable string using
// binary multiples (1KB = 1024B): whole bytes below 1KB, otherwise one decimal
// place in KB, MB, GB or TB.
//
// The TB tier keeps large transfer counters readable; without it a value of
// 2 TiB is printed as "2048.0GB". Values below 1024, including negative
// ones, are printed as plain bytes.
func formatBytes(b int64) string {
	const (
		kib = 1 << 10
		mib = 1 << 20
		gib = 1 << 30
		tib = 1 << 40
	)

	switch {
	case b >= tib:
		return fmt.Sprintf("%.1fTB", float64(b)/tib)
	case b >= gib:
		return fmt.Sprintf("%.1fGB", float64(b)/gib)
	case b >= mib:
		return fmt.Sprintf("%.1fMB", float64(b)/mib)
	case b >= kib:
		return fmt.Sprintf("%.1fKB", float64(b)/kib)
	default:
		return fmt.Sprintf("%dB", b)
	}
}
Bootstrapped / BootstrapPct: parse last "Bootstrapped" line from notices.log + { + ctx, cancel := context.WithTimeout(context.Background(), 4*time.Second) + defer cancel() + if out, err := runCmd(ctx, "bash", "-c", + `grep "Bootstrapped" /var/log/anon/notices.log 2>/dev/null | tail -1`); err == nil { + out = strings.TrimSpace(out) + if out != "" { + // Parse percentage from lines like: + // "... Bootstrapped 100% (done): Done" + // "... Bootstrapped 85%: Loading relay descriptors" + re := regexp.MustCompile(`Bootstrapped\s+(\d+)%`) + if m := re.FindStringSubmatch(out); len(m) >= 2 { + if pct, err := strconv.Atoi(m[1]); err == nil { + r.BootstrapPct = pct + r.Bootstrapped = pct == 100 + } + } + } + } + } + + // 6. Fingerprint: read /var/lib/anon/fingerprint + if data, err := os.ReadFile("/var/lib/anon/fingerprint"); err == nil { + line := strings.TrimSpace(string(data)) + // The file may contain "nickname fingerprint" — extract just the fingerprint. + fields := strings.Fields(line) + if len(fields) >= 2 { + r.Fingerprint = fields[1] + } else if len(fields) == 1 { + r.Fingerprint = fields[0] + } + } + + // 7. Nickname: extract from anonrc config + { + ctx, cancel := context.WithTimeout(context.Background(), 4*time.Second) + defer cancel() + if out, err := runCmd(ctx, "bash", "-c", + `grep "^Nickname" /etc/anon/anonrc 2>/dev/null | awk '{print $2}'`); err == nil { + r.Nickname = strings.TrimSpace(out) + } + } + + return r +} diff --git a/pkg/cli/production/report/dns.go b/pkg/cli/production/report/dns.go new file mode 100644 index 0000000..cb463e6 --- /dev/null +++ b/pkg/cli/production/report/dns.go @@ -0,0 +1,254 @@ +package report + +import ( + "context" + "math" + "os" + "regexp" + "strconv" + "strings" + "time" +) + +// collectDNS gathers CoreDNS, Caddy, and DNS resolution health information. +// Only called when /etc/coredns exists. +func collectDNS() *DNSReport { + r := &DNSReport{} + + // Set TLS days to -1 by default (failure state). 
+ r.BaseTLSDaysLeft = -1 + r.WildTLSDaysLeft = -1 + + // 1. CoreDNSActive: systemctl is-active coredns + { + ctx, cancel := context.WithTimeout(context.Background(), 4*time.Second) + defer cancel() + if out, err := runCmd(ctx, "systemctl", "is-active", "coredns"); err == nil { + r.CoreDNSActive = strings.TrimSpace(out) == "active" + } + } + + // 2. CaddyActive: systemctl is-active caddy + { + ctx, cancel := context.WithTimeout(context.Background(), 4*time.Second) + defer cancel() + if out, err := runCmd(ctx, "systemctl", "is-active", "caddy"); err == nil { + r.CaddyActive = strings.TrimSpace(out) == "active" + } + } + + // 3. Port53Bound: check :53 in ss -ulnp + { + ctx, cancel := context.WithTimeout(context.Background(), 4*time.Second) + defer cancel() + if out, err := runCmd(ctx, "ss", "-ulnp"); err == nil { + r.Port53Bound = strings.Contains(out, ":53 ") || strings.Contains(out, ":53\t") + } + } + + // 4. Port80Bound and Port443Bound: check in ss -tlnp + { + ctx, cancel := context.WithTimeout(context.Background(), 4*time.Second) + defer cancel() + if out, err := runCmd(ctx, "ss", "-tlnp"); err == nil { + r.Port80Bound = strings.Contains(out, ":80 ") || strings.Contains(out, ":80\t") + r.Port443Bound = strings.Contains(out, ":443 ") || strings.Contains(out, ":443\t") + } + } + + // 5. CoreDNSMemMB: ps -C coredns -o rss= + { + ctx, cancel := context.WithTimeout(context.Background(), 4*time.Second) + defer cancel() + if out, err := runCmd(ctx, "ps", "-C", "coredns", "-o", "rss=", "--no-headers"); err == nil { + line := strings.TrimSpace(out) + if line != "" { + first := strings.Fields(line)[0] + if kb, err := strconv.Atoi(first); err == nil { + r.CoreDNSMemMB = kb / 1024 + } + } + } + } + + // 6. 
CoreDNSRestarts: systemctl show coredns --property=NRestarts + { + ctx, cancel := context.WithTimeout(context.Background(), 4*time.Second) + defer cancel() + if out, err := runCmd(ctx, "systemctl", "show", "coredns", "--property=NRestarts"); err == nil { + props := parseProperties(out) + r.CoreDNSRestarts = parseInt(props["NRestarts"]) + } + } + + // 7. LogErrors: grep errors from coredns journal (last 5 min) + { + ctx, cancel := context.WithTimeout(context.Background(), 4*time.Second) + defer cancel() + if out, err := runCmd(ctx, "bash", "-c", + `journalctl -u coredns --no-pager -n 100 --since "5 min ago" 2>/dev/null | grep -ciE "(error|ERR)" || echo 0`); err == nil { + if n, err := strconv.Atoi(strings.TrimSpace(out)); err == nil { + r.LogErrors = n + } + } + } + + // 8. CorefileExists: check /etc/coredns/Corefile + if _, err := os.Stat("/etc/coredns/Corefile"); err == nil { + r.CorefileExists = true + } + + // Parse domain from Corefile for DNS resolution tests. + domain := parseDomain() + if domain == "" { + return r + } + + // 9. SOAResolves: dig SOA + { + ctx, cancel := context.WithTimeout(context.Background(), 4*time.Second) + defer cancel() + if out, err := runCmd(ctx, "dig", "@127.0.0.1", "SOA", domain, "+short", "+time=2"); err == nil { + r.SOAResolves = strings.TrimSpace(out) != "" + } + } + + // 10. NSResolves and NSRecordCount: dig NS + { + ctx, cancel := context.WithTimeout(context.Background(), 4*time.Second) + defer cancel() + if out, err := runCmd(ctx, "dig", "@127.0.0.1", "NS", domain, "+short", "+time=2"); err == nil { + out = strings.TrimSpace(out) + if out != "" { + r.NSResolves = true + lines := strings.Split(out, "\n") + count := 0 + for _, l := range lines { + if strings.TrimSpace(l) != "" { + count++ + } + } + r.NSRecordCount = count + } + } + } + + // 11. WildcardResolves: dig A test. 
+ { + ctx, cancel := context.WithTimeout(context.Background(), 4*time.Second) + defer cancel() + if out, err := runCmd(ctx, "dig", "@127.0.0.1", "A", "test."+domain, "+short", "+time=2"); err == nil { + r.WildcardResolves = strings.TrimSpace(out) != "" + } + } + + // 12. BaseAResolves: dig A + { + ctx, cancel := context.WithTimeout(context.Background(), 4*time.Second) + defer cancel() + if out, err := runCmd(ctx, "dig", "@127.0.0.1", "A", domain, "+short", "+time=2"); err == nil { + r.BaseAResolves = strings.TrimSpace(out) != "" + } + } + + // 13. BaseTLSDaysLeft: check TLS cert expiry for base domain + r.BaseTLSDaysLeft = checkTLSDaysLeft(domain, domain) + + // 14. WildTLSDaysLeft: check TLS cert expiry for wildcard + r.WildTLSDaysLeft = checkTLSDaysLeft("*."+domain, domain) + + return r +} + +// parseDomain reads /etc/coredns/Corefile and extracts the base domain. +// It looks for zone block declarations like "example.com {" or "*.example.com {" +// and returns the base domain (without wildcard prefix). +func parseDomain() string { + data, err := os.ReadFile("/etc/coredns/Corefile") + if err != nil { + return "" + } + + content := string(data) + + // Look for domain patterns in the Corefile. + // Common patterns: + // example.com { + // *.example.com { + // example.com:53 { + // We want to find a real domain, not "." (root zone). + domainRe := regexp.MustCompile(`(?m)^\s*\*?\.?([a-zA-Z0-9][-a-zA-Z0-9]*\.[a-zA-Z0-9][-a-zA-Z0-9.]*[a-zA-Z])(?::\d+)?\s*\{`) + matches := domainRe.FindStringSubmatch(content) + if len(matches) >= 2 { + return matches[1] + } + + // Fallback: look for any line that looks like a domain block declaration. + for _, line := range strings.Split(content, "\n") { + line = strings.TrimSpace(line) + if line == "" || strings.HasPrefix(line, "#") { + continue + } + + // Strip trailing "{" and port suffix. + line = strings.TrimSuffix(line, "{") + line = strings.TrimSpace(line) + + // Remove port if present. 
+ if idx := strings.LastIndex(line, ":"); idx > 0 { + if _, err := strconv.Atoi(line[idx+1:]); err == nil { + line = line[:idx] + } + } + + // Strip wildcard prefix. + line = strings.TrimPrefix(line, "*.") + + // Check if it looks like a domain (has at least one dot and no spaces). + if strings.Contains(line, ".") && !strings.Contains(line, " ") && line != "." { + return strings.TrimSpace(line) + } + } + + return "" +} + +// checkTLSDaysLeft uses openssl to check the TLS certificate expiry date +// for a given servername connecting to localhost:443. +// Returns days until expiry, or -1 on any failure. +func checkTLSDaysLeft(servername, domain string) int { + ctx, cancel := context.WithTimeout(context.Background(), 4*time.Second) + defer cancel() + + cmd := `echo | openssl s_client -servername ` + servername + ` -connect localhost:443 2>/dev/null | openssl x509 -noout -enddate 2>/dev/null` + out, err := runCmd(ctx, "bash", "-c", cmd) + if err != nil { + return -1 + } + + // Output looks like: "notAfter=Mar 15 12:00:00 2025 GMT" + out = strings.TrimSpace(out) + if !strings.HasPrefix(out, "notAfter=") { + return -1 + } + + dateStr := strings.TrimPrefix(out, "notAfter=") + dateStr = strings.TrimSpace(dateStr) + + // Parse the date. OpenSSL uses the format: "Jan 2 15:04:05 2006 GMT" + layouts := []string{ + "Jan 2 15:04:05 2006 GMT", + "Jan 2 15:04:05 2006 GMT", + "Jan 02 15:04:05 2006 GMT", + } + + for _, layout := range layouts { + t, err := time.Parse(layout, dateStr) + if err == nil { + days := int(math.Floor(time.Until(t).Hours() / 24)) + return days + } + } + + return -1 +} diff --git a/pkg/cli/production/report/gateway.go b/pkg/cli/production/report/gateway.go new file mode 100644 index 0000000..e8c3c14 --- /dev/null +++ b/pkg/cli/production/report/gateway.go @@ -0,0 +1,63 @@ +package report + +import ( + "context" + "encoding/json" + "io" + "net/http" + "time" +) + +// collectGateway checks the main gateway health endpoint and parses subsystem status. 
+func collectGateway() *GatewayReport { + r := &GatewayReport{} + + ctx, cancel := context.WithTimeout(context.Background(), 3*time.Second) + defer cancel() + + req, err := http.NewRequestWithContext(ctx, http.MethodGet, "http://localhost:6001/v1/health", nil) + if err != nil { + return r + } + + resp, err := http.DefaultClient.Do(req) + if err != nil { + r.Responsive = false + return r + } + defer resp.Body.Close() + + r.Responsive = true + r.HTTPStatus = resp.StatusCode + + body, err := io.ReadAll(resp.Body) + if err != nil { + return r + } + + // Try to parse the health response JSON. + // Expected: {"status":"ok","version":"...","subsystems":{"rqlite":{"status":"ok","latency":"2ms"},...}} + var health struct { + Status string `json:"status"` + Version string `json:"version"` + Subsystems map[string]json.RawMessage `json:"subsystems"` + } + + if err := json.Unmarshal(body, &health); err != nil { + return r + } + + r.Version = health.Version + + if len(health.Subsystems) > 0 { + r.Subsystems = make(map[string]SubsystemHealth, len(health.Subsystems)) + for name, raw := range health.Subsystems { + var sub SubsystemHealth + if err := json.Unmarshal(raw, &sub); err == nil { + r.Subsystems[name] = sub + } + } + } + + return r +} diff --git a/pkg/cli/production/report/ipfs.go b/pkg/cli/production/report/ipfs.go new file mode 100644 index 0000000..8ffed76 --- /dev/null +++ b/pkg/cli/production/report/ipfs.go @@ -0,0 +1,148 @@ +package report + +import ( + "bytes" + "context" + "encoding/json" + "io" + "net/http" + "os" + "strings" + "time" +) + +// collectIPFS gathers IPFS daemon and cluster health information. +func collectIPFS() *IPFSReport { + r := &IPFSReport{} + + // 1. DaemonActive: systemctl is-active orama-ipfs + { + ctx, cancel := context.WithTimeout(context.Background(), 4*time.Second) + defer cancel() + if out, err := runCmd(ctx, "systemctl", "is-active", "orama-ipfs"); err == nil { + r.DaemonActive = strings.TrimSpace(out) == "active" + } + } + + // 2. 
ClusterActive: systemctl is-active orama-ipfs-cluster + { + ctx, cancel := context.WithTimeout(context.Background(), 4*time.Second) + defer cancel() + if out, err := runCmd(ctx, "systemctl", "is-active", "orama-ipfs-cluster"); err == nil { + r.ClusterActive = strings.TrimSpace(out) == "active" + } + } + + // 3. SwarmPeerCount: POST /api/v0/swarm/peers + { + body, err := ipfsPost("http://localhost:4501/api/v0/swarm/peers") + if err == nil { + var resp struct { + Peers []interface{} `json:"Peers"` + } + if err := json.Unmarshal(body, &resp); err == nil { + r.SwarmPeerCount = len(resp.Peers) + } + } + } + + // 4. ClusterPeerCount: GET /peers + { + ctx, cancel := context.WithTimeout(context.Background(), 3*time.Second) + defer cancel() + if body, err := httpGet(ctx, "http://localhost:9094/peers"); err == nil { + var peers []interface{} + if err := json.Unmarshal(body, &peers); err == nil { + r.ClusterPeerCount = len(peers) + } + } + } + + // 5. RepoSizeBytes/RepoMaxBytes: POST /api/v0/repo/stat + { + body, err := ipfsPost("http://localhost:4501/api/v0/repo/stat") + if err == nil { + var resp struct { + RepoSize int64 `json:"RepoSize"` + StorageMax int64 `json:"StorageMax"` + } + if err := json.Unmarshal(body, &resp); err == nil { + r.RepoSizeBytes = resp.RepoSize + r.RepoMaxBytes = resp.StorageMax + if resp.StorageMax > 0 && resp.RepoSize > 0 { + r.RepoUsePct = int(float64(resp.RepoSize) / float64(resp.StorageMax) * 100) + } + } + } + } + + // 6. KuboVersion: POST /api/v0/version + { + body, err := ipfsPost("http://localhost:4501/api/v0/version") + if err == nil { + var resp struct { + Version string `json:"Version"` + } + if err := json.Unmarshal(body, &resp); err == nil { + r.KuboVersion = resp.Version + } + } + } + + // 7. 
ClusterVersion: GET /id + { + ctx, cancel := context.WithTimeout(context.Background(), 3*time.Second) + defer cancel() + if body, err := httpGet(ctx, "http://localhost:9094/id"); err == nil { + var resp struct { + Version string `json:"version"` + } + if err := json.Unmarshal(body, &resp); err == nil { + r.ClusterVersion = resp.Version + } + } + } + + // 8. HasSwarmKey: check file existence + if _, err := os.Stat("/opt/orama/.orama/data/ipfs/repo/swarm.key"); err == nil { + r.HasSwarmKey = true + } + + // 9. BootstrapEmpty: POST /api/v0/bootstrap/list + { + body, err := ipfsPost("http://localhost:4501/api/v0/bootstrap/list") + if err == nil { + var resp struct { + Peers []interface{} `json:"Peers"` + } + if err := json.Unmarshal(body, &resp); err == nil { + r.BootstrapEmpty = resp.Peers == nil || len(resp.Peers) == 0 + } else { + // If we got a response but Peers is missing, treat as empty. + r.BootstrapEmpty = true + } + } + } + + return r +} + +// ipfsPost sends a POST request with an empty body to an IPFS API endpoint. +// IPFS uses POST for all API calls. Uses a 3-second timeout. 
// ipfsStatusError is returned by ipfsPost when the API answers with a
// non-2xx HTTP status.
type ipfsStatusError struct {
	status string // status line as reported by net/http, e.g. "500 Internal Server Error"
}

// Error implements the error interface.
func (e *ipfsStatusError) Error() string {
	return "ipfs api: unexpected status " + e.status
}

// ipfsPost sends a POST request with an empty body to an IPFS API endpoint.
// IPFS uses POST for all API calls. Uses a 3-second timeout.
//
// It returns the response body for 2xx replies. Any other status is reported
// as an *ipfsStatusError so that callers do not decode an error payload as if
// it were the requested data.
func ipfsPost(url string) ([]byte, error) {
	ctx, cancel := context.WithTimeout(context.Background(), 3*time.Second)
	defer cancel()

	req, err := http.NewRequestWithContext(ctx, http.MethodPost, url, bytes.NewReader(nil))
	if err != nil {
		return nil, err
	}

	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()

	if resp.StatusCode < 200 || resp.StatusCode > 299 {
		// Drain the body so the connection can be reused; the payload of a
		// failed call is not needed, so a read error here is ignored.
		_, _ = io.Copy(io.Discard, resp.Body)
		return nil, &ipfsStatusError{status: resp.Status}
	}

	return io.ReadAll(resp.Body)
}
// Patterns used by parsePortBaseFromUnit, compiled once at package
// initialisation instead of on every call. Flag values may be separated from
// the flag name by whitespace or by "=".
var (
	unitHTTPAddrRe = regexp.MustCompile(`-http-addr[=\s]+\S+:(\d+)`)
	unitAddrRe     = regexp.MustCompile(`(?:-addr|-http)[=\s]+\S*:(\d+)`)
	unitPortBaseRe = regexp.MustCompile(`PORT_BASE=(\d+)`)
	unitEnvFileRe  = regexp.MustCompile(`EnvironmentFile=(.+)`)
)

// matchUnitPort returns the integer captured by re's first group in content.
// ok is false when re does not match or the capture is not a valid int.
func matchUnitPort(re *regexp.Regexp, content string) (port int, ok bool) {
	m := re.FindStringSubmatch(content)
	if len(m) < 2 {
		return 0, false
	}
	port, err := strconv.Atoi(m[1])
	if err != nil {
		return 0, false
	}
	return port, true
}

// parsePortBaseFromUnit reads a systemd unit file and extracts the port base
// from ExecStart arguments or environment variables.
//
// The first of these that yields a number wins:
//  1. "-http-addr HOST:PORT" or "-http-addr=HOST:PORT"
//  2. a port following an "-addr" or "-http" flag
//  3. "PORT_BASE=NNNN" anywhere in the unit (e.g. an Environment= directive)
//  4. "PORT_BASE=NNNN" inside the first referenced EnvironmentFile
//
// It returns 0 when the unit cannot be read or no port is found.
func parsePortBaseFromUnit(unitPath string) int {
	data, err := os.ReadFile(unitPath)
	if err != nil {
		return 0
	}
	content := string(data)

	for _, re := range []*regexp.Regexp{unitHTTPAddrRe, unitAddrRe, unitPortBaseRe} {
		if port, ok := matchUnitPort(re, content); ok {
			return port
		}
	}

	// Fall back to the referenced EnvironmentFile, if any.
	if m := unitEnvFileRe.FindStringSubmatch(content); len(m) >= 2 {
		envPath := strings.TrimSpace(m[1])
		envPath = strings.TrimPrefix(envPath, "-") // optional prefix means "ignore if missing"
		if envData, err := os.ReadFile(envPath); err == nil {
			if port, ok := matchUnitPort(unitPortBaseRe, string(envData)); ok {
				return port
			}
		}
	}

	return 0
}
RQLiteReady: GET http://localhost:/readyz + { + ctx, cancel := context.WithTimeout(context.Background(), 3*time.Second) + defer cancel() + + url := fmt.Sprintf("http://localhost:%d/readyz", ns.portBase) + req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil) + if err == nil { + if resp, err := http.DefaultClient.Do(req); err == nil { + io.Copy(io.Discard, resp.Body) + resp.Body.Close() + r.RQLiteReady = resp.StatusCode == http.StatusOK + } + } + } + + // 3. OlricUp: check if port_base+2 is listening + { + ctx, cancel := context.WithTimeout(context.Background(), 4*time.Second) + defer cancel() + if out, err := runCmd(ctx, "ss", "-tlnp"); err == nil { + r.OlricUp = portIsListening(out, ns.portBase+2) + } + } + + // 4. GatewayUp + GatewayStatus: GET http://localhost:/v1/health + { + ctx, cancel := context.WithTimeout(context.Background(), 3*time.Second) + defer cancel() + + url := fmt.Sprintf("http://localhost:%d/v1/health", ns.portBase+4) + req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil) + if err == nil { + if resp, err := http.DefaultClient.Do(req); err == nil { + io.Copy(io.Discard, resp.Body) + resp.Body.Close() + r.GatewayUp = true + r.GatewayStatus = resp.StatusCode + } + } + } + + return r +} diff --git a/pkg/cli/production/report/network.go b/pkg/cli/production/report/network.go new file mode 100644 index 0000000..e241e8f --- /dev/null +++ b/pkg/cli/production/report/network.go @@ -0,0 +1,253 @@ +package report + +import ( + "context" + "os" + "regexp" + "sort" + "strconv" + "strings" + "time" +) + +// collectNetwork gathers network connectivity, TCP stats, listening ports, +// and firewall status. +func collectNetwork() *NetworkReport { + r := &NetworkReport{} + + // 1. InternetReachable: ping 8.8.8.8 + { + ctx, cancel := context.WithTimeout(context.Background(), 4*time.Second) + defer cancel() + if _, err := runCmd(ctx, "ping", "-c", "1", "-W", "2", "8.8.8.8"); err == nil { + r.InternetReachable = true + } + } + + // 2. 
DefaultRoute: ip route show default + { + ctx, cancel := context.WithTimeout(context.Background(), 4*time.Second) + defer cancel() + if out, err := runCmd(ctx, "ip", "route", "show", "default"); err == nil { + r.DefaultRoute = strings.TrimSpace(out) != "" + } + } + + // 3. WGRouteExists: ip route show dev wg0 + { + ctx, cancel := context.WithTimeout(context.Background(), 4*time.Second) + defer cancel() + if out, err := runCmd(ctx, "ip", "route", "show", "dev", "wg0"); err == nil { + r.WGRouteExists = strings.TrimSpace(out) != "" + } + } + + // 4. TCPEstablished / TCPTimeWait: parse `ss -s` + { + ctx, cancel := context.WithTimeout(context.Background(), 4*time.Second) + defer cancel() + if out, err := runCmd(ctx, "ss", "-s"); err == nil { + for _, line := range strings.Split(out, "\n") { + lower := strings.ToLower(line) + if strings.HasPrefix(lower, "tcp:") || strings.Contains(lower, "estab") { + // Parse "estab N" and "timewait N" patterns from the line. + r.TCPEstablished = extractSSCount(line, "estab") + r.TCPTimeWait = extractSSCount(line, "timewait") + } + } + } + } + + // 5. TCPRetransRate: read /proc/net/snmp + { + if data, err := os.ReadFile("/proc/net/snmp"); err == nil { + r.TCPRetransRate = parseTCPRetransRate(string(data)) + } + } + + // 6. 
ListeningPorts: ss -tlnp (TCP) + ss -ulnp (UDP) + { + seen := make(map[string]bool) + + ctx1, cancel1 := context.WithTimeout(context.Background(), 4*time.Second) + defer cancel1() + if out, err := runCmd(ctx1, "ss", "-tlnp"); err == nil { + for _, pi := range parseSSListening(out, "tcp") { + key := strconv.Itoa(pi.Port) + "/" + pi.Proto + if !seen[key] { + seen[key] = true + r.ListeningPorts = append(r.ListeningPorts, pi) + } + } + } + + ctx2, cancel2 := context.WithTimeout(context.Background(), 4*time.Second) + defer cancel2() + if out, err := runCmd(ctx2, "ss", "-ulnp"); err == nil { + for _, pi := range parseSSListening(out, "udp") { + key := strconv.Itoa(pi.Port) + "/" + pi.Proto + if !seen[key] { + seen[key] = true + r.ListeningPorts = append(r.ListeningPorts, pi) + } + } + } + + // Sort by port number for consistent output. + sort.Slice(r.ListeningPorts, func(i, j int) bool { + if r.ListeningPorts[i].Port != r.ListeningPorts[j].Port { + return r.ListeningPorts[i].Port < r.ListeningPorts[j].Port + } + return r.ListeningPorts[i].Proto < r.ListeningPorts[j].Proto + }) + } + + // 7. UFWActive: ufw status + { + ctx, cancel := context.WithTimeout(context.Background(), 4*time.Second) + defer cancel() + if out, err := runCmd(ctx, "ufw", "status"); err == nil { + r.UFWActive = strings.Contains(out, "Status: active") + } + } + + // 8. UFWRules: ufw status numbered + if r.UFWActive { + ctx, cancel := context.WithTimeout(context.Background(), 4*time.Second) + defer cancel() + if out, err := runCmd(ctx, "ufw", "status", "numbered"); err == nil { + r.UFWRules = parseUFWRules(out) + } + } + + return r +} + +// extractSSCount finds a pattern like "estab 42" or "timewait 7" in an ss -s line. 
// extractSSCount finds a pattern like "estab 42" or "timewait 7" in an ss -s
// line and returns the number that follows keyword.
//
// keyword is matched literally: it is escaped before being placed in the
// regular expression, so characters such as "." or "(" neither act as
// wildcards nor make pattern compilation panic. It returns 0 when keyword is
// not followed by whitespace and digits, or when the number does not fit in
// an int.
func extractSSCount(line, keyword string) int {
	// QuoteMeta guarantees a valid pattern, which makes MustCompile safe here.
	re := regexp.MustCompile(regexp.QuoteMeta(keyword) + `\s+(\d+)`)
	m := re.FindStringSubmatch(line)
	if len(m) < 2 {
		return 0
	}
	n, err := strconv.Atoi(m[1])
	if err != nil {
		return 0
	}
	return n
}
+ localAddr := "" + for _, f := range fields { + if strings.Contains(f, ":") && !strings.HasPrefix(f, "users:") { + // Could be *:port, 0.0.0.0:port, [::]:port, 127.0.0.1:port, etc. + if idx := strings.LastIndex(f, ":"); idx >= 0 { + portStr := f[idx+1:] + if _, err := strconv.Atoi(portStr); err == nil { + localAddr = f + break + } + } + } + } + + if localAddr == "" { + continue + } + + idx := strings.LastIndex(localAddr, ":") + if idx < 0 { + continue + } + portStr := localAddr[idx+1:] + port, err := strconv.Atoi(portStr) + if err != nil { + continue + } + + process := "" + if m := processRe.FindStringSubmatch(line); len(m) >= 2 { + process = m[1] + } + + ports = append(ports, PortInfo{ + Port: port, + Proto: proto, + Process: process, + }) + } + return ports +} + +// parseUFWRules extracts rule lines from `ufw status numbered` output. +// Skips the header lines (Status, To, ---, blank lines). +func parseUFWRules(output string) []string { + var rules []string + for _, line := range strings.Split(output, "\n") { + line = strings.TrimSpace(line) + if line == "" { + continue + } + // Rule lines start with "[ N]" pattern. + if strings.HasPrefix(line, "[") && strings.Contains(line, "]") { + rules = append(rules, line) + } + } + return rules +} diff --git a/pkg/cli/production/report/olric.go b/pkg/cli/production/report/olric.go new file mode 100644 index 0000000..e29f330 --- /dev/null +++ b/pkg/cli/production/report/olric.go @@ -0,0 +1,150 @@ +package report + +import ( + "context" + "encoding/json" + "strconv" + "strings" + "time" +) + +// collectOlric gathers Olric distributed cache health information. +func collectOlric() *OlricReport { + r := &OlricReport{} + + // 1. ServiceActive: systemctl is-active orama-olric + { + ctx, cancel := context.WithTimeout(context.Background(), 4*time.Second) + defer cancel() + if out, err := runCmd(ctx, "systemctl", "is-active", "orama-olric"); err == nil { + r.ServiceActive = strings.TrimSpace(out) == "active" + } + } + + // 2. 
MemberlistUp: check if port 3322 is listening + { + ctx, cancel := context.WithTimeout(context.Background(), 4*time.Second) + defer cancel() + if out, err := runCmd(ctx, "ss", "-tlnp"); err == nil { + r.MemberlistUp = portIsListening(out, 3322) + } + } + + // 3. RestartCount: systemctl show NRestarts + { + ctx, cancel := context.WithTimeout(context.Background(), 4*time.Second) + defer cancel() + if out, err := runCmd(ctx, "systemctl", "show", "orama-olric", "--property=NRestarts"); err == nil { + props := parseProperties(out) + r.RestartCount = parseInt(props["NRestarts"]) + } + } + + // 4. ProcessMemMB: ps -C olric-server -o rss= + { + ctx, cancel := context.WithTimeout(context.Background(), 4*time.Second) + defer cancel() + if out, err := runCmd(ctx, "ps", "-C", "olric-server", "-o", "rss=", "--no-headers"); err == nil { + line := strings.TrimSpace(out) + if line != "" { + // May have multiple lines if multiple processes; take the first. + first := strings.Fields(line)[0] + if kb, err := strconv.Atoi(first); err == nil { + r.ProcessMemMB = kb / 1024 + } + } + } + } + + // 5. LogErrors: grep errors from journal + { + ctx, cancel := context.WithTimeout(context.Background(), 4*time.Second) + defer cancel() + if out, err := runCmd(ctx, "bash", "-c", + `journalctl -u orama-olric --no-pager -n 200 --since "1 hour ago" 2>/dev/null | grep -ciE "(error|ERR)" || echo 0`); err == nil { + if n, err := strconv.Atoi(strings.TrimSpace(out)); err == nil { + r.LogErrors = n + } + } + } + + // 6. LogSuspects: grep suspect/marking failed/dead + { + ctx, cancel := context.WithTimeout(context.Background(), 4*time.Second) + defer cancel() + if out, err := runCmd(ctx, "bash", "-c", + `journalctl -u orama-olric --no-pager -n 200 --since "1 hour ago" 2>/dev/null | grep -ciE "(suspect|marking.*(failed|dead))" || echo 0`); err == nil { + if n, err := strconv.Atoi(strings.TrimSpace(out)); err == nil { + r.LogSuspects = n + } + } + } + + // 7. 
LogFlapping: grep memberlist join/leave + { + ctx, cancel := context.WithTimeout(context.Background(), 4*time.Second) + defer cancel() + if out, err := runCmd(ctx, "bash", "-c", + `journalctl -u orama-olric --no-pager -n 200 --since "1 hour ago" 2>/dev/null | grep -ciE "(memberlist.*(join|leave))" || echo 0`); err == nil { + if n, err := strconv.Atoi(strings.TrimSpace(out)); err == nil { + r.LogFlapping = n + } + } + } + + // 8. Member info: try HTTP GET to http://localhost:3320/ + { + ctx, cancel := context.WithTimeout(context.Background(), 3*time.Second) + defer cancel() + + if body, err := httpGet(ctx, "http://localhost:3320/"); err == nil { + var info struct { + Coordinator string `json:"coordinator"` + Members []struct { + Name string `json:"name"` + } `json:"members"` + // Some Olric versions expose a flat member list or a different structure. + } + if err := json.Unmarshal(body, &info); err == nil { + r.Coordinator = info.Coordinator + r.MemberCount = len(info.Members) + for _, m := range info.Members { + r.Members = append(r.Members, m.Name) + } + } + + // Fallback: try to extract member count from a different JSON layout. + if r.MemberCount == 0 { + var raw map[string]interface{} + if err := json.Unmarshal(body, &raw); err == nil { + if members, ok := raw["members"]; ok { + if arr, ok := members.([]interface{}); ok { + r.MemberCount = len(arr) + for _, m := range arr { + if s, ok := m.(string); ok { + r.Members = append(r.Members, s) + } + } + } + } + if coord, ok := raw["coordinator"].(string); ok && r.Coordinator == "" { + r.Coordinator = coord + } + } + } + } + } + + return r +} + +// portIsListening checks if a given port number appears in ss -tlnp output. 
// portIsListening checks if a given port number appears as the port of an
// address in ss -tlnp output.
//
// A line matches when one of its whitespace-separated fields ends in ":PORT".
// Comparing against the end of a whole field, rather than searching for the
// substring anywhere, keeps port 3322 from matching ":33221" and ignores
// digits that occur in other columns.
func portIsListening(ssOutput string, port int) bool {
	suffix := ":" + strconv.Itoa(port)
	for _, line := range strings.Split(ssOutput, "\n") {
		for _, field := range strings.Fields(line) {
			if strings.HasSuffix(field, suffix) {
				return true
			}
		}
	}
	return false
}
+ { + ctx2, cancel2 := context.WithTimeout(context.Background(), 4*time.Second) + defer cancel2() + + out, err := runCmd(ctx2, "bash", "-c", + `journalctl -u orama-node --no-pager -n 500 --since "1 hour ago" 2>/dev/null | grep -ciE "(panic|fatal)" || echo 0`) + if err == nil { + if n, err := strconv.Atoi(strings.TrimSpace(out)); err == nil { + r.PanicCount = n + } + } + } + + return r +} + +// isOramaProcess checks if a command string contains any orama-related process name. +func isOramaProcess(command string) bool { + lower := strings.ToLower(command) + for _, name := range oramaProcessNames { + if strings.Contains(lower, name) { + return true + } + } + return false +} diff --git a/pkg/cli/production/report/report.go b/pkg/cli/production/report/report.go new file mode 100644 index 0000000..0b986f9 --- /dev/null +++ b/pkg/cli/production/report/report.go @@ -0,0 +1,165 @@ +package report + +import ( + "context" + "encoding/json" + "fmt" + "io" + "net/http" + "os" + "os/exec" + "strings" + "sync" + "time" +) + +// Handle is the main entry point for `orama node report`. +// It collects system, service, and component information in parallel, +// then outputs the full NodeReport as JSON to stdout. +func Handle(jsonFlag bool, version string) error { + start := time.Now() + + rpt := &NodeReport{ + Timestamp: start.UTC(), + Version: version, + } + + if h, err := os.Hostname(); err == nil { + rpt.Hostname = h + } + + var mu sync.Mutex + addError := func(msg string) { + mu.Lock() + rpt.Errors = append(rpt.Errors, msg) + mu.Unlock() + } + + // safeGo launches a collector goroutine with panic recovery. 
+ safeGo := func(wg *sync.WaitGroup, name string, fn func()) { + wg.Add(1) + go func() { + defer wg.Done() + defer func() { + if r := recover(); r != nil { + addError(fmt.Sprintf("%s collector panicked: %v", name, r)) + } + }() + fn() + }() + } + + var wg sync.WaitGroup + + safeGo(&wg, "system", func() { + rpt.System = collectSystem() + }) + + safeGo(&wg, "services", func() { + rpt.Services = collectServices() + }) + + safeGo(&wg, "rqlite", func() { + rpt.RQLite = collectRQLite() + }) + + safeGo(&wg, "olric", func() { + rpt.Olric = collectOlric() + }) + + safeGo(&wg, "ipfs", func() { + rpt.IPFS = collectIPFS() + }) + + safeGo(&wg, "gateway", func() { + rpt.Gateway = collectGateway() + }) + + safeGo(&wg, "wireguard", func() { + rpt.WireGuard = collectWireGuard() + }) + + safeGo(&wg, "dns", func() { + // Only collect DNS info if this node runs CoreDNS. + if _, err := os.Stat("/etc/coredns"); err == nil { + rpt.DNS = collectDNS() + } + }) + + safeGo(&wg, "anyone", func() { + rpt.Anyone = collectAnyone() + }) + + safeGo(&wg, "network", func() { + rpt.Network = collectNetwork() + }) + + safeGo(&wg, "processes", func() { + rpt.Processes = collectProcesses() + }) + + safeGo(&wg, "namespaces", func() { + rpt.Namespaces = collectNamespaces() + }) + + wg.Wait() + + // Populate top-level WireGuard IP from the WireGuard collector result. + if rpt.WireGuard != nil && rpt.WireGuard.WgIP != "" { + rpt.WGIP = rpt.WireGuard.WgIP + } + + rpt.CollectMS = time.Since(start).Milliseconds() + + enc := json.NewEncoder(os.Stdout) + if !jsonFlag { + enc.SetIndent("", " ") + } + return enc.Encode(rpt) +} + +// --------------------------------------------------------------------------- +// Helpers +// --------------------------------------------------------------------------- + +// runCmd executes an external command with a 4-second timeout and returns its +// combined stdout as a trimmed string. 
+func runCmd(ctx context.Context, name string, args ...string) (string, error) { + ctx, cancel := context.WithTimeout(ctx, 4*time.Second) + defer cancel() + + cmd := exec.CommandContext(ctx, name, args...) + out, err := cmd.Output() + if err != nil { + return "", fmt.Errorf("%s: %w", name, err) + } + return strings.TrimSpace(string(out)), nil +} + +// httpGet performs an HTTP GET request with a 3-second timeout and returns the +// response body bytes. +func httpGet(ctx context.Context, url string) ([]byte, error) { + ctx, cancel := context.WithTimeout(ctx, 3*time.Second) + defer cancel() + + req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil) + if err != nil { + return nil, err + } + + resp, err := http.DefaultClient.Do(req) + if err != nil { + return nil, err + } + defer resp.Body.Close() + + body, err := io.ReadAll(resp.Body) + if err != nil { + return nil, err + } + + if resp.StatusCode >= 400 { + return body, fmt.Errorf("HTTP %d from %s", resp.StatusCode, url) + } + return body, nil +} diff --git a/pkg/cli/production/report/rqlite.go b/pkg/cli/production/report/rqlite.go new file mode 100644 index 0000000..4b14118 --- /dev/null +++ b/pkg/cli/production/report/rqlite.go @@ -0,0 +1,260 @@ +package report + +import ( + "bytes" + "context" + "encoding/json" + "fmt" + "io" + "net/http" + "strconv" + "time" +) + +const rqliteBase = "http://localhost:5001" + +// collectRQLite queries the local RQLite HTTP API to build a health report. +func collectRQLite() *RQLiteReport { + r := &RQLiteReport{} + + // 1. GET /status — core Raft and node metadata. 
+ ctx, cancel := context.WithTimeout(context.Background(), 3*time.Second) + defer cancel() + + statusBody, err := httpGet(ctx, rqliteBase+"/status") + if err != nil { + r.Responsive = false + return r + } + + var status map[string]interface{} + if err := json.Unmarshal(statusBody, &status); err != nil { + r.Responsive = false + return r + } + r.Responsive = true + + // Extract fields from the nested status JSON. + r.RaftState = getNestedString(status, "store", "raft", "state") + r.LeaderAddr = getNestedString(status, "store", "leader", "addr") + r.LeaderID = getNestedString(status, "store", "leader", "node_id") + r.NodeID = getNestedString(status, "store", "node_id") + r.Term = uint64(getNestedFloat(status, "store", "raft", "current_term")) + r.Applied = uint64(getNestedFloat(status, "store", "raft", "applied_index")) + r.Commit = uint64(getNestedFloat(status, "store", "raft", "commit_index")) + r.FsmPending = uint64(getNestedFloat(status, "store", "raft", "fsm_pending")) + r.LastContact = getNestedString(status, "store", "raft", "last_contact") + r.Voter = getNestedBool(status, "store", "raft", "voter") + r.DBSize = getNestedString(status, "store", "sqlite3", "db_size_friendly") + r.Uptime = getNestedString(status, "http", "uptime") + r.Version = getNestedString(status, "build", "version") + r.Goroutines = int(getNestedFloat(status, "runtime", "num_goroutine")) + + // HeapMB: bytes → MB. + heapBytes := getNestedFloat(status, "runtime", "memory", "heap_alloc") + if heapBytes > 0 { + r.HeapMB = int(heapBytes / (1024 * 1024)) + } + + // NumPeers may be a number or a string in the JSON; handle both. + r.NumPeers = getNestedInt(status, "store", "raft", "num_peers") + + // 2. GET /nodes?nonvoters — cluster node list. 
+ { + ctx2, cancel2 := context.WithTimeout(context.Background(), 3*time.Second) + defer cancel2() + + if body, err := httpGet(ctx2, rqliteBase+"/nodes?nonvoters"); err == nil { + var rawNodes map[string]struct { + Addr string `json:"addr"` + Reachable bool `json:"reachable"` + Leader bool `json:"leader"` + Voter bool `json:"voter"` + Time float64 `json:"time"` + Error string `json:"error"` + } + if err := json.Unmarshal(body, &rawNodes); err == nil { + r.Nodes = make(map[string]RQLiteNodeInfo, len(rawNodes)) + for id, n := range rawNodes { + r.Nodes[id] = RQLiteNodeInfo{ + Reachable: n.Reachable, + Leader: n.Leader, + Voter: n.Voter, + TimeMS: n.Time * 1000, // seconds → milliseconds + Error: n.Error, + } + } + } + } + } + + // 3. GET /readyz — readiness probe. + { + ctx3, cancel3 := context.WithTimeout(context.Background(), 3*time.Second) + defer cancel3() + + req, err := http.NewRequestWithContext(ctx3, http.MethodGet, rqliteBase+"/readyz", nil) + if err == nil { + if resp, err := http.DefaultClient.Do(req); err == nil { + resp.Body.Close() + r.Ready = resp.StatusCode == http.StatusOK + } + } + } + + // 4. POST /db/query?level=strong — strong read test. + { + ctx4, cancel4 := context.WithTimeout(context.Background(), 3*time.Second) + defer cancel4() + + payload := []byte(`["SELECT 1"]`) + req, err := http.NewRequestWithContext(ctx4, http.MethodPost, rqliteBase+"/db/query?level=strong", bytes.NewReader(payload)) + if err == nil { + req.Header.Set("Content-Type", "application/json") + if resp, err := http.DefaultClient.Do(req); err == nil { + io.Copy(io.Discard, resp.Body) + resp.Body.Close() + r.StrongRead = resp.StatusCode == http.StatusOK + } + } + } + + // 5. GET /debug/vars — error counters. 
+ { + ctx5, cancel5 := context.WithTimeout(context.Background(), 3*time.Second) + defer cancel5() + + if body, err := httpGet(ctx5, rqliteBase+"/debug/vars"); err == nil { + var vars map[string]interface{} + if err := json.Unmarshal(body, &vars); err == nil { + r.DebugVars = &RQLiteDebugVarsReport{ + QueryErrors: jsonUint64(vars, "api_query_errors"), + ExecuteErrors: jsonUint64(vars, "api_execute_errors"), + RemoteExecErrors: jsonUint64(vars, "api_remote_exec_errors"), + LeaderNotFound: jsonUint64(vars, "store_leader_not_found"), + SnapshotErrors: jsonUint64(vars, "snapshot_errors"), + ClientRetries: jsonUint64(vars, "client_retries"), + ClientTimeouts: jsonUint64(vars, "client_timeouts"), + } + } + } + } + + return r +} + +// --------------------------------------------------------------------------- +// Nested-map extraction helpers +// --------------------------------------------------------------------------- + +// getNestedString traverses nested map[string]interface{} values and returns +// the final value as a string. Returns "" if any key is missing or the leaf +// is not a string. +func getNestedString(m map[string]interface{}, keys ...string) string { + v := getNestedValue(m, keys...) + if v == nil { + return "" + } + if s, ok := v.(string); ok { + return s + } + return fmt.Sprintf("%v", v) +} + +// getNestedFloat traverses nested maps and returns the leaf as a float64. +// JSON numbers are decoded as float64 by encoding/json into interface{}. +func getNestedFloat(m map[string]interface{}, keys ...string) float64 { + v := getNestedValue(m, keys...) + if v == nil { + return 0 + } + switch n := v.(type) { + case float64: + return n + case json.Number: + if f, err := n.Float64(); err == nil { + return f + } + case string: + if f, err := strconv.ParseFloat(n, 64); err == nil { + return f + } + } + return 0 +} + +// getNestedBool traverses nested maps and returns the leaf as a bool. 
func getNestedBool(m map[string]interface{}, keys ...string) bool {
	// A failed assertion (missing path, nil leaf, or a leaf that is not a
	// bool) leaves flag at its zero value, which is the "false" fallback.
	flag, _ := getNestedValue(m, keys...).(bool)
	return flag
}

// getNestedInt follows keys through nested maps and converts the leaf to an
// int. Accepted leaf types are float64 (truncated), json.Number (integer
// form only) and decimal strings; RQLite sometimes reports num_peers as a
// string. Anything else, including a missing path or a value that fails to
// parse, yields 0.
func getNestedInt(m map[string]interface{}, keys ...string) int {
	leaf := getNestedValue(m, keys...)

	if num, ok := leaf.(float64); ok {
		return int(num)
	}
	if num, ok := leaf.(json.Number); ok {
		parsed, err := num.Int64()
		if err != nil {
			return 0
		}
		return int(parsed)
	}
	if text, ok := leaf.(string); ok {
		parsed, err := strconv.Atoi(text)
		if err != nil {
			return 0
		}
		return parsed
	}
	return 0
}

// getNestedValue descends through m one key at a time and returns whatever is
// stored at the end of the path. It returns nil when no keys are given, when
// a key is absent, or when an intermediate value is not itself a
// map[string]interface{}.
func getNestedValue(m map[string]interface{}, keys ...string) interface{} {
	if len(keys) == 0 {
		return nil
	}

	var node interface{} = m
	for i := 0; i < len(keys); i++ {
		level, isMap := node.(map[string]interface{})
		if !isMap {
			return nil
		}
		next, present := level[keys[i]]
		if !present {
			return nil
		}
		node = next
	}
	return node
}

// jsonUint64 reads a top-level key from a flat map as uint64.
+func jsonUint64(m map[string]interface{}, key string) uint64 { + v, ok := m[key] + if !ok { + return 0 + } + switch n := v.(type) { + case float64: + return uint64(n) + case json.Number: + if i, err := n.Int64(); err == nil { + return uint64(i) + } + case string: + if i, err := strconv.ParseUint(n, 10, 64); err == nil { + return i + } + } + return 0 +} diff --git a/pkg/cli/production/report/services.go b/pkg/cli/production/report/services.go new file mode 100644 index 0000000..e276d18 --- /dev/null +++ b/pkg/cli/production/report/services.go @@ -0,0 +1,201 @@ +package report + +import ( + "context" + "path/filepath" + "strconv" + "strings" + "time" +) + +var coreServices = []string{ + "orama-node", + "orama-gateway", + "orama-olric", + "orama-ipfs", + "orama-ipfs-cluster", + "orama-anyone-relay", + "orama-anyone-client", + "coredns", + "caddy", + "wg-quick@wg0", +} + +func collectServices() *ServicesReport { + ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) + defer cancel() + + report := &ServicesReport{} + + // Collect core services. + for _, name := range coreServices { + info := collectServiceInfo(ctx, name) + report.Services = append(report.Services, info) + } + + // Discover namespace services (orama-deploy-*.service). + nsServices := discoverNamespaceServices() + for _, name := range nsServices { + info := collectServiceInfo(ctx, name) + report.Services = append(report.Services, info) + } + + // Collect failed units. + report.FailedUnits = collectFailedUnits(ctx) + + return report +} + +func collectServiceInfo(ctx context.Context, name string) ServiceInfo { + info := ServiceInfo{Name: name} + + // Get all properties in a single systemctl show call. 
+ out, err := runCmd(ctx, "systemctl", "show", name, + "--property=ActiveState,SubState,NRestarts,ActiveEnterTimestamp,MemoryCurrent,CPUUsageNSec,MainPID") + if err != nil { + info.ActiveState = "unknown" + info.SubState = "unknown" + return info + } + + props := parseProperties(out) + + info.ActiveState = props["ActiveState"] + info.SubState = props["SubState"] + info.NRestarts = parseInt(props["NRestarts"]) + info.MainPID = parseInt(props["MainPID"]) + info.MemoryCurrentMB = parseMemoryMB(props["MemoryCurrent"]) + info.CPUUsageNSec = parseInt64(props["CPUUsageNSec"]) + + // Calculate uptime from ActiveEnterTimestamp. + if ts := props["ActiveEnterTimestamp"]; ts != "" && ts != "n/a" { + info.ActiveSinceSec = parseActiveSince(ts) + } + + // Check if service is enabled. + enabledOut, err := runCmd(ctx, "systemctl", "is-enabled", name) + if err == nil && strings.TrimSpace(enabledOut) == "enabled" { + info.Enabled = true + } + + // Restart loop detection: restarted more than 3 times and running for less than 5 minutes. + info.RestartLoopRisk = info.NRestarts > 3 && info.ActiveSinceSec > 0 && info.ActiveSinceSec < 300 + + return info +} + +// parseProperties parses "Key=Value" lines from systemctl show output into a map. +func parseProperties(output string) map[string]string { + props := make(map[string]string) + for _, line := range strings.Split(output, "\n") { + line = strings.TrimSpace(line) + if line == "" { + continue + } + idx := strings.IndexByte(line, '=') + if idx < 0 { + continue + } + key := line[:idx] + value := line[idx+1:] + props[key] = value + } + return props +} + +// parseMemoryMB converts a MemoryCurrent value (bytes as uint64, "[not set]", or "infinity") to MB. 
+func parseMemoryMB(s string) int { + s = strings.TrimSpace(s) + if s == "" || s == "[not set]" || s == "infinity" { + return 0 + } + bytes, err := strconv.ParseUint(s, 10, 64) + if err != nil { + return 0 + } + return int(bytes / (1024 * 1024)) +} + +// parseActiveSince parses an ActiveEnterTimestamp like "Fri 2024-01-05 10:30:00 UTC" +// and returns the number of seconds elapsed since that time. +func parseActiveSince(ts string) int64 { + // systemctl outputs timestamps in the form: "Day YYYY-MM-DD HH:MM:SS TZ" + // e.g. "Fri 2024-01-05 10:30:00 UTC" + layouts := []string{ + "Mon 2006-01-02 15:04:05 MST", + "Mon 2006-01-02 15:04:05 -0700", + } + ts = strings.TrimSpace(ts) + for _, layout := range layouts { + t, err := time.Parse(layout, ts) + if err == nil { + sec := int64(time.Since(t).Seconds()) + if sec < 0 { + return 0 + } + return sec + } + } + return 0 +} + +func parseInt(s string) int { + s = strings.TrimSpace(s) + if s == "" || s == "[not set]" { + return 0 + } + v, _ := strconv.Atoi(s) + return v +} + +func parseInt64(s string) int64 { + s = strings.TrimSpace(s) + if s == "" || s == "[not set]" { + return 0 + } + v, _ := strconv.ParseInt(s, 10, 64) + return v +} + +// collectFailedUnits runs `systemctl --failed` and extracts unit names from the first column. +func collectFailedUnits(ctx context.Context) []string { + out, err := runCmd(ctx, "systemctl", "--failed", "--no-legend", "--no-pager") + if err != nil { + return nil + } + + var units []string + for _, line := range strings.Split(out, "\n") { + line = strings.TrimSpace(line) + if line == "" { + continue + } + fields := strings.Fields(line) + if len(fields) > 0 { + // First column may have a bullet prefix; strip common markers. 
+ unit := strings.TrimLeft(fields[0], "●* ") + if unit != "" { + units = append(units, unit) + } + } + } + return units +} + +// discoverNamespaceServices finds orama-deploy-*.service files in /etc/systemd/system +// and returns the service names (without the .service suffix path). +func discoverNamespaceServices() []string { + matches, err := filepath.Glob("/etc/systemd/system/orama-deploy-*.service") + if err != nil || len(matches) == 0 { + return nil + } + + var services []string + for _, path := range matches { + base := filepath.Base(path) + // Strip the .service suffix to get the unit name. + name := strings.TrimSuffix(base, ".service") + services = append(services, name) + } + return services +} diff --git a/pkg/cli/production/report/system.go b/pkg/cli/production/report/system.go new file mode 100644 index 0000000..e139f78 --- /dev/null +++ b/pkg/cli/production/report/system.go @@ -0,0 +1,200 @@ +package report + +import ( + "context" + "os" + "strconv" + "strings" + "time" +) + +// collectSystem gathers system-level metrics using local commands and /proc files. +func collectSystem() *SystemReport { + r := &SystemReport{} + + // 1. Uptime seconds: read /proc/uptime, parse first field + if data, err := os.ReadFile("/proc/uptime"); err == nil { + fields := strings.Fields(string(data)) + if len(fields) >= 1 { + if f, err := strconv.ParseFloat(fields[0], 64); err == nil { + r.UptimeSeconds = int64(f) + } + } + } + + // 2. Uptime since: run `uptime -s` + { + ctx, cancel := context.WithTimeout(context.Background(), 4*time.Second) + defer cancel() + if out, err := runCmd(ctx, "uptime", "-s"); err == nil { + r.UptimeSince = strings.TrimSpace(out) + } + } + + // 3. CPU count: run `nproc` + { + ctx, cancel := context.WithTimeout(context.Background(), 4*time.Second) + defer cancel() + if out, err := runCmd(ctx, "nproc"); err == nil { + if n, err := strconv.Atoi(strings.TrimSpace(out)); err == nil { + r.CPUCount = n + } + } + } + + // 4. 
Load averages: read /proc/loadavg, parse first 3 fields + if data, err := os.ReadFile("/proc/loadavg"); err == nil { + fields := strings.Fields(string(data)) + if len(fields) >= 3 { + if f, err := strconv.ParseFloat(fields[0], 64); err == nil { + r.LoadAvg1 = f + } + if f, err := strconv.ParseFloat(fields[1], 64); err == nil { + r.LoadAvg5 = f + } + if f, err := strconv.ParseFloat(fields[2], 64); err == nil { + r.LoadAvg15 = f + } + } + } + + // 5 & 6. Memory and swap: run `free -m`, parse Mem: and Swap: lines + { + ctx, cancel := context.WithTimeout(context.Background(), 4*time.Second) + defer cancel() + if out, err := runCmd(ctx, "free", "-m"); err == nil { + for _, line := range strings.Split(out, "\n") { + fields := strings.Fields(line) + if len(fields) >= 4 && fields[0] == "Mem:" { + // Mem: total used free shared buff/cache available + if n, err := strconv.Atoi(fields[1]); err == nil { + r.MemTotalMB = n + } + if n, err := strconv.Atoi(fields[2]); err == nil { + r.MemUsedMB = n + } + if n, err := strconv.Atoi(fields[3]); err == nil { + r.MemFreeMB = n + } + if len(fields) >= 7 { + if n, err := strconv.Atoi(fields[6]); err == nil { + r.MemAvailMB = n + } + } + if r.MemTotalMB > 0 { + r.MemUsePct = (r.MemTotalMB - r.MemAvailMB) * 100 / r.MemTotalMB + } + } + if len(fields) >= 3 && fields[0] == "Swap:" { + if n, err := strconv.Atoi(fields[1]); err == nil { + r.SwapTotalMB = n + } + if n, err := strconv.Atoi(fields[2]); err == nil { + r.SwapUsedMB = n + } + } + } + } + } + + // 7. 
Disk usage: run `df -h /` and `df -h /opt/orama`, use whichever has higher usage + { + type diskInfo struct { + total string + used string + avail string + usePct int + } + + parseDf := func(out string) *diskInfo { + lines := strings.Split(out, "\n") + if len(lines) < 2 { + return nil + } + fields := strings.Fields(lines[1]) + if len(fields) < 5 { + return nil + } + pctStr := strings.TrimSuffix(fields[4], "%") + pct, err := strconv.Atoi(pctStr) + if err != nil { + return nil + } + return &diskInfo{ + total: fields[1], + used: fields[2], + avail: fields[3], + usePct: pct, + } + } + + ctx1, cancel1 := context.WithTimeout(context.Background(), 4*time.Second) + defer cancel1() + rootDisk := (*diskInfo)(nil) + if out, err := runCmd(ctx1, "df", "-h", "/"); err == nil { + rootDisk = parseDf(out) + } + + ctx2, cancel2 := context.WithTimeout(context.Background(), 4*time.Second) + defer cancel2() + optDisk := (*diskInfo)(nil) + if out, err := runCmd(ctx2, "df", "-h", "/opt/orama"); err == nil { + optDisk = parseDf(out) + } + + best := rootDisk + if optDisk != nil && (best == nil || optDisk.usePct > best.usePct) { + best = optDisk + } + if best != nil { + r.DiskTotalGB = best.total + r.DiskUsedGB = best.used + r.DiskAvailGB = best.avail + r.DiskUsePct = best.usePct + } + } + + // 8. Inode usage: run `df -i /`, parse Use% from second line + { + ctx, cancel := context.WithTimeout(context.Background(), 4*time.Second) + defer cancel() + if out, err := runCmd(ctx, "df", "-i", "/"); err == nil { + lines := strings.Split(out, "\n") + if len(lines) >= 2 { + fields := strings.Fields(lines[1]) + if len(fields) >= 5 { + pctStr := strings.TrimSuffix(fields[4], "%") + if n, err := strconv.Atoi(pctStr); err == nil { + r.InodePct = n + } + } + } + } + } + + // 9. 
OOM kills: run `dmesg 2>/dev/null | grep -ci 'out of memory'` via bash -c + { + ctx, cancel := context.WithTimeout(context.Background(), 4*time.Second) + defer cancel() + if out, err := runCmd(ctx, "bash", "-c", "dmesg 2>/dev/null | grep -ci 'out of memory'"); err == nil { + if n, err := strconv.Atoi(strings.TrimSpace(out)); err == nil { + r.OOMKills = n + } + } + // On error, OOMKills stays 0 (zero value) + } + + // 10. Kernel version: run `uname -r` + { + ctx, cancel := context.WithTimeout(context.Background(), 4*time.Second) + defer cancel() + if out, err := runCmd(ctx, "uname", "-r"); err == nil { + r.KernelVersion = strings.TrimSpace(out) + } + } + + // 11. Current unix timestamp + r.TimeUnix = time.Now().Unix() + + return r +} diff --git a/pkg/cli/production/report/types.go b/pkg/cli/production/report/types.go new file mode 100644 index 0000000..5295fcf --- /dev/null +++ b/pkg/cli/production/report/types.go @@ -0,0 +1,275 @@ +package report + +import "time" + +// NodeReport is the top-level JSON output of `orama node report --json`. 
type NodeReport struct {
	// Timestamp is the UTC time at which Handle started collecting.
	Timestamp time.Time `json:"timestamp"`
	// Hostname comes from os.Hostname; left empty if that call fails.
	Hostname string `json:"hostname"`
	PublicIP string `json:"public_ip,omitempty"`
	// WGIP mirrors WireGuard.WgIP; Handle copies it once all collectors finish.
	WGIP string `json:"wireguard_ip,omitempty"`
	// Version is the value passed to Handle by the caller.
	Version string `json:"version"`
	// CollectMS is the wall-clock duration of the whole collection, in milliseconds.
	CollectMS int64 `json:"collect_ms"`
	// Errors holds one message per collector goroutine that panicked.
	Errors []string `json:"errors,omitempty"`

	// One section per collector. A section stays nil when its collector
	// panicked; DNS is additionally skipped when /etc/coredns does not exist.
	System     *SystemReport     `json:"system"`
	Services   *ServicesReport   `json:"services"`
	RQLite     *RQLiteReport     `json:"rqlite,omitempty"`
	Olric      *OlricReport      `json:"olric,omitempty"`
	IPFS       *IPFSReport       `json:"ipfs,omitempty"`
	Gateway    *GatewayReport    `json:"gateway,omitempty"`
	WireGuard  *WireGuardReport  `json:"wireguard,omitempty"`
	DNS        *DNSReport        `json:"dns,omitempty"`
	Anyone     *AnyoneReport     `json:"anyone,omitempty"`
	Network    *NetworkReport    `json:"network"`
	Processes  *ProcessReport    `json:"processes"`
	Namespaces []NamespaceReport `json:"namespaces,omitempty"`
}

// --- System ---

// SystemReport holds host-level metrics gathered by collectSystem. A field
// keeps its zero value when the underlying file or command could not be read
// or parsed.
type SystemReport struct {
	// UptimeSeconds is the first field of /proc/uptime, truncated to whole seconds.
	UptimeSeconds int64 `json:"uptime_seconds"`
	// UptimeSince is the raw output of `uptime -s`.
	UptimeSince string `json:"uptime_since"`
	// CPUCount is the output of `nproc`.
	CPUCount int `json:"cpu_count"`
	// LoadAvg1/5/15 are the first three fields of /proc/loadavg.
	LoadAvg1  float64 `json:"load_avg_1"`
	LoadAvg5  float64 `json:"load_avg_5"`
	LoadAvg15 float64 `json:"load_avg_15"`
	// Memory and swap figures come from `free -m`. MemAvailMB is only filled
	// when the Mem: line has an "available" column; MemUsePct is computed as
	// (MemTotalMB - MemAvailMB) * 100 / MemTotalMB.
	MemTotalMB  int `json:"mem_total_mb"`
	MemUsedMB   int `json:"mem_used_mb"`
	MemFreeMB   int `json:"mem_free_mb"`
	MemAvailMB  int `json:"mem_available_mb"`
	MemUsePct   int `json:"mem_use_pct"`
	SwapTotalMB int `json:"swap_total_mb"`
	SwapUsedMB  int `json:"swap_used_mb"`
	// Disk figures describe whichever of "/" and "/opt/orama" has the higher
	// Use%. The three size fields are the strings printed by `df -h`, stored
	// verbatim (not numeric values).
	DiskTotalGB string `json:"disk_total_gb"`
	DiskUsedGB  string `json:"disk_used_gb"`
	DiskAvailGB string `json:"disk_avail_gb"`
	DiskUsePct  int    `json:"disk_use_pct"`
	// InodePct is the Use% column of `df -i /`.
	InodePct int `json:"inode_use_pct"`
	// OOMKills counts dmesg lines matching "out of memory" (case-insensitive).
	OOMKills int `json:"oom_kills"`
	// KernelVersion is the output of `uname -r`.
	KernelVersion string `json:"kernel_version"`
	// TimeUnix is the node's Unix time when collectSystem finished.
	TimeUnix int64 `json:"time_unix"`
}

// --- Systemd Services ---

// ServicesReport lists the state of the core services followed by any
// discovered orama-deploy-* units, plus the units reported by
// `systemctl --failed`.
type ServicesReport struct {
	Services    []ServiceInfo `json:"services"`
	FailedUnits []string      `json:"failed_units,omitempty"`
}

// ServiceInfo describes one systemd unit as reported by `systemctl show` and
// `systemctl is-enabled`. ActiveState and SubState are "unknown" when
// `systemctl show` itself fails.
type ServiceInfo struct {
	Name        string `json:"name"`
	ActiveState string `json:"active_state"`
	SubState    string `json:"sub_state"`
	// Enabled is true only when `systemctl is-enabled` prints exactly "enabled".
	Enabled   bool `json:"enabled"`
	NRestarts int  `json:"n_restarts"`
	// ActiveSinceSec is the number of seconds since ActiveEnterTimestamp;
	// 0 when the timestamp is absent, "n/a", unparseable, or in the future.
	ActiveSinceSec int64 `json:"active_since_sec"`
	// MemoryCurrentMB is MemoryCurrent divided by 1024*1024; 0 when systemd
	// reports "[not set]" or "infinity".
	MemoryCurrentMB int   `json:"memory_current_mb"`
	CPUUsageNSec    int64 `json:"cpu_usage_nsec"`
	MainPID         int   `json:"main_pid"`
	// RestartLoopRisk is set when NRestarts > 3 and the unit has been active
	// for more than 0 but fewer than 300 seconds.
	RestartLoopRisk bool `json:"restart_loop_risk"`
}

// --- RQLite ---

// RQLiteReport summarises the local RQLite node as seen through its HTTP API.
// When /status cannot be fetched or decoded, collectRQLite returns early with
// Responsive=false and every other field at its zero value.
type RQLiteReport struct {
	// Responsive is true once /status was fetched and parsed as JSON.
	Responsive bool `json:"responsive"`
	// Ready is true when GET /readyz answered 200.
	Ready bool `json:"ready"`
	// StrongRead is true when POST /db/query?level=strong with "SELECT 1" answered 200.
	StrongRead bool `json:"strong_read"`

	// The fields below are extracted from the /status document. Because they
	// use omitempty, zero values (including Voter=false) are dropped from the
	// JSON output.
	RaftState   string `json:"raft_state,omitempty"`
	LeaderAddr  string `json:"leader_addr,omitempty"`
	LeaderID    string `json:"leader_id,omitempty"`
	NodeID      string `json:"node_id,omitempty"`
	Term        uint64 `json:"term,omitempty"`
	Applied     uint64 `json:"applied_index,omitempty"`
	Commit      uint64 `json:"commit_index,omitempty"`
	FsmPending  uint64 `json:"fsm_pending,omitempty"`
	LastContact string `json:"last_contact,omitempty"`
	NumPeers    int    `json:"num_peers,omitempty"`
	Voter       bool   `json:"voter,omitempty"`
	DBSize      string `json:"db_size,omitempty"`
	Uptime      string `json:"uptime,omitempty"`
	Version     string `json:"version,omitempty"`
	Goroutines  int    `json:"goroutines,omitempty"`
	// HeapMB is runtime.memory.heap_alloc divided by 1024*1024.
	HeapMB int `json:"heap_mb,omitempty"`

	// Nodes is keyed by node ID and filled from GET /nodes?nonvoters.
	Nodes map[string]RQLiteNodeInfo `json:"nodes,omitempty"`
	// DebugVars is filled from GET /debug/vars; nil when that request or its
	// decoding fails.
	DebugVars *RQLiteDebugVarsReport `json:"debug_vars,omitempty"`
}

// RQLiteNodeInfo is one entry of the /nodes response.
type RQLiteNodeInfo struct {
	Reachable bool `json:"reachable"`
	Leader    bool `json:"leader"`
	Voter     bool `json:"voter"`
	// TimeMS is the "time" value from /nodes multiplied by 1000
	// (seconds converted to milliseconds).
	TimeMS float64 `json:"time_ms"`
	Error  string  `json:"error,omitempty"`
}

// RQLiteDebugVarsReport carries selected counters read from /debug/vars. Each
// field is taken from the top-level key named in its comment; a missing or
// unparseable key yields 0.
type RQLiteDebugVarsReport struct {
	QueryErrors      uint64 `json:"query_errors"`       // api_query_errors
	ExecuteErrors    uint64 `json:"execute_errors"`     // api_execute_errors
	RemoteExecErrors uint64 `json:"remote_exec_errors"` // api_remote_exec_errors
	LeaderNotFound   uint64 `json:"leader_not_found"`   // store_leader_not_found
	SnapshotErrors   uint64 `json:"snapshot_errors"`    // snapshot_errors
	ClientRetries    uint64 `json:"client_retries"`     // client_retries
	ClientTimeouts   uint64 `json:"client_timeouts"`    // client_timeouts
}

// ---
Olric --- + +type OlricReport struct { + ServiceActive bool `json:"service_active"` + MemberlistUp bool `json:"memberlist_up"` + MemberCount int `json:"member_count,omitempty"` + Members []string `json:"members,omitempty"` + Coordinator string `json:"coordinator,omitempty"` + ProcessMemMB int `json:"process_mem_mb"` + RestartCount int `json:"restart_count"` + LogErrors int `json:"log_errors_1h"` + LogSuspects int `json:"log_suspects_1h"` + LogFlapping int `json:"log_flapping_1h"` +} + +// --- IPFS --- + +type IPFSReport struct { + DaemonActive bool `json:"daemon_active"` + ClusterActive bool `json:"cluster_active"` + SwarmPeerCount int `json:"swarm_peer_count"` + ClusterPeerCount int `json:"cluster_peer_count"` + ClusterErrors int `json:"cluster_errors"` + RepoSizeBytes int64 `json:"repo_size_bytes"` + RepoMaxBytes int64 `json:"repo_max_bytes"` + RepoUsePct int `json:"repo_use_pct"` + KuboVersion string `json:"kubo_version,omitempty"` + ClusterVersion string `json:"cluster_version,omitempty"` + HasSwarmKey bool `json:"has_swarm_key"` + BootstrapEmpty bool `json:"bootstrap_empty"` +} + +// --- Gateway --- + +type GatewayReport struct { + Responsive bool `json:"responsive"` + HTTPStatus int `json:"http_status,omitempty"` + Version string `json:"version,omitempty"` + Subsystems map[string]SubsystemHealth `json:"subsystems,omitempty"` +} + +type SubsystemHealth struct { + Status string `json:"status"` + Latency string `json:"latency,omitempty"` + Error string `json:"error,omitempty"` +} + +// --- WireGuard --- + +type WireGuardReport struct { + InterfaceUp bool `json:"interface_up"` + ServiceActive bool `json:"service_active"` + WgIP string `json:"wg_ip,omitempty"` + ListenPort int `json:"listen_port,omitempty"` + PeerCount int `json:"peer_count"` + MTU int `json:"mtu,omitempty"` + ConfigExists bool `json:"config_exists"` + ConfigPerms string `json:"config_perms,omitempty"` + Peers []WGPeerInfo `json:"peers,omitempty"` +} + +type WGPeerInfo struct { + PublicKey string 
`json:"public_key"` + Endpoint string `json:"endpoint,omitempty"` + AllowedIPs string `json:"allowed_ips"` + LatestHandshake int64 `json:"latest_handshake"` + HandshakeAgeSec int64 `json:"handshake_age_sec"` + TransferRx int64 `json:"transfer_rx_bytes"` + TransferTx int64 `json:"transfer_tx_bytes"` + Keepalive int `json:"keepalive,omitempty"` +} + +// --- DNS --- + +type DNSReport struct { + CoreDNSActive bool `json:"coredns_active"` + CaddyActive bool `json:"caddy_active"` + Port53Bound bool `json:"port_53_bound"` + Port80Bound bool `json:"port_80_bound"` + Port443Bound bool `json:"port_443_bound"` + CoreDNSMemMB int `json:"coredns_mem_mb"` + CoreDNSRestarts int `json:"coredns_restarts"` + LogErrors int `json:"log_errors_5m"` + CorefileExists bool `json:"corefile_exists"` + SOAResolves bool `json:"soa_resolves"` + NSResolves bool `json:"ns_resolves"` + NSRecordCount int `json:"ns_record_count"` + WildcardResolves bool `json:"wildcard_resolves"` + BaseAResolves bool `json:"base_a_resolves"` + BaseTLSDaysLeft int `json:"base_tls_days_left"` + WildTLSDaysLeft int `json:"wild_tls_days_left"` +} + +// --- Anyone --- + +type AnyoneReport struct { + RelayActive bool `json:"relay_active"` + ClientActive bool `json:"client_active"` + Mode string `json:"mode,omitempty"` + ORPortListening bool `json:"orport_listening"` + SocksListening bool `json:"socks_listening"` + ControlListening bool `json:"control_listening"` + Bootstrapped bool `json:"bootstrapped"` + BootstrapPct int `json:"bootstrap_pct"` + Fingerprint string `json:"fingerprint,omitempty"` + Nickname string `json:"nickname,omitempty"` +} + +// --- Network --- + +type NetworkReport struct { + InternetReachable bool `json:"internet_reachable"` + DefaultRoute bool `json:"default_route"` + WGRouteExists bool `json:"wg_route_exists"` + TCPEstablished int `json:"tcp_established"` + TCPTimeWait int `json:"tcp_time_wait"` + TCPRetransRate float64 `json:"tcp_retrans_pct"` + ListeningPorts []PortInfo `json:"listening_ports"` 
+ UFWActive bool `json:"ufw_active"` + UFWRules []string `json:"ufw_rules,omitempty"` +} + +type PortInfo struct { + Port int `json:"port"` + Proto string `json:"proto"` + Process string `json:"process,omitempty"` +} + +// --- Processes --- + +type ProcessReport struct { + ZombieCount int `json:"zombie_count"` + Zombies []ProcessInfo `json:"zombies,omitempty"` + OrphanCount int `json:"orphan_count"` + Orphans []ProcessInfo `json:"orphans,omitempty"` + PanicCount int `json:"panic_count_1h"` +} + +type ProcessInfo struct { + PID int `json:"pid"` + PPID int `json:"ppid"` + State string `json:"state"` + Command string `json:"command"` +} + +// --- Namespaces --- + +type NamespaceReport struct { + Name string `json:"name"` + PortBase int `json:"port_base"` + RQLiteUp bool `json:"rqlite_up"` + RQLiteState string `json:"rqlite_state,omitempty"` + RQLiteReady bool `json:"rqlite_ready"` + OlricUp bool `json:"olric_up"` + GatewayUp bool `json:"gateway_up"` + GatewayStatus int `json:"gateway_status,omitempty"` +} diff --git a/pkg/cli/production/report/wireguard.go b/pkg/cli/production/report/wireguard.go new file mode 100644 index 0000000..a88b266 --- /dev/null +++ b/pkg/cli/production/report/wireguard.go @@ -0,0 +1,163 @@ +package report + +import ( + "context" + "os" + "strconv" + "strings" + "time" +) + +// collectWireGuard gathers WireGuard interface status, peer information, +// and configuration details using local commands and sysfs. +func collectWireGuard() *WireGuardReport { + r := &WireGuardReport{} + + // 1. ServiceActive: check if wg-quick@wg0 systemd service is active + { + ctx, cancel := context.WithTimeout(context.Background(), 4*time.Second) + defer cancel() + if out, err := runCmd(ctx, "systemctl", "is-active", "wg-quick@wg0"); err == nil { + r.ServiceActive = strings.TrimSpace(out) == "active" + } + } + + // 2. 
InterfaceUp: check if /sys/class/net/wg0 exists + if _, err := os.Stat("/sys/class/net/wg0"); err == nil { + r.InterfaceUp = true + } + + // If interface is not up, return partial data early. + if !r.InterfaceUp { + // Still check config existence even if interface is down. + if _, err := os.Stat("/etc/wireguard/wg0.conf"); err == nil { + r.ConfigExists = true + + ctx, cancel := context.WithTimeout(context.Background(), 4*time.Second) + defer cancel() + if out, err := runCmd(ctx, "stat", "-c", "%a", "/etc/wireguard/wg0.conf"); err == nil { + r.ConfigPerms = strings.TrimSpace(out) + } + } + return r + } + + // 3. WgIP: extract IP from `ip -4 addr show wg0` + { + ctx, cancel := context.WithTimeout(context.Background(), 4*time.Second) + defer cancel() + if out, err := runCmd(ctx, "ip", "-4", "addr", "show", "wg0"); err == nil { + for _, line := range strings.Split(out, "\n") { + line = strings.TrimSpace(line) + if strings.HasPrefix(line, "inet ") { + // Line format: "inet X.X.X.X/Y scope ..." + fields := strings.Fields(line) + if len(fields) >= 2 { + // Extract just the IP, strip the /prefix + ip := fields[1] + if idx := strings.Index(ip, "/"); idx != -1 { + ip = ip[:idx] + } + r.WgIP = ip + } + break + } + } + } + } + + // 4. MTU: read /sys/class/net/wg0/mtu + if data, err := os.ReadFile("/sys/class/net/wg0/mtu"); err == nil { + if n, err := strconv.Atoi(strings.TrimSpace(string(data))); err == nil { + r.MTU = n + } + } + + // 5. ListenPort: parse from `wg show wg0 listen-port` + { + ctx, cancel := context.WithTimeout(context.Background(), 4*time.Second) + defer cancel() + if out, err := runCmd(ctx, "wg", "show", "wg0", "listen-port"); err == nil { + if n, err := strconv.Atoi(strings.TrimSpace(out)); err == nil { + r.ListenPort = n + } + } + } + + // 6. ConfigExists: check if /etc/wireguard/wg0.conf exists + if _, err := os.Stat("/etc/wireguard/wg0.conf"); err == nil { + r.ConfigExists = true + } + + // 7. 
ConfigPerms: run `stat -c '%a' /etc/wireguard/wg0.conf` + if r.ConfigExists { + ctx, cancel := context.WithTimeout(context.Background(), 4*time.Second) + defer cancel() + if out, err := runCmd(ctx, "stat", "-c", "%a", "/etc/wireguard/wg0.conf"); err == nil { + r.ConfigPerms = strings.TrimSpace(out) + } + } + + // 8. Peers: run `wg show wg0 dump` and parse peer lines + // Line 1: interface (private_key, public_key, listen_port, fwmark) + // Line 2+: peers (public_key, preshared_key, endpoint, allowed_ips, + // latest_handshake, transfer_rx, transfer_tx, persistent_keepalive) + { + ctx, cancel := context.WithTimeout(context.Background(), 4*time.Second) + defer cancel() + if out, err := runCmd(ctx, "wg", "show", "wg0", "dump"); err == nil { + lines := strings.Split(out, "\n") + now := time.Now().Unix() + for i, line := range lines { + if i == 0 { + // Skip interface line + continue + } + line = strings.TrimSpace(line) + if line == "" { + continue + } + fields := strings.Split(line, "\t") + if len(fields) < 8 { + continue + } + + peer := WGPeerInfo{ + PublicKey: fields[0], + Endpoint: fields[2], + AllowedIPs: fields[3], + } + + // LatestHandshake: unix timestamp (0 = never) + if ts, err := strconv.ParseInt(fields[4], 10, 64); err == nil { + peer.LatestHandshake = ts + if ts > 0 { + peer.HandshakeAgeSec = now - ts + } + } + + // TransferRx + if n, err := strconv.ParseInt(fields[5], 10, 64); err == nil { + peer.TransferRx = n + } + + // TransferTx + if n, err := strconv.ParseInt(fields[6], 10, 64); err == nil { + peer.TransferTx = n + } + + // PersistentKeepalive + if fields[7] != "off" { + if n, err := strconv.Atoi(fields[7]); err == nil { + peer.Keepalive = n + } + } + + r.Peers = append(r.Peers, peer) + } + r.PeerCount = len(r.Peers) + } + } + + return r +}