diff --git a/.codex/environments/environment.toml b/.codex/environments/environment.toml deleted file mode 100644 index e88452c..0000000 --- a/.codex/environments/environment.toml +++ /dev/null @@ -1,6 +0,0 @@ -# THIS IS AUTOGENERATED. DO NOT EDIT MANUALLY -version = 1 -name = "network" - -[setup] -script = "export MCP_BEARER_TOKEN=\"ra_9941ab97eb51668394a68963a2ab6fead0ca942afe437a6e2f4a520efcb24036\"" diff --git a/.gitignore b/.gitignore index 0e5f904..7207c6b 100644 --- a/.gitignore +++ b/.gitignore @@ -100,4 +100,9 @@ vps.txt bin-linux/ -website/ \ No newline at end of file +website/ + +terms-agreement + +cli +./inspector \ No newline at end of file diff --git a/Makefile b/Makefile index 827fc78..6f2363f 100644 --- a/Makefile +++ b/Makefile @@ -84,9 +84,9 @@ test-e2e-quick: # Network - Distributed P2P Database System # Makefile for development and build tasks -.PHONY: build clean test run-node run-node2 run-node3 run-example deps tidy fmt vet lint clear-ports install-hooks kill +.PHONY: build clean test run-node run-node2 run-node3 run-example deps tidy fmt vet lint clear-ports install-hooks kill redeploy-devnet redeploy-testnet release health -VERSION := 0.101.6 +VERSION := 0.102.0 COMMIT ?= $(shell git rev-parse --short HEAD 2>/dev/null || echo unknown) DATE ?= $(shell date -u +%Y-%m-%dT%H:%M:%SZ) LDFLAGS := -X 'main.version=$(VERSION)' -X 'main.commit=$(COMMIT)' -X 'main.date=$(DATE)' @@ -196,6 +196,42 @@ stop: kill: @bash scripts/dev-kill-all.sh +# Deploy to devnet (build + rolling upgrade all nodes) +redeploy-devnet: + @bash scripts/redeploy.sh --devnet + +# Deploy to devnet without rebuilding +redeploy-devnet-quick: + @bash scripts/redeploy.sh --devnet --no-build + +# Deploy to testnet (build + rolling upgrade all nodes) +redeploy-testnet: + @bash scripts/redeploy.sh --testnet + +# Deploy to testnet without rebuilding +redeploy-testnet-quick: + @bash scripts/redeploy.sh --testnet --no-build + +# Interactive release workflow (tag + push) +release: + @bash 
scripts/release.sh + +# Check health of all nodes in an environment +# Usage: make health ENV=devnet +health: + @if [ -z "$(ENV)" ]; then \ + echo "Usage: make health ENV=devnet|testnet"; \ + exit 1; \ + fi + @while IFS='|' read -r env host pass role key; do \ + [ -z "$$env" ] && continue; \ + case "$$env" in \#*) continue;; esac; \ + env="$$(echo "$$env" | xargs)"; \ + [ "$$env" != "$(ENV)" ] && continue; \ + role="$$(echo "$$role" | xargs)"; \ + bash scripts/check-node-health.sh "$$host" "$$pass" "$$host ($$role)"; \ + done < scripts/remote-nodes.conf + # Help help: @echo "Available targets:" @@ -225,6 +261,14 @@ help: @echo " Example production test:" @echo " ORAMA_GATEWAY_URL=https://dbrs.space make test-e2e-prod" @echo "" + @echo "Deployment:" + @echo " make redeploy-devnet - Build + rolling deploy to all devnet nodes" + @echo " make redeploy-devnet-quick - Deploy to devnet without rebuilding" + @echo " make redeploy-testnet - Build + rolling deploy to all testnet nodes" + @echo " make redeploy-testnet-quick- Deploy to testnet without rebuilding" + @echo " make health ENV=devnet - Check health of all nodes in an environment" + @echo " make release - Interactive release workflow (tag + push)" + @echo "" @echo "Development Management (via orama):" @echo " ./bin/orama dev status - Show status of all dev services" @echo " ./bin/orama dev logs [--follow]" diff --git a/cmd/cli/main.go b/cmd/cli/main.go index c947477..172fd78 100644 --- a/cmd/cli/main.go +++ b/cmd/cli/main.go @@ -88,6 +88,10 @@ func main() { case "db": cli.HandleDBCommand(args) + // Cluster inspection + case "inspect": + cli.HandleInspectCommand(args) + // Namespace management case "namespace": cli.HandleNamespaceCommand(args) @@ -173,6 +177,12 @@ func showHelp() { fmt.Printf("🏢 Namespaces:\n") fmt.Printf(" namespace delete - Delete current namespace and all resources\n\n") + fmt.Printf("🔍 Cluster Inspection:\n") + fmt.Printf(" inspect - Inspect cluster health via SSH\n") + fmt.Printf(" inspect --env 
devnet - Inspect devnet nodes\n") + fmt.Printf(" inspect --subsystem rqlite - Inspect only RQLite subsystem\n") + fmt.Printf(" inspect --format json - Output as JSON\n\n") + fmt.Printf("🌍 Environments:\n") fmt.Printf(" env list - List all environments\n") fmt.Printf(" env current - Show current environment\n") diff --git a/cmd/gateway/config.go b/cmd/gateway/config.go index 017ff0d..3983f2c 100644 --- a/cmd/gateway/config.go +++ b/cmd/gateway/config.go @@ -14,10 +14,6 @@ import ( "go.uber.org/zap" ) -// For transition, alias main.GatewayConfig to pkg/gateway.Config -// server.go will be removed; this keeps compatibility until then. -type GatewayConfig = gateway.Config - func getEnvDefault(key, def string) string { if v := os.Getenv(key); strings.TrimSpace(v) != "" { return v diff --git a/cmd/inspector/main.go b/cmd/inspector/main.go new file mode 100644 index 0000000..1dc9050 --- /dev/null +++ b/cmd/inspector/main.go @@ -0,0 +1,11 @@ +package main + +import ( + "os" + + "github.com/DeBrosOfficial/network/pkg/cli" +) + +func main() { + cli.HandleInspectCommand(os.Args[1:]) +} diff --git a/docs/COMMON_PROBLEMS.md b/docs/COMMON_PROBLEMS.md new file mode 100644 index 0000000..17ce745 --- /dev/null +++ b/docs/COMMON_PROBLEMS.md @@ -0,0 +1,160 @@ +# Common Problems & Solutions + +Troubleshooting guide for known issues in the Orama Network. + +--- + +## 1. Namespace Gateway: "Olric unavailable" + +**Symptom:** `ns-.orama-devnet.network/v1/health` returns `"olric": {"status": "unavailable"}`. + +**Cause:** The Olric memberlist gossip between namespace nodes is broken. Olric uses UDP pings for health checks — if those fail, the cluster can't bootstrap and the gateway reports Olric as unavailable. 
+ +### Check 1: WireGuard packet loss between nodes + +SSH into each node and ping the other namespace nodes over WireGuard: + +```bash +ping -c 10 -W 2 10.0.0.X # replace with the WG IP of each peer +``` + +If you see packet loss over WireGuard but **not** over the public IP (`ping `), the WireGuard peer session is corrupted. + +**Fix — Reset the WireGuard peer on both sides:** + +```bash +# On Node A — replace and with Node B's values +wg set wg0 peer remove +wg set wg0 peer endpoint :51820 allowed-ips /32 persistent-keepalive 25 + +# On Node B — same but with Node A's values +wg set wg0 peer remove +wg set wg0 peer endpoint :51820 allowed-ips /32 persistent-keepalive 25 +``` + +Then restart services: `sudo orama prod restart` + +You can find peer public keys with `wg show wg0`. + +### Check 2: Olric bound to 0.0.0.0 instead of WireGuard IP + +Check the Olric config on each node: + +```bash +cat /home/debros/.orama/data/namespaces//configs/olric-*.yaml +``` + +If `bindAddr` is `0.0.0.0`, the node will try to bind to IPv6 on dual-stack hosts, breaking memberlist gossip. + +**Fix:** Edit the YAML to use the node's WireGuard IP (run `ip addr show wg0` to find it), then restart: `sudo orama prod restart` + +This was fixed in code (BindAddr validation in `SpawnOlric`), so new namespaces won't have this issue. + +### Check 3: Olric logs show "Failed UDP ping" constantly + +```bash +journalctl -u debros-namespace-olric@.service --no-pager -n 30 +``` + +If every UDP ping fails but TCP stream connections succeed, it's the WireGuard packet loss issue (see Check 1). + +--- + +## 2. Namespace Gateway: Missing config fields + +**Symptom:** Gateway config YAML is missing `global_rqlite_dsn`, has `olric_timeout: 0s`, or `olric_servers` only lists `localhost`. + +**Cause:** Before the spawn handler fix, `spawnGatewayRemote()` didn't send `global_rqlite_dsn` or `olric_timeout` to remote nodes. 
+ +**Fix:** Edit the gateway config manually: + +```bash +vim /home/debros/.orama/data/namespaces//configs/gateway-*.yaml +``` + +Add/fix: +```yaml +global_rqlite_dsn: "http://10.0.0.X:10001" +olric_timeout: 30s +olric_servers: + - "10.0.0.X:10002" + - "10.0.0.Y:10002" + - "10.0.0.Z:10002" +``` + +Then: `sudo orama prod restart` + +This was fixed in code, so new namespaces get the correct config. + +--- + +## 3. Namespace not restoring after restart (missing cluster-state.json) + +**Symptom:** After `orama prod restart`, the namespace services don't come back because `RestoreLocalClustersFromDisk` has no state file. + +**Check:** + +```bash +ls /home/debros/.orama/data/namespaces//cluster-state.json +``` + +If the file doesn't exist, the node can't restore the namespace. + +**Fix:** Create the file manually from another node that has it, or reconstruct it. The format is: + +```json +{ + "namespace": "", + "rqlite": { "http_port": 10001, "raft_port": 10000, ... }, + "olric": { "http_port": 10002, "memberlist_port": 10003, ... }, + "gateway": { "http_port": 10004, ... } +} +``` + +This was fixed in code — `ProvisionCluster` now saves state to all nodes (including remote ones via the `save-cluster-state` spawn action). + +--- + +## 4. Namespace gateway processes not restarting after upgrade + +**Symptom:** After `orama upgrade --restart` or `orama prod restart`, namespace gateway/olric/rqlite services don't start. + +**Cause:** `orama prod stop` disables systemd template services (`debros-namespace-gateway@.service`). They have `PartOf=debros-node.service`, but that only propagates restart to **enabled** services. + +**Fix:** Re-enable the services before restarting: + +```bash +systemctl enable debros-namespace-rqlite@.service +systemctl enable debros-namespace-olric@.service +systemctl enable debros-namespace-gateway@.service +sudo orama prod restart +``` + +This was fixed in code — the upgrade orchestrator now re-enables `@` services before restarting. 
+ +--- + +## 5. SSH commands eating stdin inside heredocs + +**Symptom:** When running a script that SSHes into multiple nodes inside a heredoc (`<<'EOS'`), only the first SSH command runs — the rest are silently skipped. + +**Cause:** `ssh` reads from stdin, consuming the rest of the heredoc. + +**Fix:** Add `-n` flag to all `ssh` calls inside heredocs: + +```bash +ssh -n user@host 'command' +``` + +`scp` is not affected (doesn't read stdin). + +--- + +## General Debugging Tips + +- **Always use `sudo orama prod restart`** instead of raw `systemctl` commands +- **Namespace data lives at:** `/home/debros/.orama/data/namespaces//` +- **Check service logs:** `journalctl -u debros-namespace-olric@.service --no-pager -n 50` +- **Check WireGuard:** `wg show wg0` — look for recent handshakes and transfer bytes +- **Check gateway health:** `curl http://localhost:/v1/health` from the node itself +- **Node IPs:** Check `scripts/remote-nodes.conf` for credentials, `wg show wg0` for WG IPs diff --git a/docs/INSPECTOR.md b/docs/INSPECTOR.md new file mode 100644 index 0000000..c8dcf39 --- /dev/null +++ b/docs/INSPECTOR.md @@ -0,0 +1,213 @@ +# Inspector + +The inspector is a cluster health check tool that SSHs into every node, collects subsystem data in parallel, runs deterministic checks, and optionally sends failures to an AI model for root-cause analysis. + +## Pipeline + +``` +Collect (parallel SSH) → Check (deterministic Go) → Report (table/JSON) → Analyze (optional AI) +``` + +1. **Collect** — SSH into every node in parallel, run diagnostic commands, parse results into structured data. +2. **Check** — Run pure Go check functions against the collected data. Each check produces a pass/fail/warn/skip result with a severity level. +3. **Report** — Print results as a table (default) or JSON. Failures sort first, grouped by subsystem. +4. **Analyze** — If `--ai` is enabled and there are failures or warnings, send them to an LLM via OpenRouter for root-cause analysis. 
+ +## Quick Start + +```bash +# Inspect all subsystems on devnet +orama inspect --env devnet + +# Inspect only RQLite +orama inspect --env devnet --subsystem rqlite + +# JSON output +orama inspect --env devnet --format json + +# With AI analysis +orama inspect --env devnet --ai +``` + +## Usage + +``` +orama inspect [flags] +``` + +| Flag | Default | Description | +|------|---------|-------------| +| `--config` | `scripts/remote-nodes.conf` | Path to node configuration file | +| `--env` | *(required)* | Environment to inspect (`devnet`, `testnet`) | +| `--subsystem` | `all` | Comma-separated subsystems to inspect | +| `--format` | `table` | Output format: `table` or `json` | +| `--timeout` | `30s` | SSH command timeout per node | +| `--verbose` | `false` | Print collection progress | +| `--ai` | `false` | Enable AI analysis of failures | +| `--model` | `moonshotai/kimi-k2.5` | OpenRouter model for AI analysis | +| `--api-key` | `$OPENROUTER_API_KEY` | OpenRouter API key | + +### Subsystem Names + +`rqlite`, `olric`, `ipfs`, `dns`, `wireguard` (alias: `wg`), `system`, `network`, `namespace` + +Multiple subsystems can be combined: `--subsystem rqlite,olric,dns` + +## Subsystems + +| Subsystem | What It Checks | +|-----------|---------------| +| **rqlite** | Raft state, leader election, readyz, commit/applied gap, FSM pending, strong reads, debug vars (query errors, leader_not_found, snapshots), cross-node leader agreement, term consistency, applied index convergence, quorum, version match | +| **olric** | Service active, memberlist up, restart count, memory usage, log analysis (suspects, flapping, errors), cross-node memberlist consistency | +| **ipfs** | Daemon active, cluster active, swarm peer count, cluster peer count, cluster errors, repo usage %, swarm key present, bootstrap list empty, cross-node version consistency | +| **dns** | CoreDNS active, Caddy active, ports (53/80/443), memory, restart count, log errors, Corefile exists, SOA/NS/wildcard/base-A 
resolution, TLS cert expiry, cross-node nameserver availability | +| **wireguard** | Interface up, service active, correct 10.0.0.x IP, listen port 51820, peer count vs expected, MTU 1420, config exists + permissions 600, peer handshakes (fresh/stale/never), peer traffic, catch-all route detection, cross-node peer count + MTU consistency | +| **system** | Core services (debros-node, rqlite, olric, ipfs, ipfs-cluster, wg-quick), nameserver services (coredns, caddy), failed systemd units, memory/disk/inode usage, load average, OOM kills, swap, UFW active, process user (debros), panic count, expected ports | +| **network** | Internet reachability, default route, WireGuard route, TCP connection count, TIME_WAIT count, TCP retransmission rate, WireGuard mesh ping (all peers) | +| **namespace** | Per-namespace: RQLite up + raft state + readyz, Olric memberlist, Gateway HTTP health. Cross-namespace: all-healthy check, RQLite quorum per namespace | + +## Severity Levels + +| Level | When Used | +|-------|-----------| +| **CRITICAL** | Service completely down. Raft quorum lost, RQLite unresponsive, no leader. | +| **HIGH** | Service degraded. Olric down, gateway not responding, IPFS swarm key missing. | +| **MEDIUM** | Non-ideal but functional. Stale handshakes, elevated memory, log suspects. | +| **LOW** | Informational. Non-standard MTU, port mismatch, version skew. | + +## Check Statuses + +| Status | Meaning | +|--------|---------| +| **pass** | Check passed. | +| **fail** | Check failed — action needed. | +| **warn** | Degraded — monitor or investigate. | +| **skip** | Check could not run (insufficient data). | + +## Output Formats + +### Table (default) + +``` +Inspecting 14 devnet nodes... + +## RQLITE +---------------------------------------------------------------------- + OK [CRITICAL] RQLite responding (ubuntu@10.0.0.1) + responsive=true version=v8.36.16 + FAIL [CRITICAL] Cluster has exactly one leader + leaders=0 (NO LEADER) + ... 
+ +====================================================================== +Summary: 800 passed, 12 failed, 31 warnings, 0 skipped (4.2s) +``` + +Failures sort first, then warnings, then passes. Within each group, higher severity checks appear first. + +### JSON (`--format json`) + +```json +{ + "summary": { + "passed": 800, + "failed": 12, + "warned": 31, + "skipped": 0, + "total": 843, + "duration_seconds": 4.2 + }, + "checks": [ + { + "id": "rqlite.responsive", + "name": "RQLite responding", + "subsystem": "rqlite", + "severity": 3, + "status": "pass", + "message": "responsive=true version=v8.36.16", + "node": "ubuntu@10.0.0.1" + } + ] +} +``` + +## AI Analysis + +When `--ai` is enabled, failures and warnings are sent to an LLM via OpenRouter for root-cause analysis. + +```bash +# Use default model (kimi-k2.5) +orama inspect --env devnet --ai + +# Use a different model +orama inspect --env devnet --ai --model openai/gpt-4o + +# Pass API key directly +orama inspect --env devnet --ai --api-key sk-or-... +``` + +The API key can be set via: +1. `--api-key` flag +2. `OPENROUTER_API_KEY` environment variable +3. `.env` file in the current directory + +The AI receives the full check results plus cluster metadata and returns a structured analysis with likely root causes and suggested fixes. + +## Exit Codes + +| Code | Meaning | +|------|---------| +| `0` | All checks passed (or only warnings). | +| `1` | At least one check failed. | + +## Configuration + +The inspector reads node definitions from a pipe-delimited config file (default: `scripts/remote-nodes.conf`). + +### Format + +``` +# environment|user@host|password|role|ssh_key +devnet|ubuntu@1.2.3.4|mypassword|node| +devnet|ubuntu@5.6.7.8|mypassword|nameserver-ns1|/path/to/key +``` + +| Field | Description | +|-------|-------------| +| `environment` | Cluster name (`devnet`, `testnet`) | +| `user@host` | SSH credentials | +| `password` | SSH password | +| `role` | `node` or `nameserver-ns1`, `nameserver-ns2`, etc. 
| +| `ssh_key` | Optional path to SSH private key | + +Blank lines and lines starting with `#` are ignored. + +### Node Roles + +- **`node`** — Regular cluster node. Runs RQLite, Olric, IPFS, WireGuard, namespaces. +- **`nameserver-*`** — DNS nameserver. Runs CoreDNS + Caddy in addition to base services. System checks verify nameserver-specific services. + +## Examples + +```bash +# Full cluster inspection +orama inspect --env devnet + +# Check only networking +orama inspect --env devnet --subsystem wireguard,network + +# Quick RQLite health check +orama inspect --env devnet --subsystem rqlite + +# Verbose mode (shows collection progress) +orama inspect --env devnet --verbose + +# JSON for scripting / piping +orama inspect --env devnet --format json | jq '.checks[] | select(.status == "fail")' + +# AI-assisted debugging +orama inspect --env devnet --ai --model anthropic/claude-sonnet-4 + +# Custom config file +orama inspect --config /path/to/nodes.conf --env testnet +``` diff --git a/examples/functions/build.sh b/docs/examples/functions/build.sh similarity index 100% rename from examples/functions/build.sh rename to docs/examples/functions/build.sh diff --git a/examples/functions/counter/main.go b/docs/examples/functions/counter/main.go similarity index 100% rename from examples/functions/counter/main.go rename to docs/examples/functions/counter/main.go diff --git a/examples/functions/echo/main.go b/docs/examples/functions/echo/main.go similarity index 100% rename from examples/functions/echo/main.go rename to docs/examples/functions/echo/main.go diff --git a/examples/functions/hello/main.go b/docs/examples/functions/hello/main.go similarity index 100% rename from examples/functions/hello/main.go rename to docs/examples/functions/hello/main.go diff --git a/e2e/cluster/ipfs_cluster_test.go b/e2e/cluster/ipfs_cluster_test.go deleted file mode 100644 index 76c01fd..0000000 --- a/e2e/cluster/ipfs_cluster_test.go +++ /dev/null @@ -1,415 +0,0 @@ -//go:build e2e - 
-package cluster_test - -import ( - "bytes" - "context" - "fmt" - "io" - "testing" - "time" - - "github.com/DeBrosOfficial/network/e2e" - "github.com/DeBrosOfficial/network/pkg/ipfs" -) - -// Note: These tests connect directly to IPFS Cluster API (localhost:9094) -// and IPFS API (localhost:4501). They are for local development only. -// For production testing, use storage_http_test.go which uses gateway endpoints. - -func TestIPFSCluster_Health(t *testing.T) { - e2e.SkipIfProduction(t) // Direct IPFS connection not available in production - ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) - defer cancel() - - logger := e2e.NewTestLogger(t) - cfg := ipfs.Config{ - ClusterAPIURL: e2e.GetIPFSClusterURL(), - Timeout: 10 * time.Second, - } - - client, err := ipfs.NewClient(cfg, logger) - if err != nil { - t.Fatalf("failed to create IPFS client: %v", err) - } - - err = client.Health(ctx) - if err != nil { - t.Fatalf("health check failed: %v", err) - } -} - -func TestIPFSCluster_GetPeerCount(t *testing.T) { - e2e.SkipIfProduction(t) - ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) - defer cancel() - - logger := e2e.NewTestLogger(t) - cfg := ipfs.Config{ - ClusterAPIURL: e2e.GetIPFSClusterURL(), - Timeout: 10 * time.Second, - } - - client, err := ipfs.NewClient(cfg, logger) - if err != nil { - t.Fatalf("failed to create IPFS client: %v", err) - } - - peerCount, err := client.GetPeerCount(ctx) - if err != nil { - t.Fatalf("get peer count failed: %v", err) - } - - if peerCount < 0 { - t.Fatalf("expected non-negative peer count, got %d", peerCount) - } - - t.Logf("IPFS cluster peers: %d", peerCount) -} - -func TestIPFSCluster_AddFile(t *testing.T) { - e2e.SkipIfProduction(t) - ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) - defer cancel() - - logger := e2e.NewTestLogger(t) - cfg := ipfs.Config{ - ClusterAPIURL: e2e.GetIPFSClusterURL(), - Timeout: 30 * time.Second, - } - - client, err := 
ipfs.NewClient(cfg, logger) - if err != nil { - t.Fatalf("failed to create IPFS client: %v", err) - } - - content := []byte("IPFS cluster test content") - result, err := client.Add(ctx, bytes.NewReader(content), "test.txt") - if err != nil { - t.Fatalf("add file failed: %v", err) - } - - if result.Cid == "" { - t.Fatalf("expected non-empty CID") - } - - if result.Size != int64(len(content)) { - t.Fatalf("expected size %d, got %d", len(content), result.Size) - } - - t.Logf("Added file with CID: %s", result.Cid) -} - -func TestIPFSCluster_PinFile(t *testing.T) { - e2e.SkipIfProduction(t) - ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) - defer cancel() - - logger := e2e.NewTestLogger(t) - cfg := ipfs.Config{ - ClusterAPIURL: e2e.GetIPFSClusterURL(), - Timeout: 30 * time.Second, - } - - client, err := ipfs.NewClient(cfg, logger) - if err != nil { - t.Fatalf("failed to create IPFS client: %v", err) - } - - // Add file first - content := []byte("IPFS pin test content") - addResult, err := client.Add(ctx, bytes.NewReader(content), "pin-test.txt") - if err != nil { - t.Fatalf("add file failed: %v", err) - } - - cid := addResult.Cid - - // Pin the file - pinResult, err := client.Pin(ctx, cid, "pinned-file", 1) - if err != nil { - t.Fatalf("pin file failed: %v", err) - } - - if pinResult.Cid != cid { - t.Fatalf("expected cid %s, got %s", cid, pinResult.Cid) - } - - t.Logf("Pinned file: %s", cid) -} - -func TestIPFSCluster_PinStatus(t *testing.T) { - e2e.SkipIfProduction(t) - ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) - defer cancel() - - logger := e2e.NewTestLogger(t) - cfg := ipfs.Config{ - ClusterAPIURL: e2e.GetIPFSClusterURL(), - Timeout: 30 * time.Second, - } - - client, err := ipfs.NewClient(cfg, logger) - if err != nil { - t.Fatalf("failed to create IPFS client: %v", err) - } - - // Add and pin file - content := []byte("IPFS status test content") - addResult, err := client.Add(ctx, bytes.NewReader(content), 
"status-test.txt") - if err != nil { - t.Fatalf("add file failed: %v", err) - } - - cid := addResult.Cid - - pinResult, err := client.Pin(ctx, cid, "status-test", 1) - if err != nil { - t.Fatalf("pin file failed: %v", err) - } - - if pinResult.Cid != cid { - t.Fatalf("expected cid %s, got %s", cid, pinResult.Cid) - } - - // Give pin time to propagate - e2e.Delay(1000) - - // Get status - status, err := client.PinStatus(ctx, cid) - if err != nil { - t.Fatalf("get pin status failed: %v", err) - } - - if status.Cid != cid { - t.Fatalf("expected cid %s, got %s", cid, status.Cid) - } - - if status.Name != "status-test" { - t.Fatalf("expected name 'status-test', got %s", status.Name) - } - - if status.ReplicationFactor < 1 { - t.Logf("warning: replication factor is %d, expected >= 1", status.ReplicationFactor) - } - - t.Logf("Pin status: %s (replication: %d, peers: %d)", status.Status, status.ReplicationFactor, len(status.Peers)) -} - -func TestIPFSCluster_UnpinFile(t *testing.T) { - e2e.SkipIfProduction(t) - ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) - defer cancel() - - logger := e2e.NewTestLogger(t) - cfg := ipfs.Config{ - ClusterAPIURL: e2e.GetIPFSClusterURL(), - Timeout: 30 * time.Second, - } - - client, err := ipfs.NewClient(cfg, logger) - if err != nil { - t.Fatalf("failed to create IPFS client: %v", err) - } - - // Add and pin file - content := []byte("IPFS unpin test content") - addResult, err := client.Add(ctx, bytes.NewReader(content), "unpin-test.txt") - if err != nil { - t.Fatalf("add file failed: %v", err) - } - - cid := addResult.Cid - - _, err = client.Pin(ctx, cid, "unpin-test", 1) - if err != nil { - t.Fatalf("pin file failed: %v", err) - } - - // Unpin file - err = client.Unpin(ctx, cid) - if err != nil { - t.Fatalf("unpin file failed: %v", err) - } - - t.Logf("Unpinned file: %s", cid) -} - -func TestIPFSCluster_GetFile(t *testing.T) { - e2e.SkipIfProduction(t) - ctx, cancel := context.WithTimeout(context.Background(), 
30*time.Second) - defer cancel() - - logger := e2e.NewTestLogger(t) - cfg := ipfs.Config{ - ClusterAPIURL: e2e.GetIPFSClusterURL(), - Timeout: 30 * time.Second, - } - - client, err := ipfs.NewClient(cfg, logger) - if err != nil { - t.Fatalf("failed to create IPFS client: %v", err) - } - - // Add file - content := []byte("IPFS get test content") - addResult, err := client.Add(ctx, bytes.NewReader(content), "get-test.txt") - if err != nil { - t.Fatalf("add file failed: %v", err) - } - - cid := addResult.Cid - - // Give time for propagation - e2e.Delay(1000) - - // Get file - rc, err := client.Get(ctx, cid, e2e.GetIPFSAPIURL()) - if err != nil { - t.Fatalf("get file failed: %v", err) - } - defer rc.Close() - - retrievedContent, err := io.ReadAll(rc) - if err != nil { - t.Fatalf("failed to read content: %v", err) - } - - if !bytes.Equal(retrievedContent, content) { - t.Fatalf("content mismatch: expected %q, got %q", string(content), string(retrievedContent)) - } - - t.Logf("Retrieved file: %s (%d bytes)", cid, len(retrievedContent)) -} - -func TestIPFSCluster_LargeFile(t *testing.T) { - e2e.SkipIfProduction(t) - ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second) - defer cancel() - - logger := e2e.NewTestLogger(t) - cfg := ipfs.Config{ - ClusterAPIURL: e2e.GetIPFSClusterURL(), - Timeout: 60 * time.Second, - } - - client, err := ipfs.NewClient(cfg, logger) - if err != nil { - t.Fatalf("failed to create IPFS client: %v", err) - } - - // Create 5MB file - content := bytes.Repeat([]byte("x"), 5*1024*1024) - result, err := client.Add(ctx, bytes.NewReader(content), "large.bin") - if err != nil { - t.Fatalf("add large file failed: %v", err) - } - - if result.Cid == "" { - t.Fatalf("expected non-empty CID") - } - - if result.Size != int64(len(content)) { - t.Fatalf("expected size %d, got %d", len(content), result.Size) - } - - t.Logf("Added large file with CID: %s (%d bytes)", result.Cid, result.Size) -} - -func TestIPFSCluster_ReplicationFactor(t 
*testing.T) { - e2e.SkipIfProduction(t) // Direct IPFS connection not available in production - ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) - defer cancel() - - logger := e2e.NewTestLogger(t) - cfg := ipfs.Config{ - ClusterAPIURL: e2e.GetIPFSClusterURL(), - Timeout: 30 * time.Second, - } - - client, err := ipfs.NewClient(cfg, logger) - if err != nil { - t.Fatalf("failed to create IPFS client: %v", err) - } - - // Add file - content := []byte("IPFS replication test content") - addResult, err := client.Add(ctx, bytes.NewReader(content), "replication-test.txt") - if err != nil { - t.Fatalf("add file failed: %v", err) - } - - cid := addResult.Cid - - // Pin with specific replication factor - replicationFactor := 2 - pinResult, err := client.Pin(ctx, cid, "replication-test", replicationFactor) - if err != nil { - t.Fatalf("pin file failed: %v", err) - } - - if pinResult.Cid != cid { - t.Fatalf("expected cid %s, got %s", cid, pinResult.Cid) - } - - // Give time for replication - e2e.Delay(2000) - - // Check status - status, err := client.PinStatus(ctx, cid) - if err != nil { - t.Fatalf("get pin status failed: %v", err) - } - - t.Logf("Replication factor: requested=%d, actual=%d, peers=%d", replicationFactor, status.ReplicationFactor, len(status.Peers)) -} - -func TestIPFSCluster_MultipleFiles(t *testing.T) { - e2e.SkipIfProduction(t) // Direct IPFS connection not available in production - ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second) - defer cancel() - - logger := e2e.NewTestLogger(t) - cfg := ipfs.Config{ - ClusterAPIURL: e2e.GetIPFSClusterURL(), - Timeout: 30 * time.Second, - } - - client, err := ipfs.NewClient(cfg, logger) - if err != nil { - t.Fatalf("failed to create IPFS client: %v", err) - } - - // Add multiple files - numFiles := 5 - var cids []string - - for i := 0; i < numFiles; i++ { - content := []byte(fmt.Sprintf("File %d", i)) - result, err := client.Add(ctx, bytes.NewReader(content), 
fmt.Sprintf("file%d.txt", i)) - if err != nil { - t.Fatalf("add file %d failed: %v", i, err) - } - cids = append(cids, result.Cid) - } - - if len(cids) != numFiles { - t.Fatalf("expected %d files added, got %d", numFiles, len(cids)) - } - - // Verify all files exist - for i, cid := range cids { - status, err := client.PinStatus(ctx, cid) - if err != nil { - t.Logf("warning: failed to get status for file %d: %v", i, err) - continue - } - - if status.Cid != cid { - t.Fatalf("expected cid %s, got %s", cid, status.Cid) - } - } - - t.Logf("Successfully added and verified %d files", numFiles) -} diff --git a/e2e/cluster/libp2p_connectivity_test.go b/e2e/cluster/libp2p_connectivity_test.go deleted file mode 100644 index 225c751..0000000 --- a/e2e/cluster/libp2p_connectivity_test.go +++ /dev/null @@ -1,296 +0,0 @@ -//go:build e2e - -package cluster_test - -import ( - "context" - "net/http" - "strings" - "testing" - "time" - - "github.com/DeBrosOfficial/network/e2e" -) - -func TestLibP2P_PeerConnectivity(t *testing.T) { - e2e.SkipIfMissingGateway(t) - - ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) - defer cancel() - - // Create and connect client - c := e2e.NewNetworkClient(t) - if err := c.Connect(); err != nil { - t.Fatalf("connect failed: %v", err) - } - defer c.Disconnect() - - // Verify peer connectivity through the gateway - req := &e2e.HTTPRequest{ - Method: http.MethodGet, - URL: e2e.GetGatewayURL() + "/v1/network/peers", - } - - body, status, err := req.Do(ctx) - if err != nil { - t.Fatalf("peers request failed: %v", err) - } - - if status != http.StatusOK { - t.Fatalf("expected status 200, got %d", status) - } - - var resp map[string]interface{} - if err := e2e.DecodeJSON(body, &resp); err != nil { - t.Fatalf("failed to decode response: %v", err) - } - - peers := resp["peers"].([]interface{}) - if len(peers) == 0 { - t.Logf("warning: no peers connected (cluster may still be initializing)") - } -} - -func TestLibP2P_BootstrapPeers(t 
*testing.T) { - e2e.SkipIfMissingGateway(t) - - ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) - defer cancel() - - bootstrapPeers := e2e.GetBootstrapPeers() - if len(bootstrapPeers) == 0 { - t.Skipf("E2E_BOOTSTRAP_PEERS not set; skipping") - } - - // Create client with bootstrap peers explicitly set - c := e2e.NewNetworkClient(t) - if err := c.Connect(); err != nil { - t.Fatalf("connect failed: %v", err) - } - defer c.Disconnect() - - // Give peer discovery time - e2e.Delay(2000) - - // Verify we're connected (check via gateway status) - req := &e2e.HTTPRequest{ - Method: http.MethodGet, - URL: e2e.GetGatewayURL() + "/v1/network/status", - } - - body, status, err := req.Do(ctx) - if err != nil { - t.Fatalf("status request failed: %v", err) - } - - if status != http.StatusOK { - t.Fatalf("expected status 200, got %d", status) - } - - var resp map[string]interface{} - if err := e2e.DecodeJSON(body, &resp); err != nil { - t.Fatalf("failed to decode response: %v", err) - } - - if resp["connected"] != true { - t.Logf("warning: client not connected to network (cluster may still be initializing)") - } -} - -func TestLibP2P_MultipleClientConnections(t *testing.T) { - e2e.SkipIfMissingGateway(t) - - ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) - defer cancel() - - // Create multiple clients - c1 := e2e.NewNetworkClient(t) - c2 := e2e.NewNetworkClient(t) - c3 := e2e.NewNetworkClient(t) - - if err := c1.Connect(); err != nil { - t.Fatalf("c1 connect failed: %v", err) - } - defer c1.Disconnect() - - if err := c2.Connect(); err != nil { - t.Fatalf("c2 connect failed: %v", err) - } - defer c2.Disconnect() - - if err := c3.Connect(); err != nil { - t.Fatalf("c3 connect failed: %v", err) - } - defer c3.Disconnect() - - // Give peer discovery time - e2e.Delay(2000) - - // Verify gateway sees multiple peers - req := &e2e.HTTPRequest{ - Method: http.MethodGet, - URL: e2e.GetGatewayURL() + "/v1/network/peers", - } - - body, 
status, err := req.Do(ctx) - if err != nil { - t.Fatalf("peers request failed: %v", err) - } - - if status != http.StatusOK { - t.Fatalf("expected status 200, got %d", status) - } - - var resp map[string]interface{} - if err := e2e.DecodeJSON(body, &resp); err != nil { - t.Fatalf("failed to decode response: %v", err) - } - - peers := resp["peers"].([]interface{}) - if len(peers) < 1 { - t.Logf("warning: expected at least 1 peer, got %d", len(peers)) - } -} - -func TestLibP2P_ReconnectAfterDisconnect(t *testing.T) { - e2e.SkipIfMissingGateway(t) - - ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) - defer cancel() - - c := e2e.NewNetworkClient(t) - - // Connect - if err := c.Connect(); err != nil { - t.Fatalf("connect failed: %v", err) - } - - // Verify connected via gateway - req1 := &e2e.HTTPRequest{ - Method: http.MethodGet, - URL: e2e.GetGatewayURL() + "/v1/network/status", - } - - _, status1, err := req1.Do(ctx) - if err != nil || status1 != http.StatusOK { - t.Logf("warning: gateway check failed before disconnect: status %d, err %v", status1, err) - } - - // Disconnect - if err := c.Disconnect(); err != nil { - t.Logf("warning: disconnect failed: %v", err) - } - - // Give time for disconnect to propagate - e2e.Delay(500) - - // Reconnect - if err := c.Connect(); err != nil { - t.Fatalf("reconnect failed: %v", err) - } - defer c.Disconnect() - - // Verify connected via gateway again - req2 := &e2e.HTTPRequest{ - Method: http.MethodGet, - URL: e2e.GetGatewayURL() + "/v1/network/status", - } - - _, status2, err := req2.Do(ctx) - if err != nil || status2 != http.StatusOK { - t.Logf("warning: gateway check failed after reconnect: status %d, err %v", status2, err) - } -} - -func TestLibP2P_PeerDiscovery(t *testing.T) { - e2e.SkipIfMissingGateway(t) - - ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) - defer cancel() - - // Create client - c := e2e.NewNetworkClient(t) - if err := c.Connect(); err != nil { - 
t.Fatalf("connect failed: %v", err) - } - defer c.Disconnect() - - // Give peer discovery time - e2e.Delay(3000) - - // Get peer list - req := &e2e.HTTPRequest{ - Method: http.MethodGet, - URL: e2e.GetGatewayURL() + "/v1/network/peers", - } - - body, status, err := req.Do(ctx) - if err != nil { - t.Fatalf("peers request failed: %v", err) - } - - if status != http.StatusOK { - t.Fatalf("expected status 200, got %d", status) - } - - var resp map[string]interface{} - if err := e2e.DecodeJSON(body, &resp); err != nil { - t.Fatalf("failed to decode response: %v", err) - } - - peers := resp["peers"].([]interface{}) - if len(peers) == 0 { - t.Logf("warning: no peers discovered (cluster may not have multiple nodes)") - } else { - // Verify peer format (should be multiaddr strings) - for _, p := range peers { - peerStr := p.(string) - if !strings.Contains(peerStr, "/p2p/") && !strings.Contains(peerStr, "/ipfs/") { - t.Logf("warning: unexpected peer format: %s", peerStr) - } - } - } -} - -func TestLibP2P_PeerAddressFormat(t *testing.T) { - e2e.SkipIfMissingGateway(t) - - ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) - defer cancel() - - // Create client - c := e2e.NewNetworkClient(t) - if err := c.Connect(); err != nil { - t.Fatalf("connect failed: %v", err) - } - defer c.Disconnect() - - // Get peer list - req := &e2e.HTTPRequest{ - Method: http.MethodGet, - URL: e2e.GetGatewayURL() + "/v1/network/peers", - } - - body, status, err := req.Do(ctx) - if err != nil { - t.Fatalf("peers request failed: %v", err) - } - - if status != http.StatusOK { - t.Fatalf("expected status 200, got %d", status) - } - - var resp map[string]interface{} - if err := e2e.DecodeJSON(body, &resp); err != nil { - t.Fatalf("failed to decode response: %v", err) - } - - peers := resp["peers"].([]interface{}) - for _, p := range peers { - peerStr := p.(string) - // Multiaddrs should start with / - if !strings.HasPrefix(peerStr, "/") { - t.Fatalf("expected multiaddr format, got 
%s", peerStr) - } - } -} diff --git a/e2e/cluster/olric_cluster_test.go b/e2e/cluster/olric_cluster_test.go deleted file mode 100644 index e6359d8..0000000 --- a/e2e/cluster/olric_cluster_test.go +++ /dev/null @@ -1,338 +0,0 @@ -//go:build e2e - -package cluster_test - -import ( - "encoding/json" - "fmt" - "net" - "net/http" - "strings" - "testing" - "time" - - "github.com/DeBrosOfficial/network/e2e" - "github.com/stretchr/testify/require" -) - -// ============================================================================= -// STRICT OLRIC CACHE DISTRIBUTION TESTS -// These tests verify that Olric cache data is properly distributed across nodes. -// Tests FAIL if distribution doesn't work - no skips, no warnings. -// ============================================================================= - -// getOlricNodeAddresses returns HTTP addresses of Olric nodes -// Note: Olric HTTP port is typically on port 3320 for the main cluster -func getOlricNodeAddresses() []string { - // In dev mode, we have a single Olric instance - // In production, each node runs its own Olric instance - return []string{ - "http://localhost:3320", - } -} - -// TestOlric_BasicDistribution verifies cache operations work across the cluster. 
-func TestOlric_BasicDistribution(t *testing.T) { - // Note: Not using SkipIfMissingGateway() since LoadTestEnv() creates its own API key - env, err := e2e.LoadTestEnv() - require.NoError(t, err, "FAIL: Could not load test environment") - require.NotEmpty(t, env.APIKey, "FAIL: No API key available") - - dmap := fmt.Sprintf("dist_test_%d", time.Now().UnixNano()) - - t.Run("Put_and_get_from_same_gateway", func(t *testing.T) { - key := fmt.Sprintf("key_%d", time.Now().UnixNano()) - value := fmt.Sprintf("value_%d", time.Now().UnixNano()) - - // Put - err := e2e.PutToOlric(env.GatewayURL, env.APIKey, dmap, key, value) - require.NoError(t, err, "FAIL: Could not put value to cache") - - // Get - retrieved, err := e2e.GetFromOlric(env.GatewayURL, env.APIKey, dmap, key) - require.NoError(t, err, "FAIL: Could not get value from cache") - require.Equal(t, value, retrieved, "FAIL: Retrieved value doesn't match") - - t.Logf(" ✓ Put/Get works: %s = %s", key, value) - }) - - t.Run("Multiple_keys_distributed", func(t *testing.T) { - // Put multiple keys (should be distributed across partitions) - keys := make(map[string]string) - for i := 0; i < 20; i++ { - key := fmt.Sprintf("dist_key_%d_%d", i, time.Now().UnixNano()) - value := fmt.Sprintf("dist_value_%d", i) - keys[key] = value - - err := e2e.PutToOlric(env.GatewayURL, env.APIKey, dmap, key, value) - require.NoError(t, err, "FAIL: Could not put key %s", key) - } - - t.Logf(" Put 20 keys to cache") - - // Verify all keys are retrievable - for key, expectedValue := range keys { - retrieved, err := e2e.GetFromOlric(env.GatewayURL, env.APIKey, dmap, key) - require.NoError(t, err, "FAIL: Could not get key %s", key) - require.Equal(t, expectedValue, retrieved, "FAIL: Value mismatch for key %s", key) - } - - t.Logf(" ✓ All 20 keys are retrievable") - }) -} - -// TestOlric_ConcurrentAccess verifies cache handles concurrent operations correctly. 
-func TestOlric_ConcurrentAccess(t *testing.T) { - env, err := e2e.LoadTestEnv() - require.NoError(t, err, "FAIL: Could not load test environment") - - dmap := fmt.Sprintf("concurrent_test_%d", time.Now().UnixNano()) - - t.Run("Concurrent_writes_to_same_key", func(t *testing.T) { - key := fmt.Sprintf("concurrent_key_%d", time.Now().UnixNano()) - - // Launch multiple goroutines writing to the same key - done := make(chan error, 10) - for i := 0; i < 10; i++ { - go func(idx int) { - value := fmt.Sprintf("concurrent_value_%d", idx) - err := e2e.PutToOlric(env.GatewayURL, env.APIKey, dmap, key, value) - done <- err - }(i) - } - - // Wait for all writes - var errors []error - for i := 0; i < 10; i++ { - if err := <-done; err != nil { - errors = append(errors, err) - } - } - - require.Empty(t, errors, "FAIL: %d concurrent writes failed: %v", len(errors), errors) - - // The key should have ONE of the values (last write wins) - retrieved, err := e2e.GetFromOlric(env.GatewayURL, env.APIKey, dmap, key) - require.NoError(t, err, "FAIL: Could not get key after concurrent writes") - require.Contains(t, retrieved, "concurrent_value_", "FAIL: Value doesn't match expected pattern") - - t.Logf(" ✓ Concurrent writes succeeded, final value: %s", retrieved) - }) - - t.Run("Concurrent_reads_and_writes", func(t *testing.T) { - key := fmt.Sprintf("rw_key_%d", time.Now().UnixNano()) - initialValue := "initial_value" - - // Set initial value - err := e2e.PutToOlric(env.GatewayURL, env.APIKey, dmap, key, initialValue) - require.NoError(t, err, "FAIL: Could not set initial value") - - // Launch concurrent readers and writers - done := make(chan error, 20) - - // 10 readers - for i := 0; i < 10; i++ { - go func() { - _, err := e2e.GetFromOlric(env.GatewayURL, env.APIKey, dmap, key) - done <- err - }() - } - - // 10 writers - for i := 0; i < 10; i++ { - go func(idx int) { - value := fmt.Sprintf("updated_value_%d", idx) - err := e2e.PutToOlric(env.GatewayURL, env.APIKey, dmap, key, value) - 
done <- err - }(i) - } - - // Wait for all operations - var readErrors, writeErrors []error - for i := 0; i < 20; i++ { - if err := <-done; err != nil { - if i < 10 { - readErrors = append(readErrors, err) - } else { - writeErrors = append(writeErrors, err) - } - } - } - - require.Empty(t, readErrors, "FAIL: %d reads failed", len(readErrors)) - require.Empty(t, writeErrors, "FAIL: %d writes failed", len(writeErrors)) - - t.Logf(" ✓ Concurrent read/write operations succeeded") - }) -} - -// TestOlric_NamespaceClusterCache verifies cache works in namespace-specific clusters. -func TestOlric_NamespaceClusterCache(t *testing.T) { - // Create a new namespace - namespace := fmt.Sprintf("cache-test-%d", time.Now().UnixNano()) - - env, err := e2e.LoadTestEnvWithNamespace(namespace) - require.NoError(t, err, "FAIL: Could not create namespace for cache test") - require.NotEmpty(t, env.APIKey, "FAIL: No API key") - - t.Logf("Created namespace %s", namespace) - - dmap := fmt.Sprintf("ns_cache_%d", time.Now().UnixNano()) - - t.Run("Cache_operations_work_in_namespace", func(t *testing.T) { - key := fmt.Sprintf("ns_key_%d", time.Now().UnixNano()) - value := fmt.Sprintf("ns_value_%d", time.Now().UnixNano()) - - // Put using namespace API key - err := e2e.PutToOlric(env.GatewayURL, env.APIKey, dmap, key, value) - require.NoError(t, err, "FAIL: Could not put value in namespace cache") - - // Get - retrieved, err := e2e.GetFromOlric(env.GatewayURL, env.APIKey, dmap, key) - require.NoError(t, err, "FAIL: Could not get value from namespace cache") - require.Equal(t, value, retrieved, "FAIL: Value mismatch in namespace cache") - - t.Logf(" ✓ Namespace cache operations work: %s = %s", key, value) - }) - - // Check if namespace Olric instances are running (port 10003 offset in port blocks) - var nsOlricPorts []int - for port := 10003; port <= 10098; port += 5 { - conn, err := net.DialTimeout("tcp", fmt.Sprintf("localhost:%d", port), 1*time.Second) - if err == nil { - conn.Close() - 
nsOlricPorts = append(nsOlricPorts, port) - } - } - - if len(nsOlricPorts) > 0 { - t.Logf("Found %d namespace Olric memberlist ports: %v", len(nsOlricPorts), nsOlricPorts) - - t.Run("Namespace_Olric_nodes_connected", func(t *testing.T) { - // Verify all namespace Olric nodes can be reached - for _, port := range nsOlricPorts { - conn, err := net.DialTimeout("tcp", fmt.Sprintf("localhost:%d", port), 2*time.Second) - require.NoError(t, err, "FAIL: Cannot connect to namespace Olric on port %d", port) - conn.Close() - t.Logf(" ✓ Namespace Olric memberlist on port %d is reachable", port) - } - }) - } -} - -// TestOlric_DataConsistency verifies data remains consistent across operations. -func TestOlric_DataConsistency(t *testing.T) { - env, err := e2e.LoadTestEnv() - require.NoError(t, err, "FAIL: Could not load test environment") - - dmap := fmt.Sprintf("consistency_test_%d", time.Now().UnixNano()) - - t.Run("Update_preserves_latest_value", func(t *testing.T) { - key := fmt.Sprintf("update_key_%d", time.Now().UnixNano()) - - // Write multiple times - for i := 1; i <= 5; i++ { - value := fmt.Sprintf("version_%d", i) - err := e2e.PutToOlric(env.GatewayURL, env.APIKey, dmap, key, value) - require.NoError(t, err, "FAIL: Could not update key to version %d", i) - } - - // Final read should return latest version - retrieved, err := e2e.GetFromOlric(env.GatewayURL, env.APIKey, dmap, key) - require.NoError(t, err, "FAIL: Could not read final value") - require.Equal(t, "version_5", retrieved, "FAIL: Latest version not preserved") - - t.Logf(" ✓ Latest value preserved after 5 updates") - }) - - t.Run("Delete_removes_key", func(t *testing.T) { - key := fmt.Sprintf("delete_key_%d", time.Now().UnixNano()) - value := "to_be_deleted" - - // Put - err := e2e.PutToOlric(env.GatewayURL, env.APIKey, dmap, key, value) - require.NoError(t, err, "FAIL: Could not put value") - - // Verify it exists - retrieved, err := e2e.GetFromOlric(env.GatewayURL, env.APIKey, dmap, key) - require.NoError(t, 
err, "FAIL: Could not get value before delete") - require.Equal(t, value, retrieved) - - // Delete (POST with JSON body) - deleteBody := map[string]interface{}{ - "dmap": dmap, - "key": key, - } - deleteBytes, _ := json.Marshal(deleteBody) - req, _ := http.NewRequest("POST", env.GatewayURL+"/v1/cache/delete", strings.NewReader(string(deleteBytes))) - req.Header.Set("Content-Type", "application/json") - req.Header.Set("Authorization", "Bearer "+env.APIKey) - client := &http.Client{Timeout: 10 * time.Second} - resp, err := client.Do(req) - require.NoError(t, err, "FAIL: Delete request failed") - resp.Body.Close() - require.True(t, resp.StatusCode == http.StatusOK || resp.StatusCode == http.StatusNoContent, - "FAIL: Delete returned unexpected status %d", resp.StatusCode) - - // Verify key is gone - _, err = e2e.GetFromOlric(env.GatewayURL, env.APIKey, dmap, key) - require.Error(t, err, "FAIL: Key should not exist after delete") - require.Contains(t, err.Error(), "not found", "FAIL: Expected 'not found' error") - - t.Logf(" ✓ Delete properly removes key") - }) -} - -// TestOlric_TTLExpiration verifies TTL expiration works. -// NOTE: TTL is currently parsed but not applied by the cache handler (TODO in set_handler.go). -// This test is skipped until TTL support is fully implemented. -func TestOlric_TTLExpiration(t *testing.T) { - t.Skip("TTL support not yet implemented in cache handler - see set_handler.go lines 88-98") - - env, err := e2e.LoadTestEnv() - require.NoError(t, err, "FAIL: Could not load test environment") - - dmap := fmt.Sprintf("ttl_test_%d", time.Now().UnixNano()) - - t.Run("Key_expires_after_TTL", func(t *testing.T) { - key := fmt.Sprintf("ttl_key_%d", time.Now().UnixNano()) - value := "expires_soon" - ttlSeconds := 3 - - // Put with TTL (TTL is a duration string like "3s", "1m", etc.) 
- reqBody := map[string]interface{}{ - "dmap": dmap, - "key": key, - "value": value, - "ttl": fmt.Sprintf("%ds", ttlSeconds), - } - bodyBytes, _ := json.Marshal(reqBody) - - req, _ := http.NewRequest("POST", env.GatewayURL+"/v1/cache/put", strings.NewReader(string(bodyBytes))) - req.Header.Set("Content-Type", "application/json") - req.Header.Set("Authorization", "Bearer "+env.APIKey) - - client := &http.Client{Timeout: 10 * time.Second} - resp, err := client.Do(req) - require.NoError(t, err, "FAIL: Put with TTL failed") - resp.Body.Close() - require.True(t, resp.StatusCode == http.StatusOK || resp.StatusCode == http.StatusCreated, - "FAIL: Put returned status %d", resp.StatusCode) - - // Verify key exists immediately - retrieved, err := e2e.GetFromOlric(env.GatewayURL, env.APIKey, dmap, key) - require.NoError(t, err, "FAIL: Could not get key immediately after put") - require.Equal(t, value, retrieved) - t.Logf(" Key exists immediately after put") - - // Wait for TTL to expire (plus buffer) - time.Sleep(time.Duration(ttlSeconds+2) * time.Second) - - // Key should be gone - _, err = e2e.GetFromOlric(env.GatewayURL, env.APIKey, dmap, key) - require.Error(t, err, "FAIL: Key should have expired after %d seconds", ttlSeconds) - require.Contains(t, err.Error(), "not found", "FAIL: Expected 'not found' error after TTL") - - t.Logf(" ✓ Key expired after %d seconds as expected", ttlSeconds) - }) -} diff --git a/e2e/cluster/rqlite_cluster_test.go b/e2e/cluster/rqlite_cluster_test.go deleted file mode 100644 index 8f8e43a..0000000 --- a/e2e/cluster/rqlite_cluster_test.go +++ /dev/null @@ -1,479 +0,0 @@ -//go:build e2e - -package cluster_test - -import ( - "context" - "fmt" - "net/http" - "sync" - "testing" - "time" - - "github.com/DeBrosOfficial/network/e2e" - "github.com/stretchr/testify/require" -) - -// ============================================================================= -// STRICT RQLITE CLUSTER TESTS -// These tests verify that RQLite cluster operations work 
correctly. -// Tests FAIL if operations don't work - no skips, no warnings. -// ============================================================================= - -// TestRQLite_ClusterHealth verifies the RQLite cluster is healthy and operational. -func TestRQLite_ClusterHealth(t *testing.T) { - e2e.SkipIfMissingGateway(t) - - ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) - defer cancel() - - // Check RQLite schema endpoint (proves cluster is reachable) - req := &e2e.HTTPRequest{ - Method: http.MethodGet, - URL: e2e.GetGatewayURL() + "/v1/rqlite/schema", - } - - body, status, err := req.Do(ctx) - require.NoError(t, err, "FAIL: Could not reach RQLite cluster") - require.Equal(t, http.StatusOK, status, "FAIL: RQLite schema endpoint returned %d: %s", status, string(body)) - - var schemaResp map[string]interface{} - err = e2e.DecodeJSON(body, &schemaResp) - require.NoError(t, err, "FAIL: Could not decode RQLite schema response") - - // Schema endpoint should return tables array - _, hasTables := schemaResp["tables"] - require.True(t, hasTables, "FAIL: RQLite schema response missing 'tables' field") - - t.Logf(" ✓ RQLite cluster is healthy and responding") -} - -// TestRQLite_WriteReadConsistency verifies data written can be read back consistently. 
-func TestRQLite_WriteReadConsistency(t *testing.T) { - e2e.SkipIfMissingGateway(t) - - ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second) - defer cancel() - - table := e2e.GenerateTableName() - - // Cleanup - defer func() { - dropReq := &e2e.HTTPRequest{ - Method: http.MethodPost, - URL: e2e.GetGatewayURL() + "/v1/rqlite/drop-table", - Body: map[string]interface{}{"table": table}, - } - dropReq.Do(context.Background()) - }() - - // Create table - createReq := &e2e.HTTPRequest{ - Method: http.MethodPost, - URL: e2e.GetGatewayURL() + "/v1/rqlite/create-table", - Body: map[string]interface{}{ - "schema": fmt.Sprintf( - "CREATE TABLE IF NOT EXISTS %s (id INTEGER PRIMARY KEY, value TEXT, created_at DATETIME DEFAULT CURRENT_TIMESTAMP)", - table, - ), - }, - } - - _, status, err := createReq.Do(ctx) - require.NoError(t, err, "FAIL: Create table request failed") - require.True(t, status == http.StatusCreated || status == http.StatusOK, - "FAIL: Create table returned status %d", status) - t.Logf("Created table %s", table) - - t.Run("Write_then_read_returns_same_data", func(t *testing.T) { - uniqueValue := fmt.Sprintf("test_value_%d", time.Now().UnixNano()) - - // Insert - insertReq := &e2e.HTTPRequest{ - Method: http.MethodPost, - URL: e2e.GetGatewayURL() + "/v1/rqlite/transaction", - Body: map[string]interface{}{ - "statements": []string{ - fmt.Sprintf("INSERT INTO %s (value) VALUES ('%s')", table, uniqueValue), - }, - }, - } - - _, status, err := insertReq.Do(ctx) - require.NoError(t, err, "FAIL: Insert request failed") - require.Equal(t, http.StatusOK, status, "FAIL: Insert returned status %d", status) - - // Read back - queryReq := &e2e.HTTPRequest{ - Method: http.MethodPost, - URL: e2e.GetGatewayURL() + "/v1/rqlite/query", - Body: map[string]interface{}{ - "sql": fmt.Sprintf("SELECT value FROM %s WHERE value = '%s'", table, uniqueValue), - }, - } - - body, status, err := queryReq.Do(ctx) - require.NoError(t, err, "FAIL: Query request failed") - 
require.Equal(t, http.StatusOK, status, "FAIL: Query returned status %d", status) - - var queryResp map[string]interface{} - err = e2e.DecodeJSON(body, &queryResp) - require.NoError(t, err, "FAIL: Could not decode query response") - - // Verify we got our value back - count, ok := queryResp["count"].(float64) - require.True(t, ok, "FAIL: Response missing 'count' field") - require.Equal(t, float64(1), count, "FAIL: Expected 1 row, got %v", count) - - t.Logf(" ✓ Written value '%s' was read back correctly", uniqueValue) - }) - - t.Run("Multiple_writes_all_readable", func(t *testing.T) { - // Insert multiple values - var statements []string - for i := 0; i < 10; i++ { - statements = append(statements, - fmt.Sprintf("INSERT INTO %s (value) VALUES ('batch_%d')", table, i)) - } - - insertReq := &e2e.HTTPRequest{ - Method: http.MethodPost, - URL: e2e.GetGatewayURL() + "/v1/rqlite/transaction", - Body: map[string]interface{}{ - "statements": statements, - }, - } - - _, status, err := insertReq.Do(ctx) - require.NoError(t, err, "FAIL: Batch insert failed") - require.Equal(t, http.StatusOK, status, "FAIL: Batch insert returned status %d", status) - - // Count all batch rows - queryReq := &e2e.HTTPRequest{ - Method: http.MethodPost, - URL: e2e.GetGatewayURL() + "/v1/rqlite/query", - Body: map[string]interface{}{ - "sql": fmt.Sprintf("SELECT COUNT(*) as cnt FROM %s WHERE value LIKE 'batch_%%'", table), - }, - } - - body, status, err := queryReq.Do(ctx) - require.NoError(t, err, "FAIL: Count query failed") - require.Equal(t, http.StatusOK, status, "FAIL: Count query returned status %d", status) - - var queryResp map[string]interface{} - e2e.DecodeJSON(body, &queryResp) - - if rows, ok := queryResp["rows"].([]interface{}); ok && len(rows) > 0 { - row := rows[0].([]interface{}) - count := int(row[0].(float64)) - require.Equal(t, 10, count, "FAIL: Expected 10 batch rows, got %d", count) - } - - t.Logf(" ✓ All 10 batch writes are readable") - }) -} - -// 
TestRQLite_TransactionAtomicity verifies transactions are atomic. -func TestRQLite_TransactionAtomicity(t *testing.T) { - e2e.SkipIfMissingGateway(t) - - ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second) - defer cancel() - - table := e2e.GenerateTableName() - - // Cleanup - defer func() { - dropReq := &e2e.HTTPRequest{ - Method: http.MethodPost, - URL: e2e.GetGatewayURL() + "/v1/rqlite/drop-table", - Body: map[string]interface{}{"table": table}, - } - dropReq.Do(context.Background()) - }() - - // Create table - createReq := &e2e.HTTPRequest{ - Method: http.MethodPost, - URL: e2e.GetGatewayURL() + "/v1/rqlite/create-table", - Body: map[string]interface{}{ - "schema": fmt.Sprintf( - "CREATE TABLE IF NOT EXISTS %s (id INTEGER PRIMARY KEY, value TEXT UNIQUE)", - table, - ), - }, - } - - _, status, err := createReq.Do(ctx) - require.NoError(t, err, "FAIL: Create table failed") - require.True(t, status == http.StatusCreated || status == http.StatusOK, - "FAIL: Create table returned status %d", status) - - t.Run("Successful_transaction_commits_all", func(t *testing.T) { - txReq := &e2e.HTTPRequest{ - Method: http.MethodPost, - URL: e2e.GetGatewayURL() + "/v1/rqlite/transaction", - Body: map[string]interface{}{ - "statements": []string{ - fmt.Sprintf("INSERT INTO %s (value) VALUES ('tx_val_1')", table), - fmt.Sprintf("INSERT INTO %s (value) VALUES ('tx_val_2')", table), - fmt.Sprintf("INSERT INTO %s (value) VALUES ('tx_val_3')", table), - }, - }, - } - - _, status, err := txReq.Do(ctx) - require.NoError(t, err, "FAIL: Transaction request failed") - require.Equal(t, http.StatusOK, status, "FAIL: Transaction returned status %d", status) - - // Verify all 3 rows exist - queryReq := &e2e.HTTPRequest{ - Method: http.MethodPost, - URL: e2e.GetGatewayURL() + "/v1/rqlite/query", - Body: map[string]interface{}{ - "sql": fmt.Sprintf("SELECT COUNT(*) FROM %s WHERE value LIKE 'tx_val_%%'", table), - }, - } - - body, _, _ := queryReq.Do(ctx) - var queryResp 
map[string]interface{} - e2e.DecodeJSON(body, &queryResp) - - if rows, ok := queryResp["rows"].([]interface{}); ok && len(rows) > 0 { - row := rows[0].([]interface{}) - count := int(row[0].(float64)) - require.Equal(t, 3, count, "FAIL: Transaction didn't commit all 3 rows - got %d", count) - } - - t.Logf(" ✓ Transaction committed all 3 rows atomically") - }) - - t.Run("Updates_preserve_consistency", func(t *testing.T) { - // Update a value - updateReq := &e2e.HTTPRequest{ - Method: http.MethodPost, - URL: e2e.GetGatewayURL() + "/v1/rqlite/transaction", - Body: map[string]interface{}{ - "statements": []string{ - fmt.Sprintf("UPDATE %s SET value = 'tx_val_1_updated' WHERE value = 'tx_val_1'", table), - }, - }, - } - - _, status, err := updateReq.Do(ctx) - require.NoError(t, err, "FAIL: Update request failed") - require.Equal(t, http.StatusOK, status, "FAIL: Update returned status %d", status) - - // Verify update took effect - queryReq := &e2e.HTTPRequest{ - Method: http.MethodPost, - URL: e2e.GetGatewayURL() + "/v1/rqlite/query", - Body: map[string]interface{}{ - "sql": fmt.Sprintf("SELECT value FROM %s WHERE value = 'tx_val_1_updated'", table), - }, - } - - body, _, _ := queryReq.Do(ctx) - var queryResp map[string]interface{} - e2e.DecodeJSON(body, &queryResp) - - count, _ := queryResp["count"].(float64) - require.Equal(t, float64(1), count, "FAIL: Update didn't take effect") - - t.Logf(" ✓ Update preserved consistency") - }) -} - -// TestRQLite_ConcurrentWrites verifies the cluster handles concurrent writes correctly. 
-func TestRQLite_ConcurrentWrites(t *testing.T) { - e2e.SkipIfMissingGateway(t) - - ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) - defer cancel() - - table := e2e.GenerateTableName() - - // Cleanup - defer func() { - dropReq := &e2e.HTTPRequest{ - Method: http.MethodPost, - URL: e2e.GetGatewayURL() + "/v1/rqlite/drop-table", - Body: map[string]interface{}{"table": table}, - } - dropReq.Do(context.Background()) - }() - - // Create table - createReq := &e2e.HTTPRequest{ - Method: http.MethodPost, - URL: e2e.GetGatewayURL() + "/v1/rqlite/create-table", - Body: map[string]interface{}{ - "schema": fmt.Sprintf( - "CREATE TABLE IF NOT EXISTS %s (id INTEGER PRIMARY KEY, worker INTEGER, seq INTEGER)", - table, - ), - }, - } - - _, status, err := createReq.Do(ctx) - require.NoError(t, err, "FAIL: Create table failed") - require.True(t, status == http.StatusCreated || status == http.StatusOK, - "FAIL: Create table returned status %d", status) - - t.Run("Concurrent_inserts_all_succeed", func(t *testing.T) { - numWorkers := 5 - insertsPerWorker := 10 - expectedTotal := numWorkers * insertsPerWorker - - var wg sync.WaitGroup - errChan := make(chan error, numWorkers*insertsPerWorker) - - for w := 0; w < numWorkers; w++ { - wg.Add(1) - go func(workerID int) { - defer wg.Done() - for i := 0; i < insertsPerWorker; i++ { - insertReq := &e2e.HTTPRequest{ - Method: http.MethodPost, - URL: e2e.GetGatewayURL() + "/v1/rqlite/transaction", - Body: map[string]interface{}{ - "statements": []string{ - fmt.Sprintf("INSERT INTO %s (worker, seq) VALUES (%d, %d)", table, workerID, i), - }, - }, - } - - _, status, err := insertReq.Do(ctx) - if err != nil { - errChan <- fmt.Errorf("worker %d insert %d failed: %w", workerID, i, err) - return - } - if status != http.StatusOK { - errChan <- fmt.Errorf("worker %d insert %d got status %d", workerID, i, status) - return - } - } - }(w) - } - - wg.Wait() - close(errChan) - - // Collect errors - var errors []error - for err := 
range errChan { - errors = append(errors, err) - } - require.Empty(t, errors, "FAIL: %d concurrent inserts failed: %v", len(errors), errors) - - // Verify total count - queryReq := &e2e.HTTPRequest{ - Method: http.MethodPost, - URL: e2e.GetGatewayURL() + "/v1/rqlite/query", - Body: map[string]interface{}{ - "sql": fmt.Sprintf("SELECT COUNT(*) FROM %s", table), - }, - } - - body, _, _ := queryReq.Do(ctx) - var queryResp map[string]interface{} - e2e.DecodeJSON(body, &queryResp) - - if rows, ok := queryResp["rows"].([]interface{}); ok && len(rows) > 0 { - row := rows[0].([]interface{}) - count := int(row[0].(float64)) - require.Equal(t, expectedTotal, count, - "FAIL: Expected %d total rows from concurrent inserts, got %d", expectedTotal, count) - } - - t.Logf(" ✓ All %d concurrent inserts succeeded", expectedTotal) - }) -} - -// TestRQLite_NamespaceClusterOperations verifies RQLite works in namespace clusters. -func TestRQLite_NamespaceClusterOperations(t *testing.T) { - // Create a new namespace - namespace := fmt.Sprintf("rqlite-test-%d", time.Now().UnixNano()) - - env, err := e2e.LoadTestEnvWithNamespace(namespace) - require.NoError(t, err, "FAIL: Could not create namespace for RQLite test") - require.NotEmpty(t, env.APIKey, "FAIL: No API key - namespace provisioning failed") - - t.Logf("Created namespace %s", namespace) - - ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second) - defer cancel() - - table := e2e.GenerateTableName() - - // Cleanup - defer func() { - dropReq := &e2e.HTTPRequest{ - Method: http.MethodPost, - URL: env.GatewayURL + "/v1/rqlite/drop-table", - Body: map[string]interface{}{"table": table}, - Headers: map[string]string{"Authorization": "Bearer " + env.APIKey}, - } - dropReq.Do(context.Background()) - }() - - t.Run("Namespace_RQLite_create_insert_query", func(t *testing.T) { - // Create table in namespace cluster - createReq := &e2e.HTTPRequest{ - Method: http.MethodPost, - URL: env.GatewayURL + "/v1/rqlite/create-table", - 
Headers: map[string]string{"Authorization": "Bearer " + env.APIKey}, - Body: map[string]interface{}{ - "schema": fmt.Sprintf( - "CREATE TABLE IF NOT EXISTS %s (id INTEGER PRIMARY KEY, value TEXT)", - table, - ), - }, - } - - _, status, err := createReq.Do(ctx) - require.NoError(t, err, "FAIL: Create table in namespace failed") - require.True(t, status == http.StatusCreated || status == http.StatusOK, - "FAIL: Create table returned status %d", status) - - // Insert data - uniqueValue := fmt.Sprintf("ns_value_%d", time.Now().UnixNano()) - insertReq := &e2e.HTTPRequest{ - Method: http.MethodPost, - URL: env.GatewayURL + "/v1/rqlite/transaction", - Headers: map[string]string{"Authorization": "Bearer " + env.APIKey}, - Body: map[string]interface{}{ - "statements": []string{ - fmt.Sprintf("INSERT INTO %s (value) VALUES ('%s')", table, uniqueValue), - }, - }, - } - - _, status, err = insertReq.Do(ctx) - require.NoError(t, err, "FAIL: Insert in namespace failed") - require.Equal(t, http.StatusOK, status, "FAIL: Insert returned status %d", status) - - // Query data - queryReq := &e2e.HTTPRequest{ - Method: http.MethodPost, - URL: env.GatewayURL + "/v1/rqlite/query", - Headers: map[string]string{"Authorization": "Bearer " + env.APIKey}, - Body: map[string]interface{}{ - "sql": fmt.Sprintf("SELECT value FROM %s WHERE value = '%s'", table, uniqueValue), - }, - } - - body, status, err := queryReq.Do(ctx) - require.NoError(t, err, "FAIL: Query in namespace failed") - require.Equal(t, http.StatusOK, status, "FAIL: Query returned status %d", status) - - var queryResp map[string]interface{} - e2e.DecodeJSON(body, &queryResp) - - count, _ := queryResp["count"].(float64) - require.Equal(t, float64(1), count, "FAIL: Data not found in namespace cluster") - - t.Logf(" ✓ Namespace RQLite operations work correctly") - }) -} diff --git a/e2e/env.go b/e2e/env.go index 6a96e16..a5be043 100644 --- a/e2e/env.go +++ b/e2e/env.go @@ -478,11 +478,6 @@ func GetAPIKey() string { return apiKey } -// 
GetJWT returns the gateway JWT token (currently not auto-discovered) -func GetJWT() string { - return "" -} - // GetBootstrapPeers returns bootstrap peer addresses from config func GetBootstrapPeers() []string { cacheMutex.RLock() @@ -748,10 +743,6 @@ func NewNetworkClient(t *testing.T) client.NetworkClient { cfg.APIKey = GetAPIKey() cfg.QuietMode = true // Suppress debug logs in tests - if jwt := GetJWT(); jwt != "" { - cfg.JWT = jwt - } - if peers := GetBootstrapPeers(); len(peers) > 0 { cfg.BootstrapPeers = peers } diff --git a/e2e/production/dns_replica_test.go b/e2e/production/dns_replica_test.go deleted file mode 100644 index 67e4adb..0000000 --- a/e2e/production/dns_replica_test.go +++ /dev/null @@ -1,333 +0,0 @@ -//go:build e2e && production - -package production - -import ( - "bytes" - "context" - "encoding/json" - "fmt" - "io" - "net" - "net/http" - "os" - "os/exec" - "path/filepath" - "testing" - "time" - - "github.com/DeBrosOfficial/network/e2e" - "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/require" -) - -// TestDNS_MultipleARecords verifies that deploying with replicas creates -// multiple A records (one per node) for DNS round-robin. 
-func TestDNS_MultipleARecords(t *testing.T) { - e2e.SkipIfLocal(t) - - env, err := e2e.LoadTestEnv() - require.NoError(t, err) - - if len(env.Config.Servers) < 2 { - t.Skip("Requires at least 2 servers") - } - - deploymentName := fmt.Sprintf("dns-multi-%d", time.Now().Unix()) - tarballPath := filepath.Join("../../testdata/apps/react-app") - - deploymentID := e2e.CreateTestDeployment(t, env, deploymentName, tarballPath) - require.NotEmpty(t, deploymentID) - - defer func() { - if !env.SkipCleanup { - e2e.DeleteDeployment(t, env, deploymentID) - } - }() - - // Wait for replica setup and DNS propagation - time.Sleep(15 * time.Second) - - t.Run("DNS returns multiple IPs", func(t *testing.T) { - deployment := e2e.GetDeployment(t, env, deploymentID) - subdomain, _ := deployment["subdomain"].(string) - if subdomain == "" { - subdomain = deploymentName - } - fqdn := fmt.Sprintf("%s.%s", subdomain, env.BaseDomain) - - // Query nameserver directly - nameserverIP := env.Config.Servers[0].IP - resolver := &net.Resolver{ - PreferGo: true, - Dial: func(ctx context.Context, network, address string) (net.Conn, error) { - d := net.Dialer{Timeout: 10 * time.Second} - return d.Dial("udp", nameserverIP+":53") - }, - } - - ctx := context.Background() - ips, err := resolver.LookupHost(ctx, fqdn) - if err != nil { - t.Logf("DNS lookup failed for %s: %v", fqdn, err) - t.Log("Trying net.LookupHost instead...") - ips, err = net.LookupHost(fqdn) - } - - if err != nil { - t.Logf("DNS lookup failed: %v (DNS may not be propagated yet)", err) - t.Skip("DNS not yet propagated") - } - - t.Logf("DNS returned %d IPs for %s: %v", len(ips), fqdn, ips) - assert.GreaterOrEqual(t, len(ips), 2, - "Should have at least 2 A records (home + replica)") - - // Verify returned IPs are from our server list - serverIPs := e2e.GetServerIPs(env.Config) - for _, ip := range ips { - assert.Contains(t, serverIPs, ip, - "DNS IP %s should be one of our servers", ip) - } - }) -} - -// TestDNS_CleanupOnDelete verifies 
that deleting a deployment removes all -// DNS records (both home and replica A records). -func TestDNS_CleanupOnDelete(t *testing.T) { - e2e.SkipIfLocal(t) - - env, err := e2e.LoadTestEnv() - require.NoError(t, err) - - deploymentName := fmt.Sprintf("dns-cleanup-%d", time.Now().Unix()) - tarballPath := filepath.Join("../../testdata/apps/react-app") - - deploymentID := e2e.CreateTestDeployment(t, env, deploymentName, tarballPath) - require.NotEmpty(t, deploymentID) - - // Wait for DNS - time.Sleep(10 * time.Second) - - // Get subdomain before deletion - deployment := e2e.GetDeployment(t, env, deploymentID) - subdomain, _ := deployment["subdomain"].(string) - if subdomain == "" { - subdomain = deploymentName - } - fqdn := fmt.Sprintf("%s.%s", subdomain, env.BaseDomain) - - // Verify DNS works before deletion - t.Run("DNS resolves before deletion", func(t *testing.T) { - nodeURL := extractNodeURLProd(t, deployment) - if nodeURL == "" { - t.Skip("No URL to test") - } - domain := extractDomainProd(nodeURL) - - req, _ := http.NewRequest("GET", env.GatewayURL+"/", nil) - req.Host = domain - - resp, err := env.HTTPClient.Do(req) - if err == nil { - resp.Body.Close() - t.Logf("Pre-delete: status=%d", resp.StatusCode) - } - }) - - // Delete - e2e.DeleteDeployment(t, env, deploymentID) - time.Sleep(10 * time.Second) - - t.Run("DNS records removed after deletion", func(t *testing.T) { - ips, err := net.LookupHost(fqdn) - if err != nil { - t.Logf("DNS lookup failed (expected): %v", err) - return // Good — no records - } - - // If we still get IPs, they might be cached. Log and warn. - if len(ips) > 0 { - t.Logf("WARNING: DNS still returns %d IPs after deletion (may be cached): %v", len(ips), ips) - } - }) -} - -// TestDNS_CustomSubdomain verifies that deploying with a custom subdomain -// creates DNS records using the custom name. 
-func TestDNS_CustomSubdomain(t *testing.T) { - e2e.SkipIfLocal(t) - - env, err := e2e.LoadTestEnv() - require.NoError(t, err) - - deploymentName := fmt.Sprintf("dns-custom-%d", time.Now().Unix()) - tarballPath := filepath.Join("../../testdata/apps/react-app") - - deploymentID := createDeploymentWithSubdomain(t, env, deploymentName, tarballPath) - require.NotEmpty(t, deploymentID) - - defer func() { - if !env.SkipCleanup { - e2e.DeleteDeployment(t, env, deploymentID) - } - }() - - time.Sleep(10 * time.Second) - - t.Run("Deployment has subdomain with random suffix", func(t *testing.T) { - deployment := e2e.GetDeployment(t, env, deploymentID) - subdomain, _ := deployment["subdomain"].(string) - require.NotEmpty(t, subdomain, "Deployment should have a subdomain") - t.Logf("Subdomain: %s", subdomain) - - // Verify the subdomain starts with the deployment name - assert.Contains(t, subdomain, deploymentName[:10], - "Subdomain should relate to deployment name") - }) -} - -// TestDNS_RedeployPreservesSubdomain verifies that updating a deployment -// does not change the subdomain/DNS. 
-func TestDNS_RedeployPreservesSubdomain(t *testing.T) { - e2e.SkipIfLocal(t) - - env, err := e2e.LoadTestEnv() - require.NoError(t, err) - - deploymentName := fmt.Sprintf("dns-preserve-%d", time.Now().Unix()) - tarballPath := filepath.Join("../../testdata/apps/react-app") - - deploymentID := e2e.CreateTestDeployment(t, env, deploymentName, tarballPath) - require.NotEmpty(t, deploymentID) - - defer func() { - if !env.SkipCleanup { - e2e.DeleteDeployment(t, env, deploymentID) - } - }() - - time.Sleep(5 * time.Second) - - // Get original subdomain - deployment := e2e.GetDeployment(t, env, deploymentID) - originalSubdomain, _ := deployment["subdomain"].(string) - originalURLs := deployment["urls"] - t.Logf("Original subdomain: %s, urls: %v", originalSubdomain, originalURLs) - - // Update - updateStaticDeploymentProd(t, env, deploymentName, tarballPath) - time.Sleep(5 * time.Second) - - // Verify subdomain unchanged - t.Run("Subdomain unchanged after update", func(t *testing.T) { - updated := e2e.GetDeployment(t, env, deploymentID) - updatedSubdomain, _ := updated["subdomain"].(string) - - assert.Equal(t, originalSubdomain, updatedSubdomain, - "Subdomain should not change after update") - t.Logf("After update: subdomain=%s", updatedSubdomain) - }) -} - -func createDeploymentWithSubdomain(t *testing.T, env *e2e.E2ETestEnv, name, tarballPath string) string { - t.Helper() - - var fileData []byte - info, err := os.Stat(tarballPath) - require.NoError(t, err) - if info.IsDir() { - fileData, err = exec.Command("tar", "-czf", "-", "-C", tarballPath, ".").Output() - require.NoError(t, err) - } else { - file, err := os.Open(tarballPath) - require.NoError(t, err) - defer file.Close() - fileData, _ = io.ReadAll(file) - } - - body := &bytes.Buffer{} - boundary := "----WebKitFormBoundary7MA4YWxkTrZu0gW" - - body.WriteString("--" + boundary + "\r\n") - body.WriteString("Content-Disposition: form-data; name=\"name\"\r\n\r\n") - body.WriteString(name + "\r\n") - - body.WriteString("--" 
+ boundary + "\r\n") - body.WriteString("Content-Disposition: form-data; name=\"tarball\"; filename=\"app.tar.gz\"\r\n") - body.WriteString("Content-Type: application/gzip\r\n\r\n") - - body.Write(fileData) - body.WriteString("\r\n--" + boundary + "--\r\n") - - req, err := http.NewRequest("POST", env.GatewayURL+"/v1/deployments/static/upload", body) - require.NoError(t, err) - req.Header.Set("Content-Type", "multipart/form-data; boundary="+boundary) - req.Header.Set("Authorization", "Bearer "+env.APIKey) - - resp, err := env.HTTPClient.Do(req) - require.NoError(t, err) - defer resp.Body.Close() - - if resp.StatusCode != http.StatusCreated { - bodyBytes, _ := io.ReadAll(resp.Body) - t.Fatalf("Upload failed: status=%d body=%s", resp.StatusCode, string(bodyBytes)) - } - - var result map[string]interface{} - json.NewDecoder(resp.Body).Decode(&result) - - if id, ok := result["deployment_id"].(string); ok { - return id - } - if id, ok := result["id"].(string); ok { - return id - } - t.Fatalf("No id in response: %+v", result) - return "" -} - -func updateStaticDeploymentProd(t *testing.T, env *e2e.E2ETestEnv, name, tarballPath string) { - t.Helper() - - var fileData []byte - info, err := os.Stat(tarballPath) - require.NoError(t, err) - if info.IsDir() { - fileData, err = exec.Command("tar", "-czf", "-", "-C", tarballPath, ".").Output() - require.NoError(t, err) - } else { - file, err := os.Open(tarballPath) - require.NoError(t, err) - defer file.Close() - fileData, _ = io.ReadAll(file) - } - - body := &bytes.Buffer{} - boundary := "----WebKitFormBoundary7MA4YWxkTrZu0gW" - - body.WriteString("--" + boundary + "\r\n") - body.WriteString("Content-Disposition: form-data; name=\"name\"\r\n\r\n") - body.WriteString(name + "\r\n") - - body.WriteString("--" + boundary + "\r\n") - body.WriteString("Content-Disposition: form-data; name=\"tarball\"; filename=\"app.tar.gz\"\r\n") - body.WriteString("Content-Type: application/gzip\r\n\r\n") - - body.Write(fileData) - 
body.WriteString("\r\n--" + boundary + "--\r\n") - - req, err := http.NewRequest("POST", env.GatewayURL+"/v1/deployments/static/update", body) - require.NoError(t, err) - req.Header.Set("Content-Type", "multipart/form-data; boundary="+boundary) - req.Header.Set("Authorization", "Bearer "+env.APIKey) - - resp, err := env.HTTPClient.Do(req) - require.NoError(t, err) - defer resp.Body.Close() - - if resp.StatusCode != http.StatusOK { - bodyBytes, _ := io.ReadAll(resp.Body) - t.Fatalf("Update failed: status=%d body=%s", resp.StatusCode, string(bodyBytes)) - } -} diff --git a/e2e/production/dns_resolution_test.go b/e2e/production/dns_resolution_test.go deleted file mode 100644 index 100924b..0000000 --- a/e2e/production/dns_resolution_test.go +++ /dev/null @@ -1,121 +0,0 @@ -//go:build e2e && production - -package production - -import ( - "context" - "fmt" - "net" - "path/filepath" - "testing" - "time" - - "github.com/DeBrosOfficial/network/e2e" - "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/require" -) - -// TestDNS_DeploymentResolution tests that deployed applications are resolvable via DNS -// This test requires production mode as it performs real DNS lookups -func TestDNS_DeploymentResolution(t *testing.T) { - e2e.SkipIfLocal(t) - - env, err := e2e.LoadTestEnv() - require.NoError(t, err, "Failed to load test environment") - - deploymentName := fmt.Sprintf("dns-test-%d", time.Now().Unix()) - tarballPath := filepath.Join("../../testdata/apps/react-app") - - deploymentID := e2e.CreateTestDeployment(t, env, deploymentName, tarballPath) - defer func() { - if !env.SkipCleanup { - e2e.DeleteDeployment(t, env, deploymentID) - } - }() - - // Wait for DNS propagation - domain := env.BuildDeploymentDomain(deploymentName) - t.Logf("Testing DNS resolution for: %s", domain) - - t.Run("DNS resolves to valid server IP", func(t *testing.T) { - // Allow some time for DNS propagation - ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) - 
defer cancel() - - var ips []string - var err error - - // Poll for DNS resolution - for { - select { - case <-ctx.Done(): - t.Fatalf("DNS resolution timeout for %s", domain) - default: - ips, err = net.LookupHost(domain) - if err == nil && len(ips) > 0 { - goto resolved - } - time.Sleep(2 * time.Second) - } - } - - resolved: - t.Logf("DNS resolved: %s -> %v", domain, ips) - assert.NotEmpty(t, ips, "Should have IP addresses") - - // Verify resolved IP is one of our servers - validIPs := e2e.GetServerIPs(env.Config) - if len(validIPs) > 0 { - found := false - for _, ip := range ips { - for _, validIP := range validIPs { - if ip == validIP { - found = true - break - } - } - } - assert.True(t, found, "Resolved IP should be one of our servers: %v (valid: %v)", ips, validIPs) - } - }) -} - -// TestDNS_BaseDomainResolution tests that the base domain resolves correctly -func TestDNS_BaseDomainResolution(t *testing.T) { - e2e.SkipIfLocal(t) - - env, err := e2e.LoadTestEnv() - require.NoError(t, err, "Failed to load test environment") - - t.Run("Base domain resolves", func(t *testing.T) { - ips, err := net.LookupHost(env.BaseDomain) - require.NoError(t, err, "Base domain %s should resolve", env.BaseDomain) - assert.NotEmpty(t, ips, "Should have IP addresses") - - t.Logf("✓ Base domain %s resolves to: %v", env.BaseDomain, ips) - }) -} - -// TestDNS_WildcardResolution tests wildcard DNS for arbitrary subdomains -func TestDNS_WildcardResolution(t *testing.T) { - e2e.SkipIfLocal(t) - - env, err := e2e.LoadTestEnv() - require.NoError(t, err, "Failed to load test environment") - - t.Run("Wildcard subdomain resolves", func(t *testing.T) { - // Test with a random subdomain that doesn't exist as a deployment - randomSubdomain := fmt.Sprintf("random-test-%d.%s", time.Now().UnixNano(), env.BaseDomain) - - ips, err := net.LookupHost(randomSubdomain) - if err != nil { - // DNS may not support wildcard - that's OK for some setups - t.Logf("⚠ Wildcard DNS not configured (this may be 
expected): %v", err) - t.Skip("Wildcard DNS not configured") - return - } - - assert.NotEmpty(t, ips, "Wildcard subdomain should resolve") - t.Logf("✓ Wildcard subdomain resolves: %s -> %v", randomSubdomain, ips) - }) -} diff --git a/e2e/production/nameserver_test.go b/e2e/production/nameserver_test.go deleted file mode 100644 index 9705918..0000000 --- a/e2e/production/nameserver_test.go +++ /dev/null @@ -1,181 +0,0 @@ -//go:build e2e && production - -package production - -import ( - "context" - "net" - "strings" - "testing" - "time" - - "github.com/DeBrosOfficial/network/e2e" - "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/require" -) - -// TestNameserver_NSRecords tests that NS records are properly configured for the domain -func TestNameserver_NSRecords(t *testing.T) { - e2e.SkipIfLocal(t) - - env, err := e2e.LoadTestEnv() - require.NoError(t, err, "Failed to load test environment") - - if len(env.Config.Nameservers) == 0 { - t.Skip("No nameservers configured in e2e/config.yaml") - } - - t.Run("NS records exist for base domain", func(t *testing.T) { - nsRecords, err := net.LookupNS(env.BaseDomain) - require.NoError(t, err, "Should be able to look up NS records for %s", env.BaseDomain) - require.NotEmpty(t, nsRecords, "Should have NS records") - - t.Logf("Found %d NS records for %s:", len(nsRecords), env.BaseDomain) - for _, ns := range nsRecords { - t.Logf(" - %s", ns.Host) - } - - // Verify our nameservers are listed - for _, expected := range env.Config.Nameservers { - found := false - for _, ns := range nsRecords { - // Trim trailing dot for comparison - nsHost := strings.TrimSuffix(ns.Host, ".") - if nsHost == expected || nsHost == expected+"." 
{ - found = true - break - } - } - assert.True(t, found, "NS records should include %s", expected) - } - }) -} - -// TestNameserver_GlueRecords tests that glue records point to correct IPs -func TestNameserver_GlueRecords(t *testing.T) { - e2e.SkipIfLocal(t) - - env, err := e2e.LoadTestEnv() - require.NoError(t, err, "Failed to load test environment") - - if len(env.Config.Nameservers) == 0 { - t.Skip("No nameservers configured in e2e/config.yaml") - } - - nameserverServers := e2e.GetNameserverServers(env.Config) - if len(nameserverServers) == 0 { - t.Skip("No servers marked as nameservers in config") - } - - t.Run("Glue records resolve to correct IPs", func(t *testing.T) { - for i, ns := range env.Config.Nameservers { - ips, err := net.LookupHost(ns) - require.NoError(t, err, "Nameserver %s should resolve", ns) - require.NotEmpty(t, ips, "Nameserver %s should have IP addresses", ns) - - t.Logf("Nameserver %s resolves to: %v", ns, ips) - - // If we have the expected IP, verify it matches - if i < len(nameserverServers) { - expectedIP := nameserverServers[i].IP - found := false - for _, ip := range ips { - if ip == expectedIP { - found = true - break - } - } - assert.True(t, found, "Glue record for %s should point to %s (got %v)", ns, expectedIP, ips) - } - } - }) -} - -// TestNameserver_CoreDNSResponds tests that our CoreDNS servers respond to queries -func TestNameserver_CoreDNSResponds(t *testing.T) { - e2e.SkipIfLocal(t) - - env, err := e2e.LoadTestEnv() - require.NoError(t, err, "Failed to load test environment") - - nameserverServers := e2e.GetNameserverServers(env.Config) - if len(nameserverServers) == 0 { - t.Skip("No servers marked as nameservers in config") - } - - t.Run("CoreDNS servers respond to queries", func(t *testing.T) { - for _, server := range nameserverServers { - t.Run(server.Name, func(t *testing.T) { - // Create a custom resolver that queries this specific server - resolver := &net.Resolver{ - PreferGo: true, - Dial: func(ctx context.Context, 
network, address string) (net.Conn, error) { - d := net.Dialer{ - Timeout: 5 * time.Second, - } - return d.DialContext(ctx, "udp", server.IP+":53") - }, - } - - ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) - defer cancel() - - // Query the base domain - ips, err := resolver.LookupHost(ctx, env.BaseDomain) - if err != nil { - // Log the error but don't fail - server might be configured differently - t.Logf("⚠ CoreDNS at %s (%s) query error: %v", server.Name, server.IP, err) - return - } - - t.Logf("✓ CoreDNS at %s (%s) responded: %s -> %v", server.Name, server.IP, env.BaseDomain, ips) - assert.NotEmpty(t, ips, "CoreDNS should return IP addresses") - }) - } - }) -} - -// TestNameserver_QueryLatency tests DNS query latency from our nameservers -func TestNameserver_QueryLatency(t *testing.T) { - e2e.SkipIfLocal(t) - - env, err := e2e.LoadTestEnv() - require.NoError(t, err, "Failed to load test environment") - - nameserverServers := e2e.GetNameserverServers(env.Config) - if len(nameserverServers) == 0 { - t.Skip("No servers marked as nameservers in config") - } - - t.Run("DNS query latency is acceptable", func(t *testing.T) { - for _, server := range nameserverServers { - resolver := &net.Resolver{ - PreferGo: true, - Dial: func(ctx context.Context, network, address string) (net.Conn, error) { - d := net.Dialer{ - Timeout: 5 * time.Second, - } - return d.DialContext(ctx, "udp", server.IP+":53") - }, - } - - ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) - defer cancel() - - start := time.Now() - _, err := resolver.LookupHost(ctx, env.BaseDomain) - latency := time.Since(start) - - if err != nil { - t.Logf("⚠ Query to %s failed: %v", server.Name, err) - continue - } - - t.Logf("DNS latency from %s (%s): %v", server.Name, server.IP, latency) - - // DNS queries should be fast (under 500ms is reasonable) - assert.Less(t, latency, 500*time.Millisecond, - "DNS query to %s should complete in under 500ms", server.Name) - } - 
}) -} diff --git a/inspector b/inspector new file mode 100755 index 0000000..489d340 Binary files /dev/null and b/inspector differ diff --git a/pkg/cli/inspect_command.go b/pkg/cli/inspect_command.go new file mode 100644 index 0000000..b903d19 --- /dev/null +++ b/pkg/cli/inspect_command.go @@ -0,0 +1,158 @@ +package cli + +import ( + "bufio" + "context" + "flag" + "fmt" + "os" + "strings" + "time" + + "github.com/DeBrosOfficial/network/pkg/inspector" + // Import checks package so init() registers the checkers + _ "github.com/DeBrosOfficial/network/pkg/inspector/checks" +) + +// loadDotEnv loads key=value pairs from a .env file into os environment. +// Only sets vars that are not already set (env takes precedence over file). +func loadDotEnv(path string) { + f, err := os.Open(path) + if err != nil { + return // .env is optional + } + defer f.Close() + + scanner := bufio.NewScanner(f) + for scanner.Scan() { + line := strings.TrimSpace(scanner.Text()) + if line == "" || strings.HasPrefix(line, "#") { + continue + } + eq := strings.IndexByte(line, '=') + if eq < 1 { + continue + } + key := line[:eq] + value := line[eq+1:] + // Only set if not already in environment + if os.Getenv(key) == "" { + os.Setenv(key, value) + } + } +} + +// HandleInspectCommand handles the "orama inspect" command. 
+func HandleInspectCommand(args []string) { + // Load .env file from current directory (only sets unset vars) + loadDotEnv(".env") + + fs := flag.NewFlagSet("inspect", flag.ExitOnError) + + configPath := fs.String("config", "scripts/remote-nodes.conf", "Path to remote-nodes.conf") + env := fs.String("env", "", "Environment to inspect (devnet, testnet)") + subsystem := fs.String("subsystem", "all", "Subsystem to inspect (rqlite,olric,ipfs,dns,wg,system,network,all)") + format := fs.String("format", "table", "Output format (table, json)") + timeout := fs.Duration("timeout", 30*time.Second, "SSH command timeout") + verbose := fs.Bool("verbose", false, "Verbose output") + // AI flags + aiEnabled := fs.Bool("ai", false, "Enable AI analysis of failures") + aiModel := fs.String("model", "moonshotai/kimi-k2.5", "OpenRouter model for AI analysis") + aiAPIKey := fs.String("api-key", "", "OpenRouter API key (or OPENROUTER_API_KEY env)") + + fs.Usage = func() { + fmt.Fprintf(os.Stderr, "Usage: orama inspect [flags]\n\n") + fmt.Fprintf(os.Stderr, "Inspect cluster health by SSHing into nodes and running checks.\n\n") + fmt.Fprintf(os.Stderr, "Flags:\n") + fs.PrintDefaults() + fmt.Fprintf(os.Stderr, "\nExamples:\n") + fmt.Fprintf(os.Stderr, " orama inspect --env devnet\n") + fmt.Fprintf(os.Stderr, " orama inspect --env devnet --subsystem rqlite\n") + fmt.Fprintf(os.Stderr, " orama inspect --env devnet --ai\n") + fmt.Fprintf(os.Stderr, " orama inspect --env devnet --ai --model openai/gpt-4o\n") + } + + if err := fs.Parse(args); err != nil { + os.Exit(1) + } + + if *env == "" { + fmt.Fprintf(os.Stderr, "Error: --env is required (devnet, testnet)\n") + os.Exit(1) + } + + // Load nodes + nodes, err := inspector.LoadNodes(*configPath) + if err != nil { + fmt.Fprintf(os.Stderr, "Error loading config: %v\n", err) + os.Exit(1) + } + + // Filter by environment + nodes = inspector.FilterByEnv(nodes, *env) + if len(nodes) == 0 { + fmt.Fprintf(os.Stderr, "Error: no nodes found for 
environment %q\n", *env) + os.Exit(1) + } + + // Parse subsystems + var subsystems []string + if *subsystem != "all" { + subsystems = strings.Split(*subsystem, ",") + } + + fmt.Printf("Inspecting %d %s nodes", len(nodes), *env) + if len(subsystems) > 0 { + fmt.Printf(" [%s]", strings.Join(subsystems, ",")) + } + if *aiEnabled { + fmt.Printf(" (AI: %s)", *aiModel) + } + fmt.Printf("...\n\n") + + // Phase 1: Collect + ctx, cancel := context.WithTimeout(context.Background(), *timeout+10*time.Second) + defer cancel() + + if *verbose { + fmt.Printf("Collecting data from %d nodes (timeout: %s)...\n", len(nodes), timeout) + } + + data := inspector.Collect(ctx, nodes, subsystems, *verbose) + + if *verbose { + fmt.Printf("Collection complete in %.1fs\n\n", data.Duration.Seconds()) + } + + // Phase 2: Check + results := inspector.RunChecks(data, subsystems) + + // Phase 3: Report + switch *format { + case "json": + inspector.PrintJSON(results, os.Stdout) + default: + inspector.PrintTable(results, os.Stdout) + } + + // Phase 4: AI Analysis (if enabled and there are failures or warnings) + if *aiEnabled { + issues := results.FailuresAndWarnings() + if len(issues) == 0 { + fmt.Printf("\nAll checks passed — no AI analysis needed.\n") + } else { + fmt.Printf("\nAnalyzing %d issues with %s...\n", len(issues), *aiModel) + analysis, err := inspector.Analyze(results, data, *aiModel, *aiAPIKey) + if err != nil { + fmt.Fprintf(os.Stderr, "\nAI analysis failed: %v\n", err) + } else { + inspector.PrintAnalysis(analysis, os.Stdout) + } + } + } + + // Exit with non-zero if any failures + if failures := results.Failures(); len(failures) > 0 { + os.Exit(1) + } +} diff --git a/pkg/cli/production/lifecycle/stop.go b/pkg/cli/production/lifecycle/stop.go index e179745..4a96835 100644 --- a/pkg/cli/production/lifecycle/stop.go +++ b/pkg/cli/production/lifecycle/stop.go @@ -53,13 +53,17 @@ func HandleStop() { // Reset failed state for any services that might be in failed state resetArgs := 
[]string{"reset-failed"} resetArgs = append(resetArgs, services...) - exec.Command("systemctl", resetArgs...).Run() + if err := exec.Command("systemctl", resetArgs...).Run(); err != nil { + fmt.Printf(" ⚠️ Warning: Failed to reset-failed state: %v\n", err) + } // Wait again after reset-failed time.Sleep(1 * time.Second) // Stop again to ensure they're stopped - exec.Command("systemctl", stopArgs...).Run() + if err := exec.Command("systemctl", stopArgs...).Run(); err != nil { + fmt.Printf(" ⚠️ Warning: Second stop attempt had errors: %v\n", err) + } time.Sleep(1 * time.Second) hadError := false diff --git a/pkg/cli/production/upgrade/flags.go b/pkg/cli/production/upgrade/flags.go index cff6d1d..193aa63 100644 --- a/pkg/cli/production/upgrade/flags.go +++ b/pkg/cli/production/upgrade/flags.go @@ -60,10 +60,6 @@ func ParseFlags(args []string) (*Flags, error) { fs.IntVar(&flags.AnyoneBandwidth, "anyone-bandwidth", 30, "Limit relay to N% of VPS bandwidth (0=unlimited, runs speedtest)") fs.IntVar(&flags.AnyoneAccounting, "anyone-accounting", 0, "Monthly data cap for relay in GB (0=unlimited)") - // Support legacy flags for backwards compatibility - nightly := fs.Bool("nightly", false, "Use nightly branch (deprecated, use --branch nightly)") - main := fs.Bool("main", false, "Use main branch (deprecated, use --branch main)") - if err := fs.Parse(args); err != nil { if err == flag.ErrHelp { return nil, err @@ -71,14 +67,6 @@ func ParseFlags(args []string) (*Flags, error) { return nil, fmt.Errorf("failed to parse flags: %w", err) } - // Handle legacy flags - if *nightly { - flags.Branch = "nightly" - } - if *main { - flags.Branch = "main" - } - // Set nameserver if explicitly provided if *nameserver { flags.Nameserver = nameserver diff --git a/pkg/cli/utils/systemd.go b/pkg/cli/utils/systemd.go index 068825f..ef5f38d 100644 --- a/pkg/cli/utils/systemd.go +++ b/pkg/cli/utils/systemd.go @@ -10,6 +10,8 @@ import ( "strings" "syscall" "time" + + 
"github.com/DeBrosOfficial/network/pkg/constants" ) var ErrServiceNotFound = errors.New("service not found") @@ -22,15 +24,15 @@ type PortSpec struct { var ServicePorts = map[string][]PortSpec{ "debros-gateway": { - {Name: "Gateway API", Port: 6001}, + {Name: "Gateway API", Port: constants.GatewayAPIPort}, }, "debros-olric": { - {Name: "Olric HTTP", Port: 3320}, - {Name: "Olric Memberlist", Port: 3322}, + {Name: "Olric HTTP", Port: constants.OlricHTTPPort}, + {Name: "Olric Memberlist", Port: constants.OlricMemberlistPort}, }, "debros-node": { - {Name: "RQLite HTTP", Port: 5001}, - {Name: "RQLite Raft", Port: 7001}, + {Name: "RQLite HTTP", Port: constants.RQLiteHTTPPort}, + {Name: "RQLite Raft", Port: constants.RQLiteRaftPort}, }, "debros-ipfs": { {Name: "IPFS API", Port: 4501}, @@ -48,12 +50,12 @@ func DefaultPorts() []PortSpec { {Name: "IPFS Swarm", Port: 4001}, {Name: "IPFS API", Port: 4501}, {Name: "IPFS Gateway", Port: 8080}, - {Name: "Gateway API", Port: 6001}, - {Name: "RQLite HTTP", Port: 5001}, - {Name: "RQLite Raft", Port: 7001}, + {Name: "Gateway API", Port: constants.GatewayAPIPort}, + {Name: "RQLite HTTP", Port: constants.RQLiteHTTPPort}, + {Name: "RQLite Raft", Port: constants.RQLiteRaftPort}, {Name: "IPFS Cluster API", Port: 9094}, - {Name: "Olric HTTP", Port: 3320}, - {Name: "Olric Memberlist", Port: 3322}, + {Name: "Olric HTTP", Port: constants.OlricHTTPPort}, + {Name: "Olric Memberlist", Port: constants.OlricMemberlistPort}, } } diff --git a/pkg/constants/capacity.go b/pkg/constants/capacity.go new file mode 100644 index 0000000..39c1eed --- /dev/null +++ b/pkg/constants/capacity.go @@ -0,0 +1,9 @@ +package constants + +// Node capacity limits used by both deployment and namespace scheduling. 
+const ( + MaxDeploymentsPerNode = 100 + MaxMemoryMB = 8192 // 8GB + MaxCPUPercent = 400 // 400% = 4 cores + MaxPortsPerNode = 9900 // ~10k ports available +) diff --git a/pkg/constants/ports.go b/pkg/constants/ports.go new file mode 100644 index 0000000..3d36c69 --- /dev/null +++ b/pkg/constants/ports.go @@ -0,0 +1,11 @@ +package constants + +// Service ports used across the network. +const ( + WireGuardPort = 51820 + RQLiteHTTPPort = 5001 + RQLiteRaftPort = 7001 + OlricHTTPPort = 3320 + OlricMemberlistPort = 3322 + GatewayAPIPort = 6001 +) diff --git a/pkg/deployments/home_node.go b/pkg/deployments/home_node.go index d3d29a4..53becb2 100644 --- a/pkg/deployments/home_node.go +++ b/pkg/deployments/home_node.go @@ -6,6 +6,7 @@ import ( "time" "github.com/DeBrosOfficial/network/pkg/client" + "github.com/DeBrosOfficial/network/pkg/constants" "github.com/DeBrosOfficial/network/pkg/rqlite" "go.uber.org/zap" ) @@ -270,7 +271,7 @@ func (hnm *HomeNodeManager) getNodeCapacity(ctx context.Context, nodeID string) AllocatedPorts: allocatedPorts, AvailablePorts: availablePorts, UsedMemoryMB: totalMemoryMB, - AvailableMemoryMB: 8192 - totalMemoryMB, // Assume 8GB per node (make configurable later) + AvailableMemoryMB: constants.MaxMemoryMB - totalMemoryMB, UsedCPUPercent: totalCPUPercent, Score: score, } @@ -331,12 +332,10 @@ func (hnm *HomeNodeManager) getNodeResourceUsage(ctx context.Context, nodeID str // calculateCapacityScore calculates a 0.0-1.0 score (higher is better) func (hnm *HomeNodeManager) calculateCapacityScore(deploymentCount, allocatedPorts, availablePorts, usedMemoryMB, usedCPUPercent int) float64 { - const ( - maxDeployments = 100 // Max deployments per node - maxMemoryMB = 8192 // 8GB - maxCPUPercent = 400 // 400% = 4 cores - maxPorts = 9900 // ~10k ports available - ) + maxDeployments := constants.MaxDeploymentsPerNode + maxMemoryMB := constants.MaxMemoryMB + maxCPUPercent := constants.MaxCPUPercent + maxPorts := constants.MaxPortsPerNode // Calculate 
individual component scores (0.0 to 1.0) deploymentScore := 1.0 - (float64(deploymentCount) / float64(maxDeployments)) diff --git a/pkg/deployments/port_allocator.go b/pkg/deployments/port_allocator.go index a153de4..17fcbb0 100644 --- a/pkg/deployments/port_allocator.go +++ b/pkg/deployments/port_allocator.go @@ -3,6 +3,7 @@ package deployments import ( "context" "fmt" + "strings" "time" "github.com/DeBrosOfficial/network/pkg/client" @@ -216,21 +217,6 @@ func isConflictError(err error) bool { if err == nil { return false } - // RQLite returns constraint violation errors as strings containing "UNIQUE constraint failed" errStr := err.Error() - return contains(errStr, "UNIQUE") || contains(errStr, "constraint") || contains(errStr, "conflict") -} - -// contains checks if a string contains a substring (case-insensitive) -func contains(s, substr string) bool { - return len(s) >= len(substr) && (s == substr || len(s) > len(substr) && findSubstring(s, substr)) -} - -func findSubstring(s, substr string) bool { - for i := 0; i <= len(s)-len(substr); i++ { - if s[i:i+len(substr)] == substr { - return true - } - } - return false + return strings.Contains(errStr, "UNIQUE") || strings.Contains(errStr, "constraint") || strings.Contains(errStr, "conflict") } diff --git a/pkg/deployments/port_allocator_test.go b/pkg/deployments/port_allocator_test.go index 5acfe2f..89d9f23 100644 --- a/pkg/deployments/port_allocator_test.go +++ b/pkg/deployments/port_allocator_test.go @@ -4,6 +4,7 @@ import ( "context" "database/sql" "reflect" + "strings" "testing" "github.com/DeBrosOfficial/network/pkg/rqlite" @@ -410,7 +411,7 @@ func TestContains(t *testing.T) { for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - result := contains(tt.s, tt.substr) + result := strings.Contains(tt.s, tt.substr) if result != tt.expected { t.Errorf("contains(%q, %q) = %v, expected %v", tt.s, tt.substr, result, tt.expected) } diff --git a/pkg/deployments/types.go b/pkg/deployments/types.go index 
8cbcbda..c34fbd9 100644 --- a/pkg/deployments/types.go +++ b/pkg/deployments/types.go @@ -249,9 +249,7 @@ var ( ErrNoNodesAvailable = &DeploymentError{Message: "no nodes available for deployment"} ErrDeploymentNotFound = &DeploymentError{Message: "deployment not found"} ErrNamespaceNotAssigned = &DeploymentError{Message: "namespace has no home node assigned"} - ErrInvalidDeploymentType = &DeploymentError{Message: "invalid deployment type"} ErrSubdomainTaken = &DeploymentError{Message: "subdomain already in use"} - ErrDomainReserved = &DeploymentError{Message: "domain is reserved"} ) // DeploymentError represents a deployment-related error diff --git a/pkg/environments/production/config.go b/pkg/environments/production/config.go index 435ed93..ab74e9e 100644 --- a/pkg/environments/production/config.go +++ b/pkg/environments/production/config.go @@ -429,7 +429,9 @@ func (sg *SecretGenerator) SaveConfig(filename string, content string) error { } // Fix ownership - exec.Command("chown", "debros:debros", configPath).Run() + if err := exec.Command("chown", "debros:debros", configPath).Run(); err != nil { + fmt.Printf("Warning: failed to chown %s to debros:debros: %v\n", configPath, err) + } return nil } diff --git a/pkg/environments/production/preferences.go b/pkg/environments/production/preferences.go index e3926be..ea34f05 100644 --- a/pkg/environments/production/preferences.go +++ b/pkg/environments/production/preferences.go @@ -3,7 +3,6 @@ package production import ( "os" "path/filepath" - "strings" "gopkg.in/yaml.v3" ) @@ -15,10 +14,7 @@ type NodePreferences struct { AnyoneClient bool `yaml:"anyone_client"` } -const ( - preferencesFile = "preferences.yaml" - legacyBranchFile = ".branch" -) +const preferencesFile = "preferences.yaml" // SavePreferences saves node preferences to disk func SavePreferences(oramaDir string, prefs *NodePreferences) error { @@ -38,10 +34,6 @@ func SavePreferences(oramaDir string, prefs *NodePreferences) error { return err } - // Also save 
branch to legacy .branch file for backward compatibility - legacyPath := filepath.Join(oramaDir, legacyBranchFile) - os.WriteFile(legacyPath, []byte(prefs.Branch), 0644) - return nil } @@ -53,7 +45,7 @@ func LoadPreferences(oramaDir string) *NodePreferences { Nameserver: false, } - // Try to load from preferences.yaml first + // Try to load from preferences.yaml path := filepath.Join(oramaDir, preferencesFile) if data, err := os.ReadFile(path); err == nil { if err := yaml.Unmarshal(data, prefs); err == nil { @@ -61,15 +53,6 @@ func LoadPreferences(oramaDir string) *NodePreferences { } } - // Fall back to legacy .branch file - legacyPath := filepath.Join(oramaDir, legacyBranchFile) - if data, err := os.ReadFile(legacyPath); err == nil { - branch := strings.TrimSpace(string(data)) - if branch != "" { - prefs.Branch = branch - } - } - return prefs } diff --git a/pkg/gateway/anon_proxy_handler.go b/pkg/gateway/anon_proxy_handler.go index 692434d..e6fe987 100644 --- a/pkg/gateway/anon_proxy_handler.go +++ b/pkg/gateway/anon_proxy_handler.go @@ -5,6 +5,7 @@ import ( "encoding/json" "fmt" "io" + "net" "net/http" "net/url" "strings" @@ -234,31 +235,15 @@ func isPrivateOrLocalHost(host string) bool { } // Check for localhost variants - if host == "localhost" || host == "::1" { + if host == "localhost" { return true } - // Check common private ranges (basic check) - if strings.HasPrefix(host, "10.") || - strings.HasPrefix(host, "192.168.") || - strings.HasPrefix(host, "172.16.") || - strings.HasPrefix(host, "172.17.") || - strings.HasPrefix(host, "172.18.") || - strings.HasPrefix(host, "172.19.") || - strings.HasPrefix(host, "172.20.") || - strings.HasPrefix(host, "172.21.") || - strings.HasPrefix(host, "172.22.") || - strings.HasPrefix(host, "172.23.") || - strings.HasPrefix(host, "172.24.") || - strings.HasPrefix(host, "172.25.") || - strings.HasPrefix(host, "172.26.") || - strings.HasPrefix(host, "172.27.") || - strings.HasPrefix(host, "172.28.") || - 
strings.HasPrefix(host, "172.29.") || - strings.HasPrefix(host, "172.30.") || - strings.HasPrefix(host, "172.31.") { - return true + // Parse as IP and use standard library checks + ip := net.ParseIP(host) + if ip == nil { + return false } - return false + return ip.IsLoopback() || ip.IsPrivate() || ip.IsLinkLocalUnicast() || ip.IsLinkLocalMulticast() } diff --git a/pkg/gateway/http_gateway.go b/pkg/gateway/http_gateway.go index 528f069..9c1d1d3 100644 --- a/pkg/gateway/http_gateway.go +++ b/pkg/gateway/http_gateway.go @@ -23,9 +23,8 @@ import ( type HTTPGateway struct { logger *logging.ColoredLogger config *config.HTTPGatewayConfig - router chi.Router - reverseProxies map[string]*httputil.ReverseProxy - mu sync.RWMutex + router chi.Router + mu sync.RWMutex server *http.Server } @@ -46,8 +45,7 @@ func NewHTTPGateway(logger *logging.ColoredLogger, cfg *config.HTTPGatewayConfig gateway := &HTTPGateway{ logger: logger, config: cfg, - router: chi.NewRouter(), - reverseProxies: make(map[string]*httputil.ReverseProxy), + router: chi.NewRouter(), } // Set up router middleware @@ -110,8 +108,6 @@ func (hg *HTTPGateway) initializeRoutes() error { } } - hg.reverseProxies[routeName] = proxy - // Register route handler hg.registerRouteHandler(routeName, routeConfig, proxy) diff --git a/pkg/gateway/middleware.go b/pkg/gateway/middleware.go index ac69ba3..6bf8a24 100644 --- a/pkg/gateway/middleware.go +++ b/pkg/gateway/middleware.go @@ -1111,35 +1111,6 @@ func (g *Gateway) getDeploymentByDomain(ctx context.Context, domain string) (*de } } - // Legacy format: {name}.node-{shortID}.{baseDomain} (backwards compatibility) - if len(parts) == 2 && strings.HasPrefix(parts[1], "node-") { - deploymentName := parts[0] - shortNodeID := parts[1] // e.g., "node-kv4la8" - - // Query by name and matching short node ID - query := ` - SELECT id, namespace, name, type, port, content_cid, status, home_node_id - FROM deployments - WHERE name = ? - AND ('node-' || substr(home_node_id, 9, 6) = ? 
OR home_node_id = ?) - AND status = 'active' - LIMIT 1 - ` - result, err := db.Query(internalCtx, query, deploymentName, shortNodeID, shortNodeID) - if err == nil && len(result.Rows) > 0 { - row := result.Rows[0] - return &deployments.Deployment{ - ID: getString(row[0]), - Namespace: getString(row[1]), - Name: getString(row[2]), - Type: deployments.DeploymentType(getString(row[3])), - Port: getInt(row[4]), - ContentCID: getString(row[5]), - Status: deployments.DeploymentStatus(getString(row[6])), - HomeNodeID: getString(row[7]), - }, nil - } - } } // Try custom domain from deployment_domains table diff --git a/pkg/gateway/peer_discovery.go b/pkg/gateway/peer_discovery.go index 404bb3b..43432f6 100644 --- a/pkg/gateway/peer_discovery.go +++ b/pkg/gateway/peer_discovery.go @@ -9,6 +9,7 @@ import ( "strings" "time" + "github.com/DeBrosOfficial/network/pkg/wireguard" "github.com/libp2p/go-libp2p/core/host" "github.com/libp2p/go-libp2p/core/peer" "github.com/multiformats/go-multiaddr" @@ -337,16 +338,21 @@ func (pd *PeerDiscovery) updateHeartbeat(ctx context.Context) error { } // GetWireGuardIP detects the local WireGuard IP address using the wg0 network -// interface or the WireGuard config file. It does not require a PeerDiscovery -// instance and can be called from anywhere in the gateway package. +// interface, the 'ip' command, or the WireGuard config file. +// It does not require a PeerDiscovery instance and can be called from anywhere +// in the gateway package. 
func GetWireGuardIP() (string, error) { - // Method 1: Use 'ip addr show wg0' command (works without root) - ip, err := getWireGuardIPFromCommand() - if err == nil { + // Method 1: Use net.InterfaceByName (shared implementation) + if ip, err := wireguard.GetIP(); err == nil { return ip, nil } - // Method 2: Try to read from WireGuard config file (requires root, may fail) + // Method 2: Use 'ip addr show wg0' command (works without root) + if ip, err := getWireGuardIPFromCommand(); err == nil { + return ip, nil + } + + // Method 3: Try to read from WireGuard config file (requires root, may fail) configPath := "/etc/wireguard/wg0.conf" data, err := os.ReadFile(configPath) if err == nil { @@ -359,7 +365,6 @@ func GetWireGuardIP() (string, error) { parts := strings.Split(line, "=") if len(parts) == 2 { addrWithCIDR := strings.TrimSpace(parts[1]) - // Remove /24 suffix ip := strings.Split(addrWithCIDR, "/")[0] ip = strings.TrimSpace(ip) return ip, nil diff --git a/pkg/inspector/analyzer.go b/pkg/inspector/analyzer.go new file mode 100644 index 0000000..7e5529f --- /dev/null +++ b/pkg/inspector/analyzer.go @@ -0,0 +1,229 @@ +package inspector + +import ( + "bytes" + "encoding/json" + "fmt" + "io" + "net/http" + "os" + "strings" + "time" +) + +// AnalysisResult holds the AI's analysis of check failures. +type AnalysisResult struct { + Model string + Analysis string + Duration time.Duration +} + +// Analyze sends failures and cluster context to OpenRouter for AI analysis. 
+func Analyze(results *Results, data *ClusterData, model, apiKey string) (*AnalysisResult, error) { + if apiKey == "" { + apiKey = os.Getenv("OPENROUTER_API_KEY") + } + if apiKey == "" { + return nil, fmt.Errorf("no API key: set --api-key or OPENROUTER_API_KEY env") + } + + // Build the prompt with failures, warnings, and cluster context + prompt := buildAnalysisPrompt(results, data) + + start := time.Now() + response, err := callOpenRouter(model, apiKey, prompt) + if err != nil { + return nil, fmt.Errorf("OpenRouter API call failed: %w", err) + } + + return &AnalysisResult{ + Model: model, + Analysis: response, + Duration: time.Since(start), + }, nil +} + +func buildAnalysisPrompt(results *Results, data *ClusterData) string { + var b strings.Builder + + // System context + b.WriteString("You are a distributed systems expert analyzing health check results for an Orama Network cluster.\n") + b.WriteString("The cluster runs RQLite (Raft consensus), Olric (distributed cache), IPFS, CoreDNS, and WireGuard.\n\n") + + // Cluster overview + b.WriteString("## Cluster Overview\n") + b.WriteString(fmt.Sprintf("Nodes inspected: %d\n", len(data.Nodes))) + for host, nd := range data.Nodes { + b.WriteString(fmt.Sprintf("- %s (role: %s)\n", host, nd.Node.Role)) + } + b.WriteString("\n") + + // Summary + passed, failed, warned, skipped := results.Summary() + b.WriteString(fmt.Sprintf("## Check Results: %d passed, %d failed, %d warnings, %d skipped\n\n", passed, failed, warned, skipped)) + + // List all failures + failures := results.Failures() + if len(failures) > 0 { + b.WriteString("## Failures (CRITICAL)\n") + for _, f := range failures { + node := f.Node + if node == "" { + node = "cluster-wide" + } + b.WriteString(fmt.Sprintf("- [%s] %s (%s): %s\n", f.Severity, f.Name, node, f.Message)) + } + b.WriteString("\n") + } + + // List all warnings + warnings := results.FailuresAndWarnings() + warningsOnly := make([]CheckResult, 0) + for _, w := range warnings { + if w.Status == 
StatusWarn { + warningsOnly = append(warningsOnly, w) + } + } + if len(warningsOnly) > 0 { + b.WriteString("## Warnings\n") + for _, w := range warningsOnly { + node := w.Node + if node == "" { + node = "cluster-wide" + } + b.WriteString(fmt.Sprintf("- [%s] %s (%s): %s\n", w.Severity, w.Name, node, w.Message)) + } + b.WriteString("\n") + } + + // Add raw RQLite status for context (condensed) + b.WriteString("## Raw Cluster Data (condensed)\n") + for host, nd := range data.Nodes { + if nd.RQLite != nil && nd.RQLite.Status != nil { + s := nd.RQLite.Status + b.WriteString(fmt.Sprintf("### %s (RQLite)\n", host)) + b.WriteString(fmt.Sprintf(" raft_state=%s term=%d applied=%d commit=%d leader=%s peers=%d voter=%v\n", + s.RaftState, s.Term, s.AppliedIndex, s.CommitIndex, s.LeaderNodeID, s.NumPeers, s.Voter)) + if nd.RQLite.Nodes != nil { + b.WriteString(fmt.Sprintf(" /nodes reports %d members:", len(nd.RQLite.Nodes))) + for addr, n := range nd.RQLite.Nodes { + reachable := "ok" + if !n.Reachable { + reachable = "UNREACHABLE" + } + leader := "" + if n.Leader { + leader = " LEADER" + } + b.WriteString(fmt.Sprintf(" %s(%s%s)", addr, reachable, leader)) + } + b.WriteString("\n") + } + } + } + + b.WriteString("\n## Task\n") + b.WriteString("Analyze the failures and warnings above. For each issue:\n") + b.WriteString("1. Explain the root cause\n") + b.WriteString("2. Assess the severity and impact on the cluster\n") + b.WriteString("3. Suggest specific commands or actions to fix it\n") + b.WriteString("\nBe concise and actionable. Group related issues together. 
Use markdown formatting.\n") + + return b.String() +} + +// OpenRouter API types (OpenAI-compatible) + +type openRouterRequest struct { + Model string `json:"model"` + Messages []openRouterMessage `json:"messages"` +} + +type openRouterMessage struct { + Role string `json:"role"` + Content string `json:"content"` +} + +type openRouterResponse struct { + Choices []struct { + Message struct { + Content string `json:"content"` + } `json:"message"` + } `json:"choices"` + Error *struct { + Message string `json:"message"` + Code int `json:"code"` + } `json:"error"` +} + +func callOpenRouter(model, apiKey, prompt string) (string, error) { + reqBody := openRouterRequest{ + Model: model, + Messages: []openRouterMessage{ + {Role: "user", Content: prompt}, + }, + } + + jsonBody, err := json.Marshal(reqBody) + if err != nil { + return "", fmt.Errorf("marshal request: %w", err) + } + + req, err := http.NewRequest("POST", "https://openrouter.ai/api/v1/chat/completions", bytes.NewReader(jsonBody)) + if err != nil { + return "", fmt.Errorf("create request: %w", err) + } + req.Header.Set("Content-Type", "application/json") + req.Header.Set("Authorization", "Bearer "+apiKey) + + client := &http.Client{Timeout: 120 * time.Second} + resp, err := client.Do(req) + if err != nil { + return "", fmt.Errorf("HTTP request: %w", err) + } + defer resp.Body.Close() + + body, err := io.ReadAll(resp.Body) + if err != nil { + return "", fmt.Errorf("read response: %w", err) + } + + if resp.StatusCode != http.StatusOK { + return "", fmt.Errorf("API returned %d: %s", resp.StatusCode, string(body)) + } + + var orResp openRouterResponse + if err := json.Unmarshal(body, &orResp); err != nil { + return "", fmt.Errorf("unmarshal response: %w", err) + } + + if orResp.Error != nil { + return "", fmt.Errorf("API error: %s", orResp.Error.Message) + } + + if len(orResp.Choices) == 0 { + return "", fmt.Errorf("no choices in response (raw: %s)", truncate(string(body), 500)) + } + + content := 
orResp.Choices[0].Message.Content + if strings.TrimSpace(content) == "" { + return "", fmt.Errorf("model returned empty response (raw: %s)", truncate(string(body), 500)) + } + + return content, nil +} + +func truncate(s string, max int) string { + if len(s) <= max { + return s + } + return s[:max] + "..." +} + +// PrintAnalysis writes the AI analysis to the output. +func PrintAnalysis(analysis *AnalysisResult, w io.Writer) { + fmt.Fprintf(w, "\n## AI Analysis (%s)\n", analysis.Model) + fmt.Fprintf(w, "%s\n", strings.Repeat("-", 70)) + fmt.Fprintf(w, "%s\n", analysis.Analysis) + fmt.Fprintf(w, "\n(Analysis took %.1fs)\n", analysis.Duration.Seconds()) +} diff --git a/pkg/inspector/checker.go b/pkg/inspector/checker.go new file mode 100644 index 0000000..f7db218 --- /dev/null +++ b/pkg/inspector/checker.go @@ -0,0 +1,172 @@ +package inspector + +import ( + "time" +) + +// Severity levels for check results. +type Severity int + +const ( + Low Severity = iota + Medium + High + Critical +) + +func (s Severity) String() string { + switch s { + case Low: + return "LOW" + case Medium: + return "MEDIUM" + case High: + return "HIGH" + case Critical: + return "CRITICAL" + default: + return "UNKNOWN" + } +} + +// Status represents the outcome of a check. +type Status string + +const ( + StatusPass Status = "pass" + StatusFail Status = "fail" + StatusWarn Status = "warn" + StatusSkip Status = "skip" +) + +// CheckResult holds the outcome of a single health check. +type CheckResult struct { + ID string `json:"id"` // e.g. "rqlite.leader_exists" + Name string `json:"name"` // "Cluster has exactly one leader" + Subsystem string `json:"subsystem"` // "rqlite" + Severity Severity `json:"severity"` + Status Status `json:"status"` + Message string `json:"message"` // human-readable detail + Node string `json:"node,omitempty"` // which node (empty for cluster-wide) +} + +// Results holds all check outcomes. 
+type Results struct { + Checks []CheckResult `json:"checks"` + Duration time.Duration `json:"duration"` +} + +// Summary returns counts by status. +func (r *Results) Summary() (passed, failed, warned, skipped int) { + for _, c := range r.Checks { + switch c.Status { + case StatusPass: + passed++ + case StatusFail: + failed++ + case StatusWarn: + warned++ + case StatusSkip: + skipped++ + } + } + return +} + +// Failures returns only failed checks. +func (r *Results) Failures() []CheckResult { + var out []CheckResult + for _, c := range r.Checks { + if c.Status == StatusFail { + out = append(out, c) + } + } + return out +} + +// FailuresAndWarnings returns failed and warning checks. +func (r *Results) FailuresAndWarnings() []CheckResult { + var out []CheckResult + for _, c := range r.Checks { + if c.Status == StatusFail || c.Status == StatusWarn { + out = append(out, c) + } + } + return out +} + +// CheckFunc is the signature for a subsystem check function. +type CheckFunc func(data *ClusterData) []CheckResult + +// SubsystemCheckers maps subsystem names to their check functions. +// Populated by checks/ package init or by explicit registration. +var SubsystemCheckers = map[string]CheckFunc{} + +// RegisterChecker registers a check function for a subsystem. +func RegisterChecker(subsystem string, fn CheckFunc) { + SubsystemCheckers[subsystem] = fn +} + +// RunChecks executes checks for the requested subsystems against collected data. 
+func RunChecks(data *ClusterData, subsystems []string) *Results { + start := time.Now() + results := &Results{} + + shouldCheck := func(name string) bool { + if len(subsystems) == 0 { + return true + } + for _, s := range subsystems { + if s == name || s == "all" { + return true + } + // Alias: "wg" matches "wireguard" + if s == "wg" && name == "wireguard" { + return true + } + } + return false + } + + for name, fn := range SubsystemCheckers { + if shouldCheck(name) { + checks := fn(data) + results.Checks = append(results.Checks, checks...) + } + } + + results.Duration = time.Since(start) + return results +} + +// Pass creates a passing check result. +func Pass(id, name, subsystem, node, msg string, sev Severity) CheckResult { + return CheckResult{ + ID: id, Name: name, Subsystem: subsystem, + Severity: sev, Status: StatusPass, Message: msg, Node: node, + } +} + +// Fail creates a failing check result. +func Fail(id, name, subsystem, node, msg string, sev Severity) CheckResult { + return CheckResult{ + ID: id, Name: name, Subsystem: subsystem, + Severity: sev, Status: StatusFail, Message: msg, Node: node, + } +} + +// Warn creates a warning check result. +func Warn(id, name, subsystem, node, msg string, sev Severity) CheckResult { + return CheckResult{ + ID: id, Name: name, Subsystem: subsystem, + Severity: sev, Status: StatusWarn, Message: msg, Node: node, + } +} + +// Skip creates a skipped check result. 
+func Skip(id, name, subsystem, node, msg string, sev Severity) CheckResult { + return CheckResult{ + ID: id, Name: name, Subsystem: subsystem, + Severity: sev, Status: StatusSkip, Message: msg, Node: node, + } +} diff --git a/pkg/inspector/checker_test.go b/pkg/inspector/checker_test.go new file mode 100644 index 0000000..00e54b9 --- /dev/null +++ b/pkg/inspector/checker_test.go @@ -0,0 +1,190 @@ +package inspector + +import ( + "testing" + "time" +) + +func TestSummary(t *testing.T) { + r := &Results{ + Checks: []CheckResult{ + {ID: "a", Status: StatusPass}, + {ID: "b", Status: StatusPass}, + {ID: "c", Status: StatusFail}, + {ID: "d", Status: StatusWarn}, + {ID: "e", Status: StatusSkip}, + {ID: "f", Status: StatusPass}, + }, + } + passed, failed, warned, skipped := r.Summary() + if passed != 3 { + t.Errorf("passed: want 3, got %d", passed) + } + if failed != 1 { + t.Errorf("failed: want 1, got %d", failed) + } + if warned != 1 { + t.Errorf("warned: want 1, got %d", warned) + } + if skipped != 1 { + t.Errorf("skipped: want 1, got %d", skipped) + } +} + +func TestFailures(t *testing.T) { + r := &Results{ + Checks: []CheckResult{ + {ID: "a", Status: StatusPass}, + {ID: "b", Status: StatusFail}, + {ID: "c", Status: StatusWarn}, + {ID: "d", Status: StatusFail}, + }, + } + failures := r.Failures() + if len(failures) != 2 { + t.Fatalf("want 2 failures, got %d", len(failures)) + } + for _, f := range failures { + if f.Status != StatusFail { + t.Errorf("expected StatusFail, got %s for check %s", f.Status, f.ID) + } + } +} + +func TestFailuresAndWarnings(t *testing.T) { + r := &Results{ + Checks: []CheckResult{ + {ID: "a", Status: StatusPass}, + {ID: "b", Status: StatusFail}, + {ID: "c", Status: StatusWarn}, + {ID: "d", Status: StatusSkip}, + }, + } + fw := r.FailuresAndWarnings() + if len(fw) != 2 { + t.Fatalf("want 2 failures+warnings, got %d", len(fw)) + } +} + +func TestPass(t *testing.T) { + c := Pass("test.id", "Test Name", "sub", "node1", "msg", Critical) + if 
c.Status != StatusPass { + t.Errorf("want pass, got %s", c.Status) + } + if c.Severity != Critical { + t.Errorf("want Critical, got %s", c.Severity) + } + if c.Node != "node1" { + t.Errorf("want node1, got %s", c.Node) + } +} + +func TestFail(t *testing.T) { + c := Fail("test.id", "Test Name", "sub", "", "msg", High) + if c.Status != StatusFail { + t.Errorf("want fail, got %s", c.Status) + } + if c.Node != "" { + t.Errorf("want empty node, got %q", c.Node) + } +} + +func TestWarn(t *testing.T) { + c := Warn("test.id", "Test Name", "sub", "n", "msg", Medium) + if c.Status != StatusWarn { + t.Errorf("want warn, got %s", c.Status) + } +} + +func TestSkip(t *testing.T) { + c := Skip("test.id", "Test Name", "sub", "n", "msg", Low) + if c.Status != StatusSkip { + t.Errorf("want skip, got %s", c.Status) + } +} + +func TestSeverityString(t *testing.T) { + tests := []struct { + sev Severity + want string + }{ + {Low, "LOW"}, + {Medium, "MEDIUM"}, + {High, "HIGH"}, + {Critical, "CRITICAL"}, + {Severity(99), "UNKNOWN"}, + } + for _, tt := range tests { + t.Run(tt.want, func(t *testing.T) { + if got := tt.sev.String(); got != tt.want { + t.Errorf("Severity(%d).String() = %q, want %q", tt.sev, got, tt.want) + } + }) + } +} + +func TestRunChecks_EmptyData(t *testing.T) { + data := &ClusterData{ + Nodes: map[string]*NodeData{}, + Duration: time.Second, + } + results := RunChecks(data, nil) + if results == nil { + t.Fatal("RunChecks returned nil") + } + // Should not panic and should return a valid Results +} + +func TestRunChecks_FilterBySubsystem(t *testing.T) { + // Register a test checker + called := map[string]bool{} + SubsystemCheckers["test_sub_a"] = func(data *ClusterData) []CheckResult { + called["a"] = true + return []CheckResult{Pass("a.1", "A1", "test_sub_a", "", "ok", Low)} + } + SubsystemCheckers["test_sub_b"] = func(data *ClusterData) []CheckResult { + called["b"] = true + return []CheckResult{Pass("b.1", "B1", "test_sub_b", "", "ok", Low)} + } + defer 
delete(SubsystemCheckers, "test_sub_a") + defer delete(SubsystemCheckers, "test_sub_b") + + data := &ClusterData{Nodes: map[string]*NodeData{}} + + // Filter to only "test_sub_a" + results := RunChecks(data, []string{"test_sub_a"}) + if !called["a"] { + t.Error("test_sub_a checker was not called") + } + if called["b"] { + t.Error("test_sub_b checker should not have been called") + } + + found := false + for _, c := range results.Checks { + if c.ID == "a.1" { + found = true + } + if c.Subsystem == "test_sub_b" { + t.Error("should not have checks from test_sub_b") + } + } + if !found { + t.Error("expected check a.1 in results") + } +} + +func TestRunChecks_AliasWG(t *testing.T) { + called := false + SubsystemCheckers["wireguard"] = func(data *ClusterData) []CheckResult { + called = true + return nil + } + defer delete(SubsystemCheckers, "wireguard") + + data := &ClusterData{Nodes: map[string]*NodeData{}} + RunChecks(data, []string{"wg"}) + if !called { + t.Error("wireguard checker not called via 'wg' alias") + } +} diff --git a/pkg/inspector/checks/dns.go b/pkg/inspector/checks/dns.go new file mode 100644 index 0000000..1aca414 --- /dev/null +++ b/pkg/inspector/checks/dns.go @@ -0,0 +1,224 @@ +package checks + +import ( + "fmt" + + "github.com/DeBrosOfficial/network/pkg/inspector" +) + +func init() { + inspector.RegisterChecker("dns", CheckDNS) +} + +const dnsSub = "dns" + +// CheckDNS runs all DNS/CoreDNS health checks against cluster data. +func CheckDNS(data *inspector.ClusterData) []inspector.CheckResult { + var results []inspector.CheckResult + + for _, nd := range data.Nodes { + if nd.DNS == nil { + continue + } + results = append(results, checkDNSPerNode(nd)...) + } + + results = append(results, checkDNSCrossNode(data)...) 
+ + return results +} + +func checkDNSPerNode(nd *inspector.NodeData) []inspector.CheckResult { + var r []inspector.CheckResult + dns := nd.DNS + node := nd.Node.Name() + + // 4.1 CoreDNS service running + if dns.CoreDNSActive { + r = append(r, inspector.Pass("dns.coredns_active", "CoreDNS service active", dnsSub, node, + "coredns is active", inspector.Critical)) + } else { + r = append(r, inspector.Fail("dns.coredns_active", "CoreDNS service active", dnsSub, node, + "coredns is not active", inspector.Critical)) + return r + } + + // 4.47 Caddy service running + if dns.CaddyActive { + r = append(r, inspector.Pass("dns.caddy_active", "Caddy service active", dnsSub, node, + "caddy is active", inspector.Critical)) + } else { + r = append(r, inspector.Fail("dns.caddy_active", "Caddy service active", dnsSub, node, + "caddy is not active", inspector.Critical)) + } + + // 4.8 DNS port 53 bound + if dns.Port53Bound { + r = append(r, inspector.Pass("dns.port_53", "DNS port 53 bound", dnsSub, node, + "UDP 53 is listening", inspector.Critical)) + } else { + r = append(r, inspector.Fail("dns.port_53", "DNS port 53 bound", dnsSub, node, + "UDP 53 is NOT listening", inspector.Critical)) + } + + // 4.10 HTTP port 80 + if dns.Port80Bound { + r = append(r, inspector.Pass("dns.port_80", "HTTP port 80 bound", dnsSub, node, + "TCP 80 is listening", inspector.High)) + } else { + r = append(r, inspector.Warn("dns.port_80", "HTTP port 80 bound", dnsSub, node, + "TCP 80 is NOT listening", inspector.High)) + } + + // 4.11 HTTPS port 443 + if dns.Port443Bound { + r = append(r, inspector.Pass("dns.port_443", "HTTPS port 443 bound", dnsSub, node, + "TCP 443 is listening", inspector.Critical)) + } else { + r = append(r, inspector.Fail("dns.port_443", "HTTPS port 443 bound", dnsSub, node, + "TCP 443 is NOT listening", inspector.Critical)) + } + + // 4.3 CoreDNS memory + if dns.CoreDNSMemMB > 0 { + if dns.CoreDNSMemMB < 100 { + r = append(r, inspector.Pass("dns.coredns_memory", "CoreDNS memory 
healthy", dnsSub, node, + fmt.Sprintf("RSS=%dMB", dns.CoreDNSMemMB), inspector.Medium)) + } else if dns.CoreDNSMemMB < 200 { + r = append(r, inspector.Warn("dns.coredns_memory", "CoreDNS memory healthy", dnsSub, node, + fmt.Sprintf("RSS=%dMB (elevated)", dns.CoreDNSMemMB), inspector.Medium)) + } else { + r = append(r, inspector.Fail("dns.coredns_memory", "CoreDNS memory healthy", dnsSub, node, + fmt.Sprintf("RSS=%dMB (high)", dns.CoreDNSMemMB), inspector.High)) + } + } + + // 4.4 CoreDNS restart count + if dns.CoreDNSRestarts == 0 { + r = append(r, inspector.Pass("dns.coredns_restarts", "CoreDNS low restart count", dnsSub, node, + "NRestarts=0", inspector.High)) + } else if dns.CoreDNSRestarts <= 3 { + r = append(r, inspector.Warn("dns.coredns_restarts", "CoreDNS low restart count", dnsSub, node, + fmt.Sprintf("NRestarts=%d", dns.CoreDNSRestarts), inspector.High)) + } else { + r = append(r, inspector.Fail("dns.coredns_restarts", "CoreDNS low restart count", dnsSub, node, + fmt.Sprintf("NRestarts=%d (crash-looping?)", dns.CoreDNSRestarts), inspector.High)) + } + + // 4.7 CoreDNS log error rate + if dns.LogErrors == 0 { + r = append(r, inspector.Pass("dns.coredns_log_errors", "No recent CoreDNS errors", dnsSub, node, + "0 errors in last 5 minutes", inspector.High)) + } else if dns.LogErrors < 5 { + r = append(r, inspector.Warn("dns.coredns_log_errors", "No recent CoreDNS errors", dnsSub, node, + fmt.Sprintf("%d errors in last 5 minutes", dns.LogErrors), inspector.High)) + } else { + r = append(r, inspector.Fail("dns.coredns_log_errors", "No recent CoreDNS errors", dnsSub, node, + fmt.Sprintf("%d errors in last 5 minutes", dns.LogErrors), inspector.High)) + } + + // 4.14 Corefile exists + if dns.CorefileExists { + r = append(r, inspector.Pass("dns.corefile_exists", "Corefile exists", dnsSub, node, + "/etc/coredns/Corefile present", inspector.High)) + } else { + r = append(r, inspector.Fail("dns.corefile_exists", "Corefile exists", dnsSub, node, + 
"/etc/coredns/Corefile NOT found", inspector.High)) + } + + // 4.20 SOA resolution + if dns.SOAResolves { + r = append(r, inspector.Pass("dns.soa_resolves", "SOA record resolves", dnsSub, node, + "dig SOA returned result", inspector.Critical)) + } else { + r = append(r, inspector.Fail("dns.soa_resolves", "SOA record resolves", dnsSub, node, + "dig SOA returned no result", inspector.Critical)) + } + + // 4.21 NS records resolve + if dns.NSResolves { + r = append(r, inspector.Pass("dns.ns_resolves", "NS records resolve", dnsSub, node, + fmt.Sprintf("%d NS records returned", dns.NSRecordCount), inspector.Critical)) + } else { + r = append(r, inspector.Fail("dns.ns_resolves", "NS records resolve", dnsSub, node, + "dig NS returned no results", inspector.Critical)) + } + + // 4.23 Wildcard DNS resolution + if dns.WildcardResolves { + r = append(r, inspector.Pass("dns.wildcard_resolves", "Wildcard DNS resolves", dnsSub, node, + "test-wildcard. returned IP", inspector.Critical)) + } else { + r = append(r, inspector.Fail("dns.wildcard_resolves", "Wildcard DNS resolves", dnsSub, node, + "test-wildcard. 
returned no IP", inspector.Critical)) + } + + // 4.24 Base domain A record + if dns.BaseAResolves { + r = append(r, inspector.Pass("dns.base_a_resolves", "Base domain A record resolves", dnsSub, node, + " A record returned IP", inspector.High)) + } else { + r = append(r, inspector.Warn("dns.base_a_resolves", "Base domain A record resolves", dnsSub, node, + " A record returned no IP", inspector.High)) + } + + // 4.50 TLS certificate - base domain + if dns.BaseTLSDaysLeft >= 0 { + if dns.BaseTLSDaysLeft > 30 { + r = append(r, inspector.Pass("dns.tls_base", "Base domain TLS cert valid", dnsSub, node, + fmt.Sprintf("%d days until expiry", dns.BaseTLSDaysLeft), inspector.Critical)) + } else if dns.BaseTLSDaysLeft > 7 { + r = append(r, inspector.Warn("dns.tls_base", "Base domain TLS cert valid", dnsSub, node, + fmt.Sprintf("%d days until expiry (expiring soon)", dns.BaseTLSDaysLeft), inspector.Critical)) + } else { + r = append(r, inspector.Fail("dns.tls_base", "Base domain TLS cert valid", dnsSub, node, + fmt.Sprintf("%d days until expiry (CRITICAL)", dns.BaseTLSDaysLeft), inspector.Critical)) + } + } + + // 4.51 TLS certificate - wildcard + if dns.WildTLSDaysLeft >= 0 { + if dns.WildTLSDaysLeft > 30 { + r = append(r, inspector.Pass("dns.tls_wildcard", "Wildcard TLS cert valid", dnsSub, node, + fmt.Sprintf("%d days until expiry", dns.WildTLSDaysLeft), inspector.Critical)) + } else if dns.WildTLSDaysLeft > 7 { + r = append(r, inspector.Warn("dns.tls_wildcard", "Wildcard TLS cert valid", dnsSub, node, + fmt.Sprintf("%d days until expiry (expiring soon)", dns.WildTLSDaysLeft), inspector.Critical)) + } else { + r = append(r, inspector.Fail("dns.tls_wildcard", "Wildcard TLS cert valid", dnsSub, node, + fmt.Sprintf("%d days until expiry (CRITICAL)", dns.WildTLSDaysLeft), inspector.Critical)) + } + } + + return r +} + +func checkDNSCrossNode(data *inspector.ClusterData) []inspector.CheckResult { + var r []inspector.CheckResult + + activeCount := 0 + totalNS := 0 + for _, nd := 
range data.Nodes { + if nd.DNS == nil { + continue + } + totalNS++ + if nd.DNS.CoreDNSActive { + activeCount++ + } + } + + if totalNS == 0 { + return r + } + + if activeCount == totalNS { + r = append(r, inspector.Pass("dns.all_ns_active", "All nameservers running CoreDNS", dnsSub, "", + fmt.Sprintf("%d/%d nameservers active", activeCount, totalNS), inspector.Critical)) + } else { + r = append(r, inspector.Fail("dns.all_ns_active", "All nameservers running CoreDNS", dnsSub, "", + fmt.Sprintf("%d/%d nameservers active", activeCount, totalNS), inspector.Critical)) + } + + return r +} diff --git a/pkg/inspector/checks/dns_test.go b/pkg/inspector/checks/dns_test.go new file mode 100644 index 0000000..c1b82c8 --- /dev/null +++ b/pkg/inspector/checks/dns_test.go @@ -0,0 +1,232 @@ +package checks + +import ( + "testing" + + "github.com/DeBrosOfficial/network/pkg/inspector" +) + +func TestCheckDNS_CoreDNSInactive(t *testing.T) { + nd := makeNodeData("5.5.5.5", "nameserver-ns1") + nd.DNS = &inspector.DNSData{CoreDNSActive: false} + + data := makeCluster(map[string]*inspector.NodeData{"5.5.5.5": nd}) + results := CheckDNS(data) + + expectStatus(t, results, "dns.coredns_active", inspector.StatusFail) + // Early return — no port checks + if findCheck(results, "dns.port_53") != nil { + t.Error("should not check ports when CoreDNS inactive") + } +} + +func TestCheckDNS_HealthyNode(t *testing.T) { + nd := makeNodeData("5.5.5.5", "nameserver-ns1") + nd.DNS = &inspector.DNSData{ + CoreDNSActive: true, + CaddyActive: true, + Port53Bound: true, + Port80Bound: true, + Port443Bound: true, + CoreDNSMemMB: 50, + CoreDNSRestarts: 0, + LogErrors: 0, + CorefileExists: true, + SOAResolves: true, + NSResolves: true, + NSRecordCount: 3, + WildcardResolves: true, + BaseAResolves: true, + BaseTLSDaysLeft: 60, + WildTLSDaysLeft: 60, + } + + data := makeCluster(map[string]*inspector.NodeData{"5.5.5.5": nd}) + results := CheckDNS(data) + + expectStatus(t, results, "dns.coredns_active", 
inspector.StatusPass) + expectStatus(t, results, "dns.caddy_active", inspector.StatusPass) + expectStatus(t, results, "dns.port_53", inspector.StatusPass) + expectStatus(t, results, "dns.port_80", inspector.StatusPass) + expectStatus(t, results, "dns.port_443", inspector.StatusPass) + expectStatus(t, results, "dns.coredns_memory", inspector.StatusPass) + expectStatus(t, results, "dns.coredns_restarts", inspector.StatusPass) + expectStatus(t, results, "dns.coredns_log_errors", inspector.StatusPass) + expectStatus(t, results, "dns.corefile_exists", inspector.StatusPass) + expectStatus(t, results, "dns.soa_resolves", inspector.StatusPass) + expectStatus(t, results, "dns.ns_resolves", inspector.StatusPass) + expectStatus(t, results, "dns.wildcard_resolves", inspector.StatusPass) + expectStatus(t, results, "dns.base_a_resolves", inspector.StatusPass) + expectStatus(t, results, "dns.tls_base", inspector.StatusPass) + expectStatus(t, results, "dns.tls_wildcard", inspector.StatusPass) +} + +func TestCheckDNS_PortsFailing(t *testing.T) { + nd := makeNodeData("5.5.5.5", "nameserver-ns1") + nd.DNS = &inspector.DNSData{ + CoreDNSActive: true, + Port53Bound: false, + Port80Bound: false, + Port443Bound: false, + } + data := makeCluster(map[string]*inspector.NodeData{"5.5.5.5": nd}) + results := CheckDNS(data) + expectStatus(t, results, "dns.port_53", inspector.StatusFail) + expectStatus(t, results, "dns.port_80", inspector.StatusWarn) + expectStatus(t, results, "dns.port_443", inspector.StatusFail) +} + +func TestCheckDNS_Memory(t *testing.T) { + tests := []struct { + name string + memMB int + status inspector.Status + }{ + {"healthy", 50, inspector.StatusPass}, + {"elevated", 150, inspector.StatusWarn}, + {"high", 250, inspector.StatusFail}, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + nd := makeNodeData("5.5.5.5", "nameserver-ns1") + nd.DNS = &inspector.DNSData{CoreDNSActive: true, CoreDNSMemMB: tt.memMB} + data := 
makeCluster(map[string]*inspector.NodeData{"5.5.5.5": nd}) + results := CheckDNS(data) + expectStatus(t, results, "dns.coredns_memory", tt.status) + }) + } +} + +func TestCheckDNS_Restarts(t *testing.T) { + tests := []struct { + name string + restarts int + status inspector.Status + }{ + {"zero", 0, inspector.StatusPass}, + {"few", 2, inspector.StatusWarn}, + {"many", 5, inspector.StatusFail}, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + nd := makeNodeData("5.5.5.5", "nameserver-ns1") + nd.DNS = &inspector.DNSData{CoreDNSActive: true, CoreDNSRestarts: tt.restarts} + data := makeCluster(map[string]*inspector.NodeData{"5.5.5.5": nd}) + results := CheckDNS(data) + expectStatus(t, results, "dns.coredns_restarts", tt.status) + }) + } +} + +func TestCheckDNS_LogErrors(t *testing.T) { + tests := []struct { + name string + errors int + status inspector.Status + }{ + {"none", 0, inspector.StatusPass}, + {"few", 3, inspector.StatusWarn}, + {"many", 10, inspector.StatusFail}, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + nd := makeNodeData("5.5.5.5", "nameserver-ns1") + nd.DNS = &inspector.DNSData{CoreDNSActive: true, LogErrors: tt.errors} + data := makeCluster(map[string]*inspector.NodeData{"5.5.5.5": nd}) + results := CheckDNS(data) + expectStatus(t, results, "dns.coredns_log_errors", tt.status) + }) + } +} + +func TestCheckDNS_TLSExpiry(t *testing.T) { + tests := []struct { + name string + days int + status inspector.Status + }{ + {"healthy", 60, inspector.StatusPass}, + {"expiring soon", 20, inspector.StatusWarn}, + {"critical", 3, inspector.StatusFail}, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + nd := makeNodeData("5.5.5.5", "nameserver-ns1") + nd.DNS = &inspector.DNSData{ + CoreDNSActive: true, + BaseTLSDaysLeft: tt.days, + WildTLSDaysLeft: tt.days, + } + data := makeCluster(map[string]*inspector.NodeData{"5.5.5.5": nd}) + results := CheckDNS(data) + expectStatus(t, results, 
"dns.tls_base", tt.status) + expectStatus(t, results, "dns.tls_wildcard", tt.status) + }) + } +} + +func TestCheckDNS_TLSNotChecked(t *testing.T) { + nd := makeNodeData("5.5.5.5", "nameserver-ns1") + nd.DNS = &inspector.DNSData{ + CoreDNSActive: true, + BaseTLSDaysLeft: -1, + WildTLSDaysLeft: -1, + } + data := makeCluster(map[string]*inspector.NodeData{"5.5.5.5": nd}) + results := CheckDNS(data) + // TLS checks should not be emitted when days == -1 + if findCheck(results, "dns.tls_base") != nil { + t.Error("should not emit tls_base when days == -1") + } +} + +func TestCheckDNS_ResolutionFailures(t *testing.T) { + nd := makeNodeData("5.5.5.5", "nameserver-ns1") + nd.DNS = &inspector.DNSData{ + CoreDNSActive: true, + SOAResolves: false, + NSResolves: false, + WildcardResolves: false, + BaseAResolves: false, + } + data := makeCluster(map[string]*inspector.NodeData{"5.5.5.5": nd}) + results := CheckDNS(data) + expectStatus(t, results, "dns.soa_resolves", inspector.StatusFail) + expectStatus(t, results, "dns.ns_resolves", inspector.StatusFail) + expectStatus(t, results, "dns.wildcard_resolves", inspector.StatusFail) + expectStatus(t, results, "dns.base_a_resolves", inspector.StatusWarn) +} + +func TestCheckDNS_CrossNode_AllActive(t *testing.T) { + nodes := map[string]*inspector.NodeData{} + for _, host := range []string{"5.5.5.5", "6.6.6.6", "7.7.7.7"} { + nd := makeNodeData(host, "nameserver-ns1") + nd.DNS = &inspector.DNSData{CoreDNSActive: true} + nodes[host] = nd + } + data := makeCluster(nodes) + results := CheckDNS(data) + expectStatus(t, results, "dns.all_ns_active", inspector.StatusPass) +} + +func TestCheckDNS_CrossNode_PartialActive(t *testing.T) { + nodes := map[string]*inspector.NodeData{} + active := []bool{true, true, false} + for i, host := range []string{"5.5.5.5", "6.6.6.6", "7.7.7.7"} { + nd := makeNodeData(host, "nameserver-ns1") + nd.DNS = &inspector.DNSData{CoreDNSActive: active[i]} + nodes[host] = nd + } + data := makeCluster(nodes) + results := 
CheckDNS(data) + expectStatus(t, results, "dns.all_ns_active", inspector.StatusFail) +} + +func TestCheckDNS_NilData(t *testing.T) { + nd := makeNodeData("5.5.5.5", "nameserver-ns1") + data := makeCluster(map[string]*inspector.NodeData{"5.5.5.5": nd}) + results := CheckDNS(data) + if len(results) != 0 { + t.Errorf("expected 0 results for nil DNS data, got %d", len(results)) + } +} diff --git a/pkg/inspector/checks/helpers_test.go b/pkg/inspector/checks/helpers_test.go new file mode 100644 index 0000000..7732028 --- /dev/null +++ b/pkg/inspector/checks/helpers_test.go @@ -0,0 +1,74 @@ +package checks + +import ( + "testing" + "time" + + "github.com/DeBrosOfficial/network/pkg/inspector" +) + +// makeNode creates a test Node with the given host and role. +func makeNode(host, role string) inspector.Node { + return inspector.Node{ + Environment: "devnet", + User: "ubuntu", + Host: host, + Password: "test", + Role: role, + } +} + +// makeNodeData creates a NodeData with a node but no subsystem data. +func makeNodeData(host, role string) *inspector.NodeData { + return &inspector.NodeData{ + Node: makeNode(host, role), + } +} + +// makeCluster creates a ClusterData from a map of host → NodeData. +func makeCluster(nodes map[string]*inspector.NodeData) *inspector.ClusterData { + return &inspector.ClusterData{ + Nodes: nodes, + Duration: 1 * time.Second, + } +} + +// countByStatus counts results with the given status. +func countByStatus(results []inspector.CheckResult, status inspector.Status) int { + n := 0 + for _, r := range results { + if r.Status == status { + n++ + } + } + return n +} + +// findCheck returns a pointer to the first check matching the given ID, or nil. +func findCheck(results []inspector.CheckResult, id string) *inspector.CheckResult { + for i := range results { + if results[i].ID == id { + return &results[i] + } + } + return nil +} + +// requireCheck finds a check by ID and fails the test if not found. 
+func requireCheck(t *testing.T, results []inspector.CheckResult, id string) inspector.CheckResult { + t.Helper() + c := findCheck(results, id) + if c == nil { + t.Fatalf("check %q not found in %d results", id, len(results)) + } + return *c +} + +// expectStatus asserts that a check with the given ID has the expected status. +func expectStatus(t *testing.T, results []inspector.CheckResult, id string, status inspector.Status) { + t.Helper() + c := requireCheck(t, results, id) + if c.Status != status { + t.Errorf("check %q: want status=%s, got status=%s (msg=%s)", id, status, c.Status, c.Message) + } +} diff --git a/pkg/inspector/checks/ipfs.go b/pkg/inspector/checks/ipfs.go new file mode 100644 index 0000000..538b1ac --- /dev/null +++ b/pkg/inspector/checks/ipfs.go @@ -0,0 +1,232 @@ +package checks + +import ( + "fmt" + + "github.com/DeBrosOfficial/network/pkg/inspector" +) + +func init() { + inspector.RegisterChecker("ipfs", CheckIPFS) +} + +const ipfsSub = "ipfs" + +// CheckIPFS runs all IPFS health checks against cluster data. +func CheckIPFS(data *inspector.ClusterData) []inspector.CheckResult { + var results []inspector.CheckResult + + for _, nd := range data.Nodes { + if nd.IPFS == nil { + continue + } + results = append(results, checkIPFSPerNode(nd, data)...) + } + + results = append(results, checkIPFSCrossNode(data)...) 
+ + return results +} + +func checkIPFSPerNode(nd *inspector.NodeData, data *inspector.ClusterData) []inspector.CheckResult { + var r []inspector.CheckResult + ipfs := nd.IPFS + node := nd.Node.Name() + + // 3.1 IPFS daemon running + if ipfs.DaemonActive { + r = append(r, inspector.Pass("ipfs.daemon_active", "IPFS daemon active", ipfsSub, node, + "debros-ipfs is active", inspector.Critical)) + } else { + r = append(r, inspector.Fail("ipfs.daemon_active", "IPFS daemon active", ipfsSub, node, + "debros-ipfs is not active", inspector.Critical)) + return r + } + + // 3.2 IPFS Cluster running + if ipfs.ClusterActive { + r = append(r, inspector.Pass("ipfs.cluster_active", "IPFS Cluster active", ipfsSub, node, + "debros-ipfs-cluster is active", inspector.Critical)) + } else { + r = append(r, inspector.Fail("ipfs.cluster_active", "IPFS Cluster active", ipfsSub, node, + "debros-ipfs-cluster is not active", inspector.Critical)) + } + + // 3.6 Swarm peer count + expectedNodes := countIPFSNodes(data) + if ipfs.SwarmPeerCount >= 0 { + expectedPeers := expectedNodes - 1 + if expectedPeers < 0 { + expectedPeers = 0 + } + if ipfs.SwarmPeerCount >= expectedPeers { + r = append(r, inspector.Pass("ipfs.swarm_peers", "Swarm peer count sufficient", ipfsSub, node, + fmt.Sprintf("peers=%d (expected >=%d)", ipfs.SwarmPeerCount, expectedPeers), inspector.High)) + } else if ipfs.SwarmPeerCount > 0 { + r = append(r, inspector.Warn("ipfs.swarm_peers", "Swarm peer count sufficient", ipfsSub, node, + fmt.Sprintf("peers=%d (expected >=%d)", ipfs.SwarmPeerCount, expectedPeers), inspector.High)) + } else { + r = append(r, inspector.Fail("ipfs.swarm_peers", "Swarm peer count sufficient", ipfsSub, node, + fmt.Sprintf("peers=%d (isolated!)", ipfs.SwarmPeerCount), inspector.Critical)) + } + } + + // 3.12 Cluster peer count + if ipfs.ClusterPeerCount >= 0 { + if ipfs.ClusterPeerCount >= expectedNodes { + r = append(r, inspector.Pass("ipfs.cluster_peers", "Cluster peer count matches expected", ipfsSub, 
node, + fmt.Sprintf("cluster_peers=%d (expected=%d)", ipfs.ClusterPeerCount, expectedNodes), inspector.Critical)) + } else { + r = append(r, inspector.Warn("ipfs.cluster_peers", "Cluster peer count matches expected", ipfsSub, node, + fmt.Sprintf("cluster_peers=%d (expected=%d)", ipfs.ClusterPeerCount, expectedNodes), inspector.Critical)) + } + } + + // 3.14 Cluster peer errors + if ipfs.ClusterErrors == 0 { + r = append(r, inspector.Pass("ipfs.cluster_errors", "No cluster peer errors", ipfsSub, node, + "all cluster peers healthy", inspector.Critical)) + } else { + r = append(r, inspector.Fail("ipfs.cluster_errors", "No cluster peer errors", ipfsSub, node, + fmt.Sprintf("%d peers reporting errors", ipfs.ClusterErrors), inspector.Critical)) + } + + // 3.20 Repo size vs max + if ipfs.RepoMaxBytes > 0 && ipfs.RepoSizeBytes > 0 { + pct := float64(ipfs.RepoSizeBytes) / float64(ipfs.RepoMaxBytes) * 100 + sizeMB := ipfs.RepoSizeBytes / (1024 * 1024) + maxMB := ipfs.RepoMaxBytes / (1024 * 1024) + if pct < 80 { + r = append(r, inspector.Pass("ipfs.repo_size", "Repo size below limit", ipfsSub, node, + fmt.Sprintf("repo=%dMB/%dMB (%.0f%%)", sizeMB, maxMB, pct), inspector.High)) + } else if pct < 95 { + r = append(r, inspector.Warn("ipfs.repo_size", "Repo size below limit", ipfsSub, node, + fmt.Sprintf("repo=%dMB/%dMB (%.0f%%)", sizeMB, maxMB, pct), inspector.High)) + } else { + r = append(r, inspector.Fail("ipfs.repo_size", "Repo size below limit", ipfsSub, node, + fmt.Sprintf("repo=%dMB/%dMB (%.0f%% NEARLY FULL)", sizeMB, maxMB, pct), inspector.Critical)) + } + } + + // 3.3 Version + if ipfs.KuboVersion != "" && ipfs.KuboVersion != "unknown" { + r = append(r, inspector.Pass("ipfs.kubo_version", "Kubo version reported", ipfsSub, node, + fmt.Sprintf("kubo=%s", ipfs.KuboVersion), inspector.Low)) + } + if ipfs.ClusterVersion != "" && ipfs.ClusterVersion != "unknown" { + r = append(r, inspector.Pass("ipfs.cluster_version", "Cluster version reported", ipfsSub, node, + 
fmt.Sprintf("cluster=%s", ipfs.ClusterVersion), inspector.Low)) + } + + // 3.29 Swarm key exists (private swarm) + if ipfs.HasSwarmKey { + r = append(r, inspector.Pass("ipfs.swarm_key", "Swarm key exists (private swarm)", ipfsSub, node, + "swarm.key present", inspector.Critical)) + } else { + r = append(r, inspector.Fail("ipfs.swarm_key", "Swarm key exists (private swarm)", ipfsSub, node, + "swarm.key NOT found", inspector.Critical)) + } + + // 3.30 Bootstrap empty (private swarm) + if ipfs.BootstrapEmpty { + r = append(r, inspector.Pass("ipfs.bootstrap_empty", "Bootstrap list empty (private)", ipfsSub, node, + "no public bootstrap peers", inspector.High)) + } else { + r = append(r, inspector.Warn("ipfs.bootstrap_empty", "Bootstrap list empty (private)", ipfsSub, node, + "bootstrap list is not empty (should be empty for private swarm)", inspector.High)) + } + + return r +} + +func checkIPFSCrossNode(data *inspector.ClusterData) []inspector.CheckResult { + var r []inspector.CheckResult + + type nodeInfo struct { + name string + ipfs *inspector.IPFSData + } + var nodes []nodeInfo + for _, nd := range data.Nodes { + if nd.IPFS != nil && nd.IPFS.DaemonActive { + nodes = append(nodes, nodeInfo{name: nd.Node.Name(), ipfs: nd.IPFS}) + } + } + + if len(nodes) < 2 { + return r + } + + // Version consistency + kuboVersions := map[string][]string{} + clusterVersions := map[string][]string{} + for _, n := range nodes { + if n.ipfs.KuboVersion != "" && n.ipfs.KuboVersion != "unknown" { + kuboVersions[n.ipfs.KuboVersion] = append(kuboVersions[n.ipfs.KuboVersion], n.name) + } + if n.ipfs.ClusterVersion != "" && n.ipfs.ClusterVersion != "unknown" { + clusterVersions[n.ipfs.ClusterVersion] = append(clusterVersions[n.ipfs.ClusterVersion], n.name) + } + } + + if len(kuboVersions) == 1 { + for v := range kuboVersions { + r = append(r, inspector.Pass("ipfs.kubo_version_consistent", "Kubo version consistent", ipfsSub, "", + fmt.Sprintf("version=%s across %d nodes", v, len(nodes)), 
inspector.Medium)) + } + } else if len(kuboVersions) > 1 { + r = append(r, inspector.Warn("ipfs.kubo_version_consistent", "Kubo version consistent", ipfsSub, "", + fmt.Sprintf("%d different versions", len(kuboVersions)), inspector.Medium)) + } + + if len(clusterVersions) == 1 { + for v := range clusterVersions { + r = append(r, inspector.Pass("ipfs.cluster_version_consistent", "Cluster version consistent", ipfsSub, "", + fmt.Sprintf("version=%s across %d nodes", v, len(nodes)), inspector.Medium)) + } + } else if len(clusterVersions) > 1 { + r = append(r, inspector.Warn("ipfs.cluster_version_consistent", "Cluster version consistent", ipfsSub, "", + fmt.Sprintf("%d different versions", len(clusterVersions)), inspector.Medium)) + } + + // Repo size convergence + var sizes []int64 + for _, n := range nodes { + if n.ipfs.RepoSizeBytes > 0 { + sizes = append(sizes, n.ipfs.RepoSizeBytes) + } + } + if len(sizes) >= 2 { + minSize, maxSize := sizes[0], sizes[0] + for _, s := range sizes[1:] { + if s < minSize { + minSize = s + } + if s > maxSize { + maxSize = s + } + } + if minSize > 0 { + ratio := float64(maxSize) / float64(minSize) + if ratio <= 2.0 { + r = append(r, inspector.Pass("ipfs.repo_convergence", "Repo size similar across nodes", ipfsSub, "", + fmt.Sprintf("ratio=%.1fx", ratio), inspector.Medium)) + } else { + r = append(r, inspector.Warn("ipfs.repo_convergence", "Repo size similar across nodes", ipfsSub, "", + fmt.Sprintf("ratio=%.1fx (diverged)", ratio), inspector.Medium)) + } + } + } + + return r +} + +func countIPFSNodes(data *inspector.ClusterData) int { + count := 0 + for _, nd := range data.Nodes { + if nd.IPFS != nil { + count++ + } + } + return count +} diff --git a/pkg/inspector/checks/ipfs_test.go b/pkg/inspector/checks/ipfs_test.go new file mode 100644 index 0000000..a56130b --- /dev/null +++ b/pkg/inspector/checks/ipfs_test.go @@ -0,0 +1,183 @@ +package checks + +import ( + "testing" + + "github.com/DeBrosOfficial/network/pkg/inspector" +) + +func 
TestCheckIPFS_DaemonInactive(t *testing.T) { + nd := makeNodeData("1.1.1.1", "node") + nd.IPFS = &inspector.IPFSData{DaemonActive: false} + + data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd}) + results := CheckIPFS(data) + + expectStatus(t, results, "ipfs.daemon_active", inspector.StatusFail) + // Early return — no swarm peer checks + if findCheck(results, "ipfs.swarm_peers") != nil { + t.Error("should not check swarm_peers when daemon inactive") + } +} + +func TestCheckIPFS_HealthyNode(t *testing.T) { + nd := makeNodeData("1.1.1.1", "node") + nd.IPFS = &inspector.IPFSData{ + DaemonActive: true, + ClusterActive: true, + SwarmPeerCount: 0, // single node: expected peers = 0 + ClusterPeerCount: 1, // single node cluster + ClusterErrors: 0, + RepoSizeBytes: 500 * 1024 * 1024, // 500MB + RepoMaxBytes: 1024 * 1024 * 1024, // 1GB + KuboVersion: "0.22.0", + ClusterVersion: "1.0.8", + HasSwarmKey: true, + BootstrapEmpty: true, + } + + data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd}) + results := CheckIPFS(data) + + expectStatus(t, results, "ipfs.daemon_active", inspector.StatusPass) + expectStatus(t, results, "ipfs.cluster_active", inspector.StatusPass) + expectStatus(t, results, "ipfs.swarm_peers", inspector.StatusPass) + expectStatus(t, results, "ipfs.cluster_peers", inspector.StatusPass) + expectStatus(t, results, "ipfs.cluster_errors", inspector.StatusPass) + expectStatus(t, results, "ipfs.repo_size", inspector.StatusPass) + expectStatus(t, results, "ipfs.swarm_key", inspector.StatusPass) + expectStatus(t, results, "ipfs.bootstrap_empty", inspector.StatusPass) +} + +func TestCheckIPFS_SwarmPeers(t *testing.T) { + // Single-node cluster: expected peers = 0 + t.Run("enough", func(t *testing.T) { + nd := makeNodeData("1.1.1.1", "node") + nd.IPFS = &inspector.IPFSData{DaemonActive: true, SwarmPeerCount: 2} + data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd}) + results := CheckIPFS(data) + // swarm_peers=2, expected=0 → 
pass + expectStatus(t, results, "ipfs.swarm_peers", inspector.StatusPass) + }) + + t.Run("low but nonzero", func(t *testing.T) { + // 3-node cluster: expected peers = 2 per node + nd := makeNodeData("1.1.1.1", "node") + nd.IPFS = &inspector.IPFSData{DaemonActive: true, SwarmPeerCount: 1} // has 1, expects 2 + nd2 := makeNodeData("2.2.2.2", "node") + nd2.IPFS = &inspector.IPFSData{DaemonActive: true, SwarmPeerCount: 2} + nd3 := makeNodeData("3.3.3.3", "node") + nd3.IPFS = &inspector.IPFSData{DaemonActive: true, SwarmPeerCount: 2} + data := makeCluster(map[string]*inspector.NodeData{ + "1.1.1.1": nd, "2.2.2.2": nd2, "3.3.3.3": nd3, + }) + results := CheckIPFS(data) + // Node 1.1.1.1 should warn (1 < 2) + found := false + for _, r := range results { + if r.ID == "ipfs.swarm_peers" && r.Node == "ubuntu@1.1.1.1" && r.Status == inspector.StatusWarn { + found = true + } + } + if !found { + t.Error("expected swarm_peers warn for node 1.1.1.1") + } + }) + + t.Run("zero isolated", func(t *testing.T) { + nd := makeNodeData("1.1.1.1", "node") + nd.IPFS = &inspector.IPFSData{DaemonActive: true, SwarmPeerCount: 0} + nd2 := makeNodeData("2.2.2.2", "node") + nd2.IPFS = &inspector.IPFSData{DaemonActive: true, SwarmPeerCount: 1} + data := makeCluster(map[string]*inspector.NodeData{ + "1.1.1.1": nd, "2.2.2.2": nd2, + }) + results := CheckIPFS(data) + found := false + for _, r := range results { + if r.ID == "ipfs.swarm_peers" && r.Node == "ubuntu@1.1.1.1" && r.Status == inspector.StatusFail { + found = true + } + } + if !found { + t.Error("expected swarm_peers fail for isolated node 1.1.1.1") + } + }) +} + +func TestCheckIPFS_RepoSize(t *testing.T) { + tests := []struct { + name string + size int64 + max int64 + status inspector.Status + }{ + {"healthy", 500 * 1024 * 1024, 1024 * 1024 * 1024, inspector.StatusPass}, // 50% + {"elevated", 870 * 1024 * 1024, 1024 * 1024 * 1024, inspector.StatusWarn}, // 85% + {"nearly full", 980 * 1024 * 1024, 1024 * 1024 * 1024, inspector.StatusFail}, 
// 96% + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + nd := makeNodeData("1.1.1.1", "node") + nd.IPFS = &inspector.IPFSData{ + DaemonActive: true, + RepoSizeBytes: tt.size, + RepoMaxBytes: tt.max, + } + data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd}) + results := CheckIPFS(data) + expectStatus(t, results, "ipfs.repo_size", tt.status) + }) + } +} + +func TestCheckIPFS_SwarmKeyMissing(t *testing.T) { + nd := makeNodeData("1.1.1.1", "node") + nd.IPFS = &inspector.IPFSData{DaemonActive: true, HasSwarmKey: false} + data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd}) + results := CheckIPFS(data) + expectStatus(t, results, "ipfs.swarm_key", inspector.StatusFail) +} + +func TestCheckIPFS_BootstrapNotEmpty(t *testing.T) { + nd := makeNodeData("1.1.1.1", "node") + nd.IPFS = &inspector.IPFSData{DaemonActive: true, BootstrapEmpty: false} + data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd}) + results := CheckIPFS(data) + expectStatus(t, results, "ipfs.bootstrap_empty", inspector.StatusWarn) +} + +func TestCheckIPFS_CrossNode_VersionConsistency(t *testing.T) { + nodes := map[string]*inspector.NodeData{} + for _, host := range []string{"1.1.1.1", "2.2.2.2", "3.3.3.3"} { + nd := makeNodeData(host, "node") + nd.IPFS = &inspector.IPFSData{DaemonActive: true, KuboVersion: "0.22.0", ClusterVersion: "1.0.8"} + nodes[host] = nd + } + data := makeCluster(nodes) + results := CheckIPFS(data) + expectStatus(t, results, "ipfs.kubo_version_consistent", inspector.StatusPass) + expectStatus(t, results, "ipfs.cluster_version_consistent", inspector.StatusPass) +} + +func TestCheckIPFS_CrossNode_VersionMismatch(t *testing.T) { + nodes := map[string]*inspector.NodeData{} + versions := []string{"0.22.0", "0.22.0", "0.21.0"} + for i, host := range []string{"1.1.1.1", "2.2.2.2", "3.3.3.3"} { + nd := makeNodeData(host, "node") + nd.IPFS = &inspector.IPFSData{DaemonActive: true, KuboVersion: versions[i]} + nodes[host] = nd + } + 
data := makeCluster(nodes) + results := CheckIPFS(data) + expectStatus(t, results, "ipfs.kubo_version_consistent", inspector.StatusWarn) +} + +func TestCheckIPFS_NilData(t *testing.T) { + nd := makeNodeData("1.1.1.1", "node") + data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd}) + results := CheckIPFS(data) + if len(results) != 0 { + t.Errorf("expected 0 results for nil IPFS data, got %d", len(results)) + } +} diff --git a/pkg/inspector/checks/namespace.go b/pkg/inspector/checks/namespace.go new file mode 100644 index 0000000..e3173b0 --- /dev/null +++ b/pkg/inspector/checks/namespace.go @@ -0,0 +1,155 @@ +package checks + +import ( + "fmt" + + "github.com/DeBrosOfficial/network/pkg/inspector" +) + +func init() { + inspector.RegisterChecker("namespace", CheckNamespace) +} + +const nsSub = "namespace" + +// CheckNamespace runs all namespace-level health checks. +func CheckNamespace(data *inspector.ClusterData) []inspector.CheckResult { + var results []inspector.CheckResult + + for _, nd := range data.Nodes { + if len(nd.Namespaces) == 0 { + continue + } + results = append(results, checkNamespacesPerNode(nd)...) + } + + results = append(results, checkNamespacesCrossNode(data)...) 
+ + return results +} + +func checkNamespacesPerNode(nd *inspector.NodeData) []inspector.CheckResult { + var r []inspector.CheckResult + node := nd.Node.Name() + + for _, ns := range nd.Namespaces { + prefix := fmt.Sprintf("ns.%s", ns.Name) + + // RQLite health + if ns.RQLiteUp { + r = append(r, inspector.Pass(prefix+".rqlite_up", fmt.Sprintf("Namespace %s RQLite responding", ns.Name), nsSub, node, + fmt.Sprintf("port_base=%d state=%s", ns.PortBase, ns.RQLiteState), inspector.Critical)) + } else { + r = append(r, inspector.Fail(prefix+".rqlite_up", fmt.Sprintf("Namespace %s RQLite responding", ns.Name), nsSub, node, + fmt.Sprintf("port_base=%d not responding", ns.PortBase), inspector.Critical)) + } + + // RQLite Raft state + if ns.RQLiteUp { + switch ns.RQLiteState { + case "Leader", "Follower": + r = append(r, inspector.Pass(prefix+".rqlite_state", fmt.Sprintf("Namespace %s RQLite raft state valid", ns.Name), nsSub, node, + fmt.Sprintf("state=%s", ns.RQLiteState), inspector.Critical)) + case "Candidate": + r = append(r, inspector.Warn(prefix+".rqlite_state", fmt.Sprintf("Namespace %s RQLite raft state valid", ns.Name), nsSub, node, + "state=Candidate (election in progress)", inspector.Critical)) + default: + r = append(r, inspector.Fail(prefix+".rqlite_state", fmt.Sprintf("Namespace %s RQLite raft state valid", ns.Name), nsSub, node, + fmt.Sprintf("state=%s", ns.RQLiteState), inspector.Critical)) + } + } + + // RQLite readiness + if ns.RQLiteReady { + r = append(r, inspector.Pass(prefix+".rqlite_ready", fmt.Sprintf("Namespace %s RQLite ready", ns.Name), nsSub, node, + "/readyz OK", inspector.Critical)) + } else if ns.RQLiteUp { + r = append(r, inspector.Fail(prefix+".rqlite_ready", fmt.Sprintf("Namespace %s RQLite ready", ns.Name), nsSub, node, + "/readyz failed", inspector.Critical)) + } + + // Olric health + if ns.OlricUp { + r = append(r, inspector.Pass(prefix+".olric_up", fmt.Sprintf("Namespace %s Olric port listening", ns.Name), nsSub, node, + "memberlist 
port bound", inspector.High)) + } else { + r = append(r, inspector.Fail(prefix+".olric_up", fmt.Sprintf("Namespace %s Olric port listening", ns.Name), nsSub, node, + "memberlist port not bound", inspector.High)) + } + + // Gateway health + if ns.GatewayUp { + r = append(r, inspector.Pass(prefix+".gateway_up", fmt.Sprintf("Namespace %s Gateway responding", ns.Name), nsSub, node, + fmt.Sprintf("HTTP status=%d", ns.GatewayStatus), inspector.High)) + } else { + r = append(r, inspector.Fail(prefix+".gateway_up", fmt.Sprintf("Namespace %s Gateway responding", ns.Name), nsSub, node, + fmt.Sprintf("HTTP status=%d", ns.GatewayStatus), inspector.High)) + } + } + + return r +} + +func checkNamespacesCrossNode(data *inspector.ClusterData) []inspector.CheckResult { + var r []inspector.CheckResult + + // Collect all namespace names across nodes + nsNodes := map[string]int{} // namespace name → count of nodes running it + nsHealthy := map[string]int{} // namespace name → count of nodes where all services are up + + for _, nd := range data.Nodes { + for _, ns := range nd.Namespaces { + nsNodes[ns.Name]++ + if ns.RQLiteUp && ns.OlricUp && ns.GatewayUp { + nsHealthy[ns.Name]++ + } + } + } + + for name, total := range nsNodes { + healthy := nsHealthy[name] + if healthy == total { + r = append(r, inspector.Pass( + fmt.Sprintf("ns.%s.all_healthy", name), + fmt.Sprintf("Namespace %s healthy on all nodes", name), + nsSub, "", + fmt.Sprintf("%d/%d nodes fully healthy", healthy, total), + inspector.Critical)) + } else { + r = append(r, inspector.Fail( + fmt.Sprintf("ns.%s.all_healthy", name), + fmt.Sprintf("Namespace %s healthy on all nodes", name), + nsSub, "", + fmt.Sprintf("%d/%d nodes fully healthy", healthy, total), + inspector.Critical)) + } + + // Check namespace has quorum (>= N/2+1 RQLite instances) + rqliteUp := 0 + for _, nd := range data.Nodes { + for _, ns := range nd.Namespaces { + if ns.Name == name && ns.RQLiteUp { + rqliteUp++ + } + } + } + quorumNeeded := total/2 + 1 + if 
rqliteUp >= quorumNeeded { + r = append(r, inspector.Pass( + fmt.Sprintf("ns.%s.quorum", name), + fmt.Sprintf("Namespace %s RQLite quorum", name), + nsSub, "", + fmt.Sprintf("rqlite_up=%d/%d quorum_needed=%d", rqliteUp, total, quorumNeeded), + inspector.Critical)) + } else { + r = append(r, inspector.Fail( + fmt.Sprintf("ns.%s.quorum", name), + fmt.Sprintf("Namespace %s RQLite quorum", name), + nsSub, "", + fmt.Sprintf("rqlite_up=%d/%d quorum_needed=%d (QUORUM LOST)", rqliteUp, total, quorumNeeded), + inspector.Critical)) + } + } + + return r +} diff --git a/pkg/inspector/checks/namespace_test.go b/pkg/inspector/checks/namespace_test.go new file mode 100644 index 0000000..fa51ddd --- /dev/null +++ b/pkg/inspector/checks/namespace_test.go @@ -0,0 +1,165 @@ +package checks + +import ( + "testing" + + "github.com/DeBrosOfficial/network/pkg/inspector" +) + +func TestCheckNamespace_PerNodeHealthy(t *testing.T) { + nd := makeNodeData("1.1.1.1", "node") + nd.Namespaces = []inspector.NamespaceData{ + { + Name: "myapp", + PortBase: 10000, + RQLiteUp: true, + RQLiteState: "Leader", + RQLiteReady: true, + OlricUp: true, + GatewayUp: true, + GatewayStatus: 200, + }, + } + + data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd}) + results := CheckNamespace(data) + + expectStatus(t, results, "ns.myapp.rqlite_up", inspector.StatusPass) + expectStatus(t, results, "ns.myapp.rqlite_state", inspector.StatusPass) + expectStatus(t, results, "ns.myapp.rqlite_ready", inspector.StatusPass) + expectStatus(t, results, "ns.myapp.olric_up", inspector.StatusPass) + expectStatus(t, results, "ns.myapp.gateway_up", inspector.StatusPass) +} + +func TestCheckNamespace_RQLiteDown(t *testing.T) { + nd := makeNodeData("1.1.1.1", "node") + nd.Namespaces = []inspector.NamespaceData{ + {Name: "myapp", PortBase: 10000, RQLiteUp: false}, + } + data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd}) + results := CheckNamespace(data) + expectStatus(t, results, "ns.myapp.rqlite_up", 
inspector.StatusFail) +} + +func TestCheckNamespace_RQLiteStates(t *testing.T) { + tests := []struct { + state string + status inspector.Status + }{ + {"Leader", inspector.StatusPass}, + {"Follower", inspector.StatusPass}, + {"Candidate", inspector.StatusWarn}, + {"Unknown", inspector.StatusFail}, + } + for _, tt := range tests { + t.Run(tt.state, func(t *testing.T) { + nd := makeNodeData("1.1.1.1", "node") + nd.Namespaces = []inspector.NamespaceData{ + {Name: "myapp", PortBase: 10000, RQLiteUp: true, RQLiteState: tt.state}, + } + data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd}) + results := CheckNamespace(data) + expectStatus(t, results, "ns.myapp.rqlite_state", tt.status) + }) + } +} + +func TestCheckNamespace_RQLiteNotReady(t *testing.T) { + nd := makeNodeData("1.1.1.1", "node") + nd.Namespaces = []inspector.NamespaceData{ + {Name: "myapp", PortBase: 10000, RQLiteUp: true, RQLiteState: "Follower", RQLiteReady: false}, + } + data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd}) + results := CheckNamespace(data) + expectStatus(t, results, "ns.myapp.rqlite_ready", inspector.StatusFail) +} + +func TestCheckNamespace_OlricDown(t *testing.T) { + nd := makeNodeData("1.1.1.1", "node") + nd.Namespaces = []inspector.NamespaceData{ + {Name: "myapp", OlricUp: false}, + } + data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd}) + results := CheckNamespace(data) + expectStatus(t, results, "ns.myapp.olric_up", inspector.StatusFail) +} + +func TestCheckNamespace_GatewayDown(t *testing.T) { + nd := makeNodeData("1.1.1.1", "node") + nd.Namespaces = []inspector.NamespaceData{ + {Name: "myapp", GatewayUp: false, GatewayStatus: 0}, + } + data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd}) + results := CheckNamespace(data) + expectStatus(t, results, "ns.myapp.gateway_up", inspector.StatusFail) +} + +func TestCheckNamespace_CrossNode_AllHealthy(t *testing.T) { + nodes := map[string]*inspector.NodeData{} + for _, host := range 
[]string{"1.1.1.1", "2.2.2.2", "3.3.3.3"} { + nd := makeNodeData(host, "node") + nd.Namespaces = []inspector.NamespaceData{ + {Name: "myapp", RQLiteUp: true, OlricUp: true, GatewayUp: true}, + } + nodes[host] = nd + } + data := makeCluster(nodes) + results := CheckNamespace(data) + expectStatus(t, results, "ns.myapp.all_healthy", inspector.StatusPass) + expectStatus(t, results, "ns.myapp.quorum", inspector.StatusPass) +} + +func TestCheckNamespace_CrossNode_PartialHealthy(t *testing.T) { + nodes := map[string]*inspector.NodeData{} + for i, host := range []string{"1.1.1.1", "2.2.2.2", "3.3.3.3"} { + nd := makeNodeData(host, "node") + nd.Namespaces = []inspector.NamespaceData{ + {Name: "myapp", RQLiteUp: true, OlricUp: i < 2, GatewayUp: true}, + } + nodes[host] = nd + } + data := makeCluster(nodes) + results := CheckNamespace(data) + expectStatus(t, results, "ns.myapp.all_healthy", inspector.StatusFail) + // Quorum should still pass (3/3 RQLite up, need 2) + expectStatus(t, results, "ns.myapp.quorum", inspector.StatusPass) +} + +func TestCheckNamespace_CrossNode_QuorumLost(t *testing.T) { + nodes := map[string]*inspector.NodeData{} + rqliteUp := []bool{true, false, false} + for i, host := range []string{"1.1.1.1", "2.2.2.2", "3.3.3.3"} { + nd := makeNodeData(host, "node") + nd.Namespaces = []inspector.NamespaceData{ + {Name: "myapp", RQLiteUp: rqliteUp[i], OlricUp: true, GatewayUp: true}, + } + nodes[host] = nd + } + data := makeCluster(nodes) + results := CheckNamespace(data) + expectStatus(t, results, "ns.myapp.quorum", inspector.StatusFail) +} + +func TestCheckNamespace_MultipleNamespaces(t *testing.T) { + nd := makeNodeData("1.1.1.1", "node") + nd.Namespaces = []inspector.NamespaceData{ + {Name: "app1", RQLiteUp: true, RQLiteState: "Leader", OlricUp: true, GatewayUp: true}, + {Name: "app2", RQLiteUp: false, OlricUp: true, GatewayUp: true}, + } + data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd}) + results := CheckNamespace(data) + + 
expectStatus(t, results, "ns.app1.rqlite_up", inspector.StatusPass) + expectStatus(t, results, "ns.app2.rqlite_up", inspector.StatusFail) +} + +func TestCheckNamespace_NoNamespaces(t *testing.T) { + nd := makeNodeData("1.1.1.1", "node") + nd.Namespaces = nil + data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd}) + results := CheckNamespace(data) + // No per-node results, only cross-node (which should be empty since no namespaces) + for _, r := range results { + t.Errorf("unexpected check: %s", r.ID) + } +} diff --git a/pkg/inspector/checks/network.go b/pkg/inspector/checks/network.go new file mode 100644 index 0000000..6fa8cbe --- /dev/null +++ b/pkg/inspector/checks/network.go @@ -0,0 +1,113 @@ +package checks + +import ( + "fmt" + + "github.com/DeBrosOfficial/network/pkg/inspector" +) + +func init() { + inspector.RegisterChecker("network", CheckNetwork) +} + +const networkSub = "network" + +// CheckNetwork runs all network-level health checks. +func CheckNetwork(data *inspector.ClusterData) []inspector.CheckResult { + var results []inspector.CheckResult + + for _, nd := range data.Nodes { + if nd.Network == nil { + continue + } + results = append(results, checkNetworkPerNode(nd)...) 
+ } + + return results +} + +func checkNetworkPerNode(nd *inspector.NodeData) []inspector.CheckResult { + var r []inspector.CheckResult + net := nd.Network + node := nd.Node.Name() + + // 7.2 Internet connectivity + if net.InternetReachable { + r = append(r, inspector.Pass("network.internet", "Internet reachable (ping 8.8.8.8)", networkSub, node, + "ping 8.8.8.8 succeeded", inspector.High)) + } else { + r = append(r, inspector.Fail("network.internet", "Internet reachable (ping 8.8.8.8)", networkSub, node, + "ping 8.8.8.8 failed", inspector.High)) + } + + // 7.14 Default route + if net.DefaultRoute { + r = append(r, inspector.Pass("network.default_route", "Default route exists", networkSub, node, + "default route present", inspector.Critical)) + } else { + r = append(r, inspector.Fail("network.default_route", "Default route exists", networkSub, node, + "no default route", inspector.Critical)) + } + + // 7.15 WG subnet route + if net.WGRouteExists { + r = append(r, inspector.Pass("network.wg_route", "WG subnet route exists", networkSub, node, + "10.0.0.0/24 via wg0 present", inspector.Critical)) + } else { + r = append(r, inspector.Fail("network.wg_route", "WG subnet route exists", networkSub, node, + "10.0.0.0/24 route via wg0 NOT found", inspector.Critical)) + } + + // 7.4 TCP connections + if net.TCPEstablished > 0 { + if net.TCPEstablished < 5000 { + r = append(r, inspector.Pass("network.tcp_established", "TCP connections reasonable", networkSub, node, + fmt.Sprintf("established=%d", net.TCPEstablished), inspector.Medium)) + } else { + r = append(r, inspector.Warn("network.tcp_established", "TCP connections reasonable", networkSub, node, + fmt.Sprintf("established=%d (high)", net.TCPEstablished), inspector.Medium)) + } + } + + // 7.6 TIME_WAIT + if net.TCPTimeWait < 10000 { + r = append(r, inspector.Pass("network.tcp_timewait", "TIME_WAIT count low", networkSub, node, + fmt.Sprintf("timewait=%d", net.TCPTimeWait), inspector.Medium)) + } else { + r = append(r, 
inspector.Warn("network.tcp_timewait", "TIME_WAIT count low", networkSub, node, + fmt.Sprintf("timewait=%d (accumulating)", net.TCPTimeWait), inspector.Medium)) + } + + // 7.8 TCP retransmission rate + if net.TCPRetransRate >= 0 { + if net.TCPRetransRate < 1 { + r = append(r, inspector.Pass("network.tcp_retrans", "TCP retransmission rate low", networkSub, node, + fmt.Sprintf("retrans=%.2f%%", net.TCPRetransRate), inspector.Medium)) + } else if net.TCPRetransRate < 5 { + r = append(r, inspector.Warn("network.tcp_retrans", "TCP retransmission rate low", networkSub, node, + fmt.Sprintf("retrans=%.2f%% (elevated)", net.TCPRetransRate), inspector.Medium)) + } else { + r = append(r, inspector.Fail("network.tcp_retrans", "TCP retransmission rate low", networkSub, node, + fmt.Sprintf("retrans=%.2f%% (high packet loss)", net.TCPRetransRate), inspector.High)) + } + } + + // 7.10 WG mesh peer pings (NxN connectivity) + if len(net.PingResults) > 0 { + failCount := 0 + for _, ok := range net.PingResults { + if !ok { + failCount++ + } + } + if failCount == 0 { + r = append(r, inspector.Pass("network.wg_mesh_ping", "All WG peers reachable via ping", networkSub, node, + fmt.Sprintf("%d/%d peers pingable", len(net.PingResults), len(net.PingResults)), inspector.Critical)) + } else { + r = append(r, inspector.Fail("network.wg_mesh_ping", "All WG peers reachable via ping", networkSub, node, + fmt.Sprintf("%d/%d peers unreachable", failCount, len(net.PingResults)), inspector.Critical)) + } + } + + return r +} diff --git a/pkg/inspector/checks/network_test.go b/pkg/inspector/checks/network_test.go new file mode 100644 index 0000000..cb6a902 --- /dev/null +++ b/pkg/inspector/checks/network_test.go @@ -0,0 +1,151 @@ +package checks + +import ( + "testing" + + "github.com/DeBrosOfficial/network/pkg/inspector" +) + +func TestCheckNetwork_HealthyNode(t *testing.T) { + nd := makeNodeData("1.1.1.1", "node") + nd.Network = &inspector.NetworkData{ + InternetReachable: true, + DefaultRoute: true, 
+ WGRouteExists: true, + TCPEstablished: 100, + TCPTimeWait: 50, + TCPRetransRate: 0.1, + PingResults: map[string]bool{"10.0.0.2": true, "10.0.0.3": true}, + } + + data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd}) + results := CheckNetwork(data) + + expectStatus(t, results, "network.internet", inspector.StatusPass) + expectStatus(t, results, "network.default_route", inspector.StatusPass) + expectStatus(t, results, "network.wg_route", inspector.StatusPass) + expectStatus(t, results, "network.tcp_established", inspector.StatusPass) + expectStatus(t, results, "network.tcp_timewait", inspector.StatusPass) + expectStatus(t, results, "network.tcp_retrans", inspector.StatusPass) + expectStatus(t, results, "network.wg_mesh_ping", inspector.StatusPass) +} + +func TestCheckNetwork_InternetUnreachable(t *testing.T) { + nd := makeNodeData("1.1.1.1", "node") + nd.Network = &inspector.NetworkData{InternetReachable: false} + data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd}) + results := CheckNetwork(data) + expectStatus(t, results, "network.internet", inspector.StatusFail) +} + +func TestCheckNetwork_MissingRoutes(t *testing.T) { + nd := makeNodeData("1.1.1.1", "node") + nd.Network = &inspector.NetworkData{DefaultRoute: false, WGRouteExists: false} + data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd}) + results := CheckNetwork(data) + expectStatus(t, results, "network.default_route", inspector.StatusFail) + expectStatus(t, results, "network.wg_route", inspector.StatusFail) +} + +func TestCheckNetwork_TCPConnections(t *testing.T) { + tests := []struct { + name string + estab int + status inspector.Status + }{ + {"normal", 100, inspector.StatusPass}, + {"high", 6000, inspector.StatusWarn}, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + nd := makeNodeData("1.1.1.1", "node") + nd.Network = &inspector.NetworkData{TCPEstablished: tt.estab} + data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd}) + 
results := CheckNetwork(data) + expectStatus(t, results, "network.tcp_established", tt.status) + }) + } +} + +func TestCheckNetwork_TCPTimeWait(t *testing.T) { + tests := []struct { + name string + tw int + status inspector.Status + }{ + {"normal", 50, inspector.StatusPass}, + {"high", 15000, inspector.StatusWarn}, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + nd := makeNodeData("1.1.1.1", "node") + nd.Network = &inspector.NetworkData{TCPTimeWait: tt.tw} + data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd}) + results := CheckNetwork(data) + expectStatus(t, results, "network.tcp_timewait", tt.status) + }) + } +} + +func TestCheckNetwork_TCPRetransmission(t *testing.T) { + tests := []struct { + name string + rate float64 + status inspector.Status + }{ + {"low", 0.1, inspector.StatusPass}, + {"elevated", 3.0, inspector.StatusWarn}, + {"high", 8.0, inspector.StatusFail}, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + nd := makeNodeData("1.1.1.1", "node") + nd.Network = &inspector.NetworkData{TCPRetransRate: tt.rate} + data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd}) + results := CheckNetwork(data) + expectStatus(t, results, "network.tcp_retrans", tt.status) + }) + } +} + +func TestCheckNetwork_WGMeshPing(t *testing.T) { + t.Run("all ok", func(t *testing.T) { + nd := makeNodeData("1.1.1.1", "node") + nd.Network = &inspector.NetworkData{ + PingResults: map[string]bool{"10.0.0.2": true, "10.0.0.3": true}, + } + data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd}) + results := CheckNetwork(data) + expectStatus(t, results, "network.wg_mesh_ping", inspector.StatusPass) + }) + + t.Run("some fail", func(t *testing.T) { + nd := makeNodeData("1.1.1.1", "node") + nd.Network = &inspector.NetworkData{ + PingResults: map[string]bool{"10.0.0.2": true, "10.0.0.3": false}, + } + data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd}) + results := CheckNetwork(data) + 
expectStatus(t, results, "network.wg_mesh_ping", inspector.StatusFail) + }) + + t.Run("no pings", func(t *testing.T) { + nd := makeNodeData("1.1.1.1", "node") + nd.Network = &inspector.NetworkData{PingResults: map[string]bool{}} + data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd}) + results := CheckNetwork(data) + // No ping results → no wg_mesh_ping check + if findCheck(results, "network.wg_mesh_ping") != nil { + t.Error("should not emit wg_mesh_ping when no ping results") + } + }) +} + +func TestCheckNetwork_NilData(t *testing.T) { + nd := makeNodeData("1.1.1.1", "node") + data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd}) + results := CheckNetwork(data) + if len(results) != 0 { + t.Errorf("expected 0 results for nil Network data, got %d", len(results)) + } +} diff --git a/pkg/inspector/checks/olric.go b/pkg/inspector/checks/olric.go new file mode 100644 index 0000000..08eb6a0 --- /dev/null +++ b/pkg/inspector/checks/olric.go @@ -0,0 +1,157 @@ +package checks + +import ( + "fmt" + + "github.com/DeBrosOfficial/network/pkg/inspector" +) + +func init() { + inspector.RegisterChecker("olric", CheckOlric) +} + +const olricSub = "olric" + +// CheckOlric runs all Olric health checks against cluster data. +func CheckOlric(data *inspector.ClusterData) []inspector.CheckResult { + var results []inspector.CheckResult + + for _, nd := range data.Nodes { + if nd.Olric == nil { + continue + } + results = append(results, checkOlricPerNode(nd)...) + } + + results = append(results, checkOlricCrossNode(data)...) 
+ + return results +} + +func checkOlricPerNode(nd *inspector.NodeData) []inspector.CheckResult { + var r []inspector.CheckResult + ol := nd.Olric + node := nd.Node.Name() + + // 2.1 Service active + if ol.ServiceActive { + r = append(r, inspector.Pass("olric.service_active", "Olric service active", olricSub, node, + "debros-olric is active", inspector.Critical)) + } else { + r = append(r, inspector.Fail("olric.service_active", "Olric service active", olricSub, node, + "debros-olric is not active", inspector.Critical)) + return r + } + + // 2.7 Memberlist port accepting connections + if ol.MemberlistUp { + r = append(r, inspector.Pass("olric.memberlist_port", "Memberlist port 3322 listening", olricSub, node, + "TCP 3322 is bound", inspector.Critical)) + } else { + r = append(r, inspector.Fail("olric.memberlist_port", "Memberlist port 3322 listening", olricSub, node, + "TCP 3322 is not listening", inspector.Critical)) + } + + // 2.3 Restart count + if ol.RestartCount == 0 { + r = append(r, inspector.Pass("olric.restarts", "Low restart count", olricSub, node, + "NRestarts=0", inspector.High)) + } else if ol.RestartCount <= 3 { + r = append(r, inspector.Warn("olric.restarts", "Low restart count", olricSub, node, + fmt.Sprintf("NRestarts=%d", ol.RestartCount), inspector.High)) + } else { + r = append(r, inspector.Fail("olric.restarts", "Low restart count", olricSub, node, + fmt.Sprintf("NRestarts=%d (crash-looping?)", ol.RestartCount), inspector.High)) + } + + // 2.4 Process memory + if ol.ProcessMemMB > 0 { + if ol.ProcessMemMB < 200 { + r = append(r, inspector.Pass("olric.memory", "Process memory healthy", olricSub, node, + fmt.Sprintf("RSS=%dMB", ol.ProcessMemMB), inspector.Medium)) + } else if ol.ProcessMemMB < 500 { + r = append(r, inspector.Warn("olric.memory", "Process memory healthy", olricSub, node, + fmt.Sprintf("RSS=%dMB (elevated)", ol.ProcessMemMB), inspector.Medium)) + } else { + r = append(r, inspector.Fail("olric.memory", "Process memory healthy", 
olricSub, node, + fmt.Sprintf("RSS=%dMB (high)", ol.ProcessMemMB), inspector.High)) + } + } + + // 2.9-2.11 Log analysis: suspects + if ol.LogSuspects == 0 { + r = append(r, inspector.Pass("olric.log_suspects", "No suspect/failed members in logs", olricSub, node, + "no suspect messages in last hour", inspector.Critical)) + } else { + r = append(r, inspector.Fail("olric.log_suspects", "No suspect/failed members in logs", olricSub, node, + fmt.Sprintf("%d suspect/failed messages in last hour", ol.LogSuspects), inspector.Critical)) + } + + // 2.13 Flapping detection + if ol.LogFlapping < 5 { + r = append(r, inspector.Pass("olric.log_flapping", "No rapid join/leave cycles", olricSub, node, + fmt.Sprintf("join/leave events=%d in last hour", ol.LogFlapping), inspector.High)) + } else { + r = append(r, inspector.Warn("olric.log_flapping", "No rapid join/leave cycles", olricSub, node, + fmt.Sprintf("join/leave events=%d in last hour (flapping?)", ol.LogFlapping), inspector.High)) + } + + // 2.39 Log error rate + if ol.LogErrors < 5 { + r = append(r, inspector.Pass("olric.log_errors", "Log error rate low", olricSub, node, + fmt.Sprintf("errors=%d in last hour", ol.LogErrors), inspector.High)) + } else if ol.LogErrors < 20 { + r = append(r, inspector.Warn("olric.log_errors", "Log error rate low", olricSub, node, + fmt.Sprintf("errors=%d in last hour", ol.LogErrors), inspector.High)) + } else { + r = append(r, inspector.Fail("olric.log_errors", "Log error rate low", olricSub, node, + fmt.Sprintf("errors=%d in last hour (high)", ol.LogErrors), inspector.High)) + } + + return r +} + +func checkOlricCrossNode(data *inspector.ClusterData) []inspector.CheckResult { + var r []inspector.CheckResult + + activeCount := 0 + memberlistCount := 0 + totalNodes := 0 + + for _, nd := range data.Nodes { + if nd.Olric == nil { + continue + } + totalNodes++ + if nd.Olric.ServiceActive { + activeCount++ + } + if nd.Olric.MemberlistUp { + memberlistCount++ + } + } + + if totalNodes < 2 { + 
return r + } + + // All nodes have Olric running + if activeCount == totalNodes { + r = append(r, inspector.Pass("olric.all_active", "All nodes running Olric", olricSub, "", + fmt.Sprintf("%d/%d nodes active", activeCount, totalNodes), inspector.Critical)) + } else { + r = append(r, inspector.Fail("olric.all_active", "All nodes running Olric", olricSub, "", + fmt.Sprintf("%d/%d nodes active", activeCount, totalNodes), inspector.Critical)) + } + + // All memberlist ports up + if memberlistCount == totalNodes { + r = append(r, inspector.Pass("olric.all_memberlist", "All memberlist ports listening", olricSub, "", + fmt.Sprintf("%d/%d nodes with memberlist", memberlistCount, totalNodes), inspector.High)) + } else { + r = append(r, inspector.Warn("olric.all_memberlist", "All memberlist ports listening", olricSub, "", + fmt.Sprintf("%d/%d nodes with memberlist", memberlistCount, totalNodes), inspector.High)) + } + + return r +} diff --git a/pkg/inspector/checks/olric_test.go b/pkg/inspector/checks/olric_test.go new file mode 100644 index 0000000..1cf55ae --- /dev/null +++ b/pkg/inspector/checks/olric_test.go @@ -0,0 +1,149 @@ +package checks + +import ( + "testing" + + "github.com/DeBrosOfficial/network/pkg/inspector" +) + +func TestCheckOlric_ServiceInactive(t *testing.T) { + nd := makeNodeData("1.1.1.1", "node") + nd.Olric = &inspector.OlricData{ServiceActive: false} + + data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd}) + results := CheckOlric(data) + + expectStatus(t, results, "olric.service_active", inspector.StatusFail) + // Should return early — no further per-node checks + if findCheck(results, "olric.memberlist_port") != nil { + t.Error("should not check memberlist when service inactive") + } +} + +func TestCheckOlric_HealthyNode(t *testing.T) { + nd := makeNodeData("1.1.1.1", "node") + nd.Olric = &inspector.OlricData{ + ServiceActive: true, + MemberlistUp: true, + RestartCount: 0, + ProcessMemMB: 100, + LogSuspects: 0, + LogFlapping: 0, + 
LogErrors: 0, + } + + data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd}) + results := CheckOlric(data) + + expectStatus(t, results, "olric.service_active", inspector.StatusPass) + expectStatus(t, results, "olric.memberlist_port", inspector.StatusPass) + expectStatus(t, results, "olric.restarts", inspector.StatusPass) + expectStatus(t, results, "olric.log_suspects", inspector.StatusPass) + expectStatus(t, results, "olric.log_flapping", inspector.StatusPass) + expectStatus(t, results, "olric.log_errors", inspector.StatusPass) +} + +func TestCheckOlric_RestartCounts(t *testing.T) { + tests := []struct { + name string + restarts int + status inspector.Status + }{ + {"zero", 0, inspector.StatusPass}, + {"few", 2, inspector.StatusWarn}, + {"many", 5, inspector.StatusFail}, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + nd := makeNodeData("1.1.1.1", "node") + nd.Olric = &inspector.OlricData{ServiceActive: true, RestartCount: tt.restarts} + data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd}) + results := CheckOlric(data) + expectStatus(t, results, "olric.restarts", tt.status) + }) + } +} + +func TestCheckOlric_Memory(t *testing.T) { + tests := []struct { + name string + memMB int + status inspector.Status + }{ + {"healthy", 100, inspector.StatusPass}, + {"elevated", 300, inspector.StatusWarn}, + {"high", 600, inspector.StatusFail}, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + nd := makeNodeData("1.1.1.1", "node") + nd.Olric = &inspector.OlricData{ServiceActive: true, ProcessMemMB: tt.memMB} + data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd}) + results := CheckOlric(data) + expectStatus(t, results, "olric.memory", tt.status) + }) + } +} + +func TestCheckOlric_LogSuspects(t *testing.T) { + nd := makeNodeData("1.1.1.1", "node") + nd.Olric = &inspector.OlricData{ServiceActive: true, LogSuspects: 5} + data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd}) + results 
:= CheckOlric(data) + expectStatus(t, results, "olric.log_suspects", inspector.StatusFail) +} + +func TestCheckOlric_LogErrors(t *testing.T) { + tests := []struct { + name string + errors int + status inspector.Status + }{ + {"none", 0, inspector.StatusPass}, + {"few", 10, inspector.StatusWarn}, + {"many", 30, inspector.StatusFail}, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + nd := makeNodeData("1.1.1.1", "node") + nd.Olric = &inspector.OlricData{ServiceActive: true, LogErrors: tt.errors} + data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd}) + results := CheckOlric(data) + expectStatus(t, results, "olric.log_errors", tt.status) + }) + } +} + +func TestCheckOlric_CrossNode_AllActive(t *testing.T) { + nodes := map[string]*inspector.NodeData{} + for _, host := range []string{"1.1.1.1", "2.2.2.2", "3.3.3.3"} { + nd := makeNodeData(host, "node") + nd.Olric = &inspector.OlricData{ServiceActive: true, MemberlistUp: true} + nodes[host] = nd + } + data := makeCluster(nodes) + results := CheckOlric(data) + expectStatus(t, results, "olric.all_active", inspector.StatusPass) + expectStatus(t, results, "olric.all_memberlist", inspector.StatusPass) +} + +func TestCheckOlric_CrossNode_PartialActive(t *testing.T) { + nodes := map[string]*inspector.NodeData{} + for i, host := range []string{"1.1.1.1", "2.2.2.2", "3.3.3.3"} { + nd := makeNodeData(host, "node") + nd.Olric = &inspector.OlricData{ServiceActive: i < 2, MemberlistUp: i < 2} + nodes[host] = nd + } + data := makeCluster(nodes) + results := CheckOlric(data) + expectStatus(t, results, "olric.all_active", inspector.StatusFail) +} + +func TestCheckOlric_NilData(t *testing.T) { + nd := makeNodeData("1.1.1.1", "node") + data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd}) + results := CheckOlric(data) + if len(results) != 0 { + t.Errorf("expected 0 results for nil Olric data, got %d", len(results)) + } +} diff --git a/pkg/inspector/checks/rqlite.go 
b/pkg/inspector/checks/rqlite.go new file mode 100644 index 0000000..b54691e --- /dev/null +++ b/pkg/inspector/checks/rqlite.go @@ -0,0 +1,533 @@ +package checks + +import ( + "fmt" + "math" + "strings" + + "github.com/DeBrosOfficial/network/pkg/inspector" +) + +func init() { + inspector.RegisterChecker("rqlite", CheckRQLite) +} + +const rqliteSub = "rqlite" + +// CheckRQLite runs all RQLite health checks against cluster data. +func CheckRQLite(data *inspector.ClusterData) []inspector.CheckResult { + var results []inspector.CheckResult + + // Per-node checks + for _, nd := range data.Nodes { + if nd.RQLite == nil { + continue + } + results = append(results, checkRQLitePerNode(nd, data)...) + } + + // Cross-node checks + results = append(results, checkRQLiteCrossNode(data)...) + + return results +} + +func checkRQLitePerNode(nd *inspector.NodeData, data *inspector.ClusterData) []inspector.CheckResult { + var r []inspector.CheckResult + rq := nd.RQLite + node := nd.Node.Name() + + // 1.2 HTTP endpoint responsive + if !rq.Responsive { + r = append(r, inspector.Fail("rqlite.responsive", "RQLite HTTP endpoint responsive", rqliteSub, node, + "curl localhost:5001/status failed or returned error", inspector.Critical)) + return r + } + r = append(r, inspector.Pass("rqlite.responsive", "RQLite HTTP endpoint responsive", rqliteSub, node, + "responding on port 5001", inspector.Critical)) + + // 1.3 Full readiness (/readyz) + if rq.Readyz != nil { + if rq.Readyz.Ready { + r = append(r, inspector.Pass("rqlite.readyz", "Full readiness check", rqliteSub, node, + "node, leader, store all ready", inspector.Critical)) + } else { + var parts []string + if rq.Readyz.Node != "ready" { + parts = append(parts, "node: "+rq.Readyz.Node) + } + if rq.Readyz.Leader != "ready" { + parts = append(parts, "leader: "+rq.Readyz.Leader) + } + if rq.Readyz.Store != "ready" { + parts = append(parts, "store: "+rq.Readyz.Store) + } + r = append(r, inspector.Fail("rqlite.readyz", "Full readiness check", 
rqliteSub, node, + "not ready: "+strings.Join(parts, ", "), inspector.Critical)) + } + } + + s := rq.Status + if s == nil { + r = append(r, inspector.Skip("rqlite.status_parsed", "Status JSON parseable", rqliteSub, node, + "could not parse /status response", inspector.Critical)) + return r + } + + // 1.5 Raft state valid + switch s.RaftState { + case "Leader", "Follower": + r = append(r, inspector.Pass("rqlite.raft_state", "Raft state valid", rqliteSub, node, + fmt.Sprintf("state=%s", s.RaftState), inspector.Critical)) + case "Candidate": + r = append(r, inspector.Warn("rqlite.raft_state", "Raft state valid", rqliteSub, node, + "state=Candidate (election in progress)", inspector.Critical)) + case "Shutdown": + r = append(r, inspector.Fail("rqlite.raft_state", "Raft state valid", rqliteSub, node, + "state=Shutdown", inspector.Critical)) + default: + r = append(r, inspector.Fail("rqlite.raft_state", "Raft state valid", rqliteSub, node, + fmt.Sprintf("unexpected state=%q", s.RaftState), inspector.Critical)) + } + + // 1.7 Leader identity known + if s.LeaderNodeID == "" { + r = append(r, inspector.Fail("rqlite.leader_known", "Leader identity known", rqliteSub, node, + "leader node_id is empty", inspector.Critical)) + } else { + r = append(r, inspector.Pass("rqlite.leader_known", "Leader identity known", rqliteSub, node, + fmt.Sprintf("leader=%s", s.LeaderNodeID), inspector.Critical)) + } + + // 1.8 Voter status + if s.Voter { + r = append(r, inspector.Pass("rqlite.voter", "Node is voter", rqliteSub, node, + "voter=true", inspector.Low)) + } else { + r = append(r, inspector.Warn("rqlite.voter", "Node is voter", rqliteSub, node, + "voter=false (non-voter)", inspector.Low)) + } + + // 1.9 Num peers — use the node's own /nodes endpoint to determine cluster size + // (not config file, since not all config nodes are necessarily in the Raft cluster) + if rq.Nodes != nil && len(rq.Nodes) > 0 { + expectedPeers := len(rq.Nodes) - 1 // cluster members minus self + if 
expectedPeers < 0 { + expectedPeers = 0 + } + if s.NumPeers == expectedPeers { + r = append(r, inspector.Pass("rqlite.num_peers", "Peer count matches cluster size", rqliteSub, node, + fmt.Sprintf("peers=%d (cluster has %d nodes)", s.NumPeers, len(rq.Nodes)), inspector.Critical)) + } else { + r = append(r, inspector.Warn("rqlite.num_peers", "Peer count matches cluster size", rqliteSub, node, + fmt.Sprintf("peers=%d but /nodes reports %d members", s.NumPeers, len(rq.Nodes)), inspector.High)) + } + } else { + r = append(r, inspector.Pass("rqlite.num_peers", "Peer count reported", rqliteSub, node, + fmt.Sprintf("peers=%d", s.NumPeers), inspector.Medium)) + } + + // 1.11 Commit index vs applied index + if s.CommitIndex > 0 && s.AppliedIndex > 0 { + gap := s.CommitIndex - s.AppliedIndex + if s.AppliedIndex > s.CommitIndex { + gap = 0 + } + if gap <= 2 { + r = append(r, inspector.Pass("rqlite.commit_applied_gap", "Commit/applied index close", rqliteSub, node, + fmt.Sprintf("commit=%d applied=%d gap=%d", s.CommitIndex, s.AppliedIndex, gap), inspector.Critical)) + } else if gap <= 100 { + r = append(r, inspector.Warn("rqlite.commit_applied_gap", "Commit/applied index close", rqliteSub, node, + fmt.Sprintf("commit=%d applied=%d gap=%d (lagging)", s.CommitIndex, s.AppliedIndex, gap), inspector.Critical)) + } else { + r = append(r, inspector.Fail("rqlite.commit_applied_gap", "Commit/applied index close", rqliteSub, node, + fmt.Sprintf("commit=%d applied=%d gap=%d (severely behind)", s.CommitIndex, s.AppliedIndex, gap), inspector.Critical)) + } + } + + // 1.12 FSM pending + if s.FsmPending == 0 { + r = append(r, inspector.Pass("rqlite.fsm_pending", "FSM pending queue empty", rqliteSub, node, + "fsm_pending=0", inspector.High)) + } else if s.FsmPending <= 10 { + r = append(r, inspector.Warn("rqlite.fsm_pending", "FSM pending queue empty", rqliteSub, node, + fmt.Sprintf("fsm_pending=%d", s.FsmPending), inspector.High)) + } else { + r = append(r, 
inspector.Fail("rqlite.fsm_pending", "FSM pending queue empty", rqliteSub, node, + fmt.Sprintf("fsm_pending=%d (backlog)", s.FsmPending), inspector.High)) + } + + // 1.13 Last contact (followers only) + if s.RaftState == "Follower" && s.LastContact != "" { + r = append(r, inspector.Pass("rqlite.last_contact", "Follower last contact recent", rqliteSub, node, + fmt.Sprintf("last_contact=%s", s.LastContact), inspector.Critical)) + } + + // 1.14 Last log term matches current term + if s.LastLogTerm > 0 && s.Term > 0 { + if s.LastLogTerm == s.Term { + r = append(r, inspector.Pass("rqlite.log_term_match", "Last log term matches current", rqliteSub, node, + fmt.Sprintf("term=%d last_log_term=%d", s.Term, s.LastLogTerm), inspector.Medium)) + } else { + r = append(r, inspector.Warn("rqlite.log_term_match", "Last log term matches current", rqliteSub, node, + fmt.Sprintf("term=%d last_log_term=%d (mismatch)", s.Term, s.LastLogTerm), inspector.Medium)) + } + } + + // 1.15 db_applied_index == fsm_index + if s.DBAppliedIndex > 0 && s.FsmIndex > 0 { + if s.DBAppliedIndex == s.FsmIndex { + r = append(r, inspector.Pass("rqlite.db_fsm_sync", "DB applied index matches FSM index", rqliteSub, node, + fmt.Sprintf("db_applied=%d fsm=%d", s.DBAppliedIndex, s.FsmIndex), inspector.Critical)) + } else { + r = append(r, inspector.Fail("rqlite.db_fsm_sync", "DB applied index matches FSM index", rqliteSub, node, + fmt.Sprintf("db_applied=%d fsm=%d (diverged)", s.DBAppliedIndex, s.FsmIndex), inspector.Critical)) + } + } + + // 1.18 Last snapshot index close to applied + if s.LastSnapshot > 0 && s.AppliedIndex > 0 { + gap := s.AppliedIndex - s.LastSnapshot + if s.LastSnapshot > s.AppliedIndex { + gap = 0 + } + if gap < 10000 { + r = append(r, inspector.Pass("rqlite.snapshot_recent", "Snapshot recent", rqliteSub, node, + fmt.Sprintf("snapshot_index=%d applied=%d gap=%d", s.LastSnapshot, s.AppliedIndex, gap), inspector.Medium)) + } else { + r = append(r, inspector.Warn("rqlite.snapshot_recent", 
"Snapshot recent", rqliteSub, node, + fmt.Sprintf("snapshot_index=%d applied=%d gap=%d (old snapshot)", s.LastSnapshot, s.AppliedIndex, gap), inspector.Medium)) + } + } + + // 1.19 At least 1 snapshot exists + if s.LastSnapshot > 0 { + r = append(r, inspector.Pass("rqlite.has_snapshot", "At least one snapshot exists", rqliteSub, node, + fmt.Sprintf("last_snapshot_index=%d", s.LastSnapshot), inspector.Medium)) + } else { + r = append(r, inspector.Warn("rqlite.has_snapshot", "At least one snapshot exists", rqliteSub, node, + "no snapshots found", inspector.Medium)) + } + + // 1.27 Database size + if s.DBSizeFriendly != "" { + r = append(r, inspector.Pass("rqlite.db_size", "Database size reported", rqliteSub, node, + fmt.Sprintf("db_size=%s", s.DBSizeFriendly), inspector.Low)) + } + + // 1.31 Goroutine count + if s.Goroutines > 0 { + if s.Goroutines < 200 { + r = append(r, inspector.Pass("rqlite.goroutines", "Goroutine count healthy", rqliteSub, node, + fmt.Sprintf("goroutines=%d", s.Goroutines), inspector.Medium)) + } else if s.Goroutines < 1000 { + r = append(r, inspector.Warn("rqlite.goroutines", "Goroutine count healthy", rqliteSub, node, + fmt.Sprintf("goroutines=%d (elevated)", s.Goroutines), inspector.Medium)) + } else { + r = append(r, inspector.Fail("rqlite.goroutines", "Goroutine count healthy", rqliteSub, node, + fmt.Sprintf("goroutines=%d (high)", s.Goroutines), inspector.High)) + } + } + + // 1.32 Memory (HeapAlloc) + if s.HeapAlloc > 0 { + mb := s.HeapAlloc / (1024 * 1024) + if mb < 500 { + r = append(r, inspector.Pass("rqlite.memory", "Memory usage healthy", rqliteSub, node, + fmt.Sprintf("heap=%dMB", mb), inspector.Medium)) + } else if mb < 1000 { + r = append(r, inspector.Warn("rqlite.memory", "Memory usage healthy", rqliteSub, node, + fmt.Sprintf("heap=%dMB (elevated)", mb), inspector.Medium)) + } else { + r = append(r, inspector.Fail("rqlite.memory", "Memory usage healthy", rqliteSub, node, + fmt.Sprintf("heap=%dMB (high)", mb), inspector.High)) + } 
+ } + + // 1.35 Version reported + if s.Version != "" { + r = append(r, inspector.Pass("rqlite.version", "Version reported", rqliteSub, node, + fmt.Sprintf("version=%s", s.Version), inspector.Low)) + } + + // Node reachability from /nodes endpoint + if rq.Nodes != nil { + unreachable := 0 + for addr, n := range rq.Nodes { + if !n.Reachable { + unreachable++ + r = append(r, inspector.Fail("rqlite.node_reachable", "Cluster node reachable", rqliteSub, node, + fmt.Sprintf("%s is unreachable from this node", addr), inspector.Critical)) + } + } + if unreachable == 0 { + r = append(r, inspector.Pass("rqlite.all_reachable", "All cluster nodes reachable", rqliteSub, node, + fmt.Sprintf("all %d nodes reachable", len(rq.Nodes)), inspector.Critical)) + } + } + + // 1.46 Strong read test + if rq.StrongRead { + r = append(r, inspector.Pass("rqlite.strong_read", "Strong read succeeds", rqliteSub, node, + "SELECT 1 at level=strong OK", inspector.Critical)) + } else if rq.Responsive { + r = append(r, inspector.Fail("rqlite.strong_read", "Strong read succeeds", rqliteSub, node, + "SELECT 1 at level=strong failed", inspector.Critical)) + } + + // Debug vars checks + if dv := rq.DebugVars; dv != nil { + // 1.28 Query errors + if dv.QueryErrors == 0 { + r = append(r, inspector.Pass("rqlite.query_errors", "No query errors", rqliteSub, node, + "query_errors=0", inspector.High)) + } else { + r = append(r, inspector.Warn("rqlite.query_errors", "No query errors", rqliteSub, node, + fmt.Sprintf("query_errors=%d", dv.QueryErrors), inspector.High)) + } + + // 1.29 Execute errors + if dv.ExecuteErrors == 0 { + r = append(r, inspector.Pass("rqlite.execute_errors", "No execute errors", rqliteSub, node, + "execute_errors=0", inspector.High)) + } else { + r = append(r, inspector.Warn("rqlite.execute_errors", "No execute errors", rqliteSub, node, + fmt.Sprintf("execute_errors=%d", dv.ExecuteErrors), inspector.High)) + } + + // 1.30 Leader not found events + if dv.LeaderNotFound == 0 { + r = 
append(r, inspector.Pass("rqlite.leader_not_found", "No leader-not-found events", rqliteSub, node, + "leader_not_found=0", inspector.Critical)) + } else { + r = append(r, inspector.Fail("rqlite.leader_not_found", "No leader-not-found events", rqliteSub, node, + fmt.Sprintf("leader_not_found=%d", dv.LeaderNotFound), inspector.Critical)) + } + + // Snapshot errors + if dv.SnapshotErrors == 0 { + r = append(r, inspector.Pass("rqlite.snapshot_errors", "No snapshot errors", rqliteSub, node, + "snapshot_errors=0", inspector.High)) + } else { + r = append(r, inspector.Fail("rqlite.snapshot_errors", "No snapshot errors", rqliteSub, node, + fmt.Sprintf("snapshot_errors=%d", dv.SnapshotErrors), inspector.High)) + } + + // Client retries/timeouts + if dv.ClientRetries == 0 && dv.ClientTimeouts == 0 { + r = append(r, inspector.Pass("rqlite.client_health", "No client retries or timeouts", rqliteSub, node, + "retries=0 timeouts=0", inspector.Medium)) + } else { + r = append(r, inspector.Warn("rqlite.client_health", "No client retries or timeouts", rqliteSub, node, + fmt.Sprintf("retries=%d timeouts=%d", dv.ClientRetries, dv.ClientTimeouts), inspector.Medium)) + } + } + + return r +} + +func checkRQLiteCrossNode(data *inspector.ClusterData) []inspector.CheckResult { + var r []inspector.CheckResult + + type nodeInfo struct { + host string + name string + status *inspector.RQLiteStatus + } + var nodes []nodeInfo + for host, nd := range data.Nodes { + if nd.RQLite != nil && nd.RQLite.Status != nil { + nodes = append(nodes, nodeInfo{host: host, name: nd.Node.Name(), status: nd.RQLite.Status}) + } + } + + if len(nodes) < 2 { + r = append(r, inspector.Skip("rqlite.cross_node", "Cross-node checks", rqliteSub, "", + fmt.Sprintf("only %d node(s) with RQLite data, need >=2", len(nodes)), inspector.Critical)) + return r + } + + // 1.5 Exactly one leader + leaders := 0 + var leaderName string + for _, n := range nodes { + if n.status.RaftState == "Leader" { + leaders++ + leaderName = n.name 
+ } + } + switch leaders { + case 1: + r = append(r, inspector.Pass("rqlite.single_leader", "Exactly one leader in cluster", rqliteSub, "", + fmt.Sprintf("leader=%s", leaderName), inspector.Critical)) + case 0: + r = append(r, inspector.Fail("rqlite.single_leader", "Exactly one leader in cluster", rqliteSub, "", + "no leader found", inspector.Critical)) + default: + r = append(r, inspector.Fail("rqlite.single_leader", "Exactly one leader in cluster", rqliteSub, "", + fmt.Sprintf("found %d leaders (split brain!)", leaders), inspector.Critical)) + } + + // 1.6 Term consistency + terms := map[uint64][]string{} + for _, n := range nodes { + terms[n.status.Term] = append(terms[n.status.Term], n.name) + } + if len(terms) == 1 { + for t := range terms { + r = append(r, inspector.Pass("rqlite.term_consistent", "All nodes same Raft term", rqliteSub, "", + fmt.Sprintf("term=%d across %d nodes", t, len(nodes)), inspector.Critical)) + } + } else { + var parts []string + for t, names := range terms { + parts = append(parts, fmt.Sprintf("term=%d: %s", t, strings.Join(names, ","))) + } + r = append(r, inspector.Fail("rqlite.term_consistent", "All nodes same Raft term", rqliteSub, "", + "term divergence: "+strings.Join(parts, "; "), inspector.Critical)) + } + + // 1.36 All nodes agree on same leader + leaderIDs := map[string][]string{} + for _, n := range nodes { + leaderIDs[n.status.LeaderNodeID] = append(leaderIDs[n.status.LeaderNodeID], n.name) + } + if len(leaderIDs) == 1 { + for lid := range leaderIDs { + r = append(r, inspector.Pass("rqlite.leader_agreement", "All nodes agree on leader", rqliteSub, "", + fmt.Sprintf("leader_id=%s", lid), inspector.Critical)) + } + } else { + var parts []string + for lid, names := range leaderIDs { + id := lid + if id == "" { + id = "(none)" + } + parts = append(parts, fmt.Sprintf("%s: %s", id, strings.Join(names, ","))) + } + r = append(r, inspector.Fail("rqlite.leader_agreement", "All nodes agree on leader", rqliteSub, "", + "leader 
disagreement: "+strings.Join(parts, "; "), inspector.Critical)) + } + + // 1.38 Applied index convergence + var minApplied, maxApplied uint64 + hasApplied := false + for _, n := range nodes { + idx := n.status.AppliedIndex + if idx == 0 { + continue + } + if !hasApplied { + minApplied = idx + maxApplied = idx + hasApplied = true + continue + } + if idx < minApplied { + minApplied = idx + } + if idx > maxApplied { + maxApplied = idx + } + } + if hasApplied && maxApplied > 0 { + gap := maxApplied - minApplied + if gap < 100 { + r = append(r, inspector.Pass("rqlite.index_convergence", "Applied index convergence", rqliteSub, "", + fmt.Sprintf("min=%d max=%d gap=%d", minApplied, maxApplied, gap), inspector.Critical)) + } else if gap < 1000 { + r = append(r, inspector.Warn("rqlite.index_convergence", "Applied index convergence", rqliteSub, "", + fmt.Sprintf("min=%d max=%d gap=%d (lagging)", minApplied, maxApplied, gap), inspector.Critical)) + } else { + r = append(r, inspector.Fail("rqlite.index_convergence", "Applied index convergence", rqliteSub, "", + fmt.Sprintf("min=%d max=%d gap=%d (severely behind)", minApplied, maxApplied, gap), inspector.Critical)) + } + } + + // 1.35 Version consistency + versions := map[string][]string{} + for _, n := range nodes { + if n.status.Version != "" { + versions[n.status.Version] = append(versions[n.status.Version], n.name) + } + } + if len(versions) == 1 { + for v := range versions { + r = append(r, inspector.Pass("rqlite.version_consistent", "Version consistent across nodes", rqliteSub, "", + fmt.Sprintf("version=%s", v), inspector.Medium)) + } + } else if len(versions) > 1 { + var parts []string + for v, names := range versions { + parts = append(parts, fmt.Sprintf("%s: %s", v, strings.Join(names, ","))) + } + r = append(r, inspector.Warn("rqlite.version_consistent", "Version consistent across nodes", rqliteSub, "", + "version mismatch: "+strings.Join(parts, "; "), inspector.Medium)) + } + + // 1.40 Database size convergence + 
type sizeEntry struct { + name string + size int64 + } + var sizes []sizeEntry + for _, n := range nodes { + if n.status.DBSize > 0 { + sizes = append(sizes, sizeEntry{n.name, n.status.DBSize}) + } + } + if len(sizes) >= 2 { + minSize := sizes[0].size + maxSize := sizes[0].size + for _, s := range sizes[1:] { + if s.size < minSize { + minSize = s.size + } + if s.size > maxSize { + maxSize = s.size + } + } + if minSize > 0 { + ratio := float64(maxSize) / float64(minSize) + if ratio <= 1.05 { + r = append(r, inspector.Pass("rqlite.db_size_convergence", "Database size converged", rqliteSub, "", + fmt.Sprintf("min=%dB max=%dB ratio=%.2f", minSize, maxSize, ratio), inspector.Medium)) + } else { + r = append(r, inspector.Warn("rqlite.db_size_convergence", "Database size converged", rqliteSub, "", + fmt.Sprintf("min=%dB max=%dB ratio=%.2f (diverged)", minSize, maxSize, ratio), inspector.High)) + } + } + } + + // 1.42 Quorum math + voters := 0 + reachableVoters := 0 + for _, n := range nodes { + if n.status.Voter { + voters++ + reachableVoters++ // node answered the SSH and curl probes, so count it as reachable + } + } + quorumNeeded := int(math.Floor(float64(voters)/2)) + 1 + if reachableVoters >= quorumNeeded { + r = append(r, inspector.Pass("rqlite.quorum", "Quorum maintained", rqliteSub, "", + fmt.Sprintf("reachable_voters=%d/%d quorum_needed=%d", reachableVoters, voters, quorumNeeded), inspector.Critical)) + } else { + r = append(r, inspector.Fail("rqlite.quorum", "Quorum maintained", rqliteSub, "", + fmt.Sprintf("reachable_voters=%d/%d quorum_needed=%d (QUORUM LOST)", reachableVoters, voters, quorumNeeded), inspector.Critical)) + } + + return r +} + +// countRQLiteNodes counts nodes that have RQLite data.
+func countRQLiteNodes(data *inspector.ClusterData) int { + count := 0 + for _, nd := range data.Nodes { + if nd.RQLite != nil { + count++ + } + } + return count +} diff --git a/pkg/inspector/checks/rqlite_test.go b/pkg/inspector/checks/rqlite_test.go new file mode 100644 index 0000000..43a5da1 --- /dev/null +++ b/pkg/inspector/checks/rqlite_test.go @@ -0,0 +1,401 @@ +package checks + +import ( + "testing" + + "github.com/DeBrosOfficial/network/pkg/inspector" +) + +func TestCheckRQLite_Unresponsive(t *testing.T) { + nd := makeNodeData("1.1.1.1", "node") + nd.RQLite = &inspector.RQLiteData{Responsive: false} + + data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd}) + results := CheckRQLite(data) + + expectStatus(t, results, "rqlite.responsive", inspector.StatusFail) + // Should return early — no raft_state check + if findCheck(results, "rqlite.raft_state") != nil { + t.Error("should not check raft_state when unresponsive") + } +} + +func TestCheckRQLite_HealthyLeader(t *testing.T) { + nd := makeNodeData("1.1.1.1", "node") + nd.RQLite = &inspector.RQLiteData{ + Responsive: true, + StrongRead: true, + Readyz: &inspector.RQLiteReadyz{Ready: true, Node: "ready", Leader: "ready", Store: "ready"}, + Status: &inspector.RQLiteStatus{ + RaftState: "Leader", + LeaderNodeID: "node1", + Voter: true, + NumPeers: 2, + Term: 5, + CommitIndex: 1000, + AppliedIndex: 1000, + FsmPending: 0, + LastLogTerm: 5, + DBAppliedIndex: 1000, + FsmIndex: 1000, + LastSnapshot: 995, + DBSizeFriendly: "1.2MB", + Goroutines: 50, + HeapAlloc: 100 * 1024 * 1024, // 100MB + Version: "8.0.0", + }, + Nodes: map[string]*inspector.RQLiteNode{ + "node1:5001": {Addr: "node1:5001", Reachable: true, Leader: true, Voter: true}, + "node2:5001": {Addr: "node2:5001", Reachable: true, Leader: false, Voter: true}, + "node3:5001": {Addr: "node3:5001", Reachable: true, Leader: false, Voter: true}, + }, + DebugVars: &inspector.RQLiteDebugVars{}, + } + + data := 
makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd}) + results := CheckRQLite(data) + + expectStatus(t, results, "rqlite.responsive", inspector.StatusPass) + expectStatus(t, results, "rqlite.readyz", inspector.StatusPass) + expectStatus(t, results, "rqlite.raft_state", inspector.StatusPass) + expectStatus(t, results, "rqlite.leader_known", inspector.StatusPass) + expectStatus(t, results, "rqlite.voter", inspector.StatusPass) + expectStatus(t, results, "rqlite.commit_applied_gap", inspector.StatusPass) + expectStatus(t, results, "rqlite.fsm_pending", inspector.StatusPass) + expectStatus(t, results, "rqlite.db_fsm_sync", inspector.StatusPass) + expectStatus(t, results, "rqlite.strong_read", inspector.StatusPass) + expectStatus(t, results, "rqlite.all_reachable", inspector.StatusPass) + expectStatus(t, results, "rqlite.goroutines", inspector.StatusPass) + expectStatus(t, results, "rqlite.memory", inspector.StatusPass) + expectStatus(t, results, "rqlite.query_errors", inspector.StatusPass) + expectStatus(t, results, "rqlite.execute_errors", inspector.StatusPass) + expectStatus(t, results, "rqlite.leader_not_found", inspector.StatusPass) + expectStatus(t, results, "rqlite.snapshot_errors", inspector.StatusPass) + expectStatus(t, results, "rqlite.client_health", inspector.StatusPass) +} + +func TestCheckRQLite_RaftStates(t *testing.T) { + tests := []struct { + state string + status inspector.Status + }{ + {"Leader", inspector.StatusPass}, + {"Follower", inspector.StatusPass}, + {"Candidate", inspector.StatusWarn}, + {"Shutdown", inspector.StatusFail}, + {"Unknown", inspector.StatusFail}, + } + for _, tt := range tests { + t.Run(tt.state, func(t *testing.T) { + nd := makeNodeData("1.1.1.1", "node") + nd.RQLite = &inspector.RQLiteData{ + Responsive: true, + Status: &inspector.RQLiteStatus{ + RaftState: tt.state, + LeaderNodeID: "node1", + Voter: true, + }, + } + data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd}) + results := CheckRQLite(data) + 
expectStatus(t, results, "rqlite.raft_state", tt.status) + }) + } +} + +func TestCheckRQLite_ReadyzFail(t *testing.T) { + nd := makeNodeData("1.1.1.1", "node") + nd.RQLite = &inspector.RQLiteData{ + Responsive: true, + Readyz: &inspector.RQLiteReadyz{Ready: false, Node: "ready", Leader: "not ready", Store: "ready"}, + Status: &inspector.RQLiteStatus{RaftState: "Follower", LeaderNodeID: "n1", Voter: true}, + } + data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd}) + results := CheckRQLite(data) + expectStatus(t, results, "rqlite.readyz", inspector.StatusFail) +} + +func TestCheckRQLite_CommitAppliedGap(t *testing.T) { + tests := []struct { + name string + commit uint64 + applied uint64 + status inspector.Status + }{ + {"no gap", 1000, 1000, inspector.StatusPass}, + {"small gap", 1002, 1000, inspector.StatusPass}, + {"lagging", 1050, 1000, inspector.StatusWarn}, + {"severely behind", 2000, 1000, inspector.StatusFail}, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + nd := makeNodeData("1.1.1.1", "node") + nd.RQLite = &inspector.RQLiteData{ + Responsive: true, + Status: &inspector.RQLiteStatus{ + RaftState: "Follower", + LeaderNodeID: "n1", + Voter: true, + CommitIndex: tt.commit, + AppliedIndex: tt.applied, + }, + } + data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd}) + results := CheckRQLite(data) + expectStatus(t, results, "rqlite.commit_applied_gap", tt.status) + }) + } +} + +func TestCheckRQLite_FsmPending(t *testing.T) { + tests := []struct { + name string + pending uint64 + status inspector.Status + }{ + {"zero", 0, inspector.StatusPass}, + {"small", 5, inspector.StatusWarn}, + {"backlog", 100, inspector.StatusFail}, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + nd := makeNodeData("1.1.1.1", "node") + nd.RQLite = &inspector.RQLiteData{ + Responsive: true, + Status: &inspector.RQLiteStatus{ + RaftState: "Follower", + LeaderNodeID: "n1", + Voter: true, + FsmPending: tt.pending, + }, 
+ } + data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd}) + results := CheckRQLite(data) + expectStatus(t, results, "rqlite.fsm_pending", tt.status) + }) + } +} + +func TestCheckRQLite_StrongReadFail(t *testing.T) { + nd := makeNodeData("1.1.1.1", "node") + nd.RQLite = &inspector.RQLiteData{ + Responsive: true, + StrongRead: false, + Status: &inspector.RQLiteStatus{RaftState: "Follower", LeaderNodeID: "n1", Voter: true}, + } + data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd}) + results := CheckRQLite(data) + expectStatus(t, results, "rqlite.strong_read", inspector.StatusFail) +} + +func TestCheckRQLite_DebugVarsErrors(t *testing.T) { + nd := makeNodeData("1.1.1.1", "node") + nd.RQLite = &inspector.RQLiteData{ + Responsive: true, + Status: &inspector.RQLiteStatus{RaftState: "Leader", LeaderNodeID: "n1", Voter: true}, + DebugVars: &inspector.RQLiteDebugVars{ + QueryErrors: 5, + ExecuteErrors: 3, + LeaderNotFound: 1, + SnapshotErrors: 2, + ClientRetries: 10, + ClientTimeouts: 1, + }, + } + data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd}) + results := CheckRQLite(data) + + expectStatus(t, results, "rqlite.query_errors", inspector.StatusWarn) + expectStatus(t, results, "rqlite.execute_errors", inspector.StatusWarn) + expectStatus(t, results, "rqlite.leader_not_found", inspector.StatusFail) + expectStatus(t, results, "rqlite.snapshot_errors", inspector.StatusFail) + expectStatus(t, results, "rqlite.client_health", inspector.StatusWarn) +} + +func TestCheckRQLite_Goroutines(t *testing.T) { + tests := []struct { + name string + goroutines int + status inspector.Status + }{ + {"healthy", 50, inspector.StatusPass}, + {"elevated", 500, inspector.StatusWarn}, + {"high", 2000, inspector.StatusFail}, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + nd := makeNodeData("1.1.1.1", "node") + nd.RQLite = &inspector.RQLiteData{ + Responsive: true, + Status: &inspector.RQLiteStatus{ + RaftState: "Leader", + 
LeaderNodeID: "n1", + Voter: true, + Goroutines: tt.goroutines, + }, + } + data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd}) + results := CheckRQLite(data) + expectStatus(t, results, "rqlite.goroutines", tt.status) + }) + } +} + +// --- Cross-node tests --- + +func makeRQLiteCluster(leaderHost string, states map[string]string, term uint64) *inspector.ClusterData { + nodes := map[string]*inspector.NodeData{} + rqliteNodes := map[string]*inspector.RQLiteNode{} + for host := range states { + rqliteNodes[host+":5001"] = &inspector.RQLiteNode{ + Addr: host + ":5001", Reachable: true, Voter: true, + Leader: states[host] == "Leader", + } + } + + for host, state := range states { + nd := makeNodeData(host, "node") + nd.RQLite = &inspector.RQLiteData{ + Responsive: true, + Status: &inspector.RQLiteStatus{ + RaftState: state, + LeaderNodeID: leaderHost, + Voter: true, + Term: term, + AppliedIndex: 1000, + CommitIndex: 1000, + Version: "8.0.0", + DBSize: 4096, + }, + Nodes: rqliteNodes, + } + nodes[host] = nd + } + return makeCluster(nodes) +} + +func TestCheckRQLite_CrossNode_SingleLeader(t *testing.T) { + data := makeRQLiteCluster("1.1.1.1", map[string]string{ + "1.1.1.1": "Leader", + "2.2.2.2": "Follower", + "3.3.3.3": "Follower", + }, 5) + + results := CheckRQLite(data) + expectStatus(t, results, "rqlite.single_leader", inspector.StatusPass) + expectStatus(t, results, "rqlite.term_consistent", inspector.StatusPass) + expectStatus(t, results, "rqlite.leader_agreement", inspector.StatusPass) + expectStatus(t, results, "rqlite.index_convergence", inspector.StatusPass) + expectStatus(t, results, "rqlite.version_consistent", inspector.StatusPass) + expectStatus(t, results, "rqlite.quorum", inspector.StatusPass) +} + +func TestCheckRQLite_CrossNode_NoLeader(t *testing.T) { + data := makeRQLiteCluster("", map[string]string{ + "1.1.1.1": "Candidate", + "2.2.2.2": "Candidate", + "3.3.3.3": "Candidate", + }, 5) + results := CheckRQLite(data) + expectStatus(t, 
results, "rqlite.single_leader", inspector.StatusFail) +} + +func TestCheckRQLite_CrossNode_SplitBrain(t *testing.T) { + nodes := map[string]*inspector.NodeData{} + for _, host := range []string{"1.1.1.1", "2.2.2.2", "3.3.3.3"} { + nd := makeNodeData(host, "node") + state := "Follower" + leaderID := "1.1.1.1" + if host == "1.1.1.1" || host == "2.2.2.2" { + state = "Leader" + leaderID = host + } + nd.RQLite = &inspector.RQLiteData{ + Responsive: true, + Status: &inspector.RQLiteStatus{ + RaftState: state, + LeaderNodeID: leaderID, + Voter: true, + Term: 5, + AppliedIndex: 1000, + }, + } + nodes[host] = nd + } + data := makeCluster(nodes) + results := CheckRQLite(data) + expectStatus(t, results, "rqlite.single_leader", inspector.StatusFail) +} + +func TestCheckRQLite_CrossNode_TermDivergence(t *testing.T) { + nodes := map[string]*inspector.NodeData{} + terms := map[string]uint64{"1.1.1.1": 5, "2.2.2.2": 5, "3.3.3.3": 6} + for host, term := range terms { + nd := makeNodeData(host, "node") + nd.RQLite = &inspector.RQLiteData{ + Responsive: true, + Status: &inspector.RQLiteStatus{ + RaftState: "Follower", + LeaderNodeID: "1.1.1.1", + Voter: true, + Term: term, + AppliedIndex: 1000, + }, + } + nodes[host] = nd + } + data := makeCluster(nodes) + results := CheckRQLite(data) + expectStatus(t, results, "rqlite.term_consistent", inspector.StatusFail) +} + +func TestCheckRQLite_CrossNode_IndexLagging(t *testing.T) { + nodes := map[string]*inspector.NodeData{} + applied := map[string]uint64{"1.1.1.1": 1000, "2.2.2.2": 1000, "3.3.3.3": 500} + for host, idx := range applied { + nd := makeNodeData(host, "node") + state := "Follower" + if host == "1.1.1.1" { + state = "Leader" + } + nd.RQLite = &inspector.RQLiteData{ + Responsive: true, + Status: &inspector.RQLiteStatus{ + RaftState: state, + LeaderNodeID: "1.1.1.1", + Voter: true, + Term: 5, + AppliedIndex: idx, + CommitIndex: idx, + }, + } + nodes[host] = nd + } + data := makeCluster(nodes) + results := CheckRQLite(data) + 
expectStatus(t, results, "rqlite.index_convergence", inspector.StatusWarn) +} + +func TestCheckRQLite_CrossNode_SkipSingleNode(t *testing.T) { + nd := makeNodeData("1.1.1.1", "node") + nd.RQLite = &inspector.RQLiteData{ + Responsive: true, + Status: &inspector.RQLiteStatus{RaftState: "Leader", LeaderNodeID: "n1", Voter: true, Term: 5, AppliedIndex: 1000}, + } + data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd}) + results := CheckRQLite(data) + expectStatus(t, results, "rqlite.cross_node", inspector.StatusSkip) +} + +func TestCheckRQLite_NilRQLiteData(t *testing.T) { + nd := makeNodeData("1.1.1.1", "node") + // nd.RQLite is nil — no per-node checks, but cross-node skip is expected + data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd}) + results := CheckRQLite(data) + // Should only have the cross-node skip (not enough nodes) + for _, r := range results { + if r.Status != inspector.StatusSkip { + t.Errorf("unexpected non-skip result: %s (status=%s)", r.ID, r.Status) + } + } +} diff --git a/pkg/inspector/checks/system.go b/pkg/inspector/checks/system.go new file mode 100644 index 0000000..ad125a3 --- /dev/null +++ b/pkg/inspector/checks/system.go @@ -0,0 +1,242 @@ +package checks + +import ( + "fmt" + "strconv" + "strings" + + "github.com/DeBrosOfficial/network/pkg/inspector" +) + +func init() { + inspector.RegisterChecker("system", CheckSystem) +} + +const systemSub = "system" + +// CheckSystem runs all system-level health checks. +func CheckSystem(data *inspector.ClusterData) []inspector.CheckResult { + var results []inspector.CheckResult + + for _, nd := range data.Nodes { + if nd.System == nil { + continue + } + results = append(results, checkSystemPerNode(nd)...) 
+ } + + return results +} + +func checkSystemPerNode(nd *inspector.NodeData) []inspector.CheckResult { + var r []inspector.CheckResult + sys := nd.System + node := nd.Node.Name() + + // 6.1 Core services active + coreServices := []string{"debros-node", "debros-olric", "debros-ipfs", "debros-ipfs-cluster"} + for _, svc := range coreServices { + status, ok := sys.Services[svc] + if !ok { + status = "unknown" + } + id := fmt.Sprintf("system.svc_%s", strings.ReplaceAll(svc, "-", "_")) + name := fmt.Sprintf("%s service active", svc) + if status == "active" { + r = append(r, inspector.Pass(id, name, systemSub, node, "active", inspector.Critical)) + } else { + r = append(r, inspector.Fail(id, name, systemSub, node, + fmt.Sprintf("status=%s", status), inspector.Critical)) + } + } + + // 6.5 WireGuard service + if status, ok := sys.Services["wg-quick@wg0"]; ok { + if status == "active" { + r = append(r, inspector.Pass("system.svc_wg", "wg-quick@wg0 active", systemSub, node, "active", inspector.Critical)) + } else { + r = append(r, inspector.Fail("system.svc_wg", "wg-quick@wg0 active", systemSub, node, + fmt.Sprintf("status=%s", status), inspector.Critical)) + } + } + + // 6.3 Nameserver services (if applicable) + if nd.Node.IsNameserver() { + for _, svc := range []string{"coredns", "caddy"} { + status, ok := sys.Services[svc] + if !ok { + status = "unknown" + } + id := fmt.Sprintf("system.svc_%s", svc) + name := fmt.Sprintf("%s service active", svc) + if status == "active" { + r = append(r, inspector.Pass(id, name, systemSub, node, "active", inspector.Critical)) + } else { + r = append(r, inspector.Fail(id, name, systemSub, node, + fmt.Sprintf("status=%s", status), inspector.Critical)) + } + } + } + + // 6.6 Failed systemd units + if len(sys.FailedUnits) == 0 { + r = append(r, inspector.Pass("system.no_failed_units", "No failed systemd units", systemSub, node, + "no failed units", inspector.High)) + } else { + r = append(r, inspector.Fail("system.no_failed_units", "No 
failed systemd units", systemSub, node, + fmt.Sprintf("failed: %s", strings.Join(sys.FailedUnits, ", ")), inspector.High)) + } + + // 6.14 Memory usage + if sys.MemTotalMB > 0 { + pct := float64(sys.MemUsedMB) / float64(sys.MemTotalMB) * 100 + if pct < 80 { + r = append(r, inspector.Pass("system.memory", "Memory usage healthy", systemSub, node, + fmt.Sprintf("used=%dMB/%dMB (%.0f%%)", sys.MemUsedMB, sys.MemTotalMB, pct), inspector.Medium)) + } else if pct < 90 { + r = append(r, inspector.Warn("system.memory", "Memory usage healthy", systemSub, node, + fmt.Sprintf("used=%dMB/%dMB (%.0f%%)", sys.MemUsedMB, sys.MemTotalMB, pct), inspector.High)) + } else { + r = append(r, inspector.Fail("system.memory", "Memory usage healthy", systemSub, node, + fmt.Sprintf("used=%dMB/%dMB (%.0f%% CRITICAL)", sys.MemUsedMB, sys.MemTotalMB, pct), inspector.Critical)) + } + } + + // 6.15 Disk usage + if sys.DiskUsePct > 0 { + if sys.DiskUsePct < 80 { + r = append(r, inspector.Pass("system.disk", "Disk usage healthy", systemSub, node, + fmt.Sprintf("used=%s/%s (%d%%)", sys.DiskUsedGB, sys.DiskTotalGB, sys.DiskUsePct), inspector.High)) + } else if sys.DiskUsePct < 90 { + r = append(r, inspector.Warn("system.disk", "Disk usage healthy", systemSub, node, + fmt.Sprintf("used=%s/%s (%d%%)", sys.DiskUsedGB, sys.DiskTotalGB, sys.DiskUsePct), inspector.High)) + } else { + r = append(r, inspector.Fail("system.disk", "Disk usage healthy", systemSub, node, + fmt.Sprintf("used=%s/%s (%d%% CRITICAL)", sys.DiskUsedGB, sys.DiskTotalGB, sys.DiskUsePct), inspector.Critical)) + } + } + + // 6.17 Load average vs CPU count + if sys.LoadAvg != "" && sys.CPUCount > 0 { + parts := strings.Split(strings.TrimSpace(sys.LoadAvg), ",") + if len(parts) >= 1 { + load1, err := strconv.ParseFloat(strings.TrimSpace(parts[0]), 64) + if err == nil { + cpus := float64(sys.CPUCount) + if load1 < cpus { + r = append(r, inspector.Pass("system.load", "Load average healthy", systemSub, node, + fmt.Sprintf("load1=%.1f cpus=%d", 
load1, sys.CPUCount), inspector.Medium)) + } else if load1 < cpus*2 { + r = append(r, inspector.Warn("system.load", "Load average healthy", systemSub, node, + fmt.Sprintf("load1=%.1f cpus=%d (elevated)", load1, sys.CPUCount), inspector.Medium)) + } else { + r = append(r, inspector.Fail("system.load", "Load average healthy", systemSub, node, + fmt.Sprintf("load1=%.1f cpus=%d (overloaded)", load1, sys.CPUCount), inspector.High)) + } + } + } + } + + // 6.18 OOM kills + if sys.OOMKills == 0 { + r = append(r, inspector.Pass("system.oom", "No OOM kills", systemSub, node, + "no OOM kills in dmesg", inspector.Critical)) + } else { + r = append(r, inspector.Fail("system.oom", "No OOM kills", systemSub, node, + fmt.Sprintf("%d OOM kills in dmesg", sys.OOMKills), inspector.Critical)) + } + + // 6.19 Swap usage + if sys.SwapTotalMB > 0 { + pct := float64(sys.SwapUsedMB) / float64(sys.SwapTotalMB) * 100 + if pct < 30 { + r = append(r, inspector.Pass("system.swap", "Swap usage low", systemSub, node, + fmt.Sprintf("swap=%dMB/%dMB (%.0f%%)", sys.SwapUsedMB, sys.SwapTotalMB, pct), inspector.Medium)) + } else { + r = append(r, inspector.Warn("system.swap", "Swap usage low", systemSub, node, + fmt.Sprintf("swap=%dMB/%dMB (%.0f%%)", sys.SwapUsedMB, sys.SwapTotalMB, pct), inspector.Medium)) + } + } + + // 6.20 Uptime + if sys.UptimeRaw != "" && sys.UptimeRaw != "unknown" { + r = append(r, inspector.Pass("system.uptime", "System uptime reported", systemSub, node, + fmt.Sprintf("up since %s", sys.UptimeRaw), inspector.Low)) + } + + // 6.21 Inode usage + if sys.InodePct > 0 { + if sys.InodePct < 80 { + r = append(r, inspector.Pass("system.inodes", "Inode usage healthy", systemSub, node, + fmt.Sprintf("inode_use=%d%%", sys.InodePct), inspector.High)) + } else if sys.InodePct < 95 { + r = append(r, inspector.Warn("system.inodes", "Inode usage healthy", systemSub, node, + fmt.Sprintf("inode_use=%d%% (elevated)", sys.InodePct), inspector.High)) + } else { + r = append(r, 
inspector.Fail("system.inodes", "Inode usage healthy", systemSub, node, + fmt.Sprintf("inode_use=%d%% (CRITICAL)", sys.InodePct), inspector.Critical)) + } + } + + // 6.22 UFW firewall + if sys.UFWActive { + r = append(r, inspector.Pass("system.ufw", "UFW firewall active", systemSub, node, + "ufw is active", inspector.High)) + } else { + r = append(r, inspector.Warn("system.ufw", "UFW firewall active", systemSub, node, + "ufw is not active", inspector.High)) + } + + // 6.23 Process user + if sys.ProcessUser != "" && sys.ProcessUser != "unknown" { + if sys.ProcessUser == "debros" { + r = append(r, inspector.Pass("system.process_user", "debros-node runs as correct user", systemSub, node, + "user=debros", inspector.High)) + } else if sys.ProcessUser == "root" { + r = append(r, inspector.Warn("system.process_user", "debros-node runs as correct user", systemSub, node, + "user=root (should be debros)", inspector.High)) + } else { + r = append(r, inspector.Warn("system.process_user", "debros-node runs as correct user", systemSub, node, + fmt.Sprintf("user=%s (expected debros)", sys.ProcessUser), inspector.Medium)) + } + } + + // 6.24 Panic/fatal in logs + if sys.PanicCount == 0 { + r = append(r, inspector.Pass("system.panics", "No panics in recent logs", systemSub, node, + "0 panic/fatal in last hour", inspector.Critical)) + } else { + r = append(r, inspector.Fail("system.panics", "No panics in recent logs", systemSub, node, + fmt.Sprintf("%d panic/fatal in last hour", sys.PanicCount), inspector.Critical)) + } + + // 6.25 Expected ports listening + expectedPorts := map[int]string{ + 5001: "RQLite HTTP", + 3322: "Olric Memberlist", + 6001: "Gateway", + 4501: "IPFS API", + } + for port, svcName := range expectedPorts { + found := false + for _, p := range sys.ListeningPorts { + if p == port { + found = true + break + } + } + if found { + r = append(r, inspector.Pass( + fmt.Sprintf("system.port_%d", port), + fmt.Sprintf("%s port %d listening", svcName, port), + systemSub, 
node, "port is bound", inspector.High)) + } else { + r = append(r, inspector.Warn( + fmt.Sprintf("system.port_%d", port), + fmt.Sprintf("%s port %d listening", svcName, port), + systemSub, node, "port is NOT bound", inspector.High)) + } + } + + return r +} diff --git a/pkg/inspector/checks/system_test.go b/pkg/inspector/checks/system_test.go new file mode 100644 index 0000000..e33e9af --- /dev/null +++ b/pkg/inspector/checks/system_test.go @@ -0,0 +1,284 @@ +package checks + +import ( + "testing" + + "github.com/DeBrosOfficial/network/pkg/inspector" +) + +func TestCheckSystem_HealthyNode(t *testing.T) { + nd := makeNodeData("1.1.1.1", "node") + nd.System = &inspector.SystemData{ + Services: map[string]string{ + "debros-node": "active", + "debros-olric": "active", + "debros-ipfs": "active", + "debros-ipfs-cluster": "active", + "wg-quick@wg0": "active", + }, + FailedUnits: nil, + MemTotalMB: 8192, + MemUsedMB: 4096, + DiskUsePct: 50, + DiskUsedGB: "25G", + DiskTotalGB: "50G", + LoadAvg: "1.0, 0.8, 0.5", + CPUCount: 4, + OOMKills: 0, + SwapTotalMB: 2048, + SwapUsedMB: 100, + UptimeRaw: "2024-01-01 00:00:00", + InodePct: 10, + ListeningPorts: []int{5001, 3322, 6001, 4501}, + UFWActive: true, + ProcessUser: "debros", + PanicCount: 0, + } + + data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd}) + results := CheckSystem(data) + + expectStatus(t, results, "system.svc_debros_node", inspector.StatusPass) + expectStatus(t, results, "system.svc_debros_olric", inspector.StatusPass) + expectStatus(t, results, "system.svc_debros_ipfs", inspector.StatusPass) + expectStatus(t, results, "system.svc_debros_ipfs_cluster", inspector.StatusPass) + expectStatus(t, results, "system.svc_wg", inspector.StatusPass) + expectStatus(t, results, "system.no_failed_units", inspector.StatusPass) + expectStatus(t, results, "system.memory", inspector.StatusPass) + expectStatus(t, results, "system.disk", inspector.StatusPass) + expectStatus(t, results, "system.load", 
inspector.StatusPass) + expectStatus(t, results, "system.oom", inspector.StatusPass) + expectStatus(t, results, "system.swap", inspector.StatusPass) + expectStatus(t, results, "system.inodes", inspector.StatusPass) + expectStatus(t, results, "system.ufw", inspector.StatusPass) + expectStatus(t, results, "system.process_user", inspector.StatusPass) + expectStatus(t, results, "system.panics", inspector.StatusPass) + expectStatus(t, results, "system.port_5001", inspector.StatusPass) + expectStatus(t, results, "system.port_3322", inspector.StatusPass) + expectStatus(t, results, "system.port_6001", inspector.StatusPass) + expectStatus(t, results, "system.port_4501", inspector.StatusPass) +} + +func TestCheckSystem_ServiceInactive(t *testing.T) { + nd := makeNodeData("1.1.1.1", "node") + nd.System = &inspector.SystemData{ + Services: map[string]string{ + "debros-node": "active", + "debros-olric": "inactive", + "debros-ipfs": "active", + "debros-ipfs-cluster": "failed", + }, + } + data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd}) + results := CheckSystem(data) + + expectStatus(t, results, "system.svc_debros_node", inspector.StatusPass) + expectStatus(t, results, "system.svc_debros_olric", inspector.StatusFail) + expectStatus(t, results, "system.svc_debros_ipfs_cluster", inspector.StatusFail) +} + +func TestCheckSystem_NameserverServices(t *testing.T) { + nd := makeNodeData("5.5.5.5", "nameserver-ns1") + nd.System = &inspector.SystemData{ + Services: map[string]string{ + "debros-node": "active", + "debros-olric": "active", + "debros-ipfs": "active", + "debros-ipfs-cluster": "active", + "coredns": "active", + "caddy": "active", + }, + } + data := makeCluster(map[string]*inspector.NodeData{"5.5.5.5": nd}) + results := CheckSystem(data) + expectStatus(t, results, "system.svc_coredns", inspector.StatusPass) + expectStatus(t, results, "system.svc_caddy", inspector.StatusPass) +} + +func TestCheckSystem_NameserverServicesNotCheckedOnRegularNode(t *testing.T) { + 
nd := makeNodeData("1.1.1.1", "node") + nd.System = &inspector.SystemData{ + Services: map[string]string{ + "debros-node": "active", + "debros-olric": "active", + "debros-ipfs": "active", + "debros-ipfs-cluster": "active", + }, + } + data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd}) + results := CheckSystem(data) + if findCheck(results, "system.svc_coredns") != nil { + t.Error("should not check coredns on regular node") + } +} + +func TestCheckSystem_FailedUnits(t *testing.T) { + nd := makeNodeData("1.1.1.1", "node") + nd.System = &inspector.SystemData{ + Services: map[string]string{}, + FailedUnits: []string{"some-service.service"}, + } + data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd}) + results := CheckSystem(data) + expectStatus(t, results, "system.no_failed_units", inspector.StatusFail) +} + +func TestCheckSystem_Memory(t *testing.T) { + tests := []struct { + name string + used int + total int + status inspector.Status + }{ + {"healthy", 4000, 8000, inspector.StatusPass}, // 50% + {"elevated", 7000, 8000, inspector.StatusWarn}, // 87.5% + {"critical", 7500, 8000, inspector.StatusFail}, // 93.75% + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + nd := makeNodeData("1.1.1.1", "node") + nd.System = &inspector.SystemData{ + Services: map[string]string{}, + MemTotalMB: tt.total, + MemUsedMB: tt.used, + } + data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd}) + results := CheckSystem(data) + expectStatus(t, results, "system.memory", tt.status) + }) + } +} + +func TestCheckSystem_Disk(t *testing.T) { + tests := []struct { + name string + pct int + status inspector.Status + }{ + {"healthy", 60, inspector.StatusPass}, + {"elevated", 85, inspector.StatusWarn}, + {"critical", 92, inspector.StatusFail}, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + nd := makeNodeData("1.1.1.1", "node") + nd.System = &inspector.SystemData{ + Services: map[string]string{}, + DiskUsePct: 
tt.pct, + DiskUsedGB: "25G", + DiskTotalGB: "50G", + } + data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd}) + results := CheckSystem(data) + expectStatus(t, results, "system.disk", tt.status) + }) + } +} + +func TestCheckSystem_Load(t *testing.T) { + tests := []struct { + name string + load string + cpus int + status inspector.Status + }{ + {"healthy", "1.0, 0.8, 0.5", 4, inspector.StatusPass}, + {"elevated", "6.0, 5.0, 4.0", 4, inspector.StatusWarn}, + {"overloaded", "10.0, 9.0, 8.0", 4, inspector.StatusFail}, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + nd := makeNodeData("1.1.1.1", "node") + nd.System = &inspector.SystemData{ + Services: map[string]string{}, + LoadAvg: tt.load, + CPUCount: tt.cpus, + } + data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd}) + results := CheckSystem(data) + expectStatus(t, results, "system.load", tt.status) + }) + } +} + +func TestCheckSystem_OOMKills(t *testing.T) { + nd := makeNodeData("1.1.1.1", "node") + nd.System = &inspector.SystemData{Services: map[string]string{}, OOMKills: 3} + data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd}) + results := CheckSystem(data) + expectStatus(t, results, "system.oom", inspector.StatusFail) +} + +func TestCheckSystem_Inodes(t *testing.T) { + tests := []struct { + name string + pct int + status inspector.Status + }{ + {"healthy", 50, inspector.StatusPass}, + {"elevated", 82, inspector.StatusWarn}, + {"critical", 96, inspector.StatusFail}, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + nd := makeNodeData("1.1.1.1", "node") + nd.System = &inspector.SystemData{Services: map[string]string{}, InodePct: tt.pct} + data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd}) + results := CheckSystem(data) + expectStatus(t, results, "system.inodes", tt.status) + }) + } +} + +func TestCheckSystem_ProcessUser(t *testing.T) { + tests := []struct { + name string + user string + status inspector.Status 
+ }{ + {"correct", "debros", inspector.StatusPass}, + {"root", "root", inspector.StatusWarn}, + {"other", "ubuntu", inspector.StatusWarn}, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + nd := makeNodeData("1.1.1.1", "node") + nd.System = &inspector.SystemData{Services: map[string]string{}, ProcessUser: tt.user} + data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd}) + results := CheckSystem(data) + expectStatus(t, results, "system.process_user", tt.status) + }) + } +} + +func TestCheckSystem_Panics(t *testing.T) { + nd := makeNodeData("1.1.1.1", "node") + nd.System = &inspector.SystemData{Services: map[string]string{}, PanicCount: 5} + data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd}) + results := CheckSystem(data) + expectStatus(t, results, "system.panics", inspector.StatusFail) +} + +func TestCheckSystem_ExpectedPorts(t *testing.T) { + nd := makeNodeData("1.1.1.1", "node") + nd.System = &inspector.SystemData{ + Services: map[string]string{}, + ListeningPorts: []int{5001, 6001}, // Missing 3322, 4501 + } + data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd}) + results := CheckSystem(data) + + expectStatus(t, results, "system.port_5001", inspector.StatusPass) + expectStatus(t, results, "system.port_6001", inspector.StatusPass) + expectStatus(t, results, "system.port_3322", inspector.StatusWarn) + expectStatus(t, results, "system.port_4501", inspector.StatusWarn) +} + +func TestCheckSystem_NilData(t *testing.T) { + nd := makeNodeData("1.1.1.1", "node") + data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd}) + results := CheckSystem(data) + if len(results) != 0 { + t.Errorf("expected 0 results for nil System data, got %d", len(results)) + } +} diff --git a/pkg/inspector/checks/wireguard.go b/pkg/inspector/checks/wireguard.go new file mode 100644 index 0000000..2f13562 --- /dev/null +++ b/pkg/inspector/checks/wireguard.go @@ -0,0 +1,270 @@ +package checks + +import ( + "fmt" + "strings" 
+ "time" + + "github.com/DeBrosOfficial/network/pkg/inspector" +) + +func init() { + inspector.RegisterChecker("wireguard", CheckWireGuard) +} + +const wgSub = "wireguard" + +// CheckWireGuard runs all WireGuard health checks. +func CheckWireGuard(data *inspector.ClusterData) []inspector.CheckResult { + var results []inspector.CheckResult + + for _, nd := range data.Nodes { + if nd.WireGuard == nil { + continue + } + results = append(results, checkWGPerNode(nd, data)...) + } + + results = append(results, checkWGCrossNode(data)...) + + return results +} + +func checkWGPerNode(nd *inspector.NodeData, data *inspector.ClusterData) []inspector.CheckResult { + var r []inspector.CheckResult + wg := nd.WireGuard + node := nd.Node.Name() + + // 5.1 Interface up + if wg.InterfaceUp { + r = append(r, inspector.Pass("wg.interface_up", "WireGuard interface up", wgSub, node, + fmt.Sprintf("wg0 up, IP=%s", wg.WgIP), inspector.Critical)) + } else { + r = append(r, inspector.Fail("wg.interface_up", "WireGuard interface up", wgSub, node, + "wg0 interface is DOWN", inspector.Critical)) + return r + } + + // 5.2 Service active + if wg.ServiceActive { + r = append(r, inspector.Pass("wg.service_active", "wg-quick@wg0 service active", wgSub, node, + "service is active", inspector.Critical)) + } else { + r = append(r, inspector.Warn("wg.service_active", "wg-quick@wg0 service active", wgSub, node, + "service not active (interface up but service not managed by systemd?)", inspector.High)) + } + + // 5.5 Correct IP in 10.0.0.0/24 + if wg.WgIP != "" && strings.HasPrefix(wg.WgIP, "10.0.0.") { + r = append(r, inspector.Pass("wg.correct_ip", "WG IP in expected range", wgSub, node, + fmt.Sprintf("IP=%s (10.0.0.0/24)", wg.WgIP), inspector.Critical)) + } else if wg.WgIP != "" { + r = append(r, inspector.Warn("wg.correct_ip", "WG IP in expected range", wgSub, node, + fmt.Sprintf("IP=%s (not in 10.0.0.0/24)", wg.WgIP), inspector.High)) + } + + // 5.4 Listen port + if wg.ListenPort == 51820 { + r = 
append(r, inspector.Pass("wg.listen_port", "Listen port is 51820", wgSub, node, + "port=51820", inspector.Critical)) + } else if wg.ListenPort > 0 { + r = append(r, inspector.Warn("wg.listen_port", "Listen port is 51820", wgSub, node, + fmt.Sprintf("port=%d (expected 51820)", wg.ListenPort), inspector.High)) + } + + // 5.7 Peer count + expectedNodes := countWGNodes(data) + expectedPeers := expectedNodes - 1 + if expectedPeers < 0 { + expectedPeers = 0 + } + if wg.PeerCount >= expectedPeers { + r = append(r, inspector.Pass("wg.peer_count", "Peer count matches expected", wgSub, node, + fmt.Sprintf("peers=%d (expected=%d)", wg.PeerCount, expectedPeers), inspector.High)) + } else if wg.PeerCount > 0 { + r = append(r, inspector.Warn("wg.peer_count", "Peer count matches expected", wgSub, node, + fmt.Sprintf("peers=%d (expected=%d)", wg.PeerCount, expectedPeers), inspector.High)) + } else { + r = append(r, inspector.Fail("wg.peer_count", "Peer count matches expected", wgSub, node, + fmt.Sprintf("peers=%d (isolated!)", wg.PeerCount), inspector.Critical)) + } + + // 5.29 MTU + if wg.MTU == 1420 { + r = append(r, inspector.Pass("wg.mtu", "MTU is 1420", wgSub, node, + "MTU=1420", inspector.High)) + } else if wg.MTU > 0 { + r = append(r, inspector.Warn("wg.mtu", "MTU is 1420", wgSub, node, + fmt.Sprintf("MTU=%d (expected 1420)", wg.MTU), inspector.High)) + } + + // 5.35 Config file exists + if wg.ConfigExists { + r = append(r, inspector.Pass("wg.config_exists", "Config file exists", wgSub, node, + "/etc/wireguard/wg0.conf present", inspector.High)) + } else { + r = append(r, inspector.Warn("wg.config_exists", "Config file exists", wgSub, node, + "/etc/wireguard/wg0.conf NOT found", inspector.High)) + } + + // 5.36 Config permissions + if wg.ConfigPerms == "600" { + r = append(r, inspector.Pass("wg.config_perms", "Config file permissions 600", wgSub, node, + "perms=600", inspector.Critical)) + } else if wg.ConfigPerms != "" && wg.ConfigPerms != "000" { + r = append(r, 
inspector.Warn("wg.config_perms", "Config file permissions 600", wgSub, node,
			fmt.Sprintf("perms=%s (expected 600)", wg.ConfigPerms), inspector.Critical))
	}

	// abbrevKey shortens a WireGuard public key for display ("XXXXXXXX...YYYY").
	// Guarding on length avoids a slice-bounds panic when `wg show` yields a
	// malformed or empty key — the inspector must report on bad data, not crash.
	abbrevKey := func(k string) string {
		if len(k) >= 12 {
			return k[:8] + "..." + k[len(k)-4:]
		}
		return k
	}

	// Per-peer checks
	now := time.Now().Unix()
	neverHandshaked := 0
	staleHandshakes := 0
	noTraffic := 0

	for _, peer := range wg.Peers {
		// 5.20 Each peer has exactly one /32 allowed IP
		if !strings.Contains(peer.AllowedIPs, "/32") {
			r = append(r, inspector.Warn("wg.peer_allowed_ip", "Peer has /32 allowed IP", wgSub, node,
				fmt.Sprintf("peer %s has allowed_ips=%s", abbrevKey(peer.PublicKey), peer.AllowedIPs), inspector.High))
		}

		// 5.23 No peer has 0.0.0.0/0
		if strings.Contains(peer.AllowedIPs, "0.0.0.0/0") {
			r = append(r, inspector.Fail("wg.peer_catch_all", "No catch-all route peer", wgSub, node,
				fmt.Sprintf("peer %s has 0.0.0.0/0 (route hijack!)", abbrevKey(peer.PublicKey)), inspector.Critical))
		}

		// 5.11-5.12 Handshake freshness (0 = never handshaked; >300s = stale)
		if peer.LatestHandshake == 0 {
			neverHandshaked++
		} else {
			age := now - peer.LatestHandshake
			if age > 300 {
				staleHandshakes++
			}
		}

		// 5.13 Transfer stats — zero in both directions means no traffic at all
		if peer.TransferRx == 0 && peer.TransferTx == 0 {
			noTraffic++
		}
	}

	if len(wg.Peers) > 0 {
		// 5.12 Never handshaked
		if neverHandshaked == 0 {
			r = append(r, inspector.Pass("wg.handshake_all", "All peers have handshaked", wgSub, node,
				fmt.Sprintf("%d/%d peers handshaked", len(wg.Peers), len(wg.Peers)), inspector.Critical))
		} else {
			r = append(r, inspector.Fail("wg.handshake_all", "All peers have handshaked", wgSub, node,
				fmt.Sprintf("%d/%d peers never handshaked", neverHandshaked, len(wg.Peers)), inspector.Critical))
		}

		// 5.11 Stale handshakes
		if staleHandshakes == 0 {
			r = append(r, inspector.Pass("wg.handshake_fresh", "All handshakes recent (<5m)", wgSub, node,
				"all handshakes within 5 minutes", inspector.High))
		} else {
			r = append(r,
inspector.Warn("wg.handshake_fresh", "All handshakes recent (<5m)", wgSub, node, + fmt.Sprintf("%d/%d peers with stale handshake (>5m)", staleHandshakes, len(wg.Peers)), inspector.High)) + } + + // 5.13 Transfer + if noTraffic == 0 { + r = append(r, inspector.Pass("wg.peer_traffic", "All peers have traffic", wgSub, node, + fmt.Sprintf("%d/%d peers with traffic", len(wg.Peers), len(wg.Peers)), inspector.High)) + } else { + r = append(r, inspector.Warn("wg.peer_traffic", "All peers have traffic", wgSub, node, + fmt.Sprintf("%d/%d peers with zero traffic", noTraffic, len(wg.Peers)), inspector.High)) + } + } + + return r +} + +func checkWGCrossNode(data *inspector.ClusterData) []inspector.CheckResult { + var r []inspector.CheckResult + + type nodeInfo struct { + name string + wg *inspector.WireGuardData + } + var nodes []nodeInfo + for _, nd := range data.Nodes { + if nd.WireGuard != nil && nd.WireGuard.InterfaceUp { + nodes = append(nodes, nodeInfo{name: nd.Node.Name(), wg: nd.WireGuard}) + } + } + + if len(nodes) < 2 { + return r + } + + // 5.8 Peer count consistent + counts := map[int]int{} + for _, n := range nodes { + counts[n.wg.PeerCount]++ + } + if len(counts) == 1 { + for c := range counts { + r = append(r, inspector.Pass("wg.peer_count_consistent", "Peer count consistent across nodes", wgSub, "", + fmt.Sprintf("all nodes have %d peers", c), inspector.High)) + } + } else { + var parts []string + for c, num := range counts { + parts = append(parts, fmt.Sprintf("%d nodes have %d peers", num, c)) + } + r = append(r, inspector.Warn("wg.peer_count_consistent", "Peer count consistent across nodes", wgSub, "", + strings.Join(parts, "; "), inspector.High)) + } + + // 5.30 MTU consistent + mtus := map[int]int{} + for _, n := range nodes { + if n.wg.MTU > 0 { + mtus[n.wg.MTU]++ + } + } + if len(mtus) == 1 { + for m := range mtus { + r = append(r, inspector.Pass("wg.mtu_consistent", "MTU consistent across nodes", wgSub, "", + fmt.Sprintf("all nodes MTU=%d", m), 
inspector.High)) + } + } else if len(mtus) > 1 { + r = append(r, inspector.Warn("wg.mtu_consistent", "MTU consistent across nodes", wgSub, "", + fmt.Sprintf("%d different MTU values", len(mtus)), inspector.High)) + } + + // 5.50 Public key uniqueness + allKeys := map[string][]string{} + for _, n := range nodes { + for _, peer := range n.wg.Peers { + allKeys[peer.PublicKey] = append(allKeys[peer.PublicKey], n.name) + } + } + dupeKeys := 0 + for _, names := range allKeys { + if len(names) > len(nodes)-1 { + dupeKeys++ + } + } + // If all good, the same key should appear at most N-1 times (once per other node) + if dupeKeys == 0 { + r = append(r, inspector.Pass("wg.key_uniqueness", "Public keys unique across nodes", wgSub, "", + fmt.Sprintf("%d unique peer keys", len(allKeys)), inspector.Critical)) + } + + return r +} + +func countWGNodes(data *inspector.ClusterData) int { + count := 0 + for _, nd := range data.Nodes { + if nd.WireGuard != nil { + count++ + } + } + return count +} diff --git a/pkg/inspector/checks/wireguard_test.go b/pkg/inspector/checks/wireguard_test.go new file mode 100644 index 0000000..a3dc9eb --- /dev/null +++ b/pkg/inspector/checks/wireguard_test.go @@ -0,0 +1,230 @@ +package checks + +import ( + "testing" + "time" + + "github.com/DeBrosOfficial/network/pkg/inspector" +) + +func TestCheckWireGuard_InterfaceDown(t *testing.T) { + nd := makeNodeData("1.1.1.1", "node") + nd.WireGuard = &inspector.WireGuardData{InterfaceUp: false} + + data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd}) + results := CheckWireGuard(data) + + expectStatus(t, results, "wg.interface_up", inspector.StatusFail) + // Early return — no further per-node checks + if findCheck(results, "wg.service_active") != nil { + t.Error("should not check service_active when interface down") + } +} + +func TestCheckWireGuard_HealthyNode(t *testing.T) { + now := time.Now().Unix() + nd := makeNodeData("1.1.1.1", "node") + nd.WireGuard = &inspector.WireGuardData{ + InterfaceUp: 
true, + ServiceActive: true, + WgIP: "10.0.0.1", + ListenPort: 51820, + PeerCount: 2, + MTU: 1420, + ConfigExists: true, + ConfigPerms: "600", + Peers: []inspector.WGPeer{ + {PublicKey: "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA=", AllowedIPs: "10.0.0.2/32", LatestHandshake: now - 30, TransferRx: 1000, TransferTx: 2000}, + {PublicKey: "BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB=", AllowedIPs: "10.0.0.3/32", LatestHandshake: now - 60, TransferRx: 500, TransferTx: 800}, + }, + } + + // Single-node for per-node assertions (avoids helper node interference) + data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd}) + results := CheckWireGuard(data) + + expectStatus(t, results, "wg.interface_up", inspector.StatusPass) + expectStatus(t, results, "wg.service_active", inspector.StatusPass) + expectStatus(t, results, "wg.correct_ip", inspector.StatusPass) + expectStatus(t, results, "wg.listen_port", inspector.StatusPass) + expectStatus(t, results, "wg.mtu", inspector.StatusPass) + expectStatus(t, results, "wg.config_exists", inspector.StatusPass) + expectStatus(t, results, "wg.config_perms", inspector.StatusPass) + expectStatus(t, results, "wg.handshake_all", inspector.StatusPass) + expectStatus(t, results, "wg.handshake_fresh", inspector.StatusPass) + expectStatus(t, results, "wg.peer_traffic", inspector.StatusPass) +} + +func TestCheckWireGuard_WrongIP(t *testing.T) { + nd := makeNodeData("1.1.1.1", "node") + nd.WireGuard = &inspector.WireGuardData{ + InterfaceUp: true, + WgIP: "192.168.1.5", + } + data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd}) + results := CheckWireGuard(data) + expectStatus(t, results, "wg.correct_ip", inspector.StatusWarn) +} + +func TestCheckWireGuard_WrongPort(t *testing.T) { + nd := makeNodeData("1.1.1.1", "node") + nd.WireGuard = &inspector.WireGuardData{ + InterfaceUp: true, + WgIP: "10.0.0.1", + ListenPort: 12345, + } + data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd}) + results := 
CheckWireGuard(data) + expectStatus(t, results, "wg.listen_port", inspector.StatusWarn) +} + +func TestCheckWireGuard_PeerCountMismatch(t *testing.T) { + nd := makeNodeData("1.1.1.1", "node") + nd.WireGuard = &inspector.WireGuardData{InterfaceUp: true, WgIP: "10.0.0.1", PeerCount: 1} + + nodes := map[string]*inspector.NodeData{"1.1.1.1": nd} + for _, host := range []string{"2.2.2.2", "3.3.3.3", "4.4.4.4"} { + other := makeNodeData(host, "node") + other.WireGuard = &inspector.WireGuardData{InterfaceUp: true, PeerCount: 3} + nodes[host] = other + } + data := makeCluster(nodes) + results := CheckWireGuard(data) + + // Node 1.1.1.1 has 1 peer but expects 3 (4 nodes - 1) + c := findCheck(results, "wg.peer_count") + if c == nil { + t.Fatal("expected wg.peer_count check") + } + // At least one node should have a warn + hasWarn := false + for _, r := range results { + if r.ID == "wg.peer_count" && r.Status == inspector.StatusWarn { + hasWarn = true + } + } + if !hasWarn { + t.Error("expected at least one wg.peer_count warn for mismatched peer count") + } +} + +func TestCheckWireGuard_ZeroPeers(t *testing.T) { + nd := makeNodeData("1.1.1.1", "node") + nd.WireGuard = &inspector.WireGuardData{InterfaceUp: true, WgIP: "10.0.0.1", PeerCount: 0} + + nodes := map[string]*inspector.NodeData{"1.1.1.1": nd} + for _, host := range []string{"2.2.2.2", "3.3.3.3"} { + other := makeNodeData(host, "node") + other.WireGuard = &inspector.WireGuardData{InterfaceUp: true, PeerCount: 2} + nodes[host] = other + } + data := makeCluster(nodes) + results := CheckWireGuard(data) + + // At least one node should fail (zero peers = isolated) + hasFail := false + for _, r := range results { + if r.ID == "wg.peer_count" && r.Status == inspector.StatusFail { + hasFail = true + } + } + if !hasFail { + t.Error("expected wg.peer_count fail for isolated node") + } +} + +func TestCheckWireGuard_StaleHandshakes(t *testing.T) { + nd := makeNodeData("1.1.1.1", "node") + nd.WireGuard = &inspector.WireGuardData{ + 
InterfaceUp: true, + WgIP: "10.0.0.1", + PeerCount: 2, + Peers: []inspector.WGPeer{ + {PublicKey: "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA=", AllowedIPs: "10.0.0.2/32", LatestHandshake: time.Now().Unix() - 600, TransferRx: 100, TransferTx: 200}, + {PublicKey: "BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB=", AllowedIPs: "10.0.0.3/32", LatestHandshake: time.Now().Unix() - 600, TransferRx: 100, TransferTx: 200}, + }, + } + data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd}) + results := CheckWireGuard(data) + expectStatus(t, results, "wg.handshake_fresh", inspector.StatusWarn) +} + +func TestCheckWireGuard_NeverHandshaked(t *testing.T) { + nd := makeNodeData("1.1.1.1", "node") + nd.WireGuard = &inspector.WireGuardData{ + InterfaceUp: true, + WgIP: "10.0.0.1", + PeerCount: 1, + Peers: []inspector.WGPeer{ + {PublicKey: "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA=", AllowedIPs: "10.0.0.2/32", LatestHandshake: 0}, + }, + } + data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd}) + results := CheckWireGuard(data) + expectStatus(t, results, "wg.handshake_all", inspector.StatusFail) +} + +func TestCheckWireGuard_NoTraffic(t *testing.T) { + nd := makeNodeData("1.1.1.1", "node") + nd.WireGuard = &inspector.WireGuardData{ + InterfaceUp: true, + WgIP: "10.0.0.1", + PeerCount: 1, + Peers: []inspector.WGPeer{ + {PublicKey: "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA=", AllowedIPs: "10.0.0.2/32", LatestHandshake: time.Now().Unix(), TransferRx: 0, TransferTx: 0}, + }, + } + data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd}) + results := CheckWireGuard(data) + expectStatus(t, results, "wg.peer_traffic", inspector.StatusWarn) +} + +func TestCheckWireGuard_CatchAllRoute(t *testing.T) { + nd := makeNodeData("1.1.1.1", "node") + nd.WireGuard = &inspector.WireGuardData{ + InterfaceUp: true, + WgIP: "10.0.0.1", + PeerCount: 1, + Peers: []inspector.WGPeer{ + {PublicKey: "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA=", AllowedIPs: "0.0.0.0/0", 
LatestHandshake: time.Now().Unix(), TransferRx: 100, TransferTx: 200}, + }, + } + data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd}) + results := CheckWireGuard(data) + expectStatus(t, results, "wg.peer_catch_all", inspector.StatusFail) +} + +func TestCheckWireGuard_CrossNode_PeerCountConsistent(t *testing.T) { + nodes := map[string]*inspector.NodeData{} + for _, host := range []string{"1.1.1.1", "2.2.2.2", "3.3.3.3"} { + nd := makeNodeData(host, "node") + nd.WireGuard = &inspector.WireGuardData{InterfaceUp: true, PeerCount: 2, MTU: 1420} + nodes[host] = nd + } + data := makeCluster(nodes) + results := CheckWireGuard(data) + expectStatus(t, results, "wg.peer_count_consistent", inspector.StatusPass) + expectStatus(t, results, "wg.mtu_consistent", inspector.StatusPass) +} + +func TestCheckWireGuard_CrossNode_PeerCountInconsistent(t *testing.T) { + nodes := map[string]*inspector.NodeData{} + counts := []int{2, 2, 1} + for i, host := range []string{"1.1.1.1", "2.2.2.2", "3.3.3.3"} { + nd := makeNodeData(host, "node") + nd.WireGuard = &inspector.WireGuardData{InterfaceUp: true, PeerCount: counts[i], MTU: 1420} + nodes[host] = nd + } + data := makeCluster(nodes) + results := CheckWireGuard(data) + expectStatus(t, results, "wg.peer_count_consistent", inspector.StatusWarn) +} + +func TestCheckWireGuard_NilData(t *testing.T) { + nd := makeNodeData("1.1.1.1", "node") + data := makeCluster(map[string]*inspector.NodeData{"1.1.1.1": nd}) + results := CheckWireGuard(data) + if len(results) != 0 { + t.Errorf("expected 0 results for nil WireGuard data, got %d", len(results)) + } +} diff --git a/pkg/inspector/collector.go b/pkg/inspector/collector.go new file mode 100644 index 0000000..520d981 --- /dev/null +++ b/pkg/inspector/collector.go @@ -0,0 +1,1268 @@ +package inspector + +import ( + "context" + "encoding/json" + "fmt" + "strconv" + "strings" + "sync" + "time" +) + +// ClusterData holds all collected data from the cluster. 
+type ClusterData struct { + Nodes map[string]*NodeData // keyed by host IP + Duration time.Duration +} + +// NodeData holds collected data for a single node. +type NodeData struct { + Node Node + RQLite *RQLiteData + Olric *OlricData + IPFS *IPFSData + DNS *DNSData + WireGuard *WireGuardData + System *SystemData + Network *NetworkData + Namespaces []NamespaceData // namespace instances on this node + Errors []string // collection errors for this node +} + +// NamespaceData holds data for a single namespace on a node. +type NamespaceData struct { + Name string // namespace name (from systemd unit) + PortBase int // starting port of the 5-port block + RQLiteUp bool // RQLite HTTP port responding + RQLiteState string // Raft state (Leader/Follower) + RQLiteReady bool // /readyz + OlricUp bool // Olric memberlist port listening + GatewayUp bool // Gateway HTTP port responding + GatewayStatus int // HTTP status code from gateway health +} + +// RQLiteData holds parsed RQLite status from a single node. +type RQLiteData struct { + Responsive bool + StatusRaw string // raw JSON from /status + NodesRaw string // raw JSON from /nodes?nonvoters + ReadyzRaw string // raw response from /readyz + DebugRaw string // raw JSON from /debug/vars + Status *RQLiteStatus // parsed /status + Nodes map[string]*RQLiteNode // parsed /nodes + Readyz *RQLiteReadyz // parsed /readyz + DebugVars *RQLiteDebugVars // parsed /debug/vars + StrongRead bool // SELECT 1 with level=strong succeeded +} + +// RQLiteDebugVars holds metrics from /debug/vars. +type RQLiteDebugVars struct { + QueryErrors uint64 + ExecuteErrors uint64 + RemoteExecErrors uint64 + LeaderNotFound uint64 + SnapshotErrors uint64 + ClientRetries uint64 + ClientTimeouts uint64 +} + +// RQLiteStatus holds parsed fields from /status. 
+type RQLiteStatus struct { + RaftState string // Leader, Follower, Candidate, Shutdown + LeaderNodeID string // store.leader.node_id + LeaderAddr string // store.leader.addr + NodeID string // store.node_id + Term uint64 // store.raft.term (current_term) + AppliedIndex uint64 // store.raft.applied_index + CommitIndex uint64 // store.raft.commit_index + FsmPending uint64 // store.raft.fsm_pending + LastContact string // store.raft.last_contact (followers only) + LastLogIndex uint64 // store.raft.last_log_index + LastLogTerm uint64 // store.raft.last_log_term + NumPeers int // store.raft.num_peers (string in JSON) + LastSnapshot uint64 // store.raft.last_snapshot_index + Voter bool // store.raft.voter + DBSize int64 // store.sqlite3.db_size + DBSizeFriendly string // store.sqlite3.db_size_friendly + DBAppliedIndex uint64 // store.db_applied_index + FsmIndex uint64 // store.fsm_index + Uptime string // http.uptime + Version string // build.version + GoVersion string // runtime.GOARCH + runtime.version + Goroutines int // runtime.num_goroutine + HeapAlloc uint64 // runtime.memory.heap_alloc (bytes) +} + +// RQLiteNode holds parsed fields from /nodes response per node. +type RQLiteNode struct { + Addr string + Reachable bool + Leader bool + Voter bool + Time float64 // response time + Error string +} + +// RQLiteReadyz holds parsed readiness state. +type RQLiteReadyz struct { + Ready bool + Store string // "ready" or error + Leader string // "ready" or error + Node string // "ready" or error + RawBody string +} + +// OlricData holds parsed Olric status from a single node. 
+type OlricData struct { + ServiceActive bool + MemberlistUp bool + MemberCount int + Members []string // memberlist member addresses + Coordinator string // current coordinator address + LogErrors int // error count in recent logs + LogSuspects int // "suspect" or "Marking as failed" count + LogFlapping int // rapid join/leave count + ProcessMemMB int // RSS memory in MB + RestartCount int // NRestarts from systemd +} + +// IPFSData holds parsed IPFS status from a single node. +type IPFSData struct { + DaemonActive bool + ClusterActive bool + SwarmPeerCount int + ClusterPeerCount int + RepoSizeBytes int64 + RepoMaxBytes int64 + KuboVersion string + ClusterVersion string + ClusterErrors int // peers reporting errors + HasSwarmKey bool + BootstrapEmpty bool // true if bootstrap list is empty (private swarm) +} + +// DNSData holds parsed DNS/CoreDNS status from a nameserver node. +type DNSData struct { + CoreDNSActive bool + CaddyActive bool + Port53Bound bool + Port80Bound bool + Port443Bound bool + CoreDNSMemMB int + CoreDNSRestarts int + LogErrors int // error count in recent CoreDNS logs + // Resolution tests (dig results) + SOAResolves bool + NSResolves bool + NSRecordCount int + WildcardResolves bool + BaseAResolves bool + // TLS + BaseTLSDaysLeft int // -1 = failed to check + WildTLSDaysLeft int // -1 = failed to check + // Corefile + CorefileExists bool +} + +// WireGuardData holds parsed WireGuard status from a node. +type WireGuardData struct { + InterfaceUp bool + ServiceActive bool + WgIP string + PeerCount int + Peers []WGPeer + MTU int + ListenPort int + ConfigExists bool + ConfigPerms string // e.g. "600" +} + +// WGPeer holds parsed data for a single WireGuard peer. +type WGPeer struct { + PublicKey string + Endpoint string + AllowedIPs string + LatestHandshake int64 // seconds since epoch, 0 = never + TransferRx int64 + TransferTx int64 + Keepalive int +} + +// SystemData holds parsed system-level data from a node. 
+type SystemData struct { + Services map[string]string // service name → status + FailedUnits []string // systemd units in failed state + MemTotalMB int + MemUsedMB int + MemFreeMB int + DiskTotalGB string + DiskUsedGB string + DiskAvailGB string + DiskUsePct int + UptimeRaw string + LoadAvg string + CPUCount int + OOMKills int + SwapUsedMB int + SwapTotalMB int + InodePct int // inode usage percentage + ListeningPorts []int // ports from ss -tlnp + UFWActive bool + ProcessUser string // user running debros-node (e.g. "debros") + PanicCount int // panic/fatal in recent logs +} + +// NetworkData holds parsed network-level data from a node. +type NetworkData struct { + InternetReachable bool + TCPEstablished int + TCPTimeWait int + TCPRetransRate float64 // retransmission % from /proc/net/snmp + DefaultRoute bool + WGRouteExists bool + PingResults map[string]bool // WG peer IP → ping success +} + +// Collect gathers data from all nodes in parallel. +func Collect(ctx context.Context, nodes []Node, subsystems []string, verbose bool) *ClusterData { + start := time.Now() + data := &ClusterData{ + Nodes: make(map[string]*NodeData, len(nodes)), + } + + var mu sync.Mutex + var wg sync.WaitGroup + + for _, node := range nodes { + wg.Add(1) + go func(n Node) { + defer wg.Done() + nd := collectNode(ctx, n, subsystems, verbose) + mu.Lock() + data.Nodes[n.Host] = nd + mu.Unlock() + }(node) + } + + wg.Wait() + data.Duration = time.Since(start) + return data +} + +func collectNode(ctx context.Context, node Node, subsystems []string, verbose bool) *NodeData { + nd := &NodeData{Node: node} + + shouldCollect := func(name string) bool { + if len(subsystems) == 0 { + return true + } + for _, s := range subsystems { + if s == name || s == "all" { + return true + } + } + return false + } + + if shouldCollect("rqlite") { + nd.RQLite = collectRQLite(ctx, node, verbose) + } + if shouldCollect("olric") { + nd.Olric = collectOlric(ctx, node) + } + if shouldCollect("ipfs") { + nd.IPFS = 
collectIPFS(ctx, node) + } + if shouldCollect("dns") && node.IsNameserver() { + nd.DNS = collectDNS(ctx, node) + } + if shouldCollect("wireguard") || shouldCollect("wg") { + nd.WireGuard = collectWireGuard(ctx, node) + } + if shouldCollect("system") { + nd.System = collectSystem(ctx, node) + } + if shouldCollect("network") { + nd.Network = collectNetwork(ctx, node, nd.WireGuard) + } + // Namespace collection — always collect if any subsystem is collected + nd.Namespaces = collectNamespaces(ctx, node) + + return nd +} + +// collectRQLite gathers RQLite data from a node via SSH. +func collectRQLite(ctx context.Context, node Node, verbose bool) *RQLiteData { + data := &RQLiteData{} + + // Collect all endpoints in a single SSH session for efficiency. + // We use a separator to split the outputs. + cmd := ` +SEP="===INSPECTOR_SEP===" +echo "$SEP" +curl -sf http://localhost:5001/status 2>/dev/null || echo '{"error":"unreachable"}' +echo "$SEP" +curl -sf 'http://localhost:5001/nodes?nonvoters' 2>/dev/null || echo '{"error":"unreachable"}' +echo "$SEP" +curl -sf http://localhost:5001/readyz 2>/dev/null; echo "EXIT:$?" 
+echo "$SEP" +curl -sf http://localhost:5001/debug/vars 2>/dev/null || echo '{"error":"unreachable"}' +echo "$SEP" +curl -sf -H 'Content-Type: application/json' 'http://localhost:5001/db/query?level=strong' -d '["SELECT 1"]' 2>/dev/null && echo "STRONG_OK" || echo "STRONG_FAIL" +` + + result := RunSSH(ctx, node, cmd) + if !result.OK() && result.Stdout == "" { + return data + } + + parts := strings.Split(result.Stdout, "===INSPECTOR_SEP===") + if len(parts) < 5 { + return data + } + + data.StatusRaw = strings.TrimSpace(parts[1]) + data.NodesRaw = strings.TrimSpace(parts[2]) + readyzSection := strings.TrimSpace(parts[3]) + data.DebugRaw = strings.TrimSpace(parts[4]) + + // Parse /status + if data.StatusRaw != "" && !strings.Contains(data.StatusRaw, `"error":"unreachable"`) { + data.Responsive = true + data.Status = parseRQLiteStatus(data.StatusRaw) + } + + // Parse /nodes + if data.NodesRaw != "" && !strings.Contains(data.NodesRaw, `"error":"unreachable"`) { + data.Nodes = parseRQLiteNodes(data.NodesRaw) + } + + // Parse /readyz + data.Readyz = parseRQLiteReadyz(readyzSection) + + // Parse /debug/vars + if data.DebugRaw != "" && !strings.Contains(data.DebugRaw, `"error":"unreachable"`) { + data.DebugVars = parseRQLiteDebugVars(data.DebugRaw) + } + + // Parse strong read + if len(parts) > 5 { + data.StrongRead = strings.Contains(parts[5], "STRONG_OK") + } + + return data +} + +func parseRQLiteStatus(raw string) *RQLiteStatus { + var m map[string]interface{} + if err := json.Unmarshal([]byte(raw), &m); err != nil { + return nil + } + + s := &RQLiteStatus{} + + store, _ := m["store"].(map[string]interface{}) + if store == nil { + return s + } + + // Raft state + raft, _ := store["raft"].(map[string]interface{}) + if raft != nil { + s.RaftState, _ = raft["state"].(string) + s.Term = jsonUint64(raft, "current_term") + s.AppliedIndex = jsonUint64(raft, "applied_index") + s.CommitIndex = jsonUint64(raft, "commit_index") + s.FsmPending = jsonUint64(raft, "fsm_pending") + 
s.LastContact, _ = raft["last_contact"].(string) + s.LastLogIndex = jsonUint64(raft, "last_log_index") + s.LastLogTerm = jsonUint64(raft, "last_log_term") + s.LastSnapshot = jsonUint64(raft, "last_snapshot_index") + s.Voter = jsonBool(raft, "voter") + + // num_peers can be a string or number + if np, ok := raft["num_peers"].(string); ok { + s.NumPeers, _ = strconv.Atoi(np) + } else if np, ok := raft["num_peers"].(float64); ok { + s.NumPeers = int(np) + } + } + + // Leader info + leader, _ := store["leader"].(map[string]interface{}) + if leader != nil { + s.LeaderNodeID, _ = leader["node_id"].(string) + s.LeaderAddr, _ = leader["addr"].(string) + } + + s.NodeID, _ = store["node_id"].(string) + s.DBAppliedIndex = jsonUint64(store, "db_applied_index") + s.FsmIndex = jsonUint64(store, "fsm_index") + + // SQLite + sqlite3, _ := store["sqlite3"].(map[string]interface{}) + if sqlite3 != nil { + s.DBSize = int64(jsonUint64(sqlite3, "db_size")) + s.DBSizeFriendly, _ = sqlite3["db_size_friendly"].(string) + } + + // HTTP + httpMap, _ := m["http"].(map[string]interface{}) + if httpMap != nil { + s.Uptime, _ = httpMap["uptime"].(string) + } + + // Build + build, _ := m["build"].(map[string]interface{}) + if build != nil { + s.Version, _ = build["version"].(string) + } + + // Runtime + runtime, _ := m["runtime"].(map[string]interface{}) + if runtime != nil { + if ng, ok := runtime["num_goroutine"].(float64); ok { + s.Goroutines = int(ng) + } + s.GoVersion, _ = runtime["version"].(string) + if mem, ok := runtime["memory"].(map[string]interface{}); ok { + s.HeapAlloc = jsonUint64(mem, "heap_alloc") + } + } + + return s +} + +func parseRQLiteNodes(raw string) map[string]*RQLiteNode { + var m map[string]interface{} + if err := json.Unmarshal([]byte(raw), &m); err != nil { + return nil + } + + nodes := make(map[string]*RQLiteNode, len(m)) + for addr, v := range m { + info, _ := v.(map[string]interface{}) + if info == nil { + continue + } + n := &RQLiteNode{ + Addr: addr, + 
Reachable: jsonBool(info, "reachable"), + Leader: jsonBool(info, "leader"), + Voter: jsonBool(info, "voter"), + } + if t, ok := info["time"].(float64); ok { + n.Time = t + } + if e, ok := info["error"].(string); ok { + n.Error = e + } + nodes[addr] = n + } + return nodes +} + +func parseRQLiteReadyz(raw string) *RQLiteReadyz { + r := &RQLiteReadyz{RawBody: raw} + + // /readyz returns body like "[+]node ok\n[+]leader ok\n[+]store ok" with exit 0 + // or "[-]leader not ok\n..." with non-zero exit + lines := strings.Split(raw, "\n") + for _, line := range lines { + line = strings.TrimSpace(line) + if strings.HasPrefix(line, "[+]node") { + r.Node = "ready" + } else if strings.HasPrefix(line, "[-]node") { + r.Node = "not ready" + } else if strings.HasPrefix(line, "[+]leader") { + r.Leader = "ready" + } else if strings.HasPrefix(line, "[-]leader") { + r.Leader = "not ready" + } else if strings.HasPrefix(line, "[+]store") { + r.Store = "ready" + } else if strings.HasPrefix(line, "[-]store") { + r.Store = "not ready" + } + } + + r.Ready = r.Node == "ready" && r.Leader == "ready" && r.Store == "ready" + + // Check exit code from our appended "EXIT:$?" + for _, line := range lines { + if strings.HasPrefix(line, "EXIT:0") { + r.Ready = true + } + } + + return r +} + +func parseRQLiteDebugVars(raw string) *RQLiteDebugVars { + var m map[string]interface{} + if err := json.Unmarshal([]byte(raw), &m); err != nil { + return nil + } + + d := &RQLiteDebugVars{} + + // /debug/vars has flat keys like "store.query_errors", "store.execute_errors", etc. + // But they can also be nested under "cmdstats" or flat depending on rqlite version. + // Try flat numeric keys first. 
+ getUint := func(keys ...string) uint64 { + for _, key := range keys { + if v, ok := m[key]; ok { + switch val := v.(type) { + case float64: + return uint64(val) + case string: + n, _ := strconv.ParseUint(val, 10, 64) + return n + } + } + } + return 0 + } + + d.QueryErrors = getUint("query_errors", "store.query_errors") + d.ExecuteErrors = getUint("execute_errors", "store.execute_errors") + d.RemoteExecErrors = getUint("remote_execute_errors", "store.remote_execute_errors") + d.LeaderNotFound = getUint("leader_not_found", "store.leader_not_found") + d.SnapshotErrors = getUint("snapshot_errors", "store.snapshot_errors") + d.ClientRetries = getUint("client_retries", "cluster.client_retries") + d.ClientTimeouts = getUint("client_timeouts", "cluster.client_timeouts") + + return d +} + +// Placeholder collectors for Phase 2 + +func collectOlric(ctx context.Context, node Node) *OlricData { + data := &OlricData{} + + cmd := ` +SEP="===INSPECTOR_SEP===" +echo "$SEP" +systemctl is-active debros-olric 2>/dev/null +echo "$SEP" +ss -tlnp 2>/dev/null | grep ':3322 ' | head -1 +echo "$SEP" +journalctl -u debros-olric --no-pager -n 200 --since "1 hour ago" 2>/dev/null | grep -ciE '(error|ERR)' || echo 0 +echo "$SEP" +journalctl -u debros-olric --no-pager -n 200 --since "1 hour ago" 2>/dev/null | grep -ciE '(suspect|marking.*(failed|dead))' || echo 0 +echo "$SEP" +journalctl -u debros-olric --no-pager -n 200 --since "1 hour ago" 2>/dev/null | grep -ciE '(memberlist.*(join|leave))' || echo 0 +echo "$SEP" +systemctl show debros-olric --property=NRestarts 2>/dev/null | cut -d= -f2 +echo "$SEP" +ps -C olric-server -o rss= 2>/dev/null | head -1 || echo 0 +` + res := RunSSH(ctx, node, cmd) + if !res.OK() && res.Stdout == "" { + return data + } + + parts := strings.Split(res.Stdout, "===INSPECTOR_SEP===") + if len(parts) < 8 { + return data + } + + data.ServiceActive = strings.TrimSpace(parts[1]) == "active" + data.MemberlistUp = strings.TrimSpace(parts[2]) != "" + + data.LogErrors = 
parseIntDefault(strings.TrimSpace(parts[3]), 0) + data.LogSuspects = parseIntDefault(strings.TrimSpace(parts[4]), 0) + data.LogFlapping = parseIntDefault(strings.TrimSpace(parts[5]), 0) + data.RestartCount = parseIntDefault(strings.TrimSpace(parts[6]), 0) + + rssKB := parseIntDefault(strings.TrimSpace(parts[7]), 0) + data.ProcessMemMB = rssKB / 1024 + + return data +} + +func collectIPFS(ctx context.Context, node Node) *IPFSData { + data := &IPFSData{} + + cmd := ` +SEP="===INSPECTOR_SEP===" +echo "$SEP" +systemctl is-active debros-ipfs 2>/dev/null +echo "$SEP" +systemctl is-active debros-ipfs-cluster 2>/dev/null +echo "$SEP" +curl -sf -X POST 'http://localhost:4501/api/v0/swarm/peers' 2>/dev/null | python3 -c "import sys,json; d=json.load(sys.stdin); print(len(d.get('Peers') or []))" 2>/dev/null || echo -1 +echo "$SEP" +curl -sf 'http://localhost:9094/peers' 2>/dev/null | python3 -c "import sys,json; peers=json.load(sys.stdin); print(len(peers)); errs=sum(1 for p in peers if p.get('error','')); print(errs)" 2>/dev/null || echo -1 +echo "$SEP" +curl -sf -X POST 'http://localhost:4501/api/v0/repo/stat' 2>/dev/null | python3 -c "import sys,json; d=json.load(sys.stdin); print(d.get('RepoSize',0)); print(d.get('StorageMax',0))" 2>/dev/null || echo -1 +echo "$SEP" +curl -sf -X POST 'http://localhost:4501/api/v0/version' 2>/dev/null | python3 -c "import sys,json; print(json.load(sys.stdin).get('Version',''))" 2>/dev/null || echo unknown +echo "$SEP" +curl -sf 'http://localhost:9094/id' 2>/dev/null | python3 -c "import sys,json; print(json.load(sys.stdin).get('version',''))" 2>/dev/null || echo unknown +echo "$SEP" +test -f /home/debros/.orama/data/ipfs/repo/swarm.key && echo yes || echo no +echo "$SEP" +curl -sf -X POST 'http://localhost:4501/api/v0/bootstrap/list' 2>/dev/null | python3 -c "import sys,json; peers=json.load(sys.stdin).get('Peers',[]); print(len(peers))" 2>/dev/null || echo -1 +` + res := RunSSH(ctx, node, cmd) + if !res.OK() && res.Stdout == "" { + return 
data + } + + parts := strings.Split(res.Stdout, "===INSPECTOR_SEP===") + if len(parts) < 10 { + return data + } + + data.DaemonActive = strings.TrimSpace(parts[1]) == "active" + data.ClusterActive = strings.TrimSpace(parts[2]) == "active" + data.SwarmPeerCount = parseIntDefault(strings.TrimSpace(parts[3]), -1) + + // Cluster peers: first line = count, second = errors + clusterLines := strings.Split(strings.TrimSpace(parts[4]), "\n") + if len(clusterLines) >= 1 { + data.ClusterPeerCount = parseIntDefault(strings.TrimSpace(clusterLines[0]), -1) + } + if len(clusterLines) >= 2 { + data.ClusterErrors = parseIntDefault(strings.TrimSpace(clusterLines[1]), 0) + } + + // Repo stat: first line = size, second = max + repoLines := strings.Split(strings.TrimSpace(parts[5]), "\n") + if len(repoLines) >= 1 { + data.RepoSizeBytes = int64(parseIntDefault(strings.TrimSpace(repoLines[0]), 0)) + } + if len(repoLines) >= 2 { + data.RepoMaxBytes = int64(parseIntDefault(strings.TrimSpace(repoLines[1]), 0)) + } + + data.KuboVersion = strings.TrimSpace(parts[6]) + data.ClusterVersion = strings.TrimSpace(parts[7]) + data.HasSwarmKey = strings.TrimSpace(parts[8]) == "yes" + + bootstrapCount := parseIntDefault(strings.TrimSpace(parts[9]), -1) + data.BootstrapEmpty = bootstrapCount == 0 + + return data +} + +func collectDNS(ctx context.Context, node Node) *DNSData { + data := &DNSData{ + BaseTLSDaysLeft: -1, + WildTLSDaysLeft: -1, + } + + // Get the domain from the node's role (e.g. 
"nameserver-ns1" -> we need the domain) + // We'll discover the domain from Corefile + cmd := ` +SEP="===INSPECTOR_SEP===" +echo "$SEP" +systemctl is-active coredns 2>/dev/null +echo "$SEP" +systemctl is-active caddy 2>/dev/null +echo "$SEP" +ss -ulnp 2>/dev/null | grep ':53 ' | head -1 +echo "$SEP" +ss -tlnp 2>/dev/null | grep ':80 ' | head -1 +echo "$SEP" +ss -tlnp 2>/dev/null | grep ':443 ' | head -1 +echo "$SEP" +ps -C coredns -o rss= 2>/dev/null | head -1 || echo 0 +echo "$SEP" +systemctl show coredns --property=NRestarts 2>/dev/null | cut -d= -f2 +echo "$SEP" +journalctl -u coredns --no-pager -n 100 --since "5 minutes ago" 2>/dev/null | grep -ciE '(error|ERR)' || echo 0 +echo "$SEP" +test -f /etc/coredns/Corefile && echo yes || echo no +echo "$SEP" +DOMAIN=$(grep -oP '^\S+(?=\s*\{)' /etc/coredns/Corefile 2>/dev/null | grep -v '^\.' | head -1) +echo "DOMAIN:${DOMAIN}" +dig @127.0.0.1 SOA ${DOMAIN} +short 2>/dev/null | head -1 +echo "$SEP" +dig @127.0.0.1 NS ${DOMAIN} +short 2>/dev/null +echo "$SEP" +dig @127.0.0.1 A test-wildcard.${DOMAIN} +short 2>/dev/null | head -1 +echo "$SEP" +dig @127.0.0.1 A ${DOMAIN} +short 2>/dev/null | head -1 +echo "$SEP" +echo | openssl s_client -servername ${DOMAIN} -connect localhost:443 2>/dev/null | openssl x509 -noout -dates 2>/dev/null | grep notAfter | cut -d= -f2 +echo "$SEP" +echo | openssl s_client -servername "*.${DOMAIN}" -connect localhost:443 2>/dev/null | openssl x509 -noout -dates 2>/dev/null | grep notAfter | cut -d= -f2 +` + res := RunSSH(ctx, node, cmd) + if !res.OK() && res.Stdout == "" { + return data + } + + parts := strings.Split(res.Stdout, "===INSPECTOR_SEP===") + if len(parts) < 9 { + return data + } + + data.CoreDNSActive = strings.TrimSpace(parts[1]) == "active" + data.CaddyActive = strings.TrimSpace(parts[2]) == "active" + data.Port53Bound = strings.TrimSpace(parts[3]) != "" + data.Port80Bound = strings.TrimSpace(parts[4]) != "" + data.Port443Bound = strings.TrimSpace(parts[5]) != "" + + rssKB := 
parseIntDefault(strings.TrimSpace(parts[6]), 0) + data.CoreDNSMemMB = rssKB / 1024 + data.CoreDNSRestarts = parseIntDefault(strings.TrimSpace(parts[7]), 0) + data.LogErrors = parseIntDefault(strings.TrimSpace(parts[8]), 0) + + // Corefile exists + if len(parts) > 9 { + data.CorefileExists = strings.TrimSpace(parts[9]) == "yes" + } + + // SOA resolution + if len(parts) > 10 { + soaSection := strings.TrimSpace(parts[10]) + // First line might be DOMAIN:xxx, rest is dig output + for _, line := range strings.Split(soaSection, "\n") { + line = strings.TrimSpace(line) + if strings.HasPrefix(line, "DOMAIN:") { + continue + } + if line != "" { + data.SOAResolves = true + } + } + } + + // NS records + if len(parts) > 11 { + nsSection := strings.TrimSpace(parts[11]) + count := 0 + for _, line := range strings.Split(nsSection, "\n") { + if strings.TrimSpace(line) != "" { + count++ + } + } + data.NSRecordCount = count + data.NSResolves = count > 0 + } + + // Wildcard resolution + if len(parts) > 12 { + data.WildcardResolves = strings.TrimSpace(parts[12]) != "" + } + + // Base A record + if len(parts) > 13 { + data.BaseAResolves = strings.TrimSpace(parts[13]) != "" + } + + // TLS cert days left (base domain) + if len(parts) > 14 { + data.BaseTLSDaysLeft = parseTLSExpiry(strings.TrimSpace(parts[14])) + } + + // TLS cert days left (wildcard) + if len(parts) > 15 { + data.WildTLSDaysLeft = parseTLSExpiry(strings.TrimSpace(parts[15])) + } + + return data +} + +// parseTLSExpiry parses an openssl date string and returns days until expiry (-1 on error). 
+func parseTLSExpiry(dateStr string) int { + if dateStr == "" { + return -1 + } + // OpenSSL format: "Jan 2 15:04:05 2006 GMT" + layouts := []string{ + "Jan 2 15:04:05 2006 GMT", + "Jan 2 15:04:05 2006 GMT", + } + for _, layout := range layouts { + if t, err := time.Parse(layout, dateStr); err == nil { + days := int(time.Until(t).Hours() / 24) + return days + } + } + return -1 +} + +func collectWireGuard(ctx context.Context, node Node) *WireGuardData { + data := &WireGuardData{} + + cmd := ` +SEP="===INSPECTOR_SEP===" +echo "$SEP" +ip -4 addr show wg0 2>/dev/null | grep -oP 'inet \K[0-9.]+' +echo "$SEP" +systemctl is-active wg-quick@wg0 2>/dev/null +echo "$SEP" +cat /sys/class/net/wg0/mtu 2>/dev/null || echo 0 +echo "$SEP" +sudo wg show wg0 dump 2>/dev/null +echo "$SEP" +test -f /etc/wireguard/wg0.conf && echo yes || echo no +echo "$SEP" +stat -c '%a' /etc/wireguard/wg0.conf 2>/dev/null || echo 000 +` + res := RunSSH(ctx, node, cmd) + if !res.OK() && res.Stdout == "" { + return data + } + + parts := strings.Split(res.Stdout, "===INSPECTOR_SEP===") + if len(parts) < 7 { + return data + } + + wgIP := strings.TrimSpace(parts[1]) + data.WgIP = wgIP + data.InterfaceUp = wgIP != "" + data.ServiceActive = strings.TrimSpace(parts[2]) == "active" + data.MTU = parseIntDefault(strings.TrimSpace(parts[3]), 0) + data.ConfigExists = strings.TrimSpace(parts[5]) == "yes" + data.ConfigPerms = strings.TrimSpace(parts[6]) + + // Parse wg show dump output + // First line = interface: private-key public-key listen-port fwmark + // Subsequent lines = peers: public-key preshared-key endpoint allowed-ips latest-handshake transfer-rx transfer-tx persistent-keepalive + dumpLines := strings.Split(strings.TrimSpace(parts[4]), "\n") + if len(dumpLines) >= 1 { + ifFields := strings.Split(dumpLines[0], "\t") + if len(ifFields) >= 3 { + data.ListenPort = parseIntDefault(ifFields[2], 0) + } + } + for _, line := range dumpLines[1:] { + fields := strings.Split(line, "\t") + if len(fields) < 8 { + 
continue + } + handshake := int64(parseIntDefault(fields[4], 0)) + rx := int64(parseIntDefault(fields[5], 0)) + tx := int64(parseIntDefault(fields[6], 0)) + keepalive := parseIntDefault(fields[7], 0) + + data.Peers = append(data.Peers, WGPeer{ + PublicKey: fields[0], + Endpoint: fields[2], + AllowedIPs: fields[3], + LatestHandshake: handshake, + TransferRx: rx, + TransferTx: tx, + Keepalive: keepalive, + }) + } + data.PeerCount = len(data.Peers) + + return data +} + +func collectSystem(ctx context.Context, node Node) *SystemData { + data := &SystemData{ + Services: make(map[string]string), + } + + services := []string{ + "debros-node", "debros-ipfs", "debros-ipfs-cluster", + "debros-olric", "debros-anyone-relay", "debros-anyone-client", + "coredns", "caddy", "wg-quick@wg0", + } + + cmd := `SEP="===INSPECTOR_SEP==="` + // Service statuses + for _, svc := range services { + cmd += fmt.Sprintf(` && echo "%s:$(systemctl is-active %s 2>/dev/null || echo inactive)"`, svc, svc) + } + cmd += ` && echo "$SEP"` + cmd += ` && free -m | awk '/Mem:/{print $2","$3","$4} /Swap:/{print "SWAP:"$2","$3}'` + cmd += ` && echo "$SEP"` + cmd += ` && df -h / | awk 'NR==2{print $2","$3","$4","$5}'` + cmd += ` && echo "$SEP"` + cmd += ` && uptime -s 2>/dev/null || echo unknown` + cmd += ` && echo "$SEP"` + cmd += ` && nproc 2>/dev/null || echo 1` + cmd += ` && echo "$SEP"` + cmd += ` && uptime | grep -oP 'load average: \K.*'` + cmd += ` && echo "$SEP"` + cmd += ` && systemctl --failed --no-legend --no-pager 2>/dev/null | awk '{print $1}'` + cmd += ` && echo "$SEP"` + cmd += ` && dmesg 2>/dev/null | grep -ci 'out of memory' || echo 0` + cmd += ` && echo "$SEP"` + cmd += ` && df -i / 2>/dev/null | awk 'NR==2{print $5}' | tr -d '%'` + cmd += ` && echo "$SEP"` + cmd += ` && ss -tlnp 2>/dev/null | awk 'NR>1{split($4,a,":"); print a[length(a)]}' | sort -un` + cmd += ` && echo "$SEP"` + cmd += ` && ufw status 2>/dev/null | head -1` + cmd += ` && echo "$SEP"` + cmd += ` && ps -C debros-node -o 
user= 2>/dev/null | head -1 || echo unknown` + cmd += ` && echo "$SEP"` + cmd += ` && journalctl -u debros-node --no-pager -n 500 --since "1 hour ago" 2>/dev/null | grep -ciE '(panic|fatal)' || echo 0` + + res := RunSSH(ctx, node, cmd) + if !res.OK() && res.Stdout == "" { + return data + } + + parts := strings.Split(res.Stdout, "===INSPECTOR_SEP===") + + // Part 0: service statuses (before first SEP) + if len(parts) > 0 { + for _, line := range strings.Split(strings.TrimSpace(parts[0]), "\n") { + line = strings.TrimSpace(line) + if idx := strings.Index(line, ":"); idx > 0 { + data.Services[line[:idx]] = line[idx+1:] + } + } + } + + // Part 1: memory + if len(parts) > 1 { + for _, line := range strings.Split(strings.TrimSpace(parts[1]), "\n") { + line = strings.TrimSpace(line) + if strings.HasPrefix(line, "SWAP:") { + swapParts := strings.Split(strings.TrimPrefix(line, "SWAP:"), ",") + if len(swapParts) >= 2 { + data.SwapTotalMB = parseIntDefault(swapParts[0], 0) + data.SwapUsedMB = parseIntDefault(swapParts[1], 0) + } + } else { + memParts := strings.Split(line, ",") + if len(memParts) >= 3 { + data.MemTotalMB = parseIntDefault(memParts[0], 0) + data.MemUsedMB = parseIntDefault(memParts[1], 0) + data.MemFreeMB = parseIntDefault(memParts[2], 0) + } + } + } + } + + // Part 2: disk + if len(parts) > 2 { + diskParts := strings.Split(strings.TrimSpace(parts[2]), ",") + if len(diskParts) >= 4 { + data.DiskTotalGB = diskParts[0] + data.DiskUsedGB = diskParts[1] + data.DiskAvailGB = diskParts[2] + pct := strings.TrimSuffix(diskParts[3], "%") + data.DiskUsePct = parseIntDefault(pct, 0) + } + } + + // Part 3: uptime + if len(parts) > 3 { + data.UptimeRaw = strings.TrimSpace(parts[3]) + } + + // Part 4: CPU count + if len(parts) > 4 { + data.CPUCount = parseIntDefault(strings.TrimSpace(parts[4]), 1) + } + + // Part 5: load average + if len(parts) > 5 { + data.LoadAvg = strings.TrimSpace(parts[5]) + } + + // Part 6: failed units + if len(parts) > 6 { + for _, line := range 
strings.Split(strings.TrimSpace(parts[6]), "\n") { + line = strings.TrimSpace(line) + if line != "" { + data.FailedUnits = append(data.FailedUnits, line) + } + } + } + + // Part 7: OOM kills + if len(parts) > 7 { + data.OOMKills = parseIntDefault(strings.TrimSpace(parts[7]), 0) + } + + // Part 8: inode usage + if len(parts) > 8 { + data.InodePct = parseIntDefault(strings.TrimSpace(parts[8]), 0) + } + + // Part 9: listening ports + if len(parts) > 9 { + for _, line := range strings.Split(strings.TrimSpace(parts[9]), "\n") { + line = strings.TrimSpace(line) + if p := parseIntDefault(line, 0); p > 0 { + data.ListeningPorts = append(data.ListeningPorts, p) + } + } + } + + // Part 10: UFW status + if len(parts) > 10 { + data.UFWActive = strings.Contains(strings.TrimSpace(parts[10]), "active") + } + + // Part 11: process user + if len(parts) > 11 { + data.ProcessUser = strings.TrimSpace(parts[11]) + } + + // Part 12: panic count + if len(parts) > 12 { + data.PanicCount = parseIntDefault(strings.TrimSpace(parts[12]), 0) + } + + return data +} + +func collectNetwork(ctx context.Context, node Node, wg *WireGuardData) *NetworkData { + data := &NetworkData{ + PingResults: make(map[string]bool), + } + + // Build ping commands for WG peer IPs + var pingCmds string + if wg != nil { + for _, peer := range wg.Peers { + // Extract IP from AllowedIPs (e.g. 
"10.0.0.2/32") + ip := strings.Split(peer.AllowedIPs, "/")[0] + if ip != "" && strings.HasPrefix(ip, "10.0.0.") { + pingCmds += fmt.Sprintf(`echo "PING:%s:$(ping -c 1 -W 2 %s >/dev/null 2>&1 && echo ok || echo fail)" +`, ip, ip) + } + } + } + + cmd := fmt.Sprintf(` +SEP="===INSPECTOR_SEP===" +echo "$SEP" +ping -c 1 -W 2 8.8.8.8 >/dev/null 2>&1 && echo yes || echo no +echo "$SEP" +ss -s 2>/dev/null | awk '/^TCP:/{print $0}' +echo "$SEP" +ip route show default 2>/dev/null | head -1 +echo "$SEP" +ip route show 10.0.0.0/24 dev wg0 2>/dev/null | head -1 +echo "$SEP" +cat /proc/net/snmp 2>/dev/null | awk '/^Tcp:/{getline; print}' +echo "$SEP" +%s +`, pingCmds) + + res := RunSSH(ctx, node, cmd) + if !res.OK() && res.Stdout == "" { + return data + } + + parts := strings.Split(res.Stdout, "===INSPECTOR_SEP===") + + if len(parts) > 1 { + data.InternetReachable = strings.TrimSpace(parts[1]) == "yes" + } + + // Parse TCP stats: "TCP: 42 (estab 15, closed 3, orphaned 0, timewait 2/0), ports 0/0/0" + if len(parts) > 2 { + tcpLine := strings.TrimSpace(parts[2]) + if idx := strings.Index(tcpLine, "estab "); idx >= 0 { + rest := tcpLine[idx+6:] + if comma := strings.IndexByte(rest, ','); comma > 0 { + data.TCPEstablished = parseIntDefault(rest[:comma], 0) + } + } + if idx := strings.Index(tcpLine, "timewait "); idx >= 0 { + rest := tcpLine[idx+9:] + if slash := strings.IndexByte(rest, '/'); slash > 0 { + data.TCPTimeWait = parseIntDefault(rest[:slash], 0) + } else if comma := strings.IndexByte(rest, ')'); comma > 0 { + data.TCPTimeWait = parseIntDefault(rest[:comma], 0) + } + } + } + + if len(parts) > 3 { + data.DefaultRoute = strings.TrimSpace(parts[3]) != "" + } + if len(parts) > 4 { + data.WGRouteExists = strings.TrimSpace(parts[4]) != "" + } + + // Parse TCP retransmission rate from /proc/net/snmp + // Values line: "Tcp: ..." 
+ // Index: 0 1 2 3 4 5 6 7 8 9 10 11 12 + if len(parts) > 5 { + fields := strings.Fields(strings.TrimSpace(parts[5])) + if len(fields) >= 13 { + outSegs := parseIntDefault(fields[11], 0) + retransSegs := parseIntDefault(fields[12], 0) + if outSegs > 0 { + data.TCPRetransRate = float64(retransSegs) / float64(outSegs) * 100 + } + } + } + + // Parse ping results + if len(parts) > 6 { + for _, line := range strings.Split(strings.TrimSpace(parts[6]), "\n") { + line = strings.TrimSpace(line) + if strings.HasPrefix(line, "PING:") { + // Format: PING:: + pingParts := strings.SplitN(line, ":", 3) + if len(pingParts) == 3 { + data.PingResults[pingParts[1]] = pingParts[2] == "ok" + } + } + } + } + + return data +} + +func collectNamespaces(ctx context.Context, node Node) []NamespaceData { + // Detect namespace services: debros-namespace-gateway@.service + cmd := ` +SEP="===INSPECTOR_SEP===" +echo "$SEP" +systemctl list-units --type=service --all --no-pager --no-legend 'debros-namespace-gateway@*.service' 2>/dev/null | awk '{print $1}' | sed 's/debros-namespace-gateway@//;s/\.service//' +echo "$SEP" +` + res := RunSSH(ctx, node, cmd) + if !res.OK() && res.Stdout == "" { + return nil + } + + parts := strings.Split(res.Stdout, "===INSPECTOR_SEP===") + if len(parts) < 2 { + return nil + } + + var names []string + for _, line := range strings.Split(strings.TrimSpace(parts[1]), "\n") { + line = strings.TrimSpace(line) + if line != "" { + names = append(names, line) + } + } + + if len(names) == 0 { + return nil + } + + // For each namespace, check its services + // Namespace ports: base = 10000 + (index * 5) + // offset 0=RQLite HTTP, 1=RQLite Raft, 2=Olric HTTP, 3=Olric Memberlist, 4=Gateway HTTP + // We discover actual ports by querying each namespace's services + var nsCmd string + for _, name := range names { + nsCmd += fmt.Sprintf(` +echo "NS_START:%s" +# Get gateway port from systemd or default discovery +GWPORT=$(ss -tlnp 2>/dev/null | grep 'debros-namespace-gateway@%s' | 
grep -oP ':\K[0-9]+' | head -1) +echo "GW_PORT:${GWPORT:-0}" +# Try common namespace port ranges (10000-10099) +for BASE in $(seq 10000 5 10099); do + RQLITE_PORT=$((BASE)) + if curl -sf --connect-timeout 1 "http://localhost:${RQLITE_PORT}/status" >/dev/null 2>&1; then + STATUS=$(curl -sf --connect-timeout 1 "http://localhost:${RQLITE_PORT}/status" 2>/dev/null) + STATE=$(echo "$STATUS" | python3 -c "import sys,json; print(json.load(sys.stdin).get('store',{}).get('raft',{}).get('state',''))" 2>/dev/null || echo "") + READYZ=$(curl -sf --connect-timeout 1 "http://localhost:${RQLITE_PORT}/readyz" 2>/dev/null && echo "yes" || echo "no") + echo "RQLITE:${BASE}:up:${STATE}:${READYZ}" + break + fi +done +# Check Olric memberlist +OLRIC_PORT=$((BASE + 2)) +ss -tlnp 2>/dev/null | grep -q ":${OLRIC_PORT} " && echo "OLRIC:up" || echo "OLRIC:down" +# Check Gateway +GW_PORT2=$((BASE + 4)) +GW_STATUS=$(curl -sf -o /dev/null -w '%%{http_code}' --connect-timeout 1 "http://localhost:${GW_PORT2}/health" 2>/dev/null || echo "0") +echo "GATEWAY:${GW_STATUS}" +echo "NS_END" +`, name, name) + } + + nsRes := RunSSH(ctx, node, nsCmd) + if !nsRes.OK() && nsRes.Stdout == "" { + // Return namespace names at minimum + var result []NamespaceData + for _, name := range names { + result = append(result, NamespaceData{Name: name}) + } + return result + } + + // Parse namespace results + var result []NamespaceData + var current *NamespaceData + for _, line := range strings.Split(nsRes.Stdout, "\n") { + line = strings.TrimSpace(line) + if strings.HasPrefix(line, "NS_START:") { + name := strings.TrimPrefix(line, "NS_START:") + nd := NamespaceData{Name: name} + current = &nd + } else if line == "NS_END" && current != nil { + result = append(result, *current) + current = nil + } else if current != nil { + if strings.HasPrefix(line, "RQLITE:") { + // RQLITE::up:: + rParts := strings.SplitN(line, ":", 5) + if len(rParts) >= 5 { + current.PortBase = parseIntDefault(rParts[1], 0) + current.RQLiteUp = 
rParts[2] == "up" + current.RQLiteState = rParts[3] + current.RQLiteReady = rParts[4] == "yes" + } + } else if strings.HasPrefix(line, "OLRIC:") { + current.OlricUp = strings.TrimPrefix(line, "OLRIC:") == "up" + } else if strings.HasPrefix(line, "GATEWAY:") { + code := parseIntDefault(strings.TrimPrefix(line, "GATEWAY:"), 0) + current.GatewayStatus = code + current.GatewayUp = code >= 200 && code < 500 + } + } + } + + return result +} + +// Parse helper functions + +func parseIntDefault(s string, def int) int { + n, err := strconv.Atoi(s) + if err != nil { + return def + } + return n +} + +// JSON helper functions + +func jsonUint64(m map[string]interface{}, key string) uint64 { + v, ok := m[key] + if !ok { + return 0 + } + switch val := v.(type) { + case float64: + return uint64(val) + case string: + n, _ := strconv.ParseUint(val, 10, 64) + return n + case json.Number: + n, _ := val.Int64() + return uint64(n) + default: + return 0 + } +} + +func jsonBool(m map[string]interface{}, key string) bool { + v, ok := m[key] + if !ok { + return false + } + switch val := v.(type) { + case bool: + return val + case string: + return val == "true" + default: + return false + } +} diff --git a/pkg/inspector/config.go b/pkg/inspector/config.go new file mode 100644 index 0000000..524f19e --- /dev/null +++ b/pkg/inspector/config.go @@ -0,0 +1,118 @@ +package inspector + +import ( + "bufio" + "fmt" + "os" + "strings" +) + +// Node represents a remote node parsed from remote-nodes.conf. +type Node struct { + Environment string // devnet, testnet + User string // SSH user + Host string // IP or hostname + Password string // SSH password + Role string // node, nameserver-ns1, nameserver-ns2, nameserver-ns3 + SSHKey string // optional path to SSH key +} + +// Name returns a short display name for the node (user@host). +func (n Node) Name() string { + return fmt.Sprintf("%s@%s", n.User, n.Host) +} + +// IsNameserver returns true if the node has a nameserver role. 
+func (n Node) IsNameserver() bool { + return strings.HasPrefix(n.Role, "nameserver") +} + +// LoadNodes parses a remote-nodes.conf file into a slice of Nodes. +// Format: environment|user@host|password|role|ssh_key (ssh_key optional) +func LoadNodes(path string) ([]Node, error) { + f, err := os.Open(path) + if err != nil { + return nil, fmt.Errorf("open config: %w", err) + } + defer f.Close() + + var nodes []Node + scanner := bufio.NewScanner(f) + lineNum := 0 + for scanner.Scan() { + lineNum++ + line := strings.TrimSpace(scanner.Text()) + if line == "" || strings.HasPrefix(line, "#") { + continue + } + + parts := strings.SplitN(line, "|", 5) + if len(parts) < 4 { + return nil, fmt.Errorf("line %d: expected at least 4 pipe-delimited fields, got %d", lineNum, len(parts)) + } + + env := parts[0] + userHost := parts[1] + password := parts[2] + role := parts[3] + + var sshKey string + if len(parts) == 5 { + sshKey = parts[4] + } + + // Parse user@host + at := strings.LastIndex(userHost, "@") + if at < 0 { + return nil, fmt.Errorf("line %d: expected user@host format, got %q", lineNum, userHost) + } + user := userHost[:at] + host := userHost[at+1:] + + nodes = append(nodes, Node{ + Environment: env, + User: user, + Host: host, + Password: password, + Role: role, + SSHKey: sshKey, + }) + } + if err := scanner.Err(); err != nil { + return nil, fmt.Errorf("reading config: %w", err) + } + return nodes, nil +} + +// FilterByEnv returns only nodes matching the given environment. +func FilterByEnv(nodes []Node, env string) []Node { + var filtered []Node + for _, n := range nodes { + if n.Environment == env { + filtered = append(filtered, n) + } + } + return filtered +} + +// FilterByRole returns only nodes matching the given role prefix. 
+func FilterByRole(nodes []Node, rolePrefix string) []Node { + var filtered []Node + for _, n := range nodes { + if strings.HasPrefix(n.Role, rolePrefix) { + filtered = append(filtered, n) + } + } + return filtered +} + +// RegularNodes returns non-nameserver nodes. +func RegularNodes(nodes []Node) []Node { + var filtered []Node + for _, n := range nodes { + if n.Role == "node" { + filtered = append(filtered, n) + } + } + return filtered +} diff --git a/pkg/inspector/config_test.go b/pkg/inspector/config_test.go new file mode 100644 index 0000000..9d7d368 --- /dev/null +++ b/pkg/inspector/config_test.go @@ -0,0 +1,179 @@ +package inspector + +import ( + "os" + "path/filepath" + "testing" +) + +func TestLoadNodes(t *testing.T) { + content := `# Comment line +devnet|ubuntu@1.2.3.4|pass123|node +devnet|ubuntu@1.2.3.5|pass456|node +devnet|ubuntu@5.6.7.8|pass789|nameserver-ns1|/path/to/key +` + path := writeTempFile(t, content) + + nodes, err := LoadNodes(path) + if err != nil { + t.Fatalf("LoadNodes: %v", err) + } + if len(nodes) != 3 { + t.Fatalf("want 3 nodes, got %d", len(nodes)) + } + + // First node + n := nodes[0] + if n.Environment != "devnet" { + t.Errorf("node[0].Environment = %q, want devnet", n.Environment) + } + if n.User != "ubuntu" { + t.Errorf("node[0].User = %q, want ubuntu", n.User) + } + if n.Host != "1.2.3.4" { + t.Errorf("node[0].Host = %q, want 1.2.3.4", n.Host) + } + if n.Password != "pass123" { + t.Errorf("node[0].Password = %q, want pass123", n.Password) + } + if n.Role != "node" { + t.Errorf("node[0].Role = %q, want node", n.Role) + } + if n.SSHKey != "" { + t.Errorf("node[0].SSHKey = %q, want empty", n.SSHKey) + } + + // Third node with SSH key + n3 := nodes[2] + if n3.Role != "nameserver-ns1" { + t.Errorf("node[2].Role = %q, want nameserver-ns1", n3.Role) + } + if n3.SSHKey != "/path/to/key" { + t.Errorf("node[2].SSHKey = %q, want /path/to/key", n3.SSHKey) + } +} + +func TestLoadNodes_EmptyLines(t *testing.T) { + content := ` +# Full line 
comment + +devnet|ubuntu@1.2.3.4|pass|node + +# Another comment +devnet|ubuntu@1.2.3.5|pass|node +` + path := writeTempFile(t, content) + + nodes, err := LoadNodes(path) + if err != nil { + t.Fatalf("LoadNodes: %v", err) + } + if len(nodes) != 2 { + t.Fatalf("want 2 nodes (blank/comment lines skipped), got %d", len(nodes)) + } +} + +func TestLoadNodes_InvalidFormat(t *testing.T) { + tests := []struct { + name string + content string + }{ + {"too few fields", "devnet|ubuntu@1.2.3.4|pass\n"}, + {"no @ in userhost", "devnet|localhost|pass|node\n"}, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + path := writeTempFile(t, tt.content) + _, err := LoadNodes(path) + if err == nil { + t.Error("expected error for invalid format") + } + }) + } +} + +func TestLoadNodes_FileNotFound(t *testing.T) { + _, err := LoadNodes("/nonexistent/path/file.conf") + if err == nil { + t.Error("expected error for nonexistent file") + } +} + +func TestFilterByEnv(t *testing.T) { + nodes := []Node{ + {Environment: "devnet", Host: "1.1.1.1"}, + {Environment: "testnet", Host: "2.2.2.2"}, + {Environment: "devnet", Host: "3.3.3.3"}, + } + filtered := FilterByEnv(nodes, "devnet") + if len(filtered) != 2 { + t.Fatalf("want 2 devnet nodes, got %d", len(filtered)) + } + for _, n := range filtered { + if n.Environment != "devnet" { + t.Errorf("got env=%s, want devnet", n.Environment) + } + } +} + +func TestFilterByRole(t *testing.T) { + nodes := []Node{ + {Role: "node", Host: "1.1.1.1"}, + {Role: "nameserver-ns1", Host: "2.2.2.2"}, + {Role: "nameserver-ns2", Host: "3.3.3.3"}, + {Role: "node", Host: "4.4.4.4"}, + } + filtered := FilterByRole(nodes, "nameserver") + if len(filtered) != 2 { + t.Fatalf("want 2 nameserver nodes, got %d", len(filtered)) + } +} + +func TestRegularNodes(t *testing.T) { + nodes := []Node{ + {Role: "node", Host: "1.1.1.1"}, + {Role: "nameserver-ns1", Host: "2.2.2.2"}, + {Role: "node", Host: "3.3.3.3"}, + } + regular := RegularNodes(nodes) + if len(regular) 
!= 2 { + t.Fatalf("want 2 regular nodes, got %d", len(regular)) + } +} + +func TestNode_Name(t *testing.T) { + n := Node{User: "ubuntu", Host: "1.2.3.4"} + if got := n.Name(); got != "ubuntu@1.2.3.4" { + t.Errorf("Name() = %q, want ubuntu@1.2.3.4", got) + } +} + +func TestNode_IsNameserver(t *testing.T) { + tests := []struct { + role string + want bool + }{ + {"nameserver-ns1", true}, + {"nameserver-ns2", true}, + {"node", false}, + {"", false}, + } + for _, tt := range tests { + t.Run(tt.role, func(t *testing.T) { + n := Node{Role: tt.role} + if got := n.IsNameserver(); got != tt.want { + t.Errorf("IsNameserver(%q) = %v, want %v", tt.role, got, tt.want) + } + }) + } +} + +func writeTempFile(t *testing.T, content string) string { + t.Helper() + dir := t.TempDir() + path := filepath.Join(dir, "test-nodes.conf") + if err := os.WriteFile(path, []byte(content), 0644); err != nil { + t.Fatalf("write temp file: %v", err) + } + return path +} diff --git a/pkg/inspector/report.go b/pkg/inspector/report.go new file mode 100644 index 0000000..f69725e --- /dev/null +++ b/pkg/inspector/report.go @@ -0,0 +1,136 @@ +package inspector + +import ( + "encoding/json" + "fmt" + "io" + "sort" + "strings" +) + +// PrintTable writes a human-readable table of check results. +func PrintTable(results *Results, w io.Writer) { + if len(results.Checks) == 0 { + fmt.Fprintf(w, "No checks executed.\n") + return + } + + // Sort: failures first, then warnings, then passes, then skips. + // Within each group, sort by severity (critical first). 
+ sorted := make([]CheckResult, len(results.Checks)) + copy(sorted, results.Checks) + sort.Slice(sorted, func(i, j int) bool { + oi, oj := statusOrder(sorted[i].Status), statusOrder(sorted[j].Status) + if oi != oj { + return oi < oj + } + // Higher severity first + if sorted[i].Severity != sorted[j].Severity { + return sorted[i].Severity > sorted[j].Severity + } + return sorted[i].ID < sorted[j].ID + }) + + // Group by subsystem + groups := map[string][]CheckResult{} + var subsystems []string + for _, c := range sorted { + if _, exists := groups[c.Subsystem]; !exists { + subsystems = append(subsystems, c.Subsystem) + } + groups[c.Subsystem] = append(groups[c.Subsystem], c) + } + + for _, sub := range subsystems { + checks := groups[sub] + fmt.Fprintf(w, "\n%s %s\n", severityIcon(Critical), strings.ToUpper(sub)) + fmt.Fprintf(w, "%s\n", strings.Repeat("-", 70)) + + for _, c := range checks { + icon := statusIcon(c.Status) + sev := fmt.Sprintf("[%s]", c.Severity) + nodePart := "" + if c.Node != "" { + nodePart = fmt.Sprintf(" (%s)", c.Node) + } + fmt.Fprintf(w, " %s %-8s %s%s\n", icon, sev, c.Name, nodePart) + if c.Message != "" { + fmt.Fprintf(w, " %s\n", c.Message) + } + } + } + + passed, failed, warned, skipped := results.Summary() + fmt.Fprintf(w, "\n%s\n", strings.Repeat("=", 70)) + fmt.Fprintf(w, "Summary: %d passed, %d failed, %d warnings, %d skipped (%.1fs)\n", + passed, failed, warned, skipped, results.Duration.Seconds()) +} + +// PrintJSON writes check results as JSON. 
+func PrintJSON(results *Results, w io.Writer) { + passed, failed, warned, skipped := results.Summary() + output := struct { + Summary struct { + Passed int `json:"passed"` + Failed int `json:"failed"` + Warned int `json:"warned"` + Skipped int `json:"skipped"` + Total int `json:"total"` + Seconds float64 `json:"duration_seconds"` + } `json:"summary"` + Checks []CheckResult `json:"checks"` + }{ + Checks: results.Checks, + } + output.Summary.Passed = passed + output.Summary.Failed = failed + output.Summary.Warned = warned + output.Summary.Skipped = skipped + output.Summary.Total = len(results.Checks) + output.Summary.Seconds = results.Duration.Seconds() + + enc := json.NewEncoder(w) + enc.SetIndent("", " ") + enc.Encode(output) +} + +// SummaryLine returns a one-line summary string. +func SummaryLine(results *Results) string { + passed, failed, warned, skipped := results.Summary() + return fmt.Sprintf("%d passed, %d failed, %d warnings, %d skipped", + passed, failed, warned, skipped) +} + +func statusOrder(s Status) int { + switch s { + case StatusFail: + return 0 + case StatusWarn: + return 1 + case StatusPass: + return 2 + case StatusSkip: + return 3 + default: + return 4 + } +} + +func statusIcon(s Status) string { + switch s { + case StatusPass: + return "OK" + case StatusFail: + return "FAIL" + case StatusWarn: + return "WARN" + case StatusSkip: + return "SKIP" + default: + return "??" 
+ } +} + +func severityIcon(_ Severity) string { + return "##" +} diff --git a/pkg/inspector/report_test.go b/pkg/inspector/report_test.go new file mode 100644 index 0000000..da74f44 --- /dev/null +++ b/pkg/inspector/report_test.go @@ -0,0 +1,135 @@ +package inspector + +import ( + "bytes" + "encoding/json" + "strings" + "testing" + "time" +) + +func TestPrintTable_EmptyResults(t *testing.T) { + r := &Results{} + var buf bytes.Buffer + PrintTable(r, &buf) + if !strings.Contains(buf.String(), "No checks executed") { + t.Errorf("expected 'No checks executed', got %q", buf.String()) + } +} + +func TestPrintTable_SortsFailuresFirst(t *testing.T) { + r := &Results{ + Duration: time.Second, + Checks: []CheckResult{ + {ID: "a", Name: "Pass check", Subsystem: "test", Status: StatusPass, Severity: Low}, + {ID: "b", Name: "Fail check", Subsystem: "test", Status: StatusFail, Severity: Critical}, + {ID: "c", Name: "Warn check", Subsystem: "test", Status: StatusWarn, Severity: High}, + }, + } + var buf bytes.Buffer + PrintTable(r, &buf) + output := buf.String() + + // FAIL should appear before WARN, which should appear before OK + failIdx := strings.Index(output, "FAIL") + warnIdx := strings.Index(output, "WARN") + okIdx := strings.Index(output, "OK") + + if failIdx < 0 || warnIdx < 0 || okIdx < 0 { + t.Fatalf("expected FAIL, WARN, and OK in output:\n%s", output) + } + if failIdx > warnIdx { + t.Errorf("FAIL (pos %d) should appear before WARN (pos %d)", failIdx, warnIdx) + } + if warnIdx > okIdx { + t.Errorf("WARN (pos %d) should appear before OK (pos %d)", warnIdx, okIdx) + } +} + +func TestPrintTable_IncludesNode(t *testing.T) { + r := &Results{ + Duration: time.Second, + Checks: []CheckResult{ + {ID: "a", Name: "Check A", Subsystem: "test", Status: StatusPass, Node: "ubuntu@1.2.3.4"}, + }, + } + var buf bytes.Buffer + PrintTable(r, &buf) + if !strings.Contains(buf.String(), "ubuntu@1.2.3.4") { + t.Error("expected node name in table output") + } +} + +func 
TestPrintTable_IncludesSummary(t *testing.T) { + r := &Results{ + Duration: 2 * time.Second, + Checks: []CheckResult{ + {ID: "a", Subsystem: "test", Status: StatusPass}, + {ID: "b", Subsystem: "test", Status: StatusFail}, + }, + } + var buf bytes.Buffer + PrintTable(r, &buf) + output := buf.String() + if !strings.Contains(output, "1 passed") { + t.Error("summary should mention passed count") + } + if !strings.Contains(output, "1 failed") { + t.Error("summary should mention failed count") + } +} + +func TestPrintJSON_ValidJSON(t *testing.T) { + r := &Results{ + Duration: time.Second, + Checks: []CheckResult{ + {ID: "a", Name: "A", Subsystem: "test", Status: StatusPass, Severity: Low, Message: "ok"}, + {ID: "b", Name: "B", Subsystem: "test", Status: StatusFail, Severity: High, Message: "bad"}, + }, + } + var buf bytes.Buffer + PrintJSON(r, &buf) + + var parsed map[string]interface{} + if err := json.Unmarshal(buf.Bytes(), &parsed); err != nil { + t.Fatalf("output is not valid JSON: %v\nraw: %s", err, buf.String()) + } + + summary, ok := parsed["summary"].(map[string]interface{}) + if !ok { + t.Fatal("missing 'summary' object in JSON") + } + if v := summary["passed"]; v != float64(1) { + t.Errorf("summary.passed = %v, want 1", v) + } + if v := summary["failed"]; v != float64(1) { + t.Errorf("summary.failed = %v, want 1", v) + } + if v := summary["total"]; v != float64(2) { + t.Errorf("summary.total = %v, want 2", v) + } + + checks, ok := parsed["checks"].([]interface{}) + if !ok { + t.Fatal("missing 'checks' array in JSON") + } + if len(checks) != 2 { + t.Errorf("want 2 checks, got %d", len(checks)) + } +} + +func TestSummaryLine(t *testing.T) { + r := &Results{ + Checks: []CheckResult{ + {Status: StatusPass}, + {Status: StatusPass}, + {Status: StatusFail}, + {Status: StatusWarn}, + }, + } + got := SummaryLine(r) + want := "2 passed, 1 failed, 1 warnings, 0 skipped" + if got != want { + t.Errorf("SummaryLine = %q, want %q", got, want) + } +} diff --git 
a/pkg/inspector/ssh.go b/pkg/inspector/ssh.go new file mode 100644 index 0000000..e16ad74 --- /dev/null +++ b/pkg/inspector/ssh.go @@ -0,0 +1,165 @@ +package inspector + +import ( + "bytes" + "context" + "fmt" + "os/exec" + "strings" + "syscall" + "time" +) + +const ( + sshMaxRetries = 3 + sshRetryDelay = 2 * time.Second +) + +// SSHResult holds the output of an SSH command execution. +type SSHResult struct { + Stdout string + Stderr string + ExitCode int + Duration time.Duration + Err error + Retries int // how many retries were needed +} + +// OK returns true if the command succeeded (exit code 0, no error). +func (r SSHResult) OK() bool { + return r.Err == nil && r.ExitCode == 0 +} + +// RunSSH executes a command on a remote node via SSH with retry on connection failure. +// Uses sshpass for password auth, falls back to -i for key-based auth. +// The -n flag is used to prevent SSH from reading stdin. +func RunSSH(ctx context.Context, node Node, command string) SSHResult { + var result SSHResult + for attempt := 0; attempt <= sshMaxRetries; attempt++ { + result = runSSHOnce(ctx, node, command) + result.Retries = attempt + + // Success — return immediately + if result.OK() { + return result + } + + // If the command ran but returned non-zero exit, that's the remote command + // failing (not a connection issue) — don't retry + if result.Err == nil && result.ExitCode != 0 { + return result + } + + // Check if it's a connection-level failure worth retrying + if !isSSHConnectionError(result) { + return result + } + + // Don't retry if context is done + if ctx.Err() != nil { + return result + } + + // Wait before retry (except on last attempt) + if attempt < sshMaxRetries { + select { + case <-time.After(sshRetryDelay): + case <-ctx.Done(): + return result + } + } + } + return result +} + +// runSSHOnce executes a single SSH attempt. 
+func runSSHOnce(ctx context.Context, node Node, command string) SSHResult { + start := time.Now() + + var args []string + if node.SSHKey != "" { + // Key-based auth + args = []string{ + "ssh", "-n", + "-o", "StrictHostKeyChecking=no", + "-o", "ConnectTimeout=10", + "-o", "BatchMode=yes", + "-i", node.SSHKey, + fmt.Sprintf("%s@%s", node.User, node.Host), + command, + } + } else { + // Password auth via sshpass + args = []string{ + "sshpass", "-p", node.Password, + "ssh", "-n", + "-o", "StrictHostKeyChecking=no", + "-o", "ConnectTimeout=10", + fmt.Sprintf("%s@%s", node.User, node.Host), + command, + } + } + + cmd := exec.CommandContext(ctx, args[0], args[1:]...) + + var stdout, stderr bytes.Buffer + cmd.Stdout = &stdout + cmd.Stderr = &stderr + + err := cmd.Run() + duration := time.Since(start) + + exitCode := 0 + if err != nil { + if exitErr, ok := err.(*exec.ExitError); ok { + if status, ok := exitErr.Sys().(syscall.WaitStatus); ok { + exitCode = status.ExitStatus() + } + } + } + + return SSHResult{ + Stdout: strings.TrimSpace(stdout.String()), + Stderr: strings.TrimSpace(stderr.String()), + ExitCode: exitCode, + Duration: duration, + Err: err, + } +} + +// isSSHConnectionError returns true if the failure looks like an SSH connection +// problem (timeout, refused, network unreachable) rather than a remote command error. 
+func isSSHConnectionError(r SSHResult) bool { + // sshpass exit code 5 = invalid/incorrect password (not retriable) + // sshpass exit code 6 = host key verification failed (not retriable) + // SSH exit code 255 = SSH connection error (retriable) + if r.ExitCode == 255 { + return true + } + + stderr := strings.ToLower(r.Stderr) + connectionErrors := []string{ + "connection refused", + "connection timed out", + "connection reset", + "no route to host", + "network is unreachable", + "could not resolve hostname", + "ssh_exchange_identification", + "broken pipe", + "connection closed by remote host", + } + for _, pattern := range connectionErrors { + if strings.Contains(stderr, pattern) { + return true + } + } + return false +} + +// RunSSHMulti executes a multi-command string on a remote node. +// Commands are joined with " && " so failure stops execution. +func RunSSHMulti(ctx context.Context, node Node, commands []string) SSHResult { + combined := strings.Join(commands, " && ") + return RunSSH(ctx, node, combined) +} diff --git a/pkg/ipfs/cluster_peer.go b/pkg/ipfs/cluster_peer.go index cdc0a7f..9f28ac1 100644 --- a/pkg/ipfs/cluster_peer.go +++ b/pkg/ipfs/cluster_peer.go @@ -61,7 +61,9 @@ func (cm *ClusterConfigManager) UpdateAllClusterPeers() error { func (cm *ClusterConfigManager) RepairPeerConfiguration() error { cm.logger.Info("Attempting to repair IPFS Cluster peer configuration") - _ = cm.FixIPFSConfigAddresses() + if err := cm.FixIPFSConfigAddresses(); err != nil { + cm.logger.Warn("Failed to fix IPFS config addresses during repair", zap.Error(err)) + } peers, err := cm.DiscoverClusterPeersFromGateway() if err != nil { @@ -72,7 +74,9 @@ func (cm *ClusterConfigManager) RepairPeerConfiguration() error { peerAddrs = append(peerAddrs, p.Multiaddress) } if len(peerAddrs) > 0 { - _ = cm.UpdatePeerAddresses(peerAddrs) + if err := cm.UpdatePeerAddresses(peerAddrs); err != nil { + cm.logger.Warn("Failed to update peer addresses during repair", zap.Error(err)) + } } } 
diff --git a/pkg/ipfs/cluster_util.go b/pkg/ipfs/cluster_util.go index 2f976da..4fd5777 100644 --- a/pkg/ipfs/cluster_util.go +++ b/pkg/ipfs/cluster_util.go @@ -77,19 +77,6 @@ func parseIPFSPort(rawURL string) (int, error) { return port, nil } -func parsePeerHostAndPort(multiaddr string) (string, int) { - parts := strings.Split(multiaddr, "/") - var hostStr string - var port int - for i, part := range parts { - if part == "ip4" || part == "dns" || part == "dns4" { - hostStr = parts[i+1] - } else if part == "tcp" { - fmt.Sscanf(parts[i+1], "%d", &port) - } - } - return hostStr, port -} func extractIPFromMultiaddrForCluster(maddr string) string { parts := strings.Split(maddr, "/") diff --git a/pkg/namespace/cluster_manager.go b/pkg/namespace/cluster_manager.go index 82914f5..e7edbe5 100644 --- a/pkg/namespace/cluster_manager.go +++ b/pkg/namespace/cluster_manager.go @@ -893,21 +893,35 @@ func (cm *ClusterManager) GetClusterStatus(ctx context.Context, clusterID string ClusterID: cluster.ID, } - // Check individual service status - // TODO: Actually check each service's health - if cluster.Status == ClusterStatusReady { - status.RQLiteReady = true - status.OlricReady = true - status.GatewayReady = true - status.DNSReady = true - } - - // Get node list + // Check individual service status by inspecting cluster nodes nodes, err := cm.getClusterNodes(ctx, clusterID) if err == nil { + runningCount := 0 + hasRQLite := false + hasOlric := false + hasGateway := false + for _, node := range nodes { status.Nodes = append(status.Nodes, node.NodeID) + if node.Status == NodeStatusRunning { + runningCount++ + } + if node.RQLiteHTTPPort > 0 { + hasRQLite = true + } + if node.OlricHTTPPort > 0 { + hasOlric = true + } + if node.GatewayHTTPPort > 0 { + hasGateway = true + } } + + allRunning := len(nodes) > 0 && runningCount == len(nodes) + status.RQLiteReady = allRunning && hasRQLite + status.OlricReady = allRunning && hasOlric + status.GatewayReady = allRunning && hasGateway + 
status.DNSReady = allRunning } if cluster.ErrorMessage != "" { diff --git a/pkg/namespace/node_selector.go b/pkg/namespace/node_selector.go index 013adff..929e645 100644 --- a/pkg/namespace/node_selector.go +++ b/pkg/namespace/node_selector.go @@ -6,6 +6,7 @@ import ( "time" "github.com/DeBrosOfficial/network/pkg/client" + "github.com/DeBrosOfficial/network/pkg/constants" "github.com/DeBrosOfficial/network/pkg/rqlite" "go.uber.org/zap" ) @@ -176,12 +177,10 @@ func (cns *ClusterNodeSelector) getNodeCapacity(ctx context.Context, nodeID, ipA } // Calculate available capacity - const ( - maxDeployments = 100 - maxPorts = 9900 // User deployment port range - maxMemoryMB = 8192 // 8GB - maxCPUPercent = 400 // 4 cores - ) + maxDeployments := constants.MaxDeploymentsPerNode + maxPorts := constants.MaxPortsPerNode + maxMemoryMB := constants.MaxMemoryMB + maxCPUPercent := constants.MaxCPUPercent availablePorts := maxPorts - allocatedPorts if availablePorts < 0 { @@ -363,23 +362,3 @@ func (cns *ClusterNodeSelector) calculateCapacityScore( return totalScore } -// GetNodeByID retrieves a node's information by ID -func (cns *ClusterNodeSelector) GetNodeByID(ctx context.Context, nodeID string) (*nodeInfo, error) { - internalCtx := client.WithInternalAuth(ctx) - - var results []nodeInfo - query := `SELECT id, ip_address, COALESCE(internal_ip, ip_address) as internal_ip FROM dns_nodes WHERE id = ? 
LIMIT 1` - err := cns.db.Query(internalCtx, &results, query, nodeID) - if err != nil { - return nil, &ClusterError{ - Message: "failed to query node", - Cause: err, - } - } - - if len(results) == 0 { - return nil, nil - } - - return &results[0], nil -} diff --git a/pkg/namespace/port_allocator.go b/pkg/namespace/port_allocator.go index d237d26..d58ef01 100644 --- a/pkg/namespace/port_allocator.go +++ b/pkg/namespace/port_allocator.go @@ -3,6 +3,7 @@ package namespace import ( "context" "fmt" + "strings" "time" "github.com/DeBrosOfficial/network/pkg/client" @@ -369,19 +370,5 @@ func isConflictError(err error) bool { return false } errStr := err.Error() - return contains(errStr, "UNIQUE") || contains(errStr, "constraint") || contains(errStr, "conflict") -} - -// contains checks if a string contains a substring (case-insensitive) -func contains(s, substr string) bool { - return len(s) >= len(substr) && (s == substr || len(s) > len(substr) && findSubstring(s, substr)) -} - -func findSubstring(s, substr string) bool { - for i := 0; i <= len(s)-len(substr); i++ { - if s[i:i+len(substr)] == substr { - return true - } - } - return false + return strings.Contains(errStr, "UNIQUE") || strings.Contains(errStr, "constraint") || strings.Contains(errStr, "conflict") } diff --git a/pkg/namespace/port_allocator_test.go b/pkg/namespace/port_allocator_test.go index 1a44148..1da7a7e 100644 --- a/pkg/namespace/port_allocator_test.go +++ b/pkg/namespace/port_allocator_test.go @@ -4,6 +4,7 @@ import ( "context" "database/sql" "errors" + "strings" "testing" "time" @@ -269,7 +270,7 @@ func TestContains(t *testing.T) { for _, tt := range tests { t.Run(tt.s+"_"+tt.substr, func(t *testing.T) { - result := contains(tt.s, tt.substr) + result := strings.Contains(tt.s, tt.substr) if result != tt.expected { t.Errorf("contains(%q, %q) = %v, want %v", tt.s, tt.substr, result, tt.expected) } diff --git a/pkg/namespace/wireguard.go b/pkg/namespace/wireguard.go index 38add16..3c71753 100644 --- 
a/pkg/namespace/wireguard.go +++ b/pkg/namespace/wireguard.go @@ -1,25 +1,9 @@ package namespace -import ( - "fmt" - "net" -) +import "github.com/DeBrosOfficial/network/pkg/wireguard" // getWireGuardIP returns the IPv4 address of the wg0 interface. // Used as a fallback when Olric BindAddr is empty or 0.0.0.0. func getWireGuardIP() (string, error) { - iface, err := net.InterfaceByName("wg0") - if err != nil { - return "", fmt.Errorf("wg0 interface not found: %w", err) - } - addrs, err := iface.Addrs() - if err != nil { - return "", fmt.Errorf("failed to get wg0 addresses: %w", err) - } - for _, addr := range addrs { - if ipnet, ok := addr.(*net.IPNet); ok && ipnet.IP.To4() != nil { - return ipnet.IP.String(), nil - } - } - return "", fmt.Errorf("no IPv4 address on wg0") + return wireguard.GetIP() } diff --git a/pkg/node/dns_registration.go b/pkg/node/dns_registration.go index ceba888..d4cb76c 100644 --- a/pkg/node/dns_registration.go +++ b/pkg/node/dns_registration.go @@ -11,6 +11,7 @@ import ( "time" "github.com/DeBrosOfficial/network/pkg/logging" + "github.com/DeBrosOfficial/network/pkg/wireguard" "go.uber.org/zap" ) @@ -414,20 +415,7 @@ func (n *Node) isNameserverNode(ctx context.Context) bool { // getWireGuardIP returns the IPv4 address assigned to the wg0 interface, if any func (n *Node) getWireGuardIP() (string, error) { - iface, err := net.InterfaceByName("wg0") - if err != nil { - return "", err - } - addrs, err := iface.Addrs() - if err != nil { - return "", err - } - for _, addr := range addrs { - if ipnet, ok := addr.(*net.IPNet); ok && ipnet.IP.To4() != nil { - return ipnet.IP.String(), nil - } - } - return "", fmt.Errorf("no IPv4 address on wg0") + return wireguard.GetIP() } // getNodeIPAddress attempts to determine the node's external IP address diff --git a/pkg/rqlite/cluster.go b/pkg/rqlite/cluster.go index 61228fc..ab1758d 100644 --- a/pkg/rqlite/cluster.go +++ b/pkg/rqlite/cluster.go @@ -47,7 +47,9 @@ func (r *RQLiteManager) 
waitForMinClusterSizeBeforeStart(ctx context.Context, rq return nil } - _ = r.discoveryService.TriggerPeerExchange(ctx) + if err := r.discoveryService.TriggerPeerExchange(ctx); err != nil { + r.logger.Warn("Failed to trigger peer exchange before cluster wait", zap.Error(err)) + } checkInterval := 2 * time.Second for { @@ -92,7 +94,9 @@ func (r *RQLiteManager) performPreStartClusterDiscovery(ctx context.Context, rql return fmt.Errorf("discovery service not available") } - _ = r.discoveryService.TriggerPeerExchange(ctx) + if err := r.discoveryService.TriggerPeerExchange(ctx); err != nil { + r.logger.Warn("Failed to trigger peer exchange during pre-start discovery", zap.Error(err)) + } time.Sleep(1 * time.Second) r.discoveryService.TriggerSync() time.Sleep(2 * time.Second) @@ -123,7 +127,9 @@ func (r *RQLiteManager) performPreStartClusterDiscovery(ctx context.Context, rql zap.Int("discovered_peers", discoveredPeers), zap.Int("min_cluster_size", r.config.MinClusterSize)) // Still write peers.json with just ourselves - better than nothing - _ = r.discoveryService.ForceWritePeersJSON() + if err := r.discoveryService.ForceWritePeersJSON(); err != nil { + r.logger.Warn("Failed to write single-node peers.json fallback", zap.Error(err)) + } return nil } @@ -137,8 +143,12 @@ func (r *RQLiteManager) performPreStartClusterDiscovery(ctx context.Context, rql } if ourLogIndex == 0 && maxPeerIndex > 0 { - _ = r.clearRaftState(rqliteDataDir) - _ = r.discoveryService.ForceWritePeersJSON() + if err := r.clearRaftState(rqliteDataDir); err != nil { + r.logger.Warn("Failed to clear raft state during pre-start discovery", zap.Error(err)) + } + if err := r.discoveryService.ForceWritePeersJSON(); err != nil { + r.logger.Warn("Failed to write peers.json after clearing raft state", zap.Error(err)) + } } } @@ -150,7 +160,9 @@ func (r *RQLiteManager) performPreStartClusterDiscovery(ctx context.Context, rql // recoverCluster restarts RQLite using peers.json func (r *RQLiteManager) 
recoverCluster(ctx context.Context, peersJSONPath string) error { - _ = r.Stop() + if err := r.Stop(); err != nil { + r.logger.Warn("Failed to stop RQLite during cluster recovery", zap.Error(err)) + } time.Sleep(2 * time.Second) rqliteDataDir, err := r.rqliteDataDirPath() @@ -187,10 +199,14 @@ func (r *RQLiteManager) recoverFromSplitBrain(ctx context.Context) error { } if ourIndex == 0 && maxPeerIndex > 0 { - _ = r.clearRaftState(rqliteDataDir) + if err := r.clearRaftState(rqliteDataDir); err != nil { + r.logger.Warn("Failed to clear raft state during split-brain recovery", zap.Error(err)) + } r.discoveryService.TriggerPeerExchange(ctx) time.Sleep(1 * time.Second) - _ = r.discoveryService.ForceWritePeersJSON() + if err := r.discoveryService.ForceWritePeersJSON(); err != nil { + r.logger.Warn("Failed to write peers.json during split-brain recovery", zap.Error(err)) + } return r.recoverCluster(ctx, filepath.Join(rqliteDataDir, "raft", "peers.json")) } @@ -265,7 +281,9 @@ func (r *RQLiteManager) startHealthMonitoring(ctx context.Context) { return case <-ticker.C: if r.isInSplitBrainState() { - _ = r.recoverFromSplitBrain(ctx) + if err := r.recoverFromSplitBrain(ctx); err != nil { + r.logger.Warn("Split-brain recovery attempt failed", zap.Error(err)) + } } } } diff --git a/pkg/serverless/cache/module_cache.go b/pkg/serverless/cache/module_cache.go index 2144606..a9e0a62 100644 --- a/pkg/serverless/cache/module_cache.go +++ b/pkg/serverless/cache/module_cache.go @@ -3,14 +3,21 @@ package cache import ( "context" "sync" + "time" "github.com/tetratelabs/wazero" "go.uber.org/zap" ) +// cacheEntry wraps a compiled module with access tracking for LRU eviction. +type cacheEntry struct { + module wazero.CompiledModule + lastAccessed time.Time +} + // ModuleCache manages compiled WASM module caching. 
type ModuleCache struct { - modules map[string]wazero.CompiledModule + modules map[string]*cacheEntry mu sync.RWMutex capacity int logger *zap.Logger @@ -19,7 +26,7 @@ type ModuleCache struct { // NewModuleCache creates a new ModuleCache. func NewModuleCache(capacity int, logger *zap.Logger) *ModuleCache { return &ModuleCache{ - modules: make(map[string]wazero.CompiledModule), + modules: make(map[string]*cacheEntry), capacity: capacity, logger: logger, } @@ -27,15 +34,20 @@ func NewModuleCache(capacity int, logger *zap.Logger) *ModuleCache { // Get retrieves a compiled module from the cache. func (c *ModuleCache) Get(wasmCID string) (wazero.CompiledModule, bool) { - c.mu.RLock() - defer c.mu.RUnlock() + c.mu.Lock() + defer c.mu.Unlock() - module, exists := c.modules[wasmCID] - return module, exists + entry, exists := c.modules[wasmCID] + if !exists { + return nil, false + } + + entry.lastAccessed = time.Now() + return entry.module, true } // Set stores a compiled module in the cache. -// If the cache is full, it evicts the oldest module. +// If the cache is full, it evicts the least recently used module. 
func (c *ModuleCache) Set(wasmCID string, module wazero.CompiledModule) { c.mu.Lock() defer c.mu.Unlock() @@ -50,7 +62,10 @@ func (c *ModuleCache) Set(wasmCID string, module wazero.CompiledModule) { c.evictOldest() } - c.modules[wasmCID] = module + c.modules[wasmCID] = &cacheEntry{ + module: module, + lastAccessed: time.Now(), + } c.logger.Debug("Module cached", zap.String("wasm_cid", wasmCID), @@ -63,8 +78,8 @@ func (c *ModuleCache) Delete(ctx context.Context, wasmCID string) { c.mu.Lock() defer c.mu.Unlock() - if module, exists := c.modules[wasmCID]; exists { - _ = module.Close(ctx) + if entry, exists := c.modules[wasmCID]; exists { + _ = entry.module.Close(ctx) delete(c.modules, wasmCID) c.logger.Debug("Module removed from cache", zap.String("wasm_cid", wasmCID)) } @@ -97,8 +112,8 @@ func (c *ModuleCache) Clear(ctx context.Context) { c.mu.Lock() defer c.mu.Unlock() - for cid, module := range c.modules { - if err := module.Close(ctx); err != nil { + for cid, entry := range c.modules { + if err := entry.module.Close(ctx); err != nil { c.logger.Warn("Failed to close cached module during clear", zap.String("cid", cid), zap.Error(err), @@ -106,7 +121,7 @@ func (c *ModuleCache) Clear(ctx context.Context) { } } - c.modules = make(map[string]wazero.CompiledModule) + c.modules = make(map[string]*cacheEntry) c.logger.Debug("Module cache cleared") } @@ -118,16 +133,23 @@ func (c *ModuleCache) GetStats() (size int, capacity int) { return len(c.modules), c.capacity } -// evictOldest removes the oldest module from cache. +// evictOldest removes the least recently accessed module from cache. // Must be called with mu held. 
func (c *ModuleCache) evictOldest() { - // Simple LRU: just remove the first one we find - // In production, you'd want proper LRU tracking - for cid, module := range c.modules { - _ = module.Close(context.Background()) - delete(c.modules, cid) - c.logger.Debug("Evicted module from cache", zap.String("wasm_cid", cid)) - break + var oldestCID string + var oldestTime time.Time + + for cid, entry := range c.modules { + if oldestCID == "" || entry.lastAccessed.Before(oldestTime) { + oldestCID = cid + oldestTime = entry.lastAccessed + } + } + + if oldestCID != "" { + _ = c.modules[oldestCID].module.Close(context.Background()) + delete(c.modules, oldestCID) + c.logger.Debug("Evicted LRU module from cache", zap.String("wasm_cid", oldestCID)) } } @@ -135,12 +157,13 @@ func (c *ModuleCache) evictOldest() { // The compute function is called with the lock released to avoid blocking. func (c *ModuleCache) GetOrCompute(wasmCID string, compute func() (wazero.CompiledModule, error)) (wazero.CompiledModule, error) { // Try to get from cache first - c.mu.RLock() - if module, exists := c.modules[wasmCID]; exists { - c.mu.RUnlock() - return module, nil + c.mu.Lock() + if entry, exists := c.modules[wasmCID]; exists { + entry.lastAccessed = time.Now() + c.mu.Unlock() + return entry.module, nil } - c.mu.RUnlock() + c.mu.Unlock() // Compute the module (without holding the lock) module, err := compute() @@ -153,9 +176,10 @@ func (c *ModuleCache) GetOrCompute(wasmCID string, compute func() (wazero.Compil defer c.mu.Unlock() // Double-check (another goroutine might have added it) - if existingModule, exists := c.modules[wasmCID]; exists { + if entry, exists := c.modules[wasmCID]; exists { _ = module.Close(context.Background()) // Discard our compilation - return existingModule, nil + entry.lastAccessed = time.Now() + return entry.module, nil } // Evict if cache is full @@ -163,7 +187,10 @@ func (c *ModuleCache) GetOrCompute(wasmCID string, compute func() (wazero.Compil c.evictOldest() } - 
c.modules[wasmCID] = module + c.modules[wasmCID] = &cacheEntry{ + module: module, + lastAccessed: time.Now(), + } c.logger.Debug("Module compiled and cached", zap.String("wasm_cid", wasmCID), diff --git a/pkg/serverless/execution/lifecycle.go b/pkg/serverless/execution/lifecycle.go index 22f9f20..ca94e64 100644 --- a/pkg/serverless/execution/lifecycle.go +++ b/pkg/serverless/execution/lifecycle.go @@ -81,36 +81,3 @@ func (m *ModuleLifecycle) ValidateModule(module wazero.CompiledModule) error { return nil } -// InstantiateModule creates a module instance for execution. -// Note: This method is currently unused but kept for potential future use. -func (m *ModuleLifecycle) InstantiateModule(ctx context.Context, compiled wazero.CompiledModule, config wazero.ModuleConfig) error { - if compiled == nil { - return fmt.Errorf("compiled module is nil") - } - - instance, err := m.runtime.InstantiateModule(ctx, compiled, config) - if err != nil { - return fmt.Errorf("failed to instantiate module: %w", err) - } - - // Close immediately - this is just for validation - _ = instance.Close(ctx) - - return nil -} - -// ModuleInfo provides information about a compiled module. -type ModuleInfo struct { - CID string - SizeBytes int - Compiled bool -} - -// GetModuleInfo returns information about a module. 
-func (m *ModuleLifecycle) GetModuleInfo(wasmCID string, wasmBytes []byte, isCompiled bool) *ModuleInfo { - return &ModuleInfo{ - CID: wasmCID, - SizeBytes: len(wasmBytes), - Compiled: isCompiled, - } -} diff --git a/pkg/serverless/invoke.go b/pkg/serverless/invoke.go index 87ba126..0108769 100644 --- a/pkg/serverless/invoke.go +++ b/pkg/serverless/invoke.go @@ -3,6 +3,7 @@ package serverless import ( "context" "encoding/json" + "errors" "fmt" "time" @@ -249,7 +250,7 @@ func (i *Invoker) isRetryable(err error) bool { // Retry execution errors (could be transient) var execErr *ExecutionError - if ok := errorAs(err, &execErr); ok { + if errors.As(err, &execErr) { return true } @@ -347,22 +348,6 @@ type DLQMessage struct { CallerWallet string `json:"caller_wallet,omitempty"` } -// errorAs is a helper to avoid import of errors package. -func errorAs(err error, target interface{}) bool { - if err == nil { - return false - } - // Simple type assertion for our custom error types - switch t := target.(type) { - case **ExecutionError: - if e, ok := err.(*ExecutionError); ok { - *t = e - return true - } - } - return false -} - // ----------------------------------------------------------------------------- // Batch Invocation (for future use) // ----------------------------------------------------------------------------- diff --git a/pkg/serverless/registry.go b/pkg/serverless/registry.go index 0d2bf6f..38102ed 100644 --- a/pkg/serverless/registry.go +++ b/pkg/serverless/registry.go @@ -438,27 +438,6 @@ func (r *Registry) uploadWASM(ctx context.Context, wasmBytes []byte, name string return resp.Cid, nil } -// getLatestVersion returns the latest version number for a function. -func (r *Registry) getLatestVersion(ctx context.Context, namespace, name string) (int, error) { - query := `SELECT MAX(version) FROM functions WHERE namespace = ? 
AND name = ?` - - var maxVersion sql.NullInt64 - var results []struct { - MaxVersion sql.NullInt64 `db:"max(version)"` - } - - if err := r.db.Query(ctx, &results, query, namespace, name); err != nil { - return 0, err - } - - if len(results) == 0 || !results[0].MaxVersion.Valid { - return 0, ErrFunctionNotFound - } - - maxVersion = results[0].MaxVersion - return int(maxVersion.Int64), nil -} - // getByNameInternal retrieves a function by name regardless of status. func (r *Registry) getByNameInternal(ctx context.Context, namespace, name string) (*Function, error) { namespace = strings.TrimSpace(namespace) diff --git a/pkg/tlsutil/client.go b/pkg/tlsutil/client.go index 28feadf..0d16ce1 100644 --- a/pkg/tlsutil/client.go +++ b/pkg/tlsutil/client.go @@ -82,12 +82,9 @@ func GetTLSConfig() *tls.Config { MinVersion: tls.VersionTLS12, } - // If we have a CA cert pool, use it + // If we have a CA cert pool, use it for verifying self-signed certs if caCertPool != nil { config.RootCAs = caCertPool - } else if len(trustedDomains) > 0 { - // Fallback: skip verification if trusted domains are configured but no CA pool - config.InsecureSkipVerify = true } return config @@ -103,11 +100,12 @@ func NewHTTPClient(timeout time.Duration) *http.Client { } } -// NewHTTPClientForDomain creates an HTTP client configured for a specific domain +// NewHTTPClientForDomain creates an HTTP client configured for a specific domain. +// Only skips TLS verification for explicitly trusted domains when no CA cert is available. 
func NewHTTPClientForDomain(timeout time.Duration, hostname string) *http.Client { tlsConfig := GetTLSConfig() - // If this domain is in trusted list and we don't have a CA pool, allow insecure + // Only skip TLS for explicitly trusted domains when no CA pool is configured if caCertPool == nil && ShouldSkipTLSVerify(hostname) { tlsConfig.InsecureSkipVerify = true } diff --git a/pkg/wireguard/ip.go b/pkg/wireguard/ip.go new file mode 100644 index 0000000..5bd14d7 --- /dev/null +++ b/pkg/wireguard/ip.go @@ -0,0 +1,24 @@ +package wireguard + +import ( + "fmt" + "net" +) + +// GetIP returns the IPv4 address of the wg0 interface. +func GetIP() (string, error) { + iface, err := net.InterfaceByName("wg0") + if err != nil { + return "", fmt.Errorf("wg0 interface not found: %w", err) + } + addrs, err := iface.Addrs() + if err != nil { + return "", fmt.Errorf("failed to get wg0 addresses: %w", err) + } + for _, addr := range addrs { + if ipnet, ok := addr.(*net.IPNet); ok && ipnet.IP.To4() != nil { + return ipnet.IP.String(), nil + } + } + return "", fmt.Errorf("no IPv4 address on wg0") +} diff --git a/scripts/block-node.sh b/scripts/block-node.sh deleted file mode 100755 index 674e48d..0000000 --- a/scripts/block-node.sh +++ /dev/null @@ -1,298 +0,0 @@ -#!/usr/bin/env bash -# block-node.sh - Temporarily block network access to a gateway node (local or remote) -# Usage: -# Local: ./scripts/block-node.sh -# Remote: ./scripts/block-node.sh --remote -# Example: -# ./scripts/block-node.sh 1 60 # Block local node-1 (port 6001) for 60 seconds -# ./scripts/block-node.sh --remote 2 120 # Block remote node-2 for 120 seconds - -set -euo pipefail - -# Colors for output -RED='\033[0;31m' -GREEN='\033[0;32m' -YELLOW='\033[1;33m' -BLUE='\033[0;34m' -NC='\033[0m' # No Color - -# Remote node configurations - loaded from config file -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -CONFIG_FILE="$SCRIPT_DIR/remote-nodes.conf" - -# Function to get remote node config 
-get_remote_node_config() { - local node_num="$1" - local field="$2" # "user_host" or "password" - - if [ ! -f "$CONFIG_FILE" ]; then - echo "" - return 1 - fi - - while IFS='|' read -r num user_host password || [ -n "$num" ]; do - # Skip comments and empty lines - [[ "$num" =~ ^#.*$ ]] || [[ -z "$num" ]] && continue - # Trim whitespace - num=$(echo "$num" | xargs) - user_host=$(echo "$user_host" | xargs) - password=$(echo "$password" | xargs) - - if [ "$num" = "$node_num" ]; then - if [ "$field" = "user_host" ]; then - echo "$user_host" - elif [ "$field" = "password" ]; then - echo "$password" - fi - return 0 - fi - done < "$CONFIG_FILE" - - echo "" - return 1 -} - -# Display usage -usage() { - echo -e "${RED}Error:${NC} Invalid arguments" - echo "" - echo -e "${BLUE}Usage:${NC}" - echo " $0 # Local mode" - echo " $0 --remote # Remote mode" - echo "" - echo -e "${GREEN}Local Mode Examples:${NC}" - echo " $0 1 60 # Block local node-1 (port 6001) for 60 seconds" - echo " $0 2 120 # Block local node-2 (port 6002) for 120 seconds" - echo "" - echo -e "${GREEN}Remote Mode Examples:${NC}" - echo " $0 --remote 1 60 # Block remote node-1 (51.83.128.181) for 60 seconds" - echo " $0 --remote 3 120 # Block remote node-3 (83.171.248.66) for 120 seconds" - echo "" - echo -e "${YELLOW}Local Node Mapping:${NC}" - echo " Node 1 -> Port 6001" - echo " Node 2 -> Port 6002" - echo " Node 3 -> Port 6003" - echo " Node 4 -> Port 6004" - echo " Node 5 -> Port 6005" - echo "" - echo -e "${YELLOW}Remote Node Mapping:${NC}" - echo " Remote 1 -> ubuntu@51.83.128.181" - echo " Remote 2 -> root@194.61.28.7" - echo " Remote 3 -> root@83.171.248.66" - echo " Remote 4 -> root@62.72.44.87" - exit 1 -} - -# Parse arguments -REMOTE_MODE=false -if [ $# -eq 3 ] && [ "$1" == "--remote" ]; then - REMOTE_MODE=true - NODE_NUM="$2" - DURATION="$3" -elif [ $# -eq 2 ]; then - NODE_NUM="$1" - DURATION="$2" -else - usage -fi - -# Validate duration -if ! 
[[ "$DURATION" =~ ^[0-9]+$ ]] || [ "$DURATION" -le 0 ]; then - echo -e "${RED}Error:${NC} Duration must be a positive integer" - exit 1 -fi - -# Calculate port (local nodes use 6001-6005, remote nodes use 80 and 443) -if [ "$REMOTE_MODE" = true ]; then - # Remote nodes: block standard HTTP/HTTPS ports - PORTS="80 443" -else - # Local nodes: block the specific gateway port - PORT=$((6000 + NODE_NUM)) -fi - -# Function to block ports on remote server -block_remote_node() { - local node_num="$1" - local duration="$2" - local ports="$3" # Can be space-separated list like "80 443" - - # Validate remote node number - if ! [[ "$node_num" =~ ^[1-4]$ ]]; then - echo -e "${RED}Error:${NC} Remote node number must be between 1 and 4" - exit 1 - fi - - # Get credentials from config file - local user_host=$(get_remote_node_config "$node_num" "user_host") - local password=$(get_remote_node_config "$node_num" "password") - - if [ -z "$user_host" ] || [ -z "$password" ]; then - echo -e "${RED}Error:${NC} Configuration for remote node $node_num not found in $CONFIG_FILE" - exit 1 - fi - - local host="${user_host##*@}" - - echo -e "${BLUE}=== Remote Network Blocking Tool ===${NC}" - echo -e "Remote Node: ${GREEN}$node_num${NC} ($user_host)" - echo -e "Ports: ${GREEN}$ports${NC}" - echo -e "Duration: ${GREEN}$duration seconds${NC}" - echo "" - - # Check if sshpass is installed - if ! command -v sshpass &> /dev/null; then - echo -e "${RED}Error:${NC} sshpass is not installed. 
Install it first:" - echo -e " ${YELLOW}macOS:${NC} brew install hudochenkov/sshpass/sshpass" - echo -e " ${YELLOW}Ubuntu/Debian:${NC} sudo apt-get install sshpass" - exit 1 - fi - - # SSH options - force password authentication only to avoid "too many auth failures" - SSH_OPTS="-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o LogLevel=ERROR -o PreferredAuthentications=password -o PubkeyAuthentication=no -o NumberOfPasswordPrompts=1" - - echo -e "${YELLOW}Connecting to remote server...${NC}" - - # Test connection - if ! sshpass -p "$password" ssh $SSH_OPTS "$user_host" "echo 'Connected successfully' > /dev/null"; then - echo -e "${RED}Error:${NC} Failed to connect to $user_host" - exit 1 - fi - - echo -e "${GREEN}✓${NC} Connected to $host" - - # Install iptables rules on remote server - echo -e "${YELLOW}Installing iptables rules on remote server...${NC}" - - # Build iptables commands for all ports - BLOCK_CMDS="" - for port in $ports; do - BLOCK_CMDS="${BLOCK_CMDS}iptables -I INPUT -p tcp --dport $port -j DROP 2>/dev/null || true; " - BLOCK_CMDS="${BLOCK_CMDS}iptables -I OUTPUT -p tcp --sport $port -j DROP 2>/dev/null || true; " - done - BLOCK_CMDS="${BLOCK_CMDS}echo 'Rules installed'" - - if ! 
sshpass -p "$password" ssh $SSH_OPTS "$user_host" "$BLOCK_CMDS"; then - echo -e "${RED}Error:${NC} Failed to install iptables rules" - exit 1 - fi - - echo -e "${GREEN}✓${NC} Ports $ports are now blocked on $host" - echo -e "${YELLOW}Waiting $duration seconds...${NC}" - echo "" - - # Show countdown - for ((i=duration; i>0; i--)); do - printf "\r${BLUE}Time remaining: %3d seconds${NC}" "$i" - sleep 1 - done - - echo "" - echo "" - echo -e "${YELLOW}Removing iptables rules from remote server...${NC}" - - # Build iptables removal commands for all ports - UNBLOCK_CMDS="" - for port in $ports; do - UNBLOCK_CMDS="${UNBLOCK_CMDS}iptables -D INPUT -p tcp --dport $port -j DROP 2>/dev/null || true; " - UNBLOCK_CMDS="${UNBLOCK_CMDS}iptables -D OUTPUT -p tcp --sport $port -j DROP 2>/dev/null || true; " - done - UNBLOCK_CMDS="${UNBLOCK_CMDS}echo 'Rules removed'" - - if ! sshpass -p "$password" ssh $SSH_OPTS "$user_host" "$UNBLOCK_CMDS"; then - echo -e "${YELLOW}Warning:${NC} Failed to remove some iptables rules. You may need to clean up manually." - else - echo -e "${GREEN}✓${NC} Ports $ports are now accessible again on $host" - fi - - echo "" - echo -e "${GREEN}=== Done! ===${NC}" - echo -e "Remote node ${GREEN}$node_num${NC} ($host) was unreachable for $duration seconds and is now accessible again." -} - -# Function to block port locally using process pause (SIGSTOP) -block_local_node() { - local node_num="$1" - local duration="$2" - local port="$3" - - # Validate node number - if ! 
[[ "$node_num" =~ ^[1-5]$ ]]; then - echo -e "${RED}Error:${NC} Local node number must be between 1 and 5" - exit 1 - fi - - echo -e "${BLUE}=== Local Network Blocking Tool ===${NC}" - echo -e "Node: ${GREEN}node-$node_num${NC}" - echo -e "Port: ${GREEN}$port${NC}" - echo -e "Duration: ${GREEN}$duration seconds${NC}" - echo -e "Method: ${GREEN}Process Pause (SIGSTOP/SIGCONT)${NC}" - echo "" - - # Find the process listening on the port - echo -e "${YELLOW}Finding process listening on port $port...${NC}" - - # macOS uses different tools than Linux - if [[ "$(uname -s)" == "Darwin" ]]; then - # macOS: use lsof - PID=$(lsof -ti :$port 2>/dev/null | head -1 || echo "") - else - # Linux: use ss or netstat - if command -v ss &> /dev/null; then - PID=$(ss -tlnp | grep ":$port " | grep -oP 'pid=\K[0-9]+' | head -1 || echo "") - else - PID=$(netstat -tlnp 2>/dev/null | grep ":$port " | awk '{print $7}' | cut -d'/' -f1 | head -1 || echo "") - fi - fi - - if [ -z "$PID" ]; then - echo -e "${RED}Error:${NC} No process found listening on port $port" - echo -e "Make sure node-$node_num is running first." - exit 1 - fi - - # Get process name - PROCESS_NAME=$(ps -p $PID -o comm= 2>/dev/null || echo "unknown") - - echo -e "${GREEN}✓${NC} Found process: ${BLUE}$PROCESS_NAME${NC} (PID: ${BLUE}$PID${NC})" - echo "" - - # Pause the process - echo -e "${YELLOW}Pausing process (SIGSTOP)...${NC}" - if ! kill -STOP $PID 2>/dev/null; then - echo -e "${RED}Error:${NC} Failed to pause process. You may need sudo privileges." - exit 1 - fi - - echo -e "${GREEN}✓${NC} Process paused - node-$node_num is now unreachable" - echo -e "${YELLOW}Waiting $duration seconds...${NC}" - echo "" - - # Show countdown - for ((i=duration; i>0; i--)); do - printf "\r${BLUE}Time remaining: %3d seconds${NC}" "$i" - sleep 1 - done - - echo "" - echo "" - - # Resume the process - echo -e "${YELLOW}Resuming process (SIGCONT)...${NC}" - if ! 
kill -CONT $PID 2>/dev/null; then - echo -e "${YELLOW}Warning:${NC} Failed to resume process. It may have been terminated." - else - echo -e "${GREEN}✓${NC} Process resumed - node-$node_num is now accessible again" - fi - - echo "" - echo -e "${GREEN}=== Done! ===${NC}" - echo -e "Local node ${GREEN}node-$node_num${NC} was unreachable for $duration seconds and is now accessible again." -} - -# Main execution -if [ "$REMOTE_MODE" = true ]; then - block_remote_node "$NODE_NUM" "$DURATION" "$PORTS" -else - block_local_node "$NODE_NUM" "$DURATION" "$PORT" -fi diff --git a/scripts/build-coredns.sh b/scripts/build-coredns.sh deleted file mode 100755 index c0ac5d1..0000000 --- a/scripts/build-coredns.sh +++ /dev/null @@ -1,112 +0,0 @@ -#!/bin/bash -set -e - -# Build custom CoreDNS binary with RQLite plugin -# This script compiles CoreDNS with the custom RQLite plugin - -COREDNS_VERSION="1.11.1" -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -PROJECT_ROOT="$(dirname "$SCRIPT_DIR")" -COREDNS_DIR="/tmp/coredns-build" - -echo "Building CoreDNS v${COREDNS_VERSION} with RQLite plugin..." - -# Clean previous build -rm -rf "$COREDNS_DIR" -mkdir -p "$COREDNS_DIR" - -# Clone CoreDNS -echo "Cloning CoreDNS..." -cd "$COREDNS_DIR" -git clone --depth 1 --branch v${COREDNS_VERSION} https://github.com/coredns/coredns.git -cd coredns - -# Create plugin.cfg with RQLite plugin -echo "Configuring plugins..." 
-cat > plugin.cfg </dev/null); then - print_fail "Cannot connect to RQLite on $node:5001" - continue - fi - - local state=$(echo "$response" | jq -r '.store.raft.state // "unknown"') - local num_peers=$(echo "$response" | jq -r '.store.raft.num_peers // 0') - local commit_index=$(echo "$response" | jq -r '.store.raft.commit_index // 0') - local last_contact=$(echo "$response" | jq -r '.store.raft.last_contact // "N/A"') - local config=$(echo "$response" | jq -r '.store.raft.latest_configuration // "[]"') - local node_count=$(echo "$config" | grep -o "Address" | wc -l | tr -d ' ') - - commit_indices+=($commit_index) - - print_info "State: $state | Peers: $num_peers | Commit Index: $commit_index | Cluster Nodes: $node_count" - - # Check state - if [ "$state" = "Leader" ]; then - leader_found=true - print_pass "Node $node is the Leader" - elif [ "$state" = "Follower" ]; then - follower_count=$((follower_count + 1)) - # Check last contact - if [ "$last_contact" != "N/A" ] && [ "$last_contact" != "0" ]; then - print_pass "Node $node is a Follower (last contact: $last_contact)" - else - print_warn "Node $node is Follower but last_contact is $last_contact" - fi - else - print_fail "Node $node has unexpected state: $state" - fi - - # Check peer count - if [ "$num_peers" = "2" ]; then - print_pass "Node $node has correct peer count: 2" - else - print_fail "Node $node has incorrect peer count: $num_peers (expected 2)" - fi - - # Check cluster configuration - if [ "$node_count" = "3" ]; then - print_pass "Node $node sees all 3 cluster members" - else - print_fail "Node $node only sees $node_count cluster members (expected 3)" - fi - - echo "" - done - - # Check for exactly 1 leader - if [ "$leader_found" = true ] && [ "$follower_count" = "2" ]; then - print_pass "Cluster has 1 Leader and 2 Followers ✓" - else - print_fail "Invalid cluster state (Leader found: $leader_found, Followers: $follower_count)" - fi - - # Check commit index sync - if [ ${#commit_indices[@]} -eq 3 ]; 
then - local first="${commit_indices[0]}" - local all_same=true - for idx in "${commit_indices[@]}"; do - if [ "$idx" != "$first" ]; then - all_same=false - break - fi - done - - if [ "$all_same" = true ]; then - print_pass "All nodes have synced commit index: $first" - else - print_warn "Commit indices differ: ${commit_indices[*]} (might be normal if writes are happening)" - fi - fi -} - -test_rqlite_replication() { - print_header "2. RQLITE REPLICATION TEST" - - print_test "Creating test table and inserting data on leader ($BOOTSTRAP)" - - # Create table - if ! response=$(curl -s --max-time 5 -XPOST "http://$BOOTSTRAP:5001/db/execute" \ - -H "Content-Type: application/json" \ - -d '[["CREATE TABLE IF NOT EXISTS test_cluster_health (id INTEGER PRIMARY KEY AUTOINCREMENT, timestamp TEXT, node TEXT, value TEXT)"]]' 2>/dev/null); then - print_fail "Failed to create table" - return - fi - - if echo "$response" | jq -e '.results[0].error' >/dev/null 2>&1; then - local error=$(echo "$response" | jq -r '.results[0].error') - if [[ "$error" != "table test_cluster_health already exists" ]]; then - print_fail "Table creation error: $error" - return - fi - fi - print_pass "Table exists" - - # Insert test data - local test_value="test_$(date +%s)" - if ! response=$(curl -s --max-time 5 -XPOST "http://$BOOTSTRAP:5001/db/execute" \ - -H "Content-Type: application/json" \ - -d "[ - [\"INSERT INTO test_cluster_health (timestamp, node, value) VALUES (datetime('now'), 'bootstrap', '$test_value')\"] - ]" 2>/dev/null); then - print_fail "Failed to insert data" - return - fi - - if echo "$response" | jq -e '.results[0].error' >/dev/null 2>&1; then - local error=$(echo "$response" | jq -r '.results[0].error') - print_fail "Insert error: $error" - return - fi - print_pass "Data inserted: $test_value" - - # Wait for replication - print_info "Waiting 2 seconds for replication..." 
- sleep 2 - - # Query from all nodes - for node in "${ALL_NODES[@]}"; do - print_test "Reading from $node" - - if ! response=$(curl -s --max-time 5 -XPOST "http://$node:5001/db/query?level=weak" \ - -H "Content-Type: application/json" \ - -d "[\"SELECT * FROM test_cluster_health WHERE value = '$test_value' LIMIT 1\"]" 2>/dev/null); then - print_fail "Failed to query from $node" - continue - fi - - if echo "$response" | jq -e '.results[0].error' >/dev/null 2>&1; then - local error=$(echo "$response" | jq -r '.results[0].error') - print_fail "Query error on $node: $error" - continue - fi - - local row_count=$(echo "$response" | jq -r '.results[0].values | length // 0') - if [ "$row_count" = "1" ]; then - local retrieved_value=$(echo "$response" | jq -r '.results[0].values[0][3] // ""') - if [ "$retrieved_value" = "$test_value" ]; then - print_pass "Data replicated correctly to $node" - else - print_fail "Data mismatch on $node (got: $retrieved_value, expected: $test_value)" - fi - else - print_fail "Expected 1 row from $node, got $row_count" - fi - done -} - -test_ipfs_status() { - print_header "3. IPFS DAEMON STATUS" - - for node in "${ALL_NODES[@]}"; do - print_test "Testing IPFS on $node" - - if ! response=$(curl -s --max-time 5 -X POST http://$node:4501/api/v0/id 2>/dev/null); then - print_fail "Cannot connect to IPFS on $node:4501" - continue - fi - - local peer_id=$(echo "$response" | jq -r '.ID // "unknown"') - local addr_count=$(echo "$response" | jq -r '.Addresses | length // 0') - local agent=$(echo "$response" | jq -r '.AgentVersion // "unknown"') - - if [ "$peer_id" != "unknown" ]; then - print_pass "IPFS running on $node (ID: ${peer_id:0:12}...)" - print_info "Agent: $agent | Addresses: $addr_count" - else - print_fail "IPFS not responding correctly on $node" - fi - done -} - -test_ipfs_swarm() { - print_header "4. IPFS SWARM CONNECTIVITY" - - for node in "${ALL_NODES[@]}"; do - print_test "Checking IPFS swarm peers on $node" - - if ! 
response=$(curl -s --max-time 5 -X POST http://$node:4501/api/v0/swarm/peers 2>/dev/null); then - print_fail "Failed to get swarm peers from $node" - continue - fi - - local peer_count=$(echo "$response" | jq -r '.Peers | length // 0') - - if [ "$peer_count" = "2" ]; then - print_pass "Node $node connected to 2 IPFS peers" - elif [ "$peer_count" -gt "0" ]; then - print_warn "Node $node connected to $peer_count IPFS peers (expected 2)" - else - print_fail "Node $node has no IPFS swarm peers" - fi - done -} - -test_ipfs_cluster_status() { - print_header "5. IPFS CLUSTER STATUS" - - for node in "${ALL_NODES[@]}"; do - print_test "Testing IPFS Cluster on $node" - - if ! response=$(curl -s --max-time 5 http://$node:9094/id 2>/dev/null); then - print_fail "Cannot connect to IPFS Cluster on $node:9094" - continue - fi - - local cluster_id=$(echo "$response" | jq -r '.id // "unknown"') - local cluster_peers=$(echo "$response" | jq -r '.cluster_peers | length // 0') - local version=$(echo "$response" | jq -r '.version // "unknown"') - - if [ "$cluster_id" != "unknown" ]; then - print_pass "IPFS Cluster running on $node (ID: ${cluster_id:0:12}...)" - print_info "Version: $version | Cluster Peers: $cluster_peers" - - if [ "$cluster_peers" = "3" ]; then - print_pass "Node $node sees all 3 cluster peers" - else - print_warn "Node $node sees $cluster_peers cluster peers (expected 3)" - fi - else - print_fail "IPFS Cluster not responding correctly on $node" - fi - done -} - -test_ipfs_cluster_pins() { - print_header "6. IPFS CLUSTER PIN CONSISTENCY" - - local pin_counts=() - - for node in "${ALL_NODES[@]}"; do - print_test "Checking pins on $node" - - if ! 
response=$(curl -s --max-time 5 http://$node:9094/pins 2>/dev/null); then - print_fail "Failed to get pins from $node" - pin_counts+=(0) - continue - fi - - local pin_count=$(echo "$response" | jq -r 'length // 0') - pin_counts+=($pin_count) - print_pass "Node $node has $pin_count pins" - done - - # Check if all nodes have same pin count - if [ ${#pin_counts[@]} -eq 3 ]; then - local first="${pin_counts[0]}" - local all_same=true - for count in "${pin_counts[@]}"; do - if [ "$count" != "$first" ]; then - all_same=false - break - fi - done - - if [ "$all_same" = true ]; then - print_pass "All nodes have consistent pin count: $first" - else - print_warn "Pin counts differ: ${pin_counts[*]} (might be syncing)" - fi - fi -} - -print_summary() { - print_header "TEST SUMMARY" - - echo "" - echo -e "${GREEN}Passed: $PASSED${NC}" - echo -e "${YELLOW}Warnings: $WARNINGS${NC}" - echo -e "${RED}Failed: $FAILED${NC}" - echo "" - - if [ $FAILED -eq 0 ]; then - echo -e "${GREEN}🎉 All critical tests passed! Cluster is healthy.${NC}" - exit 0 - elif [ $FAILED -le 2 ]; then - echo -e "${YELLOW}⚠️ Some tests failed. Review the output above.${NC}" - exit 1 - else - echo -e "${RED}❌ Multiple failures detected. Cluster needs attention.${NC}" - exit 2 - fi -} - -# Main execution -main() { - echo "" - echo -e "${BLUE}╔════════════════════════════════════════════╗${NC}" - echo -e "${BLUE}║ DEBROS Production Cluster Health Check ║${NC}" - echo -e "${BLUE}╚════════════════════════════════════════════╝${NC}" - echo "" - echo "Testing cluster:" - echo " Bootstrap: $BOOTSTRAP" - echo " Node 1: $NODE1" - echo " Node 2: $NODE2" - - test_rqlite_status - test_rqlite_replication - test_ipfs_status - test_ipfs_swarm - test_ipfs_cluster_status - test_ipfs_cluster_pins - print_summary -} - -# Run main -main - diff --git a/terms-agreement b/terms-agreement deleted file mode 100644 index e340065..0000000 --- a/terms-agreement +++ /dev/null @@ -1 +0,0 @@ -agreed \ No newline at end of file